Free cookie consent management tool by TermsFeed Policy Generator

source: branches/3040_VectorBasedGP/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Classification/TimeSeries/TimeSeriesInstanceProvider.cs @ 17418

Last change on this file since 17418 was 17418, checked in by pfleck, 4 years ago

#3040

  • (partially) enabled data preprocessing for vectorial data
  • use flat zip-files for large benchmarks instead of embedded resources (faster build times)
  • added multiple variants of vector benchmark I (vector length constraints)
File size: 8.8 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections;
24using System.Collections.Generic;
25using System.Collections.ObjectModel;
26using System.Diagnostics;
27using System.Globalization;
28using System.IO;
29using System.IO.Compression;
30using System.Linq;
31using HeuristicLab.Problems.DataAnalysis;
32
33namespace HeuristicLab.Problems.Instances.DataAnalysis {
34  public abstract class TimeSeriesInstanceProvider : ResourceClassificationInstanceProvider {
35    //public override string Name {
36    //  get { return "TimeSeries (Univariate) Problems"; }
37    //}
38    public override string Description {
39      get { return "UEA & UCR TimeSeries Problems"; }
40    }
41    public override Uri WebLink {
42      get { return new Uri("http://www.timeseriesclassification.com/"); }
43    }
44    public override string ReferencePublication {
45      get { return "Anthony Bagnall, Jason Lines, William Vickers and Eamonn Keogh, The UEA & UCR Time Series Classification Repository, www.timeseriesclassification.com"; }
46    }
47
48    public override IClassificationProblemData LoadData(IDataDescriptor id) {
49      var descriptor = (TimeSeriesDataDescriptor)id;
50      using (var instancesZipFile = OpenZipArchive()) {
51        var trainingEntry = instancesZipFile.GetEntry(descriptor.TrainingEntryName);
52        var testEntry = instancesZipFile.GetEntry(descriptor.TestEntryName);
53
54        if (trainingEntry == null || testEntry == null) {
55          throw new InvalidOperationException("The training or test entry could not be found in the archive.");
56        }
57
58        using (var trainingReader = new StreamReader(trainingEntry.Open()))
59        using (var testReader = new StreamReader(testEntry.Open())) {
60          ParseMetadata(trainingReader, out var inputVariables, out string targetVariable, out var classLabels);
61          ParseMetadata(testReader, out _, out _, out _); // ignore outputs
62
63          // Read data
64          var inputsData = new List<DoubleVector>[inputVariables.Count];
65          for (int i = 0; i < inputsData.Length; i++) inputsData[i] = new List<DoubleVector>();
66          bool numericTarget = classLabels.All(label => !double.IsNaN(ParseNumber(label)));
67          IList targetData = numericTarget ? new List<double>() : new List<string>() as IList;
68          ReadData(trainingReader, inputsData, targetData, out int numTrainingRows);
69          ReadData(testReader, inputsData, targetData, out int numTestRows);
70
71          // Translate class values to numeric values
72          if (targetData is List<string> stringTargetData) {
73            var labelTranslation = classLabels
74              .Select((x, i) => new { Label = x, i })
75              .ToDictionary(x => x.Label, x => (double)x.i);
76            targetData = stringTargetData.Select(label => labelTranslation[label]).ToList();
77          }
78
79          // Build dataset
80          var dataset = new Dataset(
81            inputVariables.Concat(new[] { targetVariable }),
82            inputsData.Concat(new[] { targetData })
83          );
84          Debug.Assert(dataset.Rows == numTrainingRows + numTestRows);
85          Debug.Assert(dataset.Columns == inputVariables.Count + 1);
86
87          // Build problem data
88          var problemData = new ClassificationProblemData(dataset, inputVariables, targetVariable) {
89            Name = descriptor.Name
90          };
91          problemData.TrainingPartition.Start = 0;
92          problemData.TrainingPartition.End = numTrainingRows;
93          problemData.TestPartition.Start = numTrainingRows;
94          problemData.TestPartition.End = numTrainingRows + numTestRows;
95
96          return problemData;
97        }
98      }
99    }
100
101    private static void ParseMetadata(StreamReader reader, out List<string> inputVariables, out string targetVariable, out List<string> classLabels) {
102      int nrOfInputs = 0;
103      IEnumerable<string> labels = null;
104      bool dataStart = false;
105
106      while (!reader.EndOfStream && !dataStart) {
107        var line = reader.ReadLine();
108        if (line.StartsWith("#")) {
109          // Comment
110        } else if (line.StartsWith("@")) {
111          var splits = line.Split(' ');
112          var type = splits.First();
113          var arguments = splits.Skip(1).ToList();
114          switch (type) {
115            case "@univariate":
116              bool univariate = bool.Parse(arguments[0]);
117              if (univariate)
118                nrOfInputs = 1;
119              break;
120            case "@dimensions":
121              int dimensions = int.Parse(arguments[0]);
122              nrOfInputs = dimensions;
123              break;
124            case "@classLabel":
125              bool containLabels = bool.Parse(arguments[0]);
126              if (containLabels)
127                labels = arguments.Skip(1);
128              break;
129            case "@data":
130              dataStart = true;
131              break;
132          }
133        } else {
134          throw new InvalidOperationException("A data section already occurred within metadata section.");
135        }
136      }
137
138      int digits = Math.Max((int)Math.Log10(nrOfInputs - 1) + 1, 1);
139      inputVariables = Enumerable.Range(0, nrOfInputs)
140        .Select(i => "X" + i.ToString("D" + digits))
141        .ToList();
142
143      targetVariable = "Y";
144
145      classLabels = labels.ToList();
146    }
147
148    private static void ReadData(StreamReader reader, List<DoubleVector>[] inputsData, IList targetData, out int count) {
149      var numericTargetData = targetData as List<double>;
150      var stringTargetData = targetData as List<string>;
151
152      count = 0;
153      while (!reader.EndOfStream) {
154        var line = reader.ReadLine();
155        var variables = line.Split(':');
156
157        // parse all except last, which is the non-vector target
158        for (int i = 0; i < variables.Length - 1; i++) {
159          var variable = variables[i];
160          var numbers = variable
161            .Split(',')
162            .Select(ParseNumber)
163            .ToList();
164          inputsData[i].Add(new DoubleVector(numbers));
165        }
166
167        var target = variables[variables.Length - 1];
168        if (numericTargetData != null) numericTargetData.Add(ParseNumber(target));
169        else if (stringTargetData != null) stringTargetData.Add(target);
170        else throw new InvalidOperationException("Target must either be numeric or a string.");
171
172        count++;
173      }
174    }
175
176    private static double ParseNumber(string number) {
177      return
178        double.TryParse(number, NumberStyles.Float, CultureInfo.InvariantCulture, out double parsed)
179          ? parsed
180          : double.NaN;
181    }
182
183    public override IEnumerable<IDataDescriptor> GetDataDescriptors() {
184      using (var instancesZipFile = OpenZipArchive()) {
185        var instances = GroupEntriesByInstance(instancesZipFile.Entries);
186        var descriptors = instances.Select(instance => CreateDescriptor(instance.Key, instance.Value));
187
188        return descriptors.ToList();
189      }
190    }
191
192    private ZipArchive OpenZipArchive() {
193      var instanceArchiveName = Path.Combine("Classification", "Data", FileName + ".zip");
194      var stream = new FileStream(instanceArchiveName, FileMode.Open, FileAccess.Read, FileShare.Read);
195      return new ZipArchive(stream, ZipArchiveMode.Read);
196    }
197
198    private static IDictionary<string, List<ZipArchiveEntry>> GroupEntriesByInstance(ReadOnlyCollection<ZipArchiveEntry> entries) {
199      var topLevelEntries = entries.Where(entry => string.IsNullOrEmpty(entry.Name)).ToList();
200
201      return topLevelEntries.ToDictionary(
202        entry => Path.GetDirectoryName(entry.FullName),
203        entry => entries.Except(topLevelEntries).Where(subEntry => subEntry.FullName.StartsWith(entry.FullName)).ToList());
204    }
205
206    private static TimeSeriesDataDescriptor CreateDescriptor(string name, List<ZipArchiveEntry> subEntries) {
207      var trainingEntry = subEntries.Single(entry => entry.Name.EndsWith("_TRAIN.ts"));
208      var testEntry = subEntries.Single(entry => entry.Name.EndsWith("_TEST.ts"));
209      return new TimeSeriesDataDescriptor(name, trainingEntry.FullName, testEntry.FullName);
210    }
211  }
212}
Note: See TracBrowser for help on using the repository browser.