Free cookie consent management tool by TermsFeed Policy Generator

source: branches/3040_VectorBasedGP/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Classification/TimeSeries/TimeSeriesInstanceProvider.cs @ 17401

Last change on this file since 17401 was 17401, checked in by pfleck, 4 years ago

#3040 Added parser for new benchmark data but did not commit the data yet (too large)

File size: 7.4 KB
Line 
#region License Information
/* HeuristicLab
 * Copyright (C) Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
#endregion
21
22using System;
23using System.Collections;
24using System.Collections.Generic;
25using System.Collections.ObjectModel;
26using System.Diagnostics;
27using System.Globalization;
28using System.IO;
29using System.IO.Compression;
30using System.Linq;
31using HeuristicLab.Problems.DataAnalysis;
32
namespace HeuristicLab.Problems.Instances.DataAnalysis {
  /// <summary>
  /// Base class for providers of UEA &amp; UCR time-series classification instances that are
  /// read from a zip archive embedded as a manifest resource of the provider's assembly.
  /// </summary>
  /// <remarks>
  /// Every directory entry of the archive is treated as one problem instance and must contain
  /// exactly one "*_TRAIN.ts" and one "*_TEST.ts" file. A ".ts" file consists of a metadata
  /// section ("#" comment lines and "@" tags, terminated by "@data") followed by one data row
  /// per line, in which the input series are separated by ':' and the last field is the target.
  /// </remarks>
  public abstract class TimeSeriesInstanceProvider : ResourceClassificationInstanceProvider {
    // Whitespace characters that may separate a metadata tag from its value.
    private static readonly char[] MetadataSeparators = { ' ', '\t' };

    //public override string Name {
    //  get { return "TimeSeries (Univariate) Problems"; }
    //}

    /// <summary>Short description of the instance collection.</summary>
    public override string Description {
      get { return "UEA & UCR TimeSeries Problems"; }
    }

    /// <summary>Web page of the repository the instances originate from.</summary>
    public override Uri WebLink {
      get { return new Uri("http://www.timeseriesclassification.com/"); }
    }

    /// <summary>Citation for the instance collection.</summary>
    public override string ReferencePublication {
      get { return "Anthony Bagnall, Jason Lines, William Vickers and Eamonn Keogh, The UEA & UCR Time Series Classification Repository, www.timeseriesclassification.com"; }
    }

    /// <summary>
    /// Loads the training and test file of the given instance and combines them into a single
    /// classification problem.
    /// </summary>
    /// <param name="id">Descriptor of the instance; must be a <c>TimeSeriesDataDescriptor</c>.</param>
    /// <returns>
    /// Problem data whose dataset holds the training rows first, followed by the test rows. The
    /// training and test partitions are set to exactly these two row ranges.
    /// </returns>
    /// <exception cref="InvalidOperationException">
    /// The training or test entry is missing from the archive, or one of the files is malformed.
    /// </exception>
    public override IClassificationProblemData LoadData(IDataDescriptor id) {
      var descriptor = (TimeSeriesDataDescriptor)id;
      using (var instancesZipFile = OpenZipArchive()) {
        var trainingEntry = instancesZipFile.GetEntry(descriptor.TrainingEntryName);
        var testEntry = instancesZipFile.GetEntry(descriptor.TestEntryName);

        if (trainingEntry == null || testEntry == null) {
          throw new InvalidOperationException("The training or test entry could not be found in the archive.");
        }

        using (var trainingReader = new StreamReader(trainingEntry.Open()))
        using (var testReader = new StreamReader(testEntry.Open())) {
          // The variable names are derived from the training file only. The test file's metadata
          // is still parsed so that its reader is positioned at the first data row.
          ParseMetadata(trainingReader, out var inputVariables, out string targetVariable);
          ParseMetadata(testReader, out _, out _); // ignore outputs

          // Read data: one column (list of vectors) per input variable, plus the target column.
          var inputsData = new List<DoubleVector>[inputVariables.Count];
          for (int i = 0; i < inputsData.Length; i++) inputsData[i] = new List<DoubleVector>();
          var targetData = new List<double>();
          ReadData(trainingReader, inputsData, targetData, out int numTrainingRows);
          ReadData(testReader, inputsData, targetData, out int numTestRows);

          // Build dataset
          var dataset = new Dataset(
            inputVariables.Concat(new[] { targetVariable }),
            inputsData.Cast<IList>().Concat(new[] { targetData })
          );
          Debug.Assert(dataset.Rows == numTrainingRows + numTestRows);
          Debug.Assert(dataset.Columns == inputVariables.Count + 1);

          // Build problem data
          var problemData = new ClassificationProblemData(dataset, inputVariables, targetVariable) {
            Name = descriptor.Name
          };
          problemData.TrainingPartition.Start = 0;
          problemData.TrainingPartition.End = numTrainingRows;
          problemData.TestPartition.Start = numTrainingRows;
          problemData.TestPartition.End = numTrainingRows + numTestRows;

          return problemData;
        }
      }
    }

    /// <summary>
    /// Consumes the metadata section of a ".ts" file up to and including the "@data" line and
    /// derives the variable names from it.
    /// </summary>
    /// <param name="reader">Reader positioned at the start of the file.</param>
    /// <param name="inputVariables">
    /// Receives the generated input names "X0", "X1", ... (zero-padded to a common width). One
    /// name is generated for "@univariate true"; "@dimensions n" sets the count to n.
    /// </param>
    /// <param name="targetVariable">Receives the target name, which is always "Y".</param>
    /// <remarks>
    /// Blank lines and lines starting with '#' are skipped. Tags other than "@univariate",
    /// "@dimensions" and "@data" are ignored. Tag names are matched case-insensitively.
    /// </remarks>
    /// <exception cref="InvalidOperationException">
    /// A line that is neither blank, a comment nor a tag occurs before "@data", or a tag that
    /// needs a value has none.
    /// </exception>
    private static void ParseMetadata(StreamReader reader, out List<string> inputVariables, out string targetVariable) {
      int nrOfInputs = 0;
      bool dataStart = false;
      while (!reader.EndOfStream && !dataStart) {
        var line = reader.ReadLine().Trim();
        if (line.Length == 0 || line.StartsWith("#", StringComparison.Ordinal)) {
          continue; // blank line or comment
        }
        if (!line.StartsWith("@", StringComparison.Ordinal)) {
          throw new InvalidOperationException("A data section already occurred within metadata section.");
        }

        // Tolerate tabs and repeated blanks between the tag and its value.
        var splits = line.Split(MetadataSeparators, StringSplitOptions.RemoveEmptyEntries);
        var type = splits[0].ToLowerInvariant();
        var arguments = splits.Skip(1).ToList();
        switch (type) {
          case "@univariate":
            if (bool.Parse(RequireArgument(type, arguments)))
              nrOfInputs = 1;
            break;
          case "@dimensions":
            nrOfInputs = int.Parse(RequireArgument(type, arguments), CultureInfo.InvariantCulture);
            break;
          case "@data":
            dataStart = true;
            break;
        }
      }

      // Width of the largest index (nrOfInputs - 1), at least one digit. Counting characters
      // avoids casting Log10(0) = -Infinity or Log10(-1) = NaN to int, which is unspecified.
      int digits = Math.Max(nrOfInputs - 1, 0).ToString(CultureInfo.InvariantCulture).Length;
      inputVariables = Enumerable.Range(0, nrOfInputs)
        .Select(i => "X" + i.ToString("D" + digits, CultureInfo.InvariantCulture))
        .ToList();

      targetVariable = "Y";
    }

    /// <summary>Returns the first argument of a metadata tag or fails with a descriptive error.</summary>
    private static string RequireArgument(string tag, IList<string> arguments) {
      if (arguments.Count == 0) {
        throw new InvalidOperationException("The metadata tag \"" + tag + "\" requires a value.");
      }
      return arguments[0];
    }

    /// <summary>
    /// Reads all remaining data rows from <paramref name="reader"/> and appends them to the
    /// given columns.
    /// </summary>
    /// <param name="reader">Reader positioned at the first data row.</param>
    /// <param name="inputsData">One list per input variable; each row appends one vector to every list.</param>
    /// <param name="targetData">Receives the last field of each row, parsed as a double.</param>
    /// <param name="count">Receives the number of rows read by this call.</param>
    /// <remarks>
    /// A row has the form "v,v,...:v,v,...:target". Numbers are parsed with the invariant
    /// culture. Blank lines (such as a trailing newline at the end of the file) are skipped.
    /// </remarks>
    /// <exception cref="InvalidOperationException">
    /// A row does not contain exactly one series per entry of <paramref name="inputsData"/>.
    /// </exception>
    private static void ReadData(StreamReader reader, List<DoubleVector>[] inputsData, List<double> targetData, out int count) {
      count = 0;
      while (!reader.EndOfStream) {
        var line = reader.ReadLine();
        if (string.IsNullOrWhiteSpace(line)) {
          continue;
        }

        var variables = line.Split(':');

        // Reject malformed rows up front; otherwise the columns would end up with different
        // lengths or the index below would run past the end of inputsData.
        int nrOfSeries = variables.Length - 1;
        if (nrOfSeries != inputsData.Length) {
          throw new InvalidOperationException(string.Format(CultureInfo.InvariantCulture,
            "Data row {0} contains {1} input series but {2} were expected.", count + 1, nrOfSeries, inputsData.Length));
        }

        // parse all except last, which is the non-vector target
        for (int i = 0; i < nrOfSeries; i++) {
          var numbers = variables[i]
            .Split(',')
            .Select(d => double.Parse(d, CultureInfo.InvariantCulture))
            .ToList();
          inputsData[i].Add(new DoubleVector(numbers));
        }

        var target = double.Parse(variables[nrOfSeries], CultureInfo.InvariantCulture);
        targetData.Add(target);

        count++;
      }
    }

    /// <summary>
    /// Enumerates the instances contained in the embedded archive.
    /// </summary>
    /// <returns>One descriptor per directory entry of the archive.</returns>
    public override IEnumerable<IDataDescriptor> GetDataDescriptors() {
      using (var instancesZipFile = OpenZipArchive()) {
        var instances = GroupEntriesByInstance(instancesZipFile.Entries);
        var descriptors = instances.Select(instance => CreateDescriptor(instance.Key, instance.Value));

        // Materialize before the archive is disposed; the query is evaluated lazily otherwise.
        return descriptors.ToList();
      }
    }

    /// <summary>
    /// Opens the embedded instance archive for reading. The caller owns the returned archive and
    /// must dispose it.
    /// </summary>
    private ZipArchive OpenZipArchive() {
      // NOTE(review): the escaped dot suggests GetResourceName interprets its argument as a
      // regular expression - confirm against the base class.
      var instanceArchiveName = GetResourceName(FileName + @"\.zip");
      return new ZipArchive(GetType().Assembly.GetManifestResourceStream(instanceArchiveName), ZipArchiveMode.Read);
    }

    /// <summary>
    /// Groups the file entries of the archive by the directory entry they are located under.
    /// </summary>
    /// <param name="entries">All entries of the archive.</param>
    /// <returns>
    /// A dictionary from directory name (the directory entry's full name passed through
    /// <see cref="Path.GetDirectoryName(string)"/>) to the file entries whose full name starts
    /// with that directory's full name.
    /// </returns>
    private static IDictionary<string, List<ZipArchiveEntry>> GroupEntriesByInstance(ReadOnlyCollection<ZipArchiveEntry> entries) {
      // Entries with an empty Name are directories; all others are files. Both lists are built
      // once instead of recomputing the set difference for every directory.
      var directoryEntries = entries.Where(entry => string.IsNullOrEmpty(entry.Name)).ToList();
      var fileEntries = entries.Where(entry => !string.IsNullOrEmpty(entry.Name)).ToList();

      return directoryEntries.ToDictionary(
        directory => Path.GetDirectoryName(directory.FullName),
        directory => fileEntries
          .Where(file => file.FullName.StartsWith(directory.FullName, StringComparison.Ordinal))
          .ToList());
    }

    /// <summary>
    /// Creates the descriptor of one instance from the file entries found in its directory.
    /// </summary>
    /// <param name="name">Name of the instance.</param>
    /// <param name="subEntries">File entries of the instance's directory.</param>
    /// <exception cref="InvalidOperationException">
    /// There is not exactly one "*_TRAIN.ts" and exactly one "*_TEST.ts" entry.
    /// </exception>
    private static TimeSeriesDataDescriptor CreateDescriptor(string name, List<ZipArchiveEntry> subEntries) {
      var trainingEntry = subEntries.Single(entry => entry.Name.EndsWith("_TRAIN.ts", StringComparison.Ordinal));
      var testEntry = subEntries.Single(entry => entry.Name.EndsWith("_TEST.ts", StringComparison.Ordinal));
      return new TimeSeriesDataDescriptor(name, trainingEntry.FullName, testEntry.FullName);
    }
  }
}
Note: See TracBrowser for help on using the repository browser.