Free cookie consent management tool by TermsFeed Policy Generator

Ignore:
Timestamp: 01/16/20 12:34:45 (5 years ago)
Author: pfleck
Message:

#3040 Added a parser for the new time series benchmark data (UEA & UCR Time Series Classification Repository); the data itself is not committed yet because it is too large.

Location: branches/3040_VectorBasedGP/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Classification/TimeSeries
Files: 1 added, 4 copied

Legend:

Unmodified
Added
Removed
  • branches/3040_VectorBasedGP/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Classification/TimeSeries/TimeSeriesDataDescriptor.cs

    r17393 r17401  
    2020#endregion
    2121
    22 using System;
     22namespace HeuristicLab.Problems.Instances.DataAnalysis {
     23  public class TimeSeriesDataDescriptor : IDataDescriptor {
     24    public string Name { get; }
     25    public string Description { get { return string.Empty; } }
    2326
    24 namespace HeuristicLab.Problems.Instances.DataAnalysis {
    25   public abstract class UCIDataDescriptor : ResourceClassificationDataDescriptor {
    26     public override string Name { get { return String.Format("{0}, {1}, {2}", Filename, Donor, Year); } }
    27     public abstract string Filename { get; }
    28     public abstract string Donor { get; }
    29     public abstract int Year { get; }
     27    internal string TrainingEntryName { get; }
     28    internal string TestEntryName { get; }
     29
     30    public TimeSeriesDataDescriptor(string name, string trainingEntryName, string testEntryName) {
     31      Name = name;
     32      TrainingEntryName = trainingEntryName;
     33      TestEntryName = testEntryName;
     34    }
    3035  }
    3136}
  • branches/3040_VectorBasedGP/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Classification/TimeSeries/TimeSeriesInstanceProvider.cs

    r17393 r17401  
    2121
    2222using System;
     23using System.Collections;
    2324using System.Collections.Generic;
     25using System.Collections.ObjectModel;
     26using System.Diagnostics;
     27using System.Globalization;
    2428using System.IO;
    2529using System.IO.Compression;
    2630using System.Linq;
     31using HeuristicLab.Problems.DataAnalysis;
    2732
    2833namespace HeuristicLab.Problems.Instances.DataAnalysis {
    29   public class UCIInstanceProvider : ResourceClassificationInstanceProvider {
    30     public override string Name {
    31       get { return "UCI Problems"; }
    32     }
     34  public abstract class TimeSeriesInstanceProvider : ResourceClassificationInstanceProvider {
     35    //public override string Name {
     36    //  get { return "TimeSeries (Univariate) Problems"; }
     37    //}
    3338    public override string Description {
    34       get {
    35         return "";
    36       }
     39      get { return "UEA & UCR TimeSeries Problems"; }
    3740    }
    3841    public override Uri WebLink {
    39       get { return new Uri("http://archive.ics.uci.edu/ml/datasets.html"); }
     42      get { return new Uri("http://www.timeseriesclassification.com/"); }
    4043    }
    4144    public override string ReferencePublication {
    42       get { return ""; }
     45      get { return "Anthony Bagnall, Jason Lines, William Vickers and Eamonn Keogh, The UEA & UCR Time Series Classification Repository, www.timeseriesclassification.com"; }
    4346    }
    4447
    45     protected override string FileName { get { return "UCI"; } }
     48    public override IClassificationProblemData LoadData(IDataDescriptor id) {
     49      var descriptor = (TimeSeriesDataDescriptor)id;
     50      using (var instancesZipFile = OpenZipArchive()) {
     51        var trainingEntry = instancesZipFile.GetEntry(descriptor.TrainingEntryName);
     52        var testEntry = instancesZipFile.GetEntry(descriptor.TestEntryName);
    4653
    47     public override IEnumerable<IDataDescriptor> GetDataDescriptors() {
    48       List<UCIDataDescriptor> descriptorList = new List<UCIDataDescriptor>();
    49       descriptorList.Add(new Iris());
    50       descriptorList.Add(new Mammography());
    51       descriptorList.Add(new Parkinsons());
    52       descriptorList.Add(new Thyroid());
    53       descriptorList.Add(new Vertebral_3C());
    54       descriptorList.Add(new Wine());
    55       descriptorList.Add(new WisconsinDiagnosticBreastCancer());
    56       var solutionsArchiveName = GetResourceName(FileName + @"\.zip");
    57       if (!String.IsNullOrEmpty(solutionsArchiveName)) {
    58         using (var solutionsZipFile = new ZipArchive(GetType().Assembly.GetManifestResourceStream(solutionsArchiveName), ZipArchiveMode.Read)) {
    59           IList<string> entries = new List<string>();
    60           foreach (var curEntry in solutionsZipFile.Entries) {
    61             entries.Add(curEntry.Name);
    62           }
    63           foreach (var entry in entries.OrderBy(x => x)) {
    64             string filename = Path.GetFileNameWithoutExtension(entry);
    65             UCIDataDescriptor desc = descriptorList.Where(x => x.Filename.Equals(filename)).FirstOrDefault();
    66             if (desc != null) {
    67               desc.ResourceName = entry;
    68               yield return desc;
    69             } else
    70               throw new ArgumentNullException("No Descriptor could be found for this entry.");
    71           }
     54        if (trainingEntry == null || testEntry == null) {
     55          throw new InvalidOperationException("The training or test entry could not be found in the archive.");
     56        }
     57
     58        using (var trainingReader = new StreamReader(trainingEntry.Open()))
     59        using (var testReader = new StreamReader(testEntry.Open())) {
     60          ParseMetadata(trainingReader, out var inputVariables, out string targetVariable);
     61          ParseMetadata(testReader, out _, out _); // ignore outputs
     62
     63          // Read data
     64          var inputsData = new List<DoubleVector>[inputVariables.Count];
     65          for (int i = 0; i < inputsData.Length; i++) inputsData[i] = new List<DoubleVector>();
     66          var targetData = new List<double>();
     67          ReadData(trainingReader, inputsData, targetData, out int numTrainingRows);
     68          ReadData(testReader, inputsData, targetData, out int numTestRows);
     69
     70          // Build dataset
     71          var dataset = new Dataset(
     72            inputVariables.Concat(new[] { targetVariable }),
     73            inputsData.Cast<IList>().Concat(new[] { targetData })
     74          );
     75          Debug.Assert(dataset.Rows == numTrainingRows + numTestRows);
     76          Debug.Assert(dataset.Columns == inputVariables.Count + 1);
     77
     78          // Build problem data
     79          var problemData = new ClassificationProblemData(dataset, inputVariables, targetVariable) {
     80            Name = descriptor.Name
     81          };
     82          problemData.TrainingPartition.Start = 0;
     83          problemData.TrainingPartition.End = numTrainingRows;
     84          problemData.TestPartition.Start = numTrainingRows;
     85          problemData.TestPartition.End = numTrainingRows + numTestRows;
     86
     87          return problemData;
    7288        }
    7389      }
    7490    }
     91
     92    private static void ParseMetadata(StreamReader reader, out List<string> inputVariables, out string targetVariable) {
     93      int nrOfInputs = 0;
     94      bool dataStart = false;
     95      while (!reader.EndOfStream && !dataStart) {
     96        var line = reader.ReadLine();
     97        if (line.StartsWith("#")) {
     98          // Comment
     99        } else if (line.StartsWith("@")) {
     100          var splits = line.Split(' ');
     101          var type = splits.First();
     102          var arguments = splits.Skip(1).ToList();
     103          switch (type.ToLowerInvariant()) {
     104            case "@univariate":
     105              bool univariate = bool.Parse(arguments[0]);
     106              if (univariate)
     107                nrOfInputs = 1;
     108              break;
     109            case "@dimensions":
     110              int dimensions = int.Parse(arguments[0]);
     111              nrOfInputs = dimensions;
     112              break;
     113            case "@data":
     114              dataStart = true;
     115              break;
     116          }
     117        } else {
     118          throw new InvalidOperationException("A data section already occurred within metadata section.");
     119        }
     120      }
     121
     122      int digits = Math.Max((int)Math.Log10(nrOfInputs - 1) + 1, 1);
     123      inputVariables = Enumerable.Range(0, nrOfInputs)
     124        .Select(i => "X" + i.ToString("D" + digits))
     125        .ToList();
     126
     127      targetVariable = "Y";
     128    }
     129
     130    private static void ReadData(StreamReader reader, List<DoubleVector>[] inputsData, List<double> targetData, out int count) {
     131      count = 0;
     132      while (!reader.EndOfStream) {
     133        var line = reader.ReadLine();
     134        var variables = line.Split(':');
     135
     136        // parse all except last, which is the non-vector target
     137        for (int i = 0; i < variables.Length - 1; i++) {
     138          var variable = variables[i];
     139          var numbers = variable
     140            .Split(',')
     141            .Select(d => double.Parse(d, CultureInfo.InvariantCulture))
     142            .ToList();
     143          inputsData[i].Add(new DoubleVector(numbers));
     144        }
     145
     146        var target = double.Parse(variables[variables.Length - 1], CultureInfo.InvariantCulture);
     147        targetData.Add(target);
     148
     149        count++;
     150      }
     151    }
     152
     153    public override IEnumerable<IDataDescriptor> GetDataDescriptors() {
     154      using (var instancesZipFile = OpenZipArchive()) {
     155        var instances = GroupEntriesByInstance(instancesZipFile.Entries);
     156        var descriptors = instances.Select(instance => CreateDescriptor(instance.Key, instance.Value));
     157
     158        return descriptors.ToList();
     159      }
     160    }
     161
     162    private ZipArchive OpenZipArchive() {
     163      var instanceArchiveName = GetResourceName(FileName + @"\.zip");
     164      return new ZipArchive(GetType().Assembly.GetManifestResourceStream(instanceArchiveName), ZipArchiveMode.Read);
     165    }
     166
     167    private static IDictionary<string, List<ZipArchiveEntry>> GroupEntriesByInstance(ReadOnlyCollection<ZipArchiveEntry> entries) {
     168      var topLevelEntries = entries.Where(entry => string.IsNullOrEmpty(entry.Name)).ToList();
     169
     170      return topLevelEntries.ToDictionary(
     171        entry => Path.GetDirectoryName(entry.FullName),
     172        entry => entries.Except(topLevelEntries).Where(subEntry => subEntry.FullName.StartsWith(entry.FullName)).ToList());
     173    }
     174
     175    private static TimeSeriesDataDescriptor CreateDescriptor(string name, List<ZipArchiveEntry> subEntries) {
     176      var trainingEntry = subEntries.Single(entry => entry.Name.EndsWith("_TRAIN.ts"));
     177      var testEntry = subEntries.Single(entry => entry.Name.EndsWith("_TEST.ts"));
     178      return new TimeSeriesDataDescriptor(name, trainingEntry.FullName, testEntry.FullName);
     179    }
    75180  }
    76181}
  • branches/3040_VectorBasedGP/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Classification/TimeSeries/TimeSeriesMultivariateInstanceProvider.cs

    r17393 r17401  
    2020#endregion
    2121
    22 using System;
    23 using System.Collections.Generic;
    24 using System.IO;
    25 using System.IO.Compression;
    26 using System.Linq;
    2722
    2823namespace HeuristicLab.Problems.Instances.DataAnalysis {
    29   public class UCIInstanceProvider : ResourceClassificationInstanceProvider {
     24  public class TimeSeriesMultivariateInstanceProvider : TimeSeriesInstanceProvider {
    3025    public override string Name {
    31       get { return "UCI Problems"; }
    32     }
    33     public override string Description {
    34       get {
    35         return "";
    36       }
    37     }
    38     public override Uri WebLink {
    39       get { return new Uri("http://archive.ics.uci.edu/ml/datasets.html"); }
    40     }
    41     public override string ReferencePublication {
    42       get { return ""; }
     26      get { return "TimeSeries (Multivariate)"; }
    4327    }
    4428
    45     protected override string FileName { get { return "UCI"; } }
    46 
    47     public override IEnumerable<IDataDescriptor> GetDataDescriptors() {
    48       List<UCIDataDescriptor> descriptorList = new List<UCIDataDescriptor>();
    49       descriptorList.Add(new Iris());
    50       descriptorList.Add(new Mammography());
    51       descriptorList.Add(new Parkinsons());
    52       descriptorList.Add(new Thyroid());
    53       descriptorList.Add(new Vertebral_3C());
    54       descriptorList.Add(new Wine());
    55       descriptorList.Add(new WisconsinDiagnosticBreastCancer());
    56       var solutionsArchiveName = GetResourceName(FileName + @"\.zip");
    57       if (!String.IsNullOrEmpty(solutionsArchiveName)) {
    58         using (var solutionsZipFile = new ZipArchive(GetType().Assembly.GetManifestResourceStream(solutionsArchiveName), ZipArchiveMode.Read)) {
    59           IList<string> entries = new List<string>();
    60           foreach (var curEntry in solutionsZipFile.Entries) {
    61             entries.Add(curEntry.Name);
    62           }
    63           foreach (var entry in entries.OrderBy(x => x)) {
    64             string filename = Path.GetFileNameWithoutExtension(entry);
    65             UCIDataDescriptor desc = descriptorList.Where(x => x.Filename.Equals(filename)).FirstOrDefault();
    66             if (desc != null) {
    67               desc.ResourceName = entry;
    68               yield return desc;
    69             } else
    70               throw new ArgumentNullException("No Descriptor could be found for this entry.");
    71           }
    72         }
    73       }
    74     }
     29    protected override string FileName { get { return "TimeSeriesMultivariate"; } }
    7530  }
    7631}
  • branches/3040_VectorBasedGP/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Classification/TimeSeries/TimeSeriesUnivariateInstanceProvider.cs

    r17393 r17401  
    2020#endregion
    2121
    22 using System;
    23 using System.Collections.Generic;
    24 using System.IO;
    25 using System.IO.Compression;
    26 using System.Linq;
    2722
    2823namespace HeuristicLab.Problems.Instances.DataAnalysis {
    29   public class UCIInstanceProvider : ResourceClassificationInstanceProvider {
     24  public class TimeSeriesUnivariateInstanceProvider : TimeSeriesInstanceProvider {
    3025    public override string Name {
    31       get { return "UCI Problems"; }
    32     }
    33     public override string Description {
    34       get {
    35         return "";
    36       }
    37     }
    38     public override Uri WebLink {
    39       get { return new Uri("http://archive.ics.uci.edu/ml/datasets.html"); }
    40     }
    41     public override string ReferencePublication {
    42       get { return ""; }
     26      get { return "TimeSeries (Univariate)"; }
    4327    }
    4428
    45     protected override string FileName { get { return "UCI"; } }
    46 
    47     public override IEnumerable<IDataDescriptor> GetDataDescriptors() {
    48       List<UCIDataDescriptor> descriptorList = new List<UCIDataDescriptor>();
    49       descriptorList.Add(new Iris());
    50       descriptorList.Add(new Mammography());
    51       descriptorList.Add(new Parkinsons());
    52       descriptorList.Add(new Thyroid());
    53       descriptorList.Add(new Vertebral_3C());
    54       descriptorList.Add(new Wine());
    55       descriptorList.Add(new WisconsinDiagnosticBreastCancer());
    56       var solutionsArchiveName = GetResourceName(FileName + @"\.zip");
    57       if (!String.IsNullOrEmpty(solutionsArchiveName)) {
    58         using (var solutionsZipFile = new ZipArchive(GetType().Assembly.GetManifestResourceStream(solutionsArchiveName), ZipArchiveMode.Read)) {
    59           IList<string> entries = new List<string>();
    60           foreach (var curEntry in solutionsZipFile.Entries) {
    61             entries.Add(curEntry.Name);
    62           }
    63           foreach (var entry in entries.OrderBy(x => x)) {
    64             string filename = Path.GetFileNameWithoutExtension(entry);
    65             UCIDataDescriptor desc = descriptorList.Where(x => x.Filename.Equals(filename)).FirstOrDefault();
    66             if (desc != null) {
    67               desc.ResourceName = entry;
    68               yield return desc;
    69             } else
    70               throw new ArgumentNullException("No Descriptor could be found for this entry.");
    71           }
    72         }
    73       }
    74     }
     29    protected override string FileName { get { return "TimeSeriesUnivariate"; } }
    7530  }
    7631}
Note: See TracChangeset for help on using the changeset viewer.