Changeset 17401 for branches/3040_VectorBasedGP/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Classification
- Timestamp:
- 01/16/20 12:34:45 (5 years ago)
- Location:
- branches/3040_VectorBasedGP/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Classification/TimeSeries
- Files:
-
- 1 added
- 4 copied
Legend:
- Unmodified
- Added
- Removed
-
branches/3040_VectorBasedGP/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Classification/TimeSeries/TimeSeriesDataDescriptor.cs
r17393 r17401 20 20 #endregion 21 21 22 using System; 22 namespace HeuristicLab.Problems.Instances.DataAnalysis { 23 public class TimeSeriesDataDescriptor : IDataDescriptor { 24 public string Name { get; } 25 public string Description { get { return string.Empty; } } 23 26 24 namespace HeuristicLab.Problems.Instances.DataAnalysis { 25 public abstract class UCIDataDescriptor : ResourceClassificationDataDescriptor { 26 public override string Name { get { return String.Format("{0}, {1}, {2}", Filename, Donor, Year); } } 27 public abstract string Filename { get; } 28 public abstract string Donor { get; } 29 public abstract int Year { get; } 27 internal string TrainingEntryName { get; } 28 internal string TestEntryName { get; } 29 30 public TimeSeriesDataDescriptor(string name, string trainingEntryName, string testEntryName) { 31 Name = name; 32 TrainingEntryName = trainingEntryName; 33 TestEntryName = testEntryName; 34 } 30 35 } 31 36 } -
branches/3040_VectorBasedGP/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Classification/TimeSeries/TimeSeriesInstanceProvider.cs
r17393 r17401 21 21 22 22 using System; 23 using System.Collections; 23 24 using System.Collections.Generic; 25 using System.Collections.ObjectModel; 26 using System.Diagnostics; 27 using System.Globalization; 24 28 using System.IO; 25 29 using System.IO.Compression; 26 30 using System.Linq; 31 using HeuristicLab.Problems.DataAnalysis; 27 32 28 33 namespace HeuristicLab.Problems.Instances.DataAnalysis { 29 public class UCIInstanceProvider : ResourceClassificationInstanceProvider {30 public override string Name {31 get { return "UCIProblems"; }32 }34 public abstract class TimeSeriesInstanceProvider : ResourceClassificationInstanceProvider { 35 //public override string Name { 36 // get { return "TimeSeries (Univariate) Problems"; } 37 //} 33 38 public override string Description { 34 get { 35 return ""; 36 } 39 get { return "UEA & UCR TimeSeries Problems"; } 37 40 } 38 41 public override Uri WebLink { 39 get { return new Uri("http:// archive.ics.uci.edu/ml/datasets.html"); }42 get { return new Uri("http://www.timeseriesclassification.com/"); } 40 43 } 41 44 public override string ReferencePublication { 42 get { return " "; }45 get { return "Anthony Bagnall, Jason Lines, William Vickers and Eamonn Keogh, The UEA & UCR Time Series Classification Repository, www.timeseriesclassification.com"; } 43 46 } 44 47 45 protected override string FileName { get { return "UCI"; } } 48 public override IClassificationProblemData LoadData(IDataDescriptor id) { 49 var descriptor = (TimeSeriesDataDescriptor)id; 50 using (var instancesZipFile = OpenZipArchive()) { 51 var trainingEntry = instancesZipFile.GetEntry(descriptor.TrainingEntryName); 52 var testEntry = instancesZipFile.GetEntry(descriptor.TestEntryName); 46 53 47 public override IEnumerable<IDataDescriptor> GetDataDescriptors() { 48 List<UCIDataDescriptor> descriptorList = new List<UCIDataDescriptor>(); 49 descriptorList.Add(new Iris()); 50 descriptorList.Add(new Mammography()); 51 descriptorList.Add(new Parkinsons()); 52 descriptorList.Add(new Thyroid()); 53 descriptorList.Add(new Vertebral_3C()); 54 descriptorList.Add(new Wine()); 55 descriptorList.Add(new WisconsinDiagnosticBreastCancer()); 56 var solutionsArchiveName = GetResourceName(FileName + @"\.zip"); 57 if (!String.IsNullOrEmpty(solutionsArchiveName)) { 58 using (var solutionsZipFile = new ZipArchive(GetType().Assembly.GetManifestResourceStream(solutionsArchiveName), ZipArchiveMode.Read)) { 59 IList<string> entries = new List<string>(); 60 foreach (var curEntry in solutionsZipFile.Entries) { 61 entries.Add(curEntry.Name); 62 } 63 foreach (var entry in entries.OrderBy(x => x)) { 64 string filename = Path.GetFileNameWithoutExtension(entry); 65 UCIDataDescriptor desc = descriptorList.Where(x => x.Filename.Equals(filename)).FirstOrDefault(); 66 if (desc != null) { 67 desc.ResourceName = entry; 68 yield return desc; 69 } else 70 throw new ArgumentNullException("No Descriptor could be found for this entry."); 71 } 54 if (trainingEntry == null || testEntry == null) { 55 throw new InvalidOperationException("The training or test entry could not be found in the archive."); 56 } 57 58 using (var trainingReader = new StreamReader(trainingEntry.Open())) 59 using (var testReader = new StreamReader(testEntry.Open())) { 60 ParseMetadata(trainingReader, out var inputVariables, out string targetVariable); 61 ParseMetadata(testReader, out _, out _); // ignore outputs 62 63 // Read data 64 var inputsData = new List<DoubleVector>[inputVariables.Count]; 65 for (int i = 0; i < inputsData.Length; i++) inputsData[i] = new List<DoubleVector>(); 66 var targetData = new List<double>(); 67 ReadData(trainingReader, inputsData, targetData, out int numTrainingRows); 68 ReadData(testReader, inputsData, targetData, out int numTestRows); 69 70 // Build dataset 71 var dataset = new Dataset( 72 inputVariables.Concat(new[] { targetVariable }), 73 inputsData.Cast<IList>().Concat(new[] { targetData }) 74 ); 75 Debug.Assert(dataset.Rows == numTrainingRows + numTestRows); 76 Debug.Assert(dataset.Columns == inputVariables.Count + 1); 77 78 // Build problem data 79 var problemData = new ClassificationProblemData(dataset, inputVariables, targetVariable) { 80 Name = descriptor.Name 81 }; 82 problemData.TrainingPartition.Start = 0; 83 problemData.TrainingPartition.End = numTrainingRows; 84 problemData.TestPartition.Start = numTrainingRows; 85 problemData.TestPartition.End = numTrainingRows + numTestRows; 86 87 return problemData; 72 88 } 73 89 } 74 90 } 91 92 private static void ParseMetadata(StreamReader reader, out List<string> inputVariables, out string targetVariable) { 93 int nrOfInputs = 0; 94 bool dataStart = false; 95 while (!reader.EndOfStream && !dataStart) { 96 var line = reader.ReadLine(); 97 if (line.StartsWith("#")) { 98 // Comment 99 } else if (line.StartsWith("@")) { 100 var splits = line.Split(' '); 101 var type = splits.First(); 102 var arguments = splits.Skip(1).ToList(); 103 switch (type.ToLowerInvariant()) { 104 case "@univariate": 105 bool univariate = bool.Parse(arguments[0]); 106 if (univariate) 107 nrOfInputs = 1; 108 break; 109 case "@dimensions": 110 int dimensions = int.Parse(arguments[0]); 111 nrOfInputs = dimensions; 112 break; 113 case "@data": 114 dataStart = true; 115 break; 116 } 117 } else { 118 throw new InvalidOperationException("A data section already occurred within metadata section."); 119 } 120 } 121 122 int digits = Math.Max((int)Math.Log10(nrOfInputs - 1) + 1, 1); 123 inputVariables = Enumerable.Range(0, nrOfInputs) 124 .Select(i => "X" + i.ToString("D" + digits)) 125 .ToList(); 126 127 targetVariable = "Y"; 128 } 129 130 private static void ReadData(StreamReader reader, List<DoubleVector>[] inputsData, List<double> targetData, out int count) { 131 count = 0; 132 while (!reader.EndOfStream) { 133 var line = reader.ReadLine(); 134 var variables = line.Split(':'); 135 136 // parse all except last, which is the non-vector target 137 for (int i = 0; i < variables.Length - 1; i++) { 138 var variable = variables[i]; 139 var numbers = variable 140 .Split(',') 141 .Select(d => double.Parse(d, CultureInfo.InvariantCulture)) 142 .ToList(); 143 inputsData[i].Add(new DoubleVector(numbers)); 144 } 145 146 var target = double.Parse(variables[variables.Length - 1], CultureInfo.InvariantCulture); 147 targetData.Add(target); 148 149 count++; 150 } 151 } 152 153 public override IEnumerable<IDataDescriptor> GetDataDescriptors() { 154 using (var instancesZipFile = OpenZipArchive()) { 155 var instances = GroupEntriesByInstance(instancesZipFile.Entries); 156 var descriptors = instances.Select(instance => CreateDescriptor(instance.Key, instance.Value)); 157 158 return descriptors.ToList(); 159 } 160 } 161 162 private ZipArchive OpenZipArchive() { 163 var instanceArchiveName = GetResourceName(FileName + @"\.zip"); 164 return new ZipArchive(GetType().Assembly.GetManifestResourceStream(instanceArchiveName), ZipArchiveMode.Read); 165 } 166 167 private static IDictionary<string, List<ZipArchiveEntry>> GroupEntriesByInstance(ReadOnlyCollection<ZipArchiveEntry> entries) { 168 var topLevelEntries = entries.Where(entry => string.IsNullOrEmpty(entry.Name)).ToList(); 169 170 return topLevelEntries.ToDictionary( 171 entry => Path.GetDirectoryName(entry.FullName), 172 entry => entries.Except(topLevelEntries).Where(subEntry => subEntry.FullName.StartsWith(entry.FullName)).ToList()); 173 } 174 175 private static TimeSeriesDataDescriptor CreateDescriptor(string name, List<ZipArchiveEntry> subEntries) { 176 var trainingEntry = subEntries.Single(entry => entry.Name.EndsWith("_TRAIN.ts")); 177 var testEntry = subEntries.Single(entry => entry.Name.EndsWith("_TEST.ts")); 178 return new TimeSeriesDataDescriptor(name, trainingEntry.FullName, testEntry.FullName); 179 } 75 180 } 76 181 } -
branches/3040_VectorBasedGP/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Classification/TimeSeries/TimeSeriesMultivariateInstanceProvider.cs
r17393 r17401 20 20 #endregion 21 21 22 using System;23 using System.Collections.Generic;24 using System.IO;25 using System.IO.Compression;26 using System.Linq;27 22 28 23 namespace HeuristicLab.Problems.Instances.DataAnalysis { 29 public class UCIInstanceProvider : ResourceClassificationInstanceProvider {24 public class TimeSeriesMultivariateInstanceProvider : TimeSeriesInstanceProvider { 30 25 public override string Name { 31 get { return "UCI Problems"; } 32 } 33 public override string Description { 34 get { 35 return ""; 36 } 37 } 38 public override Uri WebLink { 39 get { return new Uri("http://archive.ics.uci.edu/ml/datasets.html"); } 40 } 41 public override string ReferencePublication { 42 get { return ""; } 26 get { return "TimeSeries (Multivariate)"; } 43 27 } 44 28 45 protected override string FileName { get { return "UCI"; } } 46 47 public override IEnumerable<IDataDescriptor> GetDataDescriptors() { 48 List<UCIDataDescriptor> descriptorList = new List<UCIDataDescriptor>(); 49 descriptorList.Add(new Iris()); 50 descriptorList.Add(new Mammography()); 51 descriptorList.Add(new Parkinsons()); 52 descriptorList.Add(new Thyroid()); 53 descriptorList.Add(new Vertebral_3C()); 54 descriptorList.Add(new Wine()); 55 descriptorList.Add(new WisconsinDiagnosticBreastCancer()); 56 var solutionsArchiveName = GetResourceName(FileName + @"\.zip"); 57 if (!String.IsNullOrEmpty(solutionsArchiveName)) { 58 using (var solutionsZipFile = new ZipArchive(GetType().Assembly.GetManifestResourceStream(solutionsArchiveName), ZipArchiveMode.Read)) { 59 IList<string> entries = new List<string>(); 60 foreach (var curEntry in solutionsZipFile.Entries) { 61 entries.Add(curEntry.Name); 62 } 63 foreach (var entry in entries.OrderBy(x => x)) { 64 string filename = Path.GetFileNameWithoutExtension(entry); 65 UCIDataDescriptor desc = descriptorList.Where(x => x.Filename.Equals(filename)).FirstOrDefault(); 66 if (desc != null) { 67 desc.ResourceName = entry; 68 yield return desc; 69 } else 70 throw new ArgumentNullException("No Descriptor could be found for this entry."); 71 } 72 } 73 } 74 } 29 protected override string FileName { get { return "TimeSeriesMultivariate"; } } 75 30 } 76 31 } -
branches/3040_VectorBasedGP/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Classification/TimeSeries/TimeSeriesUnivariateInstanceProvider.cs
r17393 r17401 20 20 #endregion 21 21 22 using System;23 using System.Collections.Generic;24 using System.IO;25 using System.IO.Compression;26 using System.Linq;27 22 28 23 namespace HeuristicLab.Problems.Instances.DataAnalysis { 29 public class UCIInstanceProvider : ResourceClassificationInstanceProvider {24 public class TimeSeriesUnivariateInstanceProvider : TimeSeriesInstanceProvider { 30 25 public override string Name { 31 get { return "UCI Problems"; } 32 } 33 public override string Description { 34 get { 35 return ""; 36 } 37 } 38 public override Uri WebLink { 39 get { return new Uri("http://archive.ics.uci.edu/ml/datasets.html"); } 40 } 41 public override string ReferencePublication { 42 get { return ""; } 26 get { return "TimeSeries (Univariate)"; } 43 27 } 44 28 45 protected override string FileName { get { return "UCI"; } } 46 47 public override IEnumerable<IDataDescriptor> GetDataDescriptors() { 48 List<UCIDataDescriptor> descriptorList = new List<UCIDataDescriptor>(); 49 descriptorList.Add(new Iris()); 50 descriptorList.Add(new Mammography()); 51 descriptorList.Add(new Parkinsons()); 52 descriptorList.Add(new Thyroid()); 53 descriptorList.Add(new Vertebral_3C()); 54 descriptorList.Add(new Wine()); 55 descriptorList.Add(new WisconsinDiagnosticBreastCancer()); 56 var solutionsArchiveName = GetResourceName(FileName + @"\.zip"); 57 if (!String.IsNullOrEmpty(solutionsArchiveName)) { 58 using (var solutionsZipFile = new ZipArchive(GetType().Assembly.GetManifestResourceStream(solutionsArchiveName), ZipArchiveMode.Read)) { 59 IList<string> entries = new List<string>(); 60 foreach (var curEntry in solutionsZipFile.Entries) { 61 entries.Add(curEntry.Name); 62 } 63 foreach (var entry in entries.OrderBy(x => x)) { 64 string filename = Path.GetFileNameWithoutExtension(entry); 65 UCIDataDescriptor desc = descriptorList.Where(x => x.Filename.Equals(filename)).FirstOrDefault(); 66 if (desc != null) { 67 desc.ResourceName = entry; 68 yield return desc; 69 } else 70 throw new ArgumentNullException("No Descriptor could be found for this entry."); 71 } 72 } 73 } 74 } 29 protected override string FileName { get { return "TimeSeriesUnivariate"; } } 75 30 } 76 31 }
Note: See TracChangeset
for help on using the changeset viewer.