- Timestamp:
- 11/07/12 16:28:33 (11 years ago)
- Location:
- trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis
- Files:
- 9 edited
- 3 copied
Legend:
- Unmodified
- Added
- Removed
-
trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis
- Property svn:mergeinfo changed
-
trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Classification/CSV/ClassifiactionCSVInstanceProvider.cs
r8693 r8877 23 23 using System.Collections; 24 24 using System.Collections.Generic; 25 using System.Globalization;26 25 using System.IO; 27 26 using System.Linq; 28 using System.Text;29 27 using HeuristicLab.Common; 30 28 using HeuristicLab.Problems.DataAnalysis; … … 76 74 } 77 75 } else { 78 allowedInputVars.AddRange(dataset.DoubleVariables.Where(x => x.Equals(targetVar)));76 allowedInputVars.AddRange(dataset.DoubleVariables.Where(x => !x.Equals(targetVar))); 79 77 } 80 78 … … 92 90 } 93 91 94 public override IClassificationProblemData ImportData(string path, DataAnalysisImportType type) { 95 TableFileParser csvFileParser = new TableFileParser(); 96 csvFileParser.Parse(path); 97 92 protected override IClassificationProblemData ImportData(string path, ClassificationImportType type, TableFileParser csvFileParser) { 98 93 int trainingPartEnd = (csvFileParser.Rows * type.Training) / 100; 99 94 List<IList> values = csvFileParser.Values; 100 95 if (type.Shuffle) { 101 values = Shuffle(values); 96 values = Shuffle(values, csvFileParser.VariableNames.ToList().FindIndex(x => x.Equals(type.TargetVariable)), 97 type.Training, out trainingPartEnd); 102 98 } 103 99 104 100 Dataset dataset = new Dataset(csvFileParser.VariableNames, values); 105 string targetVar = dataset.DoubleVariables.Last();106 101 107 102 // turn of input variables that are constant in the training partition 108 103 var allowedInputVars = new List<string>(); 109 104 var trainingIndizes = Enumerable.Range(0, trainingPartEnd); 110 foreach (var variableName in dataset.DoubleVariables) { 111 if (trainingIndizes.Count() >= 2 && dataset.GetDoubleValues(variableName, trainingIndizes).Range() > 0 && 112 variableName != targetVar) 113 allowedInputVars.Add(variableName); 105 if (trainingIndizes.Count() >= 2) { 106 foreach (var variableName in dataset.DoubleVariables) { 107 if (dataset.GetDoubleValues(variableName, trainingIndizes).Range() > 0 && 108 variableName != type.TargetVariable) 109 
allowedInputVars.Add(variableName); 110 } 111 } else { 112 allowedInputVars.AddRange(dataset.DoubleVariables.Where(x => !x.Equals(type.TargetVariable))); 114 113 } 115 114 116 ClassificationProblemData classificationData = new ClassificationProblemData(dataset, allowedInputVars, t argetVar);115 ClassificationProblemData classificationData = new ClassificationProblemData(dataset, allowedInputVars, type.TargetVariable); 117 116 118 117 classificationData.TrainingPartition.Start = 0; … … 126 125 } 127 126 128 public override bool CanExportData { 129 get { return true; } 130 } 131 public override void ExportData(IClassificationProblemData instance, string path) { 132 var strBuilder = new StringBuilder(); 133 var colSep = CultureInfo.CurrentCulture.TextInfo.ListSeparator; 134 foreach (var variable in instance.Dataset.VariableNames) { 135 strBuilder.Append(variable.Replace(colSep, String.Empty) + colSep); 136 } 137 strBuilder.Remove(strBuilder.Length - colSep.Length, colSep.Length); 138 strBuilder.AppendLine(); 139 140 var dataset = instance.Dataset; 141 142 for (int i = 0; i < dataset.Rows; i++) { 143 for (int j = 0; j < dataset.Columns; j++) { 144 if (j > 0) strBuilder.Append(colSep); 145 strBuilder.Append(dataset.GetValue(i, j)); 146 } 147 strBuilder.AppendLine(); 127 protected List<IList> Shuffle(List<IList> values, int target, int trainingPercentage, out int trainingPartEnd) { 128 IList targetValues = values[target]; 129 var group = targetValues.Cast<double>().GroupBy(x => x).Select(g => new { Key = g.Key, Count = g.Count() }).ToList(); 130 Dictionary<double, double> taken = new Dictionary<double, double>(); 131 foreach (var classCount in group) { 132 taken[classCount.Key] = (classCount.Count * trainingPercentage) / 100.0; 148 133 } 149 134 150 using (var writer = new StreamWriter(path)) { 151 writer.Write(strBuilder); 135 List<IList> training = GetListOfIListCopy(values); 136 List<IList> test = GetListOfIListCopy(values); 137 138 for (int i = 0; i < 
targetValues.Count; i++) { 139 if (taken[(double)targetValues[i]] > 0) { 140 AddRow(training, values, i); 141 taken[(double)targetValues[i]]--; 142 } else { 143 AddRow(test, values, i); 144 } 152 145 } 146 147 trainingPartEnd = training.First().Count; 148 149 training = Shuffle(training); 150 test = Shuffle(test); 151 for (int i = 0; i < training.Count; i++) { 152 for (int j = 0; j < test[i].Count; j++) { 153 training[i].Add(test[i][j]); 154 } 155 } 156 157 return training; 158 } 159 160 private void AddRow(List<IList> destination, List<IList> source, int index) { 161 for (int i = 0; i < source.Count; i++) { 162 destination[i].Add(source[i][index]); 163 } 164 } 165 166 private List<IList> GetListOfIListCopy(List<IList> values) { 167 List<IList> newList = new List<IList>(values.Count); 168 foreach (IList t in values) { 169 if (t is List<double>) 170 newList.Add(new List<double>()); 171 else if (t is List<DateTime>) 172 newList.Add(new List<DateTime>()); 173 else if (t is List<string>) 174 newList.Add(new List<string>()); 175 else 176 throw new InvalidOperationException(); 177 } 178 return newList; 153 179 } 154 180 } -
trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Classification/ClassificationInstanceProvider.cs
r8598 r8877 23 23 24 24 namespace HeuristicLab.Problems.Instances.DataAnalysis { 25 public abstract class ClassificationInstanceProvider : DataAnalysisInstanceProvider<IClassificationProblemData > {25 public abstract class ClassificationInstanceProvider : DataAnalysisInstanceProvider<IClassificationProblemData, ClassificationImportType> { 26 26 } 27 27 } -
trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Clustering/CSV/ClusteringCSVInstanceProvider.cs
r8685 r8877 23 23 using System.Collections; 24 24 using System.Collections.Generic; 25 using System.Globalization;26 25 using System.IO; 27 26 using System.Linq; 28 using System.Text;29 27 using HeuristicLab.Common; 30 28 using HeuristicLab.Problems.DataAnalysis; … … 75 73 } 76 74 } else { 77 allowedInputVars.AddRange(dataset.DoubleVariables.Where(x => x.Equals(targetVar)));75 allowedInputVars.AddRange(dataset.DoubleVariables.Where(x => !x.Equals(targetVar))); 78 76 } 79 77 … … 91 89 } 92 90 93 public override IClusteringProblemData ImportData(string path, DataAnalysisImportType type) { 94 TableFileParser csvFileParser = new TableFileParser(); 95 csvFileParser.Parse(path); 96 91 protected override IClusteringProblemData ImportData(string path, DataAnalysisImportType type, TableFileParser csvFileParser) { 97 92 List<IList> values = csvFileParser.Values; 98 93 if (type.Shuffle) { … … 107 102 int trainingPartEnd = (csvFileParser.Rows * type.Training) / 100; 108 103 var trainingIndizes = Enumerable.Range(0, trainingPartEnd); 109 foreach (var variableName in dataset.DoubleVariables) { 110 if (trainingIndizes.Count() >= 2 && dataset.GetDoubleValues(variableName, trainingIndizes).Range() > 0 && 111 variableName != targetVar) 112 allowedInputVars.Add(variableName); 104 if (trainingIndizes.Count() >= 2) { 105 foreach (var variableName in dataset.DoubleVariables) { 106 if (dataset.GetDoubleValues(variableName, trainingIndizes).Range() > 0 && 107 variableName != targetVar) 108 allowedInputVars.Add(variableName); 109 } 110 } else { 111 allowedInputVars.AddRange(dataset.DoubleVariables.Where(x => !x.Equals(targetVar))); 113 112 } 114 113 … … 124 123 return clusteringData; 125 124 } 126 127 public override bool CanExportData {128 get { return true; }129 }130 public override void ExportData(IClusteringProblemData instance, string path) {131 var strBuilder = new StringBuilder();132 var colSep = CultureInfo.CurrentCulture.TextInfo.ListSeparator;133 foreach (var variable in 
instance.Dataset.VariableNames) {134 strBuilder.Append(variable.Replace(colSep, String.Empty) + colSep);135 }136 strBuilder.Remove(strBuilder.Length - colSep.Length, colSep.Length);137 strBuilder.AppendLine();138 139 var dataset = instance.Dataset;140 141 for (int i = 0; i < dataset.Rows; i++) {142 for (int j = 0; j < dataset.Columns; j++) {143 if (j > 0) strBuilder.Append(colSep);144 strBuilder.Append(dataset.GetValue(i, j));145 }146 strBuilder.AppendLine();147 }148 149 using (var writer = new StreamWriter(path)) {150 writer.Write(strBuilder);151 }152 }153 125 } 154 126 } -
trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Clustering/ClusteringInstanceProvider.cs
r8598 r8877 23 23 24 24 namespace HeuristicLab.Problems.Instances.DataAnalysis { 25 public abstract class ClusteringInstanceProvider : DataAnalysisInstanceProvider<IClusteringProblemData > {25 public abstract class ClusteringInstanceProvider : DataAnalysisInstanceProvider<IClusteringProblemData, DataAnalysisImportType> { 26 26 } 27 27 } -
trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/DataAnalysisInstanceProvider.cs
r8598 r8877 23 23 using System.Collections; 24 24 using System.Collections.Generic; 25 using System.Globalization; 26 using System.IO; 25 27 using System.Linq; 28 using System.Text; 26 29 using HeuristicLab.Problems.DataAnalysis; 30 using HeuristicLab.Random; 27 31 28 32 namespace HeuristicLab.Problems.Instances.DataAnalysis { 29 public abstract class DataAnalysisInstanceProvider<TData> : ProblemInstanceProvider<TData> 30 where TData : class, IDataAnalysisProblemData { 33 public abstract class DataAnalysisInstanceProvider<TData, ImportType> : ProblemInstanceProvider<TData> 34 where TData : class, IDataAnalysisProblemData 35 where ImportType : DataAnalysisImportType { 31 36 32 37 // has to be implemented, if CanImportData is true 33 public virtual TData ImportData(string path, DataAnalysisImportType type) { 34 throw new NotSupportedException(); 38 public TData ImportData(string path, ImportType type) { 39 TableFileParser csvFileParser = new TableFileParser(); 40 csvFileParser.Parse(path); 41 return ImportData(path, type, csvFileParser); 35 42 } 43 public TData ImportData(string path, ImportType type, DataAnalysisCSVFormat csvFormat) { 44 TableFileParser csvFileParser = new TableFileParser(); 45 csvFileParser.Parse(path, csvFormat.NumberFormatInfo, csvFormat.DateTimeFormatInfo, csvFormat.Separator); 46 return ImportData(path, type, csvFileParser); 47 } 48 49 protected abstract TData ImportData(string path, ImportType type, TableFileParser csvFileParser); 36 50 37 51 protected List<IList> Shuffle(List<IList> values) { 38 52 int count = values.First().Count; 39 int[] indices = GetRandomIndices(count);53 int[] indices = Enumerable.Range(0, count).Shuffle(new FastRandom()).ToArray(); 40 54 List<IList> shuffeledValues = new List<IList>(values.Count); 41 55 for (int col = 0; col < values.Count; col++) { … … 57 71 } 58 72 59 //Fisher–Yates shuffle 60 private int[] GetRandomIndices(int amount) { 61 int[] randomIndices = Enumerable.Range(0, amount).ToArray(); 62 System.Random 
rand = new System.Random(); 63 int n = amount; 64 while (n > 1) { 65 n--; 66 int k = rand.Next(n + 1); 67 int value = randomIndices[k]; 68 randomIndices[k] = randomIndices[n]; 69 randomIndices[n] = value; 73 public override bool CanExportData { 74 get { return true; } 75 } 76 public override void ExportData(TData instance, string path) { 77 var strBuilder = new StringBuilder(); 78 var colSep = CultureInfo.CurrentCulture.TextInfo.ListSeparator; 79 foreach (var variable in instance.Dataset.VariableNames) { 80 strBuilder.Append(variable.Replace(colSep, String.Empty) + colSep); 70 81 } 71 return randomIndices; 82 strBuilder.Remove(strBuilder.Length - colSep.Length, colSep.Length); 83 strBuilder.AppendLine(); 84 85 var dataset = instance.Dataset; 86 87 for (int i = 0; i < dataset.Rows; i++) { 88 for (int j = 0; j < dataset.Columns; j++) { 89 if (j > 0) strBuilder.Append(colSep); 90 strBuilder.Append(dataset.GetValue(i, j)); 91 } 92 strBuilder.AppendLine(); 93 } 94 95 using (var writer = new StreamWriter(path)) { 96 writer.Write(strBuilder); 97 } 72 98 } 73 99 } -
trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/HeuristicLab.Problems.Instances.DataAnalysis-3.3.csproj
r8841 r8877 115 115 <Compile Include="Classification\ArtificialClassificationDataDescriptor.cs" /> 116 116 <Compile Include="Classification\ArtificialClassificationInstanceProvider.cs" /> 117 <Compile Include="Classification\ClassificationImportType.cs" /> 117 118 <Compile Include="Classification\ClassificationInstanceProvider.cs" /> 118 119 <Compile Include="Classification\CSV\ClassifiactionCSVInstanceProvider.cs" /> … … 126 127 <Compile Include="Clustering\ClusteringInstanceProvider.cs" /> 127 128 <Compile Include="Clustering\CSV\ClusteringCSVInstanceProvider.cs" /> 129 <Compile Include="DataAnalysisCSVFormat.cs" /> 128 130 <Compile Include="DataAnalysisImportType.cs" /> 129 131 <Compile Include="DataAnalysisInstanceProvider.cs" /> … … 179 181 <Compile Include="Regression\Nguyen\NguyenInstanceProvider.cs" /> 180 182 <Compile Include="Regression\RealWorld\RegressionRealWorldInstanceProvider.cs" /> 183 <Compile Include="Regression\RegressionImportType.cs" /> 181 184 <Compile Include="Regression\RegressionInstanceProvider.cs" /> 182 185 <Compile Include="Regression\ResourceRegressionDataDescriptor.cs" /> -
trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Regression/CSV/RegressionCSVInstanceProvider.cs
r8685 r8877 23 23 using System.Collections; 24 24 using System.Collections.Generic; 25 using System.Globalization;26 25 using System.IO; 27 26 using System.Linq; 28 using System.Text;29 27 using HeuristicLab.Common; 30 28 using HeuristicLab.Problems.DataAnalysis; … … 67 65 var allowedInputVars = new List<string>(); 68 66 var trainingIndizes = Enumerable.Range(0, (csvFileParser.Rows * 2) / 3); 69 foreach (var variableName in dataset.DoubleVariables) { 70 if (trainingIndizes.Count() >= 2 && dataset.GetDoubleValues(variableName, trainingIndizes).Range() > 0 && 71 variableName != targetVar) 72 allowedInputVars.Add(variableName); 67 if (trainingIndizes.Count() >= 2) { 68 foreach (var variableName in dataset.DoubleVariables) { 69 if (dataset.GetDoubleValues(variableName, trainingIndizes).Range() > 0 && 70 variableName != targetVar) 71 allowedInputVars.Add(variableName); 72 } 73 } else { 74 allowedInputVars.AddRange(dataset.DoubleVariables.Where(x => !x.Equals(targetVar))); 73 75 } 74 76 … … 86 88 } 87 89 88 public override IRegressionProblemData ImportData(string path, DataAnalysisImportType type) { 89 TableFileParser csvFileParser = new TableFileParser(); 90 csvFileParser.Parse(path); 91 90 protected override IRegressionProblemData ImportData(string path, RegressionImportType type, TableFileParser csvFileParser) { 92 91 List<IList> values = csvFileParser.Values; 93 92 if (type.Shuffle) { … … 95 94 } 96 95 Dataset dataset = new Dataset(csvFileParser.VariableNames, values); 97 string targetVar = dataset.DoubleVariables.Last();98 96 99 97 // turn of input variables that are constant in the training partition … … 105 103 foreach (var variableName in dataset.DoubleVariables) { 106 104 if (dataset.GetDoubleValues(variableName, trainingIndizes).Range() > 0 && 107 variableName != t argetVar)105 variableName != type.TargetVariable) 108 106 allowedInputVars.Add(variableName); 109 107 } 110 108 } else { 111 allowedInputVars.AddRange(dataset.DoubleVariables.Where(x => 
x.Equals(targetVar)));109 allowedInputVars.AddRange(dataset.DoubleVariables.Where(x => !x.Equals(type.TargetVariable))); 112 110 } 113 111 114 RegressionProblemData regressionData = new RegressionProblemData(dataset, allowedInputVars, t argetVar);112 RegressionProblemData regressionData = new RegressionProblemData(dataset, allowedInputVars, type.TargetVariable); 115 113 116 114 regressionData.TrainingPartition.Start = 0; … … 123 121 return regressionData; 124 122 } 125 126 public override bool CanExportData {127 get { return true; }128 }129 public override void ExportData(IRegressionProblemData instance, string path) {130 var strBuilder = new StringBuilder();131 var colSep = CultureInfo.CurrentCulture.TextInfo.ListSeparator;132 foreach (var variable in instance.Dataset.VariableNames) {133 strBuilder.Append(variable.Replace(colSep, String.Empty) + colSep);134 }135 strBuilder.Remove(strBuilder.Length - colSep.Length, colSep.Length);136 strBuilder.AppendLine();137 138 var dataset = instance.Dataset;139 140 for (int i = 0; i < dataset.Rows; i++) {141 for (int j = 0; j < dataset.Columns; j++) {142 if (j > 0) strBuilder.Append(colSep);143 strBuilder.Append(dataset.GetValue(i, j));144 }145 strBuilder.AppendLine();146 }147 148 using (var writer = new StreamWriter(path)) {149 writer.Write(strBuilder);150 }151 }152 123 } 153 124 } -
trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Regression/RegressionInstanceProvider.cs
r8598 r8877 23 23 24 24 namespace HeuristicLab.Problems.Instances.DataAnalysis { 25 public abstract class RegressionInstanceProvider : DataAnalysisInstanceProvider<IRegressionProblemData > {25 public abstract class RegressionInstanceProvider : DataAnalysisInstanceProvider<IRegressionProblemData, RegressionImportType> { 26 26 } 27 27 }
Note: See TracChangeset for help on using the changeset viewer.