Changeset 9363 for branches/OaaS/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Classification/CSV
- Timestamp:
- 04/16/13 13:13:41 (12 years ago)
- Location:
- branches/OaaS
- Files:
-
- 3 edited
Legend:
- Unmodified
- Added
- Removed
-
branches/OaaS ¶
- Property svn:ignore
-
TabularUnified
old new 21 21 protoc.exe 22 22 _ReSharper.HeuristicLab 3.3 Tests 23 Google.ProtocolBuffers-2.4.1.473.dll 23 24 packages
-
- Property svn:mergeinfo changed
- Property svn:ignore
-
branches/OaaS/HeuristicLab.Problems.Instances.DataAnalysis ¶
-
Property
svn:mergeinfo
set to
(toggle deleted branches)
/trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis merged eligible
-
Property
svn:mergeinfo
set to
(toggle deleted branches)
-
TabularUnified branches/OaaS/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Classification/CSV/ClassifiactionCSVInstanceProvider.cs ¶
r8211 r9363 21 21 22 22 using System; 23 using System.Collections; 23 24 using System.Collections.Generic; 24 25 using System.IO; 25 26 using System.Linq; 26 using System.Text;27 using HeuristicLab.Common; 27 28 using HeuristicLab.Problems.DataAnalysis; 28 29 … … 61 62 62 63 Dataset dataset = new Dataset(csvFileParser.VariableNames, csvFileParser.Values); 63 string targetVar = csvFileParser.VariableNames.Where(x => dataset.DoubleVariables.Contains(x)).Last(); 64 IEnumerable<string> allowedInputVars = dataset.DoubleVariables.Where(x => !x.Equals(targetVar)); 64 string targetVar = dataset.DoubleVariables.Last(); 65 65 66 ClassificationProblemData claData = new ClassificationProblemData(dataset, allowedInputVars, targetVar); 67 68 int trainingPartEnd = csvFileParser.Rows * 2 / 3; 69 claData.TrainingPartition.Start = 0; 70 claData.TrainingPartition.End = trainingPartEnd; 71 claData.TestPartition.Start = trainingPartEnd; 72 claData.TestPartition.End = csvFileParser.Rows; 73 int pos = path.LastIndexOf('\\'); 74 if (pos < 0) 75 claData.Name = path; 76 else { 77 pos++; 78 claData.Name = path.Substring(pos, path.Length - pos); 66 // turn of input variables that are constant in the training partition 67 var allowedInputVars = new List<string>(); 68 var trainingIndizes = Enumerable.Range(0, (csvFileParser.Rows * 2) / 3); 69 if (trainingIndizes.Count() >= 2) { 70 foreach (var variableName in dataset.DoubleVariables) { 71 if (dataset.GetDoubleValues(variableName, trainingIndizes).Range() > 0 && 72 variableName != targetVar) 73 allowedInputVars.Add(variableName); 74 } 75 } else { 76 allowedInputVars.AddRange(dataset.DoubleVariables.Where(x => !x.Equals(targetVar))); 79 77 } 80 78 81 return claData; 79 ClassificationProblemData classificationData = new ClassificationProblemData(dataset, allowedInputVars, targetVar); 80 81 int trainingPartEnd = trainingIndizes.Last(); 82 classificationData.TrainingPartition.Start = trainingIndizes.First(); 83 classificationData.TrainingPartition.End = trainingPartEnd; 84 classificationData.TestPartition.Start = trainingPartEnd; 85 classificationData.TestPartition.End = csvFileParser.Rows; 86 87 classificationData.Name = Path.GetFileName(path); 88 89 return classificationData; 82 90 } 83 91 84 public override bool CanExportData { 85 get { return true; } 86 } 87 public override void ExportData(IClassificationProblemData instance, string path) { 88 StringBuilder strBuilder = new StringBuilder(); 89 90 foreach (var variable in instance.InputVariables) { 91 strBuilder.Append(variable + ";"); 92 } 93 strBuilder.Remove(strBuilder.Length - 1, 1); 94 strBuilder.AppendLine(); 95 96 Dataset dataset = instance.Dataset; 97 98 for (int i = 0; i < dataset.Rows; i++) { 99 for (int j = 0; j < dataset.Columns; j++) { 100 strBuilder.Append(dataset.GetValue(i, j) + ";"); 92 protected override IClassificationProblemData ImportData(string path, ClassificationImportType type, TableFileParser csvFileParser) { 93 int trainingPartEnd = (csvFileParser.Rows * type.TrainingPercentage) / 100; 94 List<IList> values = csvFileParser.Values; 95 if (type.Shuffle) { 96 values = Shuffle(values); 97 if (type.UniformlyDistributeClasses) { 98 values = Shuffle(values, csvFileParser.VariableNames.ToList().FindIndex(x => x.Equals(type.TargetVariable)), 99 type.TrainingPercentage, out trainingPartEnd); 101 100 } 102 strBuilder.Remove(strBuilder.Length - 1, 1);103 strBuilder.AppendLine();104 101 } 105 102 106 using (StreamWriter writer = new StreamWriter(path)) { 107 writer.Write(strBuilder); 103 Dataset dataset = new Dataset(csvFileParser.VariableNames, values); 104 105 // turn of input variables that are constant in the training partition 106 var allowedInputVars = new List<string>(); 107 var trainingIndizes = Enumerable.Range(0, trainingPartEnd); 108 if (trainingIndizes.Count() >= 2) { 109 foreach (var variableName in dataset.DoubleVariables) { 110 if (dataset.GetDoubleValues(variableName, trainingIndizes).Range() > 0 && 111 variableName != type.TargetVariable) 112 allowedInputVars.Add(variableName); 113 } 114 } else { 115 allowedInputVars.AddRange(dataset.DoubleVariables.Where(x => !x.Equals(type.TargetVariable))); 108 116 } 117 118 ClassificationProblemData classificationData = new ClassificationProblemData(dataset, allowedInputVars, type.TargetVariable); 119 120 classificationData.TrainingPartition.Start = 0; 121 classificationData.TrainingPartition.End = trainingPartEnd; 122 classificationData.TestPartition.Start = trainingPartEnd; 123 classificationData.TestPartition.End = csvFileParser.Rows; 124 125 classificationData.Name = Path.GetFileName(path); 126 127 return classificationData; 128 } 129 130 protected List<IList> Shuffle(List<IList> values, int target, int trainingPercentage, out int trainingPartEnd) { 131 IList targetValues = values[target]; 132 var group = targetValues.Cast<double>().GroupBy(x => x).Select(g => new { Key = g.Key, Count = g.Count() }).ToList(); 133 Dictionary<double, double> taken = new Dictionary<double, double>(); 134 foreach (var classCount in group) { 135 taken[classCount.Key] = (classCount.Count * trainingPercentage) / 100.0; 136 } 137 138 List<IList> training = GetListOfIListCopy(values); 139 List<IList> test = GetListOfIListCopy(values); 140 141 for (int i = 0; i < targetValues.Count; i++) { 142 if (taken[(double)targetValues[i]] > 0) { 143 AddRow(training, values, i); 144 taken[(double)targetValues[i]]--; 145 } else { 146 AddRow(test, values, i); 147 } 148 } 149 150 trainingPartEnd = training.First().Count; 151 152 for (int i = 0; i < training.Count; i++) { 153 for (int j = 0; j < test[i].Count; j++) { 154 training[i].Add(test[i][j]); 155 } 156 } 157 158 return training; 159 } 160 161 private void AddRow(List<IList> destination, List<IList> source, int index) { 162 for (int i = 0; i < source.Count; i++) { 163 destination[i].Add(source[i][index]); 164 } 165 } 166 167 private List<IList> GetListOfIListCopy(List<IList> values) { 168 List<IList> newList = new List<IList>(values.Count); 169 foreach (IList t in values) { 170 if (t is List<double>) 171 newList.Add(new List<double>()); 172 else if (t is List<DateTime>) 173 newList.Add(new List<DateTime>()); 174 else if (t is List<string>) 175 newList.Add(new List<string>()); 176 else 177 throw new InvalidOperationException(); 178 } 179 return newList; 109 180 } 110 181 }
Note: See TracChangeset
for help on using the changeset viewer.