Changeset 8877 for trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Classification/CSV
- Timestamp:
- 11/07/12 16:28:33 (12 years ago)
- Location:
- trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis
- Property svn:mergeinfo changed
-
trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Classification/CSV/ClassifiactionCSVInstanceProvider.cs
r8693 r8877 23 23 using System.Collections; 24 24 using System.Collections.Generic; 25 using System.Globalization;26 25 using System.IO; 27 26 using System.Linq; 28 using System.Text;29 27 using HeuristicLab.Common; 30 28 using HeuristicLab.Problems.DataAnalysis; … … 76 74 } 77 75 } else { 78 allowedInputVars.AddRange(dataset.DoubleVariables.Where(x => x.Equals(targetVar)));76 allowedInputVars.AddRange(dataset.DoubleVariables.Where(x => !x.Equals(targetVar))); 79 77 } 80 78 … … 92 90 } 93 91 94 public override IClassificationProblemData ImportData(string path, DataAnalysisImportType type) { 95 TableFileParser csvFileParser = new TableFileParser(); 96 csvFileParser.Parse(path); 97 92 protected override IClassificationProblemData ImportData(string path, ClassificationImportType type, TableFileParser csvFileParser) { 98 93 int trainingPartEnd = (csvFileParser.Rows * type.Training) / 100; 99 94 List<IList> values = csvFileParser.Values; 100 95 if (type.Shuffle) { 101 values = Shuffle(values); 96 values = Shuffle(values, csvFileParser.VariableNames.ToList().FindIndex(x => x.Equals(type.TargetVariable)), 97 type.Training, out trainingPartEnd); 102 98 } 103 99 104 100 Dataset dataset = new Dataset(csvFileParser.VariableNames, values); 105 string targetVar = dataset.DoubleVariables.Last();106 101 107 102 // turn of input variables that are constant in the training partition 108 103 var allowedInputVars = new List<string>(); 109 104 var trainingIndizes = Enumerable.Range(0, trainingPartEnd); 110 foreach (var variableName in dataset.DoubleVariables) { 111 if (trainingIndizes.Count() >= 2 && dataset.GetDoubleValues(variableName, trainingIndizes).Range() > 0 && 112 variableName != targetVar) 113 allowedInputVars.Add(variableName); 105 if (trainingIndizes.Count() >= 2) { 106 foreach (var variableName in dataset.DoubleVariables) { 107 if (dataset.GetDoubleValues(variableName, trainingIndizes).Range() > 0 && 108 variableName != type.TargetVariable) 109 allowedInputVars.Add(variableName); 110 } 111 } else { 112 allowedInputVars.AddRange(dataset.DoubleVariables.Where(x => !x.Equals(type.TargetVariable))); 114 113 } 115 114 116 ClassificationProblemData classificationData = new ClassificationProblemData(dataset, allowedInputVars, t argetVar);115 ClassificationProblemData classificationData = new ClassificationProblemData(dataset, allowedInputVars, type.TargetVariable); 117 116 118 117 classificationData.TrainingPartition.Start = 0; … … 126 125 } 127 126 128 public override bool CanExportData { 129 get { return true; } 130 } 131 public override void ExportData(IClassificationProblemData instance, string path) { 132 var strBuilder = new StringBuilder(); 133 var colSep = CultureInfo.CurrentCulture.TextInfo.ListSeparator; 134 foreach (var variable in instance.Dataset.VariableNames) { 135 strBuilder.Append(variable.Replace(colSep, String.Empty) + colSep); 136 } 137 strBuilder.Remove(strBuilder.Length - colSep.Length, colSep.Length); 138 strBuilder.AppendLine(); 139 140 var dataset = instance.Dataset; 141 142 for (int i = 0; i < dataset.Rows; i++) { 143 for (int j = 0; j < dataset.Columns; j++) { 144 if (j > 0) strBuilder.Append(colSep); 145 strBuilder.Append(dataset.GetValue(i, j)); 146 } 147 strBuilder.AppendLine(); 127 protected List<IList> Shuffle(List<IList> values, int target, int trainingPercentage, out int trainingPartEnd) { 128 IList targetValues = values[target]; 129 var group = targetValues.Cast<double>().GroupBy(x => x).Select(g => new { Key = g.Key, Count = g.Count() }).ToList(); 130 Dictionary<double, double> taken = new Dictionary<double, double>(); 131 foreach (var classCount in group) { 132 taken[classCount.Key] = (classCount.Count * trainingPercentage) / 100.0; 148 133 } 149 134 150 using (var writer = new StreamWriter(path)) { 151 writer.Write(strBuilder); 135 List<IList> training = GetListOfIListCopy(values); 136 List<IList> test = GetListOfIListCopy(values); 137 138 for (int i = 0; i < targetValues.Count; i++) { 139 if (taken[(double)targetValues[i]] > 0) { 140 AddRow(training, values, i); 141 taken[(double)targetValues[i]]--; 142 } else { 143 AddRow(test, values, i); 144 } 152 145 } 146 147 trainingPartEnd = training.First().Count; 148 149 training = Shuffle(training); 150 test = Shuffle(test); 151 for (int i = 0; i < training.Count; i++) { 152 for (int j = 0; j < test[i].Count; j++) { 153 training[i].Add(test[i][j]); 154 } 155 } 156 157 return training; 158 } 159 160 private void AddRow(List<IList> destination, List<IList> source, int index) { 161 for (int i = 0; i < source.Count; i++) { 162 destination[i].Add(source[i][index]); 163 } 164 } 165 166 private List<IList> GetListOfIListCopy(List<IList> values) { 167 List<IList> newList = new List<IList>(values.Count); 168 foreach (IList t in values) { 169 if (t is List<double>) 170 newList.Add(new List<double>()); 171 else if (t is List<DateTime>) 172 newList.Add(new List<DateTime>()); 173 else if (t is List<string>) 174 newList.Add(new List<string>()); 175 else 176 throw new InvalidOperationException(); 177 } 178 return newList; 153 179 } 154 180 }
Note: See TracChangeset
for help on using the changeset viewer.