Free cookie consent management tool by TermsFeed Policy Generator

source: branches/DataAnalysisCSVImport/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Classification/CSV/ClassifiactionCSVInstanceProvider.cs @ 8701

Last change on this file since 8701 was 8701, checked in by sforsten, 12 years ago

#1942:

  • add combo boxes to DataAnalysisImportTypeDialog to select csv settings
  • get branch ready
File size: 9.2 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2012 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections;
24using System.Collections.Generic;
25using System.Globalization;
26using System.IO;
27using System.Linq;
28using System.Text;
29using HeuristicLab.Common;
30using HeuristicLab.Problems.DataAnalysis;
31
namespace HeuristicLab.Problems.Instances.DataAnalysis {
  /// <summary>
  /// Classification instance provider that imports problem data from, and exports it to,
  /// CSV files. It offers no predefined data descriptors; instances are only created via
  /// the <c>ImportData</c> overloads.
  /// </summary>
  public class ClassificationCSVInstanceProvider : ClassificationInstanceProvider {
    public override string Name {
      get { return "CSV File"; }
    }
    public override string Description {
      get { return ""; }
    }
    public override Uri WebLink {
      get { return new Uri("http://dev.heuristiclab.com/trac/hl/core/wiki/UsersFAQ#DataAnalysisImportFileFormat"); }
    }
    public override string ReferencePublication {
      get { return ""; }
    }

    /// <summary>Returns an empty list: this provider ships no predefined instances.</summary>
    public override IEnumerable<IDataDescriptor> GetDataDescriptors() {
      return new List<IDataDescriptor>();
    }

    /// <summary>Not supported by this provider; data can only be imported from a file.</summary>
    /// <exception cref="NotImplementedException">Always thrown.</exception>
    public override IClassificationProblemData LoadData(IDataDescriptor descriptor) {
      throw new NotImplementedException();
    }

    public override bool CanImportData {
      get { return true; }
    }

    /// <summary>
    /// Parses the file at <paramref name="path"/> and builds classification problem data from it.
    /// The last double variable of the dataset is used as target variable and roughly the
    /// first two thirds of the rows are used as training partition.
    /// </summary>
    /// <param name="path">Path of the CSV file to import.</param>
    /// <returns>The problem data, named after the file name of <paramref name="path"/>.</returns>
    public override IClassificationProblemData ImportData(string path) {
      TableFileParser csvFileParser = new TableFileParser();
      csvFileParser.Parse(path);

      Dataset dataset = new Dataset(csvFileParser.VariableNames, csvFileParser.Values);
      string targetVar = dataset.DoubleVariables.Last();

      int trainingRows = (csvFileParser.Rows * 2) / 3;
      List<string> allowedInputVars = SelectAllowedInputVariables(dataset, targetVar, trainingRows);

      ClassificationProblemData classificationData = new ClassificationProblemData(dataset, allowedInputVars, targetVar);

      // The partition end is the index of the last training row (as before), but it no longer
      // fails with "sequence contains no elements" when the file has too few rows to yield
      // any training row.
      // NOTE(review): the other ImportData overload uses the number of training rows as the
      // partition end instead of the last index - confirm which boundary is intended.
      int trainingPartEnd = Math.Max(0, trainingRows - 1);
      classificationData.TrainingPartition.Start = 0;
      classificationData.TrainingPartition.End = trainingPartEnd;
      classificationData.TestPartition.Start = trainingPartEnd;
      classificationData.TestPartition.End = csvFileParser.Rows;

      classificationData.Name = Path.GetFileName(path);

      return classificationData;
    }

    /// <summary>
    /// Builds classification problem data from an already parsed file, using the settings in
    /// <paramref name="type"/>: <c>type.Training</c> is the percentage of rows that form the
    /// training partition and <c>type.Shuffle</c> decides whether the rows are shuffled first.
    /// The last double variable of the dataset is used as target variable.
    /// </summary>
    /// <param name="path">Path of the imported file; only its file name is used (as instance name).</param>
    /// <param name="type">Import settings (training percentage, shuffle flag).</param>
    /// <param name="csvFileParser">Parser that has already read the file.</param>
    /// <returns>The problem data, named after the file name of <paramref name="path"/>.</returns>
    protected override IClassificationProblemData ImportData(string path, ClassificationImportType type, TableFileParser csvFileParser) {
      int trainingPartEnd = (csvFileParser.Rows * type.Training) / 100;
      List<IList> values = csvFileParser.Values;
      if (type.Shuffle) {
        values = Shuffle(values);
      }

      Dataset dataset = new Dataset(csvFileParser.VariableNames, values);
      string targetVar = dataset.DoubleVariables.Last();

      List<string> allowedInputVars = SelectAllowedInputVariables(dataset, targetVar, trainingPartEnd);

      ClassificationProblemData classificationData = new ClassificationProblemData(dataset, allowedInputVars, targetVar);

      classificationData.TrainingPartition.Start = 0;
      classificationData.TrainingPartition.End = trainingPartEnd;
      classificationData.TestPartition.Start = trainingPartEnd;
      classificationData.TestPartition.End = csvFileParser.Rows;

      classificationData.Name = Path.GetFileName(path);

      return classificationData;
    }

    /// <summary>
    /// Selects the double variables that may be used as inputs: every double variable except
    /// the target, with variables that are constant over the first
    /// <paramref name="trainingRows"/> rows turned off.
    /// </summary>
    /// <remarks>
    /// With fewer than two training rows constancy cannot be judged, so all non-target
    /// double variables are allowed in that case.
    /// </remarks>
    private static List<string> SelectAllowedInputVariables(Dataset dataset, string targetVar, int trainingRows) {
      var candidates = dataset.DoubleVariables.Where(name => name != targetVar);
      if (trainingRows < 2) {
        return candidates.ToList();
      }

      // materialized once because it is enumerated again for every variable
      List<int> trainingIndices = Enumerable.Range(0, trainingRows).ToList();
      return candidates
        .Where(name => dataset.GetDoubleValues(name, trainingIndices).Range() > 0)
        .ToList();
    }

    /// <summary>
    /// Splits the rows into a training and a test part so that, for every class value found in
    /// column <paramref name="target"/>, the first <paramref name="trainingPercentage"/> percent
    /// of its rows go to the training part and the remaining rows go to the test part. Both
    /// parts are then shuffled separately (via the single-argument <c>Shuffle</c> overload,
    /// which is not defined in this file) and the test rows are appended to the training rows.
    /// </summary>
    /// <param name="values">Column-wise data; the column at <paramref name="target"/> must hold doubles.</param>
    /// <param name="target">Index of the column that holds the class values.</param>
    /// <param name="trainingPercentage">Percentage (0-100) of each class assigned to the training part.</param>
    /// <param name="trainingPartEnd">Currently unused; kept so existing callers keep compiling.</param>
    /// <returns>New column lists with the training rows first, followed by the test rows.</returns>
    protected List<IList> Shuffle(List<IList> values, int target, int trainingPercentage, int trainingPartEnd) {
      // The target column passed by the caller is honored; it used to be overwritten with a
      // hard-coded column index.
      IList targetValues = values[target];

      // number of rows of each class that still have to be assigned to the training part
      var remaining = new Dictionary<double, int>();
      foreach (var classGroup in targetValues.Cast<double>().GroupBy(x => x)) {
        remaining[classGroup.Key] = (classGroup.Count() * trainingPercentage) / 100;
      }

      List<IList> training = GetListOfIListCopy(values);
      List<IList> test = GetListOfIListCopy(values);

      for (int row = 0; row < targetValues.Count; row++) {
        double classValue = (double)targetValues[row];
        if (remaining[classValue] > 0) {
          AddRow(training, values, row);
          remaining[classValue]--;
        } else {
          AddRow(test, values, row);
        }
      }

      training = Shuffle(training);
      test = Shuffle(test);

      // append the test rows behind the training rows, column by column
      for (int col = 0; col < training.Count; col++) {
        for (int row = 0; row < test[col].Count; row++) {
          training[col].Add(test[col][row]);
        }
      }

      return training;
    }

    /// <summary>Appends row <paramref name="index"/> of <paramref name="source"/> to every column of <paramref name="destination"/>.</summary>
    private void AddRow(List<IList> destination, List<IList> source, int index) {
      for (int col = 0; col < source.Count; col++) {
        destination[col].Add(source[col][index]);
      }
    }

    /// <summary>
    /// Creates one new, empty list per column of <paramref name="values"/>, with the same
    /// element type as the original column. Despite the name no values are copied.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// A column is neither a list of double, DateTime nor string.
    /// </exception>
    private List<IList> GetListOfIListCopy(List<IList> values) {
      List<IList> newList = new List<IList>(values.Count);
      foreach (IList column in values) {
        if (column is List<double>) {
          newList.Add(new List<double>());
        } else if (column is List<DateTime>) {
          newList.Add(new List<DateTime>());
        } else if (column is List<string>) {
          newList.Add(new List<string>());
        } else {
          throw new InvalidOperationException();
        }
      }
      return newList;
    }

    /// <summary>
    /// Balances the classes found in the last double column: rows are taken in their original
    /// order, but of every class only as many rows are kept as the rarest class has.
    /// </summary>
    /// <param name="values">Column-wise data.</param>
    /// <returns>New column lists that contain only the kept rows.</returns>
    private List<IList> NormalizeClasses(List<IList> values) {
      int column = GetLastDoubleColumn(values);
      IList classColumn = values[column];

      // number of rows per class value
      Dictionary<object, int> count = new Dictionary<object, int>();
      foreach (var item in classColumn) {
        int seen;
        count.TryGetValue(item, out seen); // seen stays 0 for a class not encountered yet
        count[item] = seen + 1;
      }

      List<IList> normalizedValues = GetListOfIListCopy(values);
      if (count.Count == 0) {
        // no rows at all: nothing to balance (Min() below would throw on an empty sequence)
        return normalizedValues;
      }

      int min = count.Values.Min();
      Dictionary<object, int> taken = count.Keys.ToDictionary(key => key, key => 0);

      for (int row = 0; row < values[0].Count; row++) {
        object classValue = classColumn[row];
        if (taken[classValue] < min) {
          taken[classValue]++;
          for (int col = 0; col < values.Count; col++) {
            normalizedValues[col].Add(values[col][row]);
          }
        }
      }
      return normalizedValues;
    }

    /// <summary>Returns the index of the last column that is a list of doubles.</summary>
    /// <exception cref="ArgumentException">No column holds doubles.</exception>
    private int GetLastDoubleColumn(List<IList> values) {
      for (int i = values.Count - 1; i >= 0; i--) {
        if (values[i] is List<double>) {
          return i;
        }
      }
      throw new ArgumentException("No possible Target Variable could be found!");
    }

    public override bool CanExportData {
      get { return true; }
    }

    /// <summary>
    /// Writes the dataset of <paramref name="instance"/> to <paramref name="path"/> as text:
    /// a header line with the variable names followed by one line per row. The list separator
    /// of the current culture is used as column separator and is stripped from variable names.
    /// </summary>
    /// <param name="instance">Problem data whose dataset is exported.</param>
    /// <param name="path">Path of the file to write.</param>
    public override void ExportData(IClassificationProblemData instance, string path) {
      var dataset = instance.Dataset;
      var colSep = CultureInfo.CurrentCulture.TextInfo.ListSeparator;
      var strBuilder = new StringBuilder();

      // Joining the names (instead of appending a separator after each one and trimming the
      // last) also works for a dataset without variables.
      string[] header = dataset.VariableNames
        .Select(variable => variable.Replace(colSep, String.Empty))
        .ToArray();
      strBuilder.AppendLine(string.Join(colSep, header));

      for (int i = 0; i < dataset.Rows; i++) {
        for (int j = 0; j < dataset.Columns; j++) {
          if (j > 0) {
            strBuilder.Append(colSep);
          }
          strBuilder.Append(dataset.GetValue(i, j));
        }
        strBuilder.AppendLine();
      }

      using (var writer = new StreamWriter(path)) {
        writer.Write(strBuilder.ToString());
      }
    }
  }
}
Note: See TracBrowser for help on using the repository browser.