Free cookie consent management tool by TermsFeed Policy Generator

Ignore:
Timestamp:
11/07/12 16:28:33 (12 years ago)
Author:
mkommend
Message:

#1942: Reintegrated branch for CSV import.

Location:
trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis
Files:
3 edited
1 copied

Legend:

Unmodified
Added
Removed
  • trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis

  • trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Classification/CSV/ClassifiactionCSVInstanceProvider.cs

    r8693 r8877  
    2323using System.Collections;
    2424using System.Collections.Generic;
    25 using System.Globalization;
    2625using System.IO;
    2726using System.Linq;
    28 using System.Text;
    2927using HeuristicLab.Common;
    3028using HeuristicLab.Problems.DataAnalysis;
     
    7674        }
    7775      } else {
    78         allowedInputVars.AddRange(dataset.DoubleVariables.Where(x => x.Equals(targetVar)));
     76        allowedInputVars.AddRange(dataset.DoubleVariables.Where(x => !x.Equals(targetVar)));
    7977      }
    8078
     
    9290    }
    9391
    94     public override IClassificationProblemData ImportData(string path, DataAnalysisImportType type) {
    95       TableFileParser csvFileParser = new TableFileParser();
    96       csvFileParser.Parse(path);
    97 
     92    protected override IClassificationProblemData ImportData(string path, ClassificationImportType type, TableFileParser csvFileParser) {
    9893      int trainingPartEnd = (csvFileParser.Rows * type.Training) / 100;
    9994      List<IList> values = csvFileParser.Values;
    10095      if (type.Shuffle) {
    101         values = Shuffle(values);
     96        values = Shuffle(values, csvFileParser.VariableNames.ToList().FindIndex(x => x.Equals(type.TargetVariable)),
     97                         type.Training, out trainingPartEnd);
    10298      }
    10399
    104100      Dataset dataset = new Dataset(csvFileParser.VariableNames, values);
    105       string targetVar = dataset.DoubleVariables.Last();
    106101
    107102      // turn off input variables that are constant in the training partition
    108103      var allowedInputVars = new List<string>();
    109104      var trainingIndizes = Enumerable.Range(0, trainingPartEnd);
    110       foreach (var variableName in dataset.DoubleVariables) {
    111         if (trainingIndizes.Count() >= 2 && dataset.GetDoubleValues(variableName, trainingIndizes).Range() > 0 &&
    112           variableName != targetVar)
    113           allowedInputVars.Add(variableName);
     105      if (trainingIndizes.Count() >= 2) {
     106        foreach (var variableName in dataset.DoubleVariables) {
     107          if (dataset.GetDoubleValues(variableName, trainingIndizes).Range() > 0 &&
     108            variableName != type.TargetVariable)
     109            allowedInputVars.Add(variableName);
     110        }
     111      } else {
     112        allowedInputVars.AddRange(dataset.DoubleVariables.Where(x => !x.Equals(type.TargetVariable)));
    114113      }
    115114
    116       ClassificationProblemData classificationData = new ClassificationProblemData(dataset, allowedInputVars, targetVar);
     115      ClassificationProblemData classificationData = new ClassificationProblemData(dataset, allowedInputVars, type.TargetVariable);
    117116
    118117      classificationData.TrainingPartition.Start = 0;
     
    126125    }
    127126
    128     public override bool CanExportData {
    129       get { return true; }
    130     }
    131     public override void ExportData(IClassificationProblemData instance, string path) {
    132       var strBuilder = new StringBuilder();
    133       var colSep = CultureInfo.CurrentCulture.TextInfo.ListSeparator;
    134       foreach (var variable in instance.Dataset.VariableNames) {
    135         strBuilder.Append(variable.Replace(colSep, String.Empty) + colSep);
    136       }
    137       strBuilder.Remove(strBuilder.Length - colSep.Length, colSep.Length);
    138       strBuilder.AppendLine();
    139 
    140       var dataset = instance.Dataset;
    141 
    142       for (int i = 0; i < dataset.Rows; i++) {
    143         for (int j = 0; j < dataset.Columns; j++) {
    144           if (j > 0) strBuilder.Append(colSep);
    145           strBuilder.Append(dataset.GetValue(i, j));
    146         }
    147         strBuilder.AppendLine();
     127    protected List<IList> Shuffle(List<IList> values, int target, int trainingPercentage, out int trainingPartEnd) {
     128      IList targetValues = values[target];
     129      var group = targetValues.Cast<double>().GroupBy(x => x).Select(g => new { Key = g.Key, Count = g.Count() }).ToList();
     130      Dictionary<double, double> taken = new Dictionary<double, double>();
     131      foreach (var classCount in group) {
     132        taken[classCount.Key] = (classCount.Count * trainingPercentage) / 100.0;
    148133      }
    149134
    150       using (var writer = new StreamWriter(path)) {
    151         writer.Write(strBuilder);
     135      List<IList> training = GetListOfIListCopy(values);
     136      List<IList> test = GetListOfIListCopy(values);
     137
     138      for (int i = 0; i < targetValues.Count; i++) {
     139        if (taken[(double)targetValues[i]] > 0) {
     140          AddRow(training, values, i);
     141          taken[(double)targetValues[i]]--;
     142        } else {
     143          AddRow(test, values, i);
     144        }
    152145      }
     146
     147      trainingPartEnd = training.First().Count;
     148
     149      training = Shuffle(training);
     150      test = Shuffle(test);
     151      for (int i = 0; i < training.Count; i++) {
     152        for (int j = 0; j < test[i].Count; j++) {
     153          training[i].Add(test[i][j]);
     154        }
     155      }
     156
     157      return training;
     158    }
     159
     160    private void AddRow(List<IList> destination, List<IList> source, int index) {
     161      for (int i = 0; i < source.Count; i++) {
     162        destination[i].Add(source[i][index]);
     163      }
     164    }
     165
     166    private List<IList> GetListOfIListCopy(List<IList> values) {
     167      List<IList> newList = new List<IList>(values.Count);
     168      foreach (IList t in values) {
     169        if (t is List<double>)
     170          newList.Add(new List<double>());
     171        else if (t is List<DateTime>)
     172          newList.Add(new List<DateTime>());
     173        else if (t is List<string>)
     174          newList.Add(new List<string>());
     175        else
     176          throw new InvalidOperationException();
     177      }
     178      return newList;
    153179    }
    154180  }
  • trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Classification/ClassificationInstanceProvider.cs

    r8598 r8877  
    2323
    2424namespace HeuristicLab.Problems.Instances.DataAnalysis {
    25   public abstract class ClassificationInstanceProvider : DataAnalysisInstanceProvider<IClassificationProblemData> {
     25  public abstract class ClassificationInstanceProvider : DataAnalysisInstanceProvider<IClassificationProblemData, ClassificationImportType> {
    2626  }
    2727}
Note: See TracChangeset for help on using the changeset viewer.