Free cookie consent management tool by TermsFeed Policy Generator

Ignore:
Timestamp:
11/07/12 16:28:33 (11 years ago)
Author:
mkommend
Message:

#1942: Reintegrated branch for CSV import.

Location:
trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis
Files:
9 edited
3 copied

Legend:

Unmodified
Added
Removed
  • trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis

  • trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Classification/CSV/ClassifiactionCSVInstanceProvider.cs

    r8693 r8877  
    2323using System.Collections;
    2424using System.Collections.Generic;
    25 using System.Globalization;
    2625using System.IO;
    2726using System.Linq;
    28 using System.Text;
    2927using HeuristicLab.Common;
    3028using HeuristicLab.Problems.DataAnalysis;
     
    7674        }
    7775      } else {
    78         allowedInputVars.AddRange(dataset.DoubleVariables.Where(x => x.Equals(targetVar)));
     76        allowedInputVars.AddRange(dataset.DoubleVariables.Where(x => !x.Equals(targetVar)));
    7977      }
    8078
     
    9290    }
    9391
    94     public override IClassificationProblemData ImportData(string path, DataAnalysisImportType type) {
    95       TableFileParser csvFileParser = new TableFileParser();
    96       csvFileParser.Parse(path);
    97 
     92    protected override IClassificationProblemData ImportData(string path, ClassificationImportType type, TableFileParser csvFileParser) {
    9893      int trainingPartEnd = (csvFileParser.Rows * type.Training) / 100;
    9994      List<IList> values = csvFileParser.Values;
    10095      if (type.Shuffle) {
    101         values = Shuffle(values);
     96        values = Shuffle(values, csvFileParser.VariableNames.ToList().FindIndex(x => x.Equals(type.TargetVariable)),
     97                         type.Training, out trainingPartEnd);
    10298      }
    10399
    104100      Dataset dataset = new Dataset(csvFileParser.VariableNames, values);
    105       string targetVar = dataset.DoubleVariables.Last();
    106101
    107102      // turn off input variables that are constant in the training partition
    108103      var allowedInputVars = new List<string>();
    109104      var trainingIndizes = Enumerable.Range(0, trainingPartEnd);
    110       foreach (var variableName in dataset.DoubleVariables) {
    111         if (trainingIndizes.Count() >= 2 && dataset.GetDoubleValues(variableName, trainingIndizes).Range() > 0 &&
    112           variableName != targetVar)
    113           allowedInputVars.Add(variableName);
     105      if (trainingIndizes.Count() >= 2) {
     106        foreach (var variableName in dataset.DoubleVariables) {
     107          if (dataset.GetDoubleValues(variableName, trainingIndizes).Range() > 0 &&
     108            variableName != type.TargetVariable)
     109            allowedInputVars.Add(variableName);
     110        }
     111      } else {
     112        allowedInputVars.AddRange(dataset.DoubleVariables.Where(x => !x.Equals(type.TargetVariable)));
    114113      }
    115114
    116       ClassificationProblemData classificationData = new ClassificationProblemData(dataset, allowedInputVars, targetVar);
     115      ClassificationProblemData classificationData = new ClassificationProblemData(dataset, allowedInputVars, type.TargetVariable);
    117116
    118117      classificationData.TrainingPartition.Start = 0;
     
    126125    }
    127126
    128     public override bool CanExportData {
    129       get { return true; }
    130     }
    131     public override void ExportData(IClassificationProblemData instance, string path) {
    132       var strBuilder = new StringBuilder();
    133       var colSep = CultureInfo.CurrentCulture.TextInfo.ListSeparator;
    134       foreach (var variable in instance.Dataset.VariableNames) {
    135         strBuilder.Append(variable.Replace(colSep, String.Empty) + colSep);
    136       }
    137       strBuilder.Remove(strBuilder.Length - colSep.Length, colSep.Length);
    138       strBuilder.AppendLine();
    139 
    140       var dataset = instance.Dataset;
    141 
    142       for (int i = 0; i < dataset.Rows; i++) {
    143         for (int j = 0; j < dataset.Columns; j++) {
    144           if (j > 0) strBuilder.Append(colSep);
    145           strBuilder.Append(dataset.GetValue(i, j));
    146         }
    147         strBuilder.AppendLine();
     127    protected List<IList> Shuffle(List<IList> values, int target, int trainingPercentage, out int trainingPartEnd) {
     128      IList targetValues = values[target];
     129      var group = targetValues.Cast<double>().GroupBy(x => x).Select(g => new { Key = g.Key, Count = g.Count() }).ToList();
     130      Dictionary<double, double> taken = new Dictionary<double, double>();
     131      foreach (var classCount in group) {
     132        taken[classCount.Key] = (classCount.Count * trainingPercentage) / 100.0;
    148133      }
    149134
    150       using (var writer = new StreamWriter(path)) {
    151         writer.Write(strBuilder);
     135      List<IList> training = GetListOfIListCopy(values);
     136      List<IList> test = GetListOfIListCopy(values);
     137
     138      for (int i = 0; i < targetValues.Count; i++) {
     139        if (taken[(double)targetValues[i]] > 0) {
     140          AddRow(training, values, i);
     141          taken[(double)targetValues[i]]--;
     142        } else {
     143          AddRow(test, values, i);
     144        }
    152145      }
     146
     147      trainingPartEnd = training.First().Count;
     148
     149      training = Shuffle(training);
     150      test = Shuffle(test);
     151      for (int i = 0; i < training.Count; i++) {
     152        for (int j = 0; j < test[i].Count; j++) {
     153          training[i].Add(test[i][j]);
     154        }
     155      }
     156
     157      return training;
     158    }
     159
     160    private void AddRow(List<IList> destination, List<IList> source, int index) {
     161      for (int i = 0; i < source.Count; i++) {
     162        destination[i].Add(source[i][index]);
     163      }
     164    }
     165
     166    private List<IList> GetListOfIListCopy(List<IList> values) {
     167      List<IList> newList = new List<IList>(values.Count);
     168      foreach (IList t in values) {
     169        if (t is List<double>)
     170          newList.Add(new List<double>());
     171        else if (t is List<DateTime>)
     172          newList.Add(new List<DateTime>());
     173        else if (t is List<string>)
     174          newList.Add(new List<string>());
     175        else
     176          throw new InvalidOperationException();
     177      }
     178      return newList;
    153179    }
    154180  }
  • trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Classification/ClassificationInstanceProvider.cs

    r8598 r8877  
    2323
    2424namespace HeuristicLab.Problems.Instances.DataAnalysis {
    25   public abstract class ClassificationInstanceProvider : DataAnalysisInstanceProvider<IClassificationProblemData> {
     25  public abstract class ClassificationInstanceProvider : DataAnalysisInstanceProvider<IClassificationProblemData, ClassificationImportType> {
    2626  }
    2727}
  • trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Clustering/CSV/ClusteringCSVInstanceProvider.cs

    r8685 r8877  
    2323using System.Collections;
    2424using System.Collections.Generic;
    25 using System.Globalization;
    2625using System.IO;
    2726using System.Linq;
    28 using System.Text;
    2927using HeuristicLab.Common;
    3028using HeuristicLab.Problems.DataAnalysis;
     
    7573        }
    7674      } else {
    77         allowedInputVars.AddRange(dataset.DoubleVariables.Where(x => x.Equals(targetVar)));
     75        allowedInputVars.AddRange(dataset.DoubleVariables.Where(x => !x.Equals(targetVar)));
    7876      }
    7977
     
    9189    }
    9290
    93     public override IClusteringProblemData ImportData(string path, DataAnalysisImportType type) {
    94       TableFileParser csvFileParser = new TableFileParser();
    95       csvFileParser.Parse(path);
    96 
     91    protected override IClusteringProblemData ImportData(string path, DataAnalysisImportType type, TableFileParser csvFileParser) {
    9792      List<IList> values = csvFileParser.Values;
    9893      if (type.Shuffle) {
     
    107102      int trainingPartEnd = (csvFileParser.Rows * type.Training) / 100;
    108103      var trainingIndizes = Enumerable.Range(0, trainingPartEnd);
    109       foreach (var variableName in dataset.DoubleVariables) {
    110         if (trainingIndizes.Count() >= 2 && dataset.GetDoubleValues(variableName, trainingIndizes).Range() > 0 &&
    111           variableName != targetVar)
    112           allowedInputVars.Add(variableName);
     104      if (trainingIndizes.Count() >= 2) {
     105        foreach (var variableName in dataset.DoubleVariables) {
     106          if (dataset.GetDoubleValues(variableName, trainingIndizes).Range() > 0 &&
     107            variableName != targetVar)
     108            allowedInputVars.Add(variableName);
     109        }
     110      } else {
     111        allowedInputVars.AddRange(dataset.DoubleVariables.Where(x => !x.Equals(targetVar)));
    113112      }
    114113
     
    124123      return clusteringData;
    125124    }
    126 
    127     public override bool CanExportData {
    128       get { return true; }
    129     }
    130     public override void ExportData(IClusteringProblemData instance, string path) {
    131       var strBuilder = new StringBuilder();
    132       var colSep = CultureInfo.CurrentCulture.TextInfo.ListSeparator;
    133       foreach (var variable in instance.Dataset.VariableNames) {
    134         strBuilder.Append(variable.Replace(colSep, String.Empty) + colSep);
    135       }
    136       strBuilder.Remove(strBuilder.Length - colSep.Length, colSep.Length);
    137       strBuilder.AppendLine();
    138 
    139       var dataset = instance.Dataset;
    140 
    141       for (int i = 0; i < dataset.Rows; i++) {
    142         for (int j = 0; j < dataset.Columns; j++) {
    143           if (j > 0) strBuilder.Append(colSep);
    144           strBuilder.Append(dataset.GetValue(i, j));
    145         }
    146         strBuilder.AppendLine();
    147       }
    148 
    149       using (var writer = new StreamWriter(path)) {
    150         writer.Write(strBuilder);
    151       }
    152     }
    153125  }
    154126}
  • trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Clustering/ClusteringInstanceProvider.cs

    r8598 r8877  
    2323
    2424namespace HeuristicLab.Problems.Instances.DataAnalysis {
    25   public abstract class ClusteringInstanceProvider : DataAnalysisInstanceProvider<IClusteringProblemData> {
     25  public abstract class ClusteringInstanceProvider : DataAnalysisInstanceProvider<IClusteringProblemData, DataAnalysisImportType> {
    2626  }
    2727}
  • trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/DataAnalysisInstanceProvider.cs

    r8598 r8877  
    2323using System.Collections;
    2424using System.Collections.Generic;
     25using System.Globalization;
     26using System.IO;
    2527using System.Linq;
     28using System.Text;
    2629using HeuristicLab.Problems.DataAnalysis;
     30using HeuristicLab.Random;
    2731
    2832namespace HeuristicLab.Problems.Instances.DataAnalysis {
    29   public abstract class DataAnalysisInstanceProvider<TData> : ProblemInstanceProvider<TData>
    30     where TData : class, IDataAnalysisProblemData {
     33  public abstract class DataAnalysisInstanceProvider<TData, ImportType> : ProblemInstanceProvider<TData>
     34    where TData : class, IDataAnalysisProblemData
     35    where ImportType : DataAnalysisImportType {
    3136
    3237    // has to be implemented, if CanImportData is true
    33     public virtual TData ImportData(string path, DataAnalysisImportType type) {
    34       throw new NotSupportedException();
     38    public TData ImportData(string path, ImportType type) {
     39      TableFileParser csvFileParser = new TableFileParser();
     40      csvFileParser.Parse(path);
     41      return ImportData(path, type, csvFileParser);
    3542    }
     43    public TData ImportData(string path, ImportType type, DataAnalysisCSVFormat csvFormat) {
     44      TableFileParser csvFileParser = new TableFileParser();
     45      csvFileParser.Parse(path, csvFormat.NumberFormatInfo, csvFormat.DateTimeFormatInfo, csvFormat.Separator);
     46      return ImportData(path, type, csvFileParser);
     47    }
     48
     49    protected abstract TData ImportData(string path, ImportType type, TableFileParser csvFileParser);
    3650
    3751    protected List<IList> Shuffle(List<IList> values) {
    3852      int count = values.First().Count;
    39       int[] indices = GetRandomIndices(count);
     53      int[] indices = Enumerable.Range(0, count).Shuffle(new FastRandom()).ToArray();
    4054      List<IList> shuffeledValues = new List<IList>(values.Count);
    4155      for (int col = 0; col < values.Count; col++) {
     
    5771    }
    5872
    59     //Fisher–Yates shuffle
    60     private int[] GetRandomIndices(int amount) {
    61       int[] randomIndices = Enumerable.Range(0, amount).ToArray();
    62       System.Random rand = new System.Random();
    63       int n = amount;
    64       while (n > 1) {
    65         n--;
    66         int k = rand.Next(n + 1);
    67         int value = randomIndices[k];
    68         randomIndices[k] = randomIndices[n];
    69         randomIndices[n] = value;
     73    public override bool CanExportData {
     74      get { return true; }
     75    }
     76    public override void ExportData(TData instance, string path) {
     77      var strBuilder = new StringBuilder();
     78      var colSep = CultureInfo.CurrentCulture.TextInfo.ListSeparator;
     79      foreach (var variable in instance.Dataset.VariableNames) {
     80        strBuilder.Append(variable.Replace(colSep, String.Empty) + colSep);
    7081      }
    71       return randomIndices;
     82      strBuilder.Remove(strBuilder.Length - colSep.Length, colSep.Length);
     83      strBuilder.AppendLine();
     84
     85      var dataset = instance.Dataset;
     86
     87      for (int i = 0; i < dataset.Rows; i++) {
     88        for (int j = 0; j < dataset.Columns; j++) {
     89          if (j > 0) strBuilder.Append(colSep);
     90          strBuilder.Append(dataset.GetValue(i, j));
     91        }
     92        strBuilder.AppendLine();
     93      }
     94
     95      using (var writer = new StreamWriter(path)) {
     96        writer.Write(strBuilder);
     97      }
    7298    }
    7399  }
  • trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/HeuristicLab.Problems.Instances.DataAnalysis-3.3.csproj

    r8841 r8877  
    115115    <Compile Include="Classification\ArtificialClassificationDataDescriptor.cs" />
    116116    <Compile Include="Classification\ArtificialClassificationInstanceProvider.cs" />
     117    <Compile Include="Classification\ClassificationImportType.cs" />
    117118    <Compile Include="Classification\ClassificationInstanceProvider.cs" />
    118119    <Compile Include="Classification\CSV\ClassifiactionCSVInstanceProvider.cs" />
     
    126127    <Compile Include="Clustering\ClusteringInstanceProvider.cs" />
    127128    <Compile Include="Clustering\CSV\ClusteringCSVInstanceProvider.cs" />
     129    <Compile Include="DataAnalysisCSVFormat.cs" />
    128130    <Compile Include="DataAnalysisImportType.cs" />
    129131    <Compile Include="DataAnalysisInstanceProvider.cs" />
     
    179181    <Compile Include="Regression\Nguyen\NguyenInstanceProvider.cs" />
    180182    <Compile Include="Regression\RealWorld\RegressionRealWorldInstanceProvider.cs" />
     183    <Compile Include="Regression\RegressionImportType.cs" />
    181184    <Compile Include="Regression\RegressionInstanceProvider.cs" />
    182185    <Compile Include="Regression\ResourceRegressionDataDescriptor.cs" />
  • trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Regression/CSV/RegressionCSVInstanceProvider.cs

    r8685 r8877  
    2323using System.Collections;
    2424using System.Collections.Generic;
    25 using System.Globalization;
    2625using System.IO;
    2726using System.Linq;
    28 using System.Text;
    2927using HeuristicLab.Common;
    3028using HeuristicLab.Problems.DataAnalysis;
     
    6765      var allowedInputVars = new List<string>();
    6866      var trainingIndizes = Enumerable.Range(0, (csvFileParser.Rows * 2) / 3);
    69       foreach (var variableName in dataset.DoubleVariables) {
    70         if (trainingIndizes.Count() >= 2 && dataset.GetDoubleValues(variableName, trainingIndizes).Range() > 0 &&
    71           variableName != targetVar)
    72           allowedInputVars.Add(variableName);
     67      if (trainingIndizes.Count() >= 2) {
     68        foreach (var variableName in dataset.DoubleVariables) {
     69          if (dataset.GetDoubleValues(variableName, trainingIndizes).Range() > 0 &&
     70            variableName != targetVar)
     71            allowedInputVars.Add(variableName);
     72        }
     73      } else {
     74        allowedInputVars.AddRange(dataset.DoubleVariables.Where(x => !x.Equals(targetVar)));
    7375      }
    7476
     
    8688    }
    8789
    88     public override IRegressionProblemData ImportData(string path, DataAnalysisImportType type) {
    89       TableFileParser csvFileParser = new TableFileParser();
    90       csvFileParser.Parse(path);
    91 
     90    protected override IRegressionProblemData ImportData(string path, RegressionImportType type, TableFileParser csvFileParser) {
    9291      List<IList> values = csvFileParser.Values;
    9392      if (type.Shuffle) {
     
    9594      }
    9695      Dataset dataset = new Dataset(csvFileParser.VariableNames, values);
    97       string targetVar = dataset.DoubleVariables.Last();
    9896
    9997      // turn off input variables that are constant in the training partition
     
    105103        foreach (var variableName in dataset.DoubleVariables) {
    106104          if (dataset.GetDoubleValues(variableName, trainingIndizes).Range() > 0 &&
    107             variableName != targetVar)
     105            variableName != type.TargetVariable)
    108106            allowedInputVars.Add(variableName);
    109107        }
    110108      } else {
    111         allowedInputVars.AddRange(dataset.DoubleVariables.Where(x => x.Equals(targetVar)));
     109        allowedInputVars.AddRange(dataset.DoubleVariables.Where(x => !x.Equals(type.TargetVariable)));
    112110      }
    113111
    114       RegressionProblemData regressionData = new RegressionProblemData(dataset, allowedInputVars, targetVar);
     112      RegressionProblemData regressionData = new RegressionProblemData(dataset, allowedInputVars, type.TargetVariable);
    115113
    116114      regressionData.TrainingPartition.Start = 0;
     
    123121      return regressionData;
    124122    }
    125 
    126     public override bool CanExportData {
    127       get { return true; }
    128     }
    129     public override void ExportData(IRegressionProblemData instance, string path) {
    130       var strBuilder = new StringBuilder();
    131       var colSep = CultureInfo.CurrentCulture.TextInfo.ListSeparator;
    132       foreach (var variable in instance.Dataset.VariableNames) {
    133         strBuilder.Append(variable.Replace(colSep, String.Empty) + colSep);
    134       }
    135       strBuilder.Remove(strBuilder.Length - colSep.Length, colSep.Length);
    136       strBuilder.AppendLine();
    137 
    138       var dataset = instance.Dataset;
    139 
    140       for (int i = 0; i < dataset.Rows; i++) {
    141         for (int j = 0; j < dataset.Columns; j++) {
    142           if (j > 0) strBuilder.Append(colSep);
    143           strBuilder.Append(dataset.GetValue(i, j));
    144         }
    145         strBuilder.AppendLine();
    146       }
    147 
    148       using (var writer = new StreamWriter(path)) {
    149         writer.Write(strBuilder);
    150       }
    151     }
    152123  }
    153124}
  • trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Regression/RegressionInstanceProvider.cs

    r8598 r8877  
    2323
    2424namespace HeuristicLab.Problems.Instances.DataAnalysis {
    25   public abstract class RegressionInstanceProvider : DataAnalysisInstanceProvider<IRegressionProblemData> {
     25  public abstract class RegressionInstanceProvider : DataAnalysisInstanceProvider<IRegressionProblemData, RegressionImportType> {
    2626  }
    2727}
Note: See TracChangeset for help on using the changeset viewer.