Free cookie consent management tool by TermsFeed Policy Generator

Ignore:
Timestamp:
12/18/13 15:36:18 (11 years ago)
Author:
rstoll
Message:
  • Renamed StatisticInfo to StatisticsLogic
  • Fixed todo in PreprocessingDataManipulation
File:
1 edited

Legend:

Unmodified
Added
Removed
  • branches/DataPreprocessing/HeuristicLab.DataPreprocessing/3.3/Implementations/PreprocessingDataManipulation.cs

    r10246 r10249  
    1 using HeuristicLab.Data;
    2 using System;
     1using System;
    32using System.Collections.Generic;
    43using System.Linq;
    5 using System.Text;
     4using HeuristicLab.Data;
    65
    7 namespace HeuristicLab.DataPreprocessing
    8 {
    9     class PreprocessingDataManipulation : IPreprocessingDataManipulation
    10     {
    11         private IPreprocessingData preprocessingData;
    12         private StatisticInfo statisticInfo;
    13         private ISearchLogic searchLogic;
     6namespace HeuristicLab.DataPreprocessing {
     7  class PreprocessingDataManipulation : IPreprocessingDataManipulation {
     8    private IPreprocessingData preprocessingData;
     9    private IStatisticsLogic statisticInfo;
     10    private ISearchLogic searchLogic;
    1411
    15         public PreprocessingDataManipulation(IPreprocessingData _prepocessingData) {
    16             preprocessingData = _prepocessingData;
    17           //todo
    18             searchLogic = new SearchLogic(preprocessingData);
    19             statisticInfo = new StatisticInfo(preprocessingData,searchLogic);
    20            
     12    public PreprocessingDataManipulation(IPreprocessingData _prepocessingData, ISearchLogic theSearchLogic, IStatisticsLogic theStatisticsLogic) {
     13      preprocessingData = _prepocessingData;
     14      searchLogic = theSearchLogic;
     15      statisticInfo = theStatisticsLogic;
     16    }
     17
     18    public void ReplaceIndicesByValue<T>(string variableName, IEnumerable<int> indices, T value) {
     19      foreach (int index in indices) {
     20        preprocessingData.SetCell<T>(variableName, index, value);
     21      }
     22    }
     23
     24    public void ReplaceIndicesByAverageValue(string variableName, IEnumerable<int> indices) {
     25      double average = statisticInfo.GetAverage(variableName);
     26      ReplaceIndicesByValue<double>(variableName, indices, average);
     27    }
     28
     29    public void ReplaceIndicesByMedianValue(string variableName, IEnumerable<int> indices) {
     30      double median = statisticInfo.GetMedian(variableName);
     31      ReplaceIndicesByValue<double>(variableName, indices, median);
     32    }
     33
     34    public void ReplaceIndicesByRandomValue(string variableName, IEnumerable<int> indices) {
     35      Random r = new Random();
     36
     37      double max = statisticInfo.GetMax<double>(variableName);
     38      double min = statisticInfo.GetMin<double>(variableName);
     39      double randMultiplier = (max - min);
     40      foreach (int index in indices) {
     41        double rand = r.NextDouble() * randMultiplier + min;
     42        preprocessingData.SetCell<double>(variableName, index, rand);
     43      }
     44    }
     45
     46    public void ReplaceIndicesByLinearInterpolationOfNeighbours(string variableName, IEnumerable<int> indices) {
     47      int countValues = preprocessingData.GetValues<double>(variableName).Count();
     48      foreach (int index in indices) {
     49        // don't replace first or last values
     50        if (index > 0 && index < countValues) {
     51          int prevIndex = indexOfPrevPresentValue(variableName, index);
     52          int nextIndex = indexOfNextPresentValue(variableName, index);
     53
     54          // no neighbours found
     55          if (prevIndex < 0 && nextIndex >= countValues) {
     56            continue;
     57          }
     58          double prev = preprocessingData.GetCell<double>(variableName, prevIndex);
     59          double next = preprocessingData.GetCell<double>(variableName, nextIndex);
     60
     61          int valuesToInterpolate = nextIndex - prevIndex;
     62
     63          double interpolationStep = (prev + next) / valuesToInterpolate;
     64
     65          for (int i = prevIndex; i < nextIndex; ++i) {
     66            double interpolated = prev + (interpolationStep * (i - prevIndex));
     67            preprocessingData.SetCell<double>(variableName, i, interpolated);
     68          }
     69        }
     70      }
     71    }
     72
     73    private int indexOfPrevPresentValue(string variableName, int start) {
     74      int offset = start - 1;
     75      while (offset >= 0 && searchLogic.IsMissingValue(variableName, offset)) {
     76        offset--;
     77      }
     78
     79      return offset;
     80    }
     81
     82    private int indexOfNextPresentValue(string variableName, int start) {
     83      int offset = start + 1;
     84      while (offset < preprocessingData.Rows && searchLogic.IsMissingValue(variableName, offset)) {
     85        offset++;
     86      }
     87
     88      return offset;
     89    }
     90
     91    public void ReplaceIndicesByMostCommonValue(string variableName, IEnumerable<int> indices) {
     92      if (preprocessingData.IsType<double>(variableName)) {
     93        ReplaceIndicesByValue<double>(variableName, indices, statisticInfo.GetMostCommonValue<double>(variableName));
     94      } else if (preprocessingData.IsType<string>(variableName)) {
     95        ReplaceIndicesByValue<string>(variableName, indices, statisticInfo.GetMostCommonValue<string>(variableName));
     96      } else if (preprocessingData.IsType<DateTime>(variableName)) {
     97        ReplaceIndicesByValue<DateTime>(variableName, indices, statisticInfo.GetMostCommonValue<DateTime>(variableName));
     98      } else {
     99        throw new ArgumentException("column with index: " + variableName + " contains a non supported type.");
     100      }
     101    }
     102
     103    public void ShuffleWithRanges(IEnumerable<IntRange> ranges) {
     104      // init random outside loop
     105      Random random = new Random();
     106
     107      // process all given ranges - e.g. TrainingPartition, TestPartition
     108      foreach (IntRange range in ranges) {
     109        List<Tuple<int, int>> shuffledIndices = new List<Tuple<int, int>>();
     110
     111        // generate random indices used for shuffling each column
     112        for (int i = range.End; i > range.Start; --i) {
     113          int rand = random.Next(range.Start, i);
     114          shuffledIndices.Add(new Tuple<int, int>(i, rand));
    21115        }
    22116
    23         public void ReplaceIndicesByValue<T>(string variableName, IEnumerable<int> indices, T value)
    24         {
    25             foreach (int index in indices)
    26             {
    27                 preprocessingData.SetCell<T>(variableName, index, value);
    28             }
     117        foreach (string variableName in preprocessingData.VariableNames) {
     118          if (preprocessingData.IsType<double>(variableName)) {
     119            reOrderToIndices<double>(variableName, shuffledIndices);
     120          } else if (preprocessingData.IsType<string>(variableName)) {
     121            reOrderToIndices<string>(variableName, shuffledIndices);
     122          } else if (preprocessingData.IsType<DateTime>(variableName)) {
     123            reOrderToIndices<DateTime>(variableName, shuffledIndices);
     124          }
    29125        }
     126      }
     127    }
    30128
    31         public void ReplaceIndicesByAverageValue(string variableName, IEnumerable<int> indices)
    32         {
    33             double average = statisticInfo.GetAverage(variableName);
    34             ReplaceIndicesByValue<double>(variableName, indices, average);
    35         }
     129    public void reOrderToIndices<T>(string variableName, List<Tuple<int, int>> indices) {
     130      // process all columns equally
     131      foreach (Tuple<int, int> index in indices) {
     132        int originalIndex = index.Item1;
     133        int replaceIndex = index.Item2;
    36134
    37         public void ReplaceIndicesByMedianValue(string variableName, IEnumerable<int> indices)
    38         {
    39             double median = statisticInfo.GetMedian(variableName);
    40             ReplaceIndicesByValue<double>(variableName, indices, median);
    41         }
     135        T tmp = preprocessingData.GetCell<T>(variableName, originalIndex);
     136        T replaceValue = preprocessingData.GetCell<T>(variableName, replaceIndex);
    42137
    43         public void ReplaceIndicesByRandomValue(string variableName, IEnumerable<int> indices)
    44         {
    45             Random r = new Random();
    46 
    47             double max = statisticInfo.GetMax<double>(variableName);
    48             double min = statisticInfo.GetMin<double>(variableName);
    49             double randMultiplier = (max - min);
    50             foreach (int index in indices)
    51             {
    52                 double rand = r.NextDouble() * randMultiplier + min;
    53                 preprocessingData.SetCell<double>(variableName, index, rand);
    54             }
    55         }
    56 
    57         public void ReplaceIndicesByLinearInterpolationOfNeighbours(string variableName, IEnumerable<int> indices)
    58         {
    59             int countValues = preprocessingData.GetValues<double>(variableName).Count();
    60             foreach (int index in indices)
    61             {
    62                 // don't replace first or last values
    63                 if (index > 0 && index < countValues)
    64                 {
    65                     int prevIndex = indexOfPrevPresentValue(variableName, index);
    66                     int nextIndex = indexOfNextPresentValue(variableName, index);
    67 
    68                     // no neighbours found
    69                     if (prevIndex < 0 && nextIndex >= countValues)
    70                     {
    71                         continue;
    72                     }
    73                     double prev = preprocessingData.GetCell<double>(variableName, prevIndex);
    74                     double next = preprocessingData.GetCell<double>(variableName, nextIndex);
    75 
    76                     int valuesToInterpolate = nextIndex - prevIndex;
    77 
    78                     double interpolationStep = (prev + next) / valuesToInterpolate;
    79 
    80                     for (int i = prevIndex; i < nextIndex; ++i) {
    81                         double interpolated = prev + (interpolationStep * (i-prevIndex));
    82                         preprocessingData.SetCell<double>(variableName, i, interpolated);
    83                     }
    84                 }
    85             }
    86         }
    87 
    88         private int indexOfPrevPresentValue(string variableName, int start) {
    89             int offset = start - 1;
    90             while (offset >= 0 && searchLogic.IsMissingValue(variableName, offset)) {
    91                 offset--;
    92             }
    93 
    94             return offset;
    95         }
    96 
    97         private int indexOfNextPresentValue(string variableName, int start)
    98         {
    99             int offset = start + 1;
    100             while (offset < preprocessingData.Rows && searchLogic.IsMissingValue(variableName, offset))
    101             {
    102                 offset++;
    103             }
    104 
    105             return offset;
    106         }
    107 
    108         public void ReplaceIndicesByMostCommonValue(string variableName, IEnumerable<int> indices)
    109         {
    110             if (preprocessingData.IsType<double>(variableName))
    111             {
    112                 ReplaceIndicesByValue<double>(variableName, indices, statisticInfo.GetMostCommonValue<double>(variableName));
    113             }
    114             else if (preprocessingData.IsType<string>(variableName))
    115             {
    116                 ReplaceIndicesByValue<string>(variableName, indices, statisticInfo.GetMostCommonValue<string>(variableName));
    117             }
    118             else if (preprocessingData.IsType<DateTime>(variableName))
    119             {
    120                 ReplaceIndicesByValue<DateTime>(variableName, indices, statisticInfo.GetMostCommonValue<DateTime>(variableName));
    121             }
    122             else
    123             {
    124                 throw new ArgumentException("column with index: " + variableName + " contains a non supported type.");
    125             }
    126         }
    127 
    128         public void ShuffleWithRanges(IEnumerable<IntRange> ranges)
    129         {
    130             // init random outside loop
    131             Random random = new Random();
    132 
    133             // process all given ranges - e.g. TrainingPartition, TestPartition
    134             foreach (IntRange range in ranges) {
    135                 List<Tuple<int, int>> shuffledIndices = new List<Tuple<int,int>>();
    136                
    137                 // generate random indices used for shuffling each column
    138                 for (int i = range.End; i > range.Start; --i)
    139                 {
    140                     int rand = random.Next(range.Start, i);
    141                     shuffledIndices.Add(new Tuple<int,int>(i,rand));
    142                 }
    143 
    144                 foreach (string variableName in preprocessingData.VariableNames)
    145                 {
    146                     if (preprocessingData.IsType<double>(variableName))
    147                     {
    148                         reOrderToIndices<double>(variableName, shuffledIndices);
    149                     }
    150                     else if (preprocessingData.IsType<string>(variableName))
    151                     {
    152                         reOrderToIndices<string>(variableName, shuffledIndices);
    153                     }
    154                     else if (preprocessingData.IsType<DateTime>(variableName))
    155                     {
    156                         reOrderToIndices<DateTime>(variableName, shuffledIndices);
    157                     }
    158                 }
    159             }     
    160         }
    161 
    162         public void reOrderToIndices<T>(string variableName, List<Tuple<int, int>> indices) {
    163             // process all columns equally
    164             foreach(Tuple<int, int> index in indices)
    165             {
    166                 int originalIndex = index.Item1;
    167                 int replaceIndex = index.Item2;
    168 
    169                 T tmp = preprocessingData.GetCell<T>(variableName, originalIndex);
    170                 T replaceValue = preprocessingData.GetCell<T>(variableName, replaceIndex);
    171 
    172                 preprocessingData.SetCell<T>(variableName, originalIndex, replaceValue);
    173                 preprocessingData.SetCell<T>(variableName, replaceIndex, tmp);
    174             }
    175         }
     138        preprocessingData.SetCell<T>(variableName, originalIndex, replaceValue);
     139        preprocessingData.SetCell<T>(variableName, replaceIndex, tmp);
     140      }
    176141    }
     142  }
    177143}
Note: See TracChangeset for help on using the changeset viewer.