Changeset 10192


Ignore:
Timestamp:
12/04/13 16:33:18 (6 years ago)
Author:
mleitner
Message:

Implement manipulations per attribute: replace values by the median, the average, a linear interpolation of the previous and next value, or a random value.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • branches/DataPreprocessing/HeuristicLab.DataPreprocessing/3.3/Implementations/PreprocessingData.cs

    r10191 r10192  
    3939
    4040    private double trainingToTestRatio;
     41    private StatisticInfo statisticInfo;
    4142
    4243    private PreprocessingData(PreprocessingData original, Cloner cloner)
     
    7576      Columns = problemData.Dataset.Columns;
    7677      Rows = problemData.Dataset.Rows;
     78
     79      statisticInfo = new StatisticInfo(this);
    7780    }
    7881
     
    104107
    /// <summary>
    /// Looks up the stored values of <paramref name="variableName"/> and hands them
    /// back as a sequence of <typeparamref name="T"/>.
    /// </summary>
    /// <typeparam name="T">Element type the caller expects the column to contain.</typeparam>
    /// <param name="variableName">Name of the column to read.</param>
    /// <returns>The entry of <c>variableValues</c> for the column, cast to <c>IEnumerable&lt;T&gt;</c>.</returns>
    public IEnumerable<T> GetValues<T>(string variableName) {
      var column = variableValues[variableName];
      return (IEnumerable<T>)column;
    }
    108111
     
    175178        return GetValues<string>(variableName).Select((s, i) => new { i, s }).Where(t => string.IsNullOrEmpty(t.s)).Select(t => t.i);
    176179      } else if (IsType<DateTime>(variableName)) {
    177         return GetValues<DateTime>(variableName).Select((s, i) => new { i, s }).Where(t => t.s.Equals(DateTime.MinValue)).Select(t => t.i);
     180          return GetValues<DateTime>(variableName).Select((s, i) => new { i, s }).Where(t => t.s.Equals(DateTime.MinValue)).Select(t => t.i);
    178181      } else {
    179182        throw new ArgumentException("column with variableName: " + variableName + " contains a non supported type.");
     183      }
     184    }
     185
    /// <summary>
    /// Writes <paramref name="value"/> into every row of column
    /// <paramref name="variableName"/> that is listed in <paramref name="indices"/>.
    /// </summary>
    /// <typeparam name="T">Element type of the column.</typeparam>
    /// <param name="variableName">Name of the column to modify.</param>
    /// <param name="indices">Row indices whose cells are overwritten.</param>
    /// <param name="value">Replacement value written to each listed cell.</param>
    public void ReplaceIndicesByValue<T>(string variableName, IEnumerable<int> indices, T value) {
      // Snapshot the indices before writing: callers may pass a lazy query that is
      // still enumerating this very column (the missing-value index queries in this
      // class are built that way), and writing to a collection while it is being
      // enumerated can invalidate the enumerator.
      // NOTE(review): assumes SetCell writes into the same collection GetValues exposes - confirm.
      foreach (int rowIndex in indices.ToList()) {
        SetCell<T>(variableName, rowIndex, value);
      }
    }
     193
    /// <summary>
    /// Overwrites the listed rows of column <paramref name="variableName"/> with the
    /// column average reported by <c>statisticInfo.GetAverage</c>.
    /// </summary>
    /// <param name="variableName">Name of the column to modify; treated as a double column.</param>
    /// <param name="indices">Row indices whose cells are overwritten.</param>
    public void ReplaceIndicesByAverageValue(string variableName, IEnumerable<int> indices) {
      ReplaceIndicesByValue<double>(variableName, indices, statisticInfo.GetAverage(variableName));
    }
     198
    /// <summary>
    /// Overwrites the listed rows of column <paramref name="variableName"/> with the
    /// column median reported by <c>statisticInfo.GetMedian</c>.
    /// </summary>
    /// <remarks>
    /// This method was previously declared as a second <c>ReplaceIndicesByAverageValue</c>
    /// with an identical signature, which cannot compile; it is named after what it
    /// actually computes.
    /// </remarks>
    /// <param name="variableName">Name of the column to modify; treated as a double column.</param>
    /// <param name="indices">Row indices whose cells are overwritten.</param>
    public void ReplaceIndicesByMedianValue(string variableName, IEnumerable<int> indices) {
      double median = statisticInfo.GetMedian(variableName);
      ReplaceIndicesByValue<double>(variableName, indices, median);
    }
     203
    // One generator shared by all calls. Creating a new time-seeded Random per call
    // makes calls issued in quick succession produce the same number sequence.
    private static readonly Random sharedRandom = new Random();
    // Random is not thread-safe; guard the shared instance so sharing it does not
    // introduce a hazard the per-call instance did not have.
    private static readonly object sharedRandomLock = new object();

    /// <summary>
    /// Overwrites the listed rows of column <paramref name="variableName"/> with
    /// random values drawn uniformly from the range between the column minimum
    /// (inclusive) and the column maximum (exclusive), as reported by <c>statisticInfo</c>.
    /// </summary>
    /// <param name="variableName">Name of the column to modify; treated as a double column.</param>
    /// <param name="indices">Row indices whose cells are overwritten.</param>
    public void ReplaceIndicesByRandomValue(string variableName, IEnumerable<int> indices) {
      // Read the bounds once, before any cell is changed.
      double max = statisticInfo.GetMax<double>(variableName);
      double min = statisticInfo.GetMin<double>(variableName);
      double range = max - min;

      // Snapshot the indices: the caller may pass a lazy query over this same
      // column, which must not be enumerated while cells are being written.
      foreach (int rowIndex in indices.ToList()) {
        double sample;
        lock (sharedRandomLock) {
          sample = sharedRandom.NextDouble();
        }
        SetCell<double>(variableName, rowIndex, sample * range + min);
      }
    }
     216
    /// <summary>
    /// Overwrites each listed row of column <paramref name="variableName"/> with the
    /// mean of its directly preceding and following cell.
    /// </summary>
    /// <remarks>
    /// The first and the last row have only one neighbour and are left untouched.
    /// Rows are processed in the order given, so a cell replaced earlier in the same
    /// call is used as a neighbour value for rows processed later.
    /// </remarks>
    /// <param name="variableName">Name of the column to modify; treated as a double column.</param>
    /// <param name="indices">Row indices whose cells are overwritten.</param>
    public void ReplaceIndicesByLinearInterpolationOfNeighbours(string variableName, IEnumerable<int> indices) {
      int lastIndex = GetValues<double>(variableName).Count() - 1;

      // Snapshot the indices: the caller may pass a lazy query over this same
      // column, which must not be enumerated while cells are being written.
      foreach (int rowIndex in indices.ToList()) {
        // Skip rows without both neighbours. The upper bound must exclude the last
        // row itself, otherwise reading rowIndex + 1 runs past the end of the column.
        if (rowIndex <= 0 || rowIndex >= lastIndex) {
          continue;
        }

        double prev = GetCell<double>(variableName, rowIndex - 1);
        double next = GetCell<double>(variableName, rowIndex + 1);
        SetCell<double>(variableName, rowIndex, (prev + next) / 2);
      }
    }
     232
    /// <summary>
    /// Overwrites the listed rows of column <paramref name="variableName"/> with the
    /// column's most common value, as reported by <c>statisticInfo.GetMostCommonValue</c>.
    /// </summary>
    /// <param name="variableName">Name of the column to modify.</param>
    /// <param name="indices">Row indices whose cells are overwritten.</param>
    /// <exception cref="ArgumentException">
    /// The column is neither a double, a string nor a DateTime column.
    /// </exception>
    public void ReplaceIndicesByMostCommonValue(string variableName, IEnumerable<int> indices) {
      if (IsType<double>(variableName)) {
        ReplaceIndicesByValue<double>(variableName, indices, statisticInfo.GetMostCommonValue<double>(variableName));
      } else if (IsType<string>(variableName)) {
        ReplaceIndicesByValue<string>(variableName, indices, statisticInfo.GetMostCommonValue<string>(variableName));
      } else if (IsType<DateTime>(variableName)) {
        ReplaceIndicesByValue<DateTime>(variableName, indices, statisticInfo.GetMostCommonValue<DateTime>(variableName));
      } else {
        // The column is identified by name, not by index; keep the wording in line
        // with the other unsupported-type error raised in this class.
        throw new ArgumentException("column with variableName: " + variableName + " contains a non supported type.");
      }
    }
Note: See TracChangeset for help on using the changeset viewer.