Free cookie consent management tool by TermsFeed Policy Generator

source: stable/HeuristicLab.DataPreprocessing/3.4/Logic/StatisticsLogic.cs @ 15491

Last change on this file since 15491 was 15242, checked in by pfleck, 7 years ago

#2709 merged to stable

File size: 8.4 KB
RevLine 
[10539]1#region License Information
2/* HeuristicLab
[14186]3 * Copyright (C) 2002-2016 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
[10539]4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
[10383]23using System.Collections.Generic;
[10165]24using System.Linq;
[10216]25using HeuristicLab.Common;
[10148]26
namespace HeuristicLab.DataPreprocessing {
  /// <summary>
  /// Provides column- and row-level statistics (counts, min/max, median, average,
  /// variance, quantiles, ...) for the data held by an
  /// <see cref="ITransactionalPreprocessingData"/>. Value access and missing-value
  /// lookup are delegated to a <see cref="SearchLogic"/> instance.
  /// </summary>
  public class StatisticsLogic {
    private readonly ITransactionalPreprocessingData preprocessingData;
    private readonly SearchLogic searchLogic;

    /// <summary>
    /// Creates a statistics provider working on the given data and search logic.
    /// </summary>
    /// <param name="thePreprocessingData">Data whose columns are analyzed.</param>
    /// <param name="theSearchLogic">Search logic used to query values and missing-value indices.</param>
    public StatisticsLogic(ITransactionalPreprocessingData thePreprocessingData, SearchLogic theSearchLogic) {
      preprocessingData = thePreprocessingData;
      searchLogic = theSearchLogic;
    }

    /// <summary>Returns the number of columns reported by the search logic.</summary>
    public int GetColumnCount() {
      return searchLogic.Columns;
    }

    /// <summary>Returns the number of rows reported by the search logic.</summary>
    public int GetRowCount() {
      return searchLogic.Rows;
    }

    /// <summary>Counts the columns whose variable type is <see cref="double"/>.</summary>
    public int GetNumericColumnCount() {
      int count = 0;

      for (int i = 0; i < searchLogic.Columns; ++i) {
        if (preprocessingData.VariableHasType<double>(i)) {
          ++count;
        }
      }
      return count;
    }

    /// <summary>
    /// Returns the number of columns that are not of type <see cref="double"/>,
    /// i.e. the total column count minus <see cref="GetNumericColumnCount"/>.
    /// </summary>
    public int GetNominalColumnCount() {
      return searchLogic.Columns - GetNumericColumnCount();
    }

    /// <summary>Returns the number of missing values summed over all columns.</summary>
    public int GetMissingValueCount() {
      int count = 0;
      for (int i = 0; i < searchLogic.Columns; ++i) {
        count += GetMissingValueCount(i);
      }
      return count;
    }

    /// <summary>
    /// Returns the number of missing-value indices the search logic reports for one column.
    /// </summary>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    public int GetMissingValueCount(int columnIndex) {
      return searchLogic.GetMissingValueIndices(columnIndex).Count();
    }

    /// <summary>
    /// Returns the smallest value of a column.
    /// </summary>
    /// <typeparam name="T">Expected element type of the column.</typeparam>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    /// <param name="defaultValue">
    /// Value returned when the column is not of type <typeparamref name="T"/> or
    /// yields no values.
    /// </param>
    /// <param name="considerSelection">Forwarded to the search logic's value query.</param>
    public T GetMin<T>(int columnIndex, T defaultValue, bool considerSelection = false) where T : IComparable<T> {
      var min = defaultValue;
      if (preprocessingData.VariableHasType<T>(columnIndex)) {
        var values = GetValuesWithoutNaN<T>(columnIndex, considerSelection);
        if (values.Any()) {
          min = values.Min();
        }
      }
      return min;
    }

    /// <summary>
    /// Returns the largest value of a column.
    /// </summary>
    /// <typeparam name="T">Expected element type of the column.</typeparam>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    /// <param name="defaultValue">
    /// Value returned when the column is not of type <typeparamref name="T"/> or
    /// yields no values.
    /// </param>
    /// <param name="considerSelection">Forwarded to the search logic's value query.</param>
    public T GetMax<T>(int columnIndex, T defaultValue, bool considerSelection = false) where T : IComparable<T> {
      var max = defaultValue;
      if (preprocessingData.VariableHasType<T>(columnIndex)) {
        var values = GetValuesWithoutNaN<T>(columnIndex, considerSelection);
        if (values.Any()) {
          max = values.Max();
        }
      }
      return max;
    }

    /// <summary>
    /// Returns the median of a <see cref="double"/> column, or <see cref="double.NaN"/>
    /// when the column has a different type or yields no values.
    /// </summary>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    /// <param name="considerSelection">Forwarded to the search logic's value query.</param>
    public double GetMedian(int columnIndex, bool considerSelection = false) {
      double median = double.NaN;
      if (preprocessingData.VariableHasType<double>(columnIndex)) {
        var values = GetValuesWithoutNaN<double>(columnIndex, considerSelection);
        if (values.Any()) {
          median = values.Median();
        }
      }
      return median;
    }

    /// <summary>
    /// Returns the arithmetic mean of a <see cref="double"/> column, or
    /// <see cref="double.NaN"/> when the column has a different type or yields no values.
    /// </summary>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    /// <param name="considerSelection">Forwarded to the search logic's value query.</param>
    public double GetAverage(int columnIndex, bool considerSelection = false) {
      double avg = double.NaN;
      if (preprocessingData.VariableHasType<double>(columnIndex)) {
        var values = GetValuesWithoutNaN<double>(columnIndex, considerSelection);
        if (values.Any()) {
          avg = values.Average();
        }
      }
      return avg;
    }

    /// <summary>
    /// Returns the median of a <see cref="DateTime"/> column, computed on the values
    /// expressed as seconds. Returns the default <see cref="DateTime"/> when the column
    /// has a different type or yields no values.
    /// </summary>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    /// <param name="considerSelection">Forwarded to the search logic's value query.</param>
    public DateTime GetMedianDateTime(int columnIndex, bool considerSelection = false) {
      DateTime median = new DateTime();
      if (preprocessingData.VariableHasType<DateTime>(columnIndex)) {
        var seconds = GetDateTimeAsSeconds(columnIndex, considerSelection);
        // Guard against an empty column, mirroring GetMedian; without it the
        // aggregate would be asked to work on an empty sequence.
        if (seconds.Any()) {
          median = GetSecondsAsDateTime(seconds.Median());
        }
      }
      return median;
    }

    /// <summary>
    /// Returns the average of a <see cref="DateTime"/> column, computed on the values
    /// expressed as seconds. Returns the default <see cref="DateTime"/> when the column
    /// has a different type or yields no values.
    /// </summary>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    /// <param name="considerSelection">Forwarded to the search logic's value query.</param>
    public DateTime GetAverageDateTime(int columnIndex, bool considerSelection = false) {
      DateTime avg = new DateTime();
      if (preprocessingData.VariableHasType<DateTime>(columnIndex)) {
        var seconds = GetDateTimeAsSeconds(columnIndex, considerSelection);
        // Enumerable.Average throws InvalidOperationException on an empty
        // sequence, so only aggregate when there is at least one value.
        if (seconds.Any()) {
          avg = GetSecondsAsDateTime(seconds.Average());
        }
      }
      return avg;
    }

    /// <summary>
    /// Returns the value that occurs most often in a column. On ties the value whose
    /// first occurrence comes earliest wins (groups keep first-occurrence order and the
    /// ordering by count is stable).
    /// </summary>
    /// <typeparam name="T">Element type used to read the column.</typeparam>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    /// <param name="defaultValue">Value returned when the column yields no values.</param>
    /// <param name="considerSelection">Forwarded to the search logic's value query.</param>
    public T GetMostCommonValue<T>(int columnIndex, T defaultValue, bool considerSelection = false) {
      var values = GetValuesWithoutNaN<T>(columnIndex, considerSelection);
      if (!values.Any())
        return defaultValue;
      return values.GroupBy(x => x)
                   .OrderByDescending(g => g.Count())
                   .Select(g => g.Key)
                   .First();
    }

    /// <summary>
    /// Returns the standard deviation of a <see cref="double"/> column, or of a
    /// <see cref="DateTime"/> column expressed in seconds. Returns
    /// <see cref="double.NaN"/> for any other column type.
    /// </summary>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    public double GetStandardDeviation(int columnIndex) {
      double stdDev = double.NaN;
      if (preprocessingData.VariableHasType<double>(columnIndex)) {
        stdDev = GetValuesWithoutNaN<double>(columnIndex).StandardDeviation();
      } else if (preprocessingData.VariableHasType<DateTime>(columnIndex)) {
        stdDev = GetDateTimeAsSeconds(columnIndex).StandardDeviation();
      }
      return stdDev;
    }

    /// <summary>
    /// Returns the variance of a <see cref="double"/> column, or of a
    /// <see cref="DateTime"/> column expressed in seconds. Returns
    /// <see cref="double.NaN"/> for any other column type.
    /// </summary>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    public double GetVariance(int columnIndex) {
      double variance = double.NaN;
      if (preprocessingData.VariableHasType<double>(columnIndex)) {
        variance = GetValuesWithoutNaN<double>(columnIndex).Variance();
      } else if (preprocessingData.VariableHasType<DateTime>(columnIndex)) {
        variance = GetDateTimeAsSeconds(columnIndex).Variance();
      }
      return variance;
    }

    /// <summary>
    /// Returns the 0.25 quantile of a <see cref="double"/> column, or of a
    /// <see cref="DateTime"/> column expressed in seconds. Returns
    /// <see cref="double.NaN"/> for any other column type.
    /// </summary>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    public double GetOneQuarterPercentile(int columnIndex) {
      return GetQuantile(columnIndex, 0.25);
    }

    /// <summary>
    /// Returns the 0.75 quantile of a <see cref="double"/> column, or of a
    /// <see cref="DateTime"/> column expressed in seconds. Returns
    /// <see cref="double.NaN"/> for any other column type.
    /// </summary>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    public double GetThreeQuarterPercentile(int columnIndex) {
      return GetQuantile(columnIndex, 0.75);
    }

    /// <summary>
    /// Returns the number of distinct values in a column. All values returned by the
    /// data's <c>GetValues</c> call are considered; no missing-value filtering is applied here.
    /// </summary>
    /// <typeparam name="T">Element type used to read the column.</typeparam>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    public int GetDifferentValuesCount<T>(int columnIndex) {
      return preprocessingData.GetValues<T>(columnIndex).Distinct().Count();
    }

    /// <summary>
    /// Counts the columns for which the search logic reports a missing value in the given row.
    /// </summary>
    /// <param name="rowIndex">Index of the row to inspect.</param>
    public int GetRowMissingValueCount(int rowIndex) {
      int count = 0;
      for (int i = 0; i < preprocessingData.Columns; ++i) {
        if (searchLogic.IsMissingValue(i, rowIndex)) {
          ++count;
        }
      }
      return count;
    }

    /// <summary>Returns the variable name of the given column.</summary>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    public string GetVariableName(int columnIndex) {
      return preprocessingData.GetVariableName(columnIndex);
    }

    /// <summary>
    /// Reports whether the given column holds values of type <typeparamref name="T"/>.
    /// </summary>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    public bool VariableHasType<T>(int columnIndex) {
      return preprocessingData.VariableHasType<T>(columnIndex);
    }

    /// <summary>
    /// Returns a display name for the column's type: "double", "string", "DateTime",
    /// or "Unknown Type" when none of these match.
    /// </summary>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    public string GetColumnTypeAsString(int columnIndex) {
      if (preprocessingData.VariableHasType<double>(columnIndex)) {
        return "double";
      }
      if (preprocessingData.VariableHasType<string>(columnIndex)) {
        return "string";
      }
      if (preprocessingData.VariableHasType<DateTime>(columnIndex)) {
        return "DateTime";
      }
      return "Unknown Type";
    }

    /// <summary>
    /// Shared implementation of the percentile getters: computes the requested quantile
    /// for double columns or for DateTime columns expressed in seconds. An empty value
    /// set is replaced by a single NaN before the quantile is taken.
    /// </summary>
    private double GetQuantile(int columnIndex, double alpha) {
      double percentile = double.NaN;
      if (preprocessingData.VariableHasType<double>(columnIndex)) {
        percentile = GetValuesWithoutNaN<double>(columnIndex).DefaultIfEmpty(double.NaN).Quantile(alpha);
      } else if (preprocessingData.VariableHasType<DateTime>(columnIndex)) {
        percentile = GetDateTimeAsSeconds(columnIndex).DefaultIfEmpty(double.NaN).Quantile(alpha);
      }
      return percentile;
    }

    /// <summary>
    /// Projects the DateTime values of a column to seconds (ticks divided by
    /// <see cref="TimeSpan.TicksPerSecond"/>).
    /// </summary>
    private IEnumerable<double> GetDateTimeAsSeconds(int columnIndex, bool considerSelection = false) {
      return GetValuesWithoutNaN<DateTime>(columnIndex, considerSelection).Select(x => (double)x.Ticks / TimeSpan.TicksPerSecond);
    }

    /// <summary>Thin wrapper around the search logic's <c>GetValuesWithoutNaN</c>.</summary>
    private IEnumerable<T> GetValuesWithoutNaN<T>(int columnIndex, bool considerSelection = false) {
      return searchLogic.GetValuesWithoutNaN<T>(columnIndex, considerSelection);
    }

    /// <summary>
    /// Converts a number of seconds back into a <see cref="DateTime"/> by adding it to
    /// the default <see cref="DateTime"/> value.
    /// </summary>
    private DateTime GetSecondsAsDateTime(double seconds) {
      DateTime dateTime = new DateTime();
      return dateTime.AddSeconds(seconds);
    }

    /// <summary>
    /// Forwards subscriptions to the <c>Changed</c> event of the underlying preprocessing data.
    /// </summary>
    public event DataPreprocessingChangedEventHandler Changed {
      add { preprocessingData.Changed += value; }
      remove { preprocessingData.Changed -= value; }
    }
  }
}
Note: See TracBrowser for help on using the repository browser.