Free cookie consent management tool by TermsFeed Policy Generator

source: branches/2695_dataset-ids/HeuristicLab.DataPreprocessing/3.4/Logic/StatisticsLogic.cs @ 17399

Last change on this file since 17399 was 15110, checked in by pfleck, 7 years ago

#2709: merged branch to trunk

File size: 8.4 KB
Line 
1#region License Information
/* HeuristicLab
 * Copyright (C) 2002-2016 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Linq;
25using HeuristicLab.Common;
26
namespace HeuristicLab.DataPreprocessing {
  /// <summary>
  /// Computes per-column and per-row summary statistics (counts, extrema, central
  /// tendency, spread, quantiles) on top of an <see cref="ITransactionalPreprocessingData"/>
  /// instance, using a <see cref="SearchLogic"/> to obtain dimensions, missing-value
  /// information and the values with missing entries filtered out.
  /// </summary>
  public class StatisticsLogic {
    private readonly ITransactionalPreprocessingData preprocessingData;
    private readonly SearchLogic searchLogic;

    /// <summary>
    /// Creates a statistics facade over the given data and search logic.
    /// </summary>
    /// <param name="thePreprocessingData">Data whose column types, names and values are queried.</param>
    /// <param name="theSearchLogic">Search logic used for dimensions, missing values and filtered values.</param>
    public StatisticsLogic(ITransactionalPreprocessingData thePreprocessingData, SearchLogic theSearchLogic) {
      preprocessingData = thePreprocessingData;
      searchLogic = theSearchLogic;
    }

    /// <summary>Returns the number of columns reported by the search logic.</summary>
    public int GetColumnCount() {
      return searchLogic.Columns;
    }

    /// <summary>Returns the number of rows reported by the search logic.</summary>
    public int GetRowCount() {
      return searchLogic.Rows;
    }

    /// <summary>Returns the number of columns whose variable type is <see cref="double"/>.</summary>
    public int GetNumericColumnCount() {
      int count = 0;
      for (int i = 0; i < searchLogic.Columns; ++i) {
        if (preprocessingData.VariableHasType<double>(i)) {
          ++count;
        }
      }
      return count;
    }

    /// <summary>
    /// Returns the number of columns that are not of type <see cref="double"/>.
    /// Every non-double column (including DateTime columns) is counted here.
    /// </summary>
    public int GetNominalColumnCount() {
      return searchLogic.Columns - GetNumericColumnCount();
    }

    /// <summary>Returns the total number of missing values summed over all columns.</summary>
    public int GetMissingValueCount() {
      int count = 0;
      for (int i = 0; i < searchLogic.Columns; ++i) {
        count += GetMissingValueCount(i);
      }
      return count;
    }

    /// <summary>Returns the number of missing values in the given column.</summary>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    public int GetMissingValueCount(int columnIndex) {
      return searchLogic.GetMissingValueIndices(columnIndex).Count();
    }

    /// <summary>
    /// Returns the smallest non-missing value of the column.
    /// </summary>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    /// <param name="defaultValue">Returned when the column is not of type <typeparamref name="T"/> or has no non-missing values.</param>
    /// <param name="considerSelection">Forwarded to the search logic when fetching the values.</param>
    public T GetMin<T>(int columnIndex, T defaultValue, bool considerSelection = false) where T : IComparable<T> {
      if (!preprocessingData.VariableHasType<T>(columnIndex)) {
        return defaultValue;
      }
      // Materialize once: the sequence may be lazily evaluated, and it is needed
      // both for the emptiness check and for the aggregate.
      var values = GetValuesWithoutNaN<T>(columnIndex, considerSelection).ToList();
      return values.Count > 0 ? values.Min() : defaultValue;
    }

    /// <summary>
    /// Returns the largest non-missing value of the column.
    /// </summary>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    /// <param name="defaultValue">Returned when the column is not of type <typeparamref name="T"/> or has no non-missing values.</param>
    /// <param name="considerSelection">Forwarded to the search logic when fetching the values.</param>
    public T GetMax<T>(int columnIndex, T defaultValue, bool considerSelection = false) where T : IComparable<T> {
      if (!preprocessingData.VariableHasType<T>(columnIndex)) {
        return defaultValue;
      }
      var values = GetValuesWithoutNaN<T>(columnIndex, considerSelection).ToList();
      return values.Count > 0 ? values.Max() : defaultValue;
    }

    /// <summary>
    /// Returns the median of a double column, or <see cref="double.NaN"/> when the
    /// column is not of type double or has no non-missing values.
    /// </summary>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    /// <param name="considerSelection">Forwarded to the search logic when fetching the values.</param>
    public double GetMedian(int columnIndex, bool considerSelection = false) {
      if (!preprocessingData.VariableHasType<double>(columnIndex)) {
        return double.NaN;
      }
      var values = GetValuesWithoutNaN<double>(columnIndex, considerSelection).ToList();
      return values.Count > 0 ? values.Median() : double.NaN;
    }

    /// <summary>
    /// Returns the arithmetic mean of a double column, or <see cref="double.NaN"/> when
    /// the column is not of type double or has no non-missing values.
    /// </summary>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    /// <param name="considerSelection">Forwarded to the search logic when fetching the values.</param>
    public double GetAverage(int columnIndex, bool considerSelection = false) {
      if (!preprocessingData.VariableHasType<double>(columnIndex)) {
        return double.NaN;
      }
      var values = GetValuesWithoutNaN<double>(columnIndex, considerSelection).ToList();
      return values.Count > 0 ? values.Average() : double.NaN;
    }

    /// <summary>
    /// Returns the median of a DateTime column. The default <see cref="DateTime"/> is
    /// returned when the column is not of type DateTime or has no non-missing values.
    /// </summary>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    /// <param name="considerSelection">Forwarded to the search logic when fetching the values.</param>
    public DateTime GetMedianDateTime(int columnIndex, bool considerSelection = false) {
      if (!preprocessingData.VariableHasType<DateTime>(columnIndex)) {
        return new DateTime();
      }
      var seconds = GetDateTimeAsSeconds(columnIndex, considerSelection).ToList();
      // Guard the empty case like GetMedian does, so an all-missing column yields the
      // default value rather than feeding an empty sequence into Median().
      if (seconds.Count == 0) {
        return new DateTime();
      }
      return GetSecondsAsDateTime(seconds.Median());
    }

    /// <summary>
    /// Returns the mean of a DateTime column. The default <see cref="DateTime"/> is
    /// returned when the column is not of type DateTime or has no non-missing values.
    /// </summary>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    /// <param name="considerSelection">Forwarded to the search logic when fetching the values.</param>
    public DateTime GetAverageDateTime(int columnIndex, bool considerSelection = false) {
      if (!preprocessingData.VariableHasType<DateTime>(columnIndex)) {
        return new DateTime();
      }
      var seconds = GetDateTimeAsSeconds(columnIndex, considerSelection).ToList();
      // Enumerable.Average throws InvalidOperationException on an empty sequence.
      if (seconds.Count == 0) {
        return new DateTime();
      }
      return GetSecondsAsDateTime(seconds.Average());
    }

    /// <summary>
    /// Returns the most frequent non-missing value of the column.
    /// </summary>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    /// <param name="defaultValue">Returned when the column has no non-missing values.</param>
    /// <param name="considerSelection">Forwarded to the search logic when fetching the values.</param>
    public T GetMostCommonValue<T>(int columnIndex, T defaultValue, bool considerSelection = false) {
      var values = GetValuesWithoutNaN<T>(columnIndex, considerSelection).ToList();
      if (values.Count == 0) {
        return defaultValue;
      }
      // OrderByDescending is a stable sort, so among equally frequent values the one
      // that occurs first in the column wins.
      return values.GroupBy(x => x)
                   .OrderByDescending(g => g.Count())
                   .Select(g => g.Key)
                   .First();
    }

    /// <summary>
    /// Returns the standard deviation of a double column, or of a DateTime column
    /// expressed in seconds. Returns <see cref="double.NaN"/> for any other column type.
    /// </summary>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    public double GetStandardDeviation(int columnIndex) {
      double stdDev = double.NaN;
      if (preprocessingData.VariableHasType<double>(columnIndex)) {
        stdDev = GetValuesWithoutNaN<double>(columnIndex).StandardDeviation();
      } else if (preprocessingData.VariableHasType<DateTime>(columnIndex)) {
        stdDev = GetDateTimeAsSeconds(columnIndex).StandardDeviation();
      }
      return stdDev;
    }

    /// <summary>
    /// Returns the variance of a double column, or of a DateTime column expressed in
    /// seconds. Returns <see cref="double.NaN"/> for any other column type.
    /// </summary>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    public double GetVariance(int columnIndex) {
      double variance = double.NaN;
      if (preprocessingData.VariableHasType<double>(columnIndex)) {
        variance = GetValuesWithoutNaN<double>(columnIndex).Variance();
      } else if (preprocessingData.VariableHasType<DateTime>(columnIndex)) {
        variance = GetDateTimeAsSeconds(columnIndex).Variance();
      }
      return variance;
    }

    /// <summary>
    /// Returns the 0.25 quantile of a double column, or of a DateTime column expressed
    /// in seconds. Returns <see cref="double.NaN"/> for any other column type.
    /// </summary>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    public double GetOneQuarterPercentile(int columnIndex) {
      return GetQuantile(columnIndex, 0.25);
    }

    /// <summary>
    /// Returns the 0.75 quantile of a double column, or of a DateTime column expressed
    /// in seconds. Returns <see cref="double.NaN"/> for any other column type.
    /// </summary>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    public double GetThreeQuarterPercentile(int columnIndex) {
      return GetQuantile(columnIndex, 0.75);
    }

    /// <summary>
    /// Returns the number of distinct values in the column. Unlike the other statistics
    /// this reads the raw column values, so missing entries are not filtered out.
    /// </summary>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    public int GetDifferentValuesCount<T>(int columnIndex) {
      return preprocessingData.GetValues<T>(columnIndex).Distinct().Count();
    }

    /// <summary>Returns the number of missing values in the given row across all columns.</summary>
    /// <param name="rowIndex">Index of the row to inspect.</param>
    public int GetRowMissingValueCount(int rowIndex) {
      int count = 0;
      // NOTE(review): iterates preprocessingData.Columns while the other counters use
      // searchLogic.Columns - presumably the same value; confirm.
      for (int i = 0; i < preprocessingData.Columns; ++i) {
        if (searchLogic.IsMissingValue(i, rowIndex)) {
          ++count;
        }
      }
      return count;
    }

    /// <summary>Returns the variable name of the given column.</summary>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    public string GetVariableName(int columnIndex) {
      return preprocessingData.GetVariableName(columnIndex);
    }

    /// <summary>Returns whether the given column holds values of type <typeparamref name="T"/>.</summary>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    public bool VariableHasType<T>(int columnIndex) {
      return preprocessingData.VariableHasType<T>(columnIndex);
    }

    /// <summary>
    /// Returns a display name for the column type: "double", "string", "DateTime",
    /// or "Unknown Type" when none of these match.
    /// </summary>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    public string GetColumnTypeAsString(int columnIndex) {
      if (preprocessingData.VariableHasType<double>(columnIndex)) {
        return "double";
      } else if (preprocessingData.VariableHasType<string>(columnIndex)) {
        return "string";
      } else if (preprocessingData.VariableHasType<DateTime>(columnIndex)) {
        return "DateTime";
      }
      return "Unknown Type";
    }

    // Shared implementation of the quartile accessors. DefaultIfEmpty supplies a single
    // NaN so that Quantile never receives an empty sequence.
    private double GetQuantile(int columnIndex, double alpha) {
      double quantile = double.NaN;
      if (preprocessingData.VariableHasType<double>(columnIndex)) {
        quantile = GetValuesWithoutNaN<double>(columnIndex).DefaultIfEmpty(double.NaN).Quantile(alpha);
      } else if (preprocessingData.VariableHasType<DateTime>(columnIndex)) {
        quantile = GetDateTimeAsSeconds(columnIndex).DefaultIfEmpty(double.NaN).Quantile(alpha);
      }
      return quantile;
    }

    // Converts the non-missing DateTime values of a column to seconds (ticks / ticks-per-second).
    private IEnumerable<double> GetDateTimeAsSeconds(int columnIndex, bool considerSelection = false) {
      return GetValuesWithoutNaN<DateTime>(columnIndex, considerSelection).Select(x => (double)x.Ticks / TimeSpan.TicksPerSecond);
    }

    // Thin wrapper over SearchLogic.GetValuesWithoutNaN.
    private IEnumerable<T> GetValuesWithoutNaN<T>(int columnIndex, bool considerSelection = false) {
      return searchLogic.GetValuesWithoutNaN<T>(columnIndex, considerSelection);
    }

    // Inverse of GetDateTimeAsSeconds: adds the seconds to the default DateTime.
    private DateTime GetSecondsAsDateTime(double seconds) {
      DateTime dateTime = new DateTime();
      return dateTime.AddSeconds(seconds);
    }

    /// <summary>
    /// Forwards subscriptions to the <c>Changed</c> event of the underlying preprocessing data.
    /// </summary>
    public event DataPreprocessingChangedEventHandler Changed {
      add { preprocessingData.Changed += value; }
      remove { preprocessingData.Changed -= value; }
    }
  }
}
Note: See TracBrowser for help on using the repository browser.