Free cookie consent management tool by TermsFeed Policy Generator

source: branches/DataPreprocessing/HeuristicLab.DataPreprocessing/3.3/Implementations/StatisticsLogic.cs @ 10624

Last change on this file since 10624 was 10624, checked in by mleitner, 11 years ago

Filter undefined values on statistics

File size: 6.9 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2013 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Linq;
25using HeuristicLab.Common;
26
27namespace HeuristicLab.DataPreprocessing
28{
29
30  public class StatisticsLogic : IStatisticsLogic
31  {
32
33    private readonly ITransactionalPreprocessingData preprocessingData;
34    private readonly ISearchLogic searchLogic;
35
36    public StatisticsLogic(ITransactionalPreprocessingData thePreprocessingData, ISearchLogic theSearchLogic)
37    {
38      preprocessingData = thePreprocessingData;
39      searchLogic = theSearchLogic;
40    }
41
42    public int GetColumnCount()
43    {
44      return preprocessingData.Columns;
45    }
46
47    public int GetRowCount()
48    {
49      return preprocessingData.Rows;
50    }
51
52    public int GetNumericColumnCount()
53    {
54      int count = 0;
55
56      for (int i = 0; i < preprocessingData.Columns; ++i)
57      {
58        if (preprocessingData.IsType<double>(i))
59        {
60          ++count;
61        }
62      }
63      return count;
64    }
65
66    public int GetNominalColumnCount()
67    {
68      return preprocessingData.Columns - GetNumericColumnCount();
69    }
70
71    public int GetMissingValueCount()
72    {
73      int count = 0;
74      for (int i = 0; i < preprocessingData.Columns; ++i)
75      {
76        count += GetMissingValueCount(i);
77      }
78      return count;
79    }
80
81    public int GetMissingValueCount(int columnIndex)
82    {
83      return searchLogic.GetMissingValueIndices(columnIndex).Count();
84    }
85
86    public T GetMin<T>(int columnIndex) where T : IComparable<T>
87    {
88      return preprocessingData.GetValues<T>(columnIndex).Min();
89    }
90
91    public T GetMax<T>(int columnIndex) where T : IComparable<T>
92    {
93      return preprocessingData.GetValues<T>(columnIndex).Max();
94    }
95
96    public double GetMedian(int columnIndex)
97    {
98      double median = double.NaN;
99      if (preprocessingData.IsType<double>(columnIndex))
100      {
101        median = GetValuesWithoutNaN<double>(columnIndex).Median();
102      }
103      return median;
104    }
105
106    public double GetAverage(int columnIndex)
107    {
108      double avg = double.NaN;
109      if (preprocessingData.IsType<double>(columnIndex))
110      {
111        avg = GetValuesWithoutNaN<double>(columnIndex).Where(x => !double.IsNaN(x)).Average();
112      }
113      return avg;
114    }
115
116    public DateTime GetMedianDateTime(int columnIndex)
117    {
118      DateTime median = new DateTime();
119      if (preprocessingData.IsType<DateTime>(columnIndex))
120      {
121        median = GetSecondsAsDateTime(GetDateTimeAsSeconds(columnIndex).Median());
122      }
123      return median;
124    }
125
126    public DateTime GetAverageDateTime(int columnIndex)
127    {
128      DateTime avg = new DateTime();
129      if (preprocessingData.IsType<DateTime>(columnIndex))
130      {
131        avg = GetSecondsAsDateTime(GetDateTimeAsSeconds(columnIndex).Average());
132      }
133      return avg;
134    }
135
136    public T GetMostCommonValue<T>(int columnIndex)
137    {
138      var t = preprocessingData.GetValues<T>(columnIndex);
139      var t2 = t.GroupBy(x => x);
140      var t3 = t2.Select(g => g.Key);
141
142      return preprocessingData.GetValues<T>(columnIndex)
143                              .GroupBy(x => x)
144                              .OrderByDescending(g => g.Count())
145                              .Select(g => g.Key)
146                              .First();
147    }
148
149
150    public double GetStandardDeviation(int columnIndex)
151    {
152      double stdDev = double.NaN;
153      if (preprocessingData.IsType<double>(columnIndex))
154      {
155        stdDev = GetValuesWithoutNaN<double>(columnIndex).StandardDeviation();
156      }
157      else if (preprocessingData.IsType<DateTime>(columnIndex))
158      {
159        stdDev = GetDateTimeAsSeconds(columnIndex).StandardDeviation();
160      }
161      return stdDev;
162    }
163
164    public double GetVariance(int columnIndex)
165    {
166      double variance = double.NaN;
167      if (preprocessingData.IsType<double>(columnIndex))
168      {
169        variance = preprocessingData.GetValues<double>(columnIndex).Variance();
170      }
171      else if (preprocessingData.IsType<DateTime>(columnIndex))
172      {
173        variance = GetDateTimeAsSeconds(columnIndex).Variance();
174      }
175      return variance;
176    }
177
178    public int GetDifferentValuesCount<T>(int columnIndex)
179    {
180      return preprocessingData.GetValues<T>(columnIndex).GroupBy(x => x).Count();
181    }
182
183    public int GetRowMissingValueCount(int rowIndex)
184    {
185      int count = 0;
186      for (int i = 0; i < preprocessingData.Columns; ++i)
187      {
188        if (searchLogic.IsMissingValue(i, rowIndex))
189        {
190          ++count;
191        }
192      }
193      return count;
194    }
195
196    public string GetVariableName(int columnIndex)
197    {
198      return preprocessingData.GetVariableName(columnIndex);
199    }
200
201    public bool IsType<T>(int columnIndex)
202    {
203      return preprocessingData.IsType<T>(columnIndex);
204    }
205
206    public string GetColumnTypeAsString(int columnIndex)
207    {
208      if (preprocessingData.IsType<double>(columnIndex))
209      {
210        return "double";
211      }
212      else if (preprocessingData.IsType<string>(columnIndex))
213      {
214        return "string";
215      }
216      else if (preprocessingData.IsType<DateTime>(columnIndex))
217      {
218        return "DateTime";
219      }
220      return "Unknown Type";
221    }
222
223    private List<T> GetValuesWithoutNaN<T>(int columnIndex)
224    {
225      IEnumerable<int> missing = searchLogic.GetMissingValueIndices(columnIndex);
226      return (List<T>)preprocessingData.GetValues<T>(columnIndex).Select((v, i) => new { i, v }).Where(x => !missing.Contains(x.i));
227    }
228    private IEnumerable<double> GetDateTimeAsSeconds(int columnIndex)
229    {
230      return GetValuesWithoutNaN<DateTime>(columnIndex).Select(x => (double)x.Ticks / TimeSpan.TicksPerSecond);
231    }
232
233    private DateTime GetSecondsAsDateTime(double seconds)
234    {
235      DateTime dateTime = new DateTime();
236      return dateTime.AddSeconds(seconds);
237    }
238
239    public event DataPreprocessingChangedEventHandler Changed
240    {
241      add { preprocessingData.Changed += value; }
242      remove { preprocessingData.Changed -= value; }
243    }
244  }
245}
Note: See TracBrowser for help on using the repository browser.