Free cookie consent management tool by TermsFeed Policy Generator

source: branches/DataPreprocessing/HeuristicLab.DataPreprocessing/3.3/DataSetStatisticInfo.cs @ 10148

Last change on this file since 10148 was 10148, checked in by rstoll, 10 years ago

Initial work for DatasetStatisticInfo

File size: 4.9 KB
Line 
1using System;
2using System.Linq;
3using HeuristicLab.Problems.DataAnalysis;
4
5namespace HeuristicLab.DataPreprocessing {
6  class DatasetStatisticInfo : IDatasetStatisticInfo {
7
8    private IDataset dataSet;
9
10    public DatasetStatisticInfo(IDataset theDataSet) {
11      dataSet = theDataSet;
12    }
13
14
15    public int GetColumnCount() {
16      return dataSet.Columns;
17    }
18
19    public int GetRowCount() {
20      return dataSet.Rows;
21    }
22
23    public int GetNumericColumnCount() {
24      return dataSet.DoubleVariables.Count();
25    }
26
27    public int GetNominalColumnCount() {
28      return dataSet.Columns - GetNumericColumnCount();
29    }
30
31    public int GetMissingValueCount() {
32      int count = 0;
33      for (int i = 0; i < dataSet.Columns; ++i) {
34        count += GetMissingValueCount(i);
35      }
36      return count;
37    }
38
39    public int GetMissingValueCount(int columnIndex) {
40      Func<string, bool> isMissingValueFunc;
41      if (dataSet.IsType<double>(columnIndex)) {
42        isMissingValueFunc = IsMissingDoubleValue;
43      } else if (dataSet.IsType<string>(columnIndex)) {
44        isMissingValueFunc = IsMissingStringValue;
45      } else if (dataSet.IsType<DateTime>(columnIndex)) {
46        isMissingValueFunc = isMissingDateTimeValue;
47      } else {
48        throw new ArgumentException("column with index: " + columnIndex + " contains a non supported type.");
49      }
50
51      int count = 0;
52      for (int i = 0; i < dataSet.Rows; ++i) {
53        if (isMissingValueFunc(dataSet.GetValue(i, columnIndex))) {
54          ++count;
55        }
56      }
57      return count;
58    }
59
60    private bool IsMissingDoubleValue(string value) {
61      double dummy;
62      bool couldNotParse = !double.TryParse(value, out dummy);
63      return couldNotParse || double.IsNaN(dummy);
64    }
65
66    private bool IsMissingStringValue(string value) {
67      return string.IsNullOrEmpty(value);
68    }
69
70    private bool isMissingDateTimeValue(string value) {
71      DateTime dateTime;
72      bool couldNotParse = DateTime.TryParse(value, out dateTime);
73      return couldNotParse || dateTime.Equals(DateTime.MinValue);
74    }
75
76    public T GetMin<T>(int columnIndex) where T : IComparable<T> {
77      if (!dataSet.IsType<double>(columnIndex)) {
78        throw new ArgumentException("column with index: " + columnIndex + " was assumed to be of type " + typeof(T).Name + " but was different.");
79      }
80      if (typeof(T) == typeof(double)) {
81        return (dynamic)GetMin(columnIndex, double.MaxValue, IsMissingDoubleValue, double.Parse); ;
82      } else if (typeof(T) == typeof(DateTime)) {
83        return (dynamic)GetMin(columnIndex, DateTime.MaxValue, IsMissingDoubleValue, DateTime.Parse);
84      } else {
85        throw new ArgumentException("type of T is not supported");
86      }
87    }
88
89    public T GetMax<T>(int columnIndex) where T : IComparable<T> {
90      if (!dataSet.IsType<double>(columnIndex)) {
91        throw new ArgumentException("column with index: " + columnIndex + " was assumed to be of type " + typeof(T).Name + " but was different.");
92      }
93      if (typeof(T) == typeof(double)) {
94        return (dynamic)GetMax(columnIndex, double.MinValue, IsMissingDoubleValue, double.Parse); ;
95      } else if (typeof(T) == typeof(DateTime)) {
96        return (dynamic)GetMax(columnIndex, DateTime.MinValue, IsMissingDoubleValue, DateTime.Parse);
97      } else {
98        throw new ArgumentException("type of T is not supported");
99      }
100    }
101
102    private T GetMin<T>(int columnIndex, T max, Func<string, bool> isMissingValueFunc, Func<string, T> parseFunc) where T : IComparable<T> {
103      T min = max;
104      for (int i = 0; i < dataSet.Rows; ++i) {
105        var value = dataSet.GetValue(i, columnIndex);
106        if (!isMissingValueFunc(value)) {
107          T parsedValue = parseFunc(value);
108          if (parsedValue.CompareTo(min) < 0) {
109            min = parsedValue;
110          }
111        }
112      }
113      return min;
114    }
115
116    private T GetMax<T>(int columnIndex, T min, Func<string, bool> isMissingValueFunc, Func<string, T> parseFunc) where T : IComparable<T> {
117      T max = min;
118      for (int i = 0; i < dataSet.Rows; ++i) {
119        var value = dataSet.GetValue(i, columnIndex);
120        if (!isMissingValueFunc(value)) {
121          T parsedValue = parseFunc(value);
122          if (parsedValue.CompareTo(min) > 0) {
123            max = parsedValue;
124          }
125        }
126      }
127      return max;
128    }
129
130
131
132   
133
134    public double GetMedian(int columnIndex) {
135      throw new System.NotImplementedException();
136    }
137
138    public double GetAverage(int columnIndex) {
139      throw new System.NotImplementedException();
140    }
141
142    public double GetMostCommonValue(int columnIndex) {
143      throw new System.NotImplementedException();
144    }
145
146    public double GeStandardDeviation(int columnIndex) {
147      throw new System.NotImplementedException();
148    }
149
150  }
151}
Note: See TracBrowser for help on using the repository browser.