1 | using System;
|
---|
2 | using System.Linq;
|
---|
3 | using HeuristicLab.Problems.DataAnalysis;
|
---|
4 |
|
---|
5 | namespace HeuristicLab.DataPreprocessing {
|
---|
/// <summary>
/// Provides basic statistics about an <see cref="IDataset"/>: its dimensions,
/// the number of missing values and the minimum/maximum of a column.
/// Cell values are read through <c>IDataset.GetValue</c> as strings and parsed
/// on demand with the current culture.
/// </summary>
class DatasetStatisticInfo : IDatasetStatisticInfo {

  // The dataset all statistics are computed on; assigned once in the constructor.
  private readonly IDataset dataSet;

  /// <summary>
  /// Creates a statistics view over the given dataset.
  /// </summary>
  /// <param name="theDataSet">The dataset to inspect. It is stored as-is, without a null check.</param>
  public DatasetStatisticInfo(IDataset theDataSet) {
    dataSet = theDataSet;
  }

  /// <summary>Returns the number of columns reported by the dataset.</summary>
  public int GetColumnCount() {
    return dataSet.Columns;
  }

  /// <summary>Returns the number of rows reported by the dataset.</summary>
  public int GetRowCount() {
    return dataSet.Rows;
  }

  /// <summary>Returns the number of entries in the dataset's <c>DoubleVariables</c>.</summary>
  public int GetNumericColumnCount() {
    return dataSet.DoubleVariables.Count();
  }

  /// <summary>
  /// Returns the number of columns that are not numeric, i.e. the total column
  /// count minus <see cref="GetNumericColumnCount"/>. Every non-double column
  /// is counted here, whatever its actual type.
  /// </summary>
  public int GetNominalColumnCount() {
    return dataSet.Columns - GetNumericColumnCount();
  }

  /// <summary>
  /// Returns the number of missing values summed over all columns.
  /// </summary>
  /// <exception cref="ArgumentException">A column has a type other than double, string or DateTime.</exception>
  public int GetMissingValueCount() {
    int total = 0;
    for (int column = 0; column < dataSet.Columns; ++column) {
      total += GetMissingValueCount(column);
    }
    return total;
  }

  /// <summary>
  /// Returns the number of missing values in a single column. What counts as
  /// "missing" depends on the column type; see <see cref="IsMissingDoubleValue"/>,
  /// <see cref="IsMissingStringValue"/> and <see cref="IsMissingDateTimeValue"/>.
  /// </summary>
  /// <param name="columnIndex">Index of the column to inspect.</param>
  /// <exception cref="ArgumentException">The column has a type other than double, string or DateTime.</exception>
  public int GetMissingValueCount(int columnIndex) {
    Func<string, bool> isMissingValueFunc = GetMissingValueFunc(columnIndex);

    int count = 0;
    for (int row = 0; row < dataSet.Rows; ++row) {
      if (isMissingValueFunc(dataSet.GetValue(row, columnIndex))) {
        ++count;
      }
    }
    return count;
  }

  // Selects the missing-value predicate that matches the type of the given column.
  private Func<string, bool> GetMissingValueFunc(int columnIndex) {
    if (dataSet.IsType<double>(columnIndex)) {
      return IsMissingDoubleValue;
    }
    if (dataSet.IsType<string>(columnIndex)) {
      return IsMissingStringValue;
    }
    if (dataSet.IsType<DateTime>(columnIndex)) {
      return IsMissingDateTimeValue;
    }
    throw new ArgumentException("column with index: " + columnIndex + " contains a non supported type.");
  }

  // A double cell is missing when it cannot be parsed or parses to NaN.
  private bool IsMissingDoubleValue(string value) {
    double parsed;
    bool couldNotParse = !double.TryParse(value, out parsed);
    return couldNotParse || double.IsNaN(parsed);
  }

  // A string cell is missing when it is null or empty.
  private bool IsMissingStringValue(string value) {
    return string.IsNullOrEmpty(value);
  }

  // A DateTime cell is missing when it cannot be parsed or equals DateTime.MinValue.
  // The result of TryParse has to be negated: it returns true on SUCCESS.
  private bool IsMissingDateTimeValue(string value) {
    DateTime parsed;
    bool couldNotParse = !DateTime.TryParse(value, out parsed);
    return couldNotParse || parsed == DateTime.MinValue;
  }

  /// <summary>
  /// Returns the smallest non-missing value of a column.
  /// </summary>
  /// <typeparam name="T">Expected column type; only <see cref="double"/> and <see cref="DateTime"/> are supported.</typeparam>
  /// <param name="columnIndex">Index of the column to inspect.</param>
  /// <returns>
  /// The minimum of all non-missing values. If the column holds no non-missing
  /// value, the largest representable value of <typeparamref name="T"/> is returned.
  /// </returns>
  /// <exception cref="ArgumentException">The column is not of type <typeparamref name="T"/>, or <typeparamref name="T"/> is not supported.</exception>
  public T GetMin<T>(int columnIndex) where T : IComparable<T> {
    EnsureColumnIsOfType<T>(columnIndex);

    // (T)(object) is a checked box/unbox; it is safe because typeof(T) was just tested.
    if (typeof(T) == typeof(double)) {
      return (T)(object)FindMin<double>(columnIndex, double.MaxValue, IsMissingDoubleValue, double.Parse);
    }
    if (typeof(T) == typeof(DateTime)) {
      return (T)(object)FindMin<DateTime>(columnIndex, DateTime.MaxValue, IsMissingDateTimeValue, DateTime.Parse);
    }
    throw new ArgumentException("type of T is not supported");
  }

  /// <summary>
  /// Returns the largest non-missing value of a column.
  /// </summary>
  /// <typeparam name="T">Expected column type; only <see cref="double"/> and <see cref="DateTime"/> are supported.</typeparam>
  /// <param name="columnIndex">Index of the column to inspect.</param>
  /// <returns>
  /// The maximum of all non-missing values. If the column holds no non-missing
  /// value, the smallest representable value of <typeparamref name="T"/> is returned.
  /// </returns>
  /// <exception cref="ArgumentException">The column is not of type <typeparamref name="T"/>, or <typeparamref name="T"/> is not supported.</exception>
  public T GetMax<T>(int columnIndex) where T : IComparable<T> {
    EnsureColumnIsOfType<T>(columnIndex);

    // (T)(object) is a checked box/unbox; it is safe because typeof(T) was just tested.
    if (typeof(T) == typeof(double)) {
      return (T)(object)FindMax<double>(columnIndex, double.MinValue, IsMissingDoubleValue, double.Parse);
    }
    if (typeof(T) == typeof(DateTime)) {
      return (T)(object)FindMax<DateTime>(columnIndex, DateTime.MinValue, IsMissingDateTimeValue, DateTime.Parse);
    }
    throw new ArgumentException("type of T is not supported");
  }

  // Throws when the column does not hold values of the type the caller asked for.
  private void EnsureColumnIsOfType<T>(int columnIndex) {
    if (!dataSet.IsType<T>(columnIndex)) {
      throw new ArgumentException("column with index: " + columnIndex + " was assumed to be of type " + typeof(T).Name + " but was different.");
    }
  }

  // Scans all rows of the column and returns the smallest parsed value, starting
  // from the seed 'max' (returned unchanged when every cell is missing).
  private T FindMin<T>(int columnIndex, T max, Func<string, bool> isMissingValueFunc, Func<string, T> parseFunc) where T : IComparable<T> {
    T min = max;
    for (int row = 0; row < dataSet.Rows; ++row) {
      var value = dataSet.GetValue(row, columnIndex);
      if (isMissingValueFunc(value)) {
        continue;
      }
      T parsedValue = parseFunc(value);
      if (parsedValue.CompareTo(min) < 0) {
        min = parsedValue;
      }
    }
    return min;
  }

  // Scans all rows of the column and returns the largest parsed value, starting
  // from the seed 'min' (returned unchanged when every cell is missing).
  private T FindMax<T>(int columnIndex, T min, Func<string, bool> isMissingValueFunc, Func<string, T> parseFunc) where T : IComparable<T> {
    T max = min;
    for (int row = 0; row < dataSet.Rows; ++row) {
      var value = dataSet.GetValue(row, columnIndex);
      if (isMissingValueFunc(value)) {
        continue;
      }
      T parsedValue = parseFunc(value);
      // Compare against the running maximum, not the initial seed.
      if (parsedValue.CompareTo(max) > 0) {
        max = parsedValue;
      }
    }
    return max;
  }

  /// <summary>Not implemented yet; always throws.</summary>
  /// <exception cref="NotImplementedException">Always.</exception>
  public double GetMedian(int columnIndex) {
    throw new NotImplementedException();
  }

  /// <summary>Not implemented yet; always throws.</summary>
  /// <exception cref="NotImplementedException">Always.</exception>
  public double GetAverage(int columnIndex) {
    throw new NotImplementedException();
  }

  /// <summary>Not implemented yet; always throws.</summary>
  /// <exception cref="NotImplementedException">Always.</exception>
  public double GetMostCommonValue(int columnIndex) {
    throw new NotImplementedException();
  }

  /// <summary>Not implemented yet; always throws.</summary>
  /// <remarks>
  /// NOTE(review): the name looks like a typo for "GetStandardDeviation"; it is
  /// kept because it presumably matches IDatasetStatisticInfo - confirm before renaming.
  /// </remarks>
  /// <exception cref="NotImplementedException">Always.</exception>
  public double GeStandardDeviation(int columnIndex) {
    throw new NotImplementedException();
  }

}
|
---|
151 | }
|
---|