Free cookie consent management tool by TermsFeed Policy Generator

source: branches/2906_Transformations/HeuristicLab.DataPreprocessing/3.4/Data/PreprocessingData.cs @ 16654

Last change on this file since 16654 was 15885, checked in by pfleck, 7 years ago

#2906 Updated project references + small refactoring

File size: 28.7 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2018 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections;
24using System.Collections.Generic;
25using System.Globalization;
26using System.Linq;
27using HeuristicLab.Common;
28using HeuristicLab.Core;
29using HeuristicLab.Data;
30using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
31using HeuristicLab.Problems.DataAnalysis;
32
33namespace HeuristicLab.DataPreprocessing {
34
35  [Item("PreprocessingData", "Represents data used for preprocessing.")]
36  [StorableClass]
37  public class PreprocessingData : NamedItem, IPreprocessingData {
38
/// <summary>Column-wise storage: one list per variable, indexed first by column and then by row.</summary>
[Storable]
protected IList<IList> variableValues;

/// <summary>Names of the variables; entries are inserted and removed at the same indices as <see cref="variableValues"/>.</summary>
[Storable]
protected IList<string> variableNames;
43
44    #region Constructor, Cloning & Persistence
/// <summary>
/// Creates preprocessing data from the given problem data by importing its dataset,
/// transformations and partitions.
/// </summary>
/// <param name="problemData">The problem data to import.</param>
public PreprocessingData(IDataAnalysisProblemData problemData)
  : base() {
  // Both collections must exist before Import runs; Import appends to Transformations.
  selection = new Dictionary<int, IList<int>>();
  Transformations = new List<PreprocessingTransformation>();
  Name = "Preprocessing Data";

  Import(problemData);
  RegisterEventHandler();
}
56
/// <summary>
/// Cloning constructor. Copies the column data, names, partitions, transformations,
/// input/target variable information and the current selection of <paramref name="original"/>.
/// The undo history is intentionally not copied.
/// </summary>
/// <param name="original">The instance to copy from.</param>
/// <param name="cloner">The cloner used for deep-cloneable members.</param>
protected PreprocessingData(PreprocessingData original, Cloner cloner)
  : base(original, cloner) {
  variableValues = CopyVariableValues(original.variableValues);
  variableNames = new List<string>(original.variableNames);
  TrainingPartition = (IntRange)original.TrainingPartition.Clone(cloner);
  TestPartition = (IntRange)original.TestPartition.Clone(cloner);
  Transformations = new List<PreprocessingTransformation>(original.Transformations.Select(cloner.Clone));

  InputVariables = new List<string>(original.InputVariables);
  TargetVariable = original.TargetVariable;

  // Without this the clone's selection stays null and every selection-aware
  // member (e.g. GetValues with considerSelection) fails on the clone.
  selection = new Dictionary<int, IList<int>>();
  if (original.selection != null) {
    foreach (var entry in original.selection)
      selection.Add(entry.Key, new List<int>(entry.Value));
  }

  RegisterEventHandler();
}
/// <summary>Creates a copy of this instance via the cloning constructor.</summary>
/// <param name="cloner">The cloner passed on to the cloning constructor.</param>
/// <returns>The newly created copy.</returns>
public override IDeepCloneable Clone(Cloner cloner) {
  var copy = new PreprocessingData(this, cloner);
  return copy;
}
73
/// <summary>Constructor used by the persistence framework; forwards to the base class only.</summary>
[StorableConstructor]
protected PreprocessingData(bool deserializing)
  : base(deserializing) {
}
/// <summary>
/// Re-attaches the internal <see cref="Changed"/> handler after deserialization,
/// mirroring what the other constructors do.
/// </summary>
[StorableHook(HookType.AfterDeserialization)]
private void AfterDeserialization() {
  RegisterEventHandler();
}
81
/// <summary>
/// Subscribes a handler to <see cref="Changed"/> that clamps both partitions to the
/// current row count after row deletions, transformations and unspecific ("Any") changes.
/// </summary>
private void RegisterEventHandler() {
  Changed += (sender, args) => {
    bool clampNeeded =
      args.Type == DataPreprocessingChangedEventType.DeleteRow ||
      args.Type == DataPreprocessingChangedEventType.Any ||
      args.Type == DataPreprocessingChangedEventType.Transformation;
    if (!clampNeeded) return;

    int upperBound = Math.Max(0, Rows);
    TrainingPartition.Start = Math.Min(TrainingPartition.Start, upperBound);
    TrainingPartition.End = Math.Min(TrainingPartition.End, upperBound);
    TestPartition.Start = Math.Min(TestPartition.Start, upperBound);
    TestPartition.End = Math.Min(TestPartition.End, upperBound);
  };
}
97    #endregion
98
99    #region Cells
/// <summary>Tells whether the addressed cell holds a missing value (see <see cref="IsMissingValue"/>).</summary>
/// <param name="columnIndex">Index of the column.</param>
/// <param name="rowIndex">Index of the row.</param>
public bool IsCellEmpty(int columnIndex, int rowIndex) {
  return IsMissingValue(variableValues[columnIndex][rowIndex]);
}
104
/// <summary>Returns the value of the addressed cell cast to <typeparamref name="T"/>.</summary>
/// <param name="columnIndex">Index of the column.</param>
/// <param name="rowIndex">Index of the row.</param>
public T GetCell<T>(int columnIndex, int rowIndex) {
  object cell = variableValues[columnIndex][rowIndex];
  return (T)cell;
}
108
/// <summary>
/// Writes <paramref name="value"/> into the addressed cell. Rows and columns are appended
/// first if the indices lie beyond the current table size; new columns are named after
/// their index. Raises <see cref="Changed"/> unless a transaction is open.
/// </summary>
/// <param name="columnIndex">Index of the column.</param>
/// <param name="rowIndex">Index of the row.</param>
/// <param name="value">The value to store.</param>
public void SetCell<T>(int columnIndex, int rowIndex, T value) {
  SaveSnapshot(DataPreprocessingChangedEventType.ChangeItem, columnIndex, rowIndex);

  // Grow the table until the target cell exists: rows first, then columns.
  int nextRow = Rows;
  while (nextRow <= rowIndex) {
    InsertRow(nextRow);
    nextRow++;
  }
  int nextColumn = Columns;
  while (nextColumn <= columnIndex) {
    InsertColumn<T>(nextColumn.ToString(), nextColumn);
    nextColumn++;
  }

  variableValues[columnIndex][rowIndex] = value;

  if (IsInTransaction) return;
  OnChanged(DataPreprocessingChangedEventType.ChangeItem, columnIndex, rowIndex);
}
121
/// <summary>Returns the string representation of the addressed cell.</summary>
/// <param name="columnIndex">Index of the column.</param>
/// <param name="rowIndex">Index of the row.</param>
/// <returns>
/// The cell's <c>ToString()</c> result, or an empty string when the cell holds null
/// (InsertRow stores null in columns of reference types such as string).
/// </returns>
public string GetCellAsString(int columnIndex, int rowIndex) {
  object cell = variableValues[columnIndex][rowIndex];
  return cell == null ? string.Empty : cell.ToString();
}
125
/// <summary>Returns the values of a column, optionally restricted to the selected rows.</summary>
/// <param name="columnIndex">Index of the column.</param>
/// <param name="considerSelection">
/// If true, a new list with the values of the rows stored in the selection for this column is
/// returned; otherwise the underlying column list itself is returned (not a copy).
/// </param>
public IList<T> GetValues<T>(int columnIndex, bool considerSelection) {
  if (!considerSelection)
    return (IList<T>)variableValues[columnIndex];

  return selection[columnIndex]
    .Select(rowIdx => (T)variableValues[columnIndex][rowIdx])
    .ToList();
}
137
/// <summary>
/// Replaces the complete content of a column. Raises <see cref="Changed"/> unless a
/// transaction is open.
/// </summary>
/// <param name="columnIndex">Index of an existing column.</param>
/// <param name="values">The new values of the column.</param>
/// <exception cref="ArgumentNullException"><paramref name="values"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="columnIndex"/> does not address an existing column.</exception>
/// <exception cref="ArgumentException">The column does not store values of type <typeparamref name="T"/>.</exception>
public void SetValues<T>(int columnIndex, IList<T> values) {
  // Validate everything up front so that a rejected call does not leave a
  // useless snapshot in the undo history.
  if (values == null)
    throw new ArgumentNullException("values");
  if (columnIndex < 0 || columnIndex >= variableValues.Count)
    throw new ArgumentOutOfRangeException("columnIndex");
  if (!VariableHasType<T>(columnIndex))
    throw new ArgumentException("The datatype of column " + columnIndex + " must be of type " + variableValues[columnIndex].GetType().Name + " but was " + typeof(T).Name);

  SaveSnapshot(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1);

  // Columns are recognized by VariableHasType only when stored as List<T>, so any
  // other IList<T> implementation (e.g. an array) is copied into a List<T>.
  variableValues[columnIndex] = values as List<T> ?? new List<T>(values);

  if (!IsInTransaction)
    OnChanged(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1);
}
148
/// <summary>
/// Parses <paramref name="value"/> according to the column's type (double, string or DateTime)
/// and stores it in the addressed cell when parsing succeeds. Blank input for a double column
/// is stored as NaN. A ChangeColumn event is raised afterwards unless a transaction is open.
/// </summary>
/// <param name="value">Textual representation of the new value.</param>
/// <param name="columnIndex">Index of the column.</param>
/// <param name="rowIndex">Index of the row.</param>
/// <returns>True if the value was accepted and stored; otherwise false.</returns>
/// <exception cref="ArgumentException">The column has none of the supported types.</exception>
public bool SetValue(string value, int columnIndex, int rowIndex) {
  bool accepted;

  if (VariableHasType<double>(columnIndex)) {
    double number = double.NaN; // blank input keeps NaN, i.e. a missing value
    accepted = string.IsNullOrWhiteSpace(value) || double.TryParse(value, out number);
    if (accepted) SetCell(columnIndex, rowIndex, number);
  } else if (VariableHasType<string>(columnIndex)) {
    accepted = value != null;
    if (accepted) SetCell(columnIndex, rowIndex, value);
  } else if (VariableHasType<DateTime>(columnIndex)) {
    DateTime timestamp;
    accepted = DateTime.TryParse(value, out timestamp);
    if (accepted) SetCell(columnIndex, rowIndex, timestamp);
  } else {
    throw new ArgumentException("column " + columnIndex + " contains a non supported type.");
  }

  if (!IsInTransaction)
    OnChanged(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1);

  return accepted;
}
179
/// <summary>Number of columns, taken from the number of variable names.</summary>
public int Columns {
  get { return variableNames.Count; }
}

/// <summary>Number of rows, taken from the length of the first column; 0 if there are no columns.</summary>
public int Rows {
  get {
    if (variableValues.Count == 0) return 0;
    return variableValues[0].Count;
  }
}
187
/// <summary>
/// Tells whether a cell value represents a missing value: null, NaN for double,
/// null/empty for string and <see cref="DateTime.MinValue"/> for DateTime.
/// </summary>
/// <param name="value">The boxed cell value.</param>
/// <exception cref="ArgumentException">The value has an unsupported type.</exception>
public static bool IsMissingValue(object value) {
  // InsertRow fills reference-type columns with null, so null must count as missing
  // instead of falling through to the unsupported-type exception.
  if (value == null) return true;
  if (value is double) return double.IsNaN((double)value);
  if (value is string) return string.IsNullOrEmpty((string)value);
  if (value is DateTime) return ((DateTime)value).Equals(DateTime.MinValue);
  throw new ArgumentException();
}
194    #endregion
195
196    #region Rows
/// <summary>
/// Inserts a new row at <paramref name="rowIndex"/> into every column. Value-type columns
/// receive the type's default value, reference-type columns receive null. The partitions are
/// adjusted and <see cref="Changed"/> is raised unless a transaction is open.
/// </summary>
/// <param name="rowIndex">Index at which the row is inserted.</param>
public void InsertRow(int rowIndex) {
  // The snapshot stores the inverse event type; Undo raises Changed with it.
  SaveSnapshot(DataPreprocessingChangedEventType.DeleteRow, -1, rowIndex);
  foreach (IList column in variableValues) {
    Type type = column.GetType().GetGenericArguments()[0];
    column.Insert(rowIndex, type.IsValueType ? Activator.CreateInstance(type) : null);
  }
  ShiftPartitions(rowIndex, +1);
  if (!IsInTransaction)
    OnChanged(DataPreprocessingChangedEventType.AddRow, -1, rowIndex);
}

/// <summary>
/// Removes the row at <paramref name="rowIndex"/> from every column, adjusts the partitions
/// and raises <see cref="Changed"/> unless a transaction is open.
/// </summary>
/// <param name="rowIndex">Index of the row to remove.</param>
public void DeleteRow(int rowIndex) {
  SaveSnapshot(DataPreprocessingChangedEventType.AddRow, -1, rowIndex);
  foreach (IList column in variableValues) {
    column.RemoveAt(rowIndex);
  }
  ShiftPartitions(rowIndex, -1);
  if (!IsInTransaction)
    OnChanged(DataPreprocessingChangedEventType.DeleteRow, -1, rowIndex);
}

/// <summary>
/// Removes all rows with the given indices. Only one snapshot is taken and only one
/// <see cref="Changed"/> event is raised (unless a transaction is open).
/// </summary>
/// <param name="rows">Indices of the rows to remove; duplicates are ignored.</param>
public void DeleteRowsWithIndices(IEnumerable<int> rows) {
  SaveSnapshot(DataPreprocessingChangedEventType.AddRow, -1, -1);
  // Delete from the highest index downwards so that the remaining indices stay valid.
  // Distinct() prevents a repeated index from removing an unrelated neighbouring row.
  foreach (int rowIndex in rows.Distinct().OrderByDescending(x => x)) {
    foreach (IList column in variableValues) {
      column.RemoveAt(rowIndex);
    }
    ShiftPartitions(rowIndex, -1);
  }
  if (!IsInTransaction)
    OnChanged(DataPreprocessingChangedEventType.DeleteRow, -1, -1);
}

/// <summary>
/// Adjusts the partitions after a row was inserted (<paramref name="delta"/> = +1) or
/// removed (<paramref name="delta"/> = -1) at <paramref name="rowIndex"/>: the partition
/// containing the row changes its end, and the other partition is shifted as a whole
/// when it starts at or after the changed partition's new end.
/// </summary>
private void ShiftPartitions(int rowIndex, int delta) {
  if (TrainingPartition.Start <= rowIndex && rowIndex <= TrainingPartition.End) {
    TrainingPartition.End += delta;
    if (TrainingPartition.End <= TestPartition.Start) {
      TestPartition.Start += delta;
      TestPartition.End += delta;
    }
  } else if (TestPartition.Start <= rowIndex && rowIndex <= TestPartition.End) {
    TestPartition.End += delta;
    if (TestPartition.End <= TrainingPartition.Start) {
      // The following partition here is the training partition; the previous code
      // shifted the test partition a second time instead.
      TrainingPartition.Start += delta;
      TrainingPartition.End += delta;
    }
  }
}
263
/// <summary>
/// Inserts a new column of type <typeparamref name="T"/> at <paramref name="columnIndex"/>,
/// filled with default(T) for every existing row. Raises <see cref="Changed"/> unless a
/// transaction is open.
/// </summary>
/// <param name="variableName">Name of the new variable.</param>
/// <param name="columnIndex">Index at which the column is inserted.</param>
public void InsertColumn<T>(string variableName, int columnIndex) {
  SaveSnapshot(DataPreprocessingChangedEventType.DeleteColumn, columnIndex, -1);

  var emptyColumn = new List<T>(Enumerable.Repeat(default(T), Rows));
  variableValues.Insert(columnIndex, emptyColumn);
  variableNames.Insert(columnIndex, variableName);

  if (IsInTransaction) return;
  OnChanged(DataPreprocessingChangedEventType.AddColumn, columnIndex, -1);
}
271
/// <summary>
/// Removes the column at <paramref name="columnIndex"/> together with its name.
/// Raises <see cref="Changed"/> unless a transaction is open.
/// </summary>
/// <param name="columnIndex">Index of the column to remove.</param>
public void DeleteColumn(int columnIndex) {
  SaveSnapshot(DataPreprocessingChangedEventType.AddColumn, columnIndex, -1);

  variableValues.RemoveAt(columnIndex);
  variableNames.RemoveAt(columnIndex);

  if (IsInTransaction) return;
  OnChanged(DataPreprocessingChangedEventType.DeleteColumn, columnIndex, -1);
}
279
/// <summary>
/// Renames a single column. Raises <see cref="Changed"/> (with column and row set to -1)
/// unless a transaction is open.
/// </summary>
/// <param name="columnIndex">Index of an existing column.</param>
/// <param name="name">The new variable name.</param>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="columnIndex"/> does not address an existing column.</exception>
public void RenameColumn(int columnIndex, string name) {
  // Validate before taking the snapshot so a rejected call leaves the undo history
  // untouched. The upper bound is exclusive: index == Count is not a valid column.
  if (columnIndex < 0 || columnIndex >= variableNames.Count)
    throw new ArgumentOutOfRangeException("columnIndex");

  SaveSnapshot(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1);
  variableNames[columnIndex] = name;

  if (!IsInTransaction)
    OnChanged(DataPreprocessingChangedEventType.ChangeColumn, -1, -1);
}
289
/// <summary>
/// Renames all columns at once. Raises <see cref="Changed"/> unless a transaction is open.
/// </summary>
/// <param name="names">The new names, one per column and in column order.</param>
/// <exception cref="ArgumentNullException"><paramref name="names"/> is null.</exception>
/// <exception cref="ArgumentException">The number of names differs from the number of columns.</exception>
public void RenameColumns(IList<string> names) {
  if (names == null)
    throw new ArgumentNullException("names");
  if (names.Count != variableNames.Count)
    throw new ArgumentException("number of names must match the number of columns.", "names");

  SaveSnapshot(DataPreprocessingChangedEventType.ChangeColumn, -1, -1);

  int column = 0;
  while (column < names.Count) {
    variableNames[column] = names[column];
    column++;
  }

  if (IsInTransaction) return;
  OnChanged(DataPreprocessingChangedEventType.ChangeColumn, -1, -1);
}
301
/// <summary>Tells whether every given column passes <c>VariableHasType&lt;string&gt;</c>.</summary>
/// <param name="columnIndices">Indices of the columns to check.</param>
public bool AreAllStringColumns(IEnumerable<int> columnIndices) {
  return columnIndices.All(VariableHasType<string>);
}
305    #endregion
306
307    #region Variables
/// <summary>The names of all variables in column order (exposes the internal name list).</summary>
public IEnumerable<string> VariableNames {
  get {
    return variableNames;
  }
}
311
/// <summary>Returns a new list with the names of all columns that pass <c>VariableHasType&lt;double&gt;</c>, in column order.</summary>
public IEnumerable<string> GetDoubleVariableNames() {
  return Enumerable.Range(0, Columns)
    .Where(column => VariableHasType<double>(column))
    .Select(column => variableNames[column])
    .ToList();
}
321
/// <summary>Returns the name of the variable stored at <paramref name="columnIndex"/>.</summary>
public string GetVariableName(int columnIndex) {
  string name = variableNames[columnIndex];
  return name;
}
325
/// <summary>Returns the result of <c>IndexOf</c> on the variable name list for <paramref name="variableName"/>.</summary>
public int GetColumnIndex(string variableName) {
  int index = variableNames.IndexOf(variableName);
  return index;
}
329
/// <summary>
/// Tells whether the column stores its values as a <c>List&lt;T&gt;</c>. Indices at or beyond
/// the number of stored columns are reported as matching any type.
/// </summary>
/// <param name="columnIndex">Index of the column.</param>
public bool VariableHasType<T>(int columnIndex) {
  if (columnIndex >= variableValues.Count)
    return true;
  return variableValues[columnIndex] is List<T>;
}
333
/// <summary>Returns the single generic type argument of the list that stores the column.</summary>
/// <param name="columnIndex">Index of the column.</param>
public Type GetVariableType(int columnIndex) {
  Type[] typeArguments = variableValues[columnIndex].GetType().GenericTypeArguments;
  return typeArguments.Single();
}
338
/// <summary>Names of the input variables, as set by Import and the cloning constructor.</summary>
public IList<string> InputVariables { get; private set; }

/// <summary>Name of the target variable (optional; Import sets null when the problem data has none).</summary>
public string TargetVariable { get; private set; }
341    #endregion
342
343    #region Partitions
/// <summary>Row range of the training partition.</summary>
[Storable]
public IntRange TrainingPartition { get; set; }

/// <summary>Row range of the test partition.</summary>
[Storable]
public IntRange TestPartition { get; set; }
348    #endregion
349
350    #region Transformations
/// <summary>The transformations associated with this data; Import appends one entry per imported transformation.</summary>
[Storable]
public IList<PreprocessingTransformation> Transformations { get; protected set; }
353    #endregion
354
355    #region Validation
/// <summary>
/// Checks whether <paramref name="value"/> can be parsed into the type of the given column
/// (double, string or DateTime). Blank input is valid for double columns.
/// </summary>
/// <param name="value">Textual representation of the value to check.</param>
/// <param name="errorMessage">Empty if the value is valid; otherwise a description of the problem.</param>
/// <param name="columnIndex">Index of the column.</param>
/// <returns>True if the value is valid for the column; otherwise false.</returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="columnIndex"/> is out of range.</exception>
/// <exception cref="ArgumentException">The column has none of the supported types.</exception>
public bool Validate(string value, out string errorMessage, int columnIndex) {
  // NOTE(review): an index equal to the column count is accepted here; VariableHasType reports
  // such not-yet-existing columns as matching any type, so they are validated as double.
  // Presumably this supports appending a column - confirm before tightening the bound.
  if (columnIndex < 0 || columnIndex > variableNames.Count) {
    // (paramName, message): the single-argument overload would treat the text as the parameter name.
    throw new ArgumentOutOfRangeException("columnIndex", "column index is out of range");
  }

  bool valid = false;
  errorMessage = string.Empty;
  if (VariableHasType<double>(columnIndex)) {
    if (string.IsNullOrWhiteSpace(value)) {
      valid = true;
    } else {
      double val;
      valid = double.TryParse(value, out val);
      if (!valid) {
        errorMessage = "Invalid Value (Valid Value Format: \"" + FormatPatterns.GetDoubleFormatPattern() + "\")";
      }
    }
  } else if (VariableHasType<string>(columnIndex)) {
    valid = value != null;
    if (!valid) {
      errorMessage = "Invalid Value (string must not be null)";
    }
  } else if (VariableHasType<DateTime>(columnIndex)) {
    DateTime date;
    valid = DateTime.TryParse(value, out date);
    if (!valid) {
      // Show an actual pattern (concatenating the DateTimeFormatInfo object only prints its
      // type name) and close the parenthesis like the other messages do.
      errorMessage = "Invalid Value (Valid Value Format: \"" + CultureInfo.CurrentCulture.DateTimeFormat.ShortDatePattern + "\")";
    }
  } else {
    throw new ArgumentException("column " + columnIndex + " contains a non supported type.");
  }

  return valid;
}
390    #endregion
391
392    #region Import & Export
/// <summary>
/// Loads variable names, input/target variables, column values, transformations and partitions
/// from the given problem data. Column values are copied into new lists; each transformation is
/// cloned and appended to <see cref="Transformations"/> marked as applied.
/// </summary>
/// <param name="problemData">The problem data to import; its dataset is cast to <c>Dataset</c>.</param>
/// <exception cref="ArgumentException">A variable is neither of type double, string nor DateTime.</exception>
public void Import(IDataAnalysisProblemData problemData) {
  Dataset dataset = (Dataset)problemData.Dataset;
  variableNames = new List<string>(problemData.Dataset.VariableNames);
  InputVariables = new List<string>(problemData.AllowedInputVariables);

  // Only regression and classification problems carry a target variable.
  var regressionData = problemData as IRegressionProblemData;
  var classificationData = problemData as IClassificationProblemData;
  if (regressionData != null)
    TargetVariable = regressionData.TargetVariable;
  else if (classificationData != null)
    TargetVariable = classificationData.TargetVariable;
  else
    TargetVariable = null;

  variableValues = new List<IList>();
  foreach (var variableName in problemData.Dataset.VariableNames) {
    IList column;
    if (dataset.VariableHasType<double>(variableName))
      column = dataset.GetDoubleValues(variableName).ToList();
    else if (dataset.VariableHasType<string>(variableName))
      column = dataset.GetStringValues(variableName).ToList();
    else if (dataset.VariableHasType<DateTime>(variableName))
      column = dataset.GetDateTimeValues(variableName).ToList();
    else
      throw new ArgumentException("The datatype of column " + variableName + " must be of type double, string or DateTime");
    variableValues.Add(column);
  }

  foreach (var imported in problemData.Transformations) {
    var wrapper = new PreprocessingTransformation(variableNames.Select(x => new StringValue(x))) {
      OriginalVariable = imported.OriginalVariable,
      TransformedVariable = imported.TransformedVariable,
      IsApplied = true
    };
    var transformationCopy = (ITransformation)imported.Transformation.Clone();
    // The copy is registered as a valid value before it is assigned as the selected transformation.
    wrapper.TransformationParameter.ValidValues.Add(transformationCopy);
    wrapper.Transformation = transformationCopy;
    Transformations.Add(wrapper);
  }

  TrainingPartition = new IntRange(problemData.TrainingPartition.Start, problemData.TrainingPartition.End);
  TestPartition = new IntRange(problemData.TestPartition.Start, problemData.TestPartition.End);
}
431
/// <summary>
/// Creates a <c>Dataset</c> from the current variable names and the column lists.
/// The column lists themselves are passed on; they are not copied here.
/// </summary>
public Dataset ExportToDataset() {
  IList<IList> values = Enumerable.Range(0, Columns)
    .Select(column => variableValues[column])
    .ToList();
  return new Dataset(variableNames, values);
}
442    #endregion
443
444    #region Selection
/// <summary>Backing field of <see cref="Selection"/>: selected row indices per column index.</summary>
[Storable]
protected IDictionary<int, IList<int>> selection;

/// <summary>
/// The current selection (column index to selected row indices). Assigning a new
/// dictionary raises <see cref="SelectionChanged"/>.
/// </summary>
public IDictionary<int, IList<int>> Selection {
  get {
    return selection;
  }
  set {
    selection = value;
    OnSelectionChanged();
  }
}
/// <summary>Replaces the selection with an empty one, which raises <see cref="SelectionChanged"/>.</summary>
public void ClearSelection() {
  var empty = new Dictionary<int, IList<int>>();
  Selection = empty;
}
457
/// <summary>Raised whenever <see cref="Selection"/> is assigned.</summary>
public event EventHandler SelectionChanged;

/// <summary>Raises <see cref="SelectionChanged"/> if any handler is attached.</summary>
protected void OnSelectionChanged() {
  // Copy to a local first so the null check and the invocation see the same delegate.
  EventHandler handler = SelectionChanged;
  if (handler == null) return;
  handler(this, EventArgs.Empty);
}
463    #endregion
464
465    #region Transactions
466    // Snapshot/History are not storable/cloneable on purpose
/// <summary>
/// State captured by SaveSnapshot and restored by Undo, together with the event
/// (type, column, row) that Undo raises after restoring it.
/// </summary>
private class Snapshot {
  // Captured data
  public IList<IList> VariableValues { get; set; }
  public IList<string> VariableNames { get; set; }
  public IntRange TrainingPartition { get; set; }
  public IntRange TestPartition { get; set; }
  public IList<PreprocessingTransformation> Transformations { get; set; }

  // Event raised by Undo
  public DataPreprocessingChangedEventType ChangedType { get; set; }
  public int ChangedColumn { get; set; }
  public int ChangedRow { get; set; }
}
479
/// <summary>Raised when the data has changed; the arguments carry the change type, column and row.</summary>
public event DataPreprocessingChangedEventHandler Changed;

/// <summary>Raises <see cref="Changed"/> if any handler is attached.</summary>
/// <param name="type">The kind of change.</param>
/// <param name="column">The affected column index (callers in this class pass -1 when no single column applies).</param>
/// <param name="row">The affected row index (callers in this class pass -1 when no single row applies).</param>
protected virtual void OnChanged(DataPreprocessingChangedEventType type, int column, int row) {
  DataPreprocessingChangedEventHandler handler = Changed;
  if (handler == null) return;
  handler(this, new DataPreprocessingChangedEventArgs(type, column, row));
}
485
/// <summary>Maximum number of snapshots kept in the undo history.</summary>
private const int MAX_UNDO_DEPTH = 5;

/// <summary>Snapshots available for Undo, oldest first.</summary>
private readonly IList<Snapshot> undoHistory = new List<Snapshot>();

/// <summary>Event types of the currently open (possibly nested) transactions.</summary>
private readonly Stack<DataPreprocessingChangedEventType> eventStack = new Stack<DataPreprocessingChangedEventType>();

/// <summary>True while at least one transaction is open.</summary>
public bool IsInTransaction {
  get { return eventStack.Count > 0; }
}
492
/// <summary>
/// Stores a copy of the current state in the undo history, dropping the oldest snapshot once
/// the history holds MAX_UNDO_DEPTH entries. Does nothing while a transaction is open.
/// </summary>
/// <param name="changedType">Event type that Undo raises when this snapshot is restored.</param>
/// <param name="column">Column index reported with that event.</param>
/// <param name="row">Row index reported with that event.</param>
private void SaveSnapshot(DataPreprocessingChangedEventType changedType, int column, int row) {
  if (IsInTransaction) return;

  var snapshot = new Snapshot();
  snapshot.VariableValues = CopyVariableValues(variableValues);
  snapshot.VariableNames = new List<string>(variableNames);
  snapshot.TrainingPartition = new IntRange(TrainingPartition.Start, TrainingPartition.End);
  snapshot.TestPartition = new IntRange(TestPartition.Start, TestPartition.End);
  // Only the list is copied; the transformation objects themselves are shared.
  snapshot.Transformations = new List<PreprocessingTransformation>(Transformations);
  snapshot.ChangedType = changedType;
  snapshot.ChangedColumn = column;
  snapshot.ChangedRow = row;

  bool historyFull = undoHistory.Count >= MAX_UNDO_DEPTH;
  if (historyFull)
    undoHistory.RemoveAt(0);

  undoHistory.Add(snapshot);
}
512
/// <summary>True if the undo history contains at least one snapshot.</summary>
public bool IsUndoAvailable {
  get {
    return undoHistory.Count > 0;
  }
}
516
/// <summary>
/// Restores the most recent snapshot, removes it from the undo history and raises
/// <see cref="Changed"/> with the event data stored in that snapshot. Does nothing if
/// no snapshot is available.
/// </summary>
public void Undo() {
  if (!IsUndoAvailable) return;

  int lastIndex = undoHistory.Count - 1;
  Snapshot latest = undoHistory[lastIndex];

  variableValues = latest.VariableValues;
  variableNames = latest.VariableNames;
  TrainingPartition = latest.TrainingPartition;
  TestPartition = latest.TestPartition;
  Transformations = latest.Transformations;

  undoHistory.RemoveAt(lastIndex);
  OnChanged(latest.ChangedType, latest.ChangedColumn, latest.ChangedRow);
}
531
/// <summary>
/// Runs <paramref name="action"/> inside a transaction: one snapshot is taken up front and a
/// single <see cref="Changed"/> event of the given type is raised when the action has finished.
/// </summary>
/// <param name="action">The modifications to perform.</param>
/// <param name="type">The event type raised when the transaction ends.</param>
/// <exception cref="ArgumentNullException"><paramref name="action"/> is null.</exception>
public void InTransaction(Action action, DataPreprocessingChangedEventType type = DataPreprocessingChangedEventType.Any) {
  if (action == null) throw new ArgumentNullException("action");

  BeginTransaction(type);
  try {
    action();
  } finally {
    // Always close the transaction; otherwise an exception inside the action would leave
    // IsInTransaction true forever, suppressing all later snapshots and change events.
    EndTransaction();
  }
}
537
/// <summary>Opens a transaction of the given type.</summary>
/// <param name="type">The event type raised when this transaction is ended.</param>
public void BeginTransaction(DataPreprocessingChangedEventType type) {
  // The snapshot must be requested before the push: SaveSnapshot is a no-op while a
  // transaction is open, so only the outermost transaction records a snapshot.
  SaveSnapshot(type, -1, -1);

  eventStack.Push(type);
}
542
/// <summary>
/// Closes the most recently opened transaction and raises <see cref="Changed"/> with the
/// event type that was given when it was opened.
/// </summary>
/// <exception cref="InvalidOperationException">No transaction is open.</exception>
public void EndTransaction() {
  if (!IsInTransaction)
    throw new InvalidOperationException("There is no open transaction that can be ended.");

  DataPreprocessingChangedEventType finishedType = eventStack.Pop();
  OnChanged(finishedType, -1, -1);
}
550    #endregion
551
552    #region Statistics
/// <summary>Returns the smallest non-missing value of a column.</summary>
/// <param name="columnIndex">Index of the column.</param>
/// <param name="considerSelection">If true, only the selected rows are considered.</param>
/// <param name="emptyValue">Returned when there is no non-missing value.</param>
public T GetMin<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
  var present = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection);
  if (!present.Any())
    return emptyValue;
  return present.Min();
}
557
/// <summary>Returns the largest non-missing value of a column.</summary>
/// <param name="columnIndex">Index of the column.</param>
/// <param name="considerSelection">If true, only the selected rows are considered.</param>
/// <param name="emptyValue">Returned when there is no non-missing value.</param>
public T GetMax<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
  var present = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection);
  if (!present.Any())
    return emptyValue;
  return present.Max();
}
562
/// <summary>
/// Returns the mean of the non-missing values of a column. Supported for double and DateTime;
/// for string the empty string is returned.
/// </summary>
/// <param name="columnIndex">Index of the column.</param>
/// <param name="considerSelection">If true, only the selected rows are considered.</param>
/// <param name="emptyValue">Returned for double/DateTime when there is no non-missing value.</param>
/// <exception cref="InvalidOperationException"><typeparamref name="T"/> is not supported.</exception>
public T GetMean<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
  Type requested = typeof(T);

  if (requested == typeof(double)) {
    var numbers = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection);
    if (!numbers.Any()) return emptyValue;
    return Convert<T>(numbers.Average());
  }

  if (requested == typeof(string)) {
    return Convert<T>(string.Empty);
  }

  if (requested == typeof(DateTime)) {
    var dates = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection);
    if (!dates.Any()) return emptyValue;
    return Convert<T>(AggregateAsDouble(dates, Enumerable.Average));
  }

  throw new InvalidOperationException(typeof(T) + " not supported");
}
578
/// <summary>Returns the median of the non-missing values of a column.</summary>
/// <param name="columnIndex">Index of the column.</param>
/// <param name="considerSelection">If true, only the selected rows are considered.</param>
/// <param name="emptyValue">Returned when there is no non-missing value.</param>
public T GetMedian<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> {
  // double columns take a dedicated path (the original author notes it is faster)
  if (typeof(T) == typeof(double)) {
    var numbers = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection);
    if (!numbers.Any()) return emptyValue;
    return Convert<T>(numbers.Median());
  }

  var present = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection);
  if (!present.Any()) return emptyValue;
  return present.Quantile(0.5);
}
587
/// <summary>Returns the most frequent non-missing value of a column.</summary>
/// <param name="columnIndex">Index of the column.</param>
/// <param name="considerSelection">If true, only the selected rows are considered.</param>
/// <param name="emptyValue">Returned when there is no non-missing value.</param>
public T GetMode<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IEquatable<T> {
  var present = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection);
  if (!present.Any())
    return emptyValue;

  var byFrequency = present
    .GroupBy(x => x)
    .OrderByDescending(g => g.Count());
  return byFrequency.Select(g => g.Key).First();
}
592
/// <summary>
/// Returns the standard deviation of the non-missing values of a double column.
/// For every other type default(T) is returned.
/// </summary>
/// <param name="columnIndex">Index of the column.</param>
/// <param name="considerSelection">If true, only the selected rows are considered.</param>
/// <param name="emptyValue">Returned for double when there is no non-missing value.</param>
public T GetStandardDeviation<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
  // DateTime is deliberately unsupported: its standard deviation would have to be a TimeSpan.
  if (typeof(T) != typeof(double))
    return default(T);

  var numbers = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection);
  if (!numbers.Any())
    return emptyValue;
  return Convert<T>(numbers.StandardDeviation());
}
605
/// <summary>
/// Returns the variance of the non-missing values of a double column.
/// For every other type default(T) is returned.
/// </summary>
/// <param name="columnIndex">Index of the column.</param>
/// <param name="considerSelection">If true, only the selected rows are considered.</param>
/// <param name="emptyValue">Returned for double when there is no non-missing value.</param>
public T GetVariance<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
  // DateTime is deliberately unsupported: its variance often overflows long, so the
  // corresponding DateTime would be invalid.
  if (typeof(T) != typeof(double))
    return default(T);

  var numbers = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection);
  if (!numbers.Any())
    return emptyValue;
  return Convert<T>(numbers.Variance());
}
618
/// <summary>Returns the <paramref name="alpha"/>-quantile of the non-missing values of a column.</summary>
/// <param name="alpha">The quantile to compute.</param>
/// <param name="columnIndex">Index of the column.</param>
/// <param name="considerSelection">If true, only the selected rows are considered.</param>
/// <param name="emptyValue">Returned when there is no non-missing value.</param>
public T GetQuantile<T>(double alpha, int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> {
  // double columns take a dedicated path (the original author notes it is faster)
  if (typeof(T) == typeof(double)) {
    var numbers = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection);
    if (!numbers.Any()) return emptyValue;
    return Convert<T>(numbers.Quantile(alpha));
  }

  var present = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection);
  if (!present.Any()) return emptyValue;
  return present.Quantile(alpha);
}
627
/// <summary>Returns the number of distinct non-missing values of a column.</summary>
/// <param name="columnIndex">Index of the column.</param>
/// <param name="considerSelection">If true, only the selected rows are considered.</param>
public int GetDistinctValues<T>(int columnIndex, bool considerSelection = false) {
  return GetValuesWithoutMissingValues<T>(columnIndex, considerSelection)
    .Distinct()
    .Count();
}
632
/// <summary>Lazily filters the values of a column down to those that are not missing values.</summary>
/// <param name="columnIndex">Index of the column.</param>
/// <param name="considerSelection">If true, only the selected rows are considered.</param>
private IEnumerable<T> GetValuesWithoutMissingValues<T>(int columnIndex, bool considerSelection) {
  IList<T> all = GetValues<T>(columnIndex, considerSelection);
  return all.Where(x => !IsMissingValue(x));
}
636
/// <summary>
/// Applies a double-based aggregate to DateTime values by converting them to seconds
/// (ticks / TicksPerSecond), aggregating, and converting the result back to a DateTime.
/// </summary>
/// <param name="values">The DateTime values to aggregate.</param>
/// <param name="func">The aggregate applied to the values expressed in seconds.</param>
private static DateTime AggregateAsDouble(IEnumerable<DateTime> values, Func<IEnumerable<double>, double> func) {
  IEnumerable<double> seconds = values.Select(x => (double)x.Ticks / TimeSpan.TicksPerSecond);
  double aggregatedSeconds = func(seconds);
  long ticks = (long)(aggregatedSeconds * TimeSpan.TicksPerSecond);
  return new DateTime(ticks);
}
/// <summary>Casts (unboxes) <paramref name="obj"/> to <typeparamref name="T"/>.</summary>
private static T Convert<T>(object obj) {
  return (T)obj;
}
641
/// <summary>Returns the number of missing values over all columns.</summary>
public int GetMissingValueCount() {
  int total = 0;
  foreach (int column in Enumerable.Range(0, Columns)) {
    total += GetMissingValueCount(column);
  }
  return total;
}
/// <summary>Returns the number of missing values in the given column.</summary>
/// <param name="columnIndex">Index of the column.</param>
public int GetMissingValueCount(int columnIndex) {
  return Enumerable.Range(0, Rows).Count(row => IsCellEmpty(columnIndex, row));
}
/// <summary>Returns the number of missing values in the given row.</summary>
/// <param name="rowIndex">Index of the row.</param>
public int GetRowMissingValueCount(int rowIndex) {
  return Enumerable.Range(0, Columns).Count(column => IsCellEmpty(column, rowIndex));
}
665    #endregion
666
667    #region Helpers
/// <summary>
/// Creates a copy of the column lists: every column is re-created as a new instance of its own
/// runtime type, constructed from the original column. The cell values themselves are not cloned.
/// </summary>
/// <param name="original">The columns to copy.</param>
private static IList<IList> CopyVariableValues(IList<IList> original) {
  IList<IList> copy = original
    .Select(column => (IList)Activator.CreateInstance(column.GetType(), column))
    .ToList();
  return copy;
}
675    #endregion
676  }
677
678  // Adapted from HeuristicLab.Common.EnumerableStatisticExtensions
internal static class EnumerableExtensions {
  /// <summary>
  /// Returns the <paramref name="alpha"/>-quantile of the values, i.e. the element at
  /// (zero-based) rank ceil(n * alpha) - 1 of the sorted values; alpha == 0 yields the minimum.
  /// The input sequence is copied and therefore not modified.
  /// </summary>
  /// <param name="values">The values to select from.</param>
  /// <param name="alpha">The quantile in the range [0, 1].</param>
  /// <exception cref="ArgumentOutOfRangeException"><paramref name="alpha"/> is NaN or outside [0, 1].</exception>
  /// <exception cref="InvalidOperationException">The sequence contains no elements.</exception>
  public static T Quantile<T>(this IEnumerable<T> values, double alpha) where T : IComparable<T> {
    if (double.IsNaN(alpha) || alpha < 0.0 || alpha > 1.0)
      throw new ArgumentOutOfRangeException("alpha", "alpha must be in the range [0, 1].");

    T[] valuesArr = values.ToArray();
    int n = valuesArr.Length;
    if (n == 0) throw new InvalidOperationException("Enumeration contains no elements.");

    // ceil(n * alpha) - 1 evaluates to -1 for alpha == 0, which previously indexed
    // outside the array; clamp so that the 0-quantile is the smallest element.
    int k = Math.Max(0, (int)Math.Ceiling(n * alpha) - 1);

    return Select(k, valuesArr);
  }

  /// <summary>
  /// Quickselect: returns the element that would be at index <paramref name="k"/> if
  /// <paramref name="arr"/> were sorted ascending. The array is partially reordered in place.
  /// </summary>
  private static T Select<T>(int k, T[] arr) where T : IComparable<T> {
    int i, ir, j, l, mid, n = arr.Length;
    T a;
    l = 0;
    ir = n - 1;
    for (; ; ) {
      if (ir <= l + 1) {
        // Active partition contains 1 or 2 elements.
        if (ir == l + 1 && arr[ir].CompareTo(arr[l]) < 0) {
          // Case of 2 elements.
          Swap(arr, l, ir);
        }
        return arr[k];
      } else {
        // Choose the median of the left, center and right elements as partitioning
        // element a, and rearrange so that arr[l] <= arr[l + 1] <= arr[ir].
        mid = (l + ir) >> 1;
        Swap(arr, mid, l + 1);

        if (arr[l].CompareTo(arr[ir]) > 0) {
          Swap(arr, l, ir);
        }

        if (arr[l + 1].CompareTo(arr[ir]) > 0) {
          Swap(arr, l + 1, ir);
        }
        if (arr[l].CompareTo(arr[l + 1]) > 0) {
          Swap(arr, l, l + 1);
        }
        i = l + 1; // Initialize pointers for partitioning.
        j = ir;
        a = arr[l + 1]; // Partitioning element.
        for (; ; ) { // Beginning of innermost loop.
          do i++; while (arr[i].CompareTo(a) < 0); // Scan up to find an element >= a.
          do j--; while (arr[j].CompareTo(a) > 0); // Scan down to find an element <= a.
          if (j < i) break; // Pointers crossed. Partitioning complete.
          Swap(arr, i, j);
        } // End of innermost loop.
        arr[l + 1] = arr[j]; // Insert partitioning element.
        arr[j] = a;
        if (j >= k) ir = j - 1; // Keep active the partition that contains the
        if (j <= k) l = i; // kth element.
      }
    }
  }

  /// <summary>Exchanges the elements at positions <paramref name="i"/> and <paramref name="j"/>.</summary>
  private static void Swap<T>(T[] arr, int i, int j) {
    T temp = arr[i];
    arr[i] = arr[j];
    arr[j] = temp;
  }
}
741}
Note: See TracBrowser for help on using the repository browser.