Free cookie consent management tool by TermsFeed Policy Generator

source: trunk/HeuristicLab.DataPreprocessing/3.4/Data/PreprocessingData.cs @ 17243

Last change on this file since 17243 was 17180, checked in by swagner, 5 years ago

#2875: Removed years in copyrights

File size: 28.1 KB
RevLine 
[10163]1#region License Information
2/* HeuristicLab
[17180]3 * Copyright (C) Heuristic and Evolutionary Algorithms Laboratory (HEAL)
[10163]4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
[10168]23using System.Collections;
[10163]24using System.Collections.Generic;
[15518]25using System.Globalization;
[10185]26using System.Linq;
27using HeuristicLab.Common;
[10163]28using HeuristicLab.Core;
[10220]29using HeuristicLab.Data;
[16565]30using HEAL.Attic;
[10163]31using HeuristicLab.Problems.DataAnalysis;
32
[10182]33namespace HeuristicLab.DataPreprocessing {
[10550]34
[10163]35  [Item("PreprocessingData", "Represents data used for preprocessing.")]
[16565]36  [StorableType("DDF0FC89-E180-47EB-B96E-CBD9E15D697E")]
[15518]37  public class PreprocessingData : NamedItem, IPreprocessingData {
[10978]38
    // Cell storage, indexed as variableValues[column][row]; Import fills it with
    // one List<double>, List<string> or List<DateTime> per dataset variable.
[15518]39    [Storable]
[10740]40    protected IList<IList> variableValues;
    // Column names; variableNames[i] is the name of the column at variableValues[i].
[15518]41    [Storable]
[10586]42    protected IList<string> variableNames;
[10168]43
[15518]44    #region Constructor, Cloning & Persistence
45    public PreprocessingData(IDataAnalysisProblemData problemData)
46      : base() {
47      Name = "Preprocessing Data";
48
49      Transformations = new List<ITransformation>();
50      selection = new Dictionary<int, IList<int>>();
51
52      Import(problemData);
53
54      RegisterEventHandler();
55    }
56
57    protected PreprocessingData(PreprocessingData original, Cloner cloner)
58      : base(original, cloner) {
59      variableValues = CopyVariableValues(original.variableValues);
60      variableNames = new List<string>(original.variableNames);
61      TrainingPartition = (IntRange)original.TrainingPartition.Clone(cloner);
62      TestPartition = (IntRange)original.TestPartition.Clone(cloner);
63      Transformations = new List<ITransformation>(original.Transformations.Select(cloner.Clone));
64
65      InputVariables = new List<string>(original.InputVariables);
66      TargetVariable = original.TargetVariable;
67
68      RegisterEventHandler();
69    }
70    public override IDeepCloneable Clone(Cloner cloner) {
71      return new PreprocessingData(this, cloner);
72    }
73
74    [StorableConstructor]
[16565]75    protected PreprocessingData(StorableConstructorFlag _) : base(_) { }
[15518]76    [StorableHook(HookType.AfterDeserialization)]
77    private void AfterDeserialization() {
78      RegisterEventHandler();
79    }
80
81    private void RegisterEventHandler() {
82      Changed += (s, e) => {
83        switch (e.Type) {
84          case DataPreprocessingChangedEventType.DeleteRow:
85          case DataPreprocessingChangedEventType.Any:
86          case DataPreprocessingChangedEventType.Transformation:
87            int maxRowIndex = Math.Max(0, Rows);
88            TrainingPartition.Start = Math.Min(TrainingPartition.Start, maxRowIndex);
89            TrainingPartition.End = Math.Min(TrainingPartition.End, maxRowIndex);
90            TestPartition.Start = Math.Min(TestPartition.Start, maxRowIndex);
91            TestPartition.End = Math.Min(TestPartition.End, maxRowIndex);
92            break;
93        }
94      };
95    }
96    #endregion
97
98    #region Cells
99    public bool IsCellEmpty(int columnIndex, int rowIndex) {
100      var value = variableValues[columnIndex][rowIndex];
101      return IsMissingValue(value);
102    }
103
104    public T GetCell<T>(int columnIndex, int rowIndex) {
105      return (T)variableValues[columnIndex][rowIndex];
106    }
107
108    public void SetCell<T>(int columnIndex, int rowIndex, T value) {
109      SaveSnapshot(DataPreprocessingChangedEventType.ChangeItem, columnIndex, rowIndex);
110
111      for (int i = Rows; i <= rowIndex; i++)
112        InsertRow(i);
113      for (int i = Columns; i <= columnIndex; i++)
114        InsertColumn<T>(i.ToString(), i);
115
116      variableValues[columnIndex][rowIndex] = value;
117      if (!IsInTransaction)
118        OnChanged(DataPreprocessingChangedEventType.ChangeItem, columnIndex, rowIndex);
119    }
120
121    public string GetCellAsString(int columnIndex, int rowIndex) {
122      return variableValues[columnIndex][rowIndex].ToString();
123    }
124
125    public IList<T> GetValues<T>(int columnIndex, bool considerSelection) {
126      if (considerSelection) {
127        var list = new List<T>();
128        foreach (var rowIdx in selection[columnIndex]) {
129          list.Add((T)variableValues[columnIndex][rowIdx]);
130        }
131        return list;
132      } else {
133        return (IList<T>)variableValues[columnIndex];
134      }
135    }
136
137    public void SetValues<T>(int columnIndex, IList<T> values) {
138      SaveSnapshot(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1);
139      if (VariableHasType<T>(columnIndex)) {
140        variableValues[columnIndex] = (IList)values;
141      } else {
142        throw new ArgumentException("The datatype of column " + columnIndex + " must be of type " + variableValues[columnIndex].GetType().Name + " but was " + typeof(T).Name);
143      }
144      if (!IsInTransaction)
145        OnChanged(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1);
146    }
147
148    public bool SetValue(string value, int columnIndex, int rowIndex) {
149      bool valid = false;
150      if (VariableHasType<double>(columnIndex)) {
151        double val;
152        if (string.IsNullOrWhiteSpace(value)) {
153          val = double.NaN;
154          valid = true;
155        } else {
156          valid = double.TryParse(value, out val);
157        }
158        if (valid)
159          SetCell(columnIndex, rowIndex, val);
160      } else if (VariableHasType<string>(columnIndex)) {
161        valid = value != null;
162        if (valid)
163          SetCell(columnIndex, rowIndex, value);
164      } else if (VariableHasType<DateTime>(columnIndex)) {
165        DateTime date;
166        valid = DateTime.TryParse(value, out date);
167        if (valid)
168          SetCell(columnIndex, rowIndex, date);
169      } else {
170        throw new ArgumentException("column " + columnIndex + " contains a non supported type.");
171      }
172
173      if (!IsInTransaction)
174        OnChanged(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1);
175
176      return valid;
177    }
178
179    public int Columns {
180      get { return variableNames.Count; }
181    }
182
183    public int Rows {
184      get { return variableValues.Count > 0 ? variableValues[0].Count : 0; }
185    }
186
187    public static bool IsMissingValue(object value) {
188      if (value is double) return double.IsNaN((double)value);
189      if (value is string) return string.IsNullOrEmpty((string)value);
190      if (value is DateTime) return ((DateTime)value).Equals(DateTime.MinValue);
191      throw new ArgumentException();
192    }
193    #endregion
194
195    #region Rows
196    public void InsertRow(int rowIndex) {
197      SaveSnapshot(DataPreprocessingChangedEventType.DeleteRow, -1, rowIndex);
198      foreach (IList column in variableValues) {
199        Type type = column.GetType().GetGenericArguments()[0];
200        column.Insert(rowIndex, type.IsValueType ? Activator.CreateInstance(type) : null);
201      }
202      if (TrainingPartition.Start <= rowIndex && rowIndex <= TrainingPartition.End) {
203        TrainingPartition.End++;
204        if (TrainingPartition.End <= TestPartition.Start) {
205          TestPartition.Start++;
206          TestPartition.End++;
207        }
208      } else if (TestPartition.Start <= rowIndex && rowIndex <= TestPartition.End) {
209        TestPartition.End++;
210        if (TestPartition.End <= TrainingPartition.Start) {
211          TestPartition.Start++;
212          TestPartition.End++;
213        }
214      }
215      if (!IsInTransaction)
216        OnChanged(DataPreprocessingChangedEventType.AddRow, -1, rowIndex);
217    }
218    public void DeleteRow(int rowIndex) {
219      SaveSnapshot(DataPreprocessingChangedEventType.AddRow, -1, rowIndex);
220      foreach (IList column in variableValues) {
221        column.RemoveAt(rowIndex);
222      }
223      if (TrainingPartition.Start <= rowIndex && rowIndex <= TrainingPartition.End) {
224        TrainingPartition.End--;
225        if (TrainingPartition.End <= TestPartition.Start) {
226          TestPartition.Start--;
227          TestPartition.End--;
228        }
229      } else if (TestPartition.Start <= rowIndex && rowIndex <= TestPartition.End) {
230        TestPartition.End--;
231        if (TestPartition.End <= TrainingPartition.Start) {
232          TestPartition.Start--;
233          TestPartition.End--;
234        }
235      }
236      if (!IsInTransaction)
237        OnChanged(DataPreprocessingChangedEventType.DeleteRow, -1, rowIndex);
238    }
239    public void DeleteRowsWithIndices(IEnumerable<int> rows) {
240      SaveSnapshot(DataPreprocessingChangedEventType.AddRow, -1, -1);
241      foreach (int rowIndex in rows.OrderByDescending(x => x)) {
242        foreach (IList column in variableValues) {
243          column.RemoveAt(rowIndex);
244        }
245        if (TrainingPartition.Start <= rowIndex && rowIndex <= TrainingPartition.End) {
246          TrainingPartition.End--;
247          if (TrainingPartition.End <= TestPartition.Start) {
248            TestPartition.Start--;
249            TestPartition.End--;
250          }
251        } else if (TestPartition.Start <= rowIndex && rowIndex <= TestPartition.End) {
252          TestPartition.End--;
253          if (TestPartition.End <= TrainingPartition.Start) {
254            TestPartition.Start--;
255            TestPartition.End--;
256          }
257        }
258      }
259      if (!IsInTransaction)
260        OnChanged(DataPreprocessingChangedEventType.DeleteRow, -1, -1);
261    }
262
263    public void InsertColumn<T>(string variableName, int columnIndex) {
264      SaveSnapshot(DataPreprocessingChangedEventType.DeleteColumn, columnIndex, -1);
265      variableValues.Insert(columnIndex, new List<T>(Enumerable.Repeat(default(T), Rows)));
266      variableNames.Insert(columnIndex, variableName);
267      if (!IsInTransaction)
268        OnChanged(DataPreprocessingChangedEventType.AddColumn, columnIndex, -1);
269    }
270
271    public void DeleteColumn(int columnIndex) {
272      SaveSnapshot(DataPreprocessingChangedEventType.AddColumn, columnIndex, -1);
273      variableValues.RemoveAt(columnIndex);
274      variableNames.RemoveAt(columnIndex);
275      if (!IsInTransaction)
276        OnChanged(DataPreprocessingChangedEventType.DeleteColumn, columnIndex, -1);
277    }
278
279    public void RenameColumn(int columnIndex, string name) {
280      SaveSnapshot(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1);
281      if (columnIndex < 0 || columnIndex > variableNames.Count)
282        throw new ArgumentOutOfRangeException("columnIndex");
283      variableNames[columnIndex] = name;
284
285      if (!IsInTransaction)
286        OnChanged(DataPreprocessingChangedEventType.ChangeColumn, -1, -1);
287    }
288
289    public void RenameColumns(IList<string> names) {
290      if (names == null) throw new ArgumentNullException("names");
291      if (names.Count != variableNames.Count) throw new ArgumentException("number of names must match the number of columns.", "names");
292
293      SaveSnapshot(DataPreprocessingChangedEventType.ChangeColumn, -1, -1);
294      for (int i = 0; i < names.Count; i++)
295        variableNames[i] = names[i];
296
297      if (!IsInTransaction)
298        OnChanged(DataPreprocessingChangedEventType.ChangeColumn, -1, -1);
299    }
300
301    public bool AreAllStringColumns(IEnumerable<int> columnIndices) {
302      return columnIndices.All(x => VariableHasType<string>(x));
303    }
304    #endregion
305
306    #region Variables
[10978]307    public IEnumerable<string> VariableNames {
308      get { return variableNames; }
309    }
[10186]310
[10992]311    public IEnumerable<string> GetDoubleVariableNames() {
312      var doubleVariableNames = new List<string>();
313      for (int i = 0; i < Columns; ++i) {
[11156]314        if (VariableHasType<double>(i)) {
[10992]315          doubleVariableNames.Add(variableNames[i]);
316        }
317      }
318      return doubleVariableNames;
319    }
320
[15518]321    public string GetVariableName(int columnIndex) {
322      return variableNames[columnIndex];
323    }
[14381]324
[15518]325    public int GetColumnIndex(string variableName) {
326      return variableNames.IndexOf(variableName);
[10978]327    }
[10695]328
[15518]329    public bool VariableHasType<T>(int columnIndex) {
330      return columnIndex >= variableValues.Count || variableValues[columnIndex] is List<T>;
[10978]331    }
[10804]332
[15518]333    public Type GetVariableType(int columnIndex) {
334      var listType = variableValues[columnIndex].GetType();
335      return listType.GenericTypeArguments.Single();
[10992]336    }
[10978]337
    // Names of the allowed input variables; Import copies them from the problem data.
[15518]338    public IList<string> InputVariables { get; private set; }
    // Target variable name; Import sets it to null when the problem data is neither
    // regression nor classification problem data.
339    public string TargetVariable { get; private set; } // optional
340    #endregion
[10994]341
[15518]342    #region Partitions
    // Row range of the training partition; Import initializes it from the problem data
    // and the Changed handler clamps it to the current row count.
343    [Storable]
344    public IntRange TrainingPartition { get; set; }
    // Row range of the test partition; initialized and clamped like TrainingPartition.
345    [Storable]
346    public IntRange TestPartition { get; set; }
347    #endregion
[14381]348
[15518]349    #region Transformations
    // Transformations kept alongside the data; the public constructor starts with an
    // empty list, and snapshots/undo save and restore the list.
350    [Storable]
351    public IList<ITransformation> Transformations { get; protected set; }
352    #endregion
[10187]353
[15518]354    #region Validation
355    public bool Validate(string value, out string errorMessage, int columnIndex) {
356      if (columnIndex < 0 || columnIndex > VariableNames.Count()) {
357        throw new ArgumentOutOfRangeException("column index is out of range");
358      }
[10168]359
[15518]360      bool valid = false;
361      errorMessage = string.Empty;
362      if (VariableHasType<double>(columnIndex)) {
363        if (string.IsNullOrWhiteSpace(value)) {
364          valid = true;
365        } else {
366          double val;
367          valid = double.TryParse(value, out val);
368          if (!valid) {
369            errorMessage = "Invalid Value (Valid Value Format: \"" + FormatPatterns.GetDoubleFormatPattern() + "\")";
370          }
371        }
372      } else if (VariableHasType<string>(columnIndex)) {
373        valid = value != null;
374        if (!valid) {
375          errorMessage = "Invalid Value (string must not be null)";
376        }
377      } else if (VariableHasType<DateTime>(columnIndex)) {
378        DateTime date;
379        valid = DateTime.TryParse(value, out date);
380        if (!valid) {
381          errorMessage = "Invalid Value (Valid Value Format: \"" + CultureInfo.CurrentCulture.DateTimeFormat + "\"";
382        }
383      } else {
384        throw new ArgumentException("column " + columnIndex + " contains a non supported type.");
385      }
[10786]386
[15518]387      return valid;
[13502]388    }
[15518]389    #endregion
[13502]390
[15518]391    #region Import & Export
[13502]392    public void Import(IDataAnalysisProblemData problemData) {
[12509]393      Dataset dataset = (Dataset)problemData.Dataset;
[10187]394      variableNames = new List<string>(problemData.Dataset.VariableNames);
[14381]395      InputVariables = new List<string>(problemData.AllowedInputVariables);
396      TargetVariable = (problemData is IRegressionProblemData) ? ((IRegressionProblemData)problemData).TargetVariable
397        : (problemData is IClassificationProblemData) ? ((IClassificationProblemData)problemData).TargetVariable
[15518]398          : null;
[10187]399
[10367]400      int columnIndex = 0;
[10740]401      variableValues = new List<IList>();
[10185]402      foreach (var variableName in problemData.Dataset.VariableNames) {
[11156]403        if (dataset.VariableHasType<double>(variableName)) {
[11002]404          variableValues.Insert(columnIndex, dataset.GetDoubleValues(variableName).ToList());
[11156]405        } else if (dataset.VariableHasType<string>(variableName)) {
[11002]406          variableValues.Insert(columnIndex, dataset.GetStringValues(variableName).ToList());
[11156]407        } else if (dataset.VariableHasType<DateTime>(variableName)) {
[11002]408          variableValues.Insert(columnIndex, dataset.GetDateTimeValues(variableName).ToList());
[10168]409        } else {
[10978]410          throw new ArgumentException("The datatype of column " + variableName + " must be of type double, string or DateTime");
[10168]411        }
[10367]412        ++columnIndex;
[10168]413      }
[10185]414
[10994]415      TrainingPartition = new IntRange(problemData.TrainingPartition.Start, problemData.TrainingPartition.End);
416      TestPartition = new IntRange(problemData.TestPartition.Start, problemData.TestPartition.End);
[10163]417    }
418
[15518]419    public Dataset ExportToDataset() {
420      IList<IList> values = new List<IList>();
421
422      for (int i = 0; i < Columns; ++i) {
423        values.Add(variableValues[i]);
424      }
425
426      var dataset = new Dataset(variableNames, values);
427      return dataset;
428    }
429    #endregion
430
431    #region Selection
432    [Storable]
433    protected IDictionary<int, IList<int>> selection;
434    public IDictionary<int, IList<int>> Selection {
435      get { return selection; }
436      set {
437        selection = value;
438        OnSelectionChanged();
439      }
440    }
441    public void ClearSelection() {
442      Selection = new Dictionary<int, IList<int>>();
443    }
444
445    public event EventHandler SelectionChanged;
446    protected void OnSelectionChanged() {
447      var listeners = SelectionChanged;
448      if (listeners != null) listeners(this, EventArgs.Empty);
449    }
450    #endregion
451
452    #region Transactions
    // Snapshot/History are not storable/cloneable on purpose
454    private class Snapshot {
455      public IList<IList> VariableValues { get; set; }
456      public IList<string> VariableNames { get; set; }
457
458      public IntRange TrainingPartition { get; set; }
459      public IntRange TestPartition { get; set; }
460      public IList<ITransformation> Transformations { get; set; }
461      public DataPreprocessingChangedEventType ChangedType { get; set; }
462
463      public int ChangedColumn { get; set; }
464      public int ChangedRow { get; set; }
465    }
466
467    public event DataPreprocessingChangedEventHandler Changed;
468    protected virtual void OnChanged(DataPreprocessingChangedEventType type, int column, int row) {
469      var listeners = Changed;
470      if (listeners != null) listeners(this, new DataPreprocessingChangedEventArgs(type, column, row));
471    }
472
473    private const int MAX_UNDO_DEPTH = 5;
474
475    private readonly IList<Snapshot> undoHistory = new List<Snapshot>();
476    private readonly Stack<DataPreprocessingChangedEventType> eventStack = new Stack<DataPreprocessingChangedEventType>();
477
478    public bool IsInTransaction { get { return eventStack.Count > 0; } }
479
480    private void SaveSnapshot(DataPreprocessingChangedEventType changedType, int column, int row) {
481      if (IsInTransaction) return;
482
483      var currentSnapshot = new Snapshot {
484        VariableValues = CopyVariableValues(variableValues),
485        VariableNames = new List<string>(variableNames),
486        TrainingPartition = new IntRange(TrainingPartition.Start, TrainingPartition.End),
487        TestPartition = new IntRange(TestPartition.Start, TestPartition.End),
488        Transformations = new List<ITransformation>(Transformations),
489        ChangedType = changedType,
490        ChangedColumn = column,
491        ChangedRow = row
[10994]492      };
[15518]493
494      if (undoHistory.Count >= MAX_UNDO_DEPTH)
495        undoHistory.RemoveAt(0);
496
497      undoHistory.Add(currentSnapshot);
[10994]498    }
499
[15518]500    public bool IsUndoAvailable {
501      get { return undoHistory.Count > 0; }
[10994]502    }
503
[15518]504    public void Undo() {
505      if (IsUndoAvailable) {
506        Snapshot previousSnapshot = undoHistory[undoHistory.Count - 1];
507        variableValues = previousSnapshot.VariableValues;
508        variableNames = previousSnapshot.VariableNames;
509        TrainingPartition = previousSnapshot.TrainingPartition;
510        TestPartition = previousSnapshot.TestPartition;
511        Transformations = previousSnapshot.Transformations;
512        undoHistory.Remove(previousSnapshot);
513        OnChanged(previousSnapshot.ChangedType,
514          previousSnapshot.ChangedColumn,
515          previousSnapshot.ChangedRow);
[10550]516      }
517    }
518
[15518]519    public void InTransaction(Action action, DataPreprocessingChangedEventType type = DataPreprocessingChangedEventType.Any) {
520      BeginTransaction(type);
521      action();
522      EndTransaction();
523    }
[10163]524
[15518]525    public void BeginTransaction(DataPreprocessingChangedEventType type) {
526      SaveSnapshot(type, -1, -1);
527      eventStack.Push(type);
528    }
[10181]529
[15518]530    public void EndTransaction() {
531      if (eventStack.Count == 0)
532        throw new InvalidOperationException("There is no open transaction that can be ended.");
[10367]533
[15518]534      var @event = eventStack.Pop();
535      OnChanged(@event, -1, -1);
536    }
537    #endregion
[10367]538
[15518]539    #region Statistics
540    public T GetMin<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
541      var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection);
542      return values.Any() ? values.Min() : emptyValue;
543    }
[10547]544
[15518]545    public T GetMax<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
546      var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection);
547      return values.Any() ? values.Max() : emptyValue;
548    }
[10978]549
[15518]550    public T GetMean<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
551      if (typeof(T) == typeof(double)) {
552        var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection);
553        return values.Any() ? Convert<T>(values.Average()) : emptyValue;
554      }
555      if (typeof(T) == typeof(string)) {
556        return Convert<T>(string.Empty);
557      }
558      if (typeof(T) == typeof(DateTime)) {
559        var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection);
560        return values.Any() ? Convert<T>(AggregateAsDouble(values, Enumerable.Average)) : emptyValue;
561      }
[10978]562
[15518]563      throw new InvalidOperationException(typeof(T) + " not supported");
564    }
[10181]565
[15518]566    public T GetMedian<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> {
567      if (typeof(T) == typeof(double)) {// IEnumerable<double> is faster 
568        var doubleValues = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection);
569        return doubleValues.Any() ? Convert<T>(doubleValues.Median()) : emptyValue;
570      }
571      var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection);
572      return values.Any() ? values.Quantile(0.5) : emptyValue;
573    }
[10367]574
[15518]575    public T GetMode<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IEquatable<T> {
576      var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection);
577      return values.Any() ? values.GroupBy(x => x).OrderByDescending(g => g.Count()).Select(g => g.Key).First() : emptyValue;
578    }
[10181]579
[15518]580    public T GetStandardDeviation<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
581      if (typeof(T) == typeof(double)) {
582        var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection);
583        return values.Any() ? Convert<T>(values.StandardDeviation()) : emptyValue;
584      }
585      // For DateTime, std.dev / variance would have to be TimeSpan
586      //if (typeof(T) == typeof(DateTime)) {
587      //  var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection);
588      //  return values.Any() ? Convert<T>(AggregateAsDouble(values, EnumerableStatisticExtensions.StandardDeviation)) : emptyValue;
589      //}
590      return default(T);
591    }
[11002]592
[15518]593    public T GetVariance<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
594      if (typeof(T) == typeof(double)) {
595        var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection);
596        return values.Any() ? Convert<T>(values.Variance()) : emptyValue;
597      }
598      // DateTime variance often overflows long, thus the corresponding DateTime is invalid
599      //if (typeof(T) == typeof(DateTime)) {
600      //  var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection);
601      //  return values.Any() ? Convert<T>(AggregateAsDouble(values, EnumerableStatisticExtensions.Variance)) : emptyValue;
602      //}
603      return default(T);
604    }
[11002]605
[15518]606    public T GetQuantile<T>(double alpha, int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> {
607      if (typeof(T) == typeof(double)) {// IEnumerable<double> is faster 
608        var doubleValues = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection);
609        return doubleValues.Any() ? Convert<T>(doubleValues.Quantile(alpha)) : emptyValue;
610      }
611      var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection);
612      return values.Any() ? values.Quantile(alpha) : emptyValue;
613    }
[11002]614
[15518]615    public int GetDistinctValues<T>(int columnIndex, bool considerSelection = false) {
616      var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection);
617      return values.GroupBy(x => x).Count();
618    }
[11002]619
[15518]620    private IEnumerable<T> GetValuesWithoutMissingValues<T>(int columnIndex, bool considerSelection) {
621      return GetValues<T>(columnIndex, considerSelection).Where(x => !IsMissingValue(x));
622    }
[10163]623
[15518]624    private static DateTime AggregateAsDouble(IEnumerable<DateTime> values, Func<IEnumerable<double>, double> func) {
625      return new DateTime((long)(func(values.Select(x => (double)x.Ticks / TimeSpan.TicksPerSecond)) * TimeSpan.TicksPerSecond));
626    }
627    private static T Convert<T>(object obj) { return (T)obj; }
[10163]628
[15518]629    public int GetMissingValueCount() {
630      int count = 0;
631      for (int i = 0; i < Columns; ++i) {
632        count += GetMissingValueCount(i);
633      }
634      return count;
635    }
636    public int GetMissingValueCount(int columnIndex) {
637      int sum = 0;
638      for (int i = 0; i < Rows; i++) {
639        if (IsCellEmpty(columnIndex, i))
640          sum++;
641      }
642      return sum;
643    }
644    public int GetRowMissingValueCount(int rowIndex) {
645      int sum = 0;
646      for (int i = 0; i < Columns; i++) {
647        if (IsCellEmpty(i, rowIndex))
648          sum++;
649      }
650      return sum;
651    }
652    #endregion
[10163]653
[15518]654    #region Helpers
655    private static IList<IList> CopyVariableValues(IList<IList> original) {
656      var copy = new List<IList>(original);
657      for (int i = 0; i < original.Count; ++i) {
658        copy[i] = (IList)Activator.CreateInstance(original[i].GetType(), original[i]);
659      }
660      return copy;
661    }
662    #endregion
663  }
[10367]664
[15518]665  // Adapted from HeuristicLab.Common.EnumerableStatisticExtensions
666  internal static class EnumerableExtensions {
667    public static T Quantile<T>(this IEnumerable<T> values, double alpha) where T : IComparable<T> {
668      T[] valuesArr = values.ToArray();
669      int n = valuesArr.Length;
670      if (n == 0) throw new InvalidOperationException("Enumeration contains no elements.");
[13252]671
[15518]672      var pos = n * alpha;
[10367]673
[15518]674      return Select((int)Math.Ceiling(pos) - 1, valuesArr);
[10220]675
[15518]676    }
[10220]677
[15518]678    private static T Select<T>(int k, T[] arr) where T : IComparable<T> {
679      int i, ir, j, l, mid, n = arr.Length;
680      T a;
681      l = 0;
682      ir = n - 1;
683      for (;;) {
684        if (ir <= l + 1) {
685          // Active partition contains 1 or 2 elements.
686          if (ir == l + 1 && arr[ir].CompareTo(arr[l]) < 0) {
687            // Case of 2 elements.
688            Swap(arr, l, ir);
689          }
690          return arr[k];
691        } else {
692          mid = (l + ir) >> 1; // Choose median of left, center, and right elements
693          Swap(arr, mid, l + 1); // as partitioning element a. Also
694
695          if (arr[l].CompareTo(arr[ir]) > 0) {  // rearrange so that arr[l] arr[ir] <= arr[l+1],
696            Swap(arr, l, ir); // . arr[ir] >= arr[l+1]
697          }
698
699          if (arr[l + 1].CompareTo(arr[ir]) > 0) {
700            Swap(arr, l + 1, ir);
701          }
702          if (arr[l].CompareTo(arr[l + 1]) > 0) {
703            Swap(arr, l, l + 1);
704          }
705          i = l + 1; // Initialize pointers for partitioning.
706          j = ir;
707          a = arr[l + 1]; // Partitioning element.
708          for (;;) { // Beginning of innermost loop.
709            do i++; while (arr[i].CompareTo(a) < 0); // Scan up to find element > a.
710            do j--; while (arr[j].CompareTo(a) > 0); // Scan down to find element < a.
711            if (j < i) break; // Pointers crossed. Partitioning complete.
712            Swap(arr, i, j);
713          } // End of innermost loop.
714          arr[l + 1] = arr[j]; // Insert partitioning element.
715          arr[j] = a;
716          if (j >= k) ir = j - 1; // Keep active the partition that contains the
717          if (j <= k) l = i; // kth element.
718        }
719      }
[10804]720    }
[15518]721
722    private static void Swap<T>(T[] arr, int i, int j) {
723      T temp = arr[i];
724      arr[i] = arr[j];
725      arr[j] = temp;
726    }
[10163]727  }
728}
Note: See TracBrowser for help on using the repository browser.