Free cookie consent management tool by TermsFeed Policy Generator

source: branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Data/PreprocessingData.cs @ 15309

Last change on this file since 15309 was 15309, checked in by pfleck, 7 years ago

#2809 Worked on type-safe PreprocessingDataColumns.

File size: 28.1 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2016 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections;
24using System.Collections.Generic;
25using System.Globalization;
26using System.Linq;
27using HeuristicLab.Common;
28using HeuristicLab.Core;
29using HeuristicLab.Data;
30using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
31using HeuristicLab.Problems.DataAnalysis;
32
33namespace HeuristicLab.DataPreprocessing {
34  [Item("PreprocessingData", "Represents data used for preprocessing.")]
35  [StorableClass]
36  public class PreprocessingData : NamedItem, IPreprocessingData {
37
38    [Storable] private List<PreprocessingDataColumn> dataColumns;
39
/// <summary>
/// Gets the column list backing this instance. The internal list itself is
/// returned, not a copy.
/// </summary>
public IList<PreprocessingDataColumn> DataColumns {
  get {
    IList<PreprocessingDataColumn> columns = dataColumns;
    return columns;
  }
}
43
44
45    #region Constructor, Cloning & Persistence
/// <summary>
/// Creates preprocessing data from the given problem data: sets the item name,
/// creates the empty column, transformation and selection containers, imports
/// the problem data and subscribes the internal <see cref="Changed"/> handler.
/// </summary>
/// <param name="problemData">The problem data whose dataset and partitions are imported.</param>
public PreprocessingData(IDataAnalysisProblemData problemData)
  : base() {
  Name = "Preprocessing Data";

  // Import() clears and refills dataColumns, so the containers must exist first.
  selection = new Dictionary<int, IList<int>>();
  Transformations = new List<ITransformation>();
  dataColumns = new List<PreprocessingDataColumn>();

  Import(problemData);
  RegisterEventHandler();
}
58
/// <summary>
/// Cloning constructor. Copies columns, partitions and transformations through
/// the <paramref name="cloner"/>, copies the variable metadata and the current
/// selection, and subscribes the internal <see cref="Changed"/> handler.
/// </summary>
/// <param name="original">The instance to copy from.</param>
/// <param name="cloner">The cloner used for cloneable members.</param>
protected PreprocessingData(PreprocessingData original, Cloner cloner)
  : base(original, cloner) {
  dataColumns = new List<PreprocessingDataColumn>(original.dataColumns.Select(cloner.Clone));
  TrainingPartition = cloner.Clone(original.TrainingPartition);
  TestPartition = cloner.Clone(original.TestPartition);
  Transformations = new List<ITransformation>(original.Transformations.Select(cloner.Clone));

  // InputVariables carries no [Storable] attribute in this file, so it may be null
  // on an instance restored by the persistence framework; keep null as null
  // instead of failing inside the List constructor.
  InputVariables = original.InputVariables != null
    ? new List<string>(original.InputVariables)
    : null;
  TargetVariable = original.TargetVariable;

  // The selection was previously not copied at all, leaving the clone with a
  // null dictionary (GetValues(..., true) and Selection would then fail).
  // Copy the row lists so that clone and original do not share them.
  selection = new Dictionary<int, IList<int>>();
  if (original.selection != null) {
    foreach (var entry in original.selection) {
      selection.Add(entry.Key, entry.Value != null ? new List<int>(entry.Value) : null);
    }
  }

  RegisterEventHandler();
}
/// <summary>Creates a copy of this instance through the cloning constructor.</summary>
/// <param name="cloner">The cloner passed on to the cloning constructor.</param>
/// <returns>The newly created <see cref="PreprocessingData"/>.</returns>
public override IDeepCloneable Clone(Cloner cloner) {
  var copy = new PreprocessingData(this, cloner);
  return copy;
}
74
/// <summary>
/// Constructor marked with <c>[StorableConstructor]</c>; it only forwards the flag
/// to the base class. The <see cref="Changed"/> handler is subscribed separately
/// in <see cref="AfterDeserialization"/>.
/// </summary>
[StorableConstructor]
protected PreprocessingData(bool deserializing)
  : base(deserializing) {
  // Intentionally empty.
}
/// <summary>
/// After-deserialization hook: re-subscribes the internal <see cref="Changed"/>
/// handler, which the storable constructor does not do.
/// </summary>
[StorableHook(HookType.AfterDeserialization)]
private void AfterDeserialization() {
  // Same subscription the public and cloning constructors perform.
  RegisterEventHandler();
}
82
/// <summary>
/// Subscribes a handler to <see cref="Changed"/> that caps the start and end of
/// the training and test partitions at the current row count whenever a
/// DeleteRow, Any or Transformation change is reported.
/// </summary>
private void RegisterEventHandler() {
  Changed += (sender, args) => {
    var changeType = args.Type;
    bool clampPartitions =
      changeType == DataPreprocessingChangedEventType.DeleteRow ||
      changeType == DataPreprocessingChangedEventType.Any ||
      changeType == DataPreprocessingChangedEventType.Transformation;
    if (!clampPartitions) return;

    int upperBound = Math.Max(0, Rows);
    TrainingPartition.Start = Math.Min(TrainingPartition.Start, upperBound);
    TrainingPartition.End = Math.Min(TrainingPartition.End, upperBound);
    TestPartition.Start = Math.Min(TestPartition.Start, upperBound);
    TestPartition.End = Math.Min(TestPartition.End, upperBound);
  };
}
98    #endregion
99
100    #region Cells
/// <summary>
/// Reports whether the addressed cell is empty, i.e. the column does not
/// consider the value at <paramref name="rowIndex"/> valid.
/// </summary>
public bool IsCellEmpty(int columnIndex, int rowIndex) {
  var column = dataColumns[columnIndex];
  bool hasValidValue = column.IsValidValue(rowIndex);
  return !hasValidValue;
}
104
/// <summary>
/// Reads the value at the given cell through the column's type switch.
/// </summary>
/// <typeparam name="T">The value type requested from the column.</typeparam>
public T GetCell<T>(int columnIndex, int rowIndex) {
  var column = dataColumns[columnIndex];
  // One accessor per supported column kind (double, string, DateTime - the
  // same order InsertRow uses for its default values).
  return column.TypeSwitch<T>(
    doubleColumn => doubleColumn[rowIndex],
    stringColumn => stringColumn[rowIndex],
    dateTimeColumn => dateTimeColumn[rowIndex]);
}
111
/// <summary>
/// Writes <paramref name="value"/> into the addressed cell. Rows and columns are
/// appended first if the cell lies beyond the current table size; new columns
/// are named after their index. Raises a ChangeItem notification unless a
/// transaction is open.
/// </summary>
public void SetCell<T>(int columnIndex, int rowIndex, T value) {
  SaveSnapshot(DataPreprocessingChangedEventType.ChangeItem, columnIndex, rowIndex);

  // Grow the table until the addressed cell exists.
  int nextRow = Rows;
  while (nextRow <= rowIndex) {
    InsertRow(nextRow);
    nextRow++;
  }
  int nextColumn = Columns;
  while (nextColumn <= columnIndex) {
    InsertColumn<T>(nextColumn.ToString(), nextColumn);
    nextColumn++;
  }

  var target = dataColumns[columnIndex];
  target.TypeSwitch<T>(value,
    (doubleColumn, v) => doubleColumn[rowIndex] = v,
    (stringColumn, v) => stringColumn[rowIndex] = v,
    (dateTimeColumn, v) => dateTimeColumn[rowIndex] = v);

  if (IsInTransaction) return;
  OnChanged(DataPreprocessingChangedEventType.ChangeItem, columnIndex, rowIndex);
}
128
/// <summary>Returns the column's string representation of the value at the given row.</summary>
public string GetCellAsString(int columnIndex, int rowIndex) {
  var column = dataColumns[columnIndex];
  string text = column.GetValue(rowIndex);
  return text;
}
132
/// <summary>
/// Returns the values of a column. With <paramref name="considerSelection"/> the
/// row indices stored in the selection for this column are passed to the column;
/// otherwise null is passed.
/// </summary>
public IEnumerable<T> GetValues<T>(int columnIndex, bool considerSelection) {
  // Resolved lazily inside the matching branch only, exactly when the column is queried.
  Func<IList<int>> selectedRows = () => considerSelection ? selection[columnIndex] : null;

  var column = dataColumns[columnIndex];
  return column.TypeSwitch<T>(
    doubleColumn => doubleColumn.GetValues(selectedRows()),
    stringColumn => stringColumn.GetValues(selectedRows()),
    dateTimeColumn => dateTimeColumn.GetValues(selectedRows()));
}
139
/// <summary>
/// Replaces the whole column at <paramref name="columnIndex"/> with a new column
/// of the same name and type, built from <paramref name="values"/>. Raises a
/// ChangeColumn notification unless a transaction is open.
/// </summary>
/// <exception cref="ArgumentException">
/// <typeparamref name="T"/> does not match the column's type, or the column type is unknown.
/// </exception>
public void SetValues<T>(int columnIndex, IEnumerable<T> values) {
  var current = dataColumns[columnIndex];

  // Validate and build the replacement BEFORE taking the undo snapshot, so that
  // a rejected call no longer leaves a useless entry in the bounded undo history.
  if (!current.IsType<T>()) {
    throw new ArgumentException("The datatype of column " + columnIndex + " must be of type " + current.GetType().Name + " but was " + typeof(T).Name);
  }

  var name = current.Name;
  PreprocessingDataColumn replacement;
  if (current.IsType<double>()) {
    replacement = new DoublePreprocessingDataColumn(name, (IEnumerable<double>)values);
  } else if (current.IsType<string>()) {
    replacement = new StringPreprocessingDataColumn(name, (IEnumerable<string>)values);
  } else if (current.IsType<DateTime>()) {
    replacement = new DateTimePreprocessingDataColumn(name, (IEnumerable<DateTime>)values);
  } else {
    throw new ArgumentException("Unknown column type");
  }

  SaveSnapshot(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1);
  dataColumns[columnIndex] = replacement;

  if (!IsInTransaction)
    OnChanged(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1);
}
159
/// <summary>
/// Asks the column to store <paramref name="value"/> (given as text) at
/// <paramref name="rowIndex"/> and raises a ChangeColumn notification unless a
/// transaction is open.
/// </summary>
/// <returns>The result reported by the column's own SetValue.</returns>
public bool SetValue(string value, int columnIndex, int rowIndex) {
  // NOTE(review): unlike SetCell, no undo snapshot is taken here and the event is
  // ChangeColumn with row -1 rather than ChangeItem - confirm this is intended.
  bool successful = dataColumns[columnIndex].SetValue(value, rowIndex);

  if (IsInTransaction) return successful;

  OnChanged(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1);
  return successful;
}
169
/// <summary>Gets the number of columns.</summary>
public int Columns {
  get {
    int columnCount = dataColumns.Count;
    return columnCount;
  }
}
173
/// <summary>
/// Gets the number of rows, defined as the length of the longest column
/// (0 when there are no columns).
/// </summary>
public int Rows {
  get {
    if (dataColumns.Count == 0) return 0;
    return dataColumns.Max(column => column.Length);
  }
}
177    #endregion
178
179    #region Rows
/// <summary>
/// Inserts an empty row at <paramref name="rowIndex"/> into every column
/// (NaN for double, null for string, DateTime.MinValue for DateTime columns),
/// grows the partition containing the row and shifts the partition that follows
/// it. Raises an AddRow notification unless a transaction is open.
/// </summary>
public void InsertRow(int rowIndex) {
  // The snapshot records DeleteRow, i.e. the change type reported when this insert is undone.
  SaveSnapshot(DataPreprocessingChangedEventType.DeleteRow, -1, rowIndex);

  foreach (var column in dataColumns) {
    column.TypeSwitch(
      c => c.Values.Insert(rowIndex, double.NaN),
      c => c.Values.Insert(rowIndex, null),
      c => c.Values.Insert(rowIndex, DateTime.MinValue));
  }

  if (TrainingPartition.Start <= rowIndex && rowIndex <= TrainingPartition.End) {
    // Decide the ordering BEFORE growing: checking after End++ missed the common
    // case of directly adjacent partitions (training.End == test.Start).
    bool testFollowsTraining = TrainingPartition.End <= TestPartition.Start;
    TrainingPartition.End++;
    if (testFollowsTraining) {
      TestPartition.Start++;
      TestPartition.End++;
    }
  } else if (TestPartition.Start <= rowIndex && rowIndex <= TestPartition.End) {
    bool trainingFollowsTest = TestPartition.End <= TrainingPartition.Start;
    TestPartition.End++;
    if (trainingFollowsTest) {
      // Previously this branch shifted TestPartition a second time (copy-paste
      // slip); the partition that has to move is the training partition.
      TrainingPartition.Start++;
      TrainingPartition.End++;
    }
  }
  // NOTE(review): a row inserted in front of both partitions shifts neither of
  // them - confirm whether that is intended.

  if (!IsInTransaction)
    OnChanged(DataPreprocessingChangedEventType.AddRow, -1, rowIndex);
}
207
/// <summary>Deletes a single row by delegating to <see cref="DeleteRows"/>.</summary>
public void DeleteRow(int rowIndex) {
  int[] singleRow = { rowIndex };
  DeleteRows(singleRow);
}
/// <summary>
/// Removes the given rows from every column, shrinks the partition containing
/// each removed row and shifts the partition that follows it. Raises a DeleteRow
/// notification unless a transaction is open.
/// </summary>
/// <param name="rowIndices">Indices of the rows to delete; duplicates are ignored.</param>
/// <exception cref="ArgumentNullException"><paramref name="rowIndices"/> is null.</exception>
public void DeleteRows(IEnumerable<int> rowIndices) {
  if (rowIndices == null) throw new ArgumentNullException("rowIndices");

  // Materialize before anything is modified. Highest index first keeps the
  // remaining indices valid; Distinct prevents a repeated index from removing
  // a second, unrelated row.
  var descending = rowIndices.Distinct().OrderByDescending(x => x).ToList();

  // The snapshot records AddRow, i.e. the change type reported when this delete is undone.
  SaveSnapshot(DataPreprocessingChangedEventType.AddRow, -1, -1);

  foreach (int rowIndex in descending) {
    foreach (var column in dataColumns) {
      column.TypeSwitch(
        c => c.Values.RemoveAt(rowIndex),
        c => c.Values.RemoveAt(rowIndex),
        c => c.Values.RemoveAt(rowIndex));
    }

    if (TrainingPartition.Start <= rowIndex && rowIndex <= TrainingPartition.End) {
      TrainingPartition.End--;
      if (TrainingPartition.End <= TestPartition.Start) {
        TestPartition.Start--;
        TestPartition.End--;
      }
    } else if (TestPartition.Start <= rowIndex && rowIndex <= TestPartition.End) {
      TestPartition.End--;
      if (TestPartition.End <= TrainingPartition.Start) {
        // Previously this branch shifted TestPartition a second time (copy-paste
        // slip); the partition that has to move is the training partition.
        TrainingPartition.Start--;
        TrainingPartition.End--;
      }
    }
  }

  if (!IsInTransaction)
    OnChanged(DataPreprocessingChangedEventType.DeleteRow, -1, -1);
}
240
/// <summary>
/// Inserts a new column of type <typeparamref name="T"/> at
/// <paramref name="columnIndex"/>, filled with one default value per existing row
/// (NaN, empty string or DateTime.MinValue). Raises an AddColumn notification
/// unless a transaction is open.
/// </summary>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="columnIndex"/> is not a valid insert position.</exception>
/// <exception cref="ArgumentException"><typeparamref name="T"/> is not double, string or DateTime.</exception>
public void InsertColumn<T>(string variableName, int columnIndex) {
  // All validation happens before the undo snapshot is taken, so a rejected call
  // no longer leaves a useless entry in the bounded undo history.
  // List<T>.Insert would throw the same exception type, only after the snapshot.
  if (columnIndex < 0 || columnIndex > dataColumns.Count)
    throw new ArgumentOutOfRangeException("columnIndex");

  int rowCount = Rows;
  PreprocessingDataColumn newColumn;
  if (typeof(T) == typeof(double)) {
    newColumn = new DoublePreprocessingDataColumn(variableName, Enumerable.Repeat<double>(double.NaN, rowCount));
  } else if (typeof(T) == typeof(string)) {
    newColumn = new StringPreprocessingDataColumn(variableName, Enumerable.Repeat<string>(string.Empty, rowCount));
  } else if (typeof(T) == typeof(DateTime)) {
    newColumn = new DateTimePreprocessingDataColumn(variableName, Enumerable.Repeat<DateTime>(DateTime.MinValue, rowCount));
  } else {
    throw new ArgumentException("The datatype of column " + variableName + " must be of type double, string or DateTime");
  }

  // The snapshot records DeleteColumn, i.e. the change type reported when this insert is undone.
  SaveSnapshot(DataPreprocessingChangedEventType.DeleteColumn, columnIndex, -1);
  dataColumns.Insert(columnIndex, newColumn);

  if (!IsInTransaction)
    OnChanged(DataPreprocessingChangedEventType.AddColumn, columnIndex, -1);
}
257
/// <summary>
/// Removes the column at <paramref name="columnIndex"/> and raises a DeleteColumn
/// notification unless a transaction is open.
/// </summary>
public void DeleteColumn(int columnIndex) {
  // The snapshot records AddColumn, i.e. the change type reported when this delete is undone.
  SaveSnapshot(DataPreprocessingChangedEventType.AddColumn, columnIndex, -1);

  dataColumns.RemoveAt(columnIndex);

  if (IsInTransaction) return;
  OnChanged(DataPreprocessingChangedEventType.DeleteColumn, columnIndex, -1);
}
266
/// <summary>
/// Renames the column at <paramref name="columnIndex"/> and raises a ChangeColumn
/// notification unless a transaction is open.
/// </summary>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="columnIndex"/> does not address an existing column.</exception>
public void RenameColumn(int columnIndex, string name) {
  // Valid indices are 0 .. Count-1. The former check used '>' and let
  // columnIndex == Count through, which then failed inside the list indexer
  // after an undo snapshot had already been stored.
  if (columnIndex < 0 || columnIndex >= dataColumns.Count)
    throw new ArgumentOutOfRangeException("columnIndex");

  SaveSnapshot(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1);

  dataColumns[columnIndex].Name = name;

  // NOTE(review): the event reports column -1 although a single column changed;
  // kept as-is because listeners may rely on it.
  if (!IsInTransaction)
    OnChanged(DataPreprocessingChangedEventType.ChangeColumn, -1, -1);
}
278
/// <summary>
/// Renames all columns at once; <paramref name="names"/> must supply exactly one
/// name per column, in column order. Raises a ChangeColumn notification unless a
/// transaction is open.
/// </summary>
/// <exception cref="ArgumentNullException"><paramref name="names"/> is null.</exception>
/// <exception cref="ArgumentException">The number of names differs from the number of columns.</exception>
public void RenameColumns(IList<string> names) {
  if (names == null) {
    throw new ArgumentNullException("names");
  }
  if (names.Count != dataColumns.Count) {
    throw new ArgumentException("number of names must match the number of columns.", "names");
  }

  SaveSnapshot(DataPreprocessingChangedEventType.ChangeColumn, -1, -1);

  int index = 0;
  while (index < names.Count) {
    dataColumns[index].Name = names[index];
    index++;
  }

  if (IsInTransaction) return;
  OnChanged(DataPreprocessingChangedEventType.ChangeColumn, -1, -1);
}
291
/// <summary>Checks whether every one of the given columns is a string column.</summary>
public bool AreAllStringColumns(IEnumerable<int> columnIndices) {
  return columnIndices.All(index => VariableHasType<string>(index));
}
295    #endregion
296
297    #region Variables
/// <summary>Gets the names of all columns, in column order (evaluated lazily).</summary>
public IEnumerable<string> VariableNames {
  get {
    return from column in dataColumns
           select column.Name;
  }
}
301
/// <summary>Returns the names of all double columns, in column order (evaluated lazily).</summary>
public IEnumerable<string> GetDoubleVariableNames() {
  var doubleColumns = dataColumns.OfType<DoublePreprocessingDataColumn>();
  return from column in doubleColumns
         select column.Name;
}
305
/// <summary>Returns the name of the column at <paramref name="columnIndex"/>.</summary>
public string GetVariableName(int columnIndex) {
  var column = dataColumns[columnIndex];
  return column.Name;
}
309
/// <summary>
/// Returns the index of the first column named <paramref name="variableName"/>,
/// or -1 if there is none.
/// </summary>
public int GetColumnIndex(string variableName) {
  Predicate<PreprocessingDataColumn> hasRequestedName = column => column.Name == variableName;
  return dataColumns.FindIndex(hasRequestedName);
}
313
/// <summary>Asks the column at <paramref name="columnIndex"/> whether it is of type <typeparamref name="T"/>.</summary>
public bool VariableHasType<T>(int columnIndex) {
  var column = dataColumns[columnIndex];
  return column.IsType<T>();
}
317
/// <summary>Returns the value type reported by the column at <paramref name="columnIndex"/>.</summary>
public Type GetVariableType(int columnIndex) {
  var column = dataColumns[columnIndex];
  Type valueType = column.GetValueType();
  return valueType;
}
321
322    public IList<string> InputVariables { get; private set; }
323    public string TargetVariable { get; private set; } // optional
324    #endregion
325
326    #region Partitions
327    [Storable]
328    public IntRange TrainingPartition { get; set; }
329    [Storable]
330    public IntRange TestPartition { get; set; }
331    #endregion
332
333    #region Transformations
334    [Storable]
335    public IList<ITransformation> Transformations { get; protected set; }
336    #endregion
337
338    #region Validation
/// <summary>
/// Checks whether <paramref name="value"/> is acceptable text for the column at
/// <paramref name="columnIndex"/>: double columns accept blank text or anything
/// double.TryParse accepts, string columns accept any non-null string, DateTime
/// columns accept anything DateTime.TryParse accepts.
/// </summary>
/// <param name="value">The text to validate.</param>
/// <param name="errorMessage">Empty when valid, otherwise a description of the problem.</param>
/// <param name="columnIndex">Index of the column the value is meant for.</param>
/// <returns>True if the value is valid for the column.</returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="columnIndex"/> does not address an existing column.</exception>
/// <exception cref="ArgumentException">The column holds an unsupported type.</exception>
public bool Validate(string value, out string errorMessage, int columnIndex) {
  // Valid indices are 0 .. Columns-1; the former '>' comparison accepted
  // columnIndex == Columns. The message is now passed as message, not as paramName.
  if (columnIndex < 0 || columnIndex >= Columns) {
    throw new ArgumentOutOfRangeException("columnIndex", "column index is out of range");
  }

  bool valid;
  errorMessage = string.Empty;
  if (VariableHasType<double>(columnIndex)) {
    double parsedNumber;
    // Blank text is accepted for double columns.
    valid = string.IsNullOrWhiteSpace(value) || double.TryParse(value, out parsedNumber);
    if (!valid) {
      errorMessage = "Invalid Value (Valid Value Format: \"" + FormatPatterns.GetDoubleFormatPattern() + "\")";
    }
  } else if (VariableHasType<string>(columnIndex)) {
    valid = value != null;
    if (!valid) {
      errorMessage = "Invalid Value (string must not be null)";
    }
  } else if (VariableHasType<DateTime>(columnIndex)) {
    DateTime parsedDate;
    valid = DateTime.TryParse(value, out parsedDate);
    if (!valid) {
      // Concatenating the DateTimeFormatInfo object itself only printed its type
      // name; show an actual pattern and close the parenthesis.
      errorMessage = "Invalid Value (Valid Value Format: \"" + CultureInfo.CurrentCulture.DateTimeFormat.ShortDatePattern + "\")";
    }
  } else {
    throw new ArgumentException("column " + columnIndex + " contains a non supported type.");
  }

  return valid;
}
373    #endregion
374
375    #region Import & Export
/// <summary>
/// Replaces the current content with the given problem data: takes over the
/// allowed input variables and the target variable (regression and
/// classification problems only, otherwise null), rebuilds one column per
/// dataset variable and copies the training and test partitions.
/// </summary>
/// <exception cref="ArgumentException">A dataset variable is neither double, string nor DateTime.</exception>
public void Import(IDataAnalysisProblemData problemData) {
  var dataset = problemData.Dataset;
  InputVariables = new List<string>(problemData.AllowedInputVariables);

  var regressionData = problemData as IRegressionProblemData;
  var classificationData = problemData as IClassificationProblemData;
  if (regressionData != null) {
    TargetVariable = regressionData.TargetVariable;
  } else if (classificationData != null) {
    TargetVariable = classificationData.TargetVariable;
  } else {
    TargetVariable = null;
  }

  dataColumns.Clear();
  foreach (var variableName in problemData.Dataset.VariableNames) {
    PreprocessingDataColumn imported;
    if (dataset.VariableHasType<double>(variableName)) {
      imported = new DoublePreprocessingDataColumn(variableName, dataset.GetDoubleValues(variableName));
    } else if (dataset.VariableHasType<string>(variableName)) {
      imported = new StringPreprocessingDataColumn(variableName, dataset.GetStringValues(variableName));
    } else if (dataset.VariableHasType<DateTime>(variableName)) {
      imported = new DateTimePreprocessingDataColumn(variableName, dataset.GetDateTimeValues(variableName));
    } else {
      throw new ArgumentException("The datatype of column " + variableName + " must be of type double, string or DateTime");
    }
    dataColumns.Add(imported);
  }

  // Fresh IntRange instances, so the problem data's own ranges are not shared.
  var sourceTraining = problemData.TrainingPartition;
  TrainingPartition = new IntRange(sourceTraining.Start, sourceTraining.End);
  var sourceTest = problemData.TestPartition;
  TestPartition = new IntRange(sourceTest.Start, sourceTest.End);
}
399
/// <summary>
/// Builds a new Dataset from the current columns; every column's values are
/// copied into a new list.
/// </summary>
/// <exception cref="InvalidOperationException">A column is neither a double, string nor DateTime column.</exception>
public Dataset ExportToDataset() {
  IList<IList> exported = new List<IList>();

  foreach (var column in dataColumns) {
    IList copy;
    if (column is DoublePreprocessingDataColumn) {
      copy = new List<double>(((DoublePreprocessingDataColumn)column).GetValues());
    } else if (column is StringPreprocessingDataColumn) {
      copy = new List<string>(((StringPreprocessingDataColumn)column).GetValues());
    } else if (column is DateTimePreprocessingDataColumn) {
      copy = new List<DateTime>(((DateTimePreprocessingDataColumn)column).GetValues());
    } else {
      throw new InvalidOperationException("Column type not supported for export");
    }
    exported.Add(copy);
  }

  return new Dataset(VariableNames, exported);
}
415    #endregion
416
417    #region Selection
418    [Storable]
419    protected IDictionary<int, IList<int>> selection;
/// <summary>
/// Gets or sets the selection dictionary (keyed by int, holding lists of ints;
/// GetValues looks it up by column index). Setting it raises
/// <see cref="SelectionChanged"/>.
/// </summary>
public IDictionary<int, IList<int>> Selection {
  get {
    return selection;
  }
  set {
    selection = value;
    // Notify on every assignment; no comparison with the previous value is made.
    OnSelectionChanged();
  }
}
/// <summary>
/// Replaces the selection with a new empty dictionary; goes through the
/// <see cref="Selection"/> setter, so <see cref="SelectionChanged"/> is raised.
/// </summary>
public void ClearSelection() {
  var emptySelection = new Dictionary<int, IList<int>>();
  Selection = emptySelection;
}
430
431    public event EventHandler SelectionChanged;
/// <summary>Raises <see cref="SelectionChanged"/> if there are subscribers.</summary>
protected void OnSelectionChanged() {
  // Copy the delegate first so the null check and the call use the same instance.
  EventHandler handler = SelectionChanged;
  if (handler == null) return;
  handler.Invoke(this, EventArgs.Empty);
}
436    #endregion
437
438    #region Transactions
439    // Snapshot/History are not storable/cloneable on purpose
/// <summary>
/// One entry of the undo history: the state captured by SaveSnapshot plus the
/// change notification that Undo raises after restoring that state.
/// </summary>
private class Snapshot {
  // Captured state.
  public List<PreprocessingDataColumn> DataColumns { get; set; }
  public IList<ITransformation> Transformations { get; set; }
  public IntRange TrainingPartition { get; set; }
  public IntRange TestPartition { get; set; }

  // Arguments of the Changed event raised by Undo.
  public DataPreprocessingChangedEventType ChangedType { get; set; }
  public int ChangedColumn { get; set; }
  public int ChangedRow { get; set; }
}
451
452    public event DataPreprocessingChangedEventHandler Changed;
/// <summary>
/// Raises <see cref="Changed"/> with the given change type, column and row if
/// there are subscribers.
/// </summary>
protected virtual void OnChanged(DataPreprocessingChangedEventType type, int column, int row) {
  // Copy the delegate first so the null check and the call use the same instance.
  DataPreprocessingChangedEventHandler handler = Changed;
  if (handler == null) return;

  var args = new DataPreprocessingChangedEventArgs(type, column, row);
  handler(this, args);
}
457
458    private const int MaxUndoDepth = 5;
459
460    private readonly IList<Snapshot> undoHistory = new List<Snapshot>();
461    private readonly Stack<DataPreprocessingChangedEventType> eventStack = new Stack<DataPreprocessingChangedEventType>();
462
/// <summary>Gets whether at least one transaction is currently open.</summary>
public bool IsInTransaction {
  get {
    return eventStack.Any();
  }
}
464
/// <summary>
/// Stores the current state in the undo history, together with the change
/// notification Undo should raise when restoring it. Does nothing while a
/// transaction is open; once the history holds MaxUndoDepth entries the oldest
/// one is dropped.
/// </summary>
private void SaveSnapshot(DataPreprocessingChangedEventType changedType, int column, int row) {
  if (IsInTransaction) {
    return;
  }

  var cloner = new Cloner();
  var snapshot = new Snapshot();
  // Columns are cloned; the transformation list is copied, its items are shared.
  snapshot.DataColumns = dataColumns.Select(cloner.Clone).ToList();
  snapshot.TrainingPartition = new IntRange(TrainingPartition.Start, TrainingPartition.End);
  snapshot.TestPartition = new IntRange(TestPartition.Start, TestPartition.End);
  snapshot.Transformations = Transformations.ToList();
  snapshot.ChangedType = changedType;
  snapshot.ChangedColumn = column;
  snapshot.ChangedRow = row;

  bool historyFull = undoHistory.Count >= MaxUndoDepth;
  if (historyFull) {
    undoHistory.RemoveAt(0);
  }
  undoHistory.Add(snapshot);
}
484
/// <summary>Gets whether the undo history holds at least one snapshot.</summary>
public bool IsUndoAvailable {
  get {
    return undoHistory.Any();
  }
}
488
/// <summary>
/// Restores the most recent snapshot (columns, partitions, transformations),
/// removes it from the history and raises <see cref="Changed"/> with the change
/// information stored in it. Does nothing if no snapshot is available.
/// </summary>
public void Undo() {
  if (!IsUndoAvailable) return;

  int lastIndex = undoHistory.Count - 1;
  Snapshot latest = undoHistory[lastIndex];

  dataColumns = latest.DataColumns;
  TrainingPartition = latest.TrainingPartition;
  TestPartition = latest.TestPartition;
  Transformations = latest.Transformations;

  // Each snapshot instance is added to the history once, so removing by position
  // removes the same entry as removing by reference.
  undoHistory.RemoveAt(lastIndex);

  OnChanged(latest.ChangedType, latest.ChangedColumn, latest.ChangedRow);
}
502
/// <summary>
/// Runs <paramref name="action"/> inside a transaction: one undo snapshot is taken
/// up front, the individual modifications raise no events, and a single
/// <see cref="Changed"/> event of the given type is raised when the transaction ends.
/// </summary>
/// <param name="action">The modifications to perform.</param>
/// <param name="type">The change type recorded for the snapshot and reported at the end.</param>
/// <exception cref="ArgumentNullException"><paramref name="action"/> is null.</exception>
public void InTransaction(Action action, DataPreprocessingChangedEventType type = DataPreprocessingChangedEventType.Any) {
  if (action == null) throw new ArgumentNullException("action");

  BeginTransaction(type);
  try {
    action();
  } finally {
    // Always close the transaction. If the action threw, the transaction used to
    // stay on the stack, leaving IsInTransaction true forever and thereby
    // silencing all later change events and undo snapshots.
    EndTransaction();
  }
}
508
/// <summary>
/// Opens a transaction of the given change type. The snapshot is requested before
/// the type is pushed, so an outermost transaction still records its undo state.
/// </summary>
public void BeginTransaction(DataPreprocessingChangedEventType type) {
  // Order matters: SaveSnapshot is a no-op once the stack is non-empty.
  SaveSnapshot(type, -1, -1);

  eventStack.Push(type);
}
513
/// <summary>
/// Closes the most recently opened transaction and raises <see cref="Changed"/>
/// with the change type it was opened with.
/// </summary>
/// <exception cref="InvalidOperationException">No transaction is open.</exception>
public void EndTransaction() {
  bool hasOpenTransaction = eventStack.Count != 0;
  if (!hasOpenTransaction) {
    throw new InvalidOperationException("There is no open transaction that can be ended.");
  }

  DataPreprocessingChangedEventType finishedType = eventStack.Pop();
  OnChanged(finishedType, -1, -1);
}
521    #endregion
522
523    /* #region Statistics
524     public T GetMin<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
525       try {
526         return dataColumns[columnIndex].TypeSwitch<T>(
527           col => col.GetMin(considerSelection ? Selection[columnIndex] : null),
528           col => col.GetMin(considerSelection ? Selection[columnIndex] : null),
529           col => col.GetMin(considerSelection ? Selection[columnIndex] : null));
530       } catch (InvalidOperationException) {
531         return emptyValue;
532       }
533     }
534
535     public T GetMax<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
536       var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection);
537       return values.Any() ? values.Max() : emptyValue;
538     }
539
540     public T GetMean<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
541       return
542
543
544       if (typeof(T) == typeof(double)) {
545         var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection);
546         return values.Any() ? Convert<T>(values.Average()) : emptyValue;
547       }
548       if (typeof(T) == typeof(string)) {
549         return Convert<T>(string.Empty);
550       }
551       if (typeof(T) == typeof(DateTime)) {
552         var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection);
553         return values.Any() ? Convert<T>(AggregateAsDouble(values, Enumerable.Average)) : emptyValue;
554       }
555
556       throw new InvalidOperationException(typeof(T) + " not supported");
557     }
558
559     public T GetMedian<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> {
560       if (typeof(T) == typeof(double)) {// IEnumerable<double> is faster 
561         var doubleValues = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection);
562         return doubleValues.Any() ? Convert<T>(doubleValues.Median()) : emptyValue;
563       }
564       var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection);
565       return values.Any() ? values.Quantile(0.5) : emptyValue;
566     }
567
568     public T GetMode<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IEquatable<T> {
569       var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection);
570       return values.Any() ? values.GroupBy(x => x).OrderByDescending(g => g.Count()).Select(g => g.Key).First() : emptyValue;
571     }
572
573     public T GetStandardDeviation<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
574       if (typeof(T) == typeof(double)) {
575         var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection);
576         return values.Any() ? Convert<T>(values.StandardDeviation()) : emptyValue;
577       }
578       // For DateTime, std.dev / variance would have to be TimeSpan
579       //if (typeof(T) == typeof(DateTime)) {
580       //  var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection);
581       //  return values.Any() ? Convert<T>(AggregateAsDouble(values, EnumerableStatisticExtensions.StandardDeviation)) : emptyValue;
582       //}
583       return default(T);
584     }
585
586     public T GetVariance<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
587       if (typeof(T) == typeof(double)) {
588         var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection);
589         return values.Any() ? Convert<T>(values.Variance()) : emptyValue;
590       }
591       // DateTime variance often overflows long, thus the corresponding DateTime is invalid
592       //if (typeof(T) == typeof(DateTime)) {
593       //  var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection);
594       //  return values.Any() ? Convert<T>(AggregateAsDouble(values, EnumerableStatisticExtensions.Variance)) : emptyValue;
595       //}
596       return default(T);
597     }
598
599     public T GetQuantile<T>(double alpha, int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> {
600       if (typeof(T) == typeof(double)) {// IEnumerable<double> is faster 
601         var doubleValues = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection);
602         return doubleValues.Any() ? Convert<T>(doubleValues.Quantile(alpha)) : emptyValue;
603       }
604       var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection);
605       return values.Any() ? values.Quantile(alpha) : emptyValue;
606     }
607
608     public int GetDistinctValues<T>(int columnIndex, bool considerSelection = false) {
609       var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection);
610       return values.GroupBy(x => x).Count();
611     }
612
613     private IEnumerable<T> GetValuesWithoutMissingValues<T>(int columnIndex, bool considerSelection) {
614       return GetValues<T>(columnIndex, considerSelection).Where(x =>
615         ColumnTypeSwitch<T, bool>(dataColumns[columnIndex], x,
616           (c, v) => c.IsValidValue(v),
617           (c, v) => c.IsValidValue(v),
618           (c, v) => c.IsValidValue(v)
619       ));
620     }
621
622     private static DateTime AggregateAsDouble(IEnumerable<DateTime> values, Func<IEnumerable<double>, double> func) {
623       return new DateTime((long)(func(values.Select(x => (double)x.Ticks / TimeSpan.TicksPerSecond)) * TimeSpan.TicksPerSecond));
624     }
625
626     public int GetMissingValueCount() {
627       int count = 0;
628       for (int i = 0; i < Columns; ++i) {
629         count += GetMissingValueCount(i);
630       }
631       return count;
632     }
633     public int GetMissingValueCount(int columnIndex) {
634       int sum = 0;
635       for (int i = 0; i < Rows; i++) {
636         if (IsCellEmpty(columnIndex, i))
637           sum++;
638       }
639       return sum;
640     }
641     public int GetRowMissingValueCount(int rowIndex) {
642       int sum = 0;
643       for (int i = 0; i < Columns; i++) {
644         if (IsCellEmpty(i, rowIndex))
645           sum++;
646       }
647       return sum;
648     }
649     #endregion  */
650  }
651
652  // Adapted from HeuristicLab.Common.EnumerableStatisticExtensions
/// <summary>
/// Quantile computation for arbitrary comparable element types.
/// Adapted from HeuristicLab.Common.EnumerableStatisticExtensions.
/// </summary>
internal static class EnumerableExtensions {
  /// <summary>
  /// Returns the element at rank ceil(n * alpha) (1-based) of the sorted input,
  /// i.e. always an actual element of the sequence and never an interpolated value.
  /// alpha = 0 yields the minimum, alpha = 1 the maximum.
  /// </summary>
  /// <param name="values">The values; they are copied, the source is not modified.</param>
  /// <param name="alpha">The quantile in the range [0, 1].</param>
  /// <exception cref="ArgumentNullException"><paramref name="values"/> is null.</exception>
  /// <exception cref="ArgumentOutOfRangeException"><paramref name="alpha"/> is NaN or outside [0, 1].</exception>
  /// <exception cref="InvalidOperationException">The sequence is empty.</exception>
  public static T Quantile<T>(this IEnumerable<T> values, double alpha) where T : IComparable<T> {
    if (values == null) throw new ArgumentNullException("values");
    if (double.IsNaN(alpha) || alpha < 0.0 || alpha > 1.0)
      throw new ArgumentOutOfRangeException("alpha", "alpha must be in the range [0, 1].");

    T[] valuesArr = values.ToArray();
    int n = valuesArr.Length;
    if (n == 0) throw new InvalidOperationException("Enumeration contains no elements.");

    // ceil(n * alpha) - 1 is -1 for alpha == 0, which used to index outside the
    // array; clamp the rank into the valid range 0 .. n-1.
    int k = (int)Math.Ceiling(n * alpha) - 1;
    k = Math.Max(0, Math.Min(n - 1, k));

    return Select(k, valuesArr);
  }

  /// <summary>
  /// Quickselect with median-of-three pivoting: returns the k-th smallest element
  /// (0-based) of <paramref name="arr"/>. The array is reordered in place.
  /// </summary>
  private static T Select<T>(int k, T[] arr) where T : IComparable<T> {
    int i, ir, j, l, mid, n = arr.Length;
    T a;
    l = 0;
    ir = n - 1;
    for (;;) {
      if (ir <= l + 1) {
        // Active partition contains 1 or 2 elements.
        if (ir == l + 1 && arr[ir].CompareTo(arr[l]) < 0) {
          // Case of 2 elements.
          Swap(arr, l, ir);
        }
        return arr[k];
      } else {
        // Choose the median of the left, center and right elements as partitioning
        // element a, and rearrange so that arr[l] <= arr[l+1] <= arr[ir].
        mid = (l + ir) >> 1;
        Swap(arr, mid, l + 1);

        if (arr[l].CompareTo(arr[ir]) > 0) {
          Swap(arr, l, ir);
        }

        if (arr[l + 1].CompareTo(arr[ir]) > 0) {
          Swap(arr, l + 1, ir);
        }
        if (arr[l].CompareTo(arr[l + 1]) > 0) {
          Swap(arr, l, l + 1);
        }
        i = l + 1; // Initialize pointers for partitioning.
        j = ir;
        a = arr[l + 1]; // Partitioning element.
        for (;;) { // Beginning of innermost loop.
          do i++; while (arr[i].CompareTo(a) < 0); // Scan up to find element >= a.
          do j--; while (arr[j].CompareTo(a) > 0); // Scan down to find element <= a.
          if (j < i) break; // Pointers crossed. Partitioning complete.
          Swap(arr, i, j);
        } // End of innermost loop.
        arr[l + 1] = arr[j]; // Insert partitioning element.
        arr[j] = a;
        // Keep active the partition that contains the k-th element.
        if (j >= k) ir = j - 1;
        if (j <= k) l = i;
      }
    }
  }

  /// <summary>Exchanges the elements at positions i and j.</summary>
  private static void Swap<T>(T[] arr, int i, int j) {
    T temp = arr[i];
    arr[i] = arr[j];
    arr[j] = temp;
  }
}
715}
Note: See TracBrowser for help on using the repository browser.