Changeset 16308 for branches/2845_EnhancedProgress/HeuristicLab.DataPreprocessing/3.4/Data/PreprocessingData.cs
- Timestamp:
- 11/20/18 13:52:40 (5 years ago)
- Location:
- branches/2845_EnhancedProgress
- Files:
-
- 4 edited
Legend:
- Unmodified
- Added
- Removed
-
branches/2845_EnhancedProgress
- Property svn:mergeinfo changed
/stable reverse-merged: 15587-15588 /trunk/sources removed
- Property svn:mergeinfo changed
-
branches/2845_EnhancedProgress/HeuristicLab.DataPreprocessing
- Property svn:mergeinfo changed
/stable/HeuristicLab.DataPreprocessing reverse-merged: 15587 /branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing removed /trunk/sources/HeuristicLab.DataPreprocessing removed
- Property svn:mergeinfo changed
-
branches/2845_EnhancedProgress/HeuristicLab.DataPreprocessing/3.4
- Property svn:mergeinfo changed
/stable/HeuristicLab.DataPreprocessing/3.4 reverse-merged: 15587 /branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4 removed /trunk/sources/HeuristicLab.DataPreprocessing/3.4 removed
- Property svn:mergeinfo changed
-
branches/2845_EnhancedProgress/HeuristicLab.DataPreprocessing/3.4/Data/PreprocessingData.cs
r16307 r16308 1 1 #region License Information 2 2 /* HeuristicLab 3 * Copyright (C) 2002-201 8Heuristic and Evolutionary Algorithms Laboratory (HEAL)3 * Copyright (C) 2002-2016 Heuristic and Evolutionary Algorithms Laboratory (HEAL) 4 4 * 5 5 * This file is part of HeuristicLab. … … 23 23 using System.Collections; 24 24 using System.Collections.Generic; 25 using System.Globalization;26 25 using System.Linq; 27 26 using HeuristicLab.Common; 28 27 using HeuristicLab.Core; 29 28 using HeuristicLab.Data; 30 using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;31 29 using HeuristicLab.Problems.DataAnalysis; 32 30 … … 34 32 35 33 [Item("PreprocessingData", "Represents data used for preprocessing.")] 36 [StorableClass] 37 public class PreprocessingData : NamedItem, IPreprocessingData { 38 39 [Storable] 34 public abstract class PreprocessingData : NamedItem, IPreprocessingData { 35 public IntRange TrainingPartition { get; set; } 36 public IntRange TestPartition { get; set; } 37 38 public IList<ITransformation> Transformations { get; protected set; } 39 40 40 protected IList<IList> variableValues; 41 [Storable]42 41 protected IList<string> variableNames; 43 42 44 #region Constructor, Cloning & Persistence 45 public PreprocessingData(IDataAnalysisProblemData problemData) 46 : base() { 47 Name = "Preprocessing Data"; 48 49 Transformations = new List<ITransformation>(); 50 selection = new Dictionary<int, IList<int>>(); 51 52 Import(problemData); 53 54 RegisterEventHandler(); 43 public IEnumerable<string> VariableNames { 44 get { return variableNames; } 45 } 46 47 public IEnumerable<string> GetDoubleVariableNames() { 48 var doubleVariableNames = new List<string>(); 49 for (int i = 0; i < Columns; ++i) { 50 if (VariableHasType<double>(i)) { 51 doubleVariableNames.Add(variableNames[i]); 52 } 53 } 54 return doubleVariableNames; 55 } 56 57 public IList<string> InputVariables { get; private set; } 58 public string TargetVariable { get; private set; } // optional 59 60 public int Columns { 61 get { return variableNames.Count; } 62 } 63 64 public int Rows { 65 get { return variableValues.Count > 0 ? variableValues[0].Count : 0; } 66 } 67 68 protected IDictionary<int, IList<int>> selection; 69 public IDictionary<int, IList<int>> Selection { 70 get { return selection; } 71 set { 72 selection = value; 73 OnSelectionChanged(); 74 } 55 75 } 56 76 … … 68 88 RegisterEventHandler(); 69 89 } 70 public override IDeepCloneable Clone(Cloner cloner) { 71 return new PreprocessingData(this, cloner); 72 } 73 74 [StorableConstructor] 75 protected PreprocessingData(bool deserializing) 76 : base(deserializing) { } 77 [StorableHook(HookType.AfterDeserialization)] 78 private void AfterDeserialization() { 90 91 protected PreprocessingData(IDataAnalysisProblemData problemData) 92 : base() { 93 Name = "Preprocessing Data"; 94 95 Transformations = new List<ITransformation>(); 96 selection = new Dictionary<int, IList<int>>(); 97 98 Import(problemData); 99 79 100 RegisterEventHandler(); 80 101 } 81 102 82 private void RegisterEventHandler() {83 Changed += (s, e) => {84 switch (e.Type) {85 case DataPreprocessingChangedEventType.DeleteRow:86 case DataPreprocessingChangedEventType.Any:87 case DataPreprocessingChangedEventType.Transformation:88 int maxRowIndex = Math.Max(0, Rows);89 TrainingPartition.Start = Math.Min(TrainingPartition.Start, maxRowIndex);90 TrainingPartition.End = Math.Min(TrainingPartition.End, maxRowIndex);91 TestPartition.Start = Math.Min(TestPartition.Start, maxRowIndex);92 TestPartition.End = Math.Min(TestPartition.End, maxRowIndex);93 break;94 }95 };96 }97 #endregion98 99 #region Cells100 public bool IsCellEmpty(int columnIndex, int rowIndex) {101 var value = variableValues[columnIndex][rowIndex];102 return IsMissingValue(value);103 }104 105 public T GetCell<T>(int columnIndex, int rowIndex) {106 return (T)variableValues[columnIndex][rowIndex];107 }108 109 public void SetCell<T>(int columnIndex, int rowIndex, T value) {110 SaveSnapshot(DataPreprocessingChangedEventType.ChangeItem, columnIndex, rowIndex);111 112 for (int i = Rows; i <= rowIndex; i++)113 InsertRow(i);114 for (int i = Columns; i <= columnIndex; i++)115 InsertColumn<T>(i.ToString(), i);116 117 variableValues[columnIndex][rowIndex] = value;118 if (!IsInTransaction)119 OnChanged(DataPreprocessingChangedEventType.ChangeItem, columnIndex, rowIndex);120 }121 122 public string GetCellAsString(int columnIndex, int rowIndex) {123 return variableValues[columnIndex][rowIndex].ToString();124 }125 126 public IList<T> GetValues<T>(int columnIndex, bool considerSelection) {127 if (considerSelection) {128 var list = new List<T>();129 foreach (var rowIdx in selection[columnIndex]) {130 list.Add((T)variableValues[columnIndex][rowIdx]);131 }132 return list;133 } else {134 return (IList<T>)variableValues[columnIndex];135 }136 }137 138 public void SetValues<T>(int columnIndex, IList<T> values) {139 SaveSnapshot(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1);140 if (VariableHasType<T>(columnIndex)) {141 variableValues[columnIndex] = (IList)values;142 } else {143 throw new ArgumentException("The datatype of column " + columnIndex + " must be of type " + variableValues[columnIndex].GetType().Name + " but was " + typeof(T).Name);144 }145 if (!IsInTransaction)146 OnChanged(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1);147 }148 149 public bool SetValue(string value, int columnIndex, int rowIndex) {150 bool valid = false;151 if (VariableHasType<double>(columnIndex)) {152 double val;153 if (string.IsNullOrWhiteSpace(value)) {154 val = double.NaN;155 valid = true;156 } else {157 valid = double.TryParse(value, out val);158 }159 if (valid)160 SetCell(columnIndex, rowIndex, val);161 } else if (VariableHasType<string>(columnIndex)) {162 valid = value != null;163 if (valid)164 SetCell(columnIndex, rowIndex, value);165 } else if (VariableHasType<DateTime>(columnIndex)) {166 DateTime date;167 valid = DateTime.TryParse(value, out date);168 if (valid)169 SetCell(columnIndex, rowIndex, date);170 } else {171 throw new ArgumentException("column " + columnIndex + " contains a non supported type.");172 }173 174 if (!IsInTransaction)175 OnChanged(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1);176 177 return valid;178 }179 180 public int Columns {181 get { return variableNames.Count; }182 }183 184 public int Rows {185 get { return variableValues.Count > 0 ? variableValues[0].Count : 0; }186 }187 188 public static bool IsMissingValue(object value) {189 if (value is double) return double.IsNaN((double)value);190 if (value is string) return string.IsNullOrEmpty((string)value);191 if (value is DateTime) return ((DateTime)value).Equals(DateTime.MinValue);192 throw new ArgumentException();193 }194 #endregion195 196 #region Rows197 public void InsertRow(int rowIndex) {198 SaveSnapshot(DataPreprocessingChangedEventType.DeleteRow, -1, rowIndex);199 foreach (IList column in variableValues) {200 Type type = column.GetType().GetGenericArguments()[0];201 column.Insert(rowIndex, type.IsValueType ? Activator.CreateInstance(type) : null);202 }203 if (TrainingPartition.Start <= rowIndex && rowIndex <= TrainingPartition.End) {204 TrainingPartition.End++;205 if (TrainingPartition.End <= TestPartition.Start) {206 TestPartition.Start++;207 TestPartition.End++;208 }209 } else if (TestPartition.Start <= rowIndex && rowIndex <= TestPartition.End) {210 TestPartition.End++;211 if (TestPartition.End <= TrainingPartition.Start) {212 TestPartition.Start++;213 TestPartition.End++;214 }215 }216 if (!IsInTransaction)217 OnChanged(DataPreprocessingChangedEventType.AddRow, -1, rowIndex);218 }219 public void DeleteRow(int rowIndex) {220 SaveSnapshot(DataPreprocessingChangedEventType.AddRow, -1, rowIndex);221 foreach (IList column in variableValues) {222 column.RemoveAt(rowIndex);223 }224 if (TrainingPartition.Start <= rowIndex && rowIndex <= TrainingPartition.End) {225 TrainingPartition.End--;226 if (TrainingPartition.End <= TestPartition.Start) {227 TestPartition.Start--;228 TestPartition.End--;229 }230 } else if (TestPartition.Start <= rowIndex && rowIndex <= TestPartition.End) {231 TestPartition.End--;232 if (TestPartition.End <= TrainingPartition.Start) {233 TestPartition.Start--;234 TestPartition.End--;235 }236 }237 if (!IsInTransaction)238 OnChanged(DataPreprocessingChangedEventType.DeleteRow, -1, rowIndex);239 }240 public void DeleteRowsWithIndices(IEnumerable<int> rows) {241 SaveSnapshot(DataPreprocessingChangedEventType.AddRow, -1, -1);242 foreach (int rowIndex in rows.OrderByDescending(x => x)) {243 foreach (IList column in variableValues) {244 column.RemoveAt(rowIndex);245 }246 if (TrainingPartition.Start <= rowIndex && rowIndex <= TrainingPartition.End) {247 TrainingPartition.End--;248 if (TrainingPartition.End <= TestPartition.Start) {249 TestPartition.Start--;250 TestPartition.End--;251 }252 } else if (TestPartition.Start <= rowIndex && rowIndex <= TestPartition.End) {253 TestPartition.End--;254 if (TestPartition.End <= TrainingPartition.Start) {255 TestPartition.Start--;256 TestPartition.End--;257 }258 }259 }260 if (!IsInTransaction)261 OnChanged(DataPreprocessingChangedEventType.DeleteRow, -1, -1);262 }263 264 public void InsertColumn<T>(string variableName, int columnIndex) {265 SaveSnapshot(DataPreprocessingChangedEventType.DeleteColumn, columnIndex, -1);266 variableValues.Insert(columnIndex, new List<T>(Enumerable.Repeat(default(T), Rows)));267 variableNames.Insert(columnIndex, variableName);268 if (!IsInTransaction)269 OnChanged(DataPreprocessingChangedEventType.AddColumn, columnIndex, -1);270 }271 272 public void DeleteColumn(int columnIndex) {273 SaveSnapshot(DataPreprocessingChangedEventType.AddColumn, columnIndex, -1);274 variableValues.RemoveAt(columnIndex);275 variableNames.RemoveAt(columnIndex);276 if (!IsInTransaction)277 OnChanged(DataPreprocessingChangedEventType.DeleteColumn, columnIndex, -1);278 }279 280 public void RenameColumn(int columnIndex, string name) {281 SaveSnapshot(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1);282 if (columnIndex < 0 || columnIndex > variableNames.Count)283 throw new ArgumentOutOfRangeException("columnIndex");284 variableNames[columnIndex] = name;285 286 if (!IsInTransaction)287 OnChanged(DataPreprocessingChangedEventType.ChangeColumn, -1, -1);288 }289 290 public void RenameColumns(IList<string> names) {291 if (names == null) throw new ArgumentNullException("names");292 if (names.Count != variableNames.Count) throw new ArgumentException("number of names must match the number of columns.", "names");293 294 SaveSnapshot(DataPreprocessingChangedEventType.ChangeColumn, -1, -1);295 for (int i = 0; i < names.Count; i++)296 variableNames[i] = names[i];297 298 if (!IsInTransaction)299 OnChanged(DataPreprocessingChangedEventType.ChangeColumn, -1, -1);300 }301 302 public bool AreAllStringColumns(IEnumerable<int> columnIndices) {303 return columnIndices.All(x => VariableHasType<string>(x));304 }305 #endregion306 307 #region Variables308 public IEnumerable<string> VariableNames {309 get { return variableNames; }310 }311 312 public IEnumerable<string> GetDoubleVariableNames() {313 var doubleVariableNames = new List<string>();314 for (int i = 0; i < Columns; ++i) {315 if (VariableHasType<double>(i)) {316 doubleVariableNames.Add(variableNames[i]);317 }318 }319 return doubleVariableNames;320 }321 322 public string GetVariableName(int columnIndex) {323 return variableNames[columnIndex];324 }325 326 public int GetColumnIndex(string variableName) {327 return variableNames.IndexOf(variableName);328 }329 330 public bool VariableHasType<T>(int columnIndex) {331 return columnIndex >= variableValues.Count || variableValues[columnIndex] is List<T>;332 }333 334 public Type GetVariableType(int columnIndex) {335 var listType = variableValues[columnIndex].GetType();336 return listType.GenericTypeArguments.Single();337 }338 339 public IList<string> InputVariables { get; private set; }340 public string TargetVariable { get; private set; } // optional341 #endregion342 343 #region Partitions344 [Storable]345 public IntRange TrainingPartition { get; set; }346 [Storable]347 public IntRange TestPartition { get; set; }348 #endregion349 350 #region Transformations351 [Storable]352 public IList<ITransformation> Transformations { get; protected set; }353 #endregion354 355 #region Validation356 public bool Validate(string value, out string errorMessage, int columnIndex) {357 if (columnIndex < 0 || columnIndex > VariableNames.Count()) {358 throw new ArgumentOutOfRangeException("column index is out of range");359 }360 361 bool valid = false;362 errorMessage = string.Empty;363 if (VariableHasType<double>(columnIndex)) {364 if (string.IsNullOrWhiteSpace(value)) {365 valid = true;366 } else {367 double val;368 valid = double.TryParse(value, out val);369 if (!valid) {370 errorMessage = "Invalid Value (Valid Value Format: \"" + FormatPatterns.GetDoubleFormatPattern() + "\")";371 }372 }373 } else if (VariableHasType<string>(columnIndex)) {374 valid = value != null;375 if (!valid) {376 errorMessage = "Invalid Value (string must not be null)";377 }378 } else if (VariableHasType<DateTime>(columnIndex)) {379 DateTime date;380 valid = DateTime.TryParse(value, out date);381 if (!valid) {382 errorMessage = "Invalid Value (Valid Value Format: \"" + CultureInfo.CurrentCulture.DateTimeFormat + "\"";383 }384 } else {385 throw new ArgumentException("column " + columnIndex + " contains a non supported type.");386 }387 388 return valid;389 }390 #endregion391 392 #region Import & Export393 103 public void Import(IDataAnalysisProblemData problemData) { 394 104 Dataset dataset = (Dataset)problemData.Dataset; … … 397 107 TargetVariable = (problemData is IRegressionProblemData) ? ((IRegressionProblemData)problemData).TargetVariable 398 108 : (problemData is IClassificationProblemData) ? ((IClassificationProblemData)problemData).TargetVariable 399 109 : null; 400 110 401 111 int columnIndex = 0; … … 418 128 } 419 129 420 public Dataset ExportToDataset() { 421 IList<IList> values = new List<IList>(); 422 423 for (int i = 0; i < Columns; ++i) { 424 values.Add(variableValues[i]); 425 } 426 427 var dataset = new Dataset(variableNames, values); 428 return dataset; 429 } 430 #endregion 431 432 #region Selection 433 [Storable] 434 protected IDictionary<int, IList<int>> selection; 435 public IDictionary<int, IList<int>> Selection { 436 get { return selection; } 437 set { 438 selection = value; 439 OnSelectionChanged(); 440 } 441 } 442 public void ClearSelection() { 443 Selection = new Dictionary<int, IList<int>>(); 444 } 445 446 public event EventHandler SelectionChanged; 447 protected void OnSelectionChanged() { 448 var listeners = SelectionChanged; 449 if (listeners != null) listeners(this, EventArgs.Empty); 450 } 451 #endregion 452 453 #region Transactions 454 // Stapshot/History are nost storable/cloneable on purpose 455 private class Snapshot { 456 public IList<IList> VariableValues { get; set; } 457 public IList<string> VariableNames { get; set; } 458 459 public IntRange TrainingPartition { get; set; } 460 public IntRange TestPartition { get; set; } 461 public IList<ITransformation> Transformations { get; set; } 462 public DataPreprocessingChangedEventType ChangedType { get; set; } 463 464 public int ChangedColumn { get; set; } 465 public int ChangedRow { get; set; } 466 } 130 private void RegisterEventHandler() { 131 Changed += (s, e) => { 132 switch (e.Type) { 133 case DataPreprocessingChangedEventType.DeleteRow: 134 CheckPartitionRanges(); 135 break; 136 case DataPreprocessingChangedEventType.Any: 137 CheckPartitionRanges(); 138 break; 139 case DataPreprocessingChangedEventType.Transformation: 140 CheckPartitionRanges(); 141 break; 142 } 143 }; 144 } 145 146 private void CheckPartitionRanges() { 147 int maxRowIndex = Math.Max(0, Rows); 148 TrainingPartition.Start = Math.Min(TrainingPartition.Start, maxRowIndex); 149 TrainingPartition.End = Math.Min(TrainingPartition.End, maxRowIndex); 150 TestPartition.Start = Math.Min(TestPartition.Start, maxRowIndex); 151 TestPartition.End = Math.Min(TestPartition.End, maxRowIndex); 152 } 153 154 protected IList<IList> CopyVariableValues(IList<IList> original) { 155 var copy = new List<IList>(original); 156 for (int i = 0; i < original.Count; ++i) { 157 copy[i] = (IList)Activator.CreateInstance(original[i].GetType(), original[i]); 158 } 159 return copy; 160 } 161 162 163 #region IPreprocessingData Members 164 public abstract T GetCell<T>(int columnIndex, int rowIndex); 165 166 public abstract void SetCell<T>(int columnIndex, int rowIndex, T value); 167 168 public abstract string GetCellAsString(int columnIndex, int rowIndex); 169 170 public abstract string GetVariableName(int columnIndex); 171 172 public abstract int GetColumnIndex(string variableName); 173 174 public abstract bool VariableHasType<T>(int columnIndex); 175 176 [Obsolete("use the index based variant, is faster")] 177 public abstract IList<T> GetValues<T>(string variableName, bool considerSelection); 178 179 public abstract IList<T> GetValues<T>(int columnIndex, bool considerSelection); 180 181 public abstract void SetValues<T>(int columnIndex, IList<T> values); 182 183 public abstract bool SetValue(string value, int columnIndex, int rowIndex); 184 185 public abstract bool Validate(string value, out string errorMessage, int columnIndex); 186 187 public abstract bool AreAllStringColumns(IEnumerable<int> columnIndices); 188 189 public abstract void DeleteRowsWithIndices(IEnumerable<int> rows); 190 191 public abstract void InsertRow(int rowIndex); 192 193 public abstract void DeleteRow(int rowIndex); 194 195 public abstract void InsertColumn<T>(string variableName, int columnIndex); 196 197 public abstract void DeleteColumn(int columnIndex); 198 199 public abstract void RenameColumn(int columnIndex, string name); 200 public abstract void RenameColumns(IList<string> list); 201 202 public abstract Dataset ExportToDataset(); 203 204 public abstract void ClearSelection(); 205 206 public abstract event EventHandler SelectionChanged; 207 protected abstract void OnSelectionChanged(); 467 208 468 209 public event DataPreprocessingChangedEventHandler Changed; … … 471 212 if (listeners != null) listeners(this, new DataPreprocessingChangedEventArgs(type, column, row)); 472 213 } 473 474 private const int MAX_UNDO_DEPTH = 5;475 476 private readonly IList<Snapshot> undoHistory = new List<Snapshot>();477 private readonly Stack<DataPreprocessingChangedEventType> eventStack = new Stack<DataPreprocessingChangedEventType>();478 479 public bool IsInTransaction { get { return eventStack.Count > 0; } }480 481 private void SaveSnapshot(DataPreprocessingChangedEventType changedType, int column, int row) {482 if (IsInTransaction) return;483 484 var currentSnapshot = new Snapshot {485 VariableValues = CopyVariableValues(variableValues),486 VariableNames = new List<string>(variableNames),487 TrainingPartition = new IntRange(TrainingPartition.Start, TrainingPartition.End),488 TestPartition = new IntRange(TestPartition.Start, TestPartition.End),489 Transformations = new List<ITransformation>(Transformations),490 ChangedType = changedType,491 ChangedColumn = column,492 ChangedRow = row493 };494 495 if (undoHistory.Count >= MAX_UNDO_DEPTH)496 undoHistory.RemoveAt(0);497 498 undoHistory.Add(currentSnapshot);499 }500 501 public bool IsUndoAvailable {502 get { return undoHistory.Count > 0; }503 }504 505 public void Undo() {506 if (IsUndoAvailable) {507 Snapshot previousSnapshot = undoHistory[undoHistory.Count - 1];508 variableValues = previousSnapshot.VariableValues;509 variableNames = previousSnapshot.VariableNames;510 TrainingPartition = previousSnapshot.TrainingPartition;511 TestPartition = previousSnapshot.TestPartition;512 Transformations = previousSnapshot.Transformations;513 undoHistory.Remove(previousSnapshot);514 OnChanged(previousSnapshot.ChangedType,515 previousSnapshot.ChangedColumn,516 previousSnapshot.ChangedRow);517 }518 }519 520 public void InTransaction(Action action, DataPreprocessingChangedEventType type = DataPreprocessingChangedEventType.Any) {521 BeginTransaction(type);522 action();523 EndTransaction();524 }525 526 public void BeginTransaction(DataPreprocessingChangedEventType type) {527 SaveSnapshot(type, -1, -1);528 eventStack.Push(type);529 }530 531 public void EndTransaction() {532 if (eventStack.Count == 0)533 throw new InvalidOperationException("There is no open transaction that can be ended.");534 535 var @event = eventStack.Pop();536 OnChanged(@event, -1, -1);537 }538 #endregion539 540 #region Statistics541 public T GetMin<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {542 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection);543 return values.Any() ? values.Min() : emptyValue;544 }545 546 public T GetMax<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {547 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection);548 return values.Any() ? values.Max() : emptyValue;549 }550 551 public T GetMean<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {552 if (typeof(T) == typeof(double)) {553 var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection);554 return values.Any() ? Convert<T>(values.Average()) : emptyValue;555 }556 if (typeof(T) == typeof(string)) {557 return Convert<T>(string.Empty);558 }559 if (typeof(T) == typeof(DateTime)) {560 var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection);561 return values.Any() ? Convert<T>(AggregateAsDouble(values, Enumerable.Average)) : emptyValue;562 }563 564 throw new InvalidOperationException(typeof(T) + " not supported");565 }566 567 public T GetMedian<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> {568 if (typeof(T) == typeof(double)) {// IEnumerable<double> is faster569 var doubleValues = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection);570 return doubleValues.Any() ? Convert<T>(doubleValues.Median()) : emptyValue;571 }572 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection);573 return values.Any() ? values.Quantile(0.5) : emptyValue;574 }575 576 public T GetMode<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IEquatable<T> {577 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection);578 return values.Any() ? values.GroupBy(x => x).OrderByDescending(g => g.Count()).Select(g => g.Key).First() : emptyValue;579 }580 581 public T GetStandardDeviation<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {582 if (typeof(T) == typeof(double)) {583 var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection);584 return values.Any() ? Convert<T>(values.StandardDeviation()) : emptyValue;585 }586 // For DateTime, std.dev / variance would have to be TimeSpan587 //if (typeof(T) == typeof(DateTime)) {588 // var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection);589 // return values.Any() ? Convert<T>(AggregateAsDouble(values, EnumerableStatisticExtensions.StandardDeviation)) : emptyValue;590 //}591 return default(T);592 }593 594 public T GetVariance<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {595 if (typeof(T) == typeof(double)) {596 var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection);597 return values.Any() ? Convert<T>(values.Variance()) : emptyValue;598 }599 // DateTime variance often overflows long, thus the corresponding DateTime is invalid600 //if (typeof(T) == typeof(DateTime)) {601 // var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection);602 // return values.Any() ? Convert<T>(AggregateAsDouble(values, EnumerableStatisticExtensions.Variance)) : emptyValue;603 //}604 return default(T);605 }606 607 public T GetQuantile<T>(double alpha, int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> {608 if (typeof(T) == typeof(double)) {// IEnumerable<double> is faster609 var doubleValues = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection);610 return doubleValues.Any() ? Convert<T>(doubleValues.Quantile(alpha)) : emptyValue;611 }612 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection);613 return values.Any() ? values.Quantile(alpha) : emptyValue;614 }615 616 public int GetDistinctValues<T>(int columnIndex, bool considerSelection = false) {617 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection);618 return values.GroupBy(x => x).Count();619 }620 621 private IEnumerable<T> GetValuesWithoutMissingValues<T>(int columnIndex, bool considerSelection) {622 return GetValues<T>(columnIndex, considerSelection).Where(x => !IsMissingValue(x));623 }624 625 private static DateTime AggregateAsDouble(IEnumerable<DateTime> values, Func<IEnumerable<double>, double> func) {626 return new DateTime((long)(func(values.Select(x => (double)x.Ticks / TimeSpan.TicksPerSecond)) * TimeSpan.TicksPerSecond));627 }628 private static T Convert<T>(object obj) { return (T)obj; }629 630 public int GetMissingValueCount() {631 int count = 0;632 for (int i = 0; i < Columns; ++i) {633 count += GetMissingValueCount(i);634 }635 return count;636 }637 public int GetMissingValueCount(int columnIndex) {638 int sum = 0;639 for (int i = 0; i < Rows; i++) {640 if (IsCellEmpty(columnIndex, i))641 sum++;642 }643 return sum;644 }645 public int GetRowMissingValueCount(int rowIndex) {646 int sum = 0;647 for (int i = 0; i < Columns; i++) {648 if (IsCellEmpty(i, rowIndex))649 sum++;650 }651 return sum;652 }653 #endregion654 655 #region Helpers656 private static IList<IList> CopyVariableValues(IList<IList> original) {657 var copy = new List<IList>(original);658 for (int i = 0; i < original.Count; ++i) {659 copy[i] = (IList)Activator.CreateInstance(original[i].GetType(), original[i]);660 }661 return copy;662 }663 214 #endregion 664 215 } 665 666 // Adapted from HeuristicLab.Common.EnumerableStatisticExtensions667 internal static class EnumerableExtensions {668 public static T Quantile<T>(this IEnumerable<T> values, double alpha) where T : IComparable<T> {669 T[] valuesArr = values.ToArray();670 int n = valuesArr.Length;671 if (n == 0) throw new InvalidOperationException("Enumeration contains no elements.");672 673 var pos = n * alpha;674 675 return Select((int)Math.Ceiling(pos) - 1, valuesArr);676 677 }678 679 private static T Select<T>(int k, T[] arr) where T : IComparable<T> {680 int i, ir, j, l, mid, n = arr.Length;681 T a;682 l = 0;683 ir = n - 1;684 for (;;) {685 if (ir <= l + 1) {686 // Active partition contains 1 or 2 elements.687 if (ir == l + 1 && arr[ir].CompareTo(arr[l]) < 0) {688 // Case of 2 elements.689 Swap(arr, l, ir);690 }691 return arr[k];692 } else {693 mid = (l + ir) >> 1; // Choose median of left, center, and right elements694 Swap(arr, mid, l + 1); // as partitioning element a. Also695 696 if (arr[l].CompareTo(arr[ir]) > 0) { // rearrange so that arr[l] arr[ir] <= arr[l+1],697 Swap(arr, l, ir); // . arr[ir] >= arr[l+1]698 }699 700 if (arr[l + 1].CompareTo(arr[ir]) > 0) {701 Swap(arr, l + 1, ir);702 }703 if (arr[l].CompareTo(arr[l + 1]) > 0) {704 Swap(arr, l, l + 1);705 }706 i = l + 1; // Initialize pointers for partitioning.707 j = ir;708 a = arr[l + 1]; // Partitioning element.709 for (;;) { // Beginning of innermost loop.710 do i++; while (arr[i].CompareTo(a) < 0); // Scan up to find element > a.711 do j--; while (arr[j].CompareTo(a) > 0); // Scan down to find element < a.712 if (j < i) break; // Pointers crossed. Partitioning complete.713 Swap(arr, i, j);714 } // End of innermost loop.715 arr[l + 1] = arr[j]; // Insert partitioning element.716 arr[j] = a;717 if (j >= k) ir = j - 1; // Keep active the partition that contains the718 if (j <= k) l = i; // kth element.719 }720 }721 }722 723 private static void Swap<T>(T[] arr, int i, int j) {724 T temp = arr[i];725 arr[i] = arr[j];726 arr[j] = temp;727 }728 }729 216 }
Note: See TracChangeset
for help on using the changeset viewer.