Changeset 15518 for trunk/sources/HeuristicLab.DataPreprocessing/3.4/Data
- Timestamp:
- 12/12/17 16:32:35 (7 years ago)
- Location:
- trunk/sources/HeuristicLab.DataPreprocessing
- Files:
-
- 2 deleted
- 6 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/sources/HeuristicLab.DataPreprocessing
- Property svn:mergeinfo changed
/branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing (added) merged: 15269-15270,15274,15283,15285,15291,15309,15431,15466,15489
- Property svn:mergeinfo changed
-
trunk/sources/HeuristicLab.DataPreprocessing/3.4
- Property svn:mergeinfo changed
/branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4 (added) merged: 15269-15270,15274,15283,15285,15291,15309,15431,15466,15489
- Property svn:mergeinfo changed
-
trunk/sources/HeuristicLab.DataPreprocessing/3.4/Data/FilteredPreprocessingData.cs
r15110 r15518 22 22 using System; 23 23 using System.Collections.Generic; 24 using System.Linq; 24 25 using HeuristicLab.Common; 25 26 using HeuristicLab.Core; 26 27 using HeuristicLab.Data; 28 using HeuristicLab.Persistence.Default.CompositeSerializers.Storable; 27 29 using HeuristicLab.Problems.DataAnalysis; 28 30 29 31 namespace HeuristicLab.DataPreprocessing { 30 public class FilteredPreprocessingData : NamedItem, IFilteredPreprocessingData { 31 private readonly ITransactionalPreprocessingData originalData; 32 private ITransactionalPreprocessingData filteredData; 33 32 [Item("FilteredPreprocessingData", "Represents filtered data used for preprocessing.")] 33 [StorableClass] 34 public sealed class FilteredPreprocessingData : NamedItem, IFilteredPreprocessingData { 35 36 [Storable] 37 private readonly IPreprocessingData originalData; 38 [Storable] 39 private IPreprocessingData filteredData; 40 41 public IPreprocessingData ActiveData { 42 get { return IsFiltered ? filteredData : originalData; } 43 } 44 45 #region Constructor, Cloning & Persistence 46 public FilteredPreprocessingData(IPreprocessingData preprocessingData) 47 : base() { 48 originalData = preprocessingData; 49 filteredData = null; 50 } 51 52 private FilteredPreprocessingData(FilteredPreprocessingData original, Cloner cloner) 53 : base(original, cloner) { 54 originalData = original.originalData; 55 filteredData = original.filteredData; 56 } 57 public override IDeepCloneable Clone(Cloner cloner) { 58 return new FilteredPreprocessingData(this, cloner); 59 } 60 61 [StorableConstructor] 62 private FilteredPreprocessingData(bool deserializing) 63 : base(deserializing) { } 64 #endregion 65 66 #region Cells 67 public bool IsCellEmpty(int columnIndex, int rowIndex) { 68 return ActiveData.IsCellEmpty(columnIndex, rowIndex); 69 } 70 71 public T GetCell<T>(int columnIndex, int rowIndex) { 72 return ActiveData.GetCell<T>(columnIndex, rowIndex); 73 } 74 75 public void SetCell<T>(int columnIndex, int rowIndex, T 
value) { 76 if (IsFiltered) 77 throw new InvalidOperationException("SetValues not possible while data is filtered"); 78 originalData.SetCell<T>(columnIndex, rowIndex, value); 79 } 80 81 public string GetCellAsString(int columnIndex, int rowIndex) { 82 return ActiveData.GetCellAsString(columnIndex, rowIndex); 83 } 84 85 public IList<T> GetValues<T>(int columnIndex, bool considerSelection) { 86 return ActiveData.GetValues<T>(columnIndex, considerSelection); 87 } 88 89 public void SetValues<T>(int columnIndex, IList<T> values) { 90 if (IsFiltered) 91 throw new InvalidOperationException("SetValues not possible while data is filtered"); 92 93 originalData.SetValues<T>(columnIndex, values); 94 } 95 96 public bool SetValue(string value, int columnIndex, int rowIndex) { 97 if (IsFiltered) 98 throw new InvalidOperationException("SetValue not possible while data is filtered"); 99 return originalData.SetValue(value, columnIndex, rowIndex); 100 } 101 102 public int Columns { 103 get { return ActiveData.Columns; } 104 } 105 106 public int Rows { 107 get { return ActiveData.Rows; } 108 } 109 #endregion 110 111 #region Rows 112 public void InsertRow(int rowIndex) { 113 if (IsFiltered) 114 throw new InvalidOperationException("InsertRow not possible while data is filtered"); 115 116 originalData.InsertRow(rowIndex); 117 } 118 119 public void DeleteRow(int rowIndex) { 120 if (IsFiltered) 121 throw new InvalidOperationException("DeleteRow not possible while data is filtered"); 122 123 originalData.DeleteRow(rowIndex); 124 } 125 126 public void DeleteRowsWithIndices(IEnumerable<int> rows) { 127 if (IsFiltered) 128 throw new InvalidOperationException("DeleteRowsWithIndices not possible while data is filtered"); 129 130 originalData.DeleteRowsWithIndices(rows); 131 } 132 133 public void InsertColumn<T>(string variableName, int columnIndex) { 134 if (IsFiltered) 135 throw new InvalidOperationException("InsertColumn not possible while data is filtered"); 136 137 
originalData.InsertColumn<T>(variableName, columnIndex); 138 } 139 140 public void DeleteColumn(int columnIndex) { 141 if (IsFiltered) 142 throw new InvalidOperationException("DeleteColumn not possible while data is filtered"); 143 originalData.DeleteColumn(columnIndex); 144 } 145 146 public void RenameColumn(int columnIndex, string name) { 147 if (IsFiltered) 148 throw new InvalidOperationException("RenameColumn not possible while data is filtered"); 149 originalData.RenameColumn(columnIndex, name); 150 } 151 152 public void RenameColumns(IList<string> names) { 153 if (IsFiltered) 154 throw new InvalidOperationException("RenameColumns not possible while data is filtered"); 155 originalData.RenameColumns(names); 156 } 157 158 public bool AreAllStringColumns(IEnumerable<int> columnIndices) { 159 return originalData.AreAllStringColumns(columnIndices); 160 } 161 #endregion 162 163 #region Variables 164 public IEnumerable<string> VariableNames { 165 get { return ActiveData.VariableNames; } 166 } 167 public IEnumerable<string> GetDoubleVariableNames() { 168 return originalData.GetDoubleVariableNames(); 169 } 170 public string GetVariableName(int columnIndex) { 171 return ActiveData.GetVariableName(columnIndex); 172 } 173 174 public int GetColumnIndex(string variableName) { 175 return ActiveData.GetColumnIndex(variableName); 176 } 177 178 public bool VariableHasType<T>(int columnIndex) { 179 return originalData.VariableHasType<T>(columnIndex); 180 } 181 182 public Type GetVariableType(int columnIndex) { 183 return ActiveData.GetVariableType(columnIndex); 184 } 185 186 public IList<string> InputVariables { 187 get { return ActiveData.InputVariables; } 188 } 189 190 public string TargetVariable { 191 get { return ActiveData.TargetVariable; } 192 } // optional 193 #endregion 194 195 #region Partitions 34 196 public IntRange TrainingPartition { 35 197 get { return originalData.TrainingPartition; } … … 39 201 get { return originalData.TestPartition; } 40 202 } 41 203 
#endregion 204 205 #region Transformations 42 206 public IList<ITransformation> Transformations { 43 207 get { return originalData.Transformations; } 44 208 } 45 46 public IEnumerable<string> VariableNames { 47 get { return ActiveData.VariableNames; } 48 } 49 50 public IList<string> InputVariables { get { return ActiveData.InputVariables; } } 51 public string TargetVariable { get { return ActiveData.TargetVariable; } } // optional 52 209 #endregion 210 211 #region Validation 212 public bool Validate(string value, out string errorMessage, int columnIndex) { 213 return originalData.Validate(value, out errorMessage, columnIndex); 214 } 215 #endregion 216 217 #region Import & Export 218 public void Import(IDataAnalysisProblemData problemData) { 219 if (IsFiltered) 220 throw new InvalidOperationException("Import not possible while data is filtered"); 221 originalData.Import(problemData); 222 } 223 224 public Dataset ExportToDataset() { 225 return originalData.ExportToDataset(); 226 } 227 #endregion 228 229 #region Selection 53 230 public IDictionary<int, IList<int>> Selection { 54 231 get { return originalData.Selection; } … … 56 233 } 57 234 58 public int Columns { 59 get { return ActiveData.Columns; } 60 } 61 62 public int Rows { 63 get { return ActiveData.Rows; } 64 } 65 66 public ITransactionalPreprocessingData ActiveData { 67 get { return IsFiltered ? 
filteredData : originalData; } 235 public void ClearSelection() { 236 originalData.ClearSelection(); 237 } 238 239 public event EventHandler SelectionChanged { 240 add { originalData.SelectionChanged += value; } 241 remove { originalData.SelectionChanged -= value; } 242 } 243 #endregion 244 245 #region Transactions 246 public event DataPreprocessingChangedEventHandler Changed { 247 add { originalData.Changed += value; } 248 remove { originalData.Changed -= value; } 68 249 } 69 250 … … 72 253 } 73 254 74 public bool IsFiltered { 75 get { return filteredData != null; } 76 } 77 78 79 public FilteredPreprocessingData(ITransactionalPreprocessingData preporcessingData) 80 : base() { 81 originalData = preporcessingData; 82 filteredData = null; 83 } 84 85 protected FilteredPreprocessingData(FilteredPreprocessingData original, Cloner cloner) 86 : base(original, cloner) { 87 originalData = original.originalData; 88 filteredData = original.filteredData; 89 } 90 public override IDeepCloneable Clone(Cloner cloner) { 91 return new FilteredPreprocessingData(this, cloner); 92 } 93 94 public T GetCell<T>(int columnIndex, int rowIndex) { 95 return ActiveData.GetCell<T>(columnIndex, rowIndex); 96 } 97 98 public void SetCell<T>(int columnIndex, int rowIndex, T value) { 99 if (IsFiltered) 100 throw new InvalidOperationException("SetValues not possible while data is filtered"); 101 originalData.SetCell<T>(columnIndex, rowIndex, value); 102 } 103 104 public string GetCellAsString(int columnIndex, int rowIndex) { 105 return ActiveData.GetCellAsString(columnIndex, rowIndex); 106 } 107 108 public IList<T> GetValues<T>(int columnIndex, bool considerSelection) { 109 return ActiveData.GetValues<T>(columnIndex, considerSelection); 110 } 111 112 public void SetValues<T>(int columnIndex, IList<T> values) { 113 if (IsFiltered) 114 throw new InvalidOperationException("SetValues not possible while data is filtered"); 115 116 originalData.SetValues<T>(columnIndex, values); 117 } 118 119 public void 
InsertRow(int rowIndex) { 120 if (IsFiltered) 121 throw new InvalidOperationException("InsertRow not possible while data is filtered"); 122 123 originalData.InsertRow(rowIndex); 124 } 125 126 public void DeleteRow(int rowIndex) { 127 if (IsFiltered) 128 throw new InvalidOperationException("DeleteRow not possible while data is filtered"); 129 130 originalData.DeleteRow(rowIndex); 131 } 132 133 public void InsertColumn<T>(string variableName, int columnIndex) { 134 if (IsFiltered) 135 throw new InvalidOperationException("InsertColumn not possible while data is filtered"); 136 137 originalData.InsertColumn<T>(variableName, columnIndex); 138 } 139 140 public void DeleteColumn(int columnIndex) { 141 if (IsFiltered) 142 throw new InvalidOperationException("DeleteColumn not possible while data is filtered"); 143 originalData.DeleteColumn(columnIndex); 144 } 145 146 public void RenameColumn(int columnIndex, string name) { 147 if (IsFiltered) 148 throw new InvalidOperationException("RenameColumn not possible while data is filtered"); 149 originalData.RenameColumn(columnIndex, name); 150 } 151 152 public void RenameColumns(IList<string> names) { 153 if (IsFiltered) 154 throw new InvalidOperationException("RenameColumns not possible while data is filtered"); 155 originalData.RenameColumns(names); 156 } 157 158 public string GetVariableName(int columnIndex) { 159 return ActiveData.GetVariableName(columnIndex); 160 } 161 162 public int GetColumnIndex(string variableName) { 163 return ActiveData.GetColumnIndex(variableName); 164 } 165 166 public bool VariableHasType<T>(int columnIndex) { 167 return originalData.VariableHasType<T>(columnIndex); 168 } 169 170 public Dataset ExportToDataset() { 171 return originalData.ExportToDataset(); 172 } 173 174 public void SetFilter(bool[] rowFilters) { 175 filteredData = (ITransactionalPreprocessingData)originalData.Clone(); 255 public void Undo() { 256 if (IsFiltered) 257 throw new InvalidOperationException("Undo not possible while data is 
filtered"); 258 259 originalData.Undo(); 260 } 261 262 public void InTransaction(Action action, DataPreprocessingChangedEventType type = DataPreprocessingChangedEventType.Any) { 263 if (IsFiltered) 264 throw new InvalidOperationException("Transaction not possible while data is filtered"); 265 originalData.InTransaction(action, type); 266 } 267 268 public void BeginTransaction(DataPreprocessingChangedEventType type) { 269 if (IsFiltered) 270 throw new InvalidOperationException("Transaction not possible while data is filtered"); 271 originalData.BeginTransaction(type); 272 } 273 274 public void EndTransaction() { 275 originalData.EndTransaction(); 276 } 277 #endregion 278 279 #region Statistics 280 public T GetMin<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 281 return ActiveData.GetMin<T>(columnIndex, considerSelection, emptyValue); 282 } 283 public T GetMax<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 284 return ActiveData.GetMax<T>(columnIndex, considerSelection, emptyValue); 285 } 286 public T GetMean<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 287 return ActiveData.GetMean<T>(columnIndex, considerSelection, emptyValue); 288 } 289 public T GetMedian<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> { 290 return ActiveData.GetMean<T>(columnIndex, considerSelection, emptyValue); 291 } 292 public T GetMode<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IEquatable<T> { 293 return ActiveData.GetMode<T>(columnIndex, considerSelection, emptyValue); 294 } 295 public T GetStandardDeviation<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 296 return ActiveData.GetStandardDeviation<T>(columnIndex, considerSelection, emptyValue); 297 } 298 public T GetVariance<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 
299 return ActiveData.GetVariance<T>(columnIndex, considerSelection, emptyValue); 300 } 301 public T GetQuantile<T>(double alpha, int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> { 302 return ActiveData.GetQuantile<T>(alpha, columnIndex, considerSelection, emptyValue); 303 } 304 public int GetDistinctValues<T>(int columnIndex, bool considerSelection = false) { 305 return ActiveData.GetDistinctValues<T>(columnIndex, considerSelection); 306 } 307 308 public int GetMissingValueCount() { 309 return ActiveData.GetMissingValueCount(); 310 } 311 public int GetMissingValueCount(int columnIndex) { 312 return ActiveData.GetMissingValueCount(columnIndex); 313 } 314 public int GetRowMissingValueCount(int rowIndex) { 315 return ActiveData.GetRowMissingValueCount(rowIndex); 316 } 317 #endregion 318 319 #region Filters 320 public void SetFilter(bool[] remainingRows) { 321 filteredData = (IPreprocessingData)originalData.Clone(); 176 322 filteredData.InTransaction(() => { 177 for (int row = (rowFilters.Length - 1); row >= 0; --row) { 178 if (rowFilters[row]) { 179 filteredData.DeleteRow(row); 323 var remainingIndices = Enumerable.Range(0, remainingRows.Length).Where(x => remainingRows[x]); 324 325 foreach (var v in filteredData.VariableNames) { 326 var ci = filteredData.GetColumnIndex(v); 327 if (filteredData.VariableHasType<double>(ci)) { 328 var values = filteredData.GetValues<double>(ci); 329 var filteredValues = remainingIndices.Select(x => values[x]).ToList(); 330 filteredData.SetValues(ci, filteredValues); 331 } else if (filteredData.VariableHasType<DateTime>(ci)) { 332 var values = filteredData.GetValues<DateTime>(ci); 333 var filteredValues = remainingIndices.Select(x => values[x]).ToList(); 334 filteredData.SetValues(ci, filteredValues); 335 } else if (filteredData.VariableHasType<string>(ci)) { 336 var values = filteredData.GetValues<string>(ci); 337 var filteredValues = remainingIndices.Select(x => values[x]).ToList(); 
338 filteredData.SetValues(ci, filteredValues); 180 339 } 181 340 } … … 206 365 } 207 366 367 public bool IsFiltered { 368 get { return filteredData != null; } 369 } 370 371 public event EventHandler FilterChanged; 372 208 373 private void OnFilterChanged() { 209 374 if (FilterChanged != null) { … … 211 376 } 212 377 } 213 214 public event DataPreprocessingChangedEventHandler Changed {215 add { originalData.Changed += value; }216 remove { originalData.Changed -= value; }217 }218 219 public bool SetValue(string value, int columnIndex, int rowIndex) {220 if (IsFiltered)221 throw new InvalidOperationException("SetValue not possible while data is filtered");222 return originalData.SetValue(value, columnIndex, rowIndex);223 }224 225 public bool AreAllStringColumns(IEnumerable<int> columnIndices) {226 return originalData.AreAllStringColumns(columnIndices);227 }228 229 public void DeleteRowsWithIndices(IEnumerable<int> rows) {230 if (IsFiltered)231 throw new InvalidOperationException("DeleteRowsWithIndices not possible while data is filtered");232 233 originalData.DeleteRowsWithIndices(rows);234 }235 236 public void Undo() {237 if (IsFiltered)238 throw new InvalidOperationException("Undo not possible while data is filtered");239 240 originalData.Undo();241 }242 243 public void InTransaction(Action action, DataPreprocessingChangedEventType type = DataPreprocessingChangedEventType.Any) {244 if (IsFiltered)245 throw new InvalidOperationException("Transaction not possible while data is filtered");246 originalData.InTransaction(action, type);247 }248 249 public void BeginTransaction(DataPreprocessingChangedEventType type) {250 if (IsFiltered)251 throw new InvalidOperationException("Transaction not possible while data is filtered");252 originalData.BeginTransaction(type);253 }254 255 public void EndTransaction() {256 originalData.EndTransaction();257 }258 259 public IEnumerable<string> GetDoubleVariableNames() {260 return originalData.GetDoubleVariableNames();261 }262 263 
public void ClearSelection() {264 originalData.ClearSelection();265 }266 267 public event EventHandler SelectionChanged {268 add { originalData.SelectionChanged += value; }269 remove { originalData.SelectionChanged -= value; }270 }271 272 #region IPreprocessingData Members273 public bool Validate(string value, out string errorMessage, int columnIndex) {274 return originalData.Validate(value, out errorMessage, columnIndex);275 }276 277 public event EventHandler FilterChanged;278 378 #endregion 279 379 } -
trunk/sources/HeuristicLab.DataPreprocessing/3.4/Data/IFilteredPreprocessingData.cs
r14185 r15518 23 23 24 24 namespace HeuristicLab.DataPreprocessing { 25 public interface IFilteredPreprocessingData : ITransactionalPreprocessingData { 26 void SetFilter(bool[] rowFilters); 25 public interface IFilteredPreprocessingData : IPreprocessingData { 26 #region Filters 27 void SetFilter(bool[] remainingRows); 27 28 void PersistFilter(); 28 29 void ResetFilter(); … … 30 31 31 32 event EventHandler FilterChanged; 33 #endregion 32 34 } 33 35 } -
trunk/sources/HeuristicLab.DataPreprocessing/3.4/Data/IPreprocessingData.cs
r15110 r15518 28 28 namespace HeuristicLab.DataPreprocessing { 29 29 public interface IPreprocessingData : INamedItem { 30 #region Cells 31 bool IsCellEmpty(int columnIndex, int rowIndex); 30 32 T GetCell<T>(int columnIndex, int rowIndex); 31 33 … … 39 41 bool SetValue(string value, int columnIndex, int rowIndex); 40 42 43 int Columns { get; } 44 int Rows { get; } 45 #endregion 46 47 #region Rows 41 48 void InsertRow(int rowIndex); 42 49 void DeleteRow(int rowIndex); … … 50 57 51 58 bool AreAllStringColumns(IEnumerable<int> columnIndices); 52 bool Validate(string value, out string errorMessage, int columnIndex);59 #endregion 53 60 54 IntRange TrainingPartition { get; } 55 IntRange TestPartition { get; } 56 57 IList<ITransformation> Transformations { get; } 58 61 #region Variables 59 62 IEnumerable<string> VariableNames { get; } 60 63 IEnumerable<string> GetDoubleVariableNames(); … … 63 66 64 67 bool VariableHasType<T>(int columnIndex); 68 Type GetVariableType(int columnIndex); 65 69 66 70 IList<string> InputVariables { get; } 67 71 string TargetVariable { get; } // optional 72 #endregion 68 73 69 int Columns { get; } 70 int Rows { get; } 74 #region Partitions 75 IntRange TrainingPartition { get; } 76 IntRange TestPartition { get; } 77 #endregion 71 78 79 #region Transformations 80 IList<ITransformation> Transformations { get; } 81 #endregion 82 83 #region Validation 84 bool Validate(string value, out string errorMessage, int columnIndex); 85 #endregion 86 87 #region Import & Export 88 void Import(IDataAnalysisProblemData problemData); 72 89 Dataset ExportToDataset(); 90 #endregion 73 91 92 #region Selection 74 93 IDictionary<int, IList<int>> Selection { get; set; } 75 94 void ClearSelection(); 76 95 77 96 event EventHandler SelectionChanged; 97 #endregion 98 99 #region Transactions 100 event DataPreprocessingChangedEventHandler Changed; 101 102 bool IsUndoAvailable { get; } 103 void Undo(); 104 void InTransaction(Action action, DataPreprocessingChangedEventType 
type = DataPreprocessingChangedEventType.Any); 105 void BeginTransaction(DataPreprocessingChangedEventType type); 106 void EndTransaction(); 107 #endregion 108 109 #region Statistics 110 T GetMin<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)); 111 T GetMax<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)); 112 T GetMean<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)); 113 T GetMedian<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T>; 114 T GetMode<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IEquatable<T>; 115 T GetStandardDeviation<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)); 116 T GetVariance<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)); 117 T GetQuantile<T>(double alpha, int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T>; 118 int GetDistinctValues<T>(int columnIndex, bool considerSelection = false); 119 120 int GetMissingValueCount(); 121 int GetMissingValueCount(int columnIndex); 122 int GetRowMissingValueCount(int rowIndex); 123 #endregion 78 124 } 79 125 } -
trunk/sources/HeuristicLab.DataPreprocessing/3.4/Data/PreprocessingData.cs
r15110 r15518 23 23 using System.Collections; 24 24 using System.Collections.Generic; 25 using System.Globalization; 25 26 using System.Linq; 26 27 using HeuristicLab.Common; 27 28 using HeuristicLab.Core; 28 29 using HeuristicLab.Data; 30 using HeuristicLab.Persistence.Default.CompositeSerializers.Storable; 29 31 using HeuristicLab.Problems.DataAnalysis; 30 32 … … 32 34 33 35 [Item("PreprocessingData", "Represents data used for preprocessing.")] 34 public abstract class PreprocessingData : NamedItem, IPreprocessingData { 35 public IntRange TrainingPartition { get; set; } 36 public IntRange TestPartition { get; set; } 37 38 public IList<ITransformation> Transformations { get; protected set; } 39 36 [StorableClass] 37 public class PreprocessingData : NamedItem, IPreprocessingData { 38 39 [Storable] 40 40 protected IList<IList> variableValues; 41 [Storable] 41 42 protected IList<string> variableNames; 42 43 43 public IEnumerable<string> VariableNames { 44 get { return variableNames; } 45 } 46 47 public IEnumerable<string> GetDoubleVariableNames() { 48 var doubleVariableNames = new List<string>(); 49 for (int i = 0; i < Columns; ++i) { 50 if (VariableHasType<double>(i)) { 51 doubleVariableNames.Add(variableNames[i]); 52 } 53 } 54 return doubleVariableNames; 55 } 56 57 public IList<string> InputVariables { get; private set; } 58 public string TargetVariable { get; private set; } // optional 59 60 public int Columns { 61 get { return variableNames.Count; } 62 } 63 64 public int Rows { 65 get { return variableValues.Count > 0 ? 
variableValues[0].Count : 0; } 66 } 67 68 protected IDictionary<int, IList<int>> selection; 69 public IDictionary<int, IList<int>> Selection { 70 get { return selection; } 71 set { 72 selection = value; 73 OnSelectionChanged(); 74 } 44 #region Constructor, Cloning & Persistence 45 public PreprocessingData(IDataAnalysisProblemData problemData) 46 : base() { 47 Name = "Preprocessing Data"; 48 49 Transformations = new List<ITransformation>(); 50 selection = new Dictionary<int, IList<int>>(); 51 52 Import(problemData); 53 54 RegisterEventHandler(); 75 55 } 76 56 … … 88 68 RegisterEventHandler(); 89 69 } 90 91 protected PreprocessingData(IDataAnalysisProblemData problemData) 92 : base() { 93 Name = "Preprocessing Data"; 94 95 Transformations = new List<ITransformation>(); 96 selection = new Dictionary<int, IList<int>>(); 97 98 Import(problemData); 99 70 public override IDeepCloneable Clone(Cloner cloner) { 71 return new PreprocessingData(this, cloner); 72 } 73 74 [StorableConstructor] 75 protected PreprocessingData(bool deserializing) 76 : base(deserializing) { } 77 [StorableHook(HookType.AfterDeserialization)] 78 private void AfterDeserialization() { 100 79 RegisterEventHandler(); 101 80 } 102 81 82 private void RegisterEventHandler() { 83 Changed += (s, e) => { 84 switch (e.Type) { 85 case DataPreprocessingChangedEventType.DeleteRow: 86 case DataPreprocessingChangedEventType.Any: 87 case DataPreprocessingChangedEventType.Transformation: 88 int maxRowIndex = Math.Max(0, Rows); 89 TrainingPartition.Start = Math.Min(TrainingPartition.Start, maxRowIndex); 90 TrainingPartition.End = Math.Min(TrainingPartition.End, maxRowIndex); 91 TestPartition.Start = Math.Min(TestPartition.Start, maxRowIndex); 92 TestPartition.End = Math.Min(TestPartition.End, maxRowIndex); 93 break; 94 } 95 }; 96 } 97 #endregion 98 99 #region Cells 100 public bool IsCellEmpty(int columnIndex, int rowIndex) { 101 var value = variableValues[columnIndex][rowIndex]; 102 return IsMissingValue(value); 103 } 
104 105 public T GetCell<T>(int columnIndex, int rowIndex) { 106 return (T)variableValues[columnIndex][rowIndex]; 107 } 108 109 public void SetCell<T>(int columnIndex, int rowIndex, T value) { 110 SaveSnapshot(DataPreprocessingChangedEventType.ChangeItem, columnIndex, rowIndex); 111 112 for (int i = Rows; i <= rowIndex; i++) 113 InsertRow(i); 114 for (int i = Columns; i <= columnIndex; i++) 115 InsertColumn<T>(i.ToString(), i); 116 117 variableValues[columnIndex][rowIndex] = value; 118 if (!IsInTransaction) 119 OnChanged(DataPreprocessingChangedEventType.ChangeItem, columnIndex, rowIndex); 120 } 121 122 public string GetCellAsString(int columnIndex, int rowIndex) { 123 return variableValues[columnIndex][rowIndex].ToString(); 124 } 125 126 public IList<T> GetValues<T>(int columnIndex, bool considerSelection) { 127 if (considerSelection) { 128 var list = new List<T>(); 129 foreach (var rowIdx in selection[columnIndex]) { 130 list.Add((T)variableValues[columnIndex][rowIdx]); 131 } 132 return list; 133 } else { 134 return (IList<T>)variableValues[columnIndex]; 135 } 136 } 137 138 public void SetValues<T>(int columnIndex, IList<T> values) { 139 SaveSnapshot(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1); 140 if (VariableHasType<T>(columnIndex)) { 141 variableValues[columnIndex] = (IList)values; 142 } else { 143 throw new ArgumentException("The datatype of column " + columnIndex + " must be of type " + variableValues[columnIndex].GetType().Name + " but was " + typeof(T).Name); 144 } 145 if (!IsInTransaction) 146 OnChanged(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1); 147 } 148 149 public bool SetValue(string value, int columnIndex, int rowIndex) { 150 bool valid = false; 151 if (VariableHasType<double>(columnIndex)) { 152 double val; 153 if (string.IsNullOrWhiteSpace(value)) { 154 val = double.NaN; 155 valid = true; 156 } else { 157 valid = double.TryParse(value, out val); 158 } 159 if (valid) 160 SetCell(columnIndex, rowIndex, val); 
161 } else if (VariableHasType<string>(columnIndex)) { 162 valid = value != null; 163 if (valid) 164 SetCell(columnIndex, rowIndex, value); 165 } else if (VariableHasType<DateTime>(columnIndex)) { 166 DateTime date; 167 valid = DateTime.TryParse(value, out date); 168 if (valid) 169 SetCell(columnIndex, rowIndex, date); 170 } else { 171 throw new ArgumentException("column " + columnIndex + " contains a non supported type."); 172 } 173 174 if (!IsInTransaction) 175 OnChanged(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1); 176 177 return valid; 178 } 179 180 public int Columns { 181 get { return variableNames.Count; } 182 } 183 184 public int Rows { 185 get { return variableValues.Count > 0 ? variableValues[0].Count : 0; } 186 } 187 188 public static bool IsMissingValue(object value) { 189 if (value is double) return double.IsNaN((double)value); 190 if (value is string) return string.IsNullOrEmpty((string)value); 191 if (value is DateTime) return ((DateTime)value).Equals(DateTime.MinValue); 192 throw new ArgumentException(); 193 } 194 #endregion 195 196 #region Rows 197 public void InsertRow(int rowIndex) { 198 SaveSnapshot(DataPreprocessingChangedEventType.DeleteRow, -1, rowIndex); 199 foreach (IList column in variableValues) { 200 Type type = column.GetType().GetGenericArguments()[0]; 201 column.Insert(rowIndex, type.IsValueType ? 
Activator.CreateInstance(type) : null); 202 } 203 if (TrainingPartition.Start <= rowIndex && rowIndex <= TrainingPartition.End) { 204 TrainingPartition.End++; 205 if (TrainingPartition.End <= TestPartition.Start) { 206 TestPartition.Start++; 207 TestPartition.End++; 208 } 209 } else if (TestPartition.Start <= rowIndex && rowIndex <= TestPartition.End) { 210 TestPartition.End++; 211 if (TestPartition.End <= TrainingPartition.Start) { 212 TestPartition.Start++; 213 TestPartition.End++; 214 } 215 } 216 if (!IsInTransaction) 217 OnChanged(DataPreprocessingChangedEventType.AddRow, -1, rowIndex); 218 } 219 public void DeleteRow(int rowIndex) { 220 SaveSnapshot(DataPreprocessingChangedEventType.AddRow, -1, rowIndex); 221 foreach (IList column in variableValues) { 222 column.RemoveAt(rowIndex); 223 } 224 if (TrainingPartition.Start <= rowIndex && rowIndex <= TrainingPartition.End) { 225 TrainingPartition.End--; 226 if (TrainingPartition.End <= TestPartition.Start) { 227 TestPartition.Start--; 228 TestPartition.End--; 229 } 230 } else if (TestPartition.Start <= rowIndex && rowIndex <= TestPartition.End) { 231 TestPartition.End--; 232 if (TestPartition.End <= TrainingPartition.Start) { 233 TestPartition.Start--; 234 TestPartition.End--; 235 } 236 } 237 if (!IsInTransaction) 238 OnChanged(DataPreprocessingChangedEventType.DeleteRow, -1, rowIndex); 239 } 240 public void DeleteRowsWithIndices(IEnumerable<int> rows) { 241 SaveSnapshot(DataPreprocessingChangedEventType.AddRow, -1, -1); 242 foreach (int rowIndex in rows.OrderByDescending(x => x)) { 243 foreach (IList column in variableValues) { 244 column.RemoveAt(rowIndex); 245 } 246 if (TrainingPartition.Start <= rowIndex && rowIndex <= TrainingPartition.End) { 247 TrainingPartition.End--; 248 if (TrainingPartition.End <= TestPartition.Start) { 249 TestPartition.Start--; 250 TestPartition.End--; 251 } 252 } else if (TestPartition.Start <= rowIndex && rowIndex <= TestPartition.End) { 253 TestPartition.End--; 254 if 
(TestPartition.End <= TrainingPartition.Start) { 255 TestPartition.Start--; 256 TestPartition.End--; 257 } 258 } 259 } 260 if (!IsInTransaction) 261 OnChanged(DataPreprocessingChangedEventType.DeleteRow, -1, -1); 262 } 263 264 public void InsertColumn<T>(string variableName, int columnIndex) { 265 SaveSnapshot(DataPreprocessingChangedEventType.DeleteColumn, columnIndex, -1); 266 variableValues.Insert(columnIndex, new List<T>(Enumerable.Repeat(default(T), Rows))); 267 variableNames.Insert(columnIndex, variableName); 268 if (!IsInTransaction) 269 OnChanged(DataPreprocessingChangedEventType.AddColumn, columnIndex, -1); 270 } 271 272 public void DeleteColumn(int columnIndex) { 273 SaveSnapshot(DataPreprocessingChangedEventType.AddColumn, columnIndex, -1); 274 variableValues.RemoveAt(columnIndex); 275 variableNames.RemoveAt(columnIndex); 276 if (!IsInTransaction) 277 OnChanged(DataPreprocessingChangedEventType.DeleteColumn, columnIndex, -1); 278 } 279 280 public void RenameColumn(int columnIndex, string name) { 281 SaveSnapshot(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1); 282 if (columnIndex < 0 || columnIndex > variableNames.Count) 283 throw new ArgumentOutOfRangeException("columnIndex"); 284 variableNames[columnIndex] = name; 285 286 if (!IsInTransaction) 287 OnChanged(DataPreprocessingChangedEventType.ChangeColumn, -1, -1); 288 } 289 290 public void RenameColumns(IList<string> names) { 291 if (names == null) throw new ArgumentNullException("names"); 292 if (names.Count != variableNames.Count) throw new ArgumentException("number of names must match the number of columns.", "names"); 293 294 SaveSnapshot(DataPreprocessingChangedEventType.ChangeColumn, -1, -1); 295 for (int i = 0; i < names.Count; i++) 296 variableNames[i] = names[i]; 297 298 if (!IsInTransaction) 299 OnChanged(DataPreprocessingChangedEventType.ChangeColumn, -1, -1); 300 } 301 302 public bool AreAllStringColumns(IEnumerable<int> columnIndices) { 303 return columnIndices.All(x => 
VariableHasType<string>(x)); 304 } 305 #endregion 306 307 #region Variables 308 public IEnumerable<string> VariableNames { 309 get { return variableNames; } 310 } 311 312 public IEnumerable<string> GetDoubleVariableNames() { 313 var doubleVariableNames = new List<string>(); 314 for (int i = 0; i < Columns; ++i) { 315 if (VariableHasType<double>(i)) { 316 doubleVariableNames.Add(variableNames[i]); 317 } 318 } 319 return doubleVariableNames; 320 } 321 322 public string GetVariableName(int columnIndex) { 323 return variableNames[columnIndex]; 324 } 325 326 public int GetColumnIndex(string variableName) { 327 return variableNames.IndexOf(variableName); 328 } 329 330 public bool VariableHasType<T>(int columnIndex) { 331 return columnIndex >= variableValues.Count || variableValues[columnIndex] is List<T>; 332 } 333 334 public Type GetVariableType(int columnIndex) { 335 var listType = variableValues[columnIndex].GetType(); 336 return listType.GenericTypeArguments.Single(); 337 } 338 339 public IList<string> InputVariables { get; private set; } 340 public string TargetVariable { get; private set; } // optional 341 #endregion 342 343 #region Partitions 344 [Storable] 345 public IntRange TrainingPartition { get; set; } 346 [Storable] 347 public IntRange TestPartition { get; set; } 348 #endregion 349 350 #region Transformations 351 [Storable] 352 public IList<ITransformation> Transformations { get; protected set; } 353 #endregion 354 355 #region Validation 356 public bool Validate(string value, out string errorMessage, int columnIndex) { 357 if (columnIndex < 0 || columnIndex > VariableNames.Count()) { 358 throw new ArgumentOutOfRangeException("column index is out of range"); 359 } 360 361 bool valid = false; 362 errorMessage = string.Empty; 363 if (VariableHasType<double>(columnIndex)) { 364 if (string.IsNullOrWhiteSpace(value)) { 365 valid = true; 366 } else { 367 double val; 368 valid = double.TryParse(value, out val); 369 if (!valid) { 370 errorMessage = "Invalid Value 
(Valid Value Format: \"" + FormatPatterns.GetDoubleFormatPattern() + "\")"; 371 } 372 } 373 } else if (VariableHasType<string>(columnIndex)) { 374 valid = value != null; 375 if (!valid) { 376 errorMessage = "Invalid Value (string must not be null)"; 377 } 378 } else if (VariableHasType<DateTime>(columnIndex)) { 379 DateTime date; 380 valid = DateTime.TryParse(value, out date); 381 if (!valid) { 382 errorMessage = "Invalid Value (Valid Value Format: \"" + CultureInfo.CurrentCulture.DateTimeFormat + "\""; 383 } 384 } else { 385 throw new ArgumentException("column " + columnIndex + " contains a non supported type."); 386 } 387 388 return valid; 389 } 390 #endregion 391 392 #region Import & Export 103 393 public void Import(IDataAnalysisProblemData problemData) { 104 394 Dataset dataset = (Dataset)problemData.Dataset; … … 107 397 TargetVariable = (problemData is IRegressionProblemData) ? ((IRegressionProblemData)problemData).TargetVariable 108 398 : (problemData is IClassificationProblemData) ? 
((IClassificationProblemData)problemData).TargetVariable 109 : null;399 : null; 110 400 111 401 int columnIndex = 0; … … 128 418 } 129 419 130 private void RegisterEventHandler() { 131 Changed += (s, e) => { 132 switch (e.Type) { 133 case DataPreprocessingChangedEventType.DeleteRow: 134 CheckPartitionRanges(); 135 break; 136 case DataPreprocessingChangedEventType.Any: 137 CheckPartitionRanges(); 138 break; 139 case DataPreprocessingChangedEventType.Transformation: 140 CheckPartitionRanges(); 141 break; 142 } 143 }; 144 } 145 146 private void CheckPartitionRanges() { 147 int maxRowIndex = Math.Max(0, Rows); 148 TrainingPartition.Start = Math.Min(TrainingPartition.Start, maxRowIndex); 149 TrainingPartition.End = Math.Min(TrainingPartition.End, maxRowIndex); 150 TestPartition.Start = Math.Min(TestPartition.Start, maxRowIndex); 151 TestPartition.End = Math.Min(TestPartition.End, maxRowIndex); 152 } 153 154 protected IList<IList> CopyVariableValues(IList<IList> original) { 155 var copy = new List<IList>(original); 156 for (int i = 0; i < original.Count; ++i) { 157 copy[i] = (IList)Activator.CreateInstance(original[i].GetType(), original[i]); 158 } 159 return copy; 160 } 161 162 163 #region IPreprocessingData Members 164 public abstract T GetCell<T>(int columnIndex, int rowIndex); 165 166 public abstract void SetCell<T>(int columnIndex, int rowIndex, T value); 167 168 public abstract string GetCellAsString(int columnIndex, int rowIndex); 169 170 public abstract string GetVariableName(int columnIndex); 171 172 public abstract int GetColumnIndex(string variableName); 173 174 public abstract bool VariableHasType<T>(int columnIndex); 175 176 [Obsolete("use the index based variant, is faster")] 177 public abstract IList<T> GetValues<T>(string variableName, bool considerSelection); 178 179 public abstract IList<T> GetValues<T>(int columnIndex, bool considerSelection); 180 181 public abstract void SetValues<T>(int columnIndex, IList<T> values); 182 183 public abstract bool 
SetValue(string value, int columnIndex, int rowIndex); 184 185 public abstract bool Validate(string value, out string errorMessage, int columnIndex); 186 187 public abstract bool AreAllStringColumns(IEnumerable<int> columnIndices); 188 189 public abstract void DeleteRowsWithIndices(IEnumerable<int> rows); 190 191 public abstract void InsertRow(int rowIndex); 192 193 public abstract void DeleteRow(int rowIndex); 194 195 public abstract void InsertColumn<T>(string variableName, int columnIndex); 196 197 public abstract void DeleteColumn(int columnIndex); 198 199 public abstract void RenameColumn(int columnIndex, string name); 200 public abstract void RenameColumns(IList<string> list); 201 202 public abstract Dataset ExportToDataset(); 203 204 public abstract void ClearSelection(); 205 206 public abstract event EventHandler SelectionChanged; 207 protected abstract void OnSelectionChanged(); 420 public Dataset ExportToDataset() { 421 IList<IList> values = new List<IList>(); 422 423 for (int i = 0; i < Columns; ++i) { 424 values.Add(variableValues[i]); 425 } 426 427 var dataset = new Dataset(variableNames, values); 428 return dataset; 429 } 430 #endregion 431 432 #region Selection 433 [Storable] 434 protected IDictionary<int, IList<int>> selection; 435 public IDictionary<int, IList<int>> Selection { 436 get { return selection; } 437 set { 438 selection = value; 439 OnSelectionChanged(); 440 } 441 } 442 public void ClearSelection() { 443 Selection = new Dictionary<int, IList<int>>(); 444 } 445 446 public event EventHandler SelectionChanged; 447 protected void OnSelectionChanged() { 448 var listeners = SelectionChanged; 449 if (listeners != null) listeners(this, EventArgs.Empty); 450 } 451 #endregion 452 453 #region Transactions 454 // Snapshot/History are not storable/cloneable on purpose 455 private class Snapshot { 456 public IList<IList> VariableValues { get; set; } 457 public IList<string> VariableNames { get; set; } 458 459 public IntRange TrainingPartition { get; 
set; } 460 public IntRange TestPartition { get; set; } 461 public IList<ITransformation> Transformations { get; set; } 462 public DataPreprocessingChangedEventType ChangedType { get; set; } 463 464 public int ChangedColumn { get; set; } 465 public int ChangedRow { get; set; } 466 } 208 467 209 468 public event DataPreprocessingChangedEventHandler Changed; … … 212 471 if (listeners != null) listeners(this, new DataPreprocessingChangedEventArgs(type, column, row)); 213 472 } 214 #endregion 473 474 private const int MAX_UNDO_DEPTH = 5; 475 476 private readonly IList<Snapshot> undoHistory = new List<Snapshot>(); 477 private readonly Stack<DataPreprocessingChangedEventType> eventStack = new Stack<DataPreprocessingChangedEventType>(); 478 479 public bool IsInTransaction { get { return eventStack.Count > 0; } } 480 481 private void SaveSnapshot(DataPreprocessingChangedEventType changedType, int column, int row) { 482 if (IsInTransaction) return; 483 484 var currentSnapshot = new Snapshot { 485 VariableValues = CopyVariableValues(variableValues), 486 VariableNames = new List<string>(variableNames), 487 TrainingPartition = new IntRange(TrainingPartition.Start, TrainingPartition.End), 488 TestPartition = new IntRange(TestPartition.Start, TestPartition.End), 489 Transformations = new List<ITransformation>(Transformations), 490 ChangedType = changedType, 491 ChangedColumn = column, 492 ChangedRow = row 493 }; 494 495 if (undoHistory.Count >= MAX_UNDO_DEPTH) 496 undoHistory.RemoveAt(0); 497 498 undoHistory.Add(currentSnapshot); 499 } 500 501 public bool IsUndoAvailable { 502 get { return undoHistory.Count > 0; } 503 } 504 505 public void Undo() { 506 if (IsUndoAvailable) { 507 Snapshot previousSnapshot = undoHistory[undoHistory.Count - 1]; 508 variableValues = previousSnapshot.VariableValues; 509 variableNames = previousSnapshot.VariableNames; 510 TrainingPartition = previousSnapshot.TrainingPartition; 511 TestPartition = previousSnapshot.TestPartition; 512 Transformations = 
previousSnapshot.Transformations; 513 undoHistory.Remove(previousSnapshot); 514 OnChanged(previousSnapshot.ChangedType, 515 previousSnapshot.ChangedColumn, 516 previousSnapshot.ChangedRow); 517 } 518 } 519 520 public void InTransaction(Action action, DataPreprocessingChangedEventType type = DataPreprocessingChangedEventType.Any) { 521 BeginTransaction(type); 522 action(); 523 EndTransaction(); 524 } 525 526 public void BeginTransaction(DataPreprocessingChangedEventType type) { 527 SaveSnapshot(type, -1, -1); 528 eventStack.Push(type); 529 } 530 531 public void EndTransaction() { 532 if (eventStack.Count == 0) 533 throw new InvalidOperationException("There is no open transaction that can be ended."); 534 535 var @event = eventStack.Pop(); 536 OnChanged(@event, -1, -1); 537 } 538 #endregion 539 540 #region Statistics 541 public T GetMin<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 542 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 543 return values.Any() ? values.Min() : emptyValue; 544 } 545 546 public T GetMax<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 547 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 548 return values.Any() ? values.Max() : emptyValue; 549 } 550 551 public T GetMean<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 552 if (typeof(T) == typeof(double)) { 553 var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 554 return values.Any() ? Convert<T>(values.Average()) : emptyValue; 555 } 556 if (typeof(T) == typeof(string)) { 557 return Convert<T>(string.Empty); 558 } 559 if (typeof(T) == typeof(DateTime)) { 560 var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection); 561 return values.Any() ? 
Convert<T>(AggregateAsDouble(values, Enumerable.Average)) : emptyValue; 562 } 563 564 throw new InvalidOperationException(typeof(T) + " not supported"); 565 } 566 567 public T GetMedian<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> { 568 if (typeof(T) == typeof(double)) {// IEnumerable<double> is faster 569 var doubleValues = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 570 return doubleValues.Any() ? Convert<T>(doubleValues.Median()) : emptyValue; 571 } 572 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 573 return values.Any() ? values.Quantile(0.5) : emptyValue; 574 } 575 576 public T GetMode<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IEquatable<T> { 577 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 578 return values.Any() ? values.GroupBy(x => x).OrderByDescending(g => g.Count()).Select(g => g.Key).First() : emptyValue; 579 } 580 581 public T GetStandardDeviation<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 582 if (typeof(T) == typeof(double)) { 583 var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 584 return values.Any() ? Convert<T>(values.StandardDeviation()) : emptyValue; 585 } 586 // For DateTime, std.dev / variance would have to be TimeSpan 587 //if (typeof(T) == typeof(DateTime)) { 588 // var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection); 589 // return values.Any() ? Convert<T>(AggregateAsDouble(values, EnumerableStatisticExtensions.StandardDeviation)) : emptyValue; 590 //} 591 return default(T); 592 } 593 594 public T GetVariance<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 595 if (typeof(T) == typeof(double)) { 596 var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 597 return values.Any() ? 
Convert<T>(values.Variance()) : emptyValue; 598 } 599 // DateTime variance often overflows long, thus the corresponding DateTime is invalid 600 //if (typeof(T) == typeof(DateTime)) { 601 // var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection); 602 // return values.Any() ? Convert<T>(AggregateAsDouble(values, EnumerableStatisticExtensions.Variance)) : emptyValue; 603 //} 604 return default(T); 605 } 606 607 public T GetQuantile<T>(double alpha, int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> { 608 if (typeof(T) == typeof(double)) {// IEnumerable<double> is faster 609 var doubleValues = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 610 return doubleValues.Any() ? Convert<T>(doubleValues.Quantile(alpha)) : emptyValue; 611 } 612 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 613 return values.Any() ? values.Quantile(alpha) : emptyValue; 614 } 615 616 public int GetDistinctValues<T>(int columnIndex, bool considerSelection = false) { 617 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 618 return values.GroupBy(x => x).Count(); 619 } 620 621 private IEnumerable<T> GetValuesWithoutMissingValues<T>(int columnIndex, bool considerSelection) { 622 return GetValues<T>(columnIndex, considerSelection).Where(x => !IsMissingValue(x)); 623 } 624 625 private static DateTime AggregateAsDouble(IEnumerable<DateTime> values, Func<IEnumerable<double>, double> func) { 626 return new DateTime((long)(func(values.Select(x => (double)x.Ticks / TimeSpan.TicksPerSecond)) * TimeSpan.TicksPerSecond)); 627 } 628 private static T Convert<T>(object obj) { return (T)obj; } 629 630 public int GetMissingValueCount() { 631 int count = 0; 632 for (int i = 0; i < Columns; ++i) { 633 count += GetMissingValueCount(i); 634 } 635 return count; 636 } 637 public int GetMissingValueCount(int columnIndex) { 638 int sum = 0; 639 for (int i = 0; 
i < Rows; i++) { 640 if (IsCellEmpty(columnIndex, i)) 641 sum++; 642 } 643 return sum; 644 } 645 public int GetRowMissingValueCount(int rowIndex) { 646 int sum = 0; 647 for (int i = 0; i < Columns; i++) { 648 if (IsCellEmpty(i, rowIndex)) 649 sum++; 650 } 651 return sum; 652 } 653 #endregion 654 655 #region Helpers 656 private static IList<IList> CopyVariableValues(IList<IList> original) { 657 var copy = new List<IList>(original); 658 for (int i = 0; i < original.Count; ++i) { 659 copy[i] = (IList)Activator.CreateInstance(original[i].GetType(), original[i]); 660 } 661 return copy; 662 } 663 #endregion 664 } 665 666 // Adapted from HeuristicLab.Common.EnumerableStatisticExtensions 667 internal static class EnumerableExtensions { 668 public static T Quantile<T>(this IEnumerable<T> values, double alpha) where T : IComparable<T> { 669 T[] valuesArr = values.ToArray(); 670 int n = valuesArr.Length; 671 if (n == 0) throw new InvalidOperationException("Enumeration contains no elements."); 672 673 var pos = n * alpha; 674 675 return Select((int)Math.Ceiling(pos) - 1, valuesArr); 676 677 } 678 679 private static T Select<T>(int k, T[] arr) where T : IComparable<T> { 680 int i, ir, j, l, mid, n = arr.Length; 681 T a; 682 l = 0; 683 ir = n - 1; 684 for (;;) { 685 if (ir <= l + 1) { 686 // Active partition contains 1 or 2 elements. 687 if (ir == l + 1 && arr[ir].CompareTo(arr[l]) < 0) { 688 // Case of 2 elements. 689 Swap(arr, l, ir); 690 } 691 return arr[k]; 692 } else { 693 mid = (l + ir) >> 1; // Choose median of left, center, and right elements 694 Swap(arr, mid, l + 1); // as partitioning element a. Also 695 696 if (arr[l].CompareTo(arr[ir]) > 0) { // rearrange so that arr[l] arr[ir] <= arr[l+1], 697 Swap(arr, l, ir); // . arr[ir] >= arr[l+1] 698 } 699 700 if (arr[l + 1].CompareTo(arr[ir]) > 0) { 701 Swap(arr, l + 1, ir); 702 } 703 if (arr[l].CompareTo(arr[l + 1]) > 0) { 704 Swap(arr, l, l + 1); 705 } 706 i = l + 1; // Initialize pointers for partitioning. 
707 j = ir; 708 a = arr[l + 1]; // Partitioning element. 709 for (;;) { // Beginning of innermost loop. 710 do i++; while (arr[i].CompareTo(a) < 0); // Scan up to find element > a. 711 do j--; while (arr[j].CompareTo(a) > 0); // Scan down to find element < a. 712 if (j < i) break; // Pointers crossed. Partitioning complete. 713 Swap(arr, i, j); 714 } // End of innermost loop. 715 arr[l + 1] = arr[j]; // Insert partitioning element. 716 arr[j] = a; 717 if (j >= k) ir = j - 1; // Keep active the partition that contains the 718 if (j <= k) l = i; // kth element. 719 } 720 } 721 } 722 723 private static void Swap<T>(T[] arr, int i, int j) { 724 T temp = arr[i]; 725 arr[i] = arr[j]; 726 arr[j] = temp; 727 } 215 728 } 216 729 }
Note: See TracChangeset
for help on using the changeset viewer.