- Timestamp:
- 07/26/17 14:12:27 (8 years ago)
- Location:
- branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Content
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Content/DataGridContent.cs
// NOTE: the text below is a rendered changeset hunk for DataGridContent.cs. Old/new line numbers are
// fused into the code, and removed lines are shown next to added ones.
// Going by the single-numbered lines, this revision appears to drop the [Storable] ManipulationLogic
// property together with its constructor parameter and clone assignment, and to add the
// "Manipulations" region documented here.
//
// ReplaceIndicesByValue(cells, doubleAggregator, dateTimeAggregator, stringAggregator)
//   Runs inside PreprocessingData.InTransaction. For every entry of `cells`, column.Key is used as the
//   column index and column.Value as the list of row indices. The first branch whose aggregator is
//   non-null and whose type matches the column (double, then DateTime, then string) computes ONE value
//   via aggregator(column.Key) and writes it to every listed row with SetCell<T>.
//   Columns that match no branch are left untouched.
//
// ReplaceIndicesByValues(cells, doubleAggregator, dateTimeAggregator, stringAggregator)
//   Same dispatch, but the aggregator returns a sequence. Rows and values are paired with Zip, so
//   writing stops at the shorter of the two sequences.
//
// ReplaceIndicesByMean / ReplaceIndicesByMedianValue(cells, considerSelection = false)
//   Delegate to ReplaceIndicesByValue with GetMean / GetMedian for double and DateTime columns.
//   No string aggregator is passed, so string columns are skipped.
r15274 r15285 28 28 using HeuristicLab.Data; 29 29 using HeuristicLab.Persistence.Default.CompositeSerializers.Storable; 30 using HeuristicLab.Random; 30 31 31 32 namespace HeuristicLab.DataPreprocessing { … … 37 38 } 38 39 39 [Storable]40 public ManipulationLogic ManipulationLogic { get; private set; }41 42 40 public int Rows { 43 41 get { return PreprocessingData.Rows; } … … 75 73 76 74 #region Constructor, Cloning & Persistence 77 public DataGridContent(IFilteredPreprocessingData preprocessingData , ManipulationLogic manipulationLogic)75 public DataGridContent(IFilteredPreprocessingData preprocessingData) 78 76 : base(preprocessingData) { 79 ManipulationLogic = manipulationLogic;80 77 } 81 78 82 79 public DataGridContent(DataGridContent original, Cloner cloner) 83 80 : base(original, cloner) { 84 ManipulationLogic = cloner.Clone(original.ManipulationLogic);85 81 } 86 82 public override IDeepCloneable Clone(Cloner cloner) { … … 136 132 #pragma warning restore 0067 137 133 #endregion 134 135 #region Manipulations 136 private void ReplaceIndicesByValue(IDictionary<int, IList<int>> cells, Func<int, double> doubleAggregator = null, 137 Func<int, DateTime> dateTimeAggregator = null, Func<int, string> stringAggregator = null) { 138 PreprocessingData.InTransaction(() => { 139 foreach (var column in cells) { 140 if (doubleAggregator != null && PreprocessingData.VariableHasType<double>(column.Key)) { 141 var value = doubleAggregator(column.Key); 142 foreach (int index in column.Value) 143 PreprocessingData.SetCell<double>(column.Key, index, value); 144 } else if (dateTimeAggregator != null && PreprocessingData.VariableHasType<DateTime>(column.Key)) { 145 var value = dateTimeAggregator(column.Key); 146 foreach (int index in column.Value) 147 PreprocessingData.SetCell<DateTime>(column.Key, index, value); 148 } else if (stringAggregator != null && PreprocessingData.VariableHasType<string>(column.Key)) { 149 var value = stringAggregator(column.Key); 150 foreach (int index in 
column.Value) 151 PreprocessingData.SetCell<string>(column.Key, index, value); 152 } 153 } 154 }); 155 } 156 157 private void ReplaceIndicesByValues(IDictionary<int, IList<int>> cells, Func<int, IEnumerable<double>> doubleAggregator = null, 158 Func<int, IEnumerable<DateTime>> dateTimeAggregator = null, Func<int, IEnumerable<string>> stringAggregator = null) { 159 PreprocessingData.InTransaction(() => { 160 foreach (var column in cells) { 161 if (doubleAggregator != null && PreprocessingData.VariableHasType<double>(column.Key)) { 162 var values = doubleAggregator(column.Key); 163 foreach (var pair in column.Value.Zip(values, (row, value) => new { row, value })) 164 PreprocessingData.SetCell<double>(column.Key, pair.row, pair.value); 165 } else if (dateTimeAggregator != null && PreprocessingData.VariableHasType<DateTime>(column.Key)) { 166 var values = dateTimeAggregator(column.Key); 167 foreach (var pair in column.Value.Zip(values, (row, value) => new { row, value })) 168 PreprocessingData.SetCell<DateTime>(column.Key, pair.row, pair.value); 169 } else if (stringAggregator != null && PreprocessingData.VariableHasType<string>(column.Key)) { 170 var values = stringAggregator(column.Key); 171 foreach (var pair in column.Value.Zip(values, (row, value) => new { row, value })) 172 PreprocessingData.SetCell<string>(column.Key, pair.row, pair.value); 173 } 174 } 175 }); 176 } 177 178 public void ReplaceIndicesByMean(IDictionary<int, IList<int>> cells, bool considerSelection = false) { 179 ReplaceIndicesByValue(cells, 180 col => PreprocessingData.GetMean<double>(col, considerSelection), 181 col => PreprocessingData.GetMean<DateTime>(col, considerSelection)); 182 } 183 184 public void ReplaceIndicesByMedianValue(IDictionary<int, IList<int>> cells, bool considerSelection = false) { 185 ReplaceIndicesByValue(cells, 186 col => PreprocessingData.GetMedian<double>(col, considerSelection), 187 col => PreprocessingData.GetMedian<DateTime>(col, considerSelection)); 188 } 189 190 
// ReplaceIndicesByMode(cells, considerSelection = false)
//   Delegates to ReplaceIndicesByValue with GetMode for double, DateTime and string columns.
//
// ReplaceIndicesByRandomValue(cells, considerSelection = false)
//   Creates one FastRandom and delegates to ReplaceIndicesByValues. For double columns each listed cell
//   gets rand.NextDouble() * (max - min) + min, with min/max from GetMin/GetMax. For DateTime columns the
//   range is (max - min).TotalSeconds and each value is min + TimeSpan.FromSeconds(rand.NextDouble() * range).
//   No string aggregator is passed, so string columns are skipped.
//
// ReplaceIndicesByString(cells, value)
//   Inside one transaction, calls PreprocessingData.SetValue(value, column.Key, rowIdx) for every listed cell.
//
// ReplaceIndicesByLinearInterpolationOfNeighbours(cells)
//   Inside one transaction, asks GetStartAndEndingsForInterpolation for (start, end) pairs per column and
//   calls Interpolate for each pair.
//
// GetStartAndEndingsForInterpolation(column)
//   Sorts the row indices ascending and splits them into runs of consecutive indices (a gap > 1 ends a run).
//   For each run: start = IndexOfPrevPresentValue(first index of run), next = IndexOfNextPresentValue(last
//   index of run). The pair is kept only if start > 0 && next < PreprocessingData.Rows.
//   NOTE(review): IndexOfPrevPresentValue returns -1 when no present value is found, so `start > 0` also
//   rejects start == 0 (a present value in row 0). This looks like an off-by-one (>= 0 expected) - confirm.
//
// Interpolate(column, prevIndex, nextIndex)
//   Only column.Key is used; the row list in column.Value is not consulted here.
//   step = (next - prev) / (nextIndex - prevIndex); for DateTime the difference is taken in seconds.
//   The loop runs i = prevIndex .. nextIndex - 1 and writes prev + step * (i - prevIndex).
//   NOTE(review): the loop therefore rewrites the cell at prevIndex itself (offset 0) and every cell in
//   between, not just the selected ones - confirm this is intended.
//   Columns that are neither double nor DateTime are left unchanged.
//
// IndexOfPrevPresentValue(columnIndex, start)
//   Walks downwards from start - 1 while IsCellEmpty is true; returns the index found, or -1 if none.
public void ReplaceIndicesByMode(IDictionary<int, IList<int>> cells, bool considerSelection = false) { 191 ReplaceIndicesByValue(cells, 192 col => PreprocessingData.GetMode<double>(col, considerSelection), 193 col => PreprocessingData.GetMode<DateTime>(col, considerSelection), 194 col => PreprocessingData.GetMode<string>(col, considerSelection)); 195 } 196 197 public void ReplaceIndicesByRandomValue(IDictionary<int, IList<int>> cells, bool considerSelection = false) { 198 var rand = new FastRandom(); 199 ReplaceIndicesByValues(cells, 200 col => { 201 double min = PreprocessingData.GetMin<double>(col, considerSelection); 202 double max = PreprocessingData.GetMax<double>(col, considerSelection); 203 double range = max - min; 204 return cells[col].Select(_ => rand.NextDouble() * range + min); 205 }, 206 col => { 207 var min = PreprocessingData.GetMin<DateTime>(col, considerSelection); 208 var max = PreprocessingData.GetMax<DateTime>(col, considerSelection); 209 double range = (max - min).TotalSeconds; 210 return cells[col].Select(_ => min + TimeSpan.FromSeconds(rand.NextDouble() * range)); 211 }); 212 } 213 214 public void ReplaceIndicesByString(IDictionary<int, IList<int>> cells, string value) { 215 PreprocessingData.InTransaction(() => { 216 foreach (var column in cells) { 217 foreach (var rowIdx in column.Value) { 218 PreprocessingData.SetValue(value, column.Key, rowIdx); 219 } 220 } 221 }); 222 } 223 224 225 public void ReplaceIndicesByLinearInterpolationOfNeighbours(IDictionary<int, IList<int>> cells) { 226 PreprocessingData.InTransaction(() => { 227 foreach (var column in cells) { 228 IList<Tuple<int, int>> startEndings = GetStartAndEndingsForInterpolation(column); 229 foreach (var tuple in startEndings) { 230 Interpolate(column, tuple.Item1, tuple.Item2); 231 } 232 } 233 }); 234 } 235 236 private List<Tuple<int, int>> GetStartAndEndingsForInterpolation(KeyValuePair<int, IList<int>> column) { 237 var startEndings = new List<Tuple<int, int>>(); 238 var rowIndices 
= column.Value.OrderBy(x => x).ToList(); 239 var count = rowIndices.Count; 240 int start = int.MinValue; 241 for (int i = 0; i < count; ++i) { 242 if (start == int.MinValue) { 243 start = IndexOfPrevPresentValue(column.Key, rowIndices[i]); 244 } 245 if (i + 1 == count || (i + 1 < count && rowIndices[i + 1] - rowIndices[i] > 1)) { 246 int next = IndexOfNextPresentValue(column.Key, rowIndices[i]); 247 if (start > 0 && next < PreprocessingData.Rows) { 248 startEndings.Add(new Tuple<int, int>(start, next)); 249 } 250 start = int.MinValue; 251 } 252 } 253 return startEndings; 254 } 255 256 private void Interpolate(KeyValuePair<int, IList<int>> column, int prevIndex, int nextIndex) { 257 int valuesToInterpolate = nextIndex - prevIndex; 258 259 if (PreprocessingData.VariableHasType<double>(column.Key)) { 260 double prev = PreprocessingData.GetCell<double>(column.Key, prevIndex); 261 double next = PreprocessingData.GetCell<double>(column.Key, nextIndex); 262 double interpolationStep = (next - prev) / valuesToInterpolate; 263 264 for (int i = prevIndex; i < nextIndex; ++i) { 265 double interpolated = prev + (interpolationStep * (i - prevIndex)); 266 PreprocessingData.SetCell<double>(column.Key, i, interpolated); 267 } 268 } else if (PreprocessingData.VariableHasType<DateTime>(column.Key)) { 269 DateTime prev = PreprocessingData.GetCell<DateTime>(column.Key, prevIndex); 270 DateTime next = PreprocessingData.GetCell<DateTime>(column.Key, nextIndex); 271 double interpolationStep = (next - prev).TotalSeconds / valuesToInterpolate; 272 273 for (int i = prevIndex; i < nextIndex; ++i) { 274 DateTime interpolated = prev.AddSeconds(interpolationStep * (i - prevIndex)); 275 PreprocessingData.SetCell<DateTime>(column.Key, i, interpolated); 276 } 277 } 278 } 279 280 private int IndexOfPrevPresentValue(int columnIndex, int start) { 281 int offset = start - 1; 282 while (offset >= 0 && PreprocessingData.IsCellEmpty(columnIndex, offset)) { 283 offset--; 284 } 285 286 return offset; 287 } 
// IndexOfNextPresentValue(columnIndex, start)
//   Walks upwards from start + 1 while the index is below PreprocessingData.Rows and IsCellEmpty is true;
//   returns the index found, or PreprocessingData.Rows if no present value follows.
//
// Shuffle(shuffleRangesSeparately)
//   Creates one FastRandom.
//   true : inside one transaction, for TestPartition and then TrainingPartition, builds an identity index
//          array over all rows, overwrites positions [range.Start, range.End) with a shuffled copy of
//          Enumerable.Range(range.Start, range.Size), and calls ReOrderToIndices once per range.
//          NOTE(review): assumes range.End - range.Start == range.Size for IntRange - confirm.
//   false: inside one transaction, shuffles the indices of all rows in place and calls ReOrderToIndices once.
//
// ReOrderToIndices(int[] indices)
//   Inside one transaction, dispatches every column by type (double, string, DateTime) to the generic
//   overload; columns of any other type are skipped.
//
// ReOrderToIndices<T>(columnIndex, indices)
//   Copies the column's current values into a new list first, so reads are not affected by the writes.
//   Throws InvalidOperationException if indices.Length differs from the value count.
//   Row i then receives the value that was at row indices[i].
288 289 private int IndexOfNextPresentValue(int columnIndex, int start) { 290 int offset = start + 1; 291 while (offset < PreprocessingData.Rows && PreprocessingData.IsCellEmpty(columnIndex, offset)) { 292 offset++; 293 } 294 295 return offset; 296 } 297 298 public void Shuffle(bool shuffleRangesSeparately) { 299 var random = new FastRandom(); 300 301 if (shuffleRangesSeparately) { 302 var ranges = new[] { PreprocessingData.TestPartition, PreprocessingData.TrainingPartition }; 303 PreprocessingData.InTransaction(() => { 304 // process all given ranges - e.g. TrainingPartition, TestPartition 305 foreach (IntRange range in ranges) { 306 var indices = Enumerable.Range(0, PreprocessingData.Rows).ToArray(); 307 var shuffledIndices = Enumerable.Range(range.Start, range.Size).Shuffle(random).ToArray(); 308 for (int i = range.Start, j = 0; i < range.End; i++, j++) 309 indices[i] = shuffledIndices[j]; 310 311 ReOrderToIndices(indices); 312 } 313 }); 314 315 } else { 316 PreprocessingData.InTransaction(() => { 317 var indices = Enumerable.Range(0, PreprocessingData.Rows).ToArray(); 318 indices.ShuffleInPlace(random); 319 ReOrderToIndices(indices); 320 }); 321 } 322 } 323 324 public void ReOrderToIndices(int[] indices) { 325 PreprocessingData.InTransaction(() => { 326 for (int i = 0; i < PreprocessingData.Columns; ++i) { 327 if (PreprocessingData.VariableHasType<double>(i)) 328 ReOrderToIndices<double>(i, indices); 329 else if (PreprocessingData.VariableHasType<string>(i)) 330 ReOrderToIndices<string>(i, indices); 331 else if (PreprocessingData.VariableHasType<DateTime>(i)) 332 ReOrderToIndices<DateTime>(i, indices); 333 } 334 }); 335 } 336 337 private void ReOrderToIndices<T>(int columnIndex, int[] indices) { 338 var originalData = new List<T>(PreprocessingData.GetValues<T>(columnIndex)); 339 if (indices.Length != originalData.Count) throw new InvalidOperationException("The number of provided indices does not match the values."); 340 341 for (int i = 0; i < indices.Length; 
i++) { 342 T newValue = originalData[indices[i]]; 343 PreprocessingData.SetCell<T>(columnIndex, i, newValue); 344 } 345 } 346 #endregion 138 347 } 139 348 } -
branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Content/ManipulationContent.cs
// NOTE: the text below is a rendered changeset hunk for ManipulationContent.cs. Old/new line numbers are
// fused into the code, and removed lines are shown next to added ones.
// Going by the single-numbered lines, this revision appears to drop the [Storable] ManipulationLogic
// property together with its constructor parameter and clone assignment, and to add the methods below.
//
// RowsWithMissingValuesGreater(percent) / ColumnsWithMissingValuesGreater(percent)
//   Return the indices of rows / columns whose share of missing values, computed as
//   100f / total * missingCount, is strictly greater than `percent` (so `percent` is on a 0-100 scale).
//   NOTE(review): 100f is a float literal, so the share is computed in single precision before it is
//   compared with the double `percent`; results at exact thresholds may differ from double arithmetic
//   - confirm whether that matters.
//
// ColumnsWithVarianceSmaller(variance)
//   double columns  : kept when GetVariance<double>(i) < variance.
//   DateTime columns: GetVariance<DateTime>(i).Ticks is divided by TimeSpan.TicksPerSecond before the
//                     same comparison.
//   Columns of any other type are never returned.
//
// DeleteRowsWithMissingValuesGreater / DeleteColumnsWithMissingValuesGreater / DeleteColumnsWithVarianceSmaller
//   Compute the matching index list with the query method of the same name and pass it to
//   DeleteRows / DeleteColumns.
//
// DeleteRows(rows) / DeleteColumns(columns)
//   Inside one transaction, delete the given indices in descending order - presumably so that an earlier
//   deletion does not shift the indices still waiting to be deleted.
r15274 r15285 20 20 #endregion 21 21 22 using System; 23 using System.Collections.Generic; 22 24 using System.Drawing; 25 using System.Linq; 23 26 using HeuristicLab.Common; 24 27 using HeuristicLab.Core; … … 33 36 } 34 37 35 [Storable]36 public ManipulationLogic ManipulationLogic { get; private set; }37 38 38 #region Constructor, Cloning & Persistence 39 public ManipulationContent(IFilteredPreprocessingData preprocessingData , ManipulationLogic manipulationLogic)39 public ManipulationContent(IFilteredPreprocessingData preprocessingData) 40 40 : base(preprocessingData) { 41 ManipulationLogic = manipulationLogic;42 41 } 43 42 44 43 public ManipulationContent(ManipulationContent original, Cloner cloner) : 45 44 base(original, cloner) { 46 ManipulationLogic = cloner.Clone(original.ManipulationLogic);47 45 } 48 46 public override IDeepCloneable Clone(Cloner cloner) { … … 54 52 : base(deserializing) { } 55 53 #endregion 54 55 public List<int> RowsWithMissingValuesGreater(double percent) { 56 List<int> rows = new List<int>(); 57 58 for (int i = 0; i < PreprocessingData.Rows; ++i) { 59 int missingCount = PreprocessingData.GetRowMissingValueCount(i); 60 if (100f / PreprocessingData.Columns * missingCount > percent) { 61 rows.Add(i); 62 } 63 } 64 65 return rows; 66 } 67 68 public List<int> ColumnsWithMissingValuesGreater(double percent) { 69 List<int> columns = new List<int>(); 70 for (int i = 0; i < PreprocessingData.Columns; ++i) { 71 int missingCount = PreprocessingData.GetMissingValueCount(i); 72 if (100f / PreprocessingData.Rows * missingCount > percent) { 73 columns.Add(i); 74 } 75 } 76 77 return columns; 78 } 79 80 public List<int> ColumnsWithVarianceSmaller(double variance) { 81 List<int> columns = new List<int>(); 82 for (int i = 0; i < PreprocessingData.Columns; ++i) { 83 if (PreprocessingData.VariableHasType<double>(i)) { 84 double columnVariance = PreprocessingData.GetVariance<double>(i); 85 if (columnVariance < variance) { 86 columns.Add(i); 87 } 88 } else if 
(PreprocessingData.VariableHasType<DateTime>(i)) { 89 double columnVariance = (double)PreprocessingData.GetVariance<DateTime>(i).Ticks / TimeSpan.TicksPerSecond; 90 if (columnVariance < variance) { 91 columns.Add(i); 92 } 93 } 94 } 95 return columns; 96 } 97 98 public void DeleteRowsWithMissingValuesGreater(double percent) { 99 DeleteRows(RowsWithMissingValuesGreater(percent)); 100 } 101 102 public void DeleteColumnsWithMissingValuesGreater(double percent) { 103 DeleteColumns(ColumnsWithMissingValuesGreater(percent)); 104 } 105 106 public void DeleteColumnsWithVarianceSmaller(double variance) { 107 DeleteColumns(ColumnsWithVarianceSmaller(variance)); 108 } 109 110 private void DeleteRows(List<int> rows) { 111 PreprocessingData.InTransaction(() => { 112 foreach (int row in rows.OrderByDescending(x => x)) { 113 PreprocessingData.DeleteRow(row); 114 } 115 }); 116 } 117 118 private void DeleteColumns(List<int> columns) { 119 PreprocessingData.InTransaction(() => { 120 foreach (int column in columns.OrderByDescending(x => x)) { 121 PreprocessingData.DeleteColumn(column); 122 } 123 }); 124 } 56 125 } 57 126 }
Note: See TracChangeset
for help on using the changeset viewer.