Changeset 15973 for branches/2522_RefactorPluginInfrastructure/HeuristicLab.Problems.DataAnalysis/3.4/DatasetExtensions.cs
- Timestamp:
- 06/28/18 11:13:37 (6 years ago)
- Location:
- branches/2522_RefactorPluginInfrastructure
- Files:
-
- 3 edited
Legend:
- Unmodified
- Added
- Removed
-
branches/2522_RefactorPluginInfrastructure
- Property svn:ignore
-
old new 24 24 protoc.exe 25 25 obj 26 .vs
-
- Property svn:mergeinfo changed
- Property svn:ignore
-
branches/2522_RefactorPluginInfrastructure/HeuristicLab.Problems.DataAnalysis
- Property svn:mergeinfo changed
-
branches/2522_RefactorPluginInfrastructure/HeuristicLab.Problems.DataAnalysis/3.4/DatasetExtensions.cs
#region License Information
/* HeuristicLab
 * Copyright (C) 2002-2018 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 * ... (remaining license text is elided in the changeset view) ...
 */
#endregion

using System;
using System.Collections.Generic;
using System.Linq;

namespace HeuristicLab.Problems.DataAnalysis {
  /// <summary>
  /// Extension methods that copy values of an <see cref="IDataset"/> into dense
  /// two-dimensional <c>double</c> matrices (one matrix row per requested dataset row).
  /// </summary>
  public static class DatasetExtensions {
    /// <summary>
    /// Copies the values of the given double variables for the given rows into a matrix
    /// without applying any transformation.
    /// </summary>
    /// <param name="dataset">A dataset that contains the variable values</param>
    /// <param name="variables">Names of the variables; each one becomes a matrix column, in the given order</param>
    /// <param name="rows">An enumerable of row indices for the dataset; each one becomes a matrix row, in the given order</param>
    /// <returns>A matrix with one row per element of <paramref name="rows"/> and one column per element of <paramref name="variables"/>.</returns>
    public static double[,] ToArray(this IDataset dataset, IEnumerable<string> variables, IEnumerable<int> rows) {
      // Materialize once: the sequence is needed for the names and for sizing the transformation list.
      string[] variablesArr = variables.ToArray();
      return ToArray(dataset,
                     variablesArr,
                     transformations: new ITransformation<double>[variablesArr.Length], // all entries null => no transform
                     rows: rows);
    }

    /// <summary>
    /// Copies the values of the given double variables for the given rows into a matrix,
    /// optionally applying one transformation per variable.
    /// </summary>
    /// <param name="dataset">A dataset that contains the variable values</param>
    /// <param name="variables">Names of the variables; each one becomes a matrix column, in the given order</param>
    /// <param name="transformations">One transformation per variable (same order as <paramref name="variables"/>); a null entry leaves the values of that variable unchanged</param>
    /// <param name="rows">An enumerable of row indices for the dataset; each one becomes a matrix row, in the given order</param>
    /// <returns>A matrix with one row per element of <paramref name="rows"/> and one column per element of <paramref name="variables"/>.</returns>
    /// <exception cref="ArgumentException">The number of transformations differs from the number of variables.</exception>
    public static double[,] ToArray(this IDataset dataset, IEnumerable<string> variables,
      IEnumerable<ITransformation<double>> transformations, IEnumerable<int> rows) {
      string[] variablesArr = variables.ToArray();
      int[] rowsArr = rows.ToArray();
      ITransformation<double>[] transformArr = transformations.ToArray();
      if (transformArr.Length != variablesArr.Length)
        throw new ArgumentException("Number of variables and number of transformations must match.");

      double[,] matrix = new double[rowsArr.Length, variablesArr.Length];

      for (int i = 0; i < variablesArr.Length; i++) {
        var origValues = dataset.GetDoubleValues(variablesArr[i], rowsArr);
        var values = transformArr[i] != null ? transformArr[i].Apply(origValues) : origValues;
        int row = 0;
        foreach (var value in values) {
          matrix[row, i] = value;
          row++;
        }
      }

      return matrix;
    }

    /// <summary>
    /// Prepares a binary data matrix from a number of factors and specified factor values
    /// </summary>
    /// <param name="dataset">A dataset that contains the variable values</param>
    /// <param name="factorVariables">An enumerable of categorical variables (factors). For each variable an enumerable of values must be specified.</param>
    /// <param name="rows">An enumerable of row indices for the dataset</param>
    /// <returns>
    /// A matrix with one row per element of <paramref name="rows"/> and one column per specified
    /// (factor, value) pair, in the order given. An entry is 1 if the factor has that value in the row and 0 otherwise.
    /// </returns>
    /// <remarks>Factor variables (categorical variables) are split up into multiple binary variables one for each specified value.</remarks>
    /// <exception cref="NotSupportedException">At least one of the given variables is not a string variable of the dataset.</exception>
    public static double[,] ToArray(
      this IDataset dataset,
      IEnumerable<KeyValuePair<string, IEnumerable<string>>> factorVariables,
      IEnumerable<int> rows) {
      // Materialize the (possibly deferred) input once; it is needed for validation, sizing and filling.
      var factors = factorVariables.ToList();

      // check input variables. Only string variables are allowed.
      var invalidInputs = factors
        .Select(kvp => kvp.Key)
        .Where(name => !dataset.VariableHasType<string>(name))
        .ToList();
      if (invalidInputs.Count > 0)
        throw new NotSupportedException("Unsupported inputs: " + string.Join(", ", invalidInputs));

      // Materialize the values of each factor so that counting and filling see the same elements.
      var categories = factors.Select(kvp => kvp.Value.ToArray()).ToList();
      int numBinaryColumns = categories.Sum(cats => cats.Length);

      List<int> rowsList = rows.ToList();
      double[,] matrix = new double[rowsList.Count, numBinaryColumns];

      int col = 0;
      for (int f = 0; f < factors.Count; f++) {
        string[] cats = categories[f];
        if (cats.Length == 0) continue; // no columns for this factor, so no need to read it

        // Read the column once per variable instead of once per category.
        string[] values = dataset.GetStringValues(factors[f].Key, rowsList).ToArray();
        foreach (var cat in cats) {
          for (int row = 0; row < values.Length; row++) {
            matrix[row, col] = values[row] == cat ? 1 : 0;
          }
          col++;
        }
      }
      return matrix;
    }

    /// <summary>
    /// Determines, for each given factor variable, the values for which binary variables should be created.
    /// </summary>
    /// <param name="ds">A dataset that contains the variable values</param>
    /// <param name="factorVariables">Names of the categorical variables (factors)</param>
    /// <param name="rows">An enumerable of row indices for the dataset; only these rows are inspected</param>
    /// <returns>
    /// One entry per factor (in the given order) that pairs the factor name with the selected values.
    /// The result is fully evaluated when the method returns.
    /// </returns>
    public static IEnumerable<KeyValuePair<string, IEnumerable<string>>> GetFactorVariableValues(
      this IDataset ds, IEnumerable<string> factorVariables, IEnumerable<int> rows) {
      // Materialize the rows once; they are needed again for every factor.
      int[] rowsArr = rows.ToArray();
      var result = new List<KeyValuePair<string, IEnumerable<string>>>();
      foreach (var factor in factorVariables) {
        string[] distinctValues = ds.GetStringValues(factor, rowsArr).Distinct().ToArray();
        // 1 distinct value => skip (constant)
        // 2 distinct values => only take one of the two values
        // >=3 distinct values => create a binary value for each value
        string[] reducedValues = distinctValues.Length <= 2
          ? distinctValues.Take(distinctValues.Length - 1).ToArray()
          : distinctValues;
        result.Add(new KeyValuePair<string, IEnumerable<string>>(factor, reducedValues));
      }
      return result;
    }
  }
Note: See TracChangeset
for help on using the changeset viewer.