1  #region License Information


2  /* HeuristicLab


3  * Copyright (C) 2002-2018 Heuristic and Evolutionary Algorithms Laboratory (HEAL)


4  *


5  * This file is part of HeuristicLab.


6  *


7  * HeuristicLab is free software: you can redistribute it and/or modify


8  * it under the terms of the GNU General Public License as published by


9  * the Free Software Foundation, either version 3 of the License, or


10  * (at your option) any later version.


11  *


12  * HeuristicLab is distributed in the hope that it will be useful,


13  * but WITHOUT ANY WARRANTY; without even the implied warranty of


14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the


15  * GNU General Public License for more details.


16  *


17  * You should have received a copy of the GNU General Public License


18  * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.


19  */


20  #endregion


21 


22  using System;


23  using System.Collections.Generic;


24  using System.Linq;


25 


26  namespace HeuristicLab.Problems.DataAnalysis {


public static class DatasetExtensions {
  /// <summary>
  /// Copies the values of the given double variables into a matrix without applying any transformation.
  /// </summary>
  /// <param name="dataset">A dataset that contains the variable values</param>
  /// <param name="variables">The names of the variables to copy. Each variable becomes one column, in enumeration order.</param>
  /// <param name="rows">An enumerable of row indices for the dataset</param>
  /// <returns>A matrix with one row for each element of <paramref name="rows"/> and one column for each variable.</returns>
  public static double[,] ToArray(this IDataset dataset, IEnumerable<string> variables, IEnumerable<int> rows) {
    // Materialize once: the sequence is needed for its length as well as its content,
    // and a lazy sequence should not be enumerated twice.
    string[] variablesArr = variables.ToArray();
    return ToArray(dataset,
      variablesArr,
      transformations: new ITransformation<double>[variablesArr.Length], // all entries null => no transform
      rows: rows);
  }

  /// <summary>
  /// Copies the values of the given double variables into a matrix, optionally transforming each column.
  /// </summary>
  /// <param name="dataset">A dataset that contains the variable values</param>
  /// <param name="variables">The names of the variables to copy. Each variable becomes one column, in enumeration order.</param>
  /// <param name="transformations">One transformation for each variable (matched by position). A null entry leaves the values of that variable unchanged.</param>
  /// <param name="rows">An enumerable of row indices for the dataset</param>
  /// <returns>A matrix with one row for each element of <paramref name="rows"/> and one column for each variable.</returns>
  /// <exception cref="ArgumentException">Thrown when the number of transformations differs from the number of variables.</exception>
  public static double[,] ToArray(this IDataset dataset, IEnumerable<string> variables,
    IEnumerable<ITransformation<double>> transformations, IEnumerable<int> rows) {
    string[] variablesArr = variables.ToArray();
    int[] rowsArr = rows.ToArray();
    ITransformation<double>[] transformArr = transformations.ToArray();
    if (transformArr.Length != variablesArr.Length)
      throw new ArgumentException("Number of variables and number of transformations must match.");

    double[,] matrix = new double[rowsArr.Length, variablesArr.Length];

    for (int i = 0; i < variablesArr.Length; i++) {
      var origValues = dataset.GetDoubleValues(variablesArr[i], rowsArr);
      var values = transformArr[i] != null ? transformArr[i].Apply(origValues) : origValues;
      // The values are written top to bottom into column i.
      int row = 0;
      foreach (var value in values) {
        matrix[row, i] = value;
        row++;
      }
    }

    return matrix;
  }

  /// <summary>
  /// Prepares a binary data matrix from a number of factors and specified factor values
  /// </summary>
  /// <param name="dataset">A dataset that contains the variable values</param>
  /// <param name="factorVariables">An enumerable of categorical variables (factors). For each variable an enumerable of values must be specified.</param>
  /// <param name="rows">An enumerable of row indices for the dataset</param>
  /// <returns>A matrix with one row for each element of <paramref name="rows"/> and one column for each specified (factor, value) pair.
  /// An element is 1 if the factor has that value in the row and 0 otherwise.</returns>
  /// <exception cref="NotSupportedException">Thrown when one of the specified variables is not a string variable.</exception>
  /// <remarks>Factor variables (categorical variables) are split up into multiple binary variables one for each specified value.</remarks>
  public static double[,] ToArray(
    this IDataset dataset,
    IEnumerable<KeyValuePair<string, IEnumerable<string>>> factorVariables,
    IEnumerable<int> rows) {
    // The factors are needed several times (validation, column count, filling the matrix).
    // Materialize them once so that a lazily evaluated sequence is not recomputed for each step.
    KeyValuePair<string, IEnumerable<string>>[] factors = factorVariables.ToArray();

    // check input variables. Only string variables are allowed.
    string[] invalidInputs = factors
      .Select(kvp => kvp.Key)
      .Where(name => !dataset.VariableHasType<string>(name))
      .ToArray();
    if (invalidInputs.Length > 0)
      throw new NotSupportedException("Unsupported inputs: " + string.Join(", ", invalidInputs));

    // Materialize the values of each factor as well, so that the column count
    // and the columns that are actually written are guaranteed to agree.
    string[][] categories = factors.Select(kvp => kvp.Value.ToArray()).ToArray();
    int numBinaryColumns = categories.Sum(cats => cats.Length);

    List<int> rowsList = rows.ToList();
    double[,] matrix = new double[rowsList.Count, numBinaryColumns];

    int col = 0;
    for (int f = 0; f < factors.Length; f++) {
      string[] cats = categories[f];
      if (cats.Length == 0) continue; // nothing to encode, do not read the variable at all

      // The values of the factor do not depend on the category: read them once per factor.
      string[] values = dataset.GetStringValues(factors[f].Key, rowsList).ToArray();
      foreach (var cat in cats) {
        for (int row = 0; row < values.Length; row++) {
          matrix[row, col] = values[row] == cat ? 1 : 0;
        }
        col++;
      }
    }
    return matrix;
  }

  /// <summary>
  /// Determines for each factor variable the values for which binary variables should be created.
  /// </summary>
  /// <param name="ds">A dataset that contains the variable values</param>
  /// <param name="factorVariables">The names of the categorical variables (factors)</param>
  /// <param name="rows">An enumerable of row indices for the dataset</param>
  /// <returns>For each factor a pair of the factor name and the values to encode.
  /// The value list is empty for factors with at most one distinct value.</returns>
  /// <remarks>The result is a LINQ query with deferred execution: the dataset is read when the result is enumerated,
  /// and <paramref name="rows"/> is enumerated once for each factor.</remarks>
  public static IEnumerable<KeyValuePair<string, IEnumerable<string>>> GetFactorVariableValues(
    this IDataset ds, IEnumerable<string> factorVariables, IEnumerable<int> rows) {
    return from factor in factorVariables
           let distinctValues = ds.GetStringValues(factor, rows).Distinct().ToArray()
           // 1 distinct value => skip (constant)
           // 2 distinct values => only take one of the two values
           // >=3 distinct values => create a binary value for each value
           let reducedValues = distinctValues.Length <= 2
             ? distinctValues.Take(distinctValues.Length - 1)
             : distinctValues
           select new KeyValuePair<string, IEnumerable<string>>(factor, reducedValues);
  }
}


111  }

