Free cookie consent management tool by TermsFeed Policy Generator

Ignore:
Timestamp:
04/11/17 15:55:44 (7 years ago)
Author:
gkronber
Message:

#2697: applied r14390, r14391, r14393, r14394, r14396 again (resolving conflicts)

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/sources/HeuristicLab.Problems.DataAnalysis/3.4/DatasetExtensions.cs

    r14400 r14843  
    2020#endregion
    2121
     22using System;
    2223using System.Collections.Generic;
     24using System.Linq;
    2325
    2426namespace HeuristicLab.Problems.DataAnalysis {
    2527  public static class DatasetExtensions {
    26     public static IEnumerable<T> TakeEvery<T>(this IEnumerable<T> xs, int nth) {
    27       int i = 0;
    28       foreach (var x in xs) {
    29         if (i % nth == 0) yield return x;
    30         i++;
     28    public static double[,] ToArray(this IDataset dataset, IEnumerable<string> variables, IEnumerable<int> rows) {
     29      return ToArray(dataset,
     30        variables,
     31        transformations: variables.Select(_ => (ITransformation<double>)null), // no transform
     32        rows: rows);
     33    }
     34    public static double[,] ToArray(this IDataset dataset, IEnumerable<string> variables,
     35      IEnumerable<ITransformation<double>> transformations, IEnumerable<int> rows) {
     36      string[] variablesArr = variables.ToArray();
     37      int[] rowsArr = rows.ToArray();
     38      ITransformation<double>[] transformArr = transformations.ToArray();
     39      if (transformArr.Length != variablesArr.Length)
     40        throw new ArgumentException("Number of variables and number of transformations must match.");
     41
     42      double[,] matrix = new double[rowsArr.Length, variablesArr.Length];
     43
     44      for (int i = 0; i < variablesArr.Length; i++) {
     45        var origValues = dataset.GetDoubleValues(variablesArr[i], rowsArr);
     46        var values = transformArr[i] != null ? transformArr[i].Apply(origValues) : origValues;
     47        int row = 0;
     48        foreach (var value in values) {
     49          matrix[row, i] = value;
     50          row++;
     51        }
    3152      }
     53
     54      return matrix;
     55    }
     56
     57    /// <summary>
     58    /// Prepares a binary data matrix from a number of factors and specified factor values
     59    /// </summary>
     60    /// <param name="dataset">A dataset that contains the variable values</param>
     61    /// <param name="factorVariables">An enumerable of categorical variables (factors). For each variable an enumerable of values must be specified.</param>
     62    /// <param name="rows">An enumerable of row indices for the dataset</param>
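     63    /// <returns>A matrix with one row per entry in <paramref name="rows"/> and one binary (0/1) column per specified factor value</returns>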
     64    /// <remarks>Factor variables (categorical variables) are split up into multiple binary variables, one for each specified value.</remarks>
     65    public static double[,] ToArray(
     66      this IDataset dataset,
     67      IEnumerable<KeyValuePair<string, IEnumerable<string>>> factorVariables,
     68      IEnumerable<int> rows) {
     69      // check input variables. Only string variables are allowed.
     70      var invalidInputs =
     71        factorVariables.Select(kvp => kvp.Key).Where(name => !dataset.VariableHasType<string>(name));
     72      if (invalidInputs.Any())
     73        throw new NotSupportedException("Unsupported inputs: " + string.Join(", ", invalidInputs));
     74
     75      int numBinaryColumns = factorVariables.Sum(kvp => kvp.Value.Count());
     76
     77      List<int> rowsList = rows.ToList();
     78      double[,] matrix = new double[rowsList.Count, numBinaryColumns];
     79
     80      int col = 0;
     81      foreach (var kvp in factorVariables) {
     82        var varName = kvp.Key;
     83        var cats = kvp.Value;
     84        if (!cats.Any()) continue;
     85        foreach (var cat in cats) {
     86          var values = dataset.GetStringValues(varName, rows);
     87          int row = 0;
     88          foreach (var value in values) {
     89            matrix[row, col] = value == cat ? 1 : 0;
     90            row++;
     91          }
     92          col++;
     93        }
     94      }
     95      return matrix;
     96    }
     97
     98    public static IEnumerable<KeyValuePair<string, IEnumerable<string>>> GetFactorVariableValues(
     99      this IDataset ds, IEnumerable<string> factorVariables, IEnumerable<int> rows) {
     100      return from factor in factorVariables
     101             let distinctValues = ds.GetStringValues(factor, rows).Distinct().ToArray()
     102             // 1 distinct value => skip (constant)
     103             // 2 distinct values => only take one of the two values
     104             // >=3 distinct values => create a binary value for each value
     105             let reducedValues = distinctValues.Length <= 2
     106               ? distinctValues.Take(distinctValues.Length - 1)
     107               : distinctValues
     108             select new KeyValuePair<string, IEnumerable<string>>(factor, reducedValues);
    32109    }
    33110  }
Note: See TracChangeset for help on using the changeset viewer.