Free cookie consent management tool by TermsFeed Policy Generator

source: branches/3140_NumberSymbol/HeuristicLab.Problems.DataAnalysis/3.4/DatasetExtensions.cs @ 18140

Last change on this file since 18140 was 17999, checked in by mkommend, 3 years ago

#3129: Work on variable range calculation of datasets.

Renamed method to calculate variable ranges to a more appropriate method name.
Added possibility to ignore NaNs for range calculation.

File size: 5.6 KB
Line 
#region License Information
/* HeuristicLab
 * Copyright (C) Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Linq;
25
namespace HeuristicLab.Problems.DataAnalysis {
  /// <summary>
  /// Extension methods for <see cref="IDataset"/> that copy variable values into
  /// dense matrices, compute per-variable ranges and collect factor values.
  /// </summary>
  public static class DatasetExtensions {
    /// <summary>
    /// Copies the values of the given double variables for the given rows into a matrix
    /// without applying any transformation.
    /// </summary>
    /// <param name="dataset">A dataset that contains the variable values</param>
    /// <param name="variables">Names of the variables; one matrix column is created per variable, in this order</param>
    /// <param name="rows">An enumerable of row indices for the dataset</param>
    /// <returns>A matrix with one row per row index and one column per variable</returns>
    public static double[,] ToArray(this IDataset dataset, IEnumerable<string> variables, IEnumerable<int> rows) {
      // Materialize once so that the variable list and the (all-null) transformation list
      // are guaranteed to have the same length, even for lazy or non-repeatable sequences.
      string[] variablesArr = variables.ToArray();
      return ToArray(dataset,
        variablesArr,
        transformations: new ITransformation<double>[variablesArr.Length], // all null => no transform
        rows: rows);
    }

    /// <summary>
    /// Copies the values of the given double variables for the given rows into a matrix,
    /// optionally applying one transformation per variable.
    /// </summary>
    /// <param name="dataset">A dataset that contains the variable values</param>
    /// <param name="variables">Names of the variables; one matrix column is created per variable, in this order</param>
    /// <param name="transformations">One transformation per variable; a null entry leaves the values of that variable unchanged</param>
    /// <param name="rows">An enumerable of row indices for the dataset</param>
    /// <returns>A matrix with one row per row index and one column per variable</returns>
    /// <exception cref="ArgumentException">Thrown when the number of transformations differs from the number of variables.</exception>
    public static double[,] ToArray(this IDataset dataset, IEnumerable<string> variables,
      IEnumerable<ITransformation<double>> transformations, IEnumerable<int> rows) {
      string[] variablesArr = variables.ToArray();
      int[] rowsArr = rows.ToArray();
      ITransformation<double>[] transformArr = transformations.ToArray();
      if (transformArr.Length != variablesArr.Length)
        throw new ArgumentException("Number of variables and number of transformations must match.");

      double[,] matrix = new double[rowsArr.Length, variablesArr.Length];

      for (int i = 0; i < variablesArr.Length; i++) {
        var origValues = dataset.GetDoubleValues(variablesArr[i], rowsArr);
        var values = transformArr[i] != null ? transformArr[i].Apply(origValues) : origValues;
        int row = 0;
        foreach (var value in values) {
          matrix[row, i] = value;
          row++;
        }
      }

      return matrix;
    }

    /// <summary>
    /// Prepares a binary data matrix from a number of factors and specified factor values
    /// </summary>
    /// <param name="dataset">A dataset that contains the variable values</param>
    /// <param name="factorVariables">An enumerable of categorical variables (factors). For each variable an enumerable of values must be specified.</param>
    /// <param name="rows">An enumerable of row indices for the dataset</param>
    /// <returns>A matrix with one row per row index and one 0/1 column per specified factor value</returns>
    /// <exception cref="NotSupportedException">Thrown when one of the given variables is not a string variable of the dataset.</exception>
    /// <remarks>Factor variables (categorical variables) are split up into multiple binary variables one for each specified value.</remarks>
    public static double[,] ToArray(
      this IDataset dataset,
      IEnumerable<KeyValuePair<string, IEnumerable<string>>> factorVariables,
      IEnumerable<int> rows) {
      // Snapshot the factor list. The argument may be a deferred query (e.g. the result of
      // GetFactorVariableValues) that would otherwise be re-evaluated on every enumeration below.
      var factors = factorVariables.ToArray();

      // check input variables. Only string variables are allowed.
      string[] invalidInputs = factors
        .Select(kvp => kvp.Key)
        .Where(name => !dataset.VariableHasType<string>(name))
        .ToArray();
      if (invalidInputs.Length > 0)
        throw new NotSupportedException("Unsupported inputs: " + string.Join(", ", invalidInputs));

      // Snapshot the category lists as well so that counting and filling see the same values.
      string[][] categories = factors.Select(kvp => kvp.Value.ToArray()).ToArray();
      int numBinaryColumns = categories.Sum(cats => cats.Length);

      List<int> rowsList = rows.ToList();
      double[,] matrix = new double[rowsList.Count, numBinaryColumns];

      int col = 0;
      for (int f = 0; f < factors.Length; f++) {
        string[] cats = categories[f];
        if (cats.Length == 0) continue;

        // The values of a variable do not depend on the category, so fetch them once per
        // variable and use the materialized row list that also determined the matrix height.
        string[] values = dataset.GetStringValues(factors[f].Key, rowsList).ToArray();
        foreach (var cat in cats) {
          for (int row = 0; row < values.Length; row++) {
            matrix[row, col] = values[row] == cat ? 1 : 0;
          }
          col++;
        }
      }
      return matrix;
    }

    /// <summary>
    /// Calculates an interval for every double variable of the dataset.
    /// </summary>
    /// <param name="dataset">A dataset that contains the variable values</param>
    /// <param name="ignoreNaNs">
    /// If true, NaN values are removed before the interval is calculated; a variable without any
    /// non-NaN value is assigned the interval [NaN, NaN].
    /// </param>
    /// <returns>A collection that contains one interval per double variable</returns>
    public static IntervalCollection GetVariableRanges(this IDataset dataset, bool ignoreNaNs = true) {
      IntervalCollection variableRanges = new IntervalCollection();
      foreach (var variable in dataset.DoubleVariables) { // ranges can only be calculated for double variables
        var values = dataset.GetDoubleValues(variable);

        if (ignoreNaNs) {
          values = values.Where(v => !double.IsNaN(v));

          if (!values.Any()) { //handle values with only NaNs explicitly
            var emptyInterval = new Interval(double.NaN, double.NaN);
            variableRanges.AddInterval(variable, emptyInterval);
            continue;
          }
        }

        var interval = Interval.GetInterval(values);
        variableRanges.AddInterval(variable, interval);
      }

      return variableRanges;
    }

    /// <summary>
    /// Collects, for each factor variable, the distinct values that should be encoded as binary variables.
    /// </summary>
    /// <param name="ds">A dataset that contains the variable values</param>
    /// <param name="factorVariables">Names of the categorical variables (factors)</param>
    /// <param name="rows">An enumerable of row indices for the dataset</param>
    /// <returns>
    /// One pair per factor variable consisting of the variable name and the selected values.
    /// The result is a deferred query: the dataset is read each time the result is enumerated.
    /// </returns>
    public static IEnumerable<KeyValuePair<string, IEnumerable<string>>> GetFactorVariableValues(
      this IDataset ds, IEnumerable<string> factorVariables, IEnumerable<int> rows) {
      return from factor in factorVariables
             let distinctValues = ds.GetStringValues(factor, rows).Distinct().ToArray()
             // 1 distinct value => skip (constant)
             // 2 distinct values => only take one of the two values
             // >=3 distinct values => create a binary value for each value
             let reducedValues = distinctValues.Length <= 2
               ? distinctValues.Take(distinctValues.Length - 1)
               : distinctValues
             select new KeyValuePair<string, IEnumerable<string>>(factor, reducedValues);
    }
  }
}
Note: See TracBrowser for help on using the repository browser.