Free cookie consent management tool by TermsFeed Policy Generator

Ignore:
Timestamp:
08/05/16 14:25:28 (8 years ago)
Author:
gkronber
Message:

#2650: work in progress..

Location:
branches/symbreg-factors-2650/HeuristicLab.Algorithms.DataAnalysis/3.4
Files:
6 edited

Legend:

Unmodified
Added
Removed
  • branches/symbreg-factors-2650/HeuristicLab.Algorithms.DataAnalysis/3.4/GaussianProcess/GaussianProcessClassificationModelCreator.cs

    r14185 r14237  
    6767        HyperparameterGradientsParameter.ActualValue = new RealVector(model.HyperparameterGradients);
    6868        return base.Apply();
    69       } catch (ArgumentException) { } catch (alglib.alglibexception) { }
     69      }
     70      catch (ArgumentException) { }
     71      catch (alglib.alglibexception) { }
    7072      NegativeLogLikelihoodParameter.ActualValue = new DoubleValue(1E300);
    7173      HyperparameterGradientsParameter.ActualValue = new RealVector(Hyperparameter.Count());
  • branches/symbreg-factors-2650/HeuristicLab.Algorithms.DataAnalysis/3.4/GradientBoostedTrees/GradientBoostedTreesAlgorithmStatic.cs

    r14185 r14237  
    148148    // for custom stepping & termination
    149149    public static IGbmState CreateGbmState(IRegressionProblemData problemData, ILossFunction lossFunction, uint randSeed, int maxSize = 3, double r = 0.66, double m = 0.5, double nu = 0.01) {
     150      // Validate inputs: every allowed input variable must be of type double in the dataset; otherwise a NotSupportedException naming the offending variables is thrown.
     151      var invalidInputs =
     152        problemData.AllowedInputVariables.Where(name => !problemData.Dataset.VariableHasType<double>(name));
     153      if (invalidInputs.Any())
     154        throw new NotSupportedException("Gradient tree boosting only supports real-valued variables. Unsupported inputs: " + string.Join(", ", invalidInputs));
     155
    150156      return new GbmState(problemData, lossFunction, randSeed, maxSize, r, m, nu);
    151157    }
  • branches/symbreg-factors-2650/HeuristicLab.Algorithms.DataAnalysis/3.4/Linear/AlglibUtil.cs

    r14185 r14237  
    2020#endregion
    2121
     22using System;
    2223using System.Collections.Generic;
    2324using System.Linq;
     
    2728  public static class AlglibUtil {
    2829    public static double[,] PrepareInputMatrix(IDataset dataset, IEnumerable<string> variables, IEnumerable<int> rows) {
    29       List<string> variablesList = variables.ToList();
     30      // check input variables. Only double variables are allowed.
     31      var invalidInputs =
     32        variables.Where(name => !dataset.VariableHasType<double>(name));
     33      if (invalidInputs.Any())
     34        throw new NotSupportedException("Unsupported inputs: " + string.Join(", ", invalidInputs));
     35
    3036      List<int> rowsList = rows.ToList();
    31 
    32       double[,] matrix = new double[rowsList.Count, variablesList.Count];
     37      double[,] matrix = new double[rowsList.Count, variables.Count()];
    3338
    3439      int col = 0;
     
    4550      return matrix;
    4651    }
     52
    4753    public static double[,] PrepareAndScaleInputMatrix(IDataset dataset, IEnumerable<string> variables, IEnumerable<int> rows, Scaling scaling) {
     54      // check input variables. Only double variables are allowed.
     55      var invalidInputs =
     56        variables.Where(name => !dataset.VariableHasType<double>(name));
     57      if (invalidInputs.Any())
     58        throw new NotSupportedException("Unsupported inputs: " + string.Join(", ", invalidInputs));
     59
    4860      List<string> variablesList = variables.ToList();
    4961      List<int> rowsList = rows.ToList();
     
    6476      return matrix;
    6577    }
     78
     79    /// <summary>
     80    /// Prepares a binary data matrix from a number of factors and specified factor values
     81    /// </summary>
     82    /// <param name="dataset">A dataset that contains the variable values</param>
     83    /// <param name="factorVariables">An enumerable of categorical variables (factors). For each variable an enumerable of values must be specified.</param>
     84    /// <param name="rows">An enumerable of row indices for the dataset</param>
     85    /// <returns>A matrix with one row per entry of <paramref name="rows"/> and one 0/1 column per specified factor value, in the order in which the factors and their values are enumerated.</returns>
     86    /// <remarks>Factor variables (categorical variables) are split up into multiple binary variables one for each specified value. A NotSupportedException is thrown if any factor variable is not a string variable of the dataset.</remarks>
     87    public static double[,] PrepareInputMatrix(
     88      IDataset dataset,
     89      IEnumerable<KeyValuePair<string, IEnumerable<string>>> factorVariables,
     90      IEnumerable<int> rows) {
     91      // Validate inputs: every factor variable must be a string variable of the dataset.
     92      var invalidInputs =
     93        factorVariables.Select(kvp => kvp.Key).Where(name => !dataset.VariableHasType<string>(name));
     94      if (invalidInputs.Any())
     95        throw new NotSupportedException("Unsupported inputs: " + string.Join(", ", invalidInputs));
     96
     97      int numBinaryColumns = factorVariables.Sum(kvp => kvp.Value.Count()); // one output column per specified factor value
     98
     99      List<int> rowsList = rows.ToList();
     100      double[,] matrix = new double[rowsList.Count, numBinaryColumns];
     101
     102      int col = 0;
     103      foreach (var kvp in factorVariables) {
     104        var varName = kvp.Key;
     105        var cats = kvp.Value;
     106        var catCount = cats.Count();
     107        if (catCount == 0) continue; // no values specified for this factor => it contributes no columns
     108        foreach (var cat in cats) {
     109          var values = dataset.GetStringValues(varName, rows); // NOTE(review): fetched again for every category although varName and rows do not change in this loop; uses rows (not rowsList), so rows is enumerated again here - confirm this is intended
     110          int row = 0;
     111          foreach (var value in values) {
     112            matrix[row, col] = value == cat ? 1 : 0; // binary indicator: 1 where the row's value equals this category
     113            row++;
     114          }
     115          col++;
     116        }
     117      }
     118      return matrix;
     119    }
    66120  }
    67121}
  • branches/symbreg-factors-2650/HeuristicLab.Algorithms.DataAnalysis/3.4/Linear/LinearRegression.cs

    r14185 r14237  
    7373      IEnumerable<string> allowedInputVariables = problemData.AllowedInputVariables;
    7474      IEnumerable<int> rows = problemData.TrainingIndices;
    75       double[,] inputMatrix = AlglibUtil.PrepareInputMatrix(dataset, allowedInputVariables.Concat(new string[] { targetVariable }), rows);
     75      var doubleVariables = allowedInputVariables.Where(dataset.VariableHasType<double>);
     76      var factorVariableNames = allowedInputVariables.Where(dataset.VariableHasType<string>);
     77      var factorVariables = from factor in factorVariableNames
     78                            let distinctValues = dataset.GetStringValues(factor, rows).Distinct().ToArray()
     79                            // 1 distinct value => skip (constant)
     80                            // 2 distinct values => only take one of the two values
     81                            // >=3 distinct values => create a binary value for each value
     82                            let reducedValues = distinctValues.Length <= 2
     83                              ? distinctValues.Take(distinctValues.Length - 1)
     84                              : distinctValues
     85                            select new KeyValuePair<string, IEnumerable<string>>(factor, reducedValues);
     86      double[,] binaryMatrix = AlglibUtil.PrepareInputMatrix(dataset, factorVariables, rows);
     87      double[,] doubleVarMatrix = AlglibUtil.PrepareInputMatrix(dataset, doubleVariables.Concat(new string[] { targetVariable }), rows);
     88      var inputMatrix = binaryMatrix.VertCat(doubleVarMatrix);
     89
    7690      if (inputMatrix.Cast<double>().Any(x => double.IsNaN(x) || double.IsInfinity(x)))
    7791        throw new NotSupportedException("Linear regression does not support NaN or infinity values in the input dataset.");
     
    98112
    99113      int col = 0;
    100       foreach (string column in allowedInputVariables) {
     114      foreach (var kvp in factorVariables) {
     115        var varName = kvp.Key;
     116        foreach (var cat in kvp.Value) {
     117          FactorVariableTreeNode vNode =
     118            (FactorVariableTreeNode)new HeuristicLab.Problems.DataAnalysis.Symbolic.FactorVariable().CreateTreeNode();
     119          vNode.VariableName = varName;
     120          vNode.VariableValue = cat;
     121          vNode.Weight = coefficients[col];
     122          addition.AddSubtree(vNode);
     123          col++;
     124        }
     125      }
     126      foreach (string column in doubleVariables) {
    101127        VariableTreeNode vNode = (VariableTreeNode)new HeuristicLab.Problems.DataAnalysis.Symbolic.Variable().CreateTreeNode();
    102128        vNode.VariableName = column;
  • branches/symbreg-factors-2650/HeuristicLab.Algorithms.DataAnalysis/3.4/Nca/ModelCreation/NcaModelCreator.cs

    r14185 r14237  
    2020#endregion
    2121
     22using System;
    2223using System.Linq;
    2324using HeuristicLab.Common;
  • branches/symbreg-factors-2650/HeuristicLab.Algorithms.DataAnalysis/3.4/NearestNeighbour/NearestNeighbourModel.cs

    r14185 r14237  
    104104      this.allowedInputVariables = allowedInputVariables.ToArray();
    105105
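     106      // check input variables. Only double variables are allowed. NOTE(review): the exception message below mentions "Gradient tree boosting" although this is NearestNeighbourModel - looks copy-pasted; confirm and correct.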
     107      var invalidInputs =
     108        allowedInputVariables.Where(name => !dataset.VariableHasType<double>(name));
     109      if (invalidInputs.Any())
     110        throw new NotSupportedException("Gradient tree boosting only supports real-valued variables. Unsupported inputs: " + string.Join(", ", invalidInputs));
     111
     112
    106113      var inputMatrix = AlglibUtil.PrepareInputMatrix(dataset,
    107114                                   allowedInputVariables.Concat(new string[] { targetVariable }),
Note: See TracChangeset for help on using the changeset viewer.