Free cookie consent management tool by TermsFeed Policy Generator

Ignore:
Timestamp:
04/04/17 17:52:44 (8 years ago)
Author:
gkronber
Message:

#2650: merged the factors branch into trunk

Location:
trunk/sources
Files:
8 edited

Legend:

Unmodified
Added
Removed
  • trunk/sources

  • trunk/sources/HeuristicLab.Algorithms.DataAnalysis

  • trunk/sources/HeuristicLab.Algorithms.DataAnalysis/3.4/Linear/AlglibUtil.cs

    r14400 r14826  
    2020#endregion
    2121
     22using System;
    2223using System.Collections.Generic;
    2324using System.Linq;
     
    2728  public static class AlglibUtil {
    2829    public static double[,] PrepareInputMatrix(IDataset dataset, IEnumerable<string> variables, IEnumerable<int> rows) {
    29       List<string> variablesList = variables.ToList();
     30      // check input variables. Only double variables are allowed.
     31      var invalidInputs =
     32        variables.Where(name => !dataset.VariableHasType<double>(name));
     33      if (invalidInputs.Any())
     34        throw new NotSupportedException("Unsupported inputs: " + string.Join(", ", invalidInputs));
     35
    3036      List<int> rowsList = rows.ToList();
    31 
    32       double[,] matrix = new double[rowsList.Count, variablesList.Count];
     37      double[,] matrix = new double[rowsList.Count, variables.Count()];
    3338
    3439      int col = 0;
     
    4550      return matrix;
    4651    }
     52
    4753    public static double[,] PrepareAndScaleInputMatrix(IDataset dataset, IEnumerable<string> variables, IEnumerable<int> rows, Scaling scaling) {
     54      // check input variables. Only double variables are allowed.
     55      var invalidInputs =
     56        variables.Where(name => !dataset.VariableHasType<double>(name));
     57      if (invalidInputs.Any())
     58        throw new NotSupportedException("Unsupported inputs: " + string.Join(", ", invalidInputs));
     59
    4860      List<string> variablesList = variables.ToList();
    4961      List<int> rowsList = rows.ToList();
     
    6476      return matrix;
    6577    }
     78
    /// <summary>
    /// Prepares a binary data matrix from a number of factors and specified factor values
    /// </summary>
    /// <param name="dataset">A dataset that contains the variable values</param>
    /// <param name="factorVariables">An enumerable of categorical variables (factors). For each variable an enumerable of values must be specified.</param>
    /// <param name="rows">An enumerable of row indices for the dataset</param>
    /// <returns>
    /// A matrix with one row per element of <paramref name="rows"/> and one column per (factor, value) pair,
    /// in the order given by <paramref name="factorVariables"/>. A cell is 1 if the factor has that value in the row, otherwise 0.
    /// </returns>
    /// <exception cref="NotSupportedException">Thrown if one of the factors is not a string variable of the dataset.</exception>
    /// <remarks>Factor variables (categorical variables) are split up into multiple binary variables one for each specified value.</remarks>
    public static double[,] PrepareInputMatrix(
      IDataset dataset,
      IEnumerable<KeyValuePair<string, IEnumerable<string>>> factorVariables,
      IEnumerable<int> rows) {
      // Materialize once: the argument may be a deferred query, and it is needed for
      // validation, column counting and filling the matrix.
      var factors = factorVariables.ToList();

      // check input variables. Only string variables are allowed.
      var invalidInputs = factors
        .Select(kvp => kvp.Key)
        .Where(name => !dataset.VariableHasType<string>(name))
        .ToList();
      if (invalidInputs.Count > 0)
        throw new NotSupportedException("Unsupported inputs: " + string.Join(", ", invalidInputs));

      // Snapshot the requested values per factor so that the column count and the fill loop agree.
      var categories = factors.Select(kvp => kvp.Value.ToArray()).ToList();
      int numBinaryColumns = categories.Sum(cats => cats.Length);

      List<int> rowsList = rows.ToList();
      double[,] matrix = new double[rowsList.Count, numBinaryColumns];

      int col = 0;
      for (int f = 0; f < factors.Count; f++) {
        string[] cats = categories[f];
        if (cats.Length == 0) continue;

        // Read the factor column once per factor (not once per value) and use the
        // materialized row list so that rows is not enumerated again.
        var values = dataset.GetStringValues(factors[f].Key, rowsList).ToList();
        foreach (var cat in cats) {
          for (int row = 0; row < values.Count; row++) {
            matrix[row, col] = values[row] == cat ? 1 : 0;
          }
          col++;
        }
      }
      return matrix;
    }
     119
    /// <summary>
    /// Determines for each factor variable the values for which binary columns should be created.
    /// </summary>
    /// <param name="ds">The dataset that contains the factor variables</param>
    /// <param name="factorVariables">The names of the factor (string) variables</param>
    /// <param name="rows">The row indices that are used to collect the distinct values</param>
    /// <returns>
    /// One entry per factor, in input order, holding the reduced list of distinct values.
    /// A factor with a single distinct value yields an entry with an empty value list.
    /// </returns>
    /// <remarks>
    /// The result is fully materialized, so enumerating it repeatedly does not scan the dataset again
    /// and always yields the same values in the same order.
    /// </remarks>
    public static IEnumerable<KeyValuePair<string, IEnumerable<string>>> GetFactorVariableValues(IDataset ds, IEnumerable<string> factorVariables, IEnumerable<int> rows) {
      // rows is needed once per factor; materialize it so it is enumerated only once.
      List<int> rowsList = rows.ToList();
      var result = new List<KeyValuePair<string, IEnumerable<string>>>();
      foreach (var factor in factorVariables) {
        string[] distinctValues = ds.GetStringValues(factor, rowsList).Distinct().ToArray();
        // 1 distinct value => skip (constant)
        // 2 distinct values => only take one of the two values
        // >=3 distinct values => create a binary value for each value
        string[] reducedValues = distinctValues.Length <= 2
          ? distinctValues.Take(distinctValues.Length - 1).ToArray()
          : distinctValues;
        result.Add(new KeyValuePair<string, IEnumerable<string>>(factor, reducedValues));
      }
      return result;
    }
    66131  }
    67132}
  • trunk/sources/HeuristicLab.Algorithms.DataAnalysis/3.4/Linear/LinearDiscriminantAnalysis.cs

    r14685 r14826  
    3737  /// Linear discriminant analysis classification algorithm.
    3838  /// </summary>
    39   [Item("Linear Discriminant Analysis", "Linear discriminant analysis classification algorithm (wrapper for ALGLIB).")]
     39  [Item("Linear Discriminant Analysis (LDA)", "Linear discriminant analysis classification algorithm (wrapper for ALGLIB).")]
    4040  [Creatable(CreatableAttribute.Categories.DataAnalysisClassification, Priority = 100)]
    4141  [StorableClass]
     
    7171      IEnumerable<int> rows = problemData.TrainingIndices;
    7272      int nClasses = problemData.ClassNames.Count();
    73       double[,] inputMatrix = AlglibUtil.PrepareInputMatrix(dataset, allowedInputVariables.Concat(new string[] { targetVariable }), rows);
     73      var doubleVariableNames = allowedInputVariables.Where(dataset.VariableHasType<double>).ToArray();
     74      var factorVariableNames = allowedInputVariables.Where(dataset.VariableHasType<string>).ToArray();
     75      double[,] inputMatrix = AlglibUtil.PrepareInputMatrix(dataset, doubleVariableNames.Concat(new string[] { targetVariable }), rows);
     76
     77      var factorVariables = AlglibUtil.GetFactorVariableValues(dataset, factorVariableNames, rows);
     78      double[,] factorMatrix = AlglibUtil.PrepareInputMatrix(dataset, factorVariables, rows);
     79
     80      inputMatrix = factorMatrix.HorzCat(inputMatrix);
     81
    7482      if (inputMatrix.Cast<double>().Any(x => double.IsNaN(x) || double.IsInfinity(x)))
    7583        throw new NotSupportedException("Linear discriminant analysis does not support NaN or infinity values in the input dataset.");
     
    8391      int info;
    8492      double[] w;
    85       alglib.fisherlda(inputMatrix, inputMatrix.GetLength(0), allowedInputVariables.Count(), nClasses, out info, out w);
     93      alglib.fisherlda(inputMatrix, inputMatrix.GetLength(0), inputMatrix.GetLength(1) - 1, nClasses, out info, out w);
    8694      if (info < 1) throw new ArgumentException("Error in calculation of linear discriminant analysis solution");
    8795
     
    93101
    94102      int col = 0;
    95       foreach (string column in allowedInputVariables) {
     103      foreach (var kvp in factorVariables) {
     104        var varName = kvp.Key;
     105        foreach (var cat in kvp.Value) {
     106          BinaryFactorVariableTreeNode vNode =
     107            (BinaryFactorVariableTreeNode)new HeuristicLab.Problems.DataAnalysis.Symbolic.BinaryFactorVariable().CreateTreeNode();
     108          vNode.VariableName = varName;
     109          vNode.VariableValue = cat;
     110          vNode.Weight = w[col];
     111          addition.AddSubtree(vNode);
     112          col++;
     113        }
     114      }
     115      foreach (string column in doubleVariableNames) {
    96116        VariableTreeNode vNode = (VariableTreeNode)new HeuristicLab.Problems.DataAnalysis.Symbolic.Variable().CreateTreeNode();
    97117        vNode.VariableName = column;
  • trunk/sources/HeuristicLab.Algorithms.DataAnalysis/3.4/Linear/LinearRegression.cs

    r14685 r14826  
    7474      IEnumerable<string> allowedInputVariables = problemData.AllowedInputVariables;
    7575      IEnumerable<int> rows = problemData.TrainingIndices;
    76       double[,] inputMatrix = AlglibUtil.PrepareInputMatrix(dataset, allowedInputVariables.Concat(new string[] { targetVariable }), rows);
     76      var doubleVariables = allowedInputVariables.Where(dataset.VariableHasType<double>);
     77      var factorVariableNames = allowedInputVariables.Where(dataset.VariableHasType<string>);
     78      var factorVariables = AlglibUtil.GetFactorVariableValues(dataset, factorVariableNames, rows);
     79      double[,] binaryMatrix = AlglibUtil.PrepareInputMatrix(dataset, factorVariables, rows);
     80      double[,] doubleVarMatrix = AlglibUtil.PrepareInputMatrix(dataset, doubleVariables.Concat(new string[] { targetVariable }), rows);
     81      var inputMatrix = binaryMatrix.HorzCat(doubleVarMatrix);
     82
    7783      if (inputMatrix.Cast<double>().Any(x => double.IsNaN(x) || double.IsInfinity(x)))
    7884        throw new NotSupportedException("Linear regression does not support NaN or infinity values in the input dataset.");
     
    99105
    100106      int col = 0;
    101       foreach (string column in allowedInputVariables) {
     107      foreach (var kvp in factorVariables) {
     108        var varName = kvp.Key;
     109        foreach (var cat in kvp.Value) {
     110          BinaryFactorVariableTreeNode vNode =
     111            (BinaryFactorVariableTreeNode)new HeuristicLab.Problems.DataAnalysis.Symbolic.BinaryFactorVariable().CreateTreeNode();
     112          vNode.VariableName = varName;
     113          vNode.VariableValue = cat;
     114          vNode.Weight = coefficients[col];
     115          addition.AddSubtree(vNode);
     116          col++;
     117        }
     118      }
     119      foreach (string column in doubleVariables) {
    102120        VariableTreeNode vNode = (VariableTreeNode)new HeuristicLab.Problems.DataAnalysis.Symbolic.Variable().CreateTreeNode();
    103121        vNode.VariableName = column;
  • trunk/sources/HeuristicLab.Algorithms.DataAnalysis/3.4/Linear/MultinomialLogitClassification.cs

    r14523 r14826  
    6969      var dataset = problemData.Dataset;
    7070      string targetVariable = problemData.TargetVariable;
    71       IEnumerable<string> allowedInputVariables = problemData.AllowedInputVariables;
     71      var doubleVariableNames = problemData.AllowedInputVariables.Where(dataset.VariableHasType<double>);
     72      var factorVariableNames = problemData.AllowedInputVariables.Where(dataset.VariableHasType<string>);
    7273      IEnumerable<int> rows = problemData.TrainingIndices;
    73       double[,] inputMatrix = AlglibUtil.PrepareInputMatrix(dataset, allowedInputVariables.Concat(new string[] { targetVariable }), rows);
     74      double[,] inputMatrix = AlglibUtil.PrepareInputMatrix(dataset, doubleVariableNames.Concat(new string[] { targetVariable }), rows);
     75
     76      var factorVariableValues = AlglibUtil.GetFactorVariableValues(dataset, factorVariableNames, rows);
     77      var factorMatrix = AlglibUtil.PrepareInputMatrix(dataset, factorVariableValues, rows);
     78      inputMatrix = factorMatrix.HorzCat(inputMatrix);
     79
    7480      if (inputMatrix.Cast<double>().Any(x => double.IsNaN(x) || double.IsInfinity(x)))
    7581        throw new NotSupportedException("Multinomial logit classification does not support NaN or infinity values in the input dataset.");
     
    96102      relClassError = alglib.mnlrelclserror(lm, inputMatrix, nRows);
    97103
    98       MultinomialLogitClassificationSolution solution = new MultinomialLogitClassificationSolution(new MultinomialLogitModel(lm, targetVariable, allowedInputVariables, classValues), (IClassificationProblemData)problemData.Clone());
     104      MultinomialLogitClassificationSolution solution = new MultinomialLogitClassificationSolution(new MultinomialLogitModel(lm, targetVariable, doubleVariableNames, factorVariableValues, classValues), (IClassificationProblemData)problemData.Clone());
    99105      return solution;
    100106    }
  • trunk/sources/HeuristicLab.Algorithms.DataAnalysis/3.4/Linear/MultinomialLogitClassificationSolution.cs

    r14185 r14826  
    4343      : base(original, cloner) {
    4444    }
    45     public MultinomialLogitClassificationSolution( MultinomialLogitModel logitModel,IClassificationProblemData problemData)
     45    public MultinomialLogitClassificationSolution(MultinomialLogitModel logitModel, IClassificationProblemData problemData)
    4646      : base(logitModel, problemData) {
    4747    }
  • trunk/sources/HeuristicLab.Algorithms.DataAnalysis/3.4/Linear/MultinomialLogitModel.cs

    r14400 r14826  
    5656    [Storable]
    5757    private double[] classValues;
     58    [Storable]
     59    private List<KeyValuePair<string, IEnumerable<string>>> factorVariables;
     60
    5861    [StorableConstructor]
    5962    private MultinomialLogitModel(bool deserializing)
     
    6871      allowedInputVariables = (string[])original.allowedInputVariables.Clone();
    6972      classValues = (double[])original.classValues.Clone();
     73      this.factorVariables = original.factorVariables.Select(kvp => new KeyValuePair<string, IEnumerable<string>>(kvp.Key, new List<string>(kvp.Value))).ToList();
    7074    }
    71     public MultinomialLogitModel(alglib.logitmodel logitModel, string targetVariable, IEnumerable<string> allowedInputVariables, double[] classValues)
     75    public MultinomialLogitModel(alglib.logitmodel logitModel, string targetVariable, IEnumerable<string> doubleInputVariables, IEnumerable<KeyValuePair<string, IEnumerable<string>>> factorVariables, double[] classValues)
    7276      : base(targetVariable) {
    7377      this.name = ItemName;
    7478      this.description = ItemDescription;
    7579      this.logitModel = logitModel;
    76       this.allowedInputVariables = allowedInputVariables.ToArray();
     80      this.allowedInputVariables = doubleInputVariables.ToArray();
     81      this.factorVariables = factorVariables.Select(kvp => new KeyValuePair<string, IEnumerable<string>>(kvp.Key, new List<string>(kvp.Value))).ToList();
    7782      this.classValues = (double[])classValues.Clone();
     83    }
     84
    /// <summary>
    /// Deserialization hook that ensures <c>factorVariables</c> is never null after loading.
    /// </summary>
    [StorableHook(HookType.AfterDeserialization)]
    private void AfterDeserialization() {
      // BackwardsCompatibility3.3
      #region Backwards compatible code, remove with 3.4
      // Only supply the empty default when nothing was restored. factorVariables is a
      // [Storable] field, so overwriting it unconditionally would discard the factor
      // information of models that were saved with factor variables.
      // NOTE(review): presumably files written before factor support leave the field null - confirm.
      if (factorVariables == null)
        factorVariables = new List<KeyValuePair<string, IEnumerable<string>>>();
      #endregion
    }
    7992
     
    8396
    8497    public override IEnumerable<double> GetEstimatedClassValues(IDataset dataset, IEnumerable<int> rows) {
     98
    8599      double[,] inputData = AlglibUtil.PrepareInputMatrix(dataset, allowedInputVariables, rows);
     100      double[,] factorData = AlglibUtil.PrepareInputMatrix(dataset, factorVariables, rows);
     101
     102      inputData = factorData.HorzCat(inputData);
    86103
    87104      int n = inputData.GetLength(0);
Note: See TracChangeset for help on using the changeset viewer.