Free cookie consent management tool by TermsFeed Policy Generator

Ignore:
Timestamp:
04/10/17 15:48:20 (8 years ago)
Author:
gkronber
Message:

#2700 merged changesets from trunk to branch

Location:
branches/TSNE/HeuristicLab.Algorithms.DataAnalysis
Files:
7 edited

Legend:

Unmodified
Added
Removed
  • branches/TSNE/HeuristicLab.Algorithms.DataAnalysis

  • branches/TSNE/HeuristicLab.Algorithms.DataAnalysis/3.4/Linear/AlglibUtil.cs

    r14185 r14836  
    2020#endregion
    2121
     22using System;
    2223using System.Collections.Generic;
    2324using System.Linq;
     
    2728  public static class AlglibUtil {
    2829    public static double[,] PrepareInputMatrix(IDataset dataset, IEnumerable<string> variables, IEnumerable<int> rows) {
    29       List<string> variablesList = variables.ToList();
     30      // check input variables. Only double variables are allowed.
     31      var invalidInputs =
     32        variables.Where(name => !dataset.VariableHasType<double>(name));
     33      if (invalidInputs.Any())
     34        throw new NotSupportedException("Unsupported inputs: " + string.Join(", ", invalidInputs));
     35
    3036      List<int> rowsList = rows.ToList();
    31 
    32       double[,] matrix = new double[rowsList.Count, variablesList.Count];
     37      double[,] matrix = new double[rowsList.Count, variables.Count()];
    3338
    3439      int col = 0;
     
    4550      return matrix;
    4651    }
     52
    4753    public static double[,] PrepareAndScaleInputMatrix(IDataset dataset, IEnumerable<string> variables, IEnumerable<int> rows, Scaling scaling) {
     54      // check input variables. Only double variables are allowed.
     55      var invalidInputs =
     56        variables.Where(name => !dataset.VariableHasType<double>(name));
     57      if (invalidInputs.Any())
     58        throw new NotSupportedException("Unsupported inputs: " + string.Join(", ", invalidInputs));
     59
    4860      List<string> variablesList = variables.ToList();
    4961      List<int> rowsList = rows.ToList();
     
    6476      return matrix;
    6577    }
     78
     79    /// <summary>
     80    /// Prepares a binary data matrix from a number of factors and specified factor values
     81    /// </summary>
     82    /// <param name="dataset">A dataset that contains the variable values</param>
     83    /// <param name="factorVariables">An enumerable of categorical variables (factors). For each variable an enumerable of values must be specified.</param>
     84    /// <param name="rows">An enumerable of row indices for the dataset</param>
     85    /// <returns></returns>
     86    /// <remarks>Factor variables (categorical variables) are split up into multiple binary variables one for each specified value.</remarks>
     87    public static double[,] PrepareInputMatrix(
     88      IDataset dataset,
     89      IEnumerable<KeyValuePair<string, IEnumerable<string>>> factorVariables,
     90      IEnumerable<int> rows) {
     91      // check input variables. Only string variables are allowed.
     92      var invalidInputs =
     93        factorVariables.Select(kvp => kvp.Key).Where(name => !dataset.VariableHasType<string>(name));
     94      if (invalidInputs.Any())
     95        throw new NotSupportedException("Unsupported inputs: " + string.Join(", ", invalidInputs));
     96
     97      int numBinaryColumns = factorVariables.Sum(kvp => kvp.Value.Count());
     98
     99      List<int> rowsList = rows.ToList();
     100      double[,] matrix = new double[rowsList.Count, numBinaryColumns];
     101
     102      int col = 0;
     103      foreach (var kvp in factorVariables) {
     104        var varName = kvp.Key;
     105        var cats = kvp.Value;
     106        if (!cats.Any()) continue;
     107        foreach (var cat in cats) {
     108          var values = dataset.GetStringValues(varName, rows);
     109          int row = 0;
     110          foreach (var value in values) {
     111            matrix[row, col] = value == cat ? 1 : 0;
     112            row++;
     113          }
     114          col++;
     115        }
     116      }
     117      return matrix;
     118    }
     119
     120    public static IEnumerable<KeyValuePair<string, IEnumerable<string>>> GetFactorVariableValues(IDataset ds, IEnumerable<string> factorVariables, IEnumerable<int> rows) {
     121      return from factor in factorVariables
     122             let distinctValues = ds.GetStringValues(factor, rows).Distinct().ToArray()
     123             // 1 distinct value => skip (constant)
     124             // 2 distinct values => only take one of the two values
     125             // >=3 distinct values => create a binary value for each value
     126             let reducedValues = distinctValues.Length <= 2
     127               ? distinctValues.Take(distinctValues.Length - 1)
     128               : distinctValues
     129             select new KeyValuePair<string, IEnumerable<string>>(factor, reducedValues);
     130    }
    66131  }
    67132}
  • branches/TSNE/HeuristicLab.Algorithms.DataAnalysis/3.4/Linear/LinearDiscriminantAnalysis.cs

    r14185 r14836  
    2323using System.Collections.Generic;
    2424using System.Linq;
     25using System.Threading;
    2526using HeuristicLab.Common;
    2627using HeuristicLab.Core;
     
    3637  /// Linear discriminant analysis classification algorithm.
    3738  /// </summary>
    38   [Item("Linear Discriminant Analysis", "Linear discriminant analysis classification algorithm (wrapper for ALGLIB).")]
     39  [Item("Linear Discriminant Analysis (LDA)", "Linear discriminant analysis classification algorithm (wrapper for ALGLIB).")]
    3940  [Creatable(CreatableAttribute.Categories.DataAnalysisClassification, Priority = 100)]
    4041  [StorableClass]
     
    5960
    6061    #region Fisher LDA
    61     protected override void Run() {
     62    protected override void Run(CancellationToken cancellationToken) {
    6263      var solution = CreateLinearDiscriminantAnalysisSolution(Problem.ProblemData);
    6364      Results.Add(new Result(LinearDiscriminantAnalysisSolutionResultName, "The linear discriminant analysis.", solution));
     
    7071      IEnumerable<int> rows = problemData.TrainingIndices;
    7172      int nClasses = problemData.ClassNames.Count();
    72       double[,] inputMatrix = AlglibUtil.PrepareInputMatrix(dataset, allowedInputVariables.Concat(new string[] { targetVariable }), rows);
     73      var doubleVariableNames = allowedInputVariables.Where(dataset.VariableHasType<double>).ToArray();
     74      var factorVariableNames = allowedInputVariables.Where(dataset.VariableHasType<string>).ToArray();
     75      double[,] inputMatrix = AlglibUtil.PrepareInputMatrix(dataset, doubleVariableNames.Concat(new string[] { targetVariable }), rows);
     76
     77      var factorVariables = AlglibUtil.GetFactorVariableValues(dataset, factorVariableNames, rows);
     78      double[,] factorMatrix = AlglibUtil.PrepareInputMatrix(dataset, factorVariables, rows);
     79
     80      inputMatrix = factorMatrix.HorzCat(inputMatrix);
     81
    7382      if (inputMatrix.Cast<double>().Any(x => double.IsNaN(x) || double.IsInfinity(x)))
    7483        throw new NotSupportedException("Linear discriminant analysis does not support NaN or infinity values in the input dataset.");
     
    8291      int info;
    8392      double[] w;
    84       alglib.fisherlda(inputMatrix, inputMatrix.GetLength(0), allowedInputVariables.Count(), nClasses, out info, out w);
     93      alglib.fisherlda(inputMatrix, inputMatrix.GetLength(0), inputMatrix.GetLength(1) - 1, nClasses, out info, out w);
    8594      if (info < 1) throw new ArgumentException("Error in calculation of linear discriminant analysis solution");
    8695
     
    92101
    93102      int col = 0;
    94       foreach (string column in allowedInputVariables) {
     103      foreach (var kvp in factorVariables) {
     104        var varName = kvp.Key;
     105        foreach (var cat in kvp.Value) {
     106          BinaryFactorVariableTreeNode vNode =
     107            (BinaryFactorVariableTreeNode)new HeuristicLab.Problems.DataAnalysis.Symbolic.BinaryFactorVariable().CreateTreeNode();
     108          vNode.VariableName = varName;
     109          vNode.VariableValue = cat;
     110          vNode.Weight = w[col];
     111          addition.AddSubtree(vNode);
     112          col++;
     113        }
     114      }
     115      foreach (string column in doubleVariableNames) {
    95116        VariableTreeNode vNode = (VariableTreeNode)new HeuristicLab.Problems.DataAnalysis.Symbolic.Variable().CreateTreeNode();
    96117        vNode.VariableName = column;
     
    100121      }
    101122
    102       var model = LinearDiscriminantAnalysis.CreateDiscriminantFunctionModel(tree, new SymbolicDataAnalysisExpressionTreeInterpreter(), problemData, rows);
     123      var model = CreateDiscriminantFunctionModel(tree, new SymbolicDataAnalysisExpressionTreeLinearInterpreter(), problemData, rows);
    103124      SymbolicDiscriminantFunctionClassificationSolution solution = new SymbolicDiscriminantFunctionClassificationSolution(model, (IClassificationProblemData)problemData.Clone());
    104125
  • branches/TSNE/HeuristicLab.Algorithms.DataAnalysis/3.4/Linear/LinearRegression.cs

    r14185 r14836  
    2323using System.Collections.Generic;
    2424using System.Linq;
     25using System.Threading;
    2526using HeuristicLab.Common;
    2627using HeuristicLab.Core;
     
    6061
    6162    #region linear regression
    62     protected override void Run() {
     63    protected override void Run(CancellationToken cancellationToken) {
    6364      double rmsError, cvRmsError;
    6465      var solution = CreateLinearRegressionSolution(Problem.ProblemData, out rmsError, out cvRmsError);
     
    7374      IEnumerable<string> allowedInputVariables = problemData.AllowedInputVariables;
    7475      IEnumerable<int> rows = problemData.TrainingIndices;
    75       double[,] inputMatrix = AlglibUtil.PrepareInputMatrix(dataset, allowedInputVariables.Concat(new string[] { targetVariable }), rows);
     76      var doubleVariables = allowedInputVariables.Where(dataset.VariableHasType<double>);
     77      var factorVariableNames = allowedInputVariables.Where(dataset.VariableHasType<string>);
     78      var factorVariables = AlglibUtil.GetFactorVariableValues(dataset, factorVariableNames, rows);
     79      double[,] binaryMatrix = AlglibUtil.PrepareInputMatrix(dataset, factorVariables, rows);
     80      double[,] doubleVarMatrix = AlglibUtil.PrepareInputMatrix(dataset, doubleVariables.Concat(new string[] { targetVariable }), rows);
     81      var inputMatrix = binaryMatrix.HorzCat(doubleVarMatrix);
     82
    7683      if (inputMatrix.Cast<double>().Any(x => double.IsNaN(x) || double.IsInfinity(x)))
    7784        throw new NotSupportedException("Linear regression does not support NaN or infinity values in the input dataset.");
     
    98105
    99106      int col = 0;
    100       foreach (string column in allowedInputVariables) {
     107      foreach (var kvp in factorVariables) {
     108        var varName = kvp.Key;
     109        foreach (var cat in kvp.Value) {
     110          BinaryFactorVariableTreeNode vNode =
     111            (BinaryFactorVariableTreeNode)new HeuristicLab.Problems.DataAnalysis.Symbolic.BinaryFactorVariable().CreateTreeNode();
     112          vNode.VariableName = varName;
     113          vNode.VariableValue = cat;
     114          vNode.Weight = coefficients[col];
     115          addition.AddSubtree(vNode);
     116          col++;
     117        }
     118      }
     119      foreach (string column in doubleVariables) {
    101120        VariableTreeNode vNode = (VariableTreeNode)new HeuristicLab.Problems.DataAnalysis.Symbolic.Variable().CreateTreeNode();
    102121        vNode.VariableName = column;
     
    110129      addition.AddSubtree(cNode);
    111130
    112       SymbolicRegressionSolution solution = new SymbolicRegressionSolution(new SymbolicRegressionModel(problemData.TargetVariable, tree, new SymbolicDataAnalysisExpressionTreeInterpreter()), (IRegressionProblemData)problemData.Clone());
     131      SymbolicRegressionSolution solution = new SymbolicRegressionSolution(new SymbolicRegressionModel(problemData.TargetVariable, tree, new SymbolicDataAnalysisExpressionTreeLinearInterpreter()), (IRegressionProblemData)problemData.Clone());
    113132      solution.Model.Name = "Linear Regression Model";
    114133      solution.Name = "Linear Regression Solution";
  • branches/TSNE/HeuristicLab.Algorithms.DataAnalysis/3.4/Linear/MultinomialLogitClassification.cs

    r14185 r14836  
    2323using System.Collections.Generic;
    2424using System.Linq;
     25using System.Threading;
    2526using HeuristicLab.Common;
    2627using HeuristicLab.Core;
     
    5758
    5859    #region logit classification
    59     protected override void Run() {
     60    protected override void Run(CancellationToken cancellationToken) {
    6061      double rmsError, relClassError;
    6162      var solution = CreateLogitClassificationSolution(Problem.ProblemData, out rmsError, out relClassError);
     
    6869      var dataset = problemData.Dataset;
    6970      string targetVariable = problemData.TargetVariable;
    70       IEnumerable<string> allowedInputVariables = problemData.AllowedInputVariables;
     71      var doubleVariableNames = problemData.AllowedInputVariables.Where(dataset.VariableHasType<double>);
     72      var factorVariableNames = problemData.AllowedInputVariables.Where(dataset.VariableHasType<string>);
    7173      IEnumerable<int> rows = problemData.TrainingIndices;
    72       double[,] inputMatrix = AlglibUtil.PrepareInputMatrix(dataset, allowedInputVariables.Concat(new string[] { targetVariable }), rows);
     74      double[,] inputMatrix = AlglibUtil.PrepareInputMatrix(dataset, doubleVariableNames.Concat(new string[] { targetVariable }), rows);
     75
     76      var factorVariableValues = AlglibUtil.GetFactorVariableValues(dataset, factorVariableNames, rows);
     77      var factorMatrix = AlglibUtil.PrepareInputMatrix(dataset, factorVariableValues, rows);
     78      inputMatrix = factorMatrix.HorzCat(inputMatrix);
     79
    7380      if (inputMatrix.Cast<double>().Any(x => double.IsNaN(x) || double.IsInfinity(x)))
    7481        throw new NotSupportedException("Multinomial logit classification does not support NaN or infinity values in the input dataset.");
     
    95102      relClassError = alglib.mnlrelclserror(lm, inputMatrix, nRows);
    96103
    97       MultinomialLogitClassificationSolution solution = new MultinomialLogitClassificationSolution(new MultinomialLogitModel(lm, targetVariable, allowedInputVariables, classValues), (IClassificationProblemData)problemData.Clone());
     104      MultinomialLogitClassificationSolution solution = new MultinomialLogitClassificationSolution(new MultinomialLogitModel(lm, targetVariable, doubleVariableNames, factorVariableValues, classValues), (IClassificationProblemData)problemData.Clone());
    98105      return solution;
    99106    }
  • branches/TSNE/HeuristicLab.Algorithms.DataAnalysis/3.4/Linear/MultinomialLogitClassificationSolution.cs

    r14185 r14836  
    4343      : base(original, cloner) {
    4444    }
    45     public MultinomialLogitClassificationSolution( MultinomialLogitModel logitModel,IClassificationProblemData problemData)
     45    public MultinomialLogitClassificationSolution(MultinomialLogitModel logitModel, IClassificationProblemData problemData)
    4646      : base(logitModel, problemData) {
    4747    }
  • branches/TSNE/HeuristicLab.Algorithms.DataAnalysis/3.4/Linear/MultinomialLogitModel.cs

    r14185 r14836  
    5656    [Storable]
    5757    private double[] classValues;
     58    [Storable]
     59    private List<KeyValuePair<string, IEnumerable<string>>> factorVariables;
     60
    5861    [StorableConstructor]
    5962    private MultinomialLogitModel(bool deserializing)
     
    6871      allowedInputVariables = (string[])original.allowedInputVariables.Clone();
    6972      classValues = (double[])original.classValues.Clone();
     73      this.factorVariables = original.factorVariables.Select(kvp => new KeyValuePair<string, IEnumerable<string>>(kvp.Key, new List<string>(kvp.Value))).ToList();
    7074    }
    71     public MultinomialLogitModel(alglib.logitmodel logitModel, string targetVariable, IEnumerable<string> allowedInputVariables, double[] classValues)
     75    public MultinomialLogitModel(alglib.logitmodel logitModel, string targetVariable, IEnumerable<string> doubleInputVariables, IEnumerable<KeyValuePair<string, IEnumerable<string>>> factorVariables, double[] classValues)
    7276      : base(targetVariable) {
    7377      this.name = ItemName;
    7478      this.description = ItemDescription;
    7579      this.logitModel = logitModel;
    76       this.allowedInputVariables = allowedInputVariables.ToArray();
     80      this.allowedInputVariables = doubleInputVariables.ToArray();
     81      this.factorVariables = factorVariables.Select(kvp => new KeyValuePair<string, IEnumerable<string>>(kvp.Key, new List<string>(kvp.Value))).ToList();
    7782      this.classValues = (double[])classValues.Clone();
     83    }
     84
     85    [StorableHook(HookType.AfterDeserialization)]
     86    private void AfterDeserialization() {
     87      // BackwardsCompatibility3.3
     88      #region Backwards compatible code, remove with 3.4
     89      factorVariables = new List<KeyValuePair<string, IEnumerable<string>>>();
     90      #endregion
    7891    }
    7992
     
    8396
    8497    public override IEnumerable<double> GetEstimatedClassValues(IDataset dataset, IEnumerable<int> rows) {
     98
    8599      double[,] inputData = AlglibUtil.PrepareInputMatrix(dataset, allowedInputVariables, rows);
     100      double[,] factorData = AlglibUtil.PrepareInputMatrix(dataset, factorVariables, rows);
     101
     102      inputData = factorData.HorzCat(inputData);
    86103
    87104      int n = inputData.GetLength(0);
Note: See TracChangeset for help on using the changeset viewer.