- Timestamp:
- 04/10/17 15:48:20 (8 years ago)
- Location:
- branches/TSNE/HeuristicLab.Algorithms.DataAnalysis
- Files:
-
- 7 edited
Legend:
- Unmodified
- Added
- Removed
-
branches/TSNE/HeuristicLab.Algorithms.DataAnalysis
- Property svn:mergeinfo changed
-
branches/TSNE/HeuristicLab.Algorithms.DataAnalysis/3.4/Linear/AlglibUtil.cs
r14185 r14836 20 20 #endregion 21 21 22 using System; 22 23 using System.Collections.Generic; 23 24 using System.Linq; … … 27 28 public static class AlglibUtil { 28 29 public static double[,] PrepareInputMatrix(IDataset dataset, IEnumerable<string> variables, IEnumerable<int> rows) { 29 List<string> variablesList = variables.ToList(); 30 // check input variables. Only double variables are allowed. 31 var invalidInputs = 32 variables.Where(name => !dataset.VariableHasType<double>(name)); 33 if (invalidInputs.Any()) 34 throw new NotSupportedException("Unsupported inputs: " + string.Join(", ", invalidInputs)); 35 30 36 List<int> rowsList = rows.ToList(); 31 32 double[,] matrix = new double[rowsList.Count, variablesList.Count]; 37 double[,] matrix = new double[rowsList.Count, variables.Count()]; 33 38 34 39 int col = 0; … … 45 50 return matrix; 46 51 } 52 47 53 public static double[,] PrepareAndScaleInputMatrix(IDataset dataset, IEnumerable<string> variables, IEnumerable<int> rows, Scaling scaling) { 54 // check input variables. Only double variables are allowed. 55 var invalidInputs = 56 variables.Where(name => !dataset.VariableHasType<double>(name)); 57 if (invalidInputs.Any()) 58 throw new NotSupportedException("Unsupported inputs: " + string.Join(", ", invalidInputs)); 59 48 60 List<string> variablesList = variables.ToList(); 49 61 List<int> rowsList = rows.ToList(); … … 64 76 return matrix; 65 77 } 78 79 /// <summary> 80 /// Prepares a binary data matrix from a number of factors and specified factor values 81 /// </summary> 82 /// <param name="dataset">A dataset that contains the variable values</param> 83 /// <param name="factorVariables">An enumerable of categorical variables (factors). 
For each variable an enumerable of values must be specified.</param> 84 /// <param name="rows">An enumerable of row indices for the dataset</param> 85 /// <returns></returns> 86 /// <remarks>Factor variables (categorical variables) are split up into multiple binary variables one for each specified value.</remarks> 87 public static double[,] PrepareInputMatrix( 88 IDataset dataset, 89 IEnumerable<KeyValuePair<string, IEnumerable<string>>> factorVariables, 90 IEnumerable<int> rows) { 91 // check input variables. Only string variables are allowed. 92 var invalidInputs = 93 factorVariables.Select(kvp => kvp.Key).Where(name => !dataset.VariableHasType<string>(name)); 94 if (invalidInputs.Any()) 95 throw new NotSupportedException("Unsupported inputs: " + string.Join(", ", invalidInputs)); 96 97 int numBinaryColumns = factorVariables.Sum(kvp => kvp.Value.Count()); 98 99 List<int> rowsList = rows.ToList(); 100 double[,] matrix = new double[rowsList.Count, numBinaryColumns]; 101 102 int col = 0; 103 foreach (var kvp in factorVariables) { 104 var varName = kvp.Key; 105 var cats = kvp.Value; 106 if (!cats.Any()) continue; 107 foreach (var cat in cats) { 108 var values = dataset.GetStringValues(varName, rows); 109 int row = 0; 110 foreach (var value in values) { 111 matrix[row, col] = value == cat ? 1 : 0; 112 row++; 113 } 114 col++; 115 } 116 } 117 return matrix; 118 } 119 120 public static IEnumerable<KeyValuePair<string, IEnumerable<string>>> GetFactorVariableValues(IDataset ds, IEnumerable<string> factorVariables, IEnumerable<int> rows) { 121 return from factor in factorVariables 122 let distinctValues = ds.GetStringValues(factor, rows).Distinct().ToArray() 123 // 1 distinct value => skip (constant) 124 // 2 distinct values => only take one of the two values 125 // >=3 distinct values => create a binary value for each value 126 let reducedValues = distinctValues.Length <= 2 127 ? 
distinctValues.Take(distinctValues.Length - 1) 128 : distinctValues 129 select new KeyValuePair<string, IEnumerable<string>>(factor, reducedValues); 130 } 66 131 } 67 132 } -
branches/TSNE/HeuristicLab.Algorithms.DataAnalysis/3.4/Linear/LinearDiscriminantAnalysis.cs
r14185 r14836 23 23 using System.Collections.Generic; 24 24 using System.Linq; 25 using System.Threading; 25 26 using HeuristicLab.Common; 26 27 using HeuristicLab.Core; … … 36 37 /// Linear discriminant analysis classification algorithm. 37 38 /// </summary> 38 [Item("Linear Discriminant Analysis ", "Linear discriminant analysis classification algorithm (wrapper for ALGLIB).")]39 [Item("Linear Discriminant Analysis (LDA)", "Linear discriminant analysis classification algorithm (wrapper for ALGLIB).")] 39 40 [Creatable(CreatableAttribute.Categories.DataAnalysisClassification, Priority = 100)] 40 41 [StorableClass] … … 59 60 60 61 #region Fisher LDA 61 protected override void Run( ) {62 protected override void Run(CancellationToken cancellationToken) { 62 63 var solution = CreateLinearDiscriminantAnalysisSolution(Problem.ProblemData); 63 64 Results.Add(new Result(LinearDiscriminantAnalysisSolutionResultName, "The linear discriminant analysis.", solution)); … … 70 71 IEnumerable<int> rows = problemData.TrainingIndices; 71 72 int nClasses = problemData.ClassNames.Count(); 72 double[,] inputMatrix = AlglibUtil.PrepareInputMatrix(dataset, allowedInputVariables.Concat(new string[] { targetVariable }), rows); 73 var doubleVariableNames = allowedInputVariables.Where(dataset.VariableHasType<double>).ToArray(); 74 var factorVariableNames = allowedInputVariables.Where(dataset.VariableHasType<string>).ToArray(); 75 double[,] inputMatrix = AlglibUtil.PrepareInputMatrix(dataset, doubleVariableNames.Concat(new string[] { targetVariable }), rows); 76 77 var factorVariables = AlglibUtil.GetFactorVariableValues(dataset, factorVariableNames, rows); 78 double[,] factorMatrix = AlglibUtil.PrepareInputMatrix(dataset, factorVariables, rows); 79 80 inputMatrix = factorMatrix.HorzCat(inputMatrix); 81 73 82 if (inputMatrix.Cast<double>().Any(x => double.IsNaN(x) || double.IsInfinity(x))) 74 83 throw new NotSupportedException("Linear discriminant analysis does not support NaN or infinity 
values in the input dataset."); … … 82 91 int info; 83 92 double[] w; 84 alglib.fisherlda(inputMatrix, inputMatrix.GetLength(0), allowedInputVariables.Count(), nClasses, out info, out w);93 alglib.fisherlda(inputMatrix, inputMatrix.GetLength(0), inputMatrix.GetLength(1) - 1, nClasses, out info, out w); 85 94 if (info < 1) throw new ArgumentException("Error in calculation of linear discriminant analysis solution"); 86 95 … … 92 101 93 102 int col = 0; 94 foreach (string column in allowedInputVariables) { 103 foreach (var kvp in factorVariables) { 104 var varName = kvp.Key; 105 foreach (var cat in kvp.Value) { 106 BinaryFactorVariableTreeNode vNode = 107 (BinaryFactorVariableTreeNode)new HeuristicLab.Problems.DataAnalysis.Symbolic.BinaryFactorVariable().CreateTreeNode(); 108 vNode.VariableName = varName; 109 vNode.VariableValue = cat; 110 vNode.Weight = w[col]; 111 addition.AddSubtree(vNode); 112 col++; 113 } 114 } 115 foreach (string column in doubleVariableNames) { 95 116 VariableTreeNode vNode = (VariableTreeNode)new HeuristicLab.Problems.DataAnalysis.Symbolic.Variable().CreateTreeNode(); 96 117 vNode.VariableName = column; … … 100 121 } 101 122 102 var model = LinearDiscriminantAnalysis.CreateDiscriminantFunctionModel(tree, new SymbolicDataAnalysisExpressionTreeInterpreter(), problemData, rows);123 var model = CreateDiscriminantFunctionModel(tree, new SymbolicDataAnalysisExpressionTreeLinearInterpreter(), problemData, rows); 103 124 SymbolicDiscriminantFunctionClassificationSolution solution = new SymbolicDiscriminantFunctionClassificationSolution(model, (IClassificationProblemData)problemData.Clone()); 104 125 -
branches/TSNE/HeuristicLab.Algorithms.DataAnalysis/3.4/Linear/LinearRegression.cs
r14185 r14836 23 23 using System.Collections.Generic; 24 24 using System.Linq; 25 using System.Threading; 25 26 using HeuristicLab.Common; 26 27 using HeuristicLab.Core; … … 60 61 61 62 #region linear regression 62 protected override void Run( ) {63 protected override void Run(CancellationToken cancellationToken) { 63 64 double rmsError, cvRmsError; 64 65 var solution = CreateLinearRegressionSolution(Problem.ProblemData, out rmsError, out cvRmsError); … … 73 74 IEnumerable<string> allowedInputVariables = problemData.AllowedInputVariables; 74 75 IEnumerable<int> rows = problemData.TrainingIndices; 75 double[,] inputMatrix = AlglibUtil.PrepareInputMatrix(dataset, allowedInputVariables.Concat(new string[] { targetVariable }), rows); 76 var doubleVariables = allowedInputVariables.Where(dataset.VariableHasType<double>); 77 var factorVariableNames = allowedInputVariables.Where(dataset.VariableHasType<string>); 78 var factorVariables = AlglibUtil.GetFactorVariableValues(dataset, factorVariableNames, rows); 79 double[,] binaryMatrix = AlglibUtil.PrepareInputMatrix(dataset, factorVariables, rows); 80 double[,] doubleVarMatrix = AlglibUtil.PrepareInputMatrix(dataset, doubleVariables.Concat(new string[] { targetVariable }), rows); 81 var inputMatrix = binaryMatrix.HorzCat(doubleVarMatrix); 82 76 83 if (inputMatrix.Cast<double>().Any(x => double.IsNaN(x) || double.IsInfinity(x))) 77 84 throw new NotSupportedException("Linear regression does not support NaN or infinity values in the input dataset."); … … 98 105 99 106 int col = 0; 100 foreach (string column in allowedInputVariables) { 107 foreach (var kvp in factorVariables) { 108 var varName = kvp.Key; 109 foreach (var cat in kvp.Value) { 110 BinaryFactorVariableTreeNode vNode = 111 (BinaryFactorVariableTreeNode)new HeuristicLab.Problems.DataAnalysis.Symbolic.BinaryFactorVariable().CreateTreeNode(); 112 vNode.VariableName = varName; 113 vNode.VariableValue = cat; 114 vNode.Weight = coefficients[col]; 115 
addition.AddSubtree(vNode); 116 col++; 117 } 118 } 119 foreach (string column in doubleVariables) { 101 120 VariableTreeNode vNode = (VariableTreeNode)new HeuristicLab.Problems.DataAnalysis.Symbolic.Variable().CreateTreeNode(); 102 121 vNode.VariableName = column; … … 110 129 addition.AddSubtree(cNode); 111 130 112 SymbolicRegressionSolution solution = new SymbolicRegressionSolution(new SymbolicRegressionModel(problemData.TargetVariable, tree, new SymbolicDataAnalysisExpressionTree Interpreter()), (IRegressionProblemData)problemData.Clone());131 SymbolicRegressionSolution solution = new SymbolicRegressionSolution(new SymbolicRegressionModel(problemData.TargetVariable, tree, new SymbolicDataAnalysisExpressionTreeLinearInterpreter()), (IRegressionProblemData)problemData.Clone()); 113 132 solution.Model.Name = "Linear Regression Model"; 114 133 solution.Name = "Linear Regression Solution"; -
branches/TSNE/HeuristicLab.Algorithms.DataAnalysis/3.4/Linear/MultinomialLogitClassification.cs
r14185 r14836 23 23 using System.Collections.Generic; 24 24 using System.Linq; 25 using System.Threading; 25 26 using HeuristicLab.Common; 26 27 using HeuristicLab.Core; … … 57 58 58 59 #region logit classification 59 protected override void Run( ) {60 protected override void Run(CancellationToken cancellationToken) { 60 61 double rmsError, relClassError; 61 62 var solution = CreateLogitClassificationSolution(Problem.ProblemData, out rmsError, out relClassError); … … 68 69 var dataset = problemData.Dataset; 69 70 string targetVariable = problemData.TargetVariable; 70 IEnumerable<string> allowedInputVariables = problemData.AllowedInputVariables; 71 var doubleVariableNames = problemData.AllowedInputVariables.Where(dataset.VariableHasType<double>); 72 var factorVariableNames = problemData.AllowedInputVariables.Where(dataset.VariableHasType<string>); 71 73 IEnumerable<int> rows = problemData.TrainingIndices; 72 double[,] inputMatrix = AlglibUtil.PrepareInputMatrix(dataset, allowedInputVariables.Concat(new string[] { targetVariable }), rows); 74 double[,] inputMatrix = AlglibUtil.PrepareInputMatrix(dataset, doubleVariableNames.Concat(new string[] { targetVariable }), rows); 75 76 var factorVariableValues = AlglibUtil.GetFactorVariableValues(dataset, factorVariableNames, rows); 77 var factorMatrix = AlglibUtil.PrepareInputMatrix(dataset, factorVariableValues, rows); 78 inputMatrix = factorMatrix.HorzCat(inputMatrix); 79 73 80 if (inputMatrix.Cast<double>().Any(x => double.IsNaN(x) || double.IsInfinity(x))) 74 81 throw new NotSupportedException("Multinomial logit classification does not support NaN or infinity values in the input dataset."); … … 95 102 relClassError = alglib.mnlrelclserror(lm, inputMatrix, nRows); 96 103 97 MultinomialLogitClassificationSolution solution = new MultinomialLogitClassificationSolution(new MultinomialLogitModel(lm, targetVariable, allowedInputVariables, classValues), (IClassificationProblemData)problemData.Clone());104 
MultinomialLogitClassificationSolution solution = new MultinomialLogitClassificationSolution(new MultinomialLogitModel(lm, targetVariable, doubleVariableNames, factorVariableValues, classValues), (IClassificationProblemData)problemData.Clone()); 98 105 return solution; 99 106 } -
branches/TSNE/HeuristicLab.Algorithms.DataAnalysis/3.4/Linear/MultinomialLogitClassificationSolution.cs
r14185 r14836 43 43 : base(original, cloner) { 44 44 } 45 public MultinomialLogitClassificationSolution( MultinomialLogitModel logitModel,IClassificationProblemData problemData)45 public MultinomialLogitClassificationSolution(MultinomialLogitModel logitModel, IClassificationProblemData problemData) 46 46 : base(logitModel, problemData) { 47 47 } -
branches/TSNE/HeuristicLab.Algorithms.DataAnalysis/3.4/Linear/MultinomialLogitModel.cs
r14185 r14836 56 56 [Storable] 57 57 private double[] classValues; 58 [Storable] 59 private List<KeyValuePair<string, IEnumerable<string>>> factorVariables; 60 58 61 [StorableConstructor] 59 62 private MultinomialLogitModel(bool deserializing) … … 68 71 allowedInputVariables = (string[])original.allowedInputVariables.Clone(); 69 72 classValues = (double[])original.classValues.Clone(); 73 this.factorVariables = original.factorVariables.Select(kvp => new KeyValuePair<string, IEnumerable<string>>(kvp.Key, new List<string>(kvp.Value))).ToList(); 70 74 } 71 public MultinomialLogitModel(alglib.logitmodel logitModel, string targetVariable, IEnumerable<string> allowedInputVariables, double[] classValues)75 public MultinomialLogitModel(alglib.logitmodel logitModel, string targetVariable, IEnumerable<string> doubleInputVariables, IEnumerable<KeyValuePair<string, IEnumerable<string>>> factorVariables, double[] classValues) 72 76 : base(targetVariable) { 73 77 this.name = ItemName; 74 78 this.description = ItemDescription; 75 79 this.logitModel = logitModel; 76 this.allowedInputVariables = allowedInputVariables.ToArray(); 80 this.allowedInputVariables = doubleInputVariables.ToArray(); 81 this.factorVariables = factorVariables.Select(kvp => new KeyValuePair<string, IEnumerable<string>>(kvp.Key, new List<string>(kvp.Value))).ToList(); 77 82 this.classValues = (double[])classValues.Clone(); 83 } 84 85 [StorableHook(HookType.AfterDeserialization)] 86 private void AfterDeserialization() { 87 // BackwardsCompatibility3.3 88 #region Backwards compatible code, remove with 3.4 89 factorVariables = new List<KeyValuePair<string, IEnumerable<string>>>(); 90 #endregion 78 91 } 79 92 … … 83 96 84 97 public override IEnumerable<double> GetEstimatedClassValues(IDataset dataset, IEnumerable<int> rows) { 98 85 99 double[,] inputData = AlglibUtil.PrepareInputMatrix(dataset, allowedInputVariables, rows); 100 double[,] factorData = AlglibUtil.PrepareInputMatrix(dataset, factorVariables, rows); 101 
102 inputData = factorData.HorzCat(inputData); 86 103 87 104 int n = inputData.GetLength(0);
Note: See TracChangeset for help on using the changeset viewer.