[17737] | 1 | using HeuristicLab.Algorithms.DataAnalysis.Glmnet;
|
---|
[17740] | 2 | using HeuristicLab.Common;
|
---|
[17737] | 3 | using HeuristicLab.Data;
|
---|
| 4 | using HeuristicLab.Problems.DataAnalysis;
|
---|
| 5 | using System;
|
---|
| 6 | using System.Collections.Generic;
|
---|
| 7 | using System.Linq;
|
---|
| 8 | using System.Runtime.CompilerServices;
|
---|
| 9 |
|
---|
| 10 | [assembly: InternalsVisibleTo("UnitTests")]
|
---|
| 11 | namespace HeuristicLab.Algorithms.DataAnalysis.FastFunctionExtraction {
|
---|
| 12 | // utility functions for creating Basis Functions
|
---|
| 13 | internal static class BFUtils {
|
---|
// Builds the candidate basis functions permitted by the given approach: univariate bases first,
// optionally extended by hinge functions, pairwise products and the bases produced by
// CreateDenominatorBases. The returned sequence is a lazy concatenation of already built lists.
public static IEnumerable<IBasisFunction> CreateBasisFunctions(IRegressionProblemData data, Approach approach) {
  // fall back to the identity exponent when exponents are disabled
  HashSet<double> allowedExponents;
  if (approach.AllowExp) {
    allowedExponents = approach.Exponents;
  } else {
    allowedExponents = new HashSet<double> { 1 };
  }

  // fall back to the identity operator when nonlinear functions are disabled
  HashSet<NonlinearOperator> allowedOperators;
  if (approach.AllowNonLinearFunctions) {
    allowedOperators = approach.NonLinearFunctions;
  } else {
    allowedOperators = new HashSet<NonlinearOperator> { NonlinearOperator.None };
  }

  IEnumerable<ISimpleBasisFunction> univariate = CreateSimpleBases(data, allowedExponents, allowedOperators);

  if (approach.AllowHinge) {
    // only allow hinge functions for features with exponent = 1 (deemed too complex otherwise)
    var hingeCandidates = univariate.Where(bf => bf.Exponent == 1 && bf.Operator.Equals(NonlinearOperator.None));
    var hinges = CreateHingeBases(data, hingeCandidates, approach.MinHingeThreshold, approach.MaxHingeThreshold, approach.NumHingeThresholds);
    univariate = univariate.Concat(hinges);
  }

  IEnumerable<IBasisFunction> result = univariate;

  if (approach.AllowInteractions) {
    // interactions are built from all univariate bases (including hinges, if enabled)
    result = result.Concat(CreateMultivariateBases(data, univariate.ToArray()));
  }

  if (approach.AllowDenominators) {
    // derived from everything collected so far
    result = result.Concat(CreateDenominatorBases(result));
  }

  return result;
}
|
---|
| 38 |
|
---|
// Enumerates every (input variable, exponent, operator) combination and keeps those whose
// evaluation on problemData contains only finite values.
public static IEnumerable<ISimpleBasisFunction> CreateSimpleBases(IRegressionProblemData problemData, HashSet<double> exponents, HashSet<NonlinearOperator> nonLinearFunctions) {
  var accepted = new List<ISimpleBasisFunction>();

  foreach (var feature in problemData.AllowedInputVariables) {
    var featureValues = problemData.Dataset.GetDoubleValues(feature).ToArray();
    var smallest = featureValues.Min();

    foreach (var exp in exponents) {
      var plainBase = new SimpleBasisFunction(feature, exp, NonlinearOperator.None);
      // if the basis function is not valid without any operator, then it won't be valid in combination with any nonlinear operator -> skip
      if (!Ok(plainBase.Evaluate(problemData))) {
        continue;
      }

      foreach (NonlinearOperator op in nonLinearFunctions) {
        if (op.Equals(NonlinearOperator.Abs)) {
          // ignore cases where abs has no effect:
          // - the exponent is +/-2 and the operator-free variant is generated as well
          // - the variable has no negative values
          bool squaredExponent = exp == -2.0 || exp == 2.0;
          if (squaredExponent && nonLinearFunctions.Contains(NonlinearOperator.None)) {
            continue;
          }
          if (smallest >= 0) {
            continue;
          }
        }

        var candidate = (SimpleBasisFunction)plainBase.DeepCopy();
        candidate.Operator = op;
        if (Ok(candidate.Evaluate(problemData))) {
          accepted.Add(candidate);
        }
      }
    }
  }

  return accepted;
}
|
---|
| 62 |
|
---|
// Creates product bases from the given univariate bases: first the square of every base with
// exponent 1, then pairwise products in the order given by OrderBasisFuncsByImportance.
// Generation of pairwise products stops as soon as the list holds 2 * (number of univariate bases) entries.
public static IEnumerable<IBasisFunction> CreateMultivariateBases(IRegressionProblemData data, IList<ISimpleBasisFunction> univariateBases) {
  var orderedFuncs = OrderBasisFuncsByImportance(data, univariateBases).ToArray();
  var multivariateBases = new List<IBasisFunction>();
  int maxSize = 2 * orderedFuncs.Length;

  // NOTE(review): squared terms are added without the finiteness check that is applied to the
  // pairwise products below - confirm that this is intended.
  foreach (var bf in orderedFuncs) {
    // disallow bases with exponents
    if (bf.Exponent != 1) continue;
    multivariateBases.Add(new ProductBaseFunction(bf, bf, true));
  }

  // orderedFuncs is an array: use Length and the indexer instead of LINQ Count()/ElementAt()
  for (int i = 0; i < orderedFuncs.Length; i++) {
    var b_i = orderedFuncs[i];
    for (int j = 0; j < i; j++) {
      var b_j = orderedFuncs[j];
      if (b_j.Operator != NonlinearOperator.None) continue; // disallow op() * op(); deemed too complex
      var b_inter = new ProductBaseFunction(b_i, b_j, true);
      if (!Ok(b_inter.Evaluate(data))) continue;
      multivariateBases.Add(b_inter);
      if (multivariateBases.Count >= maxSize)
        return multivariateBases;
    }
  }
  return multivariateBases;
}
|
---|
| 87 |
|
---|
// order basis functions by importance (decr)
// the importance of a basis function is measured by the absolute value of its coefficient when optimized on the data
// (the candidates are evaluated on a normalized copy of the data and fitted with penalty = 0 and lambda = 0)
public static IEnumerable<ISimpleBasisFunction> OrderBasisFuncsByImportance(IRegressionProblemData data, IList<ISimpleBasisFunction> candidateFunctions) {
  var elnetData = PrepareData(Normalize(data, out _, out _, out _, out _), candidateFunctions);
  var coeff = ElasticNetLinearRegression.CalculateModelCoefficients(elnetData, 0, 0, out _, out _); // LS-fit

  // The last entry is dropped (the original code treats it as the intercept); the remaining
  // entries are ranked by magnitude so that strongly negative coefficients count as important too.
  var magnitudes = coeff.Take(coeff.Length - 1).Select(c => Math.Abs(c)).ToArray();

  var order = Utils.Argsort(magnitudes);
  Array.Reverse(order); // reversed to obtain decreasing importance
  return order.Select(idx => candidateFunctions[idx]);
}
|
---|
| 99 |
|
---|
// Creates the hinge bases for each of the given simple basis functions and returns them as one
// list, in input order. Thresholds are placed between the relative start and end positions of
// the value range observed in trainingPartition (or data.TrainingPartition when it is null).
public static IList<ISimpleBasisFunction> CreateHingeBases(IRegressionProblemData data, IEnumerable<ISimpleBasisFunction> simple_bfs,
  double relative_start_thr = 0.0, double relative_end_thr = 1.0, int num_thrs = 3, IntRange trainingPartition = null) {
  return simple_bfs
    .SelectMany(bf => CreateHingeBases(data, bf, relative_start_thr, relative_end_thr, num_thrs, trainingPartition))
    .ToList();
}
|
---|
| 109 |
|
---|
// Creates a GT_Hinge and an LT_Hinge basis function on the feature of simple_bf for each of
// num_thrs thresholds. The thresholds are spread (Utils.Linspace) between the relative start and
// end positions of the range that simple_bf takes within trainingPartition
// (data.TrainingPartition when trainingPartition is null).
// Returns an empty list when no usable range exists.
// Throws ArgumentException if relative_start_thr is not smaller than relative_end_thr.
private static IEnumerable<ISimpleBasisFunction> CreateHingeBases(IRegressionProblemData data, ISimpleBasisFunction simple_bf,
  double relative_start_thr, double relative_end_thr, int num_thrs, IntRange trainingPartition) {
  if (relative_start_thr >= relative_end_thr) throw new ArgumentException($"{nameof(relative_start_thr)} must be smaller than {nameof(relative_end_thr)}.");
  var ans = new List<ISimpleBasisFunction>();

  var vals = simple_bf.Evaluate(data);
  var partition = trainingPartition ?? data.TrainingPartition;
  double min = double.PositiveInfinity;
  double max = double.NegativeInfinity;
  for (int i = partition.Start; i < partition.End; i++) {
    min = Math.Min(min, vals[i]);
    max = Math.Max(max, vals[i]);
  }

  // No thresholds can be placed for a constant feature, an empty partition (min/max untouched),
  // NaN values (every comparison is false) or an infinite range.
  var full_range = max - min;
  if (!(max > min) || double.IsInfinity(full_range)) return ans;

  var start_thr = min + relative_start_thr * full_range;
  var end_thr = min + relative_end_thr * full_range;
  var thresholds = Utils.Linspace(start_thr, end_thr, num_thrs);

  foreach (var thr in thresholds) {
    ans.Add(new SimpleBasisFunction(simple_bf.Feature, 1, NonlinearOperator.GT_Hinge, true, thr));
    ans.Add(new SimpleBasisFunction(simple_bf.Feature, 1, NonlinearOperator.LT_Hinge, true, thr));
  }
  return ans;
}
|
---|
| 135 |
|
---|
// Returns a deep copy of every basis function whose IsDenominator flag is set; the flag is
// cleared on each copy. The input functions themselves are not modified.
// NOTE(review): only flagged functions are copied and the copies end up unflagged - confirm
// that this matches the intended meaning of IsDenominator.
public static IEnumerable<IBasisFunction> CreateDenominatorBases(IEnumerable<IBasisFunction> basisFunctions) {
  var copies = new List<IBasisFunction>();
  foreach (var flagged in basisFunctions.Where(candidate => candidate.IsDenominator)) {
    var copy = flagged.DeepCopy();
    copy.IsDenominator = false;
    copies.Add(copy);
  }
  return copies;
}
|
---|
| 146 |
|
---|
// Builds new regression problem data in which every basis function is one input variable
// (one column holding its evaluation on problemData) and the unmodified target variable is the
// last column. Training and test partitions are copied from problemData.
// A column is named after basisFunc.ToString(); " * <target variable>" is appended when the
// function's IsDenominator flag is not set.
public static IRegressionProblemData PrepareData(IRegressionProblemData problemData, IEnumerable<IBasisFunction> basisFunctions) {
  // materialize once: the sequence may be lazy and is needed for both sizing and filling
  var bases = basisFunctions.ToArray();
  int numRows = problemData.Dataset.Rows;
  int numCols = bases.Length;
  // an ordered list keeps the names aligned with the matrix columns
  var allowedInputVars = new List<string>(numCols);
  double[,] variableValues = new double[numRows, numCols + 1]; // +1 for target var

  for (int col = 0; col < numCols; col++) {
    var basisFunc = bases[col];
    allowedInputVars.Add(basisFunc.ToString() + (!basisFunc.IsDenominator ? " * " + problemData.TargetVariable : ""));
    var vals = basisFunc.Evaluate(problemData);
    for (int i = 0; i < numRows; i++) {
      variableValues[i, col] = vals[i];
    }
  }

  // add the unmodified target variable to the dataset (last column)
  var allVariables = new List<string>(allowedInputVars) { problemData.TargetVariable };
  var targetVals = problemData.TargetVariableValues.ToArray();
  for (int i = 0; i < numRows; i++) {
    variableValues[i, numCols] = targetVals[i];
  }

  var dataset = new Dataset(allVariables, variableValues);

  IRegressionProblemData rpd = new RegressionProblemData(dataset, allowedInputVars, problemData.TargetVariable);
  rpd.TrainingPartition.Start = problemData.TrainingPartition.Start;
  rpd.TrainingPartition.End = problemData.TrainingPartition.End;
  rpd.TestPartition.Start = problemData.TestPartition.Start;
  rpd.TestPartition.End = problemData.TestPartition.End;
  return rpd;
}
|
---|
| 181 |
|
---|
// Returns problem data over a normalized copy of data.Dataset with the same allowed input
// variables, target variable and training/test partitions.
// X_avgs / X_stds receive the average and population standard deviation of each allowed input
// variable (in the order of data.AllowedInputVariables), y_avg / y_std those of the target.
// Standard deviations of exactly 0 are reported as 1.
// NOTE(review): the dataset itself is normalized by Normalize(IDataset); presumably it uses the
// same statistics as reported here - confirm against Utils.Normalize.
public static IRegressionProblemData Normalize(IRegressionProblemData data, out double[] X_avgs, out double[] X_stds, out double y_avg, out double y_std) {
  X_avgs = data.AllowedInputVariables
    .Select(varname => data.Dataset.GetDoubleValues(varname).Average())
    .ToArray();
  X_stds = data.AllowedInputVariables
    .Select(varname => data.Dataset.GetDoubleValues(varname).StandardDeviationPop())
    .ToArray();
  for (int i = 0; i < X_stds.Length; i++) {
    if (X_stds[i] == 0) X_stds[i] = 1;
  }
  y_avg = data.TargetVariableValues.Average();
  y_std = data.TargetVariableValues.StandardDeviationPop();
  if (y_std == 0) y_std = 1;

  // normalize the dataset exactly once
  var normalizedDataset = Normalize(data.Dataset);
  var ans = new RegressionProblemData(normalizedDataset, data.AllowedInputVariables, data.TargetVariable);

  // keep the caller's partitions instead of the defaults of the newly created problem data
  ans.TrainingPartition.Start = data.TrainingPartition.Start;
  ans.TrainingPartition.End = data.TrainingPartition.End;
  ans.TestPartition.Start = data.TestPartition.Start;
  ans.TestPartition.End = data.TestPartition.End;
  return ans;
}
|
---|
[17737] | 201 |
|
---|
// return a normalized version of IDataset ds
// every variable is passed through Utils.Normalize; variable names and their order are kept
// throws ArgumentException if ds contains variables that are not double variables
private static IDataset Normalize(IDataset ds) {
  var doubleNames = ds.DoubleVariables.ToArray();
  if (ds.VariableNames.Count() != doubleNames.Length)
    throw new ArgumentException("All variables of the dataset must be double variables.", nameof(ds));

  var variableVals = new List<List<double>>();
  foreach (var name in doubleNames) {
    var vals = Utils.Normalize(ds.GetDoubleValues(name).ToArray());
    variableVals.Add(vals.ToList());
  }
  return new Dataset(doubleNames, variableVals);
}
|
---|
| 213 |
|
---|
// true if the sequence contains neither NaN nor +/- infinity (an empty sequence is accepted)
private static bool Ok(IEnumerable<double> data) {
  return !data.Any(value => double.IsNaN(value) || double.IsInfinity(value));
}
|
---|
| 215 | }
|
---|
| 216 | }
|
---|