source: branches/3022-FastFunctionExtraction/FFX/BFUtils.cs @ 17779

Last change on this file since 17779 was 17779, checked in by gkronber, 12 months ago

#3022: made a few changes while reviewing the code.

File size: 11.6 KB
Line 
1using HeuristicLab.Algorithms.DataAnalysis.Glmnet;
2using HeuristicLab.Common;
3using HeuristicLab.Data;
4using HeuristicLab.Problems.DataAnalysis;
5using System;
6using System.Collections.Generic;
7using System.Linq;
8using System.Runtime.CompilerServices;
9
10[assembly: InternalsVisibleTo("UnitTests")]
11namespace HeuristicLab.Algorithms.DataAnalysis.FastFunctionExtraction {
12    // utility functions for creating Basis Functions
13    internal static class BFUtils {
// Builds the full set of candidate basis functions for the given problem data.
// Which families are generated (exponents, nonlinear operators, hinges,
// interactions, denominators) is controlled by the flags on `approach`.
public static IEnumerable<IBasisFunction> CreateBasisFunctions(IRegressionProblemData data, Approach approach) {
    // fall back to the identity exponent / "no operator" when the respective feature is switched off
    var allowedExponents = approach.AllowExp
        ? approach.Exponents
        : new HashSet<double> { 1 };
    var allowedOperators = approach.AllowNonLinearFunctions
        ? approach.NonLinearFunctions
        : new HashSet<NonlinearOperator> { NonlinearOperator.None };

    var univariate = CreateSimpleBases(data, allowedExponents, allowedOperators);

    if (approach.AllowHinge) {
        // hinge functions are only derived from plain linear features
        // (exponent 1, no operator); anything else is deemed too complex
        var linearOnly = univariate.Where(bf => bf.Exponent == 1 && bf.Operator.Equals(NonlinearOperator.None));
        var hinges = CreateHingeBases(data, linearOnly, approach.MinHingeThreshold, approach.MaxHingeThreshold, approach.NumHingeThresholds);
        univariate = univariate.Concat(hinges);
    }

    IEnumerable<IBasisFunction> result = univariate;

    if (approach.AllowInteractions) {
        // interactions are built from all univariate bases, including the hinges
        result = result.Concat(CreateMultivariateBases(data, univariate.ToArray()));
    }

    if (approach.AllowDenominators) {
        // derived from everything collected so far (univariate + interactions)
        result = result.Concat(CreateDenominatorBases(result));
    }

    return result;
}
38
// Creates the univariate basis functions op(x^exp) for every allowed input
// variable, every exponent and every nonlinear operator. Candidates whose
// evaluation on the problem data contains NaN or infinite values are dropped.
public static IEnumerable<ISimpleBasisFunction> CreateSimpleBases(IRegressionProblemData problemData, HashSet<double> exponents, HashSet<NonlinearOperator> nonLinearFunctions) {
    var result = new List<ISimpleBasisFunction>();

    foreach (var variableName in problemData.AllowedInputVariables) {
        var smallest = problemData.Dataset.GetDoubleValues(variableName).ToArray().Min();
        // abs() changes nothing for a variable that never goes below zero
        bool absIsNoOp = smallest >= 0;

        foreach (var exponent in exponents) {
            var plain = new SimpleBasisFunction(variableName, exponent, NonlinearOperator.None);
            // a basis that is already invalid without an operator cannot become
            // valid by wrapping it in one -> skip this exponent entirely
            if (!Ok(plain.Evaluate(problemData))) continue;

            bool evenPower = exponent == 2.0 || exponent == -2.0;

            foreach (NonlinearOperator op in nonLinearFunctions) {
                if (op.Equals(NonlinearOperator.Abs)) {
                    // abs(x^2) and abs(x^-2) duplicate the operator-free variant when that one is generated as well
                    if (evenPower && nonLinearFunctions.Contains(NonlinearOperator.None)) continue;
                    if (absIsNoOp) continue;
                }

                var candidate = (SimpleBasisFunction)plain.DeepCopy();
                candidate.Operator = op;
                if (Ok(candidate.Evaluate(problemData))) {
                    result.Add(candidate);
                }
            }
        }
    }

    return result;
}
62
// Creates product (interaction) bases from the given univariate bases.
// The univariate bases are first ranked by importance; then squares of all
// exponent-1 bases are added, followed by pairwise products until the number
// of generated bases reaches twice the number of univariate bases.
public static IEnumerable<IBasisFunction> CreateMultivariateBases(IRegressionProblemData data, IList<ISimpleBasisFunction> univariateBases) {
    var ranked = OrderBasisFuncsByImportance(data, univariateBases).ToArray();
    int limit = 2 * ranked.Length;

    // squares: only for bases without an exponent of their own
    var products = ranked
        .Where(bf => bf.Exponent == 1)
        .Select(bf => (IBasisFunction)new ProductBaseFunction(bf, bf, true))
        .ToList();

    // pairwise products; the limit is only checked after a pair has been added
    for (int i = 0; i < ranked.Length; i++) {
        var outer = ranked[i];
        for (int j = 0; j < i; j++) {
            var inner = ranked[j];
            // disallow op() * op(); deemed too complex
            if (inner.Operator != NonlinearOperator.None) continue;

            var product = new ProductBaseFunction(outer, inner, true);
            if (!Ok(product.Evaluate(data))) continue;

            products.Add(product);
            if (products.Count >= limit) {
                return products;
            }
        }
    }

    return products;
}
87
// Orders basis functions by decreasing importance.
// The importance of a basis function is the absolute value of its coefficient
// in a linear model fitted on the normalized data, so strongly negative
// coefficients rank as high as strongly positive ones.
// The returned sequence is lazy and indexes into candidateFunctions.
public static IEnumerable<ISimpleBasisFunction> OrderBasisFuncsByImportance(IRegressionProblemData data, IList<ISimpleBasisFunction> candidateFunctions) {
    var elnetData = PrepareData(Normalize(data, out _, out _, out _, out _), candidateFunctions);
    // both regularization arguments are 0 -> LS-fit; the reported errors are not needed
    var coeff = ElasticNetLinearRegression.CalculateModelCoefficients(elnetData, 0, 0, out _, out _);

    // the last entry is the intercept and does not belong to any basis function
    var importance = coeff
        .Take(coeff.Length - 1)
        .Select(c => Math.Abs(c))
        .ToArray();

    // the argsort result is reversed to obtain decreasing importance
    var order = Utils.Argsort(importance);
    Array.Reverse(order);
    return order.Select(idx => candidateFunctions[idx]);
}
99
// Creates hinge basis functions for each of the given simple basis functions
// and returns them in one list (in the order of simple_bfs).
// The thresholds are placed between relative_start_thr and relative_end_thr
// (relative to the observed value range); trainingPartition defaults to the
// training partition of data when null.
public static IList<ISimpleBasisFunction> CreateHingeBases(IRegressionProblemData data, IEnumerable<ISimpleBasisFunction> simple_bfs,
  double relative_start_thr = 0.0, double relative_end_thr = 1.0, int num_thrs = 3, IntRange trainingPartition = null) {
    return simple_bfs
        .SelectMany(bf => CreateHingeBases(data, bf, relative_start_thr, relative_end_thr, num_thrs, trainingPartition))
        .ToList();
}
109
// Creates the hinge basis functions (one GT_Hinge and one LT_Hinge per
// threshold) for a single simple basis function.
// The thresholds come from Utils.Linspace (presumably evenly spaced - confirm
// there) between min + relative_start_thr * range and
// min + relative_end_thr * range, where min and range are taken from the values
// of simple_bf inside the training partition.
// Only the Feature of simple_bf is carried over; the hinges always use exponent 1.
// Returns an empty list when no usable value range exists (empty partition,
// constant feature, or NaN / infinite values).
// Throws ArgumentException when relative_start_thr >= relative_end_thr.
private static IEnumerable<ISimpleBasisFunction> CreateHingeBases(IRegressionProblemData data, ISimpleBasisFunction simple_bf,
  double relative_start_thr, double relative_end_thr, int num_thrs, IntRange trainingPartition) {
    if (relative_start_thr >= relative_end_thr) throw new ArgumentException($"{nameof(relative_start_thr)} must be smaller than {nameof(relative_end_thr)}.");
    var ans = new List<ISimpleBasisFunction>();

    var vals = simple_bf.Evaluate(data);
    var partition = trainingPartition ?? data.TrainingPartition;
    // without any rows there is no value range to place thresholds in
    if (partition.Start >= partition.End) return ans;

    double min = double.PositiveInfinity;
    double max = double.NegativeInfinity;
    for (int i = partition.Start; i < partition.End; i++) {
        min = Math.Min(min, vals[i]);
        max = Math.Max(max, vals[i]);
    }

    var full_range = max - min;
    // rejects a constant feature (range 0) as well as NaN or infinite ranges,
    // which would otherwise lead to non-finite thresholds
    if (!(full_range > 0) || double.IsInfinity(full_range)) return ans;

    var start_thr = min + relative_start_thr * full_range;
    var end_thr = min + relative_end_thr * full_range;
    var thresholds = Utils.Linspace(start_thr, end_thr, num_thrs);

    foreach (var thr in thresholds) {
        ans.Add(new SimpleBasisFunction(simple_bf.Feature, 1, NonlinearOperator.GT_Hinge, true, thr));
        ans.Add(new SimpleBasisFunction(simple_bf.Feature, 1, NonlinearOperator.LT_Hinge, true, thr));
    }
    return ans;
}
135
// For every basis function whose IsDenominator flag is set, creates a deep
// copy with the flag cleared. Functions with the flag unset are skipped.
// The input functions themselves are not modified.
// NOTE(review): the method name suggests the copies are the denominator
// variants, yet they are the ones with IsDenominator == false - confirm the
// intended meaning of the flag against the basis function classes.
public static IEnumerable<IBasisFunction> CreateDenominatorBases(IEnumerable<IBasisFunction> basisFunctions) {
    var copies = new List<IBasisFunction>();
    foreach (var flagged in basisFunctions.Where(f => f.IsDenominator)) {
        var copy = flagged.DeepCopy();
        copy.IsDenominator = false;
        copies.Add(copy);
    }
    return copies;
}
146
// Builds a new regression problem in which every basis function is one input
// column (its values evaluated on problemData) and the unmodified target
// variable is the last column. Training and test partitions are copied from
// problemData.
// The basis functions are enumerated exactly once, and the variable names are
// kept in an ordered list so that the i-th name always belongs to the i-th
// column of the value matrix.
public static IRegressionProblemData PrepareData(IRegressionProblemData problemData, IEnumerable<IBasisFunction> basisFunctions) {
    // materialize once: the sequence may be lazy and is needed for both sizing and filling
    var bases = basisFunctions.ToArray();

    int numRows = problemData.Dataset.Rows;
    int numCols = bases.Length;
    var allowedInputVars = new List<string>(numCols);
    double[,] variableValues = new double[numRows, numCols + 1]; // +1 for target var

    for (int col = 0; col < numCols; col++) {
        var basisFunc = bases[col];
        // NOTE(review): the " * target" suffix is appended when IsDenominator is
        // false - confirm that this matches the intended meaning of the flag
        allowedInputVars.Add(basisFunc.ToString() + (!basisFunc.IsDenominator ? " * " + problemData.TargetVariable : ""));
        var vals = basisFunc.Evaluate(problemData);
        for (int i = 0; i < numRows; i++) {
            variableValues[i, col] = vals[i];
        }
    }

    // add the unmodified target variable to the dataset (last column)
    var allVariables = new List<string>(allowedInputVars) { problemData.TargetVariable };
    var targetVals = problemData.TargetVariableValues.ToArray();
    for (int i = 0; i < numRows; i++) {
        variableValues[i, numCols] = targetVals[i];
    }

    var dataset = new Dataset(allVariables, variableValues);

    IRegressionProblemData rpd = new RegressionProblemData(dataset, allowedInputVars, problemData.TargetVariable);
    rpd.TrainingPartition.Start = problemData.TrainingPartition.Start;
    rpd.TrainingPartition.End = problemData.TrainingPartition.End;
    rpd.TestPartition.Start = problemData.TestPartition.Start;
    rpd.TestPartition.End = problemData.TestPartition.End;
    return rpd;
}
181
// Returns a copy of the problem data whose dataset has been normalized
// column by column, and reports the statistics of the original data:
//   X_avgs / X_stds: mean and population standard deviation of each allowed
//                    input variable (in the order of AllowedInputVariables)
//   y_avg / y_std:   mean and population standard deviation of the target
// Standard deviations of exactly 0 are reported as 1.
// NOTE(review): the result is built with the three-argument constructor and
// the training/test partitions of data are not copied here - confirm that
// callers set them where they matter.
public static IRegressionProblemData Normalize(IRegressionProblemData data, out double[] X_avgs, out double[] X_stds, out double y_avg, out double y_std) {
    X_avgs = data.AllowedInputVariables
        .Select(varname => data.Dataset.GetDoubleValues(varname).Average())
        .ToArray();
    X_stds = data.AllowedInputVariables
        .Select(varname => data.Dataset.GetDoubleValues(varname).StandardDeviationPop())
        .ToArray();
    for (int i = 0; i < X_stds.Length; i++) {
        if (X_stds[i] == 0) X_stds[i] = 1;
    }

    y_avg = data.TargetVariableValues.Average();
    y_std = data.TargetVariableValues.StandardDeviationPop();
    if (y_std == 0) y_std = 1;

    // normalize the dataset once and reuse the result
    var normalizedDataset = Normalize(data.Dataset);
    return new RegressionProblemData(normalizedDataset, data.AllowedInputVariables, data.TargetVariable);
}
201
// Returns a new dataset in which every variable of ds has been passed through
// Utils.Normalize. Only datasets consisting exclusively of double variables
// are supported; otherwise an ArgumentException is thrown.
private static IDataset Normalize(IDataset ds) {
    var names = ds.DoubleVariables.ToArray();
    if (names.Length != ds.VariableNames.Count()) throw new ArgumentException(nameof(ds));

    var columns = names
        .Select(name => Utils.Normalize(ds.GetDoubleValues(name).ToArray()).ToList())
        .ToList();
    return new Dataset(names, columns);
}
213
// True when the sequence contains no NaN and no infinite value (an empty sequence is fine).
private static bool Ok(IEnumerable<double> data) => !data.Any(x => double.IsNaN(x) || double.IsInfinity(x));
215    }
216}
Note: See TracBrowser for help on using the repository browser.