Context Navigation

GAM.cs @ 15576

Visit:

Last change on this file since 15576 was 15469, checked in by gkronber, 7 years ago
#2789 trying to get SBART to work correctly.
File size: 16.0 KB

Line
1	#region License Information
2	/* HeuristicLab
3	* Copyright (C) 2002-2016 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4	*
5	* This file is part of HeuristicLab.
6	*
7	* HeuristicLab is free software: you can redistribute it and/or modify
8	* it under the terms of the GNU General Public License as published by
9	* the Free Software Foundation, either version 3 of the License, or
10	* (at your option) any later version.
11	*
12	* HeuristicLab is distributed in the hope that it will be useful,
13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	* GNU General Public License for more details.
16	*
17	* You should have received a copy of the GNU General Public License
18	* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19	*/
20	#endregion
21
22	using System;
23	using System.Collections.Generic;
24	using System.Linq;
25	using System.Threading;
26	using HeuristicLab.Analysis;
27	using HeuristicLab.Common;
28	using HeuristicLab.Core;
29	using HeuristicLab.Data;
30	using HeuristicLab.Optimization;
31	using HeuristicLab.Parameters;
32	using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
33	using HeuristicLab.Problems.DataAnalysis;
34
35	namespace HeuristicLab.Algorithms.DataAnalysis.Experimental {
36	// UNFINISHED
37	[Item("Generalized Additive Modelling", "GAM")]
38	[Creatable(CreatableAttribute.Categories.DataAnalysisRegression, Priority = 102)]
39	[StorableClass]
40	public sealed class GAM : FixedDataAnalysisAlgorithm<IRegressionProblem> {
41
// Keys under which the three algorithm parameters are registered in the parameter collection.
private const string LambdaParameterName = "Lambda";
private const string MaxIterationsParameterName = "Max iterations";
private const string MaxInteractionsParameterName = "Max interactions";

/// <summary>Parameter object registered under the key "Lambda".</summary>
public IFixedValueParameter<DoubleValue> LambdaParameter {
  get {
    var parameter = Parameters[LambdaParameterName];
    return (IFixedValueParameter<DoubleValue>)parameter;
  }
}

/// <summary>Parameter object registered under the key "Max iterations".</summary>
public IFixedValueParameter<IntValue> MaxIterationsParameter {
  get {
    var parameter = Parameters[MaxIterationsParameterName];
    return (IFixedValueParameter<IntValue>)parameter;
  }
}

/// <summary>Parameter object registered under the key "Max interactions".</summary>
public IFixedValueParameter<IntValue> MaxInteractionsParameter {
  get {
    var parameter = Parameters[MaxInteractionsParameterName];
    return (IFixedValueParameter<IntValue>)parameter;
  }
}

/// <summary>Convenience accessor for the value stored in <see cref="LambdaParameter"/>.</summary>
public double Lambda {
  get {
    var holder = LambdaParameter.Value;
    return holder.Value;
  }
  set {
    var holder = LambdaParameter.Value;
    holder.Value = value;
  }
}

/// <summary>Convenience accessor for the value stored in <see cref="MaxIterationsParameter"/>.</summary>
public int MaxIterations {
  get {
    var holder = MaxIterationsParameter.Value;
    return holder.Value;
  }
  set {
    var holder = MaxIterationsParameter.Value;
    holder.Value = value;
  }
}

/// <summary>Convenience accessor for the value stored in <see cref="MaxInteractionsParameter"/>.</summary>
public int MaxInteractions {
  get {
    var holder = MaxInteractionsParameter.Value;
    return holder.Value;
  }
  set {
    var holder = MaxInteractionsParameter.Value;
    holder.Value = value;
  }
}
68
/// <summary>
/// Constructor marked with <c>[StorableConstructor]</c>; it only forwards the flag to the base class.
/// </summary>
[StorableConstructor]
private GAM(bool deserializing)
  : base(deserializing) {
}

/// <summary>
/// Hook registered for <c>HookType.AfterDeserialization</c>. There is currently nothing to do here.
/// </summary>
[StorableHook(HookType.AfterDeserialization)]
private void AfterDeserialization() {
  // intentionally empty
}

/// <summary>
/// Cloning constructor; all state is copied by the base class constructor.
/// </summary>
private GAM(GAM original, Cloner cloner)
  : base(original, cloner) {
}

/// <summary>
/// Creates a copy of this algorithm by calling the cloning constructor with the given <paramref name="cloner"/>.
/// </summary>
public override IDeepCloneable Clone(Cloner cloner) {
  var copy = new GAM(this, cloner);
  return copy;
}
81
/// <summary>
/// Creates the algorithm with a fresh <c>RegressionProblem</c> and registers the three
/// parameters with their defaults: Lambda = 1.0, Max iterations = 100, Max interactions = 1.
/// </summary>
public GAM()
  : base() {
  Problem = new RegressionProblem();

  var lambdaParameter = new FixedValueParameter<DoubleValue>(
    LambdaParameterName,
    "Regularization for smoothing splines",
    new DoubleValue(1.0));
  Parameters.Add(lambdaParameter);

  var maxIterationsParameter = new FixedValueParameter<IntValue>(
    MaxIterationsParameterName,
    "",
    new IntValue(100));
  Parameters.Add(maxIterationsParameter);

  var maxInteractionsParameter = new FixedValueParameter<IntValue>(
    MaxInteractionsParameterName,
    "",
    new IntValue(1));
  Parameters.Add(maxInteractionsParameter);
}
89
90
/// <summary>
/// Fits an additive model with the backfitting algorithm
/// (see The Elements of Statistical Learning page 298).
/// One term is created for every combination of 1 .. <see cref="MaxInteractions"/> allowed input
/// variables. In each iteration every term is re-fitted (via <c>RegressSpline</c>) to the residuals
/// of all other terms. The results collection receives a table named "RMSE" (training and test rows),
/// a matrix named "RSS Values" and finally the "Ensemble solution".
/// </summary>
/// <param name="cancellationToken">Checked once after every backfitting iteration; when cancellation
/// is requested the loop stops and the solution is built from the terms fitted so far.</param>
/// <exception cref="ArgumentException">Max interactions is outside [1 .. 5].</exception>
protected override void Run(CancellationToken cancellationToken) {
  double lambda = Lambda;
  int maxIters = MaxIterations;
  int maxInteractions = MaxInteractions;
  if (maxInteractions < 1 || maxInteractions > 5) throw new ArgumentException("Max interactions is outside the valid range [1 .. 5]");

  var problemData = Problem.ProblemData;
  var avgY = problemData.TargetVariableTrainingValues.Average();
  var inputVars = problemData.AllowedInputVariables.ToArray();

  // Enumerate the variable combinations exactly once; every combination is one additive term.
  var combinations = new List<string[]>();
  for (int size = 1; size <= maxInteractions; size++) {
    combinations.AddRange(inputVars.Combinations(size).Select(c => c.ToArray()));
  }

  // All terms start as the zero function; the target average is added as a separate constant term at the end.
  var f = new IRegressionModel[combinations.Count];
  for (int i = 0; i < f.Length; i++) {
    f[i] = new ConstantModel(0.0, problemData.TargetVariable);
  }

  // The "RMSE" rows record the standard deviation of the residuals of the full model.
  var rmseTable = new DataTable("RMSE");
  var rmseRow = new DataRow("RMSE (train)");
  var rmseRowTest = new DataRow("RMSE (test)");
  rmseTable.Rows.Add(rmseRow);
  rmseTable.Rows.Add(rmseRowTest);

  Results.Add(new Result("RMSE", rmseTable));
  rmseRow.Values.Add(CalculateResiduals(problemData, f, -1, avgY, problemData.TrainingIndices).StandardDeviation()); // -1 index to use all predictors
  rmseRowTest.Values.Add(CalculateResiduals(problemData, f, -1, avgY, problemData.TestIndices).StandardDeviation());

  // for analytics: variance of the partial residuals each term is fitted to, labelled "f(var1,var2,...)"
  var rss = new double[f.Length];
  var terms = combinations.Select(c => string.Format("f({0})", string.Join(",", c))).ToArray();
  Results.Add(new Result("RSS Values", typeof(DoubleMatrix)));

  // Backfitting: runs for a fixed number of iterations (there is no convergence test).
  for (int iter = 0; iter < maxIters; iter++) {
    for (int j = 0; j < combinations.Count; j++) {
      // residuals of all terms except term j are the target for term j
      var res = CalculateResiduals(problemData, f, j, avgY, problemData.TrainingIndices);
      rss[j] = res.Variance();
      f[j] = RegressSpline(problemData, combinations[j], res, lambda);
    }

    rmseRow.Values.Add(CalculateResiduals(problemData, f, -1, avgY, problemData.TrainingIndices).StandardDeviation()); // -1 index to use all predictors
    rmseRowTest.Values.Add(CalculateResiduals(problemData, f, -1, avgY, problemData.TestIndices).StandardDeviation());

    // calculate table with residual contributions of each term
    var rssTable = new DoubleMatrix(rss.Length, 1, new string[] { "RSS" }, terms);
    for (int i = 0; i < rss.Length; i++) rssTable[i, 0] = rss[i];
    Results["RSS Values"].Value = rssTable;

    if (cancellationToken.IsCancellationRequested) break;
  }

  // The final model is the sum (not the average) of all terms plus the constant target average.
  var model = new RegressionEnsembleModel(f.Concat(new[] { new ConstantModel(avgY, problemData.TargetVariable) }));
  model.AverageModelEstimates = false;
  var solution = model.CreateRegressionSolution((IRegressionProblemData)problemData.Clone());
  Results.Add(new Result("Ensemble solution", solution));
}
179
/// <summary>
/// Calculates the partial residuals <c>y - avgY - sum of f[k] for k != j</c> for the given rows.
/// </summary>
/// <param name="problemData">Supplies the dataset and the name of the target variable.</param>
/// <param name="f">The current term models.</param>
/// <param name="j">Index of the term to leave out; pass -1 to subtract the predictions of all terms.</param>
/// <param name="avgY">Constant offset subtracted from every target value.</param>
/// <param name="rows">Rows for which the residuals are calculated.</param>
/// <returns>One residual per row, in the order of <paramref name="rows"/>.</returns>
private double[] CalculateResiduals(IRegressionProblemData problemData, IRegressionModel[] f, int j, double avgY, IEnumerable<int> rows) {
  // rows is needed once for the target and once per term; materialize it so that a
  // lazily evaluated sequence is enumerated only a single time
  var rowArray = rows as int[] ?? rows.ToArray();
  var dataset = problemData.Dataset;

  double[] t = dataset.GetDoubleValues(problemData.TargetVariable, rowArray)
    .Select(yi => yi - avgY)
    .ToArray();

  // subtract the predictions of all other terms
  for (int k = 0; k < f.Length; k++) {
    if (k == j) continue;
    var pred = f[k].GetEstimatedValues(dataset, rowArray).ToArray();
    for (int i = 0; i < t.Length; i++) {
      t[i] -= pred[i];
    }
  }
  return t;
}
195
/// <summary>
/// Fits a linear regression model of <paramref name="target"/> on the single variable <paramref name="inputVar"/>.
/// </summary>
/// <param name="problemData">Original problem data; supplies the dataset and the partitions.</param>
/// <param name="inputVar">Name of the only input variable of the model.</param>
/// <param name="target">Target values, one per training row, in the order of <c>problemData.TrainingIndices</c>.</param>
/// <returns>The model of the linear regression solution.</returns>
/// <exception cref="ArgumentException">The number of target values differs from the number of training rows.</exception>
private IRegressionModel RegressLR(IRegressionProblemData problemData, string inputVar, double[] target) {
  var trainingRows = problemData.TrainingIndices.ToArray();
  if (trainingRows.Length != target.Length) throw new ArgumentException("The number of target values does not match the number of training rows.", "target");

  // Cumbersome: the target column of a modifiable copy of the dataset has to be replaced.
  var ds = ((Dataset)problemData.Dataset).ToModifiable();

  // Write each target value to the dataset row it belongs to. Simply prepending the values
  // would misalign them whenever the training rows are not exactly the rows 0 .. n-1.
  // Rows that are not used for training keep the value 0.0.
  var targetColumn = new double[ds.Rows];
  for (int i = 0; i < trainingRows.Length; i++) {
    targetColumn[trainingRows[i]] = target[i];
  }
  ds.ReplaceVariable(problemData.TargetVariable, targetColumn.ToList());

  var pd = new RegressionProblemData(ds, new string[] { inputVar }, problemData.TargetVariable);
  pd.TrainingPartition.Start = problemData.TrainingPartition.Start;
  pd.TrainingPartition.End = problemData.TrainingPartition.End;
  pd.TestPartition.Start = problemData.TestPartition.Start;
  pd.TestPartition.End = problemData.TestPartition.End;
  double rmsError, cvRmsError;
  return LinearRegression.CreateLinearRegressionSolution(pd, out rmsError, out cvRmsError).Model;
}
208
209	// private IRegressionModel RegressSpline(IRegressionProblemData problemData, string inputVar, double[] target, double lambda) {
210	// if (problemData.Dataset.VariableHasType<double>(inputVar)) {
211	// // Umständlich!
212	// return Splines.CalculatePenalizedRegressionSpline(
213	// problemData.Dataset.GetDoubleValues(inputVar, problemData.TrainingIndices).ToArray(),
214	// (double[])target.Clone(), lambda,
215	// problemData.TargetVariable, new string[] { inputVar }
216	// );
217	// } else return new ConstantModel(target.Average(), problemData.TargetVariable);
218	// }
/// <summary>
/// Fits one additive term. The training values of all <paramref name="inputVars"/> are multiplied
/// element-wise and a model of <paramref name="target"/> over this product is calculated with
/// <c>SBART.CalculateSBART</c> using unit weights. If any of the variables is not of type double,
/// a constant model with the average of <paramref name="target"/> is returned instead.
/// </summary>
/// <param name="problemData">Supplies the dataset, the training rows and the target variable name.</param>
/// <param name="inputVars">Names of the variables that form the term.</param>
/// <param name="target">Values to fit; the array is cloned before it is handed to the fitting routine.</param>
/// <param name="lambda">Currently not used by the active implementation.</param>
/// <returns>The fitted model for this term.</returns>
private IRegressionModel RegressSpline(IRegressionProblemData problemData, string[] inputVars, double[] target, double lambda) {
  var dataset = problemData.Dataset;

  bool allNumeric = inputVars.All(dataset.VariableHasType<double>);
  if (!allNumeric) {
    return new ConstantModel(target.Average(), problemData.TargetVariable);
  }

  // element-wise product of the training values of all variables of the term
  double[] product = inputVars.Skip(1).Aggregate(
    dataset.GetDoubleValues(inputVars.First(), problemData.TrainingIndices).ToArray(),
    (acc, name) => acc.Zip(dataset.GetDoubleValues(name, problemData.TrainingIndices), (pi, vi) => pi * vi).ToArray());

  // Earlier experiments used other smoothers at this point (CubicSplineGCV.CalculateCubicSpline,
  // Splines.CalculateSmoothingSplineReinsch, and Splines.CalculatePenalizedRegressionSpline with a
  // search over lambda and a sign test); they are disabled and SBART is used instead.
  SBART.SBART_Report rep;
  double[] w = Enumerable.Repeat(1.0, product.Length).ToArray();
  IRegressionModel model = SBART.CalculateSBART(product, (double[])target.Clone(), w, 10, problemData.TargetVariable, inputVars, out rep);

  // diagnostic output: term, GCV, sum of leverages, std. dev. of the product and of the target
  Console.WriteLine("{0} {1:N5} {2:N5} {3:N5} {4:N5}", string.Join(",", inputVars), rep.gcv, rep.leverage.Sum(), product.StandardDeviation(), target.StandardDeviation());
  return model;
}
279
/// <summary>
/// Fits a random forest regression model of <paramref name="target"/> on the single variable
/// <paramref name="inputVar"/>. If the variable is not of type double, a constant model with the
/// average of <paramref name="target"/> is returned instead.
/// </summary>
/// <param name="problemData">Original problem data; supplies the dataset and the partitions.</param>
/// <param name="inputVar">Name of the only input variable of the model.</param>
/// <param name="target">Target values, one per training row, in the order of <c>problemData.TrainingIndices</c>.</param>
/// <param name="lambda">Not used by this method.</param>
/// <returns>The fitted model.</returns>
/// <exception cref="ArgumentException">The number of target values differs from the number of training rows.</exception>
private IRegressionModel RegressRF(IRegressionProblemData problemData, string inputVar, double[] target, double lambda) {
  if (!problemData.Dataset.VariableHasType<double>(inputVar)) {
    return new ConstantModel(target.Average(), problemData.TargetVariable);
  }

  var trainingRows = problemData.TrainingIndices.ToArray();
  if (trainingRows.Length != target.Length) throw new ArgumentException("The number of target values does not match the number of training rows.", "target");

  // Cumbersome: the target column of a modifiable copy of the dataset has to be replaced.
  var ds = ((Dataset)problemData.Dataset).ToModifiable();

  // Write each target value to the dataset row it belongs to. Simply prepending the values
  // would misalign them whenever the training rows are not exactly the rows 0 .. n-1.
  // Rows that are not used for training keep the value 0.0.
  var targetColumn = new double[ds.Rows];
  for (int i = 0; i < trainingRows.Length; i++) {
    targetColumn[trainingRows[i]] = target[i];
  }
  ds.ReplaceVariable(problemData.TargetVariable, targetColumn.ToList());

  var pd = new RegressionProblemData(ds, new string[] { inputVar }, problemData.TargetVariable);
  pd.TrainingPartition.Start = problemData.TrainingPartition.Start;
  pd.TrainingPartition.End = problemData.TrainingPartition.End;
  pd.TestPartition.Start = problemData.TestPartition.Start;
  pd.TestPartition.End = problemData.TestPartition.End;
  double rmsError, oobRmsError;
  double avgRelError, oobAvgRelError;
  return RandomForestRegression.CreateRandomForestRegressionModel(pd, 100, 0.5, 0.5, 1234, out rmsError, out avgRelError, out oobRmsError, out oobAvgRelError);
}
295	}
296
297
298	// UNFINISHED
/// <summary>
/// Regression model that wraps an ALGLIB RBF model. Every input value is passed through the
/// transformation stored for its variable before the RBF model is evaluated.
/// </summary>
public class RBFModel : NamedItem, IRegressionModel {
  private alglib.rbfmodel model;
  private ITransformation<double>[] scaling;

  private string targetVariable;
  /// <summary>
  /// Name of the target variable. Assigning a different name raises <see cref="TargetVariableChanged"/>.
  /// </summary>
  public string TargetVariable {
    get { return targetVariable; }
    set {
      if (targetVariable == value) return;
      targetVariable = value;
      OnTargetVariableChanged();
    }
  }

  /// <summary>
  /// Names of the input variables, in the order in which their values are passed to the RBF model.
  /// </summary>
  public IEnumerable<string> VariablesUsedForPrediction { get; private set; }

  /// <summary>Raised after <see cref="TargetVariable"/> has been changed to a different name.</summary>
  public event EventHandler TargetVariableChanged;

  /// <summary>
  /// Cloning constructor: copies the variable names, the ALGLIB model and clones the transformations.
  /// </summary>
  public RBFModel(RBFModel orig, Cloner cloner) : base(orig, cloner) {
    this.targetVariable = orig.TargetVariable;
    this.VariablesUsedForPrediction = orig.VariablesUsedForPrediction.ToArray();
    this.model = (alglib.rbfmodel)orig.model.make_copy();
    this.scaling = orig.scaling.Select(s => cloner.Clone(s)).ToArray();
  }

  /// <summary>
  /// Creates a model from an ALGLIB RBF model.
  /// </summary>
  /// <param name="model">The RBF model to evaluate.</param>
  /// <param name="targetVar">Name of the target variable.</param>
  /// <param name="inputs">Names of the input variables; the array is copied.</param>
  /// <param name="scaling">Transformations for the inputs; the i-th transformation is applied to the i-th input.</param>
  /// <exception cref="ArgumentNullException"><paramref name="model"/> is null.</exception>
  public RBFModel(alglib.rbfmodel model, string targetVar, string[] inputs, IEnumerable<ITransformation<double>> scaling) : base("RBFModel", "RBFModel") {
    if (model == null) throw new ArgumentNullException("model");
    this.model = model;
    this.targetVariable = targetVar;
    // copy so that later changes to the caller's array cannot alter the model
    this.VariablesUsedForPrediction = inputs.ToArray();
    this.scaling = scaling.ToArray();
  }

  public override IDeepCloneable Clone(Cloner cloner) {
    return new RBFModel(this, cloner);
  }

  /// <summary>
  /// Creates a regression solution for this model and a clone of <paramref name="problemData"/>.
  /// </summary>
  public IRegressionSolution CreateRegressionSolution(IRegressionProblemData problemData) {
    return new RegressionSolution(this, (IRegressionProblemData)problemData.Clone());
  }

  /// <summary>
  /// Lazily evaluates the RBF model for the given rows of <paramref name="dataset"/>.
  /// </summary>
  /// <returns>One estimated value per row, in the order of <paramref name="rows"/>.</returns>
  public IEnumerable<double> GetEstimatedValues(IDataset dataset, IEnumerable<int> rows) {
    // materialize the variable names once instead of counting and enumerating the sequence for every row
    var variables = VariablesUsedForPrediction.ToArray();
    double[] x = new double[variables.Length];
    double[] y;
    foreach (var r in rows) {
      for (int c = 0; c < variables.Length; c++) {
        // Inefficient: the transformation only accepts sequences, so the single value is wrapped and unwrapped again.
        x[c] = scaling[c].Apply(dataset.GetDoubleValue(variables[c], r).ToEnumerable()).First();
      }
      alglib.rbfcalc(model, x, out y);
      yield return y[0];
    }
  }

  private void OnTargetVariableChanged() {
    var handler = TargetVariableChanged;
    if (handler != null) handler(this, EventArgs.Empty);
  }
}
344	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: branches/MathNetNumerics-Exploration-2789/HeuristicLab.Algorithms.DataAnalysis.Experimental/GAM.cs @ 15576

Download in other formats: