#region License Information
/* HeuristicLab
* Copyright (C) 2002-2016 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
*
* This file is part of HeuristicLab.
*
* HeuristicLab is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* HeuristicLab is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
*/
#endregion
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading;
using HeuristicLab.Analysis;
using HeuristicLab.Common;
using HeuristicLab.Core;
using HeuristicLab.Data;
using HeuristicLab.Optimization;
using HeuristicLab.Parameters;
using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
using HeuristicLab.Problems.DataAnalysis;
namespace HeuristicLab.Algorithms.DataAnalysis.Experimental {
// UNFINISHED
[Item("Generalized Additive Modelling", "GAM")]
[Creatable(CreatableAttribute.Categories.DataAnalysisRegression, Priority = 102)]
[StorableClass]
public sealed class GAM : FixedDataAnalysisAlgorithm {
private const string LambdaParameterName = "Lambda";
private const string MaxIterationsParameterName = "Max iterations";
private const string MaxInteractionsParameterName = "Max interactions";
public IFixedValueParameter<DoubleValue> LambdaParameter {
get { return (IFixedValueParameter<DoubleValue>)Parameters[LambdaParameterName]; }
}
public IFixedValueParameter<IntValue> MaxIterationsParameter {
get { return (IFixedValueParameter<IntValue>)Parameters[MaxIterationsParameterName]; }
}
public IFixedValueParameter<IntValue> MaxInteractionsParameter {
get { return (IFixedValueParameter<IntValue>)Parameters[MaxInteractionsParameterName]; }
}
public double Lambda {
get { return LambdaParameter.Value.Value; }
set { LambdaParameter.Value.Value = value; }
}
public int MaxIterations {
get { return MaxIterationsParameter.Value.Value; }
set { MaxIterationsParameter.Value.Value = value; }
}
public int MaxInteractions {
get { return MaxInteractionsParameter.Value.Value; }
set { MaxInteractionsParameter.Value.Value = value; }
}
[StorableConstructor]
private GAM(bool deserializing) : base(deserializing) { }
[StorableHook(HookType.AfterDeserialization)]
private void AfterDeserialization() {
}
private GAM(GAM original, Cloner cloner)
: base(original, cloner) {
}
public override IDeepCloneable Clone(Cloner cloner) {
return new GAM(this, cloner);
}
public GAM()
: base() {
Problem = new RegressionProblem();
Parameters.Add(new FixedValueParameter<DoubleValue>(LambdaParameterName, "Regularization strength for the smoothing splines.", new DoubleValue(1.0)));
Parameters.Add(new FixedValueParameter<IntValue>(MaxIterationsParameterName, "Maximum number of backfitting iterations.", new IntValue(100)));
Parameters.Add(new FixedValueParameter<IntValue>(MaxInteractionsParameterName, "Maximum number of variables per interaction term (1..5).", new IntValue(1)));
}
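// usage sketch (illustrative only; myProblemData is a hypothetical IRegressionProblemData,
// and the standard IExecutable workflow is assumed):
// var gam = new GAM { Lambda = 1.0, MaxIterations = 50, MaxInteractions = 2 };
// ((RegressionProblem)gam.Problem).ProblemData = myProblemData;
// gam.Start(); // results appear in gam.Results ("RMSE", "RSS Values", "Ensemble solution")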
protected override void Run(CancellationToken cancellationToken) {
double lambda = Lambda;
int maxIters = MaxIterations;
int maxInteractions = MaxInteractions;
if (maxInteractions < 1 || maxInteractions > 5) throw new ArgumentException("Max interactions is outside the valid range [1 .. 5]");
// calculates a GAM model as an additive combination of independent non-linear functions
// of each variable (and variable combination), fit with the backfitting algorithm
// (see The Elements of Statistical Learning, page 298)
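// in each iteration every term j is refit against its partial residual
// r_j = y - avg(y) - sum_{k != j} f_k(x), which CalculateResiduals computes below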
var problemData = Problem.ProblemData;
var y = problemData.TargetVariableTrainingValues.ToArray();
var avgY = y.Average();
var inputVars = Problem.ProblemData.AllowedInputVariables.ToArray();
int nTerms = 0; // one term for each variable combination up to maxInteractions
for (int i = 1; i <= maxInteractions; i++) {
nTerms += inputVars.Combinations(i).Count();
}
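// e.g. 4 input variables and maxInteractions = 2 yield C(4,1) + C(4,2) = 4 + 6 = 10 terms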
IRegressionModel[] f = new IRegressionModel[nTerms];
for (int i = 0; i < f.Length; i++) {
f[i] = new ConstantModel(0.0, problemData.TargetVariable);
}
var rmseTable = new DataTable("RMSE");
var rmseRow = new DataRow("RMSE (train)");
var rmseRowTest = new DataRow("RMSE (test)");
rmseTable.Rows.Add(rmseRow);
rmseTable.Rows.Add(rmseRowTest);
Results.Add(new Result("RMSE", rmseTable));
rmseRow.Values.Add(CalculateResiduals(problemData, f, -1, avgY, problemData.TrainingIndices).StandardDeviation()); // -1 index to use all predictors
rmseRowTest.Values.Add(CalculateResiduals(problemData, f, -1, avgY, problemData.TestIndices).StandardDeviation());
// for analytics
double[] rss = new double[f.Length];
string[] terms = new string[f.Length];
Results.Add(new Result("RSS Values", typeof(DoubleMatrix)));
var combinations = new List<string[]>();
for (int i = 1; i <= maxInteractions; i++)
combinations.AddRange(HeuristicLab.Common.EnumerableExtensions.Combinations(inputVars, i).Select(c => c.ToArray()));
// combinations.Add(new string[] { "X1", "X2" });
// combinations.Add(new string[] { "X3", "X4" });
// combinations.Add(new string[] { "X5", "X6" });
// combinations.Add(new string[] { "X1", "X7", "X9" });
// combinations.Add(new string[] { "X3", "X6", "X10" });
// until convergence
int iters = 0;
while (iters++ < maxIters) {
int j = 0;
// alternative: fit one linear model per input variable with RegressLR (see below) instead of spline terms
foreach (var element in combinations) {
var res = CalculateResiduals(problemData, f, j, avgY, problemData.TrainingIndices);
rss[j] = res.Variance();
terms[j] = string.Format("f({0})", string.Join(",", element));
f[j] = RegressSpline(problemData, element.ToArray(), res, lambda);
j++;
}
rmseRow.Values.Add(CalculateResiduals(problemData, f, -1, avgY, problemData.TrainingIndices).StandardDeviation()); // -1 index to use all predictors
rmseRowTest.Values.Add(CalculateResiduals(problemData, f, -1, avgY, problemData.TestIndices).StandardDeviation());
// calculate table with residual contributions of each term
var rssTable = new DoubleMatrix(rss.Length, 1, new string[] { "RSS" }, terms);
for (int i = 0; i < rss.Length; i++) rssTable[i, 0] = rss[i];
Results["RSS Values"].Value = rssTable;
if (cancellationToken.IsCancellationRequested) break;
}
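// final model: sum of all terms plus the constant avg(y);
// AverageModelEstimates = false makes the ensemble add the estimates instead of averaging them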
var model = new RegressionEnsembleModel(f.Concat(new[] { new ConstantModel(avgY, problemData.TargetVariable) }));
model.AverageModelEstimates = false;
var solution = model.CreateRegressionSolution((IRegressionProblemData)problemData.Clone());
Results.Add(new Result("Ensemble solution", solution));
}
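// partial residuals: y - avg(y) minus the predictions of all terms except term j;
// j = -1 excludes no term, so the result is the residual of the complete model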
private double[] CalculateResiduals(IRegressionProblemData problemData, IRegressionModel[] f, int j, double avgY, IEnumerable<int> rows) {
var y = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, rows);
double[] t = y.Select(yi => yi - avgY).ToArray();
// collect other predictions
for (int k = 0; k < f.Length; k++) {
if (k != j) {
var pred = f[k].GetEstimatedValues(problemData.Dataset, rows).ToArray();
// determine target for this smoother
for (int i = 0; i < t.Length; i++) {
t[i] -= pred[i];
}
}
}
return t;
}
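// fits a univariate linear model to the current residuals; an alternative smoother that is
// currently unused (the backfitting loop above uses RegressSpline)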
private IRegressionModel RegressLR(IRegressionProblemData problemData, string inputVar, double[] target) {
// clumsy: create a modified dataset in which the target column holds the current residuals
var ds = ((Dataset)problemData.Dataset).ToModifiable();
ds.ReplaceVariable(problemData.TargetVariable, target.Concat(Enumerable.Repeat(0.0, ds.Rows - target.Length)).ToList()); // pad non-training rows with zeros (assumes the training partition comes first)
var pd = new RegressionProblemData(ds, new string[] { inputVar }, problemData.TargetVariable);
pd.TrainingPartition.Start = problemData.TrainingPartition.Start;
pd.TrainingPartition.End = problemData.TrainingPartition.End;
pd.TestPartition.Start = problemData.TestPartition.Start;
pd.TestPartition.End = problemData.TestPartition.End;
double rmsError, cvRmsError;
return LinearRegression.CreateLinearRegressionSolution(pd, out rmsError, out cvRmsError).Model;
}
private IRegressionModel RegressSpline(IRegressionProblemData problemData, string[] inputVars, double[] target, double lambda) {
if (inputVars.All(problemData.Dataset.VariableHasType<double>)) {
var product = problemData.Dataset.GetDoubleValues(inputVars.First(), problemData.TrainingIndices).ToArray();
for (int i = 1; i < inputVars.Length; i++) {
product = product.Zip(problemData.Dataset.GetDoubleValues(inputVars[i], problemData.TrainingIndices), (pi, vi) => pi * vi).ToArray();
}
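// interaction terms are reduced to one dimension by multiplying the involved variables and
// smoothing the product (a simplification; a full GAM could use e.g. tensor-product splines)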
// other smoothers tried here: CubicSplineGCV, Reinsch smoothing splines with a tolerance
// search, and penalized regression splines with a sign-test-based lambda search;
// the SBART smoothing spline is used below (note: lambda is not passed to SBART)
SBART.SBART_Report rep;
var w = product.Select(_ => 1.0).ToArray(); // uniform observation weights
var model = SBART.CalculateSBART(product, (double[])target.Clone(), w, 10, problemData.TargetVariable, inputVars, out rep);
Console.WriteLine("{0} {1:N5} {2:N5} {3:N5} {4:N5}", string.Join(",", inputVars), rep.gcv, rep.leverage.Sum(), product.StandardDeviation(), target.StandardDeviation());
return model;
} else return new ConstantModel(target.Average(), problemData.TargetVariable);
}
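// fits a random forest to the current residuals; an alternative smoother that is currently
// unused (lambda is accepted for signature compatibility but ignored)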
private IRegressionModel RegressRF(IRegressionProblemData problemData, string inputVar, double[] target, double lambda) {
if (problemData.Dataset.VariableHasType<double>(inputVar)) {
// clumsy: create a modified dataset in which the target column holds the current residuals
var ds = ((Dataset)problemData.Dataset).ToModifiable();
ds.ReplaceVariable(problemData.TargetVariable, target.Concat(Enumerable.Repeat(0.0, ds.Rows - target.Length)).ToList()); // pad non-training rows with zeros (assumes the training partition comes first)
var pd = new RegressionProblemData(ds, new string[] { inputVar }, problemData.TargetVariable);
pd.TrainingPartition.Start = problemData.TrainingPartition.Start;
pd.TrainingPartition.End = problemData.TrainingPartition.End;
pd.TestPartition.Start = problemData.TestPartition.Start;
pd.TestPartition.End = problemData.TestPartition.End;
double rmsError, oobRmsError;
double avgRelError, oobAvgRelError;
return RandomForestRegression.CreateRandomForestRegressionModel(pd, 100, 0.5, 0.5, 1234, out rmsError, out avgRelError, out oobRmsError, out oobAvgRelError);
} else return new ConstantModel(target.Average(), problemData.TargetVariable);
}
}
// UNFINISHED
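// radial basis function regression model wrapping an alglib.rbfmodel;
// input values are passed through the stored scaling transformations before evaluation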
public class RBFModel : NamedItem, IRegressionModel {
private alglib.rbfmodel model;
public string TargetVariable { get; set; }
public IEnumerable<string> VariablesUsedForPrediction { get; private set; }
private ITransformation<double>[] scaling;
public event EventHandler TargetVariableChanged;
public RBFModel(RBFModel orig, Cloner cloner) : base(orig, cloner) {
this.TargetVariable = orig.TargetVariable;
this.VariablesUsedForPrediction = orig.VariablesUsedForPrediction.ToArray();
this.model = (alglib.rbfmodel)orig.model.make_copy();
this.scaling = orig.scaling.Select(s => cloner.Clone(s)).ToArray();
}
public RBFModel(alglib.rbfmodel model, string targetVar, string[] inputs, IEnumerable<ITransformation<double>> scaling) : base("RBFModel", "RBFModel") {
this.model = model;
this.TargetVariable = targetVar;
this.VariablesUsedForPrediction = inputs;
this.scaling = scaling.ToArray();
}
public override IDeepCloneable Clone(Cloner cloner) {
return new RBFModel(this, cloner);
}
public IRegressionSolution CreateRegressionSolution(IRegressionProblemData problemData) {
return new RegressionSolution(this, (IRegressionProblemData)problemData.Clone());
}
public IEnumerable<double> GetEstimatedValues(IDataset dataset, IEnumerable<int> rows) {
double[] x = new double[VariablesUsedForPrediction.Count()];
double[] y;
foreach (var r in rows) {
int c = 0;
foreach (var v in VariablesUsedForPrediction) {
x[c] = scaling[c].Apply(dataset.GetDoubleValue(v, r).ToEnumerable()).First(); // inefficient: scales one value at a time per row and variable
c++;
}
alglib.rbfcalc(model, x, out y);
yield return y[0];
}
}
}
}