#region License Information /* HeuristicLab * Copyright (C) 2002-2016 Heuristic and Evolutionary Algorithms Laboratory (HEAL) * * This file is part of HeuristicLab. * * HeuristicLab is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * HeuristicLab is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with HeuristicLab. If not, see . */ #endregion using System; using System.Collections.Generic; using System.Linq; using System.Threading; using HeuristicLab.Analysis; using HeuristicLab.Common; using HeuristicLab.Core; using HeuristicLab.Data; using HeuristicLab.Optimization; using HeuristicLab.Parameters; using HeuristicLab.Persistence.Default.CompositeSerializers.Storable; using HeuristicLab.Problems.DataAnalysis; namespace HeuristicLab.Algorithms.DataAnalysis.Experimental { // UNFINISHED [Item("Generalized Additive Modelling", "GAM")] [Creatable(CreatableAttribute.Categories.DataAnalysisRegression, Priority = 102)] [StorableClass] public sealed class GAM : FixedDataAnalysisAlgorithm { private const string LambdaParameterName = "Lambda"; private const string MaxIterationsParameterName = "Max iterations"; private const string MaxInteractionsParameterName = "Max interactions"; public IFixedValueParameter LambdaParameter { get { return (IFixedValueParameter)Parameters[LambdaParameterName]; } } public IFixedValueParameter MaxIterationsParameter { get { return (IFixedValueParameter)Parameters[MaxIterationsParameterName]; } } public IFixedValueParameter MaxInteractionsParameter { get { return (IFixedValueParameter)Parameters[MaxInteractionsParameterName]; } } public double Lambda { get { return LambdaParameter.Value.Value; } set { LambdaParameter.Value.Value = value; } } public int MaxIterations { get { return MaxIterationsParameter.Value.Value; } set { MaxIterationsParameter.Value.Value = value; } } public int MaxInteractions { get { return MaxInteractionsParameter.Value.Value; } set { MaxInteractionsParameter.Value.Value = value; } } [StorableConstructor] private GAM(bool deserializing) : base(deserializing) { } [StorableHook(HookType.AfterDeserialization)] private void AfterDeserialization() { } private GAM(GAM original, Cloner cloner) : base(original, cloner) { } public override IDeepCloneable Clone(Cloner cloner) { return new GAM(this, cloner); } public GAM() : base() { Problem = new RegressionProblem(); Parameters.Add(new FixedValueParameter(LambdaParameterName, "Regularization for smoothing splines", new DoubleValue(1.0))); Parameters.Add(new FixedValueParameter(MaxIterationsParameterName, "", new IntValue(100))); Parameters.Add(new FixedValueParameter(MaxInteractionsParameterName, "", new IntValue(1))); } protected override void Run(CancellationToken cancellationToken) { double lambda = Lambda; int maxIters = MaxIterations; int maxInteractions = MaxInteractions; if (maxInteractions < 1 || maxInteractions > 5) throw new ArgumentException("Max interactions is outside the valid range [1 .. 5]"); // calculates a GAM model using a linear representation + independent non-linear functions of each variable // using backfitting algorithm (see The Elements of Statistical Learning page 298) var problemData = Problem.ProblemData; var y = problemData.TargetVariableTrainingValues.ToArray(); var avgY = y.Average(); var inputVars = Problem.ProblemData.AllowedInputVariables.ToArray(); var nTerms = 0; // inputVars.Length; // LR for (int i = 1; i <= maxInteractions; i++) { nTerms += inputVars.Combinations(i).Count(); } IRegressionModel[] f = new IRegressionModel[nTerms]; for (int i = 0; i < f.Length; i++) { f[i] = new ConstantModel(0.0, problemData.TargetVariable); } var rmseTable = new DataTable("RMSE"); var rmseRow = new DataRow("RMSE (train)"); var rmseRowTest = new DataRow("RMSE (test)"); rmseTable.Rows.Add(rmseRow); rmseTable.Rows.Add(rmseRowTest); Results.Add(new Result("RMSE", rmseTable)); rmseRow.Values.Add(CalculateResiduals(problemData, f, -1, avgY, problemData.TrainingIndices).StandardDeviation()); // -1 index to use all predictors rmseRowTest.Values.Add(CalculateResiduals(problemData, f, -1, avgY, problemData.TestIndices).StandardDeviation()); // for analytics double[] rss = new double[f.Length]; string[] terms = new string[f.Length]; Results.Add(new Result("RSS Values", typeof(DoubleMatrix))); var combinations = new List(); for (int i = 1; i <= maxInteractions; i++) combinations.AddRange(HeuristicLab.Common.EnumerableExtensions.Combinations(inputVars, i).Select(c => c.ToArray())); // combinations.Add(new string[] { "X1", "X2" }); // combinations.Add(new string[] { "X3", "X4" }); // combinations.Add(new string[] { "X5", "X6" }); // combinations.Add(new string[] { "X1", "X7", "X9" }); // combinations.Add(new string[] { "X3", "X6", "X10" }); // until convergence int iters = 0; var t = new double[y.Length]; while (iters++ < maxIters) { int j = 0; //foreach (var inputVar in inputVars) { // var res = CalculateResiduals(problemData, f, j, avgY, problemData.TrainingIndices); // rss[j] = res.Variance(); // terms[j] = inputVar; // f[j] = RegressLR(problemData, inputVar, res); // j++; //} foreach (var element in combinations) { var res = CalculateResiduals(problemData, f, j, avgY, problemData.TrainingIndices); rss[j] = res.Variance(); terms[j] = string.Format("f({0})", string.Join(",", element)); f[j] = RegressSpline(problemData, element.ToArray(), res, lambda); j++; } rmseRow.Values.Add(CalculateResiduals(problemData, f, -1, avgY, problemData.TrainingIndices).StandardDeviation()); // -1 index to use all predictors rmseRowTest.Values.Add(CalculateResiduals(problemData, f, -1, avgY, problemData.TestIndices).StandardDeviation()); // calculate table with residual contributions of each term var rssTable = new DoubleMatrix(rss.Length, 1, new string[] { "RSS" }, terms); for (int i = 0; i < rss.Length; i++) rssTable[i, 0] = rss[i]; Results["RSS Values"].Value = rssTable; if (cancellationToken.IsCancellationRequested) break; } var model = new RegressionEnsembleModel(f.Concat(new[] { new ConstantModel(avgY, problemData.TargetVariable) })); model.AverageModelEstimates = false; var solution = model.CreateRegressionSolution((IRegressionProblemData)problemData.Clone()); Results.Add(new Result("Ensemble solution", solution)); } private double[] CalculateResiduals(IRegressionProblemData problemData, IRegressionModel[] f, int j, double avgY, IEnumerable rows) { var y = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, rows); double[] t = y.Select(yi => yi - avgY).ToArray(); // collect other predictions for (int k = 0; k < f.Length; k++) { if (k != j) { var pred = f[k].GetEstimatedValues(problemData.Dataset, rows).ToArray(); // determine target for this smoother for (int i = 0; i < t.Length; i++) { t[i] -= pred[i]; } } } return t; } private IRegressionModel RegressLR(IRegressionProblemData problemData, string inputVar, double[] target) { // Umständlich! var ds = ((Dataset)problemData.Dataset).ToModifiable(); ds.ReplaceVariable(problemData.TargetVariable, target.Concat(Enumerable.Repeat(0.0, ds.Rows - target.Length)).ToList()); var pd = new RegressionProblemData(ds, new string[] { inputVar }, problemData.TargetVariable); pd.TrainingPartition.Start = problemData.TrainingPartition.Start; pd.TrainingPartition.End = problemData.TrainingPartition.End; pd.TestPartition.Start = problemData.TestPartition.Start; pd.TestPartition.End = problemData.TestPartition.End; double rmsError, cvRmsError; return LinearRegression.CreateLinearRegressionSolution(pd, out rmsError, out cvRmsError).Model; } // private IRegressionModel RegressSpline(IRegressionProblemData problemData, string inputVar, double[] target, double lambda) { // if (problemData.Dataset.VariableHasType(inputVar)) { // // Umständlich! // return Splines.CalculatePenalizedRegressionSpline( // problemData.Dataset.GetDoubleValues(inputVar, problemData.TrainingIndices).ToArray(), // (double[])target.Clone(), lambda, // problemData.TargetVariable, new string[] { inputVar } // ); // } else return new ConstantModel(target.Average(), problemData.TargetVariable); // } private IRegressionModel RegressSpline(IRegressionProblemData problemData, string[] inputVars, double[] target, double lambda) { if (inputVars.All(problemData.Dataset.VariableHasType)) { var product = problemData.Dataset.GetDoubleValues(inputVars.First(), problemData.TrainingIndices).ToArray(); for (int i = 1; i < inputVars.Length; i++) { product = product.Zip(problemData.Dataset.GetDoubleValues(inputVars[i], problemData.TrainingIndices), (pi, vi) => pi * vi).ToArray(); } // CubicSplineGCV.CubGcvReport report; // return CubicSplineGCV.CalculateCubicSpline( // product, // (double[])target.Clone(), // problemData.TargetVariable, inputVars, out report // ); // // double optTolerance; double cvRMSE; // find tolerance // var ensemble = Splines.CalculateSmoothingSplineReinsch(product, (double[])target.Clone(), inputVars, problemData.TargetVariable, out optTolerance, out cvRMSE); // // train on whole data // return Splines.CalculateSmoothingSplineReinsch(product, (double[])target.Clone(), inputVars, optTolerance, product.Length - 1, problemData.TargetVariable); // find tolerance //var bestLambda = double.NaN; // double bestCVRMSE = target.StandardDeviation(); // double avgTrainRMSE = double.PositiveInfinity; // double[] bestPredictions = new double[target.Length]; // zero //double[] bestSSE = target.Select(ti => ti*ti).ToArray(); // target - zero //for (double curLambda = 6.0; curLambda >= -6.0; curLambda -= 1.0) { // double[] predictions; // var ensemble = Splines.CalculatePenalizedRegressionSpline(product, (double[])target.Clone(), curLambda, problemData.TargetVariable, inputVars, out avgTrainRMSE, out cvRMSE, out predictions); // double[] sse = target.Zip(predictions, (t, p) => (t - p)*(t-p)).ToArray(); // // Console.Write("{0} {1} {2}", curLambda, avgTrainRMSE, cvRMSE); // double bothTails = .0, leftTail = .0, rightTail = .0; // alglib.stest.onesamplesigntest(bestSSE.Zip(sse, (a, b) => a-b).ToArray(), predictions.Length, 0.0, ref bothTails, ref leftTail, ref rightTail); // if (bothTails < 0.1 && bestCVRMSE > cvRMSE) { // Console.Write(" *"); // bestCVRMSE = cvRMSE; // bestLambda = curLambda; // bestSSE = sse; // bestPredictions = predictions; // } // // Console.WriteLine(); //} //if (double.IsNaN(bestLambda)) { // return new ConstantModel(target.Average(), problemData.TargetVariable); //} else { // train on whole data // return Splines.CalculatePenalizedRegressionSpline(product, (double[])target.Clone(), lambda, problemData.TargetVariable, inputVars, out avgTrainRMSE, out cvRMSE, out bestPredictions); SBART.SBART_Report rep; var w = product.Select(_ => 1.0).ToArray(); var model = SBART.CalculateSBART(product, (double[])target.Clone(), w, 10, problemData.TargetVariable, inputVars, out rep); Console.WriteLine("{0} {1:N5} {2:N5} {3:N5} {4:N5}", string.Join(",", inputVars), rep.gcv, rep.leverage.Sum(), product.StandardDeviation(), target.StandardDeviation()); return model; // } } else return new ConstantModel(target.Average(), problemData.TargetVariable); } private IRegressionModel RegressRF(IRegressionProblemData problemData, string inputVar, double[] target, double lambda) { if (problemData.Dataset.VariableHasType(inputVar)) { // Umständlich! var ds = ((Dataset)problemData.Dataset).ToModifiable(); ds.ReplaceVariable(problemData.TargetVariable, target.Concat(Enumerable.Repeat(0.0, ds.Rows - target.Length)).ToList()); var pd = new RegressionProblemData(ds, new string[] { inputVar }, problemData.TargetVariable); pd.TrainingPartition.Start = problemData.TrainingPartition.Start; pd.TrainingPartition.End = problemData.TrainingPartition.End; pd.TestPartition.Start = problemData.TestPartition.Start; pd.TestPartition.End = problemData.TestPartition.End; double rmsError, oobRmsError; double avgRelError, oobAvgRelError; return RandomForestRegression.CreateRandomForestRegressionModel(pd, 100, 0.5, 0.5, 1234, out rmsError, out avgRelError, out oobRmsError, out oobAvgRelError); } else return new ConstantModel(target.Average(), problemData.TargetVariable); } } // UNFINISHED public class RBFModel : NamedItem, IRegressionModel { private alglib.rbfmodel model; public string TargetVariable { get; set; } public IEnumerable VariablesUsedForPrediction { get; private set; } private ITransformation[] scaling; public event EventHandler TargetVariableChanged; public RBFModel(RBFModel orig, Cloner cloner) : base(orig, cloner) { this.TargetVariable = orig.TargetVariable; this.VariablesUsedForPrediction = orig.VariablesUsedForPrediction.ToArray(); this.model = (alglib.rbfmodel)orig.model.make_copy(); this.scaling = orig.scaling.Select(s => cloner.Clone(s)).ToArray(); } public RBFModel(alglib.rbfmodel model, string targetVar, string[] inputs, IEnumerable> scaling) : base("RBFModel", "RBFModel") { this.model = model; this.TargetVariable = targetVar; this.VariablesUsedForPrediction = inputs; this.scaling = scaling.ToArray(); } public override IDeepCloneable Clone(Cloner cloner) { return new RBFModel(this, cloner); } public IRegressionSolution CreateRegressionSolution(IRegressionProblemData problemData) { return new RegressionSolution(this, (IRegressionProblemData)problemData.Clone()); } public IEnumerable GetEstimatedValues(IDataset dataset, IEnumerable rows) { double[] x = new double[VariablesUsedForPrediction.Count()]; double[] y; foreach (var r in rows) { int c = 0; foreach (var v in VariablesUsedForPrediction) { x[c] = scaling[c].Apply(dataset.GetDoubleValue(v, r).ToEnumerable()).First(); // OUCH! c++; } alglib.rbfcalc(model, x, out y); yield return y[0]; } } } }