#region License Information
/* HeuristicLab
 * Copyright (C) 2002-2016 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
#endregion

using System;
using System.Linq;
using System.Threading;
using HeuristicLab.Algorithms.DataAnalysis;
using HeuristicLab.Common;
using HeuristicLab.Core;
using HeuristicLab.Data;
using HeuristicLab.Operators;
using HeuristicLab.Optimization;
using HeuristicLab.Parameters;
using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
using HeuristicLab.Problems.DataAnalysis;

namespace HeuristicLab.Algorithms.EGO {
  [Item("ModelBuilder", "Builds a model from a dataset and a given RegressionAlgorithm")]
  [StorableClass]
  public class ModelBuilder : InstrumentedOperator, IStochasticOperator, ICancellableOperator {

    public override bool CanChangeName => true;
    public CancellationToken Cancellation { get; set; }

    #region Parameter properties
    public ILookupParameter<IDataAnalysisAlgorithm<IRegressionProblem>> RegressionAlgorithmParameter => (ILookupParameter<IDataAnalysisAlgorithm<IRegressionProblem>>)Parameters["RegressionAlgorithm"];
    public ILookupParameter<IRegressionSolution> ModelParameter => (ILookupParameter<IRegressionSolution>)Parameters["Model"];
    public ILookupParameter<IDataset> DatasetParameter => (ILookupParameter<IDataset>)Parameters["Dataset"];
    public ILookupParameter<IRandom> RandomParameter => (ILookupParameter<IRandom>)Parameters["Random"];
    public ILookupParameter<IntValue> MaxModelSizeParameter => (ILookupParameter<IntValue>)Parameters["Maximal Model Size"];
    public ILookupParameter<DoubleMatrix> InfillBoundsParameter => (ILookupParameter<DoubleMatrix>)Parameters["InfillBounds"];
    #endregion

    [StorableConstructor]
    protected ModelBuilder(bool deserializing) : base(deserializing) { }
    protected ModelBuilder(ModelBuilder original, Cloner cloner) : base(original, cloner) { }
    public ModelBuilder() {
      Parameters.Add(new LookupParameter<IDataAnalysisAlgorithm<IRegressionProblem>>("RegressionAlgorithm", "The algorithm used to build a model") { Hidden = true });
      Parameters.Add(new LookupParameter<IRegressionSolution>("Model", "The resulting model") { Hidden = true });
      Parameters.Add(new LookupParameter<IDataset>("Dataset", "The dataset from which the model is created") { Hidden = true });
      Parameters.Add(new LookupParameter<IRandom>("Random", "A random number generator") { Hidden = true });
      Parameters.Add(new LookupParameter<IntValue>("Maximal Model Size", "The maximum number of sample points used to build the model (set -1 for unlimited size)") { Hidden = true });
      Parameters.Add(new LookupParameter<DoubleMatrix>("InfillBounds", "The bounds applied for infill solving") { Hidden = true });
    }

    public override IDeepCloneable Clone(Cloner cloner) {
      return new ModelBuilder(this, cloner);
    }

    public override IOperation InstrumentedApply() {
      var regressionAlg = RegressionAlgorithmParameter.ActualValue;
      IDataset data = DatasetParameter.ActualValue;
      var random = RandomParameter.ActualValue;
      var oldModel = ModelParameter.ActualValue;
      var max = MaxModelSizeParameter.ActualValue.Value;

      // If the dataset exceeds the maximal model size, keep only the best samples
      // and record their bounding box for infill solving.
      if (data.Rows > max && max > 0) {
        data = SelectBestSamples(data, max);
        InfillBoundsParameter.ActualValue = GetBounds(data);
      }

      ModelParameter.ActualValue = BuildModel(random, regressionAlg, data, oldModel);
      return base.InstrumentedApply();
    }
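
    // Note: GetBounds spans every double variable except the last one, which is
    // assumed here to be the "output" column; the resulting [min, max] matrix is
    // what InstrumentedApply stores as the bounds for infill solving whenever the
    // dataset has been reduced to the best samples.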
    private DoubleMatrix GetBounds(IDataset data) {
      var res = new DoubleMatrix(data.Columns - 1, 2);
      var names = data.DoubleVariables.ToArray();
      for (var i = 0; i < names.Length - 1; i++) {
        res[i, 0] = data.GetDoubleValues(names[i]).Min();
        res[i, 1] = data.GetDoubleValues(names[i]).Max();
      }
      return res;
    }

    private static Dataset SelectBestSamples(IDataset data, int max) {
      // Keep the "max" rows with the smallest "output" values.
      var bestSampleIndices = data.GetDoubleValues("output")
        .Select((d, i) => Tuple.Create(d, i))
        .OrderBy(x => x.Item1)
        .Take(max)
        .Select(x => x.Item2)
        .ToArray();
      return new Dataset(data.VariableNames, data.VariableNames.Select(v => data.GetDoubleValues(v, bestSampleIndices).ToList()));
    }

    private IRegressionSolution BuildModel(IRandom random, IDataAnalysisAlgorithm<IRegressionProblem> regressionAlgorithm, IDataset dataset, IRegressionSolution oldSolution) {
      //var dataset = EgoUtilities.GetDataSet(dataSamples, RemoveDuplicates);
      var problemdata = new RegressionProblemData(dataset, dataset.VariableNames.Where(x => !x.Equals("output")), "output");
      problemdata.TrainingPartition.Start = 0;
      problemdata.TrainingPartition.End = dataset.Rows;
      problemdata.TestPartition.Start = dataset.Rows;
      problemdata.TestPartition.End = dataset.Rows;     // empty test partition: all samples are used for training

      //train
      var problem = (RegressionProblem)regressionAlgorithm.Problem;
      problem.ProblemDataParameter.Value = problemdata;
      var i = 0;
      IRegressionSolution solution = null;
      // Rerun the sub-algorithm with fresh seeds until it yields a regression solution (at most 100 attempts).
      while (solution == null && i++ < 100) {
        var results = EgoUtilities.SyncRunSubAlgorithm(regressionAlgorithm, random.Next(int.MaxValue), Cancellation);
        solution = results.Select(x => x.Value).OfType<IRegressionSolution>().SingleOrDefault();
      }

      if (regressionAlgorithm is GaussianProcessRegression && oldSolution != null)
        solution = SanitizeGaussianProcess(oldSolution as GaussianProcessRegressionSolution, solution as GaussianProcessRegressionSolution, Cancellation);
      //if (regressionAlgorithm is M5RegressionTree && oldSolution != null) solution = SanitizeM5Regression(oldSolution.Model as M5Model, solution, random, Cancellation);

      regressionAlgorithm.Runs.Clear();
      return solution;
    }

    //private static IRegressionSolution SanitizeM5Regression(M5Model oldmodel, IRegressionSolution newSolution, IRandom random, CancellationToken cancellation) {
    //  var problemdata = newSolution.ProblemData;
    //  oldmodel.UpdateLeafModels(problemdata, problemdata.AllIndices, random, cancellation);
    //  var oldSolution = oldmodel.CreateRegressionSolution(problemdata);
    //  var magicDecision = newSolution.TrainingRSquared < oldSolution.TrainingRSquared - 0.05;
    //  return magicDecision ? newSolution : oldmodel.CreateRegressionSolution(problemdata);
    //}

    // Try creating a model with the old hyperparameters and the new dataset;
    // keep whichever solution has the smaller training MSE.
    private static IRegressionSolution SanitizeGaussianProcess(GaussianProcessRegressionSolution oldmodel, GaussianProcessRegressionSolution newSolution, CancellationToken cancellation) {
      var problemdata = newSolution.ProblemData;
      var mean = (IMeanFunction)oldmodel.Model.MeanFunction.Clone();
      var cov = (ICovarianceFunction)oldmodel.Model.CovarianceFunction.Clone();
      try {
        var model = new GaussianProcessModel(problemdata.Dataset, problemdata.TargetVariable, problemdata.AllowedInputVariables, problemdata.TrainingIndices, new[] { 0.0 }, mean, cov);
        cancellation.ThrowIfCancellationRequested();
        model.FixParameters();
        var sol = new GaussianProcessRegressionSolution(model, problemdata);
        if (newSolution.TrainingMeanSquaredError > sol.TrainingMeanSquaredError) { newSolution = sol; }
      } catch (ArgumentException) {
        // The old hyperparameters cannot be reused for the new data; keep the newly trained solution.
      }
      return newSolution;
    }
  }
}