#region License Information
/* HeuristicLab
* Copyright (C) 2002-2016 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
*
* This file is part of HeuristicLab.
*
* HeuristicLab is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* HeuristicLab is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
*/
#endregion
using System;
using System.Linq;
using System.Threading;
using HEAL.Attic;
using HeuristicLab.Algorithms.DataAnalysis;
using HeuristicLab.Common;
using HeuristicLab.Core;
using HeuristicLab.Data;
using HeuristicLab.Operators;
using HeuristicLab.Optimization;
using HeuristicLab.Parameters;
using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
using HeuristicLab.Problems.DataAnalysis;

namespace HeuristicLab.Algorithms.EGO {
  [Item("ModelBuilder", "Builds a model from a dataset and a given RegressionAlgorithm")]
  [StorableType("8b80026f-b6a5-4892-9826-86ffba1e4e10")]
  public class ModelBuilder : InstrumentedOperator, IStochasticOperator, ICancellableOperator {
    public override bool CanChangeName => true;
    public CancellationToken Cancellation { get; set; }

    #region Parameter properties
    public ILookupParameter<IDataAnalysisAlgorithm<IRegressionProblem>> RegressionAlgorithmParameter => (ILookupParameter<IDataAnalysisAlgorithm<IRegressionProblem>>)Parameters["RegressionAlgorithm"];
    public ILookupParameter<IRegressionSolution> ModelParameter => (ILookupParameter<IRegressionSolution>)Parameters["Model"];
    public ILookupParameter<IDataset> DatasetParameter => (ILookupParameter<IDataset>)Parameters["Dataset"];
    public ILookupParameter<IRandom> RandomParameter => (ILookupParameter<IRandom>)Parameters["Random"];
    public ILookupParameter<IntValue> MaxModelSizeParameter => (ILookupParameter<IntValue>)Parameters["Maximal Model Size"];
    public ILookupParameter<DoubleMatrix> InfillBoundsParameter => (ILookupParameter<DoubleMatrix>)Parameters["InfillBounds"];
    #endregion

    [StorableConstructor]
    protected ModelBuilder(StorableConstructorFlag deserializing) : base(deserializing) { }
    protected ModelBuilder(ModelBuilder original, Cloner cloner) : base(original, cloner) { }

    public ModelBuilder() {
      Parameters.Add(new LookupParameter<IDataAnalysisAlgorithm<IRegressionProblem>>("RegressionAlgorithm", "The algorithm used to build a model") { Hidden = true });
      Parameters.Add(new LookupParameter<IRegressionSolution>("Model", "The resulting model") { Hidden = true });
      Parameters.Add(new LookupParameter<IDataset>("Dataset", "The dataset from which the model is created") { Hidden = true });
      Parameters.Add(new LookupParameter<IRandom>("Random", "A random number generator") { Hidden = true });
      Parameters.Add(new LookupParameter<IntValue>("Maximal Model Size", "The maximum number of sample points used to build the model (set -1 for unlimited size)") { Hidden = true });
      Parameters.Add(new LookupParameter<DoubleMatrix>("InfillBounds", "The bounds applied for infill solving") { Hidden = true });
    }

    public override IDeepCloneable Clone(Cloner cloner) {
      return new ModelBuilder(this, cloner);
    }
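
    // Fetches the regression sub-algorithm, dataset and current model, optionally restricts
    // the dataset to the best samples (publishing their bounds via the InfillBounds parameter),
    // and stores the freshly built regression solution in the Model parameter.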
    public override IOperation InstrumentedApply() {
      var regressionAlg = RegressionAlgorithmParameter.ActualValue;
      IDataset data = DatasetParameter.ActualValue;
      var random = RandomParameter.ActualValue;
      var oldModel = ModelParameter.ActualValue;
      var max = MaxModelSizeParameter.ActualValue.Value;

      if (data.Rows > max && max > 0) {
        data = SelectBestSamples(data, max);
        InfillBoundsParameter.ActualValue = GetBounds(data);
      }
      ModelParameter.ActualValue = BuildModel(random, regressionAlg, data, oldModel);
      return base.InstrumentedApply();
    }
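
    // Determines the value range of every input dimension in the (sub)sampled dataset.
    // This assumes the target column "output" is the last of the double variables, which is
    // why the final column is skipped.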
    private DoubleMatrix GetBounds(IDataset data) {
      var res = new DoubleMatrix(data.Columns - 1, 2);
      var names = data.DoubleVariables.ToArray();
      for (var i = 0; i < names.Length - 1; i++) {
        res[i, 0] = data.GetDoubleValues(names[i]).Min();
        res[i, 1] = data.GetDoubleValues(names[i]).Max();
      }
      return res;
    }
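
    // Keeps only the 'max' rows with the smallest values in the "output" column,
    // i.e. the best samples found so far.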
    private static Dataset SelectBestSamples(IDataset data, int max) {
      var bestSampleIndices = data.GetDoubleValues("output").Select((d, i) => Tuple.Create(d, i)).OrderBy(x => x.Item1).Take(max).Select(x => x.Item2).ToArray();
      return new Dataset(data.VariableNames, data.VariableNames.Select(v => data.GetDoubleValues(v, bestSampleIndices).ToList()));
    }
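
    // Wraps the dataset in a RegressionProblemData with all rows as training partition and an
    // empty test partition, reruns the regression sub-algorithm with fresh seeds until it yields
    // a single IRegressionSolution (at most 100 attempts), and optionally sanitizes Gaussian
    // process solutions against the previous model.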
    private IRegressionSolution BuildModel(IRandom random, IDataAnalysisAlgorithm<IRegressionProblem> regressionAlgorithm, IDataset dataset, IRegressionSolution oldSolution) {
      //var dataset = EgoUtilities.GetDataSet(dataSamples, RemoveDuplicates);
      var problemdata = new RegressionProblemData(dataset, dataset.VariableNames.Where(x => !x.Equals("output")), "output");
      problemdata.TrainingPartition.Start = 0;
      problemdata.TrainingPartition.End = dataset.Rows;
      problemdata.TestPartition.Start = dataset.Rows;
      problemdata.TestPartition.End = dataset.Rows;

      // train the sub-algorithm; retry with a new seed until a regression solution is returned
      var problem = (RegressionProblem)regressionAlgorithm.Problem;
      problem.ProblemDataParameter.Value = problemdata;
      var i = 0;
      IRegressionSolution solution = null;
      while (solution == null && i++ < 100) {
        var results = EgoUtilities.SyncRunSubAlgorithm(regressionAlgorithm, random.Next(int.MaxValue), Cancellation);
        solution = results.Select(x => x.Value).OfType<IRegressionSolution>().SingleOrDefault();
      }

      if (regressionAlgorithm is GaussianProcessRegression && oldSolution != null)
        solution = SanitizeGaussianProcess(oldSolution as GaussianProcessRegressionSolution, solution as GaussianProcessRegressionSolution, Cancellation);
      //if (regressionAlgorithm is M5RegressionTree && oldSolution != null) solution = SanitizeM5Regression(oldSolution.Model as M5Model, solution, random, Cancellation);

      regressionAlgorithm.Runs.Clear();
      return solution;
    }

    //private static IRegressionSolution SanitizeM5Regression(M5Model oldmodel, IRegressionSolution newSolution, IRandom random, CancellationToken cancellation) {
    //  var problemdata = newSolution.ProblemData;
    //  oldmodel.UpdateLeafModels(problemdata, problemdata.AllIndices, random, cancellation);
    //  var oldSolution = oldmodel.CreateRegressionSolution(problemdata);
    //  var magicDecision = newSolution.TrainingRSquared < oldSolution.TrainingRSquared - 0.05;
    //  return magicDecision ? newSolution : oldmodel.CreateRegressionSolution(problemdata);
    //}

    // Tries to build a Gaussian process model with the old hyperparameters on the new dataset
    // and keeps whichever solution has the smaller training mean squared error.
    private static IRegressionSolution SanitizeGaussianProcess(GaussianProcessRegressionSolution oldmodel, GaussianProcessRegressionSolution newSolution, CancellationToken cancellation) {
      var problemdata = newSolution.ProblemData;
      var mean = (IMeanFunction)oldmodel.Model.MeanFunction.Clone();
      var cov = (ICovarianceFunction)oldmodel.Model.CovarianceFunction.Clone();
      try {
        var model = new GaussianProcessModel(problemdata.Dataset, problemdata.TargetVariable, problemdata.AllowedInputVariables, problemdata.TrainingIndices, new[] { 0.0 }, mean, cov);
        cancellation.ThrowIfCancellationRequested();
        model.FixParameters();
        var sol = new GaussianProcessRegressionSolution(model, problemdata);
        if (newSolution.TrainingMeanSquaredError > sol.TrainingMeanSquaredError) {
          newSolution = sol;
        }
      }
      catch (ArgumentException) {
        // building a model with the old hyperparameters failed; keep the newly trained solution
      }
      return newSolution;
    }
  }
}