#region License Information
/* HeuristicLab
* Copyright (C) Heuristic and Evolutionary Algorithms Laboratory (HEAL)
*
* This file is part of HeuristicLab.
*
* HeuristicLab is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* HeuristicLab is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with HeuristicLab. If not, see .
*/
#endregion
extern alias alglib_3_7;
using System;
using System.Collections.Generic;
using System.Linq;
using HEAL.Attic;
using HeuristicLab.Common;
using HeuristicLab.Core;
using HeuristicLab.Encodings.SymbolicExpressionTreeEncoding;
using HeuristicLab.Problems.DataAnalysis;
using HeuristicLab.Problems.DataAnalysis.Symbolic;
namespace HeuristicLab.Algorithms.DataAnalysis {
///
/// Represents a random forest model for regression and classification
///
[Obsolete("This class only exists for backwards compatibility reasons for stored models with the XML Persistence. Use RFModelSurrogate or RFModelFull instead.")]
[StorableType("9AA4CCC2-CD75-4471-8DF6-949E5B783642")]
[Item("RandomForestModel", "Represents a random forest for regression and classification.")]
public sealed class RandomForestModel : ClassificationModel, IRandomForestModel {
// not persisted
private alglib_3_7.alglib.decisionforest randomForest;
private alglib_3_7.alglib.decisionforest RandomForest {
get {
// recalculate lazily
if (randomForest.innerobj.trees == null || randomForest.innerobj.trees.Length == 0) RecalculateModel();
return randomForest;
}
}
public override IEnumerable VariablesUsedForPrediction {
get { return originalTrainingData.AllowedInputVariables; }
}
public int NumberOfTrees {
get { return nTrees; }
}
// instead of storing the data of the model itself
// we instead only store data necessary to recalculate the same model lazily on demand
[Storable]
private int seed;
[Storable]
private IDataAnalysisProblemData originalTrainingData;
[Storable]
private double[] classValues;
[Storable]
private int nTrees;
[Storable]
private double r;
[Storable]
private double m;
[StorableConstructor]
private RandomForestModel(StorableConstructorFlag _) : base(_) {
// for backwards compatibility (loading old solutions)
randomForest = new alglib_3_7.alglib.decisionforest();
}
private RandomForestModel(RandomForestModel original, Cloner cloner)
: base(original, cloner) {
randomForest = new alglib_3_7.alglib.decisionforest();
randomForest.innerobj.bufsize = original.randomForest.innerobj.bufsize;
randomForest.innerobj.nclasses = original.randomForest.innerobj.nclasses;
randomForest.innerobj.ntrees = original.randomForest.innerobj.ntrees;
randomForest.innerobj.nvars = original.randomForest.innerobj.nvars;
// we assume that the trees array (double[]) is immutable in alglib
randomForest.innerobj.trees = original.randomForest.innerobj.trees;
// allowedInputVariables is immutable so we don't need to clone
allowedInputVariables = original.allowedInputVariables;
// clone data which is necessary to rebuild the model
this.seed = original.seed;
this.originalTrainingData = cloner.Clone(original.originalTrainingData);
// classvalues is immutable so we don't need to clone
this.classValues = original.classValues;
this.nTrees = original.nTrees;
this.r = original.r;
this.m = original.m;
}
// random forest models can only be created through the static factory methods CreateRegressionModel and CreateClassificationModel
private RandomForestModel(string targetVariable, alglib_3_7.alglib.decisionforest randomForest,
int seed, IDataAnalysisProblemData originalTrainingData,
int nTrees, double r, double m, double[] classValues = null)
: base(targetVariable) {
this.name = ItemName;
this.description = ItemDescription;
// the model itself
this.randomForest = randomForest;
// data which is necessary for recalculation of the model
this.seed = seed;
this.originalTrainingData = (IDataAnalysisProblemData)originalTrainingData.Clone();
this.classValues = classValues;
this.nTrees = nTrees;
this.r = r;
this.m = m;
}
public override IDeepCloneable Clone(Cloner cloner) {
return new RandomForestModel(this, cloner);
}
private void RecalculateModel() {
double rmsError, oobRmsError, relClassError, oobRelClassError;
var regressionProblemData = originalTrainingData as IRegressionProblemData;
var classificationProblemData = originalTrainingData as IClassificationProblemData;
if (regressionProblemData != null) {
var model = CreateRegressionModel(regressionProblemData,
nTrees, r, m, seed, out rmsError, out oobRmsError,
out relClassError, out oobRelClassError);
randomForest = model.randomForest;
} else if (classificationProblemData != null) {
var model = CreateClassificationModel(classificationProblemData,
nTrees, r, m, seed, out rmsError, out oobRmsError,
out relClassError, out oobRelClassError);
randomForest = model.randomForest;
}
}
public IEnumerable GetEstimatedValues(IDataset dataset, IEnumerable rows) {
double[,] inputData = dataset.ToArray(AllowedInputVariables, rows);
RandomForestUtil.AssertInputMatrix(inputData);
int n = inputData.GetLength(0);
int columns = inputData.GetLength(1);
double[] x = new double[columns];
double[] y = new double[1];
for (int row = 0; row < n; row++) {
for (int column = 0; column < columns; column++) {
x[column] = inputData[row, column];
}
alglib_3_7.alglib.dfprocess(RandomForest, x, ref y);
yield return y[0];
}
}
public IEnumerable GetEstimatedVariances(IDataset dataset, IEnumerable rows) {
double[,] inputData = dataset.ToArray(AllowedInputVariables, rows);
RandomForestUtil.AssertInputMatrix(inputData);
int n = inputData.GetLength(0);
int columns = inputData.GetLength(1);
double[] x = new double[columns];
double[] ys = new double[this.RandomForest.innerobj.ntrees];
for (int row = 0; row < n; row++) {
for (int column = 0; column < columns; column++) {
x[column] = inputData[row, column];
}
alglib_3_7.alglib.dforest.dfprocessraw(RandomForest.innerobj, x, ref ys);
yield return ys.VariancePop();
}
}
public override IEnumerable GetEstimatedClassValues(IDataset dataset, IEnumerable rows) {
double[,] inputData = dataset.ToArray(AllowedInputVariables, rows);
RandomForestUtil.AssertInputMatrix(inputData);
int n = inputData.GetLength(0);
int columns = inputData.GetLength(1);
double[] x = new double[columns];
double[] y = new double[RandomForest.innerobj.nclasses];
for (int row = 0; row < n; row++) {
for (int column = 0; column < columns; column++) {
x[column] = inputData[row, column];
}
alglib_3_7.alglib.dfprocess(randomForest, x, ref y);
// find class for with the largest probability value
int maxProbClassIndex = 0;
double maxProb = y[0];
for (int i = 1; i < y.Length; i++) {
if (maxProb < y[i]) {
maxProb = y[i];
maxProbClassIndex = i;
}
}
yield return classValues[maxProbClassIndex];
}
}
public ISymbolicExpressionTree ExtractTree(int treeIdx) {
var rf = RandomForest;
// hoping that the internal representation of alglib is stable
// TREE FORMAT
// W[Offs] - size of sub-array (for the tree)
// node info:
// W[K+0] - variable number (-1 for leaf mode)
// W[K+1] - threshold (class/value for leaf node)
// W[K+2] - ">=" branch index (absent for leaf node)
// skip irrelevant trees
int offset = 0;
for (int i = 0; i < treeIdx - 1; i++) {
offset = offset + (int)Math.Round(rf.innerobj.trees[offset]);
}
var constSy = new Constant();
var varCondSy = new VariableCondition() { IgnoreSlope = true };
var node = CreateRegressionTreeRec(rf.innerobj.trees, offset, offset + 1, constSy, varCondSy);
var startNode = new StartSymbol().CreateTreeNode();
startNode.AddSubtree(node);
var root = new ProgramRootSymbol().CreateTreeNode();
root.AddSubtree(startNode);
return new SymbolicExpressionTree(root);
}
private ISymbolicExpressionTreeNode CreateRegressionTreeRec(double[] trees, int offset, int k, Constant constSy, VariableCondition varCondSy) {
// alglib source for evaluation of one tree (dfprocessinternal)
// offs = 0
//
// Set pointer to the root
//
// k = offs + 1;
//
// //
// // Navigate through the tree
// //
// while (true) {
// if ((double)(df.trees[k]) == (double)(-1)) {
// if (df.nclasses == 1) {
// y[0] = y[0] + df.trees[k + 1];
// } else {
// idx = (int)Math.Round(df.trees[k + 1]);
// y[idx] = y[idx] + 1;
// }
// break;
// }
// if ((double)(x[(int)Math.Round(df.trees[k])]) < (double)(df.trees[k + 1])) {
// k = k + innernodewidth;
// } else {
// k = offs + (int)Math.Round(df.trees[k + 2]);
// }
// }
if ((double)(trees[k]) == (double)(-1)) {
var constNode = (ConstantTreeNode)constSy.CreateTreeNode();
constNode.Value = trees[k + 1];
return constNode;
} else {
var condNode = (VariableConditionTreeNode)varCondSy.CreateTreeNode();
condNode.VariableName = AllowedInputVariables[(int)Math.Round(trees[k])];
condNode.Threshold = trees[k + 1];
condNode.Slope = double.PositiveInfinity;
var left = CreateRegressionTreeRec(trees, offset, k + 3, constSy, varCondSy);
var right = CreateRegressionTreeRec(trees, offset, offset + (int)Math.Round(trees[k + 2]), constSy, varCondSy);
condNode.AddSubtree(left); // not 100% correct because interpreter uses: if(x <= thres) left() else right() and RF uses if(x < thres) left() else right() (see above)
condNode.AddSubtree(right);
return condNode;
}
}
public IRegressionSolution CreateRegressionSolution(IRegressionProblemData problemData) {
return new RandomForestRegressionSolution(this, new RegressionProblemData(problemData));
}
public override IClassificationSolution CreateClassificationSolution(IClassificationProblemData problemData) {
return new RandomForestClassificationSolution(this, new ClassificationProblemData(problemData));
}
public bool IsProblemDataCompatible(IRegressionProblemData problemData, out string errorMessage) {
return RegressionModel.IsProblemDataCompatible(this, problemData, out errorMessage);
}
public override bool IsProblemDataCompatible(IDataAnalysisProblemData problemData, out string errorMessage) {
if (problemData == null) throw new ArgumentNullException("problemData", "The provided problemData is null.");
var regressionProblemData = problemData as IRegressionProblemData;
if (regressionProblemData != null)
return IsProblemDataCompatible(regressionProblemData, out errorMessage);
var classificationProblemData = problemData as IClassificationProblemData;
if (classificationProblemData != null)
return IsProblemDataCompatible(classificationProblemData, out errorMessage);
throw new ArgumentException("The problem data is not compatible with this random forest. Instead a " + problemData.GetType().GetPrettyName() + " was provided.", "problemData");
}
public static RandomForestModel CreateRegressionModel(IRegressionProblemData problemData, int nTrees, double r, double m, int seed,
out double rmsError, out double outOfBagRmsError, out double avgRelError, out double outOfBagAvgRelError) {
return CreateRegressionModel(problemData, problemData.TrainingIndices, nTrees, r, m, seed,
rmsError: out rmsError, outOfBagRmsError: out outOfBagRmsError, avgRelError: out avgRelError, outOfBagAvgRelError: out outOfBagAvgRelError);
}
public static RandomForestModel CreateRegressionModel(IRegressionProblemData problemData, IEnumerable trainingIndices, int nTrees, double r, double m, int seed,
out double rmsError, out double outOfBagRmsError, out double avgRelError, out double outOfBagAvgRelError) {
var variables = problemData.AllowedInputVariables.Concat(new string[] { problemData.TargetVariable });
double[,] inputMatrix = problemData.Dataset.ToArray(variables, trainingIndices);
var dForest = RandomForestUtil.CreateRandomForestModelAlglib_3_7(seed, inputMatrix, nTrees, r, m, 1, out var rep);
rmsError = rep.rmserror;
outOfBagRmsError = rep.oobrmserror;
avgRelError = rep.avgrelerror;
outOfBagAvgRelError = rep.oobavgrelerror;
return new RandomForestModel(problemData.TargetVariable, dForest, seed, problemData, nTrees, r, m);
}
public static RandomForestModel CreateClassificationModel(IClassificationProblemData problemData, int nTrees, double r, double m, int seed,
out double rmsError, out double outOfBagRmsError, out double relClassificationError, out double outOfBagRelClassificationError) {
return CreateClassificationModel(problemData, problemData.TrainingIndices, nTrees, r, m, seed,
out rmsError, out outOfBagRmsError, out relClassificationError, out outOfBagRelClassificationError);
}
public static RandomForestModel CreateClassificationModel(IClassificationProblemData problemData, IEnumerable trainingIndices, int nTrees, double r, double m, int seed,
out double rmsError, out double outOfBagRmsError, out double relClassificationError, out double outOfBagRelClassificationError) {
var variables = problemData.AllowedInputVariables.Concat(new string[] { problemData.TargetVariable });
double[,] inputMatrix = problemData.Dataset.ToArray(variables, trainingIndices);
var classValues = problemData.ClassValues.ToArray();
int nClasses = classValues.Length;
// map original class values to values [0..nClasses-1]
var classIndices = new Dictionary();
for (int i = 0; i < nClasses; i++) {
classIndices[classValues[i]] = i;
}
int nRows = inputMatrix.GetLength(0);
int nColumns = inputMatrix.GetLength(1);
for (int row = 0; row < nRows; row++) {
inputMatrix[row, nColumns - 1] = classIndices[inputMatrix[row, nColumns - 1]];
}
var dForest = RandomForestUtil.CreateRandomForestModelAlglib_3_7(seed, inputMatrix, nTrees, r, m, nClasses, out var rep);
rmsError = rep.rmserror;
outOfBagRmsError = rep.oobrmserror;
relClassificationError = rep.relclserror;
outOfBagRelClassificationError = rep.oobrelclserror;
return new RandomForestModel(problemData.TargetVariable, dForest, seed, problemData, nTrees, r, m, classValues);
}
#region persistence for backwards compatibility
// when the originalTrainingData is null this means the model was loaded from an old file
// therefore, we cannot use the new persistence mechanism because the original data is not available anymore
// in such cases we still store the compete model
private bool IsCompatibilityLoaded { get { return originalTrainingData == null; } }
private string[] allowedInputVariables;
[Storable(Name = "allowedInputVariables")]
private string[] AllowedInputVariables {
get {
if (IsCompatibilityLoaded) return allowedInputVariables;
else return originalTrainingData.AllowedInputVariables.ToArray();
}
set { allowedInputVariables = value; }
}
[Storable]
private int RandomForestBufSize {
get {
if (IsCompatibilityLoaded) return randomForest.innerobj.bufsize;
else return 0;
}
set {
randomForest.innerobj.bufsize = value;
}
}
[Storable]
private int RandomForestNClasses {
get {
if (IsCompatibilityLoaded) return randomForest.innerobj.nclasses;
else return 0;
}
set {
randomForest.innerobj.nclasses = value;
}
}
[Storable]
private int RandomForestNTrees {
get {
if (IsCompatibilityLoaded) return randomForest.innerobj.ntrees;
else return 0;
}
set {
randomForest.innerobj.ntrees = value;
}
}
[Storable]
private int RandomForestNVars {
get {
if (IsCompatibilityLoaded) return randomForest.innerobj.nvars;
else return 0;
}
set {
randomForest.innerobj.nvars = value;
}
}
[Storable]
private double[] RandomForestTrees {
get {
if (IsCompatibilityLoaded) return randomForest.innerobj.trees;
else return new double[] { };
}
set {
randomForest.innerobj.trees = value;
}
}
#endregion
}
}