#region License Information
/* HeuristicLab
 * Copyright (C) 2002-2019 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
#endregion

using System;
using System.Collections.Generic;
using System.Linq;
using HeuristicLab.Common;
using HeuristicLab.Core;
using HEAL.Attic;
using HeuristicLab.Problems.DataAnalysis;

namespace HeuristicLab.Algorithms.DataAnalysis {
  /// <summary>
  /// Represents a nearest neighbour model for regression and classification.
  /// Predictions are distance-weighted averages (regression) or majority votes
  /// (classification) over the k nearest training points found via an alglib kd-tree.
  /// </summary>
  [StorableType("A76C0823-3077-4ACE-8A40-E9B717C7DB60")]
  [Item("NearestNeighbourModel", "Represents a nearest neighbour model for regression and classification.")]
  public sealed class NearestNeighbourModel : ClassificationModel, INearestNeighbourModel {
    // The alglib kd-tree query functions mutate internal buffers of the tree
    // (see the locked sections in GetEstimatedValues / GetEstimatedClassValues),
    // so all queries must be serialized through this lock object.
    private readonly object kdTreeLockObject = new object();

    private alglib.nearestneighbor.kdtree kdTree;
    public alglib.nearestneighbor.kdtree KDTree {
      get { return kdTree; }
      set {
        if (value != kdTree) {
          if (value == null) throw new ArgumentNullException();
          kdTree = value;
          OnChanged(EventArgs.Empty);
        }
      }
    }

    public override IEnumerable<string> VariablesUsedForPrediction {
      get { return allowedInputVariables; }
    }

    [Storable]
    private string[] allowedInputVariables;
    [Storable]
    private double[] classValues; // null for regression models
    [Storable]
    private int k;
    [Storable(DefaultValue = false)]
    private bool selfMatch;
    [Storable(DefaultValue = null)]
    private double[] weights; // not set for old versions loaded from disk
    [Storable(DefaultValue = null)]
    private double[] offsets; // not set for old versions loaded from disk

    [StorableConstructor]
    private NearestNeighbourModel(StorableConstructorFlag _) : base(_) {
      kdTree = new alglib.nearestneighbor.kdtree();
    }

    private NearestNeighbourModel(NearestNeighbourModel original, Cloner cloner)
      : base(original, cloner) {
      // Deep copy of the alglib kd-tree: the alglib type is not cloneable itself,
      // so every field is copied explicitly (arrays are cloned, scalars assigned).
      kdTree = new alglib.nearestneighbor.kdtree();
      kdTree.approxf = original.kdTree.approxf;
      kdTree.boxmax = (double[])original.kdTree.boxmax.Clone();
      kdTree.boxmin = (double[])original.kdTree.boxmin.Clone();
      kdTree.buf = (double[])original.kdTree.buf.Clone();
      kdTree.curboxmax = (double[])original.kdTree.curboxmax.Clone();
      kdTree.curboxmin = (double[])original.kdTree.curboxmin.Clone();
      kdTree.curdist = original.kdTree.curdist;
      kdTree.debugcounter = original.kdTree.debugcounter;
      kdTree.idx = (int[])original.kdTree.idx.Clone();
      kdTree.kcur = original.kdTree.kcur;
      kdTree.kneeded = original.kdTree.kneeded;
      kdTree.n = original.kdTree.n;
      kdTree.nodes = (int[])original.kdTree.nodes.Clone();
      kdTree.normtype = original.kdTree.normtype;
      kdTree.nx = original.kdTree.nx;
      kdTree.ny = original.kdTree.ny;
      kdTree.r = (double[])original.kdTree.r.Clone();
      kdTree.rneeded = original.kdTree.rneeded;
      kdTree.selfmatch = original.kdTree.selfmatch;
      kdTree.splits = (double[])original.kdTree.splits.Clone();
      kdTree.tags = (int[])original.kdTree.tags.Clone();
      kdTree.x = (double[])original.kdTree.x.Clone();
      kdTree.xy = (double[,])original.kdTree.xy.Clone();

      selfMatch = original.selfMatch;
      k = original.k;
      isCompatibilityLoaded = original.IsCompatibilityLoaded;
      if (!IsCompatibilityLoaded) {
        // weights/offsets only exist for models created by current versions
        weights = new double[original.weights.Length];
        Array.Copy(original.weights, weights, weights.Length);
        offsets = new double[original.offsets.Length];
        Array.Copy(original.offsets, this.offsets, this.offsets.Length);
      }
      allowedInputVariables = (string[])original.allowedInputVariables.Clone();
      if (original.classValues != null)
        this.classValues = (double[])original.classValues.Clone();
    }

    /// <summary>
    /// Builds a nearest neighbour model from the given dataset rows.
    /// </summary>
    /// <param name="dataset">Source dataset providing input and target values.</param>
    /// <param name="rows">Indices of the training rows.</param>
    /// <param name="k">Number of neighbours considered for a prediction.</param>
    /// <param name="selfMatch">Whether a query point may match itself in the tree.</param>
    /// <param name="targetVariable">Name of the target variable.</param>
    /// <param name="allowedInputVariables">Names of the input variables.</param>
    /// <param name="weights">Optional per-input scaling factors; if null, weights are
    /// derived so each feature has unit variance (constant features get weight 1).</param>
    /// <param name="classValues">Distinct class values for classification; null for regression.</param>
    public NearestNeighbourModel(IDataset dataset, IEnumerable<int> rows, int k, bool selfMatch, string targetVariable, IEnumerable<string> allowedInputVariables, IEnumerable<double> weights = null, double[] classValues = null)
      : base(targetVariable) {
      Name = ItemName;
      Description = ItemDescription;
      this.selfMatch = selfMatch;
      this.k = k;
      this.allowedInputVariables = allowedInputVariables.ToArray();
      double[,] inputMatrix;
      if (IsCompatibilityLoaded) {
        // no scaling
        inputMatrix = dataset.ToArray(
          this.allowedInputVariables.Concat(new string[] { targetVariable }),
          rows);
      } else {
        this.offsets = this.allowedInputVariables
          .Select(name => dataset.GetDoubleValues(name, rows).Average() * -1)
          .Concat(new double[] { 0 }) // no offset for target variable
          .ToArray();
        if (weights == null) {
          // automatic determination of weights (all features should have variance = 1)
          this.weights = this.allowedInputVariables
            .Select(name => {
              var pop = dataset.GetDoubleValues(name, rows).StandardDeviationPop();
              return pop.IsAlmost(0) ? 1.0 : 1.0 / pop;
            })
            .Concat(new double[] { 1.0 }) // no scaling for target variable
            .ToArray();
        } else {
          // user specified weights (+ 1 for target)
          this.weights = weights.Concat(new double[] { 1.0 }).ToArray();
          if (this.weights.Length - 1 != this.allowedInputVariables.Length)
            throw new ArgumentException("The number of elements in the weight vector must match the number of input variables");
        }
        inputMatrix = CreateScaledData(dataset, this.allowedInputVariables.Concat(new string[] { targetVariable }), rows, this.offsets, this.weights);
      }

      if (inputMatrix.ContainsNanOrInfinity())
        throw new NotSupportedException(
          "Nearest neighbour model does not support NaN or infinity values in the input dataset.");

      this.kdTree = new alglib.nearestneighbor.kdtree();

      var nRows = inputMatrix.GetLength(0);
      var nFeatures = inputMatrix.GetLength(1) - 1;

      if (classValues != null) {
        this.classValues = (double[])classValues.Clone();
        int nClasses = classValues.Length;
        // map original class values to values [0..nClasses-1]
        var classIndices = new Dictionary<double, int>();
        for (int i = 0; i < nClasses; i++)
          classIndices[classValues[i]] = i;

        for (int row = 0; row < nRows; row++) {
          inputMatrix[row, nFeatures] = classIndices[inputMatrix[row, nFeatures]];
        }
      }
      // normtype 2 = Euclidean distance; last column (target) is stored as tag payload (ny = 1)
      alglib.nearestneighbor.kdtreebuild(inputMatrix, nRows, inputMatrix.GetLength(1) - 1, 1, 2, kdTree);
    }

    // Builds the data matrix after applying the linear transformation
    // x' = factor * (x + offset) to each column (offset/factor include the
    // appended target column, which uses offset 0 and factor 1).
    private static double[,] CreateScaledData(IDataset dataset, IEnumerable<string> variables, IEnumerable<int> rows, double[] offsets, double[] factors) {
      var transforms =
        variables.Select(
          (_, colIdx) =>
            new LinearTransformation(variables) { Addend = offsets[colIdx] * factors[colIdx], Multiplier = factors[colIdx] });
      return dataset.ToArray(variables, transforms, rows);
    }

    public override IDeepCloneable Clone(Cloner cloner) {
      return new NearestNeighbourModel(this, cloner);
    }

    /// <summary>
    /// Returns distance-weighted (1/d) averages of the target values of the
    /// k nearest neighbours for each requested row.
    /// </summary>
    public IEnumerable<double> GetEstimatedValues(IDataset dataset, IEnumerable<int> rows) {
      double[,] inputData;
      if (IsCompatibilityLoaded) {
        inputData = dataset.ToArray(allowedInputVariables, rows);
      } else {
        inputData = CreateScaledData(dataset, allowedInputVariables, rows, offsets, weights);
      }

      int n = inputData.GetLength(0);
      int columns = inputData.GetLength(1);
      double[] x = new double[columns];
      double[] dists = new double[k];
      double[,] neighbours = new double[k, columns + 1];

      for (int row = 0; row < n; row++) {
        for (int column = 0; column < columns; column++) {
          x[column] = inputData[row, column];
        }
        int numNeighbours;
        lock (kdTreeLockObject) { // gkronber: the following calls change the kdTree data structure
          numNeighbours = alglib.nearestneighbor.kdtreequeryknn(kdTree, x, k, selfMatch);
          alglib.nearestneighbor.kdtreequeryresultsdistances(kdTree, ref dists);
          alglib.nearestneighbor.kdtreequeryresultsxy(kdTree, ref neighbours);
        }
        if (selfMatch) {
          // weights for neighbours are 1/d.
          // override distances (=0) of exact matches using 1% of the distance of the next closest non-self-match neighbour -> selfmatches weight 100x more than the next closest neighbor.
          // if all k neighbours are selfmatches then they all have weight 0.01.
          double minDist = dists[0] + 1;
          for (int i = 0; i < numNeighbours; i++) {
            if ((minDist > dists[i]) && (dists[i] != 0)) {
              minDist = dists[i];
            }
          }
          minDist /= 100.0;
          for (int i = 0; i < numNeighbours; i++) {
            if (dists[i] == 0) {
              dists[i] = minDist;
            }
          }
        }
        // NOTE(review): with selfMatch == false a duplicate training point still yields
        // dist == 0 and therefore an infinite weight — presumably acceptable here since
        // the prediction then collapses to that exact neighbour; confirm with callers.
        double distanceWeightedValue = 0.0;
        double distsSum = 0.0;
        for (int i = 0; i < numNeighbours; i++) {
          distanceWeightedValue += neighbours[i, columns] / dists[i];
          distsSum += 1.0 / dists[i];
        }
        yield return distanceWeightedValue / distsSum;
      }
    }

    /// <summary>
    /// Returns the majority-vote class of the k nearest neighbours for each requested row.
    /// </summary>
    /// <exception cref="InvalidOperationException">Thrown when the model was built without class values.</exception>
    public override IEnumerable<double> GetEstimatedClassValues(IDataset dataset, IEnumerable<int> rows) {
      if (classValues == null) throw new InvalidOperationException("No class values are defined.");
      double[,] inputData;
      if (IsCompatibilityLoaded) {
        inputData = dataset.ToArray(allowedInputVariables, rows);
      } else {
        inputData = CreateScaledData(dataset, allowedInputVariables, rows, offsets, weights);
      }
      int n = inputData.GetLength(0);
      int columns = inputData.GetLength(1);
      double[] x = new double[columns];
      int[] y = new int[classValues.Length];
      double[] dists = new double[k];
      double[,] neighbours = new double[k, columns + 1];

      for (int row = 0; row < n; row++) {
        for (int column = 0; column < columns; column++) {
          x[column] = inputData[row, column];
        }
        int numNeighbours;
        lock (kdTreeLockObject) { // gkronber: the following calls change the kdTree data structure
          numNeighbours = alglib.nearestneighbor.kdtreequeryknn(kdTree, x, k, selfMatch);
          alglib.nearestneighbor.kdtreequeryresultsdistances(kdTree, ref dists);
          alglib.nearestneighbor.kdtreequeryresultsxy(kdTree, ref neighbours);
        }
        // count neighbours per class (last column holds the mapped class index [0..nClasses-1])
        Array.Clear(y, 0, y.Length);
        for (int i = 0; i < numNeighbours; i++) {
          int classValue = (int)Math.Round(neighbours[i, columns]);
          y[classValue]++;
        }

        // find the class with the largest vote count
        int maxProbClassIndex = 0;
        double maxProb = y[0];
        for (int i = 1; i < y.Length; i++) {
          if (maxProb < y[i]) {
            maxProb = y[i];
            maxProbClassIndex = i;
          }
        }
        yield return classValues[maxProbClassIndex];
      }
    }

    public bool IsProblemDataCompatible(IRegressionProblemData problemData, out string errorMessage) {
      return RegressionModel.IsProblemDataCompatible(this, problemData, out errorMessage);
    }

    public override bool IsProblemDataCompatible(IDataAnalysisProblemData problemData, out string errorMessage) {
      if (problemData == null) throw new ArgumentNullException("problemData", "The provided problemData is null.");

      var regressionProblemData = problemData as IRegressionProblemData;
      if (regressionProblemData != null)
        return IsProblemDataCompatible(regressionProblemData, out errorMessage);

      var classificationProblemData = problemData as IClassificationProblemData;
      if (classificationProblemData != null)
        return IsProblemDataCompatible(classificationProblemData, out errorMessage);

      throw new ArgumentException("The problem data is not compatible with this nearest neighbour model. Instead a " + problemData.GetType().GetPrettyName() + " was provided.", "problemData");
    }

    IRegressionSolution IRegressionModel.CreateRegressionSolution(IRegressionProblemData problemData) {
      return new NearestNeighbourRegressionSolution(this, new RegressionProblemData(problemData));
    }
    public override IClassificationSolution CreateClassificationSolution(IClassificationProblemData problemData) {
      return new NearestNeighbourClassificationSolution(this, new ClassificationProblemData(problemData));
    }

    #region events
    public event EventHandler Changed;
    private void OnChanged(EventArgs e) {
      var handlers = Changed;
      if (handlers != null)
        handlers(this, e);
    }
    #endregion

    // BackwardsCompatibility3.3
    #region Backwards compatible code, remove with 3.4
    private bool isCompatibilityLoaded = false; // new kNN models have the value false, kNN models loaded from disc have the value true
    [Storable(DefaultValue = true)]
    public bool IsCompatibilityLoaded {
      get { return isCompatibilityLoaded; }
      set { isCompatibilityLoaded = value; }
    }
    #endregion

    #region persistence
    // The alglib kd-tree is not serializable itself; each of its fields is
    // exposed as a storable property so HEAL.Attic can persist/restore it.
    [Storable]
    public double KDTreeApproxF {
      get { return kdTree.approxf; }
      set { kdTree.approxf = value; }
    }
    [Storable]
    public double[] KDTreeBoxMax {
      get { return kdTree.boxmax; }
      set { kdTree.boxmax = value; }
    }
    [Storable]
    public double[] KDTreeBoxMin {
      get { return kdTree.boxmin; }
      set { kdTree.boxmin = value; }
    }
    [Storable]
    public double[] KDTreeBuf {
      get { return kdTree.buf; }
      set { kdTree.buf = value; }
    }
    [Storable]
    public double[] KDTreeCurBoxMax {
      get { return kdTree.curboxmax; }
      set { kdTree.curboxmax = value; }
    }
    [Storable]
    public double[] KDTreeCurBoxMin {
      get { return kdTree.curboxmin; }
      set { kdTree.curboxmin = value; }
    }
    [Storable]
    public double KDTreeCurDist {
      get { return kdTree.curdist; }
      set { kdTree.curdist = value; }
    }
    [Storable]
    public int KDTreeDebugCounter {
      get { return kdTree.debugcounter; }
      set { kdTree.debugcounter = value; }
    }
    [Storable]
    public int[] KDTreeIdx {
      get { return kdTree.idx; }
      set { kdTree.idx = value; }
    }
    [Storable]
    public int KDTreeKCur {
      get { return kdTree.kcur; }
      set { kdTree.kcur = value; }
    }
    [Storable]
    public int KDTreeKNeeded {
      get { return kdTree.kneeded; }
      set { kdTree.kneeded = value; }
    }
    [Storable]
    public int KDTreeN {
      get { return kdTree.n; }
      set { kdTree.n = value; }
    }
    [Storable]
    public int[] KDTreeNodes {
      get { return kdTree.nodes; }
      set { kdTree.nodes = value; }
    }
    [Storable]
    public int KDTreeNormType {
      get { return kdTree.normtype; }
      set { kdTree.normtype = value; }
    }
    [Storable]
    public int KDTreeNX {
      get { return kdTree.nx; }
      set { kdTree.nx = value; }
    }
    [Storable]
    public int KDTreeNY {
      get { return kdTree.ny; }
      set { kdTree.ny = value; }
    }
    [Storable]
    public double[] KDTreeR {
      get { return kdTree.r; }
      set { kdTree.r = value; }
    }
    [Storable]
    public double KDTreeRNeeded {
      get { return kdTree.rneeded; }
      set { kdTree.rneeded = value; }
    }
    [Storable]
    public bool KDTreeSelfMatch {
      get { return kdTree.selfmatch; }
      set { kdTree.selfmatch = value; }
    }
    [Storable]
    public double[] KDTreeSplits {
      get { return kdTree.splits; }
      set { kdTree.splits = value; }
    }
    [Storable]
    public int[] KDTreeTags {
      get { return kdTree.tags; }
      set { kdTree.tags = value; }
    }
    [Storable]
    public double[] KDTreeX {
      get { return kdTree.x; }
      set { kdTree.x = value; }
    }
    [Storable]
    public double[,] KDTreeXY {
      get { return kdTree.xy; }
      set { kdTree.xy = value; }
    }
    #endregion
  }
}