#region License Information
/* HeuristicLab
* Copyright (C) 2002-2014 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
*
* This file is part of HeuristicLab.
*
* HeuristicLab is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* HeuristicLab is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
*/
#endregion
using System;
using System.Collections.Generic;
using System.Linq;
using HeuristicLab.Common;
using HeuristicLab.Core;
using HeuristicLab.Encodings.SymbolicExpressionTreeEncoding;
using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
namespace HeuristicLab.Problems.DataAnalysis.Symbolic.Classification {
/// <summary>
/// Represents a nearest neighbour model for symbolic classification.
/// The symbolic expression tree maps every row to a single estimated value; a row is classified by a
/// majority vote among the classes of the training rows whose estimated values are closest to it.
/// </summary>
[StorableClass]
[Item("SymbolicNearestNeighbourClassificationModel", "Represents a nearest neighbour model for symbolic classification.")]
public sealed class SymbolicNearestNeighbourClassificationModel : SymbolicClassificationModel {
  // number of neighbours that take part in the vote
  [Storable]
  private int k;
  // class values of the training rows, ordered like trainedEstimatedValues (same index = same training row)
  [Storable]
  private List<double> trainedClasses;
  // estimated values of the training rows in ascending order (required by the binary search below)
  [Storable]
  private List<double> trainedEstimatedValues;
  // breaks voting ties in favour of the class that occurs more often in the training data
  [Storable]
  private ClassFrequencyComparer frequencyComparer;

  [StorableConstructor]
  private SymbolicNearestNeighbourClassificationModel(bool deserializing) : base(deserializing) { }

  /// <summary>
  /// Cloning constructor; copies k, the frequency comparer and the training data of <paramref name="original"/>.
  /// </summary>
  private SymbolicNearestNeighbourClassificationModel(SymbolicNearestNeighbourClassificationModel original, Cloner cloner)
    : base(original, cloner) {
    k = original.k;
    frequencyComparer = new ClassFrequencyComparer(original.frequencyComparer);
    // the lists can still be null for models that were persisted before they were trained;
    // new List<double>(null) would throw, so fall back to empty lists
    trainedEstimatedValues = original.trainedEstimatedValues != null
      ? new List<double>(original.trainedEstimatedValues)
      : new List<double>();
    trainedClasses = original.trainedClasses != null
      ? new List<double>(original.trainedClasses)
      : new List<double>();
  }

  /// <summary>
  /// Creates an untrained model. <see cref="RecalculateModelParameters"/> has to be called before
  /// class values can be estimated.
  /// </summary>
  /// <param name="k">Number of neighbours used for the majority vote.</param>
  /// <param name="tree">Symbolic expression tree, passed on to the base class.</param>
  /// <param name="interpreter">Interpreter used to evaluate the tree, passed on to the base class.</param>
  /// <param name="lowerEstimationLimit">Lower limit for estimated values, passed on to the base class.</param>
  /// <param name="upperEstimationLimit">Upper limit for estimated values, passed on to the base class.</param>
  public SymbolicNearestNeighbourClassificationModel(int k, ISymbolicExpressionTree tree, ISymbolicDataAnalysisExpressionTreeInterpreter interpreter, double lowerEstimationLimit = double.MinValue, double upperEstimationLimit = double.MaxValue)
    : base(tree, interpreter, lowerEstimationLimit, upperEstimationLimit) {
    this.k = k;
    frequencyComparer = new ClassFrequencyComparer();
    // start with empty (not null) training data so that an untrained model can be cloned safely
    trainedClasses = new List<double>();
    trainedEstimatedValues = new List<double>();
  }

  public override IDeepCloneable Clone(Cloner cloner) {
    return new SymbolicNearestNeighbourClassificationModel(this, cloner);
  }

  /// <summary>
  /// Lazily estimates a class value for each of the given rows.
  /// </summary>
  /// <param name="dataset">Dataset on which the symbolic expression tree is evaluated.</param>
  /// <param name="rows">Rows of <paramref name="dataset"/> to classify.</param>
  /// <returns>One class value per row, in the order in which the interpreter returns the estimated values.</returns>
  /// <exception cref="InvalidOperationException">
  /// Thrown during enumeration when a value has to be classified but the model has no training data.
  /// </exception>
  public override IEnumerable<double> GetEstimatedClassValues(Dataset dataset, IEnumerable<int> rows) {
    var estimatedValues = Interpreter.GetSymbolicExpressionTreeValues(SymbolicExpressionTree, dataset, rows)
      .LimitToRange(LowerEstimationLimit, UpperEstimationLimit);
    foreach (var ev in estimatedValues) {
      var values = trainedEstimatedValues;
      var classes = trainedClasses;
      // checked inside the loop so that enumerating zero rows on an untrained model still yields nothing
      if (values == null || classes == null || values.Count == 0)
        throw new InvalidOperationException("The nearest neighbour model has no training data. Call RecalculateModelParameters with at least one row first.");

      // find the range [lower, upper[ of training values that contains the k closest neighbours
      // the range can span more than k elements when there are equal estimated values

      // find the index of the training point to which the distance is shortest
      int lower = values.BinarySearch(ev);
      // if the element was not found exactly, BinarySearch returns the complement of the index of the next larger item
      if (lower < 0) {
        lower = ~lower;
        // lower is not necessarily the closer one
        // determine which element is closer to ev: (lower - 1) or (lower)
        if (lower == values.Count ||
            (lower > 0 && Math.Abs(ev - values[lower - 1]) < Math.Abs(ev - values[lower]))) {
          lower = lower - 1;
        }
      }
      int upper = lower + 1;

      // at this point the range [lower, upper[ includes only the closest element to ev
      // expand the range to the left or right looking for the nearest neighbours
      int neighbours = Math.Min(k, values.Count);
      while (upper - lower < neighbours) {
        // the next candidates are values[lower - 1] (left) and values[upper] (right);
        // values[lower] is already part of the range and must not be used for the distance comparison
        bool lowerIsCloser = upper >= values.Count ||
                             (lower > 0 && ev - values[lower - 1] <= values[upper] - ev);
        bool upperIsCloser = lower <= 0 ||
                             (upper < values.Count && ev - values[lower - 1] >= values[upper] - ev);
        // only reachable when the comparisons are undefined (e.g. NaN); avoids an endless loop
        if (!lowerIsCloser && !upperIsCloser) break;
        if (lowerIsCloser) {
          lower--;
          // eat up all equal values
          while (lower > 0 && values[lower - 1].IsAlmost(values[lower]))
            lower--;
        }
        if (upperIsCloser) {
          upper++;
          // eat up all equal values
          while (upper < values.Count && values[upper - 1].IsAlmost(values[upper]))
            upper++;
        }
      }

      // majority voting with preference for the bigger class in case of a tie
      yield return Enumerable.Range(lower, upper - lower)
        .Select(i => classes[i])
        .GroupBy(c => c)
        .Select(g => new { Class = g.Key, Votes = g.Count() })
        .MaxItems(p => p.Votes)
        .OrderByDescending(m => m.Class, frequencyComparer)
        .First().Class;
    }
  }

  /// <summary>
  /// Trains the model: stores the estimated values of the given rows (sorted ascending) together with
  /// their target classes and recomputes the class frequencies used for tie breaking.
  /// </summary>
  /// <param name="problemData">Problem data that provides the dataset and the target variable.</param>
  /// <param name="rows">Rows of the dataset that are used as training points.</param>
  public override void RecalculateModelParameters(IClassificationProblemData problemData, IEnumerable<int> rows) {
    // rows is consumed twice below; materialize it once so both passes see exactly the same rows
    var trainingRows = rows.ToArray();
    var estimatedValues = Interpreter.GetSymbolicExpressionTreeValues(SymbolicExpressionTree, problemData.Dataset, trainingRows)
      .LimitToRange(LowerEstimationLimit, UpperEstimationLimit);
    var targetValues = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, trainingRows);

    var trainedClasses = targetValues.ToArray();
    var trainedEstimatedValues = estimatedValues.ToArray();
    // sort the estimated values and reorder the classes alongside them
    Array.Sort(trainedEstimatedValues, trainedClasses);
    this.trainedClasses = new List<double>(trainedClasses);
    this.trainedEstimatedValues = new List<double>(trainedEstimatedValues);

    var freq = trainedClasses
      .GroupBy(c => c)
      .ToDictionary(g => g.Key, g => g.Count());
    this.frequencyComparer = new ClassFrequencyComparer(freq);
  }

  /// <summary>
  /// Creates a classification solution from a clone of this model and the given problem data.
  /// </summary>
  public override ISymbolicClassificationSolution CreateClassificationSolution(IClassificationProblemData problemData) {
    return new SymbolicClassificationSolution((ISymbolicClassificationModel)Clone(), problemData);
  }
}
/// <summary>
/// Compares class values by the frequency stored for them: a class with a higher frequency is
/// considered greater. A class with a stored frequency is greater than one without, and two classes
/// without a stored frequency are considered equal.
/// </summary>
[StorableClass]
internal sealed class ClassFrequencyComparer : IComparer<double> {
  // maps a class value to its number of occurrences
  [Storable]
  private readonly Dictionary<double, int> classFrequencies;

  [StorableConstructor]
  private ClassFrequencyComparer(bool deserializing) { }

  /// <summary>
  /// Creates a comparer without any known class frequencies.
  /// </summary>
  public ClassFrequencyComparer() {
    classFrequencies = new Dictionary<double, int>();
  }

  /// <summary>
  /// Creates a comparer that uses the given dictionary (stored by reference, not copied).
  /// </summary>
  /// <param name="frequencies">Number of occurrences for each class value.</param>
  public ClassFrequencyComparer(Dictionary<double, int> frequencies) {
    classFrequencies = frequencies;
  }

  /// <summary>
  /// Creates a comparer with its own copy of the frequencies of <paramref name="original"/>.
  /// </summary>
  public ClassFrequencyComparer(ClassFrequencyComparer original) {
    classFrequencies = new Dictionary<double, int>(original.classFrequencies);
  }

  /// <summary>
  /// Compares two class values by their stored frequencies.
  /// </summary>
  /// <returns>
  /// The result of comparing the two frequencies when both classes are known; 1 when only
  /// <paramref name="x"/> is known; -1 when only <paramref name="y"/> is known; 0 when neither is known.
  /// </returns>
  public int Compare(double x, double y) {
    int frequencyX, frequencyY;
    bool knownX = classFrequencies.TryGetValue(x, out frequencyX);
    bool knownY = classFrequencies.TryGetValue(y, out frequencyY);
    if (knownX && knownY)
      return frequencyX.CompareTo(frequencyY);
    if (knownX) return 1;
    if (knownY) return -1;
    // neither class is known: report equality so that Compare(x, y) and Compare(y, x) stay consistent
    return 0;
  }
}
}