#region License Information
/* HeuristicLab
* Copyright (C) 2002-2016 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
*
* This file is part of HeuristicLab.
*
* HeuristicLab is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* HeuristicLab is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
*/
#endregion
using System;
using System.Linq;
using System.Threading;
using HeuristicLab.Algorithms.DataAnalysis;
using HeuristicLab.Analysis;
using HeuristicLab.Common;
using HeuristicLab.Core;
using HeuristicLab.Data;
using HeuristicLab.Operators;
using HeuristicLab.Optimization;
using HeuristicLab.Parameters;
using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
using HeuristicLab.Problems.DataAnalysis;
namespace HeuristicLab.Algorithms.EGO {
[Item("FitnessClusteringAnalyzer", "Analyzes the correlation between predictions and actual fitness values")]
[StorableClass]
public class FitnessClusteringAnalyzer : SingleSuccessorOperator, IAnalyzer, IStochasticOperator, IResultsOperator {
  public override bool CanChangeName => true;
  // Clustering plus a full t-SNE run per call is expensive, so this analyzer is opt-in.
  public bool EnabledByDefault => false;

  public ILookupParameter<ModifiableDataset> DatasetParameter => (ILookupParameter<ModifiableDataset>)Parameters["Dataset"];
  public ILookupParameter<ResultCollection> ResultsParameter => (ILookupParameter<ResultCollection>)Parameters["Results"];
  public IFixedValueParameter<IntValue> KParameter => (IFixedValueParameter<IntValue>)Parameters["K"];
  public IFixedValueParameter<IntValue> K2Parameter => (IFixedValueParameter<IntValue>)Parameters["K2"];
  public ILookupParameter<IRandom> RandomParameter => (ILookupParameter<IRandom>)Parameters["Random"];

  // Result-collection keys under which the clustering solution and its t-SNE plot are stored.
  private const string SolutionName = "FitnessClustering";
  private const string PlotName = "FitnessClusterPlot";

  [StorableConstructor]
  protected FitnessClusteringAnalyzer(bool deserializing) : base(deserializing) { }
  protected FitnessClusteringAnalyzer(FitnessClusteringAnalyzer original, Cloner cloner) : base(original, cloner) { }
  public FitnessClusteringAnalyzer() {
    Parameters.Add(new LookupParameter<ModifiableDataset>("Dataset"));
    Parameters.Add(new LookupParameter<ResultCollection>("Results", "The collection to store the results in."));
    Parameters.Add(new FixedValueParameter<IntValue>("K", "The number of clusters.", new IntValue(3)));
    // K2 is used as a neighbour count in GetWeights, not as a cluster count
    // (the original description was a copy-paste of K's).
    Parameters.Add(new FixedValueParameter<IntValue>("K2", "The number of nearest neighbours considered for distance-based weighting.", new IntValue(3)));
    Parameters.Add(new LookupParameter<IRandom>("Random"));
  }

  public override IDeepCloneable Clone(Cloner cloner) {
    return new FitnessClusteringAnalyzer(this, cloner);
  }

  /// <summary>
  /// Clusters the current dataset via k-means and stores both the clustering solution
  /// and a t-SNE scatter plot (colored by cluster) in the results collection.
  /// Skipped entirely while the dataset is too small for a meaningful clustering.
  /// </summary>
  public sealed override IOperation Apply() {
    var dataset = DatasetParameter.ActualValue;
    var results = ResultsParameter.ActualValue;
    var random = RandomParameter.ActualValue;
    // Not enough rows for K clusters, or fewer than 20 samples overall: do nothing.
    if (dataset.Rows < KParameter.Value.Value || dataset.Rows < 20) return base.Apply();

    var clustering = CreateClustering(dataset, random);
    if (!results.ContainsKey(SolutionName)) results.Add(new Result(SolutionName, clustering));
    results[SolutionName].Value = clustering;

    var plot = CreateTSNEPlot(clustering, dataset, random);
    if (!results.ContainsKey(PlotName)) results.Add(new Result(PlotName, plot));
    results[PlotName].Value = plot;
    return base.Apply();
  }

  /// <summary>
  /// Runs t-SNE on a copy of the dataset augmented with the cluster assignments and
  /// returns the resulting scatter plot, with points colored by their cluster id.
  /// </summary>
  private ScatterPlot CreateTSNEPlot(KMeansClusteringSolution clustering, ModifiableDataset data, IRandom random) {
    // Work on a clone so the shared dataset is never mutated.
    var clusteredData = (ModifiableDataset)data.Clone();
    clusteredData.AddVariable("cluster", clustering.ClusterValues.Select(x => (double)x));
    // The regression problem only serves as a carrier for the data; "cluster" is set
    // as target so t-SNE can use it as the class variable for coloring.
    var prob = new RegressionProblem {
      ProblemData = new RegressionProblemData(clusteredData, new[] { "output" }, "cluster")
    };
    var tsne = new TSNEAlgorithm {
      // NOTE(review): integer division; perplexity must stay below Rows/3 for t-SNE,
      // hence Rows / 3 - 1 — confirm this matches TSNEAlgorithm's own validation.
      Perplexity = data.Rows / 3 - 1,
      Problem = prob
    };
    tsne.ClassesNameParameter.Value = tsne.ClassesNameParameter.ValidValues.FirstOrDefault(x => x.Value.Equals("cluster"));
    var res = EgoUtilities.SyncRunSubAlgorithm(tsne, random.Next(), CancellationToken.None);
    return res.Select(r => r.Value).OfType<ScatterPlot>().First();
  }

  /// <summary>
  /// Builds a k-means clustering solution over all rows of the dataset
  /// (training partition = everything, empty test partition).
  /// "output" is presumably the fitness column of the EGO dataset.
  /// </summary>
  private KMeansClusteringSolution CreateClustering(ModifiableDataset dataset, IRandom random) {
    var pd = new ClusteringProblemData(dataset, new[] { "output" });
    pd.TestPartition.Start = dataset.Rows;
    pd.TestPartition.End = dataset.Rows;
    pd.TrainingPartition.Start = 0;
    pd.TrainingPartition.End = dataset.Rows;
    return KMeansClustering.CreateKMeansSolution(pd, KParameter.Value.Value, 1);
  }

  /// <summary>
  /// Computes a per-row weight as the sum of inverse Euclidean distances to K2 other rows
  /// (1.0 for every row when K2 is 0). Only variables whose name starts with "input" are used.
  /// </summary>
  /// <exception cref="NotSupportedException">Thrown when the input matrix contains NaN or infinity.</exception>
  private double[] GetWeights(ModifiableDataset dataset) {
    var inputMatrix = dataset.ToArray(dataset.VariableNames.Where(x => x.StartsWith("input")), Enumerable.Range(0, dataset.Rows));
    if (inputMatrix.Cast<double>().Any(x => double.IsNaN(x) || double.IsInfinity(x)))
      throw new NotSupportedException("k-Means clustering does not support NaN or infinity values in the input dataset.");
    var indices = Enumerable.Range(0, inputMatrix.GetLength(0)).ToArray();
    // NOTE(review): OrderBy ascending selects the K2 *smallest* inverse distances,
    // i.e. the K2 farthest neighbours; if nearest neighbours were intended this
    // should be OrderByDescending — confirm intent before changing. Also note that
    // duplicate rows yield 1/sqrt(0) = +Infinity here.
    return indices.Select(i =>
      K2Parameter.Value.Value > 0 ? indices.Where(j => j != i).Select(j =>
        1 / Math.Sqrt(EuclideanSquared(inputMatrix, inputMatrix, i, j))
      ).OrderBy(x => x).Take(K2Parameter.Value.Value).Sum() : 1.0
    ).ToArray();
  }

  // Squared Euclidean distance between row row1 of input and row row2 of input2
  // (both matrices must have the same column count).
  private static double EuclideanSquared(double[,] input, double[,] input2, int row1, int row2) {
    var sum = 0.0;
    for (var i = 0; i < input.GetLength(1); i++) {
      var d = input[row1, i] - input2[row2, i];
      sum += d * d;
    }
    return sum;
  }
}
}