#region License Information
/* HeuristicLab
 * Copyright (C) 2002-2016 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
#endregion

using System;
using System.Linq;
using System.Threading;
using HeuristicLab.Algorithms.DataAnalysis;
using HeuristicLab.Analysis;
using HeuristicLab.Common;
using HeuristicLab.Core;
using HeuristicLab.Data;
using HeuristicLab.Operators;
using HeuristicLab.Optimization;
using HeuristicLab.Parameters;
using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
using HeuristicLab.Problems.DataAnalysis;

namespace HeuristicLab.Algorithms.EGO {
  [Item("FitnessClusteringAnalyzer", "Analyzes the correlation between predictions and actual fitness values")]
  [StorableClass]
  public class FitnessClusteringAnalyzer : SingleSuccessorOperator, IAnalyzer, IStochasticOperator, IResultsOperator {
    public override bool CanChangeName => true;
    public bool EnabledByDefault => false;

    public ILookupParameter<ModifiableDataset> DatasetParameter => (ILookupParameter<ModifiableDataset>)Parameters["Dataset"];
    public ILookupParameter<ResultCollection> ResultsParameter => (ILookupParameter<ResultCollection>)Parameters["Results"];
    public IFixedValueParameter<IntValue> KParameter => (IFixedValueParameter<IntValue>)Parameters["K"];
    public IFixedValueParameter<IntValue> K2Parameter => (IFixedValueParameter<IntValue>)Parameters["K2"];
    public ILookupParameter<IRandom> RandomParameter => (ILookupParameter<IRandom>)Parameters["Random"];

    private const string SolutionName = "FitnessClustering";
    private const string PlotName = "FitnessClusterPlot";

    [StorableConstructor]
    protected FitnessClusteringAnalyzer(bool deserializing) : base(deserializing) { }
    protected FitnessClusteringAnalyzer(FitnessClusteringAnalyzer original, Cloner cloner) : base(original, cloner) { }
    public FitnessClusteringAnalyzer() {
      Parameters.Add(new LookupParameter<ModifiableDataset>("Dataset"));
      Parameters.Add(new LookupParameter<ResultCollection>("Results", "The collection to store the results in."));
      Parameters.Add(new FixedValueParameter<IntValue>("K", "The number of clusters.", new IntValue(3)));
      Parameters.Add(new FixedValueParameter<IntValue>("K2", "The number of neighboring samples considered when computing sample weights.", new IntValue(3)));
      Parameters.Add(new LookupParameter<IRandom>("Random"));
    }
    public override IDeepCloneable Clone(Cloner cloner) { return new FitnessClusteringAnalyzer(this, cloner); }

    public sealed override IOperation Apply() {
      var dataset = DatasetParameter.ActualValue;
      var results = ResultsParameter.ActualValue;
      var random = RandomParameter.ActualValue;
      // Skip the analysis until enough samples are available for clustering.
      if (dataset.Rows < KParameter.Value.Value || dataset.Rows < 20) return base.Apply();

      // Cluster the evaluated samples and store (or update) the clustering result.
      var clustering = CreateClustering(dataset, random);
      if (!results.ContainsKey(SolutionName)) results.Add(new Result(SolutionName, clustering));
      results[SolutionName].Value = clustering;

      // Visualize the clustered samples and store (or update) the scatter plot result.
      var plot = CreateTSNEPlot(clustering, dataset, random);
      if (!results.ContainsKey(PlotName)) results.Add(new Result(PlotName, plot));
      results[PlotName].Value = plot;
      return base.Apply();
    }

    private ScatterPlot CreateTSNEPlot(KMeansClusteringSolution clustering, ModifiableDataset data, IRandom random) {
      // Project the dataset with t-SNE and color the points by their cluster index.
      var clusteredData = (ModifiableDataset)data.Clone();
      clusteredData.AddVariable("cluster", clustering.ClusterValues.Select(x => (double)x));
      var prob = new RegressionProblem {
        ProblemData = new RegressionProblemData(clusteredData, new[] { "output" }, "cluster")
      };
      var tsne = new TSNEAlgorithm {
        Perplexity = data.Rows / 3 - 1,
        Problem = prob
      };
      tsne.ClassesNameParameter.Value = tsne.ClassesNameParameter.ValidValues.FirstOrDefault(x => x.Value.Equals("cluster"));
      var res = EgoUtilities.SyncRunSubAlgorithm(tsne, random.Next(), CancellationToken.None);
      return res.Select(r => r.Value).OfType<ScatterPlot>().First();
    }

    private KMeansClusteringSolution CreateClustering(ModifiableDataset dataset, IRandom random) {
      // Cluster all rows on the "output" variable with k-means; the test partition is left empty.
      var pd = new ClusteringProblemData(dataset, new[] { "output" });
      pd.TestPartition.Start = dataset.Rows;
      pd.TestPartition.End = dataset.Rows;
      pd.TrainingPartition.Start = 0;
      pd.TrainingPartition.End = dataset.Rows;
      return KMeansClustering.CreateKMeansSolution(pd, KParameter.Value.Value, 1);
    }

    private double[] GetWeights(ModifiableDataset dataset) {
      var inputMatrix = dataset.ToArray(dataset.VariableNames.Where(x => x.StartsWith("input")), Enumerable.Range(0, dataset.Rows));
      if (inputMatrix.Cast<double>().Any(x => double.IsNaN(x) || double.IsInfinity(x)))
        throw new NotSupportedException("k-Means clustering does not support NaN or infinity values in the input dataset.");
      var indices = Enumerable.Range(0, inputMatrix.GetLength(0)).ToArray();
      // Weight each sample by a sum of inverse Euclidean distances to K2 of the other samples (1.0 if K2 <= 0).
      return indices.Select(i => K2Parameter.Value.Value > 0
        ? indices.Where(j => j != i)
                 .Select(j => 1 / Math.Sqrt(EuclideanSquared(inputMatrix, inputMatrix, i, j)))
                 .OrderBy(x => x)
                 .Take(K2Parameter.Value.Value)
                 .Sum()
        : 1.0).ToArray();
    }

    private static double EuclideanSquared(double[,] input, double[,] input2, int row1, int row2) {
      // Squared Euclidean distance between row1 of input and row2 of input2.
      var sum = 0.0;
      for (var i = 0; i < input.GetLength(1); i++) {
        var d = input[row1, i] - input2[row2, i];
        sum += d * d;
      }
      return sum;
    }
  }
}