#region License Information
/* HeuristicLab
 * Copyright (C) 2002-2012 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
#endregion

using System;
using System.Collections.Generic;
using System.Linq;
using HeuristicLab.Problems.DataAnalysis;

namespace HeuristicLab.Algorithms.DataAnalysis {
  /// <summary>
  /// Helper routines for k-means clustering: assigning rows to their closest
  /// center and calculating the intra-cluster sum of squares of a model.
  /// </summary>
  public static class KMeansClusteringUtil {
    /// <summary>
    /// Determines, for every row, which center has the smallest squared Euclidean
    /// distance to the row's values of the allowed input variables.
    /// </summary>
    /// <param name="centers">The cluster centers; each center must have one component per allowed input variable, in the same order.</param>
    /// <param name="dataset">The dataset the row values are read from.</param>
    /// <param name="allowedInputVariables">The names of the variables that span the distance space.</param>
    /// <param name="rows">The dataset rows to assign.</param>
    /// <returns>
    /// One entry per row, in the order of <paramref name="rows"/>, holding the 1-based
    /// position of the closest center within <paramref name="centers"/>. On ties the
    /// center that comes first wins. An entry stays 0 if no center yields a distance
    /// smaller than <see cref="double.MaxValue"/> (for example when
    /// <paramref name="centers"/> is empty or the distance evaluates to NaN).
    /// </returns>
    /// <exception cref="ArgumentNullException">Thrown if any argument is null.</exception>
    /// <exception cref="ArgumentException">Thrown if a center is null or its length differs from the number of allowed input variables.</exception>
    public static IEnumerable<int> FindClosestCenters(IEnumerable<double[]> centers, Dataset dataset, IEnumerable<string> allowedInputVariables, IEnumerable<int> rows) {
      if (centers == null) throw new ArgumentNullException("centers");
      if (dataset == null) throw new ArgumentNullException("dataset");
      if (allowedInputVariables == null) throw new ArgumentNullException("allowedInputVariables");
      if (rows == null) throw new ArgumentNullException("rows");

      // Materialize both sequences once. They are walked once per center (rows) and
      // once per row and center (variables); re-evaluating a lazy sequence each time
      // would be wasteful and could yield inconsistent counts.
      int[] rowArray = rows.ToArray();
      string[] variableNames = allowedInputVariables.ToArray();
      int nRows = rowArray.Length;
      int nCols = variableNames.Length;

      int[] closestCenter = new int[nRows];
      double[] bestCenterDistance = Enumerable.Repeat(double.MaxValue, nRows).ToArray();
      int centerIndex = 1; // centers are numbered starting at 1

      foreach (double[] center in centers) {
        if (center == null || center.Length != nCols)
          throw new ArgumentException("Each center must have exactly one component per allowed input variable.", "centers");

        for (int rowIndex = 0; rowIndex < nRows; rowIndex++) {
          int row = rowArray[rowIndex];
          double best = bestCenterDistance[rowIndex];

          // calculate the squared Euclidean distance of the point to the center
          double centerDistance = 0;
          for (int col = 0; col < nCols; col++) {
            double d = center[col] - dataset.GetDoubleValue(variableNames[col], row);
            centerDistance += d * d;
            // the partial sum only grows, so this center cannot win anymore
            if (centerDistance > best) break;
          }

          // strict comparison: on equal distance the earlier center is kept
          if (centerDistance < best) {
            bestCenterDistance[rowIndex] = centerDistance;
            closestCenter[rowIndex] = centerIndex;
          }
        }
        centerIndex++;
      }
      return closestCenter;
    }

    /// <summary>
    /// Calculates the sum, over all clusters, of the squared Euclidean distances between
    /// each point of a cluster and the mean of the points assigned to that cluster.
    /// </summary>
    /// <param name="model">The clustering model that supplies the cluster value of each row and the allowed input variables.</param>
    /// <param name="dataset">The dataset the row values are read from.</param>
    /// <param name="rows">The dataset rows to evaluate.</param>
    /// <returns>The intra-cluster sum of squares; 0 if <paramref name="rows"/> is empty.</returns>
    /// <remarks>
    /// Relies on <c>model.GetClusterValues</c> returning one cluster value per row,
    /// in the order of <paramref name="rows"/>.
    /// </remarks>
    /// <exception cref="ArgumentNullException">Thrown if any argument is null.</exception>
    public static double CalculateIntraClusterSumOfSquares(KMeansClusteringModel model, Dataset dataset, IEnumerable<int> rows) {
      if (model == null) throw new ArgumentNullException("model");
      if (dataset == null) throw new ArgumentNullException("dataset");
      if (rows == null) throw new ArgumentNullException("rows");

      // Materialize once: the rows are needed by the model and again when collecting the points.
      int[] rowArray = rows.ToArray();
      var clusterValues = model.GetClusterValues(dataset, rowArray).ToList();
      var allowedInputVariables = model.AllowedInputVariables.ToList();
      int nCols = allowedInputVariables.Count;

      // collect the points of each cluster
      Dictionary<int, List<double[]>> clusterPoints = new Dictionary<int, List<double[]>>();
      for (int rowIndex = 0; rowIndex < rowArray.Length; rowIndex++) {
        double[] p = new double[nCols];
        for (int i = 0; i < nCols; i++) {
          p[i] = dataset.GetDoubleValue(allowedInputVariables[i], rowArray[rowIndex]);
        }

        int clusterValue = clusterValues[rowIndex];
        List<double[]> points;
        if (!clusterPoints.TryGetValue(clusterValue, out points)) {
          points = new List<double[]>();
          clusterPoints.Add(clusterValue, points);
        }
        points.Add(p);
      }

      double allCenterDistances = 0;
      foreach (List<double[]> points in clusterPoints.Values) {
        // calculate the cluster mean; every list holds at least one point
        double[] mean = new double[nCols];
        foreach (double[] p in points) {
          for (int i = 0; i < nCols; i++) {
            mean[i] += p[i];
          }
        }
        for (int i = 0; i < nCols; i++) {
          mean[i] /= points.Count;
        }

        // sum the squared distances of the cluster's points to its mean
        double centerDistances = 0;
        foreach (double[] p in points) {
          double centerDistance = 0;
          for (int i = 0; i < nCols; i++) {
            double d = mean[i] - p[i];
            centerDistance += d * d;
          }
          centerDistances += centerDistance;
        }
        allCenterDistances += centerDistances;
      }
      return allCenterDistances;
    }
  }
}