#region License Information /* HeuristicLab * Copyright (C) 2002-2011 Heuristic and Evolutionary Algorithms Laboratory (HEAL) * * This file is part of HeuristicLab. * * HeuristicLab is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * HeuristicLab is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with HeuristicLab. If not, see . */ #endregion using System.Collections.Generic; using System.Linq; using HeuristicLab.Problems.DataAnalysis; using System; namespace HeuristicLab.Algorithms.DataAnalysis { public static class KMeansClusteringUtil { public static double[,] PrepareInputMatrix(Dataset dataset, IEnumerable allowedInputVariables, IEnumerable rows) { List allowedRows = CalculateAllowedRows(dataset, allowedInputVariables, rows).ToList(); double[,] matrix = new double[allowedRows.Count, allowedInputVariables.Count()]; for (int row = 0; row < allowedRows.Count; row++) { int col = 0; foreach (string column in allowedInputVariables) { matrix[row, col] = dataset[column, row]; col++; } } return matrix; } private static IEnumerable CalculateAllowedRows(Dataset dataset, IEnumerable allowedInputVariables, IEnumerable rows) { // return only rows that contain no infinity or NaN values return from row in rows where (from inputVariable in allowedInputVariables let x = dataset[inputVariable, row] where double.IsInfinity(x) || double.IsNaN(x) select 1) .Any() == false select row; } public static IEnumerable FindClosestCenters(IEnumerable centers, Dataset dataset, IEnumerable allowedInputVariables, IEnumerable rows) { int nRows = rows.Count(); int nCols = allowedInputVariables.Count(); int[] closestCenter = new int[nRows]; double[] bestCenterDistance = Enumerable.Repeat(double.MaxValue, nRows).ToArray(); int centerIndex = 1; foreach (double[] center in centers) { if (nCols != center.Length) throw new ArgumentException(); int rowIndex = 0; foreach (var row in rows) { // calc euclidian distance of point to center double centerDistance = 0; int col = 0; foreach (var inputVariable in allowedInputVariables) { double d = center[col++] - dataset[inputVariable, row]; d = d * d; // square; centerDistance += d; if (centerDistance > bestCenterDistance[rowIndex]) break; } if (centerDistance < bestCenterDistance[rowIndex]) { bestCenterDistance[rowIndex] = centerDistance; closestCenter[rowIndex] = centerIndex; } rowIndex++; } centerIndex++; } return closestCenter; } public static double CalculateIntraClusterSumOfSquares(KMeansClusteringModel model, Dataset dataset, IEnumerable rows) { List clusterValues = model.GetClusterValues(dataset, rows).ToList(); List allowedInputVariables = model.AllowedInputVariables.ToList(); int nCols = allowedInputVariables.Count; Dictionary> clusterPoints = new Dictionary>(); Dictionary clusterMeans = new Dictionary(); foreach (var clusterValue in clusterValues.Distinct()) { clusterPoints.Add(clusterValue, new List()); } // collect points of clusters int clusterValueIndex = 0; foreach (var row in rows) { double[] p = new double[allowedInputVariables.Count]; for (int i = 0; i < nCols; i++) { p[i] = dataset[allowedInputVariables[i], row]; } clusterPoints[clusterValues[clusterValueIndex++]].Add(p); } // calculate cluster means foreach (var pair in clusterPoints) { double[] mean = new double[nCols]; foreach (var p in pair.Value) { for (int i = 0; i < nCols; i++) { mean[i] += p[i]; } } for (int i = 0; i < nCols; i++) { mean[i] /= pair.Value.Count; } clusterMeans[pair.Key] = mean; } // calculate distances double allCenterDistances = 0; foreach (var pair in clusterMeans) { double[] mean = pair.Value; double centerDistances = 0; foreach (var clusterPoint in clusterPoints[pair.Key]) { double centerDistance = 0; for (int i = 0; i < nCols; i++) { double d = mean[i] - clusterPoint[i]; d = d * d; centerDistance += d; } centerDistances += centerDistance; } allCenterDistances += centerDistances; } return allCenterDistances; } } }