Free cookie consent management tool by TermsFeed Policy Generator

source: branches/DataAnalysis Refactoring/HeuristicLab.Algorithms.DataAnalysis/3.4/kMeans/KMeansClusteringUtil.cs @ 5651

Last change on this file since 5651 was 5651, checked in by gkronber, 14 years ago

#1418 implemented wrapper classes for k-Means clustering in alglib.

File size: 5.4 KB
RevLine 
[5651]1#region License Information
/* HeuristicLab
 * Copyright (C) 2002-2011 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
20#endregion
21
22using System.Collections.Generic;
23using System.Linq;
24using HeuristicLab.Problems.DataAnalysis;
25using System;
26
namespace HeuristicLab.Algorithms.DataAnalysis {
  /// <summary>
  /// Static helpers for k-means clustering: building a dense input matrix from a dataset,
  /// assigning rows to their closest cluster center and computing the intra-cluster
  /// sum of squared distances.
  /// </summary>
  public static class KMeansClusteringUtil {
    /// <summary>
    /// Copies the values of the given input variables into a dense matrix, one matrix row
    /// per usable dataset row. Dataset rows that contain a NaN or infinite value in any of
    /// the input variables are skipped.
    /// </summary>
    /// <param name="dataset">Dataset the values are read from.</param>
    /// <param name="allowedInputVariables">Variables to copy; their order defines the matrix columns.</param>
    /// <param name="rows">Dataset row indices to consider.</param>
    /// <returns>A matrix with one row per usable dataset row and one column per input variable.</returns>
    public static double[,] PrepareInputMatrix(Dataset dataset, IEnumerable<string> allowedInputVariables, IEnumerable<int> rows) {
      // Materialize once: the variables are traversed for every row, and a lazy sequence
      // would otherwise be re-evaluated each time.
      List<string> inputVariables = allowedInputVariables.ToList();
      List<int> allowedRows = CalculateAllowedRows(dataset, inputVariables, rows).ToList();

      double[,] matrix = new double[allowedRows.Count, inputVariables.Count];
      for (int matrixRow = 0; matrixRow < allowedRows.Count; matrixRow++) {
        // The matrix position and the dataset row index are different things: the value must
        // be read from the dataset row that survived filtering, not from the loop counter.
        int datasetRow = allowedRows[matrixRow];
        for (int col = 0; col < inputVariables.Count; col++) {
          matrix[matrixRow, col] = dataset[inputVariables[col], datasetRow];
        }
      }
      return matrix;
    }

    /// <summary>
    /// Lazily filters <paramref name="rows"/> down to those rows for which every input
    /// variable holds a value that is neither NaN nor infinite.
    /// </summary>
    private static IEnumerable<int> CalculateAllowedRows(Dataset dataset, IEnumerable<string> allowedInputVariables, IEnumerable<int> rows) {
      return rows.Where(row => allowedInputVariables.All(inputVariable => IsFinite(dataset[inputVariable, row])));
    }

    /// <summary>Returns true when <paramref name="x"/> is neither NaN nor positive/negative infinity.</summary>
    private static bool IsFinite(double x) {
      return !double.IsInfinity(x) && !double.IsNaN(x);
    }

    /// <summary>
    /// Determines, for every row, the center with the smallest squared Euclidean distance
    /// over the given input variables.
    /// </summary>
    /// <param name="centers">Cluster centers; each must have one coordinate per input variable.</param>
    /// <param name="dataset">Dataset the row values are read from.</param>
    /// <param name="allowedInputVariables">Variables that span the space; order must match the center coordinates.</param>
    /// <param name="rows">Dataset row indices to classify.</param>
    /// <returns>
    /// One entry per row (in the order of <paramref name="rows"/>) holding the 1-based index of the
    /// closest center. An entry stays 0 when no center produced a distance below
    /// <see cref="double.MaxValue"/>, e.g. when there are no centers or the distance is NaN.
    /// </returns>
    /// <exception cref="ArgumentException">A center's length differs from the number of input variables.</exception>
    public static IEnumerable<int> FindClosestCenters(IEnumerable<double[]> centers, Dataset dataset, IEnumerable<string> allowedInputVariables, IEnumerable<int> rows) {
      // Both sequences are walked once per center (and the variables once per row as well),
      // so snapshot them up front instead of re-enumerating the caller's sequences.
      List<int> rowList = rows.ToList();
      List<string> inputVariables = allowedInputVariables.ToList();
      int nRows = rowList.Count;
      int nCols = inputVariables.Count;

      int[] closestCenter = new int[nRows];
      double[] bestCenterDistance = Enumerable.Repeat(double.MaxValue, nRows).ToArray();
      int centerIndex = 1; // reported center indices are 1-based; 0 is left for "no center found"

      foreach (double[] center in centers) {
        if (nCols != center.Length) {
          throw new ArgumentException("The length of a cluster center does not match the number of allowed input variables.", "centers");
        }
        for (int rowIndex = 0; rowIndex < nRows; rowIndex++) {
          int row = rowList[rowIndex];
          // squared Euclidean distance of the point to the center
          double centerDistance = 0;
          for (int col = 0; col < nCols; col++) {
            double d = center[col] - dataset[inputVariables[col], row];
            centerDistance += d * d;
            // the partial sum only grows, so this center cannot win any more
            if (centerDistance > bestCenterDistance[rowIndex]) break;
          }
          if (centerDistance < bestCenterDistance[rowIndex]) {
            bestCenterDistance[rowIndex] = centerDistance;
            closestCenter[rowIndex] = centerIndex;
          }
        }
        centerIndex++;
      }
      return closestCenter;
    }

    /// <summary>
    /// Computes the sum, over all clusters, of the squared Euclidean distances between each
    /// point and the mean of the points assigned to the same cluster.
    /// </summary>
    /// <param name="model">Model supplying the cluster value of every row and the input variables to use.</param>
    /// <param name="dataset">Dataset the row values are read from.</param>
    /// <param name="rows">Dataset row indices to evaluate.</param>
    /// <returns>The total intra-cluster sum of squares; 0 when <paramref name="rows"/> is empty.</returns>
    public static double CalculateIntraClusterSumOfSquares(KMeansClusteringModel model, Dataset dataset, IEnumerable<int> rows) {
      // Snapshot the rows so the model and the loop below see the same sequence exactly once.
      List<int> rowList = rows.ToList();
      List<int> clusterValues = model.GetClusterValues(dataset, rowList).ToList();
      List<string> allowedInputVariables = model.AllowedInputVariables.ToList();
      int nCols = allowedInputVariables.Count;

      // collect the points of each cluster; the i-th cluster value belongs to the i-th row
      Dictionary<int, List<double[]>> clusterPoints = new Dictionary<int, List<double[]>>();
      for (int rowIndex = 0; rowIndex < rowList.Count; rowIndex++) {
        double[] point = new double[nCols];
        for (int i = 0; i < nCols; i++) {
          point[i] = dataset[allowedInputVariables[i], rowList[rowIndex]];
        }

        int clusterValue = clusterValues[rowIndex];
        List<double[]> points;
        if (!clusterPoints.TryGetValue(clusterValue, out points)) {
          points = new List<double[]>();
          clusterPoints.Add(clusterValue, points);
        }
        points.Add(point);
      }

      double allCenterDistances = 0;
      foreach (List<double[]> points in clusterPoints.Values) {
        // mean of the cluster (every list holds at least one point, so Count is never 0)
        double[] mean = new double[nCols];
        foreach (double[] point in points) {
          for (int i = 0; i < nCols; i++) {
            mean[i] += point[i];
          }
        }
        for (int i = 0; i < nCols; i++) {
          mean[i] /= points.Count;
        }

        // squared distances of the cluster's points to that mean
        double centerDistances = 0;
        foreach (double[] point in points) {
          double centerDistance = 0;
          for (int i = 0; i < nCols; i++) {
            double d = mean[i] - point[i];
            centerDistance += d * d;
          }
          centerDistances += centerDistance;
        }
        allCenterDistances += centerDistances;
      }
      return allCenterDistances;
    }
  }
}
Note: See TracBrowser for help on using the repository browser.