Free cookie consent management tool by TermsFeed Policy Generator

source: branches/2457_ExpertSystem/HeuristicLab.Analysis/3.3/Clustering/CkMeans1D.cs @ 16189

Last change on this file since 16189 was 13794, checked in by abeham, 9 years ago

#2457: worked on recommendation algorithms (x-validation)

File size: 4.6 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2016 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using HeuristicLab.Common;
23using System;
24using System.Collections.Generic;
25using System.Linq;
26
27namespace HeuristicLab.Analysis {
28  /// <summary>
29  /// Implements the Ckmeans.1d.dp method. It is described in the paper:
30  /// Haizhou Wang and Mingzhou Song. 2011.
31  /// Ckmeans.1d.dp: Optimal k-means Clustering in One Dimension by Dynamic Programming
32  /// The R Journal Vol. 3/2, pp. 29-33.
33  /// available at https://journal.r-project.org/archive/2011-2/RJournal_2011-2_Wang+Song.pdf
34  /// </summary>
35  public class CkMeans1D {
36    /// <summary>
37    /// Clusters the 1-dimensional data given in <paramref name="estimations"/>.
38    /// </summary>
39    /// <param name="estimations">The 1-dimensional data that should be clustered.</param>
40    /// <param name="k">The maximum number of clusters.</param>
41    /// <param name="clusterValues">A vector of the same length as estimations that assigns to each point a cluster id.</param>
42    /// <returns>A sorted list of cluster centroids and corresponding cluster ids.</returns>
43    public static SortedList<double, int> Cluster(double[] estimations, int k, out int[] clusterValues) {
44      int nPoints = estimations.Length;
45      var distinct = estimations.Distinct().OrderBy(x => x).ToArray();
46      var max = distinct.Max();
47      if (distinct.Length <= k) {
48        var dict = distinct.Select((v, i) => new { Index = i, Value = v }).ToDictionary(x => x.Value, y => y.Index);
49        for (int i = distinct.Length; i < k; i++)
50          dict.Add(max + i - distinct.Length + 1, i);
51
52        clusterValues = new int[nPoints];
53        for (int i = 0; i < nPoints; i++)
54          if (!dict.ContainsKey(estimations[i])) clusterValues[i] = 0;
55          else clusterValues[i] = dict[estimations[i]];
56
57        return new SortedList<double, int>(dict);
58      }
59
60      var n = distinct.Length;
61      var D = new double[n, k];
62      var B = new int[n, k];
63
64      for (int m = 0; m < k; m++) {
65        for (int j = m; j <= n - k + m; j++) {
66          if (m == 0)
67            D[j, m] = SumOfSquaredDistances(distinct, 0, j + 1);
68          else {
69            var minD = double.MaxValue;
70            var minI = 0;
71            for (int i = 1; i <= j; i++) {
72              var d = D[i - 1, m - 1] + SumOfSquaredDistances(distinct, i, j + 1);
73              if (d < minD) {
74                minD = d;
75                minI = i;
76              }
77            }
78            D[j, m] = minD;
79            B[j, m] = minI;
80          }
81        }
82      }
83
84      var centers = new SortedList<double, int>();
85      var upper = B[n - 1, k - 1];
86      var c = Mean(distinct, upper, n);
87      centers.Add(c, k - 1);
88      for (int i = k - 2; i >= 0; i--) {
89        var lower = B[upper - 1, i];
90        var c2 = Mean(distinct, lower, upper);
91        centers.Add(c2, i);
92        upper = lower;
93      }
94
95      clusterValues = new int[nPoints];
96      for (int i = 0; i < estimations.Length; i++) {
97        clusterValues[i] = centers.MinItems(x => Math.Abs(estimations[i] - x.Key)).First().Value;
98      }
99
100      return centers;
101    }
102
103    private static double SumOfSquaredDistances(double[] x, int start, int end) {
104      if (start == end) throw new InvalidOperationException();
105      if (start + 1 == end) return 0.0;
106      double mean = 0.0;
107      for (int i = start; i < end; i++) {
108        mean += x[i];
109      }
110      mean /= (end - start);
111      var sum = 0.0;
112      for (int i = start; i < end; i++) {
113        sum += (x[i] - mean) * (x[i] - mean);
114      }
115      return sum;
116    }
117
118    private static double Mean(double[] x, int start, int end) {
119      if (start == end) throw new InvalidOperationException();
120      double mean = 0.0;
121      for (int i = start; i < end; i++) {
122        mean += x[i];
123      }
124      mean /= (end - start);
125      return mean;
126    }
127  }
128}
Note: See TracBrowser for help on using the repository browser.