1 | /*************************************************************************
|
---|
2 | Copyright (c) 2008, Sergey Bochkanov (ALGLIB project).
|
---|
3 |
|
---|
4 | >>> SOURCE LICENSE >>>
|
---|
5 | This program is free software; you can redistribute it and/or modify
|
---|
6 | it under the terms of the GNU General Public License as published by
|
---|
7 | the Free Software Foundation (www.fsf.org); either version 2 of the
|
---|
8 | License, or (at your option) any later version.
|
---|
9 |
|
---|
10 | This program is distributed in the hope that it will be useful,
|
---|
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
13 | GNU General Public License for more details.
|
---|
14 |
|
---|
15 | A copy of the GNU General Public License is available at
|
---|
16 | http://www.fsf.org/licensing/licenses
|
---|
17 |
|
---|
18 | >>> END OF LICENSE >>>
|
---|
19 | *************************************************************************/
|
---|
20 |
|
---|
21 |
|
---|
namespace alglib {
    public class kmeans {
        /*************************************************************************
        k-means++ clusterization

        INPUT PARAMETERS:
            XY          -   dataset, array [0..NPoints-1,0..NVars-1].
            NPoints     -   dataset size, NPoints>=K
            NVars       -   number of variables, NVars>=1
            K           -   desired number of clusters, K>=1
            Restarts    -   number of restarts, Restarts>=1

        OUTPUT PARAMETERS:
            Info        -   return code:
                            * -3, if task is degenerate (number of distinct
                                  points is less than K)
                            * -1, if incorrect NPoints/NVars/K/Restarts was
                                  passed, or XY is null / smaller than
                                  NPoints x NVars
                            *  1, if subroutine finished successfully
            C           -   array[0..NVars-1,0..K-1]. Matrix whose columns store
                            cluster centers. Not modified when Info<0.
            XYC         -   array[0..NPoints-1]. XYC[i] is the index of the
                            cluster which point i belongs to. On success it
                            corresponds to the centers returned in C (i.e. to
                            the best restart). Not modified when Info=-1;
                            contents are unspecified when Info=-3.

        NOTES:
            * the best restart is the one with the smallest sum of squared
              distances between points and their cluster centers.

          -- ALGLIB --
             Copyright 21.03.2009 by Bochkanov Sergey
        *************************************************************************/
        public static void kmeansgenerate(ref double[,] xy,
            int npoints,
            int nvars,
            int k,
            int restarts,
            ref int info,
            ref double[,] c,
            ref int[] xyc) {
            int i = 0;
            int j = 0;
            int i_ = 0;
            int pass = 0;

            //
            // Test parameters
            //
            if (npoints < k || nvars < 1 || k < 1 || restarts < 1) {
                info = -1;
                return;
            }
            if (xy == null || xy.GetLength(0) < npoints || xy.GetLength(1) < nvars) {
                info = -1;
                return;
            }

            //
            // TODO: special case K=1
            // TODO: special case K=NPoints
            //
            info = 1;

            //
            // Multiple passes of k-means++ algorithm
            //
            double[,] ct = new double[k, nvars];
            double[,] ctbest = new double[k, nvars];
            int[] xycbest = new int[npoints];
            double[] d2 = new double[npoints];
            double[] p = new double[npoints];
            double[] tmp = new double[nvars];
            int[] csizes = new int[k];
            bool[] cbusy = new bool[k];
            double ebest = AP.Math.MaxRealNumber;
            bool havebest = false;
            xyc = new int[npoints];
            for (pass = 1; pass <= restarts; pass++) {

                //
                // Select initial centers using k-means++ algorithm
                // 1. Choose first center at random
                // 2. Choose next centers using their distance from centers already chosen
                //
                // Note that for performance reasons centers are stored in ROWS of CT, not
                // in columns. We'll transpose CT in the end and store it in the C.
                //
                int first = AP.Math.RandomInteger(npoints);
                for (i_ = 0; i_ < nvars; i_++) {
                    ct[0, i_] = xy[first, i_];
                }
                cbusy[0] = true;
                for (i = 1; i < k; i++) {
                    cbusy[i] = false;
                }
                if (!selectcenterpp(ref xy, npoints, nvars, ref ct, cbusy, k, ref d2, ref p, ref tmp)) {
                    info = -3;
                    return;
                }

                //
                // Update centers:
                // 1. assign every point to its closest center
                // 2. update center positions
                //
                while (true) {

                    //
                    // fill XYC with center numbers
                    //
                    bool waschanges = false;
                    for (i = 0; i < npoints; i++) {
                        int cclosest = -1;
                        double dclosest = AP.Math.MaxRealNumber;
                        for (j = 0; j < k; j++) {
                            double dist = rowsqdist(xy, i, ct, j, nvars);
                            if (dist < dclosest) {
                                cclosest = j;
                                dclosest = dist;
                            }
                        }
                        if (xyc[i] != cclosest) {
                            waschanges = true;
                        }
                        xyc[i] = cclosest;
                    }

                    //
                    // Update centers: accumulate per-cluster sums and sizes
                    //
                    for (j = 0; j < k; j++) {
                        csizes[j] = 0;
                        for (i_ = 0; i_ < nvars; i_++) {
                            ct[j, i_] = 0;
                        }
                    }
                    for (i = 0; i < npoints; i++) {
                        int owner = xyc[i];
                        csizes[owner] = csizes[owner] + 1;
                        for (i_ = 0; i_ < nvars; i_++) {
                            ct[owner, i_] = ct[owner, i_] + xy[i, i_];
                        }
                    }

                    //
                    // Turn sums into means for every non-empty cluster.
                    // This is done BEFORE the empty-cluster check, so that
                    // re-seeding below measures distances to real centers
                    // rather than to unnormalized coordinate sums.
                    //
                    bool zerosizeclusters = false;
                    for (j = 0; j < k; j++) {
                        cbusy[j] = csizes[j] != 0;
                        if (!cbusy[j]) {
                            zerosizeclusters = true;
                            continue;
                        }
                        double scale = (double)(1) / (double)(csizes[j]);
                        for (i_ = 0; i_ < nvars; i_++) {
                            ct[j, i_] = scale * ct[j, i_];
                        }
                    }
                    if (zerosizeclusters) {

                        //
                        // Some clusters have zero size - rare, but possible.
                        // We'll choose new centers for such clusters using k-means++ rule
                        // and restart algorithm
                        //
                        if (!selectcenterpp(ref xy, npoints, nvars, ref ct, cbusy, k, ref d2, ref p, ref tmp)) {
                            info = -3;
                            return;
                        }
                        continue;
                    }

                    //
                    // stop if nothing has changed during iteration
                    //
                    if (!waschanges) {
                        break;
                    }
                }

                //
                // 3. Calculate E, compare with best centers found so far
                //
                double e = 0;
                for (i = 0; i < npoints; i++) {
                    e = e + rowsqdist(xy, i, ct, xyc[i], nvars);
                }
                if (!havebest || e < ebest) {

                    //
                    // store partition: energy, centers and point-to-cluster map.
                    // EBest must be updated here, otherwise every pass looks
                    // "better" and the last restart always wins.
                    //
                    havebest = true;
                    ebest = e;
                    blas.copymatrix(ref ct, 0, k - 1, 0, nvars - 1, ref ctbest, 0, k - 1, 0, nvars - 1);
                    for (i = 0; i < npoints; i++) {
                        xycbest[i] = xyc[i];
                    }
                }
            }

            //
            // Copy and transpose
            //
            c = new double[nvars, k];
            blas.copyandtranspose(ref ctbest, 0, k - 1, 0, nvars - 1, ref c, 0, nvars - 1, 0, k - 1);
            xyc = xycbest;
        }


        /*************************************************************************
        Squared Euclidean distance between row RowA of A and row RowB of B,
        taken over the first NVars columns.
        *************************************************************************/
        private static double rowsqdist(double[,] a,
            int rowa,
            double[,] b,
            int rowb,
            int nvars) {
            double acc = 0.0;
            for (int col = 0; col < nvars; col++) {
                double diff = a[rowa, col] - b[rowb, col];
                acc += diff * diff;
            }
            return acc;
        }


        /*************************************************************************
        Select centers for all non-busy clusters using k-means++ rule

        INPUT PARAMETERS:
            XY          -   dataset, array [0..NPoints-1,0..NVars-1]
            NPoints     -   dataset size
            NVars       -   number of variables
            Centers     -   array [0..CCnt-1,0..NVars-1], centers stored in rows.
                            Rows flagged in BusyCenters are used as is.
            BusyCenters -   array [0..CCnt-1], True for rows of Centers which
                            already hold a center. The caller's array is NOT
                            modified (a private copy is updated instead).
            CCnt        -   number of centers
            D2, P       -   work arrays [0..NPoints-1], overwritten
            Tmp         -   not used; kept so existing call sites stay valid

        OUTPUT PARAMETERS:
            Centers     -   rows which were not busy are filled with dataset
                            points chosen with probability proportional to
                            squared distance to the closest busy center.

        RESULT:
            True on success. False if every point coincides with some busy
            center (nothing left to choose from); Centers may be partially
            updated in this case.
        *************************************************************************/
        private static bool selectcenterpp(ref double[,] xy,
            int npoints,
            int nvars,
            ref double[,] centers,
            bool[] busycenters,
            int ccnt,
            ref double[] d2,
            ref double[] p,
            ref double[] tmp) {
            int i = 0;
            int j = 0;
            int cc = 0;
            int i_ = 0;

            // centers chosen here must count as busy for the following ones,
            // but the caller's flags are left untouched
            busycenters = (bool[])busycenters.Clone();

            for (cc = 0; cc < ccnt; cc++) {
                if (busycenters[cc]) {
                    continue;
                }

                //
                // fill D2 with squared distance to the closest busy center
                //
                for (i = 0; i < npoints; i++) {
                    d2[i] = AP.Math.MaxRealNumber;
                    for (j = 0; j < ccnt; j++) {
                        if (busycenters[j]) {
                            double dist = rowsqdist(xy, i, centers, j, nvars);
                            if (dist < d2[i]) {
                                d2[i] = dist;
                            }
                        }
                    }
                }

                //
                // calculate P (non-cumulative)
                //
                double total = 0;
                for (i = 0; i < npoints; i++) {
                    total = total + d2[i];
                }
                if (total == (double)(0)) {
                    return false;
                }
                double scale = 1 / total;
                for (i = 0; i < npoints; i++) {
                    p[i] = scale * d2[i];
                }

                //
                // choose one of points with probability P:
                // random number within [0,1) is generated and
                // inverse empirical CDF is used to randomly choose a point.
                //
                // Points with zero distance (i.e. coinciding with a busy center)
                // are skipped, so neither U=0 nor the rounding fallback can
                // produce a duplicate center. If rounding keeps the cumulative
                // sum below U, the last point with non-zero distance is taken.
                //
                double u = AP.Math.RandomReal();
                double cumulative = 0;
                int chosen = -1;
                for (i = 0; i < npoints; i++) {
                    if (!(d2[i] > 0)) {
                        continue;
                    }
                    chosen = i;
                    cumulative = cumulative + p[i];
                    if (u <= cumulative) {
                        break;
                    }
                }
                if (chosen < 0) {
                    // no point with a usable positive distance
                    return false;
                }
                for (i_ = 0; i_ < nvars; i_++) {
                    centers[cc, i_] = xy[chosen, i_];
                }
                busycenters[cc] = true;
            }
            return true;
        }
    }
}
|
---|