1 | /*************************************************************************
|
---|
2 | Copyright (c) 2008, Sergey Bochkanov (ALGLIB project).
|
---|
3 |
|
---|
4 | >>> SOURCE LICENSE >>>
|
---|
5 | This program is free software; you can redistribute it and/or modify
|
---|
6 | it under the terms of the GNU General Public License as published by
|
---|
7 | the Free Software Foundation (www.fsf.org); either version 2 of the
|
---|
8 | License, or (at your option) any later version.
|
---|
9 |
|
---|
10 | This program is distributed in the hope that it will be useful,
|
---|
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
13 | GNU General Public License for more details.
|
---|
14 |
|
---|
15 | A copy of the GNU General Public License is available at
|
---|
16 | http://www.fsf.org/licensing/licenses
|
---|
17 |
|
---|
18 | >>> END OF LICENSE >>>
|
---|
19 | *************************************************************************/
|
---|
20 |
|
---|
21 | using System;
|
---|
22 |
|
---|
23 | namespace alglib
|
---|
24 | {
|
---|
25 | public class kmeans
|
---|
26 | {
|
---|
/*************************************************************************
k-means++ clusterization

Runs Restarts independent passes of Lloyd's iteration, each one seeded with
the k-means++ rule, and returns the partition with the smallest total
squared distance from points to their assigned centers.

INPUT PARAMETERS:
    XY          -   dataset, array [0..NPoints-1,0..NVars-1].
    NPoints     -   dataset size, NPoints>=K
    NVars       -   number of variables, NVars>=1
    K           -   desired number of clusters, K>=1
    Restarts    -   number of restarts, Restarts>=1

OUTPUT PARAMETERS:
    Info        -   return code:
                    * -3, if task is degenerate (number of distinct points is
                          less than K)
                    * -1, if incorrect NPoints/NVars/K/Restarts was passed
                    *  1, if subroutine finished successfully
    C           -   array[0..NVars-1,0..K-1]. Matrix whose columns store
                    cluster's centers. Assigned only when Info=1.
    XYC         -   array[0..NPoints-1]. XYC[i] is the index (0..K-1) of the
                    cluster which the i-th dataset point belongs to.
                    Not touched when Info=-1; allocated, but with unspecified
                    contents, when Info=-3.

  -- ALGLIB --
     Copyright 21.03.2009 by Bochkanov Sergey
*************************************************************************/
public static void kmeansgenerate(ref double[,] xy,
    int npoints,
    int nvars,
    int k,
    int restarts,
    ref int info,
    ref double[,] c,
    ref int[] xyc)
{
    int i = 0;
    int j = 0;
    int i_ = 0;
    int pass = 0;
    int cclosest = 0;
    double dclosest = 0;
    double v = 0;
    double e = 0;
    double ebest = 0;
    bool waschanges = false;
    bool zerosizeclusters = false;

    //
    // Test parameters
    //
    if( npoints<k || nvars<1 || k<1 || restarts<1 )
    {
        info = -1;
        return;
    }

    //
    // TODO: special case K=1
    // TODO: special case K=NPoints
    //
    info = 1;

    //
    // Working storage.
    //
    // Note that for performance reasons centers are stored in ROWS of CT, not
    // in columns. We'll transpose the best CT in the end and store it in C.
    //
    double[,] ct = new double[k, nvars];
    double[,] ctbest = new double[k, nvars];
    int[] xycbest = new int[npoints];
    double[] d2 = new double[npoints];
    double[] p = new double[npoints];
    double[] tmp = new double[nvars];
    int[] csizes = new int[k];
    bool[] cbusy = new bool[k];
    xyc = new int[npoints];
    ebest = AP.Math.MaxRealNumber;

    //
    // Multiple passes of k-means++ algorithm
    //
    for(pass=1; pass<=restarts; pass++)
    {

        //
        // Select initial centers using k-means++ algorithm
        // 1. Choose first center at random
        // 2. Choose next centers using their distance from centers already chosen
        //
        i = AP.Math.RandomInteger(npoints);
        for(i_=0; i_<=nvars-1; i_++)
        {
            ct[0,i_] = xy[i,i_];
        }
        cbusy[0] = true;
        for(i=1; i<=k-1; i++)
        {
            cbusy[i] = false;
        }
        if( !selectcenterpp(ref xy, npoints, nvars, ref ct, cbusy, k, ref d2, ref p, ref tmp) )
        {
            info = -3;
            return;
        }

        //
        // Lloyd's iteration: assign points, then move centers, until the
        // assignment stops changing.
        //
        while( true )
        {

            //
            // fill XYC with center numbers
            //
            waschanges = false;
            for(i=0; i<=npoints-1; i++)
            {
                cclosest = -1;
                dclosest = AP.Math.MaxRealNumber;
                for(j=0; j<=k-1; j++)
                {
                    v = 0.0;
                    for(i_=0; i_<=nvars-1; i_++)
                    {
                        tmp[i_] = xy[i,i_] - ct[j,i_];
                        v += tmp[i_]*tmp[i_];
                    }

                    // explicit casts are kept so that both operands are
                    // compared as plain 64-bit doubles
                    if( (double)(v)<(double)(dclosest) )
                    {
                        cclosest = j;
                        dclosest = v;
                    }
                }
                if( xyc[i]!=cclosest )
                {
                    waschanges = true;
                }
                xyc[i] = cclosest;
            }

            //
            // Update centers: accumulate per-cluster sums and sizes
            //
            for(j=0; j<=k-1; j++)
            {
                csizes[j] = 0;
                for(i_=0; i_<=nvars-1; i_++)
                {
                    ct[j,i_] = 0;
                }
            }
            for(i=0; i<=npoints-1; i++)
            {
                csizes[xyc[i]] = csizes[xyc[i]]+1;
                for(i_=0; i_<=nvars-1; i_++)
                {
                    ct[xyc[i],i_] = ct[xyc[i],i_] + xy[i,i_];
                }
            }

            //
            // Turn sums into means for every non-empty cluster and detect
            // empty ones. Means are computed BEFORE the empty-cluster
            // handling below, so that re-seeding (and the next assignment
            // step) measures distances to real centers rather than to
            // unnormalized coordinate sums.
            //
            zerosizeclusters = false;
            for(j=0; j<=k-1; j++)
            {
                cbusy[j] = csizes[j]!=0;
                if( csizes[j]==0 )
                {
                    zerosizeclusters = true;
                    continue;
                }
                v = (double)(1)/(double)(csizes[j]);
                for(i_=0; i_<=nvars-1; i_++)
                {
                    ct[j,i_] = v*ct[j,i_];
                }
            }
            if( zerosizeclusters )
            {

                //
                // Some clusters have zero size - rare, but possible.
                // We'll choose new centers for such clusters using k-means++ rule
                // and restart algorithm
                //
                if( !selectcenterpp(ref xy, npoints, nvars, ref ct, cbusy, k, ref d2, ref p, ref tmp) )
                {
                    info = -3;
                    return;
                }
                continue;
            }

            //
            // stop if nothing has changed during iteration
            //
            if( !waschanges )
            {
                break;
            }
        }

        //
        // Calculate E (total squared distance to assigned centers),
        // compare with best partition found so far
        //
        e = 0;
        for(i=0; i<=npoints-1; i++)
        {
            v = 0.0;
            for(i_=0; i_<=nvars-1; i_++)
            {
                tmp[i_] = xy[i,i_] - ct[xyc[i],i_];
                v += tmp[i_]*tmp[i_];
            }
            e = e+v;
        }

        // The first pass is always stored, so the outputs come from a real
        // pass even if E is not below the initial EBest.
        if( pass==1 || (double)(e)<(double)(ebest) )
        {

            //
            // store partition: error, centers and point assignment together,
            // so that returned C and XYC always describe the same pass
            //
            ebest = e;
            blas.copymatrix(ref ct, 0, k-1, 0, nvars-1, ref ctbest, 0, k-1, 0, nvars-1);
            for(i=0; i<=npoints-1; i++)
            {
                xycbest[i] = xyc[i];
            }
        }
    }

    //
    // Return best assignment; copy and transpose best centers
    //
    for(i=0; i<=npoints-1; i++)
    {
        xyc[i] = xycbest[i];
    }
    c = new double[nvars, k];
    blas.copyandtranspose(ref ctbest, 0, k-1, 0, nvars-1, ref c, 0, nvars-1, 0, k-1);
}
|
---|
277 |
|
---|
278 |
|
---|
/*************************************************************************
Select centers for all not-yet-busy clusters using k-means++ rule

For every index CC in [0,CCnt) with BusyCenters[CC]=False a dataset point is
drawn with probability proportional to its squared distance to the nearest
busy center, and copied into row CC of Centers. A center chosen this way is
treated as busy when the following ones are chosen.

BusyCenters itself is not modified (a private copy is used). D2, P and Tmp
are caller-provided work arrays which are overwritten.

Returns False (leaving the current and remaining free rows of Centers
unfilled) when the sum of squared distances is exactly zero, True otherwise.
*************************************************************************/
private static bool selectcenterpp(ref double[,] xy,
    int npoints,
    int nvars,
    ref double[,] centers,
    bool[] busycenters,
    int ccnt,
    ref double[] d2,
    ref double[] p,
    ref double[] tmp)
{
    // private copy: flags are updated locally as centers get chosen
    bool[] taken = (bool[])busycenters.Clone();

    for(int slot=0; slot<ccnt; slot++)
    {
        if( taken[slot] )
        {
            continue;
        }

        //
        // D2[pt] = squared distance from point to the nearest taken center
        //
        for(int pt=0; pt<npoints; pt++)
        {
            d2[pt] = AP.Math.MaxRealNumber;
            for(int other=0; other<ccnt; other++)
            {
                if( !taken[other] )
                {
                    continue;
                }
                double dist2 = 0.0;
                for(int dim=0; dim<nvars; dim++)
                {
                    tmp[dim] = xy[pt,dim] - centers[other,dim];
                    dist2 += tmp[dim]*tmp[dim];
                }
                if( (double)(dist2)<(double)(d2[pt]) )
                {
                    d2[pt] = dist2;
                }
            }
        }

        //
        // P = D2 normalized to unit sum (non-cumulative)
        //
        double total = 0;
        for(int pt=0; pt<npoints; pt++)
        {
            total = total+d2[pt];
        }
        if( (double)(total)==(double)(0) )
        {
            return false;
        }
        double scale = 1/total;
        for(int pt=0; pt<npoints; pt++)
        {
            p[pt] = scale*d2[pt];
        }

        //
        // Draw one random number and walk the running sum of P until it
        // reaches the drawn value; the last point is taken if the running
        // sum never gets there.
        //
        double cumulative = 0;
        double draw = AP.Math.RandomReal();
        for(int pt=0; pt<npoints; pt++)
        {
            cumulative = cumulative+p[pt];
            bool hit = (double)(draw)<=(double)(cumulative);
            if( hit || pt==npoints-1 )
            {
                for(int dim=0; dim<nvars; dim++)
                {
                    centers[slot,dim] = xy[pt,dim];
                }
                taken[slot] = true;
                break;
            }
        }
    }
    return true;
}
|
---|
382 | }
|
---|
383 | }
|
---|