[2806] | 1 | /*************************************************************************
|
---|
| 2 | Copyright (c) 2008, Sergey Bochkanov (ALGLIB project).
|
---|
| 3 |
|
---|
| 4 | >>> SOURCE LICENSE >>>
|
---|
| 5 | This program is free software; you can redistribute it and/or modify
|
---|
| 6 | it under the terms of the GNU General Public License as published by
|
---|
| 7 | the Free Software Foundation (www.fsf.org); either version 2 of the
|
---|
| 8 | License, or (at your option) any later version.
|
---|
| 9 |
|
---|
| 10 | This program is distributed in the hope that it will be useful,
|
---|
| 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 13 | GNU General Public License for more details.
|
---|
| 14 |
|
---|
| 15 | A copy of the GNU General Public License is available at
|
---|
| 16 | http://www.fsf.org/licensing/licenses
|
---|
| 17 |
|
---|
| 18 | >>> END OF LICENSE >>>
|
---|
| 19 | *************************************************************************/
|
---|
| 20 |
|
---|
| 21 | using System;
|
---|
| 22 |
|
---|
| 23 | namespace alglib
|
---|
| 24 | {
|
---|
| 25 | public class kmeans
|
---|
| 26 | {
|
---|
| 27 | /*************************************************************************
|
---|
| 28 | k-means++ clusterization
|
---|
| 29 |
|
---|
| 30 | INPUT PARAMETERS:
|
---|
| 31 | XY - dataset, array [0..NPoints-1,0..NVars-1].
|
---|
| 32 | NPoints - dataset size, NPoints>=K
|
---|
| 33 | NVars - number of variables, NVars>=1
|
---|
| 34 | K - desired number of clusters, K>=1
|
---|
| 35 | Restarts - number of restarts, Restarts>=1
|
---|
| 36 |
|
---|
| 37 | OUTPUT PARAMETERS:
|
---|
| 38 | Info - return code:
|
---|
| 39 | * -3, if taskis degenerate (number of distinct points is
|
---|
| 40 | less than K)
|
---|
| 41 | * -1, if incorrect NPoints/NFeatures/K/Restarts was passed
|
---|
| 42 | * 1, if subroutine finished successfully
|
---|
| 43 | C - array[0..NVars-1,0..K-1].matrix whose columns store
|
---|
| 44 | cluster's centers
|
---|
| 45 | XYC - array which contains number of clusters dataset points
|
---|
| 46 | belong to.
|
---|
| 47 |
|
---|
| 48 | -- ALGLIB --
|
---|
| 49 | Copyright 21.03.2009 by Bochkanov Sergey
|
---|
| 50 | *************************************************************************/
|
---|
| 51 | public static void kmeansgenerate(ref double[,] xy,
|
---|
| 52 | int npoints,
|
---|
| 53 | int nvars,
|
---|
| 54 | int k,
|
---|
| 55 | int restarts,
|
---|
| 56 | ref int info,
|
---|
| 57 | ref double[,] c,
|
---|
| 58 | ref int[] xyc)
|
---|
| 59 | {
|
---|
| 60 | int i = 0;
|
---|
| 61 | int j = 0;
|
---|
| 62 | double[,] ct = new double[0,0];
|
---|
| 63 | double[,] ctbest = new double[0,0];
|
---|
| 64 | double e = 0;
|
---|
| 65 | double ebest = 0;
|
---|
| 66 | double[] x = new double[0];
|
---|
| 67 | double[] tmp = new double[0];
|
---|
| 68 | double[] d2 = new double[0];
|
---|
| 69 | double[] p = new double[0];
|
---|
| 70 | int[] csizes = new int[0];
|
---|
| 71 | bool[] cbusy = new bool[0];
|
---|
| 72 | double v = 0;
|
---|
| 73 | int cclosest = 0;
|
---|
| 74 | double dclosest = 0;
|
---|
| 75 | double[] work = new double[0];
|
---|
| 76 | bool waschanges = new bool();
|
---|
| 77 | bool zerosizeclusters = new bool();
|
---|
| 78 | int pass = 0;
|
---|
| 79 | int i_ = 0;
|
---|
| 80 |
|
---|
| 81 |
|
---|
| 82 | //
|
---|
| 83 | // Test parameters
|
---|
| 84 | //
|
---|
| 85 | if( npoints<k | nvars<1 | k<1 | restarts<1 )
|
---|
| 86 | {
|
---|
| 87 | info = -1;
|
---|
| 88 | return;
|
---|
| 89 | }
|
---|
| 90 |
|
---|
| 91 | //
|
---|
| 92 | // TODO: special case K=1
|
---|
| 93 | // TODO: special case K=NPoints
|
---|
| 94 | //
|
---|
| 95 | info = 1;
|
---|
| 96 |
|
---|
| 97 | //
|
---|
| 98 | // Multiple passes of k-means++ algorithm
|
---|
| 99 | //
|
---|
| 100 | ct = new double[k-1+1, nvars-1+1];
|
---|
| 101 | ctbest = new double[k-1+1, nvars-1+1];
|
---|
| 102 | xyc = new int[npoints-1+1];
|
---|
| 103 | d2 = new double[npoints-1+1];
|
---|
| 104 | p = new double[npoints-1+1];
|
---|
| 105 | tmp = new double[nvars-1+1];
|
---|
| 106 | csizes = new int[k-1+1];
|
---|
| 107 | cbusy = new bool[k-1+1];
|
---|
| 108 | ebest = AP.Math.MaxRealNumber;
|
---|
| 109 | for(pass=1; pass<=restarts; pass++)
|
---|
| 110 | {
|
---|
| 111 |
|
---|
| 112 | //
|
---|
| 113 | // Select initial centers using k-means++ algorithm
|
---|
| 114 | // 1. Choose first center at random
|
---|
| 115 | // 2. Choose next centers using their distance from centers already chosen
|
---|
| 116 | //
|
---|
| 117 | // Note that for performance reasons centers are stored in ROWS of CT, not
|
---|
| 118 | // in columns. We'll transpose CT in the end and store it in the C.
|
---|
| 119 | //
|
---|
| 120 | i = AP.Math.RandomInteger(npoints);
|
---|
| 121 | for(i_=0; i_<=nvars-1;i_++)
|
---|
| 122 | {
|
---|
| 123 | ct[0,i_] = xy[i,i_];
|
---|
| 124 | }
|
---|
| 125 | cbusy[0] = true;
|
---|
| 126 | for(i=1; i<=k-1; i++)
|
---|
| 127 | {
|
---|
| 128 | cbusy[i] = false;
|
---|
| 129 | }
|
---|
| 130 | if( !selectcenterpp(ref xy, npoints, nvars, ref ct, cbusy, k, ref d2, ref p, ref tmp) )
|
---|
| 131 | {
|
---|
| 132 | info = -3;
|
---|
| 133 | return;
|
---|
| 134 | }
|
---|
| 135 |
|
---|
| 136 | //
|
---|
| 137 | // Update centers:
|
---|
| 138 | // 2. update center positions
|
---|
| 139 | //
|
---|
| 140 | while( true )
|
---|
| 141 | {
|
---|
| 142 |
|
---|
| 143 | //
|
---|
| 144 | // fill XYC with center numbers
|
---|
| 145 | //
|
---|
| 146 | waschanges = false;
|
---|
| 147 | for(i=0; i<=npoints-1; i++)
|
---|
| 148 | {
|
---|
| 149 | cclosest = -1;
|
---|
| 150 | dclosest = AP.Math.MaxRealNumber;
|
---|
| 151 | for(j=0; j<=k-1; j++)
|
---|
| 152 | {
|
---|
| 153 | for(i_=0; i_<=nvars-1;i_++)
|
---|
| 154 | {
|
---|
| 155 | tmp[i_] = xy[i,i_];
|
---|
| 156 | }
|
---|
| 157 | for(i_=0; i_<=nvars-1;i_++)
|
---|
| 158 | {
|
---|
| 159 | tmp[i_] = tmp[i_] - ct[j,i_];
|
---|
| 160 | }
|
---|
| 161 | v = 0.0;
|
---|
| 162 | for(i_=0; i_<=nvars-1;i_++)
|
---|
| 163 | {
|
---|
| 164 | v += tmp[i_]*tmp[i_];
|
---|
| 165 | }
|
---|
| 166 | if( (double)(v)<(double)(dclosest) )
|
---|
| 167 | {
|
---|
| 168 | cclosest = j;
|
---|
| 169 | dclosest = v;
|
---|
| 170 | }
|
---|
| 171 | }
|
---|
| 172 | if( xyc[i]!=cclosest )
|
---|
| 173 | {
|
---|
| 174 | waschanges = true;
|
---|
| 175 | }
|
---|
| 176 | xyc[i] = cclosest;
|
---|
| 177 | }
|
---|
| 178 |
|
---|
| 179 | //
|
---|
| 180 | // Update centers
|
---|
| 181 | //
|
---|
| 182 | for(j=0; j<=k-1; j++)
|
---|
| 183 | {
|
---|
| 184 | csizes[j] = 0;
|
---|
| 185 | }
|
---|
| 186 | for(i=0; i<=k-1; i++)
|
---|
| 187 | {
|
---|
| 188 | for(j=0; j<=nvars-1; j++)
|
---|
| 189 | {
|
---|
| 190 | ct[i,j] = 0;
|
---|
| 191 | }
|
---|
| 192 | }
|
---|
| 193 | for(i=0; i<=npoints-1; i++)
|
---|
| 194 | {
|
---|
| 195 | csizes[xyc[i]] = csizes[xyc[i]]+1;
|
---|
| 196 | for(i_=0; i_<=nvars-1;i_++)
|
---|
| 197 | {
|
---|
| 198 | ct[xyc[i],i_] = ct[xyc[i],i_] + xy[i,i_];
|
---|
| 199 | }
|
---|
| 200 | }
|
---|
| 201 | zerosizeclusters = false;
|
---|
| 202 | for(i=0; i<=k-1; i++)
|
---|
| 203 | {
|
---|
| 204 | cbusy[i] = csizes[i]!=0;
|
---|
| 205 | zerosizeclusters = zerosizeclusters | csizes[i]==0;
|
---|
| 206 | }
|
---|
| 207 | if( zerosizeclusters )
|
---|
| 208 | {
|
---|
| 209 |
|
---|
| 210 | //
|
---|
| 211 | // Some clusters have zero size - rare, but possible.
|
---|
| 212 | // We'll choose new centers for such clusters using k-means++ rule
|
---|
| 213 | // and restart algorithm
|
---|
| 214 | //
|
---|
| 215 | if( !selectcenterpp(ref xy, npoints, nvars, ref ct, cbusy, k, ref d2, ref p, ref tmp) )
|
---|
| 216 | {
|
---|
| 217 | info = -3;
|
---|
| 218 | return;
|
---|
| 219 | }
|
---|
| 220 | continue;
|
---|
| 221 | }
|
---|
| 222 | for(j=0; j<=k-1; j++)
|
---|
| 223 | {
|
---|
| 224 | v = (double)(1)/(double)(csizes[j]);
|
---|
| 225 | for(i_=0; i_<=nvars-1;i_++)
|
---|
| 226 | {
|
---|
| 227 | ct[j,i_] = v*ct[j,i_];
|
---|
| 228 | }
|
---|
| 229 | }
|
---|
| 230 |
|
---|
| 231 | //
|
---|
| 232 | // if nothing has changed during iteration
|
---|
| 233 | //
|
---|
| 234 | if( !waschanges )
|
---|
| 235 | {
|
---|
| 236 | break;
|
---|
| 237 | }
|
---|
| 238 | }
|
---|
| 239 |
|
---|
| 240 | //
|
---|
| 241 | // 3. Calculate E, compare with best centers found so far
|
---|
| 242 | //
|
---|
| 243 | e = 0;
|
---|
| 244 | for(i=0; i<=npoints-1; i++)
|
---|
| 245 | {
|
---|
| 246 | for(i_=0; i_<=nvars-1;i_++)
|
---|
| 247 | {
|
---|
| 248 | tmp[i_] = xy[i,i_];
|
---|
| 249 | }
|
---|
| 250 | for(i_=0; i_<=nvars-1;i_++)
|
---|
| 251 | {
|
---|
| 252 | tmp[i_] = tmp[i_] - ct[xyc[i],i_];
|
---|
| 253 | }
|
---|
| 254 | v = 0.0;
|
---|
| 255 | for(i_=0; i_<=nvars-1;i_++)
|
---|
| 256 | {
|
---|
| 257 | v += tmp[i_]*tmp[i_];
|
---|
| 258 | }
|
---|
| 259 | e = e+v;
|
---|
| 260 | }
|
---|
| 261 | if( (double)(e)<(double)(ebest) )
|
---|
| 262 | {
|
---|
| 263 |
|
---|
| 264 | //
|
---|
| 265 | // store partition
|
---|
| 266 | //
|
---|
| 267 | blas.copymatrix(ref ct, 0, k-1, 0, nvars-1, ref ctbest, 0, k-1, 0, nvars-1);
|
---|
| 268 | }
|
---|
| 269 | }
|
---|
| 270 |
|
---|
| 271 | //
|
---|
| 272 | // Copy and transpose
|
---|
| 273 | //
|
---|
| 274 | c = new double[nvars-1+1, k-1+1];
|
---|
| 275 | blas.copyandtranspose(ref ctbest, 0, k-1, 0, nvars-1, ref c, 0, nvars-1, 0, k-1);
|
---|
| 276 | }
|
---|
| 277 |
|
---|
| 278 |
|
---|
| 279 | /*************************************************************************
|
---|
| 280 | Select center for a new cluster using k-means++ rule
|
---|
| 281 | *************************************************************************/
|
---|
| 282 | private static bool selectcenterpp(ref double[,] xy,
|
---|
| 283 | int npoints,
|
---|
| 284 | int nvars,
|
---|
| 285 | ref double[,] centers,
|
---|
| 286 | bool[] busycenters,
|
---|
| 287 | int ccnt,
|
---|
| 288 | ref double[] d2,
|
---|
| 289 | ref double[] p,
|
---|
| 290 | ref double[] tmp)
|
---|
| 291 | {
|
---|
| 292 | bool result = new bool();
|
---|
| 293 | int i = 0;
|
---|
| 294 | int j = 0;
|
---|
| 295 | int cc = 0;
|
---|
| 296 | double v = 0;
|
---|
| 297 | double s = 0;
|
---|
| 298 | int i_ = 0;
|
---|
| 299 |
|
---|
| 300 | busycenters = (bool[])busycenters.Clone();
|
---|
| 301 |
|
---|
| 302 | result = true;
|
---|
| 303 | for(cc=0; cc<=ccnt-1; cc++)
|
---|
| 304 | {
|
---|
| 305 | if( !busycenters[cc] )
|
---|
| 306 | {
|
---|
| 307 |
|
---|
| 308 | //
|
---|
| 309 | // fill D2
|
---|
| 310 | //
|
---|
| 311 | for(i=0; i<=npoints-1; i++)
|
---|
| 312 | {
|
---|
| 313 | d2[i] = AP.Math.MaxRealNumber;
|
---|
| 314 | for(j=0; j<=ccnt-1; j++)
|
---|
| 315 | {
|
---|
| 316 | if( busycenters[j] )
|
---|
| 317 | {
|
---|
| 318 | for(i_=0; i_<=nvars-1;i_++)
|
---|
| 319 | {
|
---|
| 320 | tmp[i_] = xy[i,i_];
|
---|
| 321 | }
|
---|
| 322 | for(i_=0; i_<=nvars-1;i_++)
|
---|
| 323 | {
|
---|
| 324 | tmp[i_] = tmp[i_] - centers[j,i_];
|
---|
| 325 | }
|
---|
| 326 | v = 0.0;
|
---|
| 327 | for(i_=0; i_<=nvars-1;i_++)
|
---|
| 328 | {
|
---|
| 329 | v += tmp[i_]*tmp[i_];
|
---|
| 330 | }
|
---|
| 331 | if( (double)(v)<(double)(d2[i]) )
|
---|
| 332 | {
|
---|
| 333 | d2[i] = v;
|
---|
| 334 | }
|
---|
| 335 | }
|
---|
| 336 | }
|
---|
| 337 | }
|
---|
| 338 |
|
---|
| 339 | //
|
---|
| 340 | // calculate P (non-cumulative)
|
---|
| 341 | //
|
---|
| 342 | s = 0;
|
---|
| 343 | for(i=0; i<=npoints-1; i++)
|
---|
| 344 | {
|
---|
| 345 | s = s+d2[i];
|
---|
| 346 | }
|
---|
| 347 | if( (double)(s)==(double)(0) )
|
---|
| 348 | {
|
---|
| 349 | result = false;
|
---|
| 350 | return result;
|
---|
| 351 | }
|
---|
| 352 | s = 1/s;
|
---|
| 353 | for(i_=0; i_<=npoints-1;i_++)
|
---|
| 354 | {
|
---|
| 355 | p[i_] = s*d2[i_];
|
---|
| 356 | }
|
---|
| 357 |
|
---|
| 358 | //
|
---|
| 359 | // choose one of points with probability P
|
---|
| 360 | // random number within (0,1) is generated and
|
---|
| 361 | // inverse empirical CDF is used to randomly choose a point.
|
---|
| 362 | //
|
---|
| 363 | s = 0;
|
---|
| 364 | v = AP.Math.RandomReal();
|
---|
| 365 | for(i=0; i<=npoints-1; i++)
|
---|
| 366 | {
|
---|
| 367 | s = s+p[i];
|
---|
| 368 | if( (double)(v)<=(double)(s) | i==npoints-1 )
|
---|
| 369 | {
|
---|
| 370 | for(i_=0; i_<=nvars-1;i_++)
|
---|
| 371 | {
|
---|
| 372 | centers[cc,i_] = xy[i,i_];
|
---|
| 373 | }
|
---|
| 374 | busycenters[cc] = true;
|
---|
| 375 | break;
|
---|
| 376 | }
|
---|
| 377 | }
|
---|
| 378 | }
|
---|
| 379 | }
|
---|
| 380 | return result;
|
---|
| 381 | }
|
---|
| 382 | }
|
---|
| 383 | }
|
---|