[2563] | 1 | /*************************************************************************
|
---|
| 2 | Copyright (c) 2008, Sergey Bochkanov (ALGLIB project).
|
---|
| 3 |
|
---|
| 4 | >>> SOURCE LICENSE >>>
|
---|
| 5 | This program is free software; you can redistribute it and/or modify
|
---|
| 6 | it under the terms of the GNU General Public License as published by
|
---|
| 7 | the Free Software Foundation (www.fsf.org); either version 2 of the
|
---|
| 8 | License, or (at your option) any later version.
|
---|
| 9 |
|
---|
| 10 | This program is distributed in the hope that it will be useful,
|
---|
| 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 13 | GNU General Public License for more details.
|
---|
| 14 |
|
---|
| 15 | A copy of the GNU General Public License is available at
|
---|
| 16 | http://www.fsf.org/licensing/licenses
|
---|
| 17 |
|
---|
| 18 | >>> END OF LICENSE >>>
|
---|
| 19 | *************************************************************************/
|
---|
| 20 |
|
---|
| 21 | using System;
|
---|
| 22 |
|
---|
| 23 | namespace alglib
|
---|
| 24 | {
|
---|
| 25 | public class kmeans
|
---|
| 26 | {
|
---|
| 27 | /*************************************************************************
|
---|
| 28 | k-means++ clusterization
|
---|
| 29 |
|
---|
| 30 | INPUT PARAMETERS:
|
---|
| 31 | XY - dataset, array [0..NPoints-1,0..NVars-1].
|
---|
| 32 | NPoints - dataset size, NPoints>=K
|
---|
| 33 | NVars - number of variables, NVars>=1
|
---|
| 34 | K - desired number of clusters, K>=1
|
---|
| 35 | Restarts - number of restarts, Restarts>=1
|
---|
| 36 |
|
---|
| 37 | OUTPUT PARAMETERS:
|
---|
| 38 | Info - return code:
|
---|
| 39 | * -3, if taskis degenerate (number of distinct points is
|
---|
| 40 | less than K)
|
---|
| 41 | * -1, if incorrect NPoints/NFeatures/K/Restarts was passed
|
---|
| 42 | * 1, if subroutine finished successfully
|
---|
| 43 | C - array[0..NVars-1,0..K-1].matrix whose columns store
|
---|
| 44 | cluster's centers
|
---|
| 45 | XYC - array which contains number of clusters dataset points
|
---|
| 46 | belong to.
|
---|
| 47 |
|
---|
| 48 | -- ALGLIB --
|
---|
| 49 | Copyright 21.03.2009 by Bochkanov Sergey
|
---|
| 50 | *************************************************************************/
|
---|
| 51 | public static void kmeansgenerate(ref double[,] xy,
|
---|
| 52 | int npoints,
|
---|
| 53 | int nvars,
|
---|
| 54 | int k,
|
---|
| 55 | int restarts,
|
---|
| 56 | ref int info,
|
---|
| 57 | ref double[,] c,
|
---|
| 58 | ref int[] xyc)
|
---|
| 59 | {
|
---|
| 60 | int i = 0;
|
---|
| 61 | int j = 0;
|
---|
| 62 | double[,] ct = new double[0,0];
|
---|
| 63 | double[,] ctbest = new double[0,0];
|
---|
| 64 | double e = 0;
|
---|
| 65 | double ebest = 0;
|
---|
| 66 | double[] x = new double[0];
|
---|
| 67 | double[] tmp = new double[0];
|
---|
| 68 | int cc = 0;
|
---|
| 69 | double[] d2 = new double[0];
|
---|
| 70 | double[] p = new double[0];
|
---|
| 71 | int[] csizes = new int[0];
|
---|
| 72 | bool[] cbusy = new bool[0];
|
---|
| 73 | double v = 0;
|
---|
| 74 | double s = 0;
|
---|
| 75 | int cclosest = 0;
|
---|
| 76 | double dclosest = 0;
|
---|
| 77 | double[] work = new double[0];
|
---|
| 78 | bool waschanges = new bool();
|
---|
| 79 | bool zerosizeclusters = new bool();
|
---|
| 80 | int pass = 0;
|
---|
| 81 | int i_ = 0;
|
---|
| 82 |
|
---|
| 83 |
|
---|
| 84 | //
|
---|
| 85 | // Test parameters
|
---|
| 86 | //
|
---|
| 87 | if( npoints<k | nvars<1 | k<1 | restarts<1 )
|
---|
| 88 | {
|
---|
| 89 | info = -1;
|
---|
| 90 | return;
|
---|
| 91 | }
|
---|
| 92 |
|
---|
| 93 | //
|
---|
| 94 | // TODO: special case K=1
|
---|
| 95 | // TODO: special case K=NPoints
|
---|
| 96 | //
|
---|
| 97 | info = 1;
|
---|
| 98 |
|
---|
| 99 | //
|
---|
| 100 | // Multiple passes of k-means++ algorithm
|
---|
| 101 | //
|
---|
| 102 | ct = new double[k-1+1, nvars-1+1];
|
---|
| 103 | ctbest = new double[k-1+1, nvars-1+1];
|
---|
| 104 | xyc = new int[npoints-1+1];
|
---|
| 105 | d2 = new double[npoints-1+1];
|
---|
| 106 | p = new double[npoints-1+1];
|
---|
| 107 | tmp = new double[nvars-1+1];
|
---|
| 108 | csizes = new int[k-1+1];
|
---|
| 109 | cbusy = new bool[k-1+1];
|
---|
| 110 | ebest = AP.Math.MaxRealNumber;
|
---|
| 111 | for(pass=1; pass<=restarts; pass++)
|
---|
| 112 | {
|
---|
| 113 |
|
---|
| 114 | //
|
---|
| 115 | // Select initial centers using k-means++ algorithm
|
---|
| 116 | // 1. Choose first center at random
|
---|
| 117 | // 2. Choose next centers using their distance from centers already chosen
|
---|
| 118 | //
|
---|
| 119 | // Note that for performance reasons centers are stored in ROWS of CT, not
|
---|
| 120 | // in columns. We'll transpose CT in the end and store it in the C.
|
---|
| 121 | //
|
---|
| 122 | i = AP.Math.RandomInteger(npoints);
|
---|
| 123 | for(i_=0; i_<=nvars-1;i_++)
|
---|
| 124 | {
|
---|
| 125 | ct[0,i_] = xy[i,i_];
|
---|
| 126 | }
|
---|
| 127 | cbusy[0] = true;
|
---|
| 128 | for(i=1; i<=k-1; i++)
|
---|
| 129 | {
|
---|
| 130 | cbusy[i] = false;
|
---|
| 131 | }
|
---|
| 132 | if( !selectcenterpp(ref xy, npoints, nvars, ref ct, cbusy, k, ref d2, ref p, ref tmp) )
|
---|
| 133 | {
|
---|
| 134 | info = -3;
|
---|
| 135 | return;
|
---|
| 136 | }
|
---|
| 137 |
|
---|
| 138 | //
|
---|
| 139 | // Update centers:
|
---|
| 140 | // 2. update center positions
|
---|
| 141 | //
|
---|
| 142 | while( true )
|
---|
| 143 | {
|
---|
| 144 |
|
---|
| 145 | //
|
---|
| 146 | // fill XYC with center numbers
|
---|
| 147 | //
|
---|
| 148 | waschanges = false;
|
---|
| 149 | for(i=0; i<=npoints-1; i++)
|
---|
| 150 | {
|
---|
| 151 | cclosest = -1;
|
---|
| 152 | dclosest = AP.Math.MaxRealNumber;
|
---|
| 153 | for(j=0; j<=k-1; j++)
|
---|
| 154 | {
|
---|
| 155 | for(i_=0; i_<=nvars-1;i_++)
|
---|
| 156 | {
|
---|
| 157 | tmp[i_] = xy[i,i_];
|
---|
| 158 | }
|
---|
| 159 | for(i_=0; i_<=nvars-1;i_++)
|
---|
| 160 | {
|
---|
| 161 | tmp[i_] = tmp[i_] - ct[j,i_];
|
---|
| 162 | }
|
---|
| 163 | v = 0.0;
|
---|
| 164 | for(i_=0; i_<=nvars-1;i_++)
|
---|
| 165 | {
|
---|
| 166 | v += tmp[i_]*tmp[i_];
|
---|
| 167 | }
|
---|
| 168 | if( (double)(v)<(double)(dclosest) )
|
---|
| 169 | {
|
---|
| 170 | cclosest = j;
|
---|
| 171 | dclosest = v;
|
---|
| 172 | }
|
---|
| 173 | }
|
---|
| 174 | if( xyc[i]!=cclosest )
|
---|
| 175 | {
|
---|
| 176 | waschanges = true;
|
---|
| 177 | }
|
---|
| 178 | xyc[i] = cclosest;
|
---|
| 179 | }
|
---|
| 180 |
|
---|
| 181 | //
|
---|
| 182 | // Update centers
|
---|
| 183 | //
|
---|
| 184 | for(j=0; j<=k-1; j++)
|
---|
| 185 | {
|
---|
| 186 | csizes[j] = 0;
|
---|
| 187 | }
|
---|
| 188 | for(i=0; i<=k-1; i++)
|
---|
| 189 | {
|
---|
| 190 | for(j=0; j<=nvars-1; j++)
|
---|
| 191 | {
|
---|
| 192 | ct[i,j] = 0;
|
---|
| 193 | }
|
---|
| 194 | }
|
---|
| 195 | for(i=0; i<=npoints-1; i++)
|
---|
| 196 | {
|
---|
| 197 | csizes[xyc[i]] = csizes[xyc[i]]+1;
|
---|
| 198 | for(i_=0; i_<=nvars-1;i_++)
|
---|
| 199 | {
|
---|
| 200 | ct[xyc[i],i_] = ct[xyc[i],i_] + xy[i,i_];
|
---|
| 201 | }
|
---|
| 202 | }
|
---|
| 203 | zerosizeclusters = false;
|
---|
| 204 | for(i=0; i<=k-1; i++)
|
---|
| 205 | {
|
---|
| 206 | cbusy[i] = csizes[i]!=0;
|
---|
| 207 | zerosizeclusters = zerosizeclusters | csizes[i]==0;
|
---|
| 208 | }
|
---|
| 209 | if( zerosizeclusters )
|
---|
| 210 | {
|
---|
| 211 |
|
---|
| 212 | //
|
---|
| 213 | // Some clusters have zero size - rare, but possible.
|
---|
| 214 | // We'll choose new centers for such clusters using k-means++ rule
|
---|
| 215 | // and restart algorithm
|
---|
| 216 | //
|
---|
| 217 | if( !selectcenterpp(ref xy, npoints, nvars, ref ct, cbusy, k, ref d2, ref p, ref tmp) )
|
---|
| 218 | {
|
---|
| 219 | info = -3;
|
---|
| 220 | return;
|
---|
| 221 | }
|
---|
| 222 | continue;
|
---|
| 223 | }
|
---|
| 224 | for(j=0; j<=k-1; j++)
|
---|
| 225 | {
|
---|
| 226 | v = (double)(1)/(double)(csizes[j]);
|
---|
| 227 | for(i_=0; i_<=nvars-1;i_++)
|
---|
| 228 | {
|
---|
| 229 | ct[j,i_] = v*ct[j,i_];
|
---|
| 230 | }
|
---|
| 231 | }
|
---|
| 232 |
|
---|
| 233 | //
|
---|
| 234 | // if nothing has changed during iteration
|
---|
| 235 | //
|
---|
| 236 | if( !waschanges )
|
---|
| 237 | {
|
---|
| 238 | break;
|
---|
| 239 | }
|
---|
| 240 | }
|
---|
| 241 |
|
---|
| 242 | //
|
---|
| 243 | // 3. Calculate E, compare with best centers found so far
|
---|
| 244 | //
|
---|
| 245 | e = 0;
|
---|
| 246 | for(i=0; i<=npoints-1; i++)
|
---|
| 247 | {
|
---|
| 248 | for(i_=0; i_<=nvars-1;i_++)
|
---|
| 249 | {
|
---|
| 250 | tmp[i_] = xy[i,i_];
|
---|
| 251 | }
|
---|
| 252 | for(i_=0; i_<=nvars-1;i_++)
|
---|
| 253 | {
|
---|
| 254 | tmp[i_] = tmp[i_] - ct[xyc[i],i_];
|
---|
| 255 | }
|
---|
| 256 | v = 0.0;
|
---|
| 257 | for(i_=0; i_<=nvars-1;i_++)
|
---|
| 258 | {
|
---|
| 259 | v += tmp[i_]*tmp[i_];
|
---|
| 260 | }
|
---|
| 261 | e = e+v;
|
---|
| 262 | }
|
---|
| 263 | if( (double)(e)<(double)(ebest) )
|
---|
| 264 | {
|
---|
| 265 |
|
---|
| 266 | //
|
---|
| 267 | // store partition
|
---|
| 268 | //
|
---|
| 269 | blas.copymatrix(ref ct, 0, k-1, 0, nvars-1, ref ctbest, 0, k-1, 0, nvars-1);
|
---|
| 270 | }
|
---|
| 271 | }
|
---|
| 272 |
|
---|
| 273 | //
|
---|
| 274 | // Copy and transpose
|
---|
| 275 | //
|
---|
| 276 | c = new double[nvars-1+1, k-1+1];
|
---|
| 277 | blas.copyandtranspose(ref ctbest, 0, k-1, 0, nvars-1, ref c, 0, nvars-1, 0, k-1);
|
---|
| 278 | }
|
---|
| 279 |
|
---|
| 280 |
|
---|
| 281 | /*************************************************************************
|
---|
| 282 | Select center for a new cluster using k-means++ rule
|
---|
| 283 | *************************************************************************/
|
---|
| 284 | private static bool selectcenterpp(ref double[,] xy,
|
---|
| 285 | int npoints,
|
---|
| 286 | int nvars,
|
---|
| 287 | ref double[,] centers,
|
---|
| 288 | bool[] busycenters,
|
---|
| 289 | int ccnt,
|
---|
| 290 | ref double[] d2,
|
---|
| 291 | ref double[] p,
|
---|
| 292 | ref double[] tmp)
|
---|
| 293 | {
|
---|
| 294 | bool result = new bool();
|
---|
| 295 | int i = 0;
|
---|
| 296 | int j = 0;
|
---|
| 297 | int cc = 0;
|
---|
| 298 | double v = 0;
|
---|
| 299 | double s = 0;
|
---|
| 300 | int i_ = 0;
|
---|
| 301 |
|
---|
| 302 | busycenters = (bool[])busycenters.Clone();
|
---|
| 303 |
|
---|
| 304 | result = true;
|
---|
| 305 | for(cc=0; cc<=ccnt-1; cc++)
|
---|
| 306 | {
|
---|
| 307 | if( !busycenters[cc] )
|
---|
| 308 | {
|
---|
| 309 |
|
---|
| 310 | //
|
---|
| 311 | // fill D2
|
---|
| 312 | //
|
---|
| 313 | for(i=0; i<=npoints-1; i++)
|
---|
| 314 | {
|
---|
| 315 | d2[i] = AP.Math.MaxRealNumber;
|
---|
| 316 | for(j=0; j<=ccnt-1; j++)
|
---|
| 317 | {
|
---|
| 318 | if( busycenters[j] )
|
---|
| 319 | {
|
---|
| 320 | for(i_=0; i_<=nvars-1;i_++)
|
---|
| 321 | {
|
---|
| 322 | tmp[i_] = xy[i,i_];
|
---|
| 323 | }
|
---|
| 324 | for(i_=0; i_<=nvars-1;i_++)
|
---|
| 325 | {
|
---|
| 326 | tmp[i_] = tmp[i_] - centers[j,i_];
|
---|
| 327 | }
|
---|
| 328 | v = 0.0;
|
---|
| 329 | for(i_=0; i_<=nvars-1;i_++)
|
---|
| 330 | {
|
---|
| 331 | v += tmp[i_]*tmp[i_];
|
---|
| 332 | }
|
---|
| 333 | if( (double)(v)<(double)(d2[i]) )
|
---|
| 334 | {
|
---|
| 335 | d2[i] = v;
|
---|
| 336 | }
|
---|
| 337 | }
|
---|
| 338 | }
|
---|
| 339 | }
|
---|
| 340 |
|
---|
| 341 | //
|
---|
| 342 | // calculate P (non-cumulative)
|
---|
| 343 | //
|
---|
| 344 | s = 0;
|
---|
| 345 | for(i=0; i<=npoints-1; i++)
|
---|
| 346 | {
|
---|
| 347 | s = s+d2[i];
|
---|
| 348 | }
|
---|
| 349 | if( (double)(s)==(double)(0) )
|
---|
| 350 | {
|
---|
| 351 | result = false;
|
---|
| 352 | return result;
|
---|
| 353 | }
|
---|
| 354 | s = 1/s;
|
---|
| 355 | for(i_=0; i_<=npoints-1;i_++)
|
---|
| 356 | {
|
---|
| 357 | p[i_] = s*d2[i_];
|
---|
| 358 | }
|
---|
| 359 |
|
---|
| 360 | //
|
---|
| 361 | // choose one of points with probability P
|
---|
| 362 | // random number within (0,1) is generated and
|
---|
| 363 | // inverse empirical CDF is used to randomly choose a point.
|
---|
| 364 | //
|
---|
| 365 | s = 0;
|
---|
| 366 | v = AP.Math.RandomReal();
|
---|
| 367 | for(i=0; i<=npoints-1; i++)
|
---|
| 368 | {
|
---|
| 369 | s = s+p[i];
|
---|
| 370 | if( (double)(v)<=(double)(s) | i==npoints-1 )
|
---|
| 371 | {
|
---|
| 372 | for(i_=0; i_<=nvars-1;i_++)
|
---|
| 373 | {
|
---|
| 374 | centers[cc,i_] = xy[i,i_];
|
---|
| 375 | }
|
---|
| 376 | busycenters[cc] = true;
|
---|
| 377 | break;
|
---|
| 378 | }
|
---|
| 379 | }
|
---|
| 380 | }
|
---|
| 381 | }
|
---|
| 382 | return result;
|
---|
| 383 | }
|
---|
| 384 | }
|
---|
| 385 | }
|
---|