Changeset 4068 for trunk/sources/HeuristicLab.ExtLibs/HeuristicLab.ALGLIB/2.5.0/ALGLIB-2.5.0/kmeans.cs
- Timestamp:
- 07/22/10 00:44:01 (14 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/sources/HeuristicLab.ExtLibs/HeuristicLab.ALGLIB/2.5.0/ALGLIB-2.5.0/kmeans.cs
r3839 r4068 19 19 *************************************************************************/ 20 20 21 using System; 22 23 namespace alglib 24 { 25 public class kmeans 26 { 27 /************************************************************************* 28 k-means++ clusterization 29 30 INPUT PARAMETERS: 31 XY - dataset, array [0..NPoints-1,0..NVars-1]. 32 NPoints - dataset size, NPoints>=K 33 NVars - number of variables, NVars>=1 34 K - desired number of clusters, K>=1 35 Restarts - number of restarts, Restarts>=1 36 37 OUTPUT PARAMETERS: 38 Info - return code: 39 * -3, if taskis degenerate (number of distinct points is 40 less than K) 41 * -1, if incorrect NPoints/NFeatures/K/Restarts was passed 42 * 1, if subroutine finished successfully 43 C - array[0..NVars-1,0..K-1].matrix whose columns store 44 cluster's centers 45 XYC - array which contains number of clusters dataset points 46 belong to. 47 48 -- ALGLIB -- 49 Copyright 21.03.2009 by Bochkanov Sergey 50 *************************************************************************/ 51 public static void kmeansgenerate(ref double[,] xy, 52 int npoints, 53 int nvars, 54 int k, 55 int restarts, 56 ref int info, 57 ref double[,] c, 58 ref int[] xyc) 59 { 60 int i = 0; 61 int j = 0; 62 double[,] ct = new double[0,0]; 63 double[,] ctbest = new double[0,0]; 64 double e = 0; 65 double ebest = 0; 66 double[] x = new double[0]; 67 double[] tmp = new double[0]; 68 double[] d2 = new double[0]; 69 double[] p = new double[0]; 70 int[] csizes = new int[0]; 71 bool[] cbusy = new bool[0]; 72 double v = 0; 73 int cclosest = 0; 74 double dclosest = 0; 75 double[] work = new double[0]; 76 bool waschanges = new bool(); 77 bool zerosizeclusters = new bool(); 78 int pass = 0; 79 int i_ = 0; 80 81 21 22 namespace alglib { 23 public class kmeans { 24 /************************************************************************* 25 k-means++ clusterization 26 27 INPUT PARAMETERS: 28 XY - dataset, array [0..NPoints-1,0..NVars-1]. 29 NPoints - dataset size, NPoints>=K 30 NVars - number of variables, NVars>=1 31 K - desired number of clusters, K>=1 32 Restarts - number of restarts, Restarts>=1 33 34 OUTPUT PARAMETERS: 35 Info - return code: 36 * -3, if taskis degenerate (number of distinct points is 37 less than K) 38 * -1, if incorrect NPoints/NFeatures/K/Restarts was passed 39 * 1, if subroutine finished successfully 40 C - array[0..NVars-1,0..K-1].matrix whose columns store 41 cluster's centers 42 XYC - array which contains number of clusters dataset points 43 belong to. 44 45 -- ALGLIB -- 46 Copyright 21.03.2009 by Bochkanov Sergey 47 *************************************************************************/ 48 public static void kmeansgenerate(ref double[,] xy, 49 int npoints, 50 int nvars, 51 int k, 52 int restarts, 53 ref int info, 54 ref double[,] c, 55 ref int[] xyc) { 56 int i = 0; 57 int j = 0; 58 double[,] ct = new double[0, 0]; 59 double[,] ctbest = new double[0, 0]; 60 double e = 0; 61 double ebest = 0; 62 double[] x = new double[0]; 63 double[] tmp = new double[0]; 64 double[] d2 = new double[0]; 65 double[] p = new double[0]; 66 int[] csizes = new int[0]; 67 bool[] cbusy = new bool[0]; 68 double v = 0; 69 int cclosest = 0; 70 double dclosest = 0; 71 double[] work = new double[0]; 72 bool waschanges = new bool(); 73 bool zerosizeclusters = new bool(); 74 int pass = 0; 75 int i_ = 0; 76 77 78 // 79 // Test parameters 80 // 81 if (npoints < k | nvars < 1 | k < 1 | restarts < 1) { 82 info = -1; 83 return; 84 } 85 86 // 87 // TODO: special case K=1 88 // TODO: special case K=NPoints 89 // 90 info = 1; 91 92 // 93 // Multiple passes of k-means++ algorithm 94 // 95 ct = new double[k - 1 + 1, nvars - 1 + 1]; 96 ctbest = new double[k - 1 + 1, nvars - 1 + 1]; 97 xyc = new int[npoints - 1 + 1]; 98 d2 = new double[npoints - 1 + 1]; 99 p = new double[npoints - 1 + 1]; 100 tmp = new double[nvars - 1 + 1]; 101 csizes = new int[k - 1 + 1]; 102 cbusy = new bool[k - 1 + 1]; 103 ebest = AP.Math.MaxRealNumber; 104 for (pass = 1; pass <= restarts; pass++) { 105 106 // 107 // Select initial centers using k-means++ algorithm 108 // 1. Choose first center at random 109 // 2. Choose next centers using their distance from centers already chosen 110 // 111 // Note that for performance reasons centers are stored in ROWS of CT, not 112 // in columns. We'll transpose CT in the end and store it in the C. 113 // 114 i = AP.Math.RandomInteger(npoints); 115 for (i_ = 0; i_ <= nvars - 1; i_++) { 116 ct[0, i_] = xy[i, i_]; 117 } 118 cbusy[0] = true; 119 for (i = 1; i <= k - 1; i++) { 120 cbusy[i] = false; 121 } 122 if (!selectcenterpp(ref xy, npoints, nvars, ref ct, cbusy, k, ref d2, ref p, ref tmp)) { 123 info = -3; 124 return; 125 } 126 127 // 128 // Update centers: 129 // 2. update center positions 130 // 131 while (true) { 132 133 // 134 // fill XYC with center numbers 135 // 136 waschanges = false; 137 for (i = 0; i <= npoints - 1; i++) { 138 cclosest = -1; 139 dclosest = AP.Math.MaxRealNumber; 140 for (j = 0; j <= k - 1; j++) { 141 for (i_ = 0; i_ <= nvars - 1; i_++) { 142 tmp[i_] = xy[i, i_]; 143 } 144 for (i_ = 0; i_ <= nvars - 1; i_++) { 145 tmp[i_] = tmp[i_] - ct[j, i_]; 146 } 147 v = 0.0; 148 for (i_ = 0; i_ <= nvars - 1; i_++) { 149 v += tmp[i_] * tmp[i_]; 150 } 151 if ((double)(v) < (double)(dclosest)) { 152 cclosest = j; 153 dclosest = v; 154 } 155 } 156 if (xyc[i] != cclosest) { 157 waschanges = true; 158 } 159 xyc[i] = cclosest; 160 } 161 162 // 163 // Update centers 164 // 165 for (j = 0; j <= k - 1; j++) { 166 csizes[j] = 0; 167 } 168 for (i = 0; i <= k - 1; i++) { 169 for (j = 0; j <= nvars - 1; j++) { 170 ct[i, j] = 0; 171 } 172 } 173 for (i = 0; i <= npoints - 1; i++) { 174 csizes[xyc[i]] = csizes[xyc[i]] + 1; 175 for (i_ = 0; i_ <= nvars - 1; i_++) { 176 ct[xyc[i], i_] = ct[xyc[i], i_] + xy[i, i_]; 177 } 178 } 179 zerosizeclusters = false; 180 for (i = 0; i <= k - 1; i++) { 181 cbusy[i] = csizes[i] != 0; 182 zerosizeclusters = zerosizeclusters | csizes[i] == 0; 183 } 184 if (zerosizeclusters) { 185 82 186 // 83 // Test parameters 187 // Some clusters have zero size - rare, but possible. 188 // We'll choose new centers for such clusters using k-means++ rule 189 // and restart algorithm 84 190 // 85 if( npoints<k | nvars<1 | k<1 | restarts<1 ) 86 { 87 info = -1; 88 return; 89 } 90 91 // 92 // TODO: special case K=1 93 // TODO: special case K=NPoints 94 // 95 info = 1; 96 97 // 98 // Multiple passes of k-means++ algorithm 99 // 100 ct = new double[k-1+1, nvars-1+1]; 101 ctbest = new double[k-1+1, nvars-1+1]; 102 xyc = new int[npoints-1+1]; 103 d2 = new double[npoints-1+1]; 104 p = new double[npoints-1+1]; 105 tmp = new double[nvars-1+1]; 106 csizes = new int[k-1+1]; 107 cbusy = new bool[k-1+1]; 108 ebest = AP.Math.MaxRealNumber; 109 for(pass=1; pass<=restarts; pass++) 110 { 111 112 // 113 // Select initial centers using k-means++ algorithm 114 // 1. Choose first center at random 115 // 2. Choose next centers using their distance from centers already chosen 116 // 117 // Note that for performance reasons centers are stored in ROWS of CT, not 118 // in columns. We'll transpose CT in the end and store it in the C. 119 // 120 i = AP.Math.RandomInteger(npoints); 121 for(i_=0; i_<=nvars-1;i_++) 122 { 123 ct[0,i_] = xy[i,i_]; 191 if (!selectcenterpp(ref xy, npoints, nvars, ref ct, cbusy, k, ref d2, ref p, ref tmp)) { 192 info = -3; 193 return; 194 } 195 continue; 196 } 197 for (j = 0; j <= k - 1; j++) { 198 v = (double)(1) / (double)(csizes[j]); 199 for (i_ = 0; i_ <= nvars - 1; i_++) { 200 ct[j, i_] = v * ct[j, i_]; 201 } 202 } 203 204 // 205 // if nothing has changed during iteration 206 // 207 if (!waschanges) { 208 break; 209 } 210 } 211 212 // 213 // 3. Calculate E, compare with best centers found so far 214 // 215 e = 0; 216 for (i = 0; i <= npoints - 1; i++) { 217 for (i_ = 0; i_ <= nvars - 1; i_++) { 218 tmp[i_] = xy[i, i_]; 219 } 220 for (i_ = 0; i_ <= nvars - 1; i_++) { 221 tmp[i_] = tmp[i_] - ct[xyc[i], i_]; 222 } 223 v = 0.0; 224 for (i_ = 0; i_ <= nvars - 1; i_++) { 225 v += tmp[i_] * tmp[i_]; 226 } 227 e = e + v; 228 } 229 if ((double)(e) < (double)(ebest)) { 230 231 // 232 // store partition 233 // 234 blas.copymatrix(ref ct, 0, k - 1, 0, nvars - 1, ref ctbest, 0, k - 1, 0, nvars - 1); 235 } 236 } 237 238 // 239 // Copy and transpose 240 // 241 c = new double[nvars - 1 + 1, k - 1 + 1]; 242 blas.copyandtranspose(ref ctbest, 0, k - 1, 0, nvars - 1, ref c, 0, nvars - 1, 0, k - 1); 243 } 244 245 246 /************************************************************************* 247 Select center for a new cluster using k-means++ rule 248 *************************************************************************/ 249 private static bool selectcenterpp(ref double[,] xy, 250 int npoints, 251 int nvars, 252 ref double[,] centers, 253 bool[] busycenters, 254 int ccnt, 255 ref double[] d2, 256 ref double[] p, 257 ref double[] tmp) { 258 bool result = new bool(); 259 int i = 0; 260 int j = 0; 261 int cc = 0; 262 double v = 0; 263 double s = 0; 264 int i_ = 0; 265 266 busycenters = (bool[])busycenters.Clone(); 267 268 result = true; 269 for (cc = 0; cc <= ccnt - 1; cc++) { 270 if (!busycenters[cc]) { 271 272 // 273 // fill D2 274 // 275 for (i = 0; i <= npoints - 1; i++) { 276 d2[i] = AP.Math.MaxRealNumber; 277 for (j = 0; j <= ccnt - 1; j++) { 278 if (busycenters[j]) { 279 for (i_ = 0; i_ <= nvars - 1; i_++) { 280 tmp[i_] = xy[i, i_]; 124 281 } 125 cbusy[0] = true; 126 for(i=1; i<=k-1; i++) 127 { 128 cbusy[i] = false; 282 for (i_ = 0; i_ <= nvars - 1; i_++) { 283 tmp[i_] = tmp[i_] - centers[j, i_]; 129 284 } 130 if( !selectcenterpp(ref xy, npoints, nvars, ref ct, cbusy, k, ref d2, ref p, ref tmp) ) 131 { 132 info = -3; 133 return; 285 v = 0.0; 286 for (i_ = 0; i_ <= nvars - 1; i_++) { 287 v += tmp[i_] * tmp[i_]; 134 288 } 135 136 // 137 // Update centers: 138 // 2. update center positions 139 // 140 while( true ) 141 { 142 143 // 144 // fill XYC with center numbers 145 // 146 waschanges = false; 147 for(i=0; i<=npoints-1; i++) 148 { 149 cclosest = -1; 150 dclosest = AP.Math.MaxRealNumber; 151 for(j=0; j<=k-1; j++) 152 { 153 for(i_=0; i_<=nvars-1;i_++) 154 { 155 tmp[i_] = xy[i,i_]; 156 } 157 for(i_=0; i_<=nvars-1;i_++) 158 { 159 tmp[i_] = tmp[i_] - ct[j,i_]; 160 } 161 v = 0.0; 162 for(i_=0; i_<=nvars-1;i_++) 163 { 164 v += tmp[i_]*tmp[i_]; 165 } 166 if( (double)(v)<(double)(dclosest) ) 167 { 168 cclosest = j; 169 dclosest = v; 170 } 171 } 172 if( xyc[i]!=cclosest ) 173 { 174 waschanges = true; 175 } 176 xyc[i] = cclosest; 177 } 178 179 // 180 // Update centers 181 // 182 for(j=0; j<=k-1; j++) 183 { 184 csizes[j] = 0; 185 } 186 for(i=0; i<=k-1; i++) 187 { 188 for(j=0; j<=nvars-1; j++) 189 { 190 ct[i,j] = 0; 191 } 192 } 193 for(i=0; i<=npoints-1; i++) 194 { 195 csizes[xyc[i]] = csizes[xyc[i]]+1; 196 for(i_=0; i_<=nvars-1;i_++) 197 { 198 ct[xyc[i],i_] = ct[xyc[i],i_] + xy[i,i_]; 199 } 200 } 201 zerosizeclusters = false; 202 for(i=0; i<=k-1; i++) 203 { 204 cbusy[i] = csizes[i]!=0; 205 zerosizeclusters = zerosizeclusters | csizes[i]==0; 206 } 207 if( zerosizeclusters ) 208 { 209 210 // 211 // Some clusters have zero size - rare, but possible. 212 // We'll choose new centers for such clusters using k-means++ rule 213 // and restart algorithm 214 // 215 if( !selectcenterpp(ref xy, npoints, nvars, ref ct, cbusy, k, ref d2, ref p, ref tmp) ) 216 { 217 info = -3; 218 return; 219 } 220 continue; 221 } 222 for(j=0; j<=k-1; j++) 223 { 224 v = (double)(1)/(double)(csizes[j]); 225 for(i_=0; i_<=nvars-1;i_++) 226 { 227 ct[j,i_] = v*ct[j,i_]; 228 } 229 } 230 231 // 232 // if nothing has changed during iteration 233 // 234 if( !waschanges ) 235 { 236 break; 237 } 289 if ((double)(v) < (double)(d2[i])) { 290 d2[i] = v; 238 291 } 239 240 // 241 // 3. Calculate E, compare with best centers found so far 242 // 243 e = 0; 244 for(i=0; i<=npoints-1; i++) 245 { 246 for(i_=0; i_<=nvars-1;i_++) 247 { 248 tmp[i_] = xy[i,i_]; 249 } 250 for(i_=0; i_<=nvars-1;i_++) 251 { 252 tmp[i_] = tmp[i_] - ct[xyc[i],i_]; 253 } 254 v = 0.0; 255 for(i_=0; i_<=nvars-1;i_++) 256 { 257 v += tmp[i_]*tmp[i_]; 258 } 259 e = e+v; 260 } 261 if( (double)(e)<(double)(ebest) ) 262 { 263 264 // 265 // store partition 266 // 267 blas.copymatrix(ref ct, 0, k-1, 0, nvars-1, ref ctbest, 0, k-1, 0, nvars-1); 268 } 269 } 270 271 // 272 // Copy and transpose 273 // 274 c = new double[nvars-1+1, k-1+1]; 275 blas.copyandtranspose(ref ctbest, 0, k-1, 0, nvars-1, ref c, 0, nvars-1, 0, k-1); 276 } 277 278 279 /************************************************************************* 280 Select center for a new cluster using k-means++ rule 281 *************************************************************************/ 282 private static bool selectcenterpp(ref double[,] xy, 283 int npoints, 284 int nvars, 285 ref double[,] centers, 286 bool[] busycenters, 287 int ccnt, 288 ref double[] d2, 289 ref double[] p, 290 ref double[] tmp) 291 { 292 bool result = new bool(); 293 int i = 0; 294 int j = 0; 295 int cc = 0; 296 double v = 0; 297 double s = 0; 298 int i_ = 0; 299 300 busycenters = (bool[])busycenters.Clone(); 301 302 result = true; 303 for(cc=0; cc<=ccnt-1; cc++) 304 { 305 if( !busycenters[cc] ) 306 { 307 308 // 309 // fill D2 310 // 311 for(i=0; i<=npoints-1; i++) 312 { 313 d2[i] = AP.Math.MaxRealNumber; 314 for(j=0; j<=ccnt-1; j++) 315 { 316 if( busycenters[j] ) 317 { 318 for(i_=0; i_<=nvars-1;i_++) 319 { 320 tmp[i_] = xy[i,i_]; 321 } 322 for(i_=0; i_<=nvars-1;i_++) 323 { 324 tmp[i_] = tmp[i_] - centers[j,i_]; 325 } 326 v = 0.0; 327 for(i_=0; i_<=nvars-1;i_++) 328 { 329 v += tmp[i_]*tmp[i_]; 330 } 331 if( (double)(v)<(double)(d2[i]) ) 332 { 333 d2[i] = v; 334 } 335 } 336 } 337 } 338 339 // 340 // calculate P (non-cumulative) 341 // 342 s = 0; 343 for(i=0; i<=npoints-1; i++) 344 { 345 s = s+d2[i]; 346 } 347 if( (double)(s)==(double)(0) ) 348 { 349 result = false; 350 return result; 351 } 352 s = 1/s; 353 for(i_=0; i_<=npoints-1;i_++) 354 { 355 p[i_] = s*d2[i_]; 356 } 357 358 // 359 // choose one of points with probability P 360 // random number within (0,1) is generated and 361 // inverse empirical CDF is used to randomly choose a point. 362 // 363 s = 0; 364 v = AP.Math.RandomReal(); 365 for(i=0; i<=npoints-1; i++) 366 { 367 s = s+p[i]; 368 if( (double)(v)<=(double)(s) | i==npoints-1 ) 369 { 370 for(i_=0; i_<=nvars-1;i_++) 371 { 372 centers[cc,i_] = xy[i,i_]; 373 } 374 busycenters[cc] = true; 375 break; 376 } 377 } 378 } 379 } 292 } 293 } 294 } 295 296 // 297 // calculate P (non-cumulative) 298 // 299 s = 0; 300 for (i = 0; i <= npoints - 1; i++) { 301 s = s + d2[i]; 302 } 303 if ((double)(s) == (double)(0)) { 304 result = false; 380 305 return result; 381 } 306 } 307 s = 1 / s; 308 for (i_ = 0; i_ <= npoints - 1; i_++) { 309 p[i_] = s * d2[i_]; 310 } 311 312 // 313 // choose one of points with probability P 314 // random number within (0,1) is generated and 315 // inverse empirical CDF is used to randomly choose a point. 316 // 317 s = 0; 318 v = AP.Math.RandomReal(); 319 for (i = 0; i <= npoints - 1; i++) { 320 s = s + p[i]; 321 if ((double)(v) <= (double)(s) | i == npoints - 1) { 322 for (i_ = 0; i_ <= nvars - 1; i_++) { 323 centers[cc, i_] = xy[i, i_]; 324 } 325 busycenters[cc] = true; 326 break; 327 } 328 } 329 } 330 } 331 return result; 382 332 } 333 } 383 334 }
Note: See TracChangeset
for help on using the changeset viewer.