Loading [MathJax]/extensions/TeX/AMSsymbols.js
LALInference 4.1.9.1-5e288d3
All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Modules Pages
LALInferenceClusteredKDE.c
Go to the documentation of this file.
1/*
2 * LALInferenceClusteredKDE.c: Clustering-optimized Gaussian
3 * kernel density estimator.
4 *
5 * Copyright (C) 2013 Ben Farr
6 *
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; see the file COPYING. If not, write to the
20 * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
21 * MA 02110-1301 USA
22 */
23
24#include <stdlib.h>
25#include <stdio.h>
26#include <math.h>
27#include <float.h>
28
29#include <gsl/gsl_randist.h>
30#include <gsl/gsl_sort_vector_double.h>
31#include <gsl/gsl_vector.h>
32#include <gsl/gsl_matrix.h>
33#include <gsl/gsl_linalg.h>
34#include <gsl/gsl_blas.h>
35
36#include <lal/LALStdlib.h>
37#include <lal/LALConstants.h>
38#include <lal/LALDatatypes.h>
39
40#include <lal/LALInference.h>
41#include <lal/LALInferencePrior.h>
42#include <lal/LALInferenceKDE.h>
43#include <lal/LALInferenceClusteredKDE.h>
44
45#ifndef _OPENMP
46#define omp ignore
47#endif
48
49
50
51/**
52 * Kmeans cluster data, increasing k until the BIC stops increasing.
53 *
54 * Run the kmeans clustering algorithm incrementally, calculating the Bayes
55 * Information Criteria (BIC) at each step, stopping when the BIC stops
56 * increasing. The BIC is calculated by estimating the distribution in each
57 * cluster with a Gaussian kernel density estimate, then weighting that
58 * cluster's contribution to the likelihood by the fraction of total samples in
59 * that cluster.
60 * @param[in] data A GSL matrix containing the data to be clustered.
61 * @param[in] ntrials Number of kmeans attempts at fixed k to find optimal BIC.
62 * @param[in] rng A GSL random number generator used by the kmeans algorithm.
63 * @result A kmeans structure containing the clustering that maximizes the BIC.
64 */
/* NOTE(review): listing line 65 (return type, function name, and first
 * parameter) was dropped by the HTML extraction -- presumably something like
 * "LALInferenceKmeans *LALInferenceIncrementalKmeans(gsl_matrix *data,".
 * Confirm against the original source file. */
66 INT4 ntrials,
67 gsl_rng *rng) {
68 INT4 k = 1;
69 REAL8 best_bic = -INFINITY;
70 REAL8 bic;
71
72 LALInferenceKmeans *kmeans;
73 LALInferenceKmeans *best_clustering = NULL;
74
75 /* Start at k=1 and increase k until the BIC stops increasing. */
76 while (1) {
77 kmeans = LALInferenceKmeansRunBestOf(k, data, ntrials, rng);
78
79 /* If kmeans creation failed, the cluster size was likely too small */
80 if (kmeans)
81 bic = LALInferenceKmeansBIC(kmeans);
82 else
83 bic = -INFINITY;
84
85 /* Keep going until the BIC decreases */
86 if (bic > best_bic) {
87 if (best_clustering)
88 LALInferenceKmeansDestroy(best_clustering);
89 best_clustering = kmeans;
90 best_bic = bic;
91 } else {
/* NOTE(review): listing line 92 is missing here -- most likely
 * "LALInferenceKmeansDestroy(kmeans);" so the rejected clustering is
 * freed before breaking out of the loop. Confirm against the original. */
93 break;
94 }
95 k++;
96 }
97
98 return best_clustering;
99}
100
101
102/**
103 * Kmeans cluster data, maximizing BIC assuming BIC(k) is concave-down.
104 *
105 * Run the kmeans clustering algorithm, searching for the k that maximizes the
106 * Bayes Information Criteria (BIC). The BIC is calculated by estimating the
107 * distribution in each cluster with a Gaussian kernel density estimate, then
108 * weighting that cluster's contribution to the likelihood by the fraction of
109 * total samples in that cluster.
110 * @param[in] data A GSL matrix containing the data to be clustered.
111 * @param[in] ntrials Number of kmeans attempts at fixed k to find optimal BIC.
112 * @param[in] rng A GSL random number generator used by the kmeans algorithm.
113 * @result A kmeans structure containing the clustering that maximizes the BIC.
114 */
/* NOTE(review): listing line 115 (return type, function name, and first
 * parameter) was dropped by the extraction -- presumably
 * "LALInferenceKmeans *LALInferenceOptimizedKmeans(gsl_matrix *data,".
 * Confirm against the original source. */
116 INT4 ntrials,
117 gsl_rng *rng) {
118 INT4 k, low_k = 1, mid_k = 2, high_k = 4;
119 REAL8 bic, low_bic, mid_bic, high_bic;
120 LALInferenceKmeans *low_kmeans = NULL;
121 LALInferenceKmeans *mid_kmeans = NULL;
122 LALInferenceKmeans *high_kmeans = NULL;
123
124 /* Calculate starting clusters and BICs */
125 low_kmeans = LALInferenceKmeansRunBestOf(low_k, data, ntrials, rng);
126 if (!low_kmeans) {
127 fprintf(stderr, "Can't build basic KDE.");
128 fprintf(stderr, " Perhaps sample size (%i) is too small.\n",
129 (INT4)data->size1);
/* mid_kmeans/high_kmeans are still NULL here, so these guards are no-ops;
 * kept for safety symmetry with the cleanup at the end. */
130 if (mid_kmeans) LALInferenceKmeansDestroy(mid_kmeans);
131 if (high_kmeans) LALInferenceKmeansDestroy(high_kmeans);
132 return NULL;
133 }
134 low_bic = LALInferenceKmeansBIC(low_kmeans);
135
136 mid_kmeans = LALInferenceKmeansRunBestOf(mid_k, data, ntrials, rng);
137
138 /* Return vanilla KDE if clustering with k > 1 fails */
139 if (!mid_kmeans)
140 return low_kmeans;
141
142 mid_bic = LALInferenceKmeansBIC(mid_kmeans);
143
144 /* If high-k fails, try shrinking before giving up */
145 high_kmeans = LALInferenceKmeansRunBestOf(high_k, data, ntrials, rng);
146 if (!high_kmeans) {
147 high_k--;
148 high_kmeans = LALInferenceKmeansRunBestOf(high_k, data, ntrials, rng);
149 if (!high_kmeans) {
150 if (mid_bic > low_bic) {
151 LALInferenceKmeansDestroy(low_kmeans);
152 return mid_kmeans;
153 } else {
154 LALInferenceKmeansDestroy(mid_kmeans);
155 return low_kmeans;
156 }
157 }
158 }
159 high_bic = LALInferenceKmeansBIC(high_kmeans);
160
161 /* Keep doubling the highest sample until the peak is passed */
162 while (high_bic > mid_bic) {
163 low_k = mid_k;
164 mid_k = high_k;
165 high_k *= 2;
166
167 low_bic = mid_bic;
168 mid_bic = high_bic;
169
170 LALInferenceKmeansDestroy(low_kmeans);
171 low_kmeans = mid_kmeans;
172 mid_kmeans = high_kmeans;
173
174 while (1) {
175 high_kmeans =
176 LALInferenceKmeansRunBestOf(high_k, data, ntrials, rng);
177 if (!high_kmeans) {
178 high_k = mid_k + (high_k - mid_k)/2;
179 if (high_k < mid_k + 1)
180 high_bic = -INFINITY;
181 else
182 continue;
183 }
/* NOTE(review): if high_kmeans is NULL and high_k < mid_k + 1, control
 * falls through to the BIC call below with a NULL argument; verify
 * LALInferenceKmeansBIC handles NULL (or that a break was lost in
 * extraction). */
184 high_bic = LALInferenceKmeansBIC(high_kmeans);
185 break;
186 }
187 }
188
189 /* Perform bisection search to find maximum */
190 LALInferenceKmeans *kmeans = NULL;
191 while (high_k - low_k > 2) {
192 if (high_k - mid_k > mid_k - low_k) {
193 k = mid_k + (high_k - mid_k)/2;
194 if (kmeans)
/* NOTE(review): listing line 195 is missing -- almost certainly
 * "LALInferenceKmeansDestroy(kmeans);" (free the previous trial before
 * overwriting the pointer). Confirm against the original. */
196 kmeans = LALInferenceKmeansRunBestOf(k, data, ntrials, rng);
197 bic = LALInferenceKmeansBIC(kmeans);
198
199 if (bic > mid_bic) {
200 low_k = mid_k;
201 mid_k = k;
202 low_bic = mid_bic;
203 mid_bic = bic;
204 LALInferenceKmeansDestroy(low_kmeans);
205 low_kmeans = mid_kmeans;
206 mid_kmeans = kmeans;
207 kmeans = NULL;
208 } else {
209 high_k = k;
210 high_bic = bic;
211 LALInferenceKmeansDestroy(high_kmeans);
212 high_kmeans = kmeans;
213 kmeans = NULL;
214 }
215 } else {
216 k = low_k + (mid_k - low_k)/2;
217 if (kmeans)
/* NOTE(review): listing line 218 is missing -- almost certainly
 * "LALInferenceKmeansDestroy(kmeans);" as in the other branch. */
219 kmeans = LALInferenceKmeansRunBestOf(k, data, ntrials, rng);
220 bic = LALInferenceKmeansBIC(kmeans);
221
222 if (bic > mid_bic) {
223 high_k = mid_k;
224 mid_k = k;
225 high_bic = mid_bic;
226 mid_bic = bic;
227
228 LALInferenceKmeansDestroy(high_kmeans);
229 high_kmeans = mid_kmeans;
230 mid_kmeans = kmeans;
231 kmeans = NULL;
232 } else {
233 low_k = k;
234 low_bic = bic;
235 LALInferenceKmeansDestroy(low_kmeans);
236 low_kmeans = kmeans;
237 kmeans = NULL;
238 }
239 }
240 }
241
242 /* Cleanup and return the best clustering */
243 if (low_kmeans != mid_kmeans)
244 LALInferenceKmeansDestroy(low_kmeans);
245 if (high_kmeans != mid_kmeans)
246 LALInferenceKmeansDestroy(high_kmeans);
247 if (kmeans && kmeans != mid_kmeans)
/* NOTE(review): listing line 248 is missing -- presumably
 * "LALInferenceKmeansDestroy(kmeans);" guarded by the condition above. */
249 return mid_kmeans;
250}
251
252
253/**
254 * Cluster data xmeans-style, splitting centroids according to the BIC.
255 *
256 * Run an xmeans-like clustering, increasing a kmeans clustering starting from
257 * k=1 by splitting each centroid individually and checking for an increase of
258 * the BIC over the parent cluster.
259 * @param[in] data A GSL matrix containing the data to be clustered.
260 * @param[in] ntrials Number of kmeans attempts at fixed k to find optimal BIC.
261 * @param[in] rng A GSL random number generator used by the kmeans algorithm.
262 * @result A kmeans structure containing the clustering that maximizes the BIC.
263 */
/* NOTE(review): listing line 264 (return type, function name, and first
 * parameter) was dropped by the extraction -- presumably
 * "LALInferenceKmeans *LALInferenceXmeans(gsl_matrix *data,". Confirm. */
265 INT4 ntrials,
266 gsl_rng *rng) {
267 INT4 kmax = 50;
268 INT4 split_size = 2;
269 INT4 c,k;
270 REAL8 starting_bic, ending_bic;
271 REAL8 old_bic, new_bic;
272
273 LALInferenceKmeans *sub_kmeans = NULL;
274 LALInferenceKmeans *new_kmeans = NULL;
275
/* NOTE(review): listing line 276 is missing -- it must declare and create
 * the initial clustering, e.g.
 * "LALInferenceKmeans *kmeans = LALInferenceCreateKmeans(1, data, rng);"
 * since "kmeans" is used immediately below. Confirm against the original. */
277
278 /* If kmeans wasn't setup, return nothing */
279 if (!kmeans)
280 return NULL;
281
/* NOTE(review): listing line 282 is missing here (likely a comment or an
 * initialization call preceding the first run). */
283 LALInferenceKmeansRun(kmeans);
284
285 /* Keep increasing k by splitting centroids. Stop if BIC is maximized */
286 while (kmeans->k < kmax) {
287 if (kmeans->recursive_centroids != NULL)
288 gsl_matrix_free(kmeans->recursive_centroids);
289
290 /* Store the BIC of the initial clustering */
291 starting_bic = LALInferenceKmeansBIC(kmeans);
292
293 /* For each existing centroid, split and see if BIC increases locally */
294 for (c = 0; c < kmeans->k; c++) {
295 sub_kmeans = LALInferenceKmeansExtractCluster(kmeans, c);
296
297 /* Reject split if cluster extraction failed. */
298 if (sub_kmeans) {
299 old_bic = LALInferenceKmeansBIC(sub_kmeans);
300
301 new_kmeans = LALInferenceKmeansRunBestOf(split_size,
302 sub_kmeans->data,
303 ntrials, rng);
304 new_bic = LALInferenceKmeansBIC(new_kmeans);
305 } else {
306 old_bic = INFINITY;
307 new_bic = -INFINITY;
308 }
309
310 if (new_bic > old_bic) {
311 /* BIC increased, so keep the new centroids */
312 for (k = 0; k < new_kmeans->k; k++) {
313 gsl_vector_view centroid =
314 gsl_matrix_row(new_kmeans->centroids, k);
315
/* NOTE(review): listing line 316 is missing -- by the continuation on the
 * next line it is the opening of
 * "accumulate_vectors(&kmeans->recursive_centroids,". Confirm. */
317 &centroid.vector);
318 }
319 } else {
320 /* BIC decreased, keep the parent centroid */
321 gsl_vector_view centroid = gsl_matrix_row(kmeans->centroids, c);
322
/* NOTE(review): listing line 323 is missing -- the opening of
 * "accumulate_vectors(&kmeans->recursive_centroids," as above. */
324 &centroid.vector);
325 }
326
327 LALInferenceKmeansDestroy(sub_kmeans);
328 LALInferenceKmeansDestroy(new_kmeans);
329 }
330
331 /* Create a self-consistent kmeans from the resulting centroids */
332 gsl_matrix *centroids = kmeans->recursive_centroids;
333 k = centroids->size1;
334
335 new_kmeans = LALInferenceCreateKmeans(k, data, rng);
336 if (new_kmeans) {
337 gsl_matrix_memcpy(new_kmeans->centroids, centroids);
338 LALInferenceKmeansRun(new_kmeans);
339
340 /* Store BIC after all centroids have been attempted to be split */
341 ending_bic = LALInferenceKmeansBIC(new_kmeans);
342 } else {
343 /* If new kmeans couldn't be created, reject it */
344 ending_bic = -INFINITY;
345 }
346
347 if (ending_bic > starting_bic) {
348 /* BIC improved, so update and continue */
/* NOTE(review): listing line 349 is missing -- most likely
 * "LALInferenceKmeansDestroy(kmeans);" freeing the old clustering before
 * it is replaced on the next line. Confirm against the original. */
350 kmeans = new_kmeans;
351 } else {
352 /* Clustering was unchanged, or resulted in a lower BIC. End */
353 LALInferenceKmeansDestroy(new_kmeans);
354 break;
355 }
356 }
357
358 return kmeans;
359}
360
361
362/**
363 * Run kmeans, recursively splitting centroids, accepting if the BIC increases.
364 *
365 * Run kmeans recursively, splitting each centroid in two recursively. At each
366 * step, the BIC is checked before and after splitting. If the split
367 * increases the BIC, each centroid is then attempted to be split. If the BIC
368 * decreases, the split is rejected and the centroid is kept. This is less
369 * precise than other methods, but much faster, as the likelihood is computed
370 * at fewer and fewer points as centroids are split.
371 * @param[in] data A GSL matrix containing the data to be clustered.
372 * @param[in] ntrials Number of kmeans attempts at fixed k to find optimal BIC.
373 * @param[in] rng A GSL random number generator used by the kmeans algorithm.
374 * @result A kmeans structure containing the clustering that maximizes the BIC.
375 */
/* NOTE(review): listing line 376 (return type, function name, and first
 * parameter) was dropped by the extraction -- presumably
 * "LALInferenceKmeans *LALInferenceRecursiveKmeans(gsl_matrix *data,". */
377 INT4 ntrials,
378 gsl_rng *rng) {
379 INT4 k;
380
381 /* Perform recursive splitting. Return NULL if kmeans creation fails */
382 LALInferenceKmeans *split_kmeans = LALInferenceCreateKmeans(1, data, rng);
383 if (!split_kmeans)
384 return NULL;
385
/* NOTE(review): listing line 386 is missing here (likely an initialization
 * call on split_kmeans before the recursive split). Confirm. */
387 LALInferenceKmeansRecursiveSplit(split_kmeans, ntrials, rng);
388
389 /* Take the final centroids and make a fully self-consistent kmeans */
390 gsl_matrix *centroids = split_kmeans->recursive_centroids;
391
392 k = centroids->size1;
/* NOTE(review): listing line 393 is missing -- it must declare and create
 * the final clustering, e.g.
 * "LALInferenceKmeans *kmeans = LALInferenceCreateKmeans(k, data, rng);"
 * since "kmeans" is used immediately below. Also note split_kmeans is not
 * freed on the early return below -- possible leak; confirm upstream. */
394
395 if (!kmeans)
396 return NULL;
397
398 gsl_matrix_memcpy(kmeans->centroids, centroids);
399 LALInferenceKmeansRun(kmeans);
400
401 LALInferenceKmeansDestroy(split_kmeans);
402 return kmeans;
403}
404
405
406/**
407 * Recursively split a k=1 kmeans.
408 *
409 * Split a centroid in 2 recursively, evaluating the BIC to determine if the
410 * split is kept. This is a cheap method, as the BIC becomes cheaper to
411 * compute after each split.
412 * @param[in] kmeans A k=1 kmeans to be split recursively.
413 * @param[in] ntrials Number of kmeans attempts at fixed k to find optimal BIC.
414 * @param[in] rng A GSL random number generator used by the kmeans algorithm.
415 * \sa LALInferenceRecursiveKmeans()
416 */
/* NOTE(review): listing line 417 (return type, function name, and first
 * parameter) was dropped by the extraction -- presumably
 * "void LALInferenceKmeansRecursiveSplit(LALInferenceKmeans *kmeans,". */
418 INT4 ntrials,
419 gsl_rng *rng) {
420 LALInferenceKmeansRun(kmeans);
421 INT4 cluster, i;
422 INT4 split_size = 2;
423 REAL8 current_bic, new_bic;
424
425 LALInferenceKmeans *new_kmeans =
426 LALInferenceKmeansRunBestOf(split_size, kmeans->data, ntrials, rng);
427
428 current_bic = LALInferenceKmeansBIC(kmeans);
429 new_bic = LALInferenceKmeansBIC(new_kmeans);
430
431 /* If the BIC increased, attempt a further splitting of the centroids */
432 if (new_bic > current_bic) {
433 for (cluster = 0; cluster < new_kmeans->k; cluster++) {
434 LALInferenceKmeans *sub_kmeans =
435 LALInferenceKmeansExtractCluster(new_kmeans, cluster);
436
437 if (sub_kmeans)
438 LALInferenceKmeansRecursiveSplit(sub_kmeans, ntrials, rng);
439
/* NOTE(review): sub_kmeans is dereferenced below even when the extraction
 * above returned NULL (the guard only covers the recursive call) --
 * potential NULL dereference; verify against the original source. */
440 INT4 n_new_centroids = sub_kmeans->recursive_centroids->size1;
441 for (i = 0; i < n_new_centroids ; i++) {
442 gsl_vector_view centroid =
443 gsl_matrix_row(sub_kmeans->recursive_centroids, i);
444
/* NOTE(review): listing line 445 is missing -- by the continuation on the
 * next line it is the opening of
 * "accumulate_vectors(&kmeans->recursive_centroids,". Confirm. */
446 &centroid.vector);
447 }
448
449 LALInferenceKmeansDestroy(sub_kmeans);
450 }
451 } else {
452 /* The split was rejected, so save the old centroid */
453 gsl_vector_view centroid = gsl_matrix_row(kmeans->centroids, 0);
454 accumulate_vectors(&kmeans->recursive_centroids, &centroid.vector);
455 }
456
457 LALInferenceKmeansDestroy(new_kmeans);
458}
459
460
461/**
462 * Run the kmeans algorithm until cluster assignments don't change.
463 *
464 * Starting with some random initialization, points are assigned to the closest
465 * centroids, then the centroids of the new clusters are recalculated. This is
466 * repeated until the assignments stop changing.
467 * @param kmeans The initialized kmeans to run.
468 */
/* NOTE(review): listing line 469 (the signature) was dropped by the
 * extraction -- presumably
 * "void LALInferenceKmeansRun(LALInferenceKmeans *kmeans) {". */
470 INT4 i;
471
472 while (kmeans->has_changed) {
473 kmeans->has_changed = 0;
474
/* NOTE(review): listing lines 475-476 are missing -- these must be the
 * assignment and update steps of the iteration, presumably
 * "LALInferenceKmeansAssignment(kmeans);" and
 * "LALInferenceKmeansUpdate(kmeans);". Confirm against the original. */
477 }
478
/* Cluster weights = fraction of all points assigned to each cluster */
479 for (i = 0; i < kmeans->k; i++)
480 kmeans->weights[i] = (REAL8)(kmeans->sizes[i]) / (REAL8)(kmeans->npts);
481}
482
483
484/**
485 * Run a kmeans several times and return the best.
486 *
487 * The kmeans is run \a ntrials times, each time with a new random
488 * initialization. The one that results in the highest Bayes Information
489 * Criteria (BIC) is returned.
490 * @param[in] k The number of clusters to use.
491 * @param[in] samples The (unwhitened) data to cluster.
492 * @param[in] ntrials The number of random initializations to run.
493 * @param[in] rng The GSL random number generator to use for random
494 * initializations.
495 * @return The kmeans with the highest BIC of \a ntrials attempts.
496 */
/* NOTE(review): listing line 497 (return type, function name, and first
 * parameter) was dropped by the extraction -- presumably
 * "LALInferenceKmeans *LALInferenceKmeansRunBestOf(INT4 k,". */
498 gsl_matrix *samples,
499 INT4 ntrials,
500 gsl_rng *rng) {
501 INT4 i;
502
503 LALInferenceKmeans *best_kmeans = NULL;
504 REAL8 best_bic = -INFINITY;
505
506 /* Don't bother with multiple trials if k=1 */
507 if (k == 1)
508 ntrials = 1;
509
510 #pragma omp parallel
511 {
512 LALInferenceKmeans *kmeans;
513 REAL8 bic = 0;
514 REAL8 error = -INFINITY;
515
516 #pragma omp for
517 for (i = 0; i < ntrials; i++) {
518 kmeans = LALInferenceCreateKmeans(k, samples, rng);
519 if (!kmeans)
520 continue;
521
/* NOTE(review): listing line 522 is missing -- presumably the centroid
 * initialization call (e.g. "LALInferenceKmeansSeededInitialize(kmeans);")
 * that precedes the run. Confirm against the original. */
523 LALInferenceKmeansRun(kmeans);
524
525 /* Assume BIC hasn't changed if
526 * error (summed-dist^2 from assigned centroids) hasn't */
527 if (error != kmeans->error) {
528 error = kmeans->error;
529 bic = LALInferenceKmeansBIC(kmeans);
530 }
531
532 #pragma omp critical
533 {
534 if (bic > best_bic) {
535 if (best_kmeans)
536 LALInferenceKmeansDestroy(best_kmeans);
537 best_bic = bic;
538 best_kmeans = kmeans;
539 } else {
/* NOTE(review): listing line 540 is missing -- most likely
 * "LALInferenceKmeansDestroy(kmeans);" to free a losing trial. Confirm. */
541 }
542 }
543 }
544 }
545
546 return best_kmeans;
547}
548
549/**
550 * Generate a new kmeans struct from a set of data.
551 *
552 * Whitens data and fills a newly allocated kmeans structure of the
553 * requested size.
554 * @param[in] k The requested number of clusters.
555 * @param[in] data The unwhitened data to be clustered.
556 * @param[in] rng A GSL random number generator used for initialization.
557 */
/* NOTE(review): listing line 558 (return type, function name, and first
 * parameter) was dropped by the extraction -- presumably
 * "LALInferenceKmeans *LALInferenceCreateKmeans(INT4 k,". */
559 gsl_matrix *data,
560 gsl_rng *rng) {
561 INT4 i;
562
563 /* Return nothing if given empty dataset */
564 if (!data)
565 return NULL;
566
/* NOTE(review): listing line 567 is missing -- it must allocate the struct,
 * e.g. "LALInferenceKmeans *kmeans = XLALCalloc(1, sizeof(LALInferenceKmeans));"
 * since "kmeans" is used immediately below. Confirm against the original. */
568 kmeans->k = k;
569 kmeans->npts = data->size1;
570 kmeans->dim = data->size2;
571 kmeans->has_changed = 1;
572 kmeans->rng = rng;
573
574 /* Allocate GSL matrices and vectors */
575 kmeans->mean = gsl_vector_alloc(kmeans->dim);
576 kmeans->cov = gsl_matrix_alloc(kmeans->dim, kmeans->dim);
577 kmeans->std = gsl_vector_alloc(kmeans->dim);
578 kmeans->data = gsl_matrix_alloc(kmeans->npts, kmeans->dim);
579 kmeans->centroids = gsl_matrix_alloc(kmeans->k, kmeans->dim);
580 kmeans->recursive_centroids = NULL;
581
582 /* Allocate everything else */
583 kmeans->assignments = XLALCalloc(kmeans->npts, sizeof(INT4));
584 kmeans->sizes = XLALCalloc(kmeans->k, sizeof(INT4));
585 kmeans->mask = XLALCalloc(kmeans->npts, sizeof(INT4));
586 kmeans->weights = XLALCalloc(kmeans->k, sizeof(REAL8));
587 kmeans->KDEs = NULL;
588
589 /* Set distance and centroid calculators */
590 kmeans->dist = &euclidean_dist_squared;
591 kmeans->centroid = &euclidean_centroid;
592
593 /* Store the unwhitened mean and covariance matrix for later transformations */
/* NOTE(review): listing lines 594-595 are missing -- these must compute
 * kmeans->mean and kmeans->cov from "data" (kmeans->cov is read below).
 * Confirm the exact calls against the original source. */
596
/* Per-dimension standard deviations from the covariance diagonal */
597 for (i = 0; i < kmeans->dim; i++)
598 gsl_vector_set(kmeans->std, i, sqrt(gsl_matrix_get(kmeans->cov, i, i)));
599
600 /* Whiten the data */
/* NOTE(review): listing line 601 is missing -- presumably
 * "kmeans->data = LALInferenceWhitenSamples(data);" (the NULL check below
 * tests the result). Also note line 604 is missing before the early return,
 * likely "LALInferenceKmeansDestroy(kmeans);". Confirm. */
602 if (!kmeans->data) {
603 fprintf(stderr, "Unable to whiten data. No proposal has been built.\n");
605 return NULL;
606 }
607
608
609 return kmeans;
610}
611
612
613/**
614 * A 'k-means++'-like seeded initialization of centroids for a kmeans run.
615 * @param kmeans The kmeans to initialize the centroids of.
616 */
/* NOTE(review): listing line 617 (the signature) was dropped by the
 * extraction -- presumably
 * "void LALInferenceKmeansSeededInitialize(LALInferenceKmeans *kmeans) {". */
618 INT4 i, j;
619 INT4 u = 0;
620 REAL8 dist, norm, randomDraw;
621 gsl_vector_view c, x;
622
623 gsl_vector *dists = gsl_vector_alloc(kmeans->npts);
624 gsl_permutation *p = gsl_permutation_alloc(kmeans->npts);
625
626 /* Choose first centroid randomly from data */
627 i = gsl_rng_uniform_int(kmeans->rng, kmeans->npts);
628
629 for (j = 0; j < kmeans->dim; j++)
630 gsl_matrix_set(kmeans->centroids, u, j,
631 gsl_matrix_get(kmeans->data, i, j));
632
633 u++;
634 while (u < kmeans->k) {
635 /* Calculate distances from centroid */
636 norm = 0.0;
637 c = gsl_matrix_row(kmeans->centroids, u-1);
638 for (i = 0; i < kmeans->npts; i++) {
639 x = gsl_matrix_row(kmeans->data, i);
640 dist = kmeans->dist(&x.vector, &c.vector);
641
642 gsl_vector_set(dists, i, dist);
643 norm += dist;
644 }
645
/* Normalize distances to probabilities and draw a point with probability
 * proportional to distance (k-means++-style seeding) */
646 gsl_vector_scale(dists, 1.0/norm);
647 gsl_sort_vector_index(p, dists);
648
649 randomDraw = gsl_rng_uniform(kmeans->rng);
650
651 i = 0;
652 dist = gsl_vector_get(dists, p->data[i]);
653 while (dist < randomDraw) {
654 i++;
655 dist += gsl_vector_get(dists, p->data[i]);
656 }
657
/* NOTE(review): the cumulative walk above runs in the sorted order p, but
 * the data row below is indexed with the walk counter "i" rather than
 * p->data[i] -- verify this index choice against the original source. */
658 for (j = 0; j < kmeans->dim; j++)
659 gsl_matrix_set(kmeans->centroids, u, j,
660 gsl_matrix_get(kmeans->data, i, j));
661
662 u++;
663 }
664
/* NOTE(review): "dists" is allocated above but not visibly freed in this
 * listing -- possible leak; confirm against the original source. */
665 gsl_permutation_free(p);
666}
667
668
669/**
670 * Randomly select points from the data as initial centroids for a kmeans run.
671 * @param kmeans The kmeans to initialize the centroids of.
672 */
/* NOTE(review): listing line 673 (the signature) was dropped by the
 * extraction -- presumably
 * "void LALInferenceKmeansForgyInitialize(LALInferenceKmeans *kmeans) {". */
674 INT4 i, j, u;
675
/* Shuffle row indices, then take the first k as centroids (Forgy method) */
676 gsl_permutation *p = gsl_permutation_alloc(kmeans->npts);
677
678 gsl_permutation_init(p);
679 gsl_ran_shuffle(kmeans->rng, p->data, kmeans->npts, sizeof(size_t));
680
681 for (i = 0; i < kmeans->k; i++) {
682 u = gsl_permutation_get(p, i);
683 for (j = 0; j < kmeans->dim; j++)
684 gsl_matrix_set(kmeans->centroids, i, j,
685 gsl_matrix_get(kmeans->data, u, j));
686 }
687
688 gsl_permutation_free(p);
689}
690
691
692/**
693 * Use the resulting centroids from a random cluster assignment as the initial
694 * centroids of a kmeans run.
695 * @param kmeans The kmeans to initialize the centroids of.
696 */
/* NOTE(review): listing line 697 (the signature) was dropped by the
 * extraction -- presumably
 * "void LALInferenceKmeansRandomPartitionInitialize(LALInferenceKmeans *kmeans) {". */
698 INT4 i;
699 INT4 cluster_id;
700
/* Assign every point to a uniformly random cluster */
701 for (i = 0; i < kmeans->npts; i++) {
702 cluster_id = gsl_rng_uniform_int(kmeans->rng, kmeans->k);
703 kmeans->assignments[i] = cluster_id;
704 }
705
/* NOTE(review): listing line 706 is missing -- presumably
 * "LALInferenceKmeansUpdate(kmeans);" to derive centroids from the random
 * assignment. Confirm against the original. */
707}
708
709
710/**
711 * The assignment step of the kmeans algorithm.
712 *
713 * Assign all data to the closest centroid and calculate the error, defined
714 * as the cumulative sum of the distance between all points and their closest
715 * centroid.
716 * @param kmeans The kmeans to perform the assignment step on.
717 */
/* NOTE(review): listing line 718 (the signature) was dropped by the
 * extraction -- presumably
 * "void LALInferenceKmeansAssignment(LALInferenceKmeans *kmeans) {". */
719 INT4 i, j;
720
721 kmeans->error = 0.;
722
723 for (i = 0; i < kmeans->npts; i++) {
724 gsl_vector_view x = gsl_matrix_row(kmeans->data, i);
725 gsl_vector_view c;
726
727 INT4 best_cluster = 0;
728 REAL8 best_dist = INFINITY;
729 REAL8 dist;
730
731 /* Find the closest centroid */
732 for (j = 0; j < kmeans->k; j++) {
733 c = gsl_matrix_row(kmeans->centroids, j);
734 dist = kmeans->dist(&x.vector, &c.vector);
735
736 if (dist < best_dist) {
737 best_cluster = j;
738 best_dist = dist;
739 }
740 }
741
742 /* Check if the point's assignment has changed */
743 INT4 current_cluster = kmeans->assignments[i];
744 if (best_cluster != current_cluster) {
/* Flag convergence loop (LALInferenceKmeansRun) to keep iterating */
745 kmeans->has_changed = 1;
746 kmeans->assignments[i] = best_cluster;
747 }
748 kmeans->error += best_dist;
749 }
750
751 /* Recalculate cluster sizes */
752 for (i = 0; i < kmeans->k; i++)
753 kmeans->sizes[i] = 0;
754
755 for (i = 0; i < kmeans->npts; i++)
756 kmeans->sizes[kmeans->assignments[i]]++;
757}
758
759
760/**
761 * The update step of the kmeans algorithm.
762 *
763 * Based on the current assignments, calculate the new centroid of each cluster.
764 * @param kmeans The kmeans to perform the update step on.
765 */
/* NOTE(review): listing line 766 (the signature) was dropped by the
 * extraction -- presumably
 * "void LALInferenceKmeansUpdate(LALInferenceKmeans *kmeans) {". */
767 INT4 i;
768
769 for (i = 0; i < kmeans->k; i ++) {
/* Select the points assigned to cluster i, then recompute its centroid */
770 LALInferenceKmeansConstructMask(kmeans, kmeans->mask, i);
771
772 gsl_vector_view c = gsl_matrix_row(kmeans->centroids, i);
773 kmeans->centroid(&c.vector, kmeans->data, kmeans->mask);
774 }
775}
776
777
778
779/**
780 * Construct a mask to select only the data assigned to a single cluster.
781 *
782 * Construct a mask that selects the data from \a kmeans assigned to the centroid
783 * indexed in the centroid list by \a cluster_id.
784 * @param[in] kmeans The kmeans clustering to mask the data of.
785 * @param[out] mask The mask that will select points assigned to
786 * \a cluster_id in \a kmeans.
787 * @param[in] cluster_id The index of the centroid in \a kmeans to choose points
788 * that are assigned to.
789 */
/* NOTE(review): listing line 790 (return type, function name, and first
 * parameter) was dropped by the extraction -- presumably
 * "void LALInferenceKmeansConstructMask(LALInferenceKmeans *kmeans,". */
791 INT4 *mask,
792 INT4 cluster_id) {
793 INT4 i;
/* mask[i] = 1 where point i belongs to cluster_id, 0 elsewhere */
794 for (i = 0; i < kmeans->npts; i++) {
795 if (kmeans->assignments[i] == cluster_id)
796 mask[i] = 1;
797 else
798 mask[i] = 0;
799 }
800}
801
802
803/**
804 * Extract a single cluster from an existing kmeans as a new 1-means.
805 *
806 * Given the index of the centroid requested, generate a new 1-means structure
807 * containing only the points assigned to that cluster.
808 * @param[in] kmeans The parent kmeans to extract the cluster from.
809 * @param[in] cluster_id The index of the centroid in \a kmeans corresponding
810 * to the cluster to extract.
811 * @result A kmeans with k=1 containing only the points assigned to cluster
812 * \a cluster_id from \a kmeans.
813 */
/* NOTE(review): listing line 814 (return type, function name, and first
 * parameter) was dropped by the extraction -- presumably
 * "LALInferenceKmeans *LALInferenceKmeansExtractCluster(LALInferenceKmeans *kmeans,". */
815 INT4 cluster_id) {
816 /* Construct a mask to select only the cluster requested */
817 LALInferenceKmeansConstructMask(kmeans, kmeans->mask, cluster_id);
818 gsl_matrix *masked_data = mask_data(kmeans->data, kmeans->mask);
819
820 /* Create the kmeans if cluster isn't empty (i.e. masked_array != NULL) */
821 LALInferenceKmeans *sub_kmeans = NULL;
822 if (masked_data) {
823 sub_kmeans = LALInferenceCreateKmeans(1, masked_data, kmeans->rng);
824 gsl_matrix_free(masked_data);
825 }
826
827 /* If cluster was extracted successfully, then run it */
828 if (sub_kmeans) {
829 /* Initialize and assign all points to the only cluster */
/* NOTE(review): listing line 830 is missing -- presumably an
 * initialization call (e.g. a Forgy/trivial initialization of the single
 * centroid) before the run. Confirm against the original. */
831 LALInferenceKmeansRun(sub_kmeans);
832 }
833
834 return sub_kmeans;
835}
836
837
838/**
839 * Impose boundaries on individual KDEs.
840 *
841 * Draw samples from each cluster. If too many samples lie outside of the
842 * prior, impose a cyclic/reflective bound for the offending parameter(s) if
843 * requested. If boundaries aren't encountered, box in the cluster using the
844 * samples drawn. This avoids needing to evaluate KDEs that are too far away.
845 * @param[in] kmeans kmeans to cycle through the clusters of.
846 * @param[in] params Parameters to impose bounds on.
847 * @param[in] priorArgs Variables containing prior boundaries.
848 * @param[in] cyclic_reflective Flag to check for cyclic/reflective bounds.
849 *
850 */
/* NOTE(review): listing lines 851-852 (return type, function name, and
 * first parameters) were dropped by the extraction -- presumably
 * "void LALInferenceKmeansImposeBounds(LALInferenceKmeans *kmeans," followed
 * by "LALInferenceVariables *params,". Confirm against the original. */
853 LALInferenceVariables *priorArgs,
854 INT4 cyclic_reflective) {
855 INT4 i, p, dim;
856 INT4 n_below, n_above;
857 INT4 ndraws, n_thresh;
858 INT4 c;
859 REAL8 draw;
860 REAL8 min, max, threshold;
/* NOTE(review): drawn_min/drawn_max are initialized once here and never
 * visibly reset inside the per-cluster or per-parameter loops below, so
 * extremes accumulate across parameters and clusters -- verify this is
 * intended (a reset may have been lost in extraction). */
861 REAL8 drawn_min = INFINITY, drawn_max = -INFINITY;
862 REAL8 mean, std;
863 LALInferenceVariableItem *param=NULL;
864
865 dim = kmeans->dim;
866
867 ndraws = 1000; // Number of draws from the KDE to check
868 threshold = 0.01; // Fraction outside boundary to make it cyclic/reflective
869 n_thresh = (INT4)(threshold * ndraws);
870
871 REAL8 *draws = XLALCalloc(ndraws * dim, sizeof(REAL8));
872 for (c = 0; c < kmeans->k; c++) {
873 /* Skip empty clusters */
874 if (kmeans->npts == 0)
875 continue;
876
877 for (i = 0; i < ndraws; i++) {
878 /* Draw a whitened sample from the cluster's KDE */
879 REAL8 *point =
880 LALInferenceDrawKDESample(kmeans->KDEs[c], kmeans->rng);
881
882 memcpy(&(draws[i*dim]), point, dim*sizeof(REAL8));
883 XLALFree(point);
884 }
885
886 p = 0;
887 for (param = params->head; param; param = param->next) {
/* NOTE(review): listing line 888 is missing here -- likely a guard that
 * skips non-sampled/fixed parameters before whitening the bounds.
 * Confirm against the original source. */
889 /* Whiten prior boundaries */
890 LALInferenceGetMinMaxPrior(priorArgs, param->name, &min, &max);
891 mean = gsl_vector_get(kmeans->mean, p);
892 std = gsl_vector_get(kmeans->std, p);
893 min = (min - mean)/std;
894 max = (max - mean)/std;
895
896 n_below = 0;
897 n_above = 0;
898 for (i = 0; i < ndraws; i++) {
899 draw = draws[i*dim + p];
900 if (draw < min)
901 n_below++;
902 else if (draw > max)
903 n_above++;
904
905 if (draw < drawn_min)
906 drawn_min = draw;
907 else if (draw > drawn_max)
908 drawn_max = draw;
909 }
910
911 if (cyclic_reflective && n_below > n_thresh) {
912 /* Impose cyclic boundaries on both sides */
913 if (param->vary == LALINFERENCE_PARAM_CIRCULAR) {
914 kmeans->KDEs[c]->lower_bound_types[p] = param->vary;
915 kmeans->KDEs[c]->upper_bound_types[p] = param->vary;
916 } else {
917 kmeans->KDEs[c]->lower_bound_types[p] = param->vary;
918 }
919 } else {
920 min = drawn_min;
/* NOTE(review): listing line 921 is missing -- presumably the statement
 * marking the lower bound type as a plain box bound. Confirm. */
922 }
923
924 if (cyclic_reflective && n_above > n_thresh) {
925 /* Impose cyclic boundaries on both sides */
926 if (param->vary == LALINFERENCE_PARAM_CIRCULAR) {
927 kmeans->KDEs[c]->lower_bound_types[p] = param->vary;
928 kmeans->KDEs[c]->upper_bound_types[p] = param->vary;
929 } else {
930 kmeans->KDEs[c]->upper_bound_types[p] = param->vary;
931 }
932 } else {
933 max = drawn_max;
/* NOTE(review): listing line 934 is missing -- presumably the matching
 * statement for the upper bound type. Confirm. */
935 }
936
937 kmeans->KDEs[c]->lower_bounds[p] = min;
938 kmeans->KDEs[c]->upper_bounds[p] = max;
939
940 p++;
941 }
942 }
943 }
944}
945
946/**
947 * Generate a new matrix by masking an existing one.
948 *
949 * Allocate and fill a new GSL matrix by masking an
950 * existing one.
951 * @param[in] data GSL matrix whose rows will be masked.
952 * @param[in] mask 0/1 array with length equal to the number of rows in \a data.
953 * @return A new GSL matrix constaining only the masked data.
954 */
955gsl_matrix *mask_data(gsl_matrix *data, INT4 *mask) {
956 INT4 i;
957 INT4 N = data->size1;
958 INT4 dim = data->size2;
959
960 /* Calculate how big the resulting matrix will be */
961 INT4 new_N = 0;
962 for (i = 0; i < N; i++)
963 new_N += mask[i];
964
965 /* Return NULL if masked array is empty */
966 gsl_matrix *masked_data = NULL;
967 if (new_N > 0)
968 masked_data = gsl_matrix_alloc(new_N, dim);
969 else
970 return NULL;
971
972 gsl_vector_view source, target;
973
974 INT4 idx = 0;
975 for (i = 0; i < N; i++) {
976 if (mask[i]) {
977 source = gsl_matrix_row(data, i);
978 target = gsl_matrix_row(masked_data, idx);
979 gsl_vector_memcpy(&target.vector, &source.vector);
980 idx++;
981 }
982 }
983 return masked_data;
984}
985
986
987/**
988 * Add a vector to the end of a matrix, allocating if necessary.
989 *
990 * Utility for acculating vectors in a GSL matrix, allocating if its
991 * the first vector to be accumulated.
992 * @param[in] mat_ptr Pointer to the GSL matrix to append to.
993 * @param[in] vec The vector to append to the matrix pointed to by \a mat_ptr.
994 * \sa append_vec_to_mat()
995 */
996void accumulate_vectors(gsl_matrix **mat_ptr, gsl_vector *vec) {
997 gsl_matrix *mat = *mat_ptr;
998
999 if (mat == NULL) {
1000 mat = gsl_matrix_alloc(1, vec->size);
1001 gsl_vector_view row = gsl_matrix_row(mat, 0);
1002 gsl_vector_memcpy(&row.vector, vec);
1003 *mat_ptr = mat;
1004 } else {
1005 append_vec_to_mat(mat_ptr, vec);
1006 }
1007}
1008
1009/**
1010 * Add a vector to the end of a matrix.
1011 *
1012 * Utility for acculating vectors in a GSL matrix.
1013 * @param[in] mat_ptr Pointer to the GSL matrix to append to.
1014 * @param[in] vec The vector to append to the matrix pointed to by \a mat_ptr.
1015 */
1016void append_vec_to_mat(gsl_matrix **mat_ptr, gsl_vector *vec) {
1017 gsl_matrix *mat = *mat_ptr;
1018 INT4 rows = mat->size1;
1019 INT4 cols = mat->size2;
1020
1021 gsl_matrix *new_mat = gsl_matrix_alloc(rows+1, cols);
1022
1023 /* Copy over existing rows */
1024 gsl_matrix_view sub_mat_view =
1025 gsl_matrix_submatrix(new_mat, 0, 0, rows, cols);
1026 gsl_matrix_memcpy(&sub_mat_view.matrix, mat);
1027
1028 /* Copy vector into new row */
1029 gsl_vector_view row_view = gsl_matrix_row(new_mat, rows);
1030 gsl_vector_memcpy(&row_view.vector, vec);
1031
1032 gsl_matrix_free(mat);
1033 *mat_ptr = new_mat;
1034}
1035
1036
1037/**
1038 * Draw a sample from a kmeans-KDE estimate of a distribution.
1039 *
1040 * Draw a random sample from the estimated distribution of a set of points.
1041 * A cluster from kmeans is picked at random, from which a sample is drawn from
1042 * the KDE of the cluster.
1043 * @param[in] kmeans The kmeans to draw a sample from.
1044 * @returns An array containing a sample drawn from \a kmeans.
1045 */
/* NOTE(review): listing line 1046 (the signature) was dropped by the
 * extraction -- presumably
 * "REAL8 *LALInferenceKmeansDraw(LALInferenceKmeans *kmeans) {".
 * The returned array is heap-allocated by the KDE draw; the caller owns it. */
1047 /* Draw a cluster at random, using the weights assigned to each cluster */
1048 REAL8 randomDraw = gsl_rng_uniform(kmeans->rng);
1049
1050 INT4 cluster = 0;
1051 REAL8 cumulativeWeight = kmeans->weights[cluster];
1052 while (cumulativeWeight < randomDraw) {
1053 cluster++;
1054 cumulativeWeight += kmeans->weights[cluster];
1055 }
1056
1057 /* Draw a whitened sample from the chosen cluster's KDE */
1058 REAL8 *point =
1059 LALInferenceDrawKDESample(kmeans->KDEs[cluster], kmeans->rng);
1060
1061 /* Recolor the sample */
1062 gsl_vector_view pt = gsl_vector_view_array(point, kmeans->dim);
1063 gsl_vector_mul(&pt.vector, kmeans->std);
1064 gsl_vector_add(&pt.vector, kmeans->mean);
1065
1066 return point;
1067}
1068
1069
1070/**
1071 * Evaluate the estimated (log) PDF from a clustered-KDE at a point.
1072 *
1073 * Compute the (log) value of the estimated probability density function from
1074 * the clustered kernel density estimate of a distribution.
1075 * @param[in] kmeans The kmeans clustering to estimate the PDF from.
1076 * @param[in] pt Array containing the point to evaluate the distribution at.
1077 * @return The estimated value of the PDF at \a pt.
1078 */
/* NOTE(review): listing line 1079 (the signature) was dropped by the
 * extraction -- presumably
 * "REAL8 LALInferenceKmeansPDF(LALInferenceKmeans *kmeans, REAL8 *pt) {". */
1080 /* The vector is first whitened to be consistent with the data stored
1081 * in the kernel density estimator. */
1082 gsl_vector *y = gsl_vector_alloc(kmeans->dim);
1083 gsl_vector_view pt_view = gsl_vector_view_array(pt, kmeans->dim);
1084
1085 /* Copy the point to a local vector, since it will be overwritten */
1086 gsl_vector_memcpy(y, &pt_view.vector);
1087
1088 /* Subtract the mean from the point */
1089 gsl_vector_sub(y, kmeans->mean);
/* ...then divide by the per-dimension standard deviations to whiten */
1090 gsl_vector_div(y, kmeans->std);
1091
1092 REAL8 result = LALInferenceWhitenedKmeansPDF(kmeans, y->data);
1093
1094 gsl_vector_free(y);
1095
1096 return result;
1097}
1098
1099
1100/**
1101 * Estimate (log) PDF from a clustered-KDE at an already whitened point.
1102 *
1103 * Calculate the probability at a point from the clustered-KDE estimate,
1104 * assuming the point has already been whitened using the same process as the
1105 * stored data in kmeans. This is particularly useful when evaluating the BIC
1106 * during clustering.
1107 * @param[in] kmeans The kmeans clustering to estimate the PDF from.
1108 * @param[in] pt Array containing the point to evaluate the distribution at.
1109 * @return The estimated value of the PDF at \a pt.
1110 */
1112 INT4 j;
1113
1114 if (kmeans->KDEs == NULL)
1116
1117 REAL8* cluster_pdfs = XLALCalloc(kmeans->k, sizeof(REAL8));
1118 for (j = 0; j < kmeans->k; j++)
1119 cluster_pdfs[j] = log(kmeans->weights[j]) +
1120 LALInferenceKDEEvaluatePoint(kmeans->KDEs[j], pt);
1121
1122 REAL8 result = log_add_exps(cluster_pdfs, kmeans->k);
1123 XLALFree(cluster_pdfs);
1124 return result;
1125}
1126
1127
1128/**
1129 * Transform a data set to obtain a 0-mean and identity covariance matrix.
1130 *
1131 * Determine and execute the transformation of a data set to remove any global
1132 * correlations in a data set.
1133 * @param[in] samples The data set to whiten.
1134 * @return A whitened data set with samples corresponding to \a samples.
1135 */
1136gsl_matrix * LALInferenceWhitenSamples(gsl_matrix *samples) {
1137 INT4 i;
1138 INT4 npts = samples->size1;
1139 INT4 dim = samples->size2;
1140
1141 gsl_matrix *whitened_samples = gsl_matrix_alloc(npts, dim);
1142 gsl_matrix_memcpy(whitened_samples, samples);
1143
1144 /* Calculate the mean and covariance matrix */
1145 gsl_vector *mean = gsl_vector_alloc(dim);
1146 gsl_matrix *cov = gsl_matrix_alloc(dim, dim);
1147 gsl_vector *std = gsl_vector_alloc(dim);
1150
1151 for (i = 0; i < dim; i++)
1152 gsl_vector_set(std, i, sqrt(gsl_matrix_get(cov, i, i)));
1153
1154 /* Decorrelate the data */
1155 gsl_vector_view y;
1156 for (i=0; i<npts; i++) {
1157 y = gsl_matrix_row(whitened_samples, i);
1158 gsl_vector_sub(&y.vector, mean);
1159 gsl_vector_div(&y.vector, std);
1160 }
1161
1162 gsl_vector_free(mean);
1163 gsl_matrix_free(cov);
1164 gsl_vector_free(std);
1165
1166 return whitened_samples;
1167}
1168
1169/**
1170 * Calculate the squared Euclidean distance bewteen two points.
1171 *
1172 * @param[in] x A GSL vector.
1173 * @param[in] y Another GSL vector.
1174 * @return The squared Euclidean distance between \a x and \a y.
1175 */
1176REAL8 euclidean_dist_squared(gsl_vector *x, gsl_vector *y) {
1177 size_t i;
1178
1179 REAL8 dist = 0.;
1180 for (i = 0; i < x->size; i++) {
1181 REAL8 diff = gsl_vector_get(x, i) - gsl_vector_get(y, i);
1182 dist += diff * diff;
1183 }
1184
1185 return dist;
1186}
1187
1188
1189/**
1190 * Find the centroid of a masked data set.
1191 *
1192 * @param[out] centroid The newly determined centroid.
1193 * @param[in] data Set of points to determin \a centroid from.
1194 * @param[in] mask Mask to select which samples in \a data to calculate the
1195 * centroid of.
1196 */
1197void euclidean_centroid(gsl_vector *centroid, gsl_matrix *data, INT4 *mask) {
1198 INT4 npts = data->size1;
1199 INT4 dim = data->size2;
1200 INT4 i;
1201 INT4 count = 0;
1202
1203 for (i = 0; i < dim; i++)
1204 gsl_vector_set(centroid, i, 0.);
1205
1206 for (i = 0; i < npts; i++) {
1207 if (mask[i]) {
1208 gsl_vector_view x = gsl_matrix_row(data, i);
1209 gsl_vector_add(centroid, &x.vector);
1210 count++;
1211 }
1212 }
1213
1214 gsl_vector_scale(centroid, 1./count);
1215}
1216
1217
1218/**
1219 * Build the kernel density estimate from a kmeans clustering.
1220 *
1221 * Given the current clustering, estimate the distribution of each cluster
1222 * using a kernel density estimator. This will be used to evaluate the Bayes
1223 * Information Criteria when deciding the optimal clustering.
1224 * @param kmeans The kmeans to estimate the cluster distributions of.
1225 */
1227 INT4 i;
1228
1229 if (kmeans->KDEs == NULL)
1230 kmeans->KDEs = XLALCalloc(kmeans->k, sizeof(LALInferenceKDE*));
1231
1232 for (i = 0; i < kmeans->k; i++) {
1233 LALInferenceKmeansConstructMask(kmeans, kmeans->mask, i);
1234 kmeans->KDEs[i] = LALInferenceNewKDEfromMat(kmeans->data, kmeans->mask);
1235 }
1236}
1237
1238
1239/**
1240 * Calculate max likelihood of a kmeans assuming spherical Gaussian clusters.
1241 *
1242 * Determine the maximum likelihood estimate (MLE) of the clustering assuming
1243 * each cluster is drawn from a spherical Gaussian. This is not currently
1244 * used, but is the original MLE used by the xmeans clustering algorithm.
1245 * @param kmeans The kmeans to determine the BIC of.
1246 * @return The maximum likelihood estimate of the \a kmeans clustering.
1247 */
1249 INT4 i, j;
1250
1251 REAL8 n_c;
1252 REAL8 N = (REAL8) kmeans->npts;
1253 REAL8 dim = (REAL8) kmeans->dim;
1254 REAL8 k = (REAL8) kmeans->k;
1255
1256 /* Calculate the MLE of the variance */
1257 gsl_vector_view c;
1258 REAL8 MLE_variance = 0.;
1259 for (j = 0; j < kmeans->k; j++) {
1260 LALInferenceKmeansConstructMask(kmeans, kmeans->mask, j);
1261 c = gsl_matrix_row(kmeans->centroids, j);
1262
1263 for (i = 0; i < kmeans->npts; i++) {
1264 if (kmeans->mask[i]) {
1265 gsl_vector_view x = gsl_matrix_row(kmeans->data, i);
1266 MLE_variance += euclidean_dist_squared(&x.vector, &c.vector);
1267 }
1268 }
1269 }
1270
1271 MLE_variance /= N - k;
1272
1273 /* Use the MLE of the variance to determine the maximum likelihood */
1274 REAL8 MLE = 0.;
1275 for (i = 0; i < kmeans->k; i++) {
1276 n_c = (REAL8) kmeans->sizes[i];
1277 MLE += -n_c/2. * (log(2.*LAL_PI) -
1278 dim * log(MLE_variance) +
1279 2. * (log(n_c) - log(N))) - (n_c - k)/2.;
1280 }
1281
1282 return MLE;
1283}
1284
1285
1286/**
1287 * Calculate the BIC using the KDEs of the cluster distributions.
1288 *
1289 * Use the kernal density estimate of each clusters distribution to calculate
1290 * the Bayes Information Criterian (BIC). The BIC penalizes the maximum
1291 * likelihood of a given model based on the number of parameters of the model.
1292 * This is used to deside the optimal clustering.
1293 * @param kmeans The kmeans to determine the BIC of.
1294 * @return The Bayes Information Criterian of the \a kmeans clustering.
1295 */
1297
1298 /* Return -infinity if kmeans isn't allocated */
1299 if (!kmeans)
1300 return -INFINITY;
1301
1302 INT4 i;
1303 REAL8 log_l;
1304 REAL8 k = (REAL8) kmeans->k;
1305 REAL8 N = (REAL8) kmeans->npts;
1306 REAL8 d = (REAL8) kmeans->dim;
1307
1308 log_l = 0.;
1309 for (i = 0; i < kmeans->npts; i++) {
1310 gsl_vector_view pt = gsl_matrix_row(kmeans->data, i);
1311 log_l += LALInferenceWhitenedKmeansPDF(kmeans, (&pt.vector)->data);
1312 }
1313
1314 /* Determine the total number of parameters in clustered-KDE */
1315 /* Account for centroid locations */
1316 REAL8 nparams = k * d;
1317
1318 /* One weight for each cluster, minus one for constraint that
1319 * all sum to unity */
1320 nparams += k - 1;
1321
1322 /* Separate kernel covariances for each cluster */
1323 nparams += k*(d+1)*d/2.0;
1324
1325 return log_l - nparams/2.0 * log(N);
1326}
1327
1328
1329/**
1330 * Destroy a kmeans instance.
1331 *
1332 * Systematically deallocate all memory associated to \a kmeans.
1333 * @param kmeans The kmeans instance to deallocate.
1334 */
1336 /* Only destroy when there is something to destroy */
1337 if (kmeans) {
1338 /* Free GSL matrices and vectors */
1339 gsl_vector_free(kmeans->mean);
1340 gsl_matrix_free(kmeans->centroids);
1341 if (kmeans->data) gsl_matrix_free(kmeans->data);
1342 if (kmeans->recursive_centroids)
1343 gsl_matrix_free(kmeans->recursive_centroids);
1344
1345 /* Free non-GSL arrays */
1346 XLALFree(kmeans->assignments);
1347 XLALFree(kmeans->mask);
1348 XLALFree(kmeans->weights);
1349 XLALFree(kmeans->sizes);
1350
1351 /* If KDEs are defined, free all of them */
1352 if (kmeans->KDEs != NULL) {
1353 INT4 k;
1354 for (k=0; k<kmeans->k; k++)
1355 if (kmeans->KDEs[k]) LALInferenceDestroyKDE(kmeans->KDEs[k]);
1356 XLALFree(kmeans->KDEs);
1357 }
1358
1359 XLALFree(kmeans);
1360 }
1361}
REAL8 LALInferenceKmeansBIC(LALInferenceKmeans *kmeans)
Calculate the BIC using the KDEs of the cluster distributions.
REAL8 LALInferenceKmeansMaxLogL(LALInferenceKmeans *kmeans)
Calculate max likelihood of a kmeans assuming spherical Gaussian clusters.
void LALInferenceKmeansRecursiveSplit(LALInferenceKmeans *kmeans, INT4 ntrials, gsl_rng *rng)
Recursively split a k=1 kmeans.
REAL8 euclidean_dist_squared(gsl_vector *x, gsl_vector *y)
Calculate the squared Euclidean distance bewteen two points.
void LALInferenceKmeansImposeBounds(LALInferenceKmeans *kmeans, LALInferenceVariables *params, LALInferenceVariables *priorArgs, INT4 cyclic_reflective)
Impose boundaries on individual KDEs.
LALInferenceKmeans * LALInferenceCreateKmeans(INT4 k, gsl_matrix *data, gsl_rng *rng)
Generate a new kmeans struct from a set of data.
void LALInferenceKmeansForgyInitialize(LALInferenceKmeans *kmeans)
Randomly select points from the data as initial centroids for a kmeans run.
LALInferenceKmeans * LALInferenceRecursiveKmeans(gsl_matrix *data, INT4 ntrials, gsl_rng *rng)
Run kmeans, recursively splitting centroids, accepting if the BIC increases.
void accumulate_vectors(gsl_matrix **mat_ptr, gsl_vector *vec)
Add a vector to the end of a matrix, allocating if necessary.
gsl_matrix * mask_data(gsl_matrix *data, INT4 *mask)
Generate a new matrix by masking an existing one.
void LALInferenceKmeansBuildKDE(LALInferenceKmeans *kmeans)
Build the kernel density estimate from a kmeans clustering.
LALInferenceKmeans * LALInferenceXmeans(gsl_matrix *data, INT4 ntrials, gsl_rng *rng)
Cluster data xmeans-style, splitting centroids according to the BIC.
REAL8 LALInferenceKmeansPDF(LALInferenceKmeans *kmeans, REAL8 *pt)
Evaluate the estimated (log) PDF from a clustered-KDE at a point.
LALInferenceKmeans * LALInferenceIncrementalKmeans(gsl_matrix *data, INT4 ntrials, gsl_rng *rng)
Kmeans cluster data, increasing k until the BIC stops increasing.
gsl_matrix * LALInferenceWhitenSamples(gsl_matrix *samples)
Transform a data set to obtain a 0-mean and identity covariance matrix.
LALInferenceKmeans * LALInferenceOptimizedKmeans(gsl_matrix *data, INT4 ntrials, gsl_rng *rng)
Kmeans cluster data, maximizing BIC assuming BIC(k) is concave-down.
void LALInferenceKmeansRun(LALInferenceKmeans *kmeans)
Run the kmeans algorithm until cluster assignments don't change.
REAL8 LALInferenceWhitenedKmeansPDF(LALInferenceKmeans *kmeans, REAL8 *pt)
Estimate (log) PDF from a clustered-KDE at an already whitened point.
void euclidean_centroid(gsl_vector *centroid, gsl_matrix *data, INT4 *mask)
Find the centroid of a masked data set.
void append_vec_to_mat(gsl_matrix **mat_ptr, gsl_vector *vec)
Add a vector to the end of a matrix.
void LALInferenceKmeansAssignment(LALInferenceKmeans *kmeans)
The assignment step of the kmeans algorithm.
void LALInferenceKmeansDestroy(LALInferenceKmeans *kmeans)
Destroy a kmeans instance.
void LALInferenceKmeansUpdate(LALInferenceKmeans *kmeans)
The update step of the kmeans algorithm.
LALInferenceKmeans * LALInferenceKmeansExtractCluster(LALInferenceKmeans *kmeans, INT4 cluster_id)
Extract a single cluster from an existing kmeans as a new 1-means.
void LALInferenceKmeansConstructMask(LALInferenceKmeans *kmeans, INT4 *mask, INT4 cluster_id)
Construct a mask to select only the data assigned to a single cluster.
void LALInferenceKmeansSeededInitialize(LALInferenceKmeans *kmeans)
A 'k-means++'-like seeded initialization of centroids for a kmeans run.
LALInferenceKmeans * LALInferenceKmeansRunBestOf(INT4 k, gsl_matrix *samples, INT4 ntrials, gsl_rng *rng)
Run a kmeans several times and return the best.
REAL8 * LALInferenceKmeansDraw(LALInferenceKmeans *kmeans)
Draw a sample from a kmeans-KDE estimate of a distribution.
void LALInferenceKmeansRandomPartitionInitialize(LALInferenceKmeans *kmeans)
Use the resulting centroids from a random cluster assignment as the initial centroids of a kmeans run...
REAL8 * LALInferenceDrawKDESample(LALInferenceKDE *kde, gsl_rng *rng)
Draw a sample from a kernel density estimate.
void LALInferenceComputeMean(gsl_vector *mean, gsl_matrix *data)
Calculate the mean of a data set.
REAL8 LALInferenceKDEEvaluatePoint(LALInferenceKDE *kde, REAL8 *point)
Evaluate the (log) PDF from a KDE at a single point.
void LALInferenceDestroyKDE(LALInferenceKDE *kde)
Free an allocated KDE structure.
REAL8 log_add_exps(REAL8 *vals, INT4 size)
Determine the log of the sum of an array of exponentials.
void LALInferenceComputeCovariance(gsl_matrix *cov, gsl_matrix *data)
Calculate the covariance matrix of a data set.
LALInferenceKDE * LALInferenceNewKDEfromMat(gsl_matrix *data, INT4 *mask)
Allocate, fill, and tune a Gaussian kernel density estimate from a matrix of points.
static REAL8 mean(REAL8 *array, int N)
static REAL8 norm(const REAL8 x[3])
#define max(a, b)
int j
int k
#define fprintf
const double u
sigmaKerr data[0]
#define LAL_PI
double REAL8
int32_t INT4
int LALInferenceCheckVariableNonFixed(LALInferenceVariables *vars, const char *name)
Checks for name being present in vars and having type LINEAR or CIRCULAR.
Definition: LALInference.c:503
@ LALINFERENCE_PARAM_FIXED
A parameter that is cyclic, such as an angle between 0 and 2pi.
Definition: LALInference.h:130
@ LALINFERENCE_PARAM_CIRCULAR
A parameter that simply has a maximum and a minimum.
Definition: LALInference.h:129
void LALInferenceGetMinMaxPrior(LALInferenceVariables *priorArgs, const char *name, REAL8 *min, REAL8 *max)
Get the minimum and maximum values of the uniform prior from the priorArgs list, given a name.
void * XLALCalloc(size_t m, size_t n)
void XLALFree(void *p)
list y
c
Structure containing the Guassian kernel density of a set of samples.
REAL8 * upper_bounds
Upper param bounds.
LALInferenceParamVaryType * lower_bound_types
Array of param boundary types.
LALInferenceParamVaryType * upper_bound_types
Array of param boundary types.
REAL8 * lower_bounds
Lower param bounds.
Structure for performing the kmeans clustering algorithm on a set of samples.
INT4 npts
Number of points being clustered.
INT4 * sizes
Cluster sizes.
REAL8 error
Error of current clustering.
gsl_vector * std
Std deviations of unwhitened data.
INT4 * assignments
Cluster assignments.
INT4 k
Number of clusters.
gsl_matrix * recursive_centroids
Matrix used to accumulate tested centroids.
gsl_matrix * data
Data to be clustered (typically whitened)
REAL8 * weights
Fraction of data points in each cluster.
REAL8(* dist)(gsl_vector *x, gsl_vector *y)
Distance funtion.
gsl_rng * rng
Random number generator.
gsl_matrix * cov
Covariance matrix of data.
gsl_matrix * centroids
Array with rows containing cluster centroids.
LALInferenceKDE ** KDEs
Array of KDEs, one for each cluster.
INT4 has_changed
Flag indicating a change to cluster assignmens.
INT4 dim
Dimension of data.
gsl_vector * mean
Mean of unwhitened data (for tranforming points)
INT4 * mask
Mask used to select data from individual clusters.
void(* centroid)(gsl_vector *centroid, gsl_matrix *data, INT4 *mask)
Find centroid.
The LALInferenceVariableItem list node structure This should only be accessed using the accessor func...
Definition: LALInference.h:154
char name[VARNAME_MAX]
Definition: LALInference.h:155
struct tagVariableItem * next
Definition: LALInference.h:159
LALInferenceParamVaryType vary
Definition: LALInference.h:158
The LALInferenceVariables structure to contain a set of parameters Implemented as a linked list of LA...
Definition: LALInference.h:170