29#include <gsl/gsl_randist.h>
30#include <gsl/gsl_sort_vector_double.h>
31#include <gsl/gsl_vector.h>
32#include <gsl/gsl_matrix.h>
33#include <gsl/gsl_linalg.h>
34#include <gsl/gsl_blas.h>
36#include <lal/LALStdlib.h>
37#include <lal/LALConstants.h>
38#include <lal/LALDatatypes.h>
40#include <lal/LALInference.h>
41#include <lal/LALInferencePrior.h>
42#include <lal/LALInferenceKDE.h>
43#include <lal/LALInferenceClusteredKDE.h>
69 REAL8 best_bic = -INFINITY;
89 best_clustering = kmeans;
98 return best_clustering;
118 INT4 k, low_k = 1, mid_k = 2, high_k = 4;
119 REAL8 bic, low_bic, mid_bic, high_bic;
127 fprintf(stderr,
"Can't build basic KDE.");
128 fprintf(stderr,
" Perhaps sample size (%i) is too small.\n",
150 if (mid_bic > low_bic) {
162 while (high_bic > mid_bic) {
171 low_kmeans = mid_kmeans;
172 mid_kmeans = high_kmeans;
178 high_k = mid_k + (high_k - mid_k)/2;
179 if (high_k < mid_k + 1)
180 high_bic = -INFINITY;
191 while (high_k - low_k > 2) {
192 if (high_k - mid_k > mid_k - low_k) {
193 k = mid_k + (high_k - mid_k)/2;
205 low_kmeans = mid_kmeans;
212 high_kmeans = kmeans;
216 k = low_k + (mid_k - low_k)/2;
229 high_kmeans = mid_kmeans;
243 if (low_kmeans != mid_kmeans)
245 if (high_kmeans != mid_kmeans)
247 if (kmeans && kmeans != mid_kmeans)
270 REAL8 starting_bic, ending_bic;
271 REAL8 old_bic, new_bic;
286 while (kmeans->
k < kmax) {
294 for (
c = 0;
c < kmeans->
k;
c++) {
310 if (new_bic > old_bic) {
312 for (
k = 0;
k < new_kmeans->
k;
k++) {
313 gsl_vector_view centroid =
321 gsl_vector_view centroid = gsl_matrix_row(kmeans->
centroids,
c);
333 k = centroids->size1;
337 gsl_matrix_memcpy(new_kmeans->
centroids, centroids);
344 ending_bic = -INFINITY;
347 if (ending_bic > starting_bic) {
392 k = centroids->size1;
398 gsl_matrix_memcpy(kmeans->
centroids, centroids);
423 REAL8 current_bic, new_bic;
432 if (new_bic > current_bic) {
433 for (cluster = 0; cluster < new_kmeans->
k; cluster++) {
441 for (
i = 0;
i < n_new_centroids ;
i++) {
442 gsl_vector_view centroid =
453 gsl_vector_view centroid = gsl_matrix_row(kmeans->
centroids, 0);
479 for (
i = 0;
i < kmeans->
k;
i++)
504 REAL8 best_bic = -INFINITY;
514 REAL8 error = -INFINITY;
517 for (
i = 0;
i < ntrials;
i++) {
527 if (error != kmeans->
error) {
528 error = kmeans->
error;
534 if (bic > best_bic) {
538 best_kmeans = kmeans;
575 kmeans->
mean = gsl_vector_alloc(kmeans->
dim);
576 kmeans->
cov = gsl_matrix_alloc(kmeans->
dim, kmeans->
dim);
577 kmeans->
std = gsl_vector_alloc(kmeans->
dim);
578 kmeans->
data = gsl_matrix_alloc(kmeans->
npts, kmeans->
dim);
579 kmeans->
centroids = gsl_matrix_alloc(kmeans->
k, kmeans->
dim);
597 for (
i = 0;
i < kmeans->
dim;
i++)
598 gsl_vector_set(kmeans->
std,
i, sqrt(gsl_matrix_get(kmeans->
cov,
i,
i)));
603 fprintf(stderr,
"Unable to whiten data. No proposal has been built.\n");
621 gsl_vector_view
c,
x;
623 gsl_vector *dists = gsl_vector_alloc(kmeans->
npts);
624 gsl_permutation *
p = gsl_permutation_alloc(kmeans->
npts);
627 i = gsl_rng_uniform_int(kmeans->
rng, kmeans->
npts);
629 for (
j = 0;
j < kmeans->
dim;
j++)
631 gsl_matrix_get(kmeans->
data,
i,
j));
634 while (u < kmeans->
k) {
638 for (
i = 0;
i < kmeans->
npts;
i++) {
639 x = gsl_matrix_row(kmeans->
data,
i);
642 gsl_vector_set(dists,
i,
dist);
646 gsl_vector_scale(dists, 1.0/
norm);
647 gsl_sort_vector_index(
p, dists);
649 randomDraw = gsl_rng_uniform(kmeans->
rng);
652 dist = gsl_vector_get(dists,
p->data[
i]);
653 while (
dist < randomDraw) {
655 dist += gsl_vector_get(dists,
p->data[
i]);
658 for (
j = 0;
j < kmeans->
dim;
j++)
660 gsl_matrix_get(kmeans->
data,
i,
j));
665 gsl_permutation_free(
p);
676 gsl_permutation *
p = gsl_permutation_alloc(kmeans->
npts);
678 gsl_permutation_init(
p);
679 gsl_ran_shuffle(kmeans->
rng,
p->data, kmeans->
npts,
sizeof(
size_t));
681 for (
i = 0;
i < kmeans->
k;
i++) {
682 u = gsl_permutation_get(
p,
i);
683 for (
j = 0;
j < kmeans->
dim;
j++)
685 gsl_matrix_get(kmeans->
data,
u,
j));
688 gsl_permutation_free(
p);
701 for (
i = 0;
i < kmeans->
npts;
i++) {
702 cluster_id = gsl_rng_uniform_int(kmeans->
rng, kmeans->
k);
723 for (
i = 0;
i < kmeans->
npts;
i++) {
724 gsl_vector_view
x = gsl_matrix_row(kmeans->
data,
i);
727 INT4 best_cluster = 0;
728 REAL8 best_dist = INFINITY;
732 for (
j = 0;
j < kmeans->
k;
j++) {
736 if (
dist < best_dist) {
744 if (best_cluster != current_cluster) {
748 kmeans->
error += best_dist;
752 for (
i = 0;
i < kmeans->
k;
i++)
755 for (
i = 0;
i < kmeans->
npts;
i++)
769 for (
i = 0;
i < kmeans->
k;
i ++) {
772 gsl_vector_view
c = gsl_matrix_row(kmeans->
centroids,
i);
794 for (
i = 0;
i < kmeans->
npts;
i++) {
824 gsl_matrix_free(masked_data);
854 INT4 cyclic_reflective) {
856 INT4 n_below, n_above;
857 INT4 ndraws, n_thresh;
861 REAL8 drawn_min = INFINITY, drawn_max = -INFINITY;
869 n_thresh = (
INT4)(threshold * ndraws);
872 for (
c = 0;
c < kmeans->
k;
c++) {
874 if (kmeans->
npts == 0)
877 for (
i = 0;
i < ndraws;
i++) {
882 memcpy(&(draws[
i*dim]), point, dim*
sizeof(
REAL8));
887 for (param =
params->head; param; param = param->
next) {
892 std = gsl_vector_get(kmeans->
std,
p);
893 min = (min -
mean)/std;
898 for (
i = 0;
i < ndraws;
i++) {
899 draw = draws[
i*dim +
p];
905 if (draw < drawn_min)
907 else if (draw > drawn_max)
911 if (cyclic_reflective && n_below > n_thresh) {
924 if (cyclic_reflective && n_above > n_thresh) {
962 for (
i = 0;
i <
N;
i++)
966 gsl_matrix *masked_data = NULL;
968 masked_data = gsl_matrix_alloc(new_N, dim);
972 gsl_vector_view source, target;
975 for (
i = 0;
i <
N;
i++) {
977 source = gsl_matrix_row(
data,
i);
978 target = gsl_matrix_row(masked_data, idx);
979 gsl_vector_memcpy(&target.vector, &source.vector);
997 gsl_matrix *mat = *mat_ptr;
1000 mat = gsl_matrix_alloc(1, vec->size);
1001 gsl_vector_view
row = gsl_matrix_row(mat, 0);
1002 gsl_vector_memcpy(&
row.vector, vec);
1017 gsl_matrix *mat = *mat_ptr;
1018 INT4 rows = mat->size1;
1019 INT4 cols = mat->size2;
1021 gsl_matrix *new_mat = gsl_matrix_alloc(rows+1, cols);
1024 gsl_matrix_view sub_mat_view =
1025 gsl_matrix_submatrix(new_mat, 0, 0, rows, cols);
1026 gsl_matrix_memcpy(&sub_mat_view.matrix, mat);
1029 gsl_vector_view row_view = gsl_matrix_row(new_mat, rows);
1030 gsl_vector_memcpy(&row_view.vector, vec);
1032 gsl_matrix_free(mat);
1048 REAL8 randomDraw = gsl_rng_uniform(kmeans->
rng);
1052 while (cumulativeWeight < randomDraw) {
1054 cumulativeWeight += kmeans->
weights[cluster];
1062 gsl_vector_view pt = gsl_vector_view_array(point, kmeans->
dim);
1063 gsl_vector_mul(&pt.vector, kmeans->
std);
1064 gsl_vector_add(&pt.vector, kmeans->
mean);
1082 gsl_vector *
y = gsl_vector_alloc(kmeans->
dim);
1083 gsl_vector_view pt_view = gsl_vector_view_array(pt, kmeans->
dim);
1086 gsl_vector_memcpy(
y, &pt_view.vector);
1089 gsl_vector_sub(
y, kmeans->
mean);
1090 gsl_vector_div(
y, kmeans->
std);
1114 if (kmeans->
KDEs == NULL)
1118 for (
j = 0;
j < kmeans->
k;
j++)
1119 cluster_pdfs[
j] = log(kmeans->
weights[
j]) +
1141 gsl_matrix *whitened_samples = gsl_matrix_alloc(npts, dim);
1142 gsl_matrix_memcpy(whitened_samples,
samples);
1145 gsl_vector *
mean = gsl_vector_alloc(dim);
1146 gsl_matrix *cov = gsl_matrix_alloc(dim, dim);
1147 gsl_vector *std = gsl_vector_alloc(dim);
1151 for (
i = 0;
i < dim;
i++)
1152 gsl_vector_set(std,
i, sqrt(gsl_matrix_get(cov,
i,
i)));
1156 for (
i=0;
i<npts;
i++) {
1157 y = gsl_matrix_row(whitened_samples,
i);
1158 gsl_vector_sub(&
y.vector,
mean);
1159 gsl_vector_div(&
y.vector, std);
1162 gsl_vector_free(
mean);
1163 gsl_matrix_free(cov);
1164 gsl_vector_free(std);
1166 return whitened_samples;
1180 for (
i = 0;
i <
x->size;
i++) {
1181 REAL8 diff = gsl_vector_get(
x,
i) - gsl_vector_get(
y,
i);
1182 dist += diff * diff;
1203 for (
i = 0;
i < dim;
i++)
1204 gsl_vector_set(centroid,
i, 0.);
1206 for (
i = 0;
i < npts;
i++) {
1208 gsl_vector_view
x = gsl_matrix_row(
data,
i);
1209 gsl_vector_add(centroid, &
x.vector);
1214 gsl_vector_scale(centroid, 1./count);
1229 if (kmeans->
KDEs == NULL)
1232 for (
i = 0;
i < kmeans->
k;
i++) {
1258 REAL8 MLE_variance = 0.;
1259 for (
j = 0;
j < kmeans->
k;
j++) {
1263 for (
i = 0;
i < kmeans->
npts;
i++) {
1264 if (kmeans->
mask[
i]) {
1265 gsl_vector_view
x = gsl_matrix_row(kmeans->
data,
i);
1271 MLE_variance /=
N -
k;
1275 for (
i = 0;
i < kmeans->
k;
i++) {
1277 MLE += -n_c/2. * (log(2.*
LAL_PI) -
1278 dim * log(MLE_variance) +
1279 2. * (log(n_c) - log(
N))) - (n_c -
k)/2.;
1309 for (
i = 0;
i < kmeans->
npts;
i++) {
1310 gsl_vector_view pt = gsl_matrix_row(kmeans->
data,
i);
1323 nparams +=
k*(d+1)*d/2.0;
1325 return log_l - nparams/2.0 * log(
N);
1339 gsl_vector_free(kmeans->
mean);
1341 if (kmeans->
data) gsl_matrix_free(kmeans->
data);
1352 if (kmeans->
KDEs != NULL) {
1354 for (
k=0;
k<kmeans->
k;
k++)
REAL8 LALInferenceKmeansBIC(LALInferenceKmeans *kmeans)
Calculate the BIC using the KDEs of the cluster distributions.
REAL8 LALInferenceKmeansMaxLogL(LALInferenceKmeans *kmeans)
Calculate max likelihood of a kmeans assuming spherical Gaussian clusters.
void LALInferenceKmeansRecursiveSplit(LALInferenceKmeans *kmeans, INT4 ntrials, gsl_rng *rng)
Recursively split a k=1 kmeans.
REAL8 euclidean_dist_squared(gsl_vector *x, gsl_vector *y)
Calculate the squared Euclidean distance bewteen two points.
void LALInferenceKmeansImposeBounds(LALInferenceKmeans *kmeans, LALInferenceVariables *params, LALInferenceVariables *priorArgs, INT4 cyclic_reflective)
Impose boundaries on individual KDEs.
LALInferenceKmeans * LALInferenceCreateKmeans(INT4 k, gsl_matrix *data, gsl_rng *rng)
Generate a new kmeans struct from a set of data.
void LALInferenceKmeansForgyInitialize(LALInferenceKmeans *kmeans)
Randomly select points from the data as initial centroids for a kmeans run.
LALInferenceKmeans * LALInferenceRecursiveKmeans(gsl_matrix *data, INT4 ntrials, gsl_rng *rng)
Run kmeans, recursively splitting centroids, accepting if the BIC increases.
void accumulate_vectors(gsl_matrix **mat_ptr, gsl_vector *vec)
Add a vector to the end of a matrix, allocating if necessary.
gsl_matrix * mask_data(gsl_matrix *data, INT4 *mask)
Generate a new matrix by masking an existing one.
void LALInferenceKmeansBuildKDE(LALInferenceKmeans *kmeans)
Build the kernel density estimate from a kmeans clustering.
LALInferenceKmeans * LALInferenceXmeans(gsl_matrix *data, INT4 ntrials, gsl_rng *rng)
Cluster data xmeans-style, splitting centroids according to the BIC.
REAL8 LALInferenceKmeansPDF(LALInferenceKmeans *kmeans, REAL8 *pt)
Evaluate the estimated (log) PDF from a clustered-KDE at a point.
LALInferenceKmeans * LALInferenceIncrementalKmeans(gsl_matrix *data, INT4 ntrials, gsl_rng *rng)
Kmeans cluster data, increasing k until the BIC stops increasing.
gsl_matrix * LALInferenceWhitenSamples(gsl_matrix *samples)
Transform a data set to obtain a 0-mean and identity covariance matrix.
LALInferenceKmeans * LALInferenceOptimizedKmeans(gsl_matrix *data, INT4 ntrials, gsl_rng *rng)
Kmeans cluster data, maximizing BIC assuming BIC(k) is concave-down.
void LALInferenceKmeansRun(LALInferenceKmeans *kmeans)
Run the kmeans algorithm until cluster assignments don't change.
REAL8 LALInferenceWhitenedKmeansPDF(LALInferenceKmeans *kmeans, REAL8 *pt)
Estimate (log) PDF from a clustered-KDE at an already whitened point.
void euclidean_centroid(gsl_vector *centroid, gsl_matrix *data, INT4 *mask)
Find the centroid of a masked data set.
void append_vec_to_mat(gsl_matrix **mat_ptr, gsl_vector *vec)
Add a vector to the end of a matrix.
void LALInferenceKmeansAssignment(LALInferenceKmeans *kmeans)
The assignment step of the kmeans algorithm.
void LALInferenceKmeansDestroy(LALInferenceKmeans *kmeans)
Destroy a kmeans instance.
void LALInferenceKmeansUpdate(LALInferenceKmeans *kmeans)
The update step of the kmeans algorithm.
LALInferenceKmeans * LALInferenceKmeansExtractCluster(LALInferenceKmeans *kmeans, INT4 cluster_id)
Extract a single cluster from an existing kmeans as a new 1-means.
void LALInferenceKmeansConstructMask(LALInferenceKmeans *kmeans, INT4 *mask, INT4 cluster_id)
Construct a mask to select only the data assigned to a single cluster.
void LALInferenceKmeansSeededInitialize(LALInferenceKmeans *kmeans)
A 'k-means++'-like seeded initialization of centroids for a kmeans run.
LALInferenceKmeans * LALInferenceKmeansRunBestOf(INT4 k, gsl_matrix *samples, INT4 ntrials, gsl_rng *rng)
Run a kmeans several times and return the best.
REAL8 * LALInferenceKmeansDraw(LALInferenceKmeans *kmeans)
Draw a sample from a kmeans-KDE estimate of a distribution.
void LALInferenceKmeansRandomPartitionInitialize(LALInferenceKmeans *kmeans)
Use the resulting centroids from a random cluster assignment as the initial centroids of a kmeans run...
REAL8 * LALInferenceDrawKDESample(LALInferenceKDE *kde, gsl_rng *rng)
Draw a sample from a kernel density estimate.
void LALInferenceComputeMean(gsl_vector *mean, gsl_matrix *data)
Calculate the mean of a data set.
REAL8 LALInferenceKDEEvaluatePoint(LALInferenceKDE *kde, REAL8 *point)
Evaluate the (log) PDF from a KDE at a single point.
void LALInferenceDestroyKDE(LALInferenceKDE *kde)
Free an allocated KDE structure.
REAL8 log_add_exps(REAL8 *vals, INT4 size)
Determine the log of the sum of an array of exponentials.
void LALInferenceComputeCovariance(gsl_matrix *cov, gsl_matrix *data)
Calculate the covariance matrix of a data set.
LALInferenceKDE * LALInferenceNewKDEfromMat(gsl_matrix *data, INT4 *mask)
Allocate, fill, and tune a Gaussian kernel density estimate from a matrix of points.
static REAL8 mean(REAL8 *array, int N)
static REAL8 norm(const REAL8 x[3])
int LALInferenceCheckVariableNonFixed(LALInferenceVariables *vars, const char *name)
Checks for name being present in vars and having type LINEAR or CIRCULAR.
@ LALINFERENCE_PARAM_FIXED
A parameter that is cyclic, such as an angle between 0 and 2pi.
@ LALINFERENCE_PARAM_CIRCULAR
A parameter that simply has a maximum and a minimum.
void LALInferenceGetMinMaxPrior(LALInferenceVariables *priorArgs, const char *name, REAL8 *min, REAL8 *max)
Get the minimum and maximum values of the uniform prior from the priorArgs list, given a name.
void * XLALCalloc(size_t m, size_t n)
Structure containing the Guassian kernel density of a set of samples.
REAL8 * upper_bounds
Upper param bounds.
LALInferenceParamVaryType * lower_bound_types
Array of param boundary types.
LALInferenceParamVaryType * upper_bound_types
Array of param boundary types.
REAL8 * lower_bounds
Lower param bounds.
Structure for performing the kmeans clustering algorithm on a set of samples.
INT4 npts
Number of points being clustered.
INT4 * sizes
Cluster sizes.
REAL8 error
Error of current clustering.
gsl_vector * std
Std deviations of unwhitened data.
INT4 * assignments
Cluster assignments.
INT4 k
Number of clusters.
gsl_matrix * recursive_centroids
Matrix used to accumulate tested centroids.
gsl_matrix * data
Data to be clustered (typically whitened)
REAL8 * weights
Fraction of data points in each cluster.
REAL8(* dist)(gsl_vector *x, gsl_vector *y)
Distance funtion.
gsl_rng * rng
Random number generator.
gsl_matrix * cov
Covariance matrix of data.
gsl_matrix * centroids
Array with rows containing cluster centroids.
LALInferenceKDE ** KDEs
Array of KDEs, one for each cluster.
INT4 has_changed
Flag indicating a change to cluster assignmens.
INT4 dim
Dimension of data.
gsl_vector * mean
Mean of unwhitened data (for tranforming points)
INT4 * mask
Mask used to select data from individual clusters.
void(* centroid)(gsl_vector *centroid, gsl_matrix *data, INT4 *mask)
Find centroid.
The LALInferenceVariableItem list node structure This should only be accessed using the accessor func...
struct tagVariableItem * next
LALInferenceParamVaryType vary
The LALInferenceVariables structure to contain a set of parameters Implemented as a linked list of LA...