Loading [MathJax]/extensions/TeX/AMSsymbols.js
LALInference 4.1.9.1-5e288d3
All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Modules Pages
LALInferenceClusteredKDE.h
Go to the documentation of this file.
1/*
2 * LALInferenceClusteredKDE.h: Bayesian Followup, kernel density estimator.
3 *
4 * Copyright (C) 2013 Ben Farr
5 *
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with with program; see the file COPYING. If not, write to the
19 * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
20 * MA 02110-1301 USA
21 */
22#ifndef LALInferenceClusteredKDE_h
23#define LALInferenceClusteredKDE_h
24
25#include <lal/LALStdlib.h>
26#include <lal/LALConstants.h>
27#include <lal/LALDatatypes.h>
28
29#include <lal/LALInferenceKDE.h>
30
31struct tagkmeans;
32
33/**
34 * Structure for performing the kmeans clustering algorithm on a set of samples.
35 */
36typedef struct
37tagkmeans
38{
39 gsl_matrix *data; /**< Data to be clustered (typically whitened) */
40 INT4 dim; /**< Dimension of data */
41 INT4 npts; /**< Number of points being clustered */
42 INT4 k; /**< Number of clusters */
43 INT4 has_changed; /**< Flag indicating a change to cluster assignmens */
44
45 REAL8 (*dist) (gsl_vector *x, gsl_vector *y); /**< Distance funtion */
46 void (*centroid) (gsl_vector *centroid, gsl_matrix *data, INT4 *mask); /**< Find centroid */
47
48 INT4 *assignments; /**< Cluster assignments */
49 INT4 *sizes; /**< Cluster sizes */
50 INT4 *mask; /**< Mask used to select data from individual clusters */
51 REAL8 *weights; /**< Fraction of data points in each cluster */
52 gsl_vector *mean; /**< Mean of unwhitened data (for tranforming points) */
53 gsl_vector *std; /**< Std deviations of unwhitened data */
54 gsl_matrix *cov; /**< Covariance matrix of data */
55 gsl_matrix *centroids; /**< Array with rows containing cluster centroids */
56 gsl_matrix *recursive_centroids; /**< Matrix used to accumulate tested centroids */
57 gsl_rng *rng; /**< Random number generator */
58
59 REAL8 error; /**< Error of current clustering */
60
61 LALInferenceKDE **KDEs; /**< Array of KDEs, one for each cluster */
63
64/* Kmeans cluster data, increasing k until the Bayes Information Criteria stop increasing. */
65LALInferenceKmeans *LALInferenceIncrementalKmeans(gsl_matrix *data, INT4 ntrials, gsl_rng *rng);
66
67/* Kmeans cluster data, find k that maximizes BIC assuming BIC(k) is concave-down. */
68LALInferenceKmeans *LALInferenceOptimizedKmeans(gsl_matrix *data, INT4 ntrials, gsl_rng *rng);
69
70/* Xmeans cluster data, splitting centroids in kmeans according to the Bayes Information Criteria. */
71LALInferenceKmeans *LALInferenceXmeans(gsl_matrix *data, INT4 ntrials, gsl_rng *rng);
72
73/* Run kmeans recursively, splitting each centroid recursively, accepting if the BIC increases. */
74LALInferenceKmeans *LALInferenceRecursiveKmeans(gsl_matrix *data, INT4 ntrials, gsl_rng *rng);
75
76/* Recursively split a k=1 kmeans. */
77void LALInferenceKmeansRecursiveSplit(LALInferenceKmeans *kmeans, INT4 ntrials, gsl_rng *rng);
78
79/* Run the kmeans algorithm until cluster assignments don't change. */
81
82/* Run a kmeans several times and return the best. */
83LALInferenceKmeans *LALInferenceKmeansRunBestOf(INT4 k, gsl_matrix *samples, INT4 ntrials, gsl_rng *rng);
84
85/* Generate a new kmeans struct from a set of data. */
86LALInferenceKmeans * LALInferenceCreateKmeans(INT4 k, gsl_matrix *data, gsl_rng *rng);
87
88/* A 'k-means++'-like seeded initialization of centroids for a kmeans run. */
90
91/* Randomly select points from the data as initial centroids for a kmeans run. */
93
94/* Use the resulting centroids from a random cluster assignment as the initial centroids of a kmeans run. */
96
97/* The assignment step of the kmeans algorithm. */
99
100/* The update step of the kmeans algorithm. */
102
103/* Construct a mask to select only the data assigned to a single cluster. */
104void LALInferenceKmeansConstructMask(LALInferenceKmeans *kmeans, INT4 *mask, INT4 cluster_id);
105
106/* Extract a single cluster from an existing kmeans as a new 1-means. */
108
109/* Impose boundaries on KDEs. */
111
112/* Generate a new matrix by masking an existing one. */
113gsl_matrix *mask_data(gsl_matrix *data, INT4 *mask);
114
115/* Add a vector to the end of a matrix, allocating if necessary. */
116void accumulate_vectors(gsl_matrix **mat_ptr, gsl_vector *vec);
117
118/* Add a vector to the end of a matrix. */
119void append_vec_to_mat(gsl_matrix **mat_ptr, gsl_vector *vec);
120
121/* Draw a sample from a kmeans-KDE estimate of a distribution. */
123
124/* Evaluate the estimated (log) PDF from a clustered-KDE at a point. */
126
127/* Evaluate the estimated (log) PDF from a clustered-KDE at an already whitened point. */
129
130/* Transform a data set to obtain a 0-mean and identity covariance matrix. */
131gsl_matrix * LALInferenceWhitenSamples(gsl_matrix *samples);
132
133/* Calculate the squared Euclidean distance bewteen two points. */
134REAL8 euclidean_dist_squared(gsl_vector *x, gsl_vector *y);
135
136/* Find the centroid of a masked data set. */
137void euclidean_centroid(gsl_vector *centroid, gsl_matrix *data, INT4 *mask);
138
139/* Build the kernel density estimate from a kmeans clustering. */
141
142/* Calculate the maximum likelihood of a given kmeans assuming spherical Gaussian clusters. */
144
145/* Calculate the BIC using the KDEs of the cluster distributions. */
147
148/* Destroy a kmeans instance. */
150
151#endif
REAL8 LALInferenceKmeansBIC(LALInferenceKmeans *kmeans)
Calculate the BIC using the KDEs of the cluster distributions.
REAL8 LALInferenceKmeansMaxLogL(LALInferenceKmeans *kmeans)
Calculate max likelihood of a kmeans assuming spherical Gaussian clusters.
void LALInferenceKmeansRecursiveSplit(LALInferenceKmeans *kmeans, INT4 ntrials, gsl_rng *rng)
Recursively split a k=1 kmeans.
REAL8 euclidean_dist_squared(gsl_vector *x, gsl_vector *y)
Calculate the squared Euclidean distance bewteen two points.
void LALInferenceKmeansImposeBounds(LALInferenceKmeans *kmeans, LALInferenceVariables *params, LALInferenceVariables *priorArgs, INT4 cyclic_reflective)
Impose boundaries on individual KDEs.
LALInferenceKmeans * LALInferenceCreateKmeans(INT4 k, gsl_matrix *data, gsl_rng *rng)
Generate a new kmeans struct from a set of data.
void LALInferenceKmeansForgyInitialize(LALInferenceKmeans *kmeans)
Randomly select points from the data as initial centroids for a kmeans run.
LALInferenceKmeans * LALInferenceRecursiveKmeans(gsl_matrix *data, INT4 ntrials, gsl_rng *rng)
Run kmeans, recursively splitting centroids, accepting if the BIC increases.
void accumulate_vectors(gsl_matrix **mat_ptr, gsl_vector *vec)
Add a vector to the end of a matrix, allocating if necessary.
gsl_matrix * mask_data(gsl_matrix *data, INT4 *mask)
Generate a new matrix by masking an existing one.
void LALInferenceKmeansBuildKDE(LALInferenceKmeans *kmeans)
Build the kernel density estimate from a kmeans clustering.
LALInferenceKmeans * LALInferenceXmeans(gsl_matrix *data, INT4 ntrials, gsl_rng *rng)
Cluster data xmeans-style, splitting centroids according to the BIC.
REAL8 LALInferenceKmeansPDF(LALInferenceKmeans *kmeans, REAL8 *pt)
Evaluate the estimated (log) PDF from a clustered-KDE at a point.
LALInferenceKmeans * LALInferenceIncrementalKmeans(gsl_matrix *data, INT4 ntrials, gsl_rng *rng)
Kmeans cluster data, increasing k until the BIC stops increasing.
gsl_matrix * LALInferenceWhitenSamples(gsl_matrix *samples)
Transform a data set to obtain a 0-mean and identity covariance matrix.
LALInferenceKmeans * LALInferenceOptimizedKmeans(gsl_matrix *data, INT4 ntrials, gsl_rng *rng)
Kmeans cluster data, maximizing BIC assuming BIC(k) is concave-down.
void LALInferenceKmeansRun(LALInferenceKmeans *kmeans)
Run the kmeans algorithm until cluster assignments don't change.
REAL8 LALInferenceWhitenedKmeansPDF(LALInferenceKmeans *kmeans, REAL8 *pt)
Estimate (log) PDF from a clustered-KDE at an already whitened point.
void euclidean_centroid(gsl_vector *centroid, gsl_matrix *data, INT4 *mask)
Find the centroid of a masked data set.
void append_vec_to_mat(gsl_matrix **mat_ptr, gsl_vector *vec)
Add a vector to the end of a matrix.
void LALInferenceKmeansAssignment(LALInferenceKmeans *kmeans)
The assignment step of the kmeans algorithm.
void LALInferenceKmeansDestroy(LALInferenceKmeans *kmeans)
Destroy a kmeans instance.
void LALInferenceKmeansUpdate(LALInferenceKmeans *kmeans)
The update step of the kmeans algorithm.
LALInferenceKmeans * LALInferenceKmeansExtractCluster(LALInferenceKmeans *kmeans, INT4 cluster_id)
Extract a single cluster from an existing kmeans as a new 1-means.
void LALInferenceKmeansConstructMask(LALInferenceKmeans *kmeans, INT4 *mask, INT4 cluster_id)
Construct a mask to select only the data assigned to a single cluster.
void LALInferenceKmeansSeededInitialize(LALInferenceKmeans *kmeans)
A 'k-means++'-like seeded initialization of centroids for a kmeans run.
LALInferenceKmeans * LALInferenceKmeansRunBestOf(INT4 k, gsl_matrix *samples, INT4 ntrials, gsl_rng *rng)
Run a kmeans several times and return the best.
REAL8 * LALInferenceKmeansDraw(LALInferenceKmeans *kmeans)
Draw a sample from a kmeans-KDE estimate of a distribution.
void LALInferenceKmeansRandomPartitionInitialize(LALInferenceKmeans *kmeans)
Use the resulting centroids from a random cluster assignment as the initial centroids of a kmeans run...
int k
sigmaKerr data[0]
double REAL8
int32_t INT4
list y
Structure containing the Guassian kernel density of a set of samples.
Structure for performing the kmeans clustering algorithm on a set of samples.
INT4 npts
Number of points being clustered.
INT4 * sizes
Cluster sizes.
REAL8 error
Error of current clustering.
gsl_vector * std
Std deviations of unwhitened data.
INT4 * assignments
Cluster assignments.
INT4 k
Number of clusters.
gsl_matrix * recursive_centroids
Matrix used to accumulate tested centroids.
gsl_matrix * data
Data to be clustered (typically whitened)
REAL8 * weights
Fraction of data points in each cluster.
gsl_rng * rng
Random number generator.
gsl_matrix * cov
Covariance matrix of data.
gsl_matrix * centroids
Array with rows containing cluster centroids.
LALInferenceKDE ** KDEs
Array of KDEs, one for each cluster.
INT4 has_changed
Flag indicating a change to cluster assignmens.
INT4 dim
Dimension of data.
gsl_vector * mean
Mean of unwhitened data (for tranforming points)
INT4 * mask
Mask used to select data from individual clusters.
void(* centroid)(gsl_vector *centroid, gsl_matrix *data, INT4 *mask)
Find centroid.
The LALInferenceVariables structure to contain a set of parameters Implemented as a linked list of LA...
Definition: LALInference.h:170