Loading [MathJax]/extensions/TeX/AMSsymbols.js
LALInference 4.1.9.1-5e288d3
All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Modules Pages
LALInferenceKDE.c
Go to the documentation of this file.
1/*
2 * LALInferenceKDE.c: Bayesian Followup, kernel density estimator.
3 *
4 * Copyright (C) 2013 Ben Farr
5 *
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with with program; see the file COPYING. If not, write to the
19 * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
20 * MA 02110-1301 USA
21 */
22
23#include <stdio.h>
24#include <stdlib.h>
25#include <math.h>
26
27#include <gsl/gsl_randist.h>
28#include <gsl/gsl_vector.h>
29#include <gsl/gsl_matrix.h>
30#include <gsl/gsl_linalg.h>
31#include <gsl/gsl_blas.h>
32
33#include <lal/LALConstants.h>
34#include <lal/LALDatatypes.h>
35#include <lal/LALStdlib.h>
36
37#include <lal/LALInference.h>
38#include <lal/LALInferenceKDE.h>
39
40#ifndef _OPENMP
41#define omp ignore
42#endif
43
44
45
46/**
47 * Allocate, fill, and tune a Gaussian kernel density estimate from
48 * an array of samples.
49 *
50 * Given a set of points, the distribution from which the points were sampled
51 * from is estimated by a kernel density estimate (KDE), using a Gaussian
52 * kernal. This routine initializes a new KDE and sets the bandwidth according
53 * to Scott's rule. From this it is possible to both evaluate the estimate of
54 * the probability density function at an arbitrary location, and generate more
55 * samples from the estimated distribution. A mask can be provided to select a
56 * subset of the provided data, otherwise all data is used.
57 * @param[in] pts Array containing the points to estimate the distribution of.
58 * @param[in] npts The total number of points contained in \a pts.
59 * @param[in] dim The number of dimensions of the points contained in \a pts.
60 * @param[in] mask An optional \a npts long 0/1 array to mask points in \a pts.
61 * @return A LALInferenceKDE structure containing the KDE of the distribution.
62 */
64 INT4 npts,
65 INT4 dim,
66 INT4 *mask) {
67 INT4 i,j;
68
69 /* Determine the total number of samples */
70 INT4 count = npts;
71 if (mask != NULL) {
72 for (i = 0; i < npts; i++) {
73 if (!mask[i])
74 count--;
75 }
76 }
77
78 /* Initialize KDE */
79 LALInferenceKDE *kde = LALInferenceInitKDE(count, dim);
80
81 /* Fill in samples */
82 INT4 idx = 0;
83 for (i = 0; i < npts; i++) {
84 if (mask == NULL || mask[i]) {
85 for (j = 0; j < dim; j++)
86 gsl_matrix_set(kde->data, idx, j, pts[i*dim + j]);
87 idx++;
88 }
89 }
90
91 /* Set bandwidth */
93
94 return kde;
95}
96
97
98/**
99 * Allocate, fill, and tune a Gaussian kernel density estimate from
100 * a matrix of points.
101 *
102 * Estimate the underlying distribution of samples in a GSL matrix using a
103 * Gaussian KDE.
104 * @param[in] data GSL matrix with rows of samples from a target distribution.
105 * @param[in] mask An optional \a npts long 0/1 array to mask points in \a pts.
106 * @return A LALInferenceKDE structure containing the KDE of the distribution.
107 * \sa LALInferenceNewKDE()
108 */
110 INT4 i,j;
111 INT4 npts = data->size1;
112 INT4 dim = data->size2;
113
114 /* Determine the total number of samples */
115 INT4 count = npts;
116 if (mask != NULL) {
117 for (i = 0; i < npts; i++) {
118 if (!mask[i])
119 count--;
120 }
121 }
122
123 /* Initialize KDE */
124 LALInferenceKDE *kde = LALInferenceInitKDE(count, dim);
125
126 /* Fill in samples */
127 INT4 idx = 0;
128 for (i = 0; i < npts; i++) {
129 if (mask == NULL || mask[i]) {
130 for (j = 0; j < dim; j++)
131 gsl_matrix_set(kde->data, idx, j, gsl_matrix_get(data, i, j));
132 idx++;
133 }
134 }
135
136 /* Calculate covariances and set bandwidth */
138
139 return kde;
140}
141
142
143/**
144 * Construct an empty KDE structure.
145 *
146 * Create an empty LALInferenceKDE structure, allocated to handle the given size
147 * and dimension.
148 * @param[in] npts Number of samples that will be used to estimate
149 * the distribution.
150 * @param[in] dim Number of dimensions that the probability density function
151 * will be estimated in.
152 * @return An allocated, empty LALInferenceKDE structure.
153 * \sa LALInferenceKDE, LALInferenceSetKDEBandwidth()
154 */
156 INT4 p;
157 LALInferenceKDE *kde = XLALCalloc(1, sizeof(LALInferenceKDE));
158 kde->dim = dim;
159 kde->npts = npts;
160 kde->mean = gsl_vector_calloc(dim);
161 kde->cholesky_decomp_cov = gsl_matrix_calloc(dim, dim);
162 kde->cholesky_decomp_cov_lower = gsl_matrix_calloc(dim, dim);
163 kde->cov = gsl_matrix_calloc(dim, dim);
164
167 kde->lower_bounds = XLALCalloc(dim, sizeof(REAL8));
168 kde->upper_bounds = XLALCalloc(dim, sizeof(REAL8));
169
170 for (p = 0; p < dim; p++) {
173 }
174
175 if (npts > 0)
176 kde->data = gsl_matrix_alloc(npts, dim);
177
178 return kde;
179}
180
181
182/**
183 * Free an allocated KDE structure.
184 *
185 * Frees all memory allocated for a given KDE structure.
186 * @param[in] kde The KDE structure to be freed.
187 * \sa LALInferenceKDE, LALInferenceInitKDE
188 */
190 if (kde) {
191 gsl_vector_free(kde->mean);
192 gsl_matrix_free(kde->cholesky_decomp_cov);
193 gsl_matrix_free(kde->cholesky_decomp_cov_lower);
194 gsl_matrix_free(kde->cov);
195
196 if (kde->npts > 0) gsl_matrix_free(kde->data);
197
202
203 XLALFree(kde);
204 }
205}
206
207
208/**
209 * Compute the Cholesky decomposition of a matrix.
210 *
211 * A wrapper for gsl_linalg_cholesky_decomp() that avoids halting if the matrix
212 * is found not to be positive-definite. This often happens when decomposing a
213 * covariance matrix that is poorly estimated due to low sample size.
214 * @param mat The matrix to decompose (in place).
215 * @return Status of call to gsl_linalg_cholesky_decomp().
216 */
218 INT4 status;
219
220 /* Turn off default GSL error handling (i.e. aborting), and catch
221 * errors decomposing due to non-positive definite covariance matrices */
222 gsl_error_handler_t *default_gsl_error_handler =
223 gsl_set_error_handler_off();
224
225 status = gsl_linalg_cholesky_decomp(mat);
226 if (status) {
227 if (status != GSL_EDOM) {
228 fprintf(stderr, "ERROR: Unexpected problem \
229 Cholesky-decomposing matrix.\n");
230 exit(-1);
231 }
232 }
233
234 /* Return to default GSL error handling */
235 gsl_set_error_handler(default_gsl_error_handler);
236 return status;
237}
238
239
240/**
241 * Calculate the bandwidth and normalization factor for a KDE.
242 *
243 * Use Scott's rule to determine the bandwidth, and corresponding normalization
244 * factor, for a KDE.
245 * @param[in] kde The kernel density estimate to estimate the bandwidth of.
246 */
248 /* Use Scott's Bandwidth method */
249 REAL8 det_cov, cov_factor;
250 INT4 i, j;
251 INT4 status;
252
253 /* If data set is empty, set the normalization to infinity */
254 if (kde->npts == 0) {
255 kde->log_norm_factor = INFINITY;
256 return;
257 }
258
259 /* Calculate average and coveriance */
262
263 cov_factor = pow((REAL8)kde->npts, -1./(REAL8)(kde->dim + 4));
264 gsl_matrix_scale(kde->cov, cov_factor*cov_factor);
265
266 gsl_matrix_memcpy(kde->cholesky_decomp_cov, kde->cov);
268
269 /* If cholesky decomposition failed, set the normalization to infinity */
270 if (status) {
271 kde->log_norm_factor = INFINITY;
272 return;
273 }
274
275 /* Zero out upper right triangle of decomposed covariance matrix,
276 * which contains the transpose */
277 gsl_matrix_memcpy(kde->cholesky_decomp_cov_lower, kde->cholesky_decomp_cov);
278 for (i = 0 ; i < kde->dim; i++) {
279 for (j = i+1 ; j < kde->dim; j++)
280 gsl_matrix_set(kde->cholesky_decomp_cov_lower, i, j, 0.);
281 }
282
283 det_cov = LALInferenceMatrixDet(kde->cov);
284 kde->log_norm_factor =
285 log(kde->npts * sqrt(pow(2*LAL_PI, kde->dim) * det_cov));
286
287 return;
288}
289
290
291/**
292 * Evaluate the (log) PDF from a KDE at a single point.
293 *
294 * Calculate the (log) value of the probability density function estimate from
295 * a kernel density estimate at a single point.
296 * @param[in] kde The kernel density estimate to evaluate.
297 * @param[in] point An array containing the point to evaluate the PDF at.
298 * @return The value of the estimated probability density function at \a point.
299 */
301 INT4 dim = kde->dim;
302 INT4 npts = kde->npts;
303 INT4 i, j, p;
304 INT4 n_evals = 1; // Number of evaluations to be done
305 REAL8 min, max, width, val;
306
307 /* If the normalization is infinite, don't bother calculating anything */
308 if (isinf(kde->log_norm_factor))
309 return -INFINITY;
310
311 gsl_vector_view x = gsl_vector_view_array(point, dim);
312
313 /* If the point is outside the bounding box, return */
314 for (p = 0; p < dim; p++) {
315 val = gsl_vector_get(&x.vector, p);
316 min = kde->lower_bounds[p];
317 max = kde->upper_bounds[p];
318
320 val < min) ||
322 val > max))
323 return -INFINITY;
324 }
325
326 /* Repeat point across any imposed cyclic or reflective boundaries */
327 for (p = 0; p < dim; p++) {
330 n_evals++;
331
334 n_evals++;
335 }
336
337 gsl_matrix *points = gsl_matrix_alloc(n_evals, dim);
338 gsl_matrix_set_row(points, 0, &x.vector);
339
340 i = 1;
341 for (p = 0; p < dim; p++) {
342 min = kde->lower_bounds[p];
343 max = kde->upper_bounds[p];
344 width = max - min;
345 val = gsl_vector_get(&x.vector, p);
346
348 gsl_matrix_set_row(points, i, &x.vector);
349 gsl_matrix_set(points, i, p, min - (val - min));
350 i++;
351 }
352
354 gsl_matrix_set_row(points, i, &x.vector);
355 gsl_matrix_set(points, i, p, max + (max - val));
356 i++;
357 }
358
361 gsl_matrix_set_row(points, i, &x.vector);
362 gsl_matrix_set(points, i, p, val + width);
363 i++;
364
365 gsl_matrix_set_row(points, i, &x.vector);
366 gsl_matrix_set(points, i, p, val - width);
367 i++;
368 }
369 }
370
371 /* Loop over list of reflected and cycled points */
372 REAL8* results = XLALMalloc(npts * sizeof(REAL8));
373 REAL8* eval_results = XLALMalloc(n_evals * sizeof(REAL8));
374
375 /* Loop over reflected and cycled set of points */
376 for (i = 0; i < n_evals; i++) {
377 gsl_vector_view pt = gsl_matrix_row(points, i);
378
379 /* Loop over points in KDE dataset, using the Cholesky decomposition
380 * of the covariance to avoid ever inverting the covariance matrix */
381 #pragma omp parallel
382 {
383 /* Vectors that will hold the difference and transformed distance */
384 gsl_vector *diff = gsl_vector_alloc(dim);
385 gsl_vector *tdiff = gsl_vector_alloc(dim);
386
387 #pragma omp for schedule(static)
388 for (j = 0; j < npts; j++) {
389 gsl_vector_view d = gsl_matrix_row(kde->data, j);
390 gsl_vector_memcpy(diff, &d.vector);
391
392 gsl_vector_sub(diff, &pt.vector);
393 gsl_linalg_cholesky_solve(kde->cholesky_decomp_cov, diff, tdiff);
394 gsl_vector_mul(diff, tdiff);
395
396 REAL8 energy = 0.;
397 for (INT4 k=0; k<dim; k++)
398 energy += gsl_vector_get(diff, k);
399 results[j] = -energy/2.;
400 }
401
402 gsl_vector_free(diff);
403 gsl_vector_free(tdiff);
404 }
405
406 /* Normalize the result */
407 eval_results[i] = log_add_exps(results, npts) - kde->log_norm_factor;
408 }
409
410 /* Accumulate probability after accounting for all boundaries */
411 REAL8 result = log_add_exps(eval_results, n_evals);
412
413 gsl_matrix_free(points);
414 XLALFree(results);
415 XLALFree(eval_results);
416
417 return result;
418}
419
420
421/**
422 * Draw a sample from a kernel density estimate.
423 *
424 * Draw a sample from a distribution, as estimated by a kernel density estimator.
425 * @param[in] kde The kernel density estimate to draw \a point from.
426 * @param[in] rng GSL random number generator to use.
427 * @return The sample drawn from the distribution.
428 */
430 INT4 dim = kde->dim;
431 INT4 j, p;
432 REAL8 min, max, width;
433 REAL8 val, offset;
434 INT4 within_bounds = 0;
435
436 gsl_vector *unit_draw = gsl_vector_alloc(dim);
437
438 REAL8 *point = XLALCalloc(dim, sizeof(REAL8));
439 gsl_vector_view pt = gsl_vector_view_array(point, dim);
440
441 /* Draw samples until one is within the bounding box */
442 while (!within_bounds) {
443 within_bounds = 1;
444
445 /* Draw a random sample from KDE dataset */
446 INT4 ind = gsl_rng_uniform_int(rng, kde->npts);
447 gsl_vector_view d = gsl_matrix_row(kde->data, ind);
448 gsl_vector_memcpy(&pt.vector, &d.vector);
449
450 /* Draw individual parameters from 1D unit Gaussians */
451 for (j = 0; j < dim; j++) {
452 val = gsl_ran_ugaussian(rng);
453 gsl_vector_set(unit_draw, j, val);
454 }
455
456 /* Scale and shift the uncorrelated unit-width sample */
457 gsl_blas_dgemv(CblasNoTrans, 1.0,
458 kde->cholesky_decomp_cov_lower, unit_draw,
459 1.0, &pt.vector);
460
461 /* Apply any imposed cyclic or reflective boundaries */
462 for (p = 0; p < dim; p++) {
463 min = kde->lower_bounds[p];
464 max = kde->upper_bounds[p];
465 width = max - min;
466
467 if (point[p] < min) {
468 offset = min - point[p];
470 point[p] += width;
472 point[p] = min + offset;
474 within_bounds = 0;
475
476 } else if (point[p] > max) {
477 offset = point[p] - max;
479 point[p] -= width;
481 point[p] = max - offset;
483 within_bounds = 0;
484 }
485 }
486 }
487
488 gsl_vector_free(unit_draw);
489 return point;
490}
491
492
493/**
494 * Calculate the determinant of a matrix.
495 *
496 * Calculates the determinant of a GSL matrix.
497 * @param[in] mat The matrix to calculate the determinant of.
498 * @return The determinant of \a mat.
499 */
500REAL8 LALInferenceMatrixDet(gsl_matrix *mat) {
501 REAL8 det=0.0;
502 INT4 sign=0;
503 INT4 * signum = &sign;
504 INT4 dim = mat->size1;
505
506 gsl_permutation * p = gsl_permutation_calloc(dim);
507 gsl_matrix * tmp_ptr = gsl_matrix_calloc(dim, dim);
508 gsl_matrix_memcpy(tmp_ptr, mat);
509
510 gsl_linalg_LU_decomp(tmp_ptr, p, signum);
511 det = gsl_linalg_LU_det(tmp_ptr, *signum);
512
513 gsl_permutation_free(p);
514 gsl_matrix_free(tmp_ptr);
515
516 return det;
517}
518
519
520/**
521 * Calculate the mean of a data set.
522 *
523 * Calculate the mean vector of a data set contained in the rows of a
524 * GSL matrix.
525 * @param[out] mean GSL vector containing the mean.
526 * @param[in] data GSL matrix data set to compute the mean of.
527 */
528void LALInferenceComputeMean(gsl_vector *mean, gsl_matrix *data) {
529 INT4 i;
530 INT4 npts = data->size1;
531
532 /* Zero out mean vector */
533 for (i = 0; i < (INT4)mean->size; i++)
534 gsl_vector_set(mean, i, 0.0);
535
536 gsl_vector_view x;
537 for (i = 0; i < npts; i++) {
538 x = gsl_matrix_row(data, i);
539 gsl_vector_add(mean, &x.vector);
540 }
541 gsl_vector_scale(mean, 1./(REAL8)npts);
542}
543
544
545/* Compute the (GSL) covariance matrix of a list of samples */
546/**
547 * Calculate the covariance matrix of a data set.
548 *
549 * Calculate the covariance matrix of a data set contained in the rows of a
550 * GSL matrix.
551 * @param[out] cov GSL matrix containing the covariance matrix of \a data.
552 * @param[in] data GSL matrix data set to compute the covariance matrix of.
553 */
554void LALInferenceComputeCovariance(gsl_matrix *cov, gsl_matrix *data) {
555 INT4 i, j, k;
556 REAL8 var;
557
558 INT4 npts = data->size1;
559 INT4 dim = data->size2;
560
561 gsl_vector *adiff = gsl_vector_alloc(npts);
562 gsl_vector *bdiff = gsl_vector_alloc(npts);
563
564 gsl_vector *mean = gsl_vector_alloc(dim);
566 for (i = 0; i < dim; i++) {
567 gsl_vector_view a = gsl_matrix_column(data, i);
568 gsl_vector_memcpy(adiff, &a.vector);
569 gsl_vector_add_constant(adiff, -gsl_vector_get(mean, i));
570
571 for (j = 0; j < dim; j++) {
572 gsl_vector_view b = gsl_matrix_column(data, j);
573 gsl_vector_memcpy(bdiff, &b.vector);
574 gsl_vector_add_constant(bdiff, -gsl_vector_get(mean, j));
575
576 gsl_vector_mul(bdiff, adiff);
577 var = 0.;
578 for (k = 0; k < npts; k++)
579 var += gsl_vector_get(bdiff, k);
580 var /= (REAL8)(npts-1);
581
582 gsl_matrix_set(cov, i, j, var);
583 }
584 }
585
586 gsl_vector_free(adiff);
587 gsl_vector_free(bdiff);
588 gsl_vector_free(mean);
589}
590
591
592/**
593 * Determine the log of the sum of an array of exponentials.
594 *
595 * Utility for calculating the log of the sum of an array of exponentials.
596 * Useful for avoiding overflows.
597 * @param[in] vals Array of values to be exponentiated, summed, and logged.
598 * @param[in] size Number of elements in \a vals.
599 * @return The log of the sum of elements in \a vals.
600 */
602 INT4 i;
603
604 REAL8 max_comp = 0.;
605 for (i = 0; i < size; i++) {
606 if (max_comp < vals[i])
607 max_comp = vals[i];
608 }
609
610 REAL8 result = 0.;
611 for (i = 0; i < size; i++)
612 result += exp(vals[i]-max_comp);
613 result = max_comp + log(result);
614
615 return result;
616}
INT4 LALInferenceCholeskyDecompose(gsl_matrix *mat)
Compute the Cholesky decomposition of a matrix.
void LALInferenceSetKDEBandwidth(LALInferenceKDE *kde)
Calculate the bandwidth and normalization factor for a KDE.
LALInferenceKDE * LALInferenceNewKDE(REAL8 *pts, INT4 npts, INT4 dim, INT4 *mask)
Allocate, fill, and tune a Gaussian kernel density estimate from an array of samples.
REAL8 * LALInferenceDrawKDESample(LALInferenceKDE *kde, gsl_rng *rng)
Draw a sample from a kernel density estimate.
void LALInferenceComputeMean(gsl_vector *mean, gsl_matrix *data)
Calculate the mean of a data set.
REAL8 LALInferenceKDEEvaluatePoint(LALInferenceKDE *kde, REAL8 *point)
Evaluate the (log) PDF from a KDE at a single point.
void LALInferenceDestroyKDE(LALInferenceKDE *kde)
Free an allocated KDE structure.
REAL8 log_add_exps(REAL8 *vals, INT4 size)
Determine the log of the sum of an array of exponentials.
void LALInferenceComputeCovariance(gsl_matrix *cov, gsl_matrix *data)
Calculate the covariance matrix of a data set.
LALInferenceKDE * LALInferenceInitKDE(INT4 npts, INT4 dim)
Construct an empty KDE structure.
REAL8 LALInferenceMatrixDet(gsl_matrix *mat)
Calculate the determinant of a matrix.
LALInferenceKDE * LALInferenceNewKDEfromMat(gsl_matrix *data, INT4 *mask)
Allocate, fill, and tune a Gaussian kernel density estimate from a matrix of points.
static REAL8 mean(REAL8 *array, int N)
#define max(a, b)
int j
int k
#define fprintf
sigmaKerr data[0]
#define LAL_PI
double REAL8
int32_t INT4
LALInferenceParamVaryType
An enumerated type for denoting the topology of a parameter.
Definition: LALInference.h:127
@ LALINFERENCE_PARAM_OUTPUT
A parameter that never changes, functions should respect this.
Definition: LALInference.h:131
@ LALINFERENCE_PARAM_FIXED
A parameter that is cyclic, such as an angle between 0 and 2pi.
Definition: LALInference.h:130
@ LALINFERENCE_PARAM_CIRCULAR
A parameter that simply has a maximum and a minimum.
Definition: LALInference.h:129
@ LALINFERENCE_PARAM_LINEAR
Definition: LALInference.h:128
void * XLALMalloc(size_t n)
void * XLALCalloc(size_t m, size_t n)
void XLALFree(void *p)
static const INT4 a
width
Structure containing the Guassian kernel density of a set of samples.
REAL8 log_norm_factor
Normalization factor of the KDE.
REAL8 * upper_bounds
Upper param bounds.
LALInferenceParamVaryType * lower_bound_types
Array of param boundary types.
INT4 npts
Number of points in data.
gsl_matrix * cov
The covariance matrix of data.
LALInferenceParamVaryType * upper_bound_types
Array of param boundary types.
gsl_matrix * cholesky_decomp_cov
The Cholesky decomposition of the covariance matrix, containing both terms as returned by gsl_linalg_...
gsl_matrix * data
Data to estimate the underlying distribution of.
INT4 dim
Dimension of points in data.
REAL8 * lower_bounds
Lower param bounds.
gsl_vector * mean
The mean of data.
gsl_matrix * cholesky_decomp_cov_lower
Just the lower portion of cholesky_decomp_cov.