LALInference 4.1.9.1-8a6b96f
cbcBayesMCMC2pos.py
##python
# -*- coding: utf-8 -*-
#
# cbcBayesMCMC2pos.py
#
# Copyright 2016
# Carl-Johan Haster <carl-johan.haster@ligo.org>
#
# Following methodology from cbcBayesThermoInt.py and lalapps_nest2pos.py
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA 02110-1301, USA.

import sys
from functools import reduce

import matplotlib
matplotlib.use('Agg')  # non-interactive backend, so no display is needed

import numpy as np
import astropy.table as apt
try:
    from scipy.integrate import trapezoid
except ImportError:
    # FIXME: Remove this once we require scipy >=1.6.0.
    from scipy.integrate import trapz as trapezoid
from optparse import OptionParser

from lalinference import git_version
from lalinference import bayespputils as bppu
from lalinference.io import read_samples, write_samples, extract_metadata

from lalinference import LALINFERENCE_PARAM_FIXED as FIXED
from lalinference import LALINFERENCE_PARAM_OUTPUT as OUTPUT

__author__ = "Carl-Johan Haster <carl-johan.haster@ligo.org>"
__version__ = "git id %s" % git_version.id
__date__ = git_version.date

def multipleFileCB(opt, opt_str, value, parser):
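    # optparse callback used by the -d/--data and -b/--fixedBurnin options:
    # it greedily consumes the following command-line tokens until the next
    # option flag, so e.g. "--data run1.hdf5 run2.hdf5" (illustrative file
    # names) collects both files into the option's destination list.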
    args = []

    def floatable(str):
        try:
            float(str)
            return True
        except ValueError:
            return False

    for arg in parser.rargs:
        # stop on --foo like options
        if arg[:2] == "--" and len(arg) > 2:
            break
        # stop on -a, but not on -3 or -3.0
        if arg[:1] == "-" and len(arg) > 1 and not floatable(arg):
            break
        args.append(arg)

    del parser.rargs[:len(args)]
    # Append new files to list if some already specified
    if getattr(parser.values, opt.dest):
        oldargs = getattr(parser.values, opt.dest)
        oldargs.extend(args)
        args = oldargs
    setattr(parser.values, opt.dest, args)

mcmc_group_id = '/lalinference/lalinference_mcmc'

def reassign_metadata(new_posterior, original_hdf5):
    # Make sure output file has same metadata as original
    # input hdf5 file

    base_file = read_samples(original_hdf5)

    mcmc_diagnostics_params = ['nLocalTemps','randomSeed']

    meta_dict = {}

    for colname, column in base_file.columns.items():
        meta_dict[colname] = column.meta

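    # Copy the per-column 'vary' metadata from the input file. Parameters that
    # were sampled as cos/sin/log of an output column inherit the metadata of
    # the sampled version; MCMC diagnostics and per-chain columns are marked
    # as OUTPUT, and anything unknown falls back to FIXED.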
    for colname, column in new_posterior.columns.items():
        if colname in mcmc_diagnostics_params:
            column.meta = {'vary': OUTPUT}
            # these parameters are fixed within a run,
            # but don't have to be equal between runs.
        elif colname in meta_dict:
            column.meta = meta_dict[colname]
        elif 'cos'+colname in meta_dict:
            column.meta = meta_dict['cos'+colname]
        elif 'sin'+colname in meta_dict:
            column.meta = meta_dict['sin'+colname]
        elif 'log'+colname in meta_dict:
            column.meta = meta_dict['log'+colname]
        elif colname.startswith('chain_'):
            column.meta = {'vary': OUTPUT}
            # same argument as with mcmc_diagnostics_params
        else:
            column.meta = {'vary': FIXED}

    return new_posterior

def downsample_and_evidence(data_hdf5, deltaLogP=None, fixedBurnin=None, nDownsample=None, verbose=False):

    # Remove burn-in from the beginning of each MCMC chain and downsample it by its autocorrelation length.
    # Compute the evidence for the set of parallel tempered chains through a thermodynamic integral

    if not data_hdf5.lower().endswith(('.hdf5', '.hdf', '.h5')):
        print('cbcBayesMCMC2pos only supports hdf5 input, for older file formats please revert to cbcBayesThermoInt and cbcBayesPostProc')
        sys.exit(1)

    peparser = bppu.PEOutputParser('hdf5')

    ps, samps = peparser.parse(data_hdf5, deltaLogP=deltaLogP, fixedBurnins=fixedBurnin,
                               nDownsample=nDownsample, tablename=None)
    posterior_samples = apt.Table(samps, names=ps)

    highTchains = []
    for i in range(1, int(posterior_samples['nTemps'][0])):
        ps, samps = peparser.parse(data_hdf5, deltaLogP=deltaLogP, fixedBurnins=fixedBurnin,
                                   nDownsample=nDownsample, tablename='chain_'+str('%02.f' %i))
        highTchains.append(apt.Table(samps, names=ps))
        if verbose: print('chain_'+str('%02.f' %i)+' at a temperature '+str(highTchains[i-1]['temperature'].mean()))

    betas = np.zeros(len(highTchains)+1)
    logls = np.zeros_like(betas)

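    # beta = 1/T is the inverse temperature of a chain; characterise each chain
    # by the median of its temperature column and the mean of its log-likelihood.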
    betas[0] = 1./np.median(posterior_samples['temperature'])
    logls[0] = np.mean(posterior_samples['logl'])

    for i in range(len(highTchains)):
        betas[i+1] = 1./np.median(highTchains[i]['temperature'])
        logls[i+1] = np.mean(highTchains[i]['logl'])

    inds = np.argsort(betas)[::-1]

    betas = betas[inds]
    logls = logls[inds]

    # Now extend to infinite temperature by copying the last <log(L)>.
    # This works as long as the chains have extended to high enough
    # temperature to sample the prior.
    # If infinite temperature is already included, this 'duplicate'
    # will not change the final evidence.
    ebetas = np.concatenate((betas, [0.0]))
    elogls = np.concatenate((logls, [logls[-1]]))

    ebetas2 = np.concatenate((betas[::2], [0.0]))
    elogls2 = np.concatenate((logls[::2], [logls[::2][-1]]))

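    # Thermodynamic integration: log(Z) = \int_0^1 <log L>_beta dbeta, evaluated
    # with the trapezoid rule over the inverse temperatures. The betas are sorted
    # in decreasing order, hence the minus sign. Repeating the integral on every
    # other temperature (evidence2) gives a crude estimate of the discretisation
    # error.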
    evidence = -trapezoid(elogls, ebetas)
    evidence2 = -trapezoid(elogls2, ebetas2)

    posterior_samples['chain_log_evidence'] = evidence
    posterior_samples['chain_delta_log_evidence'] = np.absolute(evidence - evidence2)
    posterior_samples['chain_log_noise_evidence'] = posterior_samples['nullLogL']
    posterior_samples['chain_log_bayes_factor'] = posterior_samples['chain_log_evidence'] - posterior_samples['chain_log_noise_evidence']

    if verbose:
        print('logZ = '+str(posterior_samples['chain_log_evidence'][0])+'+-'+str(posterior_samples['chain_delta_log_evidence'][0]))
        print('logB_SN = '+str(posterior_samples['chain_log_bayes_factor'][0]))

    posterior_samples = reassign_metadata(posterior_samples, data_hdf5)

    return posterior_samples


def weight_and_combine(pos_chains, verbose=False, evidence_weighting=True, combine_only=False):

    # Combine several posterior chains into one
    # If evidence_weighting == True, they are
    # weighted by their relative evidence

    # Otherwise they are just combined

    log_evs = np.zeros(len(pos_chains))
    log_noise_evs = np.zeros_like(log_evs)

    for i in range(len(pos_chains)):
        log_evs[i] = pos_chains[i]['chain_log_evidence'][0]
        log_noise_evs[i] = pos_chains[i]['chain_log_noise_evidence'][0]
    if verbose: print('Computed log_evidences: %s'%(str(log_evs)))

    max_log_ev = log_evs.max()

    if evidence_weighting:
        fracs=[np.exp(log_ev-max_log_ev) for log_ev in log_evs]
    else:
        fracs = [1.0 for _ in log_evs]
    if verbose: print('Relative weights of input files: %s'%(str(fracs)))

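    # Turn the chain weights into per-sample acceptance probabilities: Ns is the
    # weight carried by a single sample of each chain, and dividing by the largest
    # value means the chain with the highest weight per sample is kept whole while
    # the others are randomly thinned below.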
    Ns=[fracs[i]/len(pos_chains[i]) for i in range(len(fracs))]
    Ntot=max(Ns)
    fracs=[n/Ntot for n in Ns]
    if combine_only:
        fracs = [1.0 for _ in fracs]
    if verbose: print('Relative weights of input files taking into account their length: %s'%(str(fracs)))

    final_posterior = pos_chains[0][np.random.uniform(size=len(pos_chains[0]))<fracs[0]]

    for i in range(1,len(pos_chains)):
        final_posterior = apt.vstack([final_posterior,
            pos_chains[i][np.random.uniform(size=len(pos_chains[i]))<fracs[i]]])

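    # Combined evidence is the arithmetic mean of the per-chain evidences,
    # log((Z_1 + ... + Z_n)/n), computed in a numerically stable way with logaddexp.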
    final_log_evidence = reduce(np.logaddexp, log_evs) - np.log(len(log_evs))
    final_log_noise_evidence = reduce(np.logaddexp, log_noise_evs) - np.log(len(log_noise_evs))

    run_level = 'lalinference/lalinference_mcmc'
    metadata = {}
    metadata[run_level] = {}
    metadata[run_level]['log_bayes_factor'] = final_log_evidence - final_log_noise_evidence
    metadata[run_level]['log_evidence'] = final_log_evidence
    metadata[run_level]['log_noise_evidence'] = final_log_noise_evidence
    metadata[run_level]['log_max_likelihood'] = final_posterior['logl'].max()
    # This has already been burned in and downsampled,
    # remove the cycle column to stop cbcBayesPostProc
    # from doing it again.
    final_posterior.remove_column('cycle')

    return final_posterior, metadata

USAGE='''%prog [options] PTMCMC_datafile.hdf5 [PTMCMC_datafile2.hdf5 ...]
Compute the evidence for a set of parallel tempered MCMC chains
through thermodynamic integration. If using several input PTMCMC files,
combine them into one set of posterior samples, weighted by their relative
evidences.
'''
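# Example invocation (file names are illustrative):
#   cbcBayesMCMC2pos.py -p posterior.hdf5 ptmcmc_run1.hdf5 ptmcmc_run2.hdf5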

if __name__ == '__main__':
    parser = OptionParser(USAGE)
    parser.add_option(
        '-p', '--pos', action='store', type='string', default=None,
        help='Output file for posterior samples', metavar='posterior.hdf5')
    parser.add_option(
        '-v', '--verbose', action='store_true', default=False,
        help='Print some additional information')
    parser.add_option('-d','--data',dest='data',action='callback',
                      callback=multipleFileCB,help='datafile')
    parser.add_option(
        '-s','--downsample',action='store',default=None,
        help='Approximate number of samples to record in the posterior',type='int')
    parser.add_option(
        '-l','--deltaLogP',action='store',default=None,
        help='Difference in logPosterior to use for convergence test.',type='float')
    parser.add_option(
        '-b','--fixedBurnin',dest='fixedBurnin',action="callback",
        callback=multipleFileCB,help='Fixed number of iterations for burn-in.')
    parser.add_option(
        '--equal-weighting',action='store_true',default=False,
        help = 'Disable evidence weighting and just combine chains so they have equal\
 contribution to the final result')
    parser.add_option(
        '--combine-only', action='store_true',default=False,
        help = "Don't weight the chains at all, just concatenate them (different length\
 inputs will have different representation in the output)")
    opts, args = parser.parse_args()

    datafiles=[]
    if args:
        datafiles = datafiles + args
    if opts.data:
        datafiles = datafiles + opts.data

    if opts.fixedBurnin:
        # If only one value is given for multiple chains, apply it to all chains
        if len(opts.fixedBurnin) == 1:
            fixedBurnins = [int(opts.fixedBurnin[0]) for df in datafiles]
        else:
            fixedBurnins = [int(fixedBurnin) for fixedBurnin in opts.fixedBurnin]
    else:
        fixedBurnins = [None]*len(datafiles)

    chain_posteriors = []

    for i in range(len(datafiles)):
        chain_posteriors.append(downsample_and_evidence(datafiles[i],
            deltaLogP=opts.deltaLogP, fixedBurnin=fixedBurnins[i], nDownsample=opts.downsample, verbose=opts.verbose))

    final_posterior, metadata = weight_and_combine(chain_posteriors, verbose=opts.verbose,
                                                   evidence_weighting = not opts.equal_weighting,
                                                   combine_only = opts.combine_only)

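    # Harvest run-level metadata from each input file's headers into the metadata
    # dict; extract_metadata returns the run identifier used below to locate the
    # posterior_samples table.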
    for path in datafiles:
        run_identifier = extract_metadata(path, metadata)

    # Remove duplicate metadata
    path_to_samples = '/'.join(['','lalinference',run_identifier,'posterior_samples'])
    if path_to_samples in metadata:
        for colname in final_posterior.columns:
            metadata[path_to_samples].pop(colname, None)

    # for metadata which is in a list, take the average.
    for level in metadata:
        for key in metadata[level]:
            #if isinstance(metadata[level][key], list) and all(isinstance(x, (int,float)) for x in metadata[level][key]):
            #    metadata[level][key] = mean(metadata[level][key])
            if isinstance(metadata[level][key], list) and all(isinstance(x, str) for x in metadata[level][key]):
                print("Warning: only printing the first of the %d entries found for metadata %s/%s. You can find the whole list in the headers of individual hdf5 output files\n"%(len(metadata[level][key]),level,key))
                metadata[level][key] = metadata[level][key][0]

    write_samples(final_posterior, opts.pos,
                  path=path_to_samples, metadata=metadata)