Source code for far

# Copyright (C) 2011--2014  Kipp Cannon, Chad Hanna, Drew Keppel
# Copyright (C) 2013  Jacob Peoples
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.

## @file
# The python module to implement false alarm probability and false alarm rate
#
# ### Review Status
#
# STATUS: reviewed with actions
#
# | Names                                                                                 | Hash                                        | Date       | Diff to Head of Master      |
# | -------------------------------------------                                           | ------------------------------------------- | ---------- | --------------------------- |
# | Hanna, Cannon, Meacher, Creighton J, Robinet, Sathyaprakash, Messick, Dent, Blackburn | 7fb5f008afa337a33a72e182d455fdd74aa7aa7a | 2014-11-05 |<a href="@gstlal_inspiral_cgit_diff/python/far.py?id=HEAD&id2=7fb5f008afa337a33a72e182d455fdd74aa7aa7a">far.py</a> |
# | Hanna, Cannon, Meacher, Creighton J, Sathyaprakash,                                   | 72875f5cb241e8d297cd9b3f9fe309a6cfe3f716 | 2015-11-06 |<a href="@gstlal_inspiral_cgit_diff/python/far.py?id=HEAD&id2=72875f5cb241e8d297cd9b3f9fe309a6cfe3f716">far.py</a> |
#
# #### Action items
#
# - Address the fixed SNR PDF using the median PSD, which could be pre-computed and stored on disk (store a database of SNR PDFs for a variety of horizon distances).
# - The binning parameters are hard-coded too; could this be a problem?
# - The chi-squared binning hasn't been tuned to be a good representation of the PDFs; it could be improved in the future.

## @package far


#
# =============================================================================
#
#                                   Preamble
#
# =============================================================================
#


try:
	from fpconst import NaN, NegInf, PosInf
except ImportError:
	# fpconst is not part of the standard library and might not be
	# available
	NaN = float("nan")
	NegInf = float("-inf")
	PosInf = float("+inf")
import itertools
import math
import multiprocessing
import numpy
import random
from scipy import interpolate
import sys
import time
from tqdm import tqdm


from ligo.lw import ligolw
from ligo.lw import array as ligolw_array
from ligo.lw import param as ligolw_param
from ligo.lw import lsctables
from ligo.lw import utils as ligolw_utils
from lal import rate
from lalburst import snglcoinc
from ligo import segments
from ligo.segments import utils as segmentsUtils


from gstlal import stats as gstlalstats
from gstlal.stats import inspiral_lr


#
# =============================================================================
#
#                              Ranking Statistic
#
# =============================================================================
#


def kwarggeniter(d, min_instruments):
    d = tuple(sorted(d.items()))
    return map(dict, itertools.chain(*(itertools.combinations(d, i) for i in range(min_instruments, len(d) + 1))))


def kwarggen(segments, snrs, chi2s_over_snr2s, phase, dt, template_id, min_instruments):
    # segments and template_id held fixed
    for snrs, chi2s_over_snr2s, phase, dt in zip(
        kwarggeniter(snrs, min_instruments),
        kwarggeniter(chi2s_over_snr2s, min_instruments),
        kwarggeniter(phase, min_instruments),
        kwarggeniter(dt, min_instruments)
    ):
        yield {
            "segments": segments,
            "snrs": snrs,
            "chi2s_over_snr2s": chi2s_over_snr2s,
            "phase": phase,
            "dt": dt,
            "template_id": template_id
        }

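#
# Example (illustrative sketch, not part of the original module):  with
# min_instruments = 2, kwarggeniter() expands a hypothetical triple's SNR
# dictionary into every allowed detector subset:
#
#   >>> snrs = {"H1": 8.0, "L1": 7.0, "V1": 5.5}
#   >>> [sorted(d) for d in kwarggeniter(snrs, 2)]
#   [['H1', 'L1'], ['H1', 'V1'], ['L1', 'V1'], ['H1', 'L1', 'V1']]
#
# kwarggen() zips these subset expansions across the SNR, chi^2/SNR^2, phase
# and dt dictionaries so that RankingStat.__call__() can maximize ln L over
# the allowed subsets of instruments.
#
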
class RankingStat(snglcoinc.LnLikelihoodRatioMixin):
    ligo_lw_name_suffix = u"gstlal_inspiral_rankingstat"

    #
    # Default content handler for loading RankingStat objects from XML
    # documents
    #

    @ligolw_array.use_in
    @ligolw_param.use_in
    @lsctables.use_in
    class LIGOLWContentHandler(ligolw.LIGOLWContentHandler):
        pass

    # network SNR threshold
    network_snrsq_threshold = 49.0

    def __init__(self, template_ids = None, instruments = frozenset(("H1", "L1", "V1")), population_model_file = None, dtdphi_file = None, min_instruments = 1, delta_t = 0.005, horizon_factors = None, idq_file = None):
        self.numerator = inspiral_lr.LnSignalDensity(template_ids = template_ids, instruments = instruments, delta_t = delta_t, population_model_file = population_model_file, dtdphi_file = dtdphi_file, min_instruments = min_instruments, horizon_factors = horizon_factors, idq_file = idq_file)
        self.denominator = inspiral_lr.LnNoiseDensity(template_ids = template_ids, instruments = instruments, delta_t = delta_t, min_instruments = min_instruments)
        self.zerolag = inspiral_lr.LnLRDensity(template_ids = template_ids, instruments = instruments, delta_t = delta_t, min_instruments = min_instruments)

    def fast_path_cut(self, snrs, chi2s_over_snr2s, **kwargs):
        """
        Return True if the candidate described by kwargs should be
        cut, False otherwise.  Used to fast-path out of the full
        likelihood evaluation, and to drop coincs from the
        coincidence engine to reduce the data rate.

        NOTE:  surviving this cut is not an endorsement of the
        candidate;  many candidates that survive this cut will
        subsequently be discarded for other reasons.  This code is
        only intended to achieve a computationally efficient data
        rate reduction that does not negatively impact the search
        sensitivity.
        """
        # network SNR cut
        if sum(snr**2. for snr in snrs.values()) < self.network_snrsq_threshold:
            return True
        # check if the given trigger is outside the SNR-chisq boundaries
        elif any(snr > self.numerator.snr_max for snr in snrs.values()) \
                or any(chi2_over_snr2 < self.numerator.chi2_over_snr2_min for chi2_over_snr2 in chi2s_over_snr2s.values()) \
                or any(chi2_over_snr2 > self.numerator.chi2_over_snr2_max for chi2_over_snr2 in chi2s_over_snr2s.values()):
            return True
        return False

    def __call__(self, **kwargs):
        """
        Evaluate the ranking statistic.
        """
        # the ranking statistic is only defined for SNRs at or
        # above the threshold.  modern gstlal_inspiral generates
        # sub-threshold triggers for Bayestar and we need to be
        # certain they don't leak into here.
        assert all(snr >= self.snr_min for snr in kwargs["snrs"].values())

        # fast-path cut
        if self.fast_path_cut(**kwargs):
            return NegInf

        # FIXME NOTE
        # Here we put in a penalty for single detector triggers.
        # This is a tuned parameter.
        lnP = 0. if len(kwargs["snrs"]) > 1 else -13.

        # full ln L ranking stat.  we define the ranking statistic
        # to be the largest ln L from all allowed subsets of
        # triggers, i.e. we maximize over higher-than-double IFO
        # combinations.
        return lnP + super(RankingStat, self).__call__(**kwargs) if len(kwargs["snrs"]) == 1 else max(super(RankingStat, self).__call__(**kwargs) for kwargs in kwarggen(min_instruments = max(2, self.min_instruments), **kwargs))

    @property
    def template_ids(self):
        return self.denominator.template_ids

    @template_ids.setter
    def template_ids(self, value):
        self.numerator.template_ids = value
        self.denominator.template_ids = value
        self.zerolag.template_ids = value

    @property
    def snr_min(self):
        return self.numerator.snr_min

    @property
    def instruments(self):
        return self.denominator.instruments

    @property
    def min_instruments(self):
        return self.denominator.min_instruments

    @property
    def delta_t(self):
        return self.denominator.delta_t

    @property
    def population_model_file(self):
        return self.numerator.population_model_file

    @property
    def dtdphi_file(self):
        return self.numerator.dtdphi_file

    @property
    def idq_file(self):
        return self.numerator.idq_file

    @property
    def horizon_factors(self):
        return self.numerator.horizon_factors

    @property
    def segmentlists(self):
        return self.denominator.segmentlists

    def __iadd__(self, other):
        if type(other) != type(self):
            raise TypeError(other)
        self.numerator += other.numerator
        self.denominator += other.denominator
        self.zerolag += other.zerolag
        return self

    def copy(self):
        new = type(self)(template_ids = self.template_ids, instruments = self.instruments, population_model_file = self.population_model_file, dtdphi_file = self.dtdphi_file, min_instruments = self.min_instruments, delta_t = self.delta_t)
        new.numerator = self.numerator.copy()
        new.denominator = self.denominator.copy()
        new.zerolag = self.zerolag.copy()
        # NOTE:  only if denominator.lnzerolagdensity is pointing
        # to *our* zero-lag density will the copy's be set,
        # otherwise the copy's will be reset to None
        if self.denominator.lnzerolagdensity is self.zerolag:
            new.denominator.lnzerolagdensity = new.zerolag
        return new

    def kwargs_from_triggers(self, events, offsetvector):
        """
        Construct the keyword arguments to be passed to .__call__()
        from a sequence of single-detector triggers constituting a
        coincident candidate collected with the given offset
        vector.  For internal use by the *_from_triggers() methods.
        """
        #
        # exclude triggers that are below the SNR threshold.  this
        # is easier to do here, when what we have is triggers, than
        # in the .__call__() method where their parameters have
        # already been mixed into the kwargs.
        #

        events = tuple(event for event in events if event.snr >= self.snr_min)
        assert len(events) >= self.min_instruments, "coincidence engine failed to respect minimum instrument count requirement for candidates: found candidate with %d < %d instruments" % (len(events), self.min_instruments)

        #
        # pick a random, but reproducible, trigger to provide a
        # reference timestamp for, e.g., the \Delta t's between
        # instruments and the time spanned by the candidate.
        #
        # the trigger times are conveyed as offsets-from-epoch.
        # the trigger times are taken to be their time-shifted
        # values, and the time-shifted reference trigger is used to
        # define the epoch.  the objective here is to allow the
        # trigger times to be converted to floats without loss of
        # precision, without losing knowledge of the \Delta t's
        # between triggers, and in such a way that singles always
        # have a time-shifted offset-from-epoch of 0.
        #
        # for the time spanned by the event, we need a segment for
        # every instrument whether or not it provided a trigger,
        # reflecting the offset vector that was considered when
        # this candidate was formed (the ranking statistic needs to
        # know when it was we were looking for triggers in the
        # instruments that failed to provide them).  for
        # instruments that do not provide a trigger, we time-shift
        # the reference trigger's interval under the assumption
        # that, because we use exact-match coincidence, the
        # interval is the same for all instruments.
        #

        reference = min(events, key = lambda event: event.ifo)
        ref_end, ref_offset = reference.end, offsetvector[reference.ifo]
        template_id = reference.template_id
        if template_id not in self.template_ids:
            raise ValueError("event IDs %s are from the wrong template" % ", ".join(sorted(str(event.event_id) for event in events)))

        # segment spanned by reference event
        seg = segments.segment(ref_end - reference.template_duration, ref_end)

        # initially populate the segs dictionary by shifting the
        # reference instrument's segment according to the offset
        # vector
        segs = dict((instrument, seg.shift(ref_offset - offsetvector[instrument])) for instrument in self.instruments)

        # for any real triggers we have, use their true intervals
        segs.update((event.ifo, segments.segment(event.end - event.template_duration, event.end)) for event in events)

        # done
        return dict(
            segments = segs,
            snrs = dict((event.ifo, event.snr) for event in events),
            chi2s_over_snr2s = dict((event.ifo, event.chisq / event.snr**2.) for event in events),
            phase = dict((event.ifo, event.coa_phase) for event in events),
            dt = dict((event.ifo, float(event.end - ref_end) + offsetvector[event.ifo] - ref_offset) for event in events),
            template_id = template_id
        )

    def fast_path_cut_from_triggers(self, events, offsetvector):
        """
        Evaluate the ranking statistic's fast-path cut for a
        sequence of single-detector triggers constituting a
        coincident candidate collected with the given offset
        vector.
        """
        return self.fast_path_cut(**self.kwargs_from_triggers(events, offsetvector))

    def ln_lr_from_triggers(self, events, offsetvector):
        """
        Evaluate the ranking statistic for a sequence of
        single-detector triggers constituting a coincident
        candidate collected with the given offset vector.
        """
        try:
            return self(**self.kwargs_from_triggers(events, offsetvector))
        except (ValueError, AssertionError) as e:
            raise type(e)("%s: event IDs %s, offsets %s" % (str(e), ", ".join(sorted(str(event.event_id) for event in events)), str(offsetvector)))

    def finish(self):
        self.numerator.finish()
        self.denominator.finish()
        self.zerolag.finish()
        return self

    def is_healthy(self, verbose = False):
        # do we believe the PDFs are sufficiently well-defined to
        # compute ln L?  not healthy until at least one instrument
        # in the analysis has produced triggers, and until all that
        # have produced triggers have each produced at least 10
        # million.
        # NOTE:  this will go badly if a detector that has never
        # produced triggers, say because it joins an observing run
        # late, suddenly starts producing triggers between snapshot
        # cycles of an online analysis.  we're assuming, here, that
        # detectors join science runs not at random times, but at
        # scheduled times, say, during maintenance breaks, and that
        # the analysis will not be collecting any candidates for
        # approximately one snapshot interval around the addition
        # of the new detector.
        nonzero_counts = [count for count in self.denominator.triggerrates.counts.values() if count]
        health = 0. if not nonzero_counts else min(nonzero_counts) / 10000000.
        with tqdm(desc = "ranking stat. health", total = 1., bar_format = "{l_bar}{bar}", disable = not verbose) as pbar:
            pbar.update(health)
        return health >= 1.

    @classmethod
    def get_xml_root(cls, xml, name):
        """
        Sub-classes can use this in their overrides of the
        .from_xml() method to find the root element of the XML
        serialization.
        """
        name = u"%s:%s" % (name, cls.ligo_lw_name_suffix)
        xml = [elem for elem in xml.getElementsByTagName(ligolw.LIGO_LW.tagName) if elem.hasAttribute(u"Name") and elem.Name == name]
        if len(xml) != 1:
            raise ValueError("XML tree must contain exactly one %s element named %s" % (ligolw.LIGO_LW.tagName, name))
        return xml[0]

    @classmethod
    def from_xml(cls, xml, name, convert = False):
        """
        In the XML document tree rooted at xml, search for the
        serialized RankingStat object named name, and deserialize
        it.  The return value is the deserialized RankingStat
        object.
        """
        xml = cls.get_xml_root(xml, name)
        self = cls.__new__(cls)
        self.numerator = inspiral_lr.LnSignalDensity.from_xml(xml, "numerator", convert = convert)
        self.denominator = inspiral_lr.LnNoiseDensity.from_xml(xml, "denominator", convert = convert)
        self.zerolag = inspiral_lr.LnLRDensity.from_xml(xml, "zerolag", convert = convert)
        return self

    def to_xml(self, name):
        """
        Serialize this RankingStat object to an XML fragment and
        return the root element of the resulting XML tree.
        """
        xml = ligolw.LIGO_LW({u"Name": u"%s:%s" % (name, self.ligo_lw_name_suffix)})
        xml.appendChild(self.numerator.to_xml("numerator"))
        xml.appendChild(self.denominator.to_xml("denominator"))
        xml.appendChild(self.zerolag.to_xml("zerolag"))
        return xml

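#
# Illustrative usage sketch (not part of the original module; the file name
# below is hypothetical, and the events and offsetvector objects are supplied
# by the coincidence engine in the real pipelines):
#
#   >>> from ligo.lw import utils as ligolw_utils
#   >>> xmldoc = ligolw_utils.load_filename("marg_likelihood.xml.gz", contenthandler = RankingStat.LIGOLWContentHandler)
#   >>> rankingstat = RankingStat.from_xml(xmldoc, "gstlal_inspiral_likelihood")
#   >>> rankingstat.finish()
#   >>> ln_lr = rankingstat.ln_lr_from_triggers(events, offsetvector)
#
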
class DatalessRankingStat(RankingStat):
    # NOTE:  .__iadd__(), .copy() and I/O are forbidden, but these
    # operations will be blocked by the .numerator and .denominator
    # instances;  no need to add extra code here to prevent these
    # operations

    def __init__(self, *args, **kwargs):
        self.numerator = inspiral_lr.DatalessLnSignalDensity(*args, **kwargs)
        kwargs.pop("population_model_file", None)
        kwargs.pop("dtdphi_file", None)
        kwargs.pop("idq_file", None)
        self.denominator = inspiral_lr.DatalessLnNoiseDensity(*args, **kwargs)

    def finish(self):
        # no zero-lag
        self.numerator.finish()
        self.denominator.finish()
        return self

    def is_healthy(self, verbose = False):
        with tqdm(desc = "ranking stat. health", total = 1., bar_format = "{l_bar}{bar}", disable = not verbose) as pbar:
            pbar.update(1.)
        return True

class OnlineFrankensteinRankingStat(RankingStat):
    """
    Version of RankingStat with horizon distance history and trigger
    rate history spliced in from another instance.  Used to solve a
    chicken-or-egg problem and assign ranking statistic values in an
    online analysis.  NOTE:  the donor data is not copied;  instances
    of this class hold references to the donor's data, so as it is
    modified those modifications are immediately reflected here.
    """
    # NOTE:  .__iadd__() is forbidden, but this operation will be
    # blocked by the .numerator and .denominator instances;  no need
    # to add extra code here to prevent it.

    def __init__(self, src, donor):
        self.numerator = inspiral_lr.OnlineFrankensteinLnSignalDensity.splice(src.numerator, donor.numerator)
        self.denominator = inspiral_lr.OnlineFrankensteinLnNoiseDensity.splice(src.denominator, donor.denominator)
        self.zerolag = inspiral_lr.OnlineFrankensteinLnLRDensity.splice(src.zerolag)
        self.src = src
        self.donor = donor

    def copy(self):
        new = type(self)(src = self.src, donor = self.donor)
        new.numerator = self.numerator.copy()
        new.denominator = self.denominator.copy()
        new.zerolag = self.zerolag.copy()
        return new

    def finish(self):
        # no zero-lag
        self.numerator.finish()
        self.denominator.finish()
        self.zerolag.finish()
        return self

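#
# Illustrative construction sketch (not part of the original module):  splice
# the horizon distance and trigger rate histories from a donor RankingStat,
# e.g. one being updated live by an online analysis, into a static one.  both
# src_rankingstat and donor_rankingstat below are hypothetical instances:
#
#   >>> frankenstat = OnlineFrankensteinRankingStat(src_rankingstat, donor_rankingstat)
#   >>> frankenstat.finish()
#
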
#
# =============================================================================
#
#                       False Alarm Book-Keeping Object
#
# =============================================================================
#

def binned_log_likelihood_ratio_rates_from_samples(signal_lr_lnpdf, noise_lr_lnpdf, samples, nsamples):
    """
    Populate signal and noise BinnedLnPDF densities from a sequence of
    samples (which can be a generator).  The first nsamples elements
    from the sequence are used.  The samples must be a sequence of
    three-element tuples (or sequences) in which the first element is a
    value of the ranking statistic (likelihood ratio) and the second
    and third elements are the logs of the probabilities of obtaining
    that value of the ranking statistic in the signal and noise
    populations respectively.
    """
    exp = math.exp
    isnan = math.isnan
    signal_lr_lnpdf_count = signal_lr_lnpdf.count
    noise_lr_lnpdf_count = noise_lr_lnpdf.count
    for ln_lamb, lnP_signal, lnP_noise in itertools.islice(samples, nsamples):
        if isnan(ln_lamb):
            raise ValueError("encountered NaN likelihood ratio")
        if isnan(lnP_signal) or isnan(lnP_noise):
            raise ValueError("encountered NaN signal or noise model probability densities")
        signal_lr_lnpdf_count[ln_lamb,] += exp(lnP_signal)
        noise_lr_lnpdf_count[ln_lamb,] += exp(lnP_noise)

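#
# Illustrative sketch of the sample format (not part of the original module):
# each sample is a (ln L, ln P(ln L | signal), ln P(ln L | noise)) triple.
# the toy generator below is purely hypothetical and only demonstrates the
# calling convention:
#
#   >>> import math, random
#   >>> def toy_samples():
#   ...     while True:
#   ...         yield random.gauss(0., 5.), math.log(0.5), math.log(0.5)
#   >>> signal_lr_lnpdf = rate.BinnedLnPDF(rate.NDBins((rate.ATanBins(0., 110., 6000),)))
#   >>> noise_lr_lnpdf = rate.BinnedLnPDF(rate.NDBins((rate.ATanBins(0., 110., 6000),)))
#   >>> binned_log_likelihood_ratio_rates_from_samples(signal_lr_lnpdf, noise_lr_lnpdf, toy_samples(), nsamples = 1000)
#
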
#
# Class to compute ranking statistic PDFs for background-like and
# signal-like populations
#


class RankingStatPDF(object):
    ligo_lw_name_suffix = u"gstlal_inspiral_rankingstatpdf"

    @staticmethod
    def density_estimate(lnpdf, name, kernel = rate.gaussian_window(4.)):
        """
        For internal use only.
        """
        assert not numpy.isnan(lnpdf.array).any(), "%s log likelihood ratio PDF contain NaNs" % name
        rate.filter_array(lnpdf.array, kernel)

    @staticmethod
    def binned_log_likelihood_ratio_rates_from_samples_wrapper(queue, signal_lr_lnpdf, noise_lr_lnpdf, samples, nsamples):
        """
        For internal use only.
        """
        try:
            # we want the forked processes to use different random
            # number sequences, so we re-seed Python's and numpy's
            # random number generators here in the wrapper in the
            # hope that that takes care of it
            random.seed()
            numpy.random.seed()
            binned_log_likelihood_ratio_rates_from_samples(signal_lr_lnpdf, noise_lr_lnpdf, samples, nsamples)
            queue.put((signal_lr_lnpdf.array, noise_lr_lnpdf.array))
        except:
            queue.put((None, None))
            raise

    def __init__(self, rankingstat, signal_noise_pdfs = None, nsamples = 2**24, nthreads = 8, verbose = False):
        #
        # bailout used by the .from_xml() class method to get an
        # uninitialized instance
        #

        if rankingstat is None:
            return

        #
        # initialize binnings
        #

        self.noise_lr_lnpdf = rate.BinnedLnPDF(rate.NDBins((rate.ATanBins(0., 110., 6000),)))
        self.signal_lr_lnpdf = rate.BinnedLnPDF(rate.NDBins((rate.ATanBins(0., 110., 6000),)))
        self.zero_lag_lr_lnpdf = rate.BinnedLnPDF(rate.NDBins((rate.ATanBins(0., 110., 6000),)))
        self.segments = segmentsUtils.vote(rankingstat.segmentlists.values(), rankingstat.min_instruments)
        if rankingstat.template_ids is None:
            raise ValueError("cannot be initialized from a RankingStat that is not for a specific set of templates")
        self.template_ids = rankingstat.template_ids

        #
        # bailout used by codes that want all-zeros histograms
        #

        if not nsamples:
            return

        #
        # run importance-weighted random sampling to populate
        # binnings.
        #

        if signal_noise_pdfs is None:
            signal_noise_pdfs = rankingstat

        nthreads = int(nthreads)
        assert nthreads >= 1
        threads = []
        for i in range(nthreads):
            assert nsamples // nthreads >= 1
            q = multiprocessing.SimpleQueue()
            p = multiprocessing.Process(target = lambda: self.binned_log_likelihood_ratio_rates_from_samples_wrapper(
                q,
                self.signal_lr_lnpdf,
                self.noise_lr_lnpdf,
                rankingstat.ln_lr_samples(rankingstat.denominator.random_params(), signal_noise_pdfs),
                nsamples = nsamples // nthreads
            ))
            p.start()
            threads.append((p, q))
            nsamples -= nsamples // nthreads
            nthreads -= 1
            # sleep a bit to help the random number seeds change
            time.sleep(1.5)
        while threads:
            p, q = threads.pop(0)
            signal_counts, noise_counts = q.get()
            self.signal_lr_lnpdf.array += signal_counts
            self.noise_lr_lnpdf.array += noise_counts
            p.join()
            if p.exitcode:
                raise Exception("sampling thread failed")
        if verbose:
            print("done computing ranking statistic PDFs", file=sys.stderr)

        #
        # apply density estimation kernels to counts
        #

        self.density_estimate(self.noise_lr_lnpdf, "noise model")
        self.density_estimate(self.signal_lr_lnpdf, "signal model")

        #
        # set the total sample count in the noise and signal
        # ranking statistic histograms equal to the total expected
        # count of the respective events from the experiment.  this
        # information is required so that when adding ranking
        # statistic PDFs in our .__iadd__() method they are
        # combined with the correct relative weights, so that
        # .__iadd__() has the effect of marginalizing the
        # distribution over the experiments being combined.
        #

        self.noise_lr_lnpdf.array *= sum(rankingstat.denominator.candidate_count_model().values()) / self.noise_lr_lnpdf.array.sum()
        self.noise_lr_lnpdf.normalize()
        self.signal_lr_lnpdf.array *= rankingstat.numerator.candidate_count_model() / self.signal_lr_lnpdf.array.sum()
        self.signal_lr_lnpdf.normalize()

    def copy(self):
        new = self.__class__(None)
        new.noise_lr_lnpdf = self.noise_lr_lnpdf.copy()
        new.signal_lr_lnpdf = self.signal_lr_lnpdf.copy()
        new.zero_lag_lr_lnpdf = self.zero_lag_lr_lnpdf.copy()
        new.segments = type(self.segments)(self.segments)
        new.template_ids = self.template_ids
        return new

    def collect_zero_lag_rates(self, connection, coinc_def_id):
        # FIXME:  Gamma0 contains the template_id;  switch to a
        # proper column when one is available
        for ln_likelihood_ratio, template_id in connection.cursor().execute("""
SELECT
    likelihood,
    (
        SELECT
            Gamma0
        FROM
            sngl_inspiral
            JOIN coinc_event_map ON (
                coinc_event_map.table_name == "sngl_inspiral"
                AND coinc_event_map.event_id == sngl_inspiral.event_id
            )
        WHERE
            coinc_event_map.coinc_event_id == coinc_event.coinc_event_id
        LIMIT 1
    )
FROM
    coinc_event
WHERE
    coinc_def_id == ?
    AND NOT EXISTS (
        SELECT
            *
        FROM
            time_slide
        WHERE
            time_slide.time_slide_id == coinc_event.time_slide_id
            AND time_slide.offset != 0
    )
""", (coinc_def_id,)):
            assert ln_likelihood_ratio is not None, "null likelihood ratio encountered.  probably coincs have not been ranked"
            if template_id not in self.template_ids:
                raise ValueError("zero-lag candidate encountered with template ID not for this RankingStatPDF")
            self.zero_lag_lr_lnpdf.count[ln_likelihood_ratio,] += 1.
        self.zero_lag_lr_lnpdf.normalize()

    def density_estimate_zero_lag_rates(self):
        # apply density estimation preserving the total count, then
        # normalize the PDF
        count_before = self.zero_lag_lr_lnpdf.array.sum()
        # FIXME:  should .normalize() be able to handle NaN?
        if count_before:
            self.density_estimate(self.zero_lag_lr_lnpdf, "zero lag")
            self.zero_lag_lr_lnpdf.array *= count_before / self.zero_lag_lr_lnpdf.array.sum()
        self.zero_lag_lr_lnpdf.normalize()

    def __iadd__(self, other):
        self.noise_lr_lnpdf += other.noise_lr_lnpdf
        self.noise_lr_lnpdf.normalize()
        self.signal_lr_lnpdf += other.signal_lr_lnpdf
        self.signal_lr_lnpdf.normalize()
        self.zero_lag_lr_lnpdf += other.zero_lag_lr_lnpdf
        self.zero_lag_lr_lnpdf.normalize()
        # FIXME:  now that we know what templates this is for, we
        # could conceivably impose a policy that if the segments
        # overlap then the templates must be different, and if the
        # templates are the same then the segments must be disjoint
        self.segments += other.segments
        self.template_ids |= other.template_ids
        return self

    def new_with_extinction(self):
        self = self.copy()

        # the probability that an event survives clustering is the
        # probability that no event with a higher value of the
        # ranking statistic is within +/- dt of the event in
        # question.  .noise_lr_lnpdf.count contains an accurate
        # model of the counts of pre-clustered events in each
        # ranking statistic bin, but we know the events are not
        # independent and so the total number of them cannot be
        # used to form a Poisson rate for the purpose of computing
        # the probability we desire.  it has been found,
        # empirically, that if the CCDF of pre-clustered background
        # event counts is raised to some power and the clustering
        # survival probability computed from that as though it were
        # the CCDF of a Poisson process with some effective mean
        # event rate, the result is a very good model for the
        # post-clustering distribution of ranking statistics.  we
        # have two unknown parameters:  the power to which to raise
        # the pre-clustered ranking statistic's CCDF, and the mean
        # event rate to assume.  we solve for these parameters by
        # fitting to the observed zero-lag ranking statistic
        # histogram

        x = self.noise_lr_lnpdf.bins[0].centres()
        assert (x == self.zero_lag_lr_lnpdf.bins[0].centres()).all()

        # some candidates are rejected by the ranking statistic,
        # causing there to be a spike in the zero-lag density at ln
        # L = -inf.  if enough candidates get rejected this spike
        # becomes the mode of the PDF, which messes up the mask
        # constructed below for the fit.  we zero the first 40 bins
        # here to prevent that from happening (assume a density
        # estimation kernel 4 bins wide, with 10 sigma impulse
        # length)
        zl_counts = self.zero_lag_lr_lnpdf.array.copy()
        zl_counts[:40] = 0.
        if not zl_counts.any():
            raise ValueError("zero-lag counts are all zero")

        # get the noise counts
        noise_counts = self.noise_lr_lnpdf.array.copy()

        # get the zero-lag counts.  we model the tail of the
        # distribution - the top 0.1 - 1% - where clustering only
        # affects the result at a < 1% level.
        if zl_counts.sum() < 100 * 100:
            tail_zl_counts = zl_counts.sum() * 0.99
        else:
            tail_zl_counts = zl_counts.sum() - 100
        onepercent = zl_counts.cumsum().searchsorted(tail_zl_counts)

        # normalize the counts
        noise_counts /= noise_counts.sum()
        zl_counts /= zl_counts.sum()

        # compute the survival probability
        norm = zl_counts[onepercent] / noise_counts[onepercent]
        zl_counts[onepercent:] = 0
        noise_counts[onepercent:] = 0
        survival_probability = zl_counts / noise_counts
        survival_probability[onepercent:] = norm
        survival_probability[numpy.isnan(survival_probability)] = 0.0

        # apply to the background counts and signal counts
        self.noise_lr_lnpdf.array *= survival_probability
        self.noise_lr_lnpdf.normalize()
        self.signal_lr_lnpdf.array *= survival_probability
        self.signal_lr_lnpdf.normalize()

        #
        # never allow PDFs that have had the extinction model
        # applied to be written to disk:  on-disk files must only
        # ever provide the original data.  forbid PDFs that have
        # been extincted from being re-extincted.
        #

        def new_with_extinction(*args, **kwargs):
            raise NotImplementedError("re-extincting an extincted RankingStatPDF object is forbidden")
        self.new_with_extinction = new_with_extinction
        def to_xml(*args, **kwargs):
            raise NotImplementedError("writing extincted RankingStatPDF object to disk is forbidden")
        self.to_xml = to_xml

        return self

    def is_healthy(self, verbose = False):
        # do we believe the PDFs are sufficiently well-defined to
        # compute FAPs and FARs?
        health = min(self.noise_lr_lnpdf.array.sum() / 1000000., self.zero_lag_lr_lnpdf.array.sum() / 1000.)
        with tqdm(desc = "ranking stat. health", total = 1., bar_format = "{l_bar}{bar}", disable = not verbose) as pbar:
            pbar.update(health)
        return health >= 1.

    @classmethod
    def get_xml_root(cls, xml, name):
        """
        Sub-classes can use this in their overrides of the
        .from_xml() method to find the root element of the XML
        serialization.
        """
        name = u"%s:%s" % (name, cls.ligo_lw_name_suffix)
        xml = [elem for elem in xml.getElementsByTagName(ligolw.LIGO_LW.tagName) if elem.hasAttribute(u"Name") and elem.Name == name]
        if len(xml) != 1:
            raise ValueError("XML tree must contain exactly one %s element named %s" % (ligolw.LIGO_LW.tagName, name))
        return xml[0]

    @classmethod
    def from_xml(cls, xml, name):
        # find the root of the XML tree containing the
        # serialization of this object
        xml = cls.get_xml_root(xml, name)
        # create a mostly uninitialized instance
        self = cls(None)
        # populate from XML
        self.noise_lr_lnpdf = rate.BinnedLnPDF.from_xml(xml, u"noise_lr_lnpdf")
        self.signal_lr_lnpdf = rate.BinnedLnPDF.from_xml(xml, u"signal_lr_lnpdf")
        self.zero_lag_lr_lnpdf = rate.BinnedLnPDF.from_xml(xml, u"zero_lag_lr_lnpdf")
        self.segments = ligolw_param.get_pyvalue(xml, u"segments").strip()
        self.segments = segmentsUtils.from_range_strings(self.segments.split(",") if self.segments else [], float)
        self.template_ids = frozenset(map(int, ligolw_param.get_pyvalue(xml, u"template_ids").split(",")))
        return self

    def to_xml(self, name):
        # do not allow ourselves to be written to disk without our
        # PDFs' internal normalization metadata being up to date
        self.noise_lr_lnpdf.normalize()
        self.signal_lr_lnpdf.normalize()
        self.zero_lag_lr_lnpdf.normalize()

        xml = ligolw.LIGO_LW({u"Name": u"%s:%s" % (name, self.ligo_lw_name_suffix)})
        xml.appendChild(self.noise_lr_lnpdf.to_xml(u"noise_lr_lnpdf"))
        xml.appendChild(self.signal_lr_lnpdf.to_xml(u"signal_lr_lnpdf"))
        xml.appendChild(self.zero_lag_lr_lnpdf.to_xml(u"zero_lag_lr_lnpdf"))
        xml.appendChild(ligolw_param.Param.from_pyvalue(u"segments", ",".join(segmentsUtils.to_range_strings(self.segments))))
        xml.appendChild(ligolw_param.Param.from_pyvalue(u"template_ids", ",".join("%d" % template_id for template_id in sorted(self.template_ids))))
        return xml

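#
# Illustrative workflow sketch (not part of the original module):  build a
# RankingStatPDF by importance-weighted sampling from a fully populated
# RankingStat, collect the zero-lag candidate histogram from a database, and
# apply the extinction model before computing FAPs and FARs.  rankingstat,
# connection and coinc_def_id below are hypothetical:
#
#   >>> rankingstatpdf = RankingStatPDF(rankingstat, nsamples = 2**20, nthreads = 4)
#   >>> rankingstatpdf.collect_zero_lag_rates(connection, coinc_def_id)
#   >>> rankingstatpdf.density_estimate_zero_lag_rates()
#   >>> fapfar = FAPFAR(rankingstatpdf.new_with_extinction())
#
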
#
# Class to compute false-alarm probabilities and false-alarm rates from
# ranking statistic PDFs
#

class FAPFAR(object):
    def __init__(self, rankingstatpdf):
        # input checks
        if not rankingstatpdf.zero_lag_lr_lnpdf.array.any():
            raise ValueError("RankingStatPDF's zero-lag counts are all zero")

        # save the livetime
        self.livetime = float(abs(rankingstatpdf.segments))

        # set the rate normalization LR threshold to be the mode of
        # the zero-lag LR distribution.  NOTE:  this is a hack to
        # work around the extinction model's inability to model the
        # rate of extremely low significance events.  if the
        # extinction model can ever be made to model the observed
        # candidate rate at all ranking statistic values then this
        # threshold nonsense can be removed.  NOTE:  some
        # candidates are rejected by the ranking statistic, causing
        # there to be a spike in the zero-lag density at ln L =
        # -inf.  if enough candidates get rejected this spike
        # becomes the mode of the PDF.  .new_with_extinction()
        # above has a hack to work around this in its noise model
        # fitting code, which we need to reproduce here when
        # picking the threshold above which we believe the noise
        # model to be correct
        zl = rankingstatpdf.zero_lag_lr_lnpdf.copy()
        zl.array[:40] = 0.
        rate_normalization_lr_threshold, = zl.argmax()

        # record the trials factor, with safety checks
        counts = rankingstatpdf.zero_lag_lr_lnpdf.count
        assert not numpy.isnan(counts.array).any(), "zero lag log likelihood ratio counts contain NaNs"
        assert (counts.array >= 0.).all(), "zero lag log likelihood ratio rates contain negative values"
        self.count_above_threshold = counts[rate_normalization_lr_threshold:,].sum()

        # get the noise model ranking stat values and event counts
        # from the bins
        threshold_index = rankingstatpdf.noise_lr_lnpdf.bins[0][rate_normalization_lr_threshold]
        ranks = rankingstatpdf.noise_lr_lnpdf.bins[0].lower()[threshold_index:]
        counts = rankingstatpdf.noise_lr_lnpdf.array[threshold_index:]
        assert not numpy.isnan(counts).any(), "background log likelihood ratio rates contain NaNs"
        assert (counts >= 0.).all(), "background log likelihood ratio rates contain negative values"

        # complementary cumulative distribution function
        ccdf = counts[::-1].cumsum()[::-1]
        ccdf /= ccdf[0]

        # ccdf is P(ranking stat > threshold | a candidate).  we
        # need P(ranking stat > threshold), i.e. we need to correct
        # for the possibility that no candidate is present.
        # specifically, the ccdf needs to = 1 - 1/e at the
        # candidate identification threshold, and the cdf = 1/e at
        # the candidate threshold, in order for FAR(threshold) *
        # livetime to equal the actual observed number of
        # candidates.
        ccdf = gstlalstats.poisson_p_not_0(ccdf)

        # safety checks
        assert not numpy.isnan(ranks).any(), "log likelihood ratio co-ordinates contain NaNs"
        assert not numpy.isinf(ranks).any(), "log likelihood ratio co-ordinates are not all finite"
        assert not numpy.isnan(ccdf).any(), "log likelihood ratio CCDF contains NaNs"
        assert ((0. <= ccdf) & (ccdf <= 1.)).all(), "log likelihood ratio CCDF failed to be normalized"

        # build the interpolator
        self.ccdf_interpolator = interpolate.interp1d(ranks, ccdf)

        # record the min and max ranks so we know which end of the
        # ccdf to use when we're out of bounds
        self.minrank = ranks[0]
        self.maxrank = ranks[-1]

    @gstlalstats.assert_probability
    def ccdf_from_rank(self, rank):
        return self.ccdf_interpolator(numpy.clip(rank, self.minrank, self.maxrank))

    def fap_from_rank(self, rank):
        # implements equation (8) from Phys. Rev. D 88, 024025.
        # arXiv:1209.0718.
        return gstlalstats.fap_after_trials(self.ccdf_from_rank(rank), self.count_above_threshold)

    def rank_from_fap(self, p, tolerance = 1e-6):
        """
        Inverts .fap_from_rank().  This function is sensitive to
        numerical noise for probabilities that are close to 1.  The
        tolerance sets the absolute error of the result.
        """
        assert 0. <= p <= 1., "p (%g) is not a valid probability" % p
        lo, hi = self.minrank, self.maxrank
        while hi - lo > tolerance:
            mid = (hi + lo) / 2.
            mid_fap = self.fap_from_rank(mid)
            if p > mid_fap:
                # desired rank is below the middle
                hi = mid
            elif p < mid_fap:
                # desired rank is above the middle
                lo = mid
            else:
                # jackpot
                return mid
        return (hi + lo) / 2.

    rank_from_fap.__get__ = numpy.vectorize(rank_from_fap, otypes = (numpy.float64,), excluded = (0, 2, 'self', 'tolerance'))

    def far_from_rank(self, rank):
        # implements equation (B4) of Phys. Rev. D 88, 024025.
        # arXiv:1209.0718.  the return value is divided by T to
        # convert events/experiment to events/second.  "tdp" =
        # true-dismissal probability = 1 - single-event false-alarm
        # probability, the integral in equation (B4)
        log_tdp = numpy.log1p(-self.ccdf_from_rank(rank))
        return self.count_above_threshold * -log_tdp / self.livetime

    def rank_from_far(self, rate, tolerance = 1e-6):
        """
        Inverts .far_from_rank() using a bisection search.  The
        tolerance sets the absolute error of the result.
        """
        lo, hi = self.minrank, self.maxrank
        while hi - lo > tolerance:
            mid = (hi + lo) / 2.
            mid_far = self.far_from_rank(mid)
            if rate > mid_far:
                # desired rank is below the middle
                hi = mid
            elif rate < mid_far:
                # desired rank is above the middle
                lo = mid
            else:
                # jackpot
                return mid
        return (hi + lo) / 2.

    rank_from_far.__get__ = numpy.vectorize(rank_from_far, otypes = (numpy.float64,), excluded = (0, 2, 'self', 'tolerance'))

    def assign_fapfars(self, connection):
        # assign false-alarm probabilities and false-alarm rates
        # FIXME:  abusing the false_alarm_rate column to store FAP,
        # move to a false_alarm_probability column??
        def as_float(f):
            def g(x):
                return float(f(x))
            return g
        connection.create_function("fap_from_rankingstat", 1, as_float(self.fap_from_rank))
        connection.create_function("far_from_rankingstat", 1, as_float(self.far_from_rank))
        connection.cursor().execute("""
UPDATE
    coinc_inspiral
SET
    false_alarm_rate = (
        SELECT
            fap_from_rankingstat(coinc_event.likelihood)
        FROM
            coinc_event
        WHERE
            coinc_event.coinc_event_id == coinc_inspiral.coinc_event_id
    ),
    combined_far = (
        SELECT
            far_from_rankingstat(coinc_event.likelihood)
        FROM
            coinc_event
        WHERE
            coinc_event.coinc_event_id == coinc_inspiral.coinc_event_id
    )
""")

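#
# Summary of the mapping implemented above (following Phys. Rev. D 88,
# 024025):  with p = P(ln L >= rank | noise) taken from the interpolated,
# extinction-corrected noise CCDF, N = the zero-lag count above the rate
# normalization threshold and T = the livetime, .far_from_rank() returns
# FAR = -N ln(1 - p) / T, while .fap_from_rank() applies the N-trials
# correction via gstlalstats.fap_after_trials().  illustrative usage (fapfar
# and ln_lr below are hypothetical):
#
#   >>> fap = fapfar.fap_from_rank(ln_lr)
#   >>> far = fapfar.far_from_rank(ln_lr)          # events / second
#   >>> threshold = fapfar.rank_from_far(1. / (30. * 86400.))  # ln L at 1 per 30 days
#
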
#
# =============================================================================
#
#                                     I/O
#
# =============================================================================
#


def gen_likelihood_control_doc(xmldoc, rankingstat, rankingstatpdf):
    name = u"gstlal_inspiral_likelihood"

    node = xmldoc.childNodes[-1]
    assert node.tagName == ligolw.LIGO_LW.tagName

    if rankingstat is not None:
        node.appendChild(rankingstat.to_xml(name))

    if rankingstatpdf is not None:
        node.appendChild(rankingstatpdf.to_xml(name))

    return xmldoc


def parse_likelihood_control_doc(xmldoc, convert = False):
    name = u"gstlal_inspiral_likelihood"
    try:
        rankingstat = RankingStat.from_xml(xmldoc, name, convert = convert)
    except ValueError:
        rankingstat = None
    try:
        rankingstatpdf = RankingStatPDF.from_xml(xmldoc, name)
    except ValueError:
        rankingstatpdf = None
    if rankingstat is None and rankingstatpdf is None:
        raise ValueError("document does not contain likelihood ratio data")
    return rankingstat, rankingstatpdf

def marginalize_pdf_urls(urls, which, ignore_missing_files = False, verbose = False):
    """
    Implements marginalization of PDFs in ranking statistic data files.
    The marginalization is over the degree of freedom represented by
    the file collection.  One or both of the candidate parameter PDFs
    and ranking statistic PDFs can be processed, with errors thrown if
    one or more files is missing the required component.
    """
    name = u"gstlal_inspiral_likelihood"
    data = None
    for n, url in enumerate(urls, start = 1):
        #
        # load the input document
        #

        if verbose:
            print("%d/%d:" % (n, len(urls)), file=sys.stderr)
        try:
            xmldoc = ligolw_utils.load_url(url, verbose = verbose, contenthandler = RankingStat.LIGOLWContentHandler)
        except IOError:
            # IOError is raised when an on-disk file is
            # missing.  urllib2.URLError is raised when a URL
            # cannot be loaded, but this is subclassed from
            # IOError so IOError will catch those, too.
            if not ignore_missing_files:
                raise
            if verbose:
                print("Could not load \"%s\" ... skipping as requested" % url, file=sys.stderr)
            continue

        #
        # extract the PDF objects and compute the weighted sum of
        # the ranking data PDFs
        #

        if which == "RankingStat":
            if data is None:
                data = RankingStat.from_xml(xmldoc, name)
            else:
                data += RankingStat.from_xml(xmldoc, name)
        elif which == "RankingStatPDF":
            if data is None:
                data = RankingStatPDF.from_xml(xmldoc, name)
            else:
                data += RankingStatPDF.from_xml(xmldoc, name)
        else:
            raise ValueError("invalid which (%s)" % which)
        xmldoc.unlink()

    return data

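#
# Illustrative usage sketch (not part of the original module; the URLs below
# are hypothetical):  marginalize the ranking statistic PDFs from a
# collection of files, or pull both objects out of a single document:
#
#   >>> rankingstatpdf = marginalize_pdf_urls(["bin_0000_dist_stats.xml.gz", "bin_0001_dist_stats.xml.gz"], "RankingStatPDF", verbose = True)
#   >>> rankingstat, rankingstatpdf = parse_likelihood_control_doc(xmldoc)  # xmldoc loaded elsewhere
#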