lalapps_cosmicstring_pipe.py
##python
#
# =============================================================================
#
# Preamble
#
# =============================================================================
#


"""
Cosmic string pipeline driver script

This script produces the Condor submit and DAG files required to run
the cosmic string search code on LIGO data.
"""


from __future__ import print_function


import itertools
from optparse import OptionParser
import os
import sys
import tempfile
from configparser import ConfigParser


from igwn_ligolw import lsctables
from igwn_ligolw import utils as ligolw_utils
from igwn_ligolw.utils import segments as ligolw_segments
from lal import LIGOTimeGPS
from lal import pipeline
from lal.utils import CacheEntry
from lalburst import offsetvector
from lalburst import power
from lalapps import cosmicstring
import igwn_segments as segments

__author__ = 'Xavier Siemens <siemens@gravity.phys.uwm.edu>'
__date__ = '$Date$'
__version__ = '$Revision$'

#
# =============================================================================
#
# Command Line
#
# =============================================================================
#


def parse_command_line():
	parser = OptionParser(
		usage = "%prog [options] ...",
		description = "FIXME"
	)
	parser.add_option("-f", "--config-file", metavar = "filename", help = "Use this configuration file (required).")
	parser.add_option("-l", "--log-path", metavar = "path", help = "Make condor put log files in this directory (required).")
	parser.add_option("--background-time-slides", metavar = "filename", action = "append", help = "Set the name of the file from which to obtain the time slide table for use in the background branch of the pipeline (required). This option can be given multiple times to parallelize the background analysis across time slides. Make sure the time slide files contain distinct offset vectors so the same analysis is not repeated, and in particular make sure only one of them contains a zero-lag vector.")
	parser.add_option("--injection-time-slides", metavar = "filename", help = "Set the name of the file from which to obtain the time slide table for use in the injection branch of the pipeline (required).")
	parser.add_option("--segments-file", metavar = "filename", help = "Set the name of the LIGO Light-Weight XML file from which to obtain segment lists (required). See ligolw_segments and ligolw_segment_query for more information on constructing an XML-format segments file. See also --segments-name.")
	parser.add_option("--segments-name", metavar = "name", default = "segments", help = "Set the name of the segment lists to retrieve from the segments file (default = \"segments\"). See also --segments-file.")
	parser.add_option("--vetoes-file", metavar = "filename", help = "Set the name of the LIGO Light-Weight XML file from which to obtain veto segment lists (optional). See ligolw_segments and ligolw_segment_query for more information on constructing an XML-format segments file. See also --vetoes-name.")
	parser.add_option("--vetoes-name", metavar = "name", default = "vetoes", help = "Set the name of the segment lists to retrieve from the veto segments file (default = \"vetoes\"). See also --vetoes-file.")
	parser.add_option("-v", "--verbose", action = "store_true", help = "Be verbose.")

	options, filenames = parser.parse_args()

	required_options = ["log_path", "config_file", "background_time_slides", "injection_time_slides", "segments_file"]
	missing_options = [option for option in required_options if getattr(options, option) is None]
	if missing_options:
		raise ValueError("missing required options %s" % ", ".join(sorted("--%s" % option.replace("_", "-") for option in missing_options)))

	if options.vetoes_file is not None:
		options.vetoes_cache = set([CacheEntry(None, "VETO", None, "file://localhost" + os.path.abspath(options.vetoes_file))])
	else:
		options.vetoes_cache = set()

	options.injection_time_slides = [options.injection_time_slides]

	return options, filenames


options, filenames = parse_command_line()


#
# =============================================================================
#
# Initialize
#
# =============================================================================
#


#
# start a log file for this script
#

basename = os.path.splitext(os.path.basename(options.config_file))[0]
log_fh = open(basename + '.pipeline.log', 'w')
# FIXME: the following code uses obsolete CVS ID tags.
# It should be modified to use git version information.
print("$Id$\nInvoked with arguments:", file=log_fh)
for name_value in options.__dict__.items():
	print("%s %s" % name_value, file=log_fh)
print(file=log_fh)

#
# create the config parser object and read in the ini file
#

config_parser = ConfigParser()
config_parser.read(options.config_file)

#
# initialize lalapps.power and lalapps.cosmicstring modules
#

power.init_job_types(config_parser, job_types = ("datafind", "binj", "lladd", "binjfind", "burca", "sqlite"))
cosmicstring.init_job_types(config_parser, job_types = ("string", "meas_likelihood", "calc_likelihood", "runsqlite"))
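# note: in addition to the job-type sections consumed by the two
# init_job_types() calls above, this script itself reads the [pipeline]
# section (ifos, injection-runs, segment-length, trig_overlap, user_tag)
# and the [lalapps_StringSearch] section (short-segment-duration, pad)
# of the ini file via the config_parser.get*() calls below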

#
# make directories to store the cache files, job logs, and trigger output
#

def make_dag_directories(top_level_directory, config_parser):
	cwd = os.getcwd()
	power.make_dir_if_not_exists(top_level_directory)
	os.chdir(top_level_directory)
	power.make_dag_directories(config_parser)
	# FIXME: move this into make_dag_directories(). requires update
	# of excess power and gstlal dags
	power.make_dir_if_not_exists(power.get_triggers_dir(config_parser))
	os.chdir(cwd)

power.make_dag_directories(config_parser)
injection_folders = []
for i in range(config_parser.getint('pipeline', 'injection-runs')):
	injection_folders.append(os.path.abspath("injections%d" % i))
	make_dag_directories(injection_folders[-1], config_parser)
noninjection_folders = []
noninjection_folders.append(os.path.abspath("noninjections"))
make_dag_directories(noninjection_folders[-1], config_parser)

#
# create a log file that the Condor jobs will write to
#

logfile = tempfile.mkstemp(prefix = basename, suffix = '.log', dir = options.log_path)[1]

#
# create the DAG writing the log to the specified directory
#

dag = pipeline.CondorDAG(logfile)
dag.set_dag_file(basename)
clipsegments_sql_filename = os.path.abspath("clipsegments.sql")

#
# get chunk lengths from the values in the ini file
#

short_segment_duration = config_parser.getint('lalapps_StringSearch', 'short-segment-duration')
pad = config_parser.getint('lalapps_StringSearch', 'pad')
min_segment_length = config_parser.getint('pipeline', 'segment-length')	# not including pad at each end
trig_overlap = config_parser.getint('pipeline', 'trig_overlap')
overlap = short_segment_duration / 2 + 2 * pad	# FIXME: correct?

#
# get the instruments and raw segments
#

instruments = lsctables.instrumentsproperty.get(config_parser.get('pipeline', 'ifos'))
segments_cache = set([CacheEntry(None, "SEG", None, "file://localhost" + os.path.abspath(options.segments_file))])
seglists = ligolw_segments.segmenttable_get_by_name(ligolw_utils.load_filename(options.segments_file, verbose = options.verbose), options.segments_name).coalesce()
# remove extra instruments
for instrument in set(seglists) - instruments:
	if options.verbose:
		print("warning: ignoring segments for '%s' found in '%s'" % (instrument, options.segments_file), file=sys.stderr)
	del seglists[instrument]
# check for missing instruments
if not instruments.issubset(set(seglists)):
	raise ValueError("segment lists retrieved from '%s' missing segments for instruments %s" % (options.segments_file, ", ".join(instruments - set(seglists))))
# now rely on seglists' keys to provide the instruments
del instruments

#
# Using time slide information, construct segment lists describing times
# requiring trigger construction.
#

if options.verbose:
	print("Computing segments for which lalapps_StringSearch jobs are required ...", file=sys.stderr)

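# each time slide file is mapped to the list of offset vectors it
# contains; the offsets are converted to LIGOTimeGPS below, and the
# segments requiring analysis under those offset vectors are accumulated
# in background_seglists (the injection branch that follows is handled
# the same way)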
background_time_slides = {}
background_seglists = segments.segmentlistdict()
for filename in options.background_time_slides:
	cache_entry = CacheEntry(None, "BG", None, "file://localhost" + os.path.abspath(filename))

	background_time_slides[cache_entry] = list(lsctables.TimeSlideTable.get_table(ligolw_utils.load_filename(filename, verbose = options.verbose)).as_dict().values())

	for i in range(len(background_time_slides[cache_entry])):
		background_time_slides[cache_entry][i] = offsetvector.offsetvector((instrument, LIGOTimeGPS(offset)) for instrument, offset in background_time_slides[cache_entry][i].items())
	background_seglists |= cosmicstring.compute_segment_lists(seglists, offsetvector.component_offsetvectors(background_time_slides[cache_entry], 2), min_segment_length, pad)
211
212injection_time_slides = {}
213injection_seglists = segments.segmentlistdict()
214for filename in options.injection_time_slides:
215 cache_entry = CacheEntry(None, "INJ", None, "file://localhost" + os.path.abspath(filename))
216
217 injection_time_slides[cache_entry] = lsctables.TimeSlideTable.get_table(ligolw_utils.load_filename(filename, verbose = options.verbose)).as_dict().values()
218
219 for i in range(len(injection_time_slides[cache_entry])):
220 injection_time_slides[cache_entry][i] = offsetvector.offsetvector((instrument, LIGOTimeGPS(offset)) for instrument, offset in injection_time_slides[cache_entry][i].items())
221 injection_seglists |= cosmicstring.compute_segment_lists(seglists, offsetvector.component_offsetvectors(injection_time_slides[cache_entry], 2), min_segment_length, pad)


#
# check for duplicate offset vectors
#

def check_for_reused_offsetvectors(background_time_slides, injection_time_slides):
	# make sure none of the injection run offset vectors is also used
	# for a non-injection run

	for background_cache_entry, background_offsetvector in [(cache_entry, offsetvector) for cache_entry, offsetvectors in background_time_slides.items() for offsetvector in offsetvectors]:
		for injection_cache_entry, injection_offsetvector in [(cache_entry, offsetvector) for cache_entry, offsetvectors in injection_time_slides.items() for offsetvector in offsetvectors]:
			if background_offsetvector.deltas == injection_offsetvector.deltas:
				raise ValueError("injections offset vector %s from %s is the same as non-injections offset vector %s from %s. to avoid a self-selection bias, injections must not be performed at the same relative time shifts as a non-injection run" % (str(injection_offsetvector), injection_cache_entry.url, str(background_offsetvector), background_cache_entry.url))

check_for_reused_offsetvectors(background_time_slides, injection_time_slides)


#
# =============================================================================
#
# Build DAG
#
# =============================================================================
#


#
# datafinds
#


datafinds = power.make_datafind_stage(dag, injection_seglists | background_seglists, verbose = options.verbose)
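# the datafind jobs locate the frame data covering the union of the
# background and injection analysis segments; the per-instrument segment
# bookkeeping is handled inside power.make_datafind_stage()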


#
# trigger production, coincidence, injection identification, likelihood
# ratio probability density data collection
#


def make_coinc_branch(dag, datafinds, seglists, time_slides, min_segment_length, pad, overlap, short_segment_duration, tag, vetoes_cache = set(), do_injections = False, injections_offset = 0.0, verbose = False):
	#
	# injection job
	#

	binjnodes = set()
	if do_injections:
		# don't know what to do with more than one list of offset
		# vectors
		assert len(time_slides) == 1

		# get the largest injection offset's magnitude
		maxoffset = max(abs(offset) for offsetvectorlist in time_slides.values() for offsetvector in offsetvectorlist for offset in offsetvector.values())

		# to save disk space and speed the dag along we don't
		# generate a single injection list for the entire analysis
		# run; instead a separate list is constructed for each
		# block of data to be analyzed. we need to be careful that
		# two nearby injection lists don't contain injections for
		# the same time, so we protract the segments by the time
		# step and coalesce so that only gaps between segments
		# larger than twice the time step result in separate files
		# being generated. we could allow smaller gaps to survive,
		# but this way we don't have to worry about it.

		# injections_offset is a number between 0 and 1 in units of
		# the period between injections

		for seg in seglists.union(seglists).protract(power.binjjob.time_step + maxoffset).coalesce().contract(power.binjjob.time_step + maxoffset):
			binjnodes |= power.make_binj_fragment(dag, seg.protract(maxoffset), list(time_slides.keys())[0], tag, offset = injections_offset)

		# artificial parent-child relationship to induce dagman to
		# submit binj jobs as the corresponding datafinds complete
		# instead of submitting all of one kind before any of the
		# next. makes the dag run faster because it allows string
		# search jobs to start moving onto the cluster without
		# waiting for all the datafinds and/or all the binjs to
		# complete

		for datafindnode in datafinds:
			seg = segments.segment(datafindnode.get_start(), datafindnode.get_end())
			for binjnode in binjnodes:
				if seg.intersects(power.cache_span(binjnode.get_output_cache())):
					binjnode.add_parent(datafindnode)

	#
	# trigger generator jobs
	#

	# set max job length to ~3600 s (will be clipped to an allowed
	# size)
	trigger_nodes = cosmicstring.make_single_instrument_stage(dag, datafinds, seglists, tag, min_segment_length, pad, overlap, short_segment_duration, max_job_length = 3600, binjnodes = binjnodes, verbose = verbose)

	#
	# coincidence analysis
	#

	coinc_nodes = []
	for n, (time_slides_cache_entry, these_time_slides) in enumerate(time_slides.items()):
		if verbose:
			print("%s %d/%d (%s):" % (tag, n + 1, len(time_slides), time_slides_cache_entry.path), file=sys.stderr)
		coinc_nodes.append(set())

		#
		# lalapps_cafe & ligolw_add
		#

		tisi_cache = set([time_slides_cache_entry])
		lladd_nodes = set()
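		# group_coinc_parents() partitions the trigger files into
		# ligolw_add/burca jobs; the extentlimit below caps the GPS
		# extent a single job is asked to handle and shrinks as the
		# number of offset vectors grows, presumably so the data
		# volume per job stays roughly constant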
		for segnum, (seg, parents, cache, clipseg) in enumerate(power.group_coinc_parents(trigger_nodes, these_time_slides, extentlimit = 150000000.0 / (len(these_time_slides) or 1), verbose = verbose)):
			binj_cache = set(cache_entry for node in binjnodes for cache_entry in node.get_output_cache() if cache_entry.segment.intersects(seg))
			# otherwise too many copies of the offset vector
			# will be fed into burca
			assert len(binj_cache) < 2
			if do_injections:
				# lalapps_binj has already copied the time
				# slide document into its own output
				extra_input_cache = vetoes_cache
			else:
				# ligolw_add needs to copy the time slide
				# document into its output
				extra_input_cache = tisi_cache | vetoes_cache
			these_lladd_nodes = power.make_lladd_fragment(dag, parents | binjnodes, "%s_%d_%x" % (tag, n, segnum), segment = seg, input_cache = cache | binj_cache | segments_cache, extra_input_cache = extra_input_cache, remove_input = do_injections and clipseg is not None, preserve_cache = binj_cache | segments_cache | tisi_cache | vetoes_cache)
			if clipseg is not None:
				#
				# this is a fragment of a too-large burca
				# job, construct it specially and add the
				# command-line option needed to clip the
				# output
				#

				assert len(these_lladd_nodes) == 1
				coinc_nodes[-1] |= power.make_burca_fragment(dag, these_lladd_nodes, "%s_%d" % (tag, n), coincidence_segments = segments.segmentlist([clipseg]), verbose = verbose)

			else:
				#
				# this is not a fragment of a too-large
				# burca job, add it to the pool of files to
				# be processed by the burcas that don't
				# require special clipping command line
				# options
				#

				lladd_nodes |= these_lladd_nodes

		#
		# lalburst_coinc pool. these are the burca jobs that don't
		# require special clipping command line options, and so can
		# bulk-process many files with each job
		#

		if verbose:
			print("building burca jobs ...", file=sys.stderr)
		coinc_nodes[-1] |= power.make_burca_fragment(dag, lladd_nodes, "%s_%d" % (tag, n), verbose = verbose)
		if verbose:
			print("done %s %d/%d" % (tag, n + 1, len(time_slides)), file=sys.stderr)

	#
	# lalburst_injfind
	#

	if do_injections:
		if verbose:
			print("building binjfind jobs ...", file=sys.stderr)
		coinc_nodes = [power.make_binjfind_fragment(dag, these_coinc_nodes, "%s_%d" % (tag, n), verbose = verbose) for n, these_coinc_nodes in enumerate(coinc_nodes)]

	#
	# ligolw_sqlite and ligolw_run_sqlite
	#

	if verbose:
		print("building sqlite jobs ...", file=sys.stderr)
	coinc_nodes = [power.make_sqlite_fragment(dag, these_coinc_nodes, "%s_%d" % (tag, n), verbose = verbose) for n, these_coinc_nodes in enumerate(coinc_nodes)]
	coinc_nodes = [cosmicstring.make_run_sqlite_fragment(dag, these_coinc_nodes, "%s_%d" % (tag, n), clipsegments_sql_filename) for n, these_coinc_nodes in enumerate(coinc_nodes)]

	#
	# lalapps_string_meas_likelihood
	#

	if verbose:
		print("building lalapps_string_meas_likelihood jobs ...", file=sys.stderr)
	likelihood_nodes = [cosmicstring.make_meas_likelihood_fragment(dag, these_coinc_nodes, "%s_%d" % (tag, n)) for n, these_coinc_nodes in enumerate(coinc_nodes)]

	#
	# write output cache
	#

	if verbose:
		print("writing output cache ...", file=sys.stderr)
	for n, (these_coinc_nodes, these_likelihood_nodes) in enumerate(zip(coinc_nodes, likelihood_nodes)):
		power.write_output_cache(these_coinc_nodes | these_likelihood_nodes, "%s_%s_output.cache" % (os.path.splitext(dag.get_dag_file())[0], "%s_%d" % (tag, n)))

	#
	# done
	#

	return coinc_nodes, likelihood_nodes


user_tag = config_parser.get('pipeline', 'user_tag')


injection_coinc_nodes = []
injection_likelihood_nodes = []
for i, folder in enumerate(injection_folders):
	cwd = os.getcwd()
	os.chdir(folder)
	# note: the unpacking enforces the rule that each injection run
	# uses a single time-slides file
	[these_coinc_nodes], [these_likelihood_nodes] = make_coinc_branch(dag, datafinds, injection_seglists, injection_time_slides, min_segment_length, pad, overlap, short_segment_duration, "%s_INJ_%d" % (user_tag, i), vetoes_cache = options.vetoes_cache, do_injections = True, injections_offset = float(i) / len(injection_folders), verbose = options.verbose)
	os.chdir(cwd)
	injection_coinc_nodes.append(these_coinc_nodes)
	injection_likelihood_nodes.append(these_likelihood_nodes)


for i, folder in enumerate(noninjection_folders):
	assert i == 0	# loop only works for one iteration
	cwd = os.getcwd()
	os.chdir(folder)
	background_coinc_nodes, background_likelihood_nodes = make_coinc_branch(dag, datafinds, background_seglists, background_time_slides, min_segment_length, pad, overlap, short_segment_duration, "%s_%d" % (user_tag, i), vetoes_cache = options.vetoes_cache, do_injections = False, verbose = options.verbose)
	os.chdir(cwd)


def flatten_node_groups(node_groups):
	return set().union(*node_groups)


all_background_likelihood_nodes = flatten_node_groups(background_likelihood_nodes)
all_injection_likelihood_nodes = flatten_node_groups(injection_likelihood_nodes)


#
# round-robin parameter distribution density data for likelihood ratio
# computation.
#


if options.verbose:
	print("building lalapps_string_calc_likelihood jobs ...", file=sys.stderr)


def round_robin_and_flatten(injection_coinc_node_groups, injection_likelihood_node_groups):
	# round-robin the node lists
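	# with N injection runs, combinations(..., 1) lists each coinc node
	# group once and combinations(..., N - 1) lists the (N-1)-element
	# subsets of the likelihood node groups in lexicographic order;
	# reversing the latter lines up coinc group i with the subset that
	# omits likelihood group i, so no injection run's likelihood data
	# is used to rank its own candidates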
	A = list(itertools.combinations(injection_coinc_node_groups, 1))
	B = list(itertools.combinations(injection_likelihood_node_groups, len(injection_likelihood_node_groups) - 1))
	B.reverse()
	A = [flatten_node_groups(node_groups) for node_groups in A]
	B = [flatten_node_groups(node_groups) for node_groups in B]
	return zip(A, B)

coinc_nodes = set()
for n, (these_inj_coinc_nodes, these_inj_likelihood_nodes) in enumerate(round_robin_and_flatten(injection_coinc_nodes, injection_likelihood_nodes)):
	these_inj_coinc_nodes = cosmicstring.make_calc_likelihood_fragment(dag, these_inj_coinc_nodes, these_inj_likelihood_nodes | all_background_likelihood_nodes, "%s_INJ_%d" % (user_tag, n), verbose = options.verbose)
	# to prevent race conditions, none of the calc_likelihood jobs
	# should start before all the meas_likelihood jobs have completed,
	# even those that produce data not required by the calc_likelihood
	# jobs, because the calc_likelihood jobs might overwrite the
	# database files just as the meas_likelihood jobs are opening them
	# for reading. this is only a problem for injection
	# calc_likelihood jobs because of the round-robining; the
	# non-injection ones already have all meas_likelihood jobs as
	# parents.
	for extra_parent in all_injection_likelihood_nodes - these_inj_likelihood_nodes:
		for coinc_node in these_inj_coinc_nodes:
			coinc_node.add_parent(extra_parent)
	coinc_nodes |= these_inj_coinc_nodes
coinc_nodes |= cosmicstring.make_calc_likelihood_fragment(dag, flatten_node_groups(background_coinc_nodes), all_injection_likelihood_nodes | all_background_likelihood_nodes, user_tag, files_per_calc_likelihood = 1, verbose = options.verbose)


#
# =============================================================================
#
# Done
#
# =============================================================================
#

#
# write DAG
#

if options.verbose:
	print("writing dag ...", file=sys.stderr)
dag.write_sub_files()
dag.write_dag()
cosmicstring.write_clip_segment_sql_file(clipsegments_sql_filename)

#
# write a message telling the user that the DAG has been written
#

print("""Created a DAG file which can be submitted by executing

$ condor_submit_dag %s

from a condor submit machine (e.g. hydra.phys.uwm.edu)

Do not forget to initialize your grid proxy certificate on the condor
submit machine by running the commands

$ unset X509_USER_PROXY
$ grid-proxy-init -hours $((24*7)):00

""" % dag.get_dag_file())