From 1929bc29999767165abaa5d30cb6db95733245a6 Mon Sep 17 00:00:00 2001
From: Francesco Pannarale
Date: Tue, 10 Dec 2024 13:43:20 +0100
Subject: [PATCH] Vetoes in PyGRB efficiency and page_tables scripts (#4978)

* Vetoes in pycbc_pygrb_page_tables + some syntax streamlining

* Vetoes in pycbc_pygrb_efficiency + some syntax streamlining

* Squashed mchirp retrieval bug in page_tables

* PR review follow up: comprehension, comment, readability, unused variables

* Cleaner format_pvalue_str

* Cleaner comprehensions
---
 bin/pygrb/pycbc_pygrb_efficiency  | 247 +++++++++++----------
 bin/pygrb/pycbc_pygrb_page_tables | 345 +++++++++++++++---------------
 2 files changed, 301 insertions(+), 291 deletions(-)

diff --git a/bin/pygrb/pycbc_pygrb_efficiency b/bin/pygrb/pycbc_pygrb_efficiency
index 171809007f6..5745dc9eded 100644
--- a/bin/pygrb/pycbc_pygrb_efficiency
+++ b/bin/pygrb/pycbc_pygrb_efficiency
@@ -125,7 +125,9 @@ parser.add_argument("--bank-file", action="store", type=str, required=True,
                     help="Location of the full template bank used.")
 ppu.pygrb_add_injmc_opts(parser)
 ppu.pygrb_add_bestnr_cut_opt(parser)
+ppu.pygrb_add_slide_opts(parser)
 opts = parser.parse_args()
+ppu.slide_opts_helper(opts)
 
 init_logging(opts.verbose, format="%(asctime)s: %(levelname)s: %(message)s")
 
@@ -144,6 +146,7 @@ if opts.exclusion_dist_output_file is not None or \
 trig_file = opts.trig_file
 onsource_file = opts.onsource_file
 found_missed_file = opts.found_missed_file
+veto_file = opts.veto_file
 inj_set_name = opts.injection_set_name
 wf_err = opts.waveform_error
 cal_errs = {}
@@ -178,76 +181,84 @@ for output_file in [opts.exclusion_dist_output_file,
     if output_file is not None:
         outdir = os.path.split(os.path.abspath(output_file))[0]
         if not os.path.isdir(outdir):
-            logging.info("Creating the output directoryi %s.", outdir)
+            logging.info("Creating the output directory %s.", outdir)
             os.makedirs(outdir)
 
-# Extract IFOs and vetoes
-ifos, vetoes = ppu.extract_ifos_and_vetoes(trig_file, opts.veto_files,
-                                           opts.veto_category)
-
-# Load triggers (apply reweighted SNR cut), time-slides, and segment dictionary
-logging.info("Loading triggers.")
-trigs = ppu.load_triggers(trig_file, ifos, vetoes,
-                          rw_snr_threshold=opts.newsnr_threshold)
-logging.info("%d offsource triggers surviving reweighted SNR cut.",
-             len(trigs['network/event_id']))
-logging.info("Loading timeslides.")
-slide_dict = ppu.load_time_slides(trig_file)
-logging.info("Loading segments.")
-segment_dict = ppu.load_segment_dict(trig_file)
-
-# Construct trials
-logging.info("Constructing trials.")
-trial_dict = ppu.construct_trials(opts.seg_files, segment_dict,
-                                  ifos, slide_dict, vetoes)
-total_trials = sum([len(trial_dict[slide_id]) for slide_id in slide_dict])
-logging.info("%d trials generated.", total_trials)
+# Extract IFOs
+ifos = ppu.extract_ifos(trig_file)
 
-# Extract basic trigger properties and store as dictionaries
-trig_time, trig_snr, trig_bestnr = \
-    ppu.extract_basic_trig_properties(trial_dict, trigs, slide_dict,
-                                      segment_dict, opts)
-
-# Calculate BestNR values and maximum
-time_veto_max_bestnr = {}
+# Generate time-slides dictionary
+slide_dict = ppu.load_time_slides(trig_file)
 
-for slide_id in slide_dict:
-    num_slide_segs = len(trial_dict[slide_id])
-    time_veto_max_bestnr[slide_id] = np.zeros(num_slide_segs)
+# Generate segments dictionary
+segment_dict = ppu.load_segment_dict(trig_file)
 
+# Construct trials removing vetoed times
+trial_dict, total_trials = ppu.construct_trials(
+    opts.seg_files,
+    segment_dict,
+    ifos,
+    slide_dict,
+    veto_file
+)
+
+# Load triggers (apply reweighted SNR cut, not vetoes)
+all_off_trigs = ppu.load_data(trig_file, ifos, data_tag='offsource',
+                              rw_snr_threshold=opts.newsnr_threshold,
+                              slide_id=opts.slide_id)
+
+# Extract needed trigger properties and store them as dictionaries
+# Based on trial_dict: if vetoes were applied, trig_* are the veto survivors
+keys = ['network/end_time_gc', 'network/reweighted_snr']
+trig_data = ppu.extract_trig_properties(
+    trial_dict,
+    all_off_trigs,
+    slide_dict,
+    segment_dict,
+    keys
+)
+
+# Max BestNR values in each trial: these are stored in a dictionary keyed
+# by slide_id, as arrays indexed by trial number
+background = {k: np.zeros(len(v)) for k, v in trial_dict.items()}
 for slide_id in slide_dict:
+    trig_times = trig_data[keys[0]][slide_id]
     for j, trial in enumerate(trial_dict[slide_id]):
-        trial_cut = (trial[0] <= trig_time[slide_id])\
-            & (trig_time[slide_id] < trial[1])
+        # True whenever the trigger is in the trial
+        trial_cut = (trial[0] <= trig_times) & (trig_times < trial[1])
+        # Move on if nothing was in the trial
         if not trial_cut.any():
             continue
         # Max BestNR
-        time_veto_max_bestnr[slide_id][j] = \
-            max(trig_bestnr[slide_id][trial_cut])
+        background[slide_id][j] = max(trig_data[keys[1]][slide_id][trial_cut])
+
+# Max and median values of reweighted SNR,
+# and sorted (loudest in trial) reweighted SNR values
+max_bestnr, median_bestnr, sorted_bkgd =\
+    ppu.max_median_stat(slide_dict, background, trig_data[keys[1]],
+                        total_trials)
+assert total_trials == len(sorted_bkgd)
 
-logging.info("SNR and bestNR maxima calculated.")
+logging.info("Background bestNR calculated.")
 
-# Output details of loudest offsouce triggers
+# Output details of loudest offsource triggers: only triggers compatible
+# with the trial_dict are considered
 offsource_trigs = []
-sorted_trigs = ppu.sort_trigs(trial_dict, trigs, slide_dict, segment_dict)
+sorted_off_trigs = ppu.sort_trigs(
+    trial_dict,
+    all_off_trigs,
+    slide_dict,
+    segment_dict
+)
 for slide_id in slide_dict:
-    offsource_trigs.extend(zip(trig_bestnr[slide_id], sorted_trigs[slide_id]))
+    offsource_trigs.extend(
+        zip(trig_data[keys[1]][slide_id], sorted_off_trigs[slide_id])
+    )
 offsource_trigs.sort(key=lambda element: element[0])
 offsource_trigs.reverse()
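
The two new blocks above are the heart of the change on the efficiency side: trials are now built directly from the off-source segments with vetoed times removed (construct_trials also returns the total trial count), and the background is simply the loudest reweighted SNR in each surviving trial. A minimal, self-contained sketch of that per-trial maximisation with toy numbers (the real inputs come from ppu.construct_trials and ppu.extract_trig_properties, whose internals are not part of this patch):

    import numpy as np

    # One slide with three surviving trials; a fourth, (120, 130), was vetoed
    trials = {0: [(100, 110), (110, 120), (130, 140)]}
    trig_times = {0: np.array([103.2, 115.7, 118.1, 134.9])}
    trig_stats = {0: np.array([5.1, 7.3, 6.2, 4.8])}  # reweighted SNR

    background = {k: np.zeros(len(v)) for k, v in trials.items()}
    for slide_id, slide_trials in trials.items():
        for j, (start, end) in enumerate(slide_trials):
            # Select the triggers falling inside this trial
            in_trial = (start <= trig_times[slide_id]) \
                & (trig_times[slide_id] < end)
            if in_trial.any():
                background[slide_id][j] = trig_stats[slide_id][in_trial].max()

    # Loudest-per-trial values, sorted: this is what sorted_bkgd holds
    sorted_bkgd = np.sort(np.concatenate(list(background.values())))
    # -> [4.8 5.1 7.3]; a candidate with stat 6.0 is beaten in 1 of 3 trials
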
 
-# ==========================
-# Print loudest SNRs to file
-# THIS OUTPUT FILE IS CURRENTLY UNUSED - MAYBE DELETE?
-# Note: the only new info from above is the median SNR, bestnr
-# and loudest SNR, so could just add this to the above's caption. 
-# ==========================
-max_bestnr, _, full_time_veto_max_bestnr =\
-    ppu.max_median_stat(slide_dict, time_veto_max_bestnr, trig_bestnr,
-                        total_trials)
 
-# ==========================
-# Calculate template chirp masses from bank
-# ==========================
+# Calculate chirp masses of templates in bank
 logging.info('Reading template chirp masses')
 with HFile(opts.bank_file, 'r') as bank_file:
     template_mchirps = mchirp_from_mass1_mass2(
@@ -261,9 +272,10 @@ with HFile(opts.bank_file, 'r') as bank_file:
 
 if onsource_file:
     logging.info("Processing onsource.")
 
-    # Get onsouce_triggers (apply reweighted SNR cut)
-    on_trigs = ppu.load_triggers(onsource_file, ifos, vetoes,
-                                 rw_snr_threshold=opts.newsnr_threshold)
+    # Load onsource triggers (apply reweighted SNR cut, not vetoes)
+    on_trigs = ppu.load_data(onsource_file, ifos, data_tag=None,
+                             rw_snr_threshold=opts.newsnr_threshold,
+                             slide_id=0)
 
     # Calculate chirp mass values
     on_mchirp = template_mchirps[on_trigs['network/template_id']]
@@ -288,65 +300,57 @@ if onsource_file:
     logging.info("Onsource analysed.")
 
     if loud_on_bestnr_idx is not None:
-        num_trials_louder = 0
-        tot_off_snr = np.array([])
-        for slide_id in slide_dict:
-            num_trials_louder += sum(time_veto_max_bestnr[slide_id] >
-                                     loud_on_bestnr)
-            tot_off_snr = np.concatenate([tot_off_snr,
-                                          time_veto_max_bestnr[slide_id]])
-        #fap_test = sum(tot_off_snr > loud_on_bestnr)/total_trials
-        loud_on_fap = num_trials_louder/total_trials
+        loud_on_fap = sum(sorted_bkgd > loud_on_bestnr) / total_trials
 
-else:
-    tot_off_snr = np.array([])
-    for slide_id in slide_dict:
-        tot_off_snr = np.concatenate([tot_off_snr,
-                                      time_veto_max_bestnr[slide_id]])
-    med_snr = np.median(tot_off_snr)
-    #loud_on_fap = sum(tot_off_snr > med_snr)/total_trials
 
 # =======================
 # Post-process injections
 # =======================
-
-sites = [ifo[0] for ifo in ifos]
-
-# injs contains the information about found/missed injections AND triggers
-# Triggers and injections are discared if at vetoed times and/or below
-# Reweighted SNR thrshold
-injs = ppu.load_triggers(found_missed_file, ifos, vetoes,
-                         rw_snr_threshold=opts.newsnr_threshold)
-
-logging.info("Missed/found injections/triggers loaded.")
+# injs contains found/missed injections AND triggers they generated
+# The reweighted SNR cut is applied, vetoes are not
+injs = ppu.load_data(found_missed_file, ifos, data_tag='injs',
+                     rw_snr_threshold=opts.newsnr_threshold,
+                     slide_id=0)
+
+# Gather injections that were not missed
+found_inj = {}
+for k in injs.keys():
+    if 'missed' not in k:
+        found_inj[k] = injs[k]
+
+# Split them into injections found surviving vetoes and found but vetoed
+found_after_vetoes, vetoed, *_ = ppu.apply_vetoes_to_found_injs(
+    found_missed_file,
+    found_inj,
+    ifos,
+    veto_file=veto_file
+)
 
 # Calculate quantities not included in trigger files, such as chirp mass
-found_trig_mchirp = template_mchirps[injs['network/template_id']]
-
+found_trig_mchirp = template_mchirps[found_after_vetoes['network/template_id']]
 
 # Construct conditions for injection:
-# 1) found louder than background,
-zero_fap = np.zeros(len(injs['network/end_time_gc'])).astype(bool)
-zero_fap_cut = injs['network/reweighted_snr'][:] > max_bestnr
+# 1) found (surviving vetoes) louder than background,
+zero_fap = np.zeros(len(found_after_vetoes['network/end_time_gc'])).astype(bool)
+zero_fap_cut = found_after_vetoes['network/reweighted_snr'] > max_bestnr
 zero_fap = zero_fap | (zero_fap_cut)
 
-# 2) found (bestnr > 0) but not louder than background (non-zero FAP)
-nonzero_fap = ~zero_fap & (injs['network/reweighted_snr'] != 0)
+# 2) found (bestnr > 0, and surviving vetoes) but not louder than background
+nonzero_fap = ~zero_fap & (found_after_vetoes['network/reweighted_snr'] != 0)
 
-# 3) missed after being recovered (i.e., vetoed) are not used here
-# missed = (~zero_fap) & (~nonzero_fap)
+# 3) missed after being recovered (i.e., vetoed) are in vetoed
 
 # Non-zero FAP triggers (g_ifar)
 g_ifar = {}
-g_ifar['bestnr'] = injs['network/reweighted_snr'][nonzero_fap]
+g_ifar['bestnr'] = found_after_vetoes['network/reweighted_snr'][nonzero_fap]
 g_ifar['stat'] = np.zeros([len(g_ifar['bestnr'])])
 for ix, (mc, bestnr) in \
         enumerate(zip(found_trig_mchirp[nonzero_fap], g_ifar['bestnr'])):
-    g_ifar['stat'][ix] = (full_time_veto_max_bestnr > bestnr).sum()
+    g_ifar['stat'][ix] = (sorted_bkgd > bestnr).sum()
 g_ifar['stat'] = g_ifar['stat'] / total_trials
 
 # Set the sigma values
-inj_sigma = {ifo: injs[f'{ifo}/sigmasq'][:] for ifo in ifos}
+inj_sigma = {ifo: found_after_vetoes[f'{ifo}/sigmasq'][:] for ifo in ifos}
 # If the sigmasqs are not populated, we can still do calibration errors,
 # but only in the 1-detector case
 for ifo in ifos:
@@ -365,9 +369,9 @@ f_resp = {}
 for ifo in ifos:
     antenna = Detector(ifo)
     f_resp[ifo] = ppu.get_antenna_responses(antenna,
-                                            injs['found/ra'][:],
-                                            injs['found/dec'][:],
-                                            injs['found/tc'][:])
+                                            found_after_vetoes['found/ra'][:],
+                                            found_after_vetoes['found/dec'][:],
+                                            found_after_vetoes['found/tc'][:])
 
 inj_sigma_mult = (np.asarray(list(inj_sigma.values())) *
                   np.asarray(list(f_resp.values())))
@@ -380,12 +384,12 @@ inj_sigma_mean = {}
 for ifo in ifos:
     inj_sigma_mean[ifo] = ((inj_sigma[ifo]*f_resp[ifo])/inj_sigma_tot).mean()
 
-logging.info("%d found injections analysed.", len(injs['found/tc']))
-
-# Process missed injections (injs['missed'])
-logging.info("%d missed injections analysed.", len(injs['missed/tc']))
+msg = f"{len(found_after_vetoes['found/tc'])} injections found and surviving "
+msg += f"vetoes and {len(injs['missed/tc'])} missed injections analysed."
+logging.info(msg)
 
-# Create new set of injections for efficiency calculations
+# Create new set of injections for efficiency calculations:
+# these are as many as the original injections
 total_injs = len(injs['found/distance']) + len(injs['missed/distance'])
 long_inj = {}
 long_inj['dist'] = stats.uniform.rvs(size=total_injs) * \
@@ -411,7 +415,7 @@ for key in ['mc', 'no_mc']:
     found_on_bestnr[key] = np.zeros(num_dist_bins_plus_one)
 
 # Construct FAP list for all found injections
-inj_fap = np.zeros(len(injs['found/distance']))
+inj_fap = np.zeros(len(found_after_vetoes['found/distance']))
 inj_fap[nonzero_fap] = g_ifar['stat']
 
 # Calculate the amplitude error
@@ -434,10 +438,20 @@ logging.info("Calibration amplitude uncertainty calculated.")
 # NOTE: the loop on num_mc_injs would fill up the *_inj['dist_mc']'s at the
 # same time, so filling them up sequentially will vary the numbers a little
 # (this is an MC, order of operations matters!)
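
For readers unfamiliar with ppu.mc_cal_wf_errs (its internals are not part of this patch): the calls below ask for num_mc_injs Monte Carlo copies of each injection distance, rescaled by draws of the fractional calibration and waveform amplitude errors. A rough, hypothetical stand-in that illustrates only the shape of the output — the real routine's conventions for combining and capping the errors may differ:

    import numpy as np

    def mc_distance_jitter(num_mc, distances, cal_err, wf_err, max_dc_err,
                           seed=None):
        # Hypothetical sketch, not the PyCBC implementation.
        # Row 0 holds the unjittered distances; each later row rescales them
        # by one draw of the combined fractional amplitude error, capped at
        # the maximum DC calibration error.
        rng = np.random.default_rng(seed)
        out = np.empty((num_mc + 1, len(distances)))
        out[0] = distances
        for i in range(1, num_mc + 1):
            frac = rng.normal(0., np.hypot(cal_err, wf_err), len(distances))
            frac = np.clip(frac, -max_dc_err, max_dc_err)
            out[i] = distances * (1. + frac)
        return out

Note also that the second call below now concatenates the vetoed-but-found injection distances onto the missed ones, so vetoed injections count against the efficiency instead of silently dropping out.
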
-found_inj_dist_mc = ppu.mc_cal_wf_errs(num_mc_injs, injs['found/distance'],
-                                       cal_error, wav_err, max_dc_cal_error)
-missed_inj_dist_mc = ppu.mc_cal_wf_errs(num_mc_injs, injs['missed/distance'],
-                                        cal_error, wav_err, max_dc_cal_error)
+found_inj_dist_mc = ppu.mc_cal_wf_errs(
+    num_mc_injs,
+    found_after_vetoes['found/distance'],
+    cal_error,
+    wav_err,
+    max_dc_cal_error
+)
+missed_inj_dist_mc = ppu.mc_cal_wf_errs(
+    num_mc_injs,
+    np.concatenate((vetoed['found/distance'], injs['missed/distance'])),
+    cal_error,
+    wav_err,
+    max_dc_cal_error
+)
 
 long_inj['dist_mc'] = ppu.mc_cal_wf_errs(num_mc_injs, long_inj['dist'],
                                          cal_error, wav_err, max_dc_cal_error)
 
@@ -452,32 +466,32 @@ else:
 
 distance_count = np.zeros(len(dist_bins))
 
-found_trig_max_bestnr = np.empty(len(injs['network/event_id']))
+found_trig_max_bestnr = np.empty(len(found_after_vetoes['network/event_id']))
 found_trig_max_bestnr.fill(max_bestnr)
 
-max_bestnr_cut = (injs['network/reweighted_snr'] > found_trig_max_bestnr)
+max_bestnr_cut = (found_after_vetoes['network/reweighted_snr'] > found_trig_max_bestnr)
 
 # Check louder than on source
-found_trig_loud_on_bestnr = np.empty(len(injs['network/event_id']))
+found_trig_loud_on_bestnr = np.empty(len(found_after_vetoes['network/event_id']))
 if onsource_file:
     found_trig_loud_on_bestnr.fill(loud_on_bestnr)
 else:
-    found_trig_loud_on_bestnr.fill(med_snr)
-on_bestnr_cut = injs['network/reweighted_snr'] > found_trig_loud_on_bestnr
+    found_trig_loud_on_bestnr.fill(median_bestnr)
+on_bestnr_cut = found_after_vetoes['network/reweighted_snr'] > found_trig_loud_on_bestnr
 
 # Check whether injection is found for the purposes of exclusion
 # distance calculation.
 # Found: if louder than all on source
 # Missed: if not louder than loudest on source
 
 found_excl = on_bestnr_cut & (more_sig_than_onsource) & \
-    (injs['network/reweighted_snr'] != 0)
+    (found_after_vetoes['network/reweighted_snr'] != 0)
 
 # If not missed, double check bestnr against nearby triggers
 near_test = np.zeros((found_excl).sum()).astype(bool)
-for j, (t, bestnr) in enumerate(zip(injs['found/tc'][found_excl],
-                                    injs['network/reweighted_snr'][found_excl])):
+for j, (t, bestnr) in enumerate(zip(found_after_vetoes['found/tc'][found_excl],
+                                    found_after_vetoes['network/reweighted_snr'][found_excl])):
     # 0 is the zero-lag timeslide
     near_bestnr = \
-        trig_bestnr[0][np.abs(trig_time[0]-t) < cluster_window]
+        trig_data[keys[1]][0][np.abs(trig_data[keys[0]][0]-t) < cluster_window]
     near_test[j] = ~((near_bestnr * glitch_check_fac > bestnr).any())
 # Apply the local test
 c = 0
@@ -528,6 +542,7 @@ logging.info("Found/missed injection efficiency calculations completed.")
 # ==========
 # Make plots
 # ==========
+logging.info("Plotting.")
 
 # Calculate distances (horizontal axis) as means
 dist_plot_vals = [np.asarray(dist_bin).mean() for dist_bin in dist_bins]
@@ -578,7 +593,7 @@ yerr_low, yerr_high, fraction_mc = \
 red_efficiency = (fraction_mc) - (yerr_low) * scipy.stats.norm.isf(0.1)
 
 # Calculate and save to disk 50% and 90% exclusion distances
-# excl_dist dictionary contains 50% and 90% exclusion distances 
+# excl_dist dictionary contains 50% and 90% exclusion distances
 excl_dist = {}
 for percentile in [50, 90]:
     eff_idx = np.where(red_efficiency < (percentile / 100.))[0]
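
Before moving to the second script: the efficiency executable ends (see the excl_dist loop above) by reading the 50% and 90% exclusion distances off the downward-fluctuated efficiency curve red_efficiency. The lookup is just the first distance bin at which the curve drops below the target fraction; schematically, with made-up numbers:

    import numpy as np

    # Hypothetical efficiency curve at the bin-mean distances (Mpc)
    dist_plot_vals = np.array([20., 40., 60., 80., 100.])
    red_efficiency = np.array([0.98, 0.93, 0.74, 0.41, 0.12])

    excl_dist = {}
    for percentile in [50, 90]:
        # Bins where the efficiency has fallen below the target fraction
        eff_idx = np.where(red_efficiency < (percentile / 100.))[0]
        # The first crossing gives the exclusion distance estimate
        excl_dist[percentile] = float(dist_plot_vals[eff_idx[0]])

    print(excl_dist)  # {50: 80.0, 90: 60.0}
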
diff --git a/bin/pygrb/pycbc_pygrb_page_tables b/bin/pygrb/pycbc_pygrb_page_tables
index 6d53fc14efa..b2e7e18dc79 100755
--- a/bin/pygrb/pycbc_pygrb_page_tables
+++ b/bin/pygrb/pycbc_pygrb_page_tables
@@ -54,7 +54,7 @@ def additional_injection_data(data, ifos):
     eff_dist = 0
     for ifo in ifos:
         antenna = Detector(ifo)
-        data['eff_dist_%s' % ifo] = antenna.effective_distance(
+        data['eff_dist_'+ifo] = antenna.effective_distance(
             data['distance'],
             data['ra'],
             data['dec'],
@@ -62,13 +62,13 @@ def additional_injection_data(data, ifos):
             data['tc'],
             data['inclination']
         )
-        eff_dist += 1.0 / data['eff_dist_%s' % ifo]
+        eff_dist += 1.0 / data['eff_dist_'+ifo]
     data['eff_dist'] = 1.0 / eff_dist
 
     return data
 
 
-def load_missed_found_injections(hdf_file, ifos, snr_threshold, bank_file,
+def load_missed_found_injections(hdf_file, ifos, bank_file, snr_threshold=None,
                                  background_bestnrs=None):
     """Loads found and missed injections from an hdf file as two dictionaries
 
@@ -77,18 +77,20 @@ def load_missed_found_injections(hdf_file, ifos, snr_threshold, bank_file,
     hdf_file: str
         File path
     ifos: list
-    snr_threshold: float
-        NewSNR threshold
-    bank_file: HFile object
-    background_bestnrs: numpy.array, optional
-        Used to compute FAP of quiet injections.
+    bank_file: h5py.File object
+    snr_threshold: float, optional [default: None]
+        Reweighted SNR threshold
+    background_bestnrs: numpy.array, optional [default: None]
+        Used to compute FAP of quiet injections
 
     Returns
     -------
     data: tuple of dictionaries
-        Found and missed injection parameter dictionaries.
+        Found, missed, and found-but-cut (reweighted SNR below threshold)
+        injection parameter dictionaries.
     """
+    logging.info('Loading injections...')
     inj_data = HFile(hdf_file, 'r')
     inj_params = ['mass1', 'mass2', 'distance', 'inclination', 'ra', 'dec',
                   'polarization', 'spin1x', 'spin1y', 'spin1z', 'spin2x',
@@ -96,12 +98,11 @@ def load_missed_found_injections(hdf_file, ifos, snr_threshold, bank_file,
     found_data = {}
     # Missed injections (ones not recovered at all)
     missed_data = {}
-    logging.info('Loading injections...')
 
     # Load injections parameters
     for param in inj_params:
-        missed_data[param] = inj_data['missed/%s' % param][...]
-        found_data[param] = inj_data['found/%s' % param][...]
+        missed_data[param] = inj_data['missed/'+param][...]
+        found_data[param] = inj_data['found/'+param][...]
 
     # Calculate effective distance for the ifos
     found_data = additional_injection_data(found_data, ifos)
@@ -110,7 +111,7 @@ def load_missed_found_injections(hdf_file, ifos, snr_threshold, bank_file,
     # Get recovered parameters and statistic values for the found injections
     # Recovered parameters
     for param in ['mass1', 'mass2', 'spin1z', 'spin2z']:
-        found_data['rec_%s' % param] = \
+        found_data['rec_'+param] = \
             np.array(bank_file[param])[inj_data['network/template_id']]
     found_data['time_diff'] = \
         found_data['tc'] - inj_data['network/end_time_gc'][...]
@@ -122,26 +123,40 @@ def load_missed_found_injections(hdf_file, ifos, snr_threshold, bank_file,
     found_data['rec_dec'] = inj_data['network/dec'][...]
     # Statistics values
     for param in ['coherent_snr', 'reweighted_snr', 'null_snr']:
-        found_data[param] = inj_data['network/%s' % param][...]
+        found_data[param] = inj_data['network/'+param][...]
     found_data['chisq'] = inj_data['network/my_network_chisq'][...]
     found_data['nifos'] = inj_data['network/nifo'][...].astype(int)
     for ifo in ifos:
         if np.all(inj_data['network/event_id'][...] ==
-                  inj_data['%s/event_id' % ifo][...]):
-            found_data['sigmasq_%s' % ifo] = inj_data['%s/sigmasq' % ifo][...]
-            found_data['snr_%s' % ifo] = inj_data['%s/snr' % ifo][...]
+                  inj_data[ifo+'/event_id'][...]):
+            found_data['sigmasq_'+ifo] = inj_data[ifo+'/sigmasq'][...]
+            found_data['snr_'+ifo] = inj_data[ifo+'/snr'][...]
+            found_data[ifo+'/end_time'] = inj_data[ifo+'/end_time'][...]
else: # Sort the ifo event_id with respect to the network event_id ifo_sorted_indices = np.argsort(inj_data['network/event_id'][...][ np.argsort(inj_data['network/event_id'])].searchsorted( - inj_data['%s/event_id' % ifo][...])) - found_data['sigmasq_%s' % ifo] = \ - inj_data['%s/sigmasq' % ifo][...][ifo_sorted_indices] - found_data['snr_%s' % ifo] = \ - inj_data['%s/snr' % ifo][...][ifo_sorted_indices] + inj_data[ifo+'/event_id'][...])) + found_data['sigmasq_'+ifo] = \ + inj_data[ifo+'/sigmasq'][...][ifo_sorted_indices] + found_data['snr_'+ifo] = \ + inj_data[ifo+'/snr'][...][ifo_sorted_indices] # BestNRs found_data['bestnr'] = reweightedsnr_cut(found_data['reweighted_snr'][...], snr_threshold) + # Apply reweighted SNR cut + cut_data = {} + if snr_threshold: + logging.info("%d found injections loaded.", len(found_data[inj_params[0]])) + logging.info("%d missed injections loaded.", len(missed_data[inj_params[0]])) + logging.info("Applying reweighted SNR cut at %s.", snr_threshold) + rw_snr_cut = found_data['reweighted_snr'] < snr_threshold + for key in found_data: + cut_data[key] = found_data[key][rw_snr_cut] + found_data[key] = found_data[key][~rw_snr_cut] + del found_data['reweighted_snr'] + del cut_data['reweighted_snr'] + if background_bestnrs is not None: found_data['fap'] = np.array( [sum(background_bestnrs > bestnr) for bestnr in @@ -150,15 +165,15 @@ def load_missed_found_injections(hdf_file, ifos, snr_threshold, bank_file, # Antenna responses f_resp = {} for ifo in ifos: - if sum(found_data['sigmasq_%s' % ifo] == 0): + if sum(found_data['sigmasq_'+ifo] == 0): logging.info("%s: sigmasq not set for at least one trigger.", ifo) - if sum(found_data['sigmasq_%s' % ifo] != 0) == 0: + if sum(found_data['sigmasq_'+ifo] != 0) == 0: logging.info("%s: sigmasq not set for any trigger.", ifo) if len(ifos) == 1: msg = "This is a single ifo analysis. " msg += "Setting sigmasq to unity for all triggers." 
logging.info(msg) - found_data['sigmasq_%s' % ifo][:] = 1.0 + found_data['sigmasq_'+ifo][:] = 1.0 antenna = Detector(ifo) f_resp[ifo] = ppu.get_antenna_responses(antenna, found_data['ra'], found_data['dec'], @@ -166,15 +181,24 @@ def load_missed_found_injections(hdf_file, ifos, snr_threshold, bank_file, inj_sigma_mult = \ np.asarray([f_resp[ifo] * - found_data['sigmasq_%s' % ifo] for ifo in ifos]) + found_data['sigmasq_'+ifo] for ifo in ifos]) inj_sigma_tot = np.sum(inj_sigma_mult, axis=0) for ifo in ifos: - found_data['inj_sigma_mean_%s' % ifo] = np.mean( - found_data['sigmasq_%s' % ifo] * f_resp[ifo] / inj_sigma_tot) + found_data['inj_sigma_mean_'+ifo] = np.mean( + found_data['sigmasq_'+ifo] * f_resp[ifo] / inj_sigma_tot) # Close the hdf file inj_data.close() - return found_data, missed_data + logging.info("%d found injections.", len(found_data['mchirp'])) + logging.info("%d missed injections.", len(missed_data['mchirp'])) + logging.info("%d injections cut.", len(cut_data['mchirp'])) + + return found_data, missed_data, cut_data + + +def format_pvalue_str(pvalue, n_trials): + """Format p-value as a string.""" + return f'< {(1./n_trials):.3g}' if pvalue == 0 else f'{pvalue:.3g}' # ============================================================================= @@ -216,7 +240,9 @@ parser.add_argument("-C", "--cluster-window", action="store", type=float, default=0.1, help="The cluster window used " + "to cluster triggers in time.") ppu.pygrb_add_bestnr_cut_opt(parser) +ppu.pygrb_add_slide_opts(parser) opts = parser.parse_args() +ppu.slide_opts_helper(opts) init_logging(opts.verbose, format="%(asctime)s: %(levelname)s: %(message)s") @@ -266,84 +292,90 @@ for output_file in output_files: if not os.path.isdir(outdir): os.makedirs(outdir) -# Extract IFOs and vetoes -ifos, vetoes = ppu.extract_ifos_and_vetoes(offsource_file, opts.veto_files, - opts.veto_category) - -# Load triggers, time-slides, and segment dictionary -logging.info("Loading triggers.") -trig_data = ppu.load_triggers(offsource_file, ifos, None, - rw_snr_threshold=opts.newsnr_threshold) -logging.info("%d offsource triggers surviving reweighted SNR cut.", - len(trig_data['network/event_id'])) -logging.info("Loading timeslides.") -slide_dict = ppu.load_time_slides(offsource_file) -logging.info("Loading segments.") -segment_dict = ppu.load_segment_dict(offsource_file) +# Extract IFOs +ifos = ppu.extract_ifos(offsource_file) -# Calculate chirp masses of templates -logging.info('Loading triggers template masses') -bank_data = HFile(opts.bank_file, 'r') -mchirps = mchirp_from_mass1_mass2( - bank_data['mass1'][...], - bank_data['mass2'][...] 
-    )
+# Generate time-slides dictionary
+slide_dict = ppu.load_time_slides(offsource_file)
 
-# Construct trials
-logging.info("Constructing trials.")
-trial_dict = ppu.construct_trials(opts.seg_files, segment_dict,
-                                  ifos, slide_dict, vetoes)
-total_trials = sum([len(trial_dict[slide_id]) for slide_id in slide_dict])
-logging.info("%d trials generated.", total_trials)
-
-# Extract basic trigger properties and store as dictionaries
-trig_time, trig_snr, trig_bestnr = \
-    ppu.extract_basic_trig_properties(trial_dict, trig_data, slide_dict,
-                                      segment_dict, opts)
-# Calculate SNR and BestNR values and maxima
-time_veto_max_snr = {}
-time_veto_max_bestnr = {}
-for slide_id in slide_dict:
-    num_slide_segs = len(trial_dict[slide_id])
-    time_veto_max_snr[slide_id] = np.zeros(num_slide_segs)
-    time_veto_max_bestnr[slide_id] = np.zeros(num_slide_segs)
+# Generate segments dictionary
+segment_dict = ppu.load_segment_dict(offsource_file)
 
+# Construct trials removing vetoed times
+trial_dict, total_trials = ppu.construct_trials(opts.seg_files, segment_dict,
+                                                ifos, slide_dict,
+                                                opts.veto_file)
+
+# Load triggers (apply reweighted SNR cut, not vetoes)
+trig_data = ppu.load_data(offsource_file, ifos, data_tag='offsource',
+                          rw_snr_threshold=opts.newsnr_threshold,
+                          slide_id=opts.slide_id)
+
+# Extract needed trigger properties and store them as dictionaries
+# Based on trial_dict: if vetoes were applied, trig_* are the veto survivors
+# _av stands for after vetoes
+keys = ['network/end_time_gc', 'network/coherent_snr', 'network/reweighted_snr']
+trig_data_av = ppu.extract_trig_properties(
+    trial_dict,
+    trig_data,
+    slide_dict,
+    segment_dict,
+    keys
+)
+
+# Max SNR and BestNR values in each trial: these are stored in dictionaries
+# keyed by slide_id, as arrays indexed by trial number
+background_snr = {k: np.zeros(len(v)) for k, v in trial_dict.items()}
+background = {k: np.zeros(len(v)) for k, v in trial_dict.items()}
 for slide_id in slide_dict:
+    trig_times = trig_data_av[keys[0]][slide_id]
     for j, trial in enumerate(trial_dict[slide_id]):
-        trial_cut = (trial[0] <= trig_time[slide_id])\
-            & (trig_time[slide_id] < trial[1])
+        # True whenever the trigger is in the trial
+        trial_cut = (trial[0] <= trig_times) & (trig_times < trial[1])
         if not trial_cut.any():
             continue
         # Max SNR
-        time_veto_max_snr[slide_id][j] = \
-            max(trig_snr[slide_id][trial_cut])
+        background_snr[slide_id][j] = \
+            max(trig_data_av[keys[1]][slide_id][trial_cut])
         # Max BestNR
-        time_veto_max_bestnr[slide_id][j] = \
-            max(trig_bestnr[slide_id][trial_cut])
-        # Max SNR for triggers passing SBVs
-        sbv_cut = trig_bestnr[slide_id] != 0
-        if not (trial_cut & sbv_cut).any():
-            continue
+        background[slide_id][j] = \
+            max(trig_data_av[keys[2]][slide_id][trial_cut])
+
+# Max and median values of reweighted SNR,
+# and sorted (loudest in trial) reweighted SNR values
+max_bestnr, median_bestnr, sorted_bkgd =\
+    ppu.max_median_stat(slide_dict, background,
+                        trig_data_av[keys[2]], total_trials)
+assert total_trials == len(sorted_bkgd)
+
+# Median value of SNR
+_, median_snr, _ = ppu.max_median_stat(slide_dict, background_snr,
+                                       trig_data_av[keys[1]], total_trials)
 
-logging.info("SNR and bestNR maxima calculated.")
+logging.info("Background SNR and bestNR of trials calculated.")
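
With sorted_bkgd in hand, every p-value computed below is the fraction of off-source trials whose loudest surviving trigger beats the candidate, so the smallest resolvable value is 1/total_trials; the new format_pvalue_str helper defined earlier in this file makes the saturated case explicit. A quick sanity check of its two branches (toy numbers, 1000 trials):

    assert format_pvalue_str(0.0123, 1000) == '0.0123'
    assert format_pvalue_str(0., 1000) == '< 0.001'  # no louder background trial
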
 
-# Output details of loudest offsouce triggers, sorted by BestNR
+# Output details of loudest offsource triggers: only triggers compatible
+# with the trial_dict are considered
 offsource_trigs = []
 sorted_trigs = ppu.sort_trigs(trial_dict, trig_data, slide_dict, segment_dict)
 for slide_id in slide_dict:
-    offsource_trigs.extend(zip(trig_bestnr[slide_id],
-                               sorted_trigs[slide_id]))
-
+    offsource_trigs.extend(
+        zip(trig_data_av[keys[2]][slide_id], sorted_trigs[slide_id])
+    )
 offsource_trigs.sort(key=lambda element: element[0])
 offsource_trigs.reverse()
 
-# Median and max values of SNR and BestNR
-_, median_snr, _ = ppu.max_median_stat(slide_dict, time_veto_max_snr,
-                                       trig_snr, total_trials)
-max_bestnr, median_bestnr, full_time_veto_max_bestnr =\
-    ppu.max_median_stat(slide_dict, time_veto_max_bestnr, trig_bestnr,
-                        total_trials)
+# Calculate chirp masses of templates
+logging.info('Loading triggers template masses')
+bank_data = HFile(opts.bank_file, 'r')
+template_mchirps = mchirp_from_mass1_mass2(
+    bank_data['mass1'][...],
+    bank_data['mass2'][...]
+    )
 
+# =========================================
+# Output of loudest offsource triggers data
+# =========================================
 if lofft_outfile:
     # td: table data
     td = []
 
@@ -355,7 +387,7 @@ if lofft_outfile:
         trig_index = \
             np.where(trig_data['network/event_id'] == trig_id)[0][0]
         ifo_trig_index = {
-            ifo: np.where(trig_data['%s/event_id' % ifo] == trig_id)[0][0]
+            ifo: np.where(trig_data[ifo+'/event_id'] == trig_id)[0][0]
             for ifo in ifos
         }
         trig_slide_id = int(trig_data['network/slide_id'][trig_index])
@@ -370,18 +402,13 @@ if lofft_outfile:
             chunk_num = 'No trial'
 
         # Get FAP of trigger
-        num_trials_louder = 0
-        for slide_id in slide_dict:
-            for val in time_veto_max_bestnr[slide_id]:
-                if val > bestnr:
-                    num_trials_louder += 1
-        fap = num_trials_louder/total_trials
-        pval = '< %.3g' % (1./total_trials) if fap == 0 else '%.3g' % fap
+        pval = sum(sorted_bkgd > bestnr) / total_trials
+        pval = format_pvalue_str(pval, total_trials)
         d = [chunk_num, trig_slide_id, pval,
             trig_data['network/end_time_gc'][trig_index],
            bank_data['mass1'][trig_data['network/template_id'][trig_index]],
            bank_data['mass2'][trig_data['network/template_id'][trig_index]],
-             mchirps[trig_index],
+             template_mchirps[trig_data['network/template_id'][trig_index]],
            bank_data['spin1z'][trig_data['network/template_id'][trig_index]],
            bank_data['spin2z'][trig_data['network/template_id'][trig_index]],
             trig_data['network/ra'][trig_index],
             trig_data['network/dec'][trig_index],
             trig_data['network/coherent_snr'][trig_index],
             trig_data['network/my_network_chisq'][trig_index],
             trig_data['network/null_snr'][trig_index]]
-        d.extend([trig_data['%s/snr' % ifo][ifo_trig_index[ifo]]
+        d.extend([trig_data[ifo+'/snr'][ifo_trig_index[ifo]]
                   for ifo in ifos])
         d.extend([slide_dict[trig_slide_id][ifo] for ifo in ifos])
         d.append(bestnr)
         td.append(d)
 
@@ -399,8 +426,8 @@ if lofft_outfile:
     th = ['Trial', 'Slide Num', 'p-value', 'GPS time', 'Rec. m1', 'Rec. m2',
           'Rec. Mc', 'Rec. spin1z', 'Rec. spin2z', 'Rec. RA', 'Rec. Dec',
           'SNR', 'Chi^2', 'Null SNR']
-    th.extend(['%s SNR' % ifo for ifo in ifos])
-    th.extend(['%s time shift (s)' % ifo for ifo in ifos])
+    th.extend([ifo+' SNR' for ifo in ifos])
+    th.extend([ifo+' time shift (s)' for ifo in ifos])
     th.append('BestNR')
 
     # To ensure desired formatting in the h5 file and html table:
@@ -409,14 +436,14 @@ if lofft_outfile:
 
     # Write to h5 file
     logging.info("Writing %d loudest offsource triggers to h5 file.",
-                 len(td))
+                 len(td[0]))
     lofft_h5_fp = HFile(lofft_h5_outfile, 'w')
     for i, key in enumerate(th):
         lofft_h5_fp.create_dataset(key, data=td[i])
     lofft_h5_fp.close()
 
     # Write to html file
-    logging.info("Writing %d loudest triggers to html file.", len(td))
+    logging.info("Writing %d loudest triggers to html file.", len(td[0]))
 
     # To ensure desired formatting in the html table:
     # 2) convert the columns to numpy arrays
@@ -451,7 +478,7 @@ if lofft_outfile:
     # end of an observing run collectively
     # TODO: Needs a final place in the results webpage
     # np.savetxt('%s/bestnr_vs_fap_numbers.txt' %(outdir),
-    #            full_time_veto_max_bestnr, delimiter='/t')
+    #            sorted_bkgd, delimiter='/t')
 
 
 # =======================
@@ -460,8 +487,9 @@ if lofft_outfile:
 
 if onsource_file:
     # Get trigs
-    on_trigs = ppu.load_triggers(onsource_file, ifos, None,
-                                 rw_snr_threshold=opts.newsnr_threshold)
+    on_trigs = ppu.load_data(onsource_file, ifos, data_tag=None,
+                             rw_snr_threshold=opts.newsnr_threshold,
+                             slide_id='all')
 
     # Record loudest trig by BestNR
     loud_on_bestnr = 0
@@ -483,30 +511,21 @@ if onsource_file:
     td = []
 
     # Gather data
-    loud_on_fap = 1
     if loud_on_bestnr_trigs:
         trig_id = loud_on_bestnr_trigs
         trig_index = np.where(on_trigs['network/event_id'] == trig_id)[0][0]
         ifo_trig_index = {
-            ifo: np.where(on_trigs['%s/event_id' % ifo] == trig_id)[0][0]
+            ifo: np.where(on_trigs[ifo+'/event_id'] == trig_id)[0][0]
             for ifo in ifos
         }
-        num_trials_louder = 0
-        tot_off_snr = np.array([])
-        for slide_id in slide_dict:
-            num_trials_louder += sum(time_veto_max_bestnr[slide_id] >
-                                     loud_on_bestnr)
-            tot_off_snr = np.concatenate([tot_off_snr,
-                                          time_veto_max_bestnr[slide_id]])
-        fap = num_trials_louder/total_trials
-        fap_test = sum(tot_off_snr > loud_on_bestnr)/total_trials
-        pval = '< %.3g' % (1./total_trials) if fap == 0 else '%.3g' % fap
-        loud_on_fap = fap
+        pval = sum(sorted_bkgd > loud_on_bestnr)/total_trials
+        pval = format_pvalue_str(pval, total_trials)
         d = [pval,
             on_trigs['network/end_time_gc'][trig_index],
             bank_data['mass1'][on_trigs['network/template_id'][trig_index]],
             bank_data['mass2'][on_trigs['network/template_id'][trig_index]],
-            mchirps[on_trigs['network/template_id'][trig_index]],
+            template_mchirps[on_trigs['network/template_id'][trig_index]],
             bank_data['spin1z'][on_trigs['network/template_id'][trig_index]],
             bank_data['spin2z'][on_trigs['network/template_id'][trig_index]],
             on_trigs['network/ra'][trig_index],
             on_trigs['network/dec'][trig_index],
             on_trigs['network/coherent_snr'][trig_index],
             on_trigs['network/my_network_chisq'][trig_index],
             on_trigs['network/null_snr'][trig_index]] + \
-            [on_trigs['%s/snr' % ifo][ifo_trig_index[ifo]] for ifo in ifos] + \
+            [on_trigs[ifo+'/snr'][ifo_trig_index[ifo]] for ifo in ifos] + \
             [loud_on_bestnr]
         td.append(d)
     else:
 
@@ -524,7 +543,7 @@ if onsource_file:
     # Table header
     th = ['p-value', 'GPS time', 'Rec. m1', 'Rec. m2', 'Rec. Mc',
           'Rec. spin1z', 'Rec. spin2z', 'Rec. RA', 'Rec. Dec', 'SNR', 'Chi^2',
-          'Null SNR'] + ['%s SNR' % ifo for ifo in ifos] + ['BestNR']
+          'Null SNR'] + [ifo+' SNR' for ifo in ifos] + ['BestNR']
 
     td = list(zip(*td))
 
@@ -555,51 +574,45 @@ if onsource_file:
         pycbc.results.save_fig_with_metadata(str(html_table), lont_outfile,
                                              **kwds)
 
-else:
-    tot_off_snr = np.array([])
-    for slide_id in slide_dict:
-        tot_off_snr = np.concatenate([tot_off_snr,
-                                      time_veto_max_bestnr[slide_id]])
-    med_snr = np.median(tot_off_snr)
-    fap = sum(tot_off_snr > med_snr)/total_trials
-
 # =======================
 # Post-process injections
 # =======================
 
 if found_missed_file is not None:
-    found_injs, missed_injs = load_missed_found_injections(
-        found_missed_file, ifos, opts.newsnr_threshold, bank_data,
-        background_bestnrs=full_time_veto_max_bestnr)
-    logging.info("Missed/found injections/triggers loaded.")
-    logging.info("%d found injections found.", len(found_injs['mchirp']))
-    logging.info("%d missed injections found.", len(missed_injs['mchirp']))
+    # Load injections applying reweighted SNR cut
+    found_injs, missed_injs, cut_injs = load_missed_found_injections(
+        found_missed_file, ifos, bank_data,
+        snr_threshold=opts.newsnr_threshold,
+        background_bestnrs=sorted_bkgd
+    )
+
+    # Split into injections found surviving vetoes and ones found but vetoed
+    found_after_vetoes, vetoed, *_ = ppu.apply_vetoes_to_found_injs(
+        found_missed_file,
+        found_injs,
+        ifos,
+        veto_file=opts.veto_file
+    )
 
     # Construct conditions for injection:
     # 1) found louder than background,
-    zero_fap = found_injs['bestnr'] > max_bestnr
+    zero_fap = found_after_vetoes['bestnr'] > max_bestnr
 
     # 2) found (bestnr > 0) but not louder than background (non-zero FAP)
-    nonzero_fap = ~zero_fap & (found_injs['bestnr'] != 0)
-
-    # 3) missed after being recovered (i.e., vetoed)
-    # -- > question: is there ever another way this happens other than veto?
-    # vetoed_trigs = (~zero_fap) & (~nonzero_fap)
-    vetoed_trigs = found_injs['bestnr'] == 0
+    nonzero_fap = ~zero_fap & (found_after_vetoes['bestnr'] != 0)
 
-    logging.info("%d found injections analysed.", len(found_injs['mchirp']))
+    # 3) missed after being recovered: vetoed (these have bestnr = 0)
 
     # Avoids a problem with formatting in the non-static html output file
-    missed_na = [-0] * len(missed_injs['mchirp'])
-
-    logging.info("%d missed injections analysed.", len(missed_injs['mchirp']))
+    #missed_na = [-0] * len(missed_injs['mchirp'])
 
     # Write quiet triggers to file
     sites = [ifo[0] for ifo in ifos]
-    th = ['Dist'] + ['Eff. Dist. %s' % site for site in sites] +\
+    th = ['Dist'] + ['Eff. Dist. '+site for site in sites] +\
        ['GPS time', 'GPS time - Rec. Time'] +\
        ['Inj. m1', 'Inj. m2', 'Inj. Mc', 'Rec. m1', 'Rec. m2', 'Rec. Mc',
         'Inj. inc', 'Inj. RA', 'Inj. Dec', 'Rec. RA', 'Rec. Dec', 'SNR',
         'Chi^2', 'Null SNR'] +\
        ['SNR '+ifo for ifo in ifos] +\
        ['BestNR', 'Inj S1x', 'Inj S1y', 'Inj S1z', 'Inj S2x', 'Inj S2y',
         'Inj S2z', 'Rec S1z', 'Rec S2z']
 
@@ -617,43 +630,28 @@ if found_missed_file is not None:
                            '##.##', '##.##', '##.##', '##.##', '##.##',
                            '##.##', '##.##', '##.##'])
 
-    sngl_snr_keys = ['snr_%s' % ifo for ifo in ifos]
+    sngl_snr_keys = ['snr_'+ifo for ifo in ifos]
     keys = ['distance']
-    keys += ['eff_dist_%s' % ifo for ifo in ifos]
+    keys += ['eff_dist_'+ifo for ifo in ifos]
     keys += ['tc', 'time_diff', 'mass1', 'mass2', 'mchirp', 'rec_mass1',
             'rec_mass2', 'rec_mchirp', 'inclination', 'ra', 'dec', 'rec_ra',
             'rec_dec', 'coherent_snr', 'chisq', 'null_snr']
     keys += sngl_snr_keys
     keys += ['bestnr', 'spin1x', 'spin1y', 'spin1z', 'spin2x', 'spin2y',
             'spin2z', 'rec_spin1z', 'rec_spin2z']
-    # The following parameters are available only for recovered injections
-    na_keys = ['time_diff', 'rec_mass1', 'rec_mass2', 'rec_mchirp',
-               'rec_spin1z', 'rec_spin2z', 'rec_ra', 'rec_dec', 'coherent_snr',
-               'chisq', 'null_snr', 'bestnr']
-    na_keys += sngl_snr_keys
-    td = []
-    for key in keys:
-        if key in na_keys:
-            td += [np.concatenate((found_injs[key][nonzero_fap],
-                                   found_injs[key][vetoed_trigs],
-                                   missed_na))]
-        else:
-            td += [np.concatenate((found_injs[key][nonzero_fap],
-                                   found_injs[key][vetoed_trigs],
-                                   missed_injs[key]))]
+    td = [found_after_vetoes[key][nonzero_fap] for key in keys]
     td = list(zip(*td))
     td.sort(key=lambda elem: elem[0])
+    logging.info("Writing %d quiet-found injections to h5 and html files.",
+                 len(td))
     td = list(zip(*td))
 
     # Write to h5 file
-    logging.info("Writing %d quiet-found injections to h5 file.", len(td))
     with HFile(qf_h5_outfile, 'w') as qf_h5_fp:
         for i, key in enumerate(th):
             qf_h5_fp.create_dataset(key, data=td[i])
 
     # Write to html file
-    logging.info("Writing %d quiet-found injections to html file.",
-                 len(td))
     td = [np.asarray(d) for d in td]
     html_table = pycbc.results.html_table(td, th,
                                           format_strings=format_strings,
                                           page_size=20)
@@ -665,15 +663,12 @@ if found_missed_file is not None:
     pycbc.results.save_fig_with_metadata(str(html_table), qf_outfile,
                                          **kwds)
 
-    # Write to html file
-    t_missed = []
-    for key in keys:
-        t_missed += [found_injs[key][vetoed_trigs]]
+    # Write missed-found (vetoed or cut) injections to html file
+    t_missed = [np.concatenate((vetoed[key], cut_injs[key])) for key in keys]
     t_missed = list(zip(*t_missed))
     t_missed.sort(key=lambda elem: elem[0])
     logging.info("Writing %d missed-found injections to html file.",
                  len(t_missed))
-    t_missed = zip(*t_missed)
     t_missed = [np.asarray(d) for d in t_missed]
     html_table = pycbc.results.html_table(t_missed, th,
                                           format_strings=format_strings,
                                           page_size=20)
     kwds = {'title': "Missed found injections",
             'caption': "Recovered parameters and statistic values of \
-            injections that are recovered, but downwieghted to BestNR = 0 \
-            (i.e., vetoed).",
+            injections that are recovered, but with reweighted SNR \
+            below threshold or vetoed.",
             'cmd': ' '.join(sys.argv),
             }
     pycbc.results.save_fig_with_metadata(str(html_table), mf_outfile, **kwds)
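
A closing note on additional_injection_data, touched at the top of this second diff: the network 'eff_dist' it stores is the reciprocal of the summed reciprocals of the per-detector effective distances, i.e. a harmonic-sum combination dominated by the most sensitive detector. A worked toy example of exactly that arithmetic:

    # Made-up per-ifo effective distances (Mpc)
    eff_dists = {'H1': 120., 'L1': 90., 'V1': 300.}
    # Reciprocal of the sum of reciprocals, as in additional_injection_data
    eff_dist_net = 1.0 / sum(1.0 / d for d in eff_dists.values())
    print(round(eff_dist_net, 1))  # 43.9 -- smaller than any single-ifo value
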