From eb436df3d1efc6ae7112e999936e3c16ef5b9d1f Mon Sep 17 00:00:00 2001
From: Jean-Philippe Lenain
Date: Wed, 2 Oct 2024 11:24:34 +0200
Subject: [PATCH] Automatic DQM jobs on DIRAC (#151)

* Add script to be used as cronjob to automatically submit DQM jobs on DIRAC once a run has been transferred from CEA to DIRAC

* Check if a DQM job has already been submitted for a given NectarCAM run

* Activate proper conda environment when starting the script

* Cleaning

* Add a check whether a run is already present in the ZODB database before parsing it.

* Add cronjob script to parse DQM results and feed the ZODB database.

* Process a list of runs instead of a single run at once.

* Adapt cronjob to pass a list of runs as argument to DQM parser script.

* Do not fail when a DQM result could not be fetched from DIRAC, but instead skip to next DQM run.

* Change location of log file.

* Automatically renew DIRAC proxy

---------

Co-authored-by: Jean-Philippe Lenain
---
Review notes (ignored by git-am, not part of the commit message):
- cronjob_launchDQM.sh: the failed-cd handler now uses a `{ ...; }` group, since
  `exit` inside `( ... )` only leaves the subshell and the script kept running.
- cronjob_launchDQM.sh: the submit command is stored in an array and run
  directly instead of through `eval`; the logged "Running: ..." text is the same.
- cronjob_launchDQM.sh: the remote file count anchors the run number with
  escaped dots so that it agrees with the exact local `find` glob.
- parse_dqm_fits_file.py: the FITS file is opened in a `with` block so that it
  is closed for every run of the loop; DB keys are kept in a set.
- All changes are line-count neutral; the `index` post-image hashes are those
  of the original commit.

 .../jlenain/cronjob_parse_dqm_fits_file.sh    |  23 +++
 .../dqm_job_submitter/cronjob_launchDQM.sh    |  46 ++++++
 .../jlenain/parse_dqm_fits_file.py            | 140 ++++++++++--------
 3 files changed, 151 insertions(+), 58 deletions(-)
 create mode 100755 src/nectarchain/user_scripts/jlenain/cronjob_parse_dqm_fits_file.sh
 create mode 100755 src/nectarchain/user_scripts/jlenain/dqm_job_submitter/cronjob_launchDQM.sh

diff --git a/src/nectarchain/user_scripts/jlenain/cronjob_parse_dqm_fits_file.sh b/src/nectarchain/user_scripts/jlenain/cronjob_parse_dqm_fits_file.sh
new file mode 100755
index 00000000..777b4d6c
--- /dev/null
+++ b/src/nectarchain/user_scripts/jlenain/cronjob_parse_dqm_fits_file.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+# -*- coding: utf-8 -*-
+#
+# This script is to be used as a cronjob on the nectarcam-dqm-rw VM on the LPNHE OpenStack cloud platform, in order to feed the ZODB database from DQM runs on DIRAC.
+
+# Log everything to $LOGFILE
+LOGFILE=${0%".sh"}_$(date +%F).log
+LOGFILE=$HOME/log/$(basename "$LOGFILE")
+exec 1>"$LOGFILE" 2>&1
+
+. "/opt/conda/etc/profile.d/conda.sh"
+conda activate nectar-dev
+
+# Initialize DIRAC proxy from user certificate:
+if ! dirac-proxy-init -M -g cta_nectarcam --pwstdin < ~/.dirac.pwd; then
+    echo "DIRAC proxy initialization failed..."
+    exit 1
+fi
+
+remoteParentDir="/vo.cta.in2p3.fr/user/j/jlenain/nectarcam/dqm"
+nectarchainScriptDir="/opt/cta/nectarchain/src/nectarchain/user_scripts/jlenain"
+
+python "${nectarchainScriptDir}/parse_dqm_fits_file.py" -r $(dls "${remoteParentDir}" | grep -ve "/vo.cta" | awk -F. '{print $1}' | awk -Fn '{print $2}' | tr '\n' ' ')
diff --git a/src/nectarchain/user_scripts/jlenain/dqm_job_submitter/cronjob_launchDQM.sh b/src/nectarchain/user_scripts/jlenain/dqm_job_submitter/cronjob_launchDQM.sh
new file mode 100755
index 00000000..932d3c48
--- /dev/null
+++ b/src/nectarchain/user_scripts/jlenain/dqm_job_submitter/cronjob_launchDQM.sh
@@ -0,0 +1,46 @@
+#!/usr/bin/env bash
+# -*- coding: utf-8 -*-
+#
+# Author: Jean-Philippe Lenain
+#
+# Script as a cronjob to dynamically launch NectarCAM DQM runs on DIRAC after data transfer, to be run once a day on sedipccaa23 in CEA/Irfu.
+
+# Log everything to $LOGFILE
+LOGFILE=${0%".sh"}_$(date +%F).log
+LOGFILE=$HOME/log/$(basename "$LOGFILE")
+exec 1>"$LOGFILE" 2>&1
+
+source /opt/cta/mambaforge/etc/profile.d/conda.sh
+conda activate ctadirac
+
+localParentDir="/data/nvme/ZFITS"
+remoteParentDir="/vo.cta.in2p3.fr/nectarcam"
+nectarchainScriptDir="$HOME/local/src/python/cta-observatory/nectarchain/src/nectarchain/user_scripts/jlenain/dqm_job_submitter"
+
+cd "$nectarchainScriptDir" || { echo "Failed to cd into ${nectarchainScriptDir}, exiting..."; exit 1; }
+
+for run in $(find "${localParentDir}" -type f -name "NectarCAM*.fits.fz" | awk -F. '{print $2}' | awk -Fn '{print $2}' | sort | uniq); do
+    echo "Probing files for run ${run}"
+    nbLocalFiles=$(find "${localParentDir}" -type f -name "NectarCAM.Run${run}.????.fits.fz" | wc -l)
+    echo " Found $nbLocalFiles local files for run $run"
+    nbRemoteFiles=$(dfind "${remoteParentDir}" | grep -e "NectarCAM\.Run${run}\." | grep --count -e "fits.fz")
+    echo " Found $nbRemoteFiles remote files on DIRAC for run $run"
+    # If the numbers of local and remote files match, attempt to launch a DQM run
+    if [ "${nbLocalFiles}" -eq "${nbRemoteFiles}" ]; then
+        echo " Run $run: number of local and remote files matching, will attempt to submit a DQM job"
+        # Has this DQM run already been submitted ?
+        if [ "$(dstat | grep --count -e "NectarCAM DQM run ${run}")" -eq 0 ]; then
+            yyyymmdd=$(find "${localParentDir}" -type f -name "NectarCAM.Run${run}.????.fits.fz" | head -n 1 | awk -F/ '{print $6}')
+            yyyy=${yyyymmdd:0:4}
+            mm=${yyyymmdd:4:2}
+            dd=${yyyymmdd:6:2}
+            cmd=(python submit_dqm_processor.py -d "${yyyy}-${mm}-${dd}" -r "$run")
+            echo "Running: ${cmd[*]}"
+            "${cmd[@]}"
+        else
+            echo " DQM job for run $run already submitted, either ongoing or failed, skipping it."
+        fi
+    else
+        echo " Run $run is not yet complete on DIRAC, will wait another day before launching a DQM job on it."
+    fi
+done
diff --git a/src/nectarchain/user_scripts/jlenain/parse_dqm_fits_file.py b/src/nectarchain/user_scripts/jlenain/parse_dqm_fits_file.py
index c88ac69c..8e78636f 100644
--- a/src/nectarchain/user_scripts/jlenain/parse_dqm_fits_file.py
+++ b/src/nectarchain/user_scripts/jlenain/parse_dqm_fits_file.py
@@ -28,11 +28,11 @@
     formatter_class=argparse.ArgumentDefaultsHelpFormatter,
 )
 parser.add_argument(
-    "-r",
-    "--run",
-    default=None,
-    help="process a specific run.",
-    type=str,
+    "-f",
+    "--force",
+    default=False,
+    action="store_true",
+    help="if this run is already in the DB, force re-parsing its DQM output again.",
 )
 parser.add_argument(
     "-p",
@@ -41,62 +41,86 @@
     help="path on DIRAC where to grab DQM outputs (optional).",
     type=str,
 )
+parser.add_argument(
+    "-r",
+    "--runs",
+    nargs="+",
+    default=None,
+    help="process a specific run or a list of runs.",
+)
 args = parser.parse_args()
 
-if args.run is None:
-    logger.critical("A run number should be provided.")
+if args.runs is None:
+    logger.critical("At least one run number should be provided.")
     sys.exit(1)
 
-lfn = f"{args.path}/NectarCAM_DQM_Run{args.run}.tar.gz"
-
-if not os.path.exists(os.path.basename(lfn)):
-    DIRAC.initialize()
-
-    dirac = Dirac()
-
-    dirac.getFile(
-        lfn=lfn,
-        destDir=f".",
-        printOutput=True,
-    )
-
-with tarfile.open(os.path.basename(lfn), "r") as tar:
-    tar.extractall(".")
-
-fits_file = (
-    f"./NectarCAM_DQM_Run{args.run}/output/NectarCAM_Run{args.run}/"
-    f"NectarCAM_Run{args.run}_calib/NectarCAM_Run{args.run}_Results.fits"
-)
-
-hdu = fits.open(fits_file)
-
-# Explore FITS file structure
-hdu.info()
-
-outdict = dict()
-
-for h in range(1, len(hdu)):
-    extname = hdu[h].header["EXTNAME"]
-    outdict[extname] = dict()
-    for i in range(hdu[extname].header["TFIELDS"]):
-        keyname = hdu[extname].header[f"TTYPE{i+1}"]
-        outdict[extname][keyname] = hdu[extname].data[keyname]
-
-try:
-    db = DQMDB(read_only=False)
-    db.insert(f"NectarCAM_Run{args.run}", outdict)
-    db.commit_and_close()
-except ZEO.Exceptions.ClientDisconnected as e:
-    logger.critical(f"Impossible to feed the ZODB data base. Received error: {e}")
-
-# Remove DQM archive file and directory
-try:
-    os.remove(f"NectarCAM_DQM_Run{args.run}.tar.gz")
-except OSError:
-    logger.warning(
-        f"Could not remove NectarCAM_DQM_Run{args.run}.tar.gz or it does not exist"
+db_read = DQMDB(read_only=True)
+db_read_keys = set(db_read.root.keys())
+db_read.abort_and_close()
+
+for run in args.runs:
+    if not args.force and f"NectarCAM_Run{run}" in db_read_keys:
+        logger.warning(
+            f'The run {run} is already present in the DB, will not parse this DQM run, or consider forcing it with the "--force" option.'
+        )
+        continue
+
+    lfn = f"{args.path}/NectarCAM_DQM_Run{run}.tar.gz"
+
+    if not os.path.exists(os.path.basename(lfn)):
+        DIRAC.initialize()
+
+        dirac = Dirac()
+
+        dirac.getFile(
+            lfn=lfn,
+            destDir=".",
+            printOutput=True,
+        )
+
+    try:
+        with tarfile.open(os.path.basename(lfn), "r") as tar:
+            tar.extractall(".")
+    except FileNotFoundError as e:
+        logger.warning(
+            f"Could not fetch DQM results from DIRAC for run {run}, received error {e}, skipping this run..."
+        )
+        continue
+
+    fits_file = (
+        f"./NectarCAM_DQM_Run{run}/output/NectarCAM_Run{run}/"
+        f"NectarCAM_Run{run}_calib/NectarCAM_Run{run}_Results.fits"
     )
 
-dirpath = Path(f"./NectarCAM_DQM_Run{args.run}")
-if dirpath.exists() and dirpath.is_dir():
-    shutil.rmtree(dirpath)
+    outdict = dict()
+
+    # Use a context manager so that the FITS file is closed for every run
+    with fits.open(fits_file) as hdu:
+        # Explore FITS file structure
+        hdu.info()
+
+        for h in range(1, len(hdu)):
+            extname = hdu[h].header["EXTNAME"]
+            outdict[extname] = dict()
+            for i in range(hdu[extname].header["TFIELDS"]):
+                keyname = hdu[extname].header[f"TTYPE{i+1}"]
+                outdict[extname][keyname] = hdu[extname].data[keyname]
+
+    try:
+        db = DQMDB(read_only=False)
+        db.insert(f"NectarCAM_Run{run}", outdict)
+        db.commit_and_close()
+    except ZEO.Exceptions.ClientDisconnected as e:
+        logger.critical(f"Impossible to feed the ZODB data base. Received error: {e}")
+
+    # Remove DQM archive file and directory
+    try:
+        os.remove(f"NectarCAM_DQM_Run{run}.tar.gz")
+    except OSError:
+        logger.warning(
+            f"Could not remove NectarCAM_DQM_Run{run}.tar.gz or it does not exist"
+        )
+
+    dirpath = Path(f"./NectarCAM_DQM_Run{run}")
+    if dirpath.exists() and dirpath.is_dir():
+        shutil.rmtree(dirpath)