diff --git a/.flake8 b/.flake8 deleted file mode 100644 index 5f6aab9..0000000 --- a/.flake8 +++ /dev/null @@ -1,6 +0,0 @@ -[flake8] -ignore = C901,E262,E266,N804,W504,E251,ANN101,ANN002,ANN003,ANN201,ANN204 -show-source = true -max-line-length = 160 -select=A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z -max-complexity = 15 diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 4b040aa..01fbfb2 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -29,14 +29,10 @@ jobs: ${{ runner.os }}-pip- - name: Install Hatch uses: pypa/hatch@a3c83ab3d481fbc2dc91dd0088628817488dd1d5 - - name: Install application and deps - run: | - python -m pip install --upgrade pip - pip install . - name: Linting run: | - pip install flake8 - flake8 . + pip install ruff + ruff check - name: Run tests run: | hatch run test:pytest diff --git a/bin/validate-raythena-job.py b/bin/validate-raythena-job.py index b6ec34d..435c9c0 100644 --- a/bin/validate-raythena-job.py +++ b/bin/validate-raythena-job.py @@ -1,20 +1,18 @@ #!/usr/bin/env python -from __future__ import print_function -from array import array import argparse import json import os.path as path - +from array import array import ROOT def get_event_numbers(filename): f = ROOT.TFile.Open(filename) tree = f.Get("POOLCollectionTree") - event_number = array('Q', [0]) + event_number = array("Q", [0]) n_entries = tree.GetEntries() - tree.SetBranchAddress('EventNumber', event_number) + tree.SetBranchAddress("EventNumber", event_number) event_numbers = list() for n in range(n_entries): tree.GetEntry(n) @@ -23,10 +21,10 @@ def get_event_numbers(filename): def validate_job(job_dir, job_state_file): - with open(job_state_file, 'r') as f: + with open(job_state_file) as f: job_state = json.load(f) merged_input_files = job_state["merged"] - merged_output_files = set([list(x.keys())[0] for x in merged_input_files.values()]) + merged_output_files = set([next(iter(x.keys())) for x in 
merged_input_files.values()]) event_numbers = set() for output_file in merged_output_files: output_file_abs = path.join(job_dir, "final", output_file) @@ -37,14 +35,30 @@ def validate_job(job_dir, job_state_file): current_event_numbers = get_event_numbers(output_file_abs) unique_current_event_numbers = set(current_event_numbers) if len(unique_current_event_numbers) != len(current_event_numbers): - print("Duplicate events in file " + output_file + "(" + str(len(current_event_numbers) - len(unique_current_event_numbers)) + "): ") + print( + "Duplicate events in file " + + output_file + + "(" + + str(len(current_event_numbers) - len(unique_current_event_numbers)) + + "): " + ) exit(1) print(str(len(current_event_numbers)) + " events in file " + output_file) if not unique_current_event_numbers.isdisjoint(event_numbers): - print("Found duplicate events in file " + output_file + ": " + str(unique_current_event_numbers & event_numbers)) + print( + "Found duplicate events in file " + + output_file + + ": " + + str(unique_current_event_numbers & event_numbers) + ) exit(1) event_numbers |= unique_current_event_numbers - print("No duplicate found. # events merged: " + str(len(event_numbers)) + ", # of files: " + str(len(merged_output_files))) + print( + "No duplicate found. 
# events merged: " + + str(len(event_numbers)) + + ", # of files: " + + str(len(merged_output_files)) + ) def main(): diff --git a/example/standalone_ray_test_hello_world.py b/example/standalone_ray_test_hello_world.py index afac388..ee827a1 100755 --- a/example/standalone_ray_test_hello_world.py +++ b/example/standalone_ray_test_hello_world.py @@ -9,16 +9,16 @@ import argparse import os import platform +import time from pprint import pprint import ray -import time def build_nodes_resource_list(redis_ip: str): nodes = ray.nodes() resource_list = list() for node in nodes: - naddr = node['NodeManagerAddress'] + naddr = node["NodeManagerAddress"] if naddr == redis_ip: continue else: @@ -27,7 +27,7 @@ def build_nodes_resource_list(redis_ip: str): @ray.remote -class actor(): +class actor: def __init__(self) -> None: self.pid = os.getpid() self.hostname = platform.node() @@ -42,8 +42,11 @@ def ping(self): def main(redis_ip: str, redis_port: str, redis_password: str): redis_address = f"{redis_ip}:{redis_port}" - ray.init(ignore_reinit_error=True, - address="%s" % redis_address, _redis_password="%s" % redis_password) + ray.init( + ignore_reinit_error=True, + address=f"{redis_address}", + _redis_password=f"{redis_password}", + ) # show the ray cluster print(f"Ray Cluster resources : {ray.cluster_resources()}") @@ -73,10 +76,10 @@ def main(redis_ip: str, redis_port: str, redis_password: str): if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Wait on ray head node or workers to connect') - parser.add_argument('--redis-ip', default="%s" % (os.environ["RAYTHENA_RAY_HEAD_IP"])) - parser.add_argument('--redis-port', default="%s" % (os.environ["RAYTHENA_RAY_REDIS_PORT"])) - parser.add_argument('--redis-password', default=os.environ["RAYTHENA_RAY_REDIS_PASSWORD"]) + parser = argparse.ArgumentParser(description="Wait on ray head node or workers to connect") + parser.add_argument("--redis-ip", default="{}".format(os.environ["RAYTHENA_RAY_HEAD_IP"])) + 
parser.add_argument("--redis-port", default="{}".format(os.environ["RAYTHENA_RAY_REDIS_PORT"])) + parser.add_argument("--redis-password", default=os.environ["RAYTHENA_RAY_REDIS_PASSWORD"]) args = parser.parse_args() print(f"args : {args}") main(args.redis_ip, args.redis_port, args.redis_password) diff --git a/pyproject.toml b/pyproject.toml index 113b8b4..11cbe51 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,9 +45,11 @@ dependencies = [ [tool.ruff] -line-length = 80 +line-length = 120 indent-width = 4 +target-version = "py39" + [tool.ruff.lint] select = [ @@ -62,5 +64,10 @@ select = [ # flake8-simplify "SIM", # isort - "I" + "I", + # Ruff + "RUF", ] + +[tool.ruff.lint.isort] +no-lines-before = ["third-party", "first-party", "standard-library"] diff --git a/src/raythena/__init__.py b/src/raythena/__init__.py index 70edd66..63620c5 100644 --- a/src/raythena/__init__.py +++ b/src/raythena/__init__.py @@ -1,5 +1,6 @@ try: from . import _version + __version__ = _version.__version__ except: # noqa: E722 __version__ = "0.0.0" diff --git a/src/raythena/actors/esworker.py b/src/raythena/actors/esworker.py index 4361f36..f13e5f6 100644 --- a/src/raythena/actors/esworker.py +++ b/src/raythena/actors/esworker.py @@ -1,89 +1,107 @@ +import datetime import json import os import re import shutil -import time -from typing import Union, Tuple, Sequence, Any, Mapping, Optional - -import datetime import threading - +import time +from collections.abc import Mapping, Sequence from socket import gethostname from time import sleep - +from typing import Any, Optional, Union import ray - -from raythena.utils.logging import disable_stdout_logging, make_logger, log_to_file -from raythena.utils.config import Config -from raythena.utils.eventservice import EventRangeRequest, Messages, EventRangeUpdate, PandaJob, EventRange -from raythena.utils.exception import IllegalWorkerState, StageInFailed, StageOutFailed, WrappedException, BaseRaythenaException -from raythena.utils.ray import 
get_node_ip -# from raythena.utils.timing import CPUMonitor from raythena.actors.payloads.basePayload import BasePayload from raythena.actors.payloads.eventservice.esPayload import ESPayload - from raythena.actors.payloads.eventservice.pilothttp import PilotHttpPayload - +from raythena.utils.config import Config +from raythena.utils.eventservice import ( + EventRange, + EventRangeRequest, + EventRangeUpdate, + Messages, + PandaJob, +) +from raythena.utils.exception import ( + BaseRaythenaException, + IllegalWorkerState, + StageInFailed, + StageOutFailed, + WrappedException, +) +from raythena.utils.logging import ( + disable_stdout_logging, + log_to_file, + make_logger, +) +from raythena.utils.ray import get_node_ip # Type returned by the worker methods to the driver -WorkerResponse = Tuple[str, int, Any] +WorkerResponse = tuple[str, int, Any] + +READY_FOR_JOB = 0 # initial state, before the first job request +JOB_REQUESTED = 1 # job has been requested to the driver, waiting for result +READY_FOR_EVENTS = 2 # ready to request new events for the current job +EVENT_RANGES_REQUESTED = 3 # event ranges have been requested to the driver, waiting for result +FINISHING_LOCAL_RANGES = 4 # do not request additional ranges, will move to STAGE_OUT once local cache is empty +PROCESSING = 5 # currently processing event ranges +FINISHING = 6 # Performing cleanup of resources, preparing final server update +DONE = 7 # Actor has finished processing job +STAGE_IN = 8 # Staging-in data. 
+STAGE_OUT = 9 # Staging-out data + +STATES_NAME = { + READY_FOR_JOB: "READY_FOR_JOB", + JOB_REQUESTED: "JOB_REQUESTED", + READY_FOR_EVENTS: "READY_FOR_EVENTS", + EVENT_RANGES_REQUESTED: "EVENT_RANGES_REQUESTED", + FINISHING_LOCAL_RANGES: "FINISHING_LOCAL_RANGES", + PROCESSING: "PROCESSING", + FINISHING: "FINISHING", + DONE: "DONE", + STAGE_IN: "STAGE_IN", + STAGE_OUT: "STAGE_OUT", +} + +# authorize state transition from x to y if y in TRANSITION[X] +TRANSITIONS = { + READY_FOR_JOB: [JOB_REQUESTED], + JOB_REQUESTED: [STAGE_IN, DONE], + STAGE_IN: [READY_FOR_EVENTS], + READY_FOR_EVENTS: [EVENT_RANGES_REQUESTED, STAGE_OUT], + EVENT_RANGES_REQUESTED: [FINISHING_LOCAL_RANGES, PROCESSING, STAGE_OUT], + FINISHING_LOCAL_RANGES: [STAGE_OUT], + PROCESSING: [READY_FOR_EVENTS, STAGE_OUT], + STAGE_OUT: [FINISHING], + FINISHING: [DONE], + DONE: [READY_FOR_JOB], +} @ray.remote(num_cpus=1, max_restarts=1, max_task_retries=3) -class ESWorker(object): +class ESWorker: """ Actor running on HPC compute node. Each actor will start a payload plugin which handle the job processing as well as the communication with the job processing framework, Athena or any intermediary layer such as pilot 2. A worker instance is a stateful object which basically transitions from job request -> stage-in -> processing <-> ranges request -> stage-out -> done - Allowed transition are defined by ESWorker.TRANSITIONS + Allowed transition are defined by TRANSITIONS The current state defines what message will be sent to the driver when it requests the worker state using get_message(). The driver needs to frequently call get_message() and process requests from the worker, allowing the worker to progress in the job processing. 
""" - READY_FOR_JOB = 0 # initial state, before the first job request - JOB_REQUESTED = 1 # job has been requested to the driver, waiting for result - READY_FOR_EVENTS = 2 # ready to request new events for the current job - EVENT_RANGES_REQUESTED = 3 # event ranges have been requested to the driver, waiting for result - FINISHING_LOCAL_RANGES = 4 # do not request additional ranges, will move to STAGE_OUT once local cache is empty - PROCESSING = 5 # currently processing event ranges - FINISHING = 6 # Performing cleanup of resources, preparing final server update - DONE = 7 # Actor has finished processing job - STAGE_IN = 8 # Staging-in data. - STAGE_OUT = 9 # Staging-out data - - STATES_NAME = { - READY_FOR_JOB: "READY_FOR_JOB", - JOB_REQUESTED: "JOB_REQUESTED", - READY_FOR_EVENTS: "READY_FOR_EVENTS", - EVENT_RANGES_REQUESTED: "EVENT_RANGES_REQUESTED", - FINISHING_LOCAL_RANGES: "FINISHING_LOCAL_RANGES", - PROCESSING: "PROCESSING", - FINISHING: "FINISHING", - DONE: "DONE", - STAGE_IN: "STAGE_IN", - STAGE_OUT: "STAGE_OUT" - } - - # authorize state transition from x to y if y in TRANSITION[X] - TRANSITIONS = { - READY_FOR_JOB: [JOB_REQUESTED], - JOB_REQUESTED: [STAGE_IN, DONE], - STAGE_IN: [READY_FOR_EVENTS], - READY_FOR_EVENTS: [EVENT_RANGES_REQUESTED, STAGE_OUT], - EVENT_RANGES_REQUESTED: [FINISHING_LOCAL_RANGES, PROCESSING, STAGE_OUT], - FINISHING_LOCAL_RANGES: [STAGE_OUT], - PROCESSING: [READY_FOR_EVENTS, STAGE_OUT], - STAGE_OUT: [FINISHING], - FINISHING: [DONE], - DONE: [READY_FOR_JOB] - } - - def __init__(self, actor_id: str, config: Config, - session_log_dir: str, actor_no: int, actor_count: int, job: PandaJob = None, event_ranges: Sequence[EventRange] = None) -> None: + def __init__( + self, + actor_id: str, + config: Config, + session_log_dir: str, + actor_no: int, + actor_count: int, + job: PandaJob = None, + event_ranges: Optional[Sequence[EventRange]] = None, + ) -> None: """ Initialize attributes, instantiate a payload and setup the workdir @@ -101,31 
+119,32 @@ def __init__(self, actor_id: str, config: Config, self._logger = make_logger(self.config, self.id) self.session_log_dir = session_log_dir self.job = None - self.transitions = ESWorker.TRANSITIONS + self.transitions = TRANSITIONS self.node_ip = get_node_ip() - self.state = ESWorker.READY_FOR_JOB + self.state = READY_FOR_JOB self.payload_job_dir = None self.payload_actor_output_dir = None self.payload_actor_process_dir = None self.actor_ray_logs_dir = None self.cpu_monitor = None - self.workdir = os.path.expandvars( - self.config.ray.get('workdir', os.getcwd())) + self.workdir = os.path.expandvars(self.config.ray.get("workdir", os.getcwd())) if not os.path.isdir(self.workdir): self.workdir = os.getcwd() self.output_dir = self.config.ray.get("outputdir") - self.pilot_kill_file = os.path.expandvars(self.config.payload.get('pilotkillfile', 'pilot_kill_payload')) - self.pilot_kill_time = self.config.payload.get('pilotkilltime', 600) - self.time_monitor_file = os.path.expandvars(self.config.payload.get('timemonitorfile', 'RaythenaTimeMonitor.txt')) + self.pilot_kill_file = os.path.expandvars(self.config.payload.get("pilotkillfile", "pilot_kill_payload")) + self.pilot_kill_time = self.config.payload.get("pilotkilltime", 600) + self.time_monitor_file = os.path.expandvars( + self.config.payload.get("timemonitorfile", "RaythenaTimeMonitor.txt") + ) self.payload: Union[BasePayload, ESPayload] = PilotHttpPayload(self.id, self.config) self.start_time = -1 self.time_limit = -1 self.elapsed = 1 if job: - self.transition_state(ESWorker.JOB_REQUESTED) + self.transition_state(JOB_REQUESTED) self.receive_job(Messages.REPLY_OK, job) if event_ranges: - self.transition_state(ESWorker.EVENT_RANGES_REQUESTED) + self.transition_state(EVENT_RANGES_REQUESTED) self.receive_event_ranges(Messages.REPLY_OK, event_ranges) def check_time(self) -> None: @@ -142,15 +161,15 @@ def check_time(self) -> None: if time_elapsed // 300 >= self.elapsed: self.elapsed += 1 try: - if 
self.config.logging.get('copyraylogs', False): + if self.config.logging.get("copyraylogs", False): if os.path.isdir(self.actor_ray_logs_dir): shutil.rmtree(self.actor_ray_logs_dir) shutil.copytree(self.session_log_dir, self.actor_ray_logs_dir) except Exception as e: self._logger.warning(f"Failed to copy ray logs to actor directory: {e}") if time_elapsed > self.time_limit - self.pilot_kill_time: - killsignal = open(self.pilot_kill_file, 'w') - killsignal.close() + with open(self.pilot_kill_file, "w") as f: + f.write("KILL") self._logger.info("killsignal sent to payload") break else: @@ -173,11 +192,18 @@ def modify_job(self, job: PandaJob) -> PandaJob: input_evnt_file = re.findall(r"\-\-inputEVNTFile=([\w\.\,]*) \-", cmd) if len(input_evnt_file) != 1: return job - in_files = [os.path.join(os.path.expandvars(self.config.harvester['endpoint']), x) - for x in input_evnt_file[0].split(",")] + in_files = [ + os.path.join(os.path.expandvars(self.config.harvester["endpoint"]), x) + for x in input_evnt_file[0].split(",") + ] in_files = ",".join(in_files[0:1]) - cmd = re.sub(r"\-\-inputEVNTFile=([\w\.\,]*) \-", f"--inputEVNTFile={in_files} -", cmd) - # convert args of the form --outputHITSFile=HITS.30737678._[011001,...].pool.root to --outputHITSFile=HITS.30737678._011001.pool.root + cmd = re.sub( + r"\-\-inputEVNTFile=([\w\.\,]*) \-", + f"--inputEVNTFile={in_files} -", + cmd, + ) + # convert args of the form --outputHITSFile=HITS.30737678._[011001,...].pool.root + # to --outputHITSFile=HITS.30737678._011001.pool.root match = re.findall(r"--outputHITSFile=([0-9A-Z._]+)\[([0-9,]+)\](.pool.root)", cmd) if match: match_tuple = match[0] @@ -185,7 +211,11 @@ def modify_job(self, job: PandaJob) -> PandaJob: suffix = match_tuple[2] nums = match_tuple[1].split(",") dummy_name = f"{prefix}{nums[0]}{suffix}" - cmd = re.sub(r"--outputHITSFile=[0-9A-Z._]+\[[0-9,]+\].pool.root", f"--outputHITSFile={dummy_name}", cmd) + cmd = re.sub( + r"--outputHITSFile=[0-9A-Z._]+\[[0-9,]+\].pool.root", 
+ f"--outputHITSFile={dummy_name}", + cmd, + ) job_number = max(int(job["attemptNr"]) - 1, 0) * self.actor_count + self.actor_no + 1 if "--jobNumber=" in cmd: @@ -193,7 +223,7 @@ def modify_job(self, job: PandaJob) -> PandaJob: else: cmd = f"{cmd} --jobNumber={job_number} " - maxEvents = min(500, job['nEventsPerInputFile']) + maxEvents = min(500, job["nEventsPerInputFile"]) if "--maxEvents=" in cmd: cmd = re.sub(r"--maxEvents=[0-9]+", f"--maxEvents={maxEvents}", cmd) else: @@ -212,9 +242,10 @@ def stagein(self) -> None: Postconditions: - The worker is in the READY_FOR_EVENTS state. Raises: - StageInFailed: If creating / moving to the work directory fails or the call to the payload stage-in raises an exception. + StageInFailed: If creating / moving to the work directory fails or the call to the payload + stage-in raises an exception. """ - self.payload_job_dir = os.path.join(self.workdir, self.job['PandaID']) + self.payload_job_dir = os.path.join(self.workdir, self.job["PandaID"]) if not os.path.isdir(self.payload_job_dir): self._logger.warning(f"Specified path {self.payload_job_dir} does not exist. 
Using cwd {os.getcwd()}") self.payload_job_dir = self.workdir @@ -224,14 +255,14 @@ def stagein(self) -> None: self.payload_actor_output_dir = os.path.join(self.payload_job_dir, subdir, "esOutput") self.actor_ray_logs_dir = os.path.join(self.payload_actor_process_dir, "ray_logs") try: - time_limit_monitor = open(os.path.join(self.workdir, self.time_monitor_file)) - start_time = time_limit_monitor.readline().split(':') - self.start_time = int(start_time[0]) * 3600 + int(start_time[1]) * 60 + int(start_time[2]) - time_limit = time_limit_monitor.readline().split(':') - if len(time_limit) < 3: - time_limit = ['0'] + time_limit - self.time_limit = int(time_limit[0]) * 3600 + int(time_limit[1]) * 60 + int(time_limit[2]) - timer_thread = threading.Thread(name='timer', target=self.check_time, daemon=True) + with open(os.path.join(self.workdir, self.time_monitor_file)) as time_limit_monitor: + start_time = time_limit_monitor.readline().split(":") + self.start_time = int(start_time[0]) * 3600 + int(start_time[1]) * 60 + int(start_time[2]) + time_limit = time_limit_monitor.readline().split(":") + if len(time_limit) < 3: + time_limit = ["0", *time_limit] + self.time_limit = int(time_limit[0]) * 3600 + int(time_limit[1]) * 60 + int(time_limit[2]) + timer_thread = threading.Thread(name="timer", target=self.check_time, daemon=True) timer_thread.start() except Exception as e: self._logger.warning(f"Failed to setup timer thread: {e}") @@ -239,9 +270,15 @@ def stagein(self) -> None: try: os.mkdir(self.payload_actor_process_dir) os.chdir(self.payload_actor_process_dir) - worker_logfile = self.config.logging.get('workerlogfile', None) + worker_logfile = self.config.logging.get("workerlogfile", None) if worker_logfile: - log_to_file(self.config.logging.get('level', 'warning').upper(), os.path.join(self.payload_actor_process_dir, os.path.basename(worker_logfile))) + log_to_file( + self.config.logging.get("level", "warning").upper(), + os.path.join( + self.payload_actor_process_dir, + 
os.path.basename(worker_logfile), + ), + ) disable_stdout_logging() self._logger.info(f"Ray worker started on node {gethostname()}") @@ -250,7 +287,7 @@ def stagein(self) -> None: os.mkdir(self.payload_actor_output_dir) except Exception as e: self._logger.warning(f"Exception when creating dir: {e}") - raise StageInFailed(self.id) + raise StageInFailed(self.id) from e # self.cpu_monitor = CPUMonitor(os.path.join(self.payload_actor_process_dir, "cpu_monitor.json")) # self.cpu_monitor.start() try: @@ -258,9 +295,8 @@ def stagein(self) -> None: self.payload.start(self.modify_job(self.job)) except Exception as e: self._logger.warning(f"Failed to stagein payload: {e}") - raise StageInFailed(self.id) - self.transition_state(ESWorker.READY_FOR_EVENTS if self. - is_event_service_job() else ESWorker.PROCESSING) + raise StageInFailed(self.id) from e + self.transition_state(READY_FOR_EVENTS if self.is_event_service_job() else PROCESSING) def stageout(self) -> None: """ @@ -274,7 +310,7 @@ def stageout(self) -> None: - The worker is in the DONE state. 
""" self.payload.stageout() - self.transition_state(ESWorker.FINISHING) + self.transition_state(FINISHING) self.terminate_actor() def transition_state(self, dest: int) -> None: @@ -288,10 +324,12 @@ def transition_state(self, dest: int) -> None: IllegalWorkerState if the transition isn't allowed """ if dest not in self.transitions[self.state]: - self._logger.error(f"Illegal transition from {ESWorker.STATES_NAME[self.state]} to {ESWorker.STATES_NAME[dest]}") - raise IllegalWorkerState(worker_id=self.id, - src_state=ESWorker.STATES_NAME[self.state], - dst_state=ESWorker.STATES_NAME[dest]) + self._logger.error(f"Illegal transition from {STATES_NAME[self.state]} to {STATES_NAME[dest]}") + raise IllegalWorkerState( + worker_id=self.id, + src_state=STATES_NAME[self.state], + dst_state=STATES_NAME[dest], + ) self.state = dest def is_event_service_job(self) -> bool: @@ -322,15 +360,15 @@ def receive_job(self, reply: int, job: PandaJob) -> WorkerResponse: """ self.job = job if reply == Messages.REPLY_OK and self.job: - self.transition_state(ESWorker.STAGE_IN) + self.transition_state(STAGE_IN) try: self.stagein() except BaseRaythenaException: raise except Exception as e: - raise WrappedException(self.id, e) + raise WrappedException(self.id, e) from e else: - self.transition_state(ESWorker.DONE) + self.transition_state(DONE) self._logger.error("Could not fetch job. 
Set state to done.") return self.return_message(Messages.REPLY_OK) @@ -347,13 +385,11 @@ def mark_new_job(self) -> WorkerResponse: """ # TODO: either remove this functionality (event service workers will only ever have one job) # TODO: or finish the implementation by also cleaning up the filesystem - self.transition_state(ESWorker.READY_FOR_JOB) - self.transition_state(ESWorker.JOB_REQUESTED) + self.transition_state(READY_FOR_JOB) + self.transition_state(JOB_REQUESTED) return self.return_message(Messages.REQUEST_NEW_JOB) - def receive_event_ranges( - self, reply: int, - event_ranges: Sequence[EventRange]) -> WorkerResponse: + def receive_event_ranges(self, reply: int, event_ranges: Sequence[EventRange]) -> WorkerResponse: """ Sends event ranges to be processed by the worker. Update the PFN of event ranges to an absolute path if it is a relative path. If no ranges are provided, the worker will not expect any more ranges in the future and @@ -376,22 +412,21 @@ def receive_event_ranges( """ if reply == Messages.REPLY_NO_MORE_EVENT_RANGES or not event_ranges: # no new ranges... 
finish processing local cache then terminate actor - self.transition_state(ESWorker.FINISHING_LOCAL_RANGES) + self.transition_state(FINISHING_LOCAL_RANGES) self.payload.submit_new_ranges(None) return self.return_message(Messages.REPLY_OK) for crange in event_ranges: if not os.path.isabs(crange.PFN): crange.PFN = os.path.join( - os.path.expandvars(self.config.harvester['endpoint']), - crange.PFN) + os.path.expandvars(self.config.harvester["endpoint"]), + crange.PFN, + ) self.payload.submit_new_ranges(event_ranges) - self.transition_state(ESWorker.PROCESSING) + self.transition_state(PROCESSING) return self.return_message(Messages.REPLY_OK) - def return_message(self, - message: int, - data: Any = None) -> WorkerResponse: + def return_message(self, message: int, data: Any = None) -> WorkerResponse: """ Utility function to build a tuple response for to the driver @@ -400,7 +435,7 @@ def return_message(self, data: extra data attached to the message type Returns: - Tuple of (id, message, data) + tuple of (id, message, data) """ return self.id, message, data @@ -423,7 +458,7 @@ def terminate_actor(self) -> None: """ self.payload.stop() # self.cpu_monitor.stop() - self.transition_state(ESWorker.DONE) + self.transition_state(DONE) def should_request_ranges(self) -> bool: """ @@ -437,17 +472,15 @@ def should_request_ranges(self) -> bool: True if more event ranges are needed by the payload """ # do not transition if not in a state allowing for event ranges request - if ESWorker.READY_FOR_EVENTS not in self.transitions[self.state]: + if READY_FOR_EVENTS not in self.transitions[self.state]: return False res = self.payload.should_request_more_ranges() if res: - self.transition_state(ESWorker.READY_FOR_EVENTS) + self.transition_state(READY_FOR_EVENTS) return res - def stageout_event_service_files( - self, - ranges_update: Mapping[str, str]) -> Optional[EventRangeUpdate]: + def stageout_event_service_files(self, ranges_update: Mapping[str, str]) -> Optional[EventRangeUpdate]: """ 
Move the HITS files reported by the pilot payload. Files are moved from the Athena work directory to the worker-specific output directory. @@ -458,7 +491,7 @@ def stageout_event_service_files( Returns: Updated event ranges update referencing the moved output files """ - ranges = json.loads(ranges_update['eventRanges'][0]) + ranges = json.loads(ranges_update["eventRanges"][0]) ranges = EventRangeUpdate.build_from_dict(self.job.get_id(), ranges) # stage-out finished event ranges for range_update in ranges[self.job.get_id()]: @@ -467,7 +500,7 @@ def stageout_event_service_files( if range_update["eventStatus"] == "failed": self._logger.warning("event range failed, will not stage-out") continue - if "path" in range_update and range_update["path"]: + if range_update.get("path"): cfile_key = "path" else: raise StageOutFailed(self.id) @@ -475,13 +508,14 @@ def stageout_event_service_files( if cfile: dst = os.path.join( self.output_dir, - os.path.basename(cfile) if os.path.isabs(cfile) else cfile) + os.path.basename(cfile) if os.path.isabs(cfile) else cfile, + ) if os.path.isfile(cfile): try: os.replace(cfile, dst) except OSError as e: self._logger.error(f"Failed to move file {cfile} to {dst}: errno {e.errno}: {e.strerror}") - raise StageOutFailed(self.id) + raise StageOutFailed(self.id) from e range_update[cfile_key] = dst else: self._logger.warning(f"Couldn't stageout file {cfile} as it doesn't exist") @@ -499,8 +533,7 @@ def get_payload_message(self) -> Optional[WorkerResponse]: ranges_update = self.payload.fetch_ranges_update() if ranges_update: ranges_update = self.stageout_event_service_files(ranges_update) - return self.return_message(Messages.UPDATE_EVENT_RANGES, - ranges_update) + return self.return_message(Messages.UPDATE_EVENT_RANGES, ranges_update) job_update = self.payload.fetch_job_update() if job_update: @@ -519,17 +552,17 @@ def get_message(self) -> WorkerResponse: Returns: - Tuple depending on the current worker state, informing the driver about what 
information should be sent + tuple depending on the current worker state, informing the driver about what information should be sent to the worker or if the worker produced output data. """ try: - while self.state != ESWorker.DONE: + while self.state != DONE: payload_message = self.get_payload_message() if payload_message: return payload_message - elif self.state == ESWorker.READY_FOR_JOB: + elif self.state == READY_FOR_JOB: # ready to get a new job - self.transition_state(ESWorker.JOB_REQUESTED) + self.transition_state(JOB_REQUESTED) return self.return_message(Messages.REQUEST_NEW_JOB) elif self.payload.is_complete(): # check if there are any remaining message from the payload in queue. @@ -539,19 +572,20 @@ def get_message(self) -> WorkerResponse: return payload_message else: # if no more message, proceed to stage-out - self.transition_state(ESWorker.STAGE_OUT) + self.transition_state(STAGE_OUT) self.stageout() return self.return_message(Messages.PROCESS_DONE) - elif self.is_event_service_job() and ( - self.state == ESWorker.READY_FOR_EVENTS or - self.should_request_ranges()): + elif self.is_event_service_job() and (self.state == READY_FOR_EVENTS or self.should_request_ranges()): req = EventRangeRequest() - req.add_event_request(self.job['PandaID'], - self.config.resources.get('corepernode', 64), - self.job['taskID'], self.job['jobsetID']) - self.transition_state(ESWorker.EVENT_RANGES_REQUESTED) + req.add_event_request( + self.job["PandaID"], + self.config.resources.get("corepernode", 64), + self.job["taskID"], + self.job["jobsetID"], + ) + self.transition_state(EVENT_RANGES_REQUESTED) return self.return_message(Messages.REQUEST_EVENT_RANGES, req) - elif self.state == ESWorker.DONE: + elif self.state == DONE: return self.return_message(Messages.PROCESS_DONE) else: time.sleep(1) # Nothing to do, sleeping... 
@@ -560,4 +594,4 @@ def get_message(self) -> WorkerResponse: except BaseRaythenaException: raise except Exception as e: - raise WrappedException(self.id, e) + raise WrappedException(self.id, e) from e diff --git a/src/raythena/actors/payloads/basePayload.py b/src/raythena/actors/payloads/basePayload.py index 08af4c4..030ca46 100644 --- a/src/raythena/actors/payloads/basePayload.py +++ b/src/raythena/actors/payloads/basePayload.py @@ -1,6 +1,5 @@ from abc import ABC, abstractmethod -from typing import Dict, Optional, Any - +from typing import Any, Optional from raythena.utils.config import Config from raythena.utils.eventservice import PandaJob @@ -11,8 +10,7 @@ class BasePayload(ABC): panda job specification and are responsible to handle the execution of the """ - def __init__(self, worker_id: str, - config: Config) -> None: + def __init__(self, worker_id: str, config: Config) -> None: """ Setup base payload attributes @@ -89,7 +87,7 @@ def return_code(self) -> int: raise NotImplementedError("Base method not implemented") @abstractmethod - def fetch_job_update(self) -> Optional[Dict[str, Any]]: + def fetch_job_update(self) -> Optional[dict[str, Any]]: """ Tries to get a job update from the payload diff --git a/src/raythena/actors/payloads/eventservice/esPayload.py b/src/raythena/actors/payloads/eventservice/esPayload.py index fe31dfa..2de9d17 100644 --- a/src/raythena/actors/payloads/eventservice/esPayload.py +++ b/src/raythena/actors/payloads/eventservice/esPayload.py @@ -1,6 +1,6 @@ from abc import abstractmethod -from typing import Dict, Optional, Sequence - +from collections.abc import Sequence +from typing import Optional from raythena.actors.payloads.basePayload import BasePayload from raythena.utils.config import Config from raythena.utils.eventservice import EventRange @@ -32,12 +32,12 @@ def submit_new_ranges(self, event_ranges: Optional[Sequence[EventRange]]) -> Non raise NotImplementedError("Base method not implemented") @abstractmethod - def 
fetch_ranges_update(self) -> Optional[Dict[str, str]]: + def fetch_ranges_update(self) -> Optional[dict[str, str]]: """ Checks if event ranges update are available Returns: - Dict holding event range update of processed events, None if no update is available + dict holding event range update of processed events, None if no update is available """ raise NotImplementedError("Base method not implemented") diff --git a/src/raythena/actors/payloads/eventservice/pilothttp.py b/src/raythena/actors/payloads/eventservice/pilothttp.py index 8b42d39..98f8679 100644 --- a/src/raythena/actors/payloads/eventservice/pilothttp.py +++ b/src/raythena/actors/payloads/eventservice/pilothttp.py @@ -4,23 +4,21 @@ import os import shlex import stat -from asyncio import Queue, QueueEmpty, Event +from asyncio import Event, Queue, QueueEmpty +from collections.abc import Iterable, Mapping from subprocess import DEVNULL, Popen -from typing import Dict, List, Callable, Optional, Iterable, Mapping +from typing import Callable, Optional from urllib.parse import parse_qs - import uvloop from aiohttp import web - -from raythena.utils.logging import make_logger from raythena.actors.payloads.eventservice.esPayload import ESPayload from raythena.utils.config import Config -from raythena.utils.eventservice import ESEncoder -from raythena.utils.eventservice import PandaJob, EventRange -from raythena.utils.exception import FailedPayload, ExThread +from raythena.utils.eventservice import ESEncoder, EventRange, PandaJob +from raythena.utils.exception import ExThread, FailedPayload +from raythena.utils.logging import make_logger -class AsyncRouter(object): +class AsyncRouter: """ Very simple router mapping HTTP endpoint to a handler. Only supports with asynchronous handler compatible with the asyncio Framework. 
@@ -96,7 +94,7 @@ def __init__(self, worker_id: str, config: Config) -> None: """ super().__init__(worker_id, config) self._logger = make_logger(self.config, self.worker_id) - self.host = '127.0.0.1' + self.host = "127.0.0.1" self.port = 8080 self.json_encoder = functools.partial(json.dumps, cls=ESEncoder) self.server_thread = None @@ -114,19 +112,15 @@ def __init__(self, worker_id: str, config: Config) -> None: self.ranges_queue = Queue() self.router = AsyncRouter() - self.router.register('/', self.handle_get_job) - self.router.register('/server/panda/getJob', self.handle_get_job) - self.router.register('/server/panda/updateJob', self.handle_update_job) - self.router.register('/server/panda/updateWorkerPilotStatus', self.handle_update_job) - self.router.register('/server/panda/updateJobsInBulk', - self.handle_update_jobs_in_bulk) - self.router.register('/server/panda/getStatus', self.handle_get_status) - self.router.register('/server/panda/getEventRanges', - self.handle_get_event_ranges) - self.router.register('/server/panda/updateEventRanges', - self.handle_update_event_ranges) - self.router.register('/server/panda/getKeyPair', - self.handle_get_key_pair) + self.router.register("/", self.handle_get_job) + self.router.register("/server/panda/getJob", self.handle_get_job) + self.router.register("/server/panda/updateJob", self.handle_update_job) + self.router.register("/server/panda/updateWorkerPilotStatus", self.handle_update_job) + self.router.register("/server/panda/updateJobsInBulk", self.handle_update_jobs_in_bulk) + self.router.register("/server/panda/getStatus", self.handle_get_status) + self.router.register("/server/panda/getEventRanges", self.handle_get_event_ranges) + self.router.register("/server/panda/updateEventRanges", self.handle_update_event_ranges) + self.router.register("/server/panda/getKeyPair", self.handle_get_key_pair) def _start_payload(self) -> None: """ @@ -137,12 +131,14 @@ def _start_payload(self) -> None: # we're not reading data using 
communicate() and the pipe buffer becomes full as pilot2 # generates a lot of data to the stdout pipe # see https://docs.python.org/3.7/library/subprocess.html#subprocess.Popen.wait - self.pilot_process = Popen(command, - stdin=DEVNULL, - stdout=DEVNULL, - stderr=DEVNULL, - shell=True, - close_fds=True) + self.pilot_process = Popen( + command, + stdin=DEVNULL, + stdout=DEVNULL, + stderr=DEVNULL, + shell=True, + close_fds=True, + ) self._logger.info(f"Pilot payload started with PID {self.pilot_process.pid}") def _build_pilot_command(self) -> str: @@ -157,9 +153,9 @@ def _build_pilot_command(self) -> str: Raises: FailedPayload: if source code to be executed cannot be retrieved from CVMFS """ - cmd = str() + cmd = "" - extra_setup = self.config.payload.get('extrasetup', None) + extra_setup = self.config.payload.get("extrasetup", None) if extra_setup is not None: cmd += f"{extra_setup}{';' if not extra_setup.endswith(';') else ''}" @@ -174,34 +170,38 @@ def _build_pilot_command(self) -> str: cmd += f"ln -s {pilot_src} {os.path.join(os.getcwd(), pilot_base)};" - prod_source_label = shlex.quote(self.current_job['prodSourceLabel']) + prod_source_label = shlex.quote(self.current_job["prodSourceLabel"]) pilotwrapper_bin = "/cvmfs/atlas.cern.ch/repo/sw/PandaPilotWrapper/latest/runpilot2-wrapper.sh" if not os.path.isfile(pilotwrapper_bin): raise FailedPayload(self.worker_id) - queue_escaped = shlex.quote(self.config.payload['pandaqueue']) - cmd += f"{shlex.quote(pilotwrapper_bin)} --localpy --piloturl local -q {queue_escaped} -r {queue_escaped} -s {queue_escaped} " + queue_escaped = shlex.quote(self.config.payload["pandaqueue"]) + cmd += ( + f"{shlex.quote(pilotwrapper_bin)} --localpy --piloturl local " + f"-q {queue_escaped} -r {queue_escaped} -s {queue_escaped} " + ) cmd += "--pilotversion 3 --pythonversion 3 " - cmd += f"-i PR -j {prod_source_label} --container --mute --pilot-user=atlas -t -u --es-executor-type=raythena -v 1 " \ - f"-d --cleanup=False -w generic 
--use-https False --allow-same-user=False --resource-type MCORE " \ + cmd += ( + f"-i PR -j {prod_source_label} --container --mute --pilot-user=atlas -t -u " + f"--es-executor-type=raythena -v 1 " + f"-d --cleanup=False -w generic --use-https False --allow-same-user=False --resource-type MCORE " f"--hpc-resource {shlex.quote(self.config.payload['hpcresource'])};" + ) - extra_script = self.config.payload.get('extrapostpayload', None) + extra_script = self.config.payload.get("extrapostpayload", None) if extra_script is not None: cmd += f"{extra_script}{';' if not extra_script.endswith(';') else ''}" cmd_script = os.path.join(os.getcwd(), "payload.sh") - with open(cmd_script, 'w') as f: + with open(cmd_script, "w") as f: f.write(cmd) st = os.stat(cmd_script) os.chmod(cmd_script, st.st_mode | stat.S_IEXEC) - payload_log = shlex.quote( - self.config.payload.get('logfilename', 'wrapper')) - return (f"/bin/bash {cmd_script} " - f"> {payload_log} 2> {payload_log}.stderr") + payload_log = shlex.quote(self.config.payload.get("logfilename", "wrapper")) + return f"/bin/bash {cmd_script} " f"> {payload_log} 2> {payload_log}.stderr" def stagein(self) -> None: """ @@ -236,8 +236,7 @@ def is_complete(self) -> bool: Returns: False if the payload has not finished yet, True otherwise """ - return self.pilot_process is not None and self.pilot_process.poll( - ) is not None + return self.pilot_process is not None and self.pilot_process.poll() is not None def return_code(self) -> Optional[int]: """ @@ -263,8 +262,7 @@ def start(self, job: PandaJob) -> None: self.current_job = job self.ranges_queue = Queue() self.no_more_ranges = False - self.server_thread = ExThread(target=self.run, - name="http-server") + self.server_thread = ExThread(target=self.run, name="http-server") self.server_thread.start() def stop(self) -> None: @@ -273,14 +271,12 @@ def stop(self) -> None: and wait until it exits then stop the http server """ if self.server_thread and self.server_thread.is_alive(): - pexit = 
self.pilot_process.poll() if pexit is None: self.pilot_process.terminate() pexit = self.pilot_process.wait() self._logger.debug(f"Payload return code: {pexit}") - asyncio.run_coroutine_threadsafe(self.notify_stop_server_task(), - self.loop) + asyncio.run_coroutine_threadsafe(self.notify_stop_server_task(), self.loop) self.server_thread.join() def submit_new_range(self, event_range: Optional[EventRange]) -> asyncio.Future: @@ -290,8 +286,7 @@ def submit_new_range(self, event_range: Optional[EventRange]) -> asyncio.Future: Args: event_range: range to forward to pilot """ - return asyncio.run_coroutine_threadsafe(self.ranges_queue.put(event_range), - self.loop) + return asyncio.run_coroutine_threadsafe(self.ranges_queue.put(event_range), self.loop) def submit_new_ranges(self, event_ranges: Optional[Iterable[EventRange]]) -> None: """ @@ -328,7 +323,7 @@ def fetch_ranges_update(self) -> Optional[Mapping[str, str]]: Checks if event ranges update are available by polling the event ranges update queue Returns: - Dict holding event range update of processed events, None if no update is available + dict holding event range update of processed events, None if no update is available """ try: res = self.ranges_update.get_nowait() @@ -340,8 +335,8 @@ def fetch_ranges_update(self) -> Optional[Mapping[str, str]]: def should_request_more_ranges(self) -> bool: """ Checks if the payload is ready to receive more event ranges. If false is returned, then the payload is - not expecting to have more ranges assigned to it by calling submit_new_ranges. If this method ever returns false, - then any future to it will return false as well. + not expecting to have more ranges assigned to it by calling submit_new_ranges. + If this method ever returns false, then any future to it will return false as well. Event ranges submitted after this method returns false will be ignored and never sent to the pilot process. 
Returns: @@ -366,11 +361,10 @@ async def http_handler(self, request: web.BaseRequest) -> web.Response: try: return await self.router.route(request.path, request=request) except Exception: - return web.json_response({"StatusCode": 500}, - dumps=self.json_encoder) + return web.json_response({"StatusCode": 500}, dumps=self.json_encoder) @staticmethod - async def parse_qs_body(request: web.BaseRequest) -> Dict[str, List[str]]: + async def parse_qs_body(request: web.BaseRequest) -> dict[str, list[str]]: """ Parses the query-string request body to a dictionary @@ -418,8 +412,7 @@ async def handle_update_job(self, request: web.BaseRequest) -> web.Response: # self._logger.debug(f"job update queue size is {self.job_update.qsize()}") return web.json_response(res, dumps=self.json_encoder) - async def handle_get_event_ranges(self, - request: web.BaseRequest) -> web.Response: + async def handle_get_event_ranges(self, request: web.BaseRequest) -> web.Response: """ Handler for getEventRanges call, retrieve event ranges from the queue and returns ranges to pilot. 
If not enough event ranges are available yet, wait until more ranges become available or a message indicating @@ -433,15 +426,15 @@ async def handle_get_event_ranges(self, """ body = await PilotHttpPayload.parse_qs_body(request) status = 0 - panda_id = body['pandaID'][0] + panda_id = body["pandaID"][0] ranges = list() # PandaID does not match the current job, return an error - if panda_id != self.current_job['PandaID']: + if panda_id != self.current_job["PandaID"]: status = -1 else: - n_ranges = int(body['nRanges'][0]) + n_ranges = int(body["nRanges"][0]) if not self.no_more_ranges: - for i in range(n_ranges): + for _ in range(n_ranges): crange = await self.ranges_queue.get() if crange is None: self.no_more_ranges = True @@ -451,8 +444,7 @@ async def handle_get_event_ranges(self, # self._logger.info(f"{len(res['eventRanges'])} ranges sent to pilot") return web.json_response(res, dumps=self.json_encoder) - async def handle_update_event_ranges( - self, request: web.BaseRequest) -> web.Response: + async def handle_update_event_ranges(self, request: web.BaseRequest) -> web.Response: """ Handler for updateEventRanges call, adds the event ranges update to a queue to be retrieved by the worker @@ -468,8 +460,7 @@ async def handle_update_event_ranges( # self._logger.debug(f"event ranges queue size is {self.ranges_update.qsize()}") return web.json_response(res, dumps=self.json_encoder) - async def handle_update_jobs_in_bulk( - self, request: web.BaseRequest) -> web.Response: + async def handle_update_jobs_in_bulk(self, request: web.BaseRequest) -> web.Response: """ Not used by pilot in the current workflow @@ -499,8 +490,7 @@ async def handle_get_status(self, request: web.BaseRequest) -> web.Response: """ raise NotImplementedError(f"{request.path} handler not implemented") - async def handle_get_key_pair(self, - request: web.BaseRequest) -> web.Response: + async def handle_get_key_pair(self, request: web.BaseRequest) -> web.Response: """ Not used by pilot in the current 
workflow diff --git a/src/raythena/drivers/baseDriver.py b/src/raythena/drivers/baseDriver.py index 344d67e..464cfc6 100644 --- a/src/raythena/drivers/baseDriver.py +++ b/src/raythena/drivers/baseDriver.py @@ -1,5 +1,4 @@ from abc import ABC, abstractmethod - from raythena.utils.config import Config diff --git a/src/raythena/drivers/communicators/baseCommunicator.py b/src/raythena/drivers/communicators/baseCommunicator.py index b766e1b..effdf1a 100644 --- a/src/raythena/drivers/communicators/baseCommunicator.py +++ b/src/raythena/drivers/communicators/baseCommunicator.py @@ -1,12 +1,25 @@ from abc import ABC, abstractmethod +from collections.abc import Mapping, Sequence from queue import Queue -from typing import Mapping, Sequence, Union - +from typing import Union from raythena.utils.config import Config -from raythena.utils.eventservice import EventRangeRequest, PandaJobRequest, PandaJobUpdate, EventRangeUpdate, \ - JobReport, EventRangeDef, JobDef +from raythena.utils.eventservice import ( + EventRangeDef, + EventRangeRequest, + EventRangeUpdate, + JobDef, + JobReport, + PandaJobRequest, + PandaJobUpdate, +) -RequestData = Union[PandaJobRequest, EventRangeUpdate, JobReport, EventRangeRequest, PandaJobUpdate] +RequestData = Union[ + PandaJobRequest, + EventRangeUpdate, + JobReport, + EventRangeRequest, + PandaJobUpdate, +] class BaseCommunicator(ABC): @@ -15,8 +28,13 @@ class BaseCommunicator(ABC): to be implemented by different communicators as well as setting up queues used to communicate with other threads. 
""" - def __init__(self, requests_queue: 'Queue[RequestData]', job_queue: 'Queue[Mapping[str, JobDef]]', - event_ranges_queue: 'Queue[Mapping[str, Sequence[EventRangeDef]]]', config: Config) -> None: + def __init__( + self, + requests_queue: "Queue[RequestData]", + job_queue: "Queue[Mapping[str, JobDef]]", + event_ranges_queue: "Queue[Mapping[str, Sequence[EventRangeDef]]]", + config: Config, + ) -> None: """ Base constructor setting up queues and application config diff --git a/src/raythena/drivers/communicators/harvesterFileMessenger.py b/src/raythena/drivers/communicators/harvesterFileMessenger.py index ae3952e..dc2bbbd 100644 --- a/src/raythena/drivers/communicators/harvesterFileMessenger.py +++ b/src/raythena/drivers/communicators/harvesterFileMessenger.py @@ -1,15 +1,21 @@ import configparser +import contextlib import json import os import shutil import time from queue import Queue - from raythena.drivers.communicators.baseCommunicator import BaseCommunicator from raythena.utils.config import Config -from raythena.utils.logging import make_logger -from raythena.utils.eventservice import EventRangeRequest, PandaJobRequest, PandaJobUpdate, EventRangeUpdate, JobReport +from raythena.utils.eventservice import ( + EventRangeRequest, + EventRangeUpdate, + JobReport, + PandaJobRequest, + PandaJobUpdate, +) from raythena.utils.exception import ExThread +from raythena.utils.logging import make_logger class HarvesterFileCommunicator(BaseCommunicator): @@ -21,8 +27,13 @@ class HarvesterFileCommunicator(BaseCommunicator): system is required. 
""" - def __init__(self, requests_queue: Queue, job_queue: Queue, - event_ranges_queue: Queue, config: Config) -> None: + def __init__( + self, + requests_queue: Queue, + job_queue: Queue, + event_ranges_queue: Queue, + config: Config, + ) -> None: """ Initialize communicator thread and parses the harvester config file @@ -33,16 +44,14 @@ def __init__(self, requests_queue: Queue, job_queue: Queue, config: app config """ super().__init__(requests_queue, job_queue, event_ranges_queue, config) - self.harvester_workdir = os.path.expandvars( - self.config.harvester['endpoint']) + self.harvester_workdir = os.path.expandvars(self.config.harvester["endpoint"]) self.ranges_requests_count = 0 self._parse_harvester_config() self.id = "HarvesterCommunicator" self._logger = make_logger(self.config, self.id) self.event_ranges_update_buffer = EventRangeUpdate() self.event_ranges_update_interval = 5 * 60 - self.communicator_thread = ExThread(target=self.run, - name="communicator-thread") + self.communicator_thread = ExThread(target=self.run, name="communicator-thread") def _parse_harvester_config(self) -> None: """ @@ -59,29 +68,32 @@ def _parse_harvester_config(self) -> None: Raises: FileNotFoundError if the harvester config file doesn't exist """ - self.harvester_conf_file = os.path.expandvars( - self.config.harvester['harvesterconf']) + self.harvester_conf_file = os.path.expandvars(self.config.harvester["harvesterconf"]) if not os.path.isfile(self.harvester_conf_file): raise FileNotFoundError("Harvester config file not found") self.harvester_conf = configparser.ConfigParser() self.harvester_conf.read(self.harvester_conf_file) - for k in self.harvester_conf['payload_interaction']: + for k in self.harvester_conf["payload_interaction"]: setattr( - self, k, - os.path.join(self.harvester_workdir, - self.harvester_conf['payload_interaction'][k])) + self, + k, + os.path.join( + self.harvester_workdir, + self.harvester_conf["payload_interaction"][k], + ), + ) if not hasattr(self, 
"jobspecfile"): - self.jobspecfile = str() + self.jobspecfile = "" if not hasattr(self, "jobspecfile"): - self.jobrequestfile = str() + self.jobrequestfile = "" if not hasattr(self, "eventrangesfile"): - self.eventrangesfile = str() + self.eventrangesfile = "" if not hasattr(self, "eventrequestfile"): - self.eventrequestfile = str() + self.eventrequestfile = "" if not hasattr(self, "eventstatusdumpjsonfile"): - self.eventstatusdumpjsonfile = str() + self.eventstatusdumpjsonfile = "" if not hasattr(self, "jobreportfile"): - self.jobreportfile = str() + self.jobreportfile = "" def request_job(self, request: PandaJobRequest) -> None: """ @@ -103,7 +115,7 @@ def request_job(self, request: PandaJobRequest) -> None: # create request file if necessary if not os.path.isfile(self.jobrequestfile): request_tmp = f"{self.jobrequestfile}.tmp" - with open(request_tmp, 'w') as f: + with open(request_tmp, "w") as f: json.dump(request.to_dict(), f) shutil.move(request_tmp, self.jobrequestfile) @@ -115,14 +127,10 @@ def request_job(self, request: PandaJobRequest) -> None: with open(self.jobspecfile) as f: job = json.load(f) - try: + with contextlib.suppress(FileNotFoundError): os.remove(self.jobrequestfile) - except FileNotFoundError: - pass - try: + with contextlib.suppress(FileNotFoundError): os.rename(self.jobspecfile, f"{self.jobspecfile}.read") - except FileNotFoundError: - pass if job: self.job_queue.put(job) @@ -138,10 +146,9 @@ def request_event_ranges(self, request: EventRangeRequest) -> None: Returns: None """ - if not os.path.isfile(self.eventrangesfile) and not os.path.exists( - self.eventrequestfile): + if not os.path.isfile(self.eventrangesfile) and not os.path.exists(self.eventrequestfile): event_request_file_tmp = f"{self.eventrequestfile}.tmp" - with open(event_request_file_tmp, 'w') as f: + with open(event_request_file_tmp, "w") as f: json.dump(request.request, f) shutil.move(event_request_file_tmp, self.eventrequestfile) self._logger.debug(f"request_event_ranges: 
created new {self.eventrequestfile} file") @@ -152,21 +159,20 @@ def request_event_ranges(self, request: EventRangeRequest) -> None: self._logger.debug(f"request_event_ranges: found a {self.eventrangesfile} file") while os.path.isfile(self.eventrangesfile): try: - with open(self.eventrangesfile, 'r') as f: + with open(self.eventrangesfile) as f: ranges = json.load(f) if os.path.isfile(self.eventrangesfile): shutil.move( self.eventrangesfile, - f"{self.eventrangesfile}-{self.ranges_requests_count}") + f"{self.eventrangesfile}-{self.ranges_requests_count}", + ) except Exception: time.sleep(5) if os.path.exists(f"{self.eventrangesfile}-{self.ranges_requests_count}"): self.ranges_requests_count += 1 - try: + with contextlib.suppress(FileNotFoundError): os.remove(self.eventrequestfile) - except FileNotFoundError: - pass self.ranges_requests_count += 1 self.event_ranges_queue.put(ranges) @@ -206,7 +212,9 @@ def update_events(self, request: EventRangeUpdate) -> None: current_update = json.load(f) os.remove(tmp_status_dump_file) except Exception as e: - self._logger.critical("Failed to read and remove leftover tmp update file. Update will never get reported to harvester.") + self._logger.critical( + "Failed to read and remove leftover tmp update file. Update will never get reported to harvester." 
+ ) self._logger.critical(e) else: request.merge_update(EventRangeUpdate(current_update)) @@ -239,7 +247,7 @@ def merge_write_dump_file(self, request: EventRangeUpdate, tmp_status_dump_file: self._logger.debug("Writting event ranges update to temporary file") try: - with open(tmp_status_dump_file, 'w') as f: + with open(tmp_status_dump_file, "w") as f: json.dump(request.range_update, f) except Exception as e: self._logger.error(f"Failed to write event update to temporary file: {e}") @@ -270,7 +278,7 @@ def create_job_report(self, request: JobReport) -> None: """ job_report_file = f"{self.jobreportfile}" - with open(job_report_file, 'w') as f: + with open(job_report_file, "w") as f: json.dump(request.to_dict(), f) def run(self) -> None: @@ -330,5 +338,4 @@ def stop(self) -> None: if self.communicator_thread.is_alive(): self.requests_queue.put(None) self.communicator_thread.join() - self.communicator_thread = ExThread(target=self.run, - name="communicator-thread") + self.communicator_thread = ExThread(target=self.run, name="communicator-thread") diff --git a/src/raythena/drivers/communicators/harvesterMock.py b/src/raythena/drivers/communicators/harvesterMock.py index e3844ac..f0beefc 100644 --- a/src/raythena/drivers/communicators/harvesterMock.py +++ b/src/raythena/drivers/communicators/harvesterMock.py @@ -3,10 +3,14 @@ import random import time from queue import Queue - from raythena.drivers.communicators.baseCommunicator import BaseCommunicator from raythena.utils.config import Config -from raythena.utils.eventservice import EventRangeRequest, PandaJobRequest, PandaJobUpdate, EventRangeUpdate +from raythena.utils.eventservice import ( + EventRangeRequest, + EventRangeUpdate, + PandaJobRequest, + PandaJobUpdate, +) from raythena.utils.exception import ExThread @@ -20,24 +24,28 @@ class HarvesterMock(BaseCommunicator): Input files specified in the inFiles attribute should exist in the ray workdir before starting ray """ - def __init__(self, requests_queue: Queue, 
job_queue: Queue, - event_ranges_queue: Queue, config: Config) -> None: + def __init__( + self, + requests_queue: Queue, + job_queue: Queue, + event_ranges_queue: Queue, + config: Config, + ) -> None: super().__init__(requests_queue, job_queue, event_ranges_queue, config) """ Initialize communicator thread, input files name, job worker_id, number of events to be distributed """ - self.communicator_thread = ExThread(target=self.run, - name="communicator-thread") + self.communicator_thread = ExThread(target=self.run, name="communicator-thread") self.event_ranges = None self.pandaID = random.randint(0, 100) self.jobsetId = random.randint(0, 100) self.taskId = random.randint(0, 100) self.config = config - self.scope = 'mc16_13TeV' - self.guid = '74DFB3ED-DAA7-E011-8954-001E4F3D9CB1,74DFB3ED-DAA7-E011-8954-001E4F3D9CB1' + self.scope = "mc16_13TeV" + self.guid = "74DFB3ED-DAA7-E011-8954-001E4F3D9CB1,74DFB3ED-DAA7-E011-8954-001E4F3D9CB1" self.guids = self.guid.split(",") self.inFiles = "EVNT.12458444._000048.pool.root.1,EVNT.12458444._000052.pool.root.1" - workdir = os.path.expandvars(self.config.ray['workdir']) + workdir = os.path.expandvars(self.config.ray["workdir"]) self.files = self.inFiles.split(",") self.nfiles = len(self.files) self.inFilesAbs = list() @@ -47,7 +55,7 @@ def __init__(self, requests_queue: Queue, job_queue: Queue, self.nevents_per_file = 5000 self.nevents = self.nevents_per_file * self.nfiles self.served_events = 0 - self.ncores = self.config.resources['corepernode'] + self.ncores = self.config.resources["corepernode"] def run(self) -> None: """ @@ -112,20 +120,20 @@ def request_event_ranges(self, request: EventRangeRequest) -> None: for pandaID in request: range_list = list() request_dict = request[pandaID] - nranges = min(self.nevents - self.served_events, - request_dict['nRanges']) - for i in range(self.served_events + 1, - self.served_events + nranges + 1): + nranges = min(self.nevents - self.served_events, request_dict["nRanges"]) + for i in 
range(self.served_events + 1, self.served_events + nranges + 1): file_idx = self.served_events // self.nevents_per_file range_id = f"Range-{i:05}" - range_list.append({ - 'lastEvent': i - file_idx * self.nevents_per_file, - 'eventRangeID': range_id, - 'startEvent': i - file_idx * self.nevents_per_file, - 'scope': self.scope, - 'LFN': self.inFilesAbs[file_idx], - 'GUID': self.guids[file_idx] - }) + range_list.append( + { + "lastEvent": i - file_idx * self.nevents_per_file, + "eventRangeID": range_id, + "startEvent": i - file_idx * self.nevents_per_file, + "scope": self.scope, + "LFN": self.inFilesAbs[file_idx], + "GUID": self.guids[file_idx], + } + ) self.served_events += 1 @@ -161,7 +169,7 @@ def get_panda_queue_name(self) -> str: Returns: The name of the pandaqueue from which jobs are retrieved. """ - return self.config.payload['pandaqueue'] + return self.config.payload["pandaqueue"] def request_job(self, job_request: PandaJobRequest) -> None: """ @@ -176,122 +184,75 @@ def request_job(self, job_request: PandaJobRequest) -> None: """ md5_hash = hashlib.md5() - md5_hash.update(str(time.time()).encode('utf-8')) + md5_hash.update(str(time.time()).encode("utf-8")) log_guid = md5_hash.hexdigest() - md5_hash.update(str(time.time()).encode('utf-8')) + md5_hash.update(str(time.time()).encode("utf-8")) job_name = md5_hash.hexdigest() - self.job_queue.put({ - str(self.pandaID): { - u'jobsetID': - self.jobsetId, - u'logGUID': - log_guid, - u'cmtConfig': - u'x86_64-slc6-gcc49-opt', - u'prodDBlocks': - u'user.mlassnig:user.mlassnig.pilot.test.single.hits', - u'dispatchDBlockTokenForOut': - u'NULL,NULL', - u'destinationDBlockToken': - u'NULL,NULL', - u'destinationSE': - self.get_panda_queue_name(), - u'realDatasets': - job_name, - u'prodUserID': - u'no_one', - u'GUID': - self.guid, - u'realDatasetsIn': - u'user.mlassnig:user.mlassnig.pilot.test.single.hits', - u'nSent': - 0, - u'eventService': - 'true', - u'cloud': - u'US', - u'StatusCode': - 0, - u'homepackage': - 
u'AtlasOffline/21.0.15', - u'inFiles': - self.inFiles, - u'processingType': - u'pilot-ptest', - u'ddmEndPointOut': - u'UTA_SWT2_DATADISK,UTA_SWT2_DATADISK', - u'fsize': - u'118612262', - u'fileDestinationSE': - f"{self.get_panda_queue_name()},{self.get_panda_queue_name()}", - u'scopeOut': - u'panda', - u'minRamCount': - 0, - u'jobDefinitionID': - 7932, - u'maxWalltime': - u'NULL', - u'scopeLog': - u'panda', - u'transformation': - u'Sim_tf.py', - u'maxDiskCount': - 0, - u'coreCount': - self.ncores, - u'prodDBlockToken': - u'NULL', - u'transferType': - u'NULL', - u'destinationDblock': - job_name, - u'dispatchDBlockToken': - u'NULL', - u'jobPars': ( - '--eventService=True --skipEvents=0 --firstEvent=1 --preExec \'from AthenaCommon.DetFlags ' - 'import DetFlags;DetFlags.ID_setOn();DetFlags.Calo_setOff();' - 'DetFlags.Muon_setOff();DetFlags.Lucid_setOff();DetFlags.Truth_setOff()\' ' - '--athenaopts=--preloadlib=${ATLASMKLLIBDIR_PRELOAD}/libimf.so ' - '--preInclude sim:SimulationJobOptions/preInclude.FrozenShowersFCalOnly.py,' - 'SimulationJobOptions/preInclude.BeamPipeKill.py ' - '--geometryVersion ATLAS-R2-2016-01-00-00_VALIDATION --physicsList QGSP_BERT ' - '--randomSeed 1234 --conditionsTag OFLCOND-MC12-SIM-00 ' - '--maxEvents=-1 --inputEvgenFile %s --outputHitsFile HITS_%s.pool.root' - % (self.inFiles, job_name)), - u'attemptNr': - 0, - u'swRelease': - u'Atlas-21.0.15', - u'nucleus': - u'NULL', - u'maxCpuCount': - 0, - u'outFiles': - u'HITS_%s.pool.root,%s.job.log.tgz' % (job_name, job_name), - u'currentPriority': - 1000, - u'scopeIn': - self.scope, - u'PandaID': - self.pandaID, - u'sourceSite': - u'NULL', - u'dispatchDblock': - u'NULL', - u'prodSourceLabel': - u'ptest', - u'checksum': - u'ad:5d000974', - u'jobName': - job_name, - u'ddmEndPointIn': - u'UTA_SWT2_DATADISK', - u'taskID': - self.taskId, - u'logFile': - u'%s.job.log.tgz' % job_name + self.job_queue.put( + { + str(self.pandaID): { + "jobsetID": self.jobsetId, + "logGUID": log_guid, + "cmtConfig": 
"x86_64-slc6-gcc49-opt", + "prodDBlocks": "user.mlassnig:user.mlassnig.pilot.test.single.hits", + "dispatchDBlockTokenForOut": "NULL,NULL", + "destinationDBlockToken": "NULL,NULL", + "destinationSE": self.get_panda_queue_name(), + "realDatasets": job_name, + "prodUserID": "no_one", + "GUID": self.guid, + "realDatasetsIn": "user.mlassnig:user.mlassnig.pilot.test.single.hits", + "nSent": 0, + "eventService": "true", + "cloud": "US", + "StatusCode": 0, + "homepackage": "AtlasOffline/21.0.15", + "inFiles": self.inFiles, + "processingType": "pilot-ptest", + "ddmEndPointOut": "UTA_SWT2_DATADISK,UTA_SWT2_DATADISK", + "fsize": "118612262", + "fileDestinationSE": f"{self.get_panda_queue_name()},{self.get_panda_queue_name()}", + "scopeOut": "panda", + "minRamCount": 0, + "jobDefinitionID": 7932, + "maxWalltime": "NULL", + "scopeLog": "panda", + "transformation": "Sim_tf.py", + "maxDiskCount": 0, + "coreCount": self.ncores, + "prodDBlockToken": "NULL", + "transferType": "NULL", + "destinationDblock": job_name, + "dispatchDBlockToken": "NULL", + "jobPars": ( + "--eventService=True --skipEvents=0 --firstEvent=1 --preExec 'from AthenaCommon.DetFlags " + "import DetFlags;DetFlags.ID_setOn();DetFlags.Calo_setOff();" + "DetFlags.Muon_setOff();DetFlags.Lucid_setOff();DetFlags.Truth_setOff()' " + "--athenaopts=--preloadlib=${ATLASMKLLIBDIR_PRELOAD}/libimf.so " + "--preInclude sim:SimulationJobOptions/preInclude.FrozenShowersFCalOnly.py," + "SimulationJobOptions/preInclude.BeamPipeKill.py " + "--geometryVersion ATLAS-R2-2016-01-00-00_VALIDATION --physicsList QGSP_BERT " + "--randomSeed 1234 --conditionsTag OFLCOND-MC12-SIM-00 " + f"--maxEvents=-1 --inputEvgenFile {self.inFiles} --outputHitsFile HITS_{job_name}.pool.root" + ), + "attemptNr": 0, + "swRelease": "Atlas-21.0.15", + "nucleus": "NULL", + "maxCpuCount": 0, + "outFiles": f"HITS_{job_name}.pool.root,{job_name}.job.log.tgz", + "currentPriority": 1000, + "scopeIn": self.scope, + "PandaID": self.pandaID, + "sourceSite": "NULL", + 
"dispatchDblock": "NULL", + "prodSourceLabel": "ptest", + "checksum": "ad:5d000974", + "jobName": job_name, + "ddmEndPointIn": "UTA_SWT2_DATADISK", + "taskID": self.taskId, + "logFile": f"{job_name}.job.log.tgz", + } } - }) + ) diff --git a/src/raythena/drivers/communicators/harvesterMock2205.py b/src/raythena/drivers/communicators/harvesterMock2205.py index 775a5a0..0b99aee 100644 --- a/src/raythena/drivers/communicators/harvesterMock2205.py +++ b/src/raythena/drivers/communicators/harvesterMock2205.py @@ -3,7 +3,6 @@ import random import time from queue import Queue - from raythena.drivers.communicators.harvesterMock import HarvesterMock from raythena.utils.config import Config from raythena.utils.eventservice import PandaJobRequest @@ -15,24 +14,28 @@ class HarvesterMock2205(HarvesterMock): Same purposes as HarvesterMock except that a job spec for Athena/22.0.5 is provided """ - def __init__(self, requests_queue: Queue, job_queue: Queue, - event_ranges_queue: Queue, config: Config) -> None: + def __init__( + self, + requests_queue: Queue, + job_queue: Queue, + event_ranges_queue: Queue, + config: Config, + ) -> None: """ Initialize communicator thread, input files name, job worker_id, number of events to be distributed """ super().__init__(requests_queue, job_queue, event_ranges_queue, config) - self.communicator_thread = ExThread(target=self.run, - name="communicator-thread") + self.communicator_thread = ExThread(target=self.run, name="communicator-thread") self.event_ranges = None self.pandaID = random.randint(0, 100) self.jobsetId = random.randint(0, 100) self.taskId = random.randint(0, 100) self.config = config - self.scope = 'mc16_13TeV' - self.guid = '74DFB3ED-DAA7-E011-8954-001E4F3D9CB1,74DFB3ED-DAA7-E011-8954-001E4F3D9CB1' + self.scope = "mc16_13TeV" + self.guid = "74DFB3ED-DAA7-E011-8954-001E4F3D9CB1,74DFB3ED-DAA7-E011-8954-001E4F3D9CB1" self.guids = self.guid.split(",") self.inFiles = 
"EVNT.12458444._000048.pool.root.1,EVNT.12458444._000052.pool.root.1" - workdir = os.path.expandvars(self.config.ray['workdir']) + workdir = os.path.expandvars(self.config.ray["workdir"]) self.files = self.inFiles.split(",") self.nfiles = len(self.files) self.inFilesAbs = list() @@ -42,7 +45,7 @@ def __init__(self, requests_queue: Queue, job_queue: Queue, self.nevents_per_file = 50 self.nevents = self.nevents_per_file * self.nfiles self.served_events = 0 - self.ncores = self.config.resources['corepernode'] + self.ncores = self.config.resources["corepernode"] def request_job(self, job_request: PandaJobRequest) -> None: """ @@ -57,124 +60,77 @@ def request_job(self, job_request: PandaJobRequest) -> None: """ md5_hash = hashlib.md5() - md5_hash.update(str(time.time()).encode('utf-8')) + md5_hash.update(str(time.time()).encode("utf-8")) log_guid = md5_hash.hexdigest() - md5_hash.update(str(time.time()).encode('utf-8')) + md5_hash.update(str(time.time()).encode("utf-8")) job_name = md5_hash.hexdigest() - self.job_queue.put({ - str(self.pandaID): { - u'jobsetID': - self.jobsetId, - u'logGUID': - log_guid, - u'cmtConfig': - u'x86_64-centos7-gcc8-opt', - u'prodDBlocks': - u'user.mlassnig:user.mlassnig.pilot.test.single.hits', - u'dispatchDBlockTokenForOut': - u'NULL,NULL', - u'destinationDBlockToken': - u'NULL,NULL', - u'destinationSE': - self.get_panda_queue_name(), - u'realDatasets': - job_name, - u'prodUserID': - u'no_one', - u'GUID': - self.guid, - u'realDatasetsIn': - u'user.mlassnig:user.mlassnig.pilot.test.single.hits', - u'nSent': - 0, - u'eventService': - 'true', - u'cloud': - u'US', - u'StatusCode': - 0, - u'homepackage': - u'Athena/22.0.5', - u'inFiles': - self.inFiles, - u'processingType': - u'pilot-ptest', - u'ddmEndPointOut': - u'UTA_SWT2_DATADISK,UTA_SWT2_DATADISK', - u'fsize': - u'118612262', - u'fileDestinationSE': - f"{self.get_panda_queue_name()},{self.get_panda_queue_name()}", - u'scopeOut': - u'panda', - u'minRamCount': - 0, - u'jobDefinitionID': - 
7932, - u'maxWalltime': - u'NULL', - u'scopeLog': - u'panda', - u'transformation': - u'Sim_tf.py', - u'maxDiskCount': - 0, - u'coreCount': - self.ncores, - u'prodDBlockToken': - u'NULL', - u'transferType': - u'NULL', - u'destinationDblock': - job_name, - u'dispatchDBlockToken': - u'NULL', - u'jobPars': ( - '--multiprocess --eventService=True --skipEvents=0 --firstEvent=1 ' - '--preExec \'from AthenaCommon.DetFlags ' - 'import DetFlags;DetFlags.ID_setOn();DetFlags.Calo_setOff();' - 'DetFlags.Muon_setOff();DetFlags.Lucid_setOff();DetFlags.Truth_setOff()\' ' - '--athenaopts=--preloadlib=${ATLASMKLLIBDIR_PRELOAD}/libimf.so ' - '--preInclude sim:SimulationJobOptions/preInclude.FrozenShowersFCalOnly.py,' - 'SimulationJobOptions/preInclude.BeamPipeKill.py ' - '--geometryVersion default:ATLAS-R2-2016-01-00-01_VALIDATION ' - '--physicsList FTFP_BERT_ATL_VALIDATION --randomSeed 1234 ' - '--conditionsTag default:OFLCOND-MC16-SDR-14 ' - '--maxEvents=-1 --inputEvgenFile %s --outputHitsFile HITS_%s.pool.root' - % (self.inFiles, job_name)), - u'attemptNr': - 0, - u'swRelease': - u'Atlas-22.0.5', - u'nucleus': - u'NULL', - u'maxCpuCount': - 0, - u'outFiles': - u'HITS_%s.pool.root,%s.job.log.tgz' % (job_name, job_name), - u'currentPriority': - 1000, - u'scopeIn': - self.scope, - u'PandaID': - self.pandaID, - u'sourceSite': - u'NULL', - u'dispatchDblock': - u'NULL', - u'prodSourceLabel': - u'ptest', - u'checksum': - u'ad:5d000974', - u'jobName': - job_name, - u'ddmEndPointIn': - u'UTA_SWT2_DATADISK', - u'taskID': - self.taskId, - u'logFile': - u'%s.job.log.tgz' % job_name + self.job_queue.put( + { + str(self.pandaID): { + "jobsetID": self.jobsetId, + "logGUID": log_guid, + "cmtConfig": "x86_64-centos7-gcc8-opt", + "prodDBlocks": "user.mlassnig:user.mlassnig.pilot.test.single.hits", + "dispatchDBlockTokenForOut": "NULL,NULL", + "destinationDBlockToken": "NULL,NULL", + "destinationSE": self.get_panda_queue_name(), + "realDatasets": job_name, + "prodUserID": "no_one", + "GUID": 
self.guid, + "realDatasetsIn": "user.mlassnig:user.mlassnig.pilot.test.single.hits", + "nSent": 0, + "eventService": "true", + "cloud": "US", + "StatusCode": 0, + "homepackage": "Athena/22.0.5", + "inFiles": self.inFiles, + "processingType": "pilot-ptest", + "ddmEndPointOut": "UTA_SWT2_DATADISK,UTA_SWT2_DATADISK", + "fsize": "118612262", + "fileDestinationSE": f"{self.get_panda_queue_name()},{self.get_panda_queue_name()}", + "scopeOut": "panda", + "minRamCount": 0, + "jobDefinitionID": 7932, + "maxWalltime": "NULL", + "scopeLog": "panda", + "transformation": "Sim_tf.py", + "maxDiskCount": 0, + "coreCount": self.ncores, + "prodDBlockToken": "NULL", + "transferType": "NULL", + "destinationDblock": job_name, + "dispatchDBlockToken": "NULL", + "jobPars": ( + "--multiprocess --eventService=True --skipEvents=0 --firstEvent=1 " + "--preExec 'from AthenaCommon.DetFlags " + "import DetFlags;DetFlags.ID_setOn();DetFlags.Calo_setOff();" + "DetFlags.Muon_setOff();DetFlags.Lucid_setOff();DetFlags.Truth_setOff()' " + "--athenaopts=--preloadlib=${ATLASMKLLIBDIR_PRELOAD}/libimf.so " + "--preInclude sim:SimulationJobOptions/preInclude.FrozenShowersFCalOnly.py," + "SimulationJobOptions/preInclude.BeamPipeKill.py " + "--geometryVersion default:ATLAS-R2-2016-01-00-01_VALIDATION " + "--physicsList FTFP_BERT_ATL_VALIDATION --randomSeed 1234 " + "--conditionsTag default:OFLCOND-MC16-SDR-14 " + f"--maxEvents=-1 --inputEvgenFile {self.inFiles} --outputHitsFile HITS_{job_name}.pool.root" + ), + "attemptNr": 0, + "swRelease": "Atlas-22.0.5", + "nucleus": "NULL", + "maxCpuCount": 0, + "outFiles": f"HITS_{job_name}.pool.root,{job_name}.job.log.tgz", + "currentPriority": 1000, + "scopeIn": self.scope, + "PandaID": self.pandaID, + "sourceSite": "NULL", + "dispatchDblock": "NULL", + "prodSourceLabel": "ptest", + "checksum": "ad:5d000974", + "jobName": job_name, + "ddmEndPointIn": "UTA_SWT2_DATADISK", + "taskID": self.taskId, + "logFile": f"{job_name}.job.log.tgz", + } } - }) + ) diff --git 
a/src/raythena/drivers/esdriver.py b/src/raythena/drivers/esdriver.py index 4e01657..fc44142 100644 --- a/src/raythena/drivers/esdriver.py +++ b/src/raythena/drivers/esdriver.py @@ -1,50 +1,64 @@ import configparser +import json import os import re -import json import shutil import stat import tempfile import time import traceback +from collections.abc import Iterable, Iterator, Mapping, Sequence from math import ceil from queue import Empty, Queue from socket import gethostname -from typing import (Any, Dict, Iterator, List, Mapping, Optional, Sequence, Iterable, - Tuple) from subprocess import DEVNULL, Popen - +from typing import ( + Any, + Optional, +) import ray from ray.exceptions import RayActorError from ray.types import ObjectRef from raythena import __version__ from raythena.actors.esworker import ESWorker, WorkerResponse from raythena.drivers.baseDriver import BaseDriver -from raythena.drivers.communicators.baseCommunicator import (BaseCommunicator, - RequestData) -from raythena.drivers.communicators.harvesterFileMessenger import \ - HarvesterFileCommunicator +from raythena.drivers.communicators.baseCommunicator import ( + BaseCommunicator, + RequestData, +) +from raythena.drivers.communicators.harvesterFileMessenger import ( + HarvesterFileCommunicator, +) from raythena.utils.bookkeeper import BookKeeper, TaskStatus from raythena.utils.config import Config -from raythena.utils.eventservice import (EventRange, EventRangeDef, - EventRangeRequest, EventRangeUpdate, - JobDef, Messages, - PandaJobRequest - ) +from raythena.utils.eventservice import ( + EventRange, + EventRangeDef, + EventRangeRequest, + EventRangeUpdate, + JobDef, + Messages, + PandaJobRequest, +) from raythena.utils.exception import BaseRaythenaException -from raythena.utils.logging import (disable_stdout_logging, log_to_file, - make_logger) +from raythena.utils.logging import ( + disable_stdout_logging, + log_to_file, + make_logger, +) from raythena.utils.ray import 
build_nodes_resource_list class ESDriver(BaseDriver): """ The driver is managing all the ray workers and handling the communication with Harvester. It keeps tracks of - which event ranges is assigned to which actor using a BookKeeper instance which provides the interface to read and update the status of each event range. + which event ranges is assigned to which actor using a BookKeeper instance which provides the interface to read + and update the status of each event range. - It will also send requests for jobs, event ranges or update of produced output to harvester by using a communicator instance. - The communicator uses the shared file system to communicate with Harvester and does I/O in a separate thread, - communication between the driver and the communicator is done by message passing using a queue. + It will also send requests for jobs, event ranges or update of produced output to harvester by + using a communicator instance. The communicator uses the shared file system to communicate with + Harvester and does I/O in a separate thread, communication between the driver and the communicator + is done by message passing using a queue. 
The driver is starting one actor per node in the ray cluster except for the ray head node which doesn't execute any worker @@ -72,13 +86,13 @@ def __init__(self, config: Config, session_dir: str) -> None: self.jobs_queue: Queue[Mapping[str, JobDef]] = Queue() self.event_ranges_queue: Queue[Mapping[str, Sequence[EventRangeDef]]] = Queue() - workdir = os.path.expandvars(self.config.ray.get('workdir')) + workdir = os.path.expandvars(self.config.ray.get("workdir")) if not workdir or not os.path.exists(workdir): workdir = os.getcwd() - self.config.ray['workdir'] = workdir + self.config.ray["workdir"] = workdir self.workdir = workdir - self.output_dir = str() - self.merged_files_dir = str() + self.output_dir = "" + self.merged_files_dir = "" logfile = self.config.logging.get("driverlogfile", None) if logfile: log_to_file(self.config.logging.get("level", None), logfile) @@ -91,14 +105,16 @@ def __init__(self, config: Config, session_dir: str) -> None: # self.cpu_monitor = CPUMonitor(os.path.join(workdir, "cpu_monitor_driver.json")) # self.cpu_monitor.start() - self.communicator: BaseCommunicator = HarvesterFileCommunicator(self.requests_queue, - self.jobs_queue, - self.event_ranges_queue, - self.config) + self.communicator: BaseCommunicator = HarvesterFileCommunicator( + self.requests_queue, + self.jobs_queue, + self.event_ranges_queue, + self.config, + ) self.communicator.start() self.requests_queue.put(PandaJobRequest()) - self.actors: Dict[str, ESWorker] = dict() - self.pending_objectref_to_actor: Dict[ObjectRef, str] = dict() + self.actors: dict[str, ESWorker] = dict() + self.pending_objectref_to_actor: dict[ObjectRef, str] = dict() self.actors_message_queue = list() self.bookKeeper = BookKeeper(self.config) self.terminated = list() @@ -107,25 +123,29 @@ def __init__(self, config: Config, session_dir: str) -> None: self.max_retries_error_failed_tasks = 3 self.first_event_range_request = True self.no_more_events = False - self.cache_size_factor = 
self.config.ray.get('cachesizefactor', 3) - self.cores_per_node = self.config.resources.get('corepernode', os.cpu_count()) + self.cache_size_factor = self.config.ray.get("cachesizefactor", 3) + self.cores_per_node = self.config.resources.get("corepernode", os.cpu_count()) self.n_actors = len(self.nodes) self.events_cache_size = self.cores_per_node * self.n_actors * self.cache_size_factor - self.timeoutinterval = self.config.ray['timeoutinterval'] - self.max_running_merge_transforms = self.config.ray['mergemaxprocesses'] + self.timeoutinterval = self.config.ray["timeoutinterval"] + self.max_running_merge_transforms = self.config.ray["mergemaxprocesses"] self.panda_taskid = None - self.pandaqueue = self.config.payload['pandaqueue'] + self.pandaqueue = self.config.payload["pandaqueue"] parser = configparser.ConfigParser() - harvester_config = self.config.harvester['harvesterconf'] - self.queuedata_file = str() - self.container_options = str() - self.container_type = str() - self.jobreport_name = str() + harvester_config = self.config.harvester["harvesterconf"] + self.queuedata_file = "" + self.container_options = "" + self.container_type = "" + self.jobreport_name = "" if not os.path.isfile(harvester_config): self._logger.warning(f"Couldn't find harvester config file {harvester_config}") else: parser.read(harvester_config) - queuedata_config = [queue.split('|')[-1] for queue in parser["cacher"]["data"].splitlines() if queue.startswith(self.pandaqueue)] + queuedata_config = [ + queue.split("|")[-1] + for queue in parser["cacher"]["data"].splitlines() + if queue.startswith(self.pandaqueue) + ] self.jobreport_name = parser["payload_interaction"]["jobReportFile"] if not queuedata_config: self._logger.warning(f"No queuedata config found for {self.pandaqueue}") @@ -133,16 +153,18 @@ def __init__(self, config: Config, session_dir: str) -> None: self._logger.warning(f"cached queudata file not found: {queuedata_config[0]}") else: self.queuedata_file = queuedata_config[0] - 
with open(self.queuedata_file, 'r') as f: + with open(self.queuedata_file) as f: queuedata = json.load(f) self.container_options = queuedata["container_options"] self.container_type = queuedata["container_type"].split(":")[0] - if self.container_type != self.config.payload['containerengine']: - self._logger.warning("Mismatch between pandaqueue and raythena container type. Overriding raythena config") - self.config.payload['containerengine'] = self.container_type + if self.container_type != self.config.payload["containerengine"]: + self._logger.warning( + "Mismatch between pandaqueue and raythena container type. Overriding raythena config" + ) + self.config.payload["containerengine"] = self.container_type # {input_filename, {merged_output_filename, ([(event_range_id, EventRange)], subprocess handle)}} - self.running_merge_transforms: Dict[str, Tuple[List[Tuple[str, EventRange]], Popen, str]] = dict() + self.running_merge_transforms: dict[str, tuple[list[tuple[str, EventRange]], Popen, str]] = dict() self.total_running_merge_transforms = 0 self.failed_actor_tasks_count = dict() self.available_events_per_actor = 0 @@ -172,7 +194,8 @@ def __getitem__(self, key: str) -> ESWorker: def start_actors(self) -> None: """ - Initialize actor communication by performing the first call to get_message() and add the future to the future list. + Initialize actor communication by performing the first call to get_message() and + add the future to the future list. Returns: None @@ -182,33 +205,34 @@ def start_actors(self) -> None: def create_actors(self) -> None: """ - Create actors on each node. Before creating an actor, the driver tries to assign it a job and an initial batch of - event ranges. This avoid having all actors requesting jobs and event ranges at the start of the job. + Create actors on each node. Before creating an actor, the driver tries to assign it a job and an initial + batch of event ranges. 
This avoid having all actors requesting jobs and event ranges at the start of the job. Returns: None """ events_per_actor = min(self.available_events_per_actor, self.cores_per_node) for i, node in enumerate(self.nodes): - nodeip = node['NodeManagerAddress'] + nodeip = node["NodeManagerAddress"] node_constraint = f"node:{nodeip}" actor_id = f"Actor_{i}" kwargs = { - 'actor_id': actor_id, - 'config': self.config_remote, - 'session_log_dir': self.session_log_dir, - 'actor_no': i, - 'actor_count': self.n_actors, + "actor_id": actor_id, + "config": self.config_remote, + "session_log_dir": self.session_log_dir, + "actor_no": i, + "actor_count": self.n_actors, } job = self.bookKeeper.assign_job_to_actor(actor_id) if job: - job_remote = self.remote_jobdef_byid[job['PandaID']] - kwargs['job'] = job_remote + job_remote = self.remote_jobdef_byid[job["PandaID"]] + kwargs["job"] = job_remote event_ranges = self.bookKeeper.fetch_event_ranges(actor_id, events_per_actor) if event_ranges: - kwargs['event_ranges'] = event_ranges + kwargs["event_ranges"] = event_ranges self._logger.debug( - f"Prefetched job {job['PandaID']} and {len(event_ranges)} event ranges for {actor_id}") + f"Prefetched job {job['PandaID']} and {len(event_ranges)} event ranges for {actor_id}" + ) actor = ESWorker.options(resources={node_constraint: 1}).remote(**kwargs) self.actors[actor_id] = actor @@ -216,7 +240,8 @@ def create_actors(self) -> None: def retrieve_actors_messages(self, ready: Sequence[ObjectRef]) -> Iterator[WorkerResponse]: """ Given a list of ready futures from actors, unwrap them and return an interable over the result of each future. - In case one of the futures raised an exception, the exception is handled by this function and not propagated to the caller. + In case one of the futures raised an exception, the exception is handled by this function and + not propagated to the caller. 
Args: ready: a list of read futures @@ -227,7 +252,8 @@ def retrieve_actors_messages(self, ready: Sequence[ObjectRef]) -> Iterator[Worke try: messages = ray.get(ready) except Exception: - # if any of the future raised an exception, we need to handle them one by one to know which one produced the exception. + # if any of the future raised an exception, we need to handle them one by one + # to know which one produced the exception. for r in ready: try: actor_id, message, data = ray.get(r) @@ -236,7 +262,9 @@ def retrieve_actors_messages(self, ready: Sequence[ObjectRef]) -> Iterator[Worke except RayActorError as e: self._logger.error(f"RayActorError: {e.error_msg}") except Exception as e: - self._logger.error(f"Caught exception while fetching result from {self.pending_objectref_to_actor[r]}: {e}") + self._logger.error( + f"Caught exception while fetching result from {self.pending_objectref_to_actor[r]}: {e}" + ) else: yield actor_id, message, data else: @@ -249,7 +277,8 @@ def enqueue_actor_call(self, actor_id: str, future: ObjectRef): def handle_actors(self) -> None: """ - Main function handling messages from all ray actors and dispatching to the appropriate handling function according to the message returned by the actor, + Main function handling messages from all ray actors and dispatching to the appropriate handling + function according to the message returned by the actor, Returns: None @@ -275,29 +304,36 @@ def handle_actors(self) -> None: self._logger.debug("Finished handling the Actors. Raythena will shutdown now.") - def wait_on_messages(self) -> Tuple[List[ObjectRef], List[ObjectRef]]: + def wait_on_messages(self) -> tuple[list[ObjectRef], list[ObjectRef]]: """ Wait on part of the pending futures to complete. Wait for 1 second trying to fetch half of the pending futures. If no futures are ready, then wait another second to fetch a tenth of the pending futures. 
- If there are still no futures ready, then wait for the timeout interval or until one future is ready. If this is the beginning of the job, i.e. no - events have finished processing yet, then wait forever until one future is ready instead of only timeout interval. + If there are still no futures ready, then wait for the timeout interval or until one future is ready. + If this is the beginning of the job, i.e. no events have finished processing yet, then wait forever until + one future is ready instead of only timeout interval. Returns: - Tuple of a list of completed futures and a list of pending futures, respectively + tuple of a list of completed futures and a list of pending futures, respectively """ - if self.bookKeeper.have_finished_events(): - timeoutinterval = self.timeoutinterval - else: - timeoutinterval = None + timeoutinterval = self.timeoutinterval if self.bookKeeper.have_finished_events() else None messages, queue = ray.wait( - self.actors_message_queue, num_returns=max(1, len(self.actors_message_queue) // 2), timeout=1) + self.actors_message_queue, + num_returns=max(1, len(self.actors_message_queue) // 2), + timeout=1, + ) if not messages: messages, queue = ray.wait( - self.actors_message_queue, num_returns=max(1, len(self.actors_message_queue) // 10), timeout=1) + self.actors_message_queue, + num_returns=max(1, len(self.actors_message_queue) // 10), + timeout=1, + ) if not messages: messages, queue = ray.wait( - self.actors_message_queue, num_returns=1, timeout=timeoutinterval) + self.actors_message_queue, + num_returns=1, + timeout=timeoutinterval, + ) return messages, queue def handle_actor_done(self, actor_id: str) -> bool: @@ -356,9 +392,10 @@ def handle_request_event_ranges(self, actor_id: str, data: EventRangeRequest, to the number of events returned in a single request is capped to the number of local events divided by the number of actors. This cap is updated every time new events are retrieved from Harvester. 
- If the driver doesn't have enough events to send to the actor, then it will initiate or wait on a pending event request to Harvester to get more events. - It will only return less events than the request number (or cap) if Harvester returns no events. - Requests to Harvester are skipped if it was flagged as not having any events left for the current actor's job. + If the driver doesn't have enough events to send to the actor, then it will initiate or wait on a pending + event request to Harvester to get more events. It will only return less events than the request number (or cap) + if Harvester returns no events. Requests to Harvester are skipped if it was flagged as not having any events + left for the current actor's job. Args: actor_id: worker sending the event ranges update @@ -371,10 +408,9 @@ def handle_request_event_ranges(self, actor_id: str, data: EventRangeRequest, to panda_id = self.bookKeeper.get_actor_job(actor_id) # get the min between requested ranges and what is available for each actor - n_ranges = min(data[panda_id]['nRanges'], self.available_events_per_actor) + n_ranges = min(data[panda_id]["nRanges"], self.available_events_per_actor) - evt_range = self.bookKeeper.fetch_event_ranges( - actor_id, n_ranges) + evt_range = self.bookKeeper.fetch_event_ranges(actor_id, n_ranges) # did not fetch enough events and harvester might have more, needs to get more events now # while (len(evt_range) < n_ranges and # not self.bookKeeper.is_flagged_no_more_events( @@ -385,9 +421,13 @@ def handle_request_event_ranges(self, actor_id: str, data: EventRangeRequest, to # actor_id, n_ranges) if evt_range: total_sent += len(evt_range) - self.enqueue_actor_call(actor_id, self[actor_id].receive_event_ranges.remote( - Messages.REPLY_OK if evt_range else - Messages.REPLY_NO_MORE_EVENT_RANGES, evt_range)) + self.enqueue_actor_call( + actor_id, + self[actor_id].receive_event_ranges.remote( + Messages.REPLY_OK if evt_range else Messages.REPLY_NO_MORE_EVENT_RANGES, + 
evt_range, + ), + ) self._logger.info(f"Sending {len(evt_range)} events to {actor_id}") return total_sent @@ -407,8 +447,13 @@ def handle_job_request(self, actor_id: str) -> None: # self.request_event_ranges(block=True) # job = self.bookKeeper.assign_job_to_actor(actor_id) - self.enqueue_actor_call(actor_id, self[actor_id].receive_job.remote(Messages.REPLY_OK - if job else Messages.REPLY_NO_MORE_JOBS, self.remote_jobdef_byid[job['PandaID']])) + self.enqueue_actor_call( + actor_id, + self[actor_id].receive_job.remote( + Messages.REPLY_OK if job else Messages.REPLY_NO_MORE_JOBS, + self.remote_jobdef_byid[job["PandaID"]], + ), + ) def request_event_ranges(self, block: bool = False) -> None: """ @@ -434,10 +479,12 @@ def request_event_ranges(self, block: bool = False) -> None: n_available_ranges = self.bookKeeper.n_ready(pandaID) job = self.bookKeeper.jobs[pandaID] if n_available_ranges < self.events_cache_size: - event_request.add_event_request(pandaID, - self.events_cache_size, - job['taskID'], - job['jobsetID']) + event_request.add_event_request( + pandaID, + self.events_cache_size, + job["taskID"], + job["jobsetID"], + ) if len(event_request) > 0: self._logger.debug(f"Sending event ranges request to harvester for {self.events_cache_size} events") @@ -489,8 +536,11 @@ def cleanup(self) -> None: ray.get(handles) def setup_dirs(self): - self.output_dir = os.path.join(os.path.expandvars(self.config.ray.get("taskprogressbasedir")), str(self.panda_taskid)) - with open(self.task_workdir_path_file, 'w') as f: + self.output_dir = os.path.join( + os.path.expandvars(self.config.ray.get("taskprogressbasedir")), + str(self.panda_taskid), + ) + with open(self.task_workdir_path_file, "w") as f: f.write(self.output_dir) self.config.ray["outputdir"] = self.output_dir @@ -522,7 +572,8 @@ def run(self) -> None: Method used to start the driver, initializing actors, retrieving initial job and event ranges, creates job subdir then handle actors until they are all done or stop() has been 
called This function will also create a directory in config.ray.workdir for the retrieved job - with the directory name being the PandaID. Workers will then each create their own subdirectory in that job directory. + with the directory name being the PandaID. Workers will then each create their own + subdirectory in that job directory. Returns: None @@ -535,7 +586,7 @@ def run(self) -> None: if len(jobs) > 1: self._logger.critical("Raythena can only handle one job") return - job = list(jobs.values())[0] + job = next(iter(jobs.values())) job["eventService"] = "true" job["jobPars"] = f"--eventService=True {job['jobPars']}" self.panda_taskid = job["taskID"] @@ -568,7 +619,7 @@ def run(self) -> None: return job_id = self.bookKeeper.jobs.next_job_id_to_process() total_events = self.bookKeeper.n_ready(job_id) - os.makedirs(os.path.join(self.config.ray['workdir'], job_id)) + os.makedirs(os.path.join(self.config.ray["workdir"], job_id)) if total_events: self.available_events_per_actor = max(1, ceil(total_events / self.n_actors)) for pandaID in self.bookKeeper.jobs: @@ -585,7 +636,7 @@ def run(self) -> None: self._logger.error(f"{traceback.format_exc()}") self._logger.error(f"Error while handling actors: {e}. stopping...") - if self.config.logging.get('copyraylogs', False): + if self.config.logging.get("copyraylogs", False): ray_logs = os.path.join(self.workdir, "ray_logs") try: shutil.copytree(self.session_log_dir, ray_logs) @@ -624,7 +675,7 @@ def run(self) -> None: self.bookKeeper.print_status() self._logger.debug("All driver threads stopped. 
Quitting...") - def rename_output_files(self, output_map: Dict[str, str]): + def rename_output_files(self, output_map: dict[str, str]): """ Rename final output files """ @@ -637,9 +688,12 @@ def rename_output_files(self, output_map: Dict[str, str]): if not new_filename: self._logger.warning(f"Couldn't find new name for {file}, will not be staged out correctly") continue - os.rename(os.path.join(self.merged_files_dir, file), os.path.join(self.merged_files_dir, new_filename)) + os.rename( + os.path.join(self.merged_files_dir, file), + os.path.join(self.merged_files_dir, new_filename), + ) - def produce_final_report(self, output_map: Dict[str, str]): + def produce_final_report(self, output_map: dict[str, str]): """ Merge job reports from individual merge transforms to produce the final jobReport for Panda. """ @@ -648,7 +702,7 @@ def produce_final_report(self, output_map: Dict[str, str]): if not files: return - with open(os.path.join(self.job_reports_dir, files[0]), 'r') as f: + with open(os.path.join(self.job_reports_dir, files[0])) as f: final_report = json.load(f) final_report_files = final_report["files"] @@ -661,12 +715,12 @@ def produce_final_report(self, output_map: Dict[str, str]): # read the commit log to recover the correct name. 
If we get another KeyError, we can't recover new_filename = output_map[self.bookKeeper.recover_outputfile_name(old_filename)] output_file_entry["name"] = new_filename - with open(os.path.join(self.job_reports_dir, files[0]), 'w') as f: + with open(os.path.join(self.job_reports_dir, files[0]), "w") as f: json.dump(final_report, f) for file in files[1:]: current_file = os.path.join(self.job_reports_dir, file) - with open(current_file, 'r') as f: + with open(current_file) as f: current_report = json.load(f) final_report_files["input"].append(current_report["files"]["input"][0]) output_file_entry = current_report["files"]["output"][0]["subFiles"][0] @@ -678,11 +732,11 @@ def produce_final_report(self, output_map: Dict[str, str]): new_filename = output_map[self.bookKeeper.recover_outputfile_name(old_filename)] output_file_entry["name"] = new_filename final_report_files["output"][0]["subFiles"].append(output_file_entry) - with open(current_file, 'w') as f: + with open(current_file, "w") as f: json.dump(current_report, f) tmp = os.path.join(self.workdir, self.jobreport_name + ".tmp") - with open(tmp, 'w') as f: + with open(tmp, "w") as f: json.dump(final_report, f) shutil.move(tmp, os.path.join(self.workdir, self.jobreport_name)) @@ -704,8 +758,8 @@ def stop(self) -> None: def handle_actor_exception(self, actor_id: str, ex: Exception) -> None: """ Handle exception that occurred in an actor process. Log the exception and count the number of exceptions - that were produced by the same actor. If the number of exceptions is greater than the threshold, the driver will simply drop the actor - by no longer calling remote functions on it. + that were produced by the same actor. If the number of exceptions is greater than the threshold, + the driver will simply drop the actor by no longer calling remote functions on it. 
Args: actor_id: the actor that raised the exception @@ -731,7 +785,7 @@ def get_output_file_guid(self, job_report_file) -> Optional[str]: """ Extract the GUID from the jobReport of HITSMerge_tf """ - with open(job_report_file, 'r') as f: + with open(job_report_file) as f: job_report = json.load(f) try: guid = job_report["files"]["output"][0]["subFiles"][0]["file_guid"] @@ -742,10 +796,12 @@ def get_output_file_guid(self, job_report_file) -> Optional[str]: def handle_merge_transforms(self, wait_for_completion=False) -> bool: """ Checks if the bookkeeper has files ready to be merged. If so, subprocesses for merge tasks are started. - After starting any subprocess, go through all the running subprocess and poll then to check if any completed and report status to the bookkeepr. + After starting any subprocess, go through all the running subprocess and poll then to check + if any completed and report status to the bookkeepr. Args: - wait_for_completion: Wait for all the subprocesses (including those started by this call) to finish before returning + wait_for_completion: Wait for all the subprocesses + (including those started by this call) to finish before returning Returns: True if new merge jobs were created @@ -758,9 +814,15 @@ def handle_merge_transforms(self, wait_for_completion=False) -> bool: while merge_files: (output_filename, event_ranges) = merge_files assert len(event_ranges) > 0 - (sub_process, job_report_file) = self.hits_merge_transform([e[0] for e in event_ranges], output_filename) + (sub_process, job_report_file) = self.hits_merge_transform( + [e[0] for e in event_ranges], output_filename + ) self._logger.debug(f"Starting merge transform for {output_filename}") - self.running_merge_transforms[output_filename] = (event_ranges, sub_process, job_report_file) + self.running_merge_transforms[output_filename] = ( + event_ranges, + sub_process, + job_report_file, + ) self.total_running_merge_transforms += 1 new_transforms = True if 
self.total_running_merge_transforms >= self.max_running_merge_transforms: @@ -768,7 +830,11 @@ def handle_merge_transforms(self, wait_for_completion=False) -> bool: merge_files = self.bookKeeper.get_file_to_merge() to_remove = [] - for output_filename, (event_ranges, sub_process, job_report_file) in self.running_merge_transforms.items(): + for output_filename, ( + event_ranges, + sub_process, + job_report_file, + ) in self.running_merge_transforms.items(): if wait_for_completion: while sub_process.poll() is None: time.sleep(5) @@ -779,17 +845,26 @@ def handle_merge_transforms(self, wait_for_completion=False) -> bool: self._logger.debug(f"Merge transform for file {output_filename} finished.") event_ranges_map = {} guid = self.get_output_file_guid(job_report_file) - for (event_range_output, event_range) in event_ranges: - event_ranges_map[event_range.eventRangeID] = TaskStatus.build_eventrange_dict(event_range, event_range_output) - self.bookKeeper.report_merged_file(self.panda_taskid, output_filename, event_ranges_map, guid) + for event_range_output, event_range in event_ranges: + event_ranges_map[event_range.eventRangeID] = TaskStatus.build_eventrange_dict( + event_range, event_range_output + ) + self.bookKeeper.report_merged_file( + self.panda_taskid, + output_filename, + event_ranges_map, + guid, + ) else: self.bookKeeper.report_failed_merge_transform(self.panda_taskid, output_filename) - self._logger.debug(f"Merge transform for {output_filename} failed with return code {sub_process.returncode}") + self._logger.debug( + f"Merge transform for {output_filename} failed with return code {sub_process.returncode}" + ) for o in to_remove: del self.running_merge_transforms[o] return new_transforms - def hits_merge_transform(self, input_files: Iterable[str], output_file: str) -> Tuple[Popen, str]: + def hits_merge_transform(self, input_files: Iterable[str], output_file: str) -> tuple[Popen, str]: """ Prepare the shell command for the merging subprocess and starts it. 
@@ -808,43 +883,63 @@ def hits_merge_transform(self, input_files: Iterable[str], output_file: str) -> output_file = os.path.join(self.merged_files_dir, output_file) file_list_path = os.path.join(tmp_dir, "file_list.txt") - with open(file_list_path, 'w') as f: + with open(file_list_path, "w") as f: f.write(file_list) - transform_params = re.sub(r"@inputFor_\$\{OUTPUT0\}", f"@/srv/{os.path.basename(file_list_path)}", self.merge_transform_params) + transform_params = re.sub( + r"@inputFor_\$\{OUTPUT0\}", + f"@/srv/{os.path.basename(file_list_path)}", + self.merge_transform_params, + ) transform_params = re.sub(r"--inputHitsFile=", "--inputHitsFile ", transform_params) transform_params = re.sub(r"--inputHITSFile=", "--inputHITSFile ", transform_params) transform_params = re.sub(r"\$\{OUTPUT0\}", output_file, transform_params, count=1) transform_params = re.sub(r"--autoConfiguration=everything", "", transform_params) transform_params = re.sub(r"--DBRelease=current", "", transform_params) - endtoken = "" if self.config.payload['containerextrasetup'].strip().endswith(";") else ";" - container_script = f"{self.config.payload['containerextrasetup']}{endtoken}{self.merge_transform} {transform_params}" + endtoken = "" if self.config.payload["containerextrasetup"].strip().endswith(";") else ";" + container_script = ( + f"{self.config.payload['containerextrasetup']}{endtoken}" f"{self.merge_transform} {transform_params}" + ) merge_script_path = os.path.join(tmp_dir, "merge_transform.sh") - with open(merge_script_path, 'w') as f: + with open(merge_script_path, "w") as f: f.write(container_script) - os.chmod(merge_script_path, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH) + os.chmod( + merge_script_path, + stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH, + ) setup_script_path = os.path.join(tmp_dir, "release_setup.sh") - setup_script = f"asetup 
Athena,{self.release},notest --platform {self.cmt_config} --makeflags=\'$MAKEFLAGS\'" + setup_script = f"asetup Athena,{self.release},notest --platform {self.cmt_config} --makeflags='$MAKEFLAGS'" self._logger.debug(f"Setting up release with: {setup_script}") - with open(setup_script_path, 'w') as f: + with open(setup_script_path, "w") as f: f.write(setup_script) - os.chmod(setup_script_path, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH) + os.chmod( + setup_script_path, + stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH, + ) - cmd = str() + cmd = "" cmd += "export ATLAS_LOCAL_ROOT_BASE=/cvmfs/atlas.cern.ch/repo/ATLASLocalRootBase;" - cmd += f"export thePlatform=\"{self.the_platform}\";" - endtoken = "" if self.config.payload['containerextraargs'].strip().endswith(";") else ";" - cmd += (f"{self.config.payload['containerextraargs']}{endtoken}" - f"source ${{ATLAS_LOCAL_ROOT_BASE}}/user/atlasLocalSetup.sh --swtype {self.config.payload['containerengine']}" - f" -c $thePlatform -s /srv/release_setup.sh -r /srv/merge_transform.sh -e \"{self.container_options}\";" - f"RETURN_VAL=$?;if [ \"$RETURN_VAL\" -eq 0 ]; then cp jobReport.json {job_report_name};fi;exit $RETURN_VAL;") - return (Popen(cmd, - stdin=DEVNULL, - stdout=DEVNULL, - stderr=DEVNULL, - shell=True, - cwd=tmp_dir, - close_fds=True), job_report_name) + cmd += f'export thePlatform="{self.the_platform}";' + endtoken = "" if self.config.payload["containerextraargs"].strip().endswith(";") else ";" + cmd += ( + f"{self.config.payload['containerextraargs']}{endtoken}" + f"source ${{ATLAS_LOCAL_ROOT_BASE}}/user/atlasLocalSetup.sh" + f" --swtype {self.config.payload['containerengine']}" + f" -c $thePlatform -s /srv/release_setup.sh -r /srv/merge_transform.sh -e \"{self.container_options}\";" + f"RETURN_VAL=$?;if [ \"$RETURN_VAL\" -eq 0 ]; then cp jobReport.json {job_report_name};fi;exit $RETURN_VAL;" + ) + 
return ( + Popen( + cmd, + stdin=DEVNULL, + stdout=DEVNULL, + stderr=DEVNULL, + shell=True, + cwd=tmp_dir, + close_fds=True, + ), + job_report_name, + ) diff --git a/src/raythena/scripts/raythena.py b/src/raythena/scripts/raythena.py index 6971a4f..2550ddf 100755 --- a/src/raythena/scripts/raythena.py +++ b/src/raythena/scripts/raythena.py @@ -1,70 +1,44 @@ #!/usr/bin/env python import functools import signal -import types import traceback - +import types import click - from raythena.drivers.baseDriver import BaseDriver +from raythena.drivers.esdriver import ESDriver from raythena.utils.config import Config from raythena.utils.ray import setup_ray, shutdown_ray -from raythena.drivers.esdriver import ESDriver - @click.command() +@click.option("--config", required=True, help="raythena configuration file.") +@click.option("-d", "--debug", is_flag=True, help="Debug log level") +@click.option("--ray-head-ip", help="IP address of ray head node") +@click.option("--ray-redis-port", help="Port of redis instance used by the ray cluster") +@click.option("--ray-redis-password", help="Redis password setup in the ray cluster") +@click.option("--ray-workdir", help="Workdirectory for ray actors") @click.option( - '--config', - required=True, - help='raythena configuration file.' 
-) -@click.option( - '-d', '--debug', - is_flag=True, - help='Debug log level' -) -@click.option( - '--ray-head-ip', - help='IP address of ray head node' -) -@click.option( - '--ray-redis-port', - help='Port of redis instance used by the ray cluster' -) -@click.option( - '--ray-redis-password', - help='Redis password setup in the ray cluster' -) -@click.option( - '--ray-workdir', - help='Workdirectory for ray actors' -) -@click.option( - '--harvester-endpoint', - help='Directory to use to communicate with harvester' -) -@click.option( - '--panda-queue', - help='Panda queue provided to the payload' + "--harvester-endpoint", + help="Directory to use to communicate with harvester", ) +@click.option("--panda-queue", help="Panda queue provided to the payload") @click.option( - '--core-per-node', - help='Used to determine how many events should be buffered by ray actors' + "--core-per-node", + help="Used to determine how many events should be buffered by ray actors", ) def cli(*args, **kwargs): """ - Starts the application by initializing the config object, connecting or starting the ray cluster, loading the driver - and starting it. + Starts the application by initializing the config object, + connecting or starting the ray cluster, loading the driver and starting it. 
Returns: None """ - config = Config(kwargs['config'], *args, **kwargs) + config = Config(kwargs["config"], *args, **kwargs) cluster_config = setup_ray(config) try: - driver = ESDriver(config, cluster_config['session_dir']) + driver = ESDriver(config, cluster_config["session_dir"]) signal.signal(signal.SIGINT, functools.partial(cleanup, config, driver)) signal.signal(signal.SIGTERM, functools.partial(cleanup, config, driver)) @@ -82,7 +56,12 @@ def cli(*args, **kwargs): shutdown_ray(config) -def cleanup(config: Config, driver: BaseDriver, signum: signal.Signals, frame: types.FrameType) -> None: +def cleanup( + config: Config, + driver: BaseDriver, + signum: signal.Signals, + frame: types.FrameType, +) -> None: """ Signal handler, notify the ray driver to stop @@ -99,7 +78,7 @@ def cleanup(config: Config, driver: BaseDriver, signum: signal.Signals, frame: t def main(): - cli(auto_envvar_prefix='RAYTHENA') + cli(auto_envvar_prefix="RAYTHENA") if __name__ == "__main__": diff --git a/src/raythena/utils/bookkeeper.py b/src/raythena/utils/bookkeeper.py index ce06ad4..be2004c 100644 --- a/src/raythena/utils/bookkeeper.py +++ b/src/raythena/utils/bookkeeper.py @@ -1,32 +1,48 @@ import collections -from functools import reduce import json +import os import threading +import time +from collections import deque +from collections.abc import Mapping, Sequence +from functools import reduce +from typing import ( + Any, + Optional, + Union, +) from raythena.utils.config import Config -from raythena.utils.eventservice import PandaJobQueue, EventRange, PandaJob, EventRangeUpdate, EventRangeDef, JobDef, PilotEventRangeUpdateDef +from raythena.utils.eventservice import ( + EventRange, + EventRangeDef, + EventRangeUpdate, + JobDef, + PandaJob, + PandaJobQueue, + PilotEventRangeUpdateDef, +) from raythena.utils.exception import ExThread from raythena.utils.logging import make_logger -from typing import Deque, Dict, Set, Optional, List, Mapping, Sequence, Union, Tuple, Any - -import time 
-import os - class TaskStatus: """ Utility class which manages the persistancy to file of the progress on a given Panda task. All operations (set_eventrange_simulated, set_eventrange_failed, set_file_merged) are lazy. - They will only enqueue a message which will only be processed just before writting the status to disk in save_status. - The reason for this design is that save_status and the update operations are supposed to be called by different threads and would - therefore add synchronization overhead and latency for the main driver thread responsible for polling actors. Having a single thread - updating and serializing the dictionary eliminate the need for synchronization however it also means that other thread reading the dictionary - (e.g. from get_nsimulated) will get out of date information as there will most likely be update pending in the queue at any point in time + They will only enqueue a message which will only be processed just before writting + the status to disk in save_status. The reason for this design is that save_status and the update + operations are supposed to be called by different threads and would therefore add synchronization overhead + and latency for the main driver thread responsible for polling actors. Having a single thread updating and + serializing the dictionary eliminate the need for synchronization, however it also means that other thread + reading the dictionary (e.g. 
from get_nsimulated) will get out of date information as there will most + likely be update pending in the queue at any point in time Keys set relation of each sub-dictionnary (simulated, merged, failed, merging): - - merged and merging key sets are disjoints -- when a file has been fully merged, its entry is removed from merging and moved into merged - - merged and simulated key sets are disjoints -- when a file has been fully merged, it is no longer necessary to keep track of individual event ranges; + - merged and merging key sets are disjoints -- when a file has been fully merged, + its entry is removed from merging and moved into merged + - merged and simulated key sets are disjoints -- when a file has been fully merged, + it is no longer necessary to keep track of individual event ranges; they are removed from simulated - merging is a subset of simulated -- it is possible for events from a given file to have been simulated but no merge job has completed for that specific file. @@ -46,16 +62,20 @@ def __init__(self, job: PandaJob, merged_files_dir: str, config: Config) -> None self.merged_files_dir = merged_files_dir self.filepath = os.path.join(self.output_dir, "state.json") self.tmpfilepath = f"{self.filepath}.tmp" - self._events_per_file = int(job['nEventsPerInputFile']) - self._nfiles = len(job['inFiles'].split(",")) + self._events_per_file = int(job["nEventsPerInputFile"]) + self._nfiles = len(job["inFiles"].split(",")) self._nevents = self._events_per_file * self._nfiles - self._hits_per_file = int(job['esmergeSpec']['nEventsPerOutputFile']) + self._hits_per_file = int(job["esmergeSpec"]["nEventsPerOutputFile"]) assert (self._events_per_file % self._hits_per_file == 0) or ( - self._hits_per_file % self._events_per_file == 0), "Expected number of events per input file to be a multiple of number of hits per merged file" + self._hits_per_file % self._events_per_file == 0 + ), "Expected number of events per input file to be a multiple of number of hits per 
merged file" # if _hits_per_file > _events_per_file, each input file has a single output file self._n_output_per_input_file = max(1, self._events_per_file // self._hits_per_file) - self._status: Dict[str, Union[Dict[str, Dict[str, Dict[str, str]]], Dict[str, List[str]]]] = dict() - self._update_queue: Deque[Tuple[str, Union[EventRange, Tuple]]] = collections.deque() + self._status: dict[ + str, + Union[dict[str, dict[str, dict[str, str]]], dict[str, list[str]]], + ] = dict() + self._update_queue: deque[tuple[str, Union[EventRange, tuple]]] = collections.deque() self._restore_status() def _default_init_status(self): @@ -83,23 +103,24 @@ def _restore_status(self): filename = self.tmpfilepath try: - with open(filename, 'r') as f: + with open(filename) as f: self._status = json.load(f) except OSError as e: # failed to load status, try to read from a possible tmp file if it exists and not already done if filename != self.tmpfilepath and os.path.isfile(self.tmpfilepath): try: - with open(self.tmpfilepath, 'r') as f: + with open(self.tmpfilepath) as f: self._status = json.load(f) except OSError as ee: self._logger.error(e.strerror) self._logger.error(ee.strerror) self._default_init_status() - def save_status(self, write_to_tmp=True, force_update = False): + def save_status(self, write_to_tmp=True, force_update=False): """ - Save the current status to a json file. Before saving to file, the update queue will be drained, actually carrying out the operations to the dictionary - that will be written to file. + Save the current status to a json file. Before saving to file, + the update queue will be drained, actually carrying out + the operations to the dictionary that will be written to file. 
Args: write_to_tmp: if true, the json data will be written to a temporary file then renamed to the final file @@ -123,7 +144,7 @@ def save_status(self, write_to_tmp=True, force_update = False): if write_to_tmp: filename = self.tmpfilepath try: - with open(filename, 'w') as f: + with open(filename, "w") as f: json.dump(self._status, f) if write_to_tmp: @@ -138,7 +159,7 @@ def is_stale(self) -> bool: return len(self._update_queue) > 0 @staticmethod - def build_eventrange_dict(eventrange: EventRange, output_file: str = None) -> Dict[str, Any]: + def build_eventrange_dict(eventrange: EventRange, output_file: Optional[str] = None) -> dict[str, Any]: """ Takes an EventRange object and retuns the dict representation which should be saved in the state file @@ -147,7 +168,11 @@ def build_eventrange_dict(eventrange: EventRange, output_file: str = None) -> Di Returns: The dictionnary to serialize """ - res = {"eventRangeID": eventrange.eventRangeID, "startEvent": eventrange.startEvent, "lastEvent": eventrange.lastEvent} + res = { + "eventRangeID": eventrange.eventRangeID, + "startEvent": eventrange.startEvent, + "lastEvent": eventrange.lastEvent, + } if output_file: res["path"] = output_file return res @@ -174,9 +199,17 @@ def _set_eventrange_simulated(self, eventrange: EventRange, simulation_output_fi simulated_dict = self._status[TaskStatus.SIMULATED] if filename not in simulated_dict: simulated_dict[filename] = dict() - simulated_dict[filename][eventrange.eventRangeID] = TaskStatus.build_eventrange_dict(eventrange, simulation_output_file) - - def set_file_merged(self, input_files: List[str], outputfile: str, event_ranges: Mapping[str, Mapping[str, str]], guid: Optional[str]): + simulated_dict[filename][eventrange.eventRangeID] = TaskStatus.build_eventrange_dict( + eventrange, simulation_output_file + ) + + def set_file_merged( + self, + input_files: list[str], + outputfile: str, + event_ranges: Mapping[str, Mapping[str, str]], + guid: Optional[str], + ): """ Enqueue a 
message indicating that a file has been merged. @@ -187,7 +220,13 @@ def set_file_merged(self, input_files: List[str], outputfile: str, event_ranges: """ self._update_queue.append((TaskStatus.MERGING, (input_files, outputfile, event_ranges, guid))) - def _set_file_merged(self, input_files: List[str], outputfile: str, event_ranges: Mapping[str, Mapping[str, str]], guid: Optional[str]): + def _set_file_merged( + self, + input_files: list[str], + outputfile: str, + event_ranges: Mapping[str, Mapping[str, str]], + guid: Optional[str], + ): """ Performs the update of the internal dictionnary of a merged file. @@ -201,7 +240,9 @@ def _set_file_merged(self, input_files: List[str], outputfile: str, event_ranges for file in input_files: if file in failed_dict: total_failed += len(failed_dict[file]) - assert len(event_ranges) + total_failed == self._hits_per_file, f"Expected {self._hits_per_file} hits in {outputfile}, got {len(event_ranges)}" + assert ( + len(event_ranges) + total_failed == self._hits_per_file + ), f"Expected {self._hits_per_file} hits in {outputfile}, got {len(event_ranges)}" for inputfile in input_files: if inputfile not in self._status[TaskStatus.MERGING]: self._status[TaskStatus.MERGING][inputfile] = {outputfile: event_ranges} @@ -211,8 +252,11 @@ def _set_file_merged(self, input_files: List[str], outputfile: str, event_ranges if len(self._status[TaskStatus.MERGING][inputfile]) == self._n_output_per_input_file: merged_dict = dict() self._status[TaskStatus.MERGED][inputfile] = merged_dict - for merged_outputfile in self._status[TaskStatus.MERGING][inputfile].keys(): - merged_dict[merged_outputfile] = {"path": os.path.join(self.merged_files_dir, merged_outputfile), "guid": guid if guid else ""} + for merged_outputfile in self._status[TaskStatus.MERGING][inputfile]: + merged_dict[merged_outputfile] = { + "path": os.path.join(self.merged_files_dir, merged_outputfile), + "guid": guid if guid else "", + } del self._status[TaskStatus.MERGING][inputfile] del 
self._status[TaskStatus.SIMULATED][inputfile] else: @@ -248,7 +292,8 @@ def get_nsimulated(self, filename=None) -> int: Total number of event ranges that have been simulated but not yet merged. Args: - filename: if none, returns the total number of simulated events. If specified, returns the number of events simulated for that specific file + filename: if none, returns the total number of simulated events. + If specified, returns the number of events simulated for that specific file Returns: the number of events simulated @@ -261,22 +306,34 @@ def get_nsimulated(self, filename=None) -> int: merged = len(self._status[TaskStatus.MERGING][filename]) * self._hits_per_file return len(self._status[TaskStatus.SIMULATED].get(filename, [])) - merged - return reduce(lambda acc, cur: acc + len(cur), self._status[TaskStatus.SIMULATED].values(), 0) - \ - reduce(lambda acc, cur: acc + len(cur) * self._hits_per_file, self._status[TaskStatus.MERGING].values(), 0) + return reduce( + lambda acc, cur: acc + len(cur), + self._status[TaskStatus.SIMULATED].values(), + 0, + ) - reduce( + lambda acc, cur: acc + len(cur) * self._hits_per_file, + self._status[TaskStatus.MERGING].values(), + 0, + ) def get_nfailed(self, filename=None) -> int: """ Total number of event ranges that have failed. Args: - filename: if none, returns the total number of failed events. If specified, returns the number of events failed for that specific file + filename: if none, returns the total number of failed events. 
+ If specified, returns the number of events failed for that specific file Returns: the number of events failed """ if filename: return len(self._status[TaskStatus.FAILED].get(filename, [])) - return reduce(lambda acc, cur: acc + len(cur), self._status[TaskStatus.FAILED].values(), 0) + return reduce( + lambda acc, cur: acc + len(cur), + self._status[TaskStatus.FAILED].values(), + 0, + ) def get_nmerged(self, filename=None) -> int: """ @@ -293,8 +350,11 @@ def get_nmerged(self, filename=None) -> int: return self._events_per_file elif filename in self._status[TaskStatus.MERGING]: return len(self._status[TaskStatus.MERGING][filename]) * self._hits_per_file - return len(self._status[TaskStatus.MERGED]) * self._events_per_file + \ - reduce(lambda acc, cur: acc + len(cur) * self._hits_per_file, self._status[TaskStatus.MERGING].values(), 0) + return len(self._status[TaskStatus.MERGED]) * self._events_per_file + reduce( + lambda acc, cur: acc + len(cur) * self._hits_per_file, + self._status[TaskStatus.MERGING].values(), + 0, + ) def total_events(self) -> int: """ @@ -303,7 +363,7 @@ def total_events(self) -> int: return self._nevents -class BookKeeper(object): +class BookKeeper: """ Performs bookkeeping of jobs and event ranges distributed to workers """ @@ -311,26 +371,27 @@ class BookKeeper(object): def __init__(self, config: Config) -> None: self.jobs: PandaJobQueue = PandaJobQueue() self.config: Config = config - self.output_dir = str() - self.merged_files_dir = str() - self.commitlog = str() + self.output_dir = "" + self.merged_files_dir = "" + self.commitlog = "" self._logger = make_logger(self.config, "BookKeeper") - self.actors: Dict[str, Optional[str]] = dict() - self.rangesID_by_actor: Dict[str, Set[str]] = dict() - #  Output files for which we are ready to launch a merge transform - self.files_ready_to_merge: Dict[str, List[Tuple[str, EventRange]]] = dict() + self.actors: dict[str, Optional[str]] = dict() + self.rangesID_by_actor: dict[str, set[str]] = dict() + 
# Output files for which we are ready to launch a merge transform + self.files_ready_to_merge: dict[str, list[tuple[str, EventRange]]] = dict() # Event ranges for a given input file which have been simulated and a ready to be merged - self.ranges_to_merge: Dict[str, List[Tuple[str, EventRange]]] = dict() - # Accumulate event ranges of different input files into the same output file until we have enough to produce a merged file + self.ranges_to_merge: dict[str, list[tuple[str, EventRange]]] = dict() + # Accumulate event ranges of different input files into the same output file until + # we have enough to produce a merged file. # Only used when multiple input files are merged in a single output (n-1) to pool input files together - self.output_merge_queue: Dict[str, List[Tuple[str, EventRange]]] = dict() + self.output_merge_queue: dict[str, list[tuple[str, EventRange]]] = dict() # Keep tracks of merge job definition that have been distributed to the driver for which we expect an update - self.ditributed_merge_tasks: Dict[str, List[Tuple[str, EventRange]]] = dict() - self.files_guids: Dict[str, str] = dict() + self.ditributed_merge_tasks: dict[str, list[tuple[str, EventRange]]] = dict() + self.files_guids: dict[str, str] = dict() self.last_status_print = time.time() - self.taskstatus: Dict[str, TaskStatus] = dict() - self._input_output_mapping: Dict[str, List[str]] = dict() - self._output_input_mapping: Dict[str, List[str]] = dict() + self.taskstatus: dict[str, TaskStatus] = dict() + self._input_output_mapping: dict[str, list[str]] = dict() + self._output_input_mapping: dict[str, list[str]] = dict() self.stop_saver = threading.Event() self.stop_cleaner = threading.Event() self.save_state_thread = ExThread(target=self._saver_thead_run, name="status-saver-thread") @@ -345,7 +406,7 @@ def _cleaner_thead_run(self): if os.path.isdir(self.output_dir): files = set(os.listdir(self.output_dir)) for task_status in self.taskstatus.values(): - for merged_file in 
task_status._status[TaskStatus.MERGED].keys(): + for merged_file in task_status._status[TaskStatus.MERGED]: if self.stop_cleaner.is_set(): break for temp_file in files: @@ -361,7 +422,6 @@ def _cleaner_thead_run(self): self.stop_cleaner.wait(60) def _saver_thead_run(self): - while not self.stop_saver.is_set(): self.save_status() # wait for 60s before next update or until the stop condition is met @@ -376,8 +436,8 @@ def save_status(self): def check_mergeable_files(self): """ - Goes through the current task status, checks if a file has been entierly processed (event ranges all simulated or failed) and - if so adds the file to self.files_ready_to_merge + Goes through the current task status, checks if a file has been entierly processed + (event ranges all simulated or failed) and if so adds the file to self.files_ready_to_merge """ if self._hits_per_file >= self._events_per_file: self._check_mergeable_files_n_1() @@ -387,8 +447,8 @@ def check_mergeable_files(self): def _check_mergeable_files_1_n(self): for input_file, event_ranges in self.ranges_to_merge.items(): while len(event_ranges) >= self._hits_per_file: - ranges_to_merge = event_ranges[-self._hits_per_file:] - del event_ranges[-self._hits_per_file:] + ranges_to_merge = event_ranges[-self._hits_per_file :] + del event_ranges[-self._hits_per_file :] output_file = self._input_output_mapping[input_file].pop() self.files_ready_to_merge[output_file] = ranges_to_merge @@ -431,8 +491,9 @@ def start_threads(self): def add_jobs(self, jobs: Mapping[str, JobDef], start_threads=True) -> None: """ - Register new jobs. Event service jobs will not be assigned to worker until event ranges are added to the job. - This will also automatically start the thread responsible for saving the task status to file if the parameter start_save_thread is True. + Register new jobs. Event service jobs will not be assigned to worker until + event ranges are added to the job. 
This will also automatically start the thread + responsible for saving the task status to file if the parameter start_threads is True. If the thread is started, it must be stopped with stop_save_thread before exiting the application Args: @@ -450,7 +511,7 @@ def add_jobs(self, jobs: Mapping[str, JobDef], start_threads=True) -> None: assert self.output_dir assert self.merged_files_dir ts = TaskStatus(job, self.merged_files_dir, self.config) - self.taskstatus[job['taskID']] = ts + self.taskstatus[job["taskID"]] = ts self.commitlog = os.path.join(self.output_dir, "commit_log") self._generate_input_output_mapping(job) self._generate_event_ranges(job, ts) @@ -462,10 +523,10 @@ def _generate_input_output_mapping(self, job: PandaJob): Goes through the list of input and ouput file names and matches expected output files for a given input file """ # Filter out potential log files, only interested in HITS files - output_files = [e for e in job["outFiles"].split(',') if e.startswith("HITS")] - input_files = job["inFiles"].split(',') - events_per_file = int(job['nEventsPerInputFile']) - hits_per_file = int(job['esmergeSpec']['nEventsPerOutputFile']) + output_files = [e for e in job["outFiles"].split(",") if e.startswith("HITS")] + input_files = job["inFiles"].split(",") + events_per_file = int(job["nEventsPerInputFile"]) + hits_per_file = int(job["esmergeSpec"]["nEventsPerOutputFile"]) input_output_mapping = dict() output_input_mapping = dict() @@ -486,8 +547,8 @@ def _generate_input_output_mapping(self, job: PandaJob): n = events_per_file // hits_per_file assert len(input_files) * n == len(output_files) for i, j in zip(range(len(input_files)), range(0, len(output_files), n)): - input_output_mapping[input_files[i]] = output_files[j:(j + n)] - for output_file in output_files[j:(j + n)]: + input_output_mapping[input_files[i]] = output_files[j : (j + n)] + for output_file in output_files[j : (j + n)]: output_input_mapping[output_file] = [input_files[i]] 
self._input_output_mapping = input_output_mapping self._output_input_mapping = output_input_mapping @@ -496,7 +557,7 @@ def _generate_input_output_mapping(self, job: PandaJob): def generate_event_range_id(file: str, n: str): return f"{file}-{n}" - def remap_output_files(self, panda_id: str) -> Dict[str, str]: + def remap_output_files(self, panda_id: str) -> dict[str, str]: """ Translate an existing output file to an output filename matching the current job definition. """ @@ -505,9 +566,9 @@ def remap_output_files(self, panda_id: str) -> Dict[str, str]: if task_status.is_stale(): task_status.save_status() merged_files = task_status._status[TaskStatus.MERGED] - previous_to_current_output_lookup: Dict[str, str] = dict() + previous_to_current_output_lookup: dict[str, str] = dict() - with open(self.commitlog, 'a') as f: + with open(self.commitlog, "a") as f: for input_file, output_files in self._input_output_mapping.items(): merged_output_files = merged_files[input_file] assert isinstance(merged_output_files, dict) @@ -535,7 +596,7 @@ def recover_outputfile_name(self, filename: str) -> str: """ Read the commitlog change history of filename and return the current filename """ - with open(self.commitlog, 'r') as f: + with open(self.commitlog) as f: for line in f: op, *args = line.rstrip().split(" ") if op != "rename_output": @@ -564,19 +625,16 @@ def _generate_event_ranges(self, job: PandaJob, task_status: TaskStatus): job: the job to which the generated event ranges will be assigned task_status: current status of the panda task """ - self._events_per_file = int(job['nEventsPerInputFile']) + self._events_per_file = int(job["nEventsPerInputFile"]) # We only ever get one job - self._hits_per_file = int(job['esmergeSpec']['nEventsPerOutputFile']) + self._hits_per_file = int(job["esmergeSpec"]["nEventsPerOutputFile"]) is_n_to_one = self._hits_per_file >= self._events_per_file - files = job["inFiles"].split(',') + files = job["inFiles"].split(",") if files: - guids = 
job["GUID"].split(',') + guids = job["GUID"].split(",") for file, guid in zip(files, guids): self.files_guids[file] = guid - if "scopeIn" in job: - scope = job["scopeIn"] - else: - scope = "" + scope = job.get("scopeIn", "") event_ranges = [] merged_files = task_status._status[TaskStatus.MERGED] merging_files = task_status._status[TaskStatus.MERGING] @@ -605,7 +663,8 @@ def _generate_event_ranges(self, job: PandaJob, task_status: TaskStatus): # Second pass handling only merged, simulated and not processed files for file, guid in zip(files, guids): - # if all the event ranges in the input file have been merge, or the file was declared as failed in the first pass, move to the next + # if all the event ranges in the input file have been merge, + # or the file was declared as failed in the first pass, move to the next if file in merged_files or file in failed_input_files: continue file_simulated_ranges = simulated_ranges.get(file) @@ -625,7 +684,10 @@ def _generate_event_ranges(self, job: PandaJob, task_status: TaskStatus): continue # event range hasn't been merged but already simulated, add it as ready to be merged if file_simulated_ranges is not None and range_id in file_simulated_ranges: - item = (file_simulated_ranges[range_id]["path"], event_range) + item = ( + file_simulated_ranges[range_id]["path"], + event_range, + ) if event_range.PFN not in self.ranges_to_merge: self.ranges_to_merge[event_range.PFN] = [item] else: @@ -640,13 +702,12 @@ def _generate_event_ranges(self, job: PandaJob, task_status: TaskStatus): self._logger.debug(f"Generated {len(event_ranges)} event ranges") job.event_ranges_queue.add_new_event_ranges(event_ranges) - def add_event_ranges( - self, event_ranges: Mapping[str, Sequence[EventRangeDef]]) -> None: + def add_event_ranges(self, event_ranges: Mapping[str, Sequence[EventRangeDef]]) -> None: """ Assign event ranges to the jobs in queue. 
Args: - event_ranges: List of event ranges dict as returned by harvester + event_ranges: list of event ranges dict as returned by harvester Returns: None @@ -703,11 +764,12 @@ def assign_job_to_actor(self, actor_id: str) -> Optional[PandaJob]: self.actors[actor_id] = job_id return self.jobs[job_id] if job_id else None - def fetch_event_ranges(self, actor_id: str, n: int) -> List[EventRange]: + def fetch_event_ranges(self, actor_id: str, n: int) -> list[EventRange]: """ - Retrieve event ranges for an actor. The specified actor should have a job assigned from assign_job_to_actor() or an empty list will be returned. - If the job assigned to the actor doesn't have enough range currently available, it will assign all of its remaining anges - to the worker without trying to get new ranges from harvester. + Retrieve event ranges for an actor. The specified actor should have + a job assigned from assign_job_to_actor() or an empty list will be returned. + If the job assigned to the actor doesn't have enough range currently available, + it will assign all of its remaining ranges to the worker without trying to get new ranges from harvester. Args: actor_id: actor requesting event ranges @@ -720,12 +782,13 @@ def fetch_event_ranges(self, actor_id: str, n: int) -> List[EventRange]: return list() if actor_id not in self.rangesID_by_actor: self.rangesID_by_actor[actor_id] = set() - ranges = self.jobs.get_event_ranges( - self.actors[actor_id]).get_next_ranges(n) + ranges = self.jobs.get_event_ranges(self.actors[actor_id]).get_next_ranges(n) self.rangesID_by_actor[actor_id].update(map(lambda e: e.eventRangeID, ranges)) return ranges - def get_file_to_merge(self) -> Optional[Tuple[str, List[Tuple[str, EventRange]]]]: + def get_file_to_merge( + self, + ) -> Optional[tuple[str, list[tuple[str, EventRange]]]]: """ Returns a merge tasks available for an arbitrary input file if available, None otherwise. 
""" @@ -735,20 +798,35 @@ def get_file_to_merge(self) -> Optional[Tuple[str, List[Tuple[str, EventRange]]] return merge_task return None - def report_merged_file(self, taskID: str, merged_output_file: str, merged_event_ranges: Mapping[str, Mapping[str, str]], guid: Optional[str]): + def report_merged_file( + self, + taskID: str, + merged_output_file: str, + merged_event_ranges: Mapping[str, Mapping[str, str]], + guid: Optional[str], + ): assert merged_output_file in self.ditributed_merge_tasks del self.ditributed_merge_tasks[merged_output_file] - self.taskstatus[taskID].set_file_merged(self._output_input_mapping[merged_output_file], merged_output_file, merged_event_ranges, guid) + self.taskstatus[taskID].set_file_merged( + self._output_input_mapping[merged_output_file], + merged_output_file, + merged_event_ranges, + guid, + ) def report_failed_merge_transform(self, taskID: str, merged_output_file: str): assert merged_output_file in self.ditributed_merge_tasks old_task = self.ditributed_merge_tasks.pop(merged_output_file) self.files_ready_to_merge[merged_output_file] = old_task - def process_event_ranges_update(self, actor_id: str, event_ranges_update: Union[Sequence[PilotEventRangeUpdateDef], EventRangeUpdate]): + def process_event_ranges_update( + self, + actor_id: str, + event_ranges_update: Union[Sequence[PilotEventRangeUpdateDef], EventRangeUpdate], + ): """ - Process the event ranges update sent by the worker. This will update the status of event ranges in the update as well as building - the list of event ranges to be tarred up for each input file. + Process the event ranges update sent by the worker. This will update the status of event ranges + in the update as well as building the list of event ranges to be tarred up for each input file. 
Args: actor_id: actor worker_id that sent the update @@ -763,20 +841,23 @@ def process_event_ranges_update(self, actor_id: str, event_ranges_update: Union[ return if not isinstance(event_ranges_update, EventRangeUpdate): - event_ranges_update = EventRangeUpdate.build_from_dict( - panda_id, event_ranges_update) + event_ranges_update = EventRangeUpdate.build_from_dict(panda_id, event_ranges_update) self.jobs.process_event_ranges_update(event_ranges_update) - task_status = self.taskstatus[self.jobs[panda_id]['taskID']] + task_status = self.taskstatus[self.jobs[panda_id]["taskID"]] job_ranges = self.jobs.get_event_ranges(panda_id) actor_ranges = self.rangesID_by_actor[actor_id] # 1st pass for failed ranges failed_files = [] for r in event_ranges_update[panda_id]: - status = r['eventStatus'] - if 'eventRangeID' in r and r['eventRangeID'] in actor_ranges and status in [EventRange.FAILED, EventRange.FATAL]: + status = r["eventStatus"] + if ( + "eventRangeID" in r + and r["eventRangeID"] in actor_ranges + and status in [EventRange.FAILED, EventRange.FATAL] + ): self._logger.info(f"Received failed event from {actor_id}: {r}") - evnt_range = job_ranges[r['eventRangeID']] + evnt_range = job_ranges[r["eventRangeID"]] if evnt_range.PFN in failed_files: continue failed_files.extend(self.get_files_to_merge_with(evnt_range.PFN)) @@ -788,14 +869,14 @@ def process_event_ranges_update(self, actor_id: str, event_ranges_update: Union[ task_status.set_eventrange_failed(job_ranges[event_range_id]) for r in event_ranges_update[panda_id]: - if 'eventRangeID' in r and r['eventRangeID'] in actor_ranges: - range_id = r['eventRangeID'] + if "eventRangeID" in r and r["eventRangeID"] in actor_ranges: + range_id = r["eventRangeID"] actor_ranges.remove(range_id) evnt_range = job_ranges[range_id] if evnt_range.PFN in failed_files: continue - if r['eventStatus'] == EventRange.DONE: - task_status.set_eventrange_simulated(evnt_range, r['path']) + if r["eventStatus"] == EventRange.DONE: + 
task_status.set_eventrange_simulated(evnt_range, r["path"]) if evnt_range.PFN not in self.ranges_to_merge: self.ranges_to_merge[evnt_range.PFN] = list() self.ranges_to_merge[evnt_range.PFN].append((r["path"], evnt_range)) @@ -842,8 +923,7 @@ def process_actor_end(self, actor_id: str) -> None: return self._logger.info(f"{actor_id} finished with {len(actor_ranges)} events remaining to process") for rangeID in actor_ranges: - self.jobs.get_event_ranges(panda_id).update_range_state( - rangeID, EventRange.READY) + self.jobs.get_event_ranges(panda_id).update_range_state(rangeID, EventRange.READY) actor_ranges.clear() self.actors[actor_id] = None @@ -861,13 +941,13 @@ def n_ready(self, panda_id: str) -> int: def n_events(self, panda_id: str) -> int: """ - Total number of events for a given Panda job + Total number of events for a given Panda job - Args: - panda_id: job worker_id to check + Args: + panda_id: job worker_id to check - Returns: - Number of events in panda_id + Returns: + Number of events in panda_id """ return len(self.jobs.get_event_ranges(panda_id)) @@ -875,7 +955,8 @@ def is_flagged_no_more_events(self, panda_id: str) -> bool: """ Checks if a job can still receive more event ranges from harvester. This function returning Trued doesn't guarantee that Harvester has more events available, - only that it may or may not have more events available. If false is returned, Harvester doesn't have more events available + only that it may or may not have more events available. 
+ If false is returned, Harvester doesn't have more events available Args: panda_id: job worker_id to check diff --git a/src/raythena/utils/config.py b/src/raythena/utils/config.py index c73ccee..6a95457 100644 --- a/src/raythena/utils/config.py +++ b/src/raythena/utils/config.py @@ -1,9 +1,48 @@ import os - import yaml - -class Config(object): +required_conf_settings = { + "payload": { + "pandaqueue": str, + "logfilename": str, + "extrasetup": str, + "hpcresource": str, + "extrapostpayload": str, + "containerengine": str, + "containerextraargs": str, + "containerextrasetup": str, + "pilotkillfile": str, + "pilotversion": str, + "pilotkilltime": int, + "timemonitorfile": str, + }, + "harvester": { + "endpoint": str, + "harvesterconf": str, + }, + "ray": { + "workdir": str, + "taskprogressbasedir": str, + "headip": str, + "redisport": int, + "redispassword": str, + "timeoutinterval": int, + "mergemaxprocesses": int, + "cachesizefactor": int, + }, + "resources": { + "corepernode": int, + }, + "logging": { + "level": str, + "driverlogfile": str, + "workerlogfile": str, + "copyraylogs": bool, + }, +} + + +class Config: """Class storing app configuration. This class will store configuration by prioritizing in the following order: @@ -11,50 +50,10 @@ class Config(object): Note that not all arguments can be specified using cli or env variable, some of them can only be specified from the conf file. See the file for more information about which settings can be specified using cli. Any parameter can be specified in the config file, the only constraint checked being that - attributes in Config.required_conf_settings should be present in the config file. This allows to specify + attributes in required_conf_settings should be present in the config file. This allows to specify custom settings for plugins if necessary. 
""" - required_conf_settings = { - 'payload': { - "pandaqueue": str, - "logfilename": str, - "extrasetup": str, - "hpcresource": str, - "extrapostpayload": str, - "containerengine": str, - "containerextraargs": str, - "containerextrasetup": str, - "pilotkillfile": str, - "pilotversion": str, - "pilotkilltime": int, - "timemonitorfile": str, - }, - 'harvester': { - 'endpoint': str, - 'harvesterconf': str, - }, - 'ray': { - 'workdir': str, - 'taskprogressbasedir': str, - 'headip': str, - 'redisport': int, - 'redispassword': str, - 'timeoutinterval': int, - 'mergemaxprocesses': int, - 'cachesizefactor': int, - }, - 'resources': { - 'corepernode': int, - }, - 'logging': { - 'level': str, - 'driverlogfile': str, - 'workerlogfile': str, - 'copyraylogs': bool - } - } - def __init__(self, config_path: str, *args, **kwargs) -> None: """Parse the config file to an object @@ -89,11 +88,18 @@ def __str__(self): """ return str(self.__dict__) - def _parse_cli_args(self, config: str, debug: bool, - ray_head_ip: str, - ray_redis_password: str, ray_redis_port: str, - ray_workdir: str, harvester_endpoint: str, - panda_queue: str, core_per_node: int) -> None: + def _parse_cli_args( + self, + config: str, + debug: bool, + ray_head_ip: str, + ray_redis_password: str, + ray_redis_port: str, + ray_workdir: str, + harvester_endpoint: str, + panda_queue: str, + core_per_node: int, + ) -> None: """ Overrides config settings with settings specified via cli / env vars @@ -114,24 +120,28 @@ def _parse_cli_args(self, config: str, debug: bool, None """ if debug: - self.logging['level'] = 'debug' + self.logging["level"] = "debug" if ray_head_ip: - self.ray['headip'] = ray_head_ip + self.ray["headip"] = ray_head_ip if ray_redis_port: - self.ray['redispassword'] = ray_redis_password + self.ray["redispassword"] = ray_redis_password if ray_redis_port: - self.ray['redisport'] = ray_redis_port + self.ray["redisport"] = ray_redis_port if ray_workdir: - self.ray['workdir'] = ray_workdir + 
self.ray["workdir"] = ray_workdir if harvester_endpoint: - self.harvester['endpoint'] = harvester_endpoint + self.harvester["endpoint"] = harvester_endpoint if panda_queue: - self.payload['pandaqueue'] = panda_queue + self.payload["pandaqueue"] = panda_queue if core_per_node: - self.resources['corepernode'] = int(core_per_node) - - def _validate_section(self, template_section_name: str, - section_params: dict, template_params: dict) -> None: + self.resources["corepernode"] = int(core_per_node) + + def _validate_section( + self, + template_section_name: str, + section_params: dict, + template_params: dict, + ) -> None: """ Validate one section of the config file @@ -147,13 +157,14 @@ def _validate_section(self, template_section_name: str, Exception: Invalid configuration file """ for name, value in template_params.items(): - if name not in section_params.keys(): - raise Exception( - f"Param '{name}' not found in conf section '{template_section_name}'" - ) + if name not in section_params: + raise Exception(f"Param '{name}' not found in conf section '{template_section_name}'") if isinstance(value, dict): - self._validate_section(f"{template_section_name}.{name}", - section_params.get(name), value) + self._validate_section( + f"{template_section_name}.{name}", + section_params.get(name), + value, + ) def _validate(self) -> None: """ @@ -166,12 +177,11 @@ def _validate(self) -> None: Exception: config file is invalid """ # validate pilot section - for template_section, template_params in Config.required_conf_settings.items( - ): + for ( + template_section, + template_params, + ) in required_conf_settings.items(): section_params = getattr(self, template_section, None) if section_params is None: - raise Exception( - f"Malformed configuration file: section '{template_section}' not found" - ) - self._validate_section(template_section, section_params, - template_params) + raise Exception(f"Malformed configuration file: section '{template_section}' not found") + 
self._validate_section(template_section, section_params, template_params) diff --git a/src/raythena/utils/eventservice.py b/src/raythena/utils/eventservice.py index d5e5aaf..4e14c5f 100644 --- a/src/raythena/utils/eventservice.py +++ b/src/raythena/utils/eventservice.py @@ -1,23 +1,18 @@ import json import os - -from typing import Set, Union, Dict, List, Mapping, Iterable, Any, Optional, Sequence, MutableMapping +from collections.abc import Iterable, Mapping, MutableMapping, Sequence +from typing import ( + Any, + Optional, + Union, +) # Types aliases Builtin = Union[int, float, str] -JobDef = Dict[str, Builtin] +JobDef = dict[str, Builtin] EventRangeDef = MutableMapping[str, Builtin] FileInfo = Mapping[str, Builtin] -PilotEventRangeUpdateDef = Mapping[ - str, - Union[ - Builtin, - FileInfo, - Sequence[ - EventRangeDef - ] - ] -] +PilotEventRangeUpdateDef = Mapping[str, Union[Builtin, FileInfo, Sequence[EventRangeDef]]] HarvesterEventRangeUpdateDef = Sequence[MutableMapping[str, Builtin]] EventRangeUpdateDef = Union[Sequence[PilotEventRangeUpdateDef], HarvesterEventRangeUpdateDef] @@ -25,10 +20,11 @@ # Messages sent by ray actor to the driver -class Messages(object): +class Messages: """ Defines messages exchanged between ray actors and the driver """ + REQUEST_NEW_JOB = 0 REQUEST_EVENT_RANGES = 1 UPDATE_JOB = 2 @@ -79,7 +75,7 @@ def default(self, o: Any) -> Any: return super().default(o) -class PandaJobQueue(object): +class PandaJobQueue: """ Build from the reply to a job request. Harvester will provide the following JSON as a reply: Provides utility methods to manage the job queue such as retrieving a spcific job, assigning jobs to workers. 
@@ -95,21 +91,21 @@ class PandaJobQueue(object): See PandaJob doc for the format """ - def __init__(self, jobs: Mapping[str, JobDef] = None) -> None: - self.jobs: Dict[str, PandaJob] = dict() + def __init__(self, jobs: Optional[Mapping[str, JobDef]] = None) -> None: + self.jobs: dict[str, PandaJob] = dict() self.distributed_jobs_ids = list() if jobs: self.add_jobs(jobs) - def __getitem__(self, k: str) -> 'PandaJob': + def __getitem__(self, k: str) -> "PandaJob": return self.jobs[k] - def __setitem__(self, k: str, v: 'PandaJob') -> None: + def __setitem__(self, k: str, v: "PandaJob") -> None: if isinstance(v, PandaJob): self.jobs[k] = v else: - raise Exception(f"{v} is not of type {PandaJob}") + raise ValueError(f"{v} is not of type {PandaJob}") def __iter__(self) -> Iterable[str]: return iter(self.jobs) @@ -120,7 +116,7 @@ def __len__(self) -> int: def __contains__(self, k: str) -> bool: return self.has_job(k) - def next_job_to_process(self) -> Optional['PandaJob']: + def next_job_to_process(self) -> Optional["PandaJob"]: """ Retrieve the next available job in the jobqueue. 
If the job is an eventservice job, it needs to have event ranges available otherwise it will not be considered as available @@ -174,7 +170,7 @@ def add_jobs(self, jobs: Mapping[str, JobDef]) -> None: for jobID, jobDef in jobs.items(): self.jobs[jobID] = PandaJob(jobDef) - def get_event_ranges(self, panda_id: str) -> 'EventRangeQueue': + def get_event_ranges(self, panda_id: str) -> "EventRangeQueue": """ Retrieve the EventRangeQueue for the given panda job @@ -187,8 +183,7 @@ def get_event_ranges(self, panda_id: str) -> 'EventRangeQueue': if panda_id in self.jobs: return self[panda_id].event_ranges_queue - def process_event_ranges_update(self, - ranges_update: 'EventRangeUpdate') -> None: + def process_event_ranges_update(self, ranges_update: "EventRangeUpdate") -> None: """ Update the range status Args: @@ -221,7 +216,7 @@ def process_event_ranges_reply(self, reply: Mapping[str, HarvesterEventRangeUpda self.get_event_ranges(pandaID).add_new_event_ranges(ranges_obj) @staticmethod - def build_from_dict(jobs_dict: Mapping[str, JobDef]) -> 'PandaJobQueue': + def build_from_dict(jobs_dict: Mapping[str, JobDef]) -> "PandaJobQueue": """ Convert dict of jobs returned by harvester to a PandaJobQueue. 
Args: @@ -314,7 +309,7 @@ def pop(self): return obj -class EventRangeQueue(object): +class EventRangeQueue: """ Each PandaJob has an eventRangeQueue that should be filled from a reply to an event ranges request: @@ -339,11 +334,11 @@ def __init__(self) -> None: """ Init the queue """ - self.event_ranges_by_id: Dict[str, EventRange] = dict() - self.rangesID_by_state: Dict[str, Set[str]] = dict() + self.event_ranges_by_id: dict[str, EventRange] = dict() + self.rangesID_by_state: dict[str, set[str]] = dict() # only holds event ranges that are ready - self.rangesID_by_file: Dict[str, Set[str]] = dict() - self.event_ranges_count: Dict[str, int] = dict() + self.rangesID_by_file: dict[str, set[str]] = dict() + self.event_ranges_count: dict[str, int] = dict() for s in EventRange.STATES: self.event_ranges_count[s] = 0 self.rangesID_by_state[s] = set() @@ -354,14 +349,14 @@ def __iter__(self) -> Iterable[str]: def __len__(self) -> int: return len(self.event_ranges_by_id) - def __getitem__(self, k: str) -> 'EventRange': + def __getitem__(self, k: str) -> "EventRange": return self.event_ranges_by_id[k] - def __setitem__(self, k: str, v: 'EventRange') -> None: + def __setitem__(self, k: str, v: "EventRange") -> None: if not isinstance(v, EventRange): - raise Exception(f"{v} should be of type {EventRange}") + raise ValueError(f"{v} should be of type {EventRange}") if k != v.eventRangeID: - raise Exception(f"Specified key '{k}' should be equals to the event range id '{v.eventRangeID}' ") + raise KeyError(f"Specified key '{k}' should be equals to the event range id '{v.eventRangeID}' ") if k in self.event_ranges_by_id: self.rangesID_by_state[v.status].remove(k) if v.PFN in self.rangesID_by_file: @@ -374,7 +369,9 @@ def __contains__(self, k: str) -> bool: return k in self.event_ranges_by_id @staticmethod - def build_from_list(ranges_list: Iterable[EventRangeDef]) -> 'EventRangeQueue': + def build_from_list( + ranges_list: Iterable[EventRangeDef], + ) -> "EventRangeQueue": """ Build 
an EventRangeQueue from a list of event ranges sent by harvester @@ -392,7 +389,7 @@ def build_from_list(ranges_list: Iterable[EventRangeDef]) -> 'EventRangeQueue': def _get_file_from_id(self, range_id: str) -> str: return os.path.basename(self.event_ranges_by_id[range_id].PFN) - def update_range_state(self, range_id: str, new_state: str) -> 'EventRange': + def update_range_state(self, range_id: str, new_state: str) -> "EventRange": """ Update the status of an event range Args: @@ -403,8 +400,7 @@ def update_range_state(self, range_id: str, new_state: str) -> 'EventRange': the updated event range """ if range_id not in self.event_ranges_by_id: - raise Exception( - f"Trying to update non-existing eventrange {range_id}") + raise KeyError(f"Trying to update non-existing eventrange {range_id}") event_range = self.event_ranges_by_id[range_id] if new_state != EventRange.READY and event_range.status == EventRange.READY: @@ -420,11 +416,11 @@ def update_range_state(self, range_id: str, new_state: str) -> 'EventRange': # rangesID_by_file only hold ids of ranges that are ready to be assigned return event_range - def assign_ready_ranges(self, n_ranges=1) -> List['EventRange']: + def assign_ready_ranges(self, n_ranges=1) -> list["EventRange"]: n_ranges = min(self.nranges_available(), n_ranges) if not n_ranges: return list() - res: List[Optional['EventRange']] = [None] * n_ranges + res: list[Optional[EventRange]] = [None] * n_ranges res_idx = 0 ready = self.rangesID_by_state[EventRange.READY] assigned = self.rangesID_by_state[EventRange.ASSIGNED] @@ -460,8 +456,8 @@ def update_ranges(self, ranges_update: Sequence[EventRangeDef]) -> None: None """ for r in ranges_update: - range_id = r['eventRangeID'] - range_status = r['eventStatus'] + range_id = r["eventRangeID"] + range_status = r["eventStatus"] if range_id not in self.event_ranges_by_id: raise Exception() self.update_range_state(range_id, range_status) @@ -476,8 +472,7 @@ def nranges_remaining(self) -> int: Returns: Number 
of event ranges which are not finished or failed """ - return len(self.event_ranges_by_id) - (self.nranges_done() + - self.nranges_failed()) + return len(self.event_ranges_by_id) - (self.nranges_done() + self.nranges_failed()) def nranges_available(self) -> int: """ @@ -515,7 +510,7 @@ def nranges_done(self) -> int: """ return self._get_ranges_count(EventRange.DONE) - def append(self, event_range: Union[EventRangeDef, 'EventRange']) -> None: + def append(self, event_range: Union[EventRangeDef, "EventRange"]) -> None: """ Append a single event range to the queue @@ -536,7 +531,7 @@ def append(self, event_range: Union[EventRangeDef, 'EventRange']) -> None: self.rangesID_by_file[event_range.PFN].add(event_range.eventRangeID) self.event_ranges_count[event_range.status] += 1 - def add_new_event_ranges(self, ranges: Sequence['EventRange']) -> None: + def add_new_event_ranges(self, ranges: Sequence["EventRange"]) -> None: # PRE: all ranges in the list are in state ready self.rangesID_by_state[EventRange.READY].update(map(lambda e: e.eventRangeID, ranges)) self.event_ranges_count[EventRange.READY] += len(ranges) @@ -546,7 +541,7 @@ def add_new_event_ranges(self, ranges: Sequence['EventRange']) -> None: self.rangesID_by_file[r.PFN] = set() self.rangesID_by_file[r.PFN].add(r.eventRangeID) - def concat(self, ranges: Sequence[Union[EventRangeDef, 'EventRange']]) -> None: + def concat(self, ranges: Sequence[Union[EventRangeDef, "EventRange"]]) -> None: """ Concatenate a list of event ranges to the queue @@ -559,7 +554,7 @@ def concat(self, ranges: Sequence[Union[EventRangeDef, 'EventRange']]) -> None: for r in ranges: self.append(r) - def get_next_ranges(self, nranges: int) -> List['EventRange']: + def get_next_ranges(self, nranges: int) -> list["EventRange"]: """ Dequeue event ranges. Event ranges which were dequeued are updated to the 'ASSIGNED' status and should be assigned to workers to be processed. 
In case more ranges are requested @@ -574,7 +569,7 @@ def get_next_ranges(self, nranges: int) -> List['EventRange']: return self.assign_ready_ranges(n_ranges=nranges) -class PandaJobUpdate(object): +class PandaJobUpdate: """ Wrapper for jobUpdate @@ -604,11 +599,11 @@ def __init__(self, **kwargs) -> None: def __str__(self) -> str: return str(self.__dict__) - def to_dict(self) -> Dict[str, Builtin]: + def to_dict(self) -> dict[str, Builtin]: return self.__dict__ -class EventRangeUpdate(object): +class EventRangeUpdate: """ Event ranges update sent by pilot 2 using JSON schema: [ @@ -679,7 +674,10 @@ class EventRangeUpdate(object): """ - def __init__(self, range_update: Dict[str, List[MutableMapping[str, Union[str, int]]]] = None) -> None: + def __init__( + self, + range_update: Optional[dict[str, list[MutableMapping[str, Union[str, int]]]]] = None, + ) -> None: """ Wraps the range update dict in an object. The range update should be in the harvester-supported format. @@ -687,12 +685,12 @@ def __init__(self, range_update: Dict[str, List[MutableMapping[str, Union[str, i range_update: range update """ if not range_update: - self.range_update: Dict[str, HarvesterEventRangeUpdateDef] = dict() + self.range_update: dict[str, HarvesterEventRangeUpdateDef] = dict() else: for v in range_update.values(): if not isinstance(v, list): - raise Exception(f"Expecting type list for element {v}") - self.range_update: Dict[str, HarvesterEventRangeUpdateDef] = range_update + raise ValueError(f"Expecting type list for element {v}") + self.range_update: dict[str, HarvesterEventRangeUpdateDef] = range_update def __len__(self) -> int: return len(self.range_update) @@ -708,10 +706,10 @@ def __getitem__(self, k: str) -> HarvesterEventRangeUpdateDef: def __setitem__(self, k: str, v: HarvesterEventRangeUpdateDef) -> None: if not isinstance(v, list): - raise Exception(f"Expecting type list for element {v}") + raise ValueError(f"Expecting type list for element {v}") self.range_update[k] = v - def 
merge_update(self, other: 'EventRangeUpdate') -> None: + def merge_update(self, other: "EventRangeUpdate") -> None: for pandaID in other: if pandaID in self: self[pandaID] += other[pandaID] @@ -719,8 +717,7 @@ def merge_update(self, other: 'EventRangeUpdate') -> None: self[pandaID] = other[pandaID] @staticmethod - def build_from_dict(panda_id: str, - range_update: Sequence[PilotEventRangeUpdateDef]) -> 'EventRangeUpdate': + def build_from_dict(panda_id: str, range_update: Sequence[PilotEventRangeUpdateDef]) -> "EventRangeUpdate": """ Parses a range_update dict to a format adapted to be sent to harvester. @@ -733,23 +730,25 @@ def build_from_dict(panda_id: str, """ update_dict = dict() update_dict[panda_id] = list() - if isinstance( - range_update, dict - ) and "zipFile" not in range_update and "esOutput" not in range_update \ - and "eventRangeID" not in range_update: - range_update: Sequence[PilotEventRangeUpdateDef] = json.loads(range_update['eventRanges'][0]) + if ( + isinstance(range_update, dict) + and "zipFile" not in range_update + and "esOutput" not in range_update + and "eventRangeID" not in range_update + ): + range_update: Sequence[PilotEventRangeUpdateDef] = json.loads(range_update["eventRanges"][0]) for range_elt in range_update: - if "zipFile" in range_elt and range_elt["zipFile"]: + if range_elt.get("zipFile"): range_update_type = "zipFile" - file_info: FileInfo = range_elt.get('zipFile', None) - elif "esOutput" in range_elt and range_elt["esOutput"]: + file_info: FileInfo = range_elt.get("zipFile", None) + elif range_elt.get("esOutput"): range_update_type = "esOutput" - file_info: FileInfo = range_elt.get('esOutput', None) + file_info: FileInfo = range_elt.get("esOutput", None) else: range_update_type = None file_info: None = None - ranges_info: Sequence[EventRangeDef] = range_elt.get('eventRanges', None) + ranges_info: Sequence[EventRangeDef] = range_elt.get("eventRanges", None) file_data = dict() if file_info: @@ -757,37 +756,37 @@ def 
build_from_dict(panda_id: str, ftype = "es_output" else: ftype = "zip_output" - file_data['path'] = file_info['lfn'] - file_data['chksum'] = file_info['adler32'] - file_data['fsize'] = file_info['fsize'] - file_data['type'] = ftype + file_data["path"] = file_info["lfn"] + file_data["chksum"] = file_info["adler32"] + file_data["fsize"] = file_info["fsize"] + file_data["type"] = ftype if ranges_info: for rangeInfo in ranges_info: elt = dict() - elt['eventRangeID'] = rangeInfo['eventRangeID'] - elt['eventStatus'] = rangeInfo['eventStatus'] + elt["eventRangeID"] = rangeInfo["eventRangeID"] + elt["eventStatus"] = rangeInfo["eventStatus"] if range_update_type == "esOutput": - elt['path'] = rangeInfo['pfn'] - elt['chksum'] = rangeInfo['adler32'] - elt['fsize'] = rangeInfo['fsize'] + elt["path"] = rangeInfo["pfn"] + elt["chksum"] = rangeInfo["adler32"] + elt["fsize"] = rangeInfo["fsize"] elt.update(file_data) update_dict[panda_id].append(elt) else: elt = dict() - elt['eventRangeID'] = range_elt['eventRangeID'] - elt['eventStatus'] = range_elt['eventStatus'] + elt["eventRangeID"] = range_elt["eventRangeID"] + elt["eventStatus"] = range_elt["eventStatus"] if range_update_type == "esOutput": - elt['path'] = range_elt['pfn'] - elt['chksum'] = range_elt['adler32'] - elt['fsize'] = range_elt['fsize'] + elt["path"] = range_elt["pfn"] + elt["chksum"] = range_elt["adler32"] + elt["fsize"] = range_elt["fsize"] elt.update(file_data) update_dict[panda_id].append(elt) return EventRangeUpdate(update_dict) -class PandaJobRequest(object): +class PandaJobRequest: """ Wrapper for a job request. 
Pilot2 requests job using the following JSON schema: @@ -807,17 +806,19 @@ class PandaJobRequest(object): Note that harvester will ignore the content of the job request file and simply check if it exists """ - def __init__(self, - node: str = None, - disk_space: str = None, - working_group: str = None, - prod_source_label: str = None, - computing_element: str = None, - site_name: str = None, - resource_type: str = None, - mem: str = None, - cpu: str = None, - allow_other_country: str = None) -> None: + def __init__( + self, + node: str = "", + disk_space: str = "", + working_group: str = "", + prod_source_label: str = "", + computing_element: str = "", + site_name: str = "", + resource_type: str = "", + mem: str = "", + cpu: str = "", + allow_other_country: str = "", + ) -> None: self.node = node self.diskSpace = disk_space self.workingGroup = working_group @@ -832,11 +833,11 @@ def __init__(self, def __str__(self) -> str: return str(self.__dict__) - def to_dict(self) -> Dict[str, Builtin]: + def to_dict(self) -> dict[str, Builtin]: return self.__dict__ -class EventRangeRequest(object): +class EventRangeRequest: """ Send event request to harvester. Event ranges for multiple jobs can be requested in a singled request. 
Harvester expects the following JSON schema: @@ -852,7 +853,7 @@ class EventRangeRequest(object): """ def __init__(self) -> None: - self.request: Dict[str, Dict[str, Builtin]] = dict() + self.request: dict[str, dict[str, Builtin]] = dict() def __len__(self) -> int: return len(self.request) @@ -860,7 +861,7 @@ def __len__(self) -> int: def __iter__(self) -> Iterable[str]: return iter(self.request) - def __getitem__(self, k: str) -> Dict[str, Builtin]: + def __getitem__(self, k: str) -> dict[str, Builtin]: return self.request[k] def __str__(self) -> str: @@ -880,14 +881,16 @@ def add_event_request(self, panda_id: str, n_ranges: int, task_id: str, jobset_i """ self.request[panda_id] = { - 'pandaID': panda_id, - 'nRanges': n_ranges, - 'taskID': task_id, - 'jobsetID': jobset_id + "pandaID": panda_id, + "nRanges": n_ranges, + "taskID": task_id, + "jobsetID": jobset_id, } @staticmethod - def build_from_dict(request_dict: Mapping[str, Dict[str, Builtin]]) -> 'EventRangeRequest': + def build_from_dict( + request_dict: Mapping[str, dict[str, Builtin]], + ) -> "EventRangeRequest": """ Build a request object from a dict parsed from its json representation @@ -902,7 +905,7 @@ def build_from_dict(request_dict: Mapping[str, Dict[str, Builtin]]) -> 'EventRan return request -class PandaJob(object): +class PandaJob: """ Wrapper for a panda jobspec. 
Usually contains the following fields: { @@ -945,7 +948,7 @@ class PandaJob(object): --athenaopts=--preloadlib=${ATLASMKLLIBDIR_PRELOAD}/libimf.so --preInclude sim:SimulationJobOptions/ preInclude.FrozenShowersFCalOnly.py,SimulationJobOptions/preInclude.BeamPipeKill.py - --geometryVersion ATLAS-R2-2016-01-00-00_VALIDATION --physicsList QGSP_BERT --randomSeed 1234 + --geometryVersion ATLAS-R2-2016-01-00-00_VALIDATION --physicslist QGSP_BERT --randomSeed 1234 --conditionsTag OFLCOND-MC12-SIM-00 --maxEvents=-1 --inputEvgenFile EVNT.01469903._009502.pool.root.1 --outputHitsFile HITS_%s.pool.root' % job_name, 'attemptNr': 0, @@ -968,7 +971,7 @@ class PandaJob(object): """ def __init__(self, job_def: JobDef) -> None: - self.job: Dict[str, Builtin] = job_def + self.job: dict[str, Builtin] = job_def if "PandaID" in self: self["PandaID"] = str(self["PandaID"]) self.event_ranges_queue: EventRangeQueue = EventRangeQueue() @@ -995,7 +998,7 @@ def nranges_available(self) -> int: """ return self.event_ranges_queue.nranges_available() - def get_next_ranges(self, nranges: int) -> List['EventRange']: + def get_next_ranges(self, nranges: int) -> list["EventRange"]: """ See Also: EventRangeQueue.get_next_ranges() @@ -1009,7 +1012,7 @@ def get_pandaQueue(self) -> str: Returns: Name of the panda queue from which harvester is retrieving jobs """ - return self['destinationSE'] + return self["destinationSE"] def get_id(self) -> str: """ @@ -1018,7 +1021,12 @@ def get_id(self) -> str: Returns: the job worker_id """ - return self['PandaID'] + return self["PandaID"] + + def get(self, k: str, default: Any = "") -> Builtin: + if k in self.job: + return self.job[k] + return default def __str__(self) -> str: return json.dumps(self.job) @@ -1039,7 +1047,7 @@ def __contains__(self, k: str) -> bool: return k in self.job -class EventRange(object): +class EventRange: """ Hold an event range: { @@ -1064,10 +1072,17 @@ class EventRange(object): DONE = "finished" FAILED = "failed" FATAL = "fatal" - 
STATES = [READY, ASSIGNED, DONE, FAILED, FATAL] - - def __init__(self, event_range_id: str, start_event: int, last_event: int, - pfn: str, guid: str, scope: str) -> None: + STATES = frozenset([READY, ASSIGNED, DONE, FAILED, FATAL]) + + def __init__( + self, + event_range_id: str, + start_event: int, + last_event: int, + pfn: str, + guid: str, + scope: str, + ) -> None: """ Initialize the range @@ -1090,7 +1105,7 @@ def __init__(self, event_range_id: str, start_event: int, last_event: int, def set_assigned(self) -> None: """ - Set current state to ASSIGNED + set current state to ASSIGNED Returns: None @@ -1099,7 +1114,7 @@ def set_assigned(self) -> None: def set_done(self) -> None: """ - Set current state to DONE + set current state to DONE Returns: None @@ -1108,7 +1123,7 @@ def set_done(self) -> None: def set_failed(self) -> None: """ - Set current state to FAILED + set current state to FAILED Returns: None @@ -1133,7 +1148,7 @@ def __str__(self) -> str: """ return json.dumps(self.to_dict()) - def __eq__(self, o: 'EventRange') -> bool: + def __eq__(self, o: "EventRange") -> bool: if not isinstance(o, EventRange): return False return self.eventRangeID == o.eventRangeID @@ -1146,15 +1161,15 @@ def to_dict(self) -> EventRangeDef: dict serialization of the range """ return { - 'PFN': self.PFN, - 'lastEvent': self.lastEvent, - 'eventRangeID': self.eventRangeID, - 'startEvent': self.startEvent, - 'GUID': self.GUID + "PFN": self.PFN, + "lastEvent": self.lastEvent, + "eventRangeID": self.eventRangeID, + "startEvent": self.startEvent, + "GUID": self.GUID, } @staticmethod - def build_from_dict(event_ranges_dict: EventRangeDef) -> 'EventRange': + def build_from_dict(event_ranges_dict: EventRangeDef) -> "EventRange": """ Construct an event range from a dict returned by harvester @@ -1165,13 +1180,16 @@ def build_from_dict(event_ranges_dict: EventRangeDef) -> 'EventRange': EventRange object """ return EventRange( - event_ranges_dict['eventRangeID'], 
event_ranges_dict['startEvent'], - event_ranges_dict['lastEvent'], - event_ranges_dict.get('PFN', event_ranges_dict.get('LFN', None)), - event_ranges_dict['GUID'], event_ranges_dict['scope']) + event_ranges_dict["eventRangeID"], + event_ranges_dict["startEvent"], + event_ranges_dict["lastEvent"], + event_ranges_dict.get("PFN", event_ranges_dict.get("LFN", None)), + event_ranges_dict["GUID"], + event_ranges_dict["scope"], + ) -class JobReport(object): +class JobReport: """ Wrapper for a job report. Raythena creates a job report after the job has finished: @@ -1183,10 +1201,7 @@ class JobReport(object): """ - def __init__(self, - exitCode: int = 0, - exitMsg: str = None, - exitMsgExtra: str = None) -> None: + def __init__(self, exitCode: int = 0, exitMsg: Optional[str] = None, exitMsgExtra: Optional[str] = None) -> None: self.exitCode = exitCode self.exitMsg = exitMsg self.exitMsgExtra = exitMsgExtra @@ -1194,5 +1209,5 @@ def __init__(self, def __str__(self) -> str: return str(self.__dict__) - def to_dict(self) -> Dict[str, Builtin]: + def to_dict(self) -> dict[str, Builtin]: return self.__dict__ diff --git a/src/raythena/utils/exception.py b/src/raythena/utils/exception.py index 4e8b321..eff9a0c 100644 --- a/src/raythena/utils/exception.py +++ b/src/raythena/utils/exception.py @@ -1,26 +1,27 @@ import threading -from queue import Queue, Empty +from queue import Empty, Queue +from typing import Optional +ILLEGAL_WORKER_STATE = 20 +STAGEIN_FAILED = 30 +STAGEOUT_FAILED = 40 +PAYLOAD_FAILED = 50 +UNKNOWN = 0 -class ErrorCodes(object): +ERROR_CODES_GENRIC_MESSAGES = { + ILLEGAL_WORKER_STATE: "Illegal worker state transition", + STAGEIN_FAILED: "Failed to stagein data", + STAGEOUT_FAILED: "Failed to stageout data", + PAYLOAD_FAILED: "Payload execution failed", + UNKNOWN: "Unknown error", +} + + +class ErrorCodes: """ Defines error codes constants and associated default error message for each error code """ - ILLEGAL_WORKER_STATE = 20 - STAGEIN_FAILED = 30 - 
STAGEOUT_FAILED = 40 - PAYLOAD_FAILED = 50 - UNKNOWN = 0 - - ERROR_CODES_GENRIC_MESSAGES = { - ILLEGAL_WORKER_STATE: "Illegal worker state transition", - STAGEIN_FAILED: "Failed to stagein data", - STAGEOUT_FAILED: "Failed to stageout data", - PAYLOAD_FAILED: "Payload execution failed", - UNKNOWN: "Unknown error" - } - @staticmethod def get_error_message(error_code: int) -> str: """ @@ -32,7 +33,7 @@ def get_error_message(error_code: int) -> str: Returns: The default error message """ - return ErrorCodes.ERROR_CODES_GENRIC_MESSAGES.get(error_code, "") + return ERROR_CODES_GENRIC_MESSAGES.get(error_code, "") class ExThread(threading.Thread): @@ -105,7 +106,7 @@ class BaseRaythenaException(Exception): Base class for raythena exception """ - def __init__(self, worker_id: str, error_code: int, message: str = None) -> None: + def __init__(self, worker_id: str, error_code: int, message: Optional[str] = None) -> None: """ Initialize worker_id, error code and message @@ -116,8 +117,7 @@ def __init__(self, worker_id: str, error_code: int, message: str = None) -> None """ self.worker_id = worker_id self.error_code = error_code - self.message = message if message else ErrorCodes.get_error_message( - error_code) + self.message = message if message else ErrorCodes.get_error_message(error_code) super().__init__(self.message) def __reduce__(self): @@ -129,13 +129,22 @@ class IllegalWorkerState(BaseRaythenaException): Raised when the worker state tries to transition to a state he shouldn't be able to from its current state. 
""" - def __init__(self, worker_id: str, src_state: str, dst_state: str, message: str = None) -> None: + def __init__( + self, + worker_id: str, + src_state: str, + dst_state: str, + message: str = "", + ) -> None: super().__init__(worker_id, ErrorCodes.ILLEGAL_WORKER_STATE, message) self.src_state = src_state self.dst_state = dst_state def __reduce__(self): - return (self.__class__, (self.worker_id, self.src_state, self.dst_state, self.message)) + return ( + self.__class__, + (self.worker_id, self.src_state, self.dst_state, self.message), + ) class StageInFailed(BaseRaythenaException): @@ -143,7 +152,7 @@ class StageInFailed(BaseRaythenaException): Raised when the worker was unable to stage-in data """ - def __init__(self, worker_id: str, message: str = None) -> None: + def __init__(self, worker_id: str, message: Optional[str] = None) -> None: super().__init__(worker_id, ErrorCodes.STAGEIN_FAILED, message) def __reduce__(self): @@ -155,7 +164,7 @@ class StageOutFailed(BaseRaythenaException): Raised when the worker was unable to stage-out data """ - def __init__(self, worker_id: str, message: str = None) -> None: + def __init__(self, worker_id: str, message: Optional[str] = None) -> None: super().__init__(worker_id, ErrorCodes.STAGEOUT_FAILED, message) def __reduce__(self): @@ -167,7 +176,7 @@ class FailedPayload(BaseRaythenaException): Raised when the worker payload failed """ - def __init__(self, worker_id: str, message: str = None) -> None: + def __init__(self, worker_id: str, message: Optional[str] = None) -> None: super().__init__(worker_id, ErrorCodes.PAYLOAD_FAILED, message) def __reduce__(self): @@ -179,7 +188,7 @@ class UnknownException(BaseRaythenaException): Raised when no other exception type applies """ - def __init__(self, worker_id: str, message: str = None) -> None: + def __init__(self, worker_id: str, message: Optional[str] = None) -> None: super().__init__(worker_id, ErrorCodes.UNKNOWN, message) def __reduce__(self): diff --git 
a/src/raythena/utils/importUtils.py b/src/raythena/utils/importUtils.py index 17ae8ef..7b3168d 100644 --- a/src/raythena/utils/importUtils.py +++ b/src/raythena/utils/importUtils.py @@ -16,10 +16,10 @@ def import_from_string(module_path: str) -> Callable: Raises: ImportError if the specified class couldn't be found """ - module, _, instance = module_path.partition(':') + module, _, instance = module_path.partition(":") module = importlib.import_module(module) - for elt in instance.split('.'): + for elt in instance.split("."): if not hasattr(module, elt): raise ImportError(f"Can't import {elt} from {module}") module = getattr(module, elt) diff --git a/src/raythena/utils/logging.py b/src/raythena/utils/logging.py index d89ad20..895f45c 100644 --- a/src/raythena/utils/logging.py +++ b/src/raythena/utils/logging.py @@ -1,13 +1,13 @@ import logging import sys from time import gmtime - +from typing import Optional from raythena.utils.config import Config _initialized = False -def make_logger(config: Config, name: str, filepath: str = None) -> logging.Logger: +def make_logger(config: Config, name: str, filepath: Optional[str] = None) -> logging.Logger: global _initialized if not _initialized: configure_logger(config, filepath) @@ -16,7 +16,7 @@ def make_logger(config: Config, name: str, filepath: str = None) -> logging.Logg def log_to_file(log_level, filepath: str): - fh = logging.FileHandler(filepath, mode='w') + fh = logging.FileHandler(filepath, mode="w") fh.setFormatter(logging.Formatter(*get_fmt(log_level))) logging.getLogger().addHandler(fh) @@ -29,11 +29,11 @@ def disable_stdout_logging(): def get_fmt(log_level): - if logging.DEBUG == logging.getLevelName(log_level): + if logging.getLevelName(log_level) == logging.DEBUG: fmt = "{asctime} | {levelname:8} | {name}:{funcName} | {message}" else: fmt = "{asctime} | {levelname:8} | {name} | {message}" - return fmt, "%Y-%m-%d %H:%M:%S", '{' + return fmt, "%Y-%m-%d %H:%M:%S", "{" def configure_logger(config: Config, 
filepath: str) -> None: @@ -47,11 +47,11 @@ def configure_logger(config: Config, filepath: str) -> None: Returns: None """ - log_level = config.logging.get('level', 'warning').upper() + log_level = config.logging.get("level", "warning").upper() logging.Formatter.converter = gmtime handlers = list() if filepath: - fh = logging.FileHandler(filepath, mode='w') + fh = logging.FileHandler(filepath, mode="w") handlers.append(fh) else: ch = logging.StreamHandler(sys.stdout) @@ -62,4 +62,5 @@ def configure_logger(config: Config, filepath: str) -> None: style=style, datefmt=datefmt, level=logging.getLevelName(log_level), - handlers=handlers) + handlers=handlers, + ) diff --git a/src/raythena/utils/ray.py b/src/raythena/utils/ray.py index ae0b37e..ee7c6cf 100644 --- a/src/raythena/utils/ray.py +++ b/src/raythena/utils/ray.py @@ -1,12 +1,10 @@ -from typing import List, Mapping, Any - +from collections.abc import Mapping +from typing import Any import ray - from raythena.utils.config import Config -def build_nodes_resource_list(config: Config, - run_actor_on_head: bool = False) -> List[Mapping[str, Any]]: +def build_nodes_resource_list(config: Config, run_actor_on_head: bool = False) -> list[Mapping[str, Any]]: """ Build and setup ray custom resources. Actors should then be instantiated by requiring one of the resource in the returned list. 
@@ -22,11 +20,11 @@ def build_nodes_resource_list(config: Config, nodes = ray.nodes() if len(nodes) == 1: # only a head node run_actor_on_head = True - head_ip = config.ray['headip'] + head_ip = config.ray["headip"] resource_list = list() for node in nodes: - naddr = node['NodeManagerAddress'] - if not node['alive'] or (not run_actor_on_head and naddr == head_ip): + naddr = node["NodeManagerAddress"] + if not node["alive"] or (not run_actor_on_head and naddr == head_ip): continue else: resource_list.extend([node]) @@ -55,8 +53,7 @@ def is_external_cluster(config: Config) -> bool: Returns: True if raythena is connecting to an existing cluster, False otherwise """ - return config.ray['headip'] is not None and config.ray[ - 'redisport'] is not None + return config.ray["headip"] is not None and config.ray["redisport"] is not None def setup_ray(config: Config) -> Any: @@ -69,10 +66,14 @@ def setup_ray(config: Config) -> Any: Returns: dict of cluster params """ - log_to_driver = True if not config.logging.get('workerlogfile', None) else False + log_to_driver = bool(not config.logging.get("workerlogfile", None)) if is_external_cluster(config): ray_url = f"{config.ray['headip']}:{config.ray['redisport']}" - return ray.init(address=ray_url, _redis_password=config.ray['redispassword'], log_to_driver=log_to_driver) + return ray.init( + address=ray_url, + _redis_password=config.ray["redispassword"], + log_to_driver=log_to_driver, + ) else: return ray.init(log_to_driver=log_to_driver) diff --git a/src/raythena/utils/timing.py b/src/raythena/utils/timing.py index d8e2d7a..b29ba15 100644 --- a/src/raythena/utils/timing.py +++ b/src/raythena/utils/timing.py @@ -1,10 +1,8 @@ -import psutil -import time import json - +import time from threading import Event -from typing import Any, Dict, List, Union - +from typing import Any, Union +import psutil from raythena.utils.exception import ExThread @@ -12,6 +10,7 @@ class CPUMonitor: """ Monitoring tools recording system cpu utilization as 
well as process cpu utilization to a file """ + def __init__(self, log_file: str, pid: Any = None) -> None: self.process = psutil.Process(pid) self.log_file = log_file @@ -43,7 +42,7 @@ def stop(self) -> None: self.monitor_thread = ExThread(target=self.monitor_cpu, name="cpu_monitor") self.stop_event = Event() - def _log_to_file(self, data: Dict[str, Union[Dict[str, List], List, int]]) -> None: + def _log_to_file(self, data: dict[str, Union[dict[str, list], list, int]]) -> None: """ Write data to log file @@ -53,7 +52,7 @@ def _log_to_file(self, data: Dict[str, Union[Dict[str, List], List, int]]) -> No Returns: None """ - with open(self.log_file, 'w') as f: + with open(self.log_file, "w") as f: json.dump(data, f) def monitor_cpu(self) -> None: @@ -93,7 +92,7 @@ def monitor_cpu(self) -> None: "system_usage": system_usage, "process_usage": process_usage, "process_times": process_times, - "time_step": self.time_step + "time_step": self.time_step, } while not self.stop_event.is_set(): @@ -102,7 +101,7 @@ def monitor_cpu(self) -> None: process_usage.append(self.process.cpu_percent()) process_cpu_times = self.process.cpu_times() - for k in process_times.keys(): + for k in process_times: process_times[k].append(getattr(process_cpu_times, k)) if time.time() >= last_write + self.write_interval: @@ -112,7 +111,6 @@ def monitor_cpu(self) -> None: class Timing: - def __init__(self): self._timings = dict() diff --git a/tests/conftest.py b/tests/conftest.py index 8debc51..f4a98e0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,8 +1,6 @@ import hashlib import time - import pytest - from raythena.utils.config import Config from raythena.utils.ray import setup_ray, shutdown_ray @@ -21,16 +19,18 @@ def requires_ray(config_base): @pytest.fixture(scope="class") def config_base(config_path): - return Config(config_path, - config=None, - debug=False, - ray_head_ip=None, - ray_redis_password=None, - ray_redis_port=None, - ray_workdir=None, - harvester_endpoint=None, - 
panda_queue=None, - core_per_node=None) + return Config( + config_path, + config=None, + debug=False, + ray_head_ip=None, + ray_redis_password=None, + ray_redis_port=None, + ray_workdir=None, + harvester_endpoint=None, + panda_queue=None, + core_per_node=None, + ) @pytest.fixture @@ -57,9 +57,9 @@ def is_eventservice(request): @pytest.fixture def pandaids(njobs): res = [] - for i in range(njobs): + for _ in range(njobs): hash = hashlib.md5() - hash.update(str(time.time()).encode('utf-8')) + hash.update(str(time.time()).encode("utf-8")) res.append(hash.hexdigest()) return res @@ -95,41 +95,39 @@ def sample_ranges(range_ids, pandaids, input_output_file_list): range_list = list() res[pandaID] = range_list for i in range(nevents): - range_list.append({ - 'lastEvent': i, - 'eventRangeID': range_ids[i], - 'startEvent': i, - 'scope': '13Mev', - 'LFN': files[i % nfiles], - 'GUID': '0' - }) + range_list.append( + { + "lastEvent": i, + "eventRangeID": range_ids[i], + "startEvent": i, + "scope": "13Mev", + "LFN": files[i % nfiles], + "GUID": "0", + } + ) return res @pytest.fixture def sample_rangeupdate(range_ids): - return [{ - "zipFile": { - "numEvents": len(range_ids), - "lfn": "EventService_premerge_Range-00000.tar", - "adler32": "36503831", - "objstoreID": 1641, - "fsize": 860160, - "pathConvention": 1000 - }, - "eventRanges": [{ - "eventRangeID": r, - "eventStatus": "finished" - } for r in range_ids] - }] + return [ + { + "zipFile": { + "numEvents": len(range_ids), + "lfn": "EventService_premerge_Range-00000.tar", + "adler32": "36503831", + "objstoreID": 1641, + "fsize": 860160, + "pathConvention": 1000, + }, + "eventRanges": [{"eventRangeID": r, "eventStatus": "finished"} for r in range_ids], + } + ] @pytest.fixture def sample_failed_rangeupdate(range_ids): - return [{ - "eventRangeID": r, - "eventStatus": "failed" - } for r in range_ids] + return [{"eventRangeID": r, "eventStatus": "failed"} for r in range_ids] @pytest.fixture @@ -148,141 +146,102 @@ def 
input_output_file_list(nfiles, nhits_per_file, nevents_per_file): @pytest.fixture -def sample_multijobs(request, input_output_file_list, is_eventservice, pandaids, nhits_per_file, nevents_per_file): +def sample_multijobs( + request, + input_output_file_list, + is_eventservice, + pandaids, + nhits_per_file, + nevents_per_file, +): res = {} (input_files, output_files) = input_output_file_list for pandaID in pandaids: hash = hashlib.md5() - hash.update(str(time.time()).encode('utf-8')) + hash.update(str(time.time()).encode("utf-8")) log_guid = hash.hexdigest() - hash.update(str(time.time()).encode('utf-8')) + hash.update(str(time.time()).encode("utf-8")) job_name = hash.hexdigest() - jobsetId = '0' - taskId = '0' - ncores = '8' - guid = '0' + jobsetId = "0" + taskId = "0" + ncores = "8" + guid = "0" scope = "13Mev" panda_queue_name = f"pandaqueue_{hash.hexdigest()}" inFiles = ",".join(input_files) outFiles = ",".join(output_files) outFilesShort = f"[{','.join([str(i) for i in range(len(outFiles))])}]" res[pandaID] = { - u'jobsetID': - jobsetId, - u'nEventsPerInputFile': nevents_per_file, - u'esmergeSpec': { + "jobsetID": jobsetId, + "nEventsPerInputFile": nevents_per_file, + "esmergeSpec": { "transPath": "", "jobParameters": "", - "nEventsPerOutputFile": nhits_per_file + "nEventsPerOutputFile": nhits_per_file, }, - u'logGUID': - log_guid, - u'cmtConfig': - u'x86_64-slc6-gcc49-opt', - u'prodDBlocks': - u'user.mlassnig:user.mlassnig.pilot.test.single.hits', - u'dispatchDBlockTokenForOut': - u'NULL,NULL', - u'destinationDBlockToken': - u'NULL,NULL', - u'destinationSE': - panda_queue_name, - u'realDatasets': - job_name, - u'prodUserID': - u'no_one', - u'GUID': - ",".join([f"{guid}{i}" for i in range(len(input_files))]), - u'realDatasetsIn': - u'user.mlassnig:user.mlassnig.pilot.test.single.hits', - u'nSent': - 0, - u'eventService': - str(is_eventservice), - u'cloud': - u'US', - u'StatusCode': - 0, - u'homepackage': - u'AtlasOffline/21.0.15', - u'inFiles': - inFiles, - 
u'processingType': - u'pilot-ptest', - u'ddmEndPointOut': - u'UTA_SWT2_DATADISK,UTA_SWT2_DATADISK', - u'fsize': - u'118612262', - u'fileDestinationSE': - f"{panda_queue_name},{panda_queue_name}", - u'scopeOut': - u'panda', - u'minRamCount': - 0, - u'jobDefinitionID': - 7932, - u'maxWalltime': - u'NULL', - u'scopeLog': - u'panda', - u'transformation': - u'Sim_tf.py', - u'maxDiskCount': - 0, - u'coreCount': - ncores, - u'prodDBlockToken': - u'NULL', - u'transferType': - u'NULL', - u'destinationDblock': - job_name, - u'dispatchDBlockToken': - u'NULL', - u'jobPars': ( - '--eventService=%s --skipEvents=0 --firstEvent=1 --preExec "from AthenaCommon.DetFlags ' - 'import DetFlags;DetFlags.ID_setOn();DetFlags.Calo_setOff();' + "logGUID": log_guid, + "cmtConfig": "x86_64-slc6-gcc49-opt", + "prodDBlocks": "user.mlassnig:user.mlassnig.pilot.test.single.hits", + "dispatchDBlockTokenForOut": "NULL,NULL", + "destinationDBlockToken": "NULL,NULL", + "destinationSE": panda_queue_name, + "realDatasets": job_name, + "prodUserID": "no_one", + "GUID": ",".join([f"{guid}{i}" for i in range(len(input_files))]), + "realDatasetsIn": "user.mlassnig:user.mlassnig.pilot.test.single.hits", + "nSent": 0, + "eventService": str(is_eventservice), + "cloud": "US", + "StatusCode": 0, + "homepackage": "AtlasOffline/21.0.15", + "inFiles": inFiles, + "processingType": "pilot-ptest", + "ddmEndPointOut": "UTA_SWT2_DATADISK,UTA_SWT2_DATADISK", + "fsize": "118612262", + "fileDestinationSE": f"{panda_queue_name},{panda_queue_name}", + "scopeOut": "panda", + "minRamCount": 0, + "jobDefinitionID": 7932, + "maxWalltime": "NULL", + "scopeLog": "panda", + "transformation": "Sim_tf.py", + "maxDiskCount": 0, + "coreCount": ncores, + "prodDBlockToken": "NULL", + "transferType": "NULL", + "destinationDblock": job_name, + "dispatchDBlockToken": "NULL", + "jobPars": ( + f"--eventService={is_eventservice!s} --skipEvents=0 --firstEvent=1 " + '--preExec "from AthenaCommon.DetFlags ' + "import 
DetFlags;DetFlags.ID_setOn();DetFlags.Calo_setOff();" 'DetFlags.Muon_setOff();DetFlags.Lucid_setOff();DetFlags.Truth_setOff() "' - '--athenaopts=--preloadlib=${ATLASMKLLIBDIR_PRELOAD}/libimf.so ' - '--preInclude sim:SimulationJobOptions/preInclude.FrozenShowersFCalOnly.py,SimulationJobOptions/preInclude.BeamPipeKill.py ' - '--geometryVersion ATLAS-R2-2016-01-00-00_VALIDATION --physicsList QGSP_BERT --randomSeed 1234 --conditionsTag OFLCOND-MC12-SIM-00 ' - '--maxEvents=-1 --inputEvgenFile %s --outputHitsFile HITS_%s.pool.root)' - % (str(is_eventservice), inFiles, outFilesShort)), - u'attemptNr': - 0, - u'swRelease': - u'Atlas-21.0.15', - u'nucleus': - u'NULL', - u'maxCpuCount': - 0, - u'outFiles': - outFiles, - u'currentPriority': - 1000, - u'scopeIn': - scope, - u'PandaID': - pandaID, - u'sourceSite': - u'NULL', - u'dispatchDblock': - u'NULL', - u'prodSourceLabel': - u'ptest', - u'checksum': - u'ad:5d000974', - u'jobName': - job_name, - u'ddmEndPointIn': - u'UTA_SWT2_DATADISK', - u'taskID': - taskId, - u'logFile': - u'%s.job.log.tgz' % job_name + "--athenaopts=--preloadlib=${ATLASMKLLIBDIR_PRELOAD}/libimf.so " + "--preInclude sim:SimulationJobOptions/preInclude.FrozenShowersFCalOnly.py," + "SimulationJobOptions/preInclude.BeamPipeKill.py " + "--geometryVersion ATLAS-R2-2016-01-00-00_VALIDATION --physicsList QGSP_BERT " + "--randomSeed 1234 --conditionsTag OFLCOND-MC12-SIM-00 " + f"--maxEvents=-1 --inputEvgenFile {inFiles} --outputHitsFile HITS_{outFilesShort}.pool.root)" + ), + "attemptNr": 0, + "swRelease": "Atlas-21.0.15", + "nucleus": "NULL", + "maxCpuCount": 0, + "outFiles": outFiles, + "currentPriority": 1000, + "scopeIn": scope, + "PandaID": pandaID, + "sourceSite": "NULL", + "dispatchDblock": "NULL", + "prodSourceLabel": "ptest", + "checksum": "ad:5d000974", + "jobName": job_name, + "ddmEndPointIn": "UTA_SWT2_DATADISK", + "taskID": taskId, + "logFile": f"{job_name}.job.log.tgz", } return res @@ -291,16 +250,16 @@ def sample_multijobs(request, 
input_output_file_list, is_eventservice, pandaids, def sample_job(is_eventservice, input_output_file_list, nhits_per_file, nevents_per_file): hash = hashlib.md5() (input_files, output_files) = input_output_file_list - hash.update(str(time.time()).encode('utf-8')) + hash.update(str(time.time()).encode("utf-8")) log_guid = hash.hexdigest() - hash.update(str(time.time()).encode('utf-8')) + hash.update(str(time.time()).encode("utf-8")) job_name = hash.hexdigest() - pandaID = '0' - jobsetId = '0' - taskId = '0' - ncores = '8' - guid = '0' + pandaID = "0" + jobsetId = "0" + taskId = "0" + ncores = "8" + guid = "0" scope = "13Mev" panda_queue_name = "pandaqueue" inFiles = ",".join(input_files) @@ -308,118 +267,72 @@ def sample_job(is_eventservice, input_output_file_list, nhits_per_file, nevents_ outFilesShort = f"[{','.join([str(i) for i in range(len(outFiles))])}]" return { pandaID: { - u'jobsetID': - jobsetId, - u'logGUID': - log_guid, - u'nEventsPerInputFile': nevents_per_file, - u'esmergeSpec': { + "jobsetID": jobsetId, + "logGUID": log_guid, + "nEventsPerInputFile": nevents_per_file, + "esmergeSpec": { "transPath": "", "jobParameters": "", - "nEventsPerOutputFile": nhits_per_file + "nEventsPerOutputFile": nhits_per_file, }, - u'cmtConfig': - u'x86_64-slc6-gcc49-opt', - u'prodDBlocks': - u'user.mlassnig:user.mlassnig.pilot.test.single.hits', - u'dispatchDBlockTokenForOut': - u'NULL,NULL', - u'destinationDBlockToken': - u'NULL,NULL', - u'destinationSE': - panda_queue_name, - u'realDatasets': - job_name, - u'prodUserID': - u'no_one', - u'GUID': - guid, - u'realDatasetsIn': - u'user.mlassnig:user.mlassnig.pilot.test.single.hits', - u'nSent': - 0, - u'eventService': - str(is_eventservice), - u'cloud': - u'US', - u'StatusCode': - 0, - u'homepackage': - u'AtlasOffline/21.0.15', - u'inFiles': - inFiles, - u'processingType': - u'pilot-ptest', - u'ddmEndPointOut': - u'UTA_SWT2_DATADISK,UTA_SWT2_DATADISK', - u'fsize': - u'118612262', - u'fileDestinationSE': - 
f"{panda_queue_name},{panda_queue_name}", - u'scopeOut': - u'panda', - u'minRamCount': - 0, - u'jobDefinitionID': - 7932, - u'maxWalltime': - u'NULL', - u'scopeLog': - u'panda', - u'transformation': - u'Sim_tf.py', - u'maxDiskCount': - 0, - u'coreCount': - ncores, - u'prodDBlockToken': - u'NULL', - u'transferType': - u'NULL', - u'destinationDblock': - job_name, - u'dispatchDBlockToken': - u'NULL', - u'jobPars': ( - '--eventService=%s --skipEvents=0 --firstEvent=1 --preExec "from AthenaCommon.DetFlags ' - 'import DetFlags;DetFlags.ID_setOn();DetFlags.Calo_setOff();' + "cmtConfig": "x86_64-slc6-gcc49-opt", + "prodDBlocks": "user.mlassnig:user.mlassnig.pilot.test.single.hits", + "dispatchDBlockTokenForOut": "NULL,NULL", + "destinationDBlockToken": "NULL,NULL", + "destinationSE": panda_queue_name, + "realDatasets": job_name, + "prodUserID": "no_one", + "GUID": guid, + "realDatasetsIn": "user.mlassnig:user.mlassnig.pilot.test.single.hits", + "nSent": 0, + "eventService": str(is_eventservice), + "cloud": "US", + "StatusCode": 0, + "homepackage": "AtlasOffline/21.0.15", + "inFiles": inFiles, + "processingType": "pilot-ptest", + "ddmEndPointOut": "UTA_SWT2_DATADISK,UTA_SWT2_DATADISK", + "fsize": "118612262", + "fileDestinationSE": f"{panda_queue_name},{panda_queue_name}", + "scopeOut": "panda", + "minRamCount": 0, + "jobDefinitionID": 7932, + "maxWalltime": "NULL", + "scopeLog": "panda", + "transformation": "Sim_tf.py", + "maxDiskCount": 0, + "coreCount": ncores, + "prodDBlockToken": "NULL", + "transferType": "NULL", + "destinationDblock": job_name, + "dispatchDBlockToken": "NULL", + "jobPars": ( + f"--eventService={is_eventservice!s} --skipEvents=0 --firstEvent=1 " + '--preExec "from AthenaCommon.DetFlags ' + "import DetFlags;DetFlags.ID_setOn();DetFlags.Calo_setOff();" 'DetFlags.Muon_setOff();DetFlags.Lucid_setOff();DetFlags.Truth_setOff() "' - '--athenaopts=--preloadlib=${ATLASMKLLIBDIR_PRELOAD}/libimf.so ' - '--preInclude 
sim:SimulationJobOptions/preInclude.FrozenShowersFCalOnly.py,SimulationJobOptions/preInclude.BeamPipeKill.py ' - '--geometryVersion ATLAS-R2-2016-01-00-00_VALIDATION --physicsList QGSP_BERT --randomSeed 1234 --conditionsTag OFLCOND-MC12-SIM-00 ' - '--maxEvents=-1 --inputEvgenFile %s --outputHitsFile HITS_%s.pool.root)' - % (str(is_eventservice), inFiles, outFilesShort)), - u'attemptNr': - 0, - u'swRelease': - u'Atlas-21.0.15', - u'nucleus': - u'NULL', - u'maxCpuCount': - 0, - u'outFiles': - outFiles, - u'currentPriority': - 1000, - u'scopeIn': - scope, - u'PandaID': - pandaID, - u'sourceSite': - u'NULL', - u'dispatchDblock': - u'NULL', - u'prodSourceLabel': - u'ptest', - u'checksum': - u'ad:5d000974', - u'jobName': - job_name, - u'ddmEndPointIn': - u'UTA_SWT2_DATADISK', - u'taskID': - taskId, - u'logFile': - u'%s.job.log.tgz' % job_name + "--athenaopts=--preloadlib=${ATLASMKLLIBDIR_PRELOAD}/libimf.so " + "--preInclude sim:SimulationJobOptions/preInclude.FrozenShowersFCalOnly.py," + "SimulationJobOptions/preInclude.BeamPipeKill.py " + "--geometryVersion ATLAS-R2-2016-01-00-00_VALIDATION --physicsList QGSP_BERT " + "--randomSeed 1234 --conditionsTag OFLCOND-MC12-SIM-00 " + f"--maxEvents=-1 --inputEvgenFile {inFiles} --outputHitsFile HITS_{outFilesShort}.pool.root)" + ), + "attemptNr": 0, + "swRelease": "Atlas-21.0.15", + "nucleus": "NULL", + "maxCpuCount": 0, + "outFiles": outFiles, + "currentPriority": 1000, + "scopeIn": scope, + "PandaID": pandaID, + "sourceSite": "NULL", + "dispatchDblock": "NULL", + "prodSourceLabel": "ptest", + "checksum": "ad:5d000974", + "jobName": job_name, + "ddmEndPointIn": "UTA_SWT2_DATADISK", + "taskID": taskId, + "logFile": f"{job_name}.job.log.tgz", } } diff --git a/tests/harvester/conftest.py b/tests/harvester/conftest.py index 52ac4c8..7941e8b 100644 --- a/tests/harvester/conftest.py +++ b/tests/harvester/conftest.py @@ -1,9 +1,9 @@ import os import queue - import pytest - -from raythena.drivers.communicators.harvesterFileMessenger 
import HarvesterFileCommunicator +from raythena.drivers.communicators.harvesterFileMessenger import ( + HarvesterFileCommunicator, +) from raythena.drivers.communicators.harvesterMock import HarvesterMock from raythena.drivers.communicators.harvesterMock2205 import HarvesterMock2205 @@ -37,14 +37,16 @@ def clean_files(files): @pytest.fixture -def harvester_file_communicator(tmpdir, config, request_queue, jobs_queue, - ranges_queue): - config.harvester['endpoint'] = str(tmpdir) - communicator = HarvesterFileCommunicator(request_queue, jobs_queue, - ranges_queue, config) +def harvester_file_communicator(tmpdir, config, request_queue, jobs_queue, ranges_queue): + config.harvester["endpoint"] = str(tmpdir) + communicator = HarvesterFileCommunicator(request_queue, jobs_queue, ranges_queue, config) yield communicator communicator.stop() - clean_files([ - communicator.jobrequestfile, communicator.jobspecfile, - communicator.eventrequestfile, communicator.eventrangesfile - ]) + clean_files( + [ + communicator.jobrequestfile, + communicator.jobspecfile, + communicator.eventrequestfile, + communicator.eventrangesfile, + ] + ) diff --git a/tests/harvester/test_harvesterFileMessenger.py b/tests/harvester/test_harvesterFileMessenger.py index f4f0033..ec8831d 100644 --- a/tests/harvester/test_harvesterFileMessenger.py +++ b/tests/harvester/test_harvesterFileMessenger.py @@ -1,22 +1,18 @@ import json import os import time - -from raythena.utils.eventservice import PandaJobRequest, EventRangeRequest +from raythena.utils.eventservice import EventRangeRequest, PandaJobRequest class TestHarvesterFileMessenger: - def check_job(self, jobs, sample_jobs): assert jobs is not None assert len(jobs) == len(sample_jobs) for sample_ID, jobID in zip(sample_jobs, jobs): assert sample_ID == jobID - def test_get_job(self, harvester_file_communicator, sample_job, - request_queue, jobs_queue): - - with open(harvester_file_communicator.jobspecfile, 'w') as f: + def test_get_job(self, 
harvester_file_communicator, sample_job, request_queue, jobs_queue): + with open(harvester_file_communicator.jobspecfile, "w") as f: json.dump(sample_job, f) harvester_file_communicator.start() @@ -24,15 +20,14 @@ def test_get_job(self, harvester_file_communicator, sample_job, job_communicator = jobs_queue.get(timeout=5) self.check_job(job_communicator, sample_job) - def test_get_job_request(self, harvester_file_communicator, sample_job, - request_queue, jobs_queue): + def test_get_job_request(self, harvester_file_communicator, sample_job, request_queue, jobs_queue): harvester_file_communicator.start() request_queue.put(PandaJobRequest()) while not os.path.exists(harvester_file_communicator.jobrequestfile): time.sleep(0.1) - with open(harvester_file_communicator.jobspecfile, 'w') as f: + with open(harvester_file_communicator.jobspecfile, "w") as f: json.dump(sample_job, f) jobs = jobs_queue.get(timeout=5) self.check_job(jobs, sample_job) @@ -47,7 +42,7 @@ def test_restart(self, harvester_file_communicator): assert ref_thread == harvester_file_communicator.communicator_thread harvester_file_communicator.stop() assert not harvester_file_communicator.communicator_thread.is_alive() - assert not ref_thread == harvester_file_communicator.communicator_thread + assert ref_thread != harvester_file_communicator.communicator_thread harvester_file_communicator.start() ref_thread = harvester_file_communicator.communicator_thread assert harvester_file_communicator.communicator_thread.is_alive() @@ -55,44 +50,48 @@ def test_restart(self, harvester_file_communicator): assert harvester_file_communicator.communicator_thread.is_alive() assert harvester_file_communicator.communicator_thread == ref_thread - def test_get_event_ranges(self, config, harvester_file_communicator, - request_queue, ranges_queue, sample_job): + def test_get_event_ranges( + self, + config, + harvester_file_communicator, + request_queue, + ranges_queue, + sample_job, + ): harvester_file_communicator.start() 
n_events = 3 evnt_request = EventRangeRequest() for pandaID, job in sample_job.items(): - evnt_request.add_event_request(pandaID, n_events, job['taskID'], - job['jobsetID']) + evnt_request.add_event_request(pandaID, n_events, job["taskID"], job["jobsetID"]) request_queue.put(evnt_request) while not os.path.isfile(harvester_file_communicator.eventrequestfile): time.sleep(0.01) ranges_res = {} - with open(harvester_file_communicator.eventrequestfile, 'r') as f: + with open(harvester_file_communicator.eventrequestfile) as f: communicator_request = json.load(f) - for pandaIDSent, pandaIDCom in zip(evnt_request, - communicator_request): + for pandaIDSent, pandaIDCom in zip(evnt_request, communicator_request): assert pandaIDSent == pandaIDCom - assert evnt_request[pandaIDSent][ - 'nRanges'] == communicator_request[pandaIDSent]['nRanges'] - ranges_res[pandaIDSent] = [{ - 'lastEvent': 0, - 'eventRangeID': "0", - 'startEvent': 0, - 'scope': "scope_value", - 'LFN': "/path/to/file", - 'GUID': "worker_id" - }] * n_events - with open(harvester_file_communicator.eventrangesfile, 'w') as f: + assert evnt_request[pandaIDSent]["nRanges"] == communicator_request[pandaIDSent]["nRanges"] + ranges_res[pandaIDSent] = [ + { + "lastEvent": 0, + "eventRangeID": "0", + "startEvent": 0, + "scope": "scope_value", + "LFN": "/path/to/file", + "GUID": "worker_id", + } + ] * n_events + with open(harvester_file_communicator.eventrangesfile, "w") as f: json.dump(ranges_res, f) ranges_com = ranges_queue.get(timeout=5) for pandaIDSent, pandaIDCom in zip(ranges_res, ranges_com): assert pandaIDSent == pandaIDCom - assert len(ranges_res[pandaIDSent]) == len( - ranges_com[pandaIDSent]) == n_events + assert len(ranges_res[pandaIDSent]) == len(ranges_com[pandaIDSent]) == n_events assert not os.path.isfile(harvester_file_communicator.eventrequestfile) assert not os.path.isfile(harvester_file_communicator.eventrangesfile) @@ -103,10 +102,9 @@ def test_get_event_ranges(self, config, 
harvester_file_communicator, ranges_res = {} for pandaID in evnt_request: ranges_res[pandaID] = [] - with open(harvester_file_communicator.eventrangesfile, 'w') as f: + with open(harvester_file_communicator.eventrangesfile, "w") as f: json.dump(ranges_res, f) ranges_com = ranges_queue.get(timeout=5) for pandaIDSent, pandaIDCom in zip(ranges_res, ranges_com): assert pandaIDSent == pandaIDCom - assert len(ranges_res[pandaIDSent]) == len( - ranges_com[pandaIDSent]) == 0 + assert len(ranges_res[pandaIDSent]) == len(ranges_com[pandaIDSent]) == 0 diff --git a/tests/harvester/test_harvesterMock.py b/tests/harvester/test_harvesterMock.py index fce89a6..0164908 100644 --- a/tests/harvester/test_harvesterMock.py +++ b/tests/harvester/test_harvesterMock.py @@ -2,15 +2,13 @@ class TestHarvesterMock: - def test_get_job(self, harvester_mock, request_queue, jobs_queue): harvester_mock.start() request_queue.put(PandaJobRequest()) job = jobs_queue.get(timeout=5) assert job is not None and isinstance(job, dict) - def test_get_ranges(self, harvester_mock, request_queue, jobs_queue, - ranges_queue): + def test_get_ranges(self, harvester_mock, request_queue, jobs_queue, ranges_queue): harvester_mock.start() request_queue.put(PandaJobRequest()) jobs = jobs_queue.get(timeout=5) @@ -18,13 +16,12 @@ def test_get_ranges(self, harvester_mock, request_queue, jobs_queue, n_events = harvester_mock.nevents evnt_request = EventRangeRequest() for pandaID, job in jobs.items(): - evnt_request.add_event_request(pandaID, n_events, job['taskID'], - job['jobsetID']) + evnt_request.add_event_request(pandaID, n_events, job["taskID"], job["jobsetID"]) request_queue.put(evnt_request) ranges = ranges_queue.get(timeout=5) assert ranges is not None assert isinstance(ranges, dict) - for pandaID, job_ranges in ranges.items(): + for _pandaID, job_ranges in ranges.items(): assert len(job_ranges) == n_events # should return 0 ranges per job @@ -32,5 +29,5 @@ def test_get_ranges(self, harvester_mock, request_queue, 
jobs_queue, ranges = ranges_queue.get(timeout=5) assert ranges is not None assert isinstance(ranges, dict) - for pandaID, job_ranges in ranges.items(): + for _pandaID, job_ranges in ranges.items(): assert len(job_ranges) == 0 diff --git a/tests/test_bookkeeper.py b/tests/test_bookkeeper.py index 705089e..fd903b4 100644 --- a/tests/test_bookkeeper.py +++ b/tests/test_bookkeeper.py @@ -1,11 +1,9 @@ import pytest - from raythena.utils.bookkeeper import BookKeeper @pytest.mark.usefixtures("requires_ray") class TestBookKeeper: - def test_add_jobs(self, is_eventservice, config, sample_multijobs, njobs): bookKeeper = BookKeeper(config) bookKeeper.output_dir = "dummy" @@ -15,8 +13,15 @@ def test_add_jobs(self, is_eventservice, config, sample_multijobs, njobs): for pandaID in bookKeeper.jobs: assert pandaID in sample_multijobs - def test_assign_job_to_actor(elf, is_eventservice, config, sample_multijobs, - njobs, sample_ranges, nevents): + def test_assign_job_to_actor( + elf, + is_eventservice, + config, + sample_multijobs, + njobs, + sample_ranges, + nevents, + ): bookKeeper = BookKeeper(config) bookKeeper.output_dir = "dummy" bookKeeper.merged_files_dir = "dummy" @@ -24,27 +29,33 @@ def test_assign_job_to_actor(elf, is_eventservice, config, sample_multijobs, actor_id = "a1" if not is_eventservice: job = None - for i in range(njobs): + for _ in range(njobs): job_tmp = bookKeeper.assign_job_to_actor(actor_id) if job: - assert job['PandaID'] != job_tmp['PandaID'] + assert job["PandaID"] != job_tmp["PandaID"] job = job_tmp assert not bookKeeper.has_jobs_ready() assert not bookKeeper.assign_job_to_actor(actor_id) else: bookKeeper.add_event_ranges(sample_ranges) job = None - for i in range(njobs): + for _ in range(njobs): job_tmp = bookKeeper.assign_job_to_actor(actor_id) if job: - assert job['PandaID'] == job_tmp['PandaID'] + assert job["PandaID"] == job_tmp["PandaID"] job = job_tmp bookKeeper.fetch_event_ranges(actor_id, nevents) - assert bookKeeper.assign_job_to_actor( - 
actor_id)['PandaID'] == job['PandaID'] - - def test_add_event_ranges(self, is_eventservice, config, sample_multijobs, - njobs, nevents, sample_ranges): + assert bookKeeper.assign_job_to_actor(actor_id)["PandaID"] == job["PandaID"] + + def test_add_event_ranges( + self, + is_eventservice, + config, + sample_multijobs, + njobs, + nevents, + sample_ranges, + ): if not is_eventservice: pytest.skip() @@ -59,8 +70,15 @@ def test_add_event_ranges(self, is_eventservice, config, sample_multijobs, print(bookKeeper.jobs[pandaID].event_ranges_queue.event_ranges_by_id) assert bookKeeper.n_ready(pandaID) == nevents - def test_fetch_event_ranges(self, is_eventservice, config, sample_multijobs, - njobs, nevents, sample_ranges): + def test_fetch_event_ranges( + self, + is_eventservice, + config, + sample_multijobs, + njobs, + nevents, + sample_ranges, + ): if not is_eventservice: pytest.skip() worker_ids = [f"w_{i}" for i in range(10)] @@ -74,19 +92,25 @@ def test_fetch_event_ranges(self, is_eventservice, config, sample_multijobs, for wid in worker_ids: assert not bookKeeper.fetch_event_ranges(wid, 100) - assigned_workers = worker_ids[:int(len(worker_ids) / 2)] + assigned_workers = worker_ids[: int(len(worker_ids) / 2)] for wid in assigned_workers: job = bookKeeper.assign_job_to_actor(wid) - assert job['PandaID'] in sample_multijobs - ranges = bookKeeper.fetch_event_ranges( - wid, int(nevents / len(assigned_workers))) + assert job["PandaID"] in sample_multijobs + ranges = bookKeeper.fetch_event_ranges(wid, int(nevents / len(assigned_workers))) assert ranges assert not bookKeeper.fetch_event_ranges(wid[0], 1) - def test_process_event_ranges_update(self, is_eventservice, config, - sample_multijobs, njobs, nevents, - sample_ranges, sample_rangeupdate, - sample_failed_rangeupdate): + def test_process_event_ranges_update( + self, + is_eventservice, + config, + sample_multijobs, + njobs, + nevents, + sample_ranges, + sample_rangeupdate, + sample_failed_rangeupdate, + ): if not 
is_eventservice: pytest.skip("No eventservice jobs") @@ -98,7 +122,7 @@ def __inner__(range_update, failed=False): bookKeeper.merged_files_dir = "dummy" bookKeeper.add_jobs(sample_multijobs, False) - for i in range(njobs): + for _ in range(njobs): job = bookKeeper.assign_job_to_actor(actor_id) _ = bookKeeper.fetch_event_ranges(actor_id, nevents) print(job.event_ranges_queue.rangesID_by_state) @@ -108,9 +132,10 @@ def __inner__(range_update, failed=False): assert job.event_ranges_queue.nranges_failed() == nevents else: assert job.event_ranges_queue.nranges_done() == nevents - assert not bookKeeper.is_flagged_no_more_events(job['PandaID']) + assert not bookKeeper.is_flagged_no_more_events(job["PandaID"]) assert bookKeeper.assign_job_to_actor(actor_id) + __inner__(sample_rangeupdate) __inner__(sample_failed_rangeupdate, True) @@ -129,9 +154,9 @@ def __inner__(range_update, failed=False): assert job.event_ranges_queue.nranges_failed() == nevents assert not bookKeeper.rangesID_by_actor[actor_id] - n_success = len(sample_rangeupdate[0]['eventRanges']) // 2 - sample_rangeupdate[0]['eventRanges'] = sample_rangeupdate[0]['eventRanges'][:n_success] - bookKeeper.process_event_ranges_update(actor_id, sample_rangeupdate[0]['eventRanges']) + n_success = len(sample_rangeupdate[0]["eventRanges"]) // 2 + sample_rangeupdate[0]["eventRanges"] = sample_rangeupdate[0]["eventRanges"][:n_success] + bookKeeper.process_event_ranges_update(actor_id, sample_rangeupdate[0]["eventRanges"]) assert not bookKeeper.rangesID_by_actor[actor_id] assert job.event_ranges_queue.nranges_done() == n_success @@ -142,11 +167,18 @@ def __inner__(range_update, failed=False): assert job.event_ranges_queue.nranges_done() == n_success print(job.event_ranges_queue.rangesID_by_state) print(bookKeeper.rangesID_by_actor) - assert not bookKeeper.is_flagged_no_more_events(job['PandaID']) + assert not bookKeeper.is_flagged_no_more_events(job["PandaID"]) assert bookKeeper.assign_job_to_actor(actor_id) - def 
test_process_actor_end(self, is_eventservice, config, njobs, - sample_multijobs, nevents, sample_ranges): + def test_process_actor_end( + self, + is_eventservice, + config, + njobs, + sample_multijobs, + nevents, + sample_ranges, + ): if not is_eventservice: pytest.skip("No eventservice jobs") @@ -159,7 +191,7 @@ def test_process_actor_end(self, is_eventservice, config, njobs, bookKeeper.add_jobs(sample_multijobs, False) job = bookKeeper.assign_job_to_actor(actor_id_1) - pandaID = job['PandaID'] + pandaID = job["PandaID"] assert bookKeeper.n_ready(pandaID) == nevents bookKeeper.process_actor_end(actor_id_1) @@ -167,15 +199,15 @@ def test_process_actor_end(self, is_eventservice, config, njobs, job = bookKeeper.assign_job_to_actor(actor_id_1) job_2 = bookKeeper.assign_job_to_actor(actor_id_2) - assert job_2['PandaID'] == job['PandaID'] == pandaID + assert job_2["PandaID"] == job["PandaID"] == pandaID ranges_1 = bookKeeper.fetch_event_ranges(actor_id_1, nevents) assert len(ranges_1) == nevents ranges_2 = bookKeeper.fetch_event_ranges(actor_id_2, nevents) assert len(ranges_2) == bookKeeper.n_ready(pandaID) == 0 - assert bookKeeper.assign_job_to_actor(actor_id_2)['PandaID'] == pandaID + assert bookKeeper.assign_job_to_actor(actor_id_2)["PandaID"] == pandaID bookKeeper.process_actor_end(actor_id_1) assert bookKeeper.n_ready(pandaID) == nevents - assert bookKeeper.assign_job_to_actor(actor_id_1)['PandaID'] == pandaID + assert bookKeeper.assign_job_to_actor(actor_id_1)["PandaID"] == pandaID diff --git a/tests/test_config.py b/tests/test_config.py index 9aa45f7..e87049d 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,4 +1,3 @@ class TestConfig: - def test_config(self, config): pass diff --git a/tests/test_driver.py b/tests/test_driver.py index ce4efcc..3b7ef7e 100644 --- a/tests/test_driver.py +++ b/tests/test_driver.py @@ -1,5 +1,4 @@ class TestDriver: - def test_one(self, tmpdir): assert True diff --git a/tests/test_eventservice.py 
b/tests/test_eventservice.py index 383896d..5244ffc 100644 --- a/tests/test_eventservice.py +++ b/tests/test_eventservice.py @@ -1,88 +1,100 @@ import pytest - -from raythena.utils.eventservice import EventRange, EventRangeQueue, EventRangeRequest, EventRangeUpdate -from raythena.utils.eventservice import PandaJob, PandaJobQueue, PandaJobRequest, PandaJobUpdate +from raythena.utils.eventservice import ( + EventRange, + EventRangeQueue, + EventRangeRequest, + EventRangeUpdate, + PandaJob, + PandaJobQueue, + PandaJobRequest, + PandaJobUpdate, +) class TestEventRangeRequest: - def test_from_dict_init(self): request_dict = { "0": { "nRanges": 10, "pandaID": "0", "taskID": "0", - "jobsetID": "0" + "jobsetID": "0", }, "1": { "nRanges": 20, "pandaID": "1", "taskID": "1", - "jobsetID": "1" - } + "jobsetID": "1", + }, } ranges_request = EventRangeRequest.build_from_dict(request_dict) ranges_request_init = EventRangeRequest() for pandaID, req in request_dict.items(): - ranges_request_init.add_event_request(pandaID, req['nRanges'], - req['taskID'], - req['jobsetID']) - assert len(request_dict) == len(ranges_request) == len( - ranges_request_init) - for id1, id2, id3 in zip(ranges_request, ranges_request_init, - request_dict): - assert ranges_request[id1]['pandaID'] == ranges_request_init[id2][ - 'pandaID'] == request_dict[id3]['pandaID'] + ranges_request_init.add_event_request(pandaID, req["nRanges"], req["taskID"], req["jobsetID"]) + assert len(request_dict) == len(ranges_request) == len(ranges_request_init) + for id1, id2, id3 in zip(ranges_request, ranges_request_init, request_dict): + assert ranges_request[id1]["pandaID"] == ranges_request_init[id2]["pandaID"] == request_dict[id3]["pandaID"] class TestEventRangeUpdate: - - def test_build_range_update(self, nevents, sample_rangeupdate, - sample_failed_rangeupdate): + def test_build_range_update(self, nevents, sample_rangeupdate, sample_failed_rangeupdate): pandaID = "0" - ranges_update = EventRangeUpdate.build_from_dict( - 
pandaID, sample_rangeupdate) + ranges_update = EventRangeUpdate.build_from_dict(pandaID, sample_rangeupdate) assert pandaID in ranges_update ranges = ranges_update[pandaID] assert len(ranges) == nevents assert len(ranges_update) == 1 for r in ranges: - assert "eventRangeID" in r and "eventStatus" in r and "path" in r and "type" in r and "chksum" in r and "fsize" in r - - ranges_update = EventRangeUpdate.build_from_dict( - pandaID, sample_failed_rangeupdate) + assert ( + "eventRangeID" in r + and "eventStatus" in r + and "path" in r + and "type" in r + and "chksum" in r + and "fsize" in r + ) + + ranges_update = EventRangeUpdate.build_from_dict(pandaID, sample_failed_rangeupdate) assert pandaID in ranges_update ranges = ranges_update[pandaID] assert len(ranges) == nevents for r in ranges: - assert "eventRangeID" in r and "eventStatus" in r and \ - "path" not in r and "type" not in r and "chksum" not in r and "fsize" not in r - - with pytest.raises(Exception): + assert ( + "eventRangeID" in r + and "eventStatus" in r + and "path" not in r + and "type" not in r + and "chksum" not in r + and "fsize" not in r + ) + + with pytest.raises(ValueError): ranges_update.range_update[pandaID] = None EventRangeUpdate(ranges_update.range_update) - with pytest.raises(Exception): + with pytest.raises(ValueError): ranges_update[pandaID] = None ranges_update[pandaID] = [] assert not ranges_update[pandaID] class TestEventRangeQueue: - def test_new(self, nevents, sample_job, sample_ranges): ranges_queue = EventRangeQueue() assert len(ranges_queue) == 0 - ranges = list(sample_ranges.values())[0] + ranges = next(iter(sample_ranges.values())) ranges_queue = EventRangeQueue.build_from_list(ranges) - assert len(ranges) == len( - ranges_queue) == ranges_queue.nranges_available() ==\ - ranges_queue.nranges_remaining() == nevents - assert ranges_queue.nranges_assigned() == ranges_queue.nranges_done() ==\ - ranges_queue.nranges_failed() == 0 - - with pytest.raises(Exception): + assert ( + 
len(ranges) + == len(ranges_queue) + == ranges_queue.nranges_available() + == ranges_queue.nranges_remaining() + == nevents + ) + assert ranges_queue.nranges_assigned() == ranges_queue.nranges_done() == ranges_queue.nranges_failed() == 0 + + with pytest.raises(ValueError): ranges_queue["key"] = None ranges_queue_2 = EventRangeQueue() @@ -92,30 +104,35 @@ def test_new(self, nevents, sample_job, sample_ranges): def test_concat(self, nevents, sample_job, sample_ranges): ranges_queue = EventRangeQueue() - ranges = list(sample_ranges.values())[0] + ranges = next(iter(sample_ranges.values())) ranges_queue.concat(ranges) - assert len(ranges) == len(ranges_queue) ==\ - ranges_queue.nranges_available() ==\ - ranges_queue.nranges_remaining() == nevents - assert ranges_queue.nranges_assigned() ==\ - ranges_queue.nranges_done() ==\ - ranges_queue.nranges_failed() == 0 - assert ranges_queue[ - ranges[0]['eventRangeID']].eventRangeID == ranges[0]['eventRangeID'] + assert ( + len(ranges) + == len(ranges_queue) + == ranges_queue.nranges_available() + == ranges_queue.nranges_remaining() + == nevents + ) + assert ranges_queue.nranges_assigned() == ranges_queue.nranges_done() == ranges_queue.nranges_failed() == 0 + assert ranges_queue[ranges[0]["eventRangeID"]].eventRangeID == ranges[0]["eventRangeID"] for r in ranges: - assert r['eventRangeID'] in ranges_queue - - def test_update(self, sample_job, sample_ranges, nevents, - sample_rangeupdate, sample_failed_rangeupdate): + assert r["eventRangeID"] in ranges_queue + + def test_update( + self, + sample_job, + sample_ranges, + nevents, + sample_rangeupdate, + sample_failed_rangeupdate, + ): pandaID = "0" - ranges = list(sample_ranges.values())[0] + ranges = next(iter(sample_ranges.values())) ranges_queue = EventRangeQueue.build_from_list(ranges) nsuccess = int(nevents / 2) - ranges_update = EventRangeUpdate.build_from_dict( - pandaID, sample_rangeupdate)[pandaID][:nsuccess] - failed_ranges_update = EventRangeUpdate.build_from_dict( - 
pandaID, sample_failed_rangeupdate)[pandaID][nsuccess:] + ranges_update = EventRangeUpdate.build_from_dict(pandaID, sample_rangeupdate)[pandaID][:nsuccess] + failed_ranges_update = EventRangeUpdate.build_from_dict(pandaID, sample_failed_rangeupdate)[pandaID][nsuccess:] ranges_queue.get_next_ranges(nevents) ranges_queue.update_ranges(ranges_update) @@ -129,13 +146,13 @@ def test_update(self, sample_job, sample_ranges, nevents, assert ranges_queue.nranges_assigned() == 0 assert ranges_queue.nranges_remaining() == 0 - with pytest.raises(Exception): + with pytest.raises(KeyError): ranges_queue.update_range_state("unknown", EventRange.ASSIGNED) def test_get_next(self, sample_job, sample_ranges): ranges_queue = EventRangeQueue() assert not ranges_queue.get_next_ranges(10) - ranges = list(sample_ranges.values())[0] + ranges = next(iter(sample_ranges.values())) ranges_queue.concat(ranges) nranges = len(ranges_queue) nranges_requested = max(1, int(nranges / 3)) @@ -145,19 +162,16 @@ def test_get_next(self, sample_job, sample_ranges): assert ranges_queue.nranges_remaining() == nranges assert ranges_queue.nranges_available() == nranges - nranges_requested for requested_range in requested_ranges: - assert ranges_queue[ - requested_range.eventRangeID].status == EventRange.ASSIGNED + assert ranges_queue[requested_range.eventRangeID].status == EventRange.ASSIGNED requested_ranges = ranges_queue.get_next_ranges(nranges) assert len(requested_ranges) == nranges - nranges_requested assert ranges_queue.nranges_available() == 0 - assert ranges_queue.nranges_assigned( - ) == ranges_queue.nranges_remaining() == nranges + assert ranges_queue.nranges_assigned() == ranges_queue.nranges_remaining() == nranges assert len(ranges_queue.get_next_ranges(1)) == 0 class TestEventRanges: - def test_new(self): id = "Range-0" start = 0 @@ -183,22 +197,25 @@ def test_build_from_dict(self): "lastEvent": last, "startEvent": start, "GUID": guid, - "scope": scope + "scope": scope, } range_from_dict = 
EventRange.build_from_dict(r_dict) - assert range_from_dict.PFN == pfn and range_from_dict.eventRangeID == id and range_from_dict.startEvent == start \ - and range_from_dict.lastEvent == last and range_from_dict.GUID == guid and range_from_dict.scope == scope + assert ( + pfn == range_from_dict.PFN + and range_from_dict.eventRangeID == id + and range_from_dict.startEvent == start + and range_from_dict.lastEvent == last + and guid == range_from_dict.GUID + and range_from_dict.scope == scope + ) assert range_from_dict.status == EventRange.READY class TestPandaJobQueue: - - def test_build_pandajob_queue(self, is_eventservice, njobs, - sample_multijobs): + def test_build_pandajob_queue(self, is_eventservice, njobs, sample_multijobs): assert len(sample_multijobs) == njobs pandajob_queue = PandaJobQueue() - pandajob_queue_fromdict = PandaJobQueue.build_from_dict( - sample_multijobs) + pandajob_queue_fromdict = PandaJobQueue.build_from_dict(sample_multijobs) assert len(pandajob_queue) == 0 assert not pandajob_queue.next_job_to_process() @@ -208,9 +225,9 @@ def test_build_pandajob_queue(self, is_eventservice, njobs, if is_eventservice: assert job else: - for i in range(1, njobs): + for _ in range(1, njobs): next_job = pandajob_queue.next_job_to_process() - assert job['PandaID'] != next_job['PandaID'] + assert job["PandaID"] != next_job["PandaID"] job = next_job for pandaID in pandajob_queue: @@ -218,7 +235,7 @@ def test_build_pandajob_queue(self, is_eventservice, njobs, assert isinstance(event_ranges, EventRangeQueue) assert len(event_ranges) == 0 assert pandajob_queue.has_job(pandaID) - with pytest.raises(Exception): + with pytest.raises(ValueError): pandajob_queue[pandaID] = None pandajob_queue_2 = PandaJobQueue() @@ -227,9 +244,7 @@ def test_build_pandajob_queue(self, is_eventservice, njobs, pandajob_queue_2["key"] = job assert "key" in pandajob_queue_2 - def test_pandajob_process_event_ranges_reply(self, is_eventservice, njobs, - sample_multijobs, - sample_ranges): + 
def test_pandajob_process_event_ranges_reply(self, is_eventservice, njobs, sample_multijobs, sample_ranges): if not is_eventservice: pytest.skip("Not eventservice jobs") pandajob_queue = PandaJobQueue(sample_multijobs) @@ -237,7 +252,7 @@ def test_pandajob_process_event_ranges_reply(self, is_eventservice, njobs, pandajob_queue.process_event_ranges_reply(sample_ranges) job = pandajob_queue.next_job_to_process() - assert job['PandaID'] in sample_ranges + assert job["PandaID"] in sample_ranges for pandaID in pandajob_queue: ranges = pandajob_queue.get_event_ranges(pandaID) @@ -254,9 +269,15 @@ def test_pandajob_process_event_ranges_reply(self, is_eventservice, njobs, pandajob_queue.process_event_ranges_reply(sample_ranges) assert "key" not in pandajob_queue - def test_process_event_ranges_update(self, is_eventservice, njobs, nevents, - sample_multijobs, sample_ranges, - sample_rangeupdate): + def test_process_event_ranges_update( + self, + is_eventservice, + njobs, + nevents, + sample_multijobs, + sample_ranges, + sample_rangeupdate, + ): if not is_eventservice: pytest.skip("Not eventservice jobs") pandajob_queue = PandaJobQueue(sample_multijobs) @@ -265,35 +286,31 @@ def test_process_event_ranges_update(self, is_eventservice, njobs, nevents, job = pandajob_queue.next_job_to_process() assert job == pandajob_queue.next_job_to_process() - ranges_update = EventRangeUpdate.build_from_dict( - job['PandaID'], sample_rangeupdate) + ranges_update = EventRangeUpdate.build_from_dict(job["PandaID"], sample_rangeupdate) - ranges_queue = pandajob_queue.get_event_ranges(job['PandaID']) + ranges_queue = pandajob_queue.get_event_ranges(job["PandaID"]) _ = job.get_next_ranges(nevents) pandajob_queue.process_event_ranges_update(ranges_update) assert not job.no_more_ranges assert ranges_queue.nranges_done() == nevents - assert ranges_queue.nranges_remaining( - ) == ranges_queue.nranges_available() == 0 + assert ranges_queue.nranges_remaining() == ranges_queue.nranges_available() == 0 
job_2 = pandajob_queue.next_job_to_process() - assert job['PandaID'] == job_2['PandaID'] + assert job["PandaID"] == job_2["PandaID"] class TestPandaJob: - def test_build_pandajob(self, sample_job): - job_dict = list(sample_job.values())[0] + job_dict = next(iter(sample_job.values())) job = PandaJob(job_dict) for k in job_dict: assert k in job assert job_dict[k] == job[k] - assert job.get_id() == list(sample_job.keys())[0] + assert job.get_id() == next(iter(sample_job.keys())) job["k"] = "v" assert job["k"] == "v" class TestPandaJobRequest: - def test_build_pandajob_request(self): request_dict = { "node": "nodename", @@ -305,31 +322,29 @@ def test_build_pandajob_request(self): "resource_type": "rt", "mem": 230000, "cpu": 32, - "allow_other_country": "false" + "allow_other_country": "false", } jobrequest = PandaJobRequest(**request_dict) - assert jobrequest.diskSpace == request_dict['disk_space'] - assert jobrequest.mem == request_dict['mem'] - assert jobrequest.allowOtherCountry == request_dict[ - 'allow_other_country'] + assert jobrequest.diskSpace == request_dict["disk_space"] + assert jobrequest.mem == request_dict["mem"] + assert jobrequest.allowOtherCountry == request_dict["allow_other_country"] class TestPandaJobUpdate: - def test_build_pandajob_update(self): update_dict = { - 'node': ['nid00038'], - 'startTime': ['1574112042.86'], - 'jobMetrics': ['coreCount=32'], - 'siteName': ['NERSC_Cori_p2_ES'], - 'timestamp': ['2019-11-18T13:20:45-08:00'], - 'coreCount': ['32'], - 'attemptNr': ['0'], - 'jobId': ['7a75654803d17d54f9129e2a6974beda'], - 'batchID': ['25932742'], - 'state': ['starting'], - 'schedulerID': ['unknown'], - 'pilotID': ['unknown|SLURM|PR|2.2.2 (1)'] + "node": ["nid00038"], + "startTime": ["1574112042.86"], + "jobMetrics": ["coreCount=32"], + "siteName": ["NERSC_Cori_p2_ES"], + "timestamp": ["2019-11-18T13:20:45-08:00"], + "coreCount": ["32"], + "attemptNr": ["0"], + "jobId": ["7a75654803d17d54f9129e2a6974beda"], + "batchID": ["25932742"], + 
"state": ["starting"], + "schedulerID": ["unknown"], + "pilotID": ["unknown|SLURM|PR|2.2.2 (1)"], } jobupdate = PandaJobUpdate(**update_dict) for k in update_dict: diff --git a/tests/test_importutils.py b/tests/test_importutils.py index 8c614dc..85131ce 100644 --- a/tests/test_importutils.py +++ b/tests/test_importutils.py @@ -1,12 +1,13 @@ import pytest - from raythena.utils.importUtils import import_from_string def test_importutils(): errors_string = [ - "unknown", "unknown:unknown", "unknown:", - "raythena.drivers.esdriver:ESDriver.unknown" + "unknown", + "unknown:unknown", + "unknown:", + "raythena.drivers.esdriver:ESDriver.unknown", ] for s in errors_string: with pytest.raises(ImportError): @@ -15,4 +16,5 @@ def test_importutils(): with pytest.raises(ValueError): import_from_string(":unknown") from raythena.drivers.esdriver import ESDriver + assert import_from_string("raythena.drivers.esdriver:ESDriver") == ESDriver diff --git a/tests/test_pilothttp.py b/tests/test_pilothttp.py index 4fddde6..60b974d 100644 --- a/tests/test_pilothttp.py +++ b/tests/test_pilothttp.py @@ -1,15 +1,12 @@ import os import time - import pytest import requests - from raythena.actors.payloads.eventservice.pilothttp import PilotHttpPayload -from raythena.utils.eventservice import PandaJob, EventRange +from raythena.utils.eventservice import EventRange, PandaJob class MockPopen: - def __init__(self, returncode): self.returncode = returncode @@ -26,18 +23,16 @@ def terminate(self): class MockPayload(PilotHttpPayload): - def _start_payload(self): self.pilot_process = MockPopen(None) @pytest.mark.usefixtures("requires_ray") class TestPilotHttp: - def wait_server_start(self): while True: try: - requests.post('http://127.0.0.1:8080') + requests.post("http://127.0.0.1:8080") except requests.exceptions.ConnectionError: time.sleep(0.5) else: @@ -49,9 +44,9 @@ def setup_payload(self, config): @pytest.fixture def payload(self, tmpdir, config, sample_job): cwd = os.getcwd() - config.ray['workdir'] 
= str(tmpdir) + config.ray["workdir"] = str(tmpdir) os.chdir(tmpdir) - job_dict = list(sample_job.values())[0] + job_dict = next(iter(sample_job.values())) job = PandaJob(job_dict) payload = self.setup_payload(config) payload.start(job) @@ -63,20 +58,19 @@ def payload(self, tmpdir, config, sample_job): def test_getjob(self, payload, is_eventservice, config, sample_job): if not is_eventservice: pytest.skip() - job_dict = list(sample_job.values())[0] + job_dict = next(iter(sample_job.values())) job = PandaJob(job_dict) - res = requests.post('http://127.0.0.1:8080/server/panda/getJob').json() - assert job['PandaID'] == PandaJob(res)['PandaID'] + res = requests.post("http://127.0.0.1:8080/server/panda/getJob").json() + assert job["PandaID"] == PandaJob(res)["PandaID"] - assert requests.post( - 'http://127.0.0.1:8080/unknown').json()['StatusCode'] == 500 + assert requests.post("http://127.0.0.1:8080/unknown").json()["StatusCode"] == 500 payload.stop() assert payload.is_complete() assert payload.return_code() == payload.pilot_process.returncode def endpoint_not_implemented(self, endpoint): - assert requests.post(f'http://127.0.0.1:8080/server/panda/{endpoint}').json()['StatusCode'] == 500 + assert requests.post(f"http://127.0.0.1:8080/server/panda/{endpoint}").json()["StatusCode"] == 500 @pytest.mark.usefixtures("payload") def test_updateJobsInBulk(self): @@ -95,61 +89,67 @@ def test_jobUpdate(self, payload, config, is_eventservice): pytest.skip() assert not payload.fetch_job_update() - data = {"pilotErrorCode": '0'} - res = requests.post('http://127.0.0.1:8080/server/panda/updateJob', - data=data).json() - assert res['StatusCode'] == 0 + data = {"pilotErrorCode": "0"} + res = requests.post("http://127.0.0.1:8080/server/panda/updateJob", data=data).json() + assert res["StatusCode"] == 0 # Disabled as job update are currently not forwarded to the driver # job_update = payload.fetch_job_update() # assert job_update['pilotErrorCode'][0] == data['pilotErrorCode'] - def 
test_rangesUpdate(self, payload, config, is_eventservice, sample_job, - sample_ranges, nevents): + def test_rangesUpdate( + self, + payload, + config, + is_eventservice, + sample_job, + sample_ranges, + nevents, + ): if not is_eventservice: pytest.skip() assert not payload.fetch_ranges_update() data = {"pilotErrorCode": 0} - res = requests.post( - 'http://127.0.0.1:8080/server/panda/updateEventRanges', - data=data).json() - assert res['StatusCode'] == 0 - - def test_getranges(self, payload, config, is_eventservice, sample_job, - sample_ranges, nevents): + res = requests.post("http://127.0.0.1:8080/server/panda/updateEventRanges", data=data).json() + assert res["StatusCode"] == 0 + + def test_getranges( + self, + payload, + config, + is_eventservice, + sample_job, + sample_ranges, + nevents, + ): if not is_eventservice: pytest.skip() - job_dict = list(sample_job.values())[0] + job_dict = next(iter(sample_job.values())) job = PandaJob(job_dict) data = { "pandaID": job["PandaID"], "nRanges": nevents, "jobsetID": job["jobsetID"], - "taskID": job["taskID"] + "taskID": job["taskID"], } - res = requests.post( - 'http://127.0.0.1:8080/server/panda/getEventRanges').json() - assert res['StatusCode'] == 500 + res = requests.post("http://127.0.0.1:8080/server/panda/getEventRanges").json() + assert res["StatusCode"] == 500 assert payload.should_request_more_ranges() ranges = list() - for r in list(sample_ranges.values())[0]: + for r in next(iter(sample_ranges.values())): ranges.append(EventRange.build_from_dict(r)) payload.submit_new_ranges(ranges) payload.submit_new_ranges(None) - res = requests.post('http://127.0.0.1:8080/server/panda/getEventRanges', - data=data).json() - assert res['StatusCode'] == 0 - assert len(res['eventRanges']) == nevents + res = requests.post("http://127.0.0.1:8080/server/panda/getEventRanges", data=data).json() + assert res["StatusCode"] == 0 + assert len(res["eventRanges"]) == nevents - res = 
requests.post('http://127.0.0.1:8080/server/panda/getEventRanges', - data=data).json() - assert res['StatusCode'] == 0 - assert len(res['eventRanges']) == 0 + res = requests.post("http://127.0.0.1:8080/server/panda/getEventRanges", data=data).json() + assert res["StatusCode"] == 0 + assert len(res["eventRanges"]) == 0 assert not payload.should_request_more_ranges() data["pandaID"] = "None" - assert requests.post( - 'http://127.0.0.1:8080/server/panda/getEventRanges', - data=data).json()['StatusCode'] == -1 + assert requests.post("http://127.0.0.1:8080/server/panda/getEventRanges", data=data).json()["StatusCode"] == -1 diff --git a/tests/test_ray_utils.py b/tests/test_ray_utils.py index 8c72048..3c70875 100644 --- a/tests/test_ray_utils.py +++ b/tests/test_ray_utils.py @@ -1,17 +1,17 @@ import socket - import pytest - -from raythena.utils.ray import cluster_size, build_nodes_resource_list, get_node_ip +from raythena.utils.ray import ( + build_nodes_resource_list, + cluster_size, + get_node_ip, +) @pytest.mark.usefixtures("requires_ray") class TestRayUtils: - def test_build_nodes_resource_list(self, config): constraints = build_nodes_resource_list(config) - assert len( - constraints) == cluster_size() + assert len(constraints) == cluster_size() def test_cluster_size(self): assert cluster_size() > 0 diff --git a/tests/test_taskstatus.py b/tests/test_taskstatus.py index 0d1bd53..0a9aca5 100644 --- a/tests/test_taskstatus.py +++ b/tests/test_taskstatus.py @@ -3,14 +3,13 @@ class TestTaskStatus: - def test_save_restore_status(self, nfiles, tmp_path, config, sample_job, sample_ranges): config.ray["outputdir"] = tmp_path - job = PandaJob(list(sample_job.values())[0]) + job = PandaJob(next(iter(sample_job.values()))) ts = TaskStatus(job, tmp_path, config) - ranges = list(sample_ranges.values())[0] - hits_per_file = int(job['esmergeSpec']['nEventsPerOutputFile']) - events_per_file = int(job['nEventsPerInputFile']) + ranges = next(iter(sample_ranges.values())) + hits_per_file 
= int(job["esmergeSpec"]["nEventsPerOutputFile"]) + events_per_file = int(job["nEventsPerInputFile"]) assert events_per_file % hits_per_file == 0 n_output_per_input_file = events_per_file // hits_per_file offset = nfiles @@ -36,10 +35,10 @@ def test_save_restore_status(self, nfiles, tmp_path, config, sample_job, sample_ def test_set_processed(self, nfiles, nevents, tmp_path, config, sample_job, sample_ranges): config.ray["outputdir"] = tmp_path - job = PandaJob(list(sample_job.values())[0]) + job = PandaJob(next(iter(sample_job.values()))) ts = TaskStatus(job, tmp_path, config) - ranges_list = list(sample_ranges.values())[0] + ranges_list = next(iter(sample_ranges.values())) for r in ranges_list: ts.set_eventrange_simulated(EventRange.build_from_dict(r), "outputfile") @@ -51,10 +50,10 @@ def test_set_processed(self, nfiles, nevents, tmp_path, config, sample_job, samp def test_set_failed(self, nfiles, nevents, tmp_path, config, sample_job, sample_ranges): config.ray["outputdir"] = tmp_path - job = PandaJob(list(sample_job.values())[0]) + job = PandaJob(next(iter(sample_job.values()))) ts = TaskStatus(job, tmp_path, config) - ranges_list = list(sample_ranges.values())[0] + ranges_list = next(iter(sample_ranges.values())) for r in ranges_list: ts.set_eventrange_failed(EventRange.build_from_dict(r)) @@ -66,16 +65,16 @@ def test_set_failed(self, nfiles, nevents, tmp_path, config, sample_job, sample_ def test_set_merged(self, nfiles, nevents, tmp_path, config, sample_job, sample_ranges): config.ray["outputdir"] = tmp_path - job = PandaJob(list(sample_job.values())[0]) + job = PandaJob(next(iter(sample_job.values()))) ts = TaskStatus(job, tmp_path, config) - ranges = list(sample_ranges.values())[0] + ranges = next(iter(sample_ranges.values())) for e in ranges: er = EventRange.build_from_dict(e) ts.set_eventrange_simulated(er, f"outputfile-{er.eventRangeID}") - hits_per_file = int(job['esmergeSpec']['nEventsPerOutputFile']) - events_per_file = 
int(job['nEventsPerInputFile']) + hits_per_file = int(job["esmergeSpec"]["nEventsPerOutputFile"]) + events_per_file = int(job["nEventsPerInputFile"]) assert events_per_file % hits_per_file == 0 n_output_per_input_file = events_per_file // hits_per_file offset = nfiles @@ -90,7 +89,9 @@ def test_set_merged(self, nfiles, nevents, tmp_path, config, sample_job, sample_ ranges_map = {} for r in ranges_list: event_range = EventRange.build_from_dict(r) - ranges_map[event_range.eventRangeID] = TaskStatus.build_eventrange_dict(event_range, f"outputfile-{event_range.eventRangeID}") + ranges_map[event_range.eventRangeID] = TaskStatus.build_eventrange_dict( + event_range, f"outputfile-{event_range.eventRangeID}" + ) ts.set_file_merged([fname], outputfile, ranges_map, "guid") ts.save_status() @@ -110,7 +111,9 @@ def test_set_merged(self, nfiles, nevents, tmp_path, config, sample_job, sample_ ranges_map = {} for r in ranges_list: event_range = EventRange.build_from_dict(r) - ranges_map[event_range.eventRangeID] = TaskStatus.build_eventrange_dict(event_range, f"outputfile-{event_range.eventRangeID}") + ranges_map[event_range.eventRangeID] = TaskStatus.build_eventrange_dict( + event_range, f"outputfile-{event_range.eventRangeID}" + ) ts.set_file_merged([fname], outputfile, ranges_map, "guid") ts.save_status() print(ts._status)