From 30191361d91cb1b55e32f53bea2db37e98a56567 Mon Sep 17 00:00:00 2001 From: WayneZ Date: Fri, 12 Jul 2024 15:01:05 -0700 Subject: [PATCH 01/12] updated_reservation id thing --- .../launchpad-checkpoint.py | 2196 +++++++++++++++++ fireworks/core/launchpad.py | 15 +- .../.ipynb_checkpoints/lpad_run-checkpoint.py | 1565 ++++++++++++ 3 files changed, 3765 insertions(+), 11 deletions(-) create mode 100644 fireworks/core/.ipynb_checkpoints/launchpad-checkpoint.py create mode 100644 fireworks/scripts/.ipynb_checkpoints/lpad_run-checkpoint.py diff --git a/fireworks/core/.ipynb_checkpoints/launchpad-checkpoint.py b/fireworks/core/.ipynb_checkpoints/launchpad-checkpoint.py new file mode 100644 index 000000000..432d5f429 --- /dev/null +++ b/fireworks/core/.ipynb_checkpoints/launchpad-checkpoint.py @@ -0,0 +1,2196 @@ +"""The LaunchPad manages the FireWorks database.""" + +import datetime +import json +import os +import random +import shutil +import time +import traceback +import warnings +from collections import defaultdict +from itertools import chain +from sustodian import FindMyFW + +import gridfs +from bson import ObjectId +from monty.os.path import zpath +from monty.serialization import loadfn +from pymongo import ASCENDING, DESCENDING +from pymongo.errors import DocumentTooLarge +from tqdm import tqdm + +from fireworks.core.firework import Firework, FWAction, Launch, Tracker, Workflow +from fireworks.fw_config import MongoClient +from fireworks.fw_config import ( + GRIDFS_FALLBACK_COLLECTION, + LAUNCHPAD_LOC, + MAINTAIN_INTERVAL, + MONGO_SOCKET_TIMEOUT_MS, + RESERVATION_EXPIRATION_SECS, + RUN_EXPIRATION_SECS, + SORT_FWS, + WFLOCK_EXPIRATION_KILL, + WFLOCK_EXPIRATION_SECS, +) +from fireworks.utilities.fw_serializers import FWSerializable, reconstitute_dates, recursive_dict +from fireworks.utilities.fw_utilities import get_fw_logger + +__author__ = "Anubhav Jain" +__copyright__ = "Copyright 2013, The Materials Project" +__maintainer__ = "Anubhav Jain" +__email__ = 
"ajain@lbl.gov" +__date__ = "Jan 30, 2013" + + +# TODO: lots of duplication reduction and cleanup possible + + +def sort_aggregation(sort): + """Build sorting aggregation pipeline. + + Args: + sort [(str,int)]: sorting keys and directions as a list of + (str, int) tuples, i.e. [('updated_on', 1)] + """ + # Fix for sorting by dates which are actually stored as strings: + # Not sure about the underlying issue's source, but apparently some + # dates are stored as strings and others as date objects. + # Following pipeline makes sure all stored dates are actually date + # objects for proper comparison when sorting. + # Assumption below is that dates are either strings or date objects, + # nothing else. + aggregation = [] + for k, _ in sort: + if k in {"updated_on", "created_on"}: + aggregation.append( + { + "$set": { + k: { + "$dateFromString": { + "dateString": "$" + k, + "onError": "$" + k, # if conversion fails, just return original object + } + } + } + } + ) + aggregation.append({"$sort": dict(sort)}) + return aggregation + + +class LockedWorkflowError(ValueError): + """ + Error raised if the context manager WFLock can't acquire the lock on the WF within the selected + time interval (WFLOCK_EXPIRATION_SECS), if the killing of the lock is disabled (WFLOCK_EXPIRATION_KILL). + """ + + +class WFLock: + """ + Lock a Workflow, i.e. for performing update operations + Raises a LockedWorkflowError if the lock couldn't be acquired within expire_secs and kill==False. + Calling functions are responsible for handling the error in order to avoid database inconsistencies. + """ + + def __init__(self, lp, fw_id, expire_secs=WFLOCK_EXPIRATION_SECS, kill=WFLOCK_EXPIRATION_KILL) -> None: + """ + Args: + lp (LaunchPad) + fw_id (int): Firework id + expire_secs (int): max waiting time in seconds. + kill (bool): force lock acquisition or not. 
+ """ + self.lp = lp + self.fw_id = fw_id + self.expire_secs = expire_secs + self.kill = kill + + def __enter__(self): + ctr = 0 + waiting_time = 0 + # acquire lock + links_dict = self.lp.workflows.find_one_and_update( + {"nodes": self.fw_id, "locked": {"$exists": False}}, {"$set": {"locked": True}} + ) + # could not acquire lock b/c WF is already locked for writing + while not links_dict: + ctr += 1 + time_incr = ctr / 10.0 + random.random() / 100.0 + time.sleep(time_incr) # wait a bit for lock to free up + waiting_time += time_incr + if waiting_time > self.expire_secs: # too much time waiting, expire lock + wf = self.lp.workflows.find_one({"nodes": self.fw_id}) + if not wf: + raise ValueError(f"Could not find workflow in database: {self.fw_id}") + if self.kill: # force lock acquisition + self.lp.m_logger.warning(f"FORCIBLY ACQUIRING LOCK, WF: {self.fw_id}") + links_dict = self.lp.workflows.find_one_and_update( + {"nodes": self.fw_id}, {"$set": {"locked": True}} + ) + else: # throw error if we don't want to force lock acquisition + raise LockedWorkflowError(f"Could not get workflow - LOCKED: {self.fw_id}") + else: + # retry lock + links_dict = self.lp.workflows.find_one_and_update( + {"nodes": self.fw_id, "locked": {"$exists": False}}, {"$set": {"locked": True}} + ) + + def __exit__(self, exc_type, exc_val, exc_tb): + self.lp.workflows.find_one_and_update({"nodes": self.fw_id}, {"$unset": {"locked": True}}) + + +class LaunchPad(FWSerializable): + """The LaunchPad manages the FireWorks database.""" + + def __init__( + self, + host=None, + port=None, + name=None, + username=None, + password=None, + logdir=None, + strm_lvl=None, + user_indices=None, + wf_user_indices=None, + authsource=None, + uri_mode=False, + mongoclient_kwargs=None, + ) -> None: + """ + Args: + host (str): hostname. If uri_mode is True, a MongoDB connection string URI + (https://docs.mongodb.com/manual/reference/connection-string/) can be used instead of the remaining + options below. 
+ port (int): port number + name (str): database name + username (str) + password (str) + logdir (str): path to the log directory + strm_lvl (str): the logger stream level + user_indices (list): list of 'fireworks' collection indexes to be built + wf_user_indices (list): list of 'workflows' collection indexes to be built + authsource (str): authSource parameter for MongoDB authentication; defaults to "name" (i.e., db name) if + not set + uri_mode (bool): if set True, all Mongo connection parameters occur through a MongoDB URI string (set as + the host). + mongoclient_kwargs (dict): A list of any other custom keyword arguments to be + passed into the MongoClient connection. Use these kwargs to specify SSL/TLS or serverSelectionTimeoutMS + arguments. Note these arguments are different depending on the major pymongo version used; see + pymongo documentation for more details. + """ + self.host = host if (host or uri_mode) else "localhost" + self.port = port if (port or uri_mode) else 27017 + self.name = name if (name or uri_mode) else "fireworks" + self.username = username + self.password = password + self.authsource = authsource or self.name + self.mongoclient_kwargs = mongoclient_kwargs or {} + self.uri_mode = uri_mode + + # set up logger + self.logdir = logdir + self.strm_lvl = strm_lvl or "INFO" + self.m_logger = get_fw_logger("launchpad", l_dir=self.logdir, stream_level=self.strm_lvl) + + self.user_indices = user_indices or [] + self.wf_user_indices = wf_user_indices or [] + + # get connection + if uri_mode: + self.connection = MongoClient(host, **self.mongoclient_kwargs) + if self.name is None: + raise ValueError("Must specify a database name when using a MongoDB URI string.") + self.db = self.connection[self.name] + else: + self.connection = MongoClient( + self.host, + self.port, + socketTimeoutMS=MONGO_SOCKET_TIMEOUT_MS, + username=self.username, + password=self.password, + authSource=self.authsource, + **self.mongoclient_kwargs, + ) + self.db = 
self.connection[self.name] + + self.fireworks = self.db.fireworks + self.launches = self.db.launches + self.offline_runs = self.db.offline_runs + self.fw_id_assigner = self.db.fw_id_assigner + self.workflows = self.db.workflows + if GRIDFS_FALLBACK_COLLECTION: + self.gridfs_fallback = gridfs.GridFS(self.db, GRIDFS_FALLBACK_COLLECTION) + else: + self.gridfs_fallback = None + + self.backup_launch_data = {} + self.backup_fw_data = {} + + def to_dict(self): + """Note: usernames/passwords are exported as unencrypted Strings!""" + return { + "host": self.host, + "port": self.port, + "name": self.name, + "username": self.username, + "password": self.password, + "logdir": self.logdir, + "strm_lvl": self.strm_lvl, + "user_indices": self.user_indices, + "wf_user_indices": self.wf_user_indices, + "authsource": self.authsource, + "uri_mode": self.uri_mode, + "mongoclient_kwargs": self.mongoclient_kwargs, + } + + def update_spec(self, fw_ids, spec_document, mongo=False) -> None: + """ + Update fireworks with a spec. Sometimes you need to modify a firework in progress. + + Args: + fw_ids [int]: All fw_ids to modify. + spec_document (dict): The spec document. Note that only modifications to + the spec key are allowed. So if you supply {"_tasks.1.parameter": "hello"}, + you are effectively modifying spec._tasks.1.parameter in the actual fireworks + collection. + mongo (bool): spec_document uses mongo syntax to directly update the spec + """ + mod_spec = spec_document if mongo else {"$set": {"spec." + k: v for k, v in spec_document.items()}} + + allowed_states = ["READY", "WAITING", "FIZZLED", "DEFUSED", "PAUSED"] + self.fireworks.update_many({"fw_id": {"$in": fw_ids}, "state": {"$in": allowed_states}}, mod_spec) + for fw in self.fireworks.find( + {"fw_id": {"$in": fw_ids}, "state": {"$nin": allowed_states}}, {"fw_id": 1, "state": 1} + ): + self.m_logger.warning( + f"Cannot update spec of fw_id: {fw['fw_id']} with state: {fw['state']}. Try rerunning first." 
+ ) + + @classmethod + def from_dict(cls, d): + port = d.get("port", None) + name = d.get("name", None) + username = d.get("username", None) + password = d.get("password", None) + logdir = d.get("logdir", None) + strm_lvl = d.get("strm_lvl", None) + user_indices = d.get("user_indices", []) + wf_user_indices = d.get("wf_user_indices", []) + authsource = d.get("authsource", None) + uri_mode = d.get("uri_mode", False) + mongoclient_kwargs = d.get("mongoclient_kwargs", None) + return LaunchPad( + d["host"], + port, + name, + username, + password, + logdir, + strm_lvl, + user_indices, + wf_user_indices, + authsource, + uri_mode, + mongoclient_kwargs, + ) + + @classmethod + def auto_load(cls): + if LAUNCHPAD_LOC: + return LaunchPad.from_file(LAUNCHPAD_LOC) + return LaunchPad() + + def reset(self, password, require_password=True, max_reset_wo_password=25) -> None: + """ + Create a new FireWorks database. This will overwrite the existing FireWorks database! To + safeguard against accidentally erasing an existing database, a password must be entered. + + Args: + password (str): A String representing today's date, e.g. '2012-12-31' + require_password (bool): Whether a password is required to reset the DB. Setting to + false is dangerous because running code unintentionally could clear your DB - use + max_reset_wo_password to minimize risk. 
+ max_reset_wo_password (int): A failsafe; when require_password is set to False, + FWS will not clear DBs that contain more workflows than this parameter + """ + m_password = datetime.datetime.now().strftime("%Y-%m-%d") + + if password == m_password or ( + not require_password and self.workflows.count_documents({}) <= max_reset_wo_password + ): + self.fireworks.delete_many({}) + self.launches.delete_many({}) + self.workflows.delete_many({}) + self.offline_runs.delete_many({}) + self._restart_ids(1, 1) + if self.gridfs_fallback is not None: + self.db.drop_collection(f"{GRIDFS_FALLBACK_COLLECTION}.chunks") + self.db.drop_collection(f"{GRIDFS_FALLBACK_COLLECTION}.files") + self.tuneup() + self.m_logger.info("LaunchPad was RESET.") + elif not require_password: + raise ValueError( + f"Password check cannot be overridden since the size of DB ({self.fireworks.count_documents({})} " + f"workflows) is greater than the max_reset_wo_password parameter ({max_reset_wo_password})." + ) + else: + raise ValueError(f"Invalid password! Password is today's date: {m_password}") + + def maintain(self, infinite=True, maintain_interval=None) -> None: + """ + Perform launchpad maintenance: detect lost runs and unreserved RESERVE launches. 
+ + Args: + infinite (bool) + maintain_interval (seconds): sleep time + """ + maintain_interval = maintain_interval or MAINTAIN_INTERVAL + + while True: + self.m_logger.info("Performing maintenance on Launchpad...") + self.m_logger.debug("Tracking down FIZZLED jobs...") + fl, ff, inconsistent_fw_ids = self.detect_lostruns(fizzle=True) + if fl: + self.m_logger.info(f"Detected {len(fl)} FIZZLED launches: {fl}") + self.m_logger.info(f"Detected {len(ff)} FIZZLED FWs: {ff}") + if inconsistent_fw_ids: + self.m_logger.info( + f"Detected {len(inconsistent_fw_ids)} FIZZLED inconsistent fireworks: {inconsistent_fw_ids}" + ) + + self.m_logger.debug("Tracking down stuck RESERVED jobs...") + ur = self.detect_unreserved(rerun=True) + if ur: + self.m_logger.info(f"Unreserved {len(ur)} RESERVED launches: {ur}") + + self.m_logger.info("LaunchPad was MAINTAINED.") + + if not infinite: + break + + self.m_logger.debug(f"Sleeping for {maintain_interval} secs...") + time.sleep(maintain_interval) + + def add_wf(self, wf, reassign_all=True): + """ + Add workflow(or firework) to the launchpad. The firework ids will be reassigned. + + Args: + wf (Workflow/Firework): Workflow or Firework object + reassign_all (bool): If True, the firework ids will be assigned + starting from the next available id. Defaults to True. + + Returns: + dict: mapping between old and new Firework ids + """ + if isinstance(wf, Firework): + wf = Workflow.from_firework(wf) + # sets the root FWs as READY + # prefer to wf.refresh() for speed reasons w/many root FWs + for fw_id in wf.root_fw_ids: + wf.id_fw[fw_id].state = "READY" + wf.fw_states[fw_id] = "READY" + # insert the FireWorks and get back mapping of old to new ids + old_new = self._upsert_fws(list(wf.id_fw.values()), reassign_all=reassign_all) + # update the Workflow with the new ids + wf._reassign_ids(old_new) + # insert the WFLinks + self.workflows.insert_one(wf.to_db_dict()) + self.m_logger.info(f"Added a workflow. 
id_map: {old_new}") + return old_new + + def bulk_add_wfs(self, wfs) -> None: + """ + Adds a list of workflows to the fireworks database + using insert_many for both the fws and wfs, is + more efficient than adding them one at a time. + + Args: + wfs ([Workflow]): list of workflows or fireworks + + Returns: + None + + """ + # Make all fireworks workflows + wfs = [Workflow.from_Firework(wf) if isinstance(wf, Firework) else wf for wf in wfs] + + # Initialize new firework counter, starting from the next fw id + total_num_fws = sum(len(wf) for wf in wfs) + new_fw_counter = self.fw_id_assigner.find_one_and_update({}, {"$inc": {"next_fw_id": total_num_fws}})[ + "next_fw_id" + ] + for wf in tqdm(wfs): + # Reassign fw_ids and increment the counter + old_new = dict(zip(wf.id_fw, range(new_fw_counter, new_fw_counter + len(wf)))) + for fw in wf: + fw.fw_id = old_new[fw.fw_id] + wf._reassign_ids(old_new) + new_fw_counter += len(wf) + + # Set root fws to READY + for fw_id in wf.root_fw_ids: + wf.id_fw[fw_id].state = "READY" + wf.fw_states[fw_id] = "READY" + + # Insert all fws and wfs, do workflows first so fws don't + # get checked out prematurely + self.workflows.insert_many(wf.to_db_dict() for wf in wfs) + all_fws = chain.from_iterable(wf for wf in wfs) + self.fireworks.insert_many(fw.to_db_dict() for fw in all_fws) + return + + def append_wf(self, new_wf, fw_ids, detour=False, pull_spec_mods=True) -> None: + """ + Append a new workflow on top of an existing workflow. 
+ + Args: + new_wf (Workflow): The new workflow to append + fw_ids ([int]): The parent fw_ids at which to append the workflow + detour (bool): Whether to connect the new Workflow in a "detour" style, i.e., move + original children of the parent fw_ids to the new_wf + pull_spec_mods (bool): Whether the new Workflow should pull the FWActions of the parent + fw_ids + """ + wf = self.get_wf_by_fw_id(fw_ids[0]) + updated_ids = wf.append_wf(new_wf, fw_ids, detour=detour, pull_spec_mods=pull_spec_mods) + with WFLock(self, fw_ids[0]): + self._update_wf(wf, updated_ids) + + def get_launch_by_id(self, launch_id): + """ + Given a Launch id, return details of the Launch. + + Args: + launch_id (int): launch id. + + Returns: + Launch object + """ + m_launch = self.launches.find_one({"launch_id": launch_id}) + if m_launch: + m_launch["action"] = get_action_from_gridfs(m_launch.get("action"), self.gridfs_fallback) + return Launch.from_dict(m_launch) + raise ValueError(f"No Launch exists with {launch_id=}") + + def get_fw_dict_by_id(self, fw_id): + """ + Given firework id, return firework dict. + + Args: + fw_id (int): Firework id. 
+ + Returns: + dict + """ + fw_dict = self.fireworks.find_one({"fw_id": fw_id}) + if not fw_dict: + raise ValueError(f"No Firework exists with id: {fw_id}") + # recreate launches from the launch collection + launches = list( + self.launches.find({"launch_id": {"$in": fw_dict["launches"]}}, sort=[("launch_id", ASCENDING)]) + ) + for launch in launches: + launch["action"] = get_action_from_gridfs(launch.get("action"), self.gridfs_fallback) + fw_dict["launches"] = launches + launches = list( + self.launches.find({"launch_id": {"$in": fw_dict["archived_launches"]}}, sort=[("launch_id", ASCENDING)]) + ) + for launch in launches: + launch["action"] = get_action_from_gridfs(launch.get("action"), self.gridfs_fallback) + fw_dict["archived_launches"] = launches + return fw_dict + + def get_fw_by_id(self, fw_id): + """ + Given a Firework id, give back a Firework object. + + Args: + fw_id (int): Firework id. + + Returns: + Firework object + """ + return Firework.from_dict(self.get_fw_dict_by_id(fw_id)) + + def get_wf_by_fw_id(self, fw_id): + """Given a Firework id, give back the Workflow containing that Firework. + + Args: + fw_id (int): Firework id. + + Returns: + A Workflow object + """ + links_dict = self.workflows.find_one({"nodes": fw_id}) + if not links_dict: + raise ValueError(f"Could not find a Workflow with {fw_id=}") + fws = map(self.get_fw_by_id, links_dict["nodes"]) + return Workflow( + fws, + links_dict["links"], + links_dict["name"], + links_dict["metadata"], + links_dict["created_on"], + links_dict["updated_on"], + ) + + def get_wf_by_fw_id_lzyfw(self, fw_id: int) -> Workflow: + """Given a FireWork id, give back the Workflow containing that FireWork. + + Args: + fw_id (int): FireWork id. 
+ + Returns: + A Workflow object + """ + links_dict = self.workflows.find_one({"nodes": fw_id}) + if not links_dict: + raise ValueError(f"Could not find a Workflow with {fw_id=}") + + fws = [ + LazyFirework(fw_id, self.fireworks, self.launches, self.gridfs_fallback) for fw_id in links_dict["nodes"] + ] + + # Check for fw_states in links_dict to conform with pre-optimized workflows + fw_states = {int(k): v for k, v in links_dict["fw_states"].items()} if "fw_states" in links_dict else None + + return Workflow( + fws, + links_dict["links"], + links_dict["name"], + links_dict["metadata"], + links_dict["created_on"], + links_dict["updated_on"], + fw_states, + ) + + def delete_fws(self, fw_ids, delete_launch_dirs=False) -> None: + """Delete a set of fireworks identified by their fw_ids. + + ATTENTION: This function serves maintenance purposes and will leave + workflows untouched. Its use will thus result in a corrupted database. + Use 'delete_wf' instead for consistently deleting workflows together + with their fireworks. + + Args: + fw_ids ([int]): Firework ids + delete_launch_dirs (bool): if True all the launch directories associated with + the WF will be deleted as well, if possible. 
+ """ + potential_launch_ids = [] + launch_ids = [] + for fw_id in fw_ids: + fw_dict = self.fireworks.find_one({"fw_id": fw_id}) + if fw_dict: + potential_launch_ids += fw_dict.get("launches", []) + fw_dict.get("archived_launches", []) + + launch_ids = [ + launch_id + for launch_id in potential_launch_ids + if not self.fireworks.find_one( + {"$or": [{"launches": launch_id}, {"archived_launches": launch_id}], "fw_id": {"$nin": fw_ids}}, + {"launch_id": 1}, + ) + ] + + if delete_launch_dirs: + launch_dirs = [ + self.launches.find_one({"launch_id": launch_id}, {"launch_dir": 1})["launch_dir"] + for launch_id in launch_ids + ] + print(f"Remove folders {launch_dirs}") + for launch_dir in launch_dirs: + shutil.rmtree(launch_dir, ignore_errors=True) + + print(f"Remove fws {fw_ids}") + if self.gridfs_fallback is not None: + for launch_id in launch_ids: + for file_id in self.gridfs_fallback.find({"metadata.launch_id": launch_id}): + self.gridfs_fallback.delete(file_id._id) + print(f"Remove launches {launch_ids}") + self.launches.delete_many({"launch_id": {"$in": launch_ids}}) + self.offline_runs.delete_many({"launch_id": {"$in": launch_ids}}) + self.fireworks.delete_many({"fw_id": {"$in": fw_ids}}) + + def delete_wf(self, fw_id, delete_launch_dirs=False) -> None: + """ + Delete the workflow containing firework with the given id. + + Args: + fw_id (int): Firework id + delete_launch_dirs (bool): if True all the launch directories associated with + the WF will be deleted as well, if possible. + delete_launch_dirs + """ + links_dict = self.workflows.find_one({"nodes": fw_id}) + fw_ids = links_dict["nodes"] + self.delete_fws(fw_ids, delete_launch_dirs=delete_launch_dirs) + print("Removing workflow.") + self.workflows.delete_one({"nodes": fw_id}) + + def get_wf_summary_dict(self, fw_id, mode="more"): + """ + A much faster way to get summary information about a Workflow by querying only for + needed information. + + Args: + fw_id (int): A Firework id. 
+ mode (str): Choose between "more", "less" and "all" in terms of quantity of information. + + Returns: + dict: information about Workflow. + """ + wf_fields = ["state", "created_on", "name", "nodes"] + fw_fields = ["state", "fw_id"] + launch_fields = [] + + if mode != "less": + wf_fields.append("updated_on") + fw_fields.extend(["name", "launches"]) + launch_fields.extend(("launch_id", "launch_dir")) + + if mode == "reservations": + launch_fields.append("state_history.reservation_id") + + if mode == "all": + wf_fields = None + + wf = self.workflows.find_one({"nodes": fw_id}, projection=wf_fields) + fw_data = [] + id_name_map = {} + launch_ids = [] + for fw in self.fireworks.find({"fw_id": {"$in": wf["nodes"]}}, projection=fw_fields): + if launch_fields: + launch_ids.extend(fw["launches"]) + fw_data.append(fw) + if mode != "less": + id_name_map[fw["fw_id"]] = f"{fw['name']}--{int(fw['fw_id'])}" + + if launch_fields: + launch_info = defaultdict(list) + for launch in self.launches.find({"launch_id": {"$in": launch_ids}}, projection=launch_fields): + for i, fw in enumerate(fw_data): + if launch["launch_id"] in fw["launches"]: + launch_info[i].append(launch) + for k, v in launch_info.items(): + fw_data[k]["launches"] = v + + wf["fw"] = fw_data + + # Post process the summary dict so that it "looks" better. 
+ if mode == "less": + wf["states_list"] = "-".join( + [fw["state"][:3] if fw["state"].startswith("R") else fw["state"][0] for fw in wf["fw"]] + ) + del wf["nodes"] + + if mode == "more" or mode == "all": + wf["states"] = {} + wf["launch_dirs"] = {} + for fw in wf["fw"]: + k = f"{fw['name']}--{int(fw['fw_id'])}" + wf["states"][k] = fw["state"] + wf["launch_dirs"][k] = [launch["launch_dir"] for launch in fw["launches"]] + del wf["nodes"] + + if mode == "all": + del wf["fw_states"] + wf["links"] = {id_name_map[int(k)]: [id_name_map[i] for i in v] for k, v in wf["links"].items()} + wf["parent_links"] = { + id_name_map[int(k)]: [id_name_map[i] for i in v] for k, v in wf["parent_links"].items() + } + if mode == "reservations": + wf["states"] = {} + wf["launches"] = {} + for fw in wf["fw"]: + k = f"{fw['name']}--{int(fw['fw_id'])}" + wf["states"][k] = fw["state"] + wf["launches"][k] = fw["launches"] + del wf["nodes"] + + del wf["_id"] + del wf["fw"] + + return wf + + def get_fw_ids(self, query=None, sort=None, limit=0, count_only=False, launches_mode=False): + """ + Return all the fw ids that match a query. 
+ + Args: + query (dict): representing a Mongo query + sort [(str,str)]: sort argument in Pymongo format + limit (int): limit the results + count_only (bool): only return the count rather than explicit ids + launches_mode (bool): query the launches collection instead of fireworks + + Returns: + list: list of firework ids matching the query + """ + coll = "launches" if launches_mode else "fireworks" + criteria = query or {} + if launches_mode: + lids = self._get_active_launch_ids() + criteria["launch_id"] = {"$in": lids} + + if count_only and limit: + return ValueError("Cannot count_only and limit at the same time!") + + aggregation = [] + + if criteria is not None: + aggregation.append({"$match": criteria}) + + if count_only: + aggregation.append({"$count": "count"}) + self.m_logger.debug(f"Aggregation '{aggregation}'.") + + cursor = getattr(self, coll).aggregate(aggregation) + res = list(cursor) + return res[0]["count"] if len(res) > 0 else 0 + + if sort is not None: + aggregation.extend(sort_aggregation(sort)) + + aggregation.append({"$project": {"fw_id": True, "_id": False}}) + + if limit is not None and limit > 0: + aggregation.append({"$limit": limit}) + + self.m_logger.debug(f"Aggregation '{aggregation}'.") + cursor = getattr(self, coll).aggregate(aggregation) + return [fw["fw_id"] for fw in cursor] + + def get_wf_ids(self, query=None, sort=None, limit=0, count_only=False): + """ + Return one fw id for all workflows that match a query. 
+ + Args: + query (dict): representing a Mongo query + sort [(str,str)]: sort argument in Pymongo format + limit (int): limit the results + count_only (bool): only return the count rather than explicit ids + + Returns: + list: list of firework ids + """ + criteria = query or {} + aggregation = [] + + if criteria is not None: + aggregation.append({"$match": criteria}) + + if count_only: + aggregation.append({"$count": "count"}) + self.m_logger.debug(f"Aggregation '{aggregation}'.") + + cursor = self.workflows.aggregate(aggregation) + res = list(cursor) + return res[0]["count"] if len(res) > 0 else 0 + + if sort is not None: + aggregation.extend(sort_aggregation(sort)) + + aggregation.append({"$project": {"nodes": True, "_id": False}}) + + if limit is not None and limit > 0: + aggregation.append({"$limit": limit}) + + self.m_logger.debug(f"Aggregation '{aggregation}'.") + cursor = self.workflows.aggregate(aggregation) + + return [fw["nodes"][0] for fw in cursor] + + def get_fw_ids_in_wfs( + self, wf_query=None, fw_query=None, sort=None, limit=0, count_only=False, launches_mode=False + ): + """ + Return all fw ids that match fw_query within workflows that match wf_query. 
+ + Args: + wf_query (dict): representing a Mongo query on workflows + fw_query (dict): representing a Mongo query on Fireworks + sort [(str,str)]: sort argument in Pymongo format + limit (int): limit the results + count_only (bool): only return the count rather than explicit ids + launches_mode (bool): query the launches collection instead of fireworks + + Returns: + list: list of firework ids matching the query + """ + coll = "launches" if launches_mode else "fireworks" + if launches_mode: + lids = self._get_active_launch_ids() + if fw_query is None: + fw_query = {} + fw_query["launch_id"] = {"$in": lids} + + if count_only and limit: + return ValueError("Cannot count_only and limit at the same time!") + + aggregation = [] + + if wf_query is not None: + aggregation.append( + {"$match": wf_query}, + ) + + aggregation.extend( + [ + {"$project": {"nodes": True, "_id": False}}, + {"$unwind": "$nodes"}, + { + "$lookup": { + "from": coll, # fireworks or launches + "localField": "nodes", + "foreignField": "fw_id", + "as": "fireworks", + } + }, + {"$project": {"fireworks": 1, "_id": 0}}, + {"$unwind": "$fireworks"}, + {"$replaceRoot": {"newRoot": "$fireworks"}}, + ] + ) + + if fw_query is not None: + aggregation.append({"$match": fw_query}) + + if count_only: + aggregation.append({"$count": "count"}) + self.m_logger.debug(f"Aggregation '{aggregation}'.") + + cursor = self.workflows.aggregate(aggregation) + res = list(cursor) + return res[0]["count"] if len(res) > 0 else 0 + + if sort is not None: + aggregation.extend(sort_aggregation(sort)) + + aggregation.append({"$project": {"fw_id": True, "_id": False}}) + + if limit is not None and limit > 0: + aggregation.append({"$limit": limit}) + + self.m_logger.debug(f"Aggregation '{aggregation}'.") + cursor = self.workflows.aggregate(aggregation) + return [fw["fw_id"] for fw in cursor] + + def run_exists(self, fworker=None): + """ + Checks to see if the database contains any FireWorks that are ready to run. 
+ + Returns: + bool: True if the database contains any FireWorks that are ready to run. + """ + q = fworker.query if fworker else {} + return bool(self._get_a_fw_to_run(query=q, checkout=False)) + + def future_run_exists(self, fworker=None) -> bool: + """Check if database has any current OR future Fireworks available. + + Returns: + bool: True if database has any ready or waiting Fireworks. + """ + if self.run_exists(fworker): + # check first to see if any are READY + return True + # retrieve all [RUNNING/RESERVED] fireworks + q = fworker.query if fworker else {} + q.update(state={"$in": ["RUNNING", "RESERVED"]}) + active = self.get_fw_ids(q) + # then check if they have WAITING children + for fw_id in active: + children = self.get_wf_by_fw_id_lzyfw(fw_id).links[fw_id] + if any(self.get_fw_dict_by_id(i)["state"] == "WAITING" for i in children): + return True + + # if we loop over all active and none have WAITING children + # there is no future work to do + return False + + def tuneup(self, bkground=True) -> None: + """Database tuneup: build indexes.""" + self.m_logger.info("Performing db tune-up") + + self.m_logger.debug("Updating indices...") + self.fireworks.create_index("fw_id", unique=True, background=bkground) + for f in ("state", "spec._category", "created_on", "updated_on", "name", "launches"): + self.fireworks.create_index(f, background=bkground) + + self.launches.create_index("launch_id", unique=True, background=bkground) + self.launches.create_index("fw_id", background=bkground) + self.launches.create_index("state_history.reservation_id", background=bkground) + + if GRIDFS_FALLBACK_COLLECTION is not None: + files_collection = self.db[f"{GRIDFS_FALLBACK_COLLECTION}.files"] + files_collection.create_index("metadata.launch_id", unique=True, background=bkground) + + for f in ("state", "time_start", "time_end", "host", "ip", "fworker.name"): + self.launches.create_index(f, background=bkground) + + for f in ("name", "created_on", "updated_on", "nodes"): + 
self.workflows.create_index(f, background=bkground) + + for idx in self.user_indices: + self.fireworks.create_index(idx, background=bkground) + + for idx in self.wf_user_indices: + self.workflows.create_index(idx, background=bkground) + + # for frontend, which needs to sort on _id after querying on state + self.fireworks.create_index([("state", DESCENDING), ("_id", DESCENDING)], background=bkground) + self.fireworks.create_index( + [("state", DESCENDING), ("spec._priority", DESCENDING), ("created_on", DESCENDING)], background=bkground + ) + self.fireworks.create_index( + [("state", DESCENDING), ("spec._priority", DESCENDING), ("created_on", ASCENDING)], background=bkground + ) + self.workflows.create_index([("state", DESCENDING), ("_id", DESCENDING)], background=bkground) + + if not bkground: + self.m_logger.debug("Compacting database...") + try: + self.db.command({"compact": "fireworks"}) + self.db.command({"compact": "launches"}) + except Exception: + self.m_logger.debug("Database compaction failed (not critical)") + + def pause_fw(self, fw_id): + """ + Given the firework id, pauses the firework and refresh the workflow. + + Args: + fw_id(int): firework id + """ + allowed_states = ["WAITING", "READY", "RESERVED"] + f = self.fireworks.find_one_and_update( + {"fw_id": fw_id, "state": {"$in": allowed_states}}, + {"$set": {"state": "PAUSED", "updated_on": datetime.datetime.utcnow()}}, + ) + if f: + self._refresh_wf(fw_id) + if not f: + self.m_logger.error(f"No pausable (WAITING,READY,RESERVED) Firework exists with {fw_id=}") + return f + + def defuse_fw(self, fw_id, rerun_duplicates=True): + """ + Given the firework id, defuse the firework and refresh the workflow. + + Args: + fw_id (int): firework id + rerun_duplicates (bool): if True, duplicate fireworks(ones with the same launch) are + marked for rerun and then defused. 
+ """ + allowed_states = ["DEFUSED", "WAITING", "READY", "FIZZLED", "PAUSED"] + f = self.fireworks.find_one_and_update( + {"fw_id": fw_id, "state": {"$in": allowed_states}}, + {"$set": {"state": "DEFUSED", "updated_on": datetime.datetime.utcnow()}}, + ) + if f: + self._refresh_wf(fw_id) + if not f: + self.rerun_fw(fw_id, rerun_duplicates) + f = self.fireworks.find_one_and_update( + {"fw_id": fw_id, "state": {"$in": allowed_states}}, + {"$set": {"state": "DEFUSED", "updated_on": datetime.datetime.utcnow()}}, + ) + if f: + self._refresh_wf(fw_id) + return f + + def reignite_fw(self, fw_id): + """ + Given the firework id, re-ignite(set state=WAITING) the defused firework. + + Args: + fw_id (int): firework id + """ + f = self.fireworks.find_one_and_update( + {"fw_id": fw_id, "state": "DEFUSED"}, + {"$set": {"state": "WAITING", "updated_on": datetime.datetime.utcnow()}}, + ) + if f: + self._refresh_wf(fw_id) + return f + + def resume_fw(self, fw_id): + """ + Given the firework id, resume (set state=WAITING) the paused firework. + + Args: + fw_id (int): firework id + """ + f = self.fireworks.find_one_and_update( + {"fw_id": fw_id, "state": "PAUSED"}, + {"$set": {"state": "WAITING", "updated_on": datetime.datetime.utcnow()}}, + ) + if f: + self._refresh_wf(fw_id) + return f + + def defuse_wf(self, fw_id, defuse_all_states=True) -> None: + """ + Defuse the workflow containing the given firework id. + + Args: + fw_id (int): firework id + defuse_all_states (bool) + """ + wf = self.get_wf_by_fw_id_lzyfw(fw_id) + for fw in wf: + if fw.state not in ["COMPLETED", "FIZZLED"] or defuse_all_states: + self.defuse_fw(fw.fw_id) + + def pause_wf(self, fw_id) -> None: + """ + Pause the workflow containing the given firework id. 
+ + Args: + fw_id (int): firework id + """ + wf = self.get_wf_by_fw_id_lzyfw(fw_id) + for fw in wf: + if fw.state not in ["COMPLETED", "FIZZLED", "DEFUSED"]: + self.pause_fw(fw.fw_id) + + def reignite_wf(self, fw_id) -> None: + """ + Reignite the workflow containing the given firework id. + + Args: + fw_id (int): firework id + """ + wf = self.get_wf_by_fw_id_lzyfw(fw_id) + for fw in wf: + self.reignite_fw(fw.fw_id) + + def archive_wf(self, fw_id) -> None: + """ + Archive the workflow containing the given firework id. + + Args: + fw_id (int): firework id + """ + # first archive all the launches, so they are not used in duplicate checks + wf = self.get_wf_by_fw_id_lzyfw(fw_id) + if wf.state != "ARCHIVED": + fw_ids = [f.fw_id for f in wf] + for fw_id in fw_ids: + self.rerun_fw(fw_id) + + # second set the state of all FWs to ARCHIVED + wf = self.get_wf_by_fw_id_lzyfw(fw_id) + for fw in wf: + self.fireworks.find_one_and_update( + {"fw_id": fw.fw_id}, {"$set": {"state": "ARCHIVED", "updated_on": datetime.datetime.utcnow()}} + ) + self._refresh_wf(fw.fw_id) + + def _restart_ids(self, next_fw_id, next_launch_id) -> None: + """ + internal method used to reset firework id counters. + + Args: + next_fw_id (int): id to give next Firework + next_launch_id (int): id to give next Launch + """ + self.fw_id_assigner.delete_many({}) + self.fw_id_assigner.find_one_and_replace( + {"_id": -1}, {"next_fw_id": next_fw_id, "next_launch_id": next_launch_id}, upsert=True + ) + self.m_logger.debug(f"RESTARTED fw_id, launch_id to ({next_fw_id}, {next_launch_id})") + + def _check_fw_for_uniqueness(self, m_fw) -> bool: + """ + Check if there are duplicates. If not unique, a new id is assigned and the workflow + refreshed. 
+
+        Args:
+            m_fw (Firework)
+
+        Returns:
+            bool: True if the firework is unique
+        """
+        if not self._steal_launches(m_fw):
+            self.m_logger.debug(f"FW with id: {m_fw.fw_id} is unique!")
+            return True
+        self._upsert_fws([m_fw])  # update the DB with the new launches
+        self._refresh_wf(m_fw.fw_id)  # since we updated a state, we need to refresh the WF again
+        return False
+
+    def _get_a_fw_to_run(self, query=None, fw_id=None, checkout=True):
+        """
+        Get the next ready firework to run.
+
+        Loops until it finds a READY Firework that passes the duplicate check
+        (_check_fw_for_uniqueness); duplicates have their launches stolen and are
+        skipped, so a single call may touch several DB documents.
+
+        Args:
+            query (dict)
+            fw_id (int): If given the query is updated.
+                Note: We want to return None if this specific FW doesn't exist anymore. This is
+                because our queue params might have been tailored to this FW.
+            checkout (bool): if True, check out the matching firework and set state=RESERVED
+
+        Returns:
+            Firework
+        """
+        m_query = dict(query) if query else {}  # make a defensive copy
+        m_query["state"] = "READY"
+        # highest spec._priority first; created_on breaks ties per SORT_FWS policy
+        sortby = [("spec._priority", DESCENDING)]
+
+        if SORT_FWS.upper() == "FIFO":
+            sortby.append(("created_on", ASCENDING))
+        elif SORT_FWS.upper() == "FILO":
+            sortby.append(("created_on", DESCENDING))
+
+        # Override query if fw_id defined
+        # NOTE: this replaces (not augments) the FWorker query; RESERVED is also
+        # accepted so an already-reserved FW can be re-fetched by id
+        if fw_id:
+            m_query = {"fw_id": fw_id, "state": {"$in": ["READY", "RESERVED"]}}
+
+        while True:
+            # check out the matching firework, depending on the query set by the FWorker
+            if checkout:
+                # atomic claim: find-and-set state=RESERVED so two workers can't grab the same FW
+                m_fw = self.fireworks.find_one_and_update(
+                    m_query, {"$set": {"state": "RESERVED", "updated_on": datetime.datetime.utcnow()}}, sort=sortby
+                )
+            else:
+                m_fw = self.fireworks.find_one(m_query, {"fw_id": 1, "spec": 1}, sort=sortby)
+
+            if not m_fw:
+                return None
+            # re-fetch as a full Firework object (the projection above is partial)
+            m_fw = self.get_fw_by_id(m_fw["fw_id"])
+            if self._check_fw_for_uniqueness(m_fw):
+                return m_fw
+
+    def _get_active_launch_ids(self):
+        """
+        Get all the launch ids. 
+ + Returns: + list: all launch ids + """ + all_launch_ids = [] + for launch in self.fireworks.find({}, {"launches": 1}): + all_launch_ids.extend(launch["launches"]) + return all_launch_ids + + def reserve_fw(self, fworker, launch_dir, host=None, ip=None, fw_id=None): + """ + Checkout the next ready firework and mark the launch reserved. + + Args: + fworker (FWorker) + launch_dir (str): path to the launch directory. + host (str): hostname + ip (str): ip address + fw_id (int): fw_id to be reserved, if desired + + Returns: + (Firework, int): the checked out firework and the new launch id. + """ + return self.checkout_fw(fworker, launch_dir, host=host, ip=ip, fw_id=fw_id, state="RESERVED") + + def get_fw_ids_from_reservation_id(self, reservation_id): + + fw_id=FindMyFW.get_fwid(reservation_id) + return fw_id + + def cancel_reservation_by_reservation_id(self, reservation_id) -> None: + """Given the reservation id, cancel the reservation and rerun the corresponding fireworks.""" + l_id = self.launches.find_one( + {"state_history.reservation_id": reservation_id, "state": "RESERVED"}, {"launch_id": 1} + ) + if l_id: + self.cancel_reservation(l_id["launch_id"]) + else: + self.m_logger.info(f"Can't find any reserved jobs with reservation id: {reservation_id}") + + def get_reservation_id_from_fw_id(self, fw_id): + """Given the firework id, return the reservation id.""" + fw = self.fireworks.find_one({"fw_id": fw_id}, {"launches": 1}) + if fw: + for launch in self.launches.find({"launch_id": {"$in": fw["launches"]}}, {"state_history": 1}): + for d in launch["state_history"]: + if "reservation_id" in d: + return d["reservation_id"] + return None + return None + + def cancel_reservation(self, launch_id) -> None: + """Given the launch id, cancel the reservation and rerun the fireworks.""" + m_launch = self.get_launch_by_id(launch_id) + m_launch.state = "READY" + self.launches.find_one_and_replace( + {"launch_id": m_launch.launch_id, "state": "RESERVED"}, m_launch.to_db_dict(), 
upsert=True
+        )
+
+        for fw in self.fireworks.find({"launches": launch_id, "state": "RESERVED"}, {"fw_id": 1}):
+            self.rerun_fw(fw["fw_id"], rerun_duplicates=False)
+
+    def detect_unreserved(self, expiration_secs=RESERVATION_EXPIRATION_SECS, rerun=False):
+        """
+        Return the reserved launch ids that have not been updated for a while.
+
+        Args:
+            expiration_secs (seconds): time limit
+            rerun (bool): if True, the expired reservations are cancelled and the fireworks rerun.
+
+        Returns:
+            [int]: list of expired launch ids
+        """
+        now_time = datetime.datetime.utcnow()
+        # NOTE: updated_on in state_history is compared as an ISO string here -
+        # presumably it is stored in the same string form; confirm against touch_history
+        cutoff_time_str = (now_time - datetime.timedelta(seconds=expiration_secs)).isoformat()
+        bad_launch_data = self.launches.find(
+            {
+                "state": "RESERVED",
+                "state_history": {"$elemMatch": {"state": "RESERVED", "updated_on": {"$lte": cutoff_time_str}}},
+            },
+            {"launch_id": 1, "fw_id": 1},
+        )
+        # double-check against the fireworks collection: only count launches whose
+        # FW is itself still RESERVED
+        bad_launch_ids = [
+            ld["launch_id"]
+            for ld in bad_launch_data
+            if self.fireworks.find_one({"fw_id": ld["fw_id"], "state": "RESERVED"}, {"fw_id": 1})
+        ]
+        if rerun:
+            for lid in bad_launch_ids:
+                self.cancel_reservation(lid)
+        return bad_launch_ids
+
+    def mark_fizzled(self, launch_id) -> None:
+        """
+        Mark the launch corresponding to the given id as FIZZLED.
+
+        Args:
+            launch_id (int): launch id.
+        """
+        # Do a confirmed write and make sure state_history is preserved
+        self.complete_launch(launch_id, state="FIZZLED")
+
+    def detect_lostruns(
+        self,
+        expiration_secs=RUN_EXPIRATION_SECS,
+        fizzle=False,
+        rerun=False,
+        max_runtime=None,
+        min_runtime=None,
+        refresh=False,
+        query=None,
+        launch_query=None,
+    ):
+        """
+        Detect lost runs i.e running fireworks that haven't been updated within the specified
+        time limit or running firework whose launch has been marked fizzed or completed. 
+ + Args: + expiration_secs (seconds): expiration time in seconds + fizzle (bool): if True, mark the lost runs fizzed + rerun (bool): if True, mark the lost runs fizzed and rerun + max_runtime (seconds): maximum run time + min_runtime (seconds): minimum run time + refresh (bool): if True, refresh the workflow with inconsistent fireworks. + query (dict): restrict search to FWs matching this query + launch_query (dict): restrict search to launches matching this query (e.g. host restriction) + + Returns: + ([int], [int], [int]): tuple of list of lost launch ids, lost firework ids and + inconsistent firework ids. + """ + lost_launch_ids = [] + lost_fw_ids = [] + potential_lost_fw_ids = [] + now_time = datetime.datetime.utcnow() + cutoff_timestr = (now_time - datetime.timedelta(seconds=expiration_secs)).isoformat() + + lostruns_query = launch_query or {} + lostruns_query["state"] = "RUNNING" + lostruns_query["state_history"] = {"$elemMatch": {"state": "RUNNING", "updated_on": {"$lte": cutoff_timestr}}} + + if query: + fw_ids = [x["fw_id"] for x in self.fireworks.find(query, {"fw_id": 1})] + lostruns_query["fw_id"] = {"$in": fw_ids} + + bad_launch_data = self.launches.find(lostruns_query, {"launch_id": 1, "fw_id": 1}) + for ld in bad_launch_data: + bad_launch = True + if max_runtime or min_runtime: + bad_launch = False + m_l = self.get_launch_by_id(ld["launch_id"]) + utime = m_l._get_time("RUNNING", use_update_time=True) + ctime = m_l._get_time("RUNNING", use_update_time=False) + if (not max_runtime or (utime - ctime).seconds <= max_runtime) and ( + not min_runtime or (utime - ctime).seconds >= min_runtime + ): + bad_launch = True + if bad_launch: + lost_launch_ids.append(ld["launch_id"]) + potential_lost_fw_ids.append(ld["fw_id"]) + + for fw_id in potential_lost_fw_ids: # tricky: figure out what's actually lost + fw = self.fireworks.find_one({"fw_id": fw_id}, {"launches": 1, "state": 1}) or {} + # only RUNNING FireWorks can be "lost", i.e. 
not defused or archived + if fw.get("state") == "RUNNING": + l_ids = fw["launches"] + not_lost = [x for x in l_ids if x not in lost_launch_ids] + if len(not_lost) == 0: # all launches are lost - we are lost! + lost_fw_ids.append(fw_id) + else: + for l_id in not_lost: + l_state = self.launches.find_one({"launch_id": l_id}, {"state": 1})["state"] + if Firework.STATE_RANKS[l_state] > Firework.STATE_RANKS["FIZZLED"]: + break + else: + lost_fw_ids.append(fw_id) # all Launches not lost are anyway FIZZLED / ARCHIVED + + if fizzle or rerun: + for lid in lost_launch_ids: + self.mark_fizzled(lid) + + # for offline runs, you want to forget about the run + # see: https://groups.google.com/forum/#!topic/fireworkflows/oimFmE5tZ4E + offline_run = self.offline_runs.count_documents({"launch_id": lid, "deprecated": False}) > 0 + if offline_run: + self.forget_offline(lid, launch_mode=True) + + if rerun: + fw_id = self.launches.find_one({"launch_id": lid}, {"fw_id": 1})["fw_id"] + if fw_id in lost_fw_ids: + self.rerun_fw(fw_id) + + inconsistent_fw_ids = [] + inconsistent_query = query or {} + inconsistent_query["state"] = "RUNNING" + running_fws = self.fireworks.find(inconsistent_query, {"fw_id": 1, "launches": 1}) + for fw in running_fws: + if self.launches.find_one( + {"launch_id": {"$in": fw["launches"]}, "state": {"$in": ["FIZZLED", "COMPLETED"]}} + ): + inconsistent_fw_ids.append(fw["fw_id"]) + if refresh: + self._refresh_wf(fw["fw_id"]) + + return lost_launch_ids, lost_fw_ids, inconsistent_fw_ids + + def set_reservation_id(self, launch_id, reservation_id) -> None: + """ + Set reservation id to the launch corresponding to the given launch id. 
+ + Args: + launch_id (int) + reservation_id (int) + """ + m_launch = self.get_launch_by_id(launch_id) + m_launch.set_reservation_id(reservation_id) + self.launches.find_one_and_replace({"launch_id": launch_id}, m_launch.to_db_dict()) + + def checkout_fw(self, fworker, launch_dir, fw_id=None, host=None, ip=None, state="RUNNING"): + """ + Checkout the next ready firework, mark it with the given state(RESERVED or RUNNING) and + return it to the caller. The caller is responsible for running the Firework. + + Args: + fworker (FWorker): A FWorker instance + launch_dir (str): the dir the FW will be run in (for creating a Launch object) + fw_id (int): Firework id + host (str): the host making the request (for creating a Launch object) + ip (str): the ip making the request (for creating a Launch object) + state (str): RESERVED or RUNNING, the fetched firework's state will be set to this value. + + Returns: + (Firework, int): firework and the new launch id. + """ + m_fw = self._get_a_fw_to_run(fworker.query, fw_id=fw_id) + if not m_fw: + return None, None + + # If this Launch was previously reserved, overwrite that reservation with this Launch + # note that adding a new Launch is problematic from a duplicate run standpoint + prev_reservations = [launch for launch in m_fw.launches if launch.state == "RESERVED"] + reserved_launch = None if not prev_reservations else prev_reservations[0] + state_history = reserved_launch.state_history if reserved_launch else None + + # get new launch + launch_id = reserved_launch.launch_id if reserved_launch else self.get_new_launch_id() + trackers = [Tracker.from_dict(f) for f in m_fw.spec["_trackers"]] if "_trackers" in m_fw.spec else None + m_launch = Launch( + state, + launch_dir, + fworker, + host, + ip, + trackers=trackers, + state_history=state_history, + launch_id=launch_id, + fw_id=m_fw.fw_id, + ) + + # insert the launch + self.launches.find_one_and_replace({"launch_id": m_launch.launch_id}, m_launch.to_db_dict(), upsert=True) + + 
self.m_logger.debug(f"Created/updated Launch with {launch_id=}") + + # update the firework's launches + if not reserved_launch: + # we're appending a new Firework + m_fw.launches.append(m_launch) + else: + # we're updating an existing launch + m_fw.launches = [m_launch if launch.launch_id == m_launch.launch_id else launch for launch in m_fw.launches] + + # insert the firework and refresh the workflow + m_fw.state = state + self._upsert_fws([m_fw]) + self._refresh_wf(m_fw.fw_id) + + # update any duplicated runs + if state == "RUNNING": + for fw in self.fireworks.find( + {"launches": launch_id, "state": {"$in": ["WAITING", "READY", "RESERVED", "FIZZLED"]}}, {"fw_id": 1} + ): + fw_id = fw["fw_id"] + fw = self.get_fw_by_id(fw_id) + fw.state = state + self._upsert_fws([fw]) + self._refresh_wf(fw.fw_id) + + # Store backup copies of the initial data for retrieval in case of failure + self.backup_launch_data[m_launch.launch_id] = m_launch.to_db_dict() + self.backup_fw_data[fw_id] = m_fw.to_db_dict() + + self.m_logger.debug(f"{m_fw.state} FW with id: {m_fw.fw_id}") + + return m_fw, launch_id + + def change_launch_dir(self, launch_id, launch_dir) -> None: + """ + Change the launch directory corresponding to the given launch id. + + Args: + launch_id (int) + launch_dir (str): path to the new launch directory. 
+ """ + m_launch = self.get_launch_by_id(launch_id) + m_launch.launch_dir = launch_dir + self.launches.find_one_and_replace({"launch_id": m_launch.launch_id}, m_launch.to_db_dict(), upsert=True) + + def restore_backup_data(self, launch_id, fw_id) -> None: + """For the given launch id and firework id, restore the back up data.""" + if launch_id in self.backup_launch_data: + self.launches.find_one_and_replace({"launch_id": launch_id}, self.backup_launch_data[launch_id]) + if fw_id in self.backup_fw_data: + self.fireworks.find_one_and_replace({"fw_id": fw_id}, self.backup_fw_data[fw_id]) + + def complete_launch(self, launch_id, action=None, state="COMPLETED"): + """ + Internal method used to mark a Firework's Launch as completed. + + Args: + launch_id (int) + action (FWAction): the FWAction of what to do next + state (str): COMPLETED or FIZZLED + + Returns: + dict: updated launch + """ + # update the launch data to COMPLETED, set end time, etc + m_launch = self.get_launch_by_id(launch_id) + m_launch.state = state + if action: + m_launch.action = action + + try: + self.launches.find_one_and_replace({"launch_id": m_launch.launch_id}, m_launch.to_db_dict(), upsert=True) + except DocumentTooLarge as err: + launch_db_dict = m_launch.to_db_dict() + action_dict = launch_db_dict.get("action", None) + if not action_dict: + # in case the action is empty and it is not the source of + # the error, raise the exception again. + raise + if self.gridfs_fallback is None: + err.args = ( + err.args[0] + ". Set GRIDFS_FALLBACK_COLLECTION in FW_config.yaml" + " to a value different from None", + ) + raise err + + # encoding required for python2/3 compatibility. + action_id = self.gridfs_fallback.put( + json.dumps(action_dict), encoding="utf-8", metadata={"launch_id": launch_id} + ) + launch_db_dict["action"] = {"gridfs_id": str(action_id)} + self.m_logger.warning("The size of the launch document was too large. 
Saving the action in gridfs.") + + self.launches.find_one_and_replace({"launch_id": m_launch.launch_id}, launch_db_dict, upsert=True) + + # find all the fws that have this launch + for fw in self.fireworks.find({"launches": launch_id}, {"fw_id": 1}): + fw_id = fw["fw_id"] + self._refresh_wf(fw_id) + + # change return type to dict to make return type serializable to support job packing + return m_launch.to_dict() + + def ping_launch(self, launch_id, ptime=None, checkpoint=None) -> None: + """ + Ping that a Launch is still alive: updates the 'update_on 'field of the state history of a + Launch. + + Args: + launch_id (int) + ptime (datetime) + """ + m_launch = self.get_launch_by_id(launch_id) + for tracker in m_launch.trackers: + tracker.track_file(m_launch.launch_dir) + m_launch.touch_history(ptime, checkpoint=checkpoint) + self.launches.update_one( + {"launch_id": launch_id, "state": "RUNNING"}, + { + "$set": { + "state_history": m_launch.to_db_dict()["state_history"], + "trackers": [t.to_dict() for t in m_launch.trackers], + } + }, + ) + + def get_new_fw_id(self, quantity=1): + """ + Checkout the next Firework id. + + Args: + quantity (int): optionally ask for many ids, otherwise defaults to 1 + this then returns the *first* fw_id in that range + """ + try: + return self.fw_id_assigner.find_one_and_update({}, {"$inc": {"next_fw_id": quantity}})["next_fw_id"] + except Exception: + raise ValueError( + "Could not get next FW id! If you have not yet initialized the database," + " please do so by performing a database reset (e.g., lpad reset)" + ) + + def get_new_launch_id(self): + """Checkout the next Launch id.""" + try: + return self.fw_id_assigner.find_one_and_update({}, {"$inc": {"next_launch_id": 1}})["next_launch_id"] + except Exception: + raise ValueError( + "Could not get next launch id! 
If you have not yet initialized the " + "database, please do so by performing a database reset (e.g., lpad reset)" + ) + + def _upsert_fws(self, fws, reassign_all=False): + """ + Insert the fireworks to the 'fireworks' collection. + + Args: + fws ([Firework]): list of fireworks + reassign_all (bool): if True, reassign the firework ids. The ids are also reassigned + if the current firework ids are negative. + + Returns: + dict: mapping between old and new Firework ids + """ + old_new = {} + # sort the FWs by id, then the new FW_ids will match the order of the old ones... + fws.sort(key=lambda x: x.fw_id) + + if reassign_all: + used_ids = [] + # we can request multiple fw_ids up front + # this is the FIRST fw_id we should use + first_new_id = self.get_new_fw_id(quantity=len(fws)) + + for new_id, fw in enumerate(fws, start=first_new_id): + old_new[fw.fw_id] = new_id + fw.fw_id = new_id + used_ids.append(new_id) + # delete/add in bulk + self.fireworks.delete_many({"fw_id": {"$in": used_ids}}) + self.fireworks.insert_many(fw.to_db_dict() for fw in fws) + else: + for fw in fws: + if fw.fw_id < 0: + new_id = self.get_new_fw_id() + old_new[fw.fw_id] = new_id + fw.fw_id = new_id + + self.fireworks.find_one_and_replace({"fw_id": fw.fw_id}, fw.to_db_dict(), upsert=True) + + return old_new + + def rerun_fw(self, fw_id, rerun_duplicates=True, recover_launch=None, recover_mode=None): + """ + Rerun the firework corresponding to the given id. + + Args: + fw_id (int): firework id + rerun_duplicates (bool): flag for whether duplicates should be rerun + recover_launch ('last' or int): launch_id for last recovery, if set to + 'last' (default), recovery will find the last available launch. 
+ If it is an int, will recover that specific launch + recover_mode ('prev_dir' or 'cp'): flag to indicate whether to copy + or run recovery fw in previous directory + + Returns: + [int]: list of firework ids that were rerun + """ + m_fw = self.fireworks.find_one({"fw_id": fw_id}, {"state": 1}) + + if not m_fw: + raise ValueError(f"FW with id: {fw_id} not found!") + + # detect FWs that share the same launch. Must do this before rerun + duplicates = [] + reruns = [] + if rerun_duplicates: + fw = self.fireworks.find_one({"fw_id": fw_id, "spec._dupefinder": {"$exists": True}}, {"launches": 1}) + if fw: + duplicates = [ + fw_dct["fw_id"] + for fw_dct in self.fireworks.find( + {"launches": {"$in": fw["launches"]}, "fw_id": {"$ne": fw_id}}, {"fw_id": 1} + ) + ] + duplicates = list(set(duplicates)) + + # Launch recovery + if recover_launch is not None: + recovery = self.get_recovery(fw_id, recover_launch) + recovery.update(_mode=recover_mode) + set_spec = recursive_dict({"$set": {"spec._recovery": recovery}}) + if recover_mode == "prev_dir": + prev_dir = self.get_launch_by_id(recovery.get("_launch_id")).launch_dir + set_spec["$set"]["spec._launch_dir"] = prev_dir + self.fireworks.find_one_and_update({"fw_id": fw_id}, set_spec) + + # If no launch recovery specified, unset the firework recovery spec + else: + set_spec = {"$unset": {"spec._recovery": ""}} + self.fireworks.find_one_and_update({"fw_id": fw_id}, set_spec) + + # rerun this FW + if m_fw["state"] in ["ARCHIVED", "DEFUSED"]: + self.m_logger.info(f"Cannot rerun {fw_id=}: it is {m_fw['state']}.") + elif m_fw["state"] == "WAITING" and not recover_launch: + self.m_logger.debug(f"Skipping rerun {fw_id=}: it is already WAITING.") + else: + with WFLock(self, fw_id): + wf = self.get_wf_by_fw_id_lzyfw(fw_id) + updated_ids = wf.rerun_fw(fw_id) + self._update_wf(wf, updated_ids) + reruns.append(fw_id) + + # rerun duplicated FWs + for fw in duplicates: + self.m_logger.info(f"Also rerunning duplicate fw_id: {fw}") + # False for 
speed, True shouldn't be needed + r = self.rerun_fw(fw, rerun_duplicates=False, recover_launch=recover_launch, recover_mode=recover_mode) + reruns.extend(r) + + return reruns + + def get_recovery(self, fw_id, launch_id="last"): + """ + function to get recovery data for a given fw and launch + Args: + fw_id (int): fw id to get recovery data for + launch_id (int or 'last'): launch_id to get recovery data for, if 'last' + recovery data is generated from last launch. + """ + m_fw = self.get_fw_by_id(fw_id) + launch = m_fw.launches[-1] if launch_id == "last" else self.get_launch_by_id(launch_id) + recovery = launch.state_history[-1].get("checkpoint") + recovery.update(_prev_dir=launch.launch_dir, _launch_id=launch.launch_id) + return recovery + + def _refresh_wf(self, fw_id) -> None: + """ + Update the FW state of all jobs in workflow. + + Args: + fw_id (int): the parent fw_id - children will be refreshed + """ + # TODO: time how long it took to refresh the WF! + # TODO: need a try-except here, high probability of failure if incorrect action supplied + try: + with WFLock(self, fw_id): + wf = self.get_wf_by_fw_id_lzyfw(fw_id) + updated_ids = wf.refresh(fw_id) + self._update_wf(wf, updated_ids) + except LockedWorkflowError: + self.m_logger.info(f"{fw_id=} locked. Can't refresh!") + except Exception: + # some kind of internal error - an example is that fws serialization changed due to + # code updates and thus the Firework object can no longer be loaded from db description + # Action: *manually* mark the fw and workflow as FIZZLED + self.fireworks.find_one_and_update({"fw_id": fw_id}, {"$set": {"state": "FIZZLED"}}) + self.workflows.find_one_and_update({"nodes": fw_id}, {"$set": {"state": "FIZZLED"}}) + self.workflows.find_one_and_update({"nodes": fw_id}, {"$set": {f"fw_states.{fw_id}": "FIZZLED"}}) + import traceback + + err_message = f"Error refreshing workflow. 
The full stack trace is: {traceback.format_exc()}" + raise RuntimeError(err_message) + + def _update_wf(self, wf, updated_ids) -> None: + """ + Update the workflow with the updated firework ids. + Note: must be called within an enclosing WFLock. + + Args: + wf (Workflow) + updated_ids ([int]): list of firework ids + """ + updated_fws = [wf.id_fw[fid] for fid in updated_ids] + old_new = self._upsert_fws(updated_fws) + wf._reassign_ids(old_new) + + # find a node for which the id did not change, so we can query on it to get WF + query_node = None + for f in wf.id_fw: + if f not in old_new.values() or old_new.get(f, None) == f: + query_node = f + break + + assert query_node is not None + if not self.workflows.find_one({"nodes": query_node}): + raise ValueError(f"BAD QUERY_NODE! {query_node}") + # redo the links and fw_states + wf = wf.to_db_dict() + wf["locked"] = True # preserve the lock! + self.workflows.find_one_and_replace({"nodes": query_node}, wf) + + def _steal_launches(self, thief_fw): + """ + Check if there are duplicates. If there are duplicates, the matching firework's launches + are added to the launches of the given firework. 
+ + Returns: + bool: False if the given firework is unique + """ + stolen = False + if thief_fw.state in ["READY", "RESERVED"] and "_dupefinder" in thief_fw.spec: + m_dupefinder = thief_fw.spec["_dupefinder"] + # get the query that will limit the number of results to check as duplicates + m_query = m_dupefinder.query(thief_fw.to_dict()["spec"]) + self.m_logger.debug(f"Querying for duplicates, fw_id: {thief_fw.fw_id}") + # iterate through all potential duplicates in the DB + for potential_match in self.fireworks.find(m_query): + self.m_logger.debug(f"Verifying for duplicates, fw_ids: {thief_fw.fw_id}, {potential_match['fw_id']}") + + # see if verification is needed, as this slows the process + verified = False + try: + m_dupefinder.verify({}, {}) # is implemented test + + except NotImplementedError: + verified = True # no dupefinder.verify() implemented, skip verification + + except Exception: + # we want to catch any exceptions from testing an empty dict, which the dupefinder might not be + # designed for + pass + + if not verified: + # dupefinder.verify() is implemented, let's call verify() + spec1 = dict(thief_fw.to_dict()["spec"]) # defensive copy + spec2 = dict(potential_match["spec"]) # defensive copy + verified = m_dupefinder.verify(spec1, spec2) + + if verified: + # steal the launches + victim_fw = self.get_fw_by_id(potential_match["fw_id"]) + thief_launches = [launch.launch_id for launch in thief_fw.launches] + valuable_launches = [ + launch for launch in victim_fw.launches if launch.launch_id not in thief_launches + ] + for launch in valuable_launches: + thief_fw.launches.append(launch) + stolen = True + self.m_logger.info(f"Duplicate found! fwids {thief_fw.fw_id} and {potential_match['fw_id']}") + return stolen + + def set_priority(self, fw_id, priority) -> None: + """ + Set priority to the firework with the given id. 
+ + Args: + fw_id (int): firework id + priority + """ + self.fireworks.find_one_and_update({"fw_id": fw_id}, {"$set": {"spec._priority": priority}}) + + def get_logdir(self): + """ + Return the log directory. + + AJ: This is needed for job packing due to Proxy objects not being fully featured... + """ + return self.logdir + + def add_offline_run(self, launch_id, fw_id, name) -> None: + """ + Add the launch and firework to the offline_run collection. + + Args: + launch_id (int): launch id. + fw_id (id): firework id + name (str) + """ + d = {"fw_id": fw_id} + d["launch_id"] = launch_id + d["name"] = name + d["created_on"] = datetime.datetime.utcnow().isoformat() + d["updated_on"] = datetime.datetime.utcnow().isoformat() + d["deprecated"] = False + d["completed"] = False + self.offline_runs.insert_one(d) + + def recover_offline(self, launch_id, ignore_errors=False, print_errors=False): + """ + Update the launch state using the offline data in FW_offline.json file. + + Args: + launch_id (int): launch id. 
+ ignore_errors (bool) + print_errors (bool) + + Returns: + firework id if the recovering fails otherwise None + """ + # get the launch directory + m_launch = self.get_launch_by_id(launch_id) + try: + self.m_logger.debug(f"RECOVERING fw_id: {m_launch.fw_id}") + + offline_loc = zpath(os.path.join(m_launch.launch_dir, "FW_offline.json")) + + offline_data = loadfn(offline_loc) + + if "started_on" in offline_data: # started running at some point + already_running = False + for s in m_launch.state_history: + if s["state"] == "RUNNING": + s["created_on"] = reconstitute_dates(offline_data["started_on"]) + already_running = True + + if not already_running: + m_launch.state = "RUNNING" # this should also add a history item + + checkpoint = offline_data.get("checkpoint", None) + + # look for ping file - update the Firework if this is the case + ping_loc = os.path.join(m_launch.launch_dir, "FW_ping.json") + if os.path.exists(ping_loc): + ping_dict = loadfn(ping_loc) + self.ping_launch(launch_id, ptime=ping_dict["ping_time"], checkpoint=checkpoint) + else: + warnings.warn( + f"Unable to find FW_ping.json in {m_launch.launch_dir}! State history updated_on might be " + "incorrect, trackers may not update." 
+ ) + m_launch.touch_history(checkpoint=checkpoint) + + if "fwaction" in offline_data: + fwaction = FWAction.from_dict(offline_data["fwaction"]) + m_launch.state = offline_data["state"] + self.launches.find_one_and_replace( + {"launch_id": m_launch.launch_id}, m_launch.to_db_dict(), upsert=True + ) + + m_launch = Launch.from_dict(self.complete_launch(launch_id, fwaction, m_launch.state)) + + for s in m_launch.state_history: + if s["state"] == offline_data["state"]: + s["created_on"] = reconstitute_dates(offline_data["completed_on"]) + self.launches.find_one_and_update( + {"launch_id": m_launch.launch_id}, {"$set": {"state_history": m_launch.state_history}} + ) + + self.offline_runs.update_one({"launch_id": launch_id}, {"$set": {"completed": True}}) + + else: + launch = self.launches.find_one_and_replace( + {"launch_id": m_launch.launch_id}, m_launch.to_db_dict(), upsert=True + ) + fw_id = launch["fw_id"] + f = self.fireworks.find_one_and_update( + {"fw_id": fw_id}, {"$set": {"state": "RUNNING", "updated_on": datetime.datetime.utcnow()}} + ) + if f: + self._refresh_wf(fw_id) + + # update the updated_on + self.offline_runs.update_one( + {"launch_id": launch_id}, {"$set": {"updated_on": datetime.datetime.utcnow().isoformat()}} + ) + return None + + except Exception: + if print_errors: + self.m_logger.error(f"failed recovering {launch_id=}.\n{traceback.format_exc()}") + if not ignore_errors: + traceback.print_exc() + m_action = FWAction( + stored_data={ + "_message": "runtime error during task", + "_task": None, + "_exception": {"_stacktrace": traceback.format_exc(), "_details": None}, + }, + exit=True, + ) + self.complete_launch(launch_id, m_action, "FIZZLED") + self.offline_runs.update_one({"launch_id": launch_id}, {"$set": {"completed": True}}) + return m_launch.fw_id + + def forget_offline(self, launchid_or_fwid, launch_mode=True) -> None: + """ + Unmark the offline run for the given launch or firework id. 
def get_launchdir(self, fw_id, launch_idx=-1):
    """
    Return the launch directory of one launch of a firework.

    Args:
        fw_id (int): firework id to look up.
        launch_idx (int): index of the launch to use; -1 (default) selects
            the most recent launch.

    Returns:
        str or None: the launch directory, or None if the FW has no launches.
    """
    firework = self.get_fw_by_id(fw_id)
    if not firework.launches:
        return None
    return firework.launches[launch_idx].launch_dir
+ """ + # This is the only attribute known w/o a DB query + self.fw_id = fw_id + self._fwc, self._lc, self._ffs = fw_coll, launch_coll, fallback_fs + self._launches = dict.fromkeys(self.db_launch_fields, False) + self._fw, self._lids, self._state = None, None, None + + # FireWork methods + + # Treat state as special case as it is always required when accessing a Firework lazily + # If the partial fw is not available the state is fetched independently + @property + def state(self): + if self._fw is not None: + self._state = self._fw.state + elif self._state is None: + self._state = self._fwc.find_one({"fw_id": self.fw_id}, projection=["state"])["state"] + return self._state + + @state.setter + def state(self, state) -> None: + self.partial_fw._state = state + self.partial_fw.updated_on = datetime.datetime.utcnow() + + def to_dict(self): + return self.full_fw.to_dict() + + def _rerun(self) -> None: + self.full_fw._rerun() + + def to_db_dict(self): + return self.full_fw.to_db_dict() + + def __str__(self) -> str: + return f"LazyFireWork object: (id: {self.fw_id})" + + # Properties that shadow FireWork attributes + + @property + def tasks(self): + return self.partial_fw.tasks + + @tasks.setter + def tasks(self, value) -> None: + self.partial_fw.tasks = value + + @property + def spec(self): + return self.partial_fw.spec + + @spec.setter + def spec(self, value) -> None: + self.partial_fw.spec = value + + @property + def name(self): + return self.partial_fw.name + + @name.setter + def name(self, value) -> None: + self.partial_fw.name = value + + @property + def created_on(self): + return self.partial_fw.created_on + + @created_on.setter + def created_on(self, value) -> None: + self.partial_fw.created_on = value + + @property + def updated_on(self): + return self.partial_fw.updated_on + + @updated_on.setter + def updated_on(self, value) -> None: + self.partial_fw.updated_on = value + + @property + def parents(self): + if self._fw is not None: + return 
def _get_launch_data(self, name):
    """
    Fetch and cache one launch-related field on the underlying Firework.

    Args:
        name (str): name of the field, e.g. 'archived_launches'.

    Returns:
        The list of Launch objects (also propagated to self._fw).
    """
    firework = self.partial_fw  # assure stage 1 (partial FW loaded)
    if not self._launches[name]:
        ids = self._lids[name]
        launches = []
        if ids:
            for doc in self._lc.find({"launch_id": {"$in": ids}}):
                # large actions live in GridFS; resolve them before hydrating
                doc["action"] = get_action_from_gridfs(doc.get("action"), self._ffs)
                launches.append(Launch.from_dict(doc))
        setattr(firework, name, launches)  # put into real FireWork obj
        self._launches[name] = True
    return getattr(firework, name)
def get_fw_ids_from_reservation_id(self, reservation_id):
    """
    Given the reservation id, return the list of firework ids.

    Args:
        reservation_id (int)

    Returns:
        [int]: list of firework ids.
    """
    # FindMyFW.get_fwid may return a single id rather than a list — TODO
    # confirm against sustodian's API. Callers (e.g. the `--qid` handling in
    # lpad_run, which builds an {"$in": ids} Mongo query) require a list, as
    # the pre-patch implementation returned; normalize here so the public
    # contract is unchanged.
    fw_ids = FindMyFW.get_fwid(reservation_id)
    if isinstance(fw_ids, int):
        return [fw_ids]
    return list(fw_ids)
def pw_check(ids: list[int], args: Namespace, skip_pw: bool = False) -> list[int]:
    """Password-protect bulk modifications.

    If more than PW_CHECK_NUM entries would be modified, require today's date
    as a password, either via --password or interactive confirmation.

    Args:
        ids: entry ids about to be modified.
        args: CLI namespace; args.password is read and may be set here.
        skip_pw: if True, bypass the check entirely.

    Returns:
        list[int]: the ids, unchanged, if the check passes.

    Raises:
        ValueError: if the user declines or supplies a wrong password.
    """
    if len(ids) > PW_CHECK_NUM and not skip_pw:
        m_password = datetime.datetime.now().strftime("%Y-%m-%d")
        if not args.password:
            answer = input(f"Are you sure? This will modify {len(ids)} entries. (Y/N)")
            # strip/startswith guards against empty input, which raised
            # IndexError on answer[0] in the original implementation
            if answer.strip().upper().startswith("Y"):
                # reuse m_password rather than calling datetime.now() a second
                # time (a second call could differ across a midnight boundary)
                args.password = m_password
            else:
                print("Operation aborted by user.")
        if args.password != m_password:
            raise ValueError(
                f"Modifying more than {PW_CHECK_NUM} entries requires setting the --password parameter! "
                "(Today's date, e.g. 2012-02-25)"
            )
    return ids
def get_lp(args: Namespace) -> LaunchPad:
    """Construct a LaunchPad from CLI args and verify the DB connection.

    Args:
        args: parsed CLI namespace (reads launchpad_file, logdir, loglvl, silencer).

    Returns:
        LaunchPad: a connected LaunchPad instance.

    Raises:
        ValueError: if the LaunchPad could not be created or MongoDB is unreachable.
    """
    lp = None  # lets the except block detect "construction itself failed"
    try:
        if args.launchpad_file:
            lp = LaunchPad.from_file(args.launchpad_file)
        else:
            args.loglvl = "CRITICAL" if args.silencer else args.loglvl
            # no lpad file means we try connect to localhost which is fast so use small timeout
            # (default 30s) for quick response to user if no DB is running
            mongo_kwds = {"serverSelectionTimeoutMS": 500}
            lp = LaunchPad(logdir=args.logdir, strm_lvl=args.loglvl, mongoclient_kwargs=mongo_kwds)

        # make sure we can connect to DB, raises pymongo.errors.ServerSelectionTimeoutError if not
        lp.connection.admin.command("ping")
        return lp

    except Exception:
        if lp is not None:
            err_message = (
                f"FireWorks was not able to connect to MongoDB at {lp.host}:{lp.port}. Is the server running? "
                f"The database file specified was {args.launchpad_file}."
            )
        else:
            # BUGFIX: the original referenced lp.host here even when
            # LaunchPad.from_file raised, producing an UnboundLocalError
            # that masked the real failure.
            err_message = (
                f"FireWorks was not able to create a LaunchPad from {args.launchpad_file}. "
                "Is the file readable and a valid LaunchPad specification?"
            )
        if not args.launchpad_file:
            err_message += (
                ' Type "lpad init" if you would like to set up a file that specifies '
                "location and credentials of your Mongo database (otherwise use default "
                "localhost configuration)."
            )
        # use from None to hide the pymongo ServerSelectionTimeoutError that otherwise clutters up the stack trace
        raise ValueError(err_message) from None
def add_wf_dir(args: Namespace) -> None:
    """Add every workflow file found in args.wf_dir to the LaunchPad."""
    lp = get_lp(args)
    for filename in os.listdir(args.wf_dir):
        # BUGFIX: os.listdir returns bare names; join with the directory so
        # this works regardless of the current working directory (the original
        # passed the bare name to Workflow.from_file, which only succeeded
        # when cwd happened to be wf_dir).
        fwf = Workflow.from_file(os.path.join(args.wf_dir, filename))
        lp.add_wf(fwf)
def get_fw_ids_helper(lp: LaunchPad, args: Namespace, count_only: bool | None = None) -> list[int] | int:
    """Build fws query from command line options and submit.

    Parameters:
        lp (fireworks.core.firework.Launchpad)
        args (argparse.Namespace)
        count_only (bool): if None, then looked up in args.

    Returns:
        list[int] | int: resulting fw_ids or count of fws in query.
    """
    # exactly one selector may be given; default to the match-all query
    n_selectors = sum(bool(x) for x in [args.fw_id, args.name, args.state, args.query])
    if n_selectors > 1:
        raise ValueError("Please specify exactly one of (fw_id, name, state, query)")
    if n_selectors == 0:
        args.query = "{}"
        args.display_format = args.display_format or "ids"
    if sum(bool(x) for x in [args.fw_id, args.name, args.qid]) > 1:
        raise ValueError("Please specify exactly one of (fw_id, name, qid)")
    args.display_format = args.display_format or "more"

    if args.fw_id:
        query = {"fw_id": {"$in": args.fw_id}}
    elif args.name and not args.launches_mode:
        query = {"name": args.name}
    elif args.state:
        query = {"state": args.state}
    elif args.query:
        query = ast.literal_eval(args.query)
    else:
        query = None

    if args.sort:
        sort = [(args.sort, ASCENDING)]
    elif args.rsort:
        sort = [(args.rsort, DESCENDING)]
    else:
        sort = None

    if count_only is None:
        count_only = args.display_format == "count"

    if args.qid:
        ids = lp.get_fw_ids_from_reservation_id(args.qid)
        if query:
            query["fw_id"] = {"$in": ids}
            # BUGFIX: propagate count_only here too; the original dropped it,
            # so `--qid` combined with the count display format returned ids
            # instead of a count.
            ids = lp.get_fw_ids(query, sort, args.max, count_only=count_only, launches_mode=args.launches_mode)
    else:
        ids = lp.get_fw_ids(query, sort, args.max, count_only=count_only, launches_mode=args.launches_mode)
    return ids
args.display_format or "ids" + if sum(bool(x) for x in [args.fw_fw_id, args.fw_name, args.qid]) > 1: + raise ValueError("Please specify exactly one of (fw_id, name, qid)") + args.display_format = args.display_format or "more" + + if args.fw_fw_id: + fw_query = {"fw_id": {"$in": args.fw_fw_id}} + elif args.fw_name and not args.launches_mode: + fw_query = {"name": args.fw_name} + elif args.fw_state: + fw_query = {"state": args.fw_state} + elif args.fw_query: + fw_query = ast.literal_eval(args.fw_query) + else: + fw_query = None + + if args.sort: + sort = [(args.sort, ASCENDING)] + elif args.rsort: + sort = [(args.rsort, DESCENDING)] + else: + sort = None + + if args.qid: + ids = lp.get_fw_ids_from_reservation_id(args.qid) + if fw_query: + fw_query["fw_id"] = {"$in": ids} + ids = lp.get_fw_ids_in_wfs( + wf_query=wf_query, fw_query=fw_query, sort=sort, limit=args.max, launches_mode=args.launches_mode + ) + else: + ids = lp.get_fw_ids_in_wfs( + wf_query=wf_query, + fw_query=fw_query, + sort=sort, + limit=args.max, + count_only=args.display_format == "count", + launches_mode=args.launches_mode, + ) + + print_fws(ids, lp, args) + + +def update_fws(args: Namespace) -> None: + lp = get_lp(args) + fw_ids = parse_helper(lp, args) + lp.update_spec(fw_ids, json.loads(args.update), args.mongo) + + +def get_wfs(args: Namespace) -> None: + lp = get_lp(args) + if sum(bool(x) for x in [args.fw_id, args.name, args.state, args.query]) > 1: + raise ValueError("Please specify exactly one of (fw_id, name, state, query)") + if sum(bool(x) for x in [args.fw_id, args.name, args.state, args.query]) == 0: + args.query = "{}" + args.display_format = args.display_format or "ids" + else: + args.display_format = args.display_format or "more" + + if args.fw_id: + query = {"nodes": {"$in": args.fw_id}} + elif args.name: + query = {"name": args.name} + elif args.state: + query = {"state": args.state} + else: + query = ast.literal_eval(args.query) + + if args.sort: + sort = [(args.sort, ASCENDING)] + 
def get_children(links, start, max_depth):
    """Return a nested dict describing the descendant tree rooted at `start`.

    Args:
        links (dict): mapping of node id -> list of child node ids.
        start: node id to root the tree at.
        max_depth: accepted but NOT used to bound recursion — preserved from
            the original behavior; presumably intended as a depth limit (TODO).

    Returns:
        dict: {start: [subtree, ...]} or {start: []} if childless; {} if
        `start` is not a key of `links`.
    """
    tree = {}
    if start in links:
        children = links[start]
        if len(children) > 0:
            tree[start] = [get_children(links, node, max_depth) for node in children]
        else:
            tree[start] = children
    return tree
args.display_format != "none": + print_fws(fi, lp, args) + if len(ff) > 0 and not args.fizzle and not args.rerun: + print("You can fix lost FWs using the --rerun or --fizzle arguments to the detect_lostruns command") + if len(fi) > 0 and not args.refresh: + print("You can fix inconsistent FWs using the --refresh argument to the detect_lostruns command") + + +def detect_unreserved(args: Namespace) -> None: + lp = get_lp(args) + if args.display_format is not None and args.display_format != "none": + unreserved = lp.detect_unreserved(expiration_secs=args.time, rerun=False) + # very inefficient, replace by mongo aggregation + fw_ids = [] + for launch_id in unreserved: + launch = lp.get_launch_by_id(launch_id) + fw_ids.append(launch.fw_id) + print_fws(fw_ids, lp, args) + print(lp.detect_unreserved(expiration_secs=args.time, rerun=args.rerun)) + + +def tuneup(args: Namespace) -> None: + lp = get_lp(args) + lp.tuneup(bkground=not args.full) + + +def defuse_wfs(args: Namespace) -> None: + lp = get_lp(args) + fw_ids = parse_helper(lp, args, wf_mode=True) + for f in fw_ids: + lp.defuse_wf(f, defuse_all_states=args.defuse_all_states) + lp.m_logger.debug(f"Processed fw_id: {f}") + lp.m_logger.info(f"Finished defusing {len(fw_ids)} FWs.") + if not args.defuse_all_states: + lp.m_logger.info( + "Note: FIZZLED and COMPLETED FWs were not defused. " + "Use the --defuse_all_states option to force this (or rerun FIZZLED FWs first)." 
+ ) + + +def pause_wfs(args: Namespace) -> None: + lp = get_lp(args) + fw_ids = parse_helper(lp, args, wf_mode=True) + for f in fw_ids: + lp.pause_wf(f) + lp.m_logger.debug(f"Processed fw_id: {f}") + lp.m_logger.info(f"Finished defusing {len(fw_ids)} FWs.") + + +def archive(args: Namespace) -> None: + lp = get_lp(args) + fw_ids = parse_helper(lp, args, wf_mode=True) + for f in fw_ids: + lp.archive_wf(f) + lp.m_logger.debug(f"Processed fw_id: {f}") + lp.m_logger.info(f"Finished archiving {len(fw_ids)} WFs") + + +def reignite_wfs(args: Namespace) -> None: + lp = get_lp(args) + fw_ids = parse_helper(lp, args, wf_mode=True) + for f in fw_ids: + lp.reignite_wf(f) + lp.m_logger.debug(f"Processed Workflow with fw_id: {f}") + lp.m_logger.info(f"Finished reigniting {len(fw_ids)} Workflows") + + +def defuse_fws(args: Namespace) -> None: + lp = get_lp(args) + fw_ids = parse_helper(lp, args) + for f in fw_ids: + lp.defuse_fw(f) + lp.m_logger.debug(f"Processed fw_id: {f}") + lp.m_logger.info(f"Finished defusing {len(fw_ids)} FWs") + + +def pause_fws(args: Namespace) -> None: + lp = get_lp(args) + fw_ids = parse_helper(lp, args) + for f in fw_ids: + lp.pause_fw(f) + lp.m_logger.debug(f"Processed fw_id: {f}") + lp.m_logger.info(f"Finished pausing {len(fw_ids)} FWs") + + +def reignite_fws(args: Namespace) -> None: + lp = get_lp(args) + fw_ids = parse_helper(lp, args) + for f in fw_ids: + lp.reignite_fw(f) + lp.m_logger.debug(f"Processed fw_id: {f}") + lp.m_logger.info(f"Finished reigniting {len(fw_ids)} FWs") + + +def resume_fws(args: Namespace) -> None: + lp = get_lp(args) + fw_ids = parse_helper(lp, args) + for f in fw_ids: + lp.resume_fw(f) + lp.m_logger.debug(f"Processed fw_id: {f}") + lp.m_logger.info(f"Finished resuming {len(fw_ids)} FWs") + + +def rerun_fws(args: Namespace) -> None: + lp = get_lp(args) + fw_ids = parse_helper(lp, args) + if args.task_level: + launch_ids = args.launch_id + if launch_ids is None: + launch_ids = ["last"] * len(fw_ids) + elif len(launch_ids) != 
len(fw_ids): + raise ValueError("Specify the same number of tasks and launches") + else: + launch_ids = [None] * len(fw_ids) + for fw_id, l_id in zip(fw_ids, launch_ids): + lp.rerun_fw(int(fw_id), recover_launch=l_id, recover_mode=args.recover_mode) + lp.m_logger.debug(f"Processed {fw_id=}") + lp.m_logger.info(f"Finished setting {len(fw_ids)} FWs to rerun") + + +def refresh(args: Namespace) -> None: + lp = get_lp(args) + fw_ids = parse_helper(lp, args, wf_mode=True) + for f in fw_ids: + wf = lp.get_wf_by_fw_id_lzyfw(f) + for fw_id in wf.root_fw_ids: + lp._refresh_wf(fw_id) + lp.m_logger.debug(f"Processed Workflow with fw_id: {f}") + lp.m_logger.info(f"Finished refreshing {len(fw_ids)} Workflows") + + +def unlock(args: Namespace) -> None: + lp = get_lp(args) + fw_ids = parse_helper(lp, args, wf_mode=True) + for fw_id in fw_ids: + with WFLock(lp, fw_id, expire_secs=0, kill=True): + lp.m_logger.warning(f"FORCIBLY RELEASING LOCK DUE TO USER COMMAND, WF: {fw_id}") + lp.m_logger.debug(f"Processed Workflow with {fw_id=}") + lp.m_logger.info(f"Finished unlocking {len(fw_ids)} Workflows") + + +def get_qid(args: Namespace) -> None: + lp = get_lp(args) + for f in args.fw_id: + print(lp.get_reservation_id_from_fw_id(f)) + + +def cancel_qid(args: Namespace) -> None: + lp = get_lp(args) + lp.m_logger.warning( + "WARNING: cancel_qid does not actually remove jobs from the queue " + "(e.g., execute qdel), this must be done manually!" 
+ ) + lp.cancel_reservation_by_reservation_id(args.qid) + + +def set_priority(args: Namespace) -> None: + wf_mode = args.wf + lp = get_lp(args) + fw_ids = parse_helper(lp, args, wf_mode=wf_mode) + if wf_mode: + all_fw_ids = set() + for fw_id in fw_ids: + wf = lp.get_wf_by_fw_id_lzyfw(fw_id) + all_fw_ids.update(wf.id_fw) + fw_ids = list(all_fw_ids) + for f in fw_ids: + lp.set_priority(f, args.priority) + lp.m_logger.debug(f"Processed fw_id {f}") + lp.m_logger.info(f"Finished setting priorities of {len(fw_ids)} FWs") + + +def _open_webbrowser(url) -> None: + """Open a web browser after a delay to give the web server more startup time.""" + import webbrowser + + time.sleep(2) + webbrowser.open(url) + + +def webgui(args: Namespace) -> None: + from fireworks.flask_site.app import app + + app.lp = get_lp(args) + + if any([args.webgui_username, args.webgui_password]) and not all([args.webgui_username, args.webgui_password]): + raise ValueError("Must set BOTH a webgui_username and webgui_password!") + + app.config["WEBGUI_USERNAME"] = args.webgui_username + app.config["WEBGUI_PASSWORD"] = args.webgui_password + + if args.wflowquery: + app.BASE_Q_WF = json.loads(args.wflowquery) + if args.fwquery: + app.BASE_Q = json.loads(args.fwquery) + if "state" in app.BASE_Q: + app.BASE_Q_WF["state"] = app.BASE_Q["state"] + + if not args.server_mode: + from threading import Thread + + url = f"http://{args.host}:{args.port}" + p1 = Thread(target=_open_webbrowser, args=(url,)) + p1.start() + app.run(host=args.host, port=args.port, debug=args.debug) + p1.join() + else: + try: + from fireworks.flask_site.gunicorn import StandaloneApplication + except ImportError: + import sys + + sys.exit("Gunicorn is required for server mode. 
def recover_offline(args: Namespace) -> None:
    """Recover all pending offline runs and report successes/failures.

    Args:
        args: CLI namespace (reads fworker_file, ignore_errors, print_errors).
    """
    lp = get_lp(args)
    fworker_name = FWorker.from_file(args.fworker_file).name if args.fworker_file else None
    failed_fws = []
    recovered_fws = []

    for launch in lp.offline_runs.find({"completed": False, "deprecated": False}, {"launch_id": 1, "fw_id": 1}):
        # when a fworker is specified, only recover launches owned by it.
        # BUGFIX: Collection.count() was removed in pymongo 4.x; use
        # count_documents() instead.
        if (
            fworker_name
            and lp.launches.count_documents({"launch_id": launch["launch_id"], "fworker.name": fworker_name}) == 0
        ):
            continue
        fw = lp.recover_offline(launch["launch_id"], args.ignore_errors, args.print_errors)
        if fw:  # recover_offline returns the fw_id only on failure
            failed_fws.append(launch["fw_id"])
        else:
            recovered_fws.append(launch["fw_id"])

    lp.m_logger.info(f"FINISHED recovering offline runs. {len(recovered_fws)} job(s) recovered: {recovered_fws}")
    if failed_fws:
        lp.m_logger.info(f"FAILED to recover offline fw_ids: {failed_fws}")
1})["name"] + output.insert(0, f"# FW id: {fw_id}, FW {name=}") + if first_print: + first_print = False + else: + output.insert(0, ">------<") + print("\n".join(output)) + + +def maintain(args: Namespace) -> None: + lp = get_lp(args) + lp.maintain(args.infinite, args.maintain_interval) + + +def orphaned(args: Namespace) -> None: + # get_fws + lp = get_lp(args) + fw_ids = get_fw_ids_helper(lp, args, count_only=False) + + # get_wfs + orphaned_fw_ids = [] + for fw_id in fw_ids: + query = {"nodes": fw_id} + wf_ids = lp.get_wf_ids(query) + if len(wf_ids) == 0: + orphaned_fw_ids.append(fw_id) + + fws = get_fws_helper(lp, orphaned_fw_ids, args) + if args.remove: + lp.m_logger.info(f"Found {len(orphaned_fw_ids)} orphaned fw_ids: {orphaned_fw_ids}") + lp.delete_fws(orphaned_fw_ids, delete_launch_dirs=args.delete_launch_dirs) + else: + get_output(args, fws) + + +def get_output(args: Namespace, objs: list[Any]) -> None: + """Prints output on stdout""" + if args.output == "json": + json.dump(objs, sys.stdout, default=DATETIME_HANDLER, indent=4) + else: + yaml = YAML(typ="safe", pure=True) + yaml.default_flow_style = False + yaml.dump(recursive_dict(objs, preserve_unicode=False), sys.stdout) + print() + + +def arg_positive_int(value: str) -> int: + try: + ivalue = int(value) + except ValueError: + raise ArgumentTypeError(f"int(value) conversion failed for {value}") + + if ivalue < 1: + raise ValueError(f"{value} is not a positive integer") + return ivalue + + +def lpad(argv: Sequence[str] | None = None) -> int: + m_description = ( + "A command line interface to FireWorks. For more help on a specific command, type 'lpad -h'." 
+ ) + + parser = ArgumentParser("lpad", description=m_description) + + fw_version = metadata.version("fireworks") + v_out = f"%(prog)s v{fw_version} located in {FW_INSTALL_DIR}" + parser.add_argument("-v", "--version", action="version", version=v_out) + + parent_parser = ArgumentParser(add_help=False) + parser.add_argument( + "-o", + "--output", + choices=["json", "yaml"], + default="json", + type=lambda s: s.lower(), + help="Set output display format to either json or YAML. " + "YAML is easier to read for long documents. JSON is the default.", + ) + + subparsers = parser.add_subparsers(help="command", dest="command") + + # This makes common argument options easier to maintain. E.g., what if + # there is a new state or disp option? + # NOTE: Those sets of standard options are not used consistently below (jotelha) + fw_id_args = ["-i", "--fw_id"] + fw_id_kwargs = {"type": str, "help": "fw_id"} + + state_args = ["-s", "--state"] + state_kwargs = { + "type": lambda s: s.upper(), + "help": "Select by state.", + "choices": Firework.STATE_RANKS, + } + disp_args = ["-d", "--display_format"] + disp_kwargs = { + "type": lambda s: s.lower(), + "help": "Display format.", + "default": "less", + "choices": ["all", "more", "less", "ids", "count", "reservations"], + } + + # enhanced display options allow for value 'none' or None (default) for no output + enh_disp_args = copy.deepcopy(disp_args) + enh_disp_kwargs = copy.deepcopy(disp_kwargs) + enh_disp_kwargs["choices"].append("none") + enh_disp_kwargs["default"] = None + + query_args = ["-q", "--query"] + query_kwargs = {"help": 'Query (enclose pymongo-style dict in single-quotes, e.g. \'{"state":"COMPLETED"}\')'} + + launches_mode_args = ["-lm", "--launches_mode"] + launches_mode_kwargs = { + "action": "store_true", + "help": "Query the launches collection (enclose pymongo-style " + "dict in single-quotes, e.g. 
'{\"launch_id\": 1}')", + } + + qid_args = ["--qid"] + qid_kwargs = {"help": "Query by reservation id of job in queue"} + + # for using fw- and wf-specific options on one command line, distinguish by prefix fw and wf + # prefix short one-dash options with 'wf', i.e. '-i' -> '-wfi' + # prefix long two-dash options with 'wf_', i.e. '--fw_id' -> '--wf_fw_id' + wf_prefixed_fw_id_args = [re.sub("^-([^-].*)$", "-wf\\1", s) for s in fw_id_args] + wf_prefixed_fw_id_args = [re.sub("^--(.*)$", "--wf_\\1", s) for s in wf_prefixed_fw_id_args] + + wf_prefixed_state_args = [re.sub("^-([^-].*)$", "-wf\\1", s) for s in state_args] + wf_prefixed_state_args = [re.sub("^--(.*)$", "--wf_\\1", s) for s in wf_prefixed_state_args] + + wf_prefixed_query_args = [re.sub("^-([^-].*)$", "-wf\\1", s) for s in query_args] + wf_prefixed_query_args = [re.sub("^--(.*)$", "--wf_\\1", s) for s in wf_prefixed_query_args] + + # prefix short one-dash options with 'fw', i.e. '-i' -> '-fwi' + # prefix long two-dash options with 'fw_', i.e. '--fw_id' -> '--fw_fw_id' + fw_prefixed_fw_id_args = [re.sub("^-([^-].*)$", "-fw\\1", s) for s in fw_id_args] + fw_prefixed_fw_id_args = [re.sub("^--(.*)$", "--fw_\\1", s) for s in fw_prefixed_fw_id_args] + + fw_prefixed_state_args = [re.sub("^-([^-].*)$", "-fw\\1", s) for s in state_args] + fw_prefixed_state_args = [re.sub("^--(.*)$", "--fw_\\1", s) for s in fw_prefixed_state_args] + + fw_prefixed_query_args = [re.sub("^-([^-].*)$", "-fw\\1", s) for s in query_args] + fw_prefixed_query_args = [re.sub("^--(.*)$", "--fw_\\1", s) for s in fw_prefixed_query_args] + + # filter all long options, i.e. 
'--fw_id' and strip off preceding '--' + fw_id_options = [ + re.sub("^--(.*)$", "\\1", opt) + for opt in [*fw_id_args, *wf_prefixed_fw_id_args, *fw_prefixed_fw_id_args] + if re.match("^--.*$", opt) + ] + + init_parser = subparsers.add_parser("init", help="Initialize a Fireworks launchpad YAML file.") + init_parser.add_argument( + "-u", + "--uri_mode", + action="store_true", + help="Connect via a URI, see: https://docs.mongodb.com/manual/reference/connection-string/", + ) + init_parser.add_argument("--config-file", default=DEFAULT_LPAD_YAML, type=str, help="Filename to write to.") + init_parser.set_defaults(func=init_yaml) + + reset_parser = subparsers.add_parser("reset", help="reset and re-initialize the FireWorks database") + reset_parser.add_argument( + "--password", + help="Today's date, e.g. 2012-02-25. " + "Password or positive response to input prompt " + "required to protect against accidental reset.", + ) + reset_parser.set_defaults(func=reset) + + addwf_parser = subparsers.add_parser("add", help="insert a Workflow from file") + addwf_parser.add_argument( + "-d", "--dir", action="store_true", help="Directory mode. Finds all files in the paths given by wf_file." + ) + addwf_parser.add_argument("wf_file", nargs="+", help="Path to a Firework or Workflow file") + addwf_parser.add_argument( + "-c", "--check", help="check the workflow before adding", dest="check", action="store_true" + ) + addwf_parser.set_defaults(func=add_wf, check=False) + + check_wf_parser = subparsers.add_parser("check_wflow", help="check a workflow from launchpad") + check_wf_parser.add_argument("-i", "--fw_id", type=int, help="the id of a firework from the workflow") + check_wf_parser.set_defaults(func=check_wf) + + get_launchdir_parser = subparsers.add_parser( + "get_launchdir", + help="get the directory of the most recent launch of the given fw_id. 
A common usage is " + "'cd `get_launchdir `' to change the working directory to that of the FW launch", + ) + get_launchdir_parser.add_argument("fw_id", type=int, help="fw_id to chdir to") + get_launchdir_parser.add_argument( + "--launch_idx", + type=int, + help="the index of the launch to get (default of -1 is most recent launch)", + default=-1, + ) + get_launchdir_parser.set_defaults(func=get_launchdir) + + append_wf_parser = subparsers.add_parser( + "append_wflow", help="append a workflow from file to a workflow on launchpad" + ) + append_wf_parser.add_argument(*fw_id_args, type=fw_id_kwargs["type"], help="parent firework ids") + append_wf_parser.add_argument("-f", "--wf_file", help="path to a firework or workflow file") + append_wf_parser.add_argument( + "-d", "--detour", help="append workflow as a detour", dest="detour", action="store_true" + ) + append_wf_parser.add_argument( + "--no_pull_spec_mods", help="do not to pull spec mods from parent", dest="pull_spec_mods", action="store_false" + ) + append_wf_parser.set_defaults(func=append_wf, detour=False, pull_spec_mods=True) + + dump_wf_parser = subparsers.add_parser("dump_wflow", help="dump a workflow from launchpad to a file") + dump_wf_parser.add_argument("-i", "--fw_id", type=int, help="the id of a firework from the workflow") + dump_wf_parser.add_argument("-f", "--wf_file", help="path to a local file to store the workflow") + dump_wf_parser.set_defaults(func=dump_wf) + + addscript_parser = subparsers.add_parser( + "add_scripts", help="quickly add a script (or several scripts) to run in sequence" + ) + addscript_parser.add_argument("scripts", help="Script to run, or space-separated names", nargs="*") + addscript_parser.add_argument("-n", "--names", help="Firework name, or space-separated names", nargs="*") + addscript_parser.add_argument("-w", "--wf_name", help="Workflow name") + addscript_parser.add_argument("-d", "--delimiter", help="delimiter for separating scripts", default=",") + 
addscript_parser.set_defaults(func=add_scripts) + + get_fw_parser = subparsers.add_parser("get_fws", help="get information about FireWorks") + get_fw_parser.add_argument(*fw_id_args, **fw_id_kwargs) + get_fw_parser.add_argument("-n", "--name", help="get FWs with this name") + get_fw_parser.add_argument(*state_args, **state_kwargs) + get_fw_parser.add_argument(*query_args, **query_kwargs) + get_fw_parser.add_argument(*launches_mode_args, **launches_mode_kwargs) + get_fw_parser.add_argument(*qid_args, **qid_kwargs) + get_fw_parser.add_argument(*disp_args, **disp_kwargs) + get_fw_parser.add_argument("-m", "--max", help="limit results", default=0, type=int) + get_fw_parser.add_argument("--sort", help="Sort results", choices=["created_on", "updated_on"]) + get_fw_parser.add_argument("--rsort", help="Reverse sort results", choices=["created_on", "updated_on"]) + get_fw_parser.set_defaults(func=get_fws) + + get_fw_in_wf_parser = subparsers.add_parser( + "get_fws_in_wflows", + help="get information about FireWorks in Workflows", + aliases=["get_fws_in_wfs"], + ) + + get_fw_in_wf_parser.add_argument(*wf_prefixed_fw_id_args, **fw_id_kwargs) + get_fw_in_wf_parser.add_argument("-wfn", "--wf_name", help="get WFs with this name") + get_fw_in_wf_parser.add_argument(*wf_prefixed_state_args, **state_kwargs) + get_fw_in_wf_parser.add_argument(*wf_prefixed_query_args, **query_kwargs) + + get_fw_in_wf_parser.add_argument(*fw_prefixed_fw_id_args, **fw_id_kwargs) + get_fw_in_wf_parser.add_argument("-fwn", "--fw_name", help="get FWs with this name") + get_fw_in_wf_parser.add_argument(*fw_prefixed_state_args, **state_kwargs) + get_fw_in_wf_parser.add_argument(*fw_prefixed_query_args, **query_kwargs) + get_fw_in_wf_parser.add_argument(*launches_mode_args, **launches_mode_kwargs) + get_fw_in_wf_parser.add_argument(*qid_args, **qid_kwargs) + get_fw_in_wf_parser.add_argument(*disp_args, **disp_kwargs) + get_fw_in_wf_parser.add_argument("-m", "--max", help="limit results", default=0, type=int) 
+ get_fw_in_wf_parser.add_argument("--sort", help="Sort results", choices=["created_on", "updated_on"]) + get_fw_in_wf_parser.add_argument("--rsort", help="Reverse sort results", choices=["created_on", "updated_on"]) + get_fw_in_wf_parser.set_defaults(func=get_fws_in_wfs) + + trackfw_parser = subparsers.add_parser("track_fws", help="Track FireWorks") + trackfw_parser.add_argument(*fw_id_args, **fw_id_kwargs) + trackfw_parser.add_argument("-n", "--name", help="name") + trackfw_parser.add_argument(*state_args, **state_kwargs) + trackfw_parser.add_argument(*query_args, **query_kwargs) + trackfw_parser.add_argument(*launches_mode_args, **launches_mode_kwargs) + trackfw_parser.add_argument("-c", "--include", nargs="+", help="only include these files in the report") + trackfw_parser.add_argument("-x", "--exclude", nargs="+", help="exclude these files from the report") + trackfw_parser.add_argument("-m", "--max", help="limit results", default=0, type=int) + trackfw_parser.set_defaults(func=track_fws) + + rerun_fws_parser = subparsers.add_parser("rerun_fws", help="re-run Firework(s)") + rerun_fws_parser.add_argument(*fw_id_args, **fw_id_kwargs) + rerun_fws_parser.add_argument("-n", "--name", help="name") + rerun_fws_parser.add_argument(*state_args, **state_kwargs) + rerun_fws_parser.add_argument(*query_args, **query_kwargs) + rerun_fws_parser.add_argument(*launches_mode_args, **launches_mode_kwargs) + rerun_fws_parser.add_argument( + "--password", + help="Today's date, e.g. 2012-02-25. Password or positive response to " + f"input prompt required when modifying more than {PW_CHECK_NUM} entries.", + ) + rerun_fws_parser.add_argument("--task-level", action="store_true", help="Enable task level recovery") + rerun_fws_parser.add_argument( + "-lid", "--launch_id", nargs="+", help="Recover launch id. 
--task-level must be given", default=None, type=int + ) + recover_mode_group = rerun_fws_parser.add_mutually_exclusive_group() + recover_mode_group.add_argument( + "-cp", + "--copy-data", + action="store_const", + const="cp", + dest="recover_mode", + help="Copy data from previous run. --task-level must be given", + ) + recover_mode_group.add_argument( + "-pd", + "--previous-dir", + action="store_const", + const="prev_dir", + dest="recover_mode", + help="Reruns in the previous folder. --task-level must be given", + ) + rerun_fws_parser.set_defaults(func=rerun_fws) + + defuse_fw_parser = subparsers.add_parser("defuse_fws", help="cancel (de-fuse) a single Firework") + defuse_fw_parser.add_argument(*fw_id_args, **fw_id_kwargs) + defuse_fw_parser.add_argument("-n", "--name", help="name") + defuse_fw_parser.add_argument(*state_args, **state_kwargs) + defuse_fw_parser.add_argument(*query_args, **query_kwargs) + defuse_fw_parser.add_argument(*launches_mode_args, **launches_mode_kwargs) + defuse_fw_parser.add_argument( + "--password", + help="Today's date, e.g. 2012-02-25. Password or positive response to " + f"input prompt required when modifying more than {PW_CHECK_NUM} entries.", + ) + defuse_fw_parser.set_defaults(func=defuse_fws) + + pause_fw_parser = subparsers.add_parser("pause_fws", help="pause a single Firework") + pause_fw_parser.add_argument(*fw_id_args, **fw_id_kwargs) + pause_fw_parser.add_argument("-n", "--name", help="name") + pause_fw_parser.add_argument(*state_args, **state_kwargs) + pause_fw_parser.add_argument(*query_args, **query_kwargs) + pause_fw_parser.add_argument(*launches_mode_args, **launches_mode_kwargs) + pause_fw_parser.add_argument( + "--password", + help="Today's date, e.g. 2012-02-25. 
Password or positive response to " + f"input prompt required when modifying more than {PW_CHECK_NUM} entries.", + ) + pause_fw_parser.set_defaults(func=pause_fws) + + reignite_fw_parser = subparsers.add_parser("reignite_fws", help="reignite (un-cancel) a set of Fireworks") + reignite_fw_parser.add_argument(*fw_id_args, **fw_id_kwargs) + reignite_fw_parser.add_argument("-n", "--name", help="name") + reignite_fw_parser.add_argument(*state_args, **state_kwargs) + reignite_fw_parser.add_argument(*query_args, **query_kwargs) + reignite_fw_parser.add_argument(*launches_mode_args, **launches_mode_kwargs) + reignite_fw_parser.add_argument( + "--password", + help="Today's date, e.g. 2012-02-25. Password or positive response to " + f"input prompt required when modifying more than {PW_CHECK_NUM} entries.", + ) + reignite_fw_parser.set_defaults(func=reignite_fws) + + resume_fw_parser = subparsers.add_parser("resume_fws", help="resume (un-pause) a set of Fireworks") + resume_fw_parser.add_argument(*fw_id_args, **fw_id_kwargs) + resume_fw_parser.add_argument("-n", "--name", help="name") + resume_fw_parser.add_argument(*state_args, **state_kwargs) + resume_fw_parser.add_argument(*query_args, **query_kwargs) + resume_fw_parser.add_argument(*launches_mode_args, **launches_mode_kwargs) + resume_fw_parser.add_argument( + "--password", + help="Today's date, e.g. 2012-02-25. 
Password or positive response to " + f"input prompt required when modifying more than {PW_CHECK_NUM} entries.", + ) + resume_fw_parser.set_defaults(func=resume_fws) + + update_fws_parser = subparsers.add_parser("update_fws", help="Update a Firework spec.") + update_fws_parser.add_argument(*fw_id_args, **fw_id_kwargs) + update_fws_parser.add_argument("-n", "--name", help="get FWs with this name") + update_fws_parser.add_argument(*state_args, **state_kwargs) + update_fws_parser.add_argument(*query_args, **query_kwargs) + update_fws_parser.add_argument(*launches_mode_args, **launches_mode_kwargs) + update_fws_parser.add_argument( + "-u", + "--update", + type=str, + help='Doc update (enclose pymongo-style dict in single-quotes, e.g. \'{"_tasks.1.hello": "world"}\')', + ) + update_fws_parser.add_argument( + "--mongo", + default=False, + action="store_true", + help="Use full pymongo style dict to modify spec. Be very careful as you can break your spec", + ) + update_fws_parser.add_argument( + "--password", + help="Today's date, e.g. 2012-02-25. 
Password or positive response to " + f"input prompt required when modifying more than {PW_CHECK_NUM} entries.", + ) + update_fws_parser.set_defaults(func=update_fws) + + get_wf_parser = subparsers.add_parser( + "get_wflows", + help="get information about Workflows", + aliases=["get_wfs"], + ) + get_wf_parser.add_argument(*fw_id_args, **fw_id_kwargs) + get_wf_parser.add_argument("-n", "--name", help="get WFs with this name") + get_wf_parser.add_argument(*state_args, **state_kwargs) + get_wf_parser.add_argument(*query_args, **query_kwargs) + get_wf_parser.add_argument(*disp_args, **disp_kwargs) + get_wf_parser.add_argument("-m", "--max", help="limit results", default=0, type=int) + get_wf_parser.add_argument("--sort", help="Sort results", choices=["created_on", "updated_on"]) + get_wf_parser.add_argument("--rsort", help="Reverse sort results", choices=["created_on", "updated_on"]) + get_wf_parser.add_argument( + "-t", + "--table", + help='Print results in table form instead of json. Needs prettytable. Works best with "-d less"', + action="store_true", + ) + get_wf_parser.set_defaults(func=get_wfs) + + defuse_wf_parser = subparsers.add_parser("defuse_wflows", help="cancel (de-fuse) an entire Workflow") + defuse_wf_parser.add_argument( + "--defuse_all_states", help="also defuse COMPLETED and FIZZLED workflows", action="store_true" + ) + defuse_wf_parser.add_argument(*fw_id_args, **fw_id_kwargs) + defuse_wf_parser.add_argument("-n", "--name", help="name") + defuse_wf_parser.add_argument(*state_args, **state_kwargs) + defuse_wf_parser.add_argument(*query_args, **query_kwargs) + defuse_wf_parser.add_argument( + "--password", + help="Today's date, e.g. 2012-02-25. 
Password or positive response to "
+ f"input prompt required when modifying more than {PW_CHECK_NUM} entries.",
+ )
+ defuse_wf_parser.set_defaults(func=defuse_wfs)
+
+ pause_wf_parser = subparsers.add_parser(
+ "pause_wflows",
+ help="pause an entire Workflow",
+ aliases=["pause_wfs"],
+ )
+ pause_wf_parser.add_argument(*fw_id_args, **fw_id_kwargs)
+ pause_wf_parser.add_argument("-n", "--name", help="name")
+ pause_wf_parser.add_argument(*state_args, **state_kwargs)
+ pause_wf_parser.add_argument(*query_args, **query_kwargs)
+ pause_wf_parser.add_argument(
+ "--password",
+ help="Today's date, e.g. 2012-02-25. Password or positive response to "
+ f"input prompt required when modifying more than {PW_CHECK_NUM} entries.",
+ )
+ pause_wf_parser.set_defaults(func=pause_wfs)
+
+ reignite_wfs_parser = subparsers.add_parser(
+ "reignite_wflows",
+ help="reignite (un-cancel) an entire Workflow",
+ aliases=["reignite_wfs"],
+ )
+ reignite_wfs_parser.add_argument(*fw_id_args, **fw_id_kwargs)
+ reignite_wfs_parser.add_argument("-n", "--name", help="name")
+ reignite_wfs_parser.add_argument(*state_args, **state_kwargs)
+ reignite_wfs_parser.add_argument(*query_args, **query_kwargs)
+ reignite_wfs_parser.add_argument(
+ "--password",
+ help="Today's date, e.g. 2012-02-25. Password or positive response to "
+ f"input prompt required when modifying more than {PW_CHECK_NUM} entries.",
+ )
+ reignite_wfs_parser.set_defaults(func=reignite_wfs)
+
+ archive_parser = subparsers.add_parser(
+ "archive_wflows",
+ help="archive an entire Workflow (irreversible)",
+ aliases=["archive_wfs"],
+ )
+ archive_parser.add_argument(*fw_id_args, **fw_id_kwargs)
+ archive_parser.add_argument("-n", "--name", help="name")
+ archive_parser.add_argument(*state_args, **state_kwargs)
+ archive_parser.add_argument(*query_args, **query_kwargs)
+ archive_parser.add_argument(
+ "--password",
+ help="Today's date, e.g. 2012-02-25. 
Password or positive response to " + f"input prompt required when modifying more than {PW_CHECK_NUM} entries.", + ) + archive_parser.set_defaults(func=archive) + + delete_wfs_parser = subparsers.add_parser( + "delete_wflows", + help='Delete workflows (permanently). Use "archive_wflows" instead if you want to "soft-remove"', + aliases=["delete_wfs"], + ) + delete_wfs_parser.add_argument(*fw_id_args, **fw_id_kwargs) + delete_wfs_parser.add_argument("-n", "--name", help="name") + delete_wfs_parser.add_argument(*state_args, **state_kwargs) + delete_wfs_parser.add_argument(*query_args, **query_kwargs) + delete_wfs_parser.add_argument( + "--password", + help="Today's date, e.g. 2012-02-25. Password or positive response to " + f"input prompt required when modifying more than {PW_CHECK_NUM} entries.", + ) + delete_wfs_parser.add_argument( + "--ldirs", + help="the launch directories associated with the WF will be deleted as well, if possible", + dest="delete_launch_dirs", + action="store_true", + ) + delete_wfs_parser.set_defaults(func=delete_wfs, delete_launch_dirs=False) + + get_qid_parser = subparsers.add_parser("get_qids", help="get the queue id of a Firework") + get_qid_parser.add_argument(*fw_id_args, **fw_id_kwargs) + get_qid_parser.set_defaults(func=get_qid) + + cancel_qid_parser = subparsers.add_parser("cancel_qid", help="cancel a reservation") + cancel_qid_parser.add_argument(*qid_args, **qid_kwargs) + cancel_qid_parser.set_defaults(func=cancel_qid) + + reservation_parser = subparsers.add_parser("detect_unreserved", help="Find launches with stale reservations") + reservation_parser.add_argument( + "--time", help="expiration time (seconds)", default=RESERVATION_EXPIRATION_SECS, type=int + ) + reservation_parser.add_argument("--rerun", help="cancel and rerun expired reservations", action="store_true") + reservation_parser.add_argument(*enh_disp_args, **enh_disp_kwargs) + reservation_parser.set_defaults(func=detect_unreserved) + + fizzled_parser = 
subparsers.add_parser("detect_lostruns", help="Find launches that have FIZZLED") + fizzled_parser.add_argument("--time", help="expiration time (seconds)", default=RUN_EXPIRATION_SECS, type=int) + fizzled_parser.add_argument("--fizzle", help="mark lost runs as fizzled", action="store_true") + fizzled_parser.add_argument("--rerun", help="rerun lost runs", action="store_true") + fizzled_parser.add_argument("--refresh", help="refresh the detected inconsistent fireworks", action="store_true") + fizzled_parser.add_argument( + "--max_runtime", help="max runtime, matching failures ran no longer than this (seconds)", type=int + ) + fizzled_parser.add_argument( + "--min_runtime", help="min runtime, matching failures must have run at least this long (seconds)", type=int + ) + fizzled_parser.add_argument("-q", "--query", help="restrict search to only FWs matching this query") + fizzled_parser.add_argument("-lq", "--launch_query", help="restrict search to only launches matching this query") + fizzled_parser.add_argument(*enh_disp_args, **enh_disp_kwargs) + fizzled_parser.set_defaults(func=detect_lostruns) + + priority_parser = subparsers.add_parser("set_priority", help="modify the priority of one or more FireWorks") + priority_parser.add_argument("priority", help="get FW with this fw_id", default=None, type=int) + priority_parser.add_argument(*fw_id_args, **fw_id_kwargs) + priority_parser.add_argument("-n", "--name", help="name") + priority_parser.add_argument(*state_args, **state_kwargs) + priority_parser.add_argument(*query_args, **query_kwargs) + priority_parser.add_argument(*launches_mode_args, **launches_mode_kwargs) + priority_parser.add_argument( + "--password", + help="Today's date, e.g. 2012-02-25. 
Password or positive response to " + f"input prompt required when modifying more than {PW_CHECK_NUM} entries.", + ) + priority_parser.add_argument( + "-wf", action="store_true", help="the priority will be set for all the fireworks of the matching workflows" + ) + priority_parser.set_defaults(func=set_priority) + + parser.add_argument( + "-l", "--launchpad_file", help="path to LaunchPad file containing central DB connection info", default=None + ) + parser.add_argument( + "-c", + "--config_dir", + help="path to a directory containing the LaunchPad file (used if -l unspecified)", + default=CONFIG_FILE_DIR, + ) + parser.add_argument("--logdir", help="path to a directory for logging") + parser.add_argument("--loglvl", help="level to print log messages", default="INFO") + parser.add_argument("-s", "--silencer", help="shortcut to mute log messages", action="store_true") + + webgui_parser = subparsers.add_parser("webgui", help="launch the web GUI") + webgui_parser.add_argument( + "--port", + dest="port", + type=int, + default=WEBSERVER_PORT, + help="Port to run the web server on (default: 5000 or WEBSERVER_PORT arg in FW_config.yaml)", + ) + webgui_parser.add_argument( + "--host", + dest="host", + type=str, + default=WEBSERVER_HOST, + help="Host to run the web server on (default: 127.0.0.1 or WEBSERVER_HOST arg in FW_config.yaml)", + ) + webgui_parser.add_argument("--debug", help="print debug messages", action="store_true") + webgui_parser.add_argument( + "-s", "--server_mode", help="run in server mode (skip opening the browser)", action="store_true" + ) + webgui_parser.add_argument( + "--nworkers", type=arg_positive_int, help="Number of worker processes for server mode", default=1 + ) + webgui_parser.add_argument("--fwquery", help="additional query filter for FireWorks as JSON string") + webgui_parser.add_argument("--wflowquery", help="additional query filter for Workflows as JSON string") + webgui_parser.add_argument( + "--webgui_username", help="Optional username 
needed to access webgui", type=str, default=None + ) + webgui_parser.add_argument( + "--webgui_password", help="Optional password needed to access webgui", type=str, default=None + ) + webgui_parser.set_defaults(func=webgui) + + recover_parser = subparsers.add_parser("recover_offline", help="recover offline workflows") + recover_parser.add_argument("-i", "--ignore_errors", help="ignore errors", action="store_true") + recover_parser.add_argument( + "-w", + "--fworker_file", + help="path to fworker file. An empty string will match all the workers", + default=FWORKER_LOC, + ) + recover_parser.add_argument("-pe", "--print-errors", help="print errors", action="store_true") + recover_parser.set_defaults(func=recover_offline) + + forget_parser = subparsers.add_parser("forget_offline", help="forget offline workflows") + forget_parser.add_argument("-n", "--name", help="name") + forget_parser.add_argument(*state_args, **state_kwargs) + forget_parser.add_argument(*query_args, **query_kwargs) + forget_parser.set_defaults(func=forget_offline) + + # admin commands + admin_parser = subparsers.add_parser( + "admin", help='Various db admin commands, type "lpad admin -h" for more.', parents=[parent_parser] + ) + admin_subparser = admin_parser.add_subparsers(title="action", dest="action_command") + + maintain_parser = admin_subparser.add_parser("maintain", help="Run database maintenance") + maintain_parser.add_argument("--infinite", help="loop infinitely", action="store_true") + maintain_parser.add_argument( + "--maintain_interval", + help="sleep time between maintenance loops (infinite mode)", + default=MAINTAIN_INTERVAL, + type=int, + ) + maintain_parser.set_defaults(func=maintain) + + orphaned_parser = admin_subparser.add_parser("orphaned", help="Find orphaned FireWorks") + orphaned_parser.add_argument(*fw_id_args, **fw_id_kwargs) + orphaned_parser.add_argument("-n", "--name", help="get FWs with this name") + orphaned_parser.add_argument(*state_args, **state_kwargs) + 
orphaned_parser.add_argument(*query_args, **query_kwargs) + orphaned_parser.add_argument(*launches_mode_args, **launches_mode_kwargs) + orphaned_parser.add_argument(*qid_args, **qid_kwargs) + orphaned_parser.add_argument(*disp_args, **disp_kwargs) + orphaned_parser.add_argument("-m", "--max", help="limit results", default=0, type=int) + orphaned_parser.add_argument("--sort", help="Sort results", choices=["created_on", "updated_on"]) + orphaned_parser.add_argument("--rsort", help="Reverse sort results", choices=["created_on", "updated_on"]) + orphaned_parser.add_argument("--remove", help="delete orphaned", action="store_true") + orphaned_parser.add_argument( + "--ldirs", + help="the launch directories associated with the orphaned Fireworks will be deleted as well, if possible", + dest="delete_launch_dirs", + action="store_true", + ) + orphaned_parser.set_defaults(func=orphaned) + + tuneup_parser = admin_subparser.add_parser( + "tuneup", help="Tune-up the database (should be performed during scheduled downtime)" + ) + tuneup_parser.add_argument( + "--full", help="Run full tuneup and compaction (should be run during DB downtime only)", action="store_true" + ) + tuneup_parser.set_defaults(func=tuneup) + + refresh_parser = admin_subparser.add_parser( + "refresh", help="manually force a workflow refresh (not usually needed)" + ) + refresh_parser.add_argument(*fw_id_args, **fw_id_kwargs) + refresh_parser.add_argument("-n", "--name", help="name") + refresh_parser.add_argument(*state_args, **state_kwargs) + refresh_parser.add_argument(*query_args, **query_kwargs) + refresh_parser.add_argument( + "--password", + help="Today's date, e.g. 2012-02-25. 
Password or positive response to " + f"input prompt required when modifying more than {PW_CHECK_NUM} entries.", + ) + refresh_parser.set_defaults(func=refresh) + + unlock_parser = admin_subparser.add_parser( + "unlock", help="manually unlock a workflow that is locked (only if you know what you are doing!)" + ) + unlock_parser.add_argument(*fw_id_args, **fw_id_kwargs) + unlock_parser.add_argument("-n", "--name", help="name") + unlock_parser.add_argument(*state_args, **state_kwargs) + unlock_parser.add_argument(*query_args, **query_kwargs) + unlock_parser.add_argument( + "--password", + help="Today's date, e.g. 2012-02-25. Password or positive response to " + f"input prompt required when modifying more than {PW_CHECK_NUM} entries.", + ) + unlock_parser.set_defaults(func=unlock) + + report_parser = subparsers.add_parser( + "report", help='Compile a report of runtime stats, type "lpad report -h" for more options.' + ) + report_parser.add_argument( + "-c", + "--collection", + help="The collection to report on; choose from 'fws' (default), 'wflows', or 'launches'.", + default="fws", + ) + report_parser.add_argument( + "-i", + "--interval", + help="Interval on which to split the report. 
" + "Choose from 'minutes', 'hours', " + "'days' (default), 'months', or 'years'.", + default="days", + ) + report_parser.add_argument( + "-n", "--num_intervals", help="The number of intervals on which to report (default=5)", type=int, default=5 + ) + report_parser.add_argument("-q", "--query", help="Additional Pymongo queries to filter entries before processing.") + report_parser.set_defaults(func=report) + + introspect_parser = subparsers.add_parser("introspect", help="Introspect recent runs to pin down errors") + introspect_parser.add_argument("-m", "--max", help="examine past results", default=100, type=int) + introspect_parser.add_argument( + "-t", + "--threshold", + help="controls signal to noise ratio, e.g., 10 means " + "difference of at least 10 runs between fizzled/completed count", + default=10, + type=int, + ) + introspect_parser.set_defaults(func=introspect) + + try: + import argcomplete + + argcomplete.autocomplete(parser) + # This supports bash autocompletion. To enable this, pip install + # argcomplete, activate global completion, or add + # eval "$(register-python-argcomplete lpad)" + # into your .bash_profile or .bashrc + except ImportError: + pass + + args = parser.parse_args(argv) + + cfg_files_to_check = [("launchpad", "-l", False, LAUNCHPAD_LOC)] + if hasattr(args, "fworker_file"): + cfg_files_to_check.append(("fworker", "-w", False, FWORKER_LOC)) + _validate_config_file_paths(args, cfg_files_to_check) + + if args.command is None: + # if no command supplied, print help + parser.print_help() + else: + for opt in fw_id_options: + if hasattr(args, opt) and getattr(args, opt) is not None and isinstance(getattr(args, opt), str): + if "," in getattr(args, opt): + setattr(args, opt, [int(x) for x in getattr(args, opt).split(",")]) + else: + setattr(args, opt, [int(getattr(args, opt))]) + + args.func(args) + + return 0 + + +if __name__ == "__main__": + raise SystemExit(lpad()) From fe07e5ca789edc0b5045ea8189caecaaa6ba64ee Mon Sep 17 00:00:00 2001 
From: Wayne Date: Fri, 26 Jul 2024 11:36:39 -0700 Subject: [PATCH 02/12] added file reservation_finder.py that looks matches a jobID withe respective fw_id --- fireworks/core/launchpad.py | 6 +- fireworks/core/reservation_finder.py | 169 +++++++++++++++++++++++++++ 2 files changed, 173 insertions(+), 2 deletions(-) create mode 100644 fireworks/core/reservation_finder.py diff --git a/fireworks/core/launchpad.py b/fireworks/core/launchpad.py index 432d5f429..f17ad0b68 100644 --- a/fireworks/core/launchpad.py +++ b/fireworks/core/launchpad.py @@ -10,7 +10,9 @@ import warnings from collections import defaultdict from itertools import chain -from sustodian import FindMyFW + +#from sustodian import FindMyFW +from fireworks.core import reservation_finder import gridfs from bson import ObjectId @@ -1195,7 +1197,7 @@ def reserve_fw(self, fworker, launch_dir, host=None, ip=None, fw_id=None): def get_fw_ids_from_reservation_id(self, reservation_id): - fw_id=FindMyFW.get_fwid(reservation_id) + fw_id=reservation_finder.get_fwid(reservation_id) return fw_id def cancel_reservation_by_reservation_id(self, reservation_id) -> None: diff --git a/fireworks/core/reservation_finder.py b/fireworks/core/reservation_finder.py new file mode 100644 index 000000000..8c2f9a78e --- /dev/null +++ b/fireworks/core/reservation_finder.py @@ -0,0 +1,169 @@ +import os +import subprocess +import json +import paramiko +import getpass +import re + +# Function to execute local shell commands and return the output +def execute_command(command): + try: + result = subprocess.run(command, shell=True, capture_output=True, text=True) + if result.returncode != 0: + raise Exception(f"Command failed: {command}\n{result.stderr}") + ssh=None + return result.stdout.strip(),ssh + except Exception as e: + result,ssh=ssh_login(command) + print(e) + return result,ssh + + +def extract_username_hostname(input_string): + # Define the regex pattern + pattern = r'(?P[^@]+)@(?P.+)' + + # Search for the pattern in the input 
string + match = re.match(pattern, input_string) + + if match: + # Extract username and hostname from named groups + username = match.group('username') + hostname = match.group('hostname') + return username, hostname + else: + raise ValueError("The input does not match the required format 'username@hostname'.") + +# Get user input + +# SSH login and execute remote command +def ssh_login(command): + input_string = input("Enter username@hostname: ").strip() + # Replace with your hostname and username + username, hostname = extract_username_hostname(input_string) + password = getpass.getpass('Enter password+OTP: ') + + # Create an SSH client + ssh = paramiko.SSHClient() + ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) + + try: + # Connect to the server + ssh.connect(hostname, username=username, password=password) + # Execute the command + stdin, stdout, stderr = ssh.exec_command(command) + output = stdout.read().decode('utf-8').strip() + errors = stderr.read().decode('utf-8').strip() + + if errors: + raise Exception(f"Command failed: {command}\n{errors}") + + except Exception as e: + print(e) + return output, ssh + + +def get_fwid(jobid): + job_info,ssh = execute_command(f"scontrol show jobid {jobid}") + if ssh !=None: + fw_id=find_worker(job_info,ssh) + ssh.close() + else: + fw_id=find_worker(job_info,ssh) + + return fw_id + + +def find_worker(job_info, ssh): + stdout_dir = "" + for line in job_info.splitlines(): + if "StdOut=" in line: + stdout_dir = line.split("=", 1)[1] + break + + if not stdout_dir: + print("StdOut path not found in job information") + return + + base_dir = os.path.dirname(stdout_dir) + + if ssh!=None: + # Change directory to the base directory on the remote server + stdin, stdout, stderr = ssh.exec_command(f"cd {base_dir} && pwd") + current_dir = stdout.read().decode('utf-8').strip() + errors = stderr.read().decode('utf-8').strip() + if errors: + raise Exception(f"Failed to change directory: {errors}") + + print(f"Changed directory 
to: {current_dir}") + + stdin, stdout, stderr = ssh.exec_command(f"find {current_dir} -type d -name 'launcher_*'") + launch_dirs = stdout.read().decode('utf-8').splitlines() + errors = stderr.read().decode('utf-8').strip() + if errors: + raise Exception(f"Failed to find launch directories: {errors}") + + largest_dir = max(launch_dirs, key=lambda d: d.split('_')[-1]) + + # Change to the largest directory + stdin, stdout, stderr = ssh.exec_command(f"cd {largest_dir} && pwd") + final_dir = stdout.read().decode('utf-8').strip() + errors = stderr.read().decode('utf-8').strip() + if errors: + raise Exception(f"Failed to change directory to {largest_dir}: {errors}") + + print(f"Changed directory to: {final_dir}") + + # Check for the JSON file in the directory + stdin, stdout, stderr = ssh.exec_command(f"cat {final_dir}/FW.json") + json_data = stdout.read().decode('utf-8').strip() + errors = stderr.read().decode('utf-8').strip() + if errors: + raise Exception(f"Failed to read FW.json: {errors}") + + data = json.loads(json_data) + spec_mpid = data.get('spec', {}).get('MPID', 'N/A') + fw_id = data.get('fw_id', 'N/A') + + print(f"spec.MPID: {spec_mpid}") + print(f"fw_id: {fw_id}") + else: + # Change directory to the extracted base directory + try: + os.chdir(base_dir) + except OSError: + print(f"Failed to change directory to {base_dir}") + exit(1) + + # Print the current directory to confirm + print(f"Changed directory to: {os.getcwd()}") + + # Find the largest directory with the pattern "launcher_*" + launch_dirs = subprocess.check_output(f"find {os.getcwd()} -type d -name 'launcher_*'", shell=True).decode().splitlines() + largest_dir = max(launch_dirs, key=lambda d: d.split('_')[-1]) + + try: + os.chdir(largest_dir) + except OSError: + print(f"Failed to change directory to {largest_dir}") + exit(1) + + print(f"Changed directory to: {os.getcwd()}") + + json_file = os.path.join(os.getcwd(), "FW.json") + + # Check if the JSON file exists + if os.path.isfile(json_file): + with 
open(json_file, 'r') as f: + data = json.load(f) + spec_mpid = data.get('spec', {}).get('MPID', 'N/A') + fw_id = data.get('fw_id', 'N/A') + + # Output the extracted values + print(f"spec.MPID: {spec_mpid}") + print(f"fw_id: {fw_id}") + else: + print(f"FW.json not found in {largest_dir}") + + return fw_id + return fw_id \ No newline at end of file From 7d7762e1ff339a83152b04c64052ef417558e3e5 Mon Sep 17 00:00:00 2001 From: Wayne Date: Fri, 26 Jul 2024 11:50:10 -0700 Subject: [PATCH 03/12] added paramiko dependency --- fireworks/core/launchpad.py | 4 ++-- fireworks/core/reservation_finder.py | 1 - setup.py | 1 + 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fireworks/core/launchpad.py b/fireworks/core/launchpad.py index f17ad0b68..fd31f4302 100644 --- a/fireworks/core/launchpad.py +++ b/fireworks/core/launchpad.py @@ -11,8 +11,6 @@ from collections import defaultdict from itertools import chain -#from sustodian import FindMyFW -from fireworks.core import reservation_finder import gridfs from bson import ObjectId @@ -23,6 +21,7 @@ from tqdm import tqdm from fireworks.core.firework import Firework, FWAction, Launch, Tracker, Workflow +from fireworks.core import reservation_finder from fireworks.fw_config import MongoClient from fireworks.fw_config import ( GRIDFS_FALLBACK_COLLECTION, @@ -1198,6 +1197,7 @@ def reserve_fw(self, fworker, launch_dir, host=None, ip=None, fw_id=None): def get_fw_ids_from_reservation_id(self, reservation_id): fw_id=reservation_finder.get_fwid(reservation_id) + return fw_id def cancel_reservation_by_reservation_id(self, reservation_id) -> None: diff --git a/fireworks/core/reservation_finder.py b/fireworks/core/reservation_finder.py index 8c2f9a78e..6893fc4fd 100644 --- a/fireworks/core/reservation_finder.py +++ b/fireworks/core/reservation_finder.py @@ -39,7 +39,6 @@ def extract_username_hostname(input_string): # SSH login and execute remote command def ssh_login(command): input_string = input("Enter username@hostname: 
").strip() - # Replace with your hostname and username username, hostname = extract_username_hostname(input_string) password = getpass.getpass('Enter password+OTP: ') diff --git a/setup.py b/setup.py index 554530e9e..6c9b434d0 100644 --- a/setup.py +++ b/setup.py @@ -35,6 +35,7 @@ "tabulate>=0.7.5", "flask>=0.11.1", "flask-paginate>=0.4.5", + "paramiko>=3.4.0", "gunicorn>=19.6.0", "tqdm>=4.8.4", "importlib-metadata>=4.8.2; python_version<'3.8'", From 85477052264149accd35664687e6c5d81d198afd Mon Sep 17 00:00:00 2001 From: Wayne Date: Fri, 26 Jul 2024 11:53:51 -0700 Subject: [PATCH 04/12] moved new res_finder.py file to utilities --- fireworks/core/launchpad.py | 2 +- fireworks/{core => utilities}/reservation_finder.py | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename fireworks/{core => utilities}/reservation_finder.py (100%) diff --git a/fireworks/core/launchpad.py b/fireworks/core/launchpad.py index fd31f4302..d4c3a3943 100644 --- a/fireworks/core/launchpad.py +++ b/fireworks/core/launchpad.py @@ -21,7 +21,7 @@ from tqdm import tqdm from fireworks.core.firework import Firework, FWAction, Launch, Tracker, Workflow -from fireworks.core import reservation_finder +from fireworks.fireworks.utilities import reservation_finder from fireworks.fw_config import MongoClient from fireworks.fw_config import ( GRIDFS_FALLBACK_COLLECTION, diff --git a/fireworks/core/reservation_finder.py b/fireworks/utilities/reservation_finder.py similarity index 100% rename from fireworks/core/reservation_finder.py rename to fireworks/utilities/reservation_finder.py From c752afb34e4098706f444bf055f562f5f999bd35 Mon Sep 17 00:00:00 2001 From: Wayne Date: Fri, 26 Jul 2024 11:57:48 -0700 Subject: [PATCH 05/12] readded the description of get_fwid_from_reservation_id and also changed name from get_fw_ids_from_reservation_id to just get_fw_id_from_reservation_id --- fireworks/core/launchpad.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git 
a/fireworks/core/launchpad.py b/fireworks/core/launchpad.py index d4c3a3943..f0f46fda1 100644 --- a/fireworks/core/launchpad.py +++ b/fireworks/core/launchpad.py @@ -1194,8 +1194,14 @@ def reserve_fw(self, fworker, launch_dir, host=None, ip=None, fw_id=None): """ return self.checkout_fw(fworker, launch_dir, host=host, ip=ip, fw_id=fw_id, state="RESERVED") - def get_fw_ids_from_reservation_id(self, reservation_id): - + def get_fw_id_from_reservation_id(self, reservation_id): + """ + Given the reservation id, return the list of firework ids. + Args: + reservation_id (int) + Returns: + [int]: Return the firework id. + """ fw_id=reservation_finder.get_fwid(reservation_id) return fw_id From aa35db3eef00b30b86962d4f334d2bbf8142e5c8 Mon Sep 17 00:00:00 2001 From: Wayne Date: Fri, 26 Jul 2024 11:59:32 -0700 Subject: [PATCH 06/12] removed .ipynb checkpoints --- .../.ipynb_checkpoints/lpad_run-checkpoint.py | 1565 ----------------- 1 file changed, 1565 deletions(-) delete mode 100644 fireworks/scripts/.ipynb_checkpoints/lpad_run-checkpoint.py diff --git a/fireworks/scripts/.ipynb_checkpoints/lpad_run-checkpoint.py b/fireworks/scripts/.ipynb_checkpoints/lpad_run-checkpoint.py deleted file mode 100644 index d4badd6f2..000000000 --- a/fireworks/scripts/.ipynb_checkpoints/lpad_run-checkpoint.py +++ /dev/null @@ -1,1565 +0,0 @@ -"""A runnable script for managing a FireWorks database (a command-line interface to launchpad.py).""" - -from __future__ import annotations - -import ast -import copy -import datetime -import json -import os -import re -import sys -import time -from argparse import ArgumentParser, ArgumentTypeError, Namespace -from importlib import metadata -from typing import Any, Sequence - -from pymongo import ASCENDING, DESCENDING -from ruamel.yaml import YAML - -from fireworks import FW_INSTALL_DIR -from fireworks.core.firework import Firework, Workflow -from fireworks.core.fworker import FWorker -from fireworks.core.launchpad import LaunchPad, WFLock -from 
fireworks.features.fw_report import FWReport -from fireworks.features.introspect import Introspector -from fireworks.fw_config import ( - CONFIG_FILE_DIR, - FWORKER_LOC, - LAUNCHPAD_LOC, - MAINTAIN_INTERVAL, - PW_CHECK_NUM, - RESERVATION_EXPIRATION_SECS, - RUN_EXPIRATION_SECS, - WEBSERVER_HOST, - WEBSERVER_PORT, -) -from fireworks.user_objects.firetasks.script_task import ScriptTask -from fireworks.utilities.fw_serializers import DATETIME_HANDLER, recursive_dict - -from ._helpers import _validate_config_file_paths - -__author__ = "Anubhav Jain" -__credits__ = "Shyue Ping Ong" -__copyright__ = "Copyright 2013, The Materials Project" -__maintainer__ = "Anubhav Jain" -__email__ = "ajain@lbl.gov" -__date__ = "Feb 7, 2013" - -DEFAULT_LPAD_YAML = "my_launchpad.yaml" - - -def pw_check(ids: list[int], args: Namespace, skip_pw: bool = False) -> list[int]: - if len(ids) > PW_CHECK_NUM and not skip_pw: - m_password = datetime.datetime.now().strftime("%Y-%m-%d") - if not args.password: - if input(f"Are you sure? This will modify {len(ids)} entries. (Y/N)")[0].upper() == "Y": - args.password = datetime.datetime.now().strftime("%Y-%m-%d") - else: - print("Operation aborted by user.") - if args.password != m_password: - raise ValueError( - f"Modifying more than {PW_CHECK_NUM} entries requires setting the --password parameter! " - "(Today's date, e.g. 2012-02-25)" - ) - return ids - - -def parse_helper(lp: LaunchPad, args: Namespace, wf_mode: bool = False, skip_pw: bool = False) -> list[int]: - """ - Helper method to parse args that can take either id, name, state or query. - - Args: - args: Namespace of parsed CLI arguments. - wf_mode (bool): If True, will query lp for workflow instead of fireworks IDs. - skip_pw (bool): If True, skip PW check. Defaults to False. - - Returns: - list[int]: Firework or Workflow IDs. 
- """ - if args.fw_id and sum(bool(x) for x in [args.name, args.state, args.query]) >= 1: - raise ValueError("Cannot specify both fw_id and name/state/query)") - - query = {} - if args.fw_id: - return pw_check(args.fw_id, args, skip_pw) - if args.query: - query = ast.literal_eval(args.query) - if args.name and "launches_mode" in args and not args.launches_mode: - query["name"] = args.name - if args.state: - query["state"] = args.state - - if hasattr(args, "sort") and args.sort: - sort = [(args.sort, ASCENDING)] - elif hasattr(args, "rsort") and args.rsort: - sort = [(args.rsort, DESCENDING)] - else: - sort = None - - max = args.max if hasattr(args, "max") else 0 - - if wf_mode: - return pw_check(lp.get_wf_ids(query, sort=sort, limit=max), args, skip_pw) - - return pw_check(lp.get_fw_ids(query, sort=sort, limit=max, launches_mode=args.launches_mode), args, skip_pw) - - -def get_lp(args: Namespace) -> LaunchPad: - try: - if args.launchpad_file: - lp = LaunchPad.from_file(args.launchpad_file) - else: - args.loglvl = "CRITICAL" if args.silencer else args.loglvl - # no lpad file means we try connect to localhost which is fast so use small timeout - # (default 30s) for quick response to user if no DB is running - mongo_kwds = {"serverSelectionTimeoutMS": 500} - lp = LaunchPad(logdir=args.logdir, strm_lvl=args.loglvl, mongoclient_kwargs=mongo_kwds) - - # make sure we can connect to DB, raises pymongo.errors.ServerSelectionTimeoutError if not - lp.connection.admin.command("ping") - return lp - - except Exception: - err_message = ( - f"FireWorks was not able to connect to MongoDB at {lp.host}:{lp.port}. Is the server running? " - f"The database file specified was {args.launchpad_file}." - ) - if not args.launchpad_file: - err_message += ( - ' Type "lpad init" if you would like to set up a file that specifies ' - "location and credentials of your Mongo database (otherwise use default " - "localhost configuration)." 
- ) - # use from None to hide the pymongo ServerSelectionTimeoutError that otherwise clutters up the stack trace - raise ValueError(err_message) from None - - -def init_yaml(args: Namespace) -> None: - if args.uri_mode: - fields = ( - ("host", None, "Example: mongodb+srv://USER:PASSWORD@CLUSTERNAME.mongodb.net/fireworks"), - ("ssl_ca_file", None, "Path to any client certificate to be used for mongodb connection"), - ( - "authsource", - None, - "Database used for authentication, if not connection db. e.g., for MongoDB Atlas this is sometimes " - "'admin'.", - ), - ) - else: - fields = ( - ("host", "localhost", "Example: 'localhost' or 'mongodb+srv://CLUSTERNAME.mongodb.net'"), - ("port", 27017, ""), - ("name", "fireworks", "Database under which to store the fireworks collections"), - ("username", None, "Username for MongoDB authentication"), - ("password", None, "Password for MongoDB authentication"), - ("ssl_ca_file", None, "Path to any client certificate to be used for Mongodb connection"), - ( - "authsource", - None, - "Database used for authentication, if not connection db. e.g., for MongoDB Atlas this is sometimes " - "'admin'.", - ), - ) - - doc: dict[str, str | int | bool | None] = {} - if args.uri_mode: - print( - "Note 1: You are in URI format mode. This means that all database parameters (username, password, host, " - "port, database name, etc.) must be present in the URI. See: " - "https://docs.mongodb.com/manual/reference/connection-string/ for details." - ) - print("(Enter your connection URI through the 'host' parameter)") - print("Please supply the following configuration values") - print("(press Enter if you want to accept the defaults)\n") - for k, default, helptext in fields: - val = input(f"Enter {k} parameter. ({default=}). 
{helptext}: ") - doc[k] = val or default - if "port" in doc: - doc["port"] = int(doc["port"]) # enforce the port as an int - if args.uri_mode: - doc["uri_mode"] = True - - lp = LaunchPad.from_dict(doc) - lp.to_file(args.config_file) - print(f"\nConfiguration written to {args.config_file}!") - - -def reset(args: Namespace) -> None: - lp = get_lp(args) - if not args.password: - n_docs = lp.workflows.count_documents({}) - answer = input( - f"Are you sure? This will RESET {n_docs} workflows and all data. " - f"To confirm, please type the name of this database ({lp.name}) :" - ) - if answer == lp.name: - args.password = datetime.datetime.now().strftime("%Y-%m-%d") - else: - raise ValueError("Incorrect input to confirm database reset, operation aborted.") - lp.reset(args.password) - - -def add_wf(args: Namespace) -> None: - lp = get_lp(args) - if args.dir: - files = [] - for f in args.wf_file: - files.extend([os.path.join(f, i) for i in os.listdir(f)]) - else: - files = args.wf_file - for f in files: - fwf = Workflow.from_file(f) - if args.check: - from fireworks.utilities.dagflow import DAGFlow - - DAGFlow.from_fireworks(fwf).check() - lp.add_wf(fwf) - - -def append_wf(args: Namespace) -> None: - lp = get_lp(args) - lp.append_wf(Workflow.from_file(args.wf_file), args.fw_id, detour=args.detour, pull_spec_mods=args.pull_spec_mods) - - -def dump_wf(args: Namespace) -> None: - lp = get_lp(args) - lp.get_wf_by_fw_id(args.fw_id).to_file(args.wf_file) - - -def check_wf(args: Namespace) -> None: - from fireworks.utilities.dagflow import DAGFlow - - lp = get_lp(args) - DAGFlow.from_fireworks(lp.get_wf_by_fw_id(args.fw_id)).check() - - -def add_wf_dir(args: Namespace) -> None: - lp = get_lp(args) - for filename in os.listdir(args.wf_dir): - fwf = Workflow.from_file(filename) - lp.add_wf(fwf) - - -def print_fws(ids, lp, args: Namespace) -> None: - """Prints results of some FireWorks query to stdout.""" - fws = [] - if args.display_format == "ids": - fws = ids - elif 
args.display_format == "count": - fws = [ids] - else: - for id in ids: - fw = lp.get_fw_by_id(id) - d = fw.to_dict() - d["state"] = d.get("state", "WAITING") - if args.display_format == "more" or args.display_format == "less": - if "archived_launches" in d: - del d["archived_launches"] - del d["spec"] - if args.display_format == "less" and "launches" in d: - del d["launches"] - fws.append(d) - if len(fws) == 1: - fws = fws[0] - get_output(args, fws) - - -def get_fw_ids_helper(lp: LaunchPad, args: Namespace, count_only: bool | None = None) -> list[int] | int: - """Build fws query from command line options and submit. - - Parameters: - lp (fireworks.core.firework.Launchpad) - args (argparse.Namespace) - count_only (bool): if None, then looked up in args. - - Returns: - list[int] | int: resulting fw_ids or count of fws in query. - """ - if sum(bool(x) for x in [args.fw_id, args.name, args.state, args.query]) > 1: - raise ValueError("Please specify exactly one of (fw_id, name, state, query)") - if sum(bool(x) for x in [args.fw_id, args.name, args.state, args.query]) == 0: - args.query = "{}" - args.display_format = args.display_format or "ids" - if sum(bool(x) for x in [args.fw_id, args.name, args.qid]) > 1: - raise ValueError("Please specify exactly one of (fw_id, name, qid)") - args.display_format = args.display_format or "more" - - if args.fw_id: - query = {"fw_id": {"$in": args.fw_id}} - elif args.name and not args.launches_mode: - query = {"name": args.name} - elif args.state: - query = {"state": args.state} - elif args.query: - query = ast.literal_eval(args.query) - else: - query = None - - if args.sort: - sort = [(args.sort, ASCENDING)] - elif args.rsort: - sort = [(args.rsort, DESCENDING)] - else: - sort = None - - if count_only is None: - count_only = args.display_format == "count" - if args.qid: - ids = lp.get_fw_ids_from_reservation_id(args.qid) - if query: - query["fw_id"] = {"$in": ids} - ids = lp.get_fw_ids(query, sort, args.max, 
launches_mode=args.launches_mode) - - else: - ids = lp.get_fw_ids(query, sort, args.max, count_only=count_only, launches_mode=args.launches_mode) - return ids - - -def get_fws_helper( - lp: LaunchPad, ids: list[int], args: Namespace -) -> list[int] | int | list[dict[str, str | int | bool]] | str | bool: - """Get fws from ids in a representation according to args.display_format.""" - fws = [] - if args.display_format == "ids": - fws = ids - elif args.display_format == "count": - fws = [ids] - else: - for id in ids: - fw = lp.get_fw_by_id(id) - d = fw.to_dict() - d["state"] = d.get("state", "WAITING") - if args.display_format == "more" or args.display_format == "less": - if "archived_launches" in d: - del d["archived_launches"] - del d["spec"] - if args.display_format == "less" and "launches" in d: - del d["launches"] - fws.append(d) - return fws[0] if len(fws) == 1 else fws - - -def get_fws(args: Namespace) -> None: - lp = get_lp(args) - ids = get_fw_ids_helper(lp, args) - fws = get_fws_helper(lp, ids, args) - get_output(args, fws) - - -def get_fws_in_wfs(args: Namespace) -> None: - # get_wfs - lp = get_lp(args) - if sum(bool(x) for x in [args.wf_fw_id, args.wf_name, args.wf_state, args.wf_query]) > 1: - raise ValueError("Please specify exactly one of (fw_id, name, state, query)") - if sum(bool(x) for x in [args.wf_fw_id, args.wf_name, args.wf_state, args.wf_query]) == 0: - args.wf_query = "{}" - - if args.wf_fw_id: - wf_query = {"nodes": {"$in": args.wf_fw_id}} - elif args.wf_name: - wf_query = {"name": args.wf_name} - elif args.wf_state: - wf_query = {"state": args.wf_state} - else: - wf_query = ast.literal_eval(args.wf_query) - - # get_fws - if sum(bool(x) for x in [args.fw_fw_id, args.fw_name, args.fw_state, args.fw_query]) > 1: - raise ValueError("Please specify exactly one of (fw_id, name, state, query)") - if sum(bool(x) for x in [args.fw_fw_id, args.fw_name, args.fw_state, args.fw_query]) == 0: - args.fw_query = "{}" - args.display_format = 
args.display_format or "ids" - if sum(bool(x) for x in [args.fw_fw_id, args.fw_name, args.qid]) > 1: - raise ValueError("Please specify exactly one of (fw_id, name, qid)") - args.display_format = args.display_format or "more" - - if args.fw_fw_id: - fw_query = {"fw_id": {"$in": args.fw_fw_id}} - elif args.fw_name and not args.launches_mode: - fw_query = {"name": args.fw_name} - elif args.fw_state: - fw_query = {"state": args.fw_state} - elif args.fw_query: - fw_query = ast.literal_eval(args.fw_query) - else: - fw_query = None - - if args.sort: - sort = [(args.sort, ASCENDING)] - elif args.rsort: - sort = [(args.rsort, DESCENDING)] - else: - sort = None - - if args.qid: - ids = lp.get_fw_ids_from_reservation_id(args.qid) - if fw_query: - fw_query["fw_id"] = {"$in": ids} - ids = lp.get_fw_ids_in_wfs( - wf_query=wf_query, fw_query=fw_query, sort=sort, limit=args.max, launches_mode=args.launches_mode - ) - else: - ids = lp.get_fw_ids_in_wfs( - wf_query=wf_query, - fw_query=fw_query, - sort=sort, - limit=args.max, - count_only=args.display_format == "count", - launches_mode=args.launches_mode, - ) - - print_fws(ids, lp, args) - - -def update_fws(args: Namespace) -> None: - lp = get_lp(args) - fw_ids = parse_helper(lp, args) - lp.update_spec(fw_ids, json.loads(args.update), args.mongo) - - -def get_wfs(args: Namespace) -> None: - lp = get_lp(args) - if sum(bool(x) for x in [args.fw_id, args.name, args.state, args.query]) > 1: - raise ValueError("Please specify exactly one of (fw_id, name, state, query)") - if sum(bool(x) for x in [args.fw_id, args.name, args.state, args.query]) == 0: - args.query = "{}" - args.display_format = args.display_format or "ids" - else: - args.display_format = args.display_format or "more" - - if args.fw_id: - query = {"nodes": {"$in": args.fw_id}} - elif args.name: - query = {"name": args.name} - elif args.state: - query = {"state": args.state} - else: - query = ast.literal_eval(args.query) - - if args.sort: - sort = [(args.sort, ASCENDING)] - 
elif args.rsort: - sort = [(args.rsort, DESCENDING)] - else: - sort = None - - ids = lp.get_wf_ids(query, sort, args.max, count_only=args.display_format == "count") - if args.display_format == "ids": - wfs = ids - elif args.display_format == "count": - wfs = [ids] - else: - wfs = [] - for i in ids: - d = lp.get_wf_summary_dict(i, args.display_format) - d["name"] += f"--{int(i)}" - wfs.append(d) - - if args.table: - if wfs: - headers = list(wfs[0]) - from prettytable import PrettyTable - - t = PrettyTable(headers) - for d in wfs: - t.add_row([d.get(k) for k in headers]) - print(t) - else: - if len(wfs) == 1: - wfs = wfs[0] - get_output(args, wfs) - - -def delete_wfs(args: Namespace) -> None: - lp = get_lp(args) - fw_ids = parse_helper(lp, args, wf_mode=True) - for f in fw_ids: - lp.delete_wf(f, delete_launch_dirs=args.delete_launch_dirs) - lp.m_logger.debug(f"Processed fw_id: {f}") - lp.m_logger.info(f"Finished deleting {len(fw_ids)} WFs") - - -def get_children(links, start, max_depth): - data = {} - for link, child in links.items(): - if link == start: - if len(child) > 0: - data[link] = [get_children(links, idx, max_depth) for idx in child] - else: - data[link] = child - return data - - -def detect_lostruns(args: Namespace) -> None: - lp = get_lp(args) - query = ast.literal_eval(args.query) if args.query else None - launch_query = ast.literal_eval(args.launch_query) if args.launch_query else None - fl, ff, fi = lp.detect_lostruns( - expiration_secs=args.time, - fizzle=args.fizzle, - rerun=args.rerun, - max_runtime=args.max_runtime, - min_runtime=args.min_runtime, - refresh=args.refresh, - query=query, - launch_query=launch_query, - ) - lp.m_logger.debug(f"Detected {len(fl)} lost launches: {fl}") - lp.m_logger.info(f"Detected {len(ff)} lost FWs: {ff}") - if args.display_format is not None and args.display_format != "none": - print_fws(ff, lp, args) - lp.m_logger.info(f"Detected {len(fi)} inconsistent FWs: {fi}") - if args.display_format is not None and 
args.display_format != "none": - print_fws(fi, lp, args) - if len(ff) > 0 and not args.fizzle and not args.rerun: - print("You can fix lost FWs using the --rerun or --fizzle arguments to the detect_lostruns command") - if len(fi) > 0 and not args.refresh: - print("You can fix inconsistent FWs using the --refresh argument to the detect_lostruns command") - - -def detect_unreserved(args: Namespace) -> None: - lp = get_lp(args) - if args.display_format is not None and args.display_format != "none": - unreserved = lp.detect_unreserved(expiration_secs=args.time, rerun=False) - # very inefficient, replace by mongo aggregation - fw_ids = [] - for launch_id in unreserved: - launch = lp.get_launch_by_id(launch_id) - fw_ids.append(launch.fw_id) - print_fws(fw_ids, lp, args) - print(lp.detect_unreserved(expiration_secs=args.time, rerun=args.rerun)) - - -def tuneup(args: Namespace) -> None: - lp = get_lp(args) - lp.tuneup(bkground=not args.full) - - -def defuse_wfs(args: Namespace) -> None: - lp = get_lp(args) - fw_ids = parse_helper(lp, args, wf_mode=True) - for f in fw_ids: - lp.defuse_wf(f, defuse_all_states=args.defuse_all_states) - lp.m_logger.debug(f"Processed fw_id: {f}") - lp.m_logger.info(f"Finished defusing {len(fw_ids)} FWs.") - if not args.defuse_all_states: - lp.m_logger.info( - "Note: FIZZLED and COMPLETED FWs were not defused. " - "Use the --defuse_all_states option to force this (or rerun FIZZLED FWs first)." 
- ) - - -def pause_wfs(args: Namespace) -> None: - lp = get_lp(args) - fw_ids = parse_helper(lp, args, wf_mode=True) - for f in fw_ids: - lp.pause_wf(f) - lp.m_logger.debug(f"Processed fw_id: {f}") - lp.m_logger.info(f"Finished defusing {len(fw_ids)} FWs.") - - -def archive(args: Namespace) -> None: - lp = get_lp(args) - fw_ids = parse_helper(lp, args, wf_mode=True) - for f in fw_ids: - lp.archive_wf(f) - lp.m_logger.debug(f"Processed fw_id: {f}") - lp.m_logger.info(f"Finished archiving {len(fw_ids)} WFs") - - -def reignite_wfs(args: Namespace) -> None: - lp = get_lp(args) - fw_ids = parse_helper(lp, args, wf_mode=True) - for f in fw_ids: - lp.reignite_wf(f) - lp.m_logger.debug(f"Processed Workflow with fw_id: {f}") - lp.m_logger.info(f"Finished reigniting {len(fw_ids)} Workflows") - - -def defuse_fws(args: Namespace) -> None: - lp = get_lp(args) - fw_ids = parse_helper(lp, args) - for f in fw_ids: - lp.defuse_fw(f) - lp.m_logger.debug(f"Processed fw_id: {f}") - lp.m_logger.info(f"Finished defusing {len(fw_ids)} FWs") - - -def pause_fws(args: Namespace) -> None: - lp = get_lp(args) - fw_ids = parse_helper(lp, args) - for f in fw_ids: - lp.pause_fw(f) - lp.m_logger.debug(f"Processed fw_id: {f}") - lp.m_logger.info(f"Finished pausing {len(fw_ids)} FWs") - - -def reignite_fws(args: Namespace) -> None: - lp = get_lp(args) - fw_ids = parse_helper(lp, args) - for f in fw_ids: - lp.reignite_fw(f) - lp.m_logger.debug(f"Processed fw_id: {f}") - lp.m_logger.info(f"Finished reigniting {len(fw_ids)} FWs") - - -def resume_fws(args: Namespace) -> None: - lp = get_lp(args) - fw_ids = parse_helper(lp, args) - for f in fw_ids: - lp.resume_fw(f) - lp.m_logger.debug(f"Processed fw_id: {f}") - lp.m_logger.info(f"Finished resuming {len(fw_ids)} FWs") - - -def rerun_fws(args: Namespace) -> None: - lp = get_lp(args) - fw_ids = parse_helper(lp, args) - if args.task_level: - launch_ids = args.launch_id - if launch_ids is None: - launch_ids = ["last"] * len(fw_ids) - elif len(launch_ids) != 
len(fw_ids): - raise ValueError("Specify the same number of tasks and launches") - else: - launch_ids = [None] * len(fw_ids) - for fw_id, l_id in zip(fw_ids, launch_ids): - lp.rerun_fw(int(fw_id), recover_launch=l_id, recover_mode=args.recover_mode) - lp.m_logger.debug(f"Processed {fw_id=}") - lp.m_logger.info(f"Finished setting {len(fw_ids)} FWs to rerun") - - -def refresh(args: Namespace) -> None: - lp = get_lp(args) - fw_ids = parse_helper(lp, args, wf_mode=True) - for f in fw_ids: - wf = lp.get_wf_by_fw_id_lzyfw(f) - for fw_id in wf.root_fw_ids: - lp._refresh_wf(fw_id) - lp.m_logger.debug(f"Processed Workflow with fw_id: {f}") - lp.m_logger.info(f"Finished refreshing {len(fw_ids)} Workflows") - - -def unlock(args: Namespace) -> None: - lp = get_lp(args) - fw_ids = parse_helper(lp, args, wf_mode=True) - for fw_id in fw_ids: - with WFLock(lp, fw_id, expire_secs=0, kill=True): - lp.m_logger.warning(f"FORCIBLY RELEASING LOCK DUE TO USER COMMAND, WF: {fw_id}") - lp.m_logger.debug(f"Processed Workflow with {fw_id=}") - lp.m_logger.info(f"Finished unlocking {len(fw_ids)} Workflows") - - -def get_qid(args: Namespace) -> None: - lp = get_lp(args) - for f in args.fw_id: - print(lp.get_reservation_id_from_fw_id(f)) - - -def cancel_qid(args: Namespace) -> None: - lp = get_lp(args) - lp.m_logger.warning( - "WARNING: cancel_qid does not actually remove jobs from the queue " - "(e.g., execute qdel), this must be done manually!" 
- ) - lp.cancel_reservation_by_reservation_id(args.qid) - - -def set_priority(args: Namespace) -> None: - wf_mode = args.wf - lp = get_lp(args) - fw_ids = parse_helper(lp, args, wf_mode=wf_mode) - if wf_mode: - all_fw_ids = set() - for fw_id in fw_ids: - wf = lp.get_wf_by_fw_id_lzyfw(fw_id) - all_fw_ids.update(wf.id_fw) - fw_ids = list(all_fw_ids) - for f in fw_ids: - lp.set_priority(f, args.priority) - lp.m_logger.debug(f"Processed fw_id {f}") - lp.m_logger.info(f"Finished setting priorities of {len(fw_ids)} FWs") - - -def _open_webbrowser(url) -> None: - """Open a web browser after a delay to give the web server more startup time.""" - import webbrowser - - time.sleep(2) - webbrowser.open(url) - - -def webgui(args: Namespace) -> None: - from fireworks.flask_site.app import app - - app.lp = get_lp(args) - - if any([args.webgui_username, args.webgui_password]) and not all([args.webgui_username, args.webgui_password]): - raise ValueError("Must set BOTH a webgui_username and webgui_password!") - - app.config["WEBGUI_USERNAME"] = args.webgui_username - app.config["WEBGUI_PASSWORD"] = args.webgui_password - - if args.wflowquery: - app.BASE_Q_WF = json.loads(args.wflowquery) - if args.fwquery: - app.BASE_Q = json.loads(args.fwquery) - if "state" in app.BASE_Q: - app.BASE_Q_WF["state"] = app.BASE_Q["state"] - - if not args.server_mode: - from threading import Thread - - url = f"http://{args.host}:{args.port}" - p1 = Thread(target=_open_webbrowser, args=(url,)) - p1.start() - app.run(host=args.host, port=args.port, debug=args.debug) - p1.join() - else: - try: - from fireworks.flask_site.gunicorn import StandaloneApplication - except ImportError: - import sys - - sys.exit("Gunicorn is required for server mode. 
Install using `pip install gunicorn`.") - options = { - "bind": f"{args.host}:{args.port}", - "workers": args.nworkers, - } - StandaloneApplication(app, options).run() - - -def add_scripts(args: Namespace) -> None: - lp = get_lp(args) - args.names = args.names or [None] * len(args.scripts) - args.wf_name = args.wf_name or args.names[0] - fws = [] - links = {} - for idx, s in enumerate(args.scripts): - fws.append(Firework(ScriptTask({"script": s, "use_shell": True}), name=args.names[idx], fw_id=idx)) - if idx != 0: - links[idx - 1] = idx - - lp.add_wf(Workflow(fws, links, args.wf_name)) - - -def recover_offline(args: Namespace) -> None: - lp = get_lp(args) - fworker_name = FWorker.from_file(args.fworker_file).name if args.fworker_file else None - failed_fws = [] - recovered_fws = [] - - for launch in lp.offline_runs.find({"completed": False, "deprecated": False}, {"launch_id": 1, "fw_id": 1}): - if fworker_name and lp.launches.count({"launch_id": launch["launch_id"], "fworker.name": fworker_name}) == 0: - continue - fw = lp.recover_offline(launch["launch_id"], args.ignore_errors, args.print_errors) - if fw: - failed_fws.append(launch["fw_id"]) - else: - recovered_fws.append(launch["fw_id"]) - - lp.m_logger.info(f"FINISHED recovering offline runs. 
{len(recovered_fws)} job(s) recovered: {recovered_fws}") - if failed_fws: - lp.m_logger.info(f"FAILED to recover offline fw_ids: {failed_fws}") - - -def forget_offline(args: Namespace) -> None: - lp = get_lp(args) - fw_ids = parse_helper(lp, args) - for f in fw_ids: - lp.forget_offline(f, launch_mode=False) - lp.m_logger.debug(f"Processed fw_id: {f}") - lp.m_logger.info(f"Finished forget_offline, processed {len(fw_ids)} FWs") - - -def report(args: Namespace) -> None: - lp = get_lp(args) - query = ast.literal_eval(args.query) if args.query else None - fwr = FWReport(lp) - stats = fwr.get_stats( - coll=args.collection, interval=args.interval, num_intervals=args.num_intervals, additional_query=query - ) - title_str = f"Stats on {args.collection}" - title_dec = "-" * len(title_str) - print(title_dec) - print(title_str) - print(title_dec) - print(fwr.get_stats_str(stats)) - - -def introspect(args: Namespace) -> None: - print("NOTE: This feature is in beta mode...") - lp = get_lp(args) - isp = Introspector(lp) - for coll in ["launches", "tasks", "fireworks", "workflows"]: - print(f"generating report for {coll}...please wait...") - print() - table = isp.introspect_fizzled(coll=coll, threshold=args.threshold, limit=args.max) - isp.print_report(table, coll) - print() - - -def get_launchdir(args: Namespace) -> None: - lp = get_lp(args) - ld = lp.get_launchdir(args.fw_id, args.launch_idx) - print(ld) - - -def track_fws(args: Namespace) -> None: - lp = get_lp(args) - fw_ids = parse_helper(lp, args, skip_pw=True) - include = args.include - exclude = args.exclude - first_print = True # used to control newline - for fw_id in fw_ids: - data = lp.get_tracker_data(fw_id) - output = [] - for dct in data: - for tracker in dct["trackers"]: - if (not include or tracker.filename in include) and (not exclude or tracker.filename not in exclude): - output.extend((f"## Launch id: {dct['launch_id']}", str(tracker))) - if output: - name = lp.fireworks.find_one({"fw_id": fw_id}, {"name": 
1})["name"] - output.insert(0, f"# FW id: {fw_id}, FW {name=}") - if first_print: - first_print = False - else: - output.insert(0, ">------<") - print("\n".join(output)) - - -def maintain(args: Namespace) -> None: - lp = get_lp(args) - lp.maintain(args.infinite, args.maintain_interval) - - -def orphaned(args: Namespace) -> None: - # get_fws - lp = get_lp(args) - fw_ids = get_fw_ids_helper(lp, args, count_only=False) - - # get_wfs - orphaned_fw_ids = [] - for fw_id in fw_ids: - query = {"nodes": fw_id} - wf_ids = lp.get_wf_ids(query) - if len(wf_ids) == 0: - orphaned_fw_ids.append(fw_id) - - fws = get_fws_helper(lp, orphaned_fw_ids, args) - if args.remove: - lp.m_logger.info(f"Found {len(orphaned_fw_ids)} orphaned fw_ids: {orphaned_fw_ids}") - lp.delete_fws(orphaned_fw_ids, delete_launch_dirs=args.delete_launch_dirs) - else: - get_output(args, fws) - - -def get_output(args: Namespace, objs: list[Any]) -> None: - """Prints output on stdout""" - if args.output == "json": - json.dump(objs, sys.stdout, default=DATETIME_HANDLER, indent=4) - else: - yaml = YAML(typ="safe", pure=True) - yaml.default_flow_style = False - yaml.dump(recursive_dict(objs, preserve_unicode=False), sys.stdout) - print() - - -def arg_positive_int(value: str) -> int: - try: - ivalue = int(value) - except ValueError: - raise ArgumentTypeError(f"int(value) conversion failed for {value}") - - if ivalue < 1: - raise ValueError(f"{value} is not a positive integer") - return ivalue - - -def lpad(argv: Sequence[str] | None = None) -> int: - m_description = ( - "A command line interface to FireWorks. For more help on a specific command, type 'lpad -h'." 
- ) - - parser = ArgumentParser("lpad", description=m_description) - - fw_version = metadata.version("fireworks") - v_out = f"%(prog)s v{fw_version} located in {FW_INSTALL_DIR}" - parser.add_argument("-v", "--version", action="version", version=v_out) - - parent_parser = ArgumentParser(add_help=False) - parser.add_argument( - "-o", - "--output", - choices=["json", "yaml"], - default="json", - type=lambda s: s.lower(), - help="Set output display format to either json or YAML. " - "YAML is easier to read for long documents. JSON is the default.", - ) - - subparsers = parser.add_subparsers(help="command", dest="command") - - # This makes common argument options easier to maintain. E.g., what if - # there is a new state or disp option? - # NOTE: Those sets of standard options are not used consistently below (jotelha) - fw_id_args = ["-i", "--fw_id"] - fw_id_kwargs = {"type": str, "help": "fw_id"} - - state_args = ["-s", "--state"] - state_kwargs = { - "type": lambda s: s.upper(), - "help": "Select by state.", - "choices": Firework.STATE_RANKS, - } - disp_args = ["-d", "--display_format"] - disp_kwargs = { - "type": lambda s: s.lower(), - "help": "Display format.", - "default": "less", - "choices": ["all", "more", "less", "ids", "count", "reservations"], - } - - # enhanced display options allow for value 'none' or None (default) for no output - enh_disp_args = copy.deepcopy(disp_args) - enh_disp_kwargs = copy.deepcopy(disp_kwargs) - enh_disp_kwargs["choices"].append("none") - enh_disp_kwargs["default"] = None - - query_args = ["-q", "--query"] - query_kwargs = {"help": 'Query (enclose pymongo-style dict in single-quotes, e.g. \'{"state":"COMPLETED"}\')'} - - launches_mode_args = ["-lm", "--launches_mode"] - launches_mode_kwargs = { - "action": "store_true", - "help": "Query the launches collection (enclose pymongo-style " - "dict in single-quotes, e.g. 
'{\"launch_id\": 1}')", - } - - qid_args = ["--qid"] - qid_kwargs = {"help": "Query by reservation id of job in queue"} - - # for using fw- and wf-specific options on one command line, distinguish by prefix fw and wf - # prefix short one-dash options with 'wf', i.e. '-i' -> '-wfi' - # prefix long two-dash options with 'wf_', i.e. '--fw_id' -> '--wf_fw_id' - wf_prefixed_fw_id_args = [re.sub("^-([^-].*)$", "-wf\\1", s) for s in fw_id_args] - wf_prefixed_fw_id_args = [re.sub("^--(.*)$", "--wf_\\1", s) for s in wf_prefixed_fw_id_args] - - wf_prefixed_state_args = [re.sub("^-([^-].*)$", "-wf\\1", s) for s in state_args] - wf_prefixed_state_args = [re.sub("^--(.*)$", "--wf_\\1", s) for s in wf_prefixed_state_args] - - wf_prefixed_query_args = [re.sub("^-([^-].*)$", "-wf\\1", s) for s in query_args] - wf_prefixed_query_args = [re.sub("^--(.*)$", "--wf_\\1", s) for s in wf_prefixed_query_args] - - # prefix short one-dash options with 'fw', i.e. '-i' -> '-fwi' - # prefix long two-dash options with 'fw_', i.e. '--fw_id' -> '--fw_fw_id' - fw_prefixed_fw_id_args = [re.sub("^-([^-].*)$", "-fw\\1", s) for s in fw_id_args] - fw_prefixed_fw_id_args = [re.sub("^--(.*)$", "--fw_\\1", s) for s in fw_prefixed_fw_id_args] - - fw_prefixed_state_args = [re.sub("^-([^-].*)$", "-fw\\1", s) for s in state_args] - fw_prefixed_state_args = [re.sub("^--(.*)$", "--fw_\\1", s) for s in fw_prefixed_state_args] - - fw_prefixed_query_args = [re.sub("^-([^-].*)$", "-fw\\1", s) for s in query_args] - fw_prefixed_query_args = [re.sub("^--(.*)$", "--fw_\\1", s) for s in fw_prefixed_query_args] - - # filter all long options, i.e. 
'--fw_id' and strip off preceding '--' - fw_id_options = [ - re.sub("^--(.*)$", "\\1", opt) - for opt in [*fw_id_args, *wf_prefixed_fw_id_args, *fw_prefixed_fw_id_args] - if re.match("^--.*$", opt) - ] - - init_parser = subparsers.add_parser("init", help="Initialize a Fireworks launchpad YAML file.") - init_parser.add_argument( - "-u", - "--uri_mode", - action="store_true", - help="Connect via a URI, see: https://docs.mongodb.com/manual/reference/connection-string/", - ) - init_parser.add_argument("--config-file", default=DEFAULT_LPAD_YAML, type=str, help="Filename to write to.") - init_parser.set_defaults(func=init_yaml) - - reset_parser = subparsers.add_parser("reset", help="reset and re-initialize the FireWorks database") - reset_parser.add_argument( - "--password", - help="Today's date, e.g. 2012-02-25. " - "Password or positive response to input prompt " - "required to protect against accidental reset.", - ) - reset_parser.set_defaults(func=reset) - - addwf_parser = subparsers.add_parser("add", help="insert a Workflow from file") - addwf_parser.add_argument( - "-d", "--dir", action="store_true", help="Directory mode. Finds all files in the paths given by wf_file." - ) - addwf_parser.add_argument("wf_file", nargs="+", help="Path to a Firework or Workflow file") - addwf_parser.add_argument( - "-c", "--check", help="check the workflow before adding", dest="check", action="store_true" - ) - addwf_parser.set_defaults(func=add_wf, check=False) - - check_wf_parser = subparsers.add_parser("check_wflow", help="check a workflow from launchpad") - check_wf_parser.add_argument("-i", "--fw_id", type=int, help="the id of a firework from the workflow") - check_wf_parser.set_defaults(func=check_wf) - - get_launchdir_parser = subparsers.add_parser( - "get_launchdir", - help="get the directory of the most recent launch of the given fw_id. 
A common usage is " - "'cd `get_launchdir `' to change the working directory to that of the FW launch", - ) - get_launchdir_parser.add_argument("fw_id", type=int, help="fw_id to chdir to") - get_launchdir_parser.add_argument( - "--launch_idx", - type=int, - help="the index of the launch to get (default of -1 is most recent launch)", - default=-1, - ) - get_launchdir_parser.set_defaults(func=get_launchdir) - - append_wf_parser = subparsers.add_parser( - "append_wflow", help="append a workflow from file to a workflow on launchpad" - ) - append_wf_parser.add_argument(*fw_id_args, type=fw_id_kwargs["type"], help="parent firework ids") - append_wf_parser.add_argument("-f", "--wf_file", help="path to a firework or workflow file") - append_wf_parser.add_argument( - "-d", "--detour", help="append workflow as a detour", dest="detour", action="store_true" - ) - append_wf_parser.add_argument( - "--no_pull_spec_mods", help="do not to pull spec mods from parent", dest="pull_spec_mods", action="store_false" - ) - append_wf_parser.set_defaults(func=append_wf, detour=False, pull_spec_mods=True) - - dump_wf_parser = subparsers.add_parser("dump_wflow", help="dump a workflow from launchpad to a file") - dump_wf_parser.add_argument("-i", "--fw_id", type=int, help="the id of a firework from the workflow") - dump_wf_parser.add_argument("-f", "--wf_file", help="path to a local file to store the workflow") - dump_wf_parser.set_defaults(func=dump_wf) - - addscript_parser = subparsers.add_parser( - "add_scripts", help="quickly add a script (or several scripts) to run in sequence" - ) - addscript_parser.add_argument("scripts", help="Script to run, or space-separated names", nargs="*") - addscript_parser.add_argument("-n", "--names", help="Firework name, or space-separated names", nargs="*") - addscript_parser.add_argument("-w", "--wf_name", help="Workflow name") - addscript_parser.add_argument("-d", "--delimiter", help="delimiter for separating scripts", default=",") - 
addscript_parser.set_defaults(func=add_scripts) - - get_fw_parser = subparsers.add_parser("get_fws", help="get information about FireWorks") - get_fw_parser.add_argument(*fw_id_args, **fw_id_kwargs) - get_fw_parser.add_argument("-n", "--name", help="get FWs with this name") - get_fw_parser.add_argument(*state_args, **state_kwargs) - get_fw_parser.add_argument(*query_args, **query_kwargs) - get_fw_parser.add_argument(*launches_mode_args, **launches_mode_kwargs) - get_fw_parser.add_argument(*qid_args, **qid_kwargs) - get_fw_parser.add_argument(*disp_args, **disp_kwargs) - get_fw_parser.add_argument("-m", "--max", help="limit results", default=0, type=int) - get_fw_parser.add_argument("--sort", help="Sort results", choices=["created_on", "updated_on"]) - get_fw_parser.add_argument("--rsort", help="Reverse sort results", choices=["created_on", "updated_on"]) - get_fw_parser.set_defaults(func=get_fws) - - get_fw_in_wf_parser = subparsers.add_parser( - "get_fws_in_wflows", - help="get information about FireWorks in Workflows", - aliases=["get_fws_in_wfs"], - ) - - get_fw_in_wf_parser.add_argument(*wf_prefixed_fw_id_args, **fw_id_kwargs) - get_fw_in_wf_parser.add_argument("-wfn", "--wf_name", help="get WFs with this name") - get_fw_in_wf_parser.add_argument(*wf_prefixed_state_args, **state_kwargs) - get_fw_in_wf_parser.add_argument(*wf_prefixed_query_args, **query_kwargs) - - get_fw_in_wf_parser.add_argument(*fw_prefixed_fw_id_args, **fw_id_kwargs) - get_fw_in_wf_parser.add_argument("-fwn", "--fw_name", help="get FWs with this name") - get_fw_in_wf_parser.add_argument(*fw_prefixed_state_args, **state_kwargs) - get_fw_in_wf_parser.add_argument(*fw_prefixed_query_args, **query_kwargs) - get_fw_in_wf_parser.add_argument(*launches_mode_args, **launches_mode_kwargs) - get_fw_in_wf_parser.add_argument(*qid_args, **qid_kwargs) - get_fw_in_wf_parser.add_argument(*disp_args, **disp_kwargs) - get_fw_in_wf_parser.add_argument("-m", "--max", help="limit results", default=0, type=int) 
- get_fw_in_wf_parser.add_argument("--sort", help="Sort results", choices=["created_on", "updated_on"]) - get_fw_in_wf_parser.add_argument("--rsort", help="Reverse sort results", choices=["created_on", "updated_on"]) - get_fw_in_wf_parser.set_defaults(func=get_fws_in_wfs) - - trackfw_parser = subparsers.add_parser("track_fws", help="Track FireWorks") - trackfw_parser.add_argument(*fw_id_args, **fw_id_kwargs) - trackfw_parser.add_argument("-n", "--name", help="name") - trackfw_parser.add_argument(*state_args, **state_kwargs) - trackfw_parser.add_argument(*query_args, **query_kwargs) - trackfw_parser.add_argument(*launches_mode_args, **launches_mode_kwargs) - trackfw_parser.add_argument("-c", "--include", nargs="+", help="only include these files in the report") - trackfw_parser.add_argument("-x", "--exclude", nargs="+", help="exclude these files from the report") - trackfw_parser.add_argument("-m", "--max", help="limit results", default=0, type=int) - trackfw_parser.set_defaults(func=track_fws) - - rerun_fws_parser = subparsers.add_parser("rerun_fws", help="re-run Firework(s)") - rerun_fws_parser.add_argument(*fw_id_args, **fw_id_kwargs) - rerun_fws_parser.add_argument("-n", "--name", help="name") - rerun_fws_parser.add_argument(*state_args, **state_kwargs) - rerun_fws_parser.add_argument(*query_args, **query_kwargs) - rerun_fws_parser.add_argument(*launches_mode_args, **launches_mode_kwargs) - rerun_fws_parser.add_argument( - "--password", - help="Today's date, e.g. 2012-02-25. Password or positive response to " - f"input prompt required when modifying more than {PW_CHECK_NUM} entries.", - ) - rerun_fws_parser.add_argument("--task-level", action="store_true", help="Enable task level recovery") - rerun_fws_parser.add_argument( - "-lid", "--launch_id", nargs="+", help="Recover launch id. 
--task-level must be given", default=None, type=int - ) - recover_mode_group = rerun_fws_parser.add_mutually_exclusive_group() - recover_mode_group.add_argument( - "-cp", - "--copy-data", - action="store_const", - const="cp", - dest="recover_mode", - help="Copy data from previous run. --task-level must be given", - ) - recover_mode_group.add_argument( - "-pd", - "--previous-dir", - action="store_const", - const="prev_dir", - dest="recover_mode", - help="Reruns in the previous folder. --task-level must be given", - ) - rerun_fws_parser.set_defaults(func=rerun_fws) - - defuse_fw_parser = subparsers.add_parser("defuse_fws", help="cancel (de-fuse) a single Firework") - defuse_fw_parser.add_argument(*fw_id_args, **fw_id_kwargs) - defuse_fw_parser.add_argument("-n", "--name", help="name") - defuse_fw_parser.add_argument(*state_args, **state_kwargs) - defuse_fw_parser.add_argument(*query_args, **query_kwargs) - defuse_fw_parser.add_argument(*launches_mode_args, **launches_mode_kwargs) - defuse_fw_parser.add_argument( - "--password", - help="Today's date, e.g. 2012-02-25. Password or positive response to " - f"input prompt required when modifying more than {PW_CHECK_NUM} entries.", - ) - defuse_fw_parser.set_defaults(func=defuse_fws) - - pause_fw_parser = subparsers.add_parser("pause_fws", help="pause a single Firework") - pause_fw_parser.add_argument(*fw_id_args, **fw_id_kwargs) - pause_fw_parser.add_argument("-n", "--name", help="name") - pause_fw_parser.add_argument(*state_args, **state_kwargs) - pause_fw_parser.add_argument(*query_args, **query_kwargs) - pause_fw_parser.add_argument(*launches_mode_args, **launches_mode_kwargs) - pause_fw_parser.add_argument( - "--password", - help="Today's date, e.g. 2012-02-25. 
Password or positive response to " - f"input prompt required when modifying more than {PW_CHECK_NUM} entries.", - ) - pause_fw_parser.set_defaults(func=pause_fws) - - reignite_fw_parser = subparsers.add_parser("reignite_fws", help="reignite (un-cancel) a set of Fireworks") - reignite_fw_parser.add_argument(*fw_id_args, **fw_id_kwargs) - reignite_fw_parser.add_argument("-n", "--name", help="name") - reignite_fw_parser.add_argument(*state_args, **state_kwargs) - reignite_fw_parser.add_argument(*query_args, **query_kwargs) - reignite_fw_parser.add_argument(*launches_mode_args, **launches_mode_kwargs) - reignite_fw_parser.add_argument( - "--password", - help="Today's date, e.g. 2012-02-25. Password or positive response to " - f"input prompt required when modifying more than {PW_CHECK_NUM} entries.", - ) - reignite_fw_parser.set_defaults(func=reignite_fws) - - resume_fw_parser = subparsers.add_parser("resume_fws", help="resume (un-pause) a set of Fireworks") - resume_fw_parser.add_argument(*fw_id_args, **fw_id_kwargs) - resume_fw_parser.add_argument("-n", "--name", help="name") - resume_fw_parser.add_argument(*state_args, **state_kwargs) - resume_fw_parser.add_argument(*query_args, **query_kwargs) - resume_fw_parser.add_argument(*launches_mode_args, **launches_mode_kwargs) - resume_fw_parser.add_argument( - "--password", - help="Today's date, e.g. 2012-02-25. 
Password or positive response to " - f"input prompt required when modifying more than {PW_CHECK_NUM} entries.", - ) - resume_fw_parser.set_defaults(func=resume_fws) - - update_fws_parser = subparsers.add_parser("update_fws", help="Update a Firework spec.") - update_fws_parser.add_argument(*fw_id_args, **fw_id_kwargs) - update_fws_parser.add_argument("-n", "--name", help="get FWs with this name") - update_fws_parser.add_argument(*state_args, **state_kwargs) - update_fws_parser.add_argument(*query_args, **query_kwargs) - update_fws_parser.add_argument(*launches_mode_args, **launches_mode_kwargs) - update_fws_parser.add_argument( - "-u", - "--update", - type=str, - help='Doc update (enclose pymongo-style dict in single-quotes, e.g. \'{"_tasks.1.hello": "world"}\')', - ) - update_fws_parser.add_argument( - "--mongo", - default=False, - action="store_true", - help="Use full pymongo style dict to modify spec. Be very careful as you can break your spec", - ) - update_fws_parser.add_argument( - "--password", - help="Today's date, e.g. 2012-02-25. 
Password or positive response to " - f"input prompt required when modifying more than {PW_CHECK_NUM} entries.", - ) - update_fws_parser.set_defaults(func=update_fws) - - get_wf_parser = subparsers.add_parser( - "get_wflows", - help="get information about Workflows", - aliases=["get_wfs"], - ) - get_wf_parser.add_argument(*fw_id_args, **fw_id_kwargs) - get_wf_parser.add_argument("-n", "--name", help="get WFs with this name") - get_wf_parser.add_argument(*state_args, **state_kwargs) - get_wf_parser.add_argument(*query_args, **query_kwargs) - get_wf_parser.add_argument(*disp_args, **disp_kwargs) - get_wf_parser.add_argument("-m", "--max", help="limit results", default=0, type=int) - get_wf_parser.add_argument("--sort", help="Sort results", choices=["created_on", "updated_on"]) - get_wf_parser.add_argument("--rsort", help="Reverse sort results", choices=["created_on", "updated_on"]) - get_wf_parser.add_argument( - "-t", - "--table", - help='Print results in table form instead of json. Needs prettytable. Works best with "-d less"', - action="store_true", - ) - get_wf_parser.set_defaults(func=get_wfs) - - defuse_wf_parser = subparsers.add_parser("defuse_wflows", help="cancel (de-fuse) an entire Workflow") - defuse_wf_parser.add_argument( - "--defuse_all_states", help="also defuse COMPLETED and FIZZLED workflows", action="store_true" - ) - defuse_wf_parser.add_argument(*fw_id_args, **fw_id_kwargs) - defuse_wf_parser.add_argument("-n", "--name", help="name") - defuse_wf_parser.add_argument(*state_args, **state_kwargs) - defuse_wf_parser.add_argument(*query_args, **query_kwargs) - defuse_wf_parser.add_argument( - "--password", - help="Today's date, e.g. 2012-02-25. 
Password or positive response to " - f"input prompt required when modifying more than {PW_CHECK_NUM} entries.", - ) - defuse_wf_parser.set_defaults(func=pause_wfs) - - pause_wf_parser = subparsers.add_parser( - "pause_wflows", - help="pause an entire Workflow", - aliases=["pause_wfs"], - ) - pause_wf_parser.add_argument(*fw_id_args, **fw_id_kwargs) - pause_wf_parser.add_argument("-n", "--name", help="name") - pause_wf_parser.add_argument(*state_args, **state_kwargs) - pause_wf_parser.add_argument(*query_args, **query_kwargs) - pause_wf_parser.add_argument( - "--password", - help="Today's date, e.g. 2012-02-25. Password or positive response to " - f"input prompt required when modifying more than {PW_CHECK_NUM} entries.", - ) - pause_wf_parser.set_defaults(func=pause_wfs) - - reignite_wfs_parser = subparsers.add_parser( - "reignite_wflows", - help="reignite (un-cancel) an entire Workflow", - aliases=["reignite_wfs"], - ) - reignite_wfs_parser.add_argument(*fw_id_args, **fw_id_kwargs) - reignite_wfs_parser.add_argument("-n", "--name", help="name") - reignite_wfs_parser.add_argument(*state_args, **state_kwargs) - reignite_wfs_parser.add_argument(*query_args, **query_kwargs) - reignite_wfs_parser.add_argument( - "--password", - help="Today's date, e.g. 2012-02-25. Password or positive response to " - f"input prompt required when modifying more than {PW_CHECK_NUM} entries.", - ) - reignite_wfs_parser.set_defaults(func=reignite_wfs) - - archive_parser = subparsers.add_parser( - "archive_wflows", - help="archive an entire Workflow (irreversible)", - aliases=["archive_wfs"], - ) - archive_parser.add_argument(*fw_id_args, **fw_id_kwargs) - archive_parser.add_argument("-n", "--name", help="name") - archive_parser.add_argument(*state_args, **state_kwargs) - archive_parser.add_argument(*query_args, **query_kwargs) - archive_parser.add_argument( - "--password", - help="Today's date, e.g. 2012-02-25. 
Password or positive response to " - f"input prompt required when modifying more than {PW_CHECK_NUM} entries.", - ) - archive_parser.set_defaults(func=archive) - - delete_wfs_parser = subparsers.add_parser( - "delete_wflows", - help='Delete workflows (permanently). Use "archive_wflows" instead if you want to "soft-remove"', - aliases=["delete_wfs"], - ) - delete_wfs_parser.add_argument(*fw_id_args, **fw_id_kwargs) - delete_wfs_parser.add_argument("-n", "--name", help="name") - delete_wfs_parser.add_argument(*state_args, **state_kwargs) - delete_wfs_parser.add_argument(*query_args, **query_kwargs) - delete_wfs_parser.add_argument( - "--password", - help="Today's date, e.g. 2012-02-25. Password or positive response to " - f"input prompt required when modifying more than {PW_CHECK_NUM} entries.", - ) - delete_wfs_parser.add_argument( - "--ldirs", - help="the launch directories associated with the WF will be deleted as well, if possible", - dest="delete_launch_dirs", - action="store_true", - ) - delete_wfs_parser.set_defaults(func=delete_wfs, delete_launch_dirs=False) - - get_qid_parser = subparsers.add_parser("get_qids", help="get the queue id of a Firework") - get_qid_parser.add_argument(*fw_id_args, **fw_id_kwargs) - get_qid_parser.set_defaults(func=get_qid) - - cancel_qid_parser = subparsers.add_parser("cancel_qid", help="cancel a reservation") - cancel_qid_parser.add_argument(*qid_args, **qid_kwargs) - cancel_qid_parser.set_defaults(func=cancel_qid) - - reservation_parser = subparsers.add_parser("detect_unreserved", help="Find launches with stale reservations") - reservation_parser.add_argument( - "--time", help="expiration time (seconds)", default=RESERVATION_EXPIRATION_SECS, type=int - ) - reservation_parser.add_argument("--rerun", help="cancel and rerun expired reservations", action="store_true") - reservation_parser.add_argument(*enh_disp_args, **enh_disp_kwargs) - reservation_parser.set_defaults(func=detect_unreserved) - - fizzled_parser = 
subparsers.add_parser("detect_lostruns", help="Find launches that have FIZZLED") - fizzled_parser.add_argument("--time", help="expiration time (seconds)", default=RUN_EXPIRATION_SECS, type=int) - fizzled_parser.add_argument("--fizzle", help="mark lost runs as fizzled", action="store_true") - fizzled_parser.add_argument("--rerun", help="rerun lost runs", action="store_true") - fizzled_parser.add_argument("--refresh", help="refresh the detected inconsistent fireworks", action="store_true") - fizzled_parser.add_argument( - "--max_runtime", help="max runtime, matching failures ran no longer than this (seconds)", type=int - ) - fizzled_parser.add_argument( - "--min_runtime", help="min runtime, matching failures must have run at least this long (seconds)", type=int - ) - fizzled_parser.add_argument("-q", "--query", help="restrict search to only FWs matching this query") - fizzled_parser.add_argument("-lq", "--launch_query", help="restrict search to only launches matching this query") - fizzled_parser.add_argument(*enh_disp_args, **enh_disp_kwargs) - fizzled_parser.set_defaults(func=detect_lostruns) - - priority_parser = subparsers.add_parser("set_priority", help="modify the priority of one or more FireWorks") - priority_parser.add_argument("priority", help="get FW with this fw_id", default=None, type=int) - priority_parser.add_argument(*fw_id_args, **fw_id_kwargs) - priority_parser.add_argument("-n", "--name", help="name") - priority_parser.add_argument(*state_args, **state_kwargs) - priority_parser.add_argument(*query_args, **query_kwargs) - priority_parser.add_argument(*launches_mode_args, **launches_mode_kwargs) - priority_parser.add_argument( - "--password", - help="Today's date, e.g. 2012-02-25. 
Password or positive response to " - f"input prompt required when modifying more than {PW_CHECK_NUM} entries.", - ) - priority_parser.add_argument( - "-wf", action="store_true", help="the priority will be set for all the fireworks of the matching workflows" - ) - priority_parser.set_defaults(func=set_priority) - - parser.add_argument( - "-l", "--launchpad_file", help="path to LaunchPad file containing central DB connection info", default=None - ) - parser.add_argument( - "-c", - "--config_dir", - help="path to a directory containing the LaunchPad file (used if -l unspecified)", - default=CONFIG_FILE_DIR, - ) - parser.add_argument("--logdir", help="path to a directory for logging") - parser.add_argument("--loglvl", help="level to print log messages", default="INFO") - parser.add_argument("-s", "--silencer", help="shortcut to mute log messages", action="store_true") - - webgui_parser = subparsers.add_parser("webgui", help="launch the web GUI") - webgui_parser.add_argument( - "--port", - dest="port", - type=int, - default=WEBSERVER_PORT, - help="Port to run the web server on (default: 5000 or WEBSERVER_PORT arg in FW_config.yaml)", - ) - webgui_parser.add_argument( - "--host", - dest="host", - type=str, - default=WEBSERVER_HOST, - help="Host to run the web server on (default: 127.0.0.1 or WEBSERVER_HOST arg in FW_config.yaml)", - ) - webgui_parser.add_argument("--debug", help="print debug messages", action="store_true") - webgui_parser.add_argument( - "-s", "--server_mode", help="run in server mode (skip opening the browser)", action="store_true" - ) - webgui_parser.add_argument( - "--nworkers", type=arg_positive_int, help="Number of worker processes for server mode", default=1 - ) - webgui_parser.add_argument("--fwquery", help="additional query filter for FireWorks as JSON string") - webgui_parser.add_argument("--wflowquery", help="additional query filter for Workflows as JSON string") - webgui_parser.add_argument( - "--webgui_username", help="Optional username 
needed to access webgui", type=str, default=None - ) - webgui_parser.add_argument( - "--webgui_password", help="Optional password needed to access webgui", type=str, default=None - ) - webgui_parser.set_defaults(func=webgui) - - recover_parser = subparsers.add_parser("recover_offline", help="recover offline workflows") - recover_parser.add_argument("-i", "--ignore_errors", help="ignore errors", action="store_true") - recover_parser.add_argument( - "-w", - "--fworker_file", - help="path to fworker file. An empty string will match all the workers", - default=FWORKER_LOC, - ) - recover_parser.add_argument("-pe", "--print-errors", help="print errors", action="store_true") - recover_parser.set_defaults(func=recover_offline) - - forget_parser = subparsers.add_parser("forget_offline", help="forget offline workflows") - forget_parser.add_argument("-n", "--name", help="name") - forget_parser.add_argument(*state_args, **state_kwargs) - forget_parser.add_argument(*query_args, **query_kwargs) - forget_parser.set_defaults(func=forget_offline) - - # admin commands - admin_parser = subparsers.add_parser( - "admin", help='Various db admin commands, type "lpad admin -h" for more.', parents=[parent_parser] - ) - admin_subparser = admin_parser.add_subparsers(title="action", dest="action_command") - - maintain_parser = admin_subparser.add_parser("maintain", help="Run database maintenance") - maintain_parser.add_argument("--infinite", help="loop infinitely", action="store_true") - maintain_parser.add_argument( - "--maintain_interval", - help="sleep time between maintenance loops (infinite mode)", - default=MAINTAIN_INTERVAL, - type=int, - ) - maintain_parser.set_defaults(func=maintain) - - orphaned_parser = admin_subparser.add_parser("orphaned", help="Find orphaned FireWorks") - orphaned_parser.add_argument(*fw_id_args, **fw_id_kwargs) - orphaned_parser.add_argument("-n", "--name", help="get FWs with this name") - orphaned_parser.add_argument(*state_args, **state_kwargs) - 
orphaned_parser.add_argument(*query_args, **query_kwargs) - orphaned_parser.add_argument(*launches_mode_args, **launches_mode_kwargs) - orphaned_parser.add_argument(*qid_args, **qid_kwargs) - orphaned_parser.add_argument(*disp_args, **disp_kwargs) - orphaned_parser.add_argument("-m", "--max", help="limit results", default=0, type=int) - orphaned_parser.add_argument("--sort", help="Sort results", choices=["created_on", "updated_on"]) - orphaned_parser.add_argument("--rsort", help="Reverse sort results", choices=["created_on", "updated_on"]) - orphaned_parser.add_argument("--remove", help="delete orphaned", action="store_true") - orphaned_parser.add_argument( - "--ldirs", - help="the launch directories associated with the orphaned Fireworks will be deleted as well, if possible", - dest="delete_launch_dirs", - action="store_true", - ) - orphaned_parser.set_defaults(func=orphaned) - - tuneup_parser = admin_subparser.add_parser( - "tuneup", help="Tune-up the database (should be performed during scheduled downtime)" - ) - tuneup_parser.add_argument( - "--full", help="Run full tuneup and compaction (should be run during DB downtime only)", action="store_true" - ) - tuneup_parser.set_defaults(func=tuneup) - - refresh_parser = admin_subparser.add_parser( - "refresh", help="manually force a workflow refresh (not usually needed)" - ) - refresh_parser.add_argument(*fw_id_args, **fw_id_kwargs) - refresh_parser.add_argument("-n", "--name", help="name") - refresh_parser.add_argument(*state_args, **state_kwargs) - refresh_parser.add_argument(*query_args, **query_kwargs) - refresh_parser.add_argument( - "--password", - help="Today's date, e.g. 2012-02-25. 
Password or positive response to " - f"input prompt required when modifying more than {PW_CHECK_NUM} entries.", - ) - refresh_parser.set_defaults(func=refresh) - - unlock_parser = admin_subparser.add_parser( - "unlock", help="manually unlock a workflow that is locked (only if you know what you are doing!)" - ) - unlock_parser.add_argument(*fw_id_args, **fw_id_kwargs) - unlock_parser.add_argument("-n", "--name", help="name") - unlock_parser.add_argument(*state_args, **state_kwargs) - unlock_parser.add_argument(*query_args, **query_kwargs) - unlock_parser.add_argument( - "--password", - help="Today's date, e.g. 2012-02-25. Password or positive response to " - f"input prompt required when modifying more than {PW_CHECK_NUM} entries.", - ) - unlock_parser.set_defaults(func=unlock) - - report_parser = subparsers.add_parser( - "report", help='Compile a report of runtime stats, type "lpad report -h" for more options.' - ) - report_parser.add_argument( - "-c", - "--collection", - help="The collection to report on; choose from 'fws' (default), 'wflows', or 'launches'.", - default="fws", - ) - report_parser.add_argument( - "-i", - "--interval", - help="Interval on which to split the report. 
" - "Choose from 'minutes', 'hours', " - "'days' (default), 'months', or 'years'.", - default="days", - ) - report_parser.add_argument( - "-n", "--num_intervals", help="The number of intervals on which to report (default=5)", type=int, default=5 - ) - report_parser.add_argument("-q", "--query", help="Additional Pymongo queries to filter entries before processing.") - report_parser.set_defaults(func=report) - - introspect_parser = subparsers.add_parser("introspect", help="Introspect recent runs to pin down errors") - introspect_parser.add_argument("-m", "--max", help="examine past results", default=100, type=int) - introspect_parser.add_argument( - "-t", - "--threshold", - help="controls signal to noise ratio, e.g., 10 means " - "difference of at least 10 runs between fizzled/completed count", - default=10, - type=int, - ) - introspect_parser.set_defaults(func=introspect) - - try: - import argcomplete - - argcomplete.autocomplete(parser) - # This supports bash autocompletion. To enable this, pip install - # argcomplete, activate global completion, or add - # eval "$(register-python-argcomplete lpad)" - # into your .bash_profile or .bashrc - except ImportError: - pass - - args = parser.parse_args(argv) - - cfg_files_to_check = [("launchpad", "-l", False, LAUNCHPAD_LOC)] - if hasattr(args, "fworker_file"): - cfg_files_to_check.append(("fworker", "-w", False, FWORKER_LOC)) - _validate_config_file_paths(args, cfg_files_to_check) - - if args.command is None: - # if no command supplied, print help - parser.print_help() - else: - for opt in fw_id_options: - if hasattr(args, opt) and getattr(args, opt) is not None and isinstance(getattr(args, opt), str): - if "," in getattr(args, opt): - setattr(args, opt, [int(x) for x in getattr(args, opt).split(",")]) - else: - setattr(args, opt, [int(getattr(args, opt))]) - - args.func(args) - - return 0 - - -if __name__ == "__main__": - raise SystemExit(lpad()) From 250cb9b130ab714a111475ede6d29bb21b5c240b Mon Sep 17 00:00:00 2001 
From: Wayne Date: Fri, 26 Jul 2024 12:13:56 -0700 Subject: [PATCH 07/12] updated error messages --- fireworks/utilities/reservation_finder.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fireworks/utilities/reservation_finder.py b/fireworks/utilities/reservation_finder.py index 6893fc4fd..7fe4c139f 100644 --- a/fireworks/utilities/reservation_finder.py +++ b/fireworks/utilities/reservation_finder.py @@ -10,7 +10,8 @@ def execute_command(command): try: result = subprocess.run(command, shell=True, capture_output=True, text=True) if result.returncode != 0: - raise Exception(f"Command failed: {command}\n{result.stderr}") + #raise Exception(f"Command failed: {command}\n{result.stderr}") + raise Exception('Running fireworks locally') ssh=None return result.stdout.strip(),ssh except Exception as e: From 6c74d1e29d76091f506124e26d12b4674abd2ba1 Mon Sep 17 00:00:00 2001 From: Wayne Date: Fri, 26 Jul 2024 12:14:50 -0700 Subject: [PATCH 08/12] removed .ipynb_checkpoints again --- .../launchpad-checkpoint.py | 2196 ----------------- 1 file changed, 2196 deletions(-) delete mode 100644 fireworks/core/.ipynb_checkpoints/launchpad-checkpoint.py diff --git a/fireworks/core/.ipynb_checkpoints/launchpad-checkpoint.py b/fireworks/core/.ipynb_checkpoints/launchpad-checkpoint.py deleted file mode 100644 index 432d5f429..000000000 --- a/fireworks/core/.ipynb_checkpoints/launchpad-checkpoint.py +++ /dev/null @@ -1,2196 +0,0 @@ -"""The LaunchPad manages the FireWorks database.""" - -import datetime -import json -import os -import random -import shutil -import time -import traceback -import warnings -from collections import defaultdict -from itertools import chain -from sustodian import FindMyFW - -import gridfs -from bson import ObjectId -from monty.os.path import zpath -from monty.serialization import loadfn -from pymongo import ASCENDING, DESCENDING -from pymongo.errors import DocumentTooLarge -from tqdm import tqdm - -from fireworks.core.firework import Firework, 
FWAction, Launch, Tracker, Workflow -from fireworks.fw_config import MongoClient -from fireworks.fw_config import ( - GRIDFS_FALLBACK_COLLECTION, - LAUNCHPAD_LOC, - MAINTAIN_INTERVAL, - MONGO_SOCKET_TIMEOUT_MS, - RESERVATION_EXPIRATION_SECS, - RUN_EXPIRATION_SECS, - SORT_FWS, - WFLOCK_EXPIRATION_KILL, - WFLOCK_EXPIRATION_SECS, -) -from fireworks.utilities.fw_serializers import FWSerializable, reconstitute_dates, recursive_dict -from fireworks.utilities.fw_utilities import get_fw_logger - -__author__ = "Anubhav Jain" -__copyright__ = "Copyright 2013, The Materials Project" -__maintainer__ = "Anubhav Jain" -__email__ = "ajain@lbl.gov" -__date__ = "Jan 30, 2013" - - -# TODO: lots of duplication reduction and cleanup possible - - -def sort_aggregation(sort): - """Build sorting aggregation pipeline. - - Args: - sort [(str,int)]: sorting keys and directions as a list of - (str, int) tuples, i.e. [('updated_on', 1)] - """ - # Fix for sorting by dates which are actually stored as strings: - # Not sure about the underlying issue's source, but apparently some - # dates are stored as strings and others as date objects. - # Following pipeline makes sure all stored dates are actually date - # objects for proper comparison when sorting. - # Assumption below is that dates are either strings or date objects, - # nothing else. - aggregation = [] - for k, _ in sort: - if k in {"updated_on", "created_on"}: - aggregation.append( - { - "$set": { - k: { - "$dateFromString": { - "dateString": "$" + k, - "onError": "$" + k, # if conversion fails, just return original object - } - } - } - } - ) - aggregation.append({"$sort": dict(sort)}) - return aggregation - - -class LockedWorkflowError(ValueError): - """ - Error raised if the context manager WFLock can't acquire the lock on the WF within the selected - time interval (WFLOCK_EXPIRATION_SECS), if the killing of the lock is disabled (WFLOCK_EXPIRATION_KILL). - """ - - -class WFLock: - """ - Lock a Workflow, i.e. 
for performing update operations - Raises a LockedWorkflowError if the lock couldn't be acquired within expire_secs and kill==False. - Calling functions are responsible for handling the error in order to avoid database inconsistencies. - """ - - def __init__(self, lp, fw_id, expire_secs=WFLOCK_EXPIRATION_SECS, kill=WFLOCK_EXPIRATION_KILL) -> None: - """ - Args: - lp (LaunchPad) - fw_id (int): Firework id - expire_secs (int): max waiting time in seconds. - kill (bool): force lock acquisition or not. - """ - self.lp = lp - self.fw_id = fw_id - self.expire_secs = expire_secs - self.kill = kill - - def __enter__(self): - ctr = 0 - waiting_time = 0 - # acquire lock - links_dict = self.lp.workflows.find_one_and_update( - {"nodes": self.fw_id, "locked": {"$exists": False}}, {"$set": {"locked": True}} - ) - # could not acquire lock b/c WF is already locked for writing - while not links_dict: - ctr += 1 - time_incr = ctr / 10.0 + random.random() / 100.0 - time.sleep(time_incr) # wait a bit for lock to free up - waiting_time += time_incr - if waiting_time > self.expire_secs: # too much time waiting, expire lock - wf = self.lp.workflows.find_one({"nodes": self.fw_id}) - if not wf: - raise ValueError(f"Could not find workflow in database: {self.fw_id}") - if self.kill: # force lock acquisition - self.lp.m_logger.warning(f"FORCIBLY ACQUIRING LOCK, WF: {self.fw_id}") - links_dict = self.lp.workflows.find_one_and_update( - {"nodes": self.fw_id}, {"$set": {"locked": True}} - ) - else: # throw error if we don't want to force lock acquisition - raise LockedWorkflowError(f"Could not get workflow - LOCKED: {self.fw_id}") - else: - # retry lock - links_dict = self.lp.workflows.find_one_and_update( - {"nodes": self.fw_id, "locked": {"$exists": False}}, {"$set": {"locked": True}} - ) - - def __exit__(self, exc_type, exc_val, exc_tb): - self.lp.workflows.find_one_and_update({"nodes": self.fw_id}, {"$unset": {"locked": True}}) - - -class LaunchPad(FWSerializable): - """The LaunchPad 
manages the FireWorks database.""" - - def __init__( - self, - host=None, - port=None, - name=None, - username=None, - password=None, - logdir=None, - strm_lvl=None, - user_indices=None, - wf_user_indices=None, - authsource=None, - uri_mode=False, - mongoclient_kwargs=None, - ) -> None: - """ - Args: - host (str): hostname. If uri_mode is True, a MongoDB connection string URI - (https://docs.mongodb.com/manual/reference/connection-string/) can be used instead of the remaining - options below. - port (int): port number - name (str): database name - username (str) - password (str) - logdir (str): path to the log directory - strm_lvl (str): the logger stream level - user_indices (list): list of 'fireworks' collection indexes to be built - wf_user_indices (list): list of 'workflows' collection indexes to be built - authsource (str): authSource parameter for MongoDB authentication; defaults to "name" (i.e., db name) if - not set - uri_mode (bool): if set True, all Mongo connection parameters occur through a MongoDB URI string (set as - the host). - mongoclient_kwargs (dict): A list of any other custom keyword arguments to be - passed into the MongoClient connection. Use these kwargs to specify SSL/TLS or serverSelectionTimeoutMS - arguments. Note these arguments are different depending on the major pymongo version used; see - pymongo documentation for more details. 
- """ - self.host = host if (host or uri_mode) else "localhost" - self.port = port if (port or uri_mode) else 27017 - self.name = name if (name or uri_mode) else "fireworks" - self.username = username - self.password = password - self.authsource = authsource or self.name - self.mongoclient_kwargs = mongoclient_kwargs or {} - self.uri_mode = uri_mode - - # set up logger - self.logdir = logdir - self.strm_lvl = strm_lvl or "INFO" - self.m_logger = get_fw_logger("launchpad", l_dir=self.logdir, stream_level=self.strm_lvl) - - self.user_indices = user_indices or [] - self.wf_user_indices = wf_user_indices or [] - - # get connection - if uri_mode: - self.connection = MongoClient(host, **self.mongoclient_kwargs) - if self.name is None: - raise ValueError("Must specify a database name when using a MongoDB URI string.") - self.db = self.connection[self.name] - else: - self.connection = MongoClient( - self.host, - self.port, - socketTimeoutMS=MONGO_SOCKET_TIMEOUT_MS, - username=self.username, - password=self.password, - authSource=self.authsource, - **self.mongoclient_kwargs, - ) - self.db = self.connection[self.name] - - self.fireworks = self.db.fireworks - self.launches = self.db.launches - self.offline_runs = self.db.offline_runs - self.fw_id_assigner = self.db.fw_id_assigner - self.workflows = self.db.workflows - if GRIDFS_FALLBACK_COLLECTION: - self.gridfs_fallback = gridfs.GridFS(self.db, GRIDFS_FALLBACK_COLLECTION) - else: - self.gridfs_fallback = None - - self.backup_launch_data = {} - self.backup_fw_data = {} - - def to_dict(self): - """Note: usernames/passwords are exported as unencrypted Strings!""" - return { - "host": self.host, - "port": self.port, - "name": self.name, - "username": self.username, - "password": self.password, - "logdir": self.logdir, - "strm_lvl": self.strm_lvl, - "user_indices": self.user_indices, - "wf_user_indices": self.wf_user_indices, - "authsource": self.authsource, - "uri_mode": self.uri_mode, - "mongoclient_kwargs": 
self.mongoclient_kwargs, - } - - def update_spec(self, fw_ids, spec_document, mongo=False) -> None: - """ - Update fireworks with a spec. Sometimes you need to modify a firework in progress. - - Args: - fw_ids [int]: All fw_ids to modify. - spec_document (dict): The spec document. Note that only modifications to - the spec key are allowed. So if you supply {"_tasks.1.parameter": "hello"}, - you are effectively modifying spec._tasks.1.parameter in the actual fireworks - collection. - mongo (bool): spec_document uses mongo syntax to directly update the spec - """ - mod_spec = spec_document if mongo else {"$set": {"spec." + k: v for k, v in spec_document.items()}} - - allowed_states = ["READY", "WAITING", "FIZZLED", "DEFUSED", "PAUSED"] - self.fireworks.update_many({"fw_id": {"$in": fw_ids}, "state": {"$in": allowed_states}}, mod_spec) - for fw in self.fireworks.find( - {"fw_id": {"$in": fw_ids}, "state": {"$nin": allowed_states}}, {"fw_id": 1, "state": 1} - ): - self.m_logger.warning( - f"Cannot update spec of fw_id: {fw['fw_id']} with state: {fw['state']}. Try rerunning first." 
- ) - - @classmethod - def from_dict(cls, d): - port = d.get("port", None) - name = d.get("name", None) - username = d.get("username", None) - password = d.get("password", None) - logdir = d.get("logdir", None) - strm_lvl = d.get("strm_lvl", None) - user_indices = d.get("user_indices", []) - wf_user_indices = d.get("wf_user_indices", []) - authsource = d.get("authsource", None) - uri_mode = d.get("uri_mode", False) - mongoclient_kwargs = d.get("mongoclient_kwargs", None) - return LaunchPad( - d["host"], - port, - name, - username, - password, - logdir, - strm_lvl, - user_indices, - wf_user_indices, - authsource, - uri_mode, - mongoclient_kwargs, - ) - - @classmethod - def auto_load(cls): - if LAUNCHPAD_LOC: - return LaunchPad.from_file(LAUNCHPAD_LOC) - return LaunchPad() - - def reset(self, password, require_password=True, max_reset_wo_password=25) -> None: - """ - Create a new FireWorks database. This will overwrite the existing FireWorks database! To - safeguard against accidentally erasing an existing database, a password must be entered. - - Args: - password (str): A String representing today's date, e.g. '2012-12-31' - require_password (bool): Whether a password is required to reset the DB. Setting to - false is dangerous because running code unintentionally could clear your DB - use - max_reset_wo_password to minimize risk. 
- max_reset_wo_password (int): A failsafe; when require_password is set to False, - FWS will not clear DBs that contain more workflows than this parameter - """ - m_password = datetime.datetime.now().strftime("%Y-%m-%d") - - if password == m_password or ( - not require_password and self.workflows.count_documents({}) <= max_reset_wo_password - ): - self.fireworks.delete_many({}) - self.launches.delete_many({}) - self.workflows.delete_many({}) - self.offline_runs.delete_many({}) - self._restart_ids(1, 1) - if self.gridfs_fallback is not None: - self.db.drop_collection(f"{GRIDFS_FALLBACK_COLLECTION}.chunks") - self.db.drop_collection(f"{GRIDFS_FALLBACK_COLLECTION}.files") - self.tuneup() - self.m_logger.info("LaunchPad was RESET.") - elif not require_password: - raise ValueError( - f"Password check cannot be overridden since the size of DB ({self.fireworks.count_documents({})} " - f"workflows) is greater than the max_reset_wo_password parameter ({max_reset_wo_password})." - ) - else: - raise ValueError(f"Invalid password! Password is today's date: {m_password}") - - def maintain(self, infinite=True, maintain_interval=None) -> None: - """ - Perform launchpad maintenance: detect lost runs and unreserved RESERVE launches. 
- - Args: - infinite (bool) - maintain_interval (seconds): sleep time - """ - maintain_interval = maintain_interval or MAINTAIN_INTERVAL - - while True: - self.m_logger.info("Performing maintenance on Launchpad...") - self.m_logger.debug("Tracking down FIZZLED jobs...") - fl, ff, inconsistent_fw_ids = self.detect_lostruns(fizzle=True) - if fl: - self.m_logger.info(f"Detected {len(fl)} FIZZLED launches: {fl}") - self.m_logger.info(f"Detected {len(ff)} FIZZLED FWs: {ff}") - if inconsistent_fw_ids: - self.m_logger.info( - f"Detected {len(inconsistent_fw_ids)} FIZZLED inconsistent fireworks: {inconsistent_fw_ids}" - ) - - self.m_logger.debug("Tracking down stuck RESERVED jobs...") - ur = self.detect_unreserved(rerun=True) - if ur: - self.m_logger.info(f"Unreserved {len(ur)} RESERVED launches: {ur}") - - self.m_logger.info("LaunchPad was MAINTAINED.") - - if not infinite: - break - - self.m_logger.debug(f"Sleeping for {maintain_interval} secs...") - time.sleep(maintain_interval) - - def add_wf(self, wf, reassign_all=True): - """ - Add workflow(or firework) to the launchpad. The firework ids will be reassigned. - - Args: - wf (Workflow/Firework): Workflow or Firework object - reassign_all (bool): If True, the firework ids will be assigned - starting from the next available id. Defaults to True. - - Returns: - dict: mapping between old and new Firework ids - """ - if isinstance(wf, Firework): - wf = Workflow.from_firework(wf) - # sets the root FWs as READY - # prefer to wf.refresh() for speed reasons w/many root FWs - for fw_id in wf.root_fw_ids: - wf.id_fw[fw_id].state = "READY" - wf.fw_states[fw_id] = "READY" - # insert the FireWorks and get back mapping of old to new ids - old_new = self._upsert_fws(list(wf.id_fw.values()), reassign_all=reassign_all) - # update the Workflow with the new ids - wf._reassign_ids(old_new) - # insert the WFLinks - self.workflows.insert_one(wf.to_db_dict()) - self.m_logger.info(f"Added a workflow. 
id_map: {old_new}") - return old_new - - def bulk_add_wfs(self, wfs) -> None: - """ - Adds a list of workflows to the fireworks database - using insert_many for both the fws and wfs, is - more efficient than adding them one at a time. - - Args: - wfs ([Workflow]): list of workflows or fireworks - - Returns: - None - - """ - # Make all fireworks workflows - wfs = [Workflow.from_Firework(wf) if isinstance(wf, Firework) else wf for wf in wfs] - - # Initialize new firework counter, starting from the next fw id - total_num_fws = sum(len(wf) for wf in wfs) - new_fw_counter = self.fw_id_assigner.find_one_and_update({}, {"$inc": {"next_fw_id": total_num_fws}})[ - "next_fw_id" - ] - for wf in tqdm(wfs): - # Reassign fw_ids and increment the counter - old_new = dict(zip(wf.id_fw, range(new_fw_counter, new_fw_counter + len(wf)))) - for fw in wf: - fw.fw_id = old_new[fw.fw_id] - wf._reassign_ids(old_new) - new_fw_counter += len(wf) - - # Set root fws to READY - for fw_id in wf.root_fw_ids: - wf.id_fw[fw_id].state = "READY" - wf.fw_states[fw_id] = "READY" - - # Insert all fws and wfs, do workflows first so fws don't - # get checked out prematurely - self.workflows.insert_many(wf.to_db_dict() for wf in wfs) - all_fws = chain.from_iterable(wf for wf in wfs) - self.fireworks.insert_many(fw.to_db_dict() for fw in all_fws) - return - - def append_wf(self, new_wf, fw_ids, detour=False, pull_spec_mods=True) -> None: - """ - Append a new workflow on top of an existing workflow. 
- - Args: - new_wf (Workflow): The new workflow to append - fw_ids ([int]): The parent fw_ids at which to append the workflow - detour (bool): Whether to connect the new Workflow in a "detour" style, i.e., move - original children of the parent fw_ids to the new_wf - pull_spec_mods (bool): Whether the new Workflow should pull the FWActions of the parent - fw_ids - """ - wf = self.get_wf_by_fw_id(fw_ids[0]) - updated_ids = wf.append_wf(new_wf, fw_ids, detour=detour, pull_spec_mods=pull_spec_mods) - with WFLock(self, fw_ids[0]): - self._update_wf(wf, updated_ids) - - def get_launch_by_id(self, launch_id): - """ - Given a Launch id, return details of the Launch. - - Args: - launch_id (int): launch id. - - Returns: - Launch object - """ - m_launch = self.launches.find_one({"launch_id": launch_id}) - if m_launch: - m_launch["action"] = get_action_from_gridfs(m_launch.get("action"), self.gridfs_fallback) - return Launch.from_dict(m_launch) - raise ValueError(f"No Launch exists with {launch_id=}") - - def get_fw_dict_by_id(self, fw_id): - """ - Given firework id, return firework dict. - - Args: - fw_id (int): Firework id. 
- - Returns: - dict - """ - fw_dict = self.fireworks.find_one({"fw_id": fw_id}) - if not fw_dict: - raise ValueError(f"No Firework exists with id: {fw_id}") - # recreate launches from the launch collection - launches = list( - self.launches.find({"launch_id": {"$in": fw_dict["launches"]}}, sort=[("launch_id", ASCENDING)]) - ) - for launch in launches: - launch["action"] = get_action_from_gridfs(launch.get("action"), self.gridfs_fallback) - fw_dict["launches"] = launches - launches = list( - self.launches.find({"launch_id": {"$in": fw_dict["archived_launches"]}}, sort=[("launch_id", ASCENDING)]) - ) - for launch in launches: - launch["action"] = get_action_from_gridfs(launch.get("action"), self.gridfs_fallback) - fw_dict["archived_launches"] = launches - return fw_dict - - def get_fw_by_id(self, fw_id): - """ - Given a Firework id, give back a Firework object. - - Args: - fw_id (int): Firework id. - - Returns: - Firework object - """ - return Firework.from_dict(self.get_fw_dict_by_id(fw_id)) - - def get_wf_by_fw_id(self, fw_id): - """Given a Firework id, give back the Workflow containing that Firework. - - Args: - fw_id (int): Firework id. - - Returns: - A Workflow object - """ - links_dict = self.workflows.find_one({"nodes": fw_id}) - if not links_dict: - raise ValueError(f"Could not find a Workflow with {fw_id=}") - fws = map(self.get_fw_by_id, links_dict["nodes"]) - return Workflow( - fws, - links_dict["links"], - links_dict["name"], - links_dict["metadata"], - links_dict["created_on"], - links_dict["updated_on"], - ) - - def get_wf_by_fw_id_lzyfw(self, fw_id: int) -> Workflow: - """Given a FireWork id, give back the Workflow containing that FireWork. - - Args: - fw_id (int): FireWork id. 
- - Returns: - A Workflow object - """ - links_dict = self.workflows.find_one({"nodes": fw_id}) - if not links_dict: - raise ValueError(f"Could not find a Workflow with {fw_id=}") - - fws = [ - LazyFirework(fw_id, self.fireworks, self.launches, self.gridfs_fallback) for fw_id in links_dict["nodes"] - ] - - # Check for fw_states in links_dict to conform with pre-optimized workflows - fw_states = {int(k): v for k, v in links_dict["fw_states"].items()} if "fw_states" in links_dict else None - - return Workflow( - fws, - links_dict["links"], - links_dict["name"], - links_dict["metadata"], - links_dict["created_on"], - links_dict["updated_on"], - fw_states, - ) - - def delete_fws(self, fw_ids, delete_launch_dirs=False) -> None: - """Delete a set of fireworks identified by their fw_ids. - - ATTENTION: This function serves maintenance purposes and will leave - workflows untouched. Its use will thus result in a corrupted database. - Use 'delete_wf' instead for consistently deleting workflows together - with their fireworks. - - Args: - fw_ids ([int]): Firework ids - delete_launch_dirs (bool): if True all the launch directories associated with - the WF will be deleted as well, if possible. 
- """ - potential_launch_ids = [] - launch_ids = [] - for fw_id in fw_ids: - fw_dict = self.fireworks.find_one({"fw_id": fw_id}) - if fw_dict: - potential_launch_ids += fw_dict.get("launches", []) + fw_dict.get("archived_launches", []) - - launch_ids = [ - launch_id - for launch_id in potential_launch_ids - if not self.fireworks.find_one( - {"$or": [{"launches": launch_id}, {"archived_launches": launch_id}], "fw_id": {"$nin": fw_ids}}, - {"launch_id": 1}, - ) - ] - - if delete_launch_dirs: - launch_dirs = [ - self.launches.find_one({"launch_id": launch_id}, {"launch_dir": 1})["launch_dir"] - for launch_id in launch_ids - ] - print(f"Remove folders {launch_dirs}") - for launch_dir in launch_dirs: - shutil.rmtree(launch_dir, ignore_errors=True) - - print(f"Remove fws {fw_ids}") - if self.gridfs_fallback is not None: - for launch_id in launch_ids: - for file_id in self.gridfs_fallback.find({"metadata.launch_id": launch_id}): - self.gridfs_fallback.delete(file_id._id) - print(f"Remove launches {launch_ids}") - self.launches.delete_many({"launch_id": {"$in": launch_ids}}) - self.offline_runs.delete_many({"launch_id": {"$in": launch_ids}}) - self.fireworks.delete_many({"fw_id": {"$in": fw_ids}}) - - def delete_wf(self, fw_id, delete_launch_dirs=False) -> None: - """ - Delete the workflow containing firework with the given id. - - Args: - fw_id (int): Firework id - delete_launch_dirs (bool): if True all the launch directories associated with - the WF will be deleted as well, if possible. - delete_launch_dirs - """ - links_dict = self.workflows.find_one({"nodes": fw_id}) - fw_ids = links_dict["nodes"] - self.delete_fws(fw_ids, delete_launch_dirs=delete_launch_dirs) - print("Removing workflow.") - self.workflows.delete_one({"nodes": fw_id}) - - def get_wf_summary_dict(self, fw_id, mode="more"): - """ - A much faster way to get summary information about a Workflow by querying only for - needed information. - - Args: - fw_id (int): A Firework id. 
- mode (str): Choose between "more", "less" and "all" in terms of quantity of information. - - Returns: - dict: information about Workflow. - """ - wf_fields = ["state", "created_on", "name", "nodes"] - fw_fields = ["state", "fw_id"] - launch_fields = [] - - if mode != "less": - wf_fields.append("updated_on") - fw_fields.extend(["name", "launches"]) - launch_fields.extend(("launch_id", "launch_dir")) - - if mode == "reservations": - launch_fields.append("state_history.reservation_id") - - if mode == "all": - wf_fields = None - - wf = self.workflows.find_one({"nodes": fw_id}, projection=wf_fields) - fw_data = [] - id_name_map = {} - launch_ids = [] - for fw in self.fireworks.find({"fw_id": {"$in": wf["nodes"]}}, projection=fw_fields): - if launch_fields: - launch_ids.extend(fw["launches"]) - fw_data.append(fw) - if mode != "less": - id_name_map[fw["fw_id"]] = f"{fw['name']}--{int(fw['fw_id'])}" - - if launch_fields: - launch_info = defaultdict(list) - for launch in self.launches.find({"launch_id": {"$in": launch_ids}}, projection=launch_fields): - for i, fw in enumerate(fw_data): - if launch["launch_id"] in fw["launches"]: - launch_info[i].append(launch) - for k, v in launch_info.items(): - fw_data[k]["launches"] = v - - wf["fw"] = fw_data - - # Post process the summary dict so that it "looks" better. 
- if mode == "less": - wf["states_list"] = "-".join( - [fw["state"][:3] if fw["state"].startswith("R") else fw["state"][0] for fw in wf["fw"]] - ) - del wf["nodes"] - - if mode == "more" or mode == "all": - wf["states"] = {} - wf["launch_dirs"] = {} - for fw in wf["fw"]: - k = f"{fw['name']}--{int(fw['fw_id'])}" - wf["states"][k] = fw["state"] - wf["launch_dirs"][k] = [launch["launch_dir"] for launch in fw["launches"]] - del wf["nodes"] - - if mode == "all": - del wf["fw_states"] - wf["links"] = {id_name_map[int(k)]: [id_name_map[i] for i in v] for k, v in wf["links"].items()} - wf["parent_links"] = { - id_name_map[int(k)]: [id_name_map[i] for i in v] for k, v in wf["parent_links"].items() - } - if mode == "reservations": - wf["states"] = {} - wf["launches"] = {} - for fw in wf["fw"]: - k = f"{fw['name']}--{int(fw['fw_id'])}" - wf["states"][k] = fw["state"] - wf["launches"][k] = fw["launches"] - del wf["nodes"] - - del wf["_id"] - del wf["fw"] - - return wf - - def get_fw_ids(self, query=None, sort=None, limit=0, count_only=False, launches_mode=False): - """ - Return all the fw ids that match a query. 
- - Args: - query (dict): representing a Mongo query - sort [(str,str)]: sort argument in Pymongo format - limit (int): limit the results - count_only (bool): only return the count rather than explicit ids - launches_mode (bool): query the launches collection instead of fireworks - - Returns: - list: list of firework ids matching the query - """ - coll = "launches" if launches_mode else "fireworks" - criteria = query or {} - if launches_mode: - lids = self._get_active_launch_ids() - criteria["launch_id"] = {"$in": lids} - - if count_only and limit: - return ValueError("Cannot count_only and limit at the same time!") - - aggregation = [] - - if criteria is not None: - aggregation.append({"$match": criteria}) - - if count_only: - aggregation.append({"$count": "count"}) - self.m_logger.debug(f"Aggregation '{aggregation}'.") - - cursor = getattr(self, coll).aggregate(aggregation) - res = list(cursor) - return res[0]["count"] if len(res) > 0 else 0 - - if sort is not None: - aggregation.extend(sort_aggregation(sort)) - - aggregation.append({"$project": {"fw_id": True, "_id": False}}) - - if limit is not None and limit > 0: - aggregation.append({"$limit": limit}) - - self.m_logger.debug(f"Aggregation '{aggregation}'.") - cursor = getattr(self, coll).aggregate(aggregation) - return [fw["fw_id"] for fw in cursor] - - def get_wf_ids(self, query=None, sort=None, limit=0, count_only=False): - """ - Return one fw id for all workflows that match a query. 
- - Args: - query (dict): representing a Mongo query - sort [(str,str)]: sort argument in Pymongo format - limit (int): limit the results - count_only (bool): only return the count rather than explicit ids - - Returns: - list: list of firework ids - """ - criteria = query or {} - aggregation = [] - - if criteria is not None: - aggregation.append({"$match": criteria}) - - if count_only: - aggregation.append({"$count": "count"}) - self.m_logger.debug(f"Aggregation '{aggregation}'.") - - cursor = self.workflows.aggregate(aggregation) - res = list(cursor) - return res[0]["count"] if len(res) > 0 else 0 - - if sort is not None: - aggregation.extend(sort_aggregation(sort)) - - aggregation.append({"$project": {"nodes": True, "_id": False}}) - - if limit is not None and limit > 0: - aggregation.append({"$limit": limit}) - - self.m_logger.debug(f"Aggregation '{aggregation}'.") - cursor = self.workflows.aggregate(aggregation) - - return [fw["nodes"][0] for fw in cursor] - - def get_fw_ids_in_wfs( - self, wf_query=None, fw_query=None, sort=None, limit=0, count_only=False, launches_mode=False - ): - """ - Return all fw ids that match fw_query within workflows that match wf_query. 
- - Args: - wf_query (dict): representing a Mongo query on workflows - fw_query (dict): representing a Mongo query on Fireworks - sort [(str,str)]: sort argument in Pymongo format - limit (int): limit the results - count_only (bool): only return the count rather than explicit ids - launches_mode (bool): query the launches collection instead of fireworks - - Returns: - list: list of firework ids matching the query - """ - coll = "launches" if launches_mode else "fireworks" - if launches_mode: - lids = self._get_active_launch_ids() - if fw_query is None: - fw_query = {} - fw_query["launch_id"] = {"$in": lids} - - if count_only and limit: - return ValueError("Cannot count_only and limit at the same time!") - - aggregation = [] - - if wf_query is not None: - aggregation.append( - {"$match": wf_query}, - ) - - aggregation.extend( - [ - {"$project": {"nodes": True, "_id": False}}, - {"$unwind": "$nodes"}, - { - "$lookup": { - "from": coll, # fireworks or launches - "localField": "nodes", - "foreignField": "fw_id", - "as": "fireworks", - } - }, - {"$project": {"fireworks": 1, "_id": 0}}, - {"$unwind": "$fireworks"}, - {"$replaceRoot": {"newRoot": "$fireworks"}}, - ] - ) - - if fw_query is not None: - aggregation.append({"$match": fw_query}) - - if count_only: - aggregation.append({"$count": "count"}) - self.m_logger.debug(f"Aggregation '{aggregation}'.") - - cursor = self.workflows.aggregate(aggregation) - res = list(cursor) - return res[0]["count"] if len(res) > 0 else 0 - - if sort is not None: - aggregation.extend(sort_aggregation(sort)) - - aggregation.append({"$project": {"fw_id": True, "_id": False}}) - - if limit is not None and limit > 0: - aggregation.append({"$limit": limit}) - - self.m_logger.debug(f"Aggregation '{aggregation}'.") - cursor = self.workflows.aggregate(aggregation) - return [fw["fw_id"] for fw in cursor] - - def run_exists(self, fworker=None): - """ - Checks to see if the database contains any FireWorks that are ready to run. 
- - Returns: - bool: True if the database contains any FireWorks that are ready to run. - """ - q = fworker.query if fworker else {} - return bool(self._get_a_fw_to_run(query=q, checkout=False)) - - def future_run_exists(self, fworker=None) -> bool: - """Check if database has any current OR future Fireworks available. - - Returns: - bool: True if database has any ready or waiting Fireworks. - """ - if self.run_exists(fworker): - # check first to see if any are READY - return True - # retrieve all [RUNNING/RESERVED] fireworks - q = fworker.query if fworker else {} - q.update(state={"$in": ["RUNNING", "RESERVED"]}) - active = self.get_fw_ids(q) - # then check if they have WAITING children - for fw_id in active: - children = self.get_wf_by_fw_id_lzyfw(fw_id).links[fw_id] - if any(self.get_fw_dict_by_id(i)["state"] == "WAITING" for i in children): - return True - - # if we loop over all active and none have WAITING children - # there is no future work to do - return False - - def tuneup(self, bkground=True) -> None: - """Database tuneup: build indexes.""" - self.m_logger.info("Performing db tune-up") - - self.m_logger.debug("Updating indices...") - self.fireworks.create_index("fw_id", unique=True, background=bkground) - for f in ("state", "spec._category", "created_on", "updated_on", "name", "launches"): - self.fireworks.create_index(f, background=bkground) - - self.launches.create_index("launch_id", unique=True, background=bkground) - self.launches.create_index("fw_id", background=bkground) - self.launches.create_index("state_history.reservation_id", background=bkground) - - if GRIDFS_FALLBACK_COLLECTION is not None: - files_collection = self.db[f"{GRIDFS_FALLBACK_COLLECTION}.files"] - files_collection.create_index("metadata.launch_id", unique=True, background=bkground) - - for f in ("state", "time_start", "time_end", "host", "ip", "fworker.name"): - self.launches.create_index(f, background=bkground) - - for f in ("name", "created_on", "updated_on", "nodes"): - 
            self.workflows.create_index(f, background=bkground)

        # user-configured extra indexes (from LaunchPad settings)
        for idx in self.user_indices:
            self.fireworks.create_index(idx, background=bkground)

        for idx in self.wf_user_indices:
            self.workflows.create_index(idx, background=bkground)

        # for frontend, which needs to sort on _id after querying on state
        self.fireworks.create_index([("state", DESCENDING), ("_id", DESCENDING)], background=bkground)
        self.fireworks.create_index(
            [("state", DESCENDING), ("spec._priority", DESCENDING), ("created_on", DESCENDING)], background=bkground
        )
        self.fireworks.create_index(
            [("state", DESCENDING), ("spec._priority", DESCENDING), ("created_on", ASCENDING)], background=bkground
        )
        self.workflows.create_index([("state", DESCENDING), ("_id", DESCENDING)], background=bkground)

        if not bkground:
            # compaction only attempted when running in the foreground;
            # failure is tolerated (e.g. insufficient privileges)
            self.m_logger.debug("Compacting database...")
            try:
                self.db.command({"compact": "fireworks"})
                self.db.command({"compact": "launches"})
            except Exception:
                self.m_logger.debug("Database compaction failed (not critical)")

    def pause_fw(self, fw_id):
        """
        Given the firework id, pauses the firework and refresh the workflow.

        Args:
            fw_id(int): firework id

        Returns:
            dict: the firework document as it was *before* the update, or
            None if no firework in a pausable state matched.
        """
        allowed_states = ["WAITING", "READY", "RESERVED"]
        # atomic state transition: only flips the state if it is still pausable
        f = self.fireworks.find_one_and_update(
            {"fw_id": fw_id, "state": {"$in": allowed_states}},
            {"$set": {"state": "PAUSED", "updated_on": datetime.datetime.utcnow()}},
        )
        if f:
            self._refresh_wf(fw_id)
        if not f:
            self.m_logger.error(f"No pausable (WAITING,READY,RESERVED) Firework exists with {fw_id=}")
        return f

    def defuse_fw(self, fw_id, rerun_duplicates=True):
        """
        Given the firework id, defuse the firework and refresh the workflow.

        Args:
            fw_id (int): firework id
            rerun_duplicates (bool): if True, duplicate fireworks(ones with the same launch) are
                marked for rerun and then defused.

        Returns:
            dict: the firework document as it was before the update, or None
            if the firework could not be defused even after a rerun.
        """
        allowed_states = ["DEFUSED", "WAITING", "READY", "FIZZLED", "PAUSED"]
        f = self.fireworks.find_one_and_update(
            {"fw_id": fw_id, "state": {"$in": allowed_states}},
            {"$set": {"state": "DEFUSED", "updated_on": datetime.datetime.utcnow()}},
        )
        if f:
            self._refresh_wf(fw_id)
        if not f:
            # FW was not in a directly defusable state (e.g. RUNNING/COMPLETED):
            # rerun it first to move it back to a defusable state, then retry
            self.rerun_fw(fw_id, rerun_duplicates)
            f = self.fireworks.find_one_and_update(
                {"fw_id": fw_id, "state": {"$in": allowed_states}},
                {"$set": {"state": "DEFUSED", "updated_on": datetime.datetime.utcnow()}},
            )
            if f:
                self._refresh_wf(fw_id)
        return f

    def reignite_fw(self, fw_id):
        """
        Given the firework id, re-ignite(set state=WAITING) the defused firework.

        Args:
            fw_id (int): firework id

        Returns:
            dict: the pre-update firework document, or None if it was not DEFUSED.
        """
        f = self.fireworks.find_one_and_update(
            {"fw_id": fw_id, "state": "DEFUSED"},
            {"$set": {"state": "WAITING", "updated_on": datetime.datetime.utcnow()}},
        )
        if f:
            self._refresh_wf(fw_id)
        return f

    def resume_fw(self, fw_id):
        """
        Given the firework id, resume (set state=WAITING) the paused firework.

        Args:
            fw_id (int): firework id

        Returns:
            dict: the pre-update firework document, or None if it was not PAUSED.
        """
        f = self.fireworks.find_one_and_update(
            {"fw_id": fw_id, "state": "PAUSED"},
            {"$set": {"state": "WAITING", "updated_on": datetime.datetime.utcnow()}},
        )
        if f:
            self._refresh_wf(fw_id)
        return f

    def defuse_wf(self, fw_id, defuse_all_states=True) -> None:
        """
        Defuse the workflow containing the given firework id.

        Args:
            fw_id (int): firework id
            defuse_all_states (bool): if False, COMPLETED/FIZZLED fireworks are left alone
        """
        wf = self.get_wf_by_fw_id_lzyfw(fw_id)
        for fw in wf:
            if fw.state not in ["COMPLETED", "FIZZLED"] or defuse_all_states:
                self.defuse_fw(fw.fw_id)

    def pause_wf(self, fw_id) -> None:
        """
        Pause the workflow containing the given firework id.

        Args:
            fw_id (int): firework id
        """
        wf = self.get_wf_by_fw_id_lzyfw(fw_id)
        for fw in wf:
            # terminal / defused fireworks are left untouched
            if fw.state not in ["COMPLETED", "FIZZLED", "DEFUSED"]:
                self.pause_fw(fw.fw_id)

    def reignite_wf(self, fw_id) -> None:
        """
        Reignite the workflow containing the given firework id.

        Args:
            fw_id (int): firework id
        """
        wf = self.get_wf_by_fw_id_lzyfw(fw_id)
        for fw in wf:
            self.reignite_fw(fw.fw_id)

    def archive_wf(self, fw_id) -> None:
        """
        Archive the workflow containing the given firework id.

        Args:
            fw_id (int): firework id
        """
        # first archive all the launches, so they are not used in duplicate checks
        wf = self.get_wf_by_fw_id_lzyfw(fw_id)
        if wf.state != "ARCHIVED":
            fw_ids = [f.fw_id for f in wf]
            for fw_id in fw_ids:
                self.rerun_fw(fw_id)

            # second set the state of all FWs to ARCHIVED
            wf = self.get_wf_by_fw_id_lzyfw(fw_id)
            for fw in wf:
                self.fireworks.find_one_and_update(
                    {"fw_id": fw.fw_id}, {"$set": {"state": "ARCHIVED", "updated_on": datetime.datetime.utcnow()}}
                )
                self._refresh_wf(fw.fw_id)

    def _restart_ids(self, next_fw_id, next_launch_id) -> None:
        """
        internal method used to reset firework id counters.

        Args:
            next_fw_id (int): id to give next Firework
            next_launch_id (int): id to give next Launch
        """
        self.fw_id_assigner.delete_many({})
        # single counter document keyed by _id=-1 holds both sequences
        self.fw_id_assigner.find_one_and_replace(
            {"_id": -1}, {"next_fw_id": next_fw_id, "next_launch_id": next_launch_id}, upsert=True
        )
        self.m_logger.debug(f"RESTARTED fw_id, launch_id to ({next_fw_id}, {next_launch_id})")

    def _check_fw_for_uniqueness(self, m_fw) -> bool:
        """
        Check if there are duplicates. If not unique, a new id is assigned and the workflow
        refreshed.

        Args:
            m_fw (Firework)

        Returns:
            bool: True if the firework is unique
        """
        if not self._steal_launches(m_fw):
            self.m_logger.debug(f"FW with id: {m_fw.fw_id} is unique!")
            return True
        self._upsert_fws([m_fw])  # update the DB with the new launches
        self._refresh_wf(m_fw.fw_id)  # since we updated a state, we need to refresh the WF again
        return False

    def _get_a_fw_to_run(self, query=None, fw_id=None, checkout=True):
        """
        Get the next ready firework to run.

        Args:
            query (dict)
            fw_id (int): If given the query is updated.
                Note: We want to return None if this specific FW doesn't exist anymore. This is
                because our queue params might have been tailored to this FW.
            checkout (bool): if True, check out the matching firework and set state=RESERVED

        Returns:
            Firework
        """
        m_query = dict(query) if query else {}  # make a defensive copy
        m_query["state"] = "READY"
        sortby = [("spec._priority", DESCENDING)]

        # secondary sort order is configurable: FIFO runs oldest first, FILO newest first
        if SORT_FWS.upper() == "FIFO":
            sortby.append(("created_on", ASCENDING))
        elif SORT_FWS.upper() == "FILO":
            sortby.append(("created_on", DESCENDING))

        # Override query if fw_id defined
        if fw_id:
            m_query = {"fw_id": fw_id, "state": {"$in": ["READY", "RESERVED"]}}

        # loop until a FW passes the uniqueness (duplicate) check or none match
        while True:
            # check out the matching firework, depending on the query set by the FWorker
            if checkout:
                m_fw = self.fireworks.find_one_and_update(
                    m_query, {"$set": {"state": "RESERVED", "updated_on": datetime.datetime.utcnow()}}, sort=sortby
                )
            else:
                m_fw = self.fireworks.find_one(m_query, {"fw_id": 1, "spec": 1}, sort=sortby)

            if not m_fw:
                return None
            m_fw = self.get_fw_by_id(m_fw["fw_id"])
            if self._check_fw_for_uniqueness(m_fw):
                return m_fw

    def _get_active_launch_ids(self):
        """
        Get all the launch ids.

        Returns:
            list: all launch ids referenced by any firework document
        """
        all_launch_ids = []
        for launch in self.fireworks.find({}, {"launches": 1}):
            all_launch_ids.extend(launch["launches"])
        return all_launch_ids

    def reserve_fw(self, fworker, launch_dir, host=None, ip=None, fw_id=None):
        """
        Checkout the next ready firework and mark the launch reserved.

        Args:
            fworker (FWorker)
            launch_dir (str): path to the launch directory.
            host (str): hostname
            ip (str): ip address
            fw_id (int): fw_id to be reserved, if desired

        Returns:
            (Firework, int): the checked out firework and the new launch id.
        """
        return self.checkout_fw(fworker, launch_dir, host=host, ip=ip, fw_id=fw_id, state="RESERVED")

    def get_fw_ids_from_reservation_id(self, reservation_id):
        """Given a queue reservation id, return the associated firework id(s).

        NOTE(review): this delegates to the external ``sustodian.FindMyFW``
        helper instead of querying the launches collection directly (as the
        sibling reservation helpers below do). The return shape (single id vs
        list, despite the plural method name) is whatever
        ``FindMyFW.get_fwid`` produces -- confirm against that package's
        documentation.
        """
        fw_id = FindMyFW.get_fwid(reservation_id)
        return fw_id

    def cancel_reservation_by_reservation_id(self, reservation_id) -> None:
        """Given the reservation id, cancel the reservation and rerun the corresponding fireworks."""
        l_id = self.launches.find_one(
            {"state_history.reservation_id": reservation_id, "state": "RESERVED"}, {"launch_id": 1}
        )
        if l_id:
            self.cancel_reservation(l_id["launch_id"])
        else:
            self.m_logger.info(f"Can't find any reserved jobs with reservation id: {reservation_id}")

    def get_reservation_id_from_fw_id(self, fw_id):
        """Given the firework id, return the reservation id."""
        fw = self.fireworks.find_one({"fw_id": fw_id}, {"launches": 1})
        if fw:
            # scan the state history of every launch; first reservation_id wins
            for launch in self.launches.find({"launch_id": {"$in": fw["launches"]}}, {"state_history": 1}):
                for d in launch["state_history"]:
                    if "reservation_id" in d:
                        return d["reservation_id"]
            return None
        return None

    def cancel_reservation(self, launch_id) -> None:
        """Given the launch id, cancel the reservation and rerun the fireworks."""
        m_launch = self.get_launch_by_id(launch_id)
        m_launch.state = "READY"
        self.launches.find_one_and_replace(
            {"launch_id": m_launch.launch_id, "state": "RESERVED"}, m_launch.to_db_dict(),
            upsert=True
        )

        # any FW still RESERVED on this launch goes back to READY via rerun
        for fw in self.fireworks.find({"launches": launch_id, "state": "RESERVED"}, {"fw_id": 1}):
            self.rerun_fw(fw["fw_id"], rerun_duplicates=False)

    def detect_unreserved(self, expiration_secs=RESERVATION_EXPIRATION_SECS, rerun=False):
        """
        Return the reserved launch ids that have not been updated for a while.

        Args:
            expiration_secs (seconds): time limit
            rerun (bool): if True, the expired reservations are cancelled and the fireworks rerun.

        Returns:
            [int]: list of expired launch ids
        """
        now_time = datetime.datetime.utcnow()
        # dates in state_history are stored as ISO strings, so compare strings
        cutoff_time_str = (now_time - datetime.timedelta(seconds=expiration_secs)).isoformat()
        bad_launch_data = self.launches.find(
            {
                "state": "RESERVED",
                "state_history": {"$elemMatch": {"state": "RESERVED", "updated_on": {"$lte": cutoff_time_str}}},
            },
            {"launch_id": 1, "fw_id": 1},
        )
        # only keep launches whose firework is itself still RESERVED
        bad_launch_ids = [
            ld["launch_id"]
            for ld in bad_launch_data
            if self.fireworks.find_one({"fw_id": ld["fw_id"], "state": "RESERVED"}, {"fw_id": 1})
        ]
        if rerun:
            for lid in bad_launch_ids:
                self.cancel_reservation(lid)
        return bad_launch_ids

    def mark_fizzled(self, launch_id) -> None:
        """
        Mark the launch corresponding to the given id as FIZZLED.

        Args:
            launch_id (int): launch id.

        Returns:
            dict: updated launch
        """
        # Do a confirmed write and make sure state_history is preserved
        self.complete_launch(launch_id, state="FIZZLED")

    def detect_lostruns(
        self,
        expiration_secs=RUN_EXPIRATION_SECS,
        fizzle=False,
        rerun=False,
        max_runtime=None,
        min_runtime=None,
        refresh=False,
        query=None,
        launch_query=None,
    ):
        """
        Detect lost runs i.e running fireworks that haven't been updated within the specified
        time limit or running firework whose launch has been marked fizzed or completed.

        Args:
            expiration_secs (seconds): expiration time in seconds
            fizzle (bool): if True, mark the lost runs fizzed
            rerun (bool): if True, mark the lost runs fizzed and rerun
            max_runtime (seconds): maximum run time
            min_runtime (seconds): minimum run time
            refresh (bool): if True, refresh the workflow with inconsistent fireworks.
            query (dict): restrict search to FWs matching this query
            launch_query (dict): restrict search to launches matching this query (e.g. host restriction)

        Returns:
            ([int], [int], [int]): tuple of list of lost launch ids, lost firework ids and
                inconsistent firework ids.
        """
        lost_launch_ids = []
        lost_fw_ids = []
        potential_lost_fw_ids = []
        now_time = datetime.datetime.utcnow()
        # string comparison against ISO-formatted dates in state_history
        cutoff_timestr = (now_time - datetime.timedelta(seconds=expiration_secs)).isoformat()

        lostruns_query = launch_query or {}
        lostruns_query["state"] = "RUNNING"
        lostruns_query["state_history"] = {"$elemMatch": {"state": "RUNNING", "updated_on": {"$lte": cutoff_timestr}}}

        if query:
            fw_ids = [x["fw_id"] for x in self.fireworks.find(query, {"fw_id": 1})]
            lostruns_query["fw_id"] = {"$in": fw_ids}

        bad_launch_data = self.launches.find(lostruns_query, {"launch_id": 1, "fw_id": 1})
        for ld in bad_launch_data:
            bad_launch = True
            if max_runtime or min_runtime:
                # optional runtime window filter: only flag launches whose
                # RUNNING duration falls inside [min_runtime, max_runtime]
                bad_launch = False
                m_l = self.get_launch_by_id(ld["launch_id"])
                utime = m_l._get_time("RUNNING", use_update_time=True)
                ctime = m_l._get_time("RUNNING", use_update_time=False)
                if (not max_runtime or (utime - ctime).seconds <= max_runtime) and (
                    not min_runtime or (utime - ctime).seconds >= min_runtime
                ):
                    bad_launch = True
            if bad_launch:
                lost_launch_ids.append(ld["launch_id"])
                potential_lost_fw_ids.append(ld["fw_id"])

        for fw_id in potential_lost_fw_ids:  # tricky: figure out what's actually lost
            fw = self.fireworks.find_one({"fw_id": fw_id}, {"launches": 1, "state": 1}) or {}
            # only RUNNING FireWorks can be "lost", i.e. not defused or archived
            if fw.get("state") == "RUNNING":
                l_ids = fw["launches"]
                not_lost = [x for x in l_ids if x not in lost_launch_ids]
                if len(not_lost) == 0:  # all launches are lost - we are lost!
                    lost_fw_ids.append(fw_id)
                else:
                    for l_id in not_lost:
                        l_state = self.launches.find_one({"launch_id": l_id}, {"state": 1})["state"]
                        if Firework.STATE_RANKS[l_state] > Firework.STATE_RANKS["FIZZLED"]:
                            break
                    else:
                        lost_fw_ids.append(fw_id)  # all Launches not lost are anyway FIZZLED / ARCHIVED

        if fizzle or rerun:
            for lid in lost_launch_ids:
                self.mark_fizzled(lid)

                # for offline runs, you want to forget about the run
                # see: https://groups.google.com/forum/#!topic/fireworkflows/oimFmE5tZ4E
                offline_run = self.offline_runs.count_documents({"launch_id": lid, "deprecated": False}) > 0
                if offline_run:
                    self.forget_offline(lid, launch_mode=True)

                if rerun:
                    fw_id = self.launches.find_one({"launch_id": lid}, {"fw_id": 1})["fw_id"]
                    if fw_id in lost_fw_ids:
                        self.rerun_fw(fw_id)

        # a FW is "inconsistent" when it is RUNNING but one of its launches
        # has already reached FIZZLED/COMPLETED
        inconsistent_fw_ids = []
        inconsistent_query = query or {}
        inconsistent_query["state"] = "RUNNING"
        running_fws = self.fireworks.find(inconsistent_query, {"fw_id": 1, "launches": 1})
        for fw in running_fws:
            if self.launches.find_one(
                {"launch_id": {"$in": fw["launches"]}, "state": {"$in": ["FIZZLED", "COMPLETED"]}}
            ):
                inconsistent_fw_ids.append(fw["fw_id"])
                if refresh:
                    self._refresh_wf(fw["fw_id"])

        return lost_launch_ids, lost_fw_ids, inconsistent_fw_ids

    def set_reservation_id(self, launch_id, reservation_id) -> None:
        """
        Set reservation id to the launch corresponding to the given launch id.
- - Args: - launch_id (int) - reservation_id (int) - """ - m_launch = self.get_launch_by_id(launch_id) - m_launch.set_reservation_id(reservation_id) - self.launches.find_one_and_replace({"launch_id": launch_id}, m_launch.to_db_dict()) - - def checkout_fw(self, fworker, launch_dir, fw_id=None, host=None, ip=None, state="RUNNING"): - """ - Checkout the next ready firework, mark it with the given state(RESERVED or RUNNING) and - return it to the caller. The caller is responsible for running the Firework. - - Args: - fworker (FWorker): A FWorker instance - launch_dir (str): the dir the FW will be run in (for creating a Launch object) - fw_id (int): Firework id - host (str): the host making the request (for creating a Launch object) - ip (str): the ip making the request (for creating a Launch object) - state (str): RESERVED or RUNNING, the fetched firework's state will be set to this value. - - Returns: - (Firework, int): firework and the new launch id. - """ - m_fw = self._get_a_fw_to_run(fworker.query, fw_id=fw_id) - if not m_fw: - return None, None - - # If this Launch was previously reserved, overwrite that reservation with this Launch - # note that adding a new Launch is problematic from a duplicate run standpoint - prev_reservations = [launch for launch in m_fw.launches if launch.state == "RESERVED"] - reserved_launch = None if not prev_reservations else prev_reservations[0] - state_history = reserved_launch.state_history if reserved_launch else None - - # get new launch - launch_id = reserved_launch.launch_id if reserved_launch else self.get_new_launch_id() - trackers = [Tracker.from_dict(f) for f in m_fw.spec["_trackers"]] if "_trackers" in m_fw.spec else None - m_launch = Launch( - state, - launch_dir, - fworker, - host, - ip, - trackers=trackers, - state_history=state_history, - launch_id=launch_id, - fw_id=m_fw.fw_id, - ) - - # insert the launch - self.launches.find_one_and_replace({"launch_id": m_launch.launch_id}, m_launch.to_db_dict(), upsert=True) - - 
self.m_logger.debug(f"Created/updated Launch with {launch_id=}") - - # update the firework's launches - if not reserved_launch: - # we're appending a new Firework - m_fw.launches.append(m_launch) - else: - # we're updating an existing launch - m_fw.launches = [m_launch if launch.launch_id == m_launch.launch_id else launch for launch in m_fw.launches] - - # insert the firework and refresh the workflow - m_fw.state = state - self._upsert_fws([m_fw]) - self._refresh_wf(m_fw.fw_id) - - # update any duplicated runs - if state == "RUNNING": - for fw in self.fireworks.find( - {"launches": launch_id, "state": {"$in": ["WAITING", "READY", "RESERVED", "FIZZLED"]}}, {"fw_id": 1} - ): - fw_id = fw["fw_id"] - fw = self.get_fw_by_id(fw_id) - fw.state = state - self._upsert_fws([fw]) - self._refresh_wf(fw.fw_id) - - # Store backup copies of the initial data for retrieval in case of failure - self.backup_launch_data[m_launch.launch_id] = m_launch.to_db_dict() - self.backup_fw_data[fw_id] = m_fw.to_db_dict() - - self.m_logger.debug(f"{m_fw.state} FW with id: {m_fw.fw_id}") - - return m_fw, launch_id - - def change_launch_dir(self, launch_id, launch_dir) -> None: - """ - Change the launch directory corresponding to the given launch id. - - Args: - launch_id (int) - launch_dir (str): path to the new launch directory. 
- """ - m_launch = self.get_launch_by_id(launch_id) - m_launch.launch_dir = launch_dir - self.launches.find_one_and_replace({"launch_id": m_launch.launch_id}, m_launch.to_db_dict(), upsert=True) - - def restore_backup_data(self, launch_id, fw_id) -> None: - """For the given launch id and firework id, restore the back up data.""" - if launch_id in self.backup_launch_data: - self.launches.find_one_and_replace({"launch_id": launch_id}, self.backup_launch_data[launch_id]) - if fw_id in self.backup_fw_data: - self.fireworks.find_one_and_replace({"fw_id": fw_id}, self.backup_fw_data[fw_id]) - - def complete_launch(self, launch_id, action=None, state="COMPLETED"): - """ - Internal method used to mark a Firework's Launch as completed. - - Args: - launch_id (int) - action (FWAction): the FWAction of what to do next - state (str): COMPLETED or FIZZLED - - Returns: - dict: updated launch - """ - # update the launch data to COMPLETED, set end time, etc - m_launch = self.get_launch_by_id(launch_id) - m_launch.state = state - if action: - m_launch.action = action - - try: - self.launches.find_one_and_replace({"launch_id": m_launch.launch_id}, m_launch.to_db_dict(), upsert=True) - except DocumentTooLarge as err: - launch_db_dict = m_launch.to_db_dict() - action_dict = launch_db_dict.get("action", None) - if not action_dict: - # in case the action is empty and it is not the source of - # the error, raise the exception again. - raise - if self.gridfs_fallback is None: - err.args = ( - err.args[0] + ". Set GRIDFS_FALLBACK_COLLECTION in FW_config.yaml" - " to a value different from None", - ) - raise err - - # encoding required for python2/3 compatibility. - action_id = self.gridfs_fallback.put( - json.dumps(action_dict), encoding="utf-8", metadata={"launch_id": launch_id} - ) - launch_db_dict["action"] = {"gridfs_id": str(action_id)} - self.m_logger.warning("The size of the launch document was too large. 
Saving the action in gridfs.") - - self.launches.find_one_and_replace({"launch_id": m_launch.launch_id}, launch_db_dict, upsert=True) - - # find all the fws that have this launch - for fw in self.fireworks.find({"launches": launch_id}, {"fw_id": 1}): - fw_id = fw["fw_id"] - self._refresh_wf(fw_id) - - # change return type to dict to make return type serializable to support job packing - return m_launch.to_dict() - - def ping_launch(self, launch_id, ptime=None, checkpoint=None) -> None: - """ - Ping that a Launch is still alive: updates the 'update_on 'field of the state history of a - Launch. - - Args: - launch_id (int) - ptime (datetime) - """ - m_launch = self.get_launch_by_id(launch_id) - for tracker in m_launch.trackers: - tracker.track_file(m_launch.launch_dir) - m_launch.touch_history(ptime, checkpoint=checkpoint) - self.launches.update_one( - {"launch_id": launch_id, "state": "RUNNING"}, - { - "$set": { - "state_history": m_launch.to_db_dict()["state_history"], - "trackers": [t.to_dict() for t in m_launch.trackers], - } - }, - ) - - def get_new_fw_id(self, quantity=1): - """ - Checkout the next Firework id. - - Args: - quantity (int): optionally ask for many ids, otherwise defaults to 1 - this then returns the *first* fw_id in that range - """ - try: - return self.fw_id_assigner.find_one_and_update({}, {"$inc": {"next_fw_id": quantity}})["next_fw_id"] - except Exception: - raise ValueError( - "Could not get next FW id! If you have not yet initialized the database," - " please do so by performing a database reset (e.g., lpad reset)" - ) - - def get_new_launch_id(self): - """Checkout the next Launch id.""" - try: - return self.fw_id_assigner.find_one_and_update({}, {"$inc": {"next_launch_id": 1}})["next_launch_id"] - except Exception: - raise ValueError( - "Could not get next launch id! 
If you have not yet initialized the " - "database, please do so by performing a database reset (e.g., lpad reset)" - ) - - def _upsert_fws(self, fws, reassign_all=False): - """ - Insert the fireworks to the 'fireworks' collection. - - Args: - fws ([Firework]): list of fireworks - reassign_all (bool): if True, reassign the firework ids. The ids are also reassigned - if the current firework ids are negative. - - Returns: - dict: mapping between old and new Firework ids - """ - old_new = {} - # sort the FWs by id, then the new FW_ids will match the order of the old ones... - fws.sort(key=lambda x: x.fw_id) - - if reassign_all: - used_ids = [] - # we can request multiple fw_ids up front - # this is the FIRST fw_id we should use - first_new_id = self.get_new_fw_id(quantity=len(fws)) - - for new_id, fw in enumerate(fws, start=first_new_id): - old_new[fw.fw_id] = new_id - fw.fw_id = new_id - used_ids.append(new_id) - # delete/add in bulk - self.fireworks.delete_many({"fw_id": {"$in": used_ids}}) - self.fireworks.insert_many(fw.to_db_dict() for fw in fws) - else: - for fw in fws: - if fw.fw_id < 0: - new_id = self.get_new_fw_id() - old_new[fw.fw_id] = new_id - fw.fw_id = new_id - - self.fireworks.find_one_and_replace({"fw_id": fw.fw_id}, fw.to_db_dict(), upsert=True) - - return old_new - - def rerun_fw(self, fw_id, rerun_duplicates=True, recover_launch=None, recover_mode=None): - """ - Rerun the firework corresponding to the given id. - - Args: - fw_id (int): firework id - rerun_duplicates (bool): flag for whether duplicates should be rerun - recover_launch ('last' or int): launch_id for last recovery, if set to - 'last' (default), recovery will find the last available launch. 
- If it is an int, will recover that specific launch - recover_mode ('prev_dir' or 'cp'): flag to indicate whether to copy - or run recovery fw in previous directory - - Returns: - [int]: list of firework ids that were rerun - """ - m_fw = self.fireworks.find_one({"fw_id": fw_id}, {"state": 1}) - - if not m_fw: - raise ValueError(f"FW with id: {fw_id} not found!") - - # detect FWs that share the same launch. Must do this before rerun - duplicates = [] - reruns = [] - if rerun_duplicates: - fw = self.fireworks.find_one({"fw_id": fw_id, "spec._dupefinder": {"$exists": True}}, {"launches": 1}) - if fw: - duplicates = [ - fw_dct["fw_id"] - for fw_dct in self.fireworks.find( - {"launches": {"$in": fw["launches"]}, "fw_id": {"$ne": fw_id}}, {"fw_id": 1} - ) - ] - duplicates = list(set(duplicates)) - - # Launch recovery - if recover_launch is not None: - recovery = self.get_recovery(fw_id, recover_launch) - recovery.update(_mode=recover_mode) - set_spec = recursive_dict({"$set": {"spec._recovery": recovery}}) - if recover_mode == "prev_dir": - prev_dir = self.get_launch_by_id(recovery.get("_launch_id")).launch_dir - set_spec["$set"]["spec._launch_dir"] = prev_dir - self.fireworks.find_one_and_update({"fw_id": fw_id}, set_spec) - - # If no launch recovery specified, unset the firework recovery spec - else: - set_spec = {"$unset": {"spec._recovery": ""}} - self.fireworks.find_one_and_update({"fw_id": fw_id}, set_spec) - - # rerun this FW - if m_fw["state"] in ["ARCHIVED", "DEFUSED"]: - self.m_logger.info(f"Cannot rerun {fw_id=}: it is {m_fw['state']}.") - elif m_fw["state"] == "WAITING" and not recover_launch: - self.m_logger.debug(f"Skipping rerun {fw_id=}: it is already WAITING.") - else: - with WFLock(self, fw_id): - wf = self.get_wf_by_fw_id_lzyfw(fw_id) - updated_ids = wf.rerun_fw(fw_id) - self._update_wf(wf, updated_ids) - reruns.append(fw_id) - - # rerun duplicated FWs - for fw in duplicates: - self.m_logger.info(f"Also rerunning duplicate fw_id: {fw}") - # False for 
speed, True shouldn't be needed - r = self.rerun_fw(fw, rerun_duplicates=False, recover_launch=recover_launch, recover_mode=recover_mode) - reruns.extend(r) - - return reruns - - def get_recovery(self, fw_id, launch_id="last"): - """ - function to get recovery data for a given fw and launch - Args: - fw_id (int): fw id to get recovery data for - launch_id (int or 'last'): launch_id to get recovery data for, if 'last' - recovery data is generated from last launch. - """ - m_fw = self.get_fw_by_id(fw_id) - launch = m_fw.launches[-1] if launch_id == "last" else self.get_launch_by_id(launch_id) - recovery = launch.state_history[-1].get("checkpoint") - recovery.update(_prev_dir=launch.launch_dir, _launch_id=launch.launch_id) - return recovery - - def _refresh_wf(self, fw_id) -> None: - """ - Update the FW state of all jobs in workflow. - - Args: - fw_id (int): the parent fw_id - children will be refreshed - """ - # TODO: time how long it took to refresh the WF! - # TODO: need a try-except here, high probability of failure if incorrect action supplied - try: - with WFLock(self, fw_id): - wf = self.get_wf_by_fw_id_lzyfw(fw_id) - updated_ids = wf.refresh(fw_id) - self._update_wf(wf, updated_ids) - except LockedWorkflowError: - self.m_logger.info(f"{fw_id=} locked. Can't refresh!") - except Exception: - # some kind of internal error - an example is that fws serialization changed due to - # code updates and thus the Firework object can no longer be loaded from db description - # Action: *manually* mark the fw and workflow as FIZZLED - self.fireworks.find_one_and_update({"fw_id": fw_id}, {"$set": {"state": "FIZZLED"}}) - self.workflows.find_one_and_update({"nodes": fw_id}, {"$set": {"state": "FIZZLED"}}) - self.workflows.find_one_and_update({"nodes": fw_id}, {"$set": {f"fw_states.{fw_id}": "FIZZLED"}}) - import traceback - - err_message = f"Error refreshing workflow. 
The full stack trace is: {traceback.format_exc()}" - raise RuntimeError(err_message) - - def _update_wf(self, wf, updated_ids) -> None: - """ - Update the workflow with the updated firework ids. - Note: must be called within an enclosing WFLock. - - Args: - wf (Workflow) - updated_ids ([int]): list of firework ids - """ - updated_fws = [wf.id_fw[fid] for fid in updated_ids] - old_new = self._upsert_fws(updated_fws) - wf._reassign_ids(old_new) - - # find a node for which the id did not change, so we can query on it to get WF - query_node = None - for f in wf.id_fw: - if f not in old_new.values() or old_new.get(f, None) == f: - query_node = f - break - - assert query_node is not None - if not self.workflows.find_one({"nodes": query_node}): - raise ValueError(f"BAD QUERY_NODE! {query_node}") - # redo the links and fw_states - wf = wf.to_db_dict() - wf["locked"] = True # preserve the lock! - self.workflows.find_one_and_replace({"nodes": query_node}, wf) - - def _steal_launches(self, thief_fw): - """ - Check if there are duplicates. If there are duplicates, the matching firework's launches - are added to the launches of the given firework. 
- - Returns: - bool: False if the given firework is unique - """ - stolen = False - if thief_fw.state in ["READY", "RESERVED"] and "_dupefinder" in thief_fw.spec: - m_dupefinder = thief_fw.spec["_dupefinder"] - # get the query that will limit the number of results to check as duplicates - m_query = m_dupefinder.query(thief_fw.to_dict()["spec"]) - self.m_logger.debug(f"Querying for duplicates, fw_id: {thief_fw.fw_id}") - # iterate through all potential duplicates in the DB - for potential_match in self.fireworks.find(m_query): - self.m_logger.debug(f"Verifying for duplicates, fw_ids: {thief_fw.fw_id}, {potential_match['fw_id']}") - - # see if verification is needed, as this slows the process - verified = False - try: - m_dupefinder.verify({}, {}) # is implemented test - - except NotImplementedError: - verified = True # no dupefinder.verify() implemented, skip verification - - except Exception: - # we want to catch any exceptions from testing an empty dict, which the dupefinder might not be - # designed for - pass - - if not verified: - # dupefinder.verify() is implemented, let's call verify() - spec1 = dict(thief_fw.to_dict()["spec"]) # defensive copy - spec2 = dict(potential_match["spec"]) # defensive copy - verified = m_dupefinder.verify(spec1, spec2) - - if verified: - # steal the launches - victim_fw = self.get_fw_by_id(potential_match["fw_id"]) - thief_launches = [launch.launch_id for launch in thief_fw.launches] - valuable_launches = [ - launch for launch in victim_fw.launches if launch.launch_id not in thief_launches - ] - for launch in valuable_launches: - thief_fw.launches.append(launch) - stolen = True - self.m_logger.info(f"Duplicate found! fwids {thief_fw.fw_id} and {potential_match['fw_id']}") - return stolen - - def set_priority(self, fw_id, priority) -> None: - """ - Set priority to the firework with the given id. 
- - Args: - fw_id (int): firework id - priority - """ - self.fireworks.find_one_and_update({"fw_id": fw_id}, {"$set": {"spec._priority": priority}}) - - def get_logdir(self): - """ - Return the log directory. - - AJ: This is needed for job packing due to Proxy objects not being fully featured... - """ - return self.logdir - - def add_offline_run(self, launch_id, fw_id, name) -> None: - """ - Add the launch and firework to the offline_run collection. - - Args: - launch_id (int): launch id. - fw_id (id): firework id - name (str) - """ - d = {"fw_id": fw_id} - d["launch_id"] = launch_id - d["name"] = name - d["created_on"] = datetime.datetime.utcnow().isoformat() - d["updated_on"] = datetime.datetime.utcnow().isoformat() - d["deprecated"] = False - d["completed"] = False - self.offline_runs.insert_one(d) - - def recover_offline(self, launch_id, ignore_errors=False, print_errors=False): - """ - Update the launch state using the offline data in FW_offline.json file. - - Args: - launch_id (int): launch id. 
- ignore_errors (bool) - print_errors (bool) - - Returns: - firework id if the recovering fails otherwise None - """ - # get the launch directory - m_launch = self.get_launch_by_id(launch_id) - try: - self.m_logger.debug(f"RECOVERING fw_id: {m_launch.fw_id}") - - offline_loc = zpath(os.path.join(m_launch.launch_dir, "FW_offline.json")) - - offline_data = loadfn(offline_loc) - - if "started_on" in offline_data: # started running at some point - already_running = False - for s in m_launch.state_history: - if s["state"] == "RUNNING": - s["created_on"] = reconstitute_dates(offline_data["started_on"]) - already_running = True - - if not already_running: - m_launch.state = "RUNNING" # this should also add a history item - - checkpoint = offline_data.get("checkpoint", None) - - # look for ping file - update the Firework if this is the case - ping_loc = os.path.join(m_launch.launch_dir, "FW_ping.json") - if os.path.exists(ping_loc): - ping_dict = loadfn(ping_loc) - self.ping_launch(launch_id, ptime=ping_dict["ping_time"], checkpoint=checkpoint) - else: - warnings.warn( - f"Unable to find FW_ping.json in {m_launch.launch_dir}! State history updated_on might be " - "incorrect, trackers may not update." 
- ) - m_launch.touch_history(checkpoint=checkpoint) - - if "fwaction" in offline_data: - fwaction = FWAction.from_dict(offline_data["fwaction"]) - m_launch.state = offline_data["state"] - self.launches.find_one_and_replace( - {"launch_id": m_launch.launch_id}, m_launch.to_db_dict(), upsert=True - ) - - m_launch = Launch.from_dict(self.complete_launch(launch_id, fwaction, m_launch.state)) - - for s in m_launch.state_history: - if s["state"] == offline_data["state"]: - s["created_on"] = reconstitute_dates(offline_data["completed_on"]) - self.launches.find_one_and_update( - {"launch_id": m_launch.launch_id}, {"$set": {"state_history": m_launch.state_history}} - ) - - self.offline_runs.update_one({"launch_id": launch_id}, {"$set": {"completed": True}}) - - else: - launch = self.launches.find_one_and_replace( - {"launch_id": m_launch.launch_id}, m_launch.to_db_dict(), upsert=True - ) - fw_id = launch["fw_id"] - f = self.fireworks.find_one_and_update( - {"fw_id": fw_id}, {"$set": {"state": "RUNNING", "updated_on": datetime.datetime.utcnow()}} - ) - if f: - self._refresh_wf(fw_id) - - # update the updated_on - self.offline_runs.update_one( - {"launch_id": launch_id}, {"$set": {"updated_on": datetime.datetime.utcnow().isoformat()}} - ) - return None - - except Exception: - if print_errors: - self.m_logger.error(f"failed recovering {launch_id=}.\n{traceback.format_exc()}") - if not ignore_errors: - traceback.print_exc() - m_action = FWAction( - stored_data={ - "_message": "runtime error during task", - "_task": None, - "_exception": {"_stacktrace": traceback.format_exc(), "_details": None}, - }, - exit=True, - ) - self.complete_launch(launch_id, m_action, "FIZZLED") - self.offline_runs.update_one({"launch_id": launch_id}, {"$set": {"completed": True}}) - return m_launch.fw_id - - def forget_offline(self, launchid_or_fwid, launch_mode=True) -> None: - """ - Unmark the offline run for the given launch or firework id. 
- - Args: - launchid_or_fwid (int): launch od or firework id - launch_mode (bool): if True then launch id is given. - """ - q = {"launch_id": launchid_or_fwid} if launch_mode else {"fw_id": launchid_or_fwid} - self.offline_runs.update_many(q, {"$set": {"deprecated": True}}) - - def get_tracker_data(self, fw_id): - """ - Args: - fw_id (id): firework id. - - Returns: - [dict]: list tracker dicts - """ - data = [] - for launch in self.launches.find({"fw_id": fw_id}, {"trackers": 1, "launch_id": 1}): - if "trackers" in launch: # backwards compatibility - trackers = [Tracker.from_dict(t) for t in launch["trackers"]] - data.append({"launch_id": launch["launch_id"], "trackers": trackers}) - return data - - def get_launchdir(self, fw_id, launch_idx=-1): - """ - Returns the directory of the *most recent* launch of a fw_id - Args: - fw_id: (int) fw_id to get launch id for - launch_idx: (int) index of the launch to get. Default is -1, which is most recent. - """ - fw = self.get_fw_by_id(fw_id) - return fw.launches[launch_idx].launch_dir if len(fw.launches) > 0 else None - - def log_message(self, level, message) -> None: - """ - Support for job packing. - - Args: - level (str) - message (str) - """ - self.m_logger.log(level, message) - - -class LazyFirework: - """ - A LazyFirework only has the fw_id, and retrieves other data just-in-time. - This representation can speed up Workflow loading as only "important" FWs need to be - fully loaded. - """ - - # Get these fields from DB when creating new FireWork object - db_fields = ("name", "fw_id", "spec", "created_on", "state") - db_launch_fields = ("launches", "archived_launches") - - def __init__(self, fw_id, fw_coll, launch_coll, fallback_fs) -> None: - """ - Args: - fw_id (int): firework id - fw_coll (pymongo.collection): fireworks collection - launch_coll (pymongo.collection): launches collection. 
- """ - # This is the only attribute known w/o a DB query - self.fw_id = fw_id - self._fwc, self._lc, self._ffs = fw_coll, launch_coll, fallback_fs - self._launches = dict.fromkeys(self.db_launch_fields, False) - self._fw, self._lids, self._state = None, None, None - - # FireWork methods - - # Treat state as special case as it is always required when accessing a Firework lazily - # If the partial fw is not available the state is fetched independently - @property - def state(self): - if self._fw is not None: - self._state = self._fw.state - elif self._state is None: - self._state = self._fwc.find_one({"fw_id": self.fw_id}, projection=["state"])["state"] - return self._state - - @state.setter - def state(self, state) -> None: - self.partial_fw._state = state - self.partial_fw.updated_on = datetime.datetime.utcnow() - - def to_dict(self): - return self.full_fw.to_dict() - - def _rerun(self) -> None: - self.full_fw._rerun() - - def to_db_dict(self): - return self.full_fw.to_db_dict() - - def __str__(self) -> str: - return f"LazyFireWork object: (id: {self.fw_id})" - - # Properties that shadow FireWork attributes - - @property - def tasks(self): - return self.partial_fw.tasks - - @tasks.setter - def tasks(self, value) -> None: - self.partial_fw.tasks = value - - @property - def spec(self): - return self.partial_fw.spec - - @spec.setter - def spec(self, value) -> None: - self.partial_fw.spec = value - - @property - def name(self): - return self.partial_fw.name - - @name.setter - def name(self, value) -> None: - self.partial_fw.name = value - - @property - def created_on(self): - return self.partial_fw.created_on - - @created_on.setter - def created_on(self, value) -> None: - self.partial_fw.created_on = value - - @property - def updated_on(self): - return self.partial_fw.updated_on - - @updated_on.setter - def updated_on(self, value) -> None: - self.partial_fw.updated_on = value - - @property - def parents(self): - if self._fw is not None: - return 
self.partial_fw.parents - return [] - - @parents.setter - def parents(self, value) -> None: - self.partial_fw.parents = value - - # Properties that shadow FireWork attributes, but which are - # fetched individually from the DB (i.e. launch objects) - - @property - def launches(self): - return self._get_launch_data("launches") - - @launches.setter - def launches(self, value) -> None: - self._launches["launches"] = True - self.partial_fw.launches = value - - @property - def archived_launches(self): - return self._get_launch_data("archived_launches") - - @archived_launches.setter - def archived_launches(self, value) -> None: - self._launches["archived_launches"] = True - self.partial_fw.archived_launches = value - - # Lazy properties that idempotently instantiate a FireWork object - @property - def partial_fw(self): - if not self._fw: - fields = list(self.db_fields) + list(self.db_launch_fields) - data = self._fwc.find_one({"fw_id": self.fw_id}, projection=fields) - launch_data = {} # move some data to separate launch dict - for key in self.db_launch_fields: - launch_data[key] = data[key] - del data[key] - self._lids = launch_data - self._fw = Firework.from_dict(data) - return self._fw - - @property - def full_fw(self): - # map(self._get_launch_data, self.db_launch_fields) - for launch_field in self.db_launch_fields: - self._get_launch_data(launch_field) - return self._fw - - # Get a type of Launch object - - def _get_launch_data(self, name): - """ - Pull launch data individually for each field. - - Args: - name (str): Name of field, e.g. 'archived_launches'. 
- - Returns: - Launch obj (also propagated to self._fw) - """ - fw = self.partial_fw # assure stage 1 - if not self._launches[name]: - launch_ids = self._lids[name] - result = [] - if launch_ids: - data = self._lc.find({"launch_id": {"$in": launch_ids}}) - for ld in data: - ld["action"] = get_action_from_gridfs(ld.get("action"), self._ffs) - result.append(Launch.from_dict(ld)) - - setattr(fw, name, result) # put into real FireWork obj - self._launches[name] = True - return getattr(fw, name) - - -def get_action_from_gridfs(action_dict, fallback_fs): - """ - Helper function to obtain the correct dictionary of the FWAction associated - with a launch. If necessary retrieves the information from gridfs based - on its identifier, otherwise simply returns the dictionary in input. - Should be used when accessing a launch to ensure the presence of the - correct action dictionary. - - Args: - action_dict (dict): the dictionary contained in the "action" key of a launch - document. - fallback_fs (GridFS): the GridFS with the actions exceeding the 16MB limit. - - Returns: - dict: the dictionary of the action. 
- """ - if not action_dict or "gridfs_id" not in action_dict: - return action_dict - - action_gridfs_id = ObjectId(action_dict["gridfs_id"]) - - action_data = fallback_fs.get(ObjectId(action_gridfs_id)) - return json.loads(action_data.read()) From 1d9f6b93bda3b415721195a6a44d66ed8fb6a5c7 Mon Sep 17 00:00:00 2001 From: Wayne Date: Fri, 26 Jul 2024 12:20:42 -0700 Subject: [PATCH 09/12] updated import statement --- fireworks/core/launchpad.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fireworks/core/launchpad.py b/fireworks/core/launchpad.py index f0f46fda1..10176e09d 100644 --- a/fireworks/core/launchpad.py +++ b/fireworks/core/launchpad.py @@ -21,7 +21,7 @@ from tqdm import tqdm from fireworks.core.firework import Firework, FWAction, Launch, Tracker, Workflow -from fireworks.fireworks.utilities import reservation_finder +from fireworks.utilities import reservation_finder from fireworks.fw_config import MongoClient from fireworks.fw_config import ( GRIDFS_FALLBACK_COLLECTION, From 78b1e5ea768b15d80e90a2fb373b747a3fc8908e Mon Sep 17 00:00:00 2001 From: Wayne Date: Fri, 26 Jul 2024 15:01:02 -0700 Subject: [PATCH 10/12] get reservation id from fw_id for slurm jobs works locally --- fireworks/core/launchpad.py | 11 +- fireworks/utilities/reservation_finder2.py | 196 +++++++++++++++++++++ 2 files changed, 199 insertions(+), 8 deletions(-) create mode 100644 fireworks/utilities/reservation_finder2.py diff --git a/fireworks/core/launchpad.py b/fireworks/core/launchpad.py index 10176e09d..71d65689f 100644 --- a/fireworks/core/launchpad.py +++ b/fireworks/core/launchpad.py @@ -22,6 +22,7 @@ from fireworks.core.firework import Firework, FWAction, Launch, Tracker, Workflow from fireworks.utilities import reservation_finder +from fireworks.utilities import reservation_finder2 from fireworks.fw_config import MongoClient from fireworks.fw_config import ( GRIDFS_FALLBACK_COLLECTION, @@ -1218,14 +1219,8 @@ def cancel_reservation_by_reservation_id(self, 
reservation_id) -> None: def get_reservation_id_from_fw_id(self, fw_id): """Given the firework id, return the reservation id.""" - fw = self.fireworks.find_one({"fw_id": fw_id}, {"launches": 1}) - if fw: - for launch in self.launches.find({"launch_id": {"$in": fw["launches"]}}, {"state_history": 1}): - for d in launch["state_history"]: - if "reservation_id" in d: - return d["reservation_id"] - return None - return None + jobid=reservation_finder2.main(fw_id) + return jobid def cancel_reservation(self, launch_id) -> None: """Given the launch id, cancel the reservation and rerun the fireworks.""" diff --git a/fireworks/utilities/reservation_finder2.py b/fireworks/utilities/reservation_finder2.py new file mode 100644 index 000000000..2b41df8ac --- /dev/null +++ b/fireworks/utilities/reservation_finder2.py @@ -0,0 +1,196 @@ +import os +import subprocess +import re +import json +import sys +import getpass +import paramiko + +# ANSI color codes for terminal output +RED = '\033[0;31m' +CYAN = '\033[0;36m' +ORANGE = '\033[0;33m' +NC = '\033[0m' # No Color + +def run_command(command, ssh=None): + if ssh: + stdin, stdout, stderr = ssh.exec_command(command) + output = stdout.read().decode('utf-8').strip() + errors = stderr.read().decode('utf-8').strip() + if errors: + raise Exception(f"Command failed: {command}\n{errors}") + return output + else: + result = subprocess.run(command, shell=True, capture_output=True, text=True) + if result.returncode != 0: + #raise Exception(f"Command failed: {command}\n{result.stderr}") + raise Exception('Running locally') + return result.stdout.strip() + +def execute_command(command): + try: + return run_command(command), None + except Exception as e: + print(e) + return ssh_login(command) + +def extract_username_hostname(input_string): + pattern = r'(?P[^@]+)@(?P.+)' + match = re.match(pattern, input_string) + if match: + return match.group('username'), match.group('hostname') + else: + raise ValueError("The input does not match the required 
format 'username@hostname'.") + +def ssh_login(command): + input_string = input("Enter username@hostname: ").strip() + username, hostname = extract_username_hostname(input_string) + password = getpass.getpass('Enter password+OTP: ') + + ssh = paramiko.SSHClient() + ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) + ssh.connect(hostname, username=username, password=password) + + return run_command(command, ssh), ssh + +def get_stdout_dir(job_id, ssh=None): + command = f"scontrol show jobid {job_id}" + output = run_command(command, ssh) + match = re.search(r"StdOut=(\S+)", output) + if match: + return match.group(1) + else: + print(f"{RED}StdOut path not found in job information{NC}") + sys.exit(1) + +def load_json(file_path, ssh=None): + if ssh: + command = f"cat {file_path}" + json_data = run_command(command, ssh) + return json.loads(json_data) + else: + with open(file_path) as f: + return json.load(f) + +def process_all_jobs(job_list,ssh=None): + fw_dict = {} + #user = os.getenv('USER') + #job_list = run_command(f"squeue --states=R -u {user}", ssh) + job_lines = job_list.splitlines()[1:] # Skip header + if not job_lines: + print(f"{RED}No jobs found!{NC}") + sys.exit(1) + + for line in job_lines: + job_id = line.split()[0] + print(f"{ORANGE}Processing job ID: {CYAN}{job_id}{NC}") + fw_id = process_single_job(job_id, ssh) + fw_dict[fw_id] = job_id + + + return fw_dict + +def process_single_job(job_id, ssh=None): + stdout_dir = get_stdout_dir(job_id, ssh) + base_dir = os.path.dirname(stdout_dir) + return dir_rapidfire(base_dir, ssh) + +def dir_singleshot(base_dir, ssh=None): + json_file = os.path.join(base_dir, "FW.json") + if ssh: + command = f"test -f {json_file} && cat {json_file}" + try: + data = run_command(command, ssh) + data = json.loads(data) + except: + print_warning(base_dir) + return 1 + else: + if os.path.isfile(json_file): + data = load_json(json_file) + else: + print_warning(base_dir) + return 1 + + spec_mpid = data.get('spec', 
{}).get('MPID') + fw_id = data.get('fw_id') + if spec_mpid: + print(f"spec.MPID: {spec_mpid}") + if fw_id: + print(f"fw_id: {fw_id}") + return fw_id if fw_id else 1 + +def dir_rapidfire(base_dir, ssh=None): + if ssh: + command = f"cd {base_dir} && ls -d launcher_*" + try: + launcher_dirs = run_command(command, ssh).split() + except: + return dir_singleshot(base_dir, ssh) + else: + launcher_dirs = [d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d)) and d.startswith("launcher_")] + + if not launcher_dirs: + return dir_singleshot(base_dir, ssh) + + largest_dir = sorted(launcher_dirs, reverse=True)[0] + launcher_path = os.path.join(base_dir, largest_dir) + json_file = os.path.join(launcher_path, "FW.json") + + if ssh: + command = f"test -f {json_file} && cat {json_file}" + try: + data = run_command(command, ssh) + data = json.loads(data) + except: + print_warning(base_dir) + return dir_singleshot(base_dir, ssh) + else: + if os.path.isfile(json_file): + data = load_json(json_file) + else: + print_warning(base_dir) + return dir_singleshot(base_dir) + + spec_mpid = data.get('spec', {}).get('MPID') + fw_id = data.get('fw_id') + #print(f"spec.MPID: {spec_mpid}") + print(f"fw_id: {fw_id}") + return fw_id + +def print_warning(dir_path): + warning_message = f""" + {"-" * 77} + | | + | W W AA RRRRR N N II N N GGGG !!! | + | W W A A R R NN N II NN N G G !!! | + | W W A A R R N N N II N N N G !!! | + | W WW W AAAAAA RRRRR N N N II N N N G GGG ! | + | WW WW A A R R N NN II N NN G G | + | W W A A R R N N II N N GGGG !!! | + | | + | This slurm job probably doesn't have an FW_ID associated with it. | + | something probably went wrong. You can probably check the directory | + | above to maybe figure out what happened. 
Best of luck | + | Also why are you using singleshot | + | | + {"-" * 77} + """ + print(warning_message) + +def main(fw_id): + current_dir = os.getcwd() + fw_dict = {} + + command = "squeue --states=R -u ${USER}" + result, ssh = execute_command(command) + #print(result) + + fw_dict = process_all_jobs(result,ssh) + + os.chdir(current_dir) + jobid=fw_dict.get(fw_id) + return jobid + +#if __name__ == "__main__": + # main() From 7fbbb5a9cd9c809a1426587fc522a40185094d30 Mon Sep 17 00:00:00 2001 From: Wayne Date: Fri, 26 Jul 2024 15:29:11 -0700 Subject: [PATCH 11/12] added loading bar and removed some print statements --- fireworks/core/launchpad.py | 8 ++++---- ..._finder.py => fw_id_from_reservation_id.py} | 0 ...finder2.py => reservation_id_from_fw_id.py} | 18 ++++++++++++++---- 3 files changed, 18 insertions(+), 8 deletions(-) rename fireworks/utilities/{reservation_finder.py => fw_id_from_reservation_id.py} (100%) rename fireworks/utilities/{reservation_finder2.py => reservation_id_from_fw_id.py} (92%) diff --git a/fireworks/core/launchpad.py b/fireworks/core/launchpad.py index 71d65689f..bcfe94457 100644 --- a/fireworks/core/launchpad.py +++ b/fireworks/core/launchpad.py @@ -21,8 +21,8 @@ from tqdm import tqdm from fireworks.core.firework import Firework, FWAction, Launch, Tracker, Workflow -from fireworks.utilities import reservation_finder -from fireworks.utilities import reservation_finder2 +from fireworks.utilities import fw_id_from_reservation_id +from fireworks.utilities import reservation_id_from_fw_id from fireworks.fw_config import MongoClient from fireworks.fw_config import ( GRIDFS_FALLBACK_COLLECTION, @@ -1203,7 +1203,7 @@ def get_fw_id_from_reservation_id(self, reservation_id): Returns: [int]: Return the firework id. 
""" - fw_id=reservation_finder.get_fwid(reservation_id) + fw_id=fw_id_from_reservation_id.get_fwid(reservation_id) return fw_id @@ -1219,7 +1219,7 @@ def cancel_reservation_by_reservation_id(self, reservation_id) -> None: def get_reservation_id_from_fw_id(self, fw_id): """Given the firework id, return the reservation id.""" - jobid=reservation_finder2.main(fw_id) + jobid=reservation_id_from_fw_id.main(fw_id) return jobid def cancel_reservation(self, launch_id) -> None: diff --git a/fireworks/utilities/reservation_finder.py b/fireworks/utilities/fw_id_from_reservation_id.py similarity index 100% rename from fireworks/utilities/reservation_finder.py rename to fireworks/utilities/fw_id_from_reservation_id.py diff --git a/fireworks/utilities/reservation_finder2.py b/fireworks/utilities/reservation_id_from_fw_id.py similarity index 92% rename from fireworks/utilities/reservation_finder2.py rename to fireworks/utilities/reservation_id_from_fw_id.py index 2b41df8ac..df81ad2f8 100644 --- a/fireworks/utilities/reservation_finder2.py +++ b/fireworks/utilities/reservation_id_from_fw_id.py @@ -80,10 +80,20 @@ def process_all_jobs(job_list,ssh=None): if not job_lines: print(f"{RED}No jobs found!{NC}") sys.exit(1) - - for line in job_lines: + load_length=len(job_lines) + for i,line in enumerate(job_lines): + i=i+1 job_id = line.split()[0] - print(f"{ORANGE}Processing job ID: {CYAN}{job_id}{NC}") + percent_complete = (i / load_length) * 100 + + # Create the bar part of the output + bar = '#' * i + '-' * (load_length - i) + + # Display the loading bar along with the percentage + sys.stdout.write(f'\r[{bar}] {percent_complete:.2f}%') + sys.stdout.flush() + print("\n") + #print(f"{ORANGE}Processing job ID: {CYAN}{job_id}{NC}") fw_id = process_single_job(job_id, ssh) fw_dict[fw_id] = job_id @@ -155,7 +165,7 @@ def dir_rapidfire(base_dir, ssh=None): spec_mpid = data.get('spec', {}).get('MPID') fw_id = data.get('fw_id') #print(f"spec.MPID: {spec_mpid}") - print(f"fw_id: {fw_id}") + 
#print(f"fw_id: {fw_id}") return fw_id def print_warning(dir_path): From 1b1f84bb0e4a3aab545096a47fa619b7e1082000 Mon Sep 17 00:00:00 2001 From: Wayne Date: Fri, 26 Jul 2024 15:32:17 -0700 Subject: [PATCH 12/12] added loading bar and removed some print statements --- fireworks/core/launchpad.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fireworks/core/launchpad.py b/fireworks/core/launchpad.py index bcfe94457..fef85ff95 100644 --- a/fireworks/core/launchpad.py +++ b/fireworks/core/launchpad.py @@ -1220,6 +1220,8 @@ def cancel_reservation_by_reservation_id(self, reservation_id) -> None: def get_reservation_id_from_fw_id(self, fw_id): """Given the firework id, return the reservation id.""" jobid=reservation_id_from_fw_id.main(fw_id) + if jobid==None: + print('No matching fw_id-JobID pair. The firework may be a lost run') return jobid def cancel_reservation(self, launch_id) -> None: