From 5b080226711712d4823cc2427e2cc1f4938cd9fb Mon Sep 17 00:00:00 2001 From: Jorge <97254349+Jgmedina95@users.noreply.github.com> Date: Tue, 23 Jan 2024 16:40:33 -0500 Subject: [PATCH] 69 dealing with simulfiles (#70) 1. Add FileType.RECORD functionality to file-name writing and ID retrieval in the path registry (write name and get ID). 2. Add handling of temporary files in setup and run: if the simulation is final, record files are saved in the path registry; if not, they are deleted. --- .../preprocess_tools/clean_tools.py | 5 +- .../simulation_tools/setup_and_run.py | 250 +++++++++++++----- mdagent/utils/path_registry.py | 17 +- tests/test_fxns.py | 10 +- 4 files changed, 210 insertions(+), 72 deletions(-) diff --git a/mdagent/tools/base_tools/preprocess_tools/clean_tools.py b/mdagent/tools/base_tools/preprocess_tools/clean_tools.py index e6d408f1..a9c53d3f 100644 --- a/mdagent/tools/base_tools/preprocess_tools/clean_tools.py +++ b/mdagent/tools/base_tools/preprocess_tools/clean_tools.py @@ -227,7 +227,6 @@ class CleaningToolFunctionInput(BaseModel): """Input model for CleaningToolFunction""" pdb_id: str = Field(..., description="ID of the pdb/cif file in the path registry") - output_path: Optional[str] = Field(..., description="Path to the output file") replace_nonstandard_residues: bool = Field( True, description="Whether to replace nonstandard residues with standard ones. " ) @@ -301,7 +300,7 @@ def _run(self, **input_args) -> str: pdbfile_name = pdbfile.split("/")[-1] name = pdbfile_name.split("_")[0] end = pdbfile_name.split(".")[1] - print(f"pdbfile: {pdbfile}", f"name: {name}", f"end: {end}") + except Exception as e: print(f"error retrieving from path_registry, trying to read file {e}") return "File not found in path registry. 
" @@ -384,7 +383,7 @@ def _run(self, **input_args) -> str: self.path_registry.map_path( file_id, f"{directory}/{file_name}", file_description ) - return f"{file_id} written to {directory}/{file_name}" + return f"File cleaned!\nFile ID:{file_id}\nPath:{directory}/{file_name}" except FileNotFoundError: return "Check your file path. File not found." except Exception as e: diff --git a/mdagent/tools/base_tools/simulation_tools/setup_and_run.py b/mdagent/tools/base_tools/simulation_tools/setup_and_run.py index 53f558ee..e3f240c2 100644 --- a/mdagent/tools/base_tools/simulation_tools/setup_and_run.py +++ b/mdagent/tools/base_tools/simulation_tools/setup_and_run.py @@ -550,6 +550,22 @@ async def _arun(self, query: str) -> str: class SetUpandRunFunctionInput(BaseModel): pdb_id: str forcefield_files: List[str] + final: bool = Field( + False, + description=( + ( + "Set to 'True' when the simulation is the desired final version. " + "Determines whether the simulation is the primary one " + "intended for final use. If set to 'False' (default), " + "the simulation is considered as being in a testing " + "or preliminary scripting stage, utilizing default parameters. " + "This setting is ideal for initial experimentation or " + "basic script development before customizing the " + "script for final use." + ) + ), + ) + system_params: Dict[str, Any] = Field( { "nonbondedMethod": "NoCutoff", @@ -559,29 +575,30 @@ class SetUpandRunFunctionInput(BaseModel): "rigidWater": False, "constraintTolerance": None, }, - description="""Parameters for the openmm system. - For nonbondedMethod, you can choose from the following: - NoCutoff, CutoffNonPeriodic, CutoffPeriodic, Ewald, PME. - If anything but NoCutoff is chosen, - you have to include a nonbondedCutoff - and a constrainTolerance. - If PME is chosen, - you have to include an ewaldErrorTolerance too. - For constraints, you can choose from the following: - None, HBonds, AllBonds or OnlyWater. 
- For rigidWater, you can choose from the following: - True, False. - Example1: - {"nonbondedMethod": 'NoCutoff', - "constraints": 'None', - "rigidWater": False} - Example2: - {"nonbondedMethod": 'CutoffPeriodic', - "nonbondedCutoff": 1.0, - "constraints": 'HBonds', - "rigidWater": True, - "constraintTolerance": 0.00001} - """, + description=( + "Parameters for the openmm system. " + "For nonbondedMethod, you can choose from the following:\n" + "NoCutoff, CutoffNonPeriodic, CutoffPeriodic, Ewald, PME. " + "If anything but NoCutoff is chosen," + "you have to include a nonbondedCutoff" + "and a constrainTolerance.\n" + "If PME is chosen," + "you have to include an ewaldErrorTolerance too." + "For constraints, you can choose from the following:\n" + "None, HBonds, AllBonds or OnlyWater." + "For rigidWater, you can choose from the following:\n" + "True, False.\n" + "Example1:\n" + "{'nonbondedMethod': 'NoCutoff',\n" + "'constraints': 'None',\n" + "'rigidWater': False}\n" + "Example2:\n" + "{'nonbondedMethod': 'CutoffPeriodic',\n" + "'nonbondedCutoff': 1.0,\n" + "'constraints': 'HBonds',\n" + "'rigidWater': True,\n" + "'constraintTolerance': 0.00001} " + ), ) integrator_params: Dict[str, Any] = Field( { @@ -618,9 +635,17 @@ class SetUpandRunFunctionInput(BaseModel): class OpenMMSimulation: def __init__( - self, input_params: SetUpandRunFunctionInput, path_registry: PathRegistry + self, + input_params: SetUpandRunFunctionInput, + path_registry: PathRegistry, + final: bool, + sim_id: str, + pdb_id: str, ): self.params = input_params + self.final = final + self.sim_id = sim_id + self.pdb_id = pdb_id self.int_params = self.params.get("integrator_params", None) if self.int_params is None: self.int_params = { @@ -709,23 +734,74 @@ def create_simulation(self): ) self.simulation.context.setPositions(self.pdb.positions) - # Add reporters for output - self.simulation.reporters.append( - DCDReporter( - "trajectory.dcd", - self.sim_params["record_interval_steps"], + # TEMPORARY 
FILE MANAGEMENT OR PATH REGISTRY MAPPING + if self.final: + trajectory_name = self.path_registry.write_file_name( + type=FileType.RECORD, + record_type="TRAJ", + protein_file_id=self.pdb_id, + Sim_id=self.sim_id, + term="dcd", ) - ) - self.simulation.reporters.append( - StateDataReporter( - "log.txt", - self.sim_params["record_interval_steps"], - step=True, - potentialEnergy=True, - temperature=True, - separator="\t", + + log_name = self.path_registry.write_file_name( + type=FileType.RECORD, + record_type="LOG", + protein_file_id=self.pdb_id, + Sim_id=self.sim_id, + term="txt", + ) + traj_id = self.path_registry.get_fileid(trajectory_name, FileType.RECORD) + log_id = self.path_registry.get_fileid(log_name, FileType.RECORD) + traj_desc = ( + f"Simulation trajectory for protein {self.pdb_id}" + f" and simulation {self.sim_id}" + ) + log_desc = ( + f"Simulation state log for protein {self.pdb_id} " + f"and simulation {self.sim_id}" + ) + + self.simulation.reporters.append( + DCDReporter( + f"{trajectory_name}", + self.sim_params["record_interval_steps"], + ) + ) + self.simulation.reporters.append( + StateDataReporter( + f"{log_name}", + self.sim_params["record_interval_steps"], + step=True, + potentialEnergy=True, + temperature=True, + separator="\t", + ) + ) + self.registry_records = [ + (traj_id, f"files/records/{trajectory_name}", traj_desc), + (log_id, f"files/records/{log_name}", log_desc), + ] + + # TODO add checkpoint too? 
+ + else: + self.simulation.reporters.append( + DCDReporter( + "temp_trajectory.dcd", + self.sim_params["record_interval_steps"], + ) + ) + self.simulation.reporters.append( + StateDataReporter( + "temp_log.txt", + self.sim_params["record_interval_steps"], + step=True, + potentialEnergy=True, + temperature=True, + separator="\t", + ) ) - ) def _create_system( self, @@ -858,13 +934,13 @@ def unit_to_string(unit): steps = {self.sim_params.get("Number of Steps", record_interval_steps)} equilibrationSteps = 1000 platform = Platform.getPlatformByName('CPU') - dcdReporter = DCDReporter('trajectory.dcd', 10000) + dcdReporter = DCDReporter('trajectory.dcd', 1000) dataReporter = StateDataReporter('log.txt', {record_interval_steps}, totalSteps=steps, step=True, speed=True, progress=True, elapsedTime=True, remainingTime=True, potentialEnergy=True, temperature=True, volume=True, density=True, separator='\t') - checkpointReporter = CheckpointReporter('checkpoint.chk', 10000) + checkpointReporter = CheckpointReporter('checkpoint.chk', 5000) # Minimize and Equilibrate # ... code for minimization and equilibration ... @@ -989,16 +1065,26 @@ def run(self): self.simulation.currentStep = 0 self.simulation.step(self.sim_params["Number of Steps"]) print("Done!") + if not self.final: + if os.path.exists("temp_trajectory.dcd"): + os.remove("temp_trajectory.dcd") + if os.path.exists("temp_log.txt"): + os.remove("temp_log.txt") + if os.path.exists("temp_checkpoint.chk"): + os.remove("temp_checkpoint.chk") + return "Simulation done!" class SetUpandRunFunction(BaseTool): name: str = "SetUpandRunFunction" - description: str = """This tool will set up and run a short simulation of a protein. - Then will write a standalone script that can be used - to reproduce the simulation or change accordingly for - a more elaborate simulation. 
It only runs short simulations because, - if there are errors you can try again changing the input""" + description: str = ( + "This tool will set up and run a short simulation of a protein. " + "Then will write a standalone script that can be used " + "to reproduce the simulation or change accordingly for " + "a more elaborate simulation. It only runs short simulations because, " + "if there are errors, you can try again changing the input" + ) args_schema: Type[BaseModel] = SetUpandRunFunctionInput @@ -1009,17 +1095,43 @@ def _run(self, **input_args): print("Path registry not initialized") return "Path registry not initialized" input = self.check_system_params(input_args) - error = input.get("error", None) if error: + print(f"error found: {error}") return error + try: pdb_id = input["pdb_id"] + # check if pdb_id is in the registry or as 1XYZ_112233 format + if pdb_id not in self.path_registry.list_path_names(): + return "No pdb_id found in input, use the file id not the file name" except KeyError: print("whoops no pdb_id found in input,", input) return "No pdb_id found in input" try: - Simulation = OpenMMSimulation(input, self.path_registry) + final = input["final"] # either this simulation + # the final one or not for this system + except KeyError: + final = False + print( + "No 'final' key found in input, setting to False. " + "Record files will be deleted after script is written." + ) + try: + file_name = self.path_registry.write_file_name( + type=FileType.SIMULATION, + type_of_sim=input["simmulation_params"]["Ensemble"], + protein_file_id=pdb_id, + ) + + sim_id = self.path_registry.get_fileid(file_name, FileType.SIMULATION) + except Exception as e: + print(f"An exception was found: {str(e)}.") + return f"An exception was found trying to write the filenames: {str(e)}." 
+ try: + Simulation = OpenMMSimulation( + input, self.path_registry, final, sim_id, pdb_id + ) print("simulation set!") except ValueError as e: return str(e) + f"This were the inputs {input_args}" @@ -1030,27 +1142,36 @@ def _run(self, **input_args): try: Simulation.run() except Exception as e: - return f"""An exception was found: {str(e)}. Not a problem, thats one - purpose of this tool: to run a short simulation to check for correct - initialization. \n\n Try a) with different parameters like - nonbondedMethod, constraints, etc or b) clean file inputs depending on error - """ - try: - file_name = self.path_registry.write_file_name( - type=FileType.SIMULATION, - type_of_sim=input["simmulation_params"]["Ensemble"], - protein_file_id=pdb_id, + return ( + f"An exception was found: {str(e)}. Not a problem, thats one " + "purpose of this tool: to run a short simulation to check for correct " + "initialization. " + "" + "Try a) with different parameters like " + "nonbondedMethod, constraints, etc \n or\n" + "b) clean file inputs depending on error " ) - file_id = self.path_registry.get_fileid(file_name, FileType.SIMULATION) + try: Simulation.write_standalone_script(filename=file_name) self.path_registry.map_path( - file_id, file_name, f"Basic Simulation of Protein {pdb_id}" + sim_id, + f"files/simulations/{file_name}", + f"Basic Simulation of Protein {pdb_id}", ) + if final: + records = Simulation.registry_records + # move record files to files/records/ + print(os.listdir(".")) + if not os.path.exists("files/records"): + os.makedirs("files/records") + for record in records: + os.rename(record[1].split("/")[-1], f"{record[1]}") + for record in records: + self.path_registry.map_path(*record) return "Simulation done!" except Exception as e: print(f"An exception was found: {str(e)}.") - return f"""An exception was found trying to write the filenames: {str(e)}. - """ + return f"An exception was found trying to write the filenames: {str(e)}." 
def _parse_cutoff(self, cutoff): # Check if cutoff is already an OpenMM Quantity (has a unit) @@ -1481,6 +1602,10 @@ def check_system_params(cls, values): if file not in FORCEFIELD_LIST: error_msg += "The forcefield file is not present" + final = values.get("final", False) + if type(final) != bool: + error_msg += "final must be a boolean value" + if error_msg != "": return { "error": error_msg @@ -1489,6 +1614,7 @@ def check_system_params(cls, values): values = { "pdb_id": pdb_id, "forcefield_files": forcefield_files, + "final": final, "system_params": system_params, "integrator_params": integrator_params, "simmulation_params": simmulation_params, diff --git a/mdagent/utils/path_registry.py b/mdagent/utils/path_registry.py index b5733fd3..702833ca 100644 --- a/mdagent/utils/path_registry.py +++ b/mdagent/utils/path_registry.py @@ -56,13 +56,13 @@ def _check_json_content(self, name): return name in data # we use this fxn to "save" files (paths) to the json file - def map_path(self, name, path, description=None): + def map_path(self, file_id, path, description=None): description = description or "No description provided" full_path = self._get_full_path(path) - path_dict = {name: {"path": full_path, "description": description}} + path_dict = {file_id: {"path": full_path, "description": description}} self._save_mapping_to_json(path_dict) - saved = self._check_json_content(name) - return f"Path {'successfully' if saved else 'not'} mapped to name: {name}" + saved = self._check_json_content(file_id) + return f"Path {'successfully' if saved else 'not'} mapped to name: {file_id}" # this if we want to get the path. 
not use as often def get_mapped_path(self, name): @@ -159,6 +159,7 @@ def write_file_name(self, type: FileType, **kwargs): conditions = kwargs.get("conditions", None) Sim_id = kwargs.get("Sim_id", None) modified = kwargs.get("modified", False) + term = kwargs.get("term", "term") # Default term if not provided file_name = "" if type == FileType.PROTEIN: file_name += f"{protein_name}_{description}_{time_stamp}.{file_format}" @@ -171,6 +172,14 @@ def write_file_name(self, type: FileType, **kwargs): file_name += f"{Sim_id}_MOD_{time_stamp}.py" else: file_name += f"{type_of_sim}_{protein_file_id}_{time_stamp}.py" + if type == FileType.RECORD: + record_type_name = kwargs.get("record_type", "RECORD") + term = kwargs.get("term", "term") # Default term if not provided + + file_name = ( + f"{record_type_name}_{Sim_id}_{protein_file_id}_" f"{time_stamp}.{term}" + ) + if file_name == "": file_name += "ErrorDuringNaming_error.py" return file_name diff --git a/tests/test_fxns.py b/tests/test_fxns.py index 5b136a88..67f6ae34 100644 --- a/tests/test_fxns.py +++ b/tests/test_fxns.py @@ -229,12 +229,16 @@ def test_write_file_name_simulation_default(path_registry): assert file_name == "MD_123_20240109.py" -@pytest.mark.skip(reason="not implemented for record files") def test_write_file_name_record(path_registry): file_name = path_registry.write_file_name( - FileType.RECORD, protein_file_id="123", Sim_id="SIM456", time_stamp="20240109" + FileType.RECORD, + record_type="REC", + protein_file_id="123", + Sim_id="SIM456", + term="dcd", + time_stamp="20240109", ) - assert file_name == "123_SIM456_20240109" + assert file_name == "REC_SIM456_123_20240109.dcd" def test_map_path():