improved user file system, added 'no run' mode, save initial positions to PDB (#169)
ur-whitelab · Feb 22, 2025 · b265a6d · b265a6d
1 parent dde6e22
commit b265a6d
Show file tree

Hide file tree

Showing 7 changed files with 111 additions and 62 deletions.
diff --git a/README.md b/README.md
@@ -21,7 +21,7 @@ pip install git+https://github.com/ur-whitelab/MDCrow.git
 
 ## Usage
 The next step is to set up your API keys in your environment. An API key for an LLM provider is necessary for this project. Supported LLM providers are OpenAI, TogetherAI, Fireworks, and Anthropic.
-Other tools require API keys, such as paper-qa for literature searches. We recommend setting up the keys in a .env file. You can use the provided .env.example file as a template.
+We recommend setting up the API keys in a .env file. You can use the provided .env.example file as a template.
 1. Copy the `.env.example` file and rename it to `.env`: `cp .env.example .env`
 2. Replace the placeholder values in `.env` with your actual keys
 
@@ -43,6 +43,4 @@ By default, we support LLMs through OpenAI API. However, feel free to use other
 
 ## Contributing
 
-We welcome contributions to MDCrow! If you're interested in contributing to the project, please check out our [Contributor's Guide](CONTRIBUTING.md) for detailed instructions on getting started, feature development, and the pull request process.
-
 We value and appreciate all contributions to MDCrow.
diff --git a/mdcrow/agent/agent.py b/mdcrow/agent/agent.py
@@ -1,4 +1,5 @@
 import os
+from datetime import datetime
 
 from dotenv import load_dotenv
 from langchain.agents import AgentExecutor, OpenAIFunctionsAgent
@@ -46,6 +47,7 @@ def __init__(
         uploaded_files=[],  # user input files to add to path registry
         run_id="",
         use_memory=False,
+        modifysim_no_run=False,
         paper_dir=None,  # papers for pqa, relative path within repo
     ):
         self.llm = _make_llm(model, temp, streaming)
@@ -58,18 +60,51 @@ def __init__(
         self.ckpt_dir = self.path_registry.ckpt_dir
         self.memory = MemoryManager(self.path_registry, self.tools_llm, run_id=run_id)
         self.run_id = self.memory.run_id
-
         self.uploaded_files = uploaded_files
-        for file in uploaded_files:  # todo -> allow users to add descriptions?
-            self.path_registry.map_path(file, file, description="User uploaded file")
-
         self.agent = None
         self.agent_type = agent_type
         self.top_k_tools = top_k_tools
         self.use_human_tool = use_human_tool
         self.user_tools = tools
         self.verbose = verbose
 
+        if self.uploaded_files:
+            self.add_file(self.uploaded_files)
+        self.modifysim_no_run = modifysim_no_run
+
+    def _add_single_file(self, file_path, description=None):
+        """Register one user-supplied file in the path registry.
+
+        A registry ID of the form ``UPL_<n><YYYYMMDD_HHMMSS>`` is generated,
+        where ``<n>`` starts at 0 and is incremented until the ID is not found
+        in ``self.path_registry.list_path_names()``.
+
+        Args:
+            file_path: Path of the file to register.
+            description: Optional description stored with the mapping. Falls
+                back to "User uploaded file" when empty or None.
+        """
+        # Timestamp formatted as "YYYYMMDD_HHMMSS".
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        # Query the registry once; it is not modified inside the loop below.
+        existing_names = self.path_registry.list_path_names()
+        counter = 0
+        file_id = "UPL_" + str(counter) + timestamp
+        while file_id in existing_names:  # bump the counter until the ID is unused
+            counter += 1
+            file_id = "UPL_" + str(counter) + timestamp
+        if not description:
+            # TODO: optionally prompt the user for a description; for now a
+            # default description is always used.
+            description = "User uploaded file"
+        print(f"Adding file {file_path} with ID {file_id}\n")
+        self.path_registry.map_path(file_id, file_path, description=description)
+
+    def add_file(self, uploaded_files):
+        """Register user files in the path registry.
+
+        Args:
+            uploaded_files: One of
+                - a str file path,
+                - a ``(file_path, description)`` tuple, or
+                - a list whose items are any of the accepted forms
+                  (handled recursively).
+
+        Raises:
+            ValueError: If ``uploaded_files`` (or a list item) is not a str,
+                tuple, or list.
+        """
+        if isinstance(uploaded_files, str):
+            self._add_single_file(uploaded_files)
+        elif isinstance(uploaded_files, tuple):
+            self._add_single_file(uploaded_files[0], description=uploaded_files[1])
+        elif isinstance(uploaded_files, list):
+            # Items may be paths, tuples, or nested lists. Each file is
+            # already reported by _add_single_file, so nothing is printed here.
+            for entry in uploaded_files:
+                self.add_file(entry)
+        else:
+            # Adjacent literals instead of backslash continuations, which
+            # embedded the source indentation in the message.
+            raise ValueError(
+                "Invalid input. Please provide a file path "
+                "or list of file paths. Optionally, tuple or list of tuples "
+                "of file path and description"
+            )
+
     def _initialize_tools_and_agent(self, user_input=None):
         """Retrieve tools and initialize the agent."""
         if self.user_tools is not None:
@@ -88,6 +123,7 @@ def _initialize_tools_and_agent(self, user_input=None):
                 self.tools = make_all_tools(
                     self.tools_llm,
                     human=self.use_human_tool,
+                    modifysim_no_run=self.modifysim_no_run,
                 )
         return AgentExecutor.from_agent_and_tools(
             tools=self.tools,

diff --git a/mdcrow/tools/base_tools/preprocess_tools/pdb_get.py b/mdcrow/tools/base_tools/preprocess_tools/pdb_get.py
@@ -272,7 +272,7 @@ def small_molecule_pdb(self, mol_str: str) -> str:
         except Exception as e:
             print(
                 "There was an error getting pdb. Please input a single molecule name."
-                f"{mol_str},{mol_name}"
+                f"{mol_str}"
             )
             return (
                 "Failed. There was an error getting pdb. "

diff --git a/mdcrow/tools/base_tools/simulation_tools/create_simulation.py b/mdcrow/tools/base_tools/simulation_tools/create_simulation.py
@@ -1,3 +1,4 @@
+import os
 import textwrap
 from typing import Optional
 
@@ -16,7 +17,7 @@ class ModifyScriptUtils:
     def __init__(self, llm):
         self.llm = llm
 
-    def _prompt_summary(self, query: str):
+    def _prompt_summary(self, task: dict):
         if not self.llm:
             raise ValueError("No language model provided at ModifyScriptTool")
 
@@ -50,9 +51,7 @@ def _prompt_summary(self, query: str):
         )
         llm_chain = prompt | self.llm | StrOutputParser()
 
-        return llm_chain.invoke(query)
-
-        # Remove leading spaces for proper formatting
+        return llm_chain.invoke(task)
 
     def remove_leading_spaces(self, text):
         lines = text.split("\n")
@@ -61,15 +60,16 @@ def remove_leading_spaces(self, text):
 
 
 class ModifyScriptInput(BaseModel):
+    script_id: str = Field(..., description=" File ID of the simulation script file")
     query: str = Field(
         ...,
         description=(
-            "simulation required by the user. You MUST "
-            "specify the objective, and requirements of the simulation as well "
-            "as on what protein you are working."
+            "Simulation required by the user. Be as descriptive as possible, "
+            "including the requirements of the simulation, such as the force fields, "
+            "integrator, and constraints. Also, mention the protein you "
+            "are working on. "
         ),
     )
-    script: str = Field(..., description=" simulation ID of the base script file")
 
 
 class ModifyBaseSimulationScriptTool(BaseTool):
@@ -82,25 +82,28 @@ class ModifyBaseSimulationScriptTool(BaseTool):
     args_schema = ModifyScriptInput
     llm: Optional[BaseLanguageModel]
     path_registry: Optional[PathRegistry]
+    modifysim_no_run: Optional[bool]
 
-    def __init__(self, path_registry: Optional[PathRegistry], llm):
+    def __init__(self, path_registry, llm, modifysim_no_run=False):
         super().__init__()
         self.path_registry = path_registry
         self.llm = llm
+        self.modifysim_no_run = modifysim_no_run
 
-    def _run(self, *args, **input):
-        if len(args) > 0:
-            return (
-                "Failed. This tool expects you to provide the input as a "
-                "dictionary: {'query': 'your query', 'script': 'script id'}"
-            )
+    def _run(self, script_id: str, query: str) -> str:
         if not self.path_registry:
             return "Failed. No path registry provided"  # this should not happen
-        base_script_id = input.get("script")
+        base_script_id = script_id
         if not base_script_id:
             return (
                 "Failed. No id provided. The keys for the input are: "
-                "query' and 'script'"
+                "query' and 'script_id'"
+            )
+        current_ids = self.path_registry.list_path_names()
+        if base_script_id not in current_ids:
+            return (
+                f"Failed. File ID not found: {base_script_id}, make sure "
+                "the script ID is correct"
             )
         try:
             base_script_path = self.path_registry.get_mapped_path(base_script_id)
@@ -109,18 +112,22 @@ def _run(self, *args, **input):
                 parts[-1]
         except Exception as e:
             return f"Failed. Error getting path from file id: {e}"
-        with open(base_script_path, "r") as file:
-            base_script = file.read()
+        if os.path.exists(base_script_path):
+            with open(base_script_path, "r") as file:
+                base_script = file.read()
+        else:
+            return f"Failed. File not found: {base_script_id}"
+
         base_script = "".join(base_script)
         utils = ModifyScriptUtils(self.llm)
 
-        description = input.get("query")
+        description = query
         answer = utils._prompt_summary(
-            query={"base_script": base_script, "query": description}
+            task={"base_script": base_script, "query": description}
         )
-        script = answer["text"]
-        thoughts, new_script = script.split("SCRIPT:")
-        script_content = utils.remove_leading_spaces(new_script)
+        print("This the answer from the LLM\n\n", answer)
+        thoughts, new_script = answer.split("SCRIPT:")
+        script_content = new_script
         if "FINAL THOUGHTS:" in script_content:
             script_content, final_thoughts = script_content.split("FINAL THOUGHTS:")
         # replace ''' with #
@@ -135,8 +142,21 @@ def _run(self, *args, **input):
         with open(f"{directory}/{filename}", "w") as file:
             file.write(script_content)
 
-        self.path_registry.map_path(file_id, filename, description)
-        return f"Succeeded. Script modified successfully. Modified Script ID: {file_id}"
+        self.path_registry.map_path(file_id, f"{directory}/{filename}", description)
+        # if no-run mode is on, return the file id
+        if self.modifysim_no_run:
+            return f"Succeeded. Script modified successfully. Modified Script ID: {file_id}"
+        # if no-run mode is off, try to run the script
+        try:
+            exec(script_content)
+            return f"Succeeded. Script modified and ran \
+                successfully. Modified Script ID: {file_id}"
+        except Exception as e:
+            return (
+                f"Failed. Error running the script: {e}."
+                "Modified Script ID: {file_id}. If you want to try to correct the "
+                "script, use the file id of the modified to correct the script."
+            )
 
     async def _arun(self, query) -> str:
         """Use the tool asynchronously."""

diff --git a/mdcrow/tools/base_tools/simulation_tools/setup_and_run.py b/mdcrow/tools/base_tools/simulation_tools/setup_and_run.py
@@ -723,14 +723,22 @@ def _construct_script_content(
             system.addForce(MonteCarloBarostat(pressure, temperature, barostatInterval))
             """
 
-        if integrator_type == "LangevinMiddle" and constraints != "None":
+        if (
+            integrator_type == "LangevinMiddle"
+            and constraints != "None"
+            and constraints
+        ):
+            print("Constraints must be set to 'None' for LangevinMiddle integrator.")
+            print(integrator_type, "constraints: ", constraints)
             script_content += """
         integrator = LangevinMiddleIntegrator(temperature, friction, dt)
         integrator.setConstraintTolerance(constraintTolerance)
         simulation = Simulation(modeller.topology, system, integrator, platform)
         simulation.context.setPositions(modeller.positions)
         """
-        if integrator_type == "LangevinMiddle" and constraints == "None":
+        if integrator_type == "LangevinMiddle" and (
+            constraints == "None" or constraints is None
+        ):
             script_content += """
             integrator = LangevinMiddleIntegrator(temperature, friction, dt)
             simulation = Simulation(modeller.topology, system, integrator, platform)
@@ -742,6 +750,15 @@ def _construct_script_content(
 
         print('Performing energy minimization...')
         simulation.minimizeEnergy()
+        ## Save initial positions
+        top_name = 'simulation_initial_positions.pdb'
+        top_description = 'Initial positions of the simulation'
+        with open(top_name, "w") as f:
+            \tPDBFile.writeFile(
+                \tsimulation.topology,
+                \tsimulation.context.getState(getPositions=True).getPositions(),
+                \tf,
+            \t)
         print('Equilibrating...')
         simulation.context.setVelocitiesToTemperature(temperature)
         simulation.step(equilibrationSteps)

diff --git a/mdcrow/tools/maketools.py b/mdcrow/tools/maketools.py
@@ -64,6 +64,7 @@
 def make_all_tools(
     llm: BaseLanguageModel,
     human=False,
+    modifysim_no_run=False,
 ):
     load_dotenv()
     all_tools = []
@@ -72,7 +73,9 @@ def make_all_tools(
         all_tools += agents.load_tools(["llm-math"], llm)
         # all_tools += [PythonREPLTool()]
         all_tools += [
-            ModifyBaseSimulationScriptTool(path_registry=path_instance, llm=llm),
+            ModifyBaseSimulationScriptTool(
+                path_registry=path_instance, llm=llm, modifysim_no_run=modifysim_no_run
+            ),
         ]
         if path_instance.ckpt_papers:
             all_tools += [Scholar2ResultLLM(llm=llm, path_registry=path_instance)]

diff --git a/notebooks/experiments/prompts.md b/notebooks/experiments/prompts.md