Skip to content

Commit

Permalink
improved user file system, added 'no run' mode, save initial position…
Browse files Browse the repository at this point in the history
…s to PDB (#169)
  • Loading branch information
qcampbel authored Feb 22, 2025
1 parent dde6e22 commit b265a6d
Show file tree
Hide file tree
Showing 7 changed files with 111 additions and 62 deletions.
4 changes: 1 addition & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ pip install git+https://github.com/ur-whitelab/MDCrow.git

## Usage
The next step is to set up your API keys in your environment. An API key for an LLM provider is necessary for this project. Supported LLM providers are OpenAI, TogetherAI, Fireworks, and Anthropic.
Other tools require API keys, such as paper-qa for literature searches. We recommend setting up the keys in a .env file. You can use the provided .env.example file as a template.
We recommend setting up the API keys in a .env file. You can use the provided .env.example file as a template.
1. Copy the `.env.example` file and rename it to `.env`: `cp .env.example .env`
2. Replace the placeholder values in `.env` with your actual keys

Expand All @@ -43,6 +43,4 @@ By default, we support LLMs through OpenAI API. However, feel free to use other

## Contributing

We welcome contributions to MDCrow! If you're interested in contributing to the project, please check out our [Contributor's Guide](CONTRIBUTING.md) for detailed instructions on getting started, feature development, and the pull request process.

We value and appreciate all contributions to MDCrow.
44 changes: 40 additions & 4 deletions mdcrow/agent/agent.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
from datetime import datetime

from dotenv import load_dotenv
from langchain.agents import AgentExecutor, OpenAIFunctionsAgent
Expand Down Expand Up @@ -46,6 +47,7 @@ def __init__(
uploaded_files=[], # user input files to add to path registry
run_id="",
use_memory=False,
modifysim_no_run=False,
paper_dir=None, # papers for pqa, relative path within repo
):
self.llm = _make_llm(model, temp, streaming)
Expand All @@ -58,18 +60,51 @@ def __init__(
self.ckpt_dir = self.path_registry.ckpt_dir
self.memory = MemoryManager(self.path_registry, self.tools_llm, run_id=run_id)
self.run_id = self.memory.run_id

self.uploaded_files = uploaded_files
for file in uploaded_files: # todo -> allow users to add descriptions?
self.path_registry.map_path(file, file, description="User uploaded file")

self.agent = None
self.agent_type = agent_type
self.top_k_tools = top_k_tools
self.use_human_tool = use_human_tool
self.user_tools = tools
self.verbose = verbose

if self.uploaded_files:
self.add_file(self.uploaded_files)
self.modifysim_no_run = modifysim_no_run

def _add_single_file(self, file_path, description=None):
    """Register one user-supplied file in the path registry under a fresh ID.

    The ID has the form ``UPL_<n><YYYYMMDD_HHMMSS>``, where ``n`` is the
    smallest non-negative counter for which the resulting name is not
    already reported by ``self.path_registry.list_path_names()``.

    Args:
        file_path: Location of the file to register.
        description: Text stored with the registry entry. A falsy value is
            replaced by "User uploaded file".
    """
    # Timestamp component of the ID, e.g. "20250222_134501".
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    counter = 0
    while True:
        file_id = f"UPL_{counter}{stamp}"
        # Keep bumping the counter until the candidate name is unused.
        if file_id not in self.path_registry.list_path_names():
            break
        counter += 1
    # No interactive prompt is implemented here; a missing description
    # simply falls back to a generic label.
    description = description or "User uploaded file"
    print(f"Adding file {file_path} with ID {file_id}\n")
    self.path_registry.map_path(file_id, file_path, description=description)

def add_file(self, uploaded_files):
    """Register one or more user-supplied files with the path registry.

    Each file is handed to ``self._add_single_file``, which reports the
    file being added, so no extra progress output is printed here.

    Args:
        uploaded_files: One of

            * a ``str`` file path;
            * a ``tuple`` whose first item is the file path and whose
              optional second item is a description;
            * a ``list`` of any of the above (lists may be nested).

    Raises:
        ValueError: If ``uploaded_files`` is an empty tuple, or is not a
            str, tuple, or list.
    """
    if isinstance(uploaded_files, str):
        self._add_single_file(uploaded_files)
    elif isinstance(uploaded_files, tuple):
        if not uploaded_files:
            raise ValueError(
                "Invalid input. A tuple must contain a file path and, "
                "optionally, a description"
            )
        # Tolerate a path-only tuple instead of failing with IndexError.
        file_path, *extra = uploaded_files
        self._add_single_file(
            file_path, description=extra[0] if extra else None
        )
    elif isinstance(uploaded_files, list):
        for entry in uploaded_files:
            self.add_file(entry)
    else:
        # Adjacent literals (not backslash continuations inside one literal)
        # keep stray indentation out of the message and preserve the spaces
        # between words.
        raise ValueError(
            "Invalid input. Please provide a file path "
            "or list of file paths. Optionally, tuple or list of tuples "
            "of file path and description"
        )

def _initialize_tools_and_agent(self, user_input=None):
"""Retrieve tools and initialize the agent."""
if self.user_tools is not None:
Expand All @@ -88,6 +123,7 @@ def _initialize_tools_and_agent(self, user_input=None):
self.tools = make_all_tools(
self.tools_llm,
human=self.use_human_tool,
modifysim_no_run=self.modifysim_no_run,
)
return AgentExecutor.from_agent_and_tools(
tools=self.tools,
Expand Down
2 changes: 1 addition & 1 deletion mdcrow/tools/base_tools/preprocess_tools/pdb_get.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,7 @@ def small_molecule_pdb(self, mol_str: str) -> str:
except Exception as e:
print(
"There was an error getting pdb. Please input a single molecule name."
f"{mol_str},{mol_name}"
f"{mol_str}"
)
return (
"Failed. There was an error getting pdb. "
Expand Down
72 changes: 46 additions & 26 deletions mdcrow/tools/base_tools/simulation_tools/create_simulation.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
import textwrap
from typing import Optional

Expand All @@ -16,7 +17,7 @@ class ModifyScriptUtils:
def __init__(self, llm):
self.llm = llm

def _prompt_summary(self, query: str):
def _prompt_summary(self, task: dict):
if not self.llm:
raise ValueError("No language model provided at ModifyScriptTool")

Expand Down Expand Up @@ -50,9 +51,7 @@ def _prompt_summary(self, query: str):
)
llm_chain = prompt | self.llm | StrOutputParser()

return llm_chain.invoke(query)

# Remove leading spaces for proper formatting
return llm_chain.invoke(task)

def remove_leading_spaces(self, text):
lines = text.split("\n")
Expand All @@ -61,15 +60,16 @@ def remove_leading_spaces(self, text):


class ModifyScriptInput(BaseModel):
script_id: str = Field(..., description=" File ID of the simulation script file")
query: str = Field(
...,
description=(
"simulation required by the user. You MUST "
"specify the objective, and requirements of the simulation as well "
"as on what protein you are working."
"Simulation required by the user. Be as descriptive as possible, "
"including the requirements of the simulation, such as the force fields, "
"integrator, and constraints. Also, mention the protein you "
"are working on. "
),
)
script: str = Field(..., description=" simulation ID of the base script file")


class ModifyBaseSimulationScriptTool(BaseTool):
Expand All @@ -82,25 +82,28 @@ class ModifyBaseSimulationScriptTool(BaseTool):
args_schema = ModifyScriptInput
llm: Optional[BaseLanguageModel]
path_registry: Optional[PathRegistry]
modifysim_no_run: Optional[bool]

def __init__(self, path_registry: Optional[PathRegistry], llm):
def __init__(self, path_registry, llm, modifysim_no_run=False):
super().__init__()
self.path_registry = path_registry
self.llm = llm
self.modifysim_no_run = modifysim_no_run

def _run(self, *args, **input):
if len(args) > 0:
return (
"Failed. This tool expects you to provide the input as a "
"dictionary: {'query': 'your query', 'script': 'script id'}"
)
def _run(self, script_id: str, query: str) -> str:
if not self.path_registry:
return "Failed. No path registry provided" # this should not happen
base_script_id = input.get("script")
base_script_id = script_id
if not base_script_id:
return (
"Failed. No id provided. The keys for the input are: "
"query' and 'script'"
"query' and 'script_id'"
)
current_ids = self.path_registry.list_path_names()
if base_script_id not in current_ids:
return (
f"Failed. File ID not found: {base_script_id}, make sure "
"the script ID is correct"
)
try:
base_script_path = self.path_registry.get_mapped_path(base_script_id)
Expand All @@ -109,18 +112,22 @@ def _run(self, *args, **input):
parts[-1]
except Exception as e:
return f"Failed. Error getting path from file id: {e}"
with open(base_script_path, "r") as file:
base_script = file.read()
if os.path.exists(base_script_path):
with open(base_script_path, "r") as file:
base_script = file.read()
else:
return f"Failed. File not found: {base_script_id}"

base_script = "".join(base_script)
utils = ModifyScriptUtils(self.llm)

description = input.get("query")
description = query
answer = utils._prompt_summary(
query={"base_script": base_script, "query": description}
task={"base_script": base_script, "query": description}
)
script = answer["text"]
thoughts, new_script = script.split("SCRIPT:")
script_content = utils.remove_leading_spaces(new_script)
print("This the answer from the LLM\n\n", answer)
thoughts, new_script = answer.split("SCRIPT:")
script_content = new_script
if "FINAL THOUGHTS:" in script_content:
script_content, final_thoughts = script_content.split("FINAL THOUGHTS:")
# replace ''' with #
Expand All @@ -135,8 +142,21 @@ def _run(self, *args, **input):
with open(f"{directory}/{filename}", "w") as file:
file.write(script_content)

self.path_registry.map_path(file_id, filename, description)
return f"Succeeded. Script modified successfully. Modified Script ID: {file_id}"
self.path_registry.map_path(file_id, f"{directory}/{filename}", description)
# if no-run mode is on, return the file id
if self.modifysim_no_run:
return f"Succeeded. Script modified successfully. Modified Script ID: {file_id}"
# if no-run mode is off, try to run the script
try:
exec(script_content)
return f"Succeeded. Script modified and ran \
successfully. Modified Script ID: {file_id}"
except Exception as e:
return (
f"Failed. Error running the script: {e}."
"Modified Script ID: {file_id}. If you want to try to correct the "
"script, use the file id of the modified to correct the script."
)

async def _arun(self, query) -> str:
"""Use the tool asynchronously."""
Expand Down
21 changes: 19 additions & 2 deletions mdcrow/tools/base_tools/simulation_tools/setup_and_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -723,14 +723,22 @@ def _construct_script_content(
system.addForce(MonteCarloBarostat(pressure, temperature, barostatInterval))
"""

if integrator_type == "LangevinMiddle" and constraints != "None":
if (
integrator_type == "LangevinMiddle"
and constraints != "None"
and constraints
):
print("Constraints must be set to 'None' for LangevinMiddle integrator.")
print(integrator_type, "constraints: ", constraints)
script_content += """
integrator = LangevinMiddleIntegrator(temperature, friction, dt)
integrator.setConstraintTolerance(constraintTolerance)
simulation = Simulation(modeller.topology, system, integrator, platform)
simulation.context.setPositions(modeller.positions)
"""
if integrator_type == "LangevinMiddle" and constraints == "None":
if integrator_type == "LangevinMiddle" and (
constraints == "None" or constraints is None
):
script_content += """
integrator = LangevinMiddleIntegrator(temperature, friction, dt)
simulation = Simulation(modeller.topology, system, integrator, platform)
Expand All @@ -742,6 +750,15 @@ def _construct_script_content(
print('Performing energy minimization...')
simulation.minimizeEnergy()
## Save initial positions
top_name = 'simulation_initial_positions.pdb'
top_description = 'Initial positions of the simulation'
with open(top_name, "w") as f:
\tPDBFile.writeFile(
\tsimulation.topology,
\tsimulation.context.getState(getPositions=True).getPositions(),
\tf,
\t)
print('Equilibrating...')
simulation.context.setVelocitiesToTemperature(temperature)
simulation.step(equilibrationSteps)
Expand Down
5 changes: 4 additions & 1 deletion mdcrow/tools/maketools.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
def make_all_tools(
llm: BaseLanguageModel,
human=False,
modifysim_no_run=False,
):
load_dotenv()
all_tools = []
Expand All @@ -72,7 +73,9 @@ def make_all_tools(
all_tools += agents.load_tools(["llm-math"], llm)
# all_tools += [PythonREPLTool()]
all_tools += [
ModifyBaseSimulationScriptTool(path_registry=path_instance, llm=llm),
ModifyBaseSimulationScriptTool(
path_registry=path_instance, llm=llm, modifysim_no_run=modifysim_no_run
),
]
if path_instance.ckpt_papers:
all_tools += [Scholar2ResultLLM(llm=llm, path_registry=path_instance)]
Expand Down
25 changes: 0 additions & 25 deletions notebooks/experiments/prompts.md

This file was deleted.

0 comments on commit b265a6d

Please sign in to comment.