Skip to content

Commit

Permalink
trying to use rdkit for small mol pdb
Browse files Browse the repository at this point in the history
  • Loading branch information
SamCox822 committed Jan 31, 2024
1 parent 60bb0e6 commit ee12619
Show file tree
Hide file tree
Showing 6 changed files with 160 additions and 9 deletions.
3 changes: 2 additions & 1 deletion mdagent/tools/base_tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
RemoveWaterCleaningTool,
SpecializedCleanTool,
)
from .preprocess_tools.pdb_tools import Name2PDBTool, PackMolTool, get_pdb
from .preprocess_tools.pdb_tools import Name2PDBTool, PackMolTool, SmallMolPDB, get_pdb
from .simulation_tools.create_simulation import ModifyBaseSimulationScriptTool
from .simulation_tools.setup_and_run import (
InstructionSummary,
Expand All @@ -35,6 +35,7 @@
"Name2PDBTool",
"PackMolTool",
"PPIDistance",
"SmallMolPDB",
"VisualizeProtein",
"RMSDCalculator",
"RemoveWaterCleaningTool",
Expand Down
3 changes: 2 additions & 1 deletion mdagent/tools/base_tools/preprocess_tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
RemoveWaterCleaningTool,
SpecializedCleanTool,
)
from .pdb_tools import Name2PDBTool, PackMolTool, get_pdb
from .pdb_tools import Name2PDBTool, PackMolTool, SmallMolPDB, get_pdb

__all__ = [
"AddHydrogensCleaningTool",
Expand All @@ -16,4 +16,5 @@
"SpecializedCleanTool",
"get_pdb",
"CleaningToolFunction",
"SmallMolPDB",
]
127 changes: 120 additions & 7 deletions mdagent/tools/base_tools/preprocess_tools/pdb_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from langchain.tools import BaseTool
from pdbfixer import PDBFixer
from pydantic import BaseModel, Field, ValidationError, root_validator
from rdkit import Chem

from mdagent.utils import FileType, PathRegistry

Expand Down Expand Up @@ -64,11 +65,10 @@ class Name2PDBTool(BaseTool):
name = "PDBFileDownloader"
description = """This tool downloads PDB (Protein Data Bank) or
CIF (Crystallographic Information File) files using
commercial chemical names. It’s ideal for situations where
you need to directly retrieve these file using a chemical’s
commercial name. When a specific file type, either PDB or CIF,
a protein's common name (NOT a small molecule).
When a specific file type, either PDB or CIF,
is requested, add file type to the query string with space.
Input: Commercial name of the chemical or file without
Input: Commercial name of the protein or file without
file extension
Output: Corresponding PDB or CIF file"""
path_registry: Optional[PathRegistry]
Expand Down Expand Up @@ -453,7 +453,7 @@ def validate_input(cls, values: Union[str, Dict[str, Any]]) -> Dict:
class PackMolTool(BaseTool):
name: str = "packmol_tool"
description: str = """Useful when you need to create a box
of different types of molecules molecules"""
of different types of molecules molecules. """

args_schema: Type[BaseModel] = PackmolInput

Expand Down Expand Up @@ -1446,9 +1446,9 @@ class FixPDBFile(BaseTool):
description: str = "Fixes PDB files columns if needed"
args_schema: Type[BaseModel] = PDBFilesFixInp

path_registry: typing.Optional[PathRegistry]
path_registry: Optional[PathRegistry]

def __init__(self, path_registry: typing.Optional[PathRegistry]):
def __init__(self, path_registry: Optional[PathRegistry]):
super().__init__()
self.path_registry = path_registry

Expand Down Expand Up @@ -1495,3 +1495,116 @@ def _run(self, query: Dict):
return "PDB file fixed"
else:
return "PDB not fully fixed"


class MolPDB:
    """Create PDB files for small molecules given a name or a SMILES string.

    Names are translated to SMILES (and SMILES to names) through the PubChem
    PUG REST service; parsing, hydrogen addition, 3D embedding and PDB output
    are done with RDKit.
    """

    # Seconds to wait for PubChem, so a stalled connection cannot block the
    # calling tool indefinitely.
    _REQUEST_TIMEOUT = 30

    def is_smiles(self, text: str) -> bool:
        """Return True if ``text`` can be parsed as a SMILES string.

        Parsing is done with ``sanitize=False``, so only the syntax is
        checked. Empty input is rejected explicitly because RDKit parses an
        empty string into an (atom-less) molecule rather than returning None.
        """
        if not text:
            return False
        try:
            return Chem.MolFromSmiles(text, sanitize=False) is not None
        except Exception:
            # A predicate should never raise; anything RDKit cannot parse
            # (including non-string input) is simply "not a SMILES".
            return False

    def largest_mol(self, smiles: str) -> str:
        """Return the longest valid fragment of a dot-separated SMILES.

        Used to strip salts / counter-ions. Adapted from
        https://github.com/ur-whitelab/chemcrow-public/blob/main/chemcrow/utils.py

        Raises:
            IndexError: if none of the fragments is a valid SMILES. The
                exception type is the one the original implementation
                raised in this situation; only the message is new.
        """
        # Ascending stable sort walked backwards keeps the original
        # tie-breaking: among equally long fragments the last one wins.
        fragments = sorted(smiles.split("."), key=len)
        for fragment in reversed(fragments):
            if self.is_smiles(fragment):
                return fragment
        raise IndexError(f"no valid SMILES fragment found in {smiles!r}")

    def molname2smiles(self, query: str) -> str:
        """Look up ``query`` by name on PubChem and return its SMILES.

        Adapted from
        https://github.com/ur-whitelab/chemcrow-public/blob/main/chemcrow/tools/databases.py

        Returns:
            The canonical SMILES of the largest fragment of the match, or a
            human-readable "could not find" message when the PubChem
            response does not contain a SMILES entry.
        """
        from urllib.parse import quote

        # Percent-encode the name so characters such as "/", "#" or "?"
        # cannot change the structure of the request URL.
        url = (
            "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/"
            f"{quote(query, safe='')}/property/IsomericSMILES/JSON"
        )
        response = requests.get(url, timeout=self._REQUEST_TIMEOUT)
        try:
            data = response.json()
            smi = data["PropertyTable"]["Properties"][0]["IsomericSMILES"]
        except (KeyError, IndexError, TypeError, ValueError):
            # Missing keys, an empty property list or a non-JSON body all
            # mean the same thing for the caller: no usable match.
            return (
                "Could not find a molecule matching the text.\n"
                "One possible cause is that the input is incorrect, input one\n"
                "molecule at a time."
            )
        # remove salts
        return Chem.CanonSmiles(self.largest_mol(smi))

    def smiles2name(self, smi: str) -> str:
        """Return the first PubChem synonym for the SMILES ``smi``.

        Returns:
            The synonym, ``"Invalid SMILES string"`` when RDKit cannot
            canonicalize ``smi``, or ``"Unknown Molecule"`` when the PubChem
            response does not contain a synonym.
        """
        from urllib.parse import quote

        try:
            smi = Chem.MolToSmiles(Chem.MolFromSmiles(smi), canonical=True)
        except Exception:
            return "Invalid SMILES string"
        # SMILES routinely contain "#", "/", "\\" and "+", all of which are
        # significant in URLs, so the string must be percent-encoded.
        url = (
            "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/"
            f"{quote(smi, safe='')}/synonyms/JSON"
        )
        response = requests.get(url, timeout=self._REQUEST_TIMEOUT)
        try:
            return response.json()["InformationList"]["Information"][0]["Synonym"][0]
        except (KeyError, IndexError, TypeError, ValueError):
            return "Unknown Molecule"

    def small_molecule_pdb(self, mol_str: str, path_registry=None) -> str:
        """Write ``<name>.pdb`` for a molecule given by name or SMILES.

        If ``mol_str`` is a SMILES string the file is named after the
        molecule's PubChem synonym; otherwise ``mol_str`` is treated as a
        name, resolved to SMILES, and used as the file name.

        Args:
            mol_str: molecule name or SMILES string.
            path_registry: optional registry; when given, the new file is
                recorded in it through ``map_path``.

        Returns:
            A success message naming the written file, or a fixed error
            message when the molecule cannot be resolved, embedded or saved.
        """
        # ``from rdkit import Chem`` does not guarantee that the AllChem
        # submodule is loaded, so it is imported explicitly here.
        from rdkit.Chem import AllChem

        error_message = (
            "There was an error getting pdb. Please input a single molecule name."
        )
        try:
            if self.is_smiles(mol_str):
                mol = Chem.MolFromSmiles(mol_str)
                mol_name = self.smiles2name(mol_str)
            else:  # if input is not smiles, try getting smiles
                mol = Chem.MolFromSmiles(self.molname2smiles(mol_str))
                mol_name = mol_str
            if mol is None:
                # Either sanitization failed or the name lookup returned its
                # "could not find" message instead of a SMILES.
                return error_message
            mol = Chem.AddHs(mol)
            # EmbedMolecule reports failure through a negative conformer id;
            # without a conformer there are no coordinates worth writing.
            if AllChem.EmbedMolecule(mol) < 0:
                return error_message
            # Path separators in a name must not redirect where the file is
            # written.
            safe_name = mol_name.replace("/", "_").replace("\\", "_")
            file_name = f"{safe_name}.pdb"
            Chem.MolToPDBFile(mol, file_name)
            # add to path registry
            if path_registry:
                path_registry.map_path(
                    file_name, file_name, f"pdb file for the small molecule {mol_name}"
                )
            return (
                f"PDB file for {mol_str} successfully created and saved to {file_name}."
            )
        except Exception:
            # Tool boundary: the caller expects a message string, never a
            # traceback, so any remaining failure (network, RDKit, file
            # system) is reported with the generic error text.
            return error_message


class SmallMolPDB(BaseTool):
    """Tool wrapper that exposes ``MolPDB.small_molecule_pdb`` to the agent."""

    name = "SmallMoleculePDB"
    description = (
        "\n"
        "Creates a PDB file for a small molecule\n"
        "Use this tool when you need to use a small molecule in a simulation.\n"
        "Input can be a molecule name or a SMILES string."
    )
    # Registry handed to MolPDB so the generated file can be recorded.
    path_registry: Optional[PathRegistry]

    def __init__(self, path_registry: Optional[PathRegistry]):
        # The base class is initialised first; the registry is attached
        # afterwards.
        super().__init__()
        self.path_registry = path_registry

    def _run(self, mol_str: str) -> str:
        """Build the PDB file for ``mol_str`` and return the status message."""
        return MolPDB().small_molecule_pdb(mol_str, self.path_registry)
2 changes: 2 additions & 0 deletions mdagent/tools/maketools.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
Scholar2ResultLLM,
SetUpandRunFunction,
SimulationOutputFigures,
SmallMolPDB,
VisualizeProtein,
)
from .subagent_tools import RetryExecuteSkill, SkillRetrieval, WorkflowPlan
Expand Down Expand Up @@ -82,6 +83,7 @@ def make_all_tools(
# MapPath2Name(path_registry=path_instance),
Name2PDBTool(path_registry=path_instance),
PackMolTool(path_registry=path_instance),
SmallMolPDB(path_registry=path_instance),
VisualizeProtein(path_registry=path_instance),
PPIDistance(),
RMSDCalculator(),
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
"requests",
"rmrkl",
"tiktoken",
"rdkit",
],
test_suite="tests",
long_description=long_description,
Expand Down
33 changes: 33 additions & 0 deletions tests/test_fxns.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
get_pdb,
)
from mdagent.tools.base_tools.analysis_tools.plot_tools import plot_data, process_csv
from mdagent.tools.base_tools.preprocess_tools.pdb_tools import MolPDB
from mdagent.utils import FileType, PathRegistry

warnings.filterwarnings("ignore", category=DeprecationWarning, module="pkg_resources")
Expand Down Expand Up @@ -39,6 +40,11 @@ def cleaning_fxns():
return CleaningTools()


@pytest.fixture
def molpdb():
    """Provide a ``MolPDB`` instance to the tests that request it."""
    mol_pdb = MolPDB()
    return mol_pdb


# Test simulation tools
@pytest.fixture
def sim_fxns():
Expand Down Expand Up @@ -281,3 +287,30 @@ def test_map_path():

# Check the result message
assert result == "Path successfully mapped to name: new_name"


def test_small_molecule_pdb(molpdb, tmp_path, monkeypatch):
    """Check PDB creation from a SMILES string, bad input and a plain name.

    ``small_molecule_pdb`` writes its output relative to the current working
    directory, so the test runs inside pytest's temporary directory. That way
    no ``*.pdb`` file is left in the working tree, even if an assertion fails
    before any manual cleanup could run.
    """
    monkeypatch.chdir(tmp_path)
    error_output = (
        "There was an error getting pdb. Please input a single molecule name."
    )

    # Test with a valid SMILES string
    valid_smiles = "C1=CC=CC=C1"  # Benzene
    expected_output = (
        "PDB file for C1=CC=CC=C1 successfully created and saved to benzene.pdb."
    )
    assert molpdb.small_molecule_pdb(valid_smiles) == expected_output
    assert os.path.exists("benzene.pdb")

    # test with invalid SMILES string and invalid molecule name
    invalid_smiles = "C1=CC=CC=C1X"
    invalid_name = "NotAMolecule"
    assert molpdb.small_molecule_pdb(invalid_smiles) == error_output
    assert molpdb.small_molecule_pdb(invalid_name) == error_output

    # test with valid molecule name
    valid_name = "water"
    expected_output = "PDB file for water successfully created and saved to water.pdb."
    assert molpdb.small_molecule_pdb(valid_name) == expected_output
    assert os.path.exists("water.pdb")

0 comments on commit ee12619

Please sign in to comment.