trying to use rdkit for small mol pdb

ur-whitelab · Jan 31, 2024 · ee12619 · ee12619
1 parent 60bb0e6
commit ee12619
Show file tree

Hide file tree

Showing 6 changed files with 160 additions and 9 deletions.
diff --git a/mdagent/tools/base_tools/__init__.py b/mdagent/tools/base_tools/__init__.py
@@ -13,7 +13,7 @@
     RemoveWaterCleaningTool,
     SpecializedCleanTool,
 )
-from .preprocess_tools.pdb_tools import Name2PDBTool, PackMolTool, get_pdb
+from .preprocess_tools.pdb_tools import Name2PDBTool, PackMolTool, SmallMolPDB, get_pdb
 from .simulation_tools.create_simulation import ModifyBaseSimulationScriptTool
 from .simulation_tools.setup_and_run import (
     InstructionSummary,
@@ -35,6 +35,7 @@
     "Name2PDBTool",
     "PackMolTool",
     "PPIDistance",
+    "SmallMolPDB",
     "VisualizeProtein",
     "RMSDCalculator",
     "RemoveWaterCleaningTool",

diff --git a/mdagent/tools/base_tools/preprocess_tools/__init__.py b/mdagent/tools/base_tools/preprocess_tools/__init__.py
@@ -5,7 +5,7 @@
     RemoveWaterCleaningTool,
     SpecializedCleanTool,
 )
-from .pdb_tools import Name2PDBTool, PackMolTool, get_pdb
+from .pdb_tools import Name2PDBTool, PackMolTool, SmallMolPDB, get_pdb
 
 __all__ = [
     "AddHydrogensCleaningTool",
@@ -16,4 +16,5 @@
     "SpecializedCleanTool",
     "get_pdb",
     "CleaningToolFunction",
+    "SmallMolPDB",
 ]
diff --git a/mdagent/tools/base_tools/preprocess_tools/pdb_tools.py b/mdagent/tools/base_tools/preprocess_tools/pdb_tools.py
@@ -9,6 +9,7 @@
 from langchain.tools import BaseTool
 from pdbfixer import PDBFixer
 from pydantic import BaseModel, Field, ValidationError, root_validator
+from rdkit import Chem
 
 from mdagent.utils import FileType, PathRegistry
 
@@ -64,11 +65,10 @@ class Name2PDBTool(BaseTool):
     name = "PDBFileDownloader"
     description = """This tool downloads PDB (Protein Data Bank) or
                     CIF (Crystallographic Information File) files using
-                    commercial chemical names. It’s ideal for situations where
-                    you need to directly retrieve these file using a chemical’s
-                    commercial name. When a specific file type, either PDB or CIF,
+                    a protein's common name (NOT a small molecule).
+                    When a specific file type, either PDB or CIF,
                     is requested, add file type to the query string with space.
-                    Input: Commercial name of the chemical or file without
+                    Input: Commercial name of the protein or file without
                     file extension
                     Output: Corresponding PDB or CIF file"""
     path_registry: Optional[PathRegistry]
@@ -453,7 +453,7 @@ def validate_input(cls, values: Union[str, Dict[str, Any]]) -> Dict:
 class PackMolTool(BaseTool):
     name: str = "packmol_tool"
     description: str = """Useful when you need to create a box
-    of different types of molecules molecules"""
+    of different types of molecules molecules. """
 
     args_schema: Type[BaseModel] = PackmolInput
 
@@ -1446,9 +1446,9 @@ class FixPDBFile(BaseTool):
     description: str = "Fixes PDB files columns if needed"
     args_schema: Type[BaseModel] = PDBFilesFixInp
 
-    path_registry: typing.Optional[PathRegistry]
+    path_registry: Optional[PathRegistry]
 
-    def __init__(self, path_registry: typing.Optional[PathRegistry]):
+    def __init__(self, path_registry: Optional[PathRegistry]):
         super().__init__()
         self.path_registry = path_registry
 
@@ -1495,3 +1495,116 @@ def _run(self, query: Dict):
                 return "PDB file fixed"
             else:
                 return "PDB not fully fixed"
+
+
+class MolPDB:
+    """Create PDB files for small molecules given a name or a SMILES string.
+
+    Names and SMILES are translated into each other through the PubChem
+    PUG REST service; 3D coordinates are generated with RDKit.
+    """
+
+    def is_smiles(self, text: str) -> bool:
+        """Return True if RDKit can parse ``text`` as a SMILES string.
+
+        Parsing uses ``sanitize=False``, so a string that parses here may
+        still be rejected by a later sanitizing ``Chem.MolFromSmiles`` call.
+        """
+        try:
+            return Chem.MolFromSmiles(text, sanitize=False) is not None
+        except Exception:
+            return False
+
+    def largest_mol(self, smiles: str) -> str:
+        """Return the longest "."-separated fragment of ``smiles`` that parses.
+
+        Used to drop salts / counter-ions from a multi-component SMILES.
+        Adapted from
+        https://github.com/ur-whitelab/chemcrow-public/blob/main/chemcrow/utils.py
+
+        Raises:
+            ValueError: if none of the fragments is parseable SMILES
+                (previously this surfaced as an accidental IndexError).
+        """
+        # sorted() is stable, so walking it backwards keeps the original
+        # tie-breaking: among equally long fragments the last one wins.
+        for fragment in reversed(sorted(smiles.split("."), key=len)):
+            if self.is_smiles(fragment):
+                return fragment
+        raise ValueError(f"No valid SMILES fragment found in {smiles!r}")
+
+    def molname2smiles(self, query: str) -> str:
+        """Look up a molecule name on PubChem and return its canonical SMILES.
+
+        Adapted from
+        https://github.com/ur-whitelab/chemcrow-public/blob/main/chemcrow/tools/databases.py
+
+        Returns:
+            The RDKit-canonicalized SMILES of the largest fragment, or a
+            human-readable error message if PubChem returned no match.
+        """
+        url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{}/{}"
+        r = requests.get(url.format(query, "property/IsomericSMILES/JSON"))
+        # convert the response to a json object
+        data = r.json()
+        # return the SMILES string
+        try:
+            smi = data["PropertyTable"]["Properties"][0]["IsomericSMILES"]
+        except (KeyError, IndexError):
+            return """Could not find a molecule matching the text.
+            One possible cause is that the input is incorrect, input one
+            molecule at a time."""
+        # remove salts
+        return Chem.CanonSmiles(self.largest_mol(smi))
+
+    def smiles2name(self, smi: str) -> str:
+        """Return the first PubChem synonym for a SMILES string.
+
+        Returns:
+            The synonym, "Invalid SMILES string" if RDKit cannot canonicalize
+            ``smi``, or "Unknown Molecule" if the PubChem response contains
+            no synonym.
+        """
+        try:
+            smi = Chem.MolToSmiles(Chem.MolFromSmiles(smi), canonical=True)
+        except Exception:
+            return "Invalid SMILES string"
+        # query the PubChem database
+        r = requests.get(
+            "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/"
+            + smi
+            + "/synonyms/JSON"
+        )
+        data = r.json()
+        try:
+            return data["InformationList"]["Information"][0]["Synonym"][0]
+        except (KeyError, IndexError):
+            return "Unknown Molecule"
+
+    def small_molecule_pdb(self, mol_str: str, path_registry=None) -> str:
+        """Write ``<name>.pdb`` with 3D coordinates for a small molecule.
+
+        Args:
+            mol_str: a molecule name or a SMILES string. A name is converted
+                to SMILES via PubChem; for a SMILES input the file name is
+                taken from the PubChem synonym lookup.
+            path_registry: optional registry; when truthy, the written file
+                is registered through its ``map_path`` method.
+
+        Returns:
+            A success message naming the written file, or a generic error
+            message if any step fails. Exceptions are not propagated.
+        """
+        # "from rdkit import Chem" does not load the AllChem submodule, so
+        # import it explicitly rather than relying on Chem.AllChem existing.
+        from rdkit.Chem import AllChem
+
+        failure = (
+            "There was an error getting pdb. Please input a single molecule name."
+        )
+        try:
+            if self.is_smiles(mol_str):
+                m = Chem.MolFromSmiles(mol_str)
+                mol_name = self.smiles2name(mol_str)
+            else:  # if input is not smiles, try getting smiles
+                smi = self.molname2smiles(mol_str)
+                m = Chem.MolFromSmiles(smi)
+                mol_name = mol_str
+            # MolFromSmiles returns None on a failed (sanitizing) parse, e.g.
+            # when molname2smiles handed back its "could not find" message.
+            if m is None:
+                return failure
+            m = Chem.AddHs(m)
+            # EmbedMolecule returns the new conformer id, or -1 on failure;
+            # without a conformer the PDB would carry no real coordinates.
+            if AllChem.EmbedMolecule(m) < 0:
+                return failure
+            file_name = f"{mol_name}.pdb"
+            Chem.MolToPDBFile(m, file_name)
+            # add to path registry
+            if path_registry:
+                path_registry.map_path(
+                    file_name, file_name, f"pdb file for the small molecule {mol_name}"
+                )
+            return (
+                f"PDB file for {mol_str} successfully created and saved to {file_name}."
+            )
+        except Exception:
+            # Tool boundary: the caller expects a message string, not a raise.
+            return failure
+
+
+class SmallMolPDB(BaseTool):
+    # Tool wrapper that delegates PDB generation to MolPDB.small_molecule_pdb.
+    name = "SmallMoleculePDB"
+    description = """
+        Creates a PDB file for a small molecule
+        Use this tool when you need to use a small molecule in a simulation.
+        Input can be a molecule name or a SMILES string."""
+    # Registry handed through to MolPDB.small_molecule_pdb; may be None.
+    path_registry: Optional[PathRegistry]
+
+    def __init__(self, path_registry: Optional[PathRegistry]):
+        # Initialise the base tool first, then attach the registry.
+        super().__init__()
+        self.path_registry = path_registry
+
+    def _run(self, mol_str: str) -> str:
+        """use the tool."""
+        # A fresh MolPDB helper is created per call; its message is returned
+        # unchanged.
+        return MolPDB().small_molecule_pdb(mol_str, self.path_registry)
diff --git a/mdagent/tools/maketools.py b/mdagent/tools/maketools.py
@@ -26,6 +26,7 @@
     Scholar2ResultLLM,
     SetUpandRunFunction,
     SimulationOutputFigures,
+    SmallMolPDB,
     VisualizeProtein,
 )
 from .subagent_tools import RetryExecuteSkill, SkillRetrieval, WorkflowPlan
@@ -82,6 +83,7 @@ def make_all_tools(
         #    MapPath2Name(path_registry=path_instance),
         Name2PDBTool(path_registry=path_instance),
         PackMolTool(path_registry=path_instance),
+        SmallMolPDB(path_registry=path_instance),
         VisualizeProtein(path_registry=path_instance),
         PPIDistance(),
         RMSDCalculator(),

diff --git a/setup.py b/setup.py
@@ -31,6 +31,7 @@
         "requests",
         "rmrkl",
         "tiktoken",
+        "rdkit",
     ],
     test_suite="tests",
     long_description=long_description,

diff --git a/tests/test_fxns.py b/tests/test_fxns.py
@@ -12,6 +12,7 @@
     get_pdb,
 )
 from mdagent.tools.base_tools.analysis_tools.plot_tools import plot_data, process_csv
+from mdagent.tools.base_tools.preprocess_tools.pdb_tools import MolPDB
 from mdagent.utils import FileType, PathRegistry
 
 warnings.filterwarnings("ignore", category=DeprecationWarning, module="pkg_resources")
@@ -39,6 +40,11 @@ def cleaning_fxns():
     return CleaningTools()
 
 
+@pytest.fixture
+def molpdb():
+    """Provide a new MolPDB instance to the requesting test."""
+    helper = MolPDB()
+    return helper
+
+
 # Test simulation tools
 @pytest.fixture
 def sim_fxns():
@@ -281,3 +287,30 @@ def test_map_path():
 
                 # Check the result message
                 assert result == "Path successfully mapped to name: new_name"
+
+
+def test_small_molecule_pdb(molpdb, tmp_path, monkeypatch):
+    """Check small_molecule_pdb for SMILES, invalid, and name inputs.
+
+    Note: the valid cases query PubChem, so this test needs network access.
+    """
+    # Run inside a per-test temporary directory so the generated PDB files
+    # never leak into the working tree, even if an assertion fails.
+    monkeypatch.chdir(tmp_path)
+
+    # Test with a valid SMILES string
+    valid_smiles = "C1=CC=CC=C1"  # Benzene
+    expected_output = (
+        "PDB file for C1=CC=CC=C1 successfully created and saved to benzene.pdb."
+    )
+    assert molpdb.small_molecule_pdb(valid_smiles) == expected_output
+    assert os.path.exists("benzene.pdb")
+
+    # test with invalid SMILES string and invalid molecule name
+    invalid_smiles = "C1=CC=CC=C1X"
+    invalid_name = "NotAMolecule"
+    expected_output = (
+        "There was an error getting pdb. Please input a single molecule name."
+    )
+    assert molpdb.small_molecule_pdb(invalid_smiles) == expected_output
+    assert molpdb.small_molecule_pdb(invalid_name) == expected_output
+
+    # test with valid molecule name
+    valid_name = "water"
+    expected_output = "PDB file for water successfully created and saved to water.pdb."
+    assert molpdb.small_molecule_pdb(valid_name) == expected_output
+    assert os.path.exists("water.pdb")