From 052ccdf5b2605eeb4b2537614f511bdb222c36ee Mon Sep 17 00:00:00 2001
From: CalCraven <54594941+CalCraven@users.noreply.github.com>
Date: Fri, 13 Dec 2024 10:33:43 -0600
Subject: [PATCH] Reformat loaders for different smiles paths (#1211)

* Reformat loaders for different smiles paths

* merge test_compound tests

* allow lists of SMILES strings or multi-entry .smi files to be loaded into a compound

* remove extra large_smiles test

* remove duplicated tests
---
 mbuild/conversion.py          | 180 ++++++++++++++++++++--------------
 mbuild/tests/test_compound.py |  25 +++++
 2 files changed, 133 insertions(+), 72 deletions(-)

diff --git a/mbuild/conversion.py b/mbuild/conversion.py
index b056ce3ec..006f30f27 100644
--- a/mbuild/conversion.py
+++ b/mbuild/conversion.py
@@ -88,7 +88,11 @@ def load(
     structure's position (recommended).
     """
     # First check if we are loading from an object
-    if not isinstance(filename_or_object, str):
+    if not (
+        isinstance(filename_or_object, str)
+        or isinstance(filename_or_object, list)
+        or isinstance(filename_or_object, tuple)
+    ):
         return load_object(
             obj=filename_or_object,
             compound=compound,
@@ -97,19 +101,26 @@ def load(
             **kwargs,
         )
     # Second check if we are loading SMILES strings
-    elif smiles:
+    elif smiles and (backend is None or backend.lower() == "rdkit"):
         # Ignore the box info for SMILES (its never there)
-        ignore_box_warn = True
-        return load_smiles(
+        seed = kwargs.get("seed", 0)
+        return load_rdkit_smiles(
             smiles_or_filename=filename_or_object,
             compound=compound,
             infer_hierarchy=infer_hierarchy,
-            ignore_box_warn=ignore_box_warn,
-            backend=backend,
-            **kwargs,
+            ignore_box_warn=True,
+            seed=seed,
         )
-    # Last, if none of the above, load from file
+    elif smiles and isinstance(backend, str) and backend.lower() == "pybel":
+        return load_pybel_smiles(
+            smiles_or_filename=filename_or_object,
+            compound=compound,
+            infer_hierarchy=infer_hierarchy,
+            ignore_box_warn=True,
+        )
+
     else:
+        # Last, if none of the above, load from file
         return load_file(
             filename=filename_or_object,
             relative_to_module=relative_to_module,
@@ -191,14 +202,12 @@ def load_object(
     raise ValueError(f"Object of type {type(obj).__name__} is not supported.")
 
 
-def load_smiles(
+def load_pybel_smiles(
     smiles_or_filename,
     compound=None,
     infer_hierarchy=True,
     ignore_box_warn=False,
-    backend="rdkit",
     coords_only=False,
-    **kwargs,
 ):
     """Load a SMILES string as an mBuild Compound.
 
@@ -216,88 +225,115 @@ def load_smiles(
         If True, ignore warning if no box is present.
     coords_only : bool, optional, default=False
         Only load the coordinates into a provided compound.
-    backend : str, optional, default='rdkit'
-        The smiles loading backend, either 'rdkit' or 'pybel'
 
     Returns
     -------
     compound : mb.Compound
     """
-    # Initialize an mb.Compound if none is provided
-    if not compound:
+    if compound is None:
         compound = mb.Compound()
 
-    test_path = Path(smiles_or_filename)
+    pybel = import_("pybel")
+    # First we try treating smiles_or_filename as a SMILES string
+    try:
+        mymol = pybel.readstring("smi", smiles_or_filename)
+        mymolGen = [mymol]
+    # Now we treat it as a filename
+    except (OSError, IOError):
+        mymolGen = pybel.readfile("smi", smiles_or_filename)
+
+    for mymol in mymolGen:
+        mymol.make3D()
+        from_pybel(
+            pybel_mol=mymol,
+            compound=compound,
+            infer_hierarchy=infer_hierarchy,
+            ignore_box_warn=ignore_box_warn,
+        )
+
+    return compound
 
-    # Will try to support list of smiles strings in the future
-    if backend is None:
-        backend = "rdkit"
 
-    if backend == "rdkit":
-        rdkit = import_("rdkit")  # noqa: F841
-        from rdkit import Chem
+def load_rdkit_smiles(
+    smiles_or_filename,
+    compound=None,
+    infer_hierarchy=True,
+    ignore_box_warn=False,
+    coords_only=False,
+    seed=0,
+):
+    """Load a SMILES string as an mBuild Compound.
 
-        if test_path.exists():
-            # assuming this is a smi file now
-            mymol = Chem.SmilesMolSupplier(smiles_or_filename)
-            if not mymol:
-                raise ValueError(
-                    "Provided smiles string or file was invalid. Refer to the "
-                    "above RDKit error messages for additional information."
-                )
-            mol_list = [mol for mol in mymol]
-            if len(mol_list) == 1:
-                rdmol = mymol[0]
-            else:
-                rdmol = mymol[0]
-                warn(
-                    "More than one SMILES string in file, more than one SMILES "
-                    f"string is not supported, using {Chem.MolToSmiles(rdmol)}"
-                )
-        else:
-            rdmol = Chem.MolFromSmiles(smiles_or_filename)
+    Load SMILES from a string, a list or tuple of strings, or a SMILES file
+    using RDKit. The rdkit package must be installed.
+
+    Parameters
+    ----------
+    smiles_or_filename : str, or list/tuple of str
+        SMILES string(s), or path to a file of SMILES strings, to load
+    compound : mb.Compound
+        The host mbuild Compound
+    infer_hierarchy : bool, optional, default=True
+    ignore_box_warn : bool, optional, default=False
+        If True, ignore warning if no box is present.
+    coords_only : bool, optional, default=False
+        Only load the coordinates into a provided compound.
 
-        seed = kwargs.get("smiles_seed", 0)
+    Returns
+    -------
+    compound : mb.Compound
+    """
+    # Initialize an mb.Compound if none is provided
+    if not compound:
+        compound = mb.Compound()
+
+    if not seed:  # default rdkit seed
+        seed = 0
+
+    rdkit = import_("rdkit")  # noqa: F841
+    from rdkit import Chem
+
+    if isinstance(smiles_or_filename, (tuple, list)):
+        for mol in smiles_or_filename:
+            rdmol = Chem.MolFromSmiles(mol)
+            from_rdkit(
+                rdkit_mol=rdmol,
+                compound=compound,
+                coords_only=coords_only,
+                smiles_seed=seed,
+            )
+        return compound
 
+    rdmol = Chem.MolFromSmiles(smiles_or_filename)
+    if rdmol:  # return right away if the smiles loads properly
         return from_rdkit(
             rdkit_mol=rdmol,
             compound=compound,
             coords_only=coords_only,
             smiles_seed=seed,
         )
-    elif backend == "pybel":
-        pybel = import_("pybel")
-        # First we try treating filename_or_object as a SMILES string
-        try:
-            mymol = pybel.readstring("smi", smiles_or_filename)
-        # Now we treat it as a filename
-        except (OSError, IOError):
-            # For now, we only support reading in a single smiles molecule,
-            # but pybel returns a generator, so we get the first molecule
-            # and warn the user if there is more
-
-            mymol_generator = pybel.readfile("smi", smiles_or_filename)
-            mymol_list = list(mymol_generator)
-            if len(mymol_list) == 1:
-                mymol = mymol_list[0]
-            else:
-                mymol = mymol_list[0]
-                warn(
-                    "More than one SMILES string in file, more than one SMILES "
-                    f"string is not supported, using {mymol.write('smi')}"
-                )
-        mymol.make3D()
-        return from_pybel(
-            pybel_mol=mymol,
+
+    # Not parseable as a SMILES string; fall back to reading it as a SMILES file
+    mymol = Chem.SmilesMolSupplier(smiles_or_filename, titleLine=0)
+    if not mymol:
+        raise ValueError(
+            "Provided smiles string or file was invalid. Refer to the "
+            "above RDKit error messages for additional information."
+        )
+    molList = [mol for mol in mymol]
+    for rdmol in molList:
+        from_rdkit(
+            rdkit_mol=rdmol,
             compound=compound,
-            infer_hierarchy=infer_hierarchy,
-            ignore_box_warn=ignore_box_warn,
+            coords_only=coords_only,
+            smiles_seed=seed,
         )
-    else:
+    if not compound:
         raise ValueError(
-            "Expected SMILES loading backend 'rdkit' or 'pybel'. "
-            f"Was provided: {backend}"
+            "Expected SMILES loading backend 'rdkit' failed to load any compouds."
+            f"Check the SMILES string of .smi file passed to {smiles_or_filename=}"
         )
+    return compound
 
 
 def load_file(
@@ -409,7 +445,7 @@ def load_file(
         elif extension == ".txt":
             warn(".txt file detected, loading as a SMILES string")
             # Fail-safe measure
-            compound = load_smiles(filename, compound)
+            compound = load_pybel_smiles(filename, compound)
 
     # Then mdtraj reader
     elif backend == "mdtraj":
diff --git a/mbuild/tests/test_compound.py b/mbuild/tests/test_compound.py
index 5345c49a9..4bc17fb95 100644
--- a/mbuild/tests/test_compound.py
+++ b/mbuild/tests/test_compound.py
@@ -2537,9 +2537,34 @@ def test_catalog_bondgraph_types(self, benzene):
             == "particle_graph"
         )
 
+    def test_load_large_smiles(self):
+        cpd = mb.load(
+            (
+                "CC1C(=O)NC(C(=O)NC(CSC(C2=C(C3=CC4=C(C(SCC(C(=O)N1)"
+                "NC(=O)C(CCCCN)NC(=O)C(CCC(=O)N)NC(=O)C(C(C)C)N)C)C(=C("
+                "[N-]4)C=C5C(=C(C(=N5)C=C6C(=C(C(=CC2=N3)[N-]6)C)CCC(=O"
+                ")[O-])CCC(=O)[O-])C)C)C)C)C(=O)NC(CC7=CNC=N7)C(=O)NC(C"
+                "(C)O)C(=O)NC(C(C)C)C(=O)NC(CCC(=O)[O-])C(=O)[O-])CCC(="
+                "O)N.[Na+].[Na+].[Na+].[Na+].[Fe+2]"
+            ),
+            smiles=True,
+        )
+        assert cpd.n_particles == 244
+
     def test_reset_labels(self):
         ethane = mb.load("CC", smiles=True)
         Hs = ethane.particles_by_name("H")
         ethane.remove(Hs, reset_labels=True)
         ports = set(f"port[{i}]" for i in range(6))
         assert ports.issubset(set(ethane.labels.keys()))
+
+    def test_load_molfile(self):
+        with open("ethane.smi", "w") as f:
+            f.writelines("CC ethane\nCCC propane\nCCCC butane")  # write a test file
+
+        cpd = mb.load("ethane.smi", smiles=True, backend="rdkit")
+        assert cpd.n_particles == 33
+
+    def test_load_list_of_smiles(self):
+        cpd = mb.load(["C", "O"], smiles=True)
+        assert len(cpd.children) == 8