From 052ccdf5b2605eeb4b2537614f511bdb222c36ee Mon Sep 17 00:00:00 2001 From: CalCraven <54594941+CalCraven@users.noreply.github.com> Date: Fri, 13 Dec 2024 10:33:43 -0600 Subject: [PATCH] Reformat loaders for different smiles paths (#1211) * Reformat loaders for different smiles paths * merge test_compound tests * allow for lists of smiles or .smi file lists to be loaded into a compound * remove extra large_smiles test * remove duplicated tests --- mbuild/conversion.py | 180 ++++++++++++++++++++-------------- mbuild/tests/test_compound.py | 25 +++++ 2 files changed, 133 insertions(+), 72 deletions(-) diff --git a/mbuild/conversion.py b/mbuild/conversion.py index b056ce3ec..006f30f27 100644 --- a/mbuild/conversion.py +++ b/mbuild/conversion.py @@ -88,7 +88,11 @@ def load( structure's position (recommended). """ # First check if we are loading from an object - if not isinstance(filename_or_object, str): + if not ( + isinstance(filename_or_object, str) + or isinstance(filename_or_object, list) + or isinstance(filename_or_object, tuple) + ): return load_object( obj=filename_or_object, compound=compound, @@ -97,19 +101,26 @@ def load( **kwargs, ) # Second check if we are loading SMILES strings - elif smiles: + elif smiles and (backend is None or backend.lower() == "rdkit"): # Ignore the box info for SMILES (its never there) - ignore_box_warn = True - return load_smiles( + seed = kwargs.get("seed", 0) + return load_rdkit_smiles( smiles_or_filename=filename_or_object, compound=compound, infer_hierarchy=infer_hierarchy, - ignore_box_warn=ignore_box_warn, - backend=backend, - **kwargs, + ignore_box_warn=True, + seed=seed, ) - # Last, if none of the above, load from file + elif smiles and isinstance(backend, str) and backend.lower() == "pybel": + return load_pybel_smiles( + smiles_or_filename=filename_or_object, + compound=compound, + infer_hierarchy=infer_hierarchy, + ignore_box_warn=True, + ) + else: + # Last, if none of the above, load from file return load_file( filename=filename_or_object, relative_to_module=relative_to_module, @@ -191,14 +202,12 @@ def load_object( raise ValueError(f"Object of type {type(obj).__name__} is not supported.") -def load_smiles( +def load_pybel_smiles( smiles_or_filename, compound=None, infer_hierarchy=True, ignore_box_warn=False, - backend="rdkit", coords_only=False, - **kwargs, ): """Load a SMILES string as an mBuild Compound. @@ -216,88 +225,115 @@ def load_smiles( If True, ignore warning if no box is present. coords_only : bool, optional, default=False Only load the coordinates into a provided compound. - backend : str, optional, default='rdkit' - The smiles loading backend, either 'rdkit' or 'pybel' Returns ------- compound : mb.Compound """ - # Initialize an mb.Compound if none is provided - if not compound: + if compound is None: compound = mb.Compound() - test_path = Path(smiles_or_filename) + pybel = import_("pybel") + # First we try treating smiles_or_filename as a SMILES string + try: + mymol = pybel.readstring("smi", smiles_or_filename) + mymolGen = [mymol] + # Now we treat it as a filename + except (OSError, IOError): + mymolGen = pybel.readfile("smi", smiles_or_filename) + + for mymol in mymolGen: + mymol.make3D() + from_pybel( + pybel_mol=mymol, + compound=compound, + infer_hierarchy=infer_hierarchy, + ignore_box_warn=ignore_box_warn, + ) + + return compound - # Will try to support list of smiles strings in the future - if backend is None: - backend = "rdkit" - if backend == "rdkit": - rdkit = import_("rdkit") # noqa: F841 - from rdkit import Chem +def load_rdkit_smiles( + smiles_or_filename, + compound=None, + infer_hierarchy=True, + ignore_box_warn=False, + coords_only=False, + seed=0, +): + """Load a SMILES string as an mBuild Compound. - if test_path.exists(): - # assuming this is a smi file now - mymol = Chem.SmilesMolSupplier(smiles_or_filename) - if not mymol: - raise ValueError( - "Provided smiles string or file was invalid. Refer to the " - "above RDKit error messages for additional information." - ) - mol_list = [mol for mol in mymol] - if len(mol_list) == 1: - rdmol = mymol[0] - else: - rdmol = mymol[0] - warn( - "More than one SMILES string in file, more than one SMILES " - f"string is not supported, using {Chem.MolToSmiles(rdmol)}" - ) - else: - rdmol = Chem.MolFromSmiles(smiles_or_filename) + Loading SMILES string from a string, a list, or a file using RDKit by + default. Must have rdkit or pybel packages installed. + + Parameters + ---------- + smiles_or_filename : str + SMILES string or file of SMILES string to load + compound : mb.Compound + The host mbuild Compound + infer_hierarchy : bool, optional, default=True + ignore_box_warn : bool, optional, default=False + If True, ignore warning if no box is present. + coords_only : bool, optional, default=False + Only load the coordinates into a provided compound. - seed = kwargs.get("smiles_seed", 0) + Returns + ------- + compound : mb.Compound + """ + # Initialize an mb.Compound if none is provided + if not compound: + compound = mb.Compound() + + if not seed: # default rdkit seed + seed = 0 + + rdkit = import_("rdkit") # noqa: F841 + from rdkit import Chem + + if isinstance(smiles_or_filename, (tuple, list)): + for mol in smiles_or_filename: + rdmol = Chem.MolFromSmiles(mol) + from_rdkit( + rdkit_mol=rdmol, + compound=compound, + coords_only=coords_only, + smiles_seed=seed, + ) + return compound + rdmol = Chem.MolFromSmiles(smiles_or_filename) + if rdmol: # return right away if the smiles loads properly return from_rdkit( rdkit_mol=rdmol, compound=compound, coords_only=coords_only, smiles_seed=seed, ) - elif backend == "pybel": - pybel = import_("pybel") - # First we try treating filename_or_object as a SMILES string - try: - mymol = pybel.readstring("smi", smiles_or_filename) - # Now we treat it as a filename - except (OSError, IOError): - # For now, we only support reading in a single smiles molecule, - # but pybel returns a generator, so we get the first molecule - # and warn the user if there is more - - mymol_generator = pybel.readfile("smi", smiles_or_filename) - mymol_list = list(mymol_generator) - if len(mymol_list) == 1: - mymol = mymol_list[0] - else: - mymol = mymol_list[0] - warn( - "More than one SMILES string in file, more than one SMILES " - f"string is not supported, using {mymol.write('smi')}" - ) - mymol.make3D() - return from_pybel( - pybel_mol=mymol, + + # Try to assume it's a smiles file + mymol = Chem.SmilesMolSupplier(smiles_or_filename, titleLine=0) + if not mymol: + raise ValueError( + "Provided smiles string or file was invalid. Refer to the " + "above RDKit error messages for additional information." + ) + molList = [mol for mol in mymol] + for rdmol in molList: + from_rdkit( + rdkit_mol=rdmol, compound=compound, - infer_hierarchy=infer_hierarchy, - ignore_box_warn=ignore_box_warn, + coords_only=coords_only, + smiles_seed=seed, ) - else: + if not compound: raise ValueError( - "Expected SMILES loading backend 'rdkit' or 'pybel'. " - f"Was provided: {backend}" + "Expected SMILES loading backend 'rdkit' failed to load any compouds." + f"Check the SMILES string of .smi file passed to {smiles_or_filename=}" ) + return compound def load_file( @@ -409,7 +445,7 @@ def load_file( elif extension == ".txt": warn(".txt file detected, loading as a SMILES string") # Fail-safe measure - compound = load_smiles(filename, compound) + compound = load_pybel_smiles(filename, compound) # Then mdtraj reader elif backend == "mdtraj": diff --git a/mbuild/tests/test_compound.py b/mbuild/tests/test_compound.py index 5345c49a9..4bc17fb95 100644 --- a/mbuild/tests/test_compound.py +++ b/mbuild/tests/test_compound.py @@ -2537,9 +2537,34 @@ def test_catalog_bondgraph_types(self, benzene): == "particle_graph" ) + def test_load_large_smiles(self): + cpd = mb.load( + ( + "CC1C(=O)NC(C(=O)NC(CSC(C2=C(C3=CC4=C(C(SCC(C(=O)N1)" + "NC(=O)C(CCCCN)NC(=O)C(CCC(=O)N)NC(=O)C(C(C)C)N)C)C(=C(" + "[N-]4)C=C5C(=C(C(=N5)C=C6C(=C(C(=CC2=N3)[N-]6)C)CCC(=O" + ")[O-])CCC(=O)[O-])C)C)C)C)C(=O)NC(CC7=CNC=N7)C(=O)NC(C" + "(C)O)C(=O)NC(C(C)C)C(=O)NC(CCC(=O)[O-])C(=O)[O-])CCC(=" + "O)N.[Na+].[Na+].[Na+].[Na+].[Fe+2]" + ), + smiles=True, + ) + assert cpd.n_particles == 244 + def test_reset_labels(self): ethane = mb.load("CC", smiles=True) Hs = ethane.particles_by_name("H") ethane.remove(Hs, reset_labels=True) ports = set(f"port[{i}]" for i in range(6)) assert ports.issubset(set(ethane.labels.keys())) + + def test_load_molfile(self): + with open("ethane.smi", "w") as f: + f.writelines("CC ethane\nCCC propane\nCCCC butane") # write a test file + + cpd = mb.load("ethane.smi", smiles=True, backend="rdkit") + assert cpd.n_particles == 33 + + def test_load_list_of_smiles(self): + cpd = mb.load(["C", "O"], smiles=True) + assert len(cpd.children) == 8