Skip to content

Commit

Permalink
Reformat loaders for different smiles paths (#1211)
Browse files Browse the repository at this point in the history
* Reformat loaders for different smiles paths

* merge test_compound tests

* allow for lists of smiles or .smi file lists to be loaded into a compound

* remove extra large_smiles test

* remove duplicated tests
  • Loading branch information
CalCraven authored Dec 13, 2024
1 parent 9acf939 commit 052ccdf
Show file tree
Hide file tree
Showing 2 changed files with 133 additions and 72 deletions.
180 changes: 108 additions & 72 deletions mbuild/conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,11 @@ def load(
structure's position (recommended).
"""
# First check if we are loading from an object
if not isinstance(filename_or_object, str):
if not (
isinstance(filename_or_object, str)
or isinstance(filename_or_object, list)
or isinstance(filename_or_object, tuple)
):
return load_object(
obj=filename_or_object,
compound=compound,
Expand All @@ -97,19 +101,26 @@ def load(
**kwargs,
)
# Second check if we are loading SMILES strings
elif smiles:
elif smiles and (backend is None or backend.lower() == "rdkit"):
# Ignore the box info for SMILES (it's never there)
ignore_box_warn = True
return load_smiles(
seed = kwargs.get("seed", 0)
return load_rdkit_smiles(
smiles_or_filename=filename_or_object,
compound=compound,
infer_hierarchy=infer_hierarchy,
ignore_box_warn=ignore_box_warn,
backend=backend,
**kwargs,
ignore_box_warn=True,
seed=seed,
)
# Last, if none of the above, load from file
elif smiles and isinstance(backend, str) and backend.lower() == "pybel":
return load_pybel_smiles(
smiles_or_filename=filename_or_object,
compound=compound,
infer_hierarchy=infer_hierarchy,
ignore_box_warn=True,
)

else:
# Last, if none of the above, load from file
return load_file(
filename=filename_or_object,
relative_to_module=relative_to_module,
Expand Down Expand Up @@ -191,14 +202,12 @@ def load_object(
raise ValueError(f"Object of type {type(obj).__name__} is not supported.")


def load_smiles(
def load_pybel_smiles(
smiles_or_filename,
compound=None,
infer_hierarchy=True,
ignore_box_warn=False,
backend="rdkit",
coords_only=False,
**kwargs,
):
"""Load a SMILES string as an mBuild Compound.
Expand All @@ -216,88 +225,115 @@ def load_smiles(
If True, ignore warning if no box is present.
coords_only : bool, optional, default=False
Only load the coordinates into a provided compound.
backend : str, optional, default='rdkit'
The smiles loading backend, either 'rdkit' or 'pybel'
Returns
-------
compound : mb.Compound
"""
# Initialize an mb.Compound if none is provided
if not compound:
if compound is None:
compound = mb.Compound()

test_path = Path(smiles_or_filename)
pybel = import_("pybel")
# First we try treating smiles_or_filename as a SMILES string
try:
mymol = pybel.readstring("smi", smiles_or_filename)
mymolGen = [mymol]
# Now we treat it as a filename
except (OSError, IOError):
mymolGen = pybel.readfile("smi", smiles_or_filename)

for mymol in mymolGen:
mymol.make3D()
from_pybel(
pybel_mol=mymol,
compound=compound,
infer_hierarchy=infer_hierarchy,
ignore_box_warn=ignore_box_warn,
)

return compound

# Will try to support list of smiles strings in the future
if backend is None:
backend = "rdkit"

if backend == "rdkit":
rdkit = import_("rdkit") # noqa: F841
from rdkit import Chem
def load_rdkit_smiles(
smiles_or_filename,
compound=None,
infer_hierarchy=True,
ignore_box_warn=False,
coords_only=False,
seed=0,
):
"""Load a SMILES string as an mBuild Compound.
if test_path.exists():
# assuming this is a smi file now
mymol = Chem.SmilesMolSupplier(smiles_or_filename)
if not mymol:
raise ValueError(
"Provided smiles string or file was invalid. Refer to the "
"above RDKit error messages for additional information."
)
mol_list = [mol for mol in mymol]
if len(mol_list) == 1:
rdmol = mymol[0]
else:
rdmol = mymol[0]
warn(
"More than one SMILES string in file, more than one SMILES "
f"string is not supported, using {Chem.MolToSmiles(rdmol)}"
)
else:
rdmol = Chem.MolFromSmiles(smiles_or_filename)
Load SMILES from a single string, a list or tuple of strings, or a file of
SMILES strings using RDKit. The rdkit package must be installed.
Parameters
----------
smiles_or_filename : str, list of str, or tuple of str
SMILES string, list/tuple of SMILES strings, or path to a file of SMILES strings to load
compound : mb.Compound
The host mbuild Compound
infer_hierarchy : bool, optional, default=True
ignore_box_warn : bool, optional, default=False
If True, ignore warning if no box is present.
coords_only : bool, optional, default=False
Only load the coordinates into a provided compound.
seed = kwargs.get("smiles_seed", 0)
Returns
-------
compound : mb.Compound
"""
# Initialize an mb.Compound if none is provided
if not compound:
compound = mb.Compound()

if not seed: # default rdkit seed
seed = 0

rdkit = import_("rdkit") # noqa: F841
from rdkit import Chem

if isinstance(smiles_or_filename, (tuple, list)):
for mol in smiles_or_filename:
rdmol = Chem.MolFromSmiles(mol)
from_rdkit(
rdkit_mol=rdmol,
compound=compound,
coords_only=coords_only,
smiles_seed=seed,
)
return compound

rdmol = Chem.MolFromSmiles(smiles_or_filename)
if rdmol: # return right away if the smiles loads properly
return from_rdkit(
rdkit_mol=rdmol,
compound=compound,
coords_only=coords_only,
smiles_seed=seed,
)
elif backend == "pybel":
pybel = import_("pybel")
# First we try treating filename_or_object as a SMILES string
try:
mymol = pybel.readstring("smi", smiles_or_filename)
# Now we treat it as a filename
except (OSError, IOError):
# For now, we only support reading in a single smiles molecule,
# but pybel returns a generator, so we get the first molecule
# and warn the user if there is more

mymol_generator = pybel.readfile("smi", smiles_or_filename)
mymol_list = list(mymol_generator)
if len(mymol_list) == 1:
mymol = mymol_list[0]
else:
mymol = mymol_list[0]
warn(
"More than one SMILES string in file, more than one SMILES "
f"string is not supported, using {mymol.write('smi')}"
)
mymol.make3D()
return from_pybel(
pybel_mol=mymol,

# Try to assume it's a smiles file
mymol = Chem.SmilesMolSupplier(smiles_or_filename, titleLine=0)
if not mymol:
raise ValueError(
"Provided smiles string or file was invalid. Refer to the "
"above RDKit error messages for additional information."
)
molList = [mol for mol in mymol]
for rdmol in molList:
from_rdkit(
rdkit_mol=rdmol,
compound=compound,
infer_hierarchy=infer_hierarchy,
ignore_box_warn=ignore_box_warn,
coords_only=coords_only,
smiles_seed=seed,
)
else:
if not compound:
raise ValueError(
"Expected SMILES loading backend 'rdkit' or 'pybel'. "
f"Was provided: {backend}"
"Expected SMILES loading backend 'rdkit' failed to load any compouds."
f"Check the SMILES string of .smi file passed to {smiles_or_filename=}"
)
return compound


def load_file(
Expand Down Expand Up @@ -409,7 +445,7 @@ def load_file(
elif extension == ".txt":
warn(".txt file detected, loading as a SMILES string")
# Fail-safe measure
compound = load_smiles(filename, compound)
compound = load_pybel_smiles(filename, compound)

# Then mdtraj reader
elif backend == "mdtraj":
Expand Down
25 changes: 25 additions & 0 deletions mbuild/tests/test_compound.py
Original file line number Diff line number Diff line change
Expand Up @@ -2537,9 +2537,34 @@ def test_catalog_bondgraph_types(self, benzene):
== "particle_graph"
)

def test_load_large_smiles(self):
    """Load a long, multi-fragment SMILES string and check its particle count."""
    # Adjacent literals are concatenated into one SMILES string; the
    # dot-separated tail holds the separate [Na+] and [Fe+2] fragments.
    large_smiles = (
        "CC1C(=O)NC(C(=O)NC(CSC(C2=C(C3=CC4=C(C(SCC(C(=O)N1)"
        "NC(=O)C(CCCCN)NC(=O)C(CCC(=O)N)NC(=O)C(C(C)C)N)C)C(=C("
        "[N-]4)C=C5C(=C(C(=N5)C=C6C(=C(C(=CC2=N3)[N-]6)C)CCC(=O"
        ")[O-])CCC(=O)[O-])C)C)C)C)C(=O)NC(CC7=CNC=N7)C(=O)NC(C"
        "(C)O)C(=O)NC(C(C)C)C(=O)NC(CCC(=O)[O-])C(=O)[O-])CCC(="
        "O)N.[Na+].[Na+].[Na+].[Na+].[Fe+2]"
    )
    expected_particles = 244

    loaded = mb.load(large_smiles, smiles=True)

    assert loaded.n_particles == expected_particles

def test_reset_labels(self):
    """Removing hydrogens with ``reset_labels=True`` leaves port[0]..port[5] labels."""
    molecule = mb.load("CC", smiles=True)
    hydrogens = molecule.particles_by_name("H")
    molecule.remove(hydrogens, reset_labels=True)

    # After the removal the compound's labels must include port[0]..port[5].
    expected_ports = {f"port[{idx}]" for idx in range(6)}
    label_names = set(molecule.labels.keys())
    assert expected_ports <= label_names

def test_load_molfile(self):
    """Load a multi-entry SMILES file through the RDKit backend.

    Writes three ``<SMILES> <name>`` lines (ethane, propane, butane) to a
    ``.smi`` file and checks that every entry ends up in the returned
    compound: 33 particles in total (8 + 11 + 14 for C2H6, C3H8 and C4H10).

    The file is created inside a temporary directory so the test does not
    leave ``ethane.smi`` behind in the current working directory.
    """
    import os
    import tempfile

    smiles_lines = "CC ethane\nCCC propane\nCCCC butane"

    with tempfile.TemporaryDirectory() as tmp_dir:
        # Build a plain ``str`` path: ``mb.load`` only treats str/list/tuple
        # inputs as SMILES or filenames; other objects go to ``load_object``.
        smi_path = os.path.join(tmp_dir, "ethane.smi")
        with open(smi_path, "w") as f:
            # ``write`` (not ``writelines``) because this is a single string.
            f.write(smiles_lines)

        cpd = mb.load(smi_path, smiles=True, backend="rdkit")

    assert cpd.n_particles == 33

def test_load_list_of_smiles(self):
    """A list of SMILES strings loads into a single compound with 8 children."""
    smiles_list = ["C", "O"]
    combined = mb.load(smiles_list, smiles=True)
    n_children = len(combined.children)
    assert n_children == 8

0 comments on commit 052ccdf

Please sign in to comment.