Skip to content

Commit

Permalink
Merge pull request #38 from datamol-io/fix/fused-ring
Browse files Browse the repository at this point in the history
Fix Error in Digit Counting That Messes Up Fused Rings
  • Loading branch information
maclandrol authored Apr 11, 2024
2 parents 80ccca7 + 87986a4 commit d6f4271
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 3 deletions.
11 changes: 8 additions & 3 deletions safe/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,7 @@ def encoder(
continue
obond = mol.GetBondBetweenAtoms(i_a, i_b)
bonds.append(obond.GetIdx())

if len(bonds) > 0:
mol = Chem.FragmentOnBonds(
mol,
Expand All @@ -297,7 +298,6 @@ def encoder(
# here we need to be clever and disable rooted atom as the atom with mapping

frags = list(Chem.GetMolFrags(mol, asMols=True))

if randomize:
frags = rng.permutation(frags).tolist()
elif canonical:
Expand All @@ -322,18 +322,23 @@ def encoder(
)

scaffold_str = ".".join(frags_str)
# EN: fix for https://github.com/datamol-io/safe/issues/37
# we were using the wrong branch number count, which did not take into account
# possible changes in digit usage after bond slicing
scf_branch_num = self._find_branch_number(scaffold_str) + branch_numbers

# don't capture atom mapping in the scaffold
attach_pos = set(re.findall(r"(\[\d+\*\]|!\[[^:]*:\d+\])", scaffold_str))

if canonical:
attach_pos = sorted(attach_pos)
starting_num = 1 if len(branch_numbers) == 0 else max(branch_numbers) + 1
starting_num = 1 if len(scf_branch_num) == 0 else max(scf_branch_num) + 1
for attach in attach_pos:
val = str(starting_num) if starting_num < 10 else f"%{starting_num}"
# we cannot have anything of the form "\([@=-#-$/\]*\d+\)"
attach_regexp = re.compile(r"(" + re.escape(attach) + r")")
scaffold_str = attach_regexp.sub(val, scaffold_str)
starting_num += 1

# now we need to remove all the parenthesis around digit only number
wrong_attach = re.compile(r"\(([\%\d]*)\)")
scaffold_str = wrong_attach.sub(r"\g<1>", scaffold_str)
Expand Down
11 changes: 11 additions & 0 deletions tests/test_safe.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,3 +110,14 @@ def test_bracket_smiles_issues(input_sm):
assert safe.decode(safe_str) is not None
assert dm.same_mol(dm.to_mol(safe_str), input_mol)
assert None not in fragments


def test_fused_ring_issue():
    """Check that fused-ring SMILES survive an encode/decode round trip.

    Each input SMILES is encoded with ``safe.encode`` and decoded back with
    ``safe.decode``; ``dm.same_mol`` must then report that the decoded
    string describes the same molecule as the input.
    """
    fused_ring_smiles = (
        "[H][C@@]12CC[C@@]3(CCC(=O)O3)[C@@]1(C)CC[C@@]1([H])[C@@]2([H])[C@@]([H])(CC2=CC(=O)CC[C@]12C)SC(C)=O",
        "[H][C@@]12C[C@H](C)[C@](OC(=O)CC)(C(=O)COC(=O)CC)[C@@]1(C)C[C@H](O)[C@@]1(Cl)[C@@]2([H])CCC2=CC(=O)C=C[C@]12C",
        "[H][C@@]12CC[C@@](O)(C#C)[C@@]1(CC)CC[C@]1([H])[C@@]3([H])CCC(=O)C=C3CC[C@@]21[H]",
    )
    for smiles in fused_ring_smiles:
        # Encode first, then decode, so each step is visible in a traceback.
        encoded = safe.encode(smiles)
        round_tripped = safe.decode(encoded)
        assert dm.same_mol(smiles, round_tripped)

0 comments on commit d6f4271

Please sign in to comment.