# NOTE(review): this patch was recovered from a copy in which line breaks and
# indentation had been collapsed. The indentation of every line and the
# position of blank context lines were reconstructed so that each hunk matches
# the line counts in its "@@" header -- verify against the target files before
# applying. Text above the first "diff --git" header is ignored by patch tools.
diff --git a/safe/converter.py b/safe/converter.py
index 7611168..1e847f7 100644
--- a/safe/converter.py
+++ b/safe/converter.py
@@ -288,6 +288,7 @@ def encoder(
                 continue
             obond = mol.GetBondBetweenAtoms(i_a, i_b)
             bonds.append(obond.GetIdx())
+
         if len(bonds) > 0:
             mol = Chem.FragmentOnBonds(
                 mol,
@@ -297,7 +298,6 @@ def encoder(
         # here we need to be clever and disable rooted atom as the atom with mapping
 
         frags = list(Chem.GetMolFrags(mol, asMols=True))
-
         if randomize:
             frags = rng.permutation(frags).tolist()
         elif canonical:
@@ -322,18 +322,23 @@ def encoder(
             )
 
         scaffold_str = ".".join(frags_str)
+        # EN: fix for https://github.com/datamol-io/safe/issues/37
+        # The branch-number count used previously did not account for possible
+        # changes in digit usage after bond slicing, so scaffold_str is scanned too.
+        scf_branch_num = self._find_branch_number(scaffold_str) + branch_numbers
+
         # don't capture atom mapping in the scaffold
         attach_pos = set(re.findall(r"(\[\d+\*\]|!\[[^:]*:\d+\])", scaffold_str))
-
         if canonical:
             attach_pos = sorted(attach_pos)
-        starting_num = 1 if len(branch_numbers) == 0 else max(branch_numbers) + 1
+        starting_num = 1 if len(scf_branch_num) == 0 else max(scf_branch_num) + 1
         for attach in attach_pos:
             val = str(starting_num) if starting_num < 10 else f"%{starting_num}"
             # we cannot have anything of the form "\([@=-#-$/\]*\d+\)"
             attach_regexp = re.compile(r"(" + re.escape(attach) + r")")
             scaffold_str = attach_regexp.sub(val, scaffold_str)
             starting_num += 1
+
         # now we need to remove all the parenthesis around digit only number
         wrong_attach = re.compile(r"\(([\%\d]*)\)")
         scaffold_str = wrong_attach.sub(r"\g<1>", scaffold_str)
diff --git a/tests/test_safe.py b/tests/test_safe.py
index fdfc288..3582f8a 100644
--- a/tests/test_safe.py
+++ b/tests/test_safe.py
@@ -110,3 +110,14 @@ def test_bracket_smiles_issues(input_sm):
     assert safe.decode(safe_str) is not None
     assert dm.same_mol(dm.to_mol(safe_str), input_mol)
     assert None not in fragments
+
+
+def test_fused_ring_issue():
+    FUSED_RING_LIST = [  # SMILES inputs round-tripped through safe.encode / safe.decode
+        "[H][C@@]12CC[C@@]3(CCC(=O)O3)[C@@]1(C)CC[C@@]1([H])[C@@]2([H])[C@@]([H])(CC2=CC(=O)CC[C@]12C)SC(C)=O",
+        "[H][C@@]12C[C@H](C)[C@](OC(=O)CC)(C(=O)COC(=O)CC)[C@@]1(C)C[C@H](O)[C@@]1(Cl)[C@@]2([H])CCC2=CC(=O)C=C[C@]12C",
+        "[H][C@@]12CC[C@@](O)(C#C)[C@@]1(CC)CC[C@]1([H])[C@@]3([H])CCC(=O)C=C3CC[C@@]21[H]",
+    ]
+    for fused_ring in FUSED_RING_LIST:
+        output_string = safe.decode(safe.encode(fused_ring))
+        assert dm.same_mol(fused_ring, output_string)  # decoded output must be the same molecule as the input