Merge pull request #94 from IBM/rfd-dev

🛠️🪲 Gracefully handle incorrect SMILES
IBM · Feb 27, 2025 · ae99c6c · ae99c6c
2 parents 69f5c41 + 946a8f5
commit ae99c6c
Show file tree

Hide file tree

Showing 3 changed files with 42 additions and 15 deletions.
diff --git a/README.md b/README.md
@@ -119,21 +119,21 @@ pip install rdkit
 The HestiaDatasetGenerator allows for the easy generation of training/validation/evaluation partitions with different similarity thresholds, enabling the estimation of model generalisation capabilities. It also allows for the calculation of the ABOID (Area between the similarity-performance curve (Out-of-distribution) and the In-distribution performance).
 
 ```python
-from hestia.dataset_generator import HestiaDatasetGenerator, SimilarityArguments
+from hestia.dataset_generator import HestiaGenerator, SimArguments
 
 # Initialise the generator for a DataFrame
-generator = HestiaDatasetGenerator(df)
+generator = HestiaGenerator(df)
 
 # Define the similarity arguments (for more info see the documentation page https://ibm.github.io/Hestia-OOD/datasetgenerator)
 
 # Similarity arguments for protein similarity
-prot_args = SimilarityArguments(
+prot_args = SimArguments(
     data_type='sequence', field_name='sequence',
     alignment_algorithm='mmseqs2+prefilter', verbose=3
 )
 
 # Similarity arguments for molecular similarity
-mol_args = SimilarityArguments(
+mol_args = SimArguments(
     data_type='small molecule', field_name='SMILES',
     fingerprint='mapc', radius=2, bits=2048
 )
@@ -152,9 +152,9 @@ generator.save_precalculated('precalculated_partitions.gz')
 # Load pre-calculated partitions
 generator.from_precalculated('precalculated_partitions.gz')
 
-# Training code
+# Training code (filter out partitions whose test set is smaller than 18.5% of the total data)
 
-for threshold, partition in generator.get_partitions():
+for threshold, partition in generator.get_partitions(filter=0.185):
     train = df.iloc[partition['train']]
     valid = df.iloc[partition['valid']]
     test = df.iloc[partition['test']]

diff --git a/hestia/dataset_generator.py b/hestia/dataset_generator.py
@@ -469,11 +469,14 @@ def calculate_partitions(
                 'train': random[0],
                 'test': random[1]
             }
-        sim_metadata = vars(sim_args)
-        if sim_args.data_type == 'embedding':
-            del sim_metadata['query_embds']
-            if 'target_embds' in sim_metadata:
-                del sim_metadata['target_embds']
+        if sim_args is None:
+            sim_metadata = None
+        else:
+            sim_metadata = vars(sim_args)
+            if sim_args.data_type == 'embedding':
+                del sim_metadata['query_embds']
+                if 'target_embds' in sim_metadata:
+                    del sim_metadata['target_embds']
 
         self.metadata = {
             'partition_algorithm': {

diff --git a/hestia/similarity.py b/hestia/similarity.py
@@ -242,8 +242,17 @@ def molecular_similarity(
         from rdkit.DataStructs import (
             BulkTanimotoSimilarity, BulkDiceSimilarity,
             BulkSokalSimilarity, BulkRogotGoldbergSimilarity,
-            BulkCosineSimilarity
-        )
+            BulkCosineSimilarity)
+        from rdkit import RDLogger
+        from rdkit import rdBase
+
+        def disable_rdkit_log():
+            """Disable all rdkit logs."""
+            for log_level in RDLogger._levels:
+                rdBase.DisableLog(log_level)
+
+        disable_rdkit_log()
+
     except ModuleNotFoundError:
         raise ImportError("This function requires RDKit to be installed.")
 
@@ -259,9 +268,13 @@ def molecular_similarity(
         fpgen = rdFingerprintGenerator.GetMorganGenerator(
             radius=radius, fpSize=bits
         )
-
         def _get_fp(smile: str):
-            mol = Chem.MolFromSmiles(smile)
+            mol = Chem.MolFromSmiles(smile, sanitize=True)
+
+            if mol is None:
+                print(f"SMILES: `{smile}` could not be processed. Will be substituted by `C`")
+                return _get_fp("C")
+
             if sim_function in ['dice', 'tanimoto', 'sokal', 'rogot-goldberg',
                                 'cosine']:
                 fp = fpgen.GetFingerprint(mol)
@@ -276,6 +289,10 @@ def _get_fp(smile: str):
 
         def _get_fp(smile: str):
             mol = Chem.MolFromSmiles(smile)
+            if mol is None:
+                print(f"SMILES: `{smile}` could not be processed. Will be substituted by `C`")
+                return _get_fp("C")
+
             fp = rdMolDescriptors.GetMACCSKeysFingerprint(mol)
             if sim_function in ['dice', 'tanimoto', 'sokal', 'rogot-goldberg',
                                 'cosine']:
@@ -291,6 +308,10 @@ def _get_fp(smile: str):
 
         def _get_fp(smile: str):
             mol = Chem.MolFromSmiles(smile, sanitize=True)
+            if mol is None:
+                print(f"SMILES: `{smile}` could not be processed. Will be substituted by `C`")
+                return _get_fp("C")
+
             fp = encode(mol, max_radius=radius,
                         n_permutations=bits, mapping=False)
             return fp
@@ -304,6 +325,9 @@ def _get_fp(smile: str):
         def _get_fp(smiles: str):
             fp = []
             mol = Chem.MolFromSmiles(smiles, sanitize=True)
+            if mol is None:
+                print(f"SMILES: `{smiles}` could not be processed. Will be substituted by `C`")
+                return _get_fp("C")
             fp.append(Lip.NumHAcceptors(mol))
             fp.append(Lip.NumHDonors(mol))
             fp.append(Lip.NumHeteroatoms(mol))