Skip to content

Commit

Permalink
Merge pull request #94 from IBM/rfd-dev
Browse files Browse the repository at this point in the history
🛠️🪲 Gracefully handle incorrect SMILES
  • Loading branch information
RaulFD-creator authored Feb 27, 2025
2 parents 69f5c41 + 946a8f5 commit ae99c6c
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 15 deletions.
12 changes: 6 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -119,21 +119,21 @@ pip install rdkit
The HestiaDatasetGenerator allows for the easy generation of training/validation/evaluation partitions at different similarity thresholds, enabling the estimation of a model's generalisation capabilities. It also allows for the calculation of the ABOID (Area Between the Out-of-distribution similarity-performance curve and the In-Distribution performance).

```python
from hestia.dataset_generator import HestiaDatasetGenerator, SimilarityArguments
from hestia.dataset_generator import HestiaGenerator, SimArguments

# Initialise the generator for a DataFrame
generator = HestiaDatasetGenerator(df)
generator = HestiaGenerator(df)

# Define the similarity arguments (for more info see the documentation page https://ibm.github.io/Hestia-OOD/datasetgenerator)

# Similarity arguments for protein similarity
prot_args = SimilarityArguments(
prot_args = SimArguments(
data_type='sequence', field_name='sequence',
alignment_algorithm='mmseqs2+prefilter', verbose=3
)

# Similarity arguments for molecular similarity
mol_args = SimilarityArguments(
mol_args = SimArguments(
data_type='small molecule', field_name='SMILES',
fingeprint='mapc', radius=2, bits=2048
)
Expand All @@ -152,9 +152,9 @@ generator.save_precalculated('precalculated_partitions.gz')
# Load pre-calculated partitions
generator.from_precalculated('precalculated_partitions.gz')

# Training code
# Training code (skip partitions whose test set is smaller than 18.5% of the total data)

for threshold, partition in generator.get_partitions():
for threshold, partition in generator.get_partitions(filter=0.185):
train = df.iloc[partition['train']]
valid = df.iloc[partition['valid']]
test = df.iloc[partition['test']]
Expand Down
13 changes: 8 additions & 5 deletions hestia/dataset_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -469,11 +469,14 @@ def calculate_partitions(
'train': random[0],
'test': random[1]
}
sim_metadata = vars(sim_args)
if sim_args.data_type == 'embedding':
del sim_metadata['query_embds']
if 'target_embds' in sim_metadata:
del sim_metadata['target_embds']
if sim_args is None:
sim_metadata = None
else:
sim_metadata = vars(sim_args)
if sim_args.data_type == 'embedding':
del sim_metadata['query_embds']
if 'target_embds' in sim_metadata:
del sim_metadata['target_embds']

self.metadata = {
'partition_algorithm': {
Expand Down
32 changes: 28 additions & 4 deletions hestia/similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,8 +242,17 @@ def molecular_similarity(
from rdkit.DataStructs import (
BulkTanimotoSimilarity, BulkDiceSimilarity,
BulkSokalSimilarity, BulkRogotGoldbergSimilarity,
BulkCosineSimilarity
)
BulkCosineSimilarity)
from rdkit import RDLogger
from rdkit import rdBase

def disable_rdkit_log():
    """Disable all rdkit logs.

    Uses the public wildcard spec ``rdApp.*`` instead of iterating over
    the private ``RDLogger._levels`` list, so the call does not depend on
    an internal attribute of RDKit.
    """
    # One call silences every rdApp channel (debug, info, warning, error).
    RDLogger.DisableLog("rdApp.*")

disable_rdkit_log()

except ModuleNotFoundError:
raise ImportError("This function requires RDKit to be installed.")

Expand All @@ -259,9 +268,13 @@ def molecular_similarity(
fpgen = rdFingerprintGenerator.GetMorganGenerator(
radius=radius, fpSize=bits
)

def _get_fp(smile: str):
mol = Chem.MolFromSmiles(smile)
mol = Chem.MolFromSmiles(smile, sanitize=True)

if mol is None:
print(f"SMILES: `{smile}` could not be processed. Will be substituted by `C`")
return _get_fp("C")

if sim_function in ['dice', 'tanimoto', 'sokal', 'rogot-goldberg',
'cosine']:
fp = fpgen.GetFingerprint(mol)
Expand All @@ -276,6 +289,10 @@ def _get_fp(smile: str):

def _get_fp(smile: str):
mol = Chem.MolFromSmiles(smile)
if mol is None:
print(f"SMILES: `{smile}` could not be processed. Will be substituted by `C`")
return _get_fp("C")

fp = rdMolDescriptors.GetMACCSKeysFingerprint(mol)
if sim_function in ['dice', 'tanimoto', 'sokal', 'rogot-goldberg',
'cosine']:
Expand All @@ -291,6 +308,10 @@ def _get_fp(smile: str):

def _get_fp(smile: str):
    """Return the fingerprint that `encode` produces for one SMILES string.

    When `Chem.MolFromSmiles` cannot build a molecule (it returns None),
    a notice is printed and the fingerprint of `C` is returned instead,
    so the caller always receives a fingerprint.
    """
    molecule = Chem.MolFromSmiles(smile, sanitize=True)
    if molecule is not None:
        return encode(
            molecule,
            max_radius=radius,
            n_permutations=bits,
            mapping=False,
        )

    # Unparseable input: report it and fall back to the placeholder `C`.
    print(f"SMILES: `{smile}` could not be processed. Will be substituted by `C`")
    return _get_fp("C")
Expand All @@ -304,6 +325,9 @@ def _get_fp(smile: str):
def _get_fp(smiles: str):
fp = []
mol = Chem.MolFromSmiles(smiles, sanitize=True)
if mol is None:
print(f"SMILES: `{smiles}` could not be processed. Will be substituted by `C`")
return _get_fp("C")
fp.append(Lip.NumHAcceptors(mol))
fp.append(Lip.NumHDonors(mol))
fp.append(Lip.NumHeteroatoms(mol))
Expand Down

0 comments on commit ae99c6c

Please sign in to comment.