Skip to content

Commit

Permalink
Merge pull request #71 from xiaoruiDong/fingerprints
Browse files Browse the repository at this point in the history
Add a new module to calculate fingerprints using RDKit
  • Loading branch information
xiaoruiDong authored Sep 27, 2023
2 parents 8d17698 + 09daf09 commit ce9e368
Show file tree
Hide file tree
Showing 4 changed files with 220 additions and 0 deletions.
59 changes: 59 additions & 0 deletions rdmc/featurizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
This module contains functions for generating molecular fingerprints.
"""

import numpy as np

from rdkit.Chem import Mol
from rdkit.Chem.rdFingerprintGenerator import (GetAtomPairGenerator,
GetMorganGenerator,
GetRDKitFPGenerator,
GetTopologicalTorsionGenerator)


# Lookup table from a lower-case fingerprint name to the RDKit generator
# factory that builds it. Every fingerprint is reachable under its plain name
# and under the same name with an ``fp`` suffix.
fingerprint_dicts = {
    'atompair': GetAtomPairGenerator, 'atompairfp': GetAtomPairGenerator,
    'morgan': GetMorganGenerator, 'morganfp': GetMorganGenerator,
    'rdkit': GetRDKitFPGenerator, 'rdkitfp': GetRDKitFPGenerator,
    'topologicaltorsion': GetTopologicalTorsionGenerator,
    'topologicaltorsionfp': GetTopologicalTorsionGenerator,
}


def get_fingerprint(mol: 'RDKitMol',
                    count: bool = False,
                    fp_type: str = 'morgan',
                    num_bits: int = 2048,
                    **kwargs,
                    ) -> np.ndarray:
    """
    A helper function for generating molecular fingerprints. Please visit
    `RDKit <https://www.rdkit.org/docs/source/rdkit.Chem.rdFingerprintGenerator.html>`_ for
    more information. This function also supports fingerprint-specific arguments,
    please visit the above website and find ``GetXXXGenerator`` for the corresponding
    argument names and allowed value types.

    Args:
        mol: The molecule to generate a fingerprint for. An object that is not an
             RDKit ``Mol`` is converted by calling its ``ToRWMol()`` method.
        count: Whether to generate a count fingerprint. Default is ``False``.
        fp_type: The type of fingerprint to generate. The name is matched
                 case-insensitively. Options are: ``'atompair'``, ``'morgan'``,
                 ``'rdkit'``, and ``'topologicaltorsion'``; each of them is also
                 accepted with an ``'fp'`` suffix (e.g., ``'morganfp'``).
        num_bits: The length of the fingerprint, passed to the generator as ``fpSize``.
                  Default is ``2048``.
        **kwargs: Fingerprint-specific arguments forwarded to the generator factory.

    Returns:
        np.ndarray: A numpy array of the molecular fingerprint.

    Raises:
        KeyError: If ``fp_type`` is not one of the supported fingerprint types.
    """
    if not isinstance(mol, Mol):  # Convert RDKitMol
        mol = mol.ToRWMol()

    try:
        generator_factory = fingerprint_dicts[fp_type.lower()]
    except KeyError:
        # Still a KeyError (as before), but tell the user what went wrong and
        # which names are accepted instead of only echoing the bad key.
        supported = ', '.join(sorted(fingerprint_dicts))
        raise KeyError(f'Unsupported fingerprint type {fp_type!r}; '
                       f'supported types are: {supported}') from None

    generator = generator_factory(fpSize=num_bits, **kwargs)

    # Pick between ``GetCountFingerprintAsNumPy`` and ``GetFingerprintAsNumPy``
    method_name = f'Get{"Count" if count else ""}FingerprintAsNumPy'
    return getattr(generator, method_name)(mol)
24 changes: 24 additions & 0 deletions rdmc/mol.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from rdkit.Geometry.rdGeometry import Point3D

from rdmc.conf import RDKitConf
from rdmc.featurizer import get_fingerprint
from rdmc.utils import *

from ase import Atoms
Expand Down Expand Up @@ -904,6 +905,29 @@ def GetDistanceMatrix(self, id: int = 0) -> np.ndarray:
"""
return Chem.rdmolops.Get3DDistanceMatrix(self._mol, confId=id)

def GetFingerprint(self,
                   fpType: str = 'morgan',
                   numBits: int = 2048,
                   count: bool = False,
                   **kwargs,
                   ) -> np.ndarray:
    """
    Compute a fingerprint of this molecule by delegating to ``get_fingerprint``.

    Args:
        fpType (str, optional): The type of the fingerprint. Defaults to ``'morgan'``.
        numBits (int, optional): The number of bits of the fingerprint. Defaults to ``2048``.
        count (bool, optional): Whether to count the number of occurrences of each bit. Defaults to ``False``.
        **kwargs: Additional keyword arguments passed through to ``get_fingerprint`` unchanged.

    Returns:
        np.ndarray: A fingerprint of the molecule.
    """
    # Translate the camelCase argument names of this class into the
    # snake_case names used by the featurizer helper.
    return get_fingerprint(self, count=count, fp_type=fpType, num_bits=numBits, **kwargs)

def GetPositions(self, id: int = 0) -> np.ndarray:
"""
Get atom positions of the embeded conformer.
Expand Down
123 changes: 123 additions & 0 deletions test/test_fingerprint.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Unit tests for the fingerprint module. This module doesn't test the actual values (which are tested
in RDKit's CI pipeline), but rather the functionality of the ``get_fingerprint`` function.
"""

import logging

import numpy as np
import pytest

from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from rdmc import RDKitMol
from rdmc.featurizer import get_fingerprint

logging.basicConfig(level=logging.DEBUG)


smis = ['Fc1cccc(C2(c3nnc(Cc4cccc5ccccc45)o3)CCOCC2)c1',
'O=C(NCc1ccnc(Oc2ccc(F)cc2)c1)c1[nH]nc2c1CCCC2',]


@pytest.fixture(params=smis)
def mol(request):
    """Create a molecule via ``Chem.MolFromSmiles`` for each SMILES in ``smis``."""
    smiles = request.param
    return Chem.MolFromSmiles(smiles)


@pytest.fixture(params=smis)
def rdkitmol(request):
    """Create an ``RDKitMol`` (with ``addHs=False``) for each SMILES in ``smis``."""
    smiles = request.param
    return RDKitMol.FromSmiles(smiles, addHs=False)


@pytest.fixture(params=[1024, 2048, 4096])
def num_bits(request):
    """Provide, one at a time, the fingerprint lengths listed in ``params``."""
    fp_length = request.param
    return fp_length


@pytest.fixture(params=[True, False])
def count(request):
    """Provide both values of the ``count`` flag, one at a time."""
    use_count = request.param
    return use_count


@pytest.fixture(params=[2, 3])
def radius(request):
    """Provide, one at a time, the ``radius`` values listed in ``params``."""
    fp_radius = request.param
    return fp_radius


# Maps ``(fingerprint type, count)`` to the name of the attribute that is looked
# up on ``AllChem`` to compute the reference fingerprint. There is no entry for
# a count-based ``'RDKitFP'``.
generator_name = {
    # Morgan
    ('Morgan', True): 'GetHashedMorganFingerprint',
    ('Morgan', False): 'GetMorganFingerprintAsBitVect',
    # Atom pair
    ('AtomPair', True): 'GetHashedAtomPairFingerprint',
    ('AtomPair', False): 'GetHashedAtomPairFingerprintAsBitVect',
    # Topological torsion
    ('TopologicalTorsion', True): 'GetHashedTopologicalTorsionFingerprint',
    ('TopologicalTorsion', False): 'GetHashedTopologicalTorsionFingerprintAsBitVect',
    # RDKit fingerprint (bit-based only)
    ('RDKitFP', False): 'RDKFingerprint',
}


def get_allchem_fingerprint(mol, count, fp_type, num_bits, **kwargs):
    """
    Compute a reference fingerprint with the function-style ``AllChem`` API.

    This is another way, with different APIs, that people used to calculate fingerprints.

    Args:
        mol: The molecule passed to the ``AllChem`` function.
        count: Used together with ``fp_type`` to select the function name from ``generator_name``.
        fp_type: Used together with ``count`` to select the function name from ``generator_name``.
        num_bits: Passed to the ``AllChem`` function as ``nBits``.
        **kwargs: Additional keyword arguments forwarded to the ``AllChem`` function.

    Returns:
        The numpy array filled by ``DataStructs.ConvertToNumpyArray``.
    """
    allchem_func_name = generator_name[(fp_type, count)]
    allchem_func = getattr(AllChem, allchem_func_name)
    fp_vec = allchem_func(mol, nBits=num_bits, **kwargs)

    # Copy the RDKit fingerprint object into a numpy array
    fp_array = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(fp_vec, fp_array)
    return fp_array


class TestFingerprint:
    """
    Check that ``get_fingerprint`` reproduces the fingerprints computed through ``AllChem``.
    """

    @pytest.mark.parametrize('fp_type, count',
                             [('Morgan', True),
                              ('Morgan', False),
                              ])
    def test_morgan_fingerprint(self, fp_type, count, mol, num_bits, radius):
        """
        Test that ``get_fingerprint`` reproduces the count- and bit-based Morgan fingerprints
        for the molecule provided by the ``mol`` fixture.
        """
        options = dict(count=count, num_bits=num_bits, fp_type=fp_type, radius=radius)
        actual = get_fingerprint(mol, **options)
        expected = get_allchem_fingerprint(mol, **options)
        assert np.isclose(actual, expected).all()

    @pytest.mark.parametrize('fp_type, count',
                             [('Morgan', True),
                              ('Morgan', False),
                              ])
    def test_morgan_fingerprint_rdkitmol(self, fp_type, count, rdkitmol, num_bits, radius):
        """
        Test that ``get_fingerprint`` reproduces the count- and bit-based Morgan fingerprints
        when it is given an ``RDKitMol``.
        """
        options = dict(count=count, num_bits=num_bits, fp_type=fp_type, radius=radius)
        actual = get_fingerprint(rdkitmol, **options)
        expected = get_allchem_fingerprint(rdkitmol.ToRWMol(), **options)
        assert np.isclose(actual, expected).all()

    @pytest.mark.parametrize('fp_type, count',
                             [('AtomPair', True),
                              ('AtomPair', False),
                              ('TopologicalTorsion', True),
                              ('TopologicalTorsion', False),
                              ])
    def test_atompair_and_topological_torsion_fingerprint_rdkitmol(self, fp_type, count, rdkitmol, num_bits):
        """
        Test that ``get_fingerprint`` reproduces the count- and bit-based AtomPair and
        TopologicalTorsion fingerprints when it is given an ``RDKitMol``.
        """
        options = dict(count=count, num_bits=num_bits, fp_type=fp_type)
        actual = get_fingerprint(rdkitmol, **options)
        expected = get_allchem_fingerprint(rdkitmol.ToRWMol(), **options)
        assert np.isclose(actual, expected).all()

    def test_rdkitfp_bit(self, rdkitmol, num_bits):
        """
        Test that ``get_fingerprint`` reproduces the bit-based RDKitFP fingerprint.
        No count-based RDKitFP implementation was found in AllChem, so only the
        bit-based variant is compared here.
        """
        # Reference fingerprint computed directly with AllChem
        reference_vec = AllChem.RDKFingerprint(rdkitmol.ToRWMol(), fpSize=num_bits)
        expected = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(reference_vec, expected)

        actual = get_fingerprint(rdkitmol, count=False, num_bits=num_bits, fp_type='rdkitfp')
        assert np.isclose(actual, expected).all()
14 changes: 14 additions & 0 deletions test/test_mol.py
Original file line number Diff line number Diff line change
Expand Up @@ -812,6 +812,20 @@ def test_saturate_biradical_sites_conjugated_double_bond(self):
mol.SaturateBiradicalSitesCDB(multiplicity=1, verbose=True)
assert mol.GetSpinMultiplicity() == 1

def test_get_finger_print(self):
    """
    Test the method that generates molecular fingerprints.
    """
    # Only a single case is checked here, to confirm that the method is wired
    # up correctly; the remaining cases are left to the dedicated fingerprint tests.
    smi = 'O=C(Nc1cc2c(cn1)CCCC2)N1CCCC1c1ccc(O)cc1'

    rdkit_mol = RDKitMol.FromSmiles(smi, addHs=False)
    fp = rdkit_mol.GetFingerprint(fpType='morgan', numBits=2048, count=True, radius=3)

    # Reference value computed directly with the RDKit generator API
    morgan_generator = Chem.rdFingerprintGenerator.GetMorganGenerator(radius=3, fpSize=2048)
    fp_expect = morgan_generator.GetCountFingerprintAsNumPy(Chem.MolFromSmiles(smi))

    assert np.isclose(fp, fp_expect).all()


def test_parse_xyz_or_smiles_list():
"""
Expand Down

0 comments on commit ce9e368

Please sign in to comment.