diff --git a/.gitignore b/.gitignore
index 38cd292..fe105ba 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,3 @@
__pycache__/
build/
-hestia.egg-info/
\ No newline at end of file
+*.egg-info/
diff --git a/README.md b/README.md
index a83a8fc..98034e0 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,151 @@
-# Hestia
-Independent evaluation set construction for trustworthy ML models in biochemistry
+# Hestia
+
+Computational tool for generating evaluation sets that assess model generalisation.
+
+- **Documentation:** https://ibm.github.io/Hestia-OOD
+- **Source Code:** https://github.com/IBM/Hestia-OOD
+- **Webserver:** http://peptide.ucd.ie/Hestia
+- **Paper Pre-print:** https://www.biorxiv.org/content/10.1101/2024.03.14.584508v1
+
+## Contents
+
+- [Installation Guide](#installation)
+- [Documentation](#documentation)
+- [Examples](#examples)
+- [License](#license)
+
+
+
+## Installation
+
+Installing in a conda environment is recommended. To create the environment, please run:
+
+```bash
+conda create -n hestia python
+conda activate hestia
+```
+
+### 1. Python Package
+
+#### 1.1. From PyPI
+
+
+```bash
+pip install hestia-ood
+```
+
+#### 1.2. Directly from source
+
+```bash
+pip install git+https://github.com/IBM/Hestia-OOD
+```
+
+### 2. Third-party dependencies
+
+To use MMSeqs as the alignment algorithm, it needs to be installed in the environment:
+
+```bash
+conda install -c bioconda mmseqs2
+```
+
+To use Needleman-Wunsch:
+
+```bash
+conda install -c bioconda emboss
+```
+
+If not installing within a conda environment, please check the installation instructions for your operating system:
+
+- Linux:
+ ```bash
+ wget https://mmseqs.com/latest/mmseqs-linux-avx2.tar.gz
+ tar xvfz mmseqs-linux-avx2.tar.gz
+ export PATH=$(pwd)/mmseqs/bin/:$PATH
+ ```
+
+ ```bash
+ sudo apt install emboss
+ ```
+
+- Windows: Download binaries from [EMBOSS](https://emboss.sourceforge.net/download/) and [MMSeqs2-latest](https://mmseqs.com/latest/mmseqs-win64.zip)
+
+- Mac:
+ ```bash
+ sudo port install emboss
+ brew install mmseqs2
+ ```
+
+## Documentation
+
+### 1. Similarity calculation
+
+Calculating pairwise similarity between the entities within a DataFrame `df_query` or between two DataFrames `df_query` and `df_target` can be achieved through the `calculate_similarity` function:
+
+```python
+from hestia.similarity import calculate_similarity
+import pandas as pd
+
+df_query = pd.read_csv('example.csv')
+
+# The CSV file needs to have a column describing the entities, i.e., their sequence, their SMILES, or a path to their PDB structure.
+# This column corresponds to `field_name` in the function.
+
+sim_df = calculate_similarity(df_query, data_type='protein',
+                              similarity_metric='mmseqs+prefilter',
+                              field_name='sequence')
+```
+
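+The same function can also compare two different datasets (a minimal sketch, assuming a hypothetical second file `targets.csv` with the same column layout):
+
+```python
+from hestia.similarity import calculate_similarity
+import pandas as pd
+
+df_query = pd.read_csv('example.csv')
+df_target = pd.read_csv('targets.csv')  # hypothetical second dataset
+
+# The result is a DataFrame with `query`, `target`, and `metric` columns.
+sim_df = calculate_similarity(df_query, df_target, data_type='protein',
+                              similarity_metric='mmseqs+prefilter',
+                              field_name='sequence')
+```
+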
+More details about similarity calculation can be found in the [Similarity calculation documentation](https://ibm.github.io/Hestia-OOD/similarity/).
+
+### 2. Clustering
+
+Clustering the entities within a DataFrame `df` can be achieved through the `generate_clusters` function:
+
+```python
+from hestia.similarity import calculate_similarity
+from hestia.clustering import generate_clusters
+import pandas as pd
+
+df = pd.read_csv('example.csv')
+sim_df = calculate_similarity(df, data_type='protein',
+                              similarity_metric='mmseqs+prefilter',
+                              field_name='sequence')
+clusters_df = generate_clusters(df, field_name='sequence', sim_df=sim_df,
+                                cluster_algorithms='CDHIT')
+```
+
+There are three clustering algorithms currently supported: `CDHIT`, `greedy_cover_set`, or `connected_components`. More details about clustering can be found in the [Clustering documentation](https://ibm.github.io/Hestia-OOD/clustering/).
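+
+As a minimal sketch (reusing `df` and `sim_df` from the example above), switching to a different clustering algorithm only requires changing its name:
+
+```python
+from hestia.clustering import generate_clusters
+
+# Assumes `df` and `sim_df` have been computed as in the previous example.
+clusters_df = generate_clusters(df, field_name='sequence', sim_df=sim_df,
+                                cluster_algorithms='connected_components')
+```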
+
+
+### 3. Partitioning
+
+Partitioning the entities within a DataFrame `df` into training and evaluation subsets can be achieved through 4 different functions: `ccpart`, `graph_part`, `reduction_partition`, and `random_partition`. An example of how `ccpart` would be used is:
+
+```python
+from hestia.partition import ccpart
+import pandas as pd
+
+df = pd.read_csv('example.csv')
+train, test = ccpart(df, data_type='protein',
+                     similarity_metric='mmseqs+prefilter',
+                     field_name='sequence', threshold=0.3, test_size=0.2)
+
+train_df = df.iloc[train, :]
+test_df = df.iloc[test, :]
+```
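+
+If the similarity matrix has already been computed, it can be passed directly to avoid re-running the alignment (a sketch reusing `sim_df` from the similarity example above):
+
+```python
+from hestia.similarity import calculate_similarity
+from hestia.partition import ccpart
+import pandas as pd
+
+df = pd.read_csv('example.csv')
+
+# Compute the pairwise similarities once...
+sim_df = calculate_similarity(df, data_type='protein',
+                              similarity_metric='mmseqs+prefilter',
+                              field_name='sequence')
+
+# ...and reuse them when partitioning at the chosen threshold.
+train, test = ccpart(df, field_name='sequence', threshold=0.3,
+                     test_size=0.2, sim_df=sim_df)
+```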
+
+## License
+
+Hestia is open-source software licensed under the MIT License. Check the details in the [LICENSE](https://github.com/IBM/Hestia/blob/master/LICENSE) file.
+
diff --git a/hestia/clustering.py b/hestia/clustering.py
index 3f6fffe..1fa14a9 100644
--- a/hestia/clustering.py
+++ b/hestia/clustering.py
@@ -2,6 +2,7 @@
import pandas as pd
from scipy.sparse.csgraph import connected_components
+from tqdm import tqdm
from hestia.similarity import sim_df2mtx
@@ -78,16 +79,20 @@ def _greedy_incremental_clustering(
clustered = set()
sim_df = sim_df[sim_df['metric'] > threshold]
- for i in df.index:
- in_cluster = set(sim_df.loc[sim_df['query'] == i, 'target'])
- in_cluster.update(set(sim_df.loc[sim_df['target'] == i, 'query']))
+ if verbose > 2:
+ pbar = tqdm(df.index)
+ else:
+ pbar = df.index
+ for i in pbar:
if i in clustered:
continue
+ in_cluster = set(sim_df.loc[sim_df['query'] == i, 'target'])
+ in_cluster.update(set(sim_df.loc[sim_df['target'] == i, 'query']))
+ in_cluster.update(set([i]))
+ in_cluster = in_cluster.difference(clustered)
for j in in_cluster:
- if i == j:
- continue
clusters.append({
'cluster': i,
'member': j
@@ -99,7 +104,7 @@ def _greedy_incremental_clustering(
if verbose > 1:
print('Clustering has generated:',
f'{len(cluster_df.cluster.unique()):,d} clusters for',
- f'{len(df):,} entities')
+ f'{len(cluster_df):,} entities')
return cluster_df
@@ -111,7 +116,7 @@ def _greedy_cover_set(
) -> pd.DataFrame:
def _find_connectivity(df, sim_df):
neighbours = []
- for i in df.index:
+ for i in tqdm(df.index):
in_cluster = set(sim_df.loc[sim_df['query'] == i, 'target'])
in_cluster.update(set(sim_df.loc[sim_df['target'] == i, 'query']))
neighbours.append(in_cluster)
@@ -124,15 +129,19 @@ def _find_connectivity(df, sim_df):
clusters = []
clustered = set()
+ if verbose > 2:
+ pbar = tqdm(df.index)
+ else:
+ pbar = df.index
- for i in df.index:
- in_cluster = neighbours.pop(0)
-
+ for i in pbar:
if i in clustered:
continue
+ in_cluster = neighbours.pop(0)
+ in_cluster.update([i])
+ in_cluster = in_cluster.difference(clustered)
+
for j in in_cluster:
- if i == j:
- continue
clusters.append({
'cluster': i,
'member': j
@@ -144,7 +153,7 @@ def _find_connectivity(df, sim_df):
if verbose > 1:
print('Clustering has generated:',
f'{len(cluster_df.cluster.unique()):,d} clusters for',
- f'{len(df):,} entities')
+ f'{len(cluster_df):,} entities')
return cluster_df
@@ -158,7 +167,7 @@ def _connected_components_clustering(
n, labels = connected_components(matrix, directed=False,
return_labels=True)
cluster_df = [{'cluster': labels[i],
- 'member': i} for i in df.index]
+ 'member': i} for i in range(labels.shape[0])]
if verbose > 0:
print('Clustering has generated:',
f'{n:,d} connected componentes for',
diff --git a/hestia/dataset_generator.py b/hestia/dataset_generator.py
new file mode 100644
index 0000000..45f27db
--- /dev/null
+++ b/hestia/dataset_generator.py
@@ -0,0 +1,238 @@
+import gzip
+import json
+from multiprocessing import cpu_count
+
+import pandas as pd
+from sklearn.metrics import auc
+from tqdm import tqdm
+
+from hestia.similarity import calculate_similarity
+from hestia.partition import random_partition, ccpart, graph_part
+
+
+class SimilarityArguments:
+ def __init__(
+ self,
+ data_type: str = 'protein',
+ similarity_metric: str = 'mmseqs+prefilter',
+ field_name: str = 'sequence',
+ min_threshold: float = 0.25,
+ threads: int = cpu_count(),
+ verbose: int = 0,
+ save_alignment: bool = False,
+ filename: str = 'alignment',
+ distance: str = 'tanimoto',
+ bits: int = 1024,
+ radius: int = 2,
+ denominator: str = 'shortest',
+ representation: str = '3di+aa',
+ config: dict = {
+ "gapopen": 10,
+ "gapextend": 0.5,
+ "endweight": True,
+ "endopen": 10,
+ "endextend": 0.5,
+ "matrix": "EBLOSUM62"
+ }
+ ):
+ self.data_type = data_type
+ self.similarity_metric = similarity_metric
+ self.field_name = field_name
+ self.min_threshold = min_threshold
+ self.threads = threads
+ self.verbose = verbose
+ self.save_alignment = save_alignment
+ self.filename = filename
+ self.distance = distance
+ self.bits = bits
+ self.radius = radius
+ self.denominator = denominator
+ self.representation = representation
+ self.config = config
+
+
+class HestiaDatasetGenerator:
+ def __init__(self, data: pd.DataFrame):
+ self.data = data
+ self.sim_df = None
+ self.partitions = None
+ print('Initialising Hestia Dataset Generator')
+ print(f'Number of items in data: {len(self.data)}')
+
+ def from_precalculated(self, data_path: str):
+ with gzip.open(data_path, 'r') as fin:
+ self.partitions = json.loads(fin.read().decode('utf-8'))
+ new_dict = {}
+ for key, value in self.partitions.items():
+ if key != 'random':
+ new_dict[float(key)] = value
+ else:
+ new_dict[key] = value
+ self.partitions = new_dict
+
+ def save_precalculated(self, output_path: str):
+ with gzip.open(output_path, 'w') as fout:
+ fout.write(json.dumps(self.partitions).encode('utf-8'))
+
+ def calculate_similarity(self, similarity_args: SimilarityArguments):
+ print('Calculating similarity...')
+ self.sim_df = calculate_similarity(
+ self.data, self.data, data_type=similarity_args.data_type,
+ similarity_metric=similarity_args.similarity_metric,
+ field_name=similarity_args.field_name,
+ threshold=similarity_args.min_threshold,
+ threads=similarity_args.threads,
+ verbose=similarity_args.verbose,
+ save_alignment=similarity_args.save_alignment,
+ filename=similarity_args.filename,
+ distance=similarity_args.distance,
+ bits=similarity_args.bits,
+ radius=similarity_args.radius,
+ denominator=similarity_args.denominator,
+ representation=similarity_args.representation,
+ config=similarity_args.config
+ )
+ print('Similarity successfully calculated!')
+
+ def load_similarity(self, output_path: str):
+ print('Loading precalculated similarity...')
+ self.sim_df = pd.read_csv(output_path, compression='gzip')
+ print('Precalculated similarity loaded successfully!')
+
+ def calculate_partitions(
+ self,
+ label_name: str = None,
+ min_threshold: float = 0.3,
+ threshold_step: float = 0.1,
+ test_size: float = 0.2,
+ valid_size: float = 0.1,
+ partition_algorithm: str = 'ccpart',
+ random_state: int = 42,
+ similarity_args: SimilarityArguments = SimilarityArguments()
+ ):
+ print('Calculating partitions...')
+ self.partitions = {}
+ if self.sim_df is None:
+ self.calculate_similarity(similarity_args)
+ if partition_algorithm == 'ccpart':
+ partition_algorithm = ccpart
+ elif partition_algorithm == 'graph_part':
+ partition_algorithm = graph_part
+ else:
+ raise ValueError(
+ f'Partition algorithm: {partition_algorithm} is not ' +
+ 'supported. Try using: `ccpart` or `graph_part`.'
+ )
+ min_threshold = int(min_threshold * 100)
+ threshold_step = int(threshold_step * 100)
+
+ for th in tqdm(range(min_threshold, 100, threshold_step)):
+ th_parts = partition_algorithm(
+ self.data,
+ label_name=label_name, test_size=test_size,
+ valid_size=valid_size, threshold=th / 100,
+ sim_df=self.sim_df
+ )
+ train_th_parts = random_partition(
+ self.data.iloc[th_parts[0]].reset_index(drop=True),
+ test_size=valid_size, random_state=random_state
+ )
+ self.partitions[th / 100] = {
+ 'train': train_th_parts[0],
+ 'valid': train_th_parts[1],
+ 'test': th_parts[1]
+ }
+ random = random_partition(self.data, test_size=test_size,
+ random_state=random_state)
+ train_random = random_partition(
+ self.data.iloc[random[0]].reset_index(drop=True),
+ test_size=valid_size, random_state=random_state
+ )
+ self.partitions['random'] = {
+ 'train': train_random[0],
+ 'valid': train_random[1],
+ 'test': random[1]
+ }
+ print('Partitions successfully calculated!')
+
+ def generate_datasets(self, dataset_type: str, threshold: float) -> dict:
+ ds = {}
+
+ if dataset_type == 'huggingface' or dataset_type == 'hf':
+ try:
+ import datasets
+ except ImportError:
+ raise ImportError(
+ f"This dataset_type: {dataset_type} requires `datasets` " +
+ "to be installed. Install using: `pip install datasets`"
+ )
+ for key, value in self.partitions[threshold].items():
+ ds[key] = datasets.Dataset.from_pandas(
+ self.data.iloc[value].reset_index()
+ )
+ return ds
+ elif dataset_type == 'pytorch' or dataset_type == 'torch':
+ try:
+ from hestia.utils.dataset_utils import Dataset_from_pandas
+ except ModuleNotFoundError:
+ raise ImportError(
+ f"This dataset_type: {dataset_type} requires `torch` " +
+ "to be installed. Install using: `pip install torch`"
+ )
+ for key, value in self.partitions[threshold].items():
+ ds[key] = Dataset_from_pandas(
+ self.data.iloc[value].reset_index()
+ )
+ return ds
+
+ @staticmethod
+ def calculate_auspc(results: dict, metric: str):
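+ # Area Under the Similarity-Performance Curve: integrates the drop in
+ # performance relative to the random partition across similarity thresholds.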
+ x, y = [], []
+ for key, value in results.items():
+ if key == 'random':
+ continue
+ x.append(key)
+ y.append(results['random'][metric] - value[metric])
+ return auc(x, y)
+
+ @staticmethod
+ def plot_spc(results: dict, metric: str):
+ import matplotlib.pyplot as plt
+ x, y = [], []
+ for key, value in results.items():
+ if key == 'random':
+ continue
+ x.append(key)
+ y.append(value[metric])
+ plt.scatter(x, y)
+ plt.plot(x, [results['random'][metric] for _ in range(len(x))], 'r')
+ plt.ylabel(f'Performance: {metric}')
+ plt.xlabel('Threshold similarity')
+ plt.legend(['SP', 'Random'])
+ plt.ylim(0, 1.1)
+ plt.show()
+
+
+if __name__ == '__main__':
+ df = pd.read_csv('dili.tab', sep='\t')
+ generator = HestiaDatasetGenerator(df)
+ args = SimilarityArguments(
+ data_type='small_molecule', field_name='Drug',
+ similarity_metric='fingerprint', verbose=3,
+ save_alignment=True
+ )
+ generator.calculate_similarity(args)
+ generator.load_similarity(args.filename + '.csv.gz')
+ generator.calculate_partitions('Y', min_threshold=0.3,
+ threshold_step=0.05,
+ test_size=0.2, valid_size=0.1)
+ generator.save_precalculated('precalculated_partitions.gz')
+ generator.from_precalculated('precalculated_partitions.gz')
+ ds = generator.generate_datasets('torch', 0.35)
+ trial = {
+ th / 100: {'acc': 0.9 + (0.001 * th)}
+ for th in range(30, 100, 5)
+ }
+ trial.update({'random': {'acc': 1}})
+ print(generator.calculate_auspc(trial, 'acc'))
+ HestiaDatasetGenerator.plot_spc(trial, 'acc')
diff --git a/hestia/partition.py b/hestia/partition.py
index ae42e7d..7121b96 100644
--- a/hestia/partition.py
+++ b/hestia/partition.py
@@ -43,18 +43,18 @@ def random_partition(
return train_df, test_df
-def connected_components_partition(
+def ccpart(
df: pd.DataFrame,
- similarity_metric: str,
- field_name: str,
- label_name: str,
+ similarity_metric: str = None,
+ field_name: str = None,
+ label_name: str = None,
threads: int = cpu_count(),
denominator: str = None,
test_size: float = 0.2,
valid_size: float = 0.0,
threshold: float = 0.3,
verbose: int = 0,
- species: str = 'protein',
+ data_type: str = 'protein',
distance: str = 'tanimoto',
representation: str = '3di+aa',
bits: int = 1024,
@@ -62,7 +62,7 @@ def connected_components_partition(
config: dict = None,
sim_df: Optional[pd.DataFrame] = None
) -> Union[Tuple[list, list], Tuple[list, list, list]]:
- """Use connected components partitioning algorithm
+ """Use CCPart algorithm
to generate training and evaluation subsets
that maximise the dissimilarity between their
entities.
@@ -120,10 +120,10 @@ def connected_components_partition(
- 2: All
Defaults to 0
:type verbose: int, optional
- :param species: Biochemical species to which the data belongs.
+ :param data_type: Biochemical data type to which the data belongs.
Options: `protein`, `DNA`, `RNA`, or `small_molecule`; defaults to
'protein'
- :type species: str, optional
+ :type data_type: str, optional
:param distance: Distance metrics for small molecule comparison.
Currently, it is restricted to Tanimoto distance will
be extended in future patches; if interested in a specific
@@ -175,7 +175,7 @@ def connected_components_partition(
if sim_df is None:
sim_df = calculate_similarity(
- df, df, species=species,
+ df, df, data_type=data_type,
similarity_metric=similarity_metric,
field_name=field_name, threshold=threshold,
threads=threads, verbose=verbose,
@@ -183,7 +183,6 @@ def connected_components_partition(
bits=bits, denominator=denominator, radius=radius,
representation=representation, config=config
)
-
cluster_df = generate_clusters(df, field_name=field_name,
threshold=threshold,
verbose=verbose,
@@ -251,7 +250,7 @@ def reduction_partition(
valid_size: float = 0.0,
threshold: float = 0.3,
verbose: int = 2,
- species: str = 'protein',
+ data_type: str = 'protein',
representation: str = '3di+aa',
random_state: int = 42,
bits: int = 1024,
@@ -318,10 +317,10 @@ def reduction_partition(
- 2: All
Defaults to 0
:type verbose: int, optional
- :param species: Biochemical species to which the data belongs.
+ :param data_type: Biochemical data type to which the data belongs.
Options: `protein`, `DNA`, `RNA`, or `small_molecule`; defaults to
'protein'
- :type species: str, optional
+ :type data_type: str, optional
:param distance: Distance metrics for small molecule comparison.
Currently, it is restricted to Tanimoto distance will
be extended in future patches; if interested in a specific
@@ -362,7 +361,7 @@ def reduction_partition(
"""
df = similarity_reduction(df, similarity_metric, field_name,
threads, clustering_mode, denominator,
- test_size, threshold, verbose, species,
+ test_size, threshold, verbose, data_type,
representation, bits,
radius, sim_df, config)
train, test = random_partition(df.index.tolist(), test_size=test_size,
@@ -388,7 +387,7 @@ def graph_part(
threshold: float = 0.3,
verbose: int = 2,
n_parts: int = 10,
- species: str = 'protein',
+ data_type: str = 'protein',
distance: str = 'tanimoto',
representation: str = '3di+aa',
bits: int = 1024,
@@ -454,10 +453,10 @@ def graph_part(
- 2: All
Defaults to 0
:type verbose: int, optional
- :param species: Biochemical species to which the data belongs.
+ :param data_type: Biochemical data type to which the data belongs.
Options: `protein`, `DNA`, `RNA`, or `small_molecule`; defaults to
'protein'
- :type species: str, optional
+ :type data_type: str, optional
:param distance: Distance metrics for small molecule comparison.
Currently, it is restricted to Tanimoto distance will
be extended in future patches; if interested in a specific
@@ -498,7 +497,7 @@ def graph_part(
"""
if sim_df is None:
sim_df = calculate_similarity(
- df, df, species=species,
+ df, df, data_type=data_type,
similarity_metric=similarity_metric,
field_name=field_name, threshold=threshold,
threads=threads, verbose=verbose,
diff --git a/hestia/similarity.py b/hestia/similarity.py
index 09f819b..61c2e2d 100644
--- a/hestia/similarity.py
+++ b/hestia/similarity.py
@@ -9,6 +9,7 @@
import pandas as pd
from tqdm import tqdm
import scipy.sparse as spr
+from concurrent.futures import ThreadPoolExecutor
def sim_df2mtx(sim_df: pd.DataFrame,
@@ -50,7 +51,7 @@ def sim_df2mtx(sim_df: pd.DataFrame,
def calculate_similarity(
df_query: pd.DataFrame,
df_target: pd.DataFrame = None,
- species: str = 'protein',
+ data_type: str = 'protein',
similarity_metric: str = 'mmseqs+prefilter',
field_name: str = 'sequence',
threshold: float = 0.3,
@@ -76,10 +77,10 @@ def calculate_similarity(
similarities. If not specified, the `df_query` will be used as `df_target`
as well, defaults to None
:type df_target: pd.DataFrame, optional
- :param species: Biochemical species to which the data belongs.
+ :param data_type: Biochemical data type to which the data belongs.
Options: `protein`, `DNA`, `RNA`, or `small_molecule`; defaults to
'protein'
- :type species: str, optional
+ :type data_type: str, optional
:param similarity_metric: Similarity function to use.
Options:
- `protein`: `mmseqs` (local alignment),
@@ -157,8 +158,8 @@ def calculate_similarity(
- "endextend": 0.5,
- "matrix": "EBLOSUM62"
:type config: dict, optional
- :raises NotImplementedError: Biochemical species is not supported
- see `species`.
+ :raises NotImplementedError: Biochemical data_type is not supported
+ see `data_type`.
:raises NotImplementedError: Similarity metric is not supported
see `similarity_algorithm`
:return: DataFrame with similarities (`metric`) between
@@ -168,10 +169,10 @@ def calculate_similarity(
:rtype: pd.DataFrame
"""
mssg = f'Alignment method: {similarity_metric} '
- mssg += f'not implemented for species: {species}'
- mssg2 = f'Species: {species} not supported'
+ mssg += f'not implemented for data_type: {data_type}'
+ mssg2 = f'data_type: {data_type} not supported'
- if species == 'protein':
+ if data_type == 'protein':
if 'mmseqs' in similarity_metric:
sim_df = _mmseqs2_alignment(
df_query=df_query,
@@ -215,9 +216,9 @@ def calculate_similarity(
)
else:
mssg = f'Alignment method: {similarity_metric} '
- mssg += f'not implemented for species: {species}'
+ mssg += f'not implemented for data_type: {data_type}'
raise NotImplementedError(mssg)
- elif species.upper() == 'DNA' or species.upper() == 'RNA':
+ elif data_type.upper() == 'DNA' or data_type.upper() == 'RNA':
if 'mmseqs' in similarity_metric:
sim_df = _mmseqs2_alignment(
df_query=df_query,
@@ -247,9 +248,9 @@ def calculate_similarity(
)
else:
mssg = f'Alignment method: {similarity_metric} '
- mssg += f'not implemented for species: {species}'
+ mssg += f'not implemented for data_type: {data_type}'
raise NotImplementedError(mssg)
- elif species == 'small_molecule' or species.lower() == 'smiles':
+ elif data_type == 'small_molecule' or data_type.lower() == 'smiles':
if similarity_metric == 'scaffold':
sim_df = _scaffold_alignment(
df_query=df_query,
@@ -274,6 +275,9 @@ def calculate_similarity(
save_alignment=save_alignment,
filename=filename
)
+ else:
+ mssg = f'Alignment method: {similarity_metric} '
+ mssg += f'not implemented for data_type: {data_type}'
+ raise NotImplementedError(mssg)
else:
raise NotImplementedError(mssg2)
return sim_df
@@ -388,10 +392,6 @@ def _fingerprint_alignment(
filename: str = None,
**kwargs
) -> Union[pd.DataFrame, np.ndarray]:
- # Threshold for similarity evaluation: 0.85, based on:
- # Patterson DE, Cramer RD, Ferguson AM, Clark RD, Weinberger LE:
- # Neighborhood behavior: A useful concept for validation of ''molecular
- # diversity'' descriptors. J Med Chem 1996, 39:3049-3059.
"""_summary_
:param df_query: _description_
@@ -427,9 +427,14 @@ def _fingerprint_alignment(
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
+ from tqdm.contrib.concurrent import thread_map
except ModuleNotFoundError:
raise ImportError("This function requires RDKit to be installed.")
- from concurrent.futures import ThreadPoolExecutor
+
+ def _get_fp(smile: str):
+ mol = Chem.MolFromSmiles(smile)
+ fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, bits)
+ return fp
def _compute_tanimoto(query_fp: list, target_fps: list):
scores = DataStructs.BulkTanimotoSimilarity(query_fp, target_fps)
@@ -443,43 +448,50 @@ def _compute_tanimoto(query_fp: list, target_fps: list):
if verbose > 0:
print(f'Calculating molecular similarities using ECFP-{radius * 2}',
f'with {bits:,} bits and Tanimoto distance...')
+ query_fps = thread_map(_get_fp, df_query[field_name], max_workers=threads)
if df_target is None:
df_target = df_query
+ target_fps = query_fps
+ else:
+ target_fps = thread_map(_get_fp, df_target[field_name],
+ max_workers=threads)
- mols_query = [Chem.MolFromSmiles(smiles) for smiles in df_query[field_name]]
- mols_target = [Chem.MolFromSmiles(smiles) for smiles in df_target[field_name]]
- fps_query = [AllChem.GetMorganFingerprintAsBitVect(x, radius, bits)
- for x in mols_query]
- fps_target = [AllChem.GetMorganFingerprintAsBitVect(x, radius, bits)
- for x in mols_target]
- jobs = []
- with ThreadPoolExecutor(max_workers=threads) as executor:
- for query_fp in fps_query:
- job = executor.submit(_compute_tanimoto, query_fp, fps_target)
- jobs.append(job)
+ chunk_size = threads * 1_000
+ chunks_target = (len(df_target) // chunk_size) + 1
+ queries, targets, metrics = [], [], []
+ pbar = tqdm(range(len(query_fps)))
- if verbose > 1:
- pbar = tqdm(jobs)
- else:
- pbar = jobs
+ with ThreadPoolExecutor(max_workers=threads) as executor:
+ for chunk in pbar:
+ jobs = []
+ for chunk_t in range(chunks_target):
+ start_t = chunk_t * chunk_size
+ if chunk_t == chunks_target - 1:
+ end_t = len(target_fps)
+ else:
+ end_t = (chunk_t + 1) * chunk_size
+ chunk_fps = target_fps[start_t:end_t]
+ query_fp = query_fps[chunk]
+ job = executor.submit(_compute_tanimoto, query_fp, chunk_fps)
+ jobs.append(job)
- proto_df = []
- for idx, job in enumerate(pbar):
- if job.exception() is not None:
- raise RuntimeError(job.exception())
- result = job.result()
- entry = [{'query': idx, 'target': idx_target, 'metric': metric}
- for idx_target, metric in enumerate(result)]
- proto_df.extend(entry)
+ for idx, job in enumerate(jobs):
+ if job.exception() is not None:
+ raise RuntimeError(job.exception())
+ result = job.result()
+ for idx_target, metric in enumerate(result):
+ if metric < threshold:
+ continue
+ queries.append(int(chunk))
+ targets.append(int((idx * chunk_size) + idx_target))
+ metrics.append(metric)
+ df = pd.DataFrame({'query': queries, 'target': targets, 'metric': metrics})
- df = pd.DataFrame(proto_df)
if save_alignment:
if filename is None:
filename = time.time()
df.to_csv(f'{filename}.csv.gz', index=False, compression='gzip')
-
- # df = df[df.metric >= threshold]
return df
diff --git a/hestia/utils/dataset_utils.py b/hestia/utils/dataset_utils.py
new file mode 100644
index 0000000..e15f255
--- /dev/null
+++ b/hestia/utils/dataset_utils.py
@@ -0,0 +1,16 @@
+import pandas as pd
+from torch.utils.data import Dataset
+
+
+class Dataset_from_pandas(Dataset):
+ def __init__(self, dataframe: pd.DataFrame):
+ self.dataframe = dataframe
+
+ def __getitem__(self, index):
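+ # Assumes the first column holds the label and the remaining columns the features.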
+ row = self.dataframe.iloc[index].to_numpy()
+ features = row[1:]
+ label = row[0]
+ return features, label
+
+ def __len__(self):
+ return len(self.dataframe)
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 499ffa1..9428c6f 100644
--- a/setup.py
+++ b/setup.py
@@ -15,7 +15,6 @@
'scikit-learn',
'pandas',
'numpy',
- 'rdkit',
'tqdm'
]
@@ -48,6 +47,6 @@
test_suite='tests',
tests_require=test_requirements,
url='https://github.com/IBM/Hestia-OOD',
- version='0.0.3',
+ version='0.0.7',
zip_safe=False,
)