diff --git a/.gitignore b/.gitignore index 38cd292..fe105ba 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,3 @@ __pycache__/ build/ -hestia.egg-info/ \ No newline at end of file +*.egg-info/ diff --git a/README.md b/README.md index a83a8fc..98034e0 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,151 @@ -# Hestia -Independent evaluation set construction for trustworthy ML models in biochemistry +
+# Hestia
+
+Computational tool for generating generalisation-evaluating evaluation sets.
+
+- **Documentation:** https://ibm.github.io/Hestia-OOD
+- **Source Code:** https://github.com/IBM/Hestia-OOD
+- **Webserver:** http://peptide.ucd.ie/Hestia
+- **Paper Pre-print:** https://www.biorxiv.org/content/10.1101/2024.03.14.584508v1
+
+## Contents
+
+- [Installation Guide](#installation)
+- [Documentation](#documentation)
+- [Examples](#examples)
+- [License](#license)
+
+## Installation
+
+Installing in a conda environment is recommended. To create the environment, please run:
+
+```bash
+conda create -n hestia python
+conda activate hestia
+```
+
+### 1. Python Package
+
+#### 1.1. From PyPI
+
+```bash
+pip install hestia-ood
+```
+
+#### 1.2. Directly from source
+
+```bash
+pip install git+https://github.com/IBM/Hestia-OOD
+```
+
+### 2. Third-party dependencies
+
+To use MMSeqs as the alignment algorithm, it needs to be installed in the environment:
+
+```bash
+conda install -c bioconda mmseqs2
+```
+
+To use Needleman-Wunsch:
+
+```bash
+conda install -c bioconda emboss
+```
+
+If not installing in a conda environment, please check the installation instructions for your particular platform:
+
+- Linux:
+  ```bash
+  wget https://mmseqs.com/latest/mmseqs-linux-avx2.tar.gz
+  tar xvfz mmseqs-linux-avx2.tar.gz
+  export PATH=$(pwd)/mmseqs/bin/:$PATH
+  ```
+
+  ```bash
+  sudo apt install emboss
+  ```
+
+- Windows: Download binaries from [EMBOSS](https://emboss.sourceforge.net/download/) and [MMSeqs2-latest](https://mmseqs.com/latest/mmseqs-win64.zip)
+
+- Mac:
+  ```bash
+  sudo port install emboss
+  brew install mmseqs2
+  ```
+
+## Documentation
+
+### 1. Similarity calculation
+
+Calculating pairwise similarity between the entities within a DataFrame `df_query`, or between two DataFrames `df_query` and `df_target`, can be achieved through the `calculate_similarity` function:
+
+```python
+from hestia.similarity import calculate_similarity
+import pandas as pd
+
+df_query = pd.read_csv('example.csv')
+
+# The CSV file needs to have a column describing the entities, e.g., their sequence, their SMILES, or a path to their PDB structure.
+# This column corresponds to `field_name` in the function.
+
+sim_df = calculate_similarity(df_query, data_type='protein', similarity_metric='mmseqs+prefilter',
+                              field_name='sequence')
+```
+
+More details about similarity calculation can be found in the [Similarity calculation documentation](https://ibm.github.io/Hestia-OOD/similarity/).
+
+### 2. Clustering
+
+Clustering the entities within a DataFrame `df` can be achieved through the `generate_clusters` function:
+
+```python
+from hestia.similarity import calculate_similarity
+from hestia.clustering import generate_clusters
+import pandas as pd
+
+df = pd.read_csv('example.csv')
+sim_df = calculate_similarity(df, data_type='protein', similarity_metric='mmseqs+prefilter',
+                              field_name='sequence')
+clusters_df = generate_clusters(df, field_name='sequence', sim_df=sim_df,
+                                cluster_algorithms='CDHIT')
+```
+
+There are three clustering algorithms currently supported: `CDHIT`, `greedy_cover_set`, and `connected_components`. More details about clustering can be found in the [Clustering documentation](https://ibm.github.io/Hestia-OOD/clustering/).
+
+### 3. Partitioning
+
+Partitioning the entities within a DataFrame `df` into training and evaluation subsets can be achieved through four different functions: `ccpart`, `graph_part`, `reduction_partition`, and `random_partition`. An example of how `ccpart` would be used is:
+
+```python
+from hestia.partition import ccpart
+import pandas as pd
+
+df = pd.read_csv('example.csv')
+train, test = ccpart(df, data_type='protein', similarity_metric='mmseqs+prefilter',
+                     field_name='sequence', threshold=0.3, test_size=0.2)
+
+train_df = df.iloc[train, :]
+test_df = df.iloc[test, :]
+```
+
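+### 4. Dataset generation
+
+The `HestiaDatasetGenerator` class (in `hestia/dataset_generator.py`) chains similarity calculation, partitioning across a range of thresholds, and dataset export. The snippet below is a minimal sketch based on the example at the bottom of that module; the input file `example.csv`, its `sequence` column, and the chosen thresholds are illustrative:
+
+```python
+from hestia.dataset_generator import HestiaDatasetGenerator, SimilarityArguments
+import pandas as pd
+
+# 'example.csv' with a 'sequence' column is a placeholder input.
+df = pd.read_csv('example.csv')
+generator = HestiaDatasetGenerator(df)
+
+# Describe how pairwise similarity should be computed (here: protein sequences with MMSeqs2).
+args = SimilarityArguments(data_type='protein', field_name='sequence',
+                           similarity_metric='mmseqs+prefilter')
+generator.calculate_similarity(args)
+
+# Partition at thresholds 0.3, 0.4, ..., 0.9, plus a random split for reference.
+generator.calculate_partitions(label_name=None, min_threshold=0.3, threshold_step=0.1,
+                               test_size=0.2, valid_size=0.1)
+
+# Export one threshold, e.g., as HuggingFace datasets (requires `pip install datasets`).
+ds = generator.generate_datasets('huggingface', threshold=0.3)
+train, valid, test = ds['train'], ds['valid'], ds['test']
+
+# Partitions can be saved and reloaded to avoid recomputing them.
+generator.save_precalculated('precalculated_partitions.gz')
+```
+
+Performance across thresholds can then be summarised with `HestiaDatasetGenerator.calculate_auspc` or visualised with `HestiaDatasetGenerator.plot_spc`.
+
+License
+-------
+Hestia is open-source software licensed under the MIT License.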
Check the details in the [LICENSE](https://github.com/IBM/Hestia/blob/master/LICENSE) file. + diff --git a/hestia/clustering.py b/hestia/clustering.py index 3f6fffe..1fa14a9 100644 --- a/hestia/clustering.py +++ b/hestia/clustering.py @@ -2,6 +2,7 @@ import pandas as pd from scipy.sparse.csgraph import connected_components +from tqdm import tqdm from hestia.similarity import sim_df2mtx @@ -78,16 +79,20 @@ def _greedy_incremental_clustering( clustered = set() sim_df = sim_df[sim_df['metric'] > threshold] - for i in df.index: - in_cluster = set(sim_df.loc[sim_df['query'] == i, 'target']) - in_cluster.update(set(sim_df.loc[sim_df['target'] == i, 'query'])) + if verbose > 2: + pbar = tqdm(df.index) + else: + pbar = df.index + for i in pbar: if i in clustered: continue + in_cluster = set(sim_df.loc[sim_df['query'] == i, 'target']) + in_cluster.update(set(sim_df.loc[sim_df['target'] == i, 'query'])) + in_cluster.update(set([i])) + in_cluster = in_cluster.difference(clustered) for j in in_cluster: - if i == j: - continue clusters.append({ 'cluster': i, 'member': j @@ -99,7 +104,7 @@ def _greedy_incremental_clustering( if verbose > 1: print('Clustering has generated:', f'{len(cluster_df.cluster.unique()):,d} clusters for', - f'{len(df):,} entities') + f'{len(cluster_df):,} entities') return cluster_df @@ -111,7 +116,7 @@ def _greedy_cover_set( ) -> pd.DataFrame: def _find_connectivity(df, sim_df): neighbours = [] - for i in df.index: + for i in tqdm(df.index): in_cluster = set(sim_df.loc[sim_df['query'] == i, 'target']) in_cluster.update(set(sim_df.loc[sim_df['target'] == i, 'query'])) neighbours.append(in_cluster) @@ -124,15 +129,19 @@ def _find_connectivity(df, sim_df): clusters = [] clustered = set() + if verbose > 2: + pbar = tqdm(df.index) + else: + pbar = df.index - for i in df.index: - in_cluster = neighbours.pop(0) - + for i in pbar: if i in clustered: continue + in_cluster = neighbours.pop(0) + in_cluster.update([i]) + in_cluster = in_cluster.difference(clustered) + for j in in_cluster: - if i == j: - continue clusters.append({ 'cluster': i, 'member': j @@ -144,7 +153,7 @@ def _find_connectivity(df, sim_df): if verbose > 1: print('Clustering has generated:', f'{len(cluster_df.cluster.unique()):,d} clusters for', - f'{len(df):,} entities') + f'{len(cluster_df):,} entities') return cluster_df @@ -158,7 +167,7 @@ def _connected_components_clustering( n, labels = connected_components(matrix, directed=False, return_labels=True) cluster_df = [{'cluster': labels[i], - 'member': i} for i in df.index] + 'member': i} for i in range(labels.shape[0])] if verbose > 0: print('Clustering has generated:', f'{n:,d} connected componentes for', diff --git a/hestia/dataset_generator.py b/hestia/dataset_generator.py new file mode 100644 index 0000000..45f27db --- /dev/null +++ b/hestia/dataset_generator.py @@ -0,0 +1,238 @@ +import gzip +import json +from multiprocessing import cpu_count + +import pandas as pd +from sklearn.metrics import auc +from tqdm import tqdm + +from hestia.similarity import calculate_similarity +from hestia.partition import random_partition, ccpart, graph_part + + +class SimilarityArguments: + def __init__( + self, + data_type: str = 'protein', + similarity_metric: str = 'mmseqs+prefilter', + field_name: str = 'sequence', + min_threshold: float = 0.25, + threads: int = cpu_count(), + verbose: int = 0, + save_alignment: bool = False, + filename: str = 'alignment', + distance: str = 'tanimoto', + bits: str = 1024, + radius: int = 2, + denominator: str = 'shortest', + representation: str 
= '3di+aa', + config: dict = { + "gapopen": 10, + "gapextend": 0.5, + "endweight": True, + "endopen": 10, + "endextend": 0.5, + "matrix": "EBLOSUM62" + } + ): + self.data_type = data_type + self.similarity_metric = similarity_metric + self.field_name = field_name + self.min_threshold = min_threshold + self.threads = threads + self.verbose = verbose + self.save_alignment = save_alignment + self.filename = filename + self.distance = distance + self.bits = bits + self.radius = radius + self.denominator = denominator + self.representation = representation + self.config = config + + +class HestiaDatasetGenerator: + def __init__(self, data: pd.DataFrame): + self.data = data + self.sim_df = None + self.partitions = None + print('Initialising Hestia Dataset Generator') + print(f'Number of items in data: {len(self.data)}') + + def from_precalculated(self, data_path: str): + with gzip.open(data_path, 'r') as fin: + self.partitions = json.loads(fin.read().decode('utf-8')) + new_dict = {} + for key, value in self.partitions.items(): + if key != 'random': + new_dict[float(key)] = value + else: + new_dict[key] = value + self.partitions = new_dict + + def save_precalculated(self, output_path: str): + with gzip.open(output_path, 'w') as fout: + fout.write(json.dumps(self.partitions).encode('utf-8')) + + def calculate_similarity(self, similarity_args: SimilarityArguments): + print('Calculating similarity...') + self.sim_df = calculate_similarity( + self.data, self.data, data_type=similarity_args.data_type, + similarity_metric=similarity_args.similarity_metric, + field_name=similarity_args.field_name, + threshold=similarity_args.min_threshold, + threads=similarity_args.threads, + verbose=similarity_args.verbose, + save_alignment=similarity_args.save_alignment, + filename=similarity_args.filename, + distance=similarity_args.distance, + bits=similarity_args.bits, + radius=similarity_args.radius, + denominator=similarity_args.denominator, + representation=similarity_args.representation, + config=similarity_args.config + ) + print('Similarity successfully calculated!') + + def load_similarity(self, output_path: str): + print('Loading precalculated similarity...') + self.sim_df = pd.read_csv(output_path, compression='gzip') + print('Precalculated similarity loaded successfully!') + + def calculate_partitions( + self, + label_name: str = None, + min_threshold: float = 0.3, + threshold_step: float = 0.1, + test_size: float = 0.2, + valid_size: float = 0.1, + partition_algorithm: str = 'ccpart', + random_state: int = 42, + similarity_args: SimilarityArguments = SimilarityArguments() + ): + print('Calculating partitions...') + self.partitions = {} + if self.sim_df is None: + self.calculate_similarity(similarity_args) + if partition_algorithm == 'ccpart': + partition_algorithm = ccpart + elif partition_algorithm == 'graph_part': + partition_algorithm = graph_part + else: + raise ValueError( + f'Partition algorithm: {partition_algorithm} is not ' + + 'supported. Try using: `ccpart` or `graph_part`.' 
+ ) + min_threshold = int(min_threshold * 100) + threshold_step = int(threshold_step * 100) + + for th in tqdm(range(min_threshold, 100, threshold_step)): + th_parts = partition_algorithm( + self.data, + label_name=label_name, test_size=test_size, + valid_size=valid_size, threshold=th / 100, + sim_df=self.sim_df + ) + train_th_parts = random_partition( + self.data.iloc[th_parts[0]].reset_index(drop=True), + test_size=valid_size, random_state=random_state + ) + self.partitions[th / 100] = { + 'train': train_th_parts[0], + 'valid': train_th_parts[1], + 'test': th_parts[1] + } + random = random_partition(self.data, test_size=test_size, + random_state=random_state) + train_random = random_partition( + self.data.iloc[random[0]].reset_index(drop=True), + test_size=valid_size, random_state=random_state + ) + self.partitions['random'] = { + 'train': train_random[0], + 'valid': train_random[1], + 'test': random[1] + } + print('Partitions successfully calculated!') + + def generate_datasets(self, dataset_type: str, threshold: float) -> dict: + ds = {} + + if dataset_type == 'huggingface' or dataset_type == 'hf': + try: + import datasets + except ImportError: + raise ImportError( + f"This dataset_type: {dataset_type} requires `datasets` " + + "to be installed. Install using: `pip install datasets`" + ) + for key, value in self.partitions[threshold].items(): + ds[key] = datasets.Dataset.from_pandas( + self.data.iloc[value].reset_index() + ) + return ds + elif dataset_type == 'pytorch' or dataset_type == 'torch': + try: + from hestia.utils.dataset_utils import Dataset_from_pandas + except ModuleNotFoundError: + raise ImportError( + f"This dataset_type: {dataset_type} requires `torch` " + + "to be installed. Install using: `pip install torch`" + ) + for key, value in self.partitions[threshold].items(): + ds[key] = Dataset_from_pandas( + self.data.iloc[value].reset_index() + ) + return ds + + @staticmethod + def calculate_auspc(results: dict, metric: str): + x, y = [], [] + for key, value in results.items(): + if key == 'random': + continue + x.append(key) + y.append(results['random'][metric] - value[metric]) + return auc(x, y) + + @staticmethod + def plot_spc(results: dict, metric: str): + import matplotlib.pyplot as plt + x, y = [], [] + for key, value in results.items(): + if key == 'random': + continue + x.append(key) + y.append(value[metric]) + plt.scatter(x, y) + plt.plot(x, [results['random'][metric] for _ in range(len(x))], 'r') + plt.ylabel(f'Performance: {metric}') + plt.xlabel(f'Threshold similarity') + plt.legend(['SP', 'Random']) + plt.ylim(0, 1.1) + plt.show() + + +if __name__ == '__main__': + df = pd.read_csv('dili.tab', sep='\t') + generator = HestiaDatasetGenerator(df) + args = SimilarityArguments( + data_type='small_molecule', field_name='Drug', + similarity_metric='fingerprint', verbose=3, + save_alignment=True + ) + generator.calculate_similarity(args) + generator.load_similarity(args.filename + '.csv.gz') + generator.calculate_partitions('Y', min_threshold=0.3, + threshold_step=0.05, + test_size=0.2, valid_size=0.1) + generator.save_precalculated('precalculated_partitions.gz') + generator.from_precalculated('precalculated_partitions.gz') + ds = generator.generate_datasets('torch', 0.35) + trial = { + th / 100: {'acc': 0.9 + (0.001 * th)} + for th in range(30, 100, 5) + } + trial.update({'random': {'acc': 1}}) + print(generator.calculate_auspc(trial, 'acc')) + HestiaDatasetGenerator.plot_spc(trial, 'acc') diff --git a/hestia/partition.py b/hestia/partition.py index ae42e7d..7121b96 
100644 --- a/hestia/partition.py +++ b/hestia/partition.py @@ -43,18 +43,18 @@ def random_partition( return train_df, test_df -def connected_components_partition( +def ccpart( df: pd.DataFrame, - similarity_metric: str, - field_name: str, - label_name: str, + similarity_metric: str = None, + field_name: str = None, + label_name: str = None, threads: int = cpu_count(), denominator: str = None, test_size: float = 0.2, valid_size: float = 0.0, threshold: float = 0.3, verbose: int = 0, - species: str = 'protein', + data_type: str = 'protein', distance: str = 'tanimoto', representation: str = '3di+aa', bits: int = 1024, @@ -62,7 +62,7 @@ def connected_components_partition( config: dict = None, sim_df: Optional[pd.DataFrame] = None ) -> Union[Tuple[list, list], Tuple[list, list, list]]: - """Use connected components partitioning algorithm + """Use CCPart algorithm to generate training and evaluation subsets that maximise the dissimilarity between their entities. @@ -120,10 +120,10 @@ def connected_components_partition( - 2: All Defaults to 0 :type verbose: int, optional - :param species: Biochemical species to which the data belongs. + :param data_type: Biochemical data_type to which the data belongs. Options: `protein`, `DNA`, `RNA`, or `small_molecule`; defaults to 'protein' - :type species: str, optional + :type data_type: str, optional :param distance: Distance metrics for small molecule comparison. Currently, it is restricted to Tanimoto distance will be extended in future patches; if interested in a specific @@ -175,7 +175,7 @@ def connected_components_partition( if sim_df is None: sim_df = calculate_similarity( - df, df, species=species, + df, df, data_type=data_type, similarity_metric=similarity_metric, field_name=field_name, threshold=threshold, threads=threads, verbose=verbose, @@ -183,7 +183,6 @@ def connected_components_partition( bits=bits, denominator=denominator, radius=radius, representation=representation, config=config ) - cluster_df = generate_clusters(df, field_name=field_name, threshold=threshold, verbose=verbose, @@ -251,7 +250,7 @@ def reduction_partition( valid_size: float = 0.0, threshold: float = 0.3, verbose: int = 2, - species: str = 'protein', + data_type: str = 'protein', representation: str = '3di+aa', random_state: int = 42, bits: int = 1024, @@ -318,10 +317,10 @@ def reduction_partition( - 2: All Defaults to 0 :type verbose: int, optional - :param species: Biochemical species to which the data belongs. + :param data_type: Biochemical data_type to which the data belongs. Options: `protein`, `DNA`, `RNA`, or `small_molecule`; defaults to 'protein' - :type species: str, optional + :type data_type: str, optional :param distance: Distance metrics for small molecule comparison. 
Currently, it is restricted to Tanimoto distance will be extended in future patches; if interested in a specific @@ -362,7 +361,7 @@ def reduction_partition( """ df = similarity_reduction(df, similarity_metric, field_name, threads, clustering_mode, denominator, - test_size, threshold, verbose, species, + test_size, threshold, verbose, data_type, representation, bits, radius, sim_df, config) train, test = random_partition(df.index.tolist(), test_size=test_size, @@ -388,7 +387,7 @@ def graph_part( threshold: float = 0.3, verbose: int = 2, n_parts: int = 10, - species: str = 'protein', + data_type: str = 'protein', distance: str = 'tanimoto', representation: str = '3di+aa', bits: int = 1024, @@ -454,10 +453,10 @@ def graph_part( - 2: All Defaults to 0 :type verbose: int, optional - :param species: Biochemical species to which the data belongs. + :param data_type: Biochemical data_type to which the data belongs. Options: `protein`, `DNA`, `RNA`, or `small_molecule`; defaults to 'protein' - :type species: str, optional + :type data_type: str, optional :param distance: Distance metrics for small molecule comparison. Currently, it is restricted to Tanimoto distance will be extended in future patches; if interested in a specific @@ -498,7 +497,7 @@ def graph_part( """ if sim_df is None: sim_df = calculate_similarity( - df, df, species=species, + df, df, data_type=data_type, similarity_metric=similarity_metric, field_name=field_name, threshold=threshold, threads=threads, verbose=verbose, diff --git a/hestia/similarity.py b/hestia/similarity.py index 09f819b..61c2e2d 100644 --- a/hestia/similarity.py +++ b/hestia/similarity.py @@ -9,6 +9,7 @@ import pandas as pd from tqdm import tqdm import scipy.sparse as spr +from concurrent.futures import ThreadPoolExecutor def sim_df2mtx(sim_df: pd.DataFrame, @@ -50,7 +51,7 @@ def sim_df2mtx(sim_df: pd.DataFrame, def calculate_similarity( df_query: pd.DataFrame, df_target: pd.DataFrame = None, - species: str = 'protein', + data_type: str = 'protein', similarity_metric: str = 'mmseqs+prefilter', field_name: str = 'sequence', threshold: float = 0.3, @@ -76,10 +77,10 @@ def calculate_similarity( similarities. If not specified, the `df_query` will be used as `df_target` as well, defaults to None :type df_target: pd.DataFrame, optional - :param species: Biochemical species to which the data belongs. + :param data_type: Biochemical data_type to which the data belongs. Options: `protein`, `DNA`, `RNA`, or `small_molecule`; defaults to 'protein' - :type species: str, optional + :type data_type: str, optional :param similarity_metric: Similarity function to use. Options: - `protein`: `mmseqs` (local alignment), @@ -157,8 +158,8 @@ def calculate_similarity( - "endextend": 0.5, - "matrix": "EBLOSUM62" :type config: dict, optional - :raises NotImplementedError: Biochemical species is not supported - see `species`. + :raises NotImplementedError: Biochemical data_type is not supported + see `data_type`. 
:raises NotImplementedError: Similarity metric is not supported see `similarity_algorithm` :return: DataFrame with similarities (`metric`) between @@ -168,10 +169,10 @@ def calculate_similarity( :rtype: pd.DataFrame """ mssg = f'Alignment method: {similarity_metric} ' - mssg += f'not implemented for species: {species}' - mssg2 = f'Species: {species} not supported' + mssg += f'not implemented for data_type: {data_type}' + mssg2 = f'data_type: {data_type} not supported' - if species == 'protein': + if data_type == 'protein': if 'mmseqs' in similarity_metric: sim_df = _mmseqs2_alignment( df_query=df_query, @@ -215,9 +216,9 @@ def calculate_similarity( ) else: mssg = f'Alignment method: {similarity_metric} ' - mssg += f'not implemented for species: {species}' + mssg += f'not implemented for data_type: {data_type}' raise NotImplementedError(mssg) - elif species.upper() == 'DNA' or species.upper() == 'RNA': + elif data_type.upper() == 'DNA' or data_type.upper() == 'RNA': if 'mmseqs' in similarity_metric: sim_df = _mmseqs2_alignment( df_query=df_query, @@ -247,9 +248,9 @@ def calculate_similarity( ) else: mssg = f'Alignment method: {similarity_metric} ' - mssg += f'not implemented for species: {species}' + mssg += f'not implemented for data_type: {data_type}' raise NotImplementedError(mssg) - elif species == 'small_molecule' or species.lower() == 'smiles': + elif data_type == 'small_molecule' or data_type.lower() == 'smiles': if similarity_metric == 'scaffold': sim_df = _scaffold_alignment( df_query=df_query, @@ -274,6 +275,9 @@ def calculate_similarity( save_alignment=save_alignment, filename=filename ) + else: + mssg = f'Alignment method: {similarity_metric} ' + mssg += f'not implemented for data_type: {data_type}' else: raise NotImplementedError(mssg2) return sim_df @@ -388,10 +392,6 @@ def _fingerprint_alignment( filename: str = None, **kwargs ) -> Union[pd.DataFrame, np.ndarray]: - # Threshold for similarity evaluation: 0.85, based on: - # Patterson DE, Cramer RD, Ferguson AM, Clark RD, Weinberger LE: - # Neighborhood behavior: A useful concept for validation of ''molecular - # diversity'' descriptors. J Med Chem 1996, 39:3049-3059. 
"""_summary_ :param df_query: _description_ @@ -427,9 +427,14 @@ def _fingerprint_alignment( from rdkit import Chem from rdkit import DataStructs from rdkit.Chem import AllChem + from tqdm.contrib.concurrent import thread_map except ModuleNotFoundError: raise ImportError("This function requires RDKit to be installed.") - from concurrent.futures import ThreadPoolExecutor + + def _get_fp(smile: str): + mol = Chem.MolFromSmiles(smile) + fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, bits) + return fp def _compute_tanimoto(query_fp: list, target_fps: list): scores = DataStructs.BulkTanimotoSimilarity(query_fp, target_fps) @@ -443,43 +448,50 @@ def _compute_tanimoto(query_fp: list, target_fps: list): if verbose > 0: print(f'Calculating molecular similarities using ECFP-{radius * 2}', f'with {bits:,} bits and Tanimoto distance...') + query_fps = thread_map(_get_fp, df_query[field_name], max_workers=threads) if df_target is None: df_target = df_query + target_fps = query_fps + else: + target_fps = thread_map(_get_fp, df_query[field_name], + max_workers=threads) - mols_query = [Chem.MolFromSmiles(smiles) for smiles in df_query[field_name]] - mols_target = [Chem.MolFromSmiles(smiles) for smiles in df_target[field_name]] - fps_query = [AllChem.GetMorganFingerprintAsBitVect(x, radius, bits) - for x in mols_query] - fps_target = [AllChem.GetMorganFingerprintAsBitVect(x, radius, bits) - for x in mols_target] - jobs = [] - with ThreadPoolExecutor(max_workers=threads) as executor: - for query_fp in fps_query: - job = executor.submit(_compute_tanimoto, query_fp, fps_target) - jobs.append(job) + chunk_size = threads * 1_000 + chunks_target = (len(df_target) // chunk_size) + 1 + queries, targets, metrics = [], [], [] + pbar = tqdm(range(len(query_fps))) - if verbose > 1: - pbar = tqdm(jobs) - else: - pbar = jobs + with ThreadPoolExecutor(max_workers=threads) as executor: + for chunk in pbar: + jobs = [] + for chunk_t in range(chunks_target): + start_t = chunk_t * chunk_size + if chunk_t == chunks_target - 1: + end_t = -1 + else: + end_t = (chunk_t + 1) * chunk_size + chunk_fps = target_fps[start_t:end_t] + query_fp = query_fps[chunk] + job = executor.submit(_compute_tanimoto, query_fp, chunk_fps) + jobs.append(job) - proto_df = [] - for idx, job in enumerate(pbar): - if job.exception() is not None: - raise RuntimeError(job.exception()) - result = job.result() - entry = [{'query': idx, 'target': idx_target, 'metric': metric} - for idx_target, metric in enumerate(result)] - proto_df.extend(entry) + for idx, job in enumerate(jobs): + if job.exception() is not None: + raise RuntimeError(job.exception()) + result = job.result() + for idx_target, metric in enumerate(result): + if metric < threshold: + continue + queries.append(int(chunk)) + targets.append(int((idx * chunk_size) + idx_target)) + metrics.append(metric) + df = pd.DataFrame({'query': queries, 'target': targets, 'metric': metrics}) - df = pd.DataFrame(proto_df) if save_alignment: if filename is None: filename = time.time() df.to_csv(f'{filename}.csv.gz', index=False, compression='gzip') - - # df = df[df.metric >= threshold] return df diff --git a/hestia/utils/dataset_utils.py b/hestia/utils/dataset_utils.py new file mode 100644 index 0000000..e15f255 --- /dev/null +++ b/hestia/utils/dataset_utils.py @@ -0,0 +1,16 @@ +import pandas as pd +from torch.utils.data import Dataset + + +class Dataset_from_pandas(Dataset): + def __init__(self, dataframe: pd.DataFrame): + self.dataframe = dataframe + + def __getitem__(self, index): + row = 
self.dataframe.iloc[index].to_numpy() + features = row[1:] + label = row[0] + return features, label + + def __len__(self): + return len(self.dataframe) \ No newline at end of file diff --git a/setup.py b/setup.py index 499ffa1..9428c6f 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,6 @@ 'scikit-learn', 'pandas', 'numpy', - 'rdkit', 'tqdm' ] @@ -48,6 +47,6 @@ test_suite='tests', tests_require=test_requirements, url='https://github.com/IBM/Hestia-OOD', - version='0.0.3', + version='0.0.7', zip_safe=False, )