Merge pull request #13 from IBM/v.0.0.17
Update V.0.0.17
RaulFD-creator authored Aug 2, 2024
2 parents 56c8390 + 970217a commit 43d8fd1
Showing 4 changed files with 61 additions and 26 deletions.
14 changes: 12 additions & 2 deletions README.md
@@ -49,7 +49,17 @@ pip install hestia-ood
pip install git+https://github.com/IBM/Hestia-OOD
```

### 3. Third-party dependencies
### 3. Optional dependencies

#### 3.1. Molecular similarity

RDKit is required for calculating molecular similarities:

```bash
pip install rdkit
```
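
For illustration, a minimal sketch of the fingerprint-based Tanimoto similarity that RDKit enables (the example SMILES are chosen here for brevity; this is not Hestia's internal code):

```python
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

# Morgan fingerprints (radius 2, 2048 bits) for two small molecules
mol_a = Chem.MolFromSmiles('CCO')  # ethanol
mol_b = Chem.MolFromSmiles('CCN')  # ethylamine
fp_a = AllChem.GetMorganFingerprintAsBitVect(mol_a, 2, nBits=2048)
fp_b = AllChem.GetMorganFingerprintAsBitVect(mol_b, 2, nBits=2048)

print(DataStructs.TanimotoSimilarity(fp_a, fp_b))
```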

#### 3.2. Sequence alignment

To use MMSeqs as the alignment algorithm, it must be installed in the environment:
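
One widely used option (an assumption about the preferred channel; the repository's own instructions follow in the lines elided from this diff) is the bioconda package:

```bash
conda install -c bioconda mmseqs2
```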

@@ -167,7 +177,7 @@ df = pd.read_csv('example.csv')
sim_df = calculate_similarity(df, species='protein', similarity_metric='mmseqs+prefilter',
field_name='sequence')
clusters_df = generate_clusters(df, field_name='sequence', sim_df=sim_df,
cluster_algorithms='CDHIT')
cluster_algorithm='CDHIT')
```

There are three clustering algorithms currently supported: `CDHIT`, `greedy_cover_set`, and `connected_components`. More details about clustering can be found in the [Clustering documentation](https://ibm.github.io/Hestia-OOD/clustering/).
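
A quick sketch of switching algorithms, reusing `df` and `sim_df` from the snippet above (illustrative only):

```python
# Same call as above; only the cluster_algorithm value changes.
clusters_df = generate_clusters(df, field_name='sequence', sim_df=sim_df,
                                cluster_algorithm='connected_components')
```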
64 changes: 42 additions & 22 deletions hestia/dataset_generator.py
@@ -1,7 +1,7 @@
import gzip
import json
from multiprocessing import cpu_count
from typing import Callable, Union
from typing import Callable, Optional, Union

import numpy as np
import pandas as pd
@@ -237,7 +237,8 @@ def calculate_partitions(
valid_size: float = 0.1,
partition_algorithm: str = 'ccpart',
random_state: int = 42,
similarity_args: SimilarityArguments = SimilarityArguments()
similarity_args: SimilarityArguments = SimilarityArguments(),
n_partitions: Optional[int] = None
):
"""Calculate partitions
@@ -266,18 +267,16 @@ def calculate_partitions(
:type random_state: int, optional
:param similarity_args: See similarity arguments entry, defaults to SimilarityArguments()
:type similarity_args: SimilarityArguments, optional
:param n_partitions: Number of partitions to generate; only supported by the graph_part partitioning algorithm, defaults to None
:type n_partitions: int, optional
:raises ValueError: Partitioning algorithm not supported.
"""
self.partitions = {}
if self.sim_df is None:
self.calculate_similarity(similarity_args)
print('Calculating partitions...')

if partition_algorithm == 'ccpart':
partition_algorithm = ccpart
elif partition_algorithm == 'graph_part':
partition_algorithm = graph_part
else:
if partition_algorithm not in ['ccpart', 'graph_part']:
raise ValueError(
f'Partition algorithm: {partition_algorithm} is not ' +
'supported. Try using: `ccpart` or `graph_part`.'
@@ -286,21 +285,42 @@
threshold_step = int(threshold_step * 100)

for th in tqdm(range(min_threshold, 100, threshold_step)):
th_parts = partition_algorithm(
self.data,
label_name=label_name, test_size=test_size,
threshold=th / 100,
sim_df=self.sim_df, verbose=2
)
train_th_parts = random_partition(
self.data.iloc[th_parts[0]].reset_index(drop=True),
test_size=valid_size, random_state=random_state
)
self.partitions[th / 100] = {
'train': train_th_parts[0],
'valid': train_th_parts[1],
'test': th_parts[1]
}
if partition_algorithm == 'ccpart':
th_parts = ccpart(
self.data,
label_name=label_name, test_size=test_size,
threshold=th / 100,
sim_df=self.sim_df, verbose=2
)
elif partition_algorithm == 'graph_part':
try:
th_parts = graph_part(
self.data,
label_name=label_name,
test_size=test_size if n_partitions is None else 0.0,
threshold=th / 100,
sim_df=self.sim_df, verbose=2,
n_parts=n_partitions
)
except RuntimeError:
continue

if n_partitions is None:
train_th_parts = random_partition(
self.data.iloc[th_parts[0]].reset_index(drop=True),
test_size=valid_size, random_state=random_state
)
self.partitions[th / 100] = {
'train': train_th_parts[0],
'valid': train_th_parts[1],
'test': th_parts[1]
}
else:
th_parts = [[i[0] for i in part] for part in th_parts]
self.partitions[th / 100] = {
i: th_parts[i] for i in range(n_partitions)
}

random = random_partition(self.data, test_size=test_size,
random_state=random_state)
train_random = random_partition(
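For context, a hedged usage sketch of the new `n_partitions` path (it assumes the `HestiaDatasetGenerator` class in this module wraps the dataset and exposes `calculate_partitions`; the constructor and the `label_name` column are assumptions, while `SimilarityArguments` is confirmed by the hunk above):

```python
import pandas as pd
from hestia.dataset_generator import HestiaDatasetGenerator, SimilarityArguments

df = pd.read_csv('example.csv')          # assumed input file
generator = HestiaDatasetGenerator(df)   # assumed constructor

# With graph_part, n_partitions yields k similarity-aware folds
# (keyed 0..k-1) instead of train/valid/test splits.
generator.calculate_partitions(
    label_name='label',                  # assumed label column
    partition_algorithm='graph_part',
    n_partitions=5,
    similarity_args=SimilarityArguments()
)
```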
7 changes: 6 additions & 1 deletion hestia/similarity.py
@@ -498,7 +498,11 @@ def _compute_tanimoto(query_fp: list, target_fps: list):
end_t = -1
else:
end_t = (chunk_t + 1) * chunk_size
chunk_fps = target_fps[start_t:end_t]
if end_t == -1:
chunk_fps = target_fps[start_t:]
else:
chunk_fps = target_fps[start_t:end_t]

query_fp = query_fps[chunk]
job = executor.submit(_compute_tanimoto, query_fp, chunk_fps)
jobs.append(job)
@@ -513,6 +517,7 @@ def _compute_tanimoto(query_fp: list, target_fps: list):
queries.append(int(chunk))
targets.append(int((idx * chunk_size) + idx_target))
metrics.append(metric)

df = pd.DataFrame({'query': queries, 'target': targets, 'metric': metrics})

if save_alignment:
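For context, a minimal standalone sketch of the chunking pattern this hunk fixes (illustrative, not the library's code): the last chunk is marked with an `end_t` of -1, and passing -1 as a slice stop silently drops the final element, so the fix slices to the end of the list instead.

```python
# -1 marks the final chunk; slicing with stop=-1 would lose the last item,
# so the final chunk must slice to the end of the list instead.
def chunked(items: list, chunk_size: int) -> list:
    n_chunks = (len(items) + chunk_size - 1) // chunk_size
    chunks = []
    for i in range(n_chunks):
        start = i * chunk_size
        end = -1 if i == n_chunks - 1 else (i + 1) * chunk_size
        chunks.append(items[start:] if end == -1 else items[start:end])
    return chunks

assert chunked([0, 1, 2, 3, 4], 2) == [[0, 1], [2, 3], [4]]
```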
2 changes: 1 addition & 1 deletion setup.py
@@ -48,6 +48,6 @@
test_suite='tests',
tests_require=test_requirements,
url='https://github.com/IBM/Hestia-OOD',
version='0.0.16',
version='0.0.17',
zip_safe=False,
)
