Merge pull request #13 from IBM/v.0.0.17
Update V.0.0.17
RaulFD-creator authored Aug 2, 2024
2 parents 56c8390 + 970217a commit 43d8fd1
Showing 4 changed files with 61 additions and 26 deletions.
14 changes: 12 additions & 2 deletions README.md
@@ -49,7 +49,17 @@ pip install hestia-ood
pip install git+https://github.com/IBM/Hestia-OOD
```

### 3. Third-party dependencies
### 3. Optional dependencies

#### 3.1. Molecular similarity

RDKit is required for calculating molecular similarities:

```bash
pip install rdkit
```
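
For illustration, a minimal sketch of the fingerprint-based Tanimoto similarity that RDKit enables (the example SMILES are chosen here for brevity; this is not Hestia's internal code):

```python
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

# Morgan fingerprints (radius 2, 2048 bits) for two small molecules
mol_a = Chem.MolFromSmiles('CCO')  # ethanol
mol_b = Chem.MolFromSmiles('CCN')  # ethylamine
fp_a = AllChem.GetMorganFingerprintAsBitVect(mol_a, 2, nBits=2048)
fp_b = AllChem.GetMorganFingerprintAsBitVect(mol_b, 2, nBits=2048)

print(DataStructs.TanimotoSimilarity(fp_a, fp_b))
```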

#### 3.2. Sequence alignment

To use MMSeqs as the alignment algorithm, it must be installed in the environment:
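
One widely used option (an assumption about the preferred channel; the repository's own instructions follow in the lines elided from this diff) is the bioconda package:

```bash
conda install -c bioconda mmseqs2
```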

@@ -167,7 +177,7 @@ df = pd.read_csv('example.csv')
sim_df = calculate_similarity(df, species='protein', similarity_metric='mmseqs+prefilter',
field_name='sequence')
clusters_df = generate_clusters(df, field_name='sequence', sim_df=sim_df,
cluster_algorithms='CDHIT')
cluster_algorithm='CDHIT')
```

There are three clustering algorithms currently supported: `CDHIT`, `greedy_cover_set`, and `connected_components`. More details about clustering can be found in the [Clustering documentation](https://ibm.github.io/Hestia-OOD/clustering/).
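
A quick sketch of switching algorithms, reusing `df` and `sim_df` from the snippet above (illustrative only):

```python
# Same call as above; only the cluster_algorithm value changes.
clusters_df = generate_clusters(df, field_name='sequence', sim_df=sim_df,
                                cluster_algorithm='connected_components')
```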
64 changes: 42 additions & 22 deletions hestia/dataset_generator.py
@@ -1,7 +1,7 @@
import gzip
import json
from multiprocessing import cpu_count
from typing import Callable, Union
from typing import Callable, Optional, Union

import numpy as np
import pandas as pd
@@ -237,7 +237,8 @@ def calculate_partitions(
valid_size: float = 0.1,
partition_algorithm: str = 'ccpart',
random_state: int = 42,
similarity_args: SimilarityArguments = SimilarityArguments()
similarity_args: SimilarityArguments = SimilarityArguments(),
n_partitions: Optional[int] = None
):
"""Calculate partitions
@@ -266,18 +267,16 @@ def calculate_partitions(
:type random_state: int, optional
:param similarity_args: See similarity arguments entry, defaults to SimilarityArguments()
:type similarity_args: SimilarityArguments, optional
:param n_partitions: Number of partitions to generate; only supported by the graph_part partitioning algorithm, defaults to None
:type n_partitions: int, optional
:raises ValueError: Partitioning algorithm not supported.
"""
self.partitions = {}
if self.sim_df is None:
self.calculate_similarity(similarity_args)
print('Calculating partitions...')

if partition_algorithm == 'ccpart':
partition_algorithm = ccpart
elif partition_algorithm == 'graph_part':
partition_algorithm = graph_part
else:
if partition_algorithm not in ['ccpart', 'graph_part']:
raise ValueError(
f'Partition algorithm: {partition_algorithm} is not ' +
'supported. Try using: `ccpart` or `graph_part`.'
@@ -286,21 +285,42 @@
threshold_step = int(threshold_step * 100)

for th in tqdm(range(min_threshold, 100, threshold_step)):
th_parts = partition_algorithm(
self.data,
label_name=label_name, test_size=test_size,
threshold=th / 100,
sim_df=self.sim_df, verbose=2
)
train_th_parts = random_partition(
self.data.iloc[th_parts[0]].reset_index(drop=True),
test_size=valid_size, random_state=random_state
)
self.partitions[th / 100] = {
'train': train_th_parts[0],
'valid': train_th_parts[1],
'test': th_parts[1]
}
if partition_algorithm == 'ccpart':
th_parts = ccpart(
self.data,
label_name=label_name, test_size=test_size,
threshold=th / 100,
sim_df=self.sim_df, verbose=2
)
elif partition_algorithm == 'graph_part':
try:
th_parts = graph_part(
self.data,
label_name=label_name,
test_size=test_size if n_partitions is None else 0.0,
threshold=th / 100,
sim_df=self.sim_df, verbose=2,
n_parts=n_partitions
)
except RuntimeError:
continue

if n_partitions is None:
train_th_parts = random_partition(
self.data.iloc[th_parts[0]].reset_index(drop=True),
test_size=valid_size, random_state=random_state
)
self.partitions[th / 100] = {
'train': train_th_parts[0],
'valid': train_th_parts[1],
'test': th_parts[1]
}
else:
th_parts = [[i[0] for i in part] for part in th_parts]
self.partitions[th / 100] = {
i: th_parts[i] for i in range(n_partitions)
}

random = random_partition(self.data, test_size=test_size,
random_state=random_state)
train_random = random_partition(
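For context, a hedged usage sketch of the new `n_partitions` path (it assumes the `HestiaDatasetGenerator` class in this module wraps the dataset and exposes `calculate_partitions`; the constructor and the `label_name` column are assumptions, while `SimilarityArguments` is confirmed by the hunk above):

```python
import pandas as pd
from hestia.dataset_generator import HestiaDatasetGenerator, SimilarityArguments

df = pd.read_csv('example.csv')          # assumed input file
generator = HestiaDatasetGenerator(df)   # assumed constructor

# With graph_part, n_partitions yields k similarity-aware folds
# (keyed 0..k-1) instead of train/valid/test splits.
generator.calculate_partitions(
    label_name='label',                  # assumed label column
    partition_algorithm='graph_part',
    n_partitions=5,
    similarity_args=SimilarityArguments()
)
```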
7 changes: 6 additions & 1 deletion hestia/similarity.py
@@ -498,7 +498,11 @@ def _compute_tanimoto(query_fp: list, target_fps: list):
end_t = -1
else:
end_t = (chunk_t + 1) * chunk_size
chunk_fps = target_fps[start_t:end_t]
if end_t == -1:
chunk_fps = target_fps[start_t:]
else:
chunk_fps = target_fps[start_t:end_t]

query_fp = query_fps[chunk]
job = executor.submit(_compute_tanimoto, query_fp, chunk_fps)
jobs.append(job)
@@ -513,6 +517,7 @@ def _compute_tanimoto(query_fp: list, target_fps: list):
queries.append(int(chunk))
targets.append(int((idx * chunk_size) + idx_target))
metrics.append(metric)

df = pd.DataFrame({'query': queries, 'target': targets, 'metric': metrics})

if save_alignment:
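For context, a minimal standalone sketch of the chunking pattern this hunk fixes (illustrative, not the library's code): the last chunk is marked with an `end_t` of -1, and passing -1 as a slice stop silently drops the final element, so the fix slices to the end of the list instead.

```python
# -1 marks the final chunk; slicing with stop=-1 would lose the last item,
# so the final chunk must slice to the end of the list instead.
def chunked(items: list, chunk_size: int) -> list:
    n_chunks = (len(items) + chunk_size - 1) // chunk_size
    chunks = []
    for i in range(n_chunks):
        start = i * chunk_size
        end = -1 if i == n_chunks - 1 else (i + 1) * chunk_size
        chunks.append(items[start:] if end == -1 else items[start:end])
    return chunks

assert chunked([0, 1, 2, 3, 4], 2) == [[0, 1], [2, 3], [4]]
```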
2 changes: 1 addition & 1 deletion setup.py
@@ -48,6 +48,6 @@
test_suite='tests',
tests_require=test_requirements,
url='https://github.com/IBM/Hestia-OOD',
version='0.0.16',
version='0.0.17',
zip_safe=False,
)
