diff --git a/.gitignore b/.gitignore
index 38cd292..fe105ba 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,3 @@
__pycache__/
build/
-hestia.egg-info/
\ No newline at end of file
+*.egg-info/
diff --git a/README.md b/README.md
index a83a8fc..98034e0 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,151 @@
-# Hestia
-Independent evaluation set construction for trustworthy ML models in biochemistry
+# Hestia
+
+Computational tool for generating evaluation sets that assess model generalisation.
+
+- **Documentation:** https://ibm.github.io/Hestia-OOD
+- **Source Code:** https://github.com/IBM/Hestia-OOD
+- **Webserver:** http://peptide.ucd.ie/Hestia
+- **Paper Pre-print:** https://www.biorxiv.org/content/10.1101/2024.03.14.584508v1
+
+## Contents
+
+- [Installation Guide](#installation)
+- [Documentation](#documentation)
+- [Examples](#examples)
+- [License](#license)
+
+
+
+## Installation
+
+Installing in a conda environment is recommended. To create the environment, please run:
+
+```bash
+conda create -n hestia python
+conda activate hestia
+```
+
+### 1. Python Package
+
+#### 1.1. From PyPI
+
+
+```bash
+pip install hestia-ood
+```
+
+#### 1.2. Directly from source
+
+```bash
+pip install git+https://github.com/IBM/Hestia-OOD
+```
+
+### 2. Third-party dependencies
+
+To use MMSeqs as the alignment algorithm, it needs to be installed in the environment:
+
+```bash
+conda install -c bioconda mmseqs2
+```
+
+To use Needleman-Wunsch:
+
+```bash
+conda install -c bioconda emboss
+```
+
+If not installing within a conda environment, please check the installation instructions for your operating system:
+
+- Linux:
+ ```bash
+ wget https://mmseqs.com/latest/mmseqs-linux-avx2.tar.gz
+ tar xvfz mmseqs-linux-avx2.tar.gz
+ export PATH=$(pwd)/mmseqs/bin/:$PATH
+ ```
+
+ ```bash
+ sudo apt install emboss
+ ```
+
+- Windows: Download binaries from [EMBOSS](https://emboss.sourceforge.net/download/) and [MMSeqs2-latest](https://mmseqs.com/latest/mmseqs-win64.zip)
+
+- Mac:
+ ```bash
+ sudo port install emboss
+ brew install mmseqs2
+ ```
+
+## Documentation
+
+### 1. Similarity calculation
+
+Calculating pairwise similarity between the entities within a DataFrame `df_query` or between two DataFrames `df_query` and `df_target` can be achieved through the `calculate_similarity` function:
+
+```python
+from hestia.similarity import calculate_similarity
+import pandas as pd
+
+df_query = pd.read_csv('example.csv')
+
+# The CSV file needs to have a column describing the entities, i.e., their sequence, their SMILES, or a path to their PDB structure.
+# This column corresponds to `field_name` in the function.
+
+sim_df = calculate_similarity(df_query, data_type='protein',
+                              similarity_metric='mmseqs+prefilter',
+                              field_name='sequence')
+```
+
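+The same function can also compare two different datasets (a minimal sketch, assuming a hypothetical second file `targets.csv` with the same column layout):
+
+```python
+from hestia.similarity import calculate_similarity
+import pandas as pd
+
+df_query = pd.read_csv('example.csv')
+df_target = pd.read_csv('targets.csv')  # hypothetical second dataset
+
+# The result is a DataFrame with `query`, `target`, and `metric` columns.
+sim_df = calculate_similarity(df_query, df_target, data_type='protein',
+                              similarity_metric='mmseqs+prefilter',
+                              field_name='sequence')
+```
+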
+More details about similarity calculation can be found in the [Similarity calculation documentation](https://ibm.github.io/Hestia-OOD/similarity/).
+
+### 2. Clustering
+
+Clustering the entities within a DataFrame `df` can be achieved through the `generate_clusters` function:
+
+```python
+from hestia.similarity import calculate_similarity
+from hestia.clustering import generate_clusters
+import pandas as pd
+
+df = pd.read_csv('example.csv')
+sim_df = calculate_similarity(df, data_type='protein',
+                              similarity_metric='mmseqs+prefilter',
+                              field_name='sequence')
+clusters_df = generate_clusters(df, field_name='sequence', sim_df=sim_df,
+                                cluster_algorithms='CDHIT')
+```
+
+There are three clustering algorithms currently supported: `CDHIT`, `greedy_cover_set`, or `connected_components`. More details about clustering can be found in the [Clustering documentation](https://ibm.github.io/Hestia-OOD/clustering/).
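+
+As a minimal sketch (reusing `df` and `sim_df` from the example above), switching to a different clustering algorithm only requires changing its name:
+
+```python
+from hestia.clustering import generate_clusters
+
+# Assumes `df` and `sim_df` have been computed as in the previous example.
+clusters_df = generate_clusters(df, field_name='sequence', sim_df=sim_df,
+                                cluster_algorithms='connected_components')
+```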
+
+
+### 3. Partitioning
+
+Partitioning the entities within a DataFrame `df` into training and evaluation subsets can be achieved through 4 different functions: `ccpart`, `graph_part`, `reduction_partition`, and `random_partition`. An example of how `ccpart` would be used is:
+
+```python
+from hestia.partition import ccpart
+import pandas as pd
+
+df = pd.read_csv('example.csv')
+train, test = ccpart(df, data_type='protein',
+                     similarity_metric='mmseqs+prefilter',
+                     field_name='sequence', threshold=0.3, test_size=0.2)
+
+train_df = df.iloc[train, :]
+test_df = df.iloc[test, :]
+```
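+
+If the similarity matrix has already been computed, it can be passed directly to avoid re-running the alignment (a sketch reusing `sim_df` from the similarity example above):
+
+```python
+from hestia.similarity import calculate_similarity
+from hestia.partition import ccpart
+import pandas as pd
+
+df = pd.read_csv('example.csv')
+
+# Compute the pairwise similarities once...
+sim_df = calculate_similarity(df, data_type='protein',
+                              similarity_metric='mmseqs+prefilter',
+                              field_name='sequence')
+
+# ...and reuse them when partitioning at the chosen threshold.
+train, test = ccpart(df, field_name='sequence', threshold=0.3,
+                     test_size=0.2, sim_df=sim_df)
+```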
+
+## License
+
+Hestia is open-source software licensed under the MIT License. Check the details in the [LICENSE](https://github.com/IBM/Hestia/blob/master/LICENSE) file.
+
diff --git a/hestia/clustering.py b/hestia/clustering.py
index 3f6fffe..1fa14a9 100644
--- a/hestia/clustering.py
+++ b/hestia/clustering.py
@@ -2,6 +2,7 @@
import pandas as pd
from scipy.sparse.csgraph import connected_components
+from tqdm import tqdm
from hestia.similarity import sim_df2mtx
@@ -78,16 +79,20 @@ def _greedy_incremental_clustering(
clustered = set()
sim_df = sim_df[sim_df['metric'] > threshold]
- for i in df.index:
- in_cluster = set(sim_df.loc[sim_df['query'] == i, 'target'])
- in_cluster.update(set(sim_df.loc[sim_df['target'] == i, 'query']))
+ if verbose > 2:
+ pbar = tqdm(df.index)
+ else:
+ pbar = df.index
+ for i in pbar:
if i in clustered:
continue
+ in_cluster = set(sim_df.loc[sim_df['query'] == i, 'target'])
+ in_cluster.update(set(sim_df.loc[sim_df['target'] == i, 'query']))
+ in_cluster.update(set([i]))
+ in_cluster = in_cluster.difference(clustered)
for j in in_cluster:
- if i == j:
- continue
clusters.append({
'cluster': i,
'member': j
@@ -99,7 +104,7 @@ def _greedy_incremental_clustering(
if verbose > 1:
print('Clustering has generated:',
f'{len(cluster_df.cluster.unique()):,d} clusters for',
- f'{len(df):,} entities')
+ f'{len(cluster_df):,} entities')
return cluster_df
@@ -111,7 +116,7 @@ def _greedy_cover_set(
) -> pd.DataFrame:
def _find_connectivity(df, sim_df):
neighbours = []
- for i in df.index:
+ for i in tqdm(df.index):
in_cluster = set(sim_df.loc[sim_df['query'] == i, 'target'])
in_cluster.update(set(sim_df.loc[sim_df['target'] == i, 'query']))
neighbours.append(in_cluster)
@@ -124,15 +129,19 @@ def _find_connectivity(df, sim_df):
clusters = []
clustered = set()
+ if verbose > 2:
+ pbar = tqdm(df.index)
+ else:
+ pbar = df.index
- for i in df.index:
- in_cluster = neighbours.pop(0)
-
+ for i in pbar:
if i in clustered:
continue
+ in_cluster = neighbours.pop(0)
+ in_cluster.update([i])
+ in_cluster = in_cluster.difference(clustered)
+
for j in in_cluster:
- if i == j:
- continue
clusters.append({
'cluster': i,
'member': j
@@ -144,7 +153,7 @@ def _find_connectivity(df, sim_df):
if verbose > 1:
print('Clustering has generated:',
f'{len(cluster_df.cluster.unique()):,d} clusters for',
- f'{len(df):,} entities')
+ f'{len(cluster_df):,} entities')
return cluster_df
@@ -158,7 +167,7 @@ def _connected_components_clustering(
n, labels = connected_components(matrix, directed=False,
return_labels=True)
cluster_df = [{'cluster': labels[i],
- 'member': i} for i in df.index]
+ 'member': i} for i in range(labels.shape[0])]
if verbose > 0:
print('Clustering has generated:',
f'{n:,d} connected componentes for',
diff --git a/hestia/dataset_generator.py b/hestia/dataset_generator.py
new file mode 100644
index 0000000..45f27db
--- /dev/null
+++ b/hestia/dataset_generator.py
@@ -0,0 +1,238 @@
+import gzip
+import json
+from multiprocessing import cpu_count
+
+import pandas as pd
+from sklearn.metrics import auc
+from tqdm import tqdm
+
+from hestia.similarity import calculate_similarity
+from hestia.partition import random_partition, ccpart, graph_part
+
+
+class SimilarityArguments:
+ def __init__(
+ self,
+ data_type: str = 'protein',
+ similarity_metric: str = 'mmseqs+prefilter',
+ field_name: str = 'sequence',
+ min_threshold: float = 0.25,
+ threads: int = cpu_count(),
+ verbose: int = 0,
+ save_alignment: bool = False,
+ filename: str = 'alignment',
+ distance: str = 'tanimoto',
+ bits: int = 1024,
+ radius: int = 2,
+ denominator: str = 'shortest',
+ representation: str = '3di+aa',
+ config: dict = {
+ "gapopen": 10,
+ "gapextend": 0.5,
+ "endweight": True,
+ "endopen": 10,
+ "endextend": 0.5,
+ "matrix": "EBLOSUM62"
+ }
+ ):
+ self.data_type = data_type
+ self.similarity_metric = similarity_metric
+ self.field_name = field_name
+ self.min_threshold = min_threshold
+ self.threads = threads
+ self.verbose = verbose
+ self.save_alignment = save_alignment
+ self.filename = filename
+ self.distance = distance
+ self.bits = bits
+ self.radius = radius
+ self.denominator = denominator
+ self.representation = representation
+ self.config = config
+
+
+class HestiaDatasetGenerator:
+ def __init__(self, data: pd.DataFrame):
+ self.data = data
+ self.sim_df = None
+ self.partitions = None
+ print('Initialising Hestia Dataset Generator')
+ print(f'Number of items in data: {len(self.data)}')
+
+ def from_precalculated(self, data_path: str):
+ with gzip.open(data_path, 'r') as fin:
+ self.partitions = json.loads(fin.read().decode('utf-8'))
+ new_dict = {}
+ for key, value in self.partitions.items():
+ if key != 'random':
+ new_dict[float(key)] = value
+ else:
+ new_dict[key] = value
+ self.partitions = new_dict
+
+ def save_precalculated(self, output_path: str):
+ with gzip.open(output_path, 'w') as fout:
+ fout.write(json.dumps(self.partitions).encode('utf-8'))
+
+ def calculate_similarity(self, similarity_args: SimilarityArguments):
+ print('Calculating similarity...')
+ self.sim_df = calculate_similarity(
+ self.data, self.data, data_type=similarity_args.data_type,
+ similarity_metric=similarity_args.similarity_metric,
+ field_name=similarity_args.field_name,
+ threshold=similarity_args.min_threshold,
+ threads=similarity_args.threads,
+ verbose=similarity_args.verbose,
+ save_alignment=similarity_args.save_alignment,
+ filename=similarity_args.filename,
+ distance=similarity_args.distance,
+ bits=similarity_args.bits,
+ radius=similarity_args.radius,
+ denominator=similarity_args.denominator,
+ representation=similarity_args.representation,
+ config=similarity_args.config
+ )
+ print('Similarity successfully calculated!')
+
+ def load_similarity(self, output_path: str):
+ print('Loading precalculated similarity...')
+ self.sim_df = pd.read_csv(output_path, compression='gzip')
+ print('Precalculated similarity loaded successfully!')
+
+ def calculate_partitions(
+ self,
+ label_name: str = None,
+ min_threshold: float = 0.3,
+ threshold_step: float = 0.1,
+ test_size: float = 0.2,
+ valid_size: float = 0.1,
+ partition_algorithm: str = 'ccpart',
+ random_state: int = 42,
+ similarity_args: SimilarityArguments = SimilarityArguments()
+ ):
+ print('Calculating partitions...')
+ self.partitions = {}
+ if self.sim_df is None:
+ self.calculate_similarity(similarity_args)
+ if partition_algorithm == 'ccpart':
+ partition_algorithm = ccpart
+ elif partition_algorithm == 'graph_part':
+ partition_algorithm = graph_part
+ else:
+ raise ValueError(
+ f'Partition algorithm: {partition_algorithm} is not ' +
+ 'supported. Try using: `ccpart` or `graph_part`.'
+ )
+ min_threshold = int(min_threshold * 100)
+ threshold_step = int(threshold_step * 100)
+
+ for th in tqdm(range(min_threshold, 100, threshold_step)):
+ th_parts = partition_algorithm(
+ self.data,
+ label_name=label_name, test_size=test_size,
+ valid_size=valid_size, threshold=th / 100,
+ sim_df=self.sim_df
+ )
+ train_th_parts = random_partition(
+ self.data.iloc[th_parts[0]].reset_index(drop=True),
+ test_size=valid_size, random_state=random_state
+ )
+ self.partitions[th / 100] = {
+ 'train': train_th_parts[0],
+ 'valid': train_th_parts[1],
+ 'test': th_parts[1]
+ }
+ random = random_partition(self.data, test_size=test_size,
+ random_state=random_state)
+ train_random = random_partition(
+ self.data.iloc[random[0]].reset_index(drop=True),
+ test_size=valid_size, random_state=random_state
+ )
+ self.partitions['random'] = {
+ 'train': train_random[0],
+ 'valid': train_random[1],
+ 'test': random[1]
+ }
+ print('Partitions successfully calculated!')
+
+ def generate_datasets(self, dataset_type: str, threshold: float) -> dict:
+ ds = {}
+
+ if dataset_type == 'huggingface' or dataset_type == 'hf':
+ try:
+ import datasets
+ except ImportError:
+ raise ImportError(
+ f"This dataset_type: {dataset_type} requires `datasets` " +
+ "to be installed. Install using: `pip install datasets`"
+ )
+ for key, value in self.partitions[threshold].items():
+ ds[key] = datasets.Dataset.from_pandas(
+ self.data.iloc[value].reset_index()
+ )
+ return ds
+ elif dataset_type == 'pytorch' or dataset_type == 'torch':
+ try:
+ from hestia.utils.dataset_utils import Dataset_from_pandas
+ except ModuleNotFoundError:
+ raise ImportError(
+ f"This dataset_type: {dataset_type} requires `torch` " +
+ "to be installed. Install using: `pip install torch`"
+ )
+ for key, value in self.partitions[threshold].items():
+ ds[key] = Dataset_from_pandas(
+ self.data.iloc[value].reset_index()
+ )
+ return ds
+
+ @staticmethod
+ def calculate_auspc(results: dict, metric: str):
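+ # Area Under the Similarity-Performance Curve: integrates the drop in
+ # performance relative to the random partition across similarity thresholds.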
+ x, y = [], []
+ for key, value in results.items():
+ if key == 'random':
+ continue
+ x.append(key)
+ y.append(results['random'][metric] - value[metric])
+ return auc(x, y)
+
+ @staticmethod
+ def plot_spc(results: dict, metric: str):
+ import matplotlib.pyplot as plt
+ x, y = [], []
+ for key, value in results.items():
+ if key == 'random':
+ continue
+ x.append(key)
+ y.append(value[metric])
+ plt.scatter(x, y)
+ plt.plot(x, [results['random'][metric] for _ in range(len(x))], 'r')
+ plt.ylabel(f'Performance: {metric}')
+ plt.xlabel('Threshold similarity')
+ plt.legend(['SP', 'Random'])
+ plt.ylim(0, 1.1)
+ plt.show()
+
+
+if __name__ == '__main__':
+ df = pd.read_csv('dili.tab', sep='\t')
+ generator = HestiaDatasetGenerator(df)
+ args = SimilarityArguments(
+ data_type='small_molecule', field_name='Drug',
+ similarity_metric='fingerprint', verbose=3,
+ save_alignment=True
+ )
+ generator.calculate_similarity(args)
+ generator.load_similarity(args.filename + '.csv.gz')
+ generator.calculate_partitions('Y', min_threshold=0.3,
+ threshold_step=0.05,
+ test_size=0.2, valid_size=0.1)
+ generator.save_precalculated('precalculated_partitions.gz')
+ generator.from_precalculated('precalculated_partitions.gz')
+ ds = generator.generate_datasets('torch', 0.35)
+ trial = {
+ th / 100: {'acc': 0.9 + (0.001 * th)}
+ for th in range(30, 100, 5)
+ }
+ trial.update({'random': {'acc': 1}})
+ print(generator.calculate_auspc(trial, 'acc'))
+ HestiaDatasetGenerator.plot_spc(trial, 'acc')
diff --git a/hestia/partition.py b/hestia/partition.py
index ae42e7d..7121b96 100644
--- a/hestia/partition.py
+++ b/hestia/partition.py
@@ -43,18 +43,18 @@ def random_partition(
return train_df, test_df
-def connected_components_partition(
+def ccpart(
df: pd.DataFrame,
- similarity_metric: str,
- field_name: str,
- label_name: str,
+ similarity_metric: str = None,
+ field_name: str = None,
+ label_name: str = None,
threads: int = cpu_count(),
denominator: str = None,
test_size: float = 0.2,
valid_size: float = 0.0,
threshold: float = 0.3,
verbose: int = 0,
- species: str = 'protein',
+ data_type: str = 'protein',
distance: str = 'tanimoto',
representation: str = '3di+aa',
bits: int = 1024,
@@ -62,7 +62,7 @@ def connected_components_partition(
config: dict = None,
sim_df: Optional[pd.DataFrame] = None
) -> Union[Tuple[list, list], Tuple[list, list, list]]:
- """Use connected components partitioning algorithm
+ """Use CCPart algorithm
to generate training and evaluation subsets
that maximise the dissimilarity between their
entities.
@@ -120,10 +120,10 @@ def connected_components_partition(
- 2: All
Defaults to 0
:type verbose: int, optional
- :param species: Biochemical species to which the data belongs.
+ :param data_type: Biochemical data type to which the data belongs.
Options: `protein`, `DNA`, `RNA`, or `small_molecule`; defaults to
'protein'
- :type species: str, optional
+ :type data_type: str, optional
:param distance: Distance metrics for small molecule comparison.
Currently, it is restricted to Tanimoto distance will
be extended in future patches; if interested in a specific
@@ -175,7 +175,7 @@ def connected_components_partition(
if sim_df is None:
sim_df = calculate_similarity(
- df, df, species=species,
+ df, df, data_type=data_type,
similarity_metric=similarity_metric,
field_name=field_name, threshold=threshold,
threads=threads, verbose=verbose,
@@ -183,7 +183,6 @@ def connected_components_partition(
bits=bits, denominator=denominator, radius=radius,
representation=representation, config=config
)
-
cluster_df = generate_clusters(df, field_name=field_name,
threshold=threshold,
verbose=verbose,
@@ -251,7 +250,7 @@ def reduction_partition(
valid_size: float = 0.0,
threshold: float = 0.3,
verbose: int = 2,
- species: str = 'protein',
+ data_type: str = 'protein',
representation: str = '3di+aa',
random_state: int = 42,
bits: int = 1024,
@@ -318,10 +317,10 @@ def reduction_partition(
- 2: All
Defaults to 0
:type verbose: int, optional
- :param species: Biochemical species to which the data belongs.
+ :param data_type: Biochemical data type to which the data belongs.
Options: `protein`, `DNA`, `RNA`, or `small_molecule`; defaults to
'protein'
- :type species: str, optional
+ :type data_type: str, optional
:param distance: Distance metrics for small molecule comparison.
Currently, it is restricted to Tanimoto distance will
be extended in future patches; if interested in a specific
@@ -362,7 +361,7 @@ def reduction_partition(
"""
df = similarity_reduction(df, similarity_metric, field_name,
threads, clustering_mode, denominator,
- test_size, threshold, verbose, species,
+ test_size, threshold, verbose, data_type,
representation, bits,
radius, sim_df, config)
train, test = random_partition(df.index.tolist(), test_size=test_size,
@@ -388,7 +387,7 @@ def graph_part(
threshold: float = 0.3,
verbose: int = 2,
n_parts: int = 10,
- species: str = 'protein',
+ data_type: str = 'protein',
distance: str = 'tanimoto',
representation: str = '3di+aa',
bits: int = 1024,
@@ -454,10 +453,10 @@ def graph_part(
- 2: All
Defaults to 0
:type verbose: int, optional
- :param species: Biochemical species to which the data belongs.
+ :param data_type: Biochemical data type to which the data belongs.
Options: `protein`, `DNA`, `RNA`, or `small_molecule`; defaults to
'protein'
- :type species: str, optional
+ :type data_type: str, optional
:param distance: Distance metrics for small molecule comparison.
Currently, it is restricted to Tanimoto distance will
be extended in future patches; if interested in a specific
@@ -498,7 +497,7 @@ def graph_part(
"""
if sim_df is None:
sim_df = calculate_similarity(
- df, df, species=species,
+ df, df, data_type=data_type,
similarity_metric=similarity_metric,
field_name=field_name, threshold=threshold,
threads=threads, verbose=verbose,
diff --git a/hestia/similarity.py b/hestia/similarity.py
index 09f819b..61c2e2d 100644
--- a/hestia/similarity.py
+++ b/hestia/similarity.py
@@ -9,6 +9,7 @@
import pandas as pd
from tqdm import tqdm
import scipy.sparse as spr
+from concurrent.futures import ThreadPoolExecutor
def sim_df2mtx(sim_df: pd.DataFrame,
@@ -50,7 +51,7 @@ def sim_df2mtx(sim_df: pd.DataFrame,
def calculate_similarity(
df_query: pd.DataFrame,
df_target: pd.DataFrame = None,
- species: str = 'protein',
+ data_type: str = 'protein',
similarity_metric: str = 'mmseqs+prefilter',
field_name: str = 'sequence',
threshold: float = 0.3,
@@ -76,10 +77,10 @@ def calculate_similarity(
similarities. If not specified, the `df_query` will be used as `df_target`
as well, defaults to None
:type df_target: pd.DataFrame, optional
- :param species: Biochemical species to which the data belongs.
+ :param data_type: Biochemical data type to which the data belongs.
Options: `protein`, `DNA`, `RNA`, or `small_molecule`; defaults to
'protein'
- :type species: str, optional
+ :type data_type: str, optional
:param similarity_metric: Similarity function to use.
Options:
- `protein`: `mmseqs` (local alignment),
@@ -157,8 +158,8 @@ def calculate_similarity(
- "endextend": 0.5,
- "matrix": "EBLOSUM62"
:type config: dict, optional
- :raises NotImplementedError: Biochemical species is not supported
- see `species`.
+ :raises NotImplementedError: Biochemical data_type is not supported
+ see `data_type`.
:raises NotImplementedError: Similarity metric is not supported
see `similarity_algorithm`
:return: DataFrame with similarities (`metric`) between
@@ -168,10 +169,10 @@ def calculate_similarity(
:rtype: pd.DataFrame
"""
mssg = f'Alignment method: {similarity_metric} '
- mssg += f'not implemented for species: {species}'
- mssg2 = f'Species: {species} not supported'
+ mssg += f'not implemented for data_type: {data_type}'
+ mssg2 = f'data_type: {data_type} not supported'
- if species == 'protein':
+ if data_type == 'protein':
if 'mmseqs' in similarity_metric:
sim_df = _mmseqs2_alignment(
df_query=df_query,
@@ -215,9 +216,9 @@ def calculate_similarity(
)
else:
mssg = f'Alignment method: {similarity_metric} '
- mssg += f'not implemented for species: {species}'
+ mssg += f'not implemented for data_type: {data_type}'
raise NotImplementedError(mssg)
- elif species.upper() == 'DNA' or species.upper() == 'RNA':
+ elif data_type.upper() == 'DNA' or data_type.upper() == 'RNA':
if 'mmseqs' in similarity_metric:
sim_df = _mmseqs2_alignment(
df_query=df_query,
@@ -247,9 +248,9 @@ def calculate_similarity(
)
else:
mssg = f'Alignment method: {similarity_metric} '
- mssg += f'not implemented for species: {species}'
+ mssg += f'not implemented for data_type: {data_type}'
raise NotImplementedError(mssg)
- elif species == 'small_molecule' or species.lower() == 'smiles':
+ elif data_type == 'small_molecule' or data_type.lower() == 'smiles':
if similarity_metric == 'scaffold':
sim_df = _scaffold_alignment(
df_query=df_query,
@@ -274,6 +275,9 @@ def calculate_similarity(
save_alignment=save_alignment,
filename=filename
)
+ else:
+ mssg = f'Alignment method: {similarity_metric} '
+ mssg += f'not implemented for data_type: {data_type}'
+ raise NotImplementedError(mssg)
else:
raise NotImplementedError(mssg2)
return sim_df
@@ -388,10 +392,6 @@ def _fingerprint_alignment(
filename: str = None,
**kwargs
) -> Union[pd.DataFrame, np.ndarray]:
- # Threshold for similarity evaluation: 0.85, based on:
- # Patterson DE, Cramer RD, Ferguson AM, Clark RD, Weinberger LE:
- # Neighborhood behavior: A useful concept for validation of ''molecular
- # diversity'' descriptors. J Med Chem 1996, 39:3049-3059.
"""_summary_
:param df_query: _description_
@@ -427,9 +427,14 @@ def _fingerprint_alignment(
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
+ from tqdm.contrib.concurrent import thread_map
except ModuleNotFoundError:
raise ImportError("This function requires RDKit to be installed.")
- from concurrent.futures import ThreadPoolExecutor
+
+ def _get_fp(smile: str):
+ mol = Chem.MolFromSmiles(smile)
+ fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, bits)
+ return fp
def _compute_tanimoto(query_fp: list, target_fps: list):
scores = DataStructs.BulkTanimotoSimilarity(query_fp, target_fps)
@@ -443,43 +448,50 @@ def _compute_tanimoto(query_fp: list, target_fps: list):
if verbose > 0:
print(f'Calculating molecular similarities using ECFP-{radius * 2}',
f'with {bits:,} bits and Tanimoto distance...')
+ query_fps = thread_map(_get_fp, df_query[field_name], max_workers=threads)
if df_target is None:
df_target = df_query
+ target_fps = query_fps
+ else:
+ target_fps = thread_map(_get_fp, df_target[field_name],
+ max_workers=threads)
- mols_query = [Chem.MolFromSmiles(smiles) for smiles in df_query[field_name]]
- mols_target = [Chem.MolFromSmiles(smiles) for smiles in df_target[field_name]]
- fps_query = [AllChem.GetMorganFingerprintAsBitVect(x, radius, bits)
- for x in mols_query]
- fps_target = [AllChem.GetMorganFingerprintAsBitVect(x, radius, bits)
- for x in mols_target]
- jobs = []
- with ThreadPoolExecutor(max_workers=threads) as executor:
- for query_fp in fps_query:
- job = executor.submit(_compute_tanimoto, query_fp, fps_target)
- jobs.append(job)
+ chunk_size = threads * 1_000
+ chunks_target = (len(df_target) // chunk_size) + 1
+ queries, targets, metrics = [], [], []
+ pbar = tqdm(range(len(query_fps)))
- if verbose > 1:
- pbar = tqdm(jobs)
- else:
- pbar = jobs
+ with ThreadPoolExecutor(max_workers=threads) as executor:
+ for chunk in pbar:
+ jobs = []
+ for chunk_t in range(chunks_target):
+ start_t = chunk_t * chunk_size
+ if chunk_t == chunks_target - 1:
+ end_t = len(target_fps)
+ else:
+ end_t = (chunk_t + 1) * chunk_size
+ chunk_fps = target_fps[start_t:end_t]
+ query_fp = query_fps[chunk]
+ job = executor.submit(_compute_tanimoto, query_fp, chunk_fps)
+ jobs.append(job)
- proto_df = []
- for idx, job in enumerate(pbar):
- if job.exception() is not None:
- raise RuntimeError(job.exception())
- result = job.result()
- entry = [{'query': idx, 'target': idx_target, 'metric': metric}
- for idx_target, metric in enumerate(result)]
- proto_df.extend(entry)
+ for idx, job in enumerate(jobs):
+ if job.exception() is not None:
+ raise RuntimeError(job.exception())
+ result = job.result()
+ for idx_target, metric in enumerate(result):
+ if metric < threshold:
+ continue
+ queries.append(int(chunk))
+ targets.append(int((idx * chunk_size) + idx_target))
+ metrics.append(metric)
+ df = pd.DataFrame({'query': queries, 'target': targets, 'metric': metrics})
- df = pd.DataFrame(proto_df)
if save_alignment:
if filename is None:
filename = time.time()
df.to_csv(f'{filename}.csv.gz', index=False, compression='gzip')
-
- # df = df[df.metric >= threshold]
return df
diff --git a/hestia/utils/dataset_utils.py b/hestia/utils/dataset_utils.py
new file mode 100644
index 0000000..e15f255
--- /dev/null
+++ b/hestia/utils/dataset_utils.py
@@ -0,0 +1,16 @@
+import pandas as pd
+from torch.utils.data import Dataset
+
+
+class Dataset_from_pandas(Dataset):
+ def __init__(self, dataframe: pd.DataFrame):
+ self.dataframe = dataframe
+
+ def __getitem__(self, index):
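+ # Assumes the first column holds the label and the remaining columns the features.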
+ row = self.dataframe.iloc[index].to_numpy()
+ features = row[1:]
+ label = row[0]
+ return features, label
+
+ def __len__(self):
+ return len(self.dataframe)
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 499ffa1..9428c6f 100644
--- a/setup.py
+++ b/setup.py
@@ -15,7 +15,6 @@
'scikit-learn',
'pandas',
'numpy',
- 'rdkit',
'tqdm'
]
@@ -48,6 +47,6 @@
test_suite='tests',
tests_require=test_requirements,
url='https://github.com/IBM/Hestia-OOD',
- version='0.0.3',
+ version='0.0.7',
zip_safe=False,
)