📝 Documentation

IBM · Feb 27, 2024 · f818b27 · f818b27
1 parent 6fb7d23
commit f818b27
Show file tree

Hide file tree

Showing 5 changed files with 47 additions and 16 deletions.
diff --git a/docs/clustering.md b/docs/clustering.md
@@ -1,3 +1,3 @@
 # Clustering
 
-::: hestia.clustering.generate_clusters
+::: hestia.clustering
diff --git a/docs/reduction.md b/docs/reduction.md
@@ -1,3 +1,3 @@
 # Similarity reduction
 
-::: hestia.reduction.similarity_reduction
+::: hestia.reduction
diff --git a/hestia/clustering.py b/hestia/clustering.py
@@ -3,26 +3,57 @@
 import pandas as pd
 from scipy.sparse.csgraph import connected_components
 
-from hestia.alignment import sim_df2mtx
+from hestia.similarity import sim_df2mtx
 
 
 def generate_clusters(
     df: pd.DataFrame,
     field_name: str,
-    threshold: float,
-    verbose: int,
     sim_df: pd.DataFrame,
+    threshold: float = 0.4,
+    verbose: int = 0,
     cluster_algorithm: str = 'greedy_incremental',
 ) -> pd.DataFrame:
+    """Generates clusters from a DataFrame.
+
+    :param df: DataFrame with entities to cluster.
+    :type df: pd.DataFrame
+    :param field_name: Name of the field with the entity information
+    (e.g., `protein_sequence` or `structure_path`).
+    :type field_name: str
+    :param threshold: Similarity value above which entities will be
+    considered similar, defaults to 0.4
+    :type threshold: float
+    :param sim_df: DataFrame with similarities (`metric`) between
+    `query` and `target`, it is the product of `calculate_similarity` function
+    :type sim_df: pd.DataFrame
+    :param verbose: How much information will be displayed.
+    Options:
+        - 0: Errors,
+        - 1: Warnings,
+        - 2: All
+    Defaults to 0
+    :type verbose: int
+    :param cluster_algorithm: Clustering algorithm to use.
+    Options:
+        - `CDHIT` or `greedy_incremental`
+        - `greedy_cover_set`
+        - `connected_components`
+    Defaults to "greedy_incremental".
+    :type cluster_algorithm: str, optional
+    :raises NotImplementedError: Clustering algorithm is not supported
+    :return: DataFrame with entities and the cluster they belong to.
+    :rtype: pd.DataFrame
+    """
     start = time.time()
     if cluster_algorithm in ['greedy_incremental', 'CDHIT']:
-        cluster_df = greedy_incremental_clustering(df, field_name, sim_df,
-                                                   threshold, verbose)
+        cluster_df = _greedy_incremental_clustering(df, field_name, sim_df,
+                                                    threshold, verbose)
     elif cluster_algorithm in ['greedy_cover_set']:
-        cluster_df = greedy_cover_set(df, sim_df, threshold, verbose)
+        cluster_df = _greedy_cover_set(df, sim_df, threshold, verbose)
     elif cluster_algorithm in ['connected_components']:
-        cluster_df = connected_components_clustering(df, sim_df, threshold,
-                                                     verbose)
+        cluster_df = _connected_components_clustering(df, sim_df, threshold,
+                                                      verbose)
     else:
         raise NotImplementedError(
             f'Clustering algorithm: {cluster_algorithm} is not supported'
@@ -33,7 +64,7 @@ def generate_clusters(
     return cluster_df
 
 
-def greedy_incremental_clustering(
+def _greedy_incremental_clustering(
     df: pd.DataFrame,
     field_name: str,
     sim_df: pd.DataFrame,
@@ -72,7 +103,7 @@ def greedy_incremental_clustering(
     return cluster_df
 
 
-def greedy_cover_set(
+def _greedy_cover_set(
     df: pd.DataFrame,
     sim_df: pd.DataFrame,
     threshold: float,
@@ -117,7 +148,7 @@ def _find_connectivity(df, sim_df):
     return cluster_df
 
 
-def connected_components_clustering(
+def _connected_components_clustering(
     df: pd.DataFrame,
     sim_df: pd.DataFrame,
     threshold: float,

diff --git a/hestia/partition.py b/hestia/partition.py
@@ -7,7 +7,7 @@
 
 from sklearn.model_selection import train_test_split
 
-from hestia.alignment import calculate_similarity, sim_df2mtx
+from hestia.similarity import calculate_similarity, sim_df2mtx
 from hestia.clustering import generate_clusters
 from hestia.reduction import similarity_reduction
 from hestia.utils import (_assign_partitions, _cluster_reassignment,
@@ -155,7 +155,7 @@ def connected_components_partition(
         - "matrix": "EBLOSUM62"
     :type config: dict, optional
     :param sim_df:  DataFrame with similarities (`metric`) between
-    `query` and `target`, it is the product of `similarity` function,
+    `query` and `target`, it is the product of `calculate_similarity` function,
     defaults to None
     :type sim_df: Optional[pd.DataFrame], optional
     :return: A tuple with the indexes of training and evaluation samples

diff --git a/hestia/similarity.py b/hestia/similarity.py
@@ -13,7 +13,7 @@
 
 def sim_df2mtx(sim_df: pd.DataFrame,
                threshold: float = 0.05) -> spr.bsr_matrix:
-    """Generates a similarity matrix from 
+    """Generates a similarity matrix from
     a DataFrame with the results from similarity
     calculations in the form of `query`, `target`,
     and `metric`.