Skip to content

Commit

Permalink
📝 Documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
Raúl Fernández Díaz committed Feb 27, 2024
1 parent 6fb7d23 commit f818b27
Show file tree
Hide file tree
Showing 5 changed files with 47 additions and 16 deletions.
2 changes: 1 addition & 1 deletion docs/clustering.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# Clustering

::: hestia.clustering.generate_clusters
::: hestia.clustering
2 changes: 1 addition & 1 deletion docs/reduction.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# Similarity reduction

::: hestia.reduction.similarity_reduction
::: hestia.reduction
53 changes: 42 additions & 11 deletions hestia/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,26 +3,57 @@
import pandas as pd
from scipy.sparse.csgraph import connected_components

from hestia.alignment import sim_df2mtx
from hestia.similarity import sim_df2mtx


def generate_clusters(
df: pd.DataFrame,
field_name: str,
threshold: float,
verbose: int,
sim_df: pd.DataFrame,
threshold: float = 0.4,
verbose: int = 0,
cluster_algorithm: str = 'greedy_incremental',
) -> pd.DataFrame:
"""Generates clusters from a DataFrame.
:param df: DataFrame with entities to cluster.
:type df: pd.DataFrame
:param field_name: Name of the field with the entity information
(e.g., `protein_sequence` or `structure_path`).
:type field_name: str
:param threshold: Similarity value above which entities will be
considered similar, defaults to 0.4
:type threshold: float
:param sim_df: DataFrame with similarities (`metric`) between
`query` and `target`; it is the output of the `calculate_similarity` function
:type sim_df: pd.DataFrame
:param verbose: How much information will be displayed.
Options:
- 0: Errors,
- 1: Warnings,
- 2: All
Defaults to 0
:type verbose: int
:param cluster_algorithm: Clustering algorithm to use.
Options:
- `CDHIT` or `greedy_incremental`
- `greedy_cover_set`
- `connected_components`
Defaults to `greedy_incremental`.
:type cluster_algorithm: str, optional
:raises NotImplementedError: Clustering algorithm is not supported
:return: DataFrame with entities and the cluster they belong to.
:rtype: pd.DataFrame
"""
start = time.time()
if cluster_algorithm in ['greedy_incremental', 'CDHIT']:
cluster_df = greedy_incremental_clustering(df, field_name, sim_df,
threshold, verbose)
cluster_df = _greedy_incremental_clustering(df, field_name, sim_df,
threshold, verbose)
elif cluster_algorithm in ['greedy_cover_set']:
cluster_df = greedy_cover_set(df, sim_df, threshold, verbose)
cluster_df = _greedy_cover_set(df, sim_df, threshold, verbose)
elif cluster_algorithm in ['connected_components']:
cluster_df = connected_components_clustering(df, sim_df, threshold,
verbose)
cluster_df = _connected_components_clustering(df, sim_df, threshold,
verbose)
else:
raise NotImplementedError(
f'Clustering algorithm: {cluster_algorithm} is not supported'
Expand All @@ -33,7 +64,7 @@ def generate_clusters(
return cluster_df


def greedy_incremental_clustering(
def _greedy_incremental_clustering(
df: pd.DataFrame,
field_name: str,
sim_df: pd.DataFrame,
Expand Down Expand Up @@ -72,7 +103,7 @@ def greedy_incremental_clustering(
return cluster_df


def greedy_cover_set(
def _greedy_cover_set(
df: pd.DataFrame,
sim_df: pd.DataFrame,
threshold: float,
Expand Down Expand Up @@ -117,7 +148,7 @@ def _find_connectivity(df, sim_df):
return cluster_df


def connected_components_clustering(
def _connected_components_clustering(
df: pd.DataFrame,
sim_df: pd.DataFrame,
threshold: float,
Expand Down
4 changes: 2 additions & 2 deletions hestia/partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from sklearn.model_selection import train_test_split

from hestia.alignment import calculate_similarity, sim_df2mtx
from hestia.similarity import calculate_similarity, sim_df2mtx
from hestia.clustering import generate_clusters
from hestia.reduction import similarity_reduction
from hestia.utils import (_assign_partitions, _cluster_reassignment,
Expand Down Expand Up @@ -155,7 +155,7 @@ def connected_components_partition(
- "matrix": "EBLOSUM62"
:type config: dict, optional
:param sim_df: DataFrame with similarities (`metric`) between
`query` and `target`, it is the product of `similarity` function,
`query` and `target`; it is the output of the `calculate_similarity` function,
defaults to None
:type sim_df: Optional[pd.DataFrame], optional
:return: A tuple with the indexes of training and evaluation samples
Expand Down
2 changes: 1 addition & 1 deletion hestia/similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

def sim_df2mtx(sim_df: pd.DataFrame,
threshold: float = 0.05) -> spr.bsr_matrix:
"""Generates a similarity matrix from
"""Generates a similarity matrix from
a DataFrame with the results from similarity
calculations in the form of `query`, `target`,
and `metric`.
Expand Down

0 comments on commit f818b27

Please sign in to comment.