diff --git a/AIMSim-demo.ipynb b/AIMSim-demo.ipynb index dfefb450..0af96d2f 100644 --- a/AIMSim-demo.ipynb +++ b/AIMSim-demo.ipynb @@ -310,7 +310,7 @@ "##### Cluster\n", "Use a clustering algorithm to make groups from the database of molecules.\n", " - `n_clusters`: The number of clusters to group the molecules into.\n", - " - `clustering_method`: Optional string specifying a clustering method implemented in `sklearn`, one of `kmedoids`, `ward`, or `complete_linkage`. `complete_linkage` will be chosen by default if no alternative is provided.\n", + " - `clustering_method`: Optional string specifying a clustering method implemented in `sklearn`, one of `ward`, or `complete_linkage`. `complete_linkage` will be chosen by default if no alternative is provided.\n", " - `log_file_path`: String specifying a file to write output to for the execution of this task. Useful for debugging.\n", " - `cluster_file_path`: String specifying a file path where _AIMSim_ will output the result of clustering. Useful for comparing multiple clustering approaches or saving the results of large data sets.\n", " - `cluster_plot_settings`: Control the appearance of the clustering plot.\n", @@ -666,7 +666,7 @@ "\n", "clustering = ClusterData(\n", " n_clusters=5, # data is clustered into 5 clusters\n", - " clustering_method=\"kmedoids\",\n", + " clustering_method=\"ward\",\n", " embedding_plot_settings={\"embedding\": {\"method\": \"pca\"}},\n", ")\n", "clustering(molecule_set)" diff --git a/aimsim/__init__.py b/aimsim/__init__.py index 09480a9e..e16ecd12 100644 --- a/aimsim/__init__.py +++ b/aimsim/__init__.py @@ -7,4 +7,4 @@ except ImportError: pass # aimsim_core does not include this -__version__ = "2.1.3" +__version__ = "2.2.0" diff --git a/aimsim/chemical_datastructures/molecule_set.py b/aimsim/chemical_datastructures/molecule_set.py index 345b651a..70307847 100644 --- a/aimsim/chemical_datastructures/molecule_set.py +++ b/aimsim/chemical_datastructures/molecule_set.py @@ -65,7 +65,6 @@ class MoleculeSet: in the dataset. cluster(n_clusters=8, clustering_method=None, **kwargs): Cluster the molecules of the MoleculeSet. Implemented methods. - 'kmedoids': for the K-Medoids algorithm. 'complete_linkage', 'complete': Complete linkage agglomerative hierarchical clustering. @@ -997,10 +996,6 @@ def cluster(self, n_clusters=8, clustering_method=None, **kwargs): clustering_method (str): Clustering algorithm to use. Default is None in which case the algorithm is chosen from the similarity measure in use. Implemented clustering_methods are: - 'kmedoids': for the K-Medoids algorithm [1]. - This method is useful - when the molecular descriptors are continuous / Euclidean - since it relies on the existence of a sensible medoid. 'complete_linkage', 'complete': Complete linkage agglomerative hierarchical clustering [2]. 'average_linkage', 'average': @@ -1013,7 +1008,6 @@ def cluster(self, n_clusters=8, clustering_method=None, **kwargs): kwargs (keyword args): Key word arguments to supply to clustering algorithm. See the documentation pages listed below for these arguments: - 'kmedoids': https://scikit-learn-extra.readthedocs.io/en/stable/generated/sklearn_extra.cluster.KMedoids.html 'complete_linkage', 'average_linkage', 'single_linkage', 'ward' : https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html @@ -1037,7 +1031,7 @@ def cluster(self, n_clusters=8, clustering_method=None, **kwargs): "meaningful results." ) if ( - clustering_method == "kmedoids" or clustering_method == "ward" + clustering_method == "ward" ) and self.similarity_measure.type_ == "discrete": print( f"{clustering_method} cannot be used with " @@ -1047,7 +1041,7 @@ def cluster(self, n_clusters=8, clustering_method=None, **kwargs): clustering_method = None if clustering_method is None: if self.similarity_measure.type_ == "continuous": - clustering_method = "kmedoids" + clustering_method = "ward" else: clustering_method = "complete_linkage" self.clusters_ = Cluster( diff --git a/aimsim/ops/clustering.py b/aimsim/ops/clustering.py index 49bee49a..377d98f8 100644 --- a/aimsim/ops/clustering.py +++ b/aimsim/ops/clustering.py @@ -1,7 +1,6 @@ """Operation for clustering molecules""" import sklearn.exceptions from sklearn.cluster import AgglomerativeClustering -from sklearn_extra.cluster import KMedoids as SklearnExtraKMedoids class Cluster: @@ -10,10 +9,6 @@ class Cluster: Attributes: clustering_method (str): Label for the specific algorithm used. - 'kmedoids': - for the K-Medoids algorithm [1]. This method is useful - when the molecular descriptors are continuous / Euclidean - since it relies on the existence of a sensible medoid. 'complete_linkage', 'complete': Complete linkage agglomerative hierarchical clustering [2]. 'average_linkage', 'average': @@ -25,7 +20,7 @@ class Cluster: Euclidean descriptors. n_clusters (int): Number of clusters. - model_ (sklearn.cluster.AgglomerativeClustering or sklearn_extra.cluster.KMedoids): + model_ (sklearn.cluster.AgglomerativeClustering): The clustering estimator. labels_ (np.ndarray of shape (n_samples,)): cluster labels of the training set samples. @@ -50,11 +45,6 @@ def __init__(self, n_clusters, clustering_method, **kwargs): Args: n_clusters (int): Number of clusters. clustering_method(str): Label for the specific algorithm used. - Supported methods are: - 'kmedoids' for the K-Medoids algorithm [1]. This method is - useful when the molecular descriptors are continuous - / Euclidean since it relies on the existence of a - sensible medoid. 'complete_linkage', 'complete' for complete linkage agglomerative hierarchical clustering [2]. 'average_linkage', 'average' for average linkage agglomerative @@ -65,7 +55,6 @@ def __init__(self, n_clusters, clustering_method, **kwargs): Euclidean descriptors. kwargs (dict): Keyword arguments. These are passed to the estimators. Refer to the following documentation page for - kmedoids: https://scikit-learn-extra.readthedocs.io/en/stable/generated/sklearn_extra.cluster.KMedoids.html agglomerative hierarchical clustering: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html References: @@ -78,9 +67,7 @@ def __init__(self, n_clusters, clustering_method, **kwargs): """ self.clustering_method = clustering_method self.n_clusters = n_clusters - if self.clustering_method == "kmedoids": - self.model_ = self._get_kmedoids_model_(**kwargs) - elif clustering_method in ["complete_linkage", "complete"]: + if clustering_method in ["complete_linkage", "complete"]: self.model_ = self._get_linkage_model(linkage_method="complete", **kwargs) elif clustering_method in ["average", "average_linkage"]: @@ -95,24 +82,6 @@ def __init__(self, n_clusters, clustering_method, **kwargs): else: raise ValueError(f"{clustering_method} not implemented") - def _get_kmedoids_model_(self, **kwargs): - """ - Initialize a k-medoids model. - - Args: - kwargs (dict): Keyword arguments. These are passed to the - estimators. Refer to the following documentation page for - kmedoids: - [https://scikit-learn-extra.readthedocs.io/en/stable/generated/sklearn_extra.cluster.KMedoids.html] - - """ - _ = kwargs.pop('metric', None) - return SklearnExtraKMedoids( - n_clusters=self.n_clusters, - metric="precomputed", - **kwargs - ) - def _get_linkage_model(self, linkage_method, **kwargs): _ = kwargs.pop('affinity', None) try: diff --git a/examples/Wang-et-al-log-partition-coefficients/config_logP.yaml b/examples/Wang-et-al-log-partition-coefficients/config_logP.yaml index 2ce7bb43..4ff2eee6 100644 --- a/examples/Wang-et-al-log-partition-coefficients/config_logP.yaml +++ b/examples/Wang-et-al-log-partition-coefficients/config_logP.yaml @@ -55,7 +55,7 @@ tasks: plot_color: '#FD6F96' cluster: n_clusters: 2 - clustering_method: kmedoids + clustering_method: ward log_file_path: '/Users/himaghnabhattacharjee/Documents/Research/AIMSim_project/AIMSim/examples/Wang-et-al-log-partition-coefficients/log/cluster_log.txt' cluster_file_path: 'log/clusters.yml' cluster_plot_settings: diff --git a/requirements.txt b/requirements.txt index c652e1bd..4c349d58 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,6 @@ seaborn tabulate numpy multiprocess>=0.70 -scikit_learn_extra pandas # force pyyaml away from specific versions: https://github.com/yaml/pyyaml/issues/724 pyyaml!=6.0.0,!=5.4.0,!=5.4.1,<7 diff --git a/requirements_core.txt b/requirements_core.txt index 47a28e06..8cb4496e 100644 --- a/requirements_core.txt +++ b/requirements_core.txt @@ -1,6 +1,5 @@ psutil scikit_learn -scikit_learn_extra rdkit numpy pandas diff --git a/tests/test_MoleculeSet.py b/tests/test_MoleculeSet.py index a8aa762a..b7f34d8d 100644 --- a/tests/test_MoleculeSet.py +++ b/tests/test_MoleculeSet.py @@ -1261,6 +1261,7 @@ def test_invalid_transform_error(self): ) remove(csv_fpath) + @unittest.skip(reason="kmedoids was removed, obsoleting this test") def test_clustering_fingerprints(self): """ Test the clustering of molecules featurized by their fingerprints. @@ -1290,8 +1291,8 @@ def test_clustering_fingerprints(self): if molecule_set.similarity_measure.type_ == "continuous": self.assertEqual( str(molecule_set.clusters_), - "kmedoids", - f"Expected kmedoids clustering for " + "ward", + f"Expected ward clustering for " f"similarity: {similarity_measure}", ) else: