theochem · FanwangM · Oct 6, 2024 · Oct 5, 2024 · Oct 6, 2024 · Oct 6, 2024
diff --git a/.coveragerc b/.coveragerc
@@ -7,3 +7,7 @@ omit =
 
 [report]
 show_missing = True
+exclude_also =
+    pragma: no cover
+    raise NotImplementedError
+    if __name__ == .__main__.:
diff --git a/selector/methods/base.py b/selector/methods/base.py
@@ -25,6 +25,7 @@
 
 import warnings
 from abc import ABC, abstractmethod
+from typing import List, Iterable, Union
 
 import numpy as np
 
@@ -34,7 +35,13 @@
 class SelectionBase(ABC):
     """Base class for selecting subset of sample points."""
 
-    def select(self, x: np.ndarray, size: int, labels: np.ndarray = None) -> np.ndarray:
+    def select(
+        self,
+        x: np.ndarray,
+        size: int,
+        labels: np.ndarray = None,
+        proportional_selection: bool = True,
+    ) -> Union[List, Iterable]:
         """Return indices representing subset of sample points.
 
         Parameters
@@ -48,6 +55,10 @@ def select(self, x: np.ndarray, size: int, labels: np.ndarray = None) -> np.ndar
             Array of integers or strings representing the labels of the clusters that
             each sample belongs to. If `None`, the samples are treated as one cluster.
             If labels are provided, selection is made from each cluster.
+        proportional_selection: bool, optional
+            If True, the number of samples selected from each cluster is proportional to the
+            cluster population. Otherwise, an equal number of samples is selected from each
+            cluster. Default is True.
 
         Returns
         -------
@@ -70,52 +81,92 @@ def select(self, x: np.ndarray, size: int, labels: np.ndarray = None) -> np.ndar
                 f"Number of labels {len(labels)} does not match number of samples {len(x)}."
             )
 
+        selected_ids = []
+
         # compute the number of samples (i.e. population or pop) in each cluster
-        unique_labels = np.unique(labels)
+        unique_labels, unique_label_counts = np.unique(labels, return_counts=True)
         num_clusters = len(unique_labels)
-        pop_clusters = {
-            unique_label: len(np.where(labels == unique_label)[0]) for unique_label in unique_labels
-        }
+        pop_clusters = dict(zip(unique_labels, unique_label_counts))
         # compute number of samples to be selected from each cluster
-        n = size // num_clusters
-
-        # update number of samples to select from each cluster based on the cluster population.
-        # this is needed when some clusters do not have enough samples in them (pop < n) and
-        # needs to be done iteratively until all remaining clusters have at least n samples
-        selected_ids = []
-        while np.any([value <= n for value in pop_clusters.values() if value != 0]):
-            for unique_label in unique_labels:
-                if pop_clusters[unique_label] != 0:
-                    # get index of sample labelled with unique_label
-                    cluster_ids = np.where(labels == unique_label)[0]
-                    if len(cluster_ids) <= n:
-                        # all samples in the cluster are selected & population becomes zero
-                        selected_ids.append(cluster_ids)
-                        pop_clusters[unique_label] = 0
-            # update number of samples to be selected from each cluster
-            totally_used_clusters = list(pop_clusters.values()).count(0)
-            n = (size - len(np.hstack(selected_ids))) // (num_clusters - totally_used_clusters)
-
-            warnings.warn(
-                f"Number of molecules in one cluster is less than"
-                f" {size}/{num_clusters}.\nNumber of selected "
-                f"molecules might be less than desired.\nIn order to avoid this "
-                f"problem. Try to use less number of clusters"
-            )
-
-        for unique_label in unique_labels:
+        if proportional_selection:
+            # share of `size` for each cluster, proportional to the cluster population
+            size_each_cluster = size * unique_label_counts / len(labels)
+            # using np.round to get to the nearest integer
+            # not using int function directly to avoid truncation of decimal values
+            size_each_cluster = np.round(size_each_cluster).astype(int)
+            # make sure each cluster has at least one sample
+            size_each_cluster[size_each_cluster < 1] = 1
+
+            # the total number of samples selected from all clusters at this point
+            size_each_cluster_total = np.sum(size_each_cluster)
+            # when the rounded per-cluster sizes sum to less than the required number, add one
+            # sample to the smallest cluster iteratively until the total is equal to the required
+            # number
+            if size_each_cluster_total < size:
+                while size_each_cluster_total < size:
+                    # NOTE(review): meant as remaining points per cluster, but subtracts the total
+                    size_each_cluster_remaining = unique_label_counts - size_each_cluster_total
+                    # skip clusters with no data points left; NOTE(review): np.inf in an int array?
+                    size_each_cluster_remaining[size_each_cluster_remaining == 0] = np.inf
+                    smallest_cluster_index = np.argmin(size_each_cluster_remaining)
+                    size_each_cluster[smallest_cluster_index] += 1
+                    size_each_cluster_total += 1
+            # when the rounded per-cluster sizes sum to more than the required number, remove one
+            # sample at a time from the cluster with the largest allocation
+            elif size_each_cluster_total > size:
+                while size_each_cluster_total > size:
+                    largest_cluster_index = np.argmax(size_each_cluster)
+                    size_each_cluster[largest_cluster_index] -= 1
+                    size_each_cluster_total -= 1
+            # perfect case where the total is equal to the required number
+            else:
+                pass
+        else:
+            size_each_cluster = size // num_clusters
+
+            # update number of samples to select from each cluster based on the cluster population.
+            # this is needed when some clusters do not have enough samples in them
+            # (pop < size_each_cluster) and needs to be done iteratively until all remaining clusters
+            # have at least size_each_cluster samples
+            while np.any(
+                [value <= size_each_cluster for value in pop_clusters.values() if value != 0]
+            ):
+                for unique_label in unique_labels:
+                    if pop_clusters[unique_label] != 0:
+                        # get index of sample labelled with unique_label
+                        cluster_ids = np.where(labels == unique_label)[0]
+                        if len(cluster_ids) <= size_each_cluster:
+                            # all samples in the cluster are selected & population becomes zero
+                            selected_ids.append(cluster_ids)
+                            pop_clusters[unique_label] = 0
+                # update number of samples to be selected from each cluster
+                totally_used_clusters = list(pop_clusters.values()).count(0)
+                size_each_cluster = (size - len(np.hstack(selected_ids))) // (
+                    num_clusters - totally_used_clusters
+                )
+
+                warnings.warn(
+                    f"Number of molecules in one cluster is less than"
+                    f" {size}/{num_clusters}.\nNumber of selected "
+                    f"molecules might be less than desired.\nIn order to avoid this "
+                    f"problem. Try to use less number of clusters."
+                )
+            # save the number of samples to be selected from each cluster in an array
+            size_each_cluster = np.full(num_clusters, size_each_cluster)
+
+        for unique_label, size_sub in zip(unique_labels, size_each_cluster):
             if pop_clusters[unique_label] != 0:
-                # sample n ids from cluster labeled unique_label
+                # sample size_sub ids from the cluster labeled unique_label
                 cluster_ids = np.where(labels == unique_label)[0]
-                selected = self.select_from_cluster(x, n, cluster_ids)
+                selected = self.select_from_cluster(x, size_sub, cluster_ids)
                 selected_ids.append(cluster_ids[selected])
 
         return np.hstack(selected_ids).flatten().tolist()
 
     @abstractmethod
     def select_from_cluster(
         self, x: np.ndarray, size: int, labels: np.ndarray = None
-    ) -> np.ndarray:
+    ) -> np.ndarray: # pragma: no cover
         """Return indices representing subset of sample points from one cluster.
 
         Parameters

diff --git a/selector/methods/distance.py b/selector/methods/distance.py
@@ -26,6 +26,7 @@
 import bitarray
 import numpy as np
 from scipy import spatial
+from typing import List, Iterable, Union
 
 from selector.methods.base import SelectionBase
 from selector.methods.utils import optimize_radius
@@ -86,7 +87,7 @@ def __init__(self, fun_dist=None, ref_index=None):
         self.fun_dist = fun_dist
         self.ref_index = ref_index
 
-    def select_from_cluster(self, x, size, labels=None):
+    def select_from_cluster(self, x, size, labels=None) -> Union[List, Iterable]:
         """Return selected samples from a cluster based on MaxMin algorithm.
 
         Parameters
@@ -102,7 +103,7 @@ def select_from_cluster(self, x, size, labels=None):
 
         Returns
         -------
-        selected : list
+        selected : Union[List, Iterable]
             List of indices of selected samples.
         """
         # calculate pairwise distance between points
@@ -134,6 +135,8 @@ def select_from_cluster(self, x, size, labels=None):
             new_id = np.argmax(min_distances)
             selected.append(new_id)
 
+        selected = [int(i) for i in selected]
+
         return selected
 
 
@@ -184,7 +187,7 @@ def __init__(self, fun_dist=None, ref_index=None):
         self.fun_dist = fun_dist
         self.ref_index = ref_index
 
-    def select_from_cluster(self, x, size, labels=None):
+    def select_from_cluster(self, x, size, labels=None) -> Union[List, Iterable]:
         """Return selected samples from a cluster based on MaxSum algorithm.
 
         Parameters
@@ -200,7 +203,7 @@ def select_from_cluster(self, x, size, labels=None):
 
         Returns
         -------
-        selected : list
+        selected : Union[List, Iterable]
             List of indices of selected samples.
 
         """
@@ -237,6 +240,8 @@ def select_from_cluster(self, x, size, labels=None):
                 # already-selected points
                 new_id = np.argmax(sum_distances)
             selected.append(new_id)
+
+        selected = [int(i) for i in selected]
         return selected
 
 
@@ -261,6 +266,7 @@ class 0 and `ref_index=[3, 6]` class 1 respectively.
     References
     ----------
     [1] J. Chem. Inf. Comput. Sci. 1997, 37, 6, 1181–1188. https://doi.org/10.1021/ci970282v
+
     """
 
     def __init__(
@@ -330,7 +336,7 @@ def __init__(
         self.random_seed = random_seed
         self.fun_dist = fun_dist
 
-    def algorithm(self, x, max_size) -> list:
+    def algorithm(self, x, max_size) -> Union[List, Iterable]:
         """Return selected sample indices based on OptiSim algorithm.
 
         Parameters
@@ -342,7 +348,7 @@ def algorithm(self, x, max_size) -> list:
 
         Returns
         -------
-        selected : list
+        selected : Union[List, Iterable]
             List of indices of selected sample indices.
 
         """
@@ -402,7 +408,7 @@ def algorithm(self, x, max_size) -> list:
 
         return selected
 
-    def select_from_cluster(self, x, size, labels=None):
+    def select_from_cluster(self, x, size, labels=None) -> Union[List, Iterable]:
         """Return selected samples from a cluster based on OptiSim algorithm.
 
         Parameters
@@ -416,7 +422,7 @@ def select_from_cluster(self, x, size, labels=None):
 
         Returns
         -------
-        selected : list
+        selected : Union[List, Iterable]
             List of indices of selected samples.
 
         """
@@ -459,9 +465,7 @@ class DISE(SelectionBase):
 
     """
 
-    def __init__(
-        self, r0=None, ref_index=None, tol=0.05, n_iter=10, p=2.0, eps=0.0, fun_dist=None
-    ):
+    def __init__(self, r0=None, ref_index=None, tol=0.05, n_iter=10, p=2.0, eps=0.0, fun_dist=None):
         """
         Initialize class.
 
@@ -511,7 +515,7 @@ def __init__(
         #     self.fun_dist = fun_dist
         self.fun_dist = fun_dist
 
-    def algorithm(self, x, max_size):
+    def algorithm(self, x, max_size) -> Union[List, Iterable]:
         """Return selected samples based on directed sphere exclusion algorithm.
 
         Parameters
@@ -523,7 +527,7 @@ def algorithm(self, x, max_size):
 
         Returns
         -------
-        selected: list
+        selected: Union[List, Iterable]
             List of indices of selected samples.
 
         """
@@ -593,7 +597,7 @@ def algorithm(self, x, max_size):
 
         return selected
 
-    def select_from_cluster(self, x, size, labels=None):
+    def select_from_cluster(self, x, size, labels=None) -> Union[List, Iterable]:
         """Return selected samples from a cluster based on directed sphere exclusion algorithm
 
         Parameters
@@ -607,7 +611,7 @@ def select_from_cluster(self, x, size, labels=None):
 
         Returns
         -------
-        selected: list
+        selected: Union[List, Iterable]
             List of indices of selected samples.
 
         """
@@ -625,7 +629,7 @@ def select_from_cluster(self, x, size, labels=None):
         return optimize_radius(self, x, size, labels)
 
 
-def get_initial_selection(x=None, x_dist=None, ref_index=None, fun_dist=None):
+def get_initial_selection(x=None, x_dist=None, ref_index=None, fun_dist=None) -> List:
     """Set up the reference index for selecting.
 
     Parameters
@@ -650,7 +654,7 @@ def get_initial_selection(x=None, x_dist=None, ref_index=None, fun_dist=None):
 
     Returns
     -------
-    initial_selections: list
+    initial_selections: List
         List of indices of the initial selected data points.
 
     """

diff --git a/selector/methods/partition.py b/selector/methods/partition.py
@@ -666,4 +666,3 @@ def select_from_cluster(self, arr, num_selected, cluster_ids=None):
                 )
             count += 1
         return selected
-