Commit dcf3528: roi movemstats
sronilsson committed Jan 17, 2025 · 1 parent 5c8d4f5
Showing 16 changed files with 524 additions and 363 deletions.
20 changes: 12 additions & 8 deletions docs/nb/shap_example_2.ipynb
@@ -25,7 +25,7 @@
"source": [
"from simba.mixins.train_model_mixin import TrainModelMixin\n",
"from simba.mixins.config_reader import ConfigReader\n",
"from simba.utils.read_write import read_df, read_config_file\n",
"from simba.utils.read_write import read_config_file, read_pickle\n",
"import glob"
]
},
@@ -54,7 +54,7 @@
"# READ IN THE CONFIG AND THE CLASSIFIER\n",
"config = read_config_file(config_path=CONFIG_PATH)\n",
"config_object = ConfigReader(config_path=CONFIG_PATH)\n",
"clf = read_df(file_path=CLASSIFIER_PATH, file_type='pickle')"
"clf = read_pickle(data_path=CLASSIFIER_PATH)"
]
},
{
@@ -192,15 +192,19 @@
}
],
"source": [
"TrainModelMixin().create_shap_log_mp(ini_file_path=CONFIG_PATH,\n",
" rf_clf=clf,\n",
" x_df=data,\n",
" y_df=target_df,\n",
" x_names=data.columns,\n",
"TrainModelMixin().create_shap_log_mp(rf_clf=clf,\n",
" x=data,\n",
" y=target_df,\n",
" x_names=list(data.columns),\n",
" clf_name=CLASSIFIER_NAME,\n",
" cnt_present=COUNT_PRESENT,\n",
" cnt_absent=COUNT_ABSENT,\n",
" save_path=config_object.logs_path)"
" core_cnt=2,\n",
" chunk_size=100,\n",
" verbose=True,\n",
" save_dir=config_object.logs_path,\n",
" save_file_suffix=1,\n",
" plot=True)"
]
},
{
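Taken together, the notebook edits track two API changes in this commit: classifier deserialization moves from `read_df(..., file_type='pickle')` to the dedicated `read_pickle` helper, and `create_shap_log_mp` drops `ini_file_path` in favor of explicit `x`/`y`/`save_dir` keywords plus multiprocessing controls. A minimal sketch of the new loading idiom (the constants are the notebook's placeholders, not new API):

from simba.utils.read_write import read_pickle

# replaces: clf = read_df(file_path=CLASSIFIER_PATH, file_type='pickle')
clf = read_pickle(data_path=CLASSIFIER_PATH)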
2 changes: 1 addition & 1 deletion setup.py
@@ -29,7 +29,7 @@
# Setup configuration
setuptools.setup(
name="simba-uw-tf-dev",
version="2.4.8",
version="2.5.1",
author="Simon Nilsson, Jia Jie Choong, Sophia Hwang",
author_email="[email protected]",
description="Toolkit for computer classification and analysis of behaviors in experimental animals",
4 changes: 4 additions & 0 deletions simba/data_processors/cuda/create_shap_log.py
@@ -38,6 +38,10 @@ def create_shap_log(rf_clf: Union[str, os.PathLike, RandomForestClassifier],
:width: 500
:align: center
+    .. note::
+       (i) The SHAP library has to be installed from the git repository rather than from pip: `pip install git+https://github.com/slundberg/shap.git`.
+       (ii) The scikit-learn model cannot be built with max_depth > 31. You can set this in the SimBA config under [create ensemble settings][rf_max_depth], or via `rf_max_depth` in the config CSVs.
:param Union[str, os.PathLike, RandomForestClassifier] rf_clf: Trained RandomForestClassifier model or path to the saved model. Can be a string, os.PathLike object, or an instance of RandomForestClassifier.
:param Union[pd.DataFrame, np.ndarray] x: Input features used for SHAP value computation. Can be a pandas DataFrame or numpy ndarray.
:param Union[pd.DataFrame, pd.Series, np.ndarray] y: Target labels corresponding to the input features. Can be a pandas DataFrame, pandas Series, or numpy ndarray with 0 and 1 values.
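The depth ceiling in the note above maps directly onto the scikit-learn constructor; a minimal sketch of a compliant model (the `n_estimators` value is illustrative, not from the commit):

from sklearn.ensemble import RandomForestClassifier

# SHAP export requires max_depth <= 31; mirrors [create ensemble settings][rf_max_depth]
clf = RandomForestClassifier(n_estimators=2000, max_depth=31)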
104 changes: 100 additions & 4 deletions simba/data_processors/cuda/statistics.py
@@ -9,14 +9,16 @@
import numpy as np
from numba import cuda
from scipy.spatial import ConvexHull

-from simba.utils.read_write import read_df
+from simba.utils.read_write import read_df, get_unique_values_in_iterable
+from simba.utils.warnings import GPUToolsWarning

try:
    import cupy as cp
    from cupyx.scipy.spatial.distance import cdist
except:
+    GPUToolsWarning(msg='GPU tools not detected, reverting to CPU')
    from scipy.spatial.distance import cdist
    import numpy as cp
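    # NOTE (editor sketch): aliasing numpy as `cp` keeps the cupy-style array code in this
    # module importable and runnable on machines without a GPU, just at CPU speed.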

try:
    from cuml.cluster import KMeans
except:
@@ -500,6 +502,7 @@ def euclidean_distance_to_static_point(data: np.ndarray,
:param pixels_per_millimeter: A scaling factor that indicates how many pixels correspond to one millimeter. Defaults to 1 if no scaling is necessary.
:param centimeter: A flag to indicate whether the output distances should be converted from millimeters to centimeters. If True, the result is divided by 10. Defaults to False (millimeters).
:param batch_size: The number of points to process in each batch to avoid memory overflow on the GPU. The default batch size is set to 65 million points (6.5e+7). Adjust this parameter based on GPU memory capacity.
:return: A 1D array of distances between each point in `data` and the static `point`, either in millimeters or centimeters depending on the `centimeter` flag.
:rtype: np.ndarray
"""
@@ -514,7 +517,7 @@
        results[l:r] = cdist(batch_data, point).astype(np.float32) / pixels_per_millimeter
    if centimeter:
        results = results / 10
-    return results.get
+    return results.get()
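    # .get() copies the cupy result array from GPU memory back into a host numpy array;
    # without the parentheses, the bound method object itself was returned.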


def dunn_index(x: np.ndarray, y: np.ndarray) -> float:
@@ -654,3 +657,96 @@ def kmeans_cuml(data: np.ndarray,

    return (mdl.cluster_centers_, mdl.predict(data))



def xie_beni(x: np.ndarray, y: np.ndarray) -> float:
    """
    Computes the Xie-Beni index for clustering evaluation.

    The score is the ratio between the average intra-cluster variance and the squared minimum distance between cluster centroids, so the index penalizes both loosely packed clusters and clusters that sit too close to each other. A lower Xie-Beni index indicates better clustering quality, signifying compact, well-separated clusters.

    .. seealso::
       To compute the Xie-Beni index on the CPU, use :func:`~simba.mixins.statistics_mixin.Statistics.xie_beni`.
       Significant GPU time savings are detected at about 1m features, 25 clusters.

    :param np.ndarray x: The dataset as a 2D NumPy array of shape (n_samples, n_features).
    :param np.ndarray y: Cluster labels for each data point as a 1D NumPy array of shape (n_samples,).
    :returns: The Xie-Beni score for the dataset.
    :rtype: float

    :example:
    >>> from sklearn.datasets import make_blobs
    >>> X, y = make_blobs(n_samples=100000, centers=40, n_features=600, random_state=0, cluster_std=0.3)
    >>> xie_beni(x=X, y=y)

    :references:
    .. [1] X. L. Xie, G. Beni (1991). A validity measure for fuzzy clustering. IEEE Transactions on Pattern Analysis and Machine Intelligence, 13(8), 841-847. DOI: 10.1109/34.85677
    """
    check_valid_array(data=x, accepted_ndims=(2,), accepted_dtypes=Formats.NUMERIC_DTYPES.value)
    check_valid_array(data=y, accepted_ndims=(1,), accepted_dtypes=Formats.NUMERIC_DTYPES.value, accepted_axis_0_shape=[x.shape[0], ])
    _ = get_unique_values_in_iterable(data=y, name=xie_beni.__name__, min=2)
    x, y = cp.array(x), cp.array(y)
    cluster_ids = cp.unique(y)
    centroids = cp.full(shape=(cluster_ids.shape[0], x.shape[1]), fill_value=-1.0, dtype=cp.float32)
    intra_centroid_distances = cp.full(shape=(y.shape[0]), fill_value=-1.0, dtype=cp.float32)
    obs_cnt = 0
    for cnt, cluster_id in enumerate(cluster_ids):
        cluster_obs = x[cp.argwhere(y == cluster_id).flatten()]
        centroids[cnt] = cp.mean(cluster_obs, axis=0)
        intra_dist = cp.linalg.norm(cluster_obs - centroids[cnt], axis=1)
        intra_centroid_distances[obs_cnt: cluster_obs.shape[0] + obs_cnt] = intra_dist
        obs_cnt += cluster_obs.shape[0]
    compactness = cp.mean(cp.square(intra_centroid_distances))
    cluster_dists = cdist(centroids, centroids).flatten()
    d = cp.square(cluster_dists[cp.argwhere(cluster_dists > 0).flatten()])
    separation = cp.min(d)
    xb = compactness / separation
    return xb
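# NOTE (editor sketch): with the squared separations above, the value returned is
#   XB = (1/n) * sum_i ||x_i - c_{y_i}||^2 / min_{j != k} ||c_j - c_k||^2,
# i.e., the mean squared intra-cluster distance over the smallest squared centroid distance.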


def i_index(x: np.ndarray, y: np.ndarray) -> float:
    """
    Calculate the I-Index for evaluating clustering quality.

    The I-Index measures the compactness and separation of clusters. A higher I-Index indicates better clustering, with compact and well-separated clusters.

    .. seealso::
       To compute the I-Index on the CPU, use :func:`~simba.mixins.statistics_mixin.Statistics.i_index`.

    :param np.ndarray x: The dataset as a 2D NumPy array of shape (n_samples, n_features).
    :param np.ndarray y: Cluster labels for each data point as a 1D NumPy array of shape (n_samples,).
    :returns: The I-Index score for the dataset.
    :rtype: float

    :references:
    .. [1] Zhao, Q., Xu, M., Fränti, P. (2009). Sum-of-Squares Based Cluster Validity Index and Significance Analysis. In: Kolehmainen, M., Toivanen, P., Beliczynski, B. (eds) Adaptive and Natural Computing Algorithms. ICANNGA 2009. Lecture Notes in Computer Science, vol 5495. Springer, Berlin, Heidelberg. https://doi.org/10.1007/978-3-642-04921-7_32

    :example:
    >>> from sklearn.datasets import make_blobs
    >>> X, y = make_blobs(n_samples=5000, centers=20, n_features=3, random_state=0, cluster_std=0.1)
    >>> i_index(x=X, y=y)
    """
    check_valid_array(data=x, accepted_ndims=(2,), accepted_dtypes=Formats.NUMERIC_DTYPES.value)
    check_valid_array(data=y, accepted_ndims=(1,), accepted_dtypes=Formats.NUMERIC_DTYPES.value, accepted_axis_0_shape=[x.shape[0], ])
    _ = get_unique_values_in_iterable(data=y, name=i_index.__name__, min=2)
    x, y = cp.array(x), cp.array(y)
    unique_y = cp.unique(y)
    n_y = unique_y.shape[0]
    global_centroid = cp.mean(x, axis=0)
    sst = cp.sum(cp.linalg.norm(x - global_centroid, axis=1) ** 2)

    swc = 0
    for cluster_id in unique_y:
        cluster_obs = x[cp.argwhere(y == cluster_id).flatten()]
        cluster_centroid = cp.mean(cluster_obs, axis=0)
        swc += cp.sum(cp.linalg.norm(cluster_obs - cluster_centroid, axis=1) ** 2)

    i_idx = sst / (n_y * swc)
    return i_idx
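# NOTE (editor sketch): as implemented, the score is SST / (K * SWC), where SST is the total
# sum of squared distances to the global centroid, SWC the pooled within-cluster sum of
# squares, and K the number of clusters; larger values indicate tighter, better-separated clusters.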
