From dcf3528d6d8f20fe7ec1b0bdfaadcf6963198457 Mon Sep 17 00:00:00 2001 From: sronilsson Date: Fri, 17 Jan 2025 15:40:06 -0500 Subject: [PATCH] roi movemstats --- docs/nb/shap_example_2.ipynb | 20 +- setup.py | 2 +- simba/data_processors/cuda/create_shap_log.py | 4 + simba/data_processors/cuda/statistics.py | 104 ++++++- simba/data_processors/movement_calculator.py | 176 +++--------- .../timebins_movement_calculator.py | 93 ++----- .../feature_extraction_supplement_mixin.py | 41 +-- simba/mixins/statistics_mixin.py | 260 +++++++++++++++++- simba/mixins/timeseries_features_mixin.py | 6 +- simba/mixins/train_model_mixin.py | 22 +- simba/model/grid_search_rf.py | 4 +- simba/roi_tools/ROI_analyzer.py | 92 +++---- simba/roi_tools/ROI_time_bin_calculator.py | 38 +-- .../pop_ups/check_videos_seekable_pop_up.py | 2 +- simba/utils/read_write.py | 18 +- simba/utils/warnings.py | 5 + 16 files changed, 524 insertions(+), 363 deletions(-) diff --git a/docs/nb/shap_example_2.ipynb b/docs/nb/shap_example_2.ipynb index f7eeed327..8def1cfa7 100644 --- a/docs/nb/shap_example_2.ipynb +++ b/docs/nb/shap_example_2.ipynb @@ -25,7 +25,7 @@ "source": [ "from simba.mixins.train_model_mixin import TrainModelMixin\n", "from simba.mixins.config_reader import ConfigReader\n", - "from simba.utils.read_write import read_df, read_config_file\n", + "from simba.utils.read_write import read_config_file, read_pickle\n", "import glob" ] }, @@ -54,7 +54,7 @@ "# READ IN THE CONFIG AND THE CLASSIFIER\n", "config = read_config_file(config_path=CONFIG_PATH)\n", "config_object = ConfigReader(config_path=CONFIG_PATH)\n", - "clf = read_df(file_path=CLASSIFIER_PATH, file_type='pickle')" + "clf = read_pickle(data_path=CLASSIFIER_PATH)" ] }, { @@ -192,15 +192,19 @@ } ], "source": [ - "TrainModelMixin().create_shap_log_mp(ini_file_path=CONFIG_PATH,\n", - " rf_clf=clf,\n", - " x_df=data,\n", - " y_df=target_df,\n", - " x_names=data.columns,\n", + "TrainModelMixin().create_shap_log_mp(rf_clf=clf,\n", + " x=data,\n", 
+ " y=target_df,\n", + " x_names=list(data.columns),\n", " clf_name=CLASSIFIER_NAME,\n", " cnt_present=COUNT_PRESENT,\n", " cnt_absent=COUNT_ABSENT,\n", - " save_path=config_object.logs_path)" + " core_cnt=2,\n", + " chunk_size=100,\n", + " verbose=True,\n", + " save_dir=config_object.logs_path,\n", + " save_file_suffix=1,\n", + " plot=True)" ] }, { diff --git a/setup.py b/setup.py index 9d1fea2f4..b141880c3 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ # Setup configuration setuptools.setup( name="simba-uw-tf-dev", - version="2.4.8", + version="2.5.1", author="Simon Nilsson, Jia Jie Choong, Sophia Hwang", author_email="sronilsson@gmail.com", description="Toolkit for computer classification and analysis of behaviors in experimental animals", diff --git a/simba/data_processors/cuda/create_shap_log.py b/simba/data_processors/cuda/create_shap_log.py index f8eab416a..10f2df683 100644 --- a/simba/data_processors/cuda/create_shap_log.py +++ b/simba/data_processors/cuda/create_shap_log.py @@ -38,6 +38,10 @@ def create_shap_log(rf_clf: Union[str, os.PathLike, RandomForestClassifier], :width: 500 :align: center + .. note:: + (i) The SHAP library has to be built from git repo rather than pip: `pip install git+https://github.com/slundberg/shap.git`. + (ii) The scikit model cannot be built using max_depth > 31. You can set this in the SimBA config under [create ensemble settings][rf_max_depth], or `rf_max_depth` in the config CSV's. + :param Union[str, os.PathLike, RandomForestClassifier] rf_clf: Trained RandomForestClassifier model or path to the saved model. Can be a string, os.PathLike object, or an instance of RandomForestClassifier. :param Union[pd.DataFrame, np.ndarray] x: Input features used for SHAP value computation. Can be a pandas DataFrame or numpy ndarray. :param Union[pd.DataFrame, pd.Series, np.ndarray] y: Target labels corresponding to the input features. Can be a pandas DataFrame, pandas Series, or numpy ndarray with 0 and 1 values. 
diff --git a/simba/data_processors/cuda/statistics.py b/simba/data_processors/cuda/statistics.py index 7622addeb..3d81100e3 100644 --- a/simba/data_processors/cuda/statistics.py +++ b/simba/data_processors/cuda/statistics.py @@ -9,14 +9,16 @@ import numpy as np from numba import cuda from scipy.spatial import ConvexHull - -from simba.utils.read_write import read_df - +from simba.utils.read_write import read_df, get_unique_values_in_iterable +from simba.utils.warnings import GPUToolsWarning try: import cupy as cp from cupyx.scipy.spatial.distance import cdist except: + GPUToolsWarning(msg='GPU tools not detected, reverting to CPU') + from scipy.spatial.distance import cdist import numpy as cp + try: from cuml.cluster import KMeans except: @@ -500,6 +502,7 @@ def euclidean_distance_to_static_point(data: np.ndarray, :param pixels_per_millimeter: A scaling factor that indicates how many pixels correspond to one millimeter. Defaults to 1 if no scaling is necessary. :param centimeter: A flag to indicate whether the output distances should be converted from millimeters to centimeters. If True, the result is divided by 10. Defaults to False (millimeters). :param batch_size: The number of points to process in each batch to avoid memory overflow on the GPU. The default batch size is set to 65 million points (6.5e+7). Adjust this parameter based on GPU memory capacity. + :param batch_size: The number of points to process in each batch to avoid memory overflow on the GPU. The default batch size is set to 65 million points (6.5e+7). Adjust this parameter based on GPU memory capacity. :return: A 1D array of distances between each point in `data` and the static `point`, either in millimeters or centimeters depending on the `centimeter` flag. 
:rtype: np.ndarray """ @@ -514,7 +517,7 @@ def euclidean_distance_to_static_point(data: np.ndarray, results[l:r] = cdist(batch_data, point).astype(np.float32) / pixels_per_millimeter if centimeter: results = results / 10 - return results.get + return results.get() def dunn_index(x: np.ndarray, y: np.ndarray) -> float: @@ -654,3 +657,96 @@ def kmeans_cuml(data: np.ndarray, return (mdl.cluster_centers_, mdl.predict(data)) + + +def xie_beni(x: np.ndarray, y: np.ndarray) -> float: + """ + Computes the Xie-Beni index for clustering evaluation. + + The score is calculated as the ratio between the average intra-cluster variance and the squared minimum distance between cluster centroids. This ensures that the index penalizes both loosely packed clusters and clusters that are too close to each other. + + A lower Xie-Beni index indicates better clustering quality, signifying well-separated and compact clusters. + + .. seealso:: + To compute Xie-Beni on the CPU, use :func:`~simba.mixins.statistics_mixin.Statistics.xie_beni` + Significant GPU savings detected at about 1m features, 25 clusters. + + :param np.ndarray x: The dataset as a 2D NumPy array of shape (n_samples, n_features). + :param np.ndarray y: Cluster labels for each data point as a 1D NumPy array of shape (n_samples,). + :returns: The Xie-Beni score for the dataset. + :rtype: float + + :example: + >>> from sklearn.datasets import make_blobs + >>> X, y = make_blobs(n_samples=100000, centers=40, n_features=600, random_state=0, cluster_std=0.3) + >>> xie_beni(x=X, y=y) + + :references: + .. [1] X. L. Xie, G. Beni (1991). A validity measure for fuzzy clustering. + In: IEEE Transactions on Pattern Analysis and Machine Intelligence 13(8), 841 - 847. 
DOI: 10.1109/34.85677 + """ + check_valid_array(data=x, accepted_ndims=(2,), accepted_dtypes=Formats.NUMERIC_DTYPES.value) + check_valid_array(data=y, accepted_ndims=(1,), accepted_dtypes=Formats.NUMERIC_DTYPES.value, accepted_axis_0_shape=[x.shape[0], ]) + _ = get_unique_values_in_iterable(data=y, name=xie_beni.__name__, min=2) + x, y = cp.array(x), cp.array(y) + cluster_ids = cp.unique(y) + centroids = cp.full(shape=(cluster_ids.shape[0], x.shape[1]), fill_value=-1.0, dtype=cp.float32) + intra_centroid_distances = cp.full(shape=(y.shape[0]), fill_value=-1.0, dtype=cp.float32) + obs_cnt = 0 + for cnt, cluster_id in enumerate(cluster_ids): + cluster_obs = x[cp.argwhere(y == cluster_id).flatten()] + centroids[cnt] = cp.mean(cluster_obs, axis=0) + intra_dist = cp.linalg.norm(cluster_obs - centroids[cnt], axis=1) + intra_centroid_distances[obs_cnt: cluster_obs.shape[0] + obs_cnt] = intra_dist + obs_cnt += cluster_obs.shape[0] + compactness = cp.mean(cp.square(intra_centroid_distances)) + cluster_dists = cdist(centroids, centroids).flatten() + d = cp.sqrt(cluster_dists[cp.argwhere(cluster_dists > 0).flatten()]) + separation = cp.min(d) + xb = compactness / separation + return xb + + +def i_index(x: np.ndarray, y: np.ndarray): + """ + Calculate the I-Index for evaluating clustering quality. + + The I-Index is a metric that measures the compactness and separation of clusters. + A higher I-Index indicates better clustering with compact and well-separated clusters. + + .. seealso:: + To compute the I-index on the CPU, use :func:`~simba.mixins.statistics_mixin.Statistics.i_index` + + :param np.ndarray x: The dataset as a 2D NumPy array of shape (n_samples, n_features). + :param np.ndarray y: Cluster labels for each data point as a 1D NumPy array of shape (n_samples,). + :returns: The I-index score for the dataset. + :rtype: float + + :references: + .. [1] Zhao, Q., Xu, M., Fränti, P. (2009). Sum-of-Squares Based Cluster Validity Index and Significance Analysis.
+ In: Kolehmainen, M., Toivanen, P., Beliczynski, B. (eds) Adaptive and Natural Computing Algorithms. ICANNGA 2009. + Lecture Notes in Computer Science, vol 5495. Springer, Berlin, Heidelberg. https://doi.org/10.1007/978-3-642-04921-7_32 + + :example: + >>> X, y = make_blobs(n_samples=5000, centers=20, n_features=3, random_state=0, cluster_std=0.1) + >>> i_index(x=X, y=y) + """ + check_valid_array(data=x, accepted_ndims=(2,), accepted_dtypes=Formats.NUMERIC_DTYPES.value) + check_valid_array(data=y, accepted_ndims=(1,), accepted_dtypes=Formats.NUMERIC_DTYPES.value, + accepted_axis_0_shape=[x.shape[0], ]) + _ = get_unique_values_in_iterable(data=y, name=i_index.__name__, min=2) + x, y = cp.array(x), cp.array(y) + unique_y = cp.unique(y) + n_y = unique_y.shape[0] + global_centroid = cp.mean(x, axis=0) + sst = cp.sum(cp.linalg.norm(x - global_centroid, axis=1) ** 2) + + swc = 0 + for cluster_cnt, cluster_id in enumerate(unique_y): + cluster_obs = x[cp.argwhere(y == cluster_id).flatten()] + cluster_centroid = cp.mean(cluster_obs, axis=0) + swc += cp.sum(cp.linalg.norm(cluster_obs - cluster_centroid, axis=1) ** 2) + + i_idx = sst / (n_y * swc) + + return i_idx \ No newline at end of file diff --git a/simba/data_processors/movement_calculator.py b/simba/data_processors/movement_calculator.py index 7f11c4545..908b11ff5 100644 --- a/simba/data_processors/movement_calculator.py +++ b/simba/data_processors/movement_calculator.py @@ -2,18 +2,19 @@ import os from statistics import mean -from typing import List, Optional +from typing import List, Optional, Tuple import numpy as np import pandas as pd +from simba.mixins.feature_extraction_supplement_mixin import FeatureExtractionSupplemental from simba.feature_extractors.perimeter_jit import jitted_centroid from simba.mixins.config_reader import ConfigReader from simba.mixins.feature_extraction_mixin import FeatureExtractionMixin -from simba.utils.checks import (check_if_filepath_list_is_empty, - check_that_column_exist) +from 
simba.utils.checks import (check_if_filepath_list_is_empty,check_that_column_exist, check_valid_array, check_float) from simba.utils.printing import stdout_success from simba.utils.read_write import get_fn_ext, read_df +from simba.utils.enums import Formats class MovementCalculator(ConfigReader, FeatureExtractionMixin): @@ -36,28 +37,17 @@ class MovementCalculator(ConfigReader, FeatureExtractionMixin): """ - def __init__( - self, - config_path: str, - body_parts: List[str], - threshold: float = 0.00, - file_paths: Optional[List[str]] = None, - ): + def __init__(self, + config_path: str, + body_parts: List[str], + threshold: float = 0.00, + file_paths: Optional[List[str]] = None): ConfigReader.__init__(self, config_path=config_path) FeatureExtractionMixin.__init__(self) - self.save_path = os.path.join( - self.logs_path, "Movement_log_{}.csv".format(self.datetime) - ) - self.file_paths, self.body_parts, self.threshold = ( - file_paths, - body_parts, - threshold, - ) + self.save_path = os.path.join(self.logs_path, "Movement_log_{}.csv".format(self.datetime)) + self.file_paths, self.body_parts, self.threshold = (file_paths, body_parts, threshold) if not self.file_paths: - check_if_filepath_list_is_empty( - filepaths=self.outlier_corrected_paths, - error_msg=f"SIMBA ERROR: Cannot process movement. ZERO data files found in the {self.outlier_corrected_dir} directory.", - ) + check_if_filepath_list_is_empty(filepaths=self.outlier_corrected_paths, error_msg=f"SIMBA ERROR: Cannot process movement. 
ZERO data files found in the {self.outlier_corrected_dir} directory.") self.file_paths = self.outlier_corrected_paths print(f"Processing {len(self.file_paths)} video(s)...") @@ -65,29 +55,14 @@ def __find_body_part_columns(self): self.body_parts_dict, self.bp_list = {}, [] for bp_cnt, bp_name in enumerate(self.body_parts): if not bp_name.endswith("CENTER OF GRAVITY"): - animal_name = self.find_animal_name_from_body_part_name( - bp_name=bp_name, bp_dict=self.animal_bp_dict - ) - self.body_parts_dict[bp_cnt] = { - "ANIMAL NAME": animal_name, - "BODY-PART": bp_name, - "BODY-PART HEADERS": [ - f"{bp_name}_x", - f"{bp_name}_y", - f"{bp_name}_p", - ], - } + animal_name = self.find_animal_name_from_body_part_name(bp_name=bp_name, bp_dict=self.animal_bp_dict) + self.body_parts_dict[bp_cnt] = {"ANIMAL NAME": animal_name, "BODY-PART": bp_name, "BODY-PART HEADERS": [f"{bp_name}_x", f"{bp_name}_y", f"{bp_name}_p"]} self.bp_list.extend((self.body_parts_dict[bp_cnt]["BODY-PART HEADERS"])) else: pass - def __find_polygons(self, data): - print(data.shape) - def run(self): - self.results = pd.DataFrame( - columns=["VIDEO", "ANIMAL", "BODY-PART", "MEASURE", "VALUE"] - ) + self.results = pd.DataFrame(columns=["VIDEO", "ANIMAL", "BODY-PART", "MEASURE", "VALUE"]) self.movement_dfs = {} for file_path in self.file_paths: self.__find_body_part_columns() @@ -102,116 +77,37 @@ def run(self): for animal_cnt, animal_data in self.body_parts_dict.items(): animal_df = self.data_df[animal_data["BODY-PART HEADERS"]] if self.threshold > 0.00: - animal_df = animal_df[ - animal_df[animal_data["BODY-PART HEADERS"][-1]] - >= self.threshold - ] + animal_df = animal_df[animal_df[animal_data["BODY-PART HEADERS"][-1]] >= self.threshold] animal_df = animal_df.iloc[:, 0:2].reset_index(drop=True) - animal_df = self.create_shifted_df(df=animal_df) - bp_time_1 = animal_df[ - [ - animal_data["BODY-PART HEADERS"][0], - animal_data["BODY-PART HEADERS"][1], - ] - ].values.astype(float) - bp_time_2 = animal_df[ - [ 
- animal_data["BODY-PART HEADERS"][0] + "_shifted", - animal_data["BODY-PART HEADERS"][1] + "_shifted", - ] - ].values.astype(float) - self.movement = pd.Series( - self.framewise_euclidean_distance( - location_1=bp_time_1, - location_2=bp_time_2, - px_per_mm=self.px_per_mm, - ) - ) - self.movement.loc[0] = 0 - self.movement_dfs[video_name][ - f'{animal_data["ANIMAL NAME"]} {animal_data["BODY-PART"]}' - ] = self.movement - distance = round((self.movement.sum() / 10), 4) - velocity_lst = [] - for df in np.array_split( - self.movement, int(len(self.movement) / self.fps) - ): - velocity_lst.append(df.sum()) - self.results.loc[len(self.results)] = [ - video_name, - animal_data["ANIMAL NAME"], - animal_data["BODY-PART"], - "Distance (cm)", - distance, - ] - self.results.loc[len(self.results)] = [ - video_name, - animal_data["ANIMAL NAME"], - animal_data["BODY-PART"], - "Velocity (cm/s)", - round((mean(velocity_lst) / 10), 4), - ] + distance, velocity = FeatureExtractionSupplemental.distance_and_velocity(x=animal_df.values, fps=self.fps, pixels_per_mm=self.px_per_mm, centimeters=True) + self.results.loc[len(self.results)] = [video_name, animal_data["ANIMAL NAME"], animal_data["BODY-PART"], "Distance (cm)", distance] + self.results.loc[len(self.results)] = [ video_name, animal_data["ANIMAL NAME"], animal_data["BODY-PART"], "Velocity (cm/s)", velocity] else: for animal in self.body_parts: animal_name = animal.split("CENTER OF GRAVITY")[0].strip() - x, y = ( - self.data_df[self.animal_bp_dict[animal_name]["X_bps"]], - self.data_df[self.animal_bp_dict[animal_name]["Y_bps"]], - ) - z = pd.concat([x, y], axis=1)[ - [item for items in zip(x.columns, y.columns) for item in items] - ] - df = pd.DataFrame( - jitted_centroid( - points=np.reshape(z.values, (len(z / 2), -1, 2)).astype( - np.float32 - ) - ), - columns=["X", "Y"], - ) + x, y = (self.data_df[self.animal_bp_dict[animal_name]["X_bps"]], self.data_df[self.animal_bp_dict[animal_name]["Y_bps"]]) + z = pd.concat([x, y], 
axis=1)[[item for items in zip(x.columns, y.columns) for item in items]] + df = pd.DataFrame(jitted_centroid(points=np.reshape(z.values, (len(z / 2), -1, 2)).astype(np.float32)), columns=["X", "Y"]) df = self.dataframe_savgol_smoother(df=df, fps=self.fps).astype(int) - df_shifted = df.shift(1) - df_shifted = df_shifted.combine_first(df).add_suffix("_shifted") - self.movement = pd.Series( - self.framewise_euclidean_distance( - location_1=df.values.astype(np.float32), - location_2=df_shifted.values.astype(np.float32), - px_per_mm=self.px_per_mm, - ) - ) - self.movement.loc[0] = 0 - self.movement_dfs[video_name][ - f'{animal_name} {"GRAVITY CENTER"}' - ] = self.movement - distance = round((self.movement.sum() / 10), 4) - velocity_lst = [] - for df in np.array_split( - self.movement, int(len(self.movement) / self.fps) - ): - velocity_lst.append(df.sum()) - self.results.loc[len(self.results)] = [ - video_name, - animal_name, - "GRAVITY CENTER", - "Distance (cm)", - distance, - ] - self.results.loc[len(self.results)] = [ - video_name, - animal_name, - "GRAVITY CENTER", - "Velocity (cm/s)", - round((mean(velocity_lst) / 10), 4), - ] + distance, velocity = FeatureExtractionSupplemental.distance_and_velocity(x=df.values, fps=self.fps, pixels_per_mm=self.px_per_mm, centimeters=True) + self.results.loc[len(self.results)] = [video_name, animal_name, "GRAVITY CENTER", "Distance (cm)", distance] + self.results.loc[len(self.results)] = [video_name, animal_name, "GRAVITY CENTER", "Velocity (cm/s)", velocity] def save(self): self.results.set_index("VIDEO").to_csv(self.save_path) self.timer.stop_timer() - stdout_success( - msg=f"Movement log saved in {self.save_path}", - elapsed_time=self.timer.elapsed_time_str, - ) + stdout_success(msg=f"Movement log saved in {self.save_path}", elapsed_time=self.timer.elapsed_time_str) + + +# test = MovementCalculator(config_path=r"C:\troubleshooting\ROI_movement_test\project_folder\project_config.ini", +# body_parts=['Animal_1 CENTER OF GRAVITY'], 
#['Simon CENTER OF GRAVITY', 'JJ CENTER OF GRAVITY', 'Animal_1 CENTER OF GRAVITY'] +# threshold=0.00) +# test.run() + + + + # test = MovementCalculator(config_path='/Users/simon/Desktop/envs/troubleshooting/two_black_animals_14bp/project_folder/project_config.ini', diff --git a/simba/data_processors/timebins_movement_calculator.py b/simba/data_processors/timebins_movement_calculator.py index 6feaa83e1..87c18fc50 100644 --- a/simba/data_processors/timebins_movement_calculator.py +++ b/simba/data_processors/timebins_movement_calculator.py @@ -41,42 +41,29 @@ class TimeBinsMovementCalculator(ConfigReader, FeatureExtractionMixin): >>> calculator.run() """ - def __init__( - self, - config_path: str, - bin_length: Union[int, float], - body_parts: List[str], - plots: Optional[bool] = False, - ): + def __init__(self, + config_path: str, + bin_length: Union[int, float], + body_parts: List[str], + plots: Optional[bool] = False): ConfigReader.__init__(self, config_path=config_path) - log_event( - logger_name=str(self.__class__.__name__), - log_type=TagNames.CLASS_INIT.value, - msg=self.create_log_msg_from_init_args(locals=locals()), - ) + log_event(logger_name=str(self.__class__.__name__), log_type=TagNames.CLASS_INIT.value, msg=self.create_log_msg_from_init_args(locals=locals()),) self.bin_length, self.plots = bin_length, plots check_float(name="TIME BIN", value=bin_length, min_value=10e-6) self.col_headers, self.bp_dict = [], {} for bp_cnt, bp in enumerate(body_parts): self.col_headers.extend((f"{bp}_x", f"{bp}_y")) - animal_name = self.find_animal_name_from_body_part_name( - bp_name=bp, bp_dict=self.animal_bp_dict - ) + animal_name = self.find_animal_name_from_body_part_name(bp_name=bp, bp_dict=self.animal_bp_dict) self.bp_dict[bp_cnt] = {animal_name: [f"{bp}_x", f"{bp}_y"]} - check_if_filepath_list_is_empty( - filepaths=self.outlier_corrected_paths, - error_msg=f"SIMBA ERROR: Cannot analyze movement in time-bins, data directory {self.outlier_corrected_dir} is empty.", - ) 
+ check_if_filepath_list_is_empty(filepaths=self.outlier_corrected_paths, error_msg=f"SIMBA ERROR: Cannot analyze movement in time-bins, data directory {self.outlier_corrected_dir} is empty.",) self.animal_combinations = list(itertools.combinations(self.animal_bp_dict, 2)) print(f"Processing {len(self.outlier_corrected_paths)} video(s)...") def __create_plots(self): timer = SimbaTimer(start=True) print("Creating time-bin movement plots...") - plots_dir = os.path.join( - self.project_path, "logs", f"time_bin_movement_plots_{self.datetime}" - ) + plots_dir = os.path.join( self.project_path, "logs", f"time_bin_movement_plots_{self.datetime}") if not os.path.exists(plots_dir): os.makedirs(plots_dir) for video_name in self.results["VIDEO"].unique(): @@ -112,61 +99,29 @@ def __create_plots(self): def run(self): video_dict, self.out_df_lst = {}, [] self.movement_dict = {} - self.save_path = os.path.join( - self.project_path, - "logs", - f"Time_bins_{self.bin_length}s_movement_results_{self.datetime}.csv", - ) - check_all_file_names_are_represented_in_video_log( - video_info_df=self.video_info_df, data_paths=self.outlier_corrected_paths - ) + self.save_path = os.path.join( self.project_path, "logs", f"Time_bins_{self.bin_length}s_movement_results_{self.datetime}.csv") + check_all_file_names_are_represented_in_video_log(video_info_df=self.video_info_df, data_paths=self.outlier_corrected_paths) for file_cnt, file_path in enumerate(self.outlier_corrected_paths): video_timer = SimbaTimer(start=True) _, video_name, _ = get_fn_ext(file_path) - print( - f"Processing time-bin movements for video {video_name} ({str(file_cnt+1)}/{str(len(self.outlier_corrected_paths))})..." 
- ) + print(f"Processing time-bin movements for video {video_name} ({str(file_cnt+1)}/{str(len(self.outlier_corrected_paths))})...") video_dict[video_name] = {} video_settings, px_per_mm, fps = self.read_video_info(video_name=video_name) fps, self.movement_cols, self.velocity_cols = int(fps), set(), set() bin_length_frames = int(fps * self.bin_length) if bin_length_frames == 0: - raise FrameRangeError( - msg=f"The specified time-bin length of {self.bin_length} is TOO SHORT for video {video_name} which has a specified FPS of {fps}. This results in time bins that are LESS THAN a single frame.", - source=self.__class__.__name__, - ) + raise FrameRangeError(msg=f"The specified time-bin length of {self.bin_length} is TOO SHORT for video {video_name} which has a specified FPS of {fps}. This results in time bins that are LESS THAN a single frame.", source=self.__class__.__name__,) self.data_df = read_df(file_path, self.file_type, usecols=self.col_headers) self.data_df = self.create_shifted_df(df=self.data_df) results = [] for animal_data in self.bp_dict.values(): name, bps = list(animal_data.keys())[0], list(animal_data.values())[0] - bp_time_1, bp_time_2 = ( - self.data_df[bps].values, - self.data_df[[f"{bps[0]}_shifted", f"{bps[1]}_shifted"]].values, - ) - movement_data = pd.DataFrame( - self.framewise_euclidean_distance( - location_1=bp_time_1, - location_2=bp_time_2, - px_per_mm=px_per_mm, - centimeter=True, - ), - columns=["VALUE"], - ) + bp_time_1, bp_time_2 = (self.data_df[bps].values, self.data_df[[f"{bps[0]}_shifted", f"{bps[1]}_shifted"]].values,) + movement_data = pd.DataFrame(self.framewise_euclidean_distance(location_1=bp_time_1, location_2=bp_time_2, px_per_mm=px_per_mm, centimeter=True), columns=["VALUE"]) self.movement_dict[video_name] = movement_data - movement_df_lists = [ - movement_data[i : i + bin_length_frames] - for i in range(0, movement_data.shape[0], bin_length_frames) - ] + movement_df_lists = [movement_data[i : i + bin_length_frames] for i in 
range(0, movement_data.shape[0], bin_length_frames)] for bin, movement_df in enumerate(movement_df_lists): - movement, velocity = ( - FeatureExtractionSupplemental.distance_and_velocity( - x=movement_df["VALUE"].values, - fps=fps, - pixels_per_mm=1, - centimeters=False, - ) - ) + movement, velocity = (FeatureExtractionSupplemental.distance_and_velocity(x=movement_df["VALUE"].values, fps=fps, pixels_per_mm=1, centimeters=False)) results.append( { "VIDEO": video_name, @@ -190,21 +145,13 @@ def run(self): results = pd.DataFrame(results).reset_index(drop=True) self.out_df_lst.append(results) video_timer.stop_timer() - print( - f"Video {video_name} complete (elapsed time: {video_timer.elapsed_time_str}s)..." - ) + print(f"Video {video_name} complete (elapsed time: {video_timer.elapsed_time_str}s)...") def save(self): - self.results = pd.concat(self.out_df_lst, axis=0).sort_values( - by=["VIDEO", "TIME BIN #", "MEASUREMENT", "ANIMAL"] - )[["VIDEO", "TIME BIN #", "ANIMAL", "BODY-PART", "MEASUREMENT", "VALUE"]] + self.results = pd.concat(self.out_df_lst, axis=0).sort_values(by=["VIDEO", "TIME BIN #", "MEASUREMENT", "ANIMAL"])[["VIDEO", "TIME BIN #", "ANIMAL", "BODY-PART", "MEASUREMENT", "VALUE"]] self.results.set_index("VIDEO").to_csv(self.save_path) self.timer.stop_timer() - stdout_success( - msg=f"Movement time-bins results saved at {self.save_path}", - elapsed_time=self.timer.elapsed_time_str, - source=self.__class__.__name__, - ) + stdout_success(msg=f"Movement time-bins results saved at {self.save_path}", elapsed_time=self.timer.elapsed_time_str, source=self.__class__.__name__) if self.plots: self.__create_plots() diff --git a/simba/mixins/feature_extraction_supplement_mixin.py b/simba/mixins/feature_extraction_supplement_mixin.py index fcbeb6ecb..a17f00a3d 100644 --- a/simba/mixins/feature_extraction_supplement_mixin.py +++ b/simba/mixins/feature_extraction_supplement_mixin.py @@ -747,7 +747,7 @@ def distance_and_velocity(x: np.ndarray, """ Calculate total movement 
and mean velocity from a sequence of position data. - :param x: Array containing movement data. For example, created by ``simba.mixins.FeatureExtractionMixin.framewise_euclidean_distance``. + :param x: Array containing movement data. For example, created by ``simba.mixins.FeatureExtractionMixin.framewise_euclidean_distance``. If its a 2-dimensional array, then we assume its pixel coordinates. If it's a 1d array, we assume its frame-wise euclidean distances. :param fps: Frames per second of the data. :param pixels_per_mm: Conversion factor from pixels to millimeters. :param Optional[bool] centimeters: If True, results are returned in centimeters and centimeters per second. Defaults to True. @@ -759,40 +759,25 @@ def distance_and_velocity(x: np.ndarray, >>> sum_movement, avg_velocity = FeatureExtractionSupplemental.distance_and_velocity(x=x, fps=10, pixels_per_mm=10, centimeters=True) """ - check_valid_array( - data=x, - source=FeatureExtractionSupplemental.distance_and_velocity.__name__, - accepted_ndims=(1, 2), - accepted_dtypes=(np.float32, np.float64, np.int32, np.int64, int, float), - ) - check_float( - name=f"{FeatureExtractionSupplemental.distance_and_velocity.__name__} fps", - value=fps, - min_value=1, - ) - check_float( - name=f"{FeatureExtractionSupplemental.distance_and_velocity.__name__} pixels_per_mm", - value=pixels_per_mm, - min_value=10e-6, - ) + check_valid_array(data=x, source=FeatureExtractionSupplemental.distance_and_velocity.__name__, accepted_ndims=(1, 2), accepted_dtypes=Formats.NUMERIC_DTYPES.value) + check_float(name=f"{FeatureExtractionSupplemental.distance_and_velocity.__name__} fps", value=fps, min_value=1) + check_float(name=f"{FeatureExtractionSupplemental.distance_and_velocity.__name__} pixels_per_mm", value=pixels_per_mm, min_value=10e-6) if x.ndim == 2: - check_valid_array( - data=x, - source=FeatureExtractionSupplemental.distance_and_velocity.__name__, - accepted_axis_1_shape=(2,), - ) - t = np.full((x.shape[0]), 0.0) + 
check_valid_array(data=x, source=FeatureExtractionSupplemental.distance_and_velocity.__name__, accepted_axis_1_shape=[2, ]) + framewise_px_movement = np.full((x.shape[0]), 0.0, dtype=np.float64) for i in range(1, x.shape[0]): - t[i] = np.linalg.norm(x[i] - x[i - 1]) - x = np.copy(t) / pixels_per_mm - movement = np.sum(x) / pixels_per_mm + framewise_px_movement[i] = np.linalg.norm(x[i] - x[i - 1]) + else: + framewise_px_movement = x + movement = np.sum(framewise_px_movement) / pixels_per_mm v = [] - for i in range(0, x.shape[0], int(fps)): - w = x[i : (i + int(fps))] + for i in range(1, framewise_px_movement.shape[0], int(fps)): + w = framewise_px_movement[i: (i + int(fps))] v.append((np.sum(w) / pixels_per_mm) * (1 / (w.shape[0] / int(fps)))) if centimeters: v = [vi / 10 for vi in v] movement = movement / 10 + return movement, np.mean(v) diff --git a/simba/mixins/statistics_mixin.py b/simba/mixins/statistics_mixin.py index c4bfb2505..639d8f5a8 100644 --- a/simba/mixins/statistics_mixin.py +++ b/simba/mixins/statistics_mixin.py @@ -3978,6 +3978,14 @@ def xie_beni(x: np.ndarray, y: np.ndarray) -> float: """ Computes the Xie-Beni index for clustering evaluation. + The score is calculated as the ratio between the average intra-cluster variance and the squared minimum distance between cluster centroids. This ensures that the index penalizes both loosely packed clusters and clusters that are too close to each other. + + A lower Xie-Beni index indicates better clustering quality, signifying well-separated and compact clusters. + + .. seealso:: + To compute Xie-Beni on the GPU, use :func:`~simba.data_processors.cuda.statistics.xie_beni` + + :param np.ndarray x: The dataset as a 2D NumPy array of shape (n_samples, n_features). :param np.ndarray y: Cluster labels for each data point as a 1D NumPy array of shape (n_samples,). :returns: The Xie-Beni score for the dataset.
@@ -4057,6 +4065,9 @@ def i_index(x: np.ndarray, y: np.ndarray): The I-Index is a metric that measures the compactness and separation of clusters. A higher I-Index indicates better clustering with compact and well-separated clusters. + .. seealso:: + To compute I-index on GPU, use :func:`~simba.data_processors.cuda.statistics.i_index` + :param np.ndarray x: The dataset as a 2D NumPy array of shape (n_samples, n_features). :param np.ndarray y: Cluster labels for each data point as a 1D NumPy array of shape (n_samples,). :returns: The I-index score for the dataset. @@ -4073,6 +4084,7 @@ def i_index(x: np.ndarray, y: np.ndarray): """ check_valid_array(data=x, accepted_ndims=(2,), accepted_dtypes=Formats.NUMERIC_DTYPES.value) check_valid_array(data=y, accepted_ndims=(1,), accepted_dtypes=Formats.NUMERIC_DTYPES.value, accepted_axis_0_shape=[x.shape[0], ]) + _ = get_unique_values_in_iterable(data=y, name=Statistics.i_index.__name__, min=2) unique_y = np.unique(y) n_y = unique_y.shape[0] global_centroid = np.mean(x, axis=0) @@ -4091,6 +4103,12 @@ def sd_index(x: np.ndarray, y: np.ndarray) -> float: """ Compute the SD (Scatter and Discriminant) Index for evaluating the quality of a clustering solution. + The SD Index combines two components to measure clustering quality: + 1. **Scatter (SCAT)**: Evaluates the compactness of clusters by measuring the ratio of intra-cluster variance to the global standard deviation. + 2. **Discriminant (DIS)**: Measures the separation between clusters relative to their distance from the global mean. + + A lower SD Index indicates better clustering quality, reflecting compact and well-separated clusters. + :param np.ndarray x: A 2D array of shape (n_samples, n_features) representing the feature vectors of the data points. :param np.ndarray y: A 1D array of shape (n_samples,) containing the cluster labels for each data point. :returns: The SD Index value. 
Lower values indicate better clustering quality with more compact and well-separated clusters. @@ -4106,6 +4124,7 @@ def sd_index(x: np.ndarray, y: np.ndarray) -> float: """ check_valid_array(data=x, accepted_ndims=(2,), accepted_dtypes=Formats.NUMERIC_DTYPES.value) check_valid_array(data=y, accepted_ndims=(1,), accepted_dtypes=(int,), accepted_axis_0_shape=[x.shape[0], ]) + _ = get_unique_values_in_iterable(data=y, name=Statistics.sd_index.__name__, min=2) global_std = np.std(x) global_m = np.mean(x, axis=0) unique_clusters = np.unique(y) @@ -4143,6 +4162,10 @@ def c_index(x: np.ndarray, y: np.ndarray) -> float: - 0 indicates perfect clustering (clusters are as compact as possible). - 1 indicates worst clustering (clusters are highly spread out). + :references: + .. [1] Ubert, L. J., & Levin, J. R. (1976). A general statistical framework for assessing categorical clustering in free recall. Psychological Bulletin, 83(5), 1072–1080. + + :example: >>> X, y = make_blobs(n_samples=800, centers=2, n_features=3, random_state=0, cluster_std=0.1) >>> Statistics.c_index(x=X, y=y) @@ -4310,7 +4333,7 @@ def cop_index(x: np.ndarray, y: np.ndarray, epsilon: float = 1e-16) -> float: :example: >>> X, y = make_blobs(n_samples=50000, centers=10, n_features=3, random_state=0, cluster_std=1) - >>> cop_index(x=X, y=y) + >>> Statistics.cop_index(x=X, y=y) """ unique_clusters = np.unique(y) @@ -4350,6 +4373,7 @@ def pbm_index(x: np.ndarray, y: np.ndarray) -> float: :references: .. [1] Pakhira, M. K., Bandyopadhyay, S., & Maulik, U. (2004). Validity index for crisp and fuzzy clusters. Pattern Recognition, 37(4), 487–501. https://doi.org/10.1016/j.patcog.2003.09.021 + .. 
[2] Bernard Desgraupes, University Paris Ouest Lab Modal’X, https://cran.r-project.org/web/packages/clusterCrit/vignettes/clusterCrit.pdf :example: >>> X, y = make_blobs(n_samples=5, centers=2, n_features=3, random_state=0, cluster_std=5) @@ -4380,6 +4404,240 @@ def pbm_index(x: np.ndarray, y: np.ndarray) -> float: return (((1 / N_clusters) * E1) ** 2) / (EK * Dmin) + @staticmethod + def banfeld_raftery_index(x: np.ndarray, y: np.ndarray) -> float: + """ + Computes the Banfeld-Raftery index for clustering evaluation. + + Smaller values represent better clustering. Values can be negative. + + :param x: 2D NumPy array of shape (n_samples, n_features) representing the dataset. + :param y: 1D NumPy array of shape (n_samples,) containing cluster labels for each data point. + :return: The Banfeld-Raftery index. + :rtype: float + + :references: + .. [1] Banfield, J. D., & Raftery, A. E. (1993). Model-based Gaussian and non-Gaussian clustering. Biometrics, 49(3), 803-821. https://doi.org/10.2307/2532201 + + """ + check_valid_array(data=x, accepted_ndims=(2,), accepted_dtypes=Formats.NUMERIC_DTYPES.value) + check_valid_array(data=y, accepted_ndims=(1,), accepted_dtypes=Formats.NUMERIC_DTYPES.value, accepted_axis_0_shape=[x.shape[0], ]) + _ = get_unique_values_in_iterable(data=y, name=Statistics.banfeld_raftery_index.__name__, min=2) + unique_labels = np.unique(y) + val = 0.0 + for cluster_label in unique_labels: + cluster_data = x[y == cluster_label] + n_k = cluster_data.shape[0] + covariance_matrix = np.cov(cluster_data, rowvar=False) + determinant = np.linalg.det(covariance_matrix) + determinant = max(determinant, 1e-10) + val += n_k * np.log(determinant) + + return val + + @staticmethod + def scott_symons_index(x: np.ndarray, y: np.ndarray) -> float: + """ + Compute the Scott-Symons index for clustering evaluation. + + Smaller values represent better clustering. Values can be negative. 
+ + :param np.ndarray x: The dataset as a 2D NumPy array of shape (n_samples, n_features). + :param np.ndarray y: Cluster labels for each data point as a 1D NumPy array of shape (n_samples,). + :returns: The Scott-Symons index score. + :rtype: float + + + :references: + .. [1] . J. Scott and M. J. Symons. Clustering methods based on likelihood ratio criteria. Biometrics, 27:387–397, 1971. + """ + + check_valid_array(data=x, accepted_ndims=(2,), accepted_dtypes=Formats.NUMERIC_DTYPES.value) + check_valid_array(data=y, accepted_ndims=(1,), accepted_dtypes=Formats.NUMERIC_DTYPES.value, accepted_axis_0_shape=[x.shape[0], ]) + _ = get_unique_values_in_iterable(data=y, name=Statistics.scott_symons_index.__name__, min=2) + unique_labels = np.unique(y) + val = 0.0 + + for label in unique_labels: + cluster_points = x[y == label] + n_k = cluster_points.shape[0] + cov_matrix = np.cov(cluster_points, rowvar=False) + det_cov = np.linalg.det(cov_matrix) + val += n_k * np.log(det_cov / n_k) + return val + + @staticmethod + def wemmert_gancarski_index(x: np.ndarray, y: np.ndarray) -> float: + """ + Compute the Wemmert-Gançarski index for clustering evaluation. + + The best case is when the index approaches 1, indicating good clustering. The worst case is when the index approaches 0, indicating poor clustering. + + :param np.ndarray x: The dataset as a 2D NumPy array of shape (n_samples, n_features). + :param np.ndarray y: Cluster labels for each data point as a 1D NumPy array of shape (n_samples,). + :returns: The Wemmert-Gançarski index score. + :rtype: float + + :references: + .. 
[1] Bernard Desgraupes, University Paris Ouest Lab Modal’X, https://cran.r-project.org/web/packages/clusterCrit/vignettes/clusterCrit.pdf + """ + + check_valid_array(data=x, accepted_ndims=(2,), accepted_dtypes=Formats.NUMERIC_DTYPES.value) + check_valid_array(data=y, accepted_ndims=(1,), accepted_dtypes=Formats.NUMERIC_DTYPES.value, accepted_axis_0_shape=[x.shape[0], ]) + _ = get_unique_values_in_iterable(data=y, name=Statistics.wemmert_gancarski_index.__name__, min=2) + unique_labels = np.unique(y) + total_score = 0.0 + + for label in unique_labels: + cluster_points = x[y == label] + n_k = cluster_points.shape[0] + G_k = np.mean(cluster_points, axis=0) + + R_values = [] + for point in cluster_points: + dist_to_G_k = np.linalg.norm(point - G_k) + distances_to_other_centroids = [np.linalg.norm(point - np.mean(x[y == other_label], axis=0)) for other_label in unique_labels if other_label != label] + min_dist_to_other_centroids = min(distances_to_other_centroids) + R_values.append(dist_to_G_k / min_dist_to_other_centroids) + + J_k = max(0, 1 - (1 / n_k) * np.sum(R_values)) + total_score += n_k * J_k + + return total_score / x.shape[0] + + @staticmethod + def mclain_rao_index(x: np.ndarray, y: np.ndarray) -> float: + """ + Computes the McClain-Rao Index, which measures the quality of clustering by evaluating the ratio of + the mean within-cluster distances to the mean between-cluster distances. + + The McClain-Rao Index is computed by calculating the mean ratio of intra-cluster distances (distances + between points within the same cluster) to inter-cluster distances (distances between points from + different clusters). A lower value indicates a better clustering result, with clusters being compact and well-separated. + + :param np.ndarray x: The dataset as a 2D NumPy array of shape (n_samples, n_features). + :param np.ndarray y: Cluster labels for each data point as a 1D NumPy array of shape (n_samples,). 
+ :returns: The McClain-Rao Index score, a lower value indicates better clustering quality. + :rtype: float + + :references: + .. [1] McClain, J. O., & Rao, V. R. (1975). CLUSTISZ: A program to test for the quality of clustering of a set of objects. *Journal of Marketing Research, 12*(4), 456-460. https://doi.org/10.1177/002224377501200410 + """ + + check_valid_array(data=x, accepted_ndims=(2,), accepted_dtypes=Formats.NUMERIC_DTYPES.value) + check_valid_array(data=y, accepted_ndims=(1,), accepted_dtypes=Formats.NUMERIC_DTYPES.value, accepted_axis_0_shape=[x.shape[0], ]) + _ = get_unique_values_in_iterable(data=y, name=Statistics.mclain_rao_index.__name__, min=2) + unique_labels = np.unique(y) + ratios = np.full(shape=(len(unique_labels)), fill_value=np.nan, dtype=np.float64) + for cluster_cnt, cluster_id in enumerate(unique_labels): + cluster_obs = x[np.argwhere(y == cluster_id).flatten()] + noncluster_obs = x[np.argwhere(y != cluster_id).flatten()] + intra_dists = cdist(cluster_obs, cluster_obs) + np.fill_diagonal(intra_dists, np.nan) + intra_dist_mean = np.nanmean(intra_dists) + inter_dist_mean = np.mean(cdist(cluster_obs, noncluster_obs)) + ratios[cluster_cnt] = intra_dist_mean / inter_dist_mean + + return np.mean(ratios) + + @staticmethod + def s_dbw_index(x: np.ndarray, y: np.ndarray) -> float: + """ + Compute the S_Dbw index for evaluating the clustering quality. + + A lower value indicates a better clustering result. + + :param np.ndarray x: The dataset as a 2D NumPy array of shape (n_samples, n_features). + :param np.ndarray y: Cluster labels for each data point as a 1D NumPy array of shape (n_samples,). + :returns: The S_Dbw index score. + :rtype: float + + .. note:: + Behaves weird as the number of dimensions increase (> 20). + + :example: + >>> from sklearn.datasets import make_blobs + >>> X, labels = make_blobs(n_samples=5000, centers=5, random_state=42, n_features=3, cluster_std=2) + >>> score = Statistics.s_dbw_index(X, labels) + + :references: + .. 
[1] M. Halkidi and M. Vazirgiannis. Clustering validity assessment: Finding the optimal partitioning of a data set. Proceedings IEEE International Conference on Data Mining, pages 187–194, 2001. + .. [2] M. Halkidi and M. Vazirgiannis. Clustering validity assessment: Finding the optimal partitioning of a data set. Proceedings IEEE International Conference on Data Mining, pages 187–194, 2001. + """ + + check_valid_array(data=x, accepted_ndims=(2,), accepted_dtypes=Formats.NUMERIC_DTYPES.value) + check_valid_array(data=y, accepted_ndims=(1,), accepted_dtypes=Formats.NUMERIC_DTYPES.value, + accepted_axis_0_shape=[x.shape[0], ]) + _ = get_unique_values_in_iterable(data=y, name=Statistics.s_dbw_index.__name__, min=2) + unique_labels = np.unique(y) + K = len(unique_labels) + centroids = np.array([x[y == label].mean(axis=0) for label in unique_labels]) + variances = np.array([np.var(x[y == label], axis=0) for label in unique_labels]) + sigma = np.sqrt(np.sum(np.linalg.norm(variances, axis=1)) / K) + s_dbw = 0.0 + for k in range(K): + for k_prime in range(k + 1, K): + cluster_k = x[y == unique_labels[k]] + cluster_k_prime = x[y == unique_labels[k_prime]] + G_k = centroids[k] + G_k_prime = centroids[k_prime] + H_kk_prime = (G_k + G_k_prime) / 2 + density_at_Gk = np.sum(np.linalg.norm(cluster_k - G_k, axis=1) < sigma) + np.sum(np.linalg.norm(cluster_k_prime - G_k, axis=1) < sigma) + density_at_Gk_prime = np.sum(np.linalg.norm(cluster_k - G_k_prime, axis=1) < sigma) + np.sum(np.linalg.norm(cluster_k_prime - G_k_prime, axis=1) < sigma) + density_at_Hkk_prime = np.sum(np.linalg.norm(cluster_k - H_kk_prime, axis=1) < sigma) + np.sum(np.linalg.norm(cluster_k_prime - H_kk_prime, axis=1) < sigma) + if max(density_at_Gk, density_at_Gk_prime) == 0: + pass + else: + Rkk_prime = density_at_Hkk_prime / max(density_at_Gk, density_at_Gk_prime) + s_dbw += Rkk_prime + + s_dbw /= (K * (K - 1)) / 2 + return s_dbw + + @staticmethod + def ray_turi_index(x: np.ndarray, y: np.ndarray) -> float: + 
""" + Compute the Ray-Turi index for evaluating the clustering quality. + + A lower value indicates a better clustering result. + + :param np.ndarray x: The dataset as a 2D NumPy array of shape (n_samples, n_features). + :param np.ndarray y: Cluster labels for each data point as a 1D NumPy array of shape (n_samples,). + :returns: The Ray-Turi index score. + :rtype: float + + :example: + >>> from sklearn.datasets import make_blobs + >>> X, labels = make_blobs(n_samples=5000, centers=5, random_state=42, n_features=3, cluster_std=2) + >>> score = Statistics.s_dbw_index(X, labels) + + :references: + .. [1] Ray, S., & Turi, R. H. (1999). Determination of number of clusters in k-means clustering and application in colour image segmentation. Proceedings of the 4th International Conference on Advances in Pattern Recognition and Digital Techniques, 137–143. + """ + + check_valid_array(data=x, accepted_ndims=(2,), accepted_dtypes=Formats.NUMERIC_DTYPES.value) + check_valid_array(data=y, accepted_ndims=(1,), accepted_dtypes=Formats.NUMERIC_DTYPES.value, accepted_axis_0_shape=[x.shape[0], ]) + n_clusters = get_unique_values_in_iterable(data=y, name=Statistics.ray_turi_index.__name__, min=2) + unique_labels = np.unique(y) + + centroids = np.array([x[y == label].mean(axis=0) for label in unique_labels]) + intra_dists = np.full(shape=(x.shape[0]), fill_value=np.nan, dtype=np.float32) + min_cluster_distance = np.inf + obs_cnt = 0 + for cnt, cluster_id in enumerate(unique_labels): + cluster_obs = x[np.argwhere(y == cluster_id).flatten()] + centroids[cnt] = np.mean(cluster_obs, axis=0) + dists = np.linalg.norm(cluster_obs - centroids[cnt], axis=1) ** 2 + intra_dists[obs_cnt: obs_cnt + dists.shape[0]] = dists + obs_cnt += dists.shape[0] + + for i in range(n_clusters): + for j in range(i + 1, n_clusters): + distance = np.sum((centroids[i] - centroids[j]) ** 2) + min_cluster_distance = min(min_cluster_distance, distance) + + return np.mean(intra_dists) / min_cluster_distance + 
@staticmethod def fowlkes_mallows(x: np.ndarray, y: np.ndarray) -> float: """ diff --git a/simba/mixins/timeseries_features_mixin.py b/simba/mixins/timeseries_features_mixin.py index 21013a262..3f0dd3ca1 100644 --- a/simba/mixins/timeseries_features_mixin.py +++ b/simba/mixins/timeseries_features_mixin.py @@ -801,9 +801,7 @@ def line_length(data: np.ndarray) -> float: @staticmethod @njit("(float32[:], float64[:], int64)", fastmath=True) - def sliding_line_length( - data: np.ndarray, window_sizes: np.ndarray, sample_rate: int - ) -> np.ndarray: + def sliding_line_length(data: np.ndarray, window_sizes: np.ndarray, sample_rate: int) -> np.ndarray: """ Jitted compute of sliding line length for a given time series using different window sizes. @@ -1096,7 +1094,7 @@ def sliding_longest_strike( results[r1 - 1, i] = result - return results + return results @staticmethod @njit( diff --git a/simba/mixins/train_model_mixin.py b/simba/mixins/train_model_mixin.py index f0f9f79d0..fae18b0ca 100644 --- a/simba/mixins/train_model_mixin.py +++ b/simba/mixins/train_model_mixin.py @@ -1721,13 +1721,21 @@ def check_raw_dataset_integrity(self, df: pd.DataFrame, logs_path: Optional[Unio def _create_shap_mp_helper(data: Tuple[int, pd.DataFrame], explainer: shap.TreeExplainer, clf_name: str, - verbose: bool) -> Tuple[int, pd.DataFrame]: + verbose: bool) -> Tuple[pd.DataFrame, int]: if verbose: - print(f'Processing SHAP batch {data[0] + 1}... ({len(data[1])} observations)') + print(f'Processing SHAP core batch {data[0] + 1}... 
({len(data[1])} observations)') _ = data[1].pop(clf_name).values.reshape(-1, 1) - shap_results = explainer.shap_values(data[1].values, check_additivity=False)[1] - return shap_results, data[0] + shap_batch_results = np.full(shape=(len(data[1]), len(data[1].columns)), fill_value=np.nan, dtype=np.float32) + for idx in range(len(data[1])): + timer = SimbaTimer(start=True) + obs = data[1].iloc[idx, :].values + shap_batch_results[idx] = explainer.shap_values(obs, check_additivity=False)[1] + timer.stop_timer() + if verbose: + print(f'SHAP frame complete (core batch: {data[0] + 1}, core batch frame: {idx+1}/{len(data[1])}, frame processing time: {timer.elapsed_time_str}s)') + + return shap_batch_results, data[0] def create_shap_log_mp(self, rf_clf: RandomForestClassifier, @@ -1798,7 +1806,7 @@ def create_shap_log_mp(self, check_int(name=f'{TrainModelMixin.create_shap_log_mp.__name__} core_cnt', value=core_cnt, min_value=-1, unaccepted_vals=[0]) check_int(name=f'{TrainModelMixin.create_shap_log_mp.__name__} chunk_size', value=chunk_size, min_value=1) check_valid_boolean(value=[verbose, plot], source=f'{TrainModelMixin.create_shap_log_mp.__name__} verbose, plot') - core_cnt = [find_core_cnt()[0] if core_cnt is -1 or core_cnt > find_core_cnt()[0] else core_cnt][0] + core_cnt = [find_core_cnt()[0] if core_cnt == -1 or core_cnt > find_core_cnt()[0] else core_cnt][0] df = pd.DataFrame(np.hstack((x, y.reshape(-1, 1))), columns=x_names + [clf_name]) del x; del y present_df, absent_df = df[df[clf_name] == 1], df[df[clf_name] == 0] @@ -1822,14 +1830,14 @@ def create_shap_log_mp(self, print(f"Computing {cnt_present + cnt_absent} SHAP values. Follow progress in OS terminal... 
(CORES: {core_cnt}, CHUNK SIZE: {chunk_size})") with multiprocessing.Pool(core_cnt, maxtasksperchild=Defaults.MAXIMUM_MAX_TASK_PER_CHILD.value) as pool: constants = functools.partial(TrainModelMixin._create_shap_mp_helper, explainer=explainer, clf_name=clf_name, verbose=verbose) - for cnt, result in enumerate(pool.imap_unordered(constants, shap_data, chunksize=1)): + for cnt, result in enumerate(pool.imap(constants, shap_data, chunksize=1)): proba = TrainModelMixin().clf_predict_proba(clf=rf_clf, x_df=shap_data[result[1]][1].drop(clf_name, axis=1), model_name=clf_name).reshape(-1, 1) shap_sum = np.sum(result[0], axis=1).reshape(-1, 1) batch_shap_results = np.hstack((result[0], np.full((result[0].shape[0]), expected_value).reshape(-1, 1), shap_sum + expected_value, proba, shap_data[result[1]][1][clf_name].values.reshape(-1, 1))).astype(np.float32) shap_results.append(batch_shap_results) shap_raw.append(shap_data[result[1]][1].drop(clf_name, axis=1)) if verbose: - print(f"Completed SHAP batch (Batch {result[1] + 1}/{len(shap_data)}).") + print(f"Completed SHAP care batch (Batch {result[1] + 1}/{len(shap_data)}).") pool.terminate(); pool.join() shap_df = pd.DataFrame(data=np.row_stack(shap_results), columns=list(x_names) + ["Expected_value", "Sum", "Prediction_probability", clf_name]) diff --git a/simba/model/grid_search_rf.py b/simba/model/grid_search_rf.py index 7c927c1c0..54f705159 100644 --- a/simba/model/grid_search_rf.py +++ b/simba/model/grid_search_rf.py @@ -42,10 +42,10 @@ def __init__(self, config_path: Union[str, os.PathLike]): check_if_filepath_list_is_empty(filepaths=self.target_file_paths, error_msg=f"Zero data files found in {self.targets_folder}, cannot create models.") if not os.path.exists(self.configs_meta_dir): os.makedirs(self.configs_meta_dir) self.meta_file_lst = sorted(read_simba_meta_files(self.configs_meta_dir)) - print(f"Reading in {len(self.target_file_paths)} annotated files...") + print(f"Reading in {len(self.target_file_paths)} annotated 
files found in the {self.targets_folder} directory...") self.data_df, self.frm_idx = self.read_all_files_in_folder_mp_futures(self.target_file_paths, self.file_type) self.frm_idx = pd.DataFrame({"VIDEO": list(self.data_df.index), "FRAME_IDX": self.frm_idx}) - self.data_df = self.check_raw_dataset_integrity(self.data_df, logs_path=self.logs_path) + _ = self.check_raw_dataset_integrity(self.data_df, logs_path=self.logs_path) self.data_df = self.drop_bp_cords(df=self.data_df) def perform_sampling(self, meta_dict: dict): diff --git a/simba/roi_tools/ROI_analyzer.py b/simba/roi_tools/ROI_analyzer.py index 95ad52364..1f18425f1 100644 --- a/simba/roi_tools/ROI_analyzer.py +++ b/simba/roi_tools/ROI_analyzer.py @@ -1,28 +1,26 @@ __author__ = "Simon Nilsson" import os -from typing import List, Optional, Union +from typing import List, Optional, Union, Tuple + import numpy as np import pandas as pd from simba.mixins.config_reader import ConfigReader from simba.mixins.feature_extraction_mixin import FeatureExtractionMixin -from simba.mixins.feature_extraction_supplement_mixin import \ - FeatureExtractionSupplemental +from simba.mixins.feature_extraction_supplement_mixin import FeatureExtractionSupplemental from simba.utils.checks import ( check_all_file_names_are_represented_in_video_log, check_file_exist_and_readable, check_float, check_that_column_exist, check_valid_lst) from simba.utils.data import detect_bouts, slice_roi_dict_for_video from simba.utils.enums import Keys -from simba.utils.errors import (CountError, MissingColumnsError, - ROICoordinatesNotFoundError) +from simba.utils.errors import (CountError, MissingColumnsError, ROICoordinatesNotFoundError) from simba.utils.printing import stdout_success from simba.utils.read_write import get_fn_ext, read_data_paths, read_df from simba.utils.warnings import NoDataFoundWarning - class ROIAnalyzer(ConfigReader, FeatureExtractionMixin): """ Analyze movements, entries, exits, and time-spent-in user-defined ROIs. 
Results are stored in the @@ -47,9 +45,9 @@ class ROIAnalyzer(ConfigReader, FeatureExtractionMixin): def __init__(self, config_path: Union[str, os.PathLike], data_path: Optional[Union[str, os.PathLike, List[str]]] = None, - detailed_bout_data: Optional[bool] = False, - calculate_distances: Optional[bool] = False, - threshold: Optional[float] = 0.0, + detailed_bout_data: bool = False, + calculate_distances: bool = False, + threshold: float = 0.0, body_parts: Optional[List[str]] = None): check_file_exist_and_readable(file_path=config_path) @@ -58,11 +56,7 @@ def __init__(self, raise ROICoordinatesNotFoundError(expected_file_path=self.roi_coordinates_path) self.read_roi_data() FeatureExtractionMixin.__init__(self) - self.data_paths = read_data_paths(path=data_path, - default=self.outlier_corrected_paths, - default_name=self.outlier_corrected_dir, - file_type=self.file_type) - + self.data_paths = read_data_paths(path=data_path, default=self.outlier_corrected_paths, default_name=self.outlier_corrected_dir, file_type=self.file_type) check_float(name="Body-part probability threshold", value=threshold, min_value=0.0, max_value=1.0) check_valid_lst(data=body_parts, source=f"{self.__class__.__name__} body-parts", valid_dtypes=(str,)) if len(set(body_parts)) != len(body_parts): @@ -110,7 +104,7 @@ def run(self): roi_bouts["VIDEO"] = video_name self.roi_bout_results.append(roi_bouts) animal_bout_results[row["Name"]] = roi_bouts - self.entry_results.loc[len(self.entry_results)] = [video_name,animal_name,row["Name"],len(roi_bouts)] + self.entry_results.loc[len(self.entry_results)] = [video_name,animal_name,row["Name"], len(roi_bouts)] self.time_results.loc[len(self.time_results)] = [video_name,animal_name,row["Name"],roi_bouts["Bout_time"].sum()] for _, row in self.sliced_roi_dict[Keys.ROI_CIRCLES.value].iterrows(): @@ -159,58 +153,23 @@ def run(self): if self.calculate_distances: for roi_name, roi_data in animal_bout_results.items(): if len(roi_data) == 0: - 
self.movements_df.loc[len(self.movements_df)] = [ - video_name, - animal_name, - roi_name, - "Movement (cm)", - 0, - ] - self.movements_df.loc[len(self.movements_df)] = [ - video_name, - animal_name, - roi_name, - "Average velocity (cm/s)", - "None", - ] + self.movements_df.loc[len(self.movements_df)] = [video_name, animal_name, roi_name, "Movement (cm)", 0,] + self.movements_df.loc[len(self.movements_df)] = [video_name, animal_name, roi_name, "Average velocity (cm/s)", "None",] else: distances, velocities = [], [] - roi_frames = roi_data[ - ["Start_frame", "End_frame"] - ].values + roi_frames = roi_data[["Start_frame", "End_frame"]].values for event in roi_frames: - event_pose = animal_df.loc[ - np.arange(event[0], event[1] + 1), bp_names - ] - event_pose = event_pose[ - event_pose[bp_names[2]] > self.threshold - ][bp_names[:2]].values + event_pose = animal_df.loc[np.arange(event[0], event[1] + 1), bp_names] + event_pose = event_pose[event_pose[bp_names[2]] > self.threshold][bp_names[:2]].values if event_pose.shape[0] > 1: - distance, velocity = ( - FeatureExtractionSupplemental.distance_and_velocity( - x=event_pose, - fps=self.fps, - pixels_per_mm=pix_per_mm, - centimeters=True, - ) - ) + distance, velocity = (FeatureExtractionSupplemental.distance_and_velocity(x=event_pose, fps=self.fps, pixels_per_mm=pix_per_mm, centimeters=True)) distances.append(distance) + print(distances, velocity) velocities.append(velocity) - self.movements_df.loc[len(self.movements_df)] = [ - video_name, - animal_name, - roi_name, - "Movement (cm)", - sum(distances), - ] - self.movements_df.loc[len(self.movements_df)] = [ - video_name, - animal_name, - roi_name, - "Average velocity (cm/s)", - np.average(velocities), - ] - if len(self.roi_bout_results) > 1: + self.movements_df.loc[len(self.movements_df)] = [video_name, animal_name, roi_name, "Movement (cm)", sum(distances)] + self.movements_df.loc[len(self.movements_df)] = [video_name, animal_name, roi_name, "Average velocity (cm/s)", 
np.average(velocities)] + + if len(self.roi_bout_results) > 0: self.detailed_df = pd.concat(self.roi_bout_results, axis=0) self.detailed_df = self.detailed_df.rename(columns={"Event": "SHAPE NAME", "Start_time": "START TIME", "End Time": "END TIME", "Start_frame": "START FRAME", "End_frame": "END FRAME", "Bout_time": "DURATION (S)"}) self.detailed_df["BODY-PART"] = self.detailed_df["ANIMAL"].map(self.bp_lk) @@ -236,6 +195,17 @@ def save(self): stdout_success(msg=f"ROI time and ROI entry saved in the {self.logs_path} directory in CSV format.") +# test = ROIAnalyzer(config_path = r"C:\troubleshooting\ROI_movement_test\project_folder\project_config.ini", +# data_path=None, +# calculate_distances=True, +# detailed_bout_data=True, +# body_parts=['Head'], +# threshold=0.0) +# test.run() + + + + # test = ROIAnalyzer(config_path = r"/Users/simon/Desktop/envs/simba/troubleshooting/two_black_animals_14bp/project_folder/project_config.ini", # data_path=None, # calculate_distances=True, diff --git a/simba/roi_tools/ROI_time_bin_calculator.py b/simba/roi_tools/ROI_time_bin_calculator.py index 9bdb15db8..d597a9cef 100644 --- a/simba/roi_tools/ROI_time_bin_calculator.py +++ b/simba/roi_tools/ROI_time_bin_calculator.py @@ -67,7 +67,7 @@ def __init__(self, raise BodypartColumnNotFoundError(msg=f'The body-part {bp} is not a valid body-part in the SimBA project. Options: {self.body_parts_lst}', source=self.__class__.__name__) if len(set(body_parts)) != len(body_parts): raise DuplicationError(msg=f'All body-part entries have to be unique. 
Got {body_parts}', source=self.__class__.__name__) - self.roi_analyzer = ROIAnalyzer(config_path=self.config_path, data_path=self.outlier_corrected_dir, calculate_distances=False, threshold=threshold, body_parts=body_parts) + self.roi_analyzer = ROIAnalyzer(config_path=self.config_path, data_path=self.outlier_corrected_dir, calculate_distances=False, threshold=threshold, body_parts=body_parts, detailed_bout_data=True) self.roi_analyzer.run() self.animal_names = list(self.roi_analyzer.bp_dict.keys()) self.bp_dict = self.roi_analyzer.bp_dict @@ -107,22 +107,10 @@ def run(self): if self.movement: if len(frms_inside_roi_in_timebin) > 0: bin_move = (self.movement_timebins.movement_dict[self.video_name].iloc[frms_inside_roi_in_timebin].values.flatten().astype(np.float32)) - _, velocity = (FeatureExtractionSupplemental.distance_and_velocity(x=bin_move,fps=fps, pixels_per_mm=1, centimeters=True)) - self.results_movement_velocity.loc[len(self.results_movement_velocity)] = [self.video_name, - shape_name, - animal_name, - body_part, - bin_cnt, - bin_move[1:].sum() / 10, - velocity] + movement, velocity = (FeatureExtractionSupplemental.distance_and_velocity(x=bin_move,fps=fps, pixels_per_mm=1, centimeters=False)) + self.results_movement_velocity.loc[len(self.results_movement_velocity)] = [self.video_name, shape_name, animal_name, body_part, bin_cnt, bin_move[1:].sum() / 10, velocity] else: - self.results_movement_velocity.loc[len(self.results_movement_velocity)] = [self.video_name, - shape_name, - animal_name, - body_part, - bin_cnt, - 0, - 0] + self.results_movement_velocity.loc[len(self.results_movement_velocity)] = [self.video_name, shape_name, animal_name, body_part, bin_cnt, 0, 0] video_timer.stop_timer() print(f"Video {self.video_name} complete (elapsed time {video_timer.elapsed_time_str}s)") @@ -133,15 +121,19 @@ def save(self): stdout_success(msg=f"ROI time bin entry data saved at {self.save_path_entries}", elapsed_time=self.timer.elapsed_time_str) 
stdout_success(msg=f"ROI time bin time data saved at {self.save_path_time}", elapsed_time=self.timer.elapsed_time_str) if self.movement: - self.results_movement_velocity.sort_values( - by=["VIDEO", "SHAPE", "ANIMAL", "TIME BIN #"] - ).set_index("VIDEO").to_csv(self.save_path_movement_velocity) - stdout_success( - msg=f"ROI time-bin movement data saved at {self.save_path_movement_velocity}", - elapsed_time=self.timer.elapsed_time_str, - ) + self.results_movement_velocity.sort_values(by=["VIDEO", "SHAPE", "ANIMAL", "TIME BIN #"]).set_index("VIDEO").to_csv(self.save_path_movement_velocity) + stdout_success(msg=f"ROI time-bin movement data saved at {self.save_path_movement_velocity}", elapsed_time=self.timer.elapsed_time_str) + +# test = ROITimebinCalculator(config_path=r"C:\troubleshooting\ROI_movement_test\project_folder\project_config.ini", +# bin_length=0.5, +# body_parts=['Head'], +# threshold=0.00, +# movement=True) +# test.run() +# test.save() + # test = ROITimebinCalculator(config_path=r"/Users/simon/Desktop/envs/simba/troubleshooting/two_black_animals_14bp/project_folder/project_config.ini", # bin_length=1, # body_parts=['Nose_1'], diff --git a/simba/ui/pop_ups/check_videos_seekable_pop_up.py b/simba/ui/pop_ups/check_videos_seekable_pop_up.py index 75d31cbaa..50dc5e1f6 100644 --- a/simba/ui/pop_ups/check_videos_seekable_pop_up.py +++ b/simba/ui/pop_ups/check_videos_seekable_pop_up.py @@ -78,4 +78,4 @@ def run(self, directory: bool): save_path=save_path) -CheckVideoSeekablePopUp() \ No newline at end of file +#CheckVideoSeekablePopUp() \ No newline at end of file diff --git a/simba/utils/read_write.py b/simba/utils/read_write.py index d631eb0b7..c0c23457b 100644 --- a/simba/utils/read_write.py +++ b/simba/utils/read_write.py @@ -72,16 +72,14 @@ READ_OPTIONS = csv.ReadOptions(encoding="utf8") -def read_df( - file_path: Union[str, os.PathLike], - file_type: Union[str, os.PathLike], - has_index: Optional[bool] = True, - remove_columns: Optional[List[str]] = None, 
- usecols: Optional[List[str]] = None, - anipose_data: Optional[bool] = False, - check_multiindex: Optional[bool] = False, - multi_index_headers_to_keep: Optional[int] = None, -) -> pd.DataFrame: +def read_df(file_path: Union[str, os.PathLike], + file_type: Union[str, os.PathLike], + has_index: Optional[bool] = True, + remove_columns: Optional[List[str]] = None, + usecols: Optional[List[str]] = None, + anipose_data: Optional[bool] = False, + check_multiindex: Optional[bool] = False, + multi_index_headers_to_keep: Optional[int] = None) -> Union[pd.DataFrame, dict]: """ Read single tabular data file or pickle diff --git a/simba/utils/warnings.py b/simba/utils/warnings.py index 00703a514..3bc42bf8d 100644 --- a/simba/utils/warnings.py +++ b/simba/utils/warnings.py @@ -259,3 +259,8 @@ def CorruptedFileWarning(msg: str, source: str = ""): @log_warning def ResolutionWarning(msg: str, source: str = ""): pass + + +@log_warning +def GPUToolsWarning(msg: str, source: str = ""): + pass