diff --git a/setup.py b/setup.py
index cc7e4ac70..67c75abd5 100644
--- a/setup.py
+++ b/setup.py
@@ -29,7 +29,7 @@
 # Setup configuration
 setuptools.setup(
     name="simba-uw-tf-dev",
-    version="2.4.4",
+    version="2.4.5",
     author="Simon Nilsson, Jia Jie Choong, Sophia Hwang",
     author_email="sronilsson@gmail.com",
     description="Toolkit for computer classification and analysis of behaviors in experimental animals",
diff --git a/simba/data_processors/cuda/utils.py b/simba/data_processors/cuda/utils.py
index de0934091..9215a0022 100644
--- a/simba/data_processors/cuda/utils.py
+++ b/simba/data_processors/cuda/utils.py
@@ -26,12 +26,28 @@ def _cuda_cos(x, t):
         t[i] = v
     return t
 
+@cuda.jit(device=True)
+def _cuda_min(x: np.ndarray):
+    return min(x)
+
+@cuda.jit(device=True)
+def _cuda_max(x: np.ndarray):
+    return max(x)
+
+@cuda.jit(device=True)
+def _cuda_standard_deviation(x):
+    m = _cuda_mean(x)
+    std_sum = 0
+    for i in range(x.shape[0]):
+        std_sum += (x[i] - m) ** 2
+    return math.sqrt(std_sum / x.shape[0])
+
 @cuda.jit(device=True)
 def _cuda_std(x: np.ndarray, x_hat: float):
     std = 0
     for i in range(x.shape[0]):
-        std += (x[0] - x_hat) ** 2
-    return std
+        std += (x[i] - x_hat) ** 2
+    return math.sqrt(std / x.shape[0])
 
 @cuda.jit(device=True)
 def _rad2deg(x):
@@ -116,6 +132,33 @@ def _cuda_add_2d(x: np.ndarray, vals: np.ndarray) -> np.ndarray:
             x[i][j] = x[i][j] + vals[j]
     return x
 
+
+@cuda.jit(device=True)
+def _cuda_variance(x: np.ndarray):
+    mean = _cuda_mean(x)
+    num = 0
+    for i in range(x.shape[0]):
+        num += (x[i] - mean) ** 2
+    return num / (x.shape[0] - 1)
+
+
+@cuda.jit(device=True)
+def _cuda_mac(x: np.ndarray):
+    """ Mean absolute change in 1d array (max size 512)."""
+    diff = cuda.local.array(shape=512, dtype=np.float64)
+    for i in range(512):
+        diff[i] = np.inf
+    for j in range(1, x.shape[0]):
+        diff[j] = abs(x[j] - x[j-1])
+    s, cnt = 0, 0
+    for p in range(diff.shape[0]):
+        if (diff[p] != np.inf):
+            s += diff[p]
+            cnt += 1
+    return s / cnt
+
 def _cuda_available() -> Tuple[bool, Dict[int, Any]]:
     """
     Check if GPU available. If True, returns the GPUs, the model, physical slots and compute capabilities.
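+
+    :example: (illustrative sketch; the returned device dictionary depends on the available hardware)
+    >>> is_available, devices = _cuda_available()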
@@ -137,18 +180,56 @@ def _cuda_available() -> Tuple[bool, Dict[int, Any]]:
     return is_available, devices
 
-# @guvectorize([(float64[:], float64[:])], '(n) -> (n)', target='cuda')
-# def _cuda_bubble_sort(arr, out):
-#     """
-#     :example:
-#     >>> a = np.random.randint(5, 50, (5, 200)).astype('float64')
-#     >>> d_a = cuda.to_device(a)
-#     >>> _cuda_bubble_sort(d_a)
-#     >>> d = d_a.copy_to_host()
-#     """
-#
-#     for i in range(len(arr)):
-#         for j in range(len(arr) - 1 - i):
-#             if arr[j] > arr[j + 1]:
-#                 arr[j], arr[j + 1] = arr[j + 1], arr[j]
-#     out = arr
\ No newline at end of file
+
+@cuda.jit(device=True)
+def _cuda_bubble_sort(x):
+    n = x.shape[0]
+    for i in range(n - 1):
+        for j in range(n - i - 1):
+            if x[j] > x[j + 1]:
+                x[j], x[j + 1] = x[j + 1], x[j]
+    return x
+
+
+@cuda.jit(device=True)
+def _cuda_median(x):
+    sorted_arr = _cuda_bubble_sort(x)
+    if not x.shape[0] % 2 == 0:
+        return sorted_arr[int(math.floor(x.shape[0] / 2))]
+    else:
+        loc_1, loc_2 = int((x.shape[0] / 2) - 1), int(x.shape[0] / 2)
+        return (sorted_arr[loc_1] + sorted_arr[loc_2]) / 2
+
+
+@cuda.jit(device=True)
+def _cuda_mad(x):
+    diff = cuda.local.array(shape=512, dtype=np.float32)
+    for i in range(512):
+        diff[i] = np.inf
+    m = _cuda_median(x)
+    for j in range(x.shape[0]):
+        diff[j] = abs(x[j] - m)
+    return _cuda_median(diff[0:x.shape[0]])
+
+@cuda.jit(device=True)
+def _cuda_rms(x: np.ndarray):
+    squared = cuda.local.array(shape=512, dtype=np.float64)
+    for i in range(512): squared[i] = np.inf
+    for j in range(x.shape[0]):
+        squared[j] = x[j] ** 2
+    m = _cuda_mean(squared[0: x.shape[0]])
+    return math.sqrt(m)
+
+
+@cuda.jit(device=True)
+def _cuda_range(x: np.ndarray):
+    return _cuda_max(x) - _cuda_min(x)
+
+@cuda.jit(device=True)
+def _cuda_abs_energy(x):
+    squared = cuda.local.array(shape=512, dtype=np.float64)
+    for i in range(512): squared[i] = np.inf
+    for j in range(x.shape[0]):
+        squared[j] = x[j] ** 2
+    return _cuda_sum(squared[0: x.shape[0]])
\ No newline at end of file
diff --git a/simba/mixins/geometry_mixin.py b/simba/mixins/geometry_mixin.py
index 266fbda04..f035c3c13 100644
--- a/simba/mixins/geometry_mixin.py
+++ b/simba/mixins/geometry_mixin.py
@@ -845,7 +845,7 @@ def view_shapes(shapes: List[Union[LineString, Polygon, MultiPolygon, MultiLineString]],
                 bg_img: Optional[np.ndarray] = None,
                 bg_clr: Optional[Tuple[int, int, int]] = None,
                 size: Optional[int] = None,
-                color_palette: Union[str, List[Tuple[int, int, int]]] = 'Set1',
+                color_palette: Union[str, List[Tuple[int, ...]]] = 'Set1',
                 fill_shapes: Optional[bool] = False,
                 thickness: Optional[int] = 2,
                 pixel_buffer: Optional[int] = 200,
@@ -864,7 +864,7 @@ def view_shapes(shapes: List[Union[LineString, Polygon, MultiPolygon, MultiLineString]],
     :param Optional[np.ndarray] bg_img: Optional. An image array (in np.ndarray format) to use as the background. If not provided, a blank canvas will be created.
     :param Optional[Tuple[int, int, int]] bg_clr: A tuple representing the RGB color of the background (e.g., (255, 255, 255) for white). This is ignored if bg_img is provided. If None, the background is white.
     :param Optional[int] size: Optional. An integer to specify the size of the canvas (width and height). Only applicable if bg_img is not provided.
-    :param Optional[str] color_palette: Optional. A string specifying the color palette to be used for the shapes. Default is 'Set1', which uses distinct colors.
+    :param Optional[str] color_palette: Optional. A string specifying the color palette to be used for the shapes. Default is 'Set1', which uses distinct colors. Alternatively, a list of RGB value tuples of the same length as `shapes`.
     :param Optional[int] thickness: Optional. An integer specifying the thickness of the lines when rendering LineString or Polygon borders. Default is 2.
     :param Optional[int] pixel_buffer: Optional. An integer specifying the number of pixels to add around the bounding box of the shapes for padding. Default is 200.
     :return: An image (np.ndarray) with the rendered shapes.
@@ -3233,13 +3233,13 @@ def cumsum_coord_geometries(self,
         return np.cumsum(img_arr, axis=0) / fps
 
     @staticmethod
-    def _cumsum_bool_helper(
-        data: np.ndarray, geometries: Dict[Tuple[int, int], Polygon]
-    ):
+    def _cumsum_bool_helper(data: np.ndarray,
+                            geometries: Dict[Tuple[int, int], Polygon],
+                            verbose: bool = True):
+
         data_point = Point(data[1:3])
-        print(
-            f"Processing animal grid square location for boolean in frame {int(data[0])}..."
-        )
+        if verbose:
+            print(f"Processing animal grid square location for boolean in frame {int(data[0])}...")
         for k, r in geometries.items():
             if r.contains(data_point):
                 return (int(data[0]), k[0], k[1])
@@ -3250,6 +3250,7 @@ def cumsum_bool_geometries(self,
                                geometries: Dict[Tuple[int, int], Polygon],
                                bool_data: np.ndarray,
                                fps: Optional[float] = None,
+                               verbose: bool = True,
                                core_cnt: Optional[int] = -1) -> np.ndarray:
     """
     Compute the cumulative sums of boolean events within polygon geometries over time using multiprocessing. For example, compute the cumulative time of classified events within spatial locations at all time-points of the video.
 
     :param Dict[Tuple[int, int], Polygon] geometries: Dictionary of polygons representing spatial regions. E.g., created by :func:`simba.mixins.geometry_mixin.GeometryMixin.bucket_img_into_grid_square` or :func:`simba.mixins.geometry_mixin.GeometryMixin.bucket_img_into_grid_hexagon`.
     :param np.ndarray bool_data: Boolean array with shape (data.shape[0],) or (data.shape[0], 1) indicating the presence or absence in each frame.
     :param Optional[float] fps: Frames per second. If provided, the result is normalized by the frame rate.
+    :param bool verbose: If True, prints progress. Default: True.
     :param Optional[float] core_cnt: Number of CPU cores to use for parallel processing. Default is -1, which means using all available cores.
     :returns: Matrix of size (frames x horizontal bins x vertical bins) with times in seconds (if fps passed) or frames (if fps not passed)
     :rtype: np.ndarray
@@ -3275,39 +3277,14 @@ def cumsum_bool_geometries(self,
     >>> (500, 4, 4)
     """
-        check_valid_array(
-            data=data,
-            accepted_sizes=[2],
-            source=f"{GeometryMixin.cumsum_bool_geometries.__name__} data",
-        )
-        check_instance(
-            source=f"{GeometryMixin.cumsum_bool_geometries.__name__} geometries",
-            instance=geometries,
-            accepted_types=dict,
-        )
-        check_valid_array(
-            data=bool_data,
-            accepted_shapes=[(data.shape[0], 1), (data.shape[0],)],
-            source=f"{GeometryMixin.cumsum_bool_geometries.__name__} bool_data",
-        )
+        check_valid_array(data=data, accepted_sizes=[2], source=f"{GeometryMixin.cumsum_bool_geometries.__name__} data")
+        check_instance(source=f"{GeometryMixin.cumsum_bool_geometries.__name__} geometries", instance=geometries, accepted_types=dict)
+        check_valid_array(data=bool_data, accepted_shapes=[(data.shape[0], 1), (data.shape[0],)], source=f"{GeometryMixin.cumsum_bool_geometries.__name__} bool_data")
         if fps is not None:
-            check_float(
-                name=f"{GeometryMixin.cumsum_bool_geometries.__name__} fps",
-                value=fps,
-                min_value=1.0,
-            )
-        check_int(
-            name=f"{GeometryMixin.cumsum_bool_geometries.__name__} core_cnt",
-            value=core_cnt,
-            min_value=-1,
-        )
-        if not np.array_equal(
-            np.sort(np.unique(bool_data)).astype(int), np.array([0, 1])
-        ):
-            raise InvalidInputError(
-                msg=f"Invalid boolean data. Expected {np.array([0, 1])} but found {np.sort(np.unique(bool_data)).astype(int)}",
-                source=GeometryMixin.cumsum_bool_geometries.__name__,
-            )
+            check_float(name=f"{GeometryMixin.cumsum_bool_geometries.__name__} fps", value=fps, min_value=1.0)
+        check_int(name=f"{GeometryMixin.cumsum_bool_geometries.__name__} core_cnt", value=core_cnt, min_value=-1)
+        if not np.array_equal(np.sort(np.unique(bool_data)).astype(int), np.array([0, 1])):
+            raise InvalidInputError(msg=f"Invalid boolean data. Expected {np.array([0, 1])} but found {np.sort(np.unique(bool_data)).astype(int)}", source=GeometryMixin.cumsum_bool_geometries.__name__)
         if core_cnt == -1:
             core_cnt = find_core_cnt()[0]
         w, h = 0, 0
@@ -3320,12 +3297,10 @@ def cumsum_bool_geometries(self,
         data = np.hstack((frm_id, data))
         img_arr = np.zeros((data.shape[0], h + 1, w + 1))
         data = data[np.argwhere((data[:, 3] == 1))].reshape(-1, 4)
-        with multiprocessing.Pool(
-            core_cnt, maxtasksperchild=Defaults.LARGE_MAX_TASK_PER_CHILD.value
-        ) as pool:
-            constants = functools.partial(
-                self._cumsum_bool_helper, geometries=geometries
-            )
+        with multiprocessing.Pool(core_cnt, maxtasksperchild=Defaults.LARGE_MAX_TASK_PER_CHILD.value) as pool:
+            constants = functools.partial(self._cumsum_bool_helper,
+                                          geometries=geometries,
+                                          verbose=verbose)
             for cnt, result in enumerate(pool.imap(constants, data, chunksize=1)):
                 if result[1] != -1:
                     img_arr[result[0], result[2] - 1, result[1] - 1] = 1
diff --git a/simba/mixins/plotting_mixin.py b/simba/mixins/plotting_mixin.py
index fc7c38912..fa29f8d45 100644
--- a/simba/mixins/plotting_mixin.py
+++ b/simba/mixins/plotting_mixin.py
@@ -497,7 +497,7 @@ def make_location_heatmap_plot(frm_data: np.array,
     canvas.draw()
     mat = np.array(canvas.renderer._renderer)
     image = cv2.cvtColor(mat, cv2.COLOR_RGB2BGR)
-    image = cv2.resize(mat, img_size)
+    image = cv2.resize(image, img_size)
     image = np.uint8(image)
     plt.close("all")
     if file_name is not None:
diff --git a/simba/mixins/statistics_mixin.py b/simba/mixins/statistics_mixin.py
index a4c5ab937..619391253 100644
--- a/simba/mixins/statistics_mixin.py
+++ b/simba/mixins/statistics_mixin.py
@@ -1958,15 +1958,17 @@ def sliding_kendall_tau(sample_1: np.ndarray, sample_2: np.ndarray, time_windows
     return results
 
     @staticmethod
-    def find_collinear_features(
-        df: pd.DataFrame,
-        threshold: float,
-        method: Optional[Literal["pearson", "spearman", "kendall"]] = "pearson",
-        verbose: Optional[bool] = False,
-    ) -> List[str]:
+    def find_collinear_features(df: pd.DataFrame,
+                                threshold: float,
+                                method: Optional[Literal["pearson", "spearman", "kendall"]] = "pearson",
+                                verbose: Optional[bool] = False) -> List[str]:
+
         """
         Identify collinear features in the dataframe based on the specified correlation method and threshold.
 
+        .. seealso::
+           For multicore numba accelerated method, see :func:`simba.mixins.train_model_mixin.TrainModelMixin.find_highly_correlated_fields`.
+
         :param pd.DataFrame df: Input DataFrame containing features.
         :param float threshold: Threshold value to determine collinearity.
         :param Optional[Literal['pearson', 'spearman', 'kendall']] method: Method for calculating correlation. Defaults to 'pearson'.
@@ -4343,3 +4345,35 @@ def symmetry_index(x: np.ndarray, y: np.ndarray, agg_type: Literal['mean', 'median']
     else:
         return np.float32(np.nanmedian(si_values))
 
+    @staticmethod
+    @njit("(float32[:], float64, float64)")
+    def sliding_iqr(x: np.ndarray, window_size: float, sample_rate: float) -> np.ndarray:
+        """
+        Compute the sliding interquartile range (IQR) for a 1D array of feature values.
+
+        :param np.ndarray x: 1D array representing the feature values for which the IQR will be calculated.
+        :param float window_size: Size of the sliding window, in seconds. This value determines how many samples are included in each window.
+        :param float sample_rate: The sampling rate in samples per second, e.g., fps.
+        :returns: Sliding IQR values
+        :rtype: np.ndarray
+
+        :references:
+            .. [1] Hession, Leinani E., Gautam S. Sabnis, Gary A. Churchill, and Vivek Kumar. “A Machine-Vision-Based Frailty Index for Mice.” Nature Aging 2, no. 8 (August 16, 2022): 756–66. https://doi.org/10.1038/s43587-022-00266-0.
+
+        :example:
+        >>> data = np.random.randint(0, 50, (90,)).astype(np.float32)
+        >>> window_size = 0.5
+        >>> Statistics.sliding_iqr(x=data, window_size=window_size, sample_rate=10.0)
+        """
+
+        frm_win = max(1, int(window_size * sample_rate))
+        results = np.full(shape=(x.shape[0],), dtype=np.float32, fill_value=-1.0)
+        for r in range(frm_win, x.shape[0] + 1):
+            sorted_sample = np.sort(x[r - frm_win:r])
+            lower_idx = sorted_sample.shape[0] // 4
+            upper_idx = (3 * sorted_sample.shape[0]) // 4
+            lower_val = sorted_sample[lower_idx]
+            upper_val = sorted_sample[upper_idx]
+            results[r - 1] = upper_val - lower_val
+        return results
+
diff --git a/simba/mixins/timeseries_features_mixin.py b/simba/mixins/timeseries_features_mixin.py
index a760f2881..21013a262 100644
--- a/simba/mixins/timeseries_features_mixin.py
+++ b/simba/mixins/timeseries_features_mixin.py
@@ -428,7 +428,16 @@ def sliding_percent_beyond_n_std(data: np.ndarray, n: float, window_sizes: np.ndarray,
             (int64[:], float64[:], int64),
         ]
     )
-    def sliding_unique(x: np.ndarray, time_windows: np.ndarray, fps: int):
+    def sliding_unique(x: np.ndarray, time_windows: np.ndarray, fps: int) -> np.ndarray:
+        """
+        Compute the number of unique values in a sliding window over an array of feature values.
+
+        :param np.ndarray x: 1D array of feature values for which the unique values are to be counted.
+        :param np.ndarray time_windows: Array of window sizes (in seconds) for which the unique values are counted.
+        :param int fps: The frame rate in frames per second, which is used to calculate the window size in samples.
+        :return: A 2D array of size len(x) x len(time_windows), where each column corresponds to a time window and each element holds the count of unique values in the preceding sliding window of the array `x`.
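+
+        :example: (illustrative; input values are arbitrary)
+        >>> x = np.random.randint(0, 10, (100,)).astype(np.int64)
+        >>> TimeseriesFeatureMixin.sliding_unique(x=x, time_windows=np.array([1.0]), fps=10)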
+ :rtype: np.ndarray + """ results = np.full((x.shape[0], time_windows.shape[0]), -1) for i in prange(time_windows.shape[0]): window_size = int(time_windows[i] * fps) @@ -922,10 +931,10 @@ def sliding_descriptive_statistics(data: np.ndarray, window_sizes: np.ndarray, s results[j, r - 1, i] = np.median(sample) elif statistics[j] == "mean": results[j, r - 1, i] = np.mean(sample) - elif statistics[j] == "mad": - results[j, r - 1, i] = np.median(np.abs(sample - np.median(sample))) elif statistics[j] == "sum": results[j, r - 1, i] = np.sum(sample) + elif statistics[j] == "mad": + results[j, r - 1, i] = np.median(np.abs(sample - np.median(sample))) elif statistics[j] == "mac": results[j, r - 1, i] = np.mean(np.abs(sample[1:] - sample[:-1])) elif statistics[j] == "rms": diff --git a/simba/mixins/train_model_mixin.py b/simba/mixins/train_model_mixin.py index c1905b9ac..24ac9e28b 100644 --- a/simba/mixins/train_model_mixin.py +++ b/simba/mixins/train_model_mixin.py @@ -62,7 +62,7 @@ check_if_dir_exists, check_if_valid_input, check_instance, check_int, check_str, check_that_column_exist, check_valid_dataframe, - check_valid_lst) + check_valid_lst, check_valid_array) from simba.utils.data import (detect_bouts, detect_bouts_multiclass, get_library_version) from simba.utils.enums import (OS, ConfigKey, Defaults, Dtypes, Formats, @@ -1166,8 +1166,8 @@ def find_bouts(s: pd.Series, type: str): @staticmethod @njit("(float32[:, :], float64, types.ListType(types.unicode_type))") - def find_highly_correlated_fields(data: np.ndarray,threshold: float,field_names: types.ListType(types.unicode_type), - ) -> List[str]: + def find_highly_correlated_fields(data: np.ndarray,threshold: float, field_names: types.ListType(types.unicode_type)) -> List[str]: + """ Find highly correlated fields in a dataset. @@ -1175,6 +1175,9 @@ def find_highly_correlated_fields(data: np.ndarray,threshold: float,field_names: pairs of columns that have a correlation coefficient greater than the specified threshold. For every pair of correlated features identified, the function returns the field name of one feature. These field names can later be dropped from the input data to reduce memory requirements and collinearity. + .. seealso:: + For non-numba method, see :func:`simba.mixins.statistics_mixin.Statistics.find_collinear_features`. + :param np.ndarray data: Two dimension numpy array with features represented as columns and frames represented as rows. :param float threshold: Threshold value for significant collinearity. :param List[str] field_names: List mapping the column names in data to a field name. Use types.ListType(types.unicode_type) to take advantage of JIT compilation @@ -1186,7 +1189,6 @@ def find_highly_correlated_fields(data: np.ndarray,threshold: float,field_names: >>> field_names = [] >>> for i in range(data.shape[1]): field_names.append(f'Feature_{i+1}') >>> highly_correlated_fields = TrainModelMixin().find_highly_correlated_fields(data=data, field_names=typed.List(field_names), threshold=0.10) - """ column_corr = np.abs(np.corrcoef(data.T)) @@ -2373,28 +2375,34 @@ def scaler_inverse_transform( ).set_index(data.index) @staticmethod - def define_scaler( - scaler_name: Literal["MIN-MAX", "STANDARD", "QUANTILE"] - ) -> Union[MinMaxScaler, StandardScaler, QuantileTransformer]: + def define_scaler(scaler_name: Literal["min-max", "standard", "quantile"]) -> Union[MinMaxScaler, StandardScaler, QuantileTransformer]: """ Defines a sklearn scaler object. See ``UMLOptions.SCALER_OPTIONS.value`` for accepted scalers. 
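+
+        :param Literal['min-max', 'standard', 'quantile'] scaler_name: Name of the scaler to define. Case-insensitive, so upper-case variants are also accepted.
+        :return: An unfitted sklearn scaler object.
+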
:example: - >>> TrainModelMixin.define_scaler(scaler_name='MIN-MAX') + >>> TrainModelMixin.define_scaler(scaler_name='min-max') """ - if scaler_name not in Options.SCALER_OPTIONS.value: - raise InvalidInputError( - msg=f"Scaler {scaler_name} not supported. Options: {Options.SCALER_OPTIONS.value}", - source=TrainModelMixin.define_scaler.__name__, - ) - if scaler_name == Options.MIN_MAX_SCALER.value: + if scaler_name.upper() not in Options.SCALER_OPTIONS.value: + raise InvalidInputError(msg=f"Scaler {scaler_name} not supported. Options: {Options.SCALER_OPTIONS.value}", source=TrainModelMixin.define_scaler.__name__) + if scaler_name.upper() == Options.MIN_MAX_SCALER.value: return MinMaxScaler() - elif scaler_name == Options.STANDARD_SCALER.value: + elif scaler_name.upper() == Options.STANDARD_SCALER.value: return StandardScaler() - elif scaler_name == Options.QUANTILE_SCALER.value: + elif scaler_name.upper() == Options.QUANTILE_SCALER.value: return QuantileTransformer() + @staticmethod + def fit_scaler(scaler: Union[MinMaxScaler, QuantileTransformer, StandardScaler], + data: Union[pd.DataFrame, np.ndarray]) -> Union[ + MinMaxScaler, QuantileTransformer, StandardScaler, object]: + + check_instance(source=f'{TrainModelMixin.fit_scaler} data', instance=data, accepted_types=(pd.DataFrame, np.ndarray)) + check_instance(source=f'{TrainModelMixin.fit_scaler} scaler', instance=scaler, accepted_types=(MinMaxScaler, QuantileTransformer, StandardScaler)) + if isinstance(data, pd.DataFrame): data = data.values + check_valid_array(data=data, accepted_ndims=(2,), accepted_dtypes=Formats.NUMERIC_DTYPES.value) + return scaler.fit(data) + @staticmethod def scaler_transform( data: pd.DataFrame, @@ -2439,20 +2447,14 @@ def find_low_variance_fields(data: pd.DataFrame, variance_threshold: float) -> L :param float variance: Variance threshold (0.0-1.0). :return List[str]: """ - feature_selector = VarianceThreshold( - threshold=round((variance_threshold / 100), 2) - ) + + check_valid_dataframe(df=data, source=TrainModelMixin.find_low_variance_fields.__name__, valid_dtypes=(Formats.NUMERIC_DTYPES.value)) + check_float(name=TrainModelMixin.find_low_variance_fields.__name__, value=variance_threshold, min_value=0.0, max_value=1.0) + feature_selector = VarianceThreshold(threshold=variance_threshold) feature_selector.fit(data) - low_variance_fields = [ - c - for c in data.columns - if c not in data.columns[feature_selector.get_support()] - ] + low_variance_fields = [c for c in data.columns if c not in data.columns[feature_selector.get_support()]] if len(low_variance_fields) == len(data.columns): - raise NoDataError( - msg=f"All feature columns show a variance below the {variance_threshold} threshold. Thus, no data remain for analysis.", - source=TrainModelMixin.find_low_variance_fields.__name__, - ) + raise NoDataError(msg=f"All feature columns show a variance below the {variance_threshold} threshold. 
Thus, no data remain for analysis.", source=TrainModelMixin.find_low_variance_fields.__name__)
         return low_variance_fields
diff --git a/simba/mixins/unsupervised_mixin.py b/simba/mixins/unsupervised_mixin.py
index 760e7165c..2bb0c3233 100644
--- a/simba/mixins/unsupervised_mixin.py
+++ b/simba/mixins/unsupervised_mixin.py
@@ -1,36 +1,89 @@
 __author__ = "Simon Nilsson"
 
+from typing import Union
+import pandas as pd
+import numpy as np
+import datetime
+from simba.data_processors.cuda.utils import _cuda_available
+from simba.utils.checks import check_float, check_int, check_valid_boolean, check_instance, check_valid_array
+from simba.utils.errors import SimBAGPUError
+from simba.utils.data import get_library_version
+from simba.utils.lookups import get_model_names
+from simba.utils.enums import Formats
+try:
+    import cuml.umap as cuml_umap
+    import cuml.cluster.hdbscan as cuml_hdbscan
+except ModuleNotFoundError:
+    import umap as cuml_umap
+    import hdbscan as cuml_hdbscan
+import umap
+import hdbscan
 
-import glob
-import os
-import pickle
-from datetime import datetime
-from typing import List, Optional, Union
+class UMLMixin(object):
+    def __init__(self):
+        self.datetime = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
+        self.mdl_names = get_model_names()
 
-import numpy as np
-import pandas as pd
-from sklearn.feature_selection import VarianceThreshold
-import simba
+    @staticmethod
+    def umap_define(n_neighbors: int,
+                    min_distance: float,
+                    spread: float,
+                    gpu: bool = False,
+                    verbose: int = 1):
 
-try:
-    from typing import Literal
-except:
-    from typing_extensions import Literal
+        check_int(name=f'umap_define n_neighbors', value=n_neighbors, min_value=1)
+        check_float(name=f'umap_define min_distance', value=min_distance, min_value=0.0)
+        check_float(name=f'umap_define spread', value=spread, min_value=min_distance)
+        check_valid_boolean(value=[gpu], source=f'umap_define gpu')
+        check_valid_boolean(value=[verbose], source=f'umap_define verbose')
+        if gpu and not _cuda_available()[0]:
+            raise SimBAGPUError(msg='No GPU detected and GPU as True passed', source=UMLMixin.umap_define.__name__)
+        if gpu and not get_library_version(library_name='cuml'):
+            raise SimBAGPUError(msg='cuML library not detected and GPU as True passed', source=UMLMixin.umap_define.__name__)
+        if gpu:
+            return cuml_umap.UMAP(min_dist=min_distance, n_neighbors=n_neighbors, spread=spread, metric='euclidean', verbose=verbose)
+        else:
+            return umap.UMAP(min_dist=min_distance, n_neighbors=n_neighbors, spread=spread, metric='euclidean', verbose=verbose)
 
+    @staticmethod
+    def umap_fit(mdl: Union[umap.UMAP, cuml_umap.UMAP], data: Union[np.ndarray, pd.DataFrame]) -> Union[umap.UMAP, cuml_umap.UMAP]:
+        check_instance(source=f'{UMLMixin.umap_fit.__name__} mdl', instance=mdl, accepted_types=(umap.UMAP, cuml_umap.UMAP,))
+        check_instance(source=f'{UMLMixin.umap_fit.__name__} data', instance=data, accepted_types=(pd.DataFrame, np.ndarray,))
+        if isinstance(data, pd.DataFrame):
+            data = data.values
+        check_valid_array(data=data, source=f'{UMLMixin.umap_fit.__name__} data', accepted_ndims=(2,), accepted_dtypes=Formats.NUMERIC_DTYPES.value)
+        return mdl.fit(data)
 
-from simba.unsupervised.enums import Unsupervised
-from simba.utils.enums import Paths
-from simba.utils.printing import SimbaTimer
+    @staticmethod
+    def hdbscan_define(alpha: float,
+                       min_cluster_size: int,
+                       min_samples: int,
+                       cluster_selection_epsilon: float,
+                       gpu: bool = False,
+                       verbose: int = 1) -> Union[hdbscan.HDBSCAN, cuml_hdbscan.HDBSCAN]:
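+
+        """
+        Defines an HDBSCAN clustering model: the CPU ``hdbscan`` implementation, or the cuML GPU implementation when ``gpu=True``.
+
+        :example: (illustrative sketch; parameter values are arbitrary)
+        >>> mdl = UMLMixin.hdbscan_define(alpha=1.0, min_cluster_size=15, min_samples=1, cluster_selection_epsilon=0.0)
+        >>> mdl = UMLMixin.hdbscan_fit(mdl=mdl, data=np.random.random((100, 2)))
+        """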
-class UnsupervisedMixin(object):
+        check_int(name=f'hdbscan_define min_cluster_size', value=min_cluster_size, min_value=1)
+        check_int(name=f'hdbscan_define min_samples', value=min_samples, min_value=1)
+        check_float(name=f'hdbscan_define alpha', value=alpha, min_value=0.0)
+        check_float(name=f'hdbscan_define cluster_selection_epsilon', value=cluster_selection_epsilon, min_value=0.0)
+        check_valid_boolean(value=[gpu], source=f'hdbscan_define gpu')
+        check_valid_boolean(value=[verbose], source=f'hdbscan_define verbose')
+        if gpu and not _cuda_available()[0]:
+            raise SimBAGPUError(msg='No GPU detected and GPU as True passed', source=UMLMixin.hdbscan_define.__name__)
+        if gpu and not get_library_version(library_name='cuml'):
+            raise SimBAGPUError(msg='cuML library not detected and GPU as True passed', source=UMLMixin.hdbscan_define.__name__)
+        if not gpu:
+            return hdbscan.HDBSCAN(algorithm="best", alpha=alpha, approx_min_span_tree=True, gen_min_span_tree=True, min_cluster_size=min_cluster_size, min_samples=min_samples, cluster_selection_epsilon=cluster_selection_epsilon, p=None, prediction_data=True)
+        else:
+            return cuml_hdbscan.HDBSCAN(algorithm="best", alpha=alpha, approx_min_span_tree=True, gen_min_span_tree=True, min_cluster_size=min_cluster_size, min_samples=min_samples, cluster_selection_epsilon=cluster_selection_epsilon, p=None, prediction_data=True)
 
-    def __init__(self):
-        self.datetime = datetime.now().strftime("%Y%m%d%H%M%S")
-        self.timer = SimbaTimer(start=True)
-        model_names_dir = os.path.join(
-            os.path.dirname(simba.__file__), Paths.UNSUPERVISED_MODEL_NAMES.value
-        )
-        self.model_names = list(
-            pd.read_parquet(model_names_dir)[Unsupervised.NAMES.value]
-        )
+    @staticmethod
+    def hdbscan_fit(mdl: Union[hdbscan.HDBSCAN, cuml_hdbscan.HDBSCAN], data: Union[np.ndarray, pd.DataFrame]) -> object:
+        check_instance(source=f'{UMLMixin.hdbscan_fit.__name__} mdl', instance=mdl, accepted_types=(hdbscan.HDBSCAN, cuml_hdbscan.HDBSCAN,))
+        check_instance(source=f'{UMLMixin.hdbscan_fit.__name__} data', instance=data, accepted_types=(pd.DataFrame, np.ndarray,))
+        if isinstance(data, pd.DataFrame):
+            data = data.values
+        check_valid_array(data=data, source=f'{UMLMixin.hdbscan_fit.__name__} data', accepted_ndims=(2,), accepted_dtypes=Formats.NUMERIC_DTYPES.value)
+        return mdl.fit(data)
\ No newline at end of file
diff --git a/simba/model/regression/model.py b/simba/model/regression/model.py
index 3cf129dcb..911d79795 100644
--- a/simba/model/regression/model.py
+++ b/simba/model/regression/model.py
@@ -19,13 +19,13 @@
 
 def fit_xgb(x: pd.DataFrame,
             y: np.ndarray,
-            xgb_reg: xgb.XGBRegressor) -> xgb.XGBRegressor:
+            mdl: xgb.XGBRegressor) -> xgb.XGBRegressor:
     """
     Fits an XGBoost regressor model to the given data.
 
     :param pd.DataFrame x: Input feature matrix where each row represents a sample and each column a feature. The data must have numeric types.
     :param np.ndarray y: Target values, must be a 1-dimensional array of numeric types with the same number of rows as `x`.
-    :param xgb.XGBRegressor xgb_reg: Defined xgb.XGBRegressor. E.g., can be defined with :func:`simba.model.regression.model.xgb_define`,
+    :param xgb.XGBRegressor mdl: Defined xgb.XGBRegressor. E.g., can be defined with :func:`simba.model.regression.model.xgb_define`.
     :return: Trained XGBoost regressor model.
     :rtype: xgb.XGBRegressor
 
@@ -37,15 +37,16 @@ def fit_xgb(x: pd.DataFrame,
     check_valid_dataframe(df=x, source=f'{fit_xgb.__name__} x', valid_dtypes=Formats.NUMERIC_DTYPES.value)
     check_valid_array(data=y, source=f'{fit_xgb.__name__} y', accepted_ndims=(1,), accepted_axis_0_shape=[x.shape[0]], accepted_dtypes=Formats.NUMERIC_DTYPES.value)
-    check_instance(source=f'{fit_xgb.__name__} fit_xgb', instance=xgb_reg, accepted_types=(xgb.XGBRegressor,))
-    return xgb_reg.fit(X=x, y=y)
+    check_instance(source=f'{fit_xgb.__name__} mdl', instance=mdl, accepted_types=(xgb.XGBRegressor,))
+    return mdl.fit(X=x, y=y)
 
-def transform_xgb(x: pd.DataFrame, model: xgb.XGBRegressor) -> np.ndarray:
+def transform_xgb(x: pd.DataFrame,
+                  mdl: xgb.XGBRegressor) -> np.ndarray:
     """
     Transforms the input data using the provided XGBoost model by making predictions.
 
     :param pd.DataFrame x: Input feature matrix where each row represents a sample and each column a feature. The data must have numeric types.
-    :param xgb.XGBRegressor model: Trained XGBoost model to use for making predictions.
+    :param xgb.XGBRegressor mdl: Trained XGBoost model to use for making predictions.
     :return: Predictions rounded to 2 decimal places.
     :rtype: np.ndarray
 
@@ -53,18 +54,18 @@ def transform_xgb(x: pd.DataFrame, model: xgb.XGBRegressor) -> np.ndarray:
     >>> x, y = pd.DataFrame(np.random.randint(0, 500, (100, 20))), np.random.randint(1, 6, (100,))
     >>> mdl = fit_xgb(x=x, y=y)
     >>> new_x = pd.DataFrame(np.random.randint(0, 500, (100, 20)))
-    >>> results = transform_xgb(x=new_x, model=mdl)
+    >>> results = transform_xgb(x=new_x, mdl=mdl)
 
-    :example:
-    >>> x, y = pd.DataFrame(np.random.randint(0, 500, (100, 20))), np.random.randint(1, 6, (100,))
-    >>> mdl = fit_xgb(x=x, y=y)
-    >>> new_x = pd.DataFrame(np.random.randint(0, 500, (100, 20)))
-    >>> results = transform_xgb(x=new_x, model=mdl)
     """
 
-    check_instance(source=transform_xgb.__name__, instance=model, accepted_types=(xgb.XGBRegressor,))
+    check_instance(source=transform_xgb.__name__, instance=mdl, accepted_types=(xgb.XGBRegressor,))
     check_valid_dataframe(df=x, source=f'{transform_xgb.__name__} x', valid_dtypes=Formats.NUMERIC_DTYPES.value)
-    expected_x_names = model.get_booster().feature_names
+    expected_x_names = mdl.get_booster().feature_names
     new_x_names = [str(i) for i in list(x.columns)]
     missing_x_names = set([i for i in expected_x_names if i not in new_x_names])
     additional_x_names = set([i for i in new_x_names if i not in expected_x_names])
@@ -74,7 +75,7 @@ def transform_xgb(x: pd.DataFrame, model: xgb.XGBRegressor) -> np.ndarray:
         raise DataHeaderError(msg=f'The new data are missing {len(missing_x_names)} features expected by the model: {missing_x_names}', source=transform_xgb.__name__)
     if expected_x_names != new_x_names:
         raise DataHeaderError(msg=f'The new data contains features in the wrong order from the expected features', source=transform_xgb.__name__)
-    return np.round(model.predict(x), 2)
+    return np.round(mdl.predict(x), 2)
 
 
 def evaluate_xgb(y_pred: np.ndarray,
@@ -97,7 +98,7 @@ def evaluate_xgb(y_pred: np.ndarray,
     >>> y = np.random.randint(1, 6, (100,))
     >>> mdl = fit_xgb(x=x, y=y)
     >>> new_x = pd.DataFrame(np.random.randint(0, 500, (100, 20)))
-    >>> y_pred = transform_xgb(x=new_x, model=mdl)
+    >>> y_pred = transform_xgb(x=new_x, mdl=mdl)
     >>> evaluate_xgb(y_pred=y_pred, y_true=y, metrics=['MAE', 'MAPE', 'RMSE', 'MSE'])
     """
     METRICS = {'MAPE': mean_absolute_percentage_error, 'MSE': mean_squared_error, 'MAE': mean_absolute_error, 'R2': r2_score, 'RMSE': root_mean_squared_error}
@@ -126,6 +127,21 @@ def xgb_define(objective: str = 'reg:squarederror',
gamma: float = 0.0, tree_method: str = 'auto') -> xgb.XGBRegressor: + """ + Defines an XGBoost regressor. + + :param str objective: The learning objective for the model. + :param int n_estimators: Number of boosting rounds. Must be greater than or equal to 1. Default is 100. + :param int max_depth: Maximum depth of a tree. Increasing this value makes the model more complex and more likely to overfit. Must be greater than or equal to 1. Default is 6. + :param int verbosity: Verbosity of the training process (0-3). + :param float learning_rate: Step size shrinkage used to prevent overfitting. Lower values make the model more robust but require more boosting rounds. Must be between 0.1 and 1.0. Default is 0.3. + :param float eta: Learning rate alias. Must be between 0.0 and 1.0. Default is 0.3. + :param float gamma: Minimum loss reduction required to make a further partition on a leaf node of the tree. Larger values prevent overfitting. Must be greater than or equal to 0.0. Default is 0.0. + :param str tree_method: The tree construction algorithm used in XGBoost. + :return: An initialized XGBoost Regressor with the specified configuration. + :rtype: xgb.XGBRegressor + """ + OBJECTIVES = ('reg:squarederror', 'reg:squaredlogerror', 'reg:logistic', 'reg:pseudohubererror') TREE_METHODS = ('auto', 'exact', 'approx', 'hist', 'gpu_hist') check_str(name=f'{fit_xgb.__name__} objective', value=objective, options=OBJECTIVES) @@ -157,19 +173,18 @@ def xgb_grid_define(objective: Tuple[str] = ('reg:squarederror',), return mdls - - def xgb_grid_fit(x: pd.DataFrame, y: np.ndarray, - xgb_regs: List[xgb.XGBRegressor]) -> List[xgb.XGBRegressor]: - check_valid_lst(data=xgb_regs, source=xgb_grid_fit.__name__, valid_dtypes=()) - - - - + mdls: List[xgb.XGBRegressor]) -> List[xgb.XGBRegressor]: + check_valid_dataframe(df=x, source=f'{fit_xgb.__name__} x', valid_dtypes=Formats.NUMERIC_DTYPES.value) + check_valid_array(data=y, source=f'{fit_xgb.__name__} y', accepted_ndims=(1,), accepted_axis_0_shape=[x.shape[0]], accepted_dtypes=Formats.NUMERIC_DTYPES.value) + check_valid_lst(data=mdls, source=xgb_grid_fit.__name__, valid_dtypes=(xgb.XGBRegressor,)) + results = [] + for mdl in mdls: + results.append(fit_xgb(x=x, y=y, mdl=mdl)) + return results - #xgb_define() @@ -187,7 +202,7 @@ def xgb_grid_fit(x: pd.DataFrame, -xgb_grid_define(max_depth=(6, 3), gamma=(0, 0.3)) +#xgb_grid_define(max_depth=(6, 3), gamma=(0, 0.3)) # x = pd.DataFrame(np.random.randint(0, 500, (100, 20))) # y = np.random.randint(1, 6, (100,)) diff --git a/simba/plotting/heat_mapper_clf.py b/simba/plotting/heat_mapper_clf.py index 0f858448e..a4bfcb15b 100644 --- a/simba/plotting/heat_mapper_clf.py +++ b/simba/plotting/heat_mapper_clf.py @@ -1,22 +1,17 @@ __author__ = "Simon Nilsson" import os -from typing import List - +from typing import List, Union import cv2 -import matplotlib.pyplot as plt import numpy as np -import pandas as pd -from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas -from numba import jit, prange - from simba.mixins.config_reader import ConfigReader +from simba.mixins.geometry_mixin import GeometryMixin from simba.mixins.plotting_mixin import PlottingMixin -from simba.utils.enums import Formats -from simba.utils.errors import NoSpecifiedOutputError +from simba.utils.enums import Formats, Options +from simba.utils.errors import NoSpecifiedOutputError, InvalidInputError from simba.utils.printing import SimbaTimer, stdout_success from simba.utils.read_write import get_fn_ext, read_df - +from simba.utils.checks 
import check_all_file_names_are_represented_in_video_log, check_str, check_valid_dict, check_filepaths_in_iterable_exist, check_valid_dataframe, check_int
 
 class HeatMapperClfSingleCore(ConfigReader, PlottingMixin):
     """
@@ -34,291 +29,129 @@ class HeatMapperClfSingleCore(ConfigReader, PlottingMixin):
     :param bool final_img_setting: If True, then create a single image representing the last frame of the input video
     :param bool video_setting: If True, then create a video of heatmaps.
     :param bool frame_setting: If True, then create individual heatmap frames.
-    :param int bin_size: The rectangular size of each heatmap location in millimeters. For example, `50` will divide the video into
-        5 centimeter rectangular spatial bins.
-    :param str palette: Heatmap pallette. Eg. 'jet', 'magma', 'inferno','plasma', 'viridis', 'gnuplot2'
-    :param str bodypart: The name of the body-part used to infer the location of the classified behavior
     :param str clf_name: The name of the classified behavior.
-    :param int or 'auto' max_scale: The max value in the heatmap in seconds. E.g., with a value of `10`, if the classified behavior has occured
-        >= 10 within a rectangular bins, it will be filled with the same color.
+    :param str bodypart: The name of the body-part used to infer the location of the classified behavior.
+    :param Dict style_attr: Dict containing settings for colormap, bin-size, max scale, and smoothing operations. For example: {'palette': 'jet', 'shading': 'gouraud', 'bin_size': 50, 'max_scale': 'auto'}.
+
+    :example:
+    >>> test = HeatMapperClfSingleCore(config_path=r"C:\troubleshooting\RAT_NOR\project_folder\project_config.ini",
+    >>>                                style_attr = {'palette': 'jet', 'shading': 'gouraud', 'bin_size': 50, 'max_scale': 'auto'},
+    >>>                                final_img_setting=True,
+    >>>                                video_setting=True,
+    >>>                                frame_setting=False,
+    >>>                                bodypart='Ear_left',
+    >>>                                clf_name='straub_tail',
+    >>>                                data_paths=[r"C:\troubleshooting\RAT_NOR\project_folder\csv\test\2022-06-20_NOB_DOT_4.csv"])
+    >>> test.run()
+    """
+
+    def __init__(self,
+                 config_path: Union[str, os.PathLike],
+                 bodypart: str,
+                 clf_name: str,
+                 data_paths: List[str],
+                 style_attr: dict,
+                 final_img_setting: bool = True,
+                 video_setting: bool = False,
+                 frame_setting: bool = False):
 
-    Examples
-    -----
-    >>> heat_mapper_clf = HeatMapperClfSingleCore(config_path='MyConfigPath', final_img_setting=False, video_setting=True, frame_setting=False, bin_size=50, palette='jet', bodypart='Nose_1', clf_name='Attack', max_scale=20).run()
-    """
 
-    def __init__(
-        self,
-        config_path: str,
-        final_img_setting: bool,
-        video_setting: bool,
-        frame_setting: bool,
-        bodypart: str,
-        clf_name: str,
-        files_found: List[str],
-        style_attr: dict,
-    ):
         ConfigReader.__init__(self, config_path=config_path)
         PlottingMixin.__init__(self)
         if (not frame_setting) and (not video_setting) and (not final_img_setting):
-            raise NoSpecifiedOutputError(
-                msg="Please choose to select either heatmap videos, frames, and/or final image."
- ) - self.frame_setting, self.video_setting = frame_setting, video_setting - self.final_img_setting, self.bp = final_img_setting, bodypart - self.bin_size, self.max_scale, self.palette, self.shading = ( - style_attr["bin_size"], - style_attr["max_scale"], - style_attr["palette"], - style_attr["shading"], - ) - self.clf_name, self.files_found = clf_name, files_found - if not os.path.exists(self.heatmap_clf_location_dir): - os.makedirs(self.heatmap_clf_location_dir) - self.bp_lst = [self.bp + "_x", self.bp + "_y"] + raise NoSpecifiedOutputError(msg="Please select either heatmap videos, frames, and/or final image.") + check_filepaths_in_iterable_exist(file_paths=data_paths, name=f'{self.__class__.__name__} data_paths') + check_str(name=f'{self.__class__.__name__} clf_name', value=clf_name) + check_str(name=f'{self.__class__.__name__} bodypart', value=bodypart) + check_valid_dict(x=style_attr, required_keys=('max_scale', 'bin_size', 'shading', 'palette')) + self.frame_setting, self.video_setting, self.final_img_setting = frame_setting, video_setting, final_img_setting + self.bin_size, self.max_scale, self.palette, self.shading = (style_attr["bin_size"], style_attr["max_scale"], style_attr["palette"], style_attr["shading"]) + check_str(name=f'{self.__class__.__name__} shading', value=style_attr["shading"], options=Options.HEATMAP_SHADING_OPTIONS.value) + check_int(name=f'{self.__class__.__name__} bin_size', value=style_attr["bin_size"], min_value=1) + self.clf_name, self.data_paths, self.bp = clf_name, data_paths, bodypart + if not os.path.exists(self.heatmap_clf_location_dir): os.makedirs(self.heatmap_clf_location_dir) + self.bp_lst = [f"{self.bp}_x", f"{self.bp}_y"] self.timer = SimbaTimer(start=True) - print("Processing {} video(s)...".format(str(len(self.files_found)))) - - @staticmethod - @jit(nopython=True) - def __calculate_cum_array(clf_array: np.array, fps: int): - cum_sum_arr = np.full(clf_array.shape, np.nan) - for frm_idx in prange(clf_array.shape[0]): - frame_cum_sum = np.full((clf_array.shape[1], clf_array.shape[2]), 0.0) - sliced_arr = clf_array[0:frm_idx] - for i in range(sliced_arr.shape[0]): - for j in range(sliced_arr.shape[1]): - for k in range(sliced_arr.shape[2]): - frame_cum_sum[j][k] += sliced_arr[i][j][k] - cum_sum_arr[frm_idx] = frame_cum_sum - - return cum_sum_arr / fps - - def __calculate_bin_attr( - self, - data_df: pd.DataFrame, - clf_name: str, - bp_lst: list, - px_per_mm: int, - img_width: int, - img_height: int, - bin_size: int, - fps: int, - ): - bin_size_px = int(float(px_per_mm) * float(bin_size)) - horizontal_bin_cnt = int(img_width / bin_size_px) - vertical_bin_cnt = int(img_height / bin_size_px) - aspect_ratio = round((vertical_bin_cnt / horizontal_bin_cnt), 3) - - clf_idx = ( - data_df[bp_lst][data_df[clf_name] == 1].reset_index().to_numpy().astype(int) - ) - - bin_dict = {} - x_location, y_location = 0, 0 - for hbin in range(horizontal_bin_cnt): - bin_dict[hbin] = {} - for vbin in range(vertical_bin_cnt): - bin_dict[hbin][vbin] = { - "top_left_x": x_location, - "top_left_y": y_location, - "bottom_right_x": x_location + bin_size_px, - "bottom_right_y": y_location + bin_size_px, - } - y_location += bin_size_px - y_location = 0 - x_location += bin_size_px - - clf_array = np.zeros((len(data_df), vertical_bin_cnt, horizontal_bin_cnt)) - - for clf_frame in clf_idx: - for h_bin_name, v_dict in bin_dict.items(): - for v_bin_name, c in v_dict.items(): - if ( - clf_frame[1] <= c["bottom_right_x"] - and clf_frame[1] >= c["top_left_x"] - ): - if ( - clf_frame[2] <= 
c["bottom_right_y"] - and clf_frame[2] >= c["top_left_y"] - ): - clf_array[int(clf_frame[0])][v_bin_name][h_bin_name] = 1 - - clf_array = self.__calculate_cum_array(clf_array=clf_array, fps=fps) - - return clf_array, aspect_ratio def __calculate_max_scale(self, clf_array: np.array): return np.round(np.max(np.max(clf_array[-1], axis=0)), 3) def run(self): - """ - Creates heatmap charts. Results are stored in the `project_folder/frames/heatmaps_classifier_locations` - directory of SimBA project. - - Returns - ---------- - None - """ - - for file_cnt, file_path in enumerate(self.files_found): - video_timer = SimbaTimer() - video_timer.start_timer() + print(f"Processing heatmaps for {len(self.data_paths)} video(s)...") + check_all_file_names_are_represented_in_video_log(video_info_df=self.video_info_df, data_paths=self.data_paths) + for file_cnt, file_path in enumerate(self.data_paths): + video_timer = SimbaTimer(start=True) _, self.video_name, _ = get_fn_ext(file_path) - self.video_info, self.px_per_mm, self.fps = self.read_video_info( - video_name=self.video_name - ) - self.width, self.height = int( - self.video_info["Resolution_width"].values[0] - ), int(self.video_info["Resolution_height"].values[0]) + print(f'Plotting heatmap classification map for video {self.video_name}...') + self.video_info, self.px_per_mm, self.fps = self.read_video_info(video_name=self.video_name) + self.width, self.height = int(self.video_info["Resolution_width"].values[0]), int(self.video_info["Resolution_height"].values[0]) if self.video_setting: self.fourcc = cv2.VideoWriter_fourcc(*Formats.MP4_CODEC.value) - self.video_save_path = os.path.join( - self.heatmap_clf_location_dir, self.video_name + ".mp4" - ) - self.writer = cv2.VideoWriter( - self.video_save_path, - self.fourcc, - self.fps, - (self.width, self.height), - ) + self.video_save_path = os.path.join(self.heatmap_clf_location_dir, f"{self.video_name}.mp4") + self.writer = cv2.VideoWriter(self.video_save_path, self.fourcc, self.fps, (self.width, self.height)) if self.frame_setting: - self.save_video_folder = os.path.join( - self.heatmap_clf_location_dir, self.video_name - ) + self.save_video_folder = os.path.join(self.heatmap_clf_location_dir, self.video_name) if not os.path.exists(self.save_video_folder): os.makedirs(self.save_video_folder) self.data_df = read_df(file_path=file_path, file_type=self.file_type) - clf_array, aspect_ratio = self.__calculate_bin_attr( - data_df=self.data_df, - clf_name=self.clf_name, - bp_lst=self.bp_lst, - px_per_mm=self.px_per_mm, - img_width=self.width, - img_height=self.height, - bin_size=self.bin_size, - fps=self.fps, - ) - + check_valid_dataframe(df=self.data_df, required_fields=[self.clf_name] + self.bp_lst, valid_dtypes=Formats.NUMERIC_DTYPES.value) + bp_data = self.data_df[self.bp_lst].values.astype(np.int32) + clf_data = self.data_df[self.clf_name].values.astype(np.int32) + if len(np.unique(clf_data)) == 1: + raise InvalidInputError(msg=f'Cannot plot heatmap for behavior {self.clf_name} in video {self.video_name}. 
The behavior is classified as {np.unique(clf_data)} in every single frame.') + grid, aspect_ratio = GeometryMixin.bucket_img_into_grid_square(img_size=(self.width, self.height), bucket_grid_size_mm=self.bin_size, px_per_mm=self.px_per_mm, add_correction=False) + clf_data = GeometryMixin().cumsum_bool_geometries(data=bp_data, geometries=grid, bool_data=clf_data, fps=self.fps, verbose=False) if self.max_scale == "auto": - self.max_scale = self.__calculate_max_scale(clf_array=clf_array) - if self.max_scale == 0: - self.max_scale = 1 - + self.max_scale = max(1, self.__calculate_max_scale(clf_array=clf_data)) if self.final_img_setting: - self.make_clf_heatmap_plot( - frm_data=clf_array[-1, :, :], - max_scale=self.max_scale, - palette=self.palette, - aspect_ratio=aspect_ratio, - file_name=os.path.join( - self.heatmap_clf_location_dir, - self.video_name + "_final_frm.png", - ), - shading=self.shading, - clf_name=self.clf_name, - img_size=(self.width, self.height), - final_img=True, - ) + file_name = os.path.join(self.heatmap_clf_location_dir, f"{self.video_name}_final_frm.png") + self.make_location_heatmap_plot(frm_data=clf_data[-1:, :, :][0], + max_scale=self.max_scale, + palette=self.palette, + aspect_ratio=aspect_ratio, + file_name=file_name, + shading=self.shading, + img_size=(self.width, self.height)) + print(f"Final heatmap image saved at {file_name}.") if self.video_setting or self.frame_setting: - for frm_cnt, cumulative_frm_idx in enumerate(range(clf_array.shape[0])): - frm_data = clf_array[cumulative_frm_idx, :, :] - cum_df = pd.DataFrame(frm_data).reset_index() - cum_df = cum_df.melt( - id_vars="index", - value_vars=None, - var_name=None, - value_name="seconds", - col_level=None, - ).rename( - columns={"index": "vertical_idx", "variable": "horizontal_idx"} - ) - cum_df["color"] = ( - (cum_df["seconds"].astype(float) / float(self.max_scale)) - .round(2) - .clip(upper=100) - ) - color_array = np.zeros( - ( - len(cum_df["vertical_idx"].unique()), - len(cum_df["horizontal_idx"].unique()), - ) - ) - for i in range(color_array.shape[0]): - for j in range(color_array.shape[1]): - value = cum_df["color"][ - (cum_df["horizontal_idx"] == j) - & (cum_df["vertical_idx"] == i) - ].values[0] - color_array[i, j] = value - - fig = plt.figure() - im_ratio = color_array.shape[0] / color_array.shape[1] - plt.pcolormesh( - color_array, - shading=self.shading, - cmap=self.palette, - rasterized=True, - alpha=1, - vmin=0.0, - vmax=float(self.max_scale), - ) - plt.gca().invert_yaxis() - plt.xticks([]) - plt.yticks([]) - plt.axis("off") - plt.tick_params(axis="both", which="both", length=0) - cb = plt.colorbar(pad=0.0, fraction=0.023 * im_ratio) - cb.ax.tick_params(size=0) - cb.outline.set_visible(False) - cb.set_label( - "{} (seconds)".format(self.clf_name), rotation=270, labelpad=10 - ) - plt.tight_layout() - plt.gca().set_aspect(aspect_ratio) - canvas = FigureCanvas(fig) - canvas.draw() - mat = np.array(canvas.renderer._renderer) - image = cv2.cvtColor(mat, cv2.COLOR_RGB2BGR) - image = cv2.resize(image, (self.width, self.height)) - image = np.uint8(image) - plt.close() + for frm_cnt, cumulative_frm_idx in enumerate(range(clf_data.shape[0])): + frm_data = clf_data[cumulative_frm_idx, :, :] + img = self.make_location_heatmap_plot(frm_data=frm_data, + max_scale=self.max_scale, + palette=self.palette, + aspect_ratio=aspect_ratio, + shading=self.shading, + img_size=(self.width, self.height))[:,:,:3] if self.video_setting: - self.writer.write(image) + self.writer.write(img) if self.frame_setting: - 
frame_save_path = os.path.join( - self.save_video_folder, str(frm_cnt) + ".png" - ) - cv2.imwrite(frame_save_path, image) - print( - "Created heatmap frame: {} / {}. Video: {} ({}/{})".format( - str(frm_cnt + 1), - str(len(self.data_df)), - self.video_name, - str(file_cnt + 1), - len(self.files_found), - ) - ) + frame_save_path = os.path.join(self.save_video_folder, f"{frm_cnt}.png") + cv2.imwrite(frame_save_path, img) + print(f"Created heatmap frame: {frm_cnt+1} / {len(self.data_df)}. Video: {self.video_name} ({file_cnt + 1}/{len(self.data_paths)})") + if self.video_setting: + self.writer.release() + video_timer.stop_timer() + print(f"Heatmap plot for video {self.video_name} saved at {self.heatmap_clf_location_dir} (elapsed time: {video_timer.elapsed_time_str}s)...") + + self.timer.stop_timer() + stdout_success(msg=f"All heatmap visualizations created in {self.heatmap_clf_location_dir} directory", elapsed_time=self.timer.elapsed_time_str) + + +# test = HeatMapperClfSingleCore(config_path=r"C:\troubleshooting\RAT_NOR\project_folder\project_config.ini", +# style_attr = {'palette': 'jet', 'shading': 'gouraud', 'bin_size': 50, 'max_scale': 'auto'}, +# final_img_setting=True, +# video_setting=True, +# frame_setting=False, +# bodypart='Ear_left', +# clf_name='straub_tail', +# data_paths=[r"C:\troubleshooting\RAT_NOR\project_folder\csv\test\2022-06-20_NOB_DOT_4.csv"]) +# test.run() - if self.video_setting: - self.writer.release() - video_timer.stop_timer() - print( - "Heatmap plot for video {} saved (elapsed time: {}s) ... ".format( - self.video_name, video_timer.elapsed_time_str - ) - ) - self.timer.stop_timer() - stdout_success( - msg="All heatmap visualizations created in project_folder/frames/output/heatmaps_classifier_locations directory", - elapsed_time="self.timer.elapsed_time_str", - ) # test = HeatMapperClfSingleCore(config_path='/Users/simon/Desktop/envs/troubleshooting/Two_animals_16bps/project_folder/project_config.ini', diff --git a/simba/plotting/heat_mapper_clf_mp.py b/simba/plotting/heat_mapper_clf_mp.py index 2aa5354a7..8926ab6c0 100644 --- a/simba/plotting/heat_mapper_clf_mp.py +++ b/simba/plotting/heat_mapper_clf_mp.py @@ -4,6 +4,7 @@ import multiprocessing import os import platform +from typing import Union, List import cv2 import numpy as np @@ -13,63 +14,54 @@ import simba.mixins.plotting_mixin from simba.mixins.config_reader import ConfigReader from simba.mixins.plotting_mixin import PlottingMixin +from simba.mixins.geometry_mixin import GeometryMixin from simba.utils.enums import Formats -from simba.utils.errors import NoSpecifiedOutputError +from simba.utils.errors import NoSpecifiedOutputError, InvalidInputError from simba.utils.printing import SimbaTimer, stdout_success -from simba.utils.read_write import (concatenate_videos_in_folder, get_fn_ext, - read_df, remove_a_folder) - - -def _heatmap_multiprocessor( - data: np.array, - video_setting: bool, - frame_setting: bool, - video_temp_dir: str, - video_name: str, - frame_dir: str, - fps: int, - style_attr: dict, - max_scale: float, - clf_name: str, - aspect_ratio: float, - size: tuple, - make_clf_heatmap_plot: simba.mixins.plotting_mixin.PlottingMixin.make_clf_heatmap_plot, -): - group = int(data[0][0][1]) +from simba.utils.read_write import (concatenate_videos_in_folder, get_fn_ext, read_df, remove_a_folder, find_core_cnt) +from simba.utils.checks import check_valid_boolean, check_int, check_str, check_valid_dict, check_filepaths_in_iterable_exist, check_all_file_names_are_represented_in_video_log, 
check_valid_dataframe + + +def _heatmap_multiprocessor(data: np.array, + video_setting: bool, + frame_setting: bool, + video_temp_dir: str, + video_name: str, + frame_dir: str, + fps: int, + style_attr: dict, + max_scale: float, + clf_name: str, + aspect_ratio: float, + size: tuple, + make_clf_heatmap_plot: simba.mixins.plotting_mixin.PlottingMixin.make_clf_heatmap_plot): + + batch, frm_ids, data = data[0], data[1], data[2] if video_setting: fourcc = cv2.VideoWriter_fourcc(*Formats.MP4_CODEC.value) - video_save_path = os.path.join(video_temp_dir, "{}.mp4".format(str(group))) + video_save_path = os.path.join(video_temp_dir, f"{batch}.mp4") video_writer = cv2.VideoWriter(video_save_path, fourcc, fps, size) - - for i in range(data.shape[0]): - frame_id = int(data[i, 0, 0]) - frm_data = data[i, :, 2:] - img = make_clf_heatmap_plot( - frm_data=frm_data, - max_scale=max_scale, - palette=style_attr["palette"], - aspect_ratio=aspect_ratio, - shading=style_attr["shading"], - clf_name=clf_name, - img_size=size, - final_img=False, - ) - print( - "Heatmap frame created: {}, Video: {}, Processing core: {}".format( - str(frame_id + 1), video_name, str(group + 1) - ) - ) + for frm_idx in range(data.shape[0]): + frame_id, frm_data = int(frm_ids[frm_idx]), data[frm_idx] + img = make_clf_heatmap_plot(frm_data=frm_data, + max_scale=max_scale, + palette=style_attr["palette"], + aspect_ratio=aspect_ratio, + shading=style_attr["shading"], + clf_name=clf_name, + img_size=size) + + print(f"Heatmap frame created: {frame_id + 1}, Video: {video_name}, Processing core: {batch+1}") if video_setting: video_writer.write(img) - if frame_setting: - file_path = os.path.join(frame_dir, "{}.png".format(frame_id)) + file_path = os.path.join(frame_dir, f"{frame_id}.png") cv2.imwrite(file_path, img) if video_setting: video_writer.release() - return group + return batch class HeatMapperClfMultiprocess(ConfigReader, PlottingMixin): @@ -96,276 +88,165 @@ class HeatMapperClfMultiprocess(ConfigReader, PlottingMixin): >= 10 within a rectangular bins, it will be filled with the same color. :param int core_cnt: Number of cores to use. 
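+    :param dict style_attr: Dict containing settings for colormap, bin size, max scale, and shading, e.g., {'palette': 'jet', 'shading': 'gouraud', 'bin_size': 50, 'max_scale': 'auto'}.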
- :examples: + :example I: >>> heat_mapper_clf = HeatMapperClfMultiprocess(config_path='MyConfigPath', final_img_setting=False, video_setting=True, frame_setting=False, bin_size=50, palette='jet', bodypart='Nose_1', clf_name='Attack', max_scale=20) >>> heat_mapper_clf.create_heatmaps() + + + :example II: + >>> test = HeatMapperClfMultiprocess(config_path=r"C:\troubleshooting\RAT_NOR\project_folder\project_config.ini", + >>> style_attr = {'palette': 'jet', 'shading': 'gouraud', 'bin_size': 50, 'max_scale': 'auto'}, + >>> final_img_setting=True, + >>> video_setting=True, + >>> frame_setting=True, + >>> bodypart='Ear_left', + >>> clf_name='straub_tail', + >>> data_paths=[r"C:\troubleshooting\RAT_NOR\project_folder\csv\test\2022-06-20_NOB_DOT_4.csv"]) + >>> test.run() """ - def __init__( - self, - config_path: str, - final_img_setting: bool, - video_setting: bool, - frame_setting: bool, - bodypart: str, - clf_name: str, - files_found: list, - style_attr: dict, - core_cnt: int, - ): - ConfigReader.__init__(self, config_path=config_path) + def __init__(self, + config_path: Union[str, os.PathLike], + bodypart: str, + clf_name: str, + data_paths: List[str], + style_attr: dict, + final_img_setting: bool = True, + video_setting: bool = False, + frame_setting: bool = False, + core_cnt: int = -1): + + ConfigReader.__init__(self, config_path=config_path, create_logger=False) PlottingMixin.__init__(self) if platform.system() == "Darwin": multiprocessing.set_start_method("spawn", force=True) + check_valid_boolean(value=[frame_setting, video_setting, final_img_setting], source=self.__class__.__name__) if (not frame_setting) and (not video_setting) and (not final_img_setting): - raise NoSpecifiedOutputError( - msg="Please choose to select either heatmap videos, frames, and/or final image." 
- ) + raise NoSpecifiedOutputError(msg="Please choose to select either heatmap videos, frames, and/or final image.") + check_filepaths_in_iterable_exist(file_paths=data_paths, name=f'{self.__class__.__name__} data_paths') + check_str(name=f'{self.__class__.__name__} clf_name', value=clf_name) + check_str(name=f'{self.__class__.__name__} bodypart', value=bodypart) + check_int(name=f'{self.__class__.__name__} core_cnt', value=core_cnt, min_value=-1, unaccepted_vals=[0]) self.frame_setting, self.video_setting = frame_setting, video_setting self.final_img_setting, self.bp = final_img_setting, bodypart + check_valid_dict(x=style_attr, required_keys=('max_scale', 'bin_size', 'shading', 'palette')) self.style_attr = style_attr - self.bin_size, self.max_scale, self.palette, self.shading, self.core_cnt = ( - style_attr["bin_size"], - style_attr["max_scale"], - style_attr["palette"], - style_attr["shading"], - core_cnt, - ) - self.clf_name, self.files_found = clf_name, files_found + self.bin_size, self.max_scale, self.palette, self.shading = (style_attr["bin_size"], style_attr["max_scale"], style_attr["palette"], style_attr["shading"]) + self.clf_name, self.data_paths = clf_name, data_paths + self.core_cnt = [find_core_cnt()[0] if core_cnt == -1 or core_cnt > find_core_cnt()[0] else core_cnt][0] if not os.path.exists(self.heatmap_clf_location_dir): os.makedirs(self.heatmap_clf_location_dir) - self.bp_lst = [self.bp + "_x", self.bp + "_y"] - print("Processing {} video(s)...".format(str(len(self.files_found)))) - - @staticmethod - @jit(nopython=True) - def __calculate_cum_array(clf_array: np.array, fps: int): - cum_sum_arr = np.full(clf_array.shape, np.nan) - for frm_idx in prange(clf_array.shape[0]): - frame_cum_sum = np.full((clf_array.shape[1], clf_array.shape[2]), 0.0) - sliced_arr = clf_array[0:frm_idx] - for i in range(sliced_arr.shape[0]): - for j in range(sliced_arr.shape[1]): - for k in range(sliced_arr.shape[2]): - frame_cum_sum[j][k] += sliced_arr[i][j][k] - cum_sum_arr[frm_idx] = frame_cum_sum - - return cum_sum_arr / fps - - @staticmethod - @jit(nopython=True) - def __insert_group_idx_column(data: np.array, group: int, last_frm_idx: int): - results = np.full((data.shape[0], data.shape[1], data.shape[2] + 2), np.nan) - group_col = np.full((data.shape[1], 1), group) - for frm_idx in prange(data.shape[0]): - h_stack = np.hstack((group_col, data[frm_idx])) - frm_col = np.full((h_stack.shape[0], 1), frm_idx + last_frm_idx) - results[frm_idx] = np.hstack((frm_col, h_stack)) - - return results - - def __calculate_bin_attr( - self, - data_df: pd.DataFrame, - clf_name: str, - bp_lst: list, - px_per_mm: int, - img_width: int, - img_height: int, - bin_size: int, - fps: int, - ): - bin_size_px = int(float(px_per_mm) * float(bin_size)) - horizontal_bin_cnt = int(img_width / bin_size_px) - vertical_bin_cnt = int(img_height / bin_size_px) - aspect_ratio = round((vertical_bin_cnt / horizontal_bin_cnt), 3) - - clf_idx = ( - data_df[bp_lst][data_df[clf_name] == 1].reset_index().to_numpy().astype(int) - ) - - bin_dict = {} - x_location, y_location = 0, 0 - for hbin in range(horizontal_bin_cnt): - bin_dict[hbin] = {} - for vbin in range(vertical_bin_cnt): - bin_dict[hbin][vbin] = { - "top_left_x": x_location, - "top_left_y": y_location, - "bottom_right_x": x_location + bin_size_px, - "bottom_right_y": y_location + bin_size_px, - } - y_location += bin_size_px - y_location = 0 - x_location += bin_size_px - - clf_array = np.zeros((len(data_df), vertical_bin_cnt, horizontal_bin_cnt)) - - for clf_frame in 
clf_idx: - for h_bin_name, v_dict in bin_dict.items(): - for v_bin_name, c in v_dict.items(): - if ( - clf_frame[1] <= c["bottom_right_x"] - and clf_frame[1] >= c["top_left_x"] - ): - if ( - clf_frame[2] <= c["bottom_right_y"] - and clf_frame[2] >= c["top_left_y"] - ): - clf_array[int(clf_frame[0])][v_bin_name][h_bin_name] = 1 - - clf_array = self.__calculate_cum_array(clf_array=clf_array, fps=fps) - - return clf_array, aspect_ratio + self.bp_lst = [f"{self.bp}_x", f"{self.bp}_y"] def __calculate_max_scale(self, clf_array: np.array): return np.round(np.max(np.max(clf_array[-1], axis=0)), 3) def run(self): - for file_cnt, file_path in enumerate(self.files_found): - video_timer = SimbaTimer() - video_timer.start_timer() + print(f"Processing {len(self.data_paths)} video(s)...") + check_all_file_names_are_represented_in_video_log(video_info_df=self.video_info_df, data_paths=self.data_paths) + for file_cnt, file_path in enumerate(self.data_paths): + video_timer = SimbaTimer(start=True) _, self.video_name, _ = get_fn_ext(file_path) - self.video_info, self.px_per_mm, self.fps = self.read_video_info( - video_name=self.video_name - ) - self.width, self.height = int( - self.video_info["Resolution_width"].values[0] - ), int(self.video_info["Resolution_height"].values[0]) - self.save_frame_folder_dir = os.path.join( - self.heatmap_clf_location_dir, self.video_name + "_" + self.clf_name - ) - self.video_folder = os.path.join( - self.heatmap_clf_location_dir, self.video_name + "_" + self.clf_name - ) - self.temp_folder = os.path.join( - self.heatmap_clf_location_dir, - self.video_name + "_" + self.clf_name, - "temp", - ) - if self.frame_setting: - if os.path.exists(self.save_frame_folder_dir): - remove_a_folder(folder_dir=self.save_frame_folder_dir) - if not os.path.exists(self.save_frame_folder_dir): - os.makedirs(self.save_frame_folder_dir) + print(f'Plotting classification heatmap for video {self.video_name}...') + self.video_info, self.px_per_mm, self.fps = self.read_video_info(video_name=self.video_name) + self.width, self.height = int(self.video_info["Resolution_width"].values[0]), int(self.video_info["Resolution_height"].values[0]) + self.temp_folder = os.path.join(self.heatmap_clf_location_dir, "temp") + self.frames_save_dir = os.path.join(self.heatmap_clf_location_dir, f"{self.video_name}_{self.clf_name}") if self.video_setting: if os.path.exists(self.temp_folder): remove_a_folder(folder_dir=self.temp_folder) - remove_a_folder(folder_dir=self.video_folder) os.makedirs(self.temp_folder) - self.save_video_path = os.path.join( - self.heatmap_clf_location_dir, - "{}_{}.mp4".format(self.video_name, self.clf_name), - ) - + self.save_video_path = os.path.join(self.heatmap_clf_location_dir, f"{self.video_name}_{self.clf_name}.mp4") + if self.frame_setting: + if os.path.exists(self.frames_save_dir): + remove_a_folder(folder_dir=self.frames_save_dir) + os.makedirs(self.frames_save_dir) self.data_df = read_df(file_path=file_path, file_type=self.file_type) - clf_array, aspect_ratio = self.__calculate_bin_attr( - data_df=self.data_df, - clf_name=self.clf_name, - bp_lst=self.bp_lst, - px_per_mm=self.px_per_mm, - img_width=self.width, - img_height=self.height, - bin_size=self.bin_size, - fps=self.fps, - ) - + check_valid_dataframe(df=self.data_df, required_fields=[self.clf_name] + self.bp_lst, valid_dtypes=Formats.NUMERIC_DTYPES.value) + bp_data = self.data_df[self.bp_lst].values.astype(np.int32) + clf_data = self.data_df[self.clf_name].values.astype(np.int32) + if len(np.unique(clf_data)) == 1: + raise
InvalidInputError(msg=f'Cannot plot heatmap for behavior {self.clf_name} in video {self.video_name}: the classifier returns the single value {np.unique(clf_data)[0]} for every frame, so there is nothing to plot.') + grid, aspect_ratio = GeometryMixin.bucket_img_into_grid_square(img_size=(self.width, self.height), bucket_grid_size_mm=self.bin_size, px_per_mm=self.px_per_mm, add_correction=False, verbose=False) + clf_data = GeometryMixin().cumsum_bool_geometries(data=bp_data, geometries=grid, bool_data=clf_data, fps=self.fps, verbose=False) if self.max_scale == "auto": - self.max_scale = self.__calculate_max_scale(clf_array=clf_array) - + self.max_scale = max(1, self.__calculate_max_scale(clf_array=clf_data)) if self.final_img_setting: - self.make_clf_heatmap_plot( - frm_data=clf_array[-1, :, :], - max_scale=self.max_scale, - palette=self.palette, - aspect_ratio=aspect_ratio, - file_name=os.path.join( - self.heatmap_clf_location_dir, - self.video_name + "_final_frm.png", - ), - shading=self.shading, - clf_name=self.clf_name, - img_size=(self.height, self.width), - final_img=True, - ) + file_name = os.path.join(self.heatmap_clf_location_dir, f"{self.video_name}_{self.clf_name}_final_frm.png") + self.make_location_heatmap_plot(frm_data=clf_data[-1:, :, :][0], + max_scale=self.max_scale, + palette=self.palette, + aspect_ratio=aspect_ratio, + file_name=file_name, + shading=self.shading, + img_size=(self.width, self.height)) + print(f"Final heatmap image saved at {file_name}.") if self.video_setting or self.frame_setting: - frame_arrays = np.array_split(clf_array, self.core_cnt) - last_frm_idx = 0 - for frm_group in range(len(frame_arrays)): - split_arr = frame_arrays[frm_group] - frame_arrays[frm_group] = self.__insert_group_idx_column( - data=split_arr, group=frm_group, last_frm_idx=last_frm_idx - ) - last_frm_idx = np.max( - frame_arrays[frm_group].reshape( - (frame_arrays[frm_group].shape[0], -1) - ) - ) - frm_per_core = frame_arrays[0].shape[0] - - print( - "Creating heatmaps, multiprocessing (chunksize: {}, cores: {})...".format( - str(self.multiprocess_chunksize), str(self.core_cnt) - ) - ) - with multiprocessing.Pool( - self.core_cnt, maxtasksperchild=self.maxtasksperchild - ) as pool: - constants = functools.partial( - _heatmap_multiprocessor, - video_setting=self.video_setting, - frame_setting=self.frame_setting, - style_attr=self.style_attr, - fps=self.fps, - video_temp_dir=self.temp_folder, - frame_dir=self.save_frame_folder_dir, - max_scale=self.max_scale, - aspect_ratio=aspect_ratio, - clf_name=self.clf_name, - size=(self.width, self.height), - video_name=self.video_name, - make_clf_heatmap_plot=self.make_clf_heatmap_plot, - ) - - for cnt, result in enumerate( - pool.imap( - constants, - frame_arrays, - chunksize=self.multiprocess_chunksize, - ) - ): - print( - "Image {}/{}, Video {}/{}...".format( - str(int(frm_per_core * (result + 1))), - str(len(self.data_df)), - str(file_cnt + 1), - str(len(self.files_found)), - ) - ) + frame_arrays = np.array_split(clf_data, self.core_cnt) + frm_per_core_w_batch = [] + frm_cnt = 0 + for batch_cnt in range(len(frame_arrays)): + frm_range = np.arange(frm_cnt, frm_cnt + frame_arrays[batch_cnt].shape[0]) + frm_cnt += len(frm_range) + frm_per_core_w_batch.append((batch_cnt, frm_range, frame_arrays[batch_cnt])) + del frame_arrays + print(f"Creating heatmaps, multiprocessing (chunksize: {self.multiprocess_chunksize}, cores: {self.core_cnt})...") + with multiprocessing.Pool(self.core_cnt, maxtasksperchild=self.maxtasksperchild) as pool: + constants = functools.partial(_heatmap_multiprocessor, + video_setting=self.video_setting, + frame_setting=self.frame_setting, + style_attr=self.style_attr, + fps=self.fps, + video_temp_dir=self.temp_folder, + frame_dir=self.frames_save_dir, + max_scale=self.max_scale, + aspect_ratio=aspect_ratio, + clf_name=self.clf_name, + size=(self.width, self.height), + video_name=self.video_name, + make_clf_heatmap_plot=self.make_clf_heatmap_plot) + + for cnt, batch in enumerate(pool.imap(constants, frm_per_core_w_batch, chunksize=self.multiprocess_chunksize)): + print(f'Core batch {batch+1}/{self.core_cnt} complete (video {self.video_name})...') pool.terminate() pool.join() if self.video_setting: - print("Joining {} multiprocessed video...".format(self.video_name)) - concatenate_videos_in_folder( - in_folder=self.temp_folder, save_path=self.save_video_path - ) + print(f"Joining {self.video_name} multiprocessed video...") + concatenate_videos_in_folder(in_folder=self.temp_folder, save_path=self.save_video_path) video_timer.stop_timer() - print( - "Heatmap video {} complete (elapsed time: {}s) ...".format( - self.video_name, video_timer.elapsed_time_str - ) - ) + print(f"Heatmap video {self.video_name} complete (elapsed time: {video_timer.elapsed_time_str}s)...") self.timer.stop_timer() - stdout_success( - msg="heatmap visualizations for {} videos created in project_folder/frames/output/heatmap_classifier locations directory", - elapsed_time=self.timer.elapsed_time_str, - ) - + stdout_success(msg=f"Heatmap visualizations for {len(self.data_paths)} video(s) created in {self.heatmap_clf_location_dir} directory", elapsed_time=self.timer.elapsed_time_str) + + +# if __name__ == "__main__": +# test = HeatMapperClfMultiprocess(config_path=r"C:\troubleshooting\RAT_NOR\project_folder\project_config.ini", +# style_attr = {'palette': 'jet', 'shading': 'gouraud', 'bin_size': 50, 'max_scale': 'auto'}, +# final_img_setting=True, +# video_setting=True, +# frame_setting=True, +# bodypart='Ear_left', +# clf_name='straub_tail', +# data_paths=[r"C:\troubleshooting\RAT_NOR\project_folder\csv\test\2022-06-20_NOB_DOT_4.csv"]) +# test.run() # test = HeatMapperClfMultiprocess(config_path='/Users/simon/Desktop/envs/troubleshooting/two_black_animals_14bp/project_folder/project_config.ini', # style_attr = {'palette': 'jet', 'shading': 'gouraud', 'bin_size': 100, 'max_scale': 'auto'}, diff --git a/simba/plotting/heat_mapper_location.py index 29d41d503..76146553c 100644 --- a/simba/plotting/heat_mapper_location.py +++ b/simba/plotting/heat_mapper_location.py @@ -85,82 +85,6 @@ def __init__(self, self.bp_lst = [f"{self.bp}_x", f"{self.bp}_y"] print(f"Processing heatmaps for {len(self.data_paths)} video(s)...") - @staticmethod - @jit(nopython=True) - def __calculate_cum_array_final_img(loc_array: np.array): - final_img = np.full((loc_array.shape[1], loc_array.shape[2]), 0) - for frm in range(loc_array.shape[0]): - for row in range(loc_array.shape[1]): - for col in range(loc_array.shape[2]): - final_img[row, col] += loc_array[frm,
row, col] - return final_img - - def __calculate_bin_attr( - self, - data_df: pd.DataFrame, - bp_lst: list, - px_per_mm: int, - img_width: int, - img_height: int, - bin_size: int, - fps: int, - ): - - bin_size_px = int(float(px_per_mm) * float(bin_size)) - horizontal_bin_cnt = int(img_width / bin_size_px) - vertical_bin_cnt = int(img_height / bin_size_px) - aspect_ratio = round((vertical_bin_cnt / horizontal_bin_cnt), 3) - - bp_data = data_df[bp_lst].to_numpy().astype(int) - - bin_dict = {} - x_location, y_location = 0, 0 - for hbin in range(horizontal_bin_cnt): - bin_dict[hbin] = {} - for vbin in range(vertical_bin_cnt): - bin_dict[hbin][vbin] = { - "top_left_x": x_location, - "top_left_y": y_location, - "bottom_right_x": x_location + bin_size_px, - "bottom_right_y": y_location + bin_size_px, - } - y_location += bin_size_px - y_location = 0 - x_location += bin_size_px - - location_array = np.zeros( - (bp_data.shape[0], vertical_bin_cnt, horizontal_bin_cnt) - ) - - for frm_cnt, frame in enumerate(bp_data): - for h_bin_name, v_dict in bin_dict.items(): - for v_bin_name, c in v_dict.items(): - if frame[0] <= c["bottom_right_x"] and frame[0] >= c["top_left_x"]: - if ( - frame[1] <= c["bottom_right_y"] - and frame[0] >= c["top_left_y"] - ): - location_array[frm_cnt][v_bin_name][h_bin_name] = 1 - - location_array = self.__calculate_cum_array(clf_array=location_array, fps=fps) - - return location_array, aspect_ratio - - @staticmethod - @jit(nopython=True) - def __calculate_cum_array(clf_array: np.array, fps: int): - cum_sum_arr = np.full(clf_array.shape, np.nan) - for frm_idx in prange(clf_array.shape[0]): - frame_cum_sum = np.full((clf_array.shape[1], clf_array.shape[2]), 0.0) - sliced_arr = clf_array[0:frm_idx] - for i in range(sliced_arr.shape[0]): - for j in range(sliced_arr.shape[1]): - for k in range(sliced_arr.shape[2]): - frame_cum_sum[j][k] += sliced_arr[i][j][k] - cum_sum_arr[frm_idx] = frame_cum_sum - - return cum_sum_arr / fps - def run(self): check_all_file_names_are_represented_in_video_log(video_info_df=self.video_info_df, data_paths=self.data_paths) for file_cnt, file_path in enumerate(self.data_paths): diff --git a/simba/plotting/heat_mapper_location_mp.py index 63611a4c0..696b97750 100644 --- a/simba/plotting/heat_mapper_location_mp.py +++ b/simba/plotting/heat_mapper_location_mp.py @@ -14,13 +14,11 @@ from simba.utils.checks import ( check_all_file_names_are_represented_in_video_log, check_file_exist_and_readable, check_float, check_if_keys_exist_in_dict, - check_int, check_valid_lst) + check_int, check_valid_lst, check_filepaths_in_iterable_exist) from simba.utils.enums import Defaults, Formats, TagNames from simba.utils.errors import NoSpecifiedOutputError from simba.utils.printing import SimbaTimer, log_event, stdout_success -from simba.utils.read_write import (concatenate_videos_in_folder, - find_core_cnt, get_fn_ext, read_df, - remove_a_folder) +from simba.utils.read_write import (concatenate_videos_in_folder, find_core_cnt, get_fn_ext, read_df, remove_a_folder) STYLE_PALETTE = 'palette' STYLE_SHADING = 'shading' @@ -83,13 +81,12 @@ class HeatMapperLocationMultiprocess(ConfigReader, PlottingMixin): :align: center :param str config_path: path to SimBA project config file in Configparser format - :param str bodypart: The name of the body-part used to infer the location of the animal. - :param int bin_size: The rectangular size of each heatmap location in millimeters.
For example, `50` will divide the video frames into 5 centimeter rectangular spatial bins. - :param str palette: Heatmap pallette. Eg. 'jet', 'magma', 'inferno','plasma', 'viridis', 'gnuplot2' - :param dict style_attr: Style attributes of heatmap {'palette': 'jet', 'shading': 'gouraud', 'bin_size': 100, 'max_scale': 'auto'} - :param bool final_img_setting: If True, create a single image representing the last frame of the input video + :param bool final_img_setting: If True, then create a single image representing the last frame of the input video :param bool video_setting: If True, then create a video of heatmaps. :param bool frame_setting: If True, then create individual heatmap frames. + :param str bodypart: The name of the body-part used to infer the location of the animal. + :param Dict style_attr: Dict containing settings for colormap, bin-size, max scale, and smoothing operations. For example: {'palette': 'jet', 'shading': 'gouraud', 'bin_size': 50, 'max_scale': 'auto'}. :param int core_cnt: The number of CPU cores to use. If -1, then all available cores. :example: @@ -114,6 +111,7 @@ def __init__(self, check_valid_lst(data=data_paths, valid_dtypes=(str,), min_len=1) check_if_keys_exist_in_dict(data=style_attr, key=STYLE_ATTR, name=f'{self.__class__.__name__} style_attr') check_int(name=f'{self.__class__.__name__} core_cnt', value=core_cnt, min_value=-1, max_value=find_core_cnt()[0]) + check_filepaths_in_iterable_exist(file_paths=data_paths, name=f'{self.__class__.__name__} data_paths') if core_cnt == -1: core_cnt = find_core_cnt()[0] self.core_cnt = core_cnt ConfigReader.__init__(self, config_path=config_path, create_logger=False) @@ -161,6 +159,9 @@ def run(self): self.save_video_path = os.path.join(self.heatmap_location_dir, f"{self.video_name}.mp4") self.data_df = read_df(file_path=file_path, file_type=self.file_type, usecols=self.bp_lst) + + + squares, aspect_ratio = GeometryMixin().bucket_img_into_grid_square(bucket_grid_size_mm=self.style_attr[STYLE_BIN_SIZE], img_size=(self.width, self.height), px_per_mm=self.px_per_mm) cum_sum_squares = GeometryMixin().cumsum_coord_geometries(data=self.data_df.values, fps=self.fps, geometries=squares) if self.style_attr[STYLE_MAX_SCALE] == "auto": diff --git a/simba/ui/pop_ups/heatmap_clf_pop_up.py index 91abbae1f..3a110cf6f 100644 --- a/simba/ui/pop_ups/heatmap_clf_pop_up.py +++ b/simba/ui/pop_ups/heatmap_clf_pop_up.py @@ -135,7 +135,7 @@ def __create_heatmap_plots(self, multiple_videos: bool): video_setting=self.heatmap_videos_var.get(), frame_setting=self.heatmap_frames_var.get(), bodypart=self.bp_dropdown.getChoices(), - files_found=data_paths, + data_paths=data_paths, clf_name=self.clf_dropdown.getChoices(), ) @@ -150,7 +150,7 @@ def __create_heatmap_plots(self, multiple_videos: bool): video_setting=self.heatmap_videos_var.get(), frame_setting=self.heatmap_frames_var.get(), bodypart=self.bp_dropdown.getChoices(), - files_found=data_paths, + data_paths=data_paths, clf_name=self.clf_dropdown.getChoices(), core_cnt=int(self.multiprocess_dropdown.getChoices()), ) diff --git a/simba/unsupervised/enums.py index c5a3a3a53..32939b630 100644 --- a/simba/unsupervised/enums.py +++ b/simba/unsupervised/enums.py @@ -40,6 +40,7 @@ class Unsupervised(Enum): DR_MODEL = "DR_MODEL" MODEL = "MODEL" MIN_DISTANCE = "min_distance" + EUCLIDEAN = "euclidean" FEATURE_NAMES = "FEATURE_NAMES" SPREAD = "spread" diff --git
a/simba/unsupervised/grid_search_visualizers.py b/simba/unsupervised/grid_search_visualizers.py index f97415739..dbd04606a 100644 --- a/simba/unsupervised/grid_search_visualizers.py +++ b/simba/unsupervised/grid_search_visualizers.py @@ -9,7 +9,7 @@ import pandas as pd from simba.mixins.plotting_mixin import PlottingMixin -from simba.mixins.unsupervised_mixin import UnsupervisedMixin +from simba.mixins.unsupervised_mixin import UMLMixin from simba.unsupervised.enums import Clustering, UMLOptions, Unsupervised from simba.utils.checks import (check_if_dir_exists, check_if_filepath_list_is_empty, @@ -21,7 +21,7 @@ from simba.utils.read_write import read_pickle -class GridSearchVisualizer(UnsupervisedMixin): +class GridSearchVisualizer(UMLMixin): """ Visualize grid-searched latent spaces in .png format. diff --git a/simba/utils/checks.py b/simba/utils/checks.py index ae44817bb..991faff46 100644 --- a/simba/utils/checks.py +++ b/simba/utils/checks.py @@ -795,10 +795,7 @@ def check_that_dir_has_list_of_filenames( files_in_dir = [os.path.basename(x) for x in files_in_dir] for file_name in file_name_lst: if os.path.basename(file_name) not in files_in_dir: - raise NoFilesFoundError( - msg=f"File name {os.path.basename(file_name)} could not be found in the directory {dir}", - source=check_that_dir_has_list_of_filenames.__name__, - ) + raise NoFilesFoundError(msg=f"File name {os.path.basename(file_name)} could not be found in the directory {dir}", source=check_that_dir_has_list_of_filenames.__name__) def check_valid_array(data: np.ndarray, @@ -1035,9 +1032,7 @@ def check_if_keys_exist_in_dict( return True -def check_that_directory_is_empty( - directory: Union[str, os.PathLike], raise_error: Optional[bool] = True -) -> None: +def check_that_directory_is_empty(directory: Union[str, os.PathLike], raise_error: Optional[bool] = True) -> None: """ Checks if a directory is empty. If the directory has content, then returns False or raises ``DirectoryNotEmptyError``. 
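For orientation, the checks.py hunks below add two validators: an `accepted_values` argument on `check_valid_tuple`, and a new `check_filepaths_in_iterable_exist` helper. A minimal usage sketch follows (illustrative only; the example values and file path are hypothetical, and both helpers raise rather than return on failure):

from simba.utils.checks import check_filepaths_in_iterable_exist, check_valid_tuple
from simba.utils.errors import InvalidInputError, NoFilesFoundError

# Passes silently: every tuple value is a member of accepted_values.
check_valid_tuple(x=('gouraud', 'flat'), source='shading', accepted_values=('gouraud', 'flat'))

try:
    # 'cubic' is not a member of accepted_values -> InvalidInputError.
    check_valid_tuple(x=('cubic',), source='shading', accepted_values=('gouraud', 'flat'))
except InvalidInputError as e:
    print(e)

try:
    # Empty iterables and non-existent paths -> NoFilesFoundError.
    check_filepaths_in_iterable_exist(file_paths=['/not/a/real/file.csv'], name='data_paths')
except NoFilesFoundError as e:
    print(e)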
@@ -1268,7 +1263,8 @@ def check_valid_tuple(x: tuple, source: Optional[str] = "", accepted_lengths: Optional[Tuple[int]] = None, valid_dtypes: Optional[Tuple[Any]] = None, - minimum_length: Optional[int] = None): + minimum_length: Optional[int] = None, + accepted_values: Optional[Iterable[Any]] = None): if not isinstance(x, (tuple)): raise InvalidInputError( @@ -1293,6 +1289,11 @@ def check_valid_tuple(x: tuple, if tuple_len < minimum_length: raise InvalidInputError(msg=f"The tuple {source} is shorter ({tuple_len}) than the minimum required length ({minimum_length}).", source=source) + if accepted_values is not None: + check_instance(source=f'{check_valid_tuple.__name__} accepted_values', accepted_types=(list, tuple,), instance=accepted_values) + for i in x: + if i not in accepted_values: + raise InvalidInputError(msg=f"The tuple {source} contains a value that is not accepted: {i} (accepted: {accepted_values}).", source=source) @@ -1411,7 +1412,7 @@ def check_valid_dict(x: dict, valid_values_dtypes: Optional[Tuple[Any]] = None, max_len_keys: Optional[int] = None, min_len_keys: Optional[int] = None, - required_keys: Optional[Tuple[Any]] = None): + required_keys: Optional[Tuple[Any, ...]] = None): check_instance(source=check_valid_dict.__name__, instance=x, accepted_types=(dict,)) @@ -1464,4 +1465,16 @@ def is_video_color(video: Union[str, os.PathLike, cv2.VideoCapture]) -> bool: else: return False else: - return False \ No newline at end of file + return False + + +def check_filepaths_in_iterable_exist(file_paths: Iterable[str], + name: Optional[str] = None): + + check_instance(source=f'{check_filepaths_in_iterable_exist.__name__} file_paths {name}', instance=file_paths, accepted_types=(list, tuple,)) + if len(file_paths) == 0: + raise NoFilesFoundError(msg=f'{name} {file_paths} is empty') + for file_path in file_paths: + check_str(name=f'{check_filepaths_in_iterable_exist.__name__} {file_path} {name}', value=file_path) + if not os.path.isfile(file_path): + raise NoFilesFoundError(msg=f'{name} {file_path} is not a valid file path') diff --git a/simba/utils/data.py index dc42d106f..d290c808e 100644 --- a/simba/utils/data.py +++ b/simba/utils/data.py @@ -39,7 +39,7 @@ from simba.utils.enums import ConfigKey, Dtypes, Formats, Keys, Options from simba.utils.errors import (BodypartColumnNotFoundError, CountError, InvalidFileTypeError, InvalidInputError, - NoFilesFoundError) + NoFilesFoundError, SimBAModuleNotFoundError) from simba.utils.printing import stdout_success, stdout_warning from simba.utils.read_write import (find_video_of_file, get_fn_ext, get_video_meta_data, read_config_entry, @@ -1394,7 +1394,8 @@ def df_smoother(data: pd.DataFrame, return data.clip(lower=0) -def get_library_version(library_name: str) -> str: +def get_library_version(library_name: str, + raise_error: bool = False) -> Union[str, bool]: """ Get the version of an installed package in the python environment.
@@ -1410,7 +1411,11 @@ def get_library_version(library_name: str) -> str: lib = importlib.import_module(library_name) return getattr(lib, '__version__', 'Version information not available') except ImportError: - return f'Library "{library_name}" is not installed' + if not raise_error: + return False + else: + raise SimBAModuleNotFoundError(msg=f'The library {library_name} could not be found', source=get_library_version.__name__) + diff --git a/simba/utils/enums.py b/simba/utils/enums.py index 1825db9c7..bc1775553 100644 --- a/simba/utils/enums.py +++ b/simba/utils/enums.py @@ -496,15 +496,14 @@ class Methods(Enum): WARNING = "WARNING" ERROR = "ERROR" ANOVA = "ANOVA" + AGG_METHODS = ('mean', 'median') INVALID_THIRD_PARTY_APPENDER_FILE = "INVALID annotations file data format" ADDITIONAL_THIRD_PARTY_CLFS = "ADDITIONAL third-party behavior detected" ZERO_THIRD_PARTY_VIDEO_ANNOTATIONS = "ZERO third-party video annotations found" THIRD_PARTY_FPS_CONFLICT = "Annotations and pose FPS conflict" THIRD_PARTY_EVENT_COUNT_CONFLICT = "Annotations EVENT COUNT conflict" THIRD_PARTY_EVENT_OVERLAP = "Annotations OVERLAP inaccuracy" - ZERO_THIRD_PARTY_VIDEO_BEHAVIOR_ANNOTATIONS = ( - "ZERO third-party video behavior annotations found" - ) + ZERO_THIRD_PARTY_VIDEO_BEHAVIOR_ANNOTATIONS = ("ZERO third-party video behavior annotations found") THIRD_PARTY_FRAME_COUNT_CONFLICT = "Annotations and pose FRAME COUNT conflict" THIRD_PARTY_ANNOTATION_FILE_NOT_FOUND = "Annotations data file NOT FOUND" @@ -668,3 +667,74 @@ class MLParamKeys(Enum): class TestPaths(Enum): CRITICAL_VALUES = "../simba/assets/lookups/critical_values_05.pickle" + + + + +class UML(Enum): + FIT_KEYS = ("n_neighbors", "min_distance", "spread") + ALL_FEATURES_EX_POSE = "ALL FEATURES (EXCLUDING POSE)" + DATA_SLICE_SELECTION = "data_slice" + CLF_SLICE_SELECTION = "clf_slice" + ALL_FEATURES_EXCLUDING_POSE = "ALL FEATURES (EXCLUDING POSE)" + ALL_FEATURES_INCLUDING_POSE = "ALL FEATURES (INCLUDING POSE)" + USER_DEFINED_SET = "USER-DEFINED FEATURE SET" + NAMES = "NAMES" + START_FRAME = "START_FRAME" + END_FRAME = "END_FRAME" + CLASSIFIER = "CLASSIFIER" + PROBABILITY = "PROBABILITY" + FRAME = "FRAME" + VIDEO = "VIDEO" + FEATURE_PATH = "feature_path" + BOUT_AGGREGATION_TYPE = "bout_aggregation_type" + MIN_BOUT_LENGTH = "min_bout_length" + N_NEIGHBORS = "n_neighbors" + HASHED_NAME = "HASH" + DATA = "DATA" + RAW = "RAW" + UMAP = "UMAP" + HDBSCAN = "HDBSCAN" + TSNE = "TSNE" + SCALER_TYPE = "SCALER_TYPE" + CSV = "CSV" + MULTICOLLINEARITY = "multicollinearity" + COLLINEAR_FIELDS = 'COLLINEAR_FIELDS' + VARIANCE_THRESHOLD = 'VARIANCE_THRESHOLD' + MULTICOLLINEARITY_THRESHOLD = 'MULTICOLLINEARITY_THRESHOLD' + FORMAT = "format" + SCALED_DATA = "SCALED_DATA" + PARAMETERS = "PARAMETERS" + METHODS = "METHODS" + DR_MODEL = "DR_MODEL" + MODEL = "MODEL" + CLUSTER_MODEL = 'CLUSTER_MODEL' + MIN_DISTANCE = "min_distance" + EUCLIDEAN = "euclidean" + FEATURE_NAMES = "FEATURE_NAMES" + SPREAD = "spread" + SCALER = "scaler" + SCALED = "scaled" + VARIANCE = "variance" + HYPERPARAMETERS = [N_NEIGHBORS, MIN_DISTANCE, SPREAD, SCALER, VARIANCE] + FRAME_FEATURES = "FRAME_FEATURES" + FRAME_POSE = "FRAME_POSE" + FRAME_TARGETS = "FRAME_TARGETS" + BOUTS_FEATURES = "BOUTS_FEATURES" + BOUTS_TARGETS = "BOUTS_TARGETS" + DATASET_DATA_FIELDS = [ + FRAME_FEATURES, + FRAME_POSE, + FRAME_TARGETS, + BOUTS_FEATURES, + BOUTS_TARGETS, + ] + MIN_MAX = "MIN-MAX" + STANDARD = "STANDARD" + QUANTILE = "QUANTILE" + LOW_VARIANCE_FIELDS = "LOW_VARIANCE_FIELDS" + ALPHA = "alpha" + MIN_CLUSTER_SIZE = 
"min_cluster_size" + MIN_SAMPLES = "min_samples" + EPSILON = "cluster_selection_epsilon" + diff --git a/simba/utils/lookups.py b/simba/utils/lookups.py index 55ede9235..a863133a5 100644 --- a/simba/utils/lookups.py +++ b/simba/utils/lookups.py @@ -18,7 +18,7 @@ import simba from simba.utils.checks import (check_file_exist_and_readable, check_if_dir_exists) -from simba.utils.enums import OS, FontPaths, Methods, Paths +from simba.utils.enums import OS, FontPaths, Methods, Paths, UML from simba.utils.read_write import get_fn_ext from simba.utils.warnings import NoDataFoundWarning @@ -663,6 +663,10 @@ def get_log_config(): } +def get_model_names(): + model_names_dir = os.path.join(os.path.dirname(simba.__file__), Paths.UNSUPERVISED_MODEL_NAMES.value) + return list(pd.read_parquet(model_names_dir)[UML.NAMES.value]) + # # def rao_spacing_critical_values(): # {4.0: 186.45, diff --git a/simba/utils/read_write.py b/simba/utils/read_write.py index 3ee989333..1e47ace84 100644 --- a/simba/utils/read_write.py +++ b/simba/utils/read_write.py @@ -1856,16 +1856,8 @@ def drop_df_fields(data: pd.DataFrame, fields: List[str], raise_error: Optional[ :return pd.DataFrame """ - check_instance( - source=drop_df_fields.__name__, instance=data, accepted_types=(pd.DataFrame,) - ) - check_valid_lst( - data=fields, - source=drop_df_fields.__name__, - valid_dtypes=(str,), - min_len=1, - raise_error=raise_error, - ) + check_instance( source=drop_df_fields.__name__, instance=data, accepted_types=(pd.DataFrame,)) + check_valid_lst(data=fields, source=drop_df_fields.__name__, valid_dtypes=(str,), min_len=1, raise_error=raise_error) if raise_error: return data.drop(columns=fields, errors="raise") else: