From 75b370f01eb4533693324aabd5989f4396129a26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jerzy=20Kami=C5=84ski?= <86363785+jrzkaminski@users.noreply.github.com> Date: Fri, 29 Mar 2024 14:48:00 +0300 Subject: [PATCH] Improve overall user experience of BigBraveBN algorithm (#103) * Update big_brave_bn.py * Update MathUtils.py * Update docstring * Update big_brave_bn.py * fix dataframe mutation --- bamt/networks/big_brave_bn.py | 111 ++++++++++++++++++++++++++------- bamt/utils/MathUtils.py | 113 ---------------------------------- 2 files changed, 90 insertions(+), 134 deletions(-) diff --git a/bamt/networks/big_brave_bn.py b/bamt/networks/big_brave_bn.py index b742623..e4adc9a 100644 --- a/bamt/networks/big_brave_bn.py +++ b/bamt/networks/big_brave_bn.py @@ -1,36 +1,105 @@ -from bamt.utils.MathUtils import get_brave_matrix, get_proximity_matrix +import math + +import numpy as np +import pandas as pd +from sklearn.metrics import mutual_info_score +from sklearn.preprocessing import OrdinalEncoder class BigBraveBN: - def __init__(self, n_nearest=5, threshold=0.3, proximity_metric="MI"): - self.n_nearest = n_nearest - self.threshold = threshold - self.proximity_metric = proximity_metric + def __init__(self): self.possible_edges = [] - def set_possible_edges_by_brave(self, df): - """Returns list of possible edges for structure learning + def set_possible_edges_by_brave( + self, + df: pd.DataFrame, + n_nearest: int = 5, + threshold: float = 0.3, + proximity_metric: str = "MI", + ) -> list: + """Returns list of possible edges for structure learning and sets it into attribute Args: - df (DataFrame): data + df (pd.DataFrame): Data. + n_nearest (int): Number of nearest neighbors to consider. Default is 5. + threshold (float): Threshold for selecting edges. Default is 0.3. + proximity_metric (str): Metric used to calculate proximity. Default is "MI". Returns: - Possible edges: list of possible edges + None: Modifies the object's possible_edges attribute. """ + df_copy = df.copy(deep=True) + proximity_matrix = self._get_proximity_matrix(df_copy, proximity_metric) + brave_matrix = self._get_brave_matrix(df_copy.columns, proximity_matrix, n_nearest) + + threshold_value = brave_matrix.max(numeric_only=True).max() * threshold + filtered_brave_matrix = brave_matrix[brave_matrix > threshold_value].stack() + self.possible_edges = filtered_brave_matrix.index.tolist() + return self.possible_edges + + @staticmethod + def _get_n_nearest( + data: pd.DataFrame, columns: list, corr: bool = False, number_close: int = 5 + ) -> list: + """Returns N nearest neighbors for every column of dataframe.""" + groups = [] + for c in columns: + close_ind = data[c].sort_values(ascending=not corr).index.tolist() + groups.append(close_ind[: number_close + 1]) + return groups + + @staticmethod + def _get_proximity_matrix(df: pd.DataFrame, proximity_metric: str) -> pd.DataFrame: + """Returns matrix of proximity for the dataframe.""" + encoder = OrdinalEncoder() + df_coded = df.copy() + columns_to_encode = list(df_coded.select_dtypes(include=["category", "object"])) + df_coded[columns_to_encode] = encoder.fit_transform(df_coded[columns_to_encode]) - proximity_matrix = get_proximity_matrix( - df, proximity_metric=self.proximity_metric + if proximity_metric == "MI": + df_distance = pd.DataFrame( + np.zeros((len(df.columns), len(df.columns))), + columns=df.columns, + index=df.columns, + ) + for c1 in df.columns: + for c2 in df.columns: + dist = mutual_info_score(df_coded[c1].values, df_coded[c2].values) + df_distance.loc[c1, c2] = dist + return df_distance + + elif proximity_metric == "pearson": + return df_coded.corr(method="pearson") + + def _get_brave_matrix( + self, df_columns: pd.Index, proximity_matrix: pd.DataFrame, n_nearest: int = 5 + ) -> pd.DataFrame: + """Returns matrix of Brave coefficients for the DataFrame.""" + brave_matrix = pd.DataFrame( + np.zeros((len(df_columns), len(df_columns))), + columns=df_columns, + index=df_columns, + ) + groups = self._get_n_nearest( + proximity_matrix, df_columns.tolist(), corr=True, number_close=n_nearest ) - brave_matrix = get_brave_matrix(df.columns, proximity_matrix, self.n_nearest) - possible_edges_list = [] + for c1 in df_columns: + for c2 in df_columns: + a = b = c = d = 0.0 + if c1 != c2: + for g in groups: + a += (c1 in g) & (c2 in g) + b += (c1 in g) & (c2 not in g) + c += (c1 not in g) & (c2 in g) + d += (c1 not in g) & (c2 not in g) - for c1 in df.columns: - for c2 in df.columns: - if ( - brave_matrix.loc[c1, c2] - > brave_matrix.max(numeric_only=True).max() * self.threshold - ): - possible_edges_list.append((c1, c2)) + divisor = (math.sqrt((a + c) * (b + d))) * ( + math.sqrt((a + b) * (c + d)) + ) + br = (a * len(groups) + (a + c) * (a + b)) / ( + divisor if divisor != 0 else 0.0000000001 + ) + brave_matrix.loc[c1, c2] = br - self.possible_edges = possible_edges_list + return brave_matrix diff --git a/bamt/utils/MathUtils.py b/bamt/utils/MathUtils.py index 82ef1c3..ed08105 100644 --- a/bamt/utils/MathUtils.py +++ b/bamt/utils/MathUtils.py @@ -1,12 +1,9 @@ import math import numpy as np -import pandas as pd from scipy import stats from scipy.stats.distributions import chi2 -from sklearn.metrics import mutual_info_score from sklearn.mixture import GaussianMixture -from sklearn.preprocessing import OrdinalEncoder def lrts_comp(data): @@ -125,116 +122,6 @@ def component(data, columns, method): return n -def get_n_nearest(data, columns, corr=False, number_close=5): - """Returns N nearest neighbors for every column of dataframe, added into list - - Args: - data (DataFrame): Proximity matrix - columns (list): df.columns.tolist() - corr (bool, optional): _description_. Defaults to False. - number_close (int, optional): Number of nearest neighbors. Defaults to 5. - - Returns: - groups - """ - groups = [] - for c in columns: - if corr: - close_ind = data[c].sort_values(ascending=False).index.tolist() - else: - close_ind = data[c].sort_values().index.tolist() - groups.append(close_ind[0 : number_close + 1]) - - return groups - - -def get_proximity_matrix(df, proximity_metric) -> pd.DataFrame: - """Returns matrix of proximity matrix of the dataframe, dataframe must be coded first if it contains - categorical data - - Args: - df (DataFrame): data - proximity_metric (str): 'MI' or 'corr' - - Returns: - df_distance: mutual information matrix - """ - - encoder = OrdinalEncoder() - df_coded = df - columnsToEncode = list(df_coded.select_dtypes(include=["category", "object"])) - - df_coded[columnsToEncode] = encoder.fit_transform(df_coded[columnsToEncode]) - - df_distance = pd.DataFrame( - data=np.zeros((len(df.columns), len(df.columns))), columns=df.columns - ) - df_distance.index = df.columns - - if proximity_metric == "MI": - for c1 in df.columns: - for c2 in df.columns: - dist = mutual_info_score(df_coded[c1].values, df_coded[c2].values) - df_distance.loc[c1, c2] = dist - - elif proximity_metric == "corr": - df_distance = df_coded.corr(method="pearson") - - return df_distance - - -def get_brave_matrix(df_columns, proximity_matrix, n_nearest=5) -> pd.DataFrame: - """Returns matrix Brave coeffitients of the DataFrame, requires proximity measure to be calculated - - Args: - df_columns (DataFrame): data.columns - proximity_matrix (DataFrame): may be generated by get_mutual_info_score_matrix() function or - correlation from scipy - n_nearest (int, optional): _description_. Defaults to 5. - - Returns: - brave_matrix: DataFrame of Brave coefficients - """ - - brave_matrix = pd.DataFrame( - data=np.zeros((len(df_columns), len(df_columns))), columns=df_columns - ) - brave_matrix.index = df_columns - - groups = get_n_nearest( - proximity_matrix, df_columns.tolist(), corr=True, number_close=n_nearest - ) - - counter_zeroer = 0.0 - - for c1 in df_columns: - for c2 in df_columns: - a = counter_zeroer - b = counter_zeroer - c = counter_zeroer - d = counter_zeroer - if c1 != c2: - for g in groups: - if (c1 in g) & (c2 in g): - a += 1 - if (c1 in g) & (c2 not in g): - b += 1 - if (c1 not in g) & (c2 in g): - c += 1 - if (c1 not in g) & (c2 not in g): - d += 1 - - if (a + c) * (b + d) != 0 and (a + b) * (c + d) != 0: - br = (a * len(groups) + (a + c) * (a + b)) / ( - (math.sqrt((a + c) * (b + d))) * (math.sqrt((a + b) * (c + d))) - ) - else: - br = (a * len(groups) + (a + c) * (a + b)) / 0.0000000001 - brave_matrix.loc[c1, c2] = br - - return brave_matrix - - def _child_dict(net: list): res_dict = dict() for e0, e1 in net: