Skip to content

Commit

Permalink
Merge branch 'aimclub:master' into master
Browse files Browse the repository at this point in the history
  • Loading branch information
jrzkaminski authored Apr 27, 2024
2 parents 7001ddd + 75b370f commit 191387a
Show file tree
Hide file tree
Showing 2 changed files with 90 additions and 134 deletions.
111 changes: 90 additions & 21 deletions bamt/networks/big_brave_bn.py
Original file line number Diff line number Diff line change
@@ -1,36 +1,105 @@
from bamt.utils.MathUtils import get_brave_matrix, get_proximity_matrix
import math

import numpy as np
import pandas as pd
from sklearn.metrics import mutual_info_score
from sklearn.preprocessing import OrdinalEncoder


class BigBraveBN:
    """Selects candidate edges for structure learning using Brave coefficients.

    The workflow is: build a pairwise proximity matrix for the columns of a
    DataFrame, derive for every column the group of its nearest columns,
    compute a Brave coefficient for every ordered pair of columns from those
    groups, and keep the pairs whose coefficient exceeds a fraction of the
    largest coefficient.
    """

    def __init__(self):
        # List of (column, column) tuples filled by set_possible_edges_by_brave.
        self.possible_edges = []

    def set_possible_edges_by_brave(
        self,
        df: pd.DataFrame,
        n_nearest: int = 5,
        threshold: float = 0.3,
        proximity_metric: str = "MI",
    ) -> list:
        """Compute the list of possible edges and store it on the instance.

        Args:
            df (pd.DataFrame): Data. It is not modified.
            n_nearest (int): Number of nearest neighbors to consider. Default is 5.
            threshold (float): Fraction of the largest Brave coefficient that a
                pair must strictly exceed to be kept. Default is 0.3.
            proximity_metric (str): Metric used to calculate proximity,
                "MI" or "pearson" ("corr" is accepted as a legacy alias of
                "pearson"). Default is "MI".

        Returns:
            list: The possible edges as ``(row_label, column_label)`` tuples in
            row-major order; the same list is stored in ``self.possible_edges``.

        Raises:
            ValueError: If ``proximity_metric`` is not supported.
        """
        # _get_proximity_matrix works on its own copy, so df stays untouched
        # without an additional deep copy here.
        proximity_matrix = self._get_proximity_matrix(df, proximity_metric)
        brave_matrix = self._get_brave_matrix(df.columns, proximity_matrix, n_nearest)

        threshold_value = brave_matrix.max(numeric_only=True).max() * threshold

        # Collect the qualifying cells by position instead of masking and
        # calling DataFrame.stack(): whether stack() drops the NaN cells
        # produced by masking depends on the pandas version in use.
        above_threshold = (brave_matrix > threshold_value).to_numpy()
        row_positions, column_positions = np.nonzero(above_threshold)
        row_labels = brave_matrix.index.tolist()
        column_labels = brave_matrix.columns.tolist()
        self.possible_edges = [
            (row_labels[i], column_labels[j])
            for i, j in zip(row_positions, column_positions)
        ]
        return self.possible_edges

    @staticmethod
    def _get_n_nearest(
        data: pd.DataFrame, columns: list, corr: bool = False, number_close: int = 5
    ) -> list:
        """Returns N nearest neighbors for every column of dataframe.

        Args:
            data (pd.DataFrame): Proximity matrix.
            columns (list): Column labels of ``data`` to process.
            corr (bool): If True, larger values are treated as closer
                (descending sort); otherwise smaller values are closer.
            number_close (int): Number of nearest neighbors. Default is 5.

        Returns:
            list: One list of index labels per column, each holding the first
            ``number_close + 1`` labels of the sorted column.
        """
        groups = []
        for c in columns:
            close_ind = data[c].sort_values(ascending=not corr).index.tolist()
            groups.append(close_ind[: number_close + 1])
        return groups

    @staticmethod
    def _get_proximity_matrix(df: pd.DataFrame, proximity_metric: str) -> pd.DataFrame:
        """Returns matrix of proximity for the dataframe.

        Args:
            df (pd.DataFrame): Data. Columns of dtype "category" or "object"
                are ordinal-encoded on a copy before the metric is computed.
            proximity_metric (str): "MI" for pairwise mutual information,
                "pearson" (or the legacy alias "corr") for Pearson correlation.

        Returns:
            pd.DataFrame: Square matrix indexed and labelled by ``df.columns``.

        Raises:
            ValueError: If ``proximity_metric`` is not supported.
        """
        # Fail early: otherwise an unknown metric would fall through and
        # return None, which only surfaces later as an obscure error.
        if proximity_metric not in ("MI", "pearson", "corr"):
            raise ValueError(
                f"Unsupported proximity_metric {proximity_metric!r}; "
                "expected 'MI' or 'pearson'."
            )

        df_coded = df.copy()
        columns_to_encode = list(df_coded.select_dtypes(include=["category", "object"]))
        # Only run the encoder when there is something to encode.
        if columns_to_encode:
            encoder = OrdinalEncoder()
            df_coded[columns_to_encode] = encoder.fit_transform(
                df_coded[columns_to_encode]
            )

        if proximity_metric == "MI":
            df_distance = pd.DataFrame(
                np.zeros((len(df.columns), len(df.columns))),
                columns=df.columns,
                index=df.columns,
            )
            for c1 in df.columns:
                for c2 in df.columns:
                    dist = mutual_info_score(df_coded[c1].values, df_coded[c2].values)
                    df_distance.loc[c1, c2] = dist
            return df_distance

        return df_coded.corr(method="pearson")

    def _get_brave_matrix(
        self, df_columns: pd.Index, proximity_matrix: pd.DataFrame, n_nearest: int = 5
    ) -> pd.DataFrame:
        """Returns matrix of Brave coefficients for the DataFrame.

        Args:
            df_columns (pd.Index): Column labels of the data.
            proximity_matrix (pd.DataFrame): Pairwise proximity of the columns.
            n_nearest (int): Number of nearest neighbors per group. Default is 5.

        Returns:
            pd.DataFrame: Square matrix of Brave coefficients; the diagonal is 0.
        """
        brave_matrix = pd.DataFrame(
            np.zeros((len(df_columns), len(df_columns))),
            columns=df_columns,
            index=df_columns,
        )
        # Sets make the repeated membership tests below constant-time.
        groups = [
            set(group)
            for group in self._get_n_nearest(
                proximity_matrix, df_columns.tolist(), corr=True, number_close=n_nearest
            )
        ]
        n_groups = len(groups)

        for c1 in df_columns:
            for c2 in df_columns:
                # a: groups with both columns, b: only c1, c: only c2, d: neither.
                a = b = c = d = 0.0
                if c1 != c2:
                    for g in groups:
                        has_c1 = c1 in g
                        has_c2 = c2 in g
                        if has_c1 and has_c2:
                            a += 1
                        elif has_c1:
                            b += 1
                        elif has_c2:
                            c += 1
                        else:
                            d += 1

                divisor = (math.sqrt((a + c) * (b + d))) * (
                    math.sqrt((a + b) * (c + d))
                )
                # A zero divisor is replaced by a tiny constant to avoid
                # division by zero.
                br = (a * n_groups + (a + c) * (a + b)) / (
                    divisor if divisor != 0 else 0.0000000001
                )
                brave_matrix.loc[c1, c2] = br

        return brave_matrix
113 changes: 0 additions & 113 deletions bamt/utils/MathUtils.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,9 @@
import math

import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats.distributions import chi2
from sklearn.metrics import mutual_info_score
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import OrdinalEncoder


def lrts_comp(data):
Expand Down Expand Up @@ -125,116 +122,6 @@ def component(data, columns, method):
return n


def get_n_nearest(data, columns, corr=False, number_close=5):
    """Collect, for every requested column, the labels of its closest entries.

    Args:
        data (DataFrame): Proximity matrix.
        columns (list): Column labels to process, e.g. df.columns.tolist().
        corr (bool, optional): When true the column is sorted in descending
            order (largest values count as closest); otherwise ascending.
            Defaults to False.
        number_close (int, optional): Number of nearest neighbors. Defaults to 5.

    Returns:
        list: One list per column with the first ``number_close + 1`` index
        labels of the sorted column.
    """
    keep = number_close + 1
    return [
        data[name].sort_values(ascending=not corr).index.tolist()[:keep]
        for name in columns
    ]


def get_proximity_matrix(df, proximity_metric) -> pd.DataFrame:
    """Return the pairwise proximity matrix of the dataframe's columns.

    Columns of dtype "category" or "object" are ordinal-encoded on an internal
    copy before the metric is computed, so the caller's dataframe is left
    unchanged.

    Args:
        df (DataFrame): data
        proximity_metric (str): 'MI' for mutual information, or 'corr'
            ('pearson' is accepted as an alias) for Pearson correlation.

    Returns:
        DataFrame: square matrix indexed and labelled by ``df.columns``.

    Raises:
        ValueError: if ``proximity_metric`` is not one of the supported names.
    """
    # Reject unknown metrics up front instead of silently returning an
    # all-zero matrix that would poison every downstream computation.
    if proximity_metric not in ("MI", "corr", "pearson"):
        raise ValueError(
            f"Unsupported proximity_metric {proximity_metric!r}; "
            "expected 'MI' or 'corr'."
        )

    # Work on a copy: assigning encoded values must not alter the caller's data.
    df_coded = df.copy()
    columns_to_encode = list(df_coded.select_dtypes(include=["category", "object"]))
    # Only run the encoder when there is something to encode.
    if columns_to_encode:
        encoder = OrdinalEncoder()
        df_coded[columns_to_encode] = encoder.fit_transform(
            df_coded[columns_to_encode]
        )

    if proximity_metric == "MI":
        df_distance = pd.DataFrame(
            data=np.zeros((len(df.columns), len(df.columns))),
            columns=df.columns,
            index=df.columns,
        )
        for c1 in df.columns:
            for c2 in df.columns:
                dist = mutual_info_score(df_coded[c1].values, df_coded[c2].values)
                df_distance.loc[c1, c2] = dist
        return df_distance

    return df_coded.corr(method="pearson")


def get_brave_matrix(df_columns, proximity_matrix, n_nearest=5) -> pd.DataFrame:
    """Build the matrix of Brave coefficients for every ordered pair of columns.

    Args:
        df_columns (Index): data.columns
        proximity_matrix (DataFrame): pairwise proximity of the columns, passed
            on to get_n_nearest() to form one neighbor group per column.
        n_nearest (int, optional): Number of nearest neighbors per group.
            Defaults to 5.

    Returns:
        brave_matrix: DataFrame of Brave coefficients, indexed and labelled by
        ``df_columns``.
    """
    size = len(df_columns)
    result = pd.DataFrame(data=np.zeros((size, size)), columns=df_columns)
    result.index = df_columns

    neighbourhoods = get_n_nearest(
        proximity_matrix, df_columns.tolist(), corr=True, number_close=n_nearest
    )
    total = len(neighbourhoods)

    for first in df_columns:
        for second in df_columns:
            # Counts of groups containing both labels, only the first,
            # only the second, or neither; all stay zero on the diagonal.
            both = only_first = only_second = neither = 0.0
            if first != second:
                for members in neighbourhoods:
                    has_first = first in members
                    has_second = second in members
                    if has_first and has_second:
                        both += 1
                    elif has_first:
                        only_first += 1
                    elif has_second:
                        only_second += 1
                    else:
                        neither += 1

            numerator = both * total + (both + only_second) * (both + only_first)
            left = (both + only_second) * (only_first + neither)
            right = (both + only_first) * (only_second + neither)
            if left != 0 and right != 0:
                denominator = (math.sqrt(left)) * (math.sqrt(right))
            else:
                # Tiny substitute divisor avoids a division by zero.
                denominator = 0.0000000001
            result.loc[first, second] = numerator / denominator

    return result


def _child_dict(net: list):
res_dict = dict()
for e0, e1 in net:
Expand Down

0 comments on commit 191387a

Please sign in to comment.