-
Notifications
You must be signed in to change notification settings - Fork 0
/
ks_test.py
88 lines (69 loc) · 3.62 KB
/
ks_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import pandas as pd
from utilsMsc.MeLogSingle import MeLogger
from utilsMsc.MyUtils import MyPipeline
from utilsMsc.MyFairness import Fairness
import os
from sdmetrics.single_column import KSComplement, CSTest
def pipeline_data_distribution_tests(
    model_impt: str,
    md_mechanism: str,
    missing_rates=(10, 20, 40, 60),
):
    """
    Compare imputed datasets against their originals, column by column.

    For every original dataset and every missing rate, the matching imputed
    CSV is read from ``./DatasetsImputados/<md_mechanism>/`` and each column
    is scored with ``KSComplement`` (columns not reported as binary) or
    ``CSTest`` (columns reported as binary by
    ``MyPipeline.get_binary_features``). The scores are written to two Excel
    files in ``./DatasetsImputados``: ``ks_test_<model>_<mechanism>.xlsx`` and
    ``cs_test_<model>_<mechanism>.xlsx``.

    Args:
        model_impt: Name of the imputation model; part of the CSV file names
            that are read and of the Excel file names that are written.
        md_mechanism: Name of the missing-data mechanism; selects the
            sub-folder with the imputed CSVs and is part of the output names.
        missing_rates: Missing rates to evaluate; each one selects the file
            ``<dataset>_<model_impt>_md<rate>.csv``.

    Returns:
        Whatever ``MeLogger.info`` returns (presumably ``None`` -- confirm
        against the logger implementation).
    """
    _logger = MeLogger()

    # Folder holding the imputed datasets for this mechanism.
    imputed_dir = os.path.join("./DatasetsImputados", md_mechanism)

    # Load the original datasets.
    datasets = MyPipeline.carrega_datasets("./DatasetsFairness")
    fairness = Fairness()
    tabela_resultados = fairness.cria_tabela_fairness(datasets)

    # One row per (dataset, missing rate, column); explicit column lists keep
    # the spreadsheet headers present even when no row was collected.
    ks_columns = ["Dataset", "Missing rate", "Column", "ks_stat"]
    cs_columns = ["Dataset", "Missing rate", "Column", "cs_stat"]
    ks_rows = []
    cs_rows = []

    for dados, name in zip(tabela_resultados['datasets'], tabela_resultados['nome_datasets']):
        df_original = dados.copy()
        binary_features = MyPipeline.get_binary_features(data=df_original)

        # Best effort per dataset: any failure (e.g. a missing CSV) is logged
        # and the remaining missing rates of this dataset are skipped.
        try:
            for missing_rate in missing_rates:
                # os.path.join keeps the path valid on non-Windows systems,
                # unlike a hard-coded backslash separator.
                df_path = os.path.join(
                    imputed_dir, f"{name}_{model_impt}_md{missing_rate}.csv"
                )
                df_imputed = pd.read_csv(df_path)

                # KSComplement for non-binary columns, CSTest for binary ones.
                for col in df_original.columns:
                    real_values = df_original[col].values
                    imputed_values = df_imputed[col].values

                    if col not in binary_features:
                        test_stat = KSComplement.compute(real_values, imputed_values)
                        ks_rows.append({
                            "Dataset": name,
                            "Missing rate": missing_rate,
                            "Column": col,
                            "ks_stat": test_stat,
                        })
                    else:
                        test_stat = CSTest.compute(real_values, imputed_values)
                        cs_rows.append({
                            "Dataset": name,
                            "Missing rate": missing_rate,
                            "Column": col,
                            "cs_stat": test_stat,
                        })
        except Exception as erro:
            _logger.debug(f"Erro: {erro}")

    # Use the md_mechanism parameter (not a module-level loop variable) so the
    # function also works when imported and called from another module.
    resultados_ks = pd.DataFrame(ks_rows, columns=ks_columns)
    resultados_ks.to_excel(
        f"./DatasetsImputados/ks_test_{model_impt}_{md_mechanism}.xlsx", index=False
    )

    resultados_cs = pd.DataFrame(cs_rows, columns=cs_columns)
    resultados_cs.to_excel(
        f"./DatasetsImputados/cs_test_{model_impt}_{md_mechanism}.xlsx", index=False
    )

    return _logger.info("Resultados salvos com sucesso!")
if __name__ == "__main__":
    # Imputation models to evaluate, in the order they are run for each
    # missing-data mechanism.
    modelos = ("mean", "customKNN", "mice", "softimpute", "gain", "pmivae")

    for mecanismo in ["MAR-random_Multivariado", "MNAR-determisticFalse_Multivariado"]:
        for modelo in modelos:
            pipeline_data_distribution_tests(model_impt=modelo, md_mechanism=mecanismo)