-
Notifications
You must be signed in to change notification settings - Fork 10
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add FairnessEvalTPL and the SLP_English and SLP_Math datasets
- Loading branch information
Showing
10 changed files
with
285 additions
and
19 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
from edustudio.atom_op.raw2mid import BaseRaw2Mid | ||
import pandas as pd | ||
import numpy as np | ||
import time | ||
|
||
""" | ||
SLP Dataset: https://aic-fe.bnu.edu.cn/en/data/index.html | ||
""" | ||
|
||
|
||
class R2M_SLP_English(BaseRaw2Mid):
    """Convert the raw SLP English-subject dataset into the middle (mid) CSV format.

    rawdata: https://aic-fe.bnu.edu.cn/en/data/index.html
    """

    def process(self):
        """Build and save the three mid-format tables.

        Reads ``student.csv``, ``family.csv``, ``school.csv`` and
        ``term-eng.csv`` from ``self.rawpath`` and writes
        ``{dt}.stu.csv``, ``{dt}.exer.csv`` and ``{dt}.inter.csv``
        into ``self.midpath``.
        """
        super().process()

        # ---------- student table ----------
        df_stu = pd.read_csv(f"{self.rawpath}/student.csv")
        # Drop students without a usable school id (NaN or the 'n.a.' placeholder).
        df_stu.dropna(subset=['school_id'], inplace=True, how='any', axis=0)
        df_stu = df_stu[df_stu['school_id'] != 'n.a.']

        # Attach family and school attributes to each student.
        df_stu = df_stu.merge(
            pd.read_csv(f"{self.rawpath}/family.csv", index_col=False),
            on=['student_id'], how='inner'
        )
        df_stu = df_stu.merge(
            pd.read_csv(f"{self.rawpath}/school.csv"),
            on=['school_id'], how='inner'
        )

        # These two teacher-rate columns are intentionally discarded.
        df_stu.drop([
            'rate_of_higher_educated_teachers',
            "rate_of_teachers_with_master's_degree_and_above"
        ], inplace=True, axis=1)
        df_stu.rename(columns={
            'student_id': 'stu_id:token', 'gender': 'gender:token',
            'school_id': 'sch_id:token', 'class_id': 'class_id:token',
            'age_father': 'age_father:float', 'age_mother': 'age_mother:token',
            'edubg_father': 'edubg_father:token', 'edubg_mother': 'edubg_mother:token',
            'affiliation_father': 'affiliation_father:token',
            'affiliation_mother': 'affiliation_mother:token',
            'family_income': 'family_income:token', 'is_only_child': 'is_only_child:token',
            'live_on_campus': 'live_on_campus:token',
            'gathering_frequency_father': 'gathering_frequency_father:token',
            'gathering_frequency_mother': 'gathering_frequency_mother:token',
            'family_traveling_times': "family_traveling_times:token",
            'school_type': 'school_type:token',
            'dist_to_downtown': 'dist_to_downtown:float',
        }, inplace=True)

        # ---------- interaction table ----------
        df_inter = pd.read_csv(f"{self.rawpath}/term-eng.csv", index_col=False, low_memory=False)
        # Keep only rows with no 'n.a.' placeholder in any column.
        df_inter = df_inter[(df_inter == 'n.a.').sum(axis=1) == 0].reset_index(drop=True)
        df_inter = df_inter[df_inter['concept'] != 'n.a.']
        # BUGFIX: cast BOTH operands — after the 'n.a.' filtering these columns
        # may have been read as object/str dtype, not just 'full_score'.
        df_inter['label'] = df_inter['score'].astype(float) / df_inter['full_score'].astype(float)

        # BUGFIX: take explicit copies — the originals were views, so the
        # in-place drop_duplicates/assignments below raised
        # SettingWithCopyWarning and could silently not apply.
        df_exer = df_inter[['question_id', 'exam_id', 'subject_abbr', 'concept']].copy()
        df_inter = df_inter[['student_id', 'question_id', 'score', 'full_score', 'time_access', 'label']].copy()
        df_exer.drop_duplicates(subset=['question_id'], inplace=True)
        # Split the ';'-separated concept string into a knowledge-concept sequence.
        df_exer['concept'] = df_exer['concept'].apply(lambda x: x.split(";"))
        df_inter['time_access'] = df_inter['time_access'].apply(lambda x: self.convert2timestamp(x))

        df_inter.rename(columns={
            'student_id': 'stu_id:token', 'question_id': 'exer_id:token',
            'score': 'score:float', 'full_score': 'full_score:float',
            'time_access': 'start_timestamp:float', 'label': 'label:float'
        }, inplace=True)

        df_exer.rename(columns={
            'question_id': 'exer_id:token',
            'exam_id': 'exam_id:token',
            'subject_abbr': 'subject_abbr:token',
            'concept': 'cpt_seq:token_seq'
        }, inplace=True)

        # Interactions are ordered by their access timestamp.
        df_inter['order_id:token'] = df_inter['start_timestamp:float'].astype(int)

        # ---------- save ----------
        df_inter.to_csv(f"{self.midpath}/{self.dt}.inter.csv", index=False, encoding='utf-8')
        df_stu.to_csv(f"{self.midpath}/{self.dt}.stu.csv", index=False, encoding='utf-8')
        df_exer.to_csv(f"{self.midpath}/{self.dt}.exer.csv", index=False, encoding='utf-8')

    @staticmethod
    def convert2timestamp(dt):
        """Convert a ``"%Y-%m-%d %H:%M:%S"`` string to a local-time Unix timestamp (float)."""
        timeArray = time.strptime(dt, "%Y-%m-%d %H:%M:%S")
        timestamp = time.mktime(timeArray)
        return timestamp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
from edustudio.atom_op.raw2mid import BaseRaw2Mid | ||
import pandas as pd | ||
import numpy as np | ||
import time | ||
|
||
""" | ||
SLP Dataset: https://aic-fe.bnu.edu.cn/en/data/index.html | ||
""" | ||
|
||
class R2M_SLP_Math(BaseRaw2Mid):
    """Convert the raw SLP Math-subject dataset into the middle (mid) CSV format.

    rawdata: https://aic-fe.bnu.edu.cn/en/data/index.html
    """

    def process(self):
        """Build and save the three mid-format tables.

        Reads ``student.csv``, ``family.csv``, ``school.csv`` and
        ``term-mat.csv`` from ``self.rawpath`` and writes
        ``{dt}.stu.csv``, ``{dt}.exer.csv`` and ``{dt}.inter.csv``
        into ``self.midpath``.
        """
        super().process()

        # ---------- student table ----------
        df_stu = pd.read_csv(f"{self.rawpath}/student.csv")
        # Drop students without a usable school id (NaN or the 'n.a.' placeholder).
        df_stu.dropna(subset=['school_id'], inplace=True, how='any', axis=0)
        df_stu = df_stu[df_stu['school_id'] != 'n.a.']

        # Attach family and school attributes to each student.
        df_stu = df_stu.merge(
            pd.read_csv(f"{self.rawpath}/family.csv", index_col=False),
            on=['student_id'], how='inner'
        )
        df_stu = df_stu.merge(
            pd.read_csv(f"{self.rawpath}/school.csv"),
            on=['school_id'], how='inner'
        )

        # These two teacher-rate columns are intentionally discarded.
        df_stu.drop([
            'rate_of_higher_educated_teachers',
            "rate_of_teachers_with_master's_degree_and_above"
        ], inplace=True, axis=1)
        df_stu.rename(columns={
            'student_id': 'stu_id:token', 'gender': 'gender:token',
            'school_id': 'sch_id:token', 'class_id': 'class_id:token',
            'age_father': 'age_father:float', 'age_mother': 'age_mother:token',
            'edubg_father': 'edubg_father:token', 'edubg_mother': 'edubg_mother:token',
            'affiliation_father': 'affiliation_father:token',
            'affiliation_mother': 'affiliation_mother:token',
            'family_income': 'family_income:token', 'is_only_child': 'is_only_child:token',
            'live_on_campus': 'live_on_campus:token',
            'gathering_frequency_father': 'gathering_frequency_father:token',
            'gathering_frequency_mother': 'gathering_frequency_mother:token',
            'family_traveling_times': "family_traveling_times:token",
            'school_type': 'school_type:token',
            'dist_to_downtown': 'dist_to_downtown:float',
        }, inplace=True)

        # ---------- interaction table ----------
        df_inter = pd.read_csv(f"{self.rawpath}/term-mat.csv", index_col=False, low_memory=False)
        # BUGFIX (consistency with R2M_SLP_English): drop rows containing the
        # 'n.a.' placeholder in ANY column before dividing, otherwise
        # score/full_score can hold 'n.a.' strings and the division fails.
        df_inter = df_inter[(df_inter == 'n.a.').sum(axis=1) == 0].reset_index(drop=True)
        df_inter = df_inter[df_inter['concept'] != 'n.a.']
        # BUGFIX: cast both operands — these columns may be object dtype.
        df_inter['label'] = df_inter['score'].astype(float) / df_inter['full_score'].astype(float)

        # BUGFIX: take explicit copies — the originals were views, so the
        # in-place drop_duplicates/assignments below raised
        # SettingWithCopyWarning and could silently not apply.
        df_exer = df_inter[['question_id', 'exam_id', 'subject_abbr', 'concept']].copy()
        df_inter = df_inter[['student_id', 'question_id', 'score', 'full_score', 'time_access', 'label']].copy()
        df_exer.drop_duplicates(subset=['question_id'], inplace=True)
        # Split the ';'-separated concept string into a knowledge-concept sequence.
        df_exer['concept'] = df_exer['concept'].apply(lambda x: x.split(";"))
        df_inter['time_access'] = df_inter['time_access'].apply(lambda x: self.convert2timestamp(x))

        df_inter.rename(columns={
            'student_id': 'stu_id:token', 'question_id': 'exer_id:token',
            'score': 'score:float', 'full_score': 'full_score:float',
            'time_access': 'start_timestamp:float', 'label': 'label:float'
        }, inplace=True)

        df_exer.rename(columns={
            'question_id': 'exer_id:token',
            'exam_id': 'exam_id:token',
            'subject_abbr': 'subject_abbr:token',
            'concept': 'cpt_seq:token_seq'
        }, inplace=True)

        # Interactions are ordered by their access timestamp.
        df_inter['order_id:token'] = df_inter['start_timestamp:float'].astype(int)

        # ---------- save ----------
        df_inter.to_csv(f"{self.midpath}/{self.dt}.inter.csv", index=False, encoding='utf-8')
        df_stu.to_csv(f"{self.midpath}/{self.dt}.stu.csv", index=False, encoding='utf-8')
        df_exer.to_csv(f"{self.midpath}/{self.dt}.exer.csv", index=False, encoding='utf-8')

    @staticmethod
    def convert2timestamp(dt):
        """Convert a ``"%Y-%m-%d %H:%M:%S"`` string to a local-time Unix timestamp (float)."""
        timeArray = time.strptime(dt, "%Y-%m-%d %H:%M:%S")
        timestamp = time.mktime(timeArray)
        return timestamp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
from .base_evaltpl import BaseEvalTPL | ||
from .bc_evaltpl import BinaryClassificationEvalTPL | ||
from .cd_evaltpl import CognitiveDiagnosisEvalTPL | ||
|
||
from .fairness_evaltpl import FairnessEvalTPL |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
from .base_evaltpl import BaseEvalTPL | ||
import pandas as pd | ||
import numpy as np | ||
from edustudio.utils.common import tensor2npy | ||
|
||
|
||
class FairnessEvalTPL(BaseEvalTPL):
    """Evaluation template computing group-fairness metrics over binary
    sensitive attributes.

    Supported metrics (each requires the sensitive attribute to take exactly
    two distinct values; attributes with more values are skipped with a
    warning at ``add_extra_data`` time):

    - ``DP``  : Demographic Parity gap
    - ``EO``  : Equal Opportunity gap
    - ``FCD`` : Fair Cognitive Diagnosis gap
    """
    default_cfg = {
        'use_sensi_attrs': ['gender:token'],
        'use_metrics': ['EO', 'DP', 'FCD']
    }

    def _check_params(self):
        """Ensure every configured metric is one of the supported ones."""
        assert len(set(self.evaltpl_cfg[self.__class__.__name__]['use_metrics']) - {'EO', 'DP', 'FCD'}) == 0

    def eval(self, **kwargs):
        """Compute the configured fairness metrics.

        Expects tensors ``stu_id``, ``y_pd`` (soft predictions) and ``y_gt``
        in ``kwargs``; hard predictions use a 0.5 threshold. Returns a dict
        mapping ``"{metric}_{attr}"`` to the metric value.
        """
        cfg = self.evaltpl_cfg[self.__class__.__name__]
        stu_id = tensor2npy(kwargs['stu_id'])
        pd_soft = tensor2npy(kwargs['y_pd'])
        gt = tensor2npy(kwargs['y_gt'])
        pd_hard = (pd_soft >= 0.5).astype(np.int64)

        df_stu = self.extra_data['df_stu']

        df = pd.DataFrame()
        df['stu_id:token'] = stu_id
        df['pd_soft'] = pd_soft
        df['pd_hard'] = pd_hard
        df['gt'] = gt
        df = df.merge(df_stu, on='stu_id:token', how='left')

        ret_dic = {}
        for attr in cfg['use_sensi_attrs']:
            g_names = df_stu[attr].unique()

            for use_metric in cfg['use_metrics']:
                # Metrics are only defined for binary-valued attributes.
                if len(g_names) == 2:
                    if use_metric == 'EO': ret_dic[f"EO_{attr}"] = self.get_eo(df, attr)
                    if use_metric == 'DP': ret_dic[f"DP_{attr}"] = self.get_dp(df, attr)
                    if use_metric == 'FCD': ret_dic[f"FCD_{attr}"] = self.get_fcd(df, attr)
                else:
                    pass
        return ret_dic

    def get_dp(self, df, sensitive_attr):
        """Demographic Parity: |P(pred=1 | g0) - P(pred=1 | g1)|."""
        dp = df.groupby(sensitive_attr)['pd_hard'].mean()
        # BUGFIX: index positionally — group labels need not be 0/1
        # (e.g. string-valued attributes previously raised KeyError).
        # abs() makes the group order irrelevant.
        return abs(dp.iloc[0] - dp.iloc[1])

    def get_eo(self, df, sensitive_attr):
        """Equal Opportunity: |TPR(g0) - TPR(g1)| on the positive class (gt == 1)."""
        eo = df.groupby([sensitive_attr, 'gt'])['pd_hard'].mean()
        # BUGFIX: select the gt==1 slice by level, then compare the two groups
        # positionally instead of assuming the group labels are 0 and 1.
        tpr = eo.xs(1, level='gt')
        return abs(tpr.iloc[0] - tpr.iloc[1])

    def get_fcd(self, df, sensitive_attr):
        """Fair Cognitive Diagnosis [1]: gap between the predicted and the
        ground-truth per-group mean of per-student correct rates.

        [1] Zhang Zheng, et al., Understanding and Improving Fairness in
        Cognitive Diagnosis, SCIENCE CHINA Information Sciences, 2023,
        ISSN 1674-733X, https://doi.org/10.1007/s11432-022-3852-0.
        """
        # BUGFIX: derive the two group labels from the index instead of
        # assuming they are the literals 0 and 1.
        fcd_pd = df.groupby([sensitive_attr, 'stu_id:token'])['pd_hard'].mean()
        g0, g1 = fcd_pd.index.get_level_values(0).unique()[:2]
        gap_pd = fcd_pd.loc[g0].mean() - fcd_pd.loc[g1].mean()

        fcd_gt = df.groupby([sensitive_attr, 'stu_id:token'])['gt'].mean()
        gap_gt = fcd_gt.loc[g0].mean() - fcd_gt.loc[g1].mean()
        return abs(gap_pd - gap_gt)

    def add_extra_data(self, **kwargs):
        """Store side data and validate the configured sensitive attributes.

        Requires ``df_stu`` in ``kwargs`` containing every configured
        sensitive-attribute column with at least two distinct values; warns
        when an attribute has more than two values (metrics unsupported).
        """
        self.extra_data = kwargs

        df_stu = self.extra_data['df_stu']
        assert df_stu is not None
        for attr in self.evaltpl_cfg[self.__class__.__name__]['use_sensi_attrs']:
            assert attr in df_stu
            g_names = df_stu[attr].unique()

            for use_metric in self.evaltpl_cfg[self.__class__.__name__]['use_metrics']:
                assert len(g_names) >= 2
                if len(g_names) > 2:
                    if use_metric in {'EO', 'DP', 'FCD'}:
                        self.logger.warning(f"As the number of sensitive attribute `{attr}` values > 2, the fairness metric {use_metric} is not supported for the `{attr}`")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters