diff --git a/edustudio/atom_op/mid2cache/CD/data_split4cd.py b/edustudio/atom_op/mid2cache/CD/data_split4cd.py index 868521f..d0c46cf 100644 --- a/edustudio/atom_op/mid2cache/CD/data_split4cd.py +++ b/edustudio/atom_op/mid2cache/CD/data_split4cd.py @@ -104,5 +104,3 @@ def set_dt_info(self, dt_info, **kwargs): dt_info['cpt_count'] = max(dt_info.get('cpt_count', -1), df[col].max() + 1) else: dt_info['cpt_count'] = max(dt_info.get('cpt_count', -1), np.max(list(chain(*df[col].to_list()))) + 1) - - a = 1 \ No newline at end of file diff --git a/edustudio/atom_op/mid2cache/common/build_dtinfo.py b/edustudio/atom_op/mid2cache/common/build_dtinfo.py deleted file mode 100644 index fed4104..0000000 --- a/edustudio/atom_op/mid2cache/common/build_dtinfo.py +++ /dev/null @@ -1,10 +0,0 @@ -from .base_mid2cache import BaseMid2Cache - -class M2C_BuildDtInfo(BaseMid2Cache): - default_cfg = { - } - - def process(self, **kwargs): - pass - - \ No newline at end of file diff --git a/edustudio/atom_op/raw2mid/__init__.py b/edustudio/atom_op/raw2mid/__init__.py index 11133a5..980cce1 100644 --- a/edustudio/atom_op/raw2mid/__init__.py +++ b/edustudio/atom_op/raw2mid/__init__.py @@ -15,7 +15,8 @@ from .nips12 import R2M_Eedi_20_T12 from .nips34 import R2M_Eedi_20_T34 from .simulated5 import R2M_Simulated5 - +from .slp_english import R2M_SLP_English +from .slp_math import R2M_SLP_Math # look up api dict _cli_api_dict_ = {} @@ -35,3 +36,5 @@ _cli_api_dict_['R2M_Eedi_20_T12'] = R2M_Eedi_20_T12.from_cli _cli_api_dict_['R2M_Eedi_20_T34'] = R2M_Eedi_20_T34.from_cli _cli_api_dict_['R2M_Simulated5'] = R2M_Simulated5.from_cli +_cli_api_dict_['R2M_SLP_Math'] = R2M_SLP_Math.from_cli +_cli_api_dict_['R2M_SLP_English'] = R2M_SLP_English.from_cli diff --git a/edustudio/atom_op/raw2mid/slp_english.py b/edustudio/atom_op/raw2mid/slp_english.py new file mode 100644 index 0000000..80b439f --- /dev/null +++ b/edustudio/atom_op/raw2mid/slp_english.py @@ -0,0 +1,91 @@ +from edustudio.atom_op.raw2mid import 
class R2M_SLP_English(BaseRaw2Mid):
    """Convert the raw SLP English data into middle-format CSV files.

    rawdata: https://aic-fe.bnu.edu.cn/en/data/index.html

    Reads ``student.csv``, ``family.csv``, ``school.csv`` and ``term-eng.csv``
    from ``self.rawpath`` and writes ``{dt}.inter.csv``, ``{dt}.stu.csv`` and
    ``{dt}.exer.csv`` into ``self.midpath``.
    """

    def process(self):
        """Build the student, interaction and exercise tables and save them."""
        super().process()

        df_stu = self._build_stu_table()
        df_inter, df_exer = self._build_inter_exer_tables()

        # save
        df_inter.to_csv(f"{self.midpath}/{self.dt}.inter.csv", index=False, encoding='utf-8')
        df_stu.to_csv(f"{self.midpath}/{self.dt}.stu.csv", index=False, encoding='utf-8')
        df_exer.to_csv(f"{self.midpath}/{self.dt}.exer.csv", index=False, encoding='utf-8')

    def _build_stu_table(self):
        """Join student, family and school records into one renamed table."""
        df_stu = pd.read_csv(f"{self.rawpath}/student.csv")
        # Students without a usable school id cannot be joined to school.csv.
        df_stu = df_stu.dropna(subset=['school_id'], how='any', axis=0)
        df_stu = df_stu[df_stu['school_id'] != 'n.a.']

        df_stu = df_stu.merge(
            pd.read_csv(f"{self.rawpath}/family.csv", index_col=False),
            on=['student_id'], how='inner'
        )
        df_stu = df_stu.merge(
            pd.read_csv(f"{self.rawpath}/school.csv"),
            on=['school_id'], how='inner'
        )

        # The two teacher-qualification rate columns are not exported.
        df_stu = df_stu.drop(columns=[
            'rate_of_higher_educated_teachers',
            "rate_of_teachers_with_master's_degree_and_above",
        ])
        # NOTE(review): 'age_mother' is exported as ':token' while 'age_father'
        # is ':float' -- confirm this asymmetry is intended.
        return df_stu.rename(columns={
            'student_id': 'stu_id:token', 'gender': 'gender:token',
            'school_id': 'sch_id:token', 'class_id': 'class_id:token',
            'age_father': 'age_father:float', 'age_mother': 'age_mother:token',
            'edubg_father': 'edubg_father:token', 'edubg_mother': 'edubg_mother:token',
            'affiliation_father': 'affiliation_father:token',
            'affiliation_mother': 'affiliation_mother:token',
            'family_income': 'family_income:token', 'is_only_child': 'is_only_child:token',
            'live_on_campus': 'live_on_campus:token',
            'gathering_frequency_father': 'gathering_frequency_father:token',
            'gathering_frequency_mother': 'gathering_frequency_mother:token',
            'family_traveling_times': "family_traveling_times:token",
            'school_type': 'school_type:token',
            'dist_to_downtown': 'dist_to_downtown:float',
        })

    def _build_inter_exer_tables(self):
        """Return ``(df_inter, df_exer)`` derived from ``term-eng.csv``."""
        df_raw = pd.read_csv(f"{self.rawpath}/term-eng.csv", index_col=False, low_memory=False)
        # Discard every row holding the 'n.a.' placeholder in any column
        # (this includes rows whose 'concept' is 'n.a.').
        has_placeholder = (df_raw == 'n.a.').any(axis=1)
        df_raw = df_raw[~has_placeholder].reset_index(drop=True)
        # Cast both operands: a column that contained 'n.a.' may have been
        # read as strings, and str / float would raise a TypeError.
        df_raw['label'] = df_raw['score'].astype(float) / df_raw['full_score'].astype(float)

        # One row per question. drop_duplicates returns a new frame, so the
        # column assignment below never writes into a slice of df_raw.
        df_exer = df_raw[['question_id', 'exam_id', 'subject_abbr', 'concept']].drop_duplicates(
            subset=['question_id']
        )
        df_exer['concept'] = df_exer['concept'].apply(lambda concepts: concepts.split(";"))

        # Explicit copy for the same reason: we add/overwrite columns below.
        df_inter = df_raw[
            ['student_id', 'question_id', 'score', 'full_score', 'time_access', 'label']
        ].copy()
        df_inter['time_access'] = df_inter['time_access'].apply(self.convert2timestamp)

        df_inter = df_inter.rename(columns={
            'student_id': 'stu_id:token', 'question_id': 'exer_id:token',
            'score': 'score:float', 'full_score': 'full_score:float',
            'time_access': 'start_timestamp:float', 'label': 'label:float'
        })
        df_exer = df_exer.rename(columns={
            'question_id': 'exer_id:token',
            'exam_id': 'exam_id:token',
            'subject_abbr': 'subject_abbr:token',
            'concept': 'cpt_seq:token_seq'
        })

        # The access time (whole seconds) doubles as the ordering key.
        df_inter['order_id:token'] = df_inter['start_timestamp:float'].astype(int)
        return df_inter, df_exer

    @staticmethod
    def convert2timestamp(dt):
        """Convert a ``"%Y-%m-%d %H:%M:%S"`` string to epoch seconds (float).

        ``time.mktime`` interprets the parsed time in the machine's local
        timezone, so the result depends on where the conversion is run.
        """
        parsed = time.strptime(dt, "%Y-%m-%d %H:%M:%S")
        return time.mktime(parsed)
class R2M_SLP_Math(BaseRaw2Mid):
    """Convert the raw SLP Math data into middle-format CSV files.

    rawdata: https://aic-fe.bnu.edu.cn/en/data/index.html

    Reads ``student.csv``, ``family.csv``, ``school.csv`` and ``term-mat.csv``
    from ``self.rawpath`` and writes ``{dt}.inter.csv``, ``{dt}.stu.csv`` and
    ``{dt}.exer.csv`` into ``self.midpath``.
    """

    def process(self):
        """Build the student, interaction and exercise tables and save them."""
        super().process()

        df_stu = self._build_stu_table()
        df_inter, df_exer = self._build_inter_exer_tables()

        # save
        df_inter.to_csv(f"{self.midpath}/{self.dt}.inter.csv", index=False, encoding='utf-8')
        df_stu.to_csv(f"{self.midpath}/{self.dt}.stu.csv", index=False, encoding='utf-8')
        df_exer.to_csv(f"{self.midpath}/{self.dt}.exer.csv", index=False, encoding='utf-8')

    def _build_stu_table(self):
        """Join student, family and school records into one renamed table."""
        df_stu = pd.read_csv(f"{self.rawpath}/student.csv")
        # Students without a usable school id cannot be joined to school.csv.
        df_stu = df_stu.dropna(subset=['school_id'], how='any', axis=0)
        df_stu = df_stu[df_stu['school_id'] != 'n.a.']

        df_stu = df_stu.merge(
            pd.read_csv(f"{self.rawpath}/family.csv", index_col=False),
            on=['student_id'], how='inner'
        )
        df_stu = df_stu.merge(
            pd.read_csv(f"{self.rawpath}/school.csv"),
            on=['school_id'], how='inner'
        )

        # The two teacher-qualification rate columns are not exported.
        df_stu = df_stu.drop(columns=[
            'rate_of_higher_educated_teachers',
            "rate_of_teachers_with_master's_degree_and_above",
        ])
        # NOTE(review): 'age_mother' is exported as ':token' while 'age_father'
        # is ':float' -- confirm this asymmetry is intended.
        return df_stu.rename(columns={
            'student_id': 'stu_id:token', 'gender': 'gender:token',
            'school_id': 'sch_id:token', 'class_id': 'class_id:token',
            'age_father': 'age_father:float', 'age_mother': 'age_mother:token',
            'edubg_father': 'edubg_father:token', 'edubg_mother': 'edubg_mother:token',
            'affiliation_father': 'affiliation_father:token',
            'affiliation_mother': 'affiliation_mother:token',
            'family_income': 'family_income:token', 'is_only_child': 'is_only_child:token',
            'live_on_campus': 'live_on_campus:token',
            'gathering_frequency_father': 'gathering_frequency_father:token',
            'gathering_frequency_mother': 'gathering_frequency_mother:token',
            'family_traveling_times': "family_traveling_times:token",
            'school_type': 'school_type:token',
            'dist_to_downtown': 'dist_to_downtown:float',
        })

    def _build_inter_exer_tables(self):
        """Return ``(df_inter, df_exer)`` derived from ``term-mat.csv``."""
        df_raw = pd.read_csv(f"{self.rawpath}/term-mat.csv", index_col=False)
        # Only rows whose 'concept' is the 'n.a.' placeholder are discarded.
        # .copy() detaches the filtered rows so adding 'label' below does not
        # write into a slice of the frame returned by read_csv.
        df_raw = df_raw[df_raw['concept'] != 'n.a.'].copy()
        # Cast both operands so the ratio is computed on floats even if a
        # column was read with a non-float dtype.
        df_raw['label'] = df_raw['score'].astype(float) / df_raw['full_score'].astype(float)

        # One row per question. drop_duplicates returns a new frame, so the
        # column assignment below never writes into a slice of df_raw.
        df_exer = df_raw[['question_id', 'exam_id', 'subject_abbr', 'concept']].drop_duplicates(
            subset=['question_id']
        )
        df_exer['concept'] = df_exer['concept'].apply(lambda concepts: concepts.split(";"))

        # Explicit copy for the same reason: we add/overwrite columns below.
        df_inter = df_raw[
            ['student_id', 'question_id', 'score', 'full_score', 'time_access', 'label']
        ].copy()
        df_inter['time_access'] = df_inter['time_access'].apply(self.convert2timestamp)

        df_inter = df_inter.rename(columns={
            'student_id': 'stu_id:token', 'question_id': 'exer_id:token',
            'score': 'score:float', 'full_score': 'full_score:float',
            'time_access': 'start_timestamp:float', 'label': 'label:float'
        })
        df_exer = df_exer.rename(columns={
            'question_id': 'exer_id:token',
            'exam_id': 'exam_id:token',
            'subject_abbr': 'subject_abbr:token',
            'concept': 'cpt_seq:token_seq'
        })

        # The access time (whole seconds) doubles as the ordering key.
        df_inter['order_id:token'] = df_inter['start_timestamp:float'].astype(int)
        return df_inter, df_exer

    @staticmethod
    def convert2timestamp(dt):
        """Convert a ``"%Y-%m-%d %H:%M:%S"`` string to epoch seconds (float).

        ``time.mktime`` interprets the parsed time in the machine's local
        timezone, so the result depends on where the conversion is run.
        """
        parsed = time.strptime(dt, "%Y-%m-%d %H:%M:%S")
        return time.mktime(parsed)
BaseRaw2Mid): @@ -541,13 +541,20 @@ def _preprocess_feat(df): for col in df.columns: col_name, col_type = col.split(":") if col_type == 'token': - df[col] = df[col].astype('int64') + # df[col] = df[col].astype('int64') + pass elif col_type == 'float': df[col] = df[col].astype('float32') elif col_type == 'token_seq': - df[col] = df[col].astype(str).apply(lambda x: [int(i) for i in x.split(",")]) + try: + df[col] = df[col].astype(str).apply(lambda x: [int(i) for i in x.split(",")]) + except: + df[col] = df[col].astype(str).apply(lambda x: eval(x)) elif col_type == 'float_seq': - df[col] = df[col].astype(str).apply(lambda x: [float(i) for i in x.split(",")]) + try: + df[col] = df[col].astype(str).apply(lambda x: [float(i) for i in x.split(",")]) + except: + df[col] = df[col].astype(str).apply(lambda x: eval(x)) else: raise ValueError(f"unknown field type of {col_type}") diff --git a/edustudio/evaltpl/__init__.py b/edustudio/evaltpl/__init__.py index 00b26af..7079012 100644 --- a/edustudio/evaltpl/__init__.py +++ b/edustudio/evaltpl/__init__.py @@ -1,4 +1,4 @@ from .base_evaltpl import BaseEvalTPL from .bc_evaltpl import BinaryClassificationEvalTPL from .cd_evaltpl import CognitiveDiagnosisEvalTPL - +from .fairness_evaltpl import FairnessEvalTPL diff --git a/edustudio/evaltpl/base_evaltpl.py b/edustudio/evaltpl/base_evaltpl.py index f95c0eb..3fb1adb 100644 --- a/edustudio/evaltpl/base_evaltpl.py +++ b/edustudio/evaltpl/base_evaltpl.py @@ -20,6 +20,7 @@ def __init__(self, cfg): self.frame_cfg: UnifyConfig = cfg.frame_cfg self.modeltpl_cfg: UnifyConfig = cfg.modeltpl_cfg self.logger: logging.Logger = logging.getLogger("edustudio") + self._check_params() @classmethod def get_default_cfg(cls): diff --git a/edustudio/evaltpl/fairness_evaltpl.py b/edustudio/evaltpl/fairness_evaltpl.py new file mode 100644 index 0000000..13a53a6 --- /dev/null +++ b/edustudio/evaltpl/fairness_evaltpl.py @@ -0,0 +1,82 @@ +from .base_evaltpl import BaseEvalTPL +import pandas as pd +import numpy 
class FairnessEvalTPL(BaseEvalTPL):
    """Evaluation template computing group-fairness metrics of predictions.

    Config keys (under this class's name in ``evaltpl_cfg``):
        use_sensi_attrs: student-table columns treated as sensitive attributes.
        use_metrics: any subset of ``'EO'``, ``'DP'``, ``'FCD'``.

    A metric is only computed for an attribute that has exactly two distinct
    values in the student table passed to :meth:`add_extra_data`.
    """
    default_cfg = {
        'use_sensi_attrs': ['gender:token'],
        'use_metrics': ['EO', 'DP', 'FCD']
    }

    _SUPPORTED_METRICS = ('EO', 'DP', 'FCD')

    @property
    def _own_cfg(self):
        """Config section of this evaluation template."""
        return self.evaltpl_cfg[self.__class__.__name__]

    def _check_params(self):
        """Check that every configured metric is one of the supported ones."""
        unknown = set(self._own_cfg['use_metrics']) - set(self._SUPPORTED_METRICS)
        assert len(unknown) == 0, f"unsupported fairness metrics: {sorted(unknown)}"

    def eval(self, **kwargs):
        """Compute the configured fairness metrics.

        Args:
            **kwargs: must contain ``stu_id``, ``y_pd`` and ``y_gt``; each is
                passed through ``tensor2npy`` (presumably 1-D and aligned
                element-wise -- confirm against the caller).

        Returns:
            dict mapping ``"<metric>_<attr>"`` to the metric value, for every
            configured attribute with exactly two distinct values.
        """
        stu_id = tensor2npy(kwargs['stu_id'])
        pd_soft = tensor2npy(kwargs['y_pd'])
        gt = tensor2npy(kwargs['y_gt'])
        # Binarise the predictions at the 0.5 threshold.
        pd_hard = (pd_soft >= 0.5).astype(np.int64)

        df_stu = self.extra_data['df_stu']

        df = pd.DataFrame()
        df['stu_id:token'] = stu_id
        df['pd_soft'] = pd_soft
        df['pd_hard'] = pd_hard
        df['gt'] = gt
        # Attach the sensitive attributes of each interaction's student.
        df = df.merge(df_stu, on='stu_id:token', how='left')

        metric_fns = {'EO': self.get_eo, 'DP': self.get_dp, 'FCD': self.get_fcd}
        ret_dic = {}
        for attr in self._own_cfg['use_sensi_attrs']:
            # Only binary attributes are supported; others are skipped.
            if len(df_stu[attr].unique()) != 2:
                continue
            for use_metric in self._own_cfg['use_metrics']:
                metric_fn = metric_fns.get(use_metric)
                if metric_fn is not None:
                    ret_dic[f"{use_metric}_{attr}"] = metric_fn(df, attr)
        return ret_dic

    @staticmethod
    def _two_group_gap(per_group, sensitive_attr):
        """Return ``first - second`` of a per-group result with exactly two groups.

        Groups are addressed by position rather than by the labels ``0``/``1``,
        so any pair of attribute values (e.g. 1/2 or 'F'/'M') is handled.

        Raises:
            ValueError: if the evaluated data does not contain exactly two groups.
        """
        if len(per_group) != 2:
            raise ValueError(
                f"fairness metrics need exactly two groups of `{sensitive_attr}` "
                f"in the evaluated data, got {len(per_group)}"
            )
        return per_group.iloc[0] - per_group.iloc[1]

    def get_dp(self, df, sensitive_attr):
        """Demographic Parity

        Absolute difference between the two groups' mean hard prediction.
        """
        rates = df.groupby(sensitive_attr)['pd_hard'].mean()
        return abs(self._two_group_gap(rates, sensitive_attr))

    def get_eo(self, df, sensitive_attr):
        """Equal Opportunity

        Absolute difference between the two groups' mean hard prediction,
        restricted to samples whose ground truth equals 1.
        """
        positives = df[df['gt'] == 1]
        rates = positives.groupby(sensitive_attr)['pd_hard'].mean()
        return abs(self._two_group_gap(rates, sensitive_attr))

    def get_fcd(self, df, sensitive_attr):
        """Fair Cognitive Diagnosis [1]

        Averages ``pd_hard`` and ``gt`` per student, then per group, and
        returns the absolute difference between the predicted group gap and
        the ground-truth group gap.

        [1]zhang zheng, et al, Understanding and Improving Fairness in Cognitive Diagnosis, SCIENCE CHINA Information Sciences, 2023, ISSN 1674-733X, https://doi.org/10.1007/s11432-022-3852-0.
        """
        per_student = df.groupby([sensitive_attr, 'stu_id:token'])[['pd_hard', 'gt']].mean()
        # Level 0 of the index is the sensitive attribute.
        per_group = per_student.groupby(level=0).mean()
        gap = self._two_group_gap(per_group, sensitive_attr)
        return abs(gap['pd_hard'] - gap['gt'])

    def add_extra_data(self, **kwargs):
        """Store extra data and validate the student table it must contain.

        Args:
            **kwargs: stored as ``self.extra_data``; must include ``df_stu``,
                a table holding every configured sensitive attribute.
        """
        self.extra_data = kwargs

        df_stu = self.extra_data['df_stu']
        assert df_stu is not None, "FairnessEvalTPL requires the student table `df_stu`"
        for attr in self._own_cfg['use_sensi_attrs']:
            assert attr in df_stu, f"sensitive attribute `{attr}` not found in `df_stu`"
            g_names = df_stu[attr].unique()

            for use_metric in self._own_cfg['use_metrics']:
                assert len(g_names) >= 2, f"sensitive attribute `{attr}` has fewer than 2 distinct values"
                if len(g_names) > 2 and use_metric in self._SUPPORTED_METRICS:
                    self.logger.warning(f"As the number of sensitive attribute `{attr}` values > 2, the fairness metric {use_metric} is not supported for the `{attr}`")
list(range(len(loader))) gt_list = list(range(len(loader))) for idx, batch_dict in enumerate(tqdm(loader, ncols=self.frame_cfg['TQDM_NCOLS'], desc="[PREDICT]")): batch_dict = self.batch_dict2device(batch_dict) eval_dict = self.model.predict(**batch_dict) + stu_id_list[idx] = batch_dict['stu_id'] pd_list[idx] = eval_dict['y_pd'] gt_list[idx] = eval_dict['y_gt'] if 'y_gt' in eval_dict else batch_dict['label'] y_pd = torch.hstack(pd_list) y_gt = torch.hstack(gt_list) + stu_id = torch.hstack(stu_id_list) eval_data_dict = { + 'stu_id': stu_id, 'y_pd': y_pd, 'y_gt': y_gt, }