Add FairnessEvalTPL and the SLP_English and SLP_Math datasets

HFUT-LEC · Feb 3, 2024 · 85536e0 · 85536e0
1 parent c7b1280
commit 85536e0
Show file tree

Hide file tree

Showing 10 changed files with 285 additions and 19 deletions.
diff --git a/edustudio/atom_op/mid2cache/CD/data_split4cd.py b/edustudio/atom_op/mid2cache/CD/data_split4cd.py
@@ -104,5 +104,3 @@ def set_dt_info(self, dt_info, **kwargs):
                                 dt_info['cpt_count'] = max(dt_info.get('cpt_count', -1), df[col].max() + 1)
                             else:
                                 dt_info['cpt_count'] = max(dt_info.get('cpt_count', -1), np.max(list(chain(*df[col].to_list()))) + 1)
-
-            a = 1
diff --git a/edustudio/atom_op/mid2cache/common/build_dtinfo.py b/edustudio/atom_op/mid2cache/common/build_dtinfo.py
diff --git a/edustudio/atom_op/raw2mid/__init__.py b/edustudio/atom_op/raw2mid/__init__.py
@@ -15,7 +15,8 @@
 from .nips12 import R2M_Eedi_20_T12
 from .nips34 import R2M_Eedi_20_T34
 from .simulated5 import R2M_Simulated5
-
+from .slp_english import R2M_SLP_English
+from .slp_math import R2M_SLP_Math
 
 # look up api dict
 _cli_api_dict_ = {}
@@ -35,3 +36,5 @@
 _cli_api_dict_['R2M_Eedi_20_T12'] = R2M_Eedi_20_T12.from_cli
 _cli_api_dict_['R2M_Eedi_20_T34'] = R2M_Eedi_20_T34.from_cli
 _cli_api_dict_['R2M_Simulated5'] = R2M_Simulated5.from_cli
+_cli_api_dict_['R2M_SLP_Math'] = R2M_SLP_Math.from_cli
+_cli_api_dict_['R2M_SLP_English'] = R2M_SLP_English.from_cli
diff --git a/edustudio/atom_op/raw2mid/slp_english.py b/edustudio/atom_op/raw2mid/slp_english.py
@@ -0,0 +1,91 @@
+from edustudio.atom_op.raw2mid import BaseRaw2Mid
+import pandas as pd
+import numpy as np
+import time
+
+"""
+    SLP Dataset: https://aic-fe.bnu.edu.cn/en/data/index.html
+"""
+
+
+class R2M_SLP_English(BaseRaw2Mid):
+    """Convert the raw SLP English term data into middata CSV files.
+
+    rawdata: https://aic-fe.bnu.edu.cn/en/data/index.html
+
+    Reads ``student.csv``, ``family.csv``, ``school.csv`` and ``term-eng.csv``
+    from ``self.rawpath`` and writes ``{dt}.inter.csv``, ``{dt}.stu.csv`` and
+    ``{dt}.exer.csv`` into ``self.midpath``.
+    """
+
+    def process(self):
+        """Build the student, interaction and exercise tables and save them."""
+        super().process()
+
+        # for stu
+        df_stu = pd.read_csv(f"{self.rawpath}/student.csv")
+        # Keep only students with a usable school id (neither NaN nor 'n.a.').
+        df_stu = df_stu.dropna(subset=['school_id'], how='any', axis=0)
+        df_stu = df_stu[df_stu['school_id'] != 'n.a.']
+
+        # Attach family and school attributes; inner joins drop students
+        # that have no matching family or school record.
+        df_stu = df_stu.merge(
+            pd.read_csv(f"{self.rawpath}/family.csv", index_col=False),
+            on=['student_id'], how='inner'
+        )
+        df_stu = df_stu.merge(
+            pd.read_csv(f"{self.rawpath}/school.csv"),
+            on=['school_id'], how='inner'
+        )
+
+        # The two teacher-rate columns are not exported to the middata.
+        df_stu = df_stu.drop(columns=[
+            'rate_of_higher_educated_teachers',
+            "rate_of_teachers_with_master's_degree_and_above",
+        ])
+        # NOTE(review): 'age_mother' is typed as token while 'age_father' is
+        # float -- looks like a typo, but renaming would change the middata
+        # schema, so it is left as is. Confirm with the dataset owner.
+        df_stu = df_stu.rename(columns={
+            'student_id': 'stu_id:token', 'gender': 'gender:token',
+            'school_id': 'sch_id:token', 'class_id': 'class_id:token',
+            'age_father': 'age_father:float', 'age_mother': 'age_mother:token',
+            'edubg_father': 'edubg_father:token', 'edubg_mother': 'edubg_mother:token',
+            'affiliation_father': 'affiliation_father:token',
+            'affiliation_mother': 'affiliation_mother:token',
+            'family_income': 'family_income:token', 'is_only_child': 'is_only_child:token',
+            'live_on_campus': 'live_on_campus:token',
+            'gathering_frequency_father': 'gathering_frequency_father:token',
+            'gathering_frequency_mother': 'gathering_frequency_mother:token',
+            'family_traveling_times': "family_traveling_times:token",
+            'school_type': 'school_type:token',
+            'dist_to_downtown': 'dist_to_downtown:float',
+        })
+
+        # for inter
+        df_inter = pd.read_csv(f"{self.rawpath}/term-eng.csv", index_col=False, low_memory=False)
+        # Discard every row that carries the 'n.a.' marker in any column.
+        df_inter = df_inter[(df_inter == 'n.a.').sum(axis=1) == 0].reset_index(drop=True)
+        # Work on an explicit copy so the column assignments below do not
+        # operate on a slice of the frame read from disk.
+        df_inter = df_inter[df_inter['concept'] != 'n.a.'].copy()
+        # Cast both operands: a column that contained 'n.a.' is read as strings.
+        df_inter['label'] = df_inter['score'].astype(float) / df_inter['full_score'].astype(float)
+
+        # One row per question; the first occurrence supplies the metadata.
+        df_exer = (
+            df_inter[['question_id', 'exam_id', 'subject_abbr', 'concept']]
+            .drop_duplicates(subset=['question_id'])
+            .copy()
+        )
+        # Concepts are ';'-separated in the raw file.
+        df_exer['concept'] = df_exer['concept'].apply(lambda x: x.split(";"))
+
+        df_inter = df_inter[
+            ['student_id', 'question_id', 'score', 'full_score', 'time_access', 'label']
+        ].copy()
+        df_inter['time_access'] = df_inter['time_access'].apply(self.convert2timestamp)
+
+        df_inter = df_inter.rename(columns={
+            'student_id': 'stu_id:token', 'question_id': 'exer_id:token',
+            'score': 'score:float', 'full_score': 'full_score:float',
+            'time_access': 'start_timestamp:float', 'label': 'label:float'
+        })
+
+        df_exer = df_exer.rename(columns={
+            'question_id': 'exer_id:token',
+            'exam_id': 'exam_id:token',
+            'subject_abbr': 'subject_abbr:token',
+            'concept': 'cpt_seq:token_seq'
+        })
+
+        # The integer access time doubles as the interaction order id.
+        df_inter['order_id:token'] = df_inter['start_timestamp:float'].astype(int)
+
+        # save
+        df_inter.to_csv(f"{self.midpath}/{self.dt}.inter.csv", index=False, encoding='utf-8')
+        df_stu.to_csv(f"{self.midpath}/{self.dt}.stu.csv", index=False, encoding='utf-8')
+        df_exer.to_csv(f"{self.midpath}/{self.dt}.exer.csv", index=False, encoding='utf-8')
+
+    @staticmethod
+    def convert2timestamp(dt):
+        """Convert a ``"%Y-%m-%d %H:%M:%S"`` string into a POSIX timestamp.
+
+        ``time.mktime`` interprets the parsed time as local time, so the
+        returned float depends on the time zone of the machine running it.
+        """
+        timeArray = time.strptime(dt, "%Y-%m-%d %H:%M:%S")
+        return time.mktime(timeArray)
diff --git a/edustudio/atom_op/raw2mid/slp_math.py b/edustudio/atom_op/raw2mid/slp_math.py
@@ -0,0 +1,86 @@
+from edustudio.atom_op.raw2mid import BaseRaw2Mid
+import pandas as pd
+import numpy as np
+import time
+
+"""
+    SLP Dataset: https://aic-fe.bnu.edu.cn/en/data/index.html
+"""
+
+class R2M_SLP_Math(BaseRaw2Mid):
+    """Convert the raw SLP Math term data into middata CSV files.
+
+    rawdata: https://aic-fe.bnu.edu.cn/en/data/index.html
+
+    Reads ``student.csv``, ``family.csv``, ``school.csv`` and ``term-mat.csv``
+    from ``self.rawpath`` and writes ``{dt}.inter.csv``, ``{dt}.stu.csv`` and
+    ``{dt}.exer.csv`` into ``self.midpath``.
+    """
+
+    def process(self):
+        """Build the student, interaction and exercise tables and save them."""
+        super().process()
+
+        # for stu
+        df_stu = pd.read_csv(f"{self.rawpath}/student.csv")
+        # Keep only students with a usable school id (neither NaN nor 'n.a.').
+        df_stu = df_stu.dropna(subset=['school_id'], how='any', axis=0)
+        df_stu = df_stu[df_stu['school_id'] != 'n.a.']
+
+        # Attach family and school attributes; inner joins drop students
+        # that have no matching family or school record.
+        df_stu = df_stu.merge(
+            pd.read_csv(f"{self.rawpath}/family.csv", index_col=False),
+            on=['student_id'], how='inner'
+        )
+        df_stu = df_stu.merge(
+            pd.read_csv(f"{self.rawpath}/school.csv"),
+            on=['school_id'], how='inner'
+        )
+
+        # The two teacher-rate columns are not exported to the middata.
+        df_stu = df_stu.drop(columns=[
+            'rate_of_higher_educated_teachers',
+            "rate_of_teachers_with_master's_degree_and_above",
+        ])
+        # NOTE(review): 'age_mother' is typed as token while 'age_father' is
+        # float -- looks like a typo, but renaming would change the middata
+        # schema, so it is left as is. Confirm with the dataset owner.
+        df_stu = df_stu.rename(columns={
+            'student_id': 'stu_id:token', 'gender': 'gender:token',
+            'school_id': 'sch_id:token', 'class_id': 'class_id:token',
+            'age_father': 'age_father:float', 'age_mother': 'age_mother:token',
+            'edubg_father': 'edubg_father:token', 'edubg_mother': 'edubg_mother:token',
+            'affiliation_father': 'affiliation_father:token',
+            'affiliation_mother': 'affiliation_mother:token',
+            'family_income': 'family_income:token', 'is_only_child': 'is_only_child:token',
+            'live_on_campus': 'live_on_campus:token',
+            'gathering_frequency_father': 'gathering_frequency_father:token',
+            'gathering_frequency_mother': 'gathering_frequency_mother:token',
+            'family_traveling_times': "family_traveling_times:token",
+            'school_type': 'school_type:token',
+            'dist_to_downtown': 'dist_to_downtown:float',
+        })
+
+        # for inter
+        df_inter = pd.read_csv(f"{self.rawpath}/term-mat.csv", index_col=False)
+        # Drop interactions without a concept; copy so the column assignments
+        # below do not operate on a slice of the frame read from disk.
+        df_inter = df_inter[df_inter['concept'] != 'n.a.'].copy()
+        # Explicit float casts keep the division well defined even if pandas
+        # read one of the columns as strings.
+        df_inter['label'] = df_inter['score'].astype(float) / df_inter['full_score'].astype(float)
+
+        # One row per question; the first occurrence supplies the metadata.
+        df_exer = (
+            df_inter[['question_id', 'exam_id', 'subject_abbr', 'concept']]
+            .drop_duplicates(subset=['question_id'])
+            .copy()
+        )
+        # Concepts are ';'-separated in the raw file.
+        df_exer['concept'] = df_exer['concept'].apply(lambda x: x.split(";"))
+
+        df_inter = df_inter[
+            ['student_id', 'question_id', 'score', 'full_score', 'time_access', 'label']
+        ].copy()
+        df_inter['time_access'] = df_inter['time_access'].apply(self.convert2timestamp)
+
+        df_inter = df_inter.rename(columns={
+            'student_id': 'stu_id:token', 'question_id': 'exer_id:token',
+            'score': 'score:float', 'full_score': 'full_score:float',
+            'time_access': 'start_timestamp:float', 'label': 'label:float'
+        })
+
+        df_exer = df_exer.rename(columns={
+            'question_id': 'exer_id:token',
+            'exam_id': 'exam_id:token',
+            'subject_abbr': 'subject_abbr:token',
+            'concept': 'cpt_seq:token_seq'
+        })
+
+        # The integer access time doubles as the interaction order id.
+        df_inter['order_id:token'] = df_inter['start_timestamp:float'].astype(int)
+
+        # save
+        df_inter.to_csv(f"{self.midpath}/{self.dt}.inter.csv", index=False, encoding='utf-8')
+        df_stu.to_csv(f"{self.midpath}/{self.dt}.stu.csv", index=False, encoding='utf-8')
+        df_exer.to_csv(f"{self.midpath}/{self.dt}.exer.csv", index=False, encoding='utf-8')
+
+    @staticmethod
+    def convert2timestamp(dt):
+        """Convert a ``"%Y-%m-%d %H:%M:%S"`` string into a POSIX timestamp.
+
+        ``time.mktime`` interprets the parsed time as local time, so the
+        returned float depends on the time zone of the machine running it.
+        """
+        timeArray = time.strptime(dt, "%Y-%m-%d %H:%M:%S")
+        return time.mktime(timeArray)
diff --git a/edustudio/datatpl/common/general_datatpl.py b/edustudio/datatpl/common/general_datatpl.py
@@ -36,7 +36,7 @@ class GeneralDataTPL(BaseDataTPL):
         'cache_id': 'cache_default',
         'load_data_from': 'middata', # ['rawdata', 'middata', 'cachedata']
         'inter_exclude_feat_names': (),
-        'raw2mid_op': None, 
+        'raw2mid_op': "None", 
         'mid2cache_op_seq': []
     }
 
@@ -456,7 +456,7 @@ def _get_r2m_op(cls, cfg):
         """
         from edustudio.atom_op.raw2mid import BaseRaw2Mid
         r2m_op = cfg.datatpl_cfg['raw2mid_op']
-        assert r2m_op is not None
+        assert r2m_op is not None or r2m_op != "None"
         if isinstance(r2m_op, str):
             r2m_op = importlib.import_module('edustudio.atom_op.raw2mid').__getattribute__(r2m_op)
         elif issubclass(r2m_op, BaseRaw2Mid):
@@ -541,13 +541,20 @@ def _preprocess_feat(df):
         for col in df.columns:
             col_name, col_type = col.split(":")
             if col_type == 'token':
-                df[col] = df[col].astype('int64')
+                # df[col] = df[col].astype('int64')
+                pass
             elif col_type == 'float':
                 df[col] = df[col].astype('float32')
             elif col_type == 'token_seq':
-                df[col] = df[col].astype(str).apply(lambda x: [int(i) for i in x.split(",")])
+                try:
+                    df[col] = df[col].astype(str).apply(lambda x: [int(i) for i in x.split(",")])
+                except:
+                    df[col] = df[col].astype(str).apply(lambda x: eval(x))
             elif col_type == 'float_seq':
-                df[col] = df[col].astype(str).apply(lambda x: [float(i) for i in x.split(",")])
+                try:
+                    df[col] = df[col].astype(str).apply(lambda x: [float(i) for i in x.split(",")])
+                except:
+                    df[col] = df[col].astype(str).apply(lambda x: eval(x))
             else:
                 raise ValueError(f"unknown field type of {col_type}")
 

diff --git a/edustudio/evaltpl/__init__.py b/edustudio/evaltpl/__init__.py
@@ -1,4 +1,4 @@
 from .base_evaltpl import BaseEvalTPL
 from .bc_evaltpl import BinaryClassificationEvalTPL
 from .cd_evaltpl import CognitiveDiagnosisEvalTPL
-
+from .fairness_evaltpl import FairnessEvalTPL
diff --git a/edustudio/evaltpl/base_evaltpl.py b/edustudio/evaltpl/base_evaltpl.py
@@ -20,6 +20,7 @@ def __init__(self, cfg):
         self.frame_cfg: UnifyConfig = cfg.frame_cfg
         self.modeltpl_cfg: UnifyConfig = cfg.modeltpl_cfg
         self.logger: logging.Logger = logging.getLogger("edustudio")
+        self._check_params()
 
     @classmethod
     def get_default_cfg(cls):

diff --git a/edustudio/evaltpl/fairness_evaltpl.py b/edustudio/evaltpl/fairness_evaltpl.py
@@ -0,0 +1,82 @@
+from .base_evaltpl import BaseEvalTPL
+import pandas as pd
+import numpy as np
+from edustudio.utils.common import tensor2npy
+
+
+class FairnessEvalTPL(BaseEvalTPL):
+    """Evaluation template computing group-fairness metrics.
+
+    Config keys (under this class's name in ``evaltpl_cfg``):
+        use_sensi_attrs: student-table columns treated as sensitive attributes.
+        use_metrics: any subset of 'EO', 'DP' and 'FCD'.
+
+    A metric is only computed for attributes that take exactly two distinct
+    values in the student table passed through ``add_extra_data``.
+    """
+    default_cfg = {
+        'use_sensi_attrs': ['gender:token'],
+        'use_metrics': ['EO', 'DP', 'FCD']
+    }
+
+    def _check_params(self):
+        """Reject metric names other than 'EO', 'DP' and 'FCD'."""
+        unknown = set(self.evaltpl_cfg[self.__class__.__name__]['use_metrics']) - {'EO', 'DP', 'FCD'}
+        assert len(unknown) == 0, f"unsupported fairness metrics: {sorted(unknown)}"
+
+    def eval(self, **kwargs):
+        """Compute the configured fairness metrics.
+
+        Args:
+            stu_id: student id of every evaluated interaction.
+            y_pd: predicted score of every interaction; binarised at 0.5.
+            y_gt: ground-truth label of every interaction.
+
+        Returns:
+            dict: ``"{metric}_{attr}"`` -> metric value, for every configured
+            attribute that has exactly two distinct values in ``df_stu``.
+        """
+        stu_id = tensor2npy(kwargs['stu_id'])
+        pd_soft = tensor2npy(kwargs['y_pd'])
+        gt = tensor2npy(kwargs['y_gt'])
+        pd_hard = (pd_soft >= 0.5).astype(np.int64)
+
+        cfg = self.evaltpl_cfg[self.__class__.__name__]
+        df_stu = self.extra_data['df_stu']
+
+        df = pd.DataFrame()
+        df['stu_id:token'] = stu_id
+        df['pd_soft'] = pd_soft
+        df['pd_hard'] = pd_hard
+        df['gt'] = gt
+        # Left join keeps every interaction, even for students that are
+        # missing from the student table.
+        df = df.merge(df_stu, on='stu_id:token', how='left')
+
+        metric_funcs = {'EO': self.get_eo, 'DP': self.get_dp, 'FCD': self.get_fcd}
+        ret_dic = {}
+        for attr in cfg['use_sensi_attrs']:
+            # Only binary attributes are supported; others are skipped
+            # (add_extra_data has already warned about them).
+            if len(df_stu[attr].unique()) != 2:
+                continue
+            for use_metric in cfg['use_metrics']:
+                if use_metric in metric_funcs:
+                    ret_dic[f"{use_metric}_{attr}"] = metric_funcs[use_metric](df, attr)
+        return ret_dic
+
+    @staticmethod
+    def _group_gap(per_group):
+        """Return ``|a - b|`` for a Series holding one value per group.
+
+        Groups are taken by position, so the result does not depend on how
+        the two groups are labelled (0/1, 1/2, 'F'/'M', ...). Returns NaN
+        when the Series does not contain exactly two groups.
+        """
+        if len(per_group) != 2:
+            return float('nan')
+        return abs(per_group.iloc[0] - per_group.iloc[1])
+
+    def get_dp(self, df, sensitive_attr):
+        """Demographic Parity
+
+        Absolute gap between the two groups' mean hard prediction.
+        """
+        return self._group_gap(df.groupby(sensitive_attr)['pd_hard'].mean())
+
+    def get_eo(self, df, sensitive_attr):
+        """Equal Opportunity
+
+        Absolute gap between the two groups' mean hard prediction, computed
+        over the interactions whose ground truth equals 1.
+        """
+        positives = df[df['gt'] == 1]
+        return self._group_gap(positives.groupby(sensitive_attr)['pd_hard'].mean())
+
+    def get_fcd(self, df, sensitive_attr):
+        """Fair Cognitive Diagnosis [1]
+        [1]zhang zheng, et al, Understanding and Improving Fairness in Cognitive Diagnosis,  SCIENCE CHINA Information Sciences, 2023, ISSN 1674-733X, https://doi.org/10.1007/s11432-022-3852-0.
+
+        Per-student means are averaged within each group; the result is the
+        absolute difference between the predicted and the ground-truth
+        between-group gaps. Returns NaN unless exactly two groups are present.
+        """
+        per_stu = df.groupby([sensitive_attr, 'stu_id:token'])[['pd_hard', 'gt']].mean()
+        per_group = per_stu.groupby(level=0).mean()
+        if len(per_group) != 2:
+            return float('nan')
+        # Signed gap (first group minus second) for both columns at once.
+        gap = per_group.iloc[0] - per_group.iloc[1]
+        return abs(gap['pd_hard'] - gap['gt'])
+
+    def add_extra_data(self, **kwargs):
+        """Store the extra data and validate the sensitive attributes.
+
+        Expects ``df_stu`` among the keyword arguments. Every configured
+        attribute must be a column of ``df_stu`` with at least two distinct
+        values; attributes with more than two values trigger a warning for
+        each configured metric because those metrics are skipped by ``eval``.
+        """
+        self.extra_data = kwargs
+
+        df_stu = self.extra_data['df_stu']
+        assert df_stu is not None
+        cfg = self.evaltpl_cfg[self.__class__.__name__]
+        for attr in cfg['use_sensi_attrs']:
+            assert attr in df_stu
+            g_names = df_stu[attr].unique()
+
+            for use_metric in cfg['use_metrics']:
+                assert len(g_names) >= 2
+                if len(g_names) > 2 and use_metric in {'EO', 'DP', 'FCD'}:
+                    self.logger.warning(f"As the number of sensitive attribute `{attr}` values > 2, the fairness metric {use_metric} is not supported for the `{attr}`")
diff --git a/edustudio/traintpl/general_traintpl.py b/edustudio/traintpl/general_traintpl.py
@@ -111,17 +111,21 @@ def fit(self, train_loader, valid_loader):
     @torch.no_grad()
     def evaluate(self, loader):
         self.model.eval()
+        stu_id_list = list(range(len(loader)))
         pd_list = list(range(len(loader)))
         gt_list = list(range(len(loader)))
         for idx, batch_dict in enumerate(tqdm(loader, ncols=self.frame_cfg['TQDM_NCOLS'], desc="[PREDICT]")):
             batch_dict = self.batch_dict2device(batch_dict)
             eval_dict = self.model.predict(**batch_dict)
+            stu_id_list[idx] = batch_dict['stu_id']
             pd_list[idx] = eval_dict['y_pd']
             gt_list[idx] = eval_dict['y_gt'] if 'y_gt' in eval_dict else batch_dict['label']
         y_pd = torch.hstack(pd_list)
         y_gt = torch.hstack(gt_list)
+        stu_id = torch.hstack(stu_id_list)
 
         eval_data_dict = {
+            'stu_id': stu_id,
             'y_pd': y_pd,
             'y_gt': y_gt,
         }
@@ -142,17 +146,21 @@ def evaluate(self, loader):
     @torch.no_grad()
     def inference(self, loader):
         self.model.eval()
+        stu_id_list = list(range(len(loader)))
         pd_list = list(range(len(loader)))
         gt_list = list(range(len(loader)))
         for idx, batch_dict in enumerate(tqdm(loader, ncols=self.frame_cfg['TQDM_NCOLS'], desc="[PREDICT]")):
             batch_dict = self.batch_dict2device(batch_dict)
             eval_dict = self.model.predict(**batch_dict)
+            stu_id_list[idx] = batch_dict['stu_id']
             pd_list[idx] = eval_dict['y_pd']
             gt_list[idx] = eval_dict['y_gt'] if 'y_gt' in eval_dict else batch_dict['label']
         y_pd = torch.hstack(pd_list)
         y_gt = torch.hstack(gt_list)
+        stu_id = torch.hstack(stu_id_list)
 
         eval_data_dict = {
+            'stu_id': stu_id,
             'y_pd': y_pd,
             'y_gt': y_gt,
         }
-Original file line number
+Diff line change
@@ Expand Up / @@ -104,5 +104,3 @@ def set_dt_info(self, dt_info, **kwargs): @@
                                     dt_info['cpt_count'] = max(dt_info.get('cpt_count', -1), df[col].max() + 1)
                                 else:
                                     dt_info['cpt_count'] = max(dt_info.get('cpt_count', -1), np.max(list(chain(*df[col].to_list()))) + 1)
-                a = 1