Skip to content

Commit

Permalink
add FairnessEvalTPL and SLP_English, SLP_Math dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
kervias committed Feb 3, 2024
1 parent c7b1280 commit 85536e0
Show file tree
Hide file tree
Showing 10 changed files with 285 additions and 19 deletions.
2 changes: 0 additions & 2 deletions edustudio/atom_op/mid2cache/CD/data_split4cd.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,5 +104,3 @@ def set_dt_info(self, dt_info, **kwargs):
dt_info['cpt_count'] = max(dt_info.get('cpt_count', -1), df[col].max() + 1)
else:
dt_info['cpt_count'] = max(dt_info.get('cpt_count', -1), np.max(list(chain(*df[col].to_list()))) + 1)

a = 1
10 changes: 0 additions & 10 deletions edustudio/atom_op/mid2cache/common/build_dtinfo.py

This file was deleted.

5 changes: 4 additions & 1 deletion edustudio/atom_op/raw2mid/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
from .nips12 import R2M_Eedi_20_T12
from .nips34 import R2M_Eedi_20_T34
from .simulated5 import R2M_Simulated5

from .slp_english import R2M_SLP_English
from .slp_math import R2M_SLP_Math

# look up api dict
_cli_api_dict_ = {}
Expand All @@ -35,3 +36,5 @@
_cli_api_dict_['R2M_Eedi_20_T12'] = R2M_Eedi_20_T12.from_cli
_cli_api_dict_['R2M_Eedi_20_T34'] = R2M_Eedi_20_T34.from_cli
_cli_api_dict_['R2M_Simulated5'] = R2M_Simulated5.from_cli
_cli_api_dict_['R2M_SLP_Math'] = R2M_SLP_Math.from_cli
_cli_api_dict_['R2M_SLP_English'] = R2M_SLP_English.from_cli
91 changes: 91 additions & 0 deletions edustudio/atom_op/raw2mid/slp_english.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
from edustudio.atom_op.raw2mid import BaseRaw2Mid
import pandas as pd
import numpy as np
import time

"""
SLP Dataset: https://aic-fe.bnu.edu.cn/en/data/index.html
"""


class R2M_SLP_English(BaseRaw2Mid):
    """Raw-to-middle converter for the English subset of the SLP dataset.

    rawdata: https://aic-fe.bnu.edu.cn/en/data/index.html
    """

    def process(self):
        """Build the student, interaction and exercise tables and save them as CSV.

        Reads ``student.csv``, ``family.csv``, ``school.csv`` and
        ``term-eng.csv`` from ``self.rawpath`` and writes
        ``{self.dt}.inter.csv``, ``{self.dt}.stu.csv`` and
        ``{self.dt}.exer.csv`` into ``self.midpath``.
        """
        super().process()

        # for stu: keep only students with a known school, then attach the
        # family and school attributes (inner joins drop unmatched students).
        df_stu = pd.read_csv(f"{self.rawpath}/student.csv")
        df_stu = df_stu.dropna(subset=['school_id'], how='any', axis=0)
        df_stu = df_stu[df_stu['school_id'] != 'n.a.']
        df_stu = df_stu.merge(
            pd.read_csv(f"{self.rawpath}/family.csv", index_col=False),
            on=['student_id'], how='inner'
        )
        df_stu = df_stu.merge(
            pd.read_csv(f"{self.rawpath}/school.csv"),
            on=['school_id'], how='inner'
        )
        # The two teacher-qualification rates are not exported.
        df_stu = df_stu.drop(columns=[
            'rate_of_higher_educated_teachers',
            "rate_of_teachers_with_master's_degree_and_above",
        ])
        # NOTE(review): `age_father` is typed float while `age_mother` is typed
        # token; this looks unintentional, but it is kept because the column
        # names are part of the emitted schema -- confirm before changing.
        df_stu = df_stu.rename(columns={
            'student_id': 'stu_id:token', 'gender': 'gender:token',
            'school_id': 'sch_id:token', 'class_id': 'class_id:token',
            'age_father': 'age_father:float', 'age_mother': 'age_mother:token',
            'edubg_father': 'edubg_father:token', 'edubg_mother': 'edubg_mother:token',
            'affiliation_father': 'affiliation_father:token',
            'affiliation_mother': 'affiliation_mother:token',
            'family_income': 'family_income:token', 'is_only_child': 'is_only_child:token',
            'live_on_campus': 'live_on_campus:token',
            'gathering_frequency_father': 'gathering_frequency_father:token',
            'gathering_frequency_mother': 'gathering_frequency_mother:token',
            'family_traveling_times': "family_traveling_times:token",
            'school_type': 'school_type:token',
            'dist_to_downtown': 'dist_to_downtown:float',
        })

        # for inter
        df_inter = pd.read_csv(f"{self.rawpath}/term-eng.csv", index_col=False, low_memory=False)
        # Drop every row holding the 'n.a.' placeholder in any column; this
        # also guarantees that `concept` is never 'n.a.' when it is split below.
        has_placeholder = (df_inter == 'n.a.').any(axis=1)
        df_inter = df_inter[~has_placeholder].reset_index(drop=True)
        # Cast both operands: a column that contained 'n.a.' is read as
        # strings and stays that way after the placeholder rows are removed.
        df_inter['label'] = df_inter['score'].astype(float) / df_inter['full_score'].astype(float)

        # Work on explicit copies so the column assignments below do not
        # write into a slice of the full interaction table.
        df_exer = (
            df_inter[['question_id', 'exam_id', 'subject_abbr', 'concept']]
            .drop_duplicates(subset=['question_id'])
            .copy()
        )
        df_inter = df_inter[
            ['student_id', 'question_id', 'score', 'full_score', 'time_access', 'label']
        ].copy()
        df_exer['concept'] = df_exer['concept'].apply(lambda x: x.split(";"))
        df_inter['time_access'] = df_inter['time_access'].apply(self.convert2timestamp)

        df_inter = df_inter.rename(columns={
            'student_id': 'stu_id:token', 'question_id': 'exer_id:token',
            'score': 'score:float', 'full_score': 'full_score:float',
            'time_access': 'start_timestamp:float', 'label': 'label:float'
        })
        df_exer = df_exer.rename(columns={
            'question_id': 'exer_id:token',
            'exam_id': 'exam_id:token',
            'subject_abbr': 'subject_abbr:token',
            'concept': 'cpt_seq:token_seq'
        })

        # The integer access time doubles as the ordering key of an interaction.
        df_inter['order_id:token'] = df_inter['start_timestamp:float'].astype(int)

        # save
        df_inter.to_csv(f"{self.midpath}/{self.dt}.inter.csv", index=False, encoding='utf-8')
        df_stu.to_csv(f"{self.midpath}/{self.dt}.stu.csv", index=False, encoding='utf-8')
        df_exer.to_csv(f"{self.midpath}/{self.dt}.exer.csv", index=False, encoding='utf-8')

    @staticmethod
    def convert2timestamp(dt):
        """Convert a ``"%Y-%m-%d %H:%M:%S"`` string into a float timestamp.

        NOTE: ``time.mktime`` interprets the parsed time as local time, so the
        value depends on the time zone of the machine running the conversion.
        """
        return time.mktime(time.strptime(dt, "%Y-%m-%d %H:%M:%S"))
86 changes: 86 additions & 0 deletions edustudio/atom_op/raw2mid/slp_math.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
from edustudio.atom_op.raw2mid import BaseRaw2Mid
import pandas as pd
import numpy as np
import time

"""
SLP Dataset: https://aic-fe.bnu.edu.cn/en/data/index.html
"""

class R2M_SLP_Math(BaseRaw2Mid):
    """Raw-to-middle converter for the Math subset of the SLP dataset.

    rawdata: https://aic-fe.bnu.edu.cn/en/data/index.html
    """

    def process(self):
        """Build the student, interaction and exercise tables and save them as CSV.

        Reads ``student.csv``, ``family.csv``, ``school.csv`` and
        ``term-mat.csv`` from ``self.rawpath`` and writes
        ``{self.dt}.inter.csv``, ``{self.dt}.stu.csv`` and
        ``{self.dt}.exer.csv`` into ``self.midpath``.
        """
        super().process()

        # for stu: keep only students with a known school, then attach the
        # family and school attributes (inner joins drop unmatched students).
        df_stu = pd.read_csv(f"{self.rawpath}/student.csv")
        df_stu = df_stu.dropna(subset=['school_id'], how='any', axis=0)
        df_stu = df_stu[df_stu['school_id'] != 'n.a.']
        df_stu = df_stu.merge(
            pd.read_csv(f"{self.rawpath}/family.csv", index_col=False),
            on=['student_id'], how='inner'
        )
        df_stu = df_stu.merge(
            pd.read_csv(f"{self.rawpath}/school.csv"),
            on=['school_id'], how='inner'
        )
        # The two teacher-qualification rates are not exported.
        df_stu = df_stu.drop(columns=[
            'rate_of_higher_educated_teachers',
            "rate_of_teachers_with_master's_degree_and_above",
        ])
        # NOTE(review): `age_father` is typed float while `age_mother` is typed
        # token; this looks unintentional, but it is kept because the column
        # names are part of the emitted schema -- confirm before changing.
        df_stu = df_stu.rename(columns={
            'student_id': 'stu_id:token', 'gender': 'gender:token',
            'school_id': 'sch_id:token', 'class_id': 'class_id:token',
            'age_father': 'age_father:float', 'age_mother': 'age_mother:token',
            'edubg_father': 'edubg_father:token', 'edubg_mother': 'edubg_mother:token',
            'affiliation_father': 'affiliation_father:token',
            'affiliation_mother': 'affiliation_mother:token',
            'family_income': 'family_income:token', 'is_only_child': 'is_only_child:token',
            'live_on_campus': 'live_on_campus:token',
            'gathering_frequency_father': 'gathering_frequency_father:token',
            'gathering_frequency_mother': 'gathering_frequency_mother:token',
            'family_traveling_times': "family_traveling_times:token",
            'school_type': 'school_type:token',
            'dist_to_downtown': 'dist_to_downtown:float',
        })

        # for inter: only rows with a known concept are usable.
        df_inter = pd.read_csv(f"{self.rawpath}/term-mat.csv", index_col=False)
        # `.copy()` so the `label` assignment does not write into a slice.
        df_inter = df_inter[df_inter['concept'] != 'n.a.'].copy()
        # Explicit float casts keep the division well-defined even when a
        # score column was read as strings.
        df_inter['label'] = df_inter['score'].astype(float) / df_inter['full_score'].astype(float)

        # Work on explicit copies so the column assignments below do not
        # write into a slice of the full interaction table.
        df_exer = (
            df_inter[['question_id', 'exam_id', 'subject_abbr', 'concept']]
            .drop_duplicates(subset=['question_id'])
            .copy()
        )
        df_inter = df_inter[
            ['student_id', 'question_id', 'score', 'full_score', 'time_access', 'label']
        ].copy()
        df_exer['concept'] = df_exer['concept'].apply(lambda x: x.split(";"))
        df_inter['time_access'] = df_inter['time_access'].apply(self.convert2timestamp)

        df_inter = df_inter.rename(columns={
            'student_id': 'stu_id:token', 'question_id': 'exer_id:token',
            'score': 'score:float', 'full_score': 'full_score:float',
            'time_access': 'start_timestamp:float', 'label': 'label:float'
        })
        df_exer = df_exer.rename(columns={
            'question_id': 'exer_id:token',
            'exam_id': 'exam_id:token',
            'subject_abbr': 'subject_abbr:token',
            'concept': 'cpt_seq:token_seq'
        })

        # The integer access time doubles as the ordering key of an interaction.
        df_inter['order_id:token'] = df_inter['start_timestamp:float'].astype(int)

        # save
        df_inter.to_csv(f"{self.midpath}/{self.dt}.inter.csv", index=False, encoding='utf-8')
        df_stu.to_csv(f"{self.midpath}/{self.dt}.stu.csv", index=False, encoding='utf-8')
        df_exer.to_csv(f"{self.midpath}/{self.dt}.exer.csv", index=False, encoding='utf-8')

    @staticmethod
    def convert2timestamp(dt):
        """Convert a ``"%Y-%m-%d %H:%M:%S"`` string into a float timestamp.

        NOTE: ``time.mktime`` interprets the parsed time as local time, so the
        value depends on the time zone of the machine running the conversion.
        """
        return time.mktime(time.strptime(dt, "%Y-%m-%d %H:%M:%S"))
17 changes: 12 additions & 5 deletions edustudio/datatpl/common/general_datatpl.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ class GeneralDataTPL(BaseDataTPL):
'cache_id': 'cache_default',
'load_data_from': 'middata', # ['rawdata', 'middata', 'cachedata']
'inter_exclude_feat_names': (),
'raw2mid_op': None,
'raw2mid_op': "None",
'mid2cache_op_seq': []
}

Expand Down Expand Up @@ -456,7 +456,7 @@ def _get_r2m_op(cls, cfg):
"""
from edustudio.atom_op.raw2mid import BaseRaw2Mid
r2m_op = cfg.datatpl_cfg['raw2mid_op']
assert r2m_op is not None
# Reject both the None object and the "None" string placeholder; joined with
# `or` this check was always true and never caught a missing raw2mid_op.
assert r2m_op is not None and r2m_op != "None"
if isinstance(r2m_op, str):
r2m_op = importlib.import_module('edustudio.atom_op.raw2mid').__getattribute__(r2m_op)
elif issubclass(r2m_op, BaseRaw2Mid):
Expand Down Expand Up @@ -541,13 +541,20 @@ def _preprocess_feat(df):
for col in df.columns:
col_name, col_type = col.split(":")
if col_type == 'token':
df[col] = df[col].astype('int64')
# df[col] = df[col].astype('int64')
pass
elif col_type == 'float':
df[col] = df[col].astype('float32')
elif col_type == 'token_seq':
df[col] = df[col].astype(str).apply(lambda x: [int(i) for i in x.split(",")])
try:
df[col] = df[col].astype(str).apply(lambda x: [int(i) for i in x.split(",")])
except:
df[col] = df[col].astype(str).apply(lambda x: eval(x))
elif col_type == 'float_seq':
df[col] = df[col].astype(str).apply(lambda x: [float(i) for i in x.split(",")])
try:
df[col] = df[col].astype(str).apply(lambda x: [float(i) for i in x.split(",")])
except:
df[col] = df[col].astype(str).apply(lambda x: eval(x))
else:
raise ValueError(f"unknown field type of {col_type}")

Expand Down
2 changes: 1 addition & 1 deletion edustudio/evaltpl/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .base_evaltpl import BaseEvalTPL
from .bc_evaltpl import BinaryClassificationEvalTPL
from .cd_evaltpl import CognitiveDiagnosisEvalTPL

from .fairness_evaltpl import FairnessEvalTPL
1 change: 1 addition & 0 deletions edustudio/evaltpl/base_evaltpl.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ def __init__(self, cfg):
self.frame_cfg: UnifyConfig = cfg.frame_cfg
self.modeltpl_cfg: UnifyConfig = cfg.modeltpl_cfg
self.logger: logging.Logger = logging.getLogger("edustudio")
self._check_params()

@classmethod
def get_default_cfg(cls):
Expand Down
82 changes: 82 additions & 0 deletions edustudio/evaltpl/fairness_evaltpl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
from .base_evaltpl import BaseEvalTPL
import pandas as pd
import numpy as np
from edustudio.utils.common import tensor2npy


class FairnessEvalTPL(BaseEvalTPL):
    """Evaluation template reporting group-fairness metrics.

    Supported metrics are 'EO', 'DP' and 'FCD'. A metric is only computed for
    sensitive attributes that take exactly two distinct values in the student
    table passed through ``add_extra_data``.
    """
    default_cfg = {
        'use_sensi_attrs': ['gender:token'],
        'use_metrics': ['EO', 'DP', 'FCD']
    }

    def _check_params(self):
        """Assert that every configured metric is one of 'EO', 'DP', 'FCD'."""
        unknown = set(self.evaltpl_cfg[self.__class__.__name__]['use_metrics']) - {'EO', 'DP', 'FCD'}
        assert len(unknown) == 0, f"unsupported fairness metrics: {unknown}"

    def eval(self, **kwargs):
        """Compute the configured fairness metrics on one set of predictions.

        Args:
            **kwargs: must contain ``stu_id``, ``y_pd`` and ``y_gt``; each is
                converted with ``tensor2npy``.

        Returns:
            dict mapping ``"{metric}_{attr}"`` to the metric value, for every
            configured attribute with exactly two distinct values.
        """
        cfg = self.evaltpl_cfg[self.__class__.__name__]
        pd_soft = tensor2npy(kwargs['y_pd'])

        df = pd.DataFrame()
        df['stu_id:token'] = tensor2npy(kwargs['stu_id'])
        df['pd_soft'] = pd_soft
        # Hard prediction: threshold the score at 0.5.
        df['pd_hard'] = (pd_soft >= 0.5).astype(np.int64)
        df['gt'] = tensor2npy(kwargs['y_gt'])

        df_stu = self.extra_data['df_stu']
        df = df.merge(df_stu, on='stu_id:token', how='left')

        metric_funcs = {'EO': self.get_eo, 'DP': self.get_dp, 'FCD': self.get_fcd}
        ret_dic = {}
        for attr in cfg['use_sensi_attrs']:
            # Only binary attributes are supported; others are skipped here
            # (add_extra_data logs the warning for them).
            if len(df_stu[attr].unique()) != 2:
                continue
            for use_metric in cfg['use_metrics']:
                metric_func = metric_funcs.get(use_metric)
                if metric_func is not None:
                    ret_dic[f"{use_metric}_{attr}"] = metric_func(df, attr)
        return ret_dic

    @staticmethod
    def _two_group_gap(per_group, sensitive_attr):
        """Return ``first - second`` for a per-group result with exactly two rows.

        Rows are addressed by position rather than by the labels 0/1, so any
        coding of the sensitive attribute works.

        Raises:
            ValueError: if the result does not hold exactly two groups.
        """
        if len(per_group) != 2:
            raise ValueError(
                f"expected exactly 2 groups for `{sensitive_attr}` in the evaluated data, "
                f"got {len(per_group)}"
            )
        return per_group.iloc[0] - per_group.iloc[1]

    def get_dp(self, df, sensitive_attr):
        """Demographic Parity

        Absolute difference between the two groups' mean hard prediction.
        """
        rates = df.groupby(sensitive_attr)['pd_hard'].mean()
        return abs(self._two_group_gap(rates, sensitive_attr))

    def get_eo(self, df, sensitive_attr):
        """Equal Opportunity

        Absolute difference between the two groups' mean hard prediction,
        restricted to samples whose ground truth equals 1.
        """
        positives = df[df['gt'] == 1]
        rates = positives.groupby(sensitive_attr)['pd_hard'].mean()
        return abs(self._two_group_gap(rates, sensitive_attr))

    def get_fcd(self, df, sensitive_attr):
        """Fair Cognitive Diagnosis [1]

        Averages ``pd_hard`` and ``gt`` per student, then per group, and
        returns the absolute difference between the predicted and the
        ground-truth group gap.

        [1]zhang zheng, et al, Understanding and Improving Fairness in Cognitive Diagnosis, SCIENCE CHINA Information Sciences, 2023, ISSN 1674-733X, https://doi.org/10.1007/s11432-022-3852-0.
        """
        per_student = df.groupby([sensitive_attr, 'stu_id:token'])[['pd_hard', 'gt']].mean()
        per_group = per_student.groupby(level=0).mean()
        gap = self._two_group_gap(per_group, sensitive_attr)
        return abs(gap['pd_hard'] - gap['gt'])

    def add_extra_data(self, **kwargs):
        """Store the extra data and validate the configured sensitive attributes.

        ``kwargs`` must contain ``df_stu``. Every configured attribute has to
        be a column of it with at least two distinct values; attributes with
        more than two values only trigger a warning per configured metric.
        """
        self.extra_data = kwargs

        df_stu = self.extra_data['df_stu']
        assert df_stu is not None
        cfg = self.evaltpl_cfg[self.__class__.__name__]
        for attr in cfg['use_sensi_attrs']:
            assert attr in df_stu
            g_names = df_stu[attr].unique()

            for use_metric in cfg['use_metrics']:
                assert len(g_names) >= 2
                if len(g_names) > 2 and use_metric in {'EO', 'DP', 'FCD'}:
                    self.logger.warning(f"As the number of sensitive attribute `{attr}` values > 2, the fairness metric {use_metric} is not supported for the `{attr}`")
8 changes: 8 additions & 0 deletions edustudio/traintpl/general_traintpl.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,17 +111,21 @@ def fit(self, train_loader, valid_loader):
@torch.no_grad()
def evaluate(self, loader):
self.model.eval()
stu_id_list = list(range(len(loader)))
pd_list = list(range(len(loader)))
gt_list = list(range(len(loader)))
for idx, batch_dict in enumerate(tqdm(loader, ncols=self.frame_cfg['TQDM_NCOLS'], desc="[PREDICT]")):
batch_dict = self.batch_dict2device(batch_dict)
eval_dict = self.model.predict(**batch_dict)
stu_id_list[idx] = batch_dict['stu_id']
pd_list[idx] = eval_dict['y_pd']
gt_list[idx] = eval_dict['y_gt'] if 'y_gt' in eval_dict else batch_dict['label']
y_pd = torch.hstack(pd_list)
y_gt = torch.hstack(gt_list)
stu_id = torch.hstack(stu_id_list)

eval_data_dict = {
'stu_id': stu_id,
'y_pd': y_pd,
'y_gt': y_gt,
}
Expand All @@ -142,17 +146,21 @@ def evaluate(self, loader):
@torch.no_grad()
def inference(self, loader):
self.model.eval()
stu_id_list = list(range(len(loader)))
pd_list = list(range(len(loader)))
gt_list = list(range(len(loader)))
for idx, batch_dict in enumerate(tqdm(loader, ncols=self.frame_cfg['TQDM_NCOLS'], desc="[PREDICT]")):
batch_dict = self.batch_dict2device(batch_dict)
eval_dict = self.model.predict(**batch_dict)
stu_id_list[idx] = batch_dict['stu_id']
pd_list[idx] = eval_dict['y_pd']
gt_list[idx] = eval_dict['y_gt'] if 'y_gt' in eval_dict else batch_dict['label']
y_pd = torch.hstack(pd_list)
y_gt = torch.hstack(gt_list)
stu_id = torch.hstack(stu_id_list)

eval_data_dict = {
'stu_id': stu_id,
'y_pd': y_pd,
'y_gt': y_gt,
}
Expand Down

0 comments on commit 85536e0

Please sign in to comment.