Skip to content

Commit

Permalink
add data_split4kt and rename cpt to KC for some atomic ops
Browse files Browse the repository at this point in the history
  • Loading branch information
kervias committed Feb 3, 2024
1 parent 6eec637 commit 79e6efb
Show file tree
Hide file tree
Showing 27 changed files with 161 additions and 132 deletions.
2 changes: 1 addition & 1 deletion docs/source/developer_guide/customize_evaltpl.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ This EvalTPL is for the model evaluation using binary classification metrics.
The protocols in ``PredictionEvalTPL`` are listed as follows.


### InterpretabilityEvalT
### InterpretabilityEvalTPL
This EvalTPL is for the model evaluation for interpretability. It uses states of students and Q matrix for ``eval``, which are domain-specific in student assessment.

## Develop a New EvalTPL in EduStudio
Expand Down
6 changes: 3 additions & 3 deletions docs/source/features/atomic_operations.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ In the following, we give a table to display existing atomic operations.
| name | description |
| ---------------------- | ------------------------------------------- |
| M2C_BuildSeqInterFeats | Build Sequential Features and Split dataset |
| M2C_CptAsExer | Treat knowledge concept as exercise |
| M2C_GenCptSeq | Generate knowledge concept seq |
| M2C_GenUnFoldCptSeq | Unfold knowledge concepts |
| M2C_KCAsExer | Treat knowledge concept as exercise |
| M2C_GenKCSeq | Generate knowledge concept seq |
| M2C_GenUnFoldKCSeq | Unfold knowledge concepts |

7 changes: 4 additions & 3 deletions edustudio/atom_op/mid2cache/KT/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from .build_seq_inter_feats import M2C_BuildSeqInterFeats
from .cpt_as_exer import M2C_CptAsExer
from .gen_cpt_seq import M2C_GenCptSeq
from .gen_unfold_cpt_seq import M2C_GenUnFoldCptSeq
from .cpt_as_exer import M2C_KCAsExer
from .gen_cpt_seq import M2C_GenKCSeq
from .gen_unfold_cpt_seq import M2C_GenUnFoldKCSeq
from .data_split4kt import M2C_RandomDataSplit4KT
116 changes: 14 additions & 102 deletions edustudio/atom_op/mid2cache/KT/build_seq_inter_feats.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,10 @@

class M2C_BuildSeqInterFeats(BaseMid2Cache):
default_cfg = {
'seed': 2023,
'divide_by': 'stu',
'window_size': 100,
"divide_scale_list": [7,1,2],
"extra_inter_feats": []
}

def __init__(self, m2c_cfg, n_folds, is_dataset_divided) -> None:
super().__init__(m2c_cfg)
self.n_folds = n_folds
Expand All @@ -25,11 +22,7 @@ def from_cfg(cls, cfg):
n_folds = cfg.datatpl_cfg.n_folds
is_dataset_divided = cfg.datatpl_cfg.is_dataset_divided
return cls(m2c_cfg, n_folds, is_dataset_divided)

def _check_params(self):
super()._check_params()
assert self.m2c_cfg['divide_by'] in {'stu', 'time'}


def process(self, **kwargs):
df = kwargs['df']
df_train, df_valid, df_test = kwargs['df_train'], kwargs['df_valid'], kwargs['df_test']
Expand All @@ -40,96 +33,36 @@ def process(self, **kwargs):

if not self.is_dataset_divided:
assert df_train is None and df_valid is None and df_test is None
if self.m2c_cfg['divide_by'] == 'stu':
if self.n_folds == 1:
train_dict, valid_dict, test_dict = self._divide_data_df_by_stu_one_fold(df)
kwargs['df_train_folds'] = [train_dict]
kwargs['df_valid_folds'] = [valid_dict]
kwargs['df_test_folds'] = [test_dict]
else:
kwargs['df_train_folds'], kwargs['df_valid_folds'], kwargs['df_test_folds'] = self._divide_data_df_by_stu_multi_fold(df)
elif self.m2c_cfg['divide_by'] == 'time':
raise NotImplementedError
else:
raise ValueError(f"unknown divide_by: {self.m2c_cfg['divide_by']}")
self.window_size = self.m2c_cfg['window_size']
if self.m2c_cfg['window_size'] <= 0 or self.m2c_cfg['window_size'] is None:
self.window_size = df[['stu_id:token', 'exer_id:token']].groupby('stu_id:token').agg('count')['exer_id:token'].max()
self.logger.info(f"actual window size: {self.window_size}")
kwargs['df_seq'] = self.construct_df2dict(df)

else: # dataset is divided
assert df_train is not None and df_test is not None
if self.m2c_cfg['window_size'] <= 0 or self.m2c_cfg['window_size'] is None:
self.window_size = np.max([
df_train[['stu_id:token', 'exer_id:token']].groupby('stu_id:token').agg('count')['exer_id:token'].max(),
df_valid[['stu_id:token', 'exer_id:token']].groupby('stu_id:token').agg('count')['exer_id:token'].max() if df_valid is not None else 0,
df_valid[['stu_id:token', 'exer_id:token']].groupby('stu_id:token').agg('count')['exer_id:token'].max()
df_test[['stu_id:token', 'exer_id:token']].groupby('stu_id:token').agg('count')['exer_id:token'].max()
])
self.logger.info(f"actual window size: {self.window_size}")
else:
self.window_size = self.m2c_cfg['window_size']
self.logger.info(f"actual window size: {self.window_size}")

train_dict = self.construct_df2dict(df_train)
valid_dict = self.construct_df2dict(df_valid)
test_dict = self.construct_df2dict(df_test)
kwargs['df_train_folds'] = [train_dict]
kwargs['df_valid_folds'] = [valid_dict]
kwargs['df_test_folds'] = [test_dict]
kwargs['df_train_seq'] = train_dict
kwargs['df_valid_seq'] = valid_dict
kwargs['df_test_seq'] = test_dict
return kwargs

@staticmethod
def sort_records(df, col='order_id:token'):
if df is not None:
return df.sort_values(by=col, ascending=True).reset_index(drop=True)

def _divide_data_df_by_stu_one_fold(self, df: pd.DataFrame):
train_stu_id, val_stu_id, test_stu_id = SpliterUtil.divide_data_df_one_fold(
df['stu_id:token'].drop_duplicates(), seed=self.m2c_cfg['seed'], shuffle=True,
divide_scale_list=self.m2c_cfg['divide_scale_list']
)
train_df = df[df['stu_id:token'].isin(train_stu_id)]
val_df = df[df['stu_id:token'].isin(val_stu_id)] if val_stu_id is not None else None
test_df = df[df['stu_id:token'].isin(test_stu_id)]

if self.m2c_cfg['window_size'] <= 0 or self.m2c_cfg['window_size'] is None:
self.window_size = np.max([
train_df[['stu_id:token', 'exer_id:token']].groupby('stu_id:token').agg('count')['exer_id:token'].max(),
val_df[['stu_id:token', 'exer_id:token']].groupby('stu_id:token').agg('count')['exer_id:token'].max() if val_df is not None else 0,
test_df[['stu_id:token', 'exer_id:token']].groupby('stu_id:token').agg('count')['exer_id:token'].max()
])
self.logger.info(f"actual window size: {self.window_size}")
else:
self.window_size = self.m2c_cfg['window_size']

train_dict = self.construct_df2dict(train_df)
val_dict = self.construct_df2dict(val_df)
test_dict = self.construct_df2dict(test_df)
return train_dict, val_dict, test_dict

def _divide_data_df_by_stu_multi_fold(self, df: pd.DataFrame):
res = SpliterUtil.divide_data_df_one_fold(
df['stu_id:token'].drop_duplicates(), seed=self.m2c_cfg['seed'], shuffle=True,
divide_scale_list=self.m2c_cfg['divide_scale_list']
)

train_list, valid_list, test_list = [], [], []
for train_stu_id, val_stu_id, test_stu_id in zip(res):
train_df = df[df['stu_id:token'].isin(train_stu_id)]
val_df = df[df['stu_id:token'].isin(val_stu_id)] if val_stu_id is not None else None
test_df = df[df['stu_id:token'].isin(test_stu_id)]

if self.m2c_cfg['window_size'] <= 0 or self.m2c_cfg['window_size'] is None:
self.window_size = np.max([
train_df[['stu_id:token', 'exer_id:token']].groupby('stu_id:token').agg('count')['exer_id:token'].max(),
val_df[['stu_id:token', 'exer_id:token']].groupby('stu_id:token').agg('count')['exer_id:token'].max() if val_df is not None else 0,
test_df[['stu_id:token', 'exer_id:token']].groupby('stu_id:token').agg('count')['exer_id:token'].max()
])
self.logger.info(f"actual window size: {self.window_size}")
else:
self.window_size = self.m2c_cfg['window_size']

train_dict = self.construct_df2dict(train_df)
valid_dict = self.construct_df2dict(val_df)
test_dict = self.construct_df2dict(test_df)
train_list.append(train_dict)
valid_list.append(valid_dict)
test_list.append(test_dict)

return train_list, valid_list, test_list

def construct_df2dict(self, df: pd.DataFrame):
if df is None: return None
Expand Down Expand Up @@ -170,24 +103,3 @@ def construct_df2dict(self, df: pd.DataFrame):
raise NotImplementedError

return ret_dict

def set_dt_info(self, dt_info, **kwargs):
dt_info['real_window_size'] = self.window_size
if not self.is_dataset_divided:
if 'stu_id:token' in kwargs['df'].columns:
dt_info['stu_count'] = int(kwargs['df']['stu_id:token'].max() + 1)
if 'exer_id:token' in kwargs['df'].columns:
dt_info['exer_count'] = int(kwargs['df']['exer_id:token'].max() + 1)
else:
stu_count = max(kwargs['df_train']['stu_id:token'].max() + 1, kwargs['df_test']['stu_id:token'].max() + 1)
stu_count = max(kwargs['df_valid']['stu_id:token'].max() + 1, stu_count) if 'df_valid' in kwargs else stu_count

exer_count = max(kwargs['df_train']['exer_id:token'].max() + 1, kwargs['df_test']['exer_id:token'].max() + 1)
exer_count = max(kwargs['df_valid']['exer_id:token'].max() + 1, exer_count) if 'df_valid' in kwargs else exer_count

dt_info['stu_count'] = stu_count
dt_info['exer_count'] = exer_count

if kwargs.get('df_exer', None) is not None:
if 'cpt_seq:token_seq' in kwargs['df_exer']:
dt_info['cpt_count'] = len(set(list(chain(*kwargs['df_exer']['cpt_seq:token_seq'].to_list()))))
4 changes: 3 additions & 1 deletion edustudio/atom_op/mid2cache/KT/cpt_as_exer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
from itertools import chain


class M2C_CptAsExer(BaseMid2Cache):
class M2C_KCAsExer(BaseMid2Cache):
"""Knowledge Concept As Exercise
"""
default_cfg = {}

def process(self, **kwargs):
Expand Down
112 changes: 112 additions & 0 deletions edustudio/atom_op/mid2cache/KT/data_split4kt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
from ..common.base_mid2cache import BaseMid2Cache
import pandas as pd
import numpy as np
from edustudio.datatpl.utils import SpliterUtil, PadSeqUtil
from itertools import chain


class M2C_RandomDataSplit4KT(BaseMid2Cache):
    """Randomly split sequence-format KT data into train/valid/test folds.

    Operates on the dict-of-arrays produced upstream (keys such as
    ``'stu_id:token'`` and ``'exer_seq:token_seq'``), splitting by student id.
    Fills ``df_train_folds`` / ``df_valid_folds`` / ``df_test_folds`` in kwargs.
    """

    default_cfg = {
        'seed': 2023,
        'divide_by': 'stu',  # only 'stu' is implemented; 'time' raises NotImplementedError
        "divide_scale_list": [7, 1, 2],  # train/valid/test proportions for the one-fold split
    }

    def __init__(self, m2c_cfg, n_folds, is_dataset_divided) -> None:
        """Store config, fold count, and whether the dataset arrives pre-divided."""
        super().__init__(m2c_cfg)
        self.n_folds = n_folds
        self.is_dataset_divided = is_dataset_divided

    @classmethod
    def from_cfg(cls, cfg):
        """Alternate constructor: pull this op's settings out of the global config."""
        m2c_cfg = cfg.datatpl_cfg.get(cls.__name__)
        n_folds = cfg.datatpl_cfg.n_folds
        is_dataset_divided = cfg.datatpl_cfg.is_dataset_divided
        return cls(m2c_cfg, n_folds, is_dataset_divided)

    def _check_params(self):
        """Validate config: 'divide_by' must be one of the known strategies."""
        super()._check_params()
        assert self.m2c_cfg['divide_by'] in {'stu', 'time'}

    def process(self, **kwargs):
        """Split ``df_seq`` into folds, or pass through an already-divided dataset.

        Returns the kwargs dict augmented with ``df_train_folds``,
        ``df_valid_folds`` and ``df_test_folds`` (lists, one entry per fold).
        """
        df_seq = kwargs['df_seq']
        df_train_seq = kwargs.get('df_train_seq', None)
        # BUGFIX: key was misspelled 'df_validn_seq', so a pre-divided
        # validation split was silently read as None.
        df_valid_seq = kwargs.get('df_valid_seq', None)
        df_test_seq = kwargs.get('df_test_seq', None)

        if not self.is_dataset_divided:
            assert df_train_seq is None and df_valid_seq is None and df_test_seq is None
            # actual (padded) sequence length, recorded later in set_dt_info
            self.window_size = df_seq['exer_seq:token_seq'].shape[1]
            if self.m2c_cfg['divide_by'] == 'stu':
                if self.n_folds == 1:
                    train_dict, valid_dict, test_dict = self._divide_data_df_by_stu_one_fold(df_seq)
                    kwargs['df_train_folds'] = [train_dict]
                    kwargs['df_valid_folds'] = [valid_dict]
                    kwargs['df_test_folds'] = [test_dict]
                else:
                    kwargs['df_train_folds'], kwargs['df_valid_folds'], kwargs['df_test_folds'] = self._divide_data_df_by_stu_multi_fold(df_seq)
            elif self.m2c_cfg['divide_by'] == 'time':
                raise NotImplementedError
            else:
                raise ValueError(f"unknown divide_by: {self.m2c_cfg['divide_by']}")
        else:
            assert df_train_seq is not None and df_test_seq is not None
            self.window_size = df_train_seq['exer_seq:token_seq'].shape[1]
            kwargs['df_train_folds'] = [df_train_seq]
            kwargs['df_valid_folds'] = [df_valid_seq]
            kwargs['df_test_folds'] = [df_test_seq]
        return kwargs

    def _dict_index_flag(self, df_seq: dict, flag: np.array):
        """Apply one boolean mask to every array in the sequence dict."""
        return {
            k: df_seq[k][flag] for k in df_seq
        }

    def _divide_data_df_by_stu_one_fold(self, df_seq: dict):
        """Single-fold split by student id; returns (train, valid, test) dicts.

        ``valid`` is None when the configured scale list yields no validation set.
        """
        train_stu_id, valid_stu_id, test_stu_id = SpliterUtil.divide_data_df_one_fold(
            pd.DataFrame({"stu_id:token": np.unique(df_seq['stu_id:token'])}), seed=self.m2c_cfg['seed'], shuffle=True,
            divide_scale_list=self.m2c_cfg['divide_scale_list']
        )

        df_train_seq = self._dict_index_flag(df_seq, np.isin(df_seq['stu_id:token'], train_stu_id.to_numpy().flatten()))
        df_test_seq = self._dict_index_flag(df_seq, np.isin(df_seq['stu_id:token'], test_stu_id.to_numpy().flatten()))
        df_valid_seq = None
        if valid_stu_id is not None:
            df_valid_seq = self._dict_index_flag(df_seq, np.isin(df_seq['stu_id:token'], valid_stu_id.to_numpy().flatten()))

        # BUGFIX: caller in process() unpacks (train, valid, test); the original
        # returned (train, test, valid), silently swapping validation and test sets.
        return df_train_seq, df_valid_seq, df_test_seq

    def _divide_data_df_by_stu_multi_fold(self, df_seq: pd.DataFrame):
        """K-fold split by student id; returns (train_list, [], test_list).

        Multi-fold mode produces no validation folds (empty middle list).
        """
        res = SpliterUtil.divide_data_df_multi_folds(
            pd.DataFrame({"stu_id:token": np.unique(df_seq['stu_id:token'])}), seed=self.m2c_cfg['seed'], shuffle=True, n_folds=self.n_folds
        )

        train_list, test_list = [], []
        for (train_stu_id, test_stu_id) in zip(*res):
            df_train_seq = self._dict_index_flag(df_seq, np.isin(df_seq['stu_id:token'], train_stu_id.to_numpy().flatten()))
            df_test_seq = self._dict_index_flag(df_seq, np.isin(df_seq['stu_id:token'], test_stu_id.to_numpy().flatten()))
            train_list.append(df_train_seq)
            test_list.append(df_test_seq)

        return train_list, [], test_list

    def set_dt_info(self, dt_info, **kwargs):
        """Record dataset statistics (window size, stu/exer/cpt counts) into dt_info.

        NOTE(review): counts are read from the mid-format frames kwargs['df'] /
        kwargs['df_train'] etc., which are assumed to still be present in kwargs
        at this stage — confirm against the data-template pipeline.
        """
        dt_info['real_window_size'] = self.window_size
        if not self.is_dataset_divided:
            # ids are assumed to be contiguous after remapping, so max+1 == count
            if 'stu_id:token' in kwargs['df'].columns:
                dt_info['stu_count'] = int(kwargs['df']['stu_id:token'].max() + 1)
            if 'exer_id:token' in kwargs['df'].columns:
                dt_info['exer_count'] = int(kwargs['df']['exer_id:token'].max() + 1)
        else:
            stu_count = max(kwargs['df_train']['stu_id:token'].max() + 1, kwargs['df_test']['stu_id:token'].max() + 1)
            stu_count = max(kwargs['df_valid']['stu_id:token'].max() + 1, stu_count) if 'df_valid' in kwargs else stu_count

            exer_count = max(kwargs['df_train']['exer_id:token'].max() + 1, kwargs['df_test']['exer_id:token'].max() + 1)
            exer_count = max(kwargs['df_valid']['exer_id:token'].max() + 1, exer_count) if 'df_valid' in kwargs else exer_count

            dt_info['stu_count'] = stu_count
            dt_info['exer_count'] = exer_count

        if kwargs.get('df_exer', None) is not None:
            if 'cpt_seq:token_seq' in kwargs['df_exer']:
                # count distinct knowledge concepts across all exercises
                dt_info['cpt_count'] = len(set(list(chain(*kwargs['df_exer']['cpt_seq:token_seq'].to_list()))))
4 changes: 3 additions & 1 deletion edustudio/atom_op/mid2cache/KT/gen_cpt_seq.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
from edustudio.datatpl.utils import PadSeqUtil


class M2C_GenCptSeq(BaseMid2Cache):
class M2C_GenKCSeq(BaseMid2Cache):
"""Generate Knowledge Component Sequence
"""
default_cfg = {
'cpt_seq_window_size': -1,
}
Expand Down
2 changes: 1 addition & 1 deletion edustudio/atom_op/mid2cache/KT/gen_unfold_cpt_seq.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pandas as pd


class M2C_GenUnFoldCptSeq(BaseMid2Cache):
class M2C_GenUnFoldKCSeq(BaseMid2Cache):
default_cfg = {}

def __init__(self, m2c_cfg, n_folds, is_dataset_divided) -> None:
Expand Down
2 changes: 1 addition & 1 deletion edustudio/atom_op/mid2cache/common/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@
from .label2int import M2C_Label2Int
from .merge_divided_splits import M2C_MergeDividedSplits
from .remapid import M2C_ReMapId
from .build_cpt_relation import M2C_BuildCptRelation
from .build_cpt_relation import M2C_BuildKCRelation
2 changes: 1 addition & 1 deletion edustudio/atom_op/mid2cache/common/build_cpt_relation.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from itertools import chain


class M2C_BuildCptRelation(BaseMid2Cache):
class M2C_BuildKCRelation(BaseMid2Cache):
default_cfg = {
'relation_type': 'rcd_transition',
'threshold': None
Expand Down
4 changes: 2 additions & 2 deletions edustudio/datatpl/CD/RCDDataTPL.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@ class RCDDataTPL(EduDataTPL):
default_cfg = {
'mid2cache_op_seq': [
'M2C_Label2Int', 'M2C_FilterRecords4CD', 'M2C_ReMapId',
'M2C_RandomDataSplit4CD', 'M2C_BuildCptRelation',
'M2C_RandomDataSplit4CD', 'M2C_BuildKCRelation',
'M2C_GenQMat', 'M2C_RCD_OP'
],
'M2C_BuildCptRelation': {
'M2C_BuildKCRelation': {
'relation_type': 'rcd_transition',
'threshold': None
}
Expand Down
2 changes: 1 addition & 1 deletion edustudio/datatpl/KT/CL4KTDataTPL.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

class CL4KTDataTPL(EduDataTPL):
default_cfg = {
'mid2cache_op_seq': ['M2C_Label2Int', 'M2C_ReMapId', 'M2C_GenUnFoldCptSeq', 'M2C_CL4KT_OP'],
'mid2cache_op_seq': ['M2C_Label2Int', 'M2C_ReMapId', 'M2C_GenUnFoldKCSeq', 'M2C_CL4KT_OP'],
'M2C_CL4KT_OP': {
'sequence_truncation': 'recent',
}
Expand Down
2 changes: 1 addition & 1 deletion edustudio/datatpl/KT/DIMKTDataTPL.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

class DIMKTDataTPL(KTInterExtendsQDataTPL):
default_cfg = {
'mid2cache_op_seq': ['M2C_Label2Int', 'M2C_ReMapId', 'M2C_GenUnFoldCptSeq', 'M2C_BuildSeqInterFeats', 'M2C_GenCptSeq', "M2C_DIMKT_OP"],
'mid2cache_op_seq': ['M2C_Label2Int', 'M2C_ReMapId', 'M2C_GenUnFoldKCSeq', 'M2C_BuildSeqInterFeats', 'M2C_RandomDataSplit4KT', 'M2C_GenKCSeq', "M2C_DIMKT_OP"],
'M2C_BuildSeqInterFeats': {
# 'window_size': 200,
"extra_inter_feats": ['start_timestamp:float', 'cpt_unfold:token']
Expand Down
2 changes: 1 addition & 1 deletion edustudio/datatpl/KT/DKTDSCDataTPL.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

class DKTDSCDataTPL(EduDataTPL):
default_cfg = {
'mid2cache_op_seq': ["M2C_CptAsExer", 'M2C_Label2Int', 'M2C_ReMapId', 'M2C_BuildSeqInterFeats', "M2C_DKTDSC_OP"],
'mid2cache_op_seq': ["M2C_KCAsExer", 'M2C_Label2Int', 'M2C_ReMapId', 'M2C_BuildSeqInterFeats','M2C_RandomDataSplit4KT', "M2C_DKTDSC_OP"],
}

def __getitem__(self, index):
Expand Down
2 changes: 1 addition & 1 deletion edustudio/datatpl/KT/DKTForgetDataTPL.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

class DKTForgetDataTPL(EduDataTPL):
default_cfg = {
'mid2cache_op_seq': ['M2C_Label2Int', 'M2C_ReMapId', 'M2C_BuildSeqInterFeats', "M2C_DKTForget_OP"],
'mid2cache_op_seq': ['M2C_Label2Int', 'M2C_ReMapId', 'M2C_BuildSeqInterFeats','M2C_RandomDataSplit4KT', "M2C_DKTForget_OP"],
'M2C_BuildSeqInterFeats': {
"extra_inter_feats": ['start_timestamp:float']
}
Expand Down
Loading

0 comments on commit 79e6efb

Please sign in to comment.