diff --git a/docs/source/developer_guide/customize_evaltpl.md b/docs/source/developer_guide/customize_evaltpl.md index 823e9af..b96423c 100644 --- a/docs/source/developer_guide/customize_evaltpl.md +++ b/docs/source/developer_guide/customize_evaltpl.md @@ -30,7 +30,7 @@ This EvalTPL is for the model evaluation using binary classification metrics. The protocols in ``PredictionEvalTPL`` are listed as follows. -### InterpretabilityEvalT +### InterpretabilityEvalTPL This EvalTPL is for the model evaluation for interpretability. It uses states of students and Q matrix for ``eval``, which are domain-specific in student assessment. ## Develop a New EvalTPL in EduStudio diff --git a/docs/source/features/atomic_operations.md b/docs/source/features/atomic_operations.md index 192e619..6a9810c 100644 --- a/docs/source/features/atomic_operations.md +++ b/docs/source/features/atomic_operations.md @@ -50,7 +50,7 @@ In the following, we give a table to display existing atomic operations. | name | description | | ---------------------- | ------------------------------------------- | | M2C_BuildSeqInterFeats | Build Sequential Features and Split dataset | -| M2C_CptAsExer | Treat knowledge concept as exercise | -| M2C_GenCptSeq | Generate knowledge concept seq | -| M2C_GenUnFoldCptSeq | Unfold knowledge concepts | +| M2C_KCAsExer | Treat knowledge concept as exercise | +| M2C_GenKCSeq | Generate knowledge concept seq | +| M2C_GenUnFoldKCSeq | Unfold knowledge concepts | diff --git a/edustudio/atom_op/mid2cache/KT/__init__.py b/edustudio/atom_op/mid2cache/KT/__init__.py index 70a351a..664e4e6 100644 --- a/edustudio/atom_op/mid2cache/KT/__init__.py +++ b/edustudio/atom_op/mid2cache/KT/__init__.py @@ -1,4 +1,5 @@ from .build_seq_inter_feats import M2C_BuildSeqInterFeats -from .cpt_as_exer import M2C_CptAsExer -from .gen_cpt_seq import M2C_GenCptSeq -from .gen_unfold_cpt_seq import M2C_GenUnFoldCptSeq +from .cpt_as_exer import M2C_KCAsExer +from .gen_cpt_seq import M2C_GenKCSeq +from 
.gen_unfold_cpt_seq import M2C_GenUnFoldKCSeq +from .data_split4kt import M2C_RandomDataSplit4KT diff --git a/edustudio/atom_op/mid2cache/KT/build_seq_inter_feats.py b/edustudio/atom_op/mid2cache/KT/build_seq_inter_feats.py index 2fa29c0..e951ed4 100644 --- a/edustudio/atom_op/mid2cache/KT/build_seq_inter_feats.py +++ b/edustudio/atom_op/mid2cache/KT/build_seq_inter_feats.py @@ -7,13 +7,10 @@ class M2C_BuildSeqInterFeats(BaseMid2Cache): default_cfg = { - 'seed': 2023, - 'divide_by': 'stu', 'window_size': 100, - "divide_scale_list": [7,1,2], "extra_inter_feats": [] } - + def __init__(self, m2c_cfg, n_folds, is_dataset_divided) -> None: super().__init__(m2c_cfg) self.n_folds = n_folds @@ -25,11 +22,7 @@ def from_cfg(cls, cfg): n_folds = cfg.datatpl_cfg.n_folds is_dataset_divided = cfg.datatpl_cfg.is_dataset_divided return cls(m2c_cfg, n_folds, is_dataset_divided) - - def _check_params(self): - super()._check_params() - assert self.m2c_cfg['divide_by'] in {'stu', 'time'} - + def process(self, **kwargs): df = kwargs['df'] df_train, df_valid, df_test = kwargs['df_train'], kwargs['df_valid'], kwargs['df_test'] @@ -40,96 +33,36 @@ def process(self, **kwargs): if not self.is_dataset_divided: assert df_train is None and df_valid is None and df_test is None - if self.m2c_cfg['divide_by'] == 'stu': - if self.n_folds == 1: - train_dict, valid_dict, test_dict = self._divide_data_df_by_stu_one_fold(df) - kwargs['df_train_folds'] = [train_dict] - kwargs['df_valid_folds'] = [valid_dict] - kwargs['df_test_folds'] = [test_dict] - else: - kwargs['df_train_folds'], kwargs['df_valid_folds'], kwargs['df_test_folds'] = self._divide_data_df_by_stu_multi_fold(df) - elif self.m2c_cfg['divide_by'] == 'time': - raise NotImplementedError - else: - raise ValueError(f"unknown divide_by: {self.m2c_cfg['divide_by']}") + self.window_size = self.m2c_cfg['window_size'] + if self.m2c_cfg['window_size'] <= 0 or self.m2c_cfg['window_size'] is None: + self.window_size = df[['stu_id:token', 
'exer_id:token']].groupby('stu_id:token').agg('count')['exer_id:token'].max() + self.logger.info(f"actual window size: {self.window_size}") + kwargs['df_seq'] = self.construct_df2dict(df) + else: # dataset is divided assert df_train is not None and df_test is not None if self.m2c_cfg['window_size'] <= 0 or self.m2c_cfg['window_size'] is None: self.window_size = np.max([ df_train[['stu_id:token', 'exer_id:token']].groupby('stu_id:token').agg('count')['exer_id:token'].max(), df_valid[['stu_id:token', 'exer_id:token']].groupby('stu_id:token').agg('count')['exer_id:token'].max() if df_valid is not None else 0, - df_valid[['stu_id:token', 'exer_id:token']].groupby('stu_id:token').agg('count')['exer_id:token'].max() + df_test[['stu_id:token', 'exer_id:token']].groupby('stu_id:token').agg('count')['exer_id:token'].max() ]) - self.logger.info(f"actual window size: {self.window_size}") else: self.window_size = self.m2c_cfg['window_size'] + self.logger.info(f"actual window size: {self.window_size}") + train_dict = self.construct_df2dict(df_train) valid_dict = self.construct_df2dict(df_valid) test_dict = self.construct_df2dict(df_test) - kwargs['df_train_folds'] = [train_dict] - kwargs['df_valid_folds'] = [valid_dict] - kwargs['df_test_folds'] = [test_dict] + kwargs['df_train_seq'] = train_dict + kwargs['df_valid_seq'] = valid_dict + kwargs['df_test_seq'] = test_dict return kwargs @staticmethod def sort_records(df, col='order_id:token'): if df is not None: return df.sort_values(by=col, ascending=True).reset_index(drop=True) - - def _divide_data_df_by_stu_one_fold(self, df: pd.DataFrame): - train_stu_id, val_stu_id, test_stu_id = SpliterUtil.divide_data_df_one_fold( - df['stu_id:token'].drop_duplicates(), seed=self.m2c_cfg['seed'], shuffle=True, - divide_scale_list=self.m2c_cfg['divide_scale_list'] - ) - train_df = df[df['stu_id:token'].isin(train_stu_id)] - val_df = df[df['stu_id:token'].isin(val_stu_id)] if val_stu_id is not None else None - test_df = 
df[df['stu_id:token'].isin(test_stu_id)] - - if self.m2c_cfg['window_size'] <= 0 or self.m2c_cfg['window_size'] is None: - self.window_size = np.max([ - train_df[['stu_id:token', 'exer_id:token']].groupby('stu_id:token').agg('count')['exer_id:token'].max(), - val_df[['stu_id:token', 'exer_id:token']].groupby('stu_id:token').agg('count')['exer_id:token'].max() if val_df is not None else 0, - test_df[['stu_id:token', 'exer_id:token']].groupby('stu_id:token').agg('count')['exer_id:token'].max() - ]) - self.logger.info(f"actual window size: {self.window_size}") - else: - self.window_size = self.m2c_cfg['window_size'] - - train_dict = self.construct_df2dict(train_df) - val_dict = self.construct_df2dict(val_df) - test_dict = self.construct_df2dict(test_df) - return train_dict, val_dict, test_dict - - def _divide_data_df_by_stu_multi_fold(self, df: pd.DataFrame): - res = SpliterUtil.divide_data_df_one_fold( - df['stu_id:token'].drop_duplicates(), seed=self.m2c_cfg['seed'], shuffle=True, - divide_scale_list=self.m2c_cfg['divide_scale_list'] - ) - - train_list, valid_list, test_list = [], [], [] - for train_stu_id, val_stu_id, test_stu_id in zip(res): - train_df = df[df['stu_id:token'].isin(train_stu_id)] - val_df = df[df['stu_id:token'].isin(val_stu_id)] if val_stu_id is not None else None - test_df = df[df['stu_id:token'].isin(test_stu_id)] - - if self.m2c_cfg['window_size'] <= 0 or self.m2c_cfg['window_size'] is None: - self.window_size = np.max([ - train_df[['stu_id:token', 'exer_id:token']].groupby('stu_id:token').agg('count')['exer_id:token'].max(), - val_df[['stu_id:token', 'exer_id:token']].groupby('stu_id:token').agg('count')['exer_id:token'].max() if val_df is not None else 0, - test_df[['stu_id:token', 'exer_id:token']].groupby('stu_id:token').agg('count')['exer_id:token'].max() - ]) - self.logger.info(f"actual window size: {self.window_size}") - else: - self.window_size = self.m2c_cfg['window_size'] - - train_dict = self.construct_df2dict(train_df) - valid_dict 
= self.construct_df2dict(val_df) - test_dict = self.construct_df2dict(test_df) - train_list.append(train_dict) - valid_list.append(valid_dict) - test_list.append(test_dict) - - return train_list, valid_list, test_list def construct_df2dict(self, df: pd.DataFrame): if df is None: return None @@ -170,24 +103,3 @@ def construct_df2dict(self, df: pd.DataFrame): raise NotImplementedError return ret_dict - - def set_dt_info(self, dt_info, **kwargs): - dt_info['real_window_size'] = self.window_size - if not self.is_dataset_divided: - if 'stu_id:token' in kwargs['df'].columns: - dt_info['stu_count'] = int(kwargs['df']['stu_id:token'].max() + 1) - if 'exer_id:token' in kwargs['df'].columns: - dt_info['exer_count'] = int(kwargs['df']['exer_id:token'].max() + 1) - else: - stu_count = max(kwargs['df_train']['stu_id:token'].max() + 1, kwargs['df_test']['stu_id:token'].max() + 1) - stu_count = max(kwargs['df_valid']['stu_id:token'].max() + 1, stu_count) if 'df_valid' in kwargs else stu_count - - exer_count = max(kwargs['df_train']['exer_id:token'].max() + 1, kwargs['df_test']['exer_id:token'].max() + 1) - exer_count = max(kwargs['df_valid']['exer_id:token'].max() + 1, exer_count) if 'df_valid' in kwargs else exer_count - - dt_info['stu_count'] = stu_count - dt_info['exer_count'] = exer_count - - if kwargs.get('df_exer', None) is not None: - if 'cpt_seq:token_seq' in kwargs['df_exer']: - dt_info['cpt_count'] = len(set(list(chain(*kwargs['df_exer']['cpt_seq:token_seq'].to_list())))) diff --git a/edustudio/atom_op/mid2cache/KT/cpt_as_exer.py b/edustudio/atom_op/mid2cache/KT/cpt_as_exer.py index d4f90e1..5ee6e9d 100644 --- a/edustudio/atom_op/mid2cache/KT/cpt_as_exer.py +++ b/edustudio/atom_op/mid2cache/KT/cpt_as_exer.py @@ -3,7 +3,9 @@ from itertools import chain -class M2C_CptAsExer(BaseMid2Cache): +class M2C_KCAsExer(BaseMid2Cache): + """Knowledge Concept As Exercise + """ default_cfg = {} def process(self, **kwargs): diff --git a/edustudio/atom_op/mid2cache/KT/data_split4kt.py 
b/edustudio/atom_op/mid2cache/KT/data_split4kt.py new file mode 100644 index 0000000..4f27c6a --- /dev/null +++ b/edustudio/atom_op/mid2cache/KT/data_split4kt.py @@ -0,0 +1,112 @@ +from ..common.base_mid2cache import BaseMid2Cache +import pandas as pd +import numpy as np +from edustudio.datatpl.utils import SpliterUtil, PadSeqUtil +from itertools import chain + + +class M2C_RandomDataSplit4KT(BaseMid2Cache): + default_cfg = { + 'seed': 2023, + 'divide_by': 'stu', + "divide_scale_list": [7,1,2], + } + + def __init__(self, m2c_cfg, n_folds, is_dataset_divided) -> None: + super().__init__(m2c_cfg) + self.n_folds = n_folds + self.is_dataset_divided = is_dataset_divided + + @classmethod + def from_cfg(cls, cfg): + m2c_cfg = cfg.datatpl_cfg.get(cls.__name__) + n_folds = cfg.datatpl_cfg.n_folds + is_dataset_divided = cfg.datatpl_cfg.is_dataset_divided + return cls(m2c_cfg, n_folds, is_dataset_divided) + + def _check_params(self): + super()._check_params() + assert self.m2c_cfg['divide_by'] in {'stu', 'time'} + + def process(self, **kwargs): + df_seq = kwargs['df_seq'] + df_train_seq = kwargs.get('df_train_seq', None) + df_valid_seq = kwargs.get('df_valid_seq', None) + df_test_seq = kwargs.get('df_test_seq', None) + + if not self.is_dataset_divided: + assert df_train_seq is None and df_valid_seq is None and df_test_seq is None + self.window_size = df_seq['exer_seq:token_seq'].shape[1] + if self.m2c_cfg['divide_by'] == 'stu': + if self.n_folds == 1: + train_dict, valid_dict, test_dict = self._divide_data_df_by_stu_one_fold(df_seq) + kwargs['df_train_folds'] = [train_dict] + kwargs['df_valid_folds'] = [valid_dict] + kwargs['df_test_folds'] = [test_dict] + else: + kwargs['df_train_folds'], kwargs['df_valid_folds'], kwargs['df_test_folds'] = self._divide_data_df_by_stu_multi_fold(df_seq) + elif self.m2c_cfg['divide_by'] == 'time': + raise NotImplementedError + else: + raise ValueError(f"unknown divide_by: {self.m2c_cfg['divide_by']}") + else: + assert df_train_seq is not None
and df_test_seq is not None + self.window_size = df_train_seq['exer_seq:token_seq'].shape[1] + kwargs['df_train_folds'] = [df_train_seq] + kwargs['df_valid_folds'] = [df_valid_seq] + kwargs['df_test_folds'] = [df_test_seq] + return kwargs + + def _dict_index_flag(self, df_seq:dict, flag: np.array): + return { + k: df_seq[k][flag] for k in df_seq + } + + def _divide_data_df_by_stu_one_fold(self, df_seq: dict): + train_stu_id, valid_stu_id, test_stu_id = SpliterUtil.divide_data_df_one_fold( + pd.DataFrame({"stu_id:token": np.unique(df_seq['stu_id:token'])}), seed=self.m2c_cfg['seed'], shuffle=True, + divide_scale_list=self.m2c_cfg['divide_scale_list'] + ) + + df_train_seq = self._dict_index_flag(df_seq, np.isin(df_seq['stu_id:token'], train_stu_id.to_numpy().flatten())) + df_test_seq = self._dict_index_flag(df_seq, np.isin(df_seq['stu_id:token'], test_stu_id.to_numpy().flatten())) + df_valid_seq = None + if valid_stu_id is not None: + df_valid_seq = self._dict_index_flag(df_seq, np.isin(df_seq['stu_id:token'], valid_stu_id.to_numpy().flatten())) + + return df_train_seq, df_valid_seq, df_test_seq + + def _divide_data_df_by_stu_multi_fold(self, df_seq: pd.DataFrame): + res = SpliterUtil.divide_data_df_multi_folds( + pd.DataFrame({"stu_id:token": np.unique(df_seq['stu_id:token'])}), seed=self.m2c_cfg['seed'], shuffle=True, n_folds=self.n_folds + ) + + train_list, test_list = [], [] + for (train_stu_id, test_stu_id) in zip(*res): + df_train_seq = self._dict_index_flag(df_seq, np.isin(df_seq['stu_id:token'], train_stu_id.to_numpy().flatten())) + df_test_seq = self._dict_index_flag(df_seq, np.isin(df_seq['stu_id:token'], test_stu_id.to_numpy().flatten())) + train_list.append(df_train_seq) + test_list.append(df_test_seq) + + return train_list, [], test_list + + def set_dt_info(self, dt_info, **kwargs): + dt_info['real_window_size'] = self.window_size + if not self.is_dataset_divided: + if 'stu_id:token' in kwargs['df'].columns: + dt_info['stu_count'] =
int(kwargs['df']['stu_id:token'].max() + 1) + if 'exer_id:token' in kwargs['df'].columns: + dt_info['exer_count'] = int(kwargs['df']['exer_id:token'].max() + 1) + else: + stu_count = max(kwargs['df_train']['stu_id:token'].max() + 1, kwargs['df_test']['stu_id:token'].max() + 1) + stu_count = max(kwargs['df_valid']['stu_id:token'].max() + 1, stu_count) if 'df_valid' in kwargs else stu_count + + exer_count = max(kwargs['df_train']['exer_id:token'].max() + 1, kwargs['df_test']['exer_id:token'].max() + 1) + exer_count = max(kwargs['df_valid']['exer_id:token'].max() + 1, exer_count) if 'df_valid' in kwargs else exer_count + + dt_info['stu_count'] = stu_count + dt_info['exer_count'] = exer_count + + if kwargs.get('df_exer', None) is not None: + if 'cpt_seq:token_seq' in kwargs['df_exer']: + dt_info['cpt_count'] = len(set(list(chain(*kwargs['df_exer']['cpt_seq:token_seq'].to_list())))) diff --git a/edustudio/atom_op/mid2cache/KT/gen_cpt_seq.py b/edustudio/atom_op/mid2cache/KT/gen_cpt_seq.py index 6fcd054..80a0704 100644 --- a/edustudio/atom_op/mid2cache/KT/gen_cpt_seq.py +++ b/edustudio/atom_op/mid2cache/KT/gen_cpt_seq.py @@ -3,7 +3,9 @@ from edustudio.datatpl.utils import PadSeqUtil -class M2C_GenCptSeq(BaseMid2Cache): +class M2C_GenKCSeq(BaseMid2Cache): + """Generate Knowledge Component Sequence + """ default_cfg = { 'cpt_seq_window_size': -1, } diff --git a/edustudio/atom_op/mid2cache/KT/gen_unfold_cpt_seq.py b/edustudio/atom_op/mid2cache/KT/gen_unfold_cpt_seq.py index 3df7f0a..f2257f7 100644 --- a/edustudio/atom_op/mid2cache/KT/gen_unfold_cpt_seq.py +++ b/edustudio/atom_op/mid2cache/KT/gen_unfold_cpt_seq.py @@ -4,7 +4,7 @@ import pandas as pd -class M2C_GenUnFoldCptSeq(BaseMid2Cache): +class M2C_GenUnFoldKCSeq(BaseMid2Cache): default_cfg = {} def __init__(self, m2c_cfg, n_folds, is_dataset_divided) -> None: diff --git a/edustudio/atom_op/mid2cache/common/__init__.py b/edustudio/atom_op/mid2cache/common/__init__.py index cabf59e..e0d9e0b 100644 --- 
a/edustudio/atom_op/mid2cache/common/__init__.py +++ b/edustudio/atom_op/mid2cache/common/__init__.py @@ -3,4 +3,4 @@ from .label2int import M2C_Label2Int from .merge_divided_splits import M2C_MergeDividedSplits from .remapid import M2C_ReMapId -from .build_cpt_relation import M2C_BuildCptRelation +from .build_cpt_relation import M2C_BuildKCRelation diff --git a/edustudio/atom_op/mid2cache/common/build_cpt_relation.py b/edustudio/atom_op/mid2cache/common/build_cpt_relation.py index ed75664..3bde411 100644 --- a/edustudio/atom_op/mid2cache/common/build_cpt_relation.py +++ b/edustudio/atom_op/mid2cache/common/build_cpt_relation.py @@ -4,7 +4,7 @@ from itertools import chain -class M2C_BuildCptRelation(BaseMid2Cache): +class M2C_BuildKCRelation(BaseMid2Cache): default_cfg = { 'relation_type': 'rcd_transition', 'threshold': None diff --git a/edustudio/datatpl/CD/RCDDataTPL.py b/edustudio/datatpl/CD/RCDDataTPL.py index b69a7cf..663272e 100644 --- a/edustudio/datatpl/CD/RCDDataTPL.py +++ b/edustudio/datatpl/CD/RCDDataTPL.py @@ -7,10 +7,10 @@ class RCDDataTPL(EduDataTPL): default_cfg = { 'mid2cache_op_seq': [ 'M2C_Label2Int', 'M2C_FilterRecords4CD', 'M2C_ReMapId', - 'M2C_RandomDataSplit4CD', 'M2C_BuildCptRelation', + 'M2C_RandomDataSplit4CD', 'M2C_BuildKCRelation', 'M2C_GenQMat', 'M2C_RCD_OP' ], - 'M2C_BuildCptRelation': { + 'M2C_BuildKCRelation': { 'relation_type': 'rcd_transition', 'threshold': None } diff --git a/edustudio/datatpl/KT/CL4KTDataTPL.py b/edustudio/datatpl/KT/CL4KTDataTPL.py index fcfb035..b83f483 100644 --- a/edustudio/datatpl/KT/CL4KTDataTPL.py +++ b/edustudio/datatpl/KT/CL4KTDataTPL.py @@ -4,7 +4,7 @@ class CL4KTDataTPL(EduDataTPL): default_cfg = { - 'mid2cache_op_seq': ['M2C_Label2Int', 'M2C_ReMapId', 'M2C_GenUnFoldCptSeq', 'M2C_CL4KT_OP'], + 'mid2cache_op_seq': ['M2C_Label2Int', 'M2C_ReMapId', 'M2C_GenUnFoldKCSeq', 'M2C_CL4KT_OP'], 'M2C_CL4KT_OP': { 'sequence_truncation': 'recent', } diff --git a/edustudio/datatpl/KT/DIMKTDataTPL.py 
b/edustudio/datatpl/KT/DIMKTDataTPL.py index 45e2834..3c1081d 100644 --- a/edustudio/datatpl/KT/DIMKTDataTPL.py +++ b/edustudio/datatpl/KT/DIMKTDataTPL.py @@ -6,7 +6,7 @@ class DIMKTDataTPL(KTInterExtendsQDataTPL): default_cfg = { - 'mid2cache_op_seq': ['M2C_Label2Int', 'M2C_ReMapId', 'M2C_GenUnFoldCptSeq', 'M2C_BuildSeqInterFeats', 'M2C_GenCptSeq', "M2C_DIMKT_OP"], + 'mid2cache_op_seq': ['M2C_Label2Int', 'M2C_ReMapId', 'M2C_GenUnFoldKCSeq', 'M2C_BuildSeqInterFeats', 'M2C_RandomDataSplit4KT', 'M2C_GenKCSeq', "M2C_DIMKT_OP"], 'M2C_BuildSeqInterFeats': { # 'window_size': 200, "extra_inter_feats": ['start_timestamp:float', 'cpt_unfold:token'] diff --git a/edustudio/datatpl/KT/DKTDSCDataTPL.py b/edustudio/datatpl/KT/DKTDSCDataTPL.py index 1b6b62d..3e2b50a 100644 --- a/edustudio/datatpl/KT/DKTDSCDataTPL.py +++ b/edustudio/datatpl/KT/DKTDSCDataTPL.py @@ -6,7 +6,7 @@ class DKTDSCDataTPL(EduDataTPL): default_cfg = { - 'mid2cache_op_seq': ["M2C_CptAsExer", 'M2C_Label2Int', 'M2C_ReMapId', 'M2C_BuildSeqInterFeats', "M2C_DKTDSC_OP"], + 'mid2cache_op_seq': ["M2C_KCAsExer", 'M2C_Label2Int', 'M2C_ReMapId', 'M2C_BuildSeqInterFeats','M2C_RandomDataSplit4KT', "M2C_DKTDSC_OP"], } def __getitem__(self, index): diff --git a/edustudio/datatpl/KT/DKTForgetDataTPL.py b/edustudio/datatpl/KT/DKTForgetDataTPL.py index 0a20024..e1ce598 100644 --- a/edustudio/datatpl/KT/DKTForgetDataTPL.py +++ b/edustudio/datatpl/KT/DKTForgetDataTPL.py @@ -3,7 +3,7 @@ class DKTForgetDataTPL(EduDataTPL): default_cfg = { - 'mid2cache_op_seq': ['M2C_Label2Int', 'M2C_ReMapId', 'M2C_BuildSeqInterFeats', "M2C_DKTForget_OP"], + 'mid2cache_op_seq': ['M2C_Label2Int', 'M2C_ReMapId', 'M2C_BuildSeqInterFeats','M2C_RandomDataSplit4KT', "M2C_DKTForget_OP"], 'M2C_BuildSeqInterFeats': { "extra_inter_feats": ['start_timestamp:float'] } diff --git a/edustudio/datatpl/KT/EERNNDataTPL.py b/edustudio/datatpl/KT/EERNNDataTPL.py index 7c8c82b..278a63a 100644 --- a/edustudio/datatpl/KT/EERNNDataTPL.py +++ 
b/edustudio/datatpl/KT/EERNNDataTPL.py @@ -5,7 +5,7 @@ class EERNNDataTPL(EduDataTPL): default_cfg = { - 'mid2cache_op_seq': ['M2C_Label2Int', 'M2C_ReMapId', 'M2C_BuildSeqInterFeats', 'M2C_EERNN_OP'], + 'mid2cache_op_seq': ['M2C_Label2Int', 'M2C_ReMapId', 'M2C_BuildSeqInterFeats','M2C_RandomDataSplit4KT', 'M2C_EERNN_OP'], } def get_extra_data(self, **kwargs): diff --git a/edustudio/datatpl/KT/EKTDataTPL.py b/edustudio/datatpl/KT/EKTDataTPL.py index 2496f0d..809b8d9 100644 --- a/edustudio/datatpl/KT/EKTDataTPL.py +++ b/edustudio/datatpl/KT/EKTDataTPL.py @@ -5,7 +5,7 @@ class EKTDataTPL(EERNNDataTPL): default_cfg = { - 'mid2cache_op_seq': ['M2C_Label2Int', 'M2C_ReMapId', 'M2C_BuildSeqInterFeats', 'M2C_GenCptSeq', 'M2C_EERNN_OP'], + 'mid2cache_op_seq': ['M2C_Label2Int', 'M2C_ReMapId', 'M2C_BuildSeqInterFeats', 'M2C_RandomDataSplit4KT', 'M2C_GenKCSeq', 'M2C_EERNN_OP'], } def __getitem__(self, index): diff --git a/edustudio/datatpl/KT/GKTDataTPL.py b/edustudio/datatpl/KT/GKTDataTPL.py index ea88759..612a5fa 100644 --- a/edustudio/datatpl/KT/GKTDataTPL.py +++ b/edustudio/datatpl/KT/GKTDataTPL.py @@ -5,7 +5,7 @@ class GKTDataTPL(EduDataTPL): default_cfg = { - 'mid2cache_op_seq': ["M2C_CptAsExer", 'M2C_Label2Int', 'M2C_ReMapId', 'M2C_BuildSeqInterFeats'], + 'mid2cache_op_seq': ["M2C_KCAsExer", 'M2C_Label2Int', 'M2C_ReMapId', 'M2C_BuildSeqInterFeats', 'M2C_RandomDataSplit4KT'], } def process_load_data_from_middata(self): diff --git a/edustudio/datatpl/KT/KTInterCptAsExerDataTPL.py b/edustudio/datatpl/KT/KTInterCptAsExerDataTPL.py index dcae4b1..e61d1c8 100644 --- a/edustudio/datatpl/KT/KTInterCptAsExerDataTPL.py +++ b/edustudio/datatpl/KT/KTInterCptAsExerDataTPL.py @@ -2,6 +2,6 @@ class KTInterCptAsExerDataTPL(EduDataTPL): default_cfg = { - 'mid2cache_op_seq': ["M2C_CptAsExer", 'M2C_Label2Int', 'M2C_ReMapId', 'M2C_BuildSeqInterFeats'], + 'mid2cache_op_seq': ["M2C_KCAsExer", 'M2C_Label2Int', 'M2C_ReMapId', 'M2C_BuildSeqInterFeats', 'M2C_RandomDataSplit4KT'], } diff --git 
a/edustudio/datatpl/KT/KTInterCptUnfoldDataTPL.py b/edustudio/datatpl/KT/KTInterCptUnfoldDataTPL.py index ec391e4..1856e94 100644 --- a/edustudio/datatpl/KT/KTInterCptUnfoldDataTPL.py +++ b/edustudio/datatpl/KT/KTInterCptUnfoldDataTPL.py @@ -4,7 +4,7 @@ class KTInterCptUnfoldDataTPL(EduDataTPL): default_cfg = { - 'mid2cache_op_seq': ['M2C_Label2Int', 'M2C_ReMapId', 'M2C_GenUnFoldCptSeq', 'M2C_BuildSeqInterFeats'], + 'mid2cache_op_seq': ['M2C_Label2Int', 'M2C_ReMapId', 'M2C_GenUnFoldKCSeq', 'M2C_BuildSeqInterFeats', 'M2C_RandomDataSplit4KT'], 'M2C_BuildSeqInterFeats': { "extra_inter_feats": ['start_timestamp:float', 'cpt_unfold:token'] } diff --git a/edustudio/datatpl/KT/KTInterDataTPL.py b/edustudio/datatpl/KT/KTInterDataTPL.py index f2b1ea4..6db459e 100644 --- a/edustudio/datatpl/KT/KTInterDataTPL.py +++ b/edustudio/datatpl/KT/KTInterDataTPL.py @@ -2,6 +2,6 @@ class KTInterDataTPL(GeneralDataTPL): default_cfg = { - 'mid2cache_op_seq': ['M2C_Label2Int', 'M2C_ReMapId', 'M2C_BuildSeqInterFeats'], + 'mid2cache_op_seq': ['M2C_Label2Int', 'M2C_ReMapId', 'M2C_BuildSeqInterFeats', 'M2C_RandomDataSplit4KT'], } diff --git a/edustudio/datatpl/KT/KTInterExtendsQDataTPL.py b/edustudio/datatpl/KT/KTInterExtendsQDataTPL.py index 5e58a63..a8fca71 100644 --- a/edustudio/datatpl/KT/KTInterExtendsQDataTPL.py +++ b/edustudio/datatpl/KT/KTInterExtendsQDataTPL.py @@ -4,7 +4,7 @@ class KTInterExtendsQDataTPL(EduDataTPL): default_cfg = { - 'mid2cache_op_seq': ['M2C_Label2Int', 'M2C_ReMapId', 'M2C_BuildSeqInterFeats', 'M2C_GenCptSeq'], + 'mid2cache_op_seq': ['M2C_Label2Int', 'M2C_ReMapId', 'M2C_BuildSeqInterFeats', 'M2C_RandomDataSplit4KT', 'M2C_GenKCSeq'], } def __getitem__(self, index): diff --git a/edustudio/datatpl/KT/LPKTDataTPL.py b/edustudio/datatpl/KT/LPKTDataTPL.py index 35ed09f..00755e9 100644 --- a/edustudio/datatpl/KT/LPKTDataTPL.py +++ b/edustudio/datatpl/KT/LPKTDataTPL.py @@ -3,7 +3,7 @@ class LPKTDataTPL(EduDataTPL): default_cfg = { - 'mid2cache_op_seq': ['M2C_Label2Int', 
'M2C_ReMapId', 'M2C_BuildSeqInterFeats', 'M2C_LPKT_OP', "M2C_GenQMat"], + 'mid2cache_op_seq': ['M2C_Label2Int', 'M2C_ReMapId', 'M2C_BuildSeqInterFeats', 'M2C_RandomDataSplit4KT', 'M2C_LPKT_OP', "M2C_GenQMat"], 'M2C_BuildSeqInterFeats': { "extra_inter_feats": ['start_timestamp:float', 'answer_time:float'] } diff --git a/edustudio/datatpl/KT/QDKTDataTPL.py b/edustudio/datatpl/KT/QDKTDataTPL.py index 28a37ec..d03b74a 100644 --- a/edustudio/datatpl/KT/QDKTDataTPL.py +++ b/edustudio/datatpl/KT/QDKTDataTPL.py @@ -7,7 +7,7 @@ class QDKTDataTPL(KTInterExtendsQDataTPL): default_cfg = { - 'mid2cache_op_seq': ['M2C_Label2Int', 'M2C_ReMapId', 'M2C_BuildSeqInterFeats', 'M2C_GenCptSeq','M2C_GenQMat','M2C_QDKT_OP'], + 'mid2cache_op_seq': ['M2C_Label2Int', 'M2C_ReMapId', 'M2C_BuildSeqInterFeats', 'M2C_RandomDataSplit4KT', 'M2C_GenKCSeq','M2C_GenQMat','M2C_QDKT_OP'], } def get_extra_data(self, **kwargs): diff --git a/edustudio/datatpl/KT/RKTDataTPL.py b/edustudio/datatpl/KT/RKTDataTPL.py index 22cc51c..ae477cc 100644 --- a/edustudio/datatpl/KT/RKTDataTPL.py +++ b/edustudio/datatpl/KT/RKTDataTPL.py @@ -7,7 +7,7 @@ class RKTDataTPL(EduDataTPL): default_cfg = { - 'mid2cache_op_seq': ['M2C_Label2Int', 'M2C_ReMapId','M2C_GenQMat', 'M2C_BuildSeqInterFeats'], + 'mid2cache_op_seq': ['M2C_Label2Int', 'M2C_ReMapId','M2C_GenQMat', 'M2C_BuildSeqInterFeats', 'M2C_RandomDataSplit4KT'], 'M2C_BuildSeqInterFeats': { "extra_inter_feats": ['start_timestamp:float'] } diff --git a/edustudio/datatpl/utils/spliter_util.py b/edustudio/datatpl/utils/spliter_util.py index 60801e2..1d720d9 100644 --- a/edustudio/datatpl/utils/spliter_util.py +++ b/edustudio/datatpl/utils/spliter_util.py @@ -58,7 +58,7 @@ def divide_data_df_multi_folds(df, n_folds, seed, label_field=None, shuffle=True shuffle=shuffle, random_state=seed) splits = skf.split(df) - train_df_list, test_df_list = [], [], [] + train_df_list, test_df_list = [], [] for train_index, test_index in splits: train_df = 
df.iloc[train_index].reset_index(drop=True) test_df = df.iloc[test_index].reset_index(drop=True) diff --git a/edustudio/settings.py b/edustudio/settings.py index 7c48f9f..ff50b5e 100644 --- a/edustudio/settings.py +++ b/edustudio/settings.py @@ -8,9 +8,9 @@ from edustudio import __version__ ID = idUtil.get_random_id_bytime() # RUN ID +EDUSTUDIO_VERSION = __version__ WORK_DIR = os.getcwd() -EDUSTUDIO_VERSION = __version__ DATA_FOLDER_PATH = f"{WORK_DIR}/data" TEMP_FOLDER_PATH = f"{WORK_DIR}/temp"