Skip to content

Commit

Permalink
add [NeurIPS 2023] DCD model
Browse files Browse the repository at this point in the history
  • Loading branch information
kervias committed Feb 3, 2024
1 parent 79e6efb commit 2158f36
Show file tree
Hide file tree
Showing 11 changed files with 945 additions and 3 deletions.
3 changes: 2 additions & 1 deletion docs/source/user_guide/models.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@ Here is a list of models and corresponding papers of CD and KT including impleme
| | QCCDM | ECAI 2023 | [QCCDM: A Q-Augmented Causal Cognitive Diagnosis Model for Student Learning](https://github.com/lswhim/CDM_ILOG) |
| | CBICDM | ICONIP 2023 | [A Causality-Based Interpretable Cognitive Diagnosis Model](https://link.springer.com/chapter/10.1007/978-981-99-8067-3_16) |
| | CMES | Arxiv 2023 | [Enhancing Cognitive Diagnosis using Un-interacted Exercises: A Collaboration-aware Mixed Sampling Approach](https://arxiv.org/abs/2312.10110) |
| | Zero-1-to-3 | Arxiv 2023 | [Zero-1-to-3: Domain-level Zero-shot Cognitive Diagnosis via One Batch of Early-bird Students towards Three Diagnostic Objectives](https://arxiv.org/abs/2312.13434) |
|✔️ | DCD | NeurIPS 2023 | [Disentangling Cognitive Diagnosis with Limited Exercise Labels](https://openreview.net/pdf?id=ogPBujRhiN) |
| | Zero-1-to-3 | AAAI 2024 | [Zero-1-to-3: Domain-level Zero-shot Cognitive Diagnosis via One Batch of Early-bird Students towards Three Diagnostic Objectives](https://arxiv.org/abs/2312.13434) |

## Knowledge Tracing

Expand Down
2 changes: 2 additions & 0 deletions edustudio/atom_op/mid2cache/common/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,5 @@
from .merge_divided_splits import M2C_MergeDividedSplits
from .remapid import M2C_ReMapId
from .build_cpt_relation import M2C_BuildKCRelation
from .build_missing_Q import M2C_BuildMissingQ
from .fill_missing_Q import M2C_FillMissingQ
68 changes: 68 additions & 0 deletions edustudio/atom_op/mid2cache/common/build_missing_Q.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
from .base_mid2cache import BaseMid2Cache
import numpy as np
import pandas as pd
from itertools import chain
import torch
from edustudio.utils.common import set_same_seeds


class M2C_BuildMissingQ(BaseMid2Cache):
    """Mid2cache op that withholds the Q-matrix labels of some exercises.

    ``process`` reads ``dt_info`` and ``df_exer`` from the pipeline kwargs and
    adds two entries:

    * ``missing_df_Q``  - the rows of the exercise/concept table that keep
      their labels;
    * ``missing_Q_mat`` - the matching 0/1 ``(exer_count, cpt_count)`` tensor,
      all-zero for every exercise whose labels were withheld.
    """

    default_cfg = {
        # Passed to ``set_same_seeds`` before any shuffling.
        'seed': 20230518,
        # Multiplied by the exercise count to get the number of exercises to drop.
        'Q_delete_ratio': 0.0,
    }

    def process(self, **kwargs):
        """Build the reduced Q table / matrix and store both in ``kwargs``."""
        dt_info = kwargs['dt_info']
        self.item_count = dt_info['exer_count']
        self.cpt_count = dt_info['cpt_count']
        self.df_Q = kwargs['df_exer'][['exer_id:token', 'cpt_seq:token_seq']]

        self.missing_df_Q = self.get_missing_df_Q()
        self.missing_Q_mat = self.get_Q_mat_from_df_arr(
            self.missing_df_Q, self.item_count, self.cpt_count
        )

        kwargs['missing_df_Q'] = self.missing_df_Q
        kwargs['missing_Q_mat'] = self.missing_Q_mat
        return kwargs

    def get_missing_df_Q(self):
        """Return the rows of ``self.df_Q`` whose exercises keep their labels.

        One exercise per concept is always kept (the first one met after a
        seeded shuffle of all (exercise, concept) pairs), so every concept
        stays covered. Further exercises are then drawn at random until
        ``item_count - int(Q_delete_ratio * item_count)`` exercises are kept.
        If the forced picks alone exceed that quota, a warning is logged and
        only the forced picks are kept.
        """
        set_same_seeds(seed=self.m2c_cfg['seed'])
        ratio = self.m2c_cfg['Q_delete_ratio']

        # Flatten {exercise: [concepts]} into an (n_pairs, 2) array of
        # [exercise_id, concept_id] rows.
        iid2cptlist = self.df_Q.set_index('exer_id:token')['cpt_seq:token_seq'].to_dict()
        iid_lis = np.array(list(chain.from_iterable(
            [iid] * len(cpts) for iid, cpts in iid2cptlist.items()
        )))
        cpt_lis = np.array(list(chain.from_iterable(iid2cptlist.values())))
        entry_arr = np.vstack([iid_lis, cpt_lis]).T

        np.random.shuffle(entry_arr)

        # reference: https://stackoverflow.com/questions/64834655/python-how-to-find-first-duplicated-items-in-an-numpy-array
        # First pick one exercise for every concept: ``return_index`` yields
        # the first shuffled row in which each concept appears.
        _, first_idx = np.unique(entry_arr[:, 1], return_index=True)
        bool_idx = np.zeros_like(entry_arr[:, 1], dtype=bool)
        bool_idx[first_idx] = True
        # The exercises picked that way are always preserved.
        preserved_exers = np.unique(entry_arr[bool_idx, 0])

        delete_num = int(ratio * self.item_count)
        preserved_num = self.item_count - delete_num

        if len(preserved_exers) > preserved_num:
            # Covering every concept already needs more exercises than the
            # quota allows. An exact match is not a failure, so it is handled
            # silently by falling through both branches.
            self.logger.warning(
                f"Cant Satisfy Delete Require: {len(preserved_exers)=},{preserved_num=}"
            )
        elif len(preserved_exers) < preserved_num:
            need_preserved_num = preserved_num - len(preserved_exers)

            # Top up with randomly chosen exercises that are not kept yet.
            left_iids = np.arange(self.item_count)
            left_iids = left_iids[~np.isin(left_iids, preserved_exers)]
            np.random.shuffle(left_iids)
            choose_iids = left_iids[0:need_preserved_num]

            preserved_exers = np.hstack([preserved_exers, choose_iids])

        keep_mask = self.df_Q['exer_id:token'].isin(preserved_exers)
        return self.df_Q.copy()[keep_mask].reset_index(drop=True)

    def get_Q_mat_from_df_arr(self, df_Q_arr, item_count, cpt_count):
        """Turn an exercise/concept table into a 0/1 ``torch.int64`` matrix.

        Args:
            df_Q_arr: frame with ``exer_id:token`` and ``cpt_seq:token_seq``
                columns; each row lists the concepts of one exercise.
            item_count: number of rows of the result.
            cpt_count: number of columns of the result.

        Returns:
            ``(item_count, cpt_count)`` tensor with 1 at every listed
            (exercise, concept) pair and 0 elsewhere.
        """
        Q_mat = torch.zeros((item_count, cpt_count), dtype=torch.int64)
        for _, item in df_Q_arr.iterrows():
            cpts = [int(c) for c in item['cpt_seq:token_seq']]
            if cpts:  # nothing to mark for an exercise without concepts
                Q_mat[int(item['exer_id:token']), cpts] = 1
        return Q_mat
112 changes: 112 additions & 0 deletions edustudio/atom_op/mid2cache/common/fill_missing_Q.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
from .base_mid2cache import BaseMid2Cache
import numpy as np
import pandas as pd
from itertools import chain
import torch
from edustudio.utils.common import set_same_seeds, tensor2npy
from tqdm import tqdm

class M2C_FillMissingQ(BaseMid2Cache):
    """Mid2cache op that builds one (possibly re-filled) Q-matrix per fold.

    ``process`` reads ``Q_mat``, ``missing_Q_mat``, ``missing_df_Q``,
    ``df_exer``, ``dt_info`` and ``df_train_folds`` from the pipeline kwargs
    and adds ``filling_Q_mat_list`` with one matrix per training fold.
    """

    default_cfg = {
        # "None" keeps the reduced matrix; "sim_dist_for_by_exer" fills the
        # unlabelled exercises from their most similar labelled exercises.
        'Q_fill_type': "None",
        # Number of most similar exercises consulted per unlabelled exercise.
        'params_topk': 5,
        # Maximum number of concepts assigned to an unlabelled exercise.
        'params_votek': 2,
    }

    def __init__(self, m2c_cfg, cfg) -> None:
        """Keep the op config and the global config (device, tqdm, logger)."""
        self.logger = cfg.logger
        self.m2c_cfg = m2c_cfg
        self.cfg = cfg

    @classmethod
    def from_cfg(cls, cfg):
        """Create the op from the global config, using the entry named after the class."""
        return cls(cfg.datatpl_cfg.get(cls.__name__), cfg)

    def process(self, **kwargs):
        """Append one Q-matrix per training fold to ``kwargs['filling_Q_mat_list']``.

        Raises:
            ValueError: if labels are missing and ``Q_fill_type`` is unknown.
        """
        dt_info = kwargs['dt_info']
        self.user_count = dt_info['stu_count']
        self.item_count = dt_info['exer_count']
        self.cpt_count = dt_info['cpt_count']
        self.df_Q = kwargs['df_exer'][['exer_id:token', 'cpt_seq:token_seq']]

        Q_mat = kwargs['Q_mat']
        missing_Q_mat = kwargs['missing_Q_mat']
        fill_type = self.m2c_cfg['Q_fill_type']
        # Fold-independent: is there any exercise row without a single concept?
        has_unlabelled = bool((missing_Q_mat.sum(dim=1) == 0).sum() > 0)

        self.filling_Q_mat_list = []
        for df_train in kwargs['df_train_folds']:
            if not has_unlabelled:
                # Nothing was withheld, so the complete matrix is used.
                self.filling_Q_mat_list.append(Q_mat)
            elif fill_type == "sim_dist_for_by_exer":
                fill_df_Q = self.fill_df_Q_by_sim_dist(
                    df_train, kwargs['missing_df_Q'],
                    params_topk=self.m2c_cfg['params_topk'],
                    params_votek=self.m2c_cfg['params_votek'],
                )
                fill_Q_mat = self.get_Q_mat_from_df_arr(fill_df_Q, self.item_count, self.cpt_count)
                self.filling_Q_mat_list.append(fill_Q_mat)
            elif fill_type == "None":
                self.filling_Q_mat_list.append(missing_Q_mat)
            else:
                raise ValueError(f"unknown Q_fill_type: {self.m2c_cfg['Q_fill_type']}")

        kwargs['filling_Q_mat_list'] = self.filling_Q_mat_list
        return kwargs

    def get_Q_mat_from_df_arr(self, df_Q_arr, item_count, cpt_count):
        """Turn an exercise/concept table into a 0/1 ``torch.int64`` matrix.

        A tensor is returned so that every entry of ``filling_Q_mat_list`` has
        the same type: the other branches of ``process`` append ``Q_mat`` or
        ``missing_Q_mat``, on which ``.sum(dim=1)`` is called.

        Args:
            df_Q_arr: frame with ``exer_id:token`` and ``cpt_seq:token_seq``
                columns; each row lists the concepts of one exercise.
            item_count: number of rows of the result.
            cpt_count: number of columns of the result.
        """
        Q_mat = torch.zeros((item_count, cpt_count), dtype=torch.int64)
        for _, item in df_Q_arr.iterrows():
            cpts = [int(c) for c in item['cpt_seq:token_seq']]
            if cpts:  # an exercise that received no votes has an empty list
                Q_mat[int(item['exer_id:token']), cpts] = 1
        return Q_mat

    def fill_df_Q_by_sim_dist(self, df_interaction, df_Q_left, params_topk=5, params_votek=2):
        """Guess concepts for unlabelled exercises from similar labelled ones.

        The similarity of an unlabelled exercise ``i`` to exercise ``j`` is the
        number of students who answered both with the same outcome, divided by
        the number of students who answered ``i``. For each unlabelled
        exercise, the concepts of its ``params_topk`` most similar labelled
        exercises are counted and the ``params_votek`` most frequent ones are
        assigned. The share of filled exercises that hit at least one true
        concept (according to ``self.df_Q``) is logged.

        Args:
            df_interaction: training records with ``stu_id:token``,
                ``exer_id:token`` and ``label:float`` columns.
            df_Q_left: exercise/concept rows that still have labels.
            params_topk: number of neighbours consulted per exercise.
            params_votek: maximum number of concepts assigned per exercise.

        Returns:
            ``df_Q_left`` followed by one row per filled exercise.
        """
        preserved_exers = df_Q_left['exer_id:token'].to_numpy()
        # Sorted ids of all exercises without a labelled row.
        missing_iids = np.setdiff1d(np.arange(self.item_count), preserved_exers)
        if missing_iids.size == 0:
            # Nothing to fill; also avoids dividing by zero in the hit ratio.
            return df_Q_left.reset_index(drop=True)

        # Signed response matrix: 1 = label equals 1, -1 = any other label,
        # 0 = no record.
        interact_mat = torch.zeros(
            (self.user_count, self.item_count), dtype=torch.int8
        ).to(self.cfg.traintpl_cfg['device'])
        idx = df_interaction[df_interaction['label:float'] == 1][['stu_id:token', 'exer_id:token']].to_numpy()
        interact_mat[idx[:, 0], idx[:, 1]] = 1
        idx = df_interaction[df_interaction['label:float'] != 1][['stu_id:token', 'exer_id:token']].to_numpy()
        interact_mat[idx[:, 0], idx[:, 1]] = -1

        interact_mat = interact_mat.T  # rows are exercises from here on

        # Same device as interact_mat, so the mask-based assignments below
        # never index a tensor with a mask that lives on another device.
        sim_mat = torch.zeros((self.item_count, self.item_count), device=interact_mat.device)
        for iid in tqdm(missing_iids, desc="[FILL_Q_MAT] compute sim_mat", ncols=self.cfg.frame_cfg['TQDM_NCOLS']):
            answered = interact_mat[iid] != 0
            same_mat = interact_mat[iid] == interact_mat
            bool_mat = answered & (interact_mat != 0)  # students who answered both
            same_mat[~bool_mat] = False
            # 0/0 yields NaN when nobody answered ``iid``; those entries have
            # no common student and are reset to 0 on the next line.
            sim_mat[iid] = same_mat.sum(dim=1) / answered.sum()
            sim_mat[iid, bool_mat.sum(dim=1) == 0] = 0.0
            # Neither the exercise itself nor other unlabelled ones may vote.
            sim_mat[iid, iid] = -1.0
            sim_mat[iid, missing_iids] = -1.0

        assert torch.isnan(sim_mat).sum() == 0

        # torch.topk raises when k exceeds the row length.
        topk = min(params_topk, self.item_count)
        _, topk_mat_idx = torch.topk(sim_mat, dim=1, k=topk, largest=True, sorted=True)
        topk_mat_idx = tensor2npy(topk_mat_idx.cpu())

        index_df_Q = df_Q_left.set_index('exer_id:token')
        preserved_set = set(preserved_exers.tolist())
        missing_iid_fill_cpts = {}
        for iid in tqdm(missing_iids, desc="[FILL_Q_MAT] fill process", ncols=self.cfg.frame_cfg['TQDM_NCOLS']):
            neighbour_cpts = list(chain.from_iterable(
                index_df_Q.loc[iid2]['cpt_seq:token_seq']
                for iid2 in topk_mat_idx[iid] if iid2 in preserved_set
            ))
            uniq_cpts, counts = np.unique(neighbour_cpts, return_counts=True)
            # Stable sort: ties keep np.unique's ascending concept order.
            ranked = sorted(zip(uniq_cpts, counts), key=lambda x: x[1], reverse=True)
            missing_iid_fill_cpts[iid] = [cpt for cpt, _ in ranked[0:params_votek]]

        missing_fill_df_Q = pd.DataFrame({
            'exer_id:token': list(missing_iid_fill_cpts.keys()),
            'cpt_seq:token_seq': list(missing_iid_fill_cpts.values()),
        })
        final_df_Q = pd.concat([df_Q_left, missing_fill_df_Q], axis=0, ignore_index=True)

        # Diagnostic only: share of filled exercises with at least one
        # correctly guessed concept.
        t_Q = self.df_Q.set_index('exer_id:token')
        hits = sum(
            1 for iid, cpts in missing_iid_fill_cpts.items()
            if len(set(t_Q.loc[iid]['cpt_seq:token_seq']) & set(cpts)) > 0
        )
        hit_ratio = hits / len(missing_iid_fill_cpts)

        self.logger.info(f"[FILL_Q] Hit_ratio={hit_ratio}")

        return final_df_Q
70 changes: 70 additions & 0 deletions edustudio/datatpl/CD/DCDDataTPL.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import os
from ..common.edu_datatpl import EduDataTPL
import json
from edustudio.datatpl.common.general_datatpl import DataTPLStatus
import torch


class DCDDataTPL(EduDataTPL):
    """Data template for the DCD model.

    On top of what ``EduDataTPL`` provides, it carries an optional concept
    relation dict loaded from JSON, builds one signed student-by-exercise
    response matrix per training fold, and exposes the current fold's
    ``filling_Q_mat`` and ``interact_mat`` as extra data.
    """

    default_cfg = {
        'n_folds': 5,
        'mid2cache_op_seq': ['M2C_Label2Int', 'M2C_FilterRecords4CD', 'M2C_ReMapId', 'M2C_RandomDataSplit4CD', 'M2C_GenQMat', 'M2C_BuildMissingQ', 'M2C_FillMissingQ'],
        # Base name (without ".json") of the relation file under <data_folder>/middata.
        'cpt_relation_file_name': 'cpt_relation',
    }

    def __init__(self, cfg, df, df_train=None, df_valid=None, df_test=None, dict_cpt_relation=None, status=None, df_stu=None, df_exer=None):
        """Store the concept relation dict and delegate to ``EduDataTPL``.

        ``status`` defaults to a fresh ``DataTPLStatus()`` per instance; a
        default built in the signature would be created once and shared by
        every template constructed without an explicit status.
        """
        if status is None:
            status = DataTPLStatus()
        self.dict_cpt_relation = dict_cpt_relation
        super().__init__(cfg, df, df_train, df_valid, df_test, df_stu, df_exer, status)

    def _check_param(self):
        """Run the parent checks and require ``0 <= Q_delete_ratio < 1``.

        NOTE(review): this method is named ``_check_param`` while it delegates
        to ``_check_params``; if the framework only invokes ``_check_params``,
        this override is never reached. Confirm that ``Q_delete_ratio`` is
        available at the top level of ``datatpl_cfg`` before renaming it.
        """
        super()._check_params()
        assert 0 <= self.datatpl_cfg['Q_delete_ratio'] < 1

    @property
    def common_str2df(self):
        """Parent mapping extended with the ``dict_cpt_relation`` entry."""
        dic = super().common_str2df
        dic['dict_cpt_relation'] = self.dict_cpt_relation
        return dic

    def process_data(self):
        """Run the parent pipeline, then add DCD-specific artefacts.

        Adds ``interact_mat_list`` to ``final_kwargs`` (one int8 matrix per
        training fold: 1 where the label equals 1, -1 for any other recorded
        label, 0 where there is no record) and, when no relation dict was
        loaded, a default that maps every concept id to a list holding only
        itself.
        """
        super().process_data()
        dt_info = self.final_kwargs['dt_info']
        user_count = dt_info['stu_count']
        item_count = dt_info['exer_count']

        self.interact_mat_list = []
        for interact_df in self.final_kwargs['df_train_folds']:
            interact_mat = torch.zeros((user_count, item_count), dtype=torch.int8)
            is_one = interact_df['label:float'] == 1
            idx = interact_df[is_one][['stu_id:token', 'exer_id:token']].to_numpy()
            interact_mat[idx[:, 0], idx[:, 1]] = 1
            idx = interact_df[~is_one][['stu_id:token', 'exer_id:token']].to_numpy()
            interact_mat[idx[:, 0], idx[:, 1]] = -1
            self.interact_mat_list.append(interact_mat)

        self.final_kwargs['interact_mat_list'] = self.interact_mat_list

        if self.final_kwargs['dict_cpt_relation'] is None:
            cpt_count = self.final_kwargs['dt_info']['cpt_count']
            self.final_kwargs['dict_cpt_relation'] = {i: [i] for i in range(cpt_count)}

    @classmethod
    def load_data(cls, cfg):
        """Load the parent data plus the optional concept relation JSON.

        ``kwargs['dict_cpt_relation']`` is the parsed JSON content if the file
        exists; otherwise a warning is logged and the entry is ``None``.
        """
        kwargs = super().load_data(cfg)
        fph = f"{cfg.frame_cfg.data_folder_path}/middata/{cfg.datatpl_cfg['cpt_relation_file_name']}.json"
        if os.path.exists(fph):
            with open(fph, 'r', encoding='utf-8') as f:
                kwargs['dict_cpt_relation'] = json.load(f)
        else:
            cfg.logger.warning("without cpt_relation.json")
            kwargs['dict_cpt_relation'] = None
        return kwargs

    def get_extra_data(self):
        """Parent extra data plus the current fold's ``filling_Q_mat`` and ``interact_mat``."""
        extra_dict = super().get_extra_data()
        extra_dict['filling_Q_mat'] = self.filling_Q_mat
        extra_dict['interact_mat'] = self.interact_mat
        return extra_dict

    def set_info_for_fold(self, fold_id):
        """Select the per-fold matrices after the parent switches to ``fold_id``."""
        super().set_info_for_fold(fold_id)
        self.filling_Q_mat = self.final_kwargs['filling_Q_mat_list'][fold_id]
        self.interact_mat = self.final_kwargs['interact_mat_list'][fold_id]
3 changes: 2 additions & 1 deletion edustudio/datatpl/CD/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,5 @@
from .CNCDQDataTPL import CNCDQDataTPL
from .RCDDataTPL import RCDDataTPL
from .CDGKDataTPL import CDGKDataTPL
from.ECDDataTPL import ECDDataTPL
from .ECDDataTPL import ECDDataTPL
from .DCDDataTPL import DCDDataTPL
3 changes: 2 additions & 1 deletion edustudio/model/CD/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,5 @@
from .cdgk import CDGK_SINGLE, CDGK_MULTI
from .cdmfkc import CDMFKC
from .ecd import *
from .mgcd import MGCD
from .mgcd import MGCD
from .dcd import DCD
Loading

0 comments on commit 2158f36

Please sign in to comment.