diff --git a/.github/workflows/python-test.yml b/.github/workflows/python-test.yml index 6f481853..00dc8da2 100644 --- a/.github/workflows/python-test.yml +++ b/.github/workflows/python-test.yml @@ -6,10 +6,14 @@ on: [push, pull_request] jobs: build: - runs-on: ubuntu-latest + runs-on: ${{ matrix.os }} strategy: matrix: python-version: [3.6, 3.7, 3.8, 3.9] + include: + - os: "ubuntu-latest" + - os: "ubuntu-20.04" + python-version: "3.6" steps: - uses: actions/checkout@v2 @@ -24,4 +28,4 @@ jobs: - name: Test with pytest run: | pytest - codecov + codecov \ No newline at end of file diff --git a/AUTHORS.md b/AUTHORS.md index f1a9fafc..870aed9c 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -20,5 +20,6 @@ [Jundong Wu](https://github.com/wintermelon008) +[Shangzi Xue](https://github.com/ShangziXue) The starred contributors are the corresponding authors. diff --git a/CHANGE.txt b/CHANGE.txt index 17e33497..cf09733d 100644 --- a/CHANGE.txt +++ b/CHANGE.txt @@ -1,3 +1,9 @@ +v1.0.0 + 1. Support cuda for I2V and T2V. + 2. Add demos for downstream tasks including knowledge & difficulty & discrimination prediction, similarity prediction and paper segmentation. + 3. Refactor quesnet for pretrain and vectorization. + 4. Update documents about tutorials and API. + v0.0.9 1. Refactor tokenizer Basic Tokenizer and Pretrained Tokenizer 2. Refactor model structures following huggingface styles for Elmo, BERT, DisenQNet and QuesNet diff --git a/EduNLP/I2V/i2v.py b/EduNLP/I2V/i2v.py index 720b0eff..975300c3 100644 --- a/EduNLP/I2V/i2v.py +++ b/EduNLP/I2V/i2v.py @@ -1,6 +1,7 @@ # coding: utf-8 # 2021/8/1 @ tongshiwei +import torch import json import os.path from typing import List, Tuple @@ -59,12 +60,12 @@ class I2V(object): """ def __init__(self, tokenizer, t2v, *args, tokenizer_kwargs: dict = None, - pretrained_t2v=False, model_dir=MODEL_DIR, **kwargs): + pretrained_t2v=False, model_dir=MODEL_DIR, device='cpu', **kwargs): if pretrained_t2v: logger.info("Use pretrained t2v model %s" % t2v) - self.t2v = get_t2v_pretrained_model(t2v, model_dir) + self.t2v = get_t2v_pretrained_model(t2v, model_dir, device) else: - self.t2v = T2V(t2v, *args, **kwargs) + self.t2v = T2V(t2v, device=device, *args, **kwargs) if tokenizer == 'bert': self.tokenizer = BertTokenizer.from_pretrained( **tokenizer_kwargs if tokenizer_kwargs is not None else {}) @@ -82,31 +83,53 @@ def __init__(self, tokenizer, t2v, *args, tokenizer_kwargs: dict = None, **tokenizer_kwargs if tokenizer_kwargs is not None else {}) self.params = { "tokenizer": tokenizer, - "tokenizer_kwargs": tokenizer_kwargs, "t2v": t2v, "args": args, + "tokenizer_kwargs": tokenizer_kwargs, + "pretrained_t2v": pretrained_t2v, + "model_dir": model_dir, "kwargs": kwargs, - "pretrained_t2v": pretrained_t2v } + self.device = torch.device(device) def __call__(self, items, *args, **kwargs): """transfer item to vector""" return self.infer_vector(items, *args, **kwargs) def tokenize(self, items, *args, key=lambda x: x, **kwargs) -> list: - # """tokenize item""" + """ + tokenize item + Parameters + ---------- + items: a list of questions + Returns + ---------- + tokens: list + """ return self.tokenizer(items, *args, key=key, **kwargs) def infer_vector(self, items, key=lambda x: x, **kwargs) -> tuple: + """ + get question embedding + NotImplemented + """ raise NotImplementedError def infer_item_vector(self, tokens, *args, **kwargs) -> ...: + """get the item vector via infer_vector""" return self.infer_vector(tokens, *args, **kwargs)[0] def infer_token_vector(self, tokens, *args, **kwargs) -> ...: +
"""NotImplemented""" return self.infer_vector(tokens, *args, **kwargs)[1] def save(self, config_path): + """ + save model weights in config_path + Parameter: + ---------- + config_path: str + """ with open(config_path, "w", encoding="utf-8") as wf: json.dump(self.params, wf, ensure_ascii=False, indent=2) @@ -123,6 +146,7 @@ def load(cls, config_path, *args, **kwargs): @classmethod def from_pretrained(cls, name, model_dir=MODEL_DIR, *args, **kwargs): + """NotImplemented""" raise NotImplementedError @property @@ -327,13 +351,13 @@ def infer_vector(self, items: Tuple[List[str], List[dict], str, dict], return self.t2v.infer_vector(inputs, *args, **kwargs), self.t2v.infer_tokens(inputs, *args, **kwargs) @classmethod - def from_pretrained(cls, name, model_dir=MODEL_DIR, *args, **kwargs): + def from_pretrained(cls, name, model_dir=MODEL_DIR, device='cpu', *args, **kwargs): model_path = path_append(model_dir, get_pretrained_model_info(name)[0].split('/')[-1], to_str=True) for i in [".tar.gz", ".tar.bz2", ".tar.bz", ".tar.tgz", ".tar", ".tgz", ".zip", ".rar"]: model_path = model_path.replace(i, "") logger.info("model_path: %s" % model_path) tokenizer_kwargs = {"tokenizer_config_dir": model_path} - return cls("elmo", name, pretrained_t2v=True, model_dir=model_dir, + return cls("elmo", name, pretrained_t2v=True, model_dir=model_dir, device=device, tokenizer_kwargs=tokenizer_kwargs) @@ -386,17 +410,19 @@ def infer_vector(self, items: Tuple[List[str], List[dict], str, dict], -------- vector:list """ + is_batch = isinstance(items, list) + items = items if is_batch else [items] inputs = self.tokenize(items, key=key, return_tensors=return_tensors) return self.t2v.infer_vector(inputs, *args, **kwargs), self.t2v.infer_tokens(inputs, *args, **kwargs) @classmethod - def from_pretrained(cls, name, model_dir=MODEL_DIR, *args, **kwargs): + def from_pretrained(cls, name, model_dir=MODEL_DIR, device='cpu', *args, **kwargs): model_path = path_append(model_dir, get_pretrained_model_info(name)[0].split('/')[-1], to_str=True) for i in [".tar.gz", ".tar.bz2", ".tar.bz", ".tar.tgz", ".tar", ".tgz", ".zip", ".rar"]: model_path = model_path.replace(i, "") logger.info("model_path: %s" % model_path) tokenizer_kwargs = {"tokenizer_config_dir": model_path} - return cls("bert", name, pretrained_t2v=True, model_dir=model_dir, + return cls("bert", name, pretrained_t2v=True, model_dir=model_dir, device=device, tokenizer_kwargs=tokenizer_kwargs) @@ -452,7 +478,7 @@ def infer_vector(self, items: Tuple[List[str], List[dict], str, dict], return i_vec, t_vec @classmethod - def from_pretrained(cls, name, model_dir=MODEL_DIR, **kwargs): + def from_pretrained(cls, name, model_dir=MODEL_DIR, device='cpu', **kwargs): model_path = path_append(model_dir, get_pretrained_model_info(name)[0].split('/')[-1], to_str=True) for i in [".tar.gz", ".tar.bz2", ".tar.bz", ".tar.tgz", ".tar", ".tgz", ".zip", ".rar"]: model_path = model_path.replace(i, "") @@ -461,7 +487,7 @@ def from_pretrained(cls, name, model_dir=MODEL_DIR, **kwargs): tokenizer_kwargs = { "tokenizer_config_dir": model_path, } - return cls("disenq", name, pretrained_t2v=True, model_dir=model_dir, + return cls("disenq", name, pretrained_t2v=True, model_dir=model_dir, device=device, tokenizer_kwargs=tokenizer_kwargs, **kwargs) @@ -495,18 +521,20 @@ def infer_vector(self, items: Tuple[List[str], List[dict], str, dict], token embeddings question embedding """ + is_batch = isinstance(items, list) + items = items if is_batch else [items] encodes = self.tokenize(items, key=key, meta=meta, 
@@ -495,18 +521,20 @@ def infer_vector(self, items: Tuple[List[str], List[dict], str, dict], token embeddings question embedding """ + is_batch = isinstance(items, list) + items = items if is_batch else [items] encodes = self.tokenize(items, key=key, meta=meta, *args, **kwargs) return self.t2v.infer_vector(encodes), self.t2v.infer_tokens(encodes) @classmethod - def from_pretrained(cls, name, model_dir=MODEL_DIR, *args, **kwargs): + def from_pretrained(cls, name, model_dir=MODEL_DIR, device='cpu', *args, **kwargs): model_path = path_append(model_dir, get_pretrained_model_info(name)[0].split('/')[-1], to_str=True) for i in [".tar.gz", ".tar.bz2", ".tar.bz", ".tar.tgz", ".tar", ".tgz", ".zip", ".rar"]: model_path = model_path.replace(i, "") logger.info("model_path: %s" % model_path) tokenizer_kwargs = { "tokenizer_config_dir": model_path} - return cls("quesnet", name, pretrained_t2v=True, model_dir=model_dir, + return cls("quesnet", name, pretrained_t2v=True, model_dir=model_dir, device=device, tokenizer_kwargs=tokenizer_kwargs) @@ -520,7 +548,7 @@ def from_pretrained(cls, name, model_dir=MODEL_DIR, *args, **kwargs): } -def get_pretrained_i2v(name, model_dir=MODEL_DIR): +def get_pretrained_i2v(name, model_dir=MODEL_DIR, device='cpu'): """ It is a good idea if you want to switch item to vector easily. @@ -560,4 +588,4 @@ def get_pretrained_i2v(name, model_dir=MODEL_DIR): ) _, t2v = get_pretrained_model_info(name) _class, *params = MODEL_MAP[t2v], name - return _class.from_pretrained(*params, model_dir=model_dir) + return _class.from_pretrained(*params, model_dir=model_dir, device=device) diff --git a/EduNLP/ModelZoo/__init__.py b/EduNLP/ModelZoo/__init__.py index c4607475..a1fb4b43 100644 --- a/EduNLP/ModelZoo/__init__.py +++ b/EduNLP/ModelZoo/__init__.py @@ -1,3 +1,5 @@ from .utils import * from .bert import * from .rnn import * +from .disenqnet import * +from .quesnet import * diff --git a/EduNLP/ModelZoo/base_model.py b/EduNLP/ModelZoo/base_model.py index a7ee418e..bcf3caef 100644 --- a/EduNLP/ModelZoo/base_model.py +++ b/EduNLP/ModelZoo/base_model.py @@ -31,7 +31,7 @@ def from_pretrained(cls, pretrained_model_path, *args, **kwargs): config_path = os.path.join(pretrained_model_path, "config.json") model_path = os.path.join(pretrained_model_path, "pytorch_model.bin") model = cls.from_config(config_path, *args, **kwargs) - loaded_state_dict = torch.load(model_path) + loaded_state_dict = torch.load(model_path, map_location=torch.device('cpu')) loaded_keys = loaded_state_dict.keys() expected_keys = model.state_dict().keys() diff --git a/EduNLP/ModelZoo/bert/bert.py b/EduNLP/ModelZoo/bert/bert.py index 2e86a72d..956c44bf 100644 --- a/EduNLP/ModelZoo/bert/bert.py +++ b/EduNLP/ModelZoo/bert/bert.py @@ -1,28 +1,27 @@ import torch from torch import nn -from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence -from baize.torch import load_net -import torch.nn.functional as F import json import os from ..base_model import BaseModel -from transformers.modeling_outputs import ModelOutput -from transformers import BertModel, PretrainedConfig -from typing import List, Optional +from ..utils import PropertyPredictionOutput, KnowledgePredictionOutput +from transformers import BertModel, PretrainedConfig, BertConfig +from typing import List from ..rnn.harnn import HAM -__all__ = ["BertForPropertyPrediction", "BertForKnowledgePrediction"] - -class BertForPPOutput(ModelOutput): - loss: torch.FloatTensor = None - logits: torch.FloatTensor = None +__all__ = ["BertForPropertyPrediction", "BertForKnowledgePrediction"] class BertForPropertyPrediction(BaseModel): - def __init__(self, pretrained_model_dir=None, head_dropout=0.5): + def __init__(self, pretrained_model_dir=None, head_dropout=0.5, init=True): super(BertForPropertyPrediction, self).__init__() -
self.bert = BertModel.from_pretrained(pretrained_model_dir) + bert_config = BertConfig.from_pretrained(pretrained_model_dir) + if init: + print(f'Load BertModel from checkpoint: {pretrained_model_dir}') + self.bert = BertModel.from_pretrained(pretrained_model_dir) + else: + print(f'Load BertModel from config: {pretrained_model_dir}') + self.bert = BertModel(bert_config) self.hidden_size = self.bert.config.hidden_size self.head_dropout = head_dropout self.dropout = nn.Dropout(head_dropout) @@ -30,7 +29,7 @@ def __init__(self, pretrained_model_dir=None, head_dropout=0.5): self.sigmoid = nn.Sigmoid() self.criterion = nn.MSELoss() - self.config = {k: v for k, v in locals().items() if k not in ["self", "__class__"]} + self.config = {k: v for k, v in locals().items() if k not in ["self", "__class__", "bert_config"]} self.config['architecture'] = 'BertForPropertyPrediction' self.config = PretrainedConfig.from_dict(self.config) @@ -47,44 +46,54 @@ def forward(self, loss = None if labels is not None: loss = self.criterion(logits, labels) if labels is not None else None - return BertForPPOutput( + return PropertyPredictionOutput( loss=loss, logits=logits, ) @classmethod def from_config(cls, config_path, **kwargs): + config_path = os.path.join(os.path.dirname(config_path), 'model_config.json') with open(config_path, "r", encoding="utf-8") as rf: model_config = json.load(rf) + model_config['pretrained_model_dir'] = os.path.dirname(config_path) model_config.update(kwargs) return cls( pretrained_model_dir=model_config['pretrained_model_dir'], - head_dropout=model_config.get("head_dropout", 0.5) + head_dropout=model_config.get("head_dropout", 0.5), + init=model_config.get('init', False) ) - # @classmethod - # def from_pretrained(cls): - # NotImplementedError - # # 需要验证是否和huggingface的模型兼容 + def save_config(self, config_dir): + config_path = os.path.join(config_dir, "model_config.json") + with open(config_path, "w", encoding="utf-8") as wf: + json.dump(self.config.to_dict(), wf, ensure_ascii=False, indent=2) + self.bert.config.save_pretrained(config_dir) class BertForKnowledgePrediction(BaseModel): def __init__(self, + pretrained_model_dir=None, num_classes_list: List[int] = None, num_total_classes: int = None, - pretrained_model_dir=None, head_dropout=0.5, flat_cls_weight=0.5, attention_unit_size=256, fc_hidden_size=512, beta=0.5, + init=True ): super(BertForKnowledgePrediction, self).__init__() - self.bert = BertModel.from_pretrained(pretrained_model_dir) + bert_config = BertConfig.from_pretrained(pretrained_model_dir) + if init: + print(f'Load BertModel from checkpoint: {pretrained_model_dir}') + self.bert = BertModel.from_pretrained(pretrained_model_dir) + else: + print(f'Load BertModel from config: {pretrained_model_dir}') + self.bert = BertModel(bert_config) self.hidden_size = self.bert.config.hidden_size self.head_dropout = head_dropout self.dropout = nn.Dropout(head_dropout) - self.classifier = nn.Linear(self.hidden_size, 1) self.sigmoid = nn.Sigmoid() self.criterion = nn.MSELoss() self.flat_classifier = nn.Linear(self.hidden_size, num_total_classes) @@ -101,7 +110,7 @@ def __init__(self, self.num_classes_list = num_classes_list self.num_total_classes = num_total_classes - self.config = {k: v for k, v in locals().items() if k not in ["self", "__class__"]} + self.config = {k: v for k, v in locals().items() if k not in ["self", "__class__", "bert_config"]} self.config['architecture'] = 'BertForKnowledgePrediction' self.config = PretrainedConfig.from_dict(self.config) @@ -124,15 +133,17 @@ def 
forward(self, labels = torch.sum(torch.nn.functional.one_hot(labels, num_classes=self.num_total_classes), dim=1) labels = labels.float() loss = self.criterion(logits, labels) if labels is not None else None - return BertForPPOutput( + return KnowledgePredictionOutput( loss=loss, logits=logits, ) @classmethod def from_config(cls, config_path, **kwargs): + config_path = os.path.join(os.path.dirname(config_path), 'model_config.json') with open(config_path, "r", encoding="utf-8") as rf: model_config = json.load(rf) + model_config['pretrained_model_dir'] = os.path.dirname(config_path) model_config.update(kwargs) return cls( pretrained_model_dir=model_config['pretrained_model_dir'], @@ -143,9 +154,11 @@ def from_config(cls, config_path, **kwargs): attention_unit_size=model_config.get('attention_unit_size', 256), fc_hidden_size=model_config.get('fc_hidden_size', 512), beta=model_config.get('beta', 0.5), + init=model_config.get('init', False) ) - # @classmethod - # def from_pretrained(cls): - # NotImplementedError - # # 需要验证是否和huggingface的模型兼容 + def save_config(self, config_dir): + config_path = os.path.join(config_dir, "model_config.json") + with open(config_path, "w", encoding="utf-8") as wf: + json.dump(self.config.to_dict(), wf, ensure_ascii=False, indent=2) + self.bert.config.save_pretrained(config_dir) diff --git a/EduNLP/ModelZoo/disenqnet/__init__.py b/EduNLP/ModelZoo/disenqnet/__init__.py index 975cfaff..e774edb1 100644 --- a/EduNLP/ModelZoo/disenqnet/__init__.py +++ b/EduNLP/ModelZoo/disenqnet/__init__.py @@ -1,3 +1,3 @@ # -*- coding: utf-8 -*- -from .disenqnet import DisenQNet, DisenQNetForPreTraining +from .disenqnet import * diff --git a/EduNLP/ModelZoo/disenqnet/disenqnet.py b/EduNLP/ModelZoo/disenqnet/disenqnet.py index b8dc6fca..49f4f662 100644 --- a/EduNLP/ModelZoo/disenqnet/disenqnet.py +++ b/EduNLP/ModelZoo/disenqnet/disenqnet.py @@ -8,10 +8,13 @@ import os import json from gensim.models import KeyedVectors +from typing import Optional, List from .modules import TextEncoder, AttnModel, ConceptEstimator, MIEstimator, DisenEstimator from .utils import get_mask from ..utils import set_device +from ..utils import PropertyPredictionOutput, KnowledgePredictionOutput +from ..rnn import HAM from ..base_model import BaseModel from transformers.modeling_outputs import ModelOutput from transformers import PretrainedConfig @@ -218,3 +221,145 @@ def from_config(cls, config_path, **kwargs): warmup=model_config['warmup'], n_adversarial=model_config['n_adversarial'], ) + + +class DisenQNetForPropertyPrediction(BaseModel): + base_model_prefix = 'disenq' + + def __init__(self, vocab_size: int, hidden_size: int, dropout_rate: float, wv=None, + head_dropout=0.5, **kwargs): + super(DisenQNetForPropertyPrediction, self).__init__() + self.disenq = DisenQNet( + vocab_size=vocab_size, + hidden_size=hidden_size, + dropout_rate=dropout_rate, + wv=wv, + **kwargs) + self.head_dropout = head_dropout + self.dropout = nn.Dropout(head_dropout) + self.classifier = nn.Linear(hidden_size, 1) + self.sigmoid = nn.Sigmoid() + self.criterion = nn.MSELoss() + + self.config = {k: v for k, v in locals().items() if k not in ["self", "__class__", "kwargs", 'wv']} + self.config.update(kwargs) + self.config['architecture'] = 'DisenQNetForPropertyPrediction' + self.config = PretrainedConfig.from_dict(self.config) + + def forward(self, seq_idx=None, seq_len=None, labels=None, vector_type="i") -> ModelOutput: + outputs = self.disenq(seq_idx, seq_len) + if vector_type == "k": + item_embeds = outputs.k_hidden + elif vector_type 
== "i": + item_embeds = outputs.i_hidden + else: + raise KeyError("vector_type must be one of ('k', 'i') ") + item_embeds = self.dropout(item_embeds) + + logits = self.sigmoid(self.classifier(item_embeds)) + loss = None + if labels is not None: + loss = self.criterion(logits, labels) + return PropertyPredictionOutput( + loss=loss, + logits=logits + ) + + @classmethod + def from_config(cls, config_path, **kwargs): + with open(config_path, "r", encoding="utf-8") as rf: + model_config = json.load(rf) + model_config.update(kwargs) + return cls( + vocab_size=model_config['vocab_size'], + hidden_size=model_config['hidden_size'], + dropout_rate=model_config['dropout_rate'], + head_dropout=model_config.get('head_dropout', 0.5), + ) + + +class DisenQNetForKnowledgePrediction(BaseModel): + base_model_prefix = 'disenq' + + def __init__(self, vocab_size: int, hidden_size: int, dropout_rate: float, + num_classes_list: List[int], + num_total_classes: int, + wv=None, + head_dropout: Optional[float] = 0.5, + flat_cls_weight: Optional[float] = 0.5, + attention_unit_size: Optional[int] = 256, + fc_hidden_size: Optional[int] = 512, + beta: Optional[float] = 0.5, + **kwargs): + super(DisenQNetForKnowledgePrediction, self).__init__() + self.disenq = DisenQNet( + vocab_size=vocab_size, + hidden_size=hidden_size, + dropout_rate=dropout_rate, + wv=wv, + **kwargs) + self.head_dropout = head_dropout + self.dropout = nn.Dropout(head_dropout) + self.sigmoid = nn.Sigmoid() + self.criterion = nn.MSELoss() + self.flat_classifier = nn.Linear(in_features=hidden_size, out_features=num_total_classes) + self.ham_classifier = HAM( + num_classes_list=num_classes_list, + num_total_classes=num_total_classes, + sequence_model_hidden_size=hidden_size, + attention_unit_size=attention_unit_size, + fc_hidden_size=fc_hidden_size, + beta=beta, + dropout_rate=dropout_rate + ) + self.flat_cls_weight = flat_cls_weight + self.num_classes_list = num_classes_list + self.num_total_classes = num_total_classes + + self.config = {k: v for k, v in locals().items() if k not in ["self", "__class__", "kwargs", 'wv']} + self.config.update(kwargs) + self.config['architecture'] = 'DisenQNetForKnowledgePrediction' + self.config = PretrainedConfig.from_dict(self.config) + + def forward(self, seq_idx=None, seq_len=None, labels=None, vector_type="i") -> ModelOutput: + outputs = self.disenq(seq_idx, seq_len) + if vector_type == "k": + item_embeds = outputs.k_hidden + elif vector_type == "i": + item_embeds = outputs.i_hidden + else: + raise KeyError("vector_type must be one of ('k', 'i') ") + tokens_embeds = outputs.embeded + item_embeds = self.dropout(item_embeds) + tokens_embeds = self.dropout(tokens_embeds) + flat_logits = self.sigmoid(self.flat_classifier(item_embeds)) + ham_outputs = self.ham_classifier(tokens_embeds) + ham_logits = self.sigmoid(ham_outputs.scores) + logits = self.flat_cls_weight * flat_logits + (1 - self.flat_cls_weight) * ham_logits + loss = None + if labels is not None: + labels = torch.sum(torch.nn.functional.one_hot(labels, num_classes=self.num_total_classes), dim=1) + labels = labels.float() + loss = self.criterion(logits, labels) + return KnowledgePredictionOutput( + loss=loss, + logits=logits + ) + + @classmethod + def from_config(cls, config_path, **kwargs): + with open(config_path, "r", encoding="utf-8") as rf: + model_config = json.load(rf) + model_config.update(kwargs) + return cls( + vocab_size=model_config['vocab_size'], + hidden_size=model_config['hidden_size'], + dropout_rate=model_config['dropout_rate'], + 
num_total_classes=model_config.get('num_total_classes'), + num_classes_list=model_config.get('num_classes_list'), + head_dropout=model_config.get('head_dropout', 0.5), + flat_cls_weight=model_config.get('flat_cls_weight', 0.5), + attention_unit_size=model_config.get('attention_unit_size', 256), + fc_hidden_size=model_config.get('fc_hidden_size', 512), + beta=model_config.get('beta', 0.5) + ) diff --git a/EduNLP/ModelZoo/quesnet/quesnet.py b/EduNLP/ModelZoo/quesnet/quesnet.py index c7c730fb..73b836e0 100644 --- a/EduNLP/ModelZoo/quesnet/quesnet.py +++ b/EduNLP/ModelZoo/quesnet/quesnet.py @@ -80,6 +80,7 @@ def __init__(self, _stoi=None, meta='know_name', pretrained_embs: np.ndarray = N self.config = {k: v for k, v in locals().items() if k not in ["self", "__class__", "kwargs"]} # self.config.update(kwargs) self.config["architecture"] = 'quesnet' + self.config["hidden_size"] = self.hidden_size = feat_size self.config = PretrainedConfig.from_dict(self.config) def init_h(self, batch_size): @@ -114,6 +115,10 @@ def make_batch(self, data, device, pretrain=False): ans_input = [] ans_output = [] false_options = [[] for i in range(3)] + + if not isinstance(data, list): + data = [data] + for q in data: meta = torch.zeros(len(self.stoi[self.meta])).to(device) meta[q.labels.get(self.meta) or []] = 1 @@ -192,6 +197,7 @@ def make_batch(self, data, device, pretrain=False): words = torch.cat(words, dim=0) if words else None ims = torch.cat(ims, dim=0) if ims else None metas = torch.cat(metas, dim=0) if metas else None + if pretrain: return ( lembs, rembs, words, ims, metas, wmask, imask, mmask, @@ -302,7 +308,7 @@ def __init__(self, _stoi=None, pretrained_embs: np.ndarray = None, pretrained_im self.config = PretrainedConfig.from_dict(self.config) def forward(self, batch): - left, right, words, ims, metas, wmask, imask, mmask, inputs, ans_input, ans_output, false_opt_input = batch + left, right, words, ims, metas, wmask, imask, mmask, inputs, ans_input, ans_output, false_opt_input = batch[0] # high-level loss outputs = self.quesnet(inputs) @@ -310,7 +316,8 @@ def forward(self, batch): h = outputs.hidden x = ans_input.packed() - y, _ = self.ans_decode(PackedSequence(self.quesnet.we(x.data), x.batch_sizes), + + y, _ = self.ans_decode(PackedSequence(self.quesnet.we(x[0].data), x.batch_sizes), h.repeat(self.config.layers, 1, 1)) floss = F.cross_entropy(self.ans_output(y.data), ans_output.packed().data) @@ -318,51 +325,53 @@ def forward(self, batch): torch.ones_like(self.ans_judge(y.data))) for false_opt in false_opt_input: x = false_opt.packed() - y, _ = self.ans_decode(PackedSequence(self.quesnet.we(x.data), x.batch_sizes), + if x == (None, None): + continue + y, _ = self.ans_decode(PackedSequence(self.quesnet.we(x[0].data), x.batch_sizes), h.repeat(self.config.layers, 1, 1)) floss = floss + F.binary_cross_entropy_with_logits(self.ans_judge(y.data), torch.zeros_like(self.ans_judge(y.data))) loss = floss * self.lambda_loss[1] # low-level loss - left_hid = self.quesnet(left).pack_embeded.data[:, :self.rnn_size] - right_hid = self.quesnet(right).pack_embeded.data[:, self.rnn_size:] + left_hid = self.quesnet(left).pack_embeded.data[:, :self.rnn_size].clone() + right_hid = self.quesnet(right).pack_embeded.data[:, self.rnn_size:].clone() wloss = iloss = mloss = None if words is not None: - lwfea = torch.masked_select(left_hid, wmask.unsqueeze(1).bool()) \ - .view(-1, self.rnn_size) - lout = self.lwoutput(lwfea) - rwfea = torch.masked_select(right_hid, wmask.unsqueeze(1).bool()) \ - .view(-1, self.rnn_size) - rout = 
self.rwoutput(rwfea) - out = self.woutput(torch.cat([lwfea, rwfea], dim=1)) + lwfea = torch.masked_select(left_hid.clone(), wmask.unsqueeze(1).bool()) \ + .view(-1, self.rnn_size).clone() + lout = self.lwoutput(lwfea.clone()) + rwfea = torch.masked_select(right_hid.clone(), wmask.unsqueeze(1).bool()) \ + .view(-1, self.rnn_size).clone() + rout = self.rwoutput(rwfea.clone()) + out = self.woutput(torch.cat([lwfea.clone(), rwfea.clone()], dim=1).clone()) wloss = (F.cross_entropy(out, words) + F.cross_entropy(lout, words) + F. cross_entropy(rout, words)) * self.quesnet.lambda_input[0] / 3 wloss *= self.lambda_loss[0] loss = loss + wloss if ims is not None: - lifea = torch.masked_select(left_hid, imask.unsqueeze(1).bool()) \ - .view(-1, self.rnn_size) - lout = self.lioutput(lifea) - rifea = torch.masked_select(right_hid, imask.unsqueeze(1).bool()) \ - .view(-1, self.rnn_size) - rout = self.rioutput(rifea) - out = self.ioutput(torch.cat([lifea, rifea], dim=1)) + lifea = torch.masked_select(left_hid.clone(), imask.unsqueeze(1).bool()) \ + .view(-1, self.rnn_size).clone() + lout = self.lioutput(lifea.clone()) + rifea = torch.masked_select(right_hid.clone(), imask.unsqueeze(1).bool()) \ + .view(-1, self.rnn_size).clone() + rout = self.rioutput(rifea.clone()) + out = self.ioutput(torch.cat([lifea.clone(), rifea.clone()], dim=1).clone()) iloss = (self.quesnet.ie.loss(ims, out) + self.quesnet.ie.loss(ims, lout) + self.quesnet.ie. loss(ims, rout)) * self.quesnet.lambda_input[1] / 3 iloss *= self.lambda_loss[0] loss = loss + iloss if metas is not None: - lmfea = torch.masked_select(left_hid, mmask.unsqueeze(1).bool()) \ - .view(-1, self.rnn_size) - lout = self.lmoutput(lmfea) - rmfea = torch.masked_select(right_hid, mmask.unsqueeze(1).bool()) \ - .view(-1, self.rnn_size) - rout = self.rmoutput(rmfea) - out = self.moutput(torch.cat([lmfea, rmfea], dim=1)) + lmfea = torch.masked_select(left_hid.clone(), mmask.unsqueeze(1).bool()) \ + .view(-1, self.rnn_size).clone() + lout = self.lmoutput(lmfea.clone()) + rmfea = torch.masked_select(right_hid.clone(), mmask.unsqueeze(1).bool()) \ + .view(-1, self.rnn_size).clone() + rout = self.rmoutput(rmfea.clone()) + out = self.moutput(torch.cat([lmfea.clone(), rmfea.clone()], dim=1).clone()) mloss = (self.quesnet.me.loss(metas, out) + self.quesnet.me.loss(metas, lout) + self.quesnet.me. 
loss(metas, rout)) * self.quesnet.lambda_input[2] / 3 mloss *= self.lambda_loss[0] diff --git a/EduNLP/ModelZoo/quesnet/util.py b/EduNLP/ModelZoo/quesnet/util.py index 2f03910f..b840e37a 100644 --- a/EduNLP/ModelZoo/quesnet/util.py +++ b/EduNLP/ModelZoo/quesnet/util.py @@ -11,7 +11,11 @@ def __init__(self, seqs, dtype=None, device=None): self.dtype = dtype self.device = device self.seqs = seqs - self.lens = [len(x) for x in seqs] + + if not seqs: + self.lens = [0] + else: + self.lens = [len(x) for x in seqs] self.ind = argsort(self.lens)[::-1] self.inv = argsort(self.ind) @@ -19,6 +23,7 @@ def __init__(self, seqs, dtype=None, device=None): self._prefix = [0] self._index = {} c = 0 + for i in range(self.lens[0]): for j in range(len(self.lens)): if self.lens[j] <= i: @@ -28,10 +33,16 @@ def __init__(self, seqs, dtype=None, device=None): def packed(self): ind = torch.tensor(self.ind, dtype=torch.long, device=self.device) + if not ind.numel() or ind.max() >= self.padded()[0].size(1): + return None, None padded = self.padded()[0].index_select(1, ind) return pack_padded_sequence(padded, torch.tensor(self.lens)) def padded(self, max_len=None, batch_first=False): + if not self.seqs: + return torch.empty((0, 0), dtype=self.dtype, device=self.device), \ + torch.empty((0, 0), dtype=torch.bool, device=self.device) + seqs = [torch.tensor(s, dtype=self.dtype, device=self.device) if not isinstance(s, torch.Tensor) else s for s in self.seqs] diff --git a/EduNLP/ModelZoo/rnn/__init__.py b/EduNLP/ModelZoo/rnn/__init__.py index 2ac479e2..e03987fb 100644 --- a/EduNLP/ModelZoo/rnn/__init__.py +++ b/EduNLP/ModelZoo/rnn/__init__.py @@ -2,3 +2,4 @@ # 2021/7/12 @ tongshiwei from .rnn import * +from .harnn import * diff --git a/EduNLP/ModelZoo/rnn/rnn.py b/EduNLP/ModelZoo/rnn/rnn.py index 79f3ef27..eca46efd 100644 --- a/EduNLP/ModelZoo/rnn/rnn.py +++ b/EduNLP/ModelZoo/rnn/rnn.py @@ -11,6 +11,7 @@ from typing import Optional from ..base_model import BaseModel from ..utils import torch_utils as mytorch +from ..utils import PropertyPredictionOutput, KnowledgePredictionOutput from .harnn import HAM __all__ = ["LM", "ElmoLM", "ElmoLMForPreTraining", "ElmoLMForPropertyPrediction", "ElmoLMForKnowledgePrediction"] @@ -323,11 +324,6 @@ def from_config(cls, config_path, **kwargs): ) -class PropertyPredictionOutput(ModelOutput): - loss: torch.FloatTensor = None - logits: torch.FloatTensor = None - - class ElmoLMForPropertyPrediction(BaseModel): base_model_prefix = 'elmo' @@ -385,11 +381,6 @@ def from_config(cls, config_path, **kwargs): ) -class KnowledgePredictionOutput(ModelOutput): - loss: torch.FloatTensor = None - logits: torch.FloatTensor = None - - class ElmoLMForKnowledgePrediction(BaseModel): base_model_prefix = 'elmo' @@ -435,7 +426,7 @@ def __init__(self, vocab_size: int, self.config = {k: v for k, v in locals().items() if k != "self" and k != "__class__" and k != "kwargs"} self.config.update(kwargs) - self.config['architecture'] = 'ElmoLMForPreTraining' + self.config['architecture'] = 'ElmoLMForKnowledgePrediction' self.config = PretrainedConfig.from_dict(self.config) def forward(self, seq_idx=None, seq_len=None, labels=None) -> ModelOutput: diff --git a/EduNLP/ModelZoo/utils/__init__.py b/EduNLP/ModelZoo/utils/__init__.py index 3751fcb8..418a20aa 100644 --- a/EduNLP/ModelZoo/utils/__init__.py +++ b/EduNLP/ModelZoo/utils/__init__.py @@ -7,3 +7,4 @@ from .data import load_items from .modules import MLP, TextCNN from .torch_utils import * +from .downstream_output import * diff --git 
a/EduNLP/ModelZoo/utils/downstream_output.py b/EduNLP/ModelZoo/utils/downstream_output.py new file mode 100644 index 00000000..15793ab5 --- /dev/null +++ b/EduNLP/ModelZoo/utils/downstream_output.py @@ -0,0 +1,12 @@ +import torch +from transformers.modeling_outputs import ModelOutput + + +class PropertyPredictionOutput(ModelOutput): + loss: torch.FloatTensor = None + logits: torch.FloatTensor = None + + +class KnowledgePredictionOutput(ModelOutput): + loss: torch.FloatTensor = None + logits: torch.FloatTensor = None diff --git a/EduNLP/Pretrain/__init__.py b/EduNLP/Pretrain/__init__.py index 5c736b66..6d687270 100644 --- a/EduNLP/Pretrain/__init__.py +++ b/EduNLP/Pretrain/__init__.py @@ -5,6 +5,6 @@ from .elmo_vec import * from .bert_vec import * from .quesnet_vec import QuesNetTokenizer, pretrain_quesnet, Question -from .disenqnet_vec import DisenQTokenizer, train_disenqnet +from .disenqnet_vec import * from .pretrian_utils import * from .hugginface_utils import * diff --git a/EduNLP/Pretrain/disenqnet_vec.py b/EduNLP/Pretrain/disenqnet_vec.py index d2376689..409935a6 100644 --- a/EduNLP/Pretrain/disenqnet_vec.py +++ b/EduNLP/Pretrain/disenqnet_vec.py @@ -13,10 +13,15 @@ from dataclasses import dataclass, field from ..SIF import EDU_SPYMBOLS from ..ModelZoo.disenqnet.disenqnet import DisenQNetForPreTraining +from ..ModelZoo.disenqnet.disenqnet import DisenQNetForPropertyPrediction, DisenQNetForKnowledgePrediction from ..ModelZoo.utils import load_items, pad_sequence from .pretrian_utils import PretrainedEduTokenizer +__all__ = ["DisenQTokenizer", "DisenQDataset", "train_disenqnet", "finetune_disenqnet_for_property_prediction", + "finetune_disenqnet_for_knowledge_prediction"] + + def check_num(s): # (1/2) -> 1/2 if s.startswith('(') and s.endswith(')'): @@ -156,7 +161,6 @@ def preprocess_dataset(pretrained_dir, disen_tokenizer, items, data_formation, t if not os.path.exists(concept_list_path): concepts = set() for data in items: - print(data) concept = data[data_formation["knowledge"]] for c in concept: if c not in concepts: @@ -226,6 +230,8 @@ def __getitem__(self, index): return_tensors=False, return_text=False) if self.mode in ["train", "val"]: ret['concept'] = self._list_to_onehot(item[self.data_formation["knowledge"]], self.concept_to_idx) + if self.mode in ['finetune']: + ret['labels'] = item[self.data_formation["labels"]] return ret def collate_fn(self, batch_data): @@ -443,3 +449,155 @@ def train_disenqnet(train_items: List[dict], output_dir: str, pretrained_dir: st trainer.model.save_config(output_dir) tokenizer.save_pretrained(output_dir) return output_dir
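The two fine-tuning entry points that follow mirror the existing ELMo and BERT ones. A hedged sketch of calling the property-prediction variant, assuming items that follow the default data_formation declared below; the paths and the sample item are illustrative:

    from EduNLP.Pretrain import finetune_disenqnet_for_property_prediction

    train_items = [
        # hypothetical item: "difficulty" is the default label field
        {"ques_content": "已知函数 $f(x)=x^2$ ...", "knowledge": ["函数"], "difficulty": 0.6},
    ]
    finetune_disenqnet_for_property_prediction(
        train_items,
        output_dir="./output/disenq_difficulty",      # hypothetical path
        pretrained_model="./output/disenq_pretrain",  # directory produced by train_disenqnet above
    )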
+ + +def finetune_disenqnet_for_property_prediction(train_items, + output_dir, + pretrained_model, + eval_items=None, + tokenizer_params=None, + data_params=None, + train_params=None, + model_params=None + ): + """ + Parameters + ---------- + train_items: list, required + The training corpus, each item could be str or dict + output_dir: str, required + The directory to save trained model files + pretrained_model: str, required + The pretrained model name or path for model and tokenizer + eval_items: list, optional + The evaluating items, each item could be str or dict + tokenizer_params: dict, optional, default=None + The parameters passed to DisenQTokenizer + data_params: dict, optional, default=None + The parameters passed to DisenQDataset and DisenQTokenizer + model_params: dict, optional, default=None + The parameters passed to DisenQNetForPropertyPrediction + train_params: dict, optional, default=None + The parameters passed to Trainer + """ + tokenizer_params = tokenizer_params if tokenizer_params else {} + data_params = data_params if data_params is not None else {} + model_params = model_params if model_params is not None else {} + train_params = train_params if train_params is not None else {} + default_data_formation = { + "ques_content": "ques_content", + "knowledge": "knowledge", + "labels": "difficulty" + } + data_formation = data_params.get("data_formation", None) + if data_formation is not None: + default_data_formation.update(data_formation) + data_formation = default_data_formation + # tokenizer configuration + tokenizer = DisenQTokenizer.from_pretrained(pretrained_model, **tokenizer_params) + # dataset configuration + train_dataset = DisenQDataset(train_items, tokenizer, data_formation, + mode="finetune") + if eval_items: + eval_dataset = DisenQDataset(eval_items, tokenizer, data_formation, + mode="finetune") + else: + eval_dataset = None + # model configuration + model = DisenQNetForPropertyPrediction.from_pretrained(pretrained_model, **model_params) + # training configuration + work_train_params = deepcopy(DEFAULT_TRAIN_PARAMS) + work_train_params["output_dir"] = output_dir + if train_params is not None: + work_train_params.update(train_params if train_params else {}) + if model_params: + if 'hidden_size' in model_params: + work_train_params['hidden_size'] = model_params['hidden_size'] + work_args = DisenQTrainingArguments(**work_train_params) + trainer = Trainer( + model=model, + args=work_args, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + data_collator=train_dataset.collate_fn, + ) + trainer.train() + # trainer.model.save_pretrained(output_dir) + trainer.save_model(output_dir) + trainer.model.save_config(output_dir) + tokenizer.save_pretrained(output_dir) + + +def finetune_disenqnet_for_knowledge_prediction(train_items, + output_dir, + pretrained_model, + eval_items=None, + tokenizer_params=None, + data_params=None, + train_params=None, + model_params=None + ): + """ + Parameters + ---------- + train_items: list, required + The training corpus, each item could be str or dict + output_dir: str, required + The directory to save trained model files + pretrained_model: str, required + The pretrained model name or path for model and tokenizer + eval_items: list, optional + The evaluating items, each item could be str or dict + tokenizer_params: dict, optional, default=None + The parameters passed to DisenQTokenizer + data_params: dict, optional, default=None + The parameters passed to DisenQDataset and DisenQTokenizer + model_params: dict, optional, default=None + The parameters passed to DisenQNetForKnowledgePrediction + train_params: dict, optional, default=None + The parameters passed to Trainer + """ + tokenizer_params = tokenizer_params if tokenizer_params else {} + data_params = data_params if data_params is not None else {} + model_params = model_params if model_params is not None else {} + train_params = train_params if train_params is not None else {} + default_data_formation = { + "ques_content": "ques_content", + "knowledge": "knowledge", + "labels": "know_list" + } + data_formation = data_params.get("data_formation", None) + if data_formation is not None: + default_data_formation.update(data_formation) + data_formation = default_data_formation + # tokenizer configuration + tokenizer = DisenQTokenizer.from_pretrained(pretrained_model, **tokenizer_params) + # dataset configuration + train_dataset = DisenQDataset(train_items, tokenizer, data_formation, + mode="finetune") + if eval_items: + eval_dataset = DisenQDataset(eval_items, tokenizer, data_formation, +
mode="finetune") + else: + eval_dataset = None + # model configuration + model = DisenQNetForKnowledgePrediction.from_pretrained(pretrained_model, **model_params) + # training configuration + work_train_params = deepcopy(DEFAULT_TRAIN_PARAMS) + work_train_params["output_dir"] = output_dir + if train_params is not None: + work_train_params.update(train_params if train_params else {}) + if model_params: + if 'hidden_size' in model_params: + work_train_params['hidden_size'] = model_params['hidden_size'] + work_args = DisenQTrainingArguments(**work_train_params) + trainer = Trainer( + model=model, + args=work_args, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + data_collator=train_dataset.collate_fn, + ) + trainer.train() + # trainer.model.save_pretrained(output_dir) + trainer.save_model(output_dir) + trainer.model.save_config(output_dir) + tokenizer.save_pretrained(output_dir) diff --git a/EduNLP/Pretrain/quesnet_vec.py b/EduNLP/Pretrain/quesnet_vec.py index da3899b6..a331157b 100644 --- a/EduNLP/Pretrain/quesnet_vec.py +++ b/EduNLP/Pretrain/quesnet_vec.py @@ -1,39 +1,31 @@ -"""Pre-process input text, tokenizing, building vocabs, and pre-train word -level vectors.""" +from .pretrian_utils import PretrainedEduTokenizer +from ..SIF.segment.segment import FigureSegment +from ..ModelZoo.quesnet import QuesNetForPreTraining, AE +from EduNLP import logger -import os -import logging -from pickle import NONE -import warnings -import numpy as np -import torch from torch.utils.data import DataLoader, Dataset -import signal -import threading -from tqdm import tqdm -from functools import partial -from collections import namedtuple -from copy import copy -import json -import math -import queue -import random -from PIL import Image from torchvision.transforms.functional import to_grayscale from torchvision.transforms.functional import to_tensor from gensim.models import Word2Vec -from ..SIF.segment.segment import FigureSegment -from ..SIF.segment import seg -from ..SIF.tokenization import tokenize -from ..SIF import EDU_SPYMBOLS -from ..ModelZoo.quesnet import QuesNetForPreTraining, AE -from .pretrian_utils import PretrainedEduTokenizer -from EduNLP import logger +import torch + +import warnings +import queue +import random +import math +import threading +import logging +import signal +import os +import json +import copy import linecache +import numpy as np +from PIL import Image from typing import List, Union, Optional - -Question = namedtuple('Question', - ['id', 'content', 'answer', 'false_options', 'labels']) +from collections import namedtuple +from functools import partial +from tqdm import tqdm def save_list(item2index, path): @@ -44,6 +36,15 @@ def save_list(item2index, path): return +def clip(v, low, high): + return max(low, min(v, high)) + + +# Basic unit of Dataset +Question = namedtuple('Question', + ['id', 'content', 'answer', 'false_options', 'labels']) + + class QuesNetTokenizer(PretrainedEduTokenizer): """ Examples @@ -156,12 +157,27 @@ def _convert_to_ids(self, item: Union[str, dict, list], key=lambda x: x, token_item = self.tokenize(item, key) token_idx = [] for _, w in enumerate(token_item): - if isinstance(w, FigureSegment): + if isinstance(w, FigureSegment) and isinstance(item, dict) and 'ques_figure_ids' in item.keys(): # image + try: - im = Image.open(os.path.join(self.img_dir, f'{w.src[10:-1]}.png')) + fig_id = f"{w.src[10:-1]}" + fig_index = item['ques_figure_ids'].index(fig_id) + + if self.img_dir is not None: + fig_src = os.path.join(self.img_dir, fig_id) + if 
'.png' in item['ques_figure_paths'][fig_index]: + fig_src += '.png' + elif '.jpg' in item['ques_figure_paths'][fig_index]: + fig_src += '.jpg' + else: + fig_src = item['ques_figure_paths'][fig_index] + + print(f"Open figure {fig_src}") + im = Image.open(fig_src) im = im.resize((56, 56)) token_idx.append(to_grayscale(im)) + except Exception: warnings.warn('Open image error!') token_idx.append(self.stoi['word'][self.img_token]) @@ -303,165 +319,135 @@ def set_img_dir(self, path): self.img_dir = path -def clip(v, low, high): - if v < low: - v = low - if v > high: - v = high - return v +class QuesnetDataset(Dataset): + ''' + Quesnet-specific datasets + ''' + def __init__( + self, + filename: str, + tokenizer: QuesNetTokenizer = None, + img_dir: str = "", + meta: Optional[list] = None, + content_key=lambda x: x['ques_content'], + meta_key=lambda x: x['know_name'], + answer_key=lambda x: x['ques_answer'], + option_key=lambda x: x['ques_options'], + pipeline=None, + skip=0 + ): - -class Lines: - def __init__(self, filename, skip=0, preserve_newline=False): self.filename = filename - with open(filename, "r", encoding="utf-8") as f: - self.length = len(f.readlines()) - skip - assert self.length > 0, f'{filename} is empty. Or file length is less than skip length.' self.skip = skip - self.preserve_newline = preserve_newline - - def __len__(self): - return self.length - - def __iter__(self): - for i in range(len(self)): - yield self[i] - - def __getitem__(self, item): - d = self.skip + 1 - if isinstance(item, int): - if item < self.length: - line = linecache.getline(self.filename, - item % len(self) + d) - if self.preserve_newline: - return json.loads(line) - else: - return json.loads(line.strip('\r\n')) - - elif isinstance(item, slice): - low = 0 if item.start is None else item.start - low = clip(low, -len(self), len(self) - 1) - if low < 0: - low += len(self) - high = len(self) if item.stop is None else item.stop - high = clip(high, -len(self), len(self)) - if high < 0: - high += len(self) - ls = [] - for i in range(low, high): - line = linecache.getline(self.filename, i + d) - if not self.preserve_newline: - line = line.strip('\r\n') - ls.append(json.loads(line)) - - return ls - - raise IndexError('index must be int or slice') - - -class QuestionLoader: - def __init__(self, ques: Lines, tokenizer: QuesNetTokenizer, - pipeline=None, range=None, meta: Optional[list] = None, - content_key=lambda x: x['ques_content'], - meta_key=lambda x: x['know_name'], - answer_key=lambda x: x['ques_answer'], - option_key=lambda x: x['ques_options'], - skip=0 - ): - """ Read question file as data list. Same behavior on same file. 
- - Parameters - ---------- - ques_file : str - path of question file - tokenizer : QuesNetTokenizer - pipeline : _type_, optional - _description_, by default None - range : _type_, optional - _description_, by default None - content_key : function, optional - by default lambda x:x['ques_content'] - meta_key : function, optional - by default lambda x:x['know_name'] - answer_key: function, optional - by default lambda x:x['ques_answer'] - option_key: function, optional - by default lambda x:x['ques_options'] - skip: int, optional - skip the first several lines, by default 0 - """ - self.range = None - self.ques = ques - self.range = range or slice(0, len(self), skip) - self.img_dir = tokenizer.img_dir - self.labels = [] - self.stoi = tokenizer.stoi - self.tokenizer = tokenizer - + self.img_dir = img_dir self.content_key = content_key - self.meta = meta if meta else tokenizer.meta self.meta_key = meta_key self.answer_key = answer_key self.option_key = option_key - self.pipeline = pipeline - def split_(self, split_ratio): - first_size = int(len(self) * (1 - split_ratio)) - other = copy(self) - self.range = slice(0, first_size, 1) - other.range = slice(first_size, len(other), 1) - return other + if tokenizer is None: + tokenizer = QuesNetTokenizer( + meta=['know_name'], + img_dir=img_dir + ) + self.tokenizer = tokenizer + self.meta = meta if meta else tokenizer.meta + self.load_data_lines() + tokenizer.set_vocab( + self.lines, + key=lambda x: x['ques_content'], + trim_min_count=2, + silent=False + ) + tokenizer.set_meta_vocab(self.lines, silent=False) + + def load_data_lines(self): + '''Read data by row from a JSON file + + Important: the data file is loaded during initialization. + ''' + + # TODO: All data is read into memory without chunking. + # This may lead to low efficiency. + data_dir = self.filename + skip = self.skip # Read from Line skip + 1. + self.lines = [] + self.length = 0 + + with open(data_dir, "r", encoding="utf-8") as f: + row = 0 + while True: + row += 1 + line = f.readline() + if row <= skip: + continue + if not line: + break + self.lines.append(json.loads(line.strip())) + + self.length = row - skip - 1 + assert self.length > 0, f'{data_dir} is empty. Or file length is less than skip length.' 
def __len__(self): - return len(self.ques) if self.range is None \ - else self.range.stop - self.range.start + return len(self.lines) - def __getitem__(self, x): - if isinstance(x, int): - x += self.range.start - item = slice(x, x + 1, 1) - else: - item = slice(x.start + self.range.start, - x.stop + self.range.start, 1) - qs = [] - if item.start > len(self): - raise IndexError - for line in self.ques[item]: - q = line - qid = q['ques_id'] - token = self.tokenizer(q, key=self.content_key, meta=self.meta) + def __getitem__(self, index): + if isinstance(index, int): + line = self.lines[index] + + qid = line['ques_id'] + token = self.tokenizer(line, key=self.content_key, meta=self.meta) content = token['seq_idx'] meta = token['meta_idx'] - if self.answer_key(q).isalpha() and len(self.answer_key(q)) == 1 and ord(self.answer_key(q)) < 128 and len( - self.option_key(q)) > 0: - answer_idx = ord(self.answer_key(q).upper()) - ord('A') - options = self.option_key(q) - answer = self.tokenizer(options.pop(answer_idx), meta=self.meta) - answer = answer['seq_idx'] + + if self.answer_key(line).isalpha() and len(self.answer_key(line)) == 1 \ + and ord(self.answer_key(line)) < 128 and len(self.option_key(line)) > 0: + answer_idx = ord(self.answer_key(line).upper()) - ord('A') + options = self.option_key(line) + answer = self.tokenizer(options.pop(answer_idx), meta=self.meta)['seq_idx'] false_options = [(self.tokenizer(option, meta=self.meta))['seq_idx'] for option in options] - qs.append(Question(qid, content, answer, false_options, meta)) else: - answer = (self.tokenizer(self.answer_key(q), meta=self.meta))['seq_idx'] - qs.append(Question(qid, content, answer, [[0], [0], [0]], meta)) + answer = (self.tokenizer(self.answer_key(line), meta=self.meta))['seq_idx'] + false_options = [[0], [0], [0]] + + qs = Question( + id=qid, + content=content, + answer=answer, + false_options=false_options, + labels=meta + ) + + if callable(self.pipeline): + qs = self.pipeline(qs) - if callable(self.pipeline): - qs = self.pipeline(qs) - if isinstance(x, int): - return qs[0] - else: return qs + elif isinstance(index, slice): + results = [] + for i in range(*index.indices(len(self))): + results.append(self[i]) + return results -def optimizer(*models, **kwargs): - _cur_optim = [m.optim_cls(m.parameters(), **kwargs) - if hasattr(m, 'optim_cls') - else torch.optim.Adam(m.parameters(), **kwargs) - for m in models] - if len(_cur_optim) == 1: - return _cur_optim[0] - else: - return _cur_optim + else: + raise TypeError('Invalid argument type. 
Index type should be int or slice.') + + +class EmbeddingDataset(Dataset): + def __init__(self, data, data_type='image'): + self.data = data + self.data_type = data_type + assert self.data_type in ['image', 'meta'] + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + if self.data_type == 'image': + return to_tensor(self.data[idx]) + elif self.data_type == 'meta': + return self.data[idx] class PrefetchIter: @@ -527,27 +513,6 @@ def produce(self): return -class EmbeddingDataset(Dataset): - def __init__(self, data, data_type='image'): - self.data = data - self.data_type = data_type - assert self.data_type in ['image', 'meta'] - - def __len__(self): - return len(self.data) - - def __getitem__(self, idx): - if self.data_type == 'image': - return to_tensor(self.data[idx]) - elif self.data_type == 'meta': - return self.data[idx] - - -def pretrain_iter(ques, batch_size): - _cur_iter = PrefetchIter(ques, batch_size=batch_size) - return _cur_iter - - sigint_handler = signal.getsignal(signal.SIGINT) @@ -589,7 +554,27 @@ def pretrain_embedding_layer(dataset: EmbeddingDataset, ae: AE, lr: float = 1e-3 return ae -def pretrain_quesnet(path, output_dir, img_dir=None, save_embs=False, train_params=None): +def optimizer(*models, **kwargs): + _cur_optim = [ + m.optim_cls(m.parameters(), **kwargs) + if hasattr(m, 'optim_cls') + else torch.optim.Adam(m.parameters(), **kwargs) for m in models + ] + if len(_cur_optim) == 1: + return _cur_optim[0] + else: + return _cur_optim + + +def pretrain_quesnet( + path, + output_dir, + pretrain_dir=None, + img_dir=None, + save_embs=False, + load_embs=False, + train_params=None +): """ pretrain quesnet Parameters ---------- @@ -602,6 +587,8 @@ def pretrain_quesnet(path, output_dir, img_dir=None, save_embs=False, train_para quesnet tokenizer save_embs : bool, optional whether to save pretrained word/image/meta embeddings separately + load_embs : bool, optional + whether to load pretrained word/image/meta embeddings separately train_params : dict, optional the training parameters and model parameters, by default None - "n_epochs": int, default = 1 @@ -633,6 +620,9 @@ def pretrain_quesnet(path, output_dir, img_dir=None, save_embs=False, train_para >>> tokenizer.set_vocab(items, key=lambda x: x['ques_content'], trim_min_count=1, silent=True) >>> pretrain_quesnet('./data/standard_luna_data.json', './testQuesNet', tokenizer) # doctest: +SKIP """ default_train_params = { # train params "n_epochs": 1, @@ -649,21 +639,13 @@ def pretrain_quesnet(path, output_dir, img_dir=None, save_embs=False, train_para if train_params is not None: default_train_params.update(train_params) train_params = default_train_params os.makedirs(output_dir, exist_ok=True) device = torch.device(train_params['device']) - # set tokenizer - tokenizer = QuesNetTokenizer(meta=['know_name'], - img_dir=img_dir) - items = Lines(path, skip=1) - - tokenizer.set_vocab(items, key=lambda x: x['ques_content'], - trim_min_count=2, silent=False) - tokenizer.set_meta_vocab(items, silent=False) - tokenizer.save_pretrained(output_dir) + dataset = QuesnetDataset(path, img_dir=img_dir) + tokenizer = dataset.tokenizer + tokenizer.save_pretrained(output_dir) model = QuesNetForPreTraining(_stoi=tokenizer.stoi, feat_size=train_params['feat_size'], emb_size=train_params['emb_size']).to(device) + emb_dict = tokenizer.stoi['word'] emb_dict_rev = tokenizer.itos['word'] emb_size = 
train_params['emb_size'] @@ -671,7 +653,7 @@ def pretrain_quesnet(path, output_dir, img_dir=None, save_embs=False, train_para w2v_corpus = [] img_corpus = [] meta_corpus = [] - for i, qs in enumerate(tqdm(ques_dl)): + for _, qs in enumerate(tqdm(dataset)): text_content = [] for c in qs.content: if isinstance(c, int): @@ -691,50 +673,75 @@ def pretrain_quesnet(path, output_dir, img_dir=None, save_embs=False, train_para meta_corpus.append(meta_vector) # train word2vec for text embedding - gensim_w2v = Word2Vec(sentences=[[item] for item in emb_dict.keys()], min_count=1, - vector_size=emb_size) - gensim_w2v.init_weights() - gensim_w2v.train(corpus_iterable=w2v_corpus, total_examples=len(w2v_corpus), epochs=train_params['n_epochs']) - w2v_emb = gensim_w2v.syn1neg - emb_weights = [] - for key, item in emb_dict.items(): - w2v_index = gensim_w2v.wv.key_to_index[key] - emb_weights.append(w2v_emb[w2v_index]) - emb_weights = np.array(emb_weights) - model.quesnet.load_emb(emb_weights) + if pretrain_dir is not None and load_embs: + model.quesnet.load_emb(np.load(os.path.join(pretrain_dir, 'w2v_embs.npy'))) + else: + gensim_w2v = Word2Vec( + sentences=[[item] for item in emb_dict.keys()], + min_count=1, + vector_size=emb_size + ) + gensim_w2v.init_weights() + gensim_w2v.train(corpus_iterable=w2v_corpus, total_examples=len(w2v_corpus), epochs=train_params['n_epochs']) + w2v_emb = gensim_w2v.syn1neg + emb_weights = [] + for key, item in emb_dict.items(): + w2v_index = gensim_w2v.wv.key_to_index[key] + emb_weights.append(w2v_emb[w2v_index]) + emb_weights = np.array(emb_weights) + model.quesnet.load_emb(emb_weights) + if save_embs: + np.save(os.path.join(output_dir, 'w2v_embs.npy'), emb_weights) logger.info('quesnet Word Embedding loaded') - if save_embs: - np.save(os.path.join(output_dir, 'w2v_embs.npy'), emb_weights) # train auto-encoder loss for image embedding - img_dataset = EmbeddingDataset(data=img_corpus, data_type='image') - trained_ie = pretrain_embedding_layer(dataset=img_dataset, ae=model.quesnet.ie, lr=train_params['lr'], - log_step=train_params['log_steps'], batch_size=train_params['batch_size'], - epochs=train_params['n_epochs'], device=device) - model.quesnet.load_img(trained_ie) + if pretrain_dir is not None and load_embs: + model.quesnet.load_img(torch.load(os.path.join(pretrain_dir, 'trained_ie.pt'))) + else: + img_dataset = EmbeddingDataset(data=img_corpus, data_type='image') + trained_ie = pretrain_embedding_layer( + dataset=img_dataset, + ae=model.quesnet.ie, + lr=train_params['lr'], + log_step=train_params['log_steps'], + batch_size=train_params['batch_size'], + epochs=train_params['n_epochs'], + device=device + ) + if save_embs: + torch.save(trained_ie.state_dict(), os.path.join(output_dir, 'trained_ie.pt')) + model.quesnet.load_img(trained_ie) logger.info('quesnet Image Embedding loaded') - if save_embs: - torch.save(trained_ie.state_dict(), os.path.join(output_dir, 'trained_ie.pt')) # train auto-encoder loss for meta embedding - meta_dateset = EmbeddingDataset(data=meta_corpus, data_type='meta') - trained_me = pretrain_embedding_layer(dataset=meta_dateset, ae=model.quesnet.me, lr=train_params['lr'], - log_step=train_params['log_steps'], batch_size=train_params['batch_size'], - epochs=train_params['n_epochs'], device=device) - model.quesnet.load_meta(trained_me) + if pretrain_dir is not None and load_embs: + model.quesnet.load_meta(torch.load(os.path.join(pretrain_dir, 'trained_me.pt'))) + else: + meta_dataset = EmbeddingDataset(data=meta_corpus, data_type='meta') + trained_me = 
pretrain_embedding_layer( + dataset=meta_dataset, + ae=model.quesnet.me, + lr=train_params['lr'], + log_step=train_params['log_steps'], + batch_size=train_params['batch_size'], + epochs=train_params['n_epochs'], + device=device + ) + if save_embs: + torch.save(trained_me.state_dict(), os.path.join(output_dir, 'trained_me.pt')) + model.quesnet.load_meta(trained_me) logger.info('quesnet Meta Embedding loaded') - if save_embs: - torch.save(trained_me.state_dict(), os.path.join(output_dir, 'trained_me.pt')) logger.info("quesnet Word, Image and Meta Embeddings training is done") + # DONE for datasets # HLM and DOO training - ques_dl.pipeline = partial(model.quesnet.make_batch, device=device, pretrain=True) + dataset.pipeline = partial(model.quesnet.make_batch, device=device, pretrain=True) model.train() optim = optimizer(model, lr=train_params['lr']) n_batches = 0 for epoch in range(0, train_params['n_epochs']): - train_iter = pretrain_iter(ques_dl, train_params['batch_size']) + train_iter = PrefetchIter(dataset, batch_size=train_params['batch_size']) bar = enumerate(tqdm(train_iter, initial=train_iter.pos), train_iter.pos) for i, batch in critical(bar):
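With the pretrain_dir and load_embs arguments added above, the separately pretrained word/image/meta embeddings can be reused across runs instead of being retrained. A hedged sketch of the intended round trip (all paths are illustrative; the first call dumps w2v_embs.npy, trained_ie.pt and trained_me.pt into its output_dir):

    from EduNLP.Pretrain import pretrain_quesnet

    # first run: train the embeddings and save them next to the model
    pretrain_quesnet('./data/quesnet_data.json', './output/quesnet_v1',
                     img_dir='./data/imgs', save_embs=True,
                     train_params={'device': 'cuda'})
    # later run: load the dumped embeddings instead of retraining them
    pretrain_quesnet('./data/quesnet_data.json', './output/quesnet_v2',
                     pretrain_dir='./output/quesnet_v1', img_dir='./data/imgs',
                     load_embs=True, train_params={'device': 'cuda'})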
+ 1e-10)

-    def infer_tokens(self, items: dict, return_special_tokens=False) -> torch.Tensor:
+    def infer_tokens(self, items: dict, return_special_tokens=False, **kwargs) -> torch.Tensor:
         tokens = self(items)
         if return_special_tokens:
             # include embedding of [CLS] and [SEP]
diff --git a/EduNLP/Vector/disenqnet/disenqnet.py b/EduNLP/Vector/disenqnet/disenqnet.py
index edff4412..61cd0d30 100644
--- a/EduNLP/Vector/disenqnet/disenqnet.py
+++ b/EduNLP/Vector/disenqnet/disenqnet.py
@@ -1,8 +1,9 @@
 import torch
 from EduNLP.ModelZoo.disenqnet.disenqnet import DisenQNet
+from EduNLP.Vector.meta import Vector


-class DisenQModel(object):
+class DisenQModel(Vector):
     def __init__(self, pretrained_dir, device="cpu"):
         """
         Parameters
         ----------
@@ -13,10 +14,11 @@ def __init__(self, pretrained_dir, device="cpu"):
             cpu or cuda, default is cpu
         """
         self.device = device
-        self.model = DisenQNet.from_pretrained(pretrained_dir)
-        self.model.to(self.device)
+        self.model = DisenQNet.from_pretrained(pretrained_dir).to(self.device)
+        self.model.eval()

-    def __call__(self, items: dict, **kwargs):
+    def __call__(self, items: dict):
+        self.cuda_tensor(items)
         outputs = self.model(**items)
         return outputs.embeded, outputs.k_hidden, outputs.i_hidden
@@ -42,6 +44,16 @@ def infer_vector(self, items: dict, vector_type=None, **kwargs) -> torch.Tensor:

     def infer_tokens(self, items: dict, **kwargs) -> torch.Tensor:
+        """
+        get token embeddings with DisenQModel
+        Parameters
+        ----------
+        items: dict
+            {'content_idx': tensor(), 'content_len': tensor()}, the tokenized question produced by the tokenizer
+
+        Returns:
+            torch.Tensor: token embeddings
+        """
         embeded, _, _ = self(items)
         return embeded

     @property
diff --git a/EduNLP/Vector/elmo_vec.py b/EduNLP/Vector/elmo_vec.py
index 84146534..053ffad7 100644
--- a/EduNLP/Vector/elmo_vec.py
+++ b/EduNLP/Vector/elmo_vec.py
@@ -1,58 +1,56 @@
-from pathlib import PurePath
-import os
 import torch
-import torch.optim as optim
-import torch.nn as nn
-import torch.utils.data as tud
-import numpy as np
-from torch.nn.utils.rnn import pad_sequence
-import torch.nn.functional as F
-from EduNLP.Pretrain import train_elmo, ElmoTokenizer
 from EduNLP.ModelZoo.rnn import ElmoLM
 from .meta import Vector
-import json
-from typing import Dict, List, Tuple


 class ElmoModel(Vector):
-    def __init__(self, pretrained_dir: str):
+    def __init__(self, pretrained_dir: str, device="cpu"):
         """
         Parameters
         ----------
-        pretrained_model_path: str
+        pretrained_dir: str
+        device: str
+            cpu or cuda, default is cpu
         """
         super(ElmoModel, self).__init__()
-        self.model = ElmoLM.from_pretrained(pretrained_dir)
+        self.device = device
+        self.model = ElmoLM.from_pretrained(pretrained_dir).to(device)
         self.model.eval()

-    def __call__(self, *args, **kwargs):
-        return self.infer_vector(*args, **kwargs)
-
-    def infer_vector(self, items: Tuple[dict, List[dict]], *args, **kwargs) -> torch.Tensor:
-        # TODO: handle batch and unbatch format for inputs and outputs
-        # is_batch = isinstance(items, list)
-        # items = items if is_batch else [items]
+    def __call__(self, items: dict):
+        self.cuda_tensor(items)
         outputs = self.model(**items)
+        return outputs
+
+    def infer_vector(self, items: dict, **kwargs) -> torch.Tensor:
+        """
+        get sentence embedding with ElmoModel
+        Parameters
+        ----------
+        items: dict, {'seq_idx': tensor(), 'seq_len': tensor()}, the tokenized question produced by the tokenizer
+
+        Returns:
+            torch.Tensor: sentence embedding
+        """
+        outputs = self(items)
         item_embeds = torch.cat(
             (outputs.forward_output[torch.arange(len(items["seq_len"])), torch.tensor(items["seq_len"]) - 1],
outputs.backward_output[torch.arange(len(items["seq_len"])), 0]), dim=-1) return item_embeds - def infer_tokens(self, items, *args, **kwargs) -> torch.Tensor: - # is_batch = isinstance(items, list) - outputs = self.model(**items) + def infer_tokens(self, items, **kwargs) -> torch.Tensor: + """ + get tokens embedding with ElmoModel + Parameters + ---------- + items: dict, {'seq_idx': tensor()}, the tokens about question after tokenizer processing + + Returns: + torch.Tensor: token embedding + """ + outputs = self(items) forward_hiddens = outputs.forward_output backward_hiddens = outputs.backward_output return torch.cat((forward_hiddens, backward_hiddens), dim=-1) - # if is_batch: - # ret = [] - # for fh, bh, lg in zip(forward_hiddens, backward_hiddens, items.seq_len): - # _bh = torch.cat((torch.flip(bh[:lg], [0]), bh[lg:]), dim=0) - # ret.append(torch.cat((fh, _bh), dim=-1)) - # return torch.stack(tuple(ret)) - # else: - # return torch.cat((forward_hiddens[0], torch.flip(backward_hiddens, [1])[0]), dim=-1) @property def vector_size(self): diff --git a/EduNLP/Vector/gensim_vec.py b/EduNLP/Vector/gensim_vec.py index ec616e3d..cc8ae9df 100644 --- a/EduNLP/Vector/gensim_vec.py +++ b/EduNLP/Vector/gensim_vec.py @@ -23,7 +23,7 @@ class W2V(Vector): other(Word2Vec) binary: bool """ - def __init__(self, filepath, method=None, binary=None): + def __init__(self, filepath, method=None, binary=None, **kwargs): fp = PurePath(filepath) self.binary = binary if binary is not None else (True if fp.suffix == ".bin" else False) if self.binary is True: @@ -65,11 +65,33 @@ def __getitem__(self, item): index = self.key_to_index(item) return self.wv[item] if index not in self.constants.values() else np.zeros((self.vector_size,)) - def infer_vector(self, items, agg="mean", *args, **kwargs) -> list: - token_vectors = self.infer_tokens(items, *args, **kwargs) - return [eval("np.%s" % agg)(item, axis=0) for item in token_vectors] - - def infer_tokens(self, items, *args, **kwargs) -> list: + def infer_vector(self, items, agg="mean", **kwargs) -> list: + """ + get sentence embedding with word2vec model + Parameters + ---------- + item: list, the tokens after tokenizer processing + Return + ------ + vector: list + [array(), ..., array()] + """ + token_vectors = self.infer_tokens(items, **kwargs) + # return [eval("np.%s" % agg)(item, axis=0) if item else np.array([]) for item in token_vectors] + return [eval("np.%s" % agg)(item, axis=0) if item else np.zeros(self.vector_size,) for item in token_vectors] + + def infer_tokens(self, items, **kwargs) -> list: + """ + get token embedding with word2vec model + Parameters + ---------- + item: list + the tokens after tokenizer processing + Return + ------ + vector: list + [[array(), ..., array()], [...], [...]] + """ return [list(self(*item)) for item in items] @@ -94,6 +116,17 @@ def __init__(self, filepath): self.dictionary = corpora.Dictionary.load(filepath) def infer_vector(self, item, return_vec=False): + """ + get Bow vector + Parameters + ---------- + item: list + the tokens after tokenizer processing + Return + ------ + vector: list + [array(), ..., array()] + """ item = self.dictionary.doc2bow(item) if not return_vec: return item # return dic as default @@ -120,6 +153,17 @@ def __init__(self, filepath): self.dictionary = corpora.Dictionary.load(dictionary_path) def infer_vector(self, item, return_vec=False): + """ + get Tf-idf vector + Parameters + ---------- + item: list + the tokens after tokenizer processing + Return + ------ + vector: list + [array(), ..., array()] 
+ """ dic_item = self.dictionary.doc2bow(item) tfidf_item = self.tfidf_model[dic_item] # return dic as default @@ -152,7 +196,7 @@ class D2V(Vector): --------- d2v model:D2V """ - def __init__(self, filepath, method="d2v"): + def __init__(self, filepath, method="d2v", **kwargs): self._method = method self._filepath = filepath if self._method == "d2v": @@ -180,7 +224,22 @@ def vector_size(self): return self.d2v.vector_size def infer_vector(self, items, *args, **kwargs) -> list: + """ + get vector with D2V model + Parameters + ---------- + item: list + the tokens after tokenizer processing + Return + ------ + vector: list + [array(), ..., array()] + """ return [self(item) for item in items] def infer_tokens(self, item, *args, **kwargs) -> ...: + """ + get token embeddings with D2V + NotImplemented + """ raise NotImplementedError diff --git a/EduNLP/Vector/meta.py b/EduNLP/Vector/meta.py index b87e90fe..83b961d5 100644 --- a/EduNLP/Vector/meta.py +++ b/EduNLP/Vector/meta.py @@ -1,5 +1,7 @@ # coding: utf-8 # 2021/7/13 @ tongshiwei +import torch + class Vector(object): def infer_vector(self, items, *args, **kwargs) -> ...: @@ -18,3 +20,8 @@ def is_frozen(self): # pragma: no cover def freeze(self, *args, **kwargs): # pragma: no cover pass + + def cuda_tensor(self, items: dict): + for k, v in items.items(): + if isinstance(v, torch.Tensor): + items[k] = v.to(self.device) diff --git a/EduNLP/Vector/quesnet/quesnet.py b/EduNLP/Vector/quesnet/quesnet.py index cd845461..b8728c82 100644 --- a/EduNLP/Vector/quesnet/quesnet.py +++ b/EduNLP/Vector/quesnet/quesnet.py @@ -2,10 +2,11 @@ from typing import Union from EduNLP.ModelZoo.quesnet import QuesNet from EduNLP.Pretrain import Question, QuesNetTokenizer +from EduNLP.Vector.meta import Vector -class QuesNetModel(object): - def __init__(self, pretrained_dir, img_dir=None, device="cpu", **kwargs): +class QuesNetModel(Vector): + def __init__(self, pretrained_dir, device="cpu", **kwargs): """ Parameters ---------- @@ -17,10 +18,10 @@ def __init__(self, pretrained_dir, img_dir=None, device="cpu", **kwargs): image dir """ self.device = torch.device(device) - self.model = QuesNet.from_pretrained(pretrained_dir, img_dir=img_dir).to(device) + self.model = QuesNet.from_pretrained(pretrained_dir).to(self.device) self.model.eval() - def __call__(self, items: dict, **kwargs): + def __call__(self, items: dict): """ get question embedding with quesnet Parameters @@ -33,7 +34,7 @@ def __call__(self, items: dict, **kwargs): outputs = self.model(self.model.make_batch(qs, device=self.device)) return outputs.hidden, outputs.embeded - def infer_vector(self, items: Union[dict, list]) -> torch.Tensor: + def infer_vector(self, items: Union[dict, list], **kwargs) -> torch.Tensor: """ get question embedding with quesnet Parameters @@ -43,7 +44,7 @@ def infer_vector(self, items: Union[dict, list]) -> torch.Tensor: """ return self(items)[0] - def infer_tokens(self, items: Union[dict, list]) -> torch.Tensor: + def infer_tokens(self, items: Union[dict, list], **kwargs) -> torch.Tensor: """ get token embeddings with quesnet Parameters diff --git a/EduNLP/Vector/t2v.py b/EduNLP/Vector/t2v.py index 1635eb4b..c02486d3 100644 --- a/EduNLP/Vector/t2v.py +++ b/EduNLP/Vector/t2v.py @@ -45,7 +45,7 @@ class T2V(object): Examples -------- - >>> item = [{'ques_content':'有公式$\\FormFigureID{wrong1?}$和公式$\\FormFigureBase64{wrong2?}$,\ + >>> item = [{'ques_content':'有公式$\\FormFigureID{wrong1?}$和公式$\\FormFigureBase64{wrong2?}$, \ ... 
如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,若$x,y$满足约束条件$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$'}]
     >>> model_dir = "examples/test_model/d2v"
     >>> url, model_name, *args = get_pretrained_model_info('d2v_test_256')
@@ -69,9 +69,24 @@ def __call__(self, items, *args, **kwargs):
         return self.i2v.infer_vector(items, *args, **kwargs)

     def infer_vector(self, items, *args, **kwargs):
+        """
+        get question embeddings with T2V
+        Parameters
+        ----------
+        items: list
+            a list of questions
+        Returns
+        -------
+        vector: list
+            a list of numpy.ndarray (dtype=float32)
+        """
         return self.i2v.infer_vector(items, *args, **kwargs)

     def infer_tokens(self, items, *args, **kwargs):
+        """
+        get token embeddings with T2V
+        (delegates to the underlying t2v model)
+        """
         return self.i2v.infer_tokens(items, *args, **kwargs)

     @property
@@ -80,6 +95,24 @@ def vector_size(self) -> int:


 def get_pretrained_model_info(name):
+    """
+    get the pretrained model information with the given name
+    Parameters
+    ----------
+    name: str
+        select the pretrained model
+        e.g.:
+        d2v_math_300
+        w2v_math_300
+        elmo_math_2048
+        bert_math_768
+        bert_taledu_768
+        disenq_math_256
+        quesnet_math_512
+    Returns
+    --------
+    list: [model url (where to download), model name]
+    """
     url = MODELHUB_URL + 'getPretrainedModel'
     param = {'name': name}
     r = requests.get(url, params=param)
@@ -89,6 +122,14 @@ def get_pretrained_model_info(name):


 def get_all_pretrained_models():
+    """
+    get the names of all pretrained models
+
+    Returns
+    -------
+    the pretrained models' names: list
+        e.g. ['bert_bio_ptc', 'bert_geo_ptc', 'bert_math_768', ...]
+    """
     url = MODELHUB_URL + 'getPretrainedModelList'
     r = requests.get(url)
     assert r.status_code == 200, r.status_code
@@ -96,7 +137,7 @@ def get_all_pretrained_models():
     return r['name']


-def get_pretrained_t2v(name, model_dir=MODEL_DIR, **kwargs):
+def get_pretrained_t2v(name, model_dir=MODEL_DIR, device='cpu', **kwargs):
     """
     It is a good idea if you want to switch token lists to vectors easily.

@@ -138,4 +179,4 @@ def get_pretrained_t2v(name, model_dir=MODEL_DIR, **kwargs):
     if model_name in ["d2v", "w2v"]:
         postfix = ".bin" if model_name == "d2v" else ".kv"
         model_path = path_append(model_path, os.path.basename(model_path) + postfix, to_str=True)
-    return T2V(model_name, model_path, *args, **kwargs)
+    return T2V(model_name, model_path, device=device, *args, **kwargs)
diff --git a/docs/README.md b/docs/README.md
index 8d4db8b8..33778258 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -3,26 +3,31 @@ EduNLP document and tutorial folder

 Requirements
 ------------
+See the requirements `docs_deps` in `setup.py`:
+
 ```sh
 pip install -e .[doc]
 ```

-
 Build documents
 ---------------
+First, clean up existing files:
+
 ```
 make clean
 ```

 Then build:
+
 ```
 make html
 ```

 Render locally
 --------------
+
 ```
 cd build/html
 python3 -m http.server 8000
diff --git a/docs/source/api/ModelZoo.rst b/docs/source/api/ModelZoo.rst
index 1b8ec9e4..1ee6569b 100644
--- a/docs/source/api/ModelZoo.rst
+++ b/docs/source/api/ModelZoo.rst
@@ -7,6 +7,28 @@ base_model
..
automodule:: EduNLP.ModelZoo.base_model
   :members:

+::
+
+    相关方法中的参数说明:
+
+    save_pretrained(output_dir):
+        output_dir: str
+            The path where you want to save your model
+
+    @classmethod
+    from_pretrained(pretrained_model_path, *args, **kwargs):
+        pretrained_model_path: str
+            The path where you load your checkpoint from
+
+    save_config(config_dir):
+        config_dir: str
+            The path where you want to save the config file
+
+    @classmethod
+    from_config(config_path, *args, **kwargs):
+        config_path: str
+            The path where you load the config file
+
+
 rnn
 -----------
@@ -14,6 +36,14 @@ rnn
    :members:
    :imported-members:

+::
+
+    参数补充说明:
+    @classmethod
+    from_config(config_path, **kwargs):
+        config_path: str
+            The path where you load the config file
+
+
 disenqnet
 -----------
@@ -21,6 +51,12 @@ disenqnet
    :members:
    :imported-members:

+::
+
+    参数补充说明:
+    @classmethod
+    from_config(config_path, **kwargs):
+        config_path: str
+            The path where you load the config file
+
 quesnet
 -----------
diff --git a/docs/source/api/tokenizer.rst b/docs/source/api/tokenizer.rst
index 63d27f48..e7cf330c 100644
--- a/docs/source/api/tokenizer.rst
+++ b/docs/source/api/tokenizer.rst
@@ -4,3 +4,85 @@ EduNLP.Tokenizer
 .. automodule:: EduNLP.Tokenizer
    :members:
    :imported-members:
+
+AstFormulaTokenizer参数定义
+#######################################
+
+::
+
+    Parameters
+    ----------
+    symbol : str, optional
+        Elements to symbolize before tokenization, by default "gmas"
+    figures : _type_, optional
+        Info for figures in items, by default None
+
+CharTokenizer参数定义
+#######################################
+
+::
+
+    Tokenize text char by char. eg. "题目内容" -> ["题", "目", "内", "容"]
+
+    Parameters
+    ----------
+    stop_words : str, optional
+        stop words to skip, by default "punctuations"
+
+CustomTokenizer参数定义
+#######################################
+
+::
+
+    Tokenize SIF items by customized configuration
+
+    Parameters
+    ----------
+    symbol : str, optional
+        Elements to symbolize before tokenization, by default "gmas"
+    figures : _type_, optional
+        Info for figures in items, by default None
+    kwargs: additional configuration for SIF items,
+        including text_params, formula_params, figure_params; more details could be found in `EduNLP.SIF.sif4sci`
+
+PureTextTokenizer参数定义
+#######################################
+
+::
+
+    Treat all elements in SIF item as pure text. Specially, tokenize formulas as text.
+
+    Parameters
+    ----------
+    handle_figure_formula : str, optional
+        whether to skip or symbolize special formulas ($\\FormFigureID{…}$ and $\\FormFigureBase64{…}$),
+        by default skip
+
+SpaceTokenizer参数定义
+#######################################
+
+::
+
+    Tokenize text by space. eg. "题目 内容" -> ["题目", "内容"]
+
+    Parameters
+    ----------
+    stop_words : str, optional
+        stop words to skip, by default "punctuations"
+
+EduNLP.Tokenizer.get_tokenizer参数定义
+#######################################
+
+::
+
+    Parameters
+    ----------
+    name: str
+        the name of tokenizer, e.g. text, pure_text.
+    args:
+        the parameters passed to tokenizer
+    kwargs:
+        the parameters passed to tokenizer
+    Returns
+    -------
+    tokenizer: Tokenizer
\ No newline at end of file
diff --git a/docs/source/api/vector.rst b/docs/source/api/vector.rst
index f8bb56de..b960e97a 100644
--- a/docs/source/api/vector.rst
+++ b/docs/source/api/vector.rst
@@ -10,13 +10,13 @@ EduNLP.Vector.t2v

 EduNLP.Vector.disenqnet
---------------------
+-------------------------
 .. automodule:: EduNLP.Vector.disenqnet.disenqnet
    :members:

 EduNLP.Vector.quesnet
---------------------
+-------------------------
..
automodule:: EduNLP.Vector.quesnet.quesnet :members: diff --git a/docs/source/conf.py b/docs/source/conf.py index de5bfe16..92e95a92 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -114,3 +114,5 @@ def copy_tree(src, tar): 'undoc-members': True, } autodoc_member_order = 'bysource' + +nbsphinx_allow_errors = True diff --git a/docs/source/tutorial/en/pretrain.rst b/docs/source/tutorial/en/pretrain.rst index fd07ea27..f3345514 100644 --- a/docs/source/tutorial/en/pretrain.rst +++ b/docs/source/tutorial/en/pretrain.rst @@ -45,6 +45,171 @@ The corpus dictionary is a tool introduced in pre-training to facilitate user po >>> print(res) ['An', '[UNK]', '[UNK]', 'a', '[UNK]', '[UNK]', '[UNK]'] +:: + EduVocab() related parameter descriptions and function definitions: + class EduVocab(object): + """The vocabulary container for a corpus. + + Parameters + ---------- + vocab_path : str, optional + vocabulary path to initialize this container, by default None + corpus_items : List[str], optional + corpus items to update this vocabulary, by default None + bos_token : str, optional + token representing for the start of a sentence, by default "[BOS]" + eos_token : str, optional + token representing for the end of a sentence, by default "[EOS]" + pad_token : str, optional + token representing for padding, by default "[PAD]" + unk_token : str, optional + token representing for unknown word, by default "[UNK]" + specials : List[str], optional + spacials tokens in vocabulary, by default None + lower : bool, optional + wheather to lower the corpus items, by default False + trim_min_count : int, optional + the lower bound number for adding a word into vocabulary, by default 1 + """ + def __init__(self, vocab_path: str = None, corpus_items: List[str] = None, bos_token: str = "[BOS]", + eos_token: str = "[EOS]", pad_token: str = "[PAD]", unk_token: str = "[UNK]", + specials: List[str] = None, lower: bool = False, trim_min_count: int = 1, **kwargs): + super(EduVocab, self).__init__() + + self._tokens = [] + self.idx_to_token = dict() + self.token_to_idx = dict() + self.frequencies = dict() + # 定义特殊词 + self.bos_token = bos_token + self.eos_token = eos_token + self.pad_token = pad_token + self.unk_token = unk_token + self._special_tokens = [self.pad_token, self.unk_token, self.bos_token, self.eos_token] + + if specials: + self._special_tokens += specials + for st in self._special_tokens: + self._add(st) + # 加载词典 + if vocab_path is not None: + self.load_vocab(vocab_path) + elif corpus_items is not None: + self.set_vocab(corpus_items, lower, trim_min_count) + + self.bos_idx = self.token_to_idx[self.bos_token] + self.eos_idx = self.token_to_idx[self.eos_token] + self.pad_idx = self.token_to_idx[self.pad_token] + self.unk_idx = self.token_to_idx[self.unk_token] + + def __len__(self): + return len(self._tokens) + + @property + def vocab_size(self): + return len(self._tokens) + + @property + def special_tokens(self): + return self._special_tokens + + @property + def tokens(self): + return self._tokens + + def to_idx(self, token): + """convert token to index""" + return self.token_to_idx.get(token, self.unk_idx) + + def to_token(self, idx): + """convert index to index""" + return self.idx_to_token.get(idx, self.unk_token) + + def convert_sequence_to_idx(self, tokens, bos=False, eos=False): + """convert sentence of tokens to sentence of indexs""" + res = [self.to_idx(t) for t in tokens] + if bos is True: + res = [self.bos_idx] + res + if eos is True: + res = res + [self.eos_idx] + return res + + def 
convert_sequence_to_token(self, idxs, **kwargs): + """convert sentence of indexs to sentence of tokens""" + return [self.to_token(i) for i in idxs] + + def set_vocab(self, corpus_items: List[str], lower: bool = False, trim_min_count: int = 1, silent=True): + """Update the vocabulary with the tokens in corpus items + + Parameters + ---------- + corpus_items : List[str], optional + corpus items to update this vocabulary, by default None + lower : bool, optional + wheather to lower the corpus items, by default False + trim_min_count : int, optional + the lower bound number for adding a word into vocabulary, by default 1 + """ + word2cnt = dict() + for item in corpus_items: + for word in item: + word = word.lower() if lower else word + word2cnt[word] = word2cnt.get(word, 0) + 1 + words = [w for w, c in word2cnt.items() if c >= trim_min_count and w not in self._special_tokens] + for token in words: + self._add(token) + if not silent: + keep_word_cnts = sum(word2cnt[w] for w in words) + all_word_cnts = sum(word2cnt.values()) + print(f"save words(trim_min_count={trim_min_count}): {len(words)}/{len(word2cnt)} = {len(words) / len(word2cnt):.4f}\ + with frequency {keep_word_cnts}/{all_word_cnts}={keep_word_cnts / all_word_cnts:.4f}") + + def load_vocab(self, vocab_path: str): + """Load the vocabulary from vocab_file + + Parameters + ---------- + vocab_path : str + path to save vocabulary file + """ + with open(vocab_path, "r", encoding="utf-8") as file: + self._tokens = file.read().strip().split('\n') + self.token_to_idx = {token: idx for idx, token in enumerate(self._tokens)} + self.idx_to_token = {idx: token for idx, token in enumerate(self._tokens)} + + def save_vocab(self, vocab_path: str): + """Save the vocabulary into vocab_file + + Parameters + ---------- + vocab_path : str + path to save vocabulary file + """ + with open(vocab_path, 'w', encoding='utf-8') as file: + for i in range(self.vocab_size): + token = self._tokens[i] + file.write(f"{token}\n") + + def _add(self, token: str): + if token not in self._tokens: + idx = len(self._tokens) + self._tokens.append(token) + self.idx_to_token[idx] = token + self.token_to_idx[token] = idx + + def add_specials(self, tokens: List[str]): + """Add special tokens into vocabulary""" + for token in tokens: + if token not in self._special_tokens: + self._special_tokens += [token] + self._add(token) + + def add_tokens(self, tokens: List[str]): + """Add tokens into vocabulary""" + for token in tokens: + self._add(token) + + Basic Steps @@ -67,6 +232,93 @@ Examples: # 10 dimension with fasstext method train_vector(sif_items, "../../../data/w2v/gensim_luna_stem_tf_", 10, method="d2v") +:: + Definition of train_vector(): + def train_vector(items, w2v_prefix, embedding_dim=None, method="sg", binary=None, train_params=None): + """ + + Parameters + ---------- + items:str + the text of question + w2v_prefix + embedding_dim:int + vector_size + method:str + the method of training, + e.g.: sg, cbow, fasttext, d2v, bow, tfidf + binary: model format + True:bin; + False:kv + train_params: dict + the training parameters passed to model + + Returns + ---------- + tokenizer: Tokenizer + + """ + monitor = MonitorCallback(["word", "I", "less"]) + _train_params = dict( + min_count=0, + vector_size=embedding_dim, + workers=multiprocessing.cpu_count(), + callbacks=[monitor] + ) + if method in {"sg", "cbow"}: + sg = 1 if method == "sg" else 0 + _train_params["sg"] = sg + if train_params is not None: + _train_params.update(train_params) + model = gensim.models.Word2Vec( + items, 
**_train_params + ) + binary = binary if binary is not None else False + elif method == "fasttext": + if train_params is not None: + _train_params.update(train_params) + model = gensim.models.FastText( + sentences=items, + **_train_params + ) + binary = binary if binary is not None else True + elif method == "d2v": + if train_params is not None: + _train_params.update(train_params) + docs = [TaggedDocument(doc, [i]) for i, doc in enumerate(items)] + model = gensim.models.Doc2Vec( + docs, **_train_params + ) + binary = binary if binary is not None else True + elif method == "bow": + model = gensim.corpora.Dictionary(items) + binary = binary if binary is not None else True + elif method == "tfidf": + dictionary_path = train_vector(items, w2v_prefix, method="bow") + dictionary = BowLoader(dictionary_path) + corpus = [dictionary.infer_vector(item) for item in items] + model = gensim.models.TfidfModel(corpus) + binary = binary if binary is not None else True + else: + raise ValueError("Unknown method: %s" % method) + + filepath = w2v_prefix + method + if embedding_dim is not None: + filepath = filepath + "_" + str(embedding_dim) + + if binary is True: + filepath += ".bin" + logger.info("model is saved to %s" % filepath) + model.save(filepath) + else: + if method in {"fasttext", "d2v"}: # pragma: no cover + logger.warning("binary should be True for %s, otherwise all vectors for ngrams will be lost." % method) + filepath += ".kv" + logger.info("model is saved to %s" % filepath) + model.wv.save(filepath) + return filepath + + Load models ---------------- @@ -80,6 +332,68 @@ Examples: >>> model_path = "../test_model/d2v/test_gensim_luna_stem_tf_d2v_256.bin" >>> i2v = D2V("text","d2v",filepath=model_path, pretrained_t2v = False) +:: + + Taking D2V as an example, the specific definitions are as follows: (For other interfaces, please refer to the definitions under EduNLP/I2V) + + class D2V(I2V): + """ + The model aims to transfer item to vector directly. + + Bases + ------- + I2V + + Parameters + ----------- + tokenizer: str + the tokenizer name + t2v: str + the name of token2vector model + args: + the parameters passed to t2v + tokenizer_kwargs: dict + the parameters passed to tokenizer + pretrained_t2v: bool + True: use pretrained t2v model + False: use your own t2v model + kwargs: + the parameters passed to t2v + + Returns + ------- + i2v model: I2V + """ + + def infer_vector(self, items, tokenize=True, key=lambda x: x, *args, + **kwargs) -> tuple: + """ + It is a function to switch item to vector. And before using the function, it is necessary to load model. 
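+        e.g. vector, _ = i2v.infer_vector(items)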
+
+        Parameters
+        -----------
+        items: str or list
+            the text(s) of the question(s)
+        tokenize: bool
+            True: tokenize the item
+        key: function
+            determine how to get the text of each item
+        args:
+            the parameters passed to t2v
+        kwargs:
+            the parameters passed to t2v
+
+        Returns
+        --------
+        vector: list
+        """
+        tokens = self.tokenize(items, key=key) if tokenize is True else items
+        tokens = [token for token in tokens]
+        return self.t2v(tokens, *args, **kwargs), None
+
+    @classmethod
+    def from_pretrained(cls, name, model_dir=MODEL_DIR, *args, **kwargs):
+        return cls("pure_text", name, pretrained_t2v=True, model_dir=model_dir)

Examples of Model Training
------------------------------------
diff --git a/docs/source/tutorial/en/tokenization.rst b/docs/source/tutorial/en/tokenization.rst
index e6935a92..fdac21d5 100644
--- a/docs/source/tutorial/en/tokenization.rst
+++ b/docs/source/tutorial/en/tokenization.rst
@@ -26,6 +26,32 @@ The corresponding instance is `EduNLP.SIF.tokenize`.
 >>> tokenize(seg(items), formula_params={"method": "ast"})
 ['如图所示', '三角形', , '面积', '\\\\SIFBlank', \\FigureID{1}]

+::
+
+    The definition of EduNLP.SIF.tokenize:
+
+    def tokenize(segment_list: SegmentList, text_params=None, formula_params=None, figure_params=None):
+        """
+        an actual api to tokenize item
+
+        Parameters
+        ----------
+        segment_list: list
+            segmented item
+        text_params: dict
+            the method to deal with text
+        formula_params: dict
+            the method to deal with formula
+        figure_params: dict
+            the method to deal with figure
+
+        Returns
+        ----------
+        list
+            tokenized item
+
+        """
+        return TokenList(segment_list, text_params, formula_params, figure_params)

Standard interface
^^^^^^^^^^^^^^^^^^^^^^
@@ -88,6 +114,28 @@ In addition, we provide a key parameter to select the pending content in the inc
 ['文', '具', '店', '有', '$', '600', '$', '本', '练', '习', '本', '卖', '出', '一', '些', '后', '还', '剩', '$', '4', '$', '包', '每', '包', '$', '25', '$', '本', '卖', '出', '多', '少', '本']

+::
+
+    The definition of CharTokenizer:
+    class CharTokenizer(Tokenizer):
+        def __init__(self, stop_words="punctuations", **kwargs) -> None:
+            """Tokenize text char by char. eg. "题目内容" -> ["题", "目", "内", "容"]
+
+            Parameters
+            ----------
+            stop_words : str, optional
+                stop words to skip, by default "punctuations"
+            """
+            stop_words = set("\n\r\t .,;?\"\'。.,、;?“”‘’()") if stop_words == "punctuations" else stop_words
+            self.stop_words = stop_words if stop_words is not None else set()
+
+        def __call__(self, items: Iterable, key=lambda x: x, **kwargs):
+            for item in items:
+                yield self._tokenize(item, key=key, **kwargs)
+
+        def _tokenize(self, item: Union[str, dict], key=lambda x: x, **kwargs):
+            tokens = tokenize_text(key(item).strip(), granularity="char", stopwords=self.stop_words)
+            return tokens

SpaceTokenizer
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -109,6 +157,30 @@ In addition, we provide a key parameter to select the pending content in the inc
 ['已知集合$A=\\left\\{x', '\\mid', 'x^{2}-3', 'x-4<0\\right\\},', '\\quad', 'B=\\{-4,1,3,5\\},', '\\quad$', '则', '$A', '\\cap', 'B=$']

+::
+
+    The definition of SpaceTokenizer:
+    class SpaceTokenizer(Tokenizer):
+        """Tokenize text by space. eg. "题目 内容" -> ["题目", "内容"]
"题目 内容" -> ["题目", "内容"] + + Parameters + ---------- + stop_words : str, optional + stop_words to skip, by default "default" + """ + def __init__(self, stop_words="punctuations", **kwargs) -> None: + stop_words = set("\n\r\t .,;?\"\'。.,、;?“”‘’()") if stop_words == "punctuations" else stop_words + self.stop_words = stop_words if stop_words is not None else set() + + def __call__(self, items: Iterable, key=lambda x: x, **kwargs): + for item in items: + yield self._tokenize(item, key=key, **kwargs) + + def _tokenize(self, item: Union[str, dict], key=lambda x: x, **kwargs): + tokens = key(item).strip().split(' ') + if self.stop_words: + tokens = [w for w in tokens if w != '' and w not in self.stop_words] + return tokens CustomTokenizer ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -136,7 +208,55 @@ In addition, we provide a key parameter to select the pending content in the inc >>> print(next(tokens)) ['已知', '集合', '[FORMULA]', '[FORMULA]'] - +:: + + The definition of CustomTokenizer: + class CustomTokenizer(Tokenizer): + def __init__(self, symbol="gmas", figures=None, **kwargs): + """Tokenize SIF items by customized configuration + + Parameters + ---------- + symbol : str, optional + Elements to symbolize before tokenization, by default "gmas" + figures : _type_, optional + Info for figures in items, by default None + kwargs: addtional configuration for SIF items + including text_params, formula_params, figure_params, more details could be found in `EduNLP.SIF.sif4sci` + """ + self.tokenization_params = { + "text_params": kwargs.get("text_params", None), + "formula_params": kwargs.get("formula_params", None), + "figure_params": kwargs.get("figure_params", None) + } + self.symbol = symbol + self.figures = figures + + def __call__(self, items: Iterable, key=lambda x: x, **kwargs): + """Tokenize items, return iterator genetator + + Parameters + ---------- + item : Iterable + question items + key : function, optional + determine how to get the text of items, by default lambdax: x + """ + for item in items: + yield self._tokenize(item, key=key, **kwargs) + + def _tokenize(self, item: Union[str, dict], key=lambda x: x, **kwargs): + """Tokenize one item, return token list + + Parameters + ---------- + item : Union[str, dict] + question item + key : function, optional + determine how to get the text of item, by default lambdax: x + """ + return tokenize(seg(key(item), symbol=self.symbol, figures=self.figures), + **self.tokenization_params, **kwargs).tokens PureTextTokenizer @@ -157,6 +277,55 @@ In addition, we provide a key parameter to select the pending content in the inc >>> print(next(tokens)) ['公式', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[SEP]', 'z', '=', 'x', '+', '7', 'y', '最大值', '[MARK]'] +:: + + The definition of PureTextTokenizer: + class PureTextTokenizer(Tokenizer): + def __init__(self, handle_figure_formula="skip", **kwargs): + """ + Treat all elements in SIF item as prue text. Spectially, tokenize formulas as text. 
+ + Parameters + ---------- + handle_figure_formula : str, optional + whether to skip or symbolize special formulas( $\\FormFigureID{…}$ and $\\FormFigureBase64{…}), + by default skip + + """ + # Formula images are skipped by default + if handle_figure_formula == "skip": + skip_figure_formula = True + symbolize_figure_formula = False + elif handle_figure_formula == "symbolize": + skip_figure_formula = False + symbolize_figure_formula = True + elif handle_figure_formula is None: + skip_figure_formula, symbolize_figure_formula = False, False + else: + raise ValueError('handle_figure_formula should be one in ["skip", "symbolize", None]') + formula_params = { + "method": "linear", + "skip_figure_formula": skip_figure_formula, + "symbolize_figure_formula": symbolize_figure_formula + } + text_params = { + "granularity": "word", + "stopwords": "default", + } + formula_params.update(kwargs.pop("formula_params", {})) + text_params.update(kwargs.pop("text_params", {})) + self.tokenization_params = { + "formula_params": formula_params, + "text_params": text_params, + "figure_params": kwargs.get("figure_params", None) + } + + def __call__(self, items: Iterable, key=lambda x: x, **kwargs): + for item in items: + yield self._tokenize(item, key=key, **kwargs) + + def _tokenize(self, item: Union[str, dict], key=lambda x: x, **kwargs): + return tokenize(seg(key(item), symbol="gmas"), **self.tokenization_params, **kwargs).tokens AstFormulaTokenizer @@ -176,7 +345,53 @@ In addition, we provide a key parameter to select the pending content in the inc ['公式', '[FORMULA]', '如图', '[FIGURE]', 'mathord_0', ',', 'mathord_1', '约束条件', '公式', '[FORMULA]', '[SEP]', 'mathord_2', '=', 'mathord_0', '+', 'textord', 'mathord_1', '最大值', '[MARK]'] - +:: + The definition of AstFormulaTokenizer: + class AstFormulaTokenizer(Tokenizer): + def __init__(self, symbol="gmas", figures=None, **kwargs): + """Tokenize formulas in SIF items by AST parser. 
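+            e.g. "$x+y$" -> ["mathord_0", "+", "mathord_1"]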
+ + Parameters + ---------- + symbol : str, optional + Elements to symbolize before tokenization, by default "gmas" + figures : _type_, optional + Info for figures in items, by default None + """ + formula_params = { + "method": "ast", + + "ord2token": True, + "return_type": "list", + "var_numbering": True, + + "skip_figure_formula": False, + "symbolize_figure_formula": True + } + text_params = { + "granularity": "word", + "stopwords": "default", + } + formula_params.update(kwargs.pop("formula_params", {})) + text_params.update(kwargs.pop("text_params", {})) + self.tokenization_params = { + "formula_params": formula_params, + "text_params": text_params, + "figure_params": kwargs.pop("figure_params", None), + } + self.symbol = symbol + self.figures = figures + + def __call__(self, items: Iterable, key=lambda x: x, **kwargs): + for item in items: + yield self._tokenize(item, key=key, **kwargs) + + def _tokenize(self, item: Union[str, dict], key=lambda x: x, **kwargs): + mode = kwargs.pop("mode", 0) + ret = sif4sci(key(item), figures=self.figures, symbol=self.symbol, mode=mode, + tokenization_params=self.tokenization_params, errors="ignore", **kwargs) + ret = [] if ret is None else ret.tokens + return ret GensimWordTokenizer ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -201,7 +416,59 @@ By default, the pictures, blanks in the question text and other parts of the inc >>> print(token_item.tokens) ['已知', '公式', '[FORMULA]', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[FORMULA]', '[SEP]', 'z', '=', 'x', '+', '7', 'y', '最大值', '[MARK]'] - +:: + + The definition of GensimWordTokenizer: + class GensimWordTokenizer(object): + """ + + Parameters + ---------- + symbol: str + select the methods to symbolize: + "t": text, + "f": formula, + "g": figure, + "m": question mark, + "a": tag, + "s": sep, + e.g.: gm, fgm, gmas, fgmas + general: bool + + True: when item isn't in standard format, and want to tokenize formulas(except formulas in figure) linearly. + + False: when use 'ast' mothed to tokenize formulas instead of 'linear'. + + Returns + ---------- + tokenizer: Tokenizer + + """ + def __init__(self, symbol="gm", general=False): + self.symbol = symbol + if general is True: + self.tokenization_params = { + "formula_params": { + "method": "linear", + "symbolize_figure_formula": True + } + } + else: + self.tokenization_params = { + "formula_params": { + "method": "ast", + "return_type": "list", + "ord2token": True + } + } + + def batch_process(self, *items): + pass + + def __call__(self, item): + return sif4sci( + item, symbol=self.symbol, tokenization_params=self.tokenization_params, errors="ignore" + ) GensimSegTokenizer @@ -236,6 +503,71 @@ Select segmentation level: print(len(token_item), token_item) # 2 [['[TEXT_BEGIN]', '已知', '公式', '[FORMULA_BEGIN]', \FormFigureID{1}, '[TEXT_BEGIN]', '如图', '[FIGURE]', '[FORMULA_BEGIN]', 'mathord', ',', 'mathord', '[TEXT_BEGIN]', '约束条件', '公式', '[FORMULA_BEGIN]', [FORMULA], '[SEP]'], ['[FORMULA_BEGIN]', 'mathord', '=', 'mathord', '+', 'textord', 'mathord', '[TEXT_BEGIN]', '最大值', '[MARK]']] +:: + + The definition of GensimSegTokenizer: + class GensimSegTokenizer(object): # pragma: no cover + """ + + Parameters + ---------- + symbol:str + select the methods to symbolize: + "t": text, + "f": formula, + "g": figure, + "m": question mark, + "a": tag, + "s": sep, + e.g. 
gms, fgm + + depth: int or None + + 0: only separate at \\SIFSep ; + 1: only separate at \\SIFTag ; + 2: separate at \\SIFTag and \\SIFSep ; + otherwise, separate all segments ; + + Returns + ---------- + tokenizer: Tokenizer + + """ + def __init__(self, symbol="gms", depth=None, flatten=False, **kwargs): + self.symbol = symbol + self.tokenization_params = { + "formula_params": { + "method": "ast", + "return_type": "list", + "ord2token": True + } + } + self.kwargs = dict( + add_seg_type=True if depth in {0, 1, 2} else False, + add_seg_mode="head", + depth=depth, + drop="s" if depth not in {0, 1, 2} else "" + ) + self.kwargs.update(kwargs) + self.flatten = flatten + + def __call__(self, item, flatten=None, **kwargs): + flatten = self.flatten if flatten is None else flatten + tl = sif4sci( + item, symbol=self.symbol, tokenization_params=self.tokenization_params, errors="ignore" + ) + if kwargs: + _kwargs = deepcopy(self.kwargs) + _kwargs.update(kwargs) + else: + _kwargs = self.kwargs + if tl: + ret = tl.get_segments(**_kwargs) + if flatten is True: + return it.chain(*ret) + return ret + return tl + More examples ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/docs/source/tutorial/zh/pipeline.rst b/docs/source/tutorial/zh/pipeline.rst index 6dca611e..31253e11 100644 --- a/docs/source/tutorial/zh/pipeline.rst +++ b/docs/source/tutorial/zh/pipeline.rst @@ -2,9 +2,7 @@ 流水线 ======= -.. nbgallery:: - :caption: This is a thumbnail gallery: - :name: pipleine_gallery - :glob: +.. nbinfo:: + notebook: - 流水线 <../../build/blitz/pipeline/pipeline.ipynb> + `流水线 <../../build/blitz/pipeline/pipeline.ipynb>`_ diff --git a/docs/source/tutorial/zh/pretrain.rst b/docs/source/tutorial/zh/pretrain.rst index 82092160..a4697133 100644 --- a/docs/source/tutorial/zh/pretrain.rst +++ b/docs/source/tutorial/zh/pretrain.rst @@ -42,6 +42,171 @@ >>> print(res) ['An', '[UNK]', '[UNK]', 'a', '[UNK]', '[UNK]', '[UNK]'] +:: + + EduVocab相关参数说明和函数定义: + class EduVocab(object): + """The vocabulary container for a corpus. 
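+        e.g. vocab = EduVocab(corpus_items=corpus_items); idxs = vocab.convert_sequence_to_idx(tokens)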
+ + Parameters + ---------- + vocab_path : str, optional + vocabulary path to initialize this container, by default None + corpus_items : List[str], optional + corpus items to update this vocabulary, by default None + bos_token : str, optional + token representing for the start of a sentence, by default "[BOS]" + eos_token : str, optional + token representing for the end of a sentence, by default "[EOS]" + pad_token : str, optional + token representing for padding, by default "[PAD]" + unk_token : str, optional + token representing for unknown word, by default "[UNK]" + specials : List[str], optional + spacials tokens in vocabulary, by default None + lower : bool, optional + wheather to lower the corpus items, by default False + trim_min_count : int, optional + the lower bound number for adding a word into vocabulary, by default 1 + """ + def __init__(self, vocab_path: str = None, corpus_items: List[str] = None, bos_token: str = "[BOS]", + eos_token: str = "[EOS]", pad_token: str = "[PAD]", unk_token: str = "[UNK]", + specials: List[str] = None, lower: bool = False, trim_min_count: int = 1, **kwargs): + super(EduVocab, self).__init__() + + self._tokens = [] + self.idx_to_token = dict() + self.token_to_idx = dict() + self.frequencies = dict() + # 定义特殊词 + self.bos_token = bos_token + self.eos_token = eos_token + self.pad_token = pad_token + self.unk_token = unk_token + self._special_tokens = [self.pad_token, self.unk_token, self.bos_token, self.eos_token] + + if specials: + self._special_tokens += specials + for st in self._special_tokens: + self._add(st) + # 加载词典 + if vocab_path is not None: + self.load_vocab(vocab_path) + elif corpus_items is not None: + self.set_vocab(corpus_items, lower, trim_min_count) + + self.bos_idx = self.token_to_idx[self.bos_token] + self.eos_idx = self.token_to_idx[self.eos_token] + self.pad_idx = self.token_to_idx[self.pad_token] + self.unk_idx = self.token_to_idx[self.unk_token] + + def __len__(self): + return len(self._tokens) + + @property + def vocab_size(self): + return len(self._tokens) + + @property + def special_tokens(self): + return self._special_tokens + + @property + def tokens(self): + return self._tokens + + def to_idx(self, token): + """convert token to index""" + return self.token_to_idx.get(token, self.unk_idx) + + def to_token(self, idx): + """convert index to index""" + return self.idx_to_token.get(idx, self.unk_token) + + def convert_sequence_to_idx(self, tokens, bos=False, eos=False): + """convert sentence of tokens to sentence of indexs""" + res = [self.to_idx(t) for t in tokens] + if bos is True: + res = [self.bos_idx] + res + if eos is True: + res = res + [self.eos_idx] + return res + + def convert_sequence_to_token(self, idxs, **kwargs): + """convert sentence of indexs to sentence of tokens""" + return [self.to_token(i) for i in idxs] + + def set_vocab(self, corpus_items: List[str], lower: bool = False, trim_min_count: int = 1, silent=True): + """Update the vocabulary with the tokens in corpus items + + Parameters + ---------- + corpus_items : List[str], optional + corpus items to update this vocabulary, by default None + lower : bool, optional + wheather to lower the corpus items, by default False + trim_min_count : int, optional + the lower bound number for adding a word into vocabulary, by default 1 + """ + word2cnt = dict() + for item in corpus_items: + for word in item: + word = word.lower() if lower else word + word2cnt[word] = word2cnt.get(word, 0) + 1 + words = [w for w, c in word2cnt.items() if c >= trim_min_count and w not in 
self._special_tokens] + for token in words: + self._add(token) + if not silent: + keep_word_cnts = sum(word2cnt[w] for w in words) + all_word_cnts = sum(word2cnt.values()) + print(f"save words(trim_min_count={trim_min_count}): {len(words)}/{len(word2cnt)} = {len(words) / len(word2cnt):.4f}\ + with frequency {keep_word_cnts}/{all_word_cnts}={keep_word_cnts / all_word_cnts:.4f}") + + def load_vocab(self, vocab_path: str): + """Load the vocabulary from vocab_file + + Parameters + ---------- + vocab_path : str + path to save vocabulary file + """ + with open(vocab_path, "r", encoding="utf-8") as file: + self._tokens = file.read().strip().split('\n') + self.token_to_idx = {token: idx for idx, token in enumerate(self._tokens)} + self.idx_to_token = {idx: token for idx, token in enumerate(self._tokens)} + + def save_vocab(self, vocab_path: str): + """Save the vocabulary into vocab_file + + Parameters + ---------- + vocab_path : str + path to save vocabulary file + """ + with open(vocab_path, 'w', encoding='utf-8') as file: + for i in range(self.vocab_size): + token = self._tokens[i] + file.write(f"{token}\n") + + def _add(self, token: str): + if token not in self._tokens: + idx = len(self._tokens) + self._tokens.append(token) + self.idx_to_token[idx] = token + self.token_to_idx[token] = idx + + def add_specials(self, tokens: List[str]): + """Add special tokens into vocabulary""" + for token in tokens: + if token not in self._special_tokens: + self._special_tokens += [token] + self._add(token) + + def add_tokens(self, tokens: List[str]): + """Add tokens into vocabulary""" + for token in tokens: + self._add(token) + 预训练令牌化容器 >>>>>>>>>>>>>>>>>>>>>>>> @@ -104,6 +269,94 @@ Examples: # 10 dimension with fasstext method train_vector(sif_items, "../../../data/w2v/gensim_luna_stem_tf_", 10, method="d2v") +:: + + PureTextTokenizer在“令牌化”部分已有详细说明 + train_vector定义如下: + def train_vector(items, w2v_prefix, embedding_dim=None, method="sg", binary=None, train_params=None): + """ + + Parameters + ---------- + items:str + the text of question + w2v_prefix + embedding_dim:int + vector_size + method:str + the method of training, + e.g.: sg, cbow, fasttext, d2v, bow, tfidf + binary: model format + True:bin; + False:kv + train_params: dict + the training parameters passed to model + + Returns + ---------- + tokenizer: Tokenizer + + """ + monitor = MonitorCallback(["word", "I", "less"]) + _train_params = dict( + min_count=0, + vector_size=embedding_dim, + workers=multiprocessing.cpu_count(), + callbacks=[monitor] + ) + if method in {"sg", "cbow"}: + sg = 1 if method == "sg" else 0 + _train_params["sg"] = sg + if train_params is not None: + _train_params.update(train_params) + model = gensim.models.Word2Vec( + items, **_train_params + ) + binary = binary if binary is not None else False + elif method == "fasttext": + if train_params is not None: + _train_params.update(train_params) + model = gensim.models.FastText( + sentences=items, + **_train_params + ) + binary = binary if binary is not None else True + elif method == "d2v": + if train_params is not None: + _train_params.update(train_params) + docs = [TaggedDocument(doc, [i]) for i, doc in enumerate(items)] + model = gensim.models.Doc2Vec( + docs, **_train_params + ) + binary = binary if binary is not None else True + elif method == "bow": + model = gensim.corpora.Dictionary(items) + binary = binary if binary is not None else True + elif method == "tfidf": + dictionary_path = train_vector(items, w2v_prefix, method="bow") + dictionary = BowLoader(dictionary_path) + 
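# 先用词袋(BOW)词典将语料转换为词袋向量,再在其上训练 TF-IDF 模型 +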
corpus = [dictionary.infer_vector(item) for item in items] + model = gensim.models.TfidfModel(corpus) + binary = binary if binary is not None else True + else: + raise ValueError("Unknown method: %s" % method) + + filepath = w2v_prefix + method + if embedding_dim is not None: + filepath = filepath + "_" + str(embedding_dim) + + if binary is True: + filepath += ".bin" + logger.info("model is saved to %s" % filepath) + model.save(filepath) + else: + if method in {"fasttext", "d2v"}: # pragma: no cover + logger.warning("binary should be True for %s, otherwise all vectors for ngrams will be lost." % method) + filepath += ".kv" + logger.info("model is saved to %s" % filepath) + model.wv.save(filepath) + return filepath + 加载预训练模型 >>>>>>>>>>>>>>>>>>>>>>>> @@ -127,6 +380,69 @@ Examples: i2v = D2V("text", "d2v", filepath=model_path, pretrained_t2v=False) +:: + + 以D2V为例,具体定义如下:(其余接口可参考EduNLP/I2V下的各个定义) + class D2V(I2V): + """ + The model aims to transfer item to vector directly. + + Bases + ------- + I2V + + Parameters + ----------- + tokenizer: str + the tokenizer name + t2v: str + the name of token2vector model + args: + the parameters passed to t2v + tokenizer_kwargs: dict + the parameters passed to tokenizer + pretrained_t2v: bool + True: use pretrained t2v model + False: use your own t2v model + kwargs: + the parameters passed to t2v + + Returns + ------- + i2v model: I2V + """ + + def infer_vector(self, items, tokenize=True, key=lambda x: x, *args, + **kwargs) -> tuple: + """ + It is a function to switch item to vector. And before using the function, it is necessary to load model. + + Parameters + ----------- + items:str + the text of question + tokenize: bool + True: tokenize the item + key: function + determine how to get the text of each item + args: + the parameters passed to t2v + kwargs: + the parameters passed to t2v + + Returns + -------- + vector:list + """ + tokens = self.tokenize(items, key=key) if tokenize is True else items + tokens = [token for token in tokens] + return self.t2v(tokens, *args, **kwargs), None + + @classmethod + def from_pretrained(cls, name, model_dir=MODEL_DIR, *args, **kwargs): + return cls("pure_text", name, pretrained_t2v=True, model_dir=model_dir) + + 更多模型训练案例 ----------------------- diff --git a/docs/source/tutorial/zh/tokenization.rst b/docs/source/tutorial/zh/tokenization.rst index dcc84c4e..93d154ce 100644 --- a/docs/source/tutorial/zh/tokenization.rst +++ b/docs/source/tutorial/zh/tokenization.rst @@ -30,6 +30,31 @@ >>> tokenize(seg(items), formula_params={"method": "ast"}) ['如图所示', '三角形', , '面积', '\\\\SIFBlank', \\FigureID{1}] +:: + EduNLP.SIF.tokenize的函数形式及参数定义如下: + def tokenize(segment_list: SegmentList, text_params=None, formula_params=None, figure_params=None): + """ + an actual api to tokenize item + + Parameters + ---------- + segment_list:list + segmented item + text_params:dict + the method to duel with text + formula_params:dict + the method to duel with formula + figure_params:dict + the method to duel with figure + + Returns + ---------- + list + tokenized item + + """ + return TokenList(segment_list, text_params, formula_params, figure_params) + 标准接口 ^^^^^^^^^^^^^^^^^^^^^^ @@ -97,6 +122,28 @@ CharTokenizer ['文', '具', '店', '有', '$', '600', '$', '本', '练', '习', '本', '卖', '出', '一', '些', '后', '还', '剩', '$', '4', '$', '包', '每', '包', '$', '25', '$', '本', '卖', '出', '多', '少', '本'] +:: + + CharTokenizer定义如下: + class CharTokenizer(Tokenizer): + def __init__(self, stop_words="punctuations", **kwargs) -> None: + """Tokenize text char by char. eg. 
"题目内容" -> ["题", "目", "内", 容"] + + Parameters + ---------- + stop_words : str, optional + stop_words to skip, by default "default" + """ + self.stop_words = set("\n\r\t .,;?\"\'。.,、;?“”‘’()") if stop_words == "punctuations" else stop_words + self.stop_words = stop_words if stop_words is not None else set() + + def __call__(self, items: Iterable, key=lambda x: x, **kwargs): + for item in items: + yield self._tokenize(item, key=key, **kwargs) + + def _tokenize(self, item: Union[str, dict], key=lambda x: x, **kwargs): + tokens = tokenize_text(key(item).strip(), granularity="char", stopwords=self.stop_words) + return tokens SpaceTokenizer ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -118,6 +165,30 @@ SpaceTokenizer ['已知集合$A=\\left\\{x', '\\mid', 'x^{2}-3', 'x-4<0\\right\\},', '\\quad', 'B=\\{-4,1,3,5\\},', '\\quad$', '则', '$A', '\\cap', 'B=$'] +:: + + SpaceTokenizer定义如下: + class SpaceTokenizer(Tokenizer): + """Tokenize text by space. eg. "题目 内容" -> ["题目", "内容"] + + Parameters + ---------- + stop_words : str, optional + stop_words to skip, by default "default" + """ + def __init__(self, stop_words="punctuations", **kwargs) -> None: + stop_words = set("\n\r\t .,;?\"\'。.,、;?“”‘’()") if stop_words == "punctuations" else stop_words + self.stop_words = stop_words if stop_words is not None else set() + + def __call__(self, items: Iterable, key=lambda x: x, **kwargs): + for item in items: + yield self._tokenize(item, key=key, **kwargs) + + def _tokenize(self, item: Union[str, dict], key=lambda x: x, **kwargs): + tokens = key(item).strip().split(' ') + if self.stop_words: + tokens = [w for w in tokens if w != '' and w not in self.stop_words] + return tokens CustomTokenizer ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -145,6 +216,55 @@ CustomTokenizer >>> print(next(tokens)) ['已知', '集合', '[FORMULA]', '[FORMULA]'] +:: + + CustomTokenizer定义如下: + class CustomTokenizer(Tokenizer): + def __init__(self, symbol="gmas", figures=None, **kwargs): + """Tokenize SIF items by customized configuration + + Parameters + ---------- + symbol : str, optional + Elements to symbolize before tokenization, by default "gmas" + figures : _type_, optional + Info for figures in items, by default None + kwargs: addtional configuration for SIF items + including text_params, formula_params, figure_params, more details could be found in `EduNLP.SIF.sif4sci` + """ + self.tokenization_params = { + "text_params": kwargs.get("text_params", None), + "formula_params": kwargs.get("formula_params", None), + "figure_params": kwargs.get("figure_params", None) + } + self.symbol = symbol + self.figures = figures + + def __call__(self, items: Iterable, key=lambda x: x, **kwargs): + """Tokenize items, return iterator genetator + + Parameters + ---------- + item : Iterable + question items + key : function, optional + determine how to get the text of items, by default lambdax: x + """ + for item in items: + yield self._tokenize(item, key=key, **kwargs) + + def _tokenize(self, item: Union[str, dict], key=lambda x: x, **kwargs): + """Tokenize one item, return token list + + Parameters + ---------- + item : Union[str, dict] + question item + key : function, optional + determine how to get the text of item, by default lambdax: x + """ + return tokenize(seg(key(item), symbol=self.symbol, figures=self.figures), + **self.tokenization_params, **kwargs).tokens @@ -166,6 +286,55 @@ PureTextTokenizer >>> print(next(tokens)) ['公式', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[SEP]', 'z', '=', 'x', '+', '7', 'y', '最大值', '[MARK]'] +:: + + PureTextTokenizer定义如下: + class 
PureTextTokenizer(Tokenizer): + def __init__(self, handle_figure_formula="skip", **kwargs): + """ + Treat all elements in SIF item as prue text. Spectially, tokenize formulas as text. + + Parameters + ---------- + handle_figure_formula : str, optional + whether to skip or symbolize special formulas( $\\FormFigureID{…}$ and $\\FormFigureBase64{…}), + by default skip + + """ + # Formula images are skipped by default + if handle_figure_formula == "skip": + skip_figure_formula = True + symbolize_figure_formula = False + elif handle_figure_formula == "symbolize": + skip_figure_formula = False + symbolize_figure_formula = True + elif handle_figure_formula is None: + skip_figure_formula, symbolize_figure_formula = False, False + else: + raise ValueError('handle_figure_formula should be one in ["skip", "symbolize", None]') + formula_params = { + "method": "linear", + "skip_figure_formula": skip_figure_formula, + "symbolize_figure_formula": symbolize_figure_formula + } + text_params = { + "granularity": "word", + "stopwords": "default", + } + formula_params.update(kwargs.pop("formula_params", {})) + text_params.update(kwargs.pop("text_params", {})) + self.tokenization_params = { + "formula_params": formula_params, + "text_params": text_params, + "figure_params": kwargs.get("figure_params", None) + } + + def __call__(self, items: Iterable, key=lambda x: x, **kwargs): + for item in items: + yield self._tokenize(item, key=key, **kwargs) + + def _tokenize(self, item: Union[str, dict], key=lambda x: x, **kwargs): + return tokenize(seg(key(item), symbol="gmas"), **self.tokenization_params, **kwargs).tokens AstFormulaTokenizer @@ -185,6 +354,54 @@ AstFormulaTokenizer ['公式', '[FORMULA]', '如图', '[FIGURE]', 'mathord_0', ',', 'mathord_1', '约束条件', '公式', '[FORMULA]', '[SEP]', 'mathord_2', '=', 'mathord_0', '+', 'textord', 'mathord_1', '最大值', '[MARK]'] +:: + AstFormulaTokenizer定义如下: + class AstFormulaTokenizer(Tokenizer): + def __init__(self, symbol="gmas", figures=None, **kwargs): + """Tokenize formulas in SIF items by AST parser. 
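+            e.g. "$x+y$" -> ["mathord_0", "+", "mathord_1"]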
+ + Parameters + ---------- + symbol : str, optional + Elements to symbolize before tokenization, by default "gmas" + figures : _type_, optional + Info for figures in items, by default None + """ + formula_params = { + "method": "ast", + + "ord2token": True, + "return_type": "list", + "var_numbering": True, + + "skip_figure_formula": False, + "symbolize_figure_formula": True + } + text_params = { + "granularity": "word", + "stopwords": "default", + } + formula_params.update(kwargs.pop("formula_params", {})) + text_params.update(kwargs.pop("text_params", {})) + self.tokenization_params = { + "formula_params": formula_params, + "text_params": text_params, + "figure_params": kwargs.pop("figure_params", None), + } + self.symbol = symbol + self.figures = figures + + def __call__(self, items: Iterable, key=lambda x: x, **kwargs): + for item in items: + yield self._tokenize(item, key=key, **kwargs) + + def _tokenize(self, item: Union[str, dict], key=lambda x: x, **kwargs): + mode = kwargs.pop("mode", 0) + ret = sif4sci(key(item), figures=self.figures, symbol=self.symbol, mode=mode, + tokenization_params=self.tokenization_params, errors="ignore", **kwargs) + ret = [] if ret is None else ret.tokens + return ret + @@ -213,6 +430,59 @@ GensimWordTokenizer >>> print(token_item.tokens) ['已知', '公式', '[FORMULA]', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[FORMULA]', '[SEP]', 'z', '=', 'x', '+', '7', 'y', '最大值', '[MARK]'] +:: + + GensimWordTokenizer定义如下: + class GensimWordTokenizer(object): + """ + + Parameters + ---------- + symbol: str + select the methods to symbolize: + "t": text, + "f": formula, + "g": figure, + "m": question mark, + "a": tag, + "s": sep, + e.g.: gm, fgm, gmas, fgmas + general: bool + + True: when item isn't in standard format, and want to tokenize formulas(except formulas in figure) linearly. + + False: when use 'ast' mothed to tokenize formulas instead of 'linear'. + + Returns + ---------- + tokenizer: Tokenizer + + """ + def __init__(self, symbol="gm", general=False): + self.symbol = symbol + if general is True: + self.tokenization_params = { + "formula_params": { + "method": "linear", + "symbolize_figure_formula": True + } + } + else: + self.tokenization_params = { + "formula_params": { + "method": "ast", + "return_type": "list", + "ord2token": True + } + } + + def batch_process(self, *items): + pass + + def __call__(self, item): + return sif4sci( + item, symbol=self.symbol, tokenization_params=self.tokenization_params, errors="ignore" + ) @@ -245,6 +515,71 @@ GensimSegTokenizer print(len(token_item), token_item) # 2 [['[TEXT_BEGIN]', '已知', '公式', '[FORMULA_BEGIN]', \FormFigureID{1}, '[TEXT_BEGIN]', '如图', '[FIGURE]', '[FORMULA_BEGIN]', 'mathord', ',', 'mathord', '[TEXT_BEGIN]', '约束条件', '公式', '[FORMULA_BEGIN]', [FORMULA], '[SEP]'], ['[FORMULA_BEGIN]', 'mathord', '=', 'mathord', '+', 'textord', 'mathord', '[TEXT_BEGIN]', '最大值', '[MARK]']] +:: + + GensimSegTokenizer定义如下: + class GensimSegTokenizer(object): # pragma: no cover + """ + + Parameters + ---------- + symbol:str + select the methods to symbolize: + "t": text, + "f": formula, + "g": figure, + "m": question mark, + "a": tag, + "s": sep, + e.g. 
gms, fgm + + depth: int or None + + 0: only separate at \\SIFSep ; + 1: only separate at \\SIFTag ; + 2: separate at \\SIFTag and \\SIFSep ; + otherwise, separate all segments ; + + Returns + ---------- + tokenizer: Tokenizer + + """ + def __init__(self, symbol="gms", depth=None, flatten=False, **kwargs): + self.symbol = symbol + self.tokenization_params = { + "formula_params": { + "method": "ast", + "return_type": "list", + "ord2token": True + } + } + self.kwargs = dict( + add_seg_type=True if depth in {0, 1, 2} else False, + add_seg_mode="head", + depth=depth, + drop="s" if depth not in {0, 1, 2} else "" + ) + self.kwargs.update(kwargs) + self.flatten = flatten + + def __call__(self, item, flatten=None, **kwargs): + flatten = self.flatten if flatten is None else flatten + tl = sif4sci( + item, symbol=self.symbol, tokenization_params=self.tokenization_params, errors="ignore" + ) + if kwargs: + _kwargs = deepcopy(self.kwargs) + _kwargs.update(kwargs) + else: + _kwargs = self.kwargs + if tl: + ret = tl.get_segments(**_kwargs) + if flatten is True: + return it.chain(*ret) + return ret + return tl + 更多示例 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/docs/source/tutorial/zh/vectorization.rst b/docs/source/tutorial/zh/vectorization.rst index 9349b91e..1a597e73 100644 --- a/docs/source/tutorial/zh/vectorization.rst +++ b/docs/source/tutorial/zh/vectorization.rst @@ -133,29 +133,18 @@ I2V 向量化容器 更多I2V容器使用示例 ------------------------------------ - -.. nbgallery:: - :caption: This is a thumbnail gallery: - :name: i2v_gallery1 - :glob: - - W2V向量化 <../../build/blitz/i2v/i2v_w2v.ipynb> - - D2V向量化 <../../build/blitz/i2v/i2v_d2v.ipynb> - Elmo向量化 <../../build/blitz/i2v/i2v_elmo.ipynb> +`W2V向量化 <../../build/blitz/i2v/i2v_w2v.ipynb>`_ +`D2V向量化 <../../build/blitz/i2v/i2v_d2v.ipynb>`_ -.. nbgallery:: - :caption: This is a thumbnail gallery: - :name: i2v_gallery2 - :glob: - - Bert向量化 <../../build/blitz/i2v/i2v_bert.ipynb> - - DisenQNet向量化 <../../build/blitz/i2v/i2v_disenq.ipynb> - - QuesNet向量化 <../../build/blitz/i2v/i2v_quesnet.ipynb> +`Elmo向量化 <../../build/blitz/i2v/i2v_elmo.ipynb>`_ + +`Bert向量化 <../../build/blitz/i2v/i2v_bert.ipynb>`_ + +`DisenQNet向量化 <../../build/blitz/i2v/i2v_disenq.ipynb>`_ + +`QuesNet向量化 <../../build/blitz/i2v/i2v_quesnet.ipynb>`_ @@ -246,11 +235,7 @@ T2V 向量化容器 # 或 # t2v = W2V(path) -<<<<<<< HEAD item_vector = t2v.infer_vector(token_items) -======= - tem_vector = t2v.infer_vector(token_items) ->>>>>>> upstream/dev # [array(), ..., array()] token_vector = t2v.infer_tokens(token_items) # [[array(), ..., array()], [...], [...]] @@ -264,25 +249,15 @@ T2V 向量化容器 更多T2V容器使用示例 ------------------------------------ -.. nbgallery:: - :caption: This is a thumbnail gallery: - :name: t2v_gallery1 - :glob: - - W2V向量化 <../../build/blitz/t2v/t2v_w2v.ipynb> - D2V向量化 <../../build/blitz/t2v/t2v_d2v.ipynb> +`W2V向量化 <../../build/blitz/t2v/t2v_w2v.ipynb>`_ - Elmo向量化 <../../build/blitz/t2v/t2v_elmo.ipynb> +`D2V向量化 <../../build/blitz/t2v/t2v_d2v.ipynb>`_ +`Elmo向量化 <../../build/blitz/t2v/t2v_elmo.ipynb>`_ -.. 
nbgallery:: - :caption: This is a thumbnail gallery: - :name: t2v_gallery2 - :glob: - - Bert向量化 <../../build/blitz/t2v/t2v_bert.ipynb> +`Bert向量化 <../../build/blitz/t2v/t2v_bert.ipynb>`_ - DisenQNet向量化 <../../build/blitz/t2v/t2v_disenq.ipynb> +`DisenQNet向量化 <../../build/blitz/t2v/t2v_disenq.ipynb>`_ - QuesNet向量化 <../../build/blitz/t2v/t2v_quesnet.ipynb> +`QuesNet向量化 <../../build/blitz/t2v/t2v_quesnet.ipynb>`_ diff --git a/examples/downstream/difficulty_prediction/difficulty_prediction.ipynb b/examples/downstream/difficulty_prediction/difficulty_prediction.ipynb new file mode 100644 index 00000000..57001873 --- /dev/null +++ b/examples/downstream/difficulty_prediction/difficulty_prediction.ipynb @@ -0,0 +1,359 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 难度预估" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from torch import nn\n", + "import numpy as np\n", + "from transformers import BertModel, TrainingArguments, Trainer, PretrainedConfig, DataCollatorWithPadding\n", + "import torch.nn.functional as F\n", + "from sklearn.metrics import ndcg_score, mean_squared_error\n", + "from torchmetrics import MeanAbsoluteError, PearsonCorrCoef, SpearmanCorrCoef\n", + "import os\n", + "import tqdm\n", + "from EduNLP.Pretrain import BertTokenizer \n", + "from EduNLP.ModelZoo.base_model import BaseModel\n", + "from EduNLP.Pretrain import EduDataset\n", + "import json\n", + "from utils import load_json, get_val, get_train\n", + "\n", + "ROOT = os.path.dirname(os.path.dirname(__file__))\n", + "DATA_DIR = os.path.join(ROOT, \"data\")\n", + "os.environ[\"CUDA_VISIBLE_DEVICES\"]= \"0\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "MAE = MeanAbsoluteError()\n", + "PCC = PearsonCorrCoef()\n", + "SCC = SpearmanCorrCoef()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 加载数据,定义路径" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "output_dir = \"path/to/output_dir\" #设置模型保存路径\n", + "pretrained_model_dir = os.path.join(DATA_DIR, \"bert_math_768\") #以预训练的bert路径为例,也可以更换为其他模型的路径,如disenqnet, roberta等\n", + "checkpoint_dir = \"path/to/difficulty_prediction_checkpoint\"\n", + "train_data = load_json(os.path.join(DATA_DIR, \"train\", \"高中数学.json\")) #加载训练集\n", + "train_items = get_train(train_data)\n", + "val_data = load_json(os.path.join(DATA_DIR, \"test\", \"高中数学paper.json\")) #加载测试集\n", + "val_items, val_gap = get_val(val_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 训练" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 定义网络结构" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "class BertForDifficultyPrediction(BaseModel): \n", + " def __init__(self, pretrained_model_dir=None, classifier_dropout=0.5):\n", + " super(BertForDifficultyPrediction, self).__init__()\n", + " self.bert = BertModel.from_pretrained(pretrained_model_dir, ignore_mismatched_sizes=True)\n", + " hidden_size = self.bert.config.hidden_size\n", + " self.classifier_dropout = classifier_dropout\n", + " self.dropout = nn.Dropout(classifier_dropout)\n", + " self.classifier = nn.Linear(hidden_size, 1)\n", + " self.sigmoid = nn.Sigmoid()\n", + "\n", + " self.config = {k: v for k, 
v in locals().items() if k not in [\"self\", \"__class__\"]}\n", + " self.config['architecture'] = 'BertForDifficultyPrediction'\n", + " self.config = PretrainedConfig.from_dict(self.config)\n", + "\n", + " def forward(self,\n", + " input_ids=None,\n", + " attention_mask=None,\n", + " token_type_ids=None,\n", + " labels=None,\n", + " ):\n", + " \n", + " item_embed = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)['last_hidden_state'][:, 0, :]\n", + "\n", + " logits = self.sigmoid(self.classifier(item_embed)).squeeze(0)\n", + " loss = None\n", + " if labels is not None:\n", + " loss = F.mse_loss(logits, labels) if labels is not None else None\n", + " return loss, logits\n", + " \n", + " @classmethod\n", + " def from_config(cls, config_path, *args, **kwargs):\n", + " with open(config_path, \"r\", encoding=\"utf-8\") as rf:\n", + " model_config = json.load(rf)\n", + " model_config.update(kwargs)\n", + " return cls(\n", + " pretrained_model_dir=model_config['pretrained_model_dir'],\n", + " classifier_dropout=model_config.get(\"classifier_dropout\", 0.5), \n", + " )" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 定义训练相关参数" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "class BertDataset(EduDataset):\n", + " pass\n", + "\n", + "class MyTrainer(Trainer):\n", + " pass\n", + "\n", + "def train_diff_pred(\n", + " output_dir,\n", + " pretrained_model_dir,\n", + " train_items=None,\n", + " val_items=None,\n", + " train_params=None):\n", + " tokenizer = BertTokenizer.from_pretrained(pretrained_model_dir)\n", + " model = BertForDifficultyPrediction(pretrained_model_dir=pretrained_model_dir)\n", + " model.bert.resize_token_embeddings(len(tokenizer.bert_tokenizer))\n", + " # training parameters\n", + " if train_params is not None:\n", + " epochs = train_params['epochs'] if 'epochs' in train_params else 1\n", + " batch_size = train_params['batch_size'] if 'batch_size' in train_params else 64\n", + " save_steps = train_params['save_steps'] if 'save_steps' in train_params else 100\n", + " save_total_limit = train_params['save_total_limit'] if 'save_total_limit' in train_params else 2\n", + " logging_steps = train_params['logging_steps'] if 'logging_steps' in train_params else 5\n", + " gradient_accumulation_steps = train_params['gradient_accumulation_steps'] \\\n", + " if 'gradient_accumulation_steps' in train_params else 1\n", + " logging_dir = train_params['logging_dir'] if 'logging_dir' in train_params else f\"{ROOT}/log\"\n", + " else:\n", + " # default\n", + " epochs = 50\n", + " batch_size = 1\n", + " save_steps = 1000\n", + " save_total_limit = 2\n", + " logging_steps = 100\n", + " gradient_accumulation_steps = 1\n", + " logging_dir = f\"{ROOT}/log\"\n", + "\n", + "\n", + " train_dataset = BertDataset(tokenizer=tokenizer, items=train_items, stem_key=\"content\", label_key=\"labels\")\n", + " eval_dataset = BertDataset(tokenizer=tokenizer, items=val_items, stem_key=\"content\", label_key=\"labels\")\n", + "\n", + " training_args = TrainingArguments(\n", + " output_dir=output_dir,\n", + " overwrite_output_dir=True,\n", + "\n", + " num_train_epochs=epochs,\n", + " per_device_train_batch_size=batch_size,\n", + " per_device_eval_batch_size=batch_size,\n", + " evaluation_strategy = \"steps\", \n", + " eval_steps=logging_steps*5,\n", + " \n", + " save_steps=save_steps,\n", + " save_total_limit=save_total_limit,\n", + " \n", + " 
logging_steps=logging_steps,\n", + " logging_dir=logging_dir,\n", + "\n", + " gradient_accumulation_steps=gradient_accumulation_steps,\n", + " learning_rate=5e-5,\n", + " )\n", + " data_collator = DataCollatorWithPadding(tokenizer.bert_tokenizer)\n", + " trainer = MyTrainer(\n", + " model=model,\n", + " args=training_args,\n", + " data_collator=data_collator,\n", + " train_dataset=train_dataset,\n", + " eval_dataset=eval_dataset,\n", + " )\n", + "\n", + " trainer.train() #训练模型\n", + " trainer.save_model(output_dir)\n", + " trainer.model.save_config(output_dir)\n", + " tokenizer.save_pretrained(output_dir) #保存训练后的模型" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 训练模型" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_diff_pred(\n", + " output_dir,\n", + " pretrained_model_dir,\n", + " train_items=train_items,\n", + " val_items=val_items,\n", + " train_params= None\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 测试" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 加载测试集和模型" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class EvalDataset(torch.utils.data.Dataset):\n", + " def __init__(self, items, tokenizer):\n", + " self.tokenizer = tokenizer\n", + " self.items = items\n", + " \n", + " def __getitem__(self, index):\n", + " content, labels = self.items[index][\"content\"], self.items[index][\"labels\"]\n", + " encodings = self.tokenizer(str(content), max_length=512, truncation=True, return_tensors=\"pt\")\n", + " for k, v in encodings.items():\n", + " encodings[k] = v\n", + " return encodings, torch.as_tensor([labels])\n", + " \n", + " def __len__(self):\n", + " return len(self.items)\n", + " \n", + "tokenizer = BertTokenizer.from_pretrained(pretrained_model_dir)\n", + "eval_dataloader = EvalDataset(\n", + " items=val_items,\n", + " tokenizer=tokenizer,\n", + " )\n", + "model = BertForDifficultyPrediction.from_pretrained(checkpoint_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 在测试集上评估" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def compute_metrics(pres, golds):\n", + " logits = torch.as_tensor(pres)\n", + " labels = torch.as_tensor(golds)\n", + " ret = {\n", + " \"mae\": MAE(logits, labels).tolist(),\n", + " \"mse\": mean_squared_error(golds, pres),\n", + " \"rmse\": np.sqrt(mean_squared_error(golds, pres)),\n", + " \"pcc\": PCC(logits, labels).tolist(),\n", + " \"scc\": SCC(logits, labels).tolist(),\n", + " 'ndcg @all, @10, @20, @30': testdata_metrics(val_gap, golds, pres).tolist(),\n", + " }\n", + " return ret\n", + "\n", + "def testdata_metrics(val_gap, diff, pred):\n", + " diff, pred = np.array(diff), np.array(pred)\n", + " idx = np.where(diff>0)[0]\n", + " ndcg = []\n", + " for s, e in val_gap:\n", + " _diff, _pred = diff[s:e], pred[s:e]\n", + " if _diff[0]==-1:\n", + " _diff = [i+1 for i in range(len(_diff))]\n", + " ndcg.append([ndcg_score([_diff], [_pred]), ndcg_score([_diff], [_pred], k=10), ndcg_score([_diff], [_pred], k=20), ndcg_score([_diff], [_pred], k=30)])\n", + " ndcg = np.mean(ndcg, axis=0)\n", + " return ndcg\n", + "\n", + "model.eval()\n", + "pred_list = []\n", + "label_list = []\n", + "for i, eval_batch in tqdm.tqdm(enumerate(eval_dataloader)):\n", + " input_data, eval_batch_labels = eval_batch\n", 
+ " _, output_logits = model(**input_data)\n", + " pred_list.append(output_logits.tolist()[0])\n", + " label_list.append(eval_batch_labels.tolist()[0])\n", + "\n", + "results = compute_metrics(pred_list, label_list)\n", + "print(f\"Test results: {results}\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "tgen", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.0" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/downstream/difficulty_prediction/utils.py b/examples/downstream/difficulty_prediction/utils.py new file mode 100644 index 00000000..fe503a26 --- /dev/null +++ b/examples/downstream/difficulty_prediction/utils.py @@ -0,0 +1,36 @@ +import json +import pandas as pd + + +def load_json(open_path): + print("[load_json] start : {}".format(open_path)) + with open(open_path, "r", encoding='utf-8') as f: + load_q = json.load(f) + print("[load_json] num = {}, open_path = {}".format(len(load_q), open_path)) + return load_q + + +def get_train(train): + train_data = [] + for item in train: + dic = {} + dic["content"] = item["content"] + dic["labels"] = float(item["difficulty"]) + train_data.append(dic) + return train_data + + +def get_val(val): + test_data, test_gap = [], [] + start, end = 0, 0 + for batch in val: + end += len(batch['questions']) + for item in batch['questions']: + dic = {} + dic['content'] = item["stem"] + dic['labels'] = item['diff'] + # dic["labels"] = dic.pop("difficulty") + test_data.append(dic) + test_gap.append([start, end]) + start = end + return test_data, test_gap diff --git a/examples/downstream/discrimination_prediction/discrimination_prediction.ipynb b/examples/downstream/discrimination_prediction/discrimination_prediction.ipynb new file mode 100644 index 00000000..61afaa19 --- /dev/null +++ b/examples/downstream/discrimination_prediction/discrimination_prediction.ipynb @@ -0,0 +1,352 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 区分度预估" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from torch import nn\n", + "import numpy as np\n", + "from transformers import BertModel, TrainingArguments, Trainer, PretrainedConfig, DataCollatorWithPadding\n", + "import torch.nn.functional as F\n", + "from sklearn.metrics import ndcg_score, mean_squared_error\n", + "from torchmetrics import MeanAbsoluteError, PearsonCorrCoef, SpearmanCorrCoef\n", + "import os\n", + "import tqdm\n", + "from EduNLP.Pretrain import BertTokenizer \n", + "from EduNLP.ModelZoo.base_model import BaseModel\n", + "from EduNLP.Pretrain import EduDataset\n", + "import json\n", + "from utils import pre_disc\n", + "\n", + "ROOT = os.path.dirname(os.path.dirname(__file__))\n", + "DATA_DIR = os.path.join(ROOT, \"data\")\n", + "os.environ[\"CUDA_VISIBLE_DEVICES\"]= \"0\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "MAE = MeanAbsoluteError()\n", + "PCC = PearsonCorrCoef()\n", + "SCC = SpearmanCorrCoef()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 加载数据,定义路径" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": 
{}, + "outputs": [], + "source": [ + "output_dir = \"path/to/output_dir\" #设置模型保存路径\n", + "pretrained_model_dir = os.path.join(DATA_DIR, \"bert_math_768\") #以预训练的bert路径为例,也可以更换为其他模型的路径,如disenqnet, roberta等\n", + "checkpoint_dir = \"path/to/discrimination_prediction_checkpoint\"\n", + "train_items = pre_disc(os.path.join(DATA_DIR, \"train\", \"ctt_train.csv\")) #加载训练集\n", + "val_items = pre_disc(os.path.join(DATA_DIR, \"test\", \"ctt_test.csv\")) #加载测试集" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 训练" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 定义网络结构" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "class BertForDiscriminationPrediction(BaseModel): \n", + " def __init__(self, pretrained_model_dir=None, classifier_dropout=0.5):\n", + " super(BertForDiscriminationPrediction, self).__init__()\n", + " self.bert = BertModel.from_pretrained(pretrained_model_dir, ignore_mismatched_sizes=True)\n", + " hidden_size = self.bert.config.hidden_size\n", + " self.classifier_dropout = classifier_dropout\n", + " self.dropout = nn.Dropout(classifier_dropout)\n", + " self.classifier = nn.Linear(hidden_size, 1)\n", + " self.sigmoid = nn.Sigmoid()\n", + "\n", + " self.config = {k: v for k, v in locals().items() if k not in [\"self\", \"__class__\"]}\n", + " self.config['architecture'] = 'BertForDiscriminationPrediction'\n", + " self.config = PretrainedConfig.from_dict(self.config)\n", + "\n", + " def forward(self,\n", + " input_ids=None,\n", + " attention_mask=None,\n", + " token_type_ids=None,\n", + " labels=None,\n", + " ):\n", + " \n", + " item_embed = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)['last_hidden_state'][:, 0, :]\n", + "\n", + " logits = self.sigmoid(self.classifier(item_embed)).squeeze(0)\n", + " loss = None\n", + " if labels is not None:\n", + " loss = F.mse_loss(logits, labels) if labels is not None else None\n", + " return loss, logits\n", + " \n", + " @classmethod\n", + " def from_config(cls, config_path, *args, **kwargs):\n", + " with open(config_path, \"r\", encoding=\"utf-8\") as rf:\n", + " model_config = json.load(rf)\n", + " model_config.update(kwargs)\n", + " return cls(\n", + " pretrained_model_dir=model_config['pretrained_model_dir'],\n", + " classifier_dropout=model_config.get(\"classifier_dropout\", 0.5), \n", + " )" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 定义训练相关参数" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "class BertDataset(EduDataset):\n", + " pass\n", + "\n", + "class MyTrainer(Trainer):\n", + " pass\n", + "\n", + "def train_disc_pred(\n", + " output_dir,\n", + " pretrained_model_dir,\n", + " train_items=None,\n", + " val_items=None,\n", + " train_params=None):\n", + " tokenizer = BertTokenizer.from_pretrained(pretrained_model_dir)\n", + " model = BertForDiscriminationPrediction(pretrained_model_dir=pretrained_model_dir)\n", + " model.bert.resize_token_embeddings(len(tokenizer.bert_tokenizer))\n", + " # training parameters\n", + " if train_params is not None:\n", + " epochs = train_params['epochs'] if 'epochs' in train_params else 1\n", + " batch_size = train_params['batch_size'] if 'batch_size' in train_params else 64\n", + " save_steps = train_params['save_steps'] if 'save_steps' in train_params else 100\n", + " save_total_limit = 
train_params['save_total_limit'] if 'save_total_limit' in train_params else 2\n", + " logging_steps = train_params['logging_steps'] if 'logging_steps' in train_params else 5\n", + " gradient_accumulation_steps = train_params['gradient_accumulation_steps'] \\\n", + " if 'gradient_accumulation_steps' in train_params else 1\n", + " logging_dir = train_params['logging_dir'] if 'logging_dir' in train_params else f\"{ROOT}/log\"\n", + " else:\n", + " # default\n", + " epochs = 50\n", + " batch_size = 1\n", + " save_steps = 1000\n", + " save_total_limit = 2\n", + " logging_steps = 100\n", + " gradient_accumulation_steps = 1\n", + " logging_dir = f\"{ROOT}/log\"\n", + "\n", + "\n", + " train_dataset = BertDataset(tokenizer=tokenizer, items=train_items, stem_key=\"content\", label_key=\"labels\")\n", + " eval_dataset = BertDataset(tokenizer=tokenizer, items=val_items, stem_key=\"content\", label_key=\"labels\")\n", + "\n", + " training_args = TrainingArguments(\n", + " output_dir=output_dir,\n", + " overwrite_output_dir=True,\n", + "\n", + " num_train_epochs=epochs,\n", + " per_device_train_batch_size=batch_size,\n", + " per_device_eval_batch_size=batch_size,\n", + " evaluation_strategy = \"steps\", \n", + " eval_steps=logging_steps*5,\n", + " \n", + " save_steps=save_steps,\n", + " save_total_limit=save_total_limit,\n", + " \n", + " logging_steps=logging_steps,\n", + " logging_dir=logging_dir,\n", + "\n", + " gradient_accumulation_steps=gradient_accumulation_steps,\n", + " learning_rate=5e-5,\n", + " )\n", + " data_collator = DataCollatorWithPadding(tokenizer.bert_tokenizer)\n", + " trainer = MyTrainer(\n", + " model=model,\n", + " args=training_args,\n", + " data_collator=data_collator,\n", + " train_dataset=train_dataset,\n", + " eval_dataset=eval_dataset,\n", + " )\n", + "\n", + " trainer.train() #训练模型\n", + " trainer.save_model(output_dir)\n", + " trainer.model.save_config(output_dir)\n", + " tokenizer.save_pretrained(output_dir) #保存训练后的模型" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 训练模型" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_disc_pred(\n", + " output_dir,\n", + " pretrained_model_dir=pretrained_model_dir,\n", + " train_items=train_items,\n", + " val_items=val_items,\n", + " train_params= None\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 测试" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 加载测试集和模型" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class EvalDataset(torch.utils.data.Dataset):\n", + " def __init__(self, items, tokenizer):\n", + " self.tokenizer = tokenizer\n", + " self.items = items\n", + " \n", + " def __getitem__(self, index):\n", + " content, labels = self.items[index][\"content\"], self.items[index][\"labels\"]\n", + " encodings = self.tokenizer(str(content), max_length=512, truncation=True, return_tensors=\"pt\")\n", + " for k, v in encodings.items():\n", + " encodings[k] = v\n", + " return encodings, torch.as_tensor([labels])\n", + " \n", + " def __len__(self):\n", + " return len(self.items)\n", + " \n", + "tokenizer = BertTokenizer.from_pretrained(pretrained_model_dir)\n", + "eval_dataloader = EvalDataset(\n", + " items=val_items,\n", + " tokenizer=tokenizer,\n", + " )\n", + "model = BertForDiscriminationPrediction.from_pretrained(checkpoint_dir)" + ] + }, + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "#### 在测试集上评估" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def compute_metrics(pres, golds):\n", + " logits = torch.as_tensor(pres)\n", + " labels = torch.as_tensor(golds)\n", + " ret = {\n", + " \"mae\": MAE(logits, labels).tolist(),\n", + " \"mse\": mean_squared_error(golds, pres),\n", + " \"rmse\": np.sqrt(mean_squared_error(golds, pres)),\n", + " \"pcc\": PCC(logits, labels).tolist(),\n", + " \"scc\": SCC(logits, labels).tolist(),\n", + " 'ndcg @all, @10, @20, @30': testdata_metrics(golds, pres).tolist(),\n", + " }\n", + " return ret\n", + "\n", + "def testdata_metrics(diff, pred):\n", + " diff, pred = np.array(diff), np.array(pred)\n", + " ndcg = []\n", + " ndcg.append([ndcg_score([diff], [pred]), ndcg_score([diff], [pred], k=10), ndcg_score([diff], [pred], k=20), ndcg_score([diff], [pred], k=30)])\n", + " ndcg = np.mean(ndcg, axis=0)\n", + " return ndcg\n", + "\n", + "model.eval()\n", + "pred_list = []\n", + "label_list = []\n", + "for i, eval_batch in tqdm.tqdm(enumerate(eval_dataloader)):\n", + " input_data, eval_batch_labels = eval_batch\n", + " _, output_logits = model(**input_data)\n", + " pred_list.append(output_logits.tolist()[0])\n", + " label_list.append(eval_batch_labels.tolist()[0])\n", + "\n", + "results = compute_metrics(pred_list, label_list)\n", + "print(f\"Test results: {results}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "tgen", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.0" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/downstream/discrimination_prediction/utils.py b/examples/downstream/discrimination_prediction/utils.py new file mode 100644 index 00000000..f2bbe167 --- /dev/null +++ b/examples/downstream/discrimination_prediction/utils.py @@ -0,0 +1,15 @@ +import json +import pandas as pd + + +def pre_disc(csv_path): + items = pd.read_csv(csv_path) + stem = items["stem"].tolist() + disc = items["disc"].tolist() + data = [] + for i in range(len(stem)): + dic = {} + dic["content"] = stem[i] + dic["labels"] = disc[i] + data.append(dic) + return data diff --git a/examples/downstream/knowledge_prediction/konwledge_prediction.ipynb b/examples/downstream/knowledge_prediction/konwledge_prediction.ipynb new file mode 100644 index 00000000..c1a70468 --- /dev/null +++ b/examples/downstream/knowledge_prediction/konwledge_prediction.ipynb @@ -0,0 +1,215 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 层级知识点预测" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import yaml\n", + "import tqdm\n", + "import torch\n", + "import numpy as np\n", + "from EduNLP.Pretrain import BertTokenizer\n", + "from EduNLP.ModelZoo.bert import BertForKnowledgePrediction\n", + "from EduNLP.Pretrain import finetune_bert_for_knowledge_prediction\n", + "from EduNLP.ModelZoo import load_items\n", + "\n", + "from utils import compute_perfs_per_layer, get_onehot_label_topk, metric, compute_perfs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_data = 
load_items(\"/path/to/data/train.jsonl\")\n", + "test_data = load_items(\"/path/to/data/test.jsonl\")\n", + "\n", + "pretrained_model_dir =\"/path/to/bert/checkpoint\"\n", + "checkpoint_dir = \"/path/to/knowledge_model/checkpoint\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 训练" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 以bert为例\n", + "data_params = {\n", + " \"stem_key\": \"ques_content\",\n", + " \"label_key\": \"know_list\"\n", + "}\n", + "train_params = {\n", + " \"num_train_epochs\": 1,\n", + " \"per_device_train_batch_size\": 2,\n", + " \"per_device_eval_batch_size\": 2,\n", + " \"no_cuda\": True,\n", + "}\n", + "model_params = {\n", + " \"num_classes_list\": [10, 27, 963],\n", + " \"num_total_classes\": 1000,\n", + "}\n", + " \n", + "\n", + "\"\"\"\n", + "数据格式:\n", + "{\n", + " 'ques_content': 'question...',\n", + " 'know_list': [lay_1_id, lay_2_id, lay_3_id]\n", + "}\n", + "\"\"\"\n", + "\n", + "# train without eval_items\n", + "finetune_bert_for_knowledge_prediction(\n", + " train_data,\n", + " checkpoint_dir,\n", + " pretrained_model=pretrained_model_dir,\n", + " train_params=train_params,\n", + " data_params=data_params,\n", + " model_params=model_params\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 加载模型和评估数据" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 针对多标签任务处理标签\n", + "class EvalDataset(torch.utils.data.Dataset):\n", + " def __init__(self, data) -> None:\n", + " self.data = data\n", + " self.num_classes = model_params['num_classes_list']\n", + " self.tokenizer = BertTokenizer.from_pretrained(pretrained_model_dir)\n", + "\n", + " def __getitem__(self, idx):\n", + " text, labels = self.data[idx][\"ques_content\"], self.data[idx][\"know_list\"]\n", + " encodings = self.tokenizer(text, padding='max_length', truncation=True, return_tensors='pt')\n", + " for k, v in encodings.items():\n", + " encodings[k] = torch.squeeze(v, dim=0)\n", + " one_hot_labels = [1. if idx in labels else 0. 
for idx in range(self.num_classes)]\n", + " return encodings, torch.FloatTensor(one_hot_labels)\n", + "\n", + " def __len__(self):\n", + " return len(self.data)\n", + "\n", + "test_dataset = EvalDataset(test_data)\n", + "eval_dataloader = EvalDataset(\n", + " test_data,\n", + " batch_size=1,\n", + " shuffle=False,\n", + " num_workers=4,\n", + ")\n", + "\n", + "model = BertForKnowledgePrediction.from_pretrained(checkpoint_dir)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 评估" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "device = \"cuda\" if not train_params[\"no_cuda\"] else \"cpu\"\n", + "\n", + "# 层级知识标签-配置信息\n", + "levels = len(model_params[\"num_classes_list\"])\n", + "classes_offset_list = [0, 10, 37]\n", + "classes_border_list = [[0, 9], [10, 36], [37, 1000]] # 层级id边界\n", + "hierarchy_dict = {} # child_know_id_to_parent_know_id\n", + "\n", + "# 评估top_k结果\n", + "top_k_list=[10, 20, 30]\n", + "\n", + "model.eval()\n", + "perfs_per_layer = [np.array([0 for _ in range(4)], dtype=np.int32) for _ in range(levels)]\n", + "total_perfs = np.array([0 for _ in range(4)], dtype=np.int32)\n", + "\n", + "k_total_perfs_list = [ np.array([0 for _ in range(4)], dtype=np.int32)for _ in range(len(top_k_list)) ]\n", + "for i, eval_batch in tqdm.tqdm(enumerate(eval_dataloader)):\n", + " input_data, eval_batch_labels = eval_batch\n", + " input_data = input_data.to(device)\n", + " _, output_logits = model(**input_data)\n", + "\n", + " local_perfs_per_layer, local_perfs = compute_perfs_per_layer(\n", + " output_logits.cpu().detach().numpy(),\n", + " eval_batch_labels.cpu().detach().numpy(),\n", + " hierarchy_dict,\n", + " classes_border_list,\n", + " keep_consistency=True\n", + " )\n", + " perfs_per_layer = [perfs_per_layer[idx] + local_perfs_per_layer[idx] for idx in range(levels)]\n", + " total_perfs += local_perfs\n", + " \n", + " # for recall@k\n", + " for i_k, k in enumerate(top_k_list):\n", + " pred_topk = get_onehot_label_topk(\n", + " classes_border_list, classes_offset_list, scores_list=output_logits.cpu().detach().numpy(), top_num=k)\n", + " flat_pred_topk = np.array([x[3] for x in pred_topk])\n", + " k_total_perfs = compute_perfs(flat_pred_topk, eval_batch_labels.cpu().detach().numpy().tolist())\n", + " k_total_perfs_list[i_k] += k_total_perfs\n", + "\n", + "# metric for overall\n", + "micro_precision, micro_recall, micro_f1, total_acc = metric(*total_perfs)\n", + "print(f\"Eval Results: Micro-Precision: {micro_precision:.4f}, \"\n", + " + f\"Micro-Recall: {micro_recall:.4f}, Micro-F1: {micro_f1:.4f}, Acc: {total_acc:.4f}\")\n", + "\n", + "# metrics for per top_k\n", + "for i_k, k_total_perfs in enumerate(k_total_perfs_list):\n", + " k = top_k_list[i_k]\n", + " precision, recall, f1, acc = metric(*k_total_perfs)\n", + " print(f\"TOPK={k}: Precision@{k}: {precision:.4f}, Recall@{k}: {recall:.4f}, F1@{k}: {f1:.4f}, Acc@{k}: {acc:.4f}\")\n", + "\n", + "# metrics for per layer\n", + "for layer_idx, perfs in enumerate(perfs_per_layer):\n", + " precision, recall, f1, acc = metric(*perfs)\n", + " print(f\"Layer {layer_idx + 1}: Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, Acc: {acc:.4f}\")" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/downstream/knowledge_prediction/utils.py b/examples/downstream/knowledge_prediction/utils.py new file mode 100644 
index 00000000..03cac7be --- /dev/null +++ b/examples/downstream/knowledge_prediction/utils.py @@ -0,0 +1,120 @@ +import numpy as np +import torch +import heapq +from EduNLP.Pretrain import BertTokenizer + + +def get_onehot_label_topk(classes_border_list, classes_offset_list, scores_list: np.ndarray, top_num=1): + """ + Get the predicted labels based on the topK. + + Args: + classes_border_list + classes_offset_list + scores_list: The all classes predicted scores provided by network + top_num: The max topK number (default: 5) + Returns: + predicted_onehot_labels: The predicted labels (onehot) + """ + pred_onehot_labels = [] + scores_list = np.ndarray.tolist(scores_list) + border, offset = classes_border_list, classes_offset_list + num_level = len(border) + for scores in scores_list: + onehot_labels_list = [0] * len(scores) + hlabels = {} + for level in range(num_level): + begin, end = border[level][0], border[level][1] + cur_scores = scores[begin: end + 1] + cur_offset = offset[level] + cur_onehot_labels_list = [0] * len(cur_scores) + # pred_onehot_scores[level].append(cur_scores) + max_num_index_list = list(map(cur_scores.index, heapq.nlargest(top_num, cur_scores))) + for i in max_num_index_list: + cur_onehot_labels_list[i] = 1 + onehot_labels_list[i + cur_offset] = 1 + hlabels[level] = cur_onehot_labels_list + # pred_onehot_scores[-1].append(scores) + hlabels[num_level] = onehot_labels_list + pred_onehot_labels.append(hlabels) + return pred_onehot_labels + + +def compute_perfs(pred_labels: np.ndarray, true_labels: np.ndarray) -> tuple: + # TP: number of labels which is predicted as True and is actually True. + TP = np.sum(pred_labels * true_labels) + # FP: number of labels which is predicted as True and is actually False. + FP = np.sum(((pred_labels - true_labels) > 0).astype(np.int32)) + # FN: number of labels which is predicted as False and is actually True. + FN = np.sum(((true_labels - pred_labels) > 0).astype(np.int32)) + # FP: number of labels which is predicted as False and is actually False. 
+ TN = np.sum(((pred_labels + true_labels) == 0).astype(np.int32)) + return np.array([TP, FP, FN, TN], dtype=np.int32) + + +def compute_perfs_per_layer(outputs: np.ndarray, true_labels: np.ndarray, hierarchy: dict, classes_border_list: list, keep_consistency: bool = True, threshold=0.5) -> tuple: + def _make_labels_consistent(input_labels: np.ndarray, hierarchy: dict): + input_labels = input_labels.astype(np.int32) + while len(hierarchy) > 0: + bottom_labels = set(hierarchy.keys()) - set(hierarchy.values()) + for child in bottom_labels: + mask = (input_labels[:, child] == 1).astype(np.int32) + input_labels[:, hierarchy[child]] |= mask + for k in bottom_labels: + hierarchy.pop(k) + return input_labels + + preds = [] + for (start, end) in classes_border_list: + threshold_labels = (outputs[:, start:end + 1] >= threshold).astype(np.int32) + max_labels = (outputs[:, start:end + 1] == outputs[:, start:end + 1].max(axis=1)[:,None]).astype(np.int32) + preds.append(threshold_labels | max_labels) + pred_labels = np.concatenate(preds, axis=-1) + del preds + if keep_consistency: + pred_labels = _make_labels_consistent(pred_labels, hierarchy.copy()) + true_labels = _make_labels_consistent(true_labels, hierarchy.copy()) + # get perfs per layer + perfs_per_layer = [] + for (start, end) in classes_border_list: + perfs_per_layer.append(compute_perfs(pred_labels[:, start:end + 1], true_labels[:, start:end + 1])) + total_perfs = compute_perfs(pred_labels, true_labels) + return perfs_per_layer, total_perfs + + +def compute_topk_recall(topk_preds: list, true_labels: list) -> tuple: + rs = [] + for pred, label in zip(topk_preds, true_labels): + _r = len(set(pred) & set(label)) / len(label) + rs.append(_r) + return np.mean(rs) + + +def quantile(array: torch.Tensor, ratio: float): + """ + get quantile of array + """ + assert ratio >= 0 and ratio <= 1 + assert len(array.shape) == 1 + sorted_array = torch.sort(array, dim=-1, descending=True)[0] + index = min(int(len(array) * ratio + 0.5), len(array)) + return sorted_array[index].item() + + +def metric(TP, FP, FN, TN): + def _f1_score(precision, recall): + if precision + recall == 0: + return 0. 
+ else: + return 2 * precision * recall / (precision + recall) + if TP + FP == 0: + precision = 0 + else: + precision = TP / (TP + FP) + if TP + FN == 0: + recall = 0 + else: + recall = TP / (TP + FN) + micro_f1 = _f1_score(precision, recall) + acc = (TP + TN) / (TP + FP + FN + TN) + return precision, recall, micro_f1, acc \ No newline at end of file diff --git a/examples/downstream/paper_segmentation/load_data.py b/examples/downstream/paper_segmentation/load_data.py new file mode 100644 index 00000000..6422ea00 --- /dev/null +++ b/examples/downstream/paper_segmentation/load_data.py @@ -0,0 +1,161 @@ +import os +import numpy as np +from tqdm import tqdm +import jieba +import re +import torch +from torch.utils.data import Dataset +from torch.autograd import Variable +import torch.nn.functional as F +from EduNLP.I2V import W2V, Bert, DisenQ +import warnings + + +VECTOR_MODEL_MAP = { + "w2v": W2V, + "bert": Bert, + "disenq": DisenQ, +} + +class PaperI2V(): + def __init__(self, pretrained_model_type, pretrained_model_dir, device="cpu", language=""): + self.pretrained_model_type = pretrained_model_type + self.pretrained_model_dir = pretrained_model_dir + self.device = device + self.language = language + + tokenizer_kwargs = {"tokenizer_config_dir": pretrained_model_dir} + + if pretrained_model_type == "w2v": + # set text tokenizer + text_params = { + "granularity": "word", + "stopwords": None, + } + tokenizer_kwargs["text_params"] = text_params + self.i2v = VECTOR_MODEL_MAP[pretrained_model_type]("pure_text", + 'w2v', + pretrained_model_dir, + tokenizer_kwargs=tokenizer_kwargs + ) + elif pretrained_model_type in ["bert"]: + self.i2v = VECTOR_MODEL_MAP[pretrained_model_type]('bert', + 'bert', + pretrained_model_dir, + tokenizer_kwargs=tokenizer_kwargs, + device=device + ) + elif pretrained_model_type in ["disenq"]: + self.i2v = VECTOR_MODEL_MAP[pretrained_model_type]('disenq', + 'disenq', + pretrained_model_dir, + tokenizer_kwargs=tokenizer_kwargs, + device=device + ) + @classmethod + def prcoess_line(cls, aline): + return aline + + def to_embedding(self, aline): + aline = self.prcoess_line(aline) + + if self.pretrained_model_type == "w2v_pub": + words = jieba.lcut(aline) + token_vector = [] + for word in words: + if not re.sub(r'\s', '', word) == '': + temp_emb = self.i2v.word_to_embedding(word) + token_vector.append(temp_emb.tolist()) + token_vector = torch.FloatTensor(token_vector) + elif self.pretrained_model_type == "w2v": + token_vector = torch.FloatTensor(np.array(self.i2v.infer_token_vector([aline]))).squeeze(0) + elif self.pretrained_model_type == "bert": + token_vector = self.i2v.infer_token_vector([aline]).squeeze(0) + token_vector = token_vector.float().cpu().detach() + elif self.pretrained_model_type == "disenq": + token_vector = self.i2v.infer_token_vector([aline]).squeeze(0) + token_vector = token_vector.float().cpu().detach() + + if aline == "": + warnings.warn("[ERROR] to_embedding: aline is empty") + return None + + return token_vector + +class VecDataset(Dataset): + def __init__(self, + language="", + text_data_dir=None, + emb_data_path=None, + pretrained_model_type="bert", + pretrained_model_dir=None, + paper_i2v: PaperI2V = None, + device="cpu", + mode="train", + do_w2v = False, + ): + self.device = device + self.text_data_dir = text_data_dir + self.emb_data_path = emb_data_path + self.pretrained_model_type = pretrained_model_type + self.pretrained_model_dir = pretrained_model_dir + self.mode = mode + self.input_data = [] + + if paper_i2v is not None: + self.paper_i2v = 
paper_i2v + else: + # 适配 双语pub_w2v + language = "english" if language == "english" else "" + self.paper_i2v = PaperI2V(pretrained_model_type, + pretrained_model_dir, + device=self.device, + language=language) + + if not os.path.exists(emb_data_path) or do_w2v: + os.makedirs(os.path.dirname(emb_data_path), exist_ok=True) + self.set_all_text_embedding(text_data_dir, emb_data_path) + else: + self.get_all_text_embedding(emb_data_path) + + @property + def embed_dim(self): + return self.paper_i2v.vector_size + + def __getitem__(self, index): + # return self.input_data[index] + doc, tag = self.input_data[index] + self.input_data[index][0] = [ sent.to(self.device) for sent in doc] + self.input_data[index][1] = tag.to(self.device) + return self.input_data[index] + + def __len__(self): + return len(self.input_data) + + def set_all_text_embedding(self, indir, outpath): + print(f'setting {self.mode} data ... ') + path_list = os.listdir(indir) + for file_name in tqdm(path_list): + file_path = os.path.join(indir, file_name) + doc, tag = self.paper_i2v.get_tagged_text_to_embedding(file_path) + self.input_data.append( [doc, tag] ) + torch.save(self.input_data, outpath) + + def get_all_text_embedding(self, inpath): + print(f'loading {self.mode} data ... ') + self.input_data = torch.load(inpath) + + def pad(self, tags): + max_length = max([_tags.size()[0] for _tags in tags]) + for i, _tags in enumerate(tags): + _length = _tags.size()[0] + tags[i] = F.pad(_tags, (0, max_length - _length)) # (max_length) + return torch.stack(tags, dim=0) + + def collcate_fn(self, batch_data): + documents, tags = list(zip(*batch_data)) + batch = { + "documents": list(documents), + "tags": self.pad(list(tags)), + } + return batch \ No newline at end of file diff --git a/examples/downstream/paper_segmentation/model.py b/examples/downstream/paper_segmentation/model.py new file mode 100644 index 00000000..466c672a --- /dev/null +++ b/examples/downstream/paper_segmentation/model.py @@ -0,0 +1,216 @@ +import torch +import torch.nn as nn +import torch.nn.init as init +import torch.nn.functional as F +from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence +from transformers.modeling_outputs import ModelOutput +from EduNLP.ModelZoo.base_model import BaseModel +import numpy as np +import json +import os +import sys +sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) + + +def zero_state(module, batch_size, device): + # * 2 is for the two directions + return torch.zeros(module.num_layers * 2, batch_size, module.hidden_dim).to(device), \ + torch.zeros(module.num_layers * 2, batch_size, module.hidden_dim).to(device) + + +def unsort(sort_order): + result = [-1] * len(sort_order) + for i, index in enumerate(sort_order): + result[index] = i + return result + +class SentenceEncoder(nn.Module): + def __init__(self, input_size, hidden_dim, num_layers): + super().__init__() + self.input_size = input_size + self.hidden_dim = hidden_dim + self.num_layers = num_layers + + self.lstm = nn.LSTM(input_size=self.input_size, + hidden_size=self.hidden_dim, + num_layers=self.num_layers, + dropout=0, + bidirectional=True) # batch_first=False + + def forward(self, sentence_embs, sentence_lens): + """ + Max-pooling for sentence representations + """ + batch_size = sentence_embs.shape[1] + device = sentence_embs.device + s = zero_state(self, batch_size, device=device) + packed_tensor = pack_padded_sequence(sentence_embs, sentence_lens) + packed_output, _ = self.lstm(packed_tensor, s) + padded_output, lengths = 
pad_packed_sequence(packed_output) # (max sentence len, batch, 256*2) + maxes = torch.zeros(batch_size, padded_output.size(2)).to(device) + for i in range(batch_size): + maxes[i, :] = torch.max(padded_output[:lengths[i], i, :], 0)[0] + + return maxes + +class TrainForPaperSegOutput(ModelOutput): + loss: torch.FloatTensor = None + logits: torch.FloatTensor = None + + +class PaperSegModel(BaseModel): + def __init__(self, embed_dim, hidden_dim, num_layers): + super().__init__() + self.embed_dim = embed_dim + self.hidden_dim = hidden_dim + self.num_layers = num_layers + self.sent_batch_size = None + + self.sentence_encoder = SentenceEncoder(self.embed_dim, + self.hidden_dim, + self.num_layers) + self.lstm = nn.LSTM(input_size=self.hidden_dim*2, + hidden_size=self.hidden_dim, + num_layers=self.num_layers, + dropout=0, + bidirectional=True) # batch_first=False + self.full_connect = nn.Linear(self.hidden_dim*2, 2) # 2 label + # self.reset_parameters() + + self.criterion = torch.nn.CrossEntropyLoss() + + self.config = {k: v for k, v in locals().items() if k not in ["self", "__class__"]} + self.config['architecture'] = 'PaperSegModel' + + def set_sentence_batch(self, sent_batch_size): + self.sent_batch_size = sent_batch_size + + def make_bach_sentences(self, sentences, lens, sent_batch_size): + idx = 0 + batch_sentences, batch_lens = [], [] + while idx < len(sentences): + next_idx = idx + sent_batch_size if idx + sent_batch_size <= len(sentences) else len(sentences) + + max_length = max( lens[idx: next_idx] ) + padded_sentences = [self.pad_sent(s, max_length) for s in sentences[idx: next_idx]] + padded_sentences = torch.stack(padded_sentences, dim=1) + + batch_sentences.append( padded_sentences ) + batch_lens.append( lens[idx: next_idx] ) + idx = next_idx + return batch_sentences, batch_lens + + def reset_parameters(self): + for name, param in self.lstm.named_parameters(): + if name.startswith('weight'): + init.orthogonal_(param) + else: + assert name.startswith('bias') + init.constant_(param, 0.) + for name, param in self.full_connect.named_parameters(): + if name.startswith('weight'): + init.orthogonal_(param) + else: + assert name.startswith('bias') + init.constant_(param, 0.) + + def pad_sent(self, seq, max_length): + s_length = seq.size()[0] + padded = F.pad(seq, (0, 0, 0, max_length - s_length)) # (1, 1, max_length, 300) + return padded + + def pad_document(self, d, max_document_length): + d_length = d.size()[0] + v = d.unsqueeze(0).unsqueeze(0) + padded = F.pad(v, (0, 0, 0, max_document_length - d_length )) # (1, 1, max_length, 300) + shape = padded.size() + return padded.view(shape[2], 1, shape[3]) # (max_length, 1, 300) + + def forward(self, documents=None, tags=None): # batch [documnts]\ + """ + documnts: + [ sentences[word_embeddings[Tensor([300]), ... ], ...], ...] 
+ tags: + torch.Tensor() + """ + batch_size = len(documents) + device = documents[0][0][0].device + document_lens = [] + all_sentences = [] + for document in documents: + all_sentences.extend(document) + document_lens.append(len(document)) + + # sentence 排序 + all_sentence_lens = [s.size()[0] for s in all_sentences] + sort_order = np.argsort(all_sentence_lens)[::-1] + sorted_sentences = [all_sentences[i] for i in sort_order] + sorted_lengths = [s.size()[0] for s in sorted_sentences] + + # sentence 编码 + if self.sent_batch_size is not None: + all_encoded_sentences = [] + all_sent_embs, all_sent_lens = self.make_bach_sentences(sorted_sentences, sorted_lengths, self.sent_batch_size) + for batch_sent_embs, batch_sent_lens in zip(all_sent_embs, all_sent_lens): + batch_encoded_sentences = self.sentence_encoder(batch_sent_embs, batch_sent_lens) + all_encoded_sentences.extend(batch_encoded_sentences) + all_encoded_sentences = torch.stack(all_encoded_sentences, dim=0) + else: + max_length = max(all_sentence_lens) + sorted_padded_sentences = [self.pad_sent(s, max_length) for s in sorted_sentences] + sorted_padded_sentences = torch.stack(sorted_padded_sentences, dim=1) + all_encoded_sentences = self.sentence_encoder(sorted_padded_sentences, sorted_lengths) + unsort_order = torch.LongTensor(unsort(sort_order)).to(device) + unsorted_encodings = all_encoded_sentences.index_select(0, unsort_order) + + index = 0 + encoded_documents = [] + for sentences_count in document_lens: + end_index = index + sentences_count + encoded_documents.append(unsorted_encodings[index: end_index, :]) + index = end_index + + # document 排序 + max_doc_size = np.max(document_lens) + ordered_document_idx = np.argsort(document_lens)[::-1] + ordered_doc_sizes = sorted(document_lens)[::-1] + ordered_documents = [encoded_documents[idx] for idx in ordered_document_idx] + padded_docs = [self.pad_document(d, max_doc_size) for d in ordered_documents] + docs_tensor = torch.cat(padded_docs, 1) + packed_docs = pack_padded_sequence(docs_tensor, ordered_doc_sizes) + sentence_lstm_output, _ = self.lstm(packed_docs, zero_state(self, batch_size=batch_size, device=device)) + padded_x, _ = pad_packed_sequence(sentence_lstm_output) + + doc_outputs = [] + for i, doc_len in enumerate(ordered_doc_sizes): + # doc_outputs.append(padded_x[0:doc_len - 1, i, :]) # -1 to remove last prediction + doc_outputs.append(padded_x[:, i, :]) + + unsorted_doc_outputs = [doc_outputs[i] for i in unsort(ordered_document_idx)] + self.sentence_outputs = torch.cat(unsorted_doc_outputs, 0) + + logits = self.full_connect(self.sentence_outputs) + loss = None + if tags is not None: + loss = self.criterion(logits, tags.view(-1)) + + return TrainForPaperSegOutput( + loss=loss, + logits=logits + ) + + @classmethod + def from_config(cls, config_path, **kwargs): + with open(config_path, "r", encoding="utf-8") as rf: + model_config = json.load(rf) + model_config.update(kwargs) + return cls( + embed_dim=model_config["embed_dim"], + hidden_dim=model_config["hidden_dim"], + num_layers=model_config["num_layers"] + ) + + def save_config(self, config_dir): + config_path = os.path.join(config_dir, "config.json") + with open(config_path, "w", encoding="utf-8") as wf: + json.dump(self.config, wf, ensure_ascii=False, indent=2) \ No newline at end of file diff --git a/examples/downstream/paper_segmentation/paper_segmentation.ipynb b/examples/downstream/paper_segmentation/paper_segmentation.ipynb new file mode 100644 index 00000000..5d885695 --- /dev/null +++ 
b/examples/downstream/paper_segmentation/paper_segmentation.ipynb @@ -0,0 +1,216 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 试卷切分" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/data/qlh/anaconda3/envs/py39/lib/python3.9/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "import os\n", + "import torch\n", + "from torch.utils.data import DataLoader\n", + "from torch.utils.tensorboard import SummaryWriter\n", + "from load_data import VecDataset\n", + "from trainer import MyTrainer\n", + "from model import PaperSegModel\n", + "from utils import get_logger, ROOT_DIR" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 以DisneQNet为例\n", + "class Args:\n", + " subject = \"math\"\n", + " data_path = os.path.join(ROOT_DIR, \"data\")\n", + " checkpoint_dir = os.path.join(ROOT_DIR, \"checkpoint\")\n", + " \n", + " pretrained_model_type=\"disenqnet\"\n", + " pretrained_model_dir=\"/path/to/disenqnet/checkpoint\"\n", + "\n", + " device=\"cpu\"\n", + "\n", + "args = Args()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "args.train_data_path = f\"{args.data_path}/train/{args.subject}/paper_txt_tagged\"\n", + "args.valid_data_path = f\"{args.data_path}/valid/{args.subject}/paper_txt_tagged\"\n", + "args.test_data_path = f\"{args.data_path}/test/{args.subject}/paper_txt_tagged\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# logger\n", + "logfile = f'{args.checkpoint_dir}/train.log'\n", + "logger = get_logger(logfile)\n", + "# tensorboard\n", + "tensorboard_dir = f'{args.checkpoint_dir}/tensorboard'\n", + "os.makedirs(tensorboard_dir, exist_ok=True)\n", + "tensorboard_writer = SummaryWriter(tensorboard_dir)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 加载向量数据集" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_set = VecDataset(\n", + " text_data_dir=args.train_data_path,\n", + " emb_data_path=args.train_data_path.replace(\"paper_txt_tagged\", \"emb.train.pt\"),\n", + " mode=\"train\",\n", + " pretrained_model_type=args.pretrained_model_type,\n", + " pretrained_model_dir=args.pretrained_model_dir,\n", + " device=args.device,\n", + " )\n", + "valid_set = VecDataset(\n", + " text_data_dir=args.valid_data_path,\n", + " emb_data_path=args.valid_data_path.replace(\"paper_txt_tagged\", \"emb.valid.pt\"),\n", + " mode=\"valid\",\n", + " pretrained_model_type=args.pretrained_model_type,\n", + " pretrained_model_dir=args.pretrained_model_dir,\n", + " paper_i2v=train_set.paper_i2v,\n", + " device=args.device,\n", + " )\n", + "test_set = VecDataset(\n", + " text_data_dir=args.test_data_path,\n", + " emb_data_path=args.valid_data_path.replace(\"paper_txt_tagged\", \"emb.valid.pt\"),\n", + " mode=\"test\",\n", + " pretrained_model_type=args.pretrained_model_type,\n", + " pretrained_model_dir=args.pretrained_model_dir,\n", + " paper_i2v=train_set.paper_i2v,\n", + " device=args.device,\n", + " )\n", + "train_loader = 
DataLoader(train_set, batch_size=4, shuffle=True, num_workers=0, collate_fn=train_set.collcate_fn)\n", + "valid_loader = DataLoader(valid_set, batch_size=1, shuffle=False, num_workers=0, collate_fn=valid_set.collcate_fn)\n", + "test_loader = DataLoader(test_set, batch_size=1, shuffle=False, num_workers=0, collate_fn=test_set.collcate_fn)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 加载模型" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = PaperSegModel(\n", + " embed_dim=train_set.embed_dim,\n", + " hidden_dim=256,\n", + " num_layers=2\n", + " )\n", + "model = model.to(args.device)\n", + "logger.info('prepare model have done!')\n", + "# model.save_pretrained(args.checkpoint_dir)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 训练和评估" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)\n", + "trainer = MyTrainer(\n", + " args=args,\n", + " model=model,\n", + " optimizer=optimizer,\n", + " logger=logger,\n", + " tensorboard_writer=tensorboard_writer,\n", + ")\n", + "trainer.train(train_loader, valid_loader)\n", + "logger.info(\"Finish training ... \")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = PaperSegModel.from_pretrained(args.checkpoint_dir).to(args.device)\n", + "trainer = MyTrainer(\n", + " args=args,\n", + " model=model,\n", + " logger=logger,\n", + ")\n", + "trainer.valid(test_loader)\n", + "logger.info(\"Finish testing ... \")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "py39", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/downstream/paper_segmentation/samples/train/math/paper_1.txt b/examples/downstream/paper_segmentation/samples/train/math/paper_1.txt new file mode 100644 index 00000000..559bdd16 --- /dev/null +++ b/examples/downstream/paper_segmentation/samples/train/math/paper_1.txt @@ -0,0 +1,82 @@ +================= +2017年云南省临沧市临翔区民族中学高考数学三模试卷(文科) +选择题 +================= +1. 已知集合,,则 \ ( ) $ +A. B. C. D. +================= +2. 已知复数,则复数的模为 \ ( ) $ +A. B. C. D. +================= +3. 已知点,,向量,若,则为 \ ( ) $ +A. B. C. D. +================= +4. 已知函数满足,且当时,成立,若,,,则,,的大小关系是 \ ( ) $ +A. B. C. D. +================= +5.如图的程序框图的算法思路源于数学名著几何原本中的“辗转相除法”,执行该程序框图图中“”表示除以的余数\ ( ) ab485270b=( +A. B. C. D. +================= +6. 某三棱锥的三视图如图所示,则该三棱锥的表面积为 \ ( ) $ +A. B. C. D. +================= +7. 曲线在点处的切线与轴、轴围成的封闭图形的面积为 \ ( ) $ +A. B. C. D. +================= +8. 已知,则 \ ( ) $ +A. B. C. D. +================= +9. 下列说法正确的个数是 \ ( ) $ +若为奇函数,则; +“在中,若,则”的逆命题是假命题; +“三个数,,成等比数列”是“”的既不充分也不必要条件; +命题“,”的否定是“,”. +A. B. C. D. +================= +10. 将函数的图象向右平移个单位后得到的图象的一个对称轴是 \ ( ) $ +A. B. C. D. +================= +11. 已知等差数列的公差,且,,成等比数列,若,是数列的前项和,则的最小值为 \ ( ) $ +A. B. C. D. +================= +12. 已知焦点为的抛物线上有一点,以为圆心,为半径的圆被轴截得的弦长为,则 \ ( ) $ +A. B. C. D. +================= +填空题 +================= +13. 点是不等式组表示的平面区域内的一动点,且不等式恒成立,则的取值范围是 _____ . +================= +14. 
已知的内角,,所对的边分别为,,,且,,,则的值为 _____ . +================= +15. 已知正四面体的棱长为,为棱的中点,过作其外接球的截面,则截面面积的最小值为 _____ . +================= +16. 设函数的图象与的图象关于直线对称,且,则 _____ . +================= +简答题 +================= +17. 已知数列的前项和 +Ⅰ求数列的通项公式; +Ⅱ若,求数列的前项和. +================= +18. +================= +19. 如图,在直角梯形中,,,,是中点,将沿折起,使得面; +Ⅰ求证:平面平面; +Ⅱ若是的中点求三棱锥的体积. +================= +20. 已知椭圆:的离心率为,过的左焦点的直线:,直线被圆:截得的弦长为. +Ⅰ求椭圆的方程; +Ⅱ设的右焦点为,在圆上是否存在点,满足,若存在,指出有几个这样的点不必求出点的坐标;若不存在,说明理由. +================= +21. 已知函数为常数 +当时,求函数的单调区间; +求时,不等式恒成立,求实数的取值范围. +================= +22. 在直角坐标系中,曲线为参数,,其中,在以为极点,轴正半轴为极轴的极坐标系中,曲线:,曲线. +Ⅰ求与交点的直角坐标系; +Ⅱ若与相交于点,与相交于点,求的最大值. +================= +23. 设函数 +Ⅰ解不等式; +Ⅱ当时,,求实数的取值范围. +================= diff --git a/examples/downstream/paper_segmentation/trainer.py b/examples/downstream/paper_segmentation/trainer.py new file mode 100644 index 00000000..572c5215 --- /dev/null +++ b/examples/downstream/paper_segmentation/trainer.py @@ -0,0 +1,144 @@ +import os +import sys +# sys.path.append(os.path.dirname(__file__)) +import torchmetrics +import math +import numpy as np +import torch +from torchmetrics.classification import BinaryPrecision, BinaryRecall, BinaryF1Score +from torchmetrics import MetricCollection +from utils import get_pk +from model import PaperSegModel + +def my_cuda_tensor(items, device): + for k, v in items.items(): + if isinstance(v, torch.Tensor): + items[k] = v.to(device) + elif isinstance(v, list): + items[k] = my_cuda_document(v, device) + return items + +def my_cuda_document(v, device): + if isinstance(v, torch.Tensor): + v = v.to(device) + elif isinstance(v, list): + v = [my_cuda_document(x, device) for x in v] + return v + +class MyTrainer(object): + def __init__(self, args, model, optimizer=None, scheduler=None, logger=None, tensorboard_writer=None, **kwargs): + self.args = args + self.model = model + + self.optimizer = optimizer + self.scheduler = scheduler + self.logger = logger + self.tensorboard_writer = tensorboard_writer + + self.Metric = MetricCollection([ + BinaryPrecision(), + BinaryRecall(), + BinaryF1Score(), + ]).to(args.device) + + def train(self, train_dataloader, eval_dataloader): + self._global_step = 0 + size = len(train_dataloader.dataset) // train_dataloader.batch_size + + # best_val_metric = + best_valid_loss = 9999 + best_epoch = None + + for epoch in range(self.args.epochs): + self.model.train() + # train + train_loss = 0 + self.logger.info(f"------ epoch {epoch} ------") + for batch in train_dataloader: # batch_size = 1 + batch = my_cuda_tensor(batch, self.args.device) + + # self.model.zero_grad() + self.optimizer.zero_grad() + + outputs = self.model(**batch) + + self.tensorboard_writer.add_scalar("train_loss", outputs.loss.item(), self._global_step) + self._global_step +=1 + train_loss += outputs.loss.item() + + # Backpropagation + loss = outputs.loss + loss.backward() + self.optimizer.step() + + train_loss /= size + self.logger.info(f'epoch {epoch:3} done, loss = {train_loss}') + # validate + valid_loss, valid_pk, valid_metrics = self.valid(eval_dataloader) + + for k, v in valid_metrics.items(): + self.tensorboard_writer.add_scalar(f"Metric/{k}", v, epoch) + self.tensorboard_writer.add_scalar(f"Metric/pk", valid_pk, epoch) + self.tensorboard_writer.add_scalars(f"EpochLoss", {"TrainLoss": train_loss, "ValidLoss": valid_loss}, epoch) + + # store best model + # if valid_metrics["BinaryF1Score"] > best_val_metric: + # best_val_metric = valid_metrics["BinaryF1Score"] + if valid_loss < best_valid_loss: + best_valid_loss = valid_loss + + try: + 
self.model.save_pretrained(self.args.checkpoint_dir) + except Exception: + self.logger.info("[Warning] Model.save_pretrained Error !!!") + + with open(f'{self.args.checkpoint_dir}/best_model.bin', mode='wb') as f: + torch.save(self.model.state_dict(), f) + with open(f'{self.args.checkpoint_dir}/best_model.obj.bin', mode='wb') as f: + torch.save(self.model, f) + + best_epoch = epoch + self.logger.info(f"saving best model at epoch {epoch}...") + + self.logger.info(f"Finish training, best model is at epoch {best_epoch}...") + + def valid(self, valid_dataloader): + assert valid_dataloader.batch_size == 1 + self.model.eval() + size = len(valid_dataloader.dataset) // valid_dataloader.batch_size + valid_pk_list = [] + valid_loss = 0 + for batch in valid_dataloader: + batch = my_cuda_tensor(batch, self.args.device) + if isinstance(self.model, PaperSegModel): + outputs = self.model(**batch) + flat_pred_tags = torch.argmax(torch.softmax(outputs.logits, 1), 1) + valid_loss += outputs.loss.item() + else: + outputs = self.model(batch["documents"]) + flat_pred_tags = torch.argmax(torch.softmax(outputs, 1), 1) + + criterion = torch.nn.CrossEntropyLoss() + loss = criterion(outputs, batch["tags"].view(-1)) + valid_loss += loss.item() + + # compute classification metrics + self.Metric.update(flat_pred_tags, batch["tags"].view(-1)) + # compute the Pk segmentation metric; Pk is evaluated one document at a time, hence batch_size == 1 + tag = batch["tags"].detach().cpu().numpy() + pred_tag = flat_pred_tags.detach().cpu().numpy() + document = batch["documents"][0] + k = max(math.ceil(len(document)/2), 2) + pk = get_pk(pred_tag, tag, k) + valid_pk_list.append(pk) + + valid_loss /= size + valid_pk = np.mean(valid_pk_list) + valid_metrics = {k: v.item() for k, v in self.Metric.compute().items()} + self.logger.info(f"Validate: valid_loss= {valid_loss}, valid_pk= {valid_pk}, Validation metric: {valid_metrics}") + self.Metric.reset() + + return valid_loss, valid_pk, valid_metrics \ No newline at end of file diff --git a/examples/downstream/paper_segmentation/utils.py b/examples/downstream/paper_segmentation/utils.py new file mode 100644 index 00000000..000466d5 --- /dev/null +++ b/examples/downstream/paper_segmentation/utils.py @@ -0,0 +1,39 @@ + + +import os +import logging + +ROOT_DIR = os.path.dirname(os.path.dirname(__file__)) + +def get_logger(logfile): + os.makedirs(os.path.dirname(logfile), exist_ok=True) + + logger = logging.getLogger(name="PaperSeg") + logger.setLevel(logging.INFO) + + handler = logging.FileHandler(filename=logfile, encoding="utf-8", mode="w") + handler.setLevel(logging.INFO) + formatter = logging.Formatter(fmt="%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S") + handler.setFormatter(formatter) + + consolehandler = logging.StreamHandler() + consolehandler.setFormatter(formatter) + + logger.addHandler(handler) + logger.addHandler(consolehandler) # log to file and print to console + return logger + + +def get_pk(y_pred, y, k): + """Count the width-k sliding windows in which the predicted and reference boundary counts disagree (an unnormalized Pk statistic).""" + tag_num = len(y) + count = 0 + for i in range(0, tag_num-k): + seg_count_y_pred = 0 + seg_count_y = 0 + for j in range(i, i+k): + seg_count_y_pred += y_pred[j] + seg_count_y += y[j] + if seg_count_y_pred != seg_count_y: + count += 1 + return count
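To make the Pk statistic concrete, here is a small worked example against `get_pk`, with toy tags and the window size chosen the same way as in `MyTrainer.valid`:

from utils import get_pk
import math

# 1 marks the first line of a segment; the prediction places the second
# boundary one line too late.
y_true = [1, 0, 0, 1, 0, 0]
y_pred = [1, 0, 0, 0, 1, 0]
k = max(math.ceil(len(y_true) / 2), 2)  # k = 3, mirroring the trainer
print(get_pk(y_pred, y_true, k))        # 1: only the window starting at line 1 disagrees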
diff --git a/examples/downstream/quality_evaluation/quality_evaluation.ipynb b/examples/downstream/quality_evaluation/quality_evaluation.ipynb new file mode 100644 index 00000000..7b199588 --- /dev/null +++ b/examples/downstream/quality_evaluation/quality_evaluation.ipynb @@ -0,0 +1,292 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Downstream task demo: quality evaluation" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Data preparation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from torch.utils.data import Dataset\n", + "from transformers import BertTokenizer as HfBertTokenizer\n", + "import torch\n", + "from tqdm import tqdm\n", + "import pandas as pd\n", + "\n", + "class IdsDataset(Dataset):\n", + " def __init__(self, \n", + " mode,\n", + " data_path=\"../data\",\n", + " tokenizer=None,\n", + " ):\n", + " self.mode = mode\n", + " self.data = pd.read_csv(data_path)\n", + " self.data['answers'] = self.data['answers'].fillna('')\n", + " self.data = self.data.to_dict(orient=\"records\") # [:20]\n", + "\n", + " self.tokenizer = tokenizer\n", + " self.preprocess()\n", + "\n", + " def preprocess(self):\n", + " for item in tqdm(self.data):\n", + " return_tensors = None if isinstance(self.tokenizer, HfBertTokenizer) else False\n", + "\n", + " contexts_encodings = self.tokenizer(item['contexts'], truncation=True, padding=True, max_length=512, return_tensors=return_tensors)\n", + " answers_encodings = self.tokenizer(item['answers'], truncation=True, padding=True, max_length=512, return_tensors=return_tensors)\n", + " item[\"answers_encodings\"] = answers_encodings\n", + " item[\"contexts_encodings\"] = contexts_encodings\n", + "\n", + " def __getitem__(self, index):\n", + " item = self.data[index]\n", + " return item\n", + "\n", + " def __len__(self):\n", + " return len(self.data)\n", + " \n", + " def collate_fn(self, batch_data):\n", + " first = batch_data[0]\n", + " batch = {\n", + " k: [item[k] for item in batch_data] for k in first.keys()\n", + " }\n", + " batch[\"answers_encodings\"] = self.tokenizer.batch_pad(batch[\"answers_encodings\"], return_tensors=True)\n", + " batch[\"contexts_encodings\"] = self.tokenizer.batch_pad(batch[\"contexts_encodings\"], return_tensors=True)\n", + " batch[\"score\"] = torch.as_tensor(batch[\"score\"])\n", + " batch[\"label\"] = torch.as_tensor(batch[\"label\"])\n", + " return batch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Using BERT as an example\n", + "from EduNLP.Pretrain import BertTokenizer\n", + "from torch.utils.data import DataLoader\n", + "\n", + "tokenizer = BertTokenizer.from_pretrained(path=\"/path/to/bert/checkpoint\")\n", + "trainData = IdsDataset(mode='train', data_path=\"/path/to/train.csv\", tokenizer=tokenizer)\n", + "validData = IdsDataset(mode='valid', data_path=\"/path/to/valid.csv\", tokenizer=tokenizer)\n", + "testData = IdsDataset(mode='test', data_path=\"/path/to/test.csv\", tokenizer=tokenizer)\n", + "\n", + "train_Dataloader = DataLoader(trainData, shuffle=True, num_workers=0, pin_memory=True, collate_fn=trainData.collate_fn)\n", + "valid_Dataloader = DataLoader(validData, shuffle=True, num_workers=0, pin_memory=True, collate_fn=validData.collate_fn)\n", + "test_Dataloader = DataLoader(testData, shuffle=True, num_workers=0, pin_memory=True, collate_fn=testData.collate_fn)" + ] + },
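With the loaders built, a quick hedged sanity check is to pull one collated batch and inspect its shapes; the `input_ids` key below is an assumption about what EduNLP's `batch_pad` returns, so adjust it to the actual keys if they differ:

batch = next(iter(train_Dataloader))
print(batch.keys())
print(batch["contexts_encodings"]["input_ids"].shape)  # expected: [batch_size, seq_len]
print(batch["score"].shape, batch["label"].shape)      # expected: [batch_size], [batch_size]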
+ { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Quality evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from torch import nn\n", + "from transformers import BertModel\n", + "from transformers.modeling_outputs import ModelOutput\n", + "from EduNLP.ModelZoo.base_model import BaseModel\n", + "import os\n", + "import json\n", + "\n", + "\n", + "class Global_Layer(nn.Module):\n", + " \"\"\"\n", + " Two-layer MLP prediction head\n", + " \"\"\"\n", + " def __init__(self, input_unit, output_unit, hidden_size, dropout_rate=0.5):\n", + " super().__init__()\n", + " self.net = nn.Sequential(\n", + " nn.Linear(input_unit, hidden_size),\n", + " nn.ReLU(),\n", + " nn.Dropout(dropout_rate),\n", + " nn.Linear(hidden_size, output_unit),\n", + " # nn.Softmax(dim=1)\n", + " )\n", + " \n", + " def forward(self, x):\n", + " x = self.net(x)\n", + " return x\n", + " \n", + " \n", + "class QualityLoss(nn.Module):\n", + " def __init__(self, mode='train'):\n", + " super(QualityLoss, self).__init__()\n", + " if mode == 'train':\n", + " self.classify_loss_fn = nn.CrossEntropyLoss()\n", + " self.logits_loss_fn = nn.MSELoss()\n", + " else:\n", + " self.classify_loss_fn = nn.CrossEntropyLoss(reduction='sum')\n", + " self.logits_loss_fn = nn.MSELoss(reduction='sum')\n", + "\n", + " def forward(self, pred_score, pred_label, score, label, lamb=0.5):\n", + " # weighted sum of the regression and classification losses\n", + " score_loss = self.logits_loss_fn(pred_score, score.float())\n", + " label_loss = self.classify_loss_fn(pred_label, label)\n", + " losses = score_loss * lamb + label_loss * (1 - lamb)\n", + " return losses\n", + " \n", + " \n", + "class TrainForQualityOutput(ModelOutput):\n", + " loss: torch.FloatTensor = None\n", + " score_logits: torch.FloatTensor = None\n", + " label_logits: torch.FloatTensor = None\n", + "\n", + "\n", + "class QualityModel(BaseModel):\n", + " def __init__(self, pretrained_model_type=\"bert\", pretrained_model_dir=None, emb_mode=\"index\", hidden_size=None, num_labels=3, dropout_rate=0.5):\n", + " super().__init__()\n", + " self.pretrained_model_type = pretrained_model_type\n", + " self.emb_mode = emb_mode\n", + " self.num_labels = num_labels\n", + " if emb_mode == \"index\":\n", + " assert hidden_size is None\n", + " self.bert = BertModel.from_pretrained(pretrained_model_dir)\n", + " self.hidden_size = self.bert.config.hidden_size # 768\n", + " else: # vector\n", + " assert hidden_size is not None\n", + " self.hidden_size = hidden_size\n", + "\n", + " self.score_decoder = Global_Layer(input_unit=self.hidden_size*2,\n", + " output_unit=1,\n", + " hidden_size=self.hidden_size,\n", + " dropout_rate=dropout_rate)\n", + " self.label_decoder = Global_Layer(input_unit=self.hidden_size*2,\n", + " output_unit=num_labels,\n", + " hidden_size=self.hidden_size,\n", + " dropout_rate=dropout_rate) \n", + " self.criterion = QualityLoss()\n", + "\n", + " self.config = {k: v for k, v in locals().items() if k not in [\"self\", \"__class__\", \"bert\"]}\n", + " self.config['architecture'] = 'QualityModel'\n", + "\n", + " def forward(self,\n", + " context_vectors=None,\n", + " answer_vectors=None,\n", + " contexts_encodings=None,\n", + " answers_encodings=None,\n", + " score=None,\n", + " label=None,\n", + " **kwargs,\n", + " ):\n", + " \"\"\"\n", + " batch_sentences : [batch_size, seq]\n", + " \"\"\"\n", + " if self.emb_mode == \"index\":\n", + " if self.pretrained_model_type in [\"bert\", \"roberta\"]:\n", + " contexts_encoder_out = self.bert(**contexts_encodings)\n", + " answers_encoder_out = self.bert(**answers_encodings)\n", + " context_vectors = 
contexts_encoder_out[1] # [batch_size, hidden_size]\n", + " answer_vectors = answers_encoder_out[1] # [batch_size, hidden_size]\n", + "\n", + " elif self.pretrained_model_type == \"jiuzhang\":\n", + " contexts_encoder_out = self.bert(\n", + " input_ids=contexts_encodings[\"input_ids\"],\n", + " attention_mask=contexts_encodings[\"attention_mask\"],\n", + " )\n", + " answers_encoder_out = self.bert(\n", + " input_ids=answers_encodings[\"input_ids\"],\n", + " attention_mask=answers_encodings[\"attention_mask\"],\n", + " )\n", + " context_vectors = contexts_encoder_out[\"last_hidden_state\"][:, 0, :]\n", + " answer_vectors = answers_encoder_out[\"last_hidden_state\"][:, 0, :]\n", + " \n", + " elif self.pretrained_model_type == \"disenq\":\n", + " contexts_encoder_out = self.bert(**contexts_encodings)\n", + " answers_encoder_out = self.bert(**answers_encodings)\n", + " context_vectors = contexts_encoder_out[1]\n", + " answer_vectors = answers_encoder_out[1]\n", + " else:\n", + " assert context_vectors is not None and answer_vectors is not None\n", + " pooler_state = torch.cat([context_vectors, answer_vectors], dim=-1)\n", + " score_logits = self.score_decoder(pooler_state).squeeze(-1)\n", + " label_logits = self.label_decoder(pooler_state)\n", + " \n", + " loss = None\n", + " if score is not None and label is not None:\n", + " loss = self.criterion(score_logits, label_logits, score, label, lamb=0.5)\n", + "\n", + " return TrainForQualityOutput(\n", + " loss=loss,\n", + " score_logits=score_logits,\n", + " label_logits=label_logits,\n", + " )\n", + " \n", + " @classmethod\n", + " def from_config(cls, config_path, **kwargs):\n", + " with open(config_path, \"r\", encoding=\"utf-8\") as rf:\n", + " model_config = json.load(rf)\n", + " model_config.update(kwargs)\n", + " return cls(\n", + " pretrained_model_dir=model_config[\"pretrained_model_dir\"],\n", + " emb_mode=model_config[\"emb_mode\"],\n", + " hidden_size=model_config[\"hidden_size\"],\n", + " num_labels=model_config[\"num_labels\"],\n", + " dropout_rate=model_config[\"dropout_rate\"],\n", + " pretrained_model_type=model_config[\"pretrained_model_type\"]\n", + " )\n", + " \n", + " def save_config(self, config_dir):\n", + " config_path = os.path.join(config_dir, \"config.json\")\n", + " with open(config_path, \"w\", encoding=\"utf-8\") as wf:\n", + " json.dump(self.config, wf, ensure_ascii=False, indent=2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from train import MyTrainer\n", + "from torch.utils.tensorboard import SummaryWriter\n", + "import logging\n", + "\n", + "# Load the trained model and assemble the trainer\n", + "checkpoint_dir = \"your/checkpoint_dir\"\n", + "device = \"cuda:0\"\n", + "\n", + "class Args: # hyperparameters consumed by MyTrainer (example values)\n", + " epochs = 10\n", + " grad_accum = 1\n", + " device = device\n", + " checkpoint_dir = checkpoint_dir\n", + "\n", + "model = QualityModel.from_pretrained(checkpoint_dir).to(device)\n", + "optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)\n", + "logging.basicConfig(level=logging.INFO)\n", + "trainer = MyTrainer(\n", + " args=Args(),\n", + " model=model,\n", + " optimizer=optimizer,\n", + " logger=logging.getLogger(\"quality\"),\n", + " tensorboard_writer=SummaryWriter(f\"{checkpoint_dir}/tensorboard\"),\n", + ")\n", + "trainer.train(train_Dataloader, valid_Dataloader)\n", + "trainer.valid(valid_Dataloader)\n", + "trainer.valid(test_Dataloader)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "edunlp", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.7.16" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +}
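The model above can be smoke-tested without any pretrained encoder by using `emb_mode="vector"`, where precomputed sentence vectors are fed directly to the two decoder heads. A hedged sketch — all sizes and targets below are arbitrary illustration, and `QualityModel` is assumed to be in scope from the cell above:

import torch

model = QualityModel(emb_mode="vector", hidden_size=16)
out = model(
    context_vectors=torch.randn(4, 16),      # fake question-stem vectors
    answer_vectors=torch.randn(4, 16),       # fake answer vectors
    score=torch.rand(4),                     # regression target
    label=torch.randint(0, 3, (4,)),         # 3-way quality label
)
print(out.loss, out.score_logits.shape, out.label_logits.shape)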
diff --git a/examples/downstream/quality_evaluation/train.py b/examples/downstream/quality_evaluation/train.py new file mode 100644 index 00000000..0075caf4 --- /dev/null +++ b/examples/downstream/quality_evaluation/train.py @@ -0,0 +1,129 @@ +import os +import sys +sys.path.append(os.path.dirname(__file__)) +import torch +from tqdm import tqdm +from torchmetrics import MetricCollection, Accuracy, Precision, Recall, MeanSquaredError, MeanSquaredLogError, R2Score, PearsonCorrCoef + + +def my_cuda_tensor(items, device): + for k, v in items.items(): + if isinstance(v, torch.Tensor): + items[k] = v.to(device) + elif isinstance(v, dict): + items[k] = my_cuda_tensor(v, device) + + return items + + +class MyTrainer(object): + def __init__(self, args, model, optimizer=None, scheduler=None, logger=None, tensorboard_writer=None, **kwargs): + self.args = args + self.model = model + self.optimizer = optimizer + self.scheduler = scheduler + self.logger = logger + self.tensorboard_writer = tensorboard_writer + + self.classify_metric_collection = MetricCollection([ + Accuracy(task="multiclass", num_classes=3, average="micro"), + # Precision(task="multiclass", num_classes=3, average="micro"), + # Recall(task="multiclass", num_classes=3, average="micro") + ]).to(args.device) + self.logits_metric_collection = MetricCollection([ + MeanSquaredError(), + # MeanSquaredLogError(), + R2Score(), + PearsonCorrCoef() + ]).to(args.device) + + def train(self, train_dataloader, valid_dataloader): + self._global_step = 0 + size = len(train_dataloader.dataset) // train_dataloader.batch_size + + best_valid_loss = float("inf") + best_epoch = None + for epoch in tqdm(range(self.args.epochs)): + self.model.train() + # train + train_loss = 0 + self.logger.info(f"------ epoch {epoch} ------") + for idx, batch in enumerate(train_dataloader): + batch = my_cuda_tensor(batch, self.args.device) + # Compute prediction error + outputs = self.model(**batch) + score_logits, label_logits = outputs.score_logits, outputs.label_logits + + pred_label = torch.argmax(label_logits, dim=1) + self.classify_metric_collection.update(pred_label, batch["label"]) + self.logits_metric_collection.update(score_logits, batch["score"].float()) + + self.tensorboard_writer.add_scalar("train_loss", outputs.loss.item(), self._global_step) + self._global_step += 1 + train_loss += outputs.loss.item() + + # Backpropagation + loss = outputs.loss / self.args.grad_accum + loss.backward() + # gradient accumulation: only step the optimizer every grad_accum batches + if (idx+1) % self.args.grad_accum == 0: + self.optimizer.step() + self.optimizer.zero_grad() + train_loss /= size + + # train_metric + total_train_classify_metric = {k: v.item() for k, v in self.classify_metric_collection.compute().items()} + total_train_logits_metric = {k: v.item() for k, v in self.logits_metric_collection.compute().items()} + self.logger.info(f"train metric for epoch: {total_train_classify_metric},{total_train_logits_metric}") + self.classify_metric_collection.reset() + self.logits_metric_collection.reset() + + valid_loss, total_valid_classify_metric, total_valid_logits_metric = self.valid(valid_dataloader) + # log + for k, v in total_valid_classify_metric.items(): + self.tensorboard_writer.add_scalar(f"classify_metric/{k}", v, epoch) + for k, v in total_valid_logits_metric.items(): + self.tensorboard_writer.add_scalar(f"logits_metric/{k}", v, epoch) + self.tensorboard_writer.add_scalars("EpochLoss", {"TrainLoss": train_loss, "ValidLoss": valid_loss}, epoch) + + if valid_loss < best_valid_loss: + self.model.save_pretrained(self.args.checkpoint_dir) + best_valid_loss = valid_loss + best_epoch = epoch + self.logger.info(f"saving best model at epoch {epoch}...") + + self.logger.info(f"Finish training, best model is at 
epoch {best_epoch}...") + + def valid(self, valid_dataloader): + self.model.eval() + size = len(valid_dataloader.dataset) // valid_dataloader.batch_size + valid_loss = 0 + with torch.no_grad(): + for batch in valid_dataloader: + batch = my_cuda_tensor(batch, self.args.device) + # Compute prediction error + outputs = self.model(**batch) + score_logits, label_logits = outputs.score_logits, outputs.label_logits + valid_loss += outputs.loss.item() + + pred_label = torch.argmax(label_logits, dim=1) + self.classify_metric_collection.update(pred_label, batch["label"]) + self.logits_metric_collection.update(score_logits, batch["score"].float()) + + valid_loss /= size + + total_valid_classify_metric = {k: v.item() for k, v in self.classify_metric_collection.compute().items()} + total_valid_logits_metric = {k: v.item() for k, v in self.logits_metric_collection.compute().items()} + self.logger.info(f"Validation metric: {total_valid_classify_metric},{total_valid_logits_metric}") + self.classify_metric_collection.reset() + self.logits_metric_collection.reset() + + return valid_loss, total_valid_classify_metric, total_valid_logits_metric
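The `grad_accum` logic in `train` divides each batch loss by `grad_accum` and steps the optimizer only every `grad_accum` batches, so the effective batch size is `per_device_batch_size * grad_accum`. A self-contained sketch of the same pattern with a toy model (all values arbitrary):

import torch

model = torch.nn.Linear(8, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
grad_accum = 4                               # accumulate over 4 mini-batches

optimizer.zero_grad()
for idx in range(16):
    x, y = torch.randn(2, 8), torch.randn(2, 1)
    loss = torch.nn.functional.mse_loss(model(x), y)
    (loss / grad_accum).backward()           # scale so gradients average over the window
    if (idx + 1) % grad_accum == 0:          # effective batch size: 2 * 4 = 8
        optimizer.step()
        optimizer.zero_grad()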
diff --git a/examples/downstream/similarity_prediction/similarity_prediction.ipynb b/examples/downstream/similarity_prediction/similarity_prediction.ipynb new file mode 100644 index 00000000..ef2e1f93 --- /dev/null +++ b/examples/downstream/similarity_prediction/similarity_prediction.ipynb @@ -0,0 +1,298 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Downstream task demo: similarity prediction" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import numpy as np\n", + "import pandas as pd\n", + "import pickle\n", + "import faiss\n", + "import time\n", + "from tqdm import tqdm\n", + "from scipy.stats import pearsonr, spearmanr\n", + "from sklearn.metrics.pairwise import paired_cosine_distances" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "os.environ['CUDA_VISIBLE_DEVICES'] = '0'" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Obtain question representations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the question data for the similarity task; format: one question text per line\n", + "with open('path/to/your/data/math.tsv', 'r') as f:\n", + " lines = f.readlines()\n", + "ques = []\n", + "for line in lines:\n", + " ques.append(line.strip('\\n'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Using DisenQNet as an example\n", + "\n", + "from EduNLP.Pretrain import DisenQTokenizer\n", + "from EduNLP.Vector import DisenQModel\n", + "\n", + "path = \"/path/to/disenqnet/checkpoint\"\n", + "tokenizer = DisenQTokenizer.from_pretrained(path)\n", + "t2v = DisenQModel(path, device=\"cuda\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ques_emb = []\n", + "with np.errstate(all='raise'):\n", + " for i, text in enumerate(tqdm(ques)):\n", + " encodes = tokenizer([text], key=lambda x: x)\n", + " emb = t2v.infer_vector(encodes, key=lambda x: x[\"stem\"], vector_type=\"k\").detach().cpu().reshape(-1).numpy()\n", + " ques_emb.append(emb)\n", + "ques_emb = np.array(ques_emb)\n", + "ques_emb.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "os.makedirs('./cache', exist_ok=True)\n", + "with open('./cache/disenq_300_embs.pkl', 'wb') as f:\n", + " pickle.dump(ques_emb, f)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Similarity prediction: ranking" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the annotated similarity data\n", + "sim = pd.read_csv('/path/to/your/data/similarity.csv')\n", + "test_id1 = []\n", + "test_id2 = []\n", + "labels = []\n", + "for i, line in sim.iterrows():\n", + " id1, id2, _, _, _, votes = line\n", + " try:\n", + " idx1 = id1-1\n", + " idx2 = id2-1\n", + " score = sum([int(x) for x in votes.split('|')]) / 3\n", + " test_id1.append(idx1)\n", + " test_id2.append(idx2)\n", + " labels.append(score)\n", + " except Exception:\n", + " print(id1, id2, votes)\n", + "labels = np.array(labels)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def compute_ranking_metrics(ques_emb):\n", + " ques_emb1 = ques_emb[test_id1]\n", + " ques_emb2 = ques_emb[test_id2]\n", + " cosine_scores = 1 - (paired_cosine_distances(ques_emb1, ques_emb2))\n", + " pearson_cosine, _ = pearsonr(labels, cosine_scores)\n", + " spearman_cosine, _ = spearmanr(labels, cosine_scores)\n", + " print(f'Pearson: {pearson_cosine:.4f}, Spearman: {spearman_cosine:.4f}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the question embeddings saved in Step 1\n", + "with open('./cache/disenq_300_embs.pkl', 'rb') as f:\n", + " embs = pickle.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "compute_ranking_metrics(embs)" + ] + },
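Step 3 below builds an approximate `IVF512,PQ15` index for speed; when validating it, an exact cosine baseline is useful for comparison. `faiss.IndexFlatIP` over L2-normalized vectors returns exact inner-product (i.e., cosine) neighbors — a hedged sketch, with a toy number of sample queries:

import faiss
import numpy as np

norm = embs / (np.linalg.norm(embs, ord=2, axis=-1, keepdims=True) + 1e-12)
norm = norm.astype('float32')

exact = faiss.IndexFlatIP(norm.shape[-1])  # exact inner product == cosine on unit vectors
exact.add(norm)
scores, ids = exact.search(norm[:5], 10)   # exact top-10 neighbors for 5 sample queries
print(ids)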
+ { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Similarity prediction: recall" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the question embeddings saved in Step 1\n", + "with open('./cache/disenq_300_embs.pkl', 'rb') as f:\n", + " embs = pickle.load(f)\n", + "\n", + "norm_embs = embs / (np.linalg.norm(embs, ord=2, axis=-1, keepdims=True) + 1e-12)\n", + "norm_embs = norm_embs.astype('float32')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dim = norm_embs.shape[-1]\n", + "param = 'IVF512,PQ15'\n", + "measure = faiss.METRIC_L2\n", + "index = faiss.index_factory(dim, param, measure)\n", + "index.train(norm_embs)\n", + "index.add(norm_embs)\n", + "os.makedirs('./index', exist_ok=True)\n", + "faiss.write_index(index, './index/disenq.index')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the data and reorganize it for the recall task\n", + "sim = pd.read_csv('/path/to/your/data/similarity.csv')\n", + "query = {}\n", + "for i, line in sim.iterrows():\n", + " id1, id2, _, _, _, sim = line\n", + " id1 = int(id1)\n", + " id2 = int(id2)\n", + " score = sum([int(x) for x in sim.split('|')]) / 3\n", + " if score >= 5:\n", + " if id1 in query:\n", + " query[id1].append((id2, score))\n", + " else:\n", + " query[id1] = [(id2, score)]\n", + " if id2 in query:\n", + " query[id2].append((id1, score))\n", + " else:\n", + " query[id2] = [(id1, score)]\n", + "for k in query:\n", + " query[k].sort(key=lambda x: x[1], reverse=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def compute_recall_metrics(query, result, p=100):\n", + " total_hr, total_ndcg = 0, 0\n", + " for k, v in query.items():\n", + " res = result[k][:p]\n", + " hit, dcg, idcg = 0, 0, 0\n", + " for i, (label, score) in enumerate(v):\n", + " idcg += (2 ** score - 1) / np.log2(i + 2)\n", + " if label in res:\n", + " hit += 1\n", + " dcg += (2 ** score - 1) / np.log2(res.index(label) + 2)\n", + " total_hr += (hit / len(v))\n", + " total_ndcg += (dcg / idcg)\n", + " print(f'HR@{p}: {total_hr / len(query):.4f}, NDCG@{p}: {total_ndcg / len(query):.4f}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "avg_time = 0\n", + "for _ in range(5):\n", + " result = {}\n", + " total_time = 0\n", + " for k in tqdm(query):\n", + " idx = k-1\n", + " start = time.time()\n", + " _, idxs = index.search(norm_embs[idx].reshape(1, -1), 101)\n", + " end = time.time()\n", + " total_time += (end - start) * 1000\n", + " res_ids = idxs.tolist()[0]\n", + " if idx in res_ids:\n", + " res_ids.remove(idx)\n", + " result[k] = [i+1 for i in res_ids[:100]]\n", + " print('Average time: ', total_time / len(query))\n", + " avg_time += total_time / len(query)\n", + " compute_recall_metrics(query, result, 10)\n", + " compute_recall_metrics(query, result, 20)\n", + " compute_recall_metrics(query, result, 30)\n", + " compute_recall_metrics(query, result, 50)\n", + " compute_recall_metrics(query, result, 100)\n", + "print(avg_time / 5)" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +}
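To see what `compute_recall_metrics` measures, here is the arithmetic for a single toy query with two relevant items, using the same gain/discount formula `(2**score - 1) / log2(rank + 2)` (numbers chosen only for illustration):

import numpy as np

retrieved = [3, 7, 9, 12, 21]                # top-5 ids returned for this query
relevant = [(7, 6.0), (12, 5.0)]             # (id, similarity score), sorted by score

hit = sum(1 for label, _ in relevant if label in retrieved)
idcg = sum((2 ** s - 1) / np.log2(i + 2) for i, (_, s) in enumerate(relevant))
dcg = sum((2 ** s - 1) / np.log2(retrieved.index(l) + 2)
          for l, s in relevant if l in retrieved)
print(hit / len(relevant), dcg / idcg)       # HR = 1.0, NDCG ~= 0.64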
diff --git a/examples/pretrain/quesnet.ipynb b/examples/pretrain/quesnet.ipynb index c5469c50..2b0a405f 100644 --- a/examples/pretrain/quesnet.ipynb +++ b/examples/pretrain/quesnet.ipynb @@ -138,8 +138,15 @@ "}\n", "\n", "# 当前仅支持linux下训练\n", - "# pretrain_quesnet(os.path.join(os.path.abspath(data_dir), 'quesnet_data.json'),\n", - "# output_dir, tokenizer, True, train_params)" + "pretrain_quesnet(\n", + " path=os.path.join(os.path.abspath(data_dir),'quesnet_data.json'),\n", + " output_dir=output_dir,\n", + " tokenizer=tokenizer,\n", + " img_dir=None,\n", + " save_embs=True,\n", + " load_embs=False,\n", + " train_params=train_params\n", + ")" ] }, { diff --git a/examples/t2v/t2v_elmo.ipynb b/examples/t2v/t2v_elmo.ipynb index c754b4d1..561a7128 100644 --- a/examples/t2v/t2v_elmo.ipynb +++ b/examples/t2v/t2v_elmo.ipynb @@ -12,16 +12,7 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "d:\\MySoftwares\\Anaconda\\envs\\data\\lib\\site-packages\\gensim\\similarities\\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.\n", - " warnings.warn(msg)\n" - ] - } - ], + "outputs": [], "source": [ "from EduNLP.Pretrain import ElmoTokenizer\n", "from EduNLP.Vector import T2V, ElmoModel\n", @@ -38,7 +29,7 @@ "BASE_DIR = \"../..\"\n", "\n", "data_dir = f\"{BASE_DIR}/static/test_data\"\n", - "output_dir = f\"{BASE_DIR}/examples/test_model/elmo\"" + "output_dir = f\"{BASE_DIR}/examples/test_model/elmo/elmo_768\"" ] }, { @@ -50,23 +41,29 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "([527, 231, 3, 13, 26, 79, 159, 527, 6, 33, 10, 13, 34, 133, 79, 168, 4], 17)\n", + "{'seq_idx': tensor([ 804, 19, 6, 69, 26, 66, 1381, 804, 9, 254, 27, 69,\n", + " 70, 246, 66, 239, 7]), 'seq_len': tensor(17)}\n", "\n", - "([[527, 231, 3, 13, 26, 79, 159, 527, 6, 33, 10, 13, 34, 133, 79, 168, 4], [7, 104, 13, 15, 16, 17, 18, 34, 79, 15, 16, 17, 18, 19, 105, 13, 10, 23, 106, 107, 104, 108, 109, 110, 111]], [17, 25])\n", + "{'seq_idx': tensor([[ 804, 19, 6, 69, 26, 66, 1381, 804, 9, 254, 27, 69,\n", + " 70, 246, 66, 239, 7, 0, 0, 0, 0, 0, 0, 0,\n", + " 0],\n", + " [ 64, 477, 69, 96, 81, 55, 82, 70, 66, 96, 81, 55,\n", + " 82, 71, 467, 69, 27, 78, 844, 77, 477, 1312, 865, 519,\n", + " 118]]), 'seq_len': tensor([17, 25])}\n", "\n" ] } ], "source": [ "# 加载之前训练的模型tokenizer\n", - "tokenizer = ElmoTokenizer(os.path.join(output_dir, \"vocab.json\"))\n", + "tokenizer = ElmoTokenizer(os.path.join(output_dir, \"vocab.txt\"))\n", "\n", "# 对题目文本进行令牌化\n", "items = [\n", @@ -83,7 +80,8 @@ "print(tokenizer(items, freeze_vocab=True))\n", "print()\n", "\n", - "token_items, lengths = tokenizer(items, pad_to_max_length=True)" + "token_items = tokenizer(items, pad_to_max_length=True)\n", + "lengths = token_items[\"seq_len\"]" ] }, { @@ -95,17 +93,143 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[EduNLP, INFO] All the weights of ElmoLM were initialized from the model checkpoint at ../../examples/test_model/elmo/elmo_768.\n", + "If your task is similar to the task the model of the checkpoint was trained on, you can already use ElmoLM for predictions without further training.\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "torch.Size([2, 512])\n", + "ElmoLMOutput([('pred_forward', tensor([[[-307.3449, -307.3120, -307.3644, ..., -310.1035, -307.8653,\n", + "
[-278.2352, -278.3191, -278.3070, ..., -278.2227, -277.5887,\n", + " -277.1101],\n", + " [-363.3187, -363.3951, -363.4167, ..., -365.0335, -361.3292,\n", + " -363.0343],\n", + " ...,\n", + " [-283.5177, -283.5760, -283.6111, ..., -284.5733, -282.6731,\n", + " -283.2103],\n", + " [-248.1853, -248.3669, -248.3075, ..., -248.6257, -247.6015,\n", + " -247.8452],\n", + " [-241.4586, -241.4421, -241.4153, ..., -240.6708, -240.4943,\n", + " -240.6182]],\n", "\n", - "torch.Size([2, 512])\n", - "torch.Size([2, 25, 512])\n", + " [[-334.8899, -334.8294, -334.9643, ..., -334.1731, -335.4581,\n", + " -334.7304],\n", + " [-355.3142, -355.3451, -355.4356, ..., -356.5914, -352.9772,\n", + " -354.9101],\n", + " [-407.1169, -406.9889, -407.2259, ..., -411.4367, -405.8418,\n", + " -407.2929],\n", + " ...,\n", + " [-330.3282, -330.3368, -330.3389, ..., -332.7447, -331.5250,\n", + " -329.7366],\n", + " [-283.1005, -283.1692, -283.1745, ..., -283.0395, -283.1997,\n", + " -282.8568],\n", + " [-235.8705, -235.7804, -235.8927, ..., -235.1325, -235.2520,\n", + " -235.5008]]], grad_fn=)), ('pred_backward', tensor([[[-179.5577, -179.5323, -179.4542, ..., -180.1722, -178.3642,\n", + " -178.2475],\n", + " [-344.9785, -344.9221, -345.0316, ..., -349.3186, -344.6387,\n", + " -344.3976],\n", + " [-311.8071, -311.6877, -311.8076, ..., -315.7125, -312.6975,\n", + " -311.0973],\n", + " ...,\n", + " [-164.1926, -164.2107, -164.1506, ..., -164.0445, -162.9915,\n", + " -163.1473],\n", + " [-161.2745, -161.2983, -161.2401, ..., -161.1260, -160.0812,\n", + " -160.0892],\n", + " [-201.1299, -201.1749, -201.0949, ..., -200.5798, -200.0364,\n", + " -200.2121]],\n", + "\n", + " [[-173.3366, -173.3426, -173.2081, ..., -174.0968, -172.0103,\n", + " -171.8691],\n", + " [-310.7520, -310.6165, -310.7965, ..., -313.6465, -312.9819,\n", + " -309.7775],\n", + " [-297.4435, -297.3187, -297.5530, ..., -300.7402, -298.0805,\n", + " -296.7505],\n", + " ...,\n", + " [-365.5618, -365.3937, -365.5269, ..., -369.6968, -367.3105,\n", + " -365.9371],\n", + " [-360.7122, -360.6502, -360.6949, ..., -365.4681, -362.4482,\n", + " -358.9928],\n", + " [-328.2362, -328.2809, -328.2350, ..., -331.3160, -328.4486,\n", + " -327.7178]]], grad_fn=)), ('forward_output', tensor([[[ 3.6525e-03, 2.2160e-02, 6.8173e-05, ..., -4.7844e-03,\n", + " 9.3836e-03, -4.3491e-01],\n", + " [ 1.8052e-02, 8.7361e-04, 4.8162e-01, ..., -5.3586e-03,\n", + " 6.6052e-02, -1.4424e-02],\n", + " [ 9.5443e-02, 1.6271e-02, 7.0382e-01, ..., -8.3553e-03,\n", + " 1.2897e-02, -3.1771e-02],\n", + " ...,\n", + " [ 5.2170e-03, 1.0105e-02, 2.6849e-01, ..., -3.0296e-03,\n", + " 1.4682e-01, -4.0381e-01],\n", + " [ 8.6507e-03, 1.2562e-02, 9.4650e-01, ..., -1.0862e-03,\n", + " 8.5297e-01, -2.2572e-01],\n", + " [ 3.1620e-02, 1.4642e-01, 1.0650e-01, ..., -1.8507e-01,\n", + " 3.6865e-04, -2.8440e-01]],\n", + "\n", + " [[ 5.3534e-01, 2.1329e-02, 2.8222e-02, ..., -7.0496e-02,\n", + " 6.7711e-02, -3.1365e-03],\n", + " [ 5.6558e-02, 7.0860e-03, 4.0042e-01, ..., -1.3037e-02,\n", + " 2.2477e-02, -2.0711e-02],\n", + " [ 2.1927e-02, 4.2798e-01, 9.1026e-01, ..., -1.8426e-01,\n", + " 6.0737e-03, -1.7819e-01],\n", + " ...,\n", + " [ 3.7156e-02, 2.2477e-02, 6.9470e-01, ..., -1.1230e-02,\n", + " 1.1101e-02, -2.4664e-01],\n", + " [ 1.4176e-02, 1.6747e-02, 7.8785e-02, ..., -1.8862e-02,\n", + " 8.9409e-03, -6.1224e-01],\n", + " [ 7.9827e-02, 3.7614e-02, 2.8973e-01, ..., -9.7911e-02,\n", + " 2.5626e-04, -1.0576e-01]]], grad_fn=)), ('backward_output', tensor([[[ 1.6726e-02, 1.6917e-02, 1.2608e-02, ..., -1.1215e-02,\n", + " 
-7.0637e-01, -2.7572e-01],\n", + " [ 5.7128e-03, 4.2807e-02, 3.8698e-02, ..., -1.0857e-03,\n", + " -7.1516e-01, -9.0976e-02],\n", + " [ 3.5365e-03, 2.8559e-02, 2.2622e-05, ..., -3.7339e-03,\n", + " 7.1600e-01, -5.7635e-01],\n", + " ...,\n", + " [ 4.6398e-02, 4.9136e-02, 1.2801e-02, ..., -1.9671e-02,\n", + " -9.6720e-03, -9.6724e-02],\n", + " [ 4.0719e-02, 4.4131e-02, 1.2812e-02, ..., -1.8923e-02,\n", + " -3.2772e-03, -8.6033e-02],\n", + " [ 4.6603e-02, 6.7548e-02, 3.4405e-02, ..., -2.9001e-02,\n", + " -2.7431e-03, -1.3146e-01]],\n", + "\n", + " [[ 2.9590e-02, 3.2132e-02, 1.8678e-02, ..., -2.1314e-02,\n", + " -6.8087e-01, -6.1092e-02],\n", + " [ 1.8839e-03, 1.5880e-02, 4.3065e-03, ..., -3.6674e-03,\n", + " 7.4532e-01, -3.9937e-02],\n", + " [ 2.6377e-01, 6.6237e-03, 7.1997e-02, ..., -2.2735e-02,\n", + " 3.1916e-01, -1.1802e-02],\n", + " ...,\n", + " [ 6.5437e-01, 8.1936e-02, 8.6420e-01, ..., -4.1453e-02,\n", + " -6.4451e-01, -1.3480e-01],\n", + " [ 1.7608e-01, 2.3962e-01, 8.7436e-01, ..., -1.5172e-01,\n", + " -7.5390e-01, -2.2155e-01],\n", + " [ 8.7120e-02, 1.5658e-01, 7.4500e-01, ..., -1.3044e-02,\n", + " -7.5951e-01, -3.6064e-01]]], grad_fn=))])\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "f:\\bdaa\\edunlp\\EduNLP\\Vector\\elmo_vec.py:36: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n", + " (outputs.forward_output[torch.arange(len(items[\"seq_len\"])), torch.tensor(items[\"seq_len\"]) - 1],\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([2, 768])\n", + "torch.Size([2, 25, 768])\n", "\n" ] } @@ -115,21 +239,21 @@ "\n", "# # 获得句表征\n", "i_vec = t2v(token_items)\n", - "print(i_vec.shape)\n", + "print(i_vec)\n", "print()\n", "\n", "# 获得句表征和词表征\n", "i_vec = t2v.infer_vector(token_items, lengths=lengths)\n", "t_vec = t2v.infer_tokens(token_items, lengths=lengths)\n", - "print(i_vec.shape)\n", - "print(t_vec.shape)\n", + "print(i_vec.size())\n", + "print(t_vec.size())\n", "print()" ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3.8.13 ('nlp')", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -143,9 +267,8 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" + "version": "3.8.12" }, - "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "cc3e3b0a667322a868bdd200d76d82ed50310f7037715f6f0bc4c373c1c03ce5" @@ -153,5 +276,5 @@ } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/setup.py b/setup.py index ff792b24..a1950a7b 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ vec_deps = [ 'gensim', - 'transformers', + 'transformers<4.29.0', 'torchvision', 'datasets'] + ml_pytorch_deps diff --git a/static/test_data/standard_luna_data.json b/static/test_data/standard_luna_data.json index 9b8e3906..0ef4403a 100644 --- a/static/test_data/standard_luna_data.json +++ b/static/test_data/standard_luna_data.json @@ -1,13 +1,13 @@ -{"ques_content": "已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$", "ques_subject": 1, "ques_id": "726cdbec-33a9-11ec-909c-98fa9b625adb", "ques_type": 7, "ques_system": 10, "ques_period": 2, "ques_options": ["\\\\{-4,1\\\\}", "\\\\{1,5\\\\}", "\\\\{3,5\\\\}", "\\\\{1,3\\\\}"], "ques_answer": "D", "know_list": [0, 10, 57], "know_name": ["代数", "集合", "集合的相等"], "difficulty": 
0.424379, "ques_figure_ids": [], "ques_figure_paths": []} +{"ques_content": "已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$", "ques_subject": 1, "ques_id": "726cdbec-33a9-11ec-909c-98fa9b625adb", "ques_type": 7, "ques_system": 10, "ques_period": 2, "ques_options": ["\\\\{-4,1\\\\}", "\\\\{1,5\\\\}", "\\\\{3,5\\\\}", "\\\\{1,3\\\\}"], "ques_answer": "C", "know_list": [0, 10, 57], "know_name": ["代数", "集合", "集合的相等"], "difficulty": 0.424379, "ques_figure_ids": [], "ques_figure_paths": []} {"ques_content": "若复数$z=1+2 i+i^{3}$,则$|z|=$", "ques_subject": 1, "ques_id": "726e139c-33a9-11ec-bd9e-98fa9b625adb", "ques_type": 7, "ques_system": 10, "ques_period": 2, "ques_options": ["0", "1", "$\\\\sqrt{2}$", "2"], "ques_answer": "C", "know_list": [0, 19, 269], "know_name": ["代数", "数系的扩充与复数", "复数代数形式的加减运算"], "difficulty": 0.566538, "ques_figure_ids": [], "ques_figure_paths": []} {"ques_content": "埃及胡夫金字塔是古代世界建筑奇迹之一,它的形状可视为一个正四棱锥。以该四棱锥的高为边长的正方形面积等于该四棱锥一个侧面三角形的面积,则其侧面三角形底边上的高与底面正方形的边长的比值为", "ques_subject": 1, "ques_id": "726e3a92-33a9-11ec-88d7-98fa9b625adb", "ques_type": 7, "ques_system": 10, "ques_period": 2, "ques_options": ["$\\\\frac{\\\\sqrt{5}-1}{4}$", "$\\\\frac{\\\\sqrt{5}-1}{2}$", "$\\\\frac{\\\\sqrt{5}+1}{4}$", "$\\\\frac{\\\\sqrt{5}+1}{2}$"], "ques_answer": "C", "know_list": [6, 30, 511], "know_name": ["立体几何", "空间几何体", "棱柱的结构特征"], "difficulty": 0.604718, "ques_figure_ids": ["726e3a92-33a9-11ec-88d7-98fa9b625adb"], "ques_figure_paths": ["/share/luna/database-data/figure/LUNA-figure/7/73/73d66b18-33a9-11ec-a11a-98fa9b625adb.png"]} {"ques_content": "设$O$为正方形$ABCD$中心,在$O, A, B, C, D$中任取$3$点,则取到的$3$点共线的概率为", "ques_subject": 1, "ques_id": "726e3a93-33a9-11ec-9d16-98fa9b625adb", "ques_type": 7, "ques_system": 10, "ques_period": 2, "ques_options": ["$\\\\frac{1}{5}$", "$\\\\frac{2}{5}$", "$\\\\frac{1}{2}$", "$\\\\frac{4}{5}$"], "ques_answer": "A", "know_list": [1, 21, 309], "know_name": ["排列组合与概率统计", "概率", "古典概型及其概率计算公式"], "difficulty": 0.455177, "ques_figure_ids": [], "ques_figure_paths": []} -{"ques_content": "某校一个课外学习小组为研究某作物的发芽率$y$和温度$x$(单位:$^{\\circ} \\mathrm{C}$)的关系,在$20$个不同温度条件下进行种子发芽实验,由实验数据$\\left(x_{i}, y_{i}\\right)(i=1,2, \\cdots, 20)$得到下面的散点图:$\\FigureID{000004d6-0479-11ec-829b-797d5eb43535}$由此散点图,在$10$$^{\\circ} \\mathrm{C}$至$40$$^{\\circ} \\mathrm{C}$之间,下面四个回归方程类型中最适宜作为发芽率$y$和温度$x$的回归方程类型的是", "ques_subject": 1, "ques_id": "726e618c-33a9-11ec-9f9e-98fa9b625adb", "ques_type": 7, "ques_system": 10, "ques_period": 2, "ques_options": ["$y=a+b x$", "$y=a+b x^{2}$", "$y=a+b e^{x}$", "$y=a+b \\\\ln x$"], "ques_answer": "D", "know_list": [1, 20, 286], "know_name": ["排列组合与概率统计", "统计与统计案例", "变量间的相关关系"], "difficulty": 0.530059, "ques_figure_ids": ["726e618c-33a9-11ec-9f9e-98fa9b625adb"], "ques_figure_paths": ["/share/luna/database-data/figure/LUNA-figure/7/73/73d69210-33a9-11ec-b2fe-98fa9b625adb.png"]} +{"ques_content": "某校一个课外学习小组为研究某作物的发芽率$y$和温度$x$(单位:$^{\\circ} \\mathrm{C}$)的关系,在$20$个不同温度条件下进行种子发芽实验,由实验数据$\\left(x_{i}, y_{i}\\right)(i=1,2, \\cdots, 20)$得到下面的散点图:$\\FigureID{000004d6-0479-11ec-829b-797d5eb43535}$由此散点图,在$10$$^{\\circ} \\mathrm{C}$至$40$$^{\\circ} \\mathrm{C}$之间,下面四个回归方程类型中最适宜作为发芽率$y$和温度$x$的回归方程类型的是", "ques_subject": 1, "ques_id": "726e618c-33a9-11ec-9f9e-98fa9b625adb", "ques_type": 7, "ques_system": 10, "ques_period": 2, "ques_options": ["$y=a+b x$", "$y=a+b x^{2}$", "$y=a+b e^{x}$", "$y=a+b \\\\ln x$"], "ques_answer": "C", "know_list": [1, 20, 286], "know_name": ["排列组合与概率统计", "统计与统计案例", "变量间的相关关系"], "difficulty": 0.530059, 
"ques_figure_ids": ["000004d6-0479-11ec-829b-797d5eb43535"], "ques_figure_paths": ["/share/luna/database-data/figure/LUNA-figure/7/73/73d69210-33a9-11ec-b2fe-98fa9b625adb.png"]} {"ques_content": "已知圆$x^{2}+y^{2}-6 x=0$,过点($1,2)$的直线被该圆所截得的弦的长度的最小值为", "ques_subject": 1, "ques_id": "726e8886-33a9-11ec-b0d0-98fa9b625adb", "ques_type": 7, "ques_system": 10, "ques_period": 2, "ques_options": ["1", "2", "3", "4"], "ques_answer": "A", "know_list": [5, 27, 440], "know_name": ["平面解析几何", "直线与方程", "斜率的计算公式"], "difficulty": 0.58802, "ques_figure_ids": [], "ques_figure_paths": []} -{"ques_content": "设函数$f(x)=\\cos \\left(\\omega x+\\frac{\\pi}{6}\\right)$ 在 $[-\\pi, \\pi]$的图像大致如下图,则$f(x)$的最小周期为", "ques_subject": 1, "ques_id": "726e8887-33a9-11ec-a994-98fa9b625adb", "ques_type": 7, "ques_system": 10, "ques_period": 2, "ques_options": ["$\\\\frac{10 \\\\pi}{9}$", "$\\\\frac{7 \\\\pi}{6}$", "$\\\\frac{4\\\\pi}{3}$", "$\\\\frac{3 \\\\pi}{2}$"], "ques_answer": "C", "know_list": [4, 26, 422], "know_name": ["三角函数", "三角函数", "函数y=Asin(ωx +ф)的图像变换"], "difficulty": 0.421256, "ques_figure_ids": ["726e8887-33a9-11ec-a994-98fa9b625adb"], "ques_figure_paths": ["/share/luna/database-data/figure/LUNA-figure/7/73/73d6b902-33a9-11ec-b912-98fa9b625adb.png"]} +{"ques_content": "设函数$f(x)=\\cos \\left(\\omega x+\\frac{\\pi}{6}\\right)$ 在 $[-\\pi, \\pi]$的图像大致如下图,则$f(x)$的最小周期为", "ques_subject": 1, "ques_id": "726e8887-33a9-11ec-a994-98fa9b625adb", "ques_type": 7, "ques_system": 10, "ques_period": 2, "ques_options": ["$\\\\frac{10 \\\\pi}{9}$", "$\\\\frac{7 \\\\pi}{6}$", "$\\\\frac{4\\\\pi}{3}$", "$\\\\frac{3 \\\\pi}{2}$"], "ques_answer": "C", "know_list": [4, 26, 422], "know_name": ["三角函数", "三角函数", "函数y=Asin(ωx +ф)的图像变换"], "difficulty": 0.421256, "ques_figure_ids": ["000004d6-0479-11ec-829b-797d5eb43535"], "ques_figure_paths": ["/share/luna/database-data/figure/LUNA-figure/7/73/73d6b902-33a9-11ec-b912-98fa9b625adb.png"]} {"ques_content": "设对数$a \\log _{3} 4=2,$ 则 $4^{-a}=$", "ques_subject": 1, "ques_id": "726eb0ac-33a9-11ec-b339-98fa9b625adb", "ques_type": 7, "ques_system": 10, "ques_period": 2, "ques_options": ["$\\\\frac{1}{16}$", "$\\\\frac{1}{9}$", "$\\\\frac{1}{8}$", "$\\\\frac{1}{6}$"], "ques_answer": "B", "know_list": [0, 13, 129], "know_name": ["代数", " 基本初等函数Ⅰ", "指数函数的实际应用"], "difficulty": 0.34596, "ques_figure_ids": [], "ques_figure_paths": []} -{"ques_content": "执行右面的程序框图$\\FigureID{000004d6-0479-11ec-829b-797d5eb43535}$,则输出的$n=$", "ques_subject": 1, "ques_id": "726ed764-33a9-11ec-9fd2-98fa9b625adb", "ques_type": 7, "ques_system": 10, "ques_period": 2, "ques_options": ["17", "19", "21", "23"], "ques_answer": "C", "know_list": [2, 23, 351], "know_name": ["算法与框图", "算法初步与框图", "程序框图"], "difficulty": 0.78378, "ques_figure_ids": ["726ed764-33a9-11ec-9fd2-98fa9b625adb"], "ques_figure_paths": ["/share/luna/database-data/figure/LUNA-figure/7/73/73d6dff8-33a9-11ec-8bfa-98fa9b625adb.png"]} -{"ques_content": "设$\\left\\{a_{n}\\right\\}$是等比数列,且$a_{1}+a_{2}+a_{3}=1, a_{2}+a_{3}+a_{4}=2,$ 则 $a_{6}+a_{7}+a_{8}=$", "ques_subject": 1, "ques_id": "726ed765-33a9-11ec-b319-98fa9b625adb", "ques_type": 7, "ques_system": 10, "ques_period": 2, "ques_options": ["12", "24", "30", "32"], "ques_answer": "D", "know_list": [0, 17, 213], "know_name": ["代数", "数列", "等比数列"], "difficulty": 0.455039, "ques_figure_ids": [], "ques_figure_paths": []} +{"ques_content": "执行右面的程序框图$\\FigureID{000004d6-0479-11ec-829b-797d5eb43535}$,则输出的$n=$", "ques_subject": 1, "ques_id": "726ed764-33a9-11ec-9fd2-98fa9b625adb", "ques_type": 7, "ques_system": 10, "ques_period": 
2, "ques_options": ["17", "19", "21", "23"], "ques_answer": "C", "know_list": [2, 23, 351], "know_name": ["算法与框图", "算法初步与框图", "程序框图"], "difficulty": 0.78378, "ques_figure_ids": ["000004d6-0479-11ec-829b-797d5eb43535"], "ques_figure_paths": ["/share/luna/database-data/figure/LUNA-figure/7/73/73d6dff8-33a9-11ec-8bfa-98fa9b625adb.png"]} +{"ques_content": "设$\\left\\{a_{n}\\right\\}$是等比数列,且$a_{1}+a_{2}+a_{3}=1, a_{2}+a_{3}+a_{4}=2,$ 则 $a_{6}+a_{7}+a_{8}=$", "ques_subject": 1, "ques_id": "726ed765-33a9-11ec-b319-98fa9b625adb", "ques_type": 7, "ques_system": 10, "ques_period": 2, "ques_options": ["12", "24", "30", "32"], "ques_answer": "C", "know_list": [0, 17, 213], "know_name": ["代数", "数列", "等比数列"], "difficulty": 0.455039, "ques_figure_ids": [], "ques_figure_paths": []} {"ques_content": "设$F_{1}, F_{2}$是双曲线$C: x^{2}-\\frac{y^{2}}{3}=1$的两个焦点,$O$为坐标原点,点$P$在$C$上且$|O P|=2$,则$\\triangle P F_{1} F_{2}$的面积为", "ques_subject": 1, "ques_id": "726efed0-33a9-11ec-a18c-98fa9b625adb", "ques_type": 7, "ques_system": 10, "ques_period": 2, "ques_options": ["$\\\\frac{7}{2}$", "3", "$\\\\frac{5}{2}$", "2"], "ques_answer": "B", "know_list": [5, 29, 494], "know_name": ["平面解析几何", "圆锥曲线与方程", "双曲线的定义"], "difficulty": 0.393424, "ques_figure_ids": [], "ques_figure_paths": []} {"ques_content": "已知$A, B, C$为球$O$的球面上的三个点,$\\odot O_{1}$ 为 $\\triangle A B C$ 的外接圆,若 $\\odot O_{1}$ 的面积为 $4 \\pi$,$A B=B C=A C=O O_{1},$ 则球 $O$ 的表面积为", "ques_subject": 1, "ques_id": "726f25f0-33a9-11ec-8501-98fa9b625adb", "ques_type": 7, "ques_system": 10, "ques_period": 2, "ques_options": ["$64 \\\\pi$", "$48 \\\\pi$", "$36 \\\\pi$", "$32 \\\\pi$"], "ques_answer": "A", "know_list": [6, 31, 569], "know_name": ["立体几何", "空间向量与立体几何", "用空间向量求直线间的夹角、距离"], "difficulty": 0.365152, "ques_figure_ids": [], "ques_figure_paths": []} {"ques_content": "若$x,y$满足约束条件$\\left\\{\\begin{array}{c}2 x+y-2 \\leq 0 \\\\ x-y-1 \\geq 0 \\\\ y+1 \\geq 0\\end{array}\\right.$,则$z=x+7 y$的最大值为_______", "ques_subject": 1, "ques_id": "726f25f1-33a9-11ec-82f6-98fa9b625adb", "ques_type": 2, "ques_system": 10, "ques_period": 2, "ques_options": [], "ques_answer": "1", "know_list": [4, 25, 379], "know_name": ["三角函数", "三角函数及其恒等变换", "三角函数的定义域"], "difficulty": 0.371465, "ques_figure_ids": [], "ques_figure_paths": []} @@ -16,10 +16,10 @@ {"ques_content": "数列$\\left\\{a_{n}\\right\\}$满足$a_{n+2}+(-1)^{n} a_{n}=3 n-1$,前$16$项和为$540$,则$\\left\\{a_{1}\\right\\}=$_______", "ques_subject": 1, "ques_id": "726f737a-33a9-11ec-8c30-98fa9b625adb", "ques_type": 2, "ques_system": 10, "ques_period": 2, "ques_options": [], "ques_answer": "$\\left\\{a_{1}\\right\\}=7$", "know_list": [7, 33, 587], "know_name": ["高等数学", "矩阵与变换", "二阶矩阵"], "difficulty": 0.559559, "ques_figure_ids": [], "ques_figure_paths": []} {"ques_content": "某厂接受了一项加工业务,加工出来的产品(单位:件)按标准分为$A, B, C, D$四个等级。加工业务约定:对于$A$级品,$B$级品,$C$级品,厂家每件分别收取加工费$90$元,$50$元,$20$元;对于$D$级品,厂家每件要赔偿原料损失费$50$元。该厂有甲、乙两个分厂可承接加工业务,甲分厂加工成本费为$25$元/件,乙分厂加工成本费为$20$元/件。厂家为决定由哪个分厂承接加工业务,在两个分厂各试加工了$100$件这种产品,并统计了这些产品的等级,整理如下:分别估计甲、乙两分厂加工出来的一件产品为$A$级品的概率;分别求甲、乙两分厂加工出来的$100$件产品的平均利润,以平均利润为依据,厂家应选哪个分厂承接加工业务?", "ques_subject": 1, "ques_id": "726f9938-33a9-11ec-a236-98fa9b625adb", "ques_type": 5, "ques_system": 10, "ques_period": 2, "ques_options": [], "ques_answer": "(1)0.4和0.28;(2)$x+y+z=1$", "know_list": [0, 14, 164], "know_name": ["代数", "函数的应用", "函数模型的选择与应用"], "difficulty": 0.29537, "ques_figure_ids": [], "ques_figure_paths": []} {"ques_content": "$\\triangle A B C$ 的内角为 $A, \\quad B, \\quad C$的对边分别为$a, b, c$,已知$B=150^{\\circ}$。若$a=\\sqrt{3} c, \\quad b=2 \\sqrt{7}, \\quad$ 
求 $\\triangle A B C$ 的面积;若$\\sin A+\\sqrt{3} \\sin C=\\frac{\\sqrt{2}}{2}, \\quad$ 求 $C$", "ques_subject": 1, "ques_id": "726f9939-33a9-11ec-981c-98fa9b625adb", "ques_type": 5, "ques_system": 10, "ques_period": 2, "ques_options": [], "ques_answer": "(1)$S_{\\triangle A B C}=\\sqrt{3}$;(2)$C=\\frac{\\pi}{12}$。", "know_list": [4, 26, 436], "know_name": ["三角函数", "三角函数", "解三角形"], "difficulty": 0.286434, "ques_figure_ids": [], "ques_figure_paths": []} -{"ques_content": "如图$\\FigureID{000004d6-0479-11ec-829b-797d5eb43535}$,$D$ 为圆锥的顶点,$O$是圆锥底面的圆心,$\\triangle A B C$是底面的内接正三角形,$P$ 为$DO$上一点,$\\angle A P C=90^{\\circ}$。证明:平面$P A B \\perp$平面$P A C $设$D O=\\sqrt{2}$,圆锥的侧面积为$\\sqrt{3} \\pi$,求三棱锥$P-A B C$的体积。", "ques_subject": 1, "ques_id": "726fc02e-33a9-11ec-99e4-98fa9b625adb", "ques_type": 5, "ques_system": 10, "ques_period": 2, "ques_options": [], "ques_answer": "", "know_list": [6, 31, 555], "know_name": ["立体几何", "空间向量与立体几何", "空间点、线、面的位置"], "difficulty": 0.339271, "ques_figure_ids": ["726fc02e-33a9-11ec-99e4-98fa9b625adb"], "ques_figure_paths": ["/share/luna/database-data/figure/LUNA-figure/7/73/73d72de2-33a9-11ec-84f7-98fa9b625adb.png"]} {"ques_content": "已知函数$f(x)=e^{x}-a(x+2)$当$a=1$时,讨论$f(x)$的单调性;若$f(x)$有两个零点,求$a$的取值范围", "ques_subject": 1, "ques_id": "726fc02f-33a9-11ec-8019-98fa9b625adb", "ques_type": 5, "ques_system": 10, "ques_period": 2, "ques_options": [], "ques_answer": "(1)$f(x)$在$(-\\infty, 0)$上单调递减,在$(0, +\\infty)$上单调递增\n(2)$\\left(e^{-1},+\\infty\\right)$", "know_list": [0, 15, 177], "know_name": ["代数", "导数及其应用", "利用导数研究函数的极值"], "difficulty": 0.270787, "ques_figure_ids": [], "ques_figure_paths": []} {"ques_content": "已知$A,B$分别为椭圆$E: \\frac{x^{2}}{a^{2}}+y^{2}=1 \\quad a>1$的左、右顶点,$G$ 为$E$的上顶点, $\\overrightarrow{A G} \\cdot \\overrightarrow{G B}=8$。$P$为直线$x=6$的动点,$P A $与$ E$的另一交点为 $C$, $PB$与$E$的另一交点为$D$求$E$的方程证明:直线$CD$过定点", "ques_subject": 1, "ques_id": "726fe850-33a9-11ec-9724-98fa9b625adb", "ques_type": 5, "ques_system": 10, "ques_period": 2, "ques_options": [], "ques_answer": "(1)$\\frac{x^{2}}{9}+y^{2}=1$;(2)$\\left(\\frac{3}{2}, 0\\right)$", "know_list": [5, 29, 486], "know_name": ["平面解析几何", "圆锥曲线与方程", "椭圆的定义"], "difficulty": 0.358355, "ques_figure_ids": [], "ques_figure_paths": []} {"ques_content": "[选修$4-4$:坐标系与参数方程]\n在直角坐标系$xOy$中,曲线$C_1$的参数方程为$\\left\\{\\begin{array}{l}x=\\cos ^{k} t \\\\ y=\\sin ^{k} t\\end{array}\\right.$($t$为参数),以坐标原点为极点,$x$轴正半轴为极轴建立极坐标系,曲线$C_2$的极坐标方程为$4 \\rho \\cos \\theta-16 \\rho \\sin \\theta+3=0$当$k=1$时,$C_1$是什么曲线?当$k=4$时,求$C_1$与$C_2$的公共点的直角坐标", "ques_subject": 1, "ques_id": "726fe851-33a9-11ec-bf44-98fa9b625adb", "ques_type": 5, "ques_system": 10, "ques_period": 2, "ques_options": [], "ques_answer": "(1)以原点为圆心,以1为半径的圆;\n(2)$\\left(\\frac{1}{4}, \\frac{1}{4}\\right)$", "know_list": [7, 35, 615], "know_name": ["高等数学", "坐标系与参数方程", "简单曲线的极坐标方程"], "difficulty": 0.331245, "ques_figure_ids": [], "ques_figure_paths": []} -{"ques_content": "[选修$4-5$:不等式选讲]\n已知函数$f(x)=|3 x+1|-2|x|$画出$y=f(x)$的图像求不等式$f(x)>f(x+1)$的解集", "ques_subject": 1, "ques_id": "72700f52-33a9-11ec-bdd0-98fa9b625adb", "ques_type": 5, "ques_system": 10, "ques_period": 2, "ques_options": [], "ques_answer": "(1)如图;(2)$\\left(-\\infty,-\\frac{7}{6}\\right)$", "know_list": [7, 36, 635], "know_name": ["高等数学", "不等式选讲", "绝对值不等式"], "difficulty": 0.517198, "ques_figure_ids": ["72700f52-33a9-11ec-bdd0-98fa9b625adb"], "ques_figure_paths": ["/share/luna/database-data/figure/LUNA-figure/7/73/73d754d2-33a9-11ec-8dee-98fa9b625adb.png"]} -{"ques_content": "$1.$已知集合 $A=\\{x||x|<3, x \\in Z\\}, \\quad B=\\{x||x|>1, x 
\\in \\mathbf{Z}\\}, \\quad$ 则 $A \\cap B=$", "ques_subject": 1, "ques_id": "72700f53-33a9-11ec-8395-98fa9b625adb", "ques_type": 7, "ques_system": 10, "ques_period": 2, "ques_options": ["$\\\\varnothing$", "$\\\\{-3,-2,2,3\\\\}$", "$\\\\{-2,0,2\\\\}$", "$\\\\{-2,2\\\\}$"], "ques_answer": "D", "know_list": [0, 10, 57], "know_name": ["代数", "集合", "集合的相等"], "difficulty": 0.349825, "ques_figure_ids": [], "ques_figure_paths": []} -{"ques_content": "复数 $(1-i)^{4}=$", "ques_subject": 1, "ques_id": "7270351c-33a9-11ec-82f2-98fa9b625adb", "ques_type": 7, "ques_system": 10, "ques_period": 2, "ques_options": ["-4", "4", "$-4i$", "$4i$"], "ques_answer": "A", "know_list": [0, 19, 269], "know_name": ["代数", "数系的扩充与复数", "复数代数形式的加减运算"], "difficulty": 0.649847, "ques_figure_ids": [], "ques_figure_paths": []} \ No newline at end of file +{"ques_content": "[选修$4-5$:不等式选讲]\n已知函数$f(x)=|3 x+1|-2|x|$画出$y=f(x)$的图像求不等式$f(x)>f(x+1)$的解集", "ques_subject": 1, "ques_id": "72700f52-33a9-11ec-bdd0-98fa9b625adb", "ques_type": 5, "ques_system": 10, "ques_period": 2, "ques_options": [], "ques_answer": "(1)如图;(2)$\\left(-\\infty,-\\frac{7}{6}\\right)$", "know_list": [7, 36, 635], "know_name": ["高等数学", "不等式选讲", "绝对值不等式"], "difficulty": 0.517198, "ques_figure_ids": ["000004d6-0479-11ec-829b-797d5eb43535"], "ques_figure_paths": ["/share/luna/database-data/figure/LUNA-figure/7/73/73d754d2-33a9-11ec-8dee-98fa9b625adb.png"]} +{"ques_content": "$1.$已知集合 $A=\\{x||x|<3, x \\in Z\\}, \\quad B=\\{x||x|>1, x \\in \\mathbf{Z}\\}, \\quad$ 则 $A \\cap B=$", "ques_subject": 1, "ques_id": "72700f53-33a9-11ec-8395-98fa9b625adb", "ques_type": 7, "ques_system": 10, "ques_period": 2, "ques_options": ["$\\\\varnothing$", "$\\\\{-3,-2,2,3\\\\}$", "$\\\\{-2,0,2\\\\}$", "$\\\\{-2,2\\\\}$"], "ques_answer": "C", "know_list": [0, 10, 57], "know_name": ["代数", "集合", "集合的相等"], "difficulty": 0.349825, "ques_figure_ids": [], "ques_figure_paths": []} +{"ques_content": "复数 $(1-i)^{4}=$", "ques_subject": 1, "ques_id": "7270351c-33a9-11ec-82f2-98fa9b625adb", "ques_type": 7, "ques_system": 10, "ques_period": 2, "ques_options": ["-4", "4", "$-4i$", "$4i$"], "ques_answer": "A", "know_list": [0, 19, 269], "know_name": ["代数", "数系的扩充与复数", "复数代数形式的加减运算"], "difficulty": 0.649847, "ques_figure_ids": [], "ques_figure_paths": []} +{"ques_content": "如图$\\FigureID{000004d6-0479-11ec-829b-797d5eb43535}$,$D$ 为圆锥的顶点,$O$是圆锥底面的圆心,$\\triangle A B C$是底面的内接正三角形,$P$ 为$DO$上一点,$\\angle A P C=90^{\\circ}$。证明:平面$P A B \\perp$平面$P A C $设$D O=\\sqrt{2}$,圆锥的侧面积为$\\sqrt{3} \\pi$,求三棱锥$P-A B C$的体积。", "ques_subject": 1, "ques_id": "726fc02e-33a9-11ec-99e4-98fa9b625adb", "ques_type": 5, "ques_system": 10, "ques_period": 2, "ques_options": [], "ques_answer": "", "know_list": [6, 31, 555], "know_name": ["立体几何", "空间向量与立体几何", "空间点、线、面的位置"], "difficulty": 0.339271, "ques_figure_ids": ["000004d6-0479-11ec-829b-797d5eb43535"], "ques_figure_paths": ["/share/luna/database-data/figure/LUNA-figure/7/73/73d72de2-33a9-11ec-84f7-98fa9b625adb.png"]} \ No newline at end of file diff --git a/tests/test_pretrain/__init__.py b/tests/test_pretrain/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_pretrain/conftest.py b/tests/test_pretrain/conftest.py index e5e7d145..461f6fcb 100644 --- a/tests/test_pretrain/conftest.py +++ b/tests/test_pretrain/conftest.py @@ -7,7 +7,6 @@ from EduNLP.ModelZoo import load_items # TEST_GPU = torch.cuda.is_available() -TEST_GPU = False @pytest.fixture(scope="module") diff --git a/tests/test_pretrain/test_hugginface_utils.py 
diff --git a/tests/test_pretrain/__init__.py b/tests/test_pretrain/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/test_pretrain/conftest.py b/tests/test_pretrain/conftest.py
index e5e7d145..461f6fcb 100644
--- a/tests/test_pretrain/conftest.py
+++ b/tests/test_pretrain/conftest.py
@@ -7,7 +7,6 @@
 from EduNLP.ModelZoo import load_items
 
 # TEST_GPU = torch.cuda.is_available()
-TEST_GPU = False
 
 
 @pytest.fixture(scope="module")
diff --git a/tests/test_pretrain/test_hugginface_utils.py b/tests/test_pretrain/test_hugginface_utils.py
index cb0de6a5..bcfea1f2 100644
--- a/tests/test_pretrain/test_hugginface_utils.py
+++ b/tests/test_pretrain/test_hugginface_utils.py
@@ -1,9 +1,9 @@
 from EduNLP.Pretrain.hugginface_utils import TokenizerForHuggingface
 from transformers import AutoTokenizer
 import os
+os.environ["WANDB_DISABLED"] = "true"
 
 
-# TODO
 class TestPretrainUtils:
     def test_hf_tokenzier(self, pretrained_tokenizer_dir):
         tokenizer = TokenizerForHuggingface(tokenize_method=None)
diff --git a/tests/test_pretrain/test_pretrained_bert.py b/tests/test_pretrain/test_pretrained_bert.py
index 80929a91..146119a3 100644
--- a/tests/test_pretrain/test_pretrained_bert.py
+++ b/tests/test_pretrain/test_pretrained_bert.py
@@ -1,6 +1,6 @@
 import os
 os.environ["CUDA_VISIBLE_DEVICES"] = ""
-
+os.environ["WANDB_DISABLED"] = "true"
 import torch
 from EduNLP.ModelZoo.bert import BertForPropertyPrediction, BertForKnowledgePrediction
 from transformers import BertModel as HFBertModel
@@ -9,7 +9,7 @@
 from EduNLP.Vector import T2V, BertModel
 from EduNLP.I2V import Bert, get_pretrained_i2v
 
-from conftest import TEST_GPU
+TEST_GPU = False
 
 
 class TestPretrainBert:
@@ -206,6 +206,8 @@ def test_i2v(self, pretrained_model_dir):
         i_vec = i2v.infer_item_vector(items, key=lambda x: x['stem'])
         assert len(i_vec[0]) == i2v.vector_size
+        i_vec = i2v.infer_item_vector(items, key=lambda x: x['stem'], pooling_strategy='average')
+        assert len(i_vec[0]) == i2v.vector_size
 
         t_vec = i2v.infer_token_vector(items, key=lambda x: x['stem'])
         assert len(t_vec[0][0]) == i2v.vector_size
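The new `test_i2v` assertions exercise the `pooling_strategy` keyword that `infer_item_vector` now forwards to the BERT vectorizer, so an item vector can be an average over token states instead of the default CLS embedding. A minimal usage sketch, assuming a local checkpoint directory and mirroring the constructor call pattern used in this test file:

```python
# Sketch: CLS pooling (default) vs. 'average' pooling for item vectors.
# `pretrained_model_dir` is a placeholder for a trained BERT checkpoint.
from EduNLP.I2V import Bert

pretrained_model_dir = "path/to/bert_checkpoint"
tokenizer_kwargs = {"tokenizer_config_dir": pretrained_model_dir}
i2v = Bert("bert", "bert", pretrained_model_dir, tokenizer_kwargs=tokenizer_kwargs)

items = [{"stem": "已知函数$f(x)=e^{x}-a(x+2)$,讨论$f(x)$的单调性"}]
cls_vec = i2v.infer_item_vector(items, key=lambda x: x["stem"])
avg_vec = i2v.infer_item_vector(items, key=lambda x: x["stem"], pooling_strategy="average")
# Both strategies yield vectors of the same dimensionality.
assert len(cls_vec[0]) == len(avg_vec[0]) == i2v.vector_size
```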
diff --git a/tests/test_pretrain/test_pretrained_disenqnet.py b/tests/test_pretrain/test_pretrained_disenqnet.py
index 41998568..df7eb4a8 100644
--- a/tests/test_pretrain/test_pretrained_disenqnet.py
+++ b/tests/test_pretrain/test_pretrained_disenqnet.py
@@ -1,13 +1,17 @@
 import os
 os.environ["CUDA_VISIBLE_DEVICES"] = ""
-
+os.environ["WANDB_DISABLED"] = "true"
 import pytest
 import torch
 from EduNLP.ModelZoo.disenqnet import DisenQNet
+from EduNLP.ModelZoo.disenqnet import DisenQNetForKnowledgePrediction, DisenQNetForPropertyPrediction
+from EduNLP.Pretrain import finetune_disenqnet_for_knowledge_prediction
+from EduNLP.Pretrain import finetune_disenqnet_for_property_prediction
 from EduNLP.Pretrain import DisenQTokenizer, train_disenqnet
 from EduNLP.Vector import T2V, DisenQModel
 from EduNLP.I2V import DisenQ, get_pretrained_i2v
-from conftest import TEST_GPU
+
+TEST_GPU = False
 
 
 # TODO
@@ -102,6 +106,84 @@ def test_train_disenq(self, standard_luna_data, pretrained_model_dir):
         encodes = tokenizer(test_items, lambda x: x['ques_content'])
         model(**encodes)
 
+    def test_train_pp(self, standard_luna_data, pretrained_pp_dir, pretrained_model_dir):
+        data_params = {
+            "stem_key": "ques_content",
+            "label_key": "difficulty"
+        }
+        train_params = {
+            "num_train_epochs": 1,
+            "per_device_train_batch_size": 2,
+            "per_device_eval_batch_size": 2,
+            "no_cuda": not TEST_GPU,
+        }
+        train_items = standard_luna_data
+        # train without eval_items
+        finetune_disenqnet_for_property_prediction(
+            train_items,
+            pretrained_pp_dir,
+            pretrained_model=pretrained_model_dir,
+            train_params=train_params,
+            data_params=data_params
+        )
+        # train with eval_items
+        finetune_disenqnet_for_property_prediction(
+            train_items,
+            pretrained_pp_dir,
+            pretrained_model=pretrained_model_dir,
+            eval_items=train_items,
+            train_params=train_params,
+            data_params=data_params
+        )
+        model = DisenQNetForPropertyPrediction.from_pretrained(pretrained_pp_dir)
+        tokenizer = DisenQTokenizer.from_pretrained(pretrained_pp_dir)
+
+        encodes = tokenizer(train_items[:8], lambda x: x['ques_content'])
+        # TODO: need to handle inference for T2V for batch or single
+        model(**encodes)
+
+    def test_train_kp(self, standard_luna_data, pretrained_model_dir, pretrained_kp_dir):
+        data_params = {
+            "stem_key": "ques_content",
+            "label_key": "know_list"
+        }
+        train_params = {
+            "num_train_epochs": 1,
+            "per_device_train_batch_size": 2,
+            "per_device_eval_batch_size": 2,
+            "no_cuda": not TEST_GPU,
+        }
+        model_params = {
+            "num_classes_list": [10, 27, 963],
+            "num_total_classes": 1000,
+        }
+        train_items = standard_luna_data
+        # train without eval_items
+        finetune_disenqnet_for_knowledge_prediction(
+            train_items,
+            pretrained_kp_dir,
+            pretrained_model=pretrained_model_dir,
+            train_params=train_params,
+            data_params=data_params,
+            model_params=model_params
+        )
+        # train with eval_items
+        finetune_disenqnet_for_knowledge_prediction(
+            train_items,
+            pretrained_kp_dir,
+            pretrained_model=pretrained_model_dir,
+            eval_items=train_items,
+            train_params=train_params,
+            data_params=data_params,
+            model_params=model_params
+        )
+        model = DisenQNetForKnowledgePrediction.from_pretrained(pretrained_kp_dir)
+        tokenizer = DisenQTokenizer.from_pretrained(pretrained_kp_dir)
+
+        encodes = tokenizer(train_items[:8], lambda x: x['ques_content'])
+        # TODO: need to handle inference for T2V for batch or single
+        model(**encodes)
+
     def test_disenq_t2v(self, pretrained_model_dir):
         items = [
             {'stem': '如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$, \
diff --git a/tests/test_pretrain/test_pretrained_elmo.py b/tests/test_pretrain/test_pretrained_elmo.py
index 16ffb55e..9f106d84 100644
--- a/tests/test_pretrain/test_pretrained_elmo.py
+++ b/tests/test_pretrain/test_pretrained_elmo.py
@@ -1,6 +1,6 @@
 import os
 os.environ["CUDA_VISIBLE_DEVICES"] = ""
-
+os.environ["WANDB_DISABLED"] = "true"
 import pytest
 import torch
 from EduNLP.ModelZoo.rnn import ElmoLM
@@ -9,7 +9,7 @@
 from EduNLP.Vector import T2V, ElmoModel
 from EduNLP.I2V import Elmo, get_pretrained_i2v
 
-from conftest import TEST_GPU
+TEST_GPU = False
 
 
 class TestPretrainEmlo:
@@ -196,7 +196,7 @@ def test_elmo_t2v(self, pretrained_model_dir):
         t2v = ElmoModel(pretrained_model_dir)
         encodes = tokenizer(items, key=lambda x: x['stem'])
         output = t2v(encodes)
-        assert output.shape[1] == t2v.vector_size
+        assert output.forward_output.shape[-1] == t2v.vector_size // 2
 
         t2v = T2V('elmo', pretrained_model_dir)
         encodes = tokenizer(items, key=lambda x: x['stem'])
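The rewritten ELMo assertion reflects that the vectorizer now returns the language model's structured output rather than a single tensor: `forward_output` carries only the forward-direction states, so its last dimension is `vector_size // 2`, with the backward direction supplying the other half of the concatenated ELMo vector. A sketch of that relationship; the checkpoint path, the tokenizer construction, and the `backward_output` field are assumptions here rather than facts established by this diff:

```python
# Sketch: forward/backward halves of the bidirectional ELMo representation.
from EduNLP.Pretrain import ElmoTokenizer
from EduNLP.Vector import ElmoModel

pretrained_model_dir = "path/to/elmo_checkpoint"  # placeholder
tokenizer = ElmoTokenizer.from_pretrained(pretrained_model_dir)
t2v = ElmoModel(pretrained_model_dir)

items = [{"stem": "已知集合$A=\\{x||x|<3, x \\in Z\\}$"}]
encodes = tokenizer(items, key=lambda x: x["stem"])
output = t2v(encodes)

# Each direction contributes vector_size // 2 dimensions; concatenated,
# they form the vector_size-dimensional representation that T2V exposes.
assert output.forward_output.shape[-1] == t2v.vector_size // 2
assert output.backward_output.shape[-1] == t2v.vector_size // 2
```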
diff --git a/tests/test_pretrain/test_pretrained_quesnet.py b/tests/test_pretrain/test_pretrained_quesnet.py
index f1ca3323..92dd204f 100644
--- a/tests/test_pretrain/test_pretrained_quesnet.py
+++ b/tests/test_pretrain/test_pretrained_quesnet.py
@@ -1,11 +1,7 @@
-from lib2to3.pgen2 import token
 import os
-
-from bson import encode
 os.environ["CUDA_VISIBLE_DEVICES"] = ""
-
+os.environ["WANDB_DISABLED"] = "true"
 import pytest
-import torch
 from EduNLP.ModelZoo.quesnet import QuesNet
 from EduNLP.Pretrain import QuesNetTokenizer, Question, pretrain_quesnet
 # from EduNLP.Pretrain import train_quesnet_for_property_prediction, train_quesnet_for_knowledge_prediction
@@ -14,7 +10,7 @@
 from EduNLP.I2V import QuesNet as QuesNetI2V, get_pretrained_i2v
 from EduNLP.utils import abs_current_dir, path_append
 
-from conftest import TEST_GPU
+TEST_GPU = False
 
 
 class TestPretrainQuesNet:
@@ -72,10 +68,11 @@ def test_tokenizer(self, standard_luna_data, pretrained_tokenizer_dir):
     def test_train_quesnet(self, standard_luna_data, pretrained_model_dir):
         test_items = [
             {'ques_content': '有公式$\\FormFigureID{wrong1?}$和公式$\\FormFigureBase64{wrong2?}$,\
-                如图$\\FigureID{000004d6-0479-11ec-829b-797d5eb43535}$,\
-                若$x,y$满足约束条件$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$'},
+                如图$\\FigureID{000004d6-0479-11ec-829b-797d5eb43535}$,若$x,y$满足约束条件$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$'},
             {'ques_content': '如图$\\FigureID{000004d6-0479-11ec-829b-797d5eb43535}$, \
-                若$x,y$满足约束条件$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$'}
+                若$x,y$满足约束条件$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$',
+             "ques_figure_ids": ["000004d6-0479-11ec-829b-797d5eb43535"],
+             "ques_figure_paths": ["../../static/test_data/quesnet_img/000004d6-0479-11ec-829b-797d5eb43535.png"]}
         ]
 
         ques_file = path_append(abs_current_dir(__file__),
@@ -87,6 +84,7 @@ def test_train_quesnet(self, standard_luna_data, pretrained_model_dir):
             pretrained_model_dir,
             img_dir=img_dir,
             save_embs=True,
+            load_embs=False,
             # data_params={
             #     "stem_key": "ques_content"
             # },
@@ -142,7 +140,9 @@ def test_quesnet_t2v(self, pretrained_model_dir):
     def test_quesnet_i2v(self, pretrained_model_dir):
         items = [
             {'ques_content': '如图$\\FigureID{000004d6-0479-11ec-829b-797d5eb43535}$, \
-                若$x,y$满足约束条件$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$'}
+                若$x,y$满足约束条件$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$',
+             "ques_figure_ids": ["000004d6-0479-11ec-829b-797d5eb43535"],
+             "ques_figure_paths": ["../../static/test_data/quesnet_img/000004d6-0479-11ec-829b-797d5eb43535.png"]}
         ]
         img_dir = path_append(abs_current_dir(__file__), "../../static/test_data/quesnet_img", to_str=True)
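With `ques_figure_ids` and `ques_figure_paths` attached to the test items, QuesNet can resolve the `\FigureID{...}` tokens to real images under `img_dir` instead of skipping them. A sketch of assembling such an item programmatically; the directory layout mirrors the test above but is an assumption, not part of the QuesNet API surface this diff changes:

```python
# Sketch: build a QuesNet item whose \FigureID token resolves to a PNG
# under the test image directory used above (layout assumed).
from EduNLP.utils import abs_current_dir, path_append

img_dir = path_append(abs_current_dir(__file__),
                      "../../static/test_data/quesnet_img", to_str=True)
fig_id = "000004d6-0479-11ec-829b-797d5eb43535"
item = {
    "ques_content": "如图$\\FigureID{" + fig_id + "}$,"
                    "若$x,y$满足约束条件$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$",
    "ques_figure_ids": [fig_id],
    "ques_figure_paths": [path_append(img_dir, fig_id + ".png", to_str=True)],
}
```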