diff --git a/libai/models/utils/model_loader/base_loader.py b/libai/models/utils/model_loader/base_loader.py
index 5222e9f06..e12294cd3 100644
--- a/libai/models/utils/model_loader/base_loader.py
+++ b/libai/models/utils/model_loader/base_loader.py
@@ -22,7 +22,6 @@
 import oneflow as flow
 from safetensors import safe_open
 from termcolor import colored
-from safetensors import safe_open

 import libai.utils.distributed as dist
 from libai.config import LazyCall
diff --git a/libai/tokenizer/tokenization_base.py b/libai/tokenizer/tokenization_base.py
index e5e5f121d..18aaef8e6 100644
--- a/libai/tokenizer/tokenization_base.py
+++ b/libai/tokenizer/tokenization_base.py
@@ -827,7 +827,11 @@ def encode(self, text, return_tensors=None, is_global=False, device="cuda", **kwargs):
             self.build_inputs_with_special_tokens(token_ids) for token_ids in token_ids_list
         ]
         token_ids_list = self.convert_to_tensors(
-            token_ids_list, return_tensors=return_tensors, is_global=is_global, **kwargs
+            token_ids_list,
+            return_tensors=return_tensors,
+            is_global=is_global,
+            device=device,
+            **kwargs,
         )
         return token_ids_list
     elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
diff --git a/projects/Qwen/README.md b/projects/Qwen/README.md
new file mode 100644
index 000000000..65333c443
--- /dev/null
+++ b/projects/Qwen/README.md
@@ -0,0 +1,67 @@
+
+### Inference
+
+- cuda PASS
+
+```bash
+python projects/Qwen/pipeline.py --model_path=/root/models/Qwen1.5-7B-Chat --mode=huggingface
+```
+
+- npu PASS
+
+```bash
+python projects/Qwen/pipeline.py --model_path=/data0/hf_models/qwen2/Qwen1.5-7B-Chat --mode=huggingface --device=npu
+```
+
+- xpu PASS
+
+```bash
+python projects/Qwen/pipeline.py --model_path=/root/models/Qwen1.5-7B-Chat --mode=huggingface --device=xpu
+```
+
+### Training
+
+- Data preparation
+
+```bash
+python projects/Qwen/utils/data_prepare.py
+```
+
+- cuda PASS
+
+```bash
+export NUM_GPUS=8
+python3 -m oneflow.distributed.launch \
+    --nproc_per_node ${NUM_GPUS} \
+    --nnodes 1 \
+    --node_rank 0 \
+    --master_addr 127.0.0.1 \
+    --master_port 12345 \
+    tools/train_net.py --config-file=projects/Qwen/configs/qwen_sft.py \
+    graph.enabled=True \
+    train.input_placement_device="cuda" \
+    train.dist.device_type="cuda" \
+    train.dist.pipeline_parallel_size=${NUM_GPUS}
+```
+
+Note: OOM on 4 x A100-PCIE-40GB.
+
+- xpu OOM
+
+```bash
+export NUM_GPUS=1
+python3 -m oneflow.distributed.launch \
+    --nproc_per_node ${NUM_GPUS} \
+    --nnodes 1 \
+    --node_rank 0 \
+    --master_addr 127.0.0.1 \
+    --master_port 12345 \
+    tools/train_net.py --config-file=projects/Qwen/configs/qwen_sft.py \
+    graph.enabled=False \
+    train.input_placement_device="xpu" \
+    train.dist.device_type="xpu" \
+    train.dist.pipeline_parallel_size=${NUM_GPUS}
+```
+
+- npu: not tested, and not expected to work yet.
+
diff --git a/projects/Qwen/config/qwen_config.py b/projects/Qwen/configs/qwen_config.py
similarity index 86%
rename from projects/Qwen/config/qwen_config.py
rename to projects/Qwen/configs/qwen_config.py
index 20381a5fd..740d0adec 100644
--- a/projects/Qwen/config/qwen_config.py
+++ b/projects/Qwen/configs/qwen_config.py
@@ -1,10 +1,9 @@
 from omegaconf import DictConfig, OmegaConf

+from configs.common.train import train
 from libai.config import LazyCall
 from projects.Qwen.qwen2 import Qwen2ForCausalLM
 from projects.Qwen.tokenizer import Qwen2Tokenizer
-from configs.common.train import train
-

 cfg = dict(
     # Model
@@ -49,7 +48,7 @@
     eos_token_id=151645,
     pad_token_id=151643,
     # train
-    pretrained_model_path="/data/home/xiezipeng/hf_models/Qwen/Qwen1.5-7B",
+    pretrained_model_path="/root/models/Qwen1.5-7B-Chat",
 )

 cfg = DictConfig(cfg)
@@ -58,6 +57,6 @@
 tokenization = OmegaConf.create()
 tokenization.make_vocab_size_divisible_by = 1
 tokenization.tokenizer = LazyCall(Qwen2Tokenizer)(
-    vocab_file="/data/home/xiezipeng/hf_models/Qwen/Qwen1.5-7B/vocab.json",
-    merges_file="/data/home/xiezipeng/hf_models/Qwen/Qwen1.5-7B/merges.txt",
+    # vocab_file="/root/models/Qwen1.5-7B/vocab.json",
+    # merges_file="/root/models/Qwen/Qwen1.5-7B/merges.txt",
 )
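The `tokenization_base.py` hunk above forwards the `device` argument from `encode()` into `convert_to_tensors()`, which is what lets the README commands target cuda, xpu, or npu. A minimal usage sketch, not taken from the PR: the vocab/merges paths and the prompt are placeholders, and a libai environment with its distributed settings initialized is assumed.

```python
# Sketch only: placeholder paths; assumes libai/oneflow are installed and the
# distributed defaults (single device) are in effect.
from projects.Qwen.tokenizer import Qwen2Tokenizer

tokenizer = Qwen2Tokenizer(
    vocab_file="/path/to/Qwen1.5-7B-Chat/vocab.json",
    merges_file="/path/to/Qwen1.5-7B-Chat/merges.txt",
)

# With the patch, `device` reaches convert_to_tensors, so the returned global
# tensor can be placed on "cuda", "xpu", or "npu" straight from encode().
input_ids = tokenizer.encode(
    "Give three tips for staying healthy.",
    return_tensors="of",
    is_global=True,
    device="xpu",
)
```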
diff --git a/projects/Qwen/config/qwen_sft.py b/projects/Qwen/configs/qwen_sft.py
similarity index 72%
rename from projects/Qwen/config/qwen_sft.py
rename to projects/Qwen/configs/qwen_sft.py
index 93ccf5ca9..05f506d97 100644
--- a/projects/Qwen/config/qwen_sft.py
+++ b/projects/Qwen/configs/qwen_sft.py
@@ -1,26 +1,24 @@
 import os
+
 from omegaconf import OmegaConf
+from configs.common.models.graph import graph
+from configs.common.optim import optim
+from configs.common.train import train
 from libai.config import LazyCall
+from libai.data.build import build_nlp_test_loader, build_nlp_train_loader
 from libai.evaluation import PPLEvaluator
 from libai.scheduler import WarmupExponentialLR
-from libai.data.build import build_nlp_test_loader, build_nlp_train_loader
-
-from configs.common.train import train
-from configs.common.models.graph import graph
-from configs.common.optim import optim
-
-from projects.Qwen.config.qwen_config import cfg
-from projects.Qwen.utils.qwen_dataset import QwenDataset
-from projects.Qwen.tokenizer import Qwen2Tokenizer
+from projects.Qwen.configs.qwen_config import cfg
 from projects.Qwen.qwen2 import Qwen2ForCausalLM
-
+from projects.Qwen.tokenizer import Qwen2Tokenizer
+from projects.Qwen.qwen_dataset import QwenDataset

 # Hyperparameters
 weight_decay = 0.1
 learning_rate = 5e-5
-dataset_path = "/data/home/xiezipeng/libai/projects/Qwen/train_set"
-pretrained_model_path = "/data/home/xiezipeng/hf_models/Qwen/Qwen1.5-7B"
+dataset_path = "./alpaca_data"
+pretrained_model_path = "/root/models/Qwen1.5-7B-Chat"

 # graph & optim
 graph["enabled"] = False
@@ -35,12 +33,13 @@
 tokenization = OmegaConf.create()
 tokenization.make_vocab_size_divisible_by = 1
 tokenization.tokenizer = LazyCall(Qwen2Tokenizer)(
-    vocab_file="/data/home/xiezipeng/hf_models/Qwen/Qwen1.5-7B/vocab.json",
-    merges_file="/data/home/xiezipeng/hf_models/Qwen/Qwen1.5-7B/merges.txt",
+    vocab_file=pretrained_model_path + "/vocab.json",
+    merges_file=pretrained_model_path + "/merges.txt",
 )

 # model
+cfg.pretrained_model_path = pretrained_model_path
 model = LazyCall(Qwen2ForCausalLM)(cfg=cfg)

 # datasets
@@ -48,21 +47,28 @@
 dataloader.train = LazyCall(build_nlp_train_loader)(
     dataset=[
         LazyCall(QwenDataset)(
-            path=dataset_path, tokenizer=tokenization.tokenizer
+            path=os.path.join(dataset_path, "train"), tokenizer=tokenization.tokenizer
         )
     ],
 )
+dataloader.test = [
+    LazyCall(build_nlp_test_loader)(
+        dataset=LazyCall(QwenDataset)(
+            path=os.path.join(dataset_path, "test"), tokenizer=tokenization.tokenizer
+        ),
+    ),
+]

 train.update(
     dict(
         output_dir="./sft_result",
         train_micro_batch_size=1,
         test_micro_batch_size=1,
-        train_epoch=3,
+        train_epoch=1,
         train_iter=1,
-        log_period=10,
+        log_period=1,
         warmup_ratio=1 / 3,
-        num_accumulation_steps=8,
+        num_accumulation_steps=1,
         rdma_enabled=False,
         amp=dict(enabled=True),
         activation_checkpoint=dict(enabled=True),
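The SFT config above is a plain LazyConfig module, so the `graph.enabled=...` / `train.dist.*=...` overrides passed on the README command line can also be applied programmatically. A hedged sketch — `LazyConfig.load` / `LazyConfig.apply_overrides` are libai's standard config utilities, not part of this PR, and the override values are illustrative only:

```python
# Sketch only: assumes a working libai installation; values are illustrative.
from libai.config import LazyConfig

cfg = LazyConfig.load("projects/Qwen/configs/qwen_sft.py")
cfg = LazyConfig.apply_overrides(
    cfg, ["graph.enabled=False", "train.train_micro_batch_size=1"]
)
print(cfg.train.output_dir)  # "./sft_result"
```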
diff --git a/projects/Qwen/pipeline.py b/projects/Qwen/pipeline.py
index f9628df4b..75c702a95 100644
--- a/projects/Qwen/pipeline.py
+++ b/projects/Qwen/pipeline.py
@@ -13,6 +13,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from pathlib import Path
+
+import click
+
+from libai.config import try_get_key
+from libai.engine import DefaultTrainer
 from libai.inference.basic import BasePipeline
 from libai.utils import distributed as dist

@@ -67,7 +73,10 @@ def _parse_parameters(self, **pipeline_parameters):

     def preprocess(self, inputs, **kwargs) -> dict:
         # tokenizer encoderW
-        inputs = self.tokenizer.encode(inputs, return_tensors='of', is_global=True)
+        import oneflow as flow
+
+        inputs = flow.tensor(self.tokenizer.encode(inputs, add_bos=True, padding=True))
+
         inputs = {
             "input_ids": inputs,
         }
@@ -75,7 +84,8 @@ def preprocess(self, inputs, **kwargs) -> dict:
         return inputs

     def forward(self, inputs, **kwargs) -> dict:
-        outputs = self.model.generate(inputs["input_ids"], max_length=100, **kwargs)
+        inputs = dist.convert_to_distributed_default_setting(inputs["input_ids"])
+        outputs = self.model.generate(inputs, max_length=50, **kwargs)
         return {"return_ids": outputs}

     def postprocess(self, model_output_dict, **kwargs) -> dict:
@@ -86,17 +96,47 @@ def postprocess(self, model_output_dict, **kwargs) -> dict:
         ]
         return records

-
-if __name__ == "__main__":
-    # ----- load huggingface checkpoint -----
+    def build_tokenizer(self, cfg):
+        tokenizer = None
+        if try_get_key(cfg, "tokenization") is not None:
+            tokenizer_cfg = cfg.tokenization.tokenizer
+            if "vocab_file" not in tokenizer_cfg:
+                # If "vocab_file" does not exist in the tokenizer's config,
+                # set it to default as f"{model_path}/vocab.json"
+                tokenizer_cfg.vocab_file = str(Path(self.model_path).joinpath("vocab.json"))
+            if "merges_file" not in tokenizer_cfg:
+                # If "merges_file" does not exist in the tokenizer's config,
+                # set it to default as f"{model_path}/merges.txt"
+                tokenizer_cfg.merges_file = str(Path(self.model_path).joinpath("merges.txt"))
+            tokenizer = DefaultTrainer.build_tokenizer(cfg)
+        return tokenizer
+
+
+@click.command()
+@click.option(
+    "--config_file",
+    default="projects/Qwen/configs/qwen_config.py",
+    help="Path to the configuration file.",
+)
+@click.option("--model_path", default=None, help="Path to the model checkpoint.")
+@click.option(
+    "--mode",
+    default="libai",
+    help="Mode for the dataloader pipeline, e.g., 'libai' or 'huggingface'.",
+)
+@click.option(
+    "--device", default="cuda", help="Device to run the model on, e.g., 'cuda', 'xpu', 'npu'."
+)
+def main(config_file, model_path, mode, device):
     pipeline = TextGenerationPipeline(
-        "projects/Qwen/config/qwen_config.py",
+        config_file,
         data_parallel=1,
         tensor_parallel=1,
         pipeline_parallel=1,
         pipeline_num_layers=32,
-        model_path="/data/home/xiezipeng/hf_models/Qwen/Qwen1.5-7B",
-        mode="huggingface",
+        model_path=model_path,
+        mode=mode,
+        device=device,
     )

     text = ["给出3点关于保持身体健康的意见。"]

@@ -104,3 +144,7 @@ def postprocess(self, model_output_dict, **kwargs) -> dict:
     output = pipeline(inputs=text)
     if dist.is_main_process():
         print(output)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/projects/Qwen/qwen_dataset.py b/projects/Qwen/qwen_dataset.py
new file mode 100644
index 000000000..c7c412a01
--- /dev/null
+++ b/projects/Qwen/qwen_dataset.py
@@ -0,0 +1,19 @@
+import oneflow as flow
+from oneflow.utils.data import Dataset
+
+from libai.data.structures import DistTensorData, Instance
+
+
+class QwenDataset(Dataset):
+    def __init__(self, path, tokenizer):
+        self.data = flow.load(path)
+        self.tokenizer = tokenizer
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, index):
+        return Instance(
+            input_ids=DistTensorData(self.data[index]["input_ids"]),
+            labels=DistTensorData(self.data[index]["labels"]),
+        )
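`QwenDataset` above simply `flow.load`s a list of per-sample dicts and wraps the two tensors in `DistTensorData`. A minimal sketch of that on-disk layout — the token IDs and path are illustrative only; `-1` marks positions ignored by the loss, mirroring `prepare_alpaca.py` further down:

```python
# Illustrative only: fake token IDs and a throwaway path; shows the dict layout
# that prepare_alpaca.py saves and QwenDataset.__getitem__ reads back.
import oneflow as flow

sample = {
    "input_ids": flow.tensor([151644, 872, 198, 9707, 151645], dtype=flow.int),
    "labels": flow.tensor([-1, -1, -1, 9707, 151645], dtype=flow.int),
}

flow.save([sample], "./alpaca_data/train")   # one dict per training example
train_set = flow.load("./alpaca_data/train")
print(train_set[0]["input_ids"].shape, train_set[0]["labels"].shape)
```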
diff --git a/projects/Qwen/test.py b/projects/Qwen/test.py
deleted file mode 100644
index 8fb4e574a..000000000
--- a/projects/Qwen/test.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# from transformers import Qwen2Tokenizer as T2
-# from projects.Qwen.tokenizer import Qwen2Tokenizer as T1
-
-
-# tokenizer1 = T1(
-#     vocab_file="/data/home/xiezipeng/hf_models/Qwen/Qwen1.5-7B/vocab.json",
-#     merges_file="/data/home/xiezipeng/hf_models/Qwen/Qwen1.5-7B/merges.txt"
-# )
-# tokenizer2 = T2.from_pretrained("/data/home/xiezipeng/hf_models/Qwen/Qwen1.5-7B")
-
-# text = [
-#     "清晨的阳光洒落在树叶上,露珠在叶片上闪烁着晶莹的光泽。微风拂过,树枝微微摇曳,像是在向大自然问好。泥土的芳香弥漫在空气中,一只小鸟欢快地啾啾鸣叫,这是一个美好的新的一天。",
-#     "书本总是向我们敞开怀抱,蕴藏着无穷无尽的智慧和知识。当我打开一本书时,仿佛走进了一个全新的世界。字里行间娓娓道来着作者的心血和思想,让我如痴如醉地沉浸其中,收获了许多启迪和感悟。",
-#     "夜幕低垂,城市璀璨的灯火像是一颗颗明亮的星星。街道上来来往往的行人、川流不息的车辆,构成了一幅生动活泼的都市夜景。霓虹灯的光影闪烁,将这座城市渲染得更加缤纷多彩。",
-#     "The morning dew glistened on the blades of grass, each droplet reflecting the warm rays of the rising sun. A gentle breeze carried the sweet scent of flowers, and birds serenaded the new day with their cheerful melodies. It was a picture-perfect start to what promised to be a beautiful day.",
-#     "As I turned the pages of the worn leather-bound book, I found myself transported to distant lands and bygone eras. The author's words painted vivid scenes that danced across my mind's eye, inviting me to explore the depths of human experience and emotion. Reading has always been an escape, a journey without ever leaving my chair.",
-# ]
-
-# for i in text:
-#     print(i)
-#     res1 = tokenizer1.encode(text)
-#     # res2 = tokenizer2.tokenize(i)
-#     print(res1)
-#     # assert res1 == res2
-
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-# model = AutoModelForCausalLM.from_pretrained("/data/home/xiezipeng/hf_models/Qwen/Qwen1.5-7B")
-t = AutoTokenizer.from_pretrained("/data/home/xiezipeng/hf_models/Qwen/Qwen1.5-7B")
-print(t.encode("<|endoftext|>"))
-print(t.pad_token_id)
-
-# text = "给出3点关于保持身体健康的意见。"
-# input_ids = t.encode(text, return_tensors='pt')
-# res = model.generate(input_ids, max_new_tokens=30)
-# res = t.decode(res[0])
-# print(res)
diff --git a/projects/Qwen/tokenizer.py b/projects/Qwen/tokenizer.py
index ba6e3eb05..cc00fa800 100644
--- a/projects/Qwen/tokenizer.py
+++ b/projects/Qwen/tokenizer.py
@@ -20,12 +20,10 @@
 import unicodedata
 from functools import lru_cache
 from io import open
-from typing import List, Optional
+from typing import Optional

 import regex as re
-import oneflow as flow

-import libai.utils.distributed as dist
 from libai.tokenizer.tokenization_base import PreTrainedTokenizer

 logger = logging.getLogger(__name__)
@@ -36,26 +34,32 @@
 }

 PRETRAINED_VOCAB_FILES_MAP = {
-    "vocab_file": {"qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/vocab.json"},
-    "merges_file": {"qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/merges.txt"},
+    "vocab_file": {
+        "qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/vocab.json"
+    },
+    "merges_file": {
+        "qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/merges.txt"
+    },
 }

 MAX_MODEL_INPUT_SIZES = {"qwen/qwen-tokenizer": 32768}

-PRETOKENIZE_REGEX = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
+PRETOKENIZE_REGEX = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""  # noqa: E501


 @lru_cache()
 def bytes_to_unicode():
     bs = (
-        list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
+        list(range(ord("!"), ord("~") + 1))
+        + list(range(ord("¡"), ord("¬") + 1))
+        + list(range(ord("®"), ord("ÿ") + 1))
     )
     cs = bs[:]
     n = 0
-    for b in range(2**8):
+    for b in range(2 ** 8):
         if b not in bs:
             bs.append(b)
-            cs.append(2**8 + n)
+            cs.append(2 ** 8 + n)
             n += 1
     cs = [chr(n) for n in cs]
     return dict(zip(bs, cs))
@@ -113,11 +117,11 @@ def __init__(
         self.pat = re.compile(PRETOKENIZE_REGEX)

         super(Qwen2Tokenizer, self).__init__(
-            bos_token=bos_token,
-            eos_token=eos_token,
-            unk_token=unk_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
             pad_token=pad_token,
-            **kwargs
+            **kwargs,
         )

     @property
diff --git a/projects/Qwen/utils/data_process.py b/projects/Qwen/utils/data_process.py
deleted file mode 100644
index 4c9d1946f..000000000
--- a/projects/Qwen/utils/data_process.py
+++ /dev/null
@@ -1,141 +0,0 @@
-import os
-import json
-from tqdm import tqdm
-import random
-
-import oneflow as flow
-
-
-IGNORE_TOKEN_ID = -100
-
-data = {
-    'id': 'i6IyJda_0',
-    'conversations': [
-        {'from': 'human', 'value': 'How to tell if a customer segment is well segmented? In 3 bullet points.'},
-        {'from': 'gpt', 'value': '1. Homogeneity \n2. Distinctiveness \n3. Stability'},
-        {'from': 'human', 'value': 'Thank you'},
-        {'from': 'gpt', 'value': 'you are welcome'},
-    ]
-}
-
-
-def qwen2_data_process(
-    sources,
-    tokenizer,
-    system_message: str = "You are a helpful assistant.",
-):
-    max_len = tokenizer.model_max_length
-    roles = {"user": "<|im_start|>user", "assistant": "<|im_start|>assistant"}
-
-    im_start = tokenizer.encode("<|im_start|>")[0]
-    im_end = tokenizer.encode("<|im_end|>")[0]
-    nl_tokens = tokenizer("\n").input_ids
-    _system = tokenizer("system").input_ids + nl_tokens
-    _user = tokenizer("user").input_ids + nl_tokens
-    _assistant = tokenizer("assistant").input_ids + nl_tokens
-
-    # Apply prompt templates
-    input_ids, targets = [], []
-    for i, source in enumerate(sources):
-        if roles[source[0]["from"]] != roles["user"]:
-            source = source[1:]
-
-        input_id, target = [], []
-        system = (
-            [im_start]
-            + _system
-            + tokenizer(system_message).input_ids
-            + [im_end]
-            + nl_tokens
-        )
-        input_id += system
-        target += (
-            [im_start] + [IGNORE_TOKEN_ID] * (len(system) - 3) + [im_end] + nl_tokens
-        )
-        assert len(input_id) == len(target)
-        for j, sentence in enumerate(source):
-            role = roles[sentence["from"]]
-            _input_id = (
-                tokenizer(role).input_ids
-                + nl_tokens
-                + tokenizer(sentence["value"]).input_ids
-                + [im_end]
-                + nl_tokens
-            )
-            input_id += _input_id
-            if role == "<|im_start|>user":
-                _target = (
-                    [im_start]
-                    + [IGNORE_TOKEN_ID] * (len(_input_id) - 3)
-                    + [im_end]
-                    + nl_tokens
-                )
-            elif role == "<|im_start|>assistant":
-                _target = (
-                    [im_start]
-                    + [IGNORE_TOKEN_ID] * (len(tokenizer(role).input_ids) - 1)
-                    + _input_id[len(tokenizer(role).input_ids) : -2]
-                    + [im_end]
-                    + nl_tokens
-                )
-            else:
-                raise NotImplementedError
-            target += _target
-        assert len(input_id) == len(target)
-        input_id += [tokenizer.pad_token_id] * (max_len - len(input_id))
-        target += [IGNORE_TOKEN_ID] * (max_len - len(target))
-        input_ids.append(input_id[:max_len])
-        targets.append(target[:max_len])
-    input_ids = flow.tensor(input_ids, dtype=flow.int, device="cpu")
-    targets = flow.tensor(targets, dtype=flow.long, device="cpu")
-    attention_mask = input_ids.ne(tokenizer.pad_token_id)
-    attention_mask = flow.where(attention_mask, flow.tensor(0.0), flow.tensor(-float("Inf")))
-
-    return dict(
-        input_ids=input_ids[0],
-        labels=targets[0],
-        attention_mask=attention_mask[0],
-    )
-
-
-def preprocess(input_file, targe_file, shuffle=False, tokenizer=None):
-    file = open(input_file, "r")
-    data = json.load(file)
-    if shuffle:
-        random.shuffle(data)
-    train_set = [qwen2_data_process([sample["conversations"]], tokenizer) for sample in tqdm(data)]
-    flow.save(train_set, os.path.join(targe_file, "train_set"))
-    print("training dataset saved in {}\n".format(os.path.join(targe_file, "train_set")))
-
-
-if __name__ == "__main__":
-
-    from projects.mock_transformers.mock_tokenization import Qwen2Tokenizer
-
-    input_file = "/data/home/xiezipeng/libai/projects/Qwen/subset.json"
-    target_file = "/data/home/xiezipeng/libai/projects/Qwen"
-    model_file = "/data/home/xiezipeng/hf_models/Qwen/Qwen1.5-7B"
-
-    tokenizer = Qwen2Tokenizer.from_pretrained(model_file)
-    tokenizer.model_max_length = 2048
-
-    preprocess(
-        input_file=input_file,
-        targe_file=target_file,
-        tokenizer=tokenizer
-    )
-
-    # res = qwen2_data_process([data["conversations"]], tokenizer)
-    # input_ids = res["input_ids"]
-    # labels = res["labels"]
-    # attention_mask = res["attention_mask"]
-
-    # print(input_ids[0])
-    # print(labels)
-    # print(attention_mask)
-
-    # labels = labels[0]
-    # labels[labels==IGNORE_TOKEN_ID] = 151643
-
-    # print("input text:\n",tokenizer.decode(input_ids[0].tolist()))
-    # print("labels text: \n",tokenizer.decode(labels.tolist()))
diff --git a/projects/Qwen/utils/prepare_alpaca.py b/projects/Qwen/utils/prepare_alpaca.py
new file mode 100644
index 000000000..ce422c96b
--- /dev/null
+++ b/projects/Qwen/utils/prepare_alpaca.py
@@ -0,0 +1,161 @@
+"""Implementation derived from https://github.com/tloen/alpaca-lora"""
+
+import copy
+import json
+import math
+import os
+from pathlib import Path
+from typing import Optional
+
+import oneflow as flow
+import requests
+from oneflow.utils.data import random_split
+from tqdm import tqdm
+
+from libai.config import instantiate
+from libai.utils.logger import setup_logger
+from projects.Qwen.configs.qwen_config import tokenization
+
+logger = setup_logger()
+
+
+def prepare(
+    destination_path: Path = Path("./data/libai_xpu_alpaca"),
+    checkpoint_dir: Path = Path("/root/models/Qwen1.5-7B-Chat"),
+    test_split_fraction: float = 0.03865,  # to get exactly 2000 test samples,
+    seed: int = 42,
+    mask_inputs: bool = False,  # as in alpaca-lora
+    data_file_name: str = "alpaca_data_cleaned_archive.json",
+    data_file_url: str = "https://raw.githubusercontent.com/tloen/alpaca-lora/main/alpaca_data_cleaned_archive.json",  # noqa
+    ignore_index: int = -100,
+    max_seq_length: Optional[int] = 512,
+) -> None:
+    """Prepare the Alpaca dataset for instruction tuning.
+    The output is a training and test dataset saved as `train.pt` and `test.pt`,
+    which stores the preprocessed and tokenized prompts and labels.
+    """
+    if max_seq_length is None:
+        with open(os.path.join(checkpoint_dir, "config.json"), "r", encoding="utf-8") as file:
+            config = json.load(file)
+            max_seq_length = config["max_position_embeddings"]
+
+    destination_path.mkdir(parents=True, exist_ok=True)
+    data_file_path = destination_path / data_file_name
+    logger.info("Loading data file...")
+    download_if_missing(data_file_path, data_file_url)
+    with open(data_file_path, "r", encoding="utf-8") as file:
+        data = json.load(file)
+
+    logger.info("Loading tokenizer...")
+    tokenizer = instantiate(tokenization.tokenizer)
+
+    # Partition the dataset into train and test
+    num_of_test_samples = math.floor(test_split_fraction * len(data))
+    num_of_train_samples = len(data) - num_of_test_samples
+    train_set, test_set = random_split(
+        data,
+        [num_of_train_samples, num_of_test_samples],
+        generator=flow.Generator().manual_seed(seed),
+    )
+    train_set, test_set = list(train_set), list(test_set)
+
+    logger.info(f"train has {len(train_set):,} samples")
+    logger.info(f"test has {len(test_set):,} samples")
+
+    logger.info("Processing train split ...")
+    train_set = [
+        prepare_sample(
+            example=sample,
+            tokenizer=tokenizer,
+            max_length=max_seq_length,
+        )
+        for sample in tqdm(train_set)
+    ]
+    flow.save(train_set, destination_path / "train")
+
+    logger.info("Processing test split ...")
+    test_set = [
+        prepare_sample(
+            example=sample,
+            tokenizer=tokenizer,
+            max_length=max_seq_length,
+        )
+        for sample in tqdm(test_set)
+    ]
+    flow.save(test_set, destination_path / "test")
+
+    max_length = max([i["input_ids"].shape[0] for i in train_set])
+    logger.info("Max length of training dataset: {}".format(max_length))
+
+
+def download_if_missing(file_path: Path, file_url: str) -> None:
+    """Downloads the raw json data file and saves it in the given destination."""
+    if file_path.exists() and file_path.stat().st_size > 0:
+        return
+    with open(file_path, "w", encoding="utf-8") as f:
+        f.write(requests.get(file_url).text)
+
+
+def prepare_sample(example: dict, tokenizer, max_length: int) -> dict:
+    """Processes a single sample.
+    Each sample in the dataset consists of:
+    - instruction: A string describing the task
+    - input: A string holding a special input value for the instruction.
+        This only applies to some samples, and in others this is empty.
+    - output: The response string
+    This function processes this data to produce a prompt text and a label for
+    supervised training. The prompt text is formed as a single message including both
+    the instruction and the input. The label/target is the same message but with the
+    response attached.
+    Finally, both the prompt and the label get tokenized. If desired, all tokens
+    in the label that correspond to the original input prompt get masked out (default).
+    """
+    full_prompt = generate_prompt(example)
+    full_prompt_and_response = full_prompt + example["output"]
+
+    prompt = tokenizer.encode(full_prompt, device="cpu")
+    prompt = flow.tensor(prompt, dtype=flow.int, device="cpu")
+    example = tokenizer.encode(full_prompt_and_response, device="cpu")
+    example = flow.tensor(example, dtype=flow.int, device="cpu")
+
+    padding = max_length - example.shape[0]
+    if padding > 0:
+        example = flow.cat((example, flow.zeros(padding, dtype=flow.long) - 1))
+    elif padding < 0:
+        example = example[:max_length]
+    labels = copy.deepcopy(example)
+    labels[: len(prompt)] = -1
+    example_mask = example.ge(0)
+    label_mask = labels.ge(0)
+    example[~example_mask] = 0
+    labels[~label_mask] = -1
+    example = example[:-1]
+    labels = labels[1:]
+    example_mask = flow.where(
+        example_mask, flow.tensor(0, dtype=flow.float), flow.tensor(-float("inf"))
+    )
+    example_mask = example_mask[:-1]
+    return {
+        "input_ids": example,
+        "labels": labels,
+    }
+
+
+def generate_prompt(example: dict) -> str:
+    """Generates a standardized message to prompt the model with an instruction, optional input and a
+    'response' field."""
+
+    if example["input"]:
+        return (
+            "Below is an instruction that describes a task, paired with an input that provides further context. "  # noqa
+            "Write a response that appropriately completes the request.\n\n"
+            f"### Instruction:\n{example['instruction']}\n\n### Input:\n{example['input']}\n\n### Response:"  # noqa
+        )
+    return (
+        "Below is an instruction that describes a task. "
+        "Write a response that appropriately completes the request.\n\n"
+        f"### Instruction:\n{example['instruction']}\n\n### Response:"
+    )
+
+
+if __name__ == "__main__":
+    prepare()
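The preparation script above is also importable; a hedged invocation sketch with explicit arguments — the paths are placeholders, and note that `qwen_sft.py` reads `dataset_path = "./alpaca_data"`, so the destination should usually match. The tokenizer it instantiates comes from `tokenization` in `projects/Qwen/configs/qwen_config.py`, whose `vocab_file`/`merges_file` must be filled in.

```python
# Hypothetical driver; argument values are placeholders, not taken from the PR.
from pathlib import Path

from projects.Qwen.utils.prepare_alpaca import prepare

prepare(
    destination_path=Path("./alpaca_data"),            # where train/ and test/ are written
    checkpoint_dir=Path("/root/models/Qwen1.5-7B-Chat"),
    test_split_fraction=0.03865,
    max_seq_length=512,
)
```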
diff --git a/projects/Qwen/utils/qwen2_loader.py b/projects/Qwen/utils/qwen2_loader.py
index 75cf970c9..8668bdca5 100644
--- a/projects/Qwen/utils/qwen2_loader.py
+++ b/projects/Qwen/utils/qwen2_loader.py
@@ -63,7 +63,7 @@ def _convert_state_dict(self, flow_state_dict, cfg):
             oneflow_state_dict.pop(query_w)
             oneflow_state_dict.pop(key_w)
             oneflow_state_dict.pop(value_w)
-            
+
             query_b = old_key_qkv_b.format(layer_idx, "q_proj")
             key_b = old_key_qkv_b.format(layer_idx, "k_proj")
             value_b = old_key_qkv_b.format(layer_idx, "v_proj")
diff --git a/projects/mock_transformers/mock_tokenization.py b/projects/mock_transformers/mock_tokenization.py
index 22f42e693..b28cebeb9 100644
--- a/projects/mock_transformers/mock_tokenization.py
+++ b/projects/mock_transformers/mock_tokenization.py
@@ -19,7 +19,6 @@
 import oneflow.mock_torch as mock

 from libai.utils import distributed as dist
-import oneflow.mock_torch as mock

 with mock.enable(lazy=True):
     from transformers import (  # noqa
@@ -33,7 +32,6 @@
     from transformers.utils import generic  # noqa
     from transformers.utils.generic import TensorType  # noqa

-
 # ---------------- mock TensorType ------------------
 class TensorType(ExplicitEnum):  # noqa
     PYTORCH = "pt"
@@ -145,5 +143,4 @@ def flow_convert_to_tensors(self, tensor_type=None, prepend_batch_axis=False):
                 self[k] = v.to_global(sbp=sbp, placement=dist.get_layer_placement(0))
         return self

-
 BatchEncoding.convert_to_tensors = flow_convert_to_tensors  # noqa