From 2fb5318e7d5a4800ccbd9cdae4e420cf6eecc257 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=B8=A5=E7=85=A7=E4=B8=9C?= Date: Tue, 26 Sep 2023 07:21:02 +0000 Subject: [PATCH] updated model usage method MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 严照东 --- examples/Aquila2/generate_chat.py | 29 --- flagai/auto_model/auto_loader.py | 63 ++++++- flagai/model/aquila2/modeling_aquila.py | 169 ++++++++++++++++++ flagai/model/aquila2/utils.py | 38 ++++ flagai/model/aquila2_model.py | 224 ------------------------ 5 files changed, 261 insertions(+), 262 deletions(-) delete mode 100755 examples/Aquila2/generate_chat.py mode change 100644 => 100755 flagai/model/aquila2/modeling_aquila.py create mode 100755 flagai/model/aquila2/utils.py delete mode 100755 flagai/model/aquila2_model.py diff --git a/examples/Aquila2/generate_chat.py b/examples/Aquila2/generate_chat.py deleted file mode 100755 index 0cff2e8a..00000000 --- a/examples/Aquila2/generate_chat.py +++ /dev/null @@ -1,29 +0,0 @@ -from flagai.auto_model.auto_loader import AutoLoader - -state_dict = "./checkpoints/" -model_name = 'Aquila2Chat-hf' - -state_dict = "/data2/20230907/" -model_name = 'iter_0205000_hf' - -autoloader = AutoLoader("aquila2", - model_dir=state_dict, - model_name=model_name, - qlora_dir="/data2/yzd/FastChat/checkpoints_out/30bhf_save/checkpoint-4200",) - # qlora_dir='/data2/yzd/FlagAI/examples/Aquila2/checkpoints/qlora/aquila2chat-hf') - # lora_dir='/data2/yzd/FlagAI/examples/Aquila2/checkpoints/lora/aquila2chat-hf') - # ) - -model = autoloader.get_model() -tokenizer = autoloader.get_tokenizer() -# - -test_data = [ - "请介绍下北京有哪些景点。", - "唾面自干是什么意思", - "'我'字有几个笔划", -] - -for text in test_data: - print(model.predict(text, tokenizer=tokenizer)) - diff --git a/flagai/auto_model/auto_loader.py b/flagai/auto_model/auto_loader.py index 7375aff3..b068d308 100755 --- a/flagai/auto_model/auto_loader.py +++ b/flagai/auto_model/auto_loader.py @@ -4,10 +4,10 @@ import importlib import os import copy -from flagai.model.file_utils import _get_model_id +from flagai.model.file_utils import _get_model_id, _get_checkpoint_path, _get_vocab_path, _get_model_files +from flagai.model.aquila2.modeling_aquila import AquilaForCausalLM import torch - class LazyImport(object): def __init__(self, name): @@ -207,8 +207,54 @@ def __init__(self, print(f"All supported models are {list(MODEL_DICT.keys())}") return if task_name == "aquila2": - from flagai.model.aquila2_model import Aquila2Model - from accelerate import init_empty_weights, load_checkpoint_and_dispatch + download_path = os.path.join(model_dir, model_name) + + if not os.path.exists(download_path): + # Try to download from ModelHub + try: + model_id = _get_model_id(model_name) + except: + raise FileNotFoundError("Model name not found in local path and ModelHub") + if model_id and model_id != "null": + model_files = eval(_get_model_files(model_name)) + print("model files:" + str(model_files)) + for file_name in model_files: + if not file_name.endswith("bin"): + _get_vocab_path(download_path, file_name, model_id) + + if os.path.exists( + os.path.join(download_path, 'config.json')): + if os.getenv('ENV_TYPE') == 'deepspeed+mpu': + model_parallel_size = int(os.getenv("MODEL_PARALLEL_SIZE")) + if model_parallel_size > 1: + # if gpus == nums_of_modelhub_models + # can load + # else need to download the pytorch_model.bin and to recut. 
+ model_hub_parallel_size = 0 + for f in model_files: + if "pytorch_model_" in f: + model_hub_parallel_size += 1 + else: + model_parallel_size = 1 + + if "pytorch_model_01.bin" in model_files and model_parallel_size > 1 and model_hub_parallel_size == model_parallel_size: + # Only to download the model slices(megatron-lm). + for file_to_load in model_files: + if "pytorch_model_" in file_to_load: + _get_checkpoint_path(download_path, file_to_load, + model_id) + + elif 'pytorch_model.bin' in model_files: + checkpoint_path = _get_checkpoint_path( + download_path, 'pytorch_model.bin', model_id) + else: + checkpoint_merge = {} + # maybe multi weights files + for file_to_load in model_files: + if "pytorch_model-0" in file_to_load: + _get_checkpoint_path(download_path, file_to_load, + model_id) + if qlora_dir: from transformers import BitsAndBytesConfig quantization_config=BitsAndBytesConfig( @@ -217,11 +263,14 @@ def __init__(self, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch_dtype, ) - model = Aquila2Model.from_pretrain(model_dir, model_name, + + + model = AquilaForCausalLM.from_pretrained(download_path, low_cpu_mem_usage=low_cpu_mem_usage, torch_dtype=torch_dtype, quantization_config=quantization_config) model.eval() + # from accelerate import load_checkpoint_and_dispatch # model = load_checkpoint_and_dispatch( # model, model_dir+model_name, device_map="balanced", no_split_module_classes=["LlamaDecoderLayer"]) if not qlora_dir: @@ -236,13 +285,9 @@ def __init__(self, print("Qlora modules loaded") from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(model_dir+model_name) - #args.cuda_index = 0 - # device = f"cuda" self.model = model self.tokenizer = tokenizer - else: - brief_model_name = MODEL_DICT[model_name][2] model_type = MODEL_DICT[model_name][3] # The dir to save config, vocab and model. 
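
Note: since examples/Aquila2/generate_chat.py is removed by this patch, the updated loading path above is exercised roughly as follows. This is a minimal sketch mirroring the deleted example; the checkpoint directory and model name are placeholders, and qlora_dir stays optional:

    from flagai.auto_model.auto_loader import AutoLoader

    # Placeholder checkpoint location and model name; files missing locally are
    # pulled from ModelHub by the aquila2 branch of AutoLoader shown above.
    state_dict = "./checkpoints/"
    model_name = "Aquila2Chat-hf"

    autoloader = AutoLoader(
        "aquila2",
        model_dir=state_dict,
        model_name=model_name,
        # qlora_dir="./checkpoints/qlora/aquila2chat-hf",  # optional QLoRA adapter dir
    )

    model = autoloader.get_model()          # AquilaForCausalLM, already in eval mode
    tokenizer = autoloader.get_tokenizer()

    test_data = [
        "请介绍下北京有哪些景点。",  # "Please introduce some sights in Beijing."
    ]
    for text in test_data:
        print(model.predict(text, tokenizer=tokenizer))

get_model() returns the AquilaForCausalLM instance built by the branch above, with quantization and adapter weights applied when qlora_dir is given.
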
diff --git a/flagai/model/aquila2/modeling_aquila.py b/flagai/model/aquila2/modeling_aquila.py old mode 100644 new mode 100755 index 17c5c58a..b0731cce --- a/flagai/model/aquila2/modeling_aquila.py +++ b/flagai/model/aquila2/modeling_aquila.py @@ -31,6 +31,17 @@ from transformers.modeling_utils import PreTrainedModel from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings from .configuration_aquila import AquilaConfig +from transformers import ( + LogitsProcessorList, + MinLengthLogitsProcessor, + TopKLogitsWarper, + TemperatureLogitsWarper, + TopPLogitsWarper, + StoppingCriteriaList, + MaxLengthCriteria, + BitsAndBytesConfig, +) +from .utils import * logger = logging.get_logger(__name__) @@ -754,6 +765,57 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @classmethod + def from_pretrain(self, model_dir, model_name, **kwargs): + download_path = os.path.join(model_dir, model_name) + if os.path.exists(download_path): + return self.from_pretrained(download_path, **kwargs) + + + config_path = os.path.join(download_path, "config.json") + checkpoint_path = os.path.join(download_path, "pytorch_model.bin") + from flagai.model.file_utils import _get_model_id + model_id = _get_model_id(model_name) + if model_id and model_id != "null": + model_files = eval(_get_model_files(model_name)) + print("model files:" + str(model_files)) + for file_name in model_files: + if not file_name.endswith("bin"): + _get_vocab_path(download_path, file_name, model_id) + + if os.path.exists( + os.path.join(download_path, 'config.json')): + if os.getenv('ENV_TYPE') == 'deepspeed+mpu': + model_parallel_size = int(os.getenv("MODEL_PARALLEL_SIZE")) + if model_parallel_size > 1: + # if gpus == nums_of_modelhub_models + # can load + # else need to download the pytorch_model.bin and to recut. + model_hub_parallel_size = 0 + for f in model_files: + if "pytorch_model_" in f: + model_hub_parallel_size += 1 + else: + model_parallel_size = 1 + + if "pytorch_model_01.bin" in model_files and model_parallel_size > 1 and model_hub_parallel_size == model_parallel_size: + # Only to download the model slices(megatron-lm). 
+ for file_to_load in model_files: + if "pytorch_model_" in file_to_load: + _get_checkpoint_path(download_path, file_to_load, + model_id) + + elif 'pytorch_model.bin' in model_files: + checkpoint_path = _get_checkpoint_path( + download_path, 'pytorch_model.bin', model_id) + else: + checkpoint_merge = {} + # maybe multi weights files + for file_to_load in model_files: + if "pytorch_model-0" in file_to_load: + _get_checkpoint_path(download_path, file_to_load, + model_id) + def get_input_embeddings(self): return self.model.embed_tokens @@ -905,6 +967,113 @@ def _reorder_cache(past_key_values, beam_idx): ) return reordered_past + def predict(self, text, tokenizer=None, + max_gen_len=200, top_p=0.95, + seed=1234, topk=100, + temperature=0.9, + sft=True, convo_template = "aquila-chat", + device = "cuda"): + + vocab = tokenizer.get_vocab() + #device = device + id2word = {v:k for k, v in vocab.items()} + + + set_random_seed(seed) + if temperature == 0: + topk = 1 + temperature = 1.0 + if sft: + tokens = covert_prompt_to_input_ids_with_history(text, history=[], tokenizer=tokenizer, max_token=2048, convo_template=convo_template) + tokens = torch.tensor(tokens)[None,].to(device) + else : + tokens = tokenizer.encode_plus(text)["input_ids"] + print(tokenizer.decode(tokens)) + tokens = torch.tensor(tokens)[None,].to(device) + input_length = len(tokens[0]) + with torch.no_grad(): + + # instantiate logits processors + logits_processor = LogitsProcessorList( + [ + MinLengthLogitsProcessor(1, eos_token_id=100007), + ] + ) + # instantiate logits processors + logits_warper = LogitsProcessorList( + [ + TopPLogitsWarper(top_p), + TopKLogitsWarper(topk), + TemperatureLogitsWarper(temperature), + + ] + ) + + stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=input_length + max_gen_len)]) + out = self.sample( + tokens, + logits_processor=logits_processor, + logits_warper=logits_warper, + stopping_criteria=stopping_criteria, + return_dict_in_generate=True, + output_scores=True, + ) + + + # print(out) + out_ids = out["sequences"][0][input_length:].cpu().numpy() + + out_scores = out["scores"] + + out_scores = torch.cat(out_scores, dim=0) + out_scores = torch.nn.functional.softmax(out_scores, dim=-1).cpu().numpy() + + probs = [] + for i in range(len(out_ids)): + probs.append(float(out_scores[i][out_ids[i]])) + + # print(f"probs is {probs}") + + convert_tokens = [] + for t in out_ids: + if t == 100006: + convert_tokens.append("[CLS]") + else : + convert_tokens.append(id2word.get(t, "[unkonwn_token]")) + + out_text = tokenizer.decode(out_ids.tolist()) + + + out = out_text + + if "###" in out: + special_index = out.index("###") + out = out[: special_index] + token_length = len(tokenizer.encode_plus(out)["input_ids"]) + convert_tokens = convert_tokens[:token_length] + probs = probs[:token_length] + + if "[UNK]" in out: + special_index = out.index("[UNK]") + out = out[:special_index] + token_length = len(tokenizer.encode_plus(out)["input_ids"]) + convert_tokens = convert_tokens[:token_length] + probs = probs[:token_length] + + if "" in out: + special_index = out.index("") + out = out[: special_index] + token_length = len(tokenizer.encode_plus(out)["input_ids"]) + convert_tokens = convert_tokens[:token_length] + probs = probs[:token_length] + + if len(out) > 0 and out[0] == " ": + out = out[1:] + + convert_tokens = convert_tokens[1:] + probs = probs[1:] + return out + @add_start_docstrings( """ The LLaMa Model transformer with a sequence classification head on top (linear layer). 
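
For context on the predict() method added to AquilaForCausalLM above: the sampling stack it assembles can be reproduced standalone as below. This is a sketch mirroring predict()'s defaults (top_p=0.95, top_k=100, temperature=0.9, EOS id 100007, max_gen_len=200) and its raw sft=False tokenization path; the checkpoint path is a placeholder and a CUDA device is assumed:

    import torch
    from transformers import (
        AutoTokenizer,
        LogitsProcessorList, MinLengthLogitsProcessor,
        TopPLogitsWarper, TopKLogitsWarper, TemperatureLogitsWarper,
        StoppingCriteriaList, MaxLengthCriteria,
    )
    from flagai.model.aquila2.modeling_aquila import AquilaForCausalLM

    ckpt = "./checkpoints/Aquila2Chat-hf"  # placeholder path
    model = AquilaForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16).cuda().eval()
    tokenizer = AutoTokenizer.from_pretrained(ckpt)

    # Raw prompt (the sft=False branch of predict); the sft=True path instead builds
    # the chat prompt with covert_prompt_to_input_ids_with_history from utils.py.
    text = "唾面自干是什么意思"  # "What does 'tuo mian zi gan' mean?"
    tokens = torch.tensor(tokenizer.encode_plus(text)["input_ids"])[None, :].cuda()

    logits_processor = LogitsProcessorList(
        [MinLengthLogitsProcessor(1, eos_token_id=100007)]
    )
    logits_warper = LogitsProcessorList([
        TopPLogitsWarper(0.95),
        TopKLogitsWarper(100),
        TemperatureLogitsWarper(0.9),
    ])
    stopping_criteria = StoppingCriteriaList(
        [MaxLengthCriteria(max_length=tokens.shape[1] + 200)]
    )

    with torch.no_grad():
        out = model.sample(
            tokens,
            logits_processor=logits_processor,
            logits_warper=logits_warper,
            stopping_criteria=stopping_criteria,
            return_dict_in_generate=True,
            output_scores=True,
        )
    print(tokenizer.decode(out["sequences"][0][tokens.shape[1]:].tolist()))

predict() additionally post-processes the decoded text (truncating at "###", "[UNK]", and the session end marker), so its return value is shorter than the raw decode above.
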
diff --git a/flagai/model/aquila2/utils.py b/flagai/model/aquila2/utils.py new file mode 100755 index 00000000..8e3de5f9 --- /dev/null +++ b/flagai/model/aquila2/utils.py @@ -0,0 +1,38 @@ +import random +import numpy as np +import torch +from fastchat.conversation import get_conv_template + +def set_random_seed(seed): + """Set random seed for reproducability.""" + if seed is not None and seed > 0: + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + + + +def covert_prompt_to_input_ids_with_history(text, history, tokenizer, max_token, convo_template="aquila-chat"): + # aquila-chat as default + conv = get_conv_template(convo_template) + + conv.append_message(conv.roles[1], None) + conv.append_message(conv.roles[0], text) + + example = tokenizer.encode_plus(f"{conv.get_prompt()} ", None, max_length=None)['input_ids'] + + while(len(history) > 0 and (len(example) < max_token)): + tmp = history.pop() + if tmp[0] == 'ASSISTANT': + conv.append_message(conv.roles[1], tmp[1]) + else: + conv.append_message(conv.roles[0], tmp[1]) + example = tokenizer.encode_plus(f"{conv.get_prompt()} ", None, max_length=None)['input_ids'] + + if len(example) >= max_token: + conv.messages.pop() + conv.messages = conv.messages[::-1] + print('model in:', conv.get_prompt()) + example = tokenizer.encode_plus(f"{conv.get_prompt()} ", None, max_length=None)['input_ids'] + + return example \ No newline at end of file diff --git a/flagai/model/aquila2_model.py b/flagai/model/aquila2_model.py deleted file mode 100755 index 48cb3a53..00000000 --- a/flagai/model/aquila2_model.py +++ /dev/null @@ -1,224 +0,0 @@ -from transformers import AutoTokenizer, LlamaForCausalLM , AutoModelForCausalLM -import random -import numpy as np -import torch -from utils import covert_prompt_to_input_ids_with_history -import os -from flagai.model.file_utils import _get_model_id, _get_checkpoint_path, _get_vocab_path, _get_model_files -from transformers import ( - LogitsProcessorList, - MinLengthLogitsProcessor, - TopKLogitsWarper, - TemperatureLogitsWarper, - TopPLogitsWarper, - StoppingCriteriaList, - MaxLengthCriteria, - BitsAndBytesConfig, -) -from fastchat.conversation import get_conv_template - - -def covert_prompt_to_input_ids_with_history(text, history, tokenizer, max_token, convo_template="aquila-chat"): - # aquila-chat as default - conv = get_conv_template(convo_template) - - conv.append_message(conv.roles[1], None) - conv.append_message(conv.roles[0], text) - - example = tokenizer.encode_plus(f"{conv.get_prompt()} ", None, max_length=None)['input_ids'] - - while(len(history) > 0 and (len(example) < max_token)): - tmp = history.pop() - if tmp[0] == 'ASSISTANT': - conv.append_message(conv.roles[1], tmp[1]) - else: - conv.append_message(conv.roles[0], tmp[1]) - example = tokenizer.encode_plus(f"{conv.get_prompt()} ", None, max_length=None)['input_ids'] - - if len(example) >= max_token: - conv.messages.pop() - conv.messages = conv.messages[::-1] - print('model in:', conv.get_prompt()) - example = tokenizer.encode_plus(f"{conv.get_prompt()} ", None, max_length=None)['input_ids'] - - return example - - -def set_random_seed(seed): - """Set random seed for reproducability.""" - if seed is not None and seed > 0: - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - - -class Aquila2Model(LlamaForCausalLM): - - @classmethod - def from_pretrain(self, model_dir, model_name, **kwargs): - download_path = os.path.join(model_dir, model_name) - if os.path.exists(download_path): - return self.from_pretrained(download_path, 
**kwargs) - - - config_path = os.path.join(download_path, "config.json") - checkpoint_path = os.path.join(download_path, "pytorch_model.bin") - from flagai.model.file_utils import _get_model_id - model_id = _get_model_id(model_name) - if model_id and model_id != "null": - model_files = eval(_get_model_files(model_name)) - print("model files:" + str(model_files)) - for file_name in model_files: - if not file_name.endswith("bin"): - _get_vocab_path(download_path, file_name, model_id) - - if os.path.exists( - os.path.join(download_path, 'config.json')): - if os.getenv('ENV_TYPE') == 'deepspeed+mpu': - model_parallel_size = int(os.getenv("MODEL_PARALLEL_SIZE")) - if model_parallel_size > 1: - # if gpus == nums_of_modelhub_models - # can load - # else need to download the pytorch_model.bin and to recut. - model_hub_parallel_size = 0 - for f in model_files: - if "pytorch_model_" in f: - model_hub_parallel_size += 1 - else: - model_parallel_size = 1 - - if "pytorch_model_01.bin" in model_files and model_parallel_size > 1 and model_hub_parallel_size == model_parallel_size: - # Only to download the model slices(megatron-lm). - for file_to_load in model_files: - if "pytorch_model_" in file_to_load: - _get_checkpoint_path(download_path, file_to_load, - model_id) - - elif 'pytorch_model.bin' in model_files: - checkpoint_path = _get_checkpoint_path( - download_path, 'pytorch_model.bin', model_id) - else: - checkpoint_merge = {} - # maybe multi weights files - for file_to_load in model_files: - if "pytorch_model-0" in file_to_load: - _get_checkpoint_path(download_path, file_to_load, - model_id) - # checkpoint_to_load = torch.load(os.path.join( - # download_path, file_to_load), - # map_location="cpu") - # for k, v in checkpoint_to_load.items(): - # checkpoint_merge[k] = v - # # save all parameters - # torch.save( - # checkpoint_merge, - # os.path.join(download_path, "pytorch_model.bin")) - - - def predict(self, text, tokenizer=None, - max_gen_len=200, top_p=0.95, - seed=1234, topk=100, - temperature=0.9, - sft=True, convo_template = "aquila-chat", - device = "cuda"): - - vocab = tokenizer.get_vocab() - #device = device - id2word = {v:k for k, v in vocab.items()} - - - set_random_seed(seed) - if temperature == 0: - topk = 1 - temperature = 1.0 - if sft: - tokens = covert_prompt_to_input_ids_with_history(text, history=[], tokenizer=tokenizer, max_token=2048, convo_template=convo_template) - tokens = torch.tensor(tokens)[None,].to(device) - else : - tokens = tokenizer.encode_plus(text)["input_ids"] - print(tokenizer.decode(tokens)) - tokens = torch.tensor(tokens)[None,].to(device) - input_length = len(tokens[0]) - with torch.no_grad(): - - # instantiate logits processors - logits_processor = LogitsProcessorList( - [ - MinLengthLogitsProcessor(1, eos_token_id=100007), - ] - ) - # instantiate logits processors - logits_warper = LogitsProcessorList( - [ - TopPLogitsWarper(top_p), - TopKLogitsWarper(topk), - TemperatureLogitsWarper(temperature), - - ] - ) - - stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=input_length + max_gen_len)]) - out = self.sample( - tokens, - logits_processor=logits_processor, - logits_warper=logits_warper, - stopping_criteria=stopping_criteria, - return_dict_in_generate=True, - output_scores=True, - ) - - - # print(out) - out_ids = out["sequences"][0][input_length:].cpu().numpy() - - out_scores = out["scores"] - - out_scores = torch.cat(out_scores, dim=0) - out_scores = torch.nn.functional.softmax(out_scores, dim=-1).cpu().numpy() - - probs = [] - for i in 
range(len(out_ids)): - probs.append(float(out_scores[i][out_ids[i]])) - - # print(f"probs is {probs}") - - convert_tokens = [] - for t in out_ids: - if t == 100006: - convert_tokens.append("[CLS]") - else : - convert_tokens.append(id2word.get(t, "[unkonwn_token]")) - - out_text = tokenizer.decode(out_ids.tolist()) - - - out = out_text - - if "###" in out: - special_index = out.index("###") - out = out[: special_index] - token_length = len(tokenizer.encode_plus(out)["input_ids"]) - convert_tokens = convert_tokens[:token_length] - probs = probs[:token_length] - - if "[UNK]" in out: - special_index = out.index("[UNK]") - out = out[:special_index] - token_length = len(tokenizer.encode_plus(out)["input_ids"]) - convert_tokens = convert_tokens[:token_length] - probs = probs[:token_length] - - if "" in out: - special_index = out.index("") - out = out[: special_index] - token_length = len(tokenizer.encode_plus(out)["input_ids"]) - convert_tokens = convert_tokens[:token_length] - probs = probs[:token_length] - - if len(out) > 0 and out[0] == " ": - out = out[1:] - - convert_tokens = convert_tokens[1:] - probs = probs[1:] - return out - # return out, convert_tokens, probs
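
The prompt-history helper that predict() relies on now lives in flagai/model/aquila2/utils.py rather than in the deleted flagai/model/aquila2_model.py. A sketch of calling it directly, assuming history entries are (role, text) pairs with 'ASSISTANT' marking model turns and a placeholder tokenizer path:

    from transformers import AutoTokenizer
    from flagai.model.aquila2.utils import covert_prompt_to_input_ids_with_history

    tokenizer = AutoTokenizer.from_pretrained("./checkpoints/Aquila2Chat-hf")  # placeholder path

    # Oldest turn first; the helper pops from the end of the list, so the most
    # recent exchanges are kept when the prompt approaches max_token.
    history = [
        ("USER", "北京有什么好玩的地方？"),              # "What is fun to visit in Beijing?"
        ("ASSISTANT", "故宫、长城和颐和园都值得一去。"),  # "The Forbidden City, Great Wall ..."
    ]

    input_ids = covert_prompt_to_input_ids_with_history(
        "哪个景点最适合带小孩？",         # "Which sight is best with kids?"
        history=history,                  # note: the helper mutates this list
        tokenizer=tokenizer,
        max_token=2048,
        convo_template="aquila-chat",     # fastchat template used by predict()
    )
    print(f"{len(input_ids)} prompt tokens")
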