From 2fb5318e7d5a4800ccbd9cdae4e420cf6eecc257 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=B8=A5=E7=85=A7=E4=B8=9C?= Date: Tue, 26 Sep 2023 07:21:02 +0000 Subject: [PATCH] updated model usage method MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 严照东 --- examples/Aquila2/generate_chat.py | 29 --- flagai/auto_model/auto_loader.py | 63 ++++++- flagai/model/aquila2/modeling_aquila.py | 169 ++++++++++++++++++ flagai/model/aquila2/utils.py | 38 ++++ flagai/model/aquila2_model.py | 224 ------------------------ 5 files changed, 261 insertions(+), 262 deletions(-) delete mode 100755 examples/Aquila2/generate_chat.py mode change 100644 => 100755 flagai/model/aquila2/modeling_aquila.py create mode 100755 flagai/model/aquila2/utils.py delete mode 100755 flagai/model/aquila2_model.py diff --git a/examples/Aquila2/generate_chat.py b/examples/Aquila2/generate_chat.py deleted file mode 100755 index 0cff2e8a..00000000 --- a/examples/Aquila2/generate_chat.py +++ /dev/null @@ -1,29 +0,0 @@ -from flagai.auto_model.auto_loader import AutoLoader - -state_dict = "./checkpoints/" -model_name = 'Aquila2Chat-hf' - -state_dict = "/data2/20230907/" -model_name = 'iter_0205000_hf' - -autoloader = AutoLoader("aquila2", - model_dir=state_dict, - model_name=model_name, - qlora_dir="/data2/yzd/FastChat/checkpoints_out/30bhf_save/checkpoint-4200",) - # qlora_dir='/data2/yzd/FlagAI/examples/Aquila2/checkpoints/qlora/aquila2chat-hf') - # lora_dir='/data2/yzd/FlagAI/examples/Aquila2/checkpoints/lora/aquila2chat-hf') - # ) - -model = autoloader.get_model() -tokenizer = autoloader.get_tokenizer() -# - -test_data = [ - "请介绍下北京有哪些景点。", - "唾面自干是什么意思", - "'我'字有几个笔划", -] - -for text in test_data: - print(model.predict(text, tokenizer=tokenizer)) - diff --git a/flagai/auto_model/auto_loader.py b/flagai/auto_model/auto_loader.py index 7375aff3..b068d308 100755 --- a/flagai/auto_model/auto_loader.py +++ b/flagai/auto_model/auto_loader.py @@ -4,10 +4,10 @@ import importlib import os import copy -from flagai.model.file_utils import _get_model_id +from flagai.model.file_utils import _get_model_id, _get_checkpoint_path, _get_vocab_path, _get_model_files +from flagai.model.aquila2.modeling_aquila import AquilaForCausalLM import torch - class LazyImport(object): def __init__(self, name): @@ -207,8 +207,54 @@ def __init__(self, print(f"All supported models are {list(MODEL_DICT.keys())}") return if task_name == "aquila2": - from flagai.model.aquila2_model import Aquila2Model - from accelerate import init_empty_weights, load_checkpoint_and_dispatch + download_path = os.path.join(model_dir, model_name) + + if not os.path.exists(download_path): + # Try to download from ModelHub + try: + model_id = _get_model_id(model_name) + except: + raise FileNotFoundError("Model name not found in local path and ModelHub") + if model_id and model_id != "null": + model_files = eval(_get_model_files(model_name)) + print("model files:" + str(model_files)) + for file_name in model_files: + if not file_name.endswith("bin"): + _get_vocab_path(download_path, file_name, model_id) + + if os.path.exists( + os.path.join(download_path, 'config.json')): + if os.getenv('ENV_TYPE') == 'deepspeed+mpu': + model_parallel_size = int(os.getenv("MODEL_PARALLEL_SIZE")) + if model_parallel_size > 1: + # if gpus == nums_of_modelhub_models + # can load + # else need to download the pytorch_model.bin and to recut. 
+ model_hub_parallel_size = 0 + for f in model_files: + if "pytorch_model_" in f: + model_hub_parallel_size += 1 + else: + model_parallel_size = 1 + + if "pytorch_model_01.bin" in model_files and model_parallel_size > 1 and model_hub_parallel_size == model_parallel_size: + # Only to download the model slices(megatron-lm). + for file_to_load in model_files: + if "pytorch_model_" in file_to_load: + _get_checkpoint_path(download_path, file_to_load, + model_id) + + elif 'pytorch_model.bin' in model_files: + checkpoint_path = _get_checkpoint_path( + download_path, 'pytorch_model.bin', model_id) + else: + checkpoint_merge = {} + # maybe multi weights files + for file_to_load in model_files: + if "pytorch_model-0" in file_to_load: + _get_checkpoint_path(download_path, file_to_load, + model_id) + if qlora_dir: from transformers import BitsAndBytesConfig quantization_config=BitsAndBytesConfig( @@ -217,11 +263,14 @@ def __init__(self, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch_dtype, ) - model = Aquila2Model.from_pretrain(model_dir, model_name, + + + model = AquilaForCausalLM.from_pretrained(download_path, low_cpu_mem_usage=low_cpu_mem_usage, torch_dtype=torch_dtype, quantization_config=quantization_config) model.eval() + # from accelerate import load_checkpoint_and_dispatch # model = load_checkpoint_and_dispatch( # model, model_dir+model_name, device_map="balanced", no_split_module_classes=["LlamaDecoderLayer"]) if not qlora_dir: @@ -236,13 +285,9 @@ def __init__(self, print("Qlora modules loaded") from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(model_dir+model_name) - #args.cuda_index = 0 - # device = f"cuda" self.model = model self.tokenizer = tokenizer - else: - brief_model_name = MODEL_DICT[model_name][2] model_type = MODEL_DICT[model_name][3] # The dir to save config, vocab and model. 
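
Note: since examples/Aquila2/generate_chat.py is removed by this patch, the updated loading path above is exercised roughly as follows. This is a minimal sketch mirroring the deleted example; the checkpoint directory and model name are placeholders, and qlora_dir stays optional:

    from flagai.auto_model.auto_loader import AutoLoader

    # Placeholder checkpoint location and model name; files missing locally are
    # pulled from ModelHub by the aquila2 branch of AutoLoader shown above.
    state_dict = "./checkpoints/"
    model_name = "Aquila2Chat-hf"

    autoloader = AutoLoader(
        "aquila2",
        model_dir=state_dict,
        model_name=model_name,
        # qlora_dir="./checkpoints/qlora/aquila2chat-hf",  # optional QLoRA adapter dir
    )

    model = autoloader.get_model()          # AquilaForCausalLM, already in eval mode
    tokenizer = autoloader.get_tokenizer()

    test_data = [
        "请介绍下北京有哪些景点。",  # "Please introduce some sights in Beijing."
    ]
    for text in test_data:
        print(model.predict(text, tokenizer=tokenizer))

get_model() returns the AquilaForCausalLM instance built by the branch above, with quantization and adapter weights applied when qlora_dir is given.
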
diff --git a/flagai/model/aquila2/modeling_aquila.py b/flagai/model/aquila2/modeling_aquila.py old mode 100644 new mode 100755 index 17c5c58a..b0731cce --- a/flagai/model/aquila2/modeling_aquila.py +++ b/flagai/model/aquila2/modeling_aquila.py @@ -31,6 +31,17 @@ from transformers.modeling_utils import PreTrainedModel from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings from .configuration_aquila import AquilaConfig +from transformers import ( + LogitsProcessorList, + MinLengthLogitsProcessor, + TopKLogitsWarper, + TemperatureLogitsWarper, + TopPLogitsWarper, + StoppingCriteriaList, + MaxLengthCriteria, + BitsAndBytesConfig, +) +from .utils import * logger = logging.get_logger(__name__) @@ -754,6 +765,57 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @classmethod + def from_pretrain(self, model_dir, model_name, **kwargs): + download_path = os.path.join(model_dir, model_name) + if os.path.exists(download_path): + return self.from_pretrained(download_path, **kwargs) + + + config_path = os.path.join(download_path, "config.json") + checkpoint_path = os.path.join(download_path, "pytorch_model.bin") + from flagai.model.file_utils import _get_model_id + model_id = _get_model_id(model_name) + if model_id and model_id != "null": + model_files = eval(_get_model_files(model_name)) + print("model files:" + str(model_files)) + for file_name in model_files: + if not file_name.endswith("bin"): + _get_vocab_path(download_path, file_name, model_id) + + if os.path.exists( + os.path.join(download_path, 'config.json')): + if os.getenv('ENV_TYPE') == 'deepspeed+mpu': + model_parallel_size = int(os.getenv("MODEL_PARALLEL_SIZE")) + if model_parallel_size > 1: + # if gpus == nums_of_modelhub_models + # can load + # else need to download the pytorch_model.bin and to recut. + model_hub_parallel_size = 0 + for f in model_files: + if "pytorch_model_" in f: + model_hub_parallel_size += 1 + else: + model_parallel_size = 1 + + if "pytorch_model_01.bin" in model_files and model_parallel_size > 1 and model_hub_parallel_size == model_parallel_size: + # Only to download the model slices(megatron-lm). 
+ for file_to_load in model_files: + if "pytorch_model_" in file_to_load: + _get_checkpoint_path(download_path, file_to_load, + model_id) + + elif 'pytorch_model.bin' in model_files: + checkpoint_path = _get_checkpoint_path( + download_path, 'pytorch_model.bin', model_id) + else: + checkpoint_merge = {} + # maybe multi weights files + for file_to_load in model_files: + if "pytorch_model-0" in file_to_load: + _get_checkpoint_path(download_path, file_to_load, + model_id) + def get_input_embeddings(self): return self.model.embed_tokens @@ -905,6 +967,113 @@ def _reorder_cache(past_key_values, beam_idx): ) return reordered_past + def predict(self, text, tokenizer=None, + max_gen_len=200, top_p=0.95, + seed=1234, topk=100, + temperature=0.9, + sft=True, convo_template = "aquila-chat", + device = "cuda"): + + vocab = tokenizer.get_vocab() + #device = device + id2word = {v:k for k, v in vocab.items()} + + + set_random_seed(seed) + if temperature == 0: + topk = 1 + temperature = 1.0 + if sft: + tokens = covert_prompt_to_input_ids_with_history(text, history=[], tokenizer=tokenizer, max_token=2048, convo_template=convo_template) + tokens = torch.tensor(tokens)[None,].to(device) + else : + tokens = tokenizer.encode_plus(text)["input_ids"] + print(tokenizer.decode(tokens)) + tokens = torch.tensor(tokens)[None,].to(device) + input_length = len(tokens[0]) + with torch.no_grad(): + + # instantiate logits processors + logits_processor = LogitsProcessorList( + [ + MinLengthLogitsProcessor(1, eos_token_id=100007), + ] + ) + # instantiate logits processors + logits_warper = LogitsProcessorList( + [ + TopPLogitsWarper(top_p), + TopKLogitsWarper(topk), + TemperatureLogitsWarper(temperature), + + ] + ) + + stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=input_length + max_gen_len)]) + out = self.sample( + tokens, + logits_processor=logits_processor, + logits_warper=logits_warper, + stopping_criteria=stopping_criteria, + return_dict_in_generate=True, + output_scores=True, + ) + + + # print(out) + out_ids = out["sequences"][0][input_length:].cpu().numpy() + + out_scores = out["scores"] + + out_scores = torch.cat(out_scores, dim=0) + out_scores = torch.nn.functional.softmax(out_scores, dim=-1).cpu().numpy() + + probs = [] + for i in range(len(out_ids)): + probs.append(float(out_scores[i][out_ids[i]])) + + # print(f"probs is {probs}") + + convert_tokens = [] + for t in out_ids: + if t == 100006: + convert_tokens.append("[CLS]") + else : + convert_tokens.append(id2word.get(t, "[unkonwn_token]")) + + out_text = tokenizer.decode(out_ids.tolist()) + + + out = out_text + + if "###" in out: + special_index = out.index("###") + out = out[: special_index] + token_length = len(tokenizer.encode_plus(out)["input_ids"]) + convert_tokens = convert_tokens[:token_length] + probs = probs[:token_length] + + if "[UNK]" in out: + special_index = out.index("[UNK]") + out = out[:special_index] + token_length = len(tokenizer.encode_plus(out)["input_ids"]) + convert_tokens = convert_tokens[:token_length] + probs = probs[:token_length] + + if "" in out: + special_index = out.index("") + out = out[: special_index] + token_length = len(tokenizer.encode_plus(out)["input_ids"]) + convert_tokens = convert_tokens[:token_length] + probs = probs[:token_length] + + if len(out) > 0 and out[0] == " ": + out = out[1:] + + convert_tokens = convert_tokens[1:] + probs = probs[1:] + return out + @add_start_docstrings( """ The LLaMa Model transformer with a sequence classification head on top (linear layer). 
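
For context on the predict() method added to AquilaForCausalLM above: the sampling stack it assembles can be reproduced standalone as below. This is a sketch mirroring predict()'s defaults (top_p=0.95, top_k=100, temperature=0.9, EOS id 100007, max_gen_len=200) and its raw sft=False tokenization path; the checkpoint path is a placeholder and a CUDA device is assumed:

    import torch
    from transformers import (
        AutoTokenizer,
        LogitsProcessorList, MinLengthLogitsProcessor,
        TopPLogitsWarper, TopKLogitsWarper, TemperatureLogitsWarper,
        StoppingCriteriaList, MaxLengthCriteria,
    )
    from flagai.model.aquila2.modeling_aquila import AquilaForCausalLM

    ckpt = "./checkpoints/Aquila2Chat-hf"  # placeholder path
    model = AquilaForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16).cuda().eval()
    tokenizer = AutoTokenizer.from_pretrained(ckpt)

    # Raw prompt (the sft=False branch of predict); the sft=True path instead builds
    # the chat prompt with covert_prompt_to_input_ids_with_history from utils.py.
    text = "唾面自干是什么意思"  # "What does 'tuo mian zi gan' mean?"
    tokens = torch.tensor(tokenizer.encode_plus(text)["input_ids"])[None, :].cuda()

    logits_processor = LogitsProcessorList(
        [MinLengthLogitsProcessor(1, eos_token_id=100007)]
    )
    logits_warper = LogitsProcessorList([
        TopPLogitsWarper(0.95),
        TopKLogitsWarper(100),
        TemperatureLogitsWarper(0.9),
    ])
    stopping_criteria = StoppingCriteriaList(
        [MaxLengthCriteria(max_length=tokens.shape[1] + 200)]
    )

    with torch.no_grad():
        out = model.sample(
            tokens,
            logits_processor=logits_processor,
            logits_warper=logits_warper,
            stopping_criteria=stopping_criteria,
            return_dict_in_generate=True,
            output_scores=True,
        )
    print(tokenizer.decode(out["sequences"][0][tokens.shape[1]:].tolist()))

predict() additionally post-processes the decoded text (truncating at "###", "[UNK]", and the session end marker), so its return value is shorter than the raw decode above.
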
diff --git a/flagai/model/aquila2/utils.py b/flagai/model/aquila2/utils.py new file mode 100755 index 00000000..8e3de5f9 --- /dev/null +++ b/flagai/model/aquila2/utils.py @@ -0,0 +1,38 @@ +import random +import numpy as np +import torch +from fastchat.conversation import get_conv_template + +def set_random_seed(seed): + """Set random seed for reproducability.""" + if seed is not None and seed > 0: + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + + + +def covert_prompt_to_input_ids_with_history(text, history, tokenizer, max_token, convo_template="aquila-chat"): + # aquila-chat as default + conv = get_conv_template(convo_template) + + conv.append_message(conv.roles[1], None) + conv.append_message(conv.roles[0], text) + + example = tokenizer.encode_plus(f"{conv.get_prompt()} ", None, max_length=None)['input_ids'] + + while(len(history) > 0 and (len(example) < max_token)): + tmp = history.pop() + if tmp[0] == 'ASSISTANT': + conv.append_message(conv.roles[1], tmp[1]) + else: + conv.append_message(conv.roles[0], tmp[1]) + example = tokenizer.encode_plus(f"{conv.get_prompt()} ", None, max_length=None)['input_ids'] + + if len(example) >= max_token: + conv.messages.pop() + conv.messages = conv.messages[::-1] + print('model in:', conv.get_prompt()) + example = tokenizer.encode_plus(f"{conv.get_prompt()} ", None, max_length=None)['input_ids'] + + return example \ No newline at end of file diff --git a/flagai/model/aquila2_model.py b/flagai/model/aquila2_model.py deleted file mode 100755 index 48cb3a53..00000000 --- a/flagai/model/aquila2_model.py +++ /dev/null @@ -1,224 +0,0 @@ -from transformers import AutoTokenizer, LlamaForCausalLM , AutoModelForCausalLM -import random -import numpy as np -import torch -from utils import covert_prompt_to_input_ids_with_history -import os -from flagai.model.file_utils import _get_model_id, _get_checkpoint_path, _get_vocab_path, _get_model_files -from transformers import ( - LogitsProcessorList, - MinLengthLogitsProcessor, - TopKLogitsWarper, - TemperatureLogitsWarper, - TopPLogitsWarper, - StoppingCriteriaList, - MaxLengthCriteria, - BitsAndBytesConfig, -) -from fastchat.conversation import get_conv_template - - -def covert_prompt_to_input_ids_with_history(text, history, tokenizer, max_token, convo_template="aquila-chat"): - # aquila-chat as default - conv = get_conv_template(convo_template) - - conv.append_message(conv.roles[1], None) - conv.append_message(conv.roles[0], text) - - example = tokenizer.encode_plus(f"{conv.get_prompt()} ", None, max_length=None)['input_ids'] - - while(len(history) > 0 and (len(example) < max_token)): - tmp = history.pop() - if tmp[0] == 'ASSISTANT': - conv.append_message(conv.roles[1], tmp[1]) - else: - conv.append_message(conv.roles[0], tmp[1]) - example = tokenizer.encode_plus(f"{conv.get_prompt()} ", None, max_length=None)['input_ids'] - - if len(example) >= max_token: - conv.messages.pop() - conv.messages = conv.messages[::-1] - print('model in:', conv.get_prompt()) - example = tokenizer.encode_plus(f"{conv.get_prompt()} ", None, max_length=None)['input_ids'] - - return example - - -def set_random_seed(seed): - """Set random seed for reproducability.""" - if seed is not None and seed > 0: - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - - -class Aquila2Model(LlamaForCausalLM): - - @classmethod - def from_pretrain(self, model_dir, model_name, **kwargs): - download_path = os.path.join(model_dir, model_name) - if os.path.exists(download_path): - return self.from_pretrained(download_path, 
**kwargs) - - - config_path = os.path.join(download_path, "config.json") - checkpoint_path = os.path.join(download_path, "pytorch_model.bin") - from flagai.model.file_utils import _get_model_id - model_id = _get_model_id(model_name) - if model_id and model_id != "null": - model_files = eval(_get_model_files(model_name)) - print("model files:" + str(model_files)) - for file_name in model_files: - if not file_name.endswith("bin"): - _get_vocab_path(download_path, file_name, model_id) - - if os.path.exists( - os.path.join(download_path, 'config.json')): - if os.getenv('ENV_TYPE') == 'deepspeed+mpu': - model_parallel_size = int(os.getenv("MODEL_PARALLEL_SIZE")) - if model_parallel_size > 1: - # if gpus == nums_of_modelhub_models - # can load - # else need to download the pytorch_model.bin and to recut. - model_hub_parallel_size = 0 - for f in model_files: - if "pytorch_model_" in f: - model_hub_parallel_size += 1 - else: - model_parallel_size = 1 - - if "pytorch_model_01.bin" in model_files and model_parallel_size > 1 and model_hub_parallel_size == model_parallel_size: - # Only to download the model slices(megatron-lm). - for file_to_load in model_files: - if "pytorch_model_" in file_to_load: - _get_checkpoint_path(download_path, file_to_load, - model_id) - - elif 'pytorch_model.bin' in model_files: - checkpoint_path = _get_checkpoint_path( - download_path, 'pytorch_model.bin', model_id) - else: - checkpoint_merge = {} - # maybe multi weights files - for file_to_load in model_files: - if "pytorch_model-0" in file_to_load: - _get_checkpoint_path(download_path, file_to_load, - model_id) - # checkpoint_to_load = torch.load(os.path.join( - # download_path, file_to_load), - # map_location="cpu") - # for k, v in checkpoint_to_load.items(): - # checkpoint_merge[k] = v - # # save all parameters - # torch.save( - # checkpoint_merge, - # os.path.join(download_path, "pytorch_model.bin")) - - - def predict(self, text, tokenizer=None, - max_gen_len=200, top_p=0.95, - seed=1234, topk=100, - temperature=0.9, - sft=True, convo_template = "aquila-chat", - device = "cuda"): - - vocab = tokenizer.get_vocab() - #device = device - id2word = {v:k for k, v in vocab.items()} - - - set_random_seed(seed) - if temperature == 0: - topk = 1 - temperature = 1.0 - if sft: - tokens = covert_prompt_to_input_ids_with_history(text, history=[], tokenizer=tokenizer, max_token=2048, convo_template=convo_template) - tokens = torch.tensor(tokens)[None,].to(device) - else : - tokens = tokenizer.encode_plus(text)["input_ids"] - print(tokenizer.decode(tokens)) - tokens = torch.tensor(tokens)[None,].to(device) - input_length = len(tokens[0]) - with torch.no_grad(): - - # instantiate logits processors - logits_processor = LogitsProcessorList( - [ - MinLengthLogitsProcessor(1, eos_token_id=100007), - ] - ) - # instantiate logits processors - logits_warper = LogitsProcessorList( - [ - TopPLogitsWarper(top_p), - TopKLogitsWarper(topk), - TemperatureLogitsWarper(temperature), - - ] - ) - - stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=input_length + max_gen_len)]) - out = self.sample( - tokens, - logits_processor=logits_processor, - logits_warper=logits_warper, - stopping_criteria=stopping_criteria, - return_dict_in_generate=True, - output_scores=True, - ) - - - # print(out) - out_ids = out["sequences"][0][input_length:].cpu().numpy() - - out_scores = out["scores"] - - out_scores = torch.cat(out_scores, dim=0) - out_scores = torch.nn.functional.softmax(out_scores, dim=-1).cpu().numpy() - - probs = [] - for i in 
range(len(out_ids)): - probs.append(float(out_scores[i][out_ids[i]])) - - # print(f"probs is {probs}") - - convert_tokens = [] - for t in out_ids: - if t == 100006: - convert_tokens.append("[CLS]") - else : - convert_tokens.append(id2word.get(t, "[unkonwn_token]")) - - out_text = tokenizer.decode(out_ids.tolist()) - - - out = out_text - - if "###" in out: - special_index = out.index("###") - out = out[: special_index] - token_length = len(tokenizer.encode_plus(out)["input_ids"]) - convert_tokens = convert_tokens[:token_length] - probs = probs[:token_length] - - if "[UNK]" in out: - special_index = out.index("[UNK]") - out = out[:special_index] - token_length = len(tokenizer.encode_plus(out)["input_ids"]) - convert_tokens = convert_tokens[:token_length] - probs = probs[:token_length] - - if "" in out: - special_index = out.index("") - out = out[: special_index] - token_length = len(tokenizer.encode_plus(out)["input_ids"]) - convert_tokens = convert_tokens[:token_length] - probs = probs[:token_length] - - if len(out) > 0 and out[0] == " ": - out = out[1:] - - convert_tokens = convert_tokens[1:] - probs = probs[1:] - return out - # return out, convert_tokens, probs
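
The prompt-history helper that predict() relies on now lives in flagai/model/aquila2/utils.py rather than in the deleted flagai/model/aquila2_model.py. A sketch of calling it directly, assuming history entries are (role, text) pairs with 'ASSISTANT' marking model turns and a placeholder tokenizer path:

    from transformers import AutoTokenizer
    from flagai.model.aquila2.utils import covert_prompt_to_input_ids_with_history

    tokenizer = AutoTokenizer.from_pretrained("./checkpoints/Aquila2Chat-hf")  # placeholder path

    # Oldest turn first; the helper pops from the end of the list, so the most
    # recent exchanges are kept when the prompt approaches max_token.
    history = [
        ("USER", "北京有什么好玩的地方？"),              # "What is fun to visit in Beijing?"
        ("ASSISTANT", "故宫、长城和颐和园都值得一去。"),  # "The Forbidden City, Great Wall ..."
    ]

    input_ids = covert_prompt_to_input_ids_with_history(
        "哪个景点最适合带小孩？",         # "Which sight is best with kids?"
        history=history,                  # note: the helper mutates this list
        tokenizer=tokenizer,
        max_token=2048,
        convo_template="aquila-chat",     # fastchat template used by predict()
    )
    print(f"{len(input_ids)} prompt tokens")
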