updated model usage method
Signed-off-by: 严照东 <[email protected]>
严照东 committed Sep 26, 2023
1 parent 4986a94 commit 2fb5318
Showing 5 changed files with 261 additions and 262 deletions.
29 changes: 0 additions & 29 deletions examples/Aquila2/generate_chat.py

This file was deleted.

63 changes: 54 additions & 9 deletions flagai/auto_model/auto_loader.py
@@ -4,10 +4,10 @@
import importlib
import os
import copy
from flagai.model.file_utils import _get_model_id
from flagai.model.file_utils import _get_model_id, _get_checkpoint_path, _get_vocab_path, _get_model_files
from flagai.model.aquila2.modeling_aquila import AquilaForCausalLM
import torch


class LazyImport(object):

def __init__(self, name):
@@ -207,8 +207,54 @@ def __init__(self,
print(f"All supported models are {list(MODEL_DICT.keys())}")
return
if task_name == "aquila2":
from flagai.model.aquila2_model import Aquila2Model
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
download_path = os.path.join(model_dir, model_name)

if not os.path.exists(download_path):
# Try to download from ModelHub
try:
model_id = _get_model_id(model_name)
except:
raise FileNotFoundError("Model name not found in local path and ModelHub")
if model_id and model_id != "null":
model_files = eval(_get_model_files(model_name))
print("model files:" + str(model_files))
for file_name in model_files:
if not file_name.endswith("bin"):
_get_vocab_path(download_path, file_name, model_id)

if os.path.exists(
os.path.join(download_path, 'config.json')):
if os.getenv('ENV_TYPE') == 'deepspeed+mpu':
model_parallel_size = int(os.getenv("MODEL_PARALLEL_SIZE"))
if model_parallel_size > 1:
# If the number of GPUs matches the number of ModelHub slices, they can be loaded directly;
# otherwise pytorch_model.bin must be downloaded and re-split.
model_hub_parallel_size = 0
for f in model_files:
if "pytorch_model_" in f:
model_hub_parallel_size += 1
else:
model_parallel_size = 1

if "pytorch_model_01.bin" in model_files and model_parallel_size > 1 and model_hub_parallel_size == model_parallel_size:
# Only download the model slices (Megatron-LM).
for file_to_load in model_files:
if "pytorch_model_" in file_to_load:
_get_checkpoint_path(download_path, file_to_load,
model_id)

elif 'pytorch_model.bin' in model_files:
checkpoint_path = _get_checkpoint_path(
download_path, 'pytorch_model.bin', model_id)
else:
checkpoint_merge = {}
# The weights may be split across multiple files.
for file_to_load in model_files:
if "pytorch_model-0" in file_to_load:
_get_checkpoint_path(download_path, file_to_load,
model_id)

if qlora_dir:
from transformers import BitsAndBytesConfig
quantization_config=BitsAndBytesConfig(
@@ -217,11 +263,14 @@ def __init__(self,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch_dtype,
)
model = Aquila2Model.from_pretrain(model_dir, model_name,


model = AquilaForCausalLM.from_pretrained(download_path,
low_cpu_mem_usage=low_cpu_mem_usage, torch_dtype=torch_dtype,
quantization_config=quantization_config)

model.eval()
# from accelerate import load_checkpoint_and_dispatch
# model = load_checkpoint_and_dispatch(
# model, model_dir+model_name, device_map="balanced", no_split_module_classes=["LlamaDecoderLayer"])
if not qlora_dir:
Expand All @@ -236,13 +285,9 @@ def __init__(self,
print("Qlora modules loaded")
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_dir+model_name)
#args.cuda_index = 0
# device = f"cuda"
self.model = model
self.tokenizer = tokenizer

else:

brief_model_name = MODEL_DICT[model_name][2]
model_type = MODEL_DICT[model_name][3]
# The dir to save config, vocab and model.
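For reference, a minimal usage sketch of the updated loading path above (not part of the diff; the checkpoint directory and model name are placeholder assumptions, while get_model and get_tokenizer are the existing AutoLoader accessors):

from flagai.auto_model.auto_loader import AutoLoader

# Hypothetical example: task_name "aquila2" routes into the new branch above, which
# downloads any missing files and builds the model with AquilaForCausalLM.from_pretrained.
loader = AutoLoader("aquila2",
                    model_dir="./checkpoints",    # placeholder directory
                    model_name="AquilaChat2-7B")  # placeholder model name
model = loader.get_model()          # returned in eval mode by the branch above
tokenizer = loader.get_tokenizer()  # AutoTokenizer loaded from model_dir + model_name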
169 changes: 169 additions & 0 deletions flagai/model/aquila2/modeling_aquila.py
File mode changed: 100644 → 100755
@@ -31,6 +31,17 @@
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
from .configuration_aquila import AquilaConfig
from transformers import (
LogitsProcessorList,
MinLengthLogitsProcessor,
TopKLogitsWarper,
TemperatureLogitsWarper,
TopPLogitsWarper,
StoppingCriteriaList,
MaxLengthCriteria,
BitsAndBytesConfig,
)
from .utils import *


logger = logging.get_logger(__name__)
@@ -754,6 +765,57 @@ def __init__(self, config):
# Initialize weights and apply final processing
self.post_init()

@classmethod
def from_pretrain(self, model_dir, model_name, **kwargs):
download_path = os.path.join(model_dir, model_name)
if os.path.exists(download_path):
return self.from_pretrained(download_path, **kwargs)


config_path = os.path.join(download_path, "config.json")
checkpoint_path = os.path.join(download_path, "pytorch_model.bin")
from flagai.model.file_utils import _get_model_id
model_id = _get_model_id(model_name)
if model_id and model_id != "null":
model_files = eval(_get_model_files(model_name))
print("model files:" + str(model_files))
for file_name in model_files:
if not file_name.endswith("bin"):
_get_vocab_path(download_path, file_name, model_id)

if os.path.exists(
os.path.join(download_path, 'config.json')):
if os.getenv('ENV_TYPE') == 'deepspeed+mpu':
model_parallel_size = int(os.getenv("MODEL_PARALLEL_SIZE"))
if model_parallel_size > 1:
# If the number of GPUs matches the number of ModelHub slices, they can be loaded directly;
# otherwise pytorch_model.bin must be downloaded and re-split.
model_hub_parallel_size = 0
for f in model_files:
if "pytorch_model_" in f:
model_hub_parallel_size += 1
else:
model_parallel_size = 1

if "pytorch_model_01.bin" in model_files and model_parallel_size > 1 and model_hub_parallel_size == model_parallel_size:
# Only download the model slices (Megatron-LM).
for file_to_load in model_files:
if "pytorch_model_" in file_to_load:
_get_checkpoint_path(download_path, file_to_load,
model_id)

elif 'pytorch_model.bin' in model_files:
checkpoint_path = _get_checkpoint_path(
download_path, 'pytorch_model.bin', model_id)
else:
checkpoint_merge = {}
# The weights may be split across multiple files.
for file_to_load in model_files:
if "pytorch_model-0" in file_to_load:
_get_checkpoint_path(download_path, file_to_load,
model_id)

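A hypothetical direct call to the new from_pretrain helper (not part of the diff; the path and model name are placeholders, and torch_dtype is simply forwarded through **kwargs to from_pretrained):

import torch

# Hypothetical example: as written above, a model instance is only returned when the
# local directory model_dir/model_name already exists; the download-only path returns nothing.
model = AquilaForCausalLM.from_pretrain("./checkpoints", "AquilaChat2-7B",
                                        torch_dtype=torch.float16)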
def get_input_embeddings(self):
return self.model.embed_tokens

@@ -905,6 +967,113 @@ def _reorder_cache(past_key_values, beam_idx):
)
return reordered_past

def predict(self, text, tokenizer=None,
max_gen_len=200, top_p=0.95,
seed=1234, topk=100,
temperature=0.9,
sft=True, convo_template = "aquila-chat",
device = "cuda"):

vocab = tokenizer.get_vocab()
#device = device
id2word = {v:k for k, v in vocab.items()}


set_random_seed(seed)
if temperature == 0:
topk = 1
temperature = 1.0
if sft:
tokens = covert_prompt_to_input_ids_with_history(text, history=[], tokenizer=tokenizer, max_token=2048, convo_template=convo_template)
tokens = torch.tensor(tokens)[None,].to(device)
else :
tokens = tokenizer.encode_plus(text)["input_ids"]
print(tokenizer.decode(tokens))
tokens = torch.tensor(tokens)[None,].to(device)
input_length = len(tokens[0])
with torch.no_grad():

# instantiate logits processors
logits_processor = LogitsProcessorList(
[
MinLengthLogitsProcessor(1, eos_token_id=100007),
]
)
# instantiate logits processors
logits_warper = LogitsProcessorList(
[
TopPLogitsWarper(top_p),
TopKLogitsWarper(topk),
TemperatureLogitsWarper(temperature),

]
)

stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=input_length + max_gen_len)])
out = self.sample(
tokens,
logits_processor=logits_processor,
logits_warper=logits_warper,
stopping_criteria=stopping_criteria,
return_dict_in_generate=True,
output_scores=True,
)


# print(out)
out_ids = out["sequences"][0][input_length:].cpu().numpy()

out_scores = out["scores"]

out_scores = torch.cat(out_scores, dim=0)
out_scores = torch.nn.functional.softmax(out_scores, dim=-1).cpu().numpy()

probs = []
for i in range(len(out_ids)):
probs.append(float(out_scores[i][out_ids[i]]))

# print(f"probs is {probs}")

convert_tokens = []
for t in out_ids:
if t == 100006:
convert_tokens.append("[CLS]")
else :
convert_tokens.append(id2word.get(t, "[unknown_token]"))

out_text = tokenizer.decode(out_ids.tolist())


out = out_text

if "###" in out:
special_index = out.index("###")
out = out[: special_index]
token_length = len(tokenizer.encode_plus(out)["input_ids"])
convert_tokens = convert_tokens[:token_length]
probs = probs[:token_length]

if "[UNK]" in out:
special_index = out.index("[UNK]")
out = out[:special_index]
token_length = len(tokenizer.encode_plus(out)["input_ids"])
convert_tokens = convert_tokens[:token_length]
probs = probs[:token_length]

if "</s>" in out:
special_index = out.index("</s>")
out = out[: special_index]
token_length = len(tokenizer.encode_plus(out)["input_ids"])
convert_tokens = convert_tokens[:token_length]
probs = probs[:token_length]

if len(out) > 0 and out[0] == " ":
out = out[1:]

convert_tokens = convert_tokens[1:]
probs = probs[1:]
return out

@add_start_docstrings(
"""
The LLaMa Model transformer with a sequence classification head on top (linear layer).
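A rough call sketch for the predict method added above (not part of the diff; the prompt is a placeholder, and model/tokenizer are assumed to come from the AutoLoader sketch earlier):

# Hypothetical example: chat-style generation with the new predict() helper.
response = model.predict(
    "What is new in Aquila2?",       # placeholder prompt
    tokenizer=tokenizer,
    max_gen_len=200,
    top_p=0.95,
    temperature=0.9,
    sft=True,                        # wrap the prompt in the "aquila-chat" template
    convo_template="aquila-chat",
    device="cuda",
)
print(response)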
38 changes: 38 additions & 0 deletions flagai/model/aquila2/utils.py
@@ -0,0 +1,38 @@
import random
import numpy as np
import torch
from fastchat.conversation import get_conv_template

def set_random_seed(seed):
"""Set random seed for reproducability."""
if seed is not None and seed > 0:
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)



def covert_prompt_to_input_ids_with_history(text, history, tokenizer, max_token, convo_template="aquila-chat"):
# aquila-chat as default
conv = get_conv_template(convo_template)

conv.append_message(conv.roles[1], None)
conv.append_message(conv.roles[0], text)

example = tokenizer.encode_plus(f"{conv.get_prompt()} ", None, max_length=None)['input_ids']

while(len(history) > 0 and (len(example) < max_token)):
tmp = history.pop()
if tmp[0] == 'ASSISTANT':
conv.append_message(conv.roles[1], tmp[1])
else:
conv.append_message(conv.roles[0], tmp[1])
example = tokenizer.encode_plus(f"{conv.get_prompt()} ", None, max_length=None)['input_ids']

if len(example) >= max_token:
conv.messages.pop()
conv.messages = conv.messages[::-1]
print('model in:', conv.get_prompt())
example = tokenizer.encode_plus(f"{conv.get_prompt()} ", None, max_length=None)['input_ids']

return example
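A brief sketch of how these helpers fit together (not part of the diff; the history entries and prompt are placeholders, and tokenizer is assumed to be a Hugging Face tokenizer):

# Hypothetical example: build input ids for a follow-up turn with prior chat history.
set_random_seed(1234)
history = [("USER", "Hello"), ("ASSISTANT", "Hi, how can I help you?")]
input_ids = covert_prompt_to_input_ids_with_history(
    "Tell me about Aquila2.",        # current user message
    history=history,
    tokenizer=tokenizer,
    max_token=2048,
    convo_template="aquila-chat",
)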
