Merge pull request #14 from ilya-lavrenov/real-models
Real models
ilya-lavrenov authored May 16, 2024
2 parents 4d3a4fd + 54a2bbd commit decb18a
Showing 5 changed files with 145 additions and 56 deletions.
@@ -372,11 +372,8 @@ void GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, SamplerOutp
}
}

// HF implementation counts eos_token_id for length penalty calculation
if (candidate.m_token_id != m_parameters.eos_token_id) {
// append token from candidate to actual sequence
forked_sequence->append_token(candidate.m_token_id, candidate.m_log_prob);
}
// append token from candidate to actual sequence
forked_sequence->append_token(candidate.m_token_id, candidate.m_log_prob);
}
};
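This hunk drops the check that skipped appending eos_token_id, so a finished beam keeps its EOS token; the comment points out that the HF implementation counts that token for the length penalty. A minimal Python sketch of such scoring, using illustrative names that are not taken from this repository:

def beam_score(cumulative_log_prob: float, length_with_eos: int, length_penalty: float = 1.0) -> float:
    # The EOS token contributes to the summed log-probability and is counted
    # in the length that the penalty divides by, mirroring the HF-style
    # behaviour referred to in the comment above.
    return cumulative_log_prob / (length_with_eos ** length_penalty)

# For example, a 24-token hypothesis (EOS included) with summed log-prob -29.6
# scores beam_score(-29.6, 24) == -1.2333...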

@@ -2,6 +2,7 @@
# SPDX-License-Identifier: Apache-2.0

import os
import shutil
import pytest

from optimum.intel import OVModelForCausalLM
@@ -65,6 +66,7 @@ def convert_to_hf(

    # generic parameters
    kwargs['max_length'] = generation_config.max_length
    # has higher priority than 'max_length'
    kwargs['max_new_tokens'] = generation_config.max_new_tokens

    # copy default parameters
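The new comment states that 'max_new_tokens' has higher priority than 'max_length'; a small hedged illustration of what that means for the kwargs built here (values are made up):

# When both limits reach HF's generate(), max_new_tokens wins: the effective
# max_length is recomputed as prompt_len + max_new_tokens, and a warning is
# emitted that both were provided.
kwargs = {'max_length': 256, 'max_new_tokens': 32}  # budget of 32 newly generated tokens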
@@ -119,9 +121,9 @@ def run_hugging_face(

    for prompt, generation_config in zip(prompts, generation_configs):
        inputs = hf_tokenizer(prompt, return_tensors="pt")
        prompt_len = len(inputs['input_ids'][0])
        generate_outputs = model.generate(**inputs, generation_config=convert_to_hf(model.generation_config, generation_config), return_dict_in_generate=True)
        all_text_batch = hf_tokenizer.batch_decode([generated_ids[prompt_len:] for generated_ids in generate_outputs.sequences], skip_special_tokens=True)
        prompt_len = inputs['input_ids'].numel()
        generate_outputs = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], generation_config=convert_to_hf(model.generation_config, generation_config), return_dict_in_generate=True)
        all_text_batch = hf_tokenizer.batch_decode([generated_ids[prompt_len:] for generated_ids in generate_outputs.sequences])

        generation_result = GenerationResult()
        generation_result.m_generation_ids = all_text_batch
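Dropping skip_special_tokens=True keeps special tokens such as EOS in the decoded text, which the new EOS tests below compare against. A hedged, self-contained illustration (the model choice mirrors the beam-search test below and is not part of this change):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("facebook/opt-125m")
ids = tok("Tell me something about Canada", return_tensors="pt")["input_ids"]

print(tok.batch_decode(ids))                            # special tokens kept, e.g. '</s>Tell me something about Canada'
print(tok.batch_decode(ids, skip_special_tokens=True))  # ['Tell me something about Canada']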
@@ -145,6 +147,7 @@ def run_continuous_batching(
    pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), scheduler_config)
    output = pipe.generate(prompts, generation_configs)
    del pipe
    shutil.rmtree(model_path)
    return output


@@ -187,4 +190,4 @@ def run_test_pipeline(tmp_path: str, model_id: str, scheduler_params: dict = Non

    for prompt, hf_result, ov_result, generation_config in zip(prompts, hf_results, ov_results, generation_configs):
        print(f"Prompt = {prompt}\nHF result = {hf_result}\nOV result = {ov_result}")
        compare_results(hf_result, ov_result, generation_config)
@@ -1,58 +1,60 @@
microsoft/Phi-3-mini-128k-instruct
microsoft/Phi-3-mini-4k-instruct
microsoft/phi-2
microsoft/phi-1_5
EleutherAI/gpt-neo-125m
EleutherAI/gpt-neo-125m
EleutherAI/gpt-neo-1.3B
EleutherAI/gpt-j-6b
# RuntimeError: Check 'unregistered_parameters.str().empty()': microsoft/Phi-3-mini-128k-instruct
# RuntimeError: Check 'unregistered_parameters.str().empty()': microsoft/Phi-3-mini-4k-instruct
# passed microsoft/phi-2
# passed microsoft/phi-1_5
# passed EleutherAI/gpt-neo-125m
# passed EleutherAI/gpt-neo-125m
# passed EleutherAI/gpt-neo-1.3B
# passed EleutherAI/gpt-j-6b
EleutherAI/pythia-160m
# EleutherAI/gpt-neox-20b
BAAI/AquilaChat2-7B
BAAI/Aquila-7B
BAAI/AquilaChat-7B
baichuan-inc/Baichuan2-7B-Chat
baichuan-inc/Baichuan-7B
# optimum-intel: Trying to export a aquila model, that is a custom or unsupported architecture: BAAI/AquilaChat2-7B
# optimum-intel: Trying to export a aquila model, that is a custom or unsupported architecture: BAAI/Aquila-7B
# optimum-intel: Trying to export a aquila model, that is a custom or unsupported architecture: BAAI/AquilaChat-7B
# passed: baichuan-inc/Baichuan2-7B-Chat
# Exception from src/core/src/shape_util.cpp:65:: baichuan-inc/Baichuan-7B
bigscience/bloomz-1b7
bigscience/bloomz-560m
THUDM/chatglm2-6b
# passed: THUDM/chatglm2-6b
THUDM/chatglm3-6b
databricks/dolly-v2-3b
tiiuae/falcon-7b
# Exception from src/core/src/shape_util.cpp:65: tiiuae/falcon-7b
tiiuae/falcon-rw-7b
google/gemma-2b
google/gemma-7b
openai-community/gpt2
openai-community/gpt2-xl
gpt2
gpt2-xl
bigcode/starcoderbase-3b
# passed google/gemma-2b
# passed google/gemma-7b
# passed openai-community/gpt2
# passed openai-community/gpt2-xl
# passed gpt2
# passed gpt2-xl
# RuntimeError: Check 'unregistered_parameters.str().empty()': bigcode/starcoderbase-3b
bigcode/starcoder2-3b
bigcode/gpt_bigcode-santacoder
# RuntimeError: Check 'unregistered_parameters.str().empty()': bigcode/gpt_bigcode-santacoder
nomic-ai/gpt4all-j
nomic-ai/gpt4all-mpt
nomic-ai/gpt4all-falcon
stabilityai/stablelm-3b-4e1t
stabilityai/stablelm-2-zephyr-1_6b
internlm/internlm-chat-7b
internlm/internlm2-7b
# RuntimeError: Check 'unregistered_parameters.str().empty()': nomic-ai/gpt4all-mpt
# optimum-intel: Trying to export a RefinedWebModel model, that is a custom or unsupported architecture: nomic-ai/gpt4all-falcon
# passed: stabilityai/stablelm-3b-4e1t
# passed: stabilityai/stablelm-2-zephyr-1_6b
# optimum-intel: Trying to export a internlm model, that is a custom or unsupported architecture: internlm/internlm-chat-7b
# optimum-intel: PermissionError: [Errno 13] Permission denied: internlm/internlm2-7b
# core42/jais-13b
# core42/jais-13b-chat
meta-llama/Llama-2-7b-hf
meta-llama/Meta-Llama-3-8B-Instruct
# passed: meta-llama/Llama-2-7b-hf
# passed: meta-llama/Meta-Llama-3-8B-Instruct
# passed: meta-llama/CodeLlama-7b-hf
lmsys/vicuna-7b-v1.3
lmsys/vicuna-7b-v1.5
# optimum-intel: The generation config instance is invalid -- `.validate(): lmsys/vicuna-7b-v1.5
# optimum-intel: The generation config instance is invalid -- `.validate(): lmsys/longchat-7b-v1.5-32k
# young-geng/koala
openlm-research/open_llama_3b
openlm-research/open_llama_3b_v2
openbmb/MiniCPM-V-2
# passed: openlm-research/open_llama_3b
# passed: openlm-research/open_llama_3b_v2
openbmb/MiniCPM-2B-sft-bf16
openbmb/MiniCPM-2B-dpo-bf16
mistralai/Mistral-7B-v0.1
mistralai/Mistral-7B-Instruct-v0.1
# passed: mistralai/Mistral-7B-v0.1
# passed: mistralai/Mistral-7B-Instruct-v0.1
# mistralai/Mixtral-8x7B-v0.1
# mistralai/Mixtral-8x7B-Instruct-v0.1
mosaicml/mpt-1b-redpajama-200b
mosaicml/mpt-7b
# optimum-intel: Trying to export a mosaic-gpt model, that is a custom or unsupported architecture: mosaicml/mpt-1b-redpajama-200b
# RuntimeError: Check 'unregistered_parameters.str().empty()': mosaicml/mpt-7b
# mosaicml/mpt-30b
allenai/OLMo-1B-hf
allenai/OLMo-7B-hf
@@ -64,13 +66,21 @@ Qwen/Qwen1.5-0.5B
Qwen/Qwen1.5-7B-Chat
# Qwen/Qwen1.5-MoE-A2.7B
# Qwen/Qwen1.5-MoE-A2.7B-Chat
xverse/XVERSE-7B-Chat
# optimum-intel: Trying to export a xverse model, that is a custom or unsupported architecture: xverse/XVERSE-7B-Chat
# xverse/XVERSE-MoE-A4.2B
01-ai/Yi-6B
Salesforce/codegen-350M-multi
Salesforce/codegen-350M-nl
# passed: Salesforce/codegen-350M-multi
# passed: Salesforce/codegen-350M-nl
# passed, but with export=False: OpenVINO/codegen25-7b-multi-fp16-ov
# optimum-intel: AttributeError: 'NoneType' object has no attribute 'device': Salesforce/codegen2-1b
# optimum-intel: TypeError: Object of type method is not JSON serializable: Salesforce/xgen-7b-8k-base
# optimum-intel: DeciCoderAttention.forward() got an unexpected keyword argument 'cache_position': Deci/DeciCoder-1b
rinna/bilingual-gpt-neox-4b
facebook/opt-350m
facebook/incoder-1B
# RuntimeError: Check 'unregistered_parameters.str().empty()': facebook/opt-350m
# RuntimeError: Check 'rt_info.find("eos_token_id") != rt_info.end(): facebook/incoder-1B
# optimum-intel: IndexError: tuple index out of range: facebook/blenderbot-3B
google/pegasus-big_patent
google/pegasus-large
# optimum-intel: The generation config instance is invalid -- `.validate()` throws warnings and/or exceptions: openchat/openchat_3.5
# CPU: head size must be multiple of 16, current: 100: pankajmathur/orca_mini_3b
# passed: togethercomputer/RedPajama-INCITE-Chat-3B-v1
@@ -12,3 +12,17 @@ hf_transfer
# requirements for specific models
# - hf-tiny-model-private/tiny-random-RoFormerForCausalLM
rjieba
# - baichuan-inc/Baichuan2-7B-Chat
bitsandbytes
# - nomic-ai/gpt4all-falcon
# - Qwen/Qwen-7B
# - Qwen/Qwen-7B-Chat
# - mosaicml/mpt-7b
einops
# - Qwen/Qwen-7B
# - Qwen/Qwen-7B-Chat
transformers_stream_generator
# - openbmb/MiniCPM-V-2
torchvision
# - openbmb/MiniCPM-V-2
timm
@@ -3,8 +3,16 @@
import os
import pytest

from common import run_test_pipeline, get_models_list

from typing import List

from common import (
    run_test_pipeline,
    run_hugging_face,
    run_continuous_batching,
    get_models_list,
    get_greedy,
    get_beam_search,
    get_scheduler_config,
    compare_results,
    GenerationResult  # used by the List[GenerationResult] annotations below
)

# tested models:
# - facebook/opt-125m
@@ -26,3 +34,60 @@ def test_sampling_nightly(tmp_path, model_id):
@pytest.mark.parametrize("model_id", get_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "real_models")))
def test_real_models(tmp_path, model_id):
    run_test_pipeline(tmp_path, model_id)


@pytest.mark.precommit
def test_eos_beam_search(tmp_path):
    '''
    Current test checks that, in case of beam search, some generation results
    explicitly have an EOS token at the end, which is aligned with HF.
    Example of current output:
    { -1.23264, that I don't know about.
    I don't know what you're talking about, but I'm pretty sure it's a Canadian thing.</s> }
    '''
    model_id = "facebook/opt-125m"
    prompts = ["Tell me something about Canada"]
    generation_configs = [get_beam_search()]
    scheduler_config = get_scheduler_config()

    (hf_results, model_path) = run_hugging_face(model_id=model_id, prompts=prompts,
                                                generation_configs=generation_configs, tmp_path=tmp_path,
                                                use_optimum=True)
    ov_results: List[GenerationResult] = run_continuous_batching(model_path=model_path, scheduler_config=scheduler_config,
                                                                 prompts=prompts, generation_configs=generation_configs)

    assert len(prompts) == len(hf_results)
    assert len(prompts) == len(ov_results)

    for prompt, hf_result, ov_result, generation_config in zip(prompts, hf_results, ov_results, generation_configs):
        print(f"Prompt = {prompt}\nHF result = {hf_result}\nOV result = {ov_result}")
        compare_results(hf_result, ov_result, generation_config)


@pytest.mark.precommit
def test_eos_greedy(tmp_path):
    '''
    Current test checks that, in case of greedy decoding, some generation results
    explicitly have an EOS token at the end, which is aligned with HF:
    Example of current output:
    { a software program</s> }
    '''
    model_id = "bigscience/bloomz-560m"
    prompts = ["What is OpenVINO?"]
    generation_configs = [get_greedy()]
    scheduler_config = get_scheduler_config()

    (hf_results, model_path) = run_hugging_face(model_id=model_id, prompts=prompts,
                                                generation_configs=generation_configs, tmp_path=tmp_path,
                                                use_optimum=True)
    ov_results: List[GenerationResult] = run_continuous_batching(model_path=model_path, scheduler_config=scheduler_config,
                                                                 prompts=prompts, generation_configs=generation_configs)

    assert len(prompts) == len(hf_results)
    assert len(prompts) == len(ov_results)

    for prompt, hf_result, ov_result, generation_config in zip(prompts, hf_results, ov_results, generation_configs):
        print(f"Prompt = {prompt}\nHF result = {hf_result}\nOV result = {ov_result}")
        compare_results(hf_result, ov_result, generation_config)
