diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/sampler.hpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/sampler.hpp
index 00642657f6..13db8ed37f 100644
--- a/text_generation/causal_lm/cpp/continuous_batching/library/src/sampler.hpp
+++ b/text_generation/causal_lm/cpp/continuous_batching/library/src/sampler.hpp
@@ -372,11 +372,8 @@ void GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, SamplerOutp
             }
         }
 
-        // HF implementation counts eos_token_id for length penalty calculation
-        if (candidate.m_token_id != m_parameters.eos_token_id) {
-            // append token from candidate to actual sequence
-            forked_sequence->append_token(candidate.m_token_id, candidate.m_log_prob);
-        }
+        // append token from candidate to actual sequence
+        forked_sequence->append_token(candidate.m_token_id, candidate.m_log_prob);
     }
 };
 
diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py b/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py
index c86ece9cf1..cf425a8012 100644
--- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py
+++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py
@@ -2,6 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import os
+import shutil
 import pytest
 
 from optimum.intel import OVModelForCausalLM
@@ -65,6 +66,7 @@ def convert_to_hf(
 
     # generic parameters
     kwargs['max_length'] = generation_config.max_length
+    # 'max_new_tokens' has higher priority than 'max_length'
    kwargs['max_new_tokens'] = generation_config.max_new_tokens
 
     # copy default parameters
@@ -119,9 +121,9 @@ def run_hugging_face(
 
     for prompt, generation_config in zip(prompts, generation_configs):
         inputs = hf_tokenizer(prompt, return_tensors="pt")
-        prompt_len = len(inputs['input_ids'][0])
-        generate_outputs = model.generate(**inputs, generation_config=convert_to_hf(model.generation_config, generation_config), return_dict_in_generate=True)
-        all_text_batch = hf_tokenizer.batch_decode([generated_ids[prompt_len:] for generated_ids in generate_outputs.sequences], skip_special_tokens=True)
+        prompt_len = inputs['input_ids'].numel()
+        generate_outputs = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], generation_config=convert_to_hf(model.generation_config, generation_config), return_dict_in_generate=True)
+        all_text_batch = hf_tokenizer.batch_decode([generated_ids[prompt_len:] for generated_ids in generate_outputs.sequences])
 
         generation_result = GenerationResult()
         generation_result.m_generation_ids = all_text_batch
@@ -145,6 +147,7 @@ def run_continuous_batching(
     pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), scheduler_config)
     output = pipe.generate(prompts, generation_configs)
     del pipe
+    shutil.rmtree(model_path)
     return output
 
 
@@ -187,4 +190,4 @@ def run_test_pipeline(tmp_path: str, model_id: str, scheduler_params: dict = Non
 
     for prompt, hf_result, ov_result, generation_config in zip(prompts, hf_results, ov_results, generation_configs):
         print(f"Prompt = {prompt}\nHF result = {hf_result}\nOV result = {ov_result}")
-        compare_results(hf_result, ov_result, generation_config)
\ No newline at end of file
+        compare_results(hf_result, ov_result, generation_config)
diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models
index 6dbe817195..d3f2a33909 100644
--- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models
+++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models
@@ -1,58 +1,60 @@
-microsoft/Phi-3-mini-128k-instruct
-microsoft/Phi-3-mini-4k-instruct
-microsoft/phi-2
-microsoft/phi-1_5
-EleutherAI/gpt-neo-125m
-EleutherAI/gpt-neo-125m
-EleutherAI/gpt-neo-1.3B
-EleutherAI/gpt-j-6b
+# RuntimeError: Check 'unregistered_parameters.str().empty()': microsoft/Phi-3-mini-128k-instruct
+# RuntimeError: Check 'unregistered_parameters.str().empty()': microsoft/Phi-3-mini-4k-instruct
+# passed microsoft/phi-2
+# passed microsoft/phi-1_5
+# passed EleutherAI/gpt-neo-125m
+# passed EleutherAI/gpt-neo-125m
+# passed EleutherAI/gpt-neo-1.3B
+# passed EleutherAI/gpt-j-6b
+EleutherAI/pythia-160m
 # EleutherAI/gpt-neox-20b
-BAAI/AquilaChat2-7B
-BAAI/Aquila-7B
-BAAI/AquilaChat-7B
-baichuan-inc/Baichuan2-7B-Chat
-baichuan-inc/Baichuan-7B
+# optimum-intel: Trying to export a aquila model, that is a custom or unsupported architecture: BAAI/AquilaChat2-7B
+# optimum-intel: Trying to export a aquila model, that is a custom or unsupported architecture: BAAI/Aquila-7B
+# optimum-intel: Trying to export a aquila model, that is a custom or unsupported architecture: BAAI/AquilaChat-7B
+# passed: baichuan-inc/Baichuan2-7B-Chat
+# Exception from src/core/src/shape_util.cpp:65:: baichuan-inc/Baichuan-7B
 bigscience/bloomz-1b7
 bigscience/bloomz-560m
-THUDM/chatglm2-6b
+# passed: THUDM/chatglm2-6b
 THUDM/chatglm3-6b
 databricks/dolly-v2-3b
-tiiuae/falcon-7b
+# Exception from src/core/src/shape_util.cpp:65: tiiuae/falcon-7b
 tiiuae/falcon-rw-7b
-google/gemma-2b
-google/gemma-7b
-openai-community/gpt2
-openai-community/gpt2-xl
-gpt2
-gpt2-xl
-bigcode/starcoderbase-3b
+# passed google/gemma-2b
+# passed google/gemma-7b
+# passed openai-community/gpt2
+# passed openai-community/gpt2-xl
+# passed gpt2
+# passed gpt2-xl
+# RuntimeError: Check 'unregistered_parameters.str().empty()': bigcode/starcoderbase-3b
 bigcode/starcoder2-3b
-bigcode/gpt_bigcode-santacoder
+# RuntimeError: Check 'unregistered_parameters.str().empty()': bigcode/gpt_bigcode-santacoder
 nomic-ai/gpt4all-j
-nomic-ai/gpt4all-mpt
-nomic-ai/gpt4all-falcon
-stabilityai/stablelm-3b-4e1t
-stabilityai/stablelm-2-zephyr-1_6b
-internlm/internlm-chat-7b
-internlm/internlm2-7b
+# RuntimeError: Check 'unregistered_parameters.str().empty()': nomic-ai/gpt4all-mpt
+# optimum-intel: Trying to export a RefinedWebModel model, that is a custom or unsupported architecture: nomic-ai/gpt4all-falcon
+# passed: stabilityai/stablelm-3b-4e1t
+# passed: stabilityai/stablelm-2-zephyr-1_6b
+# optimum-intel: Trying to export a internlm model, that is a custom or unsupported architecture: internlm/internlm-chat-7b
+# optimum-intel: PermissionError: [Errno 13] Permission denied: internlm/internlm2-7b
 # core42/jais-13b
 # core42/jais-13b-chat
-meta-llama/Llama-2-7b-hf
-meta-llama/Meta-Llama-3-8B-Instruct
+# passed: meta-llama/Llama-2-7b-hf
+# passed: meta-llama/Meta-Llama-3-8B-Instruct
+# passed: meta-llama/CodeLlama-7b-hf
 lmsys/vicuna-7b-v1.3
-lmsys/vicuna-7b-v1.5
+# optimum-intel: The generation config instance is invalid -- `.validate(): lmsys/vicuna-7b-v1.5
+# optimum-intel: The generation config instance is invalid -- `.validate(): lmsys/longchat-7b-v1.5-32k
 # young-geng/koala
-openlm-research/open_llama_3b
-openlm-research/open_llama_3b_v2
-openbmb/MiniCPM-V-2
+# passed: openlm-research/open_llama_3b
+# passed: openlm-research/open_llama_3b_v2
 openbmb/MiniCPM-2B-sft-bf16
 openbmb/MiniCPM-2B-dpo-bf16
-mistralai/Mistral-7B-v0.1
-mistralai/Mistral-7B-Instruct-v0.1
+# passed: mistralai/Mistral-7B-v0.1
+# passed: mistralai/Mistral-7B-Instruct-v0.1
 # mistralai/Mixtral-8x7B-v0.1
 # mistralai/Mixtral-8x7B-Instruct-v0.1
-mosaicml/mpt-1b-redpajama-200b
-mosaicml/mpt-7b
+# optimum-intel: Trying to export a mosaic-gpt model, that is a custom or unsupported architecture: mosaicml/mpt-1b-redpajama-200b
+# RuntimeError: Check 'unregistered_parameters.str().empty()': mosaicml/mpt-7b
 # mosaicml/mpt-30b
 allenai/OLMo-1B-hf
 allenai/OLMo-7B-hf
@@ -64,13 +66,21 @@ Qwen/Qwen1.5-0.5B
 Qwen/Qwen1.5-7B-Chat
 # Qwen/Qwen1.5-MoE-A2.7B
 # Qwen/Qwen1.5-MoE-A2.7B-Chat
-xverse/XVERSE-7B-Chat
+# optimum-intel: Trying to export a xverse model, that is a custom or unsupported architecture: xverse/XVERSE-7B-Chat
 # xverse/XVERSE-MoE-A4.2B
 01-ai/Yi-6B
-Salesforce/codegen-350M-multi
-Salesforce/codegen-350M-nl
+# passed: Salesforce/codegen-350M-multi
+# passed: Salesforce/codegen-350M-nl
+# passed, but with export=False: OpenVINO/codegen25-7b-multi-fp16-ov
+# optimum-intel: AttributeError: 'NoneType' object has no attribute 'device': Salesforce/codegen2-1b
+# optimum-intel: TypeError: Object of type method is not JSON serializable: Salesforce/xgen-7b-8k-base
+# optimum-intel: DeciCoderAttention.forward() got an unexpected keyword argument 'cache_position': Deci/DeciCoder-1b
 rinna/bilingual-gpt-neox-4b
-facebook/opt-350m
-facebook/incoder-1B
+# RuntimeError: Check 'unregistered_parameters.str().empty()': facebook/opt-350m
+# RuntimeError: Check 'rt_info.find("eos_token_id") != rt_info.end(): facebook/incoder-1B
+# optimum-intel: IndexError: tuple index out of range: facebook/blenderbot-3B
 google/pegasus-big_patent
-google/pegasus-large
\ No newline at end of file
+google/pegasus-large
+# optimum-intel: The generation config instance is invalid -- `.validate()` throws warnings and/or exceptions: openchat/openchat_3.5
+# CPU: head size must be multiple of 16, current: 100: pankajmathur/orca_mini_3b
+# passed: togethercomputer/RedPajama-INCITE-Chat-3B-v1
\ No newline at end of file
diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/requirements.txt b/text_generation/causal_lm/cpp/continuous_batching/python/tests/requirements.txt
index 709dfb41cb..b52a385f20 100644
--- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/requirements.txt
+++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/requirements.txt
@@ -12,3 +12,17 @@ hf_transfer
 # requirements for specific models
 # - hf-tiny-model-private/tiny-random-RoFormerForCausalLM
 rjieba
+# - baichuan-inc/Baichuan2-7B-Chat
+bitsandbytes
+# - nomic-ai/gpt4all-falcon
+# - Qwen/Qwen-7B
+# - Qwen/Qwen-7B-Chat
+# - mosaicml/mpt-7b
+einops
+# - Qwen/Qwen-7B
+# - Qwen/Qwen-7B-Chat
+transformers_stream_generator
+# - openbmb/MiniCPM-V-2
+torchvision
+# - openbmb/MiniCPM-V-2
+timm
\ No newline at end of file
diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_sampling.py b/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_sampling.py
index 597f0097f3..248e490cc3 100644
--- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_sampling.py
+++ b/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_sampling.py
@@ -3,8 +3,16 @@
 
 import os
 import pytest
-from common import run_test_pipeline, get_models_list
-
+from common import (
+    run_test_pipeline,
+    run_hugging_face,
+    run_continuous_batching,
+    get_models_list,
+    get_greedy,
+    get_beam_search,
+    get_scheduler_config,
+    compare_results
+)
 
 # tested models:
 # - facebook/opt-125m
@@ -26,3 +34,60 @@ def test_sampling_nightly(tmp_path, model_id):
 @pytest.mark.parametrize("model_id", get_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "real_models")))
 def test_real_models(tmp_path, model_id):
     run_test_pipeline(tmp_path, model_id)
+
+
+@pytest.mark.precommit
+def test_eos_beam_search(tmp_path):
+    '''
+    This test checks that, in the case of beam search, some generation results
+    explicitly have an EOS token at the end, which is aligned with HF.
+
+    Example of current output:
+    { -1.23264, that I don't know about.
+    I don't know what you're talking about, but I'm pretty sure it's a Canadian thing. }
+    '''
+    model_id = "facebook/opt-125m"
+    prompts = ["Tell me something about Canada"]
+    generation_configs = [get_beam_search()]
+    scheduler_config = get_scheduler_config()
+
+    (hf_results, model_path) = run_hugging_face(model_id=model_id, prompts=prompts,
+                                                generation_configs=generation_configs, tmp_path=tmp_path,
+                                                use_optimum=True)
+    ov_results: List[GenerationResult] = run_continuous_batching(model_path=model_path, scheduler_config=scheduler_config,
+                                                                 prompts=prompts, generation_configs=generation_configs)
+
+    assert len(prompts) == len(hf_results)
+    assert len(prompts) == len(ov_results)
+
+    for prompt, hf_result, ov_result, generation_config in zip(prompts, hf_results, ov_results, generation_configs):
+        print(f"Prompt = {prompt}\nHF result = {hf_result}\nOV result = {ov_result}")
+        compare_results(hf_result, ov_result, generation_config)
+
+
+@pytest.mark.precommit
+def test_eos_greedy(tmp_path):
+    '''
+    This test checks that, in the case of greedy decoding, some generation results
+    explicitly have an EOS token at the end, which is aligned with HF.
+
+    Example of current output:
+    { a software program }
+    '''
+    model_id = "bigscience/bloomz-560m"
+    prompts = ["What is OpenVINO?"]
+    generation_configs = [get_greedy()]
+    scheduler_config = get_scheduler_config()
+
+    (hf_results, model_path) = run_hugging_face(model_id=model_id, prompts=prompts,
+                                                generation_configs=generation_configs, tmp_path=tmp_path,
+                                                use_optimum=True)
+    ov_results: List[GenerationResult] = run_continuous_batching(model_path=model_path, scheduler_config=scheduler_config,
+                                                                 prompts=prompts, generation_configs=generation_configs)
+
+    assert len(prompts) == len(hf_results)
+    assert len(prompts) == len(ov_results)
+
+    for prompt, hf_result, ov_result, generation_config in zip(prompts, hf_results, ov_results, generation_configs):
+        print(f"Prompt = {prompt}\nHF result = {hf_result}\nOV result = {ov_result}")
+        compare_results(hf_result, ov_result, generation_config)