Merge pull request #14 from ilya-lavrenov/real-models
Real models
ilya-lavrenov authored May 16, 2024
2 parents 4d3a4fd + 54a2bbd commit decb18a
Showing 5 changed files with 145 additions and 56 deletions.
@@ -372,11 +372,8 @@ void GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, SamplerOutp
}
}

// HF implementation counts eos_token_id for length penalty calculation
if (candidate.m_token_id != m_parameters.eos_token_id) {
// append token from candidate to actual sequence
forked_sequence->append_token(candidate.m_token_id, candidate.m_log_prob);
}
// append token from candidate to actual sequence
forked_sequence->append_token(candidate.m_token_id, candidate.m_log_prob);
}
};
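This hunk drops the check that skipped appending eos_token_id, so a finished beam keeps its EOS token; the comment points out that the HF implementation counts that token for the length penalty. A minimal Python sketch of such scoring, using illustrative names that are not taken from this repository:

def beam_score(cumulative_log_prob: float, length_with_eos: int, length_penalty: float = 1.0) -> float:
    # The EOS token contributes to the summed log-probability and is counted
    # in the length that the penalty divides by, mirroring the HF-style
    # behaviour referred to in the comment above.
    return cumulative_log_prob / (length_with_eos ** length_penalty)

# For example, a 24-token hypothesis (EOS included) with summed log-prob -29.6
# scores beam_score(-29.6, 24) == -1.2333...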

@@ -2,6 +2,7 @@
# SPDX-License-Identifier: Apache-2.0

import os
import shutil
import pytest

from optimum.intel import OVModelForCausalLM
@@ -65,6 +66,7 @@ def convert_to_hf(

    # generic parameters
    kwargs['max_length'] = generation_config.max_length
    # has higher priority than 'max_length'
    kwargs['max_new_tokens'] = generation_config.max_new_tokens

    # copy default parameters
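The new comment states that 'max_new_tokens' has higher priority than 'max_length'; a small hedged illustration of what that means for the kwargs built here (values are made up):

# When both limits reach HF's generate(), max_new_tokens wins: the effective
# max_length is recomputed as prompt_len + max_new_tokens, and a warning is
# emitted that both were provided.
kwargs = {'max_length': 256, 'max_new_tokens': 32}  # budget of 32 newly generated tokens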
@@ -119,9 +121,9 @@ def run_hugging_face(

    for prompt, generation_config in zip(prompts, generation_configs):
        inputs = hf_tokenizer(prompt, return_tensors="pt")
        prompt_len = len(inputs['input_ids'][0])
        generate_outputs = model.generate(**inputs, generation_config=convert_to_hf(model.generation_config, generation_config), return_dict_in_generate=True)
        all_text_batch = hf_tokenizer.batch_decode([generated_ids[prompt_len:] for generated_ids in generate_outputs.sequences], skip_special_tokens=True)
        prompt_len = inputs['input_ids'].numel()
        generate_outputs = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], generation_config=convert_to_hf(model.generation_config, generation_config), return_dict_in_generate=True)
        all_text_batch = hf_tokenizer.batch_decode([generated_ids[prompt_len:] for generated_ids in generate_outputs.sequences])

        generation_result = GenerationResult()
        generation_result.m_generation_ids = all_text_batch
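Dropping skip_special_tokens=True keeps special tokens such as EOS in the decoded text, which the new EOS tests below compare against. A hedged, self-contained illustration (the model choice mirrors the beam-search test below and is not part of this change):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("facebook/opt-125m")
ids = tok("Tell me something about Canada", return_tensors="pt")["input_ids"]

print(tok.batch_decode(ids))                            # special tokens kept, e.g. '</s>Tell me something about Canada'
print(tok.batch_decode(ids, skip_special_tokens=True))  # ['Tell me something about Canada']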
@@ -145,6 +147,7 @@ def run_continuous_batching(
    pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), scheduler_config)
    output = pipe.generate(prompts, generation_configs)
    del pipe
    shutil.rmtree(model_path)
    return output


@@ -187,4 +190,4 @@ def run_test_pipeline(tmp_path: str, model_id: str, scheduler_params: dict = Non

    for prompt, hf_result, ov_result, generation_config in zip(prompts, hf_results, ov_results, generation_configs):
        print(f"Prompt = {prompt}\nHF result = {hf_result}\nOV result = {ov_result}")
        compare_results(hf_result, ov_result, generation_config)
@@ -1,58 +1,60 @@
microsoft/Phi-3-mini-128k-instruct
microsoft/Phi-3-mini-4k-instruct
microsoft/phi-2
microsoft/phi-1_5
EleutherAI/gpt-neo-125m
EleutherAI/gpt-neo-125m
EleutherAI/gpt-neo-1.3B
EleutherAI/gpt-j-6b
# RuntimeError: Check 'unregistered_parameters.str().empty()': microsoft/Phi-3-mini-128k-instruct
# RuntimeError: Check 'unregistered_parameters.str().empty()': microsoft/Phi-3-mini-4k-instruct
# passed microsoft/phi-2
# passed microsoft/phi-1_5
# passed EleutherAI/gpt-neo-125m
# passed EleutherAI/gpt-neo-125m
# passed EleutherAI/gpt-neo-1.3B
# passed EleutherAI/gpt-j-6b
EleutherAI/pythia-160m
# EleutherAI/gpt-neox-20b
BAAI/AquilaChat2-7B
BAAI/Aquila-7B
BAAI/AquilaChat-7B
baichuan-inc/Baichuan2-7B-Chat
baichuan-inc/Baichuan-7B
# optimum-intel: Trying to export a aquila model, that is a custom or unsupported architecture: BAAI/AquilaChat2-7B
# optimum-intel: Trying to export a aquila model, that is a custom or unsupported architecture: BAAI/Aquila-7B
# optimum-intel: Trying to export a aquila model, that is a custom or unsupported architecture: BAAI/AquilaChat-7B
# passed: baichuan-inc/Baichuan2-7B-Chat
# Exception from src/core/src/shape_util.cpp:65:: baichuan-inc/Baichuan-7B
bigscience/bloomz-1b7
bigscience/bloomz-560m
THUDM/chatglm2-6b
# passed: THUDM/chatglm2-6b
THUDM/chatglm3-6b
databricks/dolly-v2-3b
tiiuae/falcon-7b
# Exception from src/core/src/shape_util.cpp:65: tiiuae/falcon-7b
tiiuae/falcon-rw-7b
google/gemma-2b
google/gemma-7b
openai-community/gpt2
openai-community/gpt2-xl
gpt2
gpt2-xl
bigcode/starcoderbase-3b
# passed google/gemma-2b
# passed google/gemma-7b
# passed openai-community/gpt2
# passed openai-community/gpt2-xl
# passed gpt2
# passed gpt2-xl
# RuntimeError: Check 'unregistered_parameters.str().empty()': bigcode/starcoderbase-3b
bigcode/starcoder2-3b
bigcode/gpt_bigcode-santacoder
# RuntimeError: Check 'unregistered_parameters.str().empty()': bigcode/gpt_bigcode-santacoder
nomic-ai/gpt4all-j
nomic-ai/gpt4all-mpt
nomic-ai/gpt4all-falcon
stabilityai/stablelm-3b-4e1t
stabilityai/stablelm-2-zephyr-1_6b
internlm/internlm-chat-7b
internlm/internlm2-7b
# RuntimeError: Check 'unregistered_parameters.str().empty()': nomic-ai/gpt4all-mpt
# optimum-intel: Trying to export a RefinedWebModel model, that is a custom or unsupported architecture: nomic-ai/gpt4all-falcon
# passed: stabilityai/stablelm-3b-4e1t
# passed: stabilityai/stablelm-2-zephyr-1_6b
# optimum-intel: Trying to export a internlm model, that is a custom or unsupported architecture: internlm/internlm-chat-7b
# optimum-intel: PermissionError: [Errno 13] Permission denied: internlm/internlm2-7b
# core42/jais-13b
# core42/jais-13b-chat
meta-llama/Llama-2-7b-hf
meta-llama/Meta-Llama-3-8B-Instruct
# passed: meta-llama/Llama-2-7b-hf
# passed: meta-llama/Meta-Llama-3-8B-Instruct
# passed: meta-llama/CodeLlama-7b-hf
lmsys/vicuna-7b-v1.3
lmsys/vicuna-7b-v1.5
# optimum-intel: The generation config instance is invalid -- `.validate(): lmsys/vicuna-7b-v1.5
# optimum-intel: The generation config instance is invalid -- `.validate(): lmsys/longchat-7b-v1.5-32k
# young-geng/koala
openlm-research/open_llama_3b
openlm-research/open_llama_3b_v2
openbmb/MiniCPM-V-2
# passed: openlm-research/open_llama_3b
# passed: openlm-research/open_llama_3b_v2
openbmb/MiniCPM-2B-sft-bf16
openbmb/MiniCPM-2B-dpo-bf16
mistralai/Mistral-7B-v0.1
mistralai/Mistral-7B-Instruct-v0.1
# passed: mistralai/Mistral-7B-v0.1
# passed: mistralai/Mistral-7B-Instruct-v0.1
# mistralai/Mixtral-8x7B-v0.1
# mistralai/Mixtral-8x7B-Instruct-v0.1
mosaicml/mpt-1b-redpajama-200b
mosaicml/mpt-7b
# optimum-intel: Trying to export a mosaic-gpt model, that is a custom or unsupported architecture: mosaicml/mpt-1b-redpajama-200b
# RuntimeError: Check 'unregistered_parameters.str().empty()': mosaicml/mpt-7b
# mosaicml/mpt-30b
allenai/OLMo-1B-hf
allenai/OLMo-7B-hf
@@ -64,13 +66,21 @@ Qwen/Qwen1.5-0.5B
Qwen/Qwen1.5-7B-Chat
# Qwen/Qwen1.5-MoE-A2.7B
# Qwen/Qwen1.5-MoE-A2.7B-Chat
xverse/XVERSE-7B-Chat
# optimum-intel: Trying to export a xverse model, that is a custom or unsupported architecture: xverse/XVERSE-7B-Chat
# xverse/XVERSE-MoE-A4.2B
01-ai/Yi-6B
Salesforce/codegen-350M-multi
Salesforce/codegen-350M-nl
# passed: Salesforce/codegen-350M-multi
# passed: Salesforce/codegen-350M-nl
# passed, but with export=False: OpenVINO/codegen25-7b-multi-fp16-ov
# optimum-intel: AttributeError: 'NoneType' object has no attribute 'device': Salesforce/codegen2-1b
# optimum-intel: TypeError: Object of type method is not JSON serializable: Salesforce/xgen-7b-8k-base
# optimum-intel: DeciCoderAttention.forward() got an unexpected keyword argument 'cache_position': Deci/DeciCoder-1b
rinna/bilingual-gpt-neox-4b
facebook/opt-350m
facebook/incoder-1B
# RuntimeError: Check 'unregistered_parameters.str().empty()': facebook/opt-350m
# RuntimeError: Check 'rt_info.find("eos_token_id") != rt_info.end(): facebook/incoder-1B
# optimum-intel: IndexError: tuple index out of range: facebook/blenderbot-3B
google/pegasus-big_patent
google/pegasus-large
# optimum-intel: The generation config instance is invalid -- `.validate()` throws warnings and/or exceptions: openchat/openchat_3.5
# CPU: head size must be multiple of 16, current: 100: pankajmathur/orca_mini_3b
# passed: togethercomputer/RedPajama-INCITE-Chat-3B-v1
@@ -12,3 +12,17 @@ hf_transfer
# requirements for specific models
# - hf-tiny-model-private/tiny-random-RoFormerForCausalLM
rjieba
# - baichuan-inc/Baichuan2-7B-Chat
bitsandbytes
# - nomic-ai/gpt4all-falcon
# - Qwen/Qwen-7B
# - Qwen/Qwen-7B-Chat
# - mosaicml/mpt-7b
einops
# - Qwen/Qwen-7B
# - Qwen/Qwen-7B-Chat
transformers_stream_generator
# - openbmb/MiniCPM-V-2
torchvision
# - openbmb/MiniCPM-V-2
timm
@@ -3,8 +3,16 @@
import os
import pytest

from common import run_test_pipeline, get_models_list

from typing import List

from common import (
    run_test_pipeline,
    run_hugging_face,
    run_continuous_batching,
    get_models_list,
    get_greedy,
    get_beam_search,
    get_scheduler_config,
    compare_results,
    GenerationResult  # used by the List[GenerationResult] annotations below
)

# tested models:
# - facebook/opt-125m
@@ -26,3 +34,60 @@ def test_sampling_nightly(tmp_path, model_id):
@pytest.mark.parametrize("model_id", get_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "real_models")))
def test_real_models(tmp_path, model_id):
    run_test_pipeline(tmp_path, model_id)


@pytest.mark.precommit
def test_eos_beam_search(tmp_path):
    '''
    Current test checks that, in case of beam search, some generation results
    explicitly have an EOS token at the end, which is aligned with HF.
    Example of current output:
    { -1.23264, that I don't know about.
    I don't know what you're talking about, but I'm pretty sure it's a Canadian thing.</s> }
    '''
    model_id = "facebook/opt-125m"
    prompts = ["Tell me something about Canada"]
    generation_configs = [get_beam_search()]
    scheduler_config = get_scheduler_config()

    (hf_results, model_path) = run_hugging_face(model_id=model_id, prompts=prompts,
                                                generation_configs=generation_configs, tmp_path=tmp_path,
                                                use_optimum=True)
    ov_results: List[GenerationResult] = run_continuous_batching(model_path=model_path, scheduler_config=scheduler_config,
                                                                 prompts=prompts, generation_configs=generation_configs)

    assert len(prompts) == len(hf_results)
    assert len(prompts) == len(ov_results)

    for prompt, hf_result, ov_result, generation_config in zip(prompts, hf_results, ov_results, generation_configs):
        print(f"Prompt = {prompt}\nHF result = {hf_result}\nOV result = {ov_result}")
        compare_results(hf_result, ov_result, generation_config)


@pytest.mark.precommit
def test_eos_greedy(tmp_path):
    '''
    Current test checks that, in case of greedy decoding, some generation results
    explicitly have an EOS token at the end, which is aligned with HF:
    Example of current output:
    { a software program</s> }
    '''
    model_id = "bigscience/bloomz-560m"
    prompts = ["What is OpenVINO?"]
    generation_configs = [get_greedy()]
    scheduler_config = get_scheduler_config()

    (hf_results, model_path) = run_hugging_face(model_id=model_id, prompts=prompts,
                                                generation_configs=generation_configs, tmp_path=tmp_path,
                                                use_optimum=True)
    ov_results: List[GenerationResult] = run_continuous_batching(model_path=model_path, scheduler_config=scheduler_config,
                                                                 prompts=prompts, generation_configs=generation_configs)

    assert len(prompts) == len(hf_results)
    assert len(prompts) == len(ov_results)

    for prompt, hf_result, ov_result, generation_config in zip(prompts, hf_results, ov_results, generation_configs):
        print(f"Prompt = {prompt}\nHF result = {hf_result}\nOV result = {ov_result}")
        compare_results(hf_result, ov_result, generation_config)
