From fb6aa54a9f0b8259fda301131c9a1510dbf21f81 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Sat, 4 Jan 2025 01:25:13 +0400
Subject: [PATCH] Move tests on sampling to test_sampling.py (#1465)

- Move extensive decoding / sampling tests from test_llm_pipeline.py to
  test_sampling.py
- Partially refactor common helper functions in common.py to be more
  generic (to be continued in next PRs)
- Drop the predefined generation-config helper functions and replace
  them in tests with dicts of generation parameters, so test parameters
  are visible next to the tests themselves and numerous get_** helpers
  are not needed for new combinations of generation values
- Sampling tests are now implemented on top of the stateful model for
  better comparison with optimum-intel
---
 .github/workflows/mac.yml                     |   4 +-
 .../openvino_genai/py_openvino_genai.pyi      |   1 +
 .../py_continuous_batching_pipeline.cpp       |  15 +-
 tests/python_tests/common.py                  | 346 +++++++++---------
 tests/python_tests/ov_genai_test_utils.py     |  49 ++-
 .../python_tests/test_continuous_batching.py  |  38 +-
 tests/python_tests/test_kv_cache_eviction.py  |   6 +-
 tests/python_tests/test_llm_pipeline.py       | 327 +++--------------
 .../python_tests/test_llm_pipeline_static.py  |  40 +-
 tests/python_tests/test_sampling.py           | 224 +++++-------
 tests/python_tests/test_vlm_pipeline.py       |  17 +-
 .../tests/test_cli_image.py                   |   9 +-
 12 files changed, 410 insertions(+), 666 deletions(-)

diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml
index fb66271ff7..5402b79e70 100644
--- a/.github/workflows/mac.yml
+++ b/.github/workflows/mac.yml
@@ -1,4 +1,4 @@
-name: macOS (12, Python 3.9)
+name: macOS (12, Python 3.10)
 on:
   workflow_dispatch:
   pull_request:
@@ -16,7 +16,7 @@ concurrency:
   cancel-in-progress: true
 
 env:
-  PYTHON_VERSION: '3.9'
+  PYTHON_VERSION: '3.10'
   OV_BRANCH: master
   OV_TARBALL: ''
 
diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi
index 5d82fa89a3..9ff28859b9 100644
--- a/src/python/openvino_genai/py_openvino_genai.pyi
+++ b/src/python/openvino_genai/py_openvino_genai.pyi
@@ -697,6 +697,7 @@ class GenerationResult:
     """
     m_generation_ids: list[str]
     m_scores: list[float]
+    m_status: GenerationStatus
     def __init__(self) -> None:
         ...
def __repr__(self) -> str: diff --git a/src/python/py_continuous_batching_pipeline.cpp b/src/python/py_continuous_batching_pipeline.cpp index 2b48e4d44d..48eb124255 100644 --- a/src/python/py_continuous_batching_pipeline.cpp +++ b/src/python/py_continuous_batching_pipeline.cpp @@ -119,6 +119,13 @@ std::ostream& operator << (std::ostream& stream, const GenerationResult& generat } // namespace void init_continuous_batching_pipeline(py::module_& m) { + py::enum_(m, "GenerationStatus") + .value("RUNNING", ov::genai::GenerationStatus::RUNNING) + .value("FINISHED", ov::genai::GenerationStatus::FINISHED) + .value("IGNORED", ov::genai::GenerationStatus::IGNORED) + .value("DROPPED_BY_PIPELINE", ov::genai::GenerationStatus::DROPPED_BY_PIPELINE) + .value("DROPPED_BY_HANDLE", ov::genai::GenerationStatus::DROPPED_BY_HANDLE); + py::class_(m, "GenerationResult", generation_result_docstring) .def(py::init<>()) .def_readonly("m_request_id", &GenerationResult::m_request_id) @@ -130,6 +137,7 @@ void init_continuous_batching_pipeline(py::module_& m) { r.m_generation_ids = generation_ids; }) .def_readwrite("m_scores", &GenerationResult::m_scores) + .def_readwrite("m_status", &GenerationResult::m_status) .def("__repr__", [](const GenerationResult &r) -> py::str { std::stringstream stream; @@ -148,13 +156,6 @@ void init_continuous_batching_pipeline(py::module_& m) { .def_readwrite("m_generation_ids", &EncodedGenerationResult::m_generation_ids) .def_readwrite("m_scores", &EncodedGenerationResult::m_scores); - py::enum_(m, "GenerationStatus") - .value("RUNNING", ov::genai::GenerationStatus::RUNNING) - .value("FINISHED", ov::genai::GenerationStatus::FINISHED) - .value("IGNORED", ov::genai::GenerationStatus::IGNORED) - .value("DROPPED_BY_PIPELINE", ov::genai::GenerationStatus::DROPPED_BY_PIPELINE) - .value("DROPPED_BY_HANDLE", ov::genai::GenerationStatus::DROPPED_BY_HANDLE); - py::enum_(m, "GenerationFinishReason") .value("NONE", ov::genai::GenerationFinishReason::NONE) .value("STOP", ov::genai::GenerationFinishReason::STOP) diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py index 9040fa435f..dc58d1ad2f 100644 --- a/tests/python_tests/common.py +++ b/tests/python_tests/common.py @@ -7,7 +7,7 @@ from optimum.intel import OVModelForCausalLM from pathlib import Path -from openvino_genai import ContinuousBatchingPipeline, SchedulerConfig, GenerationResult, GenerationConfig +from openvino_genai import ContinuousBatchingPipeline, LLMPipeline, SchedulerConfig, GenerationResult, GenerationConfig, DecodedResults, StopCriteria from transformers import AutoTokenizer, AutoModelForCausalLM from transformers import GenerationConfig as HFGenerationConfig from typing import List, Tuple @@ -20,20 +20,6 @@ def get_greedy() -> GenerationConfig: generation_config.max_new_tokens = 30 return generation_config -def get_greedy_with_min_and_max_tokens() -> GenerationConfig: - generation_config = GenerationConfig() - generation_config.num_return_sequences = 1 - generation_config.min_new_tokens = 15 - generation_config.max_new_tokens = 30 - return generation_config - -def get_greedy_with_repetition_penalty() -> GenerationConfig: - generation_config = GenerationConfig() - generation_config.num_return_sequences = 1 - generation_config.repetition_penalty = 2.0 - generation_config.max_new_tokens = 30 - return generation_config - def get_greedy_with_penalties() -> GenerationConfig: generation_config = GenerationConfig() generation_config.num_return_sequences = 1 @@ -42,33 +28,6 @@ def get_greedy_with_penalties() -> 
GenerationConfig: generation_config.max_new_tokens = 30 return generation_config -def get_greedy_with_single_stop_string() -> GenerationConfig: - generation_config = GenerationConfig() - generation_config.num_return_sequences = 1 - generation_config.min_new_tokens = 15 - generation_config.max_new_tokens = 50 - generation_config.stop_strings = {"anag"} # expected match on "manage" - generation_config.include_stop_str_in_output = True - return generation_config - -def get_greedy_with_multiple_stop_strings() -> GenerationConfig: - generation_config = GenerationConfig() - generation_config.num_return_sequences = 1 - generation_config.min_new_tokens = 1 - generation_config.max_new_tokens = 50 - generation_config.stop_strings = {".", "software", "Intel"} - generation_config.include_stop_str_in_output = True - return generation_config - -def get_greedy_with_multiple_stop_strings_no_match() -> GenerationConfig: - generation_config = GenerationConfig() - generation_config.num_return_sequences = 1 - generation_config.min_new_tokens = 1 - generation_config.max_new_tokens = 50 - generation_config.stop_strings = {"Einstein", "sunny", "geothermal"} - generation_config.include_stop_str_in_output = True - return generation_config - def get_beam_search() -> GenerationConfig: generation_config = GenerationConfig() generation_config.num_beam_groups = 3 @@ -79,78 +38,6 @@ def get_beam_search() -> GenerationConfig: generation_config.num_return_sequences = generation_config.num_beams return generation_config -def get_beam_search_min_and_max_tokens() -> GenerationConfig: - generation_config = GenerationConfig() - generation_config.num_beam_groups = 3 - generation_config.num_beams = 6 - generation_config.diversity_penalty = 1 - generation_config.min_new_tokens = 15 - generation_config.max_new_tokens = 30 - generation_config.num_return_sequences = 3 - generation_config.num_return_sequences = generation_config.num_beams - return generation_config - -def get_beam_search_with_single_stop_string() -> GenerationConfig: - generation_config = GenerationConfig() - generation_config.num_beam_groups = 3 - generation_config.num_beams = 6 - generation_config.diversity_penalty = 1 - generation_config.max_new_tokens = 50 - generation_config.num_return_sequences = generation_config.num_beams - generation_config.stop_strings = {"open sour"} # expected match on "open source" - generation_config.include_stop_str_in_output = True - return generation_config - -def get_beam_search_with_multiple_stop_strings() -> GenerationConfig: - generation_config = GenerationConfig() - generation_config.num_beam_groups = 3 - generation_config.num_beams = 6 - generation_config.diversity_penalty = 1 - generation_config.max_new_tokens = 50 - generation_config.num_return_sequences = generation_config.num_beams - generation_config.stop_strings = {".", "software", "Intel"} - generation_config.include_stop_str_in_output = True - return generation_config - -def get_beam_search_with_multiple_stop_strings_no_match() -> GenerationConfig: - generation_config = GenerationConfig() - generation_config.num_beam_groups = 3 - generation_config.num_beams = 6 - generation_config.diversity_penalty = 1 - generation_config.max_new_tokens = 30 - generation_config.num_return_sequences = generation_config.num_beams - generation_config.stop_strings = {"Einstein", "sunny", "geothermal"} - generation_config.include_stop_str_in_output = True - return generation_config - -def get_greedy_stop_strings_exclude_from_output() -> GenerationConfig: - generation_config = 
GenerationConfig() - generation_config.max_new_tokens = 30 - generation_config.stop_strings = { "machines" } - generation_config.include_stop_str_in_output = False - return generation_config - -def get_greedy_stop_strings_include_to_output() -> GenerationConfig: - generation_config = GenerationConfig() - generation_config.max_new_tokens = 30 - generation_config.stop_strings = { "machines" } - generation_config.include_stop_str_in_output = True - return generation_config - -def get_greedy_n_stop_strings_exclude_from_output() -> GenerationConfig: - generation_config = GenerationConfig() - generation_config.max_new_tokens = 30 - generation_config.stop_strings = { "machines", "manage" } - generation_config.include_stop_str_in_output = False - return generation_config - -def get_greedy_n_stop_strings_include_to_output() -> GenerationConfig: - generation_config = GenerationConfig() - generation_config.max_new_tokens = 30 - generation_config.stop_strings = { "machines", "manage" } - generation_config.include_stop_str_in_output = True - return generation_config - def get_multinomial_temperature() -> GenerationConfig: generation_config = GenerationConfig() generation_config.do_sample = True @@ -288,8 +175,10 @@ def convert_to_hf( default_generation_config : HFGenerationConfig, generation_config : GenerationConfig ) -> HFGenerationConfig: - kwargs = {} + if generation_config is None: + return + kwargs = {} # generic parameters kwargs['max_length'] = generation_config.max_length # has higher priority than 'max_length' @@ -300,8 +189,16 @@ def convert_to_hf( # copy default parameters kwargs['bos_token_id'] = default_generation_config.bos_token_id - kwargs['eos_token_id'] = default_generation_config.eos_token_id kwargs['pad_token_id'] = default_generation_config.pad_token_id + + if len(generation_config.stop_token_ids) > 0: + kwargs['eos_token_id'] = list(generation_config.stop_token_ids) + elif generation_config.eos_token_id != -1: + kwargs['eos_token_id'] = generation_config.eos_token_id + else: + kwargs['eos_token_id'] = default_generation_config.eos_token_id + + # copy penalties kwargs['repetition_penalty'] = generation_config.repetition_penalty if generation_config.is_beam_search(): @@ -312,8 +209,20 @@ def convert_to_hf( kwargs['no_repeat_ngram_size'] = generation_config.no_repeat_ngram_size kwargs['num_return_sequences'] = generation_config.num_return_sequences kwargs['output_scores'] = True + if generation_config.num_beam_groups > 1: kwargs['diversity_penalty'] = generation_config.diversity_penalty + + # in OpenVINO GenAI this parameter is called stop_criteria, + # while in HF it's called early_stopping. 
+ # HF values True, False and "never" correspond to OV GenAI values "EARLY", "HEURISTIC" and "NEVER" + STOP_CRITERIA_MAP = { + StopCriteria.NEVER: "never", + StopCriteria.EARLY: True, + StopCriteria.HEURISTIC: False + } + + kwargs['early_stopping'] = STOP_CRITERIA_MAP[generation_config.stop_criteria] elif generation_config.is_multinomial(): # mulitinomial kwargs['temperature'] = generation_config.temperature @@ -332,23 +241,55 @@ def run_hugging_face( opt_model, hf_tokenizer, prompts: List[str], - generation_configs: List[GenerationConfig], + generation_configs: List[GenerationConfig] | GenerationConfig, ) -> List[GenerationResult]: generation_results = [] - for prompt, generation_config in zip(prompts, generation_configs): - inputs = hf_tokenizer(prompt, return_tensors="pt") - prompt_len = inputs['input_ids'].numel() - generate_outputs = opt_model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], - generation_config=convert_to_hf(opt_model.generation_config, generation_config), - return_dict_in_generate=True, tokenizer=hf_tokenizer) - all_text_batch = hf_tokenizer.batch_decode([generated_ids[prompt_len:] for generated_ids in generate_outputs.sequences], skip_special_tokens=True) - generation_result = GenerationResult() - generation_result.m_generation_ids = all_text_batch - # sequences_scores are available only for beam search case - if generation_config.is_beam_search(): - generation_result.m_scores = [score for score in generate_outputs.sequences_scores] - generation_results.append(generation_result) + if type(generation_configs) is list: + # process prompt by promp as we have multiple generation configs + for prompt, generation_config in zip(prompts, generation_configs): + hf_generation_config = convert_to_hf(opt_model.generation_config, generation_config) + inputs = hf_tokenizer(prompt, return_tensors="pt") + input_ids, attention_mask = inputs['input_ids'], inputs['attention_mask'] + prompt_len = 0 if generation_config.echo else input_ids.numel() + + generate_outputs = opt_model.generate(input_ids=input_ids, attention_mask=attention_mask, generation_config=hf_generation_config, + return_dict_in_generate=True, tokenizer=hf_tokenizer) + all_text_batch = hf_tokenizer.batch_decode([generated_ids[prompt_len:] for generated_ids in generate_outputs.sequences], skip_special_tokens=True) + + generation_result = GenerationResult() + generation_result.m_generation_ids = all_text_batch + # sequences_scores are available only for beam search case + if generation_config.is_beam_search(): + generation_result.m_scores = [score for score in generate_outputs.sequences_scores] + generation_results.append(generation_result) + else: + # process all prompts as a single batch as we have a single generation config for all prompts + inputs = hf_tokenizer(prompts, return_tensors='pt', padding=True, truncation=True, add_special_tokens=True, padding_side='left') + input_ids, attention_mask = inputs['input_ids'], inputs['attention_mask'] + hf_generation_config = convert_to_hf(opt_model.generation_config, generation_configs) + hf_encoded_outputs = opt_model.generate(input_ids, attention_mask=attention_mask, generation_config=hf_generation_config, + return_dict_in_generate=True, tokenizer=hf_tokenizer) + + generation_ids = [] + scores = [] + + for idx, hf_encoded_out in enumerate(hf_encoded_outputs.sequences): + prompt_idx = idx // hf_generation_config.num_return_sequences + prompt_len = 0 if generation_configs.echo else input_ids[prompt_idx].numel() + decoded_text = 
hf_tokenizer.decode(hf_encoded_out[prompt_len:], skip_special_tokens=True) + generation_ids.append(decoded_text) + if generation_configs.is_beam_search(): + scores.append(hf_encoded_outputs.sequences_scores[idx]) + + # if we need to move to next generation result + if (idx + 1) // hf_generation_config.num_return_sequences != prompt_idx: + generation_result = GenerationResult() + generation_result.m_generation_ids = generation_ids + generation_result.m_scores = scores + generation_results.append(generation_result) + generation_ids = [] + scores = [] del hf_tokenizer del opt_model @@ -360,16 +301,65 @@ def run_continuous_batching( models_path : Path, scheduler_config : SchedulerConfig, prompts: List[str], - generation_configs : List[GenerationConfig] + generation_configs : List[GenerationConfig] | GenerationConfig ) -> List[GenerationResult]: - pipe = ContinuousBatchingPipeline(models_path, scheduler_config, "CPU") - output = pipe.generate(prompts, generation_configs) - del pipe + if type(generation_configs) is not list: + generation_configs = [generation_configs] * len(prompts) + + cb_pipe = ContinuousBatchingPipeline(models_path, scheduler_config=scheduler_config, device='CPU') + output = cb_pipe.generate(prompts, generation_configs) + + del cb_pipe shutil.rmtree(models_path) + return output -def compare_results(hf_result: GenerationResult, ov_result: GenerationResult, generation_config: GenerationConfig): +def get_default_properties(): + import openvino.properties.hint as hints + import openvino as ov + + return { + hints.inference_precision : ov.Type.f32, + hints.kv_cache_precision : ov.Type.f16, + } + + +def run_llm_pipeline( + models_path : Path, + prompts: List[str], + generation_config : GenerationConfig, + use_cb : bool = False +) -> List[GenerationResult]: + properties = get_default_properties() + if use_cb: + properties['scheduler_config'] = SchedulerConfig() + + ov_pipe = LLMPipeline(models_path, device='CPU', **properties) + + generate_outputs : DecodedResults = ov_pipe.generate(inputs=prompts, generation_config=generation_config) + + index = 0 + generation_results = [] + + for _ in prompts: + generation_result = GenerationResult() + + generation_result.m_generation_ids = generate_outputs.texts[index : index + generation_config.num_return_sequences] + # sequences_scores are available only for beam search case + if generation_config.is_beam_search(): + generation_result.m_scores = generate_outputs.scores[index : index + generation_config.num_return_sequences] + generation_results.append(generation_result) + + index += generation_config.num_return_sequences + + del ov_pipe + shutil.rmtree(models_path) + + return generation_results + + +def compare_generation_result(hf_result: GenerationResult, ov_result: GenerationResult, generation_config: GenerationConfig): if generation_config.is_beam_search(): assert len(hf_result.m_scores) == len(ov_result.m_scores) for hf_score, ov_score in zip(hf_result.m_scores, ov_result.m_scores): @@ -386,46 +376,79 @@ def compare_results(hf_result: GenerationResult, ov_result: GenerationResult, ge assert hf_text == ov_text -def get_hugging_face_model_and_tokenizer(model_id: str, use_optimum = True): +def compare_generation_results(prompts: List[str], hf_results: List[GenerationResult], ov_results: List[GenerationResult], generation_configs: List[GenerationConfig] | GenerationConfig): + if type(generation_configs) is not list: + generation_configs = [generation_configs] + + assert len(prompts) == len(hf_results) + assert len(prompts) == len(ov_results) + 
+ for prompt, ref_result, ov_result, generation_config in zip(prompts, hf_results, ov_results, generation_configs): + print(f"Prompt = {prompt}\nReference result = {ref_result}\nOpenVINO result = {ov_result.m_generation_ids}") + compare_generation_result(ref_result, ov_result, generation_config) + + +def get_hugging_face_models(model_id: str): hf_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) - opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True) if use_optimum else \ - AutoModelForCausalLM.from_pretrained(model_id) + opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True, ov_config=get_default_properties()) return opt_model, hf_tokenizer -def save_ov_model_from_optimum(model, hf_tokenizer, models_path: Path): - model.save_pretrained(models_path) +def convert_models(opt_model : OVModelForCausalLM, hf_tokenizer : AutoTokenizer, models_path: Path): + opt_model.save_pretrained(models_path) + + # to store tokenizer config jsons with special tokens + hf_tokenizer.save_pretrained(models_path) + + # save generation config + opt_model.generation_config.save_pretrained(models_path) + # convert tokenizers as well from openvino_tokenizers import convert_tokenizer from openvino import serialize - tokenizer, detokenizer = convert_tokenizer(hf_tokenizer, with_detokenizer=True, skip_special_tokens=True) + + tokenizer, detokenizer = convert_tokenizer(hf_tokenizer, with_detokenizer=True) serialize(tokenizer, models_path / "openvino_tokenizer.xml") serialize(detokenizer, models_path / "openvino_detokenizer.xml") -def _generate_and_compare_with_reference_results(models_path: Path, prompts: List[str], reference_results: List[GenerationResult], generation_configs: List[GenerationConfig], scheduler_config: SchedulerConfig): - ov_results : List[GenerationResult] = run_continuous_batching(models_path, scheduler_config, prompts, generation_configs) +def run_llm_pipeline_with_ref(model_id: str, prompts: List[str], generation_config: GenerationConfig | dict, tmp_path: Path, use_cb : bool = False): + models_path : Path = tmp_path / model_id + opt_model, hf_tokenizer = get_hugging_face_models(model_id) - assert len(prompts) == len(reference_results) - assert len(prompts) == len(ov_results) + if type(generation_config) is dict: + generation_config = GenerationConfig(**generation_config) + + convert_models(opt_model, hf_tokenizer, models_path) - for prompt, ref_result, ov_result, generation_config in zip(prompts, reference_results, ov_results, generation_configs): - print(f"Prompt = {prompt}\nref result = {ref_result}\nOV result = {ov_result.m_generation_ids}") - compare_results(ref_result, ov_result, generation_config) + ov_results = run_llm_pipeline(models_path, prompts, generation_config, use_cb) + hf_results = run_hugging_face(opt_model, hf_tokenizer, prompts, generation_config) + compare_generation_results(prompts, hf_results, ov_results, generation_config) + + +def run_cb_pipeline_with_ref(tmp_path: str, model_id: str, scheduler_params: dict = {}, generation_config : GenerationConfig | dict = None): + prompts, generation_configs = get_test_dataset() + scheduler_config = get_scheduler_config(scheduler_params) + + # override dataset's generation config + if generation_config is not None: + if type(generation_config) is dict: + generation_config = GenerationConfig(**generation_config) + generation_configs = [generation_config] * len(prompts) -def generate_and_compare_with_hf(model_id: str, prompts: List[str], 
generation_configs: List[GenerationConfig], scheduler_config: SchedulerConfig, tmp_path: Path): - use_optimum = True models_path : Path = tmp_path / model_id - opt_model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum) + opt_model, hf_tokenizer = get_hugging_face_models(model_id) - if use_optimum: - save_ov_model_from_optimum(opt_model, hf_tokenizer, models_path) + convert_models(opt_model, hf_tokenizer, models_path) - hf_results = run_hugging_face(opt_model=opt_model, hf_tokenizer=hf_tokenizer, prompts=prompts, generation_configs=generation_configs) - _generate_and_compare_with_reference_results(models_path, prompts, hf_results, generation_configs, scheduler_config) + hf_results = run_hugging_face(opt_model, hf_tokenizer, prompts, generation_configs) + ov_results = run_continuous_batching(models_path, scheduler_config, prompts, generation_configs) + compare_generation_results(prompts, hf_results, ov_results, generation_configs) + +# TODO: remove after Generator property is supported by LLMPipeline / VLMPipeline def generate_and_compare_with_reference_text(models_path: Path, prompts: List[str], reference_texts_per_prompt: List[List[str]], generation_configs: List[GenerationConfig], scheduler_config: SchedulerConfig): ov_results : List[GenerationResult] = run_continuous_batching(models_path, scheduler_config, prompts, generation_configs) @@ -440,19 +463,6 @@ def generate_and_compare_with_reference_text(models_path: Path, prompts: List[st assert ref_text == ov_text -def run_continuous_batching_pipeline_test(tmp_path: str, model_id: str, scheduler_params: dict = None, generation_config = None): - prompts, generation_configs = get_test_dataset() - scheduler_config = get_scheduler_config(scheduler_params) - - if generation_config is not None: - generation_config.rng_seed = 0 - generation_configs = [generation_config] * len(prompts) - - generate_and_compare_with_hf(model_id, prompts, generation_configs, scheduler_config, tmp_path) - - -DEFAULT_SCHEDULER_CONFIG = get_scheduler_config({"num_kv_blocks": 300, "dynamic_split_fuse": True, "max_num_batched_tokens": 256, "max_num_seqs": 256}) - def get_image_by_link(link): from PIL import Image import requests diff --git a/tests/python_tests/ov_genai_test_utils.py b/tests/python_tests/ov_genai_test_utils.py index 9e8e4681f9..00c74f6628 100644 --- a/tests/python_tests/ov_genai_test_utils.py +++ b/tests/python_tests/ov_genai_test_utils.py @@ -13,6 +13,8 @@ import shutil import json +import openvino_genai as ov_genai + def get_models_list(): precommit_models = [ @@ -52,6 +54,7 @@ def get_models_list(): if pytest.selected_model_ids: model_ids = [model_id for model_id in model_ids if model_id in pytest.selected_model_ids.split(' ')] + # pytest.set_trace() prefix = pathlib.Path(os.getenv('GENAI_MODELS_PATH_PREFIX', '')) return [(model_id, prefix / model_id.split('/')[1]) for model_id in model_ids] @@ -81,66 +84,57 @@ def get_chat_models_list(): @functools.lru_cache(1) def read_model(params, **tokenizer_kwargs): - model_id, path = params + model_id, models_path = params from optimum.intel.openvino import OVModelForCausalLM from transformers import AutoTokenizer hf_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) - if (path / "openvino_model.xml").exists(): - opt_model = OVModelForCausalLM.from_pretrained(path, trust_remote_code=True, + if (models_path / "openvino_model.xml").exists(): + opt_model = OVModelForCausalLM.from_pretrained(models_path, trust_remote_code=True, compile=False, device='CPU') else: 
ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(hf_tokenizer, with_detokenizer=True, **tokenizer_kwargs) - openvino.save_model(ov_tokenizer, path / "openvino_tokenizer.xml") - openvino.save_model(ov_detokenizer, path / "openvino_detokenizer.xml") + openvino.save_model(ov_tokenizer, models_path / "openvino_tokenizer.xml") + openvino.save_model(ov_detokenizer, models_path / "openvino_detokenizer.xml") # to store tokenizer config jsons with special tokens - hf_tokenizer.save_pretrained(path) + hf_tokenizer.save_pretrained(models_path) opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True, compile=False, device='CPU', load_in_8bit=False) - opt_model.generation_config.save_pretrained(path) - opt_model.config.save_pretrained(path) - opt_model.save_pretrained(path) + opt_model.generation_config.save_pretrained(models_path) + opt_model.config.save_pretrained(models_path) + opt_model.save_pretrained(models_path) return ( model_id, - path, + models_path, hf_tokenizer, opt_model, - ov_genai.LLMPipeline(path, 'CPU', ENABLE_MMAP=False), + ov_genai.LLMPipeline(models_path, 'CPU', ENABLE_MMAP=False), ) -# in OpenVINO GenAI this parameter is called stop_criteria, -# while in HF it's called early_stopping. -# HF values True, False and "never" correspond to OV GenAI values "EARLY", "HEURISTIC" and "NEVER" -STOP_CRITERIA_MAP = { - ov_genai.StopCriteria.NEVER: "never", - ov_genai.StopCriteria.EARLY: True, - ov_genai.StopCriteria.HEURISTIC: False -} - - @pytest.fixture(scope="module") def model_tmp_path(tmpdir_factory): - model_id, path, _, _, _ = read_model(get_models_list()[0]) + model_id, models_path, _, _, _ = read_model(get_models_list()[0]) temp_path = tmpdir_factory.mktemp(model_id.replace('/', '_')) # copy openvino converted model and tokenizers for pattern in ['*.xml', '*.bin']: - for src_file in path.glob(pattern): + for src_file in models_path.glob(pattern): if src_file.is_file(): shutil.copy(src_file, temp_path / src_file.name) + yield model_id, Path(temp_path) @pytest.fixture(scope="module") def model_tokenizers_tmp_path(tmpdir_factory): - model_id, path, _, _, _ = read_model(get_models_list()[0]) + model_id, models_path, _, _, _ = read_model(get_models_list()[0]) temp_path = tmpdir_factory.mktemp(model_id.replace('/', '_')) # If tokens were not found in IR, it fallback to reading from config. @@ -148,10 +142,11 @@ def model_tokenizers_tmp_path(tmpdir_factory): # and set tokens in configs and to check if they are read and validated correctly. 
import openvino as ov + core = ov.Core() + # copy openvino converted model and tokenizers for pattern in ['*.xml', '*.bin']: - for src_file in path.glob(pattern): - core = ov.Core() + for src_file in models_path.glob(pattern): # Update files if they are openvino_tokenizer.xml or openvino_detokenizer.xml if src_file.name in ['openvino_tokenizer.xml', 'openvino_detokenizer.xml']: @@ -166,8 +161,10 @@ def model_tokenizers_tmp_path(tmpdir_factory): if src_file in ['openvino_tokenizer.bin', 'openvino_detokenizer.bin']: continue + if src_file.is_file(): shutil.copy(src_file, temp_path / src_file.name) + yield model_id, Path(temp_path) diff --git a/tests/python_tests/test_continuous_batching.py b/tests/python_tests/test_continuous_batching.py index 01762bf9e3..fabcf06b71 100644 --- a/tests/python_tests/test_continuous_batching.py +++ b/tests/python_tests/test_continuous_batching.py @@ -9,8 +9,8 @@ from pathlib import Path from openvino_genai import ContinuousBatchingPipeline, GenerationConfig, Tokenizer -from common import get_hugging_face_model_and_tokenizer, save_ov_model_from_optimum, generate_and_compare_with_reference_text, \ - get_scheduler_config, get_greedy, run_continuous_batching_pipeline_test, get_beam_search, get_greedy, \ +from common import get_hugging_face_models, convert_models, generate_and_compare_with_reference_text, \ + get_scheduler_config, get_greedy, run_cb_pipeline_with_ref, get_beam_search, get_greedy, \ get_multinomial_all_parameters, get_multinomial_temperature_and_num_return_sequence, \ get_multinomial_temperature_and_top_k, get_multinomial_temperature, get_multinomial_temperature_and_top_p from test_sampling import RandomSamplingTestStruct, get_current_platform_ref_texts @@ -39,19 +39,19 @@ def read_models_list(file_name: str): @pytest.mark.precommit @pytest.mark.parametrize("model_id", read_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "precommit"))) def test_e2e_precommit(tmp_path, model_id): - run_continuous_batching_pipeline_test(tmp_path, model_id) + run_cb_pipeline_with_ref(tmp_path, model_id) @pytest.mark.nightly @pytest.mark.parametrize("model_id", read_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "nightly"))) def test_e2e_nightly(tmp_path, model_id): - run_continuous_batching_pipeline_test(tmp_path, model_id) + run_cb_pipeline_with_ref(tmp_path, model_id) @pytest.mark.real_models @pytest.mark.parametrize("model_id", read_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "real_models"))) def test_e2e_real_models(tmp_path, model_id): - run_continuous_batching_pipeline_test(tmp_path, model_id) + run_cb_pipeline_with_ref(tmp_path, model_id) # # Comparison with stateful @@ -77,8 +77,8 @@ def test_continuous_batching_vs_stateful(prompt, generation_config): "facebook/opt-125m", Path("opt-125m") )) - cb = get_continuous_batching(path) - generated = cb.generate(prompt, **generation_config) + cb_pipe = get_continuous_batching(path) + generated = cb_pipe.generate(prompt, **generation_config) reference = stateful.generate(prompt, **generation_config) assert generated.texts == reference.texts if 1 != generation_config.get("num_return_sequences", 1): @@ -117,8 +117,8 @@ def test_cb_streamer_vs_return_vs_stateful(prompt): @pytest.mark.parametrize("model_descr", get_chat_models_list()) @pytest.mark.precommit def test_chat_scenario_vs_stateful(model_descr, generation_config_kwargs: Dict): - model_id, path, hf_tokenizer, opt_model, ov_pipe = read_model((model_descr[0], 
model_descr[1] / '_test_chat')) - cb_pipe = get_continuous_batching(path) + model_id, models_path, hf_tokenizer, opt_model, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat')) + cb_pipe = get_continuous_batching(models_path) ov_pipe.start_chat() cb_pipe.start_chat() @@ -150,10 +150,10 @@ def test_post_oom_health(tmp_path, sampling_config): scheduler_config.num_kv_blocks = 10 # Low cache size to trigger OOM quickly model_id : str = "facebook/opt-125m" - opt_model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True) + opt_model, hf_tokenizer = get_hugging_face_models(model_id) models_path : Path = tmp_path / model_id - save_ov_model_from_optimum(opt_model, hf_tokenizer, models_path) + convert_models(opt_model, hf_tokenizer, models_path) cb_pipe = ContinuousBatchingPipeline(models_path, Tokenizer(models_path), scheduler_config, "CPU") @@ -201,7 +201,7 @@ def get_beam_search_seq_len_300() -> GenerationConfig: @pytest.mark.parametrize("params", scheduler_params_list) @pytest.mark.precommit def test_preemption(tmp_path, params): - run_continuous_batching_pipeline_test(tmp_path, "facebook/opt-125m", scheduler_params=params[0], generation_config=params[1]) + run_cb_pipeline_with_ref(tmp_path, "facebook/opt-125m", scheduler_params=params[0], generation_config=params[1]) multinomial_params = RandomSamplingTestStruct( @@ -249,13 +249,12 @@ def test_preemption(tmp_path, params): def test_preemption_with_multinomial(tmp_path, dynamic_split_fuse): generation_configs = multinomial_params.generation_config for config in generation_configs: - config.rng_seed = 0 config.max_new_tokens = 30 model_id : str = "facebook/opt-125m" - model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True) + model, hf_tokenizer = get_hugging_face_models(model_id) models_path : Path = tmp_path / model_id - save_ov_model_from_optimum(model, hf_tokenizer, models_path) + convert_models(model, hf_tokenizer, models_path) scheduler_config = get_scheduler_config({"num_kv_blocks": 3, "dynamic_split_fuse": dynamic_split_fuse, "max_num_batched_tokens": 256, "max_num_seqs": 256}) generate_and_compare_with_reference_text(models_path, multinomial_params.prompts, multinomial_params.ref_texts, generation_configs, scheduler_config) @@ -329,15 +328,12 @@ def test_preemption_with_multinomial(tmp_path, dynamic_split_fuse): @pytest.mark.precommit @pytest.mark.skip(reason="Random sampling results are non deterministic due to: discrete_distribution impl depends on platform, model inference results may depend on CPU. 
Test passes on CI but fails locally.") def test_preemption_with_multinomial_n_seq(tmp_path, dynamic_split_fuse): - generation_configs = multinomial_params_n_seq.generation_config - for config in generation_configs: - config.rng_seed = 0 model_id : str = "facebook/opt-125m" - model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True) + opt_model, hf_tokenizer = get_hugging_face_models(model_id) models_path : Path = tmp_path / model_id - save_ov_model_from_optimum(model, hf_tokenizer, models_path) + convert_models(opt_model, hf_tokenizer, models_path) # needed kv_blocks - 16 (2 blocks per sequence (30 tokens to generated text + prompt (> 2 tokens)) * (1 + 3 + 4) seq ) scheduler_config = get_scheduler_config({"num_kv_blocks": 8, "dynamic_split_fuse": dynamic_split_fuse, "max_num_batched_tokens": 256, "max_num_seqs": 256}) - generate_and_compare_with_reference_text(models_path, multinomial_params_n_seq.prompts, multinomial_params_n_seq.ref_texts, generation_configs, scheduler_config) + generate_and_compare_with_reference_text(models_path, multinomial_params_n_seq.prompts, multinomial_params_n_seq.ref_texts, multinomial_params_n_seq.generation_config, scheduler_config) diff --git a/tests/python_tests/test_kv_cache_eviction.py b/tests/python_tests/test_kv_cache_eviction.py index 6228f53dd1..41281e9cab 100644 --- a/tests/python_tests/test_kv_cache_eviction.py +++ b/tests/python_tests/test_kv_cache_eviction.py @@ -15,7 +15,7 @@ from openvino import serialize from transformers import AutoTokenizer -from common import TESTS_ROOT, run_continuous_batching_pipeline_test +from common import TESTS_ROOT, run_cb_pipeline_with_ref def load_prompts_dataset(file_name : str) -> Dict[str, List[str]]: @@ -150,6 +150,7 @@ def get_greedy_seq_len_300() -> GenerationConfig: generation_config.max_new_tokens = 300 return generation_config + def get_beam_search_seq_len_300() -> GenerationConfig: generation_config = GenerationConfig() generation_config.num_beam_groups = 3 @@ -159,6 +160,7 @@ def get_beam_search_seq_len_300() -> GenerationConfig: generation_config.num_return_sequences = generation_config.num_beams return generation_config + scheduler_params_list = [ ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": True, "enable_prefix_caching": True}, get_greedy_seq_len_300()), ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": False, "max_num_batched_tokens": 600, "enable_prefix_caching": True}, get_beam_search_seq_len_300()), @@ -168,5 +170,5 @@ def get_beam_search_seq_len_300() -> GenerationConfig: @pytest.mark.parametrize("params", scheduler_params_list) @pytest.mark.precommit def test_dynamic_memory_allocation(tmp_path, params): - run_continuous_batching_pipeline_test(tmp_path, "facebook/opt-125m", params[0], params[1]) + run_cb_pipeline_with_ref(tmp_path, "facebook/opt-125m", scheduler_params=params[0], generation_config=params[1]) diff --git a/tests/python_tests/test_llm_pipeline.py b/tests/python_tests/test_llm_pipeline.py index 6e3cce06d0..986b342c59 100644 --- a/tests/python_tests/test_llm_pipeline.py +++ b/tests/python_tests/test_llm_pipeline.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import openvino_genai as ov_genai -from openvino_genai import StopCriteria, GenerationConfig +from openvino_genai import GenerationConfig import pytest from typing import Union, List, Dict, Optional import numpy as np @@ -10,152 +10,30 @@ import sys from pathlib import Path import torch -import math + +from common import run_llm_pipeline_with_ref, convert_to_hf from 
ov_genai_test_utils import ( get_models_list, read_model, load_genai_pipe_with_configs, get_chat_models_list, model_tmp_path, - STOP_CRITERIA_MAP, ) - -def run_hf_ov_genai_comparison_batched(model_descr, generation_config: Dict, prompts: Union[str, List[str]]): - model_id, path, hf_tokenizer, opt_model, ov_pipe = model_descr - config = generation_config.copy() # to avoid side effects - num_beams = config['num_beams'] if 'num_beams' in config else 1 - config['num_return_sequences'] = num_beams - - if not isinstance(prompts, list): - prompts = [prompts] - - if 'do_sample' not in config: - # Some HF models have default do_sample = True, and if we set beam search generation config - # it conflicts with `diversity_penalty` and/or `num_beam_groups`. - # Need to set explicitly to False, but only if test arguments omitted this arg. - # Do not apply 'repetition_penalty' if sampling is not used. - config['do_sample'] = False - config['repetition_penalty'] = 1.0 # 1.0 means no penalty - - generation_config_hf = config.copy() - if generation_config_hf.get('stop_criteria'): - generation_config_hf['early_stopping'] = STOP_CRITERIA_MAP[generation_config_hf.pop('stop_criteria')] - generation_config_hf.pop('ignore_eos', None) - - # Encode the batch of prompts - hf_tokenizer.padding_side = "left" - encoded_prompts = hf_tokenizer(prompts, return_tensors='pt', padding=True, truncation=True, add_special_tokens=True) - prompt_ids, attention_mask = encoded_prompts['input_ids'], encoded_prompts['attention_mask'] - - hf_encoded_outputs = opt_model.generate(prompt_ids, attention_mask=attention_mask, **generation_config_hf) - - hf_outputs = [] - for idx, hf_encoded_out in enumerate(hf_encoded_outputs): - prompt_count = idx // num_beams - hf_outputs.append(hf_tokenizer.decode(hf_encoded_out[prompt_ids[prompt_count].shape[0]:], skip_special_tokens=True)) - - ov_outputs = ov_pipe.generate(prompts, **config).texts - - hf_outputs.sort() - ov_outputs.sort() - for i, (hf_output, ov_output) in enumerate(zip(hf_outputs, ov_outputs)): - if hf_output != ov_output: - print(f'hf_output: {hf_output}') - print(f'ov_output: {ov_output}') - assert hf_output == ov_output - - -def run_hf_ov_genai_comparison_text_inputs(model_descr, generation_config: Dict, prompt: str): - model_id, path, hf_tokenizer, opt_model, ov_pipe = model_descr - - config = generation_config.copy() # to avoid side effects - - if 'do_sample' not in config: - # Some HF models have default do_sample = True, and if we set beam search generation config - # it conflicts with `diversity_penalty` and/or `num_beam_groups`. - # Need to set explicitly to False, but only if test arguments omitted this arg. - # Do not apply 'repetition_penalty' if sampling is not used. 
- config['do_sample'] = False - config['repetition_penalty'] = 1.0 # 1.0 means no penalty - - generation_config_hf = config.copy() - if generation_config_hf.get('stop_criteria'): - generation_config_hf['early_stopping'] = STOP_CRITERIA_MAP[generation_config_hf.pop('stop_criteria')] - generation_config_hf.pop('ignore_eos', None) - - encoded_prompt = hf_tokenizer([prompt], return_tensors='pt', add_special_tokens=True) - prompt_ids, attention_mask = encoded_prompt['input_ids'], encoded_prompt['attention_mask'] - hf_encoded_output = opt_model.generate(prompt_ids, attention_mask=attention_mask, **generation_config_hf) - hf_output = hf_tokenizer.decode(hf_encoded_output[0, prompt_ids.shape[1]:], skip_special_tokens=True) - - ov_output = ov_pipe.generate(prompt, **config) - if config.get('num_return_sequences', 1) > 1: - assert hf_output in ov_output.texts - else: - if hf_output != ov_output: - print(f'hf_output: {hf_output}') - print(f'ov_output: {ov_output}') - - assert hf_output == ov_output - - -def run_hf_ov_genai_comparison_encoded_inputs( - model_descr, - generation_config: Dict, - input_ids: np.ndarray, - attention_mask: Optional[np.array] = None - ): - device = 'CPU' - model_id, path, hf_tokenizer, opt_model, ov_pipe = model_descr - - config = generation_config.copy() # to avoid side effects - - if 'do_sample' not in config: - # Some HF models have default do_sample = True, and if we set beam search generation config - # it conflicts with `diversity_penalty` and/or `num_beam_groups`. - # Need to set explicitly to False, but only if test arguments omitted this arg. - # Do not apply 'repetition_penalty' if sampling is not used. - config['do_sample'] = False - config['repetition_penalty'] = 1.0 # 1.0 means no penalty - - generation_config_hf = config.copy() - if generation_config_hf.get('stop_criteria'): - generation_config_hf['early_stopping'] = STOP_CRITERIA_MAP[generation_config_hf.pop('stop_criteria')] - generation_config_hf.pop('ignore_eos', None) - - if attention_mask is not None: - inputs_ov = ov_genai.TokenizedInputs(ov.Tensor(input_ids), ov.Tensor(attention_mask)) - inputs_hf = dict(inputs=torch.tensor(input_ids), attention_mask=torch.tensor(attention_mask)) - else: - inputs_hf = dict(inputs=torch.tensor(input_ids)) - inputs_ov = ov.Tensor(input_ids) - - hf_output = opt_model.generate(**inputs_hf, **generation_config_hf) - ov_output = ov_pipe.generate(inputs_ov, **config) - - hf_res = hf_output[0, input_ids.shape[1]:].numpy() - ov_res = np.array(ov_output.tokens, dtype=np.int64) - assert np.all(ov_res == hf_res) - # # e2e work # test_cases = [ - (dict(max_new_tokens=20), 'table is made of'), (dict(max_new_tokens=20), '你好! 
你好嗎?'), - (dict(num_beam_groups=3, num_beams=15, num_return_sequences=15, max_new_tokens=30, diversity_penalty=1.0), 'Alan Turing was a'), - (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.0), 'table is made of'), - (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.0), 'The Sun is yellow because'), - (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.5), 'The Sun is yellow because'), + (dict(max_new_tokens=30, num_beams=15, num_beam_groups=3, num_return_sequences=15, diversity_penalty=1.0), 'Alan Turing was a'), ] -@pytest.mark.parametrize("generation_config,prompt", test_cases) +@pytest.mark.parametrize("generation_config_dict,prompt", test_cases) @pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.precommit @pytest.mark.nightly -def test_decoding(model_descr, generation_config, prompt): - run_hf_ov_genai_comparison_text_inputs(read_model(model_descr), generation_config, prompt) +def test_string_inputs(model_descr, generation_config_dict, prompt): + run_llm_pipeline_with_ref(model_id=model_descr[0], prompts=[prompt], generation_config=generation_config_dict, tmp_path=model_descr[1]) input_tensors_list = [ @@ -168,13 +46,32 @@ def test_decoding(model_descr, generation_config, prompt): @pytest.mark.precommit @pytest.mark.nightly def test_encoded_inputs(model_descr, inputs): - run_hf_ov_genai_comparison_encoded_inputs(read_model(model_descr), dict(max_new_tokens=20), *inputs) + device = 'CPU' + model_id, path, hf_tokenizer, opt_model, ov_pipe = read_model(model_descr) + + ov_generation_config = GenerationConfig(max_new_tokens=20) + hf_generation_config = convert_to_hf(opt_model.generation_config, ov_generation_config) + + input_ids, attention_mask = inputs + + if attention_mask is not None: + inputs_ov = ov_genai.TokenizedInputs(ov.Tensor(input_ids), ov.Tensor(attention_mask)) + inputs_hf = dict(inputs=torch.tensor(input_ids), attention_mask=torch.tensor(attention_mask)) + else: + inputs_hf = dict(inputs=torch.tensor(input_ids)) + inputs_ov = ov.Tensor(input_ids) + + hf_output = opt_model.generate(**inputs_hf, generation_config=hf_generation_config) + ov_output = ov_pipe.generate(inputs_ov, ov_generation_config) + + hf_res = hf_output[0, input_ids.shape[1]:].numpy() + ov_res = np.array(ov_output.tokens, dtype=np.int64) + assert np.all(ov_res == hf_res) test_configs = [ dict(max_new_tokens=20), - dict(max_new_tokens=200, ignore_eos=True), - dict(max_new_tokens=20, num_beam_groups=3, num_beams=15, diversity_penalty=1.0) + dict(max_new_tokens=20, num_beam_groups=2, num_beams=6, diversity_penalty=1.0) ] batched_prompts = [ ['table is made', 'They sky is blue because', 'Difference between Jupiter and Mars is that'], @@ -182,107 +79,13 @@ def test_encoded_inputs(model_descr, inputs): ['Alan Turing was a', 'return 0', '你好! 
你好嗎?'], ['table is made', 'table is made [force left pad tokens]'] ] -@pytest.mark.parametrize("generation_config", test_configs) +@pytest.mark.parametrize("generation_config_dict", test_configs) @pytest.mark.parametrize("prompts", batched_prompts) @pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.precommit @pytest.mark.nightly -def test_batch_text_input(model_descr, generation_config, prompts): - run_hf_ov_genai_comparison_batched(read_model(model_descr), generation_config, prompts) - - -prompts = ['The Sun is yellow because', 'Difference between Jupiter and Mars is that', 'table is made of'] -@pytest.mark.parametrize("num_beam_groups", [2, 3, 8]) -@pytest.mark.parametrize("group_size", [5, 3, 10]) -@pytest.mark.parametrize("max_new_tokens", [20, 15]) -@pytest.mark.parametrize("diversity_penalty", [1.0 , 1.5]) -@pytest.mark.parametrize("prompt", prompts) -@pytest.mark.parametrize("model_descr", get_models_list()) -@pytest.mark.precommit -@pytest.mark.nightly -def test_beam_search_decoding(model_descr, num_beam_groups, group_size, max_new_tokens, diversity_penalty, prompt): - generation_config = dict( - num_beam_groups=num_beam_groups, - num_beams=num_beam_groups * group_size, - diversity_penalty=diversity_penalty, - num_return_sequences=num_beam_groups * group_size, - max_new_tokens=max_new_tokens, - ) - run_hf_ov_genai_comparison_text_inputs(read_model(model_descr), generation_config, prompt) - - -@pytest.mark.parametrize("stop_criteria", [StopCriteria.NEVER, StopCriteria.EARLY, StopCriteria.HEURISTIC]) -@pytest.mark.parametrize("prompt", prompts) -@pytest.mark.parametrize("max_new_tokens", [10, 80]) -@pytest.mark.parametrize("model_descr", get_models_list()) -@pytest.mark.precommit -@pytest.mark.nightly -def test_beam_search_stop_criteria(model_descr, stop_criteria, prompt, max_new_tokens): - # todo: with EARLY stop_criteria looks like HF return invalid out with sentence - # while genai ends sentence with - if (stop_criteria == StopCriteria.EARLY): - pytest.skip() - generation_config = dict( - num_beam_groups=2, - num_beams=2 * 3, - diversity_penalty=1.0, - num_return_sequences=2 * 3, - max_new_tokens=max_new_tokens, - stop_criteria=stop_criteria, - ) - run_hf_ov_genai_comparison_text_inputs(read_model(model_descr), generation_config, prompt) - - -# test long sequences -@pytest.mark.parametrize("num_beam_groups", [2]) -@pytest.mark.parametrize("group_size", [5]) -@pytest.mark.parametrize("max_new_tokens", [800, 2000]) -@pytest.mark.parametrize("prompt", prompts) -@pytest.mark.parametrize("model_descr", get_models_list()) -@pytest.mark.nightly -def test_beam_search_long_sentences(model_descr, num_beam_groups, group_size, - max_new_tokens, prompt): - generation_config = dict( - num_beam_groups=num_beam_groups, - num_beams=num_beam_groups * group_size, - diversity_penalty=1.0, - num_return_sequences=num_beam_groups * group_size, - max_new_tokens=max_new_tokens, - ) - run_hf_ov_genai_comparison_text_inputs(read_model(model_descr), generation_config, prompt) - - -@pytest.mark.parametrize("prompt", prompts) -@pytest.mark.parametrize("model_descr", get_models_list()) -@pytest.mark.precommit -@pytest.mark.nightly -def test_greedy_repetition_penalty(model_descr, prompt): - model_id, path, tokenizer, model, pipe = read_model(model_descr) - - generation_config = dict( - repetition_penalty=2.0, - max_new_tokens=20, - do_sample=False - ) - run_hf_ov_genai_comparison_text_inputs((model_id, path, tokenizer, model, pipe), generation_config, prompt) - - generation_config = dict( - 
repetition_penalty=1.0, - max_new_tokens=20, - do_sample=False - ) - run_hf_ov_genai_comparison_text_inputs((model_id, path, tokenizer, model, pipe), generation_config, prompt) - - ov_output = pipe.generate(prompt, **generation_config) - - generation_config = dict( - repetition_penalty=0.5, - max_new_tokens=20, - do_sample=False - ) - ov_output_half_penalty = pipe.generate(prompt, **generation_config) - - assert(len(set(ov_output.split(' '))) > len(set(ov_output_half_penalty.split(' ')))) +def test_batch_string_inputs(model_descr, generation_config_dict, prompts): + run_llm_pipeline_with_ref(model_id=model_descr[0], prompts=prompts, generation_config=generation_config_dict, tmp_path=model_descr[1]) @pytest.mark.precommit @@ -313,17 +116,14 @@ def test_batch_size_switch(): @pytest.mark.parametrize("model_descr", get_chat_models_list()) @pytest.mark.precommit @pytest.mark.nightly -def test_chat_compare_with_HF(model_descr, generation_config_kwargs: Dict): +def test_chat_scenario(model_descr, generation_config_kwargs: Dict): chat_history_hf = [] chat_history_ov = [] - chat_prompt = '' - # Will set add_special_tokens=False inside pipeline when start_chat() is called. model_id, path, tokenizer, opt_model, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat')) - from transformers import GenerationConfig as HFGenerationConfig - hf_generation_config = HFGenerationConfig(**generation_config_kwargs) ov_generation_config = GenerationConfig(**generation_config_kwargs) + hf_generation_config = convert_to_hf(opt_model.generation_config, ov_generation_config) ov_pipe.start_chat() for prompt in questions: @@ -559,39 +359,27 @@ def test_unicode_pybind_decoding_one_string_streamer(): # Perf metrics # -def run_perf_metrics_collection(model_descr, generation_config: Dict, prompt: str) -> ov_genai.PerfMetrics: - model_id, path, hf_tokenizer, opt_model, ov_pipe = model_descr - - config = generation_config.copy() # to avoid side effects - - if 'do_sample' not in config: - # Some HF models have default do_sample = True, and if we set beam search generation config - # it conflicts with `diversity_penalty` and/or `num_beam_groups`. - # Need to set explicitly to False, but only if test arguments omitted this arg. - # Do not apply 'repetition_penalty' if sampling is not used. 
- config['do_sample'] = False - config['repetition_penalty'] = 1.0 # 1.0 means no penalty - - return ov_pipe.generate([prompt], **config).perf_metrics +def run_perf_metrics_collection(model_descr, generation_config_dict: dict, prompt: str) -> ov_genai.PerfMetrics: + model_id, path, hf_tokenizer, opt_model, ov_pipe = read_model(model_descr) + return ov_pipe.generate([prompt], **generation_config_dict).perf_metrics test_cases = [ (dict(max_new_tokens=20), 'table is made of'), ] @pytest.mark.parametrize("generation_config,prompt", test_cases) -@pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.precommit @pytest.mark.nightly -@pytest.mark.skip(reason="load_time + mean_gen_duration < total_time fails in https://github.com/openvinotoolkit/openvino.genai/actions/runs/12503590506/job/34884840100?pr=1440.") -def test_perf_metrics(model_descr, generation_config, prompt): +def test_perf_metrics(generation_config, prompt): import time start_time = time.perf_counter() - perf_metrics = run_perf_metrics_collection(read_model(model_descr), generation_config, prompt) + model_id, path = 'katuni4ka/tiny-random-gemma2', Path('katuni4ka-tiny-random-gemma2') + perf_metrics = run_perf_metrics_collection((model_id, path), generation_config, prompt) total_time = (time.perf_counter() - start_time) * 1000 # Check that load time is adequate. load_time = perf_metrics.get_load_time() - assert load_time > 0 and load_time < 1000.0 + assert load_time > 0 and load_time < 2000.0 # Check that num input and generated tokens are adequate. num_generated_tokens = perf_metrics.get_num_generated_tokens() @@ -657,34 +445,6 @@ def test_perf_metrics(model_descr, generation_config, prompt): # Misc # -# TODO: move to test_sampling.py -@pytest.mark.precommit -@pytest.mark.nightly -def test_stop_token_ids(): - ov_pipe = read_model(('katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')))[4] - res = ov_pipe.generate( - ov.Tensor([(1,)]), - max_new_tokens=3, - stop_token_ids={9935, ov_pipe.get_tokenizer().get_eos_token_id()}, - include_stop_str_in_output=False - ) - assert 2 == len(res.tokens[0]) - assert 9935 in res.tokens[0] - - -# TODO: move to test_sampling.py -@pytest.mark.precommit -@pytest.mark.nightly -def test_stop_strings(): - ov_pipe = read_model(('katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')))[4] - res = ov_pipe.generate( - "", - max_new_tokens=5, - stop_strings={"ignored", "боль"} - ) - assert "боль" not in res - - # TODO: move this test to test_tokenizer.py @pytest.mark.skip(reason="probably both models ov + hf doesn't fit to memory") @pytest.mark.precommit @@ -698,7 +458,7 @@ def test_left_pad(): ] models = read_model(("microsoft/phi-1_5", Path("phi-1_5/"))) - config = { + generation_config_dict = { "max_new_tokens": 20, "num_beam_groups": 2, "num_beams": 2, @@ -713,4 +473,5 @@ def test_left_pad(): } models[2].pad_token = models[2].eos_token - run_hf_ov_genai_comparison_batched(models, config, prompts) + + run_llm_pipeline_with_ref(model_id=models[0], prompts=prompts, generation_config=generation_config_dict, tmp_path=models[1]) diff --git a/tests/python_tests/test_llm_pipeline_static.py b/tests/python_tests/test_llm_pipeline_static.py index c3500d15ac..6ef6162043 100644 --- a/tests/python_tests/test_llm_pipeline_static.py +++ b/tests/python_tests/test_llm_pipeline_static.py @@ -2,14 +2,18 @@ # SPDX-License-Identifier: Apache-2.0 import openvino_genai as ov_genai -from openvino.runtime import Core import pytest +import platform import sys from ov_genai_test_utils import ( get_models_list, 
get_chat_models_list, + read_model ) +from common import get_default_properties +if sys.platform == 'darwin' or platform.machine() in ["aarch64", "arm64", "ARM64"]: + pytest.skip("NPU plugin is available only on Linux and Windows x86_64", allow_module_level=True) # This test suite is designed specifically to validate the functionality and robustness of the StaticLLMPipeline on NPUW:CPU. common_config = { @@ -24,19 +28,18 @@ def generate_chat_history(model_path, device, pipeline_config, questions): pipe = ov_genai.LLMPipeline(model_path, device, **pipeline_config) pipe.start_chat() - chat_history = [ pipe.generate(question, max_new_tokens=50) for question in questions ] + chat_history = [ pipe.generate(question, max_new_tokens=50, do_sample=False) for question in questions ] pipe.finish_chat() return chat_history -@pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. Segfault on linux CI") @pytest.mark.precommit @pytest.mark.nightly def test_generation_compare_with_stateful(): prompt = 'The Sun is yellow because' - model_path = get_models_list()[0][1] + model_path = read_model(get_models_list()[0])[1] - stateful_pipe = ov_genai.LLMPipeline(model_path, "CPU") + stateful_pipe = ov_genai.LLMPipeline(model_path, "CPU", **get_default_properties()) ref_out = stateful_pipe.generate(prompt, max_new_tokens=100) static_pipe = ov_genai.LLMPipeline(model_path, "NPU", **common_config) @@ -48,11 +51,10 @@ def test_generation_compare_with_stateful(): assert ref_out == actual_out -@pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. Segfault on linux CI") @pytest.mark.precommit @pytest.mark.nightly def test_length_properties_set_no_exception(): - model_path = get_models_list()[0][1] + model_path = read_model(get_models_list()[0])[1] # NB: Check it doesn't throw any exception pipeline_config = { "MAX_PROMPT_LEN": 128, "MIN_RESPONSE_LEN": 64 } pipeline_config |= common_config @@ -65,22 +67,20 @@ def test_length_properties_set_no_exception(): { "MIN_RESPONSE_LEN": -1 }, { "MIN_RESPONSE_LEN": "1" } ] -@pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. Segfault on linux CI") @pytest.mark.parametrize("pipeline_config", pipeline_configs) @pytest.mark.precommit @pytest.mark.nightly def test_invalid_length_properties_raise_error(pipeline_config): - model_path = get_models_list()[0][1] + model_path = read_model(get_models_list()[0])[1] pipeline_config |= common_config with pytest.raises(RuntimeError): pipe = ov_genai.LLMPipeline(model_path, "NPU", **pipeline_config) -@pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. Segfault on linux CI") @pytest.mark.precommit @pytest.mark.nightly def test_batch_one_no_exception(): - model_path = get_models_list()[0][1] + model_path = read_model(get_models_list()[0])[1] prompt = 'The Sun is yellow because' static_pipe = ov_genai.LLMPipeline(model_path, "NPU", **common_config) # Check it doesn't throw any exception when batch of size 1 is provided @@ -88,11 +88,10 @@ def test_batch_one_no_exception(): # TODO: For the further batch support -@pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. 
Segfault on linux CI") @pytest.mark.precommit @pytest.mark.nightly def test_batch_raise_error(): - model_path = get_models_list()[0][1] + model_path = read_model(get_models_list()[0])[1] prompt = 'The Sun is yellow because' pipe = ov_genai.LLMPipeline(model_path, "NPU", **common_config) with pytest.raises(RuntimeError): @@ -101,26 +100,24 @@ def test_batch_raise_error(): # TODO: For the further sampling support generation_configs = [ - dict(num_beam_groups=3), + dict(num_beams=3), dict(do_sample=True) ] -@pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. Segfault on linux CI") @pytest.mark.parametrize("generation_config", generation_configs) @pytest.mark.precommit @pytest.mark.nightly def test_unsupported_sampling_raise_error(generation_config): - model_path = get_models_list()[0][1] + model_path = read_model(get_models_list()[0])[1] prompt = 'The Sun is yellow because' pipe = ov_genai.LLMPipeline(model_path, "NPU", **common_config) with pytest.raises(RuntimeError): pipe.generate(prompt, **generation_config) -@pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. Segfault on linux CI") @pytest.mark.precommit @pytest.mark.nightly def test_max_number_of_tokens(): - model_path = get_models_list()[0][1] + model_path = read_model(get_models_list()[0])[1] prompt = 'The Sun is yellow because' num_tokens = 128 @@ -133,11 +130,10 @@ def test_max_number_of_tokens(): # FIXME: Known problem, output differs from stateful pipeline starting from 3rd prompt! -@pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. Segfault on linux CI") @pytest.mark.skip(reason="JIRA-144780: Output differs from stateful pipeline") @pytest.mark.precommit @pytest.mark.nightly -def test_chat_generation(model_descr): +def test_chat_generation(): questions = [ '1+1=', 'What is the previous answer?', @@ -145,9 +141,9 @@ def test_chat_generation(model_descr): 'What was my first question?' 
] - model_path = get_chat_models_list()[0][1] + model_path = read_model(get_chat_models_list()[0])[1] - chat_history_stateful = generate_chat_history(model_path, "CPU", { }, questions) + chat_history_stateful = generate_chat_history(model_path, "CPU", get_default_properties(), questions) chat_history_static = generate_chat_history(model_path, "NPU", common_config, questions) print('npu chat: \n{chat_history_static}\n') diff --git a/tests/python_tests/test_sampling.py b/tests/python_tests/test_sampling.py index 25ae9d8afa..004d4f9d9d 100644 --- a/tests/python_tests/test_sampling.py +++ b/tests/python_tests/test_sampling.py @@ -1,84 +1,96 @@ # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -import os + import sys import pytest -import shutil import sys from dataclasses import dataclass from pathlib import Path -from openvino_genai import ContinuousBatchingPipeline, GenerationConfig, Tokenizer +from openvino_genai import GenerationConfig, StopCriteria from typing import List, TypedDict -from common import get_hugging_face_model_and_tokenizer, save_ov_model_from_optimum, \ - get_greedy, get_beam_search, get_multinomial_temperature, \ - get_greedy_with_penalties, get_multinomial_temperature, \ - get_multinomial_temperature_and_top_k, get_multinomial_temperature_and_top_p, \ - get_multinomial_temperature_top_p_and_top_k, DEFAULT_SCHEDULER_CONFIG, get_greedy_with_repetition_penalty, \ - get_multinomial_all_parameters, get_multinomial_temperature_and_num_return_sequence, \ - get_greedy, get_greedy_with_min_and_max_tokens, \ - get_greedy_with_single_stop_string, get_greedy_with_multiple_stop_strings, get_greedy_with_multiple_stop_strings_no_match, \ - get_beam_search, get_beam_search_min_and_max_tokens, get_beam_search_with_single_stop_string, \ - get_beam_search_with_multiple_stop_strings, get_beam_search_with_multiple_stop_strings_no_match, get_multinomial_max_and_min_token, \ - get_multinomial_temperature_and_frequence_penalty, get_multinomial_temperature_and_presence_penalty, \ - get_greedy_stop_strings_exclude_from_output, get_greedy_stop_strings_include_to_output, \ - get_greedy_n_stop_strings_exclude_from_output, get_greedy_n_stop_strings_include_to_output, \ - generate_and_compare_with_hf, get_multinomial_temperature_and_repetition_penalty, get_scheduler_config, \ - run_continuous_batching +from common import get_hugging_face_models, convert_models, run_llm_pipeline_with_ref, run_llm_pipeline -# TODO: currently, this test drops EOS token as both HF and OV use `skip_special_tokens=True`, which should be disabled for samlpling tests @pytest.mark.precommit -def test_beam_search_has_eos_token_at_end(tmp_path): - ''' - Current test checks that in case of beam search, some generation results - explicitly have EOS token at the end, which is aligned with HF +@pytest.mark.parametrize("generation_config,prompt", + [(dict(max_new_tokens=30), 'table is made of'), + (dict(max_new_tokens=30, min_new_tokens=30), '你好! 
你好嗎?'), + (dict(max_new_tokens=30, ignore_eos=True), 'Alan Turing was a'), + # (dict(max_length=40), 'table is made of'), + (dict(stop_token_ids={28998}), 'The Sun is yellow because'), # since a test does not hang, it means stop token is met + # (dict(max_new_tokens=1, min_new_tokens=0, echo=True), 'What is OpenVINO?') + ], + ids=["max_new_tokens", + "min_and_max_new_tokens", + "max_new_tokens_and_ignore_eos_true", + # "max_length", + "stop_token_ids", + # "echo_with_generation", + ]) +def test_basic_stop_criteria(tmp_path, generation_config, prompt): + model_id : str = "katuni4ka/tiny-random-phi3" + run_llm_pipeline_with_ref(model_id, [prompt], generation_config, tmp_path) - Example of current output: - { -1.23264, that I don't know about. - I don't know what you're talking about, but I'm pretty sure it's a Canadian thing. } - ''' - model_id = "facebook/opt-125m" - prompts = ["Tell me something about Canada"] - generation_configs = [get_beam_search()] - scheduler_config = get_scheduler_config() - generate_and_compare_with_hf(model_id, prompts, generation_configs, scheduler_config, tmp_path) - -# TODO: currently, this test drops EOS token as both HF and OV use `skip_special_tokens=True`, which should be disabled for samlpling tests @pytest.mark.precommit -def test_greedy_has_eos_token_at_end(tmp_path): - ''' - Current test checks that in case of gready, some generation results - explicitly have EOS token at the end, which is aligned with HF: +@pytest.mark.parametrize("generation_config", + [dict(max_new_tokens=50, min_new_tokens=15, stop_strings={"anag"}, include_stop_str_in_output=True), # expected match on "manage" + dict(max_new_tokens=50, min_new_tokens=1, stop_strings={".", "software", "Intel"}, include_stop_str_in_output=True), + dict(max_new_tokens=50, min_new_tokens=1, stop_strings={"Einstein", "sunny", "geothermal"}, include_stop_str_in_output=True), # expected no match + dict(max_new_tokens=30, stop_strings={ "machines" }, include_stop_str_in_output=False), + dict(max_new_tokens=30, stop_strings={ "machines" }, include_stop_str_in_output=True), + dict(max_new_tokens=30, stop_strings={ "machines", "manage" }, include_stop_str_in_output=False), + dict(max_new_tokens=30, stop_strings={ "machines", "manage" }, include_stop_str_in_output=True),], + ids=["single_stop_string", + "multiple_stop_strings_match", + "multiple_stop_strings_no_match", + "single_stop_string_exclude_from_output", + "single_stop_string_include_to_output", + "multiple_stop_strings_exclude_from_output", + "multiple_stop_strings_include_to_output"]) +def test_stop_strings(tmp_path, generation_config): + prompts = [ "What is OpenVINO?" ] + model_id : str = "facebook/opt-125m" + run_llm_pipeline_with_ref(model_id, prompts, generation_config, tmp_path) - Example of current output: - { a software program } - ''' - model_id = "bigscience/bloomz-560m" - prompts = ["What is OpenVINO?"] - generation_configs = [get_greedy()] - scheduler_config = get_scheduler_config() - generate_and_compare_with_hf(model_id, prompts, generation_configs, scheduler_config, tmp_path) + +@pytest.mark.precommit +@pytest.mark.parametrize("generation_config", + [dict(max_new_tokens=30), + dict(max_new_tokens=30, repetition_penalty=2.0),], + ids=["basic", + "repetition_penalty",]) +def test_greedy(tmp_path, generation_config): + prompts = [ "What is OpenVINO?" 
]
+    model_id : str = "katuni4ka/tiny-random-phi3"
+    run_llm_pipeline_with_ref(model_id, prompts, generation_config, tmp_path)


-# TODO: consider removing all these functions with generation configs and use Dict with properties, which can be converted to generation config
 @pytest.mark.precommit
 @pytest.mark.parametrize("generation_config",
-                         [get_greedy(), get_greedy_with_min_and_max_tokens(), get_greedy_with_repetition_penalty(), get_greedy_with_single_stop_string(),
-                          get_greedy_with_multiple_stop_strings(), get_greedy_with_multiple_stop_strings_no_match(),
-                          get_beam_search(), get_beam_search_min_and_max_tokens(), get_beam_search_with_multiple_stop_strings_no_match(),
-                          get_greedy_stop_strings_exclude_from_output(), get_greedy_stop_strings_include_to_output(),
-                          get_greedy_n_stop_strings_exclude_from_output(), get_greedy_n_stop_strings_include_to_output()],
-                         ids=["greedy", "greedy_with_min_and_max_tokens", "greedy_with_repetition_penalty", "greedy_with_single_stop_string",
-                              "greedy_with_multiple_stop_strings", "greedy_with_multiple_stop_strings_no_match", "beam_search", "beam_search_min_and_max_tokens",
-                              "beam_search_with_multiple_stop_strings_no_match", "greedy_stop_strings_exclude_from_output", "greedy_stop_strings_include_to_output",
-                              "greedy_n_stop_strings_exclude_from_output", "greedy_n_stop_strings_include_to_output"])
-def test_sampling_against_optimum(tmp_path, generation_config):
+                         [dict(max_new_tokens=30, num_beams=2),
+                          dict(max_new_tokens=30, num_beams=2, stop_criteria=StopCriteria.NEVER),
+                          dict(max_new_tokens=30, num_beams=2, stop_criteria=StopCriteria.EARLY),
+                          # dict(max_new_tokens=30, num_beams=2, echo=True),
+                          dict(max_new_tokens=30, num_beams=2, length_penalty=1.0),
+                          dict(max_new_tokens=30, num_beams=2, no_repeat_ngram_size=2),
+                          dict(max_new_tokens=30, num_beams=6, num_beam_groups=3, diversity_penalty=1.2, num_return_sequences=3),
+                          dict(max_new_tokens=30, min_new_tokens=15, num_beams=2, num_return_sequences=1),
+                          dict(max_new_tokens=30, num_beams=2, stop_strings={"Einstein", "sunny", "geothermal"}, include_stop_str_in_output=True),],
+                         ids=["single_group_stop_criteria_heuristic",
+                              "single_group_stop_criteria_never",
+                              "single_group_stop_criteria_early",
+                              # "single_group_with_echo",
+                              "single_group_length_penalty",
+                              "single_group_no_repeat_ngram_size",
+                              "multiple_groups",
+                              "single_group_min_new_tokens",
+                              "single_group_with_multiple_stop_strings_no_match",])
+def test_beam_search(tmp_path, generation_config):
     prompts = [ "What is OpenVINO?" ]
-    generation_configs = [generation_config]
     model_id : str = "facebook/opt-125m"
-    generate_and_compare_with_hf(model_id, prompts, generation_configs, DEFAULT_SCHEDULER_CONFIG, tmp_path)
+    run_llm_pipeline_with_ref(model_id, prompts, generation_config, tmp_path)


 @pytest.mark.precommit
@@ -87,13 +99,28 @@ def test_sampling_against_optimum(tmp_path, generation_config):
     reason="Stop strings do not seem to work as expected with beam search in HF, so comparison will fail.
If it changes, these cases shall be merged to the test above.", strict=True, ) -@pytest.mark.parametrize("generation_config", [get_beam_search_with_single_stop_string(), get_beam_search_with_multiple_stop_strings()], - ids=["beam_search_with_single_stop_string", "beam_search_with_multiple_stop_strings"]) +@pytest.mark.parametrize("generation_config", + [dict(max_new_tokens=50, num_beams=6, num_beam_groups=3, diversity_penalty=1.0, num_return_sequences=6, stop_strings={"open sour"}, include_stop_str_in_output=True), + dict(max_new_tokens=50, num_beams=6, num_beam_groups=3, diversity_penalty=1.0, num_return_sequences=6, stop_strings={".", "software", "Intel"}, include_stop_str_in_output=True),], + ids=["single_stop_string_match", "multiple_stop_strings_match"]) def test_beam_search_with_stop_string(tmp_path, generation_config): prompts = [ "What is OpenVINO?" ] - generation_configs = [generation_config] model_id : str = "facebook/opt-125m" - generate_and_compare_with_hf(model_id, prompts, generation_configs, DEFAULT_SCHEDULER_CONFIG, tmp_path) + run_llm_pipeline_with_ref(model_id, prompts, generation_config, tmp_path) + + +@pytest.mark.precommit +@pytest.mark.parametrize("generation_config", + [dict(max_new_tokens=1, min_new_tokens=0, echo=True), + dict(max_new_tokens=30, num_beams=2, echo=True),], + ids=["echo_with_generation", + "single_group_with_echo",]) +def test_echo(tmp_path, generation_config): + prompts = [ "What is OpenVINO?" ] + model_id : str = "facebook/opt-125m" + # TODO: support in stateful mode and remove 'use_cb=True' and this test at all + # as we can enable new parameters set in other tests + run_llm_pipeline_with_ref(model_id, prompts, generation_config, tmp_path, use_cb=True) # TODO: remove platform specific reference texts once CVS-159912 is done and use comparison with HF @@ -123,6 +150,12 @@ class RandomSamplingTestStruct: prompts: List[str] ref_texts: List[List[str]] +from common import get_multinomial_temperature, get_greedy_with_penalties, \ + get_multinomial_temperature_and_top_k, get_multinomial_temperature_and_top_p, \ + get_multinomial_temperature_top_p_and_top_k, get_multinomial_all_parameters, \ + get_multinomial_temperature_and_num_return_sequence, get_multinomial_max_and_min_token, \ + get_multinomial_temperature_and_frequence_penalty, get_multinomial_temperature_and_presence_penalty, \ + get_multinomial_temperature_and_repetition_penalty RANDOM_SAMPLING_TEST_CASES = [ RandomSamplingTestStruct( @@ -285,72 +318,15 @@ def test_multinomial_sampling_against_reference(tmp_path, test_struct: RandomSam prompts = test_struct.prompts generation_config.rng_seed = 0 - generation_configs = [generation_config] + generation_configs = generation_config model_id : str = "facebook/opt-125m" - model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True) + model, hf_tokenizer = get_hugging_face_models(model_id) models_path : Path = tmp_path / model_id - save_ov_model_from_optimum(model, hf_tokenizer, models_path) + convert_models(model, hf_tokenizer, models_path) # run multinomial without comparison with reference - _ = run_continuous_batching(models_path, DEFAULT_SCHEDULER_CONFIG, prompts, generation_configs) + _ = run_llm_pipeline(models_path, prompts, generation_configs) # Reference comparison is not performed as sampling results are non-deterministic. # Discrete_distribution impl depends on platform, model inference results may depend on CPU. 
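# --- Editor's illustrative sketch (not part of the patch) ---------------------
# The comments above note that multinomial outputs are not compared against a
# reference because sampling is non-deterministic across platforms. A minimal
# sketch of a check that stays deterministic on a single machine is to fix
# rng_seed and compare two runs of the same pipeline. Assumptions: the
# get_hugging_face_models / convert_models / run_llm_pipeline helpers behave as
# used elsewhere in this file, and run_llm_pipeline returns GenerationResult
# objects exposing m_generation_ids; adjust if the helpers differ.
from pathlib import Path

from openvino_genai import GenerationConfig
from common import get_hugging_face_models, convert_models, run_llm_pipeline


def check_multinomial_is_reproducible_with_fixed_seed(tmp_path: Path):
    model_id = "facebook/opt-125m"
    opt_model, hf_tokenizer = get_hugging_face_models(model_id)
    models_path = tmp_path / model_id
    convert_models(opt_model, hf_tokenizer, models_path)

    generation_config = GenerationConfig()
    generation_config.do_sample = True
    generation_config.temperature = 0.8
    generation_config.max_new_tokens = 30
    generation_config.rng_seed = 0  # fixed seed: same machine => same samples

    prompts = ["What is OpenVINO?"]
    first = run_llm_pipeline(models_path, prompts, generation_config)
    second = run_llm_pipeline(models_path, prompts, generation_config)

    # The sampler should be reproducible for a fixed seed on a given platform,
    # even though outputs may differ between platforms or CPU implementations.
    assert [r.m_generation_ids for r in first] == [r.m_generation_ids for r in second]
# ------------------------------------------------------------------------------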
- - -@pytest.mark.precommit -@pytest.mark.parametrize("get_generation_config", [get_greedy, get_beam_search, get_multinomial_all_parameters], - ids=["greedy", "beam_search", "multinomial_all_parameters"]) -@pytest.mark.parametrize("max_num_batched_tokens", [2, 4, 256]) -def test_echo_prompt_phase_only(tmp_path, get_generation_config, max_num_batched_tokens): - generation_config = get_generation_config() - generation_config.max_new_tokens = 0 - generation_config.echo = True - - scheduler_config = get_scheduler_config() - scheduler_config.max_num_batched_tokens = max_num_batched_tokens - generation_configs = [generation_config] - model_id : str = "facebook/opt-125m" - opt_model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True) - - model_path : Path = tmp_path / model_id - save_ov_model_from_optimum(opt_model, hf_tokenizer, model_path) - - cb_pipe = ContinuousBatchingPipeline(model_path, Tokenizer(model_path), scheduler_config, "CPU") - - outputs = cb_pipe.generate(["What is OpenVINO?"], generation_configs) - assert(len(outputs)) - for output in outputs: - assert(len(output.m_generation_ids)) - for sequence in output.m_generation_ids: - assert(sequence == "What is OpenVINO?") - - -@pytest.mark.precommit -@pytest.mark.parametrize("get_generation_config", [get_greedy, get_beam_search, get_multinomial_all_parameters], - ids=["greedy", "beam_search", "multinomial_all_parameters"]) -@pytest.mark.parametrize("max_num_batched_tokens", [2, 4, 256]) -def test_echo_with_generation_phase(tmp_path, get_generation_config, max_num_batched_tokens): - generation_config = get_generation_config() - generation_config.max_new_tokens = 10 - generation_config.echo = True - - scheduler_config = get_scheduler_config() - scheduler_config.max_num_batched_tokens = max_num_batched_tokens - generation_configs = [generation_config] - model_id : str = "facebook/opt-125m" - opt_model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True) - - model_path : Path = tmp_path / model_id - save_ov_model_from_optimum(opt_model, hf_tokenizer, model_path) - - cb_pipe = ContinuousBatchingPipeline(model_path, Tokenizer(model_path), scheduler_config, "CPU") - outputs = cb_pipe.generate(["What is OpenVINO?"], generation_configs) - assert(len(outputs)) - - for output in outputs: - assert(len(output.m_generation_ids)) - for sequence in output.m_generation_ids: - assert(sequence.startswith("What is OpenVINO?")) - assert(len(sequence) > len("What is OpenVINO?")) diff --git a/tests/python_tests/test_vlm_pipeline.py b/tests/python_tests/test_vlm_pipeline.py index b4df6492bb..81c181bc54 100644 --- a/tests/python_tests/test_vlm_pipeline.py +++ b/tests/python_tests/test_vlm_pipeline.py @@ -6,8 +6,8 @@ import pytest import transformers from optimum.intel.openvino import OVModelForVisualCausalLM -from openvino_genai import VLMPipeline -from common import get_greedy, get_image_by_link, get_beam_search, get_greedy, get_multinomial_all_parameters +from openvino_genai import VLMPipeline, GenerationConfig +from common import get_image_by_link, get_beam_search, get_multinomial_all_parameters def get_ov_model(cache): model_dir = cache.mkdir("tiny-random-minicpmv-2_6") @@ -49,21 +49,22 @@ def streamer(word: str) -> bool: return False models_path = get_ov_model(cache) + generation_config = GenerationConfig(max_new_tokens=30) for links in image_links_for_testing: images = [] for link in links: images.append(get_image_by_link(link)) - pipe = VLMPipeline(models_path, "CPU") - pipe.start_chat() + ov_pipe 
= VLMPipeline(models_path, "CPU") + ov_pipe.start_chat() - pipe.generate(prompts[0], images=images, generation_config=get_greedy(), streamer=streamer) + ov_pipe.generate(prompts[0], images=images, generation_config=generation_config, streamer=streamer) for prompt in prompts[1:]: - pipe.generate(prompt, generation_config=get_greedy(), streamer=streamer) + ov_pipe.generate(prompt, generation_config=generation_config, streamer=streamer) - pipe.finish_chat() + ov_pipe.finish_chat() @pytest.mark.precommit @@ -95,7 +96,7 @@ def test_perf_metrics(cache): images = [get_image_by_link(image_links[0])] pipe = VLMPipeline(models_path, "CPU") - result = pipe.generate(prompts[0], images=images, generation_config=get_greedy()) + result = pipe.generate(prompts[0], images=images, generation_config=GenerationConfig(max_new_tokens=30)) perf_metrics = result.perf_metrics diff --git a/tools/who_what_benchmark/tests/test_cli_image.py b/tools/who_what_benchmark/tests/test_cli_image.py index 7b966f049e..fec9e96f4c 100644 --- a/tools/who_what_benchmark/tests/test_cli_image.py +++ b/tools/who_what_benchmark/tests/test_cli_image.py @@ -42,8 +42,8 @@ def teardown_module(): ("hf-internal-testing/tiny-stable-diffusion-torch", "text-to-image", "hf"), ("hf-internal-testing/tiny-stable-diffusion-torch", "text-to-image", "openvino"), ("hf-internal-testing/tiny-stable-diffusion-xl-pipe", "text-to-image", "hf"), - ("hf-internal-testing/tiny-stable-diffusion-torch", "image-inpainting", "hf"), - ("hf-internal-testing/tiny-stable-diffusion-xl-pipe", "image-inpainting", "hf"), + # ("hf-internal-testing/tiny-stable-diffusion-torch", "image-inpainting", "hf"), + # ("hf-internal-testing/tiny-stable-diffusion-xl-pipe", "image-inpainting", "hf"), ], ) def test_image_model_types(model_id, model_type, backend): @@ -88,7 +88,10 @@ def test_image_model_types(model_id, model_type, backend): @pytest.mark.parametrize( ("model_id", "model_type"), list(itertools.product(OV_IMAGE_MODELS, - ["image-to-image", "text-to-image", "image-inpainting"])), + ["image-to-image", + "text-to-image", + # "image-inpainting" + ])), ) def test_image_model_genai(model_id, model_type): with tempfile.TemporaryDirectory() as temp_dir: