Move tests on sampling to test_sampling.py (#1465)
- Moved the extensive decoding / sampling tests from test_llm_pipeline.py to test_sampling.py.
- Partially refactored the common helper functions in common.py to be more generic (to be continued in follow-up PRs).
- Dropped part of the predefined generation-config helper functions and replaced them in tests with plain dicts of generation parameters, so the parameters are visible right next to the tests and new combinations of generation values no longer require ever more get_** helpers (a minimal sketch follows this list).
- Sampling tests are now implemented on top of the stateful model for a better comparison with optimum-intel.
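
As a minimal sketch of the dict-based style (the test name, fixture, prompt, and parameter values below are illustrative, not taken from this commit), a test can parametrize raw generation parameters directly:

    # A minimal sketch of the dict-based parameter style; `ov_pipe` is assumed to be a
    # fixture providing an openvino_genai.LLMPipeline; prompt and values are illustrative.
    import pytest

    @pytest.mark.parametrize("generation_config", [
        dict(max_new_tokens=30),                            # greedy
        dict(max_new_tokens=30, num_beams=4),               # beam search
        dict(max_new_tokens=30, do_sample=True, top_k=50),  # multinomial sampling
    ])
    def test_sampling_with_plain_dicts(ov_pipe, generation_config):
        # parameters live next to the test instead of behind a get_*() helper
        ov_pipe.generate("Why is the Sun yellow?", **generation_config)
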
ilya-lavrenov authored Jan 3, 2025
1 parent 42f3053 commit 1fd1430
Showing 12 changed files with 410 additions and 666 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/mac.yml
@@ -1,4 +1,4 @@
name: macOS (12, Python 3.9)
name: macOS (12, Python 3.10)
on:
workflow_dispatch:
pull_request:
@@ -16,7 +16,7 @@ concurrency:
cancel-in-progress: true

env:
PYTHON_VERSION: '3.9'
PYTHON_VERSION: '3.10'
OV_BRANCH: master
OV_TARBALL: ''

1 change: 1 addition & 0 deletions src/python/openvino_genai/py_openvino_genai.pyi
@@ -697,6 +697,7 @@ class GenerationResult:
"""
m_generation_ids: list[str]
m_scores: list[float]
m_status: GenerationStatus
def __init__(self) -> None:
...
def __repr__(self) -> str:
15 changes: 8 additions & 7 deletions src/python/py_continuous_batching_pipeline.cpp
@@ -119,6 +119,13 @@ std::ostream& operator << (std::ostream& stream, const GenerationResult& generat
} // namespace

void init_continuous_batching_pipeline(py::module_& m) {
py::enum_<ov::genai::GenerationStatus>(m, "GenerationStatus")
.value("RUNNING", ov::genai::GenerationStatus::RUNNING)
.value("FINISHED", ov::genai::GenerationStatus::FINISHED)
.value("IGNORED", ov::genai::GenerationStatus::IGNORED)
.value("DROPPED_BY_PIPELINE", ov::genai::GenerationStatus::DROPPED_BY_PIPELINE)
.value("DROPPED_BY_HANDLE", ov::genai::GenerationStatus::DROPPED_BY_HANDLE);

py::class_<GenerationResult>(m, "GenerationResult", generation_result_docstring)
.def(py::init<>())
.def_readonly("m_request_id", &GenerationResult::m_request_id)
@@ -130,6 +137,7 @@ void init_continuous_batching_pipeline(py::module_& m) {
r.m_generation_ids = generation_ids;
})
.def_readwrite("m_scores", &GenerationResult::m_scores)
.def_readwrite("m_status", &GenerationResult::m_status)
.def("__repr__",
[](const GenerationResult &r) -> py::str {
std::stringstream stream;
@@ -148,13 +156,6 @@ void init_continuous_batching_pipeline(py::module_& m) {
.def_readwrite("m_generation_ids", &EncodedGenerationResult::m_generation_ids)
.def_readwrite("m_scores", &EncodedGenerationResult::m_scores);

py::enum_<ov::genai::GenerationStatus>(m, "GenerationStatus")
.value("RUNNING", ov::genai::GenerationStatus::RUNNING)
.value("FINISHED", ov::genai::GenerationStatus::FINISHED)
.value("IGNORED", ov::genai::GenerationStatus::IGNORED)
.value("DROPPED_BY_PIPELINE", ov::genai::GenerationStatus::DROPPED_BY_PIPELINE)
.value("DROPPED_BY_HANDLE", ov::genai::GenerationStatus::DROPPED_BY_HANDLE);

py::enum_<ov::genai::GenerationFinishReason>(m, "GenerationFinishReason")
.value("NONE", ov::genai::GenerationFinishReason::NONE)
.value("STOP", ov::genai::GenerationFinishReason::STOP)
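
Taken together with the stub change above, the GenerationStatus enum and the new m_status field become visible from Python. A minimal sketch of how a test might inspect them; `cb_pipe`, `prompts`, and `generation_configs` are assumed to exist, and whether GenerationStatus is re-exported at the top-level openvino_genai namespace is also an assumption of this example:

    # A minimal sketch; `cb_pipe` is assumed to be a ContinuousBatchingPipeline and
    # `prompts` / `generation_configs` matching lists of prompts and GenerationConfig objects.
    import openvino_genai as ov_genai

    results = cb_pipe.generate(prompts, generation_configs)
    for result in results:
        # m_status is the field newly exposed on GenerationResult by this commit
        assert result.m_status == ov_genai.GenerationStatus.FINISHED
        print(result.m_generation_ids, result.m_scores)
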
346 changes: 178 additions & 168 deletions tests/python_tests/common.py

Large diffs are not rendered by default.

49 changes: 23 additions & 26 deletions tests/python_tests/ov_genai_test_utils.py
@@ -13,6 +13,8 @@
import shutil
import json

import openvino_genai as ov_genai


def get_models_list():
precommit_models = [
@@ -52,6 +54,7 @@ def get_models_list():

if pytest.selected_model_ids:
model_ids = [model_id for model_id in model_ids if model_id in pytest.selected_model_ids.split(' ')]

# pytest.set_trace()
prefix = pathlib.Path(os.getenv('GENAI_MODELS_PATH_PREFIX', ''))
return [(model_id, prefix / model_id.split('/')[1]) for model_id in model_ids]
@@ -81,77 +84,69 @@ def get_chat_models_list():

@functools.lru_cache(1)
def read_model(params, **tokenizer_kwargs):
model_id, path = params
model_id, models_path = params

from optimum.intel.openvino import OVModelForCausalLM
from transformers import AutoTokenizer
hf_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

if (path / "openvino_model.xml").exists():
opt_model = OVModelForCausalLM.from_pretrained(path, trust_remote_code=True,
if (models_path / "openvino_model.xml").exists():
opt_model = OVModelForCausalLM.from_pretrained(models_path, trust_remote_code=True,
compile=False, device='CPU')
else:
ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(hf_tokenizer,
with_detokenizer=True,
**tokenizer_kwargs)
openvino.save_model(ov_tokenizer, path / "openvino_tokenizer.xml")
openvino.save_model(ov_detokenizer, path / "openvino_detokenizer.xml")
openvino.save_model(ov_tokenizer, models_path / "openvino_tokenizer.xml")
openvino.save_model(ov_detokenizer, models_path / "openvino_detokenizer.xml")

# to store tokenizer config jsons with special tokens
hf_tokenizer.save_pretrained(path)
hf_tokenizer.save_pretrained(models_path)

opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True,
compile=False, device='CPU', load_in_8bit=False)
opt_model.generation_config.save_pretrained(path)
opt_model.config.save_pretrained(path)
opt_model.save_pretrained(path)
opt_model.generation_config.save_pretrained(models_path)
opt_model.config.save_pretrained(models_path)
opt_model.save_pretrained(models_path)

return (
model_id,
path,
models_path,
hf_tokenizer,
opt_model,
ov_genai.LLMPipeline(path, 'CPU', ENABLE_MMAP=False),
ov_genai.LLMPipeline(models_path, 'CPU', ENABLE_MMAP=False),
)


# in OpenVINO GenAI this parameter is called stop_criteria,
# while in HF it's called early_stopping.
# HF values True, False and "never" correspond to OV GenAI values "EARLY", "HEURISTIC" and "NEVER"
STOP_CRITERIA_MAP = {
ov_genai.StopCriteria.NEVER: "never",
ov_genai.StopCriteria.EARLY: True,
ov_genai.StopCriteria.HEURISTIC: False
}


@pytest.fixture(scope="module")
def model_tmp_path(tmpdir_factory):
model_id, path, _, _, _ = read_model(get_models_list()[0])
model_id, models_path, _, _, _ = read_model(get_models_list()[0])
temp_path = tmpdir_factory.mktemp(model_id.replace('/', '_'))

# copy openvino converted model and tokenizers
for pattern in ['*.xml', '*.bin']:
for src_file in path.glob(pattern):
for src_file in models_path.glob(pattern):
if src_file.is_file():
shutil.copy(src_file, temp_path / src_file.name)

yield model_id, Path(temp_path)


@pytest.fixture(scope="module")
def model_tokenizers_tmp_path(tmpdir_factory):
model_id, path, _, _, _ = read_model(get_models_list()[0])
model_id, models_path, _, _, _ = read_model(get_models_list()[0])
temp_path = tmpdir_factory.mktemp(model_id.replace('/', '_'))

# If tokens are not found in the IR, the pipeline falls back to reading them from the config.
# There is no easy way to add tokens to the IR in tests, so we remove them
# and set the tokens in the configs to check that they are read and validated correctly.
import openvino as ov

core = ov.Core()

# copy openvino converted model and tokenizers
for pattern in ['*.xml', '*.bin']:
for src_file in path.glob(pattern):
core = ov.Core()
for src_file in models_path.glob(pattern):

# Update files if they are openvino_tokenizer.xml or openvino_detokenizer.xml
if src_file.name in ['openvino_tokenizer.xml', 'openvino_detokenizer.xml']:
@@ -166,8 +161,10 @@ def model_tokenizers_tmp_path(tmpdir_factory):

if src_file in ['openvino_tokenizer.bin', 'openvino_detokenizer.bin']:
continue

if src_file.is_file():
shutil.copy(src_file, temp_path / src_file.name)

yield model_id, Path(temp_path)


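
For reference, a hedged sketch of how a test consumes the renamed tuple returned by read_model(); the prompt and token budget are illustrative:

    # A minimal sketch of unpacking the renamed read_model() tuple; the prompt is illustrative.
    from ov_genai_test_utils import get_models_list, read_model

    model_id, models_path, hf_tokenizer, opt_model, ov_pipe = read_model(get_models_list()[0])
    # ov_pipe is an openvino_genai.LLMPipeline built from models_path on CPU
    print(ov_pipe.generate("Why is the Sun yellow?", max_new_tokens=20))
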
38 changes: 17 additions & 21 deletions tests/python_tests/test_continuous_batching.py
@@ -9,8 +9,8 @@
from pathlib import Path
from openvino_genai import ContinuousBatchingPipeline, GenerationConfig, Tokenizer

from common import get_hugging_face_model_and_tokenizer, save_ov_model_from_optimum, generate_and_compare_with_reference_text, \
get_scheduler_config, get_greedy, run_continuous_batching_pipeline_test, get_beam_search, get_greedy, \
from common import get_hugging_face_models, convert_models, generate_and_compare_with_reference_text, \
get_scheduler_config, get_greedy, run_cb_pipeline_with_ref, get_beam_search, get_greedy, \
get_multinomial_all_parameters, get_multinomial_temperature_and_num_return_sequence, \
get_multinomial_temperature_and_top_k, get_multinomial_temperature, get_multinomial_temperature_and_top_p
from test_sampling import RandomSamplingTestStruct, get_current_platform_ref_texts
@@ -39,19 +39,19 @@ def read_models_list(file_name: str):
@pytest.mark.precommit
@pytest.mark.parametrize("model_id", read_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "precommit")))
def test_e2e_precommit(tmp_path, model_id):
run_continuous_batching_pipeline_test(tmp_path, model_id)
run_cb_pipeline_with_ref(tmp_path, model_id)


@pytest.mark.nightly
@pytest.mark.parametrize("model_id", read_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "nightly")))
def test_e2e_nightly(tmp_path, model_id):
run_continuous_batching_pipeline_test(tmp_path, model_id)
run_cb_pipeline_with_ref(tmp_path, model_id)


@pytest.mark.real_models
@pytest.mark.parametrize("model_id", read_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "real_models")))
def test_e2e_real_models(tmp_path, model_id):
run_continuous_batching_pipeline_test(tmp_path, model_id)
run_cb_pipeline_with_ref(tmp_path, model_id)

#
# Comparison with stateful
@@ -77,8 +77,8 @@ def test_continuous_batching_vs_stateful(prompt, generation_config):
"facebook/opt-125m",
Path("opt-125m")
))
cb = get_continuous_batching(path)
generated = cb.generate(prompt, **generation_config)
cb_pipe = get_continuous_batching(path)
generated = cb_pipe.generate(prompt, **generation_config)
reference = stateful.generate(prompt, **generation_config)
assert generated.texts == reference.texts
if 1 != generation_config.get("num_return_sequences", 1):
@@ -117,8 +117,8 @@ def test_cb_streamer_vs_return_vs_stateful(prompt):
@pytest.mark.parametrize("model_descr", get_chat_models_list())
@pytest.mark.precommit
def test_chat_scenario_vs_stateful(model_descr, generation_config_kwargs: Dict):
model_id, path, hf_tokenizer, opt_model, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'))
cb_pipe = get_continuous_batching(path)
model_id, models_path, hf_tokenizer, opt_model, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'))
cb_pipe = get_continuous_batching(models_path)

ov_pipe.start_chat()
cb_pipe.start_chat()
@@ -150,10 +150,10 @@ def test_post_oom_health(tmp_path, sampling_config):
scheduler_config.num_kv_blocks = 10 # Low cache size to trigger OOM quickly

model_id : str = "facebook/opt-125m"
opt_model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True)
opt_model, hf_tokenizer = get_hugging_face_models(model_id)

models_path : Path = tmp_path / model_id
save_ov_model_from_optimum(opt_model, hf_tokenizer, models_path)
convert_models(opt_model, hf_tokenizer, models_path)

cb_pipe = ContinuousBatchingPipeline(models_path, Tokenizer(models_path), scheduler_config, "CPU")

@@ -201,7 +201,7 @@ def get_beam_search_seq_len_300() -> GenerationConfig:
@pytest.mark.parametrize("params", scheduler_params_list)
@pytest.mark.precommit
def test_preemption(tmp_path, params):
run_continuous_batching_pipeline_test(tmp_path, "facebook/opt-125m", scheduler_params=params[0], generation_config=params[1])
run_cb_pipeline_with_ref(tmp_path, "facebook/opt-125m", scheduler_params=params[0], generation_config=params[1])


multinomial_params = RandomSamplingTestStruct(
@@ -249,13 +249,12 @@ def test_preemption(tmp_path, params):
def test_preemption_with_multinomial(tmp_path, dynamic_split_fuse):
generation_configs = multinomial_params.generation_config
for config in generation_configs:
config.rng_seed = 0
config.max_new_tokens = 30
model_id : str = "facebook/opt-125m"
model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True)
model, hf_tokenizer = get_hugging_face_models(model_id)

models_path : Path = tmp_path / model_id
save_ov_model_from_optimum(model, hf_tokenizer, models_path)
convert_models(model, hf_tokenizer, models_path)

scheduler_config = get_scheduler_config({"num_kv_blocks": 3, "dynamic_split_fuse": dynamic_split_fuse, "max_num_batched_tokens": 256, "max_num_seqs": 256})
generate_and_compare_with_reference_text(models_path, multinomial_params.prompts, multinomial_params.ref_texts, generation_configs, scheduler_config)
@@ -329,15 +328,12 @@ def test_preemption_with_multinomial(tmp_path, dynamic_split_fuse):
@pytest.mark.precommit
@pytest.mark.skip(reason="Random sampling results are non deterministic due to: discrete_distribution impl depends on platform, model inference results may depend on CPU. Test passes on CI but fails locally.")
def test_preemption_with_multinomial_n_seq(tmp_path, dynamic_split_fuse):
generation_configs = multinomial_params_n_seq.generation_config
for config in generation_configs:
config.rng_seed = 0
model_id : str = "facebook/opt-125m"
model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True)
opt_model, hf_tokenizer = get_hugging_face_models(model_id)

models_path : Path = tmp_path / model_id
save_ov_model_from_optimum(model, hf_tokenizer, models_path)
convert_models(opt_model, hf_tokenizer, models_path)

# needed kv_blocks - 16 (2 blocks per sequence (30 tokens to generated text + prompt (> 2 tokens)) * (1 + 3 + 4) seq )
scheduler_config = get_scheduler_config({"num_kv_blocks": 8, "dynamic_split_fuse": dynamic_split_fuse, "max_num_batched_tokens": 256, "max_num_seqs": 256})
generate_and_compare_with_reference_text(models_path, multinomial_params_n_seq.prompts, multinomial_params_n_seq.ref_texts, generation_configs, scheduler_config)
generate_and_compare_with_reference_text(models_path, multinomial_params_n_seq.prompts, multinomial_params_n_seq.ref_texts, multinomial_params_n_seq.generation_config, scheduler_config)
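
The renamed helper keeps the same calling convention; a hedged sketch of adding another case (the scheduler values below are illustrative, not part of this commit):

    # A minimal sketch reusing the renamed helper; the scheduler values are illustrative.
    from common import get_greedy, run_cb_pipeline_with_ref

    def test_preemption_extra_case(tmp_path):
        run_cb_pipeline_with_ref(
            tmp_path,
            "facebook/opt-125m",
            scheduler_params={"num_kv_blocks": 2, "dynamic_split_fuse": True},
            generation_config=get_greedy(),
        )
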
6 changes: 4 additions & 2 deletions tests/python_tests/test_kv_cache_eviction.py
@@ -15,7 +15,7 @@
from openvino import serialize
from transformers import AutoTokenizer

from common import TESTS_ROOT, run_continuous_batching_pipeline_test
from common import TESTS_ROOT, run_cb_pipeline_with_ref


def load_prompts_dataset(file_name : str) -> Dict[str, List[str]]:
@@ -150,6 +150,7 @@ def get_greedy_seq_len_300() -> GenerationConfig:
generation_config.max_new_tokens = 300
return generation_config


def get_beam_search_seq_len_300() -> GenerationConfig:
generation_config = GenerationConfig()
generation_config.num_beam_groups = 3
@@ -159,6 +160,7 @@ def get_beam_search_seq_len_300() -> GenerationConfig:
generation_config.num_return_sequences = generation_config.num_beams
return generation_config


scheduler_params_list = [
({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": True, "enable_prefix_caching": True}, get_greedy_seq_len_300()),
({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": False, "max_num_batched_tokens": 600, "enable_prefix_caching": True}, get_beam_search_seq_len_300()),
@@ -168,5 +170,5 @@ def get_beam_search_seq_len_300() -> GenerationConfig:
@pytest.mark.parametrize("params", scheduler_params_list)
@pytest.mark.precommit
def test_dynamic_memory_allocation(tmp_path, params):
run_continuous_batching_pipeline_test(tmp_path, "facebook/opt-125m", params[0], params[1])
run_cb_pipeline_with_ref(tmp_path, "facebook/opt-125m", scheduler_params=params[0], generation_config=params[1])
