Move tests on sampling to test_sampling.py (#1465)
- Moved the extensive decoding / sampling tests from test_llm_pipeline.py to test_sampling.py.
- Partially refactored the common helper functions in common.py to be more generic (to be continued in follow-up PRs).
- Dropped part of the predefined generation-config helper functions and replaced them in tests with plain dicts of generation parameters, so the parameters are visible right next to the tests and new combinations of generation values no longer require ever more get_** helpers (a minimal sketch follows this list).
- Sampling tests are now implemented on top of the stateful model for a better comparison with optimum-intel.
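
As a minimal sketch of the dict-based style (the test name, fixture, prompt, and parameter values below are illustrative, not taken from this commit), a test can parametrize raw generation parameters directly:

    # A minimal sketch of the dict-based parameter style; `ov_pipe` is assumed to be a
    # fixture providing an openvino_genai.LLMPipeline; prompt and values are illustrative.
    import pytest

    @pytest.mark.parametrize("generation_config", [
        dict(max_new_tokens=30),                            # greedy
        dict(max_new_tokens=30, num_beams=4),               # beam search
        dict(max_new_tokens=30, do_sample=True, top_k=50),  # multinomial sampling
    ])
    def test_sampling_with_plain_dicts(ov_pipe, generation_config):
        # parameters live next to the test instead of behind a get_*() helper
        ov_pipe.generate("Why is the Sun yellow?", **generation_config)
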
ilya-lavrenov authored Jan 3, 2025
1 parent 42f3053 commit 1fd1430
Showing 12 changed files with 410 additions and 666 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/mac.yml
@@ -1,4 +1,4 @@
name: macOS (12, Python 3.9)
name: macOS (12, Python 3.10)
on:
workflow_dispatch:
pull_request:
@@ -16,7 +16,7 @@ concurrency:
cancel-in-progress: true

env:
PYTHON_VERSION: '3.9'
PYTHON_VERSION: '3.10'
OV_BRANCH: master
OV_TARBALL: ''

1 change: 1 addition & 0 deletions src/python/openvino_genai/py_openvino_genai.pyi
@@ -697,6 +697,7 @@ class GenerationResult:
"""
m_generation_ids: list[str]
m_scores: list[float]
m_status: GenerationStatus
def __init__(self) -> None:
...
def __repr__(self) -> str:
15 changes: 8 additions & 7 deletions src/python/py_continuous_batching_pipeline.cpp
@@ -119,6 +119,13 @@ std::ostream& operator << (std::ostream& stream, const GenerationResult& generat
} // namespace

void init_continuous_batching_pipeline(py::module_& m) {
py::enum_<ov::genai::GenerationStatus>(m, "GenerationStatus")
.value("RUNNING", ov::genai::GenerationStatus::RUNNING)
.value("FINISHED", ov::genai::GenerationStatus::FINISHED)
.value("IGNORED", ov::genai::GenerationStatus::IGNORED)
.value("DROPPED_BY_PIPELINE", ov::genai::GenerationStatus::DROPPED_BY_PIPELINE)
.value("DROPPED_BY_HANDLE", ov::genai::GenerationStatus::DROPPED_BY_HANDLE);

py::class_<GenerationResult>(m, "GenerationResult", generation_result_docstring)
.def(py::init<>())
.def_readonly("m_request_id", &GenerationResult::m_request_id)
@@ -130,6 +137,7 @@ void init_continuous_batching_pipeline(py::module_& m) {
r.m_generation_ids = generation_ids;
})
.def_readwrite("m_scores", &GenerationResult::m_scores)
.def_readwrite("m_status", &GenerationResult::m_status)
.def("__repr__",
[](const GenerationResult &r) -> py::str {
std::stringstream stream;
@@ -148,13 +156,6 @@ void init_continuous_batching_pipeline(py::module_& m) {
.def_readwrite("m_generation_ids", &EncodedGenerationResult::m_generation_ids)
.def_readwrite("m_scores", &EncodedGenerationResult::m_scores);

py::enum_<ov::genai::GenerationStatus>(m, "GenerationStatus")
.value("RUNNING", ov::genai::GenerationStatus::RUNNING)
.value("FINISHED", ov::genai::GenerationStatus::FINISHED)
.value("IGNORED", ov::genai::GenerationStatus::IGNORED)
.value("DROPPED_BY_PIPELINE", ov::genai::GenerationStatus::DROPPED_BY_PIPELINE)
.value("DROPPED_BY_HANDLE", ov::genai::GenerationStatus::DROPPED_BY_HANDLE);

py::enum_<ov::genai::GenerationFinishReason>(m, "GenerationFinishReason")
.value("NONE", ov::genai::GenerationFinishReason::NONE)
.value("STOP", ov::genai::GenerationFinishReason::STOP)
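
Taken together with the stub change above, the GenerationStatus enum and the new m_status field become visible from Python. A minimal sketch of how a test might inspect them; `cb_pipe`, `prompts`, and `generation_configs` are assumed to exist, and whether GenerationStatus is re-exported at the top-level openvino_genai namespace is also an assumption of this example:

    # A minimal sketch; `cb_pipe` is assumed to be a ContinuousBatchingPipeline and
    # `prompts` / `generation_configs` matching lists of prompts and GenerationConfig objects.
    import openvino_genai as ov_genai

    results = cb_pipe.generate(prompts, generation_configs)
    for result in results:
        # m_status is the field newly exposed on GenerationResult by this commit
        assert result.m_status == ov_genai.GenerationStatus.FINISHED
        print(result.m_generation_ids, result.m_scores)
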
346 changes: 178 additions & 168 deletions tests/python_tests/common.py

Large diffs are not rendered by default.

49 changes: 23 additions & 26 deletions tests/python_tests/ov_genai_test_utils.py
@@ -13,6 +13,8 @@
import shutil
import json

import openvino_genai as ov_genai


def get_models_list():
precommit_models = [
@@ -52,6 +54,7 @@ def get_models_list():

if pytest.selected_model_ids:
model_ids = [model_id for model_id in model_ids if model_id in pytest.selected_model_ids.split(' ')]

# pytest.set_trace()
prefix = pathlib.Path(os.getenv('GENAI_MODELS_PATH_PREFIX', ''))
return [(model_id, prefix / model_id.split('/')[1]) for model_id in model_ids]
@@ -81,77 +84,69 @@ def get_chat_models_list():

@functools.lru_cache(1)
def read_model(params, **tokenizer_kwargs):
model_id, path = params
model_id, models_path = params

from optimum.intel.openvino import OVModelForCausalLM
from transformers import AutoTokenizer
hf_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

if (path / "openvino_model.xml").exists():
opt_model = OVModelForCausalLM.from_pretrained(path, trust_remote_code=True,
if (models_path / "openvino_model.xml").exists():
opt_model = OVModelForCausalLM.from_pretrained(models_path, trust_remote_code=True,
compile=False, device='CPU')
else:
ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(hf_tokenizer,
with_detokenizer=True,
**tokenizer_kwargs)
openvino.save_model(ov_tokenizer, path / "openvino_tokenizer.xml")
openvino.save_model(ov_detokenizer, path / "openvino_detokenizer.xml")
openvino.save_model(ov_tokenizer, models_path / "openvino_tokenizer.xml")
openvino.save_model(ov_detokenizer, models_path / "openvino_detokenizer.xml")

# to store tokenizer config jsons with special tokens
hf_tokenizer.save_pretrained(path)
hf_tokenizer.save_pretrained(models_path)

opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True,
compile=False, device='CPU', load_in_8bit=False)
opt_model.generation_config.save_pretrained(path)
opt_model.config.save_pretrained(path)
opt_model.save_pretrained(path)
opt_model.generation_config.save_pretrained(models_path)
opt_model.config.save_pretrained(models_path)
opt_model.save_pretrained(models_path)

return (
model_id,
path,
models_path,
hf_tokenizer,
opt_model,
ov_genai.LLMPipeline(path, 'CPU', ENABLE_MMAP=False),
ov_genai.LLMPipeline(models_path, 'CPU', ENABLE_MMAP=False),
)


# in OpenVINO GenAI this parameter is called stop_criteria,
# while in HF it's called early_stopping.
# HF values True, False and "never" correspond to OV GenAI values "EARLY", "HEURISTIC" and "NEVER"
STOP_CRITERIA_MAP = {
ov_genai.StopCriteria.NEVER: "never",
ov_genai.StopCriteria.EARLY: True,
ov_genai.StopCriteria.HEURISTIC: False
}


@pytest.fixture(scope="module")
def model_tmp_path(tmpdir_factory):
model_id, path, _, _, _ = read_model(get_models_list()[0])
model_id, models_path, _, _, _ = read_model(get_models_list()[0])
temp_path = tmpdir_factory.mktemp(model_id.replace('/', '_'))

# copy openvino converted model and tokenizers
for pattern in ['*.xml', '*.bin']:
for src_file in path.glob(pattern):
for src_file in models_path.glob(pattern):
if src_file.is_file():
shutil.copy(src_file, temp_path / src_file.name)

yield model_id, Path(temp_path)


@pytest.fixture(scope="module")
def model_tokenizers_tmp_path(tmpdir_factory):
model_id, path, _, _, _ = read_model(get_models_list()[0])
model_id, models_path, _, _, _ = read_model(get_models_list()[0])
temp_path = tmpdir_factory.mktemp(model_id.replace('/', '_'))

# If tokens are not found in the IR, the pipeline falls back to reading them from the config.
# There is no easy way to add tokens to the IR in tests, so we remove them
# and set the tokens in the configs to check that they are read and validated correctly.
import openvino as ov

core = ov.Core()

# copy openvino converted model and tokenizers
for pattern in ['*.xml', '*.bin']:
for src_file in path.glob(pattern):
core = ov.Core()
for src_file in models_path.glob(pattern):

# Update files if they are openvino_tokenizer.xml or openvino_detokenizer.xml
if src_file.name in ['openvino_tokenizer.xml', 'openvino_detokenizer.xml']:
@@ -166,8 +161,10 @@ def model_tokenizers_tmp_path(tmpdir_factory):

if src_file in ['openvino_tokenizer.bin', 'openvino_detokenizer.bin']:
continue

if src_file.is_file():
shutil.copy(src_file, temp_path / src_file.name)

yield model_id, Path(temp_path)


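
For reference, a hedged sketch of how a test consumes the renamed tuple returned by read_model(); the prompt and token budget are illustrative:

    # A minimal sketch of unpacking the renamed read_model() tuple; the prompt is illustrative.
    from ov_genai_test_utils import get_models_list, read_model

    model_id, models_path, hf_tokenizer, opt_model, ov_pipe = read_model(get_models_list()[0])
    # ov_pipe is an openvino_genai.LLMPipeline built from models_path on CPU
    print(ov_pipe.generate("Why is the Sun yellow?", max_new_tokens=20))
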
38 changes: 17 additions & 21 deletions tests/python_tests/test_continuous_batching.py
@@ -9,8 +9,8 @@
from pathlib import Path
from openvino_genai import ContinuousBatchingPipeline, GenerationConfig, Tokenizer

from common import get_hugging_face_model_and_tokenizer, save_ov_model_from_optimum, generate_and_compare_with_reference_text, \
get_scheduler_config, get_greedy, run_continuous_batching_pipeline_test, get_beam_search, get_greedy, \
from common import get_hugging_face_models, convert_models, generate_and_compare_with_reference_text, \
get_scheduler_config, get_greedy, run_cb_pipeline_with_ref, get_beam_search, get_greedy, \
get_multinomial_all_parameters, get_multinomial_temperature_and_num_return_sequence, \
get_multinomial_temperature_and_top_k, get_multinomial_temperature, get_multinomial_temperature_and_top_p
from test_sampling import RandomSamplingTestStruct, get_current_platform_ref_texts
@@ -39,19 +39,19 @@ def read_models_list(file_name: str):
@pytest.mark.precommit
@pytest.mark.parametrize("model_id", read_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "precommit")))
def test_e2e_precommit(tmp_path, model_id):
run_continuous_batching_pipeline_test(tmp_path, model_id)
run_cb_pipeline_with_ref(tmp_path, model_id)


@pytest.mark.nightly
@pytest.mark.parametrize("model_id", read_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "nightly")))
def test_e2e_nightly(tmp_path, model_id):
run_continuous_batching_pipeline_test(tmp_path, model_id)
run_cb_pipeline_with_ref(tmp_path, model_id)


@pytest.mark.real_models
@pytest.mark.parametrize("model_id", read_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "real_models")))
def test_e2e_real_models(tmp_path, model_id):
run_continuous_batching_pipeline_test(tmp_path, model_id)
run_cb_pipeline_with_ref(tmp_path, model_id)

#
# Comparison with stateful
@@ -77,8 +77,8 @@ def test_continuous_batching_vs_stateful(prompt, generation_config):
"facebook/opt-125m",
Path("opt-125m")
))
cb = get_continuous_batching(path)
generated = cb.generate(prompt, **generation_config)
cb_pipe = get_continuous_batching(path)
generated = cb_pipe.generate(prompt, **generation_config)
reference = stateful.generate(prompt, **generation_config)
assert generated.texts == reference.texts
if 1 != generation_config.get("num_return_sequences", 1):
@@ -117,8 +117,8 @@ def test_cb_streamer_vs_return_vs_stateful(prompt):
@pytest.mark.parametrize("model_descr", get_chat_models_list())
@pytest.mark.precommit
def test_chat_scenario_vs_stateful(model_descr, generation_config_kwargs: Dict):
model_id, path, hf_tokenizer, opt_model, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'))
cb_pipe = get_continuous_batching(path)
model_id, models_path, hf_tokenizer, opt_model, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'))
cb_pipe = get_continuous_batching(models_path)

ov_pipe.start_chat()
cb_pipe.start_chat()
@@ -150,10 +150,10 @@ def test_post_oom_health(tmp_path, sampling_config):
scheduler_config.num_kv_blocks = 10 # Low cache size to trigger OOM quickly

model_id : str = "facebook/opt-125m"
opt_model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True)
opt_model, hf_tokenizer = get_hugging_face_models(model_id)

models_path : Path = tmp_path / model_id
save_ov_model_from_optimum(opt_model, hf_tokenizer, models_path)
convert_models(opt_model, hf_tokenizer, models_path)

cb_pipe = ContinuousBatchingPipeline(models_path, Tokenizer(models_path), scheduler_config, "CPU")

@@ -201,7 +201,7 @@ def get_beam_search_seq_len_300() -> GenerationConfig:
@pytest.mark.parametrize("params", scheduler_params_list)
@pytest.mark.precommit
def test_preemption(tmp_path, params):
run_continuous_batching_pipeline_test(tmp_path, "facebook/opt-125m", scheduler_params=params[0], generation_config=params[1])
run_cb_pipeline_with_ref(tmp_path, "facebook/opt-125m", scheduler_params=params[0], generation_config=params[1])


multinomial_params = RandomSamplingTestStruct(
@@ -249,13 +249,12 @@ def test_preemption(tmp_path, params):
def test_preemption_with_multinomial(tmp_path, dynamic_split_fuse):
generation_configs = multinomial_params.generation_config
for config in generation_configs:
config.rng_seed = 0
config.max_new_tokens = 30
model_id : str = "facebook/opt-125m"
model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True)
model, hf_tokenizer = get_hugging_face_models(model_id)

models_path : Path = tmp_path / model_id
save_ov_model_from_optimum(model, hf_tokenizer, models_path)
convert_models(model, hf_tokenizer, models_path)

scheduler_config = get_scheduler_config({"num_kv_blocks": 3, "dynamic_split_fuse": dynamic_split_fuse, "max_num_batched_tokens": 256, "max_num_seqs": 256})
generate_and_compare_with_reference_text(models_path, multinomial_params.prompts, multinomial_params.ref_texts, generation_configs, scheduler_config)
@@ -329,15 +328,12 @@ def test_preemption_with_multinomial(tmp_path, dynamic_split_fuse):
@pytest.mark.precommit
@pytest.mark.skip(reason="Random sampling results are non deterministic due to: discrete_distribution impl depends on platform, model inference results may depend on CPU. Test passes on CI but fails locally.")
def test_preemption_with_multinomial_n_seq(tmp_path, dynamic_split_fuse):
generation_configs = multinomial_params_n_seq.generation_config
for config in generation_configs:
config.rng_seed = 0
model_id : str = "facebook/opt-125m"
model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True)
opt_model, hf_tokenizer = get_hugging_face_models(model_id)

models_path : Path = tmp_path / model_id
save_ov_model_from_optimum(model, hf_tokenizer, models_path)
convert_models(opt_model, hf_tokenizer, models_path)

# needed kv_blocks - 16 (2 blocks per sequence (30 tokens to generated text + prompt (> 2 tokens)) * (1 + 3 + 4) seq )
scheduler_config = get_scheduler_config({"num_kv_blocks": 8, "dynamic_split_fuse": dynamic_split_fuse, "max_num_batched_tokens": 256, "max_num_seqs": 256})
generate_and_compare_with_reference_text(models_path, multinomial_params_n_seq.prompts, multinomial_params_n_seq.ref_texts, generation_configs, scheduler_config)
generate_and_compare_with_reference_text(models_path, multinomial_params_n_seq.prompts, multinomial_params_n_seq.ref_texts, multinomial_params_n_seq.generation_config, scheduler_config)
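
The renamed helper keeps the same calling convention; a hedged sketch of adding another case (the scheduler values below are illustrative, not part of this commit):

    # A minimal sketch reusing the renamed helper; the scheduler values are illustrative.
    from common import get_greedy, run_cb_pipeline_with_ref

    def test_preemption_extra_case(tmp_path):
        run_cb_pipeline_with_ref(
            tmp_path,
            "facebook/opt-125m",
            scheduler_params={"num_kv_blocks": 2, "dynamic_split_fuse": True},
            generation_config=get_greedy(),
        )
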
6 changes: 4 additions & 2 deletions tests/python_tests/test_kv_cache_eviction.py
@@ -15,7 +15,7 @@
from openvino import serialize
from transformers import AutoTokenizer

from common import TESTS_ROOT, run_continuous_batching_pipeline_test
from common import TESTS_ROOT, run_cb_pipeline_with_ref


def load_prompts_dataset(file_name : str) -> Dict[str, List[str]]:
@@ -150,6 +150,7 @@ def get_greedy_seq_len_300() -> GenerationConfig:
generation_config.max_new_tokens = 300
return generation_config


def get_beam_search_seq_len_300() -> GenerationConfig:
generation_config = GenerationConfig()
generation_config.num_beam_groups = 3
@@ -159,6 +160,7 @@ def get_beam_search_seq_len_300() -> GenerationConfig:
generation_config.num_return_sequences = generation_config.num_beams
return generation_config


scheduler_params_list = [
({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": True, "enable_prefix_caching": True}, get_greedy_seq_len_300()),
({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": False, "max_num_batched_tokens": 600, "enable_prefix_caching": True}, get_beam_search_seq_len_300()),
@@ -168,5 +170,5 @@ def get_beam_search_seq_len_300() -> GenerationConfig:
@pytest.mark.parametrize("params", scheduler_params_list)
@pytest.mark.precommit
def test_dynamic_memory_allocation(tmp_path, params):
run_continuous_batching_pipeline_test(tmp_path, "facebook/opt-125m", params[0], params[1])
run_cb_pipeline_with_ref(tmp_path, "facebook/opt-125m", scheduler_params=params[0], generation_config=params[1])
