diff --git a/src/cpp/src/cache_manager.hpp b/src/cpp/src/cache_manager.hpp
index 20d4c0c51c..160a1fa55a 100644
--- a/src/cpp/src/cache_manager.hpp
+++ b/src/cpp/src/cache_manager.hpp
@@ -16,8 +16,8 @@ class CacheManager {
     std::vector<ov::Tensor> m_key_cache;
     std::vector<ov::Tensor> m_value_cache;
     size_t m_num_allocated_kv_blocks = 0;
-    ov::Core m_core;
     ov::InferRequest m_request;
+    ov::Core m_core;

     ov::Shape set_first_dim_and_make_static(const ov::PartialShape& shape, size_t dim) {
         ov::PartialShape res_shape = shape;
diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp
index 8f33fd83db..35d3115ce6 100644
--- a/src/cpp/src/continuous_batching_impl.cpp
+++ b/src/cpp/src/continuous_batching_impl.cpp
@@ -65,11 +65,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::initialize_pipeline(
     m_num_decoder_layers = device_config.get_num_layers();

     // setup KV caches
-    m_cache_manager = std::make_shared<CacheManager>(device_config, core);
-    for (size_t decoder_layer_id = 0; decoder_layer_id < m_num_decoder_layers; ++decoder_layer_id) {
-        infer_request.set_tensor(std::string("key_cache.") + std::to_string(decoder_layer_id), m_cache_manager->get_key_cache(decoder_layer_id));
-        infer_request.set_tensor(std::string("value_cache.") + std::to_string(decoder_layer_id), m_cache_manager->get_value_cache(decoder_layer_id));
-    }
+    m_cache_manager = std::make_shared<CacheManager>(device_config, infer_request, core);

     SchedulerConfig updated_config = scheduler_config;
     // update KV blocks number in scheduler config
@@ -85,8 +81,6 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::initialize_pipeline(
     }

     m_scheduler = std::make_shared<Scheduler>(device_config.get_block_size(), m_cache_manager, updated_config, device_config.get_num_layers(), can_use_partial_preemption);
-    m_scheduler = std::make_shared<Scheduler>(device_config.get_block_size(), updated_config, m_num_decoder_layers, can_use_partial_preemption);
-
     // and finally create model runner
     bool is_use_cache_eviction = m_scheduler->get_config().use_cache_eviction;
     if (is_use_cache_eviction) {
@@ -204,6 +198,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::step() {
     _register_step_cache_usage(scheduler_output.m_cache_usage);
     m_pipeline_metrics.avg_cache_usage = _get_current_running_average_cache_usage();

+    const auto& sched_config = m_scheduler->get_config();
     if (sched_config.use_cache_eviction && sched_config.cache_eviction_config.apply_rotation) {
         _compute_cache_rotation_data(m_requests, scheduler_output);
         m_model_runner->set_cache_rotation_data(std::move(m_current_step_rotated_block_indices_per_sequence),
diff --git a/src/cpp/src/continuous_batching_impl.hpp b/src/cpp/src/continuous_batching_impl.hpp
index cd967acecb..914806c400 100644
--- a/src/cpp/src/continuous_batching_impl.hpp
+++ b/src/cpp/src/continuous_batching_impl.hpp
@@ -58,10 +58,6 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc
                             const ov::AnyMap& plugin_config,
                             const DeviceConfig& device_config,
                             ov::Core& core);
-    void _register_step_cache_usage(float step_cache_usage);
-    float _get_current_running_average_cache_usage() const;
-    void _maybe_evict_cache_blocks(const SchedulerConfig& sched_config);
-    void _compute_cache_rotation_data(const std::vector<SequenceGroup::Ptr>& sequence_groups, const Scheduler::Output& scheduler_output);

     /**
@@ -92,6 +88,7 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc
     void _register_step_cache_usage(float step_cache_usage);
     float _get_current_running_average_cache_usage() const;
+    void _compute_cache_rotation_data(const std::vector<SequenceGroup::Ptr>& sequence_groups, const Scheduler::Output& scheduler_output);

 public:
     ContinuousBatchingImpl(const std::shared_ptr<ov::Model>& model,
diff --git a/src/cpp/src/device_config.hpp b/src/cpp/src/device_config.hpp
index 3d768bc544..03c10b205d 100644
--- a/src/cpp/src/device_config.hpp
+++ b/src/cpp/src/device_config.hpp
@@ -158,8 +158,6 @@ class DeviceConfig {
        return m_head_size;
    }

-    }
-
    ov::PartialShape get_value_cache_shape(size_t id) const {
        OPENVINO_ASSERT(m_value_cache_shape.size());
        return m_value_cache_shape[id];
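The net effect of the C++ hunks above is that KV-cache tensor binding moves out of `initialize_pipeline()` and into `CacheManager`, whose constructor now also receives the `ov::InferRequest`. The sketch below only illustrates the intended shape of that constructor; the member list, the allocation step, and the use of `device_config.get_num_layers()` are assumptions, not code from this patch. The one grounded piece is the `set_tensor()` loop, which mirrors the lines deleted from `continuous_batching_impl.cpp`.

```cpp
#include <string>
#include <vector>

#include "openvino/openvino.hpp"
#include "device_config.hpp"  // project header touched by this patch

// Minimal sketch, not the actual implementation from this change.
class CacheManager {
    std::vector<ov::Tensor> m_key_cache;
    std::vector<ov::Tensor> m_value_cache;
    ov::InferRequest m_request;
    ov::Core m_core;

public:
    CacheManager(const DeviceConfig& device_config, ov::InferRequest request, ov::Core core)
        : m_request(request), m_core(core) {
        // Real code allocates properly shaped per-layer KV tensors; empty
        // placeholder tensors stand in for them in this sketch.
        m_key_cache.resize(device_config.get_num_layers());
        m_value_cache.resize(device_config.get_num_layers());

        // Bind each layer's cache tensor to the request, which is what the
        // loop removed from initialize_pipeline() used to do.
        for (size_t layer_id = 0; layer_id < m_key_cache.size(); ++layer_id) {
            m_request.set_tensor("key_cache." + std::to_string(layer_id), m_key_cache[layer_id]);
            m_request.set_tensor("value_cache." + std::to_string(layer_id), m_value_cache[layer_id]);
        }
    }
};
```

Keeping the request inside `CacheManager` means the pipeline no longer needs to know the per-layer tensor naming scheme at all; it just constructs the manager and lets it own the binding.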
+@pytest.mark.parametrize("sample_from_dataset", [*get_fixture_params_for_n_whisper_dataset_samples(n=1)], indirect=True) @pytest.mark.precommit -def test_return_timestamps_max_new_tokens_short_form(model_descr, test_sample): +def test_return_timestamps_max_new_tokens_short_form(model_descr, sample_from_dataset): run_pipeline_with_ref( model_id=model_descr[0], tmp_path=model_descr[1], - sample=test_sample, + sample=sample_from_dataset, generation_config=ov_genai.WhisperGenerationConfig( return_timestamps=True, language="en", max_new_tokens=30 ), @@ -464,25 +459,23 @@ def test_return_timestamps_max_new_tokens_short_form(model_descr, test_sample): @pytest.mark.parametrize("model_descr", get_whisper_models_list()) -@pytest.mark.parametrize( - "test_sample", get_samples_from_dataset(length=10, long_form=True) -) +@pytest.mark.parametrize("sample_from_dataset", [*get_fixture_params_for_n_whisper_dataset_samples(n=10, long_form=True)], indirect=True) @pytest.mark.precommit -def test_longform_audio(model_descr, test_sample): +def test_longform_audio(model_descr, sample_from_dataset): _, _, hf_pipe, genai_pipe = read_whisper_model(model_descr) streamer_result = [] genai_result = run_genai( genai_pipe, - test_sample, + sample_from_dataset, config=ov_genai.WhisperGenerationConfig(return_timestamps=True), streamer=lambda x: streamer_result.append(x), ) hf_result = run_huggingface( hf_pipe, - test_sample, + sample_from_dataset, config=ov_genai.WhisperGenerationConfig(return_timestamps=True), ) @@ -520,7 +513,8 @@ def test_initial_prompt_hotwords(model_descr, sample_from_dataset): assert "Joel Keaton" in result.texts[0] assert "Joel Kyton" not in result.texts[0] - result = pipe.generate(test_sample, initial_prompt="Joel Kyton") + result = pipe.generate(sample_from_dataset +, initial_prompt="Joel Kyton") assert "Joel Keaton" not in result.texts[0] assert "Joel Kyton" in result.texts[0]