From 3ed69638c56cd4164681f33cf0a24296de65e439 Mon Sep 17 00:00:00 2001
From: Pavel Esir
Date: Mon, 23 Dec 2024 10:16:31 +0100
Subject: [PATCH 1/6] remove redundant `.tolist()` (#1419)

![image](https://github.com/user-attachments/assets/77013e49-d1bd-4f3a-99aa-1d17e9b8f6b5)

- Fix: remove the redundant `.tolist()` calls, since the conversion is already done above.

---------

Co-authored-by: Ilya Lavrenov
---
 tools/llm_bench/task/text_generation.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/llm_bench/task/text_generation.py b/tools/llm_bench/task/text_generation.py
index 485de94996..4822b228ca 100644
--- a/tools/llm_bench/task/text_generation.py
+++ b/tools/llm_bench/task/text_generation.py
@@ -301,7 +301,7 @@ def token_printer():
             - np.array(perf_metrics.raw_metrics.m_new_token_times[:-1])
         ).tolist()
 
-        tm_list = np.array([first_token_time] + second_tokens_durations) / 1000
+        tm_list = (np.array([first_token_time] + second_tokens_durations) / 1000).tolist()
         inference_durations = (np.array(perf_metrics.raw_metrics.token_infer_durations) / 1000 / 1000).tolist()
         log.debug('latency of all tokens:')
         [log.debug('[{}]{:.4f}'.format(idx, tm)) for idx, tm in enumerate(tm_list)]
@@ -323,8 +323,8 @@ def token_printer():
             metrics_print.print_metrics(
                 num,
                 iter_data,
-                tm_list.tolist(),
-                inference_durations.tolist(),
+                tm_list,
+                inference_durations,
                 warm_up=(num == 0),
                 max_rss_mem=max_rss_mem_consumption,
                 max_shared_mem=max_shared_mem_consumption,
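For context, a minimal sketch of the failure mode this patch removes (values illustrative): once `tm_list` is built with `.tolist()`, calling `.tolist()` on it again at the print site raises `AttributeError`.

```python
import numpy as np

# tm_list is already a plain Python list after the conversion above.
tm_list = (np.array([12.5, 3.1, 3.0]) / 1000).tolist()
assert isinstance(tm_list, list)
# tm_list.tolist()  # AttributeError: 'list' object has no attribute 'tolist'
```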
From eac4f376e9fc509a68fc3c1f6a3637d9f19b7526 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Mon, 23 Dec 2024 14:11:37 +0400
Subject: [PATCH 2/6] [Image generation] Added i64 support for text encoders
 inputs (#1424)

Can be required for newer optimum versions.
---
 .../image_generation/models/clip_text_model.cpp  | 16 +++++++++++-----
 .../models/clip_text_model_with_projection.cpp   | 16 +++++++++++-----
 .../image_generation/models/t5_encoder_model.cpp | 10 +++++++---
 .../models/unet_inference_dynamic.hpp            | 16 ++++------------
 .../models/unet_inference_static_bs1.hpp         |  3 +--
 5 files changed, 34 insertions(+), 27 deletions(-)

diff --git a/src/cpp/src/image_generation/models/clip_text_model.cpp b/src/cpp/src/image_generation/models/clip_text_model.cpp
index efbc840d4f..72fdc63082 100644
--- a/src/cpp/src/image_generation/models/clip_text_model.cpp
+++ b/src/cpp/src/image_generation/models/clip_text_model.cpp
@@ -118,13 +118,20 @@ ov::Tensor CLIPTextModel::infer(const std::string& pos_prompt, const std::string
     const size_t text_embedding_batch_size = do_classifier_free_guidance ? 2 : 1;
 
     auto perform_tokenization = [&](const std::string& prompt, ov::Tensor input_ids) {
-        std::fill_n(input_ids.data<int32_t>(), input_ids.get_size(), pad_token_id);
-
         ov::Tensor input_ids_token = m_clip_tokenizer.encode(prompt).input_ids;
-        std::copy_n(input_ids_token.data<std::int64_t>(), input_ids_token.get_size(), input_ids.data<int32_t>());
+
+        if (input_ids.get_element_type() == ov::element::i32) {
+            std::fill_n(input_ids.data<int32_t>(), input_ids.get_size(), pad_token_id);
+            std::copy_n(input_ids_token.data<std::int64_t>(), input_ids_token.get_size(), input_ids.data<int32_t>());
+        } else {
+            std::fill_n(input_ids.data<int64_t>(), input_ids.get_size(), pad_token_id);
+            std::copy_n(input_ids_token.data<std::int64_t>(), input_ids_token.get_size(), input_ids.data<int64_t>());
+        }
     };
 
-    ov::Tensor input_ids(ov::element::i32, {text_embedding_batch_size, m_config.max_position_embeddings});
+    ov::Tensor input_ids = m_request.get_input_tensor();
+    input_ids.set_shape({text_embedding_batch_size, m_config.max_position_embeddings});
+
     size_t current_batch_idx = 0;
 
     if (do_classifier_free_guidance) {
@@ -141,7 +148,6 @@ ov::Tensor CLIPTextModel::infer(const std::string& pos_prompt, const std::string
                        {current_batch_idx + 1, m_config.max_position_embeddings}));
 
     // text embeddings
-    m_request.set_tensor("input_ids", input_ids);
     m_request.infer();
 
     return m_request.get_output_tensor(0);
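The element-type dispatch above, restated as a NumPy sketch for clarity (function and values are illustrative, not the library API): pad the preallocated request buffer first, then copy the tokenized prompt into it in whichever integer width the compiled model expects.

```python
import numpy as np

def fill_input_ids(input_ids: np.ndarray, tokens: np.ndarray, pad_token_id: int) -> None:
    # Mirror of perform_tokenization: pad everything, then overwrite the prefix.
    assert input_ids.dtype in (np.int32, np.int64)
    input_ids.fill(pad_token_id)
    input_ids.reshape(-1)[: tokens.size] = tokens.astype(input_ids.dtype)

buf = np.empty((1, 77), dtype=np.int32)  # i32 buffer, as older CLIP exports use
fill_input_ids(buf, np.array([49406, 320, 49407], dtype=np.int64), pad_token_id=49407)
```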
diff --git a/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp b/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp
index 982800a701..1160c30b6a 100644
--- a/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp
+++ b/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp
@@ -109,13 +109,20 @@ ov::Tensor CLIPTextModelWithProjection::infer(const std::string& pos_prompt, con
     const size_t text_embedding_batch_size = do_classifier_free_guidance ? 2 : 1;
 
     auto perform_tokenization = [&](const std::string& prompt, ov::Tensor input_ids) {
-        std::fill_n(input_ids.data<int64_t>(), input_ids.get_size(), pad_token_id);
-
         ov::Tensor input_ids_token = m_clip_tokenizer.encode(prompt).input_ids;
-        std::copy_n(input_ids_token.data<std::int64_t>(), input_ids_token.get_size(), input_ids.data<int64_t>());
+
+        if (input_ids.get_element_type() == ov::element::i32) {
+            std::fill_n(input_ids.data<int32_t>(), input_ids.get_size(), pad_token_id);
+            std::copy_n(input_ids_token.data<std::int64_t>(), input_ids_token.get_size(), input_ids.data<int32_t>());
+        } else {
+            std::fill_n(input_ids.data<int64_t>(), input_ids.get_size(), pad_token_id);
+            std::copy_n(input_ids_token.data<std::int64_t>(), input_ids_token.get_size(), input_ids.data<int64_t>());
+        }
     };
 
-    ov::Tensor input_ids(ov::element::i64, {text_embedding_batch_size, m_config.max_position_embeddings});
+    ov::Tensor input_ids = m_request.get_input_tensor();
+    input_ids.set_shape({text_embedding_batch_size, m_config.max_position_embeddings});
+
     size_t current_batch_idx = 0;
 
     if (do_classifier_free_guidance) {
@@ -132,7 +139,6 @@ ov::Tensor CLIPTextModelWithProjection::infer(const std::string& pos_prompt, con
                        {current_batch_idx + 1, m_config.max_position_embeddings}));
 
     // text embeddings
-    m_request.set_tensor("input_ids", input_ids);
     m_request.infer();
 
     return m_request.get_output_tensor(0);
diff --git a/src/cpp/src/image_generation/models/t5_encoder_model.cpp b/src/cpp/src/image_generation/models/t5_encoder_model.cpp
index 21df456d46..a83697b2e6 100644
--- a/src/cpp/src/image_generation/models/t5_encoder_model.cpp
+++ b/src/cpp/src/image_generation/models/t5_encoder_model.cpp
@@ -80,8 +80,13 @@ ov::Tensor T5EncoderModel::infer(const std::string& pos_prompt, const std::strin
         ov::Tensor input_ids_token = m_tokenizer.encode(prompt).input_ids;
         size_t min_length = std::min(input_ids.get_size(), input_ids_token.get_size());
 
-        std::fill_n(input_ids.data<int32_t>(), input_ids.get_size(), pad_token_id);
-        std::copy_n(input_ids_token.data<std::int64_t>(), min_length, input_ids.data<int32_t>());
+        if (input_ids.get_element_type() == ov::element::i32) {
+            std::fill_n(input_ids.data<int32_t>(), input_ids.get_size(), pad_token_id);
+            std::copy_n(input_ids_token.data<std::int64_t>(), min_length, input_ids.data<int32_t>());
+        } else {
+            std::fill_n(input_ids.data<int64_t>(), input_ids.get_size(), pad_token_id);
+            std::copy_n(input_ids_token.data<std::int64_t>(), min_length, input_ids.data<int64_t>());
+        }
     };
 
     ov::Tensor input_ids = m_request.get_input_tensor();
@@ -114,7 +119,6 @@ ov::Tensor T5EncoderModel::infer(const std::string& pos_prompt, const std::strin
                        {current_batch_idx + 1, input_ids.get_shape()[1]}));
 
     // text embeddings
-    m_request.set_tensor("input_ids", input_ids);
     m_request.infer();
 
     return m_request.get_output_tensor(0);
diff --git a/src/cpp/src/image_generation/models/unet_inference_dynamic.hpp b/src/cpp/src/image_generation/models/unet_inference_dynamic.hpp
index 6dc285f76d..914fbcf50b 100644
--- a/src/cpp/src/image_generation/models/unet_inference_dynamic.hpp
+++ b/src/cpp/src/image_generation/models/unet_inference_dynamic.hpp
@@ -12,11 +12,8 @@ namespace genai {
 
 class UNet2DConditionModel::UNetInferenceDynamic : public UNet2DConditionModel::UNetInference {
-
 public:
-
-    virtual void compile(std::shared_ptr<ov::Model> model, const std::string& device, const ov::AnyMap& properties) override
-    {
+    virtual void compile(std::shared_ptr<ov::Model> model, const std::string& device, const ov::AnyMap& properties) override {
         ov::Core core = utils::singleton_core();
 
         ov::CompiledModel compiled_model = core.compile_model(model, device, properties);
@@ -24,20 +21,17 @@ class UNet2DConditionModel::UNetInferenceDynamic : public UNet2DConditionModel::
         m_request = compiled_model.create_infer_request();
     }
 
-    virtual void set_hidden_states(const std::string& tensor_name, ov::Tensor encoder_hidden_states) override
-    {
+    virtual void set_hidden_states(const std::string& tensor_name, ov::Tensor encoder_hidden_states) override {
         OPENVINO_ASSERT(m_request, "UNet model must be compiled first");
         m_request.set_tensor(tensor_name, encoder_hidden_states);
     }
 
-    virtual void set_adapters(AdapterController &adapter_controller, const AdapterConfig& adapters) override
-    {
+    virtual void set_adapters(AdapterController &adapter_controller, const AdapterConfig& adapters) override {
         OPENVINO_ASSERT(m_request, "UNet model must be compiled first");
         adapter_controller.apply(m_request, adapters);
     }
 
-    virtual ov::Tensor infer(ov::Tensor sample, ov::Tensor timestep) override
-    {
+    virtual ov::Tensor infer(ov::Tensor sample, ov::Tensor timestep) override {
         OPENVINO_ASSERT(m_request, "UNet model must be compiled first. Cannot infer non-compiled model");
 
         m_request.set_tensor("sample", sample);
@@ -49,10 +43,8 @@ class UNet2DConditionModel::UNetInferenceDynamic : public UNet2DConditionModel::
     }
 
 private:
-
     ov::InferRequest m_request;
 };
-
 }  // namespace genai
 }  // namespace ov
\ No newline at end of file
diff --git a/src/cpp/src/image_generation/models/unet_inference_static_bs1.hpp b/src/cpp/src/image_generation/models/unet_inference_static_bs1.hpp
index 7aa6f6301c..f63a8ea237 100644
--- a/src/cpp/src/image_generation/models/unet_inference_static_bs1.hpp
+++ b/src/cpp/src/image_generation/models/unet_inference_static_bs1.hpp
@@ -42,8 +42,7 @@ class UNet2DConditionModel::UNetInferenceStaticBS1 : public UNet2DConditionModel
         ov::CompiledModel compiled_model = core.compile_model(model, device, properties);
         ov::genai::utils::print_compiled_model_properties(compiled_model, "UNet 2D Condition batch-1 model");
 
-        for (int i = 0; i < m_native_batch_size; i++)
-        {
+        for (int i = 0; i < m_native_batch_size; i++) {
             m_requests[i] = compiled_model.create_infer_request();
         }
     }

From 1179cb611fa65910180e260cf31b98742113a896 Mon Sep 17 00:00:00 2001
From: Aamir Nazir
Date: Mon, 23 Dec 2024 15:21:46 +0400
Subject: [PATCH 3/6] [LLM Bench] Allow Image Generation Models to Run in BF16
 (#1368)

This change allows setting image generation models to BF16 using the config passed while running the benchmark.

Co-authored-by: Ekaterina Aidova
Co-authored-by: guozhong wang
---
 tools/llm_bench/llm_bench_utils/pt_utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/llm_bench/llm_bench_utils/pt_utils.py b/tools/llm_bench/llm_bench_utils/pt_utils.py
index 4c41efad01..dc2c6d05f5 100644
--- a/tools/llm_bench/llm_bench_utils/pt_utils.py
+++ b/tools/llm_bench/llm_bench_utils/pt_utils.py
@@ -131,6 +131,7 @@ def create_image_gen_model(model_path, device, **kwargs):
         model_class = PT_MODEL_CLASSES_MAPPING[model_type]
         start = time.perf_counter()
         pipe = model_class.from_pretrained(model_path)
+        pipe = set_bf16(pipe, device, **kwargs)
         end = time.perf_counter()
         from_pretrain_time = end - start
     else:

From 5d68567484594c915d6047cd9a31a95eab40962d Mon Sep 17 00:00:00 2001
From: Aamir Nazir
Date: Mon, 23 Dec 2024 15:22:05 +0400
Subject: [PATCH 4/6] [LLM Bench] Defining Framework in Torch Compile
 Benchmarking (#1354)

The framework needs to be specified as PyTorch for the models to be compiled with torch compile; otherwise the benchmark takes the OV framework route and never hits the torch compile code.

Note also that the following [line](https://github.com/openvinotoolkit/openvino.genai/blob/b26fc8b7a484e0f66accba89ea9f972c6d23fda7/tools/llm_bench/llm_bench_utils/pt_utils.py#L157) applies torch compile to the entire image generation pipeline, which causes issues, since the intent is to compile the individual models within the pipeline.
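A hedged sketch of the intended behavior (helper name and attributes are illustrative, not the benchmark's actual code): force the PyTorch path whenever a torch.compile backend is requested, and compile the submodules rather than the pipeline object.

```python
import torch

def maybe_compile(pipe, backend=None):
    """Compile the pipeline's heavy submodules; compiling `pipe` itself
    wraps the whole callable instead of the models inside it."""
    if backend is None:
        return pipe
    if hasattr(pipe, "unet"):
        pipe.unet = torch.compile(pipe.unet, backend=backend)
    if hasattr(pipe, "text_encoder"):
        pipe.text_encoder = torch.compile(pipe.text_encoder, backend=backend)
    return pipe
```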
---------

Co-authored-by: Ekaterina Aidova
Co-authored-by: Ilya Lavrenov
---
 tools/llm_bench/llm_bench_utils/model_utils.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/llm_bench/llm_bench_utils/model_utils.py b/tools/llm_bench/llm_bench_utils/model_utils.py
index f3e7d21777..78f72147c7 100644
--- a/tools/llm_bench/llm_bench_utils/model_utils.py
+++ b/tools/llm_bench/llm_bench_utils/model_utils.py
@@ -137,6 +137,9 @@ def analyze_args(args):
     model_framework = args.framework
     model_path = Path(args.model)
+    if model_args["torch_compile_backend"]:
+        log.info("Setting framework to PyTorch, since torch_compile_backend is provided.")
+        model_framework = 'pt'
     if not model_path.exists():
         raise RuntimeError(f'==Failure FOUND==: Incorrect model path:{model_path}')
     if model_framework in ('ov', 'pt'):

From c09207cd497e250e8b3e7ad442cec3bc4181827e Mon Sep 17 00:00:00 2001
From: Pavel Esir
Date: Mon, 23 Dec 2024 12:33:47 +0100
Subject: [PATCH 5/6] [test] Ensure that the first token generation is not
 included into TPOT (#1414)

CVS-155098
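The change and the new assertions hinge on TPOT being computed over per-token durations that exclude the first (prefill) token. A minimal NumPy sketch of that arithmetic (values illustrative):

```python
import numpy as np

# Token timestamps in ms relative to generation start; the first token covers prefill.
token_times = np.array([850.0, 880.0, 910.0, 941.0])
ttft = token_times[0]
durations = np.diff(token_times)       # per-token durations, prefill excluded
tpot_mean, tpot_std = durations.mean(), durations.std()
assert ttft > 2 * durations.max()      # prefill dominates for long prompts
```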
---
 src/cpp/src/perf_metrics.cpp            |  2 +-
 tests/python_tests/conftest.py          |  3 ++-
 tests/python_tests/test_generate_api.py | 10 +++++++++-
 3 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/src/cpp/src/perf_metrics.cpp b/src/cpp/src/perf_metrics.cpp
index 3bd6252c78..3725dc0cfc 100644
--- a/src/cpp/src/perf_metrics.cpp
+++ b/src/cpp/src/perf_metrics.cpp
@@ -101,7 +101,7 @@ void PerfMetrics::evaluate_statistics(std::optional<TimePoint> start_time) {
         auto ttft = tok_times[0] - start_time_val;
         raw_metrics.m_times_to_first_token = std::vector<MicroSeconds>();
-        raw_metrics.m_times_to_first_token.emplace_back(ttft / batch_sizes[0]);
+        raw_metrics.m_times_to_first_token.emplace_back(ttft);
         num_generated_tokens = batch_sizes[0];
 
         // The very first infer request (prefill stage) is slower than subsequent ones since we process a sequence of tokens.
diff --git a/tests/python_tests/conftest.py b/tests/python_tests/conftest.py
index f98f47ecf3..e159045601 100644
--- a/tests/python_tests/conftest.py
+++ b/tests/python_tests/conftest.py
@@ -3,7 +3,8 @@
 
 def pytest_make_parametrize_id(config, val, argname):
     if argname in ['prompt', 'prompts', 'batched_prompts']:
-        return f'{val}'
+        # Print only first 1000 characters of long prompts.
+        return f'{val[:1000]}'
     elif argname == 'model_descr':
         return f"{val[0]}"
     elif argname == 'chat_config':
diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py
index d15747be63..9bb9eff49c 100644
--- a/tests/python_tests/test_generate_api.py
+++ b/tests/python_tests/test_generate_api.py
@@ -798,6 +798,12 @@ def test_perf_metrics(model_descr, generation_config, prompt):
     assert (mean_ttft, std_ttft) == (perf_metrics.get_ttft().mean, perf_metrics.get_ttft().std)
     assert mean_ttft > 0 and mean_ttft < 1000.0
 
+    raw_metrics = perf_metrics.raw_metrics
+    durations = np.array(raw_metrics.m_durations) / 1000
+    # Check that prefill is not included in durations for TPOT calculation.
+    # For a very long prompt, prefill is slow and TTFT is much larger than any other token generation duration.
+    assert np.all(mean_ttft > durations * 2)
+
     mean_tpot, std_tpot = perf_metrics.get_tpot()
     assert (mean_tpot, std_tpot) == (perf_metrics.get_tpot().mean, perf_metrics.get_tpot().std)
     assert mean_tpot > 0 and mean_ttft < 1000.0
@@ -822,7 +828,9 @@ def test_perf_metrics(model_descr, generation_config, prompt):
     assert std_detok_duration == 0
 
     # assert that calculating statistics manually from the raw counters gives the same results as PerfMetrics
-    raw_metrics = perf_metrics.raw_metrics
+    assert np.allclose(mean_tpot, np.mean(durations))
+    assert np.allclose(std_tpot, np.std(durations))
+
     raw_dur = np.array(raw_metrics.generate_durations) / 1000
     assert np.allclose(mean_gen_duration, np.mean(raw_dur))
     assert np.allclose(std_gen_duration, np.std(raw_dur))

From 3496d453ee2a2dd1a0340247076ab64787094446 Mon Sep 17 00:00:00 2001
From: Ekaterina Shiryaeva
Date: Mon, 23 Dec 2024 12:48:23 +0100
Subject: [PATCH 6/6] Add perf metrics support for WhisperStaticPipeline
 (#1337)
---
 src/cpp/src/whisper/whisper.cpp         | 37 ++-----------
 src/cpp/src/whisper/whisper_utils.cpp   | 46 ++++++++++++++++
 src/cpp/src/whisper/whisper_utils.hpp   | 22 ++++++++
 src/cpp/src/whisper_pipeline_static.cpp | 70 +++++++++++++++++++++----
 4 files changed, 131 insertions(+), 44 deletions(-)
 create mode 100644 src/cpp/src/whisper/whisper_utils.cpp
 create mode 100644 src/cpp/src/whisper/whisper_utils.hpp

diff --git a/src/cpp/src/whisper/whisper.cpp b/src/cpp/src/whisper/whisper.cpp
index 9d6aa698ce..04993f288c 100644
--- a/src/cpp/src/whisper/whisper.cpp
+++ b/src/cpp/src/whisper/whisper.cpp
@@ -18,6 +18,7 @@
 #include "whisper_config.hpp"
 #include "whisper_feature_extractor.hpp"
 #include "whisper_models.hpp"
+#include "whisper_utils.hpp"
 
 using ov::genai::MicroSeconds;
 
@@ -79,17 +80,6 @@ void set_past_key_value(ov::InferRequest& source, ov::InferRequest& dest) {
     }
 }
 
-void infer_with_perf_metrics(ov::InferRequest& request, ov::genai::RawPerfMetrics& raw_metrics) {
-    const auto infer_start = std::chrono::steady_clock::now();
-    request.infer();
-    const auto infer_end = std::chrono::steady_clock::now();
-    const auto infer_ms = ov::genai::PerfMetrics::get_microsec(infer_end - infer_start);
-    raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms);
-    raw_metrics.m_token_infer_durations.emplace_back(infer_ms);
-    raw_metrics.m_new_token_times.emplace_back(infer_end);
-    raw_metrics.m_batch_sizes.emplace_back(1);
-}
-
 int64_t decode(ov::Tensor& encoder_hidden_state,
                ov::InferRequest& decoder,
                std::vector<int64_t>& input_ids,
@@ -102,7 +92,7 @@ int64_t decode(ov::Tensor& encoder_hidden_state,
     ov::Tensor input_ids_tensor(ov::element::i64, {1, input_ids.size()}, input_ids.data());
     decoder.set_tensor("input_ids", input_ids_tensor);
 
-    infer_with_perf_metrics(decoder, raw_metrics);
+    ov::genai::utils::infer_with_perf_metrics(decoder, raw_metrics);
 
     auto output_tensor = decoder.get_tensor("logits");
 
@@ -138,7 +128,7 @@ int64_t decode_with_past(ov::Tensor& encoder_hidden_state,
     cache_position_tensor.set_shape({1});
     cache_position_tensor.data<int64_t>()[0] = cache_position;
 
-    infer_with_perf_metrics(decoder_with_past, raw_metrics);
+    ov::genai::utils::infer_with_perf_metrics(decoder_with_past, raw_metrics);
 
     auto output_tensor = decoder_with_past.get_tensor("logits");
 
@@ -265,25 +255,6 @@ std::pair<bool, std::vector<int64_t>> full_decode(ov::Tensor& encoder_hidden_sta
     return {false, output_tokens};
 }
 
-template <typename T>
-void filter_by_ranges(std::vector<T>& value, size_t offset, std::vector<std::pair<size_t, size_t>>& ranges) {
-    OPENVINO_ASSERT(ranges.empty() || value.size() >= (offset + ranges.back().second));
-    std::vector<T> result{value.begin(), value.begin() + offset};
-    for (auto [start, end] : ranges) {
-        result.insert(result.end(), value.begin() + offset + start, value.begin() + offset + end);
-    }
-
-    value = result;
-}
-
-void filter_non_segment_metrics(ov::genai::RawPerfMetrics& raw_metrics,
-                                size_t offset,
-                                std::vector<std::pair<size_t, size_t>>& ranges) {
-    filter_by_ranges(raw_metrics.m_token_infer_durations, offset, ranges);
-    filter_by_ranges(raw_metrics.m_new_token_times, offset, ranges);
-    filter_by_ranges(raw_metrics.m_batch_sizes, offset, ranges);
-}
-
 }  // namespace
 
 namespace ov {
@@ -362,7 +333,7 @@ WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig&
                                                               feature_extractor.nb_max_frames,
                                                               time_precision);
 
-            filter_non_segment_metrics(raw_metrics, output_tokens.size(), extracted_segments.segment_ranges);
+            ov::genai::utils::filter_non_segment_metrics(raw_metrics, output_tokens.size(), extracted_segments.segment_ranges);
 
             segments.insert(segments.end(), extracted_segments.segments.begin(), extracted_segments.segments.end());
 
diff --git a/src/cpp/src/whisper/whisper_utils.cpp b/src/cpp/src/whisper/whisper_utils.cpp
new file mode 100644
index 0000000000..6e56a1439d
--- /dev/null
+++ b/src/cpp/src/whisper/whisper_utils.cpp
@@ -0,0 +1,46 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "whisper_utils.hpp"
+
+namespace {
+
+template <typename T>
+void filter_by_ranges(std::vector<T>& value, size_t offset, std::vector<std::pair<size_t, size_t>>& ranges) {
+    OPENVINO_ASSERT(ranges.empty() || value.size() >= (offset + ranges.back().second));
+    std::vector<T> result{value.begin(), value.begin() + offset};
+    for (auto [start, end] : ranges) {
+        result.insert(result.end(), value.begin() + offset + start, value.begin() + offset + end);
+    }
+
+    value = result;
+}
+
+}  // namespace
+
+namespace ov {
+namespace genai {
+namespace utils {
+
+void infer_with_perf_metrics(ov::InferRequest& request, ov::genai::RawPerfMetrics& raw_metrics) {
+    const auto infer_start = std::chrono::steady_clock::now();
+    request.infer();
+    const auto infer_end = std::chrono::steady_clock::now();
+    const auto infer_ms = ov::genai::PerfMetrics::get_microsec(infer_end - infer_start);
+    raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms);
+    raw_metrics.m_token_infer_durations.emplace_back(infer_ms);
+    raw_metrics.m_new_token_times.emplace_back(infer_end);
+    raw_metrics.m_batch_sizes.emplace_back(1);
+}
+
+void filter_non_segment_metrics(ov::genai::RawPerfMetrics& raw_metrics,
+                                size_t offset,
+                                std::vector<std::pair<size_t, size_t>>& ranges) {
+    filter_by_ranges(raw_metrics.m_token_infer_durations, offset, ranges);
+    filter_by_ranges(raw_metrics.m_new_token_times, offset, ranges);
+    filter_by_ranges(raw_metrics.m_batch_sizes, offset, ranges);
+}
+
+}  // namespace utils
+}  // namespace genai
+}  // namespace ov
diff --git a/src/cpp/src/whisper/whisper_utils.hpp b/src/cpp/src/whisper/whisper_utils.hpp
new file mode 100644
index 0000000000..234feed6a8
--- /dev/null
+++ b/src/cpp/src/whisper/whisper_utils.hpp
@@ -0,0 +1,22 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <openvino/runtime/infer_request.hpp>
+
+#include "openvino/genai/perf_metrics.hpp"
+
+namespace ov {
+namespace genai {
+namespace utils {
+
+void infer_with_perf_metrics(ov::InferRequest& request, ov::genai::RawPerfMetrics& raw_metrics);
+
+void filter_non_segment_metrics(ov::genai::RawPerfMetrics& raw_metrics,
+                                size_t offset,
+                                std::vector<std::pair<size_t, size_t>>& ranges);
+
+}  // namespace utils
+}  // namespace genai
+}  // namespace ov
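For clarity, the segment-filtering logic above restated as a small Python sketch (illustrative only, not part of the patch): keep the first `offset` entries untouched, then keep only the slices of the tail that fall inside the given segment ranges.

```python
def filter_by_ranges(values: list, offset: int, ranges: list[tuple[int, int]]) -> list:
    assert not ranges or len(values) >= offset + ranges[-1][1]
    result = values[:offset]
    for start, end in ranges:
        result.extend(values[offset + start:offset + end])
    return result

# Entries 0-1 belong to earlier chunks; of the new entries, keep [0, 2) and [4, 5).
print(filter_by_ranges(list("abcdefg"), 2, [(0, 2), (4, 5)]))  # ['a', 'b', 'c', 'd', 'g']
```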
diff --git a/src/cpp/src/whisper_pipeline_static.cpp b/src/cpp/src/whisper_pipeline_static.cpp
index dc26789846..cc61eb0659 100644
--- a/src/cpp/src/whisper_pipeline_static.cpp
+++ b/src/cpp/src/whisper_pipeline_static.cpp
@@ -14,6 +14,7 @@
 #include "whisper/timestamps.hpp"
 #include "whisper/whisper.hpp"
 #include "whisper/whisper_config.hpp"
+#include "whisper/whisper_utils.hpp"
 
 #include "openvino/core/layout.hpp"
 #include "openvino/core/preprocess/pre_post_process.hpp"
@@ -26,6 +27,8 @@
 #include "openvino/op/convert.hpp"
 #include "openvino/op/parameter.hpp"
 
+using ov::genai::MicroSeconds;
+
 namespace {
 
 template <typename T>
@@ -44,7 +47,8 @@ void copy_to_tensor(const std::vector<T>& src_vec, ov::Tensor dst_tensor) {
 ov::Tensor encode(ov::InferRequest& request,
                   std::vector<float>& mel_data,
                   const size_t feature_size,
-                  const size_t nb_max_frames) {
+                  const size_t nb_max_frames,
+                  ov::genai::RawPerfMetrics& raw_metrics) {
     OPENVINO_ASSERT(mel_data.size() == feature_size * nb_max_frames,
                     "Mel spectrogram required size: ",
                     feature_size,
                     " * ",
                     nb_max_frames,
                     ". Actual size: ",
                     mel_data.size(),
                     ".");
     copy_to_tensor(mel_data, request.get_tensor("input_features"));
+
+    const auto infer_start = std::chrono::steady_clock::now();
     request.infer();
+    const auto infer_ms = ov::genai::PerfMetrics::get_microsec(std::chrono::steady_clock::now() - infer_start);
+    raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms);
+
     return request.get_tensor("last_hidden_state");
 }
 
@@ -140,13 +149,14 @@ int64_t decode(ov::Tensor& encoder_hidden_state,
                ov::InferRequest& decoder,
                const std::vector<int32_t>& init_ids,
                const ov::genai::WhisperGenerationConfig& config,
+               ov::genai::RawPerfMetrics& raw_metrics,
                const bool apply_logit_processors = true,
                const bool return_timestamps = false) {
     // NB: Fill decoder inputs
     encoder_hidden_state.copy_to(decoder.get_tensor("encoder_hidden_states"));
 
     set_decoder_input_ids_attention_mask(decoder, init_ids, config.pad_token_id);
 
-    decoder.infer();
+    ov::genai::utils::infer_with_perf_metrics(decoder, raw_metrics);
 
     auto output_tensor = decoder.get_tensor("logits");
 
@@ -167,6 +177,7 @@ int64_t decode_with_past(ov::InferRequest& decoder_with_past,
                          const int64_t input_id,
                          const int64_t position_id,
                          const ov::genai::WhisperGenerationConfig& config,
+                         ov::genai::RawPerfMetrics& raw_metrics,
                          const bool return_timestamps,
                          const std::vector<int64_t>& generated_tokens) {
     // FIXME: Avoid this cast to i32. Why it's not i64 precision in model?
@@ -175,7 +186,7 @@ int64_t decode_with_past(ov::InferRequest& decoder_with_past,
     // FIXME: Is "attention_mask" supposed to be f16?
     decoder_with_past.get_tensor("attention_mask").data<ov::float16>()[position_id - 1] = 0u;
 
-    decoder_with_past.infer();
+    ov::genai::utils::infer_with_perf_metrics(decoder_with_past, raw_metrics);
 
     auto output_tensor = decoder_with_past.get_tensor("logits");
 
     ov::genai::do_suppress_tokens(output_tensor, 0, config.suppress_tokens);
@@ -217,13 +228,17 @@ void prepare_decoder_with_past(ov::InferRequest& decoder_with_past, ov::InferReq
 
 int64_t detect_language(ov::Tensor& encoder_hidden_state,
                         ov::InferRequest decoder,
-                        const ov::genai::WhisperGenerationConfig& config) {
+                        const ov::genai::WhisperGenerationConfig& config,
+                        ov::genai::RawPerfMetrics& raw_metrics) {
     decoder.set_tensor("encoder_hidden_states", ov::Tensor{encoder_hidden_state});
 
     std::vector<int32_t> init_ids{static_cast<int32_t>(config.decoder_start_token_id)};
     set_decoder_input_ids_attention_mask(decoder, init_ids, config.pad_token_id);
 
+    const auto infer_start = std::chrono::steady_clock::now();
     decoder.infer();
+    const auto infer_ms = ov::genai::PerfMetrics::get_microsec(std::chrono::steady_clock::now() - infer_start);
+    raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms);
 
     auto output_tensor = decoder.get_tensor("logits");
 
@@ -246,7 +261,8 @@ int64_t detect_language(ov::Tensor& encoder_hidden_state,
 std::vector<int32_t> prepare_init_ids(ov::Tensor& encoder_hidden_state,
                                       ov::InferRequest& decoder,
                                       const ov::genai::WhisperGenerationConfig& config,
-                                      const bool return_timestamps) {
+                                      const bool return_timestamps,
+                                      ov::genai::RawPerfMetrics& raw_metrics) {
     if (!config.is_multilingual) {
         if (return_timestamps) {
             return std::vector<int32_t>{static_cast<int32_t>(config.decoder_start_token_id)};
@@ -263,7 +279,7 @@ std::vector<int32_t> prepare_init_ids(ov::Tensor& encoder_hidden_state,
             language_token_id = static_cast<int32_t>(config.lang_to_id.at(language));
         }
     } else {
-        language_token_id = detect_language(encoder_hidden_state, decoder, config);
+        language_token_id = detect_language(encoder_hidden_state, decoder, config, raw_metrics);
     }
 
     int32_t task_token_id = static_cast<int32_t>(config.transcribe_token_id);
@@ -289,8 +305,9 @@ std::pair<bool, std::vector<int64_t>> full_decode(ov::Tensor& encoder_hidden_sta
                                                   std::vector<int32_t> init_ids,
                                                   const size_t max_new_tokens,
                                                   const bool return_timestamps,
+                                                  ov::genai::RawPerfMetrics& raw_metrics,
                                                   const std::shared_ptr<ChunkStreamerBase> streamer) {
-    int64_t output_token = decode(encoder_hidden_state, models.decoder, init_ids, config, true, return_timestamps);
+    int64_t output_token = decode(encoder_hidden_state, models.decoder, init_ids, config, raw_metrics, true, return_timestamps);
 
     std::vector<int64_t> output_tokens{output_token};
 
     if (!return_timestamps && streamer && streamer->put(output_token)) {
@@ -308,6 +325,7 @@ std::pair<bool, std::vector<int64_t>> full_decode(ov::Tensor& encoder_hidden_sta
                                              output_tokens.back(),
                                              i + init_ids.size(),
                                              config,
+                                             raw_metrics,
                                              return_timestamps,
                                              output_tokens);
         update_past_key_value(models.decoder_with_past, models.decoder_with_past, i + init_ids.size());
@@ -576,6 +594,7 @@ WhisperDecodedResults WhisperPipeline::StaticWhisperPipeline::generate(
     const RawSpeechInput& raw_speech_input,
     OptionalWhisperGenerationConfig generation_config,
     ChunkStreamerVariant streamer) {
+    auto start_time = std::chrono::steady_clock::now();
     WhisperGenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config;
     config.validate();
@@ -591,14 +610,25 @@ WhisperDecodedResults WhisperPipeline::StaticWhisperPipeline::generate(
         streamer_ptr = std::make_shared<ChunkTextCallbackStreamer>(m_tokenizer, *callback);
     }
 
+    size_t max_new_tokens = config.get_max_new_tokens();
+
+    WhisperPerfMetrics perf_metrics;
+    perf_metrics.num_input_tokens = 0;
+    RawPerfMetrics& raw_metrics = perf_metrics.raw_metrics;
+    raw_metrics.m_new_token_times.reserve(max_new_tokens);
+    raw_metrics.m_batch_sizes.reserve(max_new_tokens);
+    raw_metrics.m_token_infer_durations.reserve(max_new_tokens);
+    raw_metrics.m_inference_durations = {{MicroSeconds(0.0f)}};
+
+    const auto extract_start = std::chrono::steady_clock::now();
     auto input_features = m_feature_extractor.extract(raw_speech_input);
+    const auto extract_ms = ov::genai::PerfMetrics::get_microsec(std::chrono::steady_clock::now() - extract_start);
+    perf_metrics.whisper_raw_metrics.features_extraction_durations.emplace_back(extract_ms);
 
     const bool is_shortform = input_features.n_frames <= m_feature_extractor.nb_max_frames;
     // long-form audio processing requires timestamps to be enabled
     const bool return_timestamps = config.return_timestamps || !is_shortform;
 
-    size_t max_new_tokens = config.get_max_new_tokens();
-
     std::vector<int32_t> init_ids;
     std::vector<int64_t> output_tokens;
     std::vector<Segment> segments;
@@ -619,11 +649,12 @@ WhisperDecodedResults WhisperPipeline::StaticWhisperPipeline::generate(
         ov::Tensor hidden_state_tensor = encode(m_models.encoder,
                                                 input_features_chunk,
                                                 m_feature_extractor.feature_size,
-                                                m_feature_extractor.nb_max_frames);
+                                                m_feature_extractor.nb_max_frames,
+                                                raw_metrics);
 
         // prepare init_ids just once for whole input
         if (init_ids.empty()) {
-            init_ids = prepare_init_ids(hidden_state_tensor, m_models.decoder, config, return_timestamps);
+            init_ids = prepare_init_ids(hidden_state_tensor, m_models.decoder, config, return_timestamps, raw_metrics);
         }
 
         auto [cancelled, chunk_output_tokens] = full_decode(hidden_state_tensor,
                                                             config,
                                                             m_models,
                                                             init_ids,
                                                             max_new_tokens - output_tokens.size(),
                                                             return_timestamps,
+                                                            raw_metrics,
                                                             streamer_ptr);
 
         if (return_timestamps) {
             auto extracted_segments = ov::genai::extract_segments(chunk_output_tokens,
                                                                   config,
                                                                   m_feature_extractor.nb_max_frames,
                                                                   time_precision);
 
+            ov::genai::utils::filter_non_segment_metrics(raw_metrics, output_tokens.size(), extracted_segments.segment_ranges);
+
             segments.insert(segments.end(), extracted_segments.segments.begin(), extracted_segments.segments.end());
 
             output_tokens.insert(output_tokens.end(),
@@ -669,7 +703,11 @@ WhisperDecodedResults WhisperPipeline::StaticWhisperPipeline::generate(
         streamer_ptr->end();
     }
 
+    auto decode_start_time = std::chrono::steady_clock::now();
     WhisperDecodedResults result{std::vector{m_tokenizer.decode(output_tokens)}, std::vector{1.f}};
+    result.perf_metrics = perf_metrics;
+    result.perf_metrics.raw_metrics.detokenization_durations.emplace_back(
+        PerfMetrics::get_microsec(std::chrono::steady_clock::now() - decode_start_time));
 
     // if return_timestamps wasn't enabled by user
     if (!config.return_timestamps) {
@@ -681,13 +719,23 @@ WhisperDecodedResults WhisperPipeline::StaticWhisperPipeline::generate(
     chunks.reserve(segments.size());
 
     for (auto& segment : segments) {
+        decode_start_time = std::chrono::steady_clock::now();
         chunks.push_back(
             WhisperDecodedResultChunk{segment.m_start, segment.m_end, m_tokenizer.decode(segment.m_tokens)});
+        result.perf_metrics.raw_metrics.detokenization_durations.emplace_back(
+            PerfMetrics::get_microsec(std::chrono::steady_clock::now() - decode_start_time));
     }
 
     result.chunks = chunks;
 }
 
+    auto& metrics = result.perf_metrics;
+    metrics.load_time = this->m_load_time_ms;
+    auto stop_time = std::chrono::steady_clock::now();
+    metrics.raw_metrics.generate_durations.emplace_back(PerfMetrics::get_microsec(stop_time - start_time));
+    metrics.raw_metrics.tokenization_durations.emplace_back(MicroSeconds(0.0f));
+    metrics.evaluate_statistics(start_time);
+
     return result;
 }
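For reference, a minimal sketch of how the new metrics might be consumed from Python (model path and device are illustrative; the accessors follow the PerfMetrics surface used in the tests above, assuming the bindings expose them for Whisper results as well):

```python
import openvino_genai

# Hypothetical model directory; any Whisper model exported for OpenVINO would do.
pipe = openvino_genai.WhisperPipeline("whisper-tiny-ov", "NPU")  # static pipeline targets NPU

raw_speech = [0.0] * 16000  # one second of silence, 16 kHz float PCM
result = pipe.generate(raw_speech)

pm = result.perf_metrics
print(f"TTFT: {pm.get_ttft().mean:.2f} ms, TPOT: {pm.get_tpot().mean:.2f} ms")
```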