From bf462f52156e74f2c90e9222dd4f9bfe8017c244 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Mon, 30 Dec 2024 11:53:16 +0100
Subject: [PATCH 1/2] Enabled CB by default

---
 src/cpp/src/llm_pipeline.cpp | 113 ++++++++++++++++++++++++++++-------
 1 file changed, 90 insertions(+), 23 deletions(-)

diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
index 0125479f92..3d5697e4e3 100644
--- a/src/cpp/src/llm_pipeline.cpp
+++ b/src/cpp/src/llm_pipeline.cpp
@@ -5,6 +5,7 @@
 
 #include <nlohmann/json.hpp>
 
+#include "openvino/core/visibility.hpp"
 #include "openvino/genai/llm_pipeline.hpp"
 #include "openvino/genai/perf_metrics.hpp"
 
@@ -18,9 +19,9 @@ namespace genai {
 
 namespace {
 
-/* 
+/*
 * NPU reads some properties from the config file, but when LLMPipeline is initialized
-* from the model_str and weights_tensor, there are not files. 
+* from the model_str and weights_tensor, there are no files.
 * In the later case ModelDesc is stored in properties.
 * This function pops ModelDescr from the the properties and returns a pair of updated properties and ModelDescr.
 */
@@ -37,7 +38,7 @@ std::pair<ov::AnyMap, ov::genai::ModelConfigDesc> split_model_descr(const ov::An
     pop_property(main_properties, "name_or_path", model_descr.name_or_path);
     pop_property(main_properties, "type", model_descr.type);
     pop_property(main_properties, "num_key_value_heads", model_descr.num_key_value_heads);
-    
+
     return {main_properties, model_descr};
 }
 
@@ -62,7 +63,7 @@ std::pair<std::string, Any> draft_model(
     const std::string& device,
     const ov::AnyMap& properties) {
     auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties);
-    
+
     std::filesystem::path openvino_model_name = "openvino_model.xml";
     auto model = utils::singleton_core().read_model(models_path / openvino_model_name, {}, plugin_config);
     auto generation_config = utils::from_config_json_if_exists(models_path);
@@ -99,16 +100,39 @@ ov::genai::LLMPipeline::LLMPipeline(
     const std::string& device,
     const ov::AnyMap& properties) {
     auto start_time = std::chrono::steady_clock::now();
-    if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
-        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
+
+    // If CB is invoked explicitly, create the CB adapter as is and re-throw in case of internal issues
+    if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
+        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
         properties.find(ov::genai::prompt_lookup.name()) != properties.end()) {
         auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties);
         m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, tokenizer, scheduler_config, device, plugin_config);
-    } else if (device == "NPU") {
+    }
+
+    if (m_pimpl == nullptr && device == "NPU") {
         m_pimpl = std::make_unique<StaticLLMPipeline>(models_path, tokenizer, device, properties);
-    } else {
+    }
+
+    // try to call the CB adapter one more time, but with a safeguard that silences exceptions
+    if (m_pimpl == nullptr) {
+        try {
+            // we need to use CB only for x86, as for other architectures like arm64 or risc-v we can create a Paged Attention based model,
+            // but cannot run its inference later
+#ifdef OPENVINO_ARCH_X86_64
+            SchedulerConfig default_config;
+            default_config.max_num_batched_tokens = std::numeric_limits<size_t>::max(); // don't limit total batch size
+
+            m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, tokenizer, default_config, device, properties);
+#endif
+        } catch (ov::Exception&) {
+            // ignore exceptions from PA
+        }
+    }
+
+    if (m_pimpl == nullptr) {
         m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, tokenizer, device, properties);
     }
+
     m_pimpl->save_load_time(start_time);
 }
 
@@ -118,14 +142,35 @@ ov::genai::LLMPipeline::LLMPipeline(
     const ov::AnyMap& properties) {
     auto start_time = std::chrono::steady_clock::now();
 
-    if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
-        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
+    // If CB is invoked explicitly, create the CB adapter as is and re-throw in case of internal issues
+    if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
+        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
         properties.find(ov::genai::prompt_lookup.name()) != properties.end()) {
         auto [device_properties, scheduler_config] = utils::split_scheduler_config(properties);
         m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, scheduler_config, device, device_properties);
-    } else if (device == "NPU") {
+    }
+
+    if (m_pimpl == nullptr && device == "NPU") {
         m_pimpl = std::make_unique<StaticLLMPipeline>(models_path, device, properties);
-    } else {
+    }
+
+    // try to call the CB adapter one more time, but with a safeguard that silences exceptions
+    if (m_pimpl == nullptr) {
+        try {
+            // we need to use CB only for x86, as for other architectures like arm64 or risc-v we can create a Paged Attention based model,
+            // but cannot run its inference later
+#ifdef OPENVINO_ARCH_X86_64
+            SchedulerConfig default_config;
+            default_config.max_num_batched_tokens = std::numeric_limits<size_t>::max(); // don't limit total batch size
+
+            m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, default_config, device, properties);
+#endif
+        } catch (ov::Exception&) {
+            // ignore exceptions from PA
+        }
+    }
+
+    if (m_pimpl == nullptr) {
         m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, device, properties);
     }
 
@@ -141,36 +186,58 @@ ov::genai::LLMPipeline::LLMPipeline(
     const ov::genai::GenerationConfig& generation_config) {
     auto start_time = std::chrono::steady_clock::now();
 
-    if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
-        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
+    // If CB is invoked explicitly, create the CB adapter as is and re-throw in case of internal issues
+    if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
+        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
         properties.find(ov::genai::prompt_lookup.name()) != properties.end()){
         auto [device_properties, scheduler_config] = utils::split_scheduler_config(properties);
         m_pimpl = std::make_unique<ContinuousBatchingAdapter>(model_str, weights_tensor, tokenizer, scheduler_config, device, device_properties, generation_config);
-    } else if (device == "NPU") {
+    }
+
+    if (m_pimpl == nullptr && device == "NPU") {
         // TODO: CVS-158771 Currently, it's a workaround. Probably there is a better solution.
-        // NPU reads some properties from the config file, but when LLMPipeline is initialized
-        // from the model_str and weights_tensor, there is no files.
+        // NPU reads some properties from the config file, but when LLMPipeline is initialized
+        // from the model_str and weights_tensor, there are no files.
         // Therefore, we need to pass these properties manually.
         // This is necessary only for NPU, for other plugins can be ommited.
         // Example of usage:
-        // ov::AnyMap model_descr_properties = {{"name_or_path", "meta-llama/Llama-2-7b-chat-hf"},
-        //                                      {"type", "llama"},
+        // ov::AnyMap model_descr_properties = {{"name_or_path", "meta-llama/Llama-2-7b-chat-hf"},
+        //                                      {"type", "llama"},
         //                                      {"num_key_value_heads", 32}};
         // ov::genai::LLMPipeline pipe(model_str,..., model_descr_properties);
         // This will convert from AnyMap to ModelDesc.
-        auto [filtered_properties, model_descr] = split_model_descr(properties);
+        auto [device_properties, model_descr] = split_model_descr(properties);
         m_pimpl = std::make_unique<StaticLLMPipeline>(
-            utils::singleton_core().read_model(model_str, weights_tensor),
+            utils::singleton_core().read_model(model_str, weights_tensor),
             model_descr,
             tokenizer,
             device,
-            filtered_properties,
+            device_properties,
             generation_config
         );
-    } else {
+    }
+
+    // try to call the CB adapter one more time, but with a safeguard that silences exceptions
+    if (m_pimpl == nullptr) {
+        try {
+            // we need to use CB only for x86, as for other architectures like arm64 or risc-v we can create a Paged Attention based model,
+            // but cannot run its inference later
+#ifdef OPENVINO_ARCH_X86_64
+            SchedulerConfig default_config;
+            default_config.max_num_batched_tokens = std::numeric_limits<size_t>::max(); // don't limit total batch size
+
+            m_pimpl = std::make_unique<ContinuousBatchingAdapter>(model_str, weights_tensor, tokenizer,
+                default_config, device, properties, generation_config);
+#endif
+        } catch (ov::Exception&) {
+            // ignore exceptions from PA
+        }
+    }
+
+    if (m_pimpl == nullptr) {
         m_pimpl = std::make_unique<StatefulLLMPipeline>(
             utils::singleton_core().read_model(model_str, weights_tensor),
             tokenizer,

From 2f994721707248c679182605b53c5563ec4a346e Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Thu, 2 Jan 2025 08:14:30 +0100
Subject: [PATCH 2/2] Enable prefix caching

---
 src/cpp/src/llm_pipeline.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
index 3d5697e4e3..962147945c 100644
--- a/src/cpp/src/llm_pipeline.cpp
+++ b/src/cpp/src/llm_pipeline.cpp
@@ -121,6 +121,7 @@ ov::genai::LLMPipeline::LLMPipeline(
 #ifdef OPENVINO_ARCH_X86_64
             SchedulerConfig default_config;
             default_config.max_num_batched_tokens = std::numeric_limits<size_t>::max(); // don't limit total batch size
+            default_config.enable_prefix_caching = true; // for better TTFT in chat scenarios
 
             m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, tokenizer, default_config, device, properties);
 #endif
@@ -162,6 +163,7 @@ ov::genai::LLMPipeline::LLMPipeline(
 #ifdef OPENVINO_ARCH_X86_64
             SchedulerConfig default_config;
             default_config.max_num_batched_tokens = std::numeric_limits<size_t>::max(); // don't limit total batch size
+            default_config.enable_prefix_caching = true; // for better TTFT in chat scenarios
 
             m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, default_config, device, properties);
 #endif
@@ -228,6 +230,7 @@ ov::genai::LLMPipeline::LLMPipeline(
 #ifdef OPENVINO_ARCH_X86_64
             SchedulerConfig default_config;
             default_config.max_num_batched_tokens = std::numeric_limits<size_t>::max(); // don't limit total batch size
+            default_config.enable_prefix_caching = true; // for better TTFT in chat scenarios
 
             m_pimpl = std::make_unique<ContinuousBatchingAdapter>(model_str, weights_tensor, tokenizer,
                 default_config, device, properties, generation_config);
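
Usage note: with these two patches applied, constructing an LLMPipeline without any scheduler-related properties now tries the ContinuousBatchingAdapter first on x86_64 (unlimited max_num_batched_tokens, prefix caching on) and falls back to StatefulLLMPipeline only if the Paged Attention path throws. Continuous batching can still be requested explicitly via the scheduler_config property that all three constructors check first. A minimal sketch, assuming the scheduler_config property is exposed in the public headers (as the check above implies) and using a placeholder model directory:

    #include <iostream>
    #include "openvino/genai/llm_pipeline.hpp"

    int main() {
        // Explicit opt-in: the presence of the scheduler_config property selects the
        // ContinuousBatchingAdapter branch on any architecture.
        ov::genai::SchedulerConfig scheduler_config;
        scheduler_config.enable_prefix_caching = true; // the flag PATCH 2/2 enables by default

        ov::genai::LLMPipeline pipe("path/to/model_dir", // placeholder path
                                    "CPU",
                                    ov::AnyMap{{ov::genai::scheduler_config.name(), scheduler_config}});

        // Without the property above, the same constructor now picks CB by default on x86_64.
        std::cout << pipe.generate("Hello", ov::genai::max_new_tokens(32)) << std::endl;
        return 0;
    }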