From bf462f52156e74f2c90e9222dd4f9bfe8017c244 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Mon, 30 Dec 2024 11:53:16 +0100
Subject: [PATCH 1/2] Enable CB by default

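With this change, continuous batching (CB) becomes the default LLMPipeline
backend on x86_64: when no scheduler config, draft model, or prompt lookup
property is passed and the device is not NPU, the pipeline first tries to
construct a ContinuousBatchingAdapter and silently falls back to
StatefulLLMPipeline if the Paged Attention path throws. A minimal sketch of
the user-visible behavior (the model path and prompt are placeholders, not
part of this patch):

    #include <iostream>
    #include "openvino/genai/llm_pipeline.hpp"

    int main() {
        // Explicit opt-in, unchanged by this patch: passing a SchedulerConfig
        // property routes construction to ContinuousBatchingAdapter, and any
        // failure is re-thrown to the caller.
        ov::genai::SchedulerConfig scheduler_config;
        ov::genai::LLMPipeline explicit_cb("/path/to/model", "CPU",
                                           ov::genai::scheduler_config(scheduler_config));

        // After this patch: no scheduler config is given, yet on x86_64 the
        // pipeline still tries ContinuousBatchingAdapter first and falls back
        // to StatefulLLMPipeline if CB cannot be created.
        ov::genai::LLMPipeline implicit_cb("/path/to/model", "CPU");
        std::cout << implicit_cb.generate("Hello") << '\n';
        return 0;
    }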
---
 src/cpp/src/llm_pipeline.cpp | 113 ++++++++++++++++++++++++++++-------
 1 file changed, 90 insertions(+), 23 deletions(-)

diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
index 0125479f92..3d5697e4e3 100644
--- a/src/cpp/src/llm_pipeline.cpp
+++ b/src/cpp/src/llm_pipeline.cpp
@@ -5,6 +5,7 @@
 
 #include <nlohmann/json.hpp>
 
+#include "openvino/core/visibility.hpp"
 #include "openvino/genai/llm_pipeline.hpp"
 #include "openvino/genai/perf_metrics.hpp"
 
@@ -18,9 +19,9 @@ namespace genai {
 
 namespace {
 
-/* 
+/*
 * NPU reads some properties from the config file, but when LLMPipeline is initialized
-* from the model_str and weights_tensor, there are not files. 
+* from the model_str and weights_tensor, there are no files.
 * In the latter case ModelDesc is stored in the properties.
 * This function pops ModelDescr from the properties and returns a pair of updated properties and ModelDescr.
 */
@@ -37,7 +38,7 @@ std::pair<ov::AnyMap, ov::genai::ModelConfigDesc> split_model_descr(const ov::An
     pop_property(main_properties, "name_or_path", model_descr.name_or_path);
     pop_property(main_properties, "type", model_descr.type);
     pop_property(main_properties, "num_key_value_heads", model_descr.num_key_value_heads);
-    
+
     return {main_properties, model_descr};
 }
 
@@ -62,7 +63,7 @@ std::pair<std::string, Any> draft_model(
     const std::string& device,
     const ov::AnyMap& properties) {
     auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties);
-    
+
     std::filesystem::path openvino_model_name = "openvino_model.xml";
     auto model = utils::singleton_core().read_model(models_path / openvino_model_name, {}, plugin_config);
     auto generation_config = utils::from_config_json_if_exists(models_path);
@@ -99,16 +100,39 @@ ov::genai::LLMPipeline::LLMPipeline(
     const std::string& device,
     const ov::AnyMap& properties) {
     auto start_time = std::chrono::steady_clock::now();
-    if (properties.find(ov::genai::scheduler_config.name()) != properties.end() || 
-        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() || 
+
+    // If CB is requested explicitly, create the CB adapter as is and re-throw in case of internal issues
+    if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
+        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
         properties.find(ov::genai::prompt_lookup.name()) != properties.end()) {
         auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties);
         m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, tokenizer, scheduler_config, device, plugin_config);
-    } else if (device == "NPU") {
+    }
+
+    if (m_pimpl == nullptr && device == "NPU") {
         m_pimpl = std::make_unique<StaticLLMPipeline>(models_path, tokenizer, device, properties);
-    } else {
+    }
+
+    // try the CB adapter one more time, but with a safeguard to silence exceptions
+    if (m_pimpl == nullptr) {
+        try {
+            // we need to use CB only on x86_64, as on other architectures like arm64 or RISC-V we can create a Paged Attention based model
+            // but cannot run its inference later
+#ifdef OPENVINO_ARCH_X86_64
+            SchedulerConfig default_config;
+            default_config.max_num_batched_tokens = std::numeric_limits<size_t>::max(); // don't limit total batch size
+
+            m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, tokenizer, default_config, device, properties);
+#endif
+        } catch (ov::Exception&) {
+            // ignore exceptions from PA
+        }
+    }
+
+    if (m_pimpl == nullptr) {
         m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, tokenizer, device, properties);
     }
+
     m_pimpl->save_load_time(start_time);
 }
 
@@ -118,14 +142,35 @@ ov::genai::LLMPipeline::LLMPipeline(
     const ov::AnyMap& properties) {
     auto start_time = std::chrono::steady_clock::now();
 
-    if (properties.find(ov::genai::scheduler_config.name()) != properties.end() || 
-        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() || 
+    // If CB is requested explicitly, create the CB adapter as is and re-throw in case of internal issues
+    if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
+        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
         properties.find(ov::genai::prompt_lookup.name()) != properties.end()) {
         auto [device_properties, scheduler_config] = utils::split_scheduler_config(properties);
         m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, scheduler_config, device, device_properties);
-    } else if (device == "NPU") {
+    }
+
+    if (m_pimpl == nullptr && device == "NPU") {
         m_pimpl = std::make_unique<StaticLLMPipeline>(models_path, device, properties);
-    } else {
+    }
+
+    // try the CB adapter one more time, but with a safeguard to silence exceptions
+    if (m_pimpl == nullptr) {
+        try {
+            // we need to use CB only on x86_64, as on other architectures like arm64 or RISC-V we can create a Paged Attention based model
+            // but cannot run its inference later
+#ifdef OPENVINO_ARCH_X86_64
+            SchedulerConfig default_config;
+            default_config.max_num_batched_tokens = std::numeric_limits<size_t>::max(); // don't limit total batch size
+
+            m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, default_config, device, properties);
+#endif
+        } catch (ov::Exception&) {
+            // ignore exceptions from PA
+        }
+    }
+
+    if (m_pimpl == nullptr) {
         m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, device, properties);
     }
 
@@ -141,36 +186,58 @@ ov::genai::LLMPipeline::LLMPipeline(
     const ov::genai::GenerationConfig& generation_config) {
     auto start_time = std::chrono::steady_clock::now();
 
-    if (properties.find(ov::genai::scheduler_config.name()) != properties.end() || 
-        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() || 
+    // If CB is requested explicitly, create the CB adapter as is and re-throw in case of internal issues
+    if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
+        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
         properties.find(ov::genai::prompt_lookup.name()) != properties.end()){
 
         auto [device_properties, scheduler_config] = utils::split_scheduler_config(properties);
         m_pimpl = std::make_unique<ContinuousBatchingAdapter>(model_str, weights_tensor,
                                                               tokenizer, scheduler_config, device, device_properties, generation_config);
-    } else if (device == "NPU") {
+    }
+
+    if (m_pimpl == nullptr && device == "NPU") {
         // TODO: CVS-158771 Currently, it's a workaround. Probably there is a better solution.
-        // NPU reads some properties from the config file, but when LLMPipeline is initialized 
-        // from the model_str and weights_tensor, there is no files. 
+        // NPU reads some properties from the config file, but when LLMPipeline is initialized
+        // from the model_str and weights_tensor, there are no files.
         // Therefore, we need to pass these properties manually.
         // This is necessary only for NPU; for other plugins it can be omitted.
         // Example of usage:
-        // ov::AnyMap model_descr_properties = {{"name_or_path", "meta-llama/Llama-2-7b-chat-hf"}, 
-        //                                      {"type", "llama"}, 
+        // ov::AnyMap model_descr_properties = {{"name_or_path", "meta-llama/Llama-2-7b-chat-hf"},
+        //                                      {"type", "llama"},
         //                                      {"num_key_value_heads", 32}};
         // ov::genai::LLMPipeline pipe(model_str,..., model_descr_properties);
         // This will convert from AnyMap to ModelDesc.
-        auto [filtered_properties, model_descr] = split_model_descr(properties);
+        auto [device_properties, model_descr] = split_model_descr(properties);
 
         m_pimpl = std::make_unique<StaticLLMPipeline>(
-            utils::singleton_core().read_model(model_str, weights_tensor), 
+            utils::singleton_core().read_model(model_str, weights_tensor),
             model_descr,
             tokenizer,
             device,
-            filtered_properties,
+            device_properties,
             generation_config
         );
-    } else {
+    }
+
+    // try the CB adapter one more time, but with a safeguard to silence exceptions
+    if (m_pimpl == nullptr) {
+        try {
+            // we need to use CB only on x86_64, as on other architectures like arm64 or RISC-V we can create a Paged Attention based model
+            // but cannot run its inference later
+#ifdef OPENVINO_ARCH_X86_64
+            SchedulerConfig default_config;
+            default_config.max_num_batched_tokens = std::numeric_limits<size_t>::max(); // don't limit total batch size
+
+            m_pimpl = std::make_unique<ContinuousBatchingAdapter>(model_str, weights_tensor, tokenizer,
+                                                                  default_config, device, properties, generation_config);
+#endif
+        } catch (ov::Exception&) {
+            // ignore exceptions from PA
+        }
+    }
+
+    if (m_pimpl == nullptr) {
         m_pimpl = std::make_unique<StatefulLLMPipeline>(
             utils::singleton_core().read_model(model_str, weights_tensor),
             tokenizer,

From 2f994721707248c679182605b53c5563ec4a346e Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Thu, 2 Jan 2025 08:14:30 +0100
Subject: [PATCH 2/2] Enable prefix caching

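Prefix caching keeps the KV cache blocks computed for previous requests, so
a repeated prompt prefix (e.g. the growing history in a chat) does not have
to be recomputed during prefill on every turn, which improves time to first
token (TTFT). A minimal chat-style sketch of the effect; the model path and
prompts are placeholders, and the flag is set explicitly here only to
illustrate what this patch now enables by default:

    #include <iostream>
    #include "openvino/genai/llm_pipeline.hpp"

    int main() {
        ov::genai::SchedulerConfig config;
        config.enable_prefix_caching = true; // what this patch turns on by default

        ov::genai::LLMPipeline pipe("/path/to/model", "CPU",
                                    ov::genai::scheduler_config(config));
        pipe.start_chat();
        // The second turn re-sends the first turn as part of the chat history;
        // with prefix caching its KV blocks are reused instead of recomputed.
        std::cout << pipe.generate("What is OpenVINO?") << '\n';
        std::cout << pipe.generate("Does it support continuous batching?") << '\n';
        pipe.finish_chat();
        return 0;
    }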
---
 src/cpp/src/llm_pipeline.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
index 3d5697e4e3..962147945c 100644
--- a/src/cpp/src/llm_pipeline.cpp
+++ b/src/cpp/src/llm_pipeline.cpp
@@ -121,6 +121,7 @@ ov::genai::LLMPipeline::LLMPipeline(
 #ifdef OPENVINO_ARCH_X86_64
             SchedulerConfig default_config;
             default_config.max_num_batched_tokens = std::numeric_limits<size_t>::max(); // don't limit total batch size
+            default_config.enable_prefix_caching = true; // for better TTFT in chat scenarios
 
             m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, tokenizer, default_config, device, properties);
 #endif
@@ -162,6 +163,7 @@ ov::genai::LLMPipeline::LLMPipeline(
 #ifdef OPENVINO_ARCH_X86_64
             SchedulerConfig default_config;
             default_config.max_num_batched_tokens = std::numeric_limits<size_t>::max(); // don't limit total batch size
+            default_config.enable_prefix_caching = true; // for better TTFT in chat scenarios
 
             m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, default_config, device, properties);
 #endif
@@ -228,6 +230,7 @@ ov::genai::LLMPipeline::LLMPipeline(
 #ifdef OPENVINO_ARCH_X86_64
             SchedulerConfig default_config;
             default_config.max_num_batched_tokens = std::numeric_limits<size_t>::max(); // don't limit total batch size
+            default_config.enable_prefix_caching = true; // for better TTFT in chat scenarios
 
             m_pimpl = std::make_unique<ContinuousBatchingAdapter>(model_str, weights_tensor, tokenizer,
                                                                   default_config, device, properties, generation_config);