[LLM] Enabled CB by default #1455

Merged
merged 20 commits into master from cb-by-default
Feb 5, 2025
Changes from all commits (20 commits)
bf462f5
Enabled CB by default
ilya-lavrenov Dec 30, 2024
2f99472
Enable prefix caching
ilya-lavrenov Jan 2, 2025
6038663
Merge remote-tracking branch 'upstream/master' into cb-by-default
ilya-lavrenov Jan 7, 2025
7fc5b4e
Merge branch 'master' into cb-by-default
andrei-kochin Jan 8, 2025
ba3a61b
Merge branch 'master' into cb-by-default
ilya-lavrenov Jan 9, 2025
52644b3
Merge branch 'master' into cb-by-default
andrei-kochin Jan 9, 2025
5245f42
Merge branch 'master' into cb-by-default
andrei-kochin Jan 10, 2025
f1a715f
Merge branch 'master' into cb-by-default
andrei-kochin Jan 10, 2025
5d98114
Merge branch 'master' into cb-by-default
ilya-lavrenov Jan 14, 2025
4136cfe
Merge remote-tracking branch 'upstream/master' into cb-by-default
ilya-lavrenov Jan 14, 2025
abc02db
Merge branch 'master' into cb-by-default
andrei-kochin Jan 15, 2025
36398cd
Merge branch 'master' into cb-by-default
andrei-kochin Jan 16, 2025
5999f95
Merge branch 'master' into cb-by-default
andrei-kochin Jan 22, 2025
c547e05
Merge branch 'master' into cb-by-default
ilya-lavrenov Jan 22, 2025
e7c7ed8
Merge branch 'master' into cb-by-default
ilya-lavrenov Jan 24, 2025
c8d35e5
Merge branch 'master' into cb-by-default
ilya-lavrenov Jan 28, 2025
e293a33
Merge branch 'master' into cb-by-default
ilya-lavrenov Jan 30, 2025
dcb3f3a
Merge branch 'master' into cb-by-default
ilya-lavrenov Feb 3, 2025
9d71fad
Ability to select Attention backend
ilya-lavrenov Feb 5, 2025
062e339
Merge branch 'master' into cb-by-default
Wovchena Feb 5, 2025
148 changes: 121 additions & 27 deletions src/cpp/src/llm_pipeline.cpp
@@ -5,6 +5,7 @@

#include <nlohmann/json.hpp>

#include "openvino/core/visibility.hpp"
#include "openvino/genai/llm_pipeline.hpp"
#include "openvino/genai/perf_metrics.hpp"

@@ -18,7 +19,7 @@ namespace genai {

namespace {

/*
 * NPU reads some properties from the config file, but when LLMPipeline is initialized
 * from the model_str and weights_tensor, there are no files.
 * In the latter case ModelDesc is stored in properties.
@@ -37,10 +38,27 @@ std::pair<ov::AnyMap, ov::genai::static_llm::ModelConfigDesc> split_model_descr(
pop_property(main_properties, "name_or_path", model_descr.name_or_path);
pop_property(main_properties, "type", model_descr.type);
pop_property(main_properties, "num_key_value_heads", model_descr.num_key_value_heads);

return {main_properties, model_descr};
}

const std::string PA_BACKEND = "PA";
const std::string SPDA_BACKEND = "SPDA";

std::pair<ov::AnyMap, std::string> extract_attention_backend(const ov::AnyMap& external_properties) {
ov::AnyMap properties = external_properties;
auto it = properties.find("ATTENTION_BACKEND");
std::string attention_backend = PA_BACKEND;
if (it != properties.end()) {
attention_backend = it->second.as<std::string>();
OPENVINO_ASSERT(attention_backend == PA_BACKEND || attention_backend == SPDA_BACKEND,
"Attention backend must be either '", PA_BACKEND, "' or '", SPDA_BACKEND, "', got '", attention_backend, "'");
properties.erase(it);
}
return {properties, attention_backend};
};


} // namespace
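
The ATTENTION_BACKEND property is stripped from the user-supplied map here before the remaining properties reach the plugin. A minimal usage sketch, assuming an exported model directory (the path and prompt are placeholders; the key and the "PA"/"SPDA" values come from the constants above):

#include "openvino/genai/llm_pipeline.hpp"

#include <iostream>
#include <string>

int main() {
    // "PA" (Paged Attention / continuous batching, the new default) or "SPDA"
    // to force the stateful pipeline; any other value fails the OPENVINO_ASSERT
    // in extract_attention_backend().
    ov::genai::LLMPipeline pipe("/path/to/exported/model", "CPU",
                                ov::AnyMap{{"ATTENTION_BACKEND", "SPDA"}});

    ov::genai::GenerationConfig config;
    config.max_new_tokens = 16;
    std::string result = pipe.generate("Hello,", config);
    std::cout << result << std::endl;
    return 0;
}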


Expand All @@ -62,7 +80,7 @@ std::pair<std::string, Any> draft_model(
const std::string& device,
const ov::AnyMap& properties) {
auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties);

std::filesystem::path openvino_model_name = "openvino_model.xml";
auto model = utils::singleton_core().read_model(models_path / openvino_model_name, {}, plugin_config);
auto generation_config = utils::from_config_json_if_exists(models_path);
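
The constructors below treat a draft model as an explicit request for continuous batching, so a speculative-decoding setup is one way to hit that branch. A rough sketch, assuming the public ov::genai::draft_model() helper wraps the function above; the paths are placeholders:

std::filesystem::path main_model_dir = "/path/to/main/model";    // placeholder
std::filesystem::path draft_model_dir = "/path/to/draft/model";  // placeholder

// draft_model() adds DRAFT_MODEL_ARG_NAME to the properties, which selects the
// explicit CB branch in the LLMPipeline constructors (errors re-thrown, no fallback).
ov::genai::LLMPipeline pipe(main_model_dir, "CPU",
                            ov::genai::draft_model(draft_model_dir, "CPU"));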
@@ -91,41 +109,92 @@ ov::genai::LLMPipeline::LLMPipeline(
OptionalGenerationConfig generation_config) {
auto start_time = std::chrono::steady_clock::now();
m_pimpl = std::make_unique<StatefulLLMPipeline>(request, tokenizer, generation_config);
m_pimpl->save_load_time(start_time);
}

ov::genai::LLMPipeline::LLMPipeline(
const std::filesystem::path& models_path,
const ov::genai::Tokenizer& tokenizer,
const std::string& device,
const ov::AnyMap& properties) {
const ov::AnyMap& user_properties) {
auto start_time = std::chrono::steady_clock::now();
if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||

auto [properties, attention_backend] = extract_attention_backend(user_properties);

// If CB is invoked explicitly, create the CB adapter as is and re-throw in case of internal issues
if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
properties.find(ov::genai::prompt_lookup.name()) != properties.end()) {
auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties);
m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, tokenizer, scheduler_config, device, plugin_config);
} else if (device == "NPU") {
auto [device_properties, scheduler_config] = utils::split_scheduler_config(properties);
m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, tokenizer, scheduler_config, device, device_properties);
}

if (m_pimpl == nullptr && device == "NPU") {
m_pimpl = static_llm::LLMPipelineFactory::create(models_path, tokenizer, device, properties);
} else {
}

// try the CB adapter one more time, but with a safeguard that silences exceptions
if (m_pimpl == nullptr && attention_backend == PA_BACKEND) {
try {
// we use CB only for x86, because for other architectures like arm64 or risc-v we can create a Paged Attention based model
// but cannot run its inference later
#ifdef OPENVINO_ARCH_X86_64
SchedulerConfig default_config;
default_config.max_num_batched_tokens = std::numeric_limits<size_t>::max(); // don't limit total batch size
default_config.enable_prefix_caching = true; // for better TTFT in chat scenarios

m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, tokenizer, default_config, device, properties);
#endif
} catch (ov::Exception&) {
// ignore exceptions from PA
}
}

if (m_pimpl == nullptr) {
m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, tokenizer, device, properties);
}

m_pimpl->save_load_time(start_time);
}
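
// Illustrative sketch (not part of this file): passing a SchedulerConfig explicitly
// takes the first branch above, so construction errors are re-thrown instead of
// silently falling back to StatefulLLMPipeline:
//
//   ov::genai::SchedulerConfig scheduler_config;
//   scheduler_config.cache_size = 2;  // KV-cache size in GB (illustrative value)
//   ov::genai::LLMPipeline pipe(models_path, tokenizer, "CPU",
//                               ov::AnyMap{ov::genai::scheduler_config(scheduler_config)});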

ov::genai::LLMPipeline::LLMPipeline(
const std::filesystem::path& models_path,
const std::string& device,
const ov::AnyMap& properties) {
const ov::AnyMap& user_properties) {
auto start_time = std::chrono::steady_clock::now();

if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
auto [properties, attention_backend] = extract_attention_backend(user_properties);

// If CB is invoked explicitly, create the CB adapter as is and re-throw in case of internal issues
if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
properties.find(ov::genai::prompt_lookup.name()) != properties.end()) {
auto [device_properties, scheduler_config] = utils::split_scheduler_config(properties);
m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, scheduler_config, device, device_properties);
} else if (device == "NPU") {
}

if (m_pimpl == nullptr && device == "NPU") {
m_pimpl = static_llm::LLMPipelineFactory::create(models_path, device, properties);
} else {
}

// try the CB adapter one more time, but with a safeguard that silences exceptions
if (m_pimpl == nullptr && attention_backend == PA_BACKEND) {
try {
// we use CB only for x86, because for other architectures like arm64 or risc-v we can create a Paged Attention based model
// but cannot run its inference later
#ifdef OPENVINO_ARCH_X86_64
SchedulerConfig default_config;
default_config.max_num_batched_tokens = std::numeric_limits<size_t>::max(); // don't limit total batch size
default_config.enable_prefix_caching = true; // for better TTFT in chat scenarios

m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, default_config, device, properties);
#endif
} catch (ov::Exception&) {
// ignore exceptions from PA
}
}

if (m_pimpl == nullptr) {
m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, device, properties);
}
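
// Illustrative sketch (not part of this file): with {"ATTENTION_BACKEND", "SPDA"} in the
// user properties, the Paged Attention attempt above is skipped and this stateful branch
// is taken directly:
//
//   ov::genai::LLMPipeline pipe("/path/to/model", "CPU",
//                               ov::AnyMap{{"ATTENTION_BACKEND", "SPDA"}});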

@@ -137,40 +206,65 @@ ov::genai::LLMPipeline::LLMPipeline(
const ov::Tensor& weights_tensor,
const ov::genai::Tokenizer& tokenizer,
const std::string& device,
const ov::AnyMap& properties,
const ov::AnyMap& user_properties,
const ov::genai::GenerationConfig& generation_config) {
auto start_time = std::chrono::steady_clock::now();

if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
auto [properties, attention_backend] = extract_attention_backend(user_properties);

// If CB is invoked explicitly, create the CB adapter as is and re-throw in case of internal issues
if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
properties.find(ov::genai::prompt_lookup.name()) != properties.end()){

auto [device_properties, scheduler_config] = utils::split_scheduler_config(properties);
m_pimpl = std::make_unique<ContinuousBatchingAdapter>(model_str, weights_tensor,
tokenizer, scheduler_config, device, device_properties, generation_config);
} else if (device == "NPU") {
}

if (m_pimpl == nullptr && device == "NPU") {
// TODO: CVS-158771 Currently, it's a workaround. Probably there is a better solution.
// NPU reads some properties from the config file, but when LLMPipeline is initialized
// from the model_str and weights_tensor, there are no files.
// Therefore, we need to pass these properties manually.
// This is necessary only for NPU; for other plugins it can be omitted.
// Example of usage:
// ov::AnyMap model_descr_properties = {{"name_or_path", "meta-llama/Llama-2-7b-chat-hf"},
// {"type", "llama"},
// {"num_key_value_heads", 32}};
// ov::genai::LLMPipeline pipe(model_str,..., model_descr_properties);
// This will convert from AnyMap to ModelDesc.
auto [filtered_properties, model_descr] = split_model_descr(properties);
auto [device_properties, model_descr] = split_model_descr(properties);

m_pimpl = static_llm::LLMPipelineFactory::create(
utils::singleton_core().read_model(model_str, weights_tensor),
model_descr,
tokenizer,
device,
filtered_properties,
device_properties,
generation_config
);
} else {
}
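
// Illustrative sketch (not part of this file), expanding the usage comment above; how the
// model string and weights are obtained is schematic, and the model name is just the
// example given in that comment:
//
//   std::string model_str = /* contents of openvino_model.xml */;
//   ov::Tensor weights_tensor = /* openvino_model.bin mapped into a tensor */;
//   ov::genai::Tokenizer tokenizer("/path/to/model_dir");
//   ov::genai::LLMPipeline pipe(model_str, weights_tensor, tokenizer, "NPU",
//                               ov::AnyMap{{"name_or_path", "meta-llama/Llama-2-7b-chat-hf"},
//                                          {"type", "llama"},
//                                          {"num_key_value_heads", 32}});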

// try the CB adapter one more time, but with a safeguard that silences exceptions
if (m_pimpl == nullptr && attention_backend == PA_BACKEND) {
try {
// we use CB only for x86, because for other architectures like arm64 or risc-v we can create a Paged Attention based model
// but cannot run its inference later
#ifdef OPENVINO_ARCH_X86_64
SchedulerConfig default_config;
default_config.max_num_batched_tokens = std::numeric_limits<size_t>::max(); // don't limit total batch size
default_config.enable_prefix_caching = true; // for better TTFT in chat scenarios

m_pimpl = std::make_unique<ContinuousBatchingAdapter>(model_str, weights_tensor, tokenizer,
default_config, device, properties, generation_config);
#endif
} catch (ov::Exception&) {
// ignore exceptions from PA
}
}

if (m_pimpl == nullptr) {
m_pimpl = std::make_unique<StatefulLLMPipeline>(
utils::singleton_core().read_model(model_str, weights_tensor),
tokenizer,