New detection of num_kv_heads, head_size #8

Merged: 1 commit, May 13, 2024

--- File: CacheManager header ---
@@ -7,24 +7,21 @@

 #include "openvino/runtime/tensor.hpp"

-#include "model_config.hpp"
 #include "device_config.hpp"

 class CacheManager {
-    ModelConfig m_model_config;
     DeviceConfig m_device_config;
     std::vector<ov::Tensor> m_key_cache;
     std::vector<ov::Tensor> m_value_cache;

 public:
-    CacheManager(const ModelConfig& model_config, const DeviceConfig& device_config) :
-        m_model_config(model_config),
+    explicit CacheManager(const DeviceConfig& device_config) :
         m_device_config(device_config) {
-        m_key_cache.reserve(m_model_config.get_num_layers());
-        m_value_cache.reserve(m_model_config.get_num_layers());
+        m_key_cache.reserve(m_device_config.get_num_layers());
+        m_value_cache.reserve(m_device_config.get_num_layers());

         // Allocate KV caches
-        for (size_t decoder_layer_id = 0; decoder_layer_id < model_config.get_num_layers(); ++decoder_layer_id) {
+        for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) {
             ov::Tensor key_cache(device_config.get_cache_precision(), device_config.get_key_cache_shape());
             ov::Tensor value_cache(device_config.get_cache_precision(), device_config.get_value_cache_shape());
@@ -33,10 +30,6 @@ class CacheManager {
         }
     }

-    size_t get_num_layers() const {
-        return m_key_cache.size();
-    }
-
     ov::Tensor get_key_cache(size_t decoder_layer_id) const {
         OPENVINO_ASSERT(decoder_layer_id < m_key_cache.size());
         return m_key_cache[decoder_layer_id];
@@ -71,7 +64,7 @@ class CacheManager {
         key_dst_end_roi[0] = (key_dst_start_roi[0] = dst_block_id) + 1;
         value_dst_end_roi[0] = (value_dst_start_roi[0] = dst_block_id) + 1;

-        for (size_t decoder_layer_id = 0; decoder_layer_id < m_model_config.get_num_layers(); ++decoder_layer_id) {
+        for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) {
             ov::Tensor key_src_cache_roi(m_key_cache[decoder_layer_id], key_src_start_roi, key_src_end_roi);
             ov::Tensor key_dst_cache_roi(m_key_cache[decoder_layer_id], key_dst_start_roi, key_dst_end_roi);
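
Since CacheManager now sizes everything from DeviceConfig alone, it can only be constructed after the device config has learned the model parameters. A minimal wiring sketch of the resulting call order (only the names visible in this diff are real; the surrounding variables are illustrative):

    // Sketch: construction order the new API requires.
    // apply_paged_attention_transformations() must run before CacheManager is built,
    // because it calls device_config.set_model_params(), which defines
    // get_num_layers() and the key/value cache shapes.
    DeviceConfig device_config(core, scheduler_config, "CPU");
    apply_paged_attention_transformations(model, device_config);
    CacheManager cache_manager(device_config);  // allocates num_layers K and V tensors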

--- File: ContinuousBatchingPipeline implementation ---
@@ -7,8 +7,6 @@
 #include "model_runner.hpp"
 #include "scheduler.hpp"
 #include "timer.hpp"
-#include "model_config.hpp"
-#include "model_config.hpp"
 #include "tokenizer.hpp"

 #include "debug_utils.hpp"
@@ -41,8 +39,7 @@ GenerationResult from_sequence_group(std::shared_ptr<Tokenizer> tokenizer, Seque

 } // namespace

-void apply_paged_attention_transformations(std::shared_ptr<ov::Model> model,
-                                           const ModelConfig& model_config, const DeviceConfig& device_config);
+void apply_paged_attention_transformations(std::shared_ptr<ov::Model> model, DeviceConfig& device_config);

 class ContinuousBatchingPipeline::Impl {
     std::shared_ptr<Tokenizer> m_tokenizer;
@@ -84,17 +81,16 @@ class ContinuousBatchingPipeline::Impl {

         // The model can be compiled for GPU as well
         std::shared_ptr<ov::Model> model = core.read_model(models_path + "/openvino_model.xml");
-        ModelConfig model_config(model);

         const std::string device = "CPU";
-        DeviceConfig device_config(core, scheduler_config, model_config, device);
+        DeviceConfig device_config(core, scheduler_config, device);

-        apply_paged_attention_transformations(model, model_config, device_config);
+        apply_paged_attention_transformations(model, device_config);
         ov::InferRequest infer_request = core.compile_model(model, device_config.get_device(), ov::enable_profiling(true)).create_infer_request();

         // setup KV caches
-        m_cache_manager = std::make_shared<CacheManager>(model_config, device_config);
-        for (size_t decoder_layer_id = 0; decoder_layer_id < model_config.get_num_layers(); ++decoder_layer_id) {
+        m_cache_manager = std::make_shared<CacheManager>(device_config);
+        for (size_t decoder_layer_id = 0; decoder_layer_id < device_config.get_num_layers(); ++decoder_layer_id) {
             infer_request.set_input_tensor(2 + decoder_layer_id * 2, m_cache_manager->get_key_cache(decoder_layer_id));
             infer_request.set_input_tensor(2 + decoder_layer_id * 2 + 1, m_cache_manager->get_value_cache(decoder_layer_id));
         }
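
The hard-coded offset of 2 in the set_input_tensor calls above matches the kv_caches_inputs_offset introduced in the transformation below; the first two model parameters after SDPAToPagedAttention are assumed to be regular (non-cache) inputs, which this diff does not spell out. A sketch of that assumed layout:

    #include <cstddef>

    // Assumed parameter layout after SDPAToPagedAttention (an assumption, not stated in the diff):
    //   parameters[0], parameters[1]   -> regular model inputs
    //   parameters[2 + 2 * layer]      -> key cache of decoder layer `layer`
    //   parameters[2 + 2 * layer + 1]  -> value cache of decoder layer `layer`
    size_t key_input_index(size_t layer)   { return 2 + 2 * layer; }
    size_t value_input_index(size_t layer) { return 2 + 2 * layer + 1; }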

--- File: DeviceConfig header ---
@@ -4,35 +4,47 @@

 #pragma once

 #include "openvino/runtime/core.hpp"
+#include "openvino/core/shape.hpp"
+#include "openvino/core/type/element_type.hpp"

-#include "model_config.hpp"
 #include "scheduler_config.hpp"

 class DeviceConfig {
     ov::element::Type m_kv_cache_type;
-    ov::Shape m_key_cache_shape;
-    ov::Shape m_value_cache_shape;
+    ov::Shape m_key_cache_shape, m_value_cache_shape;
+    ov::Shape::value_type m_num_kv_heads, m_head_size, m_num_decoder_layers;
+    size_t m_num_kv_blocks, m_block_size;
     std::string m_device;

 public:
-    DeviceConfig(ov::Core& core, const SchedulerConfig& scheduling_config, const ModelConfig& model_config, const std::string& device) {
+    DeviceConfig(ov::Core& core, const SchedulerConfig& scheduling_config, const std::string& device) {
         m_device = device;

+        // keep information about blocks
+        m_num_kv_blocks = scheduling_config.num_kv_blocks;
+        m_block_size = scheduling_config.block_size;
+
         if (m_device == "CPU") {
             auto inference_precision = core.get_property(device, ov::hint::inference_precision);
             m_kv_cache_type = inference_precision == ov::element::bf16 ? ov::element::bf16 : ov::element::f16;
-            m_key_cache_shape = m_value_cache_shape = ov::Shape{scheduling_config.num_kv_blocks,
-                                                                model_config.get_num_kv_heads(),
-                                                                scheduling_config.block_size,
-                                                                model_config.get_head_size()};
         } else if (m_device == "GPU") {
             OPENVINO_ASSERT("GPU is not currently supported. Please, remove this assert and fill configuration");
         } else {
             OPENVINO_THROW(m_device, " is not supported by OpenVINO Continuous Batching");
         }
     }

+    void set_model_params(size_t num_kv_heads, size_t head_size, size_t num_decoder_layers) {
+        m_num_kv_heads = num_kv_heads;
+        m_head_size = head_size;
+        m_num_decoder_layers = num_decoder_layers;
+
+        m_key_cache_shape = m_value_cache_shape = ov::Shape{m_num_kv_blocks,
+                                                            m_num_kv_heads,
+                                                            m_block_size,
+                                                            m_head_size};
+    }
+
     std::string get_device() const {
         return m_device;
     }
@@ -41,11 +53,17 @@ class DeviceConfig {
         return m_kv_cache_type;
     }

+    size_t get_num_layers() const {
+        return m_num_decoder_layers;
+    }
+
     ov::Shape get_key_cache_shape() const {
+        OPENVINO_ASSERT(!m_key_cache_shape.empty());
         return m_key_cache_shape;
     }

     ov::Shape get_value_cache_shape() const {
+        OPENVINO_ASSERT(!m_value_cache_shape.empty());
         return m_value_cache_shape;
     }
 };
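
With set_model_params in place, each per-layer cache tensor has shape {num_kv_blocks, num_kv_heads, block_size, head_size}. A worked sizing sketch with made-up numbers (none of these values appear in the PR):

    // Illustrative only: per-layer KV cache memory at f16/bf16 (2 bytes per element).
    size_t num_kv_blocks = 1024, num_kv_heads = 8, block_size = 16, head_size = 64;
    size_t elements = num_kv_blocks * num_kv_heads * block_size * head_size;  // 8,388,608
    size_t bytes_per_tensor = elements * 2;  // 16 MiB for the key cache; the value cache matches
    // Total KV memory is 2 * bytes_per_tensor * num_decoder_layers.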

--- File: model_config.hpp ---

This file was deleted.


--- File: paged attention transformations ---
@@ -6,20 +6,38 @@
 #include "openvino/pass/manager.hpp"
 #include "openvino/pass/sdpa_to_paged_attention.hpp"

-#include "model_config.hpp"
 #include "device_config.hpp"

-void apply_paged_attention_transformations(std::shared_ptr<ov::Model> model, const ModelConfig& model_config, const DeviceConfig& device_config) {
+void apply_paged_attention_transformations(std::shared_ptr<ov::Model> model, DeviceConfig& device_config) {
+    const ov::op::util::VariableVector& variables = model->get_variables();
+    OPENVINO_ASSERT(!variables.empty(), "Model is supposed to be stateful");
+
+    // number of variables is 2 (K and V) multiplied by number of decoder layers
+    size_t num_layers = variables.size() >> 1;
+
     ov::pass::Manager manager;
     manager.register_pass<ov::pass::SDPAToPagedAttention>();
     manager.run_passes(model);

     const ov::ParameterVector& parameters = model->get_parameters();
-    for (size_t decoder_layer_id = 0; decoder_layer_id < model_config.get_num_layers(); ++decoder_layer_id) {
-        parameters[2 + 2 * decoder_layer_id]->set_element_type(device_config.get_cache_precision());
-        parameters[2 + 2 * decoder_layer_id + 1]->set_element_type(device_config.get_cache_precision());
-        parameters[2 + 2 * decoder_layer_id]->set_partial_shape(device_config.get_key_cache_shape());
-        parameters[2 + 2 * decoder_layer_id + 1]->set_partial_shape(device_config.get_value_cache_shape());
+
+    for (auto param : parameters) {
+        std::cout << param->get_friendly_name() << " " << param->get_partial_shape() << std::endl;
+    }
+
+    // extract num_kv_heads and head_size
+    size_t kv_caches_inputs_offset = 2;
+    ov::PartialShape k_shape = parameters[kv_caches_inputs_offset]->get_partial_shape();
+    OPENVINO_ASSERT(k_shape.rank().get_length() == 3, "KV cache shape is expected to have rank 3, while shape is ", k_shape);
+    size_t num_kv_heads = k_shape[1].get_length(), head_size = k_shape[2].get_length();
+
+    device_config.set_model_params(num_kv_heads, head_size, num_layers);
+
+    for (size_t decoder_layer_id = 0; decoder_layer_id < num_layers; ++decoder_layer_id) {
+        parameters[kv_caches_inputs_offset + 2 * decoder_layer_id]->set_element_type(device_config.get_cache_precision());
+        parameters[kv_caches_inputs_offset + 2 * decoder_layer_id + 1]->set_element_type(device_config.get_cache_precision());
+        parameters[kv_caches_inputs_offset + 2 * decoder_layer_id]->set_partial_shape(device_config.get_key_cache_shape());
+        parameters[kv_caches_inputs_offset + 2 * decoder_layer_id + 1]->set_partial_shape(device_config.get_value_cache_shape());
     }
     model->validate_nodes_and_infer_types();
 }
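
The detection works because SDPAToPagedAttention replaces each stateful KV variable with a cache-input Parameter whose rank-3 partial shape carries the head geometry in dimensions 1 and 2. An illustration with hypothetical numbers, for a model with 32 KV heads and head_size 128:

    #include "openvino/core/partial_shape.hpp"
    #include "openvino/core/except.hpp"  // OPENVINO_ASSERT

    void example_detection() {
        // Hypothetical shape a key-cache parameter would report: [?, 32, 128].
        ov::PartialShape k_shape{ov::Dimension::dynamic(), 32, 128};
        OPENVINO_ASSERT(k_shape.rank().get_length() == 3);
        size_t num_kv_heads = k_shape[1].get_length();  // 32
        size_t head_size = k_shape[2].get_length();     // 128
    }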

--- File: Scheduler header ---
@@ -7,7 +7,6 @@
 #include <cstdlib>
 #include <vector>

-#include "model_config.hpp"
 #include "block_manager.hpp"
 #include "sequence_group.hpp"
 #include "block_manager.hpp"