Skip to content

Commit

Permalink
Refactor to deltas + LUT
Browse files Browse the repository at this point in the history
  • Loading branch information
vshampor committed Dec 11, 2024
1 parent a542d3e commit 533585a
Show file tree
Hide file tree
Showing 6 changed files with 111 additions and 78 deletions.
31 changes: 22 additions & 9 deletions src/cpp/src/cache_eviction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -301,9 +301,18 @@ namespace ov::genai {
}
}

std::vector<CacheRotationCalculator::BlockRotationData> CacheRotationCalculator::get_rotation_coefficients(
// Read-only accessor for the precomputed RoPE sine lookup table, indexed as
// [token_position][embedding_pair_idx] (see the pipeline init code, which packs
// it row-by-row into the "rotation_trig_lut" input tensor).
const std::vector<std::vector<double>>& CacheRotationCalculator::get_sin_lut() const {
return m_rope_sin_lut;
}

// Read-only accessor for the precomputed RoPE cosine lookup table, indexed as
// [token_position][embedding_pair_idx], mirroring get_sin_lut().
const std::vector<std::vector<double>>& CacheRotationCalculator::get_cos_lut() const {
return m_rope_cos_lut;
}

std::vector<CacheRotationCalculator::BlockRotationData> CacheRotationCalculator::get_rotation_data(
const std::set<size_t>& evicted_block_logical_indices,
size_t num_logical_blocks_before_eviction) {
size_t num_logical_blocks_before_eviction,
bool deltas_only) {
OPENVINO_ASSERT(num_logical_blocks_before_eviction * m_block_size < m_rope_sin_lut.size(),
"num_logical_blocks_before_eviction may not correspond to less tokens than max_context_length");

Expand Down Expand Up @@ -331,13 +340,17 @@ namespace ov::genai {
if (current_rotation_delta_in_blocks != 0) {
BlockRotationData block_rotation_data;
block_rotation_data.logical_block_idx = logical_block_idx - current_rotation_delta_in_blocks;
block_rotation_data.cosines.reserve(m_block_size);
block_rotation_data.sines.reserve(m_block_size);
for (size_t i = 0; i < m_block_size; i++) {
block_rotation_data.cosines.push_back(
m_rope_cos_lut[current_rotation_delta_in_blocks * m_block_size]);
block_rotation_data.sines.push_back(
m_rope_sin_lut[current_rotation_delta_in_blocks * m_block_size]);
block_rotation_data.rotation_delta = current_rotation_delta_in_blocks * m_block_size;

if (!deltas_only) {
block_rotation_data.cosines.reserve(m_block_size);
block_rotation_data.sines.reserve(m_block_size);
for (size_t i = 0; i < m_block_size; i++) {
block_rotation_data.cosines.push_back(
m_rope_cos_lut[current_rotation_delta_in_blocks * m_block_size]);
block_rotation_data.sines.push_back(
m_rope_sin_lut[current_rotation_delta_in_blocks * m_block_size]);
}
}

retval.push_back(block_rotation_data);
Expand Down
17 changes: 13 additions & 4 deletions src/cpp/src/cache_eviction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -158,8 +158,12 @@ class CacheRotationCalculator {
// Equality over all value fields of the rotation record.
// NOTE(review): the commit adds a `rotation_delta` field but left it out of
// operator== — with the new `deltas_only == true` default the sines/cosines
// vectors are empty, so two records differing only in delta would have
// compared equal. Include the delta in the comparison.
bool operator==(const BlockRotationData& rhs) const {
    return (logical_block_idx == rhs.logical_block_idx) && (rotation_delta == rhs.rotation_delta) &&
           (sines == rhs.sines) && (cosines == rhs.cosines);
}
size_t logical_block_idx; /** Logical index of the block AFTER eviction to which the sine and cosine
coefficients should be applied */
size_t logical_block_idx; /** Logical index of the block AFTER eviction to which the rotation
should be applied */
size_t rotation_delta; /** Delta, in token positions, that should be applied to block contents
via rotation **/

// Fields below are currently only used for testing purposes
RotationCoefficientsPerToken sines; /** The sine coefficients to be applied to this block's contents for
rotation, in order of the block's elements */
RotationCoefficientsPerToken cosines; /** The cosine coefficients to be applied to this block's contents for
Expand All @@ -173,11 +177,13 @@ class CacheRotationCalculator {
* determined to be necessary to evict.
* @param num_logical_blocks_before_eviction Number of logical blocks that the evicted-from sequence occupied before
* the eviction step.
* @param deltas_only If true, the sines and cosines fields in each returned BlockRotationData will be left empty.
* @return A vector of per-block rotation data, including the indices of blocks after eviction that should be
* rotated, and the pre-computed trigonometric coefficients necessary for rotation.
*/
std::vector<BlockRotationData> get_rotation_coefficients(const std::set<size_t>& evicted_block_logical_indices,
size_t num_logical_blocks_before_eviction);
std::vector<BlockRotationData> get_rotation_data(const std::set<size_t>& evicted_block_logical_indices,
size_t num_logical_blocks_before_eviction,
bool deltas_only = true);

/**
* @return The size of the embedding dimension that this CacheRotationCalculator was initialized with.
Expand All @@ -186,6 +192,9 @@ class CacheRotationCalculator {
return m_head_size;
}

const std::vector<std::vector<double>>& get_sin_lut() const;
const std::vector<std::vector<double>>& get_cos_lut() const;

private:
size_t m_block_size;
size_t m_head_size;
Expand Down
75 changes: 40 additions & 35 deletions src/cpp/src/continuous_batching_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,20 +83,38 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::init(
m_num_decoder_layers,
/* collect_attention_scores = */ true,
/* is_use_per_layer_cache_control = */ true);
m_rotation_coefficient_stores.reserve(m_num_decoder_layers);
ov::Shape rotation_coefficient_store_shape{device_config.get_head_size() *
(m_scheduler->get_block_size() * scheduler_config.num_kv_blocks)};
m_rotation_deltas_stores.reserve(m_num_decoder_layers);
ov::Shape rotation_deltas_store_shape{m_scheduler->get_block_size() * scheduler_config.num_kv_blocks};
for (size_t i = 0; i < m_num_decoder_layers; i++) {
ov::Tensor store(ov::element::f32, rotation_coefficient_store_shape);
ov::Tensor store(ov::element::i32, rotation_deltas_store_shape);
std::memset(store.data(), 0, store.get_byte_size());
m_rotation_coefficient_stores.push_back(store);
m_rotation_deltas_stores.push_back(store);
}

// TODO (vshampor): LUT size equal to max cache size in tokens
// is overkill - find a way to pass the max sequence length defined by pipeline instead
size_t max_sequence_length = m_scheduler->get_block_size() * scheduler_config.num_kv_blocks;
size_t embedding_size = device_config.get_head_size();
m_cache_rotation_calculator = std::make_shared<CacheRotationCalculator>(
m_scheduler->get_block_size(),
// TODO (vshampor): LUT size equal to max cache size in tokens
// is overkill - find a way to pass the max sequence length instead
m_scheduler->get_block_size() * scheduler_config.num_kv_blocks,
device_config.get_head_size());
max_sequence_length,
embedding_size);
auto rotation_trig_lut = ov::Tensor(ov::element::f32, ov::Shape{max_sequence_length, device_config.get_head_size()});
float* rotation_trig_lut_data = rotation_trig_lut.data<float>();
std::memset(rotation_trig_lut_data, 0, rotation_trig_lut.get_byte_size());

const auto& cos_lut = m_cache_rotation_calculator->get_cos_lut();
const auto& sin_lut = m_cache_rotation_calculator->get_sin_lut();


for (size_t pos_idx = 0; pos_idx < cos_lut.size(); pos_idx++) {
for (size_t embedding_pair_idx = 0; embedding_pair_idx < cos_lut[0].size(); embedding_pair_idx++) {
rotation_trig_lut_data[pos_idx * embedding_size + embedding_pair_idx] = cos_lut[pos_idx][embedding_pair_idx];
rotation_trig_lut_data[pos_idx * embedding_size + embedding_size / 2 + embedding_pair_idx] = sin_lut[pos_idx][embedding_pair_idx];
}
}

m_model_runner->set_cache_rotation_trig_lut(std::move(rotation_trig_lut));
} else {
m_model_runner =
std::make_shared<ModelRunner>(infer_request, m_scheduler->get_block_size(), m_num_decoder_layers);
Expand Down Expand Up @@ -177,8 +195,8 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::step() {

if (sched_config.use_cache_eviction) {
_compute_cache_rotation_data(m_requests, scheduler_output);
m_model_runner->set_cache_rotation_data(std::move(m_current_step_rotation_coefficients),
std::move(m_current_step_rotated_block_indices_per_sequence));
m_model_runner->set_cache_rotation_data(std::move(m_current_step_rotated_block_indices_per_sequence),
std::move(m_current_step_rotation_deltas));
}

timer.end();
Expand Down Expand Up @@ -413,11 +431,10 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_compute_cache_rotation
}
}

size_t head_size = m_cache_rotation_calculator->get_head_size();
// necessary since we move from these members during previous steps
m_current_step_rotation_coefficients.clear();
m_current_step_rotated_block_indices_per_sequence.clear();
m_current_step_rotated_block_indices_per_sequence.resize(m_num_decoder_layers);
m_current_step_rotation_deltas.clear();

std::vector<size_t> num_blocks_to_rotate_for_each_layer(m_num_decoder_layers, 0);

Expand All @@ -426,7 +443,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_compute_cache_rotation
size_t seq_id = seq_id_and_evicted_blocks.first;
// Skip sequences that, in the meanwhile before previous step's forward execution and now,
// have left the cache (e.g. finished or were preempted)
if (live_sequences.find(seq_id) == live_sequences.end()) {
if (live_seq_ids_to_num_occupied_blocks.find(seq_id) == live_seq_ids_to_num_occupied_blocks.end()) {
continue;
}

Expand All @@ -438,43 +455,31 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_compute_cache_rotation
}
size_t num_blocks_before_eviction = m_previous_num_blocks_before_eviction_per_sequence[seq_id];
auto rotation_multipliers =
m_cache_rotation_calculator->get_rotation_coefficients(logical_blocks_to_evict[layer_idx],
m_cache_rotation_calculator->get_rotation_data(logical_blocks_to_evict[layer_idx],
num_blocks_before_eviction);
for (size_t i = 0; i < rotation_multipliers.size(); i++) {
const auto& block_rotation_data = rotation_multipliers[i];
const auto& rotation_multipliers_cos = block_rotation_data.cosines;
const auto& rotation_multipliers_sin = block_rotation_data.sines;
OPENVINO_ASSERT(rotation_multipliers_cos.size() == rotation_multipliers_sin.size());
OPENVINO_ASSERT(rotation_multipliers_cos.size() == m_scheduler->get_block_size());

m_current_step_rotated_block_indices_per_sequence[layer_idx][seq_id].push_back(
block_rotation_data.logical_block_idx);

// Fill the store tensor with rotation coefficient data - cos and sin coefficients are each contiguous,
// cos goes first
size_t block_offset =
num_blocks_to_rotate_for_each_layer[layer_idx] * m_scheduler->get_block_size() * head_size;
auto rotation_multipliers_tensor_data =
m_rotation_coefficient_stores[layer_idx].data<float>() + block_offset;
for (size_t tok_idx = 0; tok_idx < rotation_multipliers_cos.size(); tok_idx++) {
size_t position_offset = head_size * tok_idx;
for (size_t embedding_pair_idx = 0; embedding_pair_idx < head_size / 2; embedding_pair_idx++) {
rotation_multipliers_tensor_data[position_offset + embedding_pair_idx] =
rotation_multipliers_cos[tok_idx][embedding_pair_idx];
rotation_multipliers_tensor_data[position_offset + embedding_pair_idx + head_size / 2] =
rotation_multipliers_sin[tok_idx][embedding_pair_idx];
}
num_blocks_to_rotate_for_each_layer[layer_idx] * m_scheduler->get_block_size();
auto rotation_deltas_tensor_data =
m_rotation_deltas_stores[layer_idx].data<int32_t>() + block_offset;
for (size_t tok_idx = 0; tok_idx < m_scheduler->get_block_size(); tok_idx++) {
rotation_deltas_tensor_data[tok_idx] = block_rotation_data.rotation_delta;
}
num_blocks_to_rotate_for_each_layer[layer_idx] += 1;
}
}
}
// Select the previously filled rotation coefficients from the store tensor
for (size_t i = 0; i < m_num_decoder_layers; i++) {
m_current_step_rotation_coefficients.emplace_back(
m_rotation_coefficient_stores[i],
m_current_step_rotation_deltas.emplace_back(
m_rotation_deltas_stores[i],
ov::Coordinate{0},
ov::Coordinate{num_blocks_to_rotate_for_each_layer[i] * m_scheduler->get_block_size() * head_size});
ov::Coordinate{num_blocks_to_rotate_for_each_layer[i] * m_scheduler->get_block_size()});
}
}

Expand Down
8 changes: 3 additions & 5 deletions src/cpp/src/continuous_batching_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,16 +32,14 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc

size_t m_num_decoder_layers = 0;

// Pre-allocated per-layer storages for the per-token cache re-rotation coefficients used in cache eviction case
std::vector<ov::Tensor> m_rotation_coefficient_stores;
// Pre-allocated per-layer storages for the per-token cache re-rotation deltas used in cache eviction case
std::vector<ov::Tensor> m_rotation_deltas_stores;

std::map<size_t, std::vector<std::set<size_t>>> m_previous_evicted_block_logical_indices_per_sequence;
std::map<size_t, size_t> m_previous_num_blocks_before_eviction_per_sequence;

// Per-layer ROI tensors, reusing storage from the pre-allocated tensors above, that actually represent the
// re-rotation coefficients to be sent to the proper model inputs at the *next* pipeline step.
std::vector<ov::Tensor> m_current_step_rotation_coefficients;
std::vector<std::map<size_t, std::vector<size_t>>> m_current_step_rotated_block_indices_per_sequence;
std::vector<ov::Tensor> m_current_step_rotation_deltas;

std::shared_ptr<ov::genai::CacheRotationCalculator> m_cache_rotation_calculator;

Expand Down
31 changes: 18 additions & 13 deletions src/cpp/src/model_runner.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,10 @@ class ModelRunner {
size_t m_num_decoder_layers, m_block_size;
bool m_collect_attention_scores;
bool m_is_use_per_layer_cache_control;
std::vector<ov::Tensor> m_cache_rotation_coefficients;

std::vector<std::map<size_t, std::vector<size_t>>> m_rotated_block_logical_indices_per_sequence_for_each_layer;
std::vector<ov::Tensor> m_cache_rotation_deltas_for_each_layer;
ov::Tensor m_cache_rotation_trig_lut;

public:
/**
Expand Down Expand Up @@ -76,12 +78,16 @@ class ModelRunner {
return m_last_attention_scores;
}

void set_cache_rotation_data(std::vector<ov::Tensor>&& cache_rotation_coefficients_for_each_layer,
const std::vector<std::map<size_t, std::vector<size_t>>>&&
rotated_logical_block_indices_per_sequence_for_each_layer) {
m_cache_rotation_coefficients = std::move(cache_rotation_coefficients_for_each_layer);
// Stores the precomputed trig LUT tensor built at pipeline init (each row:
// cos values in the first half, sin values in the second half — see the
// packing loop in ContinuousBatchingImpl::init). Bound to the
// "rotation_trig_lut" model input whenever rotation deltas are present for a
// step. Takes ownership of the tensor via move.
void set_cache_rotation_trig_lut(ov::Tensor&& rotation_trig_lut) {
m_cache_rotation_trig_lut = std::move(rotation_trig_lut);
}

// Supplies the per-step cache-rotation inputs for the eviction case:
// - per layer, a map of seq_id -> logical block indices (post-eviction) whose
//   contents must be rotated (fed to the "rotated_block_indices.N" inputs);
// - per layer, an i32 tensor of per-token position deltas for those blocks
//   (fed to the "rotation_deltas.N" inputs).
// Both arguments are taken by rvalue reference and moved into members; the
// non-empty deltas vector is what gates rotation input setup on the next run.
void set_cache_rotation_data(std::vector<std::map<size_t, std::vector<size_t>>>&&
rotated_logical_block_indices_per_sequence_for_each_layer,
std::vector<ov::Tensor>&& rotation_deltas_for_each_layer) {
m_rotated_block_logical_indices_per_sequence_for_each_layer =
std::move(rotated_logical_block_indices_per_sequence_for_each_layer);
m_cache_rotation_deltas_for_each_layer = std::move(rotation_deltas_for_each_layer);
}

/**
Expand Down Expand Up @@ -184,8 +190,9 @@ class ModelRunner {

_set_block_indices(sequence_groups, scheduler_output, total_num_blocks);

if (!m_cache_rotation_coefficients.empty()) {
if (!m_cache_rotation_deltas_for_each_layer.empty()) {
_set_cache_rotation_coefficients(sequence_groups, scheduler_output);
m_request.set_tensor("rotation_trig_lut", m_cache_rotation_trig_lut);
}

m_request.set_tensor("block_indices_begins", block_indices_begins);
Expand All @@ -211,8 +218,6 @@ class ModelRunner {
_collect_attention_scores(sequence_groups, scheduler_output);
}

m_cache_rotation_coefficients.clear();

// return logits
return m_request.get_tensor("logits");
}
Expand Down Expand Up @@ -307,11 +312,6 @@ class ModelRunner {

void _set_cache_rotation_coefficients(const std::vector<SequenceGroup::Ptr>& sequence_groups,
const Scheduler::Output& scheduler_output) {
for (size_t i = 0; i < m_num_decoder_layers; i++) {
auto tensor_name = std::string("rotation_coefficients.") + std::to_string(i);
m_request.set_tensor(tensor_name, m_cache_rotation_coefficients[i]);
}

std::vector<std::string> rotation_indices_tensor_names(m_num_decoder_layers);
for (size_t i = 0; i < m_num_decoder_layers; i++) {
auto tensor_name = std::string("rotated_block_indices.") + std::to_string(i);
Expand All @@ -324,6 +324,11 @@ class ModelRunner {
rotated_block_indices_tensor.set_shape({num_indices});
}

for (size_t i = 0; i < m_num_decoder_layers; i++) {
auto tensor_name = std::string("rotation_deltas.") + std::to_string(i);
m_request.set_tensor(tensor_name, m_cache_rotation_deltas_for_each_layer[i]);
}


// NB: the order of per-sequence index filling in the function below must be the same
// as the order of `seq_id`s in which the "rotation_coefficients.N" inputs are filled
Expand Down
Loading

0 comments on commit 533585a

Please sign in to comment.