Skip to content

Commit

Permalink
Refactor to deltas + LUT
Browse files Browse the repository at this point in the history
  • Loading branch information
vshampor committed Dec 11, 2024
1 parent a542d3e commit 533585a
Show file tree
Hide file tree
Showing 6 changed files with 111 additions and 78 deletions.
31 changes: 22 additions & 9 deletions src/cpp/src/cache_eviction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -301,9 +301,18 @@ namespace ov::genai {
}
}

std::vector<CacheRotationCalculator::BlockRotationData> CacheRotationCalculator::get_rotation_coefficients(
// Read-only accessor for the precomputed RoPE sine lookup table, indexed as
// [token_position][embedding_pair_idx] (see the pipeline init code, which packs
// it row-by-row into the "rotation_trig_lut" input tensor).
const std::vector<std::vector<double>>& CacheRotationCalculator::get_sin_lut() const {
return m_rope_sin_lut;
}

// Read-only accessor for the precomputed RoPE cosine lookup table, indexed as
// [token_position][embedding_pair_idx], mirroring get_sin_lut().
const std::vector<std::vector<double>>& CacheRotationCalculator::get_cos_lut() const {
return m_rope_cos_lut;
}

std::vector<CacheRotationCalculator::BlockRotationData> CacheRotationCalculator::get_rotation_data(
const std::set<size_t>& evicted_block_logical_indices,
size_t num_logical_blocks_before_eviction) {
size_t num_logical_blocks_before_eviction,
bool deltas_only) {
OPENVINO_ASSERT(num_logical_blocks_before_eviction * m_block_size < m_rope_sin_lut.size(),
"num_logical_blocks_before_eviction may not correspond to less tokens than max_context_length");

Expand Down Expand Up @@ -331,13 +340,17 @@ namespace ov::genai {
if (current_rotation_delta_in_blocks != 0) {
BlockRotationData block_rotation_data;
block_rotation_data.logical_block_idx = logical_block_idx - current_rotation_delta_in_blocks;
block_rotation_data.cosines.reserve(m_block_size);
block_rotation_data.sines.reserve(m_block_size);
for (size_t i = 0; i < m_block_size; i++) {
block_rotation_data.cosines.push_back(
m_rope_cos_lut[current_rotation_delta_in_blocks * m_block_size]);
block_rotation_data.sines.push_back(
m_rope_sin_lut[current_rotation_delta_in_blocks * m_block_size]);
block_rotation_data.rotation_delta = current_rotation_delta_in_blocks * m_block_size;

if (!deltas_only) {
block_rotation_data.cosines.reserve(m_block_size);
block_rotation_data.sines.reserve(m_block_size);
for (size_t i = 0; i < m_block_size; i++) {
block_rotation_data.cosines.push_back(
m_rope_cos_lut[current_rotation_delta_in_blocks * m_block_size]);
block_rotation_data.sines.push_back(
m_rope_sin_lut[current_rotation_delta_in_blocks * m_block_size]);
}
}

retval.push_back(block_rotation_data);
Expand Down
17 changes: 13 additions & 4 deletions src/cpp/src/cache_eviction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -158,8 +158,12 @@ class CacheRotationCalculator {
// Equality over all value fields of the rotation record.
// NOTE(review): the commit adds a `rotation_delta` field but left it out of
// operator== — with the new `deltas_only == true` default the sines/cosines
// vectors are empty, so two records differing only in delta would have
// compared equal. Include the delta in the comparison.
bool operator==(const BlockRotationData& rhs) const {
    return (logical_block_idx == rhs.logical_block_idx) && (rotation_delta == rhs.rotation_delta) &&
           (sines == rhs.sines) && (cosines == rhs.cosines);
}
size_t logical_block_idx; /** Logical index of the block AFTER eviction to which the sine and cosine
coefficients should be applied */
size_t logical_block_idx; /** Logical index of the block AFTER eviction to which the rotation
should be applied */
size_t rotation_delta; /** Delta, in token positions, that should be applied to block contents
via rotation **/

// Fields below are currently only used for testing purposes
RotationCoefficientsPerToken sines; /** The sine coefficients to be applied to this block's contents for
rotation, in order of the block's elements */
RotationCoefficientsPerToken cosines; /** The cosine coefficients to be applied to this block's contents for
Expand All @@ -173,11 +177,13 @@ class CacheRotationCalculator {
* determined to be necessary to evict.
* @param num_logical_blocks_before_eviction Number of logical blocks that the evicted-from sequence occupied before
* the eviction step.
* @param deltas_only If true, the sines and cosines fields in each returned BlockRotationData will be left empty.
* @return A vector of per-block rotation data, including the indices of blocks after eviction that should be
* rotated, and the pre-computed trigonometric coefficients necessary for rotation.
*/
std::vector<BlockRotationData> get_rotation_coefficients(const std::set<size_t>& evicted_block_logical_indices,
size_t num_logical_blocks_before_eviction);
std::vector<BlockRotationData> get_rotation_data(const std::set<size_t>& evicted_block_logical_indices,
size_t num_logical_blocks_before_eviction,
bool deltas_only = true);

/**
* @return The size of the embedding dimension that this CacheRotationCalculator was initialized with.
Expand All @@ -186,6 +192,9 @@ class CacheRotationCalculator {
return m_head_size;
}

const std::vector<std::vector<double>>& get_sin_lut() const;
const std::vector<std::vector<double>>& get_cos_lut() const;

private:
size_t m_block_size;
size_t m_head_size;
Expand Down
75 changes: 40 additions & 35 deletions src/cpp/src/continuous_batching_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,20 +83,38 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::init(
m_num_decoder_layers,
/* collect_attention_scores = */ true,
/* is_use_per_layer_cache_control = */ true);
m_rotation_coefficient_stores.reserve(m_num_decoder_layers);
ov::Shape rotation_coefficient_store_shape{device_config.get_head_size() *
(m_scheduler->get_block_size() * scheduler_config.num_kv_blocks)};
m_rotation_deltas_stores.reserve(m_num_decoder_layers);
ov::Shape rotation_deltas_store_shape{m_scheduler->get_block_size() * scheduler_config.num_kv_blocks};
for (size_t i = 0; i < m_num_decoder_layers; i++) {
ov::Tensor store(ov::element::f32, rotation_coefficient_store_shape);
ov::Tensor store(ov::element::i32, rotation_deltas_store_shape);
std::memset(store.data(), 0, store.get_byte_size());
m_rotation_coefficient_stores.push_back(store);
m_rotation_deltas_stores.push_back(store);
}

// TODO (vshampor): LUT size equal to max cache size in tokens
// is overkill - find a way to pass the max sequence length defined by pipeline instead
size_t max_sequence_length = m_scheduler->get_block_size() * scheduler_config.num_kv_blocks;
size_t embedding_size = device_config.get_head_size();
m_cache_rotation_calculator = std::make_shared<CacheRotationCalculator>(
m_scheduler->get_block_size(),
// TODO (vshampor): LUT size equal to max cache size in tokens
// is overkill - find a way to pass the max sequence length instead
m_scheduler->get_block_size() * scheduler_config.num_kv_blocks,
device_config.get_head_size());
max_sequence_length,
embedding_size);
auto rotation_trig_lut = ov::Tensor(ov::element::f32, ov::Shape{max_sequence_length, device_config.get_head_size()});
float* rotation_trig_lut_data = rotation_trig_lut.data<float>();
std::memset(rotation_trig_lut_data, 0, rotation_trig_lut.get_byte_size());

const auto& cos_lut = m_cache_rotation_calculator->get_cos_lut();
const auto& sin_lut = m_cache_rotation_calculator->get_sin_lut();


for (size_t pos_idx = 0; pos_idx < cos_lut.size(); pos_idx++) {
for (size_t embedding_pair_idx = 0; embedding_pair_idx < cos_lut[0].size(); embedding_pair_idx++) {
rotation_trig_lut_data[pos_idx * embedding_size + embedding_pair_idx] = cos_lut[pos_idx][embedding_pair_idx];
rotation_trig_lut_data[pos_idx * embedding_size + embedding_size / 2 + embedding_pair_idx] = sin_lut[pos_idx][embedding_pair_idx];
}
}

m_model_runner->set_cache_rotation_trig_lut(std::move(rotation_trig_lut));
} else {
m_model_runner =
std::make_shared<ModelRunner>(infer_request, m_scheduler->get_block_size(), m_num_decoder_layers);
Expand Down Expand Up @@ -177,8 +195,8 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::step() {

if (sched_config.use_cache_eviction) {
_compute_cache_rotation_data(m_requests, scheduler_output);
m_model_runner->set_cache_rotation_data(std::move(m_current_step_rotation_coefficients),
std::move(m_current_step_rotated_block_indices_per_sequence));
m_model_runner->set_cache_rotation_data(std::move(m_current_step_rotated_block_indices_per_sequence),
std::move(m_current_step_rotation_deltas));
}

timer.end();
Expand Down Expand Up @@ -413,11 +431,10 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_compute_cache_rotation
}
}

size_t head_size = m_cache_rotation_calculator->get_head_size();
// necessary since we move from these members during previous steps
m_current_step_rotation_coefficients.clear();
m_current_step_rotated_block_indices_per_sequence.clear();
m_current_step_rotated_block_indices_per_sequence.resize(m_num_decoder_layers);
m_current_step_rotation_deltas.clear();

std::vector<size_t> num_blocks_to_rotate_for_each_layer(m_num_decoder_layers, 0);

Expand All @@ -426,7 +443,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_compute_cache_rotation
size_t seq_id = seq_id_and_evicted_blocks.first;
// Skip sequences that, in the meanwhile before previous step's forward execution and now,
// have left the cache (e.g. finished or were preempted)
if (live_sequences.find(seq_id) == live_sequences.end()) {
if (live_seq_ids_to_num_occupied_blocks.find(seq_id) == live_seq_ids_to_num_occupied_blocks.end()) {
continue;
}

Expand All @@ -438,43 +455,31 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_compute_cache_rotation
}
size_t num_blocks_before_eviction = m_previous_num_blocks_before_eviction_per_sequence[seq_id];
auto rotation_multipliers =
m_cache_rotation_calculator->get_rotation_coefficients(logical_blocks_to_evict[layer_idx],
m_cache_rotation_calculator->get_rotation_data(logical_blocks_to_evict[layer_idx],
num_blocks_before_eviction);
for (size_t i = 0; i < rotation_multipliers.size(); i++) {
const auto& block_rotation_data = rotation_multipliers[i];
const auto& rotation_multipliers_cos = block_rotation_data.cosines;
const auto& rotation_multipliers_sin = block_rotation_data.sines;
OPENVINO_ASSERT(rotation_multipliers_cos.size() == rotation_multipliers_sin.size());
OPENVINO_ASSERT(rotation_multipliers_cos.size() == m_scheduler->get_block_size());

m_current_step_rotated_block_indices_per_sequence[layer_idx][seq_id].push_back(
block_rotation_data.logical_block_idx);

// Fill the store tensor with rotation coefficient data - cos and sin coefficients are each contiguous,
// cos goes first
size_t block_offset =
num_blocks_to_rotate_for_each_layer[layer_idx] * m_scheduler->get_block_size() * head_size;
auto rotation_multipliers_tensor_data =
m_rotation_coefficient_stores[layer_idx].data<float>() + block_offset;
for (size_t tok_idx = 0; tok_idx < rotation_multipliers_cos.size(); tok_idx++) {
size_t position_offset = head_size * tok_idx;
for (size_t embedding_pair_idx = 0; embedding_pair_idx < head_size / 2; embedding_pair_idx++) {
rotation_multipliers_tensor_data[position_offset + embedding_pair_idx] =
rotation_multipliers_cos[tok_idx][embedding_pair_idx];
rotation_multipliers_tensor_data[position_offset + embedding_pair_idx + head_size / 2] =
rotation_multipliers_sin[tok_idx][embedding_pair_idx];
}
num_blocks_to_rotate_for_each_layer[layer_idx] * m_scheduler->get_block_size();
auto rotation_deltas_tensor_data =
m_rotation_deltas_stores[layer_idx].data<int32_t>() + block_offset;
for (size_t tok_idx = 0; tok_idx < m_scheduler->get_block_size(); tok_idx++) {
rotation_deltas_tensor_data[tok_idx] = block_rotation_data.rotation_delta;
}
num_blocks_to_rotate_for_each_layer[layer_idx] += 1;
}
}
}
// Select the previously filled rotation coefficients from the store tensor
for (size_t i = 0; i < m_num_decoder_layers; i++) {
m_current_step_rotation_coefficients.emplace_back(
m_rotation_coefficient_stores[i],
m_current_step_rotation_deltas.emplace_back(
m_rotation_deltas_stores[i],
ov::Coordinate{0},
ov::Coordinate{num_blocks_to_rotate_for_each_layer[i] * m_scheduler->get_block_size() * head_size});
ov::Coordinate{num_blocks_to_rotate_for_each_layer[i] * m_scheduler->get_block_size()});
}
}

Expand Down
8 changes: 3 additions & 5 deletions src/cpp/src/continuous_batching_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,16 +32,14 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc

size_t m_num_decoder_layers = 0;

// Pre-allocated per-layer storages for the per-token cache re-rotation coefficients used in cache eviction case
std::vector<ov::Tensor> m_rotation_coefficient_stores;
// Pre-allocated per-layer storages for the per-token cache re-rotation deltas used in cache eviction case
std::vector<ov::Tensor> m_rotation_deltas_stores;

std::map<size_t, std::vector<std::set<size_t>>> m_previous_evicted_block_logical_indices_per_sequence;
std::map<size_t, size_t> m_previous_num_blocks_before_eviction_per_sequence;

// Per-layer ROI tensors, reusing storage from the pre-allocated tensors above, that actually represent the
// re-rotation coefficients to be sent to the proper model inputs at the *next* pipeline step.
std::vector<ov::Tensor> m_current_step_rotation_coefficients;
std::vector<std::map<size_t, std::vector<size_t>>> m_current_step_rotated_block_indices_per_sequence;
std::vector<ov::Tensor> m_current_step_rotation_deltas;

std::shared_ptr<ov::genai::CacheRotationCalculator> m_cache_rotation_calculator;

Expand Down
31 changes: 18 additions & 13 deletions src/cpp/src/model_runner.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,10 @@ class ModelRunner {
size_t m_num_decoder_layers, m_block_size;
bool m_collect_attention_scores;
bool m_is_use_per_layer_cache_control;
std::vector<ov::Tensor> m_cache_rotation_coefficients;

std::vector<std::map<size_t, std::vector<size_t>>> m_rotated_block_logical_indices_per_sequence_for_each_layer;
std::vector<ov::Tensor> m_cache_rotation_deltas_for_each_layer;
ov::Tensor m_cache_rotation_trig_lut;

public:
/**
Expand Down Expand Up @@ -76,12 +78,16 @@ class ModelRunner {
return m_last_attention_scores;
}

void set_cache_rotation_data(std::vector<ov::Tensor>&& cache_rotation_coefficients_for_each_layer,
const std::vector<std::map<size_t, std::vector<size_t>>>&&
rotated_logical_block_indices_per_sequence_for_each_layer) {
m_cache_rotation_coefficients = std::move(cache_rotation_coefficients_for_each_layer);
// Stores the precomputed trig LUT tensor built at pipeline init (each row:
// cos values in the first half, sin values in the second half — see the
// packing loop in ContinuousBatchingImpl::init). Bound to the
// "rotation_trig_lut" model input whenever rotation deltas are present for a
// step. Takes ownership of the tensor via move.
void set_cache_rotation_trig_lut(ov::Tensor&& rotation_trig_lut) {
m_cache_rotation_trig_lut = std::move(rotation_trig_lut);
}

// Supplies the per-step cache-rotation inputs for the eviction case:
// - per layer, a map of seq_id -> logical block indices (post-eviction) whose
//   contents must be rotated (fed to the "rotated_block_indices.N" inputs);
// - per layer, an i32 tensor of per-token position deltas for those blocks
//   (fed to the "rotation_deltas.N" inputs).
// Both arguments are taken by rvalue reference and moved into members; the
// non-empty deltas vector is what gates rotation input setup on the next run.
void set_cache_rotation_data(std::vector<std::map<size_t, std::vector<size_t>>>&&
rotated_logical_block_indices_per_sequence_for_each_layer,
std::vector<ov::Tensor>&& rotation_deltas_for_each_layer) {
m_rotated_block_logical_indices_per_sequence_for_each_layer =
std::move(rotated_logical_block_indices_per_sequence_for_each_layer);
m_cache_rotation_deltas_for_each_layer = std::move(rotation_deltas_for_each_layer);
}

/**
Expand Down Expand Up @@ -184,8 +190,9 @@ class ModelRunner {

_set_block_indices(sequence_groups, scheduler_output, total_num_blocks);

if (!m_cache_rotation_coefficients.empty()) {
if (!m_cache_rotation_deltas_for_each_layer.empty()) {
_set_cache_rotation_coefficients(sequence_groups, scheduler_output);
m_request.set_tensor("rotation_trig_lut", m_cache_rotation_trig_lut);
}

m_request.set_tensor("block_indices_begins", block_indices_begins);
Expand All @@ -211,8 +218,6 @@ class ModelRunner {
_collect_attention_scores(sequence_groups, scheduler_output);
}

m_cache_rotation_coefficients.clear();

// return logits
return m_request.get_tensor("logits");
}
Expand Down Expand Up @@ -307,11 +312,6 @@ class ModelRunner {

void _set_cache_rotation_coefficients(const std::vector<SequenceGroup::Ptr>& sequence_groups,
const Scheduler::Output& scheduler_output) {
for (size_t i = 0; i < m_num_decoder_layers; i++) {
auto tensor_name = std::string("rotation_coefficients.") + std::to_string(i);
m_request.set_tensor(tensor_name, m_cache_rotation_coefficients[i]);
}

std::vector<std::string> rotation_indices_tensor_names(m_num_decoder_layers);
for (size_t i = 0; i < m_num_decoder_layers; i++) {
auto tensor_name = std::string("rotated_block_indices.") + std::to_string(i);
Expand All @@ -324,6 +324,11 @@ class ModelRunner {
rotated_block_indices_tensor.set_shape({num_indices});
}

for (size_t i = 0; i < m_num_decoder_layers; i++) {
auto tensor_name = std::string("rotation_deltas.") + std::to_string(i);
m_request.set_tensor(tensor_name, m_cache_rotation_deltas_for_each_layer[i]);
}


// NB: the order of per-sequence index filling in the function below must be the same
// as the order of `seq_id`s in which the "rotation_coefficients.N" inputs are filled
Expand Down
Loading

0 comments on commit 533585a

Please sign in to comment.