NPUW: Optimize KV-cache copy for npuw::LLMInferRequest (#28340)
TolyaTalamanov authored Jan 14, 2025
1 parent 57025dc commit e390175
Showing 4 changed files with 102 additions and 3 deletions.
2 changes: 1 addition & 1 deletion src/plugins/intel_npu/src/plugin/CMakeLists.txt
@@ -53,7 +53,7 @@ cross_compiled_file(${TARGET_NAME}
ARCH AVX2 ANY
npuw/util_xarch.cpp
API npuw/util_xarch.hpp
NAME unpack_i4i8 unpack_u4i8 unpack_i4f16 unpack_i4f16_scale unpack_i4f16_z unpack_u4f16 unpack_u4f16_scale_zp unpack_u4f16_asymm_zp unpack_u4f16_z unpack_u4f32 unpack_i8f16 unpack_i8f16_scale unpack_u8f16 to_f16
NAME unpack_i4i8 unpack_u4i8 unpack_i4f16 unpack_i4f16_scale unpack_i4f16_z unpack_u4f16 unpack_u4f16_scale_zp unpack_u4f16_asymm_zp unpack_u4f16_z unpack_u4f32 unpack_i8f16 unpack_i8f16_scale unpack_u8f16 to_f16 copy_row_as_column
NAMESPACE ov::npuw::util::XARCH
)
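The only build-system change: copy_row_as_column is appended to the NAME list, so the cross_compiled_file machinery compiles util_xarch.cpp for each listed architecture (AVX2 plus a generic ANY fallback) and dispatches to the best available variant at runtime.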

53 changes: 52 additions & 1 deletion src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
@@ -9,6 +9,7 @@
#include "llm_compiled_model.hpp"
#include "logging.hpp"
#include "openvino/runtime/iasync_infer_request.hpp"
#include "util_xarch.hpp"

namespace {
template <typename T>
@@ -28,7 +29,49 @@ ov::SoPtr<ov::ITensor> make_tensor_slice(ov::SoPtr<ov::ITensor> tensor,
return ov::get_tensor_impl(ov::Tensor(ov::make_tensor(tensor), start_shape, end_shape));
}

void copy_by_planes(ov::SoPtr<ov::ITensor> src_tensor, ov::SoPtr<ov::ITensor> dst_tensor) {
// [1, H, S1, E] -> [1, H, S2, E]
const int N = 0;
const int H = 1;
const int S = 2;
const int E = 3;

OPENVINO_ASSERT(src_tensor->get_shape()[N] == dst_tensor->get_shape()[N]);
OPENVINO_ASSERT(src_tensor->get_shape()[H] == dst_tensor->get_shape()[H]);
OPENVINO_ASSERT(src_tensor->get_shape()[E] == dst_tensor->get_shape()[E]);
OPENVINO_ASSERT(src_tensor->get_element_type() == dst_tensor->get_element_type());
OPENVINO_ASSERT(src_tensor->get_shape()[N] == 1u);
OPENVINO_ASSERT(src_tensor->get_shape().size() == 4u);

const auto* src_tensor_data = reinterpret_cast<uint8_t*>(src_tensor->data());
auto* dst_tensor_data = reinterpret_cast<uint8_t*>(dst_tensor->data());

const auto num_planes = src_tensor->get_shape()[H];
const auto src_plane_stride = src_tensor->get_strides()[H];
const auto dst_plane_stride = dst_tensor->get_strides()[H];
const auto plane_size_in_bytes = src_tensor->get_strides()[S] * src_tensor->get_shape()[S];

for (size_t i = 0; i < num_planes; ++i) {
std::copy_n(src_tensor_data, plane_size_in_bytes, dst_tensor_data);
dst_tensor_data += dst_plane_stride;
src_tensor_data += src_plane_stride;
}
}
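For context, a minimal usage sketch of copy_by_planes (shapes are hypothetical, chosen for illustration): a prefill output of shape [1, H, S1, E] is copied into a strided view over the first S1 sequence positions of a larger [1, H, S2, E] cache, one contiguous plane per head.

// Hypothetical usage sketch: H = 8 heads, E = 128, prefill S1 = 64, cache S2 = 1024.
ov::Tensor prefill(ov::element::f16, {1, 8, 64, 128});   // [1, H, S1, E]
ov::Tensor cache(ov::element::f16, {1, 8, 1024, 128});   // [1, H, S2, E]
// ROI view over the first 64 sequence positions; the view keeps the parent's strides.
ov::Tensor cache_slice(cache, {0, 0, 0, 0}, {1, 8, 64, 128});
copy_by_planes(ov::get_tensor_impl(prefill), ov::get_tensor_impl(cache_slice));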

void copy_columns_by_row_chunks(ov::SoPtr<ov::ITensor> src, ov::SoPtr<ov::ITensor>& dst) {
/*
src/dst layout: [1, heads, emb_size, seq_len]
X[*,i] - embedding for i-th token,
Instead of copying columns, copy whole rows X[i,*]
[[X00 X01 ... X0n] [[X00 X01 ... X0n]
[X10 X11 ... X1n] [X10 X11 ... X1n]
[X20 X21 ... X2n] ... [X20 X21 ... X2n]
... ...
[Xm0 Xm1 ... Xmn]] [Xm0 Xm1 ... Xmn]]
*/

const auto src_shape = src->get_shape();

OPENVINO_ASSERT(src_shape.size() == 4u);
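The remainder of the function body is collapsed in this diff view. As a hedged reconstruction of the idea the comment above describes (not the commit's exact code), assuming a dense source tensor: in the [1, heads, emb_size, seq_len] layout each row X[i,*] is contiguous in memory, so the copy runs as emb_size bulk row copies per head instead of seq_len strided column writes.

// Sketch only; assumes a dense src, so src_strides[2] equals one full row in bytes.
const auto src_strides = src->get_strides();
const auto dst_strides = dst->get_strides();
const auto* src_data = reinterpret_cast<uint8_t*>(src->data());
auto* dst_data = reinterpret_cast<uint8_t*>(dst->data());
const auto heads = src_shape[1];
const auto emb_size = src_shape[2];
const auto row_bytes = src_strides[2];  // seq_len * element_size for a dense src
for (size_t h = 0; h < heads; ++h) {
    for (size_t e = 0; e < emb_size; ++e) {
        std::copy_n(src_data + h * src_strides[1] + e * src_strides[2],
                    row_bytes,
                    dst_data + h * dst_strides[1] + e * dst_strides[2]);
    }
}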
@@ -157,6 +200,8 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,

if (kv_dim == 3u) {
copy_columns_by_row_chunks(prefill_out_slice, kvcache_in_slice);
} else if (kv_dim == 2u) {
copy_by_planes(prefill_out_slice, kvcache_in_slice);
} else {
prefill_out_slice->copy_to(kvcache_in_slice._ptr);
}
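Here kv_dim identifies the sequence axis of the cache tensor: kv_dim == 3 is the [1, heads, emb_size, seq_len] layout served by copy_columns_by_row_chunks, kv_dim == 2 is the [1, heads, seq_len, emb_size] layout served by copy_by_planes, and any other layout falls back to the generic element-wise copy_to.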
@@ -199,7 +244,13 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
kvcache_desc.num_stored_tokens - 1,
kvcache_desc.num_stored_tokens);
auto kvcache_out_tensor = m_kvcache_request->get_tensor(m_kvcache_out_ports.at(output_name));
kvcache_out_tensor->copy_to(kvcache_in_slice._ptr);
if (kv_dim == 3u) {
ov::npuw::util::XARCH::copy_row_as_column(kvcache_out_tensor, kvcache_in_slice);
} else if (kv_dim == 2u) {
copy_by_planes(kvcache_out_tensor, kvcache_in_slice);
} else {
kvcache_out_tensor->copy_to(kvcache_in_slice._ptr);
}
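The same three-way dispatch now covers the per-token copy on the decode path, which previously always used the generic copy_to (the removed line above). For kv_dim == 3 the freshly generated token's K/V values form one strided column per head in the cache, which is exactly the pattern the new AVX2 copy_row_as_column routine below accelerates.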
}
LOG_DEBUG("Done");
}
48 changes: 47 additions & 1 deletion src/plugins/intel_npu/src/plugin/npuw/util_xarch.cpp
@@ -1,4 +1,4 @@
// Copyright (C) 2024 Intel Corporation
// Copyright (C) 2024-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

@@ -1427,3 +1427,49 @@ ov::Tensor ov::npuw::util::XARCH::to_f16(const ov::Tensor& t) {
#endif
return tnew;
}

void ov::npuw::util::XARCH::copy_row_as_column(const ov::SoPtr<ov::ITensor>& from, const ov::SoPtr<ov::ITensor>& to) {
#if defined(HAVE_AVX2)
constexpr uint32_t BLOCK_SIZE = sizeof(__m256i) / sizeof(uint16_t);

OPENVINO_ASSERT(from->get_element_type() == ov::element::f16);
OPENVINO_ASSERT(from->is_continuous());
OPENVINO_ASSERT(from->get_size() % BLOCK_SIZE == 0);
OPENVINO_ASSERT(from->get_shape().size() == 4u);
OPENVINO_ASSERT(from->get_shape()[0] == 1u);
OPENVINO_ASSERT(to->get_element_type() == ov::element::f16);
OPENVINO_ASSERT(to->get_shape().size() == 4u);
OPENVINO_ASSERT(to->get_shape()[0] == 1u);
OPENVINO_ASSERT(from->get_shape()[1] == to->get_shape()[1]);
OPENVINO_ASSERT(from->get_shape()[2] == to->get_shape()[2]);

const auto* pSrc = reinterpret_cast<uint16_t*>(from->data());
auto* pDst = reinterpret_cast<uint16_t*>(to->data());

const auto row_step = to->get_strides()[2] / sizeof(uint16_t);
for (size_t k = 0; k < from->get_size(); k += BLOCK_SIZE) {
const uint16_t* pSrcBlock = pSrc + k;
__m256i vsrc = _mm256_lddqu_si256(reinterpret_cast<const __m256i*>(pSrcBlock));
// NB: Scatter each 16-bit element of the block down the destination column
pDst[0 * row_step] = _mm256_extract_epi16(vsrc, 0);
pDst[1 * row_step] = _mm256_extract_epi16(vsrc, 1);
pDst[2 * row_step] = _mm256_extract_epi16(vsrc, 2);
pDst[3 * row_step] = _mm256_extract_epi16(vsrc, 3);
pDst[4 * row_step] = _mm256_extract_epi16(vsrc, 4);
pDst[5 * row_step] = _mm256_extract_epi16(vsrc, 5);
pDst[6 * row_step] = _mm256_extract_epi16(vsrc, 6);
pDst[7 * row_step] = _mm256_extract_epi16(vsrc, 7);
pDst[8 * row_step] = _mm256_extract_epi16(vsrc, 8);
pDst[9 * row_step] = _mm256_extract_epi16(vsrc, 9);
pDst[10 * row_step] = _mm256_extract_epi16(vsrc, 10);
pDst[11 * row_step] = _mm256_extract_epi16(vsrc, 11);
pDst[12 * row_step] = _mm256_extract_epi16(vsrc, 12);
pDst[13 * row_step] = _mm256_extract_epi16(vsrc, 13);
pDst[14 * row_step] = _mm256_extract_epi16(vsrc, 14);
pDst[15 * row_step] = _mm256_extract_epi16(vsrc, 15);
pDst += BLOCK_SIZE * row_step;
}
#else
from->copy_to(to._ptr);
#endif
}
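Functionally the AVX2 loop is a strided scatter; an illustrative scalar equivalent (not part of the commit) shows the access pattern: source element k lands k rows down the destination column.

// Scalar equivalent of the AVX2 block loop above, for illustration only.
for (size_t k = 0; k < from->get_size(); ++k) {
    pDst[k * row_step] = pSrc[k];
}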
2 changes: 2 additions & 0 deletions src/plugins/intel_npu/src/plugin/npuw/util_xarch.hpp
@@ -82,6 +82,8 @@ void unpack_u8f16(const ov::SoPtr<ov::ITensor>& from,

ov::Tensor to_f16(const ov::Tensor& t);

void copy_row_as_column(const ov::SoPtr<ov::ITensor>& from, const ov::SoPtr<ov::ITensor>& to);

} // namespace XARCH
} // namespace util
} // namespace npuw
