diff --git a/src/plugins/intel_npu/src/plugin/CMakeLists.txt b/src/plugins/intel_npu/src/plugin/CMakeLists.txt
index d54be2f984fdc2..94526d1a20f081 100644
--- a/src/plugins/intel_npu/src/plugin/CMakeLists.txt
+++ b/src/plugins/intel_npu/src/plugin/CMakeLists.txt
@@ -53,7 +53,7 @@ cross_compiled_file(${TARGET_NAME}
     ARCH AVX2 ANY
         npuw/util_xarch.cpp
     API npuw/util_xarch.hpp
-    NAME unpack_i4i8 unpack_u4i8 unpack_i4f16 unpack_i4f16_scale unpack_i4f16_z unpack_u4f16 unpack_u4f16_scale_zp unpack_u4f16_asymm_zp unpack_u4f16_z unpack_u4f32 unpack_i8f16 unpack_i8f16_scale unpack_u8f16 to_f16
+    NAME unpack_i4i8 unpack_u4i8 unpack_i4f16 unpack_i4f16_scale unpack_i4f16_z unpack_u4f16 unpack_u4f16_scale_zp unpack_u4f16_asymm_zp unpack_u4f16_z unpack_u4f32 unpack_i8f16 unpack_i8f16_scale unpack_u8f16 to_f16 copy_row_as_column
     NAMESPACE ov::npuw::util::XARCH
 )
diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
index 0bad68a35aa4ba..2e987036483e34 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
@@ -9,6 +9,7 @@
 #include "llm_compiled_model.hpp"
 #include "logging.hpp"
 #include "openvino/runtime/iasync_infer_request.hpp"
+#include "util_xarch.hpp"
 
 namespace {
 template <typename T>
@@ -28,7 +29,49 @@ ov::SoPtr<ov::ITensor> make_tensor_slice(ov::SoPtr<ov::ITensor> tensor,
     return ov::get_tensor_impl(ov::Tensor(ov::make_tensor(tensor), start_shape, end_shape));
 }
 
+void copy_by_planes(ov::SoPtr<ov::ITensor> src_tensor, ov::SoPtr<ov::ITensor> dst_tensor) {
+    // [1, H, S1, E] -> [1, H, S2, E]
+    const int N = 0;
+    const int H = 1;
+    const int S = 2;
+    const int E = 3;
+
+    OPENVINO_ASSERT(src_tensor->get_shape()[N] == dst_tensor->get_shape()[N]);
+    OPENVINO_ASSERT(src_tensor->get_shape()[H] == dst_tensor->get_shape()[H]);
+    OPENVINO_ASSERT(src_tensor->get_shape()[E] == dst_tensor->get_shape()[E]);
+    OPENVINO_ASSERT(src_tensor->get_element_type() == dst_tensor->get_element_type());
+    OPENVINO_ASSERT(src_tensor->get_shape()[N] == 1u);
+    OPENVINO_ASSERT(src_tensor->get_shape().size() == 4u);
+
+    const auto* src_tensor_data = reinterpret_cast<uint8_t*>(src_tensor->data());
+    auto* dst_tensor_data = reinterpret_cast<uint8_t*>(dst_tensor->data());
+
+    const auto num_planes = src_tensor->get_shape()[H];
+    const auto src_plane_stride = src_tensor->get_strides()[H];
+    const auto dst_plane_stride = dst_tensor->get_strides()[H];
+    const auto plane_size_in_bytes = src_tensor->get_strides()[S] * src_tensor->get_shape()[S];
+
+    for (size_t i = 0; i < num_planes; ++i) {
+        std::copy_n(src_tensor_data, plane_size_in_bytes, dst_tensor_data);
+        dst_tensor_data += dst_plane_stride;
+        src_tensor_data += src_plane_stride;
+    }
+}
+
 void copy_columns_by_row_chunks(ov::SoPtr<ov::ITensor> src, ov::SoPtr<ov::ITensor>& dst) {
+    /*
+      src/dst layout: [1, heads, emb_size, seq_len]
+
+      X[*,i] - embedding for the i-th token.
+      Instead of copying columns, copy rows X[i,*]:
+
+      [[X00 X01 ... X0n]       [[X00 X01 ... X0n]
+       [X10 X11 ... X1n]        [X10 X11 ... X1n]
+       [X20 X21 ... X2n]  ...    [X20 X21 ... X2n]
+             ...                       ...
+       [Xm0 Xm1 ... Xmn]]       [Xm0 Xm1 ... Xmn]]
+    */
+
     const auto src_shape = src->get_shape();
 
     OPENVINO_ASSERT(src_shape.size() == 4u);
@@ -157,6 +200,8 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
 
         if (kv_dim == 3u) {
             copy_columns_by_row_chunks(prefill_out_slice, kvcache_in_slice);
+        } else if (kv_dim == 2u) {
+            copy_by_planes(prefill_out_slice, kvcache_in_slice);
         } else {
             prefill_out_slice->copy_to(kvcache_in_slice._ptr);
         }
@@ -199,7 +244,13 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
                                                  kvcache_desc.num_stored_tokens - 1,
                                                  kvcache_desc.num_stored_tokens);
         auto kvcache_out_tensor = m_kvcache_request->get_tensor(m_kvcache_out_ports.at(output_name));
-        kvcache_out_tensor->copy_to(kvcache_in_slice._ptr);
+        if (kv_dim == 3u) {
+            ov::npuw::util::XARCH::copy_row_as_column(kvcache_out_tensor, kvcache_in_slice);
+        } else if (kv_dim == 2u) {
+            copy_by_planes(kvcache_out_tensor, kvcache_in_slice);
+        } else {
+            kvcache_out_tensor->copy_to(kvcache_in_slice._ptr);
+        }
     }
     LOG_DEBUG("Done");
 }
diff --git a/src/plugins/intel_npu/src/plugin/npuw/util_xarch.cpp b/src/plugins/intel_npu/src/plugin/npuw/util_xarch.cpp
index 37c4770b9d9fa3..af6354126334fb 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/util_xarch.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/util_xarch.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2024 Intel Corporation
+// Copyright (C) 2024-2025 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -1427,3 +1427,49 @@ ov::Tensor ov::npuw::util::XARCH::to_f16(const ov::Tensor& t) {
 #endif
     return tnew;
 }
+
+void ov::npuw::util::XARCH::copy_row_as_column(const ov::SoPtr<ov::ITensor>& from, const ov::SoPtr<ov::ITensor>& to) {
+#if defined(HAVE_AVX2)
+    constexpr uint32_t BLOCK_SIZE = sizeof(__m256i) / sizeof(uint16_t);
+
+    OPENVINO_ASSERT(from->get_element_type() == ov::element::f16);
+    OPENVINO_ASSERT(from->is_continuous());
+    OPENVINO_ASSERT(from->get_size() % BLOCK_SIZE == 0);
+    OPENVINO_ASSERT(from->get_shape().size() == 4u);
+    OPENVINO_ASSERT(from->get_shape()[0] == 1u);
+    OPENVINO_ASSERT(to->get_element_type() == ov::element::f16);
+    OPENVINO_ASSERT(to->get_shape().size() == 4u);
+    OPENVINO_ASSERT(to->get_shape()[0] == 1u);
+    OPENVINO_ASSERT(from->get_shape()[1] == to->get_shape()[1]);
+    OPENVINO_ASSERT(from->get_shape()[2] == to->get_shape()[2]);
+
+    const auto* pSrc = reinterpret_cast<uint16_t*>(from->data());
+    auto* pDst = reinterpret_cast<uint16_t*>(to->data());
+
+    const auto row_step = to->get_strides()[2] / sizeof(uint16_t);
+    for (size_t k = 0; k < from->get_size(); k += BLOCK_SIZE) {
+        const uint16_t* pSrcBlock = pSrc + k;
+        __m256i vsrc = _mm256_lddqu_si256(reinterpret_cast<const __m256i*>(pSrcBlock));
+        // NB: Assign each f16 element of the block to its own row in the destination column
+        pDst[0 * row_step] = _mm256_extract_epi16(vsrc, 0);
+        pDst[1 * row_step] = _mm256_extract_epi16(vsrc, 1);
+        pDst[2 * row_step] = _mm256_extract_epi16(vsrc, 2);
+        pDst[3 * row_step] = _mm256_extract_epi16(vsrc, 3);
+        pDst[4 * row_step] = _mm256_extract_epi16(vsrc, 4);
+        pDst[5 * row_step] = _mm256_extract_epi16(vsrc, 5);
+        pDst[6 * row_step] = _mm256_extract_epi16(vsrc, 6);
+        pDst[7 * row_step] = _mm256_extract_epi16(vsrc, 7);
+        pDst[8 * row_step] = _mm256_extract_epi16(vsrc, 8);
+        pDst[9 * row_step] = _mm256_extract_epi16(vsrc, 9);
+        pDst[10 * row_step] = _mm256_extract_epi16(vsrc, 10);
+        pDst[11 * row_step] = _mm256_extract_epi16(vsrc, 11);
+        pDst[12 * row_step] = _mm256_extract_epi16(vsrc, 12);
+        pDst[13 * row_step] = _mm256_extract_epi16(vsrc, 13);
+        pDst[14 * row_step] = _mm256_extract_epi16(vsrc, 14);
+        pDst[15 * row_step] = _mm256_extract_epi16(vsrc, 15);
+        pDst += BLOCK_SIZE * row_step;
+    }
+#else
+    from->copy_to(to._ptr);
+#endif
+}
diff --git a/src/plugins/intel_npu/src/plugin/npuw/util_xarch.hpp b/src/plugins/intel_npu/src/plugin/npuw/util_xarch.hpp
index 0f0d9912f3b221..9148ba0106fa54 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/util_xarch.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/util_xarch.hpp
@@ -82,6 +82,8 @@ void unpack_u8f16(const ov::SoPtr<ov::ITensor>& from,
 
 ov::Tensor to_f16(const ov::Tensor& t);
 
+void copy_row_as_column(const ov::SoPtr<ov::ITensor>& from, const ov::SoPtr<ov::ITensor>& to);
+
 }  // namespace XARCH
 }  // namespace util
 }  // namespace npuw