forked from openvinotoolkit/openvino
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[CPU] PagedAttention supports dynamic-split fuse (openvinotoolkit#24107)
### Details: - *Merge first token and second token inference into one parallel loop* - *~~Additional optimization: pre-transpose k-cache, pre-pack v-cache if needed~~* - *Additional optimization for first token: save q * k' upper triangle matrix computation and (q * k') * v lower triangle matrix computation* - *C++ pipeline can enable it: ilya-lavrenov/openvino.genai#9* - *TODO (in another PR):* - alibi support - performance tuning - test cases ### Tickets: - *[138673](https://jira.devtools.intel.com/browse/CVS-138673)*
- Loading branch information
1 parent
6950460
commit 04a7ecf
Showing
24 changed files
with
2,815 additions
and
1,348 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
56 changes: 56 additions & 0 deletions
56
src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant_kernel.hpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
// Copyright (C) 2018-2024 Intel Corporation | ||
// SPDX-License-Identifier: Apache-2.0 | ||
// | ||
#pragma once | ||
|
||
#include <array> | ||
#include <cstddef> | ||
#include <cstdint> | ||
#include <vector> | ||
#include "openvino/core/type/element_type.hpp" | ||
#include "utils/plain_tensor.hpp" | ||
|
||
namespace ov { | ||
namespace Extensions { | ||
namespace Cpu { | ||
namespace XARCH { | ||
|
||
/// Dequantizes `n` unsigned 8-bit values from `src` into `dst`.
///
/// Every output element is computed as `(float(src[i]) - zp) * scale` and then
/// stored to `dst[i]`. When HAVE_AVX512F or HAVE_AVX2 is defined the bulk of the
/// range is processed with SIMD; any remaining elements use the scalar loop.
///
/// @tparam TDST  destination element type; the scalar path requires it to be
///               assignable from float. NOTE(review): the SIMD paths rely on the
///               mm512_uni_storeu_ps / mm256_uni_storeu_ps helpers and the
///               vec_len_f32_* constants, which are defined outside this block.
/// @param src    quantized input; elements [0, n) are read
/// @param dst    output buffer; elements [0, n) are written
/// @param n      number of elements to convert
/// @param scale  factor applied after the zero point has been subtracted
/// @param zp     zero point subtracted from each input value
template<typename TDST>
void attn_dequant_u8_kernel(const uint8_t* src, TDST* dst, size_t n, float scale, float zp) {
    size_t i = 0;
#if defined(HAVE_AVX512F)
    auto v_zp = _mm512_set1_ps(zp);
    auto v_scale = _mm512_set1_ps(scale);
    for (; i + vec_len_f32_avx512 <= n; i += vec_len_f32_avx512) {
        // _mm_loadu_si128 takes `__m128i const*`, so the const qualifier of src is kept.
        auto v0_128 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i));
        // Zero-extend the u8 lanes to 32-bit integers, then convert to float.
        auto v0_512 = _mm512_cvtepu8_epi32(v0_128);
        auto v0_value = _mm512_cvtepi32_ps(v0_512);
        v0_value = _mm512_sub_ps(v0_value, v_zp);
        auto v0_out = _mm512_mul_ps(v0_value, v_scale);
        mm512_uni_storeu_ps(dst + i, v0_out);
    }
#elif defined(HAVE_AVX2)
    auto v_zp = _mm256_set1_ps(zp);
    auto v_scale = _mm256_set1_ps(scale);
    for (; i + vec_len_f32_avx2 <= n; i += vec_len_f32_avx2) {
        // _mm_loadl_epi64 reads only the low 64 bits and also takes `__m128i const*`.
        auto v0_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src + i));
        // Zero-extend the u8 lanes to 32-bit integers, then convert to float.
        auto v0_256 = _mm256_cvtepu8_epi32(v0_128);
        auto v0_value = _mm256_cvtepi32_ps(v0_256);
        v0_value = _mm256_sub_ps(v0_value, v_zp);
        auto v0_out = _mm256_mul_ps(v0_value, v_scale);
        mm256_uni_storeu_ps(dst + i, v0_out);
    }
#endif
    // Scalar path: handles the tail left by the SIMD loops, or the whole range
    // when no SIMD path is compiled in.
    for (; i < n; ++i) {
        float tmp = src[i];
        tmp = (tmp - zp) * scale;
        dst[i] = tmp;
    }
}
|
||
} // namespace XARCH | ||
} // namespace Cpu | ||
} // namespace Extensions | ||
} // namespace ov |
Oops, something went wrong.