From 5c5c129c5e02cf0b8dc6e9709e6c713216380432 Mon Sep 17 00:00:00 2001 From: dmitrygo Date: Fri, 3 Jan 2025 12:51:40 +0000 Subject: [PATCH 1/2] [CPU][ARM] Fixed cvt_copy fast path for mha_single_token_kernel --- .../kernels/scaled_attn/mha_single_token.cpp | 84 ++++++++++++------- .../subgraph_tests/src/arm/concat_sdp.cpp | 7 ++ .../subgraph_tests/src/common/concat_sdp.cpp | 7 ++ .../subgraph_tests/src/x64/concat_sdp.cpp | 7 ++ 4 files changed, 73 insertions(+), 32 deletions(-) diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp index 8f56da752b6f5c..3fb98960228dc8 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp @@ -62,31 +62,50 @@ void cvt_copy(TA* dst, TB* src, size_t n) { auto vb = mm256_uni_loadu_ps(src + i); mm256_uni_storeu_ps(dst + i, vb); } -#elif defined(OPENVINO_ARCH_ARM64) -# if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - if (std::is_same::value && std::is_same::value) { -# if defined(HAVE_SVE) - size_t inc = vec_len_f16_sve(); - svbool_t pg = svptrue_b16(); - - while (i < n) { - if (n - i < vec_len_f16_sve()) { - inc = n - i; - pg = svwhilelt_b16(0, static_cast(inc)); - } - svfloat16_t b1 = svld1_f16(pg, reinterpret_cast(src + i)); - svst1_f16(pg, reinterpret_cast(dst + i), b1); - i += inc; - } -# else - for (; i + vec_len_f16_neon <= n; i += vec_len_f16_neon) { - auto vb1 = vld1q_f16(reinterpret_cast(src + i)); - vst1q_f16(reinterpret_cast(dst + i), vb1); +#endif + for (; i < n; i++) { + dst[i] = src[i]; + } +} + +#if defined(OPENVINO_ARCH_ARM64) +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#if defined(HAVE_SVE) +template <> +void cvt_copy(ov::float16* dst, ov::float16* src, size_t n) { + size_t i = 0; + size_t inc = vec_len_f16_sve(); + svbool_t pg = svptrue_b16(); + + while (i < n) { + if (n - i < vec_len_f16_sve()) { + inc = n - i; + pg = svwhilelt_b16(0, static_cast(inc)); } -# endif + svfloat16_t b1 = svld1_f16(pg, reinterpret_cast(src + i)); + svst1_f16(pg, reinterpret_cast(dst + i), b1); + i += inc; } -# else -# if defined(HAVE_SVE) +} +#else // NEON +template <> +void cvt_copy(ov::float16* dst, ov::float16* src, size_t n) { + size_t i = 0; + for (; i + vec_len_f16_neon <= n; i += vec_len_f16_neon) { + auto vb1 = vld1q_f16(reinterpret_cast(src + i)); + vst1q_f16(reinterpret_cast(dst + i), vb1); + } + for (; i < n; i++) { + dst[i] = src[i]; + } +} +#endif // defined(HAVE_SVE) +#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + +#if defined(HAVE_SVE) +template <> +void cvt_copy(float* dst, float* src, size_t n) { + size_t i = 0; auto _dst = reinterpret_cast(dst); size_t inc = vec_len_f32_sve(); svbool_t pg = svptrue_b32(); @@ -100,20 +119,21 @@ void cvt_copy(TA* dst, TB* src, size_t n) { svst1_f32(pg, _dst + i, b1); i += inc; } -# else - if (std::is_same::value && std::is_same::value) { - for (; i + vec_len_f32_neon <= n; i += vec_len_f32_neon) { - float32x4_t vb1 = __vld1q_f32(src + i); - __vst1q_f32(dst + i, vb1); - } +} +#else // NEON +template <> +void cvt_copy(float* dst, float* src, size_t n) { + size_t i = 0; + for (; i + vec_len_f32_neon <= n; i += vec_len_f32_neon) { + float32x4_t vb1 = __vld1q_f32(src + i); + __vst1q_f32(dst + i, vb1); } -# endif -# endif -#endif for (; i < n; i++) { dst[i] = src[i]; } } +#endif // defined(HAVE_SVE) +#endif // defined(OPENVINO_ARCH_ARM64) template static void attn_acc_value(float* out, float weight, T* v, size_t S, float* scale, float* zp) { diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/arm/concat_sdp.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/arm/concat_sdp.cpp index f049a16a7640fc..2cd237f3786fe8 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/arm/concat_sdp.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/arm/concat_sdp.cpp @@ -31,6 +31,13 @@ const std::vector> inputShapes = { // B, H, L0, S {{-1, 8, -1, 64}, {{4, 8, 0, 64}, {4, 8, 10, 64}, {4, 8, 11, 64}, {4, 8, 12, 64}, {4, 8, 13, 64}}}, }, + // big batch to check cvt_copy fast-path inside mha_single_token_kernel + { + // B, H, L1, S + {{-1, 8, -1, 64}, {{129, 8, 10, 64}, {129, 8, 1, 64}, {129, 8, 1, 64}, {129, 8, 1, 64}, {129, 8, 1, 64}}}, + // B, H, L0, S + {{-1, 8, -1, 64}, {{129, 8, 0, 64}, {129, 8, 10, 64}, {129, 8, 11, 64}, {129, 8, 12, 64}, {129, 8, 13, 64}}}, + }, }; INSTANTIATE_TEST_SUITE_P(smoke_ConcatSDPTest, diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_sdp.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_sdp.cpp index 6761acf8b5dfb1..c40aea778f1a7f 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_sdp.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_sdp.cpp @@ -31,6 +31,13 @@ const std::vector> inputShapes = { // B, H, L0, S {{-1, 8, -1, 64}, {{4, 8, 0, 64}, {4, 8, 10, 64}, {4, 8, 11, 64}, {4, 8, 12, 64}, {4, 8, 13, 64}}}, }, + // big batch to check cvt_copy fast-path inside mha_single_token_kernel + { + // B, H, L1, S + {{-1, 8, -1, 64}, {{129, 8, 10, 64}, {129, 8, 1, 64}, {129, 8, 1, 64}, {129, 8, 1, 64}, {129, 8, 1, 64}}}, + // B, H, L0, S + {{-1, 8, -1, 64}, {{129, 8, 0, 64}, {129, 8, 10, 64}, {129, 8, 11, 64}, {129, 8, 12, 64}, {129, 8, 13, 64}}}, + }, }; INSTANTIATE_TEST_SUITE_P(smoke_ConcatSDPTest, diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/concat_sdp.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/concat_sdp.cpp index 29667e2ffa3072..0000a8f341f879 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/concat_sdp.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/concat_sdp.cpp @@ -31,6 +31,13 @@ const std::vector> inputShapes = { // B, H, L0, S {{-1, 8, -1, 64}, {{4, 8, 0, 64}, {4, 8, 10, 64}, {4, 8, 11, 64}, {4, 8, 12, 64}, {4, 8, 13, 64}}}, }, + // big batch to check cvt_copy fast-path inside mha_single_token_kernel + { + // B, H, L1, S + {{-1, 8, -1, 64}, {{129, 8, 10, 64}, {129, 8, 1, 64}, {129, 8, 1, 64}, {129, 8, 1, 64}, {129, 8, 1, 64}}}, + // B, H, L0, S + {{-1, 8, -1, 64}, {{129, 8, 0, 64}, {129, 8, 10, 64}, {129, 8, 11, 64}, {129, 8, 12, 64}, {129, 8, 13, 64}}}, + }, }; INSTANTIATE_TEST_SUITE_P(smoke_ConcatSDPTest, From c60cd0cdeab609c3cd2f375fd8c66f3e022f27d5 Mon Sep 17 00:00:00 2001 From: dmitrygo Date: Fri, 3 Jan 2025 13:33:42 +0000 Subject: [PATCH 2/2] fixed codestyle --- .../kernels/scaled_attn/mha_single_token.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp index 3fb98960228dc8..f42f15ce1e065a 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp @@ -69,8 +69,8 @@ void cvt_copy(TA* dst, TB* src, size_t n) { } #if defined(OPENVINO_ARCH_ARM64) -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) -#if defined(HAVE_SVE) +# if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +# if defined(HAVE_SVE) template <> void cvt_copy(ov::float16* dst, ov::float16* src, size_t n) { size_t i = 0; @@ -87,7 +87,7 @@ void cvt_copy(ov::float16* dst, ov::float16* src, size_t n) { i += inc; } } -#else // NEON +# else // NEON template <> void cvt_copy(ov::float16* dst, ov::float16* src, size_t n) { size_t i = 0; @@ -99,10 +99,10 @@ void cvt_copy(ov::float16* dst, ov::float16* src, size_t n) { dst[i] = src[i]; } } -#endif // defined(HAVE_SVE) -#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +# endif // defined(HAVE_SVE) +# endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) -#if defined(HAVE_SVE) +# if defined(HAVE_SVE) template <> void cvt_copy(float* dst, float* src, size_t n) { size_t i = 0; @@ -120,7 +120,7 @@ void cvt_copy(float* dst, float* src, size_t n) { i += inc; } } -#else // NEON +# else // NEON template <> void cvt_copy(float* dst, float* src, size_t n) { size_t i = 0; @@ -132,8 +132,8 @@ void cvt_copy(float* dst, float* src, size_t n) { dst[i] = src[i]; } } -#endif // defined(HAVE_SVE) -#endif // defined(OPENVINO_ARCH_ARM64) +# endif // defined(HAVE_SVE) +#endif // defined(OPENVINO_ARCH_ARM64) template static void attn_acc_value(float* out, float weight, T* v, size_t S, float* scale, float* zp) {