diff --git a/src/plugins/intel_cpu/src/nodes/act_sparse_fc.cpp b/src/plugins/intel_cpu/src/nodes/act_sparse_fc.cpp
index 40b4dc3c02d6f6..646e517185b8a4 100644
--- a/src/plugins/intel_cpu/src/nodes/act_sparse_fc.cpp
+++ b/src/plugins/intel_cpu/src/nodes/act_sparse_fc.cpp
@@ -2,11 +2,10 @@
 // SPDX-License-Identifier: Apache-2.0
 //
-#include "act_sparse_fc.h"
-
 #include <memory>
 #include <vector>
 
+#include "act_sparse_fc.h"
 #include "common/arbitrary_order_desc_creator.h"
 #include "common/bfloat16.hpp"
 #include "common/cpu_memcpy.h"
@@ -74,7 +73,8 @@ struct ActSparseFCKey {
     }
 
     bool operator==(const ActSparseFCKey& rhs) const {
-        return is_quantized == rhs.is_quantized && is_int4 == rhs.is_int4 && with_zero_point == rhs.with_zero_point && ic_q_group_size == rhs.ic_q_group_size;
+        return is_quantized == rhs.is_quantized && is_int4 == rhs.is_int4 && with_zero_point == rhs.with_zero_point &&
+               ic_q_group_size == rhs.ic_q_group_size;
     }
 };
 
@@ -82,15 +82,16 @@ void ActSparseFC::createPrimitive() {
     ActSparseFCKey key;
     key.is_quantized = m_config.is_quantized;
     key.is_int4 = m_config.is_int4;
-    key.with_zero_point  = m_config.with_zero_point;
+    key.with_zero_point = m_config.with_zero_point;
     key.ic_q_group_size = m_config.ic_q_group_size;
 
-    auto buildExecutor = [](const ActSparseFCKey& key) -> std::shared_ptr<ActSparseFcKernel> {
+    auto buildExecutor = [this](const ActSparseFCKey& key) -> std::shared_ptr<ActSparseFcKernel> {
 #if defined(OPENVINO_ARCH_X86_64)
-        return std::make_shared<ActSparseFcKernel>(key.is_quantized,
-                                                   key.is_int4,
-                                                   key.with_zero_point,
-                                                   key.ic_q_group_size);
+        return std::make_shared<ActSparseFcKernel>(context->getScratchPad(),
+                                                   key.is_quantized,
+                                                   key.is_int4,
+                                                   key.with_zero_point,
+                                                   key.ic_q_group_size);
 #else
         return nullptr;
 #endif
@@ -153,8 +154,7 @@ void ActSparseFC::createPrimitive() {
         // [OC, IC/group_size, 1] => [IC/group_size, OC]
         auto raw_scales_mem = getSrcMemoryAtPort(2);
         ArbitraryOrderDescCreator descCreator({2, 1, 0});
-        auto dst_mem_desc =
-            descCreator.createSharedDesc(raw_scales_mem->getPrecision(), raw_scales_mem->getShape());
+        auto dst_mem_desc = descCreator.createSharedDesc(raw_scales_mem->getPrecision(), raw_scales_mem->getShape());
         auto scales_mem = std::make_shared<Memory>(engine, dst_mem_desc);
 
         node::Reorder::reorderData(*raw_scales_mem, *scales_mem, context->getParamsCache());
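The last hunk above reorders the quantization scales so the kernel can walk them along OC. In scalar form, applying the {2, 1, 0} order to a [OC, G, 1] tensor (G = IC / group_size) is just the transpose below; this is an illustrative reference, not code from the patch:

    // Reference semantics of the scales reorder: src is [OC, G] row-major
    // (the trailing 1-dim dropped), dst becomes [G, OC] so that the scales
    // of one IC-group are contiguous across output channels.
    static void reorder_scales_ref(const float* src, float* dst, int OC, int G) {
        for (int oc = 0; oc < OC; ++oc)
            for (int g = 0; g < G; ++g)
                dst[g * OC + oc] = src[oc * G + g];
    }
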
diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/act_sparse_fc_kernel.cpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/act_sparse_fc_kernel.cpp
index 42d7b2332d59ed..334905372e17eb 100644
--- a/src/plugins/intel_cpu/src/nodes/kernels/x64/act_sparse_fc_kernel.cpp
+++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/act_sparse_fc_kernel.cpp
@@ -1,15 +1,15 @@
 // Copyright (C) 2018-2024 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
-#include "act_sparse_fc_kernel.hpp"
-
 #include <memory>
 
+#include "act_sparse_fc_kernel.hpp"
+#include "memory_desc/cpu_blocked_memory_desc.h"
 #include "openvino/core/except.hpp"
 
 #if defined(OPENVINO_ARCH_X86_64)
-#include "openvino/core/parallel.hpp"
+#    include "openvino/core/parallel.hpp"
 
 namespace ov {
 namespace intel_cpu {
@@ -161,31 +161,6 @@ static std::shared_ptr<SIMDJit> jit_compile_gemmRegBlk(int rows, int cols, int p
     return jit;
 }
 
-static void gemm6x2_Mx2(const float* pA,
-                        int64_t A_stride,
-                        const float* pB,
-                        int64_t B_stride,
-                        const float* pC,
-                        int64_t C_stride,
-                        int M,
-                        int64_t bK,
-                        int64_t is_accumulate_C) {
-    static std::shared_ptr<SIMDJit> gemm6x2[6] = {
-        jit_compile_gemmRegBlk(6, 2),
-        jit_compile_gemmRegBlk(1, 2),
-        jit_compile_gemmRegBlk(2, 2),
-        jit_compile_gemmRegBlk(3, 2),
-        jit_compile_gemmRegBlk(4, 2),
-        jit_compile_gemmRegBlk(5, 2),
-    };
-    int m;
-    for (m = 0; m + 6 <= M; m += 6, pA += 6 * A_stride, pC += 6 * C_stride) {
-        (*gemm6x2[0])(pA, A_stride, pB, B_stride, pC, C_stride, bK, is_accumulate_C);
-    }
-    if (m < M)
-        (*gemm6x2[M - m])(pA, A_stride, pB, B_stride, pC, C_stride, bK, is_accumulate_C);
-}
-
 static std::shared_ptr<SIMDJit> jit_compile_accumulate_weight_i4(bool with_zero_point) {
     auto jit = std::make_shared<SIMDJit>(__func__);
     auto simd_width = SIMDJit::vmm_width();
@@ -415,26 +390,6 @@ static std::shared_ptr<SIMDJit> jit_compile_reduce_outputs() {
     return jit;
 }
 
-/*
-dst0 : [N, OC]
-src0 : [num_copies, N, OC]
-*/
-static inline void reduce_outputs(float* dst0, float* src0, int num_copies, int64_t OC) {
-    static auto jit_reduce = jit_compile_reduce_outputs();
-    int64_t simd_width = SIMDJit::vmm_width();
-
-    ov::parallel_nt(0, [&](const int ithr, const int nthr) {
-        int64_t oc0, oc1;
-        ov::splitter(OC / simd_width, nthr, ithr, oc0, oc1);
-        oc0 *= simd_width;
-        oc1 *= simd_width;
-        if (oc1 > OC)
-            oc1 = OC;
-
-        (*jit_reduce)(dst0 + oc0, src0 + oc0, num_copies, oc1 - oc0, OC);
-    });
-}
-
 static std::shared_ptr<SIMDJit> jit_compile_repack_3xsimdw_1xsimdw(bool with_zp) {
     auto jit = std::make_shared<SIMDJit>(__func__);
     auto simd_width = SIMDJit::vmm_width();
@@ -627,46 +582,8 @@ static std::shared_ptr<SIMDJit> jit_compile_repack_2xsimdw(WeightCompressionType
 template <typename T>
 T* ActSparseFcKernel::scratch_alloc(size_t cnt) {
-#    if defined(__GNUC__) || defined(__clang__)
-    thread_local uint8_t scratch[1024 * 1024 * 2] __attribute__((aligned(4096)));
-#    else
-    thread_local uint8_t scratch[1024 * 1024 * 2];
-#    endif
-    OPENVINO_ASSERT(cnt * sizeof(T) < sizeof(scratch));
-    // DEBUG_LOG(reinterpret_cast<void*>(scratch));
-    return reinterpret_cast<T*>(scratch);
-}
-
-void ActSparseFcKernel::MM_ComputeBounded_reuseA_f16(const float* A,
-                                                     float* C,
-                                                     const ov::float16* W,
-                                                     int M,
-                                                     int IC,
-                                                     int OC,
-                                                     int n0,
-                                                     int n1) {
-    static auto repack_2xsimdw = jit_compile_repack_2xsimdw(WeightCompressionType::FP16);
-    constexpr int BK = 54;
-    const auto SIMDW = SIMDJit::vmm_width();
-    float* scratch = scratch_alloc<float>(BK * (SIMDW * 2) + OC);
-
-    int K = IC;
-    int64_t A_stride = IC;
-    int64_t C_stride = OC;
-    int64_t W_stride = OC;
-
-    float* repacked_B = scratch;
-
-    for (int k = 0; k < K; k += BK, A += BK, W += BK * W_stride) {
-        int64_t bK = std::min(K - k, BK);
-        int64_t is_accumulate_C = (k > 0);
-
-        for (int n = n0; n + 2 * SIMDW <= n1; n += 2 * SIMDW) {
-            // prepack [BK, 16] into scratch
-            (*repack_2xsimdw)(W + n, W_stride, repacked_B, bK);
-            gemm6x2_Mx2(A, A_stride, repacked_B, 2 * SIMDW, C + n, C_stride, M, bK, is_accumulate_C);
-        }
-    }
+    OPENVINO_ASSERT(cnt * sizeof(T) < 2048 * 1024);
+    return reinterpret_cast<T*>(m_scratch_base + parallel_get_thread_num() * 2048 * 1024);
 }
 
 static std::shared_ptr<SIMDJit> get_decompress_zp_u8() {
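The rewritten scratch_alloc above replaces a 2 MiB thread_local static buffer with disjoint 2 MiB slices of one shared allocation (created lazily in operator(), further down in this patch). A minimal sketch of the slice arithmetic, with kSliceBytes standing in for the hard-coded 2048 * 1024:

    #include <cstddef>
    #include <cstdint>

    constexpr std::size_t kSliceBytes = 2048u * 1024u;  // 2 MiB per worker thread

    // Each worker thread indexes a disjoint window of the shared base buffer,
    // so concurrent scratch_alloc() calls never alias.
    inline std::uint8_t* thread_slice(std::uint8_t* base, int thread_id) {
        return base + static_cast<std::size_t>(thread_id) * kSliceBytes;
    }
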
@@ -722,6 +639,54 @@ static std::shared_ptr<SIMDJit> get_decompress_zp_u4() {
     return jit;
 }
 
+void ActSparseFcKernel::gemm6x2_Mx2(const float* pA,
+                                    int64_t A_stride,
+                                    const float* pB,
+                                    int64_t B_stride,
+                                    const float* pC,
+                                    int64_t C_stride,
+                                    int M,
+                                    int64_t bK,
+                                    int64_t is_accumulate_C) {
+    int m;
+    for (m = 0; m + 6 <= M; m += 6, pA += 6 * A_stride, pC += 6 * C_stride) {
+        (*gemm6x2[0])(pA, A_stride, pB, B_stride, pC, C_stride, bK, is_accumulate_C);
+    }
+    if (m < M)
+        (*gemm6x2[M - m])(pA, A_stride, pB, B_stride, pC, C_stride, bK, is_accumulate_C);
+}
+
+void ActSparseFcKernel::MM_ComputeBounded_reuseA_f16(const float* A,
+                                                     float* C,
+                                                     const ov::float16* W,
+                                                     int M,
+                                                     int IC,
+                                                     int OC,
+                                                     int n0,
+                                                     int n1) {
+    constexpr int BK = 54;
+    const auto SIMDW = SIMDJit::vmm_width();
+    float* scratch = scratch_alloc<float>(BK * (SIMDW * 2) + OC);
+
+    int K = IC;
+    int64_t A_stride = IC;
+    int64_t C_stride = OC;
+    int64_t W_stride = OC;
+
+    float* repacked_B = scratch;
+
+    for (int k = 0; k < K; k += BK, A += BK, W += BK * W_stride) {
+        int64_t bK = std::min(K - k, BK);
+        int64_t is_accumulate_C = (k > 0);
+
+        for (int n = n0; n + 2 * SIMDW <= n1; n += 2 * SIMDW) {
+            // prepack [BK, 16] into scratch
+            (*m_repack_2xsimdw_kernel)(W + n, W_stride, repacked_B, bK);
+            gemm6x2_Mx2(A, A_stride, repacked_B, 2 * SIMDW, C + n, C_stride, M, bK, is_accumulate_C);
+        }
+    }
+}
+
 void ActSparseFcKernel::MM_ComputeBounded_reuseA_i8(const float* A,
                                                     float* C,
                                                     const uint8_t* W,
@@ -732,11 +697,6 @@ void ActSparseFcKernel::MM_ComputeBounded_reuseA_i8(const float* A,
                                                     int OC,
                                                     int64_t n0,
                                                     int64_t n1) {
-    static auto decompress_zp_u8 = get_decompress_zp_u8();
-    static auto repack_2xsimdw_i8_zp = jit_compile_repack_2xsimdw(WeightCompressionType::INT8, true);
-    static auto repack_2xsimdw_i8_nozp = jit_compile_repack_2xsimdw(WeightCompressionType::INT8, false);
-
-    auto repack_2xsimdw_i8 = zp ? repack_2xsimdw_i8_zp : repack_2xsimdw_i8_nozp;
     constexpr int BK = 54;
     const auto SIMDW = SIMDJit::vmm_width();
     float* scratch = scratch_alloc<float>(BK * (SIMDW * 2) + OC);
@@ -751,7 +711,7 @@ void ActSparseFcKernel::MM_ComputeBounded_reuseA_i8(const float* A,
 
     // deocompress zero-point into scratch
     if (zp) {
-        int n = n0 + (*decompress_zp_u8)(zp + n0, zero_points, n1 - n0);
+        int n = n0 + (*m_decompzp_kernel)(zp + n0, zero_points, n1 - n0);
         for (; n < n1; n++) {
             zero_points[n - n0] = zp[n];
         }
@@ -763,7 +723,7 @@ void ActSparseFcKernel::MM_ComputeBounded_reuseA_i8(const float* A,
 
         for (int n = n0; n + 2 * SIMDW <= n1; n += 2 * SIMDW) {
             // prepack [BK, 16] into scratch
-            (*repack_2xsimdw_i8)(W + n, W_stride, repacked_B, bK, scales + n, zero_points + n - n0);
+            (*m_repack_2xsimdw_kernel)(W + n, W_stride, repacked_B, bK, scales + n, zero_points + n - n0);
             gemm6x2_Mx2(A, A_stride, repacked_B, 2 * SIMDW, C + n, C_stride, M, bK, is_accumulate_C);
         }
     }
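gemm6x2_Mx2 (now a member so it can reuse the precompiled gemm6x2 kernels) processes full 6-row blocks with gemm6x2[0] and finishes the M % 6 remainder with gemm6x2[M - m]; slots 1 through 5 hold the 1..5-row variants. A scalar reference for what one register-blocked micro-tile computes (illustration only, not the JIT code):

    #include <cstdint>

    // C[r][c] over a rows x cols tile accumulates sum_k A[r][k] * B[k][c];
    // when acc == false the tile is written instead of accumulated.
    static void gemm_tile_ref(const float* A, std::int64_t lda,
                              const float* B, std::int64_t ldb,
                              float* C, std::int64_t ldc,
                              int rows, int cols, int K, bool acc) {
        for (int r = 0; r < rows; ++r) {
            for (int c = 0; c < cols; ++c) {
                float sum = acc ? C[r * ldc + c] : 0.0f;
                for (int k = 0; k < K; ++k)
                    sum += A[r * lda + k] * B[k * ldb + c];
                C[r * ldc + c] = sum;
            }
        }
    }
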
@@ -779,24 +739,6 @@ void ActSparseFcKernel::MM_ComputeBounded_reuseB_i8(const float* A,
                                                     int OC,
                                                     int n0,
                                                     int n1) {
-    static auto decompress_zp_u8 = get_decompress_zp_u8();
-    static std::shared_ptr<SIMDJit> gemm4x3[6] = {
-        jit_compile_gemmRegBlk(4, 3),
-        jit_compile_gemmRegBlk(1, 3),
-        jit_compile_gemmRegBlk(2, 3),
-        jit_compile_gemmRegBlk(3, 3),
-    };
-    static std::shared_ptr<SIMDJit> gemm4x1[6] = {
-        jit_compile_gemmRegBlk(4, 1),
-        jit_compile_gemmRegBlk(1, 1),
-        jit_compile_gemmRegBlk(2, 1),
-        jit_compile_gemmRegBlk(3, 1),
-    };
-    static auto repack_3xsimdw_i8_1xsimdw_nozp = jit_compile_repack_3xsimdw_1xsimdw(false);
-    static auto repack_3xsimdw_i8_1xsimdw_withzp = jit_compile_repack_3xsimdw_1xsimdw(true);
-    auto* repack_3xsimdw_i8 = zp ? repack_3xsimdw_i8_1xsimdw_withzp.get() : repack_3xsimdw_i8_1xsimdw_nozp.get();
-
     const auto SIMDW = SIMDJit::vmm_width();
     constexpr int BK = 512;
     constexpr int BN = 512;
@@ -815,12 +757,19 @@ void ActSparseFcKernel::MM_ComputeBounded_reuseB_i8(const float* A,
         const auto* pW = W + cur_n;
 
         if (zp) {
-            (*decompress_zp_u8)(zp + cur_n, zero_points, bN);
+            (*m_decompzp_kernel)(zp + cur_n, zero_points, bN);
         }
 
         for (int k0 = 0; k0 < IC; k0 += BK, pW += BK * B_stride) {
             int64_t bK = std::min(IC - k0, BK);
-            (*repack_3xsimdw_i8)(pW, B_stride, scales + cur_n, zero_points, bK, bN, repacked_B_n24, repacked_B_n8);
+            (*m_repack_3xsimdw_i8_kernel)(pW,
+                                          B_stride,
+                                          scales + cur_n,
+                                          zero_points,
+                                          bK,
+                                          bN,
+                                          repacked_B_n24,
+                                          repacked_B_n8);
 
             bool is_accumulate_C = (k0 > 0);
             auto* pC = C + cur_n;
@@ -861,11 +810,6 @@ void ActSparseFcKernel::MM_ComputeBounded_reuseA_i4(const float* A,
                                                     int n0,
                                                     int n1,
                                                     int icgs) {
-    static auto decompress_zp_u4 = get_decompress_zp_u4();
-    static auto repack_2xsimdw_nozp = jit_compile_repack_2xsimdw(WeightCompressionType::INT4, false);
-    static auto repack_2xsimdw_withzp = jit_compile_repack_2xsimdw(WeightCompressionType::INT4, true);
-    auto* repack_2xsimdw = zp ? repack_2xsimdw_withzp.get() : repack_2xsimdw_nozp.get();
-
     int BK = icgs;
     const auto SIMDW = SIMDJit::vmm_width();
     float* scratch = scratch_alloc<float>(BK * (SIMDW * 2) + OC);
@@ -886,7 +830,7 @@ void ActSparseFcKernel::MM_ComputeBounded_reuseA_i4(const float* A,
     // deocompress zero-point into scratch buffer
     if (zp) {
         const auto* pzp = zp + n0 / 2;
-        int n = n0 + (*decompress_zp_u4)(pzp, zero_points, n1 - n0);
+        int n = n0 + (*m_decompzp_kernel)(pzp, zero_points, n1 - n0);
         for (; n < n1; n += 2, pzp++) {
             zero_points[n - n0] = (*pzp) & 0xF;
             zero_points[n - n0 + 1] = (*pzp) >> 4;
@@ -896,7 +840,7 @@ void ActSparseFcKernel::MM_ComputeBounded_reuseA_i4(const float* A,
 
         for (int n = n0; n + 2 * SIMDW <= n1; n += 2 * SIMDW) {
             // prepack subB [BK, 16] into scratch
             // because BK is fully contained within IC-group (BK == n*IC_group_size), it can share same zp & scales
-            (*repack_2xsimdw)(W + n / 2, W_stride, repacked_B, bK, scales + n, zero_points + n - n0);
+            (*m_repack_2xsimdw_kernel)(W + n / 2, W_stride, repacked_B, bK, scales + n, zero_points + n - n0);
             gemm6x2_Mx2(A, A_stride, repacked_B, 2 * SIMDW, C + n, C_stride, M, bK, is_accumulate_C);
         }
     }
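In the INT4 path each zero-point byte packs two 4-bit values, low nibble first; the JIT kernel covers the SIMD-width multiple of the range and the scalar loop above mops up the tail. A scalar equivalent of the whole decompression, assuming an even element count as in the kernel:

    #include <cstdint>

    // Unpack n 4-bit zero-points (two per byte, low nibble first) to float.
    static void decompress_zp_u4_ref(const std::uint8_t* src, float* dst, int n) {
        for (int i = 0; i + 1 < n; i += 2) {
            dst[i] = static_cast<float>(src[i / 2] & 0xF);
            dst[i + 1] = static_cast<float>(src[i / 2] >> 4);
        }
    }
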
@@ -939,39 +883,68 @@ void ActSparseFcKernel::repack_weights_i4(uint8_t* src, uint8_t* dst, int IC, int OC) {
     });
 }
 
-ActSparseFcKernel::ActSparseFcKernel(bool is_quantized, bool is_int4, bool with_zero_points, int ic_group_size)
+/*
+dst0 : [N, OC]
+src0 : [num_copies, N, OC]
+*/
+void ActSparseFcKernel::reduce_outputs(float* dst0, float* src0, int num_copies, int64_t OC) {
+    int64_t simd_width = SIMDJit::vmm_width();
+    ov::parallel_nt(0, [&](const int ithr, const int nthr) {
+        int64_t oc0, oc1;
+        ov::splitter(OC / simd_width, nthr, ithr, oc0, oc1);
+        oc0 *= simd_width;
+        oc1 *= simd_width;
+        if (oc1 > OC)
+            oc1 = OC;
+
+        (*m_reduce_outputs_kernel)(dst0 + oc0, src0 + oc0, num_copies, oc1 - oc0, OC);
+    });
+}
+
+ActSparseFcKernel::ActSparseFcKernel(DnnlScratchPadPtr scratch_pad,
+                                     bool is_quantized,
+                                     bool is_int4,
+                                     bool with_zero_points,
+                                     int ic_group_size)
     : m_is_quantized(is_quantized),
       m_is_int4(is_int4),
       m_with_zp(with_zero_points),
-      m_ic_group_size(ic_group_size) {
-    static auto decompress_zp_u8 = get_decompress_zp_u8();
-    static auto decompress_zp_u4 = get_decompress_zp_u4();
+      m_ic_group_size(ic_group_size),
+      m_scratch_pad(scratch_pad) {
+    gemm6x2[0] = jit_compile_gemmRegBlk(6, 2);
+    gemm6x2[1] = jit_compile_gemmRegBlk(1, 2);
+    gemm6x2[2] = jit_compile_gemmRegBlk(2, 2);
+    gemm6x2[3] = jit_compile_gemmRegBlk(3, 2);
+    gemm6x2[4] = jit_compile_gemmRegBlk(4, 2);
+    gemm6x2[5] = jit_compile_gemmRegBlk(5, 2);
+
+    gemm4x3[0] = jit_compile_gemmRegBlk(4, 3);
+    gemm4x3[1] = jit_compile_gemmRegBlk(1, 3);
+    gemm4x3[2] = jit_compile_gemmRegBlk(2, 3);
+    gemm4x3[3] = jit_compile_gemmRegBlk(3, 3);
+
+    gemm4x1[0] = jit_compile_gemmRegBlk(4, 1);
+    gemm4x1[1] = jit_compile_gemmRegBlk(1, 1);
+    gemm4x1[2] = jit_compile_gemmRegBlk(2, 1);
+    gemm4x1[3] = jit_compile_gemmRegBlk(3, 1);
+
+    m_reduce_outputs_kernel = jit_compile_reduce_outputs();
 
-    static auto accumulate_weight_fp16 = jit_compile_accumulate_weight(WeightCompressionType::FP16);
-    static auto accumulate_weight_i8_nozp = jit_compile_accumulate_weight(WeightCompressionType::INT8, false);
-    static auto accumulate_weight_i8_withzp = jit_compile_accumulate_weight(WeightCompressionType::INT8, true);
-    static auto accumulate_weight_i4_nozp = jit_compile_accumulate_weight_i4(false);
-    static auto accumulate_weight_i4_withzp = jit_compile_accumulate_weight_i4(true);
-
-    WeightCompressionType wtype;
     if (m_is_quantized)
-        wtype = (m_is_int4 ? WeightCompressionType::INT4 : WeightCompressionType::INT8);
-    else
-        wtype = WeightCompressionType::FP16;
-
-    switch (wtype) {
-    case WeightCompressionType::FP16:
-        m_accumulate_kernel = accumulate_weight_fp16.get();
+        if (m_is_int4) {
+            m_accumulate_kernel = jit_compile_accumulate_weight_i4(m_with_zp);
+            m_decompzp_kernel = get_decompress_zp_u4();
+            m_repack_2xsimdw_kernel = jit_compile_repack_2xsimdw(WeightCompressionType::INT4, m_with_zp);
+        } else {
+            m_accumulate_kernel = jit_compile_accumulate_weight(WeightCompressionType::INT8, m_with_zp);
+            m_decompzp_kernel = get_decompress_zp_u8();
+            m_repack_3xsimdw_i8_kernel = jit_compile_repack_3xsimdw_1xsimdw(m_with_zp);
+            m_repack_2xsimdw_kernel = jit_compile_repack_2xsimdw(WeightCompressionType::INT8, m_with_zp);
+        }
+    else {
+        m_accumulate_kernel = jit_compile_accumulate_weight(WeightCompressionType::FP16);
+        m_repack_2xsimdw_kernel = jit_compile_repack_2xsimdw(WeightCompressionType::FP16);
         m_decompzp_kernel = nullptr;
-        break;
-    case WeightCompressionType::INT8:
-        m_accumulate_kernel = m_with_zp ? accumulate_weight_i8_withzp.get() : accumulate_weight_i8_nozp.get();
-        m_decompzp_kernel = decompress_zp_u8.get();
-        break;
-    case WeightCompressionType::INT4:
-        m_accumulate_kernel = m_with_zp ? accumulate_weight_i4_withzp.get() : accumulate_weight_i4_nozp.get();
-        m_decompzp_kernel = decompress_zp_u4.get();
-        break;
     }
 }
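Turning the function-local static kernels into members (above) means each executor compiles its JIT kernels once at construction and owns their lifetime, instead of sharing process-wide statics. For reference, reduce_outputs sums the per-thread output copies column-wise; a scalar sketch for the N == 1 case, assuming the JIT kernel overwrites dst with the sum:

    #include <cstdint>

    // dst[oc] = sum over num_copies of src[copy][oc], for oc in [0, OC).
    static void reduce_outputs_ref(float* dst, const float* src, int num_copies, std::int64_t OC) {
        for (std::int64_t oc = 0; oc < OC; ++oc) {
            float sum = 0.0f;
            for (int c = 0; c < num_copies; ++c)
                sum += src[c * OC + oc];
            dst[oc] = sum;
        }
    }
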
@@ -988,6 +961,13 @@ void ActSparseFcKernel::operator()(const float* input,
     const auto SIMDW = SIMDJit::vmm_width();
     OPENVINO_ASSERT((OC % (2 * SIMDW)) == 0, "ActSparseFcKernel: OC is not multiple of ", 2 * SIMDW);
 
+    if (!m_scratch_mem) {
+        auto scratch_size = static_cast<size_t>(parallel_get_max_threads() * 2048 * 1024);
+        auto newMemDesc = std::make_shared<CpuBlockedMemoryDesc>(ov::element::u8, Shape{scratch_size});
+        m_scratch_mem = m_scratch_pad->createScratchPadMem(newMemDesc);
+    }
+    m_scratch_base = m_scratch_mem->getDataAs<uint8_t>();
+
     WeightCompressionType wtype;
     if (m_is_quantized)
         wtype = (m_is_int4 ? WeightCompressionType::INT4 : WeightCompressionType::INT8);
diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/act_sparse_fc_kernel.hpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/act_sparse_fc_kernel.hpp
index 781ef5e57b0be1..fbe53dfaec1d47 100644
--- a/src/plugins/intel_cpu/src/nodes/kernels/x64/act_sparse_fc_kernel.hpp
+++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/act_sparse_fc_kernel.hpp
@@ -7,6 +7,7 @@
 #include <memory>
 #include <vector>
 
+#include "dnnl_scratch_pad.h"
 #include "openvino/core/type/float16.hpp"
 #include "simd_jit.hpp"
 
@@ -16,7 +17,11 @@ namespace intel_cpu {
 class ActSparseFcKernel {
 public:
     // compile time parameters
-    ActSparseFcKernel(bool is_quantized, bool is_int4, bool with_zero_points, int ic_group_size);
+    ActSparseFcKernel(DnnlScratchPadPtr scratch_pad,
+                      bool is_quantized,
+                      bool is_int4,
+                      bool with_zero_points,
+                      int ic_group_size);
 
     void operator()(const float* input,
                     float* output,
@@ -32,6 +37,7 @@ class ActSparseFcKernel {
     void repack_weights_i4(uint8_t* src, uint8_t* dst, int IC, int OC);
 
 private:
+    void reduce_outputs(float* dst0, float* src0, int num_copies, int64_t OC);
     void MM_ComputeBounded_reuseA_f16(const float* A, float* C, const ov::float16* W, int M, int IC, int OC, int n0, int n1);
     void MM_ComputeBounded_reuseA_i8(const float* A,
                                      float* C,
@@ -67,13 +73,27 @@ class ActSparseFcKernel {
                                      int n0,
                                      int n1,
                                      int icgs);
-
+    void gemm6x2_Mx2(const float* pA,
+                     int64_t A_stride,
+                     const float* pB,
+                     int64_t B_stride,
+                     const float* pC,
+                     int64_t C_stride,
+                     int M,
+                     int64_t bK,
+                     int64_t is_accumulate_C);
     template <typename T>
     T* scratch_alloc(size_t cnt);
 
-private:
-    SIMDJit* m_decompzp_kernel;
-    SIMDJit* m_accumulate_kernel;
+    std::shared_ptr<SIMDJit> gemm6x2[6];
+    std::shared_ptr<SIMDJit> gemm4x3[4];
+    std::shared_ptr<SIMDJit> gemm4x1[4];
+    std::shared_ptr<SIMDJit> m_decompzp_kernel;
+    std::shared_ptr<SIMDJit> m_accumulate_kernel;
+    std::shared_ptr<SIMDJit> m_reduce_outputs_kernel;
+    std::shared_ptr<SIMDJit> m_repack_2xsimdw_kernel;
+    std::shared_ptr<SIMDJit> m_repack_3xsimdw_i8_kernel;
+
     const bool m_is_quantized;
     const bool m_is_int4;
     const bool m_with_zp;
@@ -83,6 +103,10 @@ class ActSparseFcKernel {
     std::vector<float> m_nonzero_val;
    std::vector<float> m_output_temp;
     int m_nonzero_cnt;
+
+    DnnlScratchPadPtr m_scratch_pad;
+    MemoryPtr m_scratch_mem;
+    uint8_t* m_scratch_base = nullptr;
 };
 
 }  // namespace intel_cpu
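The simd_jit.hpp hunks below are mostly formatting churn, but the reflowed alloc_reg64 is worth reading: it returns a shared_ptr whose custom deleter hands the register slot back to the pool, so scratch registers free themselves when the last reference dies. A minimal self-contained illustration of that pattern (not the SIMDJit API itself):

    #include <memory>
    #include <vector>

    struct Pool {
        std::vector<int> status;  // 0 = free, 1 = in use
        explicit Pool(int n) : status(n, 0) {}
        int allocate() {
            for (int i = 0; i < static_cast<int>(status.size()); ++i)
                if (!status[i]) { status[i] = 1; return i; }
            return -1;  // pool exhausted
        }
        void free(int i) { status[i] = 0; }
    };

    // The returned handle releases its slot on destruction, RAII-style.
    inline std::shared_ptr<int> alloc_slot(Pool& pool) {
        int idx = pool.allocate();
        if (idx < 0)
            return nullptr;
        return std::shared_ptr<int>(new int(idx), [&pool](int* p) {
            pool.free(*p);
            delete p;
        });
    }
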
diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/simd_jit.hpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/simd_jit.hpp
index 3004867f18e02e..00deb9e0af81f7 100644
--- a/src/plugins/intel_cpu/src/nodes/kernels/x64/simd_jit.hpp
+++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/simd_jit.hpp
@@ -139,7 +139,7 @@ class VReg {
 
 class reg_pool {
 public:
-    reg_pool(const char * name, size_t max_num_regs) : m_name(name), m_status(max_num_regs, 0) {}
+    reg_pool(const char* name, size_t max_num_regs) : m_name(name), m_status(max_num_regs, 0) {}
 
     int allocate(int index = -1) {
         // allocate register with specific index
@@ -164,7 +164,7 @@ class reg_pool {
     }
 
 private:
-    const char * m_name;
+    const char* m_name;
     std::vector<int> m_status;
 };
 
@@ -522,7 +522,10 @@ class SIMDJit : public jit_generator {
         return nullptr;
     }
 
-    SIMDJit(const char* name) : jit_generator(name), sreg_pool("sreg_pool", sizeof(abi_x86_64_regs) / sizeof(abi_x86_64_regs[0])), vreg_pool("vreg_pool", use_avx512 ? 32 : 16) {
+    SIMDJit(const char* name)
+        : jit_generator(name),
+          sreg_pool("sreg_pool", sizeof(abi_x86_64_regs) / sizeof(abi_x86_64_regs[0])),
+          vreg_pool("vreg_pool", use_avx512 ? 32 : 16) {
         mov(rax, rsp);
         preamble();
     }
@@ -747,12 +750,13 @@ class SIMDJit : public jit_generator {
     std::shared_ptr<Xbyak::Reg64> alloc_reg64(int index) {
         auto reg_index = sreg_pool.allocate(index);
-        return std::shared_ptr<Xbyak::Reg64>(new Xbyak::Reg64(abi_x86_64_regs[reg_index]), [this, reg_index](Xbyak::Reg64* preg) {
-            if (preg) {
-                sreg_pool.free(reg_index);
-                delete preg;
-            }
-        });
+        return std::shared_ptr<Xbyak::Reg64>(new Xbyak::Reg64(abi_x86_64_regs[reg_index]),
+                                             [this, reg_index](Xbyak::Reg64* preg) {
+                                                 if (preg) {
+                                                     sreg_pool.free(reg_index);
+                                                     delete preg;
+                                                 }
+                                             });
     }
 
     std::shared_ptr<Xbyak::Xmm> alloc_vreg(int index) {