From ef6a6c10e21ef1c1fa7dea11920659e4610b0060 Mon Sep 17 00:00:00 2001 From: "Li, Tingqian" Date: Sat, 28 Dec 2024 08:05:31 +0800 Subject: [PATCH] fix review comment --- .../intel_cpu/src/nodes/act_sparse_fc.cpp | 11 -- .../kernels/x64/act_sparse_fc_kernel.cpp | 112 +++++++++--------- .../src/nodes/kernels/x64/simd_jit.hpp | 104 ++++++++-------- .../intel_cpu/tests/unit/simd_jit_test.cpp | 30 ++--- 4 files changed, 121 insertions(+), 136 deletions(-) diff --git a/src/plugins/intel_cpu/src/nodes/act_sparse_fc.cpp b/src/plugins/intel_cpu/src/nodes/act_sparse_fc.cpp index b63217b11d629b..2df3d9cc810692 100644 --- a/src/plugins/intel_cpu/src/nodes/act_sparse_fc.cpp +++ b/src/plugins/intel_cpu/src/nodes/act_sparse_fc.cpp @@ -41,16 +41,6 @@ struct ActSparseFC::Executor : public ActSparseFC::ExecutorBase { MemoryPtr m_scales; ActSparseFCNode::Config& m_config; - void show(const char* name, uint8_t* src, int stride, int rows, int cols) { - printf("===== %s \n", name); - for (int r = 0; r < rows; r++, src += stride) { - for (int c = 0; c < cols; c++) { - printf("%02X,", src[c]); - } - printf("\n"); - } - } - Executor(ActSparseFC* pnode, DnnlScratchPadPtr scrachPad) : m_node(pnode), m_scrachPad(scrachPad), @@ -64,7 +54,6 @@ struct ActSparseFC::Executor : public ActSparseFC::ExecutorBase { const auto& context = m_node->context; const auto& engine = m_node->getEngine(); - std::cout << m_node->getName() << std::endl; auto create_weight = [&]() { auto raw_weight_mem = m_node->getSrcMemoryAtPort(1); MemoryPtr weight_mem; diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/act_sparse_fc_kernel.cpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/act_sparse_fc_kernel.cpp index f6257bfc5ca157..405795b34d51e2 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/x64/act_sparse_fc_kernel.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/act_sparse_fc_kernel.cpp @@ -5,6 +5,8 @@ #include +#include "openvino/core/except.hpp" + #if defined(OPENVINO_ARCH_X86_64) # include "openvino/core/parallel.hpp" @@ -34,14 +36,14 @@ static std::shared_ptr jit_compile_gemmRegBlk(int rows, int cols, int p }; // load all arguments into register - auto A_ptr = jit->get_sreg(0); - auto A_stride = jit->get_sreg(1); - auto B_ptr = jit->get_sreg(2); - auto B_stride = jit->get_sreg(3); - auto dst_ptr = jit->get_sreg(4); - auto dst_stride = jit->get_sreg(5); - auto K = jit->get_sreg(6); - auto accumulate = jit->get_sreg(7); + auto A_ptr = jit->get_arg(0); + auto A_stride = jit->get_arg(1); + auto B_ptr = jit->get_arg(2); + auto B_stride = jit->get_arg(3); + auto dst_ptr = jit->get_arg(4); + auto dst_stride = jit->get_arg(5); + auto K = jit->get_arg(6); + auto accumulate = jit->get_arg(7); auto stemp = jit->get_sreg(); @@ -95,7 +97,7 @@ static std::shared_ptr jit_compile_gemmRegBlk(int rows, int cols, int p jit->simd_broadcast_ss(vmmA(r), jit->ptr[A_ptr3 + 2 * A_stride]); break; default: - throw std::runtime_error("number of reg-blocking rows is not supported"); + OPENVINO_ASSERT(false, "number of reg-blocking rows is not supported"); } }; @@ -181,20 +183,20 @@ static void gemm6x2_Mx2(const float* pA, static std::shared_ptr jit_compile_accumulate_weight_i4(bool with_zero_point) { auto jit = std::make_shared(__func__); auto simd_width = SIMDJit::vmm_width(); - auto dst = jit->get_sreg(0); // float* - auto p_w0 = jit->get_sreg(1); // int4* - auto p_w1 = jit->get_sreg(2); // int4* - auto p_w2 = jit->get_sreg(3); // int4* - auto p_w3 = jit->get_sreg(4); // int4* - auto dense_x = jit->get_sreg(5); // float* - auto OC = jit->get_sreg(6); // float* - auto scales = jit->get_sreg(7); // float* - auto zero_points = jit->get_sreg(8); // float* + auto dst = jit->get_arg(0); // float* + auto p_w0 = jit->get_arg(1); // int4* + auto p_w1 = jit->get_arg(2); // int4* + auto p_w2 = jit->get_arg(3); // int4* + auto p_w3 = jit->get_arg(4); // int4* + auto dense_x = jit->get_arg(5); // float* + auto OC = jit->get_arg(6); // float* + auto scales = jit->get_arg(7); // float* + auto zero_points = jit->get_arg(8); // float* auto oc = jit->get_sreg(); auto vx = [&](int i) { - ASSERT(i < 4); + OPENVINO_ASSERT(i < 4); return jit->Vmm(i); }; @@ -278,14 +280,14 @@ static std::shared_ptr jit_compile_accumulate_weight(WeightCompressionT auto jit = std::make_shared(__func__); auto simd_width = SIMDJit::vmm_width(); // load all arguments into register - auto dst = jit->get_sreg(0); // float* - auto OC = jit->get_sreg(1); - auto gate_ids = jit->get_sreg(2); // int32_t * - auto gate_cnt = jit->get_sreg(3); // int - auto pw0 = jit->get_sreg(4); // ov::float16* / uint8_t* - auto dense_x = jit->get_sreg(5); // - auto scales = jit->get_sreg(6); // float* - auto zero_points = jit->get_sreg(7); // float* + auto dst = jit->get_arg(0); // float* + auto OC = jit->get_arg(1); + auto gate_ids = jit->get_arg(2); // int32_t * + auto gate_cnt = jit->get_arg(3); // int + auto pw0 = jit->get_arg(4); // ov::float16* / uint8_t* + auto dense_x = jit->get_arg(5); // + auto scales = jit->get_arg(6); // float* + auto zero_points = jit->get_arg(7); // float* auto g = jit->get_sreg(); auto i = jit->get_sreg(); @@ -383,11 +385,11 @@ static std::shared_ptr jit_compile_reduce_outputs() { auto jit = std::make_shared(__func__); auto simd_width = SIMDJit::vmm_width(); // load all arguments into register - auto dst0 = jit->get_sreg(0); // float* - auto src0 = jit->get_sreg(1); // float* - auto num_copies = jit->get_sreg(2); // int - auto OC = jit->get_sreg(3); // int - auto stride = jit->get_sreg(4); // int + auto dst0 = jit->get_arg(0); // float* + auto src0 = jit->get_arg(1); // float* + auto num_copies = jit->get_arg(2); // int + auto OC = jit->get_arg(3); // int + auto stride = jit->get_arg(4); // int auto i = jit->get_sreg(); auto k = jit->get_sreg(); @@ -434,14 +436,14 @@ static std::shared_ptr jit_compile_repack_3xsimdw_1xsimdw(bool with_zp) auto jit = std::make_shared(__func__); auto simd_width = SIMDJit::vmm_width(); // load all arguments into register - auto src = jit->get_sreg(0); // uint8_t* - auto strideW = jit->get_sreg(1); // int - auto scales = jit->get_sreg(2); // float* - auto zero_points = jit->get_sreg(3); // float* - auto K = jit->get_sreg(4); // int - auto N = jit->get_sreg(5); // int - auto repacked_B_nx3 = jit->get_sreg(6); // float* - auto repacked_B_nx1 = jit->get_sreg(7); // float* + auto src = jit->get_arg(0); // uint8_t* + auto strideW = jit->get_arg(1); // int + auto scales = jit->get_arg(2); // float* + auto zero_points = jit->get_arg(3); // float* + auto K = jit->get_arg(4); // int + auto N = jit->get_arg(5); // int + auto repacked_B_nx3 = jit->get_arg(6); // float* + auto repacked_B_nx1 = jit->get_arg(7); // float* auto k = jit->get_sreg(); auto n0 = jit->get_sreg(); @@ -531,12 +533,12 @@ static std::shared_ptr jit_compile_repack_2xsimdw(WeightCompressionType auto jit = std::make_shared(__func__); auto simd_width = SIMDJit::vmm_width(); // load all arguments into register - auto src = jit->get_sreg(0); // pointer to ov::float16/u8/i8/i4 - auto src_stride = jit->get_sreg(1); // in unit of f16 or bytes (int8/int4) - auto dst = jit->get_sreg(2); // float* - auto bK = jit->get_sreg(3); - auto scales = jit->get_sreg(4); // scales - auto zero_point = jit->get_sreg(5); // zero-point + auto src = jit->get_arg(0); // pointer to ov::float16/u8/i8/i4 + auto src_stride = jit->get_arg(1); // in unit of f16 or bytes (int8/int4) + auto dst = jit->get_arg(2); // float* + auto bK = jit->get_arg(3); + auto scales = jit->get_arg(4); // scales + auto zero_point = jit->get_arg(5); // zero-point auto k = jit->get_sreg(); @@ -627,7 +629,7 @@ T* ActSparseFcKernel::scratch_alloc(size_t cnt) { # else thread_local uint8_t scratch[1024 * 1024 * 2]; # endif - ASSERT(cnt * sizeof(T) < sizeof(scratch)); + OPENVINO_ASSERT(cnt * sizeof(T) < sizeof(scratch)); // DEBUG_LOG(reinterpret_cast(scratch)); return reinterpret_cast(scratch); } @@ -668,9 +670,9 @@ static std::shared_ptr get_decompress_zp_u8() { auto jit = std::make_shared(__func__); auto simd_width = SIMDJit::vmm_width(); - auto zp_input_u8 = jit->get_sreg(0); - auto zp_output_f32 = jit->get_sreg(1); - auto cnt = jit->get_sreg(2); + auto zp_input_u8 = jit->get_arg(0); + auto zp_output_f32 = jit->get_arg(1); + auto cnt = jit->get_arg(2); auto n = jit->get_sreg(); @@ -689,9 +691,9 @@ static std::shared_ptr get_decompress_zp_u4() { auto jit = std::make_shared(__func__); auto simd_width = SIMDJit::vmm_width(); - auto zp_input_u4 = jit->get_sreg(0); - auto zp_output_f32 = jit->get_sreg(1); - auto cnt = jit->get_sreg(2); + auto zp_input_u4 = jit->get_arg(0); + auto zp_output_f32 = jit->get_arg(1); + auto cnt = jit->get_arg(2); auto n = jit->get_sreg(); @@ -1011,9 +1013,7 @@ void ActSparseFcKernel::operator()(const float* input, const float* scales, const uint8_t* zp) { const auto SIMDW = SIMDJit::vmm_width(); - if (OC % (2 * SIMDW)) { - throw std::runtime_error(std::string("ActSparseFcKernel: OC is not multiple of ") + std::to_string(2 * SIMDW)); - } + OPENVINO_ASSERT((OC % (2 * SIMDW)) == 0, "ActSparseFcKernel: OC is not multiple of ", 2 * SIMDW); if (M > 1) { const auto SIMDW = SIMDJit::vmm_width(); diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/simd_jit.hpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/simd_jit.hpp index 549956b6ca0a05..6f24ce912592fb 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/x64/simd_jit.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/simd_jit.hpp @@ -15,20 +15,19 @@ # include "../include/jit.h" # define DECLARE_CPU_JIT_AUX_FUNCTIONS(x) static const bool use_avx512 = false; -#else -# include "cpu/x64/jit_generator.hpp" -using jit_generator = dnnl::impl::cpu::x64::jit_generator; -static const bool use_avx512 = dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core); -#endif - -#define ASSERT(cond) \ - if (!(cond)) { \ +#define OPENVINO_ASSERT(cond, ...) if (!(cond)) {\ std::stringstream ss; \ ss << "\033[31m" << __FILE__ << ":" << __LINE__ << " " << #cond << " failed! \033[0m"; \ std::cout << ss.str() << std::endl; \ asm("int3"); \ throw std::runtime_error(ss.str()); \ } +#else +#include "openvino/core/except.hpp" +#include "cpu/x64/jit_generator.hpp" +using jit_generator = dnnl::impl::cpu::x64::jit_generator; +static const bool use_avx512 = dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core); +#endif namespace ov { namespace intel_cpu { @@ -177,11 +176,11 @@ struct RegExprImpl { std::unique_ptr rhs; Xbyak::Reg64 as_r64() { - ASSERT(!is_op("i")); + OPENVINO_ASSERT(!is_op("i")); return Xbyak::Reg64(data); } int as_imm32() { - ASSERT(is_op("i")); + OPENVINO_ASSERT(is_op("i")); return data; } @@ -401,10 +400,10 @@ class SRegExpr { // convert to address operator Xbyak::RegExp() const { - ASSERT(paddr); + OPENVINO_ASSERT(paddr); if (paddr->base_reg < 0) { - ASSERT(paddr->index_reg >= 0); + OPENVINO_ASSERT(paddr->index_reg >= 0); return Xbyak::Reg64(paddr->index_reg) * paddr->scale + paddr->disp; } else if (paddr->index_reg >= 0) return Xbyak::Reg64(paddr->base_reg) + Xbyak::Reg64(paddr->index_reg) * paddr->scale + paddr->disp; @@ -473,17 +472,17 @@ inline SRegExpr operator!=(SRegExpr&& lhs, SRegExpr&& rhs) { return SRegExpr("!=", std::move(lhs), std::move(rhs)); } inline SRegExpr operator&&(SRegExpr&& lhs, SRegExpr&& rhs) { - ASSERT(lhs.pimpl->is_logical_op()); - ASSERT(rhs.pimpl->is_logical_op()); + OPENVINO_ASSERT(lhs.pimpl->is_logical_op()); + OPENVINO_ASSERT(rhs.pimpl->is_logical_op()); return SRegExpr("&&", std::move(lhs), std::move(rhs)); } inline SRegExpr operator||(SRegExpr&& lhs, SRegExpr&& rhs) { - ASSERT(lhs.pimpl->is_logical_op()); - ASSERT(rhs.pimpl->is_logical_op()); + OPENVINO_ASSERT(lhs.pimpl->is_logical_op()); + OPENVINO_ASSERT(rhs.pimpl->is_logical_op()); return SRegExpr("||", std::move(lhs), std::move(rhs)); } inline SRegExpr operator!(SRegExpr&& lhs) { - ASSERT(lhs.pimpl->is_logical_op()); + OPENVINO_ASSERT(lhs.pimpl->is_logical_op()); return SRegExpr("!", std::move(lhs)); } @@ -492,8 +491,6 @@ class SIMDJit : public jit_generator { DECLARE_CPU_JIT_AUX_FUNCTIONS(SIMDJit); static constexpr int num_allocatable_sregs = sizeof(abi_x86_64_regs) / sizeof(abi_x86_64_regs[0]); - int reg_status[num_allocatable_sregs]; - // scalar register variable class JitDisassembler { public: @@ -561,35 +558,22 @@ class SIMDJit : public jit_generator { create_kernel(); } - SReg get_sreg(int idx = -1) { - auto alloc_sreg = [&](int i) { - if (reg_status[i] != 0) { - throw std::runtime_error(std::string("try to allocate an already used register:") + std::to_string(i)); - } - reg_status[i] = 1; - return std::shared_ptr(new Xbyak::Reg64(abi_x86_64_regs[i]), [this, i](Xbyak::Reg64* preg) { - if (preg) { - reg_status[i] = 0; - delete preg; - } - }); - }; + SReg get_arg(int idx) { + auto ret = SReg(this, alloc_reg64(idx)); + if (idx >= abi_param_regs_num) + mov(ret, ptr[rax + (idx - abi_param_regs_num + 1) * 8]); // load from stack + return ret; + } - if (idx >= 0) { - auto ret = SReg(this, alloc_sreg(idx)); - if (idx >= abi_param_regs_num) - mov(ret, ptr[rax + (idx - abi_param_regs_num + 1) * 8]); // load from stack - return ret; - } else { - // find a free register, note argument registers are also allocatable, make sure - // allocate argument registers before any local register-var - for (int i = 0; i < num_allocatable_sregs; i++) { - if (reg_status[i] == 0) { - return SReg(this, alloc_sreg(i)); - } + SReg get_sreg() { + // find a free register, note argument registers are also allocatable, make sure + // allocate argument registers before any local register-var + for (int i = 0; i < num_allocatable_sregs; i++) { + if (reg_status[i] == 0) { + return SReg(this, alloc_reg64(i)); } } - throw std::runtime_error(std::string("scalar register resource exhausted!")); + OPENVINO_ASSERT(false, "scalar register resource exhausted!"); return {}; } @@ -752,6 +736,19 @@ class SIMDJit : public jit_generator { static int vmm_width() { return (use_avx512 ? 512 : 256) / (sizeof(DT) * 8); } + +private: + int reg_status[num_allocatable_sregs]; + std::shared_ptr alloc_reg64(int i) { + OPENVINO_ASSERT(reg_status[i] == 0, "try to allocate an already used register:", i); + reg_status[i] = 1; + return std::shared_ptr(new Xbyak::Reg64(abi_x86_64_regs[i]), [this, i](Xbyak::Reg64* preg) { + if (preg) { + reg_status[i] = 0; + delete preg; + } + }); + } }; inline const SReg& SReg::operator=(const SReg& rhs) const { @@ -826,7 +823,7 @@ inline void SRegExpr::evaluate(SIMDJit* jit, const SReg* pdst, const char assign jit->imul(dst, lhs->as_r64()); break; default: - ASSERT(false); + OPENVINO_ASSERT(false); break; } return; @@ -846,7 +843,7 @@ inline void SRegExpr::evaluate(SIMDJit* jit, const SReg* pdst, const char assign jit->imul(dst, dst, lhs->as_imm32()); break; default: - ASSERT(false); + OPENVINO_ASSERT(false); break; } return; @@ -870,7 +867,7 @@ inline void SRegExpr::evaluate(SIMDJit* jit, const SReg* pdst, const char assign jit->imul(dst, temp); break; default: - ASSERT(false); + OPENVINO_ASSERT(false); break; } } @@ -990,7 +987,7 @@ inline void SRegExpr::evaluate(SIMDJit* jit, const SReg* pdst, const char assign // try to replace last scratch register with assign destination register auto assign_dst_reg_idx = pdst->r64().getIdx(); auto assign_dst_reg_scratch_sn = pimpl->data; - ASSERT(assign_dst_reg_scratch_sn >= scratch_reg_base); + OPENVINO_ASSERT(assign_dst_reg_scratch_sn >= scratch_reg_base); // find the appearance of last access int last_access_exec_id = -1; int op_exec_id = 0; @@ -1089,14 +1086,14 @@ inline void SRegExpr::evaluate(SIMDJit* jit, const SReg* pdst, const char assign jit->sar(dst, p->rhs->as_imm32()); else { // only cl register supportted, we need allocate cl - ASSERT(0); // jit->sar(dst, p->rhs->as_r64()); + OPENVINO_ASSERT(false); // jit->sar(dst, p->rhs->as_r64()); } } else if (p->is_op("<<")) { if (p->rhs->is_imm()) jit->shl(dst, p->rhs->as_imm32()); else { // only cl register supportted, we need allocate cl - ASSERT(0); // jit->shl(dst, p->rhs->as_r64()); + OPENVINO_ASSERT(false); // jit->shl(dst, p->rhs->as_r64()); } } else if (p->is_op("&")) { if (p->rhs->is_imm()) @@ -1154,8 +1151,7 @@ inline void SRegExpr::evaluate(SIMDJit* jit, const SReg* pdst, const char assign jit->setle(dst.cvt8()); } } else { - std::cout << p->op << std::endl; - ASSERT(0); + OPENVINO_ASSERT(0, "Unsupported OP: ", p->op); } return true; }); @@ -1177,7 +1173,7 @@ inline void SRegExpr::evaluate(SIMDJit* jit, const SReg* pdst, const char assign jit->imul(*pdst, pimpl->as_r64()); break; default: - ASSERT(0); + OPENVINO_ASSERT(false); break; } } diff --git a/src/plugins/intel_cpu/tests/unit/simd_jit_test.cpp b/src/plugins/intel_cpu/tests/unit/simd_jit_test.cpp index 9bdb22c0ff3b33..e0c2a39854f11d 100644 --- a/src/plugins/intel_cpu/tests/unit/simd_jit_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/simd_jit_test.cpp @@ -16,13 +16,13 @@ static std::vector> test_exprs; { \ auto jit = std::make_shared(__func__); \ { \ - auto dst = jit->get_sreg(0); \ - auto a = jit->get_sreg(1); \ - auto b = jit->get_sreg(2); \ - auto c = jit->get_sreg(3); \ - auto d = jit->get_sreg(4); \ - auto e = jit->get_sreg(5); \ - auto f = jit->get_sreg(6); \ + auto dst = jit->get_arg(0); \ + auto a = jit->get_arg(1); \ + auto b = jit->get_arg(2); \ + auto c = jit->get_arg(3); \ + auto d = jit->get_arg(4); \ + auto e = jit->get_arg(5); \ + auto f = jit->get_arg(6); \ expr_name(dst, a, b, c, d, e, f); \ jit->finalize(dst); \ } \ @@ -42,13 +42,13 @@ static std::vector> test_exprs; { \ auto jit = std::make_shared(__func__); \ { \ - auto dst = jit->get_sreg(0); \ - auto a = jit->get_sreg(1); \ - auto b = jit->get_sreg(2); \ - auto c = jit->get_sreg(3); \ - auto d = jit->get_sreg(4); \ - auto e = jit->get_sreg(5); \ - auto f = jit->get_sreg(6); \ + auto dst = jit->get_arg(0); \ + auto a = jit->get_arg(1); \ + auto b = jit->get_arg(2); \ + auto c = jit->get_arg(3); \ + auto d = jit->get_arg(4); \ + auto e = jit->get_arg(5); \ + auto f = jit->get_arg(6); \ jit->while_(cond_expr(dst, a, b, c, d, e, f), [&] { \ body_expr(dst, a, b, c, d, e, f); \ }); \ @@ -173,7 +173,7 @@ static int fib(int n) { TEST(SIMDJit, control_flow_fib) { auto jit = std::make_shared(__func__); { - auto n = jit->get_sreg(0); + auto n = jit->get_arg(0); jit->if_(n == 1 || n == 2, [&] { jit->return_(1); });