From 156d88b02d9dee5557bef318fc9474865b4cd6e5 Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Thu, 30 Jan 2025 16:25:21 +0400 Subject: [PATCH] work --- src/plugins/intel_cpu/CMakeLists.txt | 3 +- .../emitters/plugin/riscv64/jit_emitter.cpp | 15 + .../emitters/plugin/riscv64/jit_emitter.hpp | 148 +++++++++ src/plugins/intel_cpu/src/nodes/eltwise.cpp | 8 + .../kernels}/riscv64/jit_generator.cpp | 20 +- .../kernels}/riscv64/jit_generator.hpp | 32 +- .../riscv64/jit_uni_eltwise_generic.cpp | 314 ++++++++++++++++++ .../riscv64/jit_uni_eltwise_generic.hpp | 124 +++++++ .../intel_cpu/thirdparty/CMakeLists.txt | 1 + 9 files changed, 649 insertions(+), 16 deletions(-) create mode 100644 src/plugins/intel_cpu/src/emitters/plugin/riscv64/jit_emitter.cpp create mode 100644 src/plugins/intel_cpu/src/emitters/plugin/riscv64/jit_emitter.hpp rename src/plugins/intel_cpu/src/{emitters/plugin => nodes/kernels}/riscv64/jit_generator.cpp (56%) rename src/plugins/intel_cpu/src/{emitters/plugin => nodes/kernels}/riscv64/jit_generator.hpp (74%) create mode 100644 src/plugins/intel_cpu/src/nodes/kernels/riscv64/jit_uni_eltwise_generic.cpp create mode 100644 src/plugins/intel_cpu/src/nodes/kernels/riscv64/jit_uni_eltwise_generic.hpp diff --git a/src/plugins/intel_cpu/CMakeLists.txt b/src/plugins/intel_cpu/CMakeLists.txt index f246dc12bd3798..6b28d88ddf15b6 100644 --- a/src/plugins/intel_cpu/CMakeLists.txt +++ b/src/plugins/intel_cpu/CMakeLists.txt @@ -219,7 +219,8 @@ if(NOT AARCH64) endif() if (NOT RISCV64) - list(APPEND EXCLUDE_PATHS ${CMAKE_CURRENT_SOURCE_DIR}/src/emitters/plugin/riscv64/*) + list(APPEND EXCLUDE_PATHS ${CMAKE_CURRENT_SOURCE_DIR}/src/emitters/plugin/riscv64/* + ${CMAKE_CURRENT_SOURCE_DIR}/src/nodes/kernels/riscv64/*) endif() if (NOT ENABLE_MLAS_FOR_CPU) diff --git a/src/plugins/intel_cpu/src/emitters/plugin/riscv64/jit_emitter.cpp b/src/plugins/intel_cpu/src/emitters/plugin/riscv64/jit_emitter.cpp new file mode 100644 index 00000000000000..69a0e661df0166 --- 
/dev/null +++ b/src/plugins/intel_cpu/src/emitters/plugin/riscv64/jit_emitter.cpp @@ -0,0 +1,15 @@ +// Copyright (C) 2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "jit_emitter.hpp" + +namespace ov { +namespace intel_cpu { +namespace riscv64 { + + + +} // namespace riscv64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/plugin/riscv64/jit_emitter.hpp b/src/plugins/intel_cpu/src/emitters/plugin/riscv64/jit_emitter.hpp new file mode 100644 index 00000000000000..0959f1a96418dc --- /dev/null +++ b/src/plugins/intel_cpu/src/emitters/plugin/riscv64/jit_emitter.hpp @@ -0,0 +1,148 @@ +// Copyright (C) 2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +#include + +#include "nodes/kernels/riscv64/jit_generator.hpp" +#include "emitters/utils.hpp" +#include "snippets/generator.hpp" +#include "snippets/snippets_isa.hpp" + + +namespace ov { +namespace intel_cpu { +namespace riscv64 { + +enum emitter_in_out_map { + vec_to_vec, + vec_to_gpr, + gpr_to_vec, + gpr_to_gpr, +}; + +// structure for storage of emitter parameters to hash in map +struct emitter_params { + virtual size_t hash() const = 0; +}; + +class jit_emitter : public ov::snippets::Emitter { +public: + jit_emitter(ov::intel_cpu::riscv64::jit_generator* host, + ov::element::Type exec_prc = ov::element::f32, + emitter_in_out_map in_out_type = emitter_in_out_map::vec_to_vec); + + void emit_code(const std::vector& in_idxs, + const std::vector& out_idxs, + const std::vector& pool_vec_idxs = {}, + const std::vector& pool_gpr_idxs = {}) const override; + void emit_data() const override; + + virtual size_t get_inputs_num() const = 0; + virtual size_t aux_vecs_count() const; + virtual size_t aux_gprs_count() const; + emitter_in_out_map get_in_out_type() const; + + /** + * @brief Returns supported precisions. + * Precisions are ordered, the first bigger bitness precision with the same type will be selected. 
+ * Empty collection means the emitter supports any input precisions. + */ + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); + +protected: + size_t get_max_vecs_count() const; + size_t get_vec_length() const; + + virtual void prepare_table(); + virtual void register_table_entries() {} + + //void load_table_addr() const { + // h->mov(p_table, *l_table.get()); + //} + + virtual void emit_impl(const std::vector& in_idxs, const std::vector& out_idxs) const = 0; + + virtual void emitter_preamble(const std::vector& in_idxs, + const std::vector& out_idxs, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const; + virtual void emitter_postamble() const; + + void store_context(const std::vector& gpr_regs, + const std::vector& vec_regs, + const std::unordered_set& ignore_vec_regs) const; + void restore_context(const std::vector& gpr_regs, + const std::vector& vec_regs, + const std::unordered_set& ignore_vec_regs) const; + + //Xbyak_riscv64::Address table_val(const std::string& key, size_t key_off_val_shift = 0) const { + // auto off = table_off(key, key_off_val_shift); + // return h->ptr[p_table + off]; + //} + + // we accept only 32bit hexadecimal table values to avoid any rounding + using table_entry_val_t = uint32_t; + using table_entry_offset_t = size_t; // offsets are in bytes wrt p_table + using table_entry_bcast_t = bool; // true => bcast value + + struct table_entry_t { + table_entry_val_t val; + table_entry_bcast_t bcast; + }; + struct mapped_table_entry_t { + table_entry_offset_t off; + table_entry_val_t val; + table_entry_bcast_t bcast; + }; + + using table_t = std::multimap; + using mapped_table_t = std::multimap; + + void push_arg_entry_of(const std::string& key, const table_entry_val_t val, const bool broadcast) { + mapped_table_entry_t te{0, val, broadcast}; + entry_map_.insert(std::make_pair(key, te)); + } + + void push_entries_of(const table_t& t) { + for (auto it = t.begin(); it != t.end(); it++) { 
+ auto key = (*it).first; + auto te = (*it).second; // copy values from table + push_arg_entry_of(key, te.val, te.bcast); + } + } + + virtual void validate_arguments(const std::vector&, const std::vector&) const {} + + ov::intel_cpu::riscv64::jit_generator* h; + ov::element::Type exec_prc_; + + mutable Xbyak_riscv::Reg p_table; + mutable std::shared_ptr l_table; + mutable std::vector aux_vec_idxs; + mutable std::vector aux_gpr_idxs; + + mapped_table_t entry_map_; + emitter_in_out_map in_out_type_; + +private: + mutable std::vector preserved_vec_idxs; + mutable std::vector preserved_gpr_idxs; + + size_t table_off(const std::string& key, size_t key_off_val_shift = 0) const { + const auto it = entry_map_.find(key); // search an entry for a key + OV_CPU_JIT_EMITTER_ASSERT(it != entry_map_.end(), "Value has not been found in the table"); + const auto& te = (*it).second; + const auto scale = te.bcast ? get_vec_length() : sizeof(table_entry_val_t); + return te.off + key_off_val_shift * scale; + } +}; + +} // namespace riscv64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/eltwise.cpp b/src/plugins/intel_cpu/src/nodes/eltwise.cpp index 3ab561f9c5b79a..a66d6c14856dd2 100644 --- a/src/plugins/intel_cpu/src/nodes/eltwise.cpp +++ b/src/plugins/intel_cpu/src/nodes/eltwise.cpp @@ -53,6 +53,10 @@ # include "kernels/x64/jit_uni_eltwise_generic.hpp" #endif +#if defined(OPENVINO_ARCH_RISCV64) +# include "kernels/riscv64/jit_uni_eltwise_generic.hpp" +#endif + using namespace dnnl::impl::utils; using namespace dnnl::impl::cpu; @@ -688,6 +692,9 @@ class EltwiseJitExecutor : public Eltwise::IEltwiseExecutor { } #endif // OPENVINO_ARCH_ARM64 +#if defined(OPENVINO_ARCH_RISCV64) + _pKernel.reset(new ov::intel_cpu::riscv64::jit_uni_eltwise_generic(jep, eltwise_data)); +#endif // OPENVINO_ARCH_RISCV64 if (_pKernel) { _pKernel->create_ker(); } @@ -1466,6 +1473,12 @@ void Eltwise::initSupportedPrimitiveDescriptors() { OPENVINO_THROW("Unknow CPU architecture"); #endif +#if defined(OPENVINO_ARCH_RISCV64) + // The RISC-V JIT kernel currently emits only a vector add, so every other algorithm stays on the reference path. + const bool useJit = getAlgorithm() == 
Algorithm::EltwiseAdd; + implType = useJit ? EltwiseImplType::optimized : EltwiseImplType::reference; +#endif // OPENVINO_ARCH_RISCV64 + #if defined(OV_CPU_WITH_ACL) auto filterPrecision = [&](const ov::element::Type& prc, const ov::element::Type& forcedPrec) { if (isBitwise(algorithm)) { diff --git a/src/plugins/intel_cpu/src/emitters/plugin/riscv64/jit_generator.cpp b/src/plugins/intel_cpu/src/nodes/kernels/riscv64/jit_generator.cpp similarity index 56% rename from src/plugins/intel_cpu/src/emitters/plugin/riscv64/jit_generator.cpp rename to src/plugins/intel_cpu/src/nodes/kernels/riscv64/jit_generator.cpp index 49fd07c2e452a9..8e09225b7776aa 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/riscv64/jit_generator.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/riscv64/jit_generator.cpp @@ -4,6 +4,8 @@ #include "jit_generator.hpp" +#include "utils/general_utils.h" + namespace ov { namespace intel_cpu { namespace riscv64 { @@ -11,22 +13,28 @@ namespace riscv64 { using namespace Xbyak_riscv; void jit_generator::preamble() { + const int frame_size = rnd_up((num_abi_save_gpr_regs + 1) * xlen, sp_aligment); // TODO: FP gpr ? 
- addi(sp, sp, -num_abi_save_gpr_regs * xlen); + addi(sp, sp, -frame_size); int imm = 0; for (const auto& gpr : abi_save_gpr_regs) { - sw(gpr, sp, imm); - imm += 4; + sd(gpr, sp, imm); + imm += xlen; } + sd(ra, sp, imm); } void jit_generator::postamble() { + const int frame_size = rnd_up((num_abi_save_gpr_regs + 1) * xlen, sp_aligment); int imm = 0; for (const auto& gpr : abi_save_gpr_regs) { - lw(gpr, sp, imm); - imm += 4; + ld(gpr, sp, imm); + imm += xlen; } - addi(sp, sp, num_abi_save_gpr_regs * xlen); + ld(ra, sp, imm); + + addi(sp, sp, frame_size); + ret(); } diff --git a/src/plugins/intel_cpu/src/emitters/plugin/riscv64/jit_generator.hpp b/src/plugins/intel_cpu/src/nodes/kernels/riscv64/jit_generator.hpp similarity index 74% rename from src/plugins/intel_cpu/src/emitters/plugin/riscv64/jit_generator.hpp rename to src/plugins/intel_cpu/src/nodes/kernels/riscv64/jit_generator.hpp index 1371faa504cacd..e6bc508bce997b 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/riscv64/jit_generator.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/riscv64/jit_generator.hpp @@ -9,10 +9,15 @@ #include "openvino/core/except.hpp" + namespace ov { namespace intel_cpu { namespace riscv64 { +#define DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_name) \ + const char *name() const override { return #jit_name; } \ + const char *source_file() const override { return __FILE__; } + // RISCV-64 specific registers mapping // reg | ABI Name | descripion | saved by // ===================================================== @@ -39,15 +44,6 @@ namespace riscv64 { // f18-27 | fs2-11 | FP Saved registers | Callee // f28-31 | ft8-11 | FP Temporaries | Caller -// Callee-saved registers -constexpr Xbyak_riscv::Reg abi_save_gpr_regs[] = {Xbyak_riscv::s0, Xbyak_riscv::s1, Xbyak_riscv::s2, Xbyak_riscv::s3, - Xbyak_riscv::s4, Xbyak_riscv::s5, Xbyak_riscv::s6, Xbyak_riscv::s7, - Xbyak_riscv::s8, Xbyak_riscv::s9, Xbyak_riscv::s10, Xbyak_riscv::s11}; - -static const Xbyak_riscv::Reg abi_param1(Xbyak_riscv::a0), 
abi_param2(Xbyak_riscv::a1), abi_param3(Xbyak_riscv::a2), - abi_param4(Xbyak_riscv::a3), abi_param5(Xbyak_riscv::a4), abi_param6(Xbyak_riscv::a5), - abi_param7(Xbyak_riscv::a6), abi_param8(Xbyak_riscv::a7); - class jit_generator : public Xbyak_riscv::CodeGenerator { public: jit_generator(size_t maxSize = Xbyak_riscv::DEFAULT_MAX_CODE_SIZE, @@ -79,15 +75,33 @@ class jit_generator : public Xbyak_riscv::CodeGenerator { // Disallow char-based labels completely void L(const char *label) = delete; + void L(Xbyak_riscv::Label &label) { + Xbyak_riscv::CodeGenerator::L(label); + } jit_generator(const jit_generator &) = delete; jit_generator &operator=(const jit_generator &) = delete; + virtual const char *name() const = 0; + virtual const char *source_file() const = 0; + + // Callee-saved registers + static constexpr Xbyak_riscv::Reg abi_save_gpr_regs[] = {Xbyak_riscv::s0, Xbyak_riscv::s1, Xbyak_riscv::s2, Xbyak_riscv::s3, + Xbyak_riscv::s4, Xbyak_riscv::s5, Xbyak_riscv::s6, Xbyak_riscv::s7, + Xbyak_riscv::s8, Xbyak_riscv::s9, Xbyak_riscv::s10, Xbyak_riscv::s11}; + // ABI-arguments registers + static constexpr Xbyak_riscv::Reg abi_param_regs[] = {Xbyak_riscv::a0, Xbyak_riscv::a1, Xbyak_riscv::a2, Xbyak_riscv::a3, + Xbyak_riscv::a4, Xbyak_riscv::a5, Xbyak_riscv::a6, Xbyak_riscv::a7}; + protected: virtual void generate() = 0; const uint8_t *jit_ker_ = nullptr; + // In the standard RISC-V calling convention, the stack pointer is always kept 16-byte aligned + const size_t sp_aligment = 16; + // Vector register count + const size_t vec_count = 32; // integer gpr byte size const size_t xlen = Xbyak_riscv::CPU().getXlen() / 8; // fp gpr byte size diff --git a/src/plugins/intel_cpu/src/nodes/kernels/riscv64/jit_uni_eltwise_generic.cpp b/src/plugins/intel_cpu/src/nodes/kernels/riscv64/jit_uni_eltwise_generic.cpp new file mode 100644 index 00000000000000..87977e4f19ef6b --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/kernels/riscv64/jit_uni_eltwise_generic.cpp @@ -0,0 +1,314 @@ 
+// Copyright (C) 2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "jit_uni_eltwise_generic.hpp" + +namespace ov { +namespace intel_cpu { +namespace riscv64 { + +using namespace Xbyak_riscv; + +#define GET_OFF(field) offsetof(jit_eltwise_call_args_ptrs, field) + +jit_uni_eltwise_generic::jit_uni_eltwise_generic(jit_eltwise_params jep, std::vector eltwise_data) + : jit_uni_eltwise_kernel(std::move(jep)), + jit_generator(), + eltwise_data_(std::move(eltwise_data)) {} + +void jit_uni_eltwise_generic::generate() { + preamble(); + + auto const exec_prc = ov::element::f32; + + //eltwise_emitter = create_eltwise_emitter(eltwise_data_.front(), exec_prc); + //for (size_t i = 1; i < eltwise_data_.size(); ++i) { + // post_op_emitters.push_back(create_eltwise_emitter(eltwise_data_[i], exec_prc)); + //} + + const auto& jep = jep_; + const int offset_count = jep.input_size - 1; + const LMUL lmul = LMUL::m1; + + // ptrs initializing + if (jep.use_runtime_ptrs) { + auto init_ptrs_with_offsets = [&](Reg reg) { + for (int j = 0; j < offset_count; j++) { + ld(reg_tmp_0, reg_offsets, static_cast(j * sizeof(size_t))); + ld(reg_tmp_1, reg_indexes, static_cast(j * sizeof(size_t))); + mul(reg_tmp_0, reg_tmp_0, reg_tmp_1); + add(reg, reg, reg_tmp_0); + } + }; + + for (size_t i = 0; i < jep.inputs_number; i++) { + ld(reg_offsets, reg_const_params, GET_OFF(src_offsets) + i * sizeof(size_t)); + ld(src_gpr(i), reg_const_params, GET_OFF(src_ptr[0]) + i * sizeof(size_t)); + init_ptrs_with_offsets(src_gpr(i)); + } + + ld(reg_offsets, reg_const_params, GET_OFF(dst_offsets)); + ld(dst_gpr(), reg_const_params, GET_OFF(dst_ptr)); + init_ptrs_with_offsets(dst_gpr()); + + ld(reg_work_amount, reg_const_params, GET_OFF(work_amount)); + } else { + auto init_ptrs_with_offsets = [&](Reg reg, const std::vector& offsets) { + for (int j = 0; j < offset_count; j++) { + if (jep_.dims[j] != 1 && offsets[j] != 0) { + // what's about 64bit? 
+ li(reg_tmp_0, static_cast(offsets[j])); + ld(reg_tmp_1, reg_indexes, static_cast(j * sizeof(size_t))); + mul(reg_tmp_0, reg_tmp_0, reg_tmp_1); + add(reg, reg, reg_tmp_0); + } + } + }; + + for (size_t i = 0; i < jep.inputs_number; i++) { + ld(src_gpr(i), reg_const_params, GET_OFF(src_ptr[0]) + i * sizeof(size_t)); + init_ptrs_with_offsets(src_gpr(i), jep.src_offsets[i]); + } + + ld(dst_gpr(), reg_const_params, GET_OFF(dst_ptr)); + init_ptrs_with_offsets(dst_gpr(), jep.dst_offsets); + + li(reg_work_amount, static_cast(jep.work_amount)); + } + + vsetvli(reg_vlen, reg_work_amount, SEW::e32, lmul); + for (size_t i = 0; i < jep.inputs_number; i++) { + if (jep.src_size[i] == 1) { + flw(f0, src_gpr(i)); + vfmv_v_f(src_vec(i, lmul), f0); + } + } + + size_t min_src_size = jep.dst_size; + for (size_t i = 0; i < jep.inputs_number; i++) { + if (jep.src_size[i] != 1) + min_src_size = std::min(min_src_size, jep.src_size[i]); + } + + if (min_src_size == jep.dst_size) { + // No non-scalar input is smaller than the output: process everything in one flat vector loop. + Label loop_begin; + Label loop_end; + + L(loop_begin); + { + beqz(reg_work_amount, loop_end); + + vsetvli(reg_vlen, reg_work_amount, SEW::e32, lmul); + sub(reg_work_amount, reg_work_amount, reg_vlen); + slli(reg_vlen, reg_vlen, 2); // in bytes + + for (size_t i = 0; i < jep.inputs_number; i++) { + if (jep.src_size[i] != 1) { + vle32_v(src_vec(i, lmul), src_gpr(i)); + add(src_gpr(i), src_gpr(i), reg_vlen); + } + } + + vfadd_vv(dst_vec(), src_vec(0, lmul), src_vec(1, lmul)); + + //compute_eltwise_op(lmul, input_count, vmm_dst_idx); + + //apply_post_ops(lmul, input_count, vmm_dst_idx); + + vse32_v(dst_vec(), dst_gpr()); + add(dst_gpr(), dst_gpr(), reg_vlen); + + bnez(reg_work_amount, loop_begin); + } + L(loop_end); + } + + if (min_src_size != jep.dst_size) { + // At least one non-scalar input is smaller than the output, so the output is produced in + // chunks of min_src_size elements; first check that the input sizes allow such chunking. + bool is_valid_configuration = true; + if (jep.dst_size % min_src_size != 0) + is_valid_configuration = false; + + for 
(size_t i = 0; i < jep.inputs_number; i++) { + if (jep.src_size[i] != 1 && jep.src_size[i] != min_src_size && jep.src_size[i] != jep.dst_size) + is_valid_configuration = false; + } + + OPENVINO_ASSERT(is_valid_configuration, "Eltwise jitter has invalid configuration for Eltwise node"); + + Label loop_begin; + Label loop_end; + Label inner_loop_begin; + + L(loop_begin); + { + beqz(reg_work_amount, loop_end); + + for (size_t i = 0; i < jep.inputs_number; i++) { + if (jep.src_size[i] != 1) { + mv(src_aux_gpr(i), src_gpr(i)); + } + } + + li(reg_loop_step, min_src_size); + L(inner_loop_begin); + { + vsetvli(reg_vlen, reg_loop_step, SEW::e32, lmul); + + sub(reg_loop_step, reg_loop_step, reg_vlen); + slli(reg_vlen, reg_vlen, 2); // to bytes + + for (size_t i = 0; i < jep.inputs_number; i++) { + if (jep.src_size[i] != 1) { + vle32_v(src_vec(i, lmul), src_aux_gpr(i)); + add(src_aux_gpr(i), src_aux_gpr(i), reg_vlen); + } + } + + vfadd_vv(dst_vec(), src_vec(0, lmul), src_vec(1, lmul)); + + //compute_eltwise_op(lmul, input_count, vmm_dst_idx); + + //apply_post_ops(lmul, input_count, vmm_dst_idx); + + vse32_v(dst_vec(), dst_gpr()); + add(dst_gpr(), dst_gpr(), reg_vlen); + + bnez(reg_loop_step, inner_loop_begin); + } + + const auto reg_tmp = reg_loop_step; + for (size_t i = 0; i < jep.inputs_number; i++) { + if (jep.src_size[i] == jep.dst_size) { + li(reg_tmp, jep.src_prc[i].size() * min_src_size); + add(src_gpr(i), src_gpr(i), reg_tmp); + } + } + + li(reg_loop_step, min_src_size); + sub(reg_work_amount, reg_work_amount, reg_loop_step); + j_(loop_begin); + } + + L(loop_end); + } + + postamble(); + //emit_data(); +} + +void jit_uni_eltwise_generic::emit_data() const { + OPENVINO_ASSERT(eltwise_emitter, "Emitter is missed"); + eltwise_emitter->emit_data(); + for (size_t i = 0; i < post_op_emitters.size(); i++) { + post_op_emitters[i]->emit_data(); + } +} + +int jit_uni_eltwise_generic::lmul2int(const LMUL lmul) const { + switch (lmul) { + case LMUL::m1: + return 1; + case 
LMUL::m2: + return 2; + case LMUL::m4: + return 4; + case LMUL::m8: + return 8; + default: { + OPENVINO_THROW(std::string("not supported vector length multiplier: ") + std::to_string(static_cast(lmul))); + } + } +} + +namespace { +struct EltwiseEmitterContext { + std::shared_ptr emitter; + ov::intel_cpu::riscv64::jit_generator* host; + const EltwiseData& opData; + ov::element::Type exec_prc; +}; + +template +struct EltwiseEmitter { + void operator()(EltwiseEmitterContext& ctx) { + ctx.emitter = std::make_shared(ctx.host, ctx.exec_prc); + } +}; +} // namespace + +std::shared_ptr jit_uni_eltwise_generic::create_eltwise_emitter(const EltwiseData& data, + const ov::element::Type& exec_prec) { + EltwiseEmitterContext ctx = {nullptr, this, data, exec_prec}; + + //OV_SWITCH( + // intel_cpu, + // EltwiseEmitter, + // ctx, + // data.algo, + // OV_CASE(Algorithm::EltwiseAdd, ov::intel_cpu::riscv64::jit_add_emitter)); + + if (!ctx.emitter) { + OPENVINO_THROW("Unsupported operation type '" + algToString(data.algo) + "' for Eltwise emitter"); + } + + return ctx.emitter; +} + +void jit_uni_eltwise_generic::compute_eltwise_op() { + //std::vector in_idxs; + //for (size_t i = 0; i < eltwise_emitter->get_inputs_num(); i++) { + // in_idxs.push_back(src_vmm(i).getIdx()); + //} +// + //std::vector aux_idxs; + //for (size_t i = 0; i < eltwise_emitter->aux_vecs_count(); i++) { + // aux_idxs.push_back(aux_vmm(i).getIdx()); + //} +// + //std::vector out_idxs; + //out_idxs.push_back(dst_vmm().getIdx()); +// + //std::vector gpr_idxs; + //for (size_t i = 0; i < eltwise_emitter->aux_gprs_count(); i++) { + // gpr_idxs.push_back(aux_gpr(i).getIdx()); + //} + + //eltwise_emitter->emit_code(in_idxs, out_idxs, aux_idxs, gpr_idxs); +} + +void jit_uni_eltwise_generic::apply_post_ops() { + int input_idx = eltwise_emitter->get_inputs_num(); + int eltwise_post_op_idx = 0; + //for (size_t i = 1; i < eltwise_data_.size(); i++) { + // std::vector in_idxs; + // in_idxs.push_back(dst_vmm().getIdx()); + // 
for (size_t j = 1; j < post_op_emitters[eltwise_post_op_idx]->get_inputs_num(); j++) { + // in_idxs.push_back(src_vmm(input_idx++).getIdx()); + // } +// + // std::vector out_idxs; + // out_idxs.push_back(dst_vmm().getIdx()); +// + // std::vector aux_vmm_idxs; + // for (size_t j = 0; j < post_op_emitters[eltwise_post_op_idx]->aux_vecs_count(); j++) { + // aux_vmm_idxs.push_back(aux_vmm(j).getIdx()); + // } +// + // std::vector aux_gpr_idxs; + // for (size_t j = 0; j < post_op_emitters[eltwise_post_op_idx]->aux_gprs_count(); j++) { + // aux_gpr_idxs.push_back(aux_gpr(j).getIdx()); + // } +// + // post_op_emitters[eltwise_post_op_idx]->emit_code(in_idxs, out_idxs, aux_vmm_idxs, aux_gpr_idxs); +// + // eltwise_post_op_idx++; + //} +} + +} // namespace riscv64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/kernels/riscv64/jit_uni_eltwise_generic.hpp b/src/plugins/intel_cpu/src/nodes/kernels/riscv64/jit_uni_eltwise_generic.hpp new file mode 100644 index 00000000000000..bfac57f2345602 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/kernels/riscv64/jit_uni_eltwise_generic.hpp @@ -0,0 +1,124 @@ +// Copyright (C) 2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "jit_generator.hpp" + +#include "emitters/plugin/riscv64/jit_emitter.hpp" +#include "nodes/executors/eltwise.hpp" +#include "nodes/kernels/jit_eltwise_common.hpp" +#include "utils/cpu_utils.hpp" +#include "utils/general_utils.h" + +namespace ov { +namespace intel_cpu { +namespace riscv64 { + +struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, jit_generator { +public: + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_eltwise_generic) + + jit_uni_eltwise_generic(jit_eltwise_params jep, std::vector eltwise_data); + jit_uni_eltwise_generic() = default; + + void create_ker() override { + jit_generator::create_kernel(); + ker_ = (decltype(ker_))jit_ker(); + } + + void generate() override; + +private: + // Register mapping in the 
kernel: + // x0-x4 | system | not used + // x5 | temp | reg_tmp_0 / reg_work_amount + // x6 | temp | reg_tmp_1 / reg_vlen + // x7 | temp | reg_offsets / reg_loop_step + // x8 | saved | + // x9 | saved | dst_gpr + // x10 | abi | reg_const_params + // x11 | abi | reg_indexes + // x12 | abi | + // x13 | abi | + // x14 | abi | + // x15 | abi | + // x16 | abi | + // x17 | abi | + // x18 | saved | src_gpr + // x19 | saved | src_gpr + // x20 | saved | src_gpr + // x21 | saved | src_gpr + // x22 | saved | src_gpr + // x23 | saved | src_gpr + // x24 | saved | src_gpr + // x25 | saved | src_aux_gpr + // x26 | saved | src_aux_gpr + // x27 | saved | src_aux_gpr + // x28 | temp | src_aux_gpr + // x29 | temp | src_aux_gpr + // x30 | temp | src_aux_gpr + // x31 | temp | src_aux_gpr + + inline Xbyak_riscv::Reg dst_gpr() const { + // x9 + return Xbyak_riscv::Reg(9); + } + + inline Xbyak_riscv::Reg src_gpr(const int idx) const { + // x18-24 (assumes MAX_ELTWISE_INPUTS <= 7 - TODO confirm) + OPENVINO_ASSERT(idx >= 0 && idx < MAX_ELTWISE_INPUTS, "src reg " + std::to_string(idx) + " is not supported"); + return Xbyak_riscv::Reg(18 + idx); + } + + inline Xbyak_riscv::Reg src_aux_gpr(const int idx) const { + // x25-x31: x25-x27 are saved registers, x28-x31 are temporaries + OPENVINO_ASSERT(idx >= 0 && idx < MAX_ELTWISE_INPUTS, "src aux reg " + std::to_string(idx) + " is not supported"); + return Xbyak_riscv::Reg(25 + idx); + } + + inline Xbyak_riscv::VReg dst_vec() const { + return Xbyak_riscv::VReg(0); + } + + inline Xbyak_riscv::VReg src_vec(const int idx, const Xbyak_riscv::LMUL lmul) const { + const auto vec_idx = (idx + 1) * lmul2int(lmul); // input idx maps to register index (idx + 1) * LMUL; v0 is returned by dst_vec() + OPENVINO_ASSERT(vec_idx >= 0 && static_cast(vec_idx) < vec_count, + "src vector reg " + std::to_string(vec_idx) + " is not supported"); + return Xbyak_riscv::VReg(vec_idx); + } + + //inline Xbyak_riscv::VReg aux_vec(const int idx) { + // OPENVINO_ASSERT(idx >= 0 && (idx + 8 < vec_count), "aux vector reg " + std::to_string(idx) + " is not supported"); + // return Xbyak_riscv::VReg(idx + 8); + //} + + std::shared_ptr 
create_eltwise_emitter(const EltwiseData& data, const ov::element::Type& exec_prec); + + Xbyak_riscv::Reg reg_const_params = Xbyak_riscv::a0; + Xbyak_riscv::Reg reg_indexes = Xbyak_riscv::a1; + + Xbyak_riscv::Reg reg_tmp_0 = Xbyak_riscv::t0; + Xbyak_riscv::Reg reg_tmp_1 = Xbyak_riscv::t1; + Xbyak_riscv::Reg reg_offsets = Xbyak_riscv::t2; + Xbyak_riscv::Reg reg_work_amount = reg_tmp_0; + Xbyak_riscv::Reg reg_vlen = reg_tmp_1; + Xbyak_riscv::Reg reg_loop_step = reg_offsets; + + void compute_eltwise_op(); + void apply_post_ops(); + void emit_data() const; + + int lmul2int(const Xbyak_riscv::LMUL lmul) const; + + const std::vector eltwise_data_; + const std::vector ops_list_; + + std::shared_ptr eltwise_emitter = nullptr; + std::vector> post_op_emitters; +}; + +} // namespace riscv64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/thirdparty/CMakeLists.txt b/src/plugins/intel_cpu/thirdparty/CMakeLists.txt index 48847c728d9349..77ee3042c95cd4 100644 --- a/src/plugins/intel_cpu/thirdparty/CMakeLists.txt +++ b/src/plugins/intel_cpu/thirdparty/CMakeLists.txt @@ -172,6 +172,7 @@ endif() if(RISCV64) set(XBYAK_RISCV_V ON) add_subdirectory(xbyak_riscv) + target_compile_definitions(xbyak_riscv INTERFACE XBYAK_RISCV_V=1) ov_install_static_lib(xbyak_riscv ${OV_CPACK_COMP_CORE}) endif()