Skip to content

Commit

Permalink
add and activate the test
Browse files Browse the repository at this point in the history
  • Loading branch information
chenhu-wang committed Jun 12, 2024
1 parent 4827703 commit 0927560
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 17 deletions.
12 changes: 8 additions & 4 deletions src/plugins/intel_cpu/src/emitters/plugin/x64/jit_emitter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,10 @@ void jit_emitter::emitter_preamble(const std::vector<size_t> &in_idxs, const std
bool is_vec_input = (in_out_type_ == emitter_in_out_map::vec_to_vec) || (in_out_type_ == emitter_in_out_map::vec_to_gpr);
bool is_vec_output = (in_out_type_ == emitter_in_out_map::vec_to_vec) || (in_out_type_ == emitter_in_out_map::gpr_to_vec);

for (auto idx : pool_vec_idxs)
aux_vec_idxs.push_back(idx);
size_t aux_vec_from_pool = std::min(aux_vecs_count(), pool_vec_idxs.size());
for (size_t i = 0; i < aux_vec_from_pool; ++i) {
aux_vec_idxs.push_back(pool_vec_idxs[i]);
}

// For sse41 mask register has to be Xmm(0)
if (host_isa_ == cpu::x64::sse41 && aux_vecs_count() > 0) {
Expand Down Expand Up @@ -111,8 +113,10 @@ void jit_emitter::emitter_preamble(const std::vector<size_t> &in_idxs, const std
OV_CPU_JIT_EMITTER_THROW("Failed to allocate required number of vector registers");

// Same logic but to allocate gprs
for (auto idx : pool_gpr_idxs)
aux_gpr_idxs.push_back(idx);
size_t aux_gpr_from_pool = std::min(aux_gprs_count(), pool_gpr_idxs.size());
for (size_t i = 0; i < aux_gpr_from_pool; ++i) {
aux_gpr_idxs.push_back(pool_gpr_idxs[i]);
}

for (size_t gpr_idx = 0; gpr_idx <= Operand::R15; ++gpr_idx) {
size_t _idx = Operand::R15 - gpr_idx; // we allocate from the end
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,13 +65,15 @@ static int get_aux_regs_as_temp(const int elem_count, const int data_size, bool
const int avx512_threshold_for_mask = 0, const bool is_fill = false) {
if (mayiuse(cpu::x64::avx512_core) && is_fill)
return 1;

// load i8/u8/i16/u16/bf16/fp16 to full xmm/ymm/zmm has direct no-mask ISA, no aux-reg need.
// store i32 on full xmm/ymm/zmm to i8/u8/i16/u16 has direct no-mask ISA, no aux-reg need.
// store f32 on full xmm/ymm/zmm to bf16/fp16. need convert to bf16/fp16 on vmm, then store to memory, use store_base condition.
// for pure move, there are direct no-mask instructions to move on full xmm/ymm/zmm, so aux_gpr is not needed.
// for move+convert:
// there are direct no-mask instructions to load i8/u8/i16/u16/bf16/fp16 to full xmm/ymm/zmm as f32/i32, so aux_gpr is not needed.
// there are direct no-mask instructions to store i32 on full xmm/ymm/zmm to i8/u8/i16/u16, so aux_gpr is not needed.
// storing f32 from a full xmm/ymm/zmm as bf16/fp16 requires converting to bf16/fp16 in the vmm first and then storing the vmm to memory, using the store_dword_to_word/byte_base condition.
// store_num == 16, vector: 16 * f32 -> 16 * bf16 -> ymm(256bit) -> store
// store_num == 8, vector: 8 * f32 -> 8 * bf16 -> xmm(128bit) -> store
// store_num == 4, vector: 4 * f32 -> 4 * bf16 -> 64bit -> no direct store
// store_num == 4, vector: 4 * f32 -> 4 * bf16 -> 64bit -> masked instruction with aux_gpr needed
// f32<->i32 is on full vmm, so aux_gpr is not needed.
const int byte_size = elem_count * data_size;
if ((is_pure_move && one_of(byte_size, 16, 32, 64)) || (!is_pure_move && one_of(elem_count, 4, 8, 16) && !is_store_as_real16))
return 0;
Expand Down Expand Up @@ -702,8 +704,8 @@ void jit_store_emitter::emit_isa(const int in_vec_idx, const Xbyak::Reg64 &reg_d

data_idx = in_vec_idx;
data_reg_updated = false;
aux_src_idx = aux_vec_idxs.back(); // for avoid src pollution
if (src_prc_ != dst_prc_) {
aux_src_idx = aux_vec_idxs.back(); // to avoid src pollution
switch (src_prc_) {
case ov::element::f32:
if (!dst_prc_.is_real()) {
Expand Down
13 changes: 6 additions & 7 deletions src/plugins/intel_cpu/tests/unit/jit_kernel_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -269,19 +269,18 @@ struct jit_variable_load_store_test_kernel {
struct Params {
const SrcT *src;
DstT *dst;
size_t size;
};

template<size_t N, size_t M, bool is_src>
void test() {
kernel_impl<N, is_src> kernel;
kernel_impl<N, M, is_src> kernel;
kernel.init();
ASSERT_GE(N, M);

std::array<SrcT, N> src {};
std::array<DstT, N> result {};

Params args = { src.data(), result.data(), M };
Params args = { src.data(), result.data()};

src.fill(static_cast<SrcT>(42));
for (size_t i = 0; i < M; ++i) {
Expand All @@ -300,20 +299,19 @@ struct jit_variable_load_store_test_kernel {
}

private:
template<size_t N, bool is_src>
template<size_t N, size_t M, bool is_src>
class kernel_impl : public jit_test_kernel<Params> {
public:
void generate() override {
jit_kernel::preamble();

auto src_ptr = jit_kernel::arg(&Params::src);
auto dst_ptr = jit_kernel::arg(&Params::dst);
auto size = jit_kernel::arg(&Params::size);

auto interm = jit_kernel::var<typename std::conditional<is_src, SrcT[N], DstT[N]>::type>();

jit_kernel::load(interm, src_ptr, size);
jit_kernel::store(dst_ptr, interm, size);
jit_kernel::load(interm, src_ptr, M);
jit_kernel::store(dst_ptr, interm, M);

jit_kernel::postamble();
}
Expand Down Expand Up @@ -360,6 +358,7 @@ TEST(JitKernel, variable_load_and_store) {
{
jit_variable_load_store_test_kernel<float, bfloat16_t> kernel;
if (mayiuse(cpu_isa_t::avx512_core)) {
kernel.test<16, 4, true>();
kernel.test<16, 11, true>();
}
if (mayiuse(cpu_isa_t::avx2)) {
Expand Down

0 comments on commit 0927560

Please sign in to comment.