add and activate the test

chenhu-wang · Jun 12, 2024 · 0927560 · 0927560
1 parent 4827703
commit 0927560
Show file tree

Hide file tree

Showing 3 changed files with 22 additions and 17 deletions.
diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_emitter.cpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_emitter.cpp
@@ -66,8 +66,10 @@ void jit_emitter::emitter_preamble(const std::vector<size_t> &in_idxs, const std
     bool is_vec_input = (in_out_type_ == emitter_in_out_map::vec_to_vec) || (in_out_type_ == emitter_in_out_map::vec_to_gpr);
     bool is_vec_output = (in_out_type_ == emitter_in_out_map::vec_to_vec) || (in_out_type_ == emitter_in_out_map::gpr_to_vec);
 
-    for (auto idx : pool_vec_idxs)
-        aux_vec_idxs.push_back(idx);
+    size_t aux_vec_from_pool = std::min(aux_vecs_count(), pool_vec_idxs.size());
+    for (size_t i = 0; i < aux_vec_from_pool; ++i) {
+        aux_vec_idxs.push_back(pool_vec_idxs[i]);
+    }
 
     // For sse41 mask register has to be Xmm(0)
     if (host_isa_ == cpu::x64::sse41 && aux_vecs_count() > 0) {
@@ -111,8 +113,10 @@ void jit_emitter::emitter_preamble(const std::vector<size_t> &in_idxs, const std
         OV_CPU_JIT_EMITTER_THROW("Failed to allocate required number of vector registers");
 
     // Same logic but to allocate gprs
-    for (auto idx : pool_gpr_idxs)
-        aux_gpr_idxs.push_back(idx);
+    size_t aux_gpr_from_pool = std::min(aux_gprs_count(), pool_gpr_idxs.size());
+    for (size_t i = 0; i < aux_gpr_from_pool; ++i) {
+        aux_gpr_idxs.push_back(pool_gpr_idxs[i]);
+    }
 
     for (size_t gpr_idx = 0; gpr_idx <= Operand::R15; ++gpr_idx) {
         size_t _idx = Operand::R15 - gpr_idx; // we allocate from the end

diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.cpp
@@ -65,13 +65,15 @@ static int get_aux_regs_as_temp(const int elem_count, const int data_size, bool
                                 const int avx512_threshold_for_mask = 0, const bool is_fill = false) {
     if (mayiuse(cpu::x64::avx512_core) && is_fill)
         return 1;
-
-    // load i8/u8/i16/u16/bf16/fp16 to full xmm/ymm/zmm has direct no-mask ISA, no aux-reg need.
-    // store i32 on full xmm/ymm/zmm to i8/u8/i16/u16 has direct no-mask ISA, no aux-reg need.
-    // store f32 on full xmm/ymm/zmm to bf16/fp16. need convert to bf16/fp16 on vmm, then store to memory, use store_base condition.
+    // For a pure move, there are direct no-mask instructions that move a full xmm/ymm/zmm, so aux_gpr is not needed.
+    // For move+convert:
+    // there are direct no-mask instructions to load i8/u8/i16/u16/bf16/fp16 into a full xmm/ymm/zmm as f32/i32, so aux_gpr is not needed.
+    // there are direct no-mask instructions to store i32 from a full xmm/ymm/zmm as i8/u8/i16/u16, so aux_gpr is not needed.
+    // storing f32 from a full xmm/ymm/zmm as bf16/fp16 needs a conversion to bf16/fp16 in the vmm first, then a store of the vmm to memory, using the store_dword_to_word/byte_base condition.
     // store_num == 16, vector: 16 * f32 -> 16 * bf16 -> ymm(256bit) -> store
     // store_num == 8,  vector:  8 * f32 ->  8 * bf16 -> xmm(128bit)  -> store
-    // store_num == 4,  vector:  4 * f32 ->  4 * bf16 ->       64bit  -> no direct store
+    // store_num == 4,  vector:  4 * f32 ->  4 * bf16 ->       64bit  -> a masked instruction is needed, which requires aux_gpr
+    // f32<->i32 conversion works on the full vmm, so aux_gpr is not needed.
     const int byte_size = elem_count * data_size;
     if ((is_pure_move && one_of(byte_size, 16, 32, 64)) || (!is_pure_move && one_of(elem_count, 4, 8, 16) && !is_store_as_real16))
         return 0;
@@ -702,8 +704,8 @@ void jit_store_emitter::emit_isa(const int in_vec_idx, const Xbyak::Reg64 &reg_d
 
     data_idx = in_vec_idx;
     data_reg_updated = false;
-    aux_src_idx = aux_vec_idxs.back(); // for avoid src pollution
     if (src_prc_ != dst_prc_) {
+        aux_src_idx = aux_vec_idxs.back(); // to avoid src pollution
         switch (src_prc_) {
             case ov::element::f32:
                 if (!dst_prc_.is_real()) {

diff --git a/src/plugins/intel_cpu/tests/unit/jit_kernel_test.cpp b/src/plugins/intel_cpu/tests/unit/jit_kernel_test.cpp
@@ -269,19 +269,18 @@ struct jit_variable_load_store_test_kernel {
     struct Params {
         const SrcT *src;
         DstT *dst;
-        size_t size;
     };
 
     template<size_t N, size_t M, bool is_src>
     void test() {
-        kernel_impl<N, is_src> kernel;
+        kernel_impl<N, M, is_src> kernel;
         kernel.init();
         ASSERT_GE(N, M);
 
         std::array<SrcT, N> src {};
         std::array<DstT, N> result {};
 
-        Params args = { src.data(), result.data(), M };
+        Params args = { src.data(), result.data()};
 
         src.fill(static_cast<SrcT>(42));
         for (size_t i = 0; i < M; ++i) {
@@ -300,20 +299,19 @@ struct jit_variable_load_store_test_kernel {
     }
 
 private:
-    template<size_t N, bool is_src>
+    template<size_t N, size_t M, bool is_src>
     class kernel_impl : public jit_test_kernel<Params> {
     public:
         void generate() override {
             jit_kernel::preamble();
 
             auto src_ptr = jit_kernel::arg(&Params::src);
             auto dst_ptr = jit_kernel::arg(&Params::dst);
-            auto size = jit_kernel::arg(&Params::size);
 
             auto interm = jit_kernel::var<typename std::conditional<is_src, SrcT[N], DstT[N]>::type>();
 
-            jit_kernel::load(interm, src_ptr, size);
-            jit_kernel::store(dst_ptr, interm, size);
+            jit_kernel::load(interm, src_ptr, M);
+            jit_kernel::store(dst_ptr, interm, M);
 
             jit_kernel::postamble();
         }
@@ -360,6 +358,7 @@ TEST(JitKernel, variable_load_and_store) {
     {
         jit_variable_load_store_test_kernel<float, bfloat16_t> kernel;
         if (mayiuse(cpu_isa_t::avx512_core)) {
+            kernel.test<16, 4, true>();
             kernel.test<16, 11, true>();
         }
         if (mayiuse(cpu_isa_t::avx2)) {