[CPU][ARM][x64] Snippets MatMul via brgemm emitter and executor #28304

Open · wants to merge 7 commits into base: master · changes from all commits
2 changes: 1 addition & 1 deletion cmake/features.cmake
@@ -52,7 +52,7 @@ ov_dependent_option (ENABLE_GPU_DEBUG_CAPS "enable GPU debug capabilities at run
ov_dependent_option (ENABLE_CPU_DEBUG_CAPS "enable CPU debug capabilities at runtime" ON "ENABLE_DEBUG_CAPS;ENABLE_INTEL_CPU" OFF)
ov_dependent_option (ENABLE_SNIPPETS_DEBUG_CAPS "enable Snippets debug capabilities at runtime" ON "ENABLE_DEBUG_CAPS" OFF)

ov_dependent_option (ENABLE_SNIPPETS_LIBXSMM_TPP "allow Snippets to use LIBXSMM Tensor Processing Primitives" OFF "ENABLE_INTEL_CPU AND X86_64" OFF)
Contributor: There are also RISCV64, AArch32, etc. Can we add only the supported archs to the condition?

Contributor (author): "ENABLE_INTEL_CPU AND (X86_64 OR AARCH64)" is now set as the TPP condition. Thanks, Alexandra!

ov_dependent_option (ENABLE_SNIPPETS_LIBXSMM_TPP "allow Snippets to use LIBXSMM Tensor Processing Primitives" OFF "ENABLE_INTEL_CPU AND (X86_64 OR AARCH64)" OFF)

ov_option (ENABLE_PROFILING_ITT "Build with ITT tracing. Optionally configure pre-built ittnotify library though INTEL_VTUNE_DIR variable." OFF)

10 changes: 9 additions & 1 deletion src/plugins/intel_cpu/CMakeLists.txt
@@ -160,6 +160,10 @@ if(ENABLE_CPU_DEBUG_CAPS)
add_definitions(-DCPU_DEBUG_CAPS)
endif()

if(AARCH64 AND (NOT ANDROID))
set(ENABLE_SNIPPETS_LIBXSMM_TPP ON)
endif()

if (ENABLE_SNIPPETS_LIBXSMM_TPP)
# Note: LIBXSMM_DEFAULT_CONFIG needed so libxsmm_config can be included without issues
add_definitions(-DSNIPPETS_LIBXSMM_TPP -DLIBXSMM_DEFAULT_CONFIG)
@@ -198,7 +202,9 @@ if(NOT X86_64)
${CMAKE_CURRENT_SOURCE_DIR}/src/nodes/kernels/x64/*
${CMAKE_CURRENT_SOURCE_DIR}/src/emitters/plugin/x64/*
${CMAKE_CURRENT_SOURCE_DIR}/src/emitters/snippets/x64/*
${CMAKE_CURRENT_SOURCE_DIR}/src/transformations/cpu_opset/x64/*)
${CMAKE_CURRENT_SOURCE_DIR}/src/emitters/tpp/x64/*
${CMAKE_CURRENT_SOURCE_DIR}/src/transformations/cpu_opset/x64/*
${CMAKE_CURRENT_SOURCE_DIR}/src/transformations/tpp/x64/*)
endif()

if (AARCH64)
@@ -208,7 +214,9 @@ endif()

if(NOT (AARCH64 OR ARM))
list(APPEND EXCLUDE_PATHS ${CMAKE_CURRENT_SOURCE_DIR}/src/transformations/cpu_opset/arm/*
${CMAKE_CURRENT_SOURCE_DIR}/src/transformations/tpp/aarch64/*
${CMAKE_CURRENT_SOURCE_DIR}/src/emitters/plugin/aarch64/*
${CMAKE_CURRENT_SOURCE_DIR}/src/emitters/tpp/aarch64/*
${CMAKE_CURRENT_SOURCE_DIR}/src/nodes/executors/aarch64/*
${CMAKE_CURRENT_SOURCE_DIR}/src/nodes/kernels/aarch64/*)
endif()
@@ -1,4 +1,4 @@
// Copyright (C) 2024 Intel Corporation
// Copyright (C) 2024-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

@@ -10,6 +10,7 @@
#include "emitters/snippets/aarch64/jit_kernel_emitter.hpp"
#include "emitters/snippets/aarch64/jit_loop_emitters.hpp"
#include "emitters/snippets/aarch64/jit_memory_emitters.hpp"
#include "emitters/snippets/cpu_kernel_executor_table.hpp"
#include "emitters/snippets/cpu_runtime_configurator.hpp"
#include "emitters/utils.hpp"
#include "jit_snippets_emitters.hpp"
@@ -24,12 +25,17 @@
#include "transformations/cpu_opset/common/op/swish_cpu.hpp"
#include "transformations/snippets/common/op/fused_mul_add.hpp"

#ifdef SNIPPETS_LIBXSMM_TPP
# include "emitters/tpp/aarch64/jit_brgemm_emitter.hpp"
# include "transformations/tpp/common/op/brgemm.hpp"
#endif

namespace ov {

#define CREATE_SNIPPETS_EMITTER(e_type) \
#define CREATE_SNIPPETS_EMITTER(e_type, ...) \
{ \
[this](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr<snippets::Emitter> { \
return std::make_shared<e_type>(h.get(), isa, expr); \
return std::make_shared<e_type>(h.get(), isa, expr, ##__VA_ARGS__); \
}, \
[](const std::shared_ptr<ov::Node>& n) -> std::set<std::vector<element::Type>> { \
return e_type::get_supported_precisions(n); \
@@ -202,6 +208,12 @@ CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::aarch64::cpu_isa_t host_isa,
jitters[ov::intel_cpu::SwishNode::get_type_info_static()] = CREATE_CPU_EMITTER(jit_swish_emitter);
jitters[ov::op::v0::Tanh::get_type_info_static()] = CREATE_CPU_EMITTER(jit_tanh_emitter);

#ifdef SNIPPETS_LIBXSMM_TPP
// brgemm
jitters[ov::intel_cpu::tpp::op::BrgemmTPP::get_type_info_static()] =
CREATE_SNIPPETS_EMITTER(jit_brgemm_emitter, configurator->get_kernel_executor_table(), compiled_kernel_cache);
#endif

// control flow
jitters[snippets::op::KernelStatic::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_kernel_static_emitter);
jitters[snippets::op::KernelDynamic::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_kernel_dynamic_emitter);
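Editor's note: the `CREATE_SNIPPETS_EMITTER` macro above was extended from a fixed-arity form to `(e_type, ...)` with `##__VA_ARGS__`, so emitters whose constructors need extra arguments (here the kernel executor table and the compiled kernel cache) can be registered through the same jitters table without touching existing call sites. A minimal standalone sketch of the pattern, with illustrative emitter types that are not the plugin's real classes:

```cpp
#include <memory>

// Stand-ins for the two emitter shapes: one needs only the common
// arguments, the other takes extra constructor parameters.
struct PlainEmitter {
    explicit PlainEmitter(int isa) : isa(isa) {}
    int isa;
};

struct BrgemmLikeEmitter {
    BrgemmLikeEmitter(int isa, int executor_table, int kernel_cache)
        : isa(isa), table(executor_table), cache(kernel_cache) {}
    int isa, table, cache;
};

// `##__VA_ARGS__` removes the trailing comma when the macro is invoked
// without extra arguments, so both invocation styles expand correctly.
#define CREATE_EMITTER(e_type, ...) \
    [](int isa) { return std::make_shared<e_type>(isa, ##__VA_ARGS__); }

int main() {
    auto make_plain = CREATE_EMITTER(PlainEmitter);             // old style
    auto make_brgemm = CREATE_EMITTER(BrgemmLikeEmitter, 1, 2); // new style
    return make_plain(42)->isa == make_brgemm(42)->isa ? 0 : 1;
}
```

`##__VA_ARGS__` is a GNU extension also accepted by Clang and MSVC; in C++20 the portable spelling is `__VA_OPT__(,) __VA_ARGS__`.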
233 changes: 233 additions & 0 deletions src/plugins/intel_cpu/src/emitters/snippets/brgemm_base.cpp
@@ -0,0 +1,233 @@
// Copyright (C) 2020-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "brgemm_base.hpp"

#include "common/utils.hpp"
#include "dnnl_extension_utils.h"
#include "utils/general_utils.h"

#define PRINT(X) ss << #X << " = " << X << "\n"
#define EQ(X) X == rhs.X
#define HASH(X) seed = dnnl::impl::hash_combine(seed, X)
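// Helper macros used below: PRINT streams a "name = value" line into `ss`
// for to_string(), EQ compares a member against the same member of `rhs`
// in operator==, and HASH folds a member into the running seed via
// dnnl's hash_combine.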

namespace ov {
namespace intel_cpu {

bool BrgemmBaseKernelConfig::is_completed() const {
return !one_of(0, m_M, m_N, m_K, m_LDA, m_LDB, m_LDC) || is_empty();
}

bool BrgemmBaseKernelConfig::is_empty() const {
return everyone_is(0, m_M, m_N, m_K, m_LDA, m_LDB, m_LDC, m_beta);
}

bool BrgemmBaseKernelConfig::operator==(const BrgemmBaseKernelConfig& rhs) const {
return EQ(m_beta) && EQ(m_M) && EQ(m_N) && EQ(m_K) && EQ(m_LDA) && EQ(m_LDB) && EQ(m_LDC);
}

void BrgemmBaseKernelConfig::update(int64_t M, int64_t N, int64_t K, float beta) {
// If M is zero, it means that Brgemm won't be executed (in a Loop with work_amount = 0, for example).
// To handle this case, we make the Config empty (nullify the runtime parameters).
if (one_of(0, M, N, K)) {
m_M = 0;
m_N = 0;
m_K = 0;
m_beta = 0;
} else {
m_M = M;
m_N = N;
m_K = K;
m_beta = beta;
}
}

void BrgemmBaseKernelConfig::update(int64_t M,
int64_t N,
int64_t K,
int64_t LDA,
int64_t LDB,
int64_t LDC,
float beta) {
// If M is zero, it means that Brgemm won't be executed (in a Loop with work_amount = 0, for example).
// To handle this case, we make the Config empty (nullify the runtime parameters).
if (one_of(0, M, N, K)) {
m_M = 0;
m_N = 0;
m_K = 0;
m_LDA = 0;
m_LDB = 0;
m_LDC = 0;
m_beta = 0;
} else {
m_M = M;
m_N = N;
m_K = K;
m_LDA = LDA;
m_LDB = LDB;
m_LDC = LDC;
m_beta = beta;
}
}

size_t BrgemmBaseKernelConfig::compute_hash() const {
size_t seed = 0;
HASH(m_M);
HASH(m_N);
HASH(m_K);
HASH(m_LDA);
HASH(m_LDB);
HASH(m_LDC);
HASH(m_beta);
return seed;
}

#ifdef SNIPPETS_DEBUG_CAPS
std::string BrgemmBaseKernelConfig::to_string() const {
std::stringstream ss;
PRINT(m_M);
PRINT(m_N);
PRINT(m_K);
PRINT(m_LDA);
PRINT(m_LDB);
PRINT(m_LDC);
PRINT(m_beta);
return ss.str();
}
#endif

float BrgemmBaseKernelExecutor::get_beta(const ov::snippets::lowered::LoopManagerPtr& loop_manager,
int loop_id,
const ov::snippets::lowered::ExpandedLoopInfoPtr& current_expanded_loop_info) {
// Find all Expanded loops with the same Unified loop information -> they were decomposed from this Unified Loop.
// Note that LoopInfo are normalized and sorted (due to the NormalizedLoopIDs pass).
// It means that previously executed Loops have a Loop ID less than the current Loop ID.
// - If there is an executed Loop (work_amount > 0) evaluated before the current one -> the current Brgemm
// should have `beta = 1`.
// - If there is no such Loop -> the current Brgemm should have `beta = 0`.
if (loop_id > 0) {
const auto& current_unified_loop_info = current_expanded_loop_info->get_unified_loop_info();
// Check the previous Loops
--loop_id;
while (loop_id >= 0) {
const auto& expanded_loop_info =
loop_manager->get_loop_info<ov::snippets::lowered::ExpandedLoopInfo>(loop_id);
if (expanded_loop_info->get_unified_loop_info() != current_unified_loop_info)
return 0;
if (expanded_loop_info->get_work_amount() > 0) {
// there is a previously executed Brgemm with `beta = 0` -> the current Brgemm should have `beta = 1`
return 1;
}
--loop_id;
}
}
return 0;
}

void BrgemmBaseKernelExecutor::update_config(const ov::snippets::lowered::ExpressionPtr& expr,
const ov::snippets::lowered::LinearIRCPtr& linear_ir,
BrgemmBaseKernelConfig& config) {
const auto& input_pds = expr->get_input_port_descriptors();
const auto& output_pds = expr->get_output_port_descriptors();
OV_CPU_JIT_EMITTER_ASSERT((input_pds.size() == 2 || input_pds.size() == 3) && output_pds.size() == 1,
"Invalid number of in/out port descriptors");

const auto in0_shape = snippets::utils::get_planar_vdims(input_pds[0]->get_shape(), input_pds[0]->get_layout());
const auto in1_shape = snippets::utils::get_planar_vdims(input_pds[1]->get_shape(), input_pds[1]->get_layout());
auto in0_subtensor = input_pds[0]->get_subtensor();
auto in1_subtensor = input_pds[1]->get_subtensor();

// Need to update M, K, N:
// 1. If the original value in the subtensor is `FULL_DIM`, the Brgemm block should process
// the full tensor along this dim -> take the dimension from the shape.
// 2. Otherwise, the Brgemm block processes a part of the tensor along this dim
// (there is blocking by this dimension) -> take the Loop increment.

auto M = *++in0_subtensor.rbegin();
auto K = *in0_subtensor.rbegin();
auto N = *in1_subtensor.rbegin();

size_t loop_idx = 0;
const auto& loop_ids = expr->get_loop_ids();
const auto& loop_manager = linear_ir->get_loop_manager();
auto get_loop_info = [&]() {
OPENVINO_ASSERT(loop_idx < loop_ids.size(), "Loop is missed");
return loop_manager->get_loop_info<ov::snippets::lowered::ExpandedLoopInfo>(loop_ids[loop_idx++]);
};

/* ------- Dimension M ----------*/
if (ov::snippets::utils::is_full_dim_value(M)) {
M = *++in0_shape.rbegin();
} else {
const auto& current_expanded_loop_info = get_loop_info();
const auto& in_ports = current_expanded_loop_info->get_input_ports();
const auto& out_ports = current_expanded_loop_info->get_output_ports();
// Quick validation check: Should we check that port is really Brgemm port?
// If BrgemmCopyB is in the Loop by M -> the first input port will be BrgemmCopyB with `incremented=false`.
// To avoid extra checks, we validate only the first input port.
auto check_port = [&](const ov::snippets::lowered::LoopPort& p) {
return p.get_dim_idx() == 1 && p.is_processed();
};
OPENVINO_ASSERT(
in_ports.size() > 1 && check_port(in_ports[0]) && out_ports.size() == 1 && check_port(out_ports[0]),
"Incorrect Loop by Brgemm dimension M");
M = current_expanded_loop_info->get_work_amount() > 0 ? current_expanded_loop_info->get_increment() : 0;
input_pds[0]->set_subtensor_dim(1, M);
output_pds[0]->set_subtensor_dim(1, M);
}

/* ------- Dimension N ----------*/
if (ov::snippets::utils::is_full_dim_value(N)) {
N = *in1_shape.rbegin();
} else {
const auto& current_expanded_loop_info = get_loop_info();
const auto& in_ports = current_expanded_loop_info->get_input_ports();
const auto& out_ports = current_expanded_loop_info->get_output_ports();
// Quick validation check: Should we check that port is really Brgemm port?
auto check_port = [&](const ov::snippets::lowered::LoopPort& p) {
return p.get_dim_idx() == 0 && p.is_processed();
};
OPENVINO_ASSERT(in_ports.size() >= 2 && !in_ports.front().is_processed() &&
std::all_of(in_ports.cbegin() + 1, in_ports.cend(), check_port) && out_ports.size() == 1 &&
check_port(out_ports.back()),
"Incorrect Loop by Brgemm dimension N");
N = current_expanded_loop_info->get_work_amount() > 0 ? current_expanded_loop_info->get_increment() : 0;
input_pds[1]->set_subtensor_dim(0, N);
output_pds[0]->set_subtensor_dim(0, N);
}

/* ------- Dimension K ----------*/
// 1. If the Brgemm block processes the full dimension K -> `beta = 0`.
// 2. If the Brgemm block processes a part of dimension K (there is blocking), we need to find
// the first executed Brgemm block among the Loops that iterate through dimension K (work_amount > 0).
// The first of them has `beta = 0`, the others have `beta = 1`.
float beta = 0;
if (ov::snippets::utils::is_full_dim_value(K)) {
K = *in0_shape.rbegin();
} else {
const auto& current_expanded_loop_info = get_loop_info();
const auto& in_ports = current_expanded_loop_info->get_input_ports();
const auto& out_ports = current_expanded_loop_info->get_output_ports();
// Quick validation check: Should we check that port is really Brgemm port?
OPENVINO_ASSERT(in_ports.size() >= 2 && in_ports.front().get_dim_idx() == 0 &&
in_ports.front().is_processed() && in_ports.back().get_dim_idx() == 1 &&
in_ports.back().is_processed() && out_ports.size() == 1 &&
!out_ports.front().is_processed(),
"Incorrect Loop by Brgemm dimension K");
K = current_expanded_loop_info->get_work_amount() > 0 ? current_expanded_loop_info->get_increment() : 0;
input_pds[0]->set_subtensor_dim(0, K);
input_pds[1]->set_subtensor_dim(1, K);
if (K > 0)
beta = get_beta(loop_manager, static_cast<int>(loop_ids.back()), current_expanded_loop_info);
}

config.update(static_cast<int64_t>(M), static_cast<int64_t>(N), static_cast<int64_t>(K), beta);
}

#undef PRINT
#undef EQ
#undef HASH

} // namespace intel_cpu
} // namespace ov
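Editor's note: to make the `get_beta` rule above concrete, here is a small self-contained sketch. The types are toy stand-ins, not the plugin's `ExpandedLoopInfo`/`LoopManager`, and it relies on the same assumption the comments state: expanded loops split from one unified K-loop are stored contiguously and lower IDs execute first.

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Toy stand-in for ExpandedLoopInfo: which unified loop this piece was
// split from, and how many iterations it will actually execute.
struct ExpandedLoop {
    int unified_id;
    std::size_t work_amount;
};

// Mirrors the idea of BrgemmBaseKernelExecutor::get_beta: the current
// Brgemm gets beta = 1 only if an earlier expanded piece of the *same*
// unified K-loop really executes (work_amount > 0); otherwise beta = 0.
float get_beta(const std::vector<ExpandedLoop>& loops, int loop_id) {
    const int unified = loops[loop_id].unified_id;
    for (int id = loop_id - 1; id >= 0; --id) {
        if (loops[id].unified_id != unified)
            return 0.0f;  // left the current unified loop's pieces
        if (loops[id].work_amount > 0)
            return 1.0f;  // a previous piece accumulates first
    }
    return 0.0f;
}

int main() {
    // K = 70 blocked by 32: the main body runs twice, the tail runs once.
    std::vector<ExpandedLoop> loops = {{/*unified_id=*/0, /*work_amount=*/2},
                                       {0, 1}};
    assert(get_beta(loops, 0) == 0.0f);  // first piece overwrites C
    assert(get_beta(loops, 1) == 1.0f);  // tail accumulates into C
    return 0;
}
```

In this toy setup the main body's Brgemm writes C with `beta = 0`, and the tail block accumulates into the same C with `beta = 1`, which is exactly the decomposed-K-loop behavior the executor's comments describe.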