Skip to content

[CPU][ARM][x64]Snippets MatMul via brgemm emitter and executor #28304

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
2 changes: 1 addition & 1 deletion cmake/features.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ ov_dependent_option (ENABLE_GPU_DEBUG_CAPS "enable GPU debug capabilities at run
ov_dependent_option (ENABLE_CPU_DEBUG_CAPS "enable CPU debug capabilities at runtime" ON "ENABLE_DEBUG_CAPS;ENABLE_INTEL_CPU" OFF)
ov_dependent_option (ENABLE_SNIPPETS_DEBUG_CAPS "enable Snippets debug capabilities at runtime" ON "ENABLE_DEBUG_CAPS" OFF)

ov_dependent_option (ENABLE_SNIPPETS_LIBXSMM_TPP "allow Snippets to use LIBXSMM Tensor Processing Primitives" OFF "ENABLE_INTEL_CPU AND X86_64" OFF)
ov_dependent_option (ENABLE_SNIPPETS_LIBXSMM_TPP "allow Snippets to use LIBXSMM Tensor Processing Primitives" OFF "ENABLE_INTEL_CPU AND (X86_64 OR AARCH64)" OFF)

ov_option (ENABLE_PROFILING_ITT "Build with ITT tracing. Optionally configure pre-built ittnotify library though INTEL_VTUNE_DIR variable." OFF)

Expand Down
6 changes: 5 additions & 1 deletion src/plugins/intel_cpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,9 @@ if(NOT X86_64)
${CMAKE_CURRENT_SOURCE_DIR}/src/nodes/kernels/x64/*
${CMAKE_CURRENT_SOURCE_DIR}/src/emitters/plugin/x64/*
${CMAKE_CURRENT_SOURCE_DIR}/src/emitters/snippets/x64/*
${CMAKE_CURRENT_SOURCE_DIR}/src/transformations/cpu_opset/x64/*)
${CMAKE_CURRENT_SOURCE_DIR}/src/emitters/tpp/x64/*
${CMAKE_CURRENT_SOURCE_DIR}/src/transformations/cpu_opset/x64/*
${CMAKE_CURRENT_SOURCE_DIR}/src/transformations/tpp/x64/*)
endif()

if (AARCH64)
Expand All @@ -218,7 +220,9 @@ endif()

if(NOT (AARCH64 OR ARM))
list(APPEND EXCLUDE_PATHS ${CMAKE_CURRENT_SOURCE_DIR}/src/transformations/cpu_opset/arm/*
${CMAKE_CURRENT_SOURCE_DIR}/src/transformations/tpp/aarch64/*
${CMAKE_CURRENT_SOURCE_DIR}/src/emitters/plugin/aarch64/*
${CMAKE_CURRENT_SOURCE_DIR}/src/emitters/tpp/aarch64/*
${CMAKE_CURRENT_SOURCE_DIR}/src/nodes/executors/aarch64/*
${CMAKE_CURRENT_SOURCE_DIR}/src/nodes/kernels/aarch64/*)
endif()
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (C) 2024 Intel Corporation
// Copyright (C) 2024-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

Expand All @@ -10,6 +10,7 @@
#include "emitters/snippets/aarch64/jit_kernel_emitter.hpp"
#include "emitters/snippets/aarch64/jit_loop_emitters.hpp"
#include "emitters/snippets/aarch64/jit_memory_emitters.hpp"
#include "emitters/snippets/cpu_kernel_executor_table.hpp"
#include "emitters/snippets/cpu_runtime_configurator.hpp"
#include "emitters/utils.hpp"
#include "jit_snippets_emitters.hpp"
Expand All @@ -24,12 +25,17 @@
#include "transformations/cpu_opset/common/op/swish_cpu.hpp"
#include "transformations/snippets/common/op/fused_mul_add.hpp"

#ifdef SNIPPETS_LIBXSMM_TPP
# include "emitters/tpp/aarch64/jit_brgemm_emitter.hpp"
# include "transformations/tpp/common/op/brgemm.hpp"
#endif

namespace ov {

#define CREATE_SNIPPETS_EMITTER(e_type) \
#define CREATE_SNIPPETS_EMITTER(e_type, ...) \
{ \
[this](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr<snippets::Emitter> { \
return std::make_shared<e_type>(h.get(), isa, expr); \
return std::make_shared<e_type>(h.get(), isa, expr, ##__VA_ARGS__); \
}, \
[](const std::shared_ptr<ov::Node>& n) -> std::set<std::vector<element::Type>> { \
return e_type::get_supported_precisions(n); \
Expand Down Expand Up @@ -201,6 +207,12 @@ CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::aarch64::cpu_isa_t host_isa,
jitters[ov::intel_cpu::SwishNode::get_type_info_static()] = CREATE_CPU_EMITTER(jit_swish_emitter);
jitters[ov::op::v0::Tanh::get_type_info_static()] = CREATE_CPU_EMITTER(jit_tanh_emitter);

#ifdef SNIPPETS_LIBXSMM_TPP
// brgemm
jitters[ov::intel_cpu::tpp::op::BrgemmTPP::get_type_info_static()] =
CREATE_SNIPPETS_EMITTER(jit_brgemm_emitter, configurator->get_kernel_executor_table(), compiled_kernel_cache);
#endif

// control flow
jitters[snippets::op::KernelStatic::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_kernel_static_emitter);
jitters[snippets::op::KernelDynamic::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_kernel_dynamic_emitter);
Expand Down
218 changes: 218 additions & 0 deletions src/plugins/intel_cpu/src/emitters/snippets/brgemm_generic.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,218 @@
// Copyright (C) 2020-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "brgemm_generic.hpp"

#include "common/utils.hpp"
#include "dnnl_extension_utils.h"
#include "utils/general_utils.h"

#define PRINT(X) ss << #X << " = " << X << "\n"
#define EQ(X) X == rhs.X
#define HASH(X) seed = dnnl::impl::hash_combine(seed, X)

namespace ov::intel_cpu {

bool BrgemmGenericKernelConfig::is_completed() const {
    // A config is "completed" (ready to be used for kernel compilation) when every GEMM
    // parameter is non-zero, or when it was deliberately emptied by update() — an empty
    // config means the Brgemm won't be executed at all, which is also a valid final state.
    const bool any_param_is_zero = m_M == 0 || m_N == 0 || m_K == 0 || m_LDA == 0 || m_LDB == 0 || m_LDC == 0;
    return !any_param_is_zero || is_empty();
}

bool BrgemmGenericKernelConfig::is_empty() const {
    // Empty <=> every runtime parameter (including beta) was nullified by update().
    return m_M == 0 && m_N == 0 && m_K == 0 && m_LDA == 0 && m_LDB == 0 && m_LDC == 0 && m_beta == 0;
}

bool BrgemmGenericKernelConfig::operator==(const BrgemmGenericKernelConfig& rhs) const {
    // Two configs are equal iff all runtime GEMM parameters match field-by-field.
    return m_M == rhs.m_M && m_N == rhs.m_N && m_K == rhs.m_K && m_LDA == rhs.m_LDA && m_LDB == rhs.m_LDB &&
           m_LDC == rhs.m_LDC && m_beta == rhs.m_beta;
}

void BrgemmGenericKernelConfig::update(int64_t M, int64_t N, int64_t K, int64_t LDA, int64_t LDB, int64_t LDC, float beta) {
    // If M/N/K is zero, the Brgemm won't be executed (e.g. it sits in a Loop with
    // work_amount = 0). In that case nullify ALL runtime parameters so the config
    // compares equal to the canonical "empty" state (see is_empty()).
    const bool executable = M != 0 && N != 0 && K != 0;
    m_M = executable ? M : 0;
    m_N = executable ? N : 0;
    m_K = executable ? K : 0;
    m_LDA = executable ? LDA : 0;
    m_LDB = executable ? LDB : 0;
    m_LDC = executable ? LDC : 0;
    m_beta = executable ? beta : 0;
}

size_t BrgemmGenericKernelConfig::compute_hash() const {
size_t seed = 0;
HASH(m_M);
HASH(m_N);
HASH(m_K);
HASH(m_LDA);
HASH(m_LDB);
HASH(m_LDC);
HASH(m_beta);
return seed;
}

#ifdef SNIPPETS_DEBUG_CAPS
std::string BrgemmGenericKernelConfig::to_string() const {
    // Debug-only dump: one "name = value" pair per line, mirroring the member names.
    std::stringstream ss;
    ss << "m_M = " << m_M << "\n";
    ss << "m_N = " << m_N << "\n";
    ss << "m_K = " << m_K << "\n";
    ss << "m_LDA = " << m_LDA << "\n";
    ss << "m_LDB = " << m_LDB << "\n";
    ss << "m_LDC = " << m_LDC << "\n";
    ss << "m_beta = " << m_beta << "\n";
    return ss.str();
}
#endif

float BrgemmKernelExecutorHelper::get_beta(
    const ov::snippets::lowered::LoopManagerPtr& loop_manager,
    int loop_id,
    const ov::snippets::lowered::ExpandedLoopInfoPtr& current_expanded_loop_info) {
    // Expanded loops decomposed from the same Unified loop share one UnifiedLoopInfo,
    // and LoopInfos are normalized and sorted (NormalizedLoopIDs pass), so loops executed
    // earlier have smaller IDs than the current one. Scan the preceding loops:
    // - if one of them belongs to the same Unified loop and was actually executed
    //   (work_amount > 0), a Brgemm already wrote to the output -> accumulate with `beta = 1`;
    // - otherwise this is the first executed Brgemm of the decomposition -> `beta = 0`.
    if (loop_id <= 0) {
        return 0;
    }
    const auto& current_unified_loop_info = current_expanded_loop_info->get_unified_loop_info();
    for (int prev_id = loop_id - 1; prev_id >= 0; --prev_id) {
        const auto& prev_loop_info = loop_manager->get_loop_info<ov::snippets::lowered::ExpandedLoopInfo>(prev_id);
        if (prev_loop_info->get_unified_loop_info() != current_unified_loop_info) {
            // Reached a loop from another Unified loop: no earlier Brgemm of this decomposition ran.
            return 0;
        }
        if (prev_loop_info->get_work_amount() > 0) {
            // A previously executed Brgemm (with `beta = 0`) exists -> current one accumulates.
            return 1;
        }
    }
    return 0;
}

// Resolves the runtime M, N, K dimensions and the accumulation factor `beta` of a Brgemm
// expression from its port descriptors and the Loops it is nested in.
// For each dimension:
//   1. If the subtensor value is `FULL_DIM`, the Brgemm block processes the whole tensor
//      by this dim -> take the dimension from the planar shape.
//   2. Otherwise the dimension is blocked -> take the Loop increment (or 0 if the Loop
//      has work_amount == 0, i.e. the Brgemm won't be executed) and write the resolved
//      value back into the port descriptors' subtensors (side effect on `expr`).
// Loop IDs are consumed in order M -> N -> K via `get_loop_info`, so blocked dimensions
// must appear in the expression's loop nest in exactly that order.
std::tuple<int64_t, int64_t, int64_t, float> BrgemmKernelExecutorHelper::get_runtime_brgemm_params(
    const ov::snippets::lowered::ExpressionPtr& expr,
    const ov::snippets::lowered::LinearIRCPtr& linear_ir) {
    const auto& input_pds = expr->get_input_port_descriptors();
    const auto& output_pds = expr->get_output_port_descriptors();
    // 2 inputs (A, B) or 3 (A, B, scratchpad/compensations input), always 1 output.
    OV_CPU_JIT_EMITTER_ASSERT((input_pds.size() == 2 || input_pds.size() == 3) && output_pds.size() == 1,
                              "Invalid number of in/out port descriptors");

    const auto& in0_shape = snippets::utils::get_planar_vdims(input_pds[0]->get_shape(), input_pds[0]->get_layout());
    const auto& in1_shape = snippets::utils::get_planar_vdims(input_pds[1]->get_shape(), input_pds[1]->get_layout());
    const auto& in0_subtensor = input_pds[0]->get_subtensor();
    const auto& in1_subtensor = input_pds[1]->get_subtensor();

    // Need to update M, K, N
    // 1. If the original value in subtensor is `FULL_DIM`, it means that
    //    Brgemm block should process full tensor by this dim -> take dimension from shape
    // 2. Otherwise, Brgemm block processes part of the tensor by this dim
    //    (there is blocking by this dimension) -> take from Loop increment

    // A is [M x K], B is [K x N]: M/K are the last two dims of input 0, N is the last dim of input 1.
    auto M = *++in0_subtensor.rbegin();
    auto K = *in0_subtensor.rbegin();
    auto N = *in1_subtensor.rbegin();

    size_t loop_idx = 0;
    const auto& loop_ids = expr->get_loop_ids();
    const auto& loop_manager = linear_ir->get_loop_manager();
    // Consumes the next loop ID on each call: the expression's loops are expected to be
    // ordered so that blocked dimensions come as M, then N, then K.
    auto get_loop_info = [&]() {
        OPENVINO_ASSERT(loop_idx < loop_ids.size(), "Loop is missed");
        return loop_manager->get_loop_info<ov::snippets::lowered::ExpandedLoopInfo>(loop_ids[loop_idx++]);
    };

    /* ------- Dimension M ----------*/
    if (ov::snippets::utils::is_full_dim_value(M)) {
        M = *++in0_shape.rbegin();
    } else {
        const auto& current_expanded_loop_info = get_loop_info();
        const auto& in_ports = current_expanded_loop_info->get_input_ports();
        const auto& out_ports = current_expanded_loop_info->get_output_ports();
        // Quick validation check: Should we check that port is really Brgemm port?
        // If BrgemmCopyB in the Loop by M -> first input port will be BrgemmCopyB with `incremented=false`
        // to avoid extra checks, we validate only first input port
        auto check_port = [&](const ov::snippets::lowered::LoopPort& p) {
            return p.get_dim_idx() == 1 && p.is_processed();
        };
        OPENVINO_ASSERT(
            in_ports.size() > 1 && check_port(in_ports[0]) && out_ports.size() == 1 && check_port(out_ports[0]),
            "Incorrect Loop by Brgemm dimension M");
        // work_amount == 0 means the Loop (and so the Brgemm) won't be executed at all.
        M = current_expanded_loop_info->get_work_amount() > 0 ? current_expanded_loop_info->get_increment() : 0;
        input_pds[0]->set_subtensor_dim(1, M);
        output_pds[0]->set_subtensor_dim(1, M);
    }

    /* ------- Dimension N ----------*/
    if (ov::snippets::utils::is_full_dim_value(N)) {
        N = *in1_shape.rbegin();
    } else {
        const auto& current_expanded_loop_info = get_loop_info();
        const auto& in_ports = current_expanded_loop_info->get_input_ports();
        const auto& out_ports = current_expanded_loop_info->get_output_ports();
        // Quick validation check: Should we check that port is really Brgemm port?
        // In the N loop, input 0 (A) is not processed; all remaining inputs and the output
        // iterate over their innermost dim (dim_idx == 0).
        auto check_port = [&](const ov::snippets::lowered::LoopPort& p) {
            return p.get_dim_idx() == 0 && p.is_processed();
        };
        OPENVINO_ASSERT(in_ports.size() >= 2 && !in_ports.front().is_processed() &&
                            std::all_of(in_ports.cbegin() + 1, in_ports.cend(), check_port) && out_ports.size() == 1 &&
                            check_port(out_ports.back()),
                        "Incorrect Loop by Brgemm dimension N");
        N = current_expanded_loop_info->get_work_amount() > 0 ? current_expanded_loop_info->get_increment() : 0;
        input_pds[1]->set_subtensor_dim(0, N);
        output_pds[0]->set_subtensor_dim(0, N);
    }

    /* ------- Dimension K ----------*/
    // 1. If Brgemm block processes full dimension K -> `beta = 0`
    // 2. If Brgemm block processes part of the dimension K (there is blocking), need to find
    //    the most first executed Brgemm Block in Loops which iterate through dimension K (work_amount > 0).
    //    First of them will have `beta = 0`, other - `beta = 1`
    float beta = 0;
    if (ov::snippets::utils::is_full_dim_value(K)) {
        K = *in0_shape.rbegin();
    } else {
        const auto& current_expanded_loop_info = get_loop_info();
        const auto& in_ports = current_expanded_loop_info->get_input_ports();
        const auto& out_ports = current_expanded_loop_info->get_output_ports();
        // Quick validation check: Should we check that port is really Brgemm port?
        // In the K loop, A iterates its innermost dim (dim_idx == 0), B its second innermost
        // (dim_idx == 1), and the output is not incremented (accumulation in place).
        OPENVINO_ASSERT(in_ports.size() >= 2 && in_ports.front().get_dim_idx() == 0 &&
                            in_ports.front().is_processed() && in_ports.back().get_dim_idx() == 1 &&
                            in_ports.back().is_processed() && out_ports.size() == 1 &&
                            !out_ports.front().is_processed(),
                        "Incorrect Loop by Brgemm dimension K");
        K = current_expanded_loop_info->get_work_amount() > 0 ? current_expanded_loop_info->get_increment() : 0;
        input_pds[0]->set_subtensor_dim(0, K);
        input_pds[1]->set_subtensor_dim(1, K);
        if (K > 0) {
            // NOTE(review): `loop_ids.back()` assumes the K loop is the innermost (last) loop
            // of the expression — confirm this always matches the loop consumed by get_loop_info().
            beta = get_beta(loop_manager, static_cast<int>(loop_ids.back()), current_expanded_loop_info);
        }
    }

    return std::make_tuple(M, N, K, beta);
}

#undef PRINT
#undef EQ
#undef HASH

} // namespace ov::intel_cpu
76 changes: 76 additions & 0 deletions src/plugins/intel_cpu/src/emitters/snippets/brgemm_generic.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
// Copyright (C) 2020-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "emitters/snippets/cpu_kernel_executor_table.hpp"
#include "emitters/utils.hpp"
#include "snippets/lowered/loop_info.hpp"
#include "snippets/lowered/loop_manager.hpp"
#include "utils/general_utils.h"

namespace ov::intel_cpu {

// Platform-agnostic runtime configuration of a Brgemm kernel: GEMM dimensions (M, N, K),
// leading dimensions of A/B/C (LDA/LDB/LDC) and the output accumulation factor `beta`.
// Concrete backends derive from this config and extend update()/hashing as needed.
struct BrgemmGenericKernelConfig : public snippets::KernelExecutorBase::GenericConfig {
public:
    BrgemmGenericKernelConfig() = default;

    // True when all parameters are non-zero (ready for compilation) or the config is empty.
    bool is_completed() const override;
    // True when every parameter (including beta) is zero, i.e. the Brgemm won't be executed.
    bool is_empty() const;

    // Sets the runtime parameters; if any of M/N/K is 0, nullifies ALL parameters
    // to mark the config as empty. Virtual so backends can extend the update logic.
    virtual void update(int64_t M, int64_t N, int64_t K, int64_t LDA, int64_t LDB, int64_t LDC, float beta);

    // Field-by-field equality over all runtime parameters.
    bool operator==(const BrgemmGenericKernelConfig& rhs) const;
    bool operator!=(const BrgemmGenericKernelConfig& rhs) const {
        return !(*this == rhs);
    }

    int64_t get_M() const {
        return m_M;
    }
    int64_t get_N() const {
        return m_N;
    }
    int64_t get_K() const {
        return m_K;
    }
    float get_beta() const {
        return m_beta;
    }
    int64_t get_LDA() const {
        return m_LDA;
    }
    int64_t get_LDB() const {
        return m_LDB;
    }
    int64_t get_LDC() const {
        return m_LDC;
    }

#ifdef SNIPPETS_DEBUG_CAPS
    // Debug-only dump of all parameters, one "name = value" per line.
    std::string to_string() const override;
#endif

protected:
    // Stable hash over all runtime parameters (used to identify compiled kernels).
    // NOTE(review): not marked `override` — confirm whether the base GenericConfig
    // declares a virtual compute_hash() this is meant to implement.
    size_t compute_hash() const;

    // Runtime GEMM parameters; zero-initialized = "empty" (Brgemm not executed).
    int64_t m_M{0}, m_N{0}, m_K{0}, m_LDA{0}, m_LDB{0}, m_LDC{0};
    float m_beta{0};
};

// Stateless helper shared by platform-specific Brgemm kernel executors: derives runtime
// Brgemm parameters (M, N, K, beta) from the lowered Linear IR loop structure.
class BrgemmKernelExecutorHelper {
public:
    virtual ~BrgemmKernelExecutorHelper() = default;

    // Returns the accumulation factor for the Brgemm inside the K-blocking loop `loop_id`:
    // 1 if an earlier expanded loop of the same unified loop was executed (work_amount > 0),
    // 0 otherwise (first executed Brgemm of the decomposition, or no K blocking).
    static float get_beta(const ov::snippets::lowered::LoopManagerPtr& loop_manager,
                          int loop_id,
                          const ov::snippets::lowered::ExpandedLoopInfoPtr& current_expanded_loop_info);

    // This function returns M, N, K dimensions and beta of brgemm as a tuple, based on loop info in linear_ir.
    // Side effect: writes resolved blocked dimensions back into `expr`'s port-descriptor subtensors.
    static std::tuple<int64_t, int64_t, int64_t, float> get_runtime_brgemm_params(
        const ov::snippets::lowered::ExpressionPtr& expr,
        const ov::snippets::lowered::LinearIRCPtr& linear_ir);
};

} // namespace ov::intel_cpu
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,10 @@
# include "emitters/tpp/x64/jit_eltwise_emitters.hpp"
# include "emitters/tpp/x64/jit_equation_emitter.hpp"
# include "emitters/tpp/x64/jit_scalar_emitter.hpp"
# include "transformations/tpp/x64/op/brgemm.hpp"
# include "transformations/tpp/common/op/brgemm.hpp"
# include "transformations/tpp/common/op/modifiers.hpp"
# include "transformations/tpp/x64/op/eltwise.hpp"
# include "transformations/tpp/x64/op/equation.hpp"
# include "transformations/tpp/x64/op/modifiers.hpp"
# include "transformations/tpp/x64/op/reduce.hpp"
# include "transformations/tpp/x64/op/scalar.hpp"
// Note: for reference implementations
Expand Down Expand Up @@ -295,7 +295,8 @@ intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_t ho
#endif

#ifdef SNIPPETS_LIBXSMM_TPP
jitters[intel_cpu::tpp::op::BrgemmTPP::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(BrgemmTppEmitter);
jitters[intel_cpu::tpp::op::BrgemmTPP::get_type_info_static()] =
CREATE_SNIPPETS_EMITTER(BrgemmTppEmitter, configurator->get_kernel_executor_table(), compiled_kernel_cache);
jitters[intel_cpu::tpp::op::Add::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(BinaryEltwiseTppEmitter);
jitters[intel_cpu::tpp::op::Subtract::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(BinaryEltwiseTppEmitter);
jitters[intel_cpu::tpp::op::Multiply::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(BinaryEltwiseTppEmitter);
Expand Down
Loading
Loading