[CPU][ARM][x64] Snippets MatMul via brgemm emitter and executor #28304

Open · wants to merge 7 commits into base: master · changes from all commits
2 changes: 1 addition & 1 deletion cmake/features.cmake
@@ -52,7 +52,7 @@ ov_dependent_option (ENABLE_GPU_DEBUG_CAPS "enable GPU debug capabilities at run
ov_dependent_option (ENABLE_CPU_DEBUG_CAPS "enable CPU debug capabilities at runtime" ON "ENABLE_DEBUG_CAPS;ENABLE_INTEL_CPU" OFF)
ov_dependent_option (ENABLE_SNIPPETS_DEBUG_CAPS "enable Snippets debug capabilities at runtime" ON "ENABLE_DEBUG_CAPS" OFF)

ov_dependent_option (ENABLE_SNIPPETS_LIBXSMM_TPP "allow Snippets to use LIBXSMM Tensor Processing Primitives" OFF "ENABLE_INTEL_CPU AND X86_64" OFF)
Contributor: There are also RISCV64, AArch32, etc. Can we add only the supported archs to the condition?

Contributor (author): "ENABLE_INTEL_CPU AND (X86_64 OR AARCH64)" is now set as the TPP condition. Thanks, Alexandra!

ov_dependent_option (ENABLE_SNIPPETS_LIBXSMM_TPP "allow Snippets to use LIBXSMM Tensor Processing Primitives" OFF "ENABLE_INTEL_CPU AND (X86_64 OR AARCH64)" OFF)

ov_option (ENABLE_PROFILING_ITT "Build with ITT tracing. Optionally configure pre-built ittnotify library though INTEL_VTUNE_DIR variable." OFF)

10 changes: 9 additions & 1 deletion src/plugins/intel_cpu/CMakeLists.txt
@@ -160,6 +160,10 @@ if(ENABLE_CPU_DEBUG_CAPS)
add_definitions(-DCPU_DEBUG_CAPS)
endif()

if(AARCH64 AND (NOT ANDROID))
set(ENABLE_SNIPPETS_LIBXSMM_TPP ON)
endif()

if (ENABLE_SNIPPETS_LIBXSMM_TPP)
# Note: LIBXSMM_DEFAULT_CONFIG needed so libxsmm_config can be included without issues
add_definitions(-DSNIPPETS_LIBXSMM_TPP -DLIBXSMM_DEFAULT_CONFIG)
@@ -198,7 +202,9 @@ if(NOT X86_64)
${CMAKE_CURRENT_SOURCE_DIR}/src/nodes/kernels/x64/*
${CMAKE_CURRENT_SOURCE_DIR}/src/emitters/plugin/x64/*
${CMAKE_CURRENT_SOURCE_DIR}/src/emitters/snippets/x64/*
${CMAKE_CURRENT_SOURCE_DIR}/src/transformations/cpu_opset/x64/*)
${CMAKE_CURRENT_SOURCE_DIR}/src/emitters/tpp/x64/*
${CMAKE_CURRENT_SOURCE_DIR}/src/transformations/cpu_opset/x64/*
${CMAKE_CURRENT_SOURCE_DIR}/src/transformations/tpp/x64/*)
endif()

if (AARCH64)
@@ -208,7 +214,9 @@ endif()

if(NOT (AARCH64 OR ARM))
list(APPEND EXCLUDE_PATHS ${CMAKE_CURRENT_SOURCE_DIR}/src/transformations/cpu_opset/arm/*
${CMAKE_CURRENT_SOURCE_DIR}/src/transformations/tpp/aarch64/*
${CMAKE_CURRENT_SOURCE_DIR}/src/emitters/plugin/aarch64/*
${CMAKE_CURRENT_SOURCE_DIR}/src/emitters/tpp/aarch64/*
${CMAKE_CURRENT_SOURCE_DIR}/src/nodes/executors/aarch64/*
${CMAKE_CURRENT_SOURCE_DIR}/src/nodes/kernels/aarch64/*)
endif()
@@ -1,4 +1,4 @@
// Copyright (C) 2024 Intel Corporation
// Copyright (C) 2024-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

@@ -10,6 +10,7 @@
#include "emitters/snippets/aarch64/jit_kernel_emitter.hpp"
#include "emitters/snippets/aarch64/jit_loop_emitters.hpp"
#include "emitters/snippets/aarch64/jit_memory_emitters.hpp"
#include "emitters/snippets/cpu_kernel_executor_table.hpp"
#include "emitters/snippets/cpu_runtime_configurator.hpp"
#include "emitters/utils.hpp"
#include "jit_snippets_emitters.hpp"
@@ -24,12 +25,17 @@
#include "transformations/cpu_opset/common/op/swish_cpu.hpp"
#include "transformations/snippets/common/op/fused_mul_add.hpp"

#ifdef SNIPPETS_LIBXSMM_TPP
# include "emitters/tpp/aarch64/jit_brgemm_emitter.hpp"
# include "transformations/tpp/common/op/brgemm.hpp"
#endif

namespace ov {

#define CREATE_SNIPPETS_EMITTER(e_type) \
#define CREATE_SNIPPETS_EMITTER(e_type, ...) \
{ \
[this](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr<snippets::Emitter> { \
return std::make_shared<e_type>(h.get(), isa, expr); \
return std::make_shared<e_type>(h.get(), isa, expr, ##__VA_ARGS__); \
}, \
[](const std::shared_ptr<ov::Node>& n) -> std::set<std::vector<element::Type>> { \
return e_type::get_supported_precisions(n); \
@@ -202,6 +208,12 @@ CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::aarch64::cpu_isa_t host_isa,
jitters[ov::intel_cpu::SwishNode::get_type_info_static()] = CREATE_CPU_EMITTER(jit_swish_emitter);
jitters[ov::op::v0::Tanh::get_type_info_static()] = CREATE_CPU_EMITTER(jit_tanh_emitter);

#ifdef SNIPPETS_LIBXSMM_TPP
// brgemm
jitters[ov::intel_cpu::tpp::op::BrgemmTPP::get_type_info_static()] =
CREATE_SNIPPETS_EMITTER(jit_brgemm_emitter, configurator->get_kernel_executor_table(), compiled_kernel_cache);
#endif

// control flow
jitters[snippets::op::KernelStatic::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_kernel_static_emitter);
jitters[snippets::op::KernelDynamic::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_kernel_dynamic_emitter);
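Editor's note: the `CREATE_SNIPPETS_EMITTER` macro above was extended from a fixed-arity form to `(e_type, ...)` with `##__VA_ARGS__`, so emitters whose constructors need extra arguments (here the kernel executor table and the compiled kernel cache) can be registered through the same jitters table without touching existing call sites. A minimal standalone sketch of the pattern, with illustrative emitter types that are not the plugin's real classes:

```cpp
#include <memory>

// Stand-ins for the two emitter shapes: one needs only the common
// arguments, the other takes extra constructor parameters.
struct PlainEmitter {
    explicit PlainEmitter(int isa) : isa(isa) {}
    int isa;
};

struct BrgemmLikeEmitter {
    BrgemmLikeEmitter(int isa, int executor_table, int kernel_cache)
        : isa(isa), table(executor_table), cache(kernel_cache) {}
    int isa, table, cache;
};

// `##__VA_ARGS__` removes the trailing comma when the macro is invoked
// without extra arguments, so both invocation styles expand correctly.
#define CREATE_EMITTER(e_type, ...) \
    [](int isa) { return std::make_shared<e_type>(isa, ##__VA_ARGS__); }

int main() {
    auto make_plain = CREATE_EMITTER(PlainEmitter);             // old style
    auto make_brgemm = CREATE_EMITTER(BrgemmLikeEmitter, 1, 2); // new style
    return make_plain(42)->isa == make_brgemm(42)->isa ? 0 : 1;
}
```

`##__VA_ARGS__` is a GNU extension also accepted by Clang and MSVC; in C++20 the portable spelling is `__VA_OPT__(,) __VA_ARGS__`.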
233 changes: 233 additions & 0 deletions src/plugins/intel_cpu/src/emitters/snippets/brgemm_base.cpp
@@ -0,0 +1,233 @@
// Copyright (C) 2020-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "brgemm_base.hpp"

#include "common/utils.hpp"
#include "dnnl_extension_utils.h"
#include "utils/general_utils.h"

#define PRINT(X) ss << #X << " = " << X << "\n"
#define EQ(X) X == rhs.X
#define HASH(X) seed = dnnl::impl::hash_combine(seed, X)
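// Helper macros used below: PRINT streams a "name = value" line into `ss`
// for to_string(), EQ compares a member against the same member of `rhs`
// in operator==, and HASH folds a member into the running seed via
// dnnl's hash_combine.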

namespace ov {
namespace intel_cpu {

bool BrgemmBaseKernelConfig::is_completed() const {
return !one_of(0, m_M, m_N, m_K, m_LDA, m_LDB, m_LDC) || is_empty();
}

bool BrgemmBaseKernelConfig::is_empty() const {
return everyone_is(0, m_M, m_N, m_K, m_LDA, m_LDB, m_LDC, m_beta);
}

bool BrgemmBaseKernelConfig::operator==(const BrgemmBaseKernelConfig& rhs) const {
return EQ(m_beta) && EQ(m_M) && EQ(m_N) && EQ(m_K) && EQ(m_LDA) && EQ(m_LDB) && EQ(m_LDC);
}

void BrgemmBaseKernelConfig::update(int64_t M, int64_t N, int64_t K, float beta) {
// If M is zero, it means that Brgemm won't be executed (in a Loop with work_amount = 0, for example).
// To handle this case, we make the Config empty (nullify the runtime parameters).
if (one_of(0, M, N, K)) {
m_M = 0;
m_N = 0;
m_K = 0;
m_beta = 0;
} else {
m_M = M;
m_N = N;
m_K = K;
m_beta = beta;
}
}

void BrgemmBaseKernelConfig::update(int64_t M,
int64_t N,
int64_t K,
int64_t LDA,
int64_t LDB,
int64_t LDC,
float beta) {
// If M is zero, it means that Brgemm won't be executed (in a Loop with work_amount = 0, for example).
// To handle this case, we make the Config empty (nullify the runtime parameters).
if (one_of(0, M, N, K)) {
m_M = 0;
m_N = 0;
m_K = 0;
m_LDA = 0;
m_LDB = 0;
m_LDC = 0;
m_beta = 0;
} else {
m_M = M;
m_N = N;
m_K = K;
m_LDA = LDA;
m_LDB = LDB;
m_LDC = LDC;
m_beta = beta;
}
}

size_t BrgemmBaseKernelConfig::compute_hash() const {
size_t seed = 0;
HASH(m_M);
HASH(m_N);
HASH(m_K);
HASH(m_LDA);
HASH(m_LDB);
HASH(m_LDC);
HASH(m_beta);
return seed;
}

#ifdef SNIPPETS_DEBUG_CAPS
std::string BrgemmBaseKernelConfig::to_string() const {
std::stringstream ss;
PRINT(m_M);
PRINT(m_N);
PRINT(m_K);
PRINT(m_LDA);
PRINT(m_LDB);
PRINT(m_LDC);
PRINT(m_beta);
return ss.str();
}
#endif

float BrgemmBaseKernelExecutor::get_beta(const ov::snippets::lowered::LoopManagerPtr& loop_manager,
int loop_id,
const ov::snippets::lowered::ExpandedLoopInfoPtr& current_expanded_loop_info) {
// Find all Expanded loops with the same Unified loop information -> they were decomposed from this Unified Loop.
// Note that LoopInfo are normalized and sorted (due to the NormalizedLoopIDs pass).
// It means that previously executed Loops have a Loop ID less than the current Loop ID.
// - If there is an executed Loop (work_amount > 0) evaluated before the current one -> the current Brgemm
// should have `beta = 1`.
// - If there is no such Loop -> the current Brgemm should have `beta = 0`.
if (loop_id > 0) {
const auto& current_unified_loop_info = current_expanded_loop_info->get_unified_loop_info();
// Check the previous Loops
--loop_id;
while (loop_id >= 0) {
const auto& expanded_loop_info =
loop_manager->get_loop_info<ov::snippets::lowered::ExpandedLoopInfo>(loop_id);
if (expanded_loop_info->get_unified_loop_info() != current_unified_loop_info)
return 0;
if (expanded_loop_info->get_work_amount() > 0) {
// there is a previously executed Brgemm with `beta = 0` -> the current Brgemm should have `beta = 1`
return 1;
}
--loop_id;
}
}
return 0;
}

void BrgemmBaseKernelExecutor::update_config(const ov::snippets::lowered::ExpressionPtr& expr,
const ov::snippets::lowered::LinearIRCPtr& linear_ir,
BrgemmBaseKernelConfig& config) {
const auto& input_pds = expr->get_input_port_descriptors();
const auto& output_pds = expr->get_output_port_descriptors();
OV_CPU_JIT_EMITTER_ASSERT((input_pds.size() == 2 || input_pds.size() == 3) && output_pds.size() == 1,
"Invalid number of in/out port descriptors");

const auto in0_shape = snippets::utils::get_planar_vdims(input_pds[0]->get_shape(), input_pds[0]->get_layout());
const auto in1_shape = snippets::utils::get_planar_vdims(input_pds[1]->get_shape(), input_pds[1]->get_layout());
auto in0_subtensor = input_pds[0]->get_subtensor();
auto in1_subtensor = input_pds[1]->get_subtensor();

// Need to update M, K, N:
// 1. If the original value in the subtensor is `FULL_DIM`, the Brgemm block should process
// the full tensor along this dim -> take the dimension from the shape.
// 2. Otherwise, the Brgemm block processes a part of the tensor along this dim
// (there is blocking by this dimension) -> take the Loop increment.

auto M = *++in0_subtensor.rbegin();
auto K = *in0_subtensor.rbegin();
auto N = *in1_subtensor.rbegin();

size_t loop_idx = 0;
const auto& loop_ids = expr->get_loop_ids();
const auto& loop_manager = linear_ir->get_loop_manager();
auto get_loop_info = [&]() {
OPENVINO_ASSERT(loop_idx < loop_ids.size(), "Loop is missed");
return loop_manager->get_loop_info<ov::snippets::lowered::ExpandedLoopInfo>(loop_ids[loop_idx++]);
};

/* ------- Dimension M ----------*/
if (ov::snippets::utils::is_full_dim_value(M)) {
M = *++in0_shape.rbegin();
} else {
const auto& current_expanded_loop_info = get_loop_info();
const auto& in_ports = current_expanded_loop_info->get_input_ports();
const auto& out_ports = current_expanded_loop_info->get_output_ports();
// Quick validation check: Should we check that port is really Brgemm port?
// If BrgemmCopyB is in the Loop by M -> the first input port will be BrgemmCopyB with `incremented=false`.
// To avoid extra checks, we validate only the first input port.
auto check_port = [&](const ov::snippets::lowered::LoopPort& p) {
return p.get_dim_idx() == 1 && p.is_processed();
};
OPENVINO_ASSERT(
in_ports.size() > 1 && check_port(in_ports[0]) && out_ports.size() == 1 && check_port(out_ports[0]),
"Incorrect Loop by Brgemm dimension M");
M = current_expanded_loop_info->get_work_amount() > 0 ? current_expanded_loop_info->get_increment() : 0;
input_pds[0]->set_subtensor_dim(1, M);
output_pds[0]->set_subtensor_dim(1, M);
}

/* ------- Dimension N ----------*/
if (ov::snippets::utils::is_full_dim_value(N)) {
N = *in1_shape.rbegin();
} else {
const auto& current_expanded_loop_info = get_loop_info();
const auto& in_ports = current_expanded_loop_info->get_input_ports();
const auto& out_ports = current_expanded_loop_info->get_output_ports();
// Quick validation check: Should we check that port is really Brgemm port?
auto check_port = [&](const ov::snippets::lowered::LoopPort& p) {
return p.get_dim_idx() == 0 && p.is_processed();
};
OPENVINO_ASSERT(in_ports.size() >= 2 && !in_ports.front().is_processed() &&
std::all_of(in_ports.cbegin() + 1, in_ports.cend(), check_port) && out_ports.size() == 1 &&
check_port(out_ports.back()),
"Incorrect Loop by Brgemm dimension N");
N = current_expanded_loop_info->get_work_amount() > 0 ? current_expanded_loop_info->get_increment() : 0;
input_pds[1]->set_subtensor_dim(0, N);
output_pds[0]->set_subtensor_dim(0, N);
}

/* ------- Dimension K ----------*/
// 1. If the Brgemm block processes the full dimension K -> `beta = 0`.
// 2. If the Brgemm block processes a part of dimension K (there is blocking), we need to find
// the first executed Brgemm block among the Loops that iterate through dimension K (work_amount > 0).
// The first of them has `beta = 0`, the others have `beta = 1`.
float beta = 0;
if (ov::snippets::utils::is_full_dim_value(K)) {
K = *in0_shape.rbegin();
} else {
const auto& current_expanded_loop_info = get_loop_info();
const auto& in_ports = current_expanded_loop_info->get_input_ports();
const auto& out_ports = current_expanded_loop_info->get_output_ports();
// Quick validation check: Should we check that port is really Brgemm port?
OPENVINO_ASSERT(in_ports.size() >= 2 && in_ports.front().get_dim_idx() == 0 &&
in_ports.front().is_processed() && in_ports.back().get_dim_idx() == 1 &&
in_ports.back().is_processed() && out_ports.size() == 1 &&
!out_ports.front().is_processed(),
"Incorrect Loop by Brgemm dimension K");
K = current_expanded_loop_info->get_work_amount() > 0 ? current_expanded_loop_info->get_increment() : 0;
input_pds[0]->set_subtensor_dim(0, K);
input_pds[1]->set_subtensor_dim(1, K);
if (K > 0)
beta = get_beta(loop_manager, static_cast<int>(loop_ids.back()), current_expanded_loop_info);
}

config.update(static_cast<int64_t>(M), static_cast<int64_t>(N), static_cast<int64_t>(K), beta);
}

#undef PRINT
#undef EQ
#undef HASH

} // namespace intel_cpu
} // namespace ov
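Editor's note: to make the `get_beta` rule above concrete, here is a small self-contained sketch. The types are toy stand-ins, not the plugin's `ExpandedLoopInfo`/`LoopManager`, and it relies on the same assumption the comments state: expanded loops split from one unified K-loop are stored contiguously and lower IDs execute first.

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Toy stand-in for ExpandedLoopInfo: which unified loop this piece was
// split from, and how many iterations it will actually execute.
struct ExpandedLoop {
    int unified_id;
    std::size_t work_amount;
};

// Mirrors the idea of BrgemmBaseKernelExecutor::get_beta: the current
// Brgemm gets beta = 1 only if an earlier expanded piece of the *same*
// unified K-loop really executes (work_amount > 0); otherwise beta = 0.
float get_beta(const std::vector<ExpandedLoop>& loops, int loop_id) {
    const int unified = loops[loop_id].unified_id;
    for (int id = loop_id - 1; id >= 0; --id) {
        if (loops[id].unified_id != unified)
            return 0.0f;  // left the current unified loop's pieces
        if (loops[id].work_amount > 0)
            return 1.0f;  // a previous piece accumulates first
    }
    return 0.0f;
}

int main() {
    // K = 70 blocked by 32: the main body runs twice, the tail runs once.
    std::vector<ExpandedLoop> loops = {{/*unified_id=*/0, /*work_amount=*/2},
                                       {0, 1}};
    assert(get_beta(loops, 0) == 0.0f);  // first piece overwrites C
    assert(get_beta(loops, 1) == 1.0f);  // tail accumulates into C
    return 0;
}
```

In this toy setup the main body's Brgemm writes C with `beta = 0`, and the tail block accumulates into the same C with `beta = 1`, which is exactly the decomposed-K-loop behavior the executor's comments describe.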