diff --git a/src/common/snippets/docs/mha_optimization_guide.md b/src/common/snippets/docs/mha_optimization_guide.md index 28245017833a4a..1ea3a4c24c3524 100644 --- a/src/common/snippets/docs/mha_optimization_guide.md +++ b/src/common/snippets/docs/mha_optimization_guide.md @@ -65,7 +65,7 @@ The supported by decomposition Transpose orders are defined by `TokenizeMHASnipp [SplitDimensionM](../src/pass/split_dimension_m.cpp) splits M dimension of MHA in 2 parts (`batch_m` and `new_m`) by inserting Reshape on A input of the first Matmul and output of the second Matmul (the rest Subgraph's inputs are reshaped by Unsqueeze-like reshapes in order not to break subgraph semantic). This optimization increases parallel work amount by `batch_m` times thus enabling a more efficient parallel execution in some cases. -The splitting is performed based on heuristic algorithm which can be found in `SplitDimensionM::get_splited_dimensions` method. +The splitting is performed based on a heuristic algorithm which can be found in the `SplitDimensionM::split` method. Let's consider an example of the transformation: diff --git a/src/common/snippets/include/snippets/op/load.hpp b/src/common/snippets/include/snippets/op/load.hpp index bca4b09fabdcbd..d0a168483bc5ce 100644 --- a/src/common/snippets/include/snippets/op/load.hpp +++ b/src/common/snippets/include/snippets/op/load.hpp @@ -41,17 +41,17 @@ class Load : public modifier::MemoryAccess, public ov::op::Op { }; /** - * @interface LoadReshape + * @interface LoadReorder * @brief It's just Load operation (and it's mapped on LoadEmitter during code generation) that allows to tweak * shape propagation. We need it to keep correct shape propagation when Transpose is decomposed to * Load and Store. This is a temporary solution until tokenization of Reshape operation is supported. * @ingroup snippets */ -class LoadReshape : public Load { +class LoadReorder : public Load { public: - OPENVINO_OP("LoadReshape", "SnippetsOpset", Load); - LoadReshape(const Output& x, size_t count = 1lu, const size_t offset = 0lu, std::vector order = {}); - LoadReshape() = default; + OPENVINO_OP("LoadReorder", "SnippetsOpset", Load); + LoadReorder(const Output& x, size_t count = 1lu, const size_t offset = 0lu, std::vector order = {}); + LoadReorder() = default; void set_offset(size_t offset) { set_output_offset(offset, 0); } void set_count(size_t count) { set_output_count(count, 0); } diff --git a/src/common/snippets/include/snippets/op/rank_normalization.hpp b/src/common/snippets/include/snippets/op/rank_normalization.hpp index 47b18601f8d805..645f9edf527141 100644 --- a/src/common/snippets/include/snippets/op/rank_normalization.hpp +++ b/src/common/snippets/include/snippets/op/rank_normalization.hpp @@ -4,7 +4,7 @@ #pragma once -#include "openvino/op/op.hpp" +#include "shape_infer_op.hpp" #include "snippets/shape_inference/shape_inference.hpp" namespace ov { @@ -21,9 +21,9 @@ namespace op { // Note that technically the same goal could be achieved using op::Unsqueeze operation, // but RankNormalization has a much narrower semantics, and hence allows for an easier control and a more efficient shape infer.
// -class RankNormalization : public ov::op::Op { +class RankNormalization : public ShapeInferOp { public: - OPENVINO_OP("RankNormalization", "SnippetsOpset"); + OPENVINO_OP("RankNormalization", "SnippetsOpset", ShapeInferOp); RankNormalization() = default; RankNormalization(const Output& data, size_t num_prepend, size_t num_append); diff --git a/src/common/snippets/include/snippets/op/reorder.hpp b/src/common/snippets/include/snippets/op/reorder.hpp new file mode 100644 index 00000000000000..79b024b768aa76 --- /dev/null +++ b/src/common/snippets/include/snippets/op/reorder.hpp @@ -0,0 +1,43 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "shape_infer_op.hpp" +#include "snippets/shape_inference/shape_inference.hpp" + +namespace ov { +namespace snippets { +namespace op { +/** + * @interface Reorder + * @brief Reorder reshapes input tensor shape by the required target order. + * The tensor data is not updated. + * Note: Order is stored in input PortDescriptor + * @ingroup snippets + */ +class Reorder : public ShapeInferOp { +public: + OPENVINO_OP("Reorder", "SnippetsOpset", ShapeInferOp); + Reorder() = default; + Reorder(const Output& x, std::vector order); + + bool visit_attributes(AttributeVisitor& visitor) override; + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + void validate_and_infer_types() override; + + class ShapeInfer : public IShapeInferSnippets { + std::vector m_target_order {}; + public: + explicit ShapeInfer(const std::shared_ptr& n); + Result infer(const std::vector& input_shapes) override; + }; + +private: + void custom_constructor_validate_and_infer_types(std::vector order); +}; + +} // namespace op +} // namespace snippets +} // namespace ov diff --git a/src/common/snippets/include/snippets/op/reshape.hpp b/src/common/snippets/include/snippets/op/reshape.hpp index b4e0c9233c73f0..c86150a1479364 100644 --- a/src/common/snippets/include/snippets/op/reshape.hpp +++ b/src/common/snippets/include/snippets/op/reshape.hpp @@ -4,7 +4,8 @@ #pragma once -#include "openvino/op/op.hpp" +#include "shape_infer_op.hpp" +#include "snippets/shape_inference/shape_inference.hpp" namespace ov { namespace snippets { @@ -15,9 +16,9 @@ namespace op { * @brief Reshape input tensor to required target shape * @ingroup snippets */ -class Reshape : public ov::op::Op { +class Reshape : public ShapeInferOp { public: - OPENVINO_OP("Reshape", "SnippetsOpset"); + OPENVINO_OP("Reshape", "SnippetsOpset", ShapeInferOp); Reshape(const Output& x, ov::PartialShape target_shape); Reshape() = default; @@ -28,6 +29,14 @@ class Reshape : public ov::op::Op { const ov::PartialShape& get_target_shape() const; void set_target_shape(ov::PartialShape shape); + class ShapeInfer : public IShapeInferSnippets { + VectorDims target_shape; + size_t target_shape_volume = 0; + public: + explicit ShapeInfer(const std::shared_ptr& n); + Result infer(const std::vector& input_shapes) override; + }; + private: ov::PartialShape m_target_shape = {}; }; diff --git a/src/common/snippets/include/snippets/op/shape_infer_op.hpp b/src/common/snippets/include/snippets/op/shape_infer_op.hpp new file mode 100644 index 00000000000000..a1462cbb426fd9 --- /dev/null +++ b/src/common/snippets/include/snippets/op/shape_infer_op.hpp @@ -0,0 +1,27 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/op/op.hpp" + +namespace ov { +namespace snippets { +namespace op { + +/** + * 
@interface ShapeInferOp + * @brief Op which infers shape without actually moving data + * @ingroup snippets + */ +class ShapeInferOp : public ov::op::Op { +public: + OPENVINO_OP("ShapeInferOp", "SnippetsOpset"); + ShapeInferOp() = default; + ShapeInferOp(const OutputVector& args) : ov::op::Op(args) {} +}; + +} // namespace op +} // namespace snippets +} // namespace ov diff --git a/src/common/snippets/include/snippets/pass/split_dimension_m.hpp b/src/common/snippets/include/snippets/pass/split_dimension_m.hpp index e9a9a46d3847ff..b93f09bf62803e 100644 --- a/src/common/snippets/include/snippets/pass/split_dimension_m.hpp +++ b/src/common/snippets/include/snippets/pass/split_dimension_m.hpp @@ -67,11 +67,24 @@ class SplitDimensionM: public CommonOptimizations::SubgraphPass { private: static std::shared_ptr get_matmul(const std::shared_ptr& subgraph); - static std::pair get_splited_dimensions(size_t batch_dim, size_t m_dim, size_t optimal_parallelism_work_amount); + /** + * @brief Contains splitM approaches allowing to get the batch ideally divisible by optimal_parallelism_work_amount + */ + static std::pair split_ideally(size_t batch_dim, size_t m_dim, size_t optimal_parallelism_work_amount); + /** + * @brief Splits m_dim to minimize kernel_m in order to reduce waiting time for idle threads at the last parallel loop iteration. + */ + static std::pair split_minimize_kernel_wa(size_t batch_dim, size_t m_dim, size_t optimal_parallelism_work_amount); + /** + * @brief Splits m_dim to get the batch in (optimal_parallelism_work_amount, 2 * optimal_parallelism_work_amount) interval + */ + static std::pair split_fallback_increase_parallel_wa(size_t batch_dim, size_t m_dim, size_t optimal_parallelism_work_amount); void reshape_subgraph(const std::shared_ptr& subgraph, const ov::Shape& shape, size_t batch_m_dim, size_t new_m_dim); size_t m_concurrency; + + static const size_t min_kernel_m; }; } // namespace pass } // namespace snippets diff --git a/src/common/snippets/include/snippets/shape_inference/shape_infer_instances.hpp b/src/common/snippets/include/snippets/shape_inference/shape_infer_instances.hpp index 1b91ea573ab1c4..f6cd6f0626f798 100644 --- a/src/common/snippets/include/snippets/shape_inference/shape_infer_instances.hpp +++ b/src/common/snippets/include/snippets/shape_inference/shape_infer_instances.hpp @@ -75,12 +75,5 @@ class ReduceShapeInfer : public IShapeInferSnippets { Result infer(const std::vector& input_shapes) override; }; -class ReshapeShapeInfer : public IShapeInferSnippets { - VectorDims target_shape; - size_t target_shape_volume = 0; -public: - explicit ReshapeShapeInfer(const std::shared_ptr& n); - Result infer(const std::vector& input_shapes) override; -}; } // namespace snippets } // namespace ov diff --git a/src/common/snippets/include/snippets/snippets_isa.hpp b/src/common/snippets/include/snippets/snippets_isa.hpp index 08002fa38ed309..72198decc366d9 100644 --- a/src/common/snippets/include/snippets/snippets_isa.hpp +++ b/src/common/snippets/include/snippets/snippets_isa.hpp @@ -18,6 +18,7 @@ #include "op/kernel.hpp" #include "op/load.hpp" #include "op/reshape.hpp" +#include "op/reorder.hpp" #include "op/nop.hpp" #include "op/scalar.hpp" #include "op/powerstatic.hpp" diff --git a/src/common/snippets/include/snippets/snippets_isa_tbl.hpp b/src/common/snippets/include/snippets/snippets_isa_tbl.hpp index 9b207b09fe411f..9dc416b3f7e38f 100644 --- a/src/common/snippets/include/snippets/snippets_isa_tbl.hpp +++ b/src/common/snippets/include/snippets/snippets_isa_tbl.hpp @@ 
-11,12 +11,13 @@ // SnippetS dialect OV_OP(Load, ov::snippets::op) -OV_OP(LoadReshape, ov::snippets::op) +OV_OP(LoadReorder, ov::snippets::op) OV_OP(LoopBegin, ov::snippets::op) OV_OP(LoopEnd, ov::snippets::op) OV_OP(Brgemm, ov::snippets::op) OV_OP(BroadcastLoad, ov::snippets::op) OV_OP(Reshape, ov::snippets::op) +OV_OP(Reorder, ov::snippets::op) OV_OP(Store, ov::snippets::op) diff --git a/src/common/snippets/include/snippets/utils/utils.hpp b/src/common/snippets/include/snippets/utils/utils.hpp index ff4646f24d03b7..0569a230e91f32 100644 --- a/src/common/snippets/include/snippets/utils/utils.hpp +++ b/src/common/snippets/include/snippets/utils/utils.hpp @@ -290,13 +290,26 @@ std::shared_ptr get_leaf_node_of_first_child_shape_infer_seq(const std std::shared_ptr get_leaf_node_of_first_parent_shape_infer_seq(const std::shared_ptr& start_node); /** - * * @param Get stride of input/output dimension * @param expr_port target port that contains shape and layout info * @param idx index of the target dimension starting from the shape's end (default = 1) */ int64_t get_dim_stride(const lowered::ExpressionPort& expr_port, size_t idx = 1); +/** + * @brief Get stride of input dimension + * @param shape target shape + * @param layout target layout + * @param idx index of the target dimension starting from the shape's end (default = 1) + */ +int64_t get_dim_in_stride(const VectorDims& shape, const VectorDims& layout, size_t idx = 1); +/** + * @brief Get stride of output dimension + * @param shape target shape + * @param layout target layout + * @param idx index of the target dimension starting from the shape's end (default = 1) + */ +int64_t get_dim_out_stride(const VectorDims& shape, const VectorDims& layout, size_t idx = 1); /** * @brief Traverses path starting from "expr", and calls "func" for each expression. 
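For context on the `get_dim_in_stride`/`get_dim_out_stride` helpers declared above: they differ only in how the dimension index is looked up in the port layout before a common stride computation. Below is a minimal standalone sketch of the assumed semantics; the helper bodies and the layout convention (an input layout lists planar dim indices in memory order, an output layout is treated as the inverse mapping) are local stand-ins for the `snippets::utils` implementations, not copies of them.

```cpp
// Standalone sketch (not OpenVINO code): dimension strides from a planar shape + layout.
// Assumption: the stride of dim d is the product of the dims to the right of d.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

using VectorDims = std::vector<size_t>;

int64_t get_stride(size_t dim_idx, const VectorDims& shape) {
    int64_t stride = 1;
    for (size_t i = dim_idx + 1; i < shape.size(); ++i)
        stride *= static_cast<int64_t>(shape[i]);
    return stride;
}

// Input port: the idx-th memory dim from the end is planar dim layout[rank - 1 - idx] (assumed convention)
int64_t get_dim_in_stride(const VectorDims& shape, const VectorDims& layout, size_t idx = 1) {
    return get_stride(layout[layout.size() - 1 - idx], shape);
}

// Output port: find the memory position where planar dim (rank - 1 - idx) was placed (assumed convention)
int64_t get_dim_out_stride(const VectorDims& shape, const VectorDims& layout, size_t idx = 1) {
    const size_t planar_dim = layout.size() - 1 - idx;
    for (size_t i = 0; i < layout.size(); ++i)
        if (layout[i] == planar_dim)
            return get_stride(i, shape);
    return 0;  // unreachable for a valid permutation
}

int main() {
    const VectorDims shape{2, 8, 16, 64}, layout{0, 2, 1, 3};  // 0213 "transposed" layout
    std::cout << get_dim_in_stride(shape, layout) << "\n";     // layout[2] = 1 -> 16 * 64 = 1024
    std::cout << get_dim_out_stride(shape, layout) << "\n";    // planar dim 2 sits at pos 1 -> 1024
}
```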
diff --git a/src/common/snippets/src/generator.cpp b/src/common/snippets/src/generator.cpp index d059ddd94d5724..fad0086427c93d 100644 --- a/src/common/snippets/src/generator.cpp +++ b/src/common/snippets/src/generator.cpp @@ -77,6 +77,7 @@ RegType Generator::get_op_out_reg_type(const ov::Output& out) const { std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op) || + std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op) #ifdef SNIPPETS_DEBUG_CAPS || std::dynamic_pointer_cast(op) diff --git a/src/common/snippets/src/lowered/pass/mark_invariant_shape_path.cpp b/src/common/snippets/src/lowered/pass/mark_invariant_shape_path.cpp index b32056d4e32a57..16d4160f1aaeb2 100644 --- a/src/common/snippets/src/lowered/pass/mark_invariant_shape_path.cpp +++ b/src/common/snippets/src/lowered/pass/mark_invariant_shape_path.cpp @@ -41,7 +41,7 @@ static bool is_affecting_op(const ExpressionPtr& expr) { const auto& node = expr->get_node(); return ov::is_type(node) || ov::is_type(node) || - ov::is_type(node); + ov::is_type(node); } } // namespace diff --git a/src/common/snippets/src/op/load.cpp b/src/common/snippets/src/op/load.cpp index 461fec8b1399c0..9bd1e4c7bc8706 100644 --- a/src/common/snippets/src/op/load.cpp +++ b/src/common/snippets/src/op/load.cpp @@ -41,19 +41,19 @@ std::shared_ptr Load::clone_with_new_inputs(const OutputVector& new_args) return std::make_shared(new_args.at(0), get_count(), get_offset()); } -LoadReshape::LoadReshape(const Output& x, const size_t count, const size_t offset, std::vector order) +LoadReorder::LoadReorder(const Output& x, const size_t count, const size_t offset, std::vector order) : Load(x, count, offset), m_order(std::move(order)) { const auto& in_shape = x.get_partial_shape(); const auto in_shape_size = in_shape.size(); - OPENVINO_ASSERT(m_order.size() == in_shape_size, "LoadReshape got new_order of invalid size"); + OPENVINO_ASSERT(m_order.size() == in_shape_size, "LoadReorder got new_order of invalid size"); OPENVINO_ASSERT(*std::max_element(m_order.begin(), m_order.end()) == in_shape_size - 1 && - *std::min_element(m_order.begin(), m_order.end()) == 0, "LoadReshape detected invalid values in new_order"); + *std::min_element(m_order.begin(), m_order.end()) == 0, "LoadReorder detected invalid values in new_order"); const std::set unique_dims(order.begin(), order.end()); - OPENVINO_ASSERT(unique_dims.size() == order.size(), "LoadReshape order must not contain repeated elements"); + OPENVINO_ASSERT(unique_dims.size() == order.size(), "LoadReorder order must not contain repeated elements"); constructor_validate_and_infer_types(); } -void LoadReshape::validate_and_infer_types() { +void LoadReorder::validate_and_infer_types() { validate_memory_access_params(); const auto& old_shape = get_input_partial_shape(0); ov::PartialShape new_shape; @@ -62,23 +62,23 @@ void LoadReshape::validate_and_infer_types() { set_output_type(0, get_input_element_type(0), new_shape); } -bool LoadReshape::visit_attributes(AttributeVisitor& visitor) { +bool LoadReorder::visit_attributes(AttributeVisitor& visitor) { MemoryAccess::visit_attributes(visitor); visitor.on_attribute("order", m_order); return true; } -std::shared_ptr LoadReshape::clone_with_new_inputs(const OutputVector& new_args) const { - INTERNAL_OP_SCOPE(LoadReshape); +std::shared_ptr LoadReorder::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(LoadReorder); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), get_count(), 
get_offset(), m_order); + return std::make_shared(new_args.at(0), get_count(), get_offset(), m_order); } -LoadReshape::ShapeInfer::ShapeInfer(const std::shared_ptr& n) { - const auto& loadReshape = ov::as_type_ptr(n); - OPENVINO_ASSERT(loadReshape, "Got invalid node in LoadReshape::ShapeInfer"); - m_order = loadReshape->m_order; +LoadReorder::ShapeInfer::ShapeInfer(const std::shared_ptr& n) { + const auto& loadReorder = ov::as_type_ptr(n); + OPENVINO_ASSERT(loadReorder, "Got invalid node in LoadReorder::ShapeInfer"); + m_order = loadReorder->m_order; } -IShapeInferSnippets::Result LoadReshape::ShapeInfer::infer(const std::vector& input_shapes) { +IShapeInferSnippets::Result LoadReorder::ShapeInfer::infer(const std::vector& input_shapes) { OPENVINO_ASSERT(input_shapes.size() == 1, "Got unexpected number of input shapes"); return {{utils::get_planar_vdims(input_shapes[0], m_order)}, ShapeInferStatus::success}; } diff --git a/src/common/snippets/src/op/rank_normalization.cpp b/src/common/snippets/src/op/rank_normalization.cpp index 4986f0d7fae6ef..2eab2dedc8aeb5 100644 --- a/src/common/snippets/src/op/rank_normalization.cpp +++ b/src/common/snippets/src/op/rank_normalization.cpp @@ -10,11 +10,10 @@ namespace snippets { namespace op { RankNormalization::RankNormalization(const Output& data, size_t num_prepend, size_t num_append) : - Op({data}), m_num_prepend(num_prepend), m_num_append(num_append) { + ShapeInferOp({data}), m_num_prepend(num_prepend), m_num_append(num_append) { constructor_validate_and_infer_types(); } - std::shared_ptr RankNormalization::clone_with_new_inputs(const OutputVector& new_args) const { check_new_args_count(this, new_args); return std::make_shared(new_args[0], m_num_prepend, m_num_append); diff --git a/src/common/snippets/src/op/reorder.cpp b/src/common/snippets/src/op/reorder.cpp new file mode 100644 index 00000000000000..43d8387a8cb2fb --- /dev/null +++ b/src/common/snippets/src/op/reorder.cpp @@ -0,0 +1,67 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/itt.hpp" + +#include "snippets/op/reorder.hpp" +#include "snippets/utils/utils.hpp" + + +namespace ov { +namespace snippets { +namespace op { + +Reorder::Reorder(const Output& arg, std::vector order) + : ShapeInferOp({arg}) { + custom_constructor_validate_and_infer_types(std::move(order)); +} + +void Reorder::custom_constructor_validate_and_infer_types(std::vector order) { + INTERNAL_OP_SCOPE(Reorder_constructor_validate_and_infer_types); + + const auto& input_pshape = get_input_partial_shape(0); + OPENVINO_ASSERT(input_pshape.rank().is_static() && input_pshape.size() == order.size(), + "Incompatible shape and order sizes"); + + // During ctor call, Reorder doesn't know its port descriptors.
+ // So we use explicit layouts from parameters + set_output_type(0, get_input_element_type(0), ov::snippets::utils::get_planar_pshape(input_pshape, order)); +} + +void Reorder::validate_and_infer_types() { + const auto& input_pshape = get_input_partial_shape(0); + const auto& order = lowered::PortDescriptorUtils::get_port_descriptor_ptr(input(0))->get_layout(); + OPENVINO_ASSERT(input_pshape.rank().is_static() && input_pshape.size() == order.size(), + "Incompatible shape and order sizes"); + const auto output_pshape = utils::get_planar_pshape(get_input_partial_shape(0), order); + set_output_type(0, get_input_element_type(0), output_pshape); +} + +std::shared_ptr Reorder::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(Reorder); + check_new_args_count(this, new_args); + const auto& order = lowered::PortDescriptorUtils::get_port_descriptor_ptr(input(0))->get_layout(); + return std::make_shared(new_args.at(0), order); +} + +bool Reorder::visit_attributes(AttributeVisitor& visitor) { + auto order = lowered::PortDescriptorUtils::get_port_descriptor_ptr(input(0))->get_layout(); + visitor.on_attribute("target_order", order); + return true; +} + +Reorder::ShapeInfer::ShapeInfer(const std::shared_ptr& n) { + const auto& op = as_type_ptr(n); + OPENVINO_ASSERT(op, "Invalid node passed to ReorderShapeInfer."); + m_target_order = lowered::PortDescriptorUtils::get_port_descriptor_ptr(op->input(0))->get_layout(); +} + +IShapeInferSnippets::Result Reorder::ShapeInfer::infer(const std::vector& input_shapes) { + OPENVINO_ASSERT(input_shapes.size() == 1, "Invalid number of shapes is passed in ReorderShapeInfer"); + return {{ov::snippets::utils::get_planar_vdims(input_shapes[0].get(), m_target_order)}, ShapeInferStatus::success}; +} + +}// namespace op +}// namespace snippets +}// namespace ov \ No newline at end of file diff --git a/src/common/snippets/src/op/reshape.cpp b/src/common/snippets/src/op/reshape.cpp index 72823d2815cdbf..9b46ea73247f39 100644 --- a/src/common/snippets/src/op/reshape.cpp +++ b/src/common/snippets/src/op/reshape.cpp @@ -11,8 +11,9 @@ namespace ov { namespace snippets { namespace op { + Reshape::Reshape(const Output& arg, ov::PartialShape target_shape) - : Op({arg}), m_target_shape(std::move(target_shape)) { + : ShapeInferOp({arg}), m_target_shape(std::move(target_shape)) { constructor_validate_and_infer_types(); } @@ -38,6 +39,24 @@ const ov::PartialShape& Reshape::get_target_shape() const { void Reshape::set_target_shape(ov::PartialShape shape) { m_target_shape = std::move(shape); } + +Reshape::ShapeInfer::ShapeInfer(const std::shared_ptr& n) { + const auto& reshape = as_type_ptr(n); + OPENVINO_ASSERT(reshape, "Invalid node passed to ReshapeShapeInfer."); + const auto& partial_shape = reshape->get_target_shape(); + OPENVINO_ASSERT(partial_shape.is_static(), "target_shape of reshape op should be static in ReshapeShapeInfer"); + target_shape = partial_shape.get_shape(); + target_shape_volume = utils::get_shape_size(target_shape); +} + +IShapeInferSnippets::Result Reshape::ShapeInfer::infer(const std::vector& input_shapes) { + OPENVINO_ASSERT(input_shapes.size() == 1, "Invalid number of shapes is passed in ReshapeShapeInfer"); + const auto input_shape_volume = utils::get_shape_size(input_shapes[0].get()); + OPENVINO_ASSERT(input_shape_volume == target_shape_volume, "Tensor volume should be the same after reshape in ReshapeShapeInfer"); + + return {{target_shape}, ShapeInferStatus::success}; +} + }// namespace op }// namespace snippets }// namespace ov 
\ No newline at end of file diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp index 98e3392a65e1e2..aff2341cc8bf9d 100644 --- a/src/common/snippets/src/op/subgraph.cpp +++ b/src/common/snippets/src/op/subgraph.cpp @@ -95,8 +95,7 @@ auto Subgraph::is_domain_sensitive_op(const std::shared_ptr& op) -> bo } auto Subgraph::is_shape_infer_op(const std::shared_ptr& op) -> bool { - return ov::is_type(op) || - ov::is_type(op); + return ov::is_type(op); } void Subgraph::init_config() { diff --git a/src/common/snippets/src/pass/mha_tokenization.cpp b/src/common/snippets/src/pass/mha_tokenization.cpp index beb465ab3a3fbe..c6b5045cfeee62 100644 --- a/src/common/snippets/src/pass/mha_tokenization.cpp +++ b/src/common/snippets/src/pass/mha_tokenization.cpp @@ -344,45 +344,6 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken * Transpose3 */ - // First input branch of MatMul0 should be executed before second input branch of MatMul0, - // so firstly we insert Transpose1 on the beginning of ordered_ops and then Transpose0 - // Note: If MatMul0 has transposed_b, we should tokenize only scalars ops from 1st branch - // to move extracted Transpose from MatMul input to body Parameter - auto parent = matmul0->get_input_node_shared_ptr(1); - // We can support several ops between MatMul0 with transposed_b and Transpose1 with 0213 order (or without this Transpose1) - // only if these ops have scalar shapes on other inputs. - // There is transformation ExplicitTransposeMatMulInputs that set supported order and transposed_b(false). - // We can allow to call this pass only if ops have scalar shapes to avoid shape mismatching - const auto is_transposed_b_0 = matmul0->get_transpose_b(); - bool has_matmul0_has_ops_on_input = false; - while (is_supported_intermediate_op(parent)) { - // All supported ops have only one output port - if (parent->get_output_target_inputs(0).size() != 1) - break; - - // Only if MatMul0 has transposed_b, we have to tokenize scalar ops - // to move explicit Transpose from MatMul0 input_1 to Parameter of Subgraph body - if (is_transposed_b_0 && !ov::snippets::pass::ExplicitTransposeMatMulInputs::are_weights_scalar(parent)) { - break; - } - - // To avoid unsupported number of non-scalar Constants in the future after FakeQuantize decomposition (plugin specific limitation) - // we should calculate potential number of non-scalar Constants for FakeQuantize that will be moved up from body. - if (const auto fq_node = ov::as_type_ptr(parent)) { - hidden_virtual_ports_count += ov::snippets::utils::get_non_scalar_constant_count_for_fq(fq_node); - } - - potential_body_params_count += get_potential_body_params(parent); - ordered_ops.insert(ordered_ops.begin(), parent); - // [107731] To go always through 0-th port - is it safe? 
- parent = parent->get_input_node_shared_ptr(0); - has_matmul0_has_ops_on_input = true; - } - // If there are ops on second input of MatMul0 and only one unique Buffer between MatMuls - there must be one more unique Buffer - if (has_matmul0_has_ops_on_input && uniqie_buffer_reg_group_count < 2) { - uniqie_buffer_reg_group_count++; - } - auto tokenize_transpose = [&](const std::shared_ptr& transpose, bool is_input_transposed, std::vector order, const ov::NodeVector::const_iterator& pos) { @@ -404,11 +365,15 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken } }; - const auto transpose1 = ov::as_type_ptr(parent); + // [160177]: Due to performance problems, if operations on the 2nd input of MatMuls should be explicitly executed + // (in other words, if the Buffer should be inserted between Brgemm and this op sequence), + // we don't tokenize such operations into the Subgraph. The details are described in ticket 160177. + // Please return the tokenization of these ops when parallel loops are implemented. const auto transpose0 = ov::as_type_ptr(matmul0->get_input_node_shared_ptr(0)); + const auto transpose1 = ov::as_type_ptr(matmul0->get_input_node_shared_ptr(1)); const auto transpose2 = ov::as_type_ptr(matmul1->get_input_node_shared_ptr(1)); - tokenize_transpose(transpose1, is_transposed_b_0, get_decomposed_transpose_order(pattern_rank), ordered_ops.begin()); tokenize_transpose(transpose0, matmul0->get_transpose_a(), get_fusion_transpose_order(pattern_rank), ordered_ops.begin()); + tokenize_transpose(transpose1, matmul0->get_transpose_b(), get_fusion_transpose_order(pattern_rank), ordered_ops.begin()); tokenize_transpose(transpose2, matmul1->get_transpose_b(), get_fusion_transpose_order(pattern_rank), ordered_ops.end()); ordered_ops.push_back(matmul1); diff --git a/src/common/snippets/src/pass/split_dimension_m.cpp b/src/common/snippets/src/pass/split_dimension_m.cpp index ae95a371483163..b6b8cdd70f0bc8 100644 --- a/src/common/snippets/src/pass/split_dimension_m.cpp +++ b/src/common/snippets/src/pass/split_dimension_m.cpp @@ -4,8 +4,8 @@ #include "snippets/pass/split_dimension_m.hpp" -#include "snippets/utils/utils.hpp" #include "snippets/itt.hpp" +#include "snippets/utils/utils.hpp" namespace { size_t get_dim_M(const ov::Shape& shape) { @@ -26,50 +26,69 @@ bool is_prime_number(size_t value) { namespace ov { namespace snippets { namespace pass { + +const size_t SplitDimensionM::min_kernel_m = 32; + bool SplitDimensionM::is_supported_matmul(const std::shared_ptr& node) { const auto matmul = ov::as_type_ptr(node); return matmul && !matmul->get_transpose_a() && !matmul->is_dynamic(); } -std::pair SplitDimensionM::get_splited_dimensions(size_t batch_dim, size_t m_dim, size_t optimal_parallelism_work_amount) { - std::pair splited = { 1, m_dim }; - +std::pair SplitDimensionM::split_ideally(size_t batch_dim, size_t m_dim, size_t optimal_parallelism_work_amount) { // Ideal case #1: M can be split on the parts one of which complements the batch dimension to the optimal parallel work amount // In this case, each thread will execute the Snippets kernel once const size_t lower_bound = optimal_parallelism_work_amount / batch_dim; - if (lower_bound * batch_dim == optimal_parallelism_work_amount && m_dim % lower_bound == 0) { - splited.first = lower_bound; - splited.second = m_dim / lower_bound; - OPENVINO_ASSERT(splited.first * splited.second == m_dim, "Incorrect dimension M splitting!"); - return splited; - } + if (lower_bound * batch_dim == optimal_parallelism_work_amount && 
m_dim % lower_bound == 0) + return std::make_pair(lower_bound, m_dim / lower_bound); // Ideal case #2: M is divisible by optimal parallel work amount, and the new_m_dim is big enough // In this case, each thread will execute the Snippets kernel 'batch_dim' times if (m_dim % optimal_parallelism_work_amount == 0) { const auto new_m_dim = m_dim / optimal_parallelism_work_amount; - const size_t min_kernel_m = 64; - if (new_m_dim >= min_kernel_m) { - splited.first = optimal_parallelism_work_amount; - splited.second = new_m_dim; - OPENVINO_ASSERT(splited.first * splited.second == m_dim, "Incorrect dimension M splitting!"); - return splited; - } + if (new_m_dim >= min_kernel_m) + return std::make_pair(optimal_parallelism_work_amount, new_m_dim); } + return std::make_pair(1, m_dim); +} + +std::pair SplitDimensionM::split_fallback_increase_parallel_wa(size_t batch_dim, size_t m_dim, size_t optimal_parallelism_work_amount) { + std::pair splited = { 1, m_dim }; const size_t upper_bound = utils::div_up(2 * optimal_parallelism_work_amount, batch_dim); for (size_t divisor_0 = upper_bound - 1; divisor_0 > 1; divisor_0--) { size_t divisor_1 = m_dim / divisor_0; - if (divisor_1 * divisor_0 == m_dim) { - splited.first = divisor_0; - splited.second = divisor_1; - break; - } + if (divisor_1 * divisor_0 == m_dim) + return divisor_0 * batch_dim >= optimal_parallelism_work_amount ? std::make_pair(divisor_0, divisor_1) : splited; } - OPENVINO_ASSERT(splited.first * splited.second == m_dim, "Incorrect dimension M splitting!"); return splited; } +std::pair SplitDimensionM::split_minimize_kernel_wa(size_t batch_dim, size_t m_dim, size_t optimal_parallelism_work_amount) { + // This heuristic minimizes 'm_kernel' (=> maximizes 'm_batch') with a limitation that 'm_kernel >= min_kernel_m'. + // In other words, it tries to find 'm_kernel' that is not less than 'min_kernel_m' and at the same time as close as possible to this value. + std::pair best_result = {1, m_dim}; + for (size_t divisor = 2; divisor < std::sqrt(m_dim); ++divisor) { + if (m_dim % divisor != 0) + continue; + // If divisor is not less than 'min_kernel_m', divisor becomes 'm_kernel', + // guaranteeing the most optimal implementation from the 'm_kernel' minimization perspective. + if (divisor >= min_kernel_m) + return std::make_pair(m_dim / divisor, divisor); + + // If divisor is less than 'min_kernel_m', divisor becomes m_batch. + // However, it is not guaranteed that the current 'm_kernel = m_dim / divisor' is minimized, as one of the next divisors can be more optimal. 
+ // So in this case, the best result is remembered. + const size_t m_kernel = m_dim / divisor; + if (m_kernel >= min_kernel_m) { + best_result.first = divisor; + best_result.second = m_kernel; + } + } + if (best_result.first * batch_dim >= optimal_parallelism_work_amount) + return best_result; + return std::make_pair(1, m_dim); +} + bool SplitDimensionM::can_be_optimized(const std::shared_ptr& node, size_t concurrency) { if (!is_supported_matmul(node)) return false; @@ -131,16 +150,25 @@ bool SplitDimensionM::split(const ov::Shape& shape, size_t optimal_parallelism_w if (is_prime_number(m_dim)) return false; - auto is_optimized = [&](size_t batch_dim) { - return batch_dim >= optimal_parallelism_work_amount; - }; - // We skip optimization if the current batch is optimal for concurrency - if (is_optimized(batch_dim)) + if (batch_dim % optimal_parallelism_work_amount == 0) return false; - std::tie(batch_m_dim, new_m_dim) = get_splited_dimensions(batch_dim, m_dim, optimal_parallelism_work_amount); - return is_optimized(batch_dim * batch_m_dim); + auto split_is_done = [&batch_m_dim]() { + return batch_m_dim != 1; + }; + + std::tie(batch_m_dim, new_m_dim) = split_ideally(batch_dim, m_dim, optimal_parallelism_work_amount); + if (split_is_done()) + return true; + + std::tie(batch_m_dim, new_m_dim) = split_minimize_kernel_wa(batch_dim, m_dim, optimal_parallelism_work_amount); + if (split_is_done()) + return true; + // If all the previous heuristics fail, a fallback heuristic is used, which reflects the old splitting behavior + if (batch_dim < optimal_parallelism_work_amount) + std::tie(batch_m_dim, new_m_dim) = split_fallback_increase_parallel_wa(batch_dim, m_dim, optimal_parallelism_work_amount); + return split_is_done(); } void SplitDimensionM::reshape_subgraph(const std::shared_ptr& subgraph, const ov::Shape& shape, size_t batch_m_dim, size_t new_m_dim) { diff --git a/src/common/snippets/src/pass/transpose_decomposition.cpp b/src/common/snippets/src/pass/transpose_decomposition.cpp index 5c29b493af5826..a433cd41377422 100644 --- a/src/common/snippets/src/pass/transpose_decomposition.cpp +++ b/src/common/snippets/src/pass/transpose_decomposition.cpp @@ -60,9 +60,9 @@ TransposeDecomposition::TransposeDecomposition() { const auto subtensor = std::vector{1}; const auto& layout = order->cast_vector(); - // todo: LoadReshape used here is essentially Load + an easy way to maintain correct shape propagation + // todo: LoadReorder used here is essentially Load + an easy way to maintain correct shape propagation // fix this in future and develop a more consistent shape propagation approach. - auto load = std::make_shared(data_input, subtensor[0], 0, layout); + auto load = std::make_shared(data_input, subtensor[0], 0, layout); auto store = std::make_shared(load, subtensor[0]); PortDescriptorUtils::set_port_descriptor(load->input(0), subtensor, layout); diff --git a/src/common/snippets/src/runtime_configurator.cpp b/src/common/snippets/src/runtime_configurator.cpp index 06beb8db94ae3d..5527cebb63f24f 100644 --- a/src/common/snippets/src/runtime_configurator.cpp +++ b/src/common/snippets/src/runtime_configurator.cpp @@ -118,7 +118,26 @@ void RuntimeConfigurator::init_data_info(const lowered::LinearIRCPtr& linear_ir) // input->shape changing ops->load PortDescriptorPtr desc = nullptr; const auto& shape_infer_seq = utils::get_first_child_shape_infer_expr_seq(param); - const auto& mem_desc_expr = shape_infer_seq.empty() ? 
param : shape_infer_seq.back(); + ExpressionPtr mem_desc_expr = param; + if (!shape_infer_seq.empty()) { + // [160048] Reorder, like any other ShapeInferOp, should just propagate the input shape to the output using the target order + // without data movement. However, currently we have to save the descriptor of the Reorder's input + // to support correct input data offset calculations and the MHAParallelWAOptimizer pass. + // Please remove this code part when the mentioned ticket is completed. + const auto& reorder_it = std::find_if(shape_infer_seq.cbegin(), shape_infer_seq.cend(), + [](const ExpressionPtr& expr) { + return ov::is_type(expr->get_node()); + }); + if (reorder_it != shape_infer_seq.cend()) { + const auto& reorder = *reorder_it; + const auto& etype = reorder->get_node()->get_output_element_type(0); + update_io_parameters(reorder->get_input_port_descriptor(0), etype); + continue; + } + + mem_desc_expr = shape_infer_seq.back(); + } + auto consumer_inputs = mem_desc_expr->get_output_port_connector(0)->get_consumers(); for (const auto& child_input : consumer_inputs) { const auto ma = std::dynamic_pointer_cast(child_input.get_expr()->get_node()); @@ -127,6 +146,7 @@ void RuntimeConfigurator::init_data_info(const lowered::LinearIRCPtr& linear_ir) break; } } + OPENVINO_ASSERT(desc, "Descriptor is missing!"); const auto& etype = mem_desc_expr->get_node()->get_output_element_type(0); update_io_parameters(desc, etype); } diff --git a/src/common/snippets/src/shape_inference/shape_infer_instances.cpp b/src/common/snippets/src/shape_inference/shape_infer_instances.cpp index a3e3d9652c0ac8..3fed1d924a7140 100644 --- a/src/common/snippets/src/shape_inference/shape_infer_instances.cpp +++ b/src/common/snippets/src/shape_inference/shape_infer_instances.cpp @@ -228,22 +228,5 @@ Result ReduceShapeInfer::infer(const std::vector& input_shapes) { return {{result_shape}, ShapeInferStatus::success}; } -ReshapeShapeInfer::ReshapeShapeInfer(const std::shared_ptr& n) { - const auto& reshape = as_type_ptr(n); - OPENVINO_ASSERT(reshape, "Invalid node passed to ReshapeShapeInfer."); - const auto& partial_shape = reshape->get_target_shape(); - OPENVINO_ASSERT(partial_shape.is_static(), "target_shape of reshape op should be static in ReshapeShapeInfer"); - target_shape = partial_shape.get_shape(); - target_shape_volume = utils::get_shape_size(target_shape); -} - -Result ReshapeShapeInfer::infer(const std::vector& input_shapes) { - OPENVINO_ASSERT(input_shapes.size() == 1, "Invalid number of shapes is passed in ReshapeShapeInfer"); - const auto input_shape_volume = utils::get_shape_size(input_shapes[0].get()); - OPENVINO_ASSERT(input_shape_volume == target_shape_volume, "Tensor volume should be the same after reshape in ReshapeShapeInfer"); - - return {{target_shape}, ShapeInferStatus::success}; -} - } // namespace snippets } // namespace ov diff --git a/src/common/snippets/src/shape_inference/shape_inference.cpp b/src/common/snippets/src/shape_inference/shape_inference.cpp index 76a4c491c66983..0e3060501a87d5 100644 --- a/src/common/snippets/src/shape_inference/shape_inference.cpp +++ b/src/common/snippets/src/shape_inference/shape_inference.cpp @@ -57,7 +57,6 @@ const IShapeInferSnippetsFactory::TRegistry IShapeInferSnippetsFactory::registry SHAPE_INFER_PREDEFINED(op::KernelStatic, EmptyShapeInfer), SHAPE_INFER_PREDEFINED(op::KernelDynamic, EmptyShapeInfer), SHAPE_INFER_PREDEFINED(op::Nop, EmptyShapeInfer), - SHAPE_INFER_OP_SPECIFIC_EXTERNAL(op::Reshape, ReshapeShapeInfer), SHAPE_INFER_OP_SPECIFIC_EXTERNAL(opset1::Select, 
SelectShapeInfer), SHAPE_INFER_OP_SPECIFIC_EXTERNAL(op::Brgemm, BrgemmShapeInfer), SHAPE_INFER_OP_SPECIFIC_EXTERNAL(op::ReduceMax, ReduceShapeInfer), @@ -65,7 +64,9 @@ const IShapeInferSnippetsFactory::TRegistry IShapeInferSnippetsFactory::registry // Note that Result has no output PortConnectors, so the shape must be empty SHAPE_INFER_PREDEFINED(ov::op::v0::Result, EmptyShapeInfer), // - SHAPE_INFER_OP_SPECIFIC(op::LoadReshape), + SHAPE_INFER_OP_SPECIFIC(op::LoadReorder), + SHAPE_INFER_OP_SPECIFIC(op::Reshape), + SHAPE_INFER_OP_SPECIFIC(op::Reorder), SHAPE_INFER_OP_SPECIFIC(op::RankNormalization), SHAPE_INFER_OP_SPECIFIC(op::BroadcastLoad), SHAPE_INFER_OP_SPECIFIC(op::BroadcastMove), diff --git a/src/common/snippets/src/utils/utils.cpp b/src/common/snippets/src/utils/utils.cpp index e7381fe6754758..249970b65baa5d 100644 --- a/src/common/snippets/src/utils/utils.cpp +++ b/src/common/snippets/src/utils/utils.cpp @@ -317,14 +317,21 @@ std::shared_ptr get_leaf_node_of_first_parent_shape_infer_seq(const st } int64_t get_dim_stride(const lowered::ExpressionPort& expr_port, size_t idx) { - size_t dim_idx = 0; + const auto& shape = expr_port.get_descriptor_ptr()->get_shape(); const auto& layout = expr_port.get_descriptor_ptr()->get_layout(); switch (expr_port.get_type()) { - case lowered::ExpressionPort::Input: dim_idx = utils::get_input_dim_idx(layout, idx); break; - case lowered::ExpressionPort::Output: dim_idx = utils::get_output_dim_idx(layout, idx); break; - default: OPENVINO_THROW("Unsupported expression port type!"); + case lowered::ExpressionPort::Input: return get_dim_in_stride(shape, layout, idx); + case lowered::ExpressionPort::Output: return get_dim_out_stride(shape, layout, idx); } - return get_stride(dim_idx, expr_port.get_descriptor_ptr()->get_shape()); + OPENVINO_THROW("Unsupported expression port type!"); +} + +int64_t get_dim_in_stride(const VectorDims& shape, const VectorDims& layout, size_t idx) { + return get_stride(utils::get_input_dim_idx(layout, idx), shape); +} + +int64_t get_dim_out_stride(const VectorDims& shape, const VectorDims& layout, size_t idx) { + return get_stride(utils::get_output_dim_idx(layout, idx), shape); } void visit_path(const lowered::ExpressionPtr& expr, diff --git a/src/common/snippets/tests/src/lowered/pass/extracted_loop_invariants.cpp b/src/common/snippets/tests/src/lowered/pass/extracted_loop_invariants.cpp index ee76c5af7234d8..b9ff7bda6823ed 100644 --- a/src/common/snippets/tests/src/lowered/pass/extracted_loop_invariants.cpp +++ b/src/common/snippets/tests/src/lowered/pass/extracted_loop_invariants.cpp @@ -299,7 +299,7 @@ TEST_F(ExtractLoopInvariantsTest, ExtractedLoopInvariantsImpossible) { * * Param0(32,8,1) * | - * LoadReshape with order (1,2,0) + * LoadReorder with order (1,2,0) * | * Store * | @@ -307,7 +307,7 @@ TEST_F(ExtractLoopInvariantsTest, ExtractedLoopInvariantsImpossible) { */ { auto param = linear_ir->push_node(input_precision, input_shape_0); - auto load_reshape = linear_ir->push_node(param.second, 1, 0, layout); + auto load_reshape = linear_ir->push_node(param.second, 1, 0, layout); auto store = linear_ir->push_node(load_reshape.second, 1, 0); init_expr_descriptors(*load_reshape.first, {subtensor, subtensor}, {order, layout}); init_expr_descriptors(*store.first, {subtensor, subtensor}, {layout, layout}); diff --git a/src/common/snippets/tests/src/pass/mha_tokenization.cpp b/src/common/snippets/tests/src/pass/mha_tokenization.cpp index 382257f935cc49..d725c36e5c35a5 100644 --- 
a/src/common/snippets/tests/src/pass/mha_tokenization.cpp +++ b/src/common/snippets/tests/src/pass/mha_tokenization.cpp @@ -160,7 +160,7 @@ TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_Dynamic_Transpose_fusion) { TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA3D_SplitM) { const auto& f = MHASplitMFunction(std::vector{{128, 12, 64}, {128, 12, 64}, {12, 128, 128}, {128, 12, 64}}, std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32}), - std::vector{{2, 64, 12, 64}, {128, 12, 1, 64}, {12, 2, 64, 128}, {1, 128, 12, 64}, {128, 12, 64}}, + std::vector{{2, 64, 12, 64}, {12, 1, 64, 128}, {12, 2, 64, 128}, {1, 128, 12, 64}, {128, 12, 64}}, false); model = f.getOriginal(); model_ref = f.getReference(); @@ -171,7 +171,7 @@ TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA3D_SplitM) { TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA3D_SplitM_withMul) { const auto& f = MHASplitMFunction(std::vector{{128, 12, 64}, {128, 12, 64}, {12, 128, 128}, {128, 12, 64}}, std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32}), - std::vector{{2, 64, 12, 64}, {128, 12, 1, 64}, {12, 2, 64, 128}, {1, 128, 12, 64}, {128, 12, 64}}, + std::vector{{4, 32, 12, 64}, {12, 1, 64, 128}, {12, 4, 32, 128}, {1, 128, 12, 64}, {128, 12, 64}}, true); model = f.getOriginal(); model_ref = f.getReference(); @@ -182,7 +182,7 @@ TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA3D_SplitM_withMul) { TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA4D_SplitM) { const auto& f = MHASplitMFunction(std::vector{{1, 384, 16, 64}, {1, 384, 16, 64}, {1, 1, 1, 384}, {1, 384, 16, 64}}, std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32}), - std::vector{{1, 6, 64, 16, 64}, {1, 384, 16, 1, 64}, {1, 1, 1, 1, 384}, {1, 1, 384, 16, 64}, {1, 384, 16, 64}}, + std::vector{{1, 12, 32, 16, 64}, {1, 16, 1, 64, 384}, {1, 1, 1, 1, 384}, {1, 1, 384, 16, 64}, {1, 384, 16, 64}}, false); model = f.getOriginal(); model_ref = f.getReference(); @@ -193,7 +193,7 @@ TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA4D_SplitM) { TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA4D_SplitM_withMul) { const auto& f = MHASplitMFunction(std::vector{{1, 384, 16, 64}, {1, 384, 16, 64}, {1, 1, 1, 384}, {1, 384, 16, 64}}, std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32}), - std::vector{{1, 6, 64, 16, 64}, {1, 384, 16, 1, 64}, {1, 1, 1, 1, 384}, {1, 1, 384, 16, 64}, {1, 384, 16, 64}}, + std::vector{{1, 12, 32, 16, 64}, {1, 16, 1, 64, 384}, {1, 1, 1, 1, 384}, {1, 1, 384, 16, 64}, {1, 384, 16, 64}}, true); model = f.getOriginal(); model_ref = f.getReference(); diff --git a/src/common/snippets/tests/src/utils/split_dim_m.cpp b/src/common/snippets/tests/src/utils/split_dim_m.cpp index 9e801fceae02e9..df7c277d775cb4 100644 --- a/src/common/snippets/tests/src/utils/split_dim_m.cpp +++ b/src/common/snippets/tests/src/utils/split_dim_m.cpp @@ -59,6 +59,11 @@ const std::vector split_dimension_cases = { {InputData{25, 50, 40}, ReferenceData{true, 2, 25}}, {InputData{5, 16384, 40}, ReferenceData{true, 8, 2048}}, {InputData{5, 16384, 32}, ReferenceData{true, 32, 512}}, + {InputData{48, 4097, 32}, ReferenceData{true, 17, 241}}, + {InputData{48, 6600, 32}, ReferenceData{true, 200, 33}}, + {InputData{12, 128, 16}, ReferenceData{true, 4, 32}}, + {InputData{16, 384, 60}, ReferenceData{true, 12, 32}}, + {InputData{16, 384, 24}, ReferenceData{true, 12, 32}}, }; INSTANTIATE_TEST_SUITE_P(smoke_Snippets_SplitDimensionM, diff --git 
a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/cpu_generator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/cpu_generator.cpp index cf89181e2a7979..cdc768f5d4e1cc 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/cpu_generator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/cpu_generator.cpp @@ -141,10 +141,11 @@ bool CompiledSnippetCPU::empty() const { return get_code_size() == 0; } -CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::aarch64::cpu_isa_t host_isa) - : TargetMachine(std::make_shared()), +CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, ov::intel_cpu::MultiCacheWeakPtr cache) + : TargetMachine(std::make_shared(cache)), h(new jit_snippet()), - isa(host_isa) { + isa(host_isa), + compiled_kernel_cache(std::move(cache)) { // data movement jitters[op::v0::Parameter::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_nop_emitter); jitters[op::v0::Result::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_nop_emitter); @@ -213,7 +214,7 @@ CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::aarch64::cpu_isa_t host_isa) } std::shared_ptr CPUTargetMachine::clone() const { - const auto cloned = std::make_shared(isa); + const auto cloned = std::make_shared(isa, compiled_kernel_cache); cloned->configurator = std::make_shared(*configurator); return cloned; } @@ -250,14 +251,15 @@ dnnl::impl::cpu::aarch64::cpu_isa_t CPUTargetMachine::get_isa() const { return isa; } -CPUGenerator::CPUGenerator(dnnl::impl::cpu::aarch64::cpu_isa_t isa_) - : Generator(std::make_shared(isa_)) {} +CPUGenerator::CPUGenerator(dnnl::impl::cpu::aarch64::cpu_isa_t isa_, ov::intel_cpu::MultiCacheWeakPtr cache) + : Generator(std::make_shared(isa_, std::move(cache))) {} +CPUGenerator::CPUGenerator(const std::shared_ptr& target) : Generator(target) {} std::shared_ptr CPUGenerator::clone() const { const auto& cpu_target_machine = std::dynamic_pointer_cast(target); OPENVINO_ASSERT(cpu_target_machine, "Failed to clone CPUGenerator: the instance contains incompatible TargetMachine type"); - return std::make_shared(cpu_target_machine->get_isa()); + return std::make_shared(cpu_target_machine); } ov::snippets::RegType CPUGenerator::get_specific_op_out_reg_type(const ov::Output& out) const { diff --git a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/cpu_generator.hpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/cpu_generator.hpp index 4006fc01b9a1f5..90c2662e33d070 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/cpu_generator.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/cpu_generator.hpp @@ -4,6 +4,7 @@ #pragma once +#include "cache/multi_cache.h" #include "cpu/aarch64/jit_generator.hpp" #include "snippets/generator.hpp" #include "snippets/target_machine.hpp" @@ -25,7 +26,7 @@ class CompiledSnippetCPU : public snippets::CompiledSnippet { class CPUTargetMachine : public snippets::TargetMachine { public: - explicit CPUTargetMachine(dnnl::impl::cpu::aarch64::cpu_isa_t host_isa); + explicit CPUTargetMachine(dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, ov::intel_cpu::MultiCacheWeakPtr); std::shared_ptr clone() const override; bool is_supported() const override; snippets::CompiledSnippetPtr get_snippet() override; @@ -36,11 +37,13 @@ class CPUTargetMachine : public snippets::TargetMachine { private: std::unique_ptr h; dnnl::impl::cpu::aarch64::cpu_isa_t isa; + ov::intel_cpu::MultiCacheWeakPtr compiled_kernel_cache; }; class CPUGenerator : public snippets::Generator { public: - 
CPUGenerator(dnnl::impl::cpu::aarch64::cpu_isa_t isa); + CPUGenerator(dnnl::impl::cpu::aarch64::cpu_isa_t isa, ov::intel_cpu::MultiCacheWeakPtr); + CPUGenerator(const std::shared_ptr& target); std::shared_ptr clone() const override; protected: diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp index 65741d7031d289..3ad41d707bb96b 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp @@ -7,7 +7,7 @@ #include "snippets/lowered/loop_manager.hpp" #include "snippets/utils/utils.hpp" -#ifndef OPENVINO_ARCH_ARM64 +#ifdef OPENVINO_ARCH_X86_64 # include "transformations/snippets/x64/pass/lowered/brgemm_copy_b_loop_ports_adjuster.hpp" # include "transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp" #endif @@ -39,12 +39,13 @@ std::string CPURuntimeConfig::to_string() const { } #endif -CPURuntimeConfigurator::CPURuntimeConfigurator() - : ov::snippets::RuntimeConfigurator(std::make_shared()) {} +CPURuntimeConfigurator::CPURuntimeConfigurator(ov::intel_cpu::MultiCacheWeakPtr cache) + : ov::snippets::RuntimeConfigurator(std::make_shared()), + compiled_kernel_cache(std::move(cache)) {} void CPURuntimeConfigurator::initialization(const ov::snippets::lowered::LinearIRCPtr& linear_ir) { RuntimeConfigurator::initialization(linear_ir); -#ifndef OPENVINO_ARCH_ARM64 +#ifdef OPENVINO_ARCH_X86_64 RuntimeOptimizer::register_if_applicable(m_intermediate_optimizers, linear_ir, this); RuntimeOptimizer::register_if_applicable(m_final_optimizers, linear_ir, this); #endif diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp index 1706670ce870d1..425959c289b3a7 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp @@ -4,8 +4,9 @@ #pragma once +#include "cache/multi_cache.h" #include "emitters/snippets/jit_snippets_call_args.hpp" -#include "memory_desc/cpu_blocked_memory_desc.h" +#include "emitters/snippets/repacked_input.hpp" #include "snippets/lowered/port_descriptor.hpp" #include "snippets/runtime_configurator.hpp" @@ -21,13 +22,20 @@ class CPURuntimeConfig : public ov::snippets::RuntimeConfig { std::string to_string() const override; #endif + enum class RepackingImplType { + NONE, // no repacking outside the kernel + IN_PARALLEL, // should be executed in parallel_nt by each thread + SEPARATE, // should be executed separately from the kernel + }; + RepackingImplType repacking_impl_type = RepackingImplType::NONE; + + std::unordered_map repacked_inputs = {}; std::vector loop_args = {}; - std::unordered_map m_in_requested_descs = {}; }; class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator { public: - CPURuntimeConfigurator(); + CPURuntimeConfigurator(ov::intel_cpu::MultiCacheWeakPtr cache); /** * @brief Calculate Loop parameters of Loop emitters and update these values in CPURuntimeConfig */ void update_loop_args(const ov::snippets::lowered::LinearIRCPtr& linear_ir) const; + // Note: This method is temporarily used only by `BrgemmExternalRepackingAdjuster` to create kernels for repacking. 
+ // Please, remove this method when the adjuster is deprecated + const ov::intel_cpu::MultiCacheWeakPtr& get_cache() const { + return compiled_kernel_cache; + } + protected: void update(const ov::snippets::lowered::LinearIRCPtr& linear_ir) override; void update_tensor_rank(const ov::snippets::VectorDims& master_shape) const override; @@ -42,6 +56,8 @@ class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator { void initialization(const ov::snippets::lowered::LinearIRCPtr& linear_ir) override; static const size_t rank6D; + + ov::intel_cpu::MultiCacheWeakPtr compiled_kernel_cache; }; } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/src/emitters/snippets/repacked_input.cpp b/src/plugins/intel_cpu/src/emitters/snippets/repacked_input.cpp new file mode 100644 index 00000000000000..a9fbf04d27392b --- /dev/null +++ b/src/plugins/intel_cpu/src/emitters/snippets/repacked_input.cpp @@ -0,0 +1,34 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "repacked_input.hpp" + +namespace ov { +namespace intel_cpu { + +RepackedInput::RepackedInput(std::shared_ptr kernel, + CpuBlockedMemoryDescPtr desc, + VectorDims in_offsets, + VectorDims out_offsets) + : m_kernel(std::move(kernel)), + m_desc(std::move(desc)), + m_in_offsets(std::move(in_offsets)), + m_out_offsets(std::move(out_offsets)) { + OPENVINO_ASSERT(m_in_offsets.size() == m_out_offsets.size(), "Incorrect size of offsets"); + OPENVINO_ASSERT(m_desc, "Descriptor is empty"); +} + +const CpuBlockedMemoryDescPtr& RepackedInput::desc() const { + return m_desc; +} + +const VectorDims& RepackedInput::in_offsets() const { + return m_in_offsets; +} + +const VectorDims& RepackedInput::out_offsets() const { + return m_out_offsets; +} +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/repacked_input.hpp b/src/plugins/intel_cpu/src/emitters/snippets/repacked_input.hpp new file mode 100644 index 00000000000000..61daaa859ef603 --- /dev/null +++ b/src/plugins/intel_cpu/src/emitters/snippets/repacked_input.hpp @@ -0,0 +1,45 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "memory_desc/cpu_blocked_memory_desc.h" + +namespace ov { +namespace intel_cpu { + +struct RepackedInputKernel { + RepackedInputKernel() = default; + virtual ~RepackedInputKernel() = default; + virtual void operator()(const void* args) const = 0; +}; + +struct RepackedInput { + RepackedInput() = default; + RepackedInput(std::shared_ptr kernel, + CpuBlockedMemoryDescPtr desc, + VectorDims in_offsets, + VectorDims out_offsets); + + template ::value, bool>::type = true> + std::shared_ptr kernel() const { + const auto ker = std::dynamic_pointer_cast(m_kernel); + OPENVINO_ASSERT(ker, "Kernel is empty!"); + return ker; + } + + const CpuBlockedMemoryDescPtr& desc() const; + const VectorDims& in_offsets() const; + const VectorDims& out_offsets() const; + +private: + std::shared_ptr m_kernel{nullptr}; + CpuBlockedMemoryDescPtr m_desc{nullptr}; + VectorDims m_in_offsets{}; + VectorDims m_out_offsets{}; +}; + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp index 39e384837856a1..31daa32dfa144f 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp @@ -165,7 +165,7 @@ class jit_snippet : 
public dnnl::impl::cpu::x64::jit_generator { intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::intel_cpu::MultiCacheWeakPtr cache) - : TargetMachine(std::make_shared()), + : TargetMachine(std::make_shared(cache)), h(new jit_snippet()), isa(host_isa), compiled_kernel_cache(std::move(cache)) { @@ -177,9 +177,10 @@ intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_t ho jitters[snippets::op::RankNormalization::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); jitters[snippets::op::Reshape::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); + jitters[snippets::op::Reorder::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); jitters[snippets::op::Load::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_memory_emitter); - jitters[snippets::op::LoadReshape::get_type_info_static()] = + jitters[snippets::op::LoadReorder::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_memory_emitter); jitters[snippets::op::BroadcastLoad::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_broadcast_emitter); diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_b_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_b_emitter.cpp index 6df658d8d72d0c..861b9779c25533 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_b_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_b_emitter.cpp @@ -21,15 +21,6 @@ using namespace ov::snippets::utils; namespace ov { namespace intel_cpu { -namespace { -bool get_is_transposed(const ov::snippets::lowered::ExpressionPtr& expr) { - const auto& layout = expr->get_input_port_descriptor(0)->get_layout(); - const auto is_transposed = !layout.empty() && layout.back() != layout.size() - 1; - OV_CPU_JIT_EMITTER_ASSERT(IMPLICATION(is_transposed, (layout[layout.size() - 2] == layout.size() - 1)), - "supports only N dim placed as last or pre last dimension"); - return is_transposed; -} -} // namespace jit_brgemm_copy_b_emitter::jit_brgemm_copy_b_emitter(jit_generator* h, cpu_isa_t isa, @@ -50,7 +41,7 @@ jit_brgemm_copy_b_emitter::jit_brgemm_copy_b_emitter(jit_generator* h, const auto& src_prc = brgemm_repack->get_src_element_type(); const auto& wei_prc = brgemm_repack->get_input_element_type(0); const auto wei_N_blk = brgemm_utils::repacking::compute_inner_n_block(wei_prc); - const auto is_transposed = get_is_transposed(expr); + const auto is_transposed = BrgemmCopyB::is_transposed(expr->get_input_port_descriptor(0)->get_layout()); const auto brgemm_type = get_brgemm_type(src_prc, is_transposed); const auto primitive_isa = brgemm_utils::get_primitive_isa(src_prc, with_amx(brgemm_type)); m_with_comp = with_compensations(brgemm_type); diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.cpp index dd216517ace12e..7aca5f6c6a696f 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.cpp @@ -145,10 +145,11 @@ std::string BrgemmCopyBKernelConfig::StaticParams::to_string() const { # undef PRINT #endif -BrgemmCopyBKernel::BrgemmCopyBKernel() : jit_generator(jit_name()), ker_(nullptr) {} +BrgemmCopyBKernel::BrgemmCopyBKernel() : 
RepackedInputKernel(), jit_generator(jit_name()), ker_(nullptr) {} BrgemmCopyBKernel::BrgemmCopyBKernel(const BrgemmCopyBKernelConfig& conf) - : jit_generator(jit_name()), + : RepackedInputKernel(), + jit_generator(jit_name()), is_with_comp(conf.is_with_comp()), is_transpose(conf.is_transposed_B()), wei_data_size(dnnl_data_type_size(conf.get_wei_dt())), @@ -169,9 +170,11 @@ status_t BrgemmCopyBKernel::create_kernel() { return code; } -void BrgemmCopyBKernel::operator()(const call_args* args) const { +void BrgemmCopyBKernel::operator()(const void* args) const { + const auto* call_args = reinterpret_cast(args); + OV_CPU_JIT_EMITTER_ASSERT(call_args, "Call arguments are nullptr!"); OV_CPU_JIT_EMITTER_ASSERT(ker_, "Kernel is nullptr"); - ker_(args); + ker_(call_args); } void BrgemmCopyBKernel::init_brgemm_copy_b_kernel( diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.hpp index b3b107cd676705..5ef740067f2035 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.hpp @@ -10,6 +10,7 @@ #include "emitters/plugin/x64/jit_emitter.hpp" #include "emitters/snippets/cpu_kernel_executor_table.hpp" #include "emitters/snippets/jit_snippets_call_args.hpp" +#include "emitters/snippets/repacked_input.hpp" namespace ov { namespace intel_cpu { @@ -139,7 +140,7 @@ struct BrgemmCopyBKernelConfig : public snippets::KernelExecutorBase::GenericCon size_t m_hash{SIZE_MAX}; }; -struct BrgemmCopyBKernel : public dnnl::impl::cpu::x64::jit_generator { +struct BrgemmCopyBKernel : public RepackedInputKernel, public dnnl::impl::cpu::x64::jit_generator { DECLARE_CPU_JIT_AUX_FUNCTIONS(BrgemmCopyBKernel) struct call_args { const void* src = nullptr; @@ -152,7 +153,7 @@ struct BrgemmCopyBKernel : public dnnl::impl::cpu::x64::jit_generator { dnnl::impl::status_t create_kernel() override; - void operator()(const call_args* args) const; + void operator()(const void* args) const override; private: void generate() override; diff --git a/src/plugins/intel_cpu/src/extension.cpp b/src/plugins/intel_cpu/src/extension.cpp index 95de3720bb1e25..38ba9c484620db 100644 --- a/src/plugins/intel_cpu/src/extension.cpp +++ b/src/plugins/intel_cpu/src/extension.cpp @@ -172,7 +172,7 @@ class TypeRelaxedExtension : public ov::OpExtension> { OP_EXTENSION(ov::snippets::op::KernelStatic) \ OP_EXTENSION(ov::snippets::op::KernelDynamic) \ OP_EXTENSION(ov::snippets::op::Load) \ - OP_EXTENSION(ov::snippets::op::LoadReshape) \ + OP_EXTENSION(ov::snippets::op::LoadReorder) \ OP_EXTENSION(ov::snippets::op::LoopBegin) \ OP_EXTENSION(ov::snippets::op::LoopEnd) \ OP_EXTENSION(ov::snippets::op::Buffer) \ diff --git a/src/plugins/intel_cpu/src/nodes/executors/aarch64/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/executors/aarch64/subgraph.cpp new file mode 100644 index 00000000000000..a8fac443391289 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/aarch64/subgraph.cpp @@ -0,0 +1,81 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "nodes/executors/aarch64/subgraph.hpp" + +#include "snippets/op/subgraph.hpp" + +namespace ov { +namespace intel_cpu { + +SubgraphExecutor::SubgraphExecutor(const std::shared_ptr& snippet_config, + const std::shared_ptr& snippet_attrs, + const std::shared_ptr& snippet, + const std::vector& start_offset_in, + const 
std::vector& start_offset_out, + const BufferScratchpadAllocator& allocator, + const ov::intel_cpu::MultiCacheWeakPtr& kernel_cache) + : SubgraphBaseExecutor(snippet_config, + snippet_attrs, + snippet, + start_offset_in, + start_offset_out, + allocator, + kernel_cache) { + m_buffer_scratchpad = allocator(m_internal_buffer_size); +} + +void SubgraphStaticExecutor::exec_impl(const std::vector& inMemPtrs, + const std::vector& outMemPtrs) { + const auto& callable = m_schedule->get_callable(); + + auto initializer = [&](jit_snippets_call_args& call_args, size_t ithr) { + init_call_args(call_args, inMemPtrs, outMemPtrs, m_start_offset_in, m_start_offset_out, ithr); + update_scratchpad_ptr(call_args.buffer_scratchpad_ptr, ithr); + }; + auto caller = [&](jit_snippets_call_args& call_args, const std::vector& indexes, size_t ithr) { + callable(&call_args, indexes.data()); + }; + + if (m_parallel_exec_domain.size() == rank6D) { + parallel_for6d(initializer, caller); + } else { + parallel_forNd(initializer, caller); + } +} + +void SubgraphDynamicSpecializedExecutor::exec_impl(const std::vector& inMemPtrs, + const std::vector& outMemPtrs) { + const auto& callable = m_schedule->get_callable(); + + OPENVINO_ASSERT(m_data_offsets.size() == inMemPtrs.size() + outMemPtrs.size(), "Incorrect data offset count!"); + OPENVINO_ASSERT(m_data_offsets.front().size() == m_parallel_exec_domain.size(), + "Data offsets with invalid ranks detected"); + + // Note: we need to reset KernelExecutorTable to the state that was recorded in the + // SubgraphDynamicSpecializedExecutor constructor because the table might've been used for other shapes + m_reset_exec_table_state(); + + std::vector src_ptrs; + std::vector dst_ptrs; + init_original_ptrs(inMemPtrs, outMemPtrs, src_ptrs, dst_ptrs, m_start_offset_in, m_start_offset_out); + + auto initializer = [&](jit_snippets_call_args& call_args, size_t ithr) { + init_call_args(call_args, ithr); + update_scratchpad_ptr(call_args.buffer_scratchpad_ptr, ithr); + }; + auto caller = [&](jit_snippets_call_args& call_args, const std::vector& indexes, size_t ithr) { + update_ptrs(call_args, src_ptrs, dst_ptrs, indexes); + callable(&call_args); + }; + + if (m_parallel_exec_domain.size() == rank6D) { + parallel_for6d(initializer, caller); + } else { + parallel_forNd(initializer, caller); + } +} + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/aarch64/subgraph.hpp b/src/plugins/intel_cpu/src/nodes/executors/aarch64/subgraph.hpp new file mode 100644 index 00000000000000..54d7f27a79fd17 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/aarch64/subgraph.hpp @@ -0,0 +1,44 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "nodes/executors/subgraph.hpp" + +namespace ov { +namespace intel_cpu { + +class SubgraphExecutor : public SubgraphBaseExecutor { +public: + SubgraphExecutor(const std::shared_ptr& snippet_config, + const std::shared_ptr& snippet_attrs, + const std::shared_ptr& snippet, + const std::vector& start_offset_in, + const std::vector& start_offset_out, + const BufferScratchpadAllocator& allocator, + const ov::intel_cpu::MultiCacheWeakPtr& kernel_cache); +}; + +class SubgraphStaticExecutor : public SubgraphExecutor, public SubgraphStaticBaseExecutor { +public: + template + SubgraphStaticExecutor(T&& first, Args&&... 
rest) + : SubgraphExecutor(std::forward(first), std::forward(rest)...), + SubgraphStaticBaseExecutor() {} + + void exec_impl(const std::vector& inMemPtrs, const std::vector& outMemPtrs) override; +}; + +class SubgraphDynamicSpecializedExecutor : public SubgraphExecutor, public SubgraphDynamicSpecializedBaseExecutor { +public: + template + SubgraphDynamicSpecializedExecutor(T&& first, Args&&... rest) + : SubgraphExecutor(std::forward(first), std::forward(rest)...), + SubgraphDynamicSpecializedBaseExecutor(std::forward(first)) {} + + void exec_impl(const std::vector& inMemPtrs, const std::vector& outMemPtrs) override; +}; + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/executors/subgraph.cpp new file mode 100644 index 00000000000000..739ae56be3b4ff --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/subgraph.cpp @@ -0,0 +1,170 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "nodes/executors/subgraph.hpp" + +#include "common/primitive_hashing_utils.hpp" +#include "openvino/core/parallel.hpp" + +namespace ov { +namespace intel_cpu { + +bool operator==(const SubgraphAttrs& lhs, const SubgraphAttrs& rhs) { + if (&lhs == &rhs) + return true; + if (lhs.bodyHash != rhs.bodyHash) + return false; + if (lhs.inMemOrders.size() != rhs.inMemOrders.size() || lhs.inMemPrecs.size() != rhs.inMemPrecs.size()) + return false; + if (lhs.outMemOrders.size() != rhs.outMemOrders.size() || lhs.outMemPrecs.size() != rhs.outMemPrecs.size()) + return false; + if (lhs.inMemOrders != rhs.inMemOrders || lhs.inMemPrecs != rhs.inMemPrecs) + return false; + if (lhs.outMemOrders != rhs.outMemOrders || lhs.outMemPrecs != rhs.outMemPrecs) + return false; + return true; +} + +size_t get_attr_hash(size_t seed, const std::shared_ptr& attrs) { + using namespace dnnl::impl; + using namespace dnnl::impl::primitive_hashing; + + for (const auto& order : attrs->inMemOrders) + seed = get_vector_hash(seed, order); + for (const auto& prec : attrs->inMemPrecs) + seed = hash_combine(seed, prec.hash()); + + for (const auto& order : attrs->outMemOrders) + seed = get_vector_hash(seed, order); + for (const auto& prec : attrs->outMemPrecs) + seed = hash_combine(seed, prec.hash()); + + seed = hash_combine(seed, attrs->bodyHash); + return seed; +} + +SubgraphCodeGenerator::SubgraphCodeGenerator(const std::shared_ptr& snippet_attrs, + const std::shared_ptr& config) { + OPENVINO_ASSERT(snippet_attrs, "Subgraph attributes are empty!"); + OPENVINO_ASSERT(config, "Runtime Config is empty!"); + + jit_snippets_compile_args jcp; + jcp.data_offsets = config->io_data_offsets; + SubgraphBaseExecutor::init_parallel_domain(config, jcp.exec_domain); + schedule = + std::make_shared(snippet_attrs->snippet->generate(reinterpret_cast(&jcp))); +} + +SubgraphBaseExecutor::SubgraphBaseExecutor(const std::shared_ptr& snippet_config, + const std::shared_ptr& snippet_attrs, + const std::shared_ptr& snippet, + const std::vector& start_offset_in, + const std::vector& start_offset_out, + const BufferScratchpadAllocator& allocator, + const ov::intel_cpu::MultiCacheWeakPtr& kernel_cache) + : m_schedule(snippet->get()), + m_start_offset_in(start_offset_in), + m_start_offset_out(start_offset_out) { + OPENVINO_ASSERT(m_schedule, "Schedule is empty!"); + OPENVINO_ASSERT(snippet_config, "Runtime Config is empty!"); + init_parallel_domain(snippet_config, m_parallel_exec_domain); + + m_tensor_rank = 
snippet_config->tensor_rank; + m_harness_work_amount = std::accumulate(m_parallel_exec_domain.cbegin(), + m_parallel_exec_domain.cend(), + size_t(1), + std::multiplies()); + m_nthreads = std::min(parallel_get_max_threads(), static_cast(m_harness_work_amount)); + + m_buffer_scratchpad_size = snippet_config->buffer_scratchpad_size; + OPENVINO_ASSERT(!ov::snippets::utils::is_dynamic_value(m_buffer_scratchpad_size), + "Undefined buffer scratchpad size!"); + m_internal_buffer_size = static_cast(m_nthreads) * m_buffer_scratchpad_size; +} + +void SubgraphBaseExecutor::init_parallel_domain(const std::vector& master_shape, + size_t tensor_rank, + size_t tile_rank, + std::vector& domain) { + domain.resize(tensor_rank, 1); + std::fill(domain.begin(), domain.end(), 1); + std::copy(master_shape.cbegin(), + master_shape.cbegin() + (master_shape.size() - tile_rank), + domain.begin() + (tensor_rank - master_shape.size())); +} + +void SubgraphBaseExecutor::init_parallel_domain(const std::shared_ptr& snippet_config, + std::vector& domain) { + init_parallel_domain(snippet_config->master_shape, snippet_config->tensor_rank, snippet_config->tile_rank, domain); +} + +void SubgraphBaseExecutor::parallel_for6d(const initializer_functor& initializer, const call_functor& caller) { + const auto& dom = m_parallel_exec_domain; + + parallel_nt_static(m_nthreads, [&](const int ithr, const int nthr) { + jit_snippets_call_args call_args; + initializer(call_args, ithr); + + size_t start = 0, end = 0; + splitter(m_harness_work_amount, nthr, ithr, start, end); + + std::vector indexes{0, 0, 0, 0, 0}; + parallel_it_init(start, + indexes[0], + dom[0], + indexes[1], + dom[1], + indexes[2], + dom[2], + indexes[3], + dom[3], + indexes[4], + dom[4]); + for (size_t iwork = start; iwork < end; ++iwork) { + caller(call_args, indexes, ithr); + parallel_it_step(indexes[0], + dom[0], + indexes[1], + dom[1], + indexes[2], + dom[2], + indexes[3], + dom[3], + indexes[4], + dom[4]); + } + }); +} + +void SubgraphBaseExecutor::parallel_forNd(const initializer_functor& initializer, const call_functor& caller) { + const auto& dom = m_parallel_exec_domain; + + parallel_nt_static(m_nthreads, [&](const int ithr, const int nthr) { + jit_snippets_call_args call_args; + initializer(call_args, ithr); + + size_t start = 0, end = 0; + splitter(m_harness_work_amount, nthr, ithr, start, end); + + std::vector indexes(dom.size() - 1, 0); + for (size_t iwork = start; iwork < end; ++iwork) { + size_t tmp = iwork; + for (ptrdiff_t j = static_cast(dom.size()) - 2; j >= 0; j--) { + indexes[j] = tmp % dom[j]; + tmp /= dom[j]; + } + + caller(call_args, indexes, ithr); + } + }); +} + +void SubgraphBaseExecutor::execute(const dnnl::stream& strm, + const std::vector& inMemPtrs, + const std::vector& outMemPtrs) { + exec_impl(inMemPtrs, outMemPtrs); +} + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/subgraph.hpp b/src/plugins/intel_cpu/src/nodes/executors/subgraph.hpp new file mode 100644 index 00000000000000..78cb56440203d2 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/subgraph.hpp @@ -0,0 +1,188 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "cpu_memory.h" +#include "emitters/snippets/cpu_runtime_configurator.hpp" +#include "emitters/snippets/jit_snippets_call_args.hpp" +#include "snippets/generator.hpp" +#include "snippets/op/subgraph.hpp" + +namespace ov { +namespace intel_cpu { + +struct SubgraphAttrs { + // Local copy of 
subgraph node for canonicalization & code generation + std::shared_ptr snippet; + uint64_t bodyHash; + std::vector inMemOrders; + std::vector outMemOrders; + std::vector inMemPrecs; + std::vector outMemPrecs; +}; +bool operator==(const SubgraphAttrs& lhs, const SubgraphAttrs& rhs); +size_t get_attr_hash(size_t seed, const std::shared_ptr& attrs); + +class SubgraphCodeGenerator { +public: + SubgraphCodeGenerator(const std::shared_ptr& snippet_attrs, + const std::shared_ptr& config); + + const std::shared_ptr& get() const { + return schedule; + } + +private: + std::shared_ptr schedule; +}; + +class SubgraphBaseExecutor { +public: + using BufferScratchpadAllocator = std::function; + + SubgraphBaseExecutor() = default; + SubgraphBaseExecutor(const std::shared_ptr& snippet_config, + const std::shared_ptr& snippet_attrs, + const std::shared_ptr& snippet, + const std::vector& start_offset_in, + const std::vector& start_offset_out, + const BufferScratchpadAllocator& allocator, + const ov::intel_cpu::MultiCacheWeakPtr& kernel_cache); + virtual ~SubgraphBaseExecutor() = default; + + virtual void execute(const dnnl::stream& strm, + const std::vector& inMemPtrs, + const std::vector& outMemPtrs); + + static void init_parallel_domain(const std::vector& master_shape, + size_t tensor_rank, + size_t tile_rank, + std::vector& domain); + static void init_parallel_domain(const std::shared_ptr& snippet_config, + std::vector& domain); + +protected: + virtual void exec_impl(const std::vector& inMemPtrs, const std::vector& outMemPtrs) = 0; + + using initializer_functor = std::function; + using call_functor = std::function&, size_t)>; + + virtual void parallel_for6d(const initializer_functor& initializer, const call_functor& caller); + virtual void parallel_forNd(const initializer_functor& initializer, const call_functor& caller); + + inline void update_scratchpad_ptr(void*& scratchpad_ptr, size_t ithr) const { + if (m_buffer_scratchpad_size > 0) + scratchpad_ptr = m_buffer_scratchpad->getDataAs() + ithr * m_buffer_scratchpad_size; + } + + std::shared_ptr m_schedule; + // Holds the parallel execution domain; + // it should be compatible with the schedule's work size + std::vector m_parallel_exec_domain = {}; + size_t m_harness_work_amount = 0; + + // Buffer scratchpad + MemoryPtr m_buffer_scratchpad = nullptr; + size_t m_buffer_scratchpad_size = 0; + size_t m_internal_buffer_size = 0; + size_t m_tensor_rank = 0; + + const size_t rank6D = 6; + + // Count of threads for parallel_nt + int m_nthreads = 0; + + std::vector m_start_offset_in = {}; + std::vector m_start_offset_out = {}; +}; + +// Class for Subgraphs with static shapes +class SubgraphStaticBaseExecutor { +public: + SubgraphStaticBaseExecutor() = default; + virtual ~SubgraphStaticBaseExecutor() = default; + +protected: + typedef void (*kernel)(const void*, const void*); + + inline void init_call_args(jit_snippets_call_args& call_args, + const std::vector& srcMemPtrs, + const std::vector& dstMemPtrs, + const std::vector& start_offset_in, + const std::vector& start_offset_out, + size_t ithr) { + for (size_t i = 0; i < srcMemPtrs.size(); i++) + call_args.src_ptrs[i] = srcMemPtrs[i]->getDataAs() + start_offset_in[i]; + + for (size_t i = 0; i < dstMemPtrs.size(); i++) + call_args.dst_ptrs[i] = dstMemPtrs[i]->getDataAs() + start_offset_out[i]; + } +}; + +// Specialized dynamic executor based on a shape-agnostic kernel for the specific input shapes +class SubgraphDynamicSpecializedBaseExecutor { +public: + SubgraphDynamicSpecializedBaseExecutor(const
std::shared_ptr& snippet_config) { + m_buffer_offsets = snippet_config->buffer_cluster_offsets; + m_data_offsets = snippet_config->io_data_offsets; + m_loop_args = snippet_config->loop_args; + m_reset_exec_table_state = snippet_config->kernel_executor_table->get_state_reset(); + } + virtual ~SubgraphDynamicSpecializedBaseExecutor() = default; + +protected: + typedef void (*dynamic_kernel)(const void*); + + inline void init_call_args(jit_snippets_call_args& call_args, size_t ithr) { + call_args.register_loops(m_loop_args); + std::copy(m_buffer_offsets.cbegin(), m_buffer_offsets.cend(), call_args.buffer_offsets); + } + + inline void init_original_ptrs(const std::vector& srcMemPtrs, + const std::vector& dstMemPtrs, + std::vector& src_ptrs, + std::vector& dst_ptrs, + const std::vector& start_offset_in, + const std::vector& start_offset_out) { + const auto in_num = srcMemPtrs.size(); + const auto out_num = dstMemPtrs.size(); + + src_ptrs.resize(in_num, nullptr); + dst_ptrs.resize(out_num, nullptr); + + for (size_t i = 0; i < in_num; i++) + src_ptrs[i] = srcMemPtrs[i]->getDataAs() + start_offset_in[i]; + for (size_t i = 0; i < out_num; i++) + dst_ptrs[i] = dstMemPtrs[i]->getDataAs() + start_offset_out[i]; + } + + inline void update_ptrs(jit_snippets_call_args& call_args, + const std::vector& src_ptrs, + const std::vector& dst_ptrs, + const std::vector& indexes) const { + for (size_t i = 0; i < src_ptrs.size(); i++) { + auto i_ptr = src_ptrs[i]; + for (size_t j = 0; j < indexes.size(); j++) { + i_ptr += m_data_offsets[i][j] * indexes[j]; + } + call_args.src_ptrs[i] = i_ptr; + } + for (size_t i = 0; i < dst_ptrs.size(); i++) { + auto i_ptr = dst_ptrs[i]; + for (size_t j = 0; j < indexes.size(); j++) { + i_ptr += m_data_offsets[i + src_ptrs.size()][j] * indexes[j]; + } + call_args.dst_ptrs[i] = i_ptr; + } + } + + std::vector m_buffer_offsets = {}; + std::vector> m_data_offsets = {}; + std::vector m_loop_args = {}; + std::function m_reset_exec_table_state; +}; + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/x64/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/executors/x64/subgraph.cpp new file mode 100644 index 00000000000000..983c4410083beb --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/x64/subgraph.cpp @@ -0,0 +1,328 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "nodes/executors/x64/subgraph.hpp" + +#include "emitters/snippets/x64/cpu_generator.hpp" +#include "emitters/snippets/x64/kernel_executors/brgemm_copy_b.hpp" +#include "openvino/core/parallel.hpp" +#include "snippets/op/subgraph.hpp" + +#if defined(__linux__) && defined(SNIPPETS_DEBUG_CAPS) +# include + +# include "emitters/snippets/x64/jit_segfault_detector_emitter.hpp" +std::mutex err_print_lock; +#endif + +namespace ov { +namespace intel_cpu { + +namespace { +inline void parallel4d_repacking(const BrgemmCopyBKernel* ker, + const VectorDims& dom, + const VectorDims& in_str, + const VectorDims& out_str, + const uint8_t* src, + uint8_t* dst) { + parallel_for4d(dom[0], dom[1], dom[2], dom[3], [&](size_t d0, size_t d1, size_t d2, size_t d3) { + BrgemmCopyBKernel::call_args args; + args.src = src + d0 * in_str[0] + d1 * in_str[1] + d2 * in_str[2] + d3 * in_str[3]; + args.tr_src = dst + d0 * out_str[0] + d1 * out_str[1] + d2 * out_str[2] + d3 * out_str[3]; + (*ker)(&args); + }); +}; +inline void parallelNd_repacking(const BrgemmCopyBKernel* ker, + const VectorDims& dom, + const VectorDims& in_str, + const VectorDims& 
out_str, + const uint8_t* src, + uint8_t* dst) { + const size_t batch = std::accumulate(dom.rbegin() + 2, dom.rend(), 1lu, std::multiplies()); + parallel_nt_static(0, [&](const int ithr, const int nthr) { + BrgemmCopyBKernel::call_args args; + size_t start = 0, end = 0; + splitter(batch, nthr, ithr, start, end); + for (size_t iwork = start; iwork < end; ++iwork) { + const uint8_t* src_u8 = src; + uint8_t* dst_u8 = dst; + size_t tmp = iwork; + for (ptrdiff_t j = static_cast(dom.size()) - 3; j >= 0; j--) { + auto idx = tmp % dom[j]; + tmp /= dom[j]; + + src_u8 += idx * in_str[j]; + dst_u8 += idx * out_str[j]; + } + args.src = src_u8; + args.tr_src = dst_u8; + (*ker)(&args); + } + }); +}; +} // namespace + +SubgraphExecutor::SubgraphExecutor(const std::shared_ptr& snippet_config, + const std::shared_ptr& snippet_attrs, + const std::shared_ptr& snippet, + const std::vector& start_offset_in, + const std::vector& start_offset_out, + const BufferScratchpadAllocator& allocator, + const ov::intel_cpu::MultiCacheWeakPtr& kernel_cache) + : SubgraphBaseExecutor(snippet_config, + snippet_attrs, + snippet, + start_offset_in, + start_offset_out, + allocator, + kernel_cache) { + m_repacking_impl_type = snippet_config->repacking_impl_type; + m_repacked_inputs = snippet_config->repacked_inputs; + + auto external_buffer_size = std::accumulate(m_repacked_inputs.begin(), + m_repacked_inputs.end(), + size_t(0), + [](size_t sum, const std::pair& p) { + return sum + p.second.desc()->getCurrentMemSize(); + }); + + if (get_repacking_impl_type() == RepackingImplType::IN_PARALLEL) { + // When external repacking is applied in the parallel section, + // each thread should have its own buffer to store the repacked data + external_buffer_size *= m_nthreads; + + // To avoid extra runtime overhead from vector creation, + // we initialize `m_repacked_offsets_by_threads` here by default + m_repacked_offsets_by_threads.resize(m_nthreads); + for (size_t i = 0; i < m_repacked_offsets_by_threads.size(); ++i) + clean_repacked_offsets(i); + + if (m_tensor_rank == rank6D) { + init_offset = [](const std::vector& offsets, const std::vector& indexes, size_t& offset) { + offset += offsets[0] * indexes[0] + offsets[1] * indexes[1] + offsets[2] * indexes[2] + + offsets[3] * indexes[3]; + }; + } else { + init_offset = [](const std::vector& offsets, const std::vector& indexes, size_t& offset) { + for (size_t j = 0; j < indexes.size(); j++) + offset += offsets[j] * indexes[j]; + }; + } + } + + m_buffer_scratchpad = allocator(m_internal_buffer_size + external_buffer_size); + +#if defined(__linux__) && defined(SNIPPETS_DEBUG_CAPS) + const auto target = std::dynamic_pointer_cast( + snippet_attrs->snippet->get_generator()->get_target_machine()); + enabled_segfault_detector = target && target->debug_config.enable_segfault_detector; +#endif +} + +#if defined(__linux__) && defined(SNIPPETS_DEBUG_CAPS) +void SubgraphExecutor::segfault_detector() { + if (enabled_segfault_detector) { + __sighandler_t signal_handler = [](int signal) { + std::lock_guard guard(err_print_lock); + if (auto segfault_detector_emitter = ov::intel_cpu::g_custom_segfault_handler->local()) + std::cout << segfault_detector_emitter->info() << std::endl; + auto tid = parallel_get_thread_num(); + OPENVINO_THROW("Segfault was caught by the signal handler in subgraph node execution on thread " + + std::to_string(tid)); + }; + struct sigaction new_handler {}; + new_handler.sa_handler = signal_handler; + sigaction(SIGSEGV, &new_handler, nullptr); + } +} +#endif +
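For reference, the flat-index decomposition used by parallelNd_repacking above (and by parallel_forNd in the base executor) can be reproduced in isolation. A minimal standalone sketch of that arithmetic, assuming plain std::vector domains; the names below are illustrative, not the plugin's API:

    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Recover per-dimension indices from a flat work-item counter, skipping the
    // trailing `skip` dimensions that the kernel itself covers (2 for the 2D
    // repacking tile, 1 for the generic parallel_forNd loop).
    std::vector<size_t> decompose(size_t iwork, const std::vector<size_t>& dom, size_t skip) {
        std::vector<size_t> indexes(dom.size() - skip, 0);
        size_t tmp = iwork;
        for (ptrdiff_t j = static_cast<ptrdiff_t>(indexes.size()) - 1; j >= 0; j--) {
            indexes[j] = tmp % dom[j];
            tmp /= dom[j];
        }
        return indexes;
    }

    int main() {
        const std::vector<size_t> dom{2, 3, 16, 64};  // batch dims {2, 3}, kernel dims {16, 64}
        for (size_t iwork = 0; iwork < 2 * 3; ++iwork) {
            const auto idx = decompose(iwork, dom, 2);
            std::cout << iwork << " -> [" << idx[0] << ", " << idx[1] << "]\n";  // row-major order
        }
    }

Each recovered batch index then scales the corresponding input/output stride to position the repacking kernel's source and destination pointers, exactly as in the loop above.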
+std::vector SubgraphExecutor::separately_repack_inputs(const dnnl::stream& strm, + const std::vector& srcMemPtrs) { + auto reordered_in_ptrs = srcMemPtrs; + size_t offset = m_internal_buffer_size; + for (const auto& p : m_repacked_inputs) { + const auto in_idx = p.first; + const auto& repacked_input = p.second; + const auto& desc = repacked_input.desc(); + const void* data_ptr = m_buffer_scratchpad->getDataAs() + offset; + + OPENVINO_ASSERT(in_idx < srcMemPtrs.size(), "Incorrect index of input repacked mem ptr"); + const auto& src_mem = srcMemPtrs[in_idx]; + const auto& dst_mem = std::make_shared(strm.get_engine(), desc, data_ptr, false); + + const auto* src = src_mem->getDataAs() + m_start_offset_in[in_idx]; + auto* dst = dst_mem->getDataAs(); + + VectorDims dom; + const auto& shape = dst_mem->getShape().getDims(); + OPENVINO_ASSERT(shape.size() <= m_tensor_rank, "Unsupported shape rank of repacking data"); + init_parallel_domain(shape, m_tensor_rank, 2lu, dom); + + const auto& in_strides = repacked_input.in_offsets(); + const auto& out_strides = repacked_input.out_offsets(); + OPENVINO_ASSERT(everyone_is(m_tensor_rank, in_strides.size(), out_strides.size(), dom.size()), + "Unsupported shape rank of repacking data"); + + const auto& kernel = repacked_input.kernel(); + if (m_tensor_rank == rank6D) + parallel4d_repacking(kernel.get(), dom, in_strides, out_strides, src, dst); + else + parallelNd_repacking(kernel.get(), dom, in_strides, out_strides, src, dst); + + reordered_in_ptrs[in_idx] = dst_mem; + offset += desc->getCurrentMemSize(); + } + return reordered_in_ptrs; +} + +void SubgraphExecutor::in_parallel_repack_inputs(const std::vector& inMemPtrs, + const std::vector& indexes, + int ithr, + jit_snippets_call_args& call_args) { + size_t repacked_offset_idx = 0; + for (const auto& p : m_repacked_inputs) { + const auto& in_idx = p.first; + const auto& repacked_in = p.second; + + size_t src_offset = m_start_offset_in[in_idx]; + init_offset(repacked_in.in_offsets(), indexes, src_offset); + + auto* repacked_ptr = get_external_scratchpad_ptr(ithr, in_idx); + + auto& last_processed_src_offset = m_repacked_offsets_by_threads[ithr][repacked_offset_idx]; + if (src_offset != last_processed_src_offset) { + BrgemmCopyBKernel::call_args args; + args.src = inMemPtrs[in_idx]->getDataAs() + src_offset; + args.tr_src = repacked_ptr; + (*repacked_in.kernel())(&args); + + last_processed_src_offset = src_offset; + } + + call_args.src_ptrs[in_idx] = repacked_ptr; + ++repacked_offset_idx; + } +} + +void SubgraphExecutor::execute(const dnnl::stream& strm, + const std::vector& inMemPtrs, + const std::vector& outMemPtrs) { + switch (get_repacking_impl_type()) { + case RepackingImplType::SEPARATE: + exec_impl(separately_repack_inputs(strm, inMemPtrs), outMemPtrs); + return; + case RepackingImplType::IN_PARALLEL: + case RepackingImplType::NONE: + exec_impl(inMemPtrs, outMemPtrs); + return; + default: + OPENVINO_THROW("Unknown RepackingImplType"); + } +} +
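The IN_PARALLEL path above repacks lazily: each thread tracks, per repacked input, the last source offset it converted and calls the kernel only when the offset changes. A minimal sketch of that caching idea, assuming a plain std::vector cache and a stand-in `repack` callable instead of the JIT kernel:

    #include <cstddef>
    #include <limits>
    #include <vector>

    // One slot per repacked input; max() is the "nothing repacked yet" sentinel,
    // mirroring what clean_repacked_offsets() restores before each harness run.
    struct RepackCache {
        std::vector<size_t> last_offsets;
        explicit RepackCache(size_t num_inputs)
            : last_offsets(num_inputs, std::numeric_limits<size_t>::max()) {}

        template <typename F>
        void repack_if_needed(size_t input_slot, size_t src_offset, F&& repack) {
            if (src_offset != last_offsets[input_slot]) {
                repack(src_offset);  // the expensive copy/transpose happens only here
                last_offsets[input_slot] = src_offset;
            }
        }
    };

Since consecutive work items handled by one thread frequently map to the same source region, this collapses many potential kernel calls into one per distinct offset.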
+void SubgraphStaticExecutor::exec_impl(const std::vector& inMemPtrs, + const std::vector& outMemPtrs) { + const auto& callable = m_schedule->get_callable(); + + initializer_functor initializer; + call_functor caller; + + switch (get_repacking_impl_type()) { + case RepackingImplType::IN_PARALLEL: + initializer = [&](jit_snippets_call_args& call_args, size_t ithr) { + init_call_args(call_args, inMemPtrs, outMemPtrs, m_start_offset_in, m_start_offset_out, ithr); + update_scratchpad_ptr(call_args.buffer_scratchpad_ptr, ithr); + clean_repacked_offsets(ithr); + }; + caller = [&](jit_snippets_call_args& call_args, const std::vector& indexes, size_t ithr) { + in_parallel_repack_inputs(inMemPtrs, indexes, ithr, call_args); + callable(&call_args, indexes.data()); + }; + break; + case RepackingImplType::SEPARATE: + case RepackingImplType::NONE: + initializer = [&](jit_snippets_call_args& call_args, size_t ithr) { + init_call_args(call_args, inMemPtrs, outMemPtrs, m_start_offset_in, m_start_offset_out, ithr); + update_scratchpad_ptr(call_args.buffer_scratchpad_ptr, ithr); + }; + caller = [&](jit_snippets_call_args& call_args, const std::vector& indexes, size_t ithr) { + callable(&call_args, indexes.data()); + }; + break; + default: + OPENVINO_THROW("Unknown RepackingImplType"); + } + +#if defined(__linux__) && defined(SNIPPETS_DEBUG_CAPS) + segfault_detector(); +#endif + + if (m_parallel_exec_domain.size() == rank6D) { + parallel_for6d(initializer, caller); + } else { + parallel_forNd(initializer, caller); + } +} + +void SubgraphDynamicSpecializedExecutor::exec_impl(const std::vector& inMemPtrs, + const std::vector& outMemPtrs) { + const auto& callable = m_schedule->get_callable(); + + OPENVINO_ASSERT(m_data_offsets.size() == inMemPtrs.size() + outMemPtrs.size(), "Incorrect data offset count!"); + OPENVINO_ASSERT(m_data_offsets.front().size() == m_parallel_exec_domain.size(), + "Data offsets with invalid ranks detected"); + + // Note: we need to reset KernelExecutorTable to the state that was recorded in the + // SubgraphDynamicSpecializedExecutor constructor because the table might've been used for other shapes + m_reset_exec_table_state(); + + std::vector src_ptrs; + std::vector dst_ptrs; + init_original_ptrs(inMemPtrs, outMemPtrs, src_ptrs, dst_ptrs, m_start_offset_in, m_start_offset_out); + + initializer_functor initializer; + call_functor caller; + + switch (get_repacking_impl_type()) { + case RepackingImplType::IN_PARALLEL: + initializer = [&](jit_snippets_call_args& call_args, size_t ithr) { + init_call_args(call_args, ithr); + update_scratchpad_ptr(call_args.buffer_scratchpad_ptr, ithr); + clean_repacked_offsets(ithr); + }; + caller = [&](jit_snippets_call_args& call_args, const std::vector& indexes, size_t ithr) { + update_ptrs(call_args, src_ptrs, dst_ptrs, indexes); + in_parallel_repack_inputs(inMemPtrs, indexes, ithr, call_args); + callable(&call_args); + }; + break; + case RepackingImplType::SEPARATE: + case RepackingImplType::NONE: + initializer = [&](jit_snippets_call_args& call_args, size_t ithr) { + init_call_args(call_args, ithr); + update_scratchpad_ptr(call_args.buffer_scratchpad_ptr, ithr); + }; + caller = [&](jit_snippets_call_args& call_args, const std::vector& indexes, size_t ithr) { + update_ptrs(call_args, src_ptrs, dst_ptrs, indexes); + callable(&call_args); + }; + break; + default: + OPENVINO_THROW("Unknown RepackingImplType"); + } + +#if defined(__linux__) && defined(SNIPPETS_DEBUG_CAPS) + segfault_detector(); +#endif + + if (m_parallel_exec_domain.size() == rank6D) { + parallel_for6d(initializer, caller); + } else { + parallel_forNd(initializer, caller); + } +} + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/x64/subgraph.hpp b/src/plugins/intel_cpu/src/nodes/executors/x64/subgraph.hpp new file mode 100644 index 00000000000000..457d4982cf942a --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/x64/subgraph.hpp @@ -0,0 +1,94 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "nodes/executors/subgraph.hpp" + 
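In the IN_PARALLEL mode, the executor declared below appends an external scratchpad after the internal per-thread buffers: one region per repacked input, each holding m_nthreads fixed-size slices, and get_external_scratchpad_ptr walks that layout linearly. A minimal standalone sketch of the addressing scheme (the ordered map and names are illustrative assumptions, not the plugin's API):

    #include <cstddef>
    #include <cstdint>
    #include <map>
    #include <stdexcept>

    // Layout: | input a: nthr * size_a | input b: nthr * size_b | ... |
    // Thread `ithr` owns one `size`-byte slice inside the region of input `idx`.
    inline uint8_t* external_scratchpad_ptr(uint8_t* base,
                                            const std::map<size_t, size_t>& block_sizes,
                                            size_t nthr, size_t ithr, size_t idx) {
        uint8_t* data_ptr = base;
        for (const auto& p : block_sizes) {
            if (p.first == idx)
                return data_ptr + ithr * p.second;  // this thread's slice
            data_ptr += nthr * p.second;            // skip this input's whole region
        }
        throw std::runtime_error("External buffer pointer has not been found");
    }

Because every thread writes only to its own slice, the IN_PARALLEL repacking path needs no synchronization between threads.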
+namespace ov { +namespace intel_cpu { + +class SubgraphExecutor : public SubgraphBaseExecutor { +public: + SubgraphExecutor(const std::shared_ptr& snippet_config, + const std::shared_ptr& snippet_attrs, + const std::shared_ptr& snippet, + const std::vector& start_offset_in, + const std::vector& start_offset_out, + const BufferScratchpadAllocator& allocator, + const ov::intel_cpu::MultiCacheWeakPtr& kernel_cache); + + void execute(const dnnl::stream& strm, + const std::vector& inMemPtrs, + const std::vector& outMemPtrs) override; + +protected: + std::vector separately_repack_inputs(const dnnl::stream& strm, const std::vector& srcMemPtrs); + void in_parallel_repack_inputs(const std::vector& inMemPtrs, + const std::vector& indexes, + int ithr, + jit_snippets_call_args& call_args); + + inline void* get_external_scratchpad_ptr(size_t ithr, size_t idx) const { + if (m_repacked_inputs.empty()) + return nullptr; + + uint8_t* data_ptr = m_buffer_scratchpad->getDataAs() + m_internal_buffer_size; + for (const auto& p : m_repacked_inputs) { + const auto& desc = p.second.desc(); + const auto size = desc->getCurrentMemSize(); + if (p.first == idx) { + return data_ptr + ithr * size; + } + data_ptr += m_nthreads * size; + } + OPENVINO_THROW("External buffer pointer has not been found"); + } + + // [ Thread Index -> Index of input with repacking data -> last repacked src_offset ] + std::vector> m_repacked_offsets_by_threads = {}; + std::unordered_map m_repacked_inputs = {}; + + std::function&, const std::vector&, size_t&)> init_offset = {}; + + using RepackingImplType = CPURuntimeConfig::RepackingImplType; + const RepackingImplType& get_repacking_impl_type() const { + return m_repacking_impl_type; + } + + inline void clean_repacked_offsets(size_t ithr) { + m_repacked_offsets_by_threads[ithr].assign(m_repacked_inputs.size(), std::numeric_limits::max()); + } + +#ifdef SNIPPETS_DEBUG_CAPS + bool enabled_segfault_detector = false; + inline void segfault_detector(); +#endif + +private: + RepackingImplType m_repacking_impl_type = RepackingImplType::NONE; +}; + +class SubgraphStaticExecutor : public SubgraphExecutor, public SubgraphStaticBaseExecutor { +public: + template + SubgraphStaticExecutor(T&& first, Args&&... rest) + : SubgraphExecutor(std::forward(first), std::forward(rest)...), + SubgraphStaticBaseExecutor() {} + + void exec_impl(const std::vector& inMemPtrs, const std::vector& outMemPtrs) override; +}; + +class SubgraphDynamicSpecializedExecutor : public SubgraphExecutor, public SubgraphDynamicSpecializedBaseExecutor { +public: + template + SubgraphDynamicSpecializedExecutor(T&& first, Args&&...
rest) + : SubgraphExecutor(std::forward(first), std::forward(rest)...), + SubgraphDynamicSpecializedBaseExecutor(std::forward(first)) {} + + void exec_impl(const std::vector& inMemPtrs, const std::vector& outMemPtrs) override; +}; + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index ba6f1eda215dce..4a84fc6667406d 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -7,7 +7,6 @@ #include "dnnl_extension_utils.h" #include "onednn/dnnl.h" #include "openvino/core/parallel.hpp" -#include "openvino/core/rt_info.hpp" #include "shape_inference/custom/subgraph.hpp" #include "snippets/lowered/pass/init_loops.hpp" #include "snippets/lowered/pass/insert_buffers.hpp" @@ -27,9 +26,11 @@ #if defined(OPENVINO_ARCH_ARM64) # include "emitters/snippets/aarch64/cpu_generator.hpp" +# include "executors/aarch64/subgraph.hpp" # include "transformations/snippets/aarch64/shape_inference.hpp" #else # include "emitters/snippets/x64/cpu_generator.hpp" +# include "executors/x64/subgraph.hpp" # include "transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.hpp" # include "transformations/snippets/x64/pass/eliminate_brgemm_copy_b.hpp" # include "transformations/snippets/x64/pass/enforce_precision.hpp" @@ -48,13 +49,6 @@ #include "utils/cpu_utils.hpp" #include "utils/ngraph_utils.hpp" -#if defined(__linux__) && defined(OPENVINO_ARCH_X86_64) && defined(SNIPPETS_DEBUG_CAPS) -# include - -# include "emitters/snippets/x64/jit_segfault_detector_emitter.hpp" -std::mutex err_print_lock; -#endif - #ifdef SNIPPETS_LIBXSMM_TPP # include "snippets/lowered/pass/optimize_domain.hpp" # include "transformations/tpp/x64/pass/brgemm_to_brgemm_tpp.hpp" @@ -70,265 +64,76 @@ namespace intel_cpu { namespace node { namespace { -// Class for Subgraphs with static shapes -class SubgraphStaticExecutor : public Subgraph::SubgraphExecutor { -public: - SubgraphStaticExecutor(const std::shared_ptr& snippet_attrs, - const std::shared_ptr& snippet, - const std::vector& start_offset_in, - const std::vector& start_offset_out, - const std::shared_ptr& snippet_config, - const BufferScratchpadAllocator& allocator) - : SubgraphExecutor(snippet_attrs, snippet, start_offset_in, start_offset_out, snippet_config, allocator) {} - - void exec_impl(const std::vector& inMemPtrs, const std::vector& outMemPtrs) override { - const auto& callable = m_schedule->get_callable(); - - auto initializer = [&](jit_snippets_call_args& call_args, size_t ithr) { - init_call_args(call_args, inMemPtrs, outMemPtrs, ithr); - }; - auto caller = [&](jit_snippets_call_args& call_args, const std::vector& indexes) { - callable(&call_args, indexes.data()); - }; - - if (m_parallel_exec_domain.size() == rank6D) { - parallel_for6d(initializer, caller); - } else { - parallel_forNd(initializer, caller); - } - } - -protected: - typedef void (*kernel)(const void*, const void*); - - inline void init_call_args(jit_snippets_call_args& call_args, - const std::vector& srcMemPtrs, - const std::vector& dstMemPtrs, - size_t ithr) { - for (size_t i = 0; i < srcMemPtrs.size(); i++) - call_args.src_ptrs[i] = srcMemPtrs[i]->getDataAs() + m_start_offset_in[i]; - - for (size_t i = 0; i < dstMemPtrs.size(); i++) - call_args.dst_ptrs[i] = dstMemPtrs[i]->getDataAs() + m_start_offset_out[i]; - - update_scratchpad_ptr(call_args.buffer_scratchpad_ptr, ithr); - } -}; - -// Specialized dynamic executor based on shape agnostic kernel for the specific input 
shapes -class SubgraphDynamicSpecializedExecutor : public Subgraph::SubgraphExecutor { -public: - SubgraphDynamicSpecializedExecutor(const std::shared_ptr& snippet_attrs, - const std::shared_ptr& snippet, - const std::vector& start_offset_in, - const std::vector& start_offset_out, - const std::shared_ptr& snippet_config, - const BufferScratchpadAllocator& allocator) - : SubgraphExecutor(snippet_attrs, snippet, start_offset_in, start_offset_out, snippet_config, allocator) { - buffer_offsets = snippet_config->buffer_cluster_offsets; - data_offsets = snippet_config->io_data_offsets; - loop_args = snippet_config->loop_args; - reset_exec_table_state = snippet_config->kernel_executor_table->get_state_reset(); - } - - void exec_impl(const std::vector& inMemPtrs, const std::vector& outMemPtrs) override { - const auto& callable = m_schedule->get_callable(); - - OPENVINO_ASSERT(data_offsets.size() == inMemPtrs.size() + outMemPtrs.size(), "Incorrect data offset count!"); - OPENVINO_ASSERT(data_offsets.front().size() == m_parallel_exec_domain.size(), - "Data offsets with invalid ranks detected"); - - // Note: we need to reset KernelExecutorTable to the state that was recorded in the - // SubgraphDynamicSpecializedExecutor constructor because the table might've been used for other shapes - reset_exec_table_state(); - - std::vector src_ptrs; - std::vector dst_ptrs; - init_original_ptrs(inMemPtrs, outMemPtrs, src_ptrs, dst_ptrs); - - auto initializer = [&](jit_snippets_call_args& call_args, size_t ithr) { - init_call_args(call_args, ithr); - }; - auto caller = [&](jit_snippets_call_args& call_args, const std::vector& indexes) { - update_ptrs(call_args, src_ptrs, dst_ptrs, indexes); - callable(&call_args); - }; - - if (m_parallel_exec_domain.size() == rank6D) { - parallel_for6d(initializer, caller); - } else { - parallel_forNd(initializer, caller); - } - } - -protected: - typedef void (*dynamic_kernel)(const void*); - - inline void init_call_args(jit_snippets_call_args& call_args, size_t ithr) { - call_args.register_loops(loop_args); - std::copy(buffer_offsets.cbegin(), buffer_offsets.cend(), call_args.buffer_offsets); - - update_scratchpad_ptr(call_args.buffer_scratchpad_ptr, ithr); - } - - inline void init_original_ptrs(const std::vector& srcMemPtrs, - const std::vector& dstMemPtrs, - std::vector& src_ptrs, - std::vector& dst_ptrs) { - const auto in_num = srcMemPtrs.size(); - const auto out_num = dstMemPtrs.size(); - - src_ptrs.resize(in_num, nullptr); - dst_ptrs.resize(out_num, nullptr); - - for (size_t i = 0; i < in_num; i++) - src_ptrs[i] = srcMemPtrs[i]->getDataAs() + m_start_offset_in[i]; - for (size_t i = 0; i < out_num; i++) - dst_ptrs[i] = dstMemPtrs[i]->getDataAs() + m_start_offset_out[i]; - } - - inline void update_ptrs(jit_snippets_call_args& call_args, - const std::vector& src_ptrs, - const std::vector& dst_ptrs, - const std::vector& indexes) const { - for (size_t i = 0; i < src_ptrs.size(); i++) { - auto i_ptr = src_ptrs[i]; - for (size_t j = 0; j < indexes.size(); j++) { - i_ptr += data_offsets[i][j] * indexes[j]; - } - call_args.src_ptrs[i] = i_ptr; - } - for (size_t i = 0; i < dst_ptrs.size(); i++) { - auto i_ptr = dst_ptrs[i]; - for (size_t j = 0; j < indexes.size(); j++) { - i_ptr += data_offsets[i + src_ptrs.size()][j] * indexes[j]; - } - call_args.dst_ptrs[i] = i_ptr; - } - } - - std::vector buffer_offsets = {}; - std::vector> data_offsets = {}; - std::vector loop_args = {}; - std::function reset_exec_table_state; -}; - +#if defined(OPENVINO_ARCH_X86_64) || 
defined(OPENVINO_ARCH_ARM64) struct SubgraphKey { SubgraphKey() = default; - SubgraphKey(const std::shared_ptr& attrs_, const std::vector& in_shapes_) + SubgraphKey(const std::shared_ptr& attrs_, const std::vector& in_shapes_) : attrs(attrs_), in_shapes(in_shapes_) {} virtual ~SubgraphKey() = default; - size_t hash() const; - bool operator==(const SubgraphKey& rhs) const; + size_t hash() const { + using namespace dnnl::impl; + using namespace dnnl::impl::primitive_hashing; + + size_t seed = get_attr_hash(0, attrs); + for (const auto& shape : in_shapes) + seed = get_vector_hash(seed, shape); + + return seed; + } + bool operator==(const SubgraphKey& rhs) const { + return *attrs == *rhs.attrs && in_shapes == rhs.in_shapes; + } - std::shared_ptr attrs = nullptr; + std::shared_ptr attrs = nullptr; std::vector in_shapes = {}; }; struct SubgraphCodeGeneratorKey { - SubgraphCodeGeneratorKey(const std::shared_ptr& attrs_, uint8_t mask_) + SubgraphCodeGeneratorKey(const std::shared_ptr& attrs_, uint8_t mask_) : attrs(attrs_), broadcasting_mask(mask_) {} - size_t hash() const; - bool operator==(const SubgraphCodeGeneratorKey& rhs) const; + size_t hash() const { + using namespace dnnl::impl; + using namespace dnnl::impl::primitive_hashing; - std::shared_ptr attrs = nullptr; + size_t seed = get_attr_hash(0, attrs); + return hash_combine(seed, broadcasting_mask); + } + bool operator==(const SubgraphCodeGeneratorKey& rhs) const { + return *attrs == *rhs.attrs && broadcasting_mask == rhs.broadcasting_mask; + } + + std::shared_ptr attrs = nullptr; uint32_t broadcasting_mask = 0; }; +#endif struct SubgraphShapeInferResultKey { SubgraphShapeInferResultKey(std::vector in_shapes_, uint64_t body_hash_) : in_shapes(std::move(in_shapes_)), body_hash(body_hash_) {} - size_t hash() const; - bool operator==(const SubgraphShapeInferResultKey& rhs) const; + size_t hash() const { + using namespace dnnl::impl; + using namespace dnnl::impl::primitive_hashing; + + size_t seed = hash_combine(0, body_hash); + for (const auto& shape : in_shapes) + seed = get_vector_hash(seed, shape); + + return seed; + } + bool operator==(const SubgraphShapeInferResultKey& rhs) const { + return body_hash == rhs.body_hash && in_shapes == rhs.in_shapes; + } std::vector in_shapes = {}; uint64_t body_hash = 0; }; -size_t get_attr_hash(size_t seed, const std::shared_ptr& attrs) { - using namespace dnnl::impl; - using namespace dnnl::impl::primitive_hashing; - - for (const auto& order : attrs->inMemOrders) - seed = get_vector_hash(seed, order); - for (const auto& prec : attrs->inMemPrecs) - seed = hash_combine(seed, prec.hash()); - - for (const auto& order : attrs->outMemOrders) - seed = get_vector_hash(seed, order); - for (const auto& prec : attrs->outMemPrecs) - seed = hash_combine(seed, prec.hash()); - - seed = hash_combine(seed, attrs->bodyHash); - return seed; -} - -size_t SubgraphKey::hash() const { - using namespace dnnl::impl; - using namespace dnnl::impl::primitive_hashing; - - size_t seed = get_attr_hash(0, attrs); - for (const auto& shape : in_shapes) - seed = get_vector_hash(seed, shape); - - return seed; -} - -size_t SubgraphCodeGeneratorKey::hash() const { - using namespace dnnl::impl; - using namespace dnnl::impl::primitive_hashing; - - size_t seed = get_attr_hash(0, attrs); - seed = hash_combine(seed, broadcasting_mask); - - return seed; -} - -size_t SubgraphShapeInferResultKey::hash() const { - using namespace dnnl::impl; - using namespace dnnl::impl::primitive_hashing; - - size_t seed = hash_combine(0, body_hash); - for (const 
auto& shape : in_shapes) - seed = get_vector_hash(seed, shape); - - return seed; -} - -bool operator==(const Subgraph::SubgraphAttrs& lhs, const Subgraph::SubgraphAttrs& rhs) { - if (&lhs == &rhs) - return true; - if (lhs.bodyHash != rhs.bodyHash) - return false; - if (lhs.inMemOrders.size() != rhs.inMemOrders.size() || lhs.inMemPrecs.size() != rhs.inMemPrecs.size()) - return false; - if (lhs.outMemOrders.size() != rhs.outMemOrders.size() || lhs.outMemPrecs.size() != rhs.outMemPrecs.size()) - return false; - if (lhs.inMemOrders != rhs.inMemOrders || lhs.inMemPrecs != rhs.inMemPrecs) - return false; - if (lhs.outMemOrders != rhs.outMemOrders || lhs.outMemPrecs != rhs.outMemPrecs) - return false; - return true; -} - -bool SubgraphKey::operator==(const SubgraphKey& rhs) const { - return *attrs == *rhs.attrs && in_shapes == rhs.in_shapes; -} - -bool SubgraphCodeGeneratorKey::operator==(const SubgraphCodeGeneratorKey& rhs) const { - return *attrs == *rhs.attrs && broadcasting_mask == rhs.broadcasting_mask; -} - -bool SubgraphShapeInferResultKey::operator==(const SubgraphShapeInferResultKey& rhs) const { - return body_hash == rhs.body_hash && in_shapes == rhs.in_shapes; -} - struct SubgraphShapeInferResult { SubgraphShapeInferResult(IShapeInfer::Result res) : result(std::move(res)) {} @@ -352,7 +157,8 @@ Subgraph::Subgraph(const std::shared_ptr& op, const GraphContext::CPtr subgraph_attrs->bodyHash = getBodyHash(tmp_snippet); #if defined(OPENVINO_ARCH_ARM64) - subgraph_attrs->snippet->set_generator(std::make_shared(host_isa)); + subgraph_attrs->snippet->set_generator( + std::make_shared(host_isa, context->getParamsCache())); #elif defined(OPENVINO_ARCH_X86_64) subgraph_attrs->snippet->set_generator(std::make_shared(host_isa, context->getParamsCache())); #else @@ -796,12 +602,13 @@ void Subgraph::optimizeIR() { } void Subgraph::prepareParams() { +#if defined(OPENVINO_ARCH_X86_64) || defined(OPENVINO_ARCH_ARM64) const auto& cache = context->getParamsCache(); - auto builder = [this, &cache](const SubgraphKey& key) -> std::shared_ptr { + auto builder = [this, &cache](const SubgraphKey& key) -> std::shared_ptr { const auto& snippet = subgraph_attrs->snippet; - SubgraphExecutor::BufferScratchpadAllocator allocator = [this](size_t size) { + SubgraphBaseExecutor::BufferScratchpadAllocator allocator = [this](size_t size) { return getScratchPadMem(std::make_shared(ov::element::u8, intel_cpu::Shape{size})); }; @@ -824,12 +631,13 @@ void Subgraph::prepareParams() { code_gen->get()->lowering_result.kernel_executor_table); } const auto& snippet_config = ov::as_type_ptr(snippet->update_runtime_config()); - return std::make_shared(key.attrs, + return std::make_shared(snippet_config, + key.attrs, code_gen, start_offset_in, start_offset_out, - snippet_config, - allocator); + allocator, + cache); } else { // Static case: // 1. 
Update runtime config to get static scheduling data (io data offsets, parallel domain) which will be @@ -842,17 +650,20 @@ void Subgraph::prepareParams() { [&snippet_config](const SubgraphCodeGeneratorKey& key) -> std::shared_ptr { return std::make_shared(key.attrs, snippet_config); }); - return std::make_shared(key.attrs, + return std::make_shared(snippet_config, + key.attrs, code_gen_result.first, start_offset_in, start_offset_out, - snippet_config, - allocator); + allocator, + cache); } }; const auto result = cache->getOrCreate(SubgraphKey(subgraph_attrs, in_shapes), builder); execPtr = result.first; +#endif + OPENVINO_ASSERT(execPtr != nullptr, "Executor is not created for node ", getName(), "."); } @@ -907,191 +718,6 @@ void Subgraph::executeDynamicImpl(dnnl::stream strm) { execute(strm); } -namespace { -inline void init_parallel_domain(const std::shared_ptr& snippet_config, std::vector& domain) { - const auto& master_shape = snippet_config->master_shape; - const auto& tensor_rank = snippet_config->tensor_rank; - const auto& tile_rank = snippet_config->tile_rank; - domain.resize(tensor_rank, 1); - - std::fill(domain.begin(), domain.end(), 1); - std::copy(master_shape.cbegin(), - master_shape.cbegin() + (master_shape.size() - tile_rank), - domain.begin() + (tensor_rank - master_shape.size())); -} -} // namespace - -Subgraph::SubgraphCodeGenerator::SubgraphCodeGenerator(const std::shared_ptr& snippet_attrs, - const std::shared_ptr& config) { - OPENVINO_ASSERT(snippet_attrs, "Subgraph attributes are empty!"); - OPENVINO_ASSERT(config, "Runtime Config is empty!"); - - jit_snippets_compile_args jcp; - jcp.data_offsets = config->io_data_offsets; - init_parallel_domain(config, jcp.exec_domain); - schedule = - std::make_shared(snippet_attrs->snippet->generate(reinterpret_cast(&jcp))); -} - -Subgraph::SubgraphExecutor::SubgraphExecutor(const std::shared_ptr& snippet_attrs, - const std::shared_ptr& snippet, - const std::vector& start_offset_in, - const std::vector& start_offset_out, - const std::shared_ptr& snippet_config, - const BufferScratchpadAllocator& allocator) - : m_schedule(snippet->get()), - m_start_offset_in(start_offset_in), - m_start_offset_out(start_offset_out) { - OPENVINO_ASSERT(m_schedule, "Schedule is empty!"); - OPENVINO_ASSERT(snippet_config, "Runtime Config is empty!"); - init_parallel_domain(snippet_config, m_parallel_exec_domain); - - m_harness_work_amount = std::accumulate(m_parallel_exec_domain.cbegin(), - m_parallel_exec_domain.cend(), - size_t(1), - std::multiplies()); - m_nthreads = std::min(parallel_get_max_threads(), static_cast(m_harness_work_amount)); - - m_buffer_scratchpad_size = snippet_config->buffer_scratchpad_size; - OPENVINO_ASSERT(!ov::snippets::utils::is_dynamic_value(m_buffer_scratchpad_size), - "Undefined buffer scratchpad size!"); - m_internal_buffer_size = static_cast(m_nthreads) * m_buffer_scratchpad_size; - m_in_requested_descs = snippet_config->m_in_requested_descs; - const auto external_repacking_buffer_size = - std::accumulate(m_in_requested_descs.begin(), - m_in_requested_descs.end(), - size_t(0), - [](size_t sum, const std::pair& requested_desc_elem) { - return sum + requested_desc_elem.second->getCurrentMemSize(); - }); - m_buffer_scratchpad = allocator(m_internal_buffer_size + external_repacking_buffer_size); - -#if defined(__linux__) && defined(OPENVINO_ARCH_X86_64) && defined(SNIPPETS_DEBUG_CAPS) - const auto target = std::dynamic_pointer_cast( - snippet_attrs->snippet->get_generator()->get_target_machine()); - enabled_segfault_detector = 
target && target->debug_config.enable_segfault_detector; -#endif -} - -#if defined(__linux__) && defined(OPENVINO_ARCH_X86_64) && defined(SNIPPETS_DEBUG_CAPS) -void Subgraph::SubgraphExecutor::segfault_detector() { - if (enabled_segfault_detector) { - __sighandler_t signal_handler = [](int signal) { - std::lock_guard guard(err_print_lock); - if (auto segfault_detector_emitter = ov::intel_cpu::g_custom_segfault_handler->local()) - std::cout << segfault_detector_emitter->info() << std::endl; - auto tid = parallel_get_thread_num(); - OPENVINO_THROW("Segfault was caught by the signal handler in subgraph node execution on thread " + - std::to_string(tid)); - }; - struct sigaction new_handler {}; - new_handler.sa_handler = signal_handler; - sigaction(SIGSEGV, &new_handler, nullptr); - } -} -#endif - -void Subgraph::SubgraphExecutor::parallel_for6d( - const std::function& initializer, - const std::function&)>& caller) { - const auto& dom = m_parallel_exec_domain; - -#if defined(__linux__) && defined(OPENVINO_ARCH_X86_64) && defined(SNIPPETS_DEBUG_CAPS) - segfault_detector(); -#endif - - parallel_nt_static(m_nthreads, [&](const int ithr, const int nthr) { - jit_snippets_call_args call_args; - initializer(call_args, ithr); - - size_t start = 0, end = 0; - splitter(m_harness_work_amount, nthr, ithr, start, end); - - std::vector indexes{0, 0, 0, 0, 0}; - parallel_it_init(start, - indexes[0], - dom[0], - indexes[1], - dom[1], - indexes[2], - dom[2], - indexes[3], - dom[3], - indexes[4], - dom[4]); - for (size_t iwork = start; iwork < end; ++iwork) { - caller(call_args, indexes); - parallel_it_step(indexes[0], - dom[0], - indexes[1], - dom[1], - indexes[2], - dom[2], - indexes[3], - dom[3], - indexes[4], - dom[4]); - } - }); -} - -void Subgraph::SubgraphExecutor::parallel_forNd( - const std::function& initializer, - const std::function&)>& caller) { - const auto& dom = m_parallel_exec_domain; - -#if defined(__linux__) && defined(OPENVINO_ARCH_X86_64) && defined(SNIPPETS_DEBUG_CAPS) - segfault_detector(); -#endif - - parallel_nt_static(m_nthreads, [&](const int ithr, const int nthr) { - jit_snippets_call_args call_args; - initializer(call_args, ithr); - - size_t start = 0, end = 0; - splitter(m_harness_work_amount, nthr, ithr, start, end); - - std::vector indexes(dom.size() - 1, 0); - for (size_t iwork = start; iwork < end; ++iwork) { - size_t tmp = iwork; - for (ptrdiff_t j = static_cast(dom.size()) - 2; j >= 0; j--) { - indexes[j] = tmp % dom[j]; - tmp /= dom[j]; - } - - caller(call_args, indexes); - } - }); -} - -void Subgraph::SubgraphExecutor::execute(const dnnl::stream& strm, - const std::vector& inMemPtrs, - const std::vector& outMemPtrs) { - if (!m_in_requested_descs.empty()) { - auto reorderedInMemPtrs = reorder_inputs(strm, inMemPtrs); - exec_impl(reorderedInMemPtrs, outMemPtrs); - } else { - exec_impl(inMemPtrs, outMemPtrs); - } -} - -std::vector Subgraph::SubgraphExecutor::reorder_inputs(const dnnl::stream& strm, - const std::vector& inMemPtrs) { - auto reordered_in_ptrs = inMemPtrs; - size_t offset = m_internal_buffer_size; - for (const auto& requested_descs_elem : m_in_requested_descs) { - const auto in_idx = requested_descs_elem.first; - const auto& requested_desc = requested_descs_elem.second; - - const void* data_ptr = m_buffer_scratchpad->getDataAs() + offset; - const auto scratch_mem = std::make_shared(strm.get_engine(), requested_desc, data_ptr, false); - scratch_mem->load(*reordered_in_ptrs[in_idx]); - reordered_in_ptrs[in_idx] = scratch_mem; - offset += 
requested_desc->getCurrentMemSize(); - } - return reordered_in_ptrs; -} - } // namespace node } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.h b/src/plugins/intel_cpu/src/nodes/subgraph.h index a84e46d9ae02da..9e6cb3cd49a9d7 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.h +++ b/src/plugins/intel_cpu/src/nodes/subgraph.h @@ -4,10 +4,8 @@ #pragma once -#include "emitters/snippets/cpu_runtime_configurator.hpp" -#include "emitters/snippets/jit_snippets_call_args.hpp" +#include "executors/subgraph.hpp" #include "node.h" -#include "snippets/op/subgraph.hpp" #if defined(OPENVINO_ARCH_ARM64) # include "cpu/aarch64/cpu_isa_traits.hpp" @@ -15,8 +13,6 @@ # include "cpu/x64/cpu_isa_traits.hpp" #endif -#include - namespace ov { namespace intel_cpu { namespace node { @@ -41,21 +37,6 @@ class Subgraph : public Node { void execute(dnnl::stream strm) override; void executeDynamicImpl(dnnl::stream strm) override; - struct SubgraphAttrs { - // Local copy of subgraph node for canonization & code generation - std::shared_ptr snippet; - uint64_t bodyHash; - std::vector inMemOrders; - std::vector outMemOrders; - std::vector inMemPrecs; - std::vector outMemPrecs; - }; - - // Class for snippet compilation - class SubgraphCodeGenerator; - // Base class for executors - class SubgraphExecutor; - protected: IShapeInfer::Result shapeInfer() const override; @@ -103,79 +84,7 @@ class Subgraph : public Node { // Input shapes that are used in PrepareParams and ShapeInfer to avoid frequent memory allocation mutable std::vector in_shapes; - std::shared_ptr execPtr = nullptr; -}; - -class Subgraph::SubgraphCodeGenerator { -public: - SubgraphCodeGenerator(const std::shared_ptr& snippet_attrs, - const std::shared_ptr& config); - - const std::shared_ptr& get() const { - return schedule; - } - -private: - std::shared_ptr schedule; -}; - -class Subgraph::SubgraphExecutor { -public: - using BufferScratchpadAllocator = std::function; - - SubgraphExecutor(const std::shared_ptr& snippet_attrs, - const std::shared_ptr& snippet, - const std::vector& start_offset_in, - const std::vector& start_offset_out, - const std::shared_ptr& snippet_config, - const BufferScratchpadAllocator& allocator); - virtual ~SubgraphExecutor() = default; - - void execute(const dnnl::stream& strm, - const std::vector& inMemPtrs, - const std::vector& outMemPtrs); - -protected: - virtual void exec_impl(const std::vector& inMemPtrs, const std::vector& outMemPtrs) = 0; - - void parallel_for6d(const std::function& initializer, - const std::function&)>& caller); - void parallel_forNd(const std::function& initializer, - const std::function&)>& caller); - - inline void update_scratchpad_ptr(void*& scratchpad_ptr, size_t ithr) const { - if (m_buffer_scratchpad_size > 0) - scratchpad_ptr = m_buffer_scratchpad->getDataAs() + ithr * m_buffer_scratchpad_size; - } - - std::shared_ptr m_schedule; - // Holds index of output used as in execution domain - // it should be compatible with a schedule's work size - std::vector m_parallel_exec_domain = {}; - size_t m_harness_work_amount = 0; - - // Buffer scratchpad - MemoryPtr m_buffer_scratchpad = nullptr; - size_t m_buffer_scratchpad_size = 0; - size_t m_internal_buffer_size = 0; - - const size_t rank6D = 6; - - // Count of threads for parallel_nt - int m_nthreads = 0; - - std::vector m_start_offset_in = {}; - std::vector m_start_offset_out = {}; - -#ifdef SNIPPETS_DEBUG_CAPS - bool enabled_segfault_detector = false; - inline void segfault_detector(); -#endif - -private: - 
std::vector reorder_inputs(const dnnl::stream& strm, const std::vector& inMemPtrs); - - std::unordered_map m_in_requested_descs = {}; + std::shared_ptr execPtr = nullptr; }; } // namespace node diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_b.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_b.cpp index 7e52905145869f..ce57cd1529b893 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_b.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_b.cpp @@ -114,6 +114,13 @@ size_t BrgemmCopyB::get_offset_compensations() const { return get_output_offset(1); } +bool BrgemmCopyB::is_transposed(const std::vector& layout) { + const auto is_transposed = !layout.empty() && layout.back() != layout.size() - 1; + OPENVINO_ASSERT(IMPLICATION(is_transposed, (layout[layout.size() - 2] == layout.size() - 1)), + "BrgemmCopyB supports only the N dim placed as the last or second-to-last dimension"); + return is_transposed; +} + BrgemmCopyB::ShapeInfer::ShapeInfer(const std::shared_ptr& n) { const auto& brg_copyb = ov::as_type_ptr(n); OPENVINO_ASSERT(brg_copyb, "Got invalid node in BrgemmCopyB::ShapeInfer"); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_b.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_b.hpp index 54e2c39fcf1c06..b4e7b030fc605b 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_b.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_b.hpp @@ -72,6 +72,8 @@ class BrgemmCopyB : public snippets::modifier::MemoryAccess, public ov::op::Op { Result infer(const std::vector& input_shapes) override; }; + static bool is_transposed(const std::vector& layout); + private: void custom_constructor_validate_and_infer_types(std::vector layout_input = {}); void validate_element_type(const ov::element::Type& element_type); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.cpp index 939ae93ad92b18..b87a78c6b0cb40 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.cpp @@ -10,6 +10,7 @@ #include "openvino/pass/pattern/op/wrap_type.hpp" #include "snippets/itt.hpp" #include "snippets/op/rank_normalization.hpp" +#include "snippets/op/reorder.hpp" #include "transformations/snippets/x64/op/brgemm_copy_b.hpp" namespace ov { @@ -30,12 +31,26 @@ pass::EliminateBrgemmCopyB::EliminateBrgemmCopyB() { const auto& in_desc = snippets::lowered::PortDescriptorUtils::get_port_descriptor_ptr(copy_b_node->input(0)); const auto& layout = in_desc->get_layout(); - // TODO: - // 1. Ticket 157340: support external repacking for copyB with compensations - // 2. 
Ticket 157339: support external repacking for non-planar layout - if (!ov::snippets::utils::is_planar_layout(layout) || - brgemm_utils::with_compensations(copy_b_node->get_type()) || transformation_callback(copy_b_node)) + + auto is_supported_layout = [](const std::vector& layout) { + return layout.empty() || (layout.size() - 1 == layout.back()); + }; + + // TODO [157340]: support external repacking for copyB with compensations + if (!is_supported_layout(layout) || brgemm_utils::with_compensations(copy_b_node->get_type()) || + transformation_callback(copy_b_node)) return false; + + // If the layout is non-planar, we should insert a Reorder to support shape inference + if (!ov::snippets::utils::is_planar_layout(layout)) { + const auto& subtensor = in_desc->get_subtensor(); + const auto& reshape = std::make_shared(copy_b_node->input_value(0), layout); + ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor(reshape->input(0), subtensor, layout); + ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor(reshape->output(0), subtensor); + return ov::replace_node_update_name(copy_b_node, reshape); + } + + // Otherwise the layout is planar, so we can just remove BrgemmCopyB from the subgraph return ov::replace_output_update_name(copy_b_out, copy_b_node->input_value(0)); }; diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp index 1cb8263d189d18..16df97bb209ed9 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp @@ -70,8 +70,11 @@ bool pass::AdjustBrgemmCopyBLoopPorts::run(const snippets::lowered::LinearIR& li auto get_repacking_loop_idces = [](const snippets::lowered::ExpressionPtr& brgemm_expr) { // Repacking may be extracted outside the snippets kernel. In this case, the brgemm parent expression is a // Parameter. - if (is_type( - brgemm_expr->get_input_port_connector(1)->get_source().get_expr()->get_node())) + const auto& brgemm_in1 = brgemm_expr->get_input_port_connector(1)->get_source(); + const auto& shape_infer_seq = ov::snippets::utils::get_first_parent_shape_infer_expr_seq(brgemm_in1.get_expr()); + const auto source = + shape_infer_seq.empty() ? 
brgemm_in1 : shape_infer_seq.back()->get_input_port_connector(0)->get_source(); + if (is_type(source.get_expr()->get_node())) return std::vector{}; const auto repacking_expr = brgemm_utils::repacking::get_copy_b_expr(brgemm_expr); OPENVINO_ASSERT(repacking_expr, "BrgemmCopyB expression is not found"); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp index 78f9b928298a9d..add7c66d3d7ffc 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp @@ -5,6 +5,7 @@ #include "external_repacking_adjuster.hpp" #include "emitters/snippets/cpu_runtime_configurator.hpp" +#include "emitters/snippets/x64/kernel_executors/brgemm_copy_b.hpp" #include "memory_desc/cpu_blocked_memory_desc.h" #include "snippets/itt.hpp" #include "snippets/utils/utils.hpp" @@ -14,59 +15,142 @@ namespace ov { namespace intel_cpu { +const size_t BrgemmExternalRepackingAdjuster::brgemm_kernel_rank = 2; + BrgemmExternalRepackingAdjuster::BrgemmExternalRepackingAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir, const CPURuntimeConfigurator* configurator) : snippets::lowered::pass::RuntimeOptimizer(configurator) { const auto& params = linear_ir->get_parameters(); for (size_t i = 0; i < params.size(); ++i) { const auto& param = params[i]; - const auto consumers = param->get_output_port_connector(0)->get_consumers(); - const bool brgemm_with_extracted_repacking = - std::any_of(consumers.begin(), consumers.end(), [](const ov::snippets::lowered::ExpressionPort& port) { - auto brgemm = ov::as_type_ptr(port.get_expr()->get_node()); - return brgemm && brgemm_utils::with_repacking(brgemm->get_type()) && port.get_index() == 1; - }); - if (brgemm_with_extracted_repacking) { - m_param_idces_with_external_repacking.insert(i); - // Ticket 157339: Support non-planar layout - OPENVINO_ASSERT(ov::snippets::utils::is_planar_layout(configurator->get_io_descs()[i]->get_layout()), - "Non-planar layout is not supported for external repacking"); + const auto& shape_infer_consumers = ov::snippets::utils::get_first_child_shape_infer_expr_seq(param); + const auto& out = shape_infer_consumers.empty() ? 
param->get_output_port(0) + : shape_infer_consumers.back()->get_output_port(0); + const auto consumers = out.get_connected_ports(); + + for (const auto& consumer : consumers) { + auto brgemm = ov::as_type_ptr(consumer.get_expr()->get_node()); + if (brgemm && brgemm_utils::with_repacking(brgemm->get_type()) && consumer.get_index() == 1) { + const auto src_prc = brgemm->get_input_element_type(0); + const auto wei_prc = brgemm->get_input_element_type(1); + const auto isa = brgemm_utils::get_primitive_isa(src_prc, brgemm_utils::with_amx(brgemm->get_type())); + const auto inner_n_block = brgemm_utils::repacking::compute_inner_n_block(wei_prc); + const auto is_transposed_b = + BrgemmCopyB::is_transposed(m_configurator->get_io_descs()[i]->get_layout()); + auto config = BrgemmCopyBKernelConfig(src_prc, wei_prc, isa, false, is_transposed_b, inner_n_block); + m_executors[i] = std::make_shared(configurator->get_cache(), config); + } } } } +VectorDims BrgemmExternalRepackingAdjuster::get_blk_order(size_t shape_rank) { + VectorDims order(shape_rank - brgemm_kernel_rank); + std::iota(order.begin(), order.end(), 0); + const auto last_idx = shape_rank - 1; + order.insert(order.end(), {last_idx - 1, last_idx, last_idx - 1}); + return order; +} + +VectorDims BrgemmExternalRepackingAdjuster::get_blk_shape(const VectorDims& planar_shape, ov::element::Type prc) { + const auto vnni_factor = brgemm_utils::compute_vnni_factor(prc); + const auto K = *++planar_shape.rbegin(); + const auto N = *planar_shape.rbegin(); + const auto new_K = snippets::utils::div_up(K, vnni_factor); + const auto new_N = std::max(N, brgemm_utils::repacking::compute_inner_n_block(prc)); + VectorDims blk_shape(planar_shape.begin(), planar_shape.end() - brgemm_kernel_rank); + blk_shape.insert(blk_shape.end(), {new_K, new_N, vnni_factor}); + return blk_shape; +} + +void BrgemmExternalRepackingAdjuster::update_kernel(const RepackExecutorPtr& executor, + const VectorDims& shape, + const VectorDims& layout, + size_t N, + size_t K, + ov::element::Type prc) { + const auto generic_config = executor->get_config().get_clone_ptr(); + auto config = static_cast(generic_config.get()); + const auto idx = config->is_transposed_B() ? 
0 : 1; + const auto copy_wei_stride = ov::snippets::utils::get_dim_in_stride(shape, layout, idx) * prc.size(); + config->update(N, N, K, K, copy_wei_stride, brgemm_utils::repacking::compute_LDB(N, prc)); + executor->update_by_config(*config); +} + bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& linear_ir) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::BrgemmExternalRepackingAdjuster") const auto& cpu_config = ov::as_type_ptr(m_configurator->get_config()); - auto& optimal_descs = cpu_config->m_in_requested_descs; - for (const auto& i : m_param_idces_with_external_repacking) { + + size_t data_size = 0; + for (const auto& p : m_executors) { + const auto& i = p.first; const auto& shape = cpu_config->io_shapes[i]; - const auto& K = *++shape.rbegin(); - const auto& N = *shape.rbegin(); - - const auto& precision = linear_ir.get_parameters()[i]->get_node()->get_output_element_type(0); - const auto vnni_factor = brgemm_utils::compute_vnni_factor(precision); - const size_t brgemm_kernel_rank = 2; - // Firstly, batch dims are set - VectorDims requested_blocked_shape(shape.begin(), shape.end() - brgemm_kernel_rank); - // Then, the blocked dims are formed - requested_blocked_shape.insert(requested_blocked_shape.end(), - {snippets::utils::div_up(K, vnni_factor), - std::max(N, brgemm_utils::repacking::compute_inner_n_block(precision)), - vnni_factor}); - - VectorDims requested_order(shape.size() - brgemm_kernel_rank); - std::iota(requested_order.begin(), requested_order.end(), 0); - const auto last_idx = shape.size() - 1; - requested_order.insert(requested_order.end(), {last_idx - 1, last_idx, last_idx - 1}); - - optimal_descs[i] = - std::make_shared(precision, Shape(shape), requested_blocked_shape, requested_order); - - ov::snippets::VectorDims shape_for_offset(cpu_config->tensor_rank - shape.size(), 1); - shape_for_offset.insert(shape_for_offset.end(), requested_blocked_shape.begin(), requested_blocked_shape.end()); - m_configurator->compute_offsets(shape_for_offset, i, 0); + + const auto& layout = cpu_config->io_layouts[i]; + const auto planar_shape = ov::snippets::utils::get_planar_vdims(shape, layout); + const auto& K = *++planar_shape.rbegin(); + const auto& N = *planar_shape.rbegin(); + + const auto& prc = linear_ir.get_parameters()[i]->get_node()->get_output_element_type(0); + const auto blk_shape = get_blk_shape(planar_shape, prc); + + // src data + dst data per kernel call + const auto src_data = N * K * prc.size(); + const auto dst_data = + std::accumulate(blk_shape.rbegin(), blk_shape.rbegin() + 3, prc.size(), std::multiplies()); + data_size += src_data + dst_data; + + update_kernel(p.second, shape, layout, N, K, prc); } + + const auto cache_size = dnnl::utils::get_cache_size(1, true) + dnnl::utils::get_cache_size(2, true); + const auto fit_into_cache = data_size < cache_size; + // Heuristic: if the external repacking data doesn't fit into the L1 and L2 caches, + // external repacking should be executed in a separate parallel section before kernel execution. + cpu_config->repacking_impl_type = fit_into_cache ? 
CPURuntimeConfig::RepackingImplType::IN_PARALLEL + : CPURuntimeConfig::RepackingImplType::SEPARATE; + + const auto is_impl_parallel = cpu_config->repacking_impl_type == CPURuntimeConfig::RepackingImplType::IN_PARALLEL; + + for (const auto& p : m_executors) { + const auto& i = p.first; + const auto& shape = cpu_config->io_shapes[i]; + auto& repacked_in = cpu_config->repacked_inputs[i]; + + const auto& prc = linear_ir.get_parameters()[i]->get_node()->get_output_element_type(0); + auto planar_shape = ov::snippets::utils::get_planar_vdims(shape, cpu_config->io_layouts[i]); + auto blk_shape = get_blk_shape(planar_shape, prc); + // In the parallel impl, each thread needs only a buffer of shape [K_blk, N_blk, VNNI] to store repacked data + if (is_impl_parallel) { + std::fill(planar_shape.rbegin() + brgemm_kernel_rank, planar_shape.rend(), 1); + std::fill(blk_shape.rbegin() + brgemm_kernel_rank + 1, blk_shape.rend(), 1); + } + const auto order = get_blk_order(planar_shape.size()); + const auto desc = std::make_shared(prc, Shape(planar_shape), blk_shape, order); + + // Save the original input offsets of the input before repacking. + // If the shape has not changed, we already created `RepackedInput` for this input on a previous pass + // call, so `cpu_config->io_data_offsets[i]` no longer contains the original input offsets: + // they were updated for blocked shapes (or zeroed) by the previous initialization and cannot be + // reused as the original offsets. + const auto in_offsets = + shape == cpu_config->latest_shapes[i] ? repacked_in.in_offsets() : cpu_config->io_data_offsets[i]; + + // In the parallel case, the kernel should not add offsets to repacked inputs because + // they will be applied during repacking at the execution stage + if (is_impl_parallel) { + auto& offsets = cpu_config->io_data_offsets[i]; + std::fill(offsets.begin(), offsets.end(), 0); + } else { + ov::snippets::VectorDims shape_for_offset(cpu_config->tensor_rank - shape.size(), 1); + shape_for_offset.insert(shape_for_offset.end(), blk_shape.begin(), blk_shape.end()); + m_configurator->compute_offsets(shape_for_offset, i, 0); + } + const auto out_offsets = cpu_config->io_data_offsets[i]; + + repacked_in = RepackedInput(p.second->get_kernel(), desc, in_offsets, out_offsets); + } + return true; } diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp index 4d0c9586f3be31..2ef0b382a6ad8c 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp @@ -5,6 +5,7 @@ #pragma once #include "emitters/snippets/cpu_runtime_configurator.hpp" +#include "emitters/snippets/x64/kernel_executors/brgemm_copy_b.hpp" #include "snippets/lowered/pass/runtime_optimizer.hpp" #include "snippets/runtime_configurator.hpp" @@ -24,11 +25,23 @@ class BrgemmExternalRepackingAdjuster : public ov::snippets::lowered::pass::Runt bool run(const snippets::lowered::LinearIR& linear_ir) override; bool applicable() const override { - return !m_param_idces_with_external_repacking.empty(); + return !m_executors.empty(); } private: - std::set m_param_idces_with_external_repacking; + using RepackExecutorPtr = std::shared_ptr; + static VectorDims get_blk_order(size_t shape_rank); + static VectorDims get_blk_shape(const VectorDims& planar_shape, ov::element::Type prc); + + 
void update_kernel(const RepackExecutorPtr& executor, + const VectorDims& shape, + const VectorDims& layout, + size_t N, + size_t K, + ov::element::Type prc); + + static const size_t brgemm_kernel_rank; + std::unordered_map m_executors; }; } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.cpp index 0f5a6472b741f4..0186e5b66030ca 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.cpp @@ -22,7 +22,7 @@ bool ov::intel_cpu::pass::FuseLoadStoreConvert::fuse_load_convert( const auto& load_output = input_connector->get_source(); const auto& load_expr = load_output.get_expr(); const auto load = ov::as_type_ptr(load_expr->get_node()); - if (!load || ov::is_type(load_expr->get_node()) || + if (!load || ov::is_type(load_expr->get_node()) || ov::is_type(load_expr->get_node())) return false; diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index 81eb70d328630d..7b787f2afd0296 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -1081,10 +1081,7 @@ void Transformations::MainSnippets(void) { // Only FP32 dynamic MHA is supported if (matmul->is_dynamic()) return false; - // [114487] brgemm kernel in oneDNN requires brgemm_copy_b kernel if MatMul node has transposed_b=True - // The current solution with ExtractExplicitMatMulTranspose pass is slower for non-f32 cases than using of - // brgemm_copy_b kernel - if (matmul->get_transpose_a() || matmul->get_transpose_b()) + if (matmul->get_transpose_a()) return false; // [150842] The execution of Brgemm INT8/BF16/FP16 on AMX platforms depends on the value of "K % VNNIFactor". 
// For more details, please take a look at the ticket 150842 @@ -1113,6 +1110,7 @@ void Transformations::MainSnippets(void) { return false; const auto parallel_work_amount = std::accumulate(shape.rbegin() + 2, shape.rend(), ov::Dimension(1), std::multiplies()); + // Ticket 160154: enable tokenization for MHA with insufficient parallel work amount const auto is_unsupported_parallel_work_amount = static_cast(parallel_work_amount.get_length()) < tokenization_config.get_concurrency() && !ov::snippets::pass::SplitDimensionM::can_be_optimized(n, tokenization_config.get_concurrency()); diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/mha.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/mha.cpp index a94f52be91df02..b69dcb66fb2d44 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/mha.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/mha.cpp @@ -296,7 +296,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_MHA, ElementType::f32}), ::testing::ValuesIn(matMulIn0Precisions), ::testing::ValuesIn(patternTypes), - ::testing::Values(ExpectedNodes{{"Subgraph", 1}}), + ::testing::Values(ExpectedNodes{{"Subgraph", 2}}), // MHA + Decomposed Transpose on input ::testing::Values(ov::test::utils::DEVICE_CPU)), MHATest::getTestCaseName); @@ -309,7 +309,7 @@ INSTANTIATE_TEST_SUITE_P( std::vector{ElementType::bf16, ElementType::bf16, ElementType::bf16, ElementType::bf16}), ::testing::ValuesIn(matMulIn0Precisions), ::testing::ValuesIn(patternTypes), - ::testing::Values(ExpectedNodes{{"Subgraph", 1}, + ::testing::Values(ExpectedNodes{{"Subgraph", 2}, // MHA + Decomposed Transpose on input {"Transpose", 1}}), // Plugin disables tokenization of Transpose on output ::testing::Values(ov::test::utils::DEVICE_CPU)), MHATest::getTestCaseName); @@ -323,7 +323,7 @@ INSTANTIATE_TEST_SUITE_P( std::vector{ElementType::f16, ElementType::f16, ElementType::f16, ElementType::f16}), ::testing::ValuesIn(matMulIn0Precisions), ::testing::ValuesIn(patternTypes), - ::testing::Values(ExpectedNodes{{"Subgraph", 1}, + ::testing::Values(ExpectedNodes{{"Subgraph", 2}, // MHA + Decomposed Transpose on input {"Transpose", 1}}), // Plugin disables tokenization of Transpose on output ::testing::Values(ov::test::utils::DEVICE_CPU)), MHATest::getTestCaseName); @@ -694,7 +694,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_MHAQuant_Pattern0, ::testing::Values(0), ::testing::Values(ExpectedNodes{ {"Subgraph", 5}, // FQs on inputs x 3 + MHA + Deq Mul - {"Transpose", 1}}), // Transpose between MHA and Deq Mul + {"Transpose", 2}}), // Decomposed Transpose on input + Transpose between MHA and Deq Mul ::testing::Values(ov::test::utils::DEVICE_CPU)), MHAQuantTest::getTestCaseName); @@ -706,7 +706,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_MHAQuant_Pattern1, ::testing::Values(1), ::testing::Values(ExpectedNodes{ {"Subgraph", 4}, // FQ on input x 2 + MHA + Deq Mul - {"Transpose", 1}}), // Transpose between MHA and Deq Mul + {"Transpose", 2}}), // Decomposed Transpose on input + Transpose between MHA and Deq Mul ::testing::Values(ov::test::utils::DEVICE_CPU)), MHAQuantTest::getTestCaseName); @@ -717,7 +717,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_MHAQuant_Pattern2, ::testing::ValuesIn(matMulIn0PrecisionsQuant), ::testing::Values(2), ::testing::Values(ExpectedNodes{{"Subgraph", 3}, // FQ on inputs x 2 + MHA - {"Transpose", 0}}), // Transpose is fused + {"Transpose", 1}}), // Decomposed Transpose on input ::testing::Values(ov::test::utils::DEVICE_CPU)), 
MHAQuantTest::getTestCaseName); diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp index df0b69f99ef06d..1709fd21f988a0 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp @@ -75,8 +75,8 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA_4D, ::testing::Values(ov::element::f32), ::testing::Values(false), ::testing::Values(MHA::default_thread_count), - ::testing::Values(1), - ::testing::Values(1), + ::testing::Values(2), // decomposed Transpose + MHA + ::testing::Values(2), // decomposed Transpose + MHA ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::empty_plugin_config)), MHA::getTestCaseName); @@ -88,8 +88,8 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA_4D_WithScalarMul, ::testing::Values(ov::element::f32), ::testing::Values(true), ::testing::Values(MHA::default_thread_count), - ::testing::Values(1), - ::testing::Values(1), + ::testing::Values(2), // decomposed Transpose + MHA + ::testing::Values(2), // decomposed Transpose, Mul + MHA ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::empty_plugin_config)), MHA::getTestCaseName); @@ -125,9 +125,9 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHABF16_4D, ::testing::Combine(::testing::ValuesIn(transposedShape_4D()), ::testing::ValuesIn(precision_bf16_if_supported(4)), ::testing::Values(ov::element::f32), - ::testing::ValuesIn({false, true}), + ::testing::Values(false), ::testing::Values(MHA::default_thread_count), - ::testing::Values(7), // MHA + 5 Converts + 1 Transpose on output + ::testing::Values(8), // decomposed Transpose + MHA + 5 Converts + 1 Transpose on output ::testing::Values(6), // MHA + 5 Converts on inputs and output ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::empty_plugin_config)), @@ -140,8 +140,8 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAEnforceBF16, ::testing::Values(ov::element::bf16), ::testing::ValuesIn({false}), ::testing::Values(MHA::default_thread_count), - ::testing::Values(7), - ::testing::Values(6), + ::testing::Values(8), // decomposed Transpose + MHA + 5 Converts + 1 Transpose on output + ::testing::Values(6), // MHA + 5 Reorders on inputs and output ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::cpu_bf16_plugin_config)), MHA::getTestCaseName); @@ -153,8 +153,8 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA_FP16_4D_Without_Multiply, ::testing::Values(ov::element::f16), ::testing::ValuesIn({false}), ::testing::Values(MHA::default_thread_count), + ::testing::Values(3), ::testing::Values(2), - ::testing::Values(1), ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::empty_plugin_config)), MHA::getTestCaseName); @@ -165,8 +165,8 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA_FP16_4D_With_Multiply_Static, ::testing::Values(ov::element::f16), ::testing::ValuesIn({true}), ::testing::Values(MHA::default_thread_count), + ::testing::Values(3), ::testing::Values(2), - ::testing::Values(1), ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::empty_plugin_config)), MHA::getTestCaseName); @@ -178,7 +178,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA_FP16_4D_With_Multiply_Dynamic, ::testing::Values(ov::element::f16), ::testing::ValuesIn({true}), ::testing::Values(MHA::default_thread_count), - 
::testing::Values(3), + ::testing::Values(4), ::testing::Values(2), ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::empty_plugin_config)), @@ -191,8 +191,8 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAEnforceFP16_Without_Multiply, ::testing::Values(ov::element::f16), ::testing::ValuesIn({false}), ::testing::Values(MHA::default_thread_count), + ::testing::Values(3), ::testing::Values(2), - ::testing::Values(1), ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::cpu_f16_plugin_config)), MHA::getTestCaseName); @@ -203,8 +203,8 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAEnforceFP16_With_Multiply_Static, ::testing::Values(ov::element::f16), ::testing::ValuesIn({true}), ::testing::Values(MHA::default_thread_count), + ::testing::Values(3), ::testing::Values(2), - ::testing::Values(1), ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::cpu_f16_plugin_config)), MHA::getTestCaseName); @@ -215,7 +215,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAEnforceFP16_With_Multiply_Dynamic, ::testing::Values(ov::element::f16), ::testing::ValuesIn({true}), ::testing::Values(MHA::default_thread_count), - ::testing::Values(3), + ::testing::Values(4), ::testing::Values(2), ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::cpu_f16_plugin_config)), diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_fma.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_fma.cpp index 4bf35e2daa690d..f9bc640160a67c 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_fma.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_fma.cpp @@ -21,8 +21,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(ov::element::f32), ::testing::ValuesIn({false}), // Need to support True for graph builder in tests ::testing::Values(MHA::default_thread_count), - ::testing::Values(1), - ::testing::Values(1), + ::testing::Values(2), // Subgraph with MHA + Subgraph with Transpose1 + ::testing::Values(2), // Subgraph with MHA + Subgraph with Transpose1 ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::empty_plugin_config)), MHA::getTestCaseName); diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_quantized.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_quantized.cpp index 0c731b74565863..38806dff765833 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_quantized.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_quantized.cpp @@ -48,7 +48,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(ov::element::f32), ::testing::Values(false), // The graph doesn't contain Multiply ::testing::Values(MHA::default_thread_count), - ::testing::Values(6), // FQx3 on inputs + MHA + Transpose on output + Deq Mul + ::testing::Values(7), // FQx3, Transpose1 on inputs + MHA + Transpose on output + Deq Mul ::testing::Values(5), // FQx3 on inputs + MHA + Deq Mul ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::empty_plugin_config)), @@ -63,7 +63,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(ov::element::f32), ::testing::Values(false), // The graph doesn't contain Multiply ::testing::Values(MHA::default_thread_count), - ::testing::Values(5), // FQx2 on inputs + MHA + Transpose on output + Deq Mul + ::testing::Values(6), // FQx2, 
Transpose1 on inputs + MHA + Transpose on output + Deq Mul ::testing::Values(4), // FQx2 on inputs + MHA + Deq Mul ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::empty_plugin_config)), @@ -77,8 +77,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(ov::element::f32), ::testing::Values(false), // The graph doesn't contain Multiply ::testing::Values(MHA::default_thread_count), - ::testing::Values(3), // MHA + Transpose on output + Deq Mul - ::testing::Values(2), // MHA + Deq Mul + ::testing::Values(4), // Transpose1 + MHA + Transpose on output + Deq Mul + ::testing::Values(3), // Transpose1 + MHA + Deq Mul ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::empty_plugin_config)), MHA::getTestCaseName); @@ -91,7 +91,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(ov::element::f32), ::testing::Values(false), // The graph doesn't contain Multiply ::testing::Values(MHA::default_thread_count), - ::testing::Values(7), // Transposex2 + Subgraphsx5 + ::testing::Values(8), // Transposex3 + Subgraphsx5 ::testing::Values(5), // MHA + Deq Mul on output + Deqs on inputs + 2 xFQ on inputs ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::empty_plugin_config)), diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_select.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_select.cpp index 3fc1417d20b102..cc438301101811 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_select.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_select.cpp @@ -29,8 +29,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(ov::element::f32), ::testing::Values(false), // Need to support True for graph builder in tests ::testing::Values(MHA::default_thread_count), - ::testing::Values(2), // Less + MHA - ::testing::Values(2), + ::testing::Values(3), // Transpose1 + Less + MHA + ::testing::Values(3), // Transpose1 + Less + MHA ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::empty_plugin_config)), MHA::getTestCaseName); diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_split_dim_m.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_split_dim_m.cpp index bb5f7fe2fa5b52..d3598ebba1ac1f 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_split_dim_m.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_split_dim_m.cpp @@ -24,8 +24,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(ov::element::f32), ::testing::Values(true), ::testing::Values(4), // 4 Threads - ::testing::Values(6), // Subgraph + 4 Reshapes on inputs and 1 Reshape on output - ::testing::Values(1), + ::testing::Values(7), // Subgraph + 4 Reshapes, Transpose1 on inputs and 1 Reshape on output + ::testing::Values(2), ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(enable_callback())), MHA::getTestCaseName); @@ -80,8 +80,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(ov::element::f32), ::testing::Values(false), ::testing::Values(4), // 4 Threads - ::testing::Values(1), - ::testing::Values(1), + ::testing::Values(2), // Transpose1 + MHA + ::testing::Values(2), // Transpose1 + MHA ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::empty_plugin_config)), MHA::getTestCaseName); diff --git 
a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_with_dyn_mul.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_with_dyn_mul.cpp index 7876d737af2281..9a9e56621b10a6 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_with_dyn_mul.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_with_dyn_mul.cpp @@ -43,8 +43,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::ValuesIn(precision_f32(5)), ::testing::Values(ov::element::f32), ::testing::Values(MHA::default_thread_count), - ::testing::Values(1), - ::testing::Values(1), + ::testing::Values(2), // Transpose1 + MHA + ::testing::Values(2), // Transpose1 + MHA ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::empty_plugin_config)), MHAWithDynamicMul::getTestCaseName); @@ -56,7 +56,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::ValuesIn(precision_f32(5)), ::testing::Values(ov::element::bf16), ::testing::Values(MHA::default_thread_count), - ::testing::Values(8), // MHA + 1 Transpose on output + 6 Converts around + ::testing::Values(9), // Transpose1 + MHA + 1 Transpose on output + 6 Converts around ::testing::Values(7), // MHA + 6 Converts around ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::empty_plugin_config)), diff --git a/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/buffer_allocation.cpp b/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/buffer_allocation.cpp index 9ace85b3038afa..7c425b0bca6781 100644 --- a/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/buffer_allocation.cpp +++ b/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/buffer_allocation.cpp @@ -149,7 +149,7 @@ class MHAFP32BufferAllocationTest : public BufferAllocationCPUTest { const auto parameter2 = std::make_shared(ov::element::f32, shapes[2]); const auto order = std::vector{0, 2, 3, 1}; - const auto load_reshape = std::make_shared(parameter1, 1, 0, order); + const auto load_reshape = std::make_shared(parameter1, 1, 0, order); const auto store = std::make_shared(load_reshape); const auto relu0 = std::make_shared(store); const auto brgemm_cpu0 = std::make_shared(parameter0, relu0, BRGEMM_TYPE::STAND_ALONE); @@ -199,7 +199,7 @@ class MHABF16AMXBufferAllocationTest : public BufferAllocationCPUTest { const auto parameter2 = std::make_shared(ov::element::bf16, shapes[2]); const auto order = std::vector{0, 2, 3, 1}; - const auto load_reshape = std::make_shared(parameter1, 1, 0, order); + const auto load_reshape = std::make_shared(parameter1, 1, 0, order); const auto store = std::make_shared(load_reshape); const auto convert0 = std::make_shared(store, ov::element::f32); const auto relu0 = std::make_shared(convert0); diff --git a/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp b/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp index 5f854326a47217..eb0dfaa8710fa8 100644 --- a/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp +++ b/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp @@ -110,46 +110,43 @@ std::shared_ptr MHAFunction::initReference() const { auto data1 = std::make_shared(precisions[1], input_shapes[1]); auto data2 = std::make_shared(precisions[2], input_shapes[2]); auto data3 = std::make_shared(precisions[3], input_shapes[3]); - ov::ParameterVector ngraphParams = {data0, data1, data2, data3}; - NodeVector subgraph_inputs = {data0, data1, data2, data3}; - - auto 
transpose0Param = std::make_shared(precisions[0], input_shapes[0]); - auto transpose1Param = std::make_shared(precisions[1], input_shapes[1]); - auto addParam = std::make_shared(precisions[2], input_shapes[2]); - auto transpose2Param = std::make_shared(precisions[3], input_shapes[3]); - ov::ParameterVector subgraph_params = {transpose0Param, transpose1Param, addParam, transpose2Param}; + ov::ParameterVector ngraphParams = {data0, data1, data2, data3}; const auto rank = input_shapes[0].size(); const auto fusion_order = get_fusion_order(rank); const auto decomposed_order = get_decomposed_order(rank); - const auto transpose0Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); const auto transpose1Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, decomposed_order); - const auto transpose2Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); - const auto transpose3Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); + const auto transpose1 = std::make_shared(data1, transpose1Const); - const auto transpose0 = std::make_shared(transpose0Param, transpose0Const); - const auto transpose1 = std::make_shared(transpose1Param, transpose1Const); - std::shared_ptr matmul_parent1 = transpose1; + std::shared_ptr subgraph_parent1 = transpose1; if (with_mul) { ov::Shape shape(rank, 1); if (transpose1->get_output_partial_shape(0).is_static()) { shape[rank - 3] = transpose1->get_output_shape(0)[rank - 3]; } - const auto mulConst = ov::test::utils::make_constant(precisions[1], shape); - if (ov::shape_size(shape) > 1) { - const auto mulParam = std::make_shared(precisions[1], mulConst->get_shape()); - matmul_parent1 = std::make_shared(transpose1, mulParam); - subgraph_params = {transpose0Param, transpose1Param, mulParam, addParam, transpose2Param}; - subgraph_inputs = {data0, data1, mulConst, data2, data3}; - } else { - matmul_parent1 = std::make_shared(transpose1, mulConst); - } + const auto mulConst = ov::test::utils::make_constant(precisions[1], shape); + subgraph_parent1 = std::make_shared(transpose1, mulConst); } - const auto matMul0 = std::make_shared(transpose0, matmul_parent1); + NodeVector subgraph_inputs = {data0, subgraph_parent1, data2, data3}; + + auto transpose0Param = std::make_shared(precisions[0], input_shapes[0]); + auto brgemm1Param = std::make_shared(subgraph_parent1->get_element_type(), subgraph_parent1->get_output_partial_shape(0)); + auto addParam = std::make_shared(precisions[2], input_shapes[2]); + auto transpose2Param = std::make_shared(precisions[3], input_shapes[3]); + + ov::ParameterVector subgraph_params = {transpose0Param, brgemm1Param, addParam, transpose2Param}; + + const auto transpose0Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); + const auto transpose2Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); + const auto transpose3Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); + + const auto transpose0 = std::make_shared(transpose0Param, transpose0Const); + + const auto matMul0 = std::make_shared(transpose0, brgemm1Param); const auto add = std::make_shared(matMul0, addParam); const auto softMax = std::make_shared(add, rank - 1); const auto transpose2 = std::make_shared(transpose2Param, transpose2Const); @@ -168,55 +165,45 @@ std::shared_ptr MHASplitMFunction::initReference() const { auto data3 = std::make_shared(precisions[3], input_shapes[3]); 
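A worked example of the two transpose-order helpers used by these reference builders. This is a hedged sketch: the names mirror get_fusion_order/get_decomposed_order from subgraph_mha.cpp, but the bodies below simply hard-code the rank-4 values that also appear literally later in this patch.

#include <cstdint>
#include <vector>

// get_fusion_order(4): order that stays fused into the Brgemm inputs/outputs,
// e.g. swapping axes 1 and 2 of a 4D tensor.
std::vector<int64_t> fusion_order_rank4() {
    return {0, 2, 1, 3};
}

// get_decomposed_order(4): moves the innermost dim of the B (K) input away from
// the last position; the tokenizer decomposes this Transpose into LoadReorder + Store.
std::vector<int64_t> decomposed_order_rank4() {
    return {0, 2, 3, 1};
}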
ov::ParameterVector ngraphParams = {data0, data1, data2, data3}; + const auto rank_before = input_shapes[1].size(); + const auto transpose1Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank_before}, get_decomposed_order(rank_before)); + const auto transpose1 = std::make_shared(data1, transpose1Const); + + std::shared_ptr subgraph_parent1 = transpose1; + if (with_mul) { + ov::Shape shape(rank_before, 1); + if (transpose1->get_output_partial_shape(0).is_static()) { + shape[rank_before - 3] = transpose1->get_output_shape(0)[rank_before - 3]; + } + const auto mulConst = ov::test::utils::make_constant(precisions[1], shape); + subgraph_parent1 = std::make_shared(transpose1, mulConst); + } + auto make_reshape = [](const std::shared_ptr& node, const ov::Shape& new_shape) { auto shape_const = ov::op::v0::Constant::create(ov::element::i32, {new_shape.size()}, new_shape); return std::make_shared(node, shape_const, true); }; auto reshape0 = make_reshape(data0, reshapes[0]); - auto reshape1 = make_reshape(data1, reshapes[1]); + auto reshape1 = make_reshape(subgraph_parent1, reshapes[1]); auto reshape2 = make_reshape(data2, reshapes[2]); auto reshape3 = make_reshape(data3, reshapes[3]); NodeVector subgraph_inputs = {reshape0, reshape1, reshape2, reshape3}; auto transpose0Param = std::make_shared(precisions[0], reshape0->get_shape()); - auto transpose1Param = std::make_shared(precisions[1], reshape1->get_shape()); + auto brgemm1Param = std::make_shared(precisions[1], reshape1->get_shape()); auto addParam = std::make_shared(precisions[2], reshape2->get_shape()); auto transpose2Param = std::make_shared(precisions[3], reshape3->get_shape()); - ov::ParameterVector subgraph_params = {transpose0Param, transpose1Param, addParam, transpose2Param}; + ov::ParameterVector subgraph_params = {transpose0Param, brgemm1Param, addParam, transpose2Param}; const auto rank = input_shapes[0].size() + 1; - const auto transpose0Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, get_fusion_order_after_split_m(rank, true)); - const auto transpose1Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, get_decomposed_order_after_split_m(rank)); const auto transpose2Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, get_fusion_order_after_split_m(rank, true)); const auto transpose3Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, get_fusion_order_after_split_m(rank, false)); const auto transpose0 = std::make_shared(transpose0Param, transpose0Const); - const auto transpose1 = std::make_shared(transpose1Param, transpose1Const); - - std::shared_ptr matmul_parent1 = transpose1; - if (with_mul) { - ov::Shape shape(rank - 1, 1); - if (transpose1->get_output_partial_shape(0).is_static()) { - shape[rank - 4] = transpose1->get_output_shape(0)[rank - 4]; - } - const auto mulConst = ov::test::utils::make_constant(precisions[1], shape); - - if (ov::shape_size(shape) > 1) { - ov::Shape reshape_shape = shape; - reshape_shape.insert(reshape_shape.cbegin() + (rank - 3), 1); - const auto mulReshape = make_reshape(mulConst, reshape_shape); - const auto mulParam = std::make_shared(precisions[1], mulReshape->get_shape()); - matmul_parent1 = std::make_shared(transpose1, mulParam); - subgraph_params = {transpose0Param, transpose1Param, mulParam, addParam, transpose2Param}; - subgraph_inputs = {reshape0, reshape1, mulReshape, reshape2, reshape3}; - } else { - matmul_parent1 = std::make_shared(transpose1, mulConst); - } - } - const auto matMul0 = 
std::make_shared(transpose0, matmul_parent1); + const auto matMul0 = std::make_shared(transpose0, brgemm1Param); const auto add = std::make_shared(matMul0, addParam); const auto softMax = std::make_shared(add, rank - 1); const auto transpose2 = std::make_shared(transpose2Param, transpose2Const); @@ -318,30 +305,36 @@ std::shared_ptr MHAMatMul0TransposeFunction::initReference() const { auto data2 = std::make_shared(precisions[2], input_shapes[2]); auto data3 = std::make_shared(precisions[3], input_shapes[3]); ov::ParameterVector ngraphParams = {data0, data1, data2, data3}; - NodeVector subgraph_inputs = {data0, data1, data2, data3}; + + const auto rank = input_shapes[0].size(); + const auto fusion_order = get_fusion_order(rank); + const auto decomposed_order = get_decomposed_order(rank); + std::vector transposed_b_order(rank); + std::iota(transposed_b_order.begin(), transposed_b_order.end(), 0); + std::swap(transposed_b_order[rank - 1], transposed_b_order[rank - 2]); + + const auto transpose1Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); + const auto transpose1 = std::make_shared(data1, transpose1Const); + const auto mulConst = ov::test::utils::make_constant(precisions[1], ov::Shape{1}); + const auto mul = std::make_shared(transpose1, mulConst); + const auto transposeBConst = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{rank}, transposed_b_order); + const auto transposeB = std::make_shared(mul, transposeBConst); + + NodeVector subgraph_inputs = {data0, transposeB, data2, data3}; auto transpose0Param = std::make_shared(precisions[0], input_shapes[0]); - auto transpose1Param = std::make_shared(precisions[1], input_shapes[1]); + auto brgemm1Param = std::make_shared(transposeB->get_element_type(), transposeB->get_output_partial_shape(0)); auto addParam = std::make_shared(precisions[2], input_shapes[2]); auto transpose2Param = std::make_shared(precisions[3], input_shapes[3]); - ov::ParameterVector subgraph_params = {transpose0Param, transpose1Param, addParam, transpose2Param}; - - const auto rank = input_shapes[0].size(); - const auto fusion_order = get_fusion_order(rank); - const auto decomposed_order = get_decomposed_order(rank); + ov::ParameterVector subgraph_params = {transpose0Param, brgemm1Param, addParam, transpose2Param}; const auto transpose0Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); - const auto transpose1Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, decomposed_order); const auto transpose2Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); const auto transpose3Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); const auto transpose0 = std::make_shared(transpose0Param, transpose0Const); - const auto transpose1 = std::make_shared(transpose1Param, transpose1Const); - - const auto mulConst = ov::test::utils::make_constant(precisions[1], ov::Shape{1}); - const auto mul = std::make_shared(transpose1, mulConst); - const auto matMul0 = std::make_shared(transpose0, mul); + const auto matMul0 = std::make_shared(transpose0, brgemm1Param); const auto add = std::make_shared(matMul0, addParam); const auto softMax = std::make_shared(add, rank - 1); const auto transpose2 = std::make_shared(transpose2Param, transpose2Const); @@ -818,29 +811,33 @@ std::shared_ptr MHAINT8MatMulTypeRelaxedFunction::initReference() con const auto fq0 = ov::builder::subgraph::makeFakeQuantizeTypeRelaxed(data0, ov::element::f32, 
fq_signed_params); const auto fq1 = ov::builder::subgraph::makeFakeQuantizeTypeRelaxed(data1, ov::element::f32, fq_signed_params); const auto fq2 = ov::builder::subgraph::makeFakeQuantizeTypeRelaxed(data3, ov::element::f32, fq_signed_params); - NodeVector subgraph_inputs = {fq0, fq1, data2, fq2}; - auto transpose0Param = std::make_shared(precision, input_shapes[0]); - auto transpose1Param = std::make_shared(precision, input_shapes[1]); - auto addParam = std::make_shared(precision, input_shapes[2]); - auto transpose2Param = std::make_shared(precision, input_shapes[3]); - ov::ParameterVector subgraph_params = {transpose0Param, transpose1Param, addParam, transpose2Param}; + const auto rank = input_shapes[0].get_shape().size(); + const auto fusion_order = get_fusion_order(rank); + const auto decomposed_order = get_decomposed_order(rank); + const auto transpose1Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, decomposed_order); + const auto transpose1 = std::make_shared(fq1, transpose1Const); - const auto shape_rank = input_shapes[0].get_shape().size(); - auto transpose0Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 2, 1, 3}); - auto transpose1Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 2, 3, 1}); - auto transpose2Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 2, 1, 3}); - auto transpose3Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 2, 1, 3}); + NodeVector subgraph_inputs = {fq0, transpose1, data2, fq2}; + + const auto transpose0Param = std::make_shared(precision, input_shapes[0]); + const auto brgemm1Param = std::make_shared(transpose1->get_element_type(), transpose1->get_output_partial_shape(0)); + const auto addParam = std::make_shared(precision, input_shapes[2]); + const auto transpose2Param = std::make_shared(precision, input_shapes[3]); + ov::ParameterVector subgraph_params = {transpose0Param, brgemm1Param, addParam, transpose2Param}; + + const auto transpose0Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); + const auto transpose2Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); + const auto transpose3Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); bool transA = false; bool transB = false; const auto transpose0 = std::make_shared(transpose0Param, transpose0Const); - const auto transpose1 = std::make_shared(transpose1Param, transpose1Const); const auto matMul0 = std::make_shared>( std::vector{ element::f32, element::f32 }, std::vector{ element::f32 }, ov::op::TemporaryReplaceOutputType(transpose0, element::f32).get(), - ov::op::TemporaryReplaceOutputType(transpose1, element::f32).get(), transA, transB); + ov::op::TemporaryReplaceOutputType(brgemm1Param, element::f32).get(), transA, transB); auto decomposed_fq = [](const ov::Output& input, const ov::element::Type& out_precision, float il, float ih, float scale) { @@ -941,8 +938,8 @@ std::shared_ptr MHATransposedInputFunction::initReference() const { const auto data2 = std::make_shared(precision, input_shapes[2]); ov::ParameterVector ngraphParam = {data0, data1, data2}; - bool is_supported = ((m_transposed_b && m_order == std::vector{0, 2, 1, 3}) || - (!m_transposed_b && m_order == std::vector{0, 2, 3, 1})); + bool is_supported = ((m_transposed_b && m_order == std::vector{0, 2, 3, 1}) || + (!m_transposed_b && m_order == std::vector{0, 2, 1, 3})); 
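The flipped is_supported condition above lines up with the BrgemmCopyB::is_transposed check added earlier in this patch. Below is a self-contained restatement of that layout check with both orders worked through; it is a sketch only, with OPENVINO_ASSERT/IMPLICATION replaced by a plain assert.

#include <cassert>
#include <cstddef>
#include <vector>

// A layout is "transposed" when its innermost dim is not last; additionally,
// N must sit last or second-to-last.
bool is_transposed_layout(const std::vector<size_t>& layout) {
    const bool transposed = !layout.empty() && layout.back() != layout.size() - 1;
    if (transposed)
        assert(layout[layout.size() - 2] == layout.size() - 1 &&
               "BrgemmCopyB supports only the N dim placed as the last or second-to-last dimension");
    return transposed;
}

int main() {
    assert(!is_transposed_layout({0, 2, 1, 3}));  // planar: innermost dim (3) is last
    assert(is_transposed_layout({0, 2, 3, 1}));   // transposed: N is second-to-last, K last
    return 0;
}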
std::shared_ptr in1 = data1; if (!m_order.empty() && !is_supported) { @@ -963,11 +960,16 @@ std::shared_ptr MHATransposedInputFunction::initReference() const { const auto param0 = std::make_shared(precision, data0->get_output_partial_shape(0)); const auto param1 = std::make_shared(precision, in1->get_output_partial_shape(0)); const auto param2 = std::make_shared(precision, data2->get_output_partial_shape(0)); + ov::ParameterVector subgraph_params = {param0, param1, param2}; + ov::OutputVector subgraphs_inputs = {data0, in1, data2}; std::shared_ptr matmul0_in1 = param1; if (!m_order.empty() && is_supported) { - const auto transposeConst = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{m_order.size()}, m_order); + const auto transposeConst = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{m_order.size()}, m_order); matmul0_in1 = std::make_shared(param1, transposeConst); + + std::swap(subgraphs_inputs[0], subgraphs_inputs[1]); + std::swap(subgraph_params[0], subgraph_params[1]); } const bool mm0_transpose_b = m_transposed_b && m_transpose_b_native_support; @@ -975,8 +977,7 @@ std::shared_ptr MHATransposedInputFunction::initReference() const { const auto softmax = std::make_shared(matMul0, -1); const auto matMul1 = std::make_shared(softmax, param2); - auto subgraph = std::make_shared(ov::NodeVector{data0, in1, data2}, - std::make_shared(NodeVector{matMul1}, ov::ParameterVector{param0, param1, param2})); + auto subgraph = std::make_shared(subgraphs_inputs, std::make_shared(NodeVector{matMul1}, subgraph_params)); ov::ResultVector results{std::make_shared(subgraph)}; return std::make_shared(results, ngraphParam, "mha");
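To recap the runtime logic this patch adds to BrgemmExternalRepackingAdjuster, here is a compact, self-contained sketch of the VNNI-blocked weight shape/order computation and the cache-fit heuristic that picks the repacking implementation. Names and the hard-coded VNNI/N-block values are illustrative assumptions; the real code queries brgemm_utils helpers and dnnl::utils::get_cache_size.

#include <algorithm>
#include <cstddef>
#include <numeric>
#include <vector>

using VectorDims = std::vector<size_t>;

// Planar [..., K, N] -> blocked [..., ceil(K / vnni), max(N, n_block), vnni],
// mirroring get_blk_shape above.
VectorDims blk_shape(const VectorDims& planar, size_t vnni, size_t n_block) {
    const size_t K = planar[planar.size() - 2];
    const size_t N = planar[planar.size() - 1];
    VectorDims blk(planar.begin(), planar.end() - 2);
    blk.insert(blk.end(), {(K + vnni - 1) / vnni, std::max(N, n_block), vnni});
    return blk;
}

// Matching order [0, ..., r-2, r-1, r-2]: the split K dim appears twice,
// mirroring get_blk_order above.
VectorDims blk_order(size_t rank) {
    VectorDims order(rank - 2);
    std::iota(order.begin(), order.end(), 0);
    order.insert(order.end(), {rank - 2, rank - 1, rank - 2});
    return order;
}

enum class RepackingImplType { IN_PARALLEL, SEPARATE };

// If the src + dst bytes of all repacked inputs fit into L1 + L2, repack inside
// the parallel kernel loop; otherwise repack in a separate parallel section.
RepackingImplType choose_impl(size_t repacking_bytes, size_t l1_plus_l2_bytes) {
    return repacking_bytes < l1_plus_l2_bytes ? RepackingImplType::IN_PARALLEL
                                              : RepackingImplType::SEPARATE;
}

// E.g. bf16 weights {1, 12, 128, 64} with vnni = 2, n_block = 32:
// blk_shape -> {1, 12, 64, 64, 2}, blk_order -> {0, 1, 2, 3, 2}.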