diff --git a/src/common/snippets/include/snippets/generator.hpp b/src/common/snippets/include/snippets/generator.hpp index 4c10f112bc2c42..f22ba76cba19e4 100644 --- a/src/common/snippets/include/snippets/generator.hpp +++ b/src/common/snippets/include/snippets/generator.hpp @@ -24,6 +24,7 @@ class Generator; * @brief Holds all relevant information produced during lowering * @param compiled_snippet pointer to interface class that encapsulates compiled binary code * @param buffer_scratchpad_size the amount of additional memory required by the binary code to execute. + * @param buffer_inplace_output buffer share memory with subgraph output result. -1 means no sharing. i>=0 means share ith output memory. * Must be allocated and freed by the backend. */ class LoweringResult { @@ -35,6 +36,7 @@ class LoweringResult { public: std::shared_ptr compiled_snippet = nullptr; size_t buffer_scratchpad_size = 0; + int buffer_inplace_output = -1; }; /** diff --git a/src/common/snippets/include/snippets/lowered/linear_ir.hpp b/src/common/snippets/include/snippets/lowered/linear_ir.hpp index 9c8ac3f1f25b4d..2ebaa7c2ab1728 100644 --- a/src/common/snippets/include/snippets/lowered/linear_ir.hpp +++ b/src/common/snippets/include/snippets/lowered/linear_ir.hpp @@ -223,6 +223,22 @@ class LinearIR { */ exprIt replace_with_expr(const std::vector& old_exprs, const ExpressionPtr& new_expr); + /** + * @brief Propagate start_expr through zero to several consecutive shape infer exprs(such as reshape, rankNormalization). + * @param start_expr Propagate from start_expr. + * @param downstream Propagate downstream if it's true, otherwise propagate upstream. + * @return shape infer op consumers as a sequence if downstream, or shape infer op sources as a sequence if upstream. + */ + static std::vector propagate_expr_through_shape_infer_ops(const ExpressionPtr& start_expr, bool downstream); + + /** + * @brief Get last shape infer op from start_expr in a sequence. 
If no shape infer op is connected to start_expr, return start_expr. + * @param start_expr Search from start_expr. + * @param downstream search downstream if it's true, otherwise search upstream. + * @return last shape infer expr + */ + static ExpressionPtr get_last_shape_infer_expr(const ExpressionPtr& start_expr, bool downstream); + private: std::shared_ptr m_shape_infer = nullptr; diff --git a/src/common/snippets/include/snippets/lowered/pass/allocate_buffers.hpp b/src/common/snippets/include/snippets/lowered/pass/allocate_buffers.hpp index 1ec9598ec1d2c2..b31a8ced9da702 100644 --- a/src/common/snippets/include/snippets/lowered/pass/allocate_buffers.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/allocate_buffers.hpp @@ -26,7 +26,7 @@ namespace pass { class AllocateBuffers: public RangedPass { public: OPENVINO_RTTI("AllocateBuffers", "RangedPass") - AllocateBuffers(size_t& buffer_scratchpad_size, bool is_optimized = true); + AllocateBuffers(size_t& buffer_scratchpad_size, int& buffer_inplace_output, bool is_optimized = true); /** * @brief Apply the pass to the Linear IR @@ -44,8 +44,10 @@ class AllocateBuffers: public RangedPass { using BufferCluster = std::set; using BufferClusters = std::vector; + private: size_t& m_buffer_scratchpad_size; + int& m_buffer_inplace_output; bool m_is_optimized_mode = true; }; diff --git a/src/common/snippets/include/snippets/op/reshape.hpp b/src/common/snippets/include/snippets/op/reshape.hpp new file mode 100644 index 00000000000000..8375f3a050e112 --- /dev/null +++ b/src/common/snippets/include/snippets/op/reshape.hpp @@ -0,0 +1,38 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/op/op.hpp" +#include "snippets/shape_inference/shape_inference.hpp" + +namespace ov { +namespace snippets { +namespace op { + +/** + * @interface Reshape + * @brief Reshape input tensor to required target shape + * @ingroup snippets + */ +class Reshape : public
ov::op::Op { +public: + OPENVINO_OP("Reshape", "SnippetsOpset"); + Reshape(const Output& x, ov::PartialShape target_shape); + Reshape() = default; + + bool visit_attributes(AttributeVisitor& visitor) override; + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + void validate_and_infer_types() override; + + const ov::PartialShape& get_target_shape() const; + void set_target_shape(ov::PartialShape shape); + +private: + ov::PartialShape m_target_shape = {}; +}; + +} // namespace op +} // namespace snippets +} // namespace ov diff --git a/src/common/snippets/include/snippets/op/subgraph.hpp b/src/common/snippets/include/snippets/op/subgraph.hpp index 2c0558abdc7529..2b02bac7b7b5c6 100644 --- a/src/common/snippets/include/snippets/op/subgraph.hpp +++ b/src/common/snippets/include/snippets/op/subgraph.hpp @@ -139,6 +139,8 @@ class Subgraph : public ov::op::util::SubGraphOp { // Return estimated unique buffer count (upper bound). It's needed for tokenization static auto get_estimated_buffer_count(const ov::NodeVector& ops) -> size_t; static auto is_domain_sensitive_op(const std::shared_ptr& op) -> bool; + static auto is_shape_infer_op(const std::shared_ptr& op) -> bool; + static auto get_last_shape_infer_op(const std::shared_ptr& op, bool downstream) -> std::shared_ptr; void data_flow_transformations(const BlockedShapeVector& blocked_input_shapes = {}, const std::vector& input_precisions = {}, diff --git a/src/common/snippets/include/snippets/pass/gn_decomposition.hpp b/src/common/snippets/include/snippets/pass/gn_decomposition.hpp new file mode 100644 index 00000000000000..8bd80f90c790ff --- /dev/null +++ b/src/common/snippets/include/snippets/pass/gn_decomposition.hpp @@ -0,0 +1,27 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/graph_rewrite.hpp" +#include "openvino/pass/pattern/matcher.hpp" + +namespace ov { +namespace snippets { +namespace pass { + 
+/** + * @interface GNDecomposition + * @brief Decomposes GroupNormalization to a range of low-level operations + * @ingroup snippets + */ +class GNDecomposition: public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("GNDecomposition", "0"); + GNDecomposition(); +}; + +} // namespace pass +} // namespace snippets +} // namespace ov \ No newline at end of file diff --git a/src/common/snippets/include/snippets/pass/gn_tokenization.hpp b/src/common/snippets/include/snippets/pass/gn_tokenization.hpp new file mode 100644 index 00000000000000..220f05f0bbbc88 --- /dev/null +++ b/src/common/snippets/include/snippets/pass/gn_tokenization.hpp @@ -0,0 +1,27 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/pattern/matcher.hpp" +#include "snippets/pass/tokenization.hpp" + +namespace ov { +namespace snippets { +namespace pass { + +/** + * @interface TokenizeGNSnippets + * @brief Tokenize GroupNormalization to a subgraph + * @ingroup snippets + */ +class TokenizeGNSnippets : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("TokenizeGNSnippets", "0"); + TokenizeGNSnippets(); +}; + +} // namespace pass +} // namespace snippets +} // namespace ov \ No newline at end of file diff --git a/src/common/snippets/include/snippets/shape_inference/shape_infer_instances.hpp b/src/common/snippets/include/snippets/shape_inference/shape_infer_instances.hpp index f6cd6f0626f798..a3dffd973c93dd 100644 --- a/src/common/snippets/include/snippets/shape_inference/shape_infer_instances.hpp +++ b/src/common/snippets/include/snippets/shape_inference/shape_infer_instances.hpp @@ -75,5 +75,12 @@ class ReduceShapeInfer : public IShapeInferSnippets { Result infer(const std::vector& input_shapes) override; }; +class ReshapeShapeInfer : public IShapeInferSnippets { + ov::PartialShape target_shape; +public: + explicit ReshapeShapeInfer(const std::shared_ptr& n); + Result infer(const std::vector& input_shapes) override; 
+}; + } // namespace snippets } // namespace ov diff --git a/src/common/snippets/include/snippets/snippets_isa.hpp b/src/common/snippets/include/snippets/snippets_isa.hpp index f0564becaf24b5..08002fa38ed309 100644 --- a/src/common/snippets/include/snippets/snippets_isa.hpp +++ b/src/common/snippets/include/snippets/snippets_isa.hpp @@ -17,6 +17,7 @@ #include "op/fill.hpp" #include "op/kernel.hpp" #include "op/load.hpp" +#include "op/reshape.hpp" #include "op/nop.hpp" #include "op/scalar.hpp" #include "op/powerstatic.hpp" diff --git a/src/common/snippets/include/snippets/snippets_isa_tbl.hpp b/src/common/snippets/include/snippets/snippets_isa_tbl.hpp index fed0dfcdd5c2b4..9b207b09fe411f 100644 --- a/src/common/snippets/include/snippets/snippets_isa_tbl.hpp +++ b/src/common/snippets/include/snippets/snippets_isa_tbl.hpp @@ -16,6 +16,7 @@ OV_OP(LoopBegin, ov::snippets::op) OV_OP(LoopEnd, ov::snippets::op) OV_OP(Brgemm, ov::snippets::op) OV_OP(BroadcastLoad, ov::snippets::op) +OV_OP(Reshape, ov::snippets::op) OV_OP(Store, ov::snippets::op) diff --git a/src/common/snippets/include/snippets/utils.hpp b/src/common/snippets/include/snippets/utils.hpp index 41d87f0e0fe83d..9669796628ad44 100644 --- a/src/common/snippets/include/snippets/utils.hpp +++ b/src/common/snippets/include/snippets/utils.hpp @@ -154,6 +154,14 @@ VectorDims get_planar_vdims(const snippets::lowered::ExpressionPort& expr_port); * @return preordered shape: `shape[i]` = `planar_shape[order[i]]` where `shape` is shape before applying the order. 
*/ VectorDims get_preordered_vdims(const snippets::lowered::ExpressionPort& expr_port); +/** + * @brief Returns element count of a shape + * @param shape input shape + * @return element count of input shape + */ +inline auto get_shape_size(const VectorDims& shape) -> size_t { + return std::accumulate(shape.begin(), shape.end(), static_cast(1), std::multiplies()); +} /* --------------------------- */ } // namespace utils diff --git a/src/common/snippets/src/generator.cpp b/src/common/snippets/src/generator.cpp index 8a0ae29f281097..027314d5ad4cb5 100644 --- a/src/common/snippets/src/generator.cpp +++ b/src/common/snippets/src/generator.cpp @@ -81,6 +81,7 @@ RegType Generator::get_op_out_reg_type(const ov::Output& out) const { std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op) || + std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op) #ifdef SNIPPETS_DEBUG_CAPS || std::dynamic_pointer_cast(op) diff --git a/src/common/snippets/src/lowered/linear_ir.cpp b/src/common/snippets/src/lowered/linear_ir.cpp index 64bf3d0b53f712..3bff272f03f6c4 100644 --- a/src/common/snippets/src/lowered/linear_ir.cpp +++ b/src/common/snippets/src/lowered/linear_ir.cpp @@ -12,6 +12,7 @@ #include "openvino/core/graph_util.hpp" #include "openvino/core/type.hpp" #include "snippets/utils.hpp" +#include "snippets/op/subgraph.hpp" namespace ov { namespace snippets { @@ -365,10 +366,14 @@ VectorDims LinearIR::get_master_shape() const { } // Note: Snippets would benefit from a more generic master_shape calculation approach. 
// It will be implemented in the scope of ROI propagation activity (ticket 120505) - const auto& source = out_exprs[0]->get_input_port_connector(0)->get_source(); - if (!m_config.m_enable_domain_optimization && out_exprs.size() == 1 && - ov::is_type(source.get_expr()->get_node())) { - master_shape = utils::get_preordered_vdims(source); + if (out_exprs.size() == 1) { + const auto& source = out_exprs[0]->get_input_port_connector(0)->get_source(); + if (!m_config.m_enable_domain_optimization && ov::is_type(source.get_expr()->get_node())) { + master_shape = utils::get_preordered_vdims(source); + } else { + auto last_shape_infer_expr = LinearIR::get_last_shape_infer_expr(out_exprs[0], false); + master_shape = utils::get_preordered_vdims(last_shape_infer_expr->get_input_port_connector(0)->get_source()); + } } else { for (const auto& oe : out_exprs) { const auto& port_desc = oe->get_input_port_descriptor(0); @@ -493,6 +498,74 @@ LinearIR::exprIt LinearIR::replace_with_expr(const std::vector& o return replace_with_expr(old_exprs, new_expr, insertion_place); } +std::vector LinearIR::propagate_expr_through_shape_infer_ops(const ExpressionPtr& start_expr, bool downstream) { + std::vector shape_infer_exprs; + auto current_exp = start_expr; + if (op::Subgraph::is_shape_infer_op(current_exp->get_node())) { + shape_infer_exprs.push_back(current_exp); + } + if (downstream) { + if (current_exp->get_output_count() == 0) + return shape_infer_exprs; + auto consumers = current_exp->get_output_port_connector(0)->get_consumers(); + auto first_child = consumers.begin()->get_expr(); + while (op::Subgraph::is_shape_infer_op(first_child->get_node())) { + OPENVINO_ASSERT(consumers.size() == 1, "Shape infer ops are supposed to be the only consumer."); + shape_infer_exprs.push_back(first_child); + current_exp = first_child; + if (current_exp->get_output_count() == 0) + break; + consumers = current_exp->get_output_port_connector(0)->get_consumers(); + first_child = consumers.begin()->get_expr(); 
+ } + return shape_infer_exprs; + } else { + // upstream + if (current_exp->get_input_count() == 0) + return shape_infer_exprs; + auto first_source = current_exp->get_input_port_connector(0)->get_source().get_expr(); + while (op::Subgraph::is_shape_infer_op(first_source->get_node())) { + shape_infer_exprs.push_back(first_source); + current_exp = first_source; + if (current_exp->get_input_count() == 0) + break; + first_source = current_exp->get_input_port_connector(0)->get_source().get_expr(); + } + return shape_infer_exprs; + } +} + +ExpressionPtr LinearIR::get_last_shape_infer_expr(const ExpressionPtr& start_expr, bool downstream) { + auto last_exp = start_expr; + if (downstream) { + if (last_exp->get_output_count() == 0) + return last_exp; + auto consumers = last_exp->get_output_port_connector(0)->get_consumers(); + auto first_child = consumers.begin()->get_expr(); + while (op::Subgraph::is_shape_infer_op(first_child->get_node())) { + OPENVINO_ASSERT(consumers.size() == 1, "Shape infer ops are supposed to be the only consumer."); + last_exp = first_child; + if (last_exp->get_output_count() == 0) + break; + consumers = last_exp->get_output_port_connector(0)->get_consumers(); + first_child = consumers.begin()->get_expr(); + } + return last_exp; + } else { + // upstream + if (last_exp->get_input_count() == 0) + return last_exp; + auto first_source = last_exp->get_input_port_connector(0)->get_source().get_expr(); + while (op::Subgraph::is_shape_infer_op(first_source->get_node())) { + last_exp = first_source; + if (last_exp->get_input_count() == 0) + break; + first_source = last_exp->get_input_port_connector(0)->get_source().get_expr(); + } + return last_exp; + } +} + LinearIR::LIRShapeInfer::LIRShapeInfer(container& body_exprs, io_container& io_exprs) : ShapeInferSnippetsNode(), m_exprs{std::make_shared(body_exprs)} { diff --git a/src/common/snippets/src/lowered/pass/allocate_buffers.cpp b/src/common/snippets/src/lowered/pass/allocate_buffers.cpp index 
c7cf6b67abd8ea..ffc842c6af4078 100644 --- a/src/common/snippets/src/lowered/pass/allocate_buffers.cpp +++ b/src/common/snippets/src/lowered/pass/allocate_buffers.cpp @@ -19,8 +19,8 @@ namespace snippets { namespace lowered { namespace pass { -AllocateBuffers::AllocateBuffers(size_t& buffer_scratchpad_size, bool is_optimized) - : m_buffer_scratchpad_size(buffer_scratchpad_size), m_is_optimized_mode(is_optimized) {} +AllocateBuffers::AllocateBuffers(size_t& buffer_scratchpad_size, int& buffer_inplace_output, bool is_optimized) + : m_buffer_scratchpad_size(buffer_scratchpad_size), m_is_optimized_mode(is_optimized), m_buffer_inplace_output(buffer_inplace_output) {} void AllocateBuffers::set_buffer_offset(const ExpressionPtr& buffer_expr, const size_t offset) { // If Buffer has offset We set this offset in the connected MemoryAccess ops @@ -46,7 +46,8 @@ void AllocateBuffers::set_buffer_offset(const ExpressionPtr& buffer_expr, const } } // Propagate to down: in Load. Buffer can have several Load - const auto& buffer_out = buffer_expr->get_output_port_connector(0); + auto last_shape_infer = ov::snippets::lowered::LinearIR::get_last_shape_infer_expr(buffer_expr, true); + const auto& buffer_out = last_shape_infer->get_output_port_connector(0); for (const auto& child_expr_input : buffer_out->get_consumers()) { const auto& child_expr = child_expr_input.get_expr(); const auto port = child_expr_input.get_index(); @@ -59,7 +60,7 @@ void AllocateBuffers::set_buffer_offset(const ExpressionPtr& buffer_expr, const continue; } else { OPENVINO_THROW( - "Buffer::set_offset() was called when Buffer didn't have the corresponding MemoryAccess op for offset propagation"); + "Buffer::set_offset() was called when Buffer didn't have the corresponding MemoryAccess op for offset propagation"); } } } @@ -77,6 +78,7 @@ bool AllocateBuffers::run(lowered::LinearIR& linear_ir, lowered::LinearIR::const pipeline.register_pass(m_buffer_scratchpad_size, buffer_clusters); pipeline.register_pass(); 
pipeline.run(linear_ir); + m_buffer_inplace_output = 0; } else { InitBuffersDefault(m_buffer_scratchpad_size).run(linear_ir, linear_ir.cbegin(), linear_ir.cend()); } diff --git a/src/common/snippets/src/lowered/pass/assign_registers.cpp b/src/common/snippets/src/lowered/pass/assign_registers.cpp index e4b828547e9ce5..65499819b3685a 100644 --- a/src/common/snippets/src/lowered/pass/assign_registers.cpp +++ b/src/common/snippets/src/lowered/pass/assign_registers.cpp @@ -5,6 +5,7 @@ #include "snippets/lowered/pass/assign_registers.hpp" #include "snippets/lowered/linear_ir.hpp" +#include "snippets/op/subgraph.hpp" #include "snippets/snippets_isa.hpp" #include "snippets/itt.hpp" @@ -79,15 +80,23 @@ bool AssignRegisters::run(LinearIR& linear_ir) { if (io_expr->get_type() == IOExpression::io_type::INPUT) { const auto& out_connector = expr->get_output_port_connector(0); manually_assigned_gprs[out_connector] = io_expr->get_index(); - const auto& consumer_inputs = out_connector->get_consumers(); - const auto& first_consumer = consumer_inputs.begin()->get_expr(); - // TODO [96434]: Support RankNormalization (Reshape) in arbitrary place in pipeline, not just after inputs - if (ov::is_type(first_consumer->get_node())) { - OPENVINO_ASSERT(consumer_inputs.size() == 1, "RankNormalization is supposed to be the only consumer"); - manually_assigned_gprs[first_consumer->get_output_port_connector(0)] = io_expr->get_index(); + // TODO [96434]: Support shape infer ops in arbitrary place in pipeline, not just after inputs + // shape infer ops sequence after input + auto shape_infer_consumers = LinearIR::propagate_expr_through_shape_infer_ops(io_expr, true); + if (!shape_infer_consumers.empty()) { + for (const auto& child_shape_infer_expr : shape_infer_consumers) { + manually_assigned_gprs[child_shape_infer_expr->get_output_port_connector(0)] = io_expr->get_index(); + } } } else if (io_expr->get_type() == IOExpression::io_type::OUTPUT) { 
manually_assigned_gprs[expr->get_input_port_connector(0)] = num_parameters + io_expr->get_index(); + // shape infer ops sequence before result + auto shape_infer_sources = LinearIR::propagate_expr_through_shape_infer_ops(io_expr, false); + if (!shape_infer_sources.empty()) { + for (const auto& parent_shape_infer_expr : shape_infer_sources) { + manually_assigned_gprs[parent_shape_infer_expr->get_input_port_connector(0)] = num_parameters + io_expr->get_index(); + } + } } else { OPENVINO_THROW("Unsupported io_type detected"); } @@ -97,6 +106,16 @@ bool AssignRegisters::run(LinearIR& linear_ir) { if (ov::is_type(buffer)) { manually_assigned_gprs[expr->get_input_port_connector(0)] = static_cast(num_results + num_parameters + buffer_id); + // shape infer ops in the middle of subgraph. IntermediateMemoryBuffer is inserted before reshape as new loop should start. + // child shape info ops share the same memory as IntermediateMemoryBuffer. + auto shape_infer_consumers = LinearIR::propagate_expr_through_shape_infer_ops(expr, true); + if (!shape_infer_consumers.empty()) { + for (const auto& child_shape_infer_expr : shape_infer_consumers) { + manually_assigned_gprs[child_shape_infer_expr->get_input_port_connector(0)] = + manually_assigned_gprs[child_shape_infer_expr->get_output_port_connector(0)] = + static_cast(num_results + num_parameters + buffer_id); + } + } } manually_assigned_gprs[expr->get_output_port_connector(0)] = static_cast(num_results + num_parameters + buffer_id); diff --git a/src/common/snippets/src/lowered/pass/insert_buffers.cpp b/src/common/snippets/src/lowered/pass/insert_buffers.cpp index eb72f971ced1c4..3174add775fae5 100644 --- a/src/common/snippets/src/lowered/pass/insert_buffers.cpp +++ b/src/common/snippets/src/lowered/pass/insert_buffers.cpp @@ -147,10 +147,22 @@ void InsertBuffers::insertion(LinearIR& linear_ir, const auto& expr = entry_port->get_expr(); const auto port_idx = entry_port->get_index(); const auto node = expr->get_node(); - const auto& 
parent_expr_output = expr->get_input_port_connector(port_idx)->get_source(); + auto parent_expr_output = expr->get_input_port_connector(port_idx)->get_source(); + + const auto& first_parent_expr = parent_expr_output.get_expr(); + bool has_shape_infer_parent = false; + auto top_shape_infer_expr = expr; + // parent before shape infer ops is used to determine if buffer needed according loopInfo + auto shape_infer_parents = LinearIR::propagate_expr_through_shape_infer_ops(first_parent_expr, false); + if (!shape_infer_parents.empty()) { + parent_expr_output = shape_infer_parents.back()->get_input_port_connector(0)->get_source(); + has_shape_infer_parent = true; + top_shape_infer_expr = shape_infer_parents.back(); + } + const auto& parent_expr = parent_expr_output.get_expr(); - const auto parent_port = parent_expr_output.get_index(); - const auto parent = parent_expr->get_node(); + const auto& parent_port = parent_expr_output.get_index(); + const auto& parent = parent_expr->get_node(); if (ov::is_type(parent) || ov::is_type(parent) || ov::is_type(parent) || @@ -178,7 +190,8 @@ void InsertBuffers::insertion(LinearIR& linear_ir, parent_expr_output, m_buffer_allocation_rank); const auto buffer = std::make_shared(parent->output(parent_port), allocation_shape); - linear_ir.insert_node(buffer, std::vector{ parent_expr_output }, buffer_loop_ids, false, pos, { *entry_port }); + const auto buffer_consumer = has_shape_infer_parent ? 
top_shape_infer_expr->get_input_port(0) : *entry_port; + linear_ir.insert_node(buffer, std::vector{ parent_expr_output }, buffer_loop_ids, false, pos, { buffer_consumer }); } } diff --git a/src/common/snippets/src/lowered/pass/insert_load_store.cpp b/src/common/snippets/src/lowered/pass/insert_load_store.cpp index 2accd66309d49a..fcc21ceedc1cde 100644 --- a/src/common/snippets/src/lowered/pass/insert_load_store.cpp +++ b/src/common/snippets/src/lowered/pass/insert_load_store.cpp @@ -36,12 +36,7 @@ size_t InsertLoadStore::get_count(const ExpressionPort& port) const { bool InsertLoadStore::insert_load(LinearIR& linear_ir, const LinearIR::constExprIt& data_expr_it) { std::shared_ptr data_expr = *data_expr_it; - auto consumer_inputs = data_expr->get_output_port_connector(0)->get_consumers(); - const auto& first_consumer = consumer_inputs.begin()->get_expr(); - if (is_type(first_consumer->get_node())) { - OPENVINO_ASSERT(consumer_inputs.size() == 1, "RankNormalization is supposed to be the only consumer"); - data_expr = first_consumer; - } + data_expr = LinearIR::get_last_shape_infer_expr(data_expr, true); const auto& data_ngraph_output = data_expr->get_node()->output(0); bool was_inserted = false; const auto& data_out = data_expr->get_output_port_connector(0); @@ -61,7 +56,9 @@ bool InsertLoadStore::insert_load(LinearIR& linear_ir, const LinearIR::constExpr } bool InsertLoadStore::insert_store(LinearIR& linear_ir, const LinearIR::constExprIt& data_expr_it) { - const auto& data_expr = *data_expr_it; + auto data_expr = *data_expr_it; + data_expr = LinearIR::get_last_shape_infer_expr(data_expr, false); + const auto& parent_output = data_expr->get_input_port_connector(0)->get_source(); const auto& parent_expr = parent_output.get_expr(); const auto port = parent_output.get_index(); diff --git a/src/common/snippets/src/lowered/pass/mark_loops.cpp b/src/common/snippets/src/lowered/pass/mark_loops.cpp index 3ff96b6ce374f4..ded7de36040576 100644 --- 
a/src/common/snippets/src/lowered/pass/mark_loops.cpp +++ b/src/common/snippets/src/lowered/pass/mark_loops.cpp @@ -27,7 +27,8 @@ bool MarkLoops::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, l return ov::is_type(node) || ov::is_type(node) || ov::is_type(node) || - ov::is_type(node); + ov::is_type(node) || + ov::is_type(node); }; auto are_conflicted = [](const ExpressionPort& lhs, const ExpressionPort& rhs) { diff --git a/src/common/snippets/src/lowered/pass/validate.cpp b/src/common/snippets/src/lowered/pass/validate.cpp index 8dc95a94f9c015..b9a57801d6a351 100644 --- a/src/common/snippets/src/lowered/pass/validate.cpp +++ b/src/common/snippets/src/lowered/pass/validate.cpp @@ -32,13 +32,8 @@ void validate_ports(const ExpressionPtr& expr) { void validate_parameter(const ExpressionPtr& expr, const LinearIR& linear_ir) { OPENVINO_ASSERT(ov::is_type(expr->get_node()), "Parameter validation expects Parameter op"); - auto consumer_inputs = expr->get_output_port_connector(0)->get_consumers(); - const auto& first_consumer = consumer_inputs.begin()->get_expr(); - if (is_type(first_consumer->get_node())) { - OPENVINO_ASSERT(consumer_inputs.size() == 1, - "If there is RankNormalization after Parameter, it should be single consumer of the Parameter"); - consumer_inputs = first_consumer->get_output_port_connector(0)->get_consumers(); - } + auto expr_val = LinearIR::get_last_shape_infer_expr(expr, true); + auto consumer_inputs = expr_val->get_output_port_connector(0)->get_consumers(); std::set> layouts; for (const auto& consumer_input : consumer_inputs) { const auto& node = consumer_input.get_expr()->get_node(); @@ -56,7 +51,8 @@ void validate_parameter(const ExpressionPtr& expr, const LinearIR& linear_ir) { void validate_result(const ExpressionPtr& expr, const LinearIR& linear_ir) { OPENVINO_ASSERT(ov::is_type(expr->get_node()), "Result validation expects Result op"); - const auto source = expr->get_input_port_connector(0)->get_source(); + auto expr_val = 
LinearIR::get_last_shape_infer_expr(expr, false); + const auto source = expr_val->get_input_port_connector(0)->get_source(); const auto ma = ov::as_type_ptr(source.get_expr()->get_node()); OPENVINO_ASSERT(ma && ma->is_memory_access_output_port(source.get_index()), "Result expects MemoryAccess parent"); @@ -70,8 +66,8 @@ void validate_buffer(const ExpressionPtr& expr, const LinearIR& linear_ir) { const auto ma = ov::as_type_ptr(source.get_expr()->get_node()); OPENVINO_ASSERT(ma && ma->is_memory_access_input_port(source.get_index()), "Buffer expects MemoryAccess parent"); - - const auto& out = expr->get_output_port_connector(0); + auto expr_val = LinearIR::get_last_shape_infer_expr(expr, true); + const auto& out = expr_val->get_output_port_connector(0); const auto consumers = out->get_consumers(); for (const auto& consumer_input : consumers) { const auto& node = consumer_input.get_expr()->get_node(); diff --git a/src/common/snippets/src/op/reshape.cpp b/src/common/snippets/src/op/reshape.cpp new file mode 100644 index 00000000000000..65927f2ee4e2bf --- /dev/null +++ b/src/common/snippets/src/op/reshape.cpp @@ -0,0 +1,43 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/itt.hpp" + +#include "snippets/op/reshape.hpp" +#include "snippets/utils.hpp" + + +namespace ov { +namespace snippets { +namespace op { +Reshape::Reshape(const Output& arg, ov::PartialShape target_shape) + : Op({arg}), m_target_shape(std::move(target_shape)) { + constructor_validate_and_infer_types(); +} + +void Reshape::validate_and_infer_types() { + set_output_type(0, get_input_element_type(0), m_target_shape); +} + +std::shared_ptr Reshape::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(Reshape); + check_new_args_count(this, new_args); + return std::make_shared(new_args.at(0), m_target_shape); +} + +bool Reshape::visit_attributes(AttributeVisitor& visitor) { + visitor.on_attribute("target_shape", 
m_target_shape); + return true; +} + +const ov::PartialShape& Reshape::get_target_shape() const { + return m_target_shape; +} + +void Reshape::set_target_shape(ov::PartialShape shape) { + m_target_shape = std::move(shape); +} +}// namespace op +}// namespace snippets +}// namespace ov \ No newline at end of file diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp index 70224751f1f810..30da2c387ff422 100644 --- a/src/common/snippets/src/op/subgraph.cpp +++ b/src/common/snippets/src/op/subgraph.cpp @@ -56,6 +56,8 @@ #include #include +#include "snippets/lowered/pass/serialize_control_flow.hpp" + using namespace std; using namespace ov::op::util; @@ -77,7 +79,42 @@ auto Subgraph::is_domain_sensitive_op(const std::shared_ptr& op) -> bo ov::is_type(op) || ov::is_type(op) || ov::is_type(op) || // Broadcast is domain sensetive op because the output shape depends on - ov::is_type(op); // the both input and broadcast shapes (the both - are inputs of op). Note: is used only in MHA pattern + ov::is_type(op) || // the both input and broadcast shapes (the both - are inputs of op). 
Note: is used only in MHA pattern + ov::is_type(op) || + ov::is_type(op); +} + +auto Subgraph::is_shape_infer_op(const std::shared_ptr& op) -> bool { + return ov::is_type(op) || + ov::is_type(op); +} + +auto Subgraph::get_last_shape_infer_op(const std::shared_ptr& op, bool downstream) -> std::shared_ptr { + auto last_op = op; + if (downstream) { + if (last_op->get_output_size() == 0) + return last_op; + auto first_child = last_op->get_output_target_inputs(0).begin()->get_node()->shared_from_this(); + while (op::Subgraph::is_shape_infer_op(first_child)) { + last_op = first_child; + if (last_op->get_output_size() == 0) + break; + first_child = last_op->get_output_target_inputs(0).begin()->get_node()->shared_from_this(); + } + return last_op; + } else { + // upstream + if (last_op->get_input_size() == 0) + return last_op; + auto first_parent = last_op->get_input_node_shared_ptr(0); + while (op::Subgraph::is_shape_infer_op(first_parent)) { + last_op = first_parent; + if (last_op->get_input_size() == 0) + break; + first_parent = last_op->get_input_node_shared_ptr(0); + } + return last_op; + } } void Subgraph::init_config() { @@ -273,7 +310,8 @@ auto Subgraph::constant_input_should_be_inside_body(const std::shared_ptr(node) || ov::is_type(node) || ov::is_type(node) || - ov::is_type(node); + ov::is_type(node) || + ov::is_type(node); } bool Subgraph::check_broadcast(const std::shared_ptr& node) noexcept { @@ -319,7 +357,7 @@ VectorDims Subgraph::infer_master_shape() { OPENVINO_ASSERT(!output_dims.empty(), "Can't calculate master_shape before the first shape inference"); } else { for (const auto& res : body_ptr()->get_results()) { - const auto& res_input = res->input(0); + auto res_input = get_last_shape_infer_op(res, false)->input(0); OPENVINO_ASSERT(res_input.get_partial_shape().is_static(), "Result have dynamic shape in static pipeline"); // We need to account to the shape's layout stored in Output rt_info const auto& planar_shape = 
utils::get_preordered_pshape(res_input.get_source_output()); @@ -405,6 +443,12 @@ void Subgraph::data_flow_transformations(const BlockedShapeVector& blocked_input manager.register_positioned_passes(backend_passes); manager.run_passes(body_ptr()); + + // ov::pass::Manager magr; + // std::string xmlo = "data_flow.xml"; + // std::string bino = "data_flow.bin"; + // magr.register_pass(xmlo, bino); + // magr.run_passes(body_ptr()); } void Subgraph::control_flow_transformations(lowered::LinearIR& linear_ir, @@ -437,7 +481,8 @@ void Subgraph::control_flow_transformations(lowered::LinearIR& linear_ir, pipeline.register_pass(); pipeline.register_pass(); pipeline.register_pass(); - pipeline.register_pass(lowering_result.buffer_scratchpad_size, linear_ir.get_config().m_are_buffers_optimized); + pipeline.register_pass(lowering_result.buffer_scratchpad_size, lowering_result.buffer_inplace_output, + linear_ir.get_config().m_are_buffers_optimized); pipeline.register_pass(); pipeline.register_positioned_passes(lowered_backend_passes); pipeline.register_pass(); // must be last @@ -478,6 +523,11 @@ snippets::Schedule Subgraph::generate_from_linear_ir(const std::shared_ptrgenerate(linear_ir, lowering_result, compile_params); VectorDims parallel_exec_domain = linear_ir.get_master_shape(); diff --git a/src/common/snippets/src/pass/align_element_types.cpp b/src/common/snippets/src/pass/align_element_types.cpp index 625294d9e092e4..c159167c7496e7 100644 --- a/src/common/snippets/src/pass/align_element_types.cpp +++ b/src/common/snippets/src/pass/align_element_types.cpp @@ -29,7 +29,7 @@ bool pass::AlignElementTypes::run_on_model(const std::shared_ptr& m) for (size_t i = 0; i < m_output_precisions.size(); i++) { const auto needed_out_type = m_output_precisions[i]; if (results[i]->get_input_element_type(0) != needed_out_type) { - std::shared_ptr consumer = results[i]; + std::shared_ptr consumer = op::Subgraph::get_last_shape_infer_op(results[i], false); auto parent_output = 
consumer->get_input_source_output(0); // Snippets supports Transpose only after Parameter or before Result nodes @@ -76,17 +76,11 @@ bool pass::AlignElementTypes::run_on_model(const std::shared_ptr& m) parameter->set_element_type(needed_in_type); parameter->validate_and_infer_types(); - auto parent_output = parameter->output(0); - auto consumer_inputs = parent_output.get_target_inputs(); - - const auto& first_child = consumer_inputs.begin()->get_node()->shared_from_this(); - // Note: RankNormalization of is designed for shape-inference purposes only. + // Note: shape infer ops is designed for shape-inference purposes only. // It does not process any data (nor does it emit any code), so it doesn't require Convert operations - if (is_type(first_child)) { - OPENVINO_ASSERT(consumer_inputs.size() == 1, "RankNormalization is supposed to be the only consumer"); - parent_output = first_child->output(0); - consumer_inputs = parent_output.get_target_inputs(); - } + auto first_child = op::Subgraph::get_last_shape_infer_op(parameter, true); + auto parent_output = first_child->output(0); + auto consumer_inputs = parent_output.get_target_inputs(); // Snippets supports Transpose only after Parameter or before Result nodes // So we have to insert Convert after Transpose (if there is) on Subgraph inputs diff --git a/src/common/snippets/src/pass/common_optimizations.cpp b/src/common/snippets/src/pass/common_optimizations.cpp index 1e10d2dc6dfe6e..7c8089bc776bec 100644 --- a/src/common/snippets/src/pass/common_optimizations.cpp +++ b/src/common/snippets/src/pass/common_optimizations.cpp @@ -6,6 +6,7 @@ #include "snippets/pass/fq_decomposition.hpp" #include "snippets/pass/softmax_reshape_elimination.hpp" +#include "snippets/pass/gn_decomposition.hpp" #include "snippets/pass/explicit_transpose_matmul_inputs.hpp" #include "snippets/pass/transpose_decomposition.hpp" #include "snippets/pass/fuse_transpose_brgemm.hpp" @@ -50,6 +51,7 @@ CommonOptimizations::CommonOptimizations(const 
SnippetsTokenization::Config& con REGISTER_SNIPPETS_PASS(manager, ov::snippets::pass::ExplicitTransposeMatMulInputs, is_domain_sensitive); REGISTER_SNIPPETS_PASS(manager, ov::snippets::pass::CommonFakeQuantizeDecomposition, is_quantized); REGISTER_SNIPPETS_PASS(manager, ov::snippets::pass::SoftmaxReshapeElimination, is_domain_sensitive); + REGISTER_SNIPPETS_PASS(manager, ov::snippets::pass::GNDecomposition, is_domain_sensitive); manager.run_passes(body); ov::snippets::pass::CommonOptimizations::SubgraphManager subgraph_manager; diff --git a/src/common/snippets/src/pass/gn_decomposition.cpp b/src/common/snippets/src/pass/gn_decomposition.cpp new file mode 100644 index 00000000000000..aec78587b588a5 --- /dev/null +++ b/src/common/snippets/src/pass/gn_decomposition.cpp @@ -0,0 +1,157 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/pass/gn_decomposition.hpp" + +#include "openvino/op/group_normalization.hpp" +#include "snippets/op/reduce.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "snippets/itt.hpp" +#include "snippets/lowered/port_descriptor.hpp" +#include "snippets/snippets_isa.hpp" +#include "openvino/core/rt_info.hpp" + +namespace ov { +namespace snippets { +namespace pass { +using namespace lowered; + +// groupNorm -> reshape + mvn + reshape + mul + add, +// where mvn = (x - mean) / Sqrt(ReduceMean((x - mean) ^ 2) + eps), +// where mean = ReduceMean(x, axes) +GNDecomposition::GNDecomposition() { + MATCHER_SCOPE(GNDecomposition); + auto group_norm_pattern = ov::pass::pattern::wrap_type(); + + ov::matcher_pass_callback callback = [=](ov::pass::pattern::Matcher& m) { + OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::pass::GNDecomposition") + auto group_norm_node = ov::as_type_ptr(m.get_match_root()); + + const auto data = group_norm_node->input_value(0); + const auto scale = group_norm_node->input_value(1); + const auto bias = group_norm_node->input_value(2); + 
+ const auto num_groups = static_cast(group_norm_node->get_num_groups()); + const float eps = static_cast(group_norm_node->get_epsilon()); + + ////////////collapse to reduce lastDim to avoid nested loop overhead(e.g. reduce tails in inner loop)/////////// + // reshape [N, C, spatial] to [N, group, 1, (C / group) * spatial] + const auto orig_shape = group_norm_node->get_input_partial_shape(0); + size_t orig_rank = orig_shape.rank().get_length(); + size_t group_rank = 4; + std::vector group_dims(group_rank); + group_dims[0] = orig_shape[0]; + group_dims[1] = Dimension(num_groups); + group_dims[2] = Dimension(1); + group_dims[3] = Dimension(orig_shape[1] / num_groups); + Dimension spatial_dim = 1; + for (size_t i = 2; i < orig_rank; ++i) { + spatial_dim = spatial_dim * orig_shape[i]; + } + group_dims[3] = group_dims[3] * spatial_dim; + ov::PartialShape group_shape(group_dims); + std::shared_ptr reshaped_node_orig = std::make_shared(data, group_shape); + + std::shared_ptr reshaped_node1 = reshaped_node_orig; + if (data.get_element_type() != element::f32) { + reshaped_node1 = std::make_shared(reshaped_node_orig, element::f32); + } + + const auto reduce_sum = std::make_shared(reshaped_node1, group_rank - 1); + op::ReduceBase::compute_and_set_reduce_subtensors(reduce_sum); + + // reduceMean + auto group_shape_static = group_shape.to_shape(); + float group_size_inv = 1.0f / static_cast(group_shape_static[3]); + const auto group_size_inv_node = std::make_shared(element::f32, Shape{}, std::vector{group_size_inv}); + const auto reduce_mean = std::make_shared(reduce_sum, group_size_inv_node); + + // x - mean + std::shared_ptr reshaped_node2 = reshaped_node_orig; + if (data.get_element_type() != element::f32) { + reshaped_node2 = std::make_shared(reshaped_node_orig, element::f32); + } + auto sub_mean = std::make_shared(reshaped_node2, reduce_mean); + // (x - mean) ^ 2 + auto sqr_const = std::make_shared(element::f32, Shape{1}, std::vector{2}); + auto sqr = 
std::make_shared(sub_mean, sqr_const); + // reduceSum((x - mean) ^ 2) + auto sqr_reduce_sum = std::make_shared(sqr, group_rank - 1); + op::ReduceBase::compute_and_set_reduce_subtensors(sqr_reduce_sum); + // reduceMean((x - mean) ^ 2) + const auto group_size_inv_node_aux = std::make_shared(element::f32, Shape{}, std::vector{group_size_inv}); + auto sqr_mean = std::make_shared(sqr_reduce_sum, group_size_inv_node_aux); + // reduceMean((x - mean) ^ 2) + eps + auto eps_node = std::make_shared(element::f32, Shape{1}, std::vector{eps}); + auto eps_add = std::make_shared(sqr_mean, eps_node); // fma to this add and parent multiply + // variance = sqrt( reducemean( (x - mean) ^ 2 ) + eps ) + auto variance = std::make_shared(eps_add); + + // divide variance + const auto variance_inv = std::make_shared(variance, -1.f); + + // remove invariance in inner loop + std::vector subtensor_invariance(group_rank, 1); + subtensor_invariance[3] = PortDescriptor::ServiceDimensions::FULL_DIM; + PortDescriptorUtils::set_port_descriptor_ptr(reduce_mean->input(0), std::make_shared(reduce_mean->input(0), subtensor_invariance)); + PortDescriptorUtils::set_port_descriptor_ptr(reduce_mean->output(0), std::make_shared(reduce_mean->output(0), subtensor_invariance)); + PortDescriptorUtils::set_port_descriptor_ptr(sqr_mean->input(0), std::make_shared(sqr_mean->input(0), subtensor_invariance)); + PortDescriptorUtils::set_port_descriptor_ptr(sqr_mean->input(1), std::make_shared(sqr_mean->input(1), subtensor_invariance)); + PortDescriptorUtils::set_port_descriptor_ptr(sqr_mean->output(0), std::make_shared(sqr_mean->output(0), subtensor_invariance)); + PortDescriptorUtils::set_port_descriptor_ptr(eps_add->input(0), std::make_shared(eps_add->input(0), subtensor_invariance)); + PortDescriptorUtils::set_port_descriptor_ptr(eps_add->input(1), std::make_shared(eps_add->input(1), subtensor_invariance)); + PortDescriptorUtils::set_port_descriptor_ptr(eps_add->output(0), std::make_shared(eps_add->output(0), 
subtensor_invariance)); + PortDescriptorUtils::set_port_descriptor_ptr(variance->input(0), std::make_shared(variance->input(0), subtensor_invariance)); + PortDescriptorUtils::set_port_descriptor_ptr(variance->output(0), std::make_shared(variance->output(0), subtensor_invariance)); + PortDescriptorUtils::set_port_descriptor_ptr(variance_inv->input(0), std::make_shared(variance_inv->input(0), subtensor_invariance)); + PortDescriptorUtils::set_port_descriptor_ptr(variance_inv->output(0), std::make_shared(variance_inv->output(0), subtensor_invariance)); + + auto mvn = std::make_shared(sub_mean, variance_inv); + + // reshape mvn from [N, group, 1, (C / group) * spatial] to [N, group, C / group, spatial] + std::vector group_channel_dims(group_rank); + group_channel_dims[0] = group_dims[0]; + group_channel_dims[1] = group_dims[1]; + group_channel_dims[2] = Dimension(orig_shape[1] / num_groups); + group_channel_dims[3] = spatial_dim; + ov::PartialShape group_channel_shape(group_channel_dims); + const auto mvn_reshaped = std::make_shared(mvn, group_channel_shape); + + // reshape scale and bias to [1, group, C / group, 1] + std::vector scale_bias_dims(group_rank, Dimension(1)); + scale_bias_dims[1] = group_channel_dims[1]; + scale_bias_dims[2] = group_channel_dims[2]; + ov::PartialShape scale_bias_shape(scale_bias_dims); + std::shared_ptr reshape_scale = std::make_shared(scale, scale_bias_shape); + if (scale.get_element_type() != element::f32) { + reshape_scale = std::make_shared(reshape_scale, element::f32); + } + std::shared_ptr reshape_bias = std::make_shared(bias, scale_bias_shape); + if (bias.get_element_type() != element::f32) { + reshape_bias = std::make_shared(reshape_bias, element::f32); + } + + // scaled mvn_reshape[2,5,2,64] reshape_scale[1,5,2,1] -> scaled_node[2,5,2,64] + auto scaled_node = std::make_shared(mvn_reshaped, reshape_scale); + auto biased_node = std::make_shared(scaled_node, reshape_bias); + + auto result_prec = 
group_norm_node->get_output_element_type(0); + std::shared_ptr biased_node_convert = biased_node; + if (result_prec != element::f32) { + biased_node_convert = std::make_shared(biased_node, result_prec); + } + + // reshape_back [N, group, C / group, spatial] to [N, C, spatial] + const auto reshape_back_node = std::make_shared(biased_node_convert, orig_shape); + + return ov::replace_node_update_name(group_norm_node, reshape_back_node); + }; + + auto m = std::make_shared(group_norm_pattern, matcher_name); + register_matcher(m, callback); +} + +} // namespace pass +} // namespace snippets +} // namespace ov \ No newline at end of file diff --git a/src/common/snippets/src/pass/gn_tokenization.cpp b/src/common/snippets/src/pass/gn_tokenization.cpp new file mode 100644 index 00000000000000..62fe124b2a4f01 --- /dev/null +++ b/src/common/snippets/src/pass/gn_tokenization.cpp @@ -0,0 +1,38 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/pass/gn_tokenization.hpp" +#include "snippets/pass/collapse_subgraph.hpp" + +#include "snippets/itt.hpp" +#include "snippets/op/subgraph.hpp" +#include "snippets/utils.hpp" + +#include "openvino/core/rt_info.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" + +ov::snippets::pass::TokenizeGNSnippets::TokenizeGNSnippets() { + MATCHER_SCOPE(TokenizeGNSnippets); + + auto group_norm_pattern = ov::pass::pattern::wrap_type(); + + ov::matcher_pass_callback callback = [=](ov::pass::pattern::Matcher& m) { + OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::pass::TokenizeGNSnippets") + auto group_norm_node = ov::as_type_ptr(m.get_match_root()); + if (group_norm_node->is_dynamic() || group_norm_node->get_element_type() != element::f32) + return false; + + auto subgraph = op::Subgraph::wrap_node_as_subgraph(group_norm_node); + subgraph->get_rt_info()["originalLayersNames"] = group_norm_node->get_friendly_name(); + ov::replace_node(group_norm_node, subgraph); + 
op::update_out_tensor_name(subgraph); + + // mark the Subgraph as Completed to not allow Snippets to include any nodes into the GN Subgraph in common Tokenization + SetSnippetsSubgraphType(subgraph, SnippetsSubgraphType::Completed); + + return true; + }; + auto m = std::make_shared(group_norm_pattern, matcher_name); + register_matcher(m, callback); +} diff --git a/src/common/snippets/src/pass/tokenization.cpp b/src/common/snippets/src/pass/tokenization.cpp index 4b96a1f60a8977..30cbb42a031f46 100644 --- a/src/common/snippets/src/pass/tokenization.cpp +++ b/src/common/snippets/src/pass/tokenization.cpp @@ -9,6 +9,7 @@ #include "snippets/pass/common_optimizations.hpp" #include "snippets/pass/extract_reshapes_from_mha.hpp" #include "snippets/pass/mha_tokenization.hpp" +#include "snippets/pass/gn_tokenization.hpp" #include "snippets/pass/collapse_subgraph.hpp" @@ -81,6 +82,7 @@ bool SnippetsTokenization::run_on_model(const std::shared_ptr& m) { manager.register_pass(); manager.register_pass(); manager.register_pass(m_config); + manager.register_pass(); manager.register_pass(); manager.register_pass(m_config); manager.run_passes(m); diff --git a/src/common/snippets/src/shape_inference/shape_infer_instances.cpp b/src/common/snippets/src/shape_inference/shape_infer_instances.cpp index e8df94bb670d12..371de613305f37 100644 --- a/src/common/snippets/src/shape_inference/shape_infer_instances.cpp +++ b/src/common/snippets/src/shape_inference/shape_infer_instances.cpp @@ -245,5 +245,22 @@ Result ReduceShapeInfer::infer(const std::vector& input_shapes) { return {{result_shape}, ShapeInferStatus::success}; } +ReshapeShapeInfer::ReshapeShapeInfer(const std::shared_ptr& n) { + const auto& reshape = as_type_ptr(n); + OPENVINO_ASSERT(reshape, "Invalid node passed to ReshapeShapeInfer."); + target_shape = reshape->get_target_shape(); +} + +Result ReshapeShapeInfer::infer(const std::vector& input_shapes) { + OPENVINO_ASSERT(input_shapes.size() == 1, "Invalid number of shapes is passed 
in ReshapeShapeInfer"); + OPENVINO_ASSERT(target_shape.is_static(), "target_shape should be static in ReshapeShapeInfer"); + VectorDims result_shape = target_shape.get_shape(); + const auto input_elems = utils::get_shape_size(input_shapes[0].get()); + const auto output_elems = utils::get_shape_size(result_shape); + OPENVINO_ASSERT(input_elems == output_elems, "Tensor volume should be the same after reshape in ReshapeShapeInfer"); + + return {{result_shape}, ShapeInferStatus::success}; +} + } // namespace snippets } // namespace ov diff --git a/src/common/snippets/src/shape_inference/shape_inference.cpp b/src/common/snippets/src/shape_inference/shape_inference.cpp index b6760d87e1afcb..d6c6081113ea1f 100644 --- a/src/common/snippets/src/shape_inference/shape_inference.cpp +++ b/src/common/snippets/src/shape_inference/shape_inference.cpp @@ -60,6 +60,7 @@ const IShapeInferSnippetsFactory::TRegistry IShapeInferSnippetsFactory::registry SHAPE_INFER_PREDEFINED(op::KernelStatic, EmptyShapeInfer), SHAPE_INFER_PREDEFINED(op::KernelDynamic, EmptyShapeInfer), SHAPE_INFER_PREDEFINED(op::Nop, EmptyShapeInfer), + SHAPE_INFER_OP_SPECIFIC_EXTERNAL(op::Reshape, ReshapeShapeInfer), SHAPE_INFER_OP_SPECIFIC_EXTERNAL(opset1::Select, SelectShapeInfer), SHAPE_INFER_OP_SPECIFIC_EXTERNAL(op::Brgemm, BrgemmShapeInfer), SHAPE_INFER_OP_SPECIFIC_EXTERNAL(op::ReduceMax, ReduceShapeInfer), diff --git a/src/common/snippets/tests/include/lowered/pass/buffer_allocation.hpp b/src/common/snippets/tests/include/lowered/pass/buffer_allocation.hpp index dd5dd631437cd8..e6bafc19ef1700 100644 --- a/src/common/snippets/tests/include/lowered/pass/buffer_allocation.hpp +++ b/src/common/snippets/tests/include/lowered/pass/buffer_allocation.hpp @@ -17,7 +17,8 @@ typedef std::tuple< bool, // Optimized pipeline bool, // With SplitLoops opt size_t, // Expected Buffer size in bytes - size_t // Expected unique Buffer IDs count + size_t // Expected unique Buffer IDs count + // int // buffer output inplace > 
BufferAllocationParams; class BufferAllocationTest : public testing::TestWithParam { @@ -46,6 +47,7 @@ class BufferAllocationTest : public testing::TestWithParam(m_vector_size); pipeline.register_pass(); pipeline.register_pass(); - pipeline.register_pass(m_buffer_scratchpad, m_is_buffer_optimized); + pipeline.register_pass(m_buffer_scratchpad, m_buffer_inplace_out, m_is_buffer_optimized); pipeline.run(m_linear_ir); } diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp index cf46840aad8407..3a23ce5de6e655 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp @@ -138,6 +138,7 @@ intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_t ho jitters[snippets::op::NewMemoryBuffer::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); jitters[snippets::op::VectorBuffer::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); jitters[snippets::op::RankNormalization::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); + jitters[snippets::op::Reshape::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); jitters[snippets::op::Load::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_memory_emitter); jitters[snippets::op::LoadReshape::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_memory_emitter); diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_kernel_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_kernel_emitter.cpp index 75372646a23622..e82d7cdd5b36dc 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_kernel_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_kernel_emitter.cpp @@ -200,13 +200,9 @@ 
jit_kernel_static_emitter::jit_kernel_static_emitter(dnnl::impl::cpu::x64::jit_g element::Type etype; switch (expr->get_type()) { case snippets::lowered::IOExpression::io_type::INPUT: { - // Note that here we consider only the first child (which is usually load), - // but often there is another child - LoopEnd - auto consumer_inputs = expr->get_output_port_connector(0)->get_consumers(); - const auto& first_consumer = consumer_inputs.begin()->get_expr(); - // If there is a RankNormalization op after a parameter - we should skip it - if (is_type(first_consumer->get_node())) - consumer_inputs = first_consumer->get_output_port_connector(0)->get_consumers(); + // input->shape changing ops->load + auto mem_desc_expr = ov::snippets::lowered::LinearIR::get_last_shape_infer_expr(expr, true); + auto consumer_inputs = mem_desc_expr->get_output_port_connector(0)->get_consumers(); for (const auto& child_input : consumer_inputs) { const auto ma = ov::as_type_ptr(child_input.get_expr()->get_node()); if (ma && ma->is_memory_access_input_port(child_input.get_index())) { @@ -214,12 +210,15 @@ jit_kernel_static_emitter::jit_kernel_static_emitter(dnnl::impl::cpu::x64::jit_g break; } } - etype = expr->get_node()->get_output_element_type(0); + etype = mem_desc_expr->get_node()->get_output_element_type(0); + break; break; } case snippets::lowered::IOExpression::io_type::OUTPUT: { - desc = expr->get_input_port_connector(0)->get_source().get_descriptor_ptr(); - etype = expr->get_node()->get_input_element_type(0); + // store->shape changing ops->result + auto mem_desc_expr = ov::snippets::lowered::LinearIR::get_last_shape_infer_expr(expr, false); + desc = mem_desc_expr->get_input_port_connector(0)->get_source().get_descriptor_ptr(); + etype = mem_desc_expr->get_node()->get_input_element_type(0); break; } default : { OPENVINO_THROW("Kernel detected unsupported io_type"); diff --git a/src/plugins/intel_cpu/src/extension.cpp b/src/plugins/intel_cpu/src/extension.cpp index 
15b18de1d9689a..34b56d0cd4e5af 100644 --- a/src/plugins/intel_cpu/src/extension.cpp +++ b/src/plugins/intel_cpu/src/extension.cpp @@ -168,7 +168,8 @@ class TypeRelaxedExtension : public ov::OpExtension> { OP_EXTENSION(ov::snippets::op::VectorBuffer) \ OP_EXTENSION(ov::snippets::op::RankNormalization) \ OP_EXTENSION(ov::snippets::op::ReduceMax) \ - OP_EXTENSION(ov::snippets::op::ReduceSum) + OP_EXTENSION(ov::snippets::op::ReduceSum) \ + OP_EXTENSION(ov::snippets::op::Reshape) OPENVINO_CREATE_EXTENSIONS(std::vector( {CPU_EXTENSIONS TYPE_RELAXED_EXTENSIONS SNIPPETS_EXTENSIONS SNIPPETS_DEBUG_CAPS_EXTENSIONS})); diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index f7a50ffa14852f..791be15c197a1c 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -505,16 +505,20 @@ void Snippet::SnippetJitExecutor::exec(const std::vector& inMemPtrs, } void Snippet::SnippetJitExecutor::update_ptrs(jit_snippets_call_args& call_args, - const std::vector& inMemPtrs, const std::vector& outMemPtrs) { + const std::vector& inMemPtrs, const std::vector& outMemPtrs, size_t buffer_offset) { for (size_t i = 0; i < inMemPtrs.size(); i++) call_args.src_ptrs[i] = inMemPtrs[i]->getDataAs() + start_offset_in[i]; for (size_t i = 0; i < outMemPtrs.size(); i++) call_args.dst_ptrs[i] = outMemPtrs[i]->getDataAs() + start_offset_out[i]; - if (buffer_scratchpad_size > 0) { - call_args.buffer_scratchpad_ptr = + if (buffer_inplace_output >= 0) { + call_args.buffer_scratchpad_ptr = call_args.dst_ptrs[buffer_inplace_output] + buffer_offset * dataSize[buffer_inplace_output + numInput]; + } else { + if (buffer_scratchpad_size > 0) { + call_args.buffer_scratchpad_ptr = reinterpret_cast(buffer_scratchpad.data()) + parallel_get_thread_num() * buffer_scratchpad_size; + } } } @@ -547,7 +551,12 @@ void Snippet::SnippetJitExecutor::schedule_6d(const std::vector& inMe [&](int64_t d0, int64_t d1, int64_t d2, 
int64_t d3, int64_t d4) { int64_t indexes[] = {d0, d1, d2, d3, d4}; jit_snippets_call_args call_args; - update_ptrs(call_args, inMemPtrs, outMemPtrs); + size_t buffer_offset = 0; + if (buffer_inplace_output >= 0) { + for (size_t i = 0; i < sizeof(indexes) / sizeof(indexes[0]); i++) + buffer_offset += indexes[i] * master_shape_stride[i]; + } + update_ptrs(call_args, inMemPtrs, outMemPtrs, buffer_offset); callable(&call_args, indexes); }); } @@ -558,9 +567,6 @@ void Snippet::SnippetJitExecutor::schedule_nt(const std::vector& inMe segfault_detector(); #endif parallel_nt(0, [&](const int ithr, const int nthr) { - jit_snippets_call_args call_args; - update_ptrs(call_args, inMemPtrs, outMemPtrs); - size_t start = 0, end = 0; splitter(harnessWorkAmount, nthr, ithr, start, end); @@ -571,7 +577,13 @@ void Snippet::SnippetJitExecutor::schedule_nt(const std::vector& inMe indexes[j] = static_cast(tmp % work_size[j]); tmp /= work_size[j]; } - + size_t buffer_offset = 0; + if (buffer_inplace_output >= 0) { + for (size_t i = 0; i < indexes.size(); i++) + buffer_offset += indexes[i] * master_shape_stride[i]; + } + jit_snippets_call_args call_args; + update_ptrs(call_args, inMemPtrs, outMemPtrs, buffer_offset); schedule.get_callable()(&call_args, indexes.data()); } }); @@ -595,10 +607,10 @@ Snippet::SnippetJitExecutor::SnippetJitExecutor(SnippetAttrs attrs, bool is_dyna in_shapes.emplace_back(s); snippetAttrs.snippet->shape_infer(in_shapes); } - const VectorDims& canonicalShape = snippetAttrs.snippet->infer_master_shape(); + master_shape = snippetAttrs.snippet->infer_master_shape(); // initialize by maximum output dimension. 
Dimensions of outputs should be broadcastable - tensorRank = std::max(static_cast(rank6D), canonicalShape.size()); + tensorRank = std::max(static_cast(rank6D), master_shape.size()); auto initDataSizes = [this]() { dataSize.resize(numInput + numOutput); for (size_t i = 0; i < numInput; i++) @@ -608,18 +620,26 @@ Snippet::SnippetJitExecutor::SnippetJitExecutor(SnippetAttrs attrs, bool is_dyna }; initDataSizes(); - if (snippets::utils::is_dynamic_vdims(canonicalShape)) + if (snippets::utils::is_dynamic_vdims(master_shape)) OPENVINO_THROW("Snippets: Canonicalization returned dynamic shape in static pipeline"); // generate jit_snippets_compile_args jcp; jcp.parallel_executor_ndims = tensorRank; generate(&jcp); - buffer_scratchpad_size = schedule.lowering_result.buffer_scratchpad_size; - buffer_scratchpad.resize(buffer_scratchpad_size * parallel_get_max_threads(), 0); + buffer_inplace_output = schedule.lowering_result.buffer_inplace_output; + if (buffer_inplace_output == -1) { + buffer_scratchpad_size = schedule.lowering_result.buffer_scratchpad_size; + buffer_scratchpad.resize(buffer_scratchpad_size * parallel_get_max_threads(), 0); + } parallel_exec_domain = schedule.parallel_exec_domain; harnessWorkAmount = std::accumulate(parallel_exec_domain.begin(), parallel_exec_domain.end(), 1, std::multiplies()); parallel_exec_domain = getNormalizedDimsBySize(parallel_exec_domain, tensorRank); + master_shape = getNormalizedDimsBySize(master_shape, tensorRank); + master_shape_stride = std::vector(master_shape.size(), 1); + for (int i = master_shape_stride.size() - 2 ; i >= 0; i--) { + master_shape_stride[i] = master_shape_stride[i + 1] * master_shape[i + 1]; + } } void Snippet::SnippetJitExecutor::generate(const jit_snippets_compile_args* jcp) { diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.h b/src/plugins/intel_cpu/src/nodes/subgraph.h index 9ce3a3b71b760b..89f5221e09f978 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.h +++ 
b/src/plugins/intel_cpu/src/nodes/subgraph.h @@ -101,7 +101,8 @@ class Snippet : public Node { size_t numOutput = 0; void generate(const jit_snippets_compile_args*); - inline void update_ptrs(jit_snippets_call_args&, const std::vector& inMemPtrs, const std::vector& outMemPtrs); + inline void update_ptrs(jit_snippets_call_args&, const std::vector& inMemPtrs, const std::vector& outMemPtrs, + size_t buffer_offset); // Evaluates generated snippet using parallel backend void schedule_6d(const std::vector& inMemPtrs, const std::vector& outMemPtrs); void schedule_nt(const std::vector& inMemPtrs, const std::vector& outMemPtrs); @@ -125,6 +126,9 @@ class Snippet : public Node { // Buffer scratchpad std::vector buffer_scratchpad = {}; size_t buffer_scratchpad_size = 0; + int buffer_inplace_output = -1; + VectorDims master_shape = {}; + VectorDims master_shape_stride = {}; #ifdef SNIPPETS_DEBUG_CAPS inline void segfault_detector(); diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index 8dbdd42cee0726..f740b397b54f9f 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -67,6 +67,7 @@ #include "transformations/op_conversions/hswish_decomposition.hpp" #include "transformations/op_conversions/gru_cell_decomposition.hpp" #include "transformations/op_conversions/lstm_cell_decomposition.hpp" +#include "transformations/op_conversions/group_normalization_decomposition.hpp" #include "transformations/op_conversions/mvn6_decomposition.hpp" #include "transformations/op_conversions/normalize_l2_decomposition.hpp" #include "transformations/op_conversions/reduce_l1_decomposition.hpp" @@ -470,6 +471,12 @@ void Transformations::PreLpt(const std::vector& defaultPrecis }, ov::pass::NormalizeL2Decomposition); + CPU_SET_CALLBACK_X64(manager, + [this](const_node_ptr &node) -> bool { + 
return !node->is_dynamic() && node->get_element_type() == element::f32 && inferencePrecision != ov::element::bf16; + }, + ov::pass::GroupNormalizationDecomposition); + CPU_ENABLE_PASS_COMMON(manager, ov::pass::SoftmaxDecomposition); CPU_SET_CALLBACK_COMMON(manager, [](const_node_ptr &node) -> bool { diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/group_normalization.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/group_normalization.cpp new file mode 100644 index 00000000000000..bd7257318235be --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/group_normalization.cpp @@ -0,0 +1,74 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include "single_op_tests/group_normalization.hpp" + +namespace { +using ov::test::GroupNormalizationTest; + +const std::vector netPrecisions = { + ov::element::f32, +}; + +// static shapes +const std::vector staticInputShapes = { + {3, 8, 3}, + {3, 8, 8}, + {3, 8, 16}, + {3, 8, 21}, + {1, 4, 8, 8}, + {1, 8, 1, 22}, + {3, 16, 1, 33}, + {1, 4, 1, 1, 34}, + {1, 8, 1, 8, 2, 2}, + {1, 8, 1, 8, 2, 2, 2}, +}; + +// dynmaic shapes +const std::vector DynamicInputShapes = { + {{-1, -1, -1}, {{1, 8, 22}, {2, 4, 7}, {1, 8, 22}}}, + {{-1, -1, -1, -1}, {{1, 16, 8, 8}, {2, 8, 4, 4}, {1, 16, 8, 8}}}, + {{{1, 4}, {4, 16}, -1, -1}, {{1, 4, 6, 6}, {4, 16, 10, 10}, {1, 4, 6, 6}}}, + {{-1, -1, -1, -1, -1}, {{1, 16, 7, 7, 1}, {2, 8, 4, 4, 1}, {1, 16, 7, 7, 1}}}, +}; + +const std::vector numGroups = { + 2, 4, +}; + +const std::vector epsilon = { + 0.0001 +}; + +std::vector additionalConfig = { + {{ov::hint::inference_precision(ov::element::f32)}}, + {{ov::hint::inference_precision(ov::element::bf16)}} +}; + +INSTANTIATE_TEST_SUITE_P( + smoke_GroupNormalizationStatic, + GroupNormalizationTest, + testing::Combine(testing::ValuesIn(netPrecisions), + ::testing::Values(ov::element::undefined), + 
::testing::Values(ov::element::undefined), + testing::ValuesIn(ov::test::static_shapes_to_test_representation(staticInputShapes)), + testing::ValuesIn(numGroups), + testing::ValuesIn(epsilon), + testing::Values(ov::test::utils::DEVICE_CPU), + testing::ValuesIn(additionalConfig)), + GroupNormalizationTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P( + smoke_GroupNormalizationDyn, + GroupNormalizationTest, + testing::Combine(testing::ValuesIn(netPrecisions), + ::testing::Values(ov::element::undefined), + ::testing::Values(ov::element::undefined), + testing::ValuesIn(DynamicInputShapes), + testing::ValuesIn(numGroups), + testing::ValuesIn(epsilon), + testing::Values(ov::test::utils::DEVICE_CPU), + testing::ValuesIn(additionalConfig)), + GroupNormalizationTest::getTestCaseName); + +} // anonymous namespace \ No newline at end of file diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp index 8280bdfe251783..cbdee358936444 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp @@ -292,6 +292,8 @@ std::vector disabledTestPatterns() { retVector.emplace_back(R"(.*Extension.OnnxModelWithExtensionFromDSO.*)"); retVector.emplace_back(R"(.*ONNXQuantizedModels/QuantizedModelsTests.MaxPool.*)"); retVector.emplace_back(R"(.*ONNXQuantizedModels/QuantizedModelsTests.Convolution.*)"); + // Ticket: 134601 + retVector.emplace_back(R"(.*smoke_GroupNormalization.*)"); } // invalid test: checks u8 precision for runtime graph, while it should be f32 retVector.emplace_back(R"(smoke_NegativeQuantizedMatMulMultiplyFusion.*)"); diff --git a/src/plugins/intel_cpu/tests/unit/snippets_transformations/lowered/buffer_allocation.cpp b/src/plugins/intel_cpu/tests/unit/snippets_transformations/lowered/buffer_allocation.cpp index 
b66ce1919f6d23..07c84885796093 100644 --- a/src/plugins/intel_cpu/tests/unit/snippets_transformations/lowered/buffer_allocation.cpp +++ b/src/plugins/intel_cpu/tests/unit/snippets_transformations/lowered/buffer_allocation.cpp @@ -88,7 +88,8 @@ class BufferAllocationCPUTest : public testing::TestWithParam(); pipeline.register_pass(); pipeline.register_pass(); - pipeline.register_pass(m_buffer_scratchpad, m_is_buffer_optimized); + int inplace = -1; + pipeline.register_pass(m_buffer_scratchpad, inplace, m_is_buffer_optimized); pipeline.run(m_linear_ir); } diff --git a/src/tests/functional/shared_test_classes/include/shared_test_classes/single_op/group_normalization.hpp b/src/tests/functional/shared_test_classes/include/shared_test_classes/single_op/group_normalization.hpp index 612c53db90ab39..606ee8ede9e972 100644 --- a/src/tests/functional/shared_test_classes/include/shared_test_classes/single_op/group_normalization.hpp +++ b/src/tests/functional/shared_test_classes/include/shared_test_classes/single_op/group_normalization.hpp @@ -27,8 +27,8 @@ class GroupNormalizationTest : public testing::WithParamInterfaceGetParam(); + std::tie(ngPrc, inType, outType, shapes, num_groups, epsilon, targetDevice, additional_config) = this->GetParam(); InputShape biasInputShape = ExtractBiasShape(shapes); init_input_shapes({shapes, biasInputShape, biasInputShape}); ov::ParameterVector params; @@ -73,6 +78,8 @@ class GroupNormalizationTest : public testing::WithParamInterface(results, params, "GroupNormalization"); }