[TRANSFORMATIONS][GPU] SDPA Fusion passes (openvinotoolkit#28042)
### Details:
 - Added basic SDPA fusion pass and QK scaling fusion into SDPA

T5 case

---------

Signed-off-by: Vladimir Paramuzov <[email protected]>
vladimir-paramuzov authored Dec 24, 2024
1 parent f62b94f commit b4c81e0
Showing 9 changed files with 853 additions and 0 deletions.
@@ -0,0 +1,60 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "openvino/pass/matcher_pass.hpp"
#include "transformations_visibility.hpp"

namespace ov {
namespace pass {

/// This pass transforms the following sub-graph into a single ScaledDotProductAttention operation.
/// Before:
/// ┌───────┐     ┌───────┐   ┌───────┐
/// │   Q   │     │   K   │   │   V   │
/// └───┬───┘     └───┬───┘   └───┬───┘
///     │             │           │
///     │             │           │
/// ┌───┴───┐   ┌─────┴──────┐    │
/// │ MatMul│<──│ Transpose  │    │
/// └───┬───┘   │ (Optional) │    │
///     │       └────────────┘    │
/// ┌───┴───┐   ┌─────────────┐   │
/// │  Add  │<──│AttentionMask│   │
/// └───┬───┘   │ (Optional)  │   │
///     │       └─────────────┘   │
/// ┌───┴───┐                     │
/// │Softmax│                     │
/// └───┬───┘                     │
///     │                         │
/// ┌───┴───┐                     │
/// │ MatMul│<────────────────────┘
/// └───┬───┘
/// ┌───┴───┐
/// │ Output│
/// └───────┘
///
/// After:
/// ┌───────┐   ┌───────┐   ┌───────┐   ┌─────────────┐
/// │   Q   │   │   K   │   │   V   │   │AttentionMask│
/// └───┬───┘   └───┬───┘   └───┬───┘   └──────┬──────┘
///     │           │           │              │
///     │           │           │              │
/// ┌───┴───────────┴───────────┴──────────────┴───┐
/// │          ScaledDotProductAttention           │
/// └───────────────────────┬──────────────────────┘
///                         │
///                         │
///                    ┌────┴────┐
///                    │ Output  │
///                    └─────────┘
class TRANSFORMATIONS_API SDPAFusion : public ov::pass::MatcherPass {
public:
OPENVINO_MATCHER_PASS_RTTI("SDPAFusion", "0");
SDPAFusion();
};

} // namespace pass
} // namespace ov
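
For reference, a minimal usage sketch, assuming the standard ov::pass::Manager flow; the apply_sdpa_fusion helper below is illustrative and not part of this commit:

#include <memory>

#include "openvino/core/model.hpp"
#include "openvino/pass/manager.hpp"
#include "transformations/common_optimizations/sdpa_fusion.hpp"

// Registers and runs the new pass; matched MatMul->Softmax->MatMul chains are
// rewritten into single ScaledDotProductAttention ops.
void apply_sdpa_fusion(const std::shared_ptr<ov::Model>& model) {
    ov::pass::Manager manager;
    manager.register_pass<ov::pass::SDPAFusion>();
    manager.run_passes(model);
}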
@@ -0,0 +1,58 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "openvino/pass/matcher_pass.hpp"
#include "transformations_visibility.hpp"

namespace ov {
namespace pass {

/// Merges an explicit multiplication of Q and/or K by a scalar value into the scale input of the SDPA op.
/// Before:
/// ┌───────┐   ┌───────┐   ┌───────┐   ┌─────────────┐   ┌─────────────┐
/// │   Q   │   │   K   │   │   V   │   │AttentionMask│   │    Scale    │
/// └───┬───┘   └───┬───┘   └───┬───┘   │  (Optional) │   │  (Optional) │
///     │           │           │       └──────┬──────┘   └──────┬──────┘
///     │           │           │              │                 │
/// ┌───┴───┐   ┌───┴───┐       │              │                 │
/// │  Mul  │   │  Mul  │       │              │                 │
/// └───┬───┘   └───┬───┘       │              │                 │
///     │           │           │              │                 │
///     │           │           │              │                 │
/// ┌───┴───────────┴───────────┴──────────────┴───┐             │
/// │          ScaledDotProductAttention           │─────────────┘
/// └───────────────────────┬──────────────────────┘
///                         │
///                         │
///                    ┌────┴────┐
///                    │ Output  │
///                    └─────────┘
/// After:
/// ┌───────┐   ┌───────┐   ┌───────┐   ┌─────────────┐   ┌───────┐
/// │   Q   │   │   K   │   │   V   │   │AttentionMask│   │ Scale │
/// └───┬───┘   └───┬───┘   └───┬───┘   └──────┬──────┘   └───┬───┘
///     │           │           │              │              │
///     │           │           │              │              │
/// ┌───┴───────────┴───────────┴──────────────┴───┐          │
/// │          ScaledDotProductAttention           │──────────┘
/// └───────────────────────┬──────────────────────┘
///                         │
///                         │
///                    ┌────┴────┐
///                    │ Output  │
///                    └─────────┘
/// Multiply ops for Q and K are eliminated in the following cases:
/// 1. Q_scale and K_scale are constant
/// 2. Q_scale * SDPA_Scale == 1 or K_scale * SDPA_Scale == 1
class TRANSFORMATIONS_API SDPAScaleFusion : public ov::pass::MatcherPass {
public:
OPENVINO_MATCHER_PASS_RTTI("SDPAScaleFusion", "0");
SDPAScaleFusion();
};

} // namespace pass
} // namespace ov
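
The fusion rests on simple scale algebra: (Q * s_q) x (K * s_k)^T * s == Q x K^T * (s * s_q * s_k), so the per-input Multiply ops collapse into a single scalar on the SDPA scale input. A minimal sketch of that arithmetic; the fuse_sdpa_scale helper is illustrative, not part of this commit:

// Folds constant Q/K scales and the current SDPA scale into a single value;
// when the product equals 1, the corresponding Multiply can be dropped entirely.
float fuse_sdpa_scale(float sdpa_scale, float q_scale, float k_scale) {
    return sdpa_scale * q_scale * k_scale;
}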
@@ -65,6 +65,7 @@
#include "transformations/common_optimizations/remove_multi_subgraph_op_dangling_params.hpp"
#include "transformations/common_optimizations/reshape_sequence_fusion.hpp"
#include "transformations/common_optimizations/ric_fusion.hpp"
#include "transformations/common_optimizations/sdpa_fusion.hpp"
#include "transformations/common_optimizations/select_with_one_value_condition.hpp"
#include "transformations/common_optimizations/sequence_fusion.hpp"
#include "transformations/common_optimizations/shared_ops_optimization.hpp"
@@ -229,6 +230,7 @@ bool ov::pass::MOCTransformations::run_on_model(const std::shared_ptr<ov::Model>
ADD_MATCHER(common_fusions, ConvertTensorIteratorToSequence)
ADD_MATCHER(common_fusions, SplitConcatPairToInterpolateFusion, m_use_shapes)
ADD_MATCHER(common_fusions, ConvolutionToGroupConvolutionFusion)
ADD_MATCHER(common_fusions, SDPAFusion)
if (m_use_shapes) {
ADD_MATCHER(common_fusions, NearestNeighborUpsamplingFusion)
}
@@ -0,0 +1,127 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "transformations/common_optimizations/sdpa_fusion.hpp"

#include "openvino/core/rt_info.hpp"
#include "openvino/core/type.hpp"
#include "openvino/op/add.hpp"
#include "openvino/op/constant.hpp"
#include "openvino/op/matmul.hpp"
#include "openvino/op/scaled_dot_product_attention.hpp"
#include "openvino/op/softmax.hpp"
#include "openvino/op/transpose.hpp"
#include "openvino/op/unsqueeze.hpp"
#include "openvino/pass/pattern/op/optional.hpp"
#include "openvino/pass/pattern/op/pattern.hpp"
#include "openvino/pass/pattern/op/wrap_type.hpp"
#include "transformations/utils/gen_pattern.hpp"

namespace ov {
namespace pass {

SDPAFusion::SDPAFusion() {
using namespace ov::pass::pattern;
using namespace ov::gen_pattern;

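// Pattern: Q x K^T (either MatMul(transpose_b=true) or an explicit
// [0, 1, 3, 2] Transpose of K followed by a plain MatMul), an optional
// additive attention mask, Softmax over the last axis, and a final
// MatMul with V.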
auto q = makePattern(ov::Rank(4));
auto k = makePattern(ov::Rank(4));
auto v = makePattern(ov::Rank(4));
auto mask = makePattern();

auto k_transpose_order = pattern::wrap_type<ov::op::v0::Constant>([](const Output<Node>& node) {
auto axis_order =
std::dynamic_pointer_cast<ov::op::v0::Constant>(node.get_node_shared_ptr())->cast_vector<int64_t>();
return axis_order == std::vector<int64_t>{0, 1, 3, 2};
});

auto k_t = pattern::wrap_type<ov::op::v1::Transpose>({k, k_transpose_order});
auto qk_nn = makePattern<ov::op::v0::MatMul>({q, k_t}, {{"transpose_a", false}, {"transpose_b", false}});
auto qk_nt = makePattern<ov::op::v0::MatMul>({q, k}, {{"transpose_a", false}, {"transpose_b", true}});
auto qk = qk_nt | qk_nn;
auto optional_add_mask = optional<ov::op::v1::Add>({qk, mask});
auto softmax = makePattern<ov::op::v8::Softmax>({optional_add_mask}, {{"axis", "-1"}});
auto qkv = makePattern<ov::op::v0::MatMul>({softmax, v}, {{"transpose_a", false}, {"transpose_b", false}});

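// Both MatMul inputs must be 4D with a static head-size dimension, and the
// head sizes of Q and K must match. When transpose_b is not set, K is
// expected to arrive pre-transposed, so its head size lives on axis 2.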
auto valid_qk_shapes = [](const std::shared_ptr<ov::op::v0::MatMul>& qk_matmul) {
auto q_pshape = qk_matmul->get_input_partial_shape(0);
auto k_pshape = qk_matmul->get_input_partial_shape(1);

const size_t q_head_size_idx = 3;
const size_t k_head_size_idx = qk_matmul->get_transpose_b() ? 3 : 2;

return q_pshape.size() == 4 && k_pshape.size() == 4 && q_pshape[q_head_size_idx].is_static() &&
k_pshape[k_head_size_idx].is_static() &&
q_pshape[q_head_size_idx].get_length() == k_pshape[k_head_size_idx].get_length();
};

ov::matcher_pass_callback callback = [=](ov::pass::pattern::Matcher& m) {
const auto& pattern_map = m.get_pattern_value_map();
if (transformation_callback(m.get_match_root())) {
return false;
}

auto q_node = pattern_map.at(q);
auto k_node = pattern_map.at(k);
auto v_node = pattern_map.at(v);

if (!valid_qk_shapes(ov::as_type_ptr<ov::op::v0::MatMul>(pattern_map.at(qk).get_node_shared_ptr()))) {
return false;
}

if (pattern_map.at(qk).get_target_inputs().size() > 1 ||
pattern_map.at(softmax).get_target_inputs().size() > 1) {
return false;
}
if (pattern_map.count(optional_add_mask) && (pattern_map.at(optional_add_mask).get_target_inputs().size() > 1 ||
pattern_map.at(mask).get_partial_shape().size() > 4)) {
return false;
}

Output<ov::Node> mask_value;
Output<ov::Node> mask_input;
if (pattern_map.find(optional_add_mask) != pattern_map.end()) {
mask_value = pattern_map.at(mask);
} else {
mask_value = ov::op::v0::Constant::create(q_node.get_element_type(), ov::Shape{}, std::vector<float>{0});
}

if (mask_value.get_partial_shape().size() > 4) {
return false;
}

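// Align the mask rank with Q by prepending unit dimensions so that it
// broadcasts against the 4D attention scores.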
if (mask_value.get_partial_shape().rank() == 0 || mask_value.get_partial_shape().rank() == 4) {
mask_input = mask_value;
} else {
size_t rank_diff = q_node.get_partial_shape().size() - mask_value.get_partial_shape().size();
std::vector<int64_t> axes(rank_diff);
std::iota(axes.begin(), axes.end(), 0);
mask_input = std::make_shared<ov::op::v0::Unsqueeze>(
mask_value,
ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank_diff}, axes));
}

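// The matched sub-graph applies no 1/sqrt(head_size) scaling, so an explicit
// scale of 1.0 is passed to preserve the original semantics.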
std::shared_ptr<ov::Node> scale_node =
ov::op::v0::Constant::create(q_node.get_element_type(), ov::Shape{}, std::vector<float>{1.0f});

std::shared_ptr<ov::Node> sdpa = std::make_shared<ov::op::v13::ScaledDotProductAttention>(q_node,
k_node,
v_node,
mask_input,
scale_node,
false);

sdpa->set_friendly_name(m.get_match_root()->get_friendly_name());
ov::copy_runtime_info(m.get_matched_nodes(), sdpa);
ov::replace_node(m.get_match_root(), sdpa);

return true;
};

auto m = std::make_shared<ov::pass::pattern::Matcher>(qkv, "SDPAFusion");
this->register_matcher(m, callback);
}

} // namespace pass
} // namespace ov
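
A test-style sketch of a sub-graph this matcher is designed to rewrite; the shapes and the make_sdpa_subgraph helper are illustrative, not from this commit:

#include <memory>

#include "openvino/core/model.hpp"
#include "openvino/op/matmul.hpp"
#include "openvino/op/parameter.hpp"
#include "openvino/op/softmax.hpp"
#include "openvino/pass/manager.hpp"
#include "transformations/common_optimizations/sdpa_fusion.hpp"

// Builds a 4D Q x K^T -> Softmax -> x V chain that matches the qkv pattern above.
std::shared_ptr<ov::Model> make_sdpa_subgraph() {
    const ov::PartialShape shape{1, 8, 64, 32};  // [batch, heads, seq_len, head_size]
    auto q = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, shape);
    auto k = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, shape);
    auto v = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, shape);
    // Q x K^T via MatMul(transpose_b=true); head sizes are static and equal.
    auto qk = std::make_shared<ov::op::v0::MatMul>(q, k, false, true);
    auto softmax = std::make_shared<ov::op::v8::Softmax>(qk, -1);
    auto qkv = std::make_shared<ov::op::v0::MatMul>(softmax, v, false, false);
    return std::make_shared<ov::Model>(ov::NodeVector{qkv}, ov::ParameterVector{q, k, v});
}

// After running SDPAFusion, the chain should be replaced by a single
// ScaledDotProductAttention node:
//   ov::pass::Manager manager;
//   manager.register_pass<ov::pass::SDPAFusion>();
//   manager.run_passes(make_sdpa_subgraph());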