From d85fafd8eea65c68a197c7bbcd1b9596b074320a Mon Sep 17 00:00:00 2001
From: Alexandra Sidorova
Date: Mon, 6 Jan 2025 07:00:23 +0100
Subject: [PATCH] [Snippets] Disabled non-inplace ops tokenization on 2nd inputs of MatMuls in MHA, supported transposed_b=true

[Snippets] Disabled tokenization of ops on the 2nd input of MatMul
[Snippets][Tests] Updated tokenization unit tests
[Snippets][CPU][Tests] Updated CPU functional tests
[Snippets] Updated the explanation in the tokenization pass
[Snippets][CPU] Removed check from tokenization callback
[Snippets] Updated comment in pass of tokenization
---
 .../snippets/src/pass/mha_tokenization.cpp    |  47 +----
 .../tests/src/pass/mha_tokenization.cpp       |   8 +-
 .../transformation_pipeline.cpp               |   5 +-
 .../custom/subgraph_tests/src/x64/mha.cpp     |  12 +-
 .../shared_tests_instances/snippets/mha.cpp   |  28 +--
 .../snippets/mha_fma.cpp                      |   4 +-
 .../snippets/mha_quantized.cpp                |  10 +-
 .../snippets/mha_select.cpp                   |   4 +-
 .../snippets/mha_split_dim_m.cpp              |   8 +-
 .../snippets/mha_with_dyn_mul.cpp             |   6 +-
 .../ov_snippets_models/src/subgraph_mha.cpp   | 167 +++++++++---------
 11 files changed, 131 insertions(+), 168 deletions(-)

diff --git a/src/common/snippets/src/pass/mha_tokenization.cpp b/src/common/snippets/src/pass/mha_tokenization.cpp
index beb465ab3a3fbe..c6b5045cfeee62 100644
--- a/src/common/snippets/src/pass/mha_tokenization.cpp
+++ b/src/common/snippets/src/pass/mha_tokenization.cpp
@@ -344,45 +344,6 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken
      *    Transpose3
      */
 
-    // First input branch of MatMul0 should be executed before second input branch of MatMul0,
-    // so firstly we insert Transpose1 on the beginning of ordered_ops and then Transpose0
-    // Note: If MatMul0 has transposed_b, we should tokenize only scalars ops from 1st branch
-    //       to move extracted Transpose from MatMul input to body Parameter
-    auto parent = matmul0->get_input_node_shared_ptr(1);
-    // We can support several ops between MatMul0 with transposed_b and Transpose1 with 0213 order (or without this Transpose1)
-    // only if these ops have scalar shapes on other inputs.
-    // There is transformation ExplicitTransposeMatMulInputs that set supported order and transposed_b(false).
-    // We can allow to call this pass only if ops have scalar shapes to avoid shape mismatching
-    const auto is_transposed_b_0 = matmul0->get_transpose_b();
-    bool has_matmul0_has_ops_on_input = false;
-    while (is_supported_intermediate_op(parent)) {
-        // All supported ops have only one output port
-        if (parent->get_output_target_inputs(0).size() != 1)
-            break;
-
-        // Only if MatMul0 has transposed_b, we have to tokenize scalar ops
-        // to move explicit Transpose from MatMul0 input_1 to Parameter of Subgraph body
-        if (is_transposed_b_0 && !ov::snippets::pass::ExplicitTransposeMatMulInputs::are_weights_scalar(parent)) {
-            break;
-        }
-
-        // To avoid unsupported number of non-scalar Constants in the future after FakeQuantize decomposition (plugin specific limitation)
-        // we should calculate potential number of non-scalar Constants for FakeQuantize that will be moved up from body.
-        if (const auto fq_node = ov::as_type_ptr(parent)) {
-            hidden_virtual_ports_count += ov::snippets::utils::get_non_scalar_constant_count_for_fq(fq_node);
-        }
-
-        potential_body_params_count += get_potential_body_params(parent);
-        ordered_ops.insert(ordered_ops.begin(), parent);
-        // [107731] To go always through 0-th port - is it safe?
-        parent = parent->get_input_node_shared_ptr(0);
-        has_matmul0_has_ops_on_input = true;
-    }
-    // If there are ops on second input of MatMul0 and only one unique Buffer between MatMuls - there must be one more unique Buffer
-    if (has_matmul0_has_ops_on_input && uniqie_buffer_reg_group_count < 2) {
-        uniqie_buffer_reg_group_count++;
-    }
-
     auto tokenize_transpose = [&](const std::shared_ptr& transpose, bool is_input_transposed, std::vector order, const ov::NodeVector::const_iterator& pos) {
@@ -404,11 +365,15 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken
         }
     };
 
-    const auto transpose1 = ov::as_type_ptr(parent);
+    // [160177]: Due to performance problems, we don't tokenize operations on the 2nd input of MatMuls into the Subgraph
+    // if they have to be executed explicitly (in other words, if a Buffer would have to be inserted
+    // between the Brgemm and this op sequence). The details are described in ticket 160177.
+    // Please return the tokenization of these ops when parallel loops are implemented.
     const auto transpose0 = ov::as_type_ptr(matmul0->get_input_node_shared_ptr(0));
+    const auto transpose1 = ov::as_type_ptr(matmul0->get_input_node_shared_ptr(1));
     const auto transpose2 = ov::as_type_ptr(matmul1->get_input_node_shared_ptr(1));
-    tokenize_transpose(transpose1, is_transposed_b_0, get_decomposed_transpose_order(pattern_rank), ordered_ops.begin());
     tokenize_transpose(transpose0, matmul0->get_transpose_a(), get_fusion_transpose_order(pattern_rank), ordered_ops.begin());
+    tokenize_transpose(transpose1, matmul0->get_transpose_b(), get_fusion_transpose_order(pattern_rank), ordered_ops.begin());
     tokenize_transpose(transpose2, matmul1->get_transpose_b(), get_fusion_transpose_order(pattern_rank), ordered_ops.end());
     ordered_ops.push_back(matmul1);
diff --git a/src/common/snippets/tests/src/pass/mha_tokenization.cpp b/src/common/snippets/tests/src/pass/mha_tokenization.cpp
index 382257f935cc49..dfd269bba49597 100644
--- a/src/common/snippets/tests/src/pass/mha_tokenization.cpp
+++ b/src/common/snippets/tests/src/pass/mha_tokenization.cpp
@@ -160,7 +160,7 @@ TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_Dynamic_Transpose_fusion) {
 TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA3D_SplitM) {
     const auto& f = MHASplitMFunction(std::vector{{128, 12, 64}, {128, 12, 64}, {12, 128, 128}, {128, 12, 64}},
                                       std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32}),
-                                      std::vector{{2, 64, 12, 64}, {128, 12, 1, 64}, {12, 2, 64, 128}, {1, 128, 12, 64}, {128, 12, 64}},
+                                      std::vector{{2, 64, 12, 64}, {12, 1, 64, 128}, {12, 2, 64, 128}, {1, 128, 12, 64}, {128, 12, 64}},
                                       false);
     model = f.getOriginal();
     model_ref = f.getReference();
@@ -171,7 +171,7 @@ TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA3D_SplitM_withMul) {
     const auto& f = MHASplitMFunction(std::vector{{128, 12, 64}, {128, 12, 64}, {12, 128, 128}, {128, 12, 64}},
                                       std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32}),
-                                      std::vector{{2, 64, 12, 64}, {128, 12, 1, 64}, {12, 2, 64, 128}, {1, 128, 12, 64}, {128, 12, 64}},
+                                      std::vector{{2, 64, 12, 64}, {12, 1, 64, 128}, {12, 2, 64, 128}, {1, 128, 12, 64}, {128, 12, 64}},
                                       true);
     model = f.getOriginal();
     model_ref = f.getReference();
@@ -182,7 +182,7 @@ TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA4D_SplitM) {
     const auto& f = MHASplitMFunction(std::vector{{1, 384, 16, 64}, {1, 384, 16, 64}, {1, 1, 1, 384}, {1, 384, 16, 64}},
                                       std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32}),
-                                      std::vector{{1, 6, 64, 16, 64}, {1, 384, 16, 1, 64}, {1, 1, 1, 1, 384}, {1, 1, 384, 16, 64}, {1, 384, 16, 64}},
+                                      std::vector{{1, 6, 64, 16, 64}, {1, 16, 1, 64, 384}, {1, 1, 1, 1, 384}, {1, 1, 384, 16, 64}, {1, 384, 16, 64}},
                                       false);
     model = f.getOriginal();
     model_ref = f.getReference();
@@ -193,7 +193,7 @@ TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA4D_SplitM_withMul) {
     const auto& f = MHASplitMFunction(std::vector{{1, 384, 16, 64}, {1, 384, 16, 64}, {1, 1, 1, 384}, {1, 384, 16, 64}},
                                       std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32}),
-                                      std::vector{{1, 6, 64, 16, 64}, {1, 384, 16, 1, 64}, {1, 1, 1, 1, 384}, {1, 1, 384, 16, 64}, {1, 384, 16, 64}},
+                                      std::vector{{1, 6, 64, 16, 64}, {1, 16, 1, 64, 384}, {1, 1, 1, 1, 384}, {1, 1, 384, 16, 64}, {1, 384, 16, 64}},
                                       true);
     model = f.getOriginal();
     model_ref = f.getReference();
diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
index 81eb70d328630d..0acce355e8262f 100644
--- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
+++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
@@ -1081,10 +1081,7 @@ void Transformations::MainSnippets(void) {
         // Only FP32 dynamic MHA is supported
         if (matmul->is_dynamic())
             return false;
-        // [114487] brgemm kernel in oneDNN requires brgemm_copy_b kernel if MatMul node has transposed_b=True
-        // The current solution with ExtractExplicitMatMulTranspose pass is slower for non-f32 cases than using of
-        // brgemm_copy_b kernel
-        if (matmul->get_transpose_a() || matmul->get_transpose_b())
+        if (matmul->get_transpose_a())
            return false;
        // [150842] The execution of Brgemm INT8/BF16/FP16 on AMX platforms depends on the value of "K % VNNIFactor".
        // For more details, please take a look at the ticket 150842
diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/mha.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/mha.cpp
index a94f52be91df02..b69dcb66fb2d44 100644
--- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/mha.cpp
+++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/mha.cpp
@@ -296,7 +296,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_MHA,
            ElementType::f32}),
        ::testing::ValuesIn(matMulIn0Precisions),
        ::testing::ValuesIn(patternTypes),
-       ::testing::Values(ExpectedNodes{{"Subgraph", 1}}),
+       ::testing::Values(ExpectedNodes{{"Subgraph", 2}}),  // MHA + Decomposed Transpose on input
        ::testing::Values(ov::test::utils::DEVICE_CPU)),
    MHATest::getTestCaseName);
@@ -309,7 +309,7 @@ INSTANTIATE_TEST_SUITE_P(
        std::vector{ElementType::bf16, ElementType::bf16, ElementType::bf16, ElementType::bf16}),
        ::testing::ValuesIn(matMulIn0Precisions),
        ::testing::ValuesIn(patternTypes),
-       ::testing::Values(ExpectedNodes{{"Subgraph", 1},
+       ::testing::Values(ExpectedNodes{{"Subgraph", 2},  // MHA + Decomposed Transpose on input
                                        {"Transpose", 1}}),  // Plugin disables tokenization of Transpose on output
        ::testing::Values(ov::test::utils::DEVICE_CPU)),
    MHATest::getTestCaseName);
@@ -323,7 +323,7 @@ INSTANTIATE_TEST_SUITE_P(
        std::vector{ElementType::f16, ElementType::f16, ElementType::f16, ElementType::f16}),
        ::testing::ValuesIn(matMulIn0Precisions),
        ::testing::ValuesIn(patternTypes),
-       ::testing::Values(ExpectedNodes{{"Subgraph", 1},
+       ::testing::Values(ExpectedNodes{{"Subgraph", 2},  // MHA + Decomposed Transpose on input
                                        {"Transpose", 1}}),  // Plugin disables tokenization of Transpose on output
        ::testing::Values(ov::test::utils::DEVICE_CPU)),
    MHATest::getTestCaseName);
@@ -694,7 +694,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_MHAQuant_Pattern0,
        ::testing::Values(0),
        ::testing::Values(ExpectedNodes{
            {"Subgraph", 5},     // FQs on inputs x 3 + MHA + Deq Mul
-           {"Transpose", 1}}),  // Transpose between MHA and Deq Mul
+           {"Transpose", 2}}),  // Decomposed Transpose on input + Transpose between MHA and Deq Mul
        ::testing::Values(ov::test::utils::DEVICE_CPU)),
    MHAQuantTest::getTestCaseName);
@@ -706,7 +706,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_MHAQuant_Pattern1,
        ::testing::Values(1),
        ::testing::Values(ExpectedNodes{
            {"Subgraph", 4},     // FQ on input x 2 + MHA + Deq Mul
-           {"Transpose", 1}}),  // Transpose between MHA and Deq Mul
+           {"Transpose", 2}}),  // Decomposed Transpose on input + Transpose between MHA and Deq Mul
        ::testing::Values(ov::test::utils::DEVICE_CPU)),
    MHAQuantTest::getTestCaseName);
@@ -717,7 +717,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_MHAQuant_Pattern2,
        ::testing::ValuesIn(matMulIn0PrecisionsQuant),
        ::testing::Values(2),
        ::testing::Values(ExpectedNodes{{"Subgraph", 3},    // FQ on inputs x 2 + MHA
-                                       {"Transpose", 0}}),  // Transpose is fused
+                                       {"Transpose", 1}}),  // Decomposed Transpose on input
        ::testing::Values(ov::test::utils::DEVICE_CPU)),
    MHAQuantTest::getTestCaseName);
diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp
index df0b69f99ef06d..1709fd21f988a0 100644
--- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp
+++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp
@@ -75,8 +75,8 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA_4D,
        ::testing::Values(ov::element::f32),
        ::testing::Values(false),
        ::testing::Values(MHA::default_thread_count),
-       ::testing::Values(1),
-       ::testing::Values(1),
+       ::testing::Values(2),  // decomposed Transpose + MHA
+       ::testing::Values(2),  // decomposed Transpose + MHA
        ::testing::Values(ov::test::utils::DEVICE_CPU),
        ::testing::Values(CPUTestUtils::empty_plugin_config)),
    MHA::getTestCaseName);
@@ -88,8 +88,8 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA_4D_WithScalarMul,
        ::testing::Values(ov::element::f32),
        ::testing::Values(true),
        ::testing::Values(MHA::default_thread_count),
-       ::testing::Values(1),
-       ::testing::Values(1),
+       ::testing::Values(2),  // decomposed Transpose + MHA
+       ::testing::Values(2),  // decomposed Transpose, Mul + MHA
        ::testing::Values(ov::test::utils::DEVICE_CPU),
        ::testing::Values(CPUTestUtils::empty_plugin_config)),
    MHA::getTestCaseName);
@@ -125,9 +125,9 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHABF16_4D,
        ::testing::Combine(::testing::ValuesIn(transposedShape_4D()),
            ::testing::ValuesIn(precision_bf16_if_supported(4)),
            ::testing::Values(ov::element::f32),
-           ::testing::ValuesIn({false, true}),
+           ::testing::Values(false),
            ::testing::Values(MHA::default_thread_count),
-           ::testing::Values(7),  // MHA + 5 Converts + 1 Transpose on output
+           ::testing::Values(8),  // decomposed Transpose + MHA + 5 Converts + 1 Transpose on output
            ::testing::Values(6),  // MHA + 5 Converts on inputs and output
            ::testing::Values(ov::test::utils::DEVICE_CPU),
            ::testing::Values(CPUTestUtils::empty_plugin_config)),
        MHA::getTestCaseName);
@@ -140,8 +140,8 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAEnforceBF16,
        ::testing::Values(ov::element::bf16),
        ::testing::ValuesIn({false}),
        ::testing::Values(MHA::default_thread_count),
-       ::testing::Values(7),
-       ::testing::Values(6),
+       ::testing::Values(8),  // decomposed Transpose + MHA + 5 Converts + 1 Transpose on output
+       ::testing::Values(6),  // MHA + 5 Reorders on inputs and output
        ::testing::Values(ov::test::utils::DEVICE_CPU),
        ::testing::Values(CPUTestUtils::cpu_bf16_plugin_config)),
    MHA::getTestCaseName);
@@ -153,8 +153,8 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA_FP16_4D_Without_Multiply,
        ::testing::Values(ov::element::f16),
        ::testing::ValuesIn({false}),
        ::testing::Values(MHA::default_thread_count),
+       ::testing::Values(3),
        ::testing::Values(2),
-       ::testing::Values(1),
        ::testing::Values(ov::test::utils::DEVICE_CPU),
        ::testing::Values(CPUTestUtils::empty_plugin_config)),
    MHA::getTestCaseName);
@@ -165,8 +165,8 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA_FP16_4D_With_Multiply_Static,
        ::testing::Values(ov::element::f16),
        ::testing::ValuesIn({true}),
        ::testing::Values(MHA::default_thread_count),
+       ::testing::Values(3),
        ::testing::Values(2),
-       ::testing::Values(1),
        ::testing::Values(ov::test::utils::DEVICE_CPU),
        ::testing::Values(CPUTestUtils::empty_plugin_config)),
    MHA::getTestCaseName);
@@ -178,7 +178,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA_FP16_4D_With_Multiply_Dynamic,
        ::testing::Values(ov::element::f16),
        ::testing::ValuesIn({true}),
        ::testing::Values(MHA::default_thread_count),
-       ::testing::Values(3),
+       ::testing::Values(4),
        ::testing::Values(2),
        ::testing::Values(ov::test::utils::DEVICE_CPU),
        ::testing::Values(CPUTestUtils::empty_plugin_config)),
    MHA::getTestCaseName);
@@ -191,8 +191,8 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAEnforceFP16_Without_Multiply,
        ::testing::Values(ov::element::f16),
        ::testing::ValuesIn({false}),
        ::testing::Values(MHA::default_thread_count),
+       ::testing::Values(3),
        ::testing::Values(2),
-       ::testing::Values(1),
        ::testing::Values(ov::test::utils::DEVICE_CPU),
        ::testing::Values(CPUTestUtils::cpu_f16_plugin_config)),
    MHA::getTestCaseName);
@@ -203,8 +203,8 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAEnforceFP16_With_Multiply_Static,
        ::testing::Values(ov::element::f16),
        ::testing::ValuesIn({true}),
        ::testing::Values(MHA::default_thread_count),
+       ::testing::Values(3),
        ::testing::Values(2),
-       ::testing::Values(1),
        ::testing::Values(ov::test::utils::DEVICE_CPU),
        ::testing::Values(CPUTestUtils::cpu_f16_plugin_config)),
    MHA::getTestCaseName);
@@ -215,7 +215,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAEnforceFP16_With_Multiply_Dynamic,
        ::testing::Values(ov::element::f16),
        ::testing::ValuesIn({true}),
        ::testing::Values(MHA::default_thread_count),
-       ::testing::Values(3),
+       ::testing::Values(4),
        ::testing::Values(2),
        ::testing::Values(ov::test::utils::DEVICE_CPU),
        ::testing::Values(CPUTestUtils::cpu_f16_plugin_config)),
diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_fma.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_fma.cpp
index 4bf35e2daa690d..f9bc640160a67c 100644
--- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_fma.cpp
+++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_fma.cpp
@@ -21,8 +21,8 @@ INSTANTIATE_TEST_SUITE_P(
        ::testing::Values(ov::element::f32),
        ::testing::ValuesIn({false}),  // Need to support True for graph builder in tests
        ::testing::Values(MHA::default_thread_count),
-       ::testing::Values(1),
-       ::testing::Values(1),
+       ::testing::Values(2),  // Subgraph with MHA + Subgraph with Transpose1
+       ::testing::Values(2),  // Subgraph with MHA + Subgraph with Transpose1
        ::testing::Values(ov::test::utils::DEVICE_CPU),
        ::testing::Values(CPUTestUtils::empty_plugin_config)),
    MHA::getTestCaseName);
diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_quantized.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_quantized.cpp
index 0c731b74565863..38806dff765833 100644
--- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_quantized.cpp
+++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_quantized.cpp
@@ -48,7 +48,7 @@ INSTANTIATE_TEST_SUITE_P(
        ::testing::Values(ov::element::f32),
        ::testing::Values(false),  // The graph doesn't contain Multiply
        ::testing::Values(MHA::default_thread_count),
-       ::testing::Values(6),  // FQx3 on inputs + MHA + Transpose on output + Deq Mul
+       ::testing::Values(7),  // FQx3, Transpose1 on inputs + MHA + Transpose on output + Deq Mul
        ::testing::Values(5),  // FQx3 on inputs + MHA + Deq Mul
        ::testing::Values(ov::test::utils::DEVICE_CPU),
        ::testing::Values(CPUTestUtils::empty_plugin_config)),
    MHA::getTestCaseName);
@@ -63,7 +63,7 @@ INSTANTIATE_TEST_SUITE_P(
        ::testing::Values(ov::element::f32),
        ::testing::Values(false),  // The graph doesn't contain Multiply
        ::testing::Values(MHA::default_thread_count),
-       ::testing::Values(5),  // FQx2 on inputs + MHA + Transpose on output + Deq Mul
+       ::testing::Values(6),  // FQx2, Transpose1 on inputs + MHA + Transpose on output + Deq Mul
        ::testing::Values(4),  // FQx2 on inputs + MHA + Deq Mul
        ::testing::Values(ov::test::utils::DEVICE_CPU),
        ::testing::Values(CPUTestUtils::empty_plugin_config)),
    MHA::getTestCaseName);
@@ -77,8 +77,8 @@ INSTANTIATE_TEST_SUITE_P(
        ::testing::Values(ov::element::f32),
        ::testing::Values(false),  // The graph doesn't contain Multiply
        ::testing::Values(MHA::default_thread_count),
-       ::testing::Values(3),  // MHA + Transpose on output + Deq Mul
-       ::testing::Values(2),  // MHA + Deq Mul
+       ::testing::Values(4),  // Transpose1 + MHA + Transpose on output + Deq Mul
+       ::testing::Values(3),  // Transpose1 + MHA + Deq Mul
        ::testing::Values(ov::test::utils::DEVICE_CPU),
        ::testing::Values(CPUTestUtils::empty_plugin_config)),
    MHA::getTestCaseName);
@@ -91,7 +91,7 @@ INSTANTIATE_TEST_SUITE_P(
        ::testing::Values(ov::element::f32),
        ::testing::Values(false),  // The graph doesn't contain Multiply
        ::testing::Values(MHA::default_thread_count),
-       ::testing::Values(7),  // Transposex2 + Subgraphsx5
+       ::testing::Values(8),  // Transposex3 + Subgraphsx5
        ::testing::Values(5),  // MHA + Deq Mul on output + Deqs on inputs + 2 xFQ on inputs
        ::testing::Values(ov::test::utils::DEVICE_CPU),
        ::testing::Values(CPUTestUtils::empty_plugin_config)),
    MHA::getTestCaseName);
diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_select.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_select.cpp
index 3fc1417d20b102..cc438301101811 100644
--- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_select.cpp
+++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_select.cpp
@@ -29,8 +29,8 @@ INSTANTIATE_TEST_SUITE_P(
        ::testing::Values(ov::element::f32),
        ::testing::Values(false),  // Need to support True for graph builder in tests
        ::testing::Values(MHA::default_thread_count),
-       ::testing::Values(2),  // Less + MHA
-       ::testing::Values(2),
+       ::testing::Values(3),  // Transpose1 + Less + MHA
+       ::testing::Values(3),  // Transpose1 + Less + MHA
        ::testing::Values(ov::test::utils::DEVICE_CPU),
        ::testing::Values(CPUTestUtils::empty_plugin_config)),
    MHA::getTestCaseName);
diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_split_dim_m.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_split_dim_m.cpp
index bb5f7fe2fa5b52..d3598ebba1ac1f 100644
--- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_split_dim_m.cpp
+++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_split_dim_m.cpp
@@ -24,8 +24,8 @@ INSTANTIATE_TEST_SUITE_P(
        ::testing::Values(ov::element::f32),
        ::testing::Values(true),
        ::testing::Values(4),  // 4 Threads
-       ::testing::Values(6),  // Subgraph + 4 Reshapes on inputs and 1 Reshape on output
-       ::testing::Values(1),
+       ::testing::Values(7),  // Subgraph + 4 Reshapes, Transpose1 on inputs and 1 Reshape on output
+       ::testing::Values(2),
        ::testing::Values(ov::test::utils::DEVICE_CPU),
        ::testing::Values(enable_callback())),
    MHA::getTestCaseName);
@@ -80,8 +80,8 @@ INSTANTIATE_TEST_SUITE_P(
        ::testing::Values(ov::element::f32),
        ::testing::Values(false),
        ::testing::Values(4),  // 4 Threads
-       ::testing::Values(1),
-       ::testing::Values(1),
+       ::testing::Values(2),  // Transpose1 + MHA
+       ::testing::Values(2),  // Transpose1 + MHA
        ::testing::Values(ov::test::utils::DEVICE_CPU),
        ::testing::Values(CPUTestUtils::empty_plugin_config)),
    MHA::getTestCaseName);
diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_with_dyn_mul.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_with_dyn_mul.cpp
index 7876d737af2281..9a9e56621b10a6 100644
--- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_with_dyn_mul.cpp
+++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_with_dyn_mul.cpp
@@ -43,8 +43,8 @@ INSTANTIATE_TEST_SUITE_P(
        ::testing::ValuesIn(precision_f32(5)),
        ::testing::Values(ov::element::f32),
        ::testing::Values(MHA::default_thread_count),
-       ::testing::Values(1),
-       ::testing::Values(1),
+       ::testing::Values(2),  // Transpose1 + MHA
+       ::testing::Values(2),  // Transpose1 + MHA
        ::testing::Values(ov::test::utils::DEVICE_CPU),
        ::testing::Values(CPUTestUtils::empty_plugin_config)),
    MHAWithDynamicMul::getTestCaseName);
@@ -56,7 +56,7 @@ INSTANTIATE_TEST_SUITE_P(
        ::testing::ValuesIn(precision_f32(5)),
        ::testing::Values(ov::element::bf16),
        ::testing::Values(MHA::default_thread_count),
-       ::testing::Values(8),  // MHA + 1 Transpose on output + 6 Converts around
+       ::testing::Values(9),  // Transpose1 + MHA + 1 Transpose on output + 6 Converts around
        ::testing::Values(7),  // MHA + 6 Converts around
        ::testing::Values(ov::test::utils::DEVICE_CPU),
        ::testing::Values(CPUTestUtils::empty_plugin_config)),
diff --git a/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp b/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp
index 5f854326a47217..eb0dfaa8710fa8 100644
--- a/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp
+++ b/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp
@@ -110,46 +110,43 @@ std::shared_ptr MHAFunction::initReference() const {
     auto data1 = std::make_shared(precisions[1], input_shapes[1]);
     auto data2 = std::make_shared(precisions[2], input_shapes[2]);
     auto data3 = std::make_shared(precisions[3], input_shapes[3]);
-    ov::ParameterVector ngraphParams = {data0, data1, data2, data3};
-    NodeVector subgraph_inputs = {data0, data1, data2, data3};
-
-    auto transpose0Param = std::make_shared(precisions[0], input_shapes[0]);
-    auto transpose1Param = std::make_shared(precisions[1], input_shapes[1]);
-    auto addParam = std::make_shared(precisions[2], input_shapes[2]);
-    auto transpose2Param = std::make_shared(precisions[3], input_shapes[3]);
-    ov::ParameterVector subgraph_params = {transpose0Param, transpose1Param, addParam, transpose2Param};
+    ov::ParameterVector ngraphParams = {data0, data1, data2, data3};
 
     const auto rank = input_shapes[0].size();
     const auto fusion_order = get_fusion_order(rank);
     const auto decomposed_order = get_decomposed_order(rank);
 
-    const auto transpose0Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order);
     const auto transpose1Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, decomposed_order);
-    const auto transpose2Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order);
-    const auto transpose3Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order);
+    const auto transpose1 = std::make_shared(data1, transpose1Const);
 
-    const auto transpose0 = std::make_shared(transpose0Param, transpose0Const);
-    const auto transpose1 = std::make_shared(transpose1Param, transpose1Const);
-    std::shared_ptr matmul_parent1 = transpose1;
+    std::shared_ptr subgraph_parent1 = transpose1;
     if (with_mul) {
         ov::Shape shape(rank, 1);
         if (transpose1->get_output_partial_shape(0).is_static()) {
             shape[rank - 3] = transpose1->get_output_shape(0)[rank - 3];
         }
-        const auto mulConst = ov::test::utils::make_constant(precisions[1], shape);
-        if (ov::shape_size(shape) > 1) {
-            const auto mulParam = std::make_shared(precisions[1], mulConst->get_shape());
-            matmul_parent1 = std::make_shared(transpose1, mulParam);
-            subgraph_params = {transpose0Param, transpose1Param, mulParam, addParam, transpose2Param};
-            subgraph_inputs = {data0, data1, mulConst, data2, data3};
-        } else {
-            matmul_parent1 = std::make_shared(transpose1, mulConst);
-        }
+        const auto mulConst = ov::test::utils::make_constant(precisions[1], shape);
+        subgraph_parent1 = std::make_shared(transpose1, mulConst);
     }
-    const auto matMul0 = std::make_shared(transpose0, matmul_parent1);
+    NodeVector subgraph_inputs = {data0, subgraph_parent1, data2, data3};
+
+    auto transpose0Param = std::make_shared(precisions[0], input_shapes[0]);
+    auto brgemm1Param = std::make_shared(subgraph_parent1->get_element_type(), subgraph_parent1->get_output_partial_shape(0));
+    auto addParam = std::make_shared(precisions[2], input_shapes[2]);
+    auto transpose2Param = std::make_shared(precisions[3], input_shapes[3]);
+    ov::ParameterVector subgraph_params = {transpose0Param, brgemm1Param, addParam, transpose2Param};
+
+    const auto transpose0Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order);
+    const auto transpose2Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order);
+    const auto transpose3Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order);
+
+    const auto transpose0 = std::make_shared(transpose0Param, transpose0Const);
+
+    const auto matMul0 = std::make_shared(transpose0, brgemm1Param);
     const auto add = std::make_shared(matMul0, addParam);
     const auto softMax = std::make_shared(add, rank - 1);
     const auto transpose2 = std::make_shared(transpose2Param, transpose2Const);
@@ -168,55 +165,45 @@ std::shared_ptr MHASplitMFunction::initReference() const {
     auto data3 = std::make_shared(precisions[3], input_shapes[3]);
     ov::ParameterVector ngraphParams = {data0, data1, data2, data3};
 
+    const auto rank_before = input_shapes[1].size();
+    const auto transpose1Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank_before}, get_decomposed_order(rank_before));
+    const auto transpose1 = std::make_shared(data1, transpose1Const);
+
+    std::shared_ptr subgraph_parent1 = transpose1;
+    if (with_mul) {
+        ov::Shape shape(rank_before, 1);
+        if (transpose1->get_output_partial_shape(0).is_static()) {
+            shape[rank_before - 3] = transpose1->get_output_shape(0)[rank_before - 3];
+        }
+        const auto mulConst = ov::test::utils::make_constant(precisions[1], shape);
+        subgraph_parent1 = std::make_shared(transpose1, mulConst);
+    }
+
     auto make_reshape = [](const std::shared_ptr& node, const ov::Shape& new_shape) {
         auto shape_const = ov::op::v0::Constant::create(ov::element::i32, {new_shape.size()}, new_shape);
         return std::make_shared(node, shape_const, true);
     };
 
     auto reshape0 = make_reshape(data0, reshapes[0]);
-    auto reshape1 = make_reshape(data1, reshapes[1]);
+    auto reshape1 = make_reshape(subgraph_parent1, reshapes[1]);
     auto reshape2 = make_reshape(data2, reshapes[2]);
     auto reshape3 = make_reshape(data3, reshapes[3]);
     NodeVector subgraph_inputs = {reshape0, reshape1, reshape2, reshape3};
 
     auto transpose0Param = std::make_shared(precisions[0], reshape0->get_shape());
-    auto transpose1Param = std::make_shared(precisions[1], reshape1->get_shape());
+    auto brgemm1Param = std::make_shared(precisions[1], reshape1->get_shape());
     auto addParam = std::make_shared(precisions[2], reshape2->get_shape());
     auto transpose2Param = std::make_shared(precisions[3], reshape3->get_shape());
-    ov::ParameterVector subgraph_params = {transpose0Param, transpose1Param, addParam, transpose2Param};
+    ov::ParameterVector subgraph_params = {transpose0Param, brgemm1Param, addParam, transpose2Param};
 
     const auto rank = input_shapes[0].size() + 1;
     const auto transpose0Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, get_fusion_order_after_split_m(rank, true));
-    const auto transpose1Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, get_decomposed_order_after_split_m(rank));
     const auto transpose2Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, get_fusion_order_after_split_m(rank, true));
     const auto transpose3Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, get_fusion_order_after_split_m(rank, false));
 
     const auto transpose0 = std::make_shared(transpose0Param, transpose0Const);
-    const auto transpose1 = std::make_shared(transpose1Param, transpose1Const);
-
-    std::shared_ptr matmul_parent1 = transpose1;
-    if (with_mul) {
-        ov::Shape shape(rank - 1, 1);
-        if (transpose1->get_output_partial_shape(0).is_static()) {
-            shape[rank - 4] = transpose1->get_output_shape(0)[rank - 4];
-        }
-        const auto mulConst = ov::test::utils::make_constant(precisions[1], shape);
-
-        if (ov::shape_size(shape) > 1) {
-            ov::Shape reshape_shape = shape;
-            reshape_shape.insert(reshape_shape.cbegin() + (rank - 3), 1);
-            const auto mulReshape = make_reshape(mulConst, reshape_shape);
-            const auto mulParam = std::make_shared(precisions[1], mulReshape->get_shape());
-            matmul_parent1 = std::make_shared(transpose1, mulParam);
-            subgraph_params = {transpose0Param, transpose1Param, mulParam, addParam, transpose2Param};
-            subgraph_inputs = {reshape0, reshape1, mulReshape, reshape2, reshape3};
-        } else {
-            matmul_parent1 = std::make_shared(transpose1, mulConst);
-        }
-    }
-    const auto matMul0 = std::make_shared(transpose0, matmul_parent1);
+    const auto matMul0 = std::make_shared(transpose0, brgemm1Param);
     const auto add = std::make_shared(matMul0, addParam);
     const auto softMax = std::make_shared(add, rank - 1);
     const auto transpose2 = std::make_shared(transpose2Param, transpose2Const);
@@ -318,30 +305,36 @@ std::shared_ptr MHAMatMul0TransposeFunction::initReference() const {
     auto data2 = std::make_shared(precisions[2], input_shapes[2]);
     auto data3 = std::make_shared(precisions[3], input_shapes[3]);
     ov::ParameterVector ngraphParams = {data0, data1, data2, data3};
-    NodeVector subgraph_inputs = {data0, data1, data2, data3};
+
+    const auto rank = input_shapes[0].size();
+    const auto fusion_order = get_fusion_order(rank);
+    const auto decomposed_order = get_decomposed_order(rank);
+    std::vector transposed_b_order(rank);
+    std::iota(transposed_b_order.begin(), transposed_b_order.end(), 0);
+    std::swap(transposed_b_order[rank - 1], transposed_b_order[rank - 2]);
+
+    const auto transpose1Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order);
+    const auto transpose1 = std::make_shared(data1, transpose1Const);
+    const auto mulConst = ov::test::utils::make_constant(precisions[1], ov::Shape{1});
+    const auto mul = std::make_shared(transpose1, mulConst);
+    const auto transposeBConst = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{rank}, transposed_b_order);
+    const auto transposeB = std::make_shared(mul, transposeBConst);
+
+    NodeVector subgraph_inputs = {data0, transposeB, data2, data3};
 
     auto transpose0Param = std::make_shared(precisions[0], input_shapes[0]);
-    auto transpose1Param = std::make_shared(precisions[1], input_shapes[1]);
+    auto brgemm1Param = std::make_shared(transposeB->get_element_type(), transposeB->get_output_partial_shape(0));
    auto addParam = std::make_shared(precisions[2], input_shapes[2]);
    auto transpose2Param = std::make_shared(precisions[3], input_shapes[3]);
-    ov::ParameterVector subgraph_params = {transpose0Param, transpose1Param, addParam, transpose2Param};
-
-    const auto rank = input_shapes[0].size();
-    const auto fusion_order = get_fusion_order(rank);
-    const auto decomposed_order = get_decomposed_order(rank);
+    ov::ParameterVector subgraph_params = {transpose0Param, brgemm1Param, addParam, transpose2Param};
 
     const auto transpose0Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order);
-    const auto transpose1Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, decomposed_order);
     const auto transpose2Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order);
     const auto transpose3Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order);
 
     const auto transpose0 = std::make_shared(transpose0Param, transpose0Const);
-    const auto transpose1 = std::make_shared(transpose1Param, transpose1Const);
-
-    const auto mulConst = ov::test::utils::make_constant(precisions[1], ov::Shape{1});
-    const auto mul = std::make_shared(transpose1, mulConst);
-    const auto matMul0 = std::make_shared(transpose0, mul);
+    const auto matMul0 = std::make_shared(transpose0, brgemm1Param);
     const auto add = std::make_shared(matMul0, addParam);
     const auto softMax = std::make_shared(add, rank - 1);
     const auto transpose2 = std::make_shared(transpose2Param, transpose2Const);
@@ -818,29 +811,33 @@ std::shared_ptr MHAINT8MatMulTypeRelaxedFunction::initReference() con
     const auto fq0 = ov::builder::subgraph::makeFakeQuantizeTypeRelaxed(data0, ov::element::f32, fq_signed_params);
     const auto fq1 = ov::builder::subgraph::makeFakeQuantizeTypeRelaxed(data1, ov::element::f32, fq_signed_params);
     const auto fq2 = ov::builder::subgraph::makeFakeQuantizeTypeRelaxed(data3, ov::element::f32, fq_signed_params);
-    NodeVector subgraph_inputs = {fq0, fq1, data2, fq2};
-
-    auto transpose0Param = std::make_shared(precision, input_shapes[0]);
-    auto transpose1Param = std::make_shared(precision, input_shapes[1]);
-    auto addParam = std::make_shared(precision, input_shapes[2]);
-    auto transpose2Param = std::make_shared(precision, input_shapes[3]);
-    ov::ParameterVector subgraph_params = {transpose0Param, transpose1Param, addParam, transpose2Param};
+    const auto rank = input_shapes[0].get_shape().size();
+    const auto fusion_order = get_fusion_order(rank);
+    const auto decomposed_order = get_decomposed_order(rank);
+    const auto transpose1Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, decomposed_order);
+    const auto transpose1 = std::make_shared(fq1, transpose1Const);
 
-    const auto shape_rank = input_shapes[0].get_shape().size();
-    auto transpose0Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 2, 1, 3});
-    auto transpose1Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 2, 3, 1});
-    auto transpose2Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 2, 1, 3});
-    auto transpose3Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 2, 1, 3});
+    NodeVector subgraph_inputs = {fq0, transpose1, data2, fq2};
+
+    const auto transpose0Param = std::make_shared(precision, input_shapes[0]);
+    const auto brgemm1Param = std::make_shared(transpose1->get_element_type(), transpose1->get_output_partial_shape(0));
+    const auto addParam = std::make_shared(precision, input_shapes[2]);
+    const auto transpose2Param = std::make_shared(precision, input_shapes[3]);
+    ov::ParameterVector subgraph_params = {transpose0Param, brgemm1Param, addParam, transpose2Param};
+
+    const auto transpose0Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order);
+    const auto transpose2Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order);
+    const auto transpose3Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order);
 
     bool transA = false;
     bool transB = false;
     const auto transpose0 = std::make_shared(transpose0Param, transpose0Const);
-    const auto transpose1 = std::make_shared(transpose1Param, transpose1Const);
     const auto matMul0 = std::make_shared>(
         std::vector{ element::f32, element::f32 },
         std::vector{ element::f32 },
         ov::op::TemporaryReplaceOutputType(transpose0, element::f32).get(),
-        ov::op::TemporaryReplaceOutputType(transpose1, element::f32).get(), transA, transB);
+        ov::op::TemporaryReplaceOutputType(brgemm1Param, element::f32).get(), transA, transB);
 
     auto decomposed_fq = [](const ov::Output& input, const ov::element::Type& out_precision, float il, float ih, float scale) {
@@ -941,8 +938,8 @@ std::shared_ptr MHATransposedInputFunction::initReference() const {
     const auto data2 = std::make_shared(precision, input_shapes[2]);
     ov::ParameterVector ngraphParam = {data0, data1, data2};
 
-    bool is_supported = ((m_transposed_b && m_order == std::vector{0, 2, 1, 3}) ||
-                         (!m_transposed_b && m_order == std::vector{0, 2, 3, 1}));
+    bool is_supported = ((m_transposed_b && m_order == std::vector{0, 2, 3, 1}) ||
+                         (!m_transposed_b && m_order == std::vector{0, 2, 1, 3}));
 
     std::shared_ptr in1 = data1;
     if (!m_order.empty() && !is_supported) {
@@ -963,11 +960,16 @@ std::shared_ptr MHATransposedInputFunction::initReference() const {
     const auto param0 = std::make_shared(precision, data0->get_output_partial_shape(0));
     const auto param1 = std::make_shared(precision, in1->get_output_partial_shape(0));
     const auto param2 = std::make_shared(precision, data2->get_output_partial_shape(0));
+    ov::ParameterVector subgraph_params = {param0, param1, param2};
+    ov::OutputVector subgraphs_inputs = {data0, in1, data2};
 
     std::shared_ptr matmul0_in1 = param1;
     if (!m_order.empty() && is_supported) {
-        const auto transposeConst = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{m_order.size()}, m_order);
+        const auto transposeConst = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{m_order.size()}, m_order);
         matmul0_in1 = std::make_shared(param1, transposeConst);
+
+        std::swap(subgraphs_inputs[0], subgraphs_inputs[1]);
+        std::swap(subgraph_params[0], subgraph_params[1]);
     }
 
     const bool mm0_transpose_b = m_transposed_b && m_transpose_b_native_support;
@@ -975,8 +977,7 @@ std::shared_ptr MHATransposedInputFunction::initReference() const {
     const auto softmax = std::make_shared(matMul0, -1);
     const auto matMul1 = std::make_shared(softmax, param2);
 
-    auto subgraph = std::make_shared(ov::NodeVector{data0, in1, data2},
-                                     std::make_shared(NodeVector{matMul1}, ov::ParameterVector{param0, param1, param2}));
+    auto subgraph = std::make_shared(subgraphs_inputs, std::make_shared(NodeVector{matMul1}, subgraph_params));
 
     ov::ResultVector results{std::make_shared(subgraph)};
     return std::make_shared(results, ngraphParam, "mha");