From 83622095d586b6c34bbab1bf385a3c8dd29604b8 Mon Sep 17 00:00:00 2001 From: fishbell Date: Tue, 24 Dec 2024 15:20:08 +0800 Subject: [PATCH] optimize code Signed-off-by: fishbell --- .../prepare_primitive_fusing.cpp | 1 + .../graph_optimizer/reshape_transfer.cpp | 94 ++++++++----------- .../unit/passes/reorder_reshape_permute.cpp | 91 ++++++++++-------- 3 files changed, 95 insertions(+), 91 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp index 96bb1a65da7279..93f0905b3a1ef7 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp @@ -137,6 +137,7 @@ void prepare_primitive_fusing::fuse_reorders(program &p) { program_helpers::do_for_types(*node, [&p](reorder_node& node) { auto& input = node.input(); + // Restrictions: // - inputs cannot be padded // - primitives input cannot be output diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/reshape_transfer.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/reshape_transfer.cpp index 23b6a2eae092fb..771fb7057a3283 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/reshape_transfer.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/reshape_transfer.cpp @@ -16,7 +16,8 @@ void reshape_transfer::run(program& p) { // (reorder) + reshape + transpose // sink reshape for further possible optimization auto is_suitable_permute = [](cldnn::program_node* node) { - return node->get_users().size() == 1 && node->is_dynamic() == false; + return node->get_users().size() == 1 && node->is_dynamic() == false && + node->get_output_layout().get_rank() == 4; }; auto is_suitable_reshape = [](cldnn::program_node* node) -> bool { @@ -28,8 +29,8 @@ void reshape_transfer::run(program& p) { return true; return false; }; - std::function is_suitable_reorder; + std::function is_suitable_reorder; is_suitable_reorder = [&is_suitable_reorder](const cldnn::program_node* node) -> bool { if (node->get_users().size() != 1 || node->is_dynamic()) return false; @@ -58,51 +59,32 @@ void reshape_transfer::run(program& p) { // updated order must be (0,2,3,1): // dim with index=2 is split into 2 parts: 2 and 3 const auto& reshape_in_shape = reshape->get_input_layout().get_dims(); - const auto& reshape_out_dim = reshape->get_output_layout().get_dims(); - auto reshape_out_shape = reshape_out_dim; + const auto& reshape_out_shape = reshape->get_output_layout().get_dims(); auto transformed_order = original_order; ov::Shape new_shape(transformed_order.size()); - if (original_order.size() < reshape_out_dim.size() && reshape_out_dim.size() == 4) { - // if order dims is less than reshape dims, means reshape shape has been converted to upper dims some time - // before merge spatial dims - reshape_out_shape.resize(original_order.size()); - for (size_t i = 0; i < reshape_out_dim.size(); ++i) { - if (i < 2) { - reshape_out_shape[i] = reshape_out_dim[i]; - } else { - reshape_out_shape[2] *= reshape_out_dim[i]; - } + const size_t merge_dim_idx = [&]() { + for (size_t i = 0; i < reshape_in_shape.size(); ++i) { + if (reshape_in_shape[i] != reshape_out_shape[i]) + return i; } - const size_t merge_dim_idx = [&]() { - for (size_t i = 0; i < reshape_in_shape.size(); ++i) { - if (reshape_in_shape[i] != reshape_out_shape[i]) - return i; - } - OPENVINO_THROW("merged_dim_idx can not be found"); - }(); - auto insertIt = transformed_order.end(); - for (auto it = transformed_order.begin(); it != transformed_order.end(); ++it) { - auto& elem = *it; - if (elem > merge_dim_idx) { - elem++; - } else if (elem == merge_dim_idx) { - insertIt = it + 1; - } + OPENVINO_THROW("same input/output for reshape node"); + }(); + auto insertIt = transformed_order.end(); + for (auto it = transformed_order.begin(); it != transformed_order.end(); ++it) { + auto& elem = *it; + if (elem > merge_dim_idx) { + elem++; + } else if (elem == merge_dim_idx) { + insertIt = it + 1; } - transformed_order.insert(insertIt, merge_dim_idx + 1); - } else { - auto reorder_orders = [](std::vector& order, std::vector place_order) { - // for all elements to put in place - for (size_t i = 0; i < order.size() - 1; ++i) { - while (i != place_order[i]) { - // swap it with the element at its final place - auto alt = place_order[i]; - std::swap(order[i], order[alt]); - std::swap(place_order[i], place_order[alt]); - } - } - }; - reorder_orders(transformed_order, std::vector({0, 1, 3, 2})); + } + transformed_order.insert(insertIt, merge_dim_idx + 1); + // remove invalid orders + if (transformed_order.size() > reshape_out_shape.size()) { + transformed_order.erase( + std::remove_if(transformed_order.begin(), transformed_order.end(), [&](uint16_t& order) { + return order >= reshape_out_shape.size(); + })); } return transformed_order; }; @@ -136,20 +118,24 @@ void reshape_transfer::run(program& p) { reshape_node = &(inter_node->as()); auto transpose_order = update_order(transpose_node.get_permute_order(), reshape_node); - auto next_node = transpose_node.get_users().front(); - auto new_reshape_tensor = transpose_node.get_output_layout().get_tensor(); - p.move_node(*reshape_node, *node, *next_node); - // replace the permute node and reshape node auto new_permute = std::make_shared(transpose_node.id() + "_reordered", parent_node->id(), transpose_order); auto& new_permute_node = p.get_or_create(new_permute); - auto new_reshape = - std::make_shared(reshape_node->id() + "_sinked", new_permute_node.id(), new_reshape_tensor); - auto& new_reshape_node = p.get_or_create(new_reshape); + if (new_permute_node.as().is_rotating_except_batch()) { + auto next_node = transpose_node.get_users().front(); + auto new_reshape_tensor = transpose_node.get_output_layout().get_tensor(); + p.move_node(*reshape_node, *node, *next_node); + // replace the permute node and reshape node + auto new_reshape = + std::make_shared(reshape_node->id() + "_sinked", new_permute_node.id(), new_reshape_tensor); + auto& new_reshape_node = p.get_or_create(new_reshape); - p.replace(transpose_node, new_permute_node); - p.replace(*reshape_node, new_reshape_node); - new_permute_node.recalc_output_layout(false); - new_reshape_node.recalc_output_layout(false); + p.replace(transpose_node, new_permute_node); + p.replace(*reshape_node, new_reshape_node); + new_permute_node.recalc_output_layout(false); + new_reshape_node.recalc_output_layout(false); + } else { + p.remove_if_dangling(new_permute_node); + } } } diff --git a/src/plugins/intel_gpu/tests/unit/passes/reorder_reshape_permute.cpp b/src/plugins/intel_gpu/tests/unit/passes/reorder_reshape_permute.cpp index 50260af60cc3c7..edd0b9205f90dc 100644 --- a/src/plugins/intel_gpu/tests/unit/passes/reorder_reshape_permute.cpp +++ b/src/plugins/intel_gpu/tests/unit/passes/reorder_reshape_permute.cpp @@ -12,7 +12,7 @@ using namespace cldnn; using namespace ::tests; -TEST(merge_reorder_reshape_permute, no_reshape) { +TEST(opt_reorder_reshape_permute, no_reshape) { auto& engine = get_test_engine(); auto in_layout = layout{ov::PartialShape({1, 2, 4, 6}), data_types::f16, format::bfyx}; auto input = engine.allocate_memory(layout{ov::PartialShape({1, 2, 4, 6}), data_types::f16, format::bfyx}); @@ -60,7 +60,6 @@ TEST(merge_reorder_reshape_permute, no_reshape) { ref_network.set_input_data("input", input); auto ref_output = ref_network.execute(); - auto ref_out_mem = ref_output.at("softmax").get_memory(); mem_lock lock_ref(ref_out_mem, get_test_stream()); for (size_t i = 0; i < out_mem->count(); i++) { @@ -69,33 +68,25 @@ TEST(merge_reorder_reshape_permute, no_reshape) { } } -// output in byxf layout, check further.... -/*TEST(merge_reorder_reshape_permute, no_reorder) { +TEST(opt_reorder_reshape_permute, no_reorder_no_reshape) { auto& engine = get_test_engine(); auto in_layout = layout{ov::PartialShape({1, 2, 4, 6}), data_types::f16, format::bfyx}; auto input = engine.allocate_memory(layout{ov::PartialShape({1, 2, 4, 6}), data_types::f16, format::bfyx}); auto weight = engine.allocate_memory(layout{ov::PartialShape({3, 2, 1, 1}), data_types::f16, format::bfyx}); + set_values(input, {2.0f, 3.0f, 4.0f, 4.0f, 3.0f, 2.0f, 1.f, 2.f, 3.f, 1.f, 2.f, 4.f, + 5.f, 1.f, 1.f, 2.f, 1.f, 2.f, 2.0f, 3.0f, 1.0f, 4.0f, 1.0f, 4.0f, + 3.0f, 2.0f, 0.0f, 1.0f, 0.0f, 2.0f, 2.f, 4.f, 1.f, 1.f, 2.f, 1.f, + 1.f, 2.f, 0.f, 2.f, 5.f, 2.f, 4.0f, 3.0f, 1.0f, 0.0f, 3.0f, 2.0f}); - set_values(input, { 2.0f, 3.0f, 4.0f, 4.0f, 3.0f, 2.0f, - 1.f, 2.f, 3.f, 1.f, 2.f, 4.f, - 5.f, 1.f, 1.f, 2.f, 1.f, 2.f, - 2.0f, 3.0f, 1.0f, 4.0f, 1.0f, 4.0f, - 3.0f, 2.0f, 0.0f, 1.0f, 0.0f, 2.0f, - 2.f, 4.f, 1.f, 1.f, 2.f, 1.f, - 1.f, 2.f, 0.f, 2.f, 5.f, 2.f, - 4.0f, 3.0f, 1.0f, 0.0f, 3.0f, 2.0f}); - - set_values(weight, { 1.f, 1.f, 1.f, 1.f, 1.f, 1.f}); + set_values(weight, {1.f, 1.f, 1.f, 1.f, 1.f, 1.f}); topology topology; topology.add(input_layout("input", in_layout)); topology.add(data("weight", weight)); - topology.add(convolution("convolution", input_info("input"), "weight", "", 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, -false)); topology.add(reshape("reshape_inter", input_info("convolution"), false, {1, 3, 24}, ov::PartialShape{1, 3, -24})); topology.add(permute("permute_inter", input_info("reshape_inter"), {0, 2, 1})); topology.add(softmax("softmax", -input_info("permute_inter"), 1)); ExecutionConfig config_test = get_test_default_config(engine); - ov::intel_gpu::ImplementationDesc softmax_impl_test = { format::bfyx, "", impl_types::ocl }; - config_test.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ { "softmax_inter", -softmax_impl_test } })); ExecutionConfig config = get_test_default_config(engine); + topology.add( + convolution("convolution", input_info("input"), "weight", "", 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, false)); + topology.add(permute("permute_inter", input_info("convolution"), {0, 2, 3, 1})); + topology.add(softmax("softmax", input_info("permute_inter"), 1)); + ExecutionConfig config = get_test_default_config(engine); config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); config.set_property(ov::intel_gpu::optimize_data(true)); auto prog = program::build_program(engine, topology, config); @@ -104,18 +95,27 @@ softmax_impl_test } })); ExecutionConfig config = get_test_default_config(engine net.set_input_data("input", input); auto output = net.execute(); + + ExecutionConfig ref_config = get_test_default_config(engine); + ref_config.set_property(ov::intel_gpu::optimize_data(false)); + cldnn::network ref_network(engine, topology, ref_config); + // select preferred formats, conv + permute auto permute_inst = net.get_primitive("permute_inter"); - //ASSERT_TRUE(permute_inst->can_be_optimized()); + ASSERT_TRUE(permute_inst->can_be_optimized()); auto out_mem = output.at("softmax").get_memory(); mem_lock lock(out_mem, get_test_stream()); + ref_network.set_input_data("input", input); + auto ref_output = ref_network.execute(); + auto ref_out_mem = ref_output.at("softmax").get_memory(); + mem_lock lock_ref(ref_out_mem, get_test_stream()); for (size_t i = 0; i < out_mem->count(); i++) { float actual = lock[i]; - ASSERT_EQ(actual, ref_output[i]); + ASSERT_EQ(actual, lock_ref[i]); } -}*/ +} -TEST(merge_reorder_reshape_permute, no_reorder_no_reshape) { +TEST(opt_reorder_reshape_permute, cutomized_net_yolov6_alike) { auto& engine = get_test_engine(); auto in_layout = layout{ov::PartialShape({1, 2, 4, 6}), data_types::f16, format::bfyx}; auto input = engine.allocate_memory(layout{ov::PartialShape({1, 2, 4, 6}), data_types::f16, format::bfyx}); @@ -131,24 +131,42 @@ TEST(merge_reorder_reshape_permute, no_reorder_no_reshape) { topology.add(data("weight", weight)); topology.add( convolution("convolution", input_info("input"), "weight", "", 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, false)); - topology.add(permute("permute_inter", input_info("convolution"), {0, 2, 3, 1})); + topology.add(reorder("reorder_inter", input_info("convolution"), format::bfyx, data_types::f16)); + topology.add( + reshape("reshape_inter", input_info("reorder_inter"), false, {1, 3, 24, 1}, ov::PartialShape{1, 3, 24, 1})); + topology.add(permute("permute_inter", input_info("reshape_inter"), {0, 2, 1})); topology.add(softmax("softmax", input_info("permute_inter"), 1)); ExecutionConfig config = get_test_default_config(engine); - config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + config.set_property(ov::intel_gpu::allow_new_shape_infer(false)); config.set_property(ov::intel_gpu::optimize_data(true)); auto prog = program::build_program(engine, topology, config); - network net(prog); - net.set_input_data("input", input); - auto output = net.execute(); - ExecutionConfig ref_config = get_test_default_config(engine); ref_config.set_property(ov::intel_gpu::optimize_data(false)); cldnn::network ref_network(engine, topology, ref_config); - // select preferred formats, conv + permute + + net.set_input_data("input", input); + auto output = net.execute(); + auto optimzed_nodes = net.get_program()->get_optimized(); + auto it = + std::find_if(std::begin(optimzed_nodes), std::end(optimzed_nodes), [&](cldnn::program::optimized_info& oi) { + return oi.first == "reorder_inter"; + }); + ASSERT_NE(it, optimzed_nodes.end()); auto permute_inst = net.get_primitive("permute_inter"); ASSERT_TRUE(permute_inst->can_be_optimized()); + auto reshape_inst = net.get_primitive("reshape_inter"); + ASSERT_TRUE(reshape_inst->can_be_optimized()); + + auto& processing_order = prog->get_processing_order(); + + auto reshape_node = std::find(processing_order.begin(), processing_order.end(), &prog->get_node("reshape_inter")); + size_t reshape_dist = std::distance(processing_order.begin(), reshape_node); + + auto permute_node = std::find(processing_order.begin(), processing_order.end(), &prog->get_node("permute_inter")); + size_t permute_dist = std::distance(processing_order.begin(), permute_node); + ASSERT_TRUE(reshape_dist > permute_dist); auto out_mem = output.at("softmax").get_memory(); mem_lock lock(out_mem, get_test_stream()); @@ -163,7 +181,7 @@ TEST(merge_reorder_reshape_permute, no_reorder_no_reshape) { } } -TEST(merge_reorder_reshape_permute, cutomized_net_yolov6_alike) { +TEST(opt_reorder_reshape_permute, cutomized_net_yolov6_alike_4d) { auto& engine = get_test_engine(); auto in_layout = layout{ov::PartialShape({1, 2, 4, 6}), data_types::f16, format::bfyx}; auto input = engine.allocate_memory(layout{ov::PartialShape({1, 2, 4, 6}), data_types::f16, format::bfyx}); @@ -182,7 +200,7 @@ TEST(merge_reorder_reshape_permute, cutomized_net_yolov6_alike) { topology.add(reorder("reorder_inter", input_info("convolution"), format::bfyx, data_types::f16)); topology.add( reshape("reshape_inter", input_info("reorder_inter"), false, {1, 3, 24, 1}, ov::PartialShape{1, 3, 24, 1})); - topology.add(permute("permute_inter", input_info("reshape_inter"), {0, 2, 1})); + topology.add(permute("permute_inter", input_info("reshape_inter"), {0, 2, 1, 3})); topology.add(softmax("softmax", input_info("permute_inter"), 1)); ExecutionConfig config = get_test_default_config(engine); config.set_property(ov::intel_gpu::allow_new_shape_infer(false)); @@ -225,12 +243,11 @@ TEST(merge_reorder_reshape_permute, cutomized_net_yolov6_alike) { mem_lock lock_ref(ref_out_mem, get_test_stream()); for (size_t i = 0; i < out_mem->count(); i++) { float actual = lock[i]; - std::cout << actual << ", " << std::endl; ASSERT_EQ(actual, lock_ref[i]); } } -TEST(merge_reorder_reshape_permute, not_sinking_reshape) { +TEST(opt_reorder_reshape_permute, not_sinking_reshape) { auto& engine = get_test_engine(); auto in_layout = layout{ov::PartialShape({1, 2, 4, 6}), data_types::f16, format::bfyx}; auto input = engine.allocate_memory(layout{ov::PartialShape({1, 2, 4, 6}), data_types::f16, format::bfyx}); @@ -248,7 +265,7 @@ TEST(merge_reorder_reshape_permute, not_sinking_reshape) { convolution("convolution", input_info("input"), "weight", "", 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, false)); topology.add(reorder("reorder_inter", input_info("convolution"), format::bfyx, data_types::f16)); topology.add( - reshape("reshape_inter", input_info("reorder_inter"), false, {1, 3, 2, 1}, ov::PartialShape{1, 3, 2, 1})); + reshape("reshape_inter", input_info("reorder_inter"), false, {1, 18, 4, 1}, ov::PartialShape{1, 18, 4, 1})); topology.add(permute("permute_inter", input_info("reshape_inter"), {0, 2, 1})); topology.add(softmax("softmax", input_info("permute_inter"), 1)); ExecutionConfig config = get_test_default_config(engine);