From 1e3b88e4e3f89774923e04e845428579f8ffa0fe Mon Sep 17 00:00:00 2001
From: "Min, Byungil"
Date: Fri, 19 Jul 2024 22:06:57 +0900
Subject: [PATCH] [GPU] Fix regression by selection of reference MatMul (#25633)

+ Resolve unexpected dynamic-shape input coming from Reshape

### Details:
- *item1*
- *...*

### Tickets:
- 147083

---------

Signed-off-by: Min, Byung-il
---
 .../fully_connected_gpu_bf_tiled.cl         | 20 ++------
 .../fully_connected_kernel_bf_tiled.cpp     | 37 ++++++++++-----
 .../single_layer_tests/dynamic/matmul.cpp   |  3 +-
 .../test_cases/fully_connected_gpu_test.cpp | 47 +++++++++++++++++--
 4 files changed, 75 insertions(+), 32 deletions(-)

diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl
index f22c1ee136c004..2132ce27b7e40e 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl
@@ -116,25 +116,13 @@ KERNEL(quantize_input)(
 
 #if !REALIGN_FP16_OFFSET
-#   if OUTPUT_3D
-#       define MAIN_LOOP_ELEMENTS_COUNT INPUT0_SIZE_Y
-#   else
-#       define MAIN_LOOP_ELEMENTS_COUNT INPUT0_ELEMENTS_COUNT
-#   endif
+    #define MAIN_LOOP_ELEMENTS_COUNT IFM_SIZE
 #else
-// For REALIGN_FP16_OFFSET one feature is processed separately before entering main loop to correct alignment.
-#   if OUTPUT_3D
-#       define MAIN_LOOP_ELEMENTS_COUNT (INPUT0_SIZE_Y - 1)
-#   else
-#       define MAIN_LOOP_ELEMENTS_COUNT (INPUT0_ELEMENTS_COUNT - 1)
-#   endif
+    // For REALIGN_FP16_OFFSET one feature is processed separately before entering the main loop to correct alignment.
+    #define MAIN_LOOP_ELEMENTS_COUNT (IFM_SIZE - 1)
 #endif
 
-#if OUTPUT_3D
-#   define INPUT_ELEMENTS_COUNT INPUT0_SIZE_Y
-#else
-#   define INPUT_ELEMENTS_COUNT INPUT0_ELEMENTS_COUNT
-#endif
+#define INPUT_ELEMENTS_COUNT IFM_SIZE
 
 #if IS_DYNAMIC && COMPRESSED_WEIGHTS_INT4
 #pragma disable_includes_optimization
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp
index c6b0acda06c56a..07d81dce5e3f23 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp
@@ -15,14 +15,20 @@ static constexpr size_t min_slm_size = 256;
 namespace kernel_selector {
 
 static std::pair<size_t, size_t> get_input_bf_size(const fully_connected_params& params) {
-    size_t input_f = params.inputs[0].Feature().v;
-    size_t input_batch = params.inputs[0].Batch().v;
+    auto& input = params.inputs[0];
+    size_t input_f = input.Feature().v;
+    size_t input_batch = input.Batch().v;
+
+    // 3D input
     if (params.outputs[0].GetLayout() == DataLayout::bfyx) {
-        input_f = params.inputs[0].Y().v;
-        input_batch = params.inputs[0].Batch().v * params.inputs[0].Feature().v;
+        input_f = input.Y().v;
+        input_batch = input.Batch().v * input.Feature().v;
     }
+
+    // In some models, input_f of input0 may still be dynamic; it refers to the IFM value of the weights.
+    if (input.is_dynamic() && input_f == 0 && params.weights.IFM().v != 0)
+        input_f = params.weights.IFM().v;
+
     return {input_batch, input_f};
 }
 
@@ -153,8 +159,7 @@ bool FullyConnected_bf_tiled::Validate(const Params& params) const {
 
     // Dynamic kernel doesn't support dynamic weights yet
     if (fc_params.is_shape_agnostic && input.is_dynamic()) {
-        if ((output.GetLayout() == DataLayout::bfyx && input.Y().v == 0) ||
-            (output.GetLayout() == DataLayout::bf && input.Feature().v == 0))
+        if (get_input_bf_size(fc_params).second == 0)
             return false;
     }
 
@@ -509,6 +514,7 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para
         jit.AddConstant(MakeJitConstant("DYNAMIC_QUANTIZE", 0));
     }
 
+    jit.AddConstant(MakeJitConstant("IFM_SIZE", get_input_bf_size(params).second));
     jit.AddConstant(MakeJitConstant("SIMD", simd));
     jit.AddConstant(MakeJitConstant("TILE_B", dispatchData.tile_m));
     jit.AddConstant(MakeJitConstant("HALF_TILE_B", dispatchData.tile_m/2));
@@ -539,16 +545,18 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para
 
     // for 3d output we are treating spatial as features
     if (params.outputs[0].GetLayout() == DataLayout::bfyx) {
+        auto tile_in_b_pitch = (params.inputs[0].Feature().pitch == 0) ? get_input_bf_size(params).second : params.inputs[0].Feature().pitch;
         jit.AddConstant(MakeJitConstant("TILE_OUT_F_NUM", params.outputs[0].Y().v));
         jit.AddConstant(MakeJitConstant("TILE_OUT_F_PITCH", params.outputs[0].Y().pitch));
-        jit.AddConstant(MakeJitConstant("TILE_IN_B_PITCH", params.inputs[0].Feature().pitch));
+        jit.AddConstant(MakeJitConstant("TILE_IN_B_PITCH", tile_in_b_pitch));
         jit.AddConstant(MakeJitConstant("TILE_OUT_B_PITCH", params.outputs[0].Feature().pitch));
         jit.AddConstant(MakeJitConstant("OUTPUT_3D", true));
         jit.AddConstant(MakeJitConstant("BATCH_SIZE", "(OUTPUT_BATCH_NUM * OUTPUT_FEATURE_NUM)"));
     } else {
+        auto tile_in_b_pitch = (params.inputs[0].Batch().pitch == 0) ? get_input_bf_size(params).second : params.inputs[0].Batch().pitch;
         jit.AddConstant(MakeJitConstant("TILE_OUT_F_NUM", params.outputs[0].Feature().v));
         jit.AddConstant(MakeJitConstant("TILE_OUT_F_PITCH", params.outputs[0].Feature().pitch));
-        jit.AddConstant(MakeJitConstant("TILE_IN_B_PITCH", params.inputs[0].Batch().pitch));
+        jit.AddConstant(MakeJitConstant("TILE_IN_B_PITCH", tile_in_b_pitch));
        jit.AddConstant(MakeJitConstant("TILE_OUT_B_PITCH", params.outputs[0].Batch().pitch));
         jit.AddConstant(MakeJitConstant("BATCH_SIZE", "(OUTPUT_BATCH_NUM)"));
     }
@@ -614,6 +622,12 @@ void FullyConnected_bf_tiled::GetUpdateDispatchDataFunc(KernelData& kd) const {
         kd.kernels[execute_kernel_idx].params.workGroups.local = dispatchData.lws;
         kd.kernels[execute_kernel_idx].skip_execution = KernelData::SkipKernelExecution(prim_params);
 
+        auto& input = prim_params.inputs[0];
+        if (prim_params.outputs[0].GetLayout() == DataLayout::bfyx)
+            OPENVINO_ASSERT(input.X().pad.Total() == 0 && input.Y().pad.Total() == 0, "[GPU] Invalid padding in spatial axes observed in FC bf tiled.");
+        else
+            OPENVINO_ASSERT(input.Feature().pad.Total() == 0, "[GPU] Invalid padding in f axis observed in FC bf tiled.");
+
         if (!kd.internalBufferSizes.empty()) {
             // Pre-quantizing kernel was generated. Update the kernel and intermediate buffers or disable it.
             if (execute_type == KernelType::DEFAULT) {
@@ -784,7 +798,8 @@ KernelsData FullyConnected_bf_tiled::GetMultiKernelsData(const Params &params,
     {
         auto& quan_kernel = kd.kernels[0];
         DispatchData dyn_quan_dispatch = dispatchData;
-        dyn_quan_dispatch.gws = {std::max((fc_params.inputs[0].PhysicalSize() / quantize_grp_size), (size_t)1), 1, 1};
+        auto input_size = std::max(fc_params.inputs[0].PhysicalSize(), get_input_bf_size(fc_params).second);
+        dyn_quan_dispatch.gws = {input_size / quantize_grp_size, 1, 1};
         dyn_quan_dispatch.lws = {16, 1, 1};
         quan_kernel.params.workGroups.global = dyn_quan_dispatch.gws;
         quan_kernel.params.workGroups.local = dyn_quan_dispatch.lws;
@@ -814,8 +829,8 @@ KernelsData FullyConnected_bf_tiled::GetMultiKernelsData(const Params &params,
         quan_kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 0});
         quan_kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0});
         quan_kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1});
-        kd.internalBufferSizes.push_back(fc_params.inputs[0].PhysicalSize());
-        kd.internalBufferSizes.push_back(fc_params.inputs[0].PhysicalSize() / quantize_grp_size * 2);
+        kd.internalBufferSizes.push_back(input_size);
+        kd.internalBufferSizes.push_back(input_size / quantize_grp_size * 2);
         kernel_number++;
     }
     kd.internalBufferDataType = Datatype::F16;
diff --git a/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/matmul.cpp b/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/matmul.cpp
index c88a3826fe0f8f..36b6370a85c2f4 100644
--- a/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/matmul.cpp
+++ b/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/matmul.cpp
@@ -266,9 +266,10 @@ const std::vector<ShapeRelatedParams> IS3D_smoke = {
     },
     {ov::test::static_shapes_to_test_representation({{1, 429}, {1, 429, 1}}), {true, true}},
+
     {
         {
-            {{-1, -1}, {{1, 129}, {2, 129}, {1, 129}, {2, 129}}},
+            {{-1, -1, -1}, {{1, 1, 129}, {1, 2, 129}, {1, 1, 129}, {1, 2, 129}}},
             {{1, 129, 1}, {{1, 129, 1}, {1, 129, 1}, {1, 129, 1}, {1, 129, 1}}}
         },
         {true, true}
     },
diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp
index 35d65554408252..5137ad4ebdaf8d 100644
--- a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp
@@ -1255,7 +1255,7 @@ class fully_connected_gpu_tests: public ::testing::Test {
         }
     }
 
-    void test_compressed_int4_scale_dyn_quan(bool is_caching_test, bool is_dynamic, int batch = 1) {
+    void test_compressed_int4_scale_dyn_quan(bool is_caching_test, bool is_dynamic, int batch = 1, bool is_wei_dyn = false) {
         tests::random_generator rg(GET_SUITE_NAME);
         auto& engine = get_test_engine();
 
@@ -1285,6 +1285,11 @@ class fully_connected_gpu_tests: public ::testing::Test {
         auto scale_data = rg.generate_random_1d<ov::float16>(ofm_num * ifm_num / scales_group_size, -4.0f, 4.0f);
         set_values(scale_mem, scale_data);
 
+        if (is_wei_dyn) {
+            // ifm_num is dynamic
+            dyn_input_ps = is_3d ? ov::PartialShape{ -1, -1, -1 } : ov::PartialShape{ -1, -1 };
+        }
+
         auto in_layout = is_dynamic ?
                          layout{ dyn_input_ps, data_types::f16, format::bfyx } :
                          layout{ input_ps, data_types::f16, format::bfyx };
 
@@ -1302,7 +1307,8 @@ class fully_connected_gpu_tests: public ::testing::Test {
 
         auto config = get_test_default_config(engine);
         config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
-        config.set_property(ov::intel_gpu::optimize_data(true));
+        ov::intel_gpu::ImplementationDesc fc_impl_desc = { format::bfyx, "fully_connected_gpu_bfyx_ref", impl_types::ocl };
+        config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"fc_prim", fc_impl_desc} }));
 
         network network(engine, topology, config);
         network.set_input_data("input", input_mem);
@@ -1365,13 +1371,13 @@ class fully_connected_gpu_tests: public ::testing::Test {
         }
     }
 
-    void test_compressed_int4_scale(bool is_caching_test, bool is_dynamic, long int batch_num, long int scales_group_size = 128) {
+    void test_compressed_int4_scale(bool is_caching_test, bool is_dynamic, long int batch_num, long int scales_group_size = 128, bool is_wei_dyn = false) {
         tests::random_generator rg(GET_SUITE_NAME);
         auto& engine = get_test_engine();
         auto supports_immad = engine.get_device_info().supports_immad;
 
         long int ifm_num = 256;
-        long int ofm_num = 256;
+        long int ofm_num = 512;
 
         auto input_mem = engine.allocate_memory({ { batch_num, ifm_num}, data_types::f16, format::bfyx });
         auto weights_mem = engine.allocate_memory({ {ofm_num, ifm_num}, data_types::u4, format::bfyx });
@@ -1392,6 +1398,11 @@ class fully_connected_gpu_tests: public ::testing::Test {
         auto in_layout = is_dynamic ? layout{ {-1, ifm_num}, data_types::f16, format::bfyx }
                                     : layout{ {batch_num, ifm_num}, data_types::f16, format::bfyx };
 
+        if (is_dynamic && is_wei_dyn) {
+            // ifm_num is dynamic
+            in_layout = layout{ {-1, -1}, data_types::f16, format::bfyx };
+        }
+
         auto dcomp_zp_name = supports_immad ? "dcomp_zp" : "";
"dcomp_zp" : ""; auto fc_prim = fully_connected("fc_prim", input_info("input"), "weights", "", "scale", dcomp_zp_name, data_types::f16, padding(), 2, 2); @@ -1409,6 +1420,8 @@ class fully_connected_gpu_tests: public ::testing::Test { auto config = get_test_default_config(engine); config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + ov::intel_gpu::ImplementationDesc fc_impl_desc = { format::bfyx, "fully_connected_gpu_bfyx_ref", impl_types::ocl }; + config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"fc_prim", fc_impl_desc} })); network network(engine, topology, config); network.set_input_data("input", input_mem); @@ -3324,6 +3337,32 @@ TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dyn_cache_dynamic) { this->test_compressed_int4_scale_dyn_quan(true, true, 512); } +TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_f_input) { + this->test_compressed_int4_scale(false, true, 256, true); +} + +TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_f_input_cached) { + this->test_compressed_int4_scale(true, true, 260, true); +} +TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_f_input_b1g64) { + this->test_compressed_int4_scale(false, true, 1, 64, true); +} + +TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_f_input_b1g128) { + this->test_compressed_int4_scale(false, true, 1, 128, true); +} + +TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dyn_quan_dynamic_f_input_single_batch) { + this->test_compressed_int4_scale_dyn_quan(false, true, 1, true); +} + +TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dyn_quan_dynamic_f_input) { + this->test_compressed_int4_scale_dyn_quan(false, true, 512, true); +} + +TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dyn_quan_dynamic_f_input_unaligned) { + this->test_compressed_int4_scale_dyn_quan(false, true, 511, true); +} TEST_F(fully_connected_gpu_tests, compressed_scale_bias) {