From f93d051f24cbf40d2d5d6f1f24a1a780573dfc2b Mon Sep 17 00:00:00 2001 From: Paul Youngsoo Ahn Date: Mon, 23 Dec 2024 17:28:35 +0900 Subject: [PATCH] Add missing code in dynamic fc impl (#28026) ### Details: - *Add acc_tmp in general calc in fc function in common include file* ### Tickets: - *158460* --- .../fully_connected_gpu_bf_tiled.cl | 4 +++- .../fully_connected_gpu_bf_tiled_common.cl | 19 ++++++++++++------- .../test_cases/fully_connected_gpu_test.cpp | 4 ++++ 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl index 01c8e8853e350d..6a5c9e54a8e904 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl @@ -601,8 +601,10 @@ inline void FUNC(fc_bf_tiled_kernel_default)( #endif #if TILE_OFM > 1 ((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += ((ACCUMULATOR_TYPE*)(&acc_tmp[bi]))[fi] * ds; + acc_tmp[bi][fi] = 0; #else acc[bi] += acc_tmp[bi] * ds; + acc_tmp[bi] = 0; #endif } } @@ -972,7 +974,7 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( // ===================================================================================================================================== // Main computation loop const uint iterations = MAIN_LOOP_ELEMENTS_COUNT / TILE_IFM_ELEMENTS_SIZE; // TILE_IFM_ELEMENTS_SIZE : (TILE_IFM * SIMD) - // Each sub-group loads 2 Batch + // Each sub-group loads 2 Batch uint idx_sglid = (sglid * TILE_K) % TILE_IFM_ELEMENTS_SIZE; // same index for sglid 0~7 : to tile_k direction uint batch_sglid = (sglid * TILE_K) / TILE_IFM_ELEMENTS_SIZE; // 0 to 1 : to batch direction diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/fully_connected_gpu_bf_tiled_common.cl 
b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/fully_connected_gpu_bf_tiled_common.cl index ca5c1ea3646d02..3f5796a30933ac 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/fully_connected_gpu_bf_tiled_common.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/fully_connected_gpu_bf_tiled_common.cl @@ -147,9 +147,7 @@ inline void (FUNC_NAME)( // NOTE: Manually unrolling multiplication loop leads to lower register pressure and allows for bigger block sizes, // but significantly degrades readability and generality of code. // It doesn't also show noticable performance improvement on tested configurations. - #if DECOMPRESSION_SCALE_POST_OP - ACCUMULATOR_VEC_TYPE acc_tmp[FORCED_TILE_B] = { }; - #endif + ACCUMULATOR_VEC_TYPE acc_tmp[FORCED_TILE_B] = { }; unroll_for(uint ki = 0; ki < (TILE_IFM * SIMD) / TILE_K; ++ki) { #if COMPRESSED_WEIGHTS_INT4 @@ -201,11 +199,7 @@ inline void (FUNC_NAME)( unroll_for (uint bi = 0; bi < FORCED_TILE_B; ++bi) { INPUT0_TYPE in_val = _sub_group_shuffle(((INPUT0_TYPE*)(&in_0[bi]))[total_k / SIMD], total_k % SIMD); unroll_for (uint fi = 0; fi < TILE_OFM; ++fi) { -#if DECOMPRESSION_SCALE_POST_OP ((ACCUMULATOR_TYPE*)(&acc_tmp[bi]))[fi] += in_val * ((ACCUMULATOR_TYPE*)(&wei))[W_IDX]; -#else - ((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += in_val * ((ACCUMULATOR_TYPE*)(&wei))[W_IDX]; -#endif } } } @@ -240,9 +234,20 @@ inline void (FUNC_NAME)( ACCUMULATOR_TYPE ds = d_scales[fi % DECOMPRESSION_SCALE_LENGTH]; #endif ((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += ((ACCUMULATOR_TYPE*)(&acc_tmp[bi]))[fi] * ds; + acc_tmp[bi][fi] = 0; } } #endif + +#if !DECOMPRESSION_SCALE_POST_OP + unroll_for (uint bi = 0; bi < FORCED_TILE_B; ++bi) { + unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) { + ((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += ((ACCUMULATOR_TYPE*)(&acc_tmp[bi]))[fi]; + } + } +#endif + + } // 
===================================================================================================================================== // Leftovers diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp index f59dc5c42cffc1..c3caebe9d0ba68 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp @@ -4137,6 +4137,10 @@ TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dyn_quan_dynamic_f_input this->test_compressed_int4_scale_dyn_quan(false, true, 511, true); } +TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_quantize_batch_1) { + this->test_compressed_int4_scale_dyn_quan_weight_i4(true, 1, 2048, 3072); +} + TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_quantize_edge_case) { this->test_compressed_int4_scale_dyn_quan_weight_i4(true, 359, 1536, 2560); }