diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp index fbc8ae84c36a29..c65aa3e5894cb8 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp @@ -175,6 +175,7 @@ class debug_configuration { } dump_prof_data_iter_params; static std::ostream* verbose_stream; + static const int DYNAMIC_QUANTIZE_GROUP_SIZE_NOT_SET = -2; }; } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp index c4115d74f54a92..b26b11ce97df6a 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp @@ -55,7 +55,7 @@ static size_t get_dynamic_quantize_group_size(const fully_connected_params& para auto dynamic_quantization_group_size = params.dynamic_quantization_group_size; GPU_DEBUG_GET_INSTANCE(debug_config); - GPU_DEBUG_IF(debug_config->dynamic_quantize_group_size) { + GPU_DEBUG_IF(debug_config->dynamic_quantize_group_size != debug_config->DYNAMIC_QUANTIZE_GROUP_SIZE_NOT_SET) { dynamic_quantization_group_size = debug_config->dynamic_quantize_group_size; // Specify which Fully-connected layer would be dynamic-quantized diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index f173e378fca3f9..b75519ac40e678 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -872,7 +872,7 @@ void TransformationsPipeline::apply(std::shared_ptr func) { manager.register_pass(); auto dynamic_quantization_group_size = config.get_property(ov::hint::dynamic_quantization_group_size); - if (device_info.supports_immad) { // XXX: 1048576 is considered per-token + if (device_info.supports_immad) { pass_config->set_callback([=](const_node_ptr& root) -> bool { if (root->get_input_node_shared_ptr(0)->get_element_type() == ov::element::Type_t::f32) { GPU_DEBUG_TRACE << root->get_friendly_name() << " Dynamic quantization is turned off because input type is not supported" << std::endl; diff --git a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp index dcbabff548cc5d..5f943564d6f50e 100644 --- a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp +++ b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp @@ -253,7 +253,7 @@ debug_configuration::debug_configuration() , disable_runtime_skip_reorder(0) , disable_primitive_fusing(0) , disable_fake_alignment(0) - , dynamic_quantize_group_size(0) + , dynamic_quantize_group_size(DYNAMIC_QUANTIZE_GROUP_SIZE_NOT_SET) , disable_horizontal_fc_fusion(0) { #ifdef GPU_DEBUG_CONFIG get_gpu_debug_env_var("Help", help); diff --git a/src/plugins/intel_gpu/src/runtime/execution_config.cpp b/src/plugins/intel_gpu/src/runtime/execution_config.cpp index 9c24fae1d6729a..7661444cc4fd7b 100644 --- a/src/plugins/intel_gpu/src/runtime/execution_config.cpp +++ b/src/plugins/intel_gpu/src/runtime/execution_config.cpp @@ -46,7 +46,6 @@ void ExecutionConfig::set_default() { std::make_tuple(ov::hint::execution_mode, ov::hint::ExecutionMode::PERFORMANCE), std::make_tuple(ov::hint::num_requests, 0), std::make_tuple(ov::hint::enable_cpu_pinning, false), - std::make_tuple(ov::hint::dynamic_quantization_group_size, 0), std::make_tuple(ov::intel_gpu::hint::host_task_priority, ov::hint::Priority::MEDIUM), std::make_tuple(ov::intel_gpu::hint::queue_throttle, ov::intel_gpu::hint::ThrottleLevel::MEDIUM), @@ -58,7 +57,7 @@ void ExecutionConfig::set_default() { std::make_tuple(ov::internal::query_model_ratio, 1.0f), std::make_tuple(ov::cache_mode, ov::CacheMode::OPTIMIZE_SPEED), std::make_tuple(ov::cache_encryption_callbacks, EncryptionCallbacks{}), - std::make_tuple(ov::hint::dynamic_quantization_group_size, 0), + std::make_tuple(ov::hint::dynamic_quantization_group_size, 32), std::make_tuple(ov::intel_gpu::hint::enable_kernels_reuse, false), std::make_tuple(ov::weights_path, ""), @@ -204,7 +203,7 @@ void ExecutionConfig::apply_debug_options(const cldnn::device_info& info) { set_property(ov::intel_gpu::use_only_static_kernels_for_dynamic_shape(true)); } - GPU_DEBUG_IF(debug_config->dynamic_quantize_group_size) { + GPU_DEBUG_IF(debug_config->dynamic_quantize_group_size != debug_config->DYNAMIC_QUANTIZE_GROUP_SIZE_NOT_SET) { if (debug_config->dynamic_quantize_group_size == -1) set_property(ov::hint::dynamic_quantization_group_size(UINT64_MAX)); else diff --git a/src/plugins/intel_gpu/tests/unit/fusions/fully_connected_fusion_test.cpp b/src/plugins/intel_gpu/tests/unit/fusions/fully_connected_fusion_test.cpp index 3743298a3c981a..5e9b5134fb3802 100644 --- a/src/plugins/intel_gpu/tests/unit/fusions/fully_connected_fusion_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/fusions/fully_connected_fusion_test.cpp @@ -666,6 +666,7 @@ TEST_P(fc_compressed_int8_bias_dynamic_onednn, basic) { bool is_dynamic = true; cfg_not_fused.set_property(ov::intel_gpu::allow_new_shape_infer(is_dynamic)); + cfg_not_fused.set_property(ov::hint::dynamic_quantization_group_size(0)); tolerance = 1.0f; execute(p, false, is_dynamic); } @@ -705,6 +706,7 @@ TEST_P(fc_compressed_int8_bias_prod_unfused_dynamic_onednn, basic) { bool is_dynamic = true; cfg_not_fused.set_property(ov::intel_gpu::allow_new_shape_infer(is_dynamic)); + cfg_not_fused.set_property(ov::hint::dynamic_quantization_group_size(0)); tolerance = 1.0f; execute(p, false, is_dynamic); } diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp index 0ef7b6a5ca088b..dde1b6215148b3 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp @@ -1590,6 +1590,7 @@ class fully_connected_gpu_tests: public ::testing::Test { config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); ov::intel_gpu::ImplementationDesc fc_impl_desc = { format::bfyx, "fully_connected_gpu_bfyx_ref", impl_types::ocl }; config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"fc_prim", fc_impl_desc} })); + config.set_property(ov::hint::dynamic_quantization_group_size(0)); network network(engine, topology, config); network.set_input_data("input", input_mem); @@ -1615,6 +1616,7 @@ class fully_connected_gpu_tests: public ::testing::Test { auto config = get_test_default_config(engine); config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); config.set_property(ov::intel_gpu::optimize_data(true)); + config.set_property(ov::hint::dynamic_quantization_group_size(0)); network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test); @@ -1698,9 +1700,7 @@ class fully_connected_gpu_tests: public ::testing::Test { config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); ov::intel_gpu::ImplementationDesc fc_impl_desc = { format::bfyx, "fully_connected_gpu_bfyx_ref", impl_types::ocl }; config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"fc_prim", fc_impl_desc} })); - if (is_dyn_quan) { - config.set_property(ov::hint::dynamic_quantization_group_size(0)); - } + config.set_property(ov::hint::dynamic_quantization_group_size(0)); network network(engine, topology, config); network.set_input_data("input", input_mem); @@ -1728,6 +1728,8 @@ class fully_connected_gpu_tests: public ::testing::Test { config.set_property(ov::intel_gpu::optimize_data(true)); if (is_dyn_quan) { config.set_property(ov::hint::dynamic_quantization_group_size(32)); + } else { + config.set_property(ov::hint::dynamic_quantization_group_size(0)); } network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test); @@ -1868,6 +1870,7 @@ class fully_connected_gpu_tests: public ::testing::Test { config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); ov::intel_gpu::ImplementationDesc fc_impl = { in_layout.format, "", impl_types::ocl }; config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ { "fc_prim1", fc_impl }, { "fc_prim2", fc_impl } })); + config.set_property(ov::hint::dynamic_quantization_group_size(0)); network network(engine, topology, config); network.set_input_data("input", input_mem); @@ -1896,6 +1899,7 @@ class fully_connected_gpu_tests: public ::testing::Test { auto config = get_test_default_config(engine); config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); config.set_property(ov::intel_gpu::optimize_data(true)); + config.set_property(ov::hint::dynamic_quantization_group_size(0)); network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test);