Skip to content

Commit

Permalink
[GPU] Enable dynamic quantization gs32 as default for non-systolic (openvinotoolkit#27119)
Browse files Browse the repository at this point in the history

### Details:
 - It is applied only to int4 compressed models, and only on the non-systolic path
 - Though it is a global configuration, systolic hardware will ignore it

### Tickets:
 - 151708
  • Loading branch information
isanghao authored Oct 21, 2024
1 parent 2cb8222 commit b785e6e
Show file tree
Hide file tree
Showing 7 changed files with 15 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,7 @@ class debug_configuration {
} dump_prof_data_iter_params;

static std::ostream* verbose_stream;
static const int DYNAMIC_QUANTIZE_GROUP_SIZE_NOT_SET = -2;
};

} // namespace cldnn
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ static size_t get_dynamic_quantize_group_size(const fully_connected_params& para
auto dynamic_quantization_group_size = params.dynamic_quantization_group_size;

GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->dynamic_quantize_group_size) {
GPU_DEBUG_IF(debug_config->dynamic_quantize_group_size != debug_config->DYNAMIC_QUANTIZE_GROUP_SIZE_NOT_SET) {
dynamic_quantization_group_size = debug_config->dynamic_quantize_group_size;

// Specify which Fully-connected layer would be dynamic-quantized
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -872,7 +872,7 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
manager.register_pass<ov::pass::Validate>();

auto dynamic_quantization_group_size = config.get_property(ov::hint::dynamic_quantization_group_size);
if (device_info.supports_immad) { // XXX: 1048576 is considered per-token
if (device_info.supports_immad) {
pass_config->set_callback<ov::intel_gpu::DynamicQuantizeFullyConnected>([=](const_node_ptr& root) -> bool {
if (root->get_input_node_shared_ptr(0)->get_element_type() == ov::element::Type_t::f32) {
GPU_DEBUG_TRACE << root->get_friendly_name() << " Dynamic quantization is turned off because input type is not supported" << std::endl;
Expand Down
2 changes: 1 addition & 1 deletion src/plugins/intel_gpu/src/runtime/debug_configuration.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,7 @@ debug_configuration::debug_configuration()
, disable_runtime_skip_reorder(0)
, disable_primitive_fusing(0)
, disable_fake_alignment(0)
, dynamic_quantize_group_size(0)
, dynamic_quantize_group_size(DYNAMIC_QUANTIZE_GROUP_SIZE_NOT_SET)
, disable_horizontal_fc_fusion(0) {
#ifdef GPU_DEBUG_CONFIG
get_gpu_debug_env_var("Help", help);
Expand Down
5 changes: 2 additions & 3 deletions src/plugins/intel_gpu/src/runtime/execution_config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ void ExecutionConfig::set_default() {
std::make_tuple(ov::hint::execution_mode, ov::hint::ExecutionMode::PERFORMANCE),
std::make_tuple(ov::hint::num_requests, 0),
std::make_tuple(ov::hint::enable_cpu_pinning, false),
std::make_tuple(ov::hint::dynamic_quantization_group_size, 0),

std::make_tuple(ov::intel_gpu::hint::host_task_priority, ov::hint::Priority::MEDIUM),
std::make_tuple(ov::intel_gpu::hint::queue_throttle, ov::intel_gpu::hint::ThrottleLevel::MEDIUM),
Expand All @@ -58,7 +57,7 @@ void ExecutionConfig::set_default() {
std::make_tuple(ov::internal::query_model_ratio, 1.0f),
std::make_tuple(ov::cache_mode, ov::CacheMode::OPTIMIZE_SPEED),
std::make_tuple(ov::cache_encryption_callbacks, EncryptionCallbacks{}),
std::make_tuple(ov::hint::dynamic_quantization_group_size, 0),
std::make_tuple(ov::hint::dynamic_quantization_group_size, 32),
std::make_tuple(ov::intel_gpu::hint::enable_kernels_reuse, false),
std::make_tuple(ov::weights_path, ""),

Expand Down Expand Up @@ -204,7 +203,7 @@ void ExecutionConfig::apply_debug_options(const cldnn::device_info& info) {
set_property(ov::intel_gpu::use_only_static_kernels_for_dynamic_shape(true));
}

GPU_DEBUG_IF(debug_config->dynamic_quantize_group_size) {
GPU_DEBUG_IF(debug_config->dynamic_quantize_group_size != debug_config->DYNAMIC_QUANTIZE_GROUP_SIZE_NOT_SET) {
if (debug_config->dynamic_quantize_group_size == -1)
set_property(ov::hint::dynamic_quantization_group_size(UINT64_MAX));
else
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -666,6 +666,7 @@ TEST_P(fc_compressed_int8_bias_dynamic_onednn, basic) {

bool is_dynamic = true;
cfg_not_fused.set_property(ov::intel_gpu::allow_new_shape_infer(is_dynamic));
cfg_not_fused.set_property(ov::hint::dynamic_quantization_group_size(0));
tolerance = 1.0f;
execute(p, false, is_dynamic);
}
Expand Down Expand Up @@ -705,6 +706,7 @@ TEST_P(fc_compressed_int8_bias_prod_unfused_dynamic_onednn, basic) {

bool is_dynamic = true;
cfg_not_fused.set_property(ov::intel_gpu::allow_new_shape_infer(is_dynamic));
cfg_not_fused.set_property(ov::hint::dynamic_quantization_group_size(0));
tolerance = 1.0f;
execute(p, false, is_dynamic);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1590,6 +1590,7 @@ class fully_connected_gpu_tests: public ::testing::Test {
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
ov::intel_gpu::ImplementationDesc fc_impl_desc = { format::bfyx, "fully_connected_gpu_bfyx_ref", impl_types::ocl };
config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"fc_prim", fc_impl_desc} }));
config.set_property(ov::hint::dynamic_quantization_group_size(0));

network network(engine, topology, config);
network.set_input_data("input", input_mem);
Expand All @@ -1615,6 +1616,7 @@ class fully_connected_gpu_tests: public ::testing::Test {
auto config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
config.set_property(ov::intel_gpu::optimize_data(true));
config.set_property(ov::hint::dynamic_quantization_group_size(0));

network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test);

Expand Down Expand Up @@ -1698,9 +1700,7 @@ class fully_connected_gpu_tests: public ::testing::Test {
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
ov::intel_gpu::ImplementationDesc fc_impl_desc = { format::bfyx, "fully_connected_gpu_bfyx_ref", impl_types::ocl };
config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"fc_prim", fc_impl_desc} }));
if (is_dyn_quan) {
config.set_property(ov::hint::dynamic_quantization_group_size(0));
}
config.set_property(ov::hint::dynamic_quantization_group_size(0));

network network(engine, topology, config);
network.set_input_data("input", input_mem);
Expand Down Expand Up @@ -1728,6 +1728,8 @@ class fully_connected_gpu_tests: public ::testing::Test {
config.set_property(ov::intel_gpu::optimize_data(true));
if (is_dyn_quan) {
config.set_property(ov::hint::dynamic_quantization_group_size(32));
} else {
config.set_property(ov::hint::dynamic_quantization_group_size(0));
}

network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test);
Expand Down Expand Up @@ -1868,6 +1870,7 @@ class fully_connected_gpu_tests: public ::testing::Test {
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
ov::intel_gpu::ImplementationDesc fc_impl = { in_layout.format, "", impl_types::ocl };
config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ { "fc_prim1", fc_impl }, { "fc_prim2", fc_impl } }));
config.set_property(ov::hint::dynamic_quantization_group_size(0));

network network(engine, topology, config);
network.set_input_data("input", input_mem);
Expand Down Expand Up @@ -1896,6 +1899,7 @@ class fully_connected_gpu_tests: public ::testing::Test {
auto config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
config.set_property(ov::intel_gpu::optimize_data(true));
config.set_property(ov::hint::dynamic_quantization_group_size(0));

network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test);

Expand Down

0 comments on commit b785e6e

Please sign in to comment.