From ba70ef19c1678d260b0c932aa3b041b36fb44ce1 Mon Sep 17 00:00:00 2001 From: Alexey Smirnov Date: Thu, 9 Jan 2025 11:59:39 +0000 Subject: [PATCH 1/6] Update DQ query --- src/cpp/src/llm_pipeline_static.cpp | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index c98b571179..517a121d12 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -475,13 +475,7 @@ std::optional extract_npu_descriptor(ov::Core& core) { } const auto arch = core.get_property("NPU", ov::device::architecture); const auto max_tiles = core.get_property("NPU", ov::intel_npu::max_tiles); - - bool compiler_dq = false; - const auto device_caps = core.get_property("NPU", ov::device::capabilities); - if (std::find(device_caps.begin(), device_caps.end(), - "COMPILER_DYNAMIC_QUANTIZATION") != device_caps.end()) { - compiler_dq = true; - } + const auto compiler_dq = core.get_property("NPU", ov::intel_npu::compiler_dynamic_quantization); return std::make_optional(NPUDesc{arch, max_tiles, compiler_dq}); } @@ -526,6 +520,7 @@ ov::AnyMap get_default_prefill_config(const std::shared_ptr& model, } if (npudesc.has_value() && npudesc->compiler_dq) { config.emplace("NPUW_DQ_FULL", "NO"); + config.emplace("NPU_COMPILATION_MODE_PARAMS", "enable-weights-dynamic-dequantization=true"); } return config; } @@ -547,6 +542,7 @@ ov::AnyMap get_default_generate_config(const std::shared_ptr& model, } if (npudesc.has_value() && npudesc->compiler_dq) { config.emplace("NPUW_DQ_FULL", "NO"); + config.emplace("NPU_COMPILATION_MODE_PARAMS", "enable-weights-dynamic-dequantization=true"); } return config; } From f5dd5b177fa610173f5ce1421c2c9930a203d503 Mon Sep 17 00:00:00 2001 From: Alexey Smirnov Date: Mon, 13 Jan 2025 14:29:58 +0000 Subject: [PATCH 2/6] Unconditionally utilize compiler DQ --- src/cpp/src/llm_pipeline_static.cpp | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index 517a121d12..c07672b580 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -465,7 +465,6 @@ void merge_config_with(ov::AnyMap& lhs, const ov::AnyMap& rhs) { struct NPUDesc { std::string arch; int64_t max_tiles; - bool compiler_dq; }; std::optional extract_npu_descriptor(ov::Core& core) { @@ -475,8 +474,7 @@ std::optional extract_npu_descriptor(ov::Core& core) { } const auto arch = core.get_property("NPU", ov::device::architecture); const auto max_tiles = core.get_property("NPU", ov::intel_npu::max_tiles); - const auto compiler_dq = core.get_property("NPU", ov::intel_npu::compiler_dynamic_quantization); - return std::make_optional(NPUDesc{arch, max_tiles, compiler_dq}); + return std::make_optional(NPUDesc{arch, max_tiles}); } ov::AnyMap get_baseline_common_config() { @@ -510,6 +508,8 @@ ov::AnyMap get_default_prefill_config(const std::shared_ptr& model, auto config = get_default_common_config(model); if (is_cw_compressed(model)) { config.emplace("NPUW_DQ", "YES"); + config.emplace("NPUW_DQ_FULL", "NO"); + config.emplace("NPU_COMPILER_DYNAMIC_QUANTIZATION", true); } else { config.emplace("NPUW_PMM", "NO"); } @@ -518,10 +518,6 @@ ov::AnyMap get_default_prefill_config(const std::shared_ptr& model, npudesc->max_tiles != -1) { config.emplace("NPU_DPU_GROUPS", npudesc->max_tiles); } - if (npudesc.has_value() && npudesc->compiler_dq) { - config.emplace("NPUW_DQ_FULL", "NO"); - config.emplace("NPU_COMPILATION_MODE_PARAMS", "enable-weights-dynamic-dequantization=true"); - } return config; } @@ -534,16 +530,14 @@ ov::AnyMap get_default_generate_config(const std::shared_ptr& model, } // NB: Unconditionally set for generation model config.emplace("NPUW_DQ", "YES"); + config.emplace("NPUW_DQ_FULL", "NO"); + config.emplace("NPU_COMPILER_DYNAMIC_QUANTIZATION", true); if (npudesc.has_value() && npudesc->arch == "4000") { config.emplace("NPU_DPU_GROUPS", 4); } if (hint == GenerateHint::FAST_COMPILE) { config.emplace("NPUW_UNFOLD_IREQS", "YES"); } - if (npudesc.has_value() && npudesc->compiler_dq) { - config.emplace("NPUW_DQ_FULL", "NO"); - config.emplace("NPU_COMPILATION_MODE_PARAMS", "enable-weights-dynamic-dequantization=true"); - } return config; } From cc44a0d70d5df2d900de2d5df8eea30b283a8c1a Mon Sep 17 00:00:00 2001 From: Alexey Smirnov Date: Mon, 13 Jan 2025 14:59:28 +0000 Subject: [PATCH 3/6] DQ only when in supported props --- src/cpp/src/llm_pipeline_static.cpp | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index c07672b580..8c5ca2ea4b 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -465,6 +465,7 @@ void merge_config_with(ov::AnyMap& lhs, const ov::AnyMap& rhs) { struct NPUDesc { std::string arch; int64_t max_tiles; + bool compiler_dq; }; std::optional extract_npu_descriptor(ov::Core& core) { @@ -474,7 +475,13 @@ std::optional extract_npu_descriptor(ov::Core& core) { } const auto arch = core.get_property("NPU", ov::device::architecture); const auto max_tiles = core.get_property("NPU", ov::intel_npu::max_tiles); - return std::make_optional(NPUDesc{arch, max_tiles}); + bool compiler_dq = false; + const auto supported_properties = core.get_property("NPU", ov::supported_properties); + if (std::find(supported_properties.begin(), supported_properties.end(), + "COMPILER_DYNAMIC_QUANTIZATION") != supported_properties.end()) { + compiler_dq = true; + } + return std::make_optional(NPUDesc{arch, max_tiles, compiler_dq}); } ov::AnyMap get_baseline_common_config() { @@ -508,8 +515,10 @@ ov::AnyMap get_default_prefill_config(const std::shared_ptr& model, auto config = get_default_common_config(model); if (is_cw_compressed(model)) { config.emplace("NPUW_DQ", "YES"); - config.emplace("NPUW_DQ_FULL", "NO"); - config.emplace("NPU_COMPILER_DYNAMIC_QUANTIZATION", true); + if (npudesc.has_value() && npudesc->compiler_dq) { + config.emplace("NPUW_DQ_FULL", "NO"); + config.emplace("NPU_COMPILER_DYNAMIC_QUANTIZATION", true); + } } else { config.emplace("NPUW_PMM", "NO"); } @@ -530,8 +539,10 @@ ov::AnyMap get_default_generate_config(const std::shared_ptr& model, } // NB: Unconditionally set for generation model config.emplace("NPUW_DQ", "YES"); - config.emplace("NPUW_DQ_FULL", "NO"); - config.emplace("NPU_COMPILER_DYNAMIC_QUANTIZATION", true); + if (npudesc.has_value() && npudesc->compiler_dq) { + config.emplace("NPUW_DQ_FULL", "NO"); + config.emplace("NPU_COMPILER_DYNAMIC_QUANTIZATION", true); + } if (npudesc.has_value() && npudesc->arch == "4000") { config.emplace("NPU_DPU_GROUPS", 4); } From fa76cf716eaf80f361c0a468898068b649530fdf Mon Sep 17 00:00:00 2001 From: Alexey Smirnov Date: Mon, 13 Jan 2025 15:02:33 +0000 Subject: [PATCH 4/6] Add prefix --- src/cpp/src/llm_pipeline_static.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index 8c5ca2ea4b..ac0b2664ff 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -478,7 +478,7 @@ std::optional extract_npu_descriptor(ov::Core& core) { bool compiler_dq = false; const auto supported_properties = core.get_property("NPU", ov::supported_properties); if (std::find(supported_properties.begin(), supported_properties.end(), - "COMPILER_DYNAMIC_QUANTIZATION") != supported_properties.end()) { + "NPU_COMPILER_DYNAMIC_QUANTIZATION") != supported_properties.end()) { compiler_dq = true; } return std::make_optional(NPUDesc{arch, max_tiles, compiler_dq}); From 868a7ac40431c0371677bbd1fe917e509e4ed99f Mon Sep 17 00:00:00 2001 From: Alexey Smirnov Date: Mon, 13 Jan 2025 18:21:44 +0000 Subject: [PATCH 5/6] Align DQ behaviour --- src/cpp/src/llm_pipeline_static.cpp | 49 +++++++++++++++++------------ 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index ac0b2664ff..a291b297d5 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -484,7 +484,7 @@ std::optional extract_npu_descriptor(ov::Core& core) { return std::make_optional(NPUDesc{arch, max_tiles, compiler_dq}); } -ov::AnyMap get_baseline_common_config() { +ov::AnyMap get_baseline_common_config(bool enable_compiler_dq) { ov::AnyMap config = { { "NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add_RMSNorm" }, { "NPUW_DEVICES", "NPU" }, @@ -496,11 +496,23 @@ ov::AnyMap get_baseline_common_config() { { "NPUW_SLICE_OUT", "YES" }, { "NPUW_FUNCALL_ASYNC", "YES" } }; + if (enable_compiler_dq) { + config.emplace("NPUW_DQ", "YES"); + config.emplace("NPUW_DQ_FULL", "NO"); + config.emplace("NPU_COMPILER_DYNAMIC_QUANTIZATION", true); + config.erase("NPUW_DCOFF_TYPE"); + config.erase("NPUW_DCOFF_SCALE"); + } return config; } -ov::AnyMap get_default_common_config(const std::shared_ptr& model) { - auto config = get_baseline_common_config(); +bool enable_compiler_dq(const std::optional& npudesc) { + return npudesc.has_value() && npudesc->compiler_dq; +} + +ov::AnyMap get_default_common_config(const std::shared_ptr& model, + const std::optional& npudesc) { + auto config = get_baseline_common_config(enable_compiler_dq(npudesc)); const char* npu_l0 = std::getenv("DISABLE_OPENVINO_GENAI_NPU_L0"); if (npu_l0 && std::atoi(npu_l0) == 1) { config.emplace("NPUW_WEIGHTS_BANK_ALLOC", "CPU"); @@ -512,43 +524,40 @@ ov::AnyMap get_default_common_config(const std::shared_ptr& model) { ov::AnyMap get_default_prefill_config(const std::shared_ptr& model, const std::optional& npudesc) { - auto config = get_default_common_config(model); - if (is_cw_compressed(model)) { - config.emplace("NPUW_DQ", "YES"); - if (npudesc.has_value() && npudesc->compiler_dq) { - config.emplace("NPUW_DQ_FULL", "NO"); - config.emplace("NPU_COMPILER_DYNAMIC_QUANTIZATION", true); - } - } else { - config.emplace("NPUW_PMM", "NO"); - } + auto config = get_default_common_config(model, npudesc); if (npudesc.has_value() && npudesc->arch == "4000" && npudesc->max_tiles != -1) { config.emplace("NPU_DPU_GROUPS", npudesc->max_tiles); } + // Specify NPUW DQ if Compiler DQ is not enabled + if (!enable_compiler_dq(npudesc)) { + if (is_cw_compressed(model)) { + config.emplace("NPUW_DQ", "YES"); + } else { + config.emplace("NPUW_PMM", "NO"); + } + } return config; } ov::AnyMap get_default_generate_config(const std::shared_ptr& model, const std::optional& npudesc, const GenerateHint hint) { - auto config = get_default_common_config(model); + auto config = get_default_common_config(model, npudesc); if (hint == GenerateHint::BEST_PERF) { config.emplace("NPUW_ONLINE_PIPELINE", "NONE"); } - // NB: Unconditionally set for generation model - config.emplace("NPUW_DQ", "YES"); - if (npudesc.has_value() && npudesc->compiler_dq) { - config.emplace("NPUW_DQ_FULL", "NO"); - config.emplace("NPU_COMPILER_DYNAMIC_QUANTIZATION", true); - } if (npudesc.has_value() && npudesc->arch == "4000") { config.emplace("NPU_DPU_GROUPS", 4); } if (hint == GenerateHint::FAST_COMPILE) { config.emplace("NPUW_UNFOLD_IREQS", "YES"); } + // Specify NPUW DQ if Compiler DQ is not enabled + if (!enable_compiler_dq(npudesc)) { + config.emplace("NPUW_DQ", "YES"); + } return config; } From 3c72f4d8d03e8a91b9bd70f40a1128c63f28ce77 Mon Sep 17 00:00:00 2001 From: Alexey Smirnov Date: Mon, 13 Jan 2025 19:07:25 +0000 Subject: [PATCH 6/6] Address review comments --- src/cpp/src/llm_pipeline_static.cpp | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index a291b297d5..5fd02ca2bf 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -484,7 +484,7 @@ std::optional extract_npu_descriptor(ov::Core& core) { return std::make_optional(NPUDesc{arch, max_tiles, compiler_dq}); } -ov::AnyMap get_baseline_common_config(bool enable_compiler_dq) { +ov::AnyMap get_baseline_common_config(const std::optional& npudesc) { ov::AnyMap config = { { "NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add_RMSNorm" }, { "NPUW_DEVICES", "NPU" }, @@ -496,23 +496,20 @@ ov::AnyMap get_baseline_common_config(bool enable_compiler_dq) { { "NPUW_SLICE_OUT", "YES" }, { "NPUW_FUNCALL_ASYNC", "YES" } }; - if (enable_compiler_dq) { + // FIXME: this config logic is getting more and more complex + if (npudesc.has_value() && npudesc->compiler_dq) { config.emplace("NPUW_DQ", "YES"); config.emplace("NPUW_DQ_FULL", "NO"); - config.emplace("NPU_COMPILER_DYNAMIC_QUANTIZATION", true); + config.emplace("NPU_COMPILER_DYNAMIC_QUANTIZATION", "YES"); config.erase("NPUW_DCOFF_TYPE"); config.erase("NPUW_DCOFF_SCALE"); } return config; } -bool enable_compiler_dq(const std::optional& npudesc) { - return npudesc.has_value() && npudesc->compiler_dq; -} - ov::AnyMap get_default_common_config(const std::shared_ptr& model, const std::optional& npudesc) { - auto config = get_baseline_common_config(enable_compiler_dq(npudesc)); + auto config = get_baseline_common_config(npudesc); const char* npu_l0 = std::getenv("DISABLE_OPENVINO_GENAI_NPU_L0"); if (npu_l0 && std::atoi(npu_l0) == 1) { config.emplace("NPUW_WEIGHTS_BANK_ALLOC", "CPU"); @@ -531,7 +528,7 @@ ov::AnyMap get_default_prefill_config(const std::shared_ptr& model, config.emplace("NPU_DPU_GROUPS", npudesc->max_tiles); } // Specify NPUW DQ if Compiler DQ is not enabled - if (!enable_compiler_dq(npudesc)) { + if (!npudesc.has_value() || !npudesc->compiler_dq) { if (is_cw_compressed(model)) { config.emplace("NPUW_DQ", "YES"); } else { @@ -555,7 +552,7 @@ ov::AnyMap get_default_generate_config(const std::shared_ptr& model, config.emplace("NPUW_UNFOLD_IREQS", "YES"); } // Specify NPUW DQ if Compiler DQ is not enabled - if (!enable_compiler_dq(npudesc)) { + if (!npudesc.has_value() || !npudesc->compiler_dq) { config.emplace("NPUW_DQ", "YES"); } return config;