From ba70ef19c1678d260b0c932aa3b041b36fb44ce1 Mon Sep 17 00:00:00 2001
From: Alexey Smirnov <alexey.smirnov@intel.com>
Date: Thu, 9 Jan 2025 11:59:39 +0000
Subject: [PATCH 1/6] Update DQ query

---
 src/cpp/src/llm_pipeline_static.cpp | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)
diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp
index c98b571179..517a121d12 100644
--- a/src/cpp/src/llm_pipeline_static.cpp
+++ b/src/cpp/src/llm_pipeline_static.cpp
@@ -475,13 +475,7 @@ std::optional<NPUDesc> extract_npu_descriptor(ov::Core& core) {
     }
     const auto arch = core.get_property("NPU", ov::device::architecture);
     const auto max_tiles = core.get_property("NPU", ov::intel_npu::max_tiles);
-
-    bool compiler_dq = false;
-    const auto device_caps = core.get_property("NPU", ov::device::capabilities);
-    if (std::find(device_caps.begin(), device_caps.end(),
-                  "COMPILER_DYNAMIC_QUANTIZATION") != device_caps.end()) {
-        compiler_dq = true;
-    }
+    const auto compiler_dq = core.get_property("NPU", ov::intel_npu::compiler_dynamic_quantization);
     return std::make_optional(NPUDesc{arch, max_tiles, compiler_dq});
 }
 
@@ -526,6 +520,7 @@ ov::AnyMap get_default_prefill_config(const std::shared_ptr<ov::Model>& model,
     }
     if (npudesc.has_value() && npudesc->compiler_dq) {
         config.emplace("NPUW_DQ_FULL", "NO");
+        config.emplace("NPU_COMPILATION_MODE_PARAMS", "enable-weights-dynamic-dequantization=true");
     }
     return config;
 }
@@ -547,6 +542,7 @@ ov::AnyMap get_default_generate_config(const std::shared_ptr<ov::Model>& model,
     }
     if (npudesc.has_value() && npudesc->compiler_dq) {
         config.emplace("NPUW_DQ_FULL", "NO");
+        config.emplace("NPU_COMPILATION_MODE_PARAMS", "enable-weights-dynamic-dequantization=true");
     }
     return config;
 }

From f5dd5b177fa610173f5ce1421c2c9930a203d503 Mon Sep 17 00:00:00 2001
From: Alexey Smirnov <alexey.smirnov@intel.com>
Date: Mon, 13 Jan 2025 14:29:58 +0000
Subject: [PATCH 2/6] Unconditionally utilize compiler DQ

---
 src/cpp/src/llm_pipeline_static.cpp | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp
index 517a121d12..c07672b580 100644
--- a/src/cpp/src/llm_pipeline_static.cpp
+++ b/src/cpp/src/llm_pipeline_static.cpp
@@ -465,7 +465,6 @@ void merge_config_with(ov::AnyMap& lhs, const ov::AnyMap& rhs) {
 struct NPUDesc {
     std::string arch;
     int64_t max_tiles;
-    bool compiler_dq;
 };
 
 std::optional<NPUDesc> extract_npu_descriptor(ov::Core& core) {
@@ -475,8 +474,7 @@ std::optional<NPUDesc> extract_npu_descriptor(ov::Core& core) {
     }
     const auto arch = core.get_property("NPU", ov::device::architecture);
     const auto max_tiles = core.get_property("NPU", ov::intel_npu::max_tiles);
-    const auto compiler_dq = core.get_property("NPU", ov::intel_npu::compiler_dynamic_quantization);
-    return std::make_optional(NPUDesc{arch, max_tiles, compiler_dq});
+    return std::make_optional(NPUDesc{arch, max_tiles});
 }
 
 ov::AnyMap get_baseline_common_config() {
@@ -510,6 +508,8 @@ ov::AnyMap get_default_prefill_config(const std::shared_ptr<ov::Model>& model,
     auto config = get_default_common_config(model);
     if (is_cw_compressed(model)) {
         config.emplace("NPUW_DQ", "YES");
+        config.emplace("NPUW_DQ_FULL", "NO");
+        config.emplace("NPU_COMPILER_DYNAMIC_QUANTIZATION", true);
     } else {
         config.emplace("NPUW_PMM", "NO");
     }
@@ -518,10 +518,6 @@ ov::AnyMap get_default_prefill_config(const std::shared_ptr<ov::Model>& model,
         npudesc->max_tiles != -1) {
         config.emplace("NPU_DPU_GROUPS", npudesc->max_tiles);
     }
-    if (npudesc.has_value() && npudesc->compiler_dq) {
-        config.emplace("NPUW_DQ_FULL", "NO");
-        config.emplace("NPU_COMPILATION_MODE_PARAMS", "enable-weights-dynamic-dequantization=true");
-    }
     return config;
 }
 
@@ -534,16 +530,14 @@ ov::AnyMap get_default_generate_config(const std::shared_ptr<ov::Model>& model,
     }
     // NB: Unconditionally set for generation model
     config.emplace("NPUW_DQ", "YES");
+    config.emplace("NPUW_DQ_FULL", "NO");
+    config.emplace("NPU_COMPILER_DYNAMIC_QUANTIZATION", true);
     if (npudesc.has_value() && npudesc->arch == "4000") {
         config.emplace("NPU_DPU_GROUPS", 4);
     }
     if (hint == GenerateHint::FAST_COMPILE) {
         config.emplace("NPUW_UNFOLD_IREQS", "YES");
     }
-    if (npudesc.has_value() && npudesc->compiler_dq) {
-        config.emplace("NPUW_DQ_FULL", "NO");
-        config.emplace("NPU_COMPILATION_MODE_PARAMS", "enable-weights-dynamic-dequantization=true");
-    }
     return config;
 }
 

From cc44a0d70d5df2d900de2d5df8eea30b283a8c1a Mon Sep 17 00:00:00 2001
From: Alexey Smirnov <alexey.smirnov@intel.com>
Date: Mon, 13 Jan 2025 14:59:28 +0000
Subject: [PATCH 3/6] DQ only when in supported props

---
 src/cpp/src/llm_pipeline_static.cpp | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp
index c07672b580..8c5ca2ea4b 100644
--- a/src/cpp/src/llm_pipeline_static.cpp
+++ b/src/cpp/src/llm_pipeline_static.cpp
@@ -465,6 +465,7 @@ void merge_config_with(ov::AnyMap& lhs, const ov::AnyMap& rhs) {
 struct NPUDesc {
     std::string arch;
     int64_t max_tiles;
+    bool compiler_dq;
 };
 
 std::optional<NPUDesc> extract_npu_descriptor(ov::Core& core) {
@@ -474,7 +475,13 @@ std::optional<NPUDesc> extract_npu_descriptor(ov::Core& core) {
     }
     const auto arch = core.get_property("NPU", ov::device::architecture);
     const auto max_tiles = core.get_property("NPU", ov::intel_npu::max_tiles);
-    return std::make_optional(NPUDesc{arch, max_tiles});
+    bool compiler_dq = false;
+    const auto supported_properties = core.get_property("NPU", ov::supported_properties);
+    if (std::find(supported_properties.begin(), supported_properties.end(),
+                  "COMPILER_DYNAMIC_QUANTIZATION") != supported_properties.end()) {
+        compiler_dq = true;
+    }
+    return std::make_optional(NPUDesc{arch, max_tiles, compiler_dq});
 }
 
 ov::AnyMap get_baseline_common_config() {
@@ -508,8 +515,10 @@ ov::AnyMap get_default_prefill_config(const std::shared_ptr<ov::Model>& model,
     auto config = get_default_common_config(model);
     if (is_cw_compressed(model)) {
         config.emplace("NPUW_DQ", "YES");
-        config.emplace("NPUW_DQ_FULL", "NO");
-        config.emplace("NPU_COMPILER_DYNAMIC_QUANTIZATION", true);
+        if (npudesc.has_value() && npudesc->compiler_dq) {
+            config.emplace("NPUW_DQ_FULL", "NO");
+            config.emplace("NPU_COMPILER_DYNAMIC_QUANTIZATION", true);
+        }
     } else {
         config.emplace("NPUW_PMM", "NO");
     }
@@ -530,8 +539,10 @@ ov::AnyMap get_default_generate_config(const std::shared_ptr<ov::Model>& model,
     }
     // NB: Unconditionally set for generation model
     config.emplace("NPUW_DQ", "YES");
-    config.emplace("NPUW_DQ_FULL", "NO");
-    config.emplace("NPU_COMPILER_DYNAMIC_QUANTIZATION", true);
+    if (npudesc.has_value() && npudesc->compiler_dq) {
+        config.emplace("NPUW_DQ_FULL", "NO");
+        config.emplace("NPU_COMPILER_DYNAMIC_QUANTIZATION", true);
+    }
     if (npudesc.has_value() && npudesc->arch == "4000") {
         config.emplace("NPU_DPU_GROUPS", 4);
     }

From fa76cf716eaf80f361c0a468898068b649530fdf Mon Sep 17 00:00:00 2001
From: Alexey Smirnov <alexey.smirnov@intel.com>
Date: Mon, 13 Jan 2025 15:02:33 +0000
Subject: [PATCH 4/6] Add NPU_ prefix to compiler DQ property name

---
 src/cpp/src/llm_pipeline_static.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp
index 8c5ca2ea4b..ac0b2664ff 100644
--- a/src/cpp/src/llm_pipeline_static.cpp
+++ b/src/cpp/src/llm_pipeline_static.cpp
@@ -478,7 +478,7 @@ std::optional<NPUDesc> extract_npu_descriptor(ov::Core& core) {
     bool compiler_dq = false;
     const auto supported_properties = core.get_property("NPU", ov::supported_properties);
     if (std::find(supported_properties.begin(), supported_properties.end(),
-                  "COMPILER_DYNAMIC_QUANTIZATION") != supported_properties.end()) {
+                  "NPU_COMPILER_DYNAMIC_QUANTIZATION") != supported_properties.end()) {
         compiler_dq = true;
     }
     return std::make_optional(NPUDesc{arch, max_tiles, compiler_dq});

From 868a7ac40431c0371677bbd1fe917e509e4ed99f Mon Sep 17 00:00:00 2001
From: Alexey Smirnov <alexey.smirnov@intel.com>
Date: Mon, 13 Jan 2025 18:21:44 +0000
Subject: [PATCH 5/6] Align DQ behaviour

---
 src/cpp/src/llm_pipeline_static.cpp | 49 +++++++++++++++++------------
 1 file changed, 29 insertions(+), 20 deletions(-)

diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp
index ac0b2664ff..a291b297d5 100644
--- a/src/cpp/src/llm_pipeline_static.cpp
+++ b/src/cpp/src/llm_pipeline_static.cpp
@@ -484,7 +484,7 @@ std::optional<NPUDesc> extract_npu_descriptor(ov::Core& core) {
     return std::make_optional(NPUDesc{arch, max_tiles, compiler_dq});
 }
 
-ov::AnyMap get_baseline_common_config() {
+ov::AnyMap get_baseline_common_config(bool enable_compiler_dq) {
     ov::AnyMap config = {
         { "NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add_RMSNorm" },
         { "NPUW_DEVICES", "NPU" },
@@ -496,11 +496,23 @@ ov::AnyMap get_baseline_common_config() {
         { "NPUW_SLICE_OUT", "YES" },
         { "NPUW_FUNCALL_ASYNC", "YES" }
     };
+    if (enable_compiler_dq) {
+        config.emplace("NPUW_DQ", "YES");
+        config.emplace("NPUW_DQ_FULL", "NO");
+        config.emplace("NPU_COMPILER_DYNAMIC_QUANTIZATION", true);
+        config.erase("NPUW_DCOFF_TYPE");
+        config.erase("NPUW_DCOFF_SCALE");
+    }
     return config;
 }
 
-ov::AnyMap get_default_common_config(const std::shared_ptr<ov::Model>& model) {
-    auto config = get_baseline_common_config();
+bool enable_compiler_dq(const std::optional<NPUDesc>& npudesc) {
+    return npudesc.has_value() && npudesc->compiler_dq;
+}
+
+ov::AnyMap get_default_common_config(const std::shared_ptr<ov::Model>& model,
+                                     const std::optional<NPUDesc>& npudesc) {
+    auto config = get_baseline_common_config(enable_compiler_dq(npudesc));
     const char* npu_l0 = std::getenv("DISABLE_OPENVINO_GENAI_NPU_L0");
     if (npu_l0 && std::atoi(npu_l0) == 1) {
         config.emplace("NPUW_WEIGHTS_BANK_ALLOC", "CPU");
@@ -512,43 +524,40 @@ ov::AnyMap get_default_common_config(const std::shared_ptr<ov::Model>& model) {
 
 ov::AnyMap get_default_prefill_config(const std::shared_ptr<ov::Model>& model,
                                       const std::optional<NPUDesc>& npudesc) {
-    auto config = get_default_common_config(model);
-    if (is_cw_compressed(model)) {
-        config.emplace("NPUW_DQ", "YES");
-        if (npudesc.has_value() && npudesc->compiler_dq) {
-            config.emplace("NPUW_DQ_FULL", "NO");
-            config.emplace("NPU_COMPILER_DYNAMIC_QUANTIZATION", true);
-        }
-    } else {
-        config.emplace("NPUW_PMM", "NO");
-    }
+    auto config = get_default_common_config(model, npudesc);
     if (npudesc.has_value() &&
         npudesc->arch == "4000" &&
         npudesc->max_tiles != -1) {
         config.emplace("NPU_DPU_GROUPS", npudesc->max_tiles);
     }
+    // Specify NPUW DQ if Compiler DQ is not enabled
+    if (!enable_compiler_dq(npudesc)) {
+        if (is_cw_compressed(model)) {
+            config.emplace("NPUW_DQ", "YES");
+        } else {
+            config.emplace("NPUW_PMM", "NO");
+        }
+    }
     return config;
 }
 
 ov::AnyMap get_default_generate_config(const std::shared_ptr<ov::Model>& model,
                                        const std::optional<NPUDesc>& npudesc,
                                        const GenerateHint hint) {
-    auto config = get_default_common_config(model);
+    auto config = get_default_common_config(model, npudesc);
     if (hint == GenerateHint::BEST_PERF) {
         config.emplace("NPUW_ONLINE_PIPELINE", "NONE");
     }
-    // NB: Unconditionally set for generation model
-    config.emplace("NPUW_DQ", "YES");
-    if (npudesc.has_value() && npudesc->compiler_dq) {
-        config.emplace("NPUW_DQ_FULL", "NO");
-        config.emplace("NPU_COMPILER_DYNAMIC_QUANTIZATION", true);
-    }
     if (npudesc.has_value() && npudesc->arch == "4000") {
         config.emplace("NPU_DPU_GROUPS", 4);
     }
     if (hint == GenerateHint::FAST_COMPILE) {
         config.emplace("NPUW_UNFOLD_IREQS", "YES");
     }
+    // Specify NPUW DQ if Compiler DQ is not enabled
+    if (!enable_compiler_dq(npudesc)) {
+        config.emplace("NPUW_DQ", "YES");
+    }
     return config;
 }
 

From 3c72f4d8d03e8a91b9bd70f40a1128c63f28ce77 Mon Sep 17 00:00:00 2001
From: Alexey Smirnov <alexey.smirnov@intel.com>
Date: Mon, 13 Jan 2025 19:07:25 +0000
Subject: [PATCH 6/6] Address review: pass NPUDesc to baseline config, drop DQ helper

---
 src/cpp/src/llm_pipeline_static.cpp | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp
index a291b297d5..5fd02ca2bf 100644
--- a/src/cpp/src/llm_pipeline_static.cpp
+++ b/src/cpp/src/llm_pipeline_static.cpp
@@ -484,7 +484,7 @@ std::optional<NPUDesc> extract_npu_descriptor(ov::Core& core) {
     return std::make_optional(NPUDesc{arch, max_tiles, compiler_dq});
 }
 
-ov::AnyMap get_baseline_common_config(bool enable_compiler_dq) {
+ov::AnyMap get_baseline_common_config(const std::optional<NPUDesc>& npudesc) {
     ov::AnyMap config = {
         { "NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add_RMSNorm" },
         { "NPUW_DEVICES", "NPU" },
@@ -496,23 +496,20 @@ ov::AnyMap get_baseline_common_config(bool enable_compiler_dq) {
         { "NPUW_SLICE_OUT", "YES" },
         { "NPUW_FUNCALL_ASYNC", "YES" }
     };
-    if (enable_compiler_dq) {
+    // FIXME: this config logic is getting more and more complex
+    if (npudesc.has_value() && npudesc->compiler_dq) {
         config.emplace("NPUW_DQ", "YES");
         config.emplace("NPUW_DQ_FULL", "NO");
-        config.emplace("NPU_COMPILER_DYNAMIC_QUANTIZATION", true);
+        config.emplace("NPU_COMPILER_DYNAMIC_QUANTIZATION", "YES");
         config.erase("NPUW_DCOFF_TYPE");
         config.erase("NPUW_DCOFF_SCALE");
     }
     return config;
 }
 
-bool enable_compiler_dq(const std::optional<NPUDesc>& npudesc) {
-    return npudesc.has_value() && npudesc->compiler_dq;
-}
-
 ov::AnyMap get_default_common_config(const std::shared_ptr<ov::Model>& model,
                                      const std::optional<NPUDesc>& npudesc) {
-    auto config = get_baseline_common_config(enable_compiler_dq(npudesc));
+    auto config = get_baseline_common_config(npudesc);
     const char* npu_l0 = std::getenv("DISABLE_OPENVINO_GENAI_NPU_L0");
     if (npu_l0 && std::atoi(npu_l0) == 1) {
         config.emplace("NPUW_WEIGHTS_BANK_ALLOC", "CPU");
@@ -531,7 +528,7 @@ ov::AnyMap get_default_prefill_config(const std::shared_ptr<ov::Model>& model,
         config.emplace("NPU_DPU_GROUPS", npudesc->max_tiles);
     }
     // Specify NPUW DQ if Compiler DQ is not enabled
-    if (!enable_compiler_dq(npudesc)) {
+    if (!npudesc.has_value() || !npudesc->compiler_dq) {
         if (is_cw_compressed(model)) {
             config.emplace("NPUW_DQ", "YES");
         } else {
@@ -555,7 +552,7 @@ ov::AnyMap get_default_generate_config(const std::shared_ptr<ov::Model>& model,
         config.emplace("NPUW_UNFOLD_IREQS", "YES");
     }
     // Specify NPUW DQ if Compiler DQ is not enabled
-    if (!enable_compiler_dq(npudesc)) {
+    if (!npudesc.has_value() || !npudesc->compiler_dq) {
         config.emplace("NPUW_DQ", "YES");
     }
     return config;