diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index 10a32c4731..b29bec3b4a 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -475,17 +475,16 @@ std::optional<NPUDesc> extract_npu_descriptor(ov::Core& core) { } const auto arch = core.get_property("NPU", ov::device::architecture); const auto max_tiles = core.get_property("NPU", ov::intel_npu::max_tiles); bool compiler_dq = false; - const auto device_caps = core.get_property("NPU", ov::device::capabilities); - if (std::find(device_caps.begin(), device_caps.end(), - "COMPILER_DYNAMIC_QUANTIZATION") != device_caps.end()) { + const auto supported_properties = core.get_property("NPU", ov::supported_properties); + if (std::find(supported_properties.begin(), supported_properties.end(), + "NPU_COMPILER_DYNAMIC_QUANTIZATION") != supported_properties.end()) { compiler_dq = true; } return std::make_optional(NPUDesc{arch, max_tiles, compiler_dq}); } -ov::AnyMap get_baseline_common_config() { +ov::AnyMap get_baseline_common_config(const std::optional<NPUDesc>& npudesc) { ov::AnyMap config = { { "NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add_RMSNorm" }, { "NPUW_DEVICES", "NPU" }, @@ -497,11 +496,20 @@ ov::AnyMap get_baseline_common_config() { { "NPUW_SLICE_OUT", "YES" }, { "NPUW_FUNCALL_ASYNC", "YES" } }; + // FIXME: this config logic is getting more and more complex + if (npudesc.has_value() && npudesc->compiler_dq) { + config.emplace("NPUW_DQ", "YES"); + config.emplace("NPUW_DQ_FULL", "NO"); + config.emplace("NPU_COMPILER_DYNAMIC_QUANTIZATION", "YES"); + config.erase("NPUW_DCOFF_TYPE"); + config.erase("NPUW_DCOFF_SCALE"); + } return config; } -ov::AnyMap get_default_common_config(const std::shared_ptr<ov::Model>& model) { - auto config = get_baseline_common_config(); +ov::AnyMap get_default_common_config(const std::shared_ptr<ov::Model>& model, + const std::optional<NPUDesc>& npudesc) { + auto config = get_baseline_common_config(npudesc); const 
char* npu_l0 = std::getenv("DISABLE_OPENVINO_GENAI_NPU_L0"); if (npu_l0 && std::atoi(npu_l0) == 1) { config.emplace("NPUW_WEIGHTS_BANK_ALLOC", "CPU"); @@ -513,19 +521,19 @@ ov::AnyMap get_default_common_config(const std::shared_ptr<ov::Model>& model) { ov::AnyMap get_default_prefill_config(const std::shared_ptr<ov::Model>& model, const std::optional<NPUDesc>& npudesc) { - auto config = get_default_common_config(model); - if (is_cw_compressed(model)) { - config.emplace("NPUW_DQ", "YES"); - } else { - config.emplace("NPUW_PMM", "NO"); - } + auto config = get_default_common_config(model, npudesc); if (npudesc.has_value() && npudesc->arch == "4000" && npudesc->max_tiles != -1) { config.emplace("NPU_DPU_GROUPS", npudesc->max_tiles); } - if (npudesc.has_value() && npudesc->compiler_dq) { - config.emplace("NPUW_DQ_FULL", "NO"); + // Specify NPUW DQ if Compiler DQ is not enabled + if (!npudesc.has_value() || !npudesc->compiler_dq) { + if (is_cw_compressed(model)) { + config.emplace("NPUW_DQ", "YES"); + } else { + config.emplace("NPUW_PMM", "NO"); + } } return config; } @@ -533,20 +541,19 @@ ov::AnyMap get_default_prefill_config(const std::shared_ptr<ov::Model>& model, ov::AnyMap get_default_generate_config(const std::shared_ptr<ov::Model>& model, const std::optional<NPUDesc>& npudesc, const GenerateHint hint) { - auto config = get_default_common_config(model); + auto config = get_default_common_config(model, npudesc); if (hint == GenerateHint::BEST_PERF) { config.emplace("NPUW_ONLINE_PIPELINE", "NONE"); } - // NB: Unconditionally set for generation model - config.emplace("NPUW_DQ", "YES"); if (npudesc.has_value() && npudesc->arch == "4000") { config.emplace("NPU_DPU_GROUPS", 4); } if (hint == GenerateHint::FAST_COMPILE) { config.emplace("NPUW_UNFOLD_IREQS", "YES"); } - if (npudesc.has_value() && npudesc->compiler_dq) { - config.emplace("NPUW_DQ_FULL", "NO"); + // Specify NPUW DQ if Compiler DQ is not enabled + if (!npudesc.has_value() || !npudesc->compiler_dq) { + config.emplace("NPUW_DQ", "YES"); } return config; }