From c904b8e182d0f62bda6fa2781cdbdeb79678344c Mon Sep 17 00:00:00 2001
From: "Wang, Yang"
Date: Thu, 23 May 2024 10:02:46 +0800
Subject: [PATCH] [AUTO] Load cached model to target device W/O CPU
 accelerating (#24618)

### Details:
- Update the loading logic so that AUTO loads the cached model to the target GPU directly, skipping the CPU accelerator, when a GPU cached blob already exists and `ov::intel_auto::enable_startup_fallback` is enabled.

### Tickets:
- CVS-138574
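For context, a minimal usage sketch of the behavior this change targets (the model path and cache directory below are placeholders, not part of this patch):

```cpp
#include <openvino/openvino.hpp>
#include <openvino/runtime/auto/properties.hpp>

int main() {
    ov::Core core;
    core.set_property(ov::cache_dir("./ov_cache"));  // placeholder cache directory
    auto model = core.read_model("model.xml");       // placeholder model path

    // First compilation: no GPU blob is cached yet, so AUTO serves early
    // inference requests on CPU while the GPU compiles in the background,
    // then caches the resulting GPU blob.
    auto compiled_first = core.compile_model(model,
                                             "AUTO",
                                             {ov::device::priorities("GPU", "CPU"),
                                              ov::intel_auto::enable_startup_fallback(true)});

    // Later compilation (e.g. the next process run): the GPU cached blob
    // exists, so with this patch AUTO loads it to GPU directly and does not
    // spin up the CPU accelerator at all.
    auto compiled_second = core.compile_model(model,
                                              "AUTO",
                                              {ov::device::priorities("GPU", "CPU"),
                                               ov::intel_auto::enable_startup_fallback(true)});
    return 0;
}
```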
---
 src/plugins/auto/src/auto_schedule.cpp        | 44 +++++++-----------
 src/plugins/auto/src/common.hpp               |  1 -
 src/plugins/auto/src/plugin.cpp               |  1 -
 .../functional/behavior/caching_test.cpp      | 45 ++++++++++++++++++-
 4 files changed, 59 insertions(+), 32 deletions(-)

diff --git a/src/plugins/auto/src/auto_schedule.cpp b/src/plugins/auto/src/auto_schedule.cpp
index 0c248a2b491054..3de0cc9f00b871 100644
--- a/src/plugins/auto/src/auto_schedule.cpp
+++ b/src/plugins/auto/src/auto_schedule.cpp
@@ -133,7 +133,6 @@ void AutoSchedule::init() {
     if (m_compile_context[ACTUALDEVICE].m_is_enabled) {
         LOG_INFO_TAG("select device:%s", m_compile_context[ACTUALDEVICE].m_device_info.device_name.c_str());
         bool is_actual_cpu = m_compile_context[ACTUALDEVICE].m_device_info.device_name.find("CPU") != std::string::npos;
-        bool is_actual_gpu = m_compile_context[ACTUALDEVICE].m_device_info.device_name.find("GPU") != std::string::npos;
         // if Actual device is CPU or perf_hint is cumulative, disabled m_compile_context[CPU], only use
         // m_compile_context[ACTUALDEVICE]
         if (is_actual_cpu || !m_context->m_startup_fallback) {
@@ -148,29 +147,11 @@ void AutoSchedule::init() {
             // limit the threads num for compiling
             auto device = m_compile_context[ACTUALDEVICE].m_device_info.device_name;
             auto& device_config = m_compile_context[ACTUALDEVICE].m_device_info.config;
-            if (is_actual_gpu) {
-                int max_threads = 0;
-                try {
-                    max_threads = m_context->m_ov_core->get_property(device, ov::compilation_num_threads);
-                } catch (const ov::Exception&) {
-                    LOG_DEBUG_TAG("cannot get MAX_NUM_THREADS from GPU");
-                }
-                if (max_threads == static_cast<int>(std::thread::hardware_concurrency())) {
-                    int thread_num = max_threads / 2;
-                    m_compile_context[ACTUALDEVICE].m_device_info.config.insert(
-                        ov::compilation_num_threads(thread_num));
-                    LOG_DEBUG_TAG("gpu streams number for compiling: %d", thread_num);
-                } else {
-                    // user set the compiling threads num
-                    // use the user's val anyway
-                    LOG_DEBUG_TAG("user defined compiling threads: %d", max_threads);
-                }
-            }
             std::string cache_dir = device_config.count(ov::cache_dir.name())
                                         ? device_config[ov::cache_dir.name()].as<std::string>()
                                         : m_context->m_ov_core->get_property("", ov::cache_dir);
-            if (!m_context->m_is_set_startup_fallback && !cache_dir.empty()) {
+            if (m_context->m_startup_fallback && !cache_dir.empty()) {
                 const auto properties =
                     m_context->m_ov_core->create_compile_config(ov::DeviceIDParser(device).get_device_name(),
                                                                 device_config);
@@ -323,15 +304,20 @@ void AutoSchedule::try_to_compile_model(AutoCompileContext& context, const std::
                                  device_config.find(ov::compilation_num_threads.name()) != device_config.end());
         if (cur_dev_is_gpu && m_compile_context[CPU].m_is_enabled && !is_already_set_gpu) {
             device_config.insert(ov::intel_gpu::hint::host_task_priority(ov::hint::Priority::HIGH));
-            auto proc_type_table = get_org_proc_type_table();
-            int compilation_num_threads = proc_type_table[0][MAIN_CORE_PROC] != 0
-                                              ? proc_type_table[0][MAIN_CORE_PROC]
-                                              : proc_type_table[0][EFFICIENT_CORE_PROC];
-            if (device_config.insert(ov::compilation_num_threads(compilation_num_threads)).second)
-                LOG_DEBUG_TAG("gpu streams number for compiling: %d", compilation_num_threads);
-            else
-                LOG_DEBUG_TAG("user defined compiling threads: %d",
-                              device_config[ov::compilation_num_threads.name()].as<int>());
+            int max_threads = 0;
+            try {
+                m_context->m_ov_core->get_property(device, ov::compilation_num_threads);
+                auto proc_type_table = get_org_proc_type_table();
+                max_threads = proc_type_table[0][MAIN_CORE_PROC] != 0 ? proc_type_table[0][MAIN_CORE_PROC]
+                                                                      : proc_type_table[0][EFFICIENT_CORE_PROC];
+                if (device_config.insert(ov::compilation_num_threads(max_threads)).second)
+                    LOG_DEBUG_TAG("gpu streams number for compiling: %d", max_threads);
+                else
+                    LOG_DEBUG_TAG("user defined compiling threads: %d",
+                                  device_config[ov::compilation_num_threads.name()].as<int>());
+            } catch (const ov::Exception&) {
+                LOG_DEBUG_TAG("cannot get MAX_NUM_THREADS from GPU");
+            }
         }
     }
     try {
diff --git a/src/plugins/auto/src/common.hpp b/src/plugins/auto/src/common.hpp
index 63fb8753e4fff2..28567eb23392c4 100644
--- a/src/plugins/auto/src/common.hpp
+++ b/src/plugins/auto/src/common.hpp
@@ -207,7 +207,6 @@ class ScheduleContext : public std::enable_shared_from_this<ScheduleContext> {
     bool m_need_perf_counters;
     bool m_batching_disabled = false;
     bool m_startup_fallback = true;
-    bool m_is_set_startup_fallback = false;
     bool m_runtime_fallback = true;
     bool m_bind_buffer = false;
     std::shared_ptr<ov::Model> m_model;
diff --git a/src/plugins/auto/src/plugin.cpp b/src/plugins/auto/src/plugin.cpp
index 9d8174252d21c9..06b3b7dbc947e4 100644
--- a/src/plugins/auto/src/plugin.cpp
+++ b/src/plugins/auto/src/plugin.cpp
@@ -436,7 +436,6 @@ std::shared_ptr<ov::ICompiledModel> Plugin::compile_model_impl(const std::string
     OPENVINO_ASSERT(auto_s_context->m_ov_core);
     auto_s_context->m_log_tag = get_device_name();
     auto_s_context->m_model_precision = model_precision;
-    auto_s_context->m_is_set_startup_fallback = load_config.is_set_by_user(ov::intel_auto::enable_startup_fallback);
     auto_s_context->m_startup_fallback = load_config.get_property(ov::intel_auto::enable_startup_fallback);
     auto_s_context->m_runtime_fallback = load_config.get_property(ov::intel_auto::enable_runtime_fallback);
     auto_s_context->m_bind_buffer = load_config.get_property(ov::intel_auto::device_bind_buffer);
diff --git a/src/plugins/auto/tests/functional/behavior/caching_test.cpp b/src/plugins/auto/tests/functional/behavior/caching_test.cpp
index 1b606470fa2f53..1b2df23f9c0d1c 100644
--- a/src/plugins/auto/tests/functional/behavior/caching_test.cpp
+++ b/src/plugins/auto/tests/functional/behavior/caching_test.cpp
@@ -32,7 +32,7 @@ TEST_F(AutoFuncTests, compiled_with_cache_enabled) {
     core.set_property(ov::cache_dir(""));
 }
 
-TEST_F(AutoFuncTests, load_cached_model_to_actual_device_and_disable_CPU_accelerating) {
+TEST_F(AutoFuncTests, load_cached_model_to_actual_device_and_disable_CPU_accelerating_default_startup_fallback) {
     core.set_property(ov::cache_dir(cache_path));
     core.set_property("MOCK_GPU", ov::device::id("test"));  // device id for cache property distinguish with MOCK_CPU
     {
@@ -74,6 +74,49 @@ TEST_F(AutoFuncTests, load_cached_model_to_actual_device_and_disable_CPU_acceler
     core.set_property(ov::cache_dir(""));
 }
 
+TEST_F(AutoFuncTests, load_cached_model_to_actual_device_and_disable_CPU_accelerating_set_startup_fallback) {
+    core.set_property(ov::cache_dir(cache_path));
+    core.set_property("MOCK_GPU", ov::device::id("test"));  // device id for cache property distinguish with MOCK_CPU
+    {
+        auto compiled_model = core.compile_model(model_cannot_batch,
+                                                 "AUTO",
+                                                 {ov::device::priorities("MOCK_GPU", "MOCK_CPU"),
+                                                  ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT)});
+    }
+    // No cached model for actual device
+    // will cache model for both actual device and CPU plugin
+    ASSERT_EQ(ov::test::utils::listFilesWithExt(cache_path, "blob").size(), 2);
+    ov::test::utils::removeFilesWithExt(cache_path, "blob");
+    {
+        auto compiled_model = core.compile_model(
+            model_cannot_batch,
+            "AUTO",
+            {ov::device::priorities("MOCK_GPU"), ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT)});
+    }
+    {
+        auto compiled_model = core.compile_model(model_cannot_batch,
+                                                 "AUTO",
+                                                 {ov::device::priorities("MOCK_GPU", "MOCK_CPU"),
+                                                  ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT),
+                                                  ov::intel_auto::enable_startup_fallback(true)});
+    }
+    // cached model exists for actual device
+    // will reuse cached model for actual device without CPU accelerating(No cached model for CPU)
+    ASSERT_EQ(ov::test::utils::listFilesWithExt(cache_path, "blob").size(), 1);
+    core.set_property("MOCK_GPU", ov::device::id("test_regenerate"));
+    {
+        auto compiled_model = core.compile_model(model_cannot_batch,
+                                                 "AUTO",
+                                                 {ov::device::priorities("MOCK_GPU", "MOCK_CPU"),
+                                                  ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT),
+                                                  ov::intel_auto::enable_startup_fallback(false)});
+    }
+    // model hash id changed for actual device
+    // will cache 2 models for actual device and no cached model for CPU
+    ASSERT_EQ(ov::test::utils::listFilesWithExt(cache_path, "blob").size(), 2);
+    core.set_property(ov::cache_dir(""));
+}
+
 TEST_F(AutoFuncTests, compiled_with_cache_enabled_batch_enabled) {
 #ifdef ENABLE_AUTO_BATCH
     core.set_property(ov::cache_dir(cache_path));