From c904b8e182d0f62bda6fa2781cdbdeb79678344c Mon Sep 17 00:00:00 2001
From: "Wang, Yang"
Date: Thu, 23 May 2024 10:02:46 +0800
Subject: [PATCH] [AUTO] Load cached model to target device W/O CPU
 accelerating (#24618)

### Details:
- Update the loading logic so that AUTO loads the cached model to the target GPU directly, skipping the CPU accelerator, when a GPU cached blob already exists and `ov::intel_auto::enable_startup_fallback` is enabled.

### Tickets:
- CVS-138574
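For context, a minimal usage sketch of the behavior this change targets (the model path and cache directory below are placeholders, not part of this patch):

```cpp
#include <openvino/openvino.hpp>
#include <openvino/runtime/auto/properties.hpp>

int main() {
    ov::Core core;
    core.set_property(ov::cache_dir("./ov_cache"));  // placeholder cache directory
    auto model = core.read_model("model.xml");       // placeholder model path

    // First compilation: no GPU blob is cached yet, so AUTO serves early
    // inference requests on CPU while the GPU compiles in the background,
    // then caches the resulting GPU blob.
    auto compiled_first = core.compile_model(model,
                                             "AUTO",
                                             {ov::device::priorities("GPU", "CPU"),
                                              ov::intel_auto::enable_startup_fallback(true)});

    // Later compilation (e.g. the next process run): the GPU cached blob
    // exists, so with this patch AUTO loads it to GPU directly and does not
    // spin up the CPU accelerator at all.
    auto compiled_second = core.compile_model(model,
                                              "AUTO",
                                              {ov::device::priorities("GPU", "CPU"),
                                               ov::intel_auto::enable_startup_fallback(true)});
    return 0;
}
```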
---
 src/plugins/auto/src/auto_schedule.cpp        | 44 +++++++-----------
 src/plugins/auto/src/common.hpp               |  1 -
 src/plugins/auto/src/plugin.cpp               |  1 -
 .../functional/behavior/caching_test.cpp      | 45 ++++++++++++++++++-
 4 files changed, 59 insertions(+), 32 deletions(-)

diff --git a/src/plugins/auto/src/auto_schedule.cpp b/src/plugins/auto/src/auto_schedule.cpp
index 0c248a2b491054..3de0cc9f00b871 100644
--- a/src/plugins/auto/src/auto_schedule.cpp
+++ b/src/plugins/auto/src/auto_schedule.cpp
@@ -133,7 +133,6 @@ void AutoSchedule::init() {
     if (m_compile_context[ACTUALDEVICE].m_is_enabled) {
         LOG_INFO_TAG("select device:%s", m_compile_context[ACTUALDEVICE].m_device_info.device_name.c_str());
         bool is_actual_cpu = m_compile_context[ACTUALDEVICE].m_device_info.device_name.find("CPU") != std::string::npos;
-        bool is_actual_gpu = m_compile_context[ACTUALDEVICE].m_device_info.device_name.find("GPU") != std::string::npos;
         // if Actual device is CPU or perf_hint is cumulative, disabled m_compile_context[CPU], only use
         // m_compile_context[ACTUALDEVICE]
         if (is_actual_cpu || !m_context->m_startup_fallback) {
@@ -148,29 +147,11 @@ void AutoSchedule::init() {
             // limit the threads num for compiling
             auto device = m_compile_context[ACTUALDEVICE].m_device_info.device_name;
             auto& device_config = m_compile_context[ACTUALDEVICE].m_device_info.config;
-            if (is_actual_gpu) {
-                int max_threads = 0;
-                try {
-                    max_threads = m_context->m_ov_core->get_property(device, ov::compilation_num_threads);
-                } catch (const ov::Exception&) {
-                    LOG_DEBUG_TAG("cannot get MAX_NUM_THREADS from GPU");
-                }
-                if (max_threads == static_cast<int>(std::thread::hardware_concurrency())) {
-                    int thread_num = max_threads / 2;
-                    m_compile_context[ACTUALDEVICE].m_device_info.config.insert(
-                        ov::compilation_num_threads(thread_num));
-                    LOG_DEBUG_TAG("gpu streams number for compiling: %d", thread_num);
-                } else {
-                    // user set the compiling threads num
-                    // use the user's val anyway
-                    LOG_DEBUG_TAG("user defined compiling threads: %d", max_threads);
-                }
-            }
             std::string cache_dir = device_config.count(ov::cache_dir.name())
                                         ? device_config[ov::cache_dir.name()].as<std::string>()
                                         : m_context->m_ov_core->get_property("", ov::cache_dir);
-            if (!m_context->m_is_set_startup_fallback && !cache_dir.empty()) {
+            if (m_context->m_startup_fallback && !cache_dir.empty()) {
                 const auto properties =
                     m_context->m_ov_core->create_compile_config(ov::DeviceIDParser(device).get_device_name(),
                                                                 device_config);
@@ -323,15 +304,20 @@ void AutoSchedule::try_to_compile_model(AutoCompileContext& context, const std::
                                  device_config.find(ov::compilation_num_threads.name()) != device_config.end());
         if (cur_dev_is_gpu && m_compile_context[CPU].m_is_enabled && !is_already_set_gpu) {
             device_config.insert(ov::intel_gpu::hint::host_task_priority(ov::hint::Priority::HIGH));
-            auto proc_type_table = get_org_proc_type_table();
-            int compilation_num_threads = proc_type_table[0][MAIN_CORE_PROC] != 0
-                                              ? proc_type_table[0][MAIN_CORE_PROC]
-                                              : proc_type_table[0][EFFICIENT_CORE_PROC];
-            if (device_config.insert(ov::compilation_num_threads(compilation_num_threads)).second)
-                LOG_DEBUG_TAG("gpu streams number for compiling: %d", compilation_num_threads);
-            else
-                LOG_DEBUG_TAG("user defined compiling threads: %d",
-                              device_config[ov::compilation_num_threads.name()].as<int>());
+            int max_threads = 0;
+            try {
+                m_context->m_ov_core->get_property(device, ov::compilation_num_threads);
+                auto proc_type_table = get_org_proc_type_table();
+                max_threads = proc_type_table[0][MAIN_CORE_PROC] != 0 ? proc_type_table[0][MAIN_CORE_PROC]
+                                                                      : proc_type_table[0][EFFICIENT_CORE_PROC];
+                if (device_config.insert(ov::compilation_num_threads(max_threads)).second)
+                    LOG_DEBUG_TAG("gpu streams number for compiling: %d", max_threads);
+                else
+                    LOG_DEBUG_TAG("user defined compiling threads: %d",
+                                  device_config[ov::compilation_num_threads.name()].as<int>());
+            } catch (const ov::Exception&) {
+                LOG_DEBUG_TAG("cannot get MAX_NUM_THREADS from GPU");
+            }
         }
     }
     try {
diff --git a/src/plugins/auto/src/common.hpp b/src/plugins/auto/src/common.hpp
index 63fb8753e4fff2..28567eb23392c4 100644
--- a/src/plugins/auto/src/common.hpp
+++ b/src/plugins/auto/src/common.hpp
@@ -207,7 +207,6 @@ class ScheduleContext : public std::enable_shared_from_this<ScheduleContext> {
     bool m_need_perf_counters;
     bool m_batching_disabled = false;
     bool m_startup_fallback = true;
-    bool m_is_set_startup_fallback = false;
     bool m_runtime_fallback = true;
     bool m_bind_buffer = false;
     std::shared_ptr<ov::Model> m_model;
diff --git a/src/plugins/auto/src/plugin.cpp b/src/plugins/auto/src/plugin.cpp
index 9d8174252d21c9..06b3b7dbc947e4 100644
--- a/src/plugins/auto/src/plugin.cpp
+++ b/src/plugins/auto/src/plugin.cpp
@@ -436,7 +436,6 @@ std::shared_ptr<ov::ICompiledModel> Plugin::compile_model_impl(const std::string
     OPENVINO_ASSERT(auto_s_context->m_ov_core);
     auto_s_context->m_log_tag = get_device_name();
     auto_s_context->m_model_precision = model_precision;
-    auto_s_context->m_is_set_startup_fallback = load_config.is_set_by_user(ov::intel_auto::enable_startup_fallback);
     auto_s_context->m_startup_fallback = load_config.get_property(ov::intel_auto::enable_startup_fallback);
     auto_s_context->m_runtime_fallback = load_config.get_property(ov::intel_auto::enable_runtime_fallback);
     auto_s_context->m_bind_buffer = load_config.get_property(ov::intel_auto::device_bind_buffer);
diff --git a/src/plugins/auto/tests/functional/behavior/caching_test.cpp b/src/plugins/auto/tests/functional/behavior/caching_test.cpp
index 1b606470fa2f53..1b2df23f9c0d1c 100644
--- a/src/plugins/auto/tests/functional/behavior/caching_test.cpp
+++ b/src/plugins/auto/tests/functional/behavior/caching_test.cpp
@@ -32,7 +32,7 @@ TEST_F(AutoFuncTests, compiled_with_cache_enabled) {
     core.set_property(ov::cache_dir(""));
 }
 
-TEST_F(AutoFuncTests, load_cached_model_to_actual_device_and_disable_CPU_accelerating) {
+TEST_F(AutoFuncTests, load_cached_model_to_actual_device_and_disable_CPU_accelerating_default_startup_fallback) {
     core.set_property(ov::cache_dir(cache_path));
     core.set_property("MOCK_GPU", ov::device::id("test"));  // device id for cache property distinguish with MOCK_CPU
     {
@@ -74,6 +74,49 @@ TEST_F(AutoFuncTests, load_cached_model_to_actual_device_and_disable_CPU_acceler
     core.set_property(ov::cache_dir(""));
 }
 
+TEST_F(AutoFuncTests, load_cached_model_to_actual_device_and_disable_CPU_accelerating_set_startup_fallback) {
+    core.set_property(ov::cache_dir(cache_path));
+    core.set_property("MOCK_GPU", ov::device::id("test"));  // device id for cache property distinguish with MOCK_CPU
+    {
+        auto compiled_model = core.compile_model(model_cannot_batch,
+                                                 "AUTO",
+                                                 {ov::device::priorities("MOCK_GPU", "MOCK_CPU"),
+                                                  ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT)});
+    }
+    // No cached model for actual device
+    // will cache model for both actual device and CPU plugin
+    ASSERT_EQ(ov::test::utils::listFilesWithExt(cache_path, "blob").size(), 2);
+    ov::test::utils::removeFilesWithExt(cache_path, "blob");
+    {
+        auto compiled_model = core.compile_model(
+            model_cannot_batch,
+            "AUTO",
+            {ov::device::priorities("MOCK_GPU"), ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT)});
+    }
+    {
+        auto compiled_model = core.compile_model(model_cannot_batch,
+                                                 "AUTO",
+                                                 {ov::device::priorities("MOCK_GPU", "MOCK_CPU"),
+                                                  ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT),
+                                                  ov::intel_auto::enable_startup_fallback(true)});
+    }
+    // cached model exists for actual device
+    // will reuse cached model for actual device without CPU accelerating(No cached model for CPU)
+    ASSERT_EQ(ov::test::utils::listFilesWithExt(cache_path, "blob").size(), 1);
+    core.set_property("MOCK_GPU", ov::device::id("test_regenerate"));
+    {
+        auto compiled_model = core.compile_model(model_cannot_batch,
+                                                 "AUTO",
+                                                 {ov::device::priorities("MOCK_GPU", "MOCK_CPU"),
+                                                  ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT),
+                                                  ov::intel_auto::enable_startup_fallback(false)});
+    }
+    // model hash id changed for actual device
+    // will cache 2 models for actual device and no cached model for CPU
+    ASSERT_EQ(ov::test::utils::listFilesWithExt(cache_path, "blob").size(), 2);
+    core.set_property(ov::cache_dir(""));
+}
+
 TEST_F(AutoFuncTests, compiled_with_cache_enabled_batch_enabled) {
 #ifdef ENABLE_AUTO_BATCH
     core.set_property(ov::cache_dir(cache_path));