From 3b97b2ff3bdc425c66c81b7b2c4a2d8d4583f203 Mon Sep 17 00:00:00 2001 From: GuanLuo <41310872+GuanLuo@users.noreply.github.com> Date: Fri, 5 Jan 2024 14:47:30 -0800 Subject: [PATCH] Add retry on model loading. Expose option to set model retry count (#308) * Group model repository files * Expose option to set model retry count --- include/triton/core/tritonserver.h | 11 ++++++- python/tritonserver/_c/tritonserver_pybind.cc | 9 ++++++ src/CMakeLists.txt | 8 ++--- src/ensemble_scheduler/ensemble_utils.h | 2 +- .../model_lifecycle.cc | 17 +++++++--- .../model_lifecycle.h | 31 +++++++++---------- .../model_repository_manager.cc | 2 +- .../model_repository_manager.h | 0 src/server.cc | 4 +-- src/server.h | 5 ++- src/tritonserver.cc | 17 +++++++++- src/tritonserver_stub.cc | 4 +++ 12 files changed, 78 insertions(+), 32 deletions(-) rename src/{ => model_repository_manager}/model_lifecycle.cc (97%) rename src/{ => model_repository_manager}/model_lifecycle.h (93%) rename src/{ => model_repository_manager}/model_repository_manager.cc (99%) rename src/{ => model_repository_manager}/model_repository_manager.h (100%) diff --git a/include/triton/core/tritonserver.h b/include/triton/core/tritonserver.h index 93df7a907..8b850ffd2 100644 --- a/include/triton/core/tritonserver.h +++ b/include/triton/core/tritonserver.h @@ -91,7 +91,7 @@ struct TRITONSERVER_MetricFamily; /// } /// #define TRITONSERVER_API_VERSION_MAJOR 1 -#define TRITONSERVER_API_VERSION_MINOR 27 +#define TRITONSERVER_API_VERSION_MINOR 28 /// Get the TRITONBACKEND API version supported by the Triton shared /// library. This value can be compared against the @@ -1978,6 +1978,15 @@ TRITONSERVER_DECLSPEC struct TRITONSERVER_Error* TRITONSERVER_ServerOptionsSetModelLoadThreadCount( struct TRITONSERVER_ServerOptions* options, unsigned int thread_count); +/// Set the number of retries to load a model in a server options. +/// +/// \param options The server options object. +/// \param retry_count The number of retries. 
+/// \return a TRITONSERVER_Error indicating success or failure. +TRITONSERVER_DECLSPEC struct TRITONSERVER_Error* +TRITONSERVER_ServerOptionsSetModelLoadRetryCount( + struct TRITONSERVER_ServerOptions* options, unsigned int retry_count); + /// Enable model namespacing to allow serving models with the same name if /// they are in different namespaces. /// diff --git a/python/tritonserver/_c/tritonserver_pybind.cc b/python/tritonserver/_c/tritonserver_pybind.cc index 6e0f39842..e8ec3c37b 100644 --- a/python/tritonserver/_c/tritonserver_pybind.cc +++ b/python/tritonserver/_c/tritonserver_pybind.cc @@ -1282,6 +1282,12 @@ class PyServerOptions : public PyWrapper { triton_object_, thread_count)); } + void SetModelLoadRetryCount(unsigned int retry_count) + { + ThrowIfError(TRITONSERVER_ServerOptionsSetModelLoadRetryCount( + triton_object_, retry_count)); + } + void SetModelNamespacing(bool enable_namespace) { ThrowIfError(TRITONSERVER_ServerOptionsSetModelNamespacing( @@ -2017,6 +2023,9 @@ PYBIND11_MODULE(triton_bindings, m) .def( "set_model_load_thread_count", &PyServerOptions::SetModelLoadThreadCount) + .def( + "set_model_load_retry_count", + &PyServerOptions::SetModelLoadRetryCount) .def("set_model_namespacing", &PyServerOptions::SetModelNamespacing) .def("set_log_file", &PyServerOptions::SetLogFile) .def("set_log_info", &PyServerOptions::SetLogInfo) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9ecf60f7a..dc11d745d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -140,8 +140,8 @@ set( metric_family.cc model.cc model_config_utils.cc - model_lifecycle.cc - model_repository_manager.cc + model_repository_manager/model_lifecycle.cc + model_repository_manager/model_repository_manager.cc numa_utils.cc payload.cc pinned_memory_manager.cc @@ -187,8 +187,8 @@ set( metric_family.h model_config_utils.h model.h - model_lifecycle.h - model_repository_manager.h + model_repository_manager/model_lifecycle.h + model_repository_manager/model_repository_manager.h 
numa_utils.h payload.h pinned_memory_manager.h diff --git a/src/ensemble_scheduler/ensemble_utils.h b/src/ensemble_scheduler/ensemble_utils.h index 4bcccfbdc..f23306744 100644 --- a/src/ensemble_scheduler/ensemble_utils.h +++ b/src/ensemble_scheduler/ensemble_utils.h @@ -31,7 +31,7 @@ #include #include "model_config.pb.h" -#include "model_repository_manager.h" +#include "model_repository_manager/model_repository_manager.h" #include "status.h" #include "triton/common/model_config.h" diff --git a/src/model_lifecycle.cc b/src/model_repository_manager/model_lifecycle.cc similarity index 97% rename from src/model_lifecycle.cc rename to src/model_repository_manager/model_lifecycle.cc index 0b46ad27c..6018ce9fa 100644 --- a/src/model_lifecycle.cc +++ b/src/model_repository_manager/model_lifecycle.cc @@ -513,7 +513,15 @@ ModelLifeCycle::AsyncLoad( // Load model asynchronously via thread pool load_pool_->Enqueue([this, model_id, version, model_info, OnComplete, load_tracker, is_config_provided]() { - CreateModel(model_id, version, model_info, is_config_provided); + for (size_t retry = 0; retry <= options_.load_retry; ++retry) { + model_info->state_ = ModelReadyState::LOADING; + CreateModel(model_id, version, model_info, is_config_provided); + // Model state will be changed to NOT loading if failed to load, + // so the model is loaded if state is LOADING. 
+ if (model_info->state_ == ModelReadyState::LOADING) { + break; + } + } OnLoadComplete( model_id, version, model_info, false /* is_update */, OnComplete, load_tracker); @@ -540,15 +548,16 @@ ModelLifeCycle::CreateModel( if (!model_config.backend().empty()) { std::unique_ptr model; status = TritonModel::Create( - server_, model_info->model_path_, cmdline_config_map_, host_policy_map_, - version, model_config, is_config_provided, &model); + server_, model_info->model_path_, options_.backend_cmdline_config_map, + options_.host_policy_map, version, model_config, is_config_provided, + &model); is.reset(model.release()); } else { #ifdef TRITON_ENABLE_ENSEMBLE if (model_info->is_ensemble_) { status = EnsembleModel::Create( server_, model_info->model_path_, version, model_config, - is_config_provided, min_compute_capability_, &is); + is_config_provided, options_.min_compute_capability, &is); // Complete label provider with label information from involved models // Must be done here because involved models may not be able to // obtained from server because this may happen during server diff --git a/src/model_lifecycle.h b/src/model_repository_manager/model_lifecycle.h similarity index 93% rename from src/model_lifecycle.h rename to src/model_repository_manager/model_lifecycle.h index 5905441a7..a9632e141 100644 --- a/src/model_lifecycle.h +++ b/src/model_repository_manager/model_lifecycle.h @@ -45,21 +45,23 @@ struct ModelLifeCycleOptions { const double min_compute_capability, const triton::common::BackendCmdlineConfigMap& backend_cmdline_config_map, const triton::common::HostPolicyCmdlineConfigMap& host_policy_map, - const unsigned int model_load_thread_count) - : min_compute_capability_(min_compute_capability), - backend_cmdline_config_map_(backend_cmdline_config_map), - host_policy_map_(host_policy_map), - model_load_thread_count_(model_load_thread_count) + const unsigned int model_load_thread_count, const size_t load_retry) + : 
min_compute_capability(min_compute_capability), + backend_cmdline_config_map(backend_cmdline_config_map), + host_policy_map(host_policy_map), + model_load_thread_count(model_load_thread_count), load_retry(load_retry) { } // The minimum supported CUDA compute capability. - const double min_compute_capability_; + const double min_compute_capability; // The backend configuration settings specified on the command-line - const triton::common::BackendCmdlineConfigMap& backend_cmdline_config_map_; + const triton::common::BackendCmdlineConfigMap& backend_cmdline_config_map; // The host policy setting used when loading models. - const triton::common::HostPolicyCmdlineConfigMap& host_policy_map_; + const triton::common::HostPolicyCmdlineConfigMap& host_policy_map; // Number of the threads to use for concurrently loading models - const unsigned int model_load_thread_count_; + const unsigned int model_load_thread_count; + // Number of retries on model loading before the load is considered failed. + const size_t load_retry{0}; }; @@ -283,13 +285,10 @@ class ModelLifeCycle { }; ModelLifeCycle(InferenceServer* server, const ModelLifeCycleOptions& options) - : server_(server), - min_compute_capability_(options.min_compute_capability_), - cmdline_config_map_(options.backend_cmdline_config_map_), - host_policy_map_(options.host_policy_map_) + : server_(server), options_(options) { load_pool_.reset(new triton::common::ThreadPool( - std::max(1u, options.model_load_thread_count_))); + std::max(1u, options_.model_load_thread_count))); } // Create a new model, the 'model_id' can either be a new or existing model. 
@@ -327,9 +326,7 @@ class ModelLifeCycle { std::map> background_models_; InferenceServer* server_; - const double min_compute_capability_; - const triton::common::BackendCmdlineConfigMap cmdline_config_map_; - const triton::common::HostPolicyCmdlineConfigMap host_policy_map_; + const ModelLifeCycleOptions options_; // Fixed-size thread pool to load models at specified concurrency std::unique_ptr load_pool_; diff --git a/src/model_repository_manager.cc b/src/model_repository_manager/model_repository_manager.cc similarity index 99% rename from src/model_repository_manager.cc rename to src/model_repository_manager/model_repository_manager.cc index 9dd5bbc0d..6a882e552 100644 --- a/src/model_repository_manager.cc +++ b/src/model_repository_manager/model_repository_manager.cc @@ -416,7 +416,7 @@ ModelRepositoryManager::Create( std::unique_ptr local_manager( new ModelRepositoryManager( repository_paths, !strict_model_config, polling_enabled, - model_control_enabled, life_cycle_options.min_compute_capability_, + model_control_enabled, life_cycle_options.min_compute_capability, enable_model_namespacing, std::move(life_cycle))); *model_repository_manager = std::move(local_manager); diff --git a/src/model_repository_manager.h b/src/model_repository_manager/model_repository_manager.h similarity index 100% rename from src/model_repository_manager.h rename to src/model_repository_manager/model_repository_manager.h diff --git a/src/server.cc b/src/server.cc index b662cd3e9..f582ee09b 100644 --- a/src/server.cc +++ b/src/server.cc @@ -42,7 +42,6 @@ #include "model.h" #include "model_config.pb.h" #include "model_config_utils.h" -#include "model_repository_manager.h" #include "pinned_memory_manager.h" #include "repo_agent.h" #include "triton/common/async_work_queue.h" @@ -108,6 +107,7 @@ InferenceServer::InferenceServer() pinned_memory_pool_size_ = 1 << 28; buffer_manager_thread_count_ = 0; model_load_thread_count_ = 4; + model_load_retry_count_ = 0; enable_model_namespacing_ = 
false; #ifdef TRITON_ENABLE_GPU @@ -258,7 +258,7 @@ InferenceServer::Init() (model_control_mode_ == ModelControlMode::MODE_EXPLICIT); const ModelLifeCycleOptions life_cycle_options( min_supported_compute_capability_, backend_cmdline_config_map_, - host_policy_map_, model_load_thread_count_); + host_policy_map_, model_load_thread_count_, model_load_retry_count_); status = ModelRepositoryManager::Create( this, version_, model_repository_paths_, startup_models_, strict_model_config_, polling_enabled, model_control_enabled, diff --git a/src/server.h b/src/server.h index 192c9b22f..067d6e9a9 100644 --- a/src/server.h +++ b/src/server.h @@ -38,7 +38,7 @@ #include "cache_manager.h" #include "infer_parameter.h" #include "model_config.pb.h" -#include "model_repository_manager.h" +#include "model_repository_manager/model_repository_manager.h" #include "rate_limiter.h" #include "status.h" #include "triton/common/model_config.h" @@ -257,6 +257,8 @@ class InferenceServer { void SetModelLoadThreadCount(unsigned int c) { model_load_thread_count_ = c; } + void SetModelLoadRetryCount(unsigned int c) { model_load_retry_count_ = c; } + void SetModelNamespacingEnabled(const bool e) { enable_model_namespacing_ = e; @@ -334,6 +336,7 @@ class InferenceServer { uint32_t exit_timeout_secs_; uint32_t buffer_manager_thread_count_; uint32_t model_load_thread_count_; + uint32_t model_load_retry_count_; bool enable_model_namespacing_; uint64_t pinned_memory_pool_size_; bool response_cache_enabled_; diff --git a/src/tritonserver.cc b/src/tritonserver.cc index 15791b6b2..272fd6cf1 100644 --- a/src/tritonserver.cc +++ b/src/tritonserver.cc @@ -37,7 +37,7 @@ #include "metrics.h" #include "model.h" #include "model_config_utils.h" -#include "model_repository_manager.h" +#include "model_repository_manager/model_repository_manager.h" #include "rate_limiter.h" #include "response_allocator.h" #include "server.h" @@ -280,6 +280,9 @@ class TritonServerOptions { unsigned int ModelLoadThreadCount() const { 
return model_load_thread_count_; } void SetModelLoadThreadCount(unsigned int c) { model_load_thread_count_ = c; } + unsigned int ModelLoadRetryCount() const { return model_load_retry_count_; } + void SetModelLoadRetryCount(unsigned int c) { model_load_retry_count_ = c; } + bool ModelNamespacingEnabled() { return enable_model_namespacing_; } void SetModelNamespacingEnabled(const bool e) { @@ -356,6 +359,7 @@ class TritonServerOptions { uint64_t pinned_memory_pool_size_; unsigned int buffer_manager_thread_count_; unsigned int model_load_thread_count_; + unsigned int model_load_retry_count_; bool enable_model_namespacing_; std::map cuda_memory_pool_size_; double min_compute_capability_; @@ -1355,6 +1359,16 @@ TRITONSERVER_ServerOptionsSetModelLoadThreadCount( return nullptr; // Success } +TRITONAPI_DECLSPEC TRITONSERVER_Error* +TRITONSERVER_ServerOptionsSetModelLoadRetryCount( + TRITONSERVER_ServerOptions* options, unsigned int retry_count) +{ + TritonServerOptions* loptions = + reinterpret_cast(options); + loptions->SetModelLoadRetryCount(retry_count); + return nullptr; // Success +} + TRITONAPI_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerOptionsSetModelNamespacing( TRITONSERVER_ServerOptions* options, bool enable_namespace) @@ -2342,6 +2356,7 @@ TRITONSERVER_ServerNew( lserver->SetRepoAgentDir(loptions->RepoAgentDir()); lserver->SetBufferManagerThreadCount(loptions->BufferManagerThreadCount()); lserver->SetModelLoadThreadCount(loptions->ModelLoadThreadCount()); + lserver->SetModelLoadRetryCount(loptions->ModelLoadRetryCount()); lserver->SetModelNamespacingEnabled(loptions->ModelNamespacingEnabled()); // SetBackendCmdlineConfig must be called after all AddBackendConfig calls diff --git a/src/tritonserver_stub.cc b/src/tritonserver_stub.cc index 840429d3f..f54e10dc5 100644 --- a/src/tritonserver_stub.cc +++ b/src/tritonserver_stub.cc @@ -427,6 +427,10 @@ TRITONSERVER_ServerOptionsSetModelLoadThreadCount() { } TRITONAPI_DECLSPEC void 
+TRITONSERVER_ServerOptionsSetModelLoadRetryCount() +{ +} +TRITONAPI_DECLSPEC void TRITONSERVER_ServerOptionsSetModelNamespacing() { }