diff --git a/CMakeLists.txt b/CMakeLists.txt
index 682b3fd..6a8fc9c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -37,6 +37,8 @@ set(TRITON_MIN_CXX_STANDARD 17 CACHE STRING "The minimum C++ standard which feat
 option(TRITON_ENABLE_GPU "Enable GPU support in backend." ON)
 option(TRITON_ENABLE_STATS "Include statistics collections in backend." ON)
 option(TRITON_ENABLE_NVTX "Include nvtx markers collection in backend." OFF)
+option(TRITON_ENABLE_CUDA_CTX_SHARING "Enable Cuda context sharing support in backend." OFF)
+
 set(TRITON_TENSORRT_LIB_PATHS "" CACHE PATH "Paths to TensorRT libraries. Multiple paths may be specified by separating them with a semicolon.")
 set(TRITON_TENSORRT_INCLUDE_PATHS "" CACHE PATH "Paths to TensorRT includes. Multiple paths may be specified by separating them with a semicolon.")
 
@@ -232,6 +234,17 @@ target_link_libraries(
     CUDA::cudart
 )
 
+if(${TRITON_ENABLE_CUDA_CTX_SHARING})
+  target_compile_definitions(
+    triton-tensorrt-backend
+    PRIVATE TRITON_ENABLE_CUDA_CTX_SHARING
+  )
+  target_link_libraries(
+    triton-tensorrt-backend
+    PRIVATE
+      CUDA::cuda_driver
+  )
+endif()
 
 #
 # Install
diff --git a/src/instance_state.cc b/src/instance_state.cc
index 56208a1..1e0517b 100644
--- a/src/instance_state.cc
+++ b/src/instance_state.cc
@@ -257,7 +257,9 @@ ModelInstanceState::ModelInstanceState(
 ModelInstanceState::~ModelInstanceState()
 {
-  cudaSetDevice(DeviceId());
+  if (!model_state_->IsCudaContextSharingEnabled()) {
+    cudaSetDevice(DeviceId());
+  }
   for (auto& io_binding_infos : io_binding_infos_) {
     for (auto& io_binding_info : io_binding_infos) {
       if (!io_binding_info.IsDynamicShapeOutput() &&
@@ -424,7 +426,9 @@ ModelInstanceState::Run(
   payload_.reset(new Payload(next_set_, requests, request_count));
   SET_TIMESTAMP(payload_->compute_start_ns_);
 
-  cudaSetDevice(DeviceId());
+  if (!model_state_->IsCudaContextSharingEnabled()) {
+    cudaSetDevice(DeviceId());
+  }
 #ifdef TRITON_ENABLE_STATS
   {
     SET_TIMESTAMP(payload_->compute_start_ns_);
@@ -1551,13 +1555,16 @@ ModelInstanceState::EvaluateTensorRTContext(
 TRITONSERVER_Error*
 ModelInstanceState::InitStreamsAndEvents()
 {
-  // Set the device before preparing the context.
-  auto cuerr = cudaSetDevice(DeviceId());
-  if (cuerr != cudaSuccess) {
-    return TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_INTERNAL, (std::string("unable to set device for ") +
-                                         Name() + ": " + cudaGetErrorString(cuerr))
-                                        .c_str());
+  if (!model_state_->IsCudaContextSharingEnabled()) {
+    // Set the device before preparing the context.
+    auto cuerr = cudaSetDevice(DeviceId());
+    if (cuerr != cudaSuccess) {
+      return TRITONSERVER_ErrorNew(
+          TRITONSERVER_ERROR_INTERNAL,
+          (std::string("unable to set device for ") + Name() + ": " +
+           cudaGetErrorString(cuerr))
+              .c_str());
+    }
   }
 
   // Create CUDA streams associated with the instance
diff --git a/src/model_state.cc b/src/model_state.cc
index 6127989..9d547c1 100644
--- a/src/model_state.cc
+++ b/src/model_state.cc
@@ -175,7 +175,9 @@ ModelState::ModelState(TRITONBACKEND_Model* triton_model)
 ModelState::~ModelState()
 {
   for (auto& device_engine : device_engines_) {
-    cudaSetDevice(device_engine.first.first);
+    if (!IsCudaContextSharingEnabled()) {
+      cudaSetDevice(device_engine.first.first);
+    }
     auto& runtime = device_engine.second.first;
     auto& engine = device_engine.second.second;
     // Need to reset explicitly to ensure proper destruction order
@@ -209,15 +211,16 @@ ModelState::CreateEngine(
   // We share the engine (for models that don't have dynamic shapes) and
   // runtime across instances that have access to the same GPU/NVDLA.
   if (eit->second.second == nullptr) {
-    auto cuerr = cudaSetDevice(gpu_device);
-    if (cuerr != cudaSuccess) {
-      return TRITONSERVER_ErrorNew(
-          TRITONSERVER_ERROR_INTERNAL,
-          (std::string("unable to set device for ") + Name() + ": " +
-           cudaGetErrorString(cuerr))
-              .c_str());
+    if (!IsCudaContextSharingEnabled()) {
+      auto cuerr = cudaSetDevice(gpu_device);
+      if (cuerr != cudaSuccess) {
+        return TRITONSERVER_ErrorNew(
+            TRITONSERVER_ERROR_INTERNAL,
+            (std::string("unable to set device for ") + Name() + ": " +
+             cudaGetErrorString(cuerr))
+                .c_str());
+      }
     }
-
     const bool new_runtime = (eit->second.first == nullptr);
     RETURN_IF_ERROR(LoadPlan(
         model_path, dla_core_id, &eit->second.first, &eit->second.second,
@@ -321,6 +324,18 @@ ModelState::AutoCompleteConfig()
             " to auto-complete config for " + Name())
                .c_str()));
 
+#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
+  // Return failure if Cuda context sharing is enabled and
+  // this is a multi-GPU setup.
+  if (IsCudaContextSharingEnabled() && device_id != 0) {
+    return TRITONSERVER_ErrorNew(
+        TRITONSERVER_ERROR_INTERNAL,
+        (std::string(
+             "Cuda context sharing is not supported on multi-GPU systems."))
+            .c_str());
+  }
+#endif  // TRITON_ENABLE_CUDA_CTX_SHARING
+
   cuerr = cudaSetDevice(device_id);
   if (cuerr != cudaSuccess) {
     return TRITONSERVER_ErrorNew(
@@ -373,13 +388,15 @@ ModelState::AutoCompleteConfig()
 
   RETURN_IF_ERROR(AutoCompleteConfigHelper(model_path));
 
-  cuerr = cudaSetDevice(current_device);
-  if (cuerr != cudaSuccess) {
-    return TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_INTERNAL,
-        (std::string("unable to revert CUDA device to GPU ") +
-         std::to_string(current_device) + " : " + cudaGetErrorString(cuerr))
-            .c_str());
+  if (!IsCudaContextSharingEnabled()) {
+    cuerr = cudaSetDevice(current_device);
+    if (cuerr != cudaSuccess) {
+      return TRITONSERVER_ErrorNew(
+          TRITONSERVER_ERROR_INTERNAL,
+          (std::string("unable to revert CUDA device to GPU ") +
+           std::to_string(current_device) + " : " + cudaGetErrorString(cuerr))
+              .c_str());
+    }
   }
 
   if (TRITONSERVER_LogIsEnabled(TRITONSERVER_LOG_VERBOSE)) {
diff --git a/src/tensorrt.cc b/src/tensorrt.cc
index 2c2d2a4..747b867 100644
--- a/src/tensorrt.cc
+++ b/src/tensorrt.cc
@@ -318,6 +318,7 @@ TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance)
     DeviceMemoryTracker::TrackThreadMemoryUsage(lusage.get());
   }
 
+  ScopedRuntimeCudaContext cuda_scope(model_state);
   // With each instance we create a ModelInstanceState object and
   // associate it with the TRITONBACKEND_ModelInstance.
@@ -353,6 +354,11 @@ TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance)
   LOG_MESSAGE(
       TRITONSERVER_LOG_INFO,
       "TRITONBACKEND_ModelInstanceFinalize: delete instance state");
+  if (!instance_state) {
+    return nullptr;
+  }
+
+  ScopedRuntimeCudaContext cuda_scope(instance_state->StateForModel());
 
   delete instance_state;
 
@@ -377,6 +383,8 @@ TRITONBACKEND_ModelInstanceExecute(
       instance, reinterpret_cast<void**>(&instance_state)));
   ModelState* model_state = instance_state->StateForModel();
 
+  ScopedRuntimeCudaContext cuda_scope(model_state);
+
   // For TensorRT backend, the executing instance may not closely tie to
   // TRITONBACKEND_ModelInstance, the instance will be assigned based on
   // execution policy.
diff --git a/src/tensorrt_model.cc b/src/tensorrt_model.cc
index bf2959d..b466bfa 100644
--- a/src/tensorrt_model.cc
+++ b/src/tensorrt_model.cc
@@ -90,6 +90,13 @@ TensorRTModel::ParseModelConfig()
     }
   }
 
+#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
+  std::string ptr_str = "";
+  RETURN_IF_ERROR(GetParameter("CUDA_CONTEXT_PTR", ptr_str));
+  cuda_ctx = static_cast<CUcontext>(StringToPointer(ptr_str));
+  LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, "Cuda Context pointer is set");
+#endif  // TRITON_ENABLE_CUDA_CTX_SHARING
+
   return nullptr;  // Success
 }
 
@@ -120,4 +128,19 @@ TensorRTModel::GetCudaStreamPriority()
   return cuda_stream_priority;
 }
 
+template <>
+TRITONSERVER_Error*
+TensorRTModel::GetParameter<std::string>(
+    std::string const& name, std::string& str_value)
+{
+  triton::common::TritonJson::Value parameters;
+  RETURN_IF_ERROR(model_config_.MemberAsObject("parameters", &parameters));
+
+  triton::common::TritonJson::Value value;
+  RETURN_IF_ERROR(parameters.MemberAsObject(name.c_str(), &value));
+
+  RETURN_IF_ERROR(value.MemberAsString("string_value", &str_value));
+  return nullptr;
+}
+
 }}}  // namespace triton::backend::tensorrt
diff --git a/src/tensorrt_model.h b/src/tensorrt_model.h
index 86c67a2..64f43a0 100644
--- a/src/tensorrt_model.h
+++ b/src/tensorrt_model.h
@@ -25,6 +25,11 @@
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #pragma once
 
+#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
+#include <cuda.h>
+#endif  // TRITON_ENABLE_CUDA_CTX_SHARING
+#include <sstream>
+
 #include "triton/backend/backend_model.h"
 
 namespace triton { namespace backend { namespace tensorrt {
@@ -34,6 +39,14 @@ class TensorRTModel : public BackendModel {
   TensorRTModel(TRITONBACKEND_Model* triton_model);
   virtual ~TensorRTModel() = default;
 
+  template <typename T>
+  TRITONSERVER_Error* GetParameter(std::string const& name, T& value)
+  {
+    // Only the std::string specialization below is implemented.
+    assert(false);
+    return nullptr;
+  }
+
   TRITONSERVER_Error* SetTensorRTModelConfig();
 
   TRITONSERVER_Error* ParseModelConfig();
@@ -53,6 +66,61 @@ class TensorRTModel : public BackendModel {
   bool EagerBatching() { return eager_batching_; }
   bool BusyWaitEvents() { return busy_wait_events_; }
 
+  void* StringToPointer(std::string& str)
+  {
+    std::stringstream ss;
+    ss << str;
+
+    void* ctx_ptr;
+    ss >> ctx_ptr;
+    return ctx_ptr;
+  }
+
+  //! The following functions are related to custom Cuda context (Cuda in
+  //! Graphics) sharing for the gaming use case. Creating a shared context
+  //! reduces context-switching overhead and leads to better performance of
+  //! model execution alongside the graphics workload.
+
+  bool IsCudaContextSharingEnabled()
+  {
+#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
+    return cuda_ctx != nullptr;
+#else
+    return false;
+#endif  // TRITON_ENABLE_CUDA_CTX_SHARING
+  }
+
+  inline TRITONSERVER_Error* PushCudaContext()
+  {
+#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
+    if (CUDA_SUCCESS != cuCtxPushCurrent(cuda_ctx)) {
+      return TRITONSERVER_ErrorNew(
+          TRITONSERVER_ERROR_INTERNAL,
+          (std::string("unable to push Cuda context for ") + Name()).c_str());
+    }
+#endif  // TRITON_ENABLE_CUDA_CTX_SHARING
+    return nullptr;
+  }
+
+  inline TRITONSERVER_Error* PopCudaContext()
+  {
+#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
+    CUcontext oldCtx{};
+    if (CUDA_SUCCESS != cuCtxPopCurrent(&oldCtx)) {
+      return TRITONSERVER_ErrorNew(
+          TRITONSERVER_ERROR_INTERNAL,
+          (std::string("unable to pop Cuda context for ") + Name()).c_str());
+    }
+    if (oldCtx != cuda_ctx) {
+      return TRITONSERVER_ErrorNew(
+          TRITONSERVER_ERROR_INTERNAL,
+          (std::string("popping the wrong Cuda context for ") + Name())
+              .c_str());
+    }
+#endif  // TRITON_ENABLE_CUDA_CTX_SHARING
+    return nullptr;
+  }
+
  protected:
   common::TritonJson::Value graph_specs_;
   Priority priority_;
@@ -61,6 +129,34 @@ class TensorRTModel : public BackendModel {
   bool separate_output_stream_;
   bool eager_batching_;
   bool busy_wait_events_;
+#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
+  CUcontext cuda_ctx = nullptr;
+#endif  // TRITON_ENABLE_CUDA_CTX_SHARING
+};
+
+template <>
+TRITONSERVER_Error* TensorRTModel::GetParameter<std::string>(
+    std::string const& name, std::string& str_value);
+
+struct ScopedRuntimeCudaContext {
+  ScopedRuntimeCudaContext(TensorRTModel* model_state)
+      : model_state_(model_state)
+  {
+#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
+    if (model_state_->IsCudaContextSharingEnabled()) {
+      THROW_IF_BACKEND_MODEL_ERROR(model_state_->PushCudaContext());
+    }
+#endif  // TRITON_ENABLE_CUDA_CTX_SHARING
+  }
+  ~ScopedRuntimeCudaContext()
+  {
+#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
+    if (model_state_->IsCudaContextSharingEnabled()) {
+      THROW_IF_BACKEND_MODEL_ERROR(model_state_->PopCudaContext());
+    }
+#endif  // TRITON_ENABLE_CUDA_CTX_SHARING
+  }
+  TensorRTModel* model_state_;
 };
 
 }}}  // namespace triton::backend::tensorrt
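
Reviewer note: with TRITON_ENABLE_CUDA_CTX_SHARING the backend expects the shared context to arrive through the model configuration as a string parameter named CUDA_CONTEXT_PTR; GetParameter<std::string> reads parameters -> CUDA_CONTEXT_PTR -> string_value, and StringToPointer() converts that string back to a pointer with a std::stringstream void* round-trip. The sketch below only illustrates the string format that round-trip implies; the SerializeCudaContext helper is hypothetical and not part of this patch, and how the value is injected into the model configuration is left to the embedding application.

// Hypothetical helper (illustration only): serialize an existing CUcontext
// into the textual form that StringToPointer() parses back with
// `std::stringstream >> void*`, e.g. "0x7f5c2c000000". The resulting string
// would be supplied as the "string_value" of the "CUDA_CONTEXT_PTR" entry in
// the model config "parameters" object.
#include <cuda.h>

#include <sstream>
#include <string>

std::string SerializeCudaContext(CUcontext ctx)
{
  std::stringstream ss;
  ss << static_cast<void*>(ctx);  // CUcontext is an opaque pointer type
  return ss.str();
}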