From 9e0144b154ae2e90966c8f8833324b4cb288ffd3 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Thu, 31 Aug 2023 14:51:41 -0700 Subject: [PATCH 1/5] Allow transitioning from RELEASED to STARTED state for re-using requests --- src/infer_request.cc | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/src/infer_request.cc b/src/infer_request.cc index 803335bca..3dd9702cf 100644 --- a/src/infer_request.cc +++ b/src/infer_request.cc @@ -124,6 +124,8 @@ InferenceRequest::~InferenceRequest() Status InferenceRequest::SetState(InferenceRequest::State new_state) { + LOG_VERBOSE(1) << LogRequest() << "Setting state from " << state_ << " to " + << new_state; // No-op if this is already the current state, or if this is a null request. if (new_state == state_ || null_request_) { return Status::Success; @@ -142,7 +144,7 @@ InferenceRequest::SetState(InferenceRequest::State new_state) std::stringstream ss; ss << LogRequest() << "Invalid request state transition from " << state_ << " to " << new_state; - return Status(Status::Code::INVALID_ARG, ss.str()); + return Status(Status::Code::INTERNAL, ss.str()); }; // Define state transitions @@ -151,7 +153,6 @@ InferenceRequest::SetState(InferenceRequest::State new_state) if (new_state != InferenceRequest::State::STARTED) { return generate_error(); } - state_ = new_state; IncrementPendingRequestCount(); break; } @@ -159,7 +160,6 @@ InferenceRequest::SetState(InferenceRequest::State new_state) if (new_state != InferenceRequest::State::EXECUTING) { return generate_error(); } - state_ = new_state; DecrementPendingRequestCount(); break; } @@ -167,14 +167,18 @@ InferenceRequest::SetState(InferenceRequest::State new_state) if (new_state != InferenceRequest::State::RELEASED) { return generate_error(); } - state_ = new_state; break; } case InferenceRequest::State::RELEASED: { - // No state transition currently supported after release. - return generate_error(); + if (new_state != InferenceRequest::State::STARTED) { + // Only transition currently supported after release is to start again + // when re-using request objects for multiple inferences. + return generate_error(); + } + break; } } + state_ = new_state; return Status::Success; } @@ -182,9 +186,13 @@ void InferenceRequest::IncrementPendingRequestCount() { #ifdef TRITON_ENABLE_METRICS - auto reporter = model_raw_->MetricReporter(); - if (reporter) { - reporter->IncrementGauge(kPendingRequestMetric, 1); + // Only increment once and do not increment again until decremented. 
+ const bool increment_pending_count = !decrement_pending_count_; + if (increment_pending_count) { + auto reporter = model_raw_->MetricReporter(); + if (reporter) { + reporter->IncrementGauge(kPendingRequestMetric, 1); + } decrement_pending_count_ = true; } #endif // TRITON_ENABLE_METRICS From bf84f54683fc05184e563b0dbc5a05c41b959426 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Thu, 31 Aug 2023 22:28:14 -0700 Subject: [PATCH 2/5] Rename STARTED request state to PENDING --- src/infer_request.cc | 12 ++++++------ src/infer_request.h | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/infer_request.cc b/src/infer_request.cc index 3dd9702cf..ce2fac139 100644 --- a/src/infer_request.cc +++ b/src/infer_request.cc @@ -150,13 +150,13 @@ InferenceRequest::SetState(InferenceRequest::State new_state) // Define state transitions switch (state_) { case InferenceRequest::State::INITIALIZED: { - if (new_state != InferenceRequest::State::STARTED) { + if (new_state != InferenceRequest::State::PENDING) { return generate_error(); } IncrementPendingRequestCount(); break; } - case InferenceRequest::State::STARTED: { + case InferenceRequest::State::PENDING: { if (new_state != InferenceRequest::State::EXECUTING) { return generate_error(); } @@ -170,7 +170,7 @@ InferenceRequest::SetState(InferenceRequest::State new_state) break; } case InferenceRequest::State::RELEASED: { - if (new_state != InferenceRequest::State::STARTED) { + if (new_state != InferenceRequest::State::PENDING) { // Only transition currently supported after release is to start again // when re-using request objects for multiple inferences. return generate_error(); @@ -384,7 +384,7 @@ InferenceRequest::OutputBufferProperties( Status InferenceRequest::Run(std::unique_ptr& request) { - RETURN_IF_ERROR(request->SetState(InferenceRequest::State::STARTED)); + RETURN_IF_ERROR(request->SetState(InferenceRequest::State::PENDING)); return request->model_raw_->Enqueue(request); } @@ -1588,8 +1588,8 @@ operator<<(std::ostream& out, const InferenceRequest::State& state) out << "INITIALIZED"; break; } - case InferenceRequest::State::STARTED: { - out << "STARTED"; + case InferenceRequest::State::PENDING: { + out << "PENDING"; break; } case InferenceRequest::State::EXECUTING: { diff --git a/src/infer_request.h b/src/infer_request.h index 97c56ba27..09be610de 100644 --- a/src/infer_request.h +++ b/src/infer_request.h @@ -63,7 +63,7 @@ class InferenceRequest { INITIALIZED, // The request has been enqueued, but is not yet executing. - STARTED, + PENDING, // The request has been picked up by a backend model instance for execution, // but hasn't been released yet. From 948f1c53affa96e499775fa626052ff88d445d45 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Thu, 31 Aug 2023 22:46:49 -0700 Subject: [PATCH 3/5] Reset to INITIALIZED in PrepareForInference() to help enforce preparation before Run() --- src/infer_request.cc | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/infer_request.cc b/src/infer_request.cc index ce2fac139..6bf00a002 100644 --- a/src/infer_request.cc +++ b/src/infer_request.cc @@ -133,7 +133,7 @@ InferenceRequest::SetState(InferenceRequest::State new_state) // Allow RELEASED state transition from any state for now. // Not all requests will follow linear transition, such as null requests - // used for padding batches, and ensemble requests. + // used for padding batches, ensemble requests, and errors. 
if (new_state == InferenceRequest::State::RELEASED) { state_ = new_state; return Status::Success; @@ -170,9 +170,9 @@ InferenceRequest::SetState(InferenceRequest::State new_state) break; } case InferenceRequest::State::RELEASED: { - if (new_state != InferenceRequest::State::PENDING) { - // Only transition currently supported after release is to start again - // when re-using request objects for multiple inferences. + if (new_state != InferenceRequest::State::INITIALIZED) { + // Only transition currently supported after release is to start over + // again, such as re-using request objects for multiple inferences. return generate_error(); } break; @@ -857,8 +857,10 @@ InferenceRequest::PrepareForInference() request_start_ns_ = 0; #endif // TRITON_ENABLE_STATS - LOG_VERBOSE(1) << LogRequest() << "prepared: " << *this; + // Help enforce that PrepareForInference() is called prior to Run(). + RETURN_IF_ERROR(SetState(InferenceRequest::State::INITIALIZED)); + LOG_VERBOSE(1) << LogRequest() << "prepared: " << *this; return Status::Success; } From a7be7c08202a56f6a06c35d9155f0012c2d07435 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Thu, 31 Aug 2023 22:46:49 -0700 Subject: [PATCH 4/5] Capture pending count updates strictly in state transitions --- src/infer_request.cc | 71 ++++++++++++++++++-------------------------- src/infer_request.h | 3 -- 2 files changed, 29 insertions(+), 45 deletions(-) diff --git a/src/infer_request.cc b/src/infer_request.cc index ce2fac139..ef6abca78 100644 --- a/src/infer_request.cc +++ b/src/infer_request.cc @@ -106,21 +106,11 @@ InferenceRequest::InferenceRequest( : needs_normalization_(true), model_raw_(model), requested_model_version_(requested_model_version), flags_(0), correlation_id_(0), batch_size_(0), timeout_us_(0), collect_stats_(true), - state_(InferenceRequest::State::INITIALIZED), null_request_(false), - decrement_pending_count_(false) + state_(InferenceRequest::State::INITIALIZED), null_request_(false) { SetPriority(0); } -InferenceRequest::~InferenceRequest() -{ - // If request has been enqueued but hasn't started executing by destruction - // time, an error occurred and the pending request count will need to be - // decremented. - DecrementPendingRequestCount(); -} - - Status InferenceRequest::SetState(InferenceRequest::State new_state) { @@ -131,14 +121,6 @@ InferenceRequest::SetState(InferenceRequest::State new_state) return Status::Success; } - // Allow RELEASED state transition from any state for now. - // Not all requests will follow linear transition, such as null requests - // used for padding batches, and ensemble requests. - if (new_state == InferenceRequest::State::RELEASED) { - state_ = new_state; - return Status::Success; - } - // Generate error when called rather than copying it into every case below. const auto generate_error = [&]() { std::stringstream ss; @@ -150,17 +132,25 @@ InferenceRequest::SetState(InferenceRequest::State new_state) // Define state transitions switch (state_) { case InferenceRequest::State::INITIALIZED: { - if (new_state != InferenceRequest::State::PENDING) { + if (new_state == InferenceRequest::State::PENDING) { + IncrementPendingRequestCount(); + } else if (new_state == InferenceRequest::State::RELEASED) { + // No-op when moving from initialized to released, just releasing early. 
+ } else { return generate_error(); } - IncrementPendingRequestCount(); break; } case InferenceRequest::State::PENDING: { - if (new_state != InferenceRequest::State::EXECUTING) { + // Request may move from pending to either execution when scheduled to + // backend, or released early due to some error. + if (new_state == InferenceRequest::State::EXECUTING || + new_state == InferenceRequest::State::RELEASED) { + DecrementPendingRequestCount(); + } else { + // Unexpected state transition return generate_error(); } - DecrementPendingRequestCount(); break; } case InferenceRequest::State::EXECUTING: { @@ -170,9 +160,9 @@ InferenceRequest::SetState(InferenceRequest::State new_state) break; } case InferenceRequest::State::RELEASED: { - if (new_state != InferenceRequest::State::PENDING) { - // Only transition currently supported after release is to start again - // when re-using request objects for multiple inferences. + if (new_state != InferenceRequest::State::INITIALIZED) { + // Only transition currently supported after release is to start over + // again, such as re-using request objects for multiple inferences. return generate_error(); } break; @@ -186,14 +176,11 @@ void InferenceRequest::IncrementPendingRequestCount() { #ifdef TRITON_ENABLE_METRICS - // Only increment once and do not increment again until decremented. - const bool increment_pending_count = !decrement_pending_count_; - if (increment_pending_count) { - auto reporter = model_raw_->MetricReporter(); - if (reporter) { - reporter->IncrementGauge(kPendingRequestMetric, 1); - } - decrement_pending_count_ = true; + // Pending request count should always be 0 or 1 per-request. If a request + // increments the count, it should not be incremented again until decremented. + auto reporter = model_raw_->MetricReporter(); + if (reporter) { + reporter->IncrementGauge(kPendingRequestMetric, 1); } #endif // TRITON_ENABLE_METRICS } @@ -202,13 +189,11 @@ void InferenceRequest::DecrementPendingRequestCount() { #ifdef TRITON_ENABLE_METRICS - // Only decrement if count has been incremented, and not already decremented. - if (decrement_pending_count_) { - auto reporter = model_raw_->MetricReporter(); - if (reporter) { - reporter->DecrementGauge(kPendingRequestMetric, 1); - } - decrement_pending_count_ = false; + // Pending request count should always be 0 or 1 per-request. A request should + // not decrement the count unless it has already been incremented. + auto reporter = model_raw_->MetricReporter(); + if (reporter) { + reporter->DecrementGauge(kPendingRequestMetric, 1); } #endif // TRITON_ENABLE_METRICS } @@ -857,8 +842,10 @@ InferenceRequest::PrepareForInference() request_start_ns_ = 0; #endif // TRITON_ENABLE_STATS - LOG_VERBOSE(1) << LogRequest() << "prepared: " << *this; + // Help enforce that PrepareForInference() is called prior to Run(). + RETURN_IF_ERROR(SetState(InferenceRequest::State::INITIALIZED)); + LOG_VERBOSE(1) << LogRequest() << "prepared: " << *this; return Status::Success; } diff --git a/src/infer_request.h b/src/infer_request.h index 09be610de..228cff03f 100644 --- a/src/infer_request.h +++ b/src/infer_request.h @@ -799,9 +799,6 @@ class InferenceRequest { // Whether this is a null request used for direct sequence batch padding or // not. 
bool null_request_; - // Catch-all to correctly decrement pending count if needed on destruction - // if request doesn't follow normal execution path (error, unused, ensembles) - bool decrement_pending_count_; }; std::ostream& operator<<(std::ostream& out, const InferenceRequest& request); From c742fc6363fed4e505051310aa822a704e7c778a Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Tue, 5 Sep 2023 12:41:34 -0700 Subject: [PATCH 5/5] Remove destructor, not necessary for now --- src/infer_request.h | 1 - src/test/response_cache_test.cc | 2 -- 2 files changed, 3 deletions(-) diff --git a/src/infer_request.h b/src/infer_request.h index 228cff03f..41e7deed8 100644 --- a/src/infer_request.h +++ b/src/infer_request.h @@ -291,7 +291,6 @@ class InferenceRequest { const int64_t requested_model_version); InferenceRequest(Model* model, const int64_t requested_model_version); - ~InferenceRequest(); const std::string& ModelName() const; int64_t RequestedModelVersion() const { return requested_model_version_; } diff --git a/src/test/response_cache_test.cc b/src/test/response_cache_test.cc index 2b40a04c5..dad7d0faf 100644 --- a/src/test/response_cache_test.cc +++ b/src/test/response_cache_test.cc @@ -66,8 +66,6 @@ InferenceRequest::InferenceRequest( response_factory_.reset(new InferenceResponseFactory()); } -InferenceRequest::~InferenceRequest() {} - InferenceRequest::Input::Input( const std::string& name, const inference::DataType datatype, const int64_t* shape, const uint64_t dim_count)
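
For reference, the net state machine that infer_request.cc ends up with after patch 5/5, written as a minimal standalone C++ sketch. The Gauge struct and the main() driver below are illustrative stand-ins, not Triton APIs (the real code reports through model_raw_->MetricReporter() and kPendingRequestMetric); the transition rules and the points where the pending count moves are intended to mirror SetState() as patched above.

    // Minimal standalone model of the request state machine after this series.
    // "Gauge" stands in for the per-model pending-request metric; it is not a
    // Triton API, only an illustration of when the count moves.
    #include <initializer_list>
    #include <iostream>

    enum class State { INITIALIZED, PENDING, EXECUTING, RELEASED };

    struct Gauge {
      int value = 0;
      void Increment() { ++value; }
      void Decrement() { --value; }
    };

    // Returns false (an INTERNAL error in the real code) for transitions the
    // series does not allow. The gauge moves only on the edges into and out of
    // PENDING, so each request contributes 0 or 1 to the pending count.
    bool SetState(State& state, State next, Gauge& pending) {
      if (next == state) {
        return true;  // no-op, mirrors the early return in SetState()
      }
      switch (state) {
        case State::INITIALIZED:
          if (next == State::PENDING) {
            pending.Increment();
          } else if (next != State::RELEASED) {  // early release is allowed
            return false;
          }
          break;
        case State::PENDING:
          // Scheduled to a backend instance, or released early due to an error.
          if (next == State::EXECUTING || next == State::RELEASED) {
            pending.Decrement();
          } else {
            return false;
          }
          break;
        case State::EXECUTING:
          if (next != State::RELEASED) {
            return false;
          }
          break;
        case State::RELEASED:
          // Re-use: PrepareForInference() resets the request before Run().
          if (next != State::INITIALIZED) {
            return false;
          }
          break;
      }
      state = next;
      return true;
    }

    int main() {
      Gauge pending;
      State state = State::INITIALIZED;
      // One inference, then re-use the same request object for a second one.
      for (State next : {State::PENDING, State::EXECUTING, State::RELEASED,
                         State::INITIALIZED, State::PENDING, State::EXECUTING,
                         State::RELEASED}) {
        if (!SetState(state, next, pending)) {
          std::cerr << "invalid transition\n";
          return 1;
        }
        std::cout << "pending gauge = " << pending.value << "\n";
      }
      return 0;
    }

Keeping the gauge updates strictly on the edges into and out of PENDING is what lets patches 4 and 5 drop the destructor and the decrement_pending_count_ flag: every path out of PENDING decrements exactly once, so the per-request contribution to the count stays at 0 or 1 and cannot leak on error or early-release paths.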