Calculate response throughput metric (#356)

* Calculate response throughput metric * Address feedback * Cleanup
triton-inference-server · Jul 28, 2023 · c753ff4 · c753ff4
1 parent bccbe4b
commit c753ff4
Show file tree

Hide file tree

Showing 15 changed files with 355 additions and 31 deletions.
diff --git a/src/c++/library/common.h b/src/c++/library/common.h
@@ -513,10 +513,14 @@ class InferResult {
       const std::string& output_name, const uint8_t** buf,
       size_t* byte_size) const = 0;
 
-  /// Get final response bool of the request which generated this response.
+  /// Get final response bool for this response.
   /// \return Error object indicating the success or failure.
   virtual Error IsFinalResponse(bool* is_final_response) const = 0;
 
+  /// Get null response bool for this response.
+  /// \return Error object indicating the success or failure.
+  virtual Error IsNullResponse(bool* is_null_response) const = 0;
+
   /// Get the result data as a vector of strings. The vector will
   /// receive a copy of result data. An error will be generated if
   /// the datatype of output is not 'BYTES'.

diff --git a/src/c++/library/grpc_client.cc b/src/c++/library/grpc_client.cc
@@ -1,4 +1,4 @@
-// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -190,6 +190,7 @@ class InferResultGrpc : public InferResult {
       const std::string& output_name, const uint8_t** buf,
       size_t* byte_size) const override;
   Error IsFinalResponse(bool* is_final_response) const override;
+  Error IsNullResponse(bool* is_null_response) const override;
   Error StringData(
       const std::string& output_name,
       std::vector<std::string>* string_result) const override;
@@ -211,6 +212,7 @@ class InferResultGrpc : public InferResult {
   std::shared_ptr<inference::ModelStreamInferResponse> stream_response_;
   Error request_status_;
   bool is_final_response_{true};
+  bool is_null_response_{false};
 };
 
 Error
@@ -322,6 +324,16 @@ InferResultGrpc::IsFinalResponse(bool* is_final_response) const
   return Error::Success;
 }
 
+Error
+InferResultGrpc::IsNullResponse(bool* is_null_response) const
+{
+  if (is_null_response != nullptr) {  // usable out-pointer: copy the flag
+    *is_null_response = is_null_response_;
+    return Error::Success;
+  }
+  return Error("is_null_response cannot be nullptr");
+}
+
 Error
 InferResultGrpc::StringData(
     const std::string& output_name,
@@ -384,6 +396,7 @@ InferResultGrpc::InferResultGrpc(
   if (is_final_response_itr != response_->parameters().end()) {
     is_final_response_ = is_final_response_itr->second.bool_param();
   }
+  is_null_response_ = response_->outputs().empty() && is_final_response_;
 }
 
 InferResultGrpc::InferResultGrpc(
@@ -409,6 +422,7 @@ InferResultGrpc::InferResultGrpc(
   if (is_final_response_itr != response_->parameters().end()) {
     is_final_response_ = is_final_response_itr->second.bool_param();
   }
+  is_null_response_ = response_->outputs().empty() && is_final_response_;
 }
 
 //==============================================================================

diff --git a/src/c++/library/http_client.cc b/src/c++/library/http_client.cc
@@ -740,6 +740,7 @@ class InferResultHttp : public InferResult {
       const std::string& output_name, const uint8_t** buf,
       size_t* byte_size) const override;
   Error IsFinalResponse(bool* is_final_response) const override;
+  Error IsNullResponse(bool* is_null_response) const override;
   Error StringData(
       const std::string& output_name,
       std::vector<std::string>* string_result) const override;
@@ -769,6 +770,7 @@ class InferResultHttp : public InferResult {
 
   bool binary_data_{true};
   bool is_final_response_{true};
+  bool is_null_response_{false};
 };
 
 void
@@ -951,6 +953,16 @@ InferResultHttp::IsFinalResponse(bool* is_final_response) const
   return Error::Success;
 }
 
+Error
+InferResultHttp::IsNullResponse(bool* is_null_response) const
+{
+  if (is_null_response != nullptr) {  // usable out-pointer: copy the flag
+    *is_null_response = is_null_response_;
+    return Error::Success;
+  }
+  return Error("is_null_response cannot be nullptr");
+}
+
 Error
 InferResultHttp::StringData(
     const std::string& output_name,

diff --git a/src/c++/perf_analyzer/client_backend/client_backend.h b/src/c++/perf_analyzer/client_backend/client_backend.h
@@ -617,12 +617,19 @@ class InferResult {
       const std::string& output_name, const uint8_t** buf,
       size_t* byte_size) const = 0;
 
-  /// Get final response bool of the request which generated this response.
+  /// Get final response bool for this response.
   /// \return Error object indicating the success or failure.
   virtual Error IsFinalResponse(bool* is_final_response) const
   {
     return Error("InferResult::IsFinalResponse() not implemented");
   };
+
+  /// Get null response bool for this response (not supported by default).
+  /// \return Error object; this default always reports "not implemented".
+  virtual Error IsNullResponse(bool* is_null_response) const
+  {  // is_null_response is left untouched by this default implementation
+    return Error("InferResult::IsNullResponse() not implemented");
+  };
 };
 
 }}}  // namespace triton::perfanalyzer::clientbackend
diff --git a/src/c++/perf_analyzer/client_backend/mock_client_backend.h b/src/c++/perf_analyzer/client_backend/mock_client_backend.h
@@ -127,6 +127,15 @@ class MockInferResult : public InferResult {
     return Error::Success;
   }
 
+  Error IsNullResponse(bool* is_null_response) const override
+  {
+    if (is_null_response != nullptr) {  // the mock always reports "not null"
+      *is_null_response = false;
+      return Error::Success;
+    }
+    return Error("is_null_response cannot be nullptr");
+  }
+
  private:
   std::string req_id_;
 };

diff --git a/src/c++/perf_analyzer/client_backend/triton/triton_client_backend.cc b/src/c++/perf_analyzer/client_backend/triton/triton_client_backend.cc
@@ -827,6 +827,13 @@ TritonInferResult::IsFinalResponse(bool* is_final_response) const
   return Error::Success;
 }
 
+Error
+TritonInferResult::IsNullResponse(bool* is_null_response) const
+{  // Forward the query to the wrapped tc::InferResult held in result_.
+  RETURN_IF_TRITON_ERROR(result_->IsNullResponse(is_null_response));
+  return Error::Success;  // assumes the macro above returns on failure
+}
+
 //==============================================================================
 
 }}}}  // namespace triton::perfanalyzer::clientbackend::tritonremote
diff --git a/src/c++/perf_analyzer/client_backend/triton/triton_client_backend.h b/src/c++/perf_analyzer/client_backend/triton/triton_client_backend.h
@@ -331,6 +331,8 @@ class TritonInferResult : public InferResult {
       size_t* byte_size) const override;
   /// See InferResult::IsFinalResponse()
   Error IsFinalResponse(bool* is_final_response) const override;
+  /// See InferResult::IsNullResponse()
+  Error IsNullResponse(bool* is_null_response) const override;
 
  private:
   std::unique_ptr<tc::InferResult> result_;

diff --git a/src/c++/perf_analyzer/infer_context.cc b/src/c++/perf_analyzer/infer_context.cc
@@ -1,4 +1,4 @@
-// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -236,20 +236,26 @@ void
 InferContext::AsyncCallbackFuncImpl(cb::InferResult* result)
 {
   std::shared_ptr<cb::InferResult> result_ptr(result);
+  bool is_final_response{true};
   if (thread_stat_->cb_status_.IsOk()) {
     // Add the request timestamp to thread Timestamp vector with
     // proper locking
     std::lock_guard<std::mutex> lock(thread_stat_->mu_);
     thread_stat_->cb_status_ = result_ptr->RequestStatus();
     if (thread_stat_->cb_status_.IsOk()) {
-      std::chrono::time_point<std::chrono::system_clock> end_time_async;
-      end_time_async = std::chrono::system_clock::now();
       std::string request_id;
       thread_stat_->cb_status_ = result_ptr->Id(&request_id);
       const auto& it = async_req_map_.find(request_id);
       if (it != async_req_map_.end()) {
-        it->second.end_times.push_back(end_time_async);
-        bool is_final_response{false};
+        bool is_null_response{false};
+        thread_stat_->cb_status_ =
+            result_ptr->IsNullResponse(&is_null_response);
+        if (thread_stat_->cb_status_.IsOk() == false) {
+          return;
+        }
+        if (is_null_response == false) {
+          it->second.end_times.push_back(std::chrono::system_clock::now());
+        }
         thread_stat_->cb_status_ =
             result_ptr->IsFinalResponse(&is_final_response);
         if (thread_stat_->cb_status_.IsOk() == false) {
@@ -267,10 +273,12 @@ InferContext::AsyncCallbackFuncImpl(cb::InferResult* result)
     }
   }
 
-  total_ongoing_requests_--;
+  if (is_final_response) {
+    total_ongoing_requests_--;
 
-  if (async_callback_finalize_func_ != nullptr) {
-    async_callback_finalize_func_(id_);
+    if (async_callback_finalize_func_ != nullptr) {
+      async_callback_finalize_func_(id_);
+    }
   }
 }
 

diff --git a/src/c++/perf_analyzer/inference_profiler.cc b/src/c++/perf_analyzer/inference_profiler.cc
@@ -1012,6 +1012,8 @@ InferenceProfiler::MergePerfStatusReports(
         perf_status.client_stats.sequence_count;
     experiment_perf_status.client_stats.delayed_request_count +=
         perf_status.client_stats.delayed_request_count;
+    experiment_perf_status.client_stats.response_count +=
+        perf_status.client_stats.response_count;
     experiment_perf_status.client_stats.duration_ns +=
         perf_status.client_stats.duration_ns;
 
@@ -1079,6 +1081,8 @@ InferenceProfiler::MergePerfStatusReports(
       (experiment_perf_status.client_stats.request_count *
        experiment_perf_status.batch_size) /
       client_duration_sec;
+  experiment_perf_status.client_stats.responses_per_sec =
+      experiment_perf_status.client_stats.response_count / client_duration_sec;
   RETURN_IF_ERROR(SummarizeLatency(
       experiment_perf_status.client_stats.latencies, experiment_perf_status));
 
@@ -1211,18 +1215,20 @@ InferenceProfiler::Summarize(
 {
   size_t valid_sequence_count = 0;
   size_t delayed_request_count = 0;
+  size_t response_count = 0;
 
   // Get measurement from requests that fall within the time interval
   std::pair<uint64_t, uint64_t> valid_range{window_start_ns, window_end_ns};
   uint64_t window_duration_ns = valid_range.second - valid_range.first;
   std::vector<uint64_t> latencies;
   ValidLatencyMeasurement(
-      valid_range, valid_sequence_count, delayed_request_count, &latencies);
+      valid_range, valid_sequence_count, delayed_request_count, &latencies,
+      response_count);
 
   RETURN_IF_ERROR(SummarizeLatency(latencies, summary));
   RETURN_IF_ERROR(SummarizeClientStat(
       start_stat, end_stat, window_duration_ns, latencies.size(),
-      valid_sequence_count, delayed_request_count, summary));
+      valid_sequence_count, delayed_request_count, response_count, summary));
   summary.client_stats.latencies = std::move(latencies);
 
   SummarizeOverhead(window_duration_ns, manager_->GetIdleTime(), summary);
@@ -1245,10 +1251,11 @@ void
 InferenceProfiler::ValidLatencyMeasurement(
     const std::pair<uint64_t, uint64_t>& valid_range,
     size_t& valid_sequence_count, size_t& delayed_request_count,
-    std::vector<uint64_t>* valid_latencies)
+    std::vector<uint64_t>* valid_latencies, size_t& response_count)
 {
   valid_latencies->clear();
   valid_sequence_count = 0;
+  response_count = 0;
   std::vector<size_t> erase_indices{};
   for (size_t i = 0; i < all_timestamps_.size(); i++) {
     const auto& timestamp = all_timestamps_[i];
@@ -1260,6 +1267,7 @@ InferenceProfiler::ValidLatencyMeasurement(
       if ((request_end_ns >= valid_range.first) &&
           (request_end_ns <= valid_range.second)) {
         valid_latencies->push_back(request_end_ns - request_start_ns);
+        response_count += std::get<1>(timestamp).size();
         erase_indices.push_back(i);
         // Just add the sequence_end flag here.
         if (std::get<2>(timestamp)) {
@@ -1358,7 +1366,7 @@ InferenceProfiler::SummarizeClientStat(
     const cb::InferStat& start_stat, const cb::InferStat& end_stat,
     const uint64_t duration_ns, const size_t valid_request_count,
     const size_t valid_sequence_count, const size_t delayed_request_count,
-    PerfStatus& summary)
+    const size_t response_count, PerfStatus& summary)
 {
   summary.on_sequence_model =
       ((parser_->SchedulerType() == ModelParser::SEQUENCE) ||
@@ -1367,13 +1375,15 @@ InferenceProfiler::SummarizeClientStat(
   summary.client_stats.request_count = valid_request_count;
   summary.client_stats.sequence_count = valid_sequence_count;
   summary.client_stats.delayed_request_count = delayed_request_count;
+  summary.client_stats.response_count = response_count;
   summary.client_stats.duration_ns = duration_ns;
   float client_duration_sec =
       (float)summary.client_stats.duration_ns / NANOS_PER_SECOND;
   summary.client_stats.sequence_per_sec =
       valid_sequence_count / client_duration_sec;
   summary.client_stats.infer_per_sec =
       (valid_request_count * summary.batch_size) / client_duration_sec;
+  summary.client_stats.responses_per_sec = response_count / client_duration_sec;
 
   if (include_lib_stats_) {
     size_t completed_count =

diff --git a/src/c++/perf_analyzer/inference_profiler.h b/src/c++/perf_analyzer/inference_profiler.h
@@ -48,6 +48,7 @@
 namespace triton { namespace perfanalyzer {
 
 #ifndef DOCTEST_CONFIG_DISABLE
+class NaggyMockInferenceProfiler;
 class TestInferenceProfiler;
 #endif
 
@@ -126,6 +127,8 @@ struct ClientSideStats {
   uint64_t sequence_count;
   // The number of requests that missed their schedule
   uint64_t delayed_request_count;
+  // The number of responses
+  uint64_t response_count;
   uint64_t duration_ns;
   uint64_t avg_latency_ns;
   // a ordered map of percentiles to be reported (<percentile, value> pair)
@@ -139,6 +142,7 @@ struct ClientSideStats {
   uint64_t avg_receive_time_ns;
   // Per sec stat
   double infer_per_sec;
+  double responses_per_sec;
   double sequence_per_sec;
 
   // Completed request count reported by the client library
@@ -440,16 +444,17 @@ class InferenceProfiler {
   /// sequence model.
   /// \param latencies Returns the vector of request latencies where the
   /// requests are completed within the measurement window.
-  void ValidLatencyMeasurement(
+  /// \param response_count Returns the number of responses
+  virtual void ValidLatencyMeasurement(
       const std::pair<uint64_t, uint64_t>& valid_range,
       size_t& valid_sequence_count, size_t& delayed_request_count,
-      std::vector<uint64_t>* latencies);
+      std::vector<uint64_t>* latencies, size_t& response_count);
 
   /// \param latencies The vector of request latencies collected.
   /// \param summary Returns the summary that the latency related fields are
   /// set.
   /// \return cb::Error object indicating success or failure.
-  cb::Error SummarizeLatency(
+  virtual cb::Error SummarizeLatency(
       const std::vector<uint64_t>& latencies, PerfStatus& summary);
 
   /// \param latencies The vector of request latencies collected.
@@ -466,14 +471,15 @@ class InferenceProfiler {
   /// \param valid_sequence_count The number of completed sequences recorded.
   /// \param delayed_request_count The number of requests that missed their
   /// schedule.
+  /// \param response_count The number of responses.
   /// \param summary Returns the summary that the fields recorded by
   /// client are set.
   /// \return cb::Error object indicating success or failure.
-  cb::Error SummarizeClientStat(
+  virtual cb::Error SummarizeClientStat(
       const cb::InferStat& start_stat, const cb::InferStat& end_stat,
       const uint64_t duration_ns, const size_t valid_request_count,
-      const size_t delayed_request_count, const size_t valid_sequence_count,
-      PerfStatus& summary);
+      const size_t valid_sequence_count, const size_t delayed_request_count,
+      const size_t response_count, PerfStatus& summary);
 
   /// Adds the send request rate metric to the summary object.
   /// \param window_duration_s The duration of the window in seconds.
@@ -557,15 +563,15 @@ class InferenceProfiler {
   /// \param perf_status List of perf status reports to be merged.
   /// \param summary_status Final merged summary status.
   /// \return cb::Error object indicating success or failure.
-  cb::Error MergePerfStatusReports(
+  virtual cb::Error MergePerfStatusReports(
       std::deque<PerfStatus>& perf_status, PerfStatus& summary_status);
 
   /// Merge individual server side statistics into a single server side report.
   /// \param server_side_stats List of server side statistics reports to be
   /// merged.
   /// \param server_side_summary Final merged summary status.
   /// \return cb::Error object indicating success or failure.
-  cb::Error MergeServerSideStats(
+  virtual cb::Error MergeServerSideStats(
       std::vector<ServerSideStats>& server_side_stats,
       ServerSideStats& server_side_summary);
 
@@ -695,10 +701,11 @@ class InferenceProfiler {
   const double overhead_pct_threshold_{0.0};
 
 #ifndef DOCTEST_CONFIG_DISABLE
+  friend NaggyMockInferenceProfiler;
   friend TestInferenceProfiler;
 
  public:
-  InferenceProfiler(){};
+  InferenceProfiler() = default;
 #endif
 };