diff --git a/src/c++/perf_analyzer/infer_context.cc b/src/c++/perf_analyzer/infer_context.cc
index 48e9ea527..4b5e5a14d 100644
--- a/src/c++/perf_analyzer/infer_context.cc
+++ b/src/c++/perf_analyzer/infer_context.cc
@@ -154,14 +154,16 @@ InferContext::SendRequest(const uint64_t request_id, const bool delayed)
     return;
   }
   end_time_sync = std::chrono::system_clock::now();
+  std::vector<std::chrono::time_point<std::chrono::system_clock>>
+      end_time_syncs{end_time_sync};
   {
     // Add the request timestamp to thread Timestamp vector with proper
     // locking
     std::lock_guard<std::mutex> lock(thread_stat_->mu_);
     auto total = end_time_sync - start_time_sync;
     thread_stat_->request_timestamps_.emplace_back(std::make_tuple(
-        start_time_sync, end_time_sync, infer_data_.options_->sequence_end_,
-        delayed));
+        start_time_sync, std::move(end_time_syncs),
+        infer_data_.options_->sequence_end_, delayed));
     thread_stat_->status_ =
         infer_backend_->ClientInferStat(&(thread_stat_->contexts_stat_[id_]));
     if (!thread_stat_->status_.IsOk()) {
@@ -235,6 +237,8 @@ InferContext::AsyncCallbackFuncImpl(cb::InferResult* result)
 {
   std::shared_ptr<cb::InferResult> result_ptr(result);
   if (thread_stat_->cb_status_.IsOk()) {
+    // TODO TMA-1257 use final response parameter from grpc client
+    bool final_response = true;
     // Add the request timestamp to thread Timestamp vector with
     // proper locking
     std::lock_guard<std::mutex> lock(thread_stat_->mu_);
@@ -246,12 +250,15 @@ InferContext::AsyncCallbackFuncImpl(cb::InferResult* result)
     thread_stat_->cb_status_ = result_ptr->Id(&request_id);
     const auto& it = async_req_map_.find(request_id);
     if (it != async_req_map_.end()) {
-      thread_stat_->request_timestamps_.emplace_back(std::make_tuple(
-          it->second.start_time_, end_time_async, it->second.sequence_end_,
-          it->second.delayed_));
-      infer_backend_->ClientInferStat(&(thread_stat_->contexts_stat_[id_]));
-      thread_stat_->cb_status_ = ValidateOutputs(result);
-      async_req_map_.erase(request_id);
+      it->second.end_times.push_back(end_time_async);
+      if (final_response) {
+        thread_stat_->request_timestamps_.emplace_back(std::make_tuple(
+            it->second.start_time_, it->second.end_times,
+            it->second.sequence_end_, it->second.delayed_));
+        infer_backend_->ClientInferStat(&(thread_stat_->contexts_stat_[id_]));
+        thread_stat_->cb_status_ = ValidateOutputs(result);
+        async_req_map_.erase(request_id);
+      }
     }
   }
 }
diff --git a/src/c++/perf_analyzer/infer_context.h b/src/c++/perf_analyzer/infer_context.h
index b7f6ada89..c91fbcacc 100644
--- a/src/c++/perf_analyzer/infer_context.h
+++ b/src/c++/perf_analyzer/infer_context.h
@@ -74,6 +74,8 @@ struct AsyncRequestProperties {
   bool sequence_end_;
   // Whether or not the request is delayed as per schedule.
   bool delayed_;
+  // Collection of response times
+  std::vector<std::chrono::time_point<std::chrono::system_clock>> end_times;
 };

 #ifndef DOCTEST_CONFIG_DISABLE
diff --git a/src/c++/perf_analyzer/inference_profiler.cc b/src/c++/perf_analyzer/inference_profiler.cc
index 7bd2c87a1..b60006286 100644
--- a/src/c++/perf_analyzer/inference_profiler.cc
+++ b/src/c++/perf_analyzer/inference_profiler.cc
@@ -1253,7 +1253,7 @@ InferenceProfiler::ValidLatencyMeasurement(
   for (size_t i = 0; i < all_timestamps_.size(); i++) {
     const auto& timestamp = all_timestamps_[i];
     uint64_t request_start_ns = CHRONO_TO_NANOS(std::get<0>(timestamp));
-    uint64_t request_end_ns = CHRONO_TO_NANOS(std::get<1>(timestamp));
+    uint64_t request_end_ns = CHRONO_TO_NANOS(std::get<1>(timestamp).back());

     if (request_start_ns <= request_end_ns) {
       // Only counting requests that end within the time interval
diff --git a/src/c++/perf_analyzer/perf_utils.h b/src/c++/perf_analyzer/perf_utils.h
index 38ea47dc5..f11bf9815 100644
--- a/src/c++/perf_analyzer/perf_utils.h
+++ b/src/c++/perf_analyzer/perf_utils.h
@@ -55,7 +55,7 @@ constexpr uint64_t NANOS_PER_MILLIS = 1000000;
 //==============================================================================
 using TimestampVector = std::vector<std::tuple<
     std::chrono::time_point<std::chrono::system_clock>,
-    std::chrono::time_point<std::chrono::system_clock>, uint32_t, bool>>;
+    std::vector<std::chrono::time_point<std::chrono::system_clock>>, uint32_t, bool>>;

 // Will use the characters specified here to construct random strings
 std::string const character_set =
diff --git a/src/c++/perf_analyzer/test_inference_profiler.cc b/src/c++/perf_analyzer/test_inference_profiler.cc
index 51154c037..27f75519f 100644
--- a/src/c++/perf_analyzer/test_inference_profiler.cc
+++ b/src/c++/perf_analyzer/test_inference_profiler.cc
@@ -170,33 +170,42 @@ TEST_CASE("testing the ValidLatencyMeasurement function")
       // request ends before window starts, this should not be possible to exist
       // in the vector of requests, but if it is, we exclude it: not included in
       // current window
-      std::make_tuple(time_point(ns(1)), time_point(ns(2)), 0, false),
+      std::make_tuple(
+          time_point(ns(1)), std::vector<time_point>{time_point(ns(2))}, 0,
+          false),

       // request starts before window starts and ends inside window: included in
       // current window
-      std::make_tuple(time_point(ns(3)), time_point(ns(5)), 0, false),
+      std::make_tuple(
+          time_point(ns(3)), std::vector<time_point>{time_point(ns(5))}, 0,
+          false),

       // requests start and end inside window: included in current window
-      std::make_tuple(time_point(ns(6)), time_point(ns(9)), 0, false),
-      std::make_tuple(time_point(ns(10)), time_point(ns(14)), 0, false),
+      std::make_tuple(
+          time_point(ns(6)), std::vector<time_point>{time_point(ns(9))}, 0,
+          false),
+      std::make_tuple(
+          time_point(ns(10)), std::vector<time_point>{time_point(ns(14))}, 0,
+          false),

       // request starts before window ends and ends after window ends: not
       // included in current window
-      std::make_tuple(time_point(ns(15)), time_point(ns(20)), 0, false),
+      std::make_tuple(
+          time_point(ns(15)), std::vector<time_point>{time_point(ns(20))}, 0,
+          false),

       // request starts after window ends: not included in current window
-      std::make_tuple(time_point(ns(21)), time_point(ns(27)), 0, false)};
+      std::make_tuple(
+          time_point(ns(21)), std::vector<time_point>{time_point(ns(27))}, 0,
+          false)};

   TestInferenceProfiler::ValidLatencyMeasurement(
       window, valid_sequence_count, delayed_request_count, &latencies,
       all_timestamps);

   const auto& convert_timestamp_to_latency{
-      [](std::tuple<
-             std::chrono::time_point<std::chrono::system_clock>,
-             std::chrono::time_point<std::chrono::system_clock>, uint32_t, bool>
-             t) {
-        return CHRONO_TO_NANOS(std::get<1>(t)) -
+      [](std::tuple<time_point, std::vector<time_point>, uint32_t, bool> t) {
+        return CHRONO_TO_NANOS(std::get<1>(t).back()) -
               CHRONO_TO_NANOS(std::get<0>(t));
       }};
diff --git a/src/c++/perf_analyzer/test_load_manager.cc b/src/c++/perf_analyzer/test_load_manager.cc
index 1e0798c45..224dc895f 100644
--- a/src/c++/perf_analyzer/test_load_manager.cc
+++ b/src/c++/perf_analyzer/test_load_manager.cc
@@ -117,12 +117,15 @@ class TestLoadManager : public TestLoadManagerBase, public LoadManager {
   {
     using time_point = std::chrono::time_point<std::chrono::system_clock>;
     using ns = std::chrono::nanoseconds;
-    auto timestamp1 =
-        std::make_tuple(time_point(ns(1)), time_point(ns(2)), 0, false);
-    auto timestamp2 =
-        std::make_tuple(time_point(ns(3)), time_point(ns(4)), 0, false);
-    auto timestamp3 =
-        std::make_tuple(time_point(ns(5)), time_point(ns(6)), 0, false);
+    auto timestamp1 = std::make_tuple(
+        time_point(ns(1)), std::vector<time_point>{time_point(ns(2))}, 0,
+        false);
+    auto timestamp2 = std::make_tuple(
+        time_point(ns(3)), std::vector<time_point>{time_point(ns(4))}, 0,
+        false);
+    auto timestamp3 = std::make_tuple(
+        time_point(ns(5)), std::vector<time_point>{time_point(ns(6))}, 0,
+        false);

     TimestampVector source_timestamps;

@@ -275,12 +278,15 @@ class TestLoadManager : public TestLoadManagerBase, public LoadManager {
   {
     using time_point = std::chrono::time_point<std::chrono::system_clock>;
     using ns = std::chrono::nanoseconds;
-    auto timestamp1 =
-        std::make_tuple(time_point(ns(1)), time_point(ns(2)), 0, false);
-    auto timestamp2 =
-        std::make_tuple(time_point(ns(3)), time_point(ns(4)), 0, false);
-    auto timestamp3 =
-        std::make_tuple(time_point(ns(5)), time_point(ns(6)), 0, false);
+    auto timestamp1 = std::make_tuple(
+        time_point(ns(1)), std::vector<time_point>{time_point(ns(2))}, 0,
+        false);
+    auto timestamp2 = std::make_tuple(
+        time_point(ns(3)), std::vector<time_point>{time_point(ns(4))}, 0,
+        false);
+    auto timestamp3 = std::make_tuple(
+        time_point(ns(5)), std::vector<time_point>{time_point(ns(6))}, 0,
+        false);

     SUBCASE("No threads")
     {
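
Reviewer note, illustration only (not part of the patch): the heart of this change is that the second element of every timestamp tuple becomes a `std::vector` of response times, so one request can carry one end time per response from a decoupled model. Below is a minimal standalone sketch of how the reshaped tuple is consumed; `ToNanos` is a hypothetical stand-in for the `CHRONO_TO_NANOS` macro, and, as in the tests, constructing a `time_point` directly from `nanoseconds` assumes `system_clock` ticks in nanoseconds (true on Linux).

```cpp
#include <chrono>
#include <cstdint>
#include <iostream>
#include <tuple>
#include <vector>

using TimePoint = std::chrono::time_point<std::chrono::system_clock>;
// Mirrors the reshaped TimestampVector element from perf_utils.h:
// <start, all response end times, sequence flags, delayed>.
using Timestamp =
    std::tuple<TimePoint, std::vector<TimePoint>, uint32_t, bool>;

// Stand-in for the CHRONO_TO_NANOS macro.
uint64_t
ToNanos(const TimePoint& tp)
{
  return std::chrono::duration_cast<std::chrono::nanoseconds>(
             tp.time_since_epoch())
      .count();
}

int
main()
{
  using ns = std::chrono::nanoseconds;
  // One request, three responses: a decoupled model streamed results at
  // t=12, 15, and 20 for a request issued at t=10.
  Timestamp record = std::make_tuple(
      TimePoint(ns(10)),
      std::vector<TimePoint>{
          TimePoint(ns(12)), TimePoint(ns(15)), TimePoint(ns(20))},
      uint32_t{0}, false);
  // Latency runs to the *last* response, matching the switch from
  // std::get<1>(timestamp) to std::get<1>(timestamp).back() in
  // InferenceProfiler::ValidLatencyMeasurement.
  const uint64_t latency_ns =
      ToNanos(std::get<1>(record).back()) - ToNanos(std::get<0>(record));
  std::cout << "latency: " << latency_ns << " ns\n";  // prints "latency: 10 ns"
  return 0;
}
```

Measuring against `.back()` leaves single-response behavior unchanged (the vector then has exactly one element) while making end-to-end latency well defined for multi-response requests.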
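The callback side can be sketched the same way. The pattern introduced in `AsyncCallbackFuncImpl` is accumulate-then-flush: every response appends its end time to the pending per-request record, and only the final response publishes the record and erases the map entry. The reduction below is hedged: `RequestRecord` and `OnResponse` are hypothetical stand-ins for `AsyncRequestProperties` and the real callback, and `final_response` is hard-coded to true exactly as the TODO TMA-1257 placeholder does.

```cpp
#include <chrono>
#include <cstdint>
#include <map>
#include <tuple>
#include <vector>

using TimePoint = std::chrono::time_point<std::chrono::system_clock>;

// Hypothetical stand-in for AsyncRequestProperties.
struct RequestRecord {
  TimePoint start_time_;
  uint32_t sequence_end_{0};
  bool delayed_{false};
  std::vector<TimePoint> end_times;  // one entry per received response
};

std::map<uint64_t, RequestRecord> async_req_map_;
std::vector<std::tuple<TimePoint, std::vector<TimePoint>, uint32_t, bool>>
    request_timestamps_;

// Hypothetical stand-in for the per-response callback.
void
OnResponse(const uint64_t request_id, const TimePoint end_time_async)
{
  // TODO TMA-1257: hard-coded until the gRPC client reports finality.
  const bool final_response = true;

  auto it = async_req_map_.find(request_id);
  if (it == async_req_map_.end()) {
    return;  // unknown request id: nothing to record
  }
  // Every response contributes an end time to the pending record.
  it->second.end_times.push_back(end_time_async);
  if (final_response) {
    // Only the final response publishes the whole series and retires the
    // bookkeeping entry, so partial requests never leak into the
    // measured timestamps.
    request_timestamps_.emplace_back(
        it->second.start_time_, it->second.end_times,
        it->second.sequence_end_, it->second.delayed_);
    async_req_map_.erase(it);
  }
}

int
main()
{
  using ns = std::chrono::nanoseconds;
  async_req_map_[7] = RequestRecord{TimePoint(ns(1))};
  OnResponse(7, TimePoint(ns(5)));  // final response publishes the record
  return request_timestamps_.size() == 1 ? 0 : 1;
}
```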