diff --git a/src/c++/perf_analyzer/infer_context.cc b/src/c++/perf_analyzer/infer_context.cc
index 48e9ea527..4b5e5a14d 100644
--- a/src/c++/perf_analyzer/infer_context.cc
+++ b/src/c++/perf_analyzer/infer_context.cc
@@ -154,14 +154,16 @@ InferContext::SendRequest(const uint64_t request_id, const bool delayed)
     return;
   }
   end_time_sync = std::chrono::system_clock::now();
+  std::vector<std::chrono::time_point<std::chrono::system_clock>>
+      end_time_syncs{end_time_sync};
   {
     // Add the request timestamp to thread Timestamp vector with proper
     // locking
     std::lock_guard<std::mutex> lock(thread_stat_->mu_);
     auto total = end_time_sync - start_time_sync;
     thread_stat_->request_timestamps_.emplace_back(std::make_tuple(
-        start_time_sync, end_time_sync, infer_data_.options_->sequence_end_,
-        delayed));
+        start_time_sync, std::move(end_time_syncs),
+        infer_data_.options_->sequence_end_, delayed));
     thread_stat_->status_ =
         infer_backend_->ClientInferStat(&(thread_stat_->contexts_stat_[id_]));
     if (!thread_stat_->status_.IsOk()) {
@@ -235,6 +237,8 @@ InferContext::AsyncCallbackFuncImpl(cb::InferResult* result)
 {
   std::shared_ptr<cb::InferResult> result_ptr(result);
   if (thread_stat_->cb_status_.IsOk()) {
+    // TODO TMA-1257 use final response parameter from grpc client
+    bool final_response = true;
     // Add the request timestamp to thread Timestamp vector with
     // proper locking
     std::lock_guard<std::mutex> lock(thread_stat_->mu_);
@@ -246,12 +250,15 @@ InferContext::AsyncCallbackFuncImpl(cb::InferResult* result)
     thread_stat_->cb_status_ = result_ptr->Id(&request_id);
     const auto& it = async_req_map_.find(request_id);
     if (it != async_req_map_.end()) {
-      thread_stat_->request_timestamps_.emplace_back(std::make_tuple(
-          it->second.start_time_, end_time_async, it->second.sequence_end_,
-          it->second.delayed_));
-      infer_backend_->ClientInferStat(&(thread_stat_->contexts_stat_[id_]));
-      thread_stat_->cb_status_ = ValidateOutputs(result);
-      async_req_map_.erase(request_id);
+      it->second.end_times.push_back(end_time_async);
+      if (final_response) {
+        thread_stat_->request_timestamps_.emplace_back(std::make_tuple(
+            it->second.start_time_, it->second.end_times,
+            it->second.sequence_end_, it->second.delayed_));
+        infer_backend_->ClientInferStat(&(thread_stat_->contexts_stat_[id_]));
+        thread_stat_->cb_status_ = ValidateOutputs(result);
+        async_req_map_.erase(request_id);
+      }
     }
   }
 }
diff --git a/src/c++/perf_analyzer/infer_context.h b/src/c++/perf_analyzer/infer_context.h
index b7f6ada89..c91fbcacc 100644
--- a/src/c++/perf_analyzer/infer_context.h
+++ b/src/c++/perf_analyzer/infer_context.h
@@ -74,6 +74,8 @@ struct AsyncRequestProperties {
   bool sequence_end_;
   // Whether or not the request is delayed as per schedule.
   bool delayed_;
+  // Collection of response times
+  std::vector<std::chrono::time_point<std::chrono::system_clock>> end_times;
 };

 #ifndef DOCTEST_CONFIG_DISABLE
diff --git a/src/c++/perf_analyzer/inference_profiler.cc b/src/c++/perf_analyzer/inference_profiler.cc
index 7bd2c87a1..b60006286 100644
--- a/src/c++/perf_analyzer/inference_profiler.cc
+++ b/src/c++/perf_analyzer/inference_profiler.cc
@@ -1253,7 +1253,7 @@ InferenceProfiler::ValidLatencyMeasurement(
   for (size_t i = 0; i < all_timestamps_.size(); i++) {
     const auto& timestamp = all_timestamps_[i];
     uint64_t request_start_ns = CHRONO_TO_NANOS(std::get<0>(timestamp));
-    uint64_t request_end_ns = CHRONO_TO_NANOS(std::get<1>(timestamp));
+    uint64_t request_end_ns = CHRONO_TO_NANOS(std::get<1>(timestamp).back());

     if (request_start_ns <= request_end_ns) {
       // Only counting requests that end within the time interval
diff --git a/src/c++/perf_analyzer/perf_utils.h b/src/c++/perf_analyzer/perf_utils.h
index 38ea47dc5..f11bf9815 100644
--- a/src/c++/perf_analyzer/perf_utils.h
+++ b/src/c++/perf_analyzer/perf_utils.h
@@ -55,7 +55,7 @@ constexpr uint64_t NANOS_PER_MILLIS = 1000000;
 //==============================================================================
 using TimestampVector = std::vector<std::tuple<
     std::chrono::time_point<std::chrono::system_clock>,
-    std::chrono::time_point<std::chrono::system_clock>, uint32_t, bool>>;
+    std::vector<std::chrono::time_point<std::chrono::system_clock>>, uint32_t, bool>>;

 // Will use the characters specified here to construct random strings
 std::string const character_set =
diff --git a/src/c++/perf_analyzer/test_inference_profiler.cc b/src/c++/perf_analyzer/test_inference_profiler.cc
index 51154c037..27f75519f 100644
--- a/src/c++/perf_analyzer/test_inference_profiler.cc
+++ b/src/c++/perf_analyzer/test_inference_profiler.cc
@@ -170,33 +170,42 @@ TEST_CASE("testing the ValidLatencyMeasurement function")
       // request ends before window starts, this should not be possible to exist
       // in the vector of requests, but if it is, we exclude it: not included in
       // current window
-      std::make_tuple(time_point(ns(1)), time_point(ns(2)), 0, false),
+      std::make_tuple(
+          time_point(ns(1)), std::vector<time_point>{time_point(ns(2))}, 0,
+          false),

       // request starts before window starts and ends inside window: included in
       // current window
-      std::make_tuple(time_point(ns(3)), time_point(ns(5)), 0, false),
+      std::make_tuple(
+          time_point(ns(3)), std::vector<time_point>{time_point(ns(5))}, 0,
+          false),

       // requests start and end inside window: included in current window
-      std::make_tuple(time_point(ns(6)), time_point(ns(9)), 0, false),
-      std::make_tuple(time_point(ns(10)), time_point(ns(14)), 0, false),
+      std::make_tuple(
+          time_point(ns(6)), std::vector<time_point>{time_point(ns(9))}, 0,
+          false),
+      std::make_tuple(
+          time_point(ns(10)), std::vector<time_point>{time_point(ns(14))}, 0,
+          false),

       // request starts before window ends and ends after window ends: not
       // included in current window
-      std::make_tuple(time_point(ns(15)), time_point(ns(20)), 0, false),
+      std::make_tuple(
+          time_point(ns(15)), std::vector<time_point>{time_point(ns(20))}, 0,
+          false),

       // request starts after window ends: not included in current window
-      std::make_tuple(time_point(ns(21)), time_point(ns(27)), 0, false)};
+      std::make_tuple(
+          time_point(ns(21)), std::vector<time_point>{time_point(ns(27))}, 0,
+          false)};

   TestInferenceProfiler::ValidLatencyMeasurement(
       window, valid_sequence_count, delayed_request_count, &latencies,
       all_timestamps);

   const auto& convert_timestamp_to_latency{
-      [](std::tuple<
-             std::chrono::time_point<std::chrono::system_clock>,
-             std::chrono::time_point<std::chrono::system_clock>, uint32_t, bool>
-             t) {
-        return CHRONO_TO_NANOS(std::get<1>(t)) -
+      [](std::tuple<time_point, std::vector<time_point>, uint32_t, bool> t) {
+        return CHRONO_TO_NANOS(std::get<1>(t).back()) -
               CHRONO_TO_NANOS(std::get<0>(t));
       }};
diff --git a/src/c++/perf_analyzer/test_load_manager.cc b/src/c++/perf_analyzer/test_load_manager.cc
index 1e0798c45..224dc895f 100644
--- a/src/c++/perf_analyzer/test_load_manager.cc
+++ b/src/c++/perf_analyzer/test_load_manager.cc
@@ -117,12 +117,15 @@ class TestLoadManager : public TestLoadManagerBase, public LoadManager {
   {
     using time_point = std::chrono::time_point<std::chrono::system_clock>;
     using ns = std::chrono::nanoseconds;
-    auto timestamp1 =
-        std::make_tuple(time_point(ns(1)), time_point(ns(2)), 0, false);
-    auto timestamp2 =
-        std::make_tuple(time_point(ns(3)), time_point(ns(4)), 0, false);
-    auto timestamp3 =
-        std::make_tuple(time_point(ns(5)), time_point(ns(6)), 0, false);
+    auto timestamp1 = std::make_tuple(
+        time_point(ns(1)), std::vector<time_point>{time_point(ns(2))}, 0,
+        false);
+    auto timestamp2 = std::make_tuple(
+        time_point(ns(3)), std::vector<time_point>{time_point(ns(4))}, 0,
+        false);
+    auto timestamp3 = std::make_tuple(
+        time_point(ns(5)), std::vector<time_point>{time_point(ns(6))}, 0,
+        false);

     TimestampVector source_timestamps;

@@ -275,12 +278,15 @@ class TestLoadManager : public TestLoadManagerBase, public LoadManager {
   {
     using time_point = std::chrono::time_point<std::chrono::system_clock>;
     using ns = std::chrono::nanoseconds;
-    auto timestamp1 =
-        std::make_tuple(time_point(ns(1)), time_point(ns(2)), 0, false);
-    auto timestamp2 =
-        std::make_tuple(time_point(ns(3)), time_point(ns(4)), 0, false);
-    auto timestamp3 =
-        std::make_tuple(time_point(ns(5)), time_point(ns(6)), 0, false);
+    auto timestamp1 = std::make_tuple(
+        time_point(ns(1)), std::vector<time_point>{time_point(ns(2))}, 0,
+        false);
+    auto timestamp2 = std::make_tuple(
+        time_point(ns(3)), std::vector<time_point>{time_point(ns(4))}, 0,
+        false);
+    auto timestamp3 = std::make_tuple(
+        time_point(ns(5)), std::vector<time_point>{time_point(ns(6))}, 0,
+        false);

     SUBCASE("No threads")
     {
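
Reviewer note, illustration only (not part of the patch): the heart of this change is that the second element of every timestamp tuple becomes a `std::vector` of response times, so one request can carry one end time per response from a decoupled model. Below is a minimal standalone sketch of how the reshaped tuple is consumed; `ToNanos` is a hypothetical stand-in for the `CHRONO_TO_NANOS` macro, and, as in the tests, constructing a `time_point` directly from `nanoseconds` assumes `system_clock` ticks in nanoseconds (true on Linux).

```cpp
#include <chrono>
#include <cstdint>
#include <iostream>
#include <tuple>
#include <vector>

using TimePoint = std::chrono::time_point<std::chrono::system_clock>;
// Mirrors the reshaped TimestampVector element from perf_utils.h:
// <start, all response end times, sequence flags, delayed>.
using Timestamp =
    std::tuple<TimePoint, std::vector<TimePoint>, uint32_t, bool>;

// Stand-in for the CHRONO_TO_NANOS macro.
uint64_t
ToNanos(const TimePoint& tp)
{
  return std::chrono::duration_cast<std::chrono::nanoseconds>(
             tp.time_since_epoch())
      .count();
}

int
main()
{
  using ns = std::chrono::nanoseconds;
  // One request, three responses: a decoupled model streamed results at
  // t=12, 15, and 20 for a request issued at t=10.
  Timestamp record = std::make_tuple(
      TimePoint(ns(10)),
      std::vector<TimePoint>{
          TimePoint(ns(12)), TimePoint(ns(15)), TimePoint(ns(20))},
      uint32_t{0}, false);
  // Latency runs to the *last* response, matching the switch from
  // std::get<1>(timestamp) to std::get<1>(timestamp).back() in
  // InferenceProfiler::ValidLatencyMeasurement.
  const uint64_t latency_ns =
      ToNanos(std::get<1>(record).back()) - ToNanos(std::get<0>(record));
  std::cout << "latency: " << latency_ns << " ns\n";  // prints "latency: 10 ns"
  return 0;
}
```

Measuring against `.back()` leaves single-response behavior unchanged (the vector then has exactly one element) while making end-to-end latency well defined for multi-response requests.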
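The callback side can be sketched the same way. The pattern introduced in `AsyncCallbackFuncImpl` is accumulate-then-flush: every response appends its end time to the pending per-request record, and only the final response publishes the record and erases the map entry. The reduction below is hedged: `RequestRecord` and `OnResponse` are hypothetical stand-ins for `AsyncRequestProperties` and the real callback, and `final_response` is hard-coded to true exactly as the TODO TMA-1257 placeholder does.

```cpp
#include <chrono>
#include <cstdint>
#include <map>
#include <tuple>
#include <vector>

using TimePoint = std::chrono::time_point<std::chrono::system_clock>;

// Hypothetical stand-in for AsyncRequestProperties.
struct RequestRecord {
  TimePoint start_time_;
  uint32_t sequence_end_{0};
  bool delayed_{false};
  std::vector<TimePoint> end_times;  // one entry per received response
};

std::map<uint64_t, RequestRecord> async_req_map_;
std::vector<std::tuple<TimePoint, std::vector<TimePoint>, uint32_t, bool>>
    request_timestamps_;

// Hypothetical stand-in for the per-response callback.
void
OnResponse(const uint64_t request_id, const TimePoint end_time_async)
{
  // TODO TMA-1257: hard-coded until the gRPC client reports finality.
  const bool final_response = true;

  auto it = async_req_map_.find(request_id);
  if (it == async_req_map_.end()) {
    return;  // unknown request id: nothing to record
  }
  // Every response contributes an end time to the pending record.
  it->second.end_times.push_back(end_time_async);
  if (final_response) {
    // Only the final response publishes the whole series and retires the
    // bookkeeping entry, so partial requests never leak into the
    // measured timestamps.
    request_timestamps_.emplace_back(
        it->second.start_time_, it->second.end_times,
        it->second.sequence_end_, it->second.delayed_);
    async_req_map_.erase(it);
  }
}

int
main()
{
  using ns = std::chrono::nanoseconds;
  async_req_map_[7] = RequestRecord{TimePoint(ns(1))};
  OnResponse(7, TimePoint(ns(5)));  // final response publishes the record
  return request_timestamps_.size() == 1 ? 0 : 1;
}
```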