Skip to content

Commit

Permalink
Calculate response throughput metric (#356)
Browse files Browse the repository at this point in the history
* Calculate response throughput metric

* Address feedback

* Cleanup
  • Loading branch information
matthewkotila committed Jul 28, 2023
1 parent bccbe4b commit c753ff4
Show file tree
Hide file tree
Showing 15 changed files with 355 additions and 31 deletions.
6 changes: 5 additions & 1 deletion src/c++/library/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -513,10 +513,14 @@ class InferResult {
const std::string& output_name, const uint8_t** buf,
size_t* byte_size) const = 0;

/// Get final response bool of the request which generated this response.
/// Get final response bool for this response.
/// \return Error object indicating the success or failure.
virtual Error IsFinalResponse(bool* is_final_response) const = 0;

/// Get null response bool for this response.
/// \return Error object indicating the success or failure.
virtual Error IsNullResponse(bool* is_null_response) const = 0;

/// Get the result data as a vector of strings. The vector will
/// receive a copy of result data. An error will be generated if
/// the datatype of output is not 'BYTES'.
Expand Down
16 changes: 15 additions & 1 deletion src/c++/library/grpc_client.cc
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
Expand Down Expand Up @@ -190,6 +190,7 @@ class InferResultGrpc : public InferResult {
const std::string& output_name, const uint8_t** buf,
size_t* byte_size) const override;
Error IsFinalResponse(bool* is_final_response) const override;
Error IsNullResponse(bool* is_null_response) const override;
Error StringData(
const std::string& output_name,
std::vector<std::string>* string_result) const override;
Expand All @@ -211,6 +212,7 @@ class InferResultGrpc : public InferResult {
std::shared_ptr<inference::ModelStreamInferResponse> stream_response_;
Error request_status_;
bool is_final_response_{true};
bool is_null_response_{false};
};

Error
Expand Down Expand Up @@ -322,6 +324,16 @@ InferResultGrpc::IsFinalResponse(bool* is_final_response) const
return Error::Success;
}

// Copies the cached null-response flag of this gRPC result into
// '*is_null_response'. When the caller passes a null pointer nothing is
// written and an error is returned instead.
Error
InferResultGrpc::IsNullResponse(bool* is_null_response) const
{
  if (is_null_response != nullptr) {
    *is_null_response = is_null_response_;
    return Error::Success;
  }
  return Error("is_null_response cannot be nullptr");
}

Error
InferResultGrpc::StringData(
const std::string& output_name,
Expand Down Expand Up @@ -384,6 +396,7 @@ InferResultGrpc::InferResultGrpc(
if (is_final_response_itr != response_->parameters().end()) {
is_final_response_ = is_final_response_itr->second.bool_param();
}
is_null_response_ = response_->outputs().empty() && is_final_response_;
}

InferResultGrpc::InferResultGrpc(
Expand All @@ -409,6 +422,7 @@ InferResultGrpc::InferResultGrpc(
if (is_final_response_itr != response_->parameters().end()) {
is_final_response_ = is_final_response_itr->second.bool_param();
}
is_null_response_ = response_->outputs().empty() && is_final_response_;
}

//==============================================================================
Expand Down
12 changes: 12 additions & 0 deletions src/c++/library/http_client.cc
Original file line number Diff line number Diff line change
Expand Up @@ -740,6 +740,7 @@ class InferResultHttp : public InferResult {
const std::string& output_name, const uint8_t** buf,
size_t* byte_size) const override;
Error IsFinalResponse(bool* is_final_response) const override;
Error IsNullResponse(bool* is_null_response) const override;
Error StringData(
const std::string& output_name,
std::vector<std::string>* string_result) const override;
Expand Down Expand Up @@ -769,6 +770,7 @@ class InferResultHttp : public InferResult {

bool binary_data_{true};
bool is_final_response_{true};
bool is_null_response_{false};
};

void
Expand Down Expand Up @@ -951,6 +953,16 @@ InferResultHttp::IsFinalResponse(bool* is_final_response) const
return Error::Success;
}

// Copies the stored null-response flag of this HTTP result into
// '*is_null_response'. A null output pointer is rejected with an error and
// nothing is written.
Error
InferResultHttp::IsNullResponse(bool* is_null_response) const
{
  if (is_null_response != nullptr) {
    *is_null_response = is_null_response_;
    return Error::Success;
  }
  return Error("is_null_response cannot be nullptr");
}

Error
InferResultHttp::StringData(
const std::string& output_name,
Expand Down
9 changes: 8 additions & 1 deletion src/c++/perf_analyzer/client_backend/client_backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -617,12 +617,19 @@ class InferResult {
const std::string& output_name, const uint8_t** buf,
size_t* byte_size) const = 0;

/// Get final response bool of the request which generated this response.
/// Get final response bool for this response.
/// \return Error object indicating the success or failure.
virtual Error IsFinalResponse(bool* is_final_response) const
{
  // Default for backends that do not override this query: the output
  // pointer is left untouched and an error is reported.
  return Error("InferResult::IsFinalResponse() not implemented");
}

/// Query whether this response is a null response.
/// Backends that do not override this method report it as unimplemented.
/// \param is_null_response Not written by this default implementation.
/// \return Error object indicating the success or failure.
virtual Error IsNullResponse(bool* is_null_response) const
{
  return Error("InferResult::IsNullResponse() not implemented");
}
};

}}} // namespace triton::perfanalyzer::clientbackend
9 changes: 9 additions & 0 deletions src/c++/perf_analyzer/client_backend/mock_client_backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,15 @@ class MockInferResult : public InferResult {
return Error::Success;
}

// Mock implementation: always reports that the response is not a null
// response. Fails only when 'is_null_response' is a null pointer.
Error IsNullResponse(bool* is_null_response) const override
{
  if (is_null_response != nullptr) {
    *is_null_response = false;
    return Error::Success;
  }
  return Error("is_null_response cannot be nullptr");
}

private:
std::string req_id_;
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -827,6 +827,13 @@ TritonInferResult::IsFinalResponse(bool* is_final_response) const
return Error::Success;
}

// Reports whether the wrapped client-library result is a null response by
// delegating to result_->IsNullResponse(). The output pointer is passed
// through unchanged; this wrapper performs no nullptr check of its own.
Error
TritonInferResult::IsNullResponse(bool* is_null_response) const
{
// NOTE(review): RETURN_IF_TRITON_ERROR is defined outside this view;
// presumably it returns early with a converted Error on failure -- confirm.
RETURN_IF_TRITON_ERROR(result_->IsNullResponse(is_null_response));
return Error::Success;
}

//==============================================================================

}}}} // namespace triton::perfanalyzer::clientbackend::tritonremote
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,8 @@ class TritonInferResult : public InferResult {
size_t* byte_size) const override;
/// See InferResult::IsFinalResponse()
Error IsFinalResponse(bool* is_final_response) const override;
/// See InferResult::IsNullResponse()
Error IsNullResponse(bool* is_null_response) const override;

private:
std::unique_ptr<tc::InferResult> result_;
Expand Down
24 changes: 16 additions & 8 deletions src/c++/perf_analyzer/infer_context.cc
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
Expand Down Expand Up @@ -236,20 +236,26 @@ void
InferContext::AsyncCallbackFuncImpl(cb::InferResult* result)
{
std::shared_ptr<cb::InferResult> result_ptr(result);
bool is_final_response{true};
if (thread_stat_->cb_status_.IsOk()) {
// Add the request timestamp to thread Timestamp vector with
// proper locking
std::lock_guard<std::mutex> lock(thread_stat_->mu_);
thread_stat_->cb_status_ = result_ptr->RequestStatus();
if (thread_stat_->cb_status_.IsOk()) {
std::chrono::time_point<std::chrono::system_clock> end_time_async;
end_time_async = std::chrono::system_clock::now();
std::string request_id;
thread_stat_->cb_status_ = result_ptr->Id(&request_id);
const auto& it = async_req_map_.find(request_id);
if (it != async_req_map_.end()) {
it->second.end_times.push_back(end_time_async);
bool is_final_response{false};
bool is_null_response{false};
thread_stat_->cb_status_ =
result_ptr->IsNullResponse(&is_null_response);
if (thread_stat_->cb_status_.IsOk() == false) {
return;
}
if (is_null_response == false) {
it->second.end_times.push_back(std::chrono::system_clock::now());
}
thread_stat_->cb_status_ =
result_ptr->IsFinalResponse(&is_final_response);
if (thread_stat_->cb_status_.IsOk() == false) {
Expand All @@ -267,10 +273,12 @@ InferContext::AsyncCallbackFuncImpl(cb::InferResult* result)
}
}

total_ongoing_requests_--;
if (is_final_response) {
total_ongoing_requests_--;

if (async_callback_finalize_func_ != nullptr) {
async_callback_finalize_func_(id_);
if (async_callback_finalize_func_ != nullptr) {
async_callback_finalize_func_(id_);
}
}
}

Expand Down
18 changes: 14 additions & 4 deletions src/c++/perf_analyzer/inference_profiler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1012,6 +1012,8 @@ InferenceProfiler::MergePerfStatusReports(
perf_status.client_stats.sequence_count;
experiment_perf_status.client_stats.delayed_request_count +=
perf_status.client_stats.delayed_request_count;
experiment_perf_status.client_stats.response_count +=
perf_status.client_stats.response_count;
experiment_perf_status.client_stats.duration_ns +=
perf_status.client_stats.duration_ns;

Expand Down Expand Up @@ -1079,6 +1081,8 @@ InferenceProfiler::MergePerfStatusReports(
(experiment_perf_status.client_stats.request_count *
experiment_perf_status.batch_size) /
client_duration_sec;
experiment_perf_status.client_stats.responses_per_sec =
experiment_perf_status.client_stats.response_count / client_duration_sec;
RETURN_IF_ERROR(SummarizeLatency(
experiment_perf_status.client_stats.latencies, experiment_perf_status));

Expand Down Expand Up @@ -1211,18 +1215,20 @@ InferenceProfiler::Summarize(
{
size_t valid_sequence_count = 0;
size_t delayed_request_count = 0;
size_t response_count = 0;

// Get measurement from requests that fall within the time interval
std::pair<uint64_t, uint64_t> valid_range{window_start_ns, window_end_ns};
uint64_t window_duration_ns = valid_range.second - valid_range.first;
std::vector<uint64_t> latencies;
ValidLatencyMeasurement(
valid_range, valid_sequence_count, delayed_request_count, &latencies);
valid_range, valid_sequence_count, delayed_request_count, &latencies,
response_count);

RETURN_IF_ERROR(SummarizeLatency(latencies, summary));
RETURN_IF_ERROR(SummarizeClientStat(
start_stat, end_stat, window_duration_ns, latencies.size(),
valid_sequence_count, delayed_request_count, summary));
valid_sequence_count, delayed_request_count, response_count, summary));
summary.client_stats.latencies = std::move(latencies);

SummarizeOverhead(window_duration_ns, manager_->GetIdleTime(), summary);
Expand All @@ -1245,10 +1251,11 @@ void
InferenceProfiler::ValidLatencyMeasurement(
const std::pair<uint64_t, uint64_t>& valid_range,
size_t& valid_sequence_count, size_t& delayed_request_count,
std::vector<uint64_t>* valid_latencies)
std::vector<uint64_t>* valid_latencies, size_t& response_count)
{
valid_latencies->clear();
valid_sequence_count = 0;
response_count = 0;
std::vector<size_t> erase_indices{};
for (size_t i = 0; i < all_timestamps_.size(); i++) {
const auto& timestamp = all_timestamps_[i];
Expand All @@ -1260,6 +1267,7 @@ InferenceProfiler::ValidLatencyMeasurement(
if ((request_end_ns >= valid_range.first) &&
(request_end_ns <= valid_range.second)) {
valid_latencies->push_back(request_end_ns - request_start_ns);
response_count += std::get<1>(timestamp).size();
erase_indices.push_back(i);
// Just add the sequence_end flag here.
if (std::get<2>(timestamp)) {
Expand Down Expand Up @@ -1358,7 +1366,7 @@ InferenceProfiler::SummarizeClientStat(
const cb::InferStat& start_stat, const cb::InferStat& end_stat,
const uint64_t duration_ns, const size_t valid_request_count,
const size_t valid_sequence_count, const size_t delayed_request_count,
PerfStatus& summary)
const size_t response_count, PerfStatus& summary)
{
summary.on_sequence_model =
((parser_->SchedulerType() == ModelParser::SEQUENCE) ||
Expand All @@ -1367,13 +1375,15 @@ InferenceProfiler::SummarizeClientStat(
summary.client_stats.request_count = valid_request_count;
summary.client_stats.sequence_count = valid_sequence_count;
summary.client_stats.delayed_request_count = delayed_request_count;
summary.client_stats.response_count = response_count;
summary.client_stats.duration_ns = duration_ns;
float client_duration_sec =
(float)summary.client_stats.duration_ns / NANOS_PER_SECOND;
summary.client_stats.sequence_per_sec =
valid_sequence_count / client_duration_sec;
summary.client_stats.infer_per_sec =
(valid_request_count * summary.batch_size) / client_duration_sec;
summary.client_stats.responses_per_sec = response_count / client_duration_sec;

if (include_lib_stats_) {
size_t completed_count =
Expand Down
23 changes: 15 additions & 8 deletions src/c++/perf_analyzer/inference_profiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
namespace triton { namespace perfanalyzer {

#ifndef DOCTEST_CONFIG_DISABLE
class NaggyMockInferenceProfiler;
class TestInferenceProfiler;
#endif

Expand Down Expand Up @@ -126,6 +127,8 @@ struct ClientSideStats {
uint64_t sequence_count;
// The number of requests that missed their schedule
uint64_t delayed_request_count;
// The number of responses
uint64_t response_count;
uint64_t duration_ns;
uint64_t avg_latency_ns;
// an ordered map of percentiles to be reported (<percentile, value> pair)
Expand All @@ -139,6 +142,7 @@ struct ClientSideStats {
uint64_t avg_receive_time_ns;
// Per sec stat
double infer_per_sec;
double responses_per_sec;
double sequence_per_sec;

// Completed request count reported by the client library
Expand Down Expand Up @@ -440,16 +444,17 @@ class InferenceProfiler {
/// sequence model.
/// \param latencies Returns the vector of request latencies where the
/// requests are completed within the measurement window.
void ValidLatencyMeasurement(
/// \param response_count Returns the number of responses
virtual void ValidLatencyMeasurement(
const std::pair<uint64_t, uint64_t>& valid_range,
size_t& valid_sequence_count, size_t& delayed_request_count,
std::vector<uint64_t>* latencies);
std::vector<uint64_t>* latencies, size_t& response_count);

/// \param latencies The vector of request latencies collected.
/// \param summary Returns the summary that the latency related fields are
/// set.
/// \return cb::Error object indicating success or failure.
cb::Error SummarizeLatency(
virtual cb::Error SummarizeLatency(
const std::vector<uint64_t>& latencies, PerfStatus& summary);

/// \param latencies The vector of request latencies collected.
Expand All @@ -466,14 +471,15 @@ class InferenceProfiler {
/// \param valid_sequence_count The number of completed sequences recorded.
/// \param delayed_request_count The number of requests that missed their
/// schedule.
/// \param response_count The number of responses.
/// \param summary Returns the summary that the fields recorded by
/// client are set.
/// \return cb::Error object indicating success or failure.
cb::Error SummarizeClientStat(
virtual cb::Error SummarizeClientStat(
const cb::InferStat& start_stat, const cb::InferStat& end_stat,
const uint64_t duration_ns, const size_t valid_request_count,
const size_t delayed_request_count, const size_t valid_sequence_count,
PerfStatus& summary);
const size_t response_count, PerfStatus& summary);

/// Adds the send request rate metric to the summary object.
/// \param window_duration_s The duration of the window in seconds.
Expand Down Expand Up @@ -557,15 +563,15 @@ class InferenceProfiler {
/// \param perf_status List of perf status reports to be merged.
/// \param summary_status Final merged summary status.
/// \return cb::Error object indicating success or failure.
cb::Error MergePerfStatusReports(
virtual cb::Error MergePerfStatusReports(
std::deque<PerfStatus>& perf_status, PerfStatus& summary_status);

/// Merge individual server side statistics into a single server side report.
/// \param server_side_stats List of server side statistics reports to be
/// merged.
/// \param server_side_summary Final merged summary status.
/// \return cb::Error object indicating success or failure.
cb::Error MergeServerSideStats(
virtual cb::Error MergeServerSideStats(
std::vector<ServerSideStats>& server_side_stats,
ServerSideStats& server_side_summary);

Expand Down Expand Up @@ -695,10 +701,11 @@ class InferenceProfiler {
const double overhead_pct_threshold_{0.0};

#ifndef DOCTEST_CONFIG_DISABLE
friend NaggyMockInferenceProfiler;
friend TestInferenceProfiler;

public:
InferenceProfiler(){};
InferenceProfiler() = default;
#endif
};

Expand Down
Loading

0 comments on commit c753ff4

Please sign in to comment.