diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2d31aa2fe..41ba6434b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -32,7 +32,7 @@ repos: - id: isort additional_dependencies: [toml] - repo: https://github.com/psf/black - rev: 24.4.0 + rev: 23.1.0 hooks: - id: black types_or: [python, cython] diff --git a/CMakeLists.txt b/CMakeLists.txt index 5628f0a27..9db890fc5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,9 +31,6 @@ project(tritonclient LANGUAGES C CXX) # Use C++17 standard as Triton's minimum required. set(TRITON_MIN_CXX_STANDARD 17 CACHE STRING "The minimum C++ standard which features are requested to build this target.") -set(CMAKE_CXX_STANDARD 17) -set(CMAKE_CXX_STANDARD_REQUIRED ON) - # # Options # diff --git a/src/c++/CMakeLists.txt b/src/c++/CMakeLists.txt index a54253172..ab9810905 100644 --- a/src/c++/CMakeLists.txt +++ b/src/c++/CMakeLists.txt @@ -28,12 +28,6 @@ cmake_minimum_required(VERSION 3.17) project(cc-clients LANGUAGES C CXX) -# Use C++17 standard as Triton's minimum required. -set(TRITON_MIN_CXX_STANDARD 17 CACHE STRING "The minimum C++ standard which features are requested to build this target.") - -set(CMAKE_CXX_STANDARD 17) -set(CMAKE_CXX_STANDARD_REQUIRED ON) - # # Options # diff --git a/src/c++/library/http_client.cc b/src/c++/library/http_client.cc index a2651f2eb..9f2f5ab5e 100644 --- a/src/c++/library/http_client.cc +++ b/src/c++/library/http_client.cc @@ -1371,23 +1371,27 @@ InferenceServerHttpClient::InferenceServerHttpClient( InferenceServerHttpClient::~InferenceServerHttpClient() { - { - std::lock_guard lock(mutex_); - exiting_ = true; - } - - curl_multi_wakeup(multi_handle_); + exiting_ = true; // thread not joinable if AsyncInfer() is not called // (it is default constructed thread before the first AsyncInfer() call) if (worker_.joinable()) { + cv_.notify_all(); worker_.join(); } if (easy_handle_ != nullptr) { curl_easy_cleanup(reinterpret_cast(easy_handle_)); } - curl_multi_cleanup(multi_handle_); + + if (multi_handle_ != nullptr) { + for (auto& request : ongoing_async_requests_) { + CURL* easy_handle = reinterpret_cast(request.first); + curl_multi_remove_handle(multi_handle_, easy_handle); + curl_easy_cleanup(easy_handle); + } + curl_multi_cleanup(multi_handle_); + } } Error @@ -1883,28 +1887,25 @@ InferenceServerHttpClient::AsyncInfer( { std::lock_guard lock(mutex_); - if (exiting_) { - return Error("Client is exiting."); - } - - auto insert_result = new_async_requests_.emplace(std::make_pair( + auto insert_result = ongoing_async_requests_.emplace(std::make_pair( reinterpret_cast(multi_easy_handle), async_request)); if (!insert_result.second) { curl_easy_cleanup(multi_easy_handle); return Error("Failed to insert new asynchronous request context."); } - } - async_request->Timer().CaptureTimestamp(RequestTimers::Kind::SEND_START); - curl_multi_wakeup(multi_handle_); + async_request->Timer().CaptureTimestamp(RequestTimers::Kind::SEND_START); + if (async_request->total_input_byte_size_ == 0) { + // Set SEND_END here because CURLOPT_READFUNCTION will not be called if + // content length is 0. In that case, we can't measure SEND_END properly + // (send ends after sending request header). + async_request->Timer().CaptureTimestamp(RequestTimers::Kind::SEND_END); + } - if (async_request->total_input_byte_size_ == 0) { - // Set SEND_END here because CURLOPT_READFUNCTION will not be called if - // content length is 0. 
In that case, we can't measure SEND_END properly - // (send ends after sending request header). - async_request->Timer().CaptureTimestamp(RequestTimers::Kind::SEND_END); + curl_multi_add_handle(multi_handle_, multi_easy_handle); } + cv_.notify_all(); return Error::Success; } @@ -2248,103 +2249,88 @@ InferenceServerHttpClient::PreRunProcessing( void InferenceServerHttpClient::AsyncTransfer() { - int messages_in_queue = 0; - int still_running = 0; - int numfds = 0; + int place_holder = 0; CURLMsg* msg = nullptr; - AsyncReqMap ongoing_async_requests; do { - // Check for new requests and add them to ongoing requests - { - std::lock_guard lock(mutex_); - - for (auto& pair : new_async_requests_) { - curl_multi_add_handle( - multi_handle_, reinterpret_cast(pair.first)); + std::vector> request_list; - ongoing_async_requests[pair.first] = std::move(pair.second); + // sleep if no work is available + std::unique_lock lock(mutex_); + cv_.wait(lock, [this] { + if (this->exiting_) { + return true; } - new_async_requests_.clear(); - } - - CURLMcode mc = curl_multi_perform(multi_handle_, &still_running); - - if (mc != CURLM_OK) { - std::cerr << "Unexpected error: curl_multi failed. Code:" << mc + // wake up if an async request has been generated + return !this->ongoing_async_requests_.empty(); + }); + + CURLMcode mc = curl_multi_perform(multi_handle_, &place_holder); + int numfds; + if (mc == CURLM_OK) { + // Wait for activity. If there are no descriptors in the multi_handle_ + // then curl_multi_wait will return immediately + mc = curl_multi_wait(multi_handle_, NULL, 0, INT_MAX, &numfds); + if (mc == CURLM_OK) { + while ((msg = curl_multi_info_read(multi_handle_, &place_holder))) { + uintptr_t identifier = reinterpret_cast(msg->easy_handle); + auto itr = ongoing_async_requests_.find(identifier); + // This shouldn't happen + if (itr == ongoing_async_requests_.end()) { + std::cerr + << "Unexpected error: received completed request that is not " + "in the list of asynchronous requests" << std::endl; - continue; - } + curl_multi_remove_handle(multi_handle_, msg->easy_handle); + curl_easy_cleanup(msg->easy_handle); + continue; + } - while ((msg = curl_multi_info_read(multi_handle_, &messages_in_queue))) { - if (msg->msg != CURLMSG_DONE) { - // Something wrong happened. 
- std::cerr << "Unexpected error: received CURLMsg=" << msg->msg - << std::endl; - continue; - } + long http_code = 400; + if (msg->data.result == CURLE_OK) { + curl_easy_getinfo( + msg->easy_handle, CURLINFO_RESPONSE_CODE, &http_code); + } else if (msg->data.result == CURLE_OPERATION_TIMEDOUT) { + http_code = 499; + } - uintptr_t identifier = reinterpret_cast(msg->easy_handle); - auto itr = ongoing_async_requests.find(identifier); - // This shouldn't happen - if (itr == ongoing_async_requests.end()) { - std::cerr << "Unexpected error: received completed request that is not " - "in the list of asynchronous requests" - << std::endl; - curl_multi_remove_handle(multi_handle_, msg->easy_handle); - curl_easy_cleanup(msg->easy_handle); - continue; - } - auto async_request = itr->second; - - uint32_t http_code = 400; - if (msg->data.result == CURLE_OK) { - curl_easy_getinfo(msg->easy_handle, CURLINFO_RESPONSE_CODE, &http_code); - async_request->Timer().CaptureTimestamp( - RequestTimers::Kind::REQUEST_END); - Error err = UpdateInferStat(async_request->Timer()); - if (!err.IsOk()) { - std::cerr << "Failed to update context stat: " << err << std::endl; + request_list.emplace_back(itr->second); + ongoing_async_requests_.erase(itr); + curl_multi_remove_handle(multi_handle_, msg->easy_handle); + curl_easy_cleanup(msg->easy_handle); + + std::shared_ptr async_request = request_list.back(); + async_request->http_code_ = http_code; + + if (msg->msg != CURLMSG_DONE) { + // Something wrong happened. + std::cerr << "Unexpected error: received CURLMsg=" << msg->msg + << std::endl; + } else { + async_request->Timer().CaptureTimestamp( + RequestTimers::Kind::REQUEST_END); + Error err = UpdateInferStat(async_request->Timer()); + if (!err.IsOk()) { + std::cerr << "Failed to update context stat: " << err + << std::endl; + } + } } - } else if (msg->data.result == CURLE_OPERATION_TIMEDOUT) { - http_code = 499; + } else { + std::cerr << "Unexpected error: curl_multi failed. Code:" << mc + << std::endl; } - - async_request->http_code_ = http_code; - InferResult* result; - InferResultHttp::Create(&result, async_request); - async_request->callback_(result); - ongoing_async_requests.erase(itr); - curl_multi_remove_handle(multi_handle_, msg->easy_handle); - curl_easy_cleanup(msg->easy_handle); + } else { + std::cerr << "Unexpected error: curl_multi failed. Code:" << mc + << std::endl; } + lock.unlock(); - // Wait for activity on existing requests or - // explicit curl_multi_wakeup call - // - // If there are no descriptors in the multi_handle_ - // then curl_multi_poll will wait until curl_multi_wakeup - // is called - // - // curl_multi_wakeup is called when adding a new request - // or exiting - - mc = curl_multi_poll(multi_handle_, NULL, 0, INT_MAX, &numfds); - if (mc != CURLM_OK) { - std::cerr << "Unexpected error: curl_multi_poll failed. 
Code:" << mc - << std::endl; + for (auto& this_request : request_list) { + InferResult* result; + InferResultHttp::Create(&result, this_request); + this_request->callback_(result); } } while (!exiting_); - - for (auto& request : ongoing_async_requests) { - CURL* easy_handle = reinterpret_cast(request.first); - curl_multi_remove_handle(multi_handle_, easy_handle); - curl_easy_cleanup(easy_handle); - } - - for (auto& request : new_async_requests_) { - CURL* easy_handle = reinterpret_cast(request.first); - curl_easy_cleanup(easy_handle); - } } size_t diff --git a/src/c++/library/http_client.h b/src/c++/library/http_client.h index 7dbe1976d..e06b2eef3 100644 --- a/src/c++/library/http_client.h +++ b/src/c++/library/http_client.h @@ -643,9 +643,9 @@ class InferenceServerHttpClient : public InferenceServerClient { void* easy_handle_; // curl multi handle for processing asynchronous requests void* multi_handle_; - // map to record new asynchronous requests with pointer to easy handle + // map to record ongoing asynchronous requests with pointer to easy handle // or tag id as key - AsyncReqMap new_async_requests_; + AsyncReqMap ongoing_async_requests_; }; }} // namespace triton::client diff --git a/src/c++/perf_analyzer/client_backend/openai/http_client.cc b/src/c++/perf_analyzer/client_backend/openai/http_client.cc index 17fb42e08..08e4b4b3c 100644 --- a/src/c++/perf_analyzer/client_backend/openai/http_client.cc +++ b/src/c++/perf_analyzer/client_backend/openai/http_client.cc @@ -104,19 +104,20 @@ HttpClient::HttpClient( HttpClient::~HttpClient() { - { - std::lock_guard lock(mutex_); - exiting_ = true; - } - - curl_multi_wakeup(multi_handle_); + exiting_ = true; // thread not joinable if AsyncInfer() is not called // (it is default constructed thread before the first AsyncInfer() call) if (worker_.joinable()) { + cv_.notify_all(); worker_.join(); } + for (auto& request : ongoing_async_requests_) { + CURL* easy_handle = reinterpret_cast(request.first); + curl_multi_remove_handle(multi_handle_, easy_handle); + curl_easy_cleanup(easy_handle); + } curl_multi_cleanup(multi_handle_); { @@ -182,120 +183,94 @@ HttpClient::SetSSLCurlOptions(CURL* curl_handle) void HttpClient::Send(CURL* handle, std::unique_ptr&& request) { - { - std::lock_guard lock(mutex_); - - if (exiting_) { - return; - } - - auto insert_result = new_async_requests_.emplace(std::make_pair( - reinterpret_cast(handle), std::move(request))); - if (!insert_result.second) { - curl_easy_cleanup(handle); - throw std::runtime_error( - "Failed to insert new asynchronous request context."); - } + std::lock_guard lock(mutex_); + + auto insert_result = ongoing_async_requests_.emplace( + std::make_pair(reinterpret_cast(handle), std::move(request))); + if (!insert_result.second) { + curl_easy_cleanup(handle); + throw std::runtime_error( + "Failed to insert new asynchronous request context."); } - curl_multi_wakeup(multi_handle_); + curl_multi_add_handle(multi_handle_, handle); + cv_.notify_all(); } void HttpClient::AsyncTransfer() { - int messages_in_queue = 0; - int still_running = 0; - int numfds = 0; + int place_holder = 0; CURLMsg* msg = nullptr; - AsyncReqMap ongoing_async_requests; - do { - { - // Check for new requests and add them to ongoing requests - - std::lock_guard lock(mutex_); + std::vector> request_list; - for (auto& pair : new_async_requests_) { - curl_multi_add_handle( - multi_handle_, reinterpret_cast(pair.first)); - - ongoing_async_requests[pair.first] = std::move(pair.second); + // sleep if no work is available + 
std::unique_lock lock(mutex_); + cv_.wait(lock, [this] { + if (this->exiting_) { + return true; } - new_async_requests_.clear(); - } - - CURLMcode mc = curl_multi_perform(multi_handle_, &still_running); - - if (mc != CURLM_OK) { - std::cerr << "Unexpected error: curl_multi failed. Code:" << mc + // wake up if an async request has been generated + return !this->ongoing_async_requests_.empty(); + }); + + CURLMcode mc = curl_multi_perform(multi_handle_, &place_holder); + int numfds; + if (mc == CURLM_OK) { + // Wait for activity. If there are no descriptors in the multi_handle_ + // then curl_multi_wait will return immediately + mc = curl_multi_wait(multi_handle_, NULL, 0, INT_MAX, &numfds); + if (mc == CURLM_OK) { + while ((msg = curl_multi_info_read(multi_handle_, &place_holder))) { + uintptr_t identifier = reinterpret_cast(msg->easy_handle); + auto itr = ongoing_async_requests_.find(identifier); + // This shouldn't happen + if (itr == ongoing_async_requests_.end()) { + std::cerr + << "Unexpected error: received completed request that is not " + "in the list of asynchronous requests" << std::endl; - continue; - } - - while ((msg = curl_multi_info_read(multi_handle_, &messages_in_queue))) { - if (msg->msg != CURLMSG_DONE) { - // Something wrong happened. - std::cerr << "Unexpected error: received CURLMsg=" << msg->msg + curl_multi_remove_handle(multi_handle_, msg->easy_handle); + curl_easy_cleanup(msg->easy_handle); + continue; + } + + uint32_t http_code = 400; + if (msg->data.result == CURLE_OK) { + curl_easy_getinfo( + msg->easy_handle, CURLINFO_RESPONSE_CODE, &http_code); + } else if (msg->data.result == CURLE_OPERATION_TIMEDOUT) { + http_code = 499; + } + + request_list.emplace_back(std::move(itr->second)); + ongoing_async_requests_.erase(itr); + curl_multi_remove_handle(multi_handle_, msg->easy_handle); + curl_easy_cleanup(msg->easy_handle); + + std::unique_ptr& async_request = request_list.back(); + async_request->http_code_ = http_code; + + if (msg->msg != CURLMSG_DONE) { + // Something wrong happened. + std::cerr << "Unexpected error: received CURLMsg=" << msg->msg + << std::endl; + } + } + } else { + std::cerr << "Unexpected error: curl_multi failed. 
Code:" << mc << std::endl; - continue; } - - uintptr_t identifier = reinterpret_cast(msg->easy_handle); - auto itr = ongoing_async_requests.find(identifier); - // This shouldn't happen - if (itr == ongoing_async_requests.end()) { - std::cerr << "Unexpected error: received completed request that is not " - "in the list of asynchronous requests" - << std::endl; - curl_multi_remove_handle(multi_handle_, msg->easy_handle); - curl_easy_cleanup(msg->easy_handle); - continue; - } - - uint32_t http_code = 400; - if (msg->data.result == CURLE_OK) { - curl_easy_getinfo(msg->easy_handle, CURLINFO_RESPONSE_CODE, &http_code); - } else if (msg->data.result == CURLE_OPERATION_TIMEDOUT) { - http_code = 499; - } - - itr->second->http_code_ = http_code; - itr->second->completion_callback_(itr->second.get()); - ongoing_async_requests.erase(itr); - curl_multi_remove_handle(multi_handle_, msg->easy_handle); - curl_easy_cleanup(msg->easy_handle); - } - - - // Wait for activity on existing requests or - // explicit curl_multi_wakeup call - // - // If there are no descriptors in the multi_handle_ - // then curl_multi_poll will wait until curl_multi_wakeup - // is called - // - // curl_multi_wakeup is called when adding a new request - // or exiting - - mc = curl_multi_poll(multi_handle_, NULL, 0, INT_MAX, &numfds); - - if (mc != CURLM_OK) { + } else { std::cerr << "Unexpected error: curl_multi failed. Code:" << mc << std::endl; } + lock.unlock(); + for (auto& this_request : request_list) { + this_request->completion_callback_(this_request.get()); + } } while (!exiting_); - - for (auto& request : ongoing_async_requests) { - CURL* easy_handle = reinterpret_cast(request.first); - curl_multi_remove_handle(multi_handle_, easy_handle); - curl_easy_cleanup(easy_handle); - } - - for (auto& request : new_async_requests_) { - CURL* easy_handle = reinterpret_cast(request.first); - curl_easy_cleanup(easy_handle); - } } }}}} // namespace triton::perfanalyzer::clientbackend::openai diff --git a/src/c++/perf_analyzer/client_backend/openai/http_client.h b/src/c++/perf_analyzer/client_backend/openai/http_client.h index 7ff9bb14e..6b78d836e 100644 --- a/src/c++/perf_analyzer/client_backend/openai/http_client.h +++ b/src/c++/perf_analyzer/client_backend/openai/http_client.h @@ -149,6 +149,7 @@ class HttpClient { std::thread worker_; std::mutex mutex_; + std::condition_variable cv_; // The server url const std::string url_; @@ -158,9 +159,9 @@ class HttpClient { using AsyncReqMap = std::map>; // curl multi handle for processing asynchronous requests void* multi_handle_; - // map to record new asynchronous requests with pointer to easy handle + // map to record ongoing asynchronous requests with pointer to easy handle // or tag id as key - AsyncReqMap new_async_requests_; + AsyncReqMap ongoing_async_requests_; bool verbose_; diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_client.cc b/src/c++/perf_analyzer/client_backend/openai/openai_client.cc index cd517f6a6..9b167fae1 100644 --- a/src/c++/perf_analyzer/client_backend/openai/openai_client.cc +++ b/src/c++/perf_analyzer/client_backend/openai/openai_client.cc @@ -63,6 +63,7 @@ namespace openai { void ChatCompletionRequest::SendResponse(bool is_final, bool is_null) { + final_response_sent_ = is_final; response_callback_(new ChatCompletionResult( http_code_, std::move(response_buffer_), is_final, is_null, request_id_)); } @@ -172,7 +173,11 @@ ChatCompletionClient::AsyncInfer( request->timer_.CaptureTimestamp( triton::client::RequestTimers::Kind::REQUEST_END); 
UpdateInferStat(request->timer_); - if (!request->is_stream_) { + + // Send final response on request completion + // if it has not already been sent. + // (e.g. in the case of seeing [DONE] in streaming case) + if (!request->IsFinalResponseSent()) { request->SendResponse(true /* is_final */, false /* is_null */); } }; diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_client.h b/src/c++/perf_analyzer/client_backend/openai/openai_client.h index aadcb3252..00ccbd5fa 100644 --- a/src/c++/perf_analyzer/client_backend/openai/openai_client.h +++ b/src/c++/perf_analyzer/client_backend/openai/openai_client.h @@ -121,12 +121,14 @@ class ChatCompletionRequest : public HttpRequest { request_id_(request_id) { } + bool IsFinalResponseSent() { return final_response_sent_; }; void SendResponse(bool is_final, bool is_null); bool is_stream_{false}; std::function response_callback_{nullptr}; // The timers for infer request. triton::client::RequestTimers timer_; const std::string request_id_; + bool final_response_sent_{false}; }; class ChatCompletionClient : public HttpClient { diff --git a/src/c++/perf_analyzer/client_backend/triton_c_api/triton_c_api_backend.cc b/src/c++/perf_analyzer/client_backend/triton_c_api/triton_c_api_backend.cc index e97f1ea80..3803fbbf0 100644 --- a/src/c++/perf_analyzer/client_backend/triton_c_api/triton_c_api_backend.cc +++ b/src/c++/perf_analyzer/client_backend/triton_c_api/triton_c_api_backend.cc @@ -55,8 +55,7 @@ TritonCApiClientBackend::Create( std::unique_ptr triton_client_backend( new TritonCApiClientBackend()); - RETURN_IF_ERROR( - TritonLoader::Create(triton_server_path, model_repository_path, verbose)); + TritonLoader::Create(triton_server_path, model_repository_path, verbose); *client_backend = std::move(triton_client_backend); return Error::Success; } diff --git a/src/c++/perf_analyzer/data_loader.cc b/src/c++/perf_analyzer/data_loader.cc index 38bfe9403..c3a5170ce 100644 --- a/src/c++/perf_analyzer/data_loader.cc +++ b/src/c++/perf_analyzer/data_loader.cc @@ -1,4 +1,4 @@ -// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -38,34 +38,6 @@ DataLoader::DataLoader(const size_t batch_size) { } -cb::Error -DataLoader::ValidateIOExistsInModel( - const std::shared_ptr& inputs, - const std::shared_ptr& outputs, - const std::string& data_directory) -{ - if (!std::filesystem::exists(data_directory) || - !std::filesystem::is_directory(data_directory)) { - return cb::Error( - "Error: Directory does not exist or is not a directory: " + - std::string(data_directory), - pa::GENERIC_ERROR); - } - - for (const auto& file : std::filesystem::directory_iterator(data_directory)) { - std::string io_name = file.path().filename().string(); - if (inputs->find(io_name) == inputs->end() && - outputs->find(io_name) == outputs->end()) { - return cb::Error( - "Provided data file '" + io_name + - "' does not correspond to a valid model input or output.", - pa::GENERIC_ERROR); - } - } - - return cb::Error::Success; -} - cb::Error DataLoader::ReadDataFromDir( const std::shared_ptr& inputs, @@ -406,7 +378,6 @@ DataLoader::GetOutputData( data.data_ptr = nullptr; data.batch1_size = 0; data.is_valid = false; - data.name = ""; // If json data is available then try to retrieve the data from there if (!output_data_.empty()) { @@ -422,7 +393,6 @@ DataLoader::GetOutputData( data.is_valid = true; data.batch1_size = data_vec->size(); data.data_ptr = (const uint8_t*)data_vec->data(); - data.name = output_name; } } return cb::Error::Success; @@ -477,11 +447,9 @@ DataLoader::ReadTensorData( const std::shared_ptr& tensors, const int stream_index, const int step_index, const bool is_input) { - std::unordered_set model_io_names; auto& tensor_data = is_input ? input_data_ : output_data_; auto& tensor_shape = is_input ? input_shapes_ : output_shapes_; for (const auto& io : *tensors) { - model_io_names.insert(io.first); if (step.HasMember(io.first.c_str())) { std::string key_name( io.first + "_" + std::to_string(stream_index) + "_" + @@ -570,19 +538,6 @@ DataLoader::ReadTensorData( } } - // Add allowed non-model inputs/outputs to the model_io_names set - model_io_names.insert("model"); - - for (auto itr = step.MemberBegin(); itr != step.MemberEnd(); ++itr) { - if (model_io_names.find(itr->name.GetString()) == model_io_names.end()) { - return cb::Error( - "The input or output '" + std::string(itr->name.GetString()) + - "' is not found in the model configuration", - pa::GENERIC_ERROR); - } - } - - return cb::Error::Success; } diff --git a/src/c++/perf_analyzer/data_loader.h b/src/c++/perf_analyzer/data_loader.h index 2f83f959f..0a7e91aec 100644 --- a/src/c++/perf_analyzer/data_loader.h +++ b/src/c++/perf_analyzer/data_loader.h @@ -25,9 +25,7 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma once -#include #include -#include #include "model_parser.h" #include "perf_utils.h" @@ -58,22 +56,9 @@ class DataLoader { return 0; } - /// Validate user-supplied inputs and outputs exist in the model - /// \param inputs The pointer to the map holding the information about - /// input tensors of a model - /// \param outputs The pointer to the map holding the information about - /// output tensors of a model - /// \param data_directory The path to the directory containing the data - cb::Error ValidateIOExistsInModel( - const std::shared_ptr& inputs, - const std::shared_ptr& outputs, - const std::string& data_directory); - /// Reads the input data from the specified data directory. 
/// \param inputs The pointer to the map holding the information about /// input tensors of a model - /// \param outputs The pointer to the map holding the information about - /// output tensors of a model /// \param data_directory The path to the directory containing the data cb::Error ReadDataFromDir( const std::shared_ptr& inputs, diff --git a/src/c++/perf_analyzer/docs/cli.md b/src/c++/perf_analyzer/docs/cli.md index bd82415c8..399596fd6 100644 --- a/src/c++/perf_analyzer/docs/cli.md +++ b/src/c++/perf_analyzer/docs/cli.md @@ -157,13 +157,6 @@ will also be reported in the results. Default is `-1` indicating that the average latency is used to determine stability. -#### `--request-count=` - -Specifies a total number of requests to use for measurement. - -Default is `0`, which means that there is no request count and the measurement -will proceed using windows until stabilization is detected. - #### `-r ` #### `--max-trials=` diff --git a/src/c++/perf_analyzer/docs/input_data.md b/src/c++/perf_analyzer/docs/input_data.md index af2328fcd..aa2448632 100644 --- a/src/c++/perf_analyzer/docs/input_data.md +++ b/src/c++/perf_analyzer/docs/input_data.md @@ -37,10 +37,9 @@ of your model. You can select a different input data mode with the generates random data once per input and reuses that for all inferences - _zero_: Send zeros for each input. - directory path: A path to a directory containing a binary file for each input, - named the same as the input (and optionally a binary file for each output for - validation, named the same as the output). Each binary file must contain the - data required for that input/output for a batch-1 request. Each file should - contain the raw binary representation of the input/output in row-major order. + named the same as the input. Each binary file must contain the data required + for that input for a batch-1 request. Each file should contain the raw binary + representation of the input in row-major order. - file path: A path to a JSON file containing data to be used with every inference request. See the "Real Input Data" section for further details. [`--input-data`](cli.md#--input-datazerorandompath) can be provided multiple diff --git a/src/c++/perf_analyzer/genai-perf/README.md b/src/c++/perf_analyzer/genai-perf/README.md index 1d03b3dd0..eebea223c 100644 --- a/src/c++/perf_analyzer/genai-perf/README.md +++ b/src/c++/perf_analyzer/genai-perf/README.md @@ -62,49 +62,43 @@ Available starting with the 24.03 release of the Run the Triton Inference Server SDK docker container: ```bash -export RELEASE="yy.mm" # e.g. export RELEASE="24.03" +export RELEASE="mm.yy" # e.g. export RELEASE="24.03" docker run -it --net=host --gpus=all nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk ``` -
- -Alternatively, to install from source: - -## From Source - -GenAI-Perf depends on Perf Analyzer. Here is how to install Perf Analyzer: - -### Install Perf Analyzer (Ubuntu, Python 3.8+) - -Note: you must already have CUDA 12 installed. +Run GenAI-Perf: ```bash -pip install tritonclient - -apt update && apt install -y --no-install-recommends libb64-0d libcurl4 +genai-perf --help ``` -Alternatively, you can install Perf Analyzer -[from source](../docs/install.md#build-from-source). +
+ +To install from source: -### Install GenAI-Perf from source +## From Source + +This method requires that Perf Analyzer is installed in your development +environment and that you have at least Python 3.10 installed. To build Perf +Analyzer from source, see +[here](../docs/install.md#build-from-source). ```bash -export RELEASE="yy.mm" # e.g. export RELEASE="24.03" +export RELEASE="mm.yy" # e.g. export RELEASE="24.03" pip install "git+https://github.com/triton-inference-server/client.git@r${RELEASE}#subdirectory=src/c++/perf_analyzer/genai-perf" ``` -
-
- Run GenAI-Perf: ```bash genai-perf --help ``` +
+
+ # Quick Start ## Measuring Throughput and Latency of GPT2 using Triton + TensorRT-LLM @@ -117,7 +111,7 @@ genai-perf --help 1. Run Triton Inference Server with TensorRT-LLM backend container: ```bash -export RELEASE="yy.mm" # e.g. export RELEASE="24.03" +export RELEASE="mm.yy" # e.g. export RELEASE="24.03" docker run -it --net=host --rm --gpus=all --shm-size=2g --ulimit memlock=-1 --ulimit stack=67108864 nvcr.io/nvidia/tritonserver:${RELEASE}-trtllm-python-py3 ``` @@ -154,7 +148,7 @@ triton start 1. Run Triton Inference Server SDK container: ```bash -export RELEASE="yy.mm" # e.g. export RELEASE="24.03" +export RELEASE="mm.yy" # e.g. export RELEASE="24.03" docker run -it --net=host --rm --gpus=all nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk ``` @@ -162,7 +156,7 @@ docker run -it --net=host --rm --gpus=all nvcr.io/nvidia/tritonserver:${RELEASE} 2. Run GenAI-Perf: ```bash -genai-perf profile \ +genai-perf \ -m gpt2 \ --service-kind triton \ --backend tensorrtllm \ @@ -191,8 +185,8 @@ Example output: │ Time to first token (ns) │ 13,266,974 │ 11,818,732 │ 18,351,779 │ 16,513,479 │ 13,741,986 │ 13,544,376 │ │ Inter token latency (ns) │ 2,069,766 │ 42,023 │ 15,307,799 │ 3,256,375 │ 3,020,580 │ 2,090,930 │ │ Request latency (ns) │ 223,532,625 │ 219,123,330 │ 241,004,192 │ 238,198,306 │ 229,676,183 │ 224,715,918 │ -│ Output sequence length │ 104 │ 100 │ 129 │ 128 │ 109 │ 105 │ -│ Input sequence length │ 199 │ 199 │ 199 │ 199 │ 199 │ 199 │ +│ Num output token │ 104 │ 100 │ 129 │ 128 │ 109 │ 105 │ +│ Num input token │ 199 │ 199 │ 199 │ 199 │ 199 │ 199 │ └──────────────────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┘ Output token throughput (per sec): 460.42 Request throughput (per sec): 4.44 @@ -209,7 +203,7 @@ current profile run. This is disabled by default but users can easily enable it by passing the `--generate-plots` option when running the benchmark: ```bash -genai-perf profile \ +genai-perf \ -m gpt2 \ --service-kind triton \ --backend tensorrtllm \ @@ -221,9 +215,9 @@ genai-perf profile \ This will generate a [set of default plots](docs/compare.md#example-plots) such as: - Time to first token (TTFT) analysis - Request latency analysis -- TTFT vs Input sequence lengths +- TTFT vs Number of input tokens - Inter token latencies vs Token positions -- Input sequence lengths vs Output sequence lengths +- Number of input tokens vs Number of output tokens ## Using `compare` Subcommand to Visualize Multiple Runs @@ -245,15 +239,15 @@ Executing the above command will perform the following actions under the 1. Generate a YAML configuration file (e.g. `config.yaml`) containing the metadata for each plot generated during the comparison process. 2. Automatically generate the [default set of plots](docs/compare.md#example-plots) -(e.g. TTFT vs. Input Sequence Lengths) that compare the two profile runs. +(e.g. TTFT vs. Number of Input Tokens) that compare the two profile runs. ``` compare ├── config.yaml -├── distribution_of_input_sequence_lengths_to_output_sequence_lengths.jpeg +├── distribution_of_input_tokens_to_generated_tokens.jpeg ├── request_latency.jpeg ├── time_to_first_token.jpeg -├── time_to_first_token_vs_input_sequence_lengths.jpeg +├── time_to_first_token_vs_number_of_input_tokens.jpeg ├── token-to-token_latency_vs_output_token_position.jpeg └── ... 
``` @@ -301,8 +295,8 @@ options: When the dataset is coming from a file, you can specify the following options: -* `--input-file `: The input file containing the prompts to - use for benchmarking as JSON objects. +* `--input-file `: The input file containing the single prompt to + use for benchmarking. For any dataset, you can specify the following options: * `--output-tokens-mean `: The mean number of tokens in each output. Ensure @@ -333,8 +327,7 @@ the inference server. | Time to First Token | Time between when a request is sent and when its first response is received, one value per request in benchmark | Avg, min, max, p99, p90, p75 | | Inter Token Latency | Time between intermediate responses for a single request divided by the number of generated tokens of the latter response, one value per response per request in benchmark | Avg, min, max, p99, p90, p75 | | Request Latency | Time between when a request is sent and when its final response is received, one value per request in benchmark | Avg, min, max, p99, p90, p75 | -| Output Sequence Length | Total number of output tokens of a request, one value per request in benchmark | Avg, min, max, p99, p90, p75 | -| Input Sequence Length | Total number of input tokens of a request, one value per request in benchmark | Avg, min, max, p99, p90, p75 | +| Number of Output Tokens | Total number of output tokens of a request, one value per request in benchmark | Avg, min, max, p99, p90, p75 | | Output Token Throughput | Total number of output tokens from benchmark divided by benchmark duration | None–one value per benchmark | | Request Throughput | Number of final responses from benchmark divided by benchmark duration | None–one value per benchmark | @@ -373,7 +366,7 @@ model config to not echo the input tokens in the output. (default: tensorrtllm) Set a custom endpoint that differs from the OpenAI defaults. (default: `None`) -##### `--endpoint-type {chat,completions,embeddings,rankings}` +##### `--endpoint-type {chat,completions}` The endpoint-type to send requests to on the server. This is only used with the `openai` service-kind. (default: `None`) @@ -394,20 +387,10 @@ URL of the endpoint to target for benchmarking. (default: `None`) ## Input Options -##### `-b ` -##### `--batch-size ` - -The batch size of the requests GenAI-Perf should send. -This is currently only supported with the -[embeddings endpoint type](docs/embeddings.md). -(default: `1`) and -[rankings endpoint type](docs/rankings.md). - ##### `--extra-inputs ` Provide additional inputs to include with every request. You can repeat this flag for multiple inputs. Inputs should be in an input_name:value format. -Alternatively, a string representing a json formatted dict can be provided. (default: `None`) ##### `--input-dataset {openorca,cnn_dailymail}` @@ -417,9 +400,7 @@ The HuggingFace dataset to use for prompts. ##### `--input-file ` -The input file containing the prompts to use for profiling. -Each line should be a JSON object with a 'text_input' field in JSONL format. -Example: {\"text_input\": \"Your prompt here\"}" +The input file containing the single prompt to use for profiling. ##### `--num-prompts ` @@ -485,11 +466,6 @@ infer per second and latency. (default: `999`) ## Output Options -##### `--artifact-dir` - -The directory to store all the (output) artifacts generated by GenAI-Perf and -Perf Analyzer. (default: `artifacts`) - ##### `--generate-plots` An option to enable the generation of plots. 
(default: False) diff --git a/src/c++/perf_analyzer/genai-perf/docs/assets/distribution_of_input_sequence_lengths_to_output_sequence_lengths.jpeg b/src/c++/perf_analyzer/genai-perf/docs/assets/distribution_of_input_sequence_lengths_to_output_sequence_lengths.jpeg deleted file mode 100644 index 1f9b2cba6..000000000 Binary files a/src/c++/perf_analyzer/genai-perf/docs/assets/distribution_of_input_sequence_lengths_to_output_sequence_lengths.jpeg and /dev/null differ diff --git a/src/c++/perf_analyzer/genai-perf/docs/assets/distribution_of_input_tokens_to_generated_tokens.jpeg b/src/c++/perf_analyzer/genai-perf/docs/assets/distribution_of_input_tokens_to_generated_tokens.jpeg new file mode 100644 index 000000000..e51f5f49f Binary files /dev/null and b/src/c++/perf_analyzer/genai-perf/docs/assets/distribution_of_input_tokens_to_generated_tokens.jpeg differ diff --git a/src/c++/perf_analyzer/genai-perf/docs/assets/time_to_first_token_vs_input_sequence_lengths.jpeg b/src/c++/perf_analyzer/genai-perf/docs/assets/time_to_first_token_vs_input_sequence_lengths.jpeg deleted file mode 100644 index 1b81ef532..000000000 Binary files a/src/c++/perf_analyzer/genai-perf/docs/assets/time_to_first_token_vs_input_sequence_lengths.jpeg and /dev/null differ diff --git a/src/c++/perf_analyzer/genai-perf/docs/assets/time_to_first_token_vs_number_of_input_tokens.jpeg b/src/c++/perf_analyzer/genai-perf/docs/assets/time_to_first_token_vs_number_of_input_tokens.jpeg new file mode 100644 index 000000000..f3097064a Binary files /dev/null and b/src/c++/perf_analyzer/genai-perf/docs/assets/time_to_first_token_vs_number_of_input_tokens.jpeg differ diff --git a/src/c++/perf_analyzer/genai-perf/docs/compare.md b/src/c++/perf_analyzer/genai-perf/docs/compare.md index 5d1a36413..a7234a035 100644 --- a/src/c++/perf_analyzer/genai-perf/docs/compare.md +++ b/src/c++/perf_analyzer/genai-perf/docs/compare.md @@ -76,11 +76,11 @@ plot2: - profile3.json output: compare plot3: - title: Distribution of Input Sequence Lengths to Output Sequence Lengths - x_metric: input_sequence_lengths - y_metric: output_sequence_lengths - x_label: Input Sequence Length - y_label: Output Sequence Length + title: Distribution of Input Tokens to Generated Tokens + x_metric: num_input_tokens + y_metric: num_output_tokens + x_label: Number of Input Tokens Per Request + y_label: Number of Generated Tokens Per Request width: 1200 height: 450 type: heatmap @@ -90,10 +90,10 @@ plot3: - profile3.json output: compare plot4: - title: Time to First Token vs Input Sequence Lengths - x_metric: input_sequence_lengths + title: Time to First Token vs Number of Input Tokens + x_metric: num_input_tokens y_metric: time_to_first_tokens - x_label: Input Sequence Length + x_label: Number of Input Tokens y_label: Time to First Token (ms) width: 1200 height: 700 @@ -234,8 +234,8 @@ configuration file. Here are the list of sample plots that gets created by default from running the `compare` subcommand: -### Distribution of Input Sequence Lengths to Output Sequence Lengths - +### Distribution of Input Tokens to Generated Tokens + ### Request Latency Analysis @@ -243,8 +243,8 @@ Here are the list of sample plots that gets created by default from running the ### Time to First Token Analysis -### Time to First Token vs. Input Sequence Lengths - +### Time to First Token vs. Number of Input Tokens + ### Token-to-Token Latency vs. 
Output Token Position diff --git a/src/c++/perf_analyzer/genai-perf/docs/embeddings.md b/src/c++/perf_analyzer/genai-perf/docs/embeddings.md deleted file mode 100644 index e508f9eff..000000000 --- a/src/c++/perf_analyzer/genai-perf/docs/embeddings.md +++ /dev/null @@ -1,93 +0,0 @@ - - -# Profile Embeddings Models with GenAI-Perf - -GenAI-Perf allows you to profile embedding models running on an -[OpenAI Embeddings API](https://platform.openai.com/docs/api-reference/embeddings)-compatible server. - -## Create a Sample Embeddings Input File - -To create a sample embeddings input file, use the following command: - -```bash -echo '{"text": "What was the first car ever driven?"} -{"text": "Who served as the 5th President of the United States of America?"} -{"text": "Is the Sydney Opera House located in Australia?"} -{"text": "In what state did they film Shrek 2?"}' > embeddings.jsonl -``` - -This will generate a file named embeddings.jsonl with the following content: -```jsonl -{"text": "What was the first car ever driven?"} -{"text": "Who served as the 5th President of the United States of America?"} -{"text": "Is the Sydney Opera House located in Australia?"} -{"text": "In what state did they film Shrek 2?"} -``` - -## Start an OpenAI Embeddings-Compatible Server -To start an OpenAI embeddings-compatible server, run the following command: -```bash -docker run -it --net=host --rm --gpus=all vllm/vllm-openai:latest --model intfloat/e5-mistral-7b-instruct --dtype float16 --max-model-len 1024 -``` - -## Run GenAI-Perf -To profile embeddings models using GenAI-Perf, use the following command: - -```bash -genai-perf profile \ - -m intfloat/e5-mistral-7b-instruct \ - --service-kind openai \ - --endpoint-type embeddings \ - --batch-size 2 \ - --input-file embeddings.jsonl -``` - -This will use default values for optional arguments. You can also pass in -additional arguments with the `--extra-inputs` [flag](../README.md#input-options). -For example, you could use this command: - -```bash -genai-perf profile \ - -m intfloat/e5-mistral-7b-instruct \ - --service-kind openai \ - --endpoint-type embeddings \ - --extra-inputs user:sample_user -``` - -Example output: - -``` - Embeddings Metrics -┏━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┓ -┃ Statistic ┃ avg ┃ min ┃ max ┃ p99 ┃ p90 ┃ p75 ┃ -┡━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━┩ -│ Request latency (ms) │ 42.21 │ 28.18 │ 318.61 │ 56.50 │ 49.21 │ 43.07 │ -└──────────────────────┴───────┴───────┴────────┴───────┴───────┴───────┘ -Request throughput (per sec): 23.63 -``` diff --git a/src/c++/perf_analyzer/genai-perf/docs/files.md b/src/c++/perf_analyzer/genai-perf/docs/files.md index 6ebdf69fa..fb33410d2 100644 --- a/src/c++/perf_analyzer/genai-perf/docs/files.md +++ b/src/c++/perf_analyzer/genai-perf/docs/files.md @@ -58,14 +58,14 @@ The data subdirectory contains the raw and processed performance data files. ##### GZIP Files - all_data.gzip: Aggregated performance data from all collected metrics. -- input_sequence_lengths_vs_output_sequence_lengths.gzip: This contains data on -the input sequence lengths versus the output sequence lengths for each request. +- input_tokens_vs_generated_tokens.gzip: This contains data on the number of +input tokens versus the number of generated tokens for each request. - request_latency.gzip: This contains the latency for each request. - time_to_first_token.gzip: This contains the time to first token for each request. 
- token_to_token_vs_output_position.gzip: This contains the time from one token generation to the next versus the position of the output token for each token. -- ttft_vs_input_sequence_lengths.gzip: This contains the time to first token -versus the input sequence length for each request. +- ttft_vs_input_tokens.gzip: This contains the time to first token versus +the number of input tokens for each request. ##### JSON Files @@ -85,14 +85,14 @@ The images subdirectory contains visual representations of the performance data. All images are in both HTML and JPEG formats. ##### HTML and JPEG Files -- input_sequence_lengths_vs_output_sequence_lengths: A heat map showing the -relationship between input and generated tokens. +- input_tokens_vs_generated_tokens: A heat map showing the relationship +between input and generated tokens. - request_latency: A box plot showing request latency. - time_to_first_token: A box plot showing time to first token. - token_to_token_vs_output_position: A scatterplot showing token-to-token time versus output token position. -- ttft_vs_input_sequence_lengths: A scatterplot showing token-to-token time -versus the input sequence lengths. +- ttft_vs_input_tokens: A scatterplot showing token-to-token time versus the +number of input tokens. ## Usage Instructions @@ -126,4 +126,4 @@ View .html visualizations in a web browser for interactive data exploration. ### JPEG Files -Use an image software to open .jpeg images for static visual representations. +Use an image software to open .jpeg images for static visual representations. \ No newline at end of file diff --git a/src/c++/perf_analyzer/genai-perf/docs/lora.md b/src/c++/perf_analyzer/genai-perf/docs/lora.md index d30867eda..60be30c95 100644 --- a/src/c++/perf_analyzer/genai-perf/docs/lora.md +++ b/src/c++/perf_analyzer/genai-perf/docs/lora.md @@ -26,22 +26,22 @@ OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. --> -# Profile Multiple LoRA Adapters +# Profiling Multiple LoRA Adapters GenAI-Perf allows you to profile multiple LoRA adapters on top of a base model. -## Select LoRA Adapters +## Selecting LoRA Adapters To do this, list multiple adapters after the model name option `-m`: ```bash genai-perf -m lora_adapter1 lora_adapter2 lora_adapter3 ``` -## Choose a Strategy for Selecting Models +## Choosing a Strategy for Selecting Models When profiling with multiple models, you can specify how the models should be assigned to prompts using the `--model-selection-strategy` option: ```bash -genai-perf profile \ +genai-perf \ -m lora_adapter1 lora_adapter2 lora_adapter3 \ --model-selection-strategy round_robin ``` diff --git a/src/c++/perf_analyzer/genai-perf/docs/rankings.md b/src/c++/perf_analyzer/genai-perf/docs/rankings.md deleted file mode 100644 index a316ef857..000000000 --- a/src/c++/perf_analyzer/genai-perf/docs/rankings.md +++ /dev/null @@ -1,100 +0,0 @@ - - -# Profile Ranking Models with GenAI-Perf - - -GenAI-Perf allows you to profile ranking models compatible with Hugging Face's -[Text Embeddings Inference's re-ranker API](https://huggingface.co/docs/text-embeddings-inference/en/quick_tour#re-rankers). 
- -## Create a Sample Rankings Input Directory - -To create a sample rankings input directory, follow these steps: - -Create a directory called rankings_jsonl: -```bash -mkdir rankings_jsonl -``` - -Inside this directory, create a JSONL file named queries.jsonl with queries data: - -```bash -echo '{"text": "What was the first car ever driven?"} -{"text": "Who served as the 5th President of the United States of America?"} -{"text": "Is the Sydney Opera House located in Australia?"} -{"text": "In what state did they film Shrek 2?"}' > rankings_jsonl/queries.jsonl -``` - -Create another JSONL file named passages.jsonl with passages data: - -```bash -echo '{"text": "Eric Anderson (born January 18, 1968) is an American sociologist and sexologist."} -{"text": "Kevin Loader is a British film and television producer."} -{"text": "Francisco Antonio Zea Juan Francisco Antonio Hilari was a Colombian journalist, botanist, diplomat, politician, and statesman who served as the 1st Vice President of Colombia."} -{"text": "Daddys Home 2 Principal photography on the film began in Massachusetts in March 2017 and it was released in the United States by Paramount Pictures on November 10, 2017. Although the film received unfavorable reviews, it has grossed over $180 million worldwide on a $69 million budget."}' > rankings_jsonl/passages.jsonl -``` - -## Start a Hugging Face Re-Ranker-Compatible Server -To start a Hugging Face re-ranker-compatible server, run the following commands: - -```bash -model=BAAI/bge-reranker-large -revision=refs/pr/4 -volume=$PWD/data - -docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.3 --model-id $model --revision $revision -``` - -## Run GenAI-Perf -To profile ranking models using GenAI-Perf, use the following command: - -```bash -genai-perf profile \ - -m BAAI/bge-reranker-large \ - --service-kind openai \ - --endpoint-type rankings \ - --endpoint rerank \ - --input-file rankings_jsonl/ \ - -u localhost:8080 \ - --extra-inputs rankings:tei \ - --batch-size 2 -``` - -This command specifies the use of Hugging Face's ranking API with `--endpoint rerank` and `--extra-inputs rankings:tei`. - -Example output: - -``` - Rankings Metrics -┏━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━┳━━━━━━┓ -┃ Statistic ┃ avg ┃ min ┃ max ┃ p99 ┃ p90 ┃ p75 ┃ -┡━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━╇━━━━━━┩ -│ Request latency (ms) │ 5.48 │ 2.50 │ 23.91 │ 10.27 │ 8.34 │ 6.07 │ -└──────────────────────┴──────┴──────┴───────┴───────┴──────┴──────┘ -Request throughput (per sec): 180.11 -``` diff --git a/src/c++/perf_analyzer/genai-perf/docs/tutorial.md b/src/c++/perf_analyzer/genai-perf/docs/tutorial.md index 6d6f3e301..99d88ad07 100644 --- a/src/c++/perf_analyzer/genai-perf/docs/tutorial.md +++ b/src/c++/perf_analyzer/genai-perf/docs/tutorial.md @@ -26,15 +26,9 @@ OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. --> -# Tutorials +# Tutorial -- [Profile GPT2 running on Triton + TensorRT-LLM](#tensorrt-llm) -- [Profile GPT2 running on Triton + vLLM](#triton-vllm) -- [Profile GPT2 running on OpenAI API-Compatible Server](#openai) - ---- - -## Profile GPT2 running on Triton + TensorRT-LLM +## Measuring Throughput and Latency of GPT2 using Triton + TensorRT-LLM ### Running GPT2 on Triton Inference Server using TensorRT-LLM @@ -44,7 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 1. 
Run Triton Inference Server with TensorRT-LLM backend container: ```bash -export RELEASE="yy.mm" # e.g. export RELEASE="24.03" +export RELEASE="mm.yy" # e.g. export RELEASE="24.03" docker run -it --net=host --rm --gpus=all --shm-size=2g --ulimit memlock=-1 --ulimit stack=67108864 nvcr.io/nvidia/tritonserver:${RELEASE}-trtllm-python-py3 ``` @@ -52,7 +46,14 @@ docker run -it --net=host --rm --gpus=all --shm-size=2g --ulimit memlock=-1 --ul 2. Install Triton CLI (~5 min): ```bash -pip install "git+https://github.com/triton-inference-server/triton_cli@0.0.8" +pip install \ + --extra-index-url https://pypi.nvidia.com \ + -U \ + psutil \ + "pynvml>=11.5.0" \ + torch==2.1.2 \ + tensorrt_llm==0.8.0 \ + "git+https://github.com/triton-inference-server/triton_cli@0.0.6" ``` 3. Download model: @@ -74,7 +75,7 @@ triton start 1. Run Triton Inference Server SDK container: ```bash -export RELEASE="yy.mm" # e.g. export RELEASE="24.03" +export RELEASE="mm.yy" # e.g. export RELEASE="24.03" docker run -it --net=host --rm --gpus=all nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk ``` @@ -82,10 +83,11 @@ docker run -it --net=host --rm --gpus=all nvcr.io/nvidia/tritonserver:${RELEASE} 2. Run GenAI-Perf: ```bash -genai-perf profile \ +genai-perf \ -m gpt2 \ --service-kind triton \ --backend tensorrtllm \ + --prompt-source synthetic \ --num-prompts 100 \ --random-seed 123 \ --synthetic-input-tokens-mean 200 \ @@ -111,14 +113,14 @@ Example output: │ Time to first token (ns) │ 13,266,974 │ 11,818,732 │ 18,351,779 │ 16,513,479 │ 13,741,986 │ 13,544,376 │ │ Inter token latency (ns) │ 2,069,766 │ 42,023 │ 15,307,799 │ 3,256,375 │ 3,020,580 │ 2,090,930 │ │ Request latency (ns) │ 223,532,625 │ 219,123,330 │ 241,004,192 │ 238,198,306 │ 229,676,183 │ 224,715,918 │ -│ Output sequence length │ 104 │ 100 │ 129 │ 128 │ 109 │ 105 │ -│ Input sequence length │ 199 │ 199 │ 199 │ 199 │ 199 │ 199 │ +│ Num output token │ 104 │ 100 │ 129 │ 128 │ 109 │ 105 │ +│ Num input token │ 199 │ 199 │ 199 │ 199 │ 199 │ 199 │ └──────────────────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┘ Output token throughput (per sec): 460.42 Request throughput (per sec): 4.44 ``` -## Profile GPT2 running on Triton + vLLM +## Measuring Throughput and Latency of GPT2 using Triton + vLLM ### Running GPT2 on Triton Inference Server using vLLM @@ -128,7 +130,7 @@ Request throughput (per sec): 4.44 1. Run Triton Inference Server with vLLM backend container: ```bash -export RELEASE="yy.mm" # e.g. export RELEASE="24.03" +export RELEASE="mm.yy" # e.g. export RELEASE="24.03" docker run -it --net=host --rm --gpus=all --shm-size=2g --ulimit memlock=-1 --ulimit stack=67108864 nvcr.io/nvidia/tritonserver:${RELEASE}-vllm-python-py3 ``` @@ -136,7 +138,7 @@ docker run -it --net=host --rm --gpus=all --shm-size=2g --ulimit memlock=-1 --ul 2. Install Triton CLI (~5 min): ```bash -pip install "git+https://github.com/triton-inference-server/triton_cli@0.0.8" +pip install "git+https://github.com/triton-inference-server/triton_cli@0.0.6" ``` 3. Download model: @@ -158,7 +160,7 @@ triton start 1. Run Triton Inference Server SDK container: ```bash -export RELEASE="yy.mm" # e.g. export RELEASE="24.03" +export RELEASE="mm.yy" # e.g. export RELEASE="24.03" docker run -it --net=host --rm --gpus=all nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk ``` @@ -166,10 +168,11 @@ docker run -it --net=host --rm --gpus=all nvcr.io/nvidia/tritonserver:${RELEASE} 2. 
Run GenAI-Perf: ```bash -genai-perf profile \ +genai-perf \ -m gpt2 \ --service-kind triton \ --backend vllm \ + --prompt-source synthetic \ --num-prompts 100 \ --random-seed 123 \ --synthetic-input-tokens-mean 200 \ @@ -195,14 +198,14 @@ Example output: │ Time to first token (ns) │ 15,786,560 │ 11,437,189 │ 49,550,549 │ 40,129,652 │ 21,248,091 │ 17,824,695 │ │ Inter token latency (ns) │ 3,543,380 │ 591,898 │ 10,013,690 │ 6,152,260 │ 5,039,278 │ 4,060,982 │ │ Request latency (ns) │ 388,415,721 │ 312,552,612 │ 528,229,817 │ 518,189,390 │ 484,281,365 │ 459,417,637 │ -│ Output sequence length │ 113 │ 105 │ 123 │ 122 │ 119 │ 115 │ -│ Input sequence length │ 199 │ 199 │ 199 │ 199 │ 199 │ 199 │ +│ Num output token │ 113 │ 105 │ 123 │ 122 │ 119 │ 115 │ +│ Num input token │ 199 │ 199 │ 199 │ 199 │ 199 │ 199 │ └──────────────────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┘ Output token throughput (per sec): 290.24 Request throughput (per sec): 2.57 ``` -## Profile GPT2 running on OpenAI API-Compatible Server +## Measuring Throughput and Latency of GPT2 using OpenAI API-Compatible Server ### OpenAI Chat Completions API @@ -224,7 +227,7 @@ docker run -it --net=host --rm --gpus=all vllm/vllm-openai:latest --model gpt2 - 1. Run Triton Inference Server SDK container: ```bash -export RELEASE="yy.mm" # e.g. export RELEASE="24.03" +export RELEASE="mm.yy" # e.g. export RELEASE="24.03" docker run -it --net=host --rm --gpus=all nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk ``` @@ -232,11 +235,12 @@ docker run -it --net=host --rm --gpus=all nvcr.io/nvidia/tritonserver:${RELEASE} 2. Run GenAI-Perf: ```bash -genai-perf profile \ +genai-perf \ -m gpt2 \ --service-kind openai \ --endpoint v1/chat/completions \ --endpoint-type chat \ + --prompt-source synthetic \ --num-prompts 100 \ --random-seed 123 \ --synthetic-input-tokens-mean 200 \ @@ -261,8 +265,8 @@ Example output: │ Time to first token (ns) │ 13,546,815 │ 9,821,658 │ 48,317,756 │ 34,361,913 │ 16,541,625 │ 14,612,026 │ │ Inter token latency (ns) │ 2,560,813 │ 457,703 │ 6,507,334 │ 3,754,617 │ 3,059,158 │ 2,953,540 │ │ Request latency (ns) │ 283,597,027 │ 240,098,890 │ 361,730,568 │ 349,164,037 │ 323,279,761 │ 306,507,562 │ -│ Output sequence length │ 114 │ 103 │ 142 │ 136 │ 122 │ 119 │ -│ Input sequence length │ 199 │ 199 │ 199 │ 199 │ 199 │ 199 │ +│ Num output token │ 114 │ 103 │ 142 │ 136 │ 122 │ 119 │ +│ Num input token │ 199 │ 199 │ 199 │ 199 │ 199 │ 199 │ └──────────────────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┘ Output token throughput (per sec): 401.62 Request throughput (per sec): 3.52 @@ -288,7 +292,7 @@ docker run -it --net=host --rm --gpus=all vllm/vllm-openai:latest --model gpt2 - 1. Run Triton Inference Server SDK container: ```bash -export RELEASE="yy.mm" # e.g. export RELEASE="24.03" +export RELEASE="mm.yy" # e.g. export RELEASE="24.03" docker run -it --net=host --rm --gpus=all nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk ``` @@ -296,11 +300,12 @@ docker run -it --net=host --rm --gpus=all nvcr.io/nvidia/tritonserver:${RELEASE} 2. 
Run GenAI-Perf: ```bash -genai-perf profile \ +genai-perf \ -m gpt2 \ --service-kind openai \ --endpoint v1/completions \ --endpoint-type completions \ + --prompt-source synthetic \ --num-prompts 100 \ --random-seed 123 \ --synthetic-input-tokens-mean 200 \ @@ -318,13 +323,13 @@ Example output: ``` LLM Metrics -┏━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓ -┃ Statistic ┃ avg ┃ min ┃ max ┃ p99 ┃ p90 ┃ p75 ┃ -┡━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩ -│ Request latency (ns) │ 296,990,497 │ 43,312,449 │ 332,788,242 │ 327,475,292 │ 317,392,767 │ 310,343,333 │ -│ Output sequence length │ 109 │ 11 │ 158 │ 142 │ 118 │ 113 │ -│ Input sequence length │ 1 │ 1 │ 1 │ 1 │ 1 │ 1 │ -└────────────────────────┴─────────────┴────────────┴─────────────┴─────────────┴─────────────┴─────────────┘ +┏━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓ +┃ Statistic ┃ avg ┃ min ┃ max ┃ p99 ┃ p90 ┃ p75 ┃ +┡━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩ +│ Request latency (ns) │ 296,990,497 │ 43,312,449 │ 332,788,242 │ 327,475,292 │ 317,392,767 │ 310,343,333 │ +│ Num output token │ 109 │ 11 │ 158 │ 142 │ 118 │ 113 │ +│ Num input token │ 1 │ 1 │ 1 │ 1 │ 1 │ 1 │ +└──────────────────────┴─────────────┴────────────┴─────────────┴─────────────┴─────────────┴─────────────┘ Output token throughput (per sec): 366.78 Request throughput (per sec): 3.37 ``` diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/__init__.py b/src/c++/perf_analyzer/genai-perf/genai_perf/__init__.py index cb5c26999..025456b0f 100644 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/__init__.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/__init__.py @@ -24,4 +24,4 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -__version__ = "0.0.4dev" +__version__ = "0.0.3dev" diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/constants.py b/src/c++/perf_analyzer/genai-perf/genai_perf/constants.py index b951524bf..df2f6f7bb 100644 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/constants.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/constants.py @@ -31,6 +31,7 @@ OPEN_ORCA = "openorca" CNN_DAILY_MAIL = "cnn_dailymail" DEFAULT_INPUT_DATA_JSON = "llm_inputs.json" +DEFAULT_OUTPUT_DATA_JSON = "profile_export_genai_perf.json" DEFAULT_ARTIFACT_DIR = "artifacts" diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/export_data/console_exporter.py b/src/c++/perf_analyzer/genai-perf/genai_perf/export_data/console_exporter.py deleted file mode 100644 index 460fe5976..000000000 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/export_data/console_exporter.py +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. 
-# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -from genai_perf.export_data.exporter_config import ExporterConfig -from rich.console import Console -from rich.table import Table - - -class ConsoleExporter: - """ - A class to export the statistics and arg values to the console. - """ - - STAT_COLUMN_KEYS = ["avg", "min", "max", "p99", "p90", "p75"] - - def __init__(self, config: ExporterConfig): - self._stats = config.stats - self._metrics = config.metrics - self._args = config.args - - def _get_title(self): - if self._args.endpoint_type == "embeddings": - return "Embeddings Metrics" - elif self._args.endpoint_type == "rankings": - return "Rankings Metrics" - else: - return "LLM Metrics" - - def export(self) -> None: - table = Table(title=self._get_title()) - - table.add_column("Statistic", justify="right", style="cyan", no_wrap=True) - for stat in self.STAT_COLUMN_KEYS: - table.add_column(stat, justify="right", style="green") - - # Request metrics table - self._construct_table(table) - - console = Console() - console.print(table) - - # System metrics are printed after the table - for metric in self._metrics.system_metrics: - line = metric.name.replace("_", " ").capitalize() - value = self._stats[metric.name]["avg"] - line += f" ({metric.unit}): {value:.2f}" - print(line) - - def _construct_table(self, table: Table) -> None: - for metric in self._metrics.request_metrics: - if self._should_skip(metric.name): - continue - - metric_str = metric.name.replace("_", " ").capitalize() - metric_str += f" ({metric.unit})" if metric.unit != "tokens" else "" - row_values = [metric_str] - for stat in self.STAT_COLUMN_KEYS: - value = self._stats[metric.name][stat] - row_values.append(f"{value:,.2f}") - - table.add_row(*row_values) - - # (TMA-1976) Refactor this method as the csv exporter shares identical method. - def _should_skip(self, metric_name: str) -> bool: - if self._args.endpoint_type == "embeddings": - return False # skip nothing - - # TODO (TMA-1712): need to decide if we need this metric. Remove - # from statistics display for now. - # TODO (TMA-1678): output_token_throughput_per_request is treated - # separately since the current code treats all throughput metrics to - # be displayed outside of the statistics table. 
- if metric_name == "output_token_throughput_per_request": - return True - - # When non-streaming, skip ITL and TTFT - streaming_metrics = [ - "inter_token_latency", - "time_to_first_token", - ] - if not self._args.streaming and metric_name in streaming_metrics: - return True - return False diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/export_data/csv_exporter.py b/src/c++/perf_analyzer/genai-perf/genai_perf/export_data/csv_exporter.py deleted file mode 100644 index efbb9b754..000000000 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/export_data/csv_exporter.py +++ /dev/null @@ -1,119 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -import csv - -import genai_perf.logging as logging -from genai_perf.export_data.exporter_config import ExporterConfig - -DEFAULT_OUTPUT_DATA_CSV = "profile_export_genai_perf.csv" - -logger = logging.getLogger(__name__) - - -class CsvExporter: - """ - A class to export the statistics and arg values in a csv format. 
- """ - - REQUEST_METRICS_HEADER = [ - "Metric", - "avg", - "min", - "max", - "p99", - "p95", - "p90", - "p75", - "p50", - "p25", - ] - - SYSTEM_METRICS_HEADER = [ - "Metric", - "Value", - ] - - def __init__(self, config: ExporterConfig): - self._stats = config.stats - self._metrics = config.metrics - self._output_dir = config.artifact_dir - self._args = config.args - - def export(self) -> None: - csv_filename = self._output_dir / DEFAULT_OUTPUT_DATA_CSV - logger.info(f"Generating {csv_filename}") - - with open(csv_filename, mode="w", newline="") as csvfile: - csv_writer = csv.writer(csvfile) - self._write_request_metrics(csv_writer) - csv_writer.writerow([]) - self._write_system_metrics(csv_writer) - - def _write_request_metrics(self, csv_writer) -> None: - csv_writer.writerow(self.REQUEST_METRICS_HEADER) - for metric in self._metrics.request_metrics: - if self._should_skip(metric.name): - continue - - metric_str = metric.name.replace("_", " ").title() - metric_str += f" ({metric.unit})" if metric.unit != "tokens" else "" - row_values = [metric_str] - for stat in self.REQUEST_METRICS_HEADER[1:]: - value = self._stats[metric.name][stat] - row_values.append(f"{value:,.2f}") - - csv_writer.writerow(row_values) - - def _write_system_metrics(self, csv_writer) -> None: - csv_writer.writerow(self.SYSTEM_METRICS_HEADER) - for metric in self._metrics.system_metrics: - metric_str = metric.name.replace("_", " ").title() - metric_str += f" ({metric.unit})" - value = self._stats[metric.name]["avg"] - csv_writer.writerow([metric_str, f"{value:.2f}"]) - - def _should_skip(self, metric_name: str) -> bool: - if self._args.endpoint_type == "embeddings": - return False # skip nothing - - # TODO (TMA-1712): need to decide if we need this metric. Remove - # from statistics display for now. - # TODO (TMA-1678): output_token_throughput_per_request is treated - # separately since the current code treats all throughput metrics to - # be displayed outside of the statistics table. - if metric_name == "output_token_throughput_per_request": - return True - - # When non-streaming, skip ITL and TTFT - streaming_metrics = [ - "inter_token_latency", - "time_to_first_token", - ] - if not self._args.streaming and metric_name in streaming_metrics: - return True - return False diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/export_data/data_exporter_factory.py b/src/c++/perf_analyzer/genai-perf/genai_perf/export_data/data_exporter_factory.py deleted file mode 100644 index ac226bdf5..000000000 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/export_data/data_exporter_factory.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from typing import List - -from genai_perf.export_data.console_exporter import ConsoleExporter -from genai_perf.export_data.csv_exporter import CsvExporter -from genai_perf.export_data.exporter_config import ExporterConfig -from genai_perf.export_data.json_exporter import JsonExporter - -DataExporterList = [ConsoleExporter, JsonExporter, CsvExporter] - - -class DataExporterFactory: - def create_data_exporters(self, config: ExporterConfig) -> List: - data_exporters = [] - for exporter in DataExporterList: - data_exporters.append(exporter(config)) - return data_exporters diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/export_data/data_exporter_interface.py b/src/c++/perf_analyzer/genai-perf/genai_perf/export_data/data_exporter_interface.py deleted file mode 100644 index 56bde9a53..000000000 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/export_data/data_exporter_interface.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
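As an aside, the export_data modules deleted above share a small plug-in shape: every exporter exposes an `export()` method, and a factory instantiates each registered exporter with the same shared configuration. The following standalone sketch illustrates that pattern with simplified, made-up names (it is not part of this diff and does not use the project's real classes):

```python
# Minimal sketch of a Protocol-based exporter plug-in pattern.
# Names here (Exporter, StdoutExporter, CountExporter, create_exporters)
# are illustrative only, not the project's API.
from typing import List, Protocol


class Exporter(Protocol):
    def export(self) -> None: ...


class StdoutExporter:
    def __init__(self, stats: dict) -> None:
        self._stats = stats

    def export(self) -> None:
        # Print one metric per line.
        for name, value in self._stats.items():
            print(f"{name}: {value}")


class CountExporter:
    def __init__(self, stats: dict) -> None:
        self._stats = stats

    def export(self) -> None:
        print(f"{len(self._stats)} metrics collected")


def create_exporters(stats: dict) -> List[Exporter]:
    # The factory instantiates every registered exporter with the same config.
    registry = [StdoutExporter, CountExporter]
    return [exporter_cls(stats) for exporter_cls in registry]


if __name__ == "__main__":
    for exporter in create_exporters({"request_latency_avg_ns": 388_415_721}):
        exporter.export()
```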
- - -from typing import Protocol - - -class DataExporterInterface(Protocol): - def export(self): - pass diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/export_data/exporter_config.py b/src/c++/perf_analyzer/genai-perf/genai_perf/export_data/exporter_config.py deleted file mode 100644 index 0d9c7cd0b..000000000 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/export_data/exporter_config.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- - -from genai_perf.metrics import Metrics - - -class ExporterConfig: - def __init__(self): - self._stats = None - self._metrics = None - self._args = None - self._extra_inputs = None - self._artifact_dir = None - - @property - def stats(self): - return self._stats - - @stats.setter - def stats(self, stats_value): - self._stats = stats_value - - @property - def metrics(self): - return self._metrics - - @metrics.setter - def metrics(self, metrics: Metrics): - self._metrics = metrics - - @property - def args(self): - return self._args - - @args.setter - def args(self, args_value): - self._args = args_value - - @property - def extra_inputs(self): - return self._extra_inputs - - @extra_inputs.setter - def extra_inputs(self, extra_inputs_value): - self._extra_inputs = extra_inputs_value - - @property - def artifact_dir(self): - return self._artifact_dir - - @artifact_dir.setter - def artifact_dir(self, artifact_dir_value): - self._artifact_dir = artifact_dir_value diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/export_data/json_exporter.py b/src/c++/perf_analyzer/genai-perf/genai_perf/export_data/json_exporter.py index 2ec24fae1..cd50f1c2c 100644 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/export_data/json_exporter.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/export_data/json_exporter.py @@ -26,15 +26,12 @@ import json +from argparse import Namespace from enum import Enum +from pathlib import Path from typing import Dict -import genai_perf.logging as logging -from genai_perf.export_data.exporter_config import ExporterConfig - -DEFAULT_OUTPUT_DATA_JSON = "profile_export_genai_perf.json" - -logger = logging.getLogger(__name__) +from genai_perf.constants import DEFAULT_OUTPUT_DATA_JSON class JsonExporter: @@ -42,25 +39,22 @@ class JsonExporter: A class to export the statistics and arg values in a json format. """ - def __init__(self, config: ExporterConfig): - self._stats: Dict = config.stats - self._args = dict(vars(config.args)) - self._extra_inputs = config.extra_inputs - self._output_dir = config.artifact_dir + def __init__(self, stats: Dict, args: Namespace, extra_inputs: Dict): + self._stats = stats + self._args = dict(vars(args)) + self._extra_inputs = extra_inputs self._stats_and_args: Dict = {} self._prepare_args_for_export() self._merge_stats_and_args() - def export(self) -> None: - filename = self._output_dir / DEFAULT_OUTPUT_DATA_JSON - logger.info(f"Generating {filename}") + def export_to_file(self, output_dir: Path) -> None: + filename = output_dir / DEFAULT_OUTPUT_DATA_JSON with open(str(filename), "w") as f: f.write(json.dumps(self._stats_and_args, indent=2)) def _prepare_args_for_export(self) -> None: - self._args.pop("func", None) - self._args.pop("output_format", None) - self._args.pop("input_file", None) + del self._args["func"] + del self._args["output_format"] self._args["profile_export_file"] = str(self._args["profile_export_file"]) self._args["artifact_dir"] = str(self._args["artifact_dir"]) for k, v in self._args.items(): diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/export_data/output_reporter.py b/src/c++/perf_analyzer/genai-perf/genai_perf/export_data/output_reporter.py deleted file mode 100644 index ec8123b95..000000000 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/export_data/output_reporter.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -from argparse import Namespace - -from genai_perf.export_data.data_exporter_factory import DataExporterFactory -from genai_perf.export_data.exporter_config import ExporterConfig -from genai_perf.metrics import Statistics -from genai_perf.parser import get_extra_inputs_as_dict - - -class OutputReporter: - """ - A class to orchestrate output generation. 
- """ - - def __init__(self, stats: Statistics, args: Namespace): - self.args = args - self.stats = stats - self.stats.scale_data() - - def report_output(self) -> None: - factory = DataExporterFactory() - exporter_config = self._create_exporter_config() - data_exporters = factory.create_data_exporters(exporter_config) - - for exporter in data_exporters: - exporter.export() - - def _create_exporter_config(self) -> ExporterConfig: - config = ExporterConfig() - config.stats = self.stats.stats_dict - config.metrics = self.stats.metrics - config.args = self.args - config.artifact_dir = self.args.artifact_dir - config.extra_inputs = get_extra_inputs_as_dict(self.args) - return config diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py index de528aac4..a531d2ad5 100644 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py @@ -41,8 +41,7 @@ class PromptSource(Enum): class OutputFormat(Enum): OPENAI_CHAT_COMPLETIONS = auto() OPENAI_COMPLETIONS = auto() - OPENAI_EMBEDDINGS = auto() - RANKINGS = auto() + TRITON_GENERATE = auto() TENSORRTLLM = auto() VLLM = auto() @@ -66,7 +65,6 @@ class LlmInputs: DEFAULT_TENSORRTLLM_MAX_TOKENS = 256 - DEFAULT_BATCH_SIZE = 1 DEFAULT_RANDOM_SEED = 0 DEFAULT_PROMPT_TOKENS_MEAN = 550 DEFAULT_PROMPT_TOKENS_STDDEV = 0 @@ -102,7 +100,6 @@ def create_llm_inputs( add_stream: bool = False, tokenizer: Tokenizer = get_tokenizer(DEFAULT_TOKENIZER), extra_inputs: Optional[Dict] = None, - batch_size: int = 1, output_dir: Path = Path(""), ) -> Dict: """ @@ -138,8 +135,6 @@ def create_llm_inputs( The standard deviation of the length of the output to generate. This is only used if output_tokens_mean is provided. output_tokens_deterministic: If true, the output tokens will set the minimum and maximum tokens to be equivalent. 
- batch_size: - The number of inputs per request (currently only used for the embeddings and rankings endpoints) Required Synthetic Prompt Generation Parameters ----------------------------------------------- @@ -162,21 +157,36 @@ def create_llm_inputs( input_type, dataset_name, starting_index, length, tokenizer ) - random.seed(random_seed) - - generic_dataset_json = cls.get_generic_dataset_json( - input_type, - output_format, - dataset_name, - starting_index, - length, - tokenizer, - prompt_tokens_mean, - prompt_tokens_stddev, - num_of_output_prompts, - batch_size, - input_filename, - ) + if input_type == PromptSource.DATASET: + dataset = cls._get_input_dataset_from_url( + dataset_name, starting_index, length + ) + generic_dataset_json = cls._convert_input_url_dataset_to_generic_json( + dataset + ) + elif input_type == PromptSource.SYNTHETIC: + random.seed(random_seed) + synthetic_dataset = cls._get_input_dataset_from_synthetic( + tokenizer, + prompt_tokens_mean, + prompt_tokens_stddev, + num_of_output_prompts, + ) + generic_dataset_json = ( + cls._convert_input_synthetic_or_file_dataset_to_generic_json( + synthetic_dataset + ) + ) + elif input_type == PromptSource.FILE: + input_filename = cast(Path, input_filename) + input_file_dataset = cls._get_input_dataset_from_file(input_filename) + generic_dataset_json = ( + cls._convert_input_synthetic_or_file_dataset_to_generic_json( + input_file_dataset + ) + ) + else: + raise GenAIPerfException("Input source is not recognized.") if extra_inputs is None: extra_inputs = {} @@ -197,178 +207,6 @@ def create_llm_inputs( return json_in_pa_format - @classmethod - def get_generic_dataset_json( - cls, - input_type: PromptSource, - output_format: OutputFormat, - dataset_name: str, - starting_index: int, - length: int, - tokenizer: Tokenizer, - prompt_tokens_mean: int, - prompt_tokens_stddev: int, - num_of_output_prompts: int, - batch_size: int, - input_filename: Optional[Path], - ) -> Dict: - """ - Retrieve and convert the dataset based on the input type. - - Parameters - ---------- - input_type: - Specify how the input is received - output_format: - Specify the output format - dataset_name: - The name of the dataset - starting_index: - Offset from within the list to start gathering inputs - length: - Number of entries to gather - tokenizer: - The tokenizer to use when generating synthetic prompts - prompt_tokens_mean: - The mean length of the prompt to generate - prompt_tokens_stddev: - The standard deviation of the length of the prompt to generate - num_of_output_prompts: - The number of synthetic output prompts to generate - batch_size: - The number of inputs per request (currently only used for the embeddings and rankings endpoints) - input_filename: - The path to the input file containing the prompts in JSONL format. - Returns - ------- - Dict: - The generic dataset JSON - """ - - if output_format == OutputFormat.OPENAI_EMBEDDINGS: - if input_type != PromptSource.FILE: - raise GenAIPerfException( - f"{OutputFormat.OPENAI_EMBEDDINGS.to_lowercase()} only supports a file as input." 
- ) - input_filename = cast(Path, input_filename) - input_file_dataset = cls._get_input_dataset_from_embeddings_file( - input_filename, - batch_size, - num_of_output_prompts, - ) - generic_dataset_json = ( - cls._convert_input_synthetic_or_file_dataset_to_generic_json( - input_file_dataset - ) - ) - elif output_format == OutputFormat.RANKINGS: - if input_type != PromptSource.FILE: - raise GenAIPerfException( - f"{OutputFormat.RANKINGS.to_lowercase()} only supports a directory as input." - ) - queries_filename = cast(Path, input_filename) / "queries.jsonl" - passages_filename = cast(Path, input_filename) / "passages.jsonl" - input_file_dataset = cls._get_input_dataset_from_rankings_files( - queries_filename, passages_filename, batch_size, num_of_output_prompts - ) - - generic_dataset_json = ( - cls._convert_input_synthetic_or_file_dataset_to_generic_json( - input_file_dataset - ) - ) - else: - if input_type == PromptSource.DATASET: - dataset = cls._get_input_dataset_from_url( - dataset_name, starting_index, length - ) - generic_dataset_json = cls._convert_input_url_dataset_to_generic_json( - dataset - ) - elif input_type == PromptSource.SYNTHETIC: - synthetic_dataset = cls._get_input_dataset_from_synthetic( - tokenizer, - prompt_tokens_mean, - prompt_tokens_stddev, - num_of_output_prompts, - ) - generic_dataset_json = ( - cls._convert_input_synthetic_or_file_dataset_to_generic_json( - synthetic_dataset - ) - ) - elif input_type == PromptSource.FILE: - input_filename = cast(Path, input_filename) - input_file_dataset = cls._get_input_dataset_from_file(input_filename) - generic_dataset_json = ( - cls._convert_input_synthetic_or_file_dataset_to_generic_json( - input_file_dataset - ) - ) - else: - raise GenAIPerfException("Input source is not recognized.") - - return generic_dataset_json - - @classmethod - def _get_input_dataset_from_embeddings_file( - cls, input_filename: Path, batch_size: int, num_prompts: int - ) -> Dict[str, Any]: - with open(input_filename, "r") as file: - file_content = [json.loads(line) for line in file] - - texts = [item["text"] for item in file_content] - - if batch_size > len(texts): - raise ValueError( - "Batch size cannot be larger than the number of available texts" - ) - - dataset_json: Dict[str, Any] = {} - dataset_json["features"] = [{"name": "input"}] - dataset_json["rows"] = [] - - for _ in range(num_prompts): - sampled_texts = random.sample(texts, batch_size) - dataset_json["rows"].append({"row": {"payload": {"input": sampled_texts}}}) - - return dataset_json - - @classmethod - def _get_input_dataset_from_rankings_files( - cls, - queries_filename: Path, - passages_filename: Path, - batch_size: int, - num_prompts: int, - ) -> Dict[str, Any]: - - with open(queries_filename, "r") as file: - queries_content = [json.loads(line) for line in file] - queries_texts = [item for item in queries_content] - - with open(passages_filename, "r") as file: - passages_content = [json.loads(line) for line in file] - passages_texts = [item for item in passages_content] - - if batch_size > len(passages_texts): - raise ValueError( - "Batch size cannot be larger than the number of available passages" - ) - - dataset_json: Dict[str, Any] = {} - dataset_json["features"] = [{"name": "input"}] - dataset_json["rows"] = [] - - for _ in range(num_prompts): - sampled_texts = random.sample(passages_texts, batch_size) - query_sample = random.choice(queries_texts) - entry_dict = {} - entry_dict["query"] = query_sample - entry_dict["passages"] = sampled_texts - 
dataset_json["rows"].append({"row": {"payload": entry_dict}}) - return dataset_json - @classmethod def _check_for_valid_args( cls, @@ -394,7 +232,6 @@ def _get_input_dataset_from_url( url = cls._resolve_url(dataset_name) configured_url = cls._create_configured_url(url, starting_index, length) dataset = cls._download_dataset(configured_url) - return dataset @classmethod @@ -495,55 +332,24 @@ def _add_rows_to_generic_json( @classmethod def _get_input_dataset_from_file(cls, input_filename: Path) -> Dict: - """ - Reads the input prompts from a JSONL file and converts them into the required dataset format. - - Parameters - ---------- - input_filename : Path - The path to the input file containing the prompts in JSONL format. - - Returns - ------- - Dict - The dataset in the required format with the prompts read from the file. - """ cls.verify_file(input_filename) - input_file_prompts = cls._get_prompts_from_input_file(input_filename) + input_file_prompt = cls._get_prompt_from_input_file(input_filename) dataset_json: Dict[str, Any] = {} dataset_json["features"] = [{"name": "text_input"}] - dataset_json["rows"] = [ - {"row": {"text_input": prompt}} for prompt in input_file_prompts - ] + dataset_json["rows"] = [] + dataset_json["rows"].append({"row": {"text_input": input_file_prompt}}) return dataset_json - @classmethod - def _get_prompts_from_input_file(cls, input_filename: Path) -> List[str]: - """ - Reads the input prompts from a JSONL file and returns a list of prompts. - - Parameters - ---------- - input_filename : Path - The path to the input file containing the prompts in JSONL format. - - Returns - ------- - List[str] - A list of prompts read from the file. - """ - prompts = [] - with open(input_filename, mode="r", newline=None) as file: - for line in file: - if line.strip(): - prompts.append(json.loads(line).get("text_input", "").strip()) - return prompts - @classmethod def verify_file(cls, input_filename: Path) -> None: if not input_filename.exists(): raise FileNotFoundError(f"The file '{input_filename}' does not exist.") + @classmethod + def _get_prompt_from_input_file(cls, input_filename: Path) -> str: + with open(input_filename, mode="r", newline=None) as file: + return file.read() + @classmethod def _convert_generic_json_to_output_format( cls, @@ -558,8 +364,8 @@ def _convert_generic_json_to_output_format( model_name: list = [], model_selection_strategy: ModelSelectionStrategy = ModelSelectionStrategy.ROUND_ROBIN, ) -> Dict: - if output_format == OutputFormat.OPENAI_CHAT_COMPLETIONS: - output_json = cls._convert_generic_json_to_openai_chat_completions_format( + if output_format == OutputFormat.TRITON_GENERATE: + output_json = cls._convert_generic_json_to_generate_format( generic_dataset, add_model_name, add_stream, @@ -568,10 +374,9 @@ def _convert_generic_json_to_output_format( output_tokens_stddev, output_tokens_deterministic, model_name, - model_selection_strategy, ) - elif output_format == OutputFormat.OPENAI_COMPLETIONS: - output_json = cls._convert_generic_json_to_openai_completions_format( + elif output_format == OutputFormat.OPENAI_CHAT_COMPLETIONS: + output_json = cls._convert_generic_json_to_openai_chat_completions_format( generic_dataset, add_model_name, add_stream, @@ -582,17 +387,15 @@ def _convert_generic_json_to_output_format( model_name, model_selection_strategy, ) - elif output_format == OutputFormat.OPENAI_EMBEDDINGS: - output_json = cls._convert_generic_json_to_openai_embeddings_format( - generic_dataset, - extra_inputs, - model_name, - model_selection_strategy, - 
) - elif output_format == OutputFormat.RANKINGS: - output_json = cls._convert_generic_json_to_rankings_format( + elif output_format == OutputFormat.OPENAI_COMPLETIONS: + output_json = cls._convert_generic_json_to_openai_completions_format( generic_dataset, + add_model_name, + add_stream, extra_inputs, + output_tokens_mean, + output_tokens_stddev, + output_tokens_deterministic, model_name, model_selection_strategy, ) @@ -663,7 +466,7 @@ def _convert_generic_json_to_openai_chat_completions_format( return pa_json @classmethod - def _convert_generic_json_to_openai_completions_format( + def _convert_generic_json_to_generate_format( cls, dataset_json: Dict, add_model_name: bool, @@ -680,7 +483,8 @@ def _convert_generic_json_to_openai_completions_format( user_role_headers, text_input_headers, ) = cls._determine_json_feature_roles(dataset_json) - pa_json = cls._populate_openai_completions_output_json( + + pa_json = cls._populate_triton_generate_output_json( dataset_json, system_role_headers, user_role_headers, @@ -692,106 +496,42 @@ def _convert_generic_json_to_openai_completions_format( output_tokens_stddev, output_tokens_deterministic, model_name, - model_selection_strategy, ) return pa_json @classmethod - def _convert_generic_json_to_openai_embeddings_format( - cls, - generic_dataset: Dict, - extra_inputs: Dict, - model_name: list = [], - model_selection_strategy: ModelSelectionStrategy = ModelSelectionStrategy.ROUND_ROBIN, - ) -> Dict[str, Any]: - pa_json: Dict[str, Any] = {"data": []} - - for index, entry in enumerate(generic_dataset["rows"]): - iter_model_name = cls._select_model_name( - model_name, index, model_selection_strategy - ) - payload = entry.get("payload", {}) - input_values = payload.get("input") - - if input_values is None: - raise ValueError("Missing required fields 'input' in dataset entry") - if not isinstance(input_values, list): - raise ValueError( - f"Required field 'input' must be a list (actual: {type(input_values)})" - ) - - payload = { - "input": input_values, - "model": iter_model_name, - } - - for key, value in extra_inputs.items(): - payload[key] = value - - pa_json["data"].append({"payload": [payload]}) - - return pa_json - - @classmethod - def contains_rankings_tei(cls, extra_inputs: Optional[Dict]) -> bool: - """ - Check if user specified that they are using the Hugging Face - Text Embeddings Interface for ranking models - """ - if extra_inputs and extra_inputs.get("rankings") == "tei": - return True - return False - - @classmethod - def _convert_generic_json_to_rankings_format( + def _convert_generic_json_to_openai_completions_format( cls, - generic_dataset: Dict, + dataset_json: Dict, + add_model_name: bool, + add_stream: bool, extra_inputs: Dict, + output_tokens_mean: int, + output_tokens_stddev: int, + output_tokens_deterministic: bool, model_name: list = [], model_selection_strategy: ModelSelectionStrategy = ModelSelectionStrategy.ROUND_ROBIN, - ) -> Dict[str, Any]: - pa_json: Dict[str, Any] = {"data": []} - use_tei_format = cls.contains_rankings_tei(extra_inputs) - - for index, entry in enumerate(generic_dataset["rows"]): - iter_model_name = cls._select_model_name( - model_name, index, model_selection_strategy - ) - payload = entry.get("payload", {}) - query_values = payload.get("query") - - if use_tei_format: - passage_values = payload.get("passages", []) - passage_values = [item.get("text", "") for item in passage_values] - else: - passage_values = payload.get("passages") - - if query_values is None: - raise ValueError("Missing required fields 'query' in 
dataset entry") - if passage_values is None: - raise ValueError( - f"Missing required fields '{'texts' if use_tei_format else 'passages'}' in dataset entry" - ) - if not isinstance(passage_values, list): - raise ValueError( - f"Required field '{'texts' if use_tei_format else 'passages'}' must be a list (actual: {type(passage_values)})" - ) - - if use_tei_format: - payload = {"query": query_values["text"], "texts": passage_values} - else: - payload = { - "query": query_values, - "passages": passage_values, - "model": iter_model_name, - } - - for key, value in extra_inputs.items(): - if not (key == "rankings" and value == "tei"): - payload[key] = value - - pa_json["data"].append({"payload": [payload]}) + ) -> Dict: + ( + system_role_headers, + user_role_headers, + text_input_headers, + ) = cls._determine_json_feature_roles(dataset_json) + pa_json = cls._populate_openai_completions_output_json( + dataset_json, + system_role_headers, + user_role_headers, + text_input_headers, + add_model_name, + add_stream, + extra_inputs, + output_tokens_mean, + output_tokens_stddev, + output_tokens_deterministic, + model_name, + model_selection_strategy, + ) return pa_json @@ -959,6 +699,52 @@ def _populate_openai_chat_completions_output_json( return pa_json + @classmethod + def _populate_triton_generate_output_json( + cls, + dataset: Dict, + system_role_headers: List[str], + user_role_headers: List[str], + text_input_headers: List[str], + add_model_name: bool, + add_stream: bool, + extra_inputs: Dict, + output_tokens_mean: int, + output_tokens_stddev: int, + output_tokens_deterministic: bool, + model_name: list = [], + model_selection_strategy: ModelSelectionStrategy = ModelSelectionStrategy.ROUND_ROBIN, + ) -> Dict: + pa_json: dict = {"data": [{"payload": [{}]} for _ in dataset["rows"]]} + + for index, entry in enumerate(dataset["rows"]): + for header, content in entry.items(): + new_text_input = cls._create_new_text_input( + header, + system_role_headers, + user_role_headers, + text_input_headers, + content, + ) + pa_json["data"][index]["payload"][0]["text_input"] = new_text_input + + iter_model_name = cls._select_model_name( + model_name, index, model_selection_strategy + ) + pa_json = cls._add_optional_tags_to_openai_json( + pa_json, + index, + False, + add_stream, + extra_inputs, + output_tokens_mean, + output_tokens_stddev, + output_tokens_deterministic, + iter_model_name, + ) + + return pa_json + @classmethod def _populate_openai_completions_output_json( cls, diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_metrics.py b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_metrics.py new file mode 100755 index 000000000..14b250735 --- /dev/null +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_metrics.py @@ -0,0 +1,757 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import csv +import json +from collections import defaultdict +from enum import Enum, auto +from itertools import tee +from pathlib import Path +from typing import Dict, List, Tuple, Union + +import numpy as np +import pandas as pd +from genai_perf.tokenizer import Tokenizer +from genai_perf.utils import load_json, remove_sse_prefix +from rich.console import Console +from rich.table import Table + + +class ResponseFormat(Enum): + OPENAI_CHAT_COMPLETIONS = auto() + OPENAI_COMPLETIONS = auto() + TRITON = auto() + TRITON_GENERATE = auto() + + +class Metrics: + """A base class for all the metrics class that contains common metrics.""" + + metric_labels = [ + "time_to_first_token", + "inter_token_latency", + "request_latency", + "output_token_throughput", + "output_token_throughput_per_request", + "request_throughput", + "num_output_token", + "num_input_token", + ] + + time_fields = [ + "inter_token_latency", + "time_to_first_token", + "request_latency", + ] + + # TODO (TMA-1678): output_token_throughput_per_request is not on this list + # since the current code treats all the throughput metrics to be displayed + # outside of the statistics table. 
+ throughput_fields = [ + "request_throughput", + "output_token_throughput", + ] + + def __init__( + self, + request_throughputs: List[float] = [], + request_latencies: List[int] = [], + ) -> None: + self.request_throughputs = request_throughputs + self.request_latencies = request_latencies + self._base_names = { + "request_throughputs": "request_throughput", + "request_latencies": "request_latency", + } + + def __repr__(self): + attr_strs = [] + for k, v in self.__dict__.items(): + if not k.startswith("_"): + attr_strs.append(f"{k}={v}") + return f"Metrics({','.join(attr_strs)})" + + @property + def data(self) -> dict: + """Returns all the metrics.""" + return {k: v for k, v in self.__dict__.items() if not k.startswith("_")} + + def get_base_name(self, metric_name: str) -> str: + """Returns singular name of a given metric.""" + if metric_name in self._base_names: + return self._base_names[metric_name] + else: + raise KeyError(f"No metric named '{metric_name}' exists.") + + +class LLMMetrics(Metrics): + """A simple dataclass that holds core LLM performance metrics.""" + + def __init__( + self, + request_throughputs: List[float] = [], + request_latencies: List[int] = [], + time_to_first_tokens: List[int] = [], + inter_token_latencies: List[List[int]] = [[]], + output_token_throughputs: List[float] = [], + output_token_throughputs_per_request: List[int] = [], + num_output_tokens: List[int] = [], + num_input_tokens: List[int] = [], + ) -> None: + super().__init__(request_throughputs, request_latencies) + self.time_to_first_tokens = time_to_first_tokens + self.inter_token_latencies = inter_token_latencies + self.output_token_throughputs = output_token_throughputs + self.output_token_throughputs_per_request = output_token_throughputs_per_request + self.num_output_tokens = num_output_tokens + self.num_input_tokens = num_input_tokens + + # add base name mapping + self._base_names["time_to_first_tokens"] = "time_to_first_token" + self._base_names["inter_token_latencies"] = "inter_token_latency" + self._base_names["output_token_throughputs"] = "output_token_throughput" + self._base_names[ + "output_token_throughputs_per_request" + ] = "output_token_throughput_per_request" + self._base_names["num_output_tokens"] = "num_output_token" + self._base_names["num_input_tokens"] = "num_input_token" + + +class Statistics: + """A class that aggregates various statistics from given metrics class. + + The Statistics class goes through each metric in the metrics class and + calculates several statistics such as: + - average (arithmetic mean) + - percentiles (p25, p50, p75, p90, p95, p99) + - minimum & maximum + - standard deviation + The class will store each calculated statistics as part of its attribute. 
+ + Example: + + >>> metrics = LLMMetrics(request_throughputs=[2, 4]) + >>> stats = Statistics(metrics) + >>> print(stats.avg_request_throughput) # output: 3 + """ + + def __init__(self, metrics: Metrics): + # iterate through Metrics to calculate statistics and set attributes + self._metrics = metrics + self._stats_dict: Dict = defaultdict(dict) + for attr, data in metrics.data.items(): + attr = metrics.get_base_name(attr) + self._add_units(attr) + data = self._preprocess_data(data, attr) + if data: + self._calculate_mean(data, attr) + if not self._is_throughput_field(attr): + self._calculate_percentiles(data, attr) + self._calculate_minmax(data, attr) + self._calculate_std(data, attr) + + def _preprocess_data(self, data: List, attr: str) -> List[Union[int, float]]: + new_data = [] + if attr == "inter_token_latency": + # flatten inter token latencies to 1D + for d in data: + new_data += d + else: + new_data = data + return new_data + + def _calculate_mean(self, data: List[Union[int, float]], attr: str) -> None: + avg = np.mean(data) + self._stats_dict[attr]["avg"] = float(avg) + setattr(self, "avg_" + attr, avg) + + def _calculate_percentiles(self, data: List[Union[int, float]], attr: str) -> None: + p25, p50, p75 = np.percentile(data, [25, 50, 75]) + p90, p95, p99 = np.percentile(data, [90, 95, 99]) + self._stats_dict[attr]["p99"] = float(p99) + self._stats_dict[attr]["p95"] = float(p95) + self._stats_dict[attr]["p90"] = float(p90) + self._stats_dict[attr]["p75"] = float(p75) + self._stats_dict[attr]["p50"] = float(p50) + self._stats_dict[attr]["p25"] = float(p25) + setattr(self, "p25_" + attr, p25) + setattr(self, "p50_" + attr, p50) + setattr(self, "p75_" + attr, p75) + setattr(self, "p90_" + attr, p90) + setattr(self, "p95_" + attr, p95) + setattr(self, "p99_" + attr, p99) + + def _calculate_minmax(self, data: List[Union[int, float]], attr: str) -> None: + min, max = np.min(data), np.max(data) + self._stats_dict[attr]["max"] = float(max) + self._stats_dict[attr]["min"] = float(min) + setattr(self, "min_" + attr, min) + setattr(self, "max_" + attr, max) + + def _calculate_std(self, data: List[Union[int, float]], attr: str) -> None: + std = np.std(data) + self._stats_dict[attr]["std"] = float(std) + setattr(self, "std_" + attr, std) + + def _add_units(self, key) -> None: + if self._is_time_field(key): + self._stats_dict[key]["unit"] = "ns" + if key == "request_throughput": + self._stats_dict[key]["unit"] = "requests/sec" + if key.startswith("output_token_throughput"): + self._stats_dict[key]["unit"] = "tokens/sec" + if key == "num_input_token" or key == "num_output_token": + self._stats_dict[key]["unit"] = "tokens" + + def __repr__(self) -> str: + attr_strs = [] + for k, v in self.__dict__.items(): + if not k.startswith("_"): + attr_strs.append(f"{k}={v}") + return f"Statistics({','.join(attr_strs)})" + + @property + def data(self) -> dict: + """Return all the aggregated statistics.""" + return {k: v for k, v in self.__dict__.items() if not k.startswith("_")} + + @property + def metrics(self) -> Metrics: + """Return the underlying metrics used to calculate the statistics.""" + return self._metrics + + @property + def stats_dict(self) -> Dict: + return self._stats_dict + + def _is_throughput_field(self, field: str) -> bool: + return field in Metrics.throughput_fields + + def _is_time_field(self, field: str) -> bool: + return field in Metrics.time_fields + + def pretty_print(self) -> None: + """Prints the statistics in a tabular format.""" + + singular_metric_rows = [] + table = 
Table(title="LLM Metrics") + + table.add_column("Statistic", justify="right", style="cyan", no_wrap=True) + stats = ["avg", "min", "max", "p99", "p90", "p75"] + for stat in stats: + table.add_column(stat, justify="right", style="green") + + for metric in Metrics.metric_labels: + formatted_metric = metric.replace("_", " ").capitalize() + + # Throughput fields are printed after the table + is_throughput_field = self._is_throughput_field(metric) + if is_throughput_field: + value = self.__dict__.get(f"{stats[0]}_{metric}", -1) + formatted_metric += f" (per sec): {value:.2f}" + singular_metric_rows.append(formatted_metric) + continue + + # TODO (TMA-1712): need to decide if we need this metric. Remove + # from statistics display for now. + # TODO (TMA-1678): output_token_throughput_per_request is treated + # separately since the current code treats all throughput metrics to + # be displayed outside of the statistics table. + if metric == "output_token_throughput_per_request": + formatted_metric += f" (per sec)" + continue + + is_time_field = self._is_time_field(metric) + if is_time_field: + formatted_metric += " (ns)" + + row_values = [formatted_metric] + + for stat in stats: + value = self.__dict__.get(f"{stat}_{metric}", -1) + row_values.append(f"{value:,.0f}") + + # Without streaming, there is no inter-token latency available, so do not print it. + if metric == "inter_token_latency": + if all(value == "-1" for value in row_values[1:]): + continue + # Without streaming, TTFT and request latency are the same, so do not print TTFT. + elif metric == "time_to_first_token": + unique_values = False + for stat in stats: + value_ttft = self.__dict__.get(f"{stat}_{metric}", -1) + value_req_latency = self.__dict__.get(f"{stat}_request_latency", -1) + if value_ttft != value_req_latency: + unique_values = True + break + if not unique_values: + continue + + table.add_row(*row_values) + + console = Console() + console.print(table) + + for row in singular_metric_rows: + print(row) + + def export_to_csv(self, csv_filename: str) -> None: + """Exports the statistics to a CSV file.""" + + multiple_metric_header = [ + "Metric", + "avg", + "min", + "max", + "p99", + "p95", + "p90", + "p75", + "p50", + "p25", + ] + + single_metric_header = [ + "Metric", + "Value", + ] + + with open(csv_filename, mode="w", newline="") as csvfile: + singular_metric_rows = [] + + csv_writer = csv.writer(csvfile) + csv_writer.writerow(multiple_metric_header) + + for metric in Metrics.metric_labels: + formatted_metric = metric.replace("_", " ").title() + + is_throughput_field = self._is_throughput_field(metric) + is_time_field = self._is_time_field(metric) + + if is_time_field: + formatted_metric += " (ns)" + elif is_throughput_field: + formatted_metric += " (per sec)" + # TODO (TMA-1712): need to decide if we need this metric. Do not + # include in the csv for now. + # TODO (TMA-1678): output_token_throughput_per_request is treated + # separately since the current code treats all throughput metrics + # to be displayed outside of the statistics table. 
+ elif metric == "output_token_throughput_per_request": + formatted_metric += " (per sec)" + continue + + row_values = [formatted_metric] + + if is_throughput_field: + value = self.__dict__.get( + f"{multiple_metric_header[1]}_{metric}", -1 + ) + row_values.append(f"{value:.2f}") + singular_metric_rows.append(row_values) + continue + + for stat in multiple_metric_header[1:]: + value = self.__dict__.get(f"{stat}_{metric}", -1) + row_values.append(f"{value:.0f}") + + # Without streaming, there is no inter-token latency available, so do not print it. + if metric == "inter_token_latency": + if all(value == "-1" for value in row_values[1:]): + continue + # Without streaming, TTFT and request latency are the same, so do not print TTFT. + elif metric == "time_to_first_token": + unique_values = False + for stat in multiple_metric_header[1:]: + value_ttft = self.__dict__.get(f"{stat}_{metric}", -1) + value_req_latency = self.__dict__.get( + f"{stat}_request_latency", -1 + ) + if value_ttft != value_req_latency: + unique_values = True + break + if not unique_values: + continue + + csv_writer.writerow(row_values) + + csv_writer.writerow([]) + csv_writer.writerow(single_metric_header) + for row in singular_metric_rows: + csv_writer.writerow(row) + + def export_parquet(self, artifact_dir: Path, filename: str) -> None: + max_length = -1 + col_index = 0 + filler_list = [] + df = pd.DataFrame() + + # Data frames require all columns of the same length + # find the max length column + for key, value in self._metrics.data.items(): + max_length = max(max_length, len(value)) + + # Insert None for shorter columns to match longest column + for key, value in self._metrics.data.items(): + if len(value) < max_length: + diff = max_length - len(value) + filler_list = [None] * diff + df.insert(col_index, key, value + filler_list) + diff = 0 + filler_list = [] + col_index = col_index + 1 + + filepath = artifact_dir / f"{filename}.gzip" + df.to_parquet(filepath, compression="gzip") + + +class ProfileDataParser: + """Base profile data parser class that reads the profile data JSON file to + extract core metrics and calculate various performance statistics. + """ + + def __init__(self, filename: Path) -> None: + data = load_json(filename) + self._get_profile_metadata(data) + self._parse_profile_data(data) + + def _get_profile_metadata(self, data: dict) -> None: + self._service_kind = data["service_kind"] + if self._service_kind == "openai": + if data["endpoint"] == "v1/chat/completions": + self._response_format = ResponseFormat.OPENAI_CHAT_COMPLETIONS + elif data["endpoint"] == "v1/completions": + self._response_format = ResponseFormat.OPENAI_COMPLETIONS + elif "generate" in data["endpoint"]: + self._response_format = ResponseFormat.TRITON_GENERATE + else: + # TPA-66: add PA metadata to handle this case + # When endpoint field is either empty or custom endpoint, fall + # back to parsing the response to extract the response format. 
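Stepping back from the parser for a moment: the `Statistics` class completed above derives `avg_*`, `p99_*`, etc. attributes (and a `stats_dict` with units) from whatever metric lists it is given. A short usage sketch with invented numbers, assuming genai-perf is installed so that `genai_perf.llm_metrics` is importable:

```python
# Toy usage of LLMMetrics/Statistics with made-up values (nanoseconds for
# latencies); this only illustrates the attribute and stats_dict surface.
from genai_perf.llm_metrics import LLMMetrics, Statistics

metrics = LLMMetrics(
    request_throughputs=[2.5],                        # requests/sec
    request_latencies=[388_000_000, 312_000_000],     # ns
    time_to_first_tokens=[15_000_000, 11_000_000],    # ns
    inter_token_latencies=[[3_500_000, 3_600_000], [3_400_000]],
    output_token_throughputs=[290.0],                 # tokens/sec
    output_token_throughputs_per_request=[300, 310],
    num_output_tokens=[113, 105],
    num_input_tokens=[199, 199],
)

stats = Statistics(metrics)
print(stats.avg_request_latency)                      # 350000000.0
print(stats.p99_time_to_first_token)                  # percentile over the two TTFTs
print(stats.stats_dict["request_latency"]["unit"])    # "ns"
```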
+ request = data["experiments"][0]["requests"][0] + response = request["response_outputs"][0]["response"] + if "chat.completion" in response: + self._response_format = ResponseFormat.OPENAI_CHAT_COMPLETIONS + elif "text_completion" in response: + self._response_format = ResponseFormat.OPENAI_COMPLETIONS + else: + raise RuntimeError("Unknown OpenAI response format.") + + elif self._service_kind == "triton": + self._response_format = ResponseFormat.TRITON + else: + raise ValueError(f"Unknown service kind: {self._service_kind}") + + def _parse_profile_data(self, data: dict) -> None: + """Parse through the entire profile data to collect statistics.""" + self._profile_results = {} + for experiment in data["experiments"]: + infer_mode = experiment["experiment"]["mode"] + load_level = experiment["experiment"]["value"] + requests = experiment["requests"] + + metrics = self._parse_requests(requests) + + # aggregate and calculate statistics + statistics = Statistics(metrics) + self._profile_results[(infer_mode, str(load_level))] = statistics + + def _parse_requests(self, requests: dict) -> LLMMetrics: + """Parse each request in profile data to extract core metrics.""" + raise NotImplementedError + + def get_statistics(self, infer_mode: str, load_level: str) -> Statistics: + """Return profile statistics if it exists.""" + if (infer_mode, load_level) not in self._profile_results: + raise KeyError(f"Profile with {infer_mode}={load_level} does not exist.") + return self._profile_results[(infer_mode, load_level)] + + def get_profile_load_info(self) -> List[Tuple[str, str]]: + """Return available (infer_mode, load_level) tuple keys.""" + return [k for k, _ in self._profile_results.items()] + + +class LLMProfileDataParser(ProfileDataParser): + """A class that calculates and aggregates all the LLM performance statistics + across the Perf Analyzer profile results. + + The LLMProfileDataParser class parses profile export JSON file, collects the + core LLM performance metrics, and calculates summary statistics for each + different Perf Analyzer runs/experiments. + + Example: + + >>> ... # run Perf Analyzer with concurrency level 10 + >>> + >>> from transformers import AutoTokenizer + >>> + >>> tokenizer = AutoTokenizer.from_pretrained("gpt2") + >>> pd = LLMProfileDataParser( + >>> filename="profile_export.json", + >>> tokenizer=tokenizer, + >>> ) + >>> stats = pd.get_statistics(infer_mode="concurrency", level=10) + >>> + >>> print(stats) # output: Statistics(avg_time_to_first_token=...) + >>> stats.pretty_print() # Output: time_to_first_token_s: ... + """ + + def __init__( + self, + filename: Path, + tokenizer: Tokenizer, + ) -> None: + self._tokenizer = tokenizer + super().__init__(filename) + + def _parse_requests(self, requests: dict) -> LLMMetrics: + """Parse each requests in profile export data to extract key metrics.""" + min_req_timestamp, max_res_timestamp = float("inf"), 0 + request_latencies = [] + time_to_first_tokens = [] + inter_token_latencies = [] + output_token_throughputs_per_request = [] + num_input_tokens = [] + num_generated_tokens = [] + for request in requests: + req_timestamp = request["timestamp"] + req_inputs = request["request_inputs"] + res_timestamps = request["response_timestamps"] + res_outputs = request["response_outputs"] + + self._preprocess_response(res_timestamps, res_outputs) + + # Skip requests with empty response. This happens sometimes when the + # model returns a single response with empty string. 
+ if not res_timestamps: + continue + + # track entire benchmark duration + min_req_timestamp = min(min_req_timestamp, req_timestamp) + max_res_timestamp = max(max_res_timestamp, res_timestamps[-1]) + + # request latencies + req_latency = res_timestamps[-1] - req_timestamp + request_latencies.append(req_latency) # nanosec + req_latency = req_latency / 1e9 # sec + + # time to first token + time_to_first_tokens.append(res_timestamps[0] - req_timestamp) + + # number of input tokens + input_tokens = self._tokenize_request_inputs(req_inputs) + num_input_tokens.append(len(input_tokens)) + + # output token throughput per request + output_tokens = self._tokenize_response_outputs(res_outputs) + num_output_tokens = list(map(len, output_tokens)) + total_output_tokens = np.sum(num_output_tokens) + output_token_throughputs_per_request.append( + total_output_tokens / req_latency + ) + num_generated_tokens.append(total_output_tokens) + + # inter token latency + itl_per_request = [] + for (t1, _), (t2, n2) in self._pairwise( + zip(res_timestamps, num_output_tokens) + ): + # TMA-1676: handle empty first/last responses + # if the latter response has zero token (e.g. empty string), + # then set it default to one for the sake of inter token latency + # calculation and to avoid divide by zero. + num_token = 1 if n2 == 0 else n2 + itl_per_request.append(round((t2 - t1) / num_token)) + inter_token_latencies.append(itl_per_request) + + # request & output token throughput + benchmark_duration = (max_res_timestamp - min_req_timestamp) / 1e9 # nanosec + request_throughputs = [len(requests) / benchmark_duration] + output_token_throughputs = [sum(num_generated_tokens) / benchmark_duration] + + return LLMMetrics( + request_throughputs, + request_latencies, + time_to_first_tokens, + inter_token_latencies, + output_token_throughputs, + output_token_throughputs_per_request, + num_generated_tokens, + num_input_tokens, + ) + + def _pairwise(self, iterable): + """Generate pairs of consecutive elements from the given iterable.""" + a, b = tee(iterable) + next(b, None) + return zip(a, b) + + def _preprocess_response( + self, res_timestamps: List[int], res_outputs: List[Dict[str, str]] + ) -> None: + """Helper function to preprocess responses of a request.""" + if self._service_kind == "openai": + # PA sometimes receives multiple SSE responses at once (as a single + # response). Handle these responses by merging into a single response. 
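To make the per-request arithmetic in `_parse_requests` above concrete, here is a toy, self-contained walk-through with invented timestamps and token counts. It mirrors the formulas shown (request latency, time to first token, per-request token throughput, and the inter-token-latency guard for zero-token responses) and skips the tokenizer and JSON parsing entirely:

```python
# Invented nanosecond timestamps for one request and its three response chunks.
from itertools import tee

req_timestamp = 1_000_000_000                      # request sent at t = 1 s
res_timestamps = [1_020_000_000, 1_050_000_000, 1_110_000_000]
num_output_tokens = [1, 3, 0]                      # tokens per response chunk

request_latency_ns = res_timestamps[-1] - req_timestamp     # 110_000_000 ns
time_to_first_token_ns = res_timestamps[0] - req_timestamp  # 20_000_000 ns

total_output_tokens = sum(num_output_tokens)                # 4 tokens
output_token_throughput_per_request = total_output_tokens / (
    request_latency_ns / 1e9
)                                                           # ~36.4 tokens/sec


def pairwise(iterable):
    """Consecutive pairs, same idea as the _pairwise helper above."""
    a, b = tee(iterable)
    next(b, None)
    return zip(a, b)


# Each gap is divided by the token count of the *later* response, falling
# back to 1 when that response carried zero tokens (the empty-string case).
itl_per_request = [
    round((t2 - t1) / (n2 if n2 else 1))
    for (t1, _), (t2, n2) in pairwise(zip(res_timestamps, num_output_tokens))
]
print(itl_per_request)  # [10000000, 60000000]
```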
+ for i in range(len(res_outputs)): + response = res_outputs[i]["response"] + responses = response.strip().split("\n\n") + if len(responses) > 1: + merged_response = json.loads(remove_sse_prefix(responses[0])) + if ( + merged_response["choices"][0]["delta"].get("content", None) + is None + ): + merged_response["choices"][0]["delta"]["content"] = "" + for r in responses[1:]: + text = self._extract_openai_text_output(r) + merged_response["choices"][0]["delta"]["content"] += text + + res_outputs[i] = {"response": json.dumps(merged_response)} + + # Remove responses without any content + indices_to_remove = [] + for idx, out in enumerate(res_outputs): + if self._is_openai_empty_response(out["response"]): + indices_to_remove.append(idx) + indices_to_remove.sort(reverse=True) + for index in indices_to_remove: + res_timestamps.pop(index) + res_outputs.pop(index) + + def _tokenize_request_inputs(self, req_inputs: dict) -> List[int]: + """Deserialize the request input and return tokenized inputs.""" + if self._service_kind == "triton": + return self._tokenize_triton_request_input(req_inputs) + elif self._service_kind == "openai": + return self._tokenize_openai_request_input(req_inputs) + else: + raise ValueError(f"Unknown service kind: '{self._service_kind}'.") + + def _tokenize_triton_request_input(self, req_inputs: dict) -> List[int]: + """Tokenize the Triton request input texts.""" + encodings = self._tokenizer(req_inputs["text_input"]) + return encodings.data["input_ids"] + + def _tokenize_openai_request_input(self, req_inputs: dict) -> List[int]: + """Tokenize the OpenAI request input texts.""" + payload = json.loads(req_inputs["payload"]) + if self._response_format == ResponseFormat.OPENAI_CHAT_COMPLETIONS: + input_text = payload["messages"][0]["content"] + elif self._response_format == ResponseFormat.OPENAI_COMPLETIONS: + input_text = payload["prompt"] + elif self._response_format == ResponseFormat.TRITON_GENERATE: + input_text = payload["text_input"] + else: + raise ValueError( + "Failed to parse OpenAI request input in profile export file." 
+ ) + encodings = self._tokenizer(input_text) + return encodings.data["input_ids"] + + def _tokenize_response_outputs(self, res_outputs: dict) -> List[List[int]]: + """Deserialize the response output and return tokenized outputs.""" + if self._service_kind == "triton": + return self._tokenize_triton_response_output(res_outputs) + elif self._service_kind == "openai": + return self._tokenize_openai_response_output(res_outputs) + else: + raise ValueError(f"Unknown service kind: '{self._service_kind}'.") + + def _tokenize_triton_response_output(self, res_outputs: dict) -> List[List[int]]: + """Tokenize the Triton response output texts.""" + output_texts = [] + for output in res_outputs: + output_texts.append(output["text_output"]) + return self._run_tokenizer(output_texts) + + def _tokenize_openai_response_output(self, res_outputs: dict) -> List[List[int]]: + """Tokenize the OpenAI response output texts.""" + output_texts = [] + for output in res_outputs: + if self._response_format == ResponseFormat.TRITON_GENERATE: + text = self._extract_generate_text_output(output["response"]) + else: + text = self._extract_openai_text_output(output["response"]) + output_texts.append(text) + return self._run_tokenizer(output_texts) + + def _run_tokenizer(self, output_texts: List[str]) -> List[List[int]]: + # exclamation mark trick forces the llama tokenization to consistently + # start each output with a specific token which allows us to safely skip + # the first token of every tokenized output and get only the ones that + # are returned by the model + output_texts = ["!" + txt for txt in output_texts] + encodings = self._tokenizer(output_texts) + return [out[1:] for out in encodings.data["input_ids"]] + + def _extract_generate_text_output(self, response: str) -> str: + response = remove_sse_prefix(response) + + if response == "": + return response + + data = json.loads(response) + return data["text_output"] + + def _extract_openai_text_output(self, response: str) -> str: + """Extracts text/content of the OpenAI response object.""" + response = remove_sse_prefix(response) + + if response == "[DONE]": + return "" + + data = json.loads(response) + completions = data["choices"][0] + + text_output = "" + if "object" not in data: + # FIXME: TPA-47 workaround for vLLM not following OpenAI Completions + # API specification when streaming, missing 'object' field: + # https://platform.openai.com/docs/api-reference/completions + text_output = completions.get("text", "") + elif data["object"] == "text_completion": # legacy + text_output = completions.get("text", "") + elif data["object"] == "chat.completion": # non-streaming + text_output = completions["message"].get("content", "") + elif data["object"] == "chat.completion.chunk": # streaming + text_output = completions["delta"].get("content", "") + else: + obj_type = data["object"] + raise ValueError(f"Unknown OpenAI response object type '{obj_type}'.") + return text_output + + def _is_openai_empty_response(self, response: str) -> bool: + """Returns true if the response is an openai response with no content (or empty content)""" + if self._response_format == ResponseFormat.TRITON_GENERATE: + text = self._extract_generate_text_output(response) + else: + text = self._extract_openai_text_output(response) + if text: + return False + return True diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/logging.py b/src/c++/perf_analyzer/genai-perf/genai_perf/logging.py index f5cab490a..db23dff06 100644 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/logging.py +++ 
b/src/c++/perf_analyzer/genai-perf/genai_perf/logging.py @@ -80,16 +80,6 @@ def init_logging() -> None: "level": "DEBUG", "propagate": False, }, - "genai_perf.export_data.json_exporter": { - "handlers": ["console"], - "level": "DEBUG", - "propagate": False, - }, - "genai_perf.export_data.csv_exporter": { - "handlers": ["console"], - "level": "DEBUG", - "propagate": False, - }, }, } logging.config.dictConfig(LOGGING_CONFIG) diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/main.py b/src/c++/perf_analyzer/genai-perf/genai_perf/main.py index 912ee4725..da5fd0e79 100755 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/main.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/main.py @@ -33,29 +33,26 @@ import genai_perf.logging as logging from genai_perf import parser +from genai_perf.constants import DEFAULT_PARQUET_FILE from genai_perf.exceptions import GenAIPerfException -from genai_perf.export_data.output_reporter import OutputReporter +from genai_perf.export_data.json_exporter import JsonExporter from genai_perf.llm_inputs.llm_inputs import LlmInputs +from genai_perf.llm_metrics import LLMProfileDataParser from genai_perf.plots.plot_config_parser import PlotConfigParser from genai_perf.plots.plot_manager import PlotManager -from genai_perf.profile_data_parser import LLMProfileDataParser, ProfileDataParser from genai_perf.tokenizer import Tokenizer, get_tokenizer def create_artifacts_dirs(args: Namespace) -> None: + # TMA-1911: support plots CLI option plot_dir = args.artifact_dir / "plots" os.makedirs(args.artifact_dir, exist_ok=True) - if hasattr(args, "generate_plots") and args.generate_plots: - os.makedirs(plot_dir, exist_ok=True) + os.makedirs(plot_dir, exist_ok=True) def generate_inputs(args: Namespace, tokenizer: Tokenizer) -> None: # TODO (TMA-1759): review if add_model_name is always true - if args.input_file: - filepath, _ = args.input_file - input_filename = Path(filepath) - else: - input_filename = None + input_filename = Path(args.input_file.name) if args.input_file else None add_model_name = True try: extra_input_dict = parser.get_extra_inputs_as_dict(args) @@ -82,22 +79,18 @@ def generate_inputs(args: Namespace, tokenizer: Tokenizer) -> None: add_stream=args.streaming, tokenizer=tokenizer, extra_inputs=extra_input_dict, - batch_size=args.batch_size, output_dir=args.artifact_dir, ) -def calculate_metrics(args: Namespace, tokenizer: Tokenizer) -> ProfileDataParser: - if args.endpoint_type in ["embeddings", "rankings"]: - return ProfileDataParser(args.profile_export_file) - else: - return LLMProfileDataParser( - filename=args.profile_export_file, - tokenizer=tokenizer, - ) +def calculate_metrics(args: Namespace, tokenizer: Tokenizer) -> LLMProfileDataParser: + return LLMProfileDataParser( + filename=args.profile_export_file, + tokenizer=tokenizer, + ) -def report_output(data_parser: ProfileDataParser, args: Namespace) -> None: +def report_output(data_parser: LLMProfileDataParser, args: Namespace) -> None: if args.concurrency: infer_mode = "concurrency" load_level = f"{args.concurrency}" @@ -108,10 +101,17 @@ def report_output(data_parser: ProfileDataParser, args: Namespace) -> None: raise GenAIPerfException("No valid infer mode specified") stats = data_parser.get_statistics(infer_mode, load_level) - reporter = OutputReporter(stats, args) - reporter.report_output() + export_csv_name = args.profile_export_file.with_name( + args.profile_export_file.stem + "_genai_perf.csv" + ) + stats.export_to_csv(export_csv_name) + stats.export_parquet(args.artifact_dir, 
DEFAULT_PARQUET_FILE) + stats.pretty_print() if args.generate_plots: create_plots(args) + extra_inputs_dict = parser.get_extra_inputs_as_dict(args) + json_exporter = JsonExporter(stats.stats_dict, args, extra_inputs_dict) + json_exporter.export_to_file(args.artifact_dir) def create_plots(args: Namespace) -> None: diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/metrics/__init__.py b/src/c++/perf_analyzer/genai-perf/genai_perf/metrics/__init__.py deleted file mode 100644 index 01ca53c59..000000000 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/metrics/__init__.py +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from genai_perf.metrics.llm_metrics import LLMMetrics -from genai_perf.metrics.metrics import MetricMetadata, Metrics -from genai_perf.metrics.statistics import Statistics diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/metrics/llm_metrics.py b/src/c++/perf_analyzer/genai-perf/genai_perf/metrics/llm_metrics.py deleted file mode 100755 index 13dff8a63..000000000 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/metrics/llm_metrics.py +++ /dev/null @@ -1,108 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from typing import List - -from genai_perf.metrics.metrics import MetricMetadata, Metrics - - -class LLMMetrics(Metrics): - """A simple dataclass that holds core LLM performance metrics.""" - - LLM_REQUEST_METRICS = [ - MetricMetadata("time_to_first_token", "ms"), - MetricMetadata("inter_token_latency", "ms"), - MetricMetadata("output_token_throughput_per_request", "tokens/sec"), - MetricMetadata("output_sequence_length", "tokens"), - MetricMetadata("input_sequence_length", "tokens"), - ] - - LLM_SYSTEM_METRICS = [ - # (TMA-1977) Make the unit consistent with statistics dict (e.g. tokens/sec) - MetricMetadata("output_token_throughput", "per sec"), - ] - - def __init__( - self, - request_throughputs: List[float] = [], - request_latencies: List[int] = [], - time_to_first_tokens: List[int] = [], - inter_token_latencies: List[int] = [], - output_token_throughputs: List[float] = [], - output_token_throughputs_per_request: List[int] = [], - output_sequence_lengths: List[int] = [], - input_sequence_lengths: List[int] = [], - chunked_inter_token_latencies: List[List[int]] = [[]], - ) -> None: - super().__init__(request_throughputs, request_latencies) - self.time_to_first_tokens = time_to_first_tokens - self.inter_token_latencies = inter_token_latencies - self.output_token_throughputs = output_token_throughputs - self.output_token_throughputs_per_request = output_token_throughputs_per_request - self.output_sequence_lengths = output_sequence_lengths - self.input_sequence_lengths = input_sequence_lengths - - # Keeping chunked ITL (old) as a WAR to preserve visualization. - # Excluded from data. - self._chunked_inter_token_latencies = chunked_inter_token_latencies - - # add base name mapping - self._base_names["time_to_first_tokens"] = "time_to_first_token" - self._base_names["inter_token_latencies"] = "inter_token_latency" - self._base_names["output_token_throughputs"] = "output_token_throughput" - self._base_names["output_token_throughputs_per_request"] = ( - "output_token_throughput_per_request" - ) - self._base_names["output_sequence_lengths"] = "output_sequence_length" - self._base_names["input_sequence_lengths"] = "input_sequence_length" - - @property - def request_metrics(self) -> List[MetricMetadata]: - base_metrics = super().request_metrics # base metrics - - # (TMA-1975) The order is hardcoded as below to avoid introducing any - # breaking changes to the users who might be parsing the outputs. However, - # we would eventually want to impose some consistent order such as a - # base metrics first and then task specific metrics. 
Uncomment the below - # line to enable this order: - # return base_metrics + self.LLM_REQUEST_METRICS - return ( - self.LLM_REQUEST_METRICS[:2] + base_metrics + self.LLM_REQUEST_METRICS[2:] - ) - - @property - def system_metrics(self) -> List[MetricMetadata]: - base_metrics = super().system_metrics # base metrics - - # (TMA-1975) The order is hardcoded as below to avoid introducing any - # breaking changes to the users who might be parsing the outputs. However, - # we would eventually want to impose some consistent order such as a - # base metrics first and then task specific metrics. Uncomment the below - # line to enable this order: - # return base_metrics + self.LLM_SYSTEM_METRICS - return self.LLM_SYSTEM_METRICS + base_metrics diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/metrics/metrics.py b/src/c++/perf_analyzer/genai-perf/genai_perf/metrics/metrics.py deleted file mode 100755 index 7e047094d..000000000 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/metrics/metrics.py +++ /dev/null @@ -1,88 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from dataclasses import dataclass -from typing import List - - -@dataclass -class MetricMetadata: - name: str - unit: str - - -class Metrics: - """A base class that contains common request level metrics.""" - - REQUEST_METRICS = [ - MetricMetadata("request_latency", "ms"), - ] - - SYSTEM_METRICS = [ - # (TMA-1977) Make the unit consistent with statistics dict (e.g. 
tokens/sec) - MetricMetadata("request_throughput", "per sec"), - ] - - def __init__( - self, - request_throughputs: List[float] = [], - request_latencies: List[int] = [], - ) -> None: - self.request_throughputs = request_throughputs - self.request_latencies = request_latencies - self._base_names = { - "request_throughputs": "request_throughput", - "request_latencies": "request_latency", - } - - def __repr__(self): - attr_strs = [] - for k, v in self.__dict__.items(): - if not k.startswith("_"): - attr_strs.append(f"{k}={v}") - return f"Metrics({','.join(attr_strs)})" - - @property - def request_metrics(self) -> List[MetricMetadata]: - return self.REQUEST_METRICS - - @property - def system_metrics(self) -> List[MetricMetadata]: - return self.SYSTEM_METRICS - - @property - def data(self) -> dict: - """Returns all the metrics.""" - return {k: v for k, v in self.__dict__.items() if not k.startswith("_")} - - def get_base_name(self, metric_name: str) -> str: - """Returns singular name of a given metric.""" - if metric_name in self._base_names: - return self._base_names[metric_name] - else: - raise KeyError(f"No metric named '{metric_name}' exists.") diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/metrics/statistics.py b/src/c++/perf_analyzer/genai-perf/genai_perf/metrics/statistics.py deleted file mode 100755 index f0d12cef6..000000000 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/metrics/statistics.py +++ /dev/null @@ -1,196 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from collections import defaultdict -from pathlib import Path -from typing import Dict, List, Union - -import numpy as np -import pandas as pd -from genai_perf.metrics.metrics import Metrics - - -class Statistics: - """A class that aggregates various statistics from given metrics class. 
- - The Statistics class goes through each metric in the metrics class and - calculates several statistics such as: - - average (arithmetic mean) - - percentiles (p25, p50, p75, p90, p95, p99) - - minimum & maximum - - standard deviation - The class will store each calculated statistics as part of its attribute. - - Example: - - >>> metrics = LLMMetrics(request_throughputs=[2, 4]) - >>> stats = Statistics(metrics) - >>> print(stats.avg_request_throughput) # output: 3 - """ - - def __init__(self, metrics: Metrics): - # iterate through Metrics to calculate statistics and set attributes - self._metrics = metrics - self._stats_dict: Dict = defaultdict(dict) - for attr, data in metrics.data.items(): - if self._should_skip(data, attr): - continue - - attr = metrics.get_base_name(attr) - self._add_units(attr) - self._calculate_mean(data, attr) - if not self._is_system_metric(metrics, attr): - self._calculate_percentiles(data, attr) - self._calculate_minmax(data, attr) - self._calculate_std(data, attr) - - def _should_skip(self, data: List[Union[int, float]], attr: str) -> bool: - """Checks if some metrics should be skipped.""" - # No data points - if len(data) == 0: - return True - # Skip ITL when non-streaming (all zero) - elif attr == "inter_token_latencies" and sum(data) == 0: - return True - return False - - def _calculate_mean(self, data: List[Union[int, float]], attr: str) -> None: - avg = np.mean(data) - setattr(self, "avg_" + attr, avg) - self._stats_dict[attr]["avg"] = float(avg) - - def _calculate_percentiles(self, data: List[Union[int, float]], attr: str) -> None: - p25, p50, p75 = np.percentile(data, [25, 50, 75]) - p90, p95, p99 = np.percentile(data, [90, 95, 99]) - setattr(self, "p25_" + attr, p25) - setattr(self, "p50_" + attr, p50) - setattr(self, "p75_" + attr, p75) - setattr(self, "p90_" + attr, p90) - setattr(self, "p95_" + attr, p95) - setattr(self, "p99_" + attr, p99) - self._stats_dict[attr]["p99"] = float(p99) - self._stats_dict[attr]["p95"] = float(p95) - self._stats_dict[attr]["p90"] = float(p90) - self._stats_dict[attr]["p75"] = float(p75) - self._stats_dict[attr]["p50"] = float(p50) - self._stats_dict[attr]["p25"] = float(p25) - - def _calculate_minmax(self, data: List[Union[int, float]], attr: str) -> None: - min, max = np.min(data), np.max(data) - setattr(self, "min_" + attr, min) - setattr(self, "max_" + attr, max) - self._stats_dict[attr]["max"] = float(max) - self._stats_dict[attr]["min"] = float(min) - - def _calculate_std(self, data: List[Union[int, float]], attr: str) -> None: - std = np.std(data) - setattr(self, "std_" + attr, std) - self._stats_dict[attr]["std"] = float(std) - - def scale_data(self, factor: float = 1 / 1e6) -> None: - for k1, v1 in self.stats_dict.items(): - if self._is_time_metric(k1): - for k2, v2 in v1.items(): - if k2 != "unit": - self.stats_dict[k1][k2] = self._scale(v2, factor) - - def _scale(self, metric: float, factor: float = 1 / 1e6) -> float: - """ - Scale metrics from nanoseconds by factor. - Default is nanoseconds to milliseconds. 
- """ - return metric * factor - - def _add_units(self, key) -> None: - if self._is_time_metric(key): - self._stats_dict[key]["unit"] = "ms" - elif key == "request_throughput": - self._stats_dict[key]["unit"] = "requests/sec" - elif key.startswith("output_token_throughput"): - self._stats_dict[key]["unit"] = "tokens/sec" - elif "sequence_length" in key: - self._stats_dict[key]["unit"] = "tokens" - else: - self._stats_dict[key]["unit"] = "" - - def __repr__(self) -> str: - attr_strs = [] - for k, v in self.__dict__.items(): - if not k.startswith("_"): - attr_strs.append(f"{k}={v}") - return f"Statistics({','.join(attr_strs)})" - - @property - def data(self) -> dict: - """Return all the aggregated statistics.""" - return {k: v for k, v in self.__dict__.items() if not k.startswith("_")} - - @property - def metrics(self) -> Metrics: - """Return the underlying metrics used to calculate the statistics.""" - return self._metrics - - @property - def stats_dict(self) -> Dict: - return self._stats_dict - - def _is_system_metric(self, metrics: Metrics, attr: str) -> bool: - return attr in [m.name for m in metrics.system_metrics] - - def _is_time_metric(self, field: str) -> bool: - # TPA-188: Remove the hardcoded time metrics list - time_metrics = [ - "inter_token_latency", - "time_to_first_token", - "request_latency", - ] - return field in time_metrics - - def export_parquet(self, artifact_dir: Path, filename: str) -> None: - max_length = -1 - col_index = 0 - filler_list = [] - df = pd.DataFrame() - - # Data frames require all columns of the same length - # find the max length column - for key, value in self._metrics.data.items(): - max_length = max(max_length, len(value)) - - # Insert None for shorter columns to match longest column - for key, value in self._metrics.data.items(): - if len(value) < max_length: - diff = max_length - len(value) - filler_list = [None] * diff - df.insert(col_index, key, value + filler_list) - diff = 0 - filler_list = [] - col_index = col_index + 1 - - filepath = artifact_dir / f"{filename}.gzip" - df.to_parquet(filepath, compression="gzip") diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py b/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py index 521b30e53..5416ee331 100644 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py @@ -25,12 +25,9 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import argparse -import json import os import sys -from enum import Enum, auto from pathlib import Path -from typing import Tuple import genai_perf.logging as logging import genai_perf.utils as utils @@ -52,30 +49,13 @@ from . import __version__ - -class PathType(Enum): - FILE = auto() - DIRECTORY = auto() - - def to_lowercase(self): - return self.name.lower() - - -class Subcommand(Enum): - PROFILE = auto() - COMPARE = auto() - - def to_lowercase(self): - return self.name.lower() - - logger = logging.getLogger(__name__) _endpoint_type_map = { "chat": "v1/chat/completions", "completions": "v1/completions", - "embeddings": "v1/embeddings", - "rankings": "v1/ranking", + "generate": "v2/models/{MODEL_NAME}/generate", + "kserve": "v2/models/{MODEL_NAME}/infer", } @@ -85,7 +65,7 @@ def _check_model_args( """ Check if model name is provided. 
""" - if not args.model: + if not args.subcommand and not args.model: parser.error("The -m/--model option is required and cannot be empty.") args = _convert_str_to_enum_entry( args, "model_selection_strategy", ModelSelectionStrategy @@ -110,8 +90,9 @@ def _check_compare_args( """ Check compare subcommand args """ - if not args.config and not args.files: - parser.error("Either the --config or --files option must be specified.") + if args.subcommand == "compare": + if not args.config and not args.files: + parser.error("Either the --config or --files option must be specified.") return args @@ -122,35 +103,31 @@ def _check_conditional_args( Check for conditional args and raise an error if they are not set. """ - # Endpoint and output format checks - if args.service_kind == "openai": - if args.endpoint_type is None: - parser.error( - "The --endpoint-type option is required when using the 'openai' service-kind." - ) - else: - if args.endpoint_type == "chat": - args.output_format = OutputFormat.OPENAI_CHAT_COMPLETIONS - elif args.endpoint_type == "completions": - args.output_format = OutputFormat.OPENAI_COMPLETIONS - elif args.endpoint_type == "embeddings": - args.output_format = OutputFormat.OPENAI_EMBEDDINGS - elif args.endpoint_type == "rankings": - args.output_format = OutputFormat.RANKINGS - - if args.endpoint is not None: - args.endpoint = args.endpoint.lstrip(" /") - else: - args.endpoint = _endpoint_type_map[args.endpoint_type] - elif args.endpoint_type is not None: - parser.error( - "The --endpoint-type option should only be used when using the 'openai' service-kind." - ) - - if args.service_kind == "triton": + if args.endpoint_type == "chat": + args.output_format = OutputFormat.OPENAI_CHAT_COMPLETIONS + args.service_kind = "openai" + elif args.endpoint_type == "completions": + args.output_format = OutputFormat.OPENAI_COMPLETIONS + args.service_kind = "openai" + elif args.endpoint_type == "generate": + args.output_format = OutputFormat.TRITON_GENERATE + args.service_kind = "openai" + elif args.endpoint_type == "kserve": + args.service_kind = "triton" args = _convert_str_to_enum_entry(args, "backend", OutputFormat) args.output_format = args.backend + if args.endpoint is not None: + args.endpoint = args.endpoint.lstrip(" /") + else: + if args.model: + model_name = args.model[0] + else: + model_name = "" + args.endpoint = _endpoint_type_map[args.endpoint_type].format( + MODEL_NAME=model_name + ) + # Output token distribution checks if args.output_tokens_mean == LlmInputs.DEFAULT_OUTPUT_TOKENS_MEAN: if args.output_tokens_stddev != LlmInputs.DEFAULT_OUTPUT_TOKENS_STDDEV: @@ -165,51 +142,12 @@ def _check_conditional_args( if args.service_kind != "triton": if args.output_tokens_mean_deterministic: parser.error( - "The --output-tokens-mean-deterministic option is only supported with the Triton service-kind." + "The --output-tokens-mean-deterministic option is only supported with the kserve endpoint type." ) - _check_conditional_args_embeddings_rankings(parser, args) - return args -def _check_conditional_args_embeddings_rankings( - parser: argparse.ArgumentParser, args: argparse.Namespace -): - - if args.output_format in [ - OutputFormat.OPENAI_EMBEDDINGS, - OutputFormat.RANKINGS, - ]: - if args.streaming: - parser.error( - f"The --streaming option is not supported with the {args.endpoint_type} endpoint type." - ) - - if args.generate_plots: - parser.error( - f"The --generate-plots option is not currently supported with the {args.endpoint_type} endpoint type." 
- ) - else: - if args.batch_size != LlmInputs.DEFAULT_BATCH_SIZE: - parser.error( - "The --batch-size option is currently only supported with the embeddings and rankings endpoint types." - ) - - if args.input_file: - _, path_type = args.input_file - if args.output_format != OutputFormat.RANKINGS: - if path_type == "directory": - parser.error( - "A directory is only currently supported for the rankings endpoint type." - ) - else: - if path_type == PathType.FILE: - parser.error( - "The rankings endpoint-type requires a directory value for the --input-file flag." - ) - - def _check_load_manager_args(args: argparse.Namespace) -> argparse.Namespace: """ Check inference load args @@ -267,12 +205,7 @@ def _infer_prompt_source(args: argparse.Namespace) -> argparse.Namespace: logger.debug(f"Input source is the following dataset: {args.input_dataset}") elif args.input_file: args.prompt_source = PromptSource.FILE - if args.endpoint_type == "rankings": - logger.debug( - f"Input source is the following directory: {args.input_file[0]}" - ) - else: - logger.debug(f"Input source is the following file: {args.input_file[0]}") + logger.debug(f"Input source is the following file: {args.input_file.name}") else: args.prompt_source = PromptSource.SYNTHETIC logger.debug("Input source is synthetic data") @@ -289,40 +222,17 @@ def _convert_str_to_enum_entry(args, option, enum): return args -### Types ### - - -def file_or_directory(path: str) -> Tuple[Path, PathType]: - if os.path.isfile(path): - return (Path(path), PathType.FILE) - elif os.path.isdir(path): - return (Path(path), PathType.DIRECTORY) - else: - raise ValueError(f"'{path}' is not a valid file or directory") - - ### Parsers ### def _add_input_args(parser): input_group = parser.add_argument_group("Input") - input_group.add_argument( - "--batch-size", - "-b", - type=int, - default=LlmInputs.DEFAULT_BATCH_SIZE, - required=False, - help=f"The batch size of the requests GenAI-Perf should send. " - "This is currently only supported with the embeddings and rankings endpoint types.", - ) - input_group.add_argument( "--extra-inputs", action="append", help="Provide additional inputs to include with every request. " - "You can repeat this flag for multiple inputs. Inputs should be in an input_name:value format." - "Alternatively, a string representing a json formatted dict can be provided.", + "You can repeat this flag for multiple inputs. Inputs should be in an input_name:value format.", ) prompt_source_group = input_group.add_mutually_exclusive_group(required=False) @@ -337,14 +247,10 @@ def _add_input_args(parser): prompt_source_group.add_argument( "--input-file", - type=file_or_directory, + type=argparse.FileType("r"), default=None, required=False, - help="The input file containing the prompts to use for profiling. " - "Each line should be a JSON object with a 'text_input' field in JSONL format. " - 'Example: {"text_input": "Your prompt here"}' - "For the rankings endpoint-type, a directory should be passed in instead with " - 'a "queries.jsonl" file and a "passages.jsonl" file with the same format.', + help="The input file containing the single prompt to use for profiling.", ) input_group.add_argument( @@ -371,7 +277,7 @@ def _add_input_args(parser): help=f"When using --output-tokens-mean, this flag can be set to " "improve precision by setting the minimum number of tokens " "equal to the requested number of tokens. This is currently " - "supported with the Triton service-kind. " + "supported with the kserve endpoint type. 
" "Note that there is still some variability in the requested number " "of output tokens, but GenAi-Perf attempts its best effort with your " "model to get the right number of output tokens. ", @@ -479,10 +385,10 @@ def _add_endpoint_args(parser): endpoint_group.add_argument( "--backend", type=str, - choices=utils.get_enum_names(OutputFormat)[2:], + choices=["tensorrtllm", "vllm"], default="tensorrtllm", required=False, - help=f'When using the "triton" service-kind, ' + help=f'When using the "kserve" endpoint type, ' "this is the backend of the model. " "For the TENSORRT-LLM backend, you currently must set " "'exclude_input_in_output' to true in the model config to " @@ -499,21 +405,10 @@ def _add_endpoint_args(parser): endpoint_group.add_argument( "--endpoint-type", type=str, - choices=["chat", "completions", "embeddings", "rankings"], + choices=["chat", "completions", "generate", "kserve"], + default="kserve", required=False, - help=f"The endpoint-type to send requests to on the " - 'server. This is only used with the "openai" service-kind.', - ) - - endpoint_group.add_argument( - "--service-kind", - type=str, - choices=["triton", "openai"], - default="triton", - required=False, - help="The kind of service perf_analyzer will " - 'generate load for. In order to use "openai", ' - "you must specify an api via --endpoint-type.", + help=f"The endpoint-type for requests. Inputs will be formatted according to endpoint-type.", ) endpoint_group.add_argument( @@ -536,13 +431,6 @@ def _add_endpoint_args(parser): def _add_output_args(parser): output_group = parser.add_argument_group("Output") - output_group.add_argument( - "--artifact-dir", - type=Path, - default=Path(DEFAULT_ARTIFACT_DIR), - help="The directory to store all the (output) artifacts generated by " - "GenAI-Perf and Perf Analyzer.", - ) output_group.add_argument( "--generate-plots", action="store_true", @@ -559,6 +447,13 @@ def _add_output_args(parser): "For example, if the profile export file is profile_export.json, the genai-perf file will be " "exported to profile_export_genai_perf.csv.", ) + output_group.add_argument( + "--artifact-dir", + type=Path, + default=Path(DEFAULT_ARTIFACT_DIR), + help="The directory to store all the (output) artifacts generated by " + "GenAI-Perf and Perf Analyzer.", + ) def _add_other_args(parser): @@ -580,56 +475,60 @@ def _add_other_args(parser): help="An option to enable verbose mode.", ) + other_group.add_argument( + "--version", + action="version", + version="%(prog)s " + __version__, + help=f"An option to print the version and exit.", + ) + def get_extra_inputs_as_dict(args: argparse.Namespace) -> dict: request_inputs = {} if args.extra_inputs: for input_str in args.extra_inputs: - if input_str.startswith("{") and input_str.endswith("}"): - request_inputs.update(json.loads(input_str)) - else: - semicolon_count = input_str.count(":") - if semicolon_count != 1: - raise ValueError( - f"Invalid input format for --extra-inputs: {input_str}\n" - "Expected input format: 'input_name:value'" - ) - input_name, value = input_str.split(":", 1) - - if not input_name or not value: - raise ValueError( - f"Input name or value is empty in --extra-inputs: {input_str}\n" - "Expected input format: 'input_name:value'" - ) - - is_bool = value.lower() in ["true", "false"] - is_int = value.isdigit() - is_float = value.count(".") == 1 and ( - value[0] == "." 
or value.replace(".", "").isdigit() + semicolon_count = input_str.count(":") + if semicolon_count != 1: + raise ValueError( + f"Invalid input format for --extra-inputs: {input_str}\n" + "Expected input format: 'input_name:value'" ) + input_name, value = input_str.split(":", 1) - if is_bool: - value = value.lower() == "true" - elif is_int: - value = int(value) - elif is_float: - value = float(value) + if not input_name or not value: + raise ValueError( + f"Input name or value is empty in --extra-inputs: {input_str}\n" + "Expected input format: 'input_name:value'" + ) - if input_name in request_inputs: - raise ValueError( - f"Input name already exists in request_inputs dictionary: {input_name}" - ) - request_inputs[input_name] = value + is_bool = value.lower() in ["true", "false"] + is_int = value.isdigit() + is_float = value.count(".") == 1 and ( + value[0] == "." or value.replace(".", "").isdigit() + ) + + if is_bool: + value = value.lower() == "true" + elif is_int: + value = int(value) + elif is_float: + value = float(value) + + if input_name in request_inputs: + raise ValueError( + f"Input name already exists in request_inputs dictionary: {input_name}" + ) + request_inputs[input_name] = value return request_inputs def _parse_compare_args(subparsers) -> argparse.ArgumentParser: compare = subparsers.add_parser( - Subcommand.COMPARE.to_lowercase(), + "compare", description="Subcommand to generate plots that compare multiple profile runs.", ) - compare_group = compare.add_argument_group("Input") + compare_group = compare.add_argument_group("Compare") mx_group = compare_group.add_mutually_exclusive_group(required=False) mx_group.add_argument( "--config", @@ -651,20 +550,6 @@ def _parse_compare_args(subparsers) -> argparse.ArgumentParser: return compare -def _parse_profile_args(subparsers) -> argparse.ArgumentParser: - profile = subparsers.add_parser( - Subcommand.PROFILE.to_lowercase(), - description="Subcommand to profile LLMs and Generative AI models.", - ) - _add_endpoint_args(profile) - _add_input_args(profile) - _add_profile_args(profile) - _add_output_args(profile) - _add_other_args(profile) - profile.set_defaults(func=profile_handler) - return profile - - ### Handlers ### @@ -673,6 +558,12 @@ def create_compare_dir() -> None: os.mkdir(DEFAULT_COMPARE_DIR) +def profile_handler(args, extra_args): + from genai_perf.wrapper import Profiler + + Profiler.run(args=args, extra_args=extra_args) + + def compare_handler(args: argparse.Namespace): """Handles `compare` subcommand workflow.""" if args.files: @@ -687,75 +578,45 @@ def compare_handler(args: argparse.Namespace): plot_manager.generate_plots() -def profile_handler(args, extra_args): - from genai_perf.wrapper import Profiler - - Profiler.run(args=args, extra_args=extra_args) - +### Entrypoint ### -### Parser Initialization ### +def parse_args(): + argv = sys.argv -def init_parsers(): parser = argparse.ArgumentParser( prog="genai-perf", description="CLI to profile LLMs and Generative AI models with Perf Analyzer", formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) - parser.add_argument( - "--version", - action="version", - version="%(prog)s " + __version__, - help=f"An option to print the version and exit.", - ) + parser.set_defaults(func=profile_handler) + + # Conceptually group args for easier visualization + _add_endpoint_args(parser) + _add_input_args(parser) + _add_profile_args(parser) + _add_output_args(parser) + _add_other_args(parser) # Add subcommands subparsers = parser.add_subparsers( help="List of subparser commands.", 
dest="subcommand" ) - _ = _parse_compare_args(subparsers) - _ = _parse_profile_args(subparsers) - subparsers.required = True + compare_parser = _parse_compare_args(subparsers) - return parser - - -def get_passthrough_args_index(argv: list) -> int: + # Check for passthrough args if "--" in argv: passthrough_index = argv.index("--") logger.info(f"Detected passthrough args: {argv[passthrough_index + 1:]}") else: passthrough_index = len(argv) - return passthrough_index - - -def refine_args( - parser: argparse.ArgumentParser, args: argparse.Namespace -) -> argparse.Namespace: - if args.subcommand == Subcommand.PROFILE.to_lowercase(): - args = _infer_prompt_source(args) - args = _check_model_args(parser, args) - args = _check_conditional_args(parser, args) - args = _check_load_manager_args(args) - args = _set_artifact_paths(args) - elif args.subcommand == Subcommand.COMPARE.to_lowercase(): - args = _check_compare_args(parser, args) - else: - raise ValueError(f"Unknown subcommand: {args.subcommand}") - - return args - - -### Entrypoint ### - - -def parse_args(): - argv = sys.argv - - parser = init_parsers() - passthrough_index = get_passthrough_args_index(argv) args = parser.parse_args(argv[1:passthrough_index]) - args = refine_args(parser, args) + args = _infer_prompt_source(args) + args = _check_model_args(parser, args) + args = _check_conditional_args(parser, args) + args = _check_compare_args(compare_parser, args) + args = _check_load_manager_args(args) + args = _set_artifact_paths(args) return args, argv[passthrough_index + 1 :] diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/plots/plot_config_parser.py b/src/c++/perf_analyzer/genai-perf/genai_perf/plots/plot_config_parser.py index 00588f6bb..c174024a2 100755 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/plots/plot_config_parser.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/plots/plot_config_parser.py @@ -33,9 +33,8 @@ # Skip type checking to avoid mypy error # Issue: https://github.com/python/mypy/issues/10632 import yaml # type: ignore -from genai_perf.metrics import Statistics +from genai_perf.llm_metrics import LLMProfileDataParser, Statistics from genai_perf.plots.plot_config import PlotConfig, PlotType, ProfileRunData -from genai_perf.profile_data_parser import LLMProfileDataParser from genai_perf.tokenizer import DEFAULT_TOKENIZER, get_tokenizer from genai_perf.utils import load_yaml, scale @@ -109,12 +108,14 @@ def _get_metric(self, stats: Statistics, name: str) -> List[Union[int, float]]: if not name: # no metric return [] elif name == "inter_token_latencies": - itls = stats.metrics.data[name] - return [scale(x, (1 / 1e6)) for x in itls] # ns to ms + # Flatten ITL since they are grouped by request + itl_flatten = [] + for request_itls in stats.metrics.data[name]: + itl_flatten += request_itls + return [scale(x, (1 / 1e6)) for x in itl_flatten] # ns to ms elif name == "token_positions": - chunked_itls = getattr(stats.metrics, "_chunked_inter_token_latencies") token_positions: List[Union[int, float]] = [] - for request_itls in chunked_itls: + for request_itls in stats.metrics.data["inter_token_latencies"]: token_positions += list(range(1, len(request_itls) + 1)) return token_positions elif name == "time_to_first_tokens": @@ -168,11 +169,11 @@ def create_init_yaml_config(filenames: List[Path], output_dir: Path) -> None: output: {output_dir} plot3: - title: Distribution of Input Sequence Lengths to Output Sequence Lengths - x_metric: input_sequence_lengths - y_metric: output_sequence_lengths - x_label: Input Sequence 
Length - y_label: Output Sequence Length + title: Distribution of Input Tokens to Generated Tokens + x_metric: num_input_tokens + y_metric: num_output_tokens + x_label: Number of Input Tokens Per Request + y_label: Number of Generated Tokens Per Request width: {1200 if len(filenames) > 1 else 700} height: 450 type: heatmap @@ -180,10 +181,10 @@ def create_init_yaml_config(filenames: List[Path], output_dir: Path) -> None: output: {output_dir} plot4: - title: Time to First Token vs Input Sequence Lengths - x_metric: input_sequence_lengths + title: Time to First Token vs Number of Input Tokens + x_metric: num_input_tokens y_metric: time_to_first_tokens - x_label: Input Sequence Length + x_label: Number of Input Tokens y_label: Time to First Token (ms) width: {1200 if len(filenames) > 1 else 700} height: 450 diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/profile_data_parser/__init__.py b/src/c++/perf_analyzer/genai-perf/genai_perf/profile_data_parser/__init__.py deleted file mode 100644 index 2e7798c40..000000000 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/profile_data_parser/__init__.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from genai_perf.profile_data_parser.llm_profile_data_parser import LLMProfileDataParser -from genai_perf.profile_data_parser.profile_data_parser import ( - ProfileDataParser, - ResponseFormat, -) diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/profile_data_parser/llm_profile_data_parser.py b/src/c++/perf_analyzer/genai-perf/genai_perf/profile_data_parser/llm_profile_data_parser.py deleted file mode 100755 index cbb2da5ee..000000000 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/profile_data_parser/llm_profile_data_parser.py +++ /dev/null @@ -1,296 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import json -from itertools import tee -from pathlib import Path -from typing import Dict, List, Tuple - -from genai_perf.metrics import LLMMetrics, Metrics -from genai_perf.profile_data_parser.profile_data_parser import ( - ProfileDataParser, - ResponseFormat, -) -from genai_perf.tokenizer import Tokenizer -from genai_perf.utils import remove_sse_prefix - - -class LLMProfileDataParser(ProfileDataParser): - """A class that calculates and aggregates all the LLM performance statistics - across the Perf Analyzer profile results. - - The LLMProfileDataParser class parses profile export JSON file, collects the - core LLM performance metrics, and calculates summary statistics for each - different Perf Analyzer runs/experiments. - - Example: - - >>> ... # run Perf Analyzer with concurrency level 10 - >>> - >>> from transformers import AutoTokenizer - >>> - >>> tokenizer = AutoTokenizer.from_pretrained("gpt2") - >>> pd = LLMProfileDataParser( - >>> filename="profile_export.json", - >>> tokenizer=tokenizer, - >>> ) - >>> stats = pd.get_statistics(infer_mode="concurrency", level=10) - >>> - >>> print(stats) # output: Statistics(avg_time_to_first_token=...) - >>> stats.pretty_print() # Output: time_to_first_token_s: ... 
- """ - - def __init__( - self, - filename: Path, - tokenizer: Tokenizer, - ) -> None: - self._tokenizer = tokenizer - super().__init__(filename) - - def _parse_requests(self, requests: dict) -> Metrics: - """Parse each requests in profile export data to extract key metrics.""" - min_req_timestamp, max_res_timestamp = float("inf"), 0 - request_latencies = [] - time_to_first_tokens = [] - inter_token_latencies = [] - output_token_throughputs_per_request = [] - input_sequence_lengths = [] - output_sequence_lengths = [] - chunked_inter_token_latencies = [] - - for request in requests: - req_timestamp = request["timestamp"] - req_inputs = request["request_inputs"] - res_timestamps = request["response_timestamps"] - res_outputs = request["response_outputs"] - - self._preprocess_response(res_timestamps, res_outputs) - - # Skip requests with empty response. This happens sometimes when the - # model returns a single response with empty string. - if not res_timestamps: - continue - - # track entire benchmark duration - min_req_timestamp = min(min_req_timestamp, req_timestamp) - max_res_timestamp = max(max_res_timestamp, res_timestamps[-1]) - - # request latencies - req_latency_ns = res_timestamps[-1] - req_timestamp - request_latencies.append(req_latency_ns) # nanosec - req_latency_s = req_latency_ns / 1e9 # sec - - # time to first token - ttft = res_timestamps[0] - req_timestamp - time_to_first_tokens.append(ttft) - - # number of input tokens - input_seq_len = self._get_input_token_count(req_inputs) - input_sequence_lengths.append(input_seq_len) - - # output token throughput per request - output_token_counts, total_output_token = self._get_output_token_counts( - res_outputs - ) - output_token_throughputs_per_request.append( - total_output_token / req_latency_s - ) - output_sequence_lengths.append(total_output_token) - - # inter token latencies - if total_output_token > 1: - inter_token_latency = (req_latency_ns - ttft) / (total_output_token - 1) - inter_token_latencies.append(round(inter_token_latency)) - - # The new ITL calculation above loses all token-level ITL information - # and as a result breaks ITL vs token position visualization. Keep - # the old version of inter token latency as a WAR to preserve the - # visualization. - chunked_inter_token_latency = [] - for (t1, _), (t2, n2) in self._pairwise( - zip(res_timestamps, output_token_counts) - ): - # TMA-1676: handle empty first/last responses - # if the latter response has zero token (e.g. empty string), - # then set it default to one for the sake of inter token latency - # calculation and to avoid divide by zero. 
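(Illustrative aside, not part of the patch.) The chunked inter-token-latency arithmetic described by the comments above, which the new parser earlier in this diff also uses, pairs consecutive (timestamp, token count) tuples and divides each time gap by the latter token count, falling back to one for empty responses. A minimal sketch with hypothetical values:

    from itertools import tee

    def pairwise(iterable):
        # Consecutive-pair helper, equivalent to the parser's _pairwise().
        a, b = tee(iterable)
        next(b, None)
        return zip(a, b)

    # Hypothetical response timestamps (ns) and per-response token counts.
    res_timestamps = [100, 150, 210]
    token_counts = [1, 2, 0]

    itls = []
    for (t1, _), (t2, n2) in pairwise(zip(res_timestamps, token_counts)):
        n2 = 1 if n2 == 0 else n2          # guard against a zero-token (empty) response
        itls.append(round((t2 - t1) / n2))

    print(itls)                            # -> [25, 60]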
- num_token = 1 if n2 == 0 else n2 - chunked_inter_token_latency.append(round((t2 - t1) / num_token)) - chunked_inter_token_latencies.append(chunked_inter_token_latency) - - # request & output token throughput - benchmark_duration = (max_res_timestamp - min_req_timestamp) / 1e9 # nanosec - request_throughputs = [len(requests) / benchmark_duration] - output_token_throughputs = [sum(output_sequence_lengths) / benchmark_duration] - - return LLMMetrics( - request_throughputs, - request_latencies, - time_to_first_tokens, - inter_token_latencies, - output_token_throughputs, - output_token_throughputs_per_request, - output_sequence_lengths, - input_sequence_lengths, - chunked_inter_token_latencies, - ) - - def _pairwise(self, iterable): - """Generate pairs of consecutive elements from the given iterable.""" - a, b = tee(iterable) - next(b, None) - return zip(a, b) - - def _preprocess_response( - self, res_timestamps: List[int], res_outputs: List[Dict[str, str]] - ) -> None: - """Helper function to preprocess responses of a request.""" - if self._service_kind == "openai": - # PA sometimes receives multiple SSE responses at once (as a single - # response). Handle these responses by merging into a single response. - for i in range(len(res_outputs)): - response = res_outputs[i]["response"] - responses = response.strip().split("\n\n") - if len(responses) > 1: - merged_response = json.loads(remove_sse_prefix(responses[0])) - if ( - merged_response["choices"][0]["delta"].get("content", None) - is None - ): - merged_response["choices"][0]["delta"]["content"] = "" - for r in responses[1:]: - text = self._extract_openai_text_output(r) - merged_response["choices"][0]["delta"]["content"] += text - - res_outputs[i] = {"response": json.dumps(merged_response)} - - # Remove responses without any content - indices_to_remove = [] - for idx, out in enumerate(res_outputs): - if self._is_openai_empty_response(out["response"]): - indices_to_remove.append(idx) - indices_to_remove.sort(reverse=True) - for index in indices_to_remove: - res_timestamps.pop(index) - res_outputs.pop(index) - - def _get_input_token_count(self, req_inputs: dict) -> int: - """Deserialize the request input and return tokenized inputs.""" - if self._service_kind == "triton": - input_text = req_inputs["text_input"] - elif self._service_kind == "openai": - input_text = self._get_openai_input_text(req_inputs) - else: - raise ValueError(f"Unknown service kind: '{self._service_kind}'.") - - return len(self._tokenizer.encode(input_text)) - - def _get_openai_input_text(self, req_inputs: dict) -> str: - """Tokenize the OpenAI request input texts.""" - payload = json.loads(req_inputs["payload"]) - if self._response_format == ResponseFormat.OPENAI_CHAT_COMPLETIONS: - return payload["messages"][0]["content"] - elif self._response_format == ResponseFormat.OPENAI_COMPLETIONS: - return payload["prompt"] - else: - raise ValueError( - "Failed to parse OpenAI request input in profile export file." 
- ) - - def _get_output_token_counts( - self, res_outputs: List[Dict] - ) -> Tuple[List[int], int]: - """Return response-level token counts and total token count.""" - if self._service_kind == "triton": - output_texts = self._get_triton_output_tokens(res_outputs) - elif self._service_kind == "openai": - output_texts = self._get_openai_output_tokens(res_outputs) - else: - raise ValueError(f"Unknown service kind: '{self._service_kind}'.") - - full_text_token_count = len(self._tokenizer.encode("".join(output_texts))) - - output_tokens = self._get_response_output_tokens(output_texts) - output_token_counts = list(map(len, output_tokens)) - return output_token_counts, full_text_token_count - - def _get_triton_output_tokens(self, res_outputs: List[Dict]) -> List[str]: - """Return a list of Triton response texts.""" - return [r["text_output"] for r in res_outputs] - - def _get_openai_output_tokens(self, res_outputs: List[Dict]) -> List[str]: - """Return a list of OpenAI response texts.""" - output_texts = [] - for output in res_outputs: - text = self._extract_openai_text_output(output["response"]) - output_texts.append(text) - return output_texts - - def _get_response_output_tokens(self, output_texts: List[str]) -> List[List[int]]: - """Return a list of response output tokens.""" - # Exclamation mark trick forces the llama tokenization to consistently - # start each output with a specific token which allows us to safely skip - # the first token of every tokenized output and get only the ones that - # are returned by the model - encodings = self._tokenizer(["!" + txt for txt in output_texts]) - return [out[1:] for out in encodings.data["input_ids"]] - - def _extract_openai_text_output(self, response: str) -> str: - """Extracts text/content of the OpenAI response object.""" - response = remove_sse_prefix(response) - - if response == "[DONE]": - return "" - - data = json.loads(response) - completions = data["choices"][0] - - text_output = "" - if "object" not in data: - # FIXME: TPA-47 workaround for vLLM not following OpenAI Completions - # API specification when streaming, missing 'object' field: - # https://platform.openai.com/docs/api-reference/completions - text_output = completions.get("text", "") - elif data["object"] == "text_completion": # legacy - text_output = completions.get("text", "") - elif data["object"] == "chat.completion": # non-streaming - text_output = completions["message"].get("content", "") - elif data["object"] == "chat.completion.chunk": # streaming - text_output = completions["delta"].get("content", "") - else: - obj_type = data["object"] - raise ValueError(f"Unknown OpenAI response object type '{obj_type}'.") - return text_output - - def _is_openai_empty_response(self, response: str) -> bool: - """Returns true if the response is an openai response with no content (or empty content)""" - text = self._extract_openai_text_output(response) - if text: - return False - return True diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/profile_data_parser/profile_data_parser.py b/src/c++/perf_analyzer/genai-perf/genai_perf/profile_data_parser/profile_data_parser.py deleted file mode 100755 index d18d8f6fb..000000000 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/profile_data_parser/profile_data_parser.py +++ /dev/null @@ -1,139 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
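# [Illustrative sketch, not part of this patch] The exclamation-mark trick used
# by _get_response_output_tokens above prefixes every text with "!" so the
# tokenizer starts each encoding with the same token; dropping index 0 then
# leaves only the tokens produced for the text itself. A minimal demo with a
# plain Hugging Face tokenizer (the model name and add_special_tokens=False are
# assumptions for this demo, not taken from the genai-perf Tokenizer wrapper):
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")

texts = ["Hello world", ", how are you?"]
encodings = tokenizer(["!" + t for t in texts], add_special_tokens=False)
per_response_tokens = [ids[1:] for ids in encodings["input_ids"]]

# Each entry now holds only the token ids of that response chunk, so its length
# is a per-response token count.
print([len(ids) for ids in per_response_tokens])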
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from enum import Enum, auto -from pathlib import Path -from typing import List, Tuple - -from genai_perf.metrics import Metrics, Statistics -from genai_perf.utils import load_json - - -class ResponseFormat(Enum): - HUGGINGFACE_RANKINGS = auto() - OPENAI_CHAT_COMPLETIONS = auto() - OPENAI_COMPLETIONS = auto() - OPENAI_EMBEDDINGS = auto() - RANKINGS = auto() - TRITON = auto() - - -class ProfileDataParser: - """Base profile data parser class that reads the profile data JSON file to - extract core metrics and calculate various performance statistics. - """ - - def __init__(self, filename: Path) -> None: - data = load_json(filename) - self._get_profile_metadata(data) - self._parse_profile_data(data) - - def _get_profile_metadata(self, data: dict) -> None: - self._service_kind = data["service_kind"] - if self._service_kind == "openai": - if data["endpoint"] == "rerank": - self._response_format = ResponseFormat.HUGGINGFACE_RANKINGS - elif data["endpoint"] == "v1/chat/completions": - self._response_format = ResponseFormat.OPENAI_CHAT_COMPLETIONS - elif data["endpoint"] == "v1/completions": - self._response_format = ResponseFormat.OPENAI_COMPLETIONS - elif data["endpoint"] == "v1/embeddings": - self._response_format = ResponseFormat.OPENAI_EMBEDDINGS - elif data["endpoint"] == "v1/ranking": - self._response_format = ResponseFormat.RANKINGS - else: - # TPA-66: add PA metadata to handle this case - # When endpoint field is either empty or custom endpoint, fall - # back to parsing the response to extract the response format. 
- request = data["experiments"][0]["requests"][0] - response = request["response_outputs"][0]["response"] - if "chat.completion" in response: - self._response_format = ResponseFormat.OPENAI_CHAT_COMPLETIONS - elif "text_completion" in response: - self._response_format = ResponseFormat.OPENAI_COMPLETIONS - elif "embedding" in response: - self._response_format = ResponseFormat.OPENAI_EMBEDDINGS - elif "ranking" in response: - self._response_format = ResponseFormat.RANKINGS - else: - raise RuntimeError("Unknown OpenAI response format.") - - elif self._service_kind == "triton": - self._response_format = ResponseFormat.TRITON - else: - raise ValueError(f"Unknown service kind: {self._service_kind}") - - def _parse_profile_data(self, data: dict) -> None: - """Parse through the entire profile data to collect statistics.""" - self._profile_results = {} - for experiment in data["experiments"]: - infer_mode = experiment["experiment"]["mode"] - load_level = experiment["experiment"]["value"] - requests = experiment["requests"] - - metrics = self._parse_requests(requests) - - # aggregate and calculate statistics - statistics = Statistics(metrics) - self._profile_results[(infer_mode, str(load_level))] = statistics - - def _parse_requests(self, requests: dict) -> Metrics: - """Parse each request in profile data to extract core metrics.""" - min_req_timestamp, max_res_timestamp = float("inf"), 0 - request_latencies = [] - - for request in requests: - req_timestamp = request["timestamp"] - res_timestamps = request["response_timestamps"] - - # track entire benchmark duration - min_req_timestamp = min(min_req_timestamp, req_timestamp) - max_res_timestamp = max(max_res_timestamp, res_timestamps[-1]) - - # request latencies - req_latency = res_timestamps[-1] - req_timestamp - request_latencies.append(req_latency) - - # request throughput - benchmark_duration = (max_res_timestamp - min_req_timestamp) / 1e9 # to seconds - request_throughputs = [len(requests) / benchmark_duration] - - return Metrics( - request_throughputs, - request_latencies, - ) - - def get_statistics(self, infer_mode: str, load_level: str) -> Statistics: - """Return profile statistics if it exists.""" - if (infer_mode, load_level) not in self._profile_results: - raise KeyError(f"Profile with {infer_mode}={load_level} does not exist.") - return self._profile_results[(infer_mode, load_level)] - - def get_profile_load_info(self) -> List[Tuple[str, str]]: - """Return available (infer_mode, load_level) tuple keys.""" - return [k for k, _ in self._profile_results.items()] diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/test_end_to_end.py b/src/c++/perf_analyzer/genai-perf/genai_perf/test_end_to_end.py index a44304348..3cc2999f5 100644 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/test_end_to_end.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/test_end_to_end.py @@ -10,7 +10,7 @@ # For all cases but vllm_openai, it assumes that the server will be on port 9999 # # This script will run a sweep of all combinations of values in the testing matrix -# by appending those options on to the genai-perf base command +# by appending those options on to the genai-pa base command # @@ -20,11 +20,11 @@ ] base_commands = { - "nim_chat": "genai-perf profile -s 999 -p 20000 -m llama-2-7b-chat -u http://localhost:9999 --service-kind openai --endpoint-type chat", - "nim_completions": "genai-perf profile -s 999 -p 20000 -m llama-2-7b -u http://localhost:9999 --service-kind openai --endpoint-type completions", - "vllm_openai": "genai-perf profile -s 999 
-p 20000 -m mistralai/Mistral-7B-v0.1 --service-kind openai --endpoint-type chat", - "triton_tensorrtllm": "genai-perf profile -s 999 -p 20000 -m llama-2-7b -u 0.0.0.0:9999 --service-kind triton --backend tensorrtllm", - "triton_vllm": "genai-perf profile -s 999 -p 20000 -m gpt2_vllm --service-kind triton --backend vllm", + "nim_chat": "genai-perf -s 999 -p 20000 -m llama-2-7b-chat -u http://localhost:9999 --service-kind openai --endpoint-type chat", + "nim_completions": "genai-perf -s 999 -p 20000 -m llama-2-7b -u http://localhost:9999 --service-kind openai --endpoint-type completions", + "vllm_openai": "genai-perf -s 999 -p 20000 -m mistralai/Mistral-7B-v0.1 --service-kind openai --endpoint-type chat", + "triton_tensorrtllm": "genai-perf -s 999 -p 20000 -m llama-2-7b -u 0.0.0.0:9999 --service-kind triton --backend tensorrtllm", + "triton_vllm": "genai-perf -s 999 -p 20000 -m gpt2_vllm --service-kind triton --backend vllm", } testname = "" diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/wrapper.py b/src/c++/perf_analyzer/genai-perf/genai_perf/wrapper.py index dbaacc32b..e5f704423 100644 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/wrapper.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/wrapper.py @@ -64,7 +64,6 @@ def build_cmd(args: Namespace, extra_args: Optional[List[str]] = None) -> List[s skip_args = [ "artifact_dir", "backend", - "batch_size", "concurrency", "endpoint_type", "extra_inputs", diff --git a/src/c++/perf_analyzer/genai-perf/pyproject.toml b/src/c++/perf_analyzer/genai-perf/pyproject.toml index 982ee24b7..7be2c8474 100644 --- a/src/c++/perf_analyzer/genai-perf/pyproject.toml +++ b/src/c++/perf_analyzer/genai-perf/pyproject.toml @@ -46,7 +46,7 @@ maintainers = [] keywords = [] requires-python = ">=3.8,<4" dependencies = [ - "numpy<2", + "numpy", "pytest", "rich", "transformers", diff --git a/src/c++/perf_analyzer/genai-perf/tests/__init__.py b/src/c++/perf_analyzer/genai-perf/tests/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/src/c++/perf_analyzer/genai-perf/tests/test_artifacts.py b/src/c++/perf_analyzer/genai-perf/tests/test_artifacts.py deleted file mode 100644 index cdcc4afc9..000000000 --- a/src/c++/perf_analyzer/genai-perf/tests/test_artifacts.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from argparse import Namespace -from pathlib import Path - -import pytest -from genai_perf.main import create_artifacts_dirs - - -@pytest.fixture -def mock_makedirs(mocker): - return mocker.patch("os.makedirs") - - -def test_create_artifacts_dirs_custom_path(mock_makedirs): - artifacts_dir_path = "/genai_perf_artifacts" - mock_args = Namespace(artifact_dir=Path(artifacts_dir_path), generate_plots=True) - create_artifacts_dirs(mock_args) - mock_makedirs.assert_any_call( - Path(artifacts_dir_path), exist_ok=True - ), f"Expected os.makedirs to create artifacts directory inside {artifacts_dir_path} path." - mock_makedirs.assert_any_call( - Path(artifacts_dir_path) / "plots", exist_ok=True - ), f"Expected os.makedirs to create plots directory inside {artifacts_dir_path}/plots path." - assert mock_makedirs.call_count == 2 - - -def test_create_artifacts_disable_generate_plots(mock_makedirs): - artifacts_dir_path = "/genai_perf_artifacts" - mock_args = Namespace(artifact_dir=Path(artifacts_dir_path)) - create_artifacts_dirs(mock_args) - mock_makedirs.assert_any_call( - Path(artifacts_dir_path), exist_ok=True - ), f"Expected os.makedirs to create artifacts directory inside {artifacts_dir_path} path." - assert mock_makedirs.call_count == 1 diff --git a/src/c++/perf_analyzer/genai-perf/tests/test_cli.py b/src/c++/perf_analyzer/genai-perf/tests/test_cli.py index eb891fd02..15050184c 100644 --- a/src/c++/perf_analyzer/genai-perf/tests/test_cli.py +++ b/src/c++/perf_analyzer/genai-perf/tests/test_cli.py @@ -24,7 +24,6 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import argparse from pathlib import Path import genai_perf.logging as logging @@ -35,7 +34,6 @@ OutputFormat, PromptSource, ) -from genai_perf.parser import PathType class TestCLIArguments: @@ -52,7 +50,10 @@ class TestCLIArguments: [ (["-h"], expected_help_output), (["--help"], expected_help_output), + (["-m", "abc", "--help"], expected_help_output), + (["-m", "abc", "-h"], expected_help_output), (["--version"], expected_version_output), + (["-m", "abc", "--version"], expected_version_output), ], ) def test_help_version_arguments_output_and_exit( @@ -77,47 +78,19 @@ def test_help_version_arguments_output_and_exit( ["--artifact-dir", "test_artifact_dir"], {"artifact_dir": Path("test_artifact_dir")}, ), - ( - [ - "--batch-size", - "5", - "--endpoint-type", - "embeddings", - "--service-kind", - "openai", - ], - {"batch_size": 5}, - ), - ( - [ - "-b", - "5", - "--endpoint-type", - "embeddings", - "--service-kind", - "openai", - ], - {"batch_size": 5}, - ), (["--concurrency", "3"], {"concurrency": 3}), ( - ["--endpoint-type", "completions", "--service-kind", "openai"], + ["--endpoint-type", "completions"], {"endpoint": "v1/completions"}, ), ( - ["--endpoint-type", "chat", "--service-kind", "openai"], + ["--endpoint-type", "chat"], {"endpoint": "v1/chat/completions"}, ), - ( - ["--endpoint-type", "rankings", "--service-kind", "openai"], - {"endpoint": "v1/ranking"}, - ), ( [ "--endpoint-type", "chat", - "--service-kind", - "openai", "--endpoint", "custom/address", ], @@ -127,8 +100,6 @@ def test_help_version_arguments_output_and_exit( [ "--endpoint-type", "chat", - "--service-kind", - "openai", "--endpoint", " /custom/address", ], @@ -138,8 +109,6 @@ def test_help_version_arguments_output_and_exit( [ "--endpoint-type", "completions", - "--service-kind", - "openai", "--endpoint", "custom/address", ], @@ -158,17 +127,6 @@ def test_help_version_arguments_output_and_exit( ], {"extra_inputs": ["test_key:5", "another_test_key:6"]}, ), - ( - [ - "--extra-inputs", - '{"name": "Wolverine","hobbies": ["hacking", "slashing"],"address": {"street": "1407 Graymalkin Lane, Salem Center","city": "NY"}}', - ], - { - "extra_inputs": [ - '{"name": "Wolverine","hobbies": ["hacking", "slashing"],"address": {"street": "1407 Graymalkin Lane, Salem Center","city": "NY"}}' - ] - }, - ), (["--input-dataset", "openorca"], {"input_dataset": "openorca"}), (["--measurement-interval", "100"], {"measurement_interval": 100}), ( @@ -200,9 +158,9 @@ def test_help_version_arguments_output_and_exit( (["--random-seed", "8"], {"random_seed": 8}), (["--request-rate", "9.0"], {"request_rate": 9.0}), (["-s", "99.5"], {"stability_percentage": 99.5}), - (["--service-kind", "triton"], {"service_kind": "triton"}), + (["--endpoint-type", "kserve"], {"service_kind": "triton"}), ( - ["--service-kind", "openai", "--endpoint-type", "chat"], + ["--endpoint-type", "chat"], {"service_kind": "openai", "endpoint": "v1/chat/completions"}, ), (["--stability-percentage", "99.5"], {"stability_percentage": 99.5}), @@ -223,7 +181,7 @@ def test_help_version_arguments_output_and_exit( ) def test_non_file_flags_parsed(self, monkeypatch, arg, expected_attributes, capsys): logging.init_logging() - combined_args = ["genai-perf", "profile", "--model", "test_model"] + arg + combined_args = ["genai-perf", "--model", "test_model"] + arg monkeypatch.setattr("sys.argv", combined_args) args, _ = parser.parse_args() @@ -264,7 +222,7 @@ def test_multiple_model_args( self, monkeypatch, models, expected_model_list, formatted_name, capsys ): logging.init_logging() - 
combined_args = ["genai-perf", "profile"] + models + combined_args = ["genai-perf"] + models monkeypatch.setattr("sys.argv", combined_args) args, _ = parser.parse_args() @@ -281,10 +239,9 @@ def test_multiple_model_args( assert captured.out == "" def test_file_flags_parsed(self, monkeypatch, mocker): - _ = mocker.patch("os.path.isfile", return_value=True) + mocked_open = mocker.patch("builtins.open", mocker.mock_open(read_data="data")) combined_args = [ "genai-perf", - "profile", "--model", "test_model", "--input-file", @@ -292,39 +249,33 @@ def test_file_flags_parsed(self, monkeypatch, mocker): ] monkeypatch.setattr("sys.argv", combined_args) args, _ = parser.parse_args() - filepath, pathtype = args.input_file - assert filepath == Path( - "fakefile.txt" - ), "The file argument should be the path to the file" - assert pathtype == PathType.FILE + assert ( + args.input_file == mocked_open.return_value + ), "The file argument should be the mock object" @pytest.mark.parametrize( "arg, expected_path", [ ( - ["--service-kind", "openai", "--endpoint-type", "chat"], + ["--endpoint-type", "chat"], "artifacts/test_model-openai-chat-concurrency1", ), ( - ["--service-kind", "openai", "--endpoint-type", "completions"], + ["--endpoint-type", "completions"], "artifacts/test_model-openai-completions-concurrency1", ), ( - ["--service-kind", "openai", "--endpoint-type", "rankings"], - "artifacts/test_model-openai-rankings-concurrency1", - ), - ( - ["--service-kind", "triton", "--backend", "tensorrtllm"], + ["--endpoint-type", "kserve", "--backend", "tensorrtllm"], "artifacts/test_model-triton-tensorrtllm-concurrency1", ), ( - ["--service-kind", "triton", "--backend", "vllm"], + ["--endpoint-type", "kserve", "--backend", "vllm"], "artifacts/test_model-triton-vllm-concurrency1", ), ( [ - "--service-kind", - "triton", + "--endpoint-type", + "kserve", "--backend", "vllm", "--concurrency", @@ -338,7 +289,7 @@ def test_default_profile_export_filepath( self, monkeypatch, arg, expected_path, capsys ): logging.init_logging() - combined_args = ["genai-perf", "profile", "--model", "test_model"] + arg + combined_args = ["genai-perf", "--model", "test_model"] + arg monkeypatch.setattr("sys.argv", combined_args) args, _ = parser.parse_args() @@ -361,8 +312,6 @@ def test_default_profile_export_filepath( [ "--model", "hello/world/test_model", - "--service-kind", - "openai", "--endpoint-type", "chat", ], @@ -378,7 +327,7 @@ def test_model_name_artifact_path( self, monkeypatch, arg, expected_path, expected_output, capsys ): logging.init_logging() - combined_args = ["genai-perf", "profile"] + arg + combined_args = ["genai-perf"] + arg monkeypatch.setattr("sys.argv", combined_args) args, _ = parser.parse_args() @@ -388,9 +337,7 @@ def test_model_name_artifact_path( def test_default_load_level(self, monkeypatch, capsys): logging.init_logging() - monkeypatch.setattr( - "sys.argv", ["genai-perf", "profile", "--model", "test_model"] - ) + monkeypatch.setattr("sys.argv", ["genai-perf", "--model", "test_model"]) args, _ = parser.parse_args() assert args.concurrency == 1 captured = capsys.readouterr() @@ -398,8 +345,7 @@ def test_default_load_level(self, monkeypatch, capsys): def test_load_level_mutually_exclusive(self, monkeypatch, capsys): monkeypatch.setattr( - "sys.argv", - ["genai-perf", "profile", "--concurrency", "3", "--request-rate", "9.0"], + "sys.argv", ["genai-perf", "--concurrency", "3", "--request-rate", "9.0"] ) expected_output = ( "argument --request-rate: not allowed with argument --concurrency" @@ -413,7 +359,7 @@ 
def test_load_level_mutually_exclusive(self, monkeypatch, capsys): assert expected_output in captured.err def test_model_not_provided(self, monkeypatch, capsys): - monkeypatch.setattr("sys.argv", ["genai-perf", "profile"]) + monkeypatch.setattr("sys.argv", ["genai-perf"]) expected_output = "The -m/--model option is required and cannot be empty." with pytest.raises(SystemExit) as excinfo: @@ -424,7 +370,7 @@ def test_model_not_provided(self, monkeypatch, capsys): assert expected_output in captured.err def test_pass_through_args(self, monkeypatch): - args = ["genai-perf", "profile", "-m", "test_model"] + args = ["genai-perf", "-m", "test_model"] other_args = ["--", "With", "great", "power"] monkeypatch.setattr("sys.argv", args + other_args) _, pass_through_args = parser.parse_args() @@ -436,7 +382,6 @@ def test_unrecognized_arg(self, monkeypatch, capsys): "sys.argv", [ "genai-perf", - "profile", "-m", "nonexistent_model", "--wrong-arg", @@ -455,44 +400,12 @@ def test_unrecognized_arg(self, monkeypatch, capsys): "args, expected_output", [ ( - [ - "genai-perf", - "profile", - "-m", - "test_model", - "--service-kind", - "openai", - ], - "The --endpoint-type option is required when using the 'openai' service-kind.", - ), - ( - [ - "genai-perf", - "profile", - "-m", - "test_model", - "--service-kind", - "openai", - "--endpoint", - "custom/address", - ], - "The --endpoint-type option is required when using the 'openai' service-kind.", - ), - ( - [ - "genai-perf", - "profile", - "-m", - "test_model", - "--output-tokens-stddev", - "5", - ], + ["genai-perf", "-m", "test_model", "--output-tokens-stddev", "5"], "The --output-tokens-mean option is required when using --output-tokens-stddev.", ), ( [ "genai-perf", - "profile", "-m", "test_model", "--output-tokens-mean-deterministic", @@ -502,7 +415,6 @@ def test_unrecognized_arg(self, monkeypatch, capsys): ( [ "genai-perf", - "profile", "-m", "test_model", "--output-tokens-mean-deterministic", @@ -512,85 +424,15 @@ def test_unrecognized_arg(self, monkeypatch, capsys): ( [ "genai-perf", - "profile", "-m", "test_model", - "--service-kind", - "openai", "--endpoint-type", "chat", "--output-tokens-mean", "100", "--output-tokens-mean-deterministic", ], - "The --output-tokens-mean-deterministic option is only supported with the Triton service-kind", - ), - ( - [ - "genai-perf", - "profile", - "-m", - "test_model", - "--batch-size", - "10", - ], - "The --batch-size option is currently only supported with the embeddings and rankings endpoint types", - ), - ( - [ - "genai-perf", - "profile", - "-m", - "test_model", - "--service-kind", - "openai", - "--endpoint-type", - "embeddings", - "--streaming", - ], - "The --streaming option is not supported with the embeddings endpoint type", - ), - ( - [ - "genai-perf", - "profile", - "-m", - "test_model", - "--service-kind", - "openai", - "--endpoint-type", - "rankings", - "--streaming", - ], - "The --streaming option is not supported with the rankings endpoint type", - ), - ( - [ - "genai-perf", - "profile", - "-m", - "test_model", - "--service-kind", - "openai", - "--endpoint-type", - "embeddings", - "--generate-plots", - ], - "The --generate-plots option is not currently supported with the embeddings endpoint type", - ), - ( - [ - "genai-perf", - "profile", - "-m", - "test_model", - "--service-kind", - "openai", - "--endpoint-type", - "rankings", - "--generate-plots", - ], - "The --generate-plots option is not currently supported with the rankings endpoint type", + "The --output-tokens-mean-deterministic option is only 
supported with the kserve endpoint type", ), ], ) @@ -608,17 +450,15 @@ def test_conditional_errors(self, args, expected_output, monkeypatch, capsys): "args, expected_format", [ ( - ["--service-kind", "openai", "--endpoint-type", "chat"], + ["--endpoint-type", "chat"], OutputFormat.OPENAI_CHAT_COMPLETIONS, ), ( - ["--service-kind", "openai", "--endpoint-type", "completions"], + ["--endpoint-type", "completions"], OutputFormat.OPENAI_COMPLETIONS, ), ( [ - "--service-kind", - "openai", "--endpoint-type", "completions", "--endpoint", @@ -627,20 +467,14 @@ def test_conditional_errors(self, args, expected_output, monkeypatch, capsys): OutputFormat.OPENAI_COMPLETIONS, ), ( - ["--service-kind", "openai", "--endpoint-type", "rankings"], - OutputFormat.RANKINGS, - ), - ( - ["--service-kind", "triton", "--backend", "tensorrtllm"], + ["--endpoint-type", "kserve", "--backend", "tensorrtllm"], OutputFormat.TENSORRTLLM, ), - (["--service-kind", "triton", "--backend", "vllm"], OutputFormat.VLLM), + (["--endpoint-type", "kserve", "--backend", "vllm"], OutputFormat.VLLM), ], ) def test_inferred_output_format(self, monkeypatch, args, expected_format): - monkeypatch.setattr( - "sys.argv", ["genai-perf", "profile", "-m", "test_model"] + args - ) + monkeypatch.setattr("sys.argv", ["genai-perf", "-m", "test_model"] + args) parsed_args, _ = parser.parse_args() assert parsed_args.output_format == expected_format @@ -671,7 +505,7 @@ def test_inferred_output_format(self, monkeypatch, args, expected_format): ], ) def test_repeated_extra_arg_warning(self, monkeypatch, args, expected_error): - combined_args = ["genai-perf", "profile", "-m", "test_model"] + args + combined_args = ["genai-perf", "-m", "test_model"] + args monkeypatch.setattr("sys.argv", combined_args) parsed_args, _ = parser.parse_args() @@ -697,9 +531,7 @@ def test_inferred_prompt_source( self, monkeypatch, mocker, args, expected_prompt_source ): _ = mocker.patch("builtins.open", mocker.mock_open(read_data="data")) - _ = mocker.patch("os.path.isfile", return_value=True) - _ = mocker.patch("os.path.isdir", return_value=True) - combined_args = ["genai-perf", "profile", "--model", "test_model"] + args + combined_args = ["genai-perf", "--model", "test_model"] + args monkeypatch.setattr("sys.argv", combined_args) args, _ = parser.parse_args() @@ -707,11 +539,8 @@ def test_inferred_prompt_source( def test_prompt_source_assertions(self, monkeypatch, mocker, capsys): _ = mocker.patch("builtins.open", mocker.mock_open(read_data="data")) - _ = mocker.patch("os.path.isfile", return_value=True) - _ = mocker.patch("os.path.isdir", return_value=True) args = [ "genai-perf", - "profile", "--model", "test_model", "--input-dataset", @@ -787,30 +616,15 @@ def test_compare_not_provided(self, monkeypatch, capsys): assert expected_output in captured.err @pytest.mark.parametrize( - "extra_inputs_list, expected_dict", + "args, expected_model", [ - (["test_key:test_value"], {"test_key": "test_value"}), - ( - ["test_key:1", "another_test_key:2"], - {"test_key": 1, "another_test_key": 2}, - ), - ( - [ - '{"name": "Wolverine","hobbies": ["hacking", "slashing"],"address": {"street": "1407 Graymalkin Lane, Salem Center","city": "NY"}}' - ], - { - "name": "Wolverine", - "hobbies": ["hacking", "slashing"], - "address": { - "street": "1407 Graymalkin Lane, Salem Center", - "city": "NY", - }, - }, - ), + (["--files", "profile1.json", "profile2.json", "profile3.json"], None), + (["--config", "config.yaml"], None), ], ) - def test_get_extra_inputs_as_dict(self, extra_inputs_list, 
expected_dict): - namespace = argparse.Namespace() - namespace.extra_inputs = extra_inputs_list - actual_dict = parser.get_extra_inputs_as_dict(namespace) - assert actual_dict == expected_dict + def test_compare_model_arg(self, monkeypatch, args, expected_model): + combined_args = ["genai-perf", "compare"] + args + monkeypatch.setattr("sys.argv", combined_args) + args, _ = parser.parse_args() + + assert args.model == expected_model diff --git a/src/c++/perf_analyzer/genai-perf/tests/test_console_exporter.py b/src/c++/perf_analyzer/genai-perf/tests/test_console_exporter.py deleted file mode 100644 index dda62e04a..000000000 --- a/src/c++/perf_analyzer/genai-perf/tests/test_console_exporter.py +++ /dev/null @@ -1,176 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
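# [Illustrative sketch, not part of this patch] The removed
# test_get_extra_inputs_as_dict above expected --extra-inputs values given as
# "key:value" pairs (with numeric values coerced) or as a raw JSON object to be
# merged into one dict. A rough standalone re-creation of that behaviour, not
# the actual genai_perf.parser implementation:
import json


def extra_inputs_to_dict(extra_inputs):
    result = {}
    for item in extra_inputs:
        if item.startswith("{"):
            # A raw JSON object contributes all of its keys at once.
            result.update(json.loads(item))
        else:
            key, value = item.split(":", 1)
            try:
                value = json.loads(value)  # coerce numbers/booleans when possible
            except json.JSONDecodeError:
                pass  # keep the value as a plain string
            result[key] = value
    return result


print(extra_inputs_to_dict(["test_key:1", "another_test_key:2"]))
# -> {'test_key': 1, 'another_test_key': 2}
print(extra_inputs_to_dict(["test_key:test_value"]))
# -> {'test_key': 'test_value'}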
- -from genai_perf import parser -from genai_perf.export_data.console_exporter import ConsoleExporter -from genai_perf.export_data.exporter_config import ExporterConfig -from genai_perf.metrics import LLMMetrics, Metrics, Statistics - - -class TestConsoleExporter: - - def test_streaming_llm_output(self, monkeypatch, capsys) -> None: - argv = [ - "genai-perf", - "profile", - "-m", - "model_name", - "--service-kind", - "openai", - "--endpoint-type", - "chat", - "--streaming", - ] - monkeypatch.setattr("sys.argv", argv) - args, _ = parser.parse_args() - - metrics = LLMMetrics( - request_throughputs=[123], - request_latencies=[4, 5, 6], - time_to_first_tokens=[7, 8, 9], - inter_token_latencies=[10, 11, 12], - output_token_throughputs=[456], - output_sequence_lengths=[1, 2, 3], - input_sequence_lengths=[5, 6, 7], - ) - stats = Statistics(metrics=metrics) - - config = ExporterConfig() - config.stats = stats.stats_dict - config.metrics = stats.metrics - config.args = args - - exporter = ConsoleExporter(config) - exporter.export() - - expected_content = ( - " LLM Metrics \n" - "┏━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┓\n" - "┃ Statistic ┃ avg ┃ min ┃ max ┃ p99 ┃ p90 ┃ p75 ┃\n" - "┡━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━┩\n" - "│ Time to first token (ms) │ 8.00 │ 7.00 │ 9.00 │ 8.98 │ 8.80 │ 8.50 │\n" - "│ Inter token latency (ms) │ 11.00 │ 10.00 │ 12.00 │ 11.98 │ 11.80 │ 11.50 │\n" - "│ Request latency (ms) │ 5.00 │ 4.00 │ 6.00 │ 5.98 │ 5.80 │ 5.50 │\n" - "│ Output sequence length │ 2.00 │ 1.00 │ 3.00 │ 2.98 │ 2.80 │ 2.50 │\n" - "│ Input sequence length │ 6.00 │ 5.00 │ 7.00 │ 6.98 │ 6.80 │ 6.50 │\n" - "└──────────────────────────┴───────┴───────┴───────┴───────┴───────┴───────┘\n" - "Output token throughput (per sec): 456.00\n" - "Request throughput (per sec): 123.00\n" - ) - - returned_data = capsys.readouterr().out - assert returned_data == expected_content - - def test_nonstreaming_llm_output(self, monkeypatch, capsys) -> None: - argv = [ - "genai-perf", - "profile", - "-m", - "model_name", - "--service-kind", - "openai", - "--endpoint-type", - "chat", - ] - monkeypatch.setattr("sys.argv", argv) - args, _ = parser.parse_args() - - metrics = LLMMetrics( - request_throughputs=[123], - request_latencies=[4, 5, 6], - time_to_first_tokens=[4, 5, 6], # same as request_latency - inter_token_latencies=[], # no ITL - output_token_throughputs=[456], - output_sequence_lengths=[1, 2, 3], - input_sequence_lengths=[5, 6, 7], - ) - stats = Statistics(metrics=metrics) - - config = ExporterConfig() - config.stats = stats.stats_dict - config.metrics = stats.metrics - config.args = args - - exporter = ConsoleExporter(config) - exporter.export() - - # No TTFT and ITL in the output - expected_content = ( - " LLM Metrics \n" - "┏━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━┓\n" - "┃ Statistic ┃ avg ┃ min ┃ max ┃ p99 ┃ p90 ┃ p75 ┃\n" - "┡━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━┩\n" - "│ Request latency (ms) │ 5.00 │ 4.00 │ 6.00 │ 5.98 │ 5.80 │ 5.50 │\n" - "│ Output sequence length │ 2.00 │ 1.00 │ 3.00 │ 2.98 │ 2.80 │ 2.50 │\n" - "│ Input sequence length │ 6.00 │ 5.00 │ 7.00 │ 6.98 │ 6.80 │ 6.50 │\n" - "└────────────────────────┴──────┴──────┴──────┴──────┴──────┴──────┘\n" - "Output token throughput (per sec): 456.00\n" - "Request throughput (per sec): 123.00\n" - ) - - returned_data = capsys.readouterr().out - assert returned_data == expected_content - - def test_embedding_output(self, monkeypatch, 
capsys) -> None: - argv = [ - "genai-perf", - "profile", - "-m", - "model_name", - "--service-kind", - "openai", - "--endpoint-type", - "embeddings", - ] - monkeypatch.setattr("sys.argv", argv) - args, _ = parser.parse_args() - - metrics = Metrics( - request_throughputs=[123], - request_latencies=[4, 5, 6], - ) - stats = Statistics(metrics=metrics) - - config = ExporterConfig() - config.stats = stats.stats_dict - config.metrics = stats.metrics - config.args = args - - exporter = ConsoleExporter(config) - exporter.export() - - expected_content = ( - " Embeddings Metrics \n" - "┏━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━┓\n" - "┃ Statistic ┃ avg ┃ min ┃ max ┃ p99 ┃ p90 ┃ p75 ┃\n" - "┡━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━┩\n" - "│ Request latency (ms) │ 5.00 │ 4.00 │ 6.00 │ 5.98 │ 5.80 │ 5.50 │\n" - "└──────────────────────┴──────┴──────┴──────┴──────┴──────┴──────┘\n" - "Request throughput (per sec): 123.00\n" - ) - - returned_data = capsys.readouterr().out - assert returned_data == expected_content diff --git a/src/c++/perf_analyzer/genai-perf/tests/test_csv_exporter.py b/src/c++/perf_analyzer/genai-perf/tests/test_csv_exporter.py deleted file mode 100644 index 6a60bc2dc..000000000 --- a/src/c++/perf_analyzer/genai-perf/tests/test_csv_exporter.py +++ /dev/null @@ -1,213 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from io import StringIO -from pathlib import Path -from typing import Any, List - -import pytest -from genai_perf import parser -from genai_perf.export_data.csv_exporter import CsvExporter -from genai_perf.export_data.exporter_config import ExporterConfig -from genai_perf.metrics import LLMMetrics, Metrics, Statistics - - -class TestCsvExporter: - @pytest.fixture - def mock_read_write(self, monkeypatch: pytest.MonkeyPatch) -> List[str]: - """ - This function will mock the open function for specific files. 
- """ - - written_data = [] - - original_open = open - - def custom_open(filename, *args, **kwargs): - def write(self: Any, content: str) -> int: - written_data.append(content) - return len(content) - - if str(filename) == "profile_export_genai_perf.csv": - tmp_file = StringIO() - tmp_file.write = write.__get__(tmp_file) - return tmp_file - else: - return original_open(filename, *args, **kwargs) - - monkeypatch.setattr("builtins.open", custom_open) - - return written_data - - def test_streaming_llm_csv_output( - self, monkeypatch, mock_read_write: pytest.MonkeyPatch - ) -> None: - """ - Collect LLM metrics from profile export data and confirm correct values are - printed in csv. - """ - argv = [ - "genai-perf", - "profile", - "-m", - "model_name", - "--service-kind", - "openai", - "--endpoint-type", - "chat", - "--streaming", - ] - monkeypatch.setattr("sys.argv", argv) - args, _ = parser.parse_args() - - metrics = LLMMetrics( - request_throughputs=[123], - request_latencies=[4, 5, 6], - time_to_first_tokens=[7, 8, 9], - inter_token_latencies=[10, 11, 12], - output_token_throughputs=[456], - output_sequence_lengths=[1, 2, 3], - input_sequence_lengths=[5, 6, 7], - ) - stats = Statistics(metrics=metrics) - - config = ExporterConfig() - config.stats = stats.stats_dict - config.metrics = stats.metrics - config.artifact_dir = Path(".") - config.args = args - - exporter = CsvExporter(config) - exporter.export() - - expected_content = [ - "Metric,avg,min,max,p99,p95,p90,p75,p50,p25\r\n", - "Time To First Token (ms),8.00,7.00,9.00,8.98,8.90,8.80,8.50,8.00,7.50\r\n", - "Inter Token Latency (ms),11.00,10.00,12.00,11.98,11.90,11.80,11.50,11.00,10.50\r\n", - "Request Latency (ms),5.00,4.00,6.00,5.98,5.90,5.80,5.50,5.00,4.50\r\n", - "Output Sequence Length,2.00,1.00,3.00,2.98,2.90,2.80,2.50,2.00,1.50\r\n", - "Input Sequence Length,6.00,5.00,7.00,6.98,6.90,6.80,6.50,6.00,5.50\r\n", - "\r\n", - "Metric,Value\r\n", - "Output Token Throughput (per sec),456.00\r\n", - "Request Throughput (per sec),123.00\r\n", - ] - returned_data = mock_read_write - assert returned_data == expected_content - - def test_nonstreaming_llm_csv_output( - self, monkeypatch, mock_read_write: pytest.MonkeyPatch - ) -> None: - """ - Collect LLM metrics from profile export data and confirm correct values are - printed in csv. 
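# [Illustrative sketch, not part of this patch] The expected p99/p95/p90/p75
# strings in these exporter tests follow from linearly interpolated percentiles
# over the three samples. They can be reproduced with NumPy's default
# interpolation (an assumption, but it matches the expected values above):
import numpy as np

time_to_first_tokens = [7, 8, 9]
for p in (99, 95, 90, 75, 50, 25):
    print(f"p{p} = {np.percentile(time_to_first_tokens, p):.2f}")
# p99 = 8.98, p95 = 8.90, p90 = 8.80, p75 = 8.50, p50 = 8.00, p25 = 7.50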
- """ - argv = [ - "genai-perf", - "profile", - "-m", - "model_name", - "--service-kind", - "openai", - "--endpoint-type", - "chat", - ] - monkeypatch.setattr("sys.argv", argv) - args, _ = parser.parse_args() - - metrics = LLMMetrics( - request_throughputs=[123], - request_latencies=[4, 5, 6], - time_to_first_tokens=[4, 5, 6], # same as request_latency - inter_token_latencies=[], # no ITL - output_token_throughputs=[456], - output_sequence_lengths=[1, 2, 3], - input_sequence_lengths=[5, 6, 7], - ) - stats = Statistics(metrics=metrics) - - config = ExporterConfig() - config.stats = stats.stats_dict - config.metrics = stats.metrics - config.artifact_dir = Path(".") - config.args = args - - exporter = CsvExporter(config) - exporter.export() - - expected_content = [ - "Metric,avg,min,max,p99,p95,p90,p75,p50,p25\r\n", - "Request Latency (ms),5.00,4.00,6.00,5.98,5.90,5.80,5.50,5.00,4.50\r\n", - "Output Sequence Length,2.00,1.00,3.00,2.98,2.90,2.80,2.50,2.00,1.50\r\n", - "Input Sequence Length,6.00,5.00,7.00,6.98,6.90,6.80,6.50,6.00,5.50\r\n", - "\r\n", - "Metric,Value\r\n", - "Output Token Throughput (per sec),456.00\r\n", - "Request Throughput (per sec),123.00\r\n", - ] - returned_data = mock_read_write - assert returned_data == expected_content - - def test_embedding_csv_output( - self, monkeypatch, mock_read_write: pytest.MonkeyPatch - ) -> None: - argv = [ - "genai-perf", - "profile", - "-m", - "model_name", - "--service-kind", - "openai", - "--endpoint-type", - "embeddings", - ] - monkeypatch.setattr("sys.argv", argv) - args, _ = parser.parse_args() - - metrics = Metrics( - request_throughputs=[123], - request_latencies=[4, 5, 6], - ) - stats = Statistics(metrics=metrics) - - config = ExporterConfig() - config.stats = stats.stats_dict - config.metrics = stats.metrics - config.artifact_dir = Path(".") - config.args = args - - exporter = CsvExporter(config) - exporter.export() - - expected_content = [ - "Metric,avg,min,max,p99,p95,p90,p75,p50,p25\r\n", - "Request Latency (ms),5.00,4.00,6.00,5.98,5.90,5.80,5.50,5.00,4.50\r\n", - "\r\n", - "Metric,Value\r\n", - "Request Throughput (per sec),123.00\r\n", - ] - returned_data = mock_read_write - assert returned_data == expected_content diff --git a/src/c++/perf_analyzer/genai-perf/tests/test_data_exporter_factory.py b/src/c++/perf_analyzer/genai-perf/tests/test_data_exporter_factory.py deleted file mode 100644 index 1a1628ac7..000000000 --- a/src/c++/perf_analyzer/genai-perf/tests/test_data_exporter_factory.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -from argparse import Namespace - -import genai_perf.export_data.data_exporter_factory as factory -from genai_perf.export_data.console_exporter import ConsoleExporter -from genai_perf.export_data.csv_exporter import CsvExporter -from genai_perf.export_data.exporter_config import ExporterConfig -from genai_perf.export_data.json_exporter import JsonExporter -from genai_perf.parser import get_extra_inputs_as_dict - - -class TestOutputReporter: - stats = { - "request_latency": { - "unit": "ms", - "avg": 1, - "p99": 2, - "p95": 3, - "p90": 4, - "p75": 5, - "p50": 6, - "p25": 7, - "max": 8, - "min": 9, - "std": 0, - }, - } - args = { - "model": ["gpt2_vllm"], - "formatted_model_name": "gpt2_vllm", - "model_selection_strategy": "round_robin", - "func": "Should_be_removed", - "output_format": "Should_be_removed", - "profile_export_file": ".", - "artifact_dir": ".", - "extra_inputs": ["max_tokens:200"], - } - args_namespace = Namespace(**args) - - config = ExporterConfig() - config.stats = stats - config.args = args_namespace - config.artifact_dir = args_namespace.artifact_dir - config.extra_inputs = get_extra_inputs_as_dict(args_namespace) - f = factory.DataExporterFactory() - - def test_return_json_exporter(self) -> None: - exporter_list = self.f.create_data_exporters(self.config) - assert any(isinstance(exporter, JsonExporter) for exporter in exporter_list) - - def test_return_csv_exporter(self) -> None: - exporter_list = self.f.create_data_exporters(self.config) - assert any(isinstance(exporter, CsvExporter) for exporter in exporter_list) - - def test_return_console_exporter(self) -> None: - exporter_list = self.f.create_data_exporters(self.config) - assert any(isinstance(exporter, ConsoleExporter) for exporter in exporter_list) diff --git a/src/c++/perf_analyzer/genai-perf/tests/test_json_exporter.py b/src/c++/perf_analyzer/genai-perf/tests/test_json_exporter.py index e4a29267d..7bb76ee5e 100644 --- a/src/c++/perf_analyzer/genai-perf/tests/test_json_exporter.py +++ b/src/c++/perf_analyzer/genai-perf/tests/test_json_exporter.py @@ -27,39 +27,14 @@ import json import genai_perf.parser as parser -from genai_perf.export_data.exporter_config import ExporterConfig from genai_perf.export_data.json_exporter import JsonExporter class TestJsonExporter: - def test_generate_json(self, monkeypatch) -> None: - cli_cmd = [ - "genai-perf", - "profile", - "-m", - "gpt2_vllm", - "--backend", - "vllm", - "--streaming", - "--extra-inputs", - "max_tokens:256", - "--extra-inputs", - "ignore_eos:true", - ] - monkeypatch.setattr("sys.argv", cli_cmd) - args, _ = parser.parse_args() - config = ExporterConfig() - config.stats = self.stats - config.args = args - config.extra_inputs = parser.get_extra_inputs_as_dict(args) - config.artifact_dir = args.artifact_dir - json_exporter = JsonExporter(config) - assert json_exporter._stats_and_args == json.loads(self.expected_json_output) - stats = { "request_throughput": {"unit": "requests/sec", "avg": "7"}, "request_latency": { - "unit": "ms", + 
"unit": "ns", "avg": 1, "p99": 2, "p95": 3, @@ -72,7 +47,7 @@ def test_generate_json(self, monkeypatch) -> None: "std": 0, }, "time_to_first_token": { - "unit": "ms", + "unit": "ns", "avg": 11, "p99": 12, "p95": 13, @@ -85,7 +60,7 @@ def test_generate_json(self, monkeypatch) -> None: "std": 10, }, "inter_token_latency": { - "unit": "ms", + "unit": "ns", "avg": 21, "p99": 22, "p95": 23, @@ -114,7 +89,7 @@ def test_generate_json(self, monkeypatch) -> None: "min": 49, "std": 40, }, - "output_sequence_length": { + "num_output_token": { "unit": "tokens", "avg": 51, "p99": 52, @@ -127,7 +102,7 @@ def test_generate_json(self, monkeypatch) -> None: "min": 59, "std": 50, }, - "input_sequence_length": { + "num_input_token": { "unit": "tokens", "avg": 61, "p99": 62, @@ -149,7 +124,7 @@ def test_generate_json(self, monkeypatch) -> None: "avg": "7" }, "request_latency": { - "unit": "ms", + "unit": "ns", "avg": 1, "p99": 2, "p95": 3, @@ -162,7 +137,7 @@ def test_generate_json(self, monkeypatch) -> None: "std": 0 }, "time_to_first_token": { - "unit": "ms", + "unit": "ns", "avg": 11, "p99": 12, "p95": 13, @@ -175,7 +150,7 @@ def test_generate_json(self, monkeypatch) -> None: "std": 10 }, "inter_token_latency": { - "unit": "ms", + "unit": "ns", "avg": 21, "p99": 22, "p95": 23, @@ -204,7 +179,7 @@ def test_generate_json(self, monkeypatch) -> None: "min": 49, "std": 40 }, - "output_sequence_length": { + "num_output_token": { "unit": "tokens", "avg": 51, "p99": 52, @@ -217,7 +192,7 @@ def test_generate_json(self, monkeypatch) -> None: "min": 59, "std": 50 }, - "input_sequence_length": { + "num_input_token": { "unit": "tokens", "avg": 61, "p99": 62, @@ -235,13 +210,13 @@ def test_generate_json(self, monkeypatch) -> None: "formatted_model_name": "gpt2_vllm", "model_selection_strategy": "round_robin", "backend": "vllm", - "batch_size": 1, - "endpoint": null, - "endpoint_type": null, + "endpoint": "v2/models/gpt2_vllm/infer", + "endpoint_type": "kserve", "service_kind": "triton", "streaming": true, "u": null, "input_dataset": null, + "input_file": null, "num_prompts": 100, "output_tokens_mean": -1, "output_tokens_mean_deterministic": false, @@ -258,7 +233,7 @@ def test_generate_json(self, monkeypatch) -> None: "artifact_dir": "artifacts/gpt2_vllm-triton-vllm-concurrency1", "tokenizer": "hf-internal-testing/llama-tokenizer", "verbose": false, - "subcommand": "profile", + "subcommand": null, "prompt_source": "synthetic", "extra_inputs": { "max_tokens": 256, @@ -267,3 +242,22 @@ def test_generate_json(self, monkeypatch) -> None: } } """ + + def test_generate_json(self, monkeypatch) -> None: + cli_cmd = [ + "genai-perf", + "-m", + "gpt2_vllm", + "--backend", + "vllm", + "--streaming", + "--extra-inputs", + "max_tokens:256", + "--extra-inputs", + "ignore_eos:true", + ] + monkeypatch.setattr("sys.argv", cli_cmd) + args, _ = parser.parse_args() + extra_inputs = parser.get_extra_inputs_as_dict(args) + json_exporter = JsonExporter(self.stats, args, extra_inputs) + assert json_exporter._stats_and_args == json.loads(self.expected_json_output) diff --git a/src/c++/perf_analyzer/genai-perf/tests/test_llm_inputs.py b/src/c++/perf_analyzer/genai-perf/tests/test_llm_inputs.py index c6351918e..4486ba3d9 100644 --- a/src/c++/perf_analyzer/genai-perf/tests/test_llm_inputs.py +++ b/src/c++/perf_analyzer/genai-perf/tests/test_llm_inputs.py @@ -17,7 +17,6 @@ import random import statistics from pathlib import Path -from unittest.mock import mock_open, patch import pytest import responses @@ -657,36 +656,6 @@ def 
test_get_input_file_without_file_existing(self): with pytest.raises(FileNotFoundError): LlmInputs._get_input_dataset_from_file(Path("prompt.txt")) - @patch("pathlib.Path.exists", return_value=True) - @patch( - "builtins.open", - new_callable=mock_open, - read_data='{"text_input": "single prompt"}\n', - ) - def test_get_input_file_with_single_prompt(self, mock_file, mock_exists): - expected_prompts = ["single prompt"] - dataset = LlmInputs._get_input_dataset_from_file(Path("prompt.txt")) - - assert dataset is not None - assert len(dataset["rows"]) == len(expected_prompts) - for i, prompt in enumerate(expected_prompts): - assert dataset["rows"][i]["row"]["text_input"] == prompt - - @patch("pathlib.Path.exists", return_value=True) - @patch( - "builtins.open", - new_callable=mock_open, - read_data='{"text_input": "prompt1"}\n{"text_input": "prompt2"}\n{"text_input": "prompt3"}\n', - ) - def test_get_input_file_with_multiple_prompts(self, mock_file, mock_exists): - expected_prompts = ["prompt1", "prompt2", "prompt3"] - dataset = LlmInputs._get_input_dataset_from_file(Path("prompt.txt")) - - assert dataset is not None - assert len(dataset["rows"]) == len(expected_prompts) - for i, prompt in enumerate(expected_prompts): - assert dataset["rows"][i]["row"]["text_input"] == prompt - @pytest.mark.parametrize( "seed, model_name_list, index,model_selection_strategy,expected_model", [ diff --git a/src/c++/perf_analyzer/genai-perf/tests/test_llm_inputs_embeddings.py b/src/c++/perf_analyzer/genai-perf/tests/test_llm_inputs_embeddings.py deleted file mode 100644 index 0cefa38a7..000000000 --- a/src/c++/perf_analyzer/genai-perf/tests/test_llm_inputs_embeddings.py +++ /dev/null @@ -1,172 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
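# [Illustrative sketch, not part of this patch] The removed input-file tests
# above expected a JSONL file of {"text_input": ...} lines to become a dataset
# of the shape dataset["rows"][i]["row"]["text_input"]. A rough standalone
# sketch of that conversion (not the actual LlmInputs code):
import json
from pathlib import Path


def input_dataset_from_jsonl(path: Path) -> dict:
    rows = []
    for line in path.read_text().splitlines():
        if line.strip():
            rows.append({"row": json.loads(line)})
    return {"rows": rows}


# Example usage with a hypothetical file containing one JSON object per line:
# dataset = input_dataset_from_jsonl(Path("prompt.txt"))
# dataset["rows"][0]["row"]["text_input"]  # -> first prompt in the file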
- -from pathlib import Path -from unittest.mock import mock_open, patch - -import pytest -from genai_perf.llm_inputs.llm_inputs import LlmInputs, ModelSelectionStrategy - - -class TestLlmInputsEmbeddings: - @patch("pathlib.Path.exists", return_value=True) - @patch( - "builtins.open", - new_callable=mock_open, - read_data="\n".join( - [ - '{"text": "What production company co-owned by Kevin Loader and Rodger Michell produced My Cousin Rachel?"}', - '{"text": "Who served as the 1st Vice President of Colombia under El Libertador?"}', - '{"text": "Are the Barton Mine and Hermiston-McCauley Mine located in The United States of America?"}', - '{"text": "what state did they film daddy\'s home 2"}', - ] - ), - ) - def test_get_input_dataset_from_embeddings_file(self, mock_file, mock_exists): - input_filename = Path("embeddings.jsonl") - batch_size = 3 - dataset = LlmInputs._get_input_dataset_from_embeddings_file( - input_filename, batch_size, num_prompts=100 - ) - - assert dataset is not None - assert len(dataset["rows"]) == 100 - for row in dataset["rows"]: - assert "row" in row - assert "payload" in row["row"] - payload = row["row"]["payload"] - assert "input" in payload - assert isinstance(payload["input"], list) - assert len(payload["input"]) == batch_size - - # Try error case where batch size is larger than the number of available texts - with pytest.raises( - ValueError, - match="Batch size cannot be larger than the number of available texts", - ): - LlmInputs._get_input_dataset_from_embeddings_file( - input_filename, 5, num_prompts=10 - ) - - def test_convert_generic_json_to_openai_embeddings_format(self): - generic_dataset = { - "rows": [ - {"payload": {"input": ["text 1", "text 2"]}}, - {"payload": {"input": ["text 3", "text 4"]}}, - ] - } - - expected_result = { - "data": [ - { - "payload": [ - { - "input": ["text 1", "text 2"], - "model": "test_model", - } - ] - }, - { - "payload": [ - { - "input": ["text 3", "text 4"], - "model": "test_model", - } - ] - }, - ] - } - - result = LlmInputs._convert_generic_json_to_openai_embeddings_format( - generic_dataset, - extra_inputs={}, - model_name=["test_model"], - model_selection_strategy=ModelSelectionStrategy.ROUND_ROBIN, - ) - - assert result is not None - assert "data" in result - assert len(result["data"]) == len(expected_result["data"]) - - for i, item in enumerate(expected_result["data"]): - assert "payload" in result["data"][i] - assert result["data"][i]["payload"] == item["payload"] - - def test_convert_generic_json_to_openai_embeddings_format_with_extra_inputs(self): - generic_dataset = { - "rows": [ - {"payload": {"input": ["text 1", "text 2"]}}, - {"payload": {"input": ["text 3", "text 4"]}}, - ] - } - - extra_inputs = { - "encoding_format": "base64", - "truncate": "END", - "additional_key": "additional_value", - } - - expected_result = { - "data": [ - { - "payload": [ - { - "input": ["text 1", "text 2"], - "model": "test_model", - "encoding_format": "base64", - "truncate": "END", - "additional_key": "additional_value", - } - ] - }, - { - "payload": [ - { - "input": ["text 3", "text 4"], - "model": "test_model", - "encoding_format": "base64", - "truncate": "END", - "additional_key": "additional_value", - } - ] - }, - ] - } - - result = LlmInputs._convert_generic_json_to_openai_embeddings_format( - generic_dataset, - extra_inputs=extra_inputs, - model_name=["test_model"], - model_selection_strategy=ModelSelectionStrategy.ROUND_ROBIN, - ) - - assert result is not None - assert "data" in result - assert len(result["data"]) == 
len(expected_result["data"]) - - for i, item in enumerate(expected_result["data"]): - assert "payload" in result["data"][i] - assert result["data"][i]["payload"] == item["payload"] diff --git a/src/c++/perf_analyzer/genai-perf/tests/test_llm_inputs_rankings.py b/src/c++/perf_analyzer/genai-perf/tests/test_llm_inputs_rankings.py deleted file mode 100644 index bfe2be482..000000000 --- a/src/c++/perf_analyzer/genai-perf/tests/test_llm_inputs_rankings.py +++ /dev/null @@ -1,182 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from pathlib import Path -from unittest.mock import mock_open, patch - -import pytest -from genai_perf.llm_inputs.llm_inputs import LlmInputs, ModelSelectionStrategy - - -class TestLlmInputsRankings: - - def open_side_effects(filepath, *args, **kwargs): - queries_content = "\n".join( - [ - '{"text": "What production company co-owned by Kevin Loader and Rodger Michell produced My Cousin Rachel?"}', - '{"text": "Who served as the 1st Vice President of Colombia under El Libertador?"}', - '{"text": "Are the Barton Mine and Hermiston-McCauley Mine located in The United States of America?"}', - ] - ) - passages_content = "\n".join( - [ - '{"text": "Eric Anderson (sociologist) Eric Anderson (born January 18, 1968) is an American sociologist"}', - '{"text": "Kevin Loader is a British film and television producer. 
"}', - '{"text": "Barton Mine, also known as Net Lake Mine, is an abandoned surface and underground mine in Northeastern Ontario"}', - ] - ) - - file_contents = { - "queries.jsonl": queries_content, - "passages.jsonl": passages_content, - } - return mock_open( - read_data=file_contents.get(filepath, file_contents["queries.jsonl"]) - )() - - mock_open_obj = mock_open() - mock_open_obj.side_effect = open_side_effects - - @patch("pathlib.Path.exists", return_value=True) - @patch("builtins.open", mock_open_obj) - def test_get_input_dataset_from_rankings_file(self, mock_file): - queries_filename = Path("queries.jsonl") - passages_filename = Path("passages.jsonl") - batch_size = 2 - dataset = LlmInputs._get_input_dataset_from_rankings_files( - queries_filename, passages_filename, batch_size, num_prompts=100 - ) - - assert dataset is not None - assert len(dataset["rows"]) == 100 - for row in dataset["rows"]: - assert "row" in row - assert "payload" in row["row"] - payload = row["row"]["payload"] - assert "query" in payload - assert "passages" in payload - assert isinstance(payload["passages"], list) - assert len(payload["passages"]) == batch_size - - # Try error case where batch size is larger than the number of available texts - with pytest.raises( - ValueError, - match="Batch size cannot be larger than the number of available passages", - ): - LlmInputs._get_input_dataset_from_rankings_files( - queries_filename, passages_filename, 5, num_prompts=10 - ) - - def test_convert_generic_json_to_openai_rankings_format(self): - generic_dataset = { - "rows": [ - { - "payload": { - "query": {"text": "1"}, - "passages": [{"text": "2"}, {"text": "3"}, {"text": "4"}], - } - } - ] - } - - expected_result = { - "data": [ - { - "payload": [ - { - "query": {"text": "1"}, - "passages": [{"text": "2"}, {"text": "3"}, {"text": "4"}], - "model": "test_model", - } - ] - } - ] - } - - result = LlmInputs._convert_generic_json_to_rankings_format( - generic_dataset, - extra_inputs={}, - model_name=["test_model"], - model_selection_strategy=ModelSelectionStrategy.ROUND_ROBIN, - ) - - assert result is not None - assert "data" in result - assert len(result["data"]) == len(expected_result["data"]) - - for i, item in enumerate(expected_result["data"]): - assert "payload" in result["data"][i] - assert result["data"][i]["payload"] == item["payload"] - - def test_convert_generic_json_to_openai_rankings_format_with_extra_inputs(self): - generic_dataset = { - "rows": [ - { - "payload": { - "query": {"text": "1"}, - "passages": [{"text": "2"}, {"text": "3"}, {"text": "4"}], - } - } - ] - } - - extra_inputs = { - "encoding_format": "base64", - "truncate": "END", - "additional_key": "additional_value", - } - - expected_result = { - "data": [ - { - "payload": [ - { - "query": {"text": "1"}, - "passages": [{"text": "2"}, {"text": "3"}, {"text": "4"}], - "model": "test_model", - "encoding_format": "base64", - "truncate": "END", - "additional_key": "additional_value", - } - ] - } - ] - } - - result = LlmInputs._convert_generic_json_to_rankings_format( - generic_dataset, - extra_inputs=extra_inputs, - model_name=["test_model"], - model_selection_strategy=ModelSelectionStrategy.ROUND_ROBIN, - ) - - assert result is not None - assert "data" in result - assert len(result["data"]) == len(expected_result["data"]) - - for i, item in enumerate(expected_result["data"]): - assert "payload" in result["data"][i] - assert result["data"][i]["payload"] == item["payload"] diff --git a/src/c++/perf_analyzer/genai-perf/tests/test_llm_metrics.py 
b/src/c++/perf_analyzer/genai-perf/tests/test_llm_metrics.py old mode 100644 new mode 100755 index 05de5b122..7014780f8 --- a/src/c++/perf_analyzer/genai-perf/tests/test_llm_metrics.py +++ b/src/c++/perf_analyzer/genai-perf/tests/test_llm_metrics.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -24,57 +26,393 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import json +from io import StringIO +from pathlib import Path +from typing import Any, List, Union + +import numpy as np import pytest -from genai_perf.metrics import LLMMetrics +from genai_perf.llm_metrics import LLMMetrics, LLMProfileDataParser, ResponseFormat +from genai_perf.tokenizer import DEFAULT_TOKENIZER, get_tokenizer -class TestLLMMetrics: +def ns_to_sec(ns: int) -> Union[int, float]: + """Convert from nanosecond to second.""" + return ns / 1e9 - def test_llm_metric_request_metrics(self) -> None: - """Test request_metrics property.""" - m = LLMMetrics( - request_throughputs=[10.12, 11.33], - request_latencies=[3, 44], - time_to_first_tokens=[1, 2, 3], - inter_token_latencies=[4, 5], - output_token_throughputs=[22.13, 9423.02], - output_token_throughputs_per_request=[7, 8, 9], - output_sequence_lengths=[3, 4], - input_sequence_lengths=[12, 34], - ) - req_metrics = m.request_metrics - assert len(req_metrics) == 6 - assert req_metrics[0].name == "time_to_first_token" - assert req_metrics[0].unit == "ms" - assert req_metrics[1].name == "inter_token_latency" - assert req_metrics[1].unit == "ms" - assert req_metrics[2].name == "request_latency" - assert req_metrics[2].unit == "ms" - assert req_metrics[3].name == "output_token_throughput_per_request" - assert req_metrics[3].unit == "tokens/sec" - assert req_metrics[4].name == "output_sequence_length" - assert req_metrics[4].unit == "tokens" - assert req_metrics[5].name == "input_sequence_length" - assert req_metrics[5].unit == "tokens" - - def test_llm_metric_system_metrics(self) -> None: - """Test system_metrics property.""" - m = LLMMetrics( - request_throughputs=[10.12, 11.33], - request_latencies=[3, 44], - time_to_first_tokens=[1, 2, 3], - inter_token_latencies=[4, 5], - output_token_throughputs=[22.13, 9423.02], - output_token_throughputs_per_request=[7, 8, 9], - output_sequence_lengths=[3, 4], - input_sequence_lengths=[12, 34], + +class TestLLMProfileDataParser: + @pytest.fixture + def mock_read_write(self, monkeypatch: pytest.MonkeyPatch) -> List[str]: + """ + This function will mock the open function for specific files: + + - For "triton_profile_export.json", it will read and return the + contents of self.triton_profile_data + - For "openai_profile_export.json", it will read and return the + contents of self.openai_profile_data + - For "profile_export.csv", it will capture all data written to + the file, and return it as the return value of this function + - For all other files, it will behave like the normal open function + """ + + written_data = [] + + original_open = open + + def custom_open(filename, *args, **kwargs): + def write(self: Any, content: str) -> int: + written_data.append(content) + return len(content) + + if filename == "triton_profile_export.json": + tmp_file = StringIO(json.dumps(self.triton_profile_data)) + return tmp_file + elif filename == "openai_profile_export.json": + tmp_file = 
StringIO(json.dumps(self.openai_profile_data)) + return tmp_file + elif filename == "empty_profile_export.json": + tmp_file = StringIO(json.dumps(self.empty_profile_data)) + return tmp_file + elif filename == "profile_export.csv": + tmp_file = StringIO() + tmp_file.write = write.__get__(tmp_file) + return tmp_file + else: + return original_open(filename, *args, **kwargs) + + monkeypatch.setattr("builtins.open", custom_open) + + return written_data + + def test_csv_output(self, mock_read_write: pytest.MonkeyPatch) -> None: + """ + Collect LLM metrics from profile export data and confirm correct values are + printed in csv. + """ + + tokenizer = get_tokenizer(DEFAULT_TOKENIZER) + pd = LLMProfileDataParser( + filename=Path("triton_profile_export.json"), + tokenizer=tokenizer, + ) + stat = pd.get_statistics(infer_mode="concurrency", load_level="10") + + expected_content = [ + "Metric,avg,min,max,p99,p95,p90,p75,p50,p25\r\n", + "Time To First Token (ns),2,2,2,2,2,2,2,2,2\r\n", + "Inter Token Latency (ns),2,1,3,3,3,3,2,2,2\r\n", + "Request Latency (ns),8,7,9,9,9,9,8,8,8\r\n", + "Num Output Token,4,3,6,6,6,6,5,4,4\r\n", + "Num Input Token,4,3,4,4,4,4,4,4,3\r\n", + "\r\n", + "Metric,Value\r\n", + "Output Token Throughput (per sec),900000000.00\r\n", + "Request Throughput (per sec),200000000.00\r\n", + ] + + stat.export_to_csv("profile_export.csv") + + returned_data = mock_read_write + + assert returned_data == expected_content + + def test_triton_llm_profile_data(self, mock_read_write: pytest.MonkeyPatch) -> None: + """Collect LLM metrics from profile export data and check values. + + Metrics + * time to first tokens + - experiment 1: [3 - 1, 4 - 2] = [2, 2] + - experiment 2: [7 - 5, 6 - 3] = [2, 3] + * inter token latencies + - experiment 1: [[(5 - 3)/1, (8 - 5)/1], [(7 - 4)/3, (11 - 7)/2]] + : [[2, 3], [3/3, 2]] + : [[2, 3], [1, 2]] + - experiment 2: [[(8 - 7)/1, (13 - 8)/1, (18 - 13)/1], [(8 - 6)/1, (11 - 8)/2]] + : [[1, 5, 5], [2, 3/2]] + : [[1, 5, 5], [2, 2]] # rounded + * output token throughputs per request + - experiment 1: [3/(8 - 1), 6/(11 - 2)] = [3/7, 6/9] + - experiment 2: [4/(18 - 5), 6/(11 - 3)] = [4/13, 6/8] + * output token throughputs + - experiment 1: [(3 + 6)/(11 - 1)] = [9/10] + - experiment 2: [(4 + 6)/(18 - 3)] = [2/3] + * num output tokens + - experiment 1: [3, 6] + - experiment 2: [4, 6] + * num input tokens + - experiment 1: [3, 4] + - experiment 2: [3, 4] + """ + tokenizer = get_tokenizer(DEFAULT_TOKENIZER) + pd = LLMProfileDataParser( + filename=Path("triton_profile_export.json"), + tokenizer=tokenizer, + ) + + # experiment 1 metrics & statistics + stat = pd.get_statistics(infer_mode="concurrency", load_level="10") + metrics = stat.metrics + assert isinstance(metrics, LLMMetrics) + + assert metrics.time_to_first_tokens == [2, 2] + assert metrics.inter_token_latencies == [[2, 3], [1, 2]] + ottpr = [3 / ns_to_sec(7), 6 / ns_to_sec(9)] + assert metrics.output_token_throughputs_per_request == pytest.approx(ottpr) + ott = [9 / ns_to_sec(10)] + assert metrics.output_token_throughputs == pytest.approx(ott) + assert metrics.num_output_tokens == [3, 6] + assert metrics.num_input_tokens == [3, 4] + + # Disable Pylance warnings for dynamically set attributes due to Statistics + # not having strict attributes listed. 
+ assert stat.avg_time_to_first_token == 2 # type: ignore + assert stat.avg_inter_token_latency == 2 # type: ignore + assert stat.avg_output_token_throughput_per_request == pytest.approx( # type: ignore + np.mean(ottpr) + ) + assert stat.avg_num_output_token == 4.5 # type: ignore + assert stat.avg_num_input_token == 3.5 # type: ignore + + assert stat.p50_time_to_first_token == 2 # type: ignore + assert stat.p50_inter_token_latency == 2 # type: ignore + assert stat.p50_output_token_throughput_per_request == pytest.approx( # type: ignore + np.percentile(ottpr, 50) + ) + assert stat.p50_num_output_token == 4.5 # type: ignore + assert stat.p50_num_input_token == 3.5 # type: ignore + + assert stat.min_time_to_first_token == 2 # type: ignore + assert stat.min_inter_token_latency == 1 # type: ignore + min_ottpr = 3 / ns_to_sec(7) + assert stat.min_output_token_throughput_per_request == pytest.approx(min_ottpr) # type: ignore + assert stat.min_num_output_token == 3 # type: ignore + assert stat.min_num_input_token == 3 # type: ignore + + assert stat.max_time_to_first_token == 2 # type: ignore + assert stat.max_inter_token_latency == 3 # type: ignore + max_ottpr = 6 / ns_to_sec(9) + assert stat.max_output_token_throughput_per_request == pytest.approx(max_ottpr) # type: ignore + assert stat.max_num_output_token == 6 # type: ignore + assert stat.max_num_input_token == 4 # type: ignore + + assert stat.std_time_to_first_token == np.std([2, 2]) # type: ignore + assert stat.std_inter_token_latency == np.std([2, 3, 1, 2]) # type: ignore + assert stat.std_output_token_throughput_per_request == pytest.approx( # type: ignore + np.std(ottpr) + ) + assert stat.std_num_output_token == np.std([3, 6]) # type: ignore + assert stat.std_num_input_token == np.std([3, 4]) # type: ignore + + oott = 9 / ns_to_sec(10) + assert stat.avg_output_token_throughput == pytest.approx(oott) # type: ignore + + # experiment 2 statistics + stat = pd.get_statistics(infer_mode="request_rate", load_level="2.0") + metrics = stat.metrics + assert isinstance(metrics, LLMMetrics) + + assert metrics.time_to_first_tokens == [2, 3] + assert metrics.inter_token_latencies == [[1, 5, 5], [2, 2]] + ottpr = [4 / ns_to_sec(13), 6 / ns_to_sec(8)] + assert metrics.output_token_throughputs_per_request == pytest.approx(ottpr) + ott = [2 / ns_to_sec(3)] + assert metrics.output_token_throughputs == pytest.approx(ott) + assert metrics.num_output_tokens == [4, 6] + assert metrics.num_input_tokens == [3, 4] + + assert stat.avg_time_to_first_token == 2.5 # type: ignore + assert stat.avg_inter_token_latency == 3 # type: ignore + assert stat.avg_output_token_throughput_per_request == pytest.approx( # type: ignore + np.mean(ottpr) + ) + assert stat.avg_num_output_token == 5 # type: ignore + assert stat.avg_num_input_token == 3.5 # type: ignore + + assert stat.p50_time_to_first_token == 2.5 # type: ignore + assert stat.p50_inter_token_latency == 2 # type: ignore + assert stat.p50_output_token_throughput_per_request == pytest.approx( # type: ignore + np.percentile(ottpr, 50) + ) + assert stat.p50_num_output_token == 5 # type: ignore + assert stat.p50_num_input_token == 3.5 # type: ignore + + assert stat.min_time_to_first_token == 2 # type: ignore + assert stat.min_inter_token_latency == 1 # type: ignore + min_ottpr = 4 / ns_to_sec(13) + assert stat.min_output_token_throughput_per_request == pytest.approx(min_ottpr) # type: ignore + assert stat.min_num_output_token == 4 # type: ignore + assert stat.min_num_input_token == 3 # type: ignore + + assert 
stat.max_time_to_first_token == 3 # type: ignore + assert stat.max_inter_token_latency == 5 # type: ignore + max_ottpr = 6 / ns_to_sec(8) + assert stat.max_output_token_throughput_per_request == pytest.approx(max_ottpr) # type: ignore + assert stat.max_num_output_token == 6 # type: ignore + assert stat.max_num_input_token == 4 # type: ignore + + assert stat.std_time_to_first_token == np.std([2, 3]) # type: ignore + assert stat.std_inter_token_latency == np.std([1, 5, 5, 2, 2]) # type: ignore + assert stat.std_output_token_throughput_per_request == pytest.approx( # type: ignore + np.std(ottpr) + ) + assert stat.std_num_output_token == np.std([4, 6]) # type: ignore + assert stat.std_num_input_token == np.std([3, 4]) # type: ignore + + oott = 2 / ns_to_sec(3) + assert stat.avg_output_token_throughput == pytest.approx(oott) # type: ignore + + # check non-existing profile data + with pytest.raises(KeyError): + pd.get_statistics(infer_mode="concurrency", load_level="30") + + def test_openai_llm_profile_data(self, mock_read_write: pytest.MonkeyPatch) -> None: + """Collect LLM metrics from profile export data and check values. + + Metrics + * time to first tokens + - experiment 1: [5 - 1, 7 - 2] = [4, 5] + * inter token latencies + - experiment 1: [[(8 - 5)/1, (12 - 8)/1], [(11 - 7)/3, (15 - 11)/2]] + : [[3, 4], [4/3, 2]] + : [[3, 4], [1, 2]] # rounded + * output token throughputs per request + - experiment 1: [3/(12 - 1), 6/(15 - 2)] = [3/11, 6/13] + * output token throughputs + - experiment 1: [(3 + 6)/(15 - 1)] = [9/14] + * num output tokens + - experiment 1: [3, 6] + * num input tokens + - experiment 1: [3, 4] + """ + tokenizer = get_tokenizer(DEFAULT_TOKENIZER) + pd = LLMProfileDataParser( + filename=Path("openai_profile_export.json"), + tokenizer=tokenizer, ) - sys_metrics = m.system_metrics - assert len(sys_metrics) == 2 - assert sys_metrics[0].name == "output_token_throughput" - assert sys_metrics[0].unit == "per sec" - assert sys_metrics[1].name == "request_throughput" - assert sys_metrics[1].unit == "per sec" + + # experiment 1 statistics + stat = pd.get_statistics(infer_mode="concurrency", load_level="10") + metrics = stat.metrics + assert isinstance(metrics, LLMMetrics) + + assert metrics.time_to_first_tokens == [4, 5] + assert metrics.inter_token_latencies == [[3, 4], [1, 2]] + ottpr = [3 / ns_to_sec(11), 6 / ns_to_sec(13)] + assert metrics.output_token_throughputs_per_request == pytest.approx(ottpr) + ott = [9 / ns_to_sec(14)] + assert metrics.output_token_throughputs == pytest.approx(ott) + assert metrics.num_output_tokens == [3, 6] + assert metrics.num_input_tokens == [3, 4] + + assert stat.avg_time_to_first_token == 4.5 # type: ignore + assert stat.avg_inter_token_latency == 2.5 # type: ignore + assert stat.avg_output_token_throughput_per_request == pytest.approx( # type: ignore + np.mean(ottpr) + ) + assert stat.avg_num_output_token == 4.5 # type: ignore + assert stat.avg_num_input_token == 3.5 # type: ignore + + assert stat.p50_time_to_first_token == 4.5 # type: ignore + assert stat.p50_inter_token_latency == 2.5 # type: ignore + assert stat.p50_output_token_throughput_per_request == pytest.approx( # type: ignore + np.percentile(ottpr, 50) + ) + assert stat.p50_num_output_token == 4.5 # type: ignore + assert stat.p50_num_input_token == 3.5 # type: ignore + + assert stat.min_time_to_first_token == 4 # type: ignore + assert stat.min_inter_token_latency == 1 # type: ignore + min_ottpr = 3 / ns_to_sec(11) + assert stat.min_output_token_throughput_per_request == 
pytest.approx(min_ottpr) # type: ignore + assert stat.min_num_output_token == 3 # type: ignore + assert stat.min_num_input_token == 3 # type: ignore + + assert stat.max_time_to_first_token == 5 # type: ignore + assert stat.max_inter_token_latency == 4 # type: ignore + max_ottpr = 6 / ns_to_sec(13) + assert stat.max_output_token_throughput_per_request == pytest.approx(max_ottpr) # type: ignore + assert stat.max_num_output_token == 6 # type: ignore + assert stat.max_num_input_token == 4 # type: ignore + + assert stat.std_time_to_first_token == np.std([4, 5]) # type: ignore + assert stat.std_inter_token_latency == np.std([3, 4, 1, 2]) # type: ignore + assert stat.std_output_token_throughput_per_request == pytest.approx( # type: ignore + np.std(ottpr) + ) + assert stat.std_num_output_token == np.std([3, 6]) # type: ignore + assert stat.std_num_input_token == np.std([3, 4]) # type: ignore + + oott = 9 / ns_to_sec(14) + assert stat.avg_output_token_throughput == pytest.approx(oott) # type: ignore + + # check non-existing profile data + with pytest.raises(KeyError): + pd.get_statistics(infer_mode="concurrency", load_level="40") + + def test_merged_sse_response(self, mock_read_write: pytest.MonkeyPatch) -> None: + """Test merging the multiple sse response.""" + res_timestamps = [0, 1, 2, 3] + res_outputs = [ + { + "response": 'data: {"choices":[{"delta":{"content":"aaa"}}],"object":"chat.completion.chunk"}\n\n' + }, + { + "response": ( + 'data: {"choices":[{"delta":{"content":"abc"}}],"object":"chat.completion.chunk"}\n\n' + 'data: {"choices":[{"delta":{"content":"1234"}}],"object":"chat.completion.chunk"}\n\n' + 'data: {"choices":[{"delta":{"content":"helloworld"}}],"object":"chat.completion.chunk"}\n\n' + ) + }, + {"response": "data: [DONE]\n\n"}, + ] + expected_response = '{"choices": [{"delta": {"content": "abc1234helloworld"}}], "object": "chat.completion.chunk"}' + + tokenizer = get_tokenizer(DEFAULT_TOKENIZER) + pd = LLMProfileDataParser( + filename=Path("openai_profile_export.json"), + tokenizer=tokenizer, + ) + + pd._preprocess_response(res_timestamps, res_outputs) + assert res_outputs[1]["response"] == expected_response + + def test_no_special_tokens(self, mock_read_write: pytest.MonkeyPatch) -> None: + """Test special tokens are not included when counting input/output tokens.""" + tokenizer = get_tokenizer(DEFAULT_TOKENIZER) + pd = LLMProfileDataParser( + filename=Path("openai_profile_export.json"), + tokenizer=tokenizer, + ) + + # There are 3 special tokens in the default tokenizer + # - : 0 (unknown) + # - : 1 (beginning of sentence) + # - : 2 (end of sentence) + special_token_ids = list(tokenizer._tokenizer.added_tokens_encoder.values()) + + # Check if special tokens are present in request input + req_input = {"text_input": "This is test input."} + tokens = pd._tokenize_triton_request_input(req_input) + assert all([s not in tokens for s in special_token_ids]) + + pd._response_format = ResponseFormat.OPENAI_COMPLETIONS + req_input = {"payload": '{"prompt":"This is test input."}'} + tokens = pd._tokenize_openai_request_input(req_input) + assert all([s not in tokens for s in special_token_ids]) + + pd._response_format = ResponseFormat.OPENAI_CHAT_COMPLETIONS + req_input = {"payload": '{"messages":[{"content":"This is test input."}]}'} + tokens = pd._tokenize_openai_request_input(req_input) + assert all([s not in tokens for s in special_token_ids]) + + # Check if special tokens are present in the responses + res_outputs = ["This", "is", "test", "input."] + tokens = [] + for t in 
pd._run_tokenizer(res_outputs): + tokens += t + assert all([s not in tokens for s in special_token_ids]) def test_llm_metrics_get_base_name(self) -> None: """Test get_base_name method in LLMMetrics class.""" @@ -83,11 +421,11 @@ def test_llm_metrics_get_base_name(self) -> None: request_throughputs=[10.12, 11.33], request_latencies=[3, 44], time_to_first_tokens=[1, 2, 3], - inter_token_latencies=[4, 5], + inter_token_latencies=[[4, 5]], output_token_throughputs=[22.13, 9423.02], output_token_throughputs_per_request=[7, 8, 9], - output_sequence_lengths=[3, 4], - input_sequence_lengths=[12, 34], + num_output_tokens=[3, 4], + num_input_tokens=[12, 34], ) assert metrics.get_base_name("time_to_first_tokens") == "time_to_first_token" assert metrics.get_base_name("inter_token_latencies") == "inter_token_latency" @@ -95,11 +433,179 @@ def test_llm_metrics_get_base_name(self) -> None: metrics.get_base_name("output_token_throughputs_per_request") == "output_token_throughput_per_request" ) - assert ( - metrics.get_base_name("output_sequence_lengths") == "output_sequence_length" - ) - assert ( - metrics.get_base_name("input_sequence_lengths") == "input_sequence_length" - ) + assert metrics.get_base_name("num_output_tokens") == "num_output_token" + assert metrics.get_base_name("num_input_tokens") == "num_input_token" with pytest.raises(KeyError): metrics.get_base_name("hello1234") + + def test_empty_response(self, mock_read_write: pytest.MonkeyPatch) -> None: + """Check if it handles all empty responses.""" + tokenizer = get_tokenizer(DEFAULT_TOKENIZER) + + # Should not throw error + _ = LLMProfileDataParser( + filename=Path("empty_profile_export.json"), + tokenizer=tokenizer, + ) + + empty_profile_data = { + "service_kind": "openai", + "endpoint": "v1/chat/completions", + "experiments": [ + { + "experiment": { + "mode": "concurrency", + "value": 10, + }, + "requests": [ + { + "timestamp": 1, + "request_inputs": { + "payload": '{"messages":[{"role":"user","content":"This is test"}],"model":"llama-2-7b","stream":true}', + }, + "response_timestamps": [3, 5, 8], + "response_outputs": [ + { + "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"role":"assistant"},"finish_reason":null}]}\n\n' + }, + { + "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"content":""},"finish_reason":null}]}\n\n' + }, + {"response": "data: [DONE]\n\n"}, + ], + }, + ], + }, + ], + } + + openai_profile_data = { + "service_kind": "openai", + "endpoint": "v1/chat/completions", + "experiments": [ + { + "experiment": { + "mode": "concurrency", + "value": 10, + }, + "requests": [ + { + "timestamp": 1, + "request_inputs": { + "payload": '{"messages":[{"role":"user","content":"This is test"}],"model":"llama-2-7b","stream":true}', + }, + # the first, and the last two responses will be ignored because they have no "content" + "response_timestamps": [3, 5, 8, 12, 13, 14], + "response_outputs": [ + { + "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"role":"assistant"},"finish_reason":null}]}\n\n' + }, + { + "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"content":"I"},"finish_reason":null}]}\n\n' + }, + { + "response": 'data: 
{"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"content":" like"},"finish_reason":null}]}\n\n' + }, + { + "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"content":" dogs"},"finish_reason":null}]}\n\n' + }, + { + "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{},"finish_reason":null}]}\n\n' + }, + {"response": "data: [DONE]\n\n"}, + ], + }, + { + "timestamp": 2, + "request_inputs": { + "payload": '{"messages":[{"role":"user","content":"This is test too"}],"model":"llama-2-7b","stream":true}', + }, + # the first, and the last two responses will be ignored because they have no "content" + "response_timestamps": [4, 7, 11, 15, 18, 19], + "response_outputs": [ + { + "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"role":"assistant"},"finish_reason":null}]}\n\n' + }, + { + "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"content":"I"},"finish_reason":null}]}\n\n' + }, + { + "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"content":"don\'t"},"finish_reason":null}]}\n\n' + }, + { + "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"content":"cook food"},"finish_reason":null}]}\n\n' + }, + { + "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{},"finish_reason":null}]}\n\n' + }, + {"response": "data: [DONE]\n\n"}, + ], + }, + ], + }, + ], + } + + triton_profile_data = { + "service_kind": "triton", + "endpoint": "", + "experiments": [ + { + "experiment": { + "mode": "concurrency", + "value": 10, + }, + "requests": [ + { + "timestamp": 1, + "request_inputs": {"text_input": "This is test"}, + "response_timestamps": [3, 5, 8], + "response_outputs": [ + {"text_output": "I"}, + {"text_output": " like"}, + {"text_output": " dogs"}, + ], + }, + { + "timestamp": 2, + "request_inputs": {"text_input": "This is test too"}, + "response_timestamps": [4, 7, 11], + "response_outputs": [ + {"text_output": "I"}, + {"text_output": " don't"}, + {"text_output": " cook food"}, + ], + }, + ], + }, + { + "experiment": { + "mode": "request_rate", + "value": 2.0, + }, + "requests": [ + { + "timestamp": 5, + "request_inputs": {"text_input": "This is test"}, + "response_timestamps": [7, 8, 13, 18], + "response_outputs": [ + {"text_output": "cat"}, + {"text_output": " is"}, + {"text_output": " cool"}, + {"text_output": " too"}, + ], + }, + { + "timestamp": 3, + "request_inputs": {"text_input": "This is test too"}, + "response_timestamps": [6, 8, 11], + "response_outputs": [ + {"text_output": "it's"}, + {"text_output": " very"}, + {"text_output": " simple work"}, + ], + }, + ], + }, + ], + } diff --git a/src/c++/perf_analyzer/genai-perf/tests/test_llm_profile_data_parser.py b/src/c++/perf_analyzer/genai-perf/tests/test_llm_profile_data_parser.py deleted file mode 100644 index 75976189d..000000000 --- a/src/c++/perf_analyzer/genai-perf/tests/test_llm_profile_data_parser.py +++ /dev/null @@ -1,587 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import json -from io import StringIO -from pathlib import Path -from typing import Any, List, Union - -import numpy as np -import pytest -from genai_perf.metrics import LLMMetrics -from genai_perf.profile_data_parser import LLMProfileDataParser -from genai_perf.tokenizer import DEFAULT_TOKENIZER, get_tokenizer - - -def ns_to_sec(ns: int) -> Union[int, float]: - """Convert from nanosecond to second.""" - return ns / 1e9 - - -class TestLLMProfileDataParser: - @pytest.fixture - def mock_read_write(self, monkeypatch: pytest.MonkeyPatch) -> List[str]: - """ - This function will mock the open function for specific files: - - - For "triton_profile_export.json", it will read and return the - contents of self.triton_profile_data - - For "openai_profile_export.json", it will read and return the - contents of self.openai_profile_data - - For "profile_export.csv", it will capture all data written to - the file, and return it as the return value of this function - - For all other files, it will behave like the normal open function - """ - - written_data = [] - - original_open = open - - def custom_open(filename, *args, **kwargs): - def write(self: Any, content: str) -> int: - written_data.append(content) - return len(content) - - if filename == "triton_profile_export.json": - tmp_file = StringIO(json.dumps(self.triton_profile_data)) - return tmp_file - elif filename == "openai_profile_export.json": - tmp_file = StringIO(json.dumps(self.openai_profile_data)) - return tmp_file - elif filename == "empty_profile_export.json": - tmp_file = StringIO(json.dumps(self.empty_profile_data)) - return tmp_file - elif filename == "profile_export.csv": - tmp_file = StringIO() - tmp_file.write = write.__get__(tmp_file) - return tmp_file - else: - return original_open(filename, *args, **kwargs) - - monkeypatch.setattr("builtins.open", custom_open) - - return written_data - - def test_triton_llm_profile_data(self, mock_read_write: pytest.MonkeyPatch) -> None: - """Collect LLM metrics from profile export data and check values. 
- - Metrics - * time to first tokens - - experiment 1: [3 - 1, 4 - 2] = [2, 2] - - experiment 2: [7 - 5, 6 - 3] = [2, 3] - * inter token latencies - - experiment 1: [((8 - 1) - 2)/(3 - 1), ((11 - 2) - 2)/(6 - 1)] - : [2.5, 1.4] - : [2, 1] # rounded - - experiment 2: [((18 - 5) - 2)/(4 - 1), ((11 - 3) - 3)/(6 - 1)] - : [11/3, 1] - : [4, 1] # rounded - * output token throughputs per request - - experiment 1: [3/(8 - 1), 6/(11 - 2)] = [3/7, 6/9] - - experiment 2: [4/(18 - 5), 6/(11 - 3)] = [4/13, 6/8] - * output token throughputs - - experiment 1: [(3 + 6)/(11 - 1)] = [9/10] - - experiment 2: [(4 + 6)/(18 - 3)] = [2/3] - * output sequence lengths - - experiment 1: [3, 6] - - experiment 2: [4, 6] - * input sequence lengths - - experiment 1: [3, 4] - - experiment 2: [3, 4] - """ - tokenizer = get_tokenizer(DEFAULT_TOKENIZER) - pd = LLMProfileDataParser( - filename=Path("triton_profile_export.json"), - tokenizer=tokenizer, - ) - - # experiment 1 metrics & statistics - stat_obj = pd.get_statistics(infer_mode="concurrency", load_level="10") - metrics = stat_obj.metrics - stat = stat_obj.stats_dict - - assert isinstance(metrics, LLMMetrics) - - assert metrics.time_to_first_tokens == [2, 2] - assert metrics.inter_token_latencies == [2, 1] - ottpr = [3 / ns_to_sec(7), 6 / ns_to_sec(9)] - assert metrics.output_token_throughputs_per_request == pytest.approx(ottpr) - ott = [9 / ns_to_sec(10)] - assert metrics.output_token_throughputs == pytest.approx(ott) - assert metrics.output_sequence_lengths == [3, 6] - assert metrics.input_sequence_lengths == [3, 4] - - # Disable Pylance warnings for dynamically set attributes due to Statistics - # not having strict attributes listed. - assert stat["time_to_first_token"]["avg"] == 2 # type: ignore - assert stat["inter_token_latency"]["avg"] == 1.5 # type: ignore - assert stat["output_token_throughput_per_request"]["avg"] == pytest.approx( # type: ignore - np.mean(ottpr) - ) - assert stat["output_sequence_length"]["avg"] == 4.5 # type: ignore - assert stat["input_sequence_length"]["avg"] == 3.5 # type: ignore - - assert stat["time_to_first_token"]["p50"] == 2 # type: ignore - assert stat["inter_token_latency"]["p50"] == 1.5 # type: ignore - assert stat["output_token_throughput_per_request"]["p50"] == pytest.approx( # type: ignore - np.percentile(ottpr, 50) - ) - assert stat["output_sequence_length"]["p50"] == 4.5 # type: ignore - assert stat["input_sequence_length"]["p50"] == 3.5 # type: ignore - - assert stat["time_to_first_token"]["min"] == 2 # type: ignore - assert stat["inter_token_latency"]["min"] == 1 # type: ignore - min_ottpr = 3 / ns_to_sec(7) - assert stat["output_token_throughput_per_request"]["min"] == pytest.approx(min_ottpr) # type: ignore - assert stat["output_sequence_length"]["min"] == 3 # type: ignore - assert stat["input_sequence_length"]["min"] == 3 # type: ignore - - assert stat["time_to_first_token"]["max"] == 2 # type: ignore - assert stat["inter_token_latency"]["max"] == 2 # type: ignore - max_ottpr = 6 / ns_to_sec(9) - assert stat["output_token_throughput_per_request"]["max"] == pytest.approx(max_ottpr) # type: ignore - assert stat["output_sequence_length"]["max"] == 6 # type: ignore - assert stat["input_sequence_length"]["max"] == 4 # type: ignore - - assert stat["time_to_first_token"]["std"] == np.std([2, 2]) # type: ignore - assert stat["inter_token_latency"]["std"] == np.std([2, 1]) # type: ignore - assert stat["output_token_throughput_per_request"]["std"] == pytest.approx( # type: ignore - np.std(ottpr) - ) - assert 
stat["output_sequence_length"]["std"] == np.std([3, 6]) # type: ignore - assert stat["input_sequence_length"]["std"] == np.std([3, 4]) # type: ignore - - oott = 9 / ns_to_sec(10) - assert stat["output_token_throughput"]["avg"] == pytest.approx(oott) # type: ignore - - # experiment 2 statistics - stat_obj = pd.get_statistics(infer_mode="request_rate", load_level="2.0") - metrics = stat_obj.metrics - stat = stat_obj.stats_dict - assert isinstance(metrics, LLMMetrics) - - assert metrics.time_to_first_tokens == [2, 3] - assert metrics.inter_token_latencies == [4, 1] - ottpr = [4 / ns_to_sec(13), 6 / ns_to_sec(8)] - assert metrics.output_token_throughputs_per_request == pytest.approx(ottpr) - ott = [2 / ns_to_sec(3)] - assert metrics.output_token_throughputs == pytest.approx(ott) - assert metrics.output_sequence_lengths == [4, 6] - assert metrics.input_sequence_lengths == [3, 4] - - assert stat["time_to_first_token"]["avg"] == pytest.approx(2.5) # type: ignore - assert stat["inter_token_latency"]["avg"] == pytest.approx(2.5) # type: ignore - assert stat["output_token_throughput_per_request"]["avg"] == pytest.approx( # type: ignore - np.mean(ottpr) - ) - assert stat["output_sequence_length"]["avg"] == 5 # type: ignore - assert stat["input_sequence_length"]["avg"] == 3.5 # type: ignore - - assert stat["time_to_first_token"]["p50"] == pytest.approx(2.5) # type: ignore - assert stat["inter_token_latency"]["p50"] == pytest.approx(2.5) # type: ignore - assert stat["output_token_throughput_per_request"]["p50"] == pytest.approx( # type: ignore - np.percentile(ottpr, 50) - ) - assert stat["output_sequence_length"]["p50"] == 5 # type: ignore - assert stat["input_sequence_length"]["p50"] == 3.5 # type: ignore - - assert stat["time_to_first_token"]["min"] == pytest.approx(2) # type: ignore - assert stat["inter_token_latency"]["min"] == pytest.approx(1) # type: ignore - min_ottpr = 4 / ns_to_sec(13) - assert stat["output_token_throughput_per_request"]["min"] == pytest.approx(min_ottpr) # type: ignore - assert stat["output_sequence_length"]["min"] == 4 # type: ignore - assert stat["input_sequence_length"]["min"] == 3 # type: ignore - - assert stat["time_to_first_token"]["max"] == pytest.approx(3) # type: ignore - assert stat["inter_token_latency"]["max"] == pytest.approx(4) # type: ignore - max_ottpr = 6 / ns_to_sec(8) - assert stat["output_token_throughput_per_request"]["max"] == pytest.approx(max_ottpr) # type: ignore - assert stat["output_sequence_length"]["max"] == 6 # type: ignore - assert stat["input_sequence_length"]["max"] == 4 # type: ignore - - assert stat["time_to_first_token"]["std"] == np.std([2, 3]) * (1) # type: ignore - assert stat["inter_token_latency"]["std"] == np.std([4, 1]) * (1) # type: ignore - assert stat["output_token_throughput_per_request"]["std"] == pytest.approx( # type: ignore - np.std(ottpr) - ) - assert stat["output_sequence_length"]["std"] == np.std([4, 6]) # type: ignore - assert stat["input_sequence_length"]["std"] == np.std([3, 4]) # type: ignore - - oott = 2 / ns_to_sec(3) - assert stat["output_token_throughput"]["avg"] == pytest.approx(oott) # type: ignore - - # check non-existing profile data - with pytest.raises(KeyError): - pd.get_statistics(infer_mode="concurrency", load_level="30") - - def test_openai_llm_profile_data(self, mock_read_write: pytest.MonkeyPatch) -> None: - """Collect LLM metrics from profile export data and check values. 
- - Metrics - * time to first tokens - - experiment 1: [5 - 1, 7 - 2] = [4, 5] - * inter token latencies - - experiment 1: [((12 - 1) - 4)/(3 - 1), ((15 - 2) - 5)/(6 - 1)] - : [3.5, 1.6] - : [4, 2] # rounded - * output token throughputs per request - - experiment 1: [3/(12 - 1), 6/(15 - 2)] = [3/11, 6/13] - * output token throughputs - - experiment 1: [(3 + 6)/(15 - 1)] = [9/14] - * output sequence lengths - - experiment 1: [3, 6] - * input sequence lengths - - experiment 1: [3, 4] - """ - tokenizer = get_tokenizer(DEFAULT_TOKENIZER) - pd = LLMProfileDataParser( - filename=Path("openai_profile_export.json"), - tokenizer=tokenizer, - ) - - # experiment 1 statistics - stat_obj = pd.get_statistics(infer_mode="concurrency", load_level="10") - metrics = stat_obj.metrics - stat = stat_obj.stats_dict - assert isinstance(metrics, LLMMetrics) - - assert metrics.time_to_first_tokens == [4, 5] - assert metrics.inter_token_latencies == [4, 2] - ottpr = [3 / ns_to_sec(11), 6 / ns_to_sec(13)] - assert metrics.output_token_throughputs_per_request == pytest.approx(ottpr) - ott = [9 / ns_to_sec(14)] - assert metrics.output_token_throughputs == pytest.approx(ott) - assert metrics.output_sequence_lengths == [3, 6] - assert metrics.input_sequence_lengths == [3, 4] - - assert stat["time_to_first_token"]["avg"] == pytest.approx(4.5) # type: ignore - assert stat["inter_token_latency"]["avg"] == pytest.approx(3) # type: ignore - assert stat["output_token_throughput_per_request"]["avg"] == pytest.approx( # type: ignore - np.mean(ottpr) - ) - assert stat["output_sequence_length"]["avg"] == 4.5 # type: ignore - assert stat["input_sequence_length"]["avg"] == 3.5 # type: ignore - - assert stat["time_to_first_token"]["p50"] == pytest.approx(4.5) # type: ignore - assert stat["inter_token_latency"]["p50"] == pytest.approx(3) # type: ignore - assert stat["output_token_throughput_per_request"]["p50"] == pytest.approx( # type: ignore - np.percentile(ottpr, 50) - ) - assert stat["output_sequence_length"]["p50"] == 4.5 # type: ignore - assert stat["input_sequence_length"]["p50"] == 3.5 # type: ignore - - assert stat["time_to_first_token"]["min"] == pytest.approx(4) # type: ignore - assert stat["inter_token_latency"]["min"] == pytest.approx(2) # type: ignore - min_ottpr = 3 / ns_to_sec(11) - assert stat["output_token_throughput_per_request"]["min"] == pytest.approx(min_ottpr) # type: ignore - assert stat["output_sequence_length"]["min"] == 3 # type: ignore - assert stat["input_sequence_length"]["min"] == 3 # type: ignore - - assert stat["time_to_first_token"]["max"] == pytest.approx(5) # type: ignore - assert stat["inter_token_latency"]["max"] == pytest.approx(4) # type: ignore - max_ottpr = 6 / ns_to_sec(13) - assert stat["output_token_throughput_per_request"]["max"] == pytest.approx(max_ottpr) # type: ignore - assert stat["output_sequence_length"]["max"] == 6 # type: ignore - assert stat["input_sequence_length"]["max"] == 4 # type: ignore - - assert stat["time_to_first_token"]["std"] == np.std([4, 5]) * (1) # type: ignore - assert stat["inter_token_latency"]["std"] == np.std([4, 2]) * (1) # type: ignore - assert stat["output_token_throughput_per_request"]["std"] == pytest.approx( # type: ignore - np.std(ottpr) - ) - assert stat["output_sequence_length"]["std"] == np.std([3, 6]) # type: ignore - assert stat["input_sequence_length"]["std"] == np.std([3, 4]) # type: ignore - - oott = 9 / ns_to_sec(14) - assert stat["output_token_throughput"]["avg"] == pytest.approx(oott) # type: ignore - - # check non-existing profile data - 
with pytest.raises(KeyError): - pd.get_statistics(infer_mode="concurrency", load_level="40") - - def test_merged_sse_response(self, mock_read_write: pytest.MonkeyPatch) -> None: - """Test merging the multiple sse response.""" - res_timestamps = [0, 1, 2, 3] - res_outputs = [ - { - "response": 'data: {"choices":[{"delta":{"content":"aaa"}}],"object":"chat.completion.chunk"}\n\n' - }, - { - "response": ( - 'data: {"choices":[{"delta":{"content":"abc"}}],"object":"chat.completion.chunk"}\n\n' - 'data: {"choices":[{"delta":{"content":"1234"}}],"object":"chat.completion.chunk"}\n\n' - 'data: {"choices":[{"delta":{"content":"helloworld"}}],"object":"chat.completion.chunk"}\n\n' - ) - }, - {"response": "data: [DONE]\n\n"}, - ] - expected_response = '{"choices": [{"delta": {"content": "abc1234helloworld"}}], "object": "chat.completion.chunk"}' - - tokenizer = get_tokenizer(DEFAULT_TOKENIZER) - pd = LLMProfileDataParser( - filename=Path("openai_profile_export.json"), - tokenizer=tokenizer, - ) - - pd._preprocess_response(res_timestamps, res_outputs) - assert res_outputs[1]["response"] == expected_response - - def test_openai_output_token_counts( - self, mock_read_write: pytest.MonkeyPatch - ) -> None: - output_texts = [ - "Ad", - "idas", - " Orig", - "inals", - " are", - " now", - " available", - " in", - " more", - " than", - ] - res_outputs = [] - for text in output_texts: - response = f'data: {{"choices":[{{"delta":{{"content":"{text}"}}}}],"object":"chat.completion.chunk"}}\n\n' - res_outputs.append({"response": response}) - - tokenizer = get_tokenizer(DEFAULT_TOKENIZER) - pd = LLMProfileDataParser( - filename=Path("openai_profile_export.json"), - tokenizer=tokenizer, - ) - - output_token_counts, total_output_token = pd._get_output_token_counts( - res_outputs - ) - assert output_token_counts == [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] # total 10 - assert total_output_token == 9 - assert total_output_token != sum(output_token_counts) - - def test_triton_output_token_counts( - self, mock_read_write: pytest.MonkeyPatch - ) -> None: - output_texts = [ - "Ad", - "idas", - " Orig", - "inals", - " are", - " now", - " available", - " in", - " more", - " than", - ] - res_outputs = [] - for text in output_texts: - res_outputs.append({"text_output": text}) - - tokenizer = get_tokenizer(DEFAULT_TOKENIZER) - pd = LLMProfileDataParser( - filename=Path("triton_profile_export.json"), - tokenizer=tokenizer, - ) - - output_token_counts, total_output_token = pd._get_output_token_counts( - res_outputs - ) - assert output_token_counts == [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] # total 10 - assert total_output_token == 9 - assert total_output_token != sum(output_token_counts) - - def test_empty_response(self, mock_read_write: pytest.MonkeyPatch) -> None: - """Check if it handles all empty responses.""" - tokenizer = get_tokenizer(DEFAULT_TOKENIZER) - - # Should not throw error - _ = LLMProfileDataParser( - filename=Path("empty_profile_export.json"), - tokenizer=tokenizer, - ) - - empty_profile_data = { - "service_kind": "openai", - "endpoint": "v1/chat/completions", - "experiments": [ - { - "experiment": { - "mode": "concurrency", - "value": 10, - }, - "requests": [ - { - "timestamp": 1, - "request_inputs": { - "payload": '{"messages":[{"role":"user","content":"This is test"}],"model":"llama-2-7b","stream":true}', - }, - "response_timestamps": [3, 5, 8], - "response_outputs": [ - { - "response": 'data: 
{"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"role":"assistant"},"finish_reason":null}]}\n\n' - }, - { - "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"content":""},"finish_reason":null}]}\n\n' - }, - {"response": "data: [DONE]\n\n"}, - ], - }, - ], - }, - ], - } - - openai_profile_data = { - "service_kind": "openai", - "endpoint": "v1/chat/completions", - "experiments": [ - { - "experiment": { - "mode": "concurrency", - "value": 10, - }, - "requests": [ - { - "timestamp": 1, - "request_inputs": { - "payload": '{"messages":[{"role":"user","content":"This is test"}],"model":"llama-2-7b","stream":true}', - }, - # the first, and the last two responses will be ignored because they have no "content" - "response_timestamps": [3, 5, 8, 12, 13, 14], - "response_outputs": [ - { - "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"role":"assistant"},"finish_reason":null}]}\n\n' - }, - { - "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"content":"I"},"finish_reason":null}]}\n\n' - }, - { - "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"content":" like"},"finish_reason":null}]}\n\n' - }, - { - "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"content":" dogs"},"finish_reason":null}]}\n\n' - }, - { - "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{},"finish_reason":null}]}\n\n' - }, - {"response": "data: [DONE]\n\n"}, - ], - }, - { - "timestamp": 2, - "request_inputs": { - "payload": '{"messages":[{"role":"user","content":"This is test too"}],"model":"llama-2-7b","stream":true}', - }, - # the first, and the last two responses will be ignored because they have no "content" - "response_timestamps": [4, 7, 11, 15, 18, 19], - "response_outputs": [ - { - "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"role":"assistant"},"finish_reason":null}]}\n\n' - }, - { - "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"content":"I"},"finish_reason":null}]}\n\n' - }, - { - "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"content":"don\'t"},"finish_reason":null}]}\n\n' - }, - { - "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"content":"cook food"},"finish_reason":null}]}\n\n' - }, - { - "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{},"finish_reason":null}]}\n\n' - }, - {"response": "data: [DONE]\n\n"}, - ], - }, - ], - }, - ], - } - - triton_profile_data = { - "service_kind": "triton", - "endpoint": "", - "experiments": [ - { - "experiment": { - "mode": "concurrency", - "value": 10, - }, - "requests": [ - { - "timestamp": 1, - "request_inputs": {"text_input": "This is test"}, - "response_timestamps": [3, 5, 8], - "response_outputs": [ - {"text_output": "I"}, - 
{"text_output": " like"}, - {"text_output": " dogs"}, - ], - }, - { - "timestamp": 2, - "request_inputs": {"text_input": "This is test too"}, - "response_timestamps": [4, 7, 11], - "response_outputs": [ - {"text_output": "I"}, - {"text_output": " don't"}, - {"text_output": " cook food"}, - ], - }, - ], - }, - { - "experiment": { - "mode": "request_rate", - "value": 2.0, - }, - "requests": [ - { - "timestamp": 5, - "request_inputs": {"text_input": "This is test"}, - "response_timestamps": [7, 8, 13, 18], - "response_outputs": [ - {"text_output": "cat"}, - {"text_output": " is"}, - {"text_output": " cool"}, - {"text_output": " too"}, - ], - }, - { - "timestamp": 3, - "request_inputs": {"text_input": "This is test too"}, - "response_timestamps": [6, 8, 11], - "response_outputs": [ - {"text_output": "it's"}, - {"text_output": " very"}, - {"text_output": " simple work"}, - ], - }, - ], - }, - ], - } diff --git a/src/c++/perf_analyzer/genai-perf/tests/test_metrics.py b/src/c++/perf_analyzer/genai-perf/tests/test_metrics.py deleted file mode 100644 index 2af489fc4..000000000 --- a/src/c++/perf_analyzer/genai-perf/tests/test_metrics.py +++ /dev/null @@ -1,64 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-
-import pytest
-from genai_perf.metrics import Metrics
-
-
-class TestMetrics:
-
-    def test_metric_request_metrics(self) -> None:
-        """Test request_metrics property."""
-        m = Metrics(
-            request_throughputs=[10.12, 11.33],
-            request_latencies=[3, 44],
-        )
-        req_metrics = m.request_metrics
-        assert len(req_metrics) == 1
-        assert req_metrics[0].name == "request_latency"
-        assert req_metrics[0].unit == "ms"
-
-    def test_metric_system_metrics(self) -> None:
-        """Test system_metrics property."""
-        m = Metrics(
-            request_throughputs=[10.12, 11.33],
-            request_latencies=[3, 44],
-        )
-        sys_metrics = m.system_metrics
-        assert len(sys_metrics) == 1
-        assert sys_metrics[0].name == "request_throughput"
-        assert sys_metrics[0].unit == "per sec"
-
-    def test_metrics_get_base_name(self) -> None:
-        """Test get_base_name method in Metrics class."""
-        metrics = Metrics(
-            request_throughputs=[10.12, 11.33],
-            request_latencies=[3, 44],
-        )
-        assert metrics.get_base_name("request_throughputs") == "request_throughput"
-        assert metrics.get_base_name("request_latencies") == "request_latency"
-        with pytest.raises(KeyError):
-            metrics.get_base_name("hello1234")
diff --git a/src/c++/perf_analyzer/genai-perf/tests/test_plot_configs.py b/src/c++/perf_analyzer/genai-perf/tests/test_plot_configs.py
index 8a1dfee7a..1e1391e4c 100644
--- a/src/c++/perf_analyzer/genai-perf/tests/test_plot_configs.py
+++ b/src/c++/perf_analyzer/genai-perf/tests/test_plot_configs.py
@@ -51,11 +51,11 @@ class TestPlotConfigParser:
output: test_output_1
plot2:
- title: Input Sequence Length vs Output Sequence Length
- x_metric: input_sequence_lengths
- y_metric: output_sequence_lengths
- x_label: Input Sequence Length
- y_label: Output Sequence Length
+ title: Num Input Token vs Num Output Token
+ x_metric: num_input_tokens
+ y_metric: num_output_tokens
+ x_label: Input Tokens
+ y_label: Output Tokens
width: 1234
height: 5678
type: scatter
@@ -97,9 +97,9 @@ def test_generate_configs(self, monkeypatch) -> None:
        assert prd.y_metric == [1, 2, 3]
        # plot config 2
-        assert pc2.title == "Input Sequence Length vs Output Sequence Length"
-        assert pc2.x_label == "Input Sequence Length"
-        assert pc2.y_label == "Output Sequence Length"
+        assert pc2.title == "Num Input Token vs Num Output Token"
+        assert pc2.x_label == "Input Tokens"
+        assert pc2.y_label == "Output Tokens"
        assert pc2.width == 1234
        assert pc2.height == 5678
        assert pc2.type == PlotType.SCATTER
diff --git a/src/c++/perf_analyzer/genai-perf/tests/test_profile_data_parser.py b/src/c++/perf_analyzer/genai-perf/tests/test_profile_data_parser.py
deleted file mode 100644
index fe303c514..000000000
--- a/src/c++/perf_analyzer/genai-perf/tests/test_profile_data_parser.py
+++ /dev/null
@@ -1,297 +0,0 @@
-# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions
-# are met:
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-# * Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in the
-# documentation and/or other materials provided with the distribution.
-# * Neither the name of NVIDIA CORPORATION nor the names of its
-# contributors may be used to endorse or promote products derived
-# from this software without specific prior written permission.
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import json -from io import StringIO -from pathlib import Path -from typing import Any, List, Union - -import numpy as np -import pytest -from genai_perf.metrics import Metrics -from genai_perf.profile_data_parser import ProfileDataParser - - -def ns_to_sec(ns: int) -> Union[int, float]: - """Convert from nanosecond to second.""" - return ns / 1e9 - - -class TestProfileDataParser: - @pytest.fixture - def mock_read_write(self, monkeypatch: pytest.MonkeyPatch) -> List[str]: - """ - This function will mock the open function for specific files: - - - For "triton_profile_export.json", it will read and return the - contents of self.triton_profile_data - - For "openai_profile_export.json", it will read and return the - contents of self.openai_profile_data - - For "profile_export.csv", it will capture all data written to - the file, and return it as the return value of this function - - For all other files, it will behave like the normal open function - """ - - written_data = [] - - original_open = open - - def custom_open(filename, *args, **kwargs): - def write(self: Any, content: str) -> int: - written_data.append(content) - return len(content) - - if filename == "embedding_profile_export.json": - tmp_file = StringIO(json.dumps(self.embedding_profile_data)) - return tmp_file - elif filename == "ranking_profile_export.json": - tmp_file = StringIO(json.dumps(self.ranking_profile_data)) - return tmp_file - elif filename == "huggingface_ranking_profile_export.json": - tmp_file = StringIO(json.dumps(self.huggingface_ranking_profile_data)) - return tmp_file - elif filename == "profile_export.csv": - tmp_file = StringIO() - tmp_file.write = write.__get__(tmp_file) - return tmp_file - else: - return original_open(filename, *args, **kwargs) - - monkeypatch.setattr("builtins.open", custom_open) - - return written_data - - # ================================================ - # EMBEDDINGS API - # ================================================ - embedding_profile_data = { - "service_kind": "openai", - "endpoint": "v1/embeddings", - "experiments": [ - { - "experiment": { - "mode": "concurrency", - "value": 10, - }, - "requests": [ - { - "timestamp": 1, - "request_inputs": { - "payload": '{"input":"This is test","model":"NV-Embed-QA","input_type":"passage","encoding_format":"float","truncate":"NONE"}', - }, - "response_timestamps": [3], - "response_outputs": [ - { - "response": '{"object":"list","data":[{"index":0,"embedding":[1, 2, 3],"object":"embedding"}],"model":"NV-Embed-QA","usage":{"prompt_tokens":7,"total_tokens":7}}' - }, - ], - }, - { - "timestamp": 2, - "request_inputs": { - "payload": '{"input":"This is test too","model":"NV-Embed-QA","input_type":"passage","encoding_format":"float","truncate":"NONE"}', - }, - "response_timestamps": 
[5], - "response_outputs": [ - { - "response": '{"object":"list","data":[{"index":0,"embedding":[1, 2, 3, 4],"object":"embedding"}],"model":"NV-Embed-QA","usage":{"prompt_tokens":8,"total_tokens":8}}' - }, - ], - }, - ], - }, - ], - } - - def test_embedding_profile_data(self, mock_read_write: pytest.MonkeyPatch) -> None: - """Collect base metrics from profile export data and check values. - - Metrics - * request latencies - - [3 - 1, 5 - 2] = [2, 3] - * request throughputs - - [2 / (5e-9 - 1e-9)] = [5e8] - """ - pd = ProfileDataParser(filename=Path("embedding_profile_export.json")) - - # experiment 1 statistics - stats = pd.get_statistics(infer_mode="concurrency", load_level="10") - metrics = stats.metrics - stats_dict = stats.stats_dict - assert isinstance(metrics, Metrics) - - assert metrics.request_latencies == [2, 3] - assert metrics.request_throughputs == [pytest.approx(5e8)] - - assert stats_dict["request_latency"]["avg"] == pytest.approx(2.5) # type: ignore - assert stats_dict["request_latency"]["p50"] == pytest.approx(2.5) # type: ignore - assert stats_dict["request_latency"]["min"] == pytest.approx(2) # type: ignore - assert stats_dict["request_latency"]["max"] == pytest.approx(3) # type: ignore - assert stats_dict["request_latency"]["std"] == np.std([2, 3]) # type: ignore - - assert stats_dict["request_throughput"]["avg"] == pytest.approx(5e8) # type: ignore - - # ================================================ - # RANKINGS API - # ================================================ - ranking_profile_data = { - "service_kind": "openai", - "endpoint": "v1/ranking", - "experiments": [ - { - "experiment": { - "mode": "concurrency", - "value": 10, - }, - "requests": [ - { - "timestamp": 1, - "request_inputs": { - "payload": '{"query":{"text":"This is a test."},"passages":[{"text":"test output one"},{"text":"test output two"},{"text":"test output three"}],"model":"nv-rerank-qa-mistral-4b:1","truncate":"END"}', - }, - "response_timestamps": [3], - "response_outputs": [ - { - "response": '{"rankings":[{"index":0,"logit":-5.98828125},{"index":1,"logit":-6.828125},{"index":2,"logit":-7.60546875}]}' - }, - ], - }, - { - "timestamp": 2, - "request_inputs": { - "payload": '{"query":{"text":"This is a test."},"passages":[{"text":"test output one"},{"text":"test output two"},{"text":"test output three"}],"model":"nv-rerank-qa-mistral-4b:1","truncate":"END"}', - }, - "response_timestamps": [5], - "response_outputs": [ - { - "response": '{"rankings":[{"index":2,"logit":-6.15625},{"index":1,"logit":-7.83984375},{"index":0,"logit":-7.84765625}]}' - }, - ], - }, - ], - }, - ], - } - - def test_ranking_profile_data(self, mock_read_write: pytest.MonkeyPatch) -> None: - """Collect base metrics from profile export data and check values. 
- - Metrics - * request latencies - - [3 - 1, 5 - 2] = [2, 3] - * request throughputs - - [2 / (5e-9 - 1e-9)] = [5e8] - """ - pd = ProfileDataParser(filename=Path("ranking_profile_export.json")) - - # experiment 1 statistics - stats = pd.get_statistics(infer_mode="concurrency", load_level="10") - metrics = stats.metrics - stats_dict = stats.stats_dict - assert isinstance(metrics, Metrics) - - assert metrics.request_latencies == [2, 3] - assert metrics.request_throughputs == [pytest.approx(5e8)] - - assert stats_dict["request_latency"]["avg"] == pytest.approx(2.5) # type: ignore - assert stats_dict["request_latency"]["p50"] == pytest.approx(2.5) # type: ignore - assert stats_dict["request_latency"]["min"] == pytest.approx(2) # type: ignore - assert stats_dict["request_latency"]["max"] == pytest.approx(3) # type: ignore - assert stats_dict["request_latency"]["std"] == np.std([2, 3]) # type: ignore - - assert stats_dict["request_throughput"]["avg"] == pytest.approx(5e8) # type: ignore - - # ================================================ - # HUGGINGFACE RANKINGS API - # ================================================ - huggingface_ranking_profile_data = { - "service_kind": "openai", - "endpoint": "rerank", - "experiments": [ - { - "experiment": { - "mode": "concurrency", - "value": 10, - }, - "requests": [ - { - "timestamp": 1, - "request_inputs": { - "payload": '{"query":"What was the first car ever driven?","texts":["Daddys Home 2 Principal photography on the film began in Massachusetts in March 2017 and it was released in the United States by Paramount Pictures on November 10, 2017. Although the film received unfavorable reviews, it has grossed over $180 million worldwide on a $69 million budget.","Kevin Loader is a British film and television producer."]}' - }, - "response_timestamps": [3], - "response_outputs": [ - { - "response": '[{"index":0,"score":0.0032476764},{"index":1,"score":0.00036117696}]' - }, - ], - }, - { - "timestamp": 2, - "request_inputs": { - "payload": '{"query":"In what state did they film Shrek 2?","texts":["Francisco Antonio Zea Juan Francisco Antonio Hilari was a Colombian journalist, botanist, diplomat, politician, and statesman who served as the 1st Vice President of Colombia.","Daddys Home 2 Principal photography on the film began in Massachusetts in March 2017 and it was released in the United States by Paramount Pictures on November 10, 2017. Although the film received unfavorable reviews, it has grossed over $180 million worldwide on a $69 million budget."]}' - }, - "response_timestamps": [5], - "response_outputs": [ - { - "response": '[{"index":0,"score":0.020177318},{"index":1,"score":0.01461567}]' - }, - ], - }, - ], - }, - ], - } - - def test_huggingface_ranking_profile_data( - self, mock_read_write: pytest.MonkeyPatch - ) -> None: - """Collect base metrics from HuggingFace ranking profile export data and check values. 
- - Metrics - * request latencies - - [3 - 1, 5 - 2] = [2, 3] - * request throughputs - - [2 / (5e-9 - 1e-9)] = [5e8] - """ - pd = ProfileDataParser(filename=Path("huggingface_ranking_profile_export.json")) - - # experiment 1 statistics - stats = pd.get_statistics(infer_mode="concurrency", load_level="10") - metrics = stats.metrics - stats_dict = stats.stats_dict - assert isinstance(metrics, Metrics) - - assert metrics.request_latencies == [2, 3] - assert metrics.request_throughputs == [pytest.approx(5e8)] - - assert stats_dict["request_latency"]["avg"] == pytest.approx(2.5) # type: ignore - assert stats_dict["request_latency"]["p50"] == pytest.approx(2.5) # type: ignore - assert stats_dict["request_latency"]["min"] == pytest.approx(2) # type: ignore - assert stats_dict["request_latency"]["max"] == pytest.approx(3) # type: ignore - assert stats_dict["request_latency"]["std"] == np.std([2, 3]) # type: ignore - - assert stats_dict["request_throughput"]["avg"] == pytest.approx(5e8) # type: ignore diff --git a/src/c++/perf_analyzer/genai-perf/tests/test_wrapper.py b/src/c++/perf_analyzer/genai-perf/tests/test_wrapper.py index fd4c34b51..8a33853ec 100644 --- a/src/c++/perf_analyzer/genai-perf/tests/test_wrapper.py +++ b/src/c++/perf_analyzer/genai-perf/tests/test_wrapper.py @@ -43,14 +43,7 @@ class TestWrapper: ], ) def test_url_exactly_once_triton(self, monkeypatch, arg): - args = [ - "genai-perf", - "profile", - "-m", - "test_model", - "--service-kind", - "triton", - ] + arg + args = ["genai-perf", "-m", "test_model", "--endpoint-type", "kserve"] + arg monkeypatch.setattr("sys.argv", args) args, extra_args = parser.parse_args() cmd = Profiler.build_cmd(args, extra_args) @@ -77,14 +70,7 @@ def test_url_exactly_once_triton(self, monkeypatch, arg): ], ) def test_profile_export_filepath(self, monkeypatch, arg, expected_filepath): - args = [ - "genai-perf", - "profile", - "-m", - "test_model", - "--service-kind", - "triton", - ] + arg + args = ["genai-perf", "-m", "test_model", "--endpoint-type", "kserve"] + arg monkeypatch.setattr("sys.argv", args) args, extra_args = parser.parse_args() cmd = Profiler.build_cmd(args, extra_args) @@ -101,14 +87,7 @@ def test_profile_export_filepath(self, monkeypatch, arg, expected_filepath): ], ) def test_service_triton(self, monkeypatch, arg): - args = [ - "genai-perf", - "profile", - "-m", - "test_model", - "--service-kind", - "triton", - ] + arg + args = ["genai-perf", "-m", "test_model", "--endpoint-type", "kserve"] + arg monkeypatch.setattr("sys.argv", args) args, extra_args = parser.parse_args() cmd = Profiler.build_cmd(args, extra_args) @@ -132,11 +111,8 @@ def test_service_triton(self, monkeypatch, arg): def test_service_openai(self, monkeypatch, arg): args = [ "genai-perf", - "profile", "-m", "test_model", - "--service-kind", - "openai", ] + arg monkeypatch.setattr("sys.argv", args) args, extra_args = parser.parse_args() diff --git a/src/c++/perf_analyzer/infer_context.cc b/src/c++/perf_analyzer/infer_context.cc index aa868eba7..8929e6c99 100644 --- a/src/c++/perf_analyzer/infer_context.cc +++ b/src/c++/perf_analyzer/infer_context.cc @@ -260,13 +260,11 @@ InferContext::ValidateOutputs(const cb::InferResult* result_ptr) { // Validate output if set if (!infer_data_.expected_outputs_.empty()) { - for (size_t i = 0; i < infer_data_.expected_outputs_.size(); ++i) { + for (size_t i = 0; i < infer_data_.outputs_.size(); ++i) { const uint8_t* buf = nullptr; size_t byte_size = 0; + result_ptr->RawData(infer_data_.outputs_[i]->Name(), &buf, &byte_size); for (const 
auto& expected : infer_data_.expected_outputs_[i]) { - // Request output by validation output's name explicitly, rather than - // relying on the array indices being sorted equally in both arrays. - result_ptr->RawData(expected.name, &buf, &byte_size); if (!expected.is_valid) { return cb::Error( "Expected output can't be invalid", pa::GENERIC_ERROR); diff --git a/src/c++/perf_analyzer/inference_profiler.cc b/src/c++/perf_analyzer/inference_profiler.cc index a36f51c10..4d6af44b6 100644 --- a/src/c++/perf_analyzer/inference_profiler.cc +++ b/src/c++/perf_analyzer/inference_profiler.cc @@ -484,7 +484,6 @@ InferenceProfiler::Create( uint64_t measurement_request_count, MeasurementMode measurement_mode, std::shared_ptr mpi_driver, const uint64_t metrics_interval_ms, const bool should_collect_metrics, const double overhead_pct_threshold, - const bool async_mode, const std::shared_ptr collector, const bool should_collect_profile_data) { @@ -493,8 +492,7 @@ InferenceProfiler::Create( (percentile != -1), percentile, latency_threshold_ms_, protocol, parser, profile_backend, std::move(manager), measurement_request_count, measurement_mode, mpi_driver, metrics_interval_ms, should_collect_metrics, - overhead_pct_threshold, async_mode, collector, - should_collect_profile_data)); + overhead_pct_threshold, collector, should_collect_profile_data)); *profiler = std::move(local_profiler); return cb::Error::Success; @@ -510,7 +508,7 @@ InferenceProfiler::InferenceProfiler( std::unique_ptr manager, uint64_t measurement_request_count, MeasurementMode measurement_mode, std::shared_ptr mpi_driver, const uint64_t metrics_interval_ms, const bool should_collect_metrics, - const double overhead_pct_threshold, const bool async_mode, + const double overhead_pct_threshold, const std::shared_ptr collector, const bool should_collect_profile_data) : verbose_(verbose), measurement_window_ms_(measurement_window_ms), @@ -521,8 +519,7 @@ InferenceProfiler::InferenceProfiler( measurement_request_count_(measurement_request_count), measurement_mode_(measurement_mode), mpi_driver_(mpi_driver), should_collect_metrics_(should_collect_metrics), - overhead_pct_threshold_(overhead_pct_threshold), async_mode_(async_mode), - collector_(collector), + overhead_pct_threshold_(overhead_pct_threshold), collector_(collector), should_collect_profile_data_(should_collect_profile_data) { load_parameters_.stability_threshold = stability_threshold; @@ -723,22 +720,13 @@ InferenceProfiler::ProfileHelper( measurement_perf_status.request_rate = experiment_perf_status.request_rate; RETURN_IF_ERROR(manager_->CheckHealth()); - MeasureConfig measure_config; if (measurement_mode_ == MeasurementMode::TIME_WINDOWS) { - measure_config.measurement_window = measurement_window_ms_; - measure_config.is_count_based = false; + error.push( + Measure(measurement_perf_status, measurement_window_ms_, false)); } else { - measure_config.measurement_window = measurement_request_count_; - measure_config.is_count_based = true; + error.push( + Measure(measurement_perf_status, measurement_request_count_, true)); } - - // When request_count is not 0, the experiment will run for exactly X - // requests. 
In that case, we are not measuring based on window stability, - // and instead need to clamp the windows to be from the start of the - // first request to the end of the last request of the request count - // - measure_config.clamp_window = (request_count != 0); - error.push(Measure(measurement_perf_status, measure_config)); measurement_perf_statuses.push_back(measurement_perf_status); if (error.size() > load_parameters_.stability_window) { @@ -801,14 +789,6 @@ InferenceProfiler::ProfileHelper( completed_trials++; } while ((!early_exit) && (completed_trials < max_trials_)); - // For async requests, print a warning if the latency threshold is not met. - if (async_mode_ && !*is_stable && DetermineStability(load_status, false)) { - std::cerr << "Warning: Request latency is not stabilizing. " - "Please try lowering the request rate." - << std::endl; - *is_stable = true; - } - if (should_collect_metrics_) { metrics_manager_->StopQueryingMetrics(); } @@ -836,8 +816,7 @@ InferenceProfiler::ProfileHelper( } bool -InferenceProfiler::DetermineStability( - LoadStatus& load_status, bool check_latency) +InferenceProfiler::DetermineStability(LoadStatus& load_status) { bool stable = false; if (load_status.infer_per_sec.size() >= load_parameters_.stability_window) { @@ -851,17 +830,16 @@ InferenceProfiler::DetermineStability( } } - stable = stable && CheckWindowForStability(idx, load_status, check_latency); + stable = stable && CheckWindowForStability(idx, load_status); } return stable; } bool -InferenceProfiler::CheckWindowForStability( - size_t idx, LoadStatus& load_status, bool check_latency) +InferenceProfiler::CheckWindowForStability(size_t idx, LoadStatus& load_status) { return IsInferWindowStable(idx, load_status) && - (!check_latency || IsLatencyWindowStable(idx, load_status)); + IsLatencyWindowStable(idx, load_status); } bool @@ -888,8 +866,6 @@ InferenceProfiler::IsLatencyWindowStable(size_t idx, LoadStatus& load_status) double max_latency = *latencies_per_sec_measurements.second; double min_latency = *latencies_per_sec_measurements.first; - auto is_stable = - max_latency / min_latency <= 1 + load_parameters_.stability_threshold; return max_latency / min_latency <= 1 + load_parameters_.stability_threshold; } @@ -1178,7 +1154,8 @@ InferenceProfiler::GetServerSideStatus( // Used for measurement cb::Error -InferenceProfiler::Measure(PerfStatus& perf_status, MeasureConfig config) +InferenceProfiler::Measure( + PerfStatus& perf_status, uint64_t measurement_window, bool is_count_based) { std::map start_status; std::map end_status; @@ -1215,10 +1192,10 @@ InferenceProfiler::Measure(PerfStatus& perf_status, MeasureConfig config) } } - if (!config.is_count_based) { + if (!is_count_based) { // Wait for specified time interval in msec std::this_thread::sleep_for( - std::chrono::milliseconds((uint64_t)(config.measurement_window * 1.2))); + std::chrono::milliseconds((uint64_t)(measurement_window_ms_ * 1.2))); } else { do { // Check the health of the worker threads. @@ -1226,7 +1203,7 @@ InferenceProfiler::Measure(PerfStatus& perf_status, MeasureConfig config) // Wait for 1s until enough samples have been collected. 
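CheckWindowForStability and IsLatencyWindowStable in the hunk above reduce to a single ratio test over a sliding window of measurements: a window counts as stable when max / min stays within 1 + stability_threshold. Below is a small Python sketch of that test, reusing sample values that appear in test_inference_profiler.cc later in this diff; the function name is illustrative.

def window_is_stable(samples, stability_threshold):
    # stable when the spread across the window is within the threshold:
    # max(samples) / min(samples) <= 1 + stability_threshold
    return max(samples) / min(samples) <= 1 + stability_threshold

# throughput window from test_determine_stability: 520 / 500 = 1.04 <= 1.1
assert window_is_stable([500.0, 520.0, 510.0], 0.1)
# latency window from the removed subcase: 112 / 100 = 1.12 > 1.1
assert not window_is_stable([100, 106, 112], 0.1)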
std::this_thread::sleep_for(std::chrono::milliseconds((uint64_t)1000)); - } while (manager_->CountCollectedRequests() < config.measurement_window); + } while (manager_->CountCollectedRequests() < measurement_window); } uint64_t window_end_ns = @@ -1257,7 +1234,7 @@ InferenceProfiler::Measure(PerfStatus& perf_status, MeasureConfig config) RETURN_IF_ERROR(Summarize( start_status, end_status, start_stat, end_stat, perf_status, - window_start_ns, window_end_ns, config.clamp_window)); + window_start_ns, window_end_ns)); return cb::Error::Success; } @@ -1267,8 +1244,7 @@ InferenceProfiler::Summarize( const std::map& start_status, const std::map& end_status, const cb::InferStat& start_stat, const cb::InferStat& end_stat, - PerfStatus& summary, uint64_t window_start_ns, uint64_t window_end_ns, - bool clamp_window) + PerfStatus& summary, uint64_t window_start_ns, uint64_t window_end_ns) { size_t valid_sequence_count = 0; size_t delayed_request_count = 0; @@ -1276,19 +1252,13 @@ InferenceProfiler::Summarize( // Get measurement from requests that fall within the time interval std::pair valid_range{window_start_ns, window_end_ns}; + uint64_t window_duration_ns = valid_range.second - valid_range.first; std::vector latencies; std::vector valid_requests{}; ValidLatencyMeasurement( valid_range, valid_sequence_count, delayed_request_count, &latencies, response_count, valid_requests); - - if (clamp_window) { - auto [start, end] = ClampWindow(valid_requests); - } - - uint64_t window_duration_ns = window_end_ns - window_start_ns; - if (should_collect_profile_data_) { CollectData( summary, window_start_ns, window_end_ns, std::move(valid_requests)); @@ -1381,24 +1351,6 @@ InferenceProfiler::ValidLatencyMeasurement( std::sort(valid_latencies->begin(), valid_latencies->end()); } -std::pair -InferenceProfiler::ClampWindow(std::vector& requests) -{ - auto earliest_start = - std::chrono::time_point::max(); - auto latest_end = std::chrono::time_point::min(); - - for (auto x : requests) { - earliest_start = std::min(earliest_start, x.start_time_); - latest_end = std::max(latest_end, x.response_timestamps_.back()); - } - - return std::make_pair( - earliest_start.time_since_epoch().count(), - latest_end.time_since_epoch().count()); -} - - void InferenceProfiler::CollectData( PerfStatus& summary, uint64_t window_start_ns, uint64_t window_end_ns, diff --git a/src/c++/perf_analyzer/inference_profiler.h b/src/c++/perf_analyzer/inference_profiler.h index a73651319..013dd0483 100644 --- a/src/c++/perf_analyzer/inference_profiler.h +++ b/src/c++/perf_analyzer/inference_profiler.h @@ -77,13 +77,6 @@ struct LoadStatus { uint64_t avg_latency = 0; }; -/// Configuration for the Measure function -struct MeasureConfig { - uint64_t measurement_window{0}; - bool is_count_based{false}; - bool clamp_window{false}; -}; - // Holds the total of the timiming components of composing models of an // ensemble. 
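Alongside the MeasureConfig struct removed just above, this change drops the window clamping path: the ClampWindow helper (its implementation appears earlier in this diff, its declaration and unit test later) shrank a measurement window to run from the earliest request start to the latest final response. A minimal Python sketch of that reduction, using the nanosecond offsets from the deleted "clamp window" test case:

def clamp_window(requests):
    # requests: list of (start_ns, [response_ns, ...]) pairs
    earliest_start = min(start for start, _ in requests)
    latest_end = max(responses[-1] for _, responses in requests)
    return earliest_start, latest_end

# request starts at 5, 3, 7 ns with final responses at 20, 15, 17 ns
reqs = [(5, [20]), (3, [15]), (7, [17])]
assert clamp_window(reqs) == (3, 20)  # the values CHECKed in the removed test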
struct EnsembleDurations { @@ -267,7 +260,6 @@ class InferenceProfiler { uint64_t measurement_request_count, MeasurementMode measurement_mode, std::shared_ptr mpi_driver, const uint64_t metrics_interval_ms, const bool should_collect_metrics, const double overhead_pct_threshold, - const bool async_mode, const std::shared_ptr collector, const bool should_collect_profile_data); @@ -371,7 +363,7 @@ class InferenceProfiler { std::unique_ptr manager, uint64_t measurement_request_count, MeasurementMode measurement_mode, std::shared_ptr mpi_driver, const uint64_t metrics_interval_ms, const bool should_collect_metrics, - const double overhead_pct_threshold, const bool async_mode, + const double overhead_pct_threshold, const std::shared_ptr collector, const bool should_collect_profile_data); @@ -440,9 +432,8 @@ class InferenceProfiler { /// A helper function to determine if profiling is stable /// \param load_status Stores the observations of infer_per_sec and latencies - /// \param check_latency Whether to check latency for stability /// \return Returns if the threshold and latencies are stable. - bool DetermineStability(LoadStatus& load_status, bool check_latency = true); + bool DetermineStability(LoadStatus& load_status); /// Check if latency at index idx is within the latency threshold /// \param idx index in latency vector @@ -461,10 +452,8 @@ class InferenceProfiler { /// for a single window starting at idx /// \param idx index in latency vector /// \param load_status Stores the observations of infer_per_sec and latencies - /// \param check_latency Whether to check latency for stability /// \return Returns whether inference and latency are stable - bool CheckWindowForStability( - size_t idx, LoadStatus& load_status, bool check_latency); + bool CheckWindowForStability(size_t idx, LoadStatus& load_status); /// Check if observed inferences are within threshold /// for a single window starting at idx @@ -482,9 +471,14 @@ class InferenceProfiler { /// Helper function to perform measurement. /// \param status_summary The summary of this measurement. - /// \param config The configuration for measurement. + /// \param measurement_window Indicating the number of requests or the + /// duration in milliseconds to collect requests. + /// \param is_count_based determines whether measurement_window is indicating + /// time or count. /// \return cb::Error object indicating success or failure. - cb::Error Measure(PerfStatus& status_summary, MeasureConfig config); + cb::Error Measure( + PerfStatus& status_summary, uint64_t measurement_window, + bool is_count_based); /// Gets the server side statistics /// \param model_status Returns the status of the models provided by @@ -503,15 +497,12 @@ class InferenceProfiler { /// \param summary Returns the summary of the measurement. /// \param window_start_ns The window start timestamp in nanoseconds. /// \param window_end_ns The window end timestamp in nanoseconds. - /// \param clamp_window If true, the actual window range is reduced to the - /// start of the first request to the final response. /// \return cb::Error object indicating success or failure. cb::Error Summarize( const std::map& start_status, const std::map& end_status, const cb::InferStat& start_stat, const cb::InferStat& end_stat, - PerfStatus& summary, uint64_t window_start_ns, uint64_t window_end_ns, - bool clamp_window); + PerfStatus& summary, uint64_t window_start_ns, uint64_t window_end_ns); /// \param valid_range The start and end timestamp of the measurement window. 
/// \param valid_sequence_count Returns the number of completed sequences @@ -527,13 +518,6 @@ class InferenceProfiler { std::vector* latencies, size_t& response_count, std::vector& valid_requests); - /// Clamp a window around a set of requests, from the earliest start time to - /// the latest response - /// \param requests A vector of requests to clamp the window around. - /// \return std::pair object containing of the window. - std::pair ClampWindow( - std::vector& requests); - /// Add the data from the request records to the Raw Data Collector /// \param perf_status PerfStatus of the current measurement /// \param window_start_ns The window start timestamp in nanoseconds. @@ -802,9 +786,6 @@ class InferenceProfiler { // Whether to collect profile data. bool should_collect_profile_data_{false}; - // Whether the client is operating in async mode. - const bool async_mode_{false}; - #ifndef DOCTEST_CONFIG_DISABLE friend NaggyMockInferenceProfiler; friend TestInferenceProfiler; diff --git a/src/c++/perf_analyzer/load_manager.cc b/src/c++/perf_analyzer/load_manager.cc index 1f648a7f4..ac9150a9d 100644 --- a/src/c++/perf_analyzer/load_manager.cc +++ b/src/c++/perf_analyzer/load_manager.cc @@ -218,8 +218,6 @@ LoadManager::InitManagerInputs( // Read provided data if (!user_data.empty()) { if (IsDirectory(user_data[0])) { - RETURN_IF_ERROR(data_loader_->ValidateIOExistsInModel( - parser_->Inputs(), parser_->Outputs(), user_data[0])); RETURN_IF_ERROR(data_loader_->ReadDataFromDir( parser_->Inputs(), parser_->Outputs(), user_data[0])); } else { diff --git a/src/c++/perf_analyzer/perf_analyzer.cc b/src/c++/perf_analyzer/perf_analyzer.cc index c10101e1c..b8b4de7ea 100644 --- a/src/c++/perf_analyzer/perf_analyzer.cc +++ b/src/c++/perf_analyzer/perf_analyzer.cc @@ -284,7 +284,7 @@ PerfAnalyzer::CreateAnalyzerObjects() params_->measurement_request_count, params_->measurement_mode, params_->mpi_driver, params_->metrics_interval_ms, params_->should_collect_metrics, params_->overhead_pct_threshold, - params_->async, collector_, !params_->profile_export_file.empty()), + collector_, !params_->profile_export_file.empty()), "failed to create profiler"); } @@ -311,16 +311,11 @@ PerfAnalyzer::PrerunReport() << std::endl; } - std::string stabilization_metric = "latency and throughput"; - if (params_->async) { - stabilization_metric = "throughput"; - } if (params_->percentile == -1) { - std::cout << " Stabilizing using average " << stabilization_metric - << std::endl; + std::cout << " Stabilizing using average latency" << std::endl; } else { - std::cout << " Stabilizing using p" << params_->percentile - << stabilization_metric << std::endl; + std::cout << " Stabilizing using p" << params_->percentile << " latency" + << std::endl; } if (params_->measurement_mode == pa::MeasurementMode::TIME_WINDOWS) { diff --git a/src/c++/perf_analyzer/tensor_data.h b/src/c++/perf_analyzer/tensor_data.h index 6f5cf7191..b989e4dc1 100644 --- a/src/c++/perf_analyzer/tensor_data.h +++ b/src/c++/perf_analyzer/tensor_data.h @@ -1,4 +1,4 @@ -// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -33,7 +33,6 @@ struct TensorData { const uint8_t* data_ptr{nullptr}; size_t batch1_size{0}; bool is_valid{false}; - std::string name; }; diff --git a/src/c++/perf_analyzer/test_dataloader.cc b/src/c++/perf_analyzer/test_dataloader.cc index 656571cb9..c9296fa3c 100644 --- a/src/c++/perf_analyzer/test_dataloader.cc +++ b/src/c++/perf_analyzer/test_dataloader.cc @@ -193,18 +193,6 @@ TEST_CASE("dataloader: ParseData: Misc error cases") expected_message = "missing tensor INPUT1 ( Location stream id: 0, step id: 0)"; } - SUBCASE("Invalid input") - { - json_str = R"({"data": - [{ - "INPUT1": [2], - "INVALID_INPUT": [2] - }] - })"; - expected_message = - "The input or output 'INVALID_INPUT' is not found in the model " - "configuration"; - } MockDataLoader dataloader; std::shared_ptr inputs = std::make_shared(); diff --git a/src/c++/perf_analyzer/test_inference_profiler.cc b/src/c++/perf_analyzer/test_inference_profiler.cc index 2941867fc..8ff39605b 100644 --- a/src/c++/perf_analyzer/test_inference_profiler.cc +++ b/src/c++/perf_analyzer/test_inference_profiler.cc @@ -81,17 +81,16 @@ class TestInferenceProfiler : public InferenceProfiler { ip.load_parameters_.stability_threshold = lp.stability_threshold; ip.load_parameters_.stability_window = lp.stability_window; - return ip.CheckWindowForStability(idx, ls, true); + return ip.CheckWindowForStability(idx, ls); }; - static bool TestDetermineStability( - LoadStatus& ls, LoadParams& lp, bool check_latency = true) + static bool TestDetermineStability(LoadStatus& ls, LoadParams& lp) { InferenceProfiler ip; ip.load_parameters_.stability_threshold = lp.stability_threshold; ip.load_parameters_.stability_window = lp.stability_window; - return ip.DetermineStability(ls, check_latency); + return ip.DetermineStability(ls); } static bool TestIsDoneProfiling( @@ -107,11 +106,6 @@ class TestInferenceProfiler : public InferenceProfiler { return ip.IsDoneProfiling(ls, &is_stable); }; - std::pair ClampWindow(std::vector& reqs) - { - return InferenceProfiler::ClampWindow(reqs); - } - cb::Error MergeMetrics( const std::vector>& all_metrics, Metrics& merged_metrics) @@ -355,16 +349,6 @@ TEST_CASE("test_determine_stability") ls.infer_per_sec = {500.0, 520.0, 510.0}; CHECK(TestInferenceProfiler::TestDetermineStability(ls, lp) == true); } - - SUBCASE("test determine stability without latency check") - { - ls.infer_per_sec = {500.0, 520.0, 510.0}; - ls.latencies = {100, 106, 112}; - lp.stability_window = 3; - lp.stability_threshold = 0.1; - uint64_t latency_threshold_ms = 1; - CHECK(TestInferenceProfiler::TestDetermineStability(ls, lp, false) == true); - } } TEST_CASE("test_is_done_profiling") @@ -1065,41 +1049,6 @@ TEST_CASE( } } -TEST_CASE("clamp window") -{ - TestInferenceProfiler tip{}; - std::vector reqs{}; - - auto clock_epoch{std::chrono::time_point()}; - - auto request1_timestamp{clock_epoch + std::chrono::nanoseconds(5)}; - auto response1_timestamp{clock_epoch + std::chrono::nanoseconds(20)}; - - reqs.emplace_back( - request1_timestamp, - std::vector>{ - response1_timestamp}); - - auto request2_timestamp{clock_epoch + std::chrono::nanoseconds(3)}; - auto response2_timestamp{clock_epoch + std::chrono::nanoseconds(15)}; - reqs.emplace_back( - request2_timestamp, - std::vector>{ - response2_timestamp}); - - auto request3_timestamp{clock_epoch + std::chrono::nanoseconds(7)}; - auto response3_timestamp{clock_epoch + 
std::chrono::nanoseconds(17)}; - reqs.emplace_back( - request3_timestamp, - std::vector>{ - response3_timestamp}); - - auto window = tip.ClampWindow(reqs); - - CHECK(window.first == 3); - CHECK(window.second == 20); -} - TEST_CASE("summarize_client_stat: testing the SummarizeClientStat function") { MockInferenceProfiler mock_inference_profiler{}; diff --git a/src/c++/perf_analyzer/test_request_rate_manager.cc b/src/c++/perf_analyzer/test_request_rate_manager.cc index 07b9016dd..e4870b95b 100644 --- a/src/c++/perf_analyzer/test_request_rate_manager.cc +++ b/src/c++/perf_analyzer/test_request_rate_manager.cc @@ -509,7 +509,7 @@ class TestRequestRateManager : public TestLoadManagerBase, // CHECK( delay_average == - doctest::Approx(expected_delay_average).epsilon(0.1)); + doctest::Approx(expected_delay_average).epsilon(0.01)); CHECK_LT(delay_variance, max_allowed_delay_variance); } else { throw std::invalid_argument("Unexpected distribution type"); @@ -1008,22 +1008,21 @@ TEST_CASE( ModelTensor model_tensor2 = model_tensor1; model_tensor2.name_ = "INPUT2"; + std::string json_str{R"({ + "data": [ + { "INPUT1": [1], "INPUT2": [21] }, + { "INPUT1": [2], "INPUT2": [22] }, + { "INPUT1": [3], "INPUT2": [23] } + ]})"}; + size_t num_requests = 4; size_t num_threads = 1; - std::string json_str; const auto& ParameterizeTensors{[&]() { SUBCASE("one tensor") { tensors.push_back(model_tensor1); - json_str = R"({ - "data": [ - { "INPUT1": [1] }, - { "INPUT1": [2] }, - { "INPUT1": [3] } - ]})"; - switch (params.batch_size) { case 1: expected_results = {{1}, {2}, {3}, {1}}; @@ -1044,13 +1043,6 @@ TEST_CASE( tensors.push_back(model_tensor1); tensors.push_back(model_tensor2); - json_str = R"({ - "data": [ - { "INPUT1": [1], "INPUT2": [21] }, - { "INPUT1": [2], "INPUT2": [22] }, - { "INPUT1": [3], "INPUT2": [23] } - ]})"; - switch (params.batch_size) { case 1: expected_results = {{1, 21}, {2, 22}, {3, 23}, {1, 21}}; diff --git a/src/python/library/requirements/requirements.txt b/src/python/library/requirements/requirements.txt index b53763f38..6f84e21f9 100644 --- a/src/python/library/requirements/requirements.txt +++ b/src/python/library/requirements/requirements.txt @@ -1,4 +1,4 @@ -# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -24,6 +24,6 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -numpy>=1.19.1,<2 +numpy>=1.19.1 python-rapidjson>=0.9.1 urllib3>=2.0.7 diff --git a/src/python/library/requirements/requirements_grpc.txt b/src/python/library/requirements/requirements_grpc.txt index fd7ebe67d..ea9fb9bec 100644 --- a/src/python/library/requirements/requirements_grpc.txt +++ b/src/python/library/requirements/requirements_grpc.txt @@ -1,4 +1,4 @@ -# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -28,7 +28,7 @@ # use known working version until the memory leak is resolved in the future # (see https://github.com/grpc/grpc/issues/28513) grpcio>=1.41.0 -numpy>=1.19.1,<2 +numpy>=1.19.1 packaging>=14.1 protobuf>=3.5.0,<5 python-rapidjson>=0.9.1 diff --git a/src/python/library/requirements/requirements_http.txt b/src/python/library/requirements/requirements_http.txt index febc32a3f..6e1906967 100644 --- a/src/python/library/requirements/requirements_http.txt +++ b/src/python/library/requirements/requirements_http.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -26,5 +26,5 @@ aiohttp>=3.8.1,<4.0.0 geventhttpclient>=1.4.4,<=2.0.2 -numpy>=1.19.1,<2 +numpy>=1.19.1 python-rapidjson>=0.9.1
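Earlier in this diff, test_request_rate_manager.cc hard-codes three dataset entries per input tensor and expects a fourth request to wrap back to the first entry ({1, 21}, {2, 22}, {3, 23}, {1, 21}). A short Python sketch of that wrap-around indexing, assuming one dataset entry per request as in the batch-size-1 subcase; the names are illustrative and the real selection logic lives in the C++ data loader.

data = [
    {"INPUT1": 1, "INPUT2": 21},
    {"INPUT1": 2, "INPUT2": 22},
    {"INPUT1": 3, "INPUT2": 23},
]

def inputs_for_request(i, dataset):
    # wrap around the dataset when there are more requests than entries
    return dataset[i % len(dataset)]

picked = [inputs_for_request(i, data) for i in range(4)]
assert [tuple(d.values()) for d in picked] == [(1, 21), (2, 22), (3, 23), (1, 21)]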