diff --git a/benchmark/profile_restful_api.py b/benchmark/profile_restful_api.py
index 032e2e6c28..3b9618f04d 100644
--- a/benchmark/profile_restful_api.py
+++ b/benchmark/profile_restful_api.py
@@ -34,8 +34,8 @@ def infer(server_addr: str, session_id: int, req_queue: Queue, res_que: Queue):
         first_token_latency = np.round(timestamps[1] - timestamps[0], 3)
         token_latency = np.round(timestamps[-1] - timestamps[0], 3)
-        generated_tokens = tokens[-1] - tokens[0]
-        total_tokens = tokens[-1]
+        generated_tokens = tokens[-1]
+        total_tokens = tokens[-1] + input_seqlen
         stats.append([
             first_token_latency, generated_tokens, total_tokens, token_latency
         ])
@@ -151,7 +151,7 @@ def main(server_addr: str,
     first_token_latency_max = np.max(stats[:, 0], axis=0)
     first_token_latency_ave = np.mean(stats[:, 0], axis=0)
     generated_token_throughput = np.sum(stats[:, 1], axis=0) / elapsed_time
-    total_token_throughput = np.sum(stats[:, 1], axis=0) / elapsed_time
+    total_token_throughput = np.sum(stats[:, 2], axis=0) / elapsed_time
     rqs = n_req / elapsed_time
     rqm = rqs * 60
diff --git a/benchmark/profile_serving.py b/benchmark/profile_serving.py
index d232eb255b..2277b0b30b 100644
--- a/benchmark/profile_serving.py
+++ b/benchmark/profile_serving.py
@@ -26,11 +26,10 @@ def infer(chatbot, session_id: int, req_que: mp.Queue, res_que: mp.Queue):
                 sequence_end=True):
             timestamps.append(time.perf_counter())
             tokens.append(token)
-        first_token_latency = np.round(timestamps[1] - timestamps[0], 3)
         token_latency = np.round(timestamps[-1] - timestamps[0], 3)
-        generated_tokens = tokens[-1] - tokens[0]
-        total_tokens = tokens[-1]
+        generated_tokens = tokens[-1]
+        total_tokens = tokens[-1] + input_seqlen
         stats.append([
             first_token_latency, generated_tokens, total_tokens, token_latency
         ])
@@ -164,7 +163,7 @@ def main(tritonserver_addr: str,
     first_token_latency_max = np.max(stats[:, 0], axis=0)
     first_token_latency_ave = np.mean(stats[:, 0], axis=0)
     generated_token_throughput = np.sum(stats[:, 1], axis=0) / elapsed_time
-    total_token_throughput = np.sum(stats[:, 1], axis=0) / elapsed_time
+    total_token_throughput = np.sum(stats[:, 2], axis=0) / elapsed_time
     rqs = n_req / elapsed_time
     rqm = rqs * 60
diff --git a/benchmark/profile_throughput.py b/benchmark/profile_throughput.py
index 394124f913..38e2e0303c 100644
--- a/benchmark/profile_throughput.py
+++ b/benchmark/profile_throughput.py
@@ -91,8 +91,8 @@ def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int):
                 tokens.append(token)
             first_token_latency = np.round(timestamps[1] - timestamps[0], 3)
             token_latency = np.round(timestamps[-1] - timestamps[0], 3)
-            generated_tokens = tokens[-1] - tokens[0]
-            total_tokens = tokens[-1]
+            generated_tokens = tokens[-1]
+            total_tokens = tokens[-1] + len(input_ids)
             stats.append([
                 first_token_latency, generated_tokens, total_tokens,
                 token_latency
@@ -140,7 +140,7 @@ def process_request(self, requests, concurrency: int = 1):
         first_token_latency_max = np.max(stats[:, 0], axis=0)
         first_token_latency_ave = np.mean(stats[:, 0], axis=0)
         generated_token_throughput = np.sum(stats[:, 1], axis=0) / elapsed_time
-        total_token_throughput = np.sum(stats[:, 1], axis=0) / elapsed_time
+        total_token_throughput = np.sum(stats[:, 2], axis=0) / elapsed_time
         rqs = len(requests) / elapsed_time
         rqm = rqs * 60
         print(
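
For reference, a minimal sketch (not part of the patch) of how the corrected stats columns feed the summary metrics, assuming each `stats` row is `[first_token_latency, generated_tokens, total_tokens, token_latency]` as in the patched scripts and `elapsed_time` is the wall-clock duration of the run; the sample numbers are hypothetical:

    import numpy as np

    # One row per finished request; column 1 counts completion tokens only,
    # column 2 adds the prompt length (input_seqlen / len(input_ids)).
    stats = np.array([
        [0.120, 128, 128 + 32, 2.5],   # 32 prompt tokens, 128 generated tokens
        [0.095, 256, 256 + 48, 4.8],   # 48 prompt tokens, 256 generated tokens
    ])
    elapsed_time = 5.0  # assumed wall-clock seconds for the whole benchmark

    # Summing different columns means the two throughput figures now differ,
    # instead of both reporting generated-token throughput.
    generated_token_throughput = np.sum(stats[:, 1]) / elapsed_time
    total_token_throughput = np.sum(stats[:, 2]) / elapsed_time
    print(generated_token_throughput, total_token_throughput)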