diff --git a/benchmark/profile_restful_api.py b/benchmark/profile_restful_api.py
index 607cc467ca..249245fc3a 100644
--- a/benchmark/profile_restful_api.py
+++ b/benchmark/profile_restful_api.py
@@ -54,7 +54,7 @@ def infer(server_addr: str, session_id: int, req_queue: mp.Queue,
             f'input_seqlen {input_seqlen}, output_seqlen {output_seqlen}')
         timestamps = []
         tokens = []
-        start = time.perf_counter()
+        timestamps.append(time.perf_counter())
         for res, token in get_streaming_response(
                 prompt,
                 server_addr,
@@ -65,7 +65,7 @@ def infer(server_addr: str, session_id: int, req_queue: mp.Queue,
             timestamps.append(time.perf_counter())
             tokens.append(token)
 
-        first_token_latency = timestamps[1] - start
+        first_token_latency = timestamps[1] - timestamps[0]
         token_latency = timestamps[-1] - timestamps[0]
         token = tokens[-1] - tokens[0]
         stats.append([first_token_latency, token, token_latency])
diff --git a/benchmark/profile_serving.py b/benchmark/profile_serving.py
index ea2edcf9f9..5a613dbfbe 100644
--- a/benchmark/profile_serving.py
+++ b/benchmark/profile_serving.py
@@ -17,7 +17,7 @@ def infer(chatbot, session_id: int, req_que: mp.Queue, res_que: mp.Queue):
                                                     [None, None, None]):
         timestamps = []
         tokens = []
-        start = time.perf_counter()
+        timestamps.append(time.perf_counter())
         for status, res, token in chatbot.stream_infer(
                 session_id,
                 prompt,
@@ -27,7 +27,7 @@ def infer(chatbot, session_id: int, req_que: mp.Queue, res_que: mp.Queue):
             timestamps.append(time.perf_counter())
             tokens.append(token)
 
-        first_token_latency = np.round(timestamps[1] - start, 3)
+        first_token_latency = np.round(timestamps[1] - timestamps[0], 3)
         token_latency = np.round(timestamps[-1] - timestamps[0], 3)
         token = tokens[-1] - tokens[0]
         stats.append([first_token_latency, token, token_latency])
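
Note on the change: before this patch, timestamps[0] was the arrival time of the first streamed chunk, so token_latency measured last chunk minus first chunk; the patch records the request start as timestamps[0], so first_token_latency and token_latency now share the same reference point and the separate start variable goes away. Below is a minimal, self-contained sketch of the resulting measurement pattern; fake_stream is a hypothetical stand-in for get_streaming_response / chatbot.stream_infer and yields a cumulative token count, as the scripts assume.

import time


def fake_stream(n_tokens: int, delay: float = 0.01):
    # Hypothetical streaming endpoint: yields a running token count,
    # sleeping briefly to mimic per-token generation latency.
    total = 0
    for _ in range(n_tokens):
        time.sleep(delay)
        total += 1
        yield total


timestamps = []
tokens = []
# Record the request start as timestamps[0], the pattern the patch
# adopts, so every latency below is a difference within one list.
timestamps.append(time.perf_counter())
for token in fake_stream(5):
    timestamps.append(time.perf_counter())
    tokens.append(token)

# timestamps[1] is the first streamed chunk; timestamps[0] is the start,
# so both latencies are measured from the same reference point.
first_token_latency = timestamps[1] - timestamps[0]
token_latency = timestamps[-1] - timestamps[0]
token = tokens[-1] - tokens[0]
print(f'first token: {first_token_latency:.3f}s, '
      f'total: {token_latency:.3f}s, tokens: {token}')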