
Commit

fix
AllentDan committed Nov 6, 2023
1 parent 2e41177 commit 4bee521
Showing 3 changed files with 9 additions and 10 deletions.
6 changes: 3 additions & 3 deletions benchmark/profile_restful_api.py
@@ -34,8 +34,8 @@ def infer(server_addr: str, session_id: int, req_queue: Queue, res_que: Queue):

 first_token_latency = np.round(timestamps[1] - timestamps[0], 3)
 token_latency = np.round(timestamps[-1] - timestamps[0], 3)
-generated_tokens = tokens[-1] - tokens[0]
-total_tokens = tokens[-1]
+generated_tokens = tokens[-1]
+total_tokens = tokens[-1] + input_seqlen
 stats.append([
     first_token_latency, generated_tokens, total_tokens, token_latency
 ])
@@ -151,7 +151,7 @@ def main(server_addr: str,
 first_token_latency_max = np.max(stats[:, 0], axis=0)
 first_token_latency_ave = np.mean(stats[:, 0], axis=0)
 generated_token_throughput = np.sum(stats[:, 1], axis=0) / elapsed_time
-total_token_throughput = np.sum(stats[:, 1], axis=0) / elapsed_time
+total_token_throughput = np.sum(stats[:, 2], axis=0) / elapsed_time
 rqs = n_req / elapsed_time
 rqm = rqs * 60

7 changes: 3 additions & 4 deletions benchmark/profile_serving.py
@@ -26,11 +26,10 @@ def infer(chatbot, session_id: int, req_que: mp.Queue, res_que: mp.Queue):
 sequence_end=True):
 timestamps.append(time.perf_counter())
 tokens.append(token)
-
 first_token_latency = np.round(timestamps[1] - timestamps[0], 3)
 token_latency = np.round(timestamps[-1] - timestamps[0], 3)
-generated_tokens = tokens[-1] - tokens[0]
-total_tokens = tokens[-1]
+generated_tokens = tokens[-1]
+total_tokens = tokens[-1] + input_seqlen
 stats.append([
     first_token_latency, generated_tokens, total_tokens, token_latency
 ])
@@ -164,7 +163,7 @@ def main(tritonserver_addr: str,
 first_token_latency_max = np.max(stats[:, 0], axis=0)
 first_token_latency_ave = np.mean(stats[:, 0], axis=0)
 generated_token_throughput = np.sum(stats[:, 1], axis=0) / elapsed_time
-total_token_throughput = np.sum(stats[:, 1], axis=0) / elapsed_time
+total_token_throughput = np.sum(stats[:, 2], axis=0) / elapsed_time
 rqs = n_req / elapsed_time
 rqm = rqs * 60

6 changes: 3 additions & 3 deletions benchmark/profile_throughput.py
@@ -91,8 +91,8 @@ def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int):
 tokens.append(token)
 first_token_latency = np.round(timestamps[1] - timestamps[0], 3)
 token_latency = np.round(timestamps[-1] - timestamps[0], 3)
-generated_tokens = tokens[-1] - tokens[0]
-total_tokens = tokens[-1]
+generated_tokens = tokens[-1]
+total_tokens = tokens[-1] + len(input_ids)
 stats.append([
     first_token_latency, generated_tokens, total_tokens,
     token_latency
@@ -140,7 +140,7 @@ def process_request(self, requests, concurrency: int = 1):
 first_token_latency_max = np.max(stats[:, 0], axis=0)
 first_token_latency_ave = np.mean(stats[:, 0], axis=0)
 generated_token_throughput = np.sum(stats[:, 1], axis=0) / elapsed_time
-total_token_throughput = np.sum(stats[:, 1], axis=0) / elapsed_time
+total_token_throughput = np.sum(stats[:, 2], axis=0) / elapsed_time
 rqs = len(requests) / elapsed_time
 rqm = rqs * 60
 print(

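The same accounting fix appears in all three benchmark scripts: the final entry of tokens is now treated as the generated-token count, the prompt length is added back to form the total, and the two throughput figures sum different columns of the stats array. Below is a minimal, self-contained sketch of that computation; variable names mirror the diff, but the numeric values (a five-step stream, a 32-token prompt, 0.8 s elapsed) are invented here purely for illustration.

import numpy as np

# Toy per-request data, illustrative only: cumulative token counts reported
# by the streaming endpoint, the prompt length, and the benchmark wall time.
tokens = [1, 2, 3, 4, 5]   # cumulative generated-token count at each step
input_seqlen = 32          # prompt (input) length for this request
elapsed_time = 0.8         # wall-clock seconds for the whole run

# Per-request accounting after the fix: generated tokens are the final
# cumulative count, and the total adds the prompt length back in.
generated_tokens = tokens[-1]
total_tokens = tokens[-1] + input_seqlen

# stats rows are [first_token_latency, generated_tokens, total_tokens,
# token_latency]; a single-row example keeps the column layout visible.
stats = np.array([[0.05, generated_tokens, total_tokens, 0.40]])

# The fix makes the two throughput figures read different columns:
# column 1 (generated only) vs. column 2 (generated + prompt).
generated_token_throughput = np.sum(stats[:, 1], axis=0) / elapsed_time
total_token_throughput = np.sum(stats[:, 2], axis=0) / elapsed_time

print(generated_token_throughput, total_token_throughput)

With these toy numbers the generated throughput is 6.25 tokens/s and the total throughput 46.25 tokens/s, which is exactly the distinction the original code collapsed by summing column 1 for both metrics.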