max_num_seqs export to benchmark scripts

ROCm · Oct 11, 2024 · 9f32da2 · 9f32da2
1 parent 96284e6
commit 9f32da2
Show file tree

Hide file tree

Showing 2 changed files with 17 additions and 7 deletions.
diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
@@ -48,6 +48,7 @@ def main(args: argparse.Namespace):
         otlp_traces_endpoint=args.otlp_traces_endpoint,
         enable_prefix_caching=args.enable_prefix_caching,
         num_scheduler_steps=args.num_scheduler_steps,
+        max_seq_len_to_capture=args.max_seq_len_to_capture,
     )
 
     sampling_params = SamplingParams(
@@ -284,5 +285,9 @@ def run_to_completion(profile_dir: Optional[str] = None):
         type=int,
         default=1,
         help="Maximum number of forward steps per scheduler call.")
+    parser.add_argument(
+        "--max-seq-len-to-capture",
+        type=int,
+        help="Maximum sequence length covered by CUDA/HIP graph.")
     args = parser.parse_args()
     main(args)
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
@@ -113,6 +113,7 @@ def run_vllm(
         distributed_executor_backend=distributed_executor_backend,
         load_format=load_format,
         num_scheduler_steps=num_scheduler_steps,
+        max_seq_len_to_capture=args.max_seq_len_to_capture,
         max_num_seqs=args.max_num_seqs,
         use_v2_block_manager=use_v2_block_manager,
         disable_async_output_proc=disable_async_output_proc,
@@ -468,17 +469,21 @@ def main(args: argparse.Namespace):
                         choices=DEVICE_OPTIONS,
                         help='device type for vLLM execution')
     parser.add_argument(
-        "--num-scheduler-steps",
-        type=int,
-        default=1,
-        help="Maximum number of forward steps per scheduler call.")
-    parser.add_argument("--use-v2-block-manager",
-                        action='store_true',
-                        help="Enable block manager v2.")
+                        "--num-scheduler-steps",
+                        type=int,
+                        default=1,
+                        help="Maximum number of forward steps per scheduler call.")
+    parser.add_argument(
+                        "--max-seq-len-to-capture",
+                        type=int,
+                        help="Maximum sequence length covered by CUDA/HIP graph.")
     parser.add_argument('--max-num-seqs',
                         type=int,
                         default=256,
                         help="Max number of sequences a model can run in a single batch")
+    parser.add_argument("--use-v2-block-manager",
+                        action='store_true',
+                        help="Enable block manager v2.")
     parser.add_argument(
         "--enable-prefix-caching",
         action='store_true',