diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index 7ba421772ef40..0a2237c85719a 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -48,6 +48,7 @@ def main(args: argparse.Namespace):
         otlp_traces_endpoint=args.otlp_traces_endpoint,
         enable_prefix_caching=args.enable_prefix_caching,
         num_scheduler_steps=args.num_scheduler_steps,
+        max_seq_len_to_capture=args.max_seq_len_to_capture,
     )

     sampling_params = SamplingParams(
@@ -284,5 +285,9 @@ def run_to_completion(profile_dir: Optional[str] = None):
         type=int,
         default=1,
         help="Maximum number of forward steps per scheduler call.")
+    parser.add_argument(
+        "--max-seq-len-to-capture",
+        type=int,
+        help="Maximum sequence length covered by CUDA/HIP graph.")
     args = parser.parse_args()
     main(args)
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index eae1921e4c5f1..5c2bba0dba552 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -113,6 +113,7 @@ def run_vllm(
         distributed_executor_backend=distributed_executor_backend,
         load_format=load_format,
         num_scheduler_steps=num_scheduler_steps,
+        max_seq_len_to_capture=args.max_seq_len_to_capture,
         max_num_seqs=args.max_num_seqs,
         use_v2_block_manager=use_v2_block_manager,
         disable_async_output_proc=disable_async_output_proc,
@@ -468,17 +469,21 @@ def main(args: argparse.Namespace):
                         choices=DEVICE_OPTIONS,
                         help='device type for vLLM execution')
     parser.add_argument(
-        "--num-scheduler-steps",
-        type=int,
-        default=1,
-        help="Maximum number of forward steps per scheduler call.")
-    parser.add_argument("--use-v2-block-manager",
-                        action='store_true',
-                        help="Enable block manager v2.")
+        "--num-scheduler-steps",
+        type=int,
+        default=1,
+        help="Maximum number of forward steps per scheduler call.")
+    parser.add_argument(
+        "--max-seq-len-to-capture",
+        type=int,
+        help="Maximum sequence length covered by CUDA/HIP graph.")
     parser.add_argument('--max-num-seqs',
                         type=int,
                         default=256,
                         help="Max number of sequences a model can run in a single batch")
+    parser.add_argument("--use-v2-block-manager",
+                        action='store_true',
+                        help="Enable block manager v2.")
     parser.add_argument(
         "--enable-prefix-caching",
         action='store_true',
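
For context, the new `--max-seq-len-to-capture` flag is forwarded to vLLM's existing `max_seq_len_to_capture` engine argument, which sets the longest sequence length covered by the captured CUDA/HIP graphs; sequences beyond it fall back to eager execution. Below is a minimal sketch of the equivalent direct API call that the benchmark scripts now make; the model name and the value 4096 are illustrative, not part of this change:

```python
from vllm import LLM, SamplingParams

# Capture CUDA/HIP graphs for sequences up to 4096 tokens; longer
# sequences run in eager mode. This mirrors what the benchmark scripts
# pass through from the new --max-seq-len-to-capture flag.
llm = LLM(model="facebook/opt-125m", max_seq_len_to_capture=4096)

params = SamplingParams(temperature=0.0, max_tokens=16)
outputs = llm.generate(["Hello, my name is"], params)
print(outputs[0].outputs[0].text)
```

From the command line, the same value would be supplied as, e.g., `python benchmarks/benchmark_latency.py --max-seq-len-to-capture 4096`; when the flag is omitted, the engine's default applies.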