Skip to content

Commit

Permalink
max_num_seqs export to benchmark script
Browse files — browse the repository at this point in the history
  • Loading branch information
seungrokj committed Oct 11, 2024
1 parent 96284e6 commit 9f32da2
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 7 deletions.
5 changes: 5 additions & 0 deletions benchmarks/benchmark_latency.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ def main(args: argparse.Namespace):
otlp_traces_endpoint=args.otlp_traces_endpoint,
enable_prefix_caching=args.enable_prefix_caching,
num_scheduler_steps=args.num_scheduler_steps,
max_seq_len_to_capture=args.max_seq_len_to_capture,
)

sampling_params = SamplingParams(
Expand Down Expand Up @@ -284,5 +285,9 @@ def run_to_completion(profile_dir: Optional[str] = None):
type=int,
default=1,
help="Maximum number of forward steps per scheduler call.")
parser.add_argument(
"--max-seq-len-to-capture",
type=int,
help="Maximum sequence length covered by CUDA/HIP graph.")
args = parser.parse_args()
main(args)
19 changes: 12 additions & 7 deletions benchmarks/benchmark_throughput.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ def run_vllm(
distributed_executor_backend=distributed_executor_backend,
load_format=load_format,
num_scheduler_steps=num_scheduler_steps,
max_seq_len_to_capture=args.max_seq_len_to_capture,
max_num_seqs=args.max_num_seqs,
use_v2_block_manager=use_v2_block_manager,
disable_async_output_proc=disable_async_output_proc,
Expand Down Expand Up @@ -468,17 +469,21 @@ def main(args: argparse.Namespace):
choices=DEVICE_OPTIONS,
help='device type for vLLM execution')
parser.add_argument(
"--num-scheduler-steps",
type=int,
default=1,
help="Maximum number of forward steps per scheduler call.")
parser.add_argument("--use-v2-block-manager",
action='store_true',
help="Enable block manager v2.")
"--num-scheduler-steps",
type=int,
default=1,
help="Maximum number of forward steps per scheduler call.")
parser.add_argument(
"--max-seq-len-to-capture",
type=int,
help="Maximum sequence length covered by CUDA/HIP graph.")
parser.add_argument('--max-num-seqs',
type=int,
default=256,
help="Max number of sequences a model can run in a single batch")
parser.add_argument("--use-v2-block-manager",
action='store_true',
help="Enable block manager v2.")
parser.add_argument(
"--enable-prefix-caching",
action='store_true',
Expand Down

0 comments on commit 9f32da2

Please sign in to comment.