diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 9fe0d0bb0a301..04b9e8032c0cf 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -56,7 +56,7 @@ class EngineArgs: quantization: Optional[str] = None enforce_eager: bool = False max_context_len_to_capture: Optional[int] = None - max_seq_len_to_capture: int = 8192 + max_seq_len_to_capture: int = 32768 disable_custom_all_reduce: bool = False tokenizer_pool_size: int = 0 tokenizer_pool_type: str = "ray" diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 6e971ae73f5d0..bbb5d31f0606a 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -115,7 +115,7 @@ def __init__( swap_space: int = 4, enforce_eager: bool = False, max_context_len_to_capture: Optional[int] = None, - max_seq_len_to_capture: int = 8192, + max_seq_len_to_capture: int = 32768, disable_custom_all_reduce: bool = False, **kwargs, ) -> None: