From 2aa6f99df8760b1b0ced0d383a5711d74398d4c9 Mon Sep 17 00:00:00 2001
From: baoyf4244
Date: Wed, 3 Jul 2024 21:59:14 +0800
Subject: [PATCH] vLLM exposes a max_model_len parameter; the worker's
 context_len will not match it at startup if the user overrides
 max_model_len.

---
 fastchat/serve/vllm_worker.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fastchat/serve/vllm_worker.py b/fastchat/serve/vllm_worker.py
index 0af680bb5..7e540b1f4 100644
--- a/fastchat/serve/vllm_worker.py
+++ b/fastchat/serve/vllm_worker.py
@@ -22,7 +22,7 @@
     logger,
     worker_id,
 )
-from fastchat.utils import get_context_length, is_partial_stop
+from fastchat.utils import is_partial_stop
 
 app = FastAPI()
 
@@ -59,7 +59,7 @@ def __init__(
         # and llm_engine.engine.tokenizer was no longer a raw tokenizer
         if hasattr(self.tokenizer, "tokenizer"):
             self.tokenizer = llm_engine.engine.tokenizer.tokenizer
-        self.context_len = get_context_length(llm_engine.engine.model_config.hf_config)
+        self.context_len = llm_engine.engine.model_config.max_model_len
 
         if not no_register:
             self.init_heart_beat()
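
Note (reviewer illustration, not part of the patch): the sketch below shows why the two values can diverge. It assumes the AsyncEngineArgs/AsyncLLMEngine setup that vllm_worker.py already uses; the model name and the 2048 override are hypothetical.

```python
# Illustrative sketch only (not part of the patch). The model name and the
# 2048 override are hypothetical; the engine attributes used here are the
# same ones the patch touches.
from vllm import AsyncLLMEngine
from vllm.engine.arg_utils import AsyncEngineArgs

from fastchat.utils import get_context_length

engine_args = AsyncEngineArgs(
    model="meta-llama/Llama-2-7b-chat-hf",  # hypothetical model
    max_model_len=2048,  # user shrinks the context window at startup
)
llm_engine = AsyncLLMEngine.from_engine_args(engine_args)

# Before this patch: derived from the HF config, ignoring the override.
hf_len = get_context_length(llm_engine.engine.model_config.hf_config)

# After this patch: the length vLLM actually enforces for requests.
vllm_len = llm_engine.engine.model_config.max_model_len

print(hf_len, vllm_len)  # e.g. 4096 vs. 2048 for this hypothetical setup
```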