diff --git a/06_gpu_and_ml/llm-serving/vllm_gemma.py b/06_gpu_and_ml/llm-serving/vllm_gemma.py
index 2a3545961..634c6d47a 100644
--- a/06_gpu_and_ml/llm-serving/vllm_gemma.py
+++ b/06_gpu_and_ml/llm-serving/vllm_gemma.py
@@ -121,7 +121,7 @@ class Model:
     @modal.enter()
     def load(self):
         self.template = (
-            "start_of_turn>user\n{user}<end_of_turn>\n<start_of_turn>model"
+            "<start_of_turn>user\n{user}<end_of_turn>\n<start_of_turn>model\n"
         )
 
         # Load the model. Tip: Some models, like MPT, may require `trust_remote_code=true`.
diff --git a/06_gpu_and_ml/llm-serving/vllm_inference.py b/06_gpu_and_ml/llm-serving/vllm_inference.py
index 3f67aa908..c24e345db 100644
--- a/06_gpu_and_ml/llm-serving/vllm_inference.py
+++ b/06_gpu_and_ml/llm-serving/vllm_inference.py
@@ -109,11 +109,11 @@ class Model:
     def load_model(self):
         # Tip: models that are not fully implemented by Hugging Face may require `trust_remote_code=true`.
         self.llm = vllm.LLM(MODEL_DIR, tensor_parallel_size=GPU_CONFIG.count)
-        self.template = """<s>[INST] <<SYS>>
+        self.template = """[INST] <<SYS>>
 {system}
 <</SYS>>
 
-{user} [/INST] """
+{user} [/INST]"""
 
     @modal.method()
     def generate(self, user_questions):
diff --git a/06_gpu_and_ml/llm-serving/vllm_mixtral.py b/06_gpu_and_ml/llm-serving/vllm_mixtral.py
index 57618ae28..eb236b9cb 100644
--- a/06_gpu_and_ml/llm-serving/vllm_mixtral.py
+++ b/06_gpu_and_ml/llm-serving/vllm_mixtral.py
@@ -121,7 +121,7 @@ def start_engine(self):
             disable_log_stats=True,  # disable logging so we can stream tokens
             disable_log_requests=True,
         )
-        self.template = "<s> [INST] {user} [/INST] "
+        self.template = "[INST] {user} [/INST]"
 
         # this can take some time!
         self.engine = AsyncLLMEngine.from_engine_args(engine_args)
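
Note (not part of the patch): the Llama 2 and Mixtral changes drop the hard-coded "<s>" BOS marker and stray whitespace around "[INST]", since vLLM's tokenizer already prepends the BOS token and a duplicated BOS can degrade output; the Gemma change repairs a truncated "<start_of_turn>" tag and adds the trailing newline the Gemma chat format expects. A minimal sketch of how a corrected template is filled in before being handed to vLLM, using the Llama 2 template from vllm_inference.py; the system and user strings are invented examples:

# Sketch only, assuming the post-fix template from vllm_inference.py.
template = """[INST] <<SYS>>
{system}
<</SYS>>

{user} [/INST]"""

# The formatted prompt contains no "<s>": vLLM adds the BOS token
# itself during tokenization, so hard-coding it would duplicate it.
prompt = template.format(
    system="You are a helpful assistant.",  # invented example
    user="What is the capital of France?",  # invented example
)
print(prompt)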