diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py
index d0e579fd20..d924b30da7 100644
--- a/lmdeploy/serve/async_engine.py
+++ b/lmdeploy/serve/async_engine.py
@@ -200,7 +200,7 @@ async def generate(
         prompt = messages
         if do_preprocess:
             prompt = self.model.messages2prompt(prompt, sequence_start)
-        input_ids = self.tokenizer.encode(prompt, sequence_start)
+        input_ids = self.tokenizer.encode(prompt, add_bos=sequence_start)
         finish_reason = 'stop' if stop else None
         if self.steps[str(session_id)] + len(
                 input_ids) + request_output_len >= self.tm_model.session_len:
diff --git a/lmdeploy/serve/turbomind/chatbot.py b/lmdeploy/serve/turbomind/chatbot.py
index 19a11a6159..ab7f30a003 100644
--- a/lmdeploy/serve/turbomind/chatbot.py
+++ b/lmdeploy/serve/turbomind/chatbot.py
@@ -459,7 +459,7 @@ def _stream_infer(self,
         session.sequence_length = 0
         input_ids, input_lengths = self.preprocess(prompt)

-        # got input_ids with default sequence_start == True
+        # got input_ids with default add_bos == True
         if not sequence_start and input_ids[0][0] == self.bos_id:
             input_ids = input_ids[:, 1:]
             input_lengths = input_lengths - 1
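
For context, a minimal sketch of the pitfall the async_engine.py hunk fixes: passing sequence_start positionally is only correct if add_bos happens to be the second positional parameter of Tokenizer.encode, whereas the keyword form is robust to signature changes and makes the intent explicit. The Tokenizer class, bos_id value, and encode signature below are illustrative assumptions for the sketch, not the actual lmdeploy implementation:

    class Tokenizer:
        bos_id = 1

        def encode(self, s, add_bos=True):
            # Hypothetical tokenizer: prepend BOS only when add_bos is truthy.
            ids = [ord(c) for c in s]  # stand-in for real tokenization
            return [self.bos_id] + ids if add_bos else ids

    tok = Tokenizer()
    sequence_start = False  # continuation turn: no BOS wanted

    # Keyword form pins the boolean to the intended parameter.
    ids = tok.encode('hi', add_bos=sequence_start)
    assert ids[0] != tok.bos_id  # BOS is not prepended on a continuation turn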