Skip to content

Commit

Permalink
sequence_start -> add_bos
Browse files Browse the repository at this point in the history
  • Loading branch information
AllentDan committed Nov 7, 2023
1 parent 8a555bd commit f7b8ca2
Showing 1 changed file with 5 additions and 6 deletions.
11 changes: 5 additions & 6 deletions lmdeploy/tokenizer.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -52,15 +52,14 @@ def _maybe_add_prefix_space(self, tokens, decoded):
else:
return decoded

- def encode(self, s: str, sequence_start: bool = True, **kwargs):
+ def encode(self, s: str, add_bos: bool = True, **kwargs):
"""Tokenize a prompt.
Args:
s (str): a prompt
Returns:
list[int]: token ids
"""
- add_bos = sequence_start
return self.model.Encode(s, add_bos=add_bos, **kwargs)

def decode(self, t: Sequence[int], offset: Optional[int] = None):
Expand Down Expand Up @@ -166,7 +165,7 @@ def _maybe_add_prefix_space(self, tokens, decoded):
else:
return decoded

- def encode(self, s: str, sequence_start: bool = True, **kwargs):
+ def encode(self, s: str, add_bos: bool = True, **kwargs):
"""Tokenize a prompt.
Args:
Expand All @@ -175,7 +174,7 @@ def encode(self, s: str, sequence_start: bool = True, **kwargs):
list[int]: token ids
"""
encoded = self.model.encode(s, **kwargs)
- if not sequence_start:
+ if not add_bos:
# in the middle of a session
if len(encoded) and encoded[0] == self.bos_token_id:
encoded = encoded[1:]
Expand Down Expand Up @@ -250,15 +249,15 @@ def eos_token_id(self):
"""end of the sentence token id."""
return self.model.eos_token_id

- def encode(self, s: str, sequence_start: bool = True, **kwargs):
+ def encode(self, s: str, add_bos: bool = True, **kwargs):
"""Tokenize a prompt.
Args:
s (str): a prompt
Returns:
list[int]: token ids
"""
- return self.model.encode(s, sequence_start, **kwargs)
+ return self.model.encode(s, add_bos, **kwargs)

def decode(self, t: Sequence[int], offset: Optional[int] = None):
"""De-tokenize.
Expand Down

0 comments on commit f7b8ca2

Please sign in to comment.