Skip to content

Commit e57557d

Browse files
Fixed errors caused by upper-case model names, and updated the description (#82)
* Fix a GLM tokenizer bug. Signed-off-by: zhaohu xing <[email protected]>
* Update tokenizer.py. Signed-off-by: Anhforth <[email protected]>
1 parent dee25b7 commit e57557d

File tree

3 files changed

+8
-14
lines changed

3 files changed

+8
-14
lines changed

doc_zh/TUTORIAL_4_TRAINER.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -389,7 +389,6 @@ python train.py --test1=1
389389

390390
2. [glm-title-generation-env-trainer](https://github.com/FlagAI-Open/FlagAI/tree/master/examples/glm_title_generation/train_env_trainer.py)
391391

392-
393392
# 使用 pytorchDDP launcher 或 deepspeed launcher 运行
394393
如果你使用多个GPU来训练模型,你可以直接运行train.py来调用FlagAI训练器中的启动器。
395394
```commandline

flagai/auto_model/auto_loader.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@ def __getattr__(self, name):
100100
}
101101

102102

103+
103104
class AutoLoader:
104105

105106
def __init__(self,

flagai/data/tokenizer/uni_tokenizer/tokenizer.py

Lines changed: 7 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -33,16 +33,13 @@
3333

3434

3535
def is_control(ch):
36-
"""控制类字符判断
36+
"""
3737
https://en.wikipedia.org/wiki/Control_character
3838
https://www.fileformat.info/info/unicode/category/Cc/index.htm
3939
https://www.fileformat.info/info/unicode/category/Cf/index.htm
40-
4140
"""
4241
return unicodedata.category(ch) in ('Cc', 'Cf')
4342

44-
45-
4643
class Tokenizer(BaseTokenizer):
4744
def __init__(self,
4845
add_block_symbols=True,
@@ -56,7 +53,7 @@ def __init__(self,
5653
if self.tokenizer_class == "wp":
5754
self.text_tokenizer = WordpieceTokenizer(self.vocab_file)
5855
elif self.tokenizer_class == "bpe":
59-
if self.tokenizer_model_name.startswith('clip'):
56+
if self.tokenizer_model_name.lower().startswith('clip'):
6057
self.text_tokenizer = MMBPETokenizer(self.vocab_file, self.merges_file)
6158
else:
6259
self.text_tokenizer = BPETokenizer(self.vocab_file, self.merges_file)
@@ -65,8 +62,6 @@ def __init__(self,
6562
else:
6663
raise NotImplementedError("cannot assign a tokenize class")
6764

68-
self.is_glm = self.tokenizer_model_name.startswith('GLM')
69-
# self.is_clip = self.tokenizer_model_name.startswith('clip')
7065
self.num_tokens = self.text_tokenizer.vocab_size
7166

7267
if self.tokenizer_class == "wp":
@@ -125,7 +120,7 @@ def __init__(self,
125120
self.num_tokens += 2
126121
self.num_command_tokens += 2
127122
elif self.tokenizer_class == "bpe":
128-
if self.tokenizer_model_name.startswith('roberta'):
123+
if self.tokenizer_model_name.lower().startswith('roberta'):
129124
self.num_command_tokens = 6
130125
self.num_text_tokens = self.num_tokens - 3
131126
self._command_tokens = [
@@ -151,7 +146,7 @@ def __init__(self,
151146
])
152147
self.num_tokens += 2
153148
self.num_command_tokens += 2
154-
elif self.tokenizer_model_name.startswith('clip'):
149+
elif self.tokenizer_model_name.lower().startswith('clip'):
155150
self.num_command_tokens = 2
156151
self._command_tokens = [
157152
CommandToken('sot', '<start_of_text>',
@@ -170,7 +165,7 @@ def __init__(self,
170165
self.text_tokenizer.convert_token_to_id('<|endoftext|>'))
171166
]
172167
if add_block_symbols:
173-
if self.tokenizer_model_name.startswith('GLM'):
168+
if self.tokenizer_model_name.lower().startswith('glm'):
174169
unk_token_id = self.num_tokens + 5
175170
cls_token_id = self.num_tokens + 2
176171
num_tokens_to_add = 5
@@ -215,7 +210,7 @@ def __init__(self,
215210
self.num_text_tokens = self.text_tokenizer.vocab_size
216211
self.num_tokens = self.num_text_tokens
217212

218-
if self.tokenizer_model_name.startswith('GLM'):
213+
if self.tokenizer_model_name.lower().startswith('glm'):
219214
pad_token_id = self.num_tokens
220215
eos_token_id = self.num_tokens
221216
unk_token_id = self.num_tokens + 4
@@ -450,7 +445,6 @@ def CommandTokenIds(self, exception=None):
450445
result.append(s.Id)
451446
return (result)
452447

453-
454448
def encode_plus_non_glm(
455449
self,
456450
text,
@@ -517,7 +511,7 @@ def encode_plus( #for Seq2seq
517511
truncation=True,
518512
max_length=None,
519513
):
520-
if not self.tokenizer_model_name.startswith("GLM"):
514+
if not self.tokenizer_model_name.lower().startswith("glm"):
521515
return self.encode_plus_non_glm(source_text, second_text, truncation, max_length)
522516
sop_id = self.get_command_id('sop') #start of piece
523517
eop_id = self.get_command_id('eop') #end of piece

0 commit comments

Comments
 (0)