@@ -33,16 +33,13 @@
 
 
 def is_control(ch):
-    """控制类字符判断
+    """
     https://en.wikipedia.org/wiki/Control_character
     https://www.fileformat.info/info/unicode/category/Cc/index.htm
     https://www.fileformat.info/info/unicode/category/Cf/index.htm
-
     """
     return unicodedata.category(ch) in ('Cc', 'Cf')
 
-
-
 class Tokenizer(BaseTokenizer):
     def __init__(self,
                  add_block_symbols=True,
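
For context, the is_control helper in the hunk above flags Unicode "Other, control" (Cc) and "Other, format" (Cf) characters. A standalone sketch of its behavior, stdlib only and independent of this repo:

    import unicodedata

    def is_control(ch):
        # Cc = control codes, Cf = invisible format characters
        return unicodedata.category(ch) in ('Cc', 'Cf')

    print(is_control('\t'))      # True:  TAB is category Cc
    print(is_control('\u200d'))  # True:  zero-width joiner is category Cf
    print(is_control('a'))       # False: 'a' is category Ll
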
@@ -56,7 +53,7 @@ def __init__(self,
         if self.tokenizer_class == "wp":
             self.text_tokenizer = WordpieceTokenizer(self.vocab_file)
         elif self.tokenizer_class == "bpe":
-            if self.tokenizer_model_name.startswith('clip'):
+            if self.tokenizer_model_name.lower().startswith('clip'):
                 self.text_tokenizer = MMBPETokenizer(self.vocab_file, self.merges_file)
             else:
                 self.text_tokenizer = BPETokenizer(self.vocab_file, self.merges_file)
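
The motivation for the .lower() calls threaded through this diff: str.startswith is case-sensitive, so a capitalized model name previously fell through to the wrong branch. A minimal illustration (the model names here are hypothetical):

    print("CLIP-ViT-B-32".startswith('clip'))          # False: old check misses it
    print("CLIP-ViT-B-32".lower().startswith('clip'))  # True:  fixed check matches
    print("clip-base".startswith('clip'))              # True either way
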
@@ -65,8 +62,6 @@ def __init__(self,
         else:
             raise NotImplementedError("cannot assign a tokenize class")
 
-        self.is_glm = self.tokenizer_model_name.startswith('GLM')
-        # self.is_clip = self.tokenizer_model_name.startswith('clip')
         self.num_tokens = self.text_tokenizer.vocab_size
 
         if self.tokenizer_class == "wp":
@@ -125,7 +120,7 @@ def __init__(self,
                 self.num_tokens += 2
                 self.num_command_tokens += 2
         elif self.tokenizer_class == "bpe":
-            if self.tokenizer_model_name.startswith('roberta'):
+            if self.tokenizer_model_name.lower().startswith('roberta'):
                 self.num_command_tokens = 6
                 self.num_text_tokens = self.num_tokens - 3
                 self._command_tokens = [
@@ -151,7 +146,7 @@ def __init__(self,
                 ])
                 self.num_tokens += 2
                 self.num_command_tokens += 2
-            elif self.tokenizer_model_name.startswith('clip'):
+            elif self.tokenizer_model_name.lower().startswith('clip'):
                 self.num_command_tokens = 2
                 self._command_tokens = [
                     CommandToken('sot', '<start_of_text>',
@@ -170,7 +165,7 @@ def __init__(self,
                                  self.text_tokenizer.convert_token_to_id('<|endoftext|>'))
                 ]
                 if add_block_symbols:
-                    if self.tokenizer_model_name.startswith('GLM '):
+                    if self.tokenizer_model_name.lower().startswith('glm '):
                         unk_token_id = self.num_tokens + 5
                         cls_token_id = self.num_tokens + 2
                         num_tokens_to_add = 5
@@ -215,7 +210,7 @@ def __init__(self,
             self.num_text_tokens = self.text_tokenizer.vocab_size
             self.num_tokens = self.num_text_tokens
 
-            if self.tokenizer_model_name.startswith('GLM '):
+            if self.tokenizer_model_name.lower().startswith('glm '):
                 pad_token_id = self.num_tokens
                 eos_token_id = self.num_tokens
                 unk_token_id = self.num_tokens + 4
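
In the branch above, the extra command tokens get ids immediately past the text vocabulary, so they can never collide with ordinary tokens. A toy illustration of that id layout, assuming a hypothetical vocabulary of 50000 text tokens (numbers are illustrative only):

    num_tokens = 50000             # hypothetical text-vocab size
    pad_token_id = num_tokens      # 50000 (pad and eos share an id here, as in the diff)
    eos_token_id = num_tokens      # 50000
    unk_token_id = num_tokens + 4  # 50004
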
@@ -450,7 +445,6 @@ def CommandTokenIds(self, exception=None):
             result.append(s.Id)
         return (result)
 
-
     def encode_plus_non_glm(
         self,
         text,
@@ -517,7 +511,7 @@ def encode_plus( #for Seq2seq
         truncation=True,
         max_length=None,
     ):
-        if not self.tokenizer_model_name.startswith("GLM "):
+        if not self.tokenizer_model_name.lower().startswith("glm "):
             return self.encode_plus_non_glm(source_text, second_text, truncation, max_length)
         sop_id = self.get_command_id('sop')  # start of piece
         eop_id = self.get_command_id('eop')  # end of piece