This repository has been archived by the owner on Apr 23, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 103
/
speed_test.py
257 lines (202 loc) · 9.01 KB
/
speed_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
import argparse
import os
from pathlib import Path
from time import time
from tabulate import tabulate
from tokenizers import pre_tokenizers
from tokenizers import Tokenizer as HuggingFaceBPETokenizer
from tokenizers.models import BPE as HuggingFaceBPEModel
from tokenizers.trainers import BpeTrainer as HuggingFaceBPETrainer
# File name under which check_train() stores the trained model and from which
# check_inference_file() loads it again.
MODEL_FILE_NAME = "bpe.model"
# Extension of the model file; not referenced in the visible part of this file.
MODEL_SUFFIX = ".model"
# Names of the benchmarked implementations: get_bpe() dispatches on them and
# main() passes them on as the row labels of the result tables.
YOU_TOKEN_TO_ME = "YouTokenToMe"
SENTENCE_PIECE = "SentencePiece"
FAST_BPE = "fastBPE"
HUGGING_FACE_BPE = "Hugging_Face_BPE"
# Executable used as the command prefix by FastBPEInterface.
PATH_TO_FASTBPE = "./fastBPE"
class HuggingfaceInterface:
    """Benchmark adapter for the BPE implementation of the ``tokenizers`` package."""

    def train_from_file(self, train_file, vocab_size, model_file, _):
        """Train a BPE tokenizer on ``train_file`` and save it to ``model_file``.

        The tokenizer uses ``[UNK]`` as unknown token, ``[UNK]`` and ``[PAD]``
        as special tokens and a whitespace pre-tokenizer. The last positional
        argument is accepted for signature parity with the other adapters and
        is ignored.
        """
        bpe_tokenizer = HuggingFaceBPETokenizer(
            HuggingFaceBPEModel(unk_token="[UNK]")
        )
        bpe_trainer = HuggingFaceBPETrainer(
            special_tokens=["[UNK]", "[PAD]"],
            vocab_size=vocab_size,
        )
        bpe_tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
        corpus_files = [str(train_file)]
        bpe_tokenizer.train(corpus_files, bpe_trainer)
        bpe_tokenizer.save(model_file)

    def encode_file(self, model_path, path_in, path_out, _):
        """Load the tokenizer from ``model_path`` and encode every line of ``path_in``.

        All lines are encoded in a single batch. ``path_out`` and the last
        positional argument are not used: the encoded result is discarded.
        """
        bpe_tokenizer = HuggingFaceBPETokenizer.from_file(model_path)
        with open(path_in) as source:
            lines = list(source)
        bpe_tokenizer.encode_batch(lines)
class SentencePieceInterface:
    """Benchmark adapter that drives the ``spm_train`` / ``spm_encode`` commands."""

    @staticmethod
    def _model_prefix(model_file):
        """Return ``model_file`` without its trailing ``.model`` extension.

        Only the last dot is significant, so paths such as
        ``./data/v1.0/bpe.model`` are accepted.

        Raises:
            ValueError: if ``model_file`` does not end with ``.model``.
        """
        prefix, dot, extension = str(model_file).rpartition(".")
        if not dot or extension != "model":
            raise ValueError(
                f"model file must end with '.model', got {model_file!r}"
            )
        return prefix

    @staticmethod
    def _run_shell(command):
        """Run ``command`` through ``os.system`` and fail loudly on a non-zero status.

        An explicit check is used instead of ``assert os.system(...) == 0``:
        under ``python -O`` the assert statement, including the call inside
        it, is removed and the command would never run.

        Raises:
            RuntimeError: if ``os.system`` returns a non-zero status.
        """
        status = os.system(command)
        if status != 0:
            raise RuntimeError(f"command failed with status {status}: {command}")

    def train_from_file(self, train_file, vocab_size, model_file, _):
        """Train a BPE model with ``spm_train``.

        ``spm_train`` receives ``model_file`` without its ``.model`` extension
        as ``--model_prefix``. The last positional argument is ignored.

        Raises:
            ValueError: if ``model_file`` does not end with ``.model``
                (checked before any command is run).
            RuntimeError: if ``spm_train`` returns a non-zero status.
        """
        prefix = self._model_prefix(model_file)
        command = (
            "spm_train"
            f" --input={train_file}"
            f" --model_prefix={prefix}"
            f" --vocab_size={vocab_size}"
            " --character_coverage=1.0"
            " --model_type=bpe"
        )
        self._run_shell(command)

    def encode_file(self, model_path, path_in, path_out, _):
        """Encode ``path_in`` into ``path_out`` with ``spm_encode`` (piece output).

        Input and output are connected through shell redirection. The last
        positional argument is ignored.

        Raises:
            RuntimeError: if ``spm_encode`` returns a non-zero status.
        """
        command = (
            f"spm_encode --model={model_path} --output_format=piece"
            f" < {path_in} > {path_out}"
        )
        self._run_shell(command)
class FastBPEInterface:
    """Benchmark adapter that drives the fastBPE executable at ``PATH_TO_FASTBPE``."""

    @staticmethod
    def _run_shell(command):
        """Run ``command`` through ``os.system`` and fail loudly on a non-zero status.

        An explicit check is used instead of ``assert os.system(...) == 0``:
        under ``python -O`` the assert statement, including the call inside
        it, is removed and the command would never run.

        Raises:
            RuntimeError: if ``os.system`` returns a non-zero status.
        """
        status = os.system(command)
        if status != 0:
            raise RuntimeError(f"command failed with status {status}: {command}")

    def train_from_file(self, file_path, vocab_size, model_file, _):
        """Run ``learnbpe`` on ``file_path`` and redirect its output to ``model_file``.

        ``vocab_size`` is passed as the first argument of ``learnbpe``. The
        last positional argument is ignored.

        Raises:
            RuntimeError: if the command returns a non-zero status.
        """
        command = (
            f"{PATH_TO_FASTBPE} learnbpe {vocab_size} {file_path} > {model_file}"
        )
        self._run_shell(command)

    def encode_file(self, model_path, path_in, path_out, _):
        """Run ``applybpe`` with ``path_out``, ``path_in`` and ``model_path``, in that order.

        The last positional argument is ignored.

        Raises:
            RuntimeError: if the command returns a non-zero status.
        """
        command = f"{PATH_TO_FASTBPE} applybpe {path_out} {path_in} {model_path}"
        self._run_shell(command)
class YouTokenToMeInterface:
    """Benchmark adapter that drives the ``yttm`` command line tool."""

    @staticmethod
    def _run_shell(command):
        """Run ``command`` through ``os.system`` and fail loudly on a non-zero status.

        An explicit check is used instead of ``assert os.system(...) == 0``:
        under ``python -O`` the assert statement, including the call inside
        it, is removed and the command would never run.

        Raises:
            RuntimeError: if ``os.system`` returns a non-zero status.
        """
        status = os.system(command)
        if status != 0:
            raise RuntimeError(f"command failed with status {status}: {command}")

    def train_from_file(self, file_path, vocab_size, model_path, n_threads):
        """Train a BPE model with ``yttm bpe`` and store it at ``model_path``.

        Raises:
            RuntimeError: if the command returns a non-zero status.
        """
        command = (
            "yttm bpe"
            f" --data={file_path} --model={model_path}"
            f" --vocab_size={vocab_size} --n_threads={n_threads}"
        )
        self._run_shell(command)

    def encode_file(self, model_path, path_in, path_out, n_threads):
        """Encode ``path_in`` into ``path_out`` with ``yttm encode`` (id output).

        Input and output are connected through shell redirection.

        Raises:
            RuntimeError: if the command returns a non-zero status.
        """
        command = (
            "yttm encode"
            f" --model={model_path} --output_type=id"
            f" --n_threads={n_threads}"
            f" < {path_in} > {path_out}"
        )
        self._run_shell(command)
def get_bpe(impl_name):
    """Return a new benchmark adapter for the implementation named ``impl_name``.

    Args:
        impl_name: one of ``YOU_TOKEN_TO_ME``, ``SENTENCE_PIECE``, ``FAST_BPE``
            or ``HUGGING_FACE_BPE``.

    Returns:
        A freshly constructed adapter object exposing ``train_from_file`` and
        ``encode_file``.

    Raises:
        ValueError: if ``impl_name`` is not one of the known names. An explicit
            exception is used instead of ``assert False``, which is removed
            under ``python -O`` and would make the function return ``None``.
    """
    factories = {
        YOU_TOKEN_TO_ME: YouTokenToMeInterface,
        SENTENCE_PIECE: SentencePieceInterface,
        FAST_BPE: FastBPEInterface,
        HUGGING_FACE_BPE: HuggingfaceInterface,
    }
    try:
        factory = factories[impl_name]
    except KeyError:
        known = ", ".join(factories)
        raise ValueError(
            f"unknown BPE implementation {impl_name!r}; expected one of: {known}"
        ) from None
    return factory()
def check_train(algorithm, vocab_size, corpus_path, n_threads):
    """Return the wall-clock seconds ``algorithm`` needs to train on ``corpus_path``.

    The model is written to ``MODEL_FILE_NAME``. Obtaining the adapter from
    ``get_bpe`` happens before the clock starts and is not part of the
    measurement.
    """
    trainer = get_bpe(algorithm)
    began = time()
    trainer.train_from_file(corpus_path, vocab_size, MODEL_FILE_NAME, n_threads)
    finished = time()
    return finished - began
def check_inference_file(algorithm, corpus_path, n_threads):
    """Return the wall-clock seconds ``algorithm`` needs to encode ``corpus_path``.

    The model is read from ``MODEL_FILE_NAME`` and ``rm_it.txt`` is passed as
    the output path; this function does not delete that file. Obtaining the
    adapter from ``get_bpe`` is not part of the measurement.
    """
    encoder = get_bpe(algorithm)
    began = time()
    encoder.encode_file(MODEL_FILE_NAME, corpus_path, "rm_it.txt", n_threads)
    finished = time()
    return finished - began
def download_xml2txt():
    """Download ``xml2txt.pl`` into the current directory unless it already exists.

    The download is delegated to ``wget``; its exit status is not inspected.
    """
    script = Path("xml2txt.pl")
    if script.exists():
        return
    print("downloading xml2txt.pl ...")
    os.system("wget https://www.dropbox.com/s/p3ta9spzfviovk0/xml2txt.pl")
def _run_or_discard(command, output_path):
    """Run ``command`` via ``os.system``; on failure delete ``output_path`` and raise.

    Removing the partial output matters because ``prepare_data`` treats an
    existing file as a finished, cached step.
    """
    status = os.system(command)
    if status != 0:
        if output_path.exists():
            output_path.unlink()
        raise RuntimeError(f"command failed with status {status}: {command}")


def prepare_data(zip_path, size_mb):
    """Turn a bzip2-compressed wiki XML dump into a text corpus of about ``size_mb`` MB.

    Every step is skipped when its output file already exists next to
    ``zip_path``:

    1. decompress ``zip_path`` into ``wiki.xml`` with ``bzip2``;
    2. convert ``wiki.xml`` into ``wiki.txt`` with ``perl xml2txt.pl``
       (the script is fetched by ``download_xml2txt`` first);
    3. copy leading lines of ``wiki.txt`` into ``wiki_{size_mb}MB.txt`` until
       at least ``size_mb * 1_000_000`` UTF-8 bytes were written or the input
       is exhausted.

    Args:
        zip_path: path (str) of the archive; must end with ``.xml.bz2``.
        size_mb: requested corpus size in units of 1,000,000 bytes.

    Returns:
        ``pathlib.Path`` of the size-limited text file.

    Raises:
        ValueError: if ``zip_path`` does not end with ``.xml.bz2``.
        RuntimeError: if ``bzip2`` or ``perl`` returns a non-zero status.
    """
    expected_extension = ".xml.bz2"
    if not zip_path.endswith(expected_extension):
        raise ValueError(
            f"expected a path ending with {expected_extension!r}, got {zip_path!r}"
        )

    base_path = Path(zip_path).parent
    unzip_path = base_path / "wiki.xml"
    full_text_path = base_path / "wiki.txt"
    cutted_text_path = base_path / f"wiki_{size_mb}MB.txt"

    if not unzip_path.exists():
        print(f"unziping file {zip_path} ...")
        _run_or_discard(f"bzip2 -kdc {zip_path} > {unzip_path}", unzip_path)

    if not full_text_path.exists():
        print(f"converting xml to text {unzip_path} ...")
        download_xml2txt()
        _run_or_discard(
            f"perl xml2txt.pl -nomath -notables {unzip_path} {full_text_path}",
            full_text_path,
        )

    if not cutted_text_path.exists():
        byte_budget = size_mb * 1_000_000
        bytes_written = 0
        # Sizes are counted in UTF-8 bytes, so both files are opened as UTF-8
        # explicitly instead of relying on the locale's default encoding.
        with open(cutted_text_path, "w", encoding="utf-8") as fout, open(
            full_text_path, "r", encoding="utf-8"
        ) as fin:
            # Iterating the file ends at EOF; the previous readline() loop spun
            # forever when the source was smaller than the requested size.
            for line in fin:
                if bytes_written >= byte_budget:
                    break
                fout.write(line)
                bytes_written += len(line.encode("utf-8"))
    return cutted_text_path
def speed_test(corpus_path, vocab_size, algorithms, n_threads):
    """Time training and file encoding of every algorithm on one corpus.

    For each name in ``algorithms`` training is measured first and encoding
    second, so encoding uses the model that was just trained.

    Returns:
        A pair ``(train_seconds, encode_seconds)`` of dicts keyed by algorithm name.
    """
    train_seconds, encode_seconds = {}, {}
    for name in algorithms:
        trained_in = check_train(name, vocab_size, corpus_path, n_threads)
        encoded_in = check_inference_file(name, corpus_path, n_threads)
        train_seconds[name], encode_seconds[name] = trained_in, encoded_in
    return train_seconds, encode_seconds
def print_results(cfg, result_name, corpuses, algorithms):
    """Print one GitHub-flavoured table: a row per algorithm, a column per corpus.

    Args:
        cfg: mapping ``corpus name -> {algorithm name -> seconds}``.
        result_name: text placed in the top-left header cell.
        corpuses: iterable of corpus names, defining the column order.
        algorithms: sequence of algorithm names, defining the row order.

    Each filled cell shows the seconds with one decimal followed by the ratio
    to the fastest algorithm of that corpus; cells without a measurement keep
    the ``#`` placeholder.
    """
    header = ["#", *corpuses]
    column_of = {title: position for position, title in enumerate(header)}
    row_of = {name: position for position, name in enumerate(algorithms)}
    rows = [[name] + ["#"] * len(corpuses) for name in algorithms]

    for lang, timings in cfg.items():
        fastest = min(timings.values())
        for name, seconds in timings.items():
            column = column_of[lang]
            row = row_of[name]
            # One decimal, then drop a trailing "0" and a then-dangling ".".
            ratio = f"{seconds/fastest:.1f}".rstrip('0').rstrip('.')
            rows[row][column] = f"{seconds:.1f} (x{ratio})"

    header[0] = result_name
    alignment = ["left"] + ["center"] * len(corpuses)
    print(tabulate(rows, header, tablefmt="github", colalign=alignment))
def parse_args():
    """Parse the benchmark's command line options.

    Returns:
        ``argparse.Namespace`` with ``vocab_size`` (int, default 30000),
        ``n_threads`` (int, default 4), ``corpus_size`` (int, megabytes,
        default 100) and ``langs`` (list of str, default ``["ru"]``).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--vocab_size", type=int, default=30000)
    parser.add_argument("--n_threads", type=int, default=4)
    parser.add_argument(
        "--corpus_size", type=int, default=100, help="Size of testing corpus in MB"
    )
    parser.add_argument(
        "--langs",
        type=str,
        nargs="+",
        help="list of languages for speed test",
        # A list, like the values produced by nargs="+", so args.langs has the
        # same type whether or not the option is given.
        default=["ru"],
    )
    return parser.parse_args()
def main(args):
    """Download the corpora, benchmark every implementation and print two tables.

    Args:
        args: namespace with ``langs`` (a language code or a list of codes),
            ``n_threads``, ``vocab_size`` and ``corpus_size`` attributes.

    Raises:
        ValueError: if a requested language code is not supported; raised
            before anything is downloaded or created on disk.
        RuntimeError: if downloading a corpus archive fails.
    """
    langs = args.langs if isinstance(args.langs, list) else [args.langs]

    # Hugging Face - limit number of processes
    os.environ["RAYON_RS_NUM_CPUS"] = str(args.n_threads)

    short_to_long_names = {
        "en": "English",
        "ru": "Russian",
        "ja": "Japanese",
        "zh": "Chinese",
    }

    # For adding more languages check out this page https://linguatools.org/tools/corpora/wikipedia-monolingual-corpora/
    all_links = {
        "English": "https://www.dropbox.com/s/cnrhd11zdtc1pic/enwiki-20181001-corpus.xml.bz2?dl=1",
        "Russian": "https://www.dropbox.com/s/lpfmyrl7nxn5ugg/ruwiki-20181001-corpus.xml.bz2?dl=1",
        "Japanese": "https://www.dropbox.com/s/wf496hlu512z9kc/jawiki-20140807-corpus.xml.bz2?dl=1",
        "Chinese": "https://www.dropbox.com/s/czhr6s5jwaljeue/zhwiki-20140804-corpus.xml.bz2?dl=1",
    }

    # Report every unsupported code at once instead of a bare KeyError.
    unsupported = [lang for lang in langs if lang not in short_to_long_names]
    if unsupported:
        supported = ", ".join(short_to_long_names)
        raise ValueError(
            f"unsupported language(s) {unsupported}; supported codes: {supported}"
        )

    links = {
        short_to_long_names[lang]: all_links[short_to_long_names[lang]]
        for lang in langs
    }

    corpuses = {}
    Path("data").mkdir(exist_ok=True)
    for lang, link in links.items():
        Path(f"data/{lang}").mkdir(exist_ok=True)
        zip_file = f"data/{lang}/wiki.xml.bz2"
        archive = Path(zip_file)
        if not archive.exists():
            # The URL holds "?", a shell glob character, hence the quotes.
            status = os.system(f"wget -O {zip_file} '{link}'")
            if status != 0:
                # Drop whatever wget left behind so that the next run does not
                # mistake a failed download for a cached archive.
                if archive.exists():
                    archive.unlink()
                raise RuntimeError(
                    f"downloading {link} failed with status {status}"
                )
        corpuses[lang] = prepare_data(zip_file, args.corpus_size)

    algorithms = [YOU_TOKEN_TO_ME, HUGGING_FACE_BPE, SENTENCE_PIECE, FAST_BPE]

    global_train = {}
    global_tokenization = {}
    for lang, corpus_path in corpuses.items():
        train_stat, tokenization_stat = speed_test(
            corpus_path, args.vocab_size, algorithms, args.n_threads
        )
        global_train[lang] = train_stat
        global_tokenization[lang] = tokenization_stat

    print_results(global_train, f"Train {args.corpus_size}MB", corpuses, algorithms)
    print_results(
        global_tokenization,
        f"Tokenization {args.corpus_size}MB",
        corpuses,
        algorithms,
    )
# Script entry point: parse the command line and run the benchmark.
if __name__ == "__main__":
    args = parse_args()
    main(args)