forked from espnet/espnet
-
Notifications
You must be signed in to change notification settings - Fork 0
/
hugging_face_export_vocabulary.py
executable file
·118 lines (96 loc) · 3.1 KB
/
hugging_face_export_vocabulary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/usr/bin/env python3
import argparse
import logging
import sys
from pathlib import Path
from typing import List
from typeguard import check_argument_types
from espnet.utils.cli_utils import get_commandline_args
try:
from transformers import AutoTokenizer
is_transformers_available = True
except ImportError:
is_transformers_available = False
def export_vocabulary(
    output: str,
    model_name_or_path: str,
    log_level: str,
    add_symbol: List[str],
):
    """Write the vocabulary of a Hugging Face tokenizer, one token per line.

    A list of ``tokenizer.vocab_size`` entries is built in which the token
    with id ``i`` sits at position ``i``; ids without a token stay as empty
    strings. The ``add_symbol`` entries are then inserted one after another
    (each insertion shifts the following entries), and the resulting list is
    written out with one entry per line.

    Args:
        output: Destination file path, or ``"-"`` to write to ``sys.stdout``.
            Missing parent directories of the path are created.
        model_name_or_path: Forwarded to ``AutoTokenizer.from_pretrained``.
        log_level: Level passed to ``logging.basicConfig``.
        add_symbol: Extra symbols given as ``"<symbol>:<index>"`` strings,
            e.g. ``"<blank>:0"``. The index is split off at the last ``":"``
            so the symbol itself may contain colons. A negative index counts
            from the end (``-1`` appends as the last symbol).

    Raises:
        ImportError: If ``transformers`` could not be imported.
        RuntimeError: If an ``add_symbol`` entry is not ``"<symbol>:<index>"``
            with an integer index.
    """
    assert check_argument_types()
    if not is_transformers_available:
        raise ImportError(
            "`transformers` is not available. Please install it via `pip install"
            " transformers` or `cd /path/to/espnet/tools && . ./activate_python.sh"
            " && ./installers/install_transformers.sh`."
        )

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    vocab_size = tokenizer.vocab_size
    words = [""] * vocab_size
    for token, token_id in tokenizer.get_vocab().items():
        # get_vocab() can report ids at or beyond vocab_size (the original
        # code notes this for the pythia tokenizer); those are left out.
        if token_id < vocab_size:
            words[token_id] = token

    # Parse the values of --add_symbol
    for symbol_and_id in add_symbol:
        # e.g. symbol_and_id="<blank>:0"
        try:
            # Split at the last colon only, so symbols containing ":" work.
            symbol, idx_text = symbol_and_id.rsplit(":", 1)
            idx = int(idx_text)
        except ValueError as err:
            raise RuntimeError(
                f"Format error: e.g. '<blank>:0': {symbol_and_id}"
            ) from err
        symbol = symbol.strip()

        # e.g. idx=0 -> append as the first symbol
        # e.g. idx=-1 -> append as the last symbol
        if idx < 0:
            idx = len(words) + 1 + idx
        words.insert(idx, symbol)

    # Write words. The destination is opened only now, after everything that
    # can fail has succeeded, so an error never leaves a truncated file
    # behind, and the handle is always closed by the ``with`` block.
    lines = (w + "\n" for w in words)
    if output == "-":
        sys.stdout.writelines(lines)
    else:
        p = Path(output)
        p.parent.mkdir(parents=True, exist_ok=True)
        with p.open("w", encoding="utf-8") as fout:
            fout.writelines(lines)
def get_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the vocabulary export script.

    Returns:
        A parser accepting ``--log_level``, ``--output``/``-o``,
        ``--model_name_or_path`` and the repeatable ``--add_symbol`` option.
    """
    log_levels = ("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET")

    # (flags, keyword arguments) pairs, registered in this order so that the
    # generated help text lists the options in the same sequence.
    argument_specs = [
        (
            ("--log_level",),
            {
                # Upper-case the value so that e.g. "debug" matches a choice.
                "type": str.upper,
                "default": "INFO",
                "choices": log_levels,
                "help": "The verbose level of logging",
            },
        ),
        (
            ("--output", "-o"),
            {
                "required": True,
                "help": "Output text. - indicates sys.stdout",
            },
        ),
        (
            ("--model_name_or_path",),
            {
                "type": str,
                "required": True,
                "help": "Hugging Face model name or path",
            },
        ),
        (
            ("--add_symbol",),
            {
                "type": str,
                "default": [],
                "action": "append",
                "help": (
                    "Append symbol e.g. --add_symbol '<blank>:0' "
                    "--add_symbol '<unk>:1'"
                ),
            },
        ),
    ]

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Export Hugging Face vocabulary",
    )
    for flags, options in argument_specs:
        parser.add_argument(*flags, **options)
    return parser
def main(cmd=None):
    """Command-line entry point: parse arguments and export the vocabulary.

    Args:
        cmd: Argument list handed to ``parse_args``; ``None`` makes argparse
            read ``sys.argv``.
    """
    # Echo the invoking command line to stderr before doing any work.
    print(get_commandline_args(), file=sys.stderr)
    namespace = get_parser().parse_args(cmd)
    # Each parsed option maps onto the keyword argument of the same name.
    export_vocabulary(**vars(namespace))


# Run the export only when executed as a script, not when imported.
if __name__ == "__main__":
    main()