tokenizer.cpp
#include "tokenizer.h"
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>
#include "utils.h"
namespace embeddings {
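// Builds a tokenizer from a HuggingFace `tokenizer.json` file: the blob is
// read from disk and handed to the HFTokenizer factory. Ownership of the
// backend is kept through the raw `tok` member.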
Tokenizer::Tokenizer(const std::string &path) {
  auto blob = load_bytes_from_file(path);
  tok = tokenizers::HFTokenizer::FromBlobJSON(blob).release();
}

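// Convenience wrapper: encodes a single string as a one-element batch.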
Encoding Tokenizer::Encode(const std::string &text, bool add_special_tokens) {
  std::vector<std::string> texts = {text};
  return EncodeBatch(texts, add_special_tokens)[0];
}

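// Tokenizes a batch and normalizes all encodings to a common length:
// fixed-length output from the backend is shrunk to the longest unpadded
// sequence, while variable-length output is right-padded with the model's
// <PAD> token.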
std::vector<Encoding> Tokenizer::EncodeBatch(
    const std::vector<std::string> &texts, bool add_special_tokens) {
  std::vector<Encoding> results;
  auto hf_results = tok->EncodeBatch(texts, add_special_tokens);
  if (hf_results.empty()) {
    return results;
  }
  // no_pad_len starts at the full length and is tightened below once
  // padding is detected in the attention mask.
  for (const auto &item : hf_results) {
    results.push_back({item.ids, item.attention_mask, item.ids.size()});
  }
  // Check whether every encoding in the batch came back the same length.
  auto size0 = results[0].ids.size();
  bool is_same_size = true;
  for (size_t i = 1; i < results.size(); ++i) {
    if (results[i].ids.size() != size0) {
      is_same_size = false;
      break;
    }
  }
  if (is_same_size) {
    // Some models (e.g. text2vec-base-multilingual) always return
    // full-length encodings; shrink them to the longest unpadded sequence
    // in the batch.
    size_t max_size = 0;
    for (auto &enc : results) {
      for (size_t pos = 0; pos < enc.attention_mask.size(); pos++) {
        if (enc.attention_mask[pos] == 0) {
          enc.no_pad_len = pos;
          break;
        }
      }
      // An encoding without padding keeps its full length and caps the
      // shrink target, so real tokens are never truncated.
      if (enc.no_pad_len > max_size) max_size = enc.no_pad_len;
    }
    if (max_size > 0 && max_size < size0) {
      for (auto &enc : results) {
        enc.attention_mask.resize(max_size);
        enc.ids.resize(max_size);
      }
    }
  } else {
    // Variable-length encodings: right-pad every sequence to the longest
    // one in the batch and zero the attention mask over the padded tail.
    size_t max_size = 0;
    for (const auto &enc : results) {
      if (enc.ids.size() > max_size) {
        max_size = enc.ids.size();
      }
    }
    // Look up the pad token id once: scan the added tokens for one whose
    // content mentions "pad" (e.g. <pad>, [PAD]); fall back to id 0.
    int32_t pad_id = 0;
    auto added_tokens = tok->GetAddedTokens();
    for (const auto &added : added_tokens) {
      if (to_lowercase(added.content).find("pad") != std::string::npos) {
        pad_id = added.id;
      }
    }
    for (auto &enc : results) {
      if (enc.ids.size() < max_size) {
        enc.attention_mask.resize(max_size, 0);
        enc.ids.resize(max_size, pad_id);
      }
    }
  }
  return results;
}

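// Maps token ids back to text; when skip_special_tokens is true, special
// tokens such as [CLS]/[SEP] are dropped from the output.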
std::string Tokenizer::Decode(const tokens &ids, bool skip_special_tokens) {
  return tok->Decode(ids, skip_special_tokens);
}

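// Exposes the underlying HFTokenizer for callers that need the backend
// directly.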
tokenizers::HFTokenizer *Tokenizer::GetFastTokenizer() { return tok; }
} // namespace embeddings