Commit 549279d (1 parent: 9e405b6)

llama : avoid double token-to-piece cache (#7654)

ggml-ci

1 file changed: llama.cpp (+11 -11 lines)

@@ -2164,8 +2164,7 @@ struct llama_vocab {
     std::vector<token_data> id_to_token;
 
     std::vector<id>    cache_special_tokens;
-    std::vector<token> cache_token_to_piece;         // llama_token_to_piece(special = false);
-    std::vector<token> cache_token_to_piece_special; // llama_token_to_piece(special = true);
+    std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = true);
 
     std::map<std::pair<std::string, std::string>, int> bpe_ranks;
 
@@ -4845,23 +4844,19 @@ static void llm_load_vocab(
         LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t)vocab.cache_special_tokens.size());
     }
 
-    // build token to piece caches
+    // build token to piece cache
     {
         size_t size_cache = 0;
 
-        std::vector<llama_vocab::token> cache_token_to_piece        (n_vocab);
-        std::vector<llama_vocab::token> cache_token_to_piece_special(n_vocab);
+        std::vector<llama_vocab::token> cache_token_to_piece(n_vocab);
 
         for (uint32_t id = 0; id < n_vocab; ++id) {
-            cache_token_to_piece[id]         = llama_token_to_piece(&model, id, false);
-            cache_token_to_piece_special[id] = llama_token_to_piece(&model, id, true);
+            cache_token_to_piece[id] = llama_token_to_piece(&model, id, true);
 
             size_cache += cache_token_to_piece[id].size();
-            size_cache += cache_token_to_piece_special[id].size();
         }
 
-        std::swap(vocab.cache_token_to_piece,         cache_token_to_piece);
-        std::swap(vocab.cache_token_to_piece_special, cache_token_to_piece_special);
+        std::swap(vocab.cache_token_to_piece, cache_token_to_piece);
 
         LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
     }
@@ -18318,9 +18313,14 @@ static std::string llama_decode_text(const std::string & text) {
 
 // does not write null-terminator to buf
 int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
+    // ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
+    if (!special && llama_is_control_token(model->vocab, token)) {
+        return 0;
+    }
+
     // if we have a cache - use it
     {
-        const auto & cache = special ? model->vocab.cache_token_to_piece_special : model->vocab.cache_token_to_piece;
+        const auto & cache = model->vocab.cache_token_to_piece;
 
         if (!cache.empty()) {
             const auto & res = cache.at(token);
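
Caller-side effect of this change, as a minimal sketch: both special and non-special calls now read the single cache_token_to_piece (built with special = true), and control tokens simply return 0 bytes when special == false. The helper name token_to_piece_str below is hypothetical, and the grow-and-retry on a negative return assumes the negative value is the required buffer size, mirroring the wrapper pattern used elsewhere in llama.cpp's example code.

    // hypothetical convenience wrapper around the API shown in the diff above
    #include <algorithm>
    #include <string>
    #include <vector>
    #include "llama.h"

    std::string token_to_piece_str(const struct llama_model * model, llama_token token, bool special) {
        std::vector<char> buf(8, 0);
        int32_t n = llama_token_to_piece(model, token, buf.data(), (int32_t) buf.size(), special);
        if (n < 0) {
            // assumed convention: a negative result means the buffer was too small,
            // with -n giving the required size, so grow the buffer and retry
            buf.resize(-n);
            n = llama_token_to_piece(model, token, buf.data(), (int32_t) buf.size(), special);
        }
        // after this commit, a control token with special == false yields n == 0,
        // so the returned piece is empty rather than the cached special text
        return std::string(buf.data(), std::max<int32_t>(n, 0));
    }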
