From 98f33bae767dd19e213ef663b22ad99979ca71d7 Mon Sep 17 00:00:00 2001
From: ochafik
Date: Sun, 21 Apr 2024 01:12:05 +0100
Subject: [PATCH 01/12] grammars: early exit when no next_candidates to reject

---
 llama.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/llama.cpp b/llama.cpp
index ec4c1242b20bc..258d50d270394 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12727,6 +12727,10 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
         }
     }
 
+    if (next_candidates.empty()) {
+        return rejects;
+    }
+
     const auto * stack_pos_after = llama_grammar_match_char(stack_pos, 0).second;
 
     // update top of stack to next element, if any

From f608415de06fc914568d7ffc84e293d0a94bd16a Mon Sep 17 00:00:00 2001
From: ochafik
Date: Sun, 21 Apr 2024 15:52:16 +0100
Subject: [PATCH 02/12] grammars: cache decoded tokens

---
 llama.cpp | 32 +++++++++++++++++++++++++-------
 llama.h   |  7 ++++++-
 2 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 258d50d270394..2f73284fa78b2 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12808,7 +12808,7 @@ struct llama_grammar * llama_grammar_init(
         }
     } while (true);
 
-    return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} };
+    return new llama_grammar{ std::move(vec_rules), std::move(stacks), {}, {}, {} };
 }
 
 void llama_grammar_free(struct llama_grammar * grammar) {
@@ -12816,7 +12816,7 @@ void llama_grammar_free(struct llama_grammar * grammar) {
 }
 
 struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) {
-    llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8 };
+    llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8, grammar->token_pieces, grammar->token_codepoints };
 
     // redirect elements in stacks to point to new rules
     for (size_t is = 0; is < result->stacks.size(); is++) {
@@ -13297,7 +13297,7 @@ void llama_sample_repetition_penalties(
     }
 }
 
-void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) {
+void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, struct llama_grammar * grammar) {
     GGML_ASSERT(ctx);
     const int64_t t_start_sample_us = ggml_time_us();
 
@@ -13309,20 +13309,36 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
         }
     }
 
+    if (grammar->token_codepoints.empty()) {
+        auto n_vocab = llama_n_vocab(llama_get_model(ctx));
+        grammar->token_codepoints.resize(n_vocab);
+        grammar->token_pieces.resize(n_vocab);
+        for (llama_token id = 0; id < n_vocab; ++id) {
+            const std::string piece = llama_token_to_piece(ctx, id);
+            grammar->token_pieces[id] = piece;
+            grammar->token_codepoints[id] = decode_utf8(piece, {0, 0});
+        }
+    }
+
     std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
-    candidates_decoded.reserve(candidates->size);
+    if (grammar->partial_utf8.n_remain > 0) {
+        candidates_decoded.reserve(candidates->size);
+    }
     std::vector<llama_grammar_candidate> candidates_grammar;
     candidates_grammar.reserve(candidates->size);
 
    for (size_t i = 0; i < candidates->size; ++i) {
         const llama_token id = candidates->data[i].id;
-        const std::string piece = llama_token_to_piece(ctx, id);
+        const auto & piece = grammar->token_pieces[id];
         if (llama_token_is_eog(&ctx->model, id)) {
             if (!allow_eog) {
                 candidates->data[i].logit = -INFINITY;
             }
         } else if (piece.empty() || piece[0] == 0) {
             candidates->data[i].logit = -INFINITY;
+        } else if (grammar->partial_utf8.n_remain == 0){
+            const auto & decoded = grammar->token_codepoints.at(id);
+            candidates_grammar.push_back({ i, decoded.first.data(), decoded.second });
         } else {
             candidates_decoded.push_back(decode_utf8(piece, grammar->partial_utf8));
             candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
         }
@@ -13516,10 +13532,12 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
         GGML_ASSERT(false);
     }
 
-    const std::string piece = llama_token_to_piece(ctx, token);
+    const auto & piece = grammar->token_pieces.at(token);
 
     // Note terminating 0 in decoded string
-    const auto decoded = decode_utf8(piece, grammar->partial_utf8);
+    const auto decoded = grammar->partial_utf8.n_remain == 0
+        ? grammar->token_codepoints[token]
+        : decode_utf8(piece, grammar->partial_utf8);
     const auto & code_points = decoded.first;
     std::vector<std::vector<const llama_grammar_element *>> tmp_new_stacks;
     for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
diff --git a/llama.h b/llama.h
index 5bed97ad1ef9f..1e6a2ef078a82 100644
--- a/llama.h
+++ b/llama.h
@@ -951,7 +951,7 @@ extern "C" {
     LLAMA_API void llama_sample_grammar(
             struct llama_context * ctx,
           llama_token_data_array * candidates,
-      const struct llama_grammar * grammar);
+            struct llama_grammar * grammar);
 
     /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
@@ -1088,6 +1088,11 @@ struct llama_grammar {
 
     // buffer for partially generated UTF-8 sequence from accepted tokens
     llama_partial_utf8 partial_utf8;
+
+    // caching the token pieces & their decoded codepoints.
+    std::vector<std::string> token_pieces;
+    std::vector<std::pair<std::vector<uint32_t>,
+                llama_partial_utf8>> token_codepoints;
 };
 
 struct llama_grammar_candidate {

From cbc75809be064c1391873fc48c30bf5e4480f72f Mon Sep 17 00:00:00 2001
From: ochafik
Date: Sun, 21 Apr 2024 15:52:25 +0100
Subject: [PATCH 03/12] grammars: faster llama_grammar_copy

---
 llama.cpp | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 2f73284fa78b2..3793d669ffb7c 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12818,16 +12818,22 @@ void llama_grammar_free(struct llama_grammar * grammar) {
 
 struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) {
     llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8, grammar->token_pieces, grammar->token_codepoints };
 
+    std::unordered_map<const llama_grammar_element *, const llama_grammar_element *> element_map;
+    element_map.reserve(std::accumulate(
+        grammar->rules.begin(), grammar->rules.end(), 0,
+        [](size_t acc, const std::vector<llama_grammar_element> & rule) {
+            return acc + rule.size();
+        }));
+    for (size_t ir = 0; ir < grammar->rules.size(); ir++) {
+        for (size_t ie = 0; ie < grammar->rules[ir].size(); ie++) {
+            element_map[&grammar->rules[ir][ie]] = &result->rules[ir][ie];
+        }
+    }
+
     // redirect elements in stacks to point to new rules
     for (size_t is = 0; is < result->stacks.size(); is++) {
         for (size_t ie = 0; ie < result->stacks[is].size(); ie++) {
-            for (size_t ir0 = 0; ir0 < grammar->rules.size(); ir0++) {
-                for (size_t ir1 = 0; ir1 < grammar->rules[ir0].size(); ir1++) {
-                    if (grammar->stacks[is][ie] == &grammar->rules[ir0][ir1]) {
-                        result->stacks[is][ie] = &result->rules[ir0][ir1];
-                    }
-                }
-            }
+            result->stacks[is][ie] = element_map.at(grammar->stacks[is][ie]);
         }
     }

From 24769f9a80765c1240aee81b80ecadb5a962ac52 Mon Sep 17 00:00:00 2001
From: ochafik
Date: Sun, 21 Apr 2024 18:34:59 +0100
Subject: [PATCH 04/12] grammars: fix bad merge

---
 llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index ecf2ae515513c..23aeacde2ff95 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -13320,7 +13320,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
         grammar->token_codepoints.resize(n_vocab);
         grammar->token_pieces.resize(n_vocab);
         for (llama_token id = 0; id < n_vocab; ++id) {
-            const std::string piece = llama_token_to_piece(ctx, id);
+            const std::string piece = llama_token_to_piece(ctx, id, false);
             grammar->token_pieces[id] = piece;
             grammar->token_codepoints[id] = decode_utf8(piece, {0, 0});
         }

From 05efa34d92d46c4d2e25ff1432e123b5e42243be Mon Sep 17 00:00:00 2001
From: Olivier Chafik
Date: Wed, 24 Apr 2024 14:28:16 +0100
Subject: [PATCH 05/12] grammars: keep llama_grammar_copy non-quadratic optim
 for later

---
 llama.cpp | 20 +++++++-------------
 1 file changed, 7 insertions(+), 13 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 23aeacde2ff95..d04cd12457ff7 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12818,22 +12818,16 @@ void llama_grammar_free(struct llama_grammar * grammar) {
 
 struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) {
     llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8, grammar->token_pieces, grammar->token_codepoints };
 
-    std::unordered_map<const llama_grammar_element *, const llama_grammar_element *> element_map;
-    element_map.reserve(std::accumulate(
-        grammar->rules.begin(), grammar->rules.end(), 0,
-        [](size_t acc, const std::vector<llama_grammar_element> & rule) {
-            return acc + rule.size();
-        }));
-    for (size_t ir = 0; ir < grammar->rules.size(); ir++) {
-        for (size_t ie = 0; ie < grammar->rules[ir].size(); ie++) {
-            element_map[&grammar->rules[ir][ie]] = &result->rules[ir][ie];
-        }
-    }
-
     // redirect elements in stacks to point to new rules
     for (size_t is = 0; is < result->stacks.size(); is++) {
         for (size_t ie = 0; ie < result->stacks[is].size(); ie++) {
-            result->stacks[is][ie] = element_map.at(grammar->stacks[is][ie]);
+            for (size_t ir0 = 0; ir0 < grammar->rules.size(); ir0++) {
+                for (size_t ir1 = 0; ir1 < grammar->rules[ir0].size(); ir1++) {
+                    if (grammar->stacks[is][ie] == &grammar->rules[ir0][ir1]) {
+                        result->stacks[is][ie] = &result->rules[ir0][ir1];
+                    }
+                }
+            }
         }
     }

From d41f314740c40e99d58965bc1adb827ee29398b9 Mon Sep 17 00:00:00 2001
From: ochafik
Date: Sun, 28 Apr 2024 14:55:28 +0100
Subject: [PATCH 06/12] grammars: move token caches to llama_context

---
 llama.cpp | 27 ++++++++++++++++-----------
 llama.h   |  5 -----
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 704c5e24b40ca..0a44cf6689b8a 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2325,6 +2325,11 @@ struct llama_context {
     // control vectors
     struct llama_control_vector cvec;
 
+    // caching token pieces & their decoded codepoints.
+    std::vector<std::string> token_pieces;
+    std::vector<std::pair<std::vector<uint32_t>,
+                llama_partial_utf8>> token_codepoints;
+
 #ifdef GGML_USE_MPI
     ggml_mpi_context * ctx_mpi = NULL;
 #endif
@@ -13051,7 +13056,7 @@ struct llama_grammar * llama_grammar_init(
         }
     } while (true);
 
-    return new llama_grammar{ std::move(vec_rules), std::move(stacks), {}, {}, {} };
+    return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} };
 }
 
 void llama_grammar_free(struct llama_grammar * grammar) {
@@ -13059,7 +13064,7 @@ void llama_grammar_free(struct llama_grammar * grammar) {
 }
 
 struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) {
-    llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8, grammar->token_pieces, grammar->token_codepoints };
+    llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8 };
 
     // redirect elements in stacks to point to new rules
     for (size_t is = 0; is < result->stacks.size(); is++) {
@@ -13552,14 +13557,14 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
         }
     }
 
-    if (grammar->token_codepoints.empty()) {
+    if (ctx->token_codepoints.empty()) {
         auto n_vocab = llama_n_vocab(llama_get_model(ctx));
-        grammar->token_codepoints.resize(n_vocab);
-        grammar->token_pieces.resize(n_vocab);
+        ctx->token_codepoints.resize(n_vocab);
+        ctx->token_pieces.resize(n_vocab);
         for (llama_token id = 0; id < n_vocab; ++id) {
             const std::string piece = llama_token_to_piece(ctx, id, false);
-            grammar->token_pieces[id] = piece;
-            grammar->token_codepoints[id] = decode_utf8(piece, {0, 0});
+            ctx->token_pieces[id] = piece;
+            ctx->token_codepoints[id] = decode_utf8(piece, {0, 0});
         }
     }
 
@@ -13572,7 +13577,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
 
     for (size_t i = 0; i < candidates->size; ++i) {
         const llama_token id = candidates->data[i].id;
-        const auto & piece = grammar->token_pieces[id];
+        const auto & piece = ctx->token_pieces[id];
         if (llama_token_is_eog(&ctx->model, id)) {
             if (!allow_eog) {
                 candidates->data[i].logit = -INFINITY;
@@ -13580,7 +13585,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
         } else if (piece.empty() || piece[0] == 0) {
             candidates->data[i].logit = -INFINITY;
         } else if (grammar->partial_utf8.n_remain == 0){
-            const auto & decoded = grammar->token_codepoints.at(id);
+            const auto & decoded = ctx->token_codepoints.at(id);
             candidates_grammar.push_back({ i, decoded.first.data(), decoded.second });
         } else {
             candidates_decoded.push_back(decode_utf8(piece, grammar->partial_utf8));
@@ -13778,11 +13783,11 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
         GGML_ASSERT(false);
     }
 
-    const auto & piece = grammar->token_pieces.at(token);
+    const auto & piece = ctx->token_pieces.at(token);
 
     // Note terminating 0 in decoded string
     const auto decoded = grammar->partial_utf8.n_remain == 0
-        ? grammar->token_codepoints[token]
+        ? ctx->token_codepoints[token]
         : decode_utf8(piece, grammar->partial_utf8);
     const auto & code_points = decoded.first;
     std::vector<std::vector<const llama_grammar_element *>> tmp_new_stacks;
diff --git a/llama.h b/llama.h
index 13c5963539748..9c849c0555f56 100644
--- a/llama.h
+++ b/llama.h
@@ -1099,11 +1099,6 @@ struct llama_grammar {
 
     // buffer for partially generated UTF-8 sequence from accepted tokens
     llama_partial_utf8 partial_utf8;
-
-    // caching the token pieces & their decoded codepoints.
-    std::vector<std::string> token_pieces;
-    std::vector<std::pair<std::vector<uint32_t>,
-                llama_partial_utf8>> token_codepoints;
 };
 
 struct llama_grammar_candidate {

From 49f0faaa0e7f63a58b3eea3f4bba86e5170ff927 Mon Sep 17 00:00:00 2001
From: ochafik
Date: Sun, 28 Apr 2024 15:24:35 +0100
Subject: [PATCH 07/12] grammars: cache codepoints in llama_new_context_with_model

---
 llama.cpp | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 0a44cf6689b8a..31e7cbd9a45bd 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2328,7 +2328,7 @@ struct llama_context {
     // caching token pieces & their decoded codepoints.
     std::vector<std::string> token_pieces;
     std::vector<std::pair<std::vector<uint32_t>,
-                llama_partial_utf8>> token_codepoints;
+                llama_partial_utf8>> token_codepoints_without_partial_utf8_prefix;
 
 #ifdef GGML_USE_MPI
     ggml_mpi_context * ctx_mpi = NULL;
@@ -13557,17 +13557,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
         }
     }
-    if (ctx->token_codepoints.empty()) {
-        auto n_vocab = llama_n_vocab(llama_get_model(ctx));
-        ctx->token_codepoints.resize(n_vocab);
-        ctx->token_pieces.resize(n_vocab);
-        for (llama_token id = 0; id < n_vocab; ++id) {
-            const std::string piece = llama_token_to_piece(ctx, id, false);
-            ctx->token_pieces[id] = piece;
-            ctx->token_codepoints[id] = decode_utf8(piece, {0, 0});
-        }
-    }
-
+    // Store decoded codepoints when they are not cached.
     std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
     if (grammar->partial_utf8.n_remain > 0) {
         candidates_decoded.reserve(candidates->size);
     }
@@ -13585,7 +13575,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
         } else if (piece.empty() || piece[0] == 0) {
             candidates->data[i].logit = -INFINITY;
         } else if (grammar->partial_utf8.n_remain == 0){
-            const auto & decoded = ctx->token_codepoints.at(id);
+            const auto & decoded = ctx->token_codepoints_without_partial_utf8_prefix.at(id);
             candidates_grammar.push_back({ i, decoded.first.data(), decoded.second });
         } else {
             candidates_decoded.push_back(decode_utf8(piece, grammar->partial_utf8));
@@ -13787,7 +13777,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
 
     // Note terminating 0 in decoded string
     const auto decoded = grammar->partial_utf8.n_remain == 0
-        ? ctx->token_codepoints[token]
+        ? ctx->token_codepoints_without_partial_utf8_prefix[token]
         : decode_utf8(piece, grammar->partial_utf8);
     const auto & code_points = decoded.first;
     std::vector<std::vector<const llama_grammar_element *>> tmp_new_stacks;
@@ -15645,6 +15635,18 @@ struct llama_context * llama_new_context_with_model(
         }
     }
 
+    // cache tokens & their decoded codepoints (for common case where there's no partial utf8 prefix bytes) for grammar constrained sampling.
+    {
+        auto n_vocab = llama_n_vocab(llama_get_model(ctx));
+        ctx->token_codepoints_without_partial_utf8_prefix.resize(n_vocab);
+        ctx->token_pieces.resize(n_vocab);
+        for (llama_token id = 0; id < n_vocab; ++id) {
+            const std::string piece = llama_token_to_piece(ctx, id, false);
+            ctx->token_pieces[id] = piece;
+            ctx->token_codepoints_without_partial_utf8_prefix[id] = decode_utf8(piece, {0, 0});
+        }
+    }
+
 #ifdef GGML_USE_MPI
     ctx->ctx_mpi = ggml_mpi_init();

From d3425f5cf18278c91206f18a054bbe86ad0c09a7 Mon Sep 17 00:00:00 2001
From: ochafik
Date: Sun, 28 Apr 2024 15:28:58 +0100
Subject: [PATCH 08/12] grammar: nit (layout)

---
 llama.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 31e7cbd9a45bd..d63a2a8884f5a 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2326,9 +2326,9 @@ struct llama_context {
     struct llama_control_vector cvec;
 
     // caching token pieces & their decoded codepoints.
-    std::vector<std::string> token_pieces;
-    std::vector<std::pair<std::vector<uint32_t>,
-                llama_partial_utf8>> token_codepoints_without_partial_utf8_prefix;
+    std::vector<std::string>                            token_pieces;
+    std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>>
+                                                        token_codepoints_without_partial_utf8_prefix;
 
 #ifdef GGML_USE_MPI
     ggml_mpi_context * ctx_mpi = NULL;

From c70037f2b342f7c6d9e59d7a6532b6c075890873 Mon Sep 17 00:00:00 2001
From: ochafik
Date: Sun, 28 Apr 2024 15:34:56 +0100
Subject: [PATCH 09/12] grammars: nits (revert const grammar sig, fix comment)

---
 llama.cpp | 4 ++--
 llama.h   | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index d63a2a8884f5a..17b493746fe3f 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -13545,7 +13545,7 @@ void llama_sample_repetition_penalties(
     }
 }
 
-void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, struct llama_grammar * grammar) {
+void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) {
     GGML_ASSERT(ctx);
     const int64_t t_start_sample_us = ggml_time_us();
 
@@ -13557,7 +13557,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
         }
     }
-    // Store decoded codepoints when they are not cached.
+    // Store decoded codepoints when they are not cached (happens when there's a partial utf8 string prefix).
     std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
     if (grammar->partial_utf8.n_remain > 0) {
         candidates_decoded.reserve(candidates->size);
     }
diff --git a/llama.h b/llama.h
index 9c849c0555f56..8b1b15ed4ad55 100644
--- a/llama.h
+++ b/llama.h
@@ -961,7 +961,7 @@ extern "C" {
     LLAMA_API void llama_sample_grammar(
             struct llama_context * ctx,
           llama_token_data_array * candidates,
-            struct llama_grammar * grammar);
+      const struct llama_grammar * grammar);
 
     /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
From 80736c556b153d944cc5bb93b7e35bef9aa4dbf7 Mon Sep 17 00:00:00 2001
From: Olivier Chafik
Date: Wed, 1 May 2024 01:55:09 +0100
Subject: [PATCH 10/12] Update llama.cpp

Co-authored-by: Clint Herron
---
 llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index e7ce0fbb292bc..df381594582e6 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -15730,7 +15730,7 @@ struct llama_context * llama_new_context_with_model(
         }
     }
 
-    // cache tokens & their decoded codepoints (for common case where there's no partial utf8 prefix bytes) for grammar constrained sampling.
+    // cache tokens & their decoded codepoints (for common case where there's no partial utf8 prefix bytes) for grammar-constrained sampling.
     {
         auto n_vocab = llama_n_vocab(llama_get_model(ctx));
         ctx->token_codepoints_without_partial_utf8_prefix.resize(n_vocab);

From 939e143fe2ff1172bd8056c9b8dcbdb08b1c8e81 Mon Sep 17 00:00:00 2001
From: ochafik
Date: Sat, 18 May 2024 22:37:14 +0100
Subject: [PATCH 11/12] grammars: mutex-guarded lazy caching of token pieces in
 llama_sample_grammar

---
 llama.cpp | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index df381594582e6..319cb1a696d53 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2329,6 +2329,7 @@ struct llama_context {
     struct llama_control_vector cvec;
 
     // caching token pieces & their decoded codepoints.
+    std::mutex token_cache_mutex;
     std::vector<std::string>                            token_pieces;
     std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>>
                                                         token_codepoints_without_partial_utf8_prefix;
@@ -13624,6 +13625,21 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
         }
     }
 
+    {
+        // cache tokens & their decoded codepoints (for common case where there's no partial utf8 prefix bytes) for grammar-constrained sampling.
+        std::unique_lock<std::mutex> lock(ctx->token_cache_mutex);
+        if (ctx->token_pieces.empty()) {
+            auto n_vocab = llama_n_vocab(llama_get_model(ctx));
+            ctx->token_codepoints_without_partial_utf8_prefix.resize(n_vocab);
+            ctx->token_pieces.resize(n_vocab);
+            for (llama_token id = 0; id < n_vocab; ++id) {
+                const std::string piece = llama_token_to_piece(ctx, id, false);
+                ctx->token_pieces[id] = piece;
+                ctx->token_codepoints_without_partial_utf8_prefix[id] = decode_utf8(piece, {0, 0});
+            }
+        }
+    }
+
     // Store decoded codepoints when they are not cached (happens when there's a partial utf8 string prefix).
     std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
     if (grammar->partial_utf8.n_remain > 0) {
@@ -15730,18 +15746,6 @@ struct llama_context * llama_new_context_with_model(
         }
     }
 
-    // cache tokens & their decoded codepoints (for common case where there's no partial utf8 prefix bytes) for grammar-constrained sampling.
-    {
-        auto n_vocab = llama_n_vocab(llama_get_model(ctx));
-        ctx->token_codepoints_without_partial_utf8_prefix.resize(n_vocab);
-        ctx->token_pieces.resize(n_vocab);
-        for (llama_token id = 0; id < n_vocab; ++id) {
-            const std::string piece = llama_token_to_piece(ctx, id, false);
-            ctx->token_pieces[id] = piece;
-            ctx->token_codepoints_without_partial_utf8_prefix[id] = decode_utf8(piece, {0, 0});
-        }
-    }
-
 #ifdef GGML_USE_MPI
     ctx->ctx_mpi = ggml_mpi_init();

From 60745acad4b0782ec8dbe37f75d044d8859646af Mon Sep 17 00:00:00 2001
From: ochafik
Date: Sat, 18 May 2024 22:37:58 +0100
Subject: [PATCH 12/12] grammars: remove early exit -->
 https://github.com/ggerganov/llama.cpp/pull/7370

---
 llama.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 319cb1a696d53..931a84c68274d 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -13043,10 +13043,6 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
         }
     }
 
-    if (next_candidates.empty()) {
-        return rejects;
-    }
-
     const auto * stack_pos_after = llama_grammar_match_char(stack_pos, 0).second;
 
     // update top of stack to next element, if any
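
Context for reviewers: every commit above targets the per-token cost of grammar-constrained sampling. The sketch below shows the consumer-side step that exercises these code paths, written against the pre-refactor sampling API visible in the diffs (llama_sample_grammar, llama_grammar_accept_token, llama_sample_token_greedy); the helper name pick_token_with_grammar and the greedy pick are illustrative assumptions, not code from this PR.

    // Sketch of one grammar-constrained sampling step (assumed usage, not part of the patches).
    #include "llama.h"

    #include <vector>

    static llama_token pick_token_with_grammar(llama_context * ctx, llama_grammar * grammar, int32_t idx) {
        const int n_vocab = llama_n_vocab(llama_get_model(ctx));

        // Build the candidate array from the logits of output position `idx`.
        const float * logits = llama_get_logits_ith(ctx, idx);
        std::vector<llama_token_data> cur;
        cur.reserve(n_vocab);
        for (llama_token id = 0; id < n_vocab; ++id) {
            cur.push_back({ id, logits[id], 0.0f });
        }
        llama_token_data_array candidates = { cur.data(), cur.size(), false };

        // Masks out candidates the grammar cannot accept; this is the call whose
        // per-token cost the series reduces (token pieces/codepoints are read from
        // a cache instead of being re-decoded for every candidate).
        llama_sample_grammar(ctx, &candidates, grammar);

        // Pick among the remaining candidates (greedy here for brevity).
        const llama_token token = llama_sample_token_greedy(ctx, &candidates);

        // Advance the grammar state with the accepted token.
        llama_grammar_accept_token(ctx, grammar, token);
        return token;
    }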