From 0b4fc09ae44ff796b389a2d1c7f2f2c7a90da43a Mon Sep 17 00:00:00 2001
From: Jhen
Date: Tue, 23 Jul 2024 15:42:50 +0800
Subject: [PATCH] feat: sync llama.cpp

---
 cpp/ggml-quants.c |  2 +-
 cpp/ggml.c        |  2 +-
 cpp/llama.cpp     | 60 ++++++++++++++++++++++++++++-------------------
 cpp/llama.h       |  3 +++
 llama.cpp         |  2 +-
 5 files changed, 42 insertions(+), 27 deletions(-)

diff --git a/cpp/ggml-quants.c b/cpp/ggml-quants.c
index 7dd0f96..6804b28 100644
--- a/cpp/ggml-quants.c
+++ b/cpp/ggml-quants.c
@@ -4748,7 +4748,7 @@ void lm_ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void
 
         int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
 
-        sumf += (LM_GGML_FP16_TO_FP32(x[i].d)*LM_GGML_FP16_TO_FP32(y[i].d)) * sumi;
+        sumf += (LM_GGML_FP16_TO_FP32(x[ib].d)*LM_GGML_FP16_TO_FP32(y[ib].d)) * sumi;
     }
 
 #elif defined(__POWER9_VECTOR__)
diff --git a/cpp/ggml.c b/cpp/ggml.c
index acd2314..05bcb3d 100644
--- a/cpp/ggml.c
+++ b/cpp/ggml.c
@@ -21015,7 +21015,7 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg
             lm_gguf_tensor_info_sanitize(info);
 
             // make sure there is no duplicated tensor names
-            for (uint64_t j = 0; j < i; ++j) {
+            for (uint64_t j = 0; j < i && ok; ++j) {
                 if (strcmp(info->name.data, ctx->infos[j].name.data) == 0) {
                     fprintf(stderr, "%s: duplicated tensor name %s\n", __func__, info->name.data);
                     ok = false;
diff --git a/cpp/llama.cpp b/cpp/llama.cpp
index 5883887..fce2269 100644
--- a/cpp/llama.cpp
+++ b/cpp/llama.cpp
@@ -3718,7 +3718,7 @@ struct llama_model_loader {
         }
 
         if (param_overrides_p != nullptr) {
-            for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) {
+            for (const struct llama_model_kv_override * p = param_overrides_p; p->key[0] != 0; p++) {
                 kv_overrides.insert({std::string(p->key), *p});
             }
         }
@@ -3886,7 +3886,7 @@ struct llama_model_loader {
             ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);
 
             {
-                const int kid = lm_gguf_find_key(meta, "general.file_type");
+                const int kid = lm_gguf_find_key(meta, "general.file_type"); // TODO: use LLM_KV
                 if (kid >= 0) {
                     ftype = (llama_ftype) lm_gguf_get_val_u32(meta, kid);
                 }
@@ -5018,7 +5018,7 @@ static void llm_load_hparams(
         {
             ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
             switch (hparams.n_layer) {
-                case 42: model.type = e_model::MODEL_SMALL; break;
+                case 42: model.type = e_model::MODEL_7B; break;
                 default: model.type = e_model::MODEL_UNKNOWN;
             }
         } break;
@@ -5380,6 +5380,7 @@ static void llm_load_vocab(
         if (merges_keyidx == -1) {
             throw std::runtime_error("cannot find tokenizer merges in model file\n");
         }
+
         const int n_merges = lm_gguf_get_arr_n(ctx, merges_keyidx);
         for (int i = 0; i < n_merges; i++) {
             const std::string word = lm_gguf_get_arr_str(ctx, merges_keyidx, i);
@@ -5418,16 +5419,6 @@ static void llm_load_vocab(
         vocab.special_cls_id = -1;
         vocab.special_mask_id = -1;
 
-        const int add_space_prefix_keyidx = lm_gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
-        if (add_space_prefix_keyidx != -1) {
-            vocab.tokenizer_add_space_prefix = lm_gguf_get_val_bool(ctx, add_space_prefix_keyidx);
-        } // The default value of add_space_prefix is true.
-
-        const int remove_extra_whitespaces_keyidx = lm_gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS).c_str());
-        if (remove_extra_whitespaces_keyidx != -1) {
-            vocab.tokenizer_remove_extra_whitespaces = lm_gguf_get_val_bool(ctx, remove_extra_whitespaces_keyidx);
-        } // The default value of remove_extra_whitespaces is false.
-
         const int precompiled_charsmap_keyidx = lm_gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
         if (precompiled_charsmap_keyidx != -1) {
             size_t n_precompiled_charsmap = lm_gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
@@ -5535,6 +5526,19 @@
             } else if (
                 tokenizer_pre == "jais") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
+            } else if (
+                tokenizer_pre == "tekken") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_TEKKEN;
+                vocab.tokenizer_clean_spaces = false;
+                vocab.tokenizer_ignore_merges = true;
+                vocab.tokenizer_add_bos = true;
+            } else if (
+                tokenizer_pre == "smollm") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
+                vocab.tokenizer_clean_spaces = false;
+            } else if (
+                tokenizer_pre == "codeshell") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
@@ -5558,10 +5562,8 @@
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
         }
 
-        const int add_space_prefix_keyidx = lm_gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
-        if (add_space_prefix_keyidx != -1) {
-            vocab.tokenizer_add_space_prefix = lm_gguf_get_val_bool(ctx, add_space_prefix_keyidx);
-        }
+        ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX, vocab.tokenizer_add_space_prefix, false);
+        ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, vocab.tokenizer_remove_extra_whitespaces, false);
     }
 
     const int token_idx = lm_gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
@@ -6142,10 +6144,10 @@ static bool llm_load_tensors(
 
                     layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
 
-                    layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
-                    layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
-                    layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
-                    layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                    layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
+                    layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+                    layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+                    layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
 
                     // optional bias tensors
                     layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
@@ -15559,6 +15561,8 @@ struct llm_tokenizer_bpe {
             case LLAMA_VOCAB_PRE_TYPE_STARCODER:
             case LLAMA_VOCAB_PRE_TYPE_REFACT:
             case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
+            case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
+            case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
                 regex_exprs = {
                     "\\p{N}",
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
@@ -15596,6 +15600,13 @@ struct llm_tokenizer_bpe {
                     "\\p{N}",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_TEKKEN:
+                // original regex from tokenizer.json
+                // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+                regex_exprs = {
+                    "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {
@@ -18286,8 +18297,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
     // copy the KV pairs from the input file
     lm_gguf_set_kv (ctx_out, ml.meta);
-    lm_gguf_set_val_u32(ctx_out, "general.quantization_version", LM_GGML_QNT_VERSION);
-    lm_gguf_set_val_u32(ctx_out, "general.file_type", ftype);
+    lm_gguf_set_val_u32(ctx_out, "general.quantization_version", LM_GGML_QNT_VERSION); // TODO: use LLM_KV
+    lm_gguf_set_val_u32(ctx_out, "general.file_type", ftype); // TODO: use LLM_KV
+
     // Remove split metadata
     lm_gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
     lm_gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str());
@@ -19450,7 +19462,6 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_BAICHUAN:
         case LLM_ARCH_STARCODER:
         case LLM_ARCH_PLAMO:
-        case LLM_ARCH_CODESHELL:
         case LLM_ARCH_ORION:
         case LLM_ARCH_INTERNLM2:
         case LLM_ARCH_MINICPM:
@@ -19480,6 +19491,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_STARCODER2:
         case LLM_ARCH_OPENELM:
         case LLM_ARCH_GPTNEOX:
+        case LLM_ARCH_CODESHELL:
             return LLAMA_ROPE_TYPE_NEOX;
 
         // all model arches should be listed explicitly here
diff --git a/cpp/llama.h b/cpp/llama.h
index 54cd2ed..7ca44d5 100644
--- a/cpp/llama.h
+++ b/cpp/llama.h
@@ -92,6 +92,9 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
         LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
         LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
+        LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
+        LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
+        LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
     };
 
     // note: these values should be synchronized with lm_ggml_rope
diff --git a/llama.cpp b/llama.cpp
index c3776ca..081fe43 160000
--- a/llama.cpp
+++ b/llama.cpp
@@ -1 +1 @@
-Subproject commit c3776cacabce2ee35f172fb72be7a519752125fa
+Subproject commit 081fe431aa8fb6307145c4feb3eed4f48cab19f8
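
Note on the ggml-quants.c hunk above: in lm_ggml_vec_dot_q5_0_q8_0 the RISC-V path walks the quantized blocks with the index `ib`, so the final scale multiply must read the fp16 deltas of that same block; the old `x[i].d`/`y[i].d` paired each block's integer sum with the scales of the wrong block. The C sketch below illustrates the intended per-block pattern. It is a simplified illustration, not ggml's real code: blk_a/blk_b are hypothetical stand-ins that keep only the per-block scale `d` as a plain float (the real block_q5_0/block_q8_0 store an fp16 scale plus packed quants), and sumi stands for the integer dot products that the vector code computes per block.

#include <stdio.h>

/* Hypothetical, trimmed-down quant blocks: only the per-block scale is kept. */
typedef struct { float d; } blk_a; /* stand-in for block_q5_0 */
typedef struct { float d; } blk_b; /* stand-in for block_q8_0 */

/* Per-block dot product skeleton: block ib's integer sum is scaled by the
 * deltas of the same block, x[ib].d * y[ib].d. Reading the scales through a
 * stale index (x[i], y[i]) is exactly the bug the hunk fixes. */
static float vec_dot_sketch(int nb, const blk_a * x, const blk_b * y, const int * sumi) {
    float sumf = 0.0f;
    for (int ib = 0; ib < nb; ++ib) {
        sumf += (x[ib].d * y[ib].d) * sumi[ib];
    }
    return sumf;
}

int main(void) {
    blk_a x[2] = { {0.5f}, {2.0f} };
    blk_b y[2] = { {1.0f}, {0.25f} };
    int sumi[2] = { 10, 8 };
    printf("%g\n", vec_dot_sketch(2, x, y, sumi)); /* 0.5*1.0*10 + 2.0*0.25*8 = 9 */
    return 0;
}

Running it prints 9, each block contributing with its own pair of scales; a fixed index would instead reuse a single block's deltas for the whole row, and the error would grow with the number of blocks.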