feat: sync llama.cpp #68

Merged · 1 commit · Jul 23, 2024
2 changes: 1 addition & 1 deletion cpp/ggml-quants.c
@@ -4748,7 +4748,7 @@ void lm_ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void

int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);

- sumf += (LM_GGML_FP16_TO_FP32(x[i].d)*LM_GGML_FP16_TO_FP32(y[i].d)) * sumi;
+ sumf += (LM_GGML_FP16_TO_FP32(x[ib].d)*LM_GGML_FP16_TO_FP32(y[ib].d)) * sumi;
}

#elif defined(__POWER9_VECTOR__)
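For context, a simplified scalar sketch of how these block-quantized dot products accumulate; the block layout below is hypothetical and this is not the actual RISC-V/SIMD path. The point of the fix is that the fp16 scales belong to the block index (ib), not the inner element index (i):

#include <stdint.h>

// hypothetical block layout: one scale plus 32 signed 8-bit quants (illustrative only)
typedef struct { float d; int8_t qs[32]; } block_example;

static float dot_blocks(const block_example * x, const block_example * y, int nblocks) {
    float sumf = 0.0f;
    for (int ib = 0; ib < nblocks; ++ib) {
        int sumi = 0;
        for (int i = 0; i < 32; ++i) {
            sumi += x[ib].qs[i] * y[ib].qs[i];   // integer dot product within the block
        }
        sumf += x[ib].d * y[ib].d * sumi;        // scale by the deltas of block ib, not element i
    }
    return sumf;
}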
2 changes: 1 addition & 1 deletion cpp/ggml.c
@@ -21015,7 +21015,7 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg
lm_gguf_tensor_info_sanitize(info);

// make sure there is no duplicated tensor names
- for (uint64_t j = 0; j < i; ++j) {
+ for (uint64_t j = 0; j < i && ok; ++j) {
if (strcmp(info->name.data, ctx->infos[j].name.data) == 0) {
fprintf(stderr, "%s: duplicated tensor name %s\n", __func__, info->name.data);
ok = false;
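A minimal sketch of what the added `&& ok` buys (names simplified, not the actual lm_gguf code): once a duplicate is reported and `ok` flips to false, both loops stop instead of scanning the rest of the names.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

// simplified duplicate-name check with the short-circuit exit
static bool names_are_unique(const char ** names, uint64_t n) {
    bool ok = true;
    for (uint64_t i = 0; i < n && ok; ++i) {
        for (uint64_t j = 0; j < i && ok; ++j) {   // '&& ok' ends the scan after the first duplicate
            if (strcmp(names[i], names[j]) == 0) {
                fprintf(stderr, "duplicated tensor name %s\n", names[i]);
                ok = false;
            }
        }
    }
    return ok;
}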
60 changes: 36 additions & 24 deletions cpp/llama.cpp
@@ -3718,7 +3718,7 @@ struct llama_model_loader {
}

if (param_overrides_p != nullptr) {
- for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) {
+ for (const struct llama_model_kv_override * p = param_overrides_p; p->key[0] != 0; p++) {
kv_overrides.insert({std::string(p->key), *p});
}
}
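The loop above relies on a sentinel-terminated array: the caller ends the override list with an entry whose key is an empty string. A self-contained sketch of that convention (the struct here is a stand-in, not the real llama_model_kv_override):

#include <cstdio>
#include <vector>

// assumed minimal shape of an override entry -- the real struct also carries a value and a type tag
struct kv_override_sketch { char key[128]; };

int main() {
    std::vector<kv_override_sketch> overrides(2);
    std::snprintf(overrides[0].key, sizeof(overrides[0].key), "%s", "general.name"); // one override entry
    overrides[1].key[0] = 0;                           // empty key acts as the sentinel that ends the walk
    for (const kv_override_sketch * p = overrides.data(); p->key[0] != 0; p++) {
        std::printf("override: %s\n", p->key);         // same termination test as the loader loop above
    }
    return 0;
}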
@@ -3886,7 +3886,7 @@ struct llama_model_loader {
ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);

{
- const int kid = lm_gguf_find_key(meta, "general.file_type");
+ const int kid = lm_gguf_find_key(meta, "general.file_type"); // TODO: use LLM_KV
if (kid >= 0) {
ftype = (llama_ftype) lm_gguf_get_val_u32(meta, kid);
}
@@ -5018,7 +5018,7 @@ static void llm_load_hparams(
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
switch (hparams.n_layer) {
- case 42: model.type = e_model::MODEL_SMALL; break;
+ case 42: model.type = e_model::MODEL_7B; break;
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
@@ -5380,6 +5380,7 @@ static void llm_load_vocab(
if (merges_keyidx == -1) {
throw std::runtime_error("cannot find tokenizer merges in model file\n");
}

const int n_merges = lm_gguf_get_arr_n(ctx, merges_keyidx);
for (int i = 0; i < n_merges; i++) {
const std::string word = lm_gguf_get_arr_str(ctx, merges_keyidx, i);
@@ -5418,16 +5419,6 @@ static void llm_load_vocab(
vocab.special_cls_id = -1;
vocab.special_mask_id = -1;

- const int add_space_prefix_keyidx = lm_gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
- if (add_space_prefix_keyidx != -1) {
-     vocab.tokenizer_add_space_prefix = lm_gguf_get_val_bool(ctx, add_space_prefix_keyidx);
- } // The default value of add_space_prefix is true.
-
- const int remove_extra_whitespaces_keyidx = lm_gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS).c_str());
- if (remove_extra_whitespaces_keyidx != -1) {
-     vocab.tokenizer_remove_extra_whitespaces = lm_gguf_get_val_bool(ctx, remove_extra_whitespaces_keyidx);
- } // The default value of remove_extra_whitespaces is false.
-
const int precompiled_charsmap_keyidx = lm_gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
if (precompiled_charsmap_keyidx != -1) {
size_t n_precompiled_charsmap = lm_gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
@@ -5535,6 +5526,19 @@ static void llm_load_vocab(
} else if (
tokenizer_pre == "jais") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
+ } else if (
+     tokenizer_pre == "tekken") {
+     vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_TEKKEN;
+     vocab.tokenizer_clean_spaces = false;
+     vocab.tokenizer_ignore_merges = true;
+     vocab.tokenizer_add_bos = true;
+ } else if (
+     tokenizer_pre == "smollm") {
+     vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
+     vocab.tokenizer_clean_spaces = false;
+ } else if (
+     tokenizer_pre == "codeshell") {
+     vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
} else {
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
}
@@ -5558,10 +5562,8 @@
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
}

- const int add_space_prefix_keyidx = lm_gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
- if (add_space_prefix_keyidx != -1) {
-     vocab.tokenizer_add_space_prefix = lm_gguf_get_val_bool(ctx, add_space_prefix_keyidx);
- }
+ ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX, vocab.tokenizer_add_space_prefix, false);
+ ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, vocab.tokenizer_remove_extra_whitespaces, false);
}

const int token_idx = lm_gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
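The removed lm_gguf_find_key / lm_gguf_get_val_bool pairs become calls to the loader's get_key helper with the last argument false, i.e. "optional: keep the default if the key is absent". A hedged sketch of that pattern (a generic stand-in, not the loader's actual implementation):

#include <map>
#include <stdexcept>
#include <string>

// hedged sketch of the "optional key with default" pattern used above
template <typename T>
bool get_key_sketch(const std::map<std::string, T> & kv, const std::string & key, T & dst, bool required) {
    auto it = kv.find(key);
    if (it == kv.end()) {
        if (required) throw std::runtime_error("key not found: " + key);
        return false;   // dst keeps its default, e.g. tokenizer_add_space_prefix stays true
    }
    dst = it->second;
    return true;
}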
@@ -6142,10 +6144,10 @@ static bool llm_load_tensors(

layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});

- layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
- layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
- layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
- layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});

// optional bias tensors
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
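With these shapes the projections no longer assume n_embd == n_embd_head_k * n_head, and the K/V projections are sized by the (possibly smaller) grouped-query head count. A hedged shape check with illustrative numbers (assumed values, not taken from any specific model):

// illustrative hparams (assumed values, for the arithmetic only)
constexpr int n_embd        = 4096;
constexpr int n_head        = 32;
constexpr int n_head_kv     = 8;
constexpr int n_embd_head_k = 128;
constexpr int n_embd_head_v = 128;
constexpr int n_embd_k_gqa  = n_embd_head_k * n_head_kv;   // 128 * 8 = 1024
constexpr int n_embd_v_gqa  = n_embd_head_v * n_head_kv;   // 128 * 8 = 1024
// resulting tensor shapes:
//   wq: {n_embd, n_embd_head_k * n_head} = {4096, 4096}
//   wk: {n_embd, n_embd_k_gqa}           = {4096, 1024}
//   wv: {n_embd, n_embd_v_gqa}           = {4096, 1024}
//   wo: {n_embd_head_k * n_head, n_embd} = {4096, 4096}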
@@ -15559,6 +15561,8 @@ struct llm_tokenizer_bpe {
case LLAMA_VOCAB_PRE_TYPE_STARCODER:
case LLAMA_VOCAB_PRE_TYPE_REFACT:
case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
+ case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
+ case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
regex_exprs = {
"\\p{N}",
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
@@ -15596,6 +15600,13 @@ struct llm_tokenizer_bpe {
"\\p{N}",
};
break;
+ case LLAMA_VOCAB_PRE_TYPE_TEKKEN:
+     // original regex from tokenizer.json
+     // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+     regex_exprs = {
+         "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+     };
+     break;
default:
// default regex for BPE tokenization pre-processing
regex_exprs = {
@@ -18286,8 +18297,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

// copy the KV pairs from the input file
lm_gguf_set_kv (ctx_out, ml.meta);
lm_gguf_set_val_u32(ctx_out, "general.quantization_version", LM_GGML_QNT_VERSION);
lm_gguf_set_val_u32(ctx_out, "general.file_type", ftype);
lm_gguf_set_val_u32(ctx_out, "general.quantization_version", LM_GGML_QNT_VERSION); // TODO: use LLM_KV
lm_gguf_set_val_u32(ctx_out, "general.file_type", ftype); // TODO: use LLM_KV

// Remove split metadata
lm_gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
lm_gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str());
@@ -19450,7 +19462,6 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER:
case LLM_ARCH_PLAMO:
- case LLM_ARCH_CODESHELL:
case LLM_ARCH_ORION:
case LLM_ARCH_INTERNLM2:
case LLM_ARCH_MINICPM:
@@ -19480,6 +19491,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_STARCODER2:
case LLM_ARCH_OPENELM:
case LLM_ARCH_GPTNEOX:
+ case LLM_ARCH_CODESHELL:
return LLAMA_ROPE_TYPE_NEOX;

// all model arches should be listed explicitly here
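Moving CODESHELL from the default/NORM group into this NEOX group is visible to callers through the public accessor shown in this function; a hedged usage sketch:

#include "llama.h"

// hedged usage sketch: report whether a loaded model uses the NEOX RoPE variant
static bool uses_neox_rope(const struct llama_model * model) {
    return llama_rope_type(model) == LLAMA_ROPE_TYPE_NEOX;   // CodeShell models now take this branch
}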
3 changes: 3 additions & 0 deletions cpp/llama.h
@@ -92,6 +92,9 @@ extern "C" {
LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
+ LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
+ LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
+ LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
};

// note: these values should be synchronized with lm_ggml_rope
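These enum values are selected on the loader side (see the tokenizer_pre dispatch in cpp/llama.cpp above) from the pre-tokenizer name string stored in the model metadata. A hedged sketch of that mapping, not the library's actual helper:

#include <string>
#include "llama.h"

// hedged sketch mirroring the string -> pre-tokenizer-type dispatch added in cpp/llama.cpp above
static enum llama_vocab_pre_type pre_type_from_string(const std::string & name) {
    if (name == "tekken")    return LLAMA_VOCAB_PRE_TYPE_TEKKEN;
    if (name == "smollm")    return LLAMA_VOCAB_PRE_TYPE_SMOLLM;
    if (name == "codeshell") return LLAMA_VOCAB_PRE_TYPE_CODESHELL;
    return LLAMA_VOCAB_PRE_TYPE_DEFAULT;
}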
2 changes: 1 addition & 1 deletion llama.cpp