diff --git a/cpp/build-info.h b/cpp/build-info.h deleted file mode 100644 index b795570..0000000 --- a/cpp/build-info.h +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef BUILD_INFO_H -#define BUILD_INFO_H - -#define BUILD_NUMBER 1461 -#define BUILD_COMMIT "c43c2da" -#define BUILD_COMPILER "" -#define BUILD_TARGET "unknown" - -#endif // BUILD_INFO_H diff --git a/cpp/common.cpp b/cpp/common.cpp index 1aa8b1c..f737b29 100644 --- a/cpp/common.cpp +++ b/cpp/common.cpp @@ -1,5 +1,4 @@ #include "common.h" -#include "build-info.h" #include "llama.h" #include @@ -219,12 +218,52 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } params.rope_freq_scale = std::stof(argv[i]); + } else if (arg == "--rope-scaling") { + if (++i >= argc) { + invalid_param = true; + break; + } + std::string value(argv[i]); + /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; } + else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; } + else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; } + else { invalid_param = true; break; } } else if (arg == "--rope-scale") { if (++i >= argc) { invalid_param = true; break; } params.rope_freq_scale = 1.0f/std::stof(argv[i]); + } else if (arg == "--yarn-orig-ctx") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.yarn_orig_ctx = std::stoi(argv[i]); + } else if (arg == "--yarn-ext-factor") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.yarn_ext_factor = std::stof(argv[i]); + } else if (arg == "--yarn-attn-factor") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.yarn_attn_factor = std::stof(argv[i]); + } else if (arg == "--yarn-beta-fast") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.yarn_beta_fast = std::stof(argv[i]); + } else if (arg == "--yarn-beta-slow") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.yarn_beta_slow = std::stof(argv[i]); } else if (arg == "--memory-f32") { params.memory_f16 = false; } else if (arg == "--top-p") { @@ -716,9 +755,16 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --cfg-negative-prompt-file FNAME\n"); printf(" negative prompt file to use for guidance. (default: empty)\n"); printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", sparams.cfg_scale); - printf(" --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale\n"); + printf(" --rope-scaling {none,linear,yarn}\n"); + printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n"); + printf(" --rope-scale N RoPE context scaling factor, expands context by a factor of N\n"); printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n"); - printf(" --rope-freq-scale N RoPE frequency linear scaling factor (default: loaded from model)\n"); + printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n"); + printf(" --yarn-orig-ctx N YaRN: original context size of model (default: 0 = model training context size)\n"); + printf(" --yarn-ext-factor N YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n"); + printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n"); + printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow); + printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast); printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); printf(" --no-penalize-nl do not penalize newline token\n"); printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); @@ -826,17 +872,23 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) { auto cparams = llama_context_default_params(); - cparams.n_ctx = params.n_ctx; - cparams.n_batch = params.n_batch; - cparams.n_threads = params.n_threads; - cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; - cparams.mul_mat_q = params.mul_mat_q; - cparams.seed = params.seed; - cparams.f16_kv = params.memory_f16; - cparams.logits_all = params.logits_all; - cparams.embedding = params.embedding; - cparams.rope_freq_base = params.rope_freq_base; - cparams.rope_freq_scale = params.rope_freq_scale; + cparams.n_ctx = params.n_ctx; + cparams.n_batch = params.n_batch; + cparams.n_threads = params.n_threads; + cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; + cparams.mul_mat_q = params.mul_mat_q; + cparams.seed = params.seed; + cparams.f16_kv = params.memory_f16; + cparams.logits_all = params.logits_all; + cparams.embedding = params.embedding; + cparams.rope_scaling_type = params.rope_scaling_type; + cparams.rope_freq_base = params.rope_freq_base; + cparams.rope_freq_scale = params.rope_freq_scale; + cparams.yarn_ext_factor = params.yarn_ext_factor; + cparams.yarn_attn_factor = params.yarn_attn_factor; + cparams.yarn_beta_fast = params.yarn_beta_fast; + cparams.yarn_beta_slow = params.yarn_beta_slow; + cparams.yarn_orig_ctx = params.yarn_orig_ctx; return cparams; } @@ -1146,8 +1198,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc) { const llama_sampling_params & sparams = params.sparams; - fprintf(stream, "build_commit: %s\n", BUILD_COMMIT); - fprintf(stream, "build_number: %d\n", BUILD_NUMBER); fprintf(stream, "cpu_has_arm_fma: %s\n", lm_ggml_cpu_has_arm_fma() ? "true" : "false"); fprintf(stream, "cpu_has_avx: %s\n", lm_ggml_cpu_has_avx() ? "true" : "false"); fprintf(stream, "cpu_has_avx2: %s\n", lm_ggml_cpu_has_avx2() ? "true" : "false"); diff --git a/cpp/common.h b/cpp/common.h index 343b272..72a49b8 100644 --- a/cpp/common.h +++ b/cpp/common.h @@ -9,6 +9,7 @@ #define LOG_NO_FILE_LINE_FUNCTION #include "log.h" +#include #include #include #include @@ -25,11 +26,17 @@ #define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0) #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0) -#define print_build_info() do { \ - fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); \ - fprintf(stderr, "%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET); \ +#define print_build_info() do { \ + fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \ + fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \ } while(0) +// build info +extern int LLAMA_BUILD_NUMBER; +extern char const *LLAMA_COMMIT; +extern char const *LLAMA_COMPILER; +extern char const *LLAMA_BUILD_TARGET; + // // CLI argument parsing // @@ -54,6 +61,12 @@ struct gpt_params { int32_t n_beams = 0; // if non-zero then use beam search of given width. float rope_freq_base = 0.0f; // RoPE base frequency float rope_freq_scale = 0.0f; // RoPE frequency scaling factor + float yarn_ext_factor = NAN; // YaRN extrapolation mix factor + float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor + float yarn_beta_fast = 32.0f;// YaRN low correction dim + float yarn_beta_slow = 1.0f; // YaRN high correction dim + int32_t yarn_orig_ctx = 0; // YaRN original context length + int8_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; // // sampling parameters struct llama_sampling_params sparams; diff --git a/cpp/ggml-metal-llama.metal b/cpp/ggml-metal-llama.metal index f315277..7c35f23 100644 --- a/cpp/ggml-metal-llama.metal +++ b/cpp/ggml-metal-llama.metal @@ -1061,6 +1061,45 @@ kernel void kernel_alibi_f32( } } +static float rope_yarn_ramp(const float low, const float high, const int i0) { + const float y = (i0 / 2 - low) / max(0.001f, high - low); + return 1.0f - min(1.0f, max(0.0f, y)); +} + +// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn +// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. +static void rope_yarn( + float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale, + thread float * cos_theta, thread float * sin_theta +) { + // Get n-d rotational scaling corrected for extrapolation + float theta_interp = freq_scale * theta_extrap; + float theta = theta_interp; + if (ext_factor != 0.0f) { + float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; + theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; + + // Get n-d magnitude scaling corrected for interpolation + mscale *= 1.0f + 0.1f * log(1.0f / freq_scale); + } + *cos_theta = cos(theta) * mscale; + *sin_theta = sin(theta) * mscale; +} + +// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get +// `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))` +static float rope_yarn_corr_factor(int n_dims, int n_orig_ctx, float n_rot, float base) { + return n_dims * log(n_orig_ctx / (n_rot * 2 * M_PI_F)) / (2 * log(base)); +} + +static void rope_yarn_corr_dims( + int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2] +) { + // start and end correction dims + dims[0] = max(0.0f, floor(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_fast, freq_base))); + dims[1] = min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_slow, freq_base))); +} + typedef void (rope_t)( device const void * src0, device const int32_t * src1, @@ -1084,8 +1123,13 @@ typedef void (rope_t)( constant int & n_past, constant int & n_dims, constant int & mode, + constant int & n_orig_ctx, constant float & freq_base, constant float & freq_scale, + constant float & ext_factor, + constant float & attn_factor, + constant float & beta_fast, + constant float & beta_slow, uint tiitg[[thread_index_in_threadgroup]], uint3 tptg[[threads_per_threadgroup]], uint3 tgpig[[threadgroup_position_in_grid]]); @@ -1114,8 +1158,13 @@ kernel void kernel_rope( constant int & n_past, constant int & n_dims, constant int & mode, + constant int & n_orig_ctx, constant float & freq_base, constant float & freq_scale, + constant float & ext_factor, + constant float & attn_factor, + constant float & beta_fast, + constant float & beta_slow, uint tiitg[[thread_index_in_threadgroup]], uint3 tptg[[threads_per_threadgroup]], uint3 tgpig[[threadgroup_position_in_grid]]) { @@ -1125,19 +1174,22 @@ kernel void kernel_rope( const bool is_neox = mode & 2; + float corr_dims[2]; + rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims); + device const int32_t * pos = src1; const int64_t p = pos[i2]; - const float theta_0 = freq_scale * (float)p; + const float theta_0 = (float)p; const float inv_ndims = -1.f/n_dims; if (!is_neox) { for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) { const float theta = theta_0 * pow(freq_base, inv_ndims*i0); - const float cos_theta = cos(theta); - const float sin_theta = sin(theta); + float cos_theta, sin_theta; + rope_yarn(theta, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta); device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); @@ -1152,9 +1204,12 @@ kernel void kernel_rope( for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) { - const float theta = theta_0 * pow(freq_base, inv_ndims*ic - ib); - const float cos_theta = cos(theta); - const float sin_theta = sin(theta); + // simplified from `(ib * n_dims + ic) * inv_ndims` + const float cur_rot = inv_ndims*ic - ib; + + const float theta = theta_0 * pow(freq_base, cur_rot); + float cos_theta, sin_theta; + rope_yarn(theta, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta); const int64_t i0 = ib*n_dims + ic/2; diff --git a/cpp/ggml-metal.m b/cpp/ggml-metal.m index 2b123c9..6dc61d5 100644 --- a/cpp/ggml-metal.m +++ b/cpp/ggml-metal.m @@ -1400,14 +1400,18 @@ void lm_ggml_metal_graph_compute( const int nth = MIN(1024, ne00); - const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_dims = ((int32_t *) dst->op_params)[1]; - const int mode = ((int32_t *) dst->op_params)[2]; - - float freq_base; - float freq_scale; - memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_orig_ctx = ((int32_t *) dst->op_params)[3]; + + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; + memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); switch (src0->type) { case LM_GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_rope_f32]; break; @@ -1415,30 +1419,35 @@ void lm_ggml_metal_graph_compute( default: LM_GGML_ASSERT(false); }; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3]; - [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:4]; - [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:5]; - [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:6]; - [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:7]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:8]; - [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:9]; - [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:10]; - [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:11]; - [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:12]; - [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:13]; - [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:14]; - [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:15]; - [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:16]; - [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:17]; - [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:18]; - [encoder setBytes:&n_past length:sizeof( int) atIndex:19]; - [encoder setBytes:&n_dims length:sizeof( int) atIndex:20]; - [encoder setBytes:&mode length:sizeof( int) atIndex:21]; - [encoder setBytes:&freq_base length:sizeof(float) atIndex:22]; - [encoder setBytes:&freq_scale length:sizeof(float) atIndex:23]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3]; + [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:4]; + [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:5]; + [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:6]; + [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:7]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:8]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:10]; + [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:11]; + [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:12]; + [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:13]; + [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:14]; + [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:15]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:16]; + [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:17]; + [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:18]; + [encoder setBytes:&n_past length:sizeof( int) atIndex:19]; + [encoder setBytes:&n_dims length:sizeof( int) atIndex:20]; + [encoder setBytes:&mode length:sizeof( int) atIndex:21]; + [encoder setBytes:&n_orig_ctx length:sizeof( int) atIndex:22]; + [encoder setBytes:&freq_base length:sizeof( float) atIndex:23]; + [encoder setBytes:&freq_scale length:sizeof( float) atIndex:24]; + [encoder setBytes:&ext_factor length:sizeof( float) atIndex:25]; + [encoder setBytes:&attn_factor length:sizeof( float) atIndex:26]; + [encoder setBytes:&beta_fast length:sizeof( float) atIndex:27]; + [encoder setBytes:&beta_slow length:sizeof( float) atIndex:28]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; diff --git a/cpp/ggml.c b/cpp/ggml.c index ac433c4..9688880 100644 --- a/cpp/ggml.c +++ b/cpp/ggml.c @@ -1,4 +1,5 @@ #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows +#define _USE_MATH_DEFINES // For M_PI on MSVC #include "ggml-impl.h" #include "ggml-quants.h" @@ -4845,8 +4846,13 @@ static struct lm_ggml_tensor * lm_ggml_rope_impl( int n_dims, int mode, int n_ctx, + int n_orig_ctx, float freq_base, float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow, float xpos_base, bool xpos_down, bool inplace) { @@ -4862,11 +4868,15 @@ static struct lm_ggml_tensor * lm_ggml_rope_impl( struct lm_ggml_tensor * result = inplace ? lm_ggml_view_tensor(ctx, a) : lm_ggml_dup_tensor(ctx, a); - int32_t params[8] = { /*n_past*/ 0, n_dims, mode, n_ctx }; - memcpy(params + 4, &freq_base, sizeof(float)); - memcpy(params + 5, &freq_scale, sizeof(float)); - memcpy(params + 6, &xpos_base, sizeof(float)); - memcpy(params + 7, &xpos_down, sizeof(bool)); + int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx }; + memcpy(params + 5, &freq_base, sizeof(float)); + memcpy(params + 6, &freq_scale, sizeof(float)); + memcpy(params + 7, &ext_factor, sizeof(float)); + memcpy(params + 8, &attn_factor, sizeof(float)); + memcpy(params + 9, &beta_fast, sizeof(float)); + memcpy(params + 10, &beta_slow, sizeof(float)); + memcpy(params + 11, &xpos_base, sizeof(float)); + memcpy(params + 12, &xpos_down, sizeof(bool)); lm_ggml_set_op_params(result, params, sizeof(params)); result->op = LM_GGML_OP_ROPE; @@ -4884,7 +4894,9 @@ struct lm_ggml_tensor * lm_ggml_rope( int n_dims, int mode, int n_ctx) { - return lm_ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, false); + return lm_ggml_rope_impl( + ctx, a, b, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, false + ); } struct lm_ggml_tensor * lm_ggml_rope_inplace( @@ -4894,7 +4906,9 @@ struct lm_ggml_tensor * lm_ggml_rope_inplace( int n_dims, int mode, int n_ctx) { - return lm_ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, true); + return lm_ggml_rope_impl( + ctx, a, b, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, true + ); } struct lm_ggml_tensor * lm_ggml_rope_custom( @@ -4904,9 +4918,17 @@ struct lm_ggml_tensor * lm_ggml_rope_custom( int n_dims, int mode, int n_ctx, + int n_orig_ctx, float freq_base, - float freq_scale) { - return lm_ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, false); + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow) { + return lm_ggml_rope_impl( + ctx, a, b, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, false + ); } struct lm_ggml_tensor * lm_ggml_rope_custom_inplace( @@ -4916,9 +4938,17 @@ struct lm_ggml_tensor * lm_ggml_rope_custom_inplace( int n_dims, int mode, int n_ctx, + int n_orig_ctx, float freq_base, - float freq_scale) { - return lm_ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, true); + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow) { + return lm_ggml_rope_impl( + ctx, a, b, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, true + ); } struct lm_ggml_tensor * lm_ggml_rope_xpos_inplace( @@ -4928,7 +4958,7 @@ struct lm_ggml_tensor * lm_ggml_rope_xpos_inplace( int n_dims, float base, bool down) { - return lm_ggml_rope_impl(ctx, a, b, n_dims, 0, 0, 10000.0f, 1.0f, base, down, true); + return lm_ggml_rope_impl(ctx, a, b, n_dims, 0, 0, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, base, down, true); } // lm_ggml_rope_back @@ -10901,6 +10931,45 @@ static void lm_ggml_compute_forward_clamp( // lm_ggml_compute_forward_rope +static float rope_yarn_ramp(const float low, const float high, const int i0) { + const float y = (i0 / 2 - low) / MAX(0.001f, high - low); + return 1 - MIN(1, MAX(0, y)); +} + +// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn +// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. +static void rope_yarn( + float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale, + float * cos_theta, float * sin_theta +) { + // Get n-d rotational scaling corrected for extrapolation + float theta_interp = freq_scale * theta_extrap; + float theta = theta_interp; + if (ext_factor != 0.0f) { + float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; + theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; + + // Get n-d magnitude scaling corrected for interpolation + mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale); + } + *cos_theta = cosf(theta) * mscale; + *sin_theta = sinf(theta) * mscale; +} + +// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get +// `corr_dim(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))` +static float lm_ggml_rope_yarn_corr_dim(int n_dims, int n_orig_ctx, float n_rot, float base) { + return n_dims * logf(n_orig_ctx / (n_rot * 2 * (float)M_PI)) / (2 * logf(base)); +} + +void lm_ggml_rope_yarn_corr_dims( + int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2] +) { + // start and end correction dims + dims[0] = MAX(0, floorf(lm_ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_fast, freq_base))); + dims[1] = MIN(n_dims - 1, ceilf(lm_ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_slow, freq_base))); +} + static void lm_ggml_compute_forward_rope_f32( const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, @@ -10910,21 +10979,26 @@ static void lm_ggml_compute_forward_rope_f32( return; } - float freq_base; - float freq_scale; + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; // these two only relevant for xPos RoPE: float xpos_base; bool xpos_down; - //const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_dims = ((int32_t *) dst->op_params)[1]; - const int mode = ((int32_t *) dst->op_params)[2]; - const int n_ctx = ((int32_t *) dst->op_params)[3]; - memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); - memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float)); - memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool)); + //const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_ctx = ((int32_t *) dst->op_params)[3]; + const int n_orig_ctx = ((int32_t *) dst->op_params)[4]; + + memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); + memcpy(&xpos_base, (int32_t *) dst->op_params + 11, sizeof(float)); + memcpy(&xpos_down, (int32_t *) dst->op_params + 12, sizeof(bool)); LM_GGML_TENSOR_UNARY_OP_LOCALS @@ -10952,6 +11026,9 @@ static void lm_ggml_compute_forward_rope_f32( int ir = 0; const float theta_scale = powf(freq_base, -2.0f/n_dims); + const float inv_ndims = -1.f/n_dims; + float corr_dims[2]; + lm_ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims); const bool is_neox = mode & 2; const bool is_glm = mode & 4; @@ -10965,18 +11042,18 @@ static void lm_ggml_compute_forward_rope_f32( if (ir++ < ir0) continue; if (ir > ir1) break; - float theta = freq_scale * (float)p; + float theta_base = (float)p; if (is_glm) { - theta = MIN(p, n_ctx - 2); + theta_base = MIN(p, n_ctx - 2); float block_theta = MAX(p - (n_ctx - 2), 0); for (int64_t i0 = 0; i0 < ne0 / 4; i0++) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta_base); + const float sin_theta = sinf(theta_base); const float cos_block_theta = cosf(block_theta); const float sin_block_theta = sinf(block_theta); - theta *= theta_scale; + theta_base *= theta_scale; block_theta *= theta_scale; const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); @@ -10994,13 +11071,16 @@ static void lm_ggml_compute_forward_rope_f32( } } else if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + float cos_theta, sin_theta; + rope_yarn( + theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta + ); + // zeta scaling for xPos only: float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f; if (xpos_down) zeta = 1.0f / zeta; - theta *= theta_scale; + theta_base *= theta_scale; const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); @@ -11014,12 +11094,19 @@ static void lm_ggml_compute_forward_rope_f32( } else { // TODO: this might be wrong for ne0 != n_dims - need double check // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28 + theta_base *= freq_scale; for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { for (int64_t ic = 0; ic < n_dims; ic += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + // simplified from `(ib * n_dims + ic) * inv_ndims` + float cur_rot = inv_ndims * ic - ib; - theta *= theta_scale; + float cos_theta, sin_theta; + rope_yarn( + theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, + &cos_theta, &sin_theta + ); + + theta_base *= theta_scale; const int64_t i0 = ib*n_dims + ic/2; @@ -11048,15 +11135,19 @@ static void lm_ggml_compute_forward_rope_f16( return; } - float freq_base; - float freq_scale; + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; - //const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_dims = ((int32_t *) dst->op_params)[1]; - const int mode = ((int32_t *) dst->op_params)[2]; - const int n_ctx = ((int32_t *) dst->op_params)[3]; - memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); + //const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_ctx = ((int32_t *) dst->op_params)[3]; + const int n_orig_ctx = ((int32_t *) dst->op_params)[4]; + memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); LM_GGML_TENSOR_UNARY_OP_LOCALS @@ -11084,6 +11175,9 @@ static void lm_ggml_compute_forward_rope_f16( int ir = 0; const float theta_scale = powf(freq_base, -2.0f/n_dims); + const float inv_ndims = -1.f/n_dims; + float corr_dims[2]; + lm_ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims); const bool is_neox = mode & 2; const bool is_glm = mode & 4; @@ -11097,18 +11191,18 @@ static void lm_ggml_compute_forward_rope_f16( if (ir++ < ir0) continue; if (ir > ir1) break; - float theta = freq_scale * (float)p; + float theta_base = (float)p; if (is_glm) { - theta = MIN(p, n_ctx - 2); + theta_base = MIN(p, n_ctx - 2); float block_theta = MAX(p - (n_ctx - 2), 0); for (int64_t i0 = 0; i0 < ne0 / 4; i0++) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta_base); + const float sin_theta = sinf(theta_base); const float cos_block_theta = cosf(block_theta); const float sin_block_theta = sinf(block_theta); - theta *= theta_scale; + theta_base *= theta_scale; block_theta *= theta_scale; const lm_ggml_fp16_t * const src = (lm_ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); @@ -11126,10 +11220,12 @@ static void lm_ggml_compute_forward_rope_f16( } } else if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + float cos_theta, sin_theta; + rope_yarn( + theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta + ); - theta *= theta_scale; + theta_base *= theta_scale; const lm_ggml_fp16_t * const src = (lm_ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); lm_ggml_fp16_t * dst_data = (lm_ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); @@ -11143,12 +11239,19 @@ static void lm_ggml_compute_forward_rope_f16( } else { // TODO: this might be wrong for ne0 != n_dims - need double check // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28 + theta_base *= freq_scale; for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { for (int64_t ic = 0; ic < n_dims; ic += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + // simplified from `(ib * n_dims + ic) * inv_ndims` + float cur_rot = inv_ndims * ic - ib; + + float cos_theta, sin_theta; + rope_yarn( + theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, + &cos_theta, &sin_theta + ); - theta *= theta_scale; + theta_base *= theta_scale; const int64_t i0 = ib*n_dims + ic/2; @@ -11256,17 +11359,18 @@ static void lm_ggml_compute_forward_rope_back_f32( if (ir++ < ir0) continue; if (ir > ir1) break; - float theta = freq_scale * (float)p; + float theta_base = freq_scale * (float)p; if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta_base); + const float sin_theta = sinf(theta_base); + // zeta scaling for xPos only: float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f; if (xpos_down) zeta = 1.0f / zeta; - theta *= theta_scale; + theta_base *= theta_scale; const float * const dy = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); float * dx = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); @@ -11280,10 +11384,10 @@ static void lm_ggml_compute_forward_rope_back_f32( } else { for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { for (int64_t ic = 0; ic < n_dims; ic += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta_base); + const float sin_theta = sinf(theta_base); - theta *= theta_scale; + theta_base *= theta_scale; const int64_t i0 = ib*n_dims + ic/2; @@ -11356,14 +11460,14 @@ static void lm_ggml_compute_forward_rope_back_f16( if (ir++ < ir0) continue; if (ir > ir1) break; - float theta = (float)p; + float theta_base = (float)p; if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta_base); + const float sin_theta = sinf(theta_base); - theta *= theta_scale; + theta_base *= theta_scale; const lm_ggml_fp16_t * const dy = (lm_ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); lm_ggml_fp16_t * dx = (lm_ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); @@ -11377,10 +11481,10 @@ static void lm_ggml_compute_forward_rope_back_f16( } else { for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { for (int64_t ic = 0; ic < n_dims; ic += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta_base); + const float sin_theta = sinf(theta_base); - theta *= theta_scale; + theta_base *= theta_scale; const int64_t i0 = ib*n_dims + ic/2; @@ -15505,9 +15609,14 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm src1, n_dims, mode, + 0, n_ctx, freq_base, freq_scale, + 0.0f, + 1.0f, + 0.0f, + 0.0f, xpos_base, xpos_down, false), @@ -18702,8 +18811,7 @@ static bool lm_gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offs return n == size; } -// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 -static bool lm_gguf_fread_str_cur(FILE * file, struct lm_gguf_str * p, size_t * offset) { +static bool lm_gguf_fread_str(FILE * file, struct lm_gguf_str * p, size_t * offset) { p->n = 0; p->data = NULL; @@ -18715,19 +18823,6 @@ static bool lm_gguf_fread_str_cur(FILE * file, struct lm_gguf_str * p, size_t * return ok; } -static bool lm_gguf_fread_str_v1(FILE * file, struct lm_gguf_str * p, size_t * offset) { - p->n = 0; - p->data = NULL; - - bool ok = true; - - uint32_t n = 0; - ok = ok && lm_gguf_fread_el(file, &n, sizeof(n), offset); p->data = calloc(n + 1, 1); p->n = n; - ok = ok && lm_gguf_fread_el(file, p->data, p->n, offset); - - return ok; -} - struct lm_gguf_context * lm_gguf_init_empty(void) { struct lm_gguf_context * ctx = LM_GGML_ALIGNED_MALLOC(sizeof(struct lm_gguf_context)); @@ -18786,20 +18881,14 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg ctx->data = NULL; ok = ok && lm_gguf_fread_el(file, &ctx->header.version, sizeof(ctx->header.version), &offset); + ok = ok && lm_gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset); + ok = ok && lm_gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset); if (ctx->header.version == 1) { - // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 - uint32_t n_tensors = 0; - uint32_t n_kv = 0; - - ok = ok && lm_gguf_fread_el(file, &n_tensors, sizeof(n_tensors), &offset); - ok = ok && lm_gguf_fread_el(file, &n_kv, sizeof(n_kv), &offset); - - ctx->header.n_tensors = n_tensors; - ctx->header.n_kv = n_kv; - } else { - ok = ok && lm_gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset); - ok = ok && lm_gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset); + fprintf(stderr, "%s: GGUFv1 is no longer supported. please use a more up-to-date version\n", __func__); + fclose(file); + lm_gguf_free(ctx); + return NULL; } if (!ok) { @@ -18810,12 +18899,6 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg } } - // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 - bool (* lm_gguf_fread_str)(FILE *, struct lm_gguf_str *, size_t *) = lm_gguf_fread_str_cur; - if (ctx->header.version == 1) { - lm_gguf_fread_str = lm_gguf_fread_str_v1; - } - // read the kv pairs { ctx->kv = malloc(ctx->header.n_kv * sizeof(struct lm_gguf_kv)); @@ -18846,15 +18929,7 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg case LM_GGUF_TYPE_ARRAY: { ok = ok && lm_gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset); - - if (ctx->header.version == 1) { - // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 - uint32_t n = 0; - ok = ok && lm_gguf_fread_el(file, &n, sizeof(n), &offset); - kv->value.arr.n = n; - } else { - ok = ok && lm_gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset); - } + ok = ok && lm_gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset); switch (kv->value.arr.type) { case LM_GGUF_TYPE_UINT8: @@ -18913,14 +18988,7 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg ok = ok && lm_gguf_fread_str(file, &info->name, &offset); ok = ok && lm_gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset); for (uint32_t j = 0; j < info->n_dims; ++j) { - if (ctx->header.version == 1) { - // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 - uint32_t t = 0; - ok = ok && lm_gguf_fread_el(file, &t, sizeof(t), &offset); - info->ne[j] = t; - } else { - ok = ok && lm_gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset); - } + ok = ok && lm_gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset); } ok = ok && lm_gguf_fread_el (file, &info->type, sizeof(info->type), &offset); ok = ok && lm_gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset); diff --git a/cpp/ggml.h b/cpp/ggml.h index 1b065cd..d13ee9b 100644 --- a/cpp/ggml.h +++ b/cpp/ggml.h @@ -219,7 +219,7 @@ #define LM_GGML_MAX_CONTEXTS 64 #define LM_GGML_MAX_SRC 6 #define LM_GGML_MAX_NAME 64 -#define LM_GGML_MAX_OP_PARAMS 32 +#define LM_GGML_MAX_OP_PARAMS 64 #define LM_GGML_DEFAULT_N_THREADS 4 #if UINTPTR_MAX == 0xFFFFFFFF @@ -1326,8 +1326,13 @@ extern "C" { int n_dims, int mode, int n_ctx, + int n_orig_ctx, float freq_base, - float freq_scale); + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow); // in-place, returns view(a) LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope_custom_inplace( @@ -1337,8 +1342,17 @@ extern "C" { int n_dims, int mode, int n_ctx, + int n_orig_ctx, float freq_base, - float freq_scale); + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow); + + // compute correction dims for YaRN RoPE scaling + void lm_ggml_rope_yarn_corr_dims( + int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]); // xPos RoPE, in-place, returns view(a) LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope_xpos_inplace( diff --git a/cpp/llama.cpp b/cpp/llama.cpp index eab2049..220e14d 100644 --- a/cpp/llama.cpp +++ b/cpp/llama.cpp @@ -54,6 +54,7 @@ #include #include #include +#include #include #include #include @@ -246,6 +247,10 @@ enum llm_kv { LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_FREQ_BASE, LLM_KV_ROPE_SCALE_LINEAR, + LLM_KV_ROPE_SCALING_TYPE, + LLM_KV_ROPE_SCALING_FACTOR, + LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, + LLM_KV_ROPE_SCALING_FINETUNED, LLM_KV_TOKENIZER_MODEL, LLM_KV_TOKENIZER_LIST, @@ -287,9 +292,13 @@ static std::map LLM_KV_NAMES = { { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" }, { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" }, - { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, - { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, - { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" }, + { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, + { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, + { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" }, + { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" }, + { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" }, + { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" }, + { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" }, { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" }, @@ -563,6 +572,22 @@ do { \ } \ } while (0) +static std::map LLAMA_ROPE_SCALING_TYPES = { + { LLAMA_ROPE_SCALING_NONE, "none" }, + { LLAMA_ROPE_SCALING_LINEAR, "linear" }, + { LLAMA_ROPE_SCALING_YARN, "yarn" }, +}; + +static int8_t llama_rope_scaling_type_from_string(const std::string & name) { + for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) { + if (kv.second == name) { + return kv.first; + } + } + + return LLAMA_ROPE_SCALING_UNSPECIFIED; +} + // // ggml helpers // @@ -1046,8 +1071,11 @@ struct llama_hparams { float f_norm_eps; float f_norm_rms_eps; - float rope_freq_base_train; - float rope_freq_scale_train; + float rope_freq_base_train; + float rope_freq_scale_train; + uint32_t n_yarn_orig_ctx; + int8_t rope_scaling_type_train : 3; + bool rope_finetuned : 1; float f_clamp_kqv; float f_max_alibi_bias; @@ -1062,6 +1090,8 @@ struct llama_hparams { if (this->n_layer != other.n_layer) return true; if (this->n_rot != other.n_rot) return true; if (this->n_ff != other.n_ff) return true; + if (this->rope_finetuned != other.rope_finetuned) return true; + if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true; const float EPSILON = 1e-9; @@ -1092,8 +1122,16 @@ struct llama_cparams { uint32_t n_threads; // number of threads to use for generation uint32_t n_threads_batch; // number of threads to use for batch processing - float rope_freq_base; - float rope_freq_scale; + float rope_freq_base; + float rope_freq_scale; + + uint32_t n_yarn_orig_ctx; + // These hyperparameters are not exposed in GGUF, because all + // existing YaRN models use the same values for them. + float yarn_ext_factor; + float yarn_attn_factor; + float yarn_beta_fast; + float yarn_beta_slow; bool mul_mat_q; }; @@ -1810,6 +1848,12 @@ struct llama_model_loader { throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str())); } + if (backend == LM_GGML_BACKEND_GPU_SPLIT) { + if (ne.size() == 1) { + throw std::runtime_error(format("%s: 1-dimensional tensor '%s' cannot be split on the GPU", __func__, name.c_str())); + } + } + { bool is_ok = true; for (size_t i = 0; i < ne.size(); ++i) { @@ -2025,14 +2069,30 @@ static void llm_load_hparams( hparams.n_head_kv = hparams.n_head; LM_GGUF_GET_KEY(ctx, hparams.n_head_kv, lm_gguf_get_val_u32, LM_GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV)); + hparams.rope_finetuned = false; + LM_GGUF_GET_KEY(ctx, hparams.rope_finetuned, lm_gguf_get_val_bool, LM_GGUF_TYPE_BOOL, false, + kv(LLM_KV_ROPE_SCALING_FINETUNED)); + + hparams.n_yarn_orig_ctx = hparams.n_ctx_train; + LM_GGUF_GET_KEY(ctx, hparams.n_yarn_orig_ctx, lm_gguf_get_val_u32, LM_GGUF_TYPE_UINT32, false, + kv(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN)); + // rope_freq_base (optional) hparams.rope_freq_base_train = 10000.0f; LM_GGUF_GET_KEY(ctx, hparams.rope_freq_base_train, lm_gguf_get_val_f32, LM_GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE)); + std::string rope_scaling("linear"); + LM_GGUF_GET_KEY(ctx, rope_scaling, lm_gguf_get_val_str, LM_GGUF_TYPE_STRING, false, kv(LLM_KV_ROPE_SCALING_TYPE)); + hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling); + LM_GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_UNSPECIFIED); + // rope_freq_scale (inverse of the kv) is optional - float ropescale = 1.0f; - LM_GGUF_GET_KEY(ctx, ropescale, lm_gguf_get_val_f32, LM_GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); - hparams.rope_freq_scale_train = 1.0f/ropescale; + float ropescale = 0.0f; + LM_GGUF_GET_KEY(ctx, ropescale, lm_gguf_get_val_f32, LM_GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALING_FACTOR)); + if (ropescale == 0.0f) { // try the old key name + LM_GGUF_GET_KEY(ctx, ropescale, lm_gguf_get_val_f32, LM_GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); + } + hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale; // sanity check for n_rot (optional) { @@ -2382,6 +2442,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { const auto & hparams = model.hparams; const auto & vocab = model.vocab; + const auto rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train); + // hparams LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver)); LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str()); @@ -2400,8 +2462,11 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv); LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias); LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff); + LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str()); LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train); LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train); + LLAMA_LOG_INFO("%s: n_yarn_orig_ctx = %u\n", __func__, hparams.n_yarn_orig_ctx); + LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown"); LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type)); LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str()); LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9); @@ -2769,8 +2834,8 @@ static void llm_load_tensors( layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); if (backend == LM_GGML_BACKEND_GPU) { vram_weights += @@ -2829,13 +2894,13 @@ static void llm_load_tensors( layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split); + layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split); + layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split); + layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split); + layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); layer.attn_q_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}, backend); @@ -2901,19 +2966,19 @@ static void llm_load_tensors( layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split); + layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split); + layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split); + layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); if (backend == LM_GGML_BACKEND_GPU) { vram_weights += @@ -3058,21 +3123,11 @@ static void llm_load_tensors( model.t_load_us = lm_ggml_time_us() - model.t_start_us; } -static bool llama_model_load( - const std::string & fname, - llama_model & model, - int n_gpu_layers, - int main_gpu, - const float * tensor_split, - bool use_mmap, - bool use_mlock, - bool vocab_only, - llama_progress_callback progress_callback, - void *progress_callback_user_data) { +static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) { try { - llama_model_loader ml(fname, use_mmap); + llama_model_loader ml(fname, params.use_mmap); - model.hparams.vocab_only = vocab_only; + model.hparams.vocab_only = params.vocab_only; llm_load_arch (ml, model); llm_load_hparams(ml, model); @@ -3084,15 +3139,15 @@ static bool llama_model_load( throw std::runtime_error("vocab size mismatch"); } - if (vocab_only) { + if (params.vocab_only) { LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__); return true; } llm_load_tensors( - ml, model, n_gpu_layers, - main_gpu, tensor_split, - use_mlock, progress_callback, progress_callback_user_data); + ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock, + params.progress_callback, params.progress_callback_user_data + ); } catch (const std::exception & err) { LLAMA_LOG_ERROR("error loading model: %s\n", err.what()); return false; @@ -3161,6 +3216,7 @@ static struct lm_ggml_tensor * llm_build_inp_embd( static void llm_build_k_shift( struct lm_ggml_context * ctx, const llama_hparams & hparams, + const llama_cparams & cparams, const llama_kv_cache & kv, struct lm_ggml_cgraph * graph, llm_rope_type type, @@ -3173,6 +3229,11 @@ static void llm_build_k_shift( const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_embd_gqa = hparams.n_embd_gqa(); const int64_t n_embd_head = hparams.n_embd_head(); + const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx; + const float ext_factor = cparams.yarn_ext_factor; + const float attn_factor = cparams.yarn_attn_factor; + const float beta_fast = cparams.yarn_beta_fast; + const float beta_slow = cparams.yarn_beta_slow; LM_GGML_ASSERT(n_embd_head % n_rot == 0); @@ -3196,7 +3257,8 @@ static void llm_build_k_shift( lm_ggml_element_size(kv.k)*n_embd_head, lm_ggml_element_size(kv.k)*n_embd_gqa, lm_ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il), - K_shift, n_rot, rope_type, 0, freq_base, freq_scale); + K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); cb(tmp, "K_shifted", il); lm_ggml_build_forward_expand(graph, tmp); } @@ -3453,12 +3515,17 @@ struct llm_build_context { const float freq_base; const float freq_scale; + const float ext_factor; + const float attn_factor; + const float beta_fast; + const float beta_slow; const float norm_eps; const float norm_rms_eps; const int32_t n_tokens; const int32_t n_kv; // size of KV cache to consider (n_kv <= n_ctx) const int32_t kv_head; // index of where we store new KV data in the cache + const int32_t n_orig_ctx; const bool do_rope_shift; @@ -3488,11 +3555,16 @@ struct llm_build_context { n_embd_gqa (hparams.n_embd_gqa()), freq_base (cparams.rope_freq_base), freq_scale (cparams.rope_freq_scale), + ext_factor (cparams.yarn_ext_factor), + attn_factor (cparams.yarn_attn_factor), + beta_fast (cparams.yarn_beta_fast), + beta_slow (cparams.yarn_beta_slow), norm_eps (hparams.f_norm_eps), norm_rms_eps (hparams.f_norm_rms_eps), n_tokens (batch.n_tokens), n_kv (worst_case ? n_ctx : kv_self.n), kv_head (worst_case ? n_ctx - n_tokens : kv_self.head), + n_orig_ctx (cparams.n_yarn_orig_ctx), do_rope_shift (worst_case || kv_self.has_shift), cb (cb), buf_compute (lctx.buf_compute) { @@ -3543,7 +3615,7 @@ struct llm_build_context { // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -3567,10 +3639,18 @@ struct llm_build_context { struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); - Qcur = lm_ggml_rope_custom(ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); + Qcur = lm_ggml_rope_custom( + ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); cb(Qcur, "Qcur", il); - Kcur = lm_ggml_rope_custom(ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); + Kcur = lm_ggml_rope_custom( + ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); cb(Kcur, "Kcur", il); llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); @@ -3645,7 +3725,7 @@ struct llm_build_context { // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -3669,8 +3749,16 @@ struct llm_build_context { switch (model.type) { case MODEL_7B: - Qcur = lm_ggml_rope_custom(ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); - Kcur = lm_ggml_rope_custom(ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); + Qcur = lm_ggml_rope_custom( + ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + Kcur = lm_ggml_rope_custom( + ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); break; case MODEL_13B: Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, n_tokens); @@ -3757,7 +3845,7 @@ struct llm_build_context { // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -3797,10 +3885,16 @@ struct llm_build_context { Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); // using mode = 2 for neox mode - Qcur = lm_ggml_rope_custom(ctx0, Qcur, inp_pos, n_embd_head, 2, 0, freq_base, freq_scale); + Qcur = lm_ggml_rope_custom( + ctx0, Qcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); cb(Qcur, "Qcur", il); - Kcur = lm_ggml_rope_custom(ctx0, Kcur, inp_pos, n_embd_head, 2, 0, freq_base, freq_scale); + Kcur = lm_ggml_rope_custom( + ctx0, Kcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); cb(Kcur, "Kcur", il); llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); @@ -3971,7 +4065,7 @@ struct llm_build_context { cb(KQ_mask, "KQ_mask", -1); if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -4064,13 +4158,15 @@ struct llm_build_context { cb(kpass, "kpass", il); struct lm_ggml_tensor * qrotated = lm_ggml_rope_custom( - ctx0, qrot, inp_pos, n_rot, 2, 0, freq_base, freq_scale - ); + ctx0, qrot, inp_pos, n_rot, 2, 0, n_orig_ctx, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); cb(qrotated, "qrotated", il); struct lm_ggml_tensor * krotated = lm_ggml_rope_custom( - ctx0, krot, inp_pos, n_rot, 2, 0, freq_base, freq_scale - ); + ctx0, krot, inp_pos, n_rot, 2, 0, n_orig_ctx, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); cb(krotated, "krotated", il); // ggml currently only supports concatenation on dim=2 @@ -7894,8 +7990,14 @@ struct llama_context_params llama_context_default_params() { /*.n_batch =*/ 512, /*.n_threads =*/ LM_GGML_DEFAULT_N_THREADS, // TODO: better default /*.n_threads_batch =*/ LM_GGML_DEFAULT_N_THREADS, + /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED, /*.rope_freq_base =*/ 0.0f, /*.rope_freq_scale =*/ 0.0f, + /*.yarn_ext_factor =*/ NAN, + /*.yarn_attn_factor =*/ 1.0f, + /*.yarn_beta_fast =*/ 32.0f, + /*.yarn_beta_slow =*/ 1.0f, + /*.yarn_orig_ctx =*/ 0, /*.mul_mat_q =*/ true, /*.f16_kv =*/ true, /*.logits_all =*/ false, @@ -7982,10 +8084,7 @@ struct llama_model * llama_load_model_from_file( }; } - if (!llama_model_load(path_model, *model, params.n_gpu_layers, - params.main_gpu, params.tensor_split, - params.use_mmap, params.use_mlock, params.vocab_only, - params.progress_callback, params.progress_callback_user_data)) { + if (!llama_model_load(path_model, *model, params)) { LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); delete model; return nullptr; @@ -8011,13 +8110,35 @@ struct llama_context * llama_new_context_with_model( const auto & hparams = model->hparams; auto & cparams = ctx->cparams; - cparams.n_batch = params.n_batch; - cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; - cparams.rope_freq_base = params.rope_freq_base == 0 ? hparams.rope_freq_base_train : params.rope_freq_base; - cparams.rope_freq_scale = params.rope_freq_scale == 0 ? hparams.rope_freq_scale_train : params.rope_freq_scale; - cparams.n_threads = params.n_threads; - cparams.n_threads_batch = params.n_threads_batch; - cparams.mul_mat_q = params.mul_mat_q; + cparams.n_batch = params.n_batch; + cparams.n_threads = params.n_threads; + cparams.n_threads_batch = params.n_threads_batch; + cparams.yarn_ext_factor = params.yarn_ext_factor; + cparams.yarn_attn_factor = params.yarn_attn_factor; + cparams.yarn_beta_fast = params.yarn_beta_fast; + cparams.yarn_beta_slow = params.yarn_beta_slow; + cparams.mul_mat_q = params.mul_mat_q; + + cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; + cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; + cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale; + + cparams.n_yarn_orig_ctx = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx : + hparams.n_yarn_orig_ctx != 0 ? hparams.n_yarn_orig_ctx : + hparams.n_ctx_train; + + auto rope_scaling_type = params.rope_scaling_type; + if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) { + rope_scaling_type = hparams.rope_scaling_type_train; + } + + if (rope_scaling_type == LLAMA_ROPE_SCALING_NONE) { + cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none + } + + if (std::isnan(cparams.yarn_ext_factor)) { // NaN indicates 'not set' + cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_YARN ? 1.0f : 0.0f; + } if (params.seed == LLAMA_DEFAULT_SEED) { params.seed = time(NULL); diff --git a/cpp/llama.h b/cpp/llama.h index 491fdb6..4b1ceec 100644 --- a/cpp/llama.h +++ b/cpp/llama.h @@ -106,6 +106,14 @@ extern "C" { LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; + enum llama_rope_scaling_type { + LLAMA_ROPE_SCALING_UNSPECIFIED = -1, + LLAMA_ROPE_SCALING_NONE = 0, + LLAMA_ROPE_SCALING_LINEAR = 1, + LLAMA_ROPE_SCALING_YARN = 2, + LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN, + }; + typedef struct llama_token_data { llama_token id; // token id float logit; // log-odds of the token @@ -172,10 +180,16 @@ extern "C" { uint32_t n_batch; // prompt processing maximum batch size uint32_t n_threads; // number of threads to use for generation uint32_t n_threads_batch; // number of threads to use for batch processing + int8_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type` // ref: https://github.com/ggerganov/llama.cpp/pull/2054 - float rope_freq_base; // RoPE base frequency, 0 = from model - float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model + float rope_freq_base; // RoPE base frequency, 0 = from model + float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model + float yarn_ext_factor; // YaRN extrapolation mix factor, NaN = from model + float yarn_attn_factor; // YaRN magnitude scaling factor + float yarn_beta_fast; // YaRN low correction dim + float yarn_beta_slow; // YaRN high correction dim + uint32_t yarn_orig_ctx; // YaRN original context size // Keep the booleans together to avoid misalignment during copy-by-value. bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true) diff --git a/llama.cpp b/llama.cpp index c43c2da..629f917 160000 --- a/llama.cpp +++ b/llama.cpp @@ -1 +1 @@ -Subproject commit c43c2da8afacaddfe51c09b21dbd9922cd0ea46b +Subproject commit 629f917cd6b96ba1274c49a8aab163b1b189229d diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index 28c7cb2..7fd9bc8 100755 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -3,11 +3,6 @@ git submodule init git submodule update --recursive -cd llama.cpp -./scripts/build-info.sh > build-info.h -cd - - -cp ./llama.cpp/build-info.h ./cpp/build-info.h cp ./llama.cpp/ggml.h ./cpp/ggml.h cp ./llama.cpp/ggml.c ./cpp/ggml.c cp ./llama.cpp/ggml-metal.h ./cpp/ggml-metal.h @@ -73,6 +68,7 @@ echo "Replacement completed successfully!" yarn example # Apply patch +patch -p0 -d ./cpp < ./scripts/common.cpp.patch patch -p0 -d ./cpp < ./scripts/log.h.patch patch -p0 -d ./cpp < ./scripts/llama.cpp.patch patch -p0 -d ./cpp < ./scripts/ggml-metal.m.patch diff --git a/scripts/common.cpp.patch b/scripts/common.cpp.patch new file mode 100644 index 0000000..3d335f0 --- /dev/null +++ b/scripts/common.cpp.patch @@ -0,0 +1,11 @@ +--- common.cpp.orig 2023-11-03 14:51:27 ++++ common.cpp 2023-11-03 14:51:30 +@@ -1198,8 +1198,6 @@ + const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc) { + const llama_sampling_params & sparams = params.sparams; + +- fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT); +- fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER); + fprintf(stream, "cpu_has_arm_fma: %s\n", lm_ggml_cpu_has_arm_fma() ? "true" : "false"); + fprintf(stream, "cpu_has_avx: %s\n", lm_ggml_cpu_has_avx() ? "true" : "false"); + fprintf(stream, "cpu_has_avx2: %s\n", lm_ggml_cpu_has_avx2() ? "true" : "false"); diff --git a/scripts/llama.cpp.patch b/scripts/llama.cpp.patch index f2ba51f..028665b 100644 --- a/scripts/llama.cpp.patch +++ b/scripts/llama.cpp.patch @@ -1,6 +1,6 @@ ---- llama.cpp.orig 2023-11-02 11:16:33 -+++ llama.cpp 2023-11-02 11:16:34 -@@ -102,6 +102,17 @@ +--- llama.cpp.orig 2023-11-03 14:26:21 ++++ llama.cpp 2023-11-03 14:26:22 +@@ -103,6 +103,17 @@ #define LLAMA_LOG_WARN(...) llama_log_internal(LM_GGML_LOG_LEVEL_WARN , __VA_ARGS__) #define LLAMA_LOG_ERROR(...) llama_log_internal(LM_GGML_LOG_LEVEL_ERROR, __VA_ARGS__) @@ -18,7 +18,7 @@ // // helpers // -@@ -736,16 +747,16 @@ +@@ -761,16 +772,16 @@ if (prefetch > 0) { // Advise the kernel to preload the mapped memory