feat: sync llama.cpp (#73)
* feat: sync llama.cpp
a-ghorbani authored Aug 6, 2024
1 parent 1d6a492 commit da5dad1
Showing 19 changed files with 906 additions and 858 deletions.
7 changes: 6 additions & 1 deletion cpp/common.cpp
@@ -1330,6 +1330,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
else { invalid_param = true; }
return true;
}
+if (arg == "--no-warmup") {
+    params.warmup = false;
+    return true;
+}
#ifndef LOG_DISABLE_LOGS
// Parse args for logging parameters
if (log_param_single_parse(argv[i])) {
@@ -1452,6 +1456,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" });
options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" });
options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" });
options.push_back({ "main", " --no-warmup", "skip warming up the model with an empty run" });
options.push_back({ "server infill",
" --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" });

@@ -1635,7 +1640,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "server", " --host HOST", "ip address to listen (default: %s)", params.hostname.c_str() });
options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port });
options.push_back({ "server", " --path PATH", "path to serve static files from (default: %s)", params.public_path.c_str() });
options.push_back({ "server", " --embedding(s)", "enable embedding endpoint (default: %s)", params.embedding ? "enabled" : "disabled" });
options.push_back({ "server", " --embedding(s)", "restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled" });
options.push_back({ "server", " --api-key KEY", "API key to use for authentication (default: none)" });
options.push_back({ "server", " --api-key-file FNAME", "path to file containing API keys (default: none)" });
options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" });
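Note: --no-warmup only flips params.warmup; honoring the flag is up to the runner, which otherwise decodes a throwaway batch once so the weights get paged in before timing starts. A minimal C sketch of that control flow, using hypothetical stand-ins (params_stub and warmup_decode are illustrative, not the real common.h API):

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for gpt_params and the decode path. */
struct params_stub { bool warmup; };

static void warmup_decode(void) {
    /* the real runner decodes a dummy batch here to page in the weights */
    printf("running warm-up decode\n");
}

int main(void) {
    struct params_stub params = { .warmup = true };
    params.warmup = false;   /* what --no-warmup does, per the first hunk above */
    if (params.warmup) {     /* the empty run is guarded by the flag */
        warmup_decode();
    }
    return 0;
}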
28 changes: 14 additions & 14 deletions cpp/ggml-aarch64.c
@@ -384,8 +384,8 @@ void lm_ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void
UNUSED(blocklen);

#if defined(__ARM_FEATURE_SVE)
-if (svcntw() == 8) {
-    LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (svcntw() == 8)) &&
+if (lm_ggml_sve_cnt_b == QK8_0) {
+    LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
}
#endif
@@ -496,8 +496,8 @@ void lm_ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void
UNUSED(blocklen);

#if defined(__ARM_FEATURE_SVE)
-if (svcntw() == 8) {
-    LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (svcntw() == 8)) &&
+if (lm_ggml_sve_cnt_b == QK8_0) {
+    LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
}
#endif
@@ -614,7 +614,7 @@ void lm_ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
UNUSED(blocklen);

#if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
-if (svcntw() == 8) {
+if (lm_ggml_sve_cnt_b == QK8_0) {
const void * b_ptr = vx;
const void * a_ptr = vy;
float * res_ptr = s;
@@ -680,12 +680,12 @@ void lm_ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
return;
}
else if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_matmul_int8()) {
-LM_GGML_ASSERT((lm_ggml_cpu_has_sve() && (svcntw() == 8)) &&
+LM_GGML_ASSERT((lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
"__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
"performance");
}
else if (lm_ggml_cpu_has_neon()) {
-LM_GGML_ASSERT(((lm_ggml_cpu_has_sve() && (svcntw() == 8)) || lm_ggml_cpu_has_matmul_int8()) &&
+LM_GGML_ASSERT(((lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) || lm_ggml_cpu_has_matmul_int8()) &&
"__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
"quantization format for optimal performance");
}
@@ -745,8 +745,8 @@ void lm_ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void
UNUSED(blocklen);

#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
-if (svcntw() == 8) {
-    LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (svcntw() == 8)) &&
+if (lm_ggml_sve_cnt_b == QK8_0) {
+    LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
}
#endif
@@ -1266,8 +1266,8 @@ void lm_ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void
UNUSED(blocklen);

#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
-if (svcntw() == 8) {
-    LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (svcntw() == 8)) &&
+if (lm_ggml_sve_cnt_b == QK8_0) {
+    LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
}
#endif
@@ -1728,7 +1728,7 @@ void lm_ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
UNUSED(blocklen);

#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
-if (svcntw() == 8) {
+if (lm_ggml_sve_cnt_b == QK8_0) {
const void * b_ptr = vx;
const void * a_ptr = vy;
float * res_ptr = s;
@@ -2139,12 +2139,12 @@ void lm_ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
return;
}
else if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_matmul_int8()) {
-LM_GGML_ASSERT((lm_ggml_cpu_has_sve() && (svcntw() == 8)) &&
+LM_GGML_ASSERT((lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
"__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
"performance");
}
else if (lm_ggml_cpu_has_neon()) {
-LM_GGML_ASSERT(((lm_ggml_cpu_has_sve() && (svcntw() == 8)) || lm_ggml_cpu_has_matmul_int8()) &&
+LM_GGML_ASSERT(((lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) || lm_ggml_cpu_has_matmul_int8()) &&
"__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
"quantization format for optimal performance");
}
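These hunks replace the per-call svcntw() query with the cached global lm_ggml_sve_cnt_b (declared in cpp/ggml-quants.h below), compared against QK8_0, i.e. 32 bytes or 256-bit SVE vectors. A sketch of how such a cache could be filled once on Linux via prctl, which is presumably what the new <sys/prctl.h> include in cpp/ggml-impl.h is for; the exact initialization site is an assumption, not shown in this diff:

#include <sys/prctl.h>

/* Cached SVE register width in bytes; QK8_0 (32) means 256-bit vectors. */
int lm_ggml_sve_cnt_b = 0;

/* Hypothetical one-time init: PR_SVE_GET_VL reports the vector length and
 * PR_SVE_VL_LEN_MASK strips the flag bits, leaving the length in bytes. */
static void sve_cnt_init(void) {
    if (lm_ggml_sve_cnt_b == 0) {
        lm_ggml_sve_cnt_b = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
    }
}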
6 changes: 5 additions & 1 deletion cpp/ggml-common.h
@@ -19,7 +19,11 @@ typedef half2 lm_ggml_half2;

#define LM_GGML_COMMON_DECL
#elif defined(LM_GGML_COMMON_DECL_CUDA)
+#if defined(LM_GGML_COMMON_DECL_MUSA)
+#include <musa_fp16.h>
+#else
#include <cuda_fp16.h>
+#endif
#include <cstdint>

typedef half lm_ggml_half;
@@ -415,7 +419,7 @@ static_assert(sizeof(block_iq4_xs) == sizeof(lm_ggml_half) + sizeof(uint16_t) +
#define LM_GGML_TABLE_END() };

#define LM_GGML_COMMON_IMPL
-#elif defined(LM_GGML_COMMON_IMPL_CUDA) || defined(LM_GGML_COMMON_IMPL_HIP)
+#elif defined(LM_GGML_COMMON_IMPL_CUDA) || defined(LM_GGML_COMMON_IMPL_HIP) || defined(LM_GGML_COMMON_IMPL_MUSA)
#include <cstdint>

#define LM_GGML_TABLE_BEGIN(type, name, size) static const __device__ type name[size] = {
10 changes: 4 additions & 6 deletions cpp/ggml-impl.h
@@ -80,8 +80,9 @@ static inline float lm_ggml_compute_bf16_to_fp32(lm_ggml_bf16_t h) {
/**
* Converts float32 to brain16.
*
- * This function is binary identical to AMD Zen4 VCVTNEPS2BF16.
- * Subnormals shall be flushed to zero, and NANs will be quiet.
+ * This is binary identical with Google Brain float conversion.
+ * Floats shall round to nearest even, and NANs shall be quiet.
+ * Subnormals aren't flushed to zero, except perhaps when used.
* This code should vectorize nicely if using modern compilers.
*/
static inline lm_ggml_bf16_t lm_ggml_compute_fp32_to_bf16(float s) {
@@ -95,10 +96,6 @@ static inline lm_ggml_bf16_t lm_ggml_compute_fp32_to_bf16(float s) {
h.bits = (u.i >> 16) | 64; /* force to quiet */
return h;
}
-if (!(u.i & 0x7f800000)) { /* subnormal */
-    h.bits = (u.i & 0x80000000) >> 16; /* flush to zero */
-    return h;
-}
h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
return h;
}
@@ -146,6 +143,7 @@ extern "C" {

#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>
+#include <sys/prctl.h>
#endif

// 16-bit float
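After these hunks, fp32 -> bf16 no longer flushes subnormals to zero and relies on the retained round-to-nearest-even line. A self-contained sketch of the resulting scheme (the helper name is illustrative; the bit manipulation mirrors the kept h.bits expression):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Round-to-nearest-even fp32 -> bf16: add 0x7fff plus the lowest surviving
 * mantissa bit, then truncate to the top 16 bits. */
static uint16_t fp32_to_bf16_rne(float s) {
    uint32_t i;
    memcpy(&i, &s, sizeof(i));             /* type-pun without UB */
    if ((i & 0x7fffffff) > 0x7f800000) {   /* NaN: force it quiet */
        return (uint16_t)((i >> 16) | 64);
    }
    return (uint16_t)((i + (0x7fff + ((i >> 16) & 1))) >> 16);
}

int main(void) {
    /* 1.0f maps to 0x3f80; the subnormal 1e-40f now yields a nonzero bf16
     * instead of being flushed by the branch deleted above. */
    printf("%04x %04x\n", fp32_to_bf16_rne(1.0f), fp32_to_bf16_rne(1e-40f));
    return 0;
}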
48 changes: 30 additions & 18 deletions cpp/ggml-quants.c
@@ -3818,7 +3818,7 @@ void lm_ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void
float sumf = 0;

#if defined(__ARM_FEATURE_SVE)
-if (svcntb() == QK8_0) {
+if (lm_ggml_sve_cnt_b == QK8_0) {
const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);

@@ -4190,15 +4190,18 @@ void lm_ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void
sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
#endif
for (; ib < nb; ++ib) {
-int sumi = 0;
+int sumi0 = 0;
+int sumi1 = 0;

for (int j = 0; j < qk/2; ++j) {
const int v0 = (x[ib].qs[j] & 0x0F) - 8;
const int v1 = (x[ib].qs[j] >> 4) - 8;

-sumi += (v0 * y[ib].qs[j]) + (v1 * y[ib].qs[j + qk/2]);
+sumi0 += (v0 * y[ib].qs[j]);
+sumi1 += (v1 * y[ib].qs[j + qk/2]);
}

+int sumi = sumi0 + sumi1;
sumf += sumi*LM_GGML_FP16_TO_FP32(x[ib].d)*LM_GGML_FP16_TO_FP32(y[ib].d);
}

@@ -4474,15 +4477,18 @@ void lm_ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void
sumf = hsum_float_8(acc) + summs;
#endif
for (; ib < nb; ++ib) {
-int sumi = 0;
+int sumi0 = 0;
+int sumi1 = 0;

for (int j = 0; j < qk/2; ++j) {
const int v0 = (x[ib].qs[j] & 0x0F);
const int v1 = (x[ib].qs[j] >> 4);

-sumi += (v0 * y[ib].qs[j]) + (v1 * y[ib].qs[j + qk/2]);
+sumi0 += (v0 * y[ib].qs[j]);
+sumi1 += (v1 * y[ib].qs[j + qk/2]);
}

+int sumi = sumi0 + sumi1;
sumf += (LM_GGML_FP16_TO_FP32(x[ib].d)*LM_GGML_FP16_TO_FP32(y[ib].d))*sumi + LM_GGML_FP16_TO_FP32(x[ib].m)*LM_GGML_FP16_TO_FP32(y[ib].s);
}

@@ -4823,18 +4829,21 @@ void lm_ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void
uint32_t qh;
memcpy(&qh, x[ib].qh, sizeof(qh));

-int sumi = 0;
+int sumi0 = 0;
+int sumi1 = 0;

for (int j = 0; j < qk/2; ++j) {
const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));

-const int32_t x0 = ((x[ib].qs[j] & 0x0F) | xh_0) - 16;
-const int32_t x1 = ((x[ib].qs[j] >> 4) | xh_1) - 16;
+const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
+const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16);

-sumi += (x0 * y[ib].qs[j]) + (x1 * y[ib].qs[j + qk/2]);
+sumi0 += (x0 * y[ib].qs[j]);
+sumi1 += (x1 * y[ib].qs[j + qk/2]);
}

+int sumi = sumi0 + sumi1;
sumf += (LM_GGML_FP16_TO_FP32(x[ib].d)*LM_GGML_FP16_TO_FP32(y[ib].d)) * sumi;
}

@@ -5194,7 +5203,8 @@ void lm_ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void
uint32_t qh;
memcpy(&qh, x[ib].qh, sizeof(qh));

-int sumi = 0;
+int sumi0 = 0;
+int sumi1 = 0;

for (int j = 0; j < qk/2; ++j) {
const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
@@ -5203,9 +5213,11 @@ void lm_ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void
const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1;

-sumi += (x0 * y[ib].qs[j]) + (x1 * y[ib].qs[j + qk/2]);
+sumi0 += (x0 * y[ib].qs[j]);
+sumi1 += (x1 * y[ib].qs[j + qk/2]);
}

+int sumi = sumi0 + sumi1;
sumf += (LM_GGML_FP16_TO_FP32(x[ib].d)*LM_GGML_FP16_TO_FP32(y[ib].d))*sumi + LM_GGML_FP16_TO_FP32(x[ib].m)*LM_GGML_FP16_TO_FP32(y[ib].s);
}

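The recurring scalar change above splits the single sumi accumulator into sumi0/sumi1 for the low- and high-nibble halves and combines them once per block; plausibly this mirrors the pairwise accumulation of the SIMD paths and gives the compiler two independent dependency chains to vectorize. The (int8_t) casts added in the Q5_0 loop make the narrowing of the reconstructed quant explicit (its value already lies in [-16, 15]). A toy illustration of the pattern on made-up data:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    /* made-up dequantized nibbles and activation bytes for one tiny block */
    const int8_t v0[4] = { -8,  3,  7, -1 };   /* low-nibble values  */
    const int8_t v1[4] = {  2, -5,  4,  6 };   /* high-nibble values */
    const int8_t y0[4] = {  1,  2,  3,  4 };
    const int8_t y1[4] = {  5,  6,  7,  8 };

    int sumi0 = 0;   /* low-half dot product  */
    int sumi1 = 0;   /* high-half dot product */
    for (int j = 0; j < 4; ++j) {
        sumi0 += v0[j] * y0[j];
        sumi1 += v1[j] * y1[j];
    }
    const int sumi = sumi0 + sumi1;   /* combined once, as in the hunks */
    printf("sumi = %d\n", sumi);      /* prints sumi = 71 */
    return 0;
}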
@@ -5291,7 +5303,7 @@ void lm_ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void
float sumf = 0;

#if defined(__ARM_FEATURE_SVE)
-if (svcntb() == QK8_0) {
+if (lm_ggml_sve_cnt_b == QK8_0) {
svfloat32_t sumv0 = svdup_n_f32(0.0f);
svfloat32_t sumv1 = svdup_n_f32(0.0f);

@@ -6437,22 +6449,22 @@ void lm_ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void
// compute mask for subtraction
vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl);
vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl);
-vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_m(vmask_0, q3_0, 0x4, vl);
+vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_mu(vmask_0, q3_0, q3_0, 0x4, vl);
m <<= 1;

vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl);
-vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_m(vmask_1, q3_1, 0x4, vl);
+vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_mu(vmask_1, q3_1, q3_1, 0x4, vl);
m <<= 1;

vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl);
-vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_m(vmask_2, q3_2, 0x4, vl);
+vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_mu(vmask_2, q3_2, q3_2, 0x4, vl);
m <<= 1;

vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl);
vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl);
-vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_m(vmask_3, q3_3, 0x4, vl);
+vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_mu(vmask_3, q3_3, q3_3, 0x4, vl);
m <<= 1;

// load Q8 and take product with Q3
@@ -7708,13 +7720,13 @@ void lm_ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void
vint8m1_t q5_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q5_x, 0x0F, vl));
vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
vbool8_t vmask_1 = __riscv_vmsne_vx_u8m1_b8(qh_m1, 0, vl);
-vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_m(vmask_1, q5_a, 16, vl);
+vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_mu(vmask_1, q5_a, q5_a, 16, vl);
m <<= 1;

vint8m1_t q5_l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q5_x, 0x04, vl));
vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
vbool8_t vmask_2 = __riscv_vmsne_vx_u8m1_b8(qh_m2, 0, vl);
-vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_m(vmask_2, q5_l, 16, vl);
+vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_mu(vmask_2, q5_l, q5_l, 16, vl);
m <<= 1;

vint16m2_t v0 = __riscv_vwmul_vv_i16m2(q5_m1, q8_y1, vl);
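The RISC-V hunks above move from the _m masked intrinsics to their _mu (mask-undisturbed) forms, which take an explicit pass-through operand: masked-off lanes keep the original quant value (q3_0, q5_a, ...) rather than being left unspecified, matching the v0.12 RVV intrinsics API. A compile-only sketch of the call shape (assuming an RVV 1.0 toolchain; the wrapper name is illustrative):

#include <riscv_vector.h>

/* Subtract 4 only in lanes where mask is set; lanes where it is clear keep
 * the value of the explicit pass-through operand (here the source q itself),
 * which is exactly the "mu" policy used in the hunks above. */
static vint8m1_t sub4_where_masked(vbool8_t mask, vint8m1_t q, size_t vl) {
    /* args: mask, pass-through, source, scalar, vl */
    return __riscv_vsub_vx_i8m1_mu(mask, q, q, 4, vl);
}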
4 changes: 4 additions & 0 deletions cpp/ggml-quants.h
@@ -127,6 +127,10 @@ void iq2xs_free_impl(enum lm_ggml_type type);
void iq3xs_init_impl(int grid_size);
void iq3xs_free_impl(int grid_size);

+#if defined(__ARM_FEATURE_SVE)
+extern int lm_ggml_sve_cnt_b;
+#endif
+
#ifdef __cplusplus
}
#endif