feat: sync llama.cpp #73

Merged
merged 2 commits into from Aug 6, 2024
7 changes: 6 additions & 1 deletion cpp/common.cpp
@@ -1330,6 +1330,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
else { invalid_param = true; }
return true;
}
if (arg == "--no-warmup") {
params.warmup = false;
return true;
}
#ifndef LOG_DISABLE_LOGS
// Parse args for logging parameters
if (log_param_single_parse(argv[i])) {
@@ -1452,6 +1456,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" });
options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" });
options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" });
options.push_back({ "main", " --no-warmup", "skip warming up the model with an empty run" });
options.push_back({ "server infill",
" --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" });

@@ -1635,7 +1640,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "server", " --host HOST", "ip address to listen (default: %s)", params.hostname.c_str() });
options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port });
options.push_back({ "server", " --path PATH", "path to serve static files from (default: %s)", params.public_path.c_str() });
options.push_back({ "server", " --embedding(s)", "enable embedding endpoint (default: %s)", params.embedding ? "enabled" : "disabled" });
options.push_back({ "server", " --embedding(s)", "restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled" });
options.push_back({ "server", " --api-key KEY", "API key to use for authentication (default: none)" });
options.push_back({ "server", " --api-key-file FNAME", "path to file containing API keys (default: none)" });
options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" });
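Note on the new flag: `--no-warmup` only flips `params.warmup`; its effect depends on the init code gating the warmup decode on that field. Below is a minimal sketch of that pattern, with the call-site names assumed rather than taken from this diff (signatures follow llama.h of roughly this vintage and may differ):

#include "llama.h"

// Hypothetical consumer of params.warmup (illustration only, not the actual
// common.cpp code): run one dummy decode so weights and KV-cache pages are
// touched before the first timed request, unless --no-warmup was given.
static void maybe_warmup(struct llama_context * ctx, const struct llama_model * model, bool warmup) {
    if (!warmup) {
        return; // --no-warmup: skip the dummy decode entirely
    }
    llama_token tmp[2] = { llama_token_bos(model), llama_token_eos(model) };
    llama_decode(ctx, llama_batch_get_one(tmp, 2, 0, 0));
    llama_kv_cache_clear(ctx);
}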
28 changes: 14 additions & 14 deletions cpp/ggml-aarch64.c
@@ -384,8 +384,8 @@ void lm_ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void
UNUSED(blocklen);

#if defined(__ARM_FEATURE_SVE)
if (svcntw() == 8) {
LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (svcntw() == 8)) &&
if (lm_ggml_sve_cnt_b == QK8_0) {
LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
}
#endif
@@ -496,8 +496,8 @@ void lm_ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void
UNUSED(blocklen);

#if defined(__ARM_FEATURE_SVE)
if (svcntw() == 8) {
LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (svcntw() == 8)) &&
if (lm_ggml_sve_cnt_b == QK8_0) {
LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
}
#endif
@@ -614,7 +614,7 @@ void lm_ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
UNUSED(blocklen);

#if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
if (svcntw() == 8) {
if (lm_ggml_sve_cnt_b == QK8_0) {
const void * b_ptr = vx;
const void * a_ptr = vy;
float * res_ptr = s;
@@ -680,12 +680,12 @@ void lm_ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
return;
}
else if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_matmul_int8()) {
LM_GGML_ASSERT((lm_ggml_cpu_has_sve() && (svcntw() == 8)) &&
LM_GGML_ASSERT((lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
"__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
"performance");
}
else if (lm_ggml_cpu_has_neon()) {
LM_GGML_ASSERT(((lm_ggml_cpu_has_sve() && (svcntw() == 8)) || lm_ggml_cpu_has_matmul_int8()) &&
LM_GGML_ASSERT(((lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) || lm_ggml_cpu_has_matmul_int8()) &&
"__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
"quantization format for optimal performance");
}
@@ -745,8 +745,8 @@ void lm_ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void
UNUSED(blocklen);

#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
if (svcntw() == 8) {
LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (svcntw() == 8)) &&
if (lm_ggml_sve_cnt_b == QK8_0) {
LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
}
#endif
@@ -1266,8 +1266,8 @@ void lm_ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void
UNUSED(blocklen);

#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
if (svcntw() == 8) {
LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (svcntw() == 8)) &&
if (lm_ggml_sve_cnt_b == QK8_0) {
LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
}
#endif
@@ -1728,7 +1728,7 @@ void lm_ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
UNUSED(blocklen);

#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
if (svcntw() == 8) {
if (lm_ggml_sve_cnt_b == QK8_0) {
const void * b_ptr = vx;
const void * a_ptr = vy;
float * res_ptr = s;
@@ -2139,12 +2139,12 @@ void lm_ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
return;
}
else if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_matmul_int8()) {
LM_GGML_ASSERT((lm_ggml_cpu_has_sve() && (svcntw() == 8)) &&
LM_GGML_ASSERT((lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
"__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
"performance");
}
else if (lm_ggml_cpu_has_neon()) {
LM_GGML_ASSERT(((lm_ggml_cpu_has_sve() && (svcntw() == 8)) || lm_ggml_cpu_has_matmul_int8()) &&
LM_GGML_ASSERT(((lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) || lm_ggml_cpu_has_matmul_int8()) &&
"__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
"quantization format for optimal performance");
}
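The recurring edit in this file swaps on-the-fly `svcntw()` calls for the cached global `lm_ggml_sve_cnt_b` (the SVE register width in bytes). On 256-bit SVE hardware the two checks agree, since eight 32-bit lanes are 32 bytes and `QK8_0` is 32. A small sketch of that equivalence, for illustration only:

#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>

// svcntb() is the SVE vector length in bytes, svcntw() the same length in
// 32-bit words, so svcntb() == 4 * svcntw(). With QK8_0 == 32 this gives:
//   svcntw() == 8  <=>  svcntb() == 32  <=>  lm_ggml_sve_cnt_b == QK8_0
// (assuming lm_ggml_sve_cnt_b was initialized from the runtime vector length).
static int sve_is_256_bit(void) {
    return svcntb() == 32;
}
#endif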
6 changes: 5 additions & 1 deletion cpp/ggml-common.h
@@ -19,7 +19,11 @@ typedef half2 lm_ggml_half2;

#define LM_GGML_COMMON_DECL
#elif defined(LM_GGML_COMMON_DECL_CUDA)
#if defined(LM_GGML_COMMON_DECL_MUSA)
#include <musa_fp16.h>
#else
#include <cuda_fp16.h>
#endif
#include <cstdint>

typedef half lm_ggml_half;
@@ -415,7 +419,7 @@ static_assert(sizeof(block_iq4_xs) == sizeof(lm_ggml_half) + sizeof(uint16_t) +
#define LM_GGML_TABLE_END() };

#define LM_GGML_COMMON_IMPL
#elif defined(LM_GGML_COMMON_IMPL_CUDA) || defined(LM_GGML_COMMON_IMPL_HIP)
#elif defined(LM_GGML_COMMON_IMPL_CUDA) || defined(LM_GGML_COMMON_IMPL_HIP) || defined(LM_GGML_COMMON_IMPL_MUSA)
#include <cstdint>

#define LM_GGML_TABLE_BEGIN(type, name, size) static const __device__ type name[size] = {
10 changes: 4 additions & 6 deletions cpp/ggml-impl.h
@@ -80,8 +80,9 @@ static inline float lm_ggml_compute_bf16_to_fp32(lm_ggml_bf16_t h) {
/**
* Converts float32 to brain16.
*
* This function is binary identical to AMD Zen4 VCVTNEPS2BF16.
* Subnormals shall be flushed to zero, and NANs will be quiet.
* This is binary identical with Google Brain float conversion.
* Floats shall round to nearest even, and NANs shall be quiet.
* Subnormals aren't flushed to zero, except perhaps when used.
* This code should vectorize nicely if using modern compilers.
*/
static inline lm_ggml_bf16_t lm_ggml_compute_fp32_to_bf16(float s) {
@@ -95,10 +96,6 @@ static inline lm_ggml_bf16_t lm_ggml_compute_fp32_to_bf16(float s) {
h.bits = (u.i >> 16) | 64; /* force to quiet */
return h;
}
if (!(u.i & 0x7f800000)) { /* subnormal */
h.bits = (u.i & 0x80000000) >> 16; /* flush to zero */
return h;
}
h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
return h;
}
@@ -146,6 +143,7 @@ extern "C" {

#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>
#include <sys/prctl.h>
#endif

// 16-bit float
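The rewritten comment now matches the code below it: the conversion rounds to nearest-even, NaNs stay quiet, and the subnormal flush-to-zero branch is removed. A standalone restatement of the same rounding expression, handy for checking a value by hand (this mirrors the shipped code rather than adding anything new):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

// Same round-to-nearest-even trick as lm_ggml_compute_fp32_to_bf16: add
// 0x7fff plus the lowest kept mantissa bit, then keep the top 16 bits.
static uint16_t fp32_to_bf16_rne(float s) {
    uint32_t i;
    memcpy(&i, &s, sizeof(i));
    if ((i & 0x7fffffff) > 0x7f800000) {    // NaN: keep sign, force quiet bit
        return (uint16_t)((i >> 16) | 64);
    }
    return (uint16_t)((i + (0x7fff + ((i >> 16) & 1))) >> 16);
}

int main(void) {
    // 1.0f (0x3f800000) is exact -> 0x3f80; 1.00390625f (0x3f808000) sits
    // exactly halfway between bf16 neighbours and rounds to the even result 0x3f80.
    printf("%04x %04x\n", fp32_to_bf16_rne(1.0f), fp32_to_bf16_rne(1.00390625f));
    return 0;
}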
48 changes: 30 additions & 18 deletions cpp/ggml-quants.c
@@ -3818,7 +3818,7 @@ void lm_ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void
float sumf = 0;

#if defined(__ARM_FEATURE_SVE)
if (svcntb() == QK8_0) {
if (lm_ggml_sve_cnt_b == QK8_0) {
const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);

@@ -4190,15 +4190,18 @@ void lm_ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void
sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
#endif
for (; ib < nb; ++ib) {
int sumi = 0;
int sumi0 = 0;
int sumi1 = 0;

for (int j = 0; j < qk/2; ++j) {
const int v0 = (x[ib].qs[j] & 0x0F) - 8;
const int v1 = (x[ib].qs[j] >> 4) - 8;

sumi += (v0 * y[ib].qs[j]) + (v1 * y[ib].qs[j + qk/2]);
sumi0 += (v0 * y[ib].qs[j]);
sumi1 += (v1 * y[ib].qs[j + qk/2]);
}

int sumi = sumi0 + sumi1;
sumf += sumi*LM_GGML_FP16_TO_FP32(x[ib].d)*LM_GGML_FP16_TO_FP32(y[ib].d);
}

Expand Down Expand Up @@ -4474,15 +4477,18 @@ void lm_ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void
sumf = hsum_float_8(acc) + summs;
#endif
for (; ib < nb; ++ib) {
int sumi = 0;
int sumi0 = 0;
int sumi1 = 0;

for (int j = 0; j < qk/2; ++j) {
const int v0 = (x[ib].qs[j] & 0x0F);
const int v1 = (x[ib].qs[j] >> 4);

sumi += (v0 * y[ib].qs[j]) + (v1 * y[ib].qs[j + qk/2]);
sumi0 += (v0 * y[ib].qs[j]);
sumi1 += (v1 * y[ib].qs[j + qk/2]);
}

int sumi = sumi0 + sumi1;
sumf += (LM_GGML_FP16_TO_FP32(x[ib].d)*LM_GGML_FP16_TO_FP32(y[ib].d))*sumi + LM_GGML_FP16_TO_FP32(x[ib].m)*LM_GGML_FP16_TO_FP32(y[ib].s);
}

@@ -4823,18 +4829,21 @@ void lm_ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void
uint32_t qh;
memcpy(&qh, x[ib].qh, sizeof(qh));

int sumi = 0;
int sumi0 = 0;
int sumi1 = 0;

for (int j = 0; j < qk/2; ++j) {
const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));

const int32_t x0 = ((x[ib].qs[j] & 0x0F) | xh_0) - 16;
const int32_t x1 = ((x[ib].qs[j] >> 4) | xh_1) - 16;
const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16);

sumi += (x0 * y[ib].qs[j]) + (x1 * y[ib].qs[j + qk/2]);
sumi0 += (x0 * y[ib].qs[j]);
sumi1 += (x1 * y[ib].qs[j + qk/2]);
}

int sumi = sumi0 + sumi1;
sumf += (LM_GGML_FP16_TO_FP32(x[ib].d)*LM_GGML_FP16_TO_FP32(y[ib].d)) * sumi;
}

@@ -5194,7 +5203,8 @@ void lm_ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void
uint32_t qh;
memcpy(&qh, x[ib].qh, sizeof(qh));

int sumi = 0;
int sumi0 = 0;
int sumi1 = 0;

for (int j = 0; j < qk/2; ++j) {
const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
@@ -5203,9 +5213,11 @@ void lm_ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void
const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1;

sumi += (x0 * y[ib].qs[j]) + (x1 * y[ib].qs[j + qk/2]);
sumi0 += (x0 * y[ib].qs[j]);
sumi1 += (x1 * y[ib].qs[j + qk/2]);
}

int sumi = sumi0 + sumi1;
sumf += (LM_GGML_FP16_TO_FP32(x[ib].d)*LM_GGML_FP16_TO_FP32(y[ib].d))*sumi + LM_GGML_FP16_TO_FP32(x[ib].m)*LM_GGML_FP16_TO_FP32(y[ib].s);
}

@@ -5291,7 +5303,7 @@ void lm_ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void
float sumf = 0;

#if defined(__ARM_FEATURE_SVE)
if (svcntb() == QK8_0) {
if (lm_ggml_sve_cnt_b == QK8_0) {
svfloat32_t sumv0 = svdup_n_f32(0.0f);
svfloat32_t sumv1 = svdup_n_f32(0.0f);

@@ -6437,22 +6449,22 @@ void lm_ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void
// compute mask for subtraction
vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl);
vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl);
vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_m(vmask_0, q3_0, 0x4, vl);
vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_mu(vmask_0, q3_0, q3_0, 0x4, vl);
m <<= 1;

vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl);
vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_m(vmask_1, q3_1, 0x4, vl);
vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_mu(vmask_1, q3_1, q3_1, 0x4, vl);
m <<= 1;

vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl);
vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_m(vmask_2, q3_2, 0x4, vl);
vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_mu(vmask_2, q3_2, q3_2, 0x4, vl);
m <<= 1;

vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl);
vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl);
vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_m(vmask_3, q3_3, 0x4, vl);
vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_mu(vmask_3, q3_3, q3_3, 0x4, vl);
m <<= 1;

// load Q8 and take product with Q3
@@ -7708,13 +7720,13 @@ void lm_ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void
vint8m1_t q5_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q5_x, 0x0F, vl));
vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
vbool8_t vmask_1 = __riscv_vmsne_vx_u8m1_b8(qh_m1, 0, vl);
vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_m(vmask_1, q5_a, 16, vl);
vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_mu(vmask_1, q5_a, q5_a, 16, vl);
m <<= 1;

vint8m1_t q5_l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q5_x, 0x04, vl));
vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
vbool8_t vmask_2 = __riscv_vmsne_vx_u8m1_b8(qh_m2, 0, vl);
vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_m(vmask_2, q5_l, 16, vl);
vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_mu(vmask_2, q5_l, q5_l, 16, vl);
m <<= 1;

vint16m2_t v0 = __riscv_vwmul_vv_i16m2(q5_m1, q8_y1, vl);
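The RISC-V changes move the masked intrinsics from the `_m` spelling to the explicit-policy `_mu` ("mask undisturbed") spelling, which takes an extra merge operand; under the ratified RVV intrinsics the plain `_m` form leaves inactive lanes unspecified, so passing the source vector as the merge operand makes the intended "leave unmasked lanes as they were" behaviour explicit. A signature-level sketch of the pattern used above, assuming the standard riscv_vector.h intrinsics:

#include <riscv_vector.h>

// _mu policy: lanes where the mask is 0 are copied from the extra merge
// operand (here the input vector q itself); active lanes get q[i] - 4.
// This mirrors calls like:
//   vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_mu(vmask_0, q3_0, q3_0, 0x4, vl);
static vint8m1_t sub4_where_bit_clear(vuint8m1_t qh, uint8_t bit, vint8m1_t q, size_t vl) {
    vbool8_t m = __riscv_vmseq_vx_u8m1_b8(__riscv_vand_vx_u8m1(qh, bit, vl), 0, vl);
    return __riscv_vsub_vx_i8m1_mu(m, q, q, 0x4, vl);
}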
4 changes: 4 additions & 0 deletions cpp/ggml-quants.h
@@ -127,6 +127,10 @@ void iq2xs_free_impl(enum lm_ggml_type type);
void iq3xs_init_impl(int grid_size);
void iq3xs_free_impl(int grid_size);

#if defined(__ARM_FEATURE_SVE)
extern int lm_ggml_sve_cnt_b;
#endif

#ifdef __cplusplus
}
#endif
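`lm_ggml_sve_cnt_b` is only declared here; its definition and one-time setup live elsewhere in the sync. The `<sys/prctl.h>` include added to ggml-impl.h suggests the value comes from the kernel's SVE vector-length control rather than from `svcntb()` at every call site. A hedged sketch of what that setup presumably looks like (an assumption, not code from this diff):

#if defined(__ARM_FEATURE_SVE)
#include <sys/prctl.h>

int lm_ggml_sve_cnt_b = 0;

// Presumed one-time initialization (e.g. from lm_ggml_init()): cache the SVE
// vector length in bytes so the hot loops above can compare it against QK8_0
// without re-querying. PR_SVE_GET_VL encodes the length in bytes in its low
// bits, which PR_SVE_VL_LEN_MASK extracts.
static void lm_ggml_init_sve_cnt_b(void) {
    if (lm_ggml_sve_cnt_b == 0) {
        lm_ggml_sve_cnt_b = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
    }
}
#endif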