From b4ebdb6b57dedf9e82eec6e13ac70e8089e06190 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 27 Feb 2023 20:14:32 +0200 Subject: [PATCH] bench : add Q4_0 and Q4_1 mul_mat benchmarks --- .gitignore | 1 + whisper.cpp | 21 +++++++++++++++------ 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index f9a12783d65..2dae32884e6 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ build-em/ build-debug/ build-release/ build-static/ +build-no-accel/ build-sanitize-addr/ build-sanitize-thread/ diff --git a/whisper.cpp b/whisper.cpp index 027175e3fa2..e266c457032 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -4492,23 +4492,32 @@ WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads) { // when F16 is used, there is an extra work buffer of size N*N*sizeof(float) std::vector buf(4llu*N_max*N_max*sizeof(float) + 4*256); + // put a bunch of random data in the buffer for (size_t i = 0; i < buf.size(); i++) buf[i] = i; for (int j = 0; j < (int) sizes.size(); j++) { + int n_q4_0 = 0; + int n_q4_1 = 0; int n_fp16 = 0; int n_fp32 = 0; // GFLOPS/s + double s_q4_0 = 0.0; + double s_q4_1 = 0.0; double s_fp16 = 0.0; double s_fp32 = 0.0; const size_t N = sizes[j]; - for (int k = 0; k < 2; ++k) { - const ggml_type wtype = k == 0 ? GGML_TYPE_F16 : GGML_TYPE_F32; + for (int k = 0; k < 4; ++k) { + const ggml_type wtype = + k == 0 ? GGML_TYPE_Q4_0 : + k == 1 ? GGML_TYPE_Q4_1 : + k == 2 ? GGML_TYPE_F16 : + GGML_TYPE_F32; - double & s = k == 0 ? s_fp16 : s_fp32; - int & n = k == 0 ? n_fp16 : n_fp32; + double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_fp16 : s_fp32; + int & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_fp16 : n_fp32; struct ggml_init_params gparams = { /*.mem_size =*/ buf.size(), @@ -4551,8 +4560,8 @@ WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads) { s = ((2.0*N*N*N*n)/tsum)*1e-9; } - fprintf(stderr, "ggml_mul_mat: %5zu x %5zu: F16 %8.1f GFLOPS (%3d runs) / F32 %8.1f GFLOPS (%3d runs)\n", - N, N, s_fp16, n_fp16, s_fp32, n_fp32); + fprintf(stderr, "ggml_mul_mat: %4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) / Q4_1 %7.1f GFLOPS (%3d runs) / F16 %7.1f GFLOPS (%3d runs) / F32 %7.1f GFLOPS (%3d runs)\n", + N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1, s_fp16, n_fp16, s_fp32, n_fp32); } return 0;