diff --git a/README.md b/README.md index 747d2e98b5a94..c1624b9f9a348 100644 --- a/README.md +++ b/README.md @@ -10,13 +10,8 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) ### Hot topics -- Remove LLAMA_MAX_DEVICES and LLAMA_SUPPORTS_GPU_OFFLOAD: https://github.com/ggerganov/llama.cpp/pull/5240 -- Incoming backends: https://github.com/ggerganov/llama.cpp/discussions/5138 - - [SYCL backend](README-sycl.md) is ready (1/28/2024), support Linux/Windows in Intel GPUs (iGPU, Arc/Flex/Max series) -- New SOTA quantized models, including pure 2-bits: https://huggingface.co/ikawrakow -- Collecting Apple Silicon performance stats: - - M-series: https://github.com/ggerganov/llama.cpp/discussions/4167 - - A-series: https://github.com/ggerganov/llama.cpp/discussions/4508 +- Support for Gemma models: https://github.com/ggerganov/llama.cpp/pull/5631 +- Non-linear quantization IQ4_NL: https://github.com/ggerganov/llama.cpp/pull/5590 - Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216 ---- @@ -107,6 +102,7 @@ Typically finetunes of the base models below are supported as well. - [x] [Orion 14B](https://github.com/ggerganov/llama.cpp/pull/5118) - [x] [InternLM2](https://huggingface.co/models?search=internlm2) - [x] [CodeShell](https://github.com/WisdomShell/codeshell) +- [x] [Gemma](https://ai.google.dev/gemma) **Multimodal models:** @@ -145,6 +141,7 @@ Unless otherwise noted these projects are open-source with permissive licensing: - [nat/openplayground](https://github.com/nat/openplayground) - [Faraday](https://faraday.dev/) (proprietary) - [LMStudio](https://lmstudio.ai/) (proprietary) +- [LocalAI](https://github.com/mudler/LocalAI) (MIT) - [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL) - [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile) - [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all) diff --git a/examples/llava/README.md b/examples/llava/README.md index 25ea967153510..35e6d9e5d00f7 100644 --- a/examples/llava/README.md +++ b/examples/llava/README.md @@ -63,13 +63,12 @@ Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` director ```console git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b ``` -2) Backup your pth/safetensor model files as llava-surgery modifies them -3) Use `llava-surgery-v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models: +2) Use `llava-surgery-v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models: ```console python examples/llava/llava-surgery-v2.py -C -m ../llava-v1.6-vicuna-7b/ ``` - you will find a llava.projector and a llava.clip file in your model directory -4) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory: +3) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory: ```console mkdir vit cp ../llava-v1.6-vicuna-7b/llava.clip vit/pytorch_model.bin @@ -77,18 +76,18 @@ cp ../llava-v1.6-vicuna-7b/llava.projector vit/ curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.json -o vit/config.json ``` -5) Create the visual gguf model: +4) Create the visual gguf model: ```console python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision ``` - This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP -6) Then convert the model to gguf format: +5) Then convert the model to gguf format: ```console -python ./convert.py ../llava-v1.6-vicuna-7b/ +python ./convert.py ../llava-v1.6-vicuna-7b/ --skip-unknown ``` -7) And finally we can run the llava-cli using the 1.6 model version: +6) And finally we can run the llava-cli using the 1.6 model version: ```console ./llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096 ``` diff --git a/examples/llava/llava-surgery-v2.py b/examples/llava/llava-surgery-v2.py index 5bc5bc5137fe0..eb56d6988ac26 100644 --- a/examples/llava/llava-surgery-v2.py +++ b/examples/llava/llava-surgery-v2.py @@ -65,9 +65,7 @@ def clean_vision_tower_from_checkpoint(checkpoint_path): for name in clip_tensors: del checkpoint[name] - # Save the updated checkpoint checkpoint_path = checkpoint_path - save_model(checkpoint, checkpoint_path, file_type) return True return False @@ -152,16 +150,6 @@ def proj_criteria(checkpoint): if len(projector) > 0: save_model(projector, f"{args.model}/llava.projector", 'pytorch') -for name in mm_tensors: - del last_checkpoint[name] -for name in first_mm_tensors: - del first_checkpoint[name] - -if len(mm_tensors) > 0: - save_model(last_checkpoint, projector_checkpoint_path, file_type) -if len(first_mm_tensors) > 0: - save_model(first_checkpoint, newline_checkpoint_path, file_type) - print("Done!") print(f"Now you can convert {args.model} to a a regular LLaMA GGUF file.") print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.") diff --git a/examples/main/main.cpp b/examples/main/main.cpp index f5d2f48935eb6..7555dffe441f0 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -334,6 +334,8 @@ int main(int argc, char ** argv) { // number of tokens to keep when resetting context if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct || params.chatml) { params.n_keep = (int)embd_inp.size(); + } else { + params.n_keep += add_bos; // always keep the BOS token } // prefix & suffix for instruct mode @@ -383,8 +385,8 @@ int main(int argc, char ** argv) { } } - if (params.n_keep > 0) { - LOG_TEE("%s: static prompt based on n_keep: '", __func__); + if (params.n_keep > add_bos) { + LOG_TEE("%s: static prompt based on n_keep: '", __func__); for (int i = 0; i < params.n_keep; i++) { LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); } @@ -540,14 +542,14 @@ int main(int argc, char ** argv) { break; } - const int n_left = n_past - params.n_keep - 1; + const int n_left = n_past - params.n_keep; const int n_discard = n_left/2; LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); - llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); - llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); + llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard); + llama_kv_cache_seq_shift(ctx, 0, params.n_keep + n_discard, n_past, -n_discard); n_past -= n_discard; diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index ea7ba50c96eb6..37520857f99f7 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -32,6 +32,7 @@ static const std::vector QUANT_OPTIONS = { { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", }, { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", }, { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", }, + { "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.25 bpw non-linear quantization", }, { "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", }, { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.59G, +0.0992 ppl @ LLaMA-v1-7B", }, { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0532 ppl @ LLaMA-v1-7B", }, diff --git a/examples/server/README.md b/examples/server/README.md index f6b9c74021f30..6d9f96cd4ba64 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -140,6 +140,8 @@ node index.js - 200 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slot are currently available. - 503 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if the query parameter `fail_on_no_slot` is provided and no slot are currently available. + If the query parameter `include_slots` is passed, `slots` field will contain internal slots data except if `--slots-endpoint-disable` is set. + - **POST** `/completion`: Given a `prompt`, it returns the predicted completion. *Options:* diff --git a/examples/server/server.cpp b/examples/server/server.cpp index eb01729fa7a8a..c84719a0d15d0 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1394,6 +1394,46 @@ struct llama_server_context case TASK_TYPE_NEXT_RESPONSE: { // do nothing } break; + case TASK_TYPE_SLOTS_DATA: { + json slots_data = json::array(); + int n_idle_slots = 0; + int n_processing_slots = 0; + + for (llama_client_slot &slot: slots) { + if (slot.available()) { + n_idle_slots++; + } else { + n_processing_slots++; + } + json slot_data = get_formated_generation(slot); + slot_data["id"] = slot.id; + slot_data["task_id"] = slot.task_id; + slot_data["state"] = slot.state; + slot_data["prompt"] = slot.prompt; + slot_data["next_token"] = { + {"has_next_token", slot.has_next_token}, + {"n_remain", slot.n_remaining}, + {"num_tokens_predicted", slot.n_decoded}, + {"stopped_eos", slot.stopped_eos}, + {"stopped_word", slot.stopped_word}, + {"stopped_limit", slot.stopped_limit}, + {"stopping_word", slot.stopping_word}, + }; + slots_data.push_back(slot_data); + } + LOG_TEE("task %i - slots data: idle=%i processing=%i\n", task.id, n_idle_slots, n_processing_slots); + task_result res; + res.id = task.id; + res.multitask_id = task.multitask_id; + res.stop = true; + res.error = false; + res.result_json = { + { "idle", n_idle_slots }, + { "processing", n_processing_slots }, + { "slots", slots_data } + }; + queue_results.send(res); + } break; } } @@ -1447,14 +1487,15 @@ struct llama_server_context if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.n_ctx) { // Shift context - const int n_left = system_tokens.size() + slot.n_past - slot.params.n_keep - 1; + const int n_keep = slot.params.n_keep + add_bos_token; + const int n_left = system_tokens.size() + slot.n_past - n_keep; const int n_discard = n_left / 2; - LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.params.n_keep, n_left, n_discard); - llama_kv_cache_seq_rm (ctx, slot.id, slot.params.n_keep + 1 , slot.params.n_keep + n_discard + 1); - llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, system_tokens.size() + slot.n_past, -n_discard); + LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, n_keep, n_left, n_discard); + llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard); + llama_kv_cache_seq_shift(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard); - for (size_t i = slot.params.n_keep + 1 + n_discard; i < slot.cache_tokens.size(); i++) + for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) { slot.cache_tokens[i - n_discard] = slot.cache_tokens[i]; } @@ -1467,7 +1508,7 @@ struct llama_server_context LOG_VERBOSE("context shift", { { "n_ctx", n_ctx }, - { "n_keep", params.n_keep }, + { "n_keep", n_keep }, { "n_left", n_left }, }); } @@ -2557,34 +2598,38 @@ int main(int argc, char **argv) server_state current_state = state.load(); switch(current_state) { case SERVER_STATE_READY: { - int available_slots = 0; - int processing_slots = 0; - for (llama_client_slot &slot: llama.slots) { - if (slot.available()) { - available_slots++; - } else { - processing_slots++; - } + // request slots data using task queue + task_server task; + task.id = llama.queue_tasks.get_new_id(); + task.type = TASK_TYPE_SLOTS_DATA; + task.target_id = -1; + + llama.queue_results.add_waiting_task_id(task.id); + llama.queue_tasks.post(task); + + // get the result + task_result result = llama.queue_results.recv(task.id); + llama.queue_results.remove_waiting_task_id(task.id); + + int n_idle_slots = result.result_json["idle"]; + int n_processing_slots = result.result_json["processing"]; + + json health = { + {"status", "ok"}, + {"slots_idle", n_idle_slots}, + {"slots_processing", n_processing_slots}}; + res.status = 200; // HTTP OK + if (sparams.slots_endpoint && req.has_param("include_slots")) { + health["slots"] = result.result_json["slots"]; } - if (available_slots > 0) { - json health = { - {"status", "ok"}, - {"slots_idle", available_slots}, - {"slots_processing", processing_slots}}; - res.set_content(health.dump(), "application/json"); - res.status = 200; // HTTP OK - } else { - json health = { - {"status", "no slot available"}, - {"slots_idle", available_slots}, - {"slots_processing", processing_slots}}; - res.set_content(health.dump(), "application/json"); + + if (n_idle_slots == 0) { + health["status"] = "no slot available"; if (req.has_param("fail_on_no_slot")) { res.status = 503; // HTTP Service Unavailable - } else { - res.status = 200; // HTTP OK } } + res.set_content(health.dump(), "application/json"); break; } case SERVER_STATE_LOADING_MODEL: @@ -2600,26 +2645,20 @@ int main(int argc, char **argv) if (sparams.slots_endpoint) { svr.Get("/slots", [&](const httplib::Request&, httplib::Response& res) { - json slots; - for (llama_client_slot & slot : llama.slots) { - json slot_data = llama.get_formated_generation(slot); - slot_data["id"] = slot.id; - slot_data["task_id"] = slot.task_id; - slot_data["state"] = slot.state; - slot_data["prompt"] = slot.prompt; - slot_data["next_token"] = { - {"has_next_token", slot.has_next_token}, - {"n_remain", slot.n_remaining}, - {"num_tokens_predicted", slot.n_decoded}, - {"stopped_eos", slot.stopped_eos}, - {"stopped_word", slot.stopped_word}, - {"stopped_limit", slot.stopped_limit}, - {"stopping_word", slot.stopping_word}, - }; + // request slots data using task queue + task_server task; + task.id = llama.queue_tasks.get_new_id(); + task.type = TASK_TYPE_SLOTS_DATA; + task.target_id = -1; - slots.push_back(slot_data); - } - res.set_content(slots.dump(), "application/json"); + llama.queue_results.add_waiting_task_id(task.id); + llama.queue_tasks.post(task); + + // get the result + task_result result = llama.queue_results.recv(task.id); + llama.queue_results.remove_waiting_task_id(task.id); + + res.set_content(result.result_json["slots"].dump(), "application/json"); res.status = 200; // HTTP OK }); } diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index e954fb0ef0413..88545eb6931d0 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -49,7 +49,8 @@ enum server_state { enum task_type { TASK_TYPE_COMPLETION, TASK_TYPE_CANCEL, - TASK_TYPE_NEXT_RESPONSE + TASK_TYPE_NEXT_RESPONSE, + TASK_TYPE_SLOTS_DATA }; struct task_server { diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 6caae56b0244b..e7c211d7d6087 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -528,6 +528,15 @@ typedef struct { } block_iq1_s; static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding"); +#define QK4_NL 32 +#define QR4_NL 2 +#define QI4_NL (QK4_NL / (4*QR4_NL)) +typedef struct { + half d; + uint8_t qs[QK4_NL/2]; +} block_iq4_nl; +static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding"); + #define WARP_SIZE 32 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses @@ -1987,6 +1996,26 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_ } +static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113}; + +template +static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) { + + const int i = blockIdx.x; + const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL); + + const int tid = threadIdx.x; + const int il = tid/8; // 0...3 + const int ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 4*il; + const uint8_t * q4 = x[ib].qs + 4*il; + const float d = (float)x[ib].d; + for (int j = 0; j < 4; ++j) { + y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf]; + y[j+16] = d * kvalues_iq4nl[q4[j] >> 4]; + } + +} static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { @@ -4732,6 +4761,56 @@ static __device__ __forceinline__ float vec_dot_iq1_s_q8_1( #endif } +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics +static __device__ __forceinline__ void get_int_from_table_16(const uint32_t & q4, const uint8_t * values, + int & val1, int & val2) { + + uint32_t aux32; const uint8_t * q8 = (const uint8_t *)&aux32; + aux32 = q4 & 0x0f0f0f0f; + uint16_t v1 = values[q8[0]] | (values[q8[1]] << 8); + uint16_t v2 = values[q8[2]] | (values[q8[3]] << 8); + val1 = v1 | (v2 << 16); + aux32 = (q4 >> 4) & 0x0f0f0f0f; + v1 = values[q8[0]] | (values[q8[1]] << 8); + v2 = values[q8[2]] | (values[q8[3]] << 8); + val2 = v1 | (v2 << 16); +} +#endif + +static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + + const block_iq4_nl * bq = (const block_iq4_nl *) vbq; + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + const uint16_t * q4 = (const uint16_t *)bq->qs + 2*iqs; + const int32_t * q8 = (const int32_t *)bq8_1->qs + iqs; + + const uint8_t * values = (const uint8_t *)kvalues_iq4nl; + + int v1, v2; + int sumi1 = 0, sumi2 = 0; + for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) { + const uint32_t aux = q4[2*l] | (q4[2*l+1] << 16); + get_int_from_table_16(aux, values, v1, v2); + sumi1 = __dp4a(v1, q8[l+0], sumi1); + sumi2 = __dp4a(v2, q8[l+4], sumi2); + } + +#else + const uint8_t * q4 = bq->qs + 4*iqs; + const int8_t * q8 = bq8_1->qs + 4*iqs; + + int sumi1 = 0, sumi2 = 0; + for (int l = 0; l < 4*VDR_Q4_0_Q8_1_MMVQ; ++l) { + sumi1 += q8[l+ 0] * kvalues_iq4nl[q4[l] & 0xf]; + sumi2 += q8[l+16] * kvalues_iq4nl[q4[l] >> 4]; + } +#endif + const float d = (float)bq->d * __low2float(bq8_1->ds); + return d * (sumi1 + sumi2); +} + template static __device__ __forceinline__ void mul_mat_q( @@ -6777,6 +6856,12 @@ static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int k, c dequantize_block_iq1_s<<>>(vx, y); } +template +static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { + const int nb = (k + QK_K - 1) / QK_K; + dequantize_block_iq4_nl<<>>(vx, y); +} + template static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) { const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE; @@ -6818,6 +6903,8 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { return dequantize_row_iq3_xxs_cuda; case GGML_TYPE_IQ1_S: return dequantize_row_iq1_s_cuda; + case GGML_TYPE_IQ4_NL: + return dequantize_row_iq4_nl_cuda; case GGML_TYPE_F32: return convert_unary_cuda; default: @@ -6855,6 +6942,8 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { return dequantize_row_iq3_xxs_cuda; case GGML_TYPE_IQ1_S: return dequantize_row_iq1_s_cuda; + case GGML_TYPE_IQ4_NL: + return dequantize_row_iq4_nl_cuda; case GGML_TYPE_F16: return convert_unary_cuda; default: @@ -8599,6 +8688,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array= CC_RDNA2 ? 128 : 64; default: GGML_ASSERT(false); @@ -8623,6 +8713,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array= CC_VOLTA ? 128 : 64; case GGML_TYPE_Q6_K: return 64; @@ -8724,6 +8815,10 @@ static void ggml_cuda_op_mul_mat_vec_q( mul_mat_vec_q_cuda (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); break; + case GGML_TYPE_IQ4_NL: + mul_mat_vec_q_cuda + (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + break; default: GGML_ASSERT(false); break; @@ -11446,7 +11541,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons return false; } ggml_type a_type = a->type; - if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS || a_type == GGML_TYPE_IQ1_S) { + if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS || + a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL) { if (b->ne[1] == 1 && ggml_nrows(b) > 1) { return false; } diff --git a/ggml-metal.m b/ggml-metal.m index 956e323a0d053..0d4aa43093739 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -62,6 +62,7 @@ GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS, GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS, GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S, + GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL, GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, GGML_METAL_KERNEL_TYPE_RMS_NORM, GGML_METAL_KERNEL_TYPE_GROUP_NORM, @@ -85,6 +86,7 @@ GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, //GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32, @@ -104,6 +106,7 @@ GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32, @@ -120,6 +123,7 @@ GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32, @@ -136,6 +140,7 @@ GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32, GGML_METAL_KERNEL_TYPE_ROPE_F32, GGML_METAL_KERNEL_TYPE_ROPE_F16, GGML_METAL_KERNEL_TYPE_ALIBI_F32, @@ -448,6 +453,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS, get_rows_iq2_xs, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS, get_rows_iq3_xxs, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S, get_rows_iq1_s, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL, get_rows_iq4_nl, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, get_rows_i32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM, rms_norm, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM, group_norm, ctx->support_simdgroup_reduction); @@ -471,6 +477,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32, mul_mv_iq2_xs_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32, mul_mv_iq3_xxs_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32, mul_mv_iq1_s_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32, mul_mv_iq4_nl_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, mul_mv_id_f32_f32, ctx->support_simdgroup_reduction); //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16, mul_mv_id_f16_f16, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32, mul_mv_id_f16_f32, ctx->support_simdgroup_reduction); @@ -490,6 +497,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32, mul_mv_id_iq2_xs_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32, mul_mv_id_iq3_xxs_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32, mul_mv_id_iq1_s_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32, mul_mv_id_iq4_nl_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, mul_mm_f32_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32, mul_mm_f16_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32, mul_mm_q4_0_f32, ctx->support_simdgroup_mm); @@ -506,6 +514,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32, mul_mm_iq2_xs_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32, mul_mm_iq3_xxs_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32, mul_mm_iq1_s_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32, mul_mm_iq4_nl_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, mul_mm_id_f32_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32, mul_mm_id_f16_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32, mul_mm_id_q4_0_f32, ctx->support_simdgroup_mm); @@ -522,6 +531,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, mul_mm_id_iq2_xs_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32, mul_mm_id_iq3_xxs_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32, mul_mm_id_iq1_s_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32, mul_mm_id_iq4_nl_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F32, rope_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F16, rope_f16, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ALIBI_F32, alibi_f32, true); @@ -1338,6 +1348,7 @@ static bool ggml_metal_graph_compute( case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32 ].pipeline; break; case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32].pipeline; break; case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32 ].pipeline; break; + case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline; break; default: GGML_ASSERT(false && "MUL MAT-MAT not implemented"); } @@ -1478,6 +1489,12 @@ static bool ggml_metal_graph_compute( nth1 = 16; pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32].pipeline; } break; + case GGML_TYPE_IQ4_NL: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32].pipeline; + } break; default: { GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t); @@ -1525,6 +1542,11 @@ static bool ggml_metal_graph_compute( [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } + else if (src0t == GGML_TYPE_IQ4_NL) { + const int mem_size = 32*sizeof(float); + [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } else if (src0t == GGML_TYPE_Q4_K) { [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } @@ -1619,6 +1641,7 @@ static bool ggml_metal_graph_compute( case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break; case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32].pipeline; break; case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32 ].pipeline; break; + case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32 ].pipeline; break; default: GGML_ASSERT(false && "MUL_MAT_ID not implemented"); } @@ -1762,6 +1785,12 @@ static bool ggml_metal_graph_compute( nth1 = 16; pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32].pipeline; } break; + case GGML_TYPE_IQ4_NL: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32].pipeline; + } break; default: { GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t); @@ -1825,6 +1854,11 @@ static bool ggml_metal_graph_compute( [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } + else if (src2t == GGML_TYPE_IQ4_NL) { + const int mem_size = 32*sizeof(float); + [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } else if (src2t == GGML_TYPE_Q4_K) { [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } @@ -1867,6 +1901,7 @@ static bool ggml_metal_graph_compute( case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS ].pipeline; break; case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS].pipeline; break; case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S ].pipeline; break; + case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL ].pipeline; break; case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32 ].pipeline; break; default: GGML_ASSERT(false && "not implemented"); } diff --git a/ggml-metal.metal b/ggml-metal.metal index f0d77d446755c..c223a981c246a 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -2531,6 +2531,12 @@ typedef struct { uint8_t scales[QK_K/16]; } block_iq1_s; +// Non-linear quants +#define QK4_NL 32 +typedef struct { + half d; + uint8_t qs[QK4_NL/2]; +} block_iq4_nl; //====================================== dot products ========================= @@ -4384,7 +4390,6 @@ void kernel_mul_mv_iq1_s_f32_impl( const uint i13 = im/ne12; const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); - device const block_iq1_s * x = (device const block_iq1_s *) src0 + ib_row + offset0; device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; @@ -4447,6 +4452,103 @@ void kernel_mul_mv_iq1_s_f32_impl( } } +constexpr constant static float kvalues_iq4nl_f[16] = { + -127.f, -104.f, -83.f, -65.f, -49.f, -35.f, -22.f, -10.f, 1.f, 13.f, 25.f, 38.f, 53.f, 69.f, 89.f, 113.f +}; + +void kernel_mul_mv_iq4_nl_f32_impl( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne10, + constant int64_t & ne12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + threadgroup float * shared_values [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + const int nb = ne00/QK4_NL; + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int im = tgpig.z; + const int first_row = (r0 * 2 + sgitg) * 2; + const int ib_row = first_row * nb; + + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + device const block_iq4_nl * x = (device const block_iq4_nl *) src0 + ib_row + offset0; + device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + + const int ix = tiisg/2; // 0...15 + const int it = tiisg%2; // 0 or 1 + + shared_values[tiisg] = kvalues_iq4nl_f[tiisg%16]; + threadgroup_barrier(mem_flags::mem_threadgroup); + + float4 yl[4]; + float sumf[2]={0.f}, all_sum; + + device const float * yb = y + ix * QK4_NL + it * 8; + + uint32_t aux32[2]; + thread const uint8_t * q8 = (thread const uint8_t *)aux32; + + float4 qf1, qf2; + + for (int ib = ix; ib < nb; ib += 16) { + + device const float4 * y4 = (device const float4 *)yb; + yl[0] = y4[0]; yl[1] = y4[4]; yl[2] = y4[1]; yl[3] = y4[5]; + + for (int row = 0; row < 2; ++row) { + + device const block_iq4_nl & xb = x[row*nb + ib]; + device const uint16_t * q4 = (device const uint16_t *)(xb.qs + 8*it); + + float4 acc1 = {0.f}, acc2 = {0.f}; + + aux32[0] = q4[0] | (q4[1] << 16); + aux32[1] = (aux32[0] >> 4) & 0x0f0f0f0f; + aux32[0] &= 0x0f0f0f0f; + qf1 = {shared_values[q8[0]], shared_values[q8[1]], shared_values[q8[2]], shared_values[q8[3]]}; + qf2 = {shared_values[q8[4]], shared_values[q8[5]], shared_values[q8[6]], shared_values[q8[7]]}; + acc1 += yl[0] * qf1; + acc2 += yl[1] * qf2; + + aux32[0] = q4[2] | (q4[3] << 16); + aux32[1] = (aux32[0] >> 4) & 0x0f0f0f0f; + aux32[0] &= 0x0f0f0f0f; + qf1 = {shared_values[q8[0]], shared_values[q8[1]], shared_values[q8[2]], shared_values[q8[3]]}; + qf2 = {shared_values[q8[4]], shared_values[q8[5]], shared_values[q8[6]], shared_values[q8[7]]}; + acc1 += yl[2] * qf1; + acc2 += yl[3] * qf2; + + acc1 += acc2; + + sumf[row] += (float)xb.d * (acc1[0] + acc1[1] + acc1[2] + acc1[3]); + + } + + yb += 16 * QK4_NL; + } + + for (int row = 0; row < 2; ++row) { + all_sum = simd_sum(sumf[row]); + if (tiisg == 0) { + dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum; + } + } +} + [[host_name("kernel_mul_mv_iq1_s_f32")]] kernel void kernel_mul_mv_iq1_s_f32( device const void * src0, @@ -4475,6 +4577,34 @@ kernel void kernel_mul_mv_iq1_s_f32( kernel_mul_mv_iq1_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, tgpig, tiisg, sgitg); } +[[host_name("kernel_mul_mv_iq4_nl_f32")]] +kernel void kernel_mul_mv_iq4_nl_f32( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + threadgroup float * shared_values [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + kernel_mul_mv_iq4_nl_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg); +} //============================= templates and their specializations ============================= @@ -4838,6 +4968,21 @@ void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 & } } +template +void dequantize_iq4_nl(device const block_iq4_nl * xb, short il, thread type4x4 & reg) { + device const uint16_t * q4 = (device const uint16_t *)xb->qs; + const float d = xb->d; + uint32_t aux32; + thread const uint8_t * q8 = (thread const uint8_t *)&aux32; + for (int i = 0; i < 4; ++i) { + aux32 = ((q4[2*i] | (q4[2*i+1] << 16)) >> 4*il) & 0x0f0f0f0f; + reg[i][0] = d * kvalues_iq4nl_f[q8[0]]; + reg[i][1] = d * kvalues_iq4nl_f[q8[1]]; + reg[i][2] = d * kvalues_iq4nl_f[q8[2]]; + reg[i][3] = d * kvalues_iq4nl_f[q8[3]]; + } +} + template kernel void kernel_get_rows( device const void * src0, @@ -5381,6 +5526,7 @@ template [[host_name("kernel_get_rows_iq2_xxs")]] kernel get_rows_t kernel_get_r template [[host_name("kernel_get_rows_iq2_xs")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_iq3_xxs")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_iq1_s")]] kernel get_rows_t kernel_get_rows; +template [[host_name("kernel_get_rows_iq4_nl")]] kernel get_rows_t kernel_get_rows; // // matrix-matrix multiplication @@ -5421,6 +5567,7 @@ template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mat_mm_t kernel_mul_m template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mat_mm_t kernel_mul_mm; // // indirect matrix-matrix multiplication @@ -5473,6 +5620,7 @@ template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mat_mm_id_t kernel template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_iq3_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_iq1_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; // // matrix-vector multiplication @@ -6503,3 +6651,68 @@ kernel void kernel_mul_mv_id_iq1_s_f32( tiisg, sgitg); } + +[[host_name("kernel_mul_mv_id_iq4_nl_f32")]] +kernel void kernel_mul_mv_id_iq4_nl_f32( + device const char * ids, + device const char * src1, + device float * dst, + constant uint64_t & nbi1, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t & ne13, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint64_t & nb1, + constant uint & r2, + constant uint & r3, + constant int & idx, + device const char * src00, + device const char * src01, + device const char * src02, + device const char * src03, + device const char * src04, + device const char * src05, + device const char * src06, + device const char * src07, + threadgroup float * shared_values [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; + + const int64_t bid = tgpig.z/(ne12*ne13); + + tgpig.z = tgpig.z%(ne12*ne13); + + const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; + + kernel_mul_mv_iq4_nl_f32_impl( + src0[id], + (device const float *) (src1 + bid*nb11), + dst + bid*ne0, + ne00, + ne01, + ne02, + ne10, + ne12, + ne0, + ne1, + r2, + r3, + shared_values, + tgpig, + tiisg, + sgitg); +} diff --git a/ggml-quants.c b/ggml-quants.c index 3319d2ccf1812..6336538f0e99e 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -3754,6 +3754,26 @@ void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, in } } +static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113}; + +void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, int k) { + assert(k % QK4_NL == 0); + const int nb = k / QK4_NL; + + for (int i = 0; i < nb; i++) { + + const uint8_t * qs = x[i].qs; + + const float d = GGML_FP16_TO_FP32(x[i].d); + for (int j = 0; j < QK4_NL/2; ++j) { + y[j+ 0] = d * kvalues_iq4nl[qs[j] & 0xf]; + y[j+QK4_NL/2] = d * kvalues_iq4nl[qs[j] >> 4]; + } + y += QK4_NL; + qs += QK4_NL/2; + } +} + //===================================== Q8_K ============================================== void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) { @@ -9148,7 +9168,6 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * #endif } -// TODO void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); @@ -9452,7 +9471,100 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const *s = sumf; #endif +} + +void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + assert(n % QK4_NL == 0); + static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same"); + + const block_iq4_nl * restrict x = vx; + const block_q8_0 * restrict y = vy; + + const int nb = n / QK4_NL; + +#if defined __ARM_NEON + const int8x16_t values = vld1q_s8(kvalues_iq4nl); + const uint8x16_t m4b = vdupq_n_u8(0x0f); + uint8x16x2_t q4bits; + int8x16x4_t q4b; + int8x16x4_t q8b; + int32x4_t prod_1, prod_2; + float sumf = 0; + + for (int ib = 0; ib < nb; ib += 2) { + + q4bits.val[0] = vld1q_u8(x[ib+0].qs); + q4bits.val[1] = vld1q_u8(x[ib+1].qs); + q8b.val[0] = vld1q_s8(y[ib+0].qs); + q8b.val[1] = vld1q_s8(y[ib+0].qs + 16); + q8b.val[2] = vld1q_s8(y[ib+1].qs); + q8b.val[3] = vld1q_s8(y[ib+1].qs + 16); + + q4b.val[0] = vqtbl1q_s8(values, vandq_u8(q4bits.val[0], m4b)); + q4b.val[1] = vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4)); + q4b.val[2] = vqtbl1q_s8(values, vandq_u8(q4bits.val[1], m4b)); + q4b.val[3] = vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4)); + + prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]); + prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]); + + sumf += (float)x[ib+0].d * (float)y[ib+0].d * vaddvq_s32(prod_1) + (float)x[ib+1].d * (float)y[ib+1].d * vaddvq_s32(prod_2); + + } + + *s = sumf; + +#elif defined __AVX2__ + + const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl); + const __m128i m4b = _mm_set1_epi8(0x0f); + const __m256i mone = _mm256_set1_epi16(1); + + __m256 accum1 = _mm256_setzero_ps(); + __m256 accum2 = _mm256_setzero_ps(); + for (int ib = 0; ib < nb; ib += 2) { + const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)x[0].qs); + const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[1].qs); + const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[0].qs); + const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[1].qs); + const __m256i q4b_1 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)), + _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b))); + const __m256i q4b_2 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)), + _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b))); + const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1); + const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2); + const __m256i p_1 = _mm256_madd_epi16(p16_1, mone); + const __m256i p_2 = _mm256_madd_epi16(p16_2, mone); + accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[0].d)*GGML_FP16_TO_FP32(x[0].d)), + _mm256_cvtepi32_ps(p_1), accum1); + accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[1].d)*GGML_FP16_TO_FP32(x[1].d)), + _mm256_cvtepi32_ps(p_2), accum2); + + y += 2; + x += 2; + } + + *s = hsum_float_8(_mm256_add_ps(accum1, accum2)); + +#else + float sumf = 0; + for (int ib = 0; ib < nb; ++ib) { + const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d); + int sumi1 = 0, sumi2 = 0; + for (int j = 0; j < QK4_NL/2; ++j) { + sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf]; + sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >> 4]; + } + sumf += d * (sumi1 + sumi2); + } + *s = sumf; +#endif } // ================================ IQ2 quantization ============================================= @@ -10729,3 +10841,123 @@ size_t quantize_iq1_s(const float * src, void * dst, int nrow, int n_per_row, in } return nrow * nblock * sizeof(block_iq1_s); } + +// ============================ 4-bit non-linear quants + +static inline int best_index_int8(int n, const int8_t * val, float x) { + if (x <= val[0]) return 0; + if (x >= val[n-1]) return n-1; + int ml = 0, mu = n-1; + while (mu-ml > 1) { + int mav = (ml+mu)/2; + if (x < val[mav]) mu = mav; else ml = mav; + } + return x - val[mu-1] < val[mu] - x ? mu-1 : mu; +} + +static void quantize_row_iq4_nl_impl(const int block_size, const float * GGML_RESTRICT x, + ggml_fp16_t * dh, uint8_t * q4, + float * weight, uint8_t * L, + const int8_t * values, + const float * quant_weights) { + + const int ntry = 7; + + float sigma2 = 0; + for (int j = 0; j < QK4_NL; ++j) sigma2 += x[j]*x[j]; + sigma2 *= 2.f/QK4_NL; + + const int nb = QK4_NL/block_size; + + memset(q4, 0, QK4_NL/2); + for (int ib = 0; ib < nb; ++ib) { + dh[ib] = GGML_FP32_TO_FP16(0.f); + const float * xb = x + ib*block_size; + if (quant_weights) { + const float * qw = quant_weights + ib*block_size; + for (int j = 0; j < block_size; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); + } else { + for (int j = 0; j < block_size; ++j) weight[j] = xb[j]*xb[j]; + } + float amax = 0, max = 0; + for (int j = 0; j < block_size; ++j) { + float ax = fabsf(xb[j]); + if (ax > amax) { + amax = ax; max = xb[j]; + } + } + if (!amax) { + continue; + } + float d = -max/values[0]; + float id = 1/d; + float sumqx = 0, sumq2 = 0; + for (int j = 0; j < block_size; ++j) { + float al = id*xb[j]; + int l = best_index_int8(16, values, al); + float q = values[l]; + float w = weight[j]; + sumqx += w*q*xb[j]; + sumq2 += w*q*q; + } + float best_id = id; + d = sumqx/sumq2; + float best = d*sumqx; + for (int itry = -ntry; itry <= ntry; ++itry) { + id = (itry + values[0])/max; + sumqx = sumq2 = 0; + for (int j = 0; j < block_size; ++j) { + float al = id*xb[j]; + int l = best_index_int8(16, values, al); + float q = values[l]; + float w = weight[j]; + sumqx += w*q*xb[j]; + sumq2 += w*q*q; + } + if (sumq2 > 0 && sumqx*sumqx > best*sumq2) { + d = sumqx/sumq2; best = d * sumqx; + best_id = id; + } + } + dh[ib] = GGML_FP32_TO_FP16(d); + for (int j = 0; j < block_size; ++j) { + L[ib*block_size + j] = best_index_int8(16, values, best_id*xb[j]); + } + } + for (int i = 0; i < QK4_NL/32; ++i) { + for (int j = 0; j < 16; ++j) { + q4[16*i + j] = L[32*i + j] | (L[32*i + 16 + j] << 4); + } + } +} + +size_t quantize_iq4_nl(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { + (void)hist; + GGML_ASSERT(n_per_row%QK4_NL == 0); + int nblock = n_per_row/QK4_NL; + char * qrow = (char *)dst; + uint8_t L[QK4_NL]; + float weight[32]; + for (int row = 0; row < nrow; ++row) { + block_iq4_nl * iq4 = (block_iq4_nl *)qrow; + for (int ibl = 0; ibl < nblock; ++ibl) { + const float * qw = quant_weights ? quant_weights + QK4_NL*ibl : NULL; + quantize_row_iq4_nl_impl(32, src + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, weight, L, kvalues_iq4nl, qw); + } + src += n_per_row; + qrow += nblock*sizeof(block_iq4_nl); + } + return nrow * nblock * sizeof(block_iq4_nl); +} + +void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) { + assert(k % QK4_NL == 0); + block_iq4_nl * restrict y = vy; + quantize_row_iq4_nl_reference(x, y, k); +} + +void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int k) { + assert(k % QK4_NL == 0); + quantize_iq4_nl(x, y, 1, k, NULL, NULL); +} + diff --git a/ggml-quants.h b/ggml-quants.h index ad381cfab022d..113623b62938a 100644 --- a/ggml-quants.h +++ b/ggml-quants.h @@ -198,6 +198,14 @@ typedef struct { } block_iq1_s; static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding"); +// Non-linear quants +#define QK4_NL 32 +typedef struct { + ggml_fp16_t d; + uint8_t qs[QK4_NL/2]; +} block_iq4_nl; +static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding"); + #ifdef __cplusplus extern "C" { #endif @@ -217,6 +225,7 @@ void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGM void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k); void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k); void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k); +void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int k); void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); @@ -232,6 +241,7 @@ void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); +void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); // Dequantization void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); @@ -251,6 +261,7 @@ void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_ void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); // Dot product void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); @@ -268,6 +279,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); // // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization") @@ -276,6 +288,7 @@ size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_iq1_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); +size_t quantize_iq4_nl (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index df182611241b0..b897828f967b1 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -14642,7 +14642,8 @@ GGML_CALL static const char * ggml_backend_sycl_buffer_type_name(ggml_backend_bu static ggml_backend_buffer_t ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) try { - int device = (int) (intptr_t) buft->context; + ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context; + int device = (int) buft_ctx->device; ggml_sycl_set_device(device); int device_index = get_device_index_by_id(device); @@ -14720,7 +14721,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) { for (int i = 0; i < GGML_SYCL_MAX_DEVICES; i++) { ggml_backend_sycl_buffer_types[i] = { /* .iface = */ ggml_backend_sycl_buffer_type_interface, - /* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i, + /* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i)}, }; } ggml_backend_sycl_buffer_type_initialized = true; @@ -14782,10 +14783,6 @@ ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() { // backend -struct ggml_backend_context_sycl { - int device; -}; - static const char * ggml_backend_sycl_name(ggml_backend_t backend) { return GGML_SYCL_NAME; @@ -14793,14 +14790,14 @@ static const char * ggml_backend_sycl_name(ggml_backend_t backend) { } static void ggml_backend_sycl_free(ggml_backend_t backend) { - ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context; + ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; delete sycl_ctx; delete backend; } static ggml_backend_buffer_type_t ggml_backend_sycl_get_default_buffer_type(ggml_backend_t backend) { - ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context; + ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; return ggml_backend_sycl_buffer_type(sycl_ctx->device); } @@ -14809,7 +14806,7 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend, ggml_tensor *tensor, const void *data, size_t offset, size_t size) try { - ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context; + ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type"); GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); @@ -14827,7 +14824,7 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend, const ggml_tensor *tensor, void *data, size_t offset, size_t size) try { - ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context; + ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type"); GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); @@ -14842,7 +14839,7 @@ catch (sycl::exception const &exc) { } static void ggml_backend_sycl_synchronize(ggml_backend_t backend) try { - ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context; + ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->wait())); @@ -14878,7 +14875,7 @@ static void ggml_backend_sycl_graph_plan_compute(ggml_backend_t backend, ggml_ba } static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { - ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context; + ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; ggml_sycl_set_main_device(sycl_ctx->device); @@ -15092,8 +15089,9 @@ ggml_backend_t ggml_backend_sycl_init(int device) { // not strictly necessary, but it may reduce the overhead of the first graph_compute ggml_sycl_set_main_device(device); - ggml_backend_context_sycl * ctx = new ggml_backend_context_sycl { - /* .device = */ device + ggml_backend_sycl_context * ctx = new ggml_backend_sycl_context { + /* .device = */ device, + /* .name = */ GGML_SYCL_NAME + std::to_string(device), }; ggml_backend_t sycl_backend = new ggml_backend { diff --git a/ggml.c b/ggml.c index d129df50548a3..5b9fa741a6479 100644 --- a/ggml.c +++ b/ggml.c @@ -690,6 +690,18 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, }, + [GGML_TYPE_IQ4_NL] = { + .type_name = "iq4_nl", + .blck_size = QK4_NL, + .type_size = sizeof(block_iq4_nl), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_iq4_nl, + .from_float = quantize_row_iq4_nl, + .from_float_reference = (ggml_from_float_t)quantize_row_iq4_nl_reference, + .vec_dot = ggml_vec_dot_iq4_nl_q8_0, + .vec_dot_type = GGML_TYPE_Q8_0, + .nrows = 1, + }, [GGML_TYPE_Q8_K] = { .type_name = "q8_K", .blck_size = QK_K, @@ -2291,6 +2303,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { case GGML_FTYPE_MOSTLY_IQ2_XS: wtype = GGML_TYPE_IQ2_XS; break; case GGML_FTYPE_MOSTLY_IQ3_XXS: wtype = GGML_TYPE_IQ3_XXS; break; case GGML_FTYPE_MOSTLY_IQ1_S: wtype = GGML_TYPE_IQ1_S; break; + case GGML_FTYPE_MOSTLY_IQ4_NL: wtype = GGML_TYPE_IQ4_NL; break; case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break; case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break; } @@ -5631,7 +5644,9 @@ struct ggml_tensor * ggml_conv_2d( ggml_reshape_2d(ctx, im2col, im2col->ne[0], im2col->ne[3] * im2col->ne[2] * im2col->ne[1]), // [N, OH, OW, IC * KH * KW] => [N*OH*OW, IC * KH * KW] ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1] * a->ne[2]), a->ne[3])); // [OC,IC, KH, KW] => [OC, IC * KH * KW] - result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], a->ne[3], im2col->ne[3]); // [N, OC, OH, OW] + result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], im2col->ne[3], a->ne[3]); // [OC, N, OH, OW] + result = ggml_cont(ctx, ggml_permute(ctx, result, 0, 1, 3, 2)); // [N, OC, OH, OW] + return result; } @@ -6637,8 +6652,10 @@ void ggml_set_param( static void ggml_compute_forward_dup_same_cont( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); GGML_ASSERT(src0->type == dst->type); @@ -6669,8 +6686,10 @@ static void ggml_compute_forward_dup_same_cont( } static void ggml_compute_forward_dup_f16( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -6683,7 +6702,7 @@ static void ggml_compute_forward_dup_f16( const int nth = params->nth; // number of threads if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) { - ggml_compute_forward_dup_same_cont(params, src0, dst); + ggml_compute_forward_dup_same_cont(params, dst); return; } @@ -6940,8 +6959,10 @@ static void ggml_compute_forward_dup_f16( static void ggml_compute_forward_dup_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -6954,7 +6975,7 @@ static void ggml_compute_forward_dup_f32( const int nth = params->nth; // number of threads if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) { - ggml_compute_forward_dup_same_cont(params, src0, dst); + ggml_compute_forward_dup_same_cont(params, dst); return; } @@ -7190,8 +7211,10 @@ static void ggml_compute_forward_dup_f32( // A simplified version of ggml_compute_forward_dup that doesn't do float upcasting, and just plain old memcpy. static void ggml_compute_forward_dup_bytes( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); GGML_ASSERT(src0->type == dst->type); @@ -7200,7 +7223,7 @@ static void ggml_compute_forward_dup_bytes( } if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) { - ggml_compute_forward_dup_same_cont(params, src0, dst); + ggml_compute_forward_dup_same_cont(params, dst); return; } @@ -7339,21 +7362,23 @@ static void ggml_compute_forward_dup_bytes( static void ggml_compute_forward_dup( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + if (src0->type == dst->type) { - ggml_compute_forward_dup_bytes(params, src0, dst); + ggml_compute_forward_dup_bytes(params, dst); return; } switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_dup_f16(params, src0, dst); + ggml_compute_forward_dup_f16(params, dst); } break; case GGML_TYPE_F32: { - ggml_compute_forward_dup_f32(params, src0, dst); + ggml_compute_forward_dup_f32(params, dst); } break; default: { @@ -7366,9 +7391,11 @@ static void ggml_compute_forward_dup( static void ggml_compute_forward_add_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -7454,9 +7481,11 @@ static void ggml_compute_forward_add_f32( static void ggml_compute_forward_add_f16_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -7531,9 +7560,11 @@ static void ggml_compute_forward_add_f16_f32( static void ggml_compute_forward_add_f16_f16( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -7585,9 +7616,11 @@ static void ggml_compute_forward_add_f16_f16( static void ggml_compute_forward_add_q_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -7663,14 +7696,16 @@ static void ggml_compute_forward_add_q_f32( static void ggml_compute_forward_add( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + switch (src0->type) { case GGML_TYPE_F32: { if (src1->type == GGML_TYPE_F32) { - ggml_compute_forward_add_f32(params, src0, src1, dst); + ggml_compute_forward_add_f32(params, dst); } else { GGML_ASSERT(false); @@ -7679,10 +7714,10 @@ static void ggml_compute_forward_add( case GGML_TYPE_F16: { if (src1->type == GGML_TYPE_F16) { - ggml_compute_forward_add_f16_f16(params, src0, src1, dst); + ggml_compute_forward_add_f16_f16(params, dst); } else if (src1->type == GGML_TYPE_F32) { - ggml_compute_forward_add_f16_f32(params, src0, src1, dst); + ggml_compute_forward_add_f16_f32(params, dst); } else { GGML_ASSERT(false); @@ -7702,8 +7737,9 @@ static void ggml_compute_forward_add( case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ4_NL: { - ggml_compute_forward_add_q_f32(params, src0, src1, dst); + ggml_compute_forward_add_q_f32(params, dst); } break; default: { @@ -7716,9 +7752,11 @@ static void ggml_compute_forward_add( static void ggml_compute_forward_add1_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_is_scalar(src1)); @@ -7768,9 +7806,11 @@ static void ggml_compute_forward_add1_f32( static void ggml_compute_forward_add1_f16_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_is_scalar(src1)); @@ -7818,9 +7858,11 @@ static void ggml_compute_forward_add1_f16_f32( static void ggml_compute_forward_add1_f16_f16( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_is_scalar(src1)); @@ -7868,9 +7910,11 @@ static void ggml_compute_forward_add1_f16_f16( static void ggml_compute_forward_add1_q_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_is_scalar(src1)); @@ -7935,21 +7979,23 @@ static void ggml_compute_forward_add1_q_f32( static void ggml_compute_forward_add1( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_add1_f32(params, src0, src1, dst); + ggml_compute_forward_add1_f32(params, dst); } break; case GGML_TYPE_F16: { if (src1->type == GGML_TYPE_F16) { - ggml_compute_forward_add1_f16_f16(params, src0, src1, dst); + ggml_compute_forward_add1_f16_f16(params, dst); } else if (src1->type == GGML_TYPE_F32) { - ggml_compute_forward_add1_f16_f32(params, src0, src1, dst); + ggml_compute_forward_add1_f16_f32(params, dst); } else { GGML_ASSERT(false); @@ -7970,8 +8016,9 @@ static void ggml_compute_forward_add1( case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ4_NL: { - ggml_compute_forward_add1_q_f32(params, src0, src1, dst); + ggml_compute_forward_add1_q_f32(params, dst); } break; default: { @@ -7984,9 +8031,11 @@ static void ggml_compute_forward_add1( static void ggml_compute_forward_acc_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); @@ -8066,14 +8115,14 @@ static void ggml_compute_forward_acc_f32( static void ggml_compute_forward_acc( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_acc_f32(params, src0, src1, dst); + ggml_compute_forward_acc_f32(params, dst); } break; case GGML_TYPE_F16: case GGML_TYPE_Q4_0: @@ -8091,6 +8140,7 @@ static void ggml_compute_forward_acc( case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ4_NL: default: { GGML_ASSERT(false); @@ -8102,9 +8152,11 @@ static void ggml_compute_forward_acc( static void ggml_compute_forward_sub_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + assert(params->ith == 0); assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); @@ -8162,13 +8214,14 @@ static void ggml_compute_forward_sub_f32( static void ggml_compute_forward_sub( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_sub_f32(params, src0, src1, dst); + ggml_compute_forward_sub_f32(params, dst); } break; default: { @@ -8181,9 +8234,11 @@ static void ggml_compute_forward_sub( static void ggml_compute_forward_mul_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -8264,15 +8319,17 @@ static void ggml_compute_forward_mul_f32( static void ggml_compute_forward_mul( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(src1->type == GGML_TYPE_F32 && "only f32 src1 supported for now"); switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_mul_f32(params, src0, src1, dst); + ggml_compute_forward_mul_f32(params, dst); } break; default: { @@ -8285,9 +8342,11 @@ static void ggml_compute_forward_mul( static void ggml_compute_forward_div_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -8358,13 +8417,14 @@ static void ggml_compute_forward_div_f32( static void ggml_compute_forward_div( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_div_f32(params, src0, src1, dst); + ggml_compute_forward_div_f32(params, dst); } break; default: { @@ -8377,8 +8437,10 @@ static void ggml_compute_forward_div( static void ggml_compute_forward_sqr_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); @@ -8401,12 +8463,14 @@ static void ggml_compute_forward_sqr_f32( static void ggml_compute_forward_sqr( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_sqr_f32(params, src0, dst); + ggml_compute_forward_sqr_f32(params, dst); } break; default: { @@ -8419,8 +8483,10 @@ static void ggml_compute_forward_sqr( static void ggml_compute_forward_sqrt_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); @@ -8443,12 +8509,14 @@ static void ggml_compute_forward_sqrt_f32( static void ggml_compute_forward_sqrt( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_sqrt_f32(params, src0, dst); + ggml_compute_forward_sqrt_f32(params, dst); } break; default: { @@ -8461,8 +8529,10 @@ static void ggml_compute_forward_sqrt( static void ggml_compute_forward_log_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(params->ith == 0); GGML_ASSERT(ggml_are_same_shape(src0, dst)); @@ -8485,12 +8555,14 @@ static void ggml_compute_forward_log_f32( static void ggml_compute_forward_log( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_log_f32(params, src0, dst); + ggml_compute_forward_log_f32(params, dst); } break; default: { @@ -8503,8 +8575,10 @@ static void ggml_compute_forward_log( static void ggml_compute_forward_sum_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(ggml_is_scalar(dst)); @@ -8536,8 +8610,10 @@ static void ggml_compute_forward_sum_f32( static void ggml_compute_forward_sum_f16( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(ggml_is_scalar(dst)); @@ -8568,16 +8644,18 @@ static void ggml_compute_forward_sum_f16( static void ggml_compute_forward_sum( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_sum_f32(params, src0, dst); + ggml_compute_forward_sum_f32(params, dst); } break; case GGML_TYPE_F16: { - ggml_compute_forward_sum_f16(params, src0, dst); + ggml_compute_forward_sum_f16(params, dst); } break; default: { @@ -8590,8 +8668,10 @@ static void ggml_compute_forward_sum( static void ggml_compute_forward_sum_rows_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(params->ith == 0); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -8623,12 +8703,14 @@ static void ggml_compute_forward_sum_rows_f32( static void ggml_compute_forward_sum_rows( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_sum_rows_f32(params, src0, dst); + ggml_compute_forward_sum_rows_f32(params, dst); } break; default: { @@ -8641,8 +8723,10 @@ static void ggml_compute_forward_sum_rows( static void ggml_compute_forward_mean_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -8678,12 +8762,14 @@ static void ggml_compute_forward_mean_f32( static void ggml_compute_forward_mean( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_mean_f32(params, src0, dst); + ggml_compute_forward_mean_f32(params, dst); } break; default: { @@ -8696,8 +8782,10 @@ static void ggml_compute_forward_mean( static void ggml_compute_forward_argmax_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -8724,12 +8812,14 @@ static void ggml_compute_forward_argmax_f32( static void ggml_compute_forward_argmax( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_argmax_f32(params, src0, dst); + ggml_compute_forward_argmax_f32(params, dst); } break; default: { @@ -8742,8 +8832,10 @@ static void ggml_compute_forward_argmax( static void ggml_compute_forward_repeat_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(params->ith == 0); GGML_ASSERT(ggml_can_repeat(src0, dst)); @@ -8785,8 +8877,10 @@ static void ggml_compute_forward_repeat_f32( static void ggml_compute_forward_repeat_f16( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(params->ith == 0); GGML_ASSERT(ggml_can_repeat(src0, dst)); @@ -8831,18 +8925,20 @@ static void ggml_compute_forward_repeat_f16( static void ggml_compute_forward_repeat( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F16: case GGML_TYPE_I16: { - ggml_compute_forward_repeat_f16(params, src0, dst); + ggml_compute_forward_repeat_f16(params, dst); } break; case GGML_TYPE_F32: case GGML_TYPE_I32: { - ggml_compute_forward_repeat_f32(params, src0, dst); + ggml_compute_forward_repeat_f32(params, dst); } break; default: { @@ -8855,8 +8951,10 @@ static void ggml_compute_forward_repeat( static void ggml_compute_forward_repeat_back_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(params->ith == 0); GGML_ASSERT(ggml_can_repeat(dst, src0)); @@ -8912,12 +9010,14 @@ static void ggml_compute_forward_repeat_back_f32( static void ggml_compute_forward_repeat_back( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_repeat_back_f32(params, src0, dst); + ggml_compute_forward_repeat_back_f32(params, dst); } break; default: { @@ -8930,10 +9030,11 @@ static void ggml_compute_forward_repeat_back( static void ggml_compute_forward_concat_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -8978,14 +9079,15 @@ static void ggml_compute_forward_concat_f32( static void ggml_compute_forward_concat( const struct ggml_compute_params* params, - const struct ggml_tensor* src0, - const struct ggml_tensor* src1, struct ggml_tensor* dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: case GGML_TYPE_I32: { - ggml_compute_forward_concat_f32(params, src0, src1, dst); + ggml_compute_forward_concat_f32(params, dst); } break; default: { @@ -8998,8 +9100,10 @@ static void ggml_compute_forward_concat( static void ggml_compute_forward_abs_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); @@ -9022,12 +9126,14 @@ static void ggml_compute_forward_abs_f32( static void ggml_compute_forward_abs( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_abs_f32(params, src0, dst); + ggml_compute_forward_abs_f32(params, dst); } break; default: { @@ -9040,8 +9146,10 @@ static void ggml_compute_forward_abs( static void ggml_compute_forward_sgn_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); @@ -9064,12 +9172,14 @@ static void ggml_compute_forward_sgn_f32( static void ggml_compute_forward_sgn( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_sgn_f32(params, src0, dst); + ggml_compute_forward_sgn_f32(params, dst); } break; default: { @@ -9082,8 +9192,10 @@ static void ggml_compute_forward_sgn( static void ggml_compute_forward_neg_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); @@ -9106,12 +9218,14 @@ static void ggml_compute_forward_neg_f32( static void ggml_compute_forward_neg( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_neg_f32(params, src0, dst); + ggml_compute_forward_neg_f32(params, dst); } break; default: { @@ -9124,8 +9238,10 @@ static void ggml_compute_forward_neg( static void ggml_compute_forward_step_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); @@ -9148,12 +9264,14 @@ static void ggml_compute_forward_step_f32( static void ggml_compute_forward_step( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_step_f32(params, src0, dst); + ggml_compute_forward_step_f32(params, dst); } break; default: { @@ -9166,8 +9284,10 @@ static void ggml_compute_forward_step( static void ggml_compute_forward_tanh_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); @@ -9190,12 +9310,14 @@ static void ggml_compute_forward_tanh_f32( static void ggml_compute_forward_tanh( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_tanh_f32(params, src0, dst); + ggml_compute_forward_tanh_f32(params, dst); } break; default: { @@ -9208,8 +9330,10 @@ static void ggml_compute_forward_tanh( static void ggml_compute_forward_elu_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); @@ -9232,12 +9356,14 @@ static void ggml_compute_forward_elu_f32( static void ggml_compute_forward_elu( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_elu_f32(params, src0, dst); + ggml_compute_forward_elu_f32(params, dst); } break; default: { @@ -9250,8 +9376,10 @@ static void ggml_compute_forward_elu( static void ggml_compute_forward_relu_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); @@ -9274,12 +9402,14 @@ static void ggml_compute_forward_relu_f32( static void ggml_compute_forward_relu( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_relu_f32(params, src0, dst); + ggml_compute_forward_relu_f32(params, dst); } break; default: { @@ -9292,8 +9422,10 @@ static void ggml_compute_forward_relu( static void ggml_compute_forward_gelu_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0)); GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst)); GGML_ASSERT(ggml_are_same_shape(src0, dst)); @@ -9333,12 +9465,14 @@ static void ggml_compute_forward_gelu_f32( static void ggml_compute_forward_gelu( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_gelu_f32(params, src0, dst); + ggml_compute_forward_gelu_f32(params, dst); } break; default: { @@ -9351,8 +9485,10 @@ static void ggml_compute_forward_gelu( static void ggml_compute_forward_gelu_quick_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0)); GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst)); GGML_ASSERT(ggml_are_same_shape(src0, dst)); @@ -9392,12 +9528,14 @@ static void ggml_compute_forward_gelu_quick_f32( static void ggml_compute_forward_gelu_quick( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_gelu_quick_f32(params, src0, dst); + ggml_compute_forward_gelu_quick_f32(params, dst); } break; default: { @@ -9410,8 +9548,10 @@ static void ggml_compute_forward_gelu_quick( static void ggml_compute_forward_silu_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0)); GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst)); GGML_ASSERT(ggml_are_same_shape(src0, dst)); @@ -9451,12 +9591,14 @@ static void ggml_compute_forward_silu_f32( static void ggml_compute_forward_silu( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_silu_f32(params, src0, dst); + ggml_compute_forward_silu_f32(params, dst); } break; default: { @@ -9468,8 +9610,10 @@ static void ggml_compute_forward_silu( static void ggml_compute_forward_leaky_relu_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); @@ -9495,12 +9639,14 @@ static void ggml_compute_forward_leaky_relu_f32( static void ggml_compute_forward_leaky_relu( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_leaky_relu_f32(params, src0, dst); + ggml_compute_forward_leaky_relu_f32(params, dst); } break; default: { @@ -9513,9 +9659,11 @@ static void ggml_compute_forward_leaky_relu( static void ggml_compute_forward_silu_back_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * grad, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * grad = dst->src[1]; + GGML_ASSERT(ggml_is_contiguous_except_dim_1(grad)); GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0)); GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst)); @@ -9558,13 +9706,14 @@ static void ggml_compute_forward_silu_back_f32( static void ggml_compute_forward_silu_back( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * grad, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_silu_back_f32(params, src0, grad, dst); + ggml_compute_forward_silu_back_f32(params, dst); } break; default: { @@ -9576,8 +9725,10 @@ static void ggml_compute_forward_silu_back( static void ggml_compute_forward_hardswish_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); @@ -9599,12 +9750,14 @@ static void ggml_compute_forward_hardswish_f32( } static void ggml_compute_forward_hardswish( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_hardswish_f32(params, src0, dst); + ggml_compute_forward_hardswish_f32(params, dst); } break; default: { @@ -9615,8 +9768,10 @@ static void ggml_compute_forward_hardswish( static void ggml_compute_forward_hardsigmoid_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); @@ -9639,12 +9794,14 @@ static void ggml_compute_forward_hardsigmoid_f32( static void ggml_compute_forward_hardsigmoid( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_hardsigmoid_f32(params, src0, dst); + ggml_compute_forward_hardsigmoid_f32(params, dst); } break; default: { @@ -9658,8 +9815,10 @@ static void ggml_compute_forward_hardsigmoid( static void ggml_compute_forward_norm_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(ggml_are_same_shape(src0, dst)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -9711,12 +9870,14 @@ static void ggml_compute_forward_norm_f32( static void ggml_compute_forward_norm( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_norm_f32(params, src0, dst); + ggml_compute_forward_norm_f32(params, dst); } break; default: { @@ -9729,8 +9890,10 @@ static void ggml_compute_forward_norm( static void ggml_compute_forward_rms_norm_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(ggml_are_same_shape(src0, dst)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -9779,12 +9942,14 @@ static void ggml_compute_forward_rms_norm_f32( static void ggml_compute_forward_rms_norm( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_rms_norm_f32(params, src0, dst); + ggml_compute_forward_rms_norm_f32(params, dst); } break; default: { @@ -9795,9 +9960,11 @@ static void ggml_compute_forward_rms_norm( static void ggml_compute_forward_rms_norm_back_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_are_same_shape(src0, src1)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -9952,13 +10119,14 @@ static void ggml_compute_forward_rms_norm_back_f32( static void ggml_compute_forward_rms_norm_back( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_rms_norm_back_f32(params, src0, src1, dst); + ggml_compute_forward_rms_norm_back_f32(params, dst); } break; default: { @@ -9971,8 +10139,10 @@ static void ggml_compute_forward_rms_norm_back( static void ggml_compute_forward_group_norm_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(ggml_are_same_shape(src0, dst)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -10043,12 +10213,14 @@ static void ggml_compute_forward_group_norm_f32( static void ggml_compute_forward_group_norm( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_group_norm_f32(params, src0, dst); + ggml_compute_forward_group_norm_f32(params, dst); } break; default: { @@ -10094,9 +10266,11 @@ static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) { static void ggml_compute_forward_mul_mat( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + int64_t t0 = ggml_perf_time_us(); UNUSED(t0); @@ -10341,10 +10515,11 @@ static void ggml_compute_forward_mul_mat( static void ggml_compute_forward_mul_mat_id( const struct ggml_compute_params * params, - const struct ggml_tensor * ids, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + const struct ggml_tensor * ids = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + const struct ggml_tensor * src0 = dst->src[2]; // only for GGML_TENSOR_BINARY_OP_LOCALS GGML_TENSOR_BINARY_OP_LOCALS @@ -10535,9 +10710,11 @@ static void ggml_compute_forward_mul_mat_id( static void ggml_compute_forward_out_prod_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + // int64_t t0 = ggml_perf_time_us(); // UNUSED(t0); @@ -10727,9 +10904,11 @@ static void ggml_compute_forward_out_prod_f32( static void ggml_compute_forward_out_prod_q_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + // int64_t t0 = ggml_perf_time_us(); // UNUSED(t0); @@ -10840,9 +11019,10 @@ static void ggml_compute_forward_out_prod_q_f32( static void ggml_compute_forward_out_prod( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: @@ -10858,17 +11038,18 @@ static void ggml_compute_forward_out_prod( case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ4_NL: { - ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst); + ggml_compute_forward_out_prod_q_f32(params, dst); } break; case GGML_TYPE_F16: { GGML_ASSERT(false); // todo - // ggml_compute_forward_out_prod_f16_f32(params, src0, src1, dst); + // ggml_compute_forward_out_prod_f16_f32(params, dst); } break; case GGML_TYPE_F32: { - ggml_compute_forward_out_prod_f32(params, src0, src1, dst); + ggml_compute_forward_out_prod_f32(params, dst); } break; default: { @@ -10881,8 +11062,10 @@ static void ggml_compute_forward_out_prod( static void ggml_compute_forward_scale_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(ggml_is_contiguous(src0)); GGML_ASSERT(ggml_is_contiguous(dst)); GGML_ASSERT(ggml_are_same_shape(src0, dst)); @@ -10923,12 +11106,14 @@ static void ggml_compute_forward_scale_f32( static void ggml_compute_forward_scale( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_scale_f32(params, src0, dst); + ggml_compute_forward_scale_f32(params, dst); } break; default: { @@ -10941,9 +11126,11 @@ static void ggml_compute_forward_scale( static void ggml_compute_forward_set_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); @@ -11014,14 +11201,14 @@ static void ggml_compute_forward_set_f32( static void ggml_compute_forward_set( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_set_f32(params, src0, src1, dst); + ggml_compute_forward_set_f32(params, dst); } break; case GGML_TYPE_F16: case GGML_TYPE_Q4_0: @@ -11039,6 +11226,7 @@ static void ggml_compute_forward_set( case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ4_NL: default: { GGML_ASSERT(false); @@ -11050,29 +11238,25 @@ static void ggml_compute_forward_set( static void ggml_compute_forward_cpy( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { - ggml_compute_forward_dup(params, src0, dst); + ggml_compute_forward_dup(params, dst); } // ggml_compute_forward_cont static void ggml_compute_forward_cont( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { - ggml_compute_forward_dup(params, src0, dst); + ggml_compute_forward_dup(params, dst); } // ggml_compute_forward_reshape static void ggml_compute_forward_reshape( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { // NOP UNUSED(params); - UNUSED(src0); UNUSED(dst); } @@ -11080,39 +11264,41 @@ static void ggml_compute_forward_reshape( static void ggml_compute_forward_view( const struct ggml_compute_params * params, - const struct ggml_tensor * src0) { + const struct ggml_tensor * dst) { // NOP UNUSED(params); - UNUSED(src0); + UNUSED(dst); } // ggml_compute_forward_permute static void ggml_compute_forward_permute( const struct ggml_compute_params * params, - const struct ggml_tensor * src0) { + const struct ggml_tensor * dst) { // NOP UNUSED(params); - UNUSED(src0); + UNUSED(dst); } // ggml_compute_forward_transpose static void ggml_compute_forward_transpose( const struct ggml_compute_params * params, - const struct ggml_tensor * src0) { + const struct ggml_tensor * dst) { // NOP UNUSED(params); - UNUSED(src0); + UNUSED(dst); } // ggml_compute_forward_get_rows static void ggml_compute_forward_get_rows_q( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + assert(params->ith == 0); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -11148,9 +11334,11 @@ static void ggml_compute_forward_get_rows_q( static void ggml_compute_forward_get_rows_f16( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + assert(params->ith == 0); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -11183,9 +11371,11 @@ static void ggml_compute_forward_get_rows_f16( static void ggml_compute_forward_get_rows_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + assert(params->ith == 0); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -11218,9 +11408,10 @@ static void ggml_compute_forward_get_rows_f32( static void ggml_compute_forward_get_rows( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: @@ -11237,17 +11428,18 @@ static void ggml_compute_forward_get_rows( case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ4_NL: { - ggml_compute_forward_get_rows_q(params, src0, src1, dst); + ggml_compute_forward_get_rows_q(params, dst); } break; case GGML_TYPE_F16: { - ggml_compute_forward_get_rows_f16(params, src0, src1, dst); + ggml_compute_forward_get_rows_f16(params, dst); } break; case GGML_TYPE_F32: case GGML_TYPE_I32: { - ggml_compute_forward_get_rows_f32(params, src0, src1, dst); + ggml_compute_forward_get_rows_f32(params, dst); } break; default: { @@ -11278,9 +11470,11 @@ static void ggml_compute_forward_get_rows( static void ggml_compute_forward_get_rows_back_f32_f16( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(params->ith == 0); GGML_ASSERT(ggml_is_contiguous(dst)); @@ -11315,9 +11509,11 @@ static void ggml_compute_forward_get_rows_back_f32_f16( static void ggml_compute_forward_get_rows_back_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(params->ith == 0); GGML_ASSERT(ggml_is_contiguous(dst)); @@ -11352,17 +11548,18 @@ static void ggml_compute_forward_get_rows_back_f32( static void ggml_compute_forward_get_rows_back( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_get_rows_back_f32_f16(params, src0, src1, dst); + ggml_compute_forward_get_rows_back_f32_f16(params, dst); } break; case GGML_TYPE_F32: { - ggml_compute_forward_get_rows_back_f32(params, src0, src1, dst); + ggml_compute_forward_get_rows_back_f32(params, dst); } break; default: { @@ -11393,8 +11590,10 @@ static void ggml_compute_forward_get_rows_back( static void ggml_compute_forward_diag_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(params->ith == 0); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -11433,12 +11632,14 @@ static void ggml_compute_forward_diag_f32( static void ggml_compute_forward_diag( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_diag_f32(params, src0, dst); + ggml_compute_forward_diag_f32(params, dst); } break; default: { @@ -11451,10 +11652,11 @@ static void ggml_compute_forward_diag( static void ggml_compute_forward_diag_mask_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst, const float value) { + const struct ggml_tensor * src0 = dst->src[0]; + const int ith = params->ith; const int nth = params->nth; @@ -11504,12 +11706,14 @@ static void ggml_compute_forward_diag_mask_f32( static void ggml_compute_forward_diag_mask_inf( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_diag_mask_f32(params, src0, dst, -INFINITY); + ggml_compute_forward_diag_mask_f32(params, dst, -INFINITY); } break; default: { @@ -11520,12 +11724,14 @@ static void ggml_compute_forward_diag_mask_inf( static void ggml_compute_forward_diag_mask_zero( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_diag_mask_f32(params, src0, dst, 0); + ggml_compute_forward_diag_mask_f32(params, dst, 0); } break; default: { @@ -11538,10 +11744,12 @@ static void ggml_compute_forward_diag_mask_zero( static void ggml_compute_forward_soft_max_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - const struct ggml_tensor * src2, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + const struct ggml_tensor * src2 = dst->src[2]; + assert(ggml_is_contiguous(dst)); assert(ggml_are_same_shape(src0, dst)); @@ -11652,14 +11860,14 @@ static void ggml_compute_forward_soft_max_f32( static void ggml_compute_forward_soft_max( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - const struct ggml_tensor * src2, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_soft_max_f32(params, src0, src1, src2, dst); + ggml_compute_forward_soft_max_f32(params, dst); } break; default: { @@ -11672,9 +11880,11 @@ static void ggml_compute_forward_soft_max( static void ggml_compute_forward_soft_max_back_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(ggml_is_contiguous(src0)); GGML_ASSERT(ggml_is_contiguous(src1)); GGML_ASSERT(ggml_is_contiguous(dst)); @@ -11749,13 +11959,14 @@ static void ggml_compute_forward_soft_max_back_f32( static void ggml_compute_forward_soft_max_back( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_soft_max_back_f32(params, src0, src1, dst); + ggml_compute_forward_soft_max_back_f32(params, dst); } break; default: { @@ -11768,8 +11979,10 @@ static void ggml_compute_forward_soft_max_back( static void ggml_compute_forward_alibi_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -11825,8 +12038,10 @@ static void ggml_compute_forward_alibi_f32( static void ggml_compute_forward_alibi_f16( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -11885,16 +12100,18 @@ static void ggml_compute_forward_alibi_f16( static void ggml_compute_forward_alibi( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_alibi_f16(params, src0, dst); + ggml_compute_forward_alibi_f16(params, dst); } break; case GGML_TYPE_F32: { - ggml_compute_forward_alibi_f32(params, src0, dst); + ggml_compute_forward_alibi_f32(params, dst); } break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: @@ -11911,6 +12128,7 @@ static void ggml_compute_forward_alibi( case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ4_NL: case GGML_TYPE_Q8_K: case GGML_TYPE_I8: case GGML_TYPE_I16: @@ -11926,8 +12144,10 @@ static void ggml_compute_forward_alibi( static void ggml_compute_forward_clamp_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -11966,12 +12186,14 @@ static void ggml_compute_forward_clamp_f32( static void ggml_compute_forward_clamp( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_clamp_f32(params, src0, dst); + ggml_compute_forward_clamp_f32(params, dst); } break; case GGML_TYPE_F16: case GGML_TYPE_Q4_0: @@ -11989,6 +12211,7 @@ static void ggml_compute_forward_clamp( case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ4_NL: case GGML_TYPE_Q8_K: case GGML_TYPE_I8: case GGML_TYPE_I16: @@ -12060,10 +12283,12 @@ GGML_CALL void ggml_rope_yarn_corr_dims( static void ggml_compute_forward_rope_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst, const bool forward) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -12236,10 +12461,12 @@ static void ggml_compute_forward_rope_f32( static void ggml_compute_forward_rope_f16( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst, const bool forward) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -12401,17 +12628,18 @@ static void ggml_compute_forward_rope_f16( static void ggml_compute_forward_rope( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_rope_f16(params, src0, src1, dst, true); + ggml_compute_forward_rope_f16(params, dst, true); } break; case GGML_TYPE_F32: { - ggml_compute_forward_rope_f32(params, src0, src1, dst, true); + ggml_compute_forward_rope_f32(params, dst, true); } break; default: { @@ -12424,17 +12652,18 @@ static void ggml_compute_forward_rope( static void ggml_compute_forward_rope_back( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_rope_f16(params, src0, src1, dst, false); + ggml_compute_forward_rope_f16(params, dst, false); } break; case GGML_TYPE_F32: { - ggml_compute_forward_rope_f32(params, src0, src1, dst, false); + ggml_compute_forward_rope_f32(params, dst, false); } break; default: { @@ -12447,9 +12676,11 @@ static void ggml_compute_forward_rope_back( static void ggml_compute_forward_conv_transpose_1d_f16_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); @@ -12544,9 +12775,11 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32( static void ggml_compute_forward_conv_transpose_1d_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); @@ -12641,17 +12874,18 @@ static void ggml_compute_forward_conv_transpose_1d_f32( static void ggml_compute_forward_conv_transpose_1d( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_conv_transpose_1d_f16_f32(params, src0, src1, dst); + ggml_compute_forward_conv_transpose_1d_f16_f32(params, dst); } break; case GGML_TYPE_F32: { - ggml_compute_forward_conv_transpose_1d_f32(params, src0, src1, dst); + ggml_compute_forward_conv_transpose_1d_f32(params, dst); } break; default: { @@ -12665,9 +12899,11 @@ static void ggml_compute_forward_conv_transpose_1d( // dst: result [N, OH, OW, IC*KH*KW] static void ggml_compute_forward_im2col_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); @@ -12751,9 +12987,11 @@ static void ggml_compute_forward_im2col_f32( // dst: result [N, OH, OW, IC*KH*KW] static void ggml_compute_forward_im2col_f16( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F16); @@ -12833,17 +13071,15 @@ static void ggml_compute_forward_im2col_f16( static void ggml_compute_forward_im2col( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { switch (dst->type) { case GGML_TYPE_F16: { - ggml_compute_forward_im2col_f16(params, src0, src1, dst); + ggml_compute_forward_im2col_f16(params, dst); } break; case GGML_TYPE_F32: { - ggml_compute_forward_im2col_f32(params, src0, src1, dst); + ggml_compute_forward_im2col_f32(params, dst); } break; default: { @@ -12857,9 +13093,11 @@ static void ggml_compute_forward_im2col( static void ggml_compute_forward_conv_transpose_2d( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); @@ -12963,9 +13201,11 @@ static void ggml_compute_forward_conv_transpose_2d( static void ggml_compute_forward_pool_1d_sk_p0( const struct ggml_compute_params * params, const enum ggml_op_pool op, - const struct ggml_tensor * src, const int k, struct ggml_tensor * dst) { + + const struct ggml_tensor * src = dst->src[0]; + assert(src->type == GGML_TYPE_F32); assert(params->ith == 0); @@ -13014,7 +13254,6 @@ static void ggml_compute_forward_pool_1d_sk_p0( static void ggml_compute_forward_pool_1d( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { const int32_t * opts = (const int32_t *)dst->op_params; @@ -13025,15 +13264,17 @@ static void ggml_compute_forward_pool_1d( GGML_ASSERT(p0 == 0); // padding not supported GGML_ASSERT(k0 == s0); // only s = k supported - ggml_compute_forward_pool_1d_sk_p0(params, op, src0, k0, dst); + ggml_compute_forward_pool_1d_sk_p0(params, op, k0, dst); } // ggml_compute_forward_pool_2d static void ggml_compute_forward_pool_2d( const struct ggml_compute_params * params, - const struct ggml_tensor * src, struct ggml_tensor * dst) { + + const struct ggml_tensor * src = dst->src[0]; + GGML_ASSERT(src->type == GGML_TYPE_F32); GGML_ASSERT(params->ith == 0); @@ -13106,9 +13347,10 @@ static void ggml_compute_forward_pool_2d( static void ggml_compute_forward_upscale_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -13145,12 +13387,14 @@ static void ggml_compute_forward_upscale_f32( static void ggml_compute_forward_upscale( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_upscale_f32(params, src0, dst); + ggml_compute_forward_upscale_f32(params, dst); } break; default: { @@ -13163,9 +13407,10 @@ static void ggml_compute_forward_upscale( static void ggml_compute_forward_pad_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -13203,12 +13448,14 @@ static void ggml_compute_forward_pad_f32( static void ggml_compute_forward_pad( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_pad_f32(params, src0, dst); + ggml_compute_forward_pad_f32(params, dst); } break; default: { @@ -13221,9 +13468,10 @@ static void ggml_compute_forward_pad( static void ggml_compute_forward_argsort_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -13263,13 +13511,14 @@ static void ggml_compute_forward_argsort_f32( static void ggml_compute_forward_argsort( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_argsort_f32(params, src0, dst); + ggml_compute_forward_argsort_f32(params, dst); } break; default: { @@ -13282,11 +13531,13 @@ static void ggml_compute_forward_argsort( static void ggml_compute_forward_flash_attn_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * q, - const struct ggml_tensor * k, - const struct ggml_tensor * v, const bool masked, struct ggml_tensor * dst) { + + const struct ggml_tensor * q = dst->src[0]; + const struct ggml_tensor * k = dst->src[1]; + const struct ggml_tensor * v = dst->src[2]; + int64_t t0 = ggml_perf_time_us(); UNUSED(t0); @@ -13472,11 +13723,13 @@ static void ggml_compute_forward_flash_attn_f32( static void ggml_compute_forward_flash_attn_f16( const struct ggml_compute_params * params, - const struct ggml_tensor * q, - const struct ggml_tensor * k, - const struct ggml_tensor * v, const bool masked, struct ggml_tensor * dst) { + + const struct ggml_tensor * q = dst->src[0]; + const struct ggml_tensor * k = dst->src[1]; + const struct ggml_tensor * v = dst->src[2]; + int64_t t0 = ggml_perf_time_us(); UNUSED(t0); @@ -13698,19 +13951,19 @@ static void ggml_compute_forward_flash_attn_f16( static void ggml_compute_forward_flash_attn( const struct ggml_compute_params * params, - const struct ggml_tensor * q, - const struct ggml_tensor * k, - const struct ggml_tensor * v, const bool masked, struct ggml_tensor * dst) { + + const struct ggml_tensor * q = dst->src[0]; + switch (q->type) { case GGML_TYPE_F16: { - ggml_compute_forward_flash_attn_f16(params, q, k, v, masked, dst); + ggml_compute_forward_flash_attn_f16(params, masked, dst); } break; case GGML_TYPE_F32: { - ggml_compute_forward_flash_attn_f32(params, q, k, v, masked, dst); + ggml_compute_forward_flash_attn_f32(params, masked, dst); } break; default: { @@ -13723,12 +13976,14 @@ static void ggml_compute_forward_flash_attn( static void ggml_compute_forward_flash_ff_f16( const struct ggml_compute_params * params, - const struct ggml_tensor * a, // F16 - const struct ggml_tensor * b0, // F16 fc_w - const struct ggml_tensor * b1, // F32 fc_b - const struct ggml_tensor * c0, // F16 proj_w - const struct ggml_tensor * c1, // F32 proj_b struct ggml_tensor * dst) { + + const struct ggml_tensor * a = dst->src[0]; // F16 + const struct ggml_tensor * b0 = dst->src[1]; // F16 fc_w + const struct ggml_tensor * b1 = dst->src[2]; // F32 fc_b + const struct ggml_tensor * c0 = dst->src[3]; // F16 proj_w + const struct ggml_tensor * c1 = dst->src[4]; // F32 proj_b + int64_t t0 = ggml_perf_time_us(); UNUSED(t0); @@ -13856,16 +14111,14 @@ static void ggml_compute_forward_flash_ff_f16( static void ggml_compute_forward_flash_ff( const struct ggml_compute_params * params, - const struct ggml_tensor * a, - const struct ggml_tensor * b0, - const struct ggml_tensor * b1, - const struct ggml_tensor * c0, - const struct ggml_tensor * c1, struct ggml_tensor * dst) { + + const struct ggml_tensor * b0 = dst->src[1]; + switch (b0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_flash_ff_f16(params, a, b0, b1, c0, c1, dst); + ggml_compute_forward_flash_ff_f16(params, dst); } break; case GGML_TYPE_F32: { @@ -13882,12 +14135,14 @@ static void ggml_compute_forward_flash_ff( static void ggml_compute_forward_flash_attn_back_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * q, - const struct ggml_tensor * k, - const struct ggml_tensor * v, - const struct ggml_tensor * d, const bool masked, struct ggml_tensor * dst) { + + const struct ggml_tensor * q = dst->src[0]; + const struct ggml_tensor * k = dst->src[1]; + const struct ggml_tensor * v = dst->src[2]; + const struct ggml_tensor * d = dst->src[3]; + int64_t t0 = ggml_perf_time_us(); UNUSED(t0); @@ -14235,16 +14490,15 @@ static void ggml_compute_forward_flash_attn_back_f32( static void ggml_compute_forward_flash_attn_back( const struct ggml_compute_params * params, - const struct ggml_tensor * q, - const struct ggml_tensor * k, - const struct ggml_tensor * v, - const struct ggml_tensor * d, const bool masked, struct ggml_tensor * dst) { + + const struct ggml_tensor * q = dst->src[0]; + switch (q->type) { case GGML_TYPE_F32: { - ggml_compute_forward_flash_attn_back_f32(params, q, k, v, d, masked, dst); + ggml_compute_forward_flash_attn_back_f32(params, masked, dst); } break; default: { @@ -14257,8 +14511,10 @@ static void ggml_compute_forward_flash_attn_back( static void ggml_compute_forward_win_part_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -14301,12 +14557,14 @@ static void ggml_compute_forward_win_part_f32( static void ggml_compute_forward_win_part( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_win_part_f32(params, src0, dst); + ggml_compute_forward_win_part_f32(params, dst); } break; default: { @@ -14319,8 +14577,10 @@ static void ggml_compute_forward_win_part( static void ggml_compute_forward_win_unpart_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -14361,12 +14621,14 @@ static void ggml_compute_forward_win_unpart_f32( static void ggml_compute_forward_win_unpart( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_win_unpart_f32(params, src0, dst); + ggml_compute_forward_win_unpart_f32(params, dst); } break; default: { @@ -14379,58 +14641,58 @@ static void ggml_compute_forward_win_unpart( static void ggml_compute_forward_unary( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + const enum ggml_unary_op op = ggml_get_unary_op(dst); switch (op) { case GGML_UNARY_OP_ABS: { - ggml_compute_forward_abs(params, src0, dst); + ggml_compute_forward_abs(params, dst); } break; case GGML_UNARY_OP_SGN: { - ggml_compute_forward_sgn(params, src0, dst); + ggml_compute_forward_sgn(params, dst); } break; case GGML_UNARY_OP_NEG: { - ggml_compute_forward_neg(params, src0, dst); + ggml_compute_forward_neg(params, dst); } break; case GGML_UNARY_OP_STEP: { - ggml_compute_forward_step(params, src0, dst); + ggml_compute_forward_step(params, dst); } break; case GGML_UNARY_OP_TANH: { - ggml_compute_forward_tanh(params, src0, dst); + ggml_compute_forward_tanh(params, dst); } break; case GGML_UNARY_OP_ELU: { - ggml_compute_forward_elu(params, src0, dst); + ggml_compute_forward_elu(params, dst); } break; case GGML_UNARY_OP_RELU: { - ggml_compute_forward_relu(params, src0, dst); + ggml_compute_forward_relu(params, dst); } break; case GGML_UNARY_OP_GELU: { - ggml_compute_forward_gelu(params, src0, dst); + ggml_compute_forward_gelu(params, dst); } break; case GGML_UNARY_OP_GELU_QUICK: { - ggml_compute_forward_gelu_quick(params, src0, dst); + ggml_compute_forward_gelu_quick(params, dst); } break; case GGML_UNARY_OP_SILU: { - ggml_compute_forward_silu(params, src0, dst); + ggml_compute_forward_silu(params, dst); } break; case GGML_UNARY_OP_HARDSWISH: { - ggml_compute_forward_hardswish(params, src0, dst); + ggml_compute_forward_hardswish(params, dst); } break; case GGML_UNARY_OP_HARDSIGMOID: { - ggml_compute_forward_hardsigmoid(params, src0, dst); + ggml_compute_forward_hardsigmoid(params, dst); } break; default: { @@ -14443,8 +14705,10 @@ static void ggml_compute_forward_unary( static void ggml_compute_forward_get_rel_pos_f16( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -14470,12 +14734,14 @@ static void ggml_compute_forward_get_rel_pos_f16( static void ggml_compute_forward_get_rel_pos( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_get_rel_pos_f16(params, src0, dst); + ggml_compute_forward_get_rel_pos_f16(params, dst); } break; default: { @@ -14488,11 +14754,12 @@ static void ggml_compute_forward_get_rel_pos( static void ggml_compute_forward_add_rel_pos_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - const struct ggml_tensor * src2, struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + const struct ggml_tensor * src2 = dst->src[2]; + const bool inplace = (bool) ((int32_t *) dst->op_params)[0]; if (!inplace && params->type == GGML_TASK_INIT) { if (params->ith != 0) { @@ -14556,14 +14823,14 @@ static void ggml_compute_forward_add_rel_pos_f32( static void ggml_compute_forward_add_rel_pos( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - const struct ggml_tensor * src2, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_add_rel_pos_f32(params, src0, src1, src2, dst); + ggml_compute_forward_add_rel_pos_f32(params, dst); } break; default: { @@ -14576,9 +14843,11 @@ static void ggml_compute_forward_add_rel_pos( static void ggml_compute_forward_map_unary_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst, const ggml_unary_op_f32_t fun) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(ggml_are_same_shape(src0, dst)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -14600,13 +14869,15 @@ static void ggml_compute_forward_map_unary_f32( static void ggml_compute_forward_map_unary( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst, const ggml_unary_op_f32_t fun) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_map_unary_f32(params, src0, dst, fun); + ggml_compute_forward_map_unary_f32(params, dst, fun); } break; default: { @@ -14619,10 +14890,12 @@ static void ggml_compute_forward_map_unary( static void ggml_compute_forward_map_binary_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst, const ggml_binary_op_f32_t fun) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + assert(params->ith == 0); assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); @@ -14647,14 +14920,15 @@ static void ggml_compute_forward_map_binary_f32( static void ggml_compute_forward_map_binary( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst, const ggml_binary_op_f32_t fun) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_map_binary_f32(params, src0, src1, dst, fun); + ggml_compute_forward_map_binary_f32(params, dst, fun); } break; default: { @@ -14667,9 +14941,11 @@ static void ggml_compute_forward_map_binary( static void ggml_compute_forward_map_custom1_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * a, struct ggml_tensor * dst, const ggml_custom1_op_f32_t fun) { + + const struct ggml_tensor * a = dst->src[0]; + assert(params->ith == 0); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -14683,10 +14959,12 @@ static void ggml_compute_forward_map_custom1_f32( static void ggml_compute_forward_map_custom2_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * a, - const struct ggml_tensor * b, struct ggml_tensor * dst, const ggml_custom2_op_f32_t fun) { + + const struct ggml_tensor * a = dst->src[0]; + const struct ggml_tensor * b = dst->src[1]; + assert(params->ith == 0); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -14700,11 +14978,13 @@ static void ggml_compute_forward_map_custom2_f32( static void ggml_compute_forward_map_custom3_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * a, - const struct ggml_tensor * b, - const struct ggml_tensor * c, struct ggml_tensor * dst, const ggml_custom3_op_f32_t fun) { + + const struct ggml_tensor * a = dst->src[0]; + const struct ggml_tensor * b = dst->src[1]; + const struct ggml_tensor * c = dst->src[1]; + assert(params->ith == 0); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -14718,8 +14998,10 @@ static void ggml_compute_forward_map_custom3_f32( static void ggml_compute_forward_map_custom1( const struct ggml_compute_params * params, - const struct ggml_tensor * a, struct ggml_tensor * dst) { + + const struct ggml_tensor * a = dst->src[0]; + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -14733,9 +15015,11 @@ static void ggml_compute_forward_map_custom1( static void ggml_compute_forward_map_custom2( const struct ggml_compute_params * params, - const struct ggml_tensor * a, - const struct ggml_tensor * b, struct ggml_tensor * dst) { + + const struct ggml_tensor * a = dst->src[0]; + const struct ggml_tensor * b = dst->src[1]; + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -14749,10 +15033,12 @@ static void ggml_compute_forward_map_custom2( static void ggml_compute_forward_map_custom3( const struct ggml_compute_params * params, - const struct ggml_tensor * a, - const struct ggml_tensor * b, - const struct ggml_tensor * c, struct ggml_tensor * dst) { + + const struct ggml_tensor * a = dst->src[0]; + const struct ggml_tensor * b = dst->src[1]; + const struct ggml_tensor * c = dst->src[2]; + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -14766,9 +15052,11 @@ static void ggml_compute_forward_map_custom3( static void ggml_compute_forward_cross_entropy_loss_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(ggml_is_contiguous(src0)); GGML_ASSERT(ggml_is_contiguous(src1)); GGML_ASSERT(ggml_is_scalar(dst)); @@ -14872,13 +15160,14 @@ static void ggml_compute_forward_cross_entropy_loss_f32( static void ggml_compute_forward_cross_entropy_loss( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_cross_entropy_loss_f32(params, src0, src1, dst); + ggml_compute_forward_cross_entropy_loss_f32(params, dst); } break; default: { @@ -14891,10 +15180,12 @@ static void ggml_compute_forward_cross_entropy_loss( static void ggml_compute_forward_cross_entropy_loss_back_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - const struct ggml_tensor * opt0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + const struct ggml_tensor * opt0 = dst->src[2]; + GGML_ASSERT(ggml_is_contiguous(dst)); GGML_ASSERT(ggml_is_contiguous(src0)); GGML_ASSERT(ggml_is_contiguous(src1)); @@ -14981,14 +15272,14 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( static void ggml_compute_forward_cross_entropy_loss_back( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - const struct ggml_tensor * opt0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_cross_entropy_loss_back_f32(params, src0, src1, opt0, dst); + ggml_compute_forward_cross_entropy_loss_back_f32(params, dst); } break; default: { @@ -15036,312 +15327,312 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm switch (tensor->op) { case GGML_OP_DUP: { - ggml_compute_forward_dup(params, tensor->src[0], tensor); + ggml_compute_forward_dup(params, tensor); } break; case GGML_OP_ADD: { - ggml_compute_forward_add(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_add(params, tensor); } break; case GGML_OP_ADD1: { - ggml_compute_forward_add1(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_add1(params, tensor); } break; case GGML_OP_ACC: { - ggml_compute_forward_acc(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_acc(params, tensor); } break; case GGML_OP_SUB: { - ggml_compute_forward_sub(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_sub(params, tensor); } break; case GGML_OP_MUL: { - ggml_compute_forward_mul(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_mul(params, tensor); } break; case GGML_OP_DIV: { - ggml_compute_forward_div(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_div(params, tensor); } break; case GGML_OP_SQR: { - ggml_compute_forward_sqr(params, tensor->src[0], tensor); + ggml_compute_forward_sqr(params, tensor); } break; case GGML_OP_SQRT: { - ggml_compute_forward_sqrt(params, tensor->src[0], tensor); + ggml_compute_forward_sqrt(params, tensor); } break; case GGML_OP_LOG: { - ggml_compute_forward_log(params, tensor->src[0], tensor); + ggml_compute_forward_log(params, tensor); } break; case GGML_OP_SUM: { - ggml_compute_forward_sum(params, tensor->src[0], tensor); + ggml_compute_forward_sum(params, tensor); } break; case GGML_OP_SUM_ROWS: { - ggml_compute_forward_sum_rows(params, tensor->src[0], tensor); + ggml_compute_forward_sum_rows(params, tensor); } break; case GGML_OP_MEAN: { - ggml_compute_forward_mean(params, tensor->src[0], tensor); + ggml_compute_forward_mean(params, tensor); } break; case GGML_OP_ARGMAX: { - ggml_compute_forward_argmax(params, tensor->src[0], tensor); + ggml_compute_forward_argmax(params, tensor); } break; case GGML_OP_REPEAT: { - ggml_compute_forward_repeat(params, tensor->src[0], tensor); + ggml_compute_forward_repeat(params, tensor); } break; case GGML_OP_REPEAT_BACK: { - ggml_compute_forward_repeat_back(params, tensor->src[0], tensor); + ggml_compute_forward_repeat_back(params, tensor); } break; case GGML_OP_CONCAT: { - ggml_compute_forward_concat(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_concat(params, tensor); } break; case GGML_OP_SILU_BACK: { - ggml_compute_forward_silu_back(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_silu_back(params, tensor); } break; case GGML_OP_NORM: { - ggml_compute_forward_norm(params, tensor->src[0], tensor); + ggml_compute_forward_norm(params, tensor); } break; case GGML_OP_RMS_NORM: { - ggml_compute_forward_rms_norm(params, tensor->src[0], tensor); + ggml_compute_forward_rms_norm(params, tensor); } break; case GGML_OP_RMS_NORM_BACK: { - ggml_compute_forward_rms_norm_back(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_rms_norm_back(params, tensor); } break; case GGML_OP_GROUP_NORM: { - ggml_compute_forward_group_norm(params, tensor->src[0], tensor); + ggml_compute_forward_group_norm(params, tensor); } break; case GGML_OP_MUL_MAT: { - ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_mul_mat(params, tensor); } break; case GGML_OP_MUL_MAT_ID: { - ggml_compute_forward_mul_mat_id(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_mul_mat_id(params, tensor); } break; case GGML_OP_OUT_PROD: { - ggml_compute_forward_out_prod(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_out_prod(params, tensor); } break; case GGML_OP_SCALE: { - ggml_compute_forward_scale(params, tensor->src[0], tensor); + ggml_compute_forward_scale(params, tensor); } break; case GGML_OP_SET: { - ggml_compute_forward_set(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_set(params, tensor); } break; case GGML_OP_CPY: { - ggml_compute_forward_cpy(params, tensor->src[0], tensor); + ggml_compute_forward_cpy(params, tensor); } break; case GGML_OP_CONT: { - ggml_compute_forward_cont(params, tensor->src[0], tensor); + ggml_compute_forward_cont(params, tensor); } break; case GGML_OP_RESHAPE: { - ggml_compute_forward_reshape(params, tensor->src[0], tensor); + ggml_compute_forward_reshape(params, tensor); } break; case GGML_OP_VIEW: { - ggml_compute_forward_view(params, tensor->src[0]); + ggml_compute_forward_view(params, tensor); } break; case GGML_OP_PERMUTE: { - ggml_compute_forward_permute(params, tensor->src[0]); + ggml_compute_forward_permute(params, tensor); } break; case GGML_OP_TRANSPOSE: { - ggml_compute_forward_transpose(params, tensor->src[0]); + ggml_compute_forward_transpose(params, tensor); } break; case GGML_OP_GET_ROWS: { - ggml_compute_forward_get_rows(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_get_rows(params, tensor); } break; case GGML_OP_GET_ROWS_BACK: { - ggml_compute_forward_get_rows_back(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_get_rows_back(params, tensor); } break; case GGML_OP_DIAG: { - ggml_compute_forward_diag(params, tensor->src[0], tensor); + ggml_compute_forward_diag(params, tensor); } break; case GGML_OP_DIAG_MASK_INF: { - ggml_compute_forward_diag_mask_inf(params, tensor->src[0], tensor); + ggml_compute_forward_diag_mask_inf(params, tensor); } break; case GGML_OP_DIAG_MASK_ZERO: { - ggml_compute_forward_diag_mask_zero(params, tensor->src[0], tensor); + ggml_compute_forward_diag_mask_zero(params, tensor); } break; case GGML_OP_SOFT_MAX: { - ggml_compute_forward_soft_max(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); + ggml_compute_forward_soft_max(params, tensor); } break; case GGML_OP_SOFT_MAX_BACK: { - ggml_compute_forward_soft_max_back(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_soft_max_back(params, tensor); } break; case GGML_OP_ROPE: { - ggml_compute_forward_rope(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_rope(params, tensor); } break; case GGML_OP_ROPE_BACK: { - ggml_compute_forward_rope_back(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_rope_back(params, tensor); } break; case GGML_OP_ALIBI: { - ggml_compute_forward_alibi(params, tensor->src[0], tensor); + ggml_compute_forward_alibi(params, tensor); } break; case GGML_OP_CLAMP: { - ggml_compute_forward_clamp(params, tensor->src[0], tensor); + ggml_compute_forward_clamp(params, tensor); } break; case GGML_OP_CONV_TRANSPOSE_1D: { - ggml_compute_forward_conv_transpose_1d(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_conv_transpose_1d(params, tensor); } break; case GGML_OP_IM2COL: { - ggml_compute_forward_im2col(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_im2col(params, tensor); } break; case GGML_OP_CONV_TRANSPOSE_2D: { - ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_conv_transpose_2d(params, tensor); } break; case GGML_OP_POOL_1D: { - ggml_compute_forward_pool_1d(params, tensor->src[0], tensor); + ggml_compute_forward_pool_1d(params, tensor); } break; case GGML_OP_POOL_2D: { - ggml_compute_forward_pool_2d(params, tensor->src[0], tensor); + ggml_compute_forward_pool_2d(params, tensor); } break; case GGML_OP_UPSCALE: { - ggml_compute_forward_upscale(params, tensor->src[0], tensor); + ggml_compute_forward_upscale(params, tensor); } break; case GGML_OP_PAD: { - ggml_compute_forward_pad(params, tensor->src[0], tensor); + ggml_compute_forward_pad(params, tensor); } break; case GGML_OP_ARGSORT: { - ggml_compute_forward_argsort(params, tensor->src[0], tensor); + ggml_compute_forward_argsort(params, tensor); } break; case GGML_OP_LEAKY_RELU: { - ggml_compute_forward_leaky_relu(params, tensor->src[0], tensor); + ggml_compute_forward_leaky_relu(params, tensor); } break; case GGML_OP_FLASH_ATTN: { const int32_t t = ggml_get_op_params_i32(tensor, 0); GGML_ASSERT(t == 0 || t == 1); const bool masked = t != 0; - ggml_compute_forward_flash_attn(params, tensor->src[0], tensor->src[1], tensor->src[2], masked, tensor); + ggml_compute_forward_flash_attn(params, masked, tensor); } break; case GGML_OP_FLASH_FF: { - ggml_compute_forward_flash_ff(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor->src[4], tensor); + ggml_compute_forward_flash_ff(params, tensor); } break; case GGML_OP_FLASH_ATTN_BACK: { int32_t t = ggml_get_op_params_i32(tensor, 0); GGML_ASSERT(t == 0 || t == 1); bool masked = t != 0; - ggml_compute_forward_flash_attn_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], masked, tensor); + ggml_compute_forward_flash_attn_back(params, masked, tensor); } break; case GGML_OP_WIN_PART: { - ggml_compute_forward_win_part(params, tensor->src[0], tensor); + ggml_compute_forward_win_part(params, tensor); } break; case GGML_OP_WIN_UNPART: { - ggml_compute_forward_win_unpart(params, tensor->src[0], tensor); + ggml_compute_forward_win_unpart(params, tensor); } break; case GGML_OP_UNARY: { - ggml_compute_forward_unary(params, tensor->src[0], tensor); + ggml_compute_forward_unary(params, tensor); } break; case GGML_OP_GET_REL_POS: { - ggml_compute_forward_get_rel_pos(params, tensor->src[0], tensor); + ggml_compute_forward_get_rel_pos(params, tensor); } break; case GGML_OP_ADD_REL_POS: { - ggml_compute_forward_add_rel_pos(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); + ggml_compute_forward_add_rel_pos(params, tensor); } break; case GGML_OP_MAP_UNARY: { ggml_unary_op_f32_t fun; memcpy(&fun, tensor->op_params, sizeof(fun)); - ggml_compute_forward_map_unary(params, tensor->src[0], tensor, fun); + ggml_compute_forward_map_unary(params, tensor, fun); } break; case GGML_OP_MAP_BINARY: { ggml_binary_op_f32_t fun; memcpy(&fun, tensor->op_params, sizeof(fun)); - ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun); + ggml_compute_forward_map_binary(params, tensor, fun); } break; case GGML_OP_MAP_CUSTOM1_F32: { ggml_custom1_op_f32_t fun; memcpy(&fun, tensor->op_params, sizeof(fun)); - ggml_compute_forward_map_custom1_f32(params, tensor->src[0], tensor, fun); + ggml_compute_forward_map_custom1_f32(params, tensor, fun); } break; case GGML_OP_MAP_CUSTOM2_F32: { ggml_custom2_op_f32_t fun; memcpy(&fun, tensor->op_params, sizeof(fun)); - ggml_compute_forward_map_custom2_f32(params, tensor->src[0], tensor->src[1], tensor, fun); + ggml_compute_forward_map_custom2_f32(params, tensor, fun); } break; case GGML_OP_MAP_CUSTOM3_F32: { ggml_custom3_op_f32_t fun; memcpy(&fun, tensor->op_params, sizeof(fun)); - ggml_compute_forward_map_custom3_f32(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun); + ggml_compute_forward_map_custom3_f32(params, tensor, fun); } break; case GGML_OP_MAP_CUSTOM1: { - ggml_compute_forward_map_custom1(params, tensor->src[0], tensor); + ggml_compute_forward_map_custom1(params, tensor); } break; case GGML_OP_MAP_CUSTOM2: { - ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_map_custom2(params, tensor); } break; case GGML_OP_MAP_CUSTOM3: { - ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); + ggml_compute_forward_map_custom3(params, tensor); } break; case GGML_OP_CROSS_ENTROPY_LOSS: { - ggml_compute_forward_cross_entropy_loss(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_cross_entropy_loss(params, tensor); } break; case GGML_OP_CROSS_ENTROPY_LOSS_BACK: { - ggml_compute_forward_cross_entropy_loss_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); + ggml_compute_forward_cross_entropy_loss_back(params, tensor); } break; case GGML_OP_NONE: @@ -19455,6 +19746,15 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i result = quantize_iq1_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); GGML_ASSERT(result == row_size * nrows); } break; + case GGML_TYPE_IQ4_NL: + { + GGML_ASSERT(start % QK4_NL == 0); + GGML_ASSERT(start % n_per_row == 0); + size_t start_row = start / n_per_row; + size_t row_size = ggml_row_size(type, n_per_row); + result = quantize_iq4_nl(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); + GGML_ASSERT(result == row_size * nrows); + } break; case GGML_TYPE_F16: { size_t elemsize = sizeof(ggml_fp16_t); diff --git a/ggml.h b/ggml.h index 004d09c700f32..bed7a36a0ee6a 100644 --- a/ggml.h +++ b/ggml.h @@ -355,6 +355,7 @@ extern "C" { GGML_TYPE_IQ2_XS = 17, GGML_TYPE_IQ3_XXS = 18, GGML_TYPE_IQ1_S = 19, + GGML_TYPE_IQ4_NL = 20, GGML_TYPE_I8, GGML_TYPE_I16, GGML_TYPE_I32, @@ -393,6 +394,7 @@ extern "C" { GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors GGML_FTYPE_MOSTLY_IQ1_S = 18, // except 1d tensors + GGML_FTYPE_MOSTLY_IQ4_NL = 19, // except 1d tensors }; // available tensor operations: diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 114a9a9743081..8f9139d1b7eca 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -111,6 +111,7 @@ class MODEL_ARCH(IntEnum): ORION = auto() INTERNLM2 = auto() MINICPM = auto() + GEMMA = auto() class MODEL_TENSOR(IntEnum): @@ -167,6 +168,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.ORION: "orion", MODEL_ARCH.INTERNLM2: "internlm2", MODEL_ARCH.MINICPM: "minicpm", + MODEL_ARCH.GEMMA: "gemma", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { @@ -511,6 +513,19 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, ], + MODEL_ARCH.GEMMA: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_NORM, + ], # TODO } diff --git a/llama.cpp b/llama.cpp index 4296eca3261e8..3a226c4260c0b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -208,6 +208,7 @@ enum llm_arch { LLM_ARCH_ORION, LLM_ARCH_INTERNLM2, LLM_ARCH_MINICPM, + LLM_ARCH_GEMMA, LLM_ARCH_UNKNOWN, }; @@ -234,6 +235,7 @@ static std::map LLM_ARCH_NAMES = { { LLM_ARCH_ORION, "orion" }, { LLM_ARCH_INTERNLM2, "internlm2" }, { LLM_ARCH_MINICPM, "minicpm" }, + { LLM_ARCH_GEMMA, "gemma" }, }; enum llm_kv { @@ -760,6 +762,22 @@ static std::map> LLM_TENSOR_NAMES = { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" }, }, }, + { + LLM_ARCH_GEMMA, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -2527,6 +2545,7 @@ struct llama_model_loader { case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break; case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break; case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break; + case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break; default: { LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max)); @@ -2877,6 +2896,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small"; case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw"; case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw"; default: return "unknown, may not work"; } @@ -3241,6 +3261,16 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_GEMMA: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + + switch (hparams.n_layer) { + case 18: model.type = e_model::MODEL_2B; break; + case 28: model.type = e_model::MODEL_7B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; default: (void)0; } @@ -4358,6 +4388,37 @@ static bool llm_load_tensors( layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); } } break; + case LLM_ARCH_GEMMA: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + + const int64_t n_ff = hparams.n_ff; + const int64_t n_embd_head_k = hparams.n_embd_head_k; + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(); + const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + + for (uint32_t i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * hparams.n_head}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * hparams.n_head, n_embd}); + + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + } + } break; default: throw std::runtime_error("unknown architecture"); } @@ -7364,6 +7425,113 @@ struct llm_build_context { return gf; } + + struct ggml_cgraph * build_gemma() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + const int64_t n_embd_head_k = hparams.n_embd_head_k; + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); + cb(inpL, "inp_embd", -1); + inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd)); + cb(inpL, "inp_scaled", -1); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); + cb(inp_pos, "inp_pos", -1); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); + cb(KQ_mask, "KQ_mask", -1); + + // shift the entire K-cache if needed + if (do_rope_shift) { + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); + } + + for (int il = 0; il < n_layer; ++il) { + + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, + n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(Qcur, "Qcur", il); + Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); + cb(Qcur, "Qcur_scaled", il); + + Kcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, + n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(Kcur, "Kcur", il); + + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il); + cb(cur, "kqv_out", il); + } + struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); + cb(sa_out, "sa_out", il); + + cur = llm_build_norm(ctx0, sa_out, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + // feed-forward network + { + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + NULL, + LLM_FFN_GELU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, sa_out); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = ggml_mul_mat(ctx0, model.tok_embd, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } }; static struct ggml_cgraph * llama_build_graph( @@ -7472,6 +7640,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_minicpm(); } break; + case LLM_ARCH_GEMMA: + { + result = llm.build_gemma(); + } break; default: GGML_ASSERT(false); } @@ -10354,6 +10526,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL && qs.model.hparams.n_gqa() >= 4) { + new_type = GGML_TYPE_Q5_K; + } else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K; @@ -10406,6 +10581,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K; } } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL && !qs.has_imatrix) { + if (i_layer < n_layer/8) new_type = GGML_TYPE_Q5_K; + } else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) { new_type = GGML_TYPE_Q5_K; @@ -10422,7 +10600,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty if (arch != LLM_ARCH_FALCON) { if (qs.model.hparams.n_expert == 8) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || - ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || + ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) { new_type = GGML_TYPE_Q5_K; } @@ -10489,8 +10667,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: - case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break; - case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break; + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: new_type = GGML_TYPE_IQ4_NL; break; case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break; case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break; case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break; @@ -10531,7 +10709,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break; case LLAMA_FTYPE_MOSTLY_IQ2_XS: quantized_type = GGML_TYPE_IQ2_XS; break; case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break; - case LLAMA_FTYPE_MOSTLY_IQ1_S: quantized_type = GGML_TYPE_IQ1_S ; break; + case LLAMA_FTYPE_MOSTLY_IQ1_S: quantized_type = GGML_TYPE_IQ1_S; break; + case LLAMA_FTYPE_MOSTLY_IQ4_NL: quantized_type = GGML_TYPE_IQ4_NL; break; default: throw std::runtime_error(format("invalid output file type %d\n", ftype)); } diff --git a/llama.h b/llama.h index 77a84c18a69cf..8ba20696f8af9 100644 --- a/llama.h +++ b/llama.h @@ -101,6 +101,7 @@ extern "C" { LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ1_S = 24, // except 1d tensors + LLAMA_FTYPE_MOSTLY_IQ4_NL = 25, // except 1d tensors LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 733d8f95b93df..bbbf88d9d6ff7 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -818eeb8a3be99125746a90ec63af8f51516a2ec6 +30805514e1bf389a59d30a54a0525cbdc30d5bd1 diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index ef37c5af275fe..55db42bf6e851 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -1918,6 +1918,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op GGML_TYPE_Q6_K, GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, + GGML_TYPE_IQ4_NL, }; // unary ops