diff --git a/android/src/main/java/com/rnllama/LlamaContext.java b/android/src/main/java/com/rnllama/LlamaContext.java index bab6803..4cdb55a 100644 --- a/android/src/main/java/com/rnllama/LlamaContext.java +++ b/android/src/main/java/com/rnllama/LlamaContext.java @@ -52,8 +52,6 @@ public LlamaContext(int id, ReactApplicationContext reactContext, ReadableMap pa params.hasKey("use_mlock") ? params.getBoolean("use_mlock") : true, // boolean use_mmap, params.hasKey("use_mmap") ? params.getBoolean("use_mmap") : true, - // boolean memory_f16, - params.hasKey("memory_f16") ? params.getBoolean("memory_f16") : true, // String lora, params.hasKey("lora") ? params.getString("lora") : "", // float lora_scaled, @@ -285,7 +283,6 @@ protected static native long initContext( int n_gpu_layers, // TODO: Support this boolean use_mlock, boolean use_mmap, - boolean memory_f16, String lora, float lora_scaled, String lora_base, diff --git a/android/src/main/jni.cpp b/android/src/main/jni.cpp index bd06d1f..414ce02 100644 --- a/android/src/main/jni.cpp +++ b/android/src/main/jni.cpp @@ -129,7 +129,6 @@ Java_com_rnllama_LlamaContext_initContext( jint n_gpu_layers, // TODO: Support this jboolean use_mlock, jboolean use_mmap, - jboolean memory_f16, jstring lora_str, jfloat lora_scaled, jstring lora_base_str, @@ -158,8 +157,6 @@ Java_com_rnllama_LlamaContext_initContext( defaultParams.use_mlock = use_mlock; defaultParams.use_mmap = use_mmap; - defaultParams.memory_f16 = memory_f16; - const char *lora_chars = env->GetStringUTFChars(lora_str, nullptr); const char *lora_base_chars = env->GetStringUTFChars(lora_base_str, nullptr); if (lora_chars) { diff --git a/cpp/common.cpp b/cpp/common.cpp index 252af3f..dcde2be 100644 --- a/cpp/common.cpp +++ b/cpp/common.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -277,8 +278,18 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } params.yarn_beta_slow = std::stof(argv[i]); - } else if (arg == "--memory-f32") { - params.memory_f16 = false; + } else if (arg == "--samplers") { + if (++i >= argc) { + invalid_param = true; + break; + } + sparams.samplers_sequence = parse_samplers_input(argv[i]); + } else if (arg == "--sampling-seq") { + if (++i >= argc) { + invalid_param = true; + break; + } + sparams.samplers_sequence = argv[i]; } else if (arg == "--top-p") { if (++i >= argc) { invalid_param = true; @@ -491,8 +502,18 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { params.interactive_first = true; } else if (arg == "-ins" || arg == "--instruct") { params.instruct = true; + } else if (arg == "-cml" || arg == "--chatml") { + params.chatml = true; } else if (arg == "--infill") { params.infill = true; + } else if (arg == "-dkvc" || arg == "--dump-kv-cache") { + params.dump_kv_cache = true; + } else if (arg == "-nkvo" || arg == "--no-kv-offload") { + params.no_kv_offload = true; + } else if (arg == "-ctk" || arg == "--cache-type-k") { + params.cache_type_k = argv[++i]; + } else if (arg == "-ctv" || arg == "--cache-type-v") { + params.cache_type_v = argv[++i]; } else if (arg == "--multiline-input") { params.multiline_input = true; } else if (arg == "--simple-io") { @@ -673,6 +694,47 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { std::istreambuf_iterator(), std::back_inserter(sparams.grammar) ); + } else if (arg == "--override-kv") { + if (++i >= argc) { + invalid_param = true; + break; + } + char * sep = strchr(argv[i], '='); + if (sep == nullptr || sep - argv[i] 
>= 128) { + fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]); + invalid_param = true; + break; + } + struct llama_model_kv_override kvo; + std::strncpy(kvo.key, argv[i], sep - argv[i]); + kvo.key[sep - argv[i]] = 0; + sep++; + if (strncmp(sep, "int:", 4) == 0) { + sep += 4; + kvo.tag = LLAMA_KV_OVERRIDE_INT; + kvo.int_value = std::atol(sep); + } else if (strncmp(sep, "float:", 6) == 0) { + sep += 6; + kvo.tag = LLAMA_KV_OVERRIDE_FLOAT; + kvo.float_value = std::atof(sep); + } else if (strncmp(sep, "bool:", 5) == 0) { + sep += 5; + kvo.tag = LLAMA_KV_OVERRIDE_BOOL; + if (std::strcmp(sep, "true") == 0) { + kvo.bool_value = true; + } else if (std::strcmp(sep, "false") == 0) { + kvo.bool_value = false; + } else { + fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]); + invalid_param = true; + break; + } + } else { + fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]); + invalid_param = true; + break; + } + params.kv_overrides.push_back(kvo); #ifndef LOG_DISABLE_LOGS // Parse args for logging parameters } else if ( log_param_single_parse( argv[i] ) ) { @@ -716,6 +778,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { } } + if (!params.kv_overrides.empty()) { + params.kv_overrides.emplace_back(llama_model_kv_override()); + params.kv_overrides.back().key[0] = 0; + } + return true; } @@ -730,6 +797,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" -i, --interactive run in interactive mode\n"); printf(" --interactive-first run in interactive mode and wait for input right away\n"); printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n"); + printf(" -cml, --chatml run in chatml mode (use with ChatML-compatible models)\n"); printf(" --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n"); printf(" -r PROMPT, --reverse-prompt PROMPT\n"); printf(" halt generation at PROMPT, return control in interactive mode\n"); @@ -755,6 +823,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict); printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx); printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); + printf(" --samplers samplers that will be used for generation in the order, separated by \';\', for example: \"top_k;tfs;typical;top_p;min_p;temp\"\n"); + printf(" --sampling-seq simplified sequence for samplers that will be used (default: %s)\n", sparams.samplers_sequence.c_str()); printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k); printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p); printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p); @@ -792,8 +862,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast); printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); printf(" --no-penalize-nl do not penalize newline token\n"); - printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); - printf(" not recommended: doubles context memory required and no 
measurable increase in quality\n"); printf(" --temp N temperature (default: %.1f)\n", (double)sparams.temp); printf(" --logits-all return logits for all tokens in the batch (default: disabled)\n"); printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n"); @@ -832,6 +900,14 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { #endif // LM_GGML_USE_CUBLAS #endif printf(" --verbose-prompt print prompt before generation\n"); + printf(" -dkvc, --dump-kv-cache\n"); + printf(" verbose print of the KV cache\n"); + printf(" -nkvo, --no-kv-offload\n"); + printf(" disable KV offload\n"); + printf(" -ctk TYPE, --cache-type-k TYPE\n"); + printf(" KV cache data type for K (default: %s)\n", params.cache_type_k.c_str()); + printf(" -ctv TYPE, --cache-type-v TYPE\n"); + printf(" KV cache data type for V (default: %s)\n", params.cache_type_v.c_str()); printf(" --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n"); printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n"); @@ -842,6 +918,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" draft model for speculative decoding (default: %s)\n", params.model.c_str()); printf(" -ld LOGDIR, --logdir LOGDIR\n"); printf(" path under which to save YAML logs (no logging if unset)\n"); + printf(" --override-kv KEY=TYPE:VALUE\n"); + printf(" advanced option to override model metadata by key. may be specified multiple times.\n"); + printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n"); printf("\n"); #ifndef LOG_DISABLE_LOGS log_print_usage(); @@ -878,6 +957,48 @@ std::string gpt_random_prompt(std::mt19937 & rng) { LM_GGML_UNREACHABLE(); } +// +// String parsing +// + +std::string parse_samplers_input(std::string input) { + std::string output = ""; + // since samplers names are written multiple ways + // make it ready for both system names and input names + std::unordered_map samplers_symbols { + {"top_k", 'k'}, + {"top-k", 'k'}, + {"top_p", 'p'}, + {"top-p", 'p'}, + {"nucleus", 'p'}, + {"typical_p", 'y'}, + {"typical-p", 'y'}, + {"typical", 'y'}, + {"min_p", 'm'}, + {"min-p", 'm'}, + {"tfs_z", 'f'}, + {"tfs-z", 'f'}, + {"tfs", 'f'}, + {"temp", 't'}, + {"temperature",'t'} + }; + // expected format example: "temp;top_k;tfs_z;typical_p;top_p;min_p" + size_t separator = input.find(';'); + while (separator != input.npos) { + std::string name = input.substr(0,separator); + input = input.substr(separator+1); + separator = input.find(';'); + + if (samplers_symbols.find(name) != samplers_symbols.end()) { + output += samplers_symbols[name]; + } + } + if (samplers_symbols.find(input) != samplers_symbols.end()) { + output += samplers_symbols[input]; + } + return output; +} + // // Model utils // @@ -892,10 +1013,39 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & mparams.tensor_split = params.tensor_split; mparams.use_mmap = params.use_mmap; mparams.use_mlock = params.use_mlock; + if (params.kv_overrides.empty()) { + mparams.kv_overrides = NULL; + } else { + LM_GGML_ASSERT(params.kv_overrides.back().key[0] == 0 && "KV overrides not terminated with empty key"); + mparams.kv_overrides = params.kv_overrides.data(); + } return mparams; } +static lm_ggml_type kv_cache_type_from_str(const std::string & s) { + if (s == "f16") { + return LM_GGML_TYPE_F16; + } 
+ if (s == "q8_0") { + return LM_GGML_TYPE_Q8_0; + } + if (s == "q4_0") { + return LM_GGML_TYPE_Q4_0; + } + if (s == "q4_1") { + return LM_GGML_TYPE_Q4_1; + } + if (s == "q5_0") { + return LM_GGML_TYPE_Q5_0; + } + if (s == "q5_1") { + return LM_GGML_TYPE_Q5_1; + } + + throw std::runtime_error("Invalid cache type: " + s); +} + struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) { auto cparams = llama_context_default_params(); @@ -905,7 +1055,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; cparams.mul_mat_q = params.mul_mat_q; cparams.seed = params.seed; - cparams.f16_kv = params.memory_f16; cparams.logits_all = params.logits_all; cparams.embedding = params.embedding; cparams.rope_scaling_type = params.rope_scaling_type; @@ -916,6 +1065,10 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param cparams.yarn_beta_fast = params.yarn_beta_fast; cparams.yarn_beta_slow = params.yarn_beta_slow; cparams.yarn_orig_ctx = params.yarn_orig_ctx; + cparams.offload_kqv = !params.no_kv_offload; + + cparams.type_k = kv_cache_type_from_str(params.cache_type_k); + cparams.type_v = kv_cache_type_from_str(params.cache_type_v); return cparams; } @@ -931,7 +1084,7 @@ void llama_batch_add( const std::vector & seq_ids, bool logits) { batch.token [batch.n_tokens] = id; - batch.pos [batch.n_tokens] = pos, + batch.pos [batch.n_tokens] = pos; batch.n_seq_id[batch.n_tokens] = seq_ids.size(); for (size_t i = 0; i < seq_ids.size(); ++i) { batch.seq_id[batch.n_tokens][i] = seq_ids[i]; @@ -1072,6 +1225,12 @@ std::string llama_detokenize_bpe(llama_context * ctx, const std::vector= 0) { seq_count++; } + } + putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]); + } + + printf("\n=== Done dumping\n"); +} + +void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) { + static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; + + printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n", + view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx); + + std::unordered_map seqs; + llama_kv_cache_view_cell * c_curr = view.cells; + llama_seq_id * cs_curr = view.cells_sequences; + + for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) { + for (int j = 0; j < view.n_max_seq; j++) { + if (cs_curr[j] < 0) { continue; } + if (seqs.find(cs_curr[j]) == seqs.end()) { + if (seqs.size() + 1 >= sizeof(slot_chars)) { break; } + seqs[cs_curr[j]] = seqs.size(); + } + } + if (seqs.size() + 1 >= sizeof(slot_chars)) { break; } + } + + printf("=== Sequence legend: "); + for (const auto & it : seqs) { + printf("%zu=%d, ", it.second, it.first); + } + printf("'+'=other sequence ids"); + + c_curr = view.cells; + cs_curr = view.cells_sequences; + for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) { + if (i % row_size == 0) { + printf("\n%5d: ", i); + } + for (int j = 0; j < view.n_max_seq; j++) { + if (cs_curr[j] >= 0) { + const auto & it = seqs.find(cs_curr[j]); + putchar(it != seqs.end() ? 
int(slot_chars[it->second]) : '+'); + } else { + putchar('.'); + } + } + putchar(' '); + } + + printf("\n=== Done dumping\n"); +} diff --git a/cpp/common.h b/cpp/common.h index dd6b002..e87ce11 100644 --- a/cpp/common.h +++ b/cpp/common.h @@ -86,6 +86,8 @@ struct gpt_params { std::vector antiprompt; // string upon seeing which more user input is prompted std::string logdir = ""; // directory in which to save YAML log files + std::vector kv_overrides; + // TODO: avoid tuple, use struct std::vector> lora_adapter; // lora adapter path with user defined scale std::string lora_base = ""; // base model path for the lora adapter @@ -98,10 +100,10 @@ struct gpt_params { size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS - bool memory_f16 = true; // use f16 instead of f32 for memory kv bool random_prompt = false; // do not randomize prompt if none provided bool use_color = false; // use color to distinguish generations and inputs bool interactive = false; // interactive mode + bool chatml = false; // chatml mode (used for models trained on chatml syntax) bool prompt_cache_all = false; // save user input and generations to prompt cache bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it @@ -121,10 +123,15 @@ struct gpt_params { bool numa = false; // attempt optimizations that help on some NUMA systems bool verbose_prompt = false; // print prompt tokens before generation bool infill = false; // use infill mode + bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes + bool no_kv_offload = false; // disable KV offloading + + std::string cache_type_k = "f16"; // KV cache data type for the K + std::string cache_type_v = "f16"; // KV cache data type for the V // multimodal models (see examples/llava) std::string mmproj = ""; // path to multimodal projector - std::string image = ""; // path to an image file + std::string image = ""; // path to an image file }; bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params); @@ -139,6 +146,12 @@ std::string gpt_random_prompt(std::mt19937 & rng); void process_escapes(std::string& input); +// +// String parsing +// + +std::string parse_samplers_input(std::string input); + // // Model utils // @@ -200,6 +213,10 @@ std::string llama_detokenize_bpe( llama_context * ctx, const std::vector & tokens); +// Uses the value from the model metadata if possible, otherwise +// defaults to true when model type is SPM, otherwise false. +bool llama_should_add_bos_token(const llama_model * model); + // // YAML utils // @@ -213,3 +230,13 @@ std::string get_sortable_timestamp(); void dump_non_result_info_yaml( FILE * stream, const gpt_params & params, const llama_context * lctx, const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc); + +// +// KV cache utils +// + +// Dump the KV cache view with the number of sequences per cell. +void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80); + +// Dump the KV cache view showing individual sequences in each cell (long output). 
+void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40); diff --git a/cpp/ggml-alloc.c b/cpp/ggml-alloc.c index 2dbd6b5..ac293c2 100644 --- a/cpp/ggml-alloc.c +++ b/cpp/ggml-alloc.c @@ -1,51 +1,21 @@ #include "ggml-alloc.h" -#include "ggml-backend.h" +#include "ggml-backend-impl.h" #include "ggml.h" +#include "ggml-impl.h" #include +#include #include #include #include #include - -#define UNUSED(x) (void)(x) #define MAX(a, b) ((a) > (b) ? (a) : (b)) -#define LM_GGML_MAX_CONCUR (2*LM_GGML_MAX_NODES) +#define MAX_FREE_BLOCKS 256 //#define LM_GGML_ALLOCATOR_DEBUG -//#define AT_PRINTF printf -#define AT_PRINTF(...) ((void)0) - -struct hash_node { - struct lm_ggml_tensor * t; - int n_children; - int n_views; -}; - -static size_t hash(void * p) { - return (size_t)p % LM_GGML_GRAPH_HASHTABLE_SIZE; -} - -static struct hash_node * hash_get(struct hash_node hash_table[], struct lm_ggml_tensor * t) { - size_t h = hash(t); - - // linear probing - size_t i = h; - while (hash_table[i].t != NULL) { - if (hash_table[i].t == t) { - return &hash_table[i]; - } - i = (i + 1) % LM_GGML_GRAPH_HASHTABLE_SIZE; - if (i == h) { - // hash table is full - LM_GGML_ASSERT(false); - } - } - - hash_table[i].t = t; - return &hash_table[i]; -} +//#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__) +#define AT_PRINTF(...) // TODO: LM_GGML_PAD ? static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) { @@ -59,20 +29,18 @@ struct free_block { size_t size; }; -#define MAX_FREE_BLOCKS 256 - -struct lm_ggml_allocr { +struct lm_ggml_tallocr { struct lm_ggml_backend_buffer * buffer; bool buffer_owned; - void * data; + void * base; size_t alignment; + int n_free_blocks; struct free_block free_blocks[MAX_FREE_BLOCKS]; - struct hash_node hash_table[LM_GGML_GRAPH_HASHTABLE_SIZE]; + size_t max_size; + bool measure; - int parse_seq[LM_GGML_MAX_CONCUR]; - int parse_seq_len; #ifdef LM_GGML_ALLOCATOR_DEBUG struct lm_ggml_tensor * allocated_tensors[1024]; @@ -80,7 +48,7 @@ struct lm_ggml_allocr { }; #ifdef LM_GGML_ALLOCATOR_DEBUG -static void add_allocated_tensor(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor * tensor) { +static void add_allocated_tensor(lm_ggml_tallocr_t alloc, struct lm_ggml_tensor * tensor) { for (int i = 0; i < 1024; i++) { if (alloc->allocated_tensors[i] == NULL) { alloc->allocated_tensors[i] = tensor; @@ -89,7 +57,7 @@ static void add_allocated_tensor(struct lm_ggml_allocr * alloc, struct lm_ggml_t } LM_GGML_ASSERT(!"out of allocated_tensors"); } -static void remove_allocated_tensor(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor * tensor) { +static void remove_allocated_tensor(lm_ggml_tallocr_t alloc, struct lm_ggml_tensor * tensor) { for (int i = 0; i < 1024; i++) { if (alloc->allocated_tensors[i] == tensor || (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) { @@ -103,7 +71,7 @@ static void remove_allocated_tensor(struct lm_ggml_allocr * alloc, struct lm_ggm #endif // check if a tensor is allocated by this buffer -static bool lm_ggml_allocr_is_own(struct lm_ggml_allocr * alloc, const struct lm_ggml_tensor * tensor) { +static bool lm_ggml_tallocr_is_own(lm_ggml_tallocr_t alloc, const struct lm_ggml_tensor * tensor) { return tensor->buffer == alloc->buffer; } @@ -111,7 +79,7 @@ static bool lm_ggml_is_view(struct lm_ggml_tensor * t) { return t->view_src != NULL; } -void lm_ggml_allocr_alloc(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor * tensor) { +void lm_ggml_tallocr_alloc(lm_ggml_tallocr_t alloc, struct 
lm_ggml_tensor * tensor) { LM_GGML_ASSERT(!lm_ggml_is_view(tensor)); // views generally get data pointer from one of their sources LM_GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated @@ -162,13 +130,14 @@ void lm_ggml_allocr_alloc(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor * } tensor->data = addr; - AT_PRINTF("%s: allocated data at %p\n", __func__, tensor->data); tensor->buffer = alloc->buffer; - lm_ggml_backend_buffer_init_tensor(alloc->buffer, tensor); + if (!alloc->measure) { + lm_ggml_backend_buffer_init_tensor(alloc->buffer, tensor); + } #ifdef LM_GGML_ALLOCATOR_DEBUG add_allocated_tensor(alloc, tensor); - size_t cur_max = (char*)addr - (char*)alloc->data + size; + size_t cur_max = (char*)addr - (char*)alloc->base + size; if (cur_max > alloc->max_size) { printf("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0); for (int i = 0; i < 1024; i++) { @@ -180,16 +149,16 @@ void lm_ggml_allocr_alloc(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor * } #endif - alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->data + size); + alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->base + size); } // this is a very naive implementation, but for our case the number of free blocks should be very small -static void lm_ggml_allocr_free_tensor(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor * tensor) { - if (lm_ggml_allocr_is_own(alloc, tensor) == false) { +static void lm_ggml_tallocr_free_tensor(lm_ggml_tallocr_t alloc, struct lm_ggml_tensor * tensor) { + if (lm_ggml_tallocr_is_own(alloc, tensor) == false) { // the tensor was not allocated in this buffer // this can happen because the graph allocator will try to free weights and other tensors from different buffers // the easiest way to deal with this is just to ignore it - AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer); + // AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer); return; } @@ -199,8 +168,6 @@ static void lm_ggml_allocr_free_tensor(struct lm_ggml_allocr * alloc, struct lm_ size = aligned_offset(NULL, size, alloc->alignment); AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks); - lm_ggml_backend_buffer_free_tensor(alloc->buffer, tensor); - #ifdef LM_GGML_ALLOCATOR_DEBUG remove_allocated_tensor(alloc, tensor); #endif @@ -253,91 +220,180 @@ static void lm_ggml_allocr_free_tensor(struct lm_ggml_allocr * alloc, struct lm_ alloc->n_free_blocks++; } -void lm_ggml_allocr_set_parse_seq(struct lm_ggml_allocr * alloc, const int * list, int n) { - for (int i = 0; i < n; i++) { - alloc->parse_seq[i] = list[i]; - } - alloc->parse_seq_len = n; -} - -void lm_ggml_allocr_reset(struct lm_ggml_allocr * alloc) { +void lm_ggml_tallocr_reset(lm_ggml_tallocr_t alloc) { alloc->n_free_blocks = 1; - size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment); - alloc->free_blocks[0].addr = (char *)alloc->data + align_offset; - alloc->free_blocks[0].size = lm_ggml_backend_buffer_get_size(alloc->buffer) - align_offset; + size_t align_offset = aligned_offset(alloc->base, 0, alloc->alignment); + alloc->free_blocks[0].addr = (char *)alloc->base + align_offset; + + if (alloc->measure) { + alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows + } else { + 
alloc->free_blocks[0].size = lm_ggml_backend_buffer_get_size(alloc->buffer) - align_offset; + } } -struct lm_ggml_allocr * lm_ggml_allocr_new(void * data, size_t size, size_t alignment) { - struct lm_ggml_backend_buffer * buffer = lm_ggml_backend_cpu_buffer_from_ptr(NULL, data, size); +lm_ggml_tallocr_t lm_ggml_tallocr_new(void * data, size_t size, size_t alignment) { + struct lm_ggml_backend_buffer * buffer = lm_ggml_backend_cpu_buffer_from_ptr(data, size); - struct lm_ggml_allocr * alloc = (struct lm_ggml_allocr *)malloc(sizeof(struct lm_ggml_allocr)); + lm_ggml_tallocr_t alloc = (lm_ggml_tallocr_t)malloc(sizeof(struct lm_ggml_tallocr)); - *alloc = (struct lm_ggml_allocr){ + *alloc = (struct lm_ggml_tallocr) { /*.buffer = */ buffer, /*.buffer_owned = */ true, /*.base = */ lm_ggml_backend_buffer_get_base(buffer), /*.alignment = */ alignment, /*.n_free_blocks = */ 0, /*.free_blocks = */ {{0}}, - /*.hash_table = */ {{0}}, /*.max_size = */ 0, /*.measure = */ false, - /*.parse_seq = */ {0}, - /*.parse_seq_len = */ 0, #ifdef LM_GGML_ALLOCATOR_DEBUG /*.allocated_tensors = */ {0}, #endif }; - lm_ggml_allocr_reset(alloc); + lm_ggml_tallocr_reset(alloc); return alloc; } -struct lm_ggml_allocr * lm_ggml_allocr_new_measure(size_t alignment) { - struct lm_ggml_allocr * alloc = lm_ggml_allocr_new((void *)0x1000, (size_t)-0x1001, alignment); +lm_ggml_tallocr_t lm_ggml_tallocr_new_measure(size_t alignment) { + lm_ggml_tallocr_t alloc = lm_ggml_tallocr_new((void *)0x1000, SIZE_MAX/2, alignment); + alloc->measure = true; + + return alloc; +} + +lm_ggml_tallocr_t lm_ggml_tallocr_new_measure_from_backend(struct lm_ggml_backend * backend) { + // create a backend buffer to get the correct tensor allocation sizes + lm_ggml_backend_buffer_t buffer = lm_ggml_backend_alloc_buffer(backend, 1); + + // TODO: move alloc initialization to a common lm_ggml_tallocr_new_impl function + lm_ggml_tallocr_t alloc = lm_ggml_tallocr_new_from_buffer(buffer); + alloc->buffer_owned = true; alloc->measure = true; + lm_ggml_tallocr_reset(alloc); + return alloc; +} +lm_ggml_tallocr_t lm_ggml_tallocr_new_from_backend(struct lm_ggml_backend * backend, size_t size) { + lm_ggml_backend_buffer_t buffer = lm_ggml_backend_alloc_buffer(backend, size); + lm_ggml_tallocr_t alloc = lm_ggml_tallocr_new_from_buffer(buffer); + alloc->buffer_owned = true; return alloc; } -struct lm_ggml_allocr * lm_ggml_allocr_new_from_buffer(struct lm_ggml_backend_buffer * buffer) { - struct lm_ggml_allocr * alloc = (struct lm_ggml_allocr *)malloc(sizeof(struct lm_ggml_allocr)); +lm_ggml_tallocr_t lm_ggml_tallocr_new_from_buffer(struct lm_ggml_backend_buffer * buffer) { + lm_ggml_tallocr_t alloc = (lm_ggml_tallocr_t)malloc(sizeof(struct lm_ggml_tallocr)); - *alloc = (struct lm_ggml_allocr){ + *alloc = (struct lm_ggml_tallocr) { /*.buffer = */ buffer, /*.buffer_owned = */ false, /*.base = */ lm_ggml_backend_buffer_get_base(buffer), /*.alignment = */ lm_ggml_backend_buffer_get_alignment(buffer), /*.n_free_blocks = */ 0, /*.free_blocks = */ {{0}}, - /*.hash_table = */ {{0}}, /*.max_size = */ 0, /*.measure = */ false, - /*.parse_seq = */ {0}, - /*.parse_seq_len = */ 0, #ifdef LM_GGML_ALLOCATOR_DEBUG /*.allocated_tensors = */ {0}, #endif }; - lm_ggml_allocr_reset(alloc); + lm_ggml_tallocr_reset(alloc); return alloc; } -void lm_ggml_allocr_free(struct lm_ggml_allocr * alloc) { +struct lm_ggml_backend_buffer * lm_ggml_tallocr_get_buffer(lm_ggml_tallocr_t alloc) { + return alloc->buffer; +} + +void lm_ggml_tallocr_free(lm_ggml_tallocr_t alloc) { + if (alloc == NULL) { + 
return; + } + if (alloc->buffer_owned) { lm_ggml_backend_buffer_free(alloc->buffer); } free(alloc); } -bool lm_ggml_allocr_is_measure(struct lm_ggml_allocr * alloc) { +bool lm_ggml_tallocr_is_measure(lm_ggml_tallocr_t alloc) { return alloc->measure; } -//////////// compute graph allocator +size_t lm_ggml_tallocr_max_size(lm_ggml_tallocr_t alloc) { + return alloc->max_size; +} + +// graph allocator + +struct hash_node { + int n_children; + int n_views; +}; + +struct lm_ggml_gallocr { + lm_ggml_tallocr_t talloc; + struct lm_ggml_hash_set hash_set; + struct hash_node * hash_values; + size_t hash_values_size; + lm_ggml_tallocr_t * hash_allocs; + int * parse_seq; + int parse_seq_len; +}; + +lm_ggml_gallocr_t lm_ggml_gallocr_new(void) { + lm_ggml_gallocr_t galloc = (lm_ggml_gallocr_t)malloc(sizeof(struct lm_ggml_gallocr)); + + *galloc = (struct lm_ggml_gallocr) { + /*.talloc = */ NULL, + /*.hash_set = */ {0}, + /*.hash_values = */ NULL, + /*.hash_values_size = */ 0, + /*.hash_allocs = */ NULL, + /*.parse_seq = */ NULL, + /*.parse_seq_len = */ 0, + }; + + return galloc; +} + +void lm_ggml_gallocr_free(lm_ggml_gallocr_t galloc) { + if (galloc == NULL) { + return; + } + + if (galloc->hash_set.keys != NULL) { + free(galloc->hash_set.keys); + } + if (galloc->hash_values != NULL) { + free(galloc->hash_values); + } + if (galloc->hash_allocs != NULL) { + free(galloc->hash_allocs); + } + if (galloc->parse_seq != NULL) { + free(galloc->parse_seq); + } + free(galloc); +} + +void lm_ggml_gallocr_set_parse_seq(lm_ggml_gallocr_t galloc, const int * list, int n) { + free(galloc->parse_seq); + galloc->parse_seq = malloc(sizeof(int) * n); + + for (int i = 0; i < n; i++) { + galloc->parse_seq[i] = list[i]; + } + galloc->parse_seq_len = n; +} + +static struct hash_node * hash_get(lm_ggml_gallocr_t galloc, struct lm_ggml_tensor * t) { + size_t i = lm_ggml_hash_find_or_insert(galloc->hash_set, t); + return &galloc->hash_values[i]; +} static bool lm_ggml_are_same_layout(const struct lm_ggml_tensor * a, const struct lm_ggml_tensor * b) { if (a->type != b->type) { @@ -378,27 +434,39 @@ static bool lm_ggml_op_can_inplace(enum lm_ggml_op op) { } } -static void init_view(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor * view, bool update_backend) { - assert(view->view_src != NULL && view->view_src->data != NULL); +static lm_ggml_tallocr_t node_tallocr(lm_ggml_gallocr_t galloc, struct lm_ggml_tensor * node) { + if (galloc->talloc != NULL) { + return galloc->talloc; + } + + return galloc->hash_allocs[lm_ggml_hash_find_or_insert(galloc->hash_set, node)]; +} + +static void init_view(lm_ggml_gallocr_t galloc, struct lm_ggml_tensor * view, bool update_backend) { + lm_ggml_tallocr_t alloc = node_tallocr(galloc, view); + LM_GGML_ASSERT(view->view_src != NULL && view->view_src->data != NULL); if (update_backend) { view->backend = view->view_src->backend; } - view->buffer = view->view_src->buffer; view->data = (char *)view->view_src->data + view->view_offs; // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend // due to the lm_ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras - assert(lm_ggml_allocr_is_measure(alloc) || !view->buffer || view->buffer->backend == alloc->buffer->backend); - lm_ggml_backend_buffer_init_tensor(alloc->buffer, view); + assert(lm_ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft); + + if (!alloc->measure) { + lm_ggml_backend_buffer_init_tensor(alloc->buffer, view); + } } -static void 
allocate_node(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor * node) { - struct hash_node * ht = alloc->hash_table; +static void allocate_node(lm_ggml_gallocr_t galloc, struct lm_ggml_tensor * node) { + lm_ggml_tallocr_t alloc = node_tallocr(galloc, node); + if (node->data == NULL) { if (lm_ggml_is_view(node)) { - init_view(alloc, node, true); + init_view(galloc, node, true); } else { // see if we can reuse a parent's buffer (inplace) if (lm_ggml_op_can_inplace(node->op)) { @@ -409,16 +477,16 @@ static void allocate_node(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor * } // if the node's data is external, then we cannot re-use it - if (lm_ggml_allocr_is_own(alloc, parent) == false) { + if (lm_ggml_tallocr_is_own(alloc, parent) == false) { AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data); continue; } - struct hash_node * p_hn = hash_get(ht, parent); + struct hash_node * p_hn = hash_get(galloc, parent); if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && lm_ggml_are_same_layout(node, parent)) { if (lm_ggml_is_view(parent)) { struct lm_ggml_tensor * view_src = parent->view_src; - struct hash_node * view_src_hn = hash_get(ht, view_src); + struct hash_node * view_src_hn = hash_get(galloc, view_src); if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) { // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite // the parent's data that it will need later (same layout requirement). the problem is that then @@ -428,170 +496,307 @@ static void allocate_node(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor * AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name); node->view_src = view_src; view_src_hn->n_views += 1; - init_view(alloc, node, false); + init_view(galloc, node, false); return; } } else { AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name); node->view_src = parent; p_hn->n_views += 1; - init_view(alloc, node, false); + init_view(galloc, node, false); return; } } } } - lm_ggml_allocr_alloc(alloc, node); + lm_ggml_tallocr_alloc(alloc, node); } } } -size_t lm_ggml_allocr_alloc_graph_n( - struct lm_ggml_allocr * alloc, - struct lm_ggml_cgraph ** graphs, int n_graphs, - struct lm_ggml_tensor *** inputs, struct lm_ggml_tensor *** outputs) { +static void free_node(lm_ggml_gallocr_t galloc, struct lm_ggml_tensor * node) { + lm_ggml_tallocr_t alloc = node_tallocr(galloc, node); - // reset hash table - struct hash_node * ht = alloc->hash_table; - memset(ht, 0, sizeof(struct hash_node) * LM_GGML_GRAPH_HASHTABLE_SIZE); + lm_ggml_tallocr_free_tensor(alloc, node); +} + +static void lm_ggml_tallocr_alloc_graph_impl(lm_ggml_gallocr_t galloc, struct lm_ggml_cgraph * gf) { + const int * parse_seq = galloc->parse_seq; + int parse_seq_len = galloc->parse_seq_len; // count number of children and views - for (int g = 0; g < n_graphs; g++) { - struct lm_ggml_cgraph * gf = graphs[g]; - for (int i = 0; i < gf->n_nodes; i++) { + for (int i = 0; i < gf->n_nodes; i++) { + struct lm_ggml_tensor * node = gf->nodes[i]; + + if (lm_ggml_is_view(node)) { + struct lm_ggml_tensor * view_src = node->view_src; + hash_get(galloc, view_src)->n_views += 1; + if (node->buffer == NULL && node->data != NULL) { + // view of a pre-allocated tensor, didn't call init_view() yet + init_view(galloc, node, true); + } + } + + for (int j = 0; j < LM_GGML_MAX_SRC; j++) { + struct lm_ggml_tensor * parent = node->src[j]; + 
if (parent == NULL) { + break; + } + hash_get(galloc, parent)->n_children += 1; + if (lm_ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) { + init_view(galloc, parent, true); + } + } + } + + // allocate tensors + // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers + int last_barrier_pos = 0; + int n_nodes = parse_seq_len ? parse_seq_len : gf->n_nodes; + + for (int ind = 0; ind < n_nodes; ind++) { + // allocate a node if there is no parse_seq or this is not a barrier + if (parse_seq_len == 0 || parse_seq[ind] != -1) { + int i = parse_seq_len ? parse_seq[ind] : ind; struct lm_ggml_tensor * node = gf->nodes[i]; - if (lm_ggml_is_view(node)) { - struct lm_ggml_tensor * view_src = node->view_src; - hash_get(ht, view_src)->n_views += 1; - if (node->buffer == NULL && node->data != NULL) { - // view of a pre-allocated tensor, didn't call init_view() yet - init_view(alloc, node, true); + // allocate parents (leafs) + for (int j = 0; j < LM_GGML_MAX_SRC; j++) { + struct lm_ggml_tensor * parent = node->src[j]; + if (parent == NULL) { + break; } + allocate_node(galloc, parent); } + // allocate node + allocate_node(galloc, node); + + AT_PRINTF("exec: %s (%s) <= ", lm_ggml_op_name(node->op), node->name); for (int j = 0; j < LM_GGML_MAX_SRC; j++) { struct lm_ggml_tensor * parent = node->src[j]; if (parent == NULL) { break; } - hash_get(ht, parent)->n_children += 1; - if (lm_ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) { - init_view(alloc, parent, true); + AT_PRINTF("%s", parent->name); + if (j < LM_GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) { + AT_PRINTF(", "); } } + AT_PRINTF("\n"); } - } - // allocate tensors - for (int g = 0; g < n_graphs; g++) { - struct lm_ggml_cgraph * gf = graphs[g]; - AT_PRINTF("####### graph %d/%d\n", g, n_graphs); - // graph inputs are allocated first to ensure that they are not overwritten by each other - if (inputs != NULL && inputs[g] != NULL) { - for (int i = 0; inputs[g][i] != NULL; i++) { - struct lm_ggml_tensor * input = inputs[g][i]; - AT_PRINTF("input: %s\n", input->name); - allocate_node(alloc, input); - } - } - // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers - int last_barrier_pos = 0; - int n_nodes = alloc->parse_seq_len ? alloc->parse_seq_len : gf->n_nodes; + // update parents + // update immediately if there is no parse_seq + // update only at barriers if there is parse_seq + if ((parse_seq_len == 0) || parse_seq[ind] == -1) { + int update_start = parse_seq_len ? last_barrier_pos : ind; + int update_end = parse_seq_len ? ind : ind + 1; + for (int i = update_start; i < update_end; i++) { + int node_i = parse_seq_len ? parse_seq[i] : i; + struct lm_ggml_tensor * node = gf->nodes[node_i]; - for (int ind = 0; ind < n_nodes; ind++) { - // allocate a node if there is no parse_seq or this is not a barrier - if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] != -1) { - int i = alloc->parse_seq_len ? 
alloc->parse_seq[ind] : ind; - struct lm_ggml_tensor * node = gf->nodes[i]; - - // allocate parents (leafs) for (int j = 0; j < LM_GGML_MAX_SRC; j++) { struct lm_ggml_tensor * parent = node->src[j]; if (parent == NULL) { break; } - allocate_node(alloc, parent); - } + struct hash_node * p_hn = hash_get(galloc, parent); + p_hn->n_children -= 1; - // allocate node - allocate_node(alloc, node); + //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views); - AT_PRINTF("exec: %s (%s) <= ", lm_ggml_op_name(node->op), node->name); - for (int j = 0; j < LM_GGML_MAX_SRC; j++) { - struct lm_ggml_tensor * parent = node->src[j]; - if (parent == NULL) { - break; - } - AT_PRINTF("%s", parent->name); - if (j < LM_GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) { - AT_PRINTF(", "); - } - } - AT_PRINTF("\n"); - } - - // update parents - // update immediately if there is no parse_seq - // update only at barriers if there is parse_seq - if ((alloc->parse_seq_len == 0) || alloc->parse_seq[ind] == -1) { - int update_start = alloc->parse_seq_len ? last_barrier_pos : ind; - int update_end = alloc->parse_seq_len ? ind : ind + 1; - for (int i = update_start; i < update_end; i++) { - int node_i = alloc->parse_seq_len ? alloc->parse_seq[i] : i; - struct lm_ggml_tensor * node = gf->nodes[node_i]; - - for (int j = 0; j < LM_GGML_MAX_SRC; j++) { - struct lm_ggml_tensor * parent = node->src[j]; - if (parent == NULL) { - break; - } - struct hash_node * p_hn = hash_get(ht, parent); - p_hn->n_children -= 1; - - //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views); - - if (p_hn->n_children == 0 && p_hn->n_views == 0) { - if (lm_ggml_is_view(parent)) { - struct lm_ggml_tensor * view_src = parent->view_src; - struct hash_node * view_src_hn = hash_get(ht, view_src); - view_src_hn->n_views -= 1; - AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views); - if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) { - lm_ggml_allocr_free_tensor(alloc, view_src); - } - } - else { - if (parent->data != node->data) { - lm_ggml_allocr_free_tensor(alloc, parent); - } + if (p_hn->n_children == 0 && p_hn->n_views == 0) { + if (lm_ggml_is_view(parent)) { + struct lm_ggml_tensor * view_src = parent->view_src; + struct hash_node * view_src_hn = hash_get(galloc, view_src); + view_src_hn->n_views -= 1; + AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views); + if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0) { + free_node(galloc, view_src); } } + else { + free_node(galloc, parent); + } } } - AT_PRINTF("\n"); - if (alloc->parse_seq_len) { - last_barrier_pos = ind + 1; - } } - } - // free graph outputs here that wouldn't be freed otherwise because they have no children - if (outputs != NULL && outputs[g] != NULL) { - for (int i = 0; outputs[g][i] != NULL; i++) { - struct lm_ggml_tensor * output = outputs[g][i]; - AT_PRINTF("output: %s\n", output->name); - lm_ggml_allocr_free_tensor(alloc, output); + AT_PRINTF("\n"); + if (parse_seq_len) { + last_barrier_pos = ind + 1; } } } +} - return alloc->max_size; +size_t lm_ggml_gallocr_alloc_graph(lm_ggml_gallocr_t galloc, lm_ggml_tallocr_t talloc, struct lm_ggml_cgraph * graph) { + size_t hash_size = graph->visited_hash_table.size; + + // check if the hash table is initialized and large enough + if (galloc->hash_set.size < hash_size) { + if 
(galloc->hash_set.keys != NULL) { + free(galloc->hash_set.keys); + } + if (galloc->hash_values != NULL) { + free(galloc->hash_values); + } + galloc->hash_set.keys = malloc(sizeof(struct lm_ggml_tensor *) * hash_size); + galloc->hash_set.size = hash_size; + galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size); + } + + // reset hash table + memset(galloc->hash_set.keys, 0, sizeof(struct lm_ggml_tensor *) * hash_size); + memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size); + + galloc->talloc = talloc; + lm_ggml_tallocr_alloc_graph_impl(galloc, graph); + galloc->talloc = NULL; + + size_t max_size = lm_ggml_tallocr_max_size(talloc); + + return max_size; } -size_t lm_ggml_allocr_alloc_graph(struct lm_ggml_allocr * alloc, struct lm_ggml_cgraph * graph) { - return lm_ggml_allocr_alloc_graph_n(alloc, &graph, 1, NULL, NULL); +void lm_ggml_gallocr_alloc_graph_n(lm_ggml_gallocr_t galloc, struct lm_ggml_cgraph * graph, struct lm_ggml_hash_set hash_set, lm_ggml_tallocr_t * hash_node_talloc) { + const size_t hash_size = hash_set.size; + + LM_GGML_ASSERT(hash_size >= (size_t)(graph->n_nodes + graph->n_leafs)); + + galloc->talloc = NULL; + + // alloc hash_values if needed + if (galloc->hash_values == NULL || galloc->hash_values_size < hash_size) { + free(galloc->hash_values); + galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size); + galloc->hash_values_size = hash_size; + } + + // free hash_set.keys if needed + if (galloc->hash_set.keys != NULL) { + free(galloc->hash_set.keys); + } + galloc->hash_set = hash_set; + + // reset hash values + memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size); + + galloc->hash_allocs = hash_node_talloc; + + lm_ggml_tallocr_alloc_graph_impl(galloc, graph); + + // remove unowned resources + galloc->hash_set.keys = NULL; + galloc->hash_allocs = NULL; } -size_t lm_ggml_allocr_max_size(struct lm_ggml_allocr * alloc) { - return alloc->max_size; +// legacy API wrapper + +struct lm_ggml_allocr { + lm_ggml_tallocr_t talloc; + lm_ggml_gallocr_t galloc; +}; + +static lm_ggml_allocr_t lm_ggml_allocr_new_impl(lm_ggml_tallocr_t talloc) { + lm_ggml_allocr_t alloc = (lm_ggml_allocr_t)malloc(sizeof(struct lm_ggml_allocr)); + *alloc = (struct lm_ggml_allocr) { + /*.talloc = */ talloc, + /*.galloc = */ lm_ggml_gallocr_new(), + }; + return alloc; +} + +lm_ggml_allocr_t lm_ggml_allocr_new(void * data, size_t size, size_t alignment) { + return lm_ggml_allocr_new_impl(lm_ggml_tallocr_new(data, size, alignment)); +} + +lm_ggml_allocr_t lm_ggml_allocr_new_measure(size_t alignment) { + return lm_ggml_allocr_new_impl(lm_ggml_tallocr_new_measure(alignment)); +} + +lm_ggml_allocr_t lm_ggml_allocr_new_from_buffer(struct lm_ggml_backend_buffer * buffer) { + return lm_ggml_allocr_new_impl(lm_ggml_tallocr_new_from_buffer(buffer)); +} + +lm_ggml_allocr_t lm_ggml_allocr_new_from_backend(struct lm_ggml_backend * backend, size_t size) { + return lm_ggml_allocr_new_impl(lm_ggml_tallocr_new_from_backend(backend, size)); +} + +lm_ggml_allocr_t lm_ggml_allocr_new_measure_from_backend(struct lm_ggml_backend * backend) { + return lm_ggml_allocr_new_impl(lm_ggml_tallocr_new_measure_from_backend(backend)); +} + +struct lm_ggml_backend_buffer * lm_ggml_allocr_get_buffer(lm_ggml_allocr_t alloc) { + return lm_ggml_tallocr_get_buffer(alloc->talloc); +} + +void lm_ggml_allocr_set_parse_seq(lm_ggml_allocr_t alloc, const int * list, int n) { + lm_ggml_gallocr_set_parse_seq(alloc->galloc, list, n); +} + +void lm_ggml_allocr_free(lm_ggml_allocr_t alloc) { + 
lm_ggml_gallocr_free(alloc->galloc); + lm_ggml_tallocr_free(alloc->talloc); + free(alloc); +} + +bool lm_ggml_allocr_is_measure(lm_ggml_allocr_t alloc) { + return lm_ggml_tallocr_is_measure(alloc->talloc); +} + +void lm_ggml_allocr_reset(lm_ggml_allocr_t alloc) { + lm_ggml_tallocr_reset(alloc->talloc); +} + +void lm_ggml_allocr_alloc(lm_ggml_allocr_t alloc, struct lm_ggml_tensor * tensor) { + lm_ggml_tallocr_alloc(alloc->talloc, tensor); +} + +size_t lm_ggml_allocr_max_size(lm_ggml_allocr_t alloc) { + return lm_ggml_tallocr_max_size(alloc->talloc); +} + +size_t lm_ggml_allocr_alloc_graph(lm_ggml_allocr_t alloc, struct lm_ggml_cgraph * graph) { + return lm_ggml_gallocr_alloc_graph(alloc->galloc, alloc->talloc, graph); +} + +// utils +lm_ggml_backend_buffer_t lm_ggml_backend_alloc_ctx_tensors_from_buft(struct lm_ggml_context * ctx, lm_ggml_backend_buffer_type_t buft) { + LM_GGML_ASSERT(lm_ggml_get_no_alloc(ctx) == true); + + size_t alignment = lm_ggml_backend_buft_get_alignment(buft); + + size_t nbytes = 0; + for (struct lm_ggml_tensor * t = lm_ggml_get_first_tensor(ctx); t != NULL; t = lm_ggml_get_next_tensor(ctx, t)) { + if (t->data == NULL && t->view_src == NULL) { + nbytes += LM_GGML_PAD(lm_ggml_backend_buft_get_alloc_size(buft, t), alignment); + } + } + + if (nbytes == 0) { + fprintf(stderr, "%s: no tensors to allocate\n", __func__); + return NULL; + } + + lm_ggml_backend_buffer_t buffer = lm_ggml_backend_buft_alloc_buffer(buft, nbytes); + lm_ggml_tallocr_t tallocr = lm_ggml_tallocr_new_from_buffer(buffer); + + for (struct lm_ggml_tensor * t = lm_ggml_get_first_tensor(ctx); t != NULL; t = lm_ggml_get_next_tensor(ctx, t)) { + if (t->data == NULL) { + if (t->view_src == NULL) { + lm_ggml_tallocr_alloc(tallocr, t); + } else { + lm_ggml_backend_view_init(buffer, t); + } + } + } + + lm_ggml_tallocr_free(tallocr); + + return buffer; +} + +lm_ggml_backend_buffer_t lm_ggml_backend_alloc_ctx_tensors(struct lm_ggml_context * ctx, lm_ggml_backend_t backend) { + return lm_ggml_backend_alloc_ctx_tensors_from_buft(ctx, lm_ggml_backend_get_default_buffer_type(backend)); } diff --git a/cpp/ggml-alloc.h b/cpp/ggml-alloc.h index 565e0b1..8f5f880 100644 --- a/cpp/ggml-alloc.h +++ b/cpp/ggml-alloc.h @@ -6,27 +6,86 @@ extern "C" { #endif +struct lm_ggml_backend; struct lm_ggml_backend_buffer; +struct lm_ggml_backend_buffer_type; -LM_GGML_API struct lm_ggml_allocr * lm_ggml_allocr_new(void * data, size_t size, size_t alignment); -LM_GGML_API struct lm_ggml_allocr * lm_ggml_allocr_new_measure(size_t alignment); -LM_GGML_API struct lm_ggml_allocr * lm_ggml_allocr_new_from_buffer(struct lm_ggml_backend_buffer * buffer); +// +// Legacy API +// + +typedef struct lm_ggml_allocr * lm_ggml_allocr_t; + +// initialize allocator for use with CPU backend only +LM_GGML_API lm_ggml_allocr_t lm_ggml_allocr_new(void * data, size_t size, size_t alignment); +LM_GGML_API lm_ggml_allocr_t lm_ggml_allocr_new_measure(size_t alignment); + +// initialize allocator for use with ggml-backend +LM_GGML_API lm_ggml_allocr_t lm_ggml_allocr_new_from_buffer(struct lm_ggml_backend_buffer * buffer); +LM_GGML_API lm_ggml_allocr_t lm_ggml_allocr_new_from_backend(struct lm_ggml_backend * backend, size_t size); // allocates an owned buffer +LM_GGML_API lm_ggml_allocr_t lm_ggml_allocr_new_measure_from_backend(struct lm_ggml_backend * backend); + +LM_GGML_API struct lm_ggml_backend_buffer * lm_ggml_allocr_get_buffer(lm_ggml_allocr_t alloc); // tell the allocator to parse nodes following the order described in the list // you should call this if 
your graph are optimized to execute out-of-order -LM_GGML_API void lm_ggml_allocr_set_parse_seq(struct lm_ggml_allocr * alloc, const int * list, int n); - -LM_GGML_API void lm_ggml_allocr_free (struct lm_ggml_allocr * alloc); -LM_GGML_API bool lm_ggml_allocr_is_measure (struct lm_ggml_allocr * alloc); -LM_GGML_API void lm_ggml_allocr_reset (struct lm_ggml_allocr * alloc); -LM_GGML_API void lm_ggml_allocr_alloc (struct lm_ggml_allocr * alloc, struct lm_ggml_tensor * tensor); -LM_GGML_API size_t lm_ggml_allocr_alloc_graph(struct lm_ggml_allocr * alloc, struct lm_ggml_cgraph * graph); -LM_GGML_API size_t lm_ggml_allocr_max_size (struct lm_ggml_allocr * alloc); - -LM_GGML_API size_t lm_ggml_allocr_alloc_graph_n( - struct lm_ggml_allocr * alloc, - struct lm_ggml_cgraph ** graphs, int n_graphs, - struct lm_ggml_tensor *** inputs, struct lm_ggml_tensor *** outputs); +LM_GGML_API void lm_ggml_allocr_set_parse_seq(lm_ggml_allocr_t alloc, const int * list, int n); + +LM_GGML_API void lm_ggml_allocr_free (lm_ggml_allocr_t alloc); +LM_GGML_API bool lm_ggml_allocr_is_measure (lm_ggml_allocr_t alloc); +LM_GGML_API void lm_ggml_allocr_reset (lm_ggml_allocr_t alloc); +LM_GGML_API void lm_ggml_allocr_alloc (lm_ggml_allocr_t alloc, struct lm_ggml_tensor * tensor); +LM_GGML_API size_t lm_ggml_allocr_max_size (lm_ggml_allocr_t alloc); + +LM_GGML_API size_t lm_ggml_allocr_alloc_graph(lm_ggml_allocr_t alloc, struct lm_ggml_cgraph * graph); + +// +// ggml-backend v2 API +// + +// Seperate tensor and graph allocator objects +// This is necessary for multi-backend allocation because the graph allocator needs to use multiple tensor allocators +// The original API is kept as a wrapper around the new API + +// Tensor allocator +typedef struct lm_ggml_tallocr * lm_ggml_tallocr_t; + +LM_GGML_API lm_ggml_tallocr_t lm_ggml_tallocr_new(void * data, size_t size, size_t alignment); +LM_GGML_API lm_ggml_tallocr_t lm_ggml_tallocr_new_measure(size_t alignment); +LM_GGML_API lm_ggml_tallocr_t lm_ggml_tallocr_new_from_buffer(struct lm_ggml_backend_buffer * buffer); +LM_GGML_API lm_ggml_tallocr_t lm_ggml_tallocr_new_from_backend(struct lm_ggml_backend * backend, size_t size); // allocates an owned buffer +LM_GGML_API lm_ggml_tallocr_t lm_ggml_tallocr_new_measure_from_backend(struct lm_ggml_backend * backend); + +LM_GGML_API struct lm_ggml_backend_buffer * lm_ggml_tallocr_get_buffer(lm_ggml_tallocr_t talloc); + +LM_GGML_API void lm_ggml_tallocr_free (lm_ggml_tallocr_t talloc); +LM_GGML_API bool lm_ggml_tallocr_is_measure (lm_ggml_tallocr_t talloc); +LM_GGML_API void lm_ggml_tallocr_reset (lm_ggml_tallocr_t talloc); +LM_GGML_API void lm_ggml_tallocr_alloc (lm_ggml_tallocr_t talloc, struct lm_ggml_tensor * tensor); +LM_GGML_API size_t lm_ggml_tallocr_max_size (lm_ggml_tallocr_t talloc); + + +// Graph allocator +typedef struct lm_ggml_gallocr * lm_ggml_gallocr_t; + +LM_GGML_API lm_ggml_gallocr_t lm_ggml_gallocr_new(void); +LM_GGML_API void lm_ggml_gallocr_free(lm_ggml_gallocr_t galloc); + +LM_GGML_API void lm_ggml_gallocr_set_parse_seq(lm_ggml_gallocr_t galloc, const int * list, int n); +LM_GGML_API size_t lm_ggml_gallocr_alloc_graph(lm_ggml_gallocr_t galloc, lm_ggml_tallocr_t talloc, struct lm_ggml_cgraph * graph); + +// Allocate tensors from the allocators given by the hash table +LM_GGML_API void lm_ggml_gallocr_alloc_graph_n( + lm_ggml_gallocr_t galloc, + struct lm_ggml_cgraph * graph, + struct lm_ggml_hash_set hash_set, + lm_ggml_tallocr_t * hash_node_talloc); + + +// Utils +// Create a buffer and allocate all the tensors in a 
lm_ggml_context +LM_GGML_API struct lm_ggml_backend_buffer * lm_ggml_backend_alloc_ctx_tensors_from_buft(struct lm_ggml_context * ctx, struct lm_ggml_backend_buffer_type * buft); +LM_GGML_API struct lm_ggml_backend_buffer * lm_ggml_backend_alloc_ctx_tensors(struct lm_ggml_context * ctx, struct lm_ggml_backend * backend); #ifdef __cplusplus } diff --git a/cpp/ggml-backend-impl.h b/cpp/ggml-backend-impl.h new file mode 100644 index 0000000..fd83f84 --- /dev/null +++ b/cpp/ggml-backend-impl.h @@ -0,0 +1,112 @@ +#pragma once + +// ggml-backend internal header + +#include "ggml-backend.h" + +#ifdef __cplusplus +extern "C" { +#endif + + // + // Backend buffer + // + + // buffer type + typedef void * lm_ggml_backend_buffer_type_context_t; + + struct lm_ggml_backend_buffer_type_i { + lm_ggml_backend_buffer_t (*alloc_buffer) (lm_ggml_backend_buffer_type_t buft, size_t size); + size_t (*get_alignment) (lm_ggml_backend_buffer_type_t buft); // tensor alignment + size_t (*get_alloc_size) (lm_ggml_backend_buffer_type_t buft, struct lm_ggml_tensor * tensor); // data size needed to allocate the tensor, including padding + bool (*supports_backend)(lm_ggml_backend_buffer_type_t buft, lm_ggml_backend_t backend); // check if the buffer type is usable by the backend + }; + + struct lm_ggml_backend_buffer_type { + struct lm_ggml_backend_buffer_type_i iface; + lm_ggml_backend_buffer_type_context_t context; + }; + + // buffer + typedef void * lm_ggml_backend_buffer_context_t; + + struct lm_ggml_backend_buffer_i { + void (*free_buffer)(lm_ggml_backend_buffer_t buffer); + //void (*reset) (lm_ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras + void * (*get_base) (lm_ggml_backend_buffer_t buffer); + void (*init_tensor)(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor); + void (*set_tensor) (lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size); + void (*get_tensor) (lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size); + // (optional) copy tensor between different buffer-type, allow for single-copy tranfers + void (*cpy_tensor_from)(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst); + void (*cpy_tensor_to) (lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst); + }; + + struct lm_ggml_backend_buffer { + struct lm_ggml_backend_buffer_i iface; + lm_ggml_backend_buffer_type_t buft; + lm_ggml_backend_buffer_context_t context; + size_t size; + }; + + lm_ggml_backend_buffer_t lm_ggml_backend_buffer_init( + lm_ggml_backend_buffer_type_t buft, + struct lm_ggml_backend_buffer_i iface, + lm_ggml_backend_buffer_context_t context, + size_t size); + + + // + // Backend + // + + typedef void * lm_ggml_backend_context_t; + + struct lm_ggml_backend_i { + const char * (*get_name)(lm_ggml_backend_t backend); + + void (*free)(lm_ggml_backend_t backend); + + // buffer allocation + lm_ggml_backend_buffer_type_t (*get_default_buffer_type)(lm_ggml_backend_t backend); + + // (optional) asynchroneous tensor data access + void (*set_tensor_async)(lm_ggml_backend_t backend, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size); + void (*get_tensor_async)(lm_ggml_backend_t backend, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size); + + // (optional) asynchroneous tensor copy + void (*cpy_tensor_from_async)(lm_ggml_backend_t 
backend, struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst); + void (*cpy_tensor_to_async) (lm_ggml_backend_t backend, struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst); + + void (*synchronize) (lm_ggml_backend_t backend); + + // compute graph with a plan + lm_ggml_backend_graph_plan_t (*graph_plan_create) (lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph); + void (*graph_plan_free) (lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan); + void (*graph_plan_compute)(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan); + + // compute graph without a plan + void (*graph_compute)(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph); + + // check if the backend supports an operation + bool (*supports_op)(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op); + }; + + struct lm_ggml_backend { + struct lm_ggml_backend_i iface; + + lm_ggml_backend_context_t context; + }; + + + // + // Backend registry + // + + typedef lm_ggml_backend_t (*lm_ggml_backend_init_fn)(const char * params, void * user_data); + + void lm_ggml_backend_register(const char * name, lm_ggml_backend_init_fn init_fn, lm_ggml_backend_buffer_type_t default_buffer_type, void * user_data); + +#ifdef __cplusplus +} +#endif diff --git a/cpp/ggml-backend.c b/cpp/ggml-backend.c index 2f83068..dea815d 100644 --- a/cpp/ggml-backend.c +++ b/cpp/ggml-backend.c @@ -1,20 +1,44 @@ -#include "ggml-backend.h" +#include "ggml-backend-impl.h" #include "ggml-alloc.h" +#include "ggml-impl.h" #include +#include #include #include #include #include -#define UNUSED LM_GGML_UNUSED #define MAX(a, b) ((a) > (b) ? (a) : (b)) + +// backend buffer type + +lm_ggml_backend_buffer_t lm_ggml_backend_buft_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) { + return buft->iface.alloc_buffer(buft, size); +} + +size_t lm_ggml_backend_buft_get_alignment(lm_ggml_backend_buffer_type_t buft) { + return buft->iface.get_alignment(buft); +} + +size_t lm_ggml_backend_buft_get_alloc_size(lm_ggml_backend_buffer_type_t buft, struct lm_ggml_tensor * tensor) { + // get_alloc_size is optional, defaults to lm_ggml_nbytes + if (buft->iface.get_alloc_size) { + return buft->iface.get_alloc_size(buft, tensor); + } + return lm_ggml_nbytes(tensor); +} + +bool lm_ggml_backend_buft_supports_backend(lm_ggml_backend_buffer_type_t buft, lm_ggml_backend_t backend) { + return buft->iface.supports_backend(buft, backend); +} + // backend buffer lm_ggml_backend_buffer_t lm_ggml_backend_buffer_init( - struct lm_ggml_backend * backend, + lm_ggml_backend_buffer_type_t buft, struct lm_ggml_backend_buffer_i iface, lm_ggml_backend_buffer_context_t context, size_t size) { @@ -24,7 +48,7 @@ lm_ggml_backend_buffer_t lm_ggml_backend_buffer_init( (*buffer) = (struct lm_ggml_backend_buffer) { /* .interface = */ iface, - /* .backend = */ backend, + /* .buft = */ buft, /* .context = */ context, /* .size = */ size, }; @@ -33,84 +57,111 @@ lm_ggml_backend_buffer_t lm_ggml_backend_buffer_init( } void lm_ggml_backend_buffer_free(lm_ggml_backend_buffer_t buffer) { + if (buffer == NULL) { + return; + } + if (buffer->iface.free_buffer != NULL) { buffer->iface.free_buffer(buffer); } free(buffer); } -size_t lm_ggml_backend_buffer_get_alignment(lm_ggml_backend_buffer_t buffer) { - return lm_ggml_backend_get_alignment(buffer->backend); +size_t lm_ggml_backend_buffer_get_size(lm_ggml_backend_buffer_t buffer) { + return buffer->size; } void * lm_ggml_backend_buffer_get_base(lm_ggml_backend_buffer_t buffer) { - return buffer->iface.get_base(buffer); -} + 
void * base = buffer->iface.get_base(buffer); -size_t lm_ggml_backend_buffer_get_size(lm_ggml_backend_buffer_t buffer) { - return buffer->size; -} + LM_GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL"); -size_t lm_ggml_backend_buffer_get_alloc_size(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor) { - if (buffer->iface.get_alloc_size) { - return buffer->iface.get_alloc_size(buffer, tensor); - } - return lm_ggml_nbytes(tensor); + return base; } void lm_ggml_backend_buffer_init_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor) { + // init_tensor is optional if (buffer->iface.init_tensor) { buffer->iface.init_tensor(buffer, tensor); } } -void lm_ggml_backend_buffer_free_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor) { - if (buffer->iface.free_tensor) { - buffer->iface.free_tensor(buffer, tensor); - } +size_t lm_ggml_backend_buffer_get_alignment (lm_ggml_backend_buffer_t buffer) { + return lm_ggml_backend_buft_get_alignment(lm_ggml_backend_buffer_type(buffer)); } -// backend +size_t lm_ggml_backend_buffer_get_alloc_size(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor) { + return lm_ggml_backend_buft_get_alloc_size(lm_ggml_backend_buffer_type(buffer), tensor); +} -lm_ggml_backend_t lm_ggml_get_backend(const struct lm_ggml_tensor * tensor) { - return tensor->buffer->backend; +lm_ggml_backend_buffer_type_t lm_ggml_backend_buffer_type(lm_ggml_backend_buffer_t buffer) { + return buffer->buft; } +// backend + const char * lm_ggml_backend_name(lm_ggml_backend_t backend) { + if (backend == NULL) { + return "NULL"; + } return backend->iface.get_name(backend); } void lm_ggml_backend_free(lm_ggml_backend_t backend) { + if (backend == NULL) { + return; + } + backend->iface.free(backend); } +lm_ggml_backend_buffer_type_t lm_ggml_backend_get_default_buffer_type(lm_ggml_backend_t backend) { + return backend->iface.get_default_buffer_type(backend); +} + lm_ggml_backend_buffer_t lm_ggml_backend_alloc_buffer(lm_ggml_backend_t backend, size_t size) { - return backend->iface.alloc_buffer(backend, size); + return lm_ggml_backend_buft_alloc_buffer(lm_ggml_backend_get_default_buffer_type(backend), size); } size_t lm_ggml_backend_get_alignment(lm_ggml_backend_t backend) { - return backend->iface.get_alignment(backend); + return lm_ggml_backend_buft_get_alignment(lm_ggml_backend_get_default_buffer_type(backend)); } -void lm_ggml_backend_tensor_set_async(struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - lm_ggml_get_backend(tensor)->iface.set_tensor_async(lm_ggml_get_backend(tensor), tensor, data, offset, size); +void lm_ggml_backend_tensor_set_async(lm_ggml_backend_t backend, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor write out of bounds"); + + backend->iface.set_tensor_async(backend, tensor, data, offset, size); } -void lm_ggml_backend_tensor_get_async(const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) { - lm_ggml_get_backend(tensor)->iface.get_tensor_async(lm_ggml_get_backend(tensor), tensor, data, offset, size); +void lm_ggml_backend_tensor_get_async(lm_ggml_backend_t backend, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) { + LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor read 
out of bounds"); + + backend->iface.get_tensor_async(backend, tensor, data, offset, size); } void lm_ggml_backend_tensor_set(struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - lm_ggml_get_backend(tensor)->iface.set_tensor_async(lm_ggml_get_backend(tensor), tensor, data, offset, size); - lm_ggml_get_backend(tensor)->iface.synchronize(lm_ggml_get_backend(tensor)); + LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + LM_GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set"); + LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor write out of bounds"); + + tensor->buffer->iface.set_tensor(tensor->buffer, tensor, data, offset, size); } void lm_ggml_backend_tensor_get(const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) { - lm_ggml_get_backend(tensor)->iface.get_tensor_async(lm_ggml_get_backend(tensor), tensor, data, offset, size); - lm_ggml_get_backend(tensor)->iface.synchronize(lm_ggml_get_backend(tensor)); + LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + LM_GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set"); + LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor read out of bounds"); + + tensor->buffer->iface.get_tensor(tensor->buffer, tensor, data, offset, size); } void lm_ggml_backend_synchronize(lm_ggml_backend_t backend) { + if (backend->iface.synchronize == NULL) { + return; + } + backend->iface.synchronize(backend); } @@ -124,10 +175,16 @@ void lm_ggml_backend_graph_plan_free(lm_ggml_backend_t backend, lm_ggml_backend_ void lm_ggml_backend_graph_plan_compute(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) { backend->iface.graph_plan_compute(backend, plan); + + // TODO: optional sync + lm_ggml_backend_synchronize(backend); } void lm_ggml_backend_graph_compute(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) { backend->iface.graph_compute(backend, cgraph); + + // TODO: optional sync + lm_ggml_backend_synchronize(backend); } bool lm_ggml_backend_supports_op(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op) { @@ -156,7 +213,7 @@ void lm_ggml_backend_tensor_copy(struct lm_ggml_tensor * src, struct lm_ggml_ten //printf("dst: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", dst->name, (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], (int)dst->nb[0], (int)dst->nb[1], (int)dst->nb[2], (int)dst->nb[3]); LM_GGML_ASSERT(lm_ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts"); - // printf("cpy tensor %s from %s to %s (%lu bytes)\n", src->name, lm_ggml_backend_name(src->backend), lm_ggml_backend_name(dst->backend), lm_ggml_nbytes(src)); + // fprintf(stderr, "cpy tensor %s from %s to %s (%lu bytes)\n", src->name, lm_ggml_backend_name(src->backend), lm_ggml_backend_name(dst->backend), lm_ggml_nbytes(src)); if (src == dst) { return; @@ -164,14 +221,15 @@ void lm_ggml_backend_tensor_copy(struct lm_ggml_tensor * src, struct lm_ggml_ten // TODO: allow backends to support copy to/from same backend - if (lm_ggml_get_backend(dst)->iface.cpy_tensor_from != NULL) { - lm_ggml_get_backend(dst)->iface.cpy_tensor_from(lm_ggml_get_backend(dst)->context, src, dst); - } else if (lm_ggml_get_backend(src)->iface.cpy_tensor_to != NULL) { - lm_ggml_get_backend(src)->iface.cpy_tensor_to(lm_ggml_get_backend(src)->context, src, dst); + if (dst->buffer->iface.cpy_tensor_from != NULL) { + dst->buffer->iface.cpy_tensor_from(dst->buffer, src, dst); + } else if (src->buffer->iface.cpy_tensor_to != NULL) { + 
src->buffer->iface.cpy_tensor_to(src->buffer, src, dst); } else { // shouldn't be hit when copying from/to CPU #ifndef NDEBUG - fprintf(stderr, "lm_ggml_backend_tensor_copy: neither cpy_tensor_from nor cpy_tensor_to are implemented for backends %s and %s, falling back to get/set\n", lm_ggml_backend_name(src->buffer->backend), lm_ggml_backend_name(dst->buffer->backend)); + fprintf(stderr, "lm_ggml_backend_tensor_copy: neither cpy_tensor_from nor cpy_tensor_to " + "are implemented for %s and %s, falling back to get/set\n", src->name, dst->name); #endif size_t nbytes = lm_ggml_nbytes(src); void * data = malloc(nbytes); @@ -181,100 +239,259 @@ void lm_ggml_backend_tensor_copy(struct lm_ggml_tensor * src, struct lm_ggml_ten } } -// backend CPU +// backend registry -struct lm_ggml_backend_cpu_context { - int n_threads; - void * work_data; - size_t work_size; +#define LM_GGML_MAX_BACKENDS_REG 16 + +struct lm_ggml_backend_reg { + char name[128]; + lm_ggml_backend_init_fn init_fn; + lm_ggml_backend_buffer_type_t default_buffer_type; + void * user_data; }; -static const char * lm_ggml_backend_cpu_name(lm_ggml_backend_t backend) { - return "CPU"; +static struct lm_ggml_backend_reg lm_ggml_backend_registry[LM_GGML_MAX_BACKENDS_REG]; +static size_t lm_ggml_backend_registry_count = 0; + +static lm_ggml_backend_t lm_ggml_backend_reg_cpu_init(const char * params, void * user_data); + +static void lm_ggml_backend_registry_init(void) { + static bool initialized = false; + + if (initialized) { + return; + } - UNUSED(backend); + initialized = true; + + lm_ggml_backend_register("CPU", lm_ggml_backend_reg_cpu_init, lm_ggml_backend_cpu_buffer_type(), NULL); + + // add forward decls here to avoid including the backend headers +#ifdef LM_GGML_USE_CUBLAS + extern void lm_ggml_backend_cuda_reg_devices(void); + lm_ggml_backend_cuda_reg_devices(); +#endif + +#ifdef LM_GGML_USE_METAL + extern lm_ggml_backend_t lm_ggml_backend_reg_metal_init(const char * params, void * user_data); + extern lm_ggml_backend_buffer_type_t lm_ggml_backend_metal_buffer_type(void); + lm_ggml_backend_register("Metal", lm_ggml_backend_reg_metal_init, lm_ggml_backend_metal_buffer_type(), NULL); +#endif } -static void lm_ggml_backend_cpu_free(lm_ggml_backend_t backend) { - struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context; - free(cpu_ctx->work_data); - free(cpu_ctx); - free(backend); +void lm_ggml_backend_register(const char * name, lm_ggml_backend_init_fn init_fn, lm_ggml_backend_buffer_type_t default_buffer_type, void * user_data) { + LM_GGML_ASSERT(lm_ggml_backend_registry_count < LM_GGML_MAX_BACKENDS_REG); + + int id = lm_ggml_backend_registry_count; + + lm_ggml_backend_registry[id] = (struct lm_ggml_backend_reg) { + /* .name = */ {0}, + /* .fn = */ init_fn, + /* .default_buffer_type = */ default_buffer_type, + /* .user_data = */ user_data, + }; + + snprintf(lm_ggml_backend_registry[id].name, sizeof(lm_ggml_backend_registry[id].name), "%s", name); + +#ifndef NDEBUG + fprintf(stderr, "%s: registered backend %s\n", __func__, name); +#endif + + lm_ggml_backend_registry_count++; +} + +size_t lm_ggml_backend_reg_get_count(void) { + lm_ggml_backend_registry_init(); + + return lm_ggml_backend_registry_count; +} + +size_t lm_ggml_backend_reg_find_by_name(const char * name) { + lm_ggml_backend_registry_init(); + + for (size_t i = 0; i < lm_ggml_backend_registry_count; i++) { + // TODO: case insensitive in a portable way + if (strcmp(lm_ggml_backend_registry[i].name, name) == 0) { + return i; + } + } 
+ return SIZE_MAX; } +// init from backend:params string +lm_ggml_backend_t lm_ggml_backend_reg_init_backend_from_str(const char * backend_str) { + lm_ggml_backend_registry_init(); + + const char * params = strchr(backend_str, ':'); + char backend_name[128]; + if (params == NULL) { + strcpy(backend_name, backend_str); + params = ""; + } else { + strncpy(backend_name, backend_str, params - backend_str); + backend_name[params - backend_str] = '\0'; + params++; + } + + size_t backend_i = lm_ggml_backend_reg_find_by_name(backend_name); + if (backend_i == SIZE_MAX) { + fprintf(stderr, "%s: backend %s not found\n", __func__, backend_name); + return NULL; + } + + return lm_ggml_backend_reg_init_backend(backend_i, params); +} + +const char * lm_ggml_backend_reg_get_name(size_t i) { + lm_ggml_backend_registry_init(); + + LM_GGML_ASSERT(i < lm_ggml_backend_registry_count); + return lm_ggml_backend_registry[i].name; +} + +lm_ggml_backend_t lm_ggml_backend_reg_init_backend(size_t i, const char * params) { + lm_ggml_backend_registry_init(); + + LM_GGML_ASSERT(i < lm_ggml_backend_registry_count); + return lm_ggml_backend_registry[i].init_fn(params, lm_ggml_backend_registry[i].user_data); +} + +lm_ggml_backend_buffer_type_t lm_ggml_backend_reg_get_default_buffer_type(size_t i) { + lm_ggml_backend_registry_init(); + + LM_GGML_ASSERT(i < lm_ggml_backend_registry_count); + return lm_ggml_backend_registry[i].default_buffer_type; +} + +lm_ggml_backend_buffer_t lm_ggml_backend_reg_alloc_buffer(size_t i, size_t size) { + lm_ggml_backend_registry_init(); + + LM_GGML_ASSERT(i < lm_ggml_backend_registry_count); + return lm_ggml_backend_buft_alloc_buffer(lm_ggml_backend_registry[i].default_buffer_type, size); +} + +// backend CPU + static void * lm_ggml_backend_cpu_buffer_get_base(lm_ggml_backend_buffer_t buffer) { return (void *)buffer->context; } static void lm_ggml_backend_cpu_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) { free(buffer->context); - UNUSED(buffer); + LM_GGML_UNUSED(buffer); +} + +static void lm_ggml_backend_cpu_buffer_set_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor write out of bounds"); + LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + + memcpy((char *)tensor->data + offset, data, size); + + LM_GGML_UNUSED(buffer); +} + +static void lm_ggml_backend_cpu_buffer_get_tensor(lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) { + LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor read out of bounds"); + LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + + memcpy(data, (const char *)tensor->data + offset, size); + + LM_GGML_UNUSED(buffer); +} + +static void lm_ggml_backend_cpu_buffer_cpy_tensor_from(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) { + lm_ggml_backend_tensor_get(src, dst->data, 0, lm_ggml_nbytes(src)); + + LM_GGML_UNUSED(buffer); +} + +static void lm_ggml_backend_cpu_buffer_cpy_tensor_to(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) { + lm_ggml_backend_tensor_set(dst, src->data, 0, lm_ggml_nbytes(src)); + + LM_GGML_UNUSED(buffer); } static struct lm_ggml_backend_buffer_i cpu_backend_buffer_i = { - /* .free_buffer = */ lm_ggml_backend_cpu_buffer_free_buffer, - /* .get_base = */ lm_ggml_backend_cpu_buffer_get_base, - /* .get_alloc_size = */ NULL, 
// defaults to lm_ggml_nbytes - /* .init_tensor = */ NULL, // no initialization required - /* .free_tensor = */ NULL, // no cleanup required + /* .free_buffer = */ lm_ggml_backend_cpu_buffer_free_buffer, + /* .get_base = */ lm_ggml_backend_cpu_buffer_get_base, + /* .init_tensor = */ NULL, // no initialization required + /* .set_tensor = */ lm_ggml_backend_cpu_buffer_set_tensor, + /* .get_tensor = */ lm_ggml_backend_cpu_buffer_get_tensor, + /* .cpy_tensor_from = */ lm_ggml_backend_cpu_buffer_cpy_tensor_from, + /* .cpy_tensor_to = */ lm_ggml_backend_cpu_buffer_cpy_tensor_to, }; // for buffers from ptr, free is not called static struct lm_ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = { - /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed - /* .get_base = */ lm_ggml_backend_cpu_buffer_get_base, - /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes - /* .init_tensor = */ NULL, - /* .free_tensor = */ NULL, + /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed + /* .get_base = */ lm_ggml_backend_cpu_buffer_get_base, + /* .init_tensor = */ NULL, // no initialization required + /* .set_tensor = */ lm_ggml_backend_cpu_buffer_set_tensor, + /* .get_tensor = */ lm_ggml_backend_cpu_buffer_get_tensor, + /* .cpy_tensor_from = */ lm_ggml_backend_cpu_buffer_cpy_tensor_from, + /* .cpy_tensor_to = */ lm_ggml_backend_cpu_buffer_cpy_tensor_to, }; static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512 -static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_alloc_buffer(lm_ggml_backend_t backend, size_t size) { +static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) { size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned void * data = malloc(size); // TODO: maybe use LM_GGML_ALIGNED_MALLOC? 
- return lm_ggml_backend_buffer_init(backend, cpu_backend_buffer_i, data, size); + LM_GGML_ASSERT(data != NULL && "failed to allocate buffer"); + + return lm_ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size); } -static size_t lm_ggml_backend_cpu_get_alignment(lm_ggml_backend_t backend) { +static size_t lm_ggml_backend_cpu_buffer_type_get_alignment(lm_ggml_backend_buffer_type_t buft) { return TENSOR_ALIGNMENT; - UNUSED(backend); -} -static void lm_ggml_backend_cpu_set_tensor_async(lm_ggml_backend_t backend, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor write out of bounds"); - LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + LM_GGML_UNUSED(buft); +} - memcpy((char *)tensor->data + offset, data, size); +static bool lm_ggml_backend_cpu_buffer_type_supports_backend(lm_ggml_backend_buffer_type_t buft, lm_ggml_backend_t backend) { + return lm_ggml_backend_is_cpu(backend); - UNUSED(backend); + LM_GGML_UNUSED(buft); } -static void lm_ggml_backend_cpu_get_tensor_async(lm_ggml_backend_t backend, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) { - LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor read out of bounds"); - LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); - - memcpy(data, (const char *)tensor->data + offset, size); +lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_buffer_type(void) { + static struct lm_ggml_backend_buffer_type lm_ggml_backend_buffer_type_cpu = { + /* .iface = */ { + /* .alloc_buffer = */ lm_ggml_backend_cpu_buffer_type_alloc_buffer, + /* .get_alignment = */ lm_ggml_backend_cpu_buffer_type_get_alignment, + /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes + /* .supports_backend = */ lm_ggml_backend_cpu_buffer_type_supports_backend, + }, + /* .context = */ NULL, + }; - UNUSED(backend); + return &lm_ggml_backend_buffer_type_cpu; } -static void lm_ggml_backend_cpu_synchronize(lm_ggml_backend_t backend) { - UNUSED(backend); -} +struct lm_ggml_backend_cpu_context { + int n_threads; + void * work_data; + size_t work_size; +}; -static void lm_ggml_backend_cpu_cpy_tensor_from(lm_ggml_backend_t backend, struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) { - lm_ggml_backend_tensor_get(src, dst->data, 0, lm_ggml_nbytes(src)); +static const char * lm_ggml_backend_cpu_name(lm_ggml_backend_t backend) { + return "CPU"; - UNUSED(backend); + LM_GGML_UNUSED(backend); } -static void lm_ggml_backend_cpu_cpy_tensor_to(lm_ggml_backend_t backend, struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) { - // for a backend such as CUDA that can queue async calls, it is ok to do this asynchronously, but it may not be the case for other backends - lm_ggml_backend_tensor_set_async(dst, src->data, 0, lm_ggml_nbytes(src)); +static void lm_ggml_backend_cpu_free(lm_ggml_backend_t backend) { + struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context; + free(cpu_ctx->work_data); + free(cpu_ctx); + free(backend); +} - UNUSED(backend); +static lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_get_default_buffer_type(lm_ggml_backend_t backend) { + return lm_ggml_backend_cpu_buffer_type(); + + LM_GGML_UNUSED(backend); } struct lm_ggml_backend_plan_cpu { @@ -303,7 +520,7 @@ static void lm_ggml_backend_cpu_graph_plan_free(lm_ggml_backend_t backend, lm_gg free(cpu_plan->cplan.work_data); free(cpu_plan); - UNUSED(backend); + LM_GGML_UNUSED(backend); } 
static void lm_ggml_backend_cpu_graph_plan_compute(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) { @@ -311,7 +528,7 @@ static void lm_ggml_backend_cpu_graph_plan_compute(lm_ggml_backend_t backend, lm lm_ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan); - UNUSED(backend); + LM_GGML_UNUSED(backend); } static void lm_ggml_backend_cpu_graph_compute(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) { @@ -332,25 +549,25 @@ static void lm_ggml_backend_cpu_graph_compute(lm_ggml_backend_t backend, struct static bool lm_ggml_backend_cpu_supports_op(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op) { return true; - UNUSED(backend); - UNUSED(op); + + LM_GGML_UNUSED(backend); + LM_GGML_UNUSED(op); } static struct lm_ggml_backend_i cpu_backend_i = { - /* .get_name = */ lm_ggml_backend_cpu_name, - /* .free = */ lm_ggml_backend_cpu_free, - /* .alloc_buffer = */ lm_ggml_backend_cpu_alloc_buffer, - /* .get_alignment = */ lm_ggml_backend_cpu_get_alignment, - /* .set_tensor_async = */ lm_ggml_backend_cpu_set_tensor_async, - /* .get_tensor_async = */ lm_ggml_backend_cpu_get_tensor_async, - /* .synchronize = */ lm_ggml_backend_cpu_synchronize, - /* .cpy_tensor_from = */ lm_ggml_backend_cpu_cpy_tensor_from, - /* .cpy_tensor_to = */ lm_ggml_backend_cpu_cpy_tensor_to, - /* .graph_plan_create = */ lm_ggml_backend_cpu_graph_plan_create, - /* .graph_plan_free = */ lm_ggml_backend_cpu_graph_plan_free, - /* .graph_plan_compute = */ lm_ggml_backend_cpu_graph_plan_compute, - /* .graph_compute = */ lm_ggml_backend_cpu_graph_compute, - /* .supports_op = */ lm_ggml_backend_cpu_supports_op, + /* .get_name = */ lm_ggml_backend_cpu_name, + /* .free = */ lm_ggml_backend_cpu_free, + /* .get_default_buffer_type = */ lm_ggml_backend_cpu_get_default_buffer_type, + /* .set_tensor_async = */ NULL, + /* .get_tensor_async = */ NULL, + /* .cpy_tensor_from_async = */ NULL, + /* .cpy_tensor_to_async = */ NULL, + /* .synchronize = */ NULL, + /* .graph_plan_create = */ lm_ggml_backend_cpu_graph_plan_create, + /* .graph_plan_free = */ lm_ggml_backend_cpu_graph_plan_free, + /* .graph_plan_compute = */ lm_ggml_backend_cpu_graph_plan_compute, + /* .graph_compute = */ lm_ggml_backend_cpu_graph_compute, + /* .supports_op = */ lm_ggml_backend_cpu_supports_op, }; lm_ggml_backend_t lm_ggml_backend_cpu_init(void) { @@ -380,6 +597,761 @@ void lm_ggml_backend_cpu_set_n_threads(lm_ggml_backend_t backend_cpu, int n_thre ctx->n_threads = n_threads; } -lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_from_ptr(lm_ggml_backend_t backend_cpu, void * ptr, size_t size) { - return lm_ggml_backend_buffer_init(backend_cpu, cpu_backend_buffer_i_from_ptr, ptr, size); +lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) { + return lm_ggml_backend_buffer_init(lm_ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size); +} + +static lm_ggml_backend_t lm_ggml_backend_reg_cpu_init(const char * params, void * user_data) { + return lm_ggml_backend_cpu_init(); + + LM_GGML_UNUSED(params); + LM_GGML_UNUSED(user_data); +} + + +// scheduler + +#define LM_GGML_MAX_BACKENDS 4 +#define LM_GGML_MAX_SPLITS 256 +#define LM_GGML_MAX_SPLIT_INPUTS 16 + +struct lm_ggml_backend_sched_split { + lm_ggml_tallocr_t tallocr; + int i_start; + int i_end; + struct lm_ggml_tensor * inputs[LM_GGML_MAX_SPLIT_INPUTS]; + int n_inputs; + struct lm_ggml_cgraph graph; +}; + +struct lm_ggml_backend_sched { + int n_backends; + lm_ggml_backend_t backends[LM_GGML_MAX_BACKENDS]; + lm_ggml_tallocr_t 
tallocs[LM_GGML_MAX_BACKENDS]; + + lm_ggml_gallocr_t galloc; + + struct lm_ggml_hash_set hash_set; + lm_ggml_tallocr_t * node_talloc; // [hash_set.size] + struct lm_ggml_tensor * (* node_copies)[LM_GGML_MAX_BACKENDS]; // [hash_set.size][LM_GGML_MAX_BACKENDS] + + struct lm_ggml_cgraph * graph; + struct lm_ggml_backend_sched_split splits[LM_GGML_MAX_SPLITS]; + int n_splits; + + struct lm_ggml_context * ctx; + + // align context_buffer to LM_GGML_MEM_ALIGN + #ifdef _MSC_VER + __declspec(align(LM_GGML_MEM_ALIGN)) + #else + __attribute__((aligned(LM_GGML_MEM_ALIGN))) + #endif + char context_buffer[LM_GGML_MAX_SPLITS*LM_GGML_MAX_SPLIT_INPUTS*sizeof(struct lm_ggml_tensor) + sizeof(struct lm_ggml_cgraph)]; +}; + +#define hash_id(node) lm_ggml_hash_find_or_insert(sched->hash_set, node) +#define node_allocr(node) sched->node_talloc[hash_id(node)] + +static bool lm_ggml_is_view_op(enum lm_ggml_op op) { + return op == LM_GGML_OP_VIEW || op == LM_GGML_OP_RESHAPE || op == LM_GGML_OP_PERMUTE || op == LM_GGML_OP_TRANSPOSE; +} + +// returns the priority of the backend, lower is better +static int sched_backend_prio(lm_ggml_backend_sched_t sched, lm_ggml_backend_t backend) { + for (int i = 0; i < sched->n_backends; i++) { + if (sched->backends[i] == backend) { + return i; + } + } + return INT_MAX; +} + +static int sched_allocr_prio(lm_ggml_backend_sched_t sched, lm_ggml_tallocr_t allocr) { + for (int i = 0; i < sched->n_backends; i++) { + if (sched->tallocs[i] == allocr) { + return i; + } + } + return INT_MAX; +} + +static lm_ggml_backend_t get_buffer_backend(lm_ggml_backend_sched_t sched, lm_ggml_backend_buffer_t buffer) { + if (buffer == NULL) { + return NULL; + } + // find highest prio backend that supports the buffer type + for (int i = 0; i < sched->n_backends; i++) { + if (lm_ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) { + return sched->backends[i]; + } + } + LM_GGML_ASSERT(false && "tensor buffer type not supported by any backend"); +} + +static lm_ggml_backend_t get_allocr_backend(lm_ggml_backend_sched_t sched, lm_ggml_tallocr_t allocr) { + if (allocr == NULL) { + return NULL; + } + // find highest prio backend that supports the buffer type + for (int i = 0; i < sched->n_backends; i++) { + if (sched->tallocs[i] == allocr) { + return sched->backends[i]; + } + } + LM_GGML_UNREACHABLE(); +} + +#if 0 +static char causes[LM_GGML_DEFAULT_GRAPH_SIZE*8 + LM_GGML_MAX_SPLITS*LM_GGML_MAX_SPLIT_INPUTS][128]; // debug, remove +#define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__) +#define GET_CAUSE(node) causes[hash_id(node)] +#else +#define SET_CAUSE(node, ...) +#define GET_CAUSE(node) "" +#endif + +// returns the backend that should be used for the node based on the current locations +static lm_ggml_backend_t sched_backend_from_cur(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node) { + // if the dst tensor is already allocated in a buffer, we must assume that it is critical to keep it there + // ie. kv cache updates + // note that this doesn't allow fallback to CPU. need to add output tensors to the splits to copy the data back to the original backend. 
+ // dst + lm_ggml_backend_t cur_backend = get_buffer_backend(sched, node->buffer); + if (cur_backend != NULL) { + SET_CAUSE(node, "1.dst"); + return cur_backend; + } + + // view_src + if (node->view_src != NULL && get_buffer_backend(sched, node->view_src->buffer) != NULL) { + SET_CAUSE(node, "1.vsrc"); + return get_buffer_backend(sched, node->view_src->buffer); + } + + // src + int cur_prio = INT_MAX; + size_t cur_size = 0; + + for (int i = 0; i < LM_GGML_MAX_SRC; i++) { + const struct lm_ggml_tensor * src = node->src[i]; + if (src == NULL) { + break; + } + lm_ggml_backend_t src_backend = get_buffer_backend(sched, src->buffer); + if (src_backend != NULL) { + int src_prio = sched_backend_prio(sched, src_backend); + size_t src_size = lm_ggml_nbytes(src); + if (src_prio < cur_prio && src_size >= cur_size) { + cur_prio = src_prio; + cur_size = src_size; + cur_backend = src_backend; + SET_CAUSE(node, "1.src%d", i); + } + } + } + return cur_backend; +} + +static char * fmt_size(size_t size) { + static char buffer[128]; + if (size >= 1024*1024) { + sprintf(buffer, "%zuM", size/1024/1024); + } else { + sprintf(buffer, "%zuK", size/1024); + } + return buffer; +} + +static void sched_print_assignments(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) { + int cur_split = 0; + for (int i = 0; i < graph->n_nodes; i++) { + if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) { + lm_ggml_backend_t split_backend = get_allocr_backend(sched, sched->splits[cur_split].tallocr); + fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, lm_ggml_backend_name(split_backend), + sched->splits[cur_split].n_inputs); + for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) { + fprintf(stderr, "[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name, + fmt_size(lm_ggml_nbytes(sched->splits[cur_split].inputs[j]))); + } + fprintf(stderr, "\n"); + cur_split++; + } + struct lm_ggml_tensor * node = graph->nodes[i]; + if (lm_ggml_is_view_op(node->op)) { + continue; + } + lm_ggml_tallocr_t node_allocr = node_allocr(node); + lm_ggml_backend_t node_backend = node_allocr ? get_allocr_backend(sched, node_allocr) : NULL; // FIXME: + fprintf(stderr, "node #%3d (%10.10s): %20.20s (%4.4s) [%4.4s %8.8s]:", i, lm_ggml_op_name(node->op), node->name, + fmt_size(lm_ggml_nbytes(node)), node_allocr ? lm_ggml_backend_name(node_backend) : "NULL", GET_CAUSE(node)); + for (int j = 0; j < LM_GGML_MAX_SRC; j++) { + struct lm_ggml_tensor * src = node->src[j]; + if (src == NULL) { + break; + } + lm_ggml_tallocr_t src_allocr = node_allocr(src); + lm_ggml_backend_t src_backend = src_allocr ? get_allocr_backend(sched, src_allocr) : NULL; + fprintf(stderr, " %20.20s (%4.4s) [%4.4s %8.8s]", src->name, + fmt_size(lm_ggml_nbytes(src)), src_backend ? 
lm_ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src)); + } + fprintf(stderr, "\n"); + } +} + +// creates a copy of the tensor with the same memory layout +static struct lm_ggml_tensor * lm_ggml_dup_tensor_layout(struct lm_ggml_context * ctx, const struct lm_ggml_tensor * tensor) { + struct lm_ggml_tensor * dup = lm_ggml_dup_tensor(ctx, tensor); + for (int i = 0; i < LM_GGML_MAX_DIMS; i++) { + dup->nb[i] = tensor->nb[i]; + } + return dup; +} + +// assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend +// TODO: merge passes +static void sched_split_graph(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) { + // reset state + size_t hash_size = sched->hash_set.size; + memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); + memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size); + memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size); + sched->n_splits = 0; + + struct lm_ggml_init_params params = { + /* .mem_size = */ sizeof(sched->context_buffer), + /* .mem_buffer = */ sched->context_buffer, + /* .no_alloc = */ true + }; + + if (sched->ctx != NULL) { + lm_ggml_free(sched->ctx); + } + + sched->ctx = lm_ggml_init(params); + + // pass 1: assign backends to ops with allocated inputs + for (int i = 0; i < graph->n_leafs; i++) { + struct lm_ggml_tensor * leaf = graph->leafs[i]; + if (node_allocr(leaf) != NULL) { + // do not overwrite user assignments + continue; + } + lm_ggml_backend_t leaf_backend = get_buffer_backend(sched, leaf->buffer); + if (leaf_backend == NULL && leaf->view_src != NULL) { + leaf_backend = get_buffer_backend(sched, leaf->view_src->buffer); + } + if (leaf_backend != NULL) { + node_allocr(leaf) = lm_ggml_backend_sched_get_tallocr(sched, leaf_backend); + } + } + + for (int i = 0; i < graph->n_nodes; i++) { + struct lm_ggml_tensor * node = graph->nodes[i]; + if (node_allocr(node) != NULL) { + // do not overwrite user assignments + continue; + } + lm_ggml_backend_t node_backend = sched_backend_from_cur(sched, node); + if (node_backend != NULL) { + node_allocr(node) = lm_ggml_backend_sched_get_tallocr(sched, node_backend); + } + } + //printf("PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); + + // pass 2: assign backends to ops from current assignments + // TODO: + // - reuse sched_backend_from_cur + for (int i = 0; i < graph->n_nodes; i++) { + struct lm_ggml_tensor * node = graph->nodes[i]; + lm_ggml_tallocr_t node_allocr = node_allocr(node); + if (node_allocr == NULL) { + int cur_prio = INT_MAX; + size_t cur_size = 0; + for (int j = 0; j < LM_GGML_MAX_SRC; j++) { + struct lm_ggml_tensor * src = node->src[j]; + if (src == NULL) { + break; + } + lm_ggml_tallocr_t src_allocr = node_allocr(src); + if (src_allocr != NULL) { + int src_prio = sched_allocr_prio(sched, src_allocr); + size_t src_size = lm_ggml_nbytes(src); + if (src_prio < cur_prio && src_size >= cur_size) { + cur_prio = src_prio; + cur_size = src_size; + node_allocr = src_allocr; + SET_CAUSE(node, "2.src%d", j); + } + } + } + if (node_allocr != NULL) { + node_allocr(node) = node_allocr; + } + } + } + //printf("PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); + + // pass 3: assign backends to remaining src from dst (should only be leafs) + for (int i = 0; i < graph->n_nodes; i++) { + struct lm_ggml_tensor * node = graph->nodes[i]; + lm_ggml_tallocr_t node_allocr = node_allocr(node); + for (int j = 0; j < LM_GGML_MAX_SRC; j++) { + struct lm_ggml_tensor * src = node->src[j]; 
+ if (src == NULL) { + break; + } + lm_ggml_tallocr_t src_allocr = node_allocr(src); + if (src_allocr == NULL) { + node_allocr(src) = node_allocr; + } + } + } + //printf("PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); + + // pass 4: split graph, find tensors that need to be copied + // TODO: + // - when switching from a less preferred backend to a more preferred backend, check if it is possible to move the switch to an earlier point for the same cost + // find first backend + int cur_split = 0; + for (int i = 0; i < graph->n_nodes; i++) { + struct lm_ggml_tensor * node = graph->nodes[i]; + if (node->view_src == NULL) { + sched->splits[0].tallocr = node_allocr(node); + break; + } + } + sched->splits[0].i_start = 0; + sched->splits[0].n_inputs = 0; + memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK + lm_ggml_tallocr_t cur_allocr = sched->splits[0].tallocr; + size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr); + for (int i = 0; i < graph->n_nodes; i++) { + struct lm_ggml_tensor * node = graph->nodes[i]; + + if (lm_ggml_is_view_op(node->op)) { + continue; + } + + lm_ggml_tallocr_t node_allocr = node_allocr(node); + + if (node_allocr != cur_allocr) { + sched->splits[cur_split].i_end = i; + cur_split++; + LM_GGML_ASSERT(cur_split < LM_GGML_MAX_SPLITS); + sched->splits[cur_split].tallocr = node_allocr; + sched->splits[cur_split].i_start = i; + sched->splits[cur_split].n_inputs = 0; + memset(sched->splits[cur_split].inputs, 0, sizeof(sched->splits[cur_split].inputs)); //HACK + cur_allocr = node_allocr; + cur_backend_id = sched_allocr_prio(sched, cur_allocr); + } + + // find inputs that are not on the same backend + for (int j = 0; j < LM_GGML_MAX_SRC; j++) { + struct lm_ggml_tensor * src = node->src[j]; + if (src == NULL) { + break; + } + lm_ggml_tallocr_t src_allocr = node_allocr(src); + if (src_allocr != node_allocr) { + int n_inputs = sched->splits[cur_split].n_inputs++; + LM_GGML_ASSERT(n_inputs < LM_GGML_MAX_SPLIT_INPUTS); + sched->splits[cur_split].inputs[n_inputs] = (struct lm_ggml_tensor *)src; + + // create copies + size_t id = hash_id(src); + if (sched->node_copies[id][cur_backend_id] == NULL) { + struct lm_ggml_tensor * tensor_copy = lm_ggml_dup_tensor_layout(sched->ctx, src); + sched->node_copies[id][cur_backend_id] = tensor_copy; + node_allocr(tensor_copy) = cur_allocr; + lm_ggml_backend_t backend = get_allocr_backend(sched, cur_allocr); + lm_ggml_format_name(tensor_copy, "%s#%s", lm_ggml_backend_name(backend), src->name); + } + node->src[j] = sched->node_copies[id][cur_backend_id]; + } + } + } + sched->splits[cur_split].i_end = graph->n_nodes; + sched->n_splits = cur_split + 1; + + //fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); fflush(stdout); + +#if 1 + // sanity check: all sources should have the same backend as the node + for (int i = 0; i < graph->n_nodes; i++) { + struct lm_ggml_tensor * node = graph->nodes[i]; + lm_ggml_tallocr_t node_allocr = node_allocr(node); + if (node_allocr == NULL) { + fprintf(stderr, "!!!!!!! %s has no backend\n", node->name); + } + for (int j = 0; j < LM_GGML_MAX_SRC; j++) { + struct lm_ggml_tensor * src = node->src[j]; + if (src == NULL) { + break; + } + lm_ggml_tallocr_t src_allocr = node_allocr(src); + if (src_allocr != node_allocr /* && src_backend != NULL */) { // ignore nulls for now + fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n", + node->name, node_allocr ? 
lm_ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL", + j, src->name, src_allocr ? lm_ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL"); + } + } + } +#endif + + // create copies of the graph for each split + // FIXME: avoid this copy, pass split inputs to lm_ggml_gallocr_alloc_graph_n in some other way + struct lm_ggml_cgraph * graph_copy = lm_ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*LM_GGML_MAX_SPLIT_INPUTS, false); + for (int i = 0; i < sched->n_splits; i++) { + struct lm_ggml_backend_sched_split * split = &sched->splits[i]; + split->graph = lm_ggml_graph_view(graph, split->i_start, split->i_end); + + // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split + for (int j = 0; j < split->n_inputs; j++) { + struct lm_ggml_tensor * input = split->inputs[j]; + struct lm_ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_allocr_prio(sched, split->tallocr)]; + input_cpy->src[0] = input; + graph_copy->nodes[graph_copy->n_nodes++] = input_cpy; + } + + for (int j = split->i_start; j < split->i_end; j++) { + graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j]; + } + } + sched->graph = graph_copy; +} + +static void sched_alloc_splits(lm_ggml_backend_sched_t sched) { + lm_ggml_gallocr_alloc_graph_n( + sched->galloc, + sched->graph, + sched->hash_set, + sched->node_talloc); +} + +static void sched_compute_splits(lm_ggml_backend_sched_t sched) { + uint64_t copy_us[LM_GGML_MAX_BACKENDS] = {0}; + uint64_t compute_us[LM_GGML_MAX_BACKENDS] = {0}; + + struct lm_ggml_backend_sched_split * splits = sched->splits; + + for (int i = 0; i < sched->n_splits; i++) { + struct lm_ggml_backend_sched_split * split = &splits[i]; + lm_ggml_backend_t split_backend = get_allocr_backend(sched, split->tallocr); + int split_backend_id = sched_backend_prio(sched, split_backend); + + // copy the input tensors to the split backend + uint64_t copy_start_us = lm_ggml_time_us(); + for (int j = 0; j < split->n_inputs; j++) { + struct lm_ggml_tensor * input = split->inputs[j]; + struct lm_ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_backend_prio(sched, split_backend)]; + if (input->buffer == NULL) { + if (input->view_src == NULL) { + fprintf(stderr, "input %s has no buffer and no view_src\n", input->name); + exit(1); + } + // FIXME: may need to use the sched buffer instead + lm_ggml_backend_view_init(input->view_src->buffer, input); + } + if (input_cpy->buffer == NULL) { + fprintf(stderr, "input_cpy %s has no buffer\n", input_cpy->name); + exit(1); + } + //LM_GGML_ASSERT(input->buffer->backend != input_cpy->buffer->backend); + //LM_GGML_ASSERT(input_cpy->buffer->backend == split_backend); + lm_ggml_backend_tensor_copy(input, input_cpy); + } + // lm_ggml_backend_synchronize(split_backend); + int64_t copy_end_us = lm_ggml_time_us(); + copy_us[split_backend_id] += copy_end_us - copy_start_us; + +#if 0 + char split_filename[LM_GGML_MAX_NAME]; + snprintf(split_filename, LM_GGML_MAX_NAME, "split_%i_%s.dot", i, lm_ggml_backend_name(split_backend)); + lm_ggml_graph_dump_dot(split->graph, NULL, split_filename); +#endif + + uint64_t compute_start_us = lm_ggml_time_us(); + lm_ggml_backend_graph_compute(split_backend, &split->graph); + // lm_ggml_backend_synchronize(split_backend); + uint64_t compute_end_us = lm_ggml_time_us(); + compute_us[split_backend_id] += compute_end_us - compute_start_us; + } + +#if 0 + // per-backend timings + fprintf(stderr, "sched_compute_splits times (%d splits):\n", 
sched->n_splits); + for (int i = 0; i < sched->n_backends; i++) { + if (copy_us[i] > 0 || compute_us[i] > 0) { + fprintf(stderr, "\t%5.5s: %lu us copy, %lu us compute\n", lm_ggml_backend_name(sched->backends[i]), copy_us[i], compute_us[i]); + } + } +#endif +} + +static void sched_reset(lm_ggml_backend_sched_t sched) { + for (int i = 0; i < sched->n_backends; i++) { + lm_ggml_tallocr_reset(sched->tallocs[i]); + } +} + +lm_ggml_backend_sched_t lm_ggml_backend_sched_new(lm_ggml_backend_t * backends, int n_backends) { + LM_GGML_ASSERT(n_backends <= LM_GGML_MAX_BACKENDS); + + struct lm_ggml_backend_sched * sched = malloc(sizeof(struct lm_ggml_backend_sched)); + memset(sched, 0, sizeof(struct lm_ggml_backend_sched)); + + sched->n_backends = n_backends; + for (int i = 0; i < n_backends; i++) { + sched->backends[i] = backends[i]; + } + + sched->galloc = lm_ggml_gallocr_new(); + + // init measure allocs for each backend + for (int i = 0; i < n_backends; i++) { + sched->tallocs[i] = lm_ggml_tallocr_new_measure_from_backend(backends[i]); + } + + return sched; +} + +void lm_ggml_backend_sched_free(lm_ggml_backend_sched_t sched) { + if (sched == NULL) { + return; + } + for (int i = 0; i < sched->n_backends; i++) { + lm_ggml_tallocr_free(sched->tallocs[i]); + } + lm_ggml_gallocr_free(sched->galloc); + free(sched->hash_set.keys); + free(sched->node_talloc); + free(sched->node_copies); + free(sched); +} + +void lm_ggml_backend_sched_init_measure(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * measure_graph) { + // initialize hash tables + size_t hash_size = measure_graph->visited_hash_table.size + LM_GGML_MAX_SPLITS*LM_GGML_MAX_SPLIT_INPUTS; + sched->hash_set.size = hash_size; + sched->hash_set.keys = malloc(sizeof(sched->hash_set.keys[0]) * hash_size); + sched->node_talloc = malloc(sizeof(sched->node_talloc[0]) * hash_size); + sched->node_copies = malloc(sizeof(sched->node_copies[0]) * hash_size); + + sched_split_graph(sched, measure_graph); + sched_alloc_splits(sched); + + // allocate buffers and reset allocators + for (int i = 0; i < sched->n_backends; i++) { + size_t size = lm_ggml_tallocr_max_size(sched->tallocs[i]); + lm_ggml_tallocr_free(sched->tallocs[i]); + sched->tallocs[i] = lm_ggml_tallocr_new_from_backend(sched->backends[i], size); + } + + sched_reset(sched); +} + +void lm_ggml_backend_sched_graph_compute(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) { + LM_GGML_ASSERT(sched->hash_set.size >= graph->visited_hash_table.size + LM_GGML_MAX_SPLITS*LM_GGML_MAX_SPLIT_INPUTS); + + sched_split_graph(sched, graph); + sched_alloc_splits(sched); + sched_compute_splits(sched); + sched_reset(sched); +} + +lm_ggml_tallocr_t lm_ggml_backend_sched_get_tallocr(lm_ggml_backend_sched_t sched, lm_ggml_backend_t backend) { + int backend_index = sched_backend_prio(sched, backend); + return sched->tallocs[backend_index]; +} + +lm_ggml_backend_buffer_t lm_ggml_backend_sched_get_buffer(lm_ggml_backend_sched_t sched, lm_ggml_backend_t backend) { + int backend_index = sched_backend_prio(sched, backend); + return lm_ggml_tallocr_get_buffer(sched->tallocs[backend_index]); +} + +void lm_ggml_backend_sched_set_node_backend(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node, lm_ggml_backend_t backend) { + int backend_index = sched_backend_prio(sched, backend); + LM_GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); + node_allocr(node) = sched->tallocs[backend_index]; +} + +// utils +void lm_ggml_backend_view_init(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * 
tensor) { + LM_GGML_ASSERT(tensor->buffer == NULL); + LM_GGML_ASSERT(tensor->data == NULL); + LM_GGML_ASSERT(tensor->view_src != NULL); + LM_GGML_ASSERT(tensor->view_src->buffer != NULL); + LM_GGML_ASSERT(tensor->view_src->data != NULL); + + tensor->buffer = buffer; + tensor->data = (char *)tensor->view_src->data + tensor->view_offs; + tensor->backend = tensor->view_src->backend; + lm_ggml_backend_buffer_init_tensor(buffer, tensor); +} + +void lm_ggml_backend_tensor_alloc(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, void * addr) { + LM_GGML_ASSERT(tensor->buffer == NULL); + LM_GGML_ASSERT(tensor->data == NULL); + LM_GGML_ASSERT(tensor->view_src == NULL); + LM_GGML_ASSERT(addr >= lm_ggml_backend_buffer_get_base(buffer)); + LM_GGML_ASSERT((char *)addr + lm_ggml_backend_buffer_get_alloc_size(buffer, tensor) <= + (char *)lm_ggml_backend_buffer_get_base(buffer) + lm_ggml_backend_buffer_get_size(buffer)); + + tensor->buffer = buffer; + tensor->data = addr; + lm_ggml_backend_buffer_init_tensor(buffer, tensor); +} + +static struct lm_ggml_tensor * graph_dup_tensor(struct lm_ggml_hash_set hash_set, struct lm_ggml_tensor ** node_copies, + struct lm_ggml_context * ctx_allocated, struct lm_ggml_context * ctx_unallocated, struct lm_ggml_tensor * src) { + + LM_GGML_ASSERT(src != NULL); + LM_GGML_ASSERT(src->data && "graph must be allocated"); + + size_t id = lm_ggml_hash_insert(hash_set, src); + if (id == LM_GGML_HASHTABLE_ALREADY_EXISTS) { + return node_copies[lm_ggml_hash_find(hash_set, src)]; + } + + struct lm_ggml_tensor * dst = lm_ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src); + if (src->view_src != NULL) { + dst->view_src = graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src); + dst->view_offs = src->view_offs; + } + dst->op = src->op; + memcpy(dst->op_params, src->op_params, sizeof(dst->op_params)); + lm_ggml_set_name(dst, src->name); + + // copy src + for (int i = 0; i < LM_GGML_MAX_SRC; i++) { + struct lm_ggml_tensor * s = src->src[i]; + if (s == NULL) { + break; + } + dst->src[i] = graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s); + } + + node_copies[id] = dst; + return dst; +} + +static void graph_init_tensor(struct lm_ggml_hash_set hash_set, struct lm_ggml_tensor ** node_copies, bool * node_init, struct lm_ggml_tensor * src) { + size_t id = lm_ggml_hash_find(hash_set, src); + if (node_init[id]) { + return; + } + node_init[id] = true; + + struct lm_ggml_tensor * dst = node_copies[id]; + if (dst->view_src != NULL) { + lm_ggml_backend_view_init(dst->view_src->buffer, dst); + } + else { + lm_ggml_backend_tensor_copy(src, dst); + } + + // init src + for (int i = 0; i < LM_GGML_MAX_SRC; i++) { + struct lm_ggml_tensor * s = src->src[i]; + if (s == NULL) { + break; + } + graph_init_tensor(hash_set, node_copies, node_init, s); + } +} + +struct lm_ggml_backend_graph_copy lm_ggml_backend_graph_copy(lm_ggml_backend_t backend, struct lm_ggml_cgraph * graph) { + struct lm_ggml_hash_set hash_set = { + /* .size = */ graph->visited_hash_table.size, + /* .keys = */ calloc(sizeof(hash_set.keys[0]) * graph->visited_hash_table.size, 1) + }; + struct lm_ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]) * hash_set.size, 1); + bool * node_init = calloc(sizeof(node_init[0]) * hash_set.size, 1); + + struct lm_ggml_init_params params = { + /* .mem_size = */ lm_ggml_tensor_overhead()*hash_set.size + lm_ggml_graph_overhead_custom(graph->size, false), + /* .mem_buffer = */ NULL, + /* 
.no_alloc = */ true + }; + + struct lm_ggml_context * ctx_allocated = lm_ggml_init(params); + struct lm_ggml_context * ctx_unallocated = lm_ggml_init(params); + + // dup nodes + for (int i = 0; i < graph->n_nodes; i++) { + struct lm_ggml_tensor * node = graph->nodes[i]; + graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node); + } + + // allocate nodes + lm_ggml_backend_buffer_t buffer = lm_ggml_backend_alloc_ctx_tensors(ctx_allocated, backend); + + //printf("copy buffer size: %zu MB\n", lm_ggml_backend_buffer_get_size(buffer) / 1024 / 1024); + + // copy data and init views + for (int i = 0; i < graph->n_nodes; i++) { + struct lm_ggml_tensor * node = graph->nodes[i]; + graph_init_tensor(hash_set, node_copies, node_init, node); + } + + // build graph copy + struct lm_ggml_cgraph * graph_copy = lm_ggml_new_graph_custom(ctx_allocated, graph->size, false); + for (int i = 0; i < graph->n_nodes; i++) { + struct lm_ggml_tensor * node = graph->nodes[i]; + struct lm_ggml_tensor * node_copy = node_copies[lm_ggml_hash_find(hash_set, node)]; + graph_copy->nodes[i] = node_copy; + } + graph_copy->n_nodes = graph->n_nodes; + + free(hash_set.keys); + free(node_copies); + free(node_init); + + return (struct lm_ggml_backend_graph_copy) { + /* .buffer = */ buffer, + /* .ctx_allocated = */ ctx_allocated, + /* .ctx_unallocated = */ ctx_unallocated, + /* .graph = */ graph_copy, + }; +} + +void lm_ggml_backend_graph_copy_free(struct lm_ggml_backend_graph_copy copy) { + lm_ggml_backend_buffer_free(copy.buffer); + lm_ggml_free(copy.ctx_allocated); + lm_ggml_free(copy.ctx_unallocated); +} + +void lm_ggml_backend_compare_graph_backend(lm_ggml_backend_t backend1, lm_ggml_backend_t backend2, struct lm_ggml_cgraph * graph, lm_ggml_backend_eval_callback callback, void * user_data) { + struct lm_ggml_backend_graph_copy copy = lm_ggml_backend_graph_copy(backend2, graph); + struct lm_ggml_cgraph * g1 = graph; + struct lm_ggml_cgraph * g2 = copy.graph; + + assert(g1->n_nodes == g2->n_nodes); + + for (int i = 0; i < g1->n_nodes; i++) { + //printf("eval %d/%d\n", i, g1->n_nodes); + struct lm_ggml_tensor * t1 = g1->nodes[i]; + struct lm_ggml_tensor * t2 = g2->nodes[i]; + + assert(t1->op == t2->op && lm_ggml_are_same_layout(t1, t2)); + + struct lm_ggml_cgraph g1v = lm_ggml_graph_view(g1, i, i + 1); + struct lm_ggml_cgraph g2v = lm_ggml_graph_view(g2, i, i + 1); + + lm_ggml_backend_graph_compute(backend1, &g1v); + lm_ggml_backend_graph_compute(backend2, &g2v); + + if (lm_ggml_is_view_op(t1->op)) { + continue; + } + + // compare results, calculate rms etc + if (!callback(i, t1, t2, user_data)) { + break; + } + } + + lm_ggml_backend_graph_copy_free(copy); } diff --git a/cpp/ggml-backend.h b/cpp/ggml-backend.h index 33cc8a5..63fa013 100644 --- a/cpp/ggml-backend.h +++ b/cpp/ggml-backend.h @@ -1,115 +1,50 @@ #pragma once #include "ggml.h" +#include "ggml-alloc.h" #ifdef __cplusplus extern "C" { #endif - struct lm_ggml_backend; - struct lm_ggml_backend_buffer; - // type-erased backend-specific types / wrappers - typedef void * lm_ggml_backend_context_t; - typedef void * lm_ggml_backend_graph_plan_t; - typedef void * lm_ggml_backend_buffer_context_t; - - // avoid accessing internals of these types - typedef struct lm_ggml_backend * lm_ggml_backend_t; + typedef struct lm_ggml_backend_buffer_type * lm_ggml_backend_buffer_type_t; typedef struct lm_ggml_backend_buffer * lm_ggml_backend_buffer_t; + typedef struct lm_ggml_backend * lm_ggml_backend_t; + typedef void * lm_ggml_backend_graph_plan_t; // - // backend 
buffer + // Backend buffer // - struct lm_ggml_backend_buffer_i { - void (*free_buffer) (lm_ggml_backend_buffer_t buffer); - void * (*get_base) (lm_ggml_backend_buffer_t buffer); // get base pointer - size_t (*get_alloc_size)(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor); // pre-allocation callback - void (*init_tensor) (lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor); // post-allocation callback - void (*free_tensor) (lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor); // pre-free callback - }; - - // TODO: hide behind API - struct lm_ggml_backend_buffer { - struct lm_ggml_backend_buffer_i iface; - - lm_ggml_backend_t backend; - lm_ggml_backend_buffer_context_t context; - - size_t size; - }; - - // backend buffer functions - LM_GGML_API lm_ggml_backend_buffer_t lm_ggml_backend_buffer_init( - struct lm_ggml_backend * backend, - struct lm_ggml_backend_buffer_i iface, - lm_ggml_backend_buffer_context_t context, - size_t size); + // buffer type + LM_GGML_API lm_ggml_backend_buffer_t lm_ggml_backend_buft_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size); + LM_GGML_API size_t lm_ggml_backend_buft_get_alignment (lm_ggml_backend_buffer_type_t buft); + LM_GGML_API size_t lm_ggml_backend_buft_get_alloc_size(lm_ggml_backend_buffer_type_t buft, struct lm_ggml_tensor * tensor); + LM_GGML_API bool lm_ggml_backend_buft_supports_backend(lm_ggml_backend_buffer_type_t buft, lm_ggml_backend_t backend); + // buffer LM_GGML_API void lm_ggml_backend_buffer_free (lm_ggml_backend_buffer_t buffer); - LM_GGML_API size_t lm_ggml_backend_buffer_get_alignment (lm_ggml_backend_buffer_t buffer); LM_GGML_API void * lm_ggml_backend_buffer_get_base (lm_ggml_backend_buffer_t buffer); LM_GGML_API size_t lm_ggml_backend_buffer_get_size (lm_ggml_backend_buffer_t buffer); - LM_GGML_API size_t lm_ggml_backend_buffer_get_alloc_size(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor); LM_GGML_API void lm_ggml_backend_buffer_init_tensor (lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor); - LM_GGML_API void lm_ggml_backend_buffer_free_tensor (lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor); + LM_GGML_API size_t lm_ggml_backend_buffer_get_alignment (lm_ggml_backend_buffer_t buffer); + LM_GGML_API size_t lm_ggml_backend_buffer_get_alloc_size(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor); + LM_GGML_API lm_ggml_backend_buffer_type_t lm_ggml_backend_buffer_type(lm_ggml_backend_buffer_t buffer); // - // backend + // Backend // - struct lm_ggml_backend_i { - const char * (*get_name)(lm_ggml_backend_t backend); - - void (*free)(lm_ggml_backend_t backend); - - // buffer allocation - lm_ggml_backend_buffer_t (*alloc_buffer)(lm_ggml_backend_t backend, size_t size); - - // get buffer alignment - size_t (*get_alignment)(lm_ggml_backend_t backend); - - // tensor data access - // these functions can be asynchronous, helper functions are provided for synchronous access that automatically call synchronize - void (*set_tensor_async)(lm_ggml_backend_t backend, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size); - void (*get_tensor_async)(lm_ggml_backend_t backend, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size); - void (*synchronize) (lm_ggml_backend_t backend); - - // (optional) copy tensor between different backends, allow for single-copy tranfers - void (*cpy_tensor_from)(lm_ggml_backend_t backend, struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst); - void 
(*cpy_tensor_to) (lm_ggml_backend_t backend, struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst); - - // compute graph with a plan - lm_ggml_backend_graph_plan_t (*graph_plan_create) (lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph); - void (*graph_plan_free) (lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan); - void (*graph_plan_compute)(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan); - - // compute graph without a plan - void (*graph_compute)(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph); - - // check if the backend supports an operation - bool (*supports_op)(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op); - }; - - // TODO: hide behind API - struct lm_ggml_backend { - struct lm_ggml_backend_i iface; - - lm_ggml_backend_context_t context; - }; - - // backend helper functions - LM_GGML_API lm_ggml_backend_t lm_ggml_get_backend(const struct lm_ggml_tensor * tensor); LM_GGML_API const char * lm_ggml_backend_name(lm_ggml_backend_t backend); LM_GGML_API void lm_ggml_backend_free(lm_ggml_backend_t backend); - LM_GGML_API lm_ggml_backend_buffer_t lm_ggml_backend_alloc_buffer(lm_ggml_backend_t backend, size_t size); + LM_GGML_API lm_ggml_backend_buffer_type_t lm_ggml_backend_get_default_buffer_type(lm_ggml_backend_t backend); + LM_GGML_API lm_ggml_backend_buffer_t lm_ggml_backend_alloc_buffer(lm_ggml_backend_t backend, size_t size); + LM_GGML_API size_t lm_ggml_backend_get_alignment(lm_ggml_backend_t backend); - LM_GGML_API size_t lm_ggml_backend_get_alignment(lm_ggml_backend_t backend); - - LM_GGML_API void lm_ggml_backend_tensor_set_async( struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size); - LM_GGML_API void lm_ggml_backend_tensor_get_async(const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size); + LM_GGML_API void lm_ggml_backend_tensor_set_async(lm_ggml_backend_t backend, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size); + LM_GGML_API void lm_ggml_backend_tensor_get_async(lm_ggml_backend_t backend, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size); LM_GGML_API void lm_ggml_backend_tensor_set( struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size); LM_GGML_API void lm_ggml_backend_tensor_get(const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size); @@ -125,6 +60,7 @@ extern "C" { // tensor copy between different backends LM_GGML_API void lm_ggml_backend_tensor_copy(struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst); + LM_GGML_API void lm_ggml_backend_tensor_copy_async(lm_ggml_backend_t backend, struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst); // automatic fallback to sync copy // // CPU backend @@ -133,10 +69,112 @@ extern "C" { LM_GGML_API lm_ggml_backend_t lm_ggml_backend_cpu_init(void); LM_GGML_API bool lm_ggml_backend_is_cpu(lm_ggml_backend_t backend); - LM_GGML_API void lm_ggml_backend_cpu_set_n_threads(lm_ggml_backend_t backend_cpu, int n_threads); - LM_GGML_API lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_from_ptr(lm_ggml_backend_t backend_cpu, void * ptr, size_t size); + // Create a backend buffer from an existing pointer + LM_GGML_API lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size); + + LM_GGML_API lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_buffer_type(void); + + // + // Backend registry + // + + // The backend registry is a registry of all the available backends, and allows initializing 
backends in a generic way + + LM_GGML_API size_t lm_ggml_backend_reg_get_count(void); + LM_GGML_API size_t lm_ggml_backend_reg_find_by_name(const char * name); + LM_GGML_API lm_ggml_backend_t lm_ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is name[:params] + LM_GGML_API const char * lm_ggml_backend_reg_get_name(size_t i); + LM_GGML_API lm_ggml_backend_t lm_ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific + LM_GGML_API lm_ggml_backend_buffer_type_t lm_ggml_backend_reg_get_default_buffer_type(size_t i); + LM_GGML_API lm_ggml_backend_buffer_t lm_ggml_backend_reg_alloc_buffer(size_t i, size_t size); + + // + // Backend scheduler + // + + // The backend scheduler allows for multiple backends to be used together + // Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends + // The backends are selected based on: + // - the backend that supports the operation + // - the location of the pre-allocated tensors (e.g. the weights) + /* + Example usage: + + sched = lm_ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, num_backends); + // sched is initialized with measure allocators and cannot be used until allocated with a measure graph + + // initialize buffers from a measure graph + measure_graph = build_graph(sched); // use the allocr to allocate inputs as needed + + // in build_graph: + build_graph(...) { + // allocating tensors in a specific backend (optional, recommended: pre-allocate inputs in a different buffer) + alloc_cpu = lm_ggml_backend_sched_get_allocr(sched, backend_cpu); + lm_ggml_allocr_alloc(alloc_cpu, tensor); + + // manually assigning nodes to a backend (optional, shouldn't be needed in most cases) + struct lm_ggml_tensor * node = lm_ggml_mul_mat(ctx, ...); + lm_ggml_backend_sched_set_node_backend(sched, node, backend_gpu); + } + + // allocate backend buffers from measure graph + lm_ggml_backend_sched_init_measure(sched, measure_graph); + + // the scheduler is now ready to compute graphs + + // compute + graph = build_graph(sched); + lm_ggml_backend_sched_graph_compute(sched, graph); + */ + + struct lm_ggml_backend_sched; + typedef struct lm_ggml_backend_sched * lm_ggml_backend_sched_t; + + // Initialize a backend scheduler + LM_GGML_API lm_ggml_backend_sched_t lm_ggml_backend_sched_new(lm_ggml_backend_t * backends, int n_backends); + + LM_GGML_API void lm_ggml_backend_sched_free(lm_ggml_backend_sched_t sched); + + // Initialize backend buffers from a measure graph + LM_GGML_API void lm_ggml_backend_sched_init_measure(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * measure_graph); + + LM_GGML_API lm_ggml_tallocr_t lm_ggml_backend_sched_get_tallocr(lm_ggml_backend_sched_t sched, lm_ggml_backend_t backend); + LM_GGML_API lm_ggml_backend_buffer_t lm_ggml_backend_sched_get_buffer (lm_ggml_backend_sched_t sched, lm_ggml_backend_t backend); + + LM_GGML_API void lm_ggml_backend_sched_set_node_backend(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node, lm_ggml_backend_t backend); + + // Allocate a graph on the backend scheduler + LM_GGML_API void lm_ggml_backend_sched_graph_compute( + lm_ggml_backend_sched_t sched, + struct lm_ggml_cgraph * graph); + + + // + // Utils + // + + struct lm_ggml_backend_graph_copy { + lm_ggml_backend_buffer_t buffer; + struct lm_ggml_context * ctx_allocated; + struct lm_ggml_context * ctx_unallocated; + struct lm_ggml_cgraph * graph; + }; + + // Copy a graph to a different backend + LM_GGML_API struct 
lm_ggml_backend_graph_copy lm_ggml_backend_graph_copy(lm_ggml_backend_t backend, struct lm_ggml_cgraph * graph); + LM_GGML_API void lm_ggml_backend_graph_copy_free(struct lm_ggml_backend_graph_copy copy); + + typedef bool (*lm_ggml_backend_eval_callback)(int node_index, struct lm_ggml_tensor * t1, struct lm_ggml_tensor * t2, void * user_data); + + // Compare the output of two backends + LM_GGML_API void lm_ggml_backend_compare_graph_backend(lm_ggml_backend_t backend1, lm_ggml_backend_t backend2, struct lm_ggml_cgraph * graph, lm_ggml_backend_eval_callback callback, void * user_data); + + // Tensor initialization + LM_GGML_API void lm_ggml_backend_tensor_alloc(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, void * addr); + LM_GGML_API void lm_ggml_backend_view_init(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor); + #ifdef __cplusplus } diff --git a/cpp/ggml-impl.h b/cpp/ggml-impl.h index 07881b7..998c0ba 100644 --- a/cpp/ggml-impl.h +++ b/cpp/ggml-impl.h @@ -39,12 +39,6 @@ extern "C" { #endif #endif -#undef MIN -#undef MAX - -#define MIN(a, b) ((a) < (b) ? (a) : (b)) -#define MAX(a, b) ((a) > (b) ? (a) : (b)) - // 16-bit float // on Arm, we use __fp16 // on x86, we use uint16_t @@ -230,7 +224,19 @@ inline static float lm_ggml_lookup_fp16_to_fp32(lm_ggml_fp16_t f) { #endif - // TODO: backend v2 PR +#define LM_GGML_HASHTABLE_FULL ((size_t)-1) +#define LM_GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2) + +bool lm_ggml_hash_contains (const struct lm_ggml_hash_set hash_set, struct lm_ggml_tensor * key); + +// returns LM_GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted +size_t lm_ggml_hash_find (const struct lm_ggml_hash_set hash_set, struct lm_ggml_tensor * key); + +// returns LM_GGML_HASHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full +size_t lm_ggml_hash_insert ( struct lm_ggml_hash_set hash_set, struct lm_ggml_tensor * key); + +// return index, asserts if table is full +size_t lm_ggml_hash_find_or_insert( struct lm_ggml_hash_set hash_set, struct lm_ggml_tensor * key); #ifdef __cplusplus } diff --git a/cpp/ggml-metal-llama.metal b/cpp/ggml-metal-llama.metal index 7c35f23..2f8ea22 100644 --- a/cpp/ggml-metal-llama.metal +++ b/cpp/ggml-metal-llama.metal @@ -3,6 +3,8 @@ using namespace metal; #define MAX(x, y) ((x) > (y) ? (x) : (y)) +#define MIN(x, y) ((x) < (y) ? 
(x) : (y)) +#define SWAP(x, y) { auto tmp = (x); (x) = (y); (y) = tmp; } #define QK4_0 32 #define QR4_0 2 @@ -39,8 +41,15 @@ typedef struct { int8_t qs[QK8_0]; // quants } block_q8_0; -// general-purpose kernel for addition of two tensors -// pros: works for non-contiguous tensors, supports broadcast across dims 1, 2 and 3 +#define N_SIMDWIDTH 32 // assuming SIMD group size is 32 + +enum ggml_sort_order { + GGML_SORT_ASC, + GGML_SORT_DESC, +}; + +// general-purpose kernel for addition, multiplication and division of two tensors +// pros: works for non-contiguous tensors, supports broadcast across all dims // cons: not very efficient kernel void kernel_add( device const char * src0, @@ -81,16 +90,111 @@ kernel void kernel_add( const int64_t i12 = i02 % ne12; const int64_t i11 = i01 % ne11; - device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01 + tpitg.x*nb00; - device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11 + tpitg.x*nb10; - device char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1 + tpitg.x*nb0; + device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01; + device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11; + device char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1; for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) { - ((device float *)dst_ptr)[0] = ((device float *)src0_ptr)[0] + ((device float *)src1_ptr)[0]; + const int i10 = i0 % ne10; + *((device float *)(dst_ptr + i0*nb0)) = *((device float *)(src0_ptr + i0*nb00)) + *((device float *)(src1_ptr + i10*nb10)); + } +} + +kernel void kernel_mul( + device const char * src0, + device const char * src1, + device char * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant int64_t & nb00, + constant int64_t & nb01, + constant int64_t & nb02, + constant int64_t & nb03, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t & ne13, + constant int64_t & nb10, + constant int64_t & nb11, + constant int64_t & nb12, + constant int64_t & nb13, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant int64_t & nb0, + constant int64_t & nb1, + constant int64_t & nb2, + constant int64_t & nb3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i03 = tgpig.z; + const int64_t i02 = tgpig.y; + const int64_t i01 = tgpig.x; + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + + device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01; + device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11; + device char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1; - src0_ptr += ntg.x*nb00; - src1_ptr += ntg.x*nb10; - dst_ptr += ntg.x*nb0; + for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) { + const int i10 = i0 % ne10; + *((device float *)(dst_ptr + i0*nb0)) = *((device float *)(src0_ptr + i0*nb00)) * *((device float *)(src1_ptr + i10*nb10)); + } +} + +kernel void kernel_div( + device const char * src0, + device const char * src1, + device char * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant int64_t & nb00, + constant int64_t & nb01, + constant int64_t & nb02, + constant int64_t & nb03, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t 
& ne13, + constant int64_t & nb10, + constant int64_t & nb11, + constant int64_t & nb12, + constant int64_t & nb13, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant int64_t & nb0, + constant int64_t & nb1, + constant int64_t & nb2, + constant int64_t & nb3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i03 = tgpig.z; + const int64_t i02 = tgpig.y; + const int64_t i01 = tgpig.x; + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + + device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01; + device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11; + device char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1; + + for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) { + const int i10 = i0 % ne10; + *((device float *)(dst_ptr + i0*nb0)) = *((device float *)(src0_ptr + i0*nb00)) / *((device float *)(src1_ptr + i10*nb10)); } } @@ -105,23 +209,22 @@ kernel void kernel_add_row( dst[tpig] = src0[tpig] + src1[tpig % nb]; } -kernel void kernel_mul( +kernel void kernel_mul_row( device const float4 * src0, device const float4 * src1, device float4 * dst, + constant int64_t & nb [[buffer(27)]], uint tpig[[thread_position_in_grid]]) { - dst[tpig] = src0[tpig] * src1[tpig]; + dst[tpig] = src0[tpig] * src1[tpig % nb]; } -// assumption: src1 is a row -// broadcast src1 into src0 -kernel void kernel_mul_row( +kernel void kernel_div_row( device const float4 * src0, device const float4 * src1, device float4 * dst, - constant int64_t & nb, + constant int64_t & nb [[buffer(27)]], uint tpig[[thread_position_in_grid]]) { - dst[tpig] = src0[tpig] * src1[tpig % nb]; + dst[tpig] = src0[tpig] / src1[tpig % nb]; } kernel void kernel_scale( @@ -162,6 +265,54 @@ kernel void kernel_sqr( dst[tpig] = src0[tpig] * src0[tpig]; } +kernel void kernel_sum_rows( + device const float * src0, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant int64_t & nb00, + constant int64_t & nb01, + constant int64_t & nb02, + constant int64_t & nb03, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t & ne13, + constant int64_t & nb10, + constant int64_t & nb11, + constant int64_t & nb12, + constant int64_t & nb13, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant int64_t & nb0, + constant int64_t & nb1, + constant int64_t & nb2, + constant int64_t & nb3, + uint3 tpig[[thread_position_in_grid]]) { + int64_t i3 = tpig.z; + int64_t i2 = tpig.y; + int64_t i1 = tpig.x; + + if (i3 >= ne03 || i2 >= ne02 || i1 >= ne01) { + return; + } + + device const float * src_row = (device const float *) ((device const char *) src0 + i1*nb01 + i2*nb02 + i3*nb03); + device float * dst_row = (device float *) ((device char *) dst + i1*nb1 + i2*nb2 + i3*nb3); + + float row_sum = 0; + + for (int64_t i0 = 0; i0 < ne00; i0++) { + row_sum += src_row[i0]; + } + + dst_row[0] = row_sum; +} + constant float GELU_COEF_A = 0.044715f; constant float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; @@ -180,10 +331,12 @@ kernel void kernel_gelu( kernel void kernel_soft_max( device const float * src0, + device const float * src1, device float * dst, constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, + constant 
float & scale, threadgroup float * buf [[threadgroup(0)]], uint tgpig[[threadgroup_position_in_grid]], uint tpitg[[thread_position_in_threadgroup]], @@ -194,73 +347,77 @@ kernel void kernel_soft_max( const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01; const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01); - device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; - device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; + device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; + device const float * pmask = src1 ? src1 + i01*ne00 : nullptr; + device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; // parallel max - float lmax = tpitg < ne00 ? psrc0[tpitg] : -INFINITY; + float lmax = -INFINITY; - for (int i00 = tpitg + ntg; i00 < ne00; i00 += ntg) { - lmax = MAX(lmax, psrc0[i00]); + for (int i00 = tpitg; i00 < ne00; i00 += ntg) { + lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f)); } - float max = simd_max(lmax); - if (tiisg == 0) { - buf[sgitg] = max; - } + // find the max value in the block + float max_val = simd_max(lmax); + if (ntg > N_SIMDWIDTH) { + if (sgitg == 0) { + buf[tiisg] = -INFINITY; + } - threadgroup_barrier(mem_flags::mem_threadgroup); + threadgroup_barrier(mem_flags::mem_threadgroup); - // broadcast, simd group number is ntg / 32 - for (uint i = ntg / 32 / 2; i > 0; i /= 2) { - if (tpitg < i) { - buf[tpitg] = MAX(buf[tpitg], buf[tpitg + i]); - } - } + if (tiisg == 0) { + buf[sgitg] = max_val; + } - threadgroup_barrier(mem_flags::mem_threadgroup); + threadgroup_barrier(mem_flags::mem_threadgroup); - max = buf[0]; + max_val = buf[tiisg]; + max_val = simd_max(max_val); + } // parallel sum float lsum = 0.0f; for (int i00 = tpitg; i00 < ne00; i00 += ntg) { - const float exp_psrc0 = exp(psrc0[i00] - max); + const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f)) - max_val); lsum += exp_psrc0; - // Remember the result of exp here. exp is expensive, so we really do not - // wish to compute it twice. 
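        // each thread accumulates exp((x*scale + mask) - max_val) over its strided slice of
        // the row (the mask term contributes 0 when src1 is null) and caches the exponentials
        // in dst, so the final pass below only multiplies by 1/sum instead of recomputing exp:
        //   dst[i] = exp(scale*x[i] + mask[i] - max) / sum_j exp(scale*x[j] + mask[j] - max)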
pdst[i00] = exp_psrc0; } float sum = simd_sum(lsum); - if (tiisg == 0) { - buf[sgitg] = sum; - } + if (ntg > N_SIMDWIDTH) { + if (sgitg == 0) { + buf[tiisg] = 0.0f; + } - threadgroup_barrier(mem_flags::mem_threadgroup); + threadgroup_barrier(mem_flags::mem_threadgroup); - // broadcast, simd group number is ntg / 32 - for (uint i = ntg / 32 / 2; i > 0; i /= 2) { - if (tpitg < i) { - buf[tpitg] += buf[tpitg + i]; - } - } + if (tiisg == 0) { + buf[sgitg] = sum; + } - threadgroup_barrier(mem_flags::mem_threadgroup); + threadgroup_barrier(mem_flags::mem_threadgroup); - sum = buf[0]; + sum = buf[tiisg]; + sum = simd_sum(sum); + } + + const float inv_sum = 1.0f/sum; for (int i00 = tpitg; i00 < ne00; i00 += ntg) { - pdst[i00] /= sum; + pdst[i00] *= inv_sum; } } kernel void kernel_soft_max_4( device const float * src0, + device const float * src1, device float * dst, constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, + constant float & scale, threadgroup float * buf [[threadgroup(0)]], uint tgpig[[threadgroup_position_in_grid]], uint tpitg[[thread_position_in_threadgroup]], @@ -271,64 +428,68 @@ kernel void kernel_soft_max_4( const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01; const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01); - device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); - device float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); + device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); + device const float4 * pmask = src1 ? (device const float4 *)(src1 + i01*ne00) : nullptr; + device float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); // parallel max - float4 lmax4 = tpitg < ne00/4 ? psrc4[tpitg] : -INFINITY; + float4 lmax4 = -INFINITY; - for (int i00 = tpitg + ntg; i00 < ne00/4; i00 += ntg) { - lmax4 = fmax(lmax4, psrc4[i00]); + for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) { + lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f)); } const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3])); - float max = simd_max(lmax); - if (tiisg == 0) { - buf[sgitg] = max; - } - threadgroup_barrier(mem_flags::mem_threadgroup); + float max_val = simd_max(lmax); + if (ntg > N_SIMDWIDTH) { + if (sgitg == 0) { + buf[tiisg] = -INFINITY; + } - // broadcast, simd group number is ntg / 32 - for (uint i = ntg / 32 / 2; i > 0; i /= 2) { - if (tpitg < i) { - buf[tpitg] = MAX(buf[tpitg], buf[tpitg + i]); - } - } + threadgroup_barrier(mem_flags::mem_threadgroup); - threadgroup_barrier(mem_flags::mem_threadgroup); + if (tiisg == 0) { + buf[sgitg] = max_val; + } - max = buf[0]; + threadgroup_barrier(mem_flags::mem_threadgroup); + + max_val = buf[tiisg]; + max_val = simd_max(max_val); + } // parallel sum float4 lsum4 = 0.0f; for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) { - const float4 exp_psrc4 = exp(psrc4[i00] - max); + const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? 
pmask[i00] : 0.0f)) - max_val); lsum4 += exp_psrc4; pdst4[i00] = exp_psrc4; } const float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3]; float sum = simd_sum(lsum); - if (tiisg == 0) { - buf[sgitg] = sum; - } + if (ntg > N_SIMDWIDTH) { + if (sgitg == 0) { + buf[tiisg] = 0.0f; + } - threadgroup_barrier(mem_flags::mem_threadgroup); + threadgroup_barrier(mem_flags::mem_threadgroup); - // broadcast, simd group number is ntg / 32 - for (uint i = ntg / 32 / 2; i > 0; i /= 2) { - if (tpitg < i) { - buf[tpitg] += buf[tpitg + i]; - } - } + if (tiisg == 0) { + buf[sgitg] = sum; + } - threadgroup_barrier(mem_flags::mem_threadgroup); + threadgroup_barrier(mem_flags::mem_threadgroup); - sum = buf[0]; + sum = buf[tiisg]; + sum = simd_sum(sum); + } + + const float inv_sum = 1.0f/sum; for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) { - pdst4[i00] /= sum; + pdst4[i00] *= inv_sum; } } @@ -435,14 +596,13 @@ kernel void kernel_rms_norm( constant int64_t & ne00, constant uint64_t & nb01, constant float & eps, - threadgroup float * sum [[threadgroup(0)]], + threadgroup float * buf [[threadgroup(0)]], uint tgpig[[threadgroup_position_in_grid]], uint tpitg[[thread_position_in_threadgroup]], uint sgitg[[simdgroup_index_in_threadgroup]], uint tiisg[[thread_index_in_simdgroup]], uint ntg[[threads_per_threadgroup]]) { - device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01); - device const float * x_scalar = (device const float *) x; + device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01); float4 sumf = 0; float all_sum = 0; @@ -453,40 +613,30 @@ kernel void kernel_rms_norm( } all_sum = sumf[0] + sumf[1] + sumf[2] + sumf[3]; all_sum = simd_sum(all_sum); - if (tiisg == 0) { - sum[sgitg] = all_sum; - } + if (ntg > N_SIMDWIDTH) { + if (sgitg == 0) { + buf[tiisg] = 0.0f; + } - threadgroup_barrier(mem_flags::mem_threadgroup); + threadgroup_barrier(mem_flags::mem_threadgroup); - // broadcast, simd group number is ntg / 32 - for (uint i = ntg / 32 / 2; i > 0; i /= 2) { - if (tpitg < i) { - sum[tpitg] += sum[tpitg + i]; - } - } - if (tpitg == 0) { - for (int i = 4 * (ne00 / 4); i < ne00; i++) { - sum[0] += x_scalar[i]; + if (tiisg == 0) { + buf[sgitg] = all_sum; } - sum[0] /= ne00; - } - threadgroup_barrier(mem_flags::mem_threadgroup); + threadgroup_barrier(mem_flags::mem_threadgroup); - const float mean = sum[0]; + all_sum = buf[tiisg]; + all_sum = simd_sum(all_sum); + } + + const float mean = all_sum/ne00; const float scale = 1.0f/sqrt(mean + eps); device float4 * y = (device float4 *) (dst + tgpig*ne00); - device float * y_scalar = (device float *) y; for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) { y[i00] = x[i00] * scale; } - if (tpitg == 0) { - for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) { - y_scalar[i00] = x_scalar[i00] * scale; - } - } } // function for calculate inner product between half a q4_0 block and 16 floats (yl), sumy is SUM(yl[i]) @@ -576,15 +726,25 @@ inline float block_q_n_dot_y(device const block_q5_1 * qb_curr, float sumy, thre // putting them in the kernel cause a significant performance penalty #define N_DST 4 // each SIMD group works on 4 rows #define N_SIMDGROUP 2 // number of SIMD groups in a thread group -#define N_SIMDWIDTH 32 // assuming SIMD group size is 32 //Note: This is a template, but strictly speaking it only applies to // quantizations where the block size is 32. 
It also does not // giard against the number of rows not being divisible by // N_DST, so this is another explicit assumption of the implementation. template -void mul_vec_q_n_f32(device const void * src0, device const float * src1, device float * dst, - int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne10, int64_t ne12, int64_t ne0, int64_t ne1, uint gqa, - uint3 tgpig, uint tiisg, uint sgitg) { +void mul_vec_q_n_f32( + device const void * src0, + device const float * src1, + device float * dst, + int64_t ne00, + int64_t ne01, + int64_t ne02, + int64_t ne10, + int64_t ne12, + int64_t ne0, + int64_t ne1, + uint r2, + uint r3, + uint3 tgpig, uint tiisg, uint sgitg) { const int nb = ne00/QK4_0; const int r0 = tgpig.x; @@ -593,7 +753,10 @@ void mul_vec_q_n_f32(device const void * src0, device const float * src1, device const int first_row = (r0 * nsg + sgitg) * nr; - const uint offset0 = first_row * nb + im/gqa*(nb*ne0); + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); device const block_q_type * x = (device const block_q_type *) src0 + offset0; device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; @@ -643,13 +806,14 @@ kernel void kernel_mul_mv_q4_0_f32( constant int64_t & ne02[[buffer(5)]], constant int64_t & ne10[[buffer(9)]], constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0[[buffer(15)]], - constant int64_t & ne1[[buffer(16)]], - constant uint & gqa[[buffer(17)]], + constant int64_t & ne0 [[buffer(15)]], + constant int64_t & ne1 [[buffer(16)]], + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - mul_vec_q_n_f32(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg); + mul_vec_q_n_f32(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,tgpig,tiisg,sgitg); } kernel void kernel_mul_mv_q4_1_f32( @@ -661,13 +825,14 @@ kernel void kernel_mul_mv_q4_1_f32( constant int64_t & ne02[[buffer(5)]], constant int64_t & ne10[[buffer(9)]], constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0[[buffer(15)]], - constant int64_t & ne1[[buffer(16)]], - constant uint & gqa[[buffer(17)]], + constant int64_t & ne0 [[buffer(15)]], + constant int64_t & ne1 [[buffer(16)]], + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - mul_vec_q_n_f32(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg); + mul_vec_q_n_f32(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,tgpig,tiisg,sgitg); } kernel void kernel_mul_mv_q5_0_f32( @@ -679,13 +844,14 @@ kernel void kernel_mul_mv_q5_0_f32( constant int64_t & ne02[[buffer(5)]], constant int64_t & ne10[[buffer(9)]], constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0[[buffer(15)]], - constant int64_t & ne1[[buffer(16)]], - constant uint & gqa[[buffer(17)]], + constant int64_t & ne0 [[buffer(15)]], + constant int64_t & ne1 [[buffer(16)]], + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - mul_vec_q_n_f32(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg); + 
mul_vec_q_n_f32(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,tgpig,tiisg,sgitg); } kernel void kernel_mul_mv_q5_1_f32( @@ -697,13 +863,14 @@ kernel void kernel_mul_mv_q5_1_f32( constant int64_t & ne02[[buffer(5)]], constant int64_t & ne10[[buffer(9)]], constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0[[buffer(15)]], - constant int64_t & ne1[[buffer(16)]], - constant uint & gqa[[buffer(17)]], + constant int64_t & ne0 [[buffer(15)]], + constant int64_t & ne1 [[buffer(16)]], + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - mul_vec_q_n_f32(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg); + mul_vec_q_n_f32(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,tgpig,tiisg,sgitg); } @@ -718,9 +885,10 @@ kernel void kernel_mul_mv_q8_0_f32( constant int64_t & ne02[[buffer(5)]], constant int64_t & ne10[[buffer(9)]], constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0[[buffer(15)]], - constant int64_t & ne1[[buffer(16)]], - constant uint & gqa[[buffer(17)]], + constant int64_t & ne0 [[buffer(15)]], + constant int64_t & ne1 [[buffer(16)]], + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -732,8 +900,14 @@ kernel void kernel_mul_mv_q8_0_f32( const int r0 = tgpig.x; const int r1 = tgpig.y; const int im = tgpig.z; + const int first_row = (r0 * nsg + sgitg) * nr; - const uint offset0 = first_row * nb + im/gqa*(nb*ne0); + + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + device const block_q8_0 * x = (device const block_q8_0 *) src0 + offset0; device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; @@ -791,14 +965,21 @@ kernel void kernel_mul_mv_f32_f32( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], - uint tiisg[[thread_index_in_simdgroup]]) { + uint tiisg[[thread_index_in_simdgroup]]) { const int64_t r0 = tgpig.x; const int64_t rb = tgpig.y*N_F32_F32; const int64_t im = tgpig.z; - device const float * x = (device const float *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02); + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb02*ne02; + + device const float * x = (device const float *) (src0 + offset0); if (ne00 < 128) { for (int row = 0; row < N_F32_F32; ++row) { @@ -844,6 +1025,86 @@ kernel void kernel_mul_mv_f32_f32( } } +#define N_F16_F16 4 + +kernel void kernel_mul_mv_f16_f16( + device const char * src0, + device const char * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]]) { + + const int64_t r0 = 
tgpig.x; + const int64_t rb = tgpig.y*N_F16_F16; + const int64_t im = tgpig.z; + + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb02*ne02; + + device const half * x = (device const half *) (src0 + offset0); + + if (ne00 < 128) { + for (int row = 0; row < N_F16_F16; ++row) { + int r1 = rb + row; + if (r1 >= ne11) { + break; + } + + device const half * y = (device const half *) (src1 + r1*nb11 + im*nb12); + + float sumf = 0; + for (int i = tiisg; i < ne00; i += 32) { + sumf += (half) x[i] * (half) y[i]; + } + + float all_sum = simd_sum(sumf); + if (tiisg == 0) { + dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum; + } + } + } else { + device const half4 * x4 = (device const half4 *)x; + for (int row = 0; row < N_F16_F16; ++row) { + int r1 = rb + row; + if (r1 >= ne11) { + break; + } + + device const half * y = (device const half *) (src1 + r1*nb11 + im*nb12); + device const half4 * y4 = (device const half4 *) y; + + float sumf = 0; + for (int i = tiisg; i < ne00/4; i += 32) { + for (int k = 0; k < 4; ++k) sumf += (half) x4[i][k] * y4[i][k]; + } + + float all_sum = simd_sum(sumf); + if (tiisg == 0) { + for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (half) x[i] * y[i]; + dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum; + } + } + } +} + kernel void kernel_mul_mv_f16_f32_1row( device const char * src0, device const char * src1, @@ -862,6 +1123,8 @@ kernel void kernel_mul_mv_f16_f32_1row( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]]) { @@ -869,7 +1132,12 @@ kernel void kernel_mul_mv_f16_f32_1row( const int64_t r1 = tgpig.y; const int64_t im = tgpig.z; - device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02); + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb02*ne02; + + device const half * x = (device const half *) (src0 + offset0); device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12); float sumf = 0; @@ -916,6 +1184,8 @@ kernel void kernel_mul_mv_f16_f32( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]]) { @@ -923,7 +1193,12 @@ kernel void kernel_mul_mv_f16_f32( const int64_t rb = tgpig.y*N_F16_F32; const int64_t im = tgpig.z; - device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02); + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb02*ne02; + + device const half * x = (device const half *) (src0 + offset0); if (ne00 < 128) { for (int row = 0; row < N_F16_F32; ++row) { @@ -988,6 +1263,8 @@ kernel void kernel_mul_mv_f16_f32_l4( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]]) { @@ -995,7 +1272,12 @@ kernel void kernel_mul_mv_f16_f32_l4( const int64_t r0 = tgpig.x; const int64_t im = tgpig.z; - device const half4 * x4 = (device const half4 *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02); + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = r0*nb01 + 
(i12/r2)*nb02 + (i13/r3)*nb02*ne02; + + device const half4 * x4 = (device const half4 *) (src0 + offset0); for (int r1 = 0; r1 < nrows; ++r1) { device const float4 * y4 = (device const float4 *) (src1 + r1*nb11 + im*nb12); @@ -1047,17 +1329,21 @@ kernel void kernel_alibi_f32( const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0); const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0; const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0); + const int64_t k = i3*ne3 + i2; - device float * dst_data = (device float *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); float m_k; - if (i2 < n_heads_log2_floor) { - m_k = pow(m0, i2 + 1); + if (k < n_heads_log2_floor) { + m_k = pow(m0, k + 1); } else { - m_k = pow(m1, 2 * (i2 - n_heads_log2_floor) + 1); + m_k = pow(m1, 2 * (k - n_heads_log2_floor) + 1); } + + device char * dst_row = (device char *) dst + i3*nb3 + i2*nb2 + i1*nb1; + device const char * src_row = (device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01; for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) { - device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00); - dst_data[i00] = src[0] + m_k * (i00 - ne00 + 1); + const float src_v = *(device float *)(src_row + i00*nb00); + device float * dst_v = (device float *)(dst_row + i00*nb0); + *dst_v = i00 * m_k + src_v; } } @@ -1201,33 +1487,118 @@ kernel void kernel_rope( dst_data[1] = x0*sin_theta + x1*cos_theta; } } else { - for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { - for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) { + for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { + for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) { + + // simplified from `(ib * n_dims + ic) * inv_ndims` + const float cur_rot = inv_ndims*ic - ib; + + const float theta = theta_0 * pow(freq_base, cur_rot); + float cos_theta, sin_theta; + rope_yarn(theta, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta); + + const int64_t i0 = ib*n_dims + ic/2; + + device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + const float x0 = src[0]; + const float x1 = src[n_dims/2]; + + dst_data[0] = x0*cos_theta - x1*sin_theta; + dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta; + } + } + } +} + +template [[host_name("kernel_rope_f32")]] kernel rope_t kernel_rope; +template [[host_name("kernel_rope_f16")]] kernel rope_t kernel_rope; + +kernel void kernel_im2col_f16( + device const float * x, + device half * dst, + constant int32_t & ofs0, + constant int32_t & ofs1, + constant int32_t & IW, + constant int32_t & IH, + constant int32_t & CHW, + constant int32_t & s0, + constant int32_t & s1, + constant int32_t & p0, + constant int32_t & p1, + constant int32_t & d0, + constant int32_t & d1, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tgpg[[threadgroups_per_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int32_t iiw = tgpig[2] * s0 + tpitg[2] * d0 - p0; + const int32_t iih = tgpig[1] * s1 + tpitg[1] * d1 - p1; + + const int32_t offset_dst = + (tpitg[0] * tgpg[1] * tgpg[2] + tgpig[1] * tgpg[2] + tgpig[2]) * CHW + + (tgpig[0] * (ntg[1] * ntg[2]) + tpitg[1] * ntg[2] + tpitg[2]); + + if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) { + dst[offset_dst] = 0.0f; + } else { + const int32_t offset_src = tpitg[0] * ofs0 + tgpig[0] * ofs1; + dst[offset_dst] = x[offset_src 
+ iih * IW + iiw]; + } +} - // simplified from `(ib * n_dims + ic) * inv_ndims` - const float cur_rot = inv_ndims*ic - ib; +// bitonic sort implementation following the CUDA kernels as reference +typedef void (argsort_t)( + device const float * x, + device int32_t * dst, + constant int64_t & ncols, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]]); - const float theta = theta_0 * pow(freq_base, cur_rot); - float cos_theta, sin_theta; - rope_yarn(theta, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta); +template +kernel void kernel_argsort_f32_i32( + device const float * x, + device int32_t * dst, + constant int64_t & ncols, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]]) { + // bitonic sort + int col = tpitg[0]; + int row = tgpig[1]; - const int64_t i0 = ib*n_dims + ic/2; + if (col >= ncols) return; - device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + device const float * x_row = x + row * ncols; + device int32_t * dst_row = dst + row * ncols; - const float x0 = src[0]; - const float x1 = src[n_dims/2]; + // initialize indices + if (col < ncols) { + dst_row[col] = col; + } + threadgroup_barrier(mem_flags::mem_threadgroup); - dst_data[0] = x0*cos_theta - x1*sin_theta; - dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta; + for (int k = 2; k <= ncols; k *= 2) { + for (int j = k / 2; j > 0; j /= 2) { + int ixj = col ^ j; + if (ixj > col) { + if ((col & k) == 0) { + if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) { + SWAP(dst_row[col], dst_row[ixj]); + } + } else { + if (order == GGML_SORT_ASC ? 
x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) { + SWAP(dst_row[col], dst_row[ixj]); + } + } } + threadgroup_barrier(mem_flags::mem_threadgroup); } } } -template [[host_name("kernel_rope_f32")]] kernel rope_t kernel_rope; -template [[host_name("kernel_rope_f16")]] kernel rope_t kernel_rope; +template [[host_name("kernel_argsort_f32_i32_asc")]] kernel argsort_t kernel_argsort_f32_i32; +template [[host_name("kernel_argsort_f32_i32_desc")]] kernel argsort_t kernel_argsort_f32_i32; kernel void kernel_cpy_f16_f16( device const half * src0, @@ -1354,6 +1725,197 @@ kernel void kernel_cpy_f32_f32( } } +kernel void kernel_cpy_f32_q8_0( + device const float * src0, + device void * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i03 = tgpig[2]; + const int64_t i02 = tgpig[1]; + const int64_t i01 = tgpig[0]; + + const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; + + const int64_t i3 = n / (ne2*ne1*ne0); + const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0); + const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0; + const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0)/QK8_0; + + device block_q8_0 * dst_data = (device block_q8_0 *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + for (int64_t i00 = tpitg.x*QK8_0; i00 < ne00; i00 += ntg.x*QK8_0) { + device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00); + + float amax = 0.0f; // absolute max + + for (int j = 0; j < QK8_0; j++) { + const float v = src[j]; + amax = MAX(amax, fabs(v)); + } + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 
1.0f/d : 0.0f; + + dst_data[i00/QK8_0].d = d; + + for (int j = 0; j < QK8_0; ++j) { + const float x0 = src[j]*id; + + dst_data[i00/QK8_0].qs[j] = round(x0); + } + } +} + +kernel void kernel_cpy_f32_q4_0( + device const float * src0, + device void * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i03 = tgpig[2]; + const int64_t i02 = tgpig[1]; + const int64_t i01 = tgpig[0]; + + const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; + + const int64_t i3 = n / (ne2*ne1*ne0); + const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0); + const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0; + const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0)/QK4_0; + + device block_q4_0 * dst_data = (device block_q4_0 *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + for (int64_t i00 = tpitg.x*QK4_0; i00 < ne00; i00 += ntg.x*QK4_0) { + device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00); + + float amax = 0.0f; // absolute max + float max = 0.0f; + + for (int j = 0; j < QK4_0; j++) { + const float v = src[j]; + if (amax < fabs(v)) { + amax = fabs(v); + max = v; + } + } + + const float d = max / -8; + const float id = d ? 1.0f/d : 0.0f; + + dst_data[i00/QK4_0].d = d; + + for (int j = 0; j < QK4_0/2; ++j) { + const float x0 = src[0 + j]*id; + const float x1 = src[QK4_0/2 + j]*id; + + const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f)); + const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f)); + + dst_data[i00/QK4_0].qs[j] = xi0; + dst_data[i00/QK4_0].qs[j] |= xi1 << 4; + } + } +} + +kernel void kernel_cpy_f32_q4_1( + device const float * src0, + device void * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i03 = tgpig[2]; + const int64_t i02 = tgpig[1]; + const int64_t i01 = tgpig[0]; + + const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; + + const int64_t i3 = n / (ne2*ne1*ne0); + const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0); + const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0; + const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0)/QK4_1; + + device block_q4_1 * dst_data = (device block_q4_1 *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + for (int64_t i00 = tpitg.x*QK4_1; i00 < ne00; i00 += ntg.x*QK4_1) { + device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00); + + float min = FLT_MAX; + float max = -FLT_MAX; + + for (int j = 0; j < QK4_1; j++) { + 
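            // scan the QK4_1 floats of this block for their min and max; Q4_1 encodes the
            // block with scale d = (max - min)/15 and offset m = min, and each value is
            // stored as a 4-bit index round((x - min)/d), packed two indices per byte
            // (low nibble from the first half of the block, high nibble from the second)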
const float v = src[j]; + if (min > v) min = v; + if (max < v) max = v; + } + + const float d = (max - min) / ((1 << 4) - 1); + const float id = d ? 1.0f/d : 0.0f; + + dst_data[i00/QK4_1].d = d; + dst_data[i00/QK4_1].m = min; + + for (int j = 0; j < QK4_1/2; ++j) { + const float x0 = (src[0 + j] - min)*id; + const float x1 = (src[QK4_1/2 + j] - min)*id; + + const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f)); + const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f)); + + dst_data[i00/QK4_1].qs[j] = xi0; + dst_data[i00/QK4_1].qs[j] |= xi1 << 4; + } + } +} + kernel void kernel_concat( device const char * src0, device const char * src1, @@ -1511,23 +2073,30 @@ kernel void kernel_mul_mv_q2_K_f32( constant int64_t & ne02[[buffer(5)]], constant int64_t & ne10[[buffer(9)]], constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0[[buffer(15)]], - constant int64_t & ne1[[buffer(16)]], - constant uint & gqa[[buffer(17)]], + constant int64_t & ne0 [[buffer(15)]], + constant int64_t & ne1 [[buffer(16)]], + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { const int nb = ne00/QK_K; const int r0 = tgpig.x; const int r1 = tgpig.y; - const int r2 = tgpig.z; + const int im = tgpig.z; const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; const int ib_row = first_row * nb; - const uint offset0 = r2/gqa*(nb*ne0); + + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + device const block_q2_K * x = (device const block_q2_K *) src0 + ib_row + offset0; - device const float * y = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; + device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + float yl[32]; float sumf[N_DST]={0.f}, all_sum; @@ -1536,11 +2105,11 @@ kernel void kernel_mul_mv_q2_K_f32( #if QK_K == 256 const int ix = tiisg/8; // 0...3 const int it = tiisg%8; // 0...7 - const int im = it/4; // 0 or 1 + const int iq = it/4; // 0 or 1 const int ir = it%4; // 0...3 const int is = (8*ir)/16;// 0 or 1 - device const float * y4 = y + ix * QK_K + 128 * im + 8 * ir; + device const float * y4 = y + ix * QK_K + 128 * iq + 8 * ir; for (int ib = ix; ib < nb; ib += 4) { @@ -1552,8 +2121,8 @@ kernel void kernel_mul_mv_q2_K_f32( yl[i+24] = y4[i+96]; sumy[3] += yl[i+24]; } - device const uint8_t * sc = (device const uint8_t *)x[ib].scales + 8*im + is; - device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 16 * im + 4 * ir; + device const uint8_t * sc = (device const uint8_t *)x[ib].scales + 8*iq + is; + device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 16 * iq + 4 * ir; device const half * dh = &x[ib].d; for (int row = 0; row < N_DST; row++) { @@ -1640,7 +2209,7 @@ kernel void kernel_mul_mv_q2_K_f32( for (int row = 0; row < N_DST; ++row) { all_sum = simd_sum(sumf[row]); if (tiisg == 0) { - dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = all_sum; + dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum; } } } @@ -1655,9 +2224,10 @@ kernel void kernel_mul_mv_q3_K_f32( constant int64_t & ne02[[buffer(5)]], constant int64_t & ne10[[buffer(9)]], constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0[[buffer(15)]], - constant int64_t & ne1[[buffer(16)]], - constant uint & gqa[[buffer(17)]], + constant int64_t & ne0 [[buffer(15)]], + constant int64_t 
& ne1 [[buffer(16)]], + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -1666,12 +2236,17 @@ kernel void kernel_mul_mv_q3_K_f32( const int64_t r0 = tgpig.x; const int64_t r1 = tgpig.y; - const int64_t r2 = tgpig.z; + const int64_t im = tgpig.z; const int first_row = (r0 * N_SIMDGROUP + sgitg) * 2; - const uint offset0 = r2/gqa*(nb*ne0); + + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + device const block_q3_K * x = (device const block_q3_K *) src0 + first_row*nb + offset0; - device const float * yy = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; + device const float * yy = (device const float *) src1 + r1*ne10 + im*ne00*ne1; float yl[32]; @@ -1793,7 +2368,7 @@ kernel void kernel_mul_mv_q3_K_f32( } if (tiisg == 0) { for (int row = 0; row < 2; ++row) { - dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = sumf1[row]; + dst[r1*ne0 + im*ne0*ne1 + first_row + row] = sumf1[row]; } } } @@ -1807,26 +2382,33 @@ kernel void kernel_mul_mv_q3_K_f32( constant int64_t & ne02[[buffer(5)]], constant int64_t & ne10[[buffer(9)]], constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0[[buffer(15)]], - constant int64_t & ne1[[buffer(16)]], - constant uint & gqa[[buffer(17)]], + constant int64_t & ne0 [[buffer(15)]], + constant int64_t & ne1 [[buffer(16)]], + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { const int nb = ne00/QK_K; const int64_t r0 = tgpig.x; const int64_t r1 = tgpig.y; - const int64_t r2 = tgpig.z; + const int64_t im = tgpig.z; const int row = 2 * r0 + sgitg; - const uint offset0 = r2/gqa*(nb*ne0); + + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + device const block_q3_K * x = (device const block_q3_K *) src0 + row*nb + offset0; - device const float * yy = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; + device const float * yy = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + const int ix = tiisg/4; const int il = 4 * (tiisg%4);// 0, 4, 8, 12 - const int im = il/8; // 0, 0, 1, 1 + const int iq = il/8; // 0, 0, 1, 1 const int in = il%8; // 0, 4, 0, 4 float2 sum = {0.f, 0.f}; @@ -1846,7 +2428,7 @@ kernel void kernel_mul_mv_q3_K_f32( const float d4 = d_all * ((int32_t)(s[0] & 0xF000) - 32768) * 1.f/262144.f; for (int l = 0; l < 4; l += 2) { - const uint16_t hm = h[l/2] >> im; + const uint16_t hm = h[l/2] >> iq; sum[0] += y[l+ 0] * d1 * ((int32_t)(q[l/2] & 0x0003) - ((hm & 0x0001) ? 0 : 4)) + y[l+16] * d2 * ((int32_t)(q[l/2] & 0x000c) - ((hm & 0x0004) ? 0 : 16)) + y[l+32] * d3 * ((int32_t)(q[l/2] & 0x0030) - ((hm & 0x0010) ? 
0 : 64)) @@ -1862,7 +2444,7 @@ kernel void kernel_mul_mv_q3_K_f32( const float tot = simd_sum(sumf); if (tiisg == 0) { - dst[r1*ne0 + r2*ne0*ne1 + row] = tot; + dst[r1*ne0 + im*ne0*ne1 + row] = tot; } } @@ -1880,10 +2462,11 @@ kernel void kernel_mul_mv_q4_K_f32( constant int64_t & ne12 [[buffer(11)]], constant int64_t & ne0 [[buffer(15)]], constant int64_t & ne1 [[buffer(16)]], - constant uint & gqa [[buffer(17)]], + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { const uint16_t kmask1 = 0x3f3f; const uint16_t kmask2 = 0x0f0f; @@ -1891,26 +2474,32 @@ kernel void kernel_mul_mv_q4_K_f32( const int ix = tiisg/8; // 0...3 const int it = tiisg%8; // 0...7 - const int im = it/4; // 0 or 1 + const int iq = it/4; // 0 or 1 const int ir = it%4; // 0...3 const int nb = ne00/QK_K; const int r0 = tgpig.x; const int r1 = tgpig.y; - const int r2 = tgpig.z; + const int im = tgpig.z; //const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; const int first_row = r0 * N_DST; const int ib_row = first_row * nb; - const uint offset0 = r2/gqa*(nb*ne0); + + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0; - device const float * y = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; + device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + float yl[16]; float yh[16]; float sumf[N_DST]={0.f}, all_sum; const int step = sizeof(block_q4_K) * nb / 2; - device const float * y4 = y + ix * QK_K + 64 * im + 8 * ir; + device const float * y4 = y + ix * QK_K + 64 * iq + 8 * ir; uint16_t sc16[4]; thread const uint8_t * sc8 = (thread const uint8_t *)sc16; @@ -1925,8 +2514,8 @@ kernel void kernel_mul_mv_q4_K_f32( yh[i+8] = y4[i+160]; sumy[3] += yh[i+8]; } - device const uint16_t * sc = (device const uint16_t *)x[ib].scales + im; - device const uint16_t * q1 = (device const uint16_t *)x[ib].qs + 16 * im + 4 * ir; + device const uint16_t * sc = (device const uint16_t *)x[ib].scales + iq; + device const uint16_t * q1 = (device const uint16_t *)x[ib].qs + 16 * iq + 4 * ir; device const half * dh = &x[ib].d; for (int row = 0; row < N_DST; row++) { @@ -1970,7 +2559,7 @@ kernel void kernel_mul_mv_q4_K_f32( for (int row = 0; row < N_DST; ++row) { all_sum = simd_sum(sumf[row]); if (tiisg == 0) { - dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = all_sum; + dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum; } } } @@ -1984,9 +2573,10 @@ kernel void kernel_mul_mv_q4_K_f32( constant int64_t & ne02[[buffer(5)]], constant int64_t & ne10[[buffer(9)]], constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0[[buffer(15)]], - constant int64_t & ne1[[buffer(16)]], - constant uint & gqa[[buffer(17)]], + constant int64_t & ne0 [[buffer(15)]], + constant int64_t & ne1 [[buffer(16)]], + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -1997,12 +2587,18 @@ kernel void kernel_mul_mv_q4_K_f32( const int nb = ne00/QK_K; const int r0 = tgpig.x; const int r1 = tgpig.y; - const int r2 = tgpig.z; + const int im = tgpig.z; const int first_row = (r0 * 
N_SIMDGROUP + sgitg) * N_DST; const int ib_row = first_row * nb; - const uint offset0 = r2/gqa*(nb*ne0); + + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0; - device const float * y = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; + device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + float yl[8]; float yh[8]; float sumf[N_DST]={0.f}, all_sum; @@ -2058,7 +2654,7 @@ kernel void kernel_mul_mv_q4_K_f32( for (int row = 0; row < N_DST; ++row) { all_sum = simd_sum(sumf[row]); if (tiisg == 0) { - dst[r1*ne0+ r2*ne0*ne1 + first_row + row] = all_sum; + dst[r1*ne0+ im*ne0*ne1 + first_row + row] = all_sum; } } } @@ -2073,9 +2669,10 @@ kernel void kernel_mul_mv_q5_K_f32( constant int64_t & ne02[[buffer(5)]], constant int64_t & ne10[[buffer(9)]], constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0[[buffer(15)]], - constant int64_t & ne1[[buffer(16)]], - constant uint & gqa[[buffer(17)]], + constant int64_t & ne0 [[buffer(15)]], + constant int64_t & ne1 [[buffer(16)]], + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -2084,12 +2681,17 @@ kernel void kernel_mul_mv_q5_K_f32( const int64_t r0 = tgpig.x; const int64_t r1 = tgpig.y; - const int r2 = tgpig.z; + const int im = tgpig.z; const int first_row = (r0 * N_SIMDGROUP + sgitg) * 2; - const uint offset0 = r2/gqa*(nb*ne0); + + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + device const block_q5_K * x = (device const block_q5_K *) src0 + first_row*nb + offset0; - device const float * yy = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; + device const float * yy = (device const float *) src1 + r1*ne10 + im*ne00*ne1; float sumf[2]={0.f}; @@ -2105,15 +2707,15 @@ kernel void kernel_mul_mv_q5_K_f32( const int tid = tiisg/4; const int ix = tiisg%4; - const int im = tid/4; + const int iq = tid/4; const int ir = tid%4; const int n = 8; const int l0 = n*ir; - const int q_offset = 32*im + l0; - const int y_offset = 64*im + l0; + const int q_offset = 32*iq + l0; + const int y_offset = 64*iq + l0; - const uint8_t hm1 = 1u << (2*im); + const uint8_t hm1 = 1u << (2*iq); const uint8_t hm2 = hm1 << 1; const uint8_t hm3 = hm1 << 4; const uint8_t hm4 = hm2 << 4; @@ -2128,7 +2730,7 @@ kernel void kernel_mul_mv_q5_K_f32( device const uint8_t * q1 = x[i].qs + q_offset; device const uint8_t * qh = x[i].qh + l0; device const half * dh = &x[i].d; - device const uint16_t * a = (device const uint16_t *)x[i].scales + im; + device const uint16_t * a = (device const uint16_t *)x[i].scales + iq; device const float * y2 = y1 + 128; float4 sumy = {0.f, 0.f, 0.f, 0.f}; @@ -2184,7 +2786,7 @@ kernel void kernel_mul_mv_q5_K_f32( const int il = 4 * (tiisg/8); // 0, 4, 8, 12 const int ix = tiisg%8; - const int im = il/8; // 0, 0, 1, 1 + const int iq = il/8; // 0, 0, 1, 1 const int in = il%8; // 0, 4, 0, 4 device const float * y = yy + ix*QK_K + il; @@ -2209,7 +2811,7 @@ kernel void kernel_mul_mv_q5_K_f32( float2 acc = {0.f, 0.f}; for (int l = 0; l < 4; ++l) { - const uint8_t hl = h[l] >> im; + const uint8_t hl = h[l] >> iq; acc[0] += yl[l+0] * s[0] * ((int16_t)(q[l+ 0] & 0x0F) - (hl & 0x01 ? 
0 : 16)) + yl[l+4] * s[1] * ((int16_t)(q[l+16] & 0x0F) - (hl & 0x04 ? 0 : 16)); acc[1] += yh[l+0] * s[2] * ((int16_t)(q[l+ 0] & 0xF0) - (hl & 0x10 ? 0 : 256)) @@ -2231,7 +2833,7 @@ kernel void kernel_mul_mv_q5_K_f32( for (int row = 0; row < 2; ++row) { const float tot = simd_sum(sumf[row]); if (tiisg == 0) { - dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = tot; + dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot; } } @@ -2246,9 +2848,10 @@ kernel void kernel_mul_mv_q6_K_f32( constant int64_t & ne02[[buffer(5)]], constant int64_t & ne10[[buffer(9)]], constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0[[buffer(15)]], - constant int64_t & ne1[[buffer(16)]], - constant uint & gqa[[buffer(17)]], + constant int64_t & ne0 [[buffer(15)]], + constant int64_t & ne1 [[buffer(16)]], + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -2262,12 +2865,17 @@ kernel void kernel_mul_mv_q6_K_f32( const int64_t r0 = tgpig.x; const int64_t r1 = tgpig.y; - const int r2 = tgpig.z; + const int im = tgpig.z; const int row = 2 * r0 + sgitg; - const uint offset0 = r2/gqa*(nb*ne0); + + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + device const block_q6_K * x = (device const block_q6_K *) src0 + row * nb + offset0; - device const float * yy = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; + device const float * yy = (device const float *) src1 + r1*ne10 + im*ne00*ne1; float sumf = 0; @@ -2333,7 +2941,7 @@ kernel void kernel_mul_mv_q6_K_f32( const float tot = simd_sum(sumf); if (tiisg == 0) { - dst[r1*ne0 + r2*ne0*ne1 + row] = tot; + dst[r1*ne0 + im*ne0*ne1 + row] = tot; } } @@ -2643,24 +3251,25 @@ kernel void kernel_get_rows( // each block_q contains 16*nl weights template -kernel void kernel_mul_mm(device const uchar * src0, - device const uchar * src1, - device float * dst, - constant int64_t & ne00, - constant int64_t & ne02, - constant int64_t & nb01, - constant int64_t & nb02, - constant int64_t & ne12, - constant int64_t & nb10, - constant int64_t & nb11, - constant int64_t & nb12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint & gqa, - threadgroup uchar * shared_memory [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiitg[[thread_index_in_threadgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { +void kernel_mul_mm_impl(device const uchar * src0, + device const uchar * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne02, + constant int64_t & nb01, + constant int64_t & nb02, + constant int64_t & ne12, + constant int64_t & nb10, + constant int64_t & nb11, + constant int64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + threadgroup uchar * shared_memory [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { threadgroup half * sa = (threadgroup half *)(shared_memory); threadgroup float * sb = (threadgroup float *)(shared_memory + 4096); @@ -2686,7 +3295,10 @@ kernel void kernel_mul_mm(device const uchar * src0, short il = (tiitg % THREAD_PER_ROW); - uint offset0 = im/gqa*nb02; + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + uint offset0 = (i12/r2)*nb02 + (i13/r3)*(nb02*ne02); ushort offset1 = il/nl; device 
const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01 + offset0) + offset1; @@ -2770,14 +3382,116 @@ kernel void kernel_mul_mm(device const uchar * src0, } } +template +kernel void kernel_mul_mm(device const uchar * src0, + device const uchar * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne02, + constant int64_t & nb01, + constant int64_t & nb02, + constant int64_t & ne12, + constant int64_t & nb10, + constant int64_t & nb11, + constant int64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + threadgroup uchar * shared_memory [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + kernel_mul_mm_impl( + src0, + src1, + dst, + ne00, + ne02, + nb01, + nb02, + ne12, + nb10, + nb11, + nb12, + ne0, + ne1, + r2, + r3, + shared_memory, + tgpig, + tiitg, + sgitg); +} + +template +kernel void kernel_mul_mm_id( + device const int32_t * ids, + device const uchar * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne02, + constant int64_t & nb01, + constant int64_t & nb02, + constant int64_t & ne12, + constant int64_t & nb10, + constant int64_t & nb11, + constant int64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + constant int & idx, + device const uchar * src00, + device const uchar * src01, + device const uchar * src02, + device const uchar * src03, + device const uchar * src04, + device const uchar * src05, + device const uchar * src06, + device const uchar * src07, + threadgroup uchar * shared_memory [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + device const uchar * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; + + kernel_mul_mm_impl( + src0[ids[idx]], + src1, + dst, + ne00, + ne02, + nb01, + nb02, + ne12, + nb10, + nb11, + nb12, + ne0, + ne1, + r2, + r3, + shared_memory, + tgpig, + tiitg, + sgitg); +} + #if QK_K == 256 #define QK_NL 16 #else #define QK_NL 4 #endif -typedef void (get_rows_t)(device const void *, device const int *, device float *, constant int64_t &, \ - constant uint64_t &, constant uint64_t &, uint, uint, uint); +typedef void (get_rows_t)( + device const void * src0, + device const int * src1, + device float * dst, + constant int64_t & ne00, + constant uint64_t & nb01, + constant uint64_t & nb1, + uint, uint, uint); template [[host_name("kernel_get_rows_f32")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_f16")]] kernel get_rows_t kernel_get_rows; @@ -2806,8 +3520,10 @@ typedef void (mat_mm_t)( constant int64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, - constant uint & gqa, - threadgroup uchar *, uint3, uint, uint); + constant uint & r2, + constant uint & r3, + threadgroup uchar *, + uint3, uint, uint); template [[host_name("kernel_mul_mm_f32_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_f16_f32")]] kernel mat_mm_t kernel_mul_mm; @@ -2821,3 +3537,44 @@ template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_q5_K_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mat_mm_t kernel_mul_mm; + +typedef void (mat_mm_id_t)( + device const int32_t * ids, + 
device const uchar * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne02, + constant int64_t & nb01, + constant int64_t & nb02, + constant int64_t & ne12, + constant int64_t & nb10, + constant int64_t & nb11, + constant int64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + constant int & idx, + device const uchar * src00, + device const uchar * src01, + device const uchar * src02, + device const uchar * src03, + device const uchar * src04, + device const uchar * src05, + device const uchar * src06, + device const uchar * src07, + threadgroup uchar *, + uint3, uint, uint); + +template [[host_name("kernel_mul_mm_id_f32_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_f16_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q4_0_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q4_1_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q5_0_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q5_1_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q8_0_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q2_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q3_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q4_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q5_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q6_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; diff --git a/cpp/ggml-metal.h b/cpp/ggml-metal.h index f650977..6e5291d 100644 --- a/cpp/ggml-metal.h +++ b/cpp/ggml-metal.h @@ -26,7 +26,7 @@ #include // max memory buffers that can be mapped to the device -#define LM_GGML_METAL_MAX_BUFFERS 16 +#define LM_GGML_METAL_MAX_BUFFERS 64 #define LM_GGML_METAL_MAX_COMMAND_BUFFERS 32 struct lm_ggml_tensor; @@ -99,6 +99,12 @@ LM_GGML_API lm_ggml_backend_t lm_ggml_backend_metal_init(void); LM_GGML_API bool lm_ggml_backend_is_metal(lm_ggml_backend_t backend); LM_GGML_API void lm_ggml_backend_metal_set_n_cb(lm_ggml_backend_t backend, int n_cb); +LM_GGML_API lm_ggml_backend_buffer_type_t lm_ggml_backend_metal_buffer_type(void); + +// helper to check if the device supports a specific family +// ideally, the user code should be doing these checks +// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf +LM_GGML_API bool lm_ggml_backend_metal_supports_family(lm_ggml_backend_t backend, int family); #ifdef __cplusplus } diff --git a/cpp/ggml-metal.m b/cpp/ggml-metal.m index 4c68f3d..32c1019 100644 --- a/cpp/ggml-metal.m +++ b/cpp/ggml-metal.m @@ -1,5 +1,6 @@ #import "ggml-metal.h" +#import "ggml-backend-impl.h" #import "ggml.h" #import @@ -23,7 +24,7 @@ #define UNUSED(x) (void)(x) -#define LM_GGML_MAX_CONCUR (2*LM_GGML_MAX_NODES) +#define LM_GGML_MAX_CONCUR (2*LM_GGML_DEFAULT_GRAPH_SIZE) struct lm_ggml_metal_buffer { const char * name; @@ -61,6 +62,8 @@ LM_GGML_METAL_DECL_KERNEL(add_row); // TODO: avoid this extra kernel, instead extend the "add" kernel to support broadcast LM_GGML_METAL_DECL_KERNEL(mul); LM_GGML_METAL_DECL_KERNEL(mul_row); // TODO: avoid this extra kernel, instead extend the "mul" kernel to support broadcast + LM_GGML_METAL_DECL_KERNEL(div); + LM_GGML_METAL_DECL_KERNEL(div_row); 
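The kernels above drop the single `gqa` broadcast factor in favour of two ratios, `r2 = ne12/ne02` and `r3 = ne13/ne03`, and split the flat threadgroup z-index `im` into `i12 = im%ne12` and `i13 = im/ne12`, so each src1 matrix in dims 2/3 is matched to the src0 matrix it broadcasts over; the destination write moves from `r2*ne0*ne1` to `im*ne0*ne1` accordingly. Below is a minimal host-side C++ sketch of that offset math only, using the kernels' own shape names; the helper function and the example shapes are illustrative, not part of the patch.

// Sketch of the broadcast indexing used by the updated mul_mv/mul_mm kernels.
#include <cstdint>
#include <cstdio>

// offset (in blocks) into src0 for the src1 matrix selected by `im`,
// mirroring: offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)
static int64_t src0_block_offset(uint32_t im, uint32_t ne12,
                                 uint32_t r2, uint32_t r3,
                                 int64_t ne01, int64_t ne02, int64_t nb) {
    const uint32_t i12 = im % ne12;  // src1 index in dim 2
    const uint32_t i13 = im / ne12;  // src1 index in dim 3
    return (int64_t)(i12/r2)*(nb*ne01) + (int64_t)(i13/r3)*(nb*ne01*ne02);
}

int main() {
    // GQA-style example: 8 src1 heads broadcasting over 2 src0 heads
    const int64_t ne01 = 64, ne02 = 2, ne03 = 1;  // src0 rows / matrices
    const int64_t ne12 = 8,  ne13 = 1;            // src1 matrices
    const int64_t nb   = 4096/256;                // blocks per row, e.g. ne00/QK_K
    const uint32_t r2  = (uint32_t)(ne12/ne02);   // = 4
    const uint32_t r3  = (uint32_t)(ne13/ne03);   // = 1
    // heads 0..3 land on src0 matrix 0, heads 4..7 on src0 matrix 1
    for (uint32_t im = 0; im < (uint32_t)(ne12*ne13); ++im) {
        printf("im=%u -> src0 block offset %lld\n", im,
               (long long) src0_block_offset(im, (uint32_t) ne12, r2, r3, ne01, ne02, nb));
    }
    return 0;
}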
LM_GGML_METAL_DECL_KERNEL(scale); LM_GGML_METAL_DECL_KERNEL(scale_4); LM_GGML_METAL_DECL_KERNEL(silu); @@ -85,6 +88,7 @@ LM_GGML_METAL_DECL_KERNEL(rms_norm); LM_GGML_METAL_DECL_KERNEL(norm); LM_GGML_METAL_DECL_KERNEL(mul_mv_f32_f32); + LM_GGML_METAL_DECL_KERNEL(mul_mv_f16_f16); LM_GGML_METAL_DECL_KERNEL(mul_mv_f16_f32); LM_GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_1row); LM_GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_l4); @@ -110,14 +114,35 @@ LM_GGML_METAL_DECL_KERNEL(mul_mm_q4_K_f32); LM_GGML_METAL_DECL_KERNEL(mul_mm_q5_K_f32); LM_GGML_METAL_DECL_KERNEL(mul_mm_q6_K_f32); + LM_GGML_METAL_DECL_KERNEL(mul_mm_id_f32_f32); + LM_GGML_METAL_DECL_KERNEL(mul_mm_id_f16_f32); + LM_GGML_METAL_DECL_KERNEL(mul_mm_id_q4_0_f32); + LM_GGML_METAL_DECL_KERNEL(mul_mm_id_q4_1_f32); + LM_GGML_METAL_DECL_KERNEL(mul_mm_id_q5_0_f32); + LM_GGML_METAL_DECL_KERNEL(mul_mm_id_q5_1_f32); + LM_GGML_METAL_DECL_KERNEL(mul_mm_id_q8_0_f32); + LM_GGML_METAL_DECL_KERNEL(mul_mm_id_q2_K_f32); + LM_GGML_METAL_DECL_KERNEL(mul_mm_id_q3_K_f32); + LM_GGML_METAL_DECL_KERNEL(mul_mm_id_q4_K_f32); + LM_GGML_METAL_DECL_KERNEL(mul_mm_id_q5_K_f32); + LM_GGML_METAL_DECL_KERNEL(mul_mm_id_q6_K_f32); LM_GGML_METAL_DECL_KERNEL(rope_f32); LM_GGML_METAL_DECL_KERNEL(rope_f16); LM_GGML_METAL_DECL_KERNEL(alibi_f32); + LM_GGML_METAL_DECL_KERNEL(im2col_f16); + LM_GGML_METAL_DECL_KERNEL(argsort_f32_i32_asc); + LM_GGML_METAL_DECL_KERNEL(argsort_f32_i32_desc); LM_GGML_METAL_DECL_KERNEL(cpy_f32_f16); LM_GGML_METAL_DECL_KERNEL(cpy_f32_f32); + LM_GGML_METAL_DECL_KERNEL(cpy_f32_q8_0); + LM_GGML_METAL_DECL_KERNEL(cpy_f32_q4_0); + LM_GGML_METAL_DECL_KERNEL(cpy_f32_q4_1); + //LM_GGML_METAL_DECL_KERNEL(cpy_f32_q5_0); + //LM_GGML_METAL_DECL_KERNEL(cpy_f32_q5_1); LM_GGML_METAL_DECL_KERNEL(cpy_f16_f16); LM_GGML_METAL_DECL_KERNEL(concat); LM_GGML_METAL_DECL_KERNEL(sqr); + LM_GGML_METAL_DECL_KERNEL(sum_rows); #undef LM_GGML_METAL_DECL_KERNEL }; @@ -125,7 +150,7 @@ // MSL code // TODO: move the contents here when ready // for now it is easier to work in a separate file -static NSString * const msl_library_source = @"see metal.metal"; +//static NSString * const msl_library_source = @"see metal.metal"; // Here to assist with NSBundle Path Hack @interface LMGGMLMetalClass : NSObject @@ -141,7 +166,8 @@ void lm_ggml_metal_log_set_callback(lm_ggml_log_callback log_callback, void * us lm_ggml_metal_log_user_data = user_data; } -static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char* format, ...){ +LM_GGML_ATTRIBUTE_FORMAT(2, 3) +static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char * format, ...){ if (lm_ggml_metal_log_callback != NULL) { va_list args; va_start(args, format); @@ -160,12 +186,10 @@ static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char* format, } } - - struct lm_ggml_metal_context * lm_ggml_metal_init(int n_cb) { LM_GGML_METAL_LOG_INFO("%s: allocating\n", __func__); - id device; + id device; NSString * s; #if TARGET_OS_OSX @@ -209,7 +233,16 @@ static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char* format, } else { LM_GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__); - NSString * sourcePath = [bundle pathForResource:@"ggml-metal-llama" ofType:@"metal"]; + NSString * sourcePath; + NSString * ggmlMetalPathResources = [[NSProcessInfo processInfo].environment objectForKey:@"LM_GGML_METAL_PATH_RESOURCES"]; + + LM_GGML_METAL_LOG_INFO("%s: LM_GGML_METAL_PATH_RESOURCES = %s\n", __func__, ggmlMetalPathResources ? 
[ggmlMetalPathResources UTF8String] : "nil"); + + if (ggmlMetalPathResources) { + sourcePath = [ggmlMetalPathResources stringByAppendingPathComponent:@"ggml-metal.metal"]; + } else { + sourcePath = [bundle pathForResource:@"ggml-metal-llama" ofType:@"metal"]; + } if (sourcePath == nil) { LM_GGML_METAL_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__); sourcePath = @"ggml-metal.metal"; @@ -235,6 +268,29 @@ static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char* format, } } +#if TARGET_OS_OSX + // print MTL GPU family: + LM_GGML_METAL_LOG_INFO("%s: GPU name: %s\n", __func__, [[ctx->device name] UTF8String]); + + // determine max supported GPU family + // https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf + // https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf + for (int i = MTLGPUFamilyApple1 + 20; i >= MTLGPUFamilyApple1; --i) { + if ([ctx->device supportsFamily:i]) { + LM_GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d (%d)\n", __func__, i - (int) MTLGPUFamilyApple1 + 1, i); + break; + } + } + + LM_GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false"); + LM_GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6); + if (ctx->device.maxTransferRate != 0) { + LM_GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1e6); + } else { + LM_GGML_METAL_LOG_INFO("%s: maxTransferRate = built-in GPU\n", __func__); + } +#endif + // load kernels { NSError * error = nil; @@ -256,6 +312,8 @@ static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char* format, LM_GGML_METAL_ADD_KERNEL(add_row); LM_GGML_METAL_ADD_KERNEL(mul); LM_GGML_METAL_ADD_KERNEL(mul_row); + LM_GGML_METAL_ADD_KERNEL(div); + LM_GGML_METAL_ADD_KERNEL(div_row); LM_GGML_METAL_ADD_KERNEL(scale); LM_GGML_METAL_ADD_KERNEL(scale_4); LM_GGML_METAL_ADD_KERNEL(silu); @@ -280,6 +338,7 @@ static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char* format, LM_GGML_METAL_ADD_KERNEL(rms_norm); LM_GGML_METAL_ADD_KERNEL(norm); LM_GGML_METAL_ADD_KERNEL(mul_mv_f32_f32); + LM_GGML_METAL_ADD_KERNEL(mul_mv_f16_f16); LM_GGML_METAL_ADD_KERNEL(mul_mv_f16_f32); LM_GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_1row); LM_GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_l4); @@ -306,42 +365,40 @@ static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char* format, LM_GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32); LM_GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32); LM_GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32); + LM_GGML_METAL_ADD_KERNEL(mul_mm_id_f32_f32); + LM_GGML_METAL_ADD_KERNEL(mul_mm_id_f16_f32); + LM_GGML_METAL_ADD_KERNEL(mul_mm_id_q4_0_f32); + LM_GGML_METAL_ADD_KERNEL(mul_mm_id_q4_1_f32); + LM_GGML_METAL_ADD_KERNEL(mul_mm_id_q5_0_f32); + LM_GGML_METAL_ADD_KERNEL(mul_mm_id_q5_1_f32); + LM_GGML_METAL_ADD_KERNEL(mul_mm_id_q8_0_f32); + LM_GGML_METAL_ADD_KERNEL(mul_mm_id_q2_K_f32); + LM_GGML_METAL_ADD_KERNEL(mul_mm_id_q3_K_f32); + LM_GGML_METAL_ADD_KERNEL(mul_mm_id_q4_K_f32); + LM_GGML_METAL_ADD_KERNEL(mul_mm_id_q5_K_f32); + LM_GGML_METAL_ADD_KERNEL(mul_mm_id_q6_K_f32); } LM_GGML_METAL_ADD_KERNEL(rope_f32); LM_GGML_METAL_ADD_KERNEL(rope_f16); LM_GGML_METAL_ADD_KERNEL(alibi_f32); + LM_GGML_METAL_ADD_KERNEL(im2col_f16); + LM_GGML_METAL_ADD_KERNEL(argsort_f32_i32_asc); + LM_GGML_METAL_ADD_KERNEL(argsort_f32_i32_desc); LM_GGML_METAL_ADD_KERNEL(cpy_f32_f16); 
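The `cpy_f32_q8_0` / `cpy_f32_q4_0` / `cpy_f32_q4_1` pipelines registered in this hunk let CPY/CONT write f32 data straight into quantized blocks. As a rough reference for what the q8_0 variant has to produce, here is a host-side C++ sketch of q8_0 row quantization under the usual ggml layout (blocks of 32 values, one scale `d = max|x|/127`, int8 quants). The struct and function names below are illustrative only; the real block stores `d` as fp16 and the Metal kernel works per threadgroup rather than in a scalar loop.

#include <cmath>
#include <cstdint>
#include <cstdio>

constexpr int QK8_0 = 32;

// Simplified q8_0 block: upstream keeps `d` as fp16, kept as float here.
struct block_q8_0_ref {
    float  d;            // per-block scale
    int8_t qs[QK8_0];    // quantized values
};

// Quantize a row of n floats (n must be a multiple of QK8_0) into q8_0 blocks.
static void quantize_row_q8_0_ref(const float * x, block_q8_0_ref * y, int n) {
    for (int ib = 0; ib < n/QK8_0; ++ib) {
        float amax = 0.0f;
        for (int j = 0; j < QK8_0; ++j) {
            amax = std::fmax(amax, std::fabs(x[ib*QK8_0 + j]));
        }
        const float d  = amax / 127.0f;
        const float id = d != 0.0f ? 1.0f/d : 0.0f;
        y[ib].d = d;
        for (int j = 0; j < QK8_0; ++j) {
            y[ib].qs[j] = (int8_t) std::round(x[ib*QK8_0 + j] * id);
        }
    }
}

int main() {
    float x[QK8_0];
    for (int j = 0; j < QK8_0; ++j) x[j] = 0.01f*j - 0.15f;
    block_q8_0_ref y[1];
    quantize_row_q8_0_ref(x, y, QK8_0);
    printf("d = %f, qs[0] = %d, qs[31] = %d\n", y[0].d, y[0].qs[0], y[0].qs[31]);
    return 0;
}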
LM_GGML_METAL_ADD_KERNEL(cpy_f32_f32); + LM_GGML_METAL_ADD_KERNEL(cpy_f32_q8_0); + LM_GGML_METAL_ADD_KERNEL(cpy_f32_q4_0); + LM_GGML_METAL_ADD_KERNEL(cpy_f32_q4_1); + //LM_GGML_METAL_ADD_KERNEL(cpy_f32_q5_0); + //LM_GGML_METAL_ADD_KERNEL(cpy_f32_q5_1); LM_GGML_METAL_ADD_KERNEL(cpy_f16_f16); LM_GGML_METAL_ADD_KERNEL(concat); LM_GGML_METAL_ADD_KERNEL(sqr); + LM_GGML_METAL_ADD_KERNEL(sum_rows); #undef LM_GGML_METAL_ADD_KERNEL } -#if TARGET_OS_OSX - // print MTL GPU family: - LM_GGML_METAL_LOG_INFO("%s: GPU name: %s\n", __func__, [[ctx->device name] UTF8String]); - - // determine max supported GPU family - // https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf - // https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf - for (int i = MTLGPUFamilyApple1 + 20; i >= MTLGPUFamilyApple1; --i) { - if ([ctx->device supportsFamily:i]) { - LM_GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d (%d)\n", __func__, i - MTLGPUFamilyApple1 + 1, i); - break; - } - } - - LM_GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false"); - LM_GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); - if (ctx->device.maxTransferRate != 0) { - LM_GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0); - } else { - LM_GGML_METAL_LOG_INFO("%s: maxTransferRate = built-in GPU\n", __func__); - } -#endif - return ctx; } @@ -355,6 +412,8 @@ void lm_ggml_metal_free(struct lm_ggml_metal_context * ctx) { LM_GGML_METAL_DEL_KERNEL(add_row); LM_GGML_METAL_DEL_KERNEL(mul); LM_GGML_METAL_DEL_KERNEL(mul_row); + LM_GGML_METAL_DEL_KERNEL(div); + LM_GGML_METAL_DEL_KERNEL(div_row); LM_GGML_METAL_DEL_KERNEL(scale); LM_GGML_METAL_DEL_KERNEL(scale_4); LM_GGML_METAL_DEL_KERNEL(silu); @@ -379,6 +438,7 @@ void lm_ggml_metal_free(struct lm_ggml_metal_context * ctx) { LM_GGML_METAL_DEL_KERNEL(rms_norm); LM_GGML_METAL_DEL_KERNEL(norm); LM_GGML_METAL_DEL_KERNEL(mul_mv_f32_f32); + LM_GGML_METAL_DEL_KERNEL(mul_mv_f16_f16); LM_GGML_METAL_DEL_KERNEL(mul_mv_f16_f32); LM_GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_1row); LM_GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_l4); @@ -405,15 +465,36 @@ void lm_ggml_metal_free(struct lm_ggml_metal_context * ctx) { LM_GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32); LM_GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32); LM_GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32); + LM_GGML_METAL_DEL_KERNEL(mul_mm_id_f32_f32); + LM_GGML_METAL_DEL_KERNEL(mul_mm_id_f16_f32); + LM_GGML_METAL_DEL_KERNEL(mul_mm_id_q4_0_f32); + LM_GGML_METAL_DEL_KERNEL(mul_mm_id_q4_1_f32); + LM_GGML_METAL_DEL_KERNEL(mul_mm_id_q5_0_f32); + LM_GGML_METAL_DEL_KERNEL(mul_mm_id_q5_1_f32); + LM_GGML_METAL_DEL_KERNEL(mul_mm_id_q8_0_f32); + LM_GGML_METAL_DEL_KERNEL(mul_mm_id_q2_K_f32); + LM_GGML_METAL_DEL_KERNEL(mul_mm_id_q3_K_f32); + LM_GGML_METAL_DEL_KERNEL(mul_mm_id_q4_K_f32); + LM_GGML_METAL_DEL_KERNEL(mul_mm_id_q5_K_f32); + LM_GGML_METAL_DEL_KERNEL(mul_mm_id_q6_K_f32); } LM_GGML_METAL_DEL_KERNEL(rope_f32); LM_GGML_METAL_DEL_KERNEL(rope_f16); LM_GGML_METAL_DEL_KERNEL(alibi_f32); + LM_GGML_METAL_DEL_KERNEL(im2col_f16); + LM_GGML_METAL_DEL_KERNEL(argsort_f32_i32_asc); + LM_GGML_METAL_DEL_KERNEL(argsort_f32_i32_desc); LM_GGML_METAL_DEL_KERNEL(cpy_f32_f16); LM_GGML_METAL_DEL_KERNEL(cpy_f32_f32); + LM_GGML_METAL_DEL_KERNEL(cpy_f32_q8_0); + LM_GGML_METAL_DEL_KERNEL(cpy_f32_q4_0); + LM_GGML_METAL_DEL_KERNEL(cpy_f32_q4_1); + //LM_GGML_METAL_DEL_KERNEL(cpy_f32_q5_0); + 
//LM_GGML_METAL_DEL_KERNEL(cpy_f32_q5_1); LM_GGML_METAL_DEL_KERNEL(cpy_f16_f16); LM_GGML_METAL_DEL_KERNEL(concat); LM_GGML_METAL_DEL_KERNEL(sqr); + LM_GGML_METAL_DEL_KERNEL(sum_rows); #undef LM_GGML_METAL_DEL_KERNEL @@ -457,6 +538,13 @@ int lm_ggml_metal_if_optimized(struct lm_ggml_metal_context * ctx) { return ctx->concur_list; } +// temporarily defined here for compatibility between ggml-backend and the old API +struct lm_ggml_backend_metal_buffer_context { + void * data; + + id metal; +}; + // finds the Metal buffer that contains the tensor data on the GPU device // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the // Metal buffer based on the host memory pointer @@ -466,6 +554,19 @@ int lm_ggml_metal_if_optimized(struct lm_ggml_metal_context * ctx) { const int64_t tsize = lm_ggml_nbytes(t); + // compatibility with ggml-backend + if (t->buffer && t->buffer->buft == lm_ggml_backend_metal_buffer_type()) { + struct lm_ggml_backend_metal_buffer_context * buf_ctx = (struct lm_ggml_backend_metal_buffer_context *) t->buffer->context; + + const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->data; + + LM_GGML_ASSERT(ioffs >= 0 && ioffs + tsize <= (int64_t) t->buffer->size); + + *offs = (size_t) ioffs; + + return buf_ctx->metal; + } + // find the view that contains the tensor fully for (int i = 0; i < ctx->n_buffers; ++i) { const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data; @@ -523,11 +624,11 @@ bool lm_ggml_metal_add_buffer( ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; if (ctx->buffers[ctx->n_buffers].metal == nil) { - LM_GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0); + LM_GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MiB\n", __func__, name, size_aligned / 1024.0 / 1024.0); return false; } - LM_GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0); + LM_GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MiB", __func__, name, size_aligned / 1024.0 / 1024.0); ++ctx->n_buffers; } else { @@ -547,11 +648,11 @@ bool lm_ggml_metal_add_buffer( ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; if (ctx->buffers[ctx->n_buffers].metal == nil) { - LM_GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0); + LM_GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MiB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0); return false; } - LM_GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i); + LM_GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MiB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i); if (i + size_step < size) { LM_GGML_METAL_LOG_INFO("\n"); } @@ -566,7 +667,7 @@ bool lm_ggml_metal_add_buffer( ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) { - LM_GGML_METAL_LOG_WARN(", warning: current allocated size is greater than the recommended max 
working set size\n", __func__); + LM_GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__); } else { LM_GGML_METAL_LOG_INFO("\n"); } @@ -688,6 +789,51 @@ void lm_ggml_metal_graph_find_concurrency( } } +static bool lm_ggml_metal_supports_op(const struct lm_ggml_tensor * op) { + switch (op->op) { + case LM_GGML_OP_UNARY: + switch (lm_ggml_get_unary_op(op)) { + case LM_GGML_UNARY_OP_SILU: + case LM_GGML_UNARY_OP_RELU: + case LM_GGML_UNARY_OP_GELU: + return true; + default: + return false; + } + case LM_GGML_OP_NONE: + case LM_GGML_OP_RESHAPE: + case LM_GGML_OP_VIEW: + case LM_GGML_OP_TRANSPOSE: + case LM_GGML_OP_PERMUTE: + case LM_GGML_OP_CONCAT: + case LM_GGML_OP_ADD: + case LM_GGML_OP_MUL: + case LM_GGML_OP_DIV: + case LM_GGML_OP_SCALE: + case LM_GGML_OP_SQR: + case LM_GGML_OP_SUM_ROWS: + case LM_GGML_OP_SOFT_MAX: + case LM_GGML_OP_RMS_NORM: + case LM_GGML_OP_NORM: + case LM_GGML_OP_ALIBI: + case LM_GGML_OP_ROPE: + case LM_GGML_OP_IM2COL: + case LM_GGML_OP_ARGSORT: + case LM_GGML_OP_DUP: + case LM_GGML_OP_CPY: + case LM_GGML_OP_CONT: + case LM_GGML_OP_MUL_MAT: + case LM_GGML_OP_MUL_MAT_ID: + return true; + case LM_GGML_OP_DIAG_MASK_INF: + case LM_GGML_OP_GET_ROWS: + { + return op->ne[0] % 4 == 0; + } + default: + return false; + } +} void lm_ggml_metal_graph_compute( struct lm_ggml_metal_context * ctx, struct lm_ggml_cgraph * gf) { @@ -744,6 +890,22 @@ void lm_ggml_metal_graph_compute( struct lm_ggml_tensor * src1 = gf->nodes[i]->src[1]; struct lm_ggml_tensor * dst = gf->nodes[i]; + switch (dst->op) { + case LM_GGML_OP_NONE: + case LM_GGML_OP_RESHAPE: + case LM_GGML_OP_VIEW: + case LM_GGML_OP_TRANSPOSE: + case LM_GGML_OP_PERMUTE: + { + // noop -> next node + } continue; + default: + { + } break; + } + + LM_GGML_ASSERT(lm_ggml_metal_supports_op(dst)); + const int64_t ne00 = src0 ? src0->ne[0] : 0; const int64_t ne01 = src0 ? src0->ne[1] : 0; const int64_t ne02 = src0 ? 
src0->ne[2] : 0; @@ -797,14 +959,6 @@ void lm_ggml_metal_graph_compute( //} switch (dst->op) { - case LM_GGML_OP_NONE: - case LM_GGML_OP_RESHAPE: - case LM_GGML_OP_VIEW: - case LM_GGML_OP_TRANSPOSE: - case LM_GGML_OP_PERMUTE: - { - // noop - } break; case LM_GGML_OP_CONCAT: { const int64_t nb = ne00; @@ -844,6 +998,8 @@ void lm_ggml_metal_graph_compute( [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; case LM_GGML_OP_ADD: + case LM_GGML_OP_MUL: + case LM_GGML_OP_DIV: { LM_GGML_ASSERT(lm_ggml_is_contiguous(src0)); LM_GGML_ASSERT(lm_ggml_is_contiguous(src1)); @@ -857,11 +1013,21 @@ void lm_ggml_metal_graph_compute( LM_GGML_ASSERT(ne11 == 1); nb = ne00 / 4; - [encoder setComputePipelineState:ctx->pipeline_add_row]; + switch (dst->op) { + case LM_GGML_OP_ADD: [encoder setComputePipelineState:ctx->pipeline_add_row]; break; + case LM_GGML_OP_MUL: [encoder setComputePipelineState:ctx->pipeline_mul_row]; break; + case LM_GGML_OP_DIV: [encoder setComputePipelineState:ctx->pipeline_div_row]; break; + default: LM_GGML_ASSERT(false); + } bcast_row = true; } else { - [encoder setComputePipelineState:ctx->pipeline_add]; + switch (dst->op) { + case LM_GGML_OP_ADD: [encoder setComputePipelineState:ctx->pipeline_add]; break; + case LM_GGML_OP_MUL: [encoder setComputePipelineState:ctx->pipeline_mul]; break; + case LM_GGML_OP_DIV: [encoder setComputePipelineState:ctx->pipeline_div]; break; + default: LM_GGML_ASSERT(false); + } } [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; @@ -902,31 +1068,6 @@ void lm_ggml_metal_graph_compute( [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } } break; - case LM_GGML_OP_MUL: - { - LM_GGML_ASSERT(lm_ggml_is_contiguous(src0)); - LM_GGML_ASSERT(lm_ggml_is_contiguous(src1)); - - // utilize float4 - LM_GGML_ASSERT(ne00 % 4 == 0); - const int64_t nb = ne00/4; - - if (lm_ggml_nelements(src1) == ne10) { - // src1 is a row - LM_GGML_ASSERT(ne11 == 1); - [encoder setComputePipelineState:ctx->pipeline_mul_row]; - } else { - [encoder setComputePipelineState:ctx->pipeline_mul]; - } - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&nb length:sizeof(nb) atIndex:3]; - - const int64_t n = lm_ggml_nelements(dst)/4; - - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; case LM_GGML_OP_SCALE: { LM_GGML_ASSERT(lm_ggml_is_contiguous(src0)); @@ -999,25 +1140,66 @@ void lm_ggml_metal_graph_compute( const int64_t n = lm_ggml_nelements(dst); [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; + case LM_GGML_OP_SUM_ROWS: + { + LM_GGML_ASSERT(src0->nb[0] == lm_ggml_type_size(src0->type)); + + [encoder setComputePipelineState:ctx->pipeline_sum_rows]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; + [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5]; + [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8]; + 
[encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9]; + [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:10]; + [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:11]; + [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12]; + [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:13]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16]; + [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:17]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:18]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:19]; + [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:20]; + [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:21]; + [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:22]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:23]; + [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:24]; + [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:25]; + + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; case LM_GGML_OP_SOFT_MAX: { int nth = 32; // SIMD width if (ne00%4 == 0) { + while (nth < ne00/4 && nth < 256) { + nth *= 2; + } [encoder setComputePipelineState:ctx->pipeline_soft_max_4]; } else { - do { + while (nth < ne00 && nth < 1024) { nth *= 2; - } while (nth <= ne00 && nth <= 1024); - nth /= 2; + } [encoder setComputePipelineState:ctx->pipeline_soft_max]; } - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; - [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; - [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; - [encoder setThreadgroupMemoryLength:MAX(16, nth/32*sizeof(float)) atIndex:0]; + + const float scale = ((float *) dst->op_params)[0]; + + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; + [encoder setBytes:&scale length:sizeof(scale) atIndex:6]; + [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; @@ -1046,9 +1228,13 @@ void lm_ggml_metal_graph_compute( case LM_GGML_OP_MUL_MAT: { LM_GGML_ASSERT(ne00 == ne10); - LM_GGML_ASSERT(ne03 == ne13); - const uint gqa = ne12/ne02; + // TODO: assert that dim2 and dim3 are contiguous + LM_GGML_ASSERT(ne12 % ne02 == 0); + LM_GGML_ASSERT(ne13 % ne03 == 0); + + const uint r2 = ne12/ne02; + const uint r3 = ne13/ne03; // find the break-even point where the matrix-matrix kernel becomes more efficient compared // to the matrix-vector kernel @@ -1083,7 +1269,7 @@ void lm_ggml_metal_graph_compute( !lm_ggml_is_transposed(src1) && src1t == LM_GGML_TYPE_F32 && ne00 % 32 == 0 && ne00 >= 64 && - ne11 > ne11_mm_min) { + (ne11 > ne11_mm_min || (lm_ggml_is_quantized(src0t) && ne12 > 1))) { //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12); switch (src0->type) { case LM_GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f32_f32]; break; @@ -1113,9 +1299,10 @@ void lm_ggml_metal_graph_compute( [encoder setBytes:&nb12 length:sizeof(nb12) 
atIndex:10]; [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:11]; [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:12]; - [encoder setBytes:&gqa length:sizeof(gqa) atIndex:13]; + [encoder setBytes:&r2 length:sizeof(r2) atIndex:13]; + [encoder setBytes:&r3 length:sizeof(r3) atIndex:14]; [encoder setThreadgroupMemoryLength:8192 atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne01 + 63)/64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne01 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; } else { int nth0 = 32; int nth1 = 1; @@ -1126,6 +1313,7 @@ void lm_ggml_metal_graph_compute( switch (src0t) { case LM_GGML_TYPE_F32: { + LM_GGML_ASSERT(src1t == LM_GGML_TYPE_F32); [encoder setComputePipelineState:ctx->pipeline_mul_mv_f32_f32]; nrows = 4; } break; @@ -1133,102 +1321,77 @@ void lm_ggml_metal_graph_compute( { nth0 = 32; nth1 = 1; - if (ne11 * ne12 < 4) { - [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_1row]; - } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) { - [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_l4]; - nrows = ne11; + if (src1t == LM_GGML_TYPE_F32) { + if (ne11 * ne12 < 4) { + [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_1row]; + } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) { + [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_l4]; + nrows = ne11; + } else { + [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32]; + nrows = 4; + } } else { - [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32]; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f16]; nrows = 4; } } break; case LM_GGML_TYPE_Q4_0: { - LM_GGML_ASSERT(ne02 == 1); - LM_GGML_ASSERT(ne12 == 1); - nth0 = 8; nth1 = 8; [encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_0_f32]; } break; case LM_GGML_TYPE_Q4_1: { - LM_GGML_ASSERT(ne02 == 1); - LM_GGML_ASSERT(ne12 == 1); - nth0 = 8; nth1 = 8; [encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_1_f32]; } break; case LM_GGML_TYPE_Q5_0: { - LM_GGML_ASSERT(ne02 == 1); - LM_GGML_ASSERT(ne12 == 1); - nth0 = 8; nth1 = 8; [encoder setComputePipelineState:ctx->pipeline_mul_mv_q5_0_f32]; } break; case LM_GGML_TYPE_Q5_1: { - LM_GGML_ASSERT(ne02 == 1); - LM_GGML_ASSERT(ne12 == 1); - nth0 = 8; nth1 = 8; [encoder setComputePipelineState:ctx->pipeline_mul_mv_q5_1_f32]; } break; case LM_GGML_TYPE_Q8_0: { - LM_GGML_ASSERT(ne02 == 1); - LM_GGML_ASSERT(ne12 == 1); - nth0 = 8; nth1 = 8; [encoder setComputePipelineState:ctx->pipeline_mul_mv_q8_0_f32]; } break; case LM_GGML_TYPE_Q2_K: { - LM_GGML_ASSERT(ne02 == 1); - LM_GGML_ASSERT(ne12 == 1); - nth0 = 2; nth1 = 32; [encoder setComputePipelineState:ctx->pipeline_mul_mv_q2_K_f32]; } break; case LM_GGML_TYPE_Q3_K: { - LM_GGML_ASSERT(ne02 == 1); - LM_GGML_ASSERT(ne12 == 1); - nth0 = 2; nth1 = 32; [encoder setComputePipelineState:ctx->pipeline_mul_mv_q3_K_f32]; } break; case LM_GGML_TYPE_Q4_K: { - LM_GGML_ASSERT(ne02 == 1); - LM_GGML_ASSERT(ne12 == 1); - nth0 = 4; //1; nth1 = 8; //32; [encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_K_f32]; } break; case LM_GGML_TYPE_Q5_K: { - LM_GGML_ASSERT(ne02 == 1); - LM_GGML_ASSERT(ne12 == 1); - nth0 = 2; nth1 = 32; [encoder setComputePipelineState:ctx->pipeline_mul_mv_q5_K_f32]; } break; case LM_GGML_TYPE_Q6_K: { - LM_GGML_ASSERT(ne02 == 1); - LM_GGML_ASSERT(ne12 == 1); - nth0 = 2; nth1 = 32; [encoder setComputePipelineState:ctx->pipeline_mul_mv_q6_K_f32]; @@ -1257,32 +1420,125 @@ 
void lm_ggml_metal_graph_compute( [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14]; [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:15]; [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:16]; - [encoder setBytes:&gqa length:sizeof(gqa) atIndex:17]; + [encoder setBytes:&r2 length:sizeof(r2) atIndex:17]; + [encoder setBytes:&r3 length:sizeof(r3) atIndex:18]; if (src0t == LM_GGML_TYPE_Q4_0 || src0t == LM_GGML_TYPE_Q4_1 || src0t == LM_GGML_TYPE_Q5_0 || src0t == LM_GGML_TYPE_Q5_1 || src0t == LM_GGML_TYPE_Q8_0 || src0t == LM_GGML_TYPE_Q2_K) { // || src0t == LM_GGML_TYPE_Q4_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else if (src0t == LM_GGML_TYPE_Q4_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else if (src0t == LM_GGML_TYPE_Q3_K) { #ifdef LM_GGML_QKK_64 - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; #else - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; #endif } else if (src0t == LM_GGML_TYPE_Q5_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else if (src0t == LM_GGML_TYPE_Q6_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else { int64_t ny = (ne11 + nrows - 1)/nrows; - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + } + } break; + case LM_GGML_OP_MUL_MAT_ID: + { + //LM_GGML_ASSERT(ne00 == ne10); + //LM_GGML_ASSERT(ne03 == ne13); + + LM_GGML_ASSERT(src0t == LM_GGML_TYPE_I32); + + const int n_as = ne00; + + // TODO: make this more general + LM_GGML_ASSERT(n_as <= 8); + + struct lm_ggml_tensor * src2 = gf->nodes[i]->src[2]; + + const int64_t ne20 = src2 ? src2->ne[0] : 0; + const int64_t ne21 = src2 ? src2->ne[1] : 0; + const int64_t ne22 = src2 ? src2->ne[2] : 0; + const int64_t ne23 = src2 ? src2->ne[3] : 0; LM_GGML_UNUSED(ne23); + + const uint64_t nb20 = src2 ? src2->nb[0] : 0; LM_GGML_UNUSED(nb20); + const uint64_t nb21 = src2 ? src2->nb[1] : 0; + const uint64_t nb22 = src2 ? src2->nb[2] : 0; + const uint64_t nb23 = src2 ? src2->nb[3] : 0; LM_GGML_UNUSED(nb23); + + const enum lm_ggml_type src2t = src2 ? 
src2->type : LM_GGML_TYPE_COUNT; LM_GGML_UNUSED(src2t); + + LM_GGML_ASSERT(!lm_ggml_is_transposed(src2)); + LM_GGML_ASSERT(!lm_ggml_is_transposed(src1)); + + LM_GGML_ASSERT(ne20 % 32 == 0); + // !!!!!!!!! TODO: this assert is probably required but not sure! + //LM_GGML_ASSERT(ne20 >= 64); + LM_GGML_ASSERT(src1t == LM_GGML_TYPE_F32); + + const uint r2 = ne12/ne22; + const uint r3 = ne13/ne23; + + // find the break-even point where the matrix-matrix kernel becomes more efficient compared + // to the matrix-vector kernel + int ne11_mm_min = 0; + + const int idx = ((int32_t *) dst->op_params)[0]; + + // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs + // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel + if ([ctx->device supportsFamily:MTLGPUFamilyApple7] && + ne11 > ne11_mm_min) { + switch (src2->type) { + case LM_GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_f32_f32]; break; + case LM_GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_f16_f32]; break; + case LM_GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q4_0_f32]; break; + case LM_GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q4_1_f32]; break; + case LM_GGML_TYPE_Q5_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q5_0_f32]; break; + case LM_GGML_TYPE_Q5_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q5_1_f32]; break; + case LM_GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q8_0_f32]; break; + case LM_GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q2_K_f32]; break; + case LM_GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q3_K_f32]; break; + case LM_GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q4_K_f32]; break; + case LM_GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q5_K_f32]; break; + case LM_GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q6_K_f32]; break; + default: LM_GGML_ASSERT(false && "MUL_MAT_ID not implemented"); } + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne20 length:sizeof(ne20) atIndex:3]; + [encoder setBytes:&ne22 length:sizeof(ne22) atIndex:4]; + [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:5]; + [encoder setBytes:&nb22 length:sizeof(nb22) atIndex:6]; + [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:7]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:8]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:9]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:10]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:11]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:12]; + [encoder setBytes:&r2 length:sizeof(r2) atIndex:13]; + [encoder setBytes:&r3 length:sizeof(r3) atIndex:14]; + [encoder setBytes:&idx length:sizeof(idx) atIndex:15]; + // TODO: how to make this an array? 
read Metal docs + for (int j = 0; j < n_as; ++j) { + struct lm_ggml_tensor * src_cur = dst->src[2 + j]; + + size_t offs_src_cur = 0; + id id_src_cur = lm_ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur); + + [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:16 + j]; + } + + [encoder setThreadgroupMemoryLength:8192 atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne21 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; } } break; case LM_GGML_OP_GET_ROWS: @@ -1321,15 +1577,19 @@ void lm_ggml_metal_graph_compute( float eps; memcpy(&eps, dst->op_params, sizeof(float)); - const int nth = MIN(512, ne00); + int nth = 32; // SIMD width + + while (nth < ne00/4 && nth < 1024) { + nth *= 2; + } [encoder setComputePipelineState:ctx->pipeline_rms_norm]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3]; - [encoder setBytes:&eps length:sizeof( float) atIndex:4]; - [encoder setThreadgroupMemoryLength:nth/32*sizeof(float) atIndex:0]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3]; + [encoder setBytes:&eps length:sizeof( float) atIndex:4]; + [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; const int64_t nrows = lm_ggml_nrows(src0); @@ -1348,7 +1608,7 @@ void lm_ggml_metal_graph_compute( [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3]; [encoder setBytes:&eps length:sizeof( float) atIndex:4]; - [encoder setThreadgroupMemoryLength:MAX(16, nth*sizeof(float)) atIndex:0]; + [encoder setThreadgroupMemoryLength:LM_GGML_PAD(nth*sizeof(float), 16) atIndex:0]; const int64_t nrows = lm_ggml_nrows(src0); @@ -1452,18 +1712,100 @@ void lm_ggml_metal_graph_compute( [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; + case LM_GGML_OP_IM2COL: + { + LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F16); + LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F32); + LM_GGML_ASSERT( dst->type == LM_GGML_TYPE_F16); + + const int32_t s0 = ((const int32_t *)(dst->op_params))[0]; + const int32_t s1 = ((const int32_t *)(dst->op_params))[1]; + const int32_t p0 = ((const int32_t *)(dst->op_params))[2]; + const int32_t p1 = ((const int32_t *)(dst->op_params))[3]; + const int32_t d0 = ((const int32_t *)(dst->op_params))[4]; + const int32_t d1 = ((const int32_t *)(dst->op_params))[5]; + const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1; + + const int32_t N = src1->ne[is_2D ? 3 : 2]; + const int32_t IC = src1->ne[is_2D ? 2 : 1]; + const int32_t IH = is_2D ? src1->ne[1] : 1; + const int32_t IW = src1->ne[0]; + + const int32_t KH = is_2D ? src0->ne[1] : 1; + const int32_t KW = src0->ne[0]; + + const int32_t OH = is_2D ? dst->ne[2] : 1; + const int32_t OW = dst->ne[1]; + + const int32_t CHW = IC * KH * KW; + + const int32_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4; + const int32_t ofs1 = src1->nb[is_2D ? 
2 : 1] / 4; + + switch (src0->type) { + case LM_GGML_TYPE_F32: LM_GGML_ASSERT(false && "not implemented"); break; + case LM_GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_im2col_f16]; break; + default: LM_GGML_ASSERT(false); + }; + + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ofs0 length:sizeof( int32_t) atIndex:2]; + [encoder setBytes:&ofs1 length:sizeof( int32_t) atIndex:3]; + [encoder setBytes:&IW length:sizeof( int32_t) atIndex:4]; + [encoder setBytes:&IH length:sizeof( int32_t) atIndex:5]; + [encoder setBytes:&CHW length:sizeof( int32_t) atIndex:6]; + [encoder setBytes:&s0 length:sizeof( int32_t) atIndex:7]; + [encoder setBytes:&s1 length:sizeof( int32_t) atIndex:8]; + [encoder setBytes:&p0 length:sizeof( int32_t) atIndex:9]; + [encoder setBytes:&p1 length:sizeof( int32_t) atIndex:10]; + [encoder setBytes:&d0 length:sizeof( int32_t) atIndex:11]; + [encoder setBytes:&d1 length:sizeof( int32_t) atIndex:12]; + + [encoder dispatchThreadgroups:MTLSizeMake(IC, OH, OW) threadsPerThreadgroup:MTLSizeMake(N, KH, KW)]; + } break; + case LM_GGML_OP_ARGSORT: + { + LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F32); + LM_GGML_ASSERT( dst->type == LM_GGML_TYPE_I32); + + const int nrows = lm_ggml_nrows(src0); + + enum lm_ggml_sort_order order = (enum lm_ggml_sort_order) dst->op_params[0]; + + switch (order) { + case LM_GGML_SORT_ASC: [encoder setComputePipelineState:ctx->pipeline_argsort_f32_i32_asc]; break; + case LM_GGML_SORT_DESC: [encoder setComputePipelineState:ctx->pipeline_argsort_f32_i32_desc]; break; + default: LM_GGML_ASSERT(false); + }; + + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + + [encoder dispatchThreadgroups:MTLSizeMake(1, nrows, 1) threadsPerThreadgroup:MTLSizeMake(ne00, 1, 1)]; + } break; case LM_GGML_OP_DUP: case LM_GGML_OP_CPY: case LM_GGML_OP_CONT: { - const int nth = MIN(1024, ne00); + LM_GGML_ASSERT(ne00 % lm_ggml_blck_size(src0->type) == 0); + + int nth = MIN(1024, ne00/lm_ggml_blck_size(src0->type)); switch (src0t) { case LM_GGML_TYPE_F32: { + LM_GGML_ASSERT(ne0 % lm_ggml_blck_size(dst->type) == 0); + switch (dstt) { - case LM_GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f16]; break; - case LM_GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f32]; break; + case LM_GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f16]; break; + case LM_GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f32]; break; + case LM_GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q8_0]; break; + case LM_GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q4_0]; break; + case LM_GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q4_1]; break; + //case LM_GGML_TYPE_Q5_0: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q5_0]; break; + //case LM_GGML_TYPE_Q5_1: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q5_1]; break; default: LM_GGML_ASSERT(false && "not implemented"); }; } break; @@ -1538,81 +1880,150 @@ void lm_ggml_metal_graph_compute( // backend interface -static const char * lm_ggml_backend_metal_name(lm_ggml_backend_t backend) { - return "Metal"; +static id g_backend_device = nil; +static int g_backend_device_ref_count = 0; - UNUSED(backend); +static id lm_ggml_backend_metal_get_device(void) { + if 
(g_backend_device == nil) { + g_backend_device = MTLCreateSystemDefaultDevice(); + } + + g_backend_device_ref_count++; + + return g_backend_device; } -static void lm_ggml_backend_metal_free(lm_ggml_backend_t backend) { - struct lm_ggml_metal_context * ctx = (struct lm_ggml_metal_context *)backend->context; - lm_ggml_metal_free(ctx); - free(backend); +static void lm_ggml_backend_metal_free_device(void) { + assert(g_backend_device_ref_count > 0); + + g_backend_device_ref_count--; + + if (g_backend_device_ref_count == 0) { + [g_backend_device release]; + g_backend_device = nil; + } } static void * lm_ggml_backend_metal_buffer_get_base(lm_ggml_backend_buffer_t buffer) { - return (void *)buffer->context; + struct lm_ggml_backend_metal_buffer_context * ctx = (struct lm_ggml_backend_metal_buffer_context *)buffer->context; + + return ctx->data; } static void lm_ggml_backend_metal_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) { - free(buffer->context); + struct lm_ggml_backend_metal_buffer_context * ctx = (struct lm_ggml_backend_metal_buffer_context *)buffer->context; + + [ctx->metal release]; + lm_ggml_backend_metal_free_device(); + + free(ctx->data); + free(ctx); + + UNUSED(buffer); +} + +static void lm_ggml_backend_metal_buffer_set_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor write out of bounds"); + LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + + memcpy((char *)tensor->data + offset, data, size); + + UNUSED(buffer); +} + +static void lm_ggml_backend_metal_buffer_get_tensor(lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) { + LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor read out of bounds"); + LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + + memcpy(data, (const char *)tensor->data + offset, size); + + UNUSED(buffer); +} + +static void lm_ggml_backend_metal_buffer_cpy_tensor_from(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) { + lm_ggml_backend_tensor_get(src, dst->data, 0, lm_ggml_nbytes(src)); + + UNUSED(buffer); +} + +static void lm_ggml_backend_metal_buffer_cpy_tensor_to(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) { + lm_ggml_backend_tensor_set(dst, src->data, 0, lm_ggml_nbytes(src)); + UNUSED(buffer); } static struct lm_ggml_backend_buffer_i metal_backend_buffer_i = { - /* .free_buffer = */ lm_ggml_backend_metal_buffer_free_buffer, - /* .get_base = */ lm_ggml_backend_metal_buffer_get_base, - /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes - /* .init_tensor = */ NULL, // no initialization required - /* .free_tensor = */ NULL, // no cleanup required + /* .free_buffer = */ lm_ggml_backend_metal_buffer_free_buffer, + /* .get_base = */ lm_ggml_backend_metal_buffer_get_base, + /* .init_tensor = */ NULL, + /* .set_tensor = */ lm_ggml_backend_metal_buffer_set_tensor, + /* .get_tensor = */ lm_ggml_backend_metal_buffer_get_tensor, + /* .cpy_tensor_from = */ lm_ggml_backend_metal_buffer_cpy_tensor_from, + /* .cpy_tensor_to = */ lm_ggml_backend_metal_buffer_cpy_tensor_to, }; -static lm_ggml_backend_buffer_t lm_ggml_backend_metal_alloc_buffer(lm_ggml_backend_t backend, size_t size) { - struct lm_ggml_metal_context * ctx = (struct lm_ggml_metal_context *)backend->context; +static lm_ggml_backend_buffer_t 
lm_ggml_backend_metal_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) { + struct lm_ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct lm_ggml_backend_metal_buffer_context)); - void * data = lm_ggml_metal_host_malloc(size); + const size_t size_page = sysconf(_SC_PAGESIZE); - // TODO: set proper name of the buffers - lm_ggml_metal_add_buffer(ctx, "backend", data, size, 0); + size_t size_aligned = size; + if ((size_aligned % size_page) != 0) { + size_aligned += (size_page - (size_aligned % size_page)); + } - return lm_ggml_backend_buffer_init(backend, metal_backend_buffer_i, data, size); + ctx->data = lm_ggml_metal_host_malloc(size); + ctx->metal = [lm_ggml_backend_metal_get_device() newBufferWithBytesNoCopy:ctx->data + length:size_aligned + options:MTLResourceStorageModeShared + deallocator:nil]; + + return lm_ggml_backend_buffer_init(buft, metal_backend_buffer_i, ctx, size); } -static size_t lm_ggml_backend_metal_get_alignment(lm_ggml_backend_t backend) { +static size_t lm_ggml_backend_metal_buffer_type_get_alignment(lm_ggml_backend_buffer_type_t buft) { return 32; - UNUSED(backend); + UNUSED(buft); } -static void lm_ggml_backend_metal_set_tensor_async(lm_ggml_backend_t backend, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor write out of bounds"); - LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); - - memcpy((char *)tensor->data + offset, data, size); +static bool lm_ggml_backend_metal_buffer_type_supports_backend(lm_ggml_backend_buffer_type_t buft, lm_ggml_backend_t backend) { + return lm_ggml_backend_is_metal(backend) || lm_ggml_backend_is_cpu(backend); - UNUSED(backend); + LM_GGML_UNUSED(buft); } -static void lm_ggml_backend_metal_get_tensor_async(lm_ggml_backend_t backend, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) { - LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor read out of bounds"); - LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); - - memcpy(data, (const char *)tensor->data + offset, size); +lm_ggml_backend_buffer_type_t lm_ggml_backend_metal_buffer_type(void) { + static struct lm_ggml_backend_buffer_type lm_ggml_backend_buffer_type_metal = { + /* .iface = */ { + /* .alloc_buffer = */ lm_ggml_backend_metal_buffer_type_alloc_buffer, + /* .get_alignment = */ lm_ggml_backend_metal_buffer_type_get_alignment, + /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes + /* .supports_backend = */ lm_ggml_backend_metal_buffer_type_supports_backend, + }, + /* .context = */ NULL, + }; - UNUSED(backend); + return &lm_ggml_backend_buffer_type_metal; } -static void lm_ggml_backend_metal_synchronize(lm_ggml_backend_t backend) { +static const char * lm_ggml_backend_metal_name(lm_ggml_backend_t backend) { + return "Metal"; + UNUSED(backend); } -static void lm_ggml_backend_metal_cpy_tensor_from(lm_ggml_backend_t backend, struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) { - lm_ggml_backend_tensor_get(src, dst->data, 0, lm_ggml_nbytes(src)); +static void lm_ggml_backend_metal_free(lm_ggml_backend_t backend) { + struct lm_ggml_metal_context * ctx = (struct lm_ggml_metal_context *)backend->context; + lm_ggml_metal_free(ctx); + free(backend); +} +static void lm_ggml_backend_metal_synchronize(lm_ggml_backend_t backend) { UNUSED(backend); } -static void lm_ggml_backend_metal_cpy_tensor_to(lm_ggml_backend_t backend, struct lm_ggml_tensor * src, struct lm_ggml_tensor * 
dst) { - lm_ggml_backend_tensor_set_async(dst, src->data, 0, lm_ggml_nbytes(src)); +static lm_ggml_backend_buffer_type_t lm_ggml_backend_metal_get_default_buffer_type(lm_ggml_backend_t backend) { + return lm_ggml_backend_metal_buffer_type(); UNUSED(backend); } @@ -1624,32 +2035,43 @@ static void lm_ggml_backend_metal_graph_compute(lm_ggml_backend_t backend, struc } static bool lm_ggml_backend_metal_supports_op(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op) { - return true; + return lm_ggml_metal_supports_op(op); + UNUSED(backend); - UNUSED(op); } static struct lm_ggml_backend_i metal_backend_i = { - /* .get_name = */ lm_ggml_backend_metal_name, - /* .free = */ lm_ggml_backend_metal_free, - /* .alloc_buffer = */ lm_ggml_backend_metal_alloc_buffer, - /* .get_alignment = */ lm_ggml_backend_metal_get_alignment, - /* .set_tensor_async = */ lm_ggml_backend_metal_set_tensor_async, - /* .get_tensor_async = */ lm_ggml_backend_metal_get_tensor_async, - /* .synchronize = */ lm_ggml_backend_metal_synchronize, - /* .cpy_tensor_from = */ lm_ggml_backend_metal_cpy_tensor_from, - /* .cpy_tensor_to = */ lm_ggml_backend_metal_cpy_tensor_to, - /* .graph_plan_create = */ NULL, // the metal implementation does not require creating graph plans atm - /* .graph_plan_free = */ NULL, - /* .graph_plan_compute = */ NULL, - /* .graph_compute = */ lm_ggml_backend_metal_graph_compute, - /* .supports_op = */ lm_ggml_backend_metal_supports_op, + /* .get_name = */ lm_ggml_backend_metal_name, + /* .free = */ lm_ggml_backend_metal_free, + /* .get_default_buffer_type = */ lm_ggml_backend_metal_get_default_buffer_type, + /* .set_tensor_async = */ NULL, + /* .get_tensor_async = */ NULL, + /* .cpy_tensor_from_async = */ NULL, + /* .cpy_tensor_to_async = */ NULL, + /* .synchronize = */ lm_ggml_backend_metal_synchronize, + /* .graph_plan_create = */ NULL, // the metal implementation does not require creating graph plans atm + /* .graph_plan_free = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ lm_ggml_backend_metal_graph_compute, + /* .supports_op = */ lm_ggml_backend_metal_supports_op, }; +// TODO: make a common log callback for all backends in ggml-backend +static void lm_ggml_backend_log_callback(enum lm_ggml_log_level level, const char * msg, void * user_data) { + fprintf(stderr, "%s", msg); + + UNUSED(level); + UNUSED(user_data); +} + lm_ggml_backend_t lm_ggml_backend_metal_init(void) { - struct lm_ggml_metal_context * ctx = malloc(sizeof(struct lm_ggml_metal_context)); + lm_ggml_metal_log_set_callback(lm_ggml_backend_log_callback, NULL); - ctx = lm_ggml_metal_init(LM_GGML_DEFAULT_N_THREADS); + struct lm_ggml_metal_context * ctx = lm_ggml_metal_init(LM_GGML_DEFAULT_N_THREADS); + + if (ctx == NULL) { + return NULL; + } lm_ggml_backend_t metal_backend = malloc(sizeof(struct lm_ggml_backend)); @@ -1666,7 +2088,26 @@ bool lm_ggml_backend_is_metal(lm_ggml_backend_t backend) { } void lm_ggml_backend_metal_set_n_cb(lm_ggml_backend_t backend, int n_cb) { + LM_GGML_ASSERT(lm_ggml_backend_is_metal(backend)); + struct lm_ggml_metal_context * ctx = (struct lm_ggml_metal_context *)backend->context; lm_ggml_metal_set_n_cb(ctx, n_cb); } + +bool lm_ggml_backend_metal_supports_family(lm_ggml_backend_t backend, int family) { + LM_GGML_ASSERT(lm_ggml_backend_is_metal(backend)); + + struct lm_ggml_metal_context * ctx = (struct lm_ggml_metal_context *)backend->context; + + return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)]; +} + +lm_ggml_backend_t lm_ggml_backend_reg_metal_init(const char * 
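A behavioural change worth noting in the hunk above: supports_op no longer reports true unconditionally but defers to lm_ggml_metal_supports_op, so the graph scheduler can route unsupported operations to another backend instead of hitting a missing kernel at run time. A toy sketch of that capability-table idea; the op list and coverage below are illustrative, not the real Metal support matrix:

#include <stdbool.h>

enum op { OP_ADD, OP_MUL_MAT, OP_ARGSORT, OP_COUNT };

// Per-backend capability check; the scheduler asks this before assigning a node.
static bool backend_supports(enum op op) {
    switch (op) {
        case OP_ADD:
        case OP_MUL_MAT:
            return true;    // assumed supported, for the sketch
        default:
            return false;   // scheduler falls back to a backend that does support it
    }
}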
params, void * user_data); // silence warning + +lm_ggml_backend_t lm_ggml_backend_reg_metal_init(const char * params, void * user_data) { + return lm_ggml_backend_metal_init(); + + LM_GGML_UNUSED(params); + LM_GGML_UNUSED(user_data); +} diff --git a/cpp/ggml-quants.c b/cpp/ggml-quants.c index abc8dbb..30bebbf 100644 --- a/cpp/ggml-quants.c +++ b/cpp/ggml-quants.c @@ -14,32 +14,12 @@ // #include -#if !defined(__aarch64__) -inline static int32_t vaddvq_s16(int16x8_t v) { - return - (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) + - (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) + - (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) + - (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7); -} - -inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) { - int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a)); - int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b)); - return vcombine_s16(a0, b0); -} - -inline static int32_t vaddvq_s32(int32x4_t v) { - return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3); -} -#endif - #else #ifdef __wasm_simd128__ #include #else -#ifdef __POWER9_VECTOR__ +#if defined(__POWER9_VECTOR__) || defined(__powerpc64__) #include #undef bool #define bool _Bool @@ -47,13 +27,15 @@ inline static int32_t vaddvq_s32(int32x4_t v) { #if defined(_MSC_VER) || defined(__MINGW32__) #include #else -#if !defined(__riscv) && !defined(__s390__) +#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) +#if !defined(__riscv) #include #endif #endif #endif #endif #endif +#endif #ifdef __riscv_v_intrinsic #include @@ -61,6 +43,7 @@ inline static int32_t vaddvq_s32(int32x4_t v) { #undef MIN #undef MAX + #define MIN(a, b) ((a) < (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? 
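In ggml-quants.c the x86 intrinsics header is now pulled in only when the compiler actually advertises an x86 SIMD target, instead of being included on everything that is "not riscv and not s390". The guard shape is restated below, with <immintrin.h> named as the header upstream ggml protects at this spot; only the structure of the conditional is the point of the sketch:

// Include x86 intrinsics only on x86 SIMD targets; other architectures
// (ARM, POWER, RISC-V, WASM) take their own include paths above.
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || \
    defined(__SSSE3__) || defined(__SSE3__)
#if !defined(__riscv)
#include <immintrin.h>
#endif
#endif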
(a) : (b)) @@ -283,9 +266,31 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 #endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) #if defined(__ARM_NEON) - #if !defined(__aarch64__) +// 64-bit compatibility + +// vaddvq_s16 +// vpaddq_s16 +// vaddvq_s32 +// vaddvq_f32 +// vmaxvq_f32 +// vcvtnq_s32_f32 + +inline static int32_t vaddvq_s16(int16x8_t v) { + return + (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) + + (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) + + (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) + + (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7); +} + +inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) { + int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a)); + int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b)); + return vcombine_s16(a0, b0); +} + inline static int32_t vaddvq_s32(int32x4_t v) { return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3); } @@ -311,6 +316,96 @@ inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) { return res; } +// vld1q_s16_x2 +// vld1q_u8_x2 +// vld1q_u8_x4 +// vld1q_s8_x2 +// vld1q_s8_x4 +// TODO: double-check these work correctly + +typedef struct lm_ggml_int16x8x2_t { + int16x8_t val[2]; +} lm_ggml_int16x8x2_t; + +inline static lm_ggml_int16x8x2_t lm_ggml_vld1q_s16_x2(const int16_t * ptr) { + lm_ggml_int16x8x2_t res; + + res.val[0] = vld1q_s16(ptr + 0); + res.val[1] = vld1q_s16(ptr + 8); + + return res; +} + +typedef struct lm_ggml_uint8x16x2_t { + uint8x16_t val[2]; +} lm_ggml_uint8x16x2_t; + +inline static lm_ggml_uint8x16x2_t lm_ggml_vld1q_u8_x2(const uint8_t * ptr) { + lm_ggml_uint8x16x2_t res; + + res.val[0] = vld1q_u8(ptr + 0); + res.val[1] = vld1q_u8(ptr + 16); + + return res; +} + +typedef struct lm_ggml_uint8x16x4_t { + uint8x16_t val[4]; +} lm_ggml_uint8x16x4_t; + +inline static lm_ggml_uint8x16x4_t lm_ggml_vld1q_u8_x4(const uint8_t * ptr) { + lm_ggml_uint8x16x4_t res; + + res.val[0] = vld1q_u8(ptr + 0); + res.val[1] = vld1q_u8(ptr + 16); + res.val[2] = vld1q_u8(ptr + 32); + res.val[3] = vld1q_u8(ptr + 48); + + return res; +} + +typedef struct lm_ggml_int8x16x2_t { + int8x16_t val[2]; +} lm_ggml_int8x16x2_t; + +inline static lm_ggml_int8x16x2_t lm_ggml_vld1q_s8_x2(const int8_t * ptr) { + lm_ggml_int8x16x2_t res; + + res.val[0] = vld1q_s8(ptr + 0); + res.val[1] = vld1q_s8(ptr + 16); + + return res; +} + +typedef struct lm_ggml_int8x16x4_t { + int8x16_t val[4]; +} lm_ggml_int8x16x4_t; + +inline static lm_ggml_int8x16x4_t lm_ggml_vld1q_s8_x4(const int8_t * ptr) { + lm_ggml_int8x16x4_t res; + + res.val[0] = vld1q_s8(ptr + 0); + res.val[1] = vld1q_s8(ptr + 16); + res.val[2] = vld1q_s8(ptr + 32); + res.val[3] = vld1q_s8(ptr + 48); + + return res; +} + +#else + +#define lm_ggml_int16x8x2_t int16x8x2_t +#define lm_ggml_uint8x16x2_t uint8x16x2_t +#define lm_ggml_uint8x16x4_t uint8x16x4_t +#define lm_ggml_int8x16x2_t int8x16x2_t +#define lm_ggml_int8x16x4_t int8x16x4_t + +#define lm_ggml_vld1q_s16_x2 vld1q_s16_x2 +#define lm_ggml_vld1q_u8_x2 vld1q_u8_x2 +#define lm_ggml_vld1q_u8_x4 vld1q_u8_x4 +#define lm_ggml_vld1q_s8_x2 vld1q_s8_x2 +#define lm_ggml_vld1q_s8_x4 vld1q_s8_x4 + #endif #endif @@ -1273,7 +1368,12 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f float max = x[0]; float sum_w = weights[0]; float sum_x = sum_w * x[0]; +#ifdef HAVE_BUGGY_APPLE_LINKER + // use 'volatile' to prevent unroll and work 
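The block above backfills NEON intrinsics that only exist on aarch64: the horizontal adds (vaddvq_*) and the multi-register loads (vld1q_*_x2/_x4) are re-implemented for 32-bit ARM, while on aarch64 the lm_ggml_* names simply alias the native intrinsics. A scalar, architecture-independent analogue of one load wrapper, to show the intent without requiring arm_neon.h:

#include <stdint.h>
#include <string.h>

// Plain-C analogue of lm_ggml_vld1q_u8_x2: a 32-byte load expressed as two
// 16-byte loads into a two-lane struct, mirroring the 32-bit ARM fallback
// that builds uint8x16x2_t from two vld1q_u8 calls.
typedef struct { uint8_t val[2][16]; } u8x16x2;

static inline u8x16x2 load_u8x16x2(const uint8_t * ptr) {
    u8x16x2 res;
    memcpy(res.val[0], ptr +  0, 16);
    memcpy(res.val[1], ptr + 16, 16);
    return res;
}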
around a bug in Apple ld64 1015.7 + for (volatile int i = 1; i < n; ++i) { +#else for (int i = 1; i < n; ++i) { +#endif if (x[i] < min) min = x[i]; if (x[i] > max) max = x[i]; float w = weights[i]; @@ -3557,7 +3657,7 @@ void lm_ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * res const int32x4_t vzero = vdupq_n_s32(0); #endif - int8x16x2_t q2bytes; + lm_ggml_int8x16x2_t q2bytes; uint8_t aux[16]; float sum = 0; @@ -3576,8 +3676,8 @@ void lm_ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * res vst1q_u8(aux, scales); const uint8x16_t mins = vshrq_n_u8(mins_and_scales, 4); - const int16x8x2_t q8sums = vld1q_s16_x2(y[i].bsums); - const int16x8x2_t mins16 = {vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))}; + const lm_ggml_int16x8x2_t q8sums = lm_ggml_vld1q_s16_x2(y[i].bsums); + const lm_ggml_int16x8x2_t mins16 = {vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))}; const int32x4_t s0 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[0]), vget_low_s16 (q8sums.val[0])), vmull_s16(vget_high_s16(mins16.val[0]), vget_high_s16(q8sums.val[0]))); const int32x4_t s1 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[1]), vget_low_s16 (q8sums.val[1])), @@ -3605,7 +3705,7 @@ void lm_ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * res #endif #define SHIFT_MULTIPLY_ACCUM_WITH_SCALE(shift, index)\ - q8bytes = vld1q_s8_x2(q8); q8 += 32;\ + q8bytes = lm_ggml_vld1q_s8_x2(q8); q8 += 32;\ q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[0], (shift)), m3));\ q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[1], (shift)), m3));\ MULTIPLY_ACCUM_WITH_SCALE((index)); @@ -3613,9 +3713,9 @@ void lm_ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * res for (int j = 0; j < QK_K/128; ++j) { - const uint8x16x2_t q2bits = vld1q_u8_x2(q2); q2 += 32; + const lm_ggml_uint8x16x2_t q2bits = lm_ggml_vld1q_u8_x2(q2); q2 += 32; - int8x16x2_t q8bytes = vld1q_s8_x2(q8); q8 += 32; + lm_ggml_int8x16x2_t q8bytes = lm_ggml_vld1q_s8_x2(q8); q8 += 32; q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[0], m3)); q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[1], m3)); MULTIPLY_ACCUM_WITH_SCALE(0); @@ -3949,7 +4049,7 @@ void lm_ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * res const int32x4_t vzero = vdupq_n_s32(0); #endif - int8x16x4_t q2bytes; + lm_ggml_int8x16x4_t q2bytes; uint32_t aux32[2]; const uint8_t * scales = (const uint8_t *)aux32; @@ -3974,7 +4074,7 @@ void lm_ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * res const uint8x16_t q2bits = vld1q_u8(q2); - const int8x16x4_t q8bytes = vld1q_s8_x4(q8); + const lm_ggml_int8x16x4_t q8bytes = lm_ggml_vld1q_s8_x4(q8); q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(q2bits, m3)); q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 2), m3)); @@ -4238,7 +4338,7 @@ void lm_ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * res const uint8x16_t m3 = vshlq_n_u8(m0, 3); const int8_t m32 = 32; - int8x16x4_t q3bytes; + lm_ggml_int8x16x4_t q3bytes; float sum = 0; @@ -4250,9 +4350,9 @@ void lm_ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * res const uint8_t * restrict qh = x[i].hmask; const int8_t * restrict q8 = y[i].qs; - uint8x16x2_t qhbits = vld1q_u8_x2(qh); + lm_ggml_uint8x16x2_t qhbits = lm_ggml_vld1q_u8_x2(qh); - uint8x16x4_t q3h; + lm_ggml_uint8x16x4_t q3h; int32_t isum 
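The HAVE_BUGGY_APPLE_LINKER branch in make_qkx2_quants works around a miscompilation seen with Apple ld64 1015.7 by making the loop counter volatile, which keeps the compiler from unrolling the loop. A stripped-down sketch of the pattern; only the anti-unroll trick is shown, the specific linker bug is out of scope here:

// The volatile induction variable forces one plain iteration per element,
// suppressing unrolling/vectorization of the loop body.
static float min_of(const float * x, int n) {
    float m = x[0];
#ifdef HAVE_BUGGY_APPLE_LINKER
    for (volatile int i = 1; i < n; ++i) {
#else
    for (int i = 1; i < n; ++i) {
#endif
        if (x[i] < m) m = x[i];
    }
    return m;
}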
= 0; @@ -4268,9 +4368,9 @@ void lm_ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * res for (int j = 0; j < QK_K/128; ++j) { - const uint8x16x2_t q3bits = vld1q_u8_x2(q3); q3 += 32; - const int8x16x4_t q8bytes_1 = vld1q_s8_x4(q8); q8 += 64; - const int8x16x4_t q8bytes_2 = vld1q_s8_x4(q8); q8 += 64; + const lm_ggml_uint8x16x2_t q3bits = lm_ggml_vld1q_u8_x2(q3); q3 += 32; + const lm_ggml_int8x16x4_t q8bytes_1 = lm_ggml_vld1q_s8_x4(q8); q8 += 64; + const lm_ggml_int8x16x4_t q8bytes_2 = lm_ggml_vld1q_s8_x4(q8); q8 += 64; q3h.val[0] = vshlq_n_u8(vbicq_u8(m0, qhbits.val[0]), 2); q3h.val[1] = vshlq_n_u8(vbicq_u8(m0, qhbits.val[1]), 2); @@ -4772,7 +4872,7 @@ void lm_ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * res const uint8x16_t m3b = vdupq_n_u8(0x3); const uint8x16_t mh = vdupq_n_u8(4); - int8x16x4_t q3bytes; + lm_ggml_int8x16x4_t q3bytes; uint16_t aux16[2]; int8_t * scales = (int8_t *)aux16; @@ -4781,11 +4881,11 @@ void lm_ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * res for (int i = 0; i < nb; ++i) { - uint8x16x4_t q3h; + lm_ggml_uint8x16x4_t q3h; const uint8x8_t hbits = vld1_u8(x[i].hmask); const uint8x16_t q3bits = vld1q_u8(x[i].qs); - const int8x16x4_t q8bytes = vld1q_s8_x4(y[i].qs); + const lm_ggml_int8x16x4_t q8bytes = lm_ggml_vld1q_s8_x4(y[i].qs); const uint16_t a = *(const uint16_t *)x[i].scales; aux16[0] = a & 0x0f0f; @@ -5134,8 +5234,8 @@ void lm_ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * res const int32x4_t mzero = vdupq_n_s32(0); #endif - int8x16x2_t q4bytes; - int8x16x2_t q8bytes; + lm_ggml_int8x16x2_t q4bytes; + lm_ggml_int8x16x2_t q8bytes; float sumf = 0; @@ -5170,17 +5270,17 @@ void lm_ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * res for (int j = 0; j < QK_K/64; ++j) { - const uint8x16x2_t q4bits = vld1q_u8_x2(q4); q4 += 32; + const lm_ggml_uint8x16x2_t q4bits = lm_ggml_vld1q_u8_x2(q4); q4 += 32; #ifdef __ARM_FEATURE_DOTPROD - q8bytes = vld1q_s8_x2(q8); q8 += 32; + q8bytes = lm_ggml_vld1q_s8_x2(q8); q8 += 32; q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b)); q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b)); const int32x4_t p1 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]); sumi1 += vaddvq_s32(p1) * scales[2*j+0]; - q8bytes = vld1q_s8_x2(q8); q8 += 32; + q8bytes = lm_ggml_vld1q_s8_x2(q8); q8 += 32; q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4)); q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4)); @@ -5188,7 +5288,7 @@ void lm_ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * res sumi2 += vaddvq_s32(p2) * scales[2*j+1]; #else - q8bytes = vld1q_s8_x2(q8); q8 += 32; + q8bytes = lm_ggml_vld1q_s8_x2(q8); q8 += 32; q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b)); q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b)); const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])), @@ -5197,7 +5297,7 @@ void lm_ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * res vmull_s8(vget_high_s8(q4bytes.val[1]), vget_high_s8(q8bytes.val[1]))); sumi1 += vaddvq_s16(vaddq_s16(p0, p1)) * scales[2*j+0]; - q8bytes = vld1q_s8_x2(q8); q8 += 32; + q8bytes = lm_ggml_vld1q_s8_x2(q8); q8 += 32; q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4)); q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4)); const int16x8_t p2 = 
vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])), @@ -5512,8 +5612,8 @@ void lm_ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * res float sumf = 0; - int8x16x2_t q4bytes; - int8x16x4_t q8bytes; + lm_ggml_int8x16x2_t q4bytes; + lm_ggml_int8x16x4_t q8bytes; float sum_mins = 0.f; @@ -5534,10 +5634,10 @@ void lm_ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * res const float d = y[i].d * (float)x[i].d[0]; - const uint8x16x2_t q4bits = vld1q_u8_x2(q4); + const lm_ggml_uint8x16x2_t q4bits = lm_ggml_vld1q_u8_x2(q4); #ifdef __ARM_FEATURE_DOTPROD - q8bytes = vld1q_s8_x4(q8); + q8bytes = lm_ggml_vld1q_s8_x4(q8); q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b)); q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b)); @@ -5551,7 +5651,7 @@ void lm_ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * res const int32_t sumi2 = vaddvq_s32(p2) * scales[1]; #else - q8bytes = vld1q_s8_x4(q8); + q8bytes = lm_ggml_vld1q_s8_x4(q8); q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b)); q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b)); const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])), @@ -5785,7 +5885,7 @@ void lm_ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * res const int32x4_t mzero = vdupq_n_s32(0); #endif - int8x16x4_t q5bytes; + lm_ggml_int8x16x4_t q5bytes; float sumf = 0; @@ -5815,16 +5915,16 @@ void lm_ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * res const uint8_t * restrict qh = x[i].qh; const int8_t * restrict q8 = y[i].qs; - uint8x16x2_t qhbits = vld1q_u8_x2(qh); + lm_ggml_uint8x16x2_t qhbits = lm_ggml_vld1q_u8_x2(qh); - uint8x16x4_t q5h; + lm_ggml_uint8x16x4_t q5h; int32_t sumi = 0; for (int j = 0; j < QK_K/64; ++j) { - const uint8x16x2_t q5bits = vld1q_u8_x2(q5); q5 += 32; - const int8x16x4_t q8bytes = vld1q_s8_x4(q8); q8 += 64; + const lm_ggml_uint8x16x2_t q5bits = lm_ggml_vld1q_u8_x2(q5); q5 += 32; + const lm_ggml_int8x16x4_t q8bytes = lm_ggml_vld1q_s8_x4(q8); q8 += 64; q5h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits.val[0]), 4); q5h.val[1] = vshlq_n_u8(vandq_u8(mone, qhbits.val[1]), 4); @@ -6218,8 +6318,8 @@ void lm_ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * res const int32x4_t mzero = vdupq_n_s32(0); #endif - int8x16x4_t q5bytes; - uint8x16x4_t q5h; + lm_ggml_int8x16x4_t q5bytes; + lm_ggml_uint8x16x4_t q5h; float sumf = 0; @@ -6234,8 +6334,8 @@ void lm_ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * res const uint8x8_t qhbits = vld1_u8(qh); - const uint8x16x2_t q5bits = vld1q_u8_x2(q5); - const int8x16x4_t q8bytes = vld1q_s8_x4(q8); + const lm_ggml_uint8x16x2_t q5bits = lm_ggml_vld1q_u8_x2(q5); + const lm_ggml_int8x16x4_t q8bytes = lm_ggml_vld1q_s8_x4(q8); const uint8x16_t htmp = vcombine_u8(qhbits, vshr_n_u8(qhbits, 1)); q5h.val[0] = vbicq_u8(mh, vshlq_n_u8(htmp, 4)); @@ -6511,8 +6611,8 @@ void lm_ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * res const uint8x16_t mone = vdupq_n_u8(3); - int8x16x4_t q6bytes; - uint8x16x4_t q6h; + lm_ggml_int8x16x4_t q6bytes; + lm_ggml_uint8x16x4_t q6h; for (int i = 0; i < nb; ++i) { @@ -6524,9 +6624,9 @@ void lm_ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * res const int8_t * restrict scale = x[i].scales; - const int16x8x2_t q8sums = vld1q_s16_x2(y[i].bsums); + const lm_ggml_int16x8x2_t q8sums = 
lm_ggml_vld1q_s16_x2(y[i].bsums); const int8x16_t scales = vld1q_s8(scale); - const int16x8x2_t q6scales = {vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))}; + const lm_ggml_int16x8x2_t q6scales = {vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))}; const int32x4_t prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[0]), vget_low_s16 (q6scales.val[0])), vmull_s16(vget_high_s16(q8sums.val[0]), vget_high_s16(q6scales.val[0]))), @@ -6538,9 +6638,9 @@ void lm_ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * res for (int j = 0; j < QK_K/128; ++j) { - uint8x16x2_t qhbits = vld1q_u8_x2(qh); qh += 32; - uint8x16x4_t q6bits = vld1q_u8_x4(q6); q6 += 64; - int8x16x4_t q8bytes = vld1q_s8_x4(q8); q8 += 64; + lm_ggml_uint8x16x2_t qhbits = lm_ggml_vld1q_u8_x2(qh); qh += 32; + lm_ggml_uint8x16x4_t q6bits = lm_ggml_vld1q_u8_x4(q6); q6 += 64; + lm_ggml_int8x16x4_t q8bytes = lm_ggml_vld1q_s8_x4(q8); q8 += 64; q6h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits.val[0]), 4); q6h.val[1] = vshlq_n_u8(vandq_u8(mone, qhbits.val[1]), 4); @@ -6583,7 +6683,7 @@ void lm_ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * res scale += 2; #endif - q8bytes = vld1q_s8_x4(q8); q8 += 64; + q8bytes = lm_ggml_vld1q_s8_x4(q8); q8 += 64; shifted = vshrq_n_u8(qhbits.val[0], 4); q6h.val[0] = vshlq_n_u8(vandq_u8(mone, shifted), 4); @@ -6987,8 +7087,8 @@ void lm_ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * res const uint8x16_t mone = vdupq_n_u8(3); - int8x16x4_t q6bytes; - uint8x16x4_t q6h; + lm_ggml_int8x16x4_t q6bytes; + lm_ggml_uint8x16x4_t q6h; for (int i = 0; i < nb; ++i) { @@ -7002,9 +7102,9 @@ void lm_ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * res int32_t isum = 0; - uint8x16_t qhbits = vld1q_u8(qh); - uint8x16x2_t q6bits = vld1q_u8_x2(q6); - int8x16x4_t q8bytes = vld1q_s8_x4(q8); + uint8x16_t qhbits = vld1q_u8(qh); + lm_ggml_uint8x16x2_t q6bits = lm_ggml_vld1q_u8_x2(q6); + lm_ggml_int8x16x4_t q8bytes = lm_ggml_vld1q_s8_x4(q8); q6h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits), 4); uint8x16_t shifted = vshrq_n_u8(qhbits, 2); diff --git a/cpp/ggml.c b/cpp/ggml.c index c0d6ff3..2dda9e7 100644 --- a/cpp/ggml.c +++ b/cpp/ggml.c @@ -100,6 +100,49 @@ typedef void * thread_ret_t; #include #endif +#if defined(__APPLE__) +#include +#endif + +#if (defined(__linux__) || defined(__APPLE__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)) && \ + (!defined(TARGET_OS_TV) && !defined(TARGET_OS_WATCH)) + +#include + +void lm_ggml_print_backtrace(void) { + /* + #include + #include + + void * trace[100]; + + int nptrs = backtrace(trace, sizeof(trace)/sizeof(trace[0])); + + backtrace_symbols_fd(trace, nptrs, STDERR_FILENO); + */ + + // backtrack_symbols does not show line numbers, use gdb instead + char attach[32]; + snprintf(attach, sizeof(attach), "attach %d", getpid()); + int pid = fork(); + if (pid == 0) { + execlp("gdb", "gdb", "--batch", + "-ex", "set style enabled on", + "-ex", attach, + "-ex", "bt -frame-info source-and-location", + "-ex", "detach", + "-ex", "quit", + NULL); + } else { + waitpid(pid, NULL, 0); + } +} +#else +void lm_ggml_print_backtrace(void) { + // platform not supported +} +#endif + /*#define LM_GGML_PERF*/ #define LM_GGML_DEBUG 0 #define LM_GGML_GELU_FP16 @@ -190,24 +233,6 @@ inline static void * lm_ggml_aligned_malloc(size_t size) { #define UNUSED LM_GGML_UNUSED #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0) -// -// tensor access macros -// - -#define 
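lm_ggml_print_backtrace gets a symbolized, line-numbered stack trace by forking a child that attaches gdb to the parent, rather than relying on backtrace_symbols (which only prints raw addresses). A condensed sketch of the fork-and-attach pattern; it assumes a gdb binary on PATH and that the OS permits ptrace attachment to the parent:

#include <stdio.h>
#include <unistd.h>
#include <sys/wait.h>

static void print_backtrace_via_gdb(void) {
    char attach[32];
    snprintf(attach, sizeof(attach), "attach %d", (int) getpid());

    pid_t pid = fork();
    if (pid == 0) {
        // child: run gdb in batch mode against the parent and print a backtrace
        execlp("gdb", "gdb", "--batch",
               "-ex", attach,
               "-ex", "bt",
               "-ex", "detach",
               "-ex", "quit",
               (char *) NULL);
        _exit(1);   // exec failed
    } else if (pid > 0) {
        waitpid(pid, NULL, 0);   // parent blocks until gdb detaches
    }
}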
LM_GGML_TENSOR_UNARY_OP_LOCALS \ - LM_GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ - LM_GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ - LM_GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \ - LM_GGML_TENSOR_LOCALS(size_t, nb, dst, nb) - -#define LM_GGML_TENSOR_BINARY_OP_LOCALS \ - LM_GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ - LM_GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ - LM_GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \ - LM_GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \ - LM_GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \ - LM_GGML_TENSOR_LOCALS(size_t, nb, dst, nb) - #if defined(LM_GGML_USE_ACCELERATE) #include #if defined(LM_GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions @@ -228,6 +253,12 @@ inline static void * lm_ggml_aligned_malloc(size_t size) { // floating point type used to accumulate sums typedef double lm_ggml_float; +#undef MIN +#undef MAX + +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + // // global data // @@ -561,6 +592,18 @@ lm_ggml_type_traits_t lm_ggml_internal_get_type_traits(enum lm_ggml_type type) { // simd mappings // +#if defined(__ARM_NEON) +#if !defined(__aarch64__) + +// 64-bit compatibility + +inline static float vaddvq_f32(float32x4_t v) { + return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3); +} + +#endif +#endif + // we define a common set of C macros which map to specific intrinsics based on the current architecture // we then implement the fundamental computation operations below using only these macros // adding support for new architectures requires to define the corresponding SIMD macros @@ -1352,6 +1395,7 @@ inline static void lm_ggml_vec_step_f32 (const int n, float * y, const float * x inline static void lm_ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); } inline static void lm_ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; } inline static void lm_ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; } +inline static void lm_ggml_vec_leaky_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 
x[i] : 0.1f*x[i]; } static const float GELU_COEF_A = 0.044715f; static const float GELU_QUICK_COEF = -1.702f; @@ -1551,6 +1595,7 @@ static const char * LM_GGML_OP_NAME[LM_GGML_OP_COUNT] = { "GROUP_NORM", "MUL_MAT", + "MUL_MAT_ID", "OUT_PROD", "SCALE", @@ -1572,17 +1617,13 @@ static const char * LM_GGML_OP_NAME[LM_GGML_OP_COUNT] = { "ROPE_BACK", "ALIBI", "CLAMP", - "CONV_1D", - "CONV_1D_STAGE_0", - "CONV_1D_STAGE_1", "CONV_TRANSPOSE_1D", - "CONV_2D", - "CONV_2D_STAGE_0", - "CONV_2D_STAGE_1", + "IM2COL", "CONV_TRANSPOSE_2D", "POOL_1D", "POOL_2D", "UPSCALE", + "ARGSORT", "FLASH_ATTN", "FLASH_FF", @@ -1609,7 +1650,7 @@ static const char * LM_GGML_OP_NAME[LM_GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(LM_GGML_OP_COUNT == 73, "LM_GGML_OP_COUNT != 73"); +static_assert(LM_GGML_OP_COUNT == 70, "LM_GGML_OP_COUNT != 70"); static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = { "none", @@ -1638,6 +1679,7 @@ static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = { "group_norm(x)", "X*Y", + "X[i]*Y", "X*Y", "x*v", @@ -1659,17 +1701,13 @@ static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = { "rope_back(x)", "alibi(x)", "clamp(x)", - "conv_1d(x)", - "conv_1d_stage_0(x)", - "conv_1d_stage_1(x)", "conv_transpose_1d(x)", - "conv_2d(x)", - "conv_2d_stage_0(x)", - "conv_2d_stage_1(x)", + "im2col(x)", "conv_transpose_2d(x)", "pool_1d(x)", "pool_2d(x)", "upscale(x)", + "argsort(x)", "flash_attn(x)", "flash_ff(x)", @@ -1696,10 +1734,28 @@ static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(LM_GGML_OP_COUNT == 73, "LM_GGML_OP_COUNT != 73"); +static_assert(LM_GGML_OP_COUNT == 70, "LM_GGML_OP_COUNT != 70"); static_assert(LM_GGML_OP_POOL_COUNT == 2, "LM_GGML_OP_POOL_COUNT != 2"); + +static const char * LM_GGML_UNARY_OP_NAME[LM_GGML_UNARY_OP_COUNT] = { + "ABS", + "SGN", + "NEG", + "STEP", + "TANH", + "ELU", + "RELU", + "GELU", + "GELU_QUICK", + "SILU", + "LEAKY", +}; + +static_assert(LM_GGML_UNARY_OP_COUNT == 11, "LM_GGML_UNARY_OP_COUNT != 11"); + + static_assert(sizeof(struct lm_ggml_object)%LM_GGML_MEM_ALIGN == 0, "lm_ggml_object size must be a multiple of LM_GGML_MEM_ALIGN"); static_assert(sizeof(struct lm_ggml_tensor)%LM_GGML_MEM_ALIGN == 0, "lm_ggml_tensor size must be a multiple of LM_GGML_MEM_ALIGN"); @@ -1719,18 +1775,13 @@ static void lm_ggml_setup_op_has_task_pass(void) { p[LM_GGML_OP_ACC ] = true; p[LM_GGML_OP_MUL_MAT ] = true; + p[LM_GGML_OP_MUL_MAT_ID ] = true; p[LM_GGML_OP_OUT_PROD ] = true; p[LM_GGML_OP_SET ] = true; p[LM_GGML_OP_GET_ROWS_BACK ] = true; p[LM_GGML_OP_DIAG_MASK_INF ] = true; p[LM_GGML_OP_DIAG_MASK_ZERO ] = true; - p[LM_GGML_OP_CONV_1D ] = true; - p[LM_GGML_OP_CONV_1D_STAGE_0 ] = true; - p[LM_GGML_OP_CONV_1D_STAGE_1 ] = true; p[LM_GGML_OP_CONV_TRANSPOSE_1D ] = true; - p[LM_GGML_OP_CONV_2D ] = true; - p[LM_GGML_OP_CONV_2D_STAGE_0 ] = true; - p[LM_GGML_OP_CONV_2D_STAGE_1 ] = true; p[LM_GGML_OP_CONV_TRANSPOSE_2D ] = true; p[LM_GGML_OP_FLASH_ATTN_BACK ] = true; p[LM_GGML_OP_CROSS_ENTROPY_LOSS ] = true; @@ -1977,6 +2028,20 @@ const char * lm_ggml_op_symbol(enum lm_ggml_op op) { return LM_GGML_OP_SYMBOL[op]; } +const char * lm_ggml_unary_op_name(enum lm_ggml_unary_op op) { + return LM_GGML_UNARY_OP_NAME[op]; +} + +const char * lm_ggml_op_desc(const struct lm_ggml_tensor * t) { + if (t->op == LM_GGML_OP_UNARY) { + enum lm_ggml_unary_op uop = lm_ggml_get_unary_op(t); + return lm_ggml_unary_op_name(uop); + } + else { + return lm_ggml_op_name(t->op); + } +} + size_t lm_ggml_element_size(const struct 
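The new lm_ggml_op_desc helper makes debug output readable for fused unary nodes: a tensor whose op is UNARY reports its sub-op name (for example "LEAKY") instead of the generic "UNARY". A toy sketch of that two-level lookup; the enums and tables below are illustrative, not the real ggml definitions:

enum op       { OP_NONE, OP_MUL_MAT, OP_UNARY };
enum unary_op { UNARY_RELU, UNARY_GELU, UNARY_LEAKY };

static const char * OP_NAME[]    = { "NONE", "MUL_MAT", "UNARY" };
static const char * UNARY_NAME[] = { "RELU", "GELU", "LEAKY" };

struct tensor { enum op op; enum unary_op uop; };

// Resolve the descriptive name: UNARY nodes report their stored sub-op.
static const char * op_desc(const struct tensor * t) {
    return t->op == OP_UNARY ? UNARY_NAME[t->uop] : OP_NAME[t->op];
}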
lm_ggml_tensor * tensor) { return lm_ggml_type_size(tensor->type); } @@ -3108,9 +3173,7 @@ static struct lm_ggml_tensor * lm_ggml_add_impl( struct lm_ggml_tensor * a, struct lm_ggml_tensor * b, bool inplace) { - // TODO: support less-strict constraint - // LM_GGML_ASSERT(lm_ggml_can_repeat(b, a)); - LM_GGML_ASSERT(lm_ggml_can_repeat_rows(b, a)); + LM_GGML_ASSERT(lm_ggml_can_repeat(b, a)); bool is_node = false; @@ -3325,9 +3388,7 @@ static struct lm_ggml_tensor * lm_ggml_mul_impl( struct lm_ggml_tensor * a, struct lm_ggml_tensor * b, bool inplace) { - // TODO: support less-strict constraint - // LM_GGML_ASSERT(lm_ggml_can_repeat(b, a)); - LM_GGML_ASSERT(lm_ggml_can_repeat_rows(b, a)); + LM_GGML_ASSERT(lm_ggml_can_repeat(b, a)); bool is_node = false; @@ -3372,7 +3433,7 @@ static struct lm_ggml_tensor * lm_ggml_div_impl( struct lm_ggml_tensor * a, struct lm_ggml_tensor * b, bool inplace) { - LM_GGML_ASSERT(lm_ggml_are_same_shape(a, b)); + LM_GGML_ASSERT(lm_ggml_can_repeat(b, a)); bool is_node = false; @@ -3769,6 +3830,14 @@ struct lm_ggml_tensor * lm_ggml_relu_inplace( return lm_ggml_unary_inplace(ctx, a, LM_GGML_UNARY_OP_RELU); } +// lm_ggml_leaky + +struct lm_ggml_tensor * lm_ggml_leaky( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a) { + return lm_ggml_unary(ctx, a, LM_GGML_UNARY_OP_LEAKY); +} + // lm_ggml_gelu struct lm_ggml_tensor * lm_ggml_gelu( @@ -4002,6 +4071,49 @@ struct lm_ggml_tensor * lm_ggml_mul_mat( return result; } +// lm_ggml_mul_mat_id + +struct lm_ggml_tensor * lm_ggml_mul_mat_id( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * as[], + struct lm_ggml_tensor * ids, + int id, + struct lm_ggml_tensor * b) { + + int64_t n_as = ids->ne[0]; + + LM_GGML_ASSERT(ids->type == LM_GGML_TYPE_I32); + LM_GGML_ASSERT(lm_ggml_is_vector(ids)); + LM_GGML_ASSERT(n_as > 0 && n_as <= LM_GGML_MAX_SRC - 2); + LM_GGML_ASSERT(id >= 0 && id < n_as); + + bool is_node = false; + + if (as[0]->grad || b->grad) { + is_node = true; + } + + const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] }; + struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F32, MAX(as[0]->n_dims, b->n_dims), ne); + + lm_ggml_set_op_params_i32(result, 0, id); + + result->op = LM_GGML_OP_MUL_MAT_ID; + result->grad = is_node ? 
lm_ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = ids; + result->src[1] = b; + + for (int64_t i = 0; i < n_as; i++) { + struct lm_ggml_tensor * a = as[i]; + LM_GGML_ASSERT(lm_ggml_are_same_shape(as[0], a)); + LM_GGML_ASSERT(lm_ggml_can_mul_mat(a, b)); + LM_GGML_ASSERT(!lm_ggml_is_transposed(a)); + result->src[i + 2] = a; + } + + return result; +} + // lm_ggml_out_prod struct lm_ggml_tensor * lm_ggml_out_prod( @@ -4155,7 +4267,7 @@ struct lm_ggml_tensor * lm_ggml_set_2d_inplace( struct lm_ggml_tensor * b, size_t nb1, size_t offset) { - return lm_ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false); + return lm_ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, true); } // lm_ggml_cpy @@ -4772,7 +4884,17 @@ struct lm_ggml_tensor * lm_ggml_diag_mask_zero_inplace( static struct lm_ggml_tensor * lm_ggml_soft_max_impl( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, + struct lm_ggml_tensor * mask, + float scale, bool inplace) { + LM_GGML_ASSERT(lm_ggml_is_contiguous(a)); + if (mask) { + LM_GGML_ASSERT(lm_ggml_is_contiguous(mask)); + LM_GGML_ASSERT(mask->ne[2] == 1); + LM_GGML_ASSERT(mask->ne[3] == 1); + LM_GGML_ASSERT(lm_ggml_can_repeat_rows(mask, a)); + } + bool is_node = false; if (a->grad) { @@ -4781,9 +4903,13 @@ static struct lm_ggml_tensor * lm_ggml_soft_max_impl( struct lm_ggml_tensor * result = inplace ? lm_ggml_view_tensor(ctx, a) : lm_ggml_dup_tensor(ctx, a); + float params[] = { scale }; + lm_ggml_set_op_params(result, params, sizeof(params)); + result->op = LM_GGML_OP_SOFT_MAX; result->grad = is_node ? lm_ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; + result->src[1] = mask; return result; } @@ -4791,13 +4917,21 @@ static struct lm_ggml_tensor * lm_ggml_soft_max_impl( struct lm_ggml_tensor * lm_ggml_soft_max( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a) { - return lm_ggml_soft_max_impl(ctx, a, false); + return lm_ggml_soft_max_impl(ctx, a, NULL, 1.0f, false); } struct lm_ggml_tensor * lm_ggml_soft_max_inplace( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a) { - return lm_ggml_soft_max_impl(ctx, a, true); + return lm_ggml_soft_max_impl(ctx, a, NULL, 1.0f, true); +} + +struct lm_ggml_tensor * lm_ggml_soft_max_ext( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a, + struct lm_ggml_tensor * mask, + float scale) { + return lm_ggml_soft_max_impl(ctx, a, mask, scale, false); } // lm_ggml_soft_max_back @@ -5076,82 +5210,6 @@ static int64_t lm_ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int return (ins + 2 * p - d * (ks - 1) - 1) / s + 1; } -// im2col: [N, IC, IL] => [N, OL, IC*K] -// a: [OC,IC, K] -// b: [N, IC, IL] -// result: [N, OL, IC*K] -static struct lm_ggml_tensor * lm_ggml_conv_1d_stage_0( - struct lm_ggml_context * ctx, - struct lm_ggml_tensor * a, - struct lm_ggml_tensor * b, - int s0, - int p0, - int d0) { - LM_GGML_ASSERT(a->ne[1] == b->ne[1]); - bool is_node = false; - - if (a->grad || b->grad) { - LM_GGML_ASSERT(false); // TODO: implement backward - is_node = true; - } - - const int64_t OL = lm_ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0); - - const int64_t ne[4] = { - a->ne[1] * a->ne[0], - OL, - b->ne[2], - 1, - }; - struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F16, 4, ne); - - int32_t params[] = { s0, p0, d0 }; - lm_ggml_set_op_params(result, params, sizeof(params)); - - result->op = LM_GGML_OP_CONV_1D_STAGE_0; - result->grad = is_node ? 
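lm_ggml_mul_mat_id is the routing primitive for mixture-of-experts style layers: all candidate matrices are attached as src[2..], the I32 ids vector is src[0], and the op parameter says which entry of ids selects the matrix that actually multiplies b. A shape-free sketch of how the forward pass resolves the selected expert (plain C, no ggml types):

#include <assert.h>
#include <stdint.h>

// Mirror of the selection logic in lm_ggml_compute_forward_mul_mat_id:
// `id` indexes the ids vector, and the value stored there picks the expert.
static int resolve_expert(const int32_t * ids, int n_as, int id) {
    assert(id >= 0 && id < n_as);
    const int a_id = ids[id];
    assert(a_id >= 0 && a_id < n_as);
    return a_id;   // the caller then runs an ordinary mat-mul with as[a_id]
}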
lm_ggml_dup_tensor(ctx, result) : NULL; - result->src[0] = a; - result->src[1] = b; - - return result; -} - -// lm_ggml_conv_1d_stage_1 - -// gemm: [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K] -// a: [OC, IC, K] -// b: [N, OL, IC * K] -// result: [N, OC, OL] -static struct lm_ggml_tensor * lm_ggml_conv_1d_stage_1( - struct lm_ggml_context * ctx, - struct lm_ggml_tensor * a, - struct lm_ggml_tensor * b) { - - bool is_node = false; - - if (a->grad || b->grad) { - LM_GGML_ASSERT(false); // TODO: implement backward - is_node = true; - } - - const int64_t ne[4] = { - b->ne[1], - a->ne[2], - b->ne[2], - 1, - }; - struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F32, 4, ne); - - result->op = LM_GGML_OP_CONV_1D_STAGE_1; - result->grad = is_node ? lm_ggml_dup_tensor(ctx, result) : NULL; - result->src[0] = a; - result->src[1] = b; - - return result; -} - -// lm_ggml_conv_1d - LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_1d( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, @@ -5159,43 +5217,17 @@ LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_1d( int s0, int p0, int d0) { - struct lm_ggml_tensor * result = lm_ggml_conv_1d_stage_0(ctx, a, b, s0, p0, d0); - result = lm_ggml_conv_1d_stage_1(ctx, a, result); - return result; -} + struct lm_ggml_tensor * im2col = lm_ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false); // [N, OL, IC * K] -// LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_1d( -// struct lm_ggml_context * ctx, -// struct lm_ggml_tensor * a, -// struct lm_ggml_tensor * b, -// int s0, -// int p0, -// int d0) { -// LM_GGML_ASSERT(lm_ggml_is_matrix(b)); -// LM_GGML_ASSERT(a->ne[1] == b->ne[1]); -// bool is_node = false; + struct lm_ggml_tensor * result = + lm_ggml_mul_mat(ctx, + lm_ggml_reshape_2d(ctx, im2col, im2col->ne[0], (im2col->ne[2] * im2col->ne[1])), // [N, OL, IC * K] => [N*OL, IC * K] + lm_ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1]), a->ne[2])); // [OC,IC, K] => [OC, IC * K] -// if (a->grad || b->grad) { -// LM_GGML_ASSERT(false); // TODO: implement backward -// is_node = true; -// } + result = lm_ggml_reshape_3d(ctx, result, im2col->ne[1], a->ne[2], im2col->ne[2]); // [N, OC, OL] -// const int64_t ne[4] = { -// lm_ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0), -// a->ne[2], 1, 1, -// }; -// struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F32, 2, ne); - -// int32_t params[] = { s0, p0, d0 }; -// lm_ggml_set_op_params(result, params, sizeof(params)); - -// result->op = LM_GGML_OP_CONV_1D; -// result->grad = is_node ? 
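conv_1d is no longer a dedicated two-stage operator; it is now expressed as im2col followed by a regular mat-mul and a reshape, so the convolution rides on the existing mat-mul kernels. A tiny, unoptimized end-to-end illustration of that decomposition (stride 1, no padding or dilation, fixed sizes, scalar C):

#include <stdio.h>

#define IC 2
#define IL 5
#define K  3
#define OC 1
#define OL (IL - K + 1)

int main(void) {
    const float x[IC][IL]    = { {1, 2, 3, 4, 5}, {5, 4, 3, 2, 1} };
    const float w[OC][IC][K] = { { {1, 0, -1}, {0, 1, 0} } };

    // im2col: each output position gathers its receptive field -> [OL][IC*K]
    float cols[OL][IC*K];
    for (int ol = 0; ol < OL; ++ol)
        for (int ic = 0; ic < IC; ++ic)
            for (int k = 0; k < K; ++k)
                cols[ol][ic*K + k] = x[ic][ol + k];

    // mat-mul against the kernel flattened to [OC][IC*K]
    float y[OC][OL] = {{0}};
    for (int oc = 0; oc < OC; ++oc)
        for (int ol = 0; ol < OL; ++ol)
            for (int j = 0; j < IC*K; ++j)
                y[oc][ol] += ((const float *) w[oc])[j] * cols[ol][j];

    for (int ol = 0; ol < OL; ++ol) printf("%g ", y[0][ol]);   // prints: 2 1 0
    printf("\n");
    return 0;
}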
lm_ggml_dup_tensor(ctx, result) : NULL; -// result->src[0] = a; -// result->src[1] = b; - -// return result; -// } + return result; +} // lm_ggml_conv_1d_ph @@ -5258,7 +5290,7 @@ LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_transpose_1d( // a: [OC,IC, KH, KW] // b: [N, IC, IH, IW] // result: [N, OH, OW, IC*KH*KW] -static struct lm_ggml_tensor * lm_ggml_conv_2d_stage_0( +struct lm_ggml_tensor * lm_ggml_im2col( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, struct lm_ggml_tensor * b, @@ -5267,9 +5299,14 @@ static struct lm_ggml_tensor * lm_ggml_conv_2d_stage_0( int p0, int p1, int d0, - int d1) { + int d1, + bool is_2D) { - LM_GGML_ASSERT(a->ne[2] == b->ne[2]); + if(is_2D) { + LM_GGML_ASSERT(a->ne[2] == b->ne[2]); + } else { + LM_GGML_ASSERT(a->ne[1] == b->ne[1]); + } bool is_node = false; if (a->grad || b->grad) { @@ -5277,81 +5314,51 @@ static struct lm_ggml_tensor * lm_ggml_conv_2d_stage_0( is_node = true; } - const int64_t OH = lm_ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1); - const int64_t OW = lm_ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0); + const int64_t OH = is_2D ? lm_ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1) : 0; + const int64_t OW = lm_ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0); const int64_t ne[4] = { - a->ne[2] * a->ne[1] * a->ne[0], + is_2D ? (a->ne[2] * a->ne[1] * a->ne[0]) : a->ne[1] * a->ne[0], OW, - OH, - b->ne[3], + is_2D ? OH : b->ne[2], + is_2D ? b->ne[3] : 1, }; - struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F16, 4, ne); - int32_t params[] = { s0, s1, p0, p1, d0, d1 }; + struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F16, 4, ne); + int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) }; lm_ggml_set_op_params(result, params, sizeof(params)); - result->op = LM_GGML_OP_CONV_2D_STAGE_0; - result->grad = is_node ? lm_ggml_dup_tensor(ctx, result) : NULL; - result->src[0] = a; - result->src[1] = b; - - return result; - -} - -// gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] -// a: [OC, IC, KH, KW] -// b: [N, OH, OW, IC * KH * KW] -// result: [N, OC, OH, OW] -static struct lm_ggml_tensor * lm_ggml_conv_2d_stage_1( - struct lm_ggml_context * ctx, - struct lm_ggml_tensor * a, - struct lm_ggml_tensor * b) { - - bool is_node = false; - - if (a->grad || b->grad) { - LM_GGML_ASSERT(false); // TODO: implement backward - is_node = true; - } - - const int64_t ne[4] = { - b->ne[1], - b->ne[2], - a->ne[3], - b->ne[3], - }; - struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F32, 4, ne); - - result->op = LM_GGML_OP_CONV_2D_STAGE_1; + result->op = LM_GGML_OP_IM2COL; result->grad = is_node ? 
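The generalized im2col sizes its output with the usual convolution arithmetic, OL = (IL + 2*p - d*(K - 1) - 1)/s + 1, via lm_ggml_calc_conv_output_size. A one-function restatement with a worked example:

#include <stdint.h>

// Output length of a convolution along one axis: input ins, kernel ks,
// stride s, padding p, dilation d (same arithmetic as lm_ggml_calc_conv_output_size).
static int64_t conv_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
    return (ins + 2*p - d*(ks - 1) - 1) / s + 1;
}

// Example: ins = 224, ks = 3, s = 2, p = 1, d = 1
//   (224 + 2 - 2 - 1)/2 + 1 = 111 + 1 = 112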
lm_ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; return result; - } // a: [OC,IC, KH, KW] // b: [N, IC, IH, IW] // result: [N, OC, OH, OW] struct lm_ggml_tensor * lm_ggml_conv_2d( - struct lm_ggml_context * ctx, - struct lm_ggml_tensor * a, - struct lm_ggml_tensor * b, - int s0, - int s1, - int p0, - int p1, - int d0, - int d1) { + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a, + struct lm_ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1) { + struct lm_ggml_tensor * im2col = lm_ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true); // [N, OH, OW, IC * KH * KW] - struct lm_ggml_tensor * result = lm_ggml_conv_2d_stage_0(ctx, a, b, s0, s1, p0, p1, d0, d1); // [N, OH, OW, IC * KH * KW] - result = lm_ggml_conv_2d_stage_1(ctx, a, result); + struct lm_ggml_tensor * result = + lm_ggml_mul_mat(ctx, + lm_ggml_reshape_2d(ctx, im2col, im2col->ne[0], im2col->ne[3] * im2col->ne[2] * im2col->ne[1]), // [N, OH, OW, IC * KH * KW] => [N*OH*OW, IC * KH * KW] + lm_ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1] * a->ne[2]), a->ne[3])); // [OC,IC, KH, KW] => [OC, IC * KH * KW] - return result; + result = lm_ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], a->ne[3], im2col->ne[3]); // [N, OC, OH, OW] + return result; } // lm_ggml_conv_2d_sk_p0 @@ -5411,7 +5418,7 @@ struct lm_ggml_tensor * lm_ggml_conv_transpose_2d_p0( // lm_ggml_pool_* -static int64_t lm_ggml_calc_pool_output_size(int64_t ins, int ks, int s, int p) { +static int64_t lm_ggml_calc_pool_output_size(int64_t ins, int ks, int s, float p) { return (ins + 2 * p - ks) / s + 1; } @@ -5458,8 +5465,8 @@ struct lm_ggml_tensor * lm_ggml_pool_2d( int k1, int s0, int s1, - int p0, - int p1) { + float p0, + float p1) { bool is_node = false; @@ -5519,6 +5526,43 @@ struct lm_ggml_tensor * lm_ggml_upscale( return lm_ggml_upscale_impl(ctx, a, scale_factor); } +// lm_ggml_argsort + +struct lm_ggml_tensor * lm_ggml_argsort( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a, + enum lm_ggml_sort_order order) { + bool is_node = false; + + struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_I32, a->n_dims, a->ne); + + lm_ggml_set_op_params_i32(result, 0, (int32_t) order); + + result->op = LM_GGML_OP_ARGSORT; + result->grad = is_node ? 
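conv_2d follows the same recipe in four dimensions: im2col produces [N, OH, OW, IC*KH*KW], both operands are flattened to 2-D for the mat-mul, and the result is reshaped back to [N, OC, OH, OW]. A comment-only shape walk-through with illustrative sizes (N = 1, IC = 3, OC = 8, KH = KW = 3, OH = OW = 32), mirroring the bracket notation used in the comments above:

// im2col:  b [N, IC, IH, IW]          -> [N, OH, OW, IC*KH*KW]  = [1, 32, 32, 27]
// reshape: kernel a [OC, IC, KH, KW]  -> [OC, IC*KH*KW]         = [8, 27]
// reshape: im2col                     -> [N*OH*OW, IC*KH*KW]    = [1024, 27]
// mat-mul: contract over IC*KH*KW     -> [N*OH*OW, OC]
// reshape: back to [N, OC, OH, OW]                              = [1, 8, 32, 32]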
lm_ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// lm_ggml_top_k + +struct lm_ggml_tensor * lm_ggml_top_k( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a, + int k) { + LM_GGML_ASSERT(a->ne[0] >= k); + + struct lm_ggml_tensor * result = lm_ggml_argsort(ctx, a, LM_GGML_SORT_DESC); + + result = lm_ggml_view_4d(ctx, result, + k, result->ne[1], result->ne[2], result->ne[3], + result->nb[1], result->nb[2], result->nb[3], + 0); + + return result; +} + // lm_ggml_flash_attn struct lm_ggml_tensor * lm_ggml_flash_attn( @@ -6878,7 +6922,7 @@ static void lm_ggml_compute_forward_add_f32( const struct lm_ggml_tensor * src0, const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { - LM_GGML_ASSERT(lm_ggml_can_repeat_rows(src1, src0) && lm_ggml_are_same_shape(src0, dst)); + LM_GGML_ASSERT(lm_ggml_can_repeat(src1, src0) && lm_ggml_are_same_shape(src0, dst)); if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { return; @@ -6911,16 +6955,19 @@ static void lm_ggml_compute_forward_add_f32( const int64_t i13 = i03 % ne13; const int64_t i12 = i02 % ne12; const int64_t i11 = i01 % ne11; + const int64_t nr0 = ne00 / ne10; float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); + for (int64_t r = 0; r < nr0; ++r) { #ifdef LM_GGML_USE_ACCELERATE - vDSP_vadd(src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00); + vDSP_vadd(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10); #else - lm_ggml_vec_add_f32(ne00, dst_ptr, src0_ptr, src1_ptr); + lm_ggml_vec_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); #endif + } } } else { // src1 is not contiguous @@ -6937,8 +6984,9 @@ static void lm_ggml_compute_forward_add_f32( float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); - for (int i0 = 0; i0 < ne0; i0++) { - float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10); + for (int64_t i0 = 0; i0 < ne0; ++i0) { + const int64_t i10 = i0 % ne10; + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10); dst_ptr[i0] = src0_ptr[i0] + *src1_ptr; } @@ -7658,7 +7706,7 @@ static void lm_ggml_compute_forward_mul_f32( const struct lm_ggml_tensor * src0, const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { - LM_GGML_ASSERT(lm_ggml_can_repeat_rows(src1, src0) && lm_ggml_are_same_shape(src0, dst)); + LM_GGML_ASSERT(lm_ggml_can_repeat(src1, src0) && lm_ggml_are_same_shape(src0, dst)); if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { return; @@ -7681,7 +7729,6 @@ static void lm_ggml_compute_forward_mul_f32( LM_GGML_ASSERT( nb0 == sizeof(float)); LM_GGML_ASSERT(nb00 == sizeof(float)); - LM_GGML_ASSERT(ne00 == ne10); if (nb10 == sizeof(float)) { for (int64_t ir = ith; ir < nr; ir += nth) { @@ -7693,20 +7740,21 @@ static void lm_ggml_compute_forward_mul_f32( const int64_t i13 = i03 % ne13; const int64_t i12 = i02 % ne12; const int64_t i11 = i01 % ne11; + const int64_t nr0 = ne00 / ne10; float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 
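lm_ggml_top_k is simply a descending argsort followed by a view of the first k columns, so it yields the indices (not the values) of the k largest entries per row. A scalar sketch of the same idea using qsort; the file-scope comparator state is a shortcut for the sketch only and is not thread-safe:

#include <stdlib.h>

static const float * g_vals;   // values being argsorted (sketch-only global)

static int cmp_desc(const void * pa, const void * pb) {
    const int a = *(const int *) pa, b = *(const int *) pb;
    return (g_vals[a] < g_vals[b]) - (g_vals[a] > g_vals[b]);   // larger value first
}

// Fill idx[0..k) with the indices of the k largest elements of x[0..n).
static void top_k_indices(const float * x, int n, int k, int * idx) {
    int * order = malloc(n * sizeof(int));
    for (int i = 0; i < n; ++i) order[i] = i;
    g_vals = x;
    qsort(order, n, sizeof(int), cmp_desc);          // the "argsort" step
    for (int i = 0; i < k; ++i) idx[i] = order[i];   // the "view of the first k" step
    free(order);
}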
+ i11*nb11); + for (int64_t r = 0 ; r < nr0; ++r) { #ifdef LM_GGML_USE_ACCELERATE - UNUSED(lm_ggml_vec_mul_f32); + UNUSED(lm_ggml_vec_mul_f32); - vDSP_vmul( src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00); + vDSP_vmul(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10); #else - lm_ggml_vec_mul_f32(ne00, dst_ptr, src0_ptr, src1_ptr); + lm_ggml_vec_mul_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); #endif - // } - // } + } } } else { // src1 is not contiguous @@ -7724,8 +7772,9 @@ static void lm_ggml_compute_forward_mul_f32( float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); - for (int64_t i0 = 0; i0 < ne00; i0++) { - float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10); + for (int64_t i0 = 0; i0 < ne00; ++i0) { + const int64_t i10 = i0 % ne10; + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10); dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr); } @@ -7759,14 +7808,16 @@ static void lm_ggml_compute_forward_div_f32( const struct lm_ggml_tensor * src0, const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { - assert(params->ith == 0); - assert(lm_ggml_are_same_shape(src0, src1) && lm_ggml_are_same_shape(src0, dst)); + LM_GGML_ASSERT(lm_ggml_can_repeat(src1, src0) && lm_ggml_are_same_shape(src0, dst)); if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { return; } - const int nr = lm_ggml_nrows(src0); + const int ith = params->ith; + const int nth = params->nth; + + const int64_t nr = lm_ggml_nrows(src0); LM_GGML_TENSOR_BINARY_OP_LOCALS @@ -7774,41 +7825,50 @@ static void lm_ggml_compute_forward_div_f32( LM_GGML_ASSERT(nb00 == sizeof(float)); if (nb10 == sizeof(float)) { - for (int ir = 0; ir < nr; ++ir) { - // src0, src1 and dst are same shape => same indices - const int i3 = ir/(ne2*ne1); - const int i2 = (ir - i3*ne2*ne1)/ne1; - const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + for (int64_t ir = ith; ir < nr; ir += nth) { + // src0 and dst are same shape => same indices + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + const int64_t nr0 = ne00 / ne10; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); + for (int64_t r = 0; r < nr0; ++r) { #ifdef LM_GGML_USE_ACCELERATE - UNUSED(lm_ggml_vec_div_f32); + UNUSED(lm_ggml_vec_div_f32); - vDSP_vdiv( - (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1, - (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1, - (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1, - ne0); + vDSP_vdiv(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10); #else - lm_ggml_vec_div_f32(ne0, - (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), - (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), - (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); + lm_ggml_vec_div_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); #endif - // } - // } + } } } else { // src1 is not contiguous - for (int ir = 0; ir < nr; ++ir) { - // src0, src1 and dst are same shape => 
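The add/mul/div kernels above now broadcast src1 along dimension 0 as well: lm_ggml_can_repeat guarantees ne00 is a multiple of ne10, and the inner loop applies the same length-ne10 src1 row nr0 = ne00/ne10 times across each src0 row. A stripped-down scalar version of that inner loop for the add case, contiguous data only:

#include <assert.h>

// dst[i] = src0[i] + src1[i % ne10] for one row, written as the kernels do it:
// nr0 repeats of a length-ne10 vector add against the same src1 row.
static void add_row_broadcast(float * dst, const float * src0, const float * src1,
                              int ne00, int ne10) {
    assert(ne00 % ne10 == 0);
    const int nr0 = ne00 / ne10;
    for (int r = 0; r < nr0; ++r) {
        for (int i = 0; i < ne10; ++i) {
            dst[r*ne10 + i] = src0[r*ne10 + i] + src1[i];
        }
    }
}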
same indices - const int i3 = ir/(ne2*ne1); - const int i2 = (ir - i3*ne2*ne1)/ne1; - const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - - float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); - float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); - for (int i0 = 0; i0 < ne0; i0++) { - float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10); + for (int64_t ir = ith; ir < nr; ir += nth) { + // src0 and dst are same shape => same indices + // src1 is broadcastable across src0 and dst in i1, i2, i3 + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + + for (int64_t i0 = 0; i0 < ne00; ++i0) { + const int64_t i10 = i0 % ne10; + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10); dst_ptr[i0] = src0_ptr[i0] / (*src1_ptr); } @@ -8254,7 +8314,7 @@ static void lm_ggml_compute_forward_repeat_f16( return; } - LM_GGML_TENSOR_UNARY_OP_LOCALS; + LM_GGML_TENSOR_UNARY_OP_LOCALS // guaranteed to be an integer due to the check in lm_ggml_can_repeat const int nr0 = (int)(ne0/ne00); @@ -8399,6 +8459,7 @@ static void lm_ggml_compute_forward_concat_f32( LM_GGML_ASSERT(src0->nb[0] == sizeof(float)); const int ith = params->ith; + const int nth = params->nth; LM_GGML_TENSOR_BINARY_OP_LOCALS @@ -8408,7 +8469,7 @@ static void lm_ggml_compute_forward_concat_f32( LM_GGML_ASSERT(nb10 == sizeof(float)); for (int i3 = 0; i3 < ne3; i3++) { - for (int i2 = ith; i2 < ne2; i2++) { + for (int i2 = ith; i2 < ne2; i2 += nth) { if (i2 < ne02) { // src0 for (int i1 = 0; i1 < ne1; i1++) { for (int i0 = 0; i0 < ne0; i0++) { @@ -8921,6 +8982,48 @@ static void lm_ggml_compute_forward_silu( } } +// lm_ggml_compute_forward_leaky + +static void lm_ggml_compute_forward_leaky_f32( + const struct lm_ggml_compute_params * params, + const struct lm_ggml_tensor * src0, + struct lm_ggml_tensor * dst) { + assert(params->ith == 0); + assert(lm_ggml_are_same_shape(src0, dst)); + + if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { + return; + } + + const int n = lm_ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert(dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + lm_ggml_vec_leaky_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void lm_ggml_compute_forward_leaky( + const struct lm_ggml_compute_params * params, + const struct lm_ggml_tensor * src0, + struct lm_ggml_tensor * dst) { + switch (src0->type) { + case LM_GGML_TYPE_F32: + { + lm_ggml_compute_forward_leaky_f32(params, src0, dst); + } break; + default: + { + LM_GGML_ASSERT(false); + } break; + } +} + // lm_ggml_compute_forward_silu_back static void lm_ggml_compute_forward_silu_back_f32( @@ -9404,6 +9507,8 @@ static bool lm_ggml_compute_forward_mul_mat_use_blas( // TODO: find the optimal values for these if (lm_ggml_is_contiguous(src0) && lm_ggml_is_contiguous(src1) && + //src0->type == LM_GGML_TYPE_F32 && + src1->type == LM_GGML_TYPE_F32 && (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) { /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, 
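The concat change above is a straightforward work-partitioning fix: each of nth threads starts at its own index ith and strides by nth, so every i2 plane has exactly one owner instead of being touched by several threads. The same round-robin split is used throughout the compute kernels; a minimal sketch:

// Round-robin partitioning: thread `ith` of `nth` handles items
// ith, ith + nth, ith + 2*nth, ... and no item is processed twice.
static void for_each_owned_plane(int ith, int nth, int ne2) {
    for (int i2 = ith; i2 < ne2; i2 += nth) {
        // work on plane i2
        (void) i2;
    }
}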
ne00, ne01);*/ @@ -9442,7 +9547,7 @@ static void lm_ggml_compute_forward_mul_mat( // we don't support permuted src0 or src1 LM_GGML_ASSERT(nb00 == lm_ggml_type_size(type)); - LM_GGML_ASSERT(nb10 == sizeof(float)); + LM_GGML_ASSERT(nb10 == lm_ggml_type_size(src1->type)); // dst cannot be transposed or permuted LM_GGML_ASSERT(nb0 == sizeof(float)); @@ -9524,6 +9629,8 @@ static void lm_ggml_compute_forward_mul_mat( char * wdata = params->wdata; const size_t row_size = ne10*lm_ggml_type_size(vec_dot_type)/lm_ggml_blck_size(vec_dot_type); + assert(params->wsize >= ne11*ne12*ne13*row_size); + for (int64_t i13 = 0; i13 < ne13; ++i13) { for (int64_t i12 = 0; i12 < ne12; ++i12) { for (int64_t i11 = 0; i11 < ne11; ++i11) { @@ -9625,6 +9732,26 @@ static void lm_ggml_compute_forward_mul_mat( } } +// lm_ggml_compute_forward_mul_mat_id + +static void lm_ggml_compute_forward_mul_mat_id( + const struct lm_ggml_compute_params * params, + struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * ids = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + + const int id = lm_ggml_get_op_params_i32(dst, 0); + + const int a_id = ((int32_t *)ids->data)[id]; + + LM_GGML_ASSERT(a_id >= 0 && a_id < ids->ne[0]); + + const struct lm_ggml_tensor * src0 = dst->src[a_id + 2]; + + lm_ggml_compute_forward_mul_mat(params, src0, src1, dst); +} + // lm_ggml_compute_forward_out_prod static void lm_ggml_compute_forward_out_prod_f32( @@ -9640,10 +9767,12 @@ static void lm_ggml_compute_forward_out_prod_f32( const int ith = params->ith; const int nth = params->nth; + LM_GGML_ASSERT(ne0 == ne00); + LM_GGML_ASSERT(ne1 == ne10); + LM_GGML_ASSERT(ne2 == ne02); LM_GGML_ASSERT(ne02 == ne12); - LM_GGML_ASSERT(ne03 == ne13); - LM_GGML_ASSERT(ne2 == ne12); LM_GGML_ASSERT(ne3 == ne13); + LM_GGML_ASSERT(ne03 == ne13); // we don't support permuted src0 or src1 LM_GGML_ASSERT(nb00 == sizeof(float)); @@ -9654,18 +9783,25 @@ static void lm_ggml_compute_forward_out_prod_f32( // LM_GGML_ASSERT(nb1 <= nb2); // LM_GGML_ASSERT(nb2 <= nb3); - LM_GGML_ASSERT(ne0 == ne00); - LM_GGML_ASSERT(ne1 == ne10); - LM_GGML_ASSERT(ne2 == ne02); - LM_GGML_ASSERT(ne3 == ne03); - // nb01 >= nb00 - src0 is not transposed // compute by src0 rows // TODO: #if defined(LM_GGML_USE_CUBLAS) lm_ggml_cuda_out_prod - // TODO: #if defined(LM_GGML_USE_ACCELERATE) || defined(LM_GGML_USE_OPENBLAS) || defined(LM_GGML_USE_CLBLAST) + // TODO: #if defined(LM_GGML_USE_CLBLAST) + +#if defined(LM_GGML_USE_ACCELERATE) || defined(LM_GGML_USE_OPENBLAS) + bool use_blas = lm_ggml_is_matrix(src0) && + lm_ggml_is_matrix(src1) && + lm_ggml_is_contiguous(src0) && + (lm_ggml_is_contiguous(src1) || lm_ggml_is_transposed(src1)); +#endif if (params->type == LM_GGML_TASK_INIT) { +#if defined(LM_GGML_USE_ACCELERATE) || defined(LM_GGML_USE_OPENBLAS) // gemm beta will zero dst + if (use_blas) { + return; + } +#endif lm_ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); return; } @@ -9674,6 +9810,50 @@ static void lm_ggml_compute_forward_out_prod_f32( return; } +#if defined(LM_GGML_USE_ACCELERATE) || defined(LM_GGML_USE_OPENBLAS) + if (use_blas) { + if (params->ith != 0) { // All threads other than the first do no work. 
+ return; + } + // Arguments to lm_ggml_compute_forward_out_prod (expressed as major,minor) + // src0: (k,n) + // src1: (k,m) + // dst: (m,n) + // + // Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f) + // Also expressed as (major,minor) + // a: (m,k): so src1 transposed + // b: (k,n): so src0 + // c: (m,n) + // + // However, if lm_ggml_is_transposed(src1) is true, then + // src1->data already contains a transposed version, so sgemm mustn't + // transpose it further. + + int n = src0->ne[0]; + int k = src0->ne[1]; + int m = src1->ne[0]; + + int transposeA, lda; + + if (!lm_ggml_is_transposed(src1)) { + transposeA = CblasTrans; + lda = m; + } else { + transposeA = CblasNoTrans; + lda = k; + } + + float * a = (float *) ((char *) src1->data); + float * b = (float *) ((char *) src0->data); + float * c = (float *) ((char *) dst->data); + + cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n); + + return; + } +#endif + // dst[:,:,:,:] = 0 // for i2,i3: // for i1: @@ -10527,20 +10707,25 @@ static void lm_ggml_compute_forward_diag_mask_zero( static void lm_ggml_compute_forward_soft_max_f32( const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, - struct lm_ggml_tensor * dst) { - LM_GGML_ASSERT(lm_ggml_is_contiguous(src0)); - LM_GGML_ASSERT(lm_ggml_is_contiguous(dst)); - LM_GGML_ASSERT(lm_ggml_are_same_shape(src0, dst)); + const struct lm_ggml_tensor * src1, + struct lm_ggml_tensor * dst) { + assert(lm_ggml_is_contiguous(dst)); + assert(lm_ggml_are_same_shape(src0, dst)); if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { return; } + float scale = 1.0f; + memcpy(&scale, (float *) dst->op_params + 0, sizeof(float)); + // TODO: handle transposed/permuted matrices const int ith = params->ith; const int nth = params->nth; + const int64_t ne11 = src1 ? src1->ne[1] : 1; + const int nc = src0->ne[0]; const int nr = lm_ggml_nrows(src0); @@ -10551,29 +10736,40 @@ static void lm_ggml_compute_forward_soft_max_f32( const int ir0 = dr*ith; const int ir1 = MIN(ir0 + dr, nr); + float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith; + for (int i1 = ir0; i1 < ir1; i1++) { - float *sp = (float *)((char *) src0->data + i1*src0->nb[1]); - float *dp = (float *)((char *) dst->data + i1*dst->nb[1]); + float * sp = (float *)((char *) src0->data + i1*src0->nb[1]); + float * dp = (float *)((char *) dst->data + i1*dst->nb[1]); + + // broadcast the mask across rows + float * mp = src1 ? (float *)((char *) src1->data + (i1%ne11)*src1->nb[1]) : NULL; + + lm_ggml_vec_cpy_f32 (nc, wp, sp); + lm_ggml_vec_scale_f32(nc, wp, scale); + if (mp) { + lm_ggml_vec_acc_f32(nc, wp, mp); + } #ifndef NDEBUG for (int i = 0; i < nc; ++i) { //printf("p[%d] = %f\n", i, p[i]); - assert(!isnan(sp[i])); + assert(!isnan(wp[i])); } #endif float max = -INFINITY; - lm_ggml_vec_max_f32(nc, &max, sp); + lm_ggml_vec_max_f32(nc, &max, wp); lm_ggml_float sum = 0.0; uint16_t scvt; for (int i = 0; i < nc; i++) { - if (sp[i] == -INFINITY) { + if (wp[i] == -INFINITY) { dp[i] = 0.0f; } else { - // const float val = (sp[i] == -INFINITY) ? 0.0 : exp(sp[i] - max); - lm_ggml_fp16_t s = LM_GGML_FP32_TO_FP16(sp[i] - max); + // const float val = (wp[i] == -INFINITY) ? 
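The new BLAS fast path maps lm_ggml_out_prod onto one sgemm call. Using the (major, minor) convention from the comment above, src0 is (k, n), src1 is (k, m) and dst is (m, n), so the product is dst = src1^T * src0; the transpose flag and lda switch cover the case where src1 is already stored transposed. A hedged restatement of the non-transposed branch against the standard CBLAS interface (any CBLAS provider such as OpenBLAS or Accelerate):

#include <cblas.h>

// dst(m, n) = src1(k, m)^T * src0(k, n), row-major, as in the out_prod fast path.
static void out_prod_sgemm(const float * src0, const float * src1, float * dst,
                           int n, int k, int m) {
    cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
                m, n, k,
                1.0f, src1, m,    // A = src1 stored (k, m): lda = m, TransA applies
                      src0, n,    // B = src0 stored (k, n): ldb = n
                0.0f, dst,  n);   // C = dst  stored (m, n): ldc = n
}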
0.0 : exp(wp[i] - max); + lm_ggml_fp16_t s = LM_GGML_FP32_TO_FP16(wp[i] - max); memcpy(&scvt, &s, sizeof(scvt)); const float val = LM_GGML_FP16_TO_FP32(lm_ggml_table_exp_f16[scvt]); sum += (lm_ggml_float)val; @@ -10598,11 +10794,12 @@ static void lm_ggml_compute_forward_soft_max_f32( static void lm_ggml_compute_forward_soft_max( const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, - struct lm_ggml_tensor * dst) { + const struct lm_ggml_tensor * src1, + struct lm_ggml_tensor * dst) { switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_soft_max_f32(params, src0, dst); + lm_ggml_compute_forward_soft_max_f32(params, src0, src1, dst); } break; default: { @@ -11340,9 +11537,9 @@ static void lm_ggml_compute_forward_rope_back( } } -// lm_ggml_compute_forward_conv_1d +// lm_ggml_compute_forward_conv_transpose_1d -static void lm_ggml_compute_forward_conv_1d_f16_f32( +static void lm_ggml_compute_forward_conv_transpose_1d_f16_f32( const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, const struct lm_ggml_tensor * src1, @@ -11359,14 +11556,7 @@ static void lm_ggml_compute_forward_conv_1d_f16_f32( const int ith = params->ith; const int nth = params->nth; - const int nk = ne00; - - // size of the convolution row - the kernel size unrolled across all input channels - const int ew0 = nk*ne01; - - const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; - const int32_t p0 = ((const int32_t*)(dst->op_params))[1]; - const int32_t d0 = ((const int32_t*)(dst->op_params))[2]; + const int nk = ne00*ne01*ne02; LM_GGML_ASSERT(nb00 == sizeof(lm_ggml_fp16_t)); LM_GGML_ASSERT(nb10 == sizeof(float)); @@ -11374,23 +11564,37 @@ static void lm_ggml_compute_forward_conv_1d_f16_f32( if (params->type == LM_GGML_TASK_INIT) { memset(params->wdata, 0, params->wsize); - lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) params->wdata + 0; + // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout) + { + lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) params->wdata + 0; - for (int64_t i11 = 0; i11 < ne11; i11++) { - const float * const src = (float *)((char *) src1->data + i11*nb11); - lm_ggml_fp16_t * dst_data = wdata; + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + const lm_ggml_fp16_t * const src = (lm_ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01); + lm_ggml_fp16_t * dst_data = wdata + i01*ne00*ne02; + for (int64_t i00 = 0; i00 < ne00; i00++) { + dst_data[i00*ne02 + i02] = src[i00]; + } + } + } + } - for (int64_t i0 = 0; i0 < ne0; i0++) { - for (int64_t ik = 0; ik < nk; ik++) { - const int idx0 = i0*s0 + ik*d0 - p0; + // permute source data (src1) from (L x Cin) to (Cin x L) + { + lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) params->wdata + nk; + lm_ggml_fp16_t * dst_data = wdata; - if(!(idx0 < 0 || idx0 >= ne10)) { - dst_data[i0*ew0 + i11*nk + ik] = LM_GGML_FP32_TO_FP16(src[idx0]); - } + for (int64_t i11 = 0; i11 < ne11; i11++) { + const float * const src = (float *)((char *) src1->data + i11*nb11); + for (int64_t i10 = 0; i10 < ne10; i10++) { + dst_data[i10*ne11 + i11] = LM_GGML_FP32_TO_FP16(src[i10]); } } } + // need to zero dst since we are accumulating into it + memset(dst->data, 0, lm_ggml_nbytes(dst)); + return; } @@ -11398,8 +11602,10 @@ static void lm_ggml_compute_forward_conv_1d_f16_f32( return; } + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + // total rows in dst - const int nr = ne2; + const int nr = ne1; // rows per thread const int dr = (nr + nth 
- 1)/nth; @@ -11408,22 +11614,26 @@ static void lm_ggml_compute_forward_conv_1d_f16_f32( const int ir0 = dr*ith; const int ir1 = MIN(ir0 + dr, nr); - lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) params->wdata + 0; - - for (int i2 = 0; i2 < ne2; i2++) { - for (int i1 = ir0; i1 < ir1; i1++) { - float * dst_data = (float *)((char *) dst->data + i2*nb2 + i1*nb1); + lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) params->wdata + 0; + lm_ggml_fp16_t * const wdata_src = wdata + nk; - for (int i0 = 0; i0 < ne0; i0++) { - lm_ggml_vec_dot_f16(ew0, dst_data + i0, - (lm_ggml_fp16_t *) ((char *) src0->data + i1*nb02), - (lm_ggml_fp16_t *) wdata + i2*nb2 + i0*ew0); + for (int i1 = ir0; i1 < ir1; i1++) { + float * dst_data = (float *)((char *) dst->data + i1*nb1); + lm_ggml_fp16_t * wdata_kernel = wdata + i1*ne02*ne00; + for (int i10 = 0; i10 < ne10; i10++) { + const int i1n = i10*ne11; + for (int i00 = 0; i00 < ne00; i00++) { + float v = 0; + lm_ggml_vec_dot_f16(ne02, &v, + (lm_ggml_fp16_t *) wdata_src + i1n, + (lm_ggml_fp16_t *) wdata_kernel + i00*ne02); + dst_data[i10*s0 + i00] += v; } } } } -static void lm_ggml_compute_forward_conv_1d_f32( +static void lm_ggml_compute_forward_conv_transpose_1d_f32( const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, const struct lm_ggml_tensor * src1, @@ -11440,13 +11650,7 @@ static void lm_ggml_compute_forward_conv_1d_f32( const int ith = params->ith; const int nth = params->nth; - const int nk = ne00; - - const int ew0 = nk*ne01; - - const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; - const int32_t p0 = ((const int32_t*)(dst->op_params))[1]; - const int32_t d0 = ((const int32_t*)(dst->op_params))[2]; + const int nk = ne00*ne01*ne02; LM_GGML_ASSERT(nb00 == sizeof(float)); LM_GGML_ASSERT(nb10 == sizeof(float)); @@ -11454,23 +11658,37 @@ static void lm_ggml_compute_forward_conv_1d_f32( if (params->type == LM_GGML_TASK_INIT) { memset(params->wdata, 0, params->wsize); - float * const wdata = (float *) params->wdata + 0; + // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout) + { + float * const wdata = (float *) params->wdata + 0; + + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01); + float * dst_data = wdata + i01*ne00*ne02; + for (int64_t i00 = 0; i00 < ne00; i00++) { + dst_data[i00*ne02 + i02] = src[i00]; + } + } + } + } - for (int64_t i11 = 0; i11 < ne11; i11++) { - const float * const src = (float *)((char *) src1->data + i11*nb11); + // prepare source data (src1) + { + float * const wdata = (float *) params->wdata + nk; float * dst_data = wdata; - for (int64_t i0 = 0; i0 < ne0; i0++) { - for (int64_t ik = 0; ik < nk; ik++) { - const int idx0 = i0*s0 + ik*d0 - p0; - - if(!(idx0 < 0 || idx0 >= ne10)) { - dst_data[i0*ew0 + i11*nk + ik] = src[idx0]; - } + for (int64_t i11 = 0; i11 < ne11; i11++) { + const float * const src = (float *)((char *) src1->data + i11*nb11); + for (int64_t i10 = 0; i10 < ne10; i10++) { + dst_data[i10*ne11 + i11] = src[i10]; } } } + // need to zero dst since we are accumulating into it + memset(dst->data, 0, lm_ggml_nbytes(dst)); + return; } @@ -11478,8 +11696,10 @@ static void lm_ggml_compute_forward_conv_1d_f32( return; } + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + // total rows in dst - const int nr = ne02; + const int nr = ne1; // rows per thread const int dr = (nr + nth - 1)/nth; @@ -11488,94 +11708,50 @@ static void 
lm_ggml_compute_forward_conv_1d_f32( const int ir0 = dr*ith; const int ir1 = MIN(ir0 + dr, nr); - float * const wdata = (float *) params->wdata + 0; - - for (int i2 = 0; i2 < ne2; i2++) { - for (int i1 = ir0; i1 < ir1; i1++) { - float * dst_data = (float *)((char *) dst->data + i2*nb2 + i1*nb1); + float * const wdata = (float *) params->wdata + 0; + float * const wdata_src = wdata + nk; - for (int i0 = 0; i0 < ne0; i0++) { - lm_ggml_vec_dot_f32(ew0, dst_data + i0, - (float *) ((char *) src0->data + i1*nb02), - (float *) wdata + i2*nb2 + i0*ew0); + for (int i1 = ir0; i1 < ir1; i1++) { + float * dst_data = (float *)((char *) dst->data + i1*nb1); + float * wdata_kernel = wdata + i1*ne02*ne00; + for (int i10 = 0; i10 < ne10; i10++) { + const int i1n = i10*ne11; + for (int i00 = 0; i00 < ne00; i00++) { + float v = 0; + lm_ggml_vec_dot_f32(ne02, &v, + wdata_src + i1n, + wdata_kernel + i00*ne02); + dst_data[i10*s0 + i00] += v; } } } } -// TODO: reuse lm_ggml_mul_mat or implement lm_ggml_im2col and remove stage_0 and stage_1 -static void gemm_f16_out_f32(int64_t m, int64_t n, int64_t k, - lm_ggml_fp16_t * A, - lm_ggml_fp16_t * B, - float * C, - const int ith, const int nth) { - // does not seem to make a difference - int64_t m0, m1, n0, n1; - // patches per thread - if (m > n) { - n0 = 0; - n1 = n; - - // total patches in dst - const int np = m; - - // patches per thread - const int dp = (np + nth - 1)/nth; - - // patch range for this thread - m0 = dp*ith; - m1 = MIN(m0 + dp, np); - } else { - m0 = 0; - m1 = m; - - // total patches in dst - const int np = n; - - // patches per thread - const int dp = (np + nth - 1)/nth; - - // patch range for this thread - n0 = dp*ith; - n1 = MIN(n0 + dp, np); - } - - // block-tiling attempt - int64_t blck_n = 16; - int64_t blck_m = 16; - - // int64_t CACHE_SIZE = 2 * 1024 * 1024; // 2MB - // int64_t blck_size = CACHE_SIZE / (sizeof(float) + 2 * sizeof(lm_ggml_fp16_t) * K); - // if (blck_size > 0) { - // blck_0 = 4; - // blck_1 = blck_size / blck_0; - // if (blck_1 < 0) { - // blck_1 = 1; - // } - // // blck_0 = (int64_t)sqrt(blck_size); - // // blck_1 = blck_0; - // } - // // printf("%zd %zd %zd %zd\n", blck_size, K, blck_0, blck_1); - - for (int j = n0; j < n1; j+=blck_n) { - for (int i = m0; i < m1; i+=blck_m) { - // printf("i j k => %d %d %d\n", i, j, K); - for (int ii = i; ii < i + blck_m && ii < m1; ii++) { - for (int jj = j; jj < j + blck_n && jj < n1; jj++) { - lm_ggml_vec_dot_f16(k, - C + ii*n + jj, - A + ii * k, - B + jj * k); - } - } - } +static void lm_ggml_compute_forward_conv_transpose_1d( + const struct lm_ggml_compute_params * params, + const struct lm_ggml_tensor * src0, + const struct lm_ggml_tensor * src1, + struct lm_ggml_tensor * dst) { + switch (src0->type) { + case LM_GGML_TYPE_F16: + { + lm_ggml_compute_forward_conv_transpose_1d_f16_f32(params, src0, src1, dst); + } break; + case LM_GGML_TYPE_F32: + { + lm_ggml_compute_forward_conv_transpose_1d_f32(params, src0, src1, dst); + } break; + default: + { + LM_GGML_ASSERT(false); + } break; } } -// src0: kernel [OC, IC, K] -// src1: signal [N, IC, IL] -// dst: result [N, OL, IC*K] -static void lm_ggml_compute_forward_conv_1d_stage_0_f32( +// src0: kernel [OC, IC, KH, KW] +// src1: image [N, IC, IH, IW] +// dst: result [N, OH, OW, IC*KH*KW] +static void lm_ggml_compute_forward_im2col_f16( const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, const struct lm_ggml_tensor * src1, @@ -11589,425 +11765,35 @@ static void lm_ggml_compute_forward_conv_1d_stage_0_f32( 
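    // In effect, for each output position this builds one unrolled row of the
    // receptive field:
    //   dst[in, ioh, iow, iic*KH*KW + ikh*KW + ikw] =
    //       src1[in, iic, ioh*s1 + ikh*d1 - p1, iow*s0 + ikw*d0 - p0]
    // Taps that fall outside the input are written as 0, so the GEMM that consumes
    // this buffer sees zero padding. In the 1D case (is_2D == false) KH, IH and OH
    // all collapse to 1.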
LM_GGML_TENSOR_BINARY_OP_LOCALS; - const int64_t N = ne12; - const int64_t IC = ne11; - const int64_t IL = ne10; - - const int64_t K = ne00; - - const int64_t OL = ne1; + const int32_t s0 = ((const int32_t *)(dst->op_params))[0]; + const int32_t s1 = ((const int32_t *)(dst->op_params))[1]; + const int32_t p0 = ((const int32_t *)(dst->op_params))[2]; + const int32_t p1 = ((const int32_t *)(dst->op_params))[3]; + const int32_t d0 = ((const int32_t *)(dst->op_params))[4]; + const int32_t d1 = ((const int32_t *)(dst->op_params))[5]; + const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1; const int ith = params->ith; const int nth = params->nth; - const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; - const int32_t p0 = ((const int32_t*)(dst->op_params))[1]; - const int32_t d0 = ((const int32_t*)(dst->op_params))[2]; - - LM_GGML_ASSERT(nb00 == sizeof(lm_ggml_fp16_t)); - LM_GGML_ASSERT(nb10 == sizeof(float)); - - if (params->type == LM_GGML_TASK_INIT) { - memset(dst->data, 0, lm_ggml_nbytes(dst)); - return; - } - - if (params->type == LM_GGML_TASK_FINALIZE) { - return; - } - - // im2col: [N, IC, IL] => [N, OL, IC*K] - { - lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) dst->data; - - for (int64_t in = 0; in < N; in++) { - for (int64_t iol = 0; iol < OL; iol++) { - for (int64_t iic = ith; iic < IC; iic+=nth) { - - // micro kernel - lm_ggml_fp16_t * dst_data = wdata + (in*OL + iol)*(IC*K); // [IC, K] - const float * const src_data = (float *)((char *) src1->data + in*nb12 + iic*nb11); // [IL] - - for (int64_t ik = 0; ik < K; ik++) { - const int64_t iil = iol*s0 + ik*d0 - p0; - - if (!(iil < 0 || iil >= IL)) { - dst_data[iic*K + ik] = LM_GGML_FP32_TO_FP16(src_data[iil]); - } - } - } - } - } - } -} - -// gemm: [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K] -// src0: [OC, IC, K] -// src1: [N, OL, IC * K] -// result: [N, OC, OL] -static void lm_ggml_compute_forward_conv_1d_stage_1_f16( - const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, - struct lm_ggml_tensor * dst) { - LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F16); - LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F16); - LM_GGML_ASSERT( dst->type == LM_GGML_TYPE_F32); - - int64_t t0 = lm_ggml_perf_time_us(); - UNUSED(t0); - - if (params->type == LM_GGML_TASK_INIT) { - return; - } - - if (params->type == LM_GGML_TASK_FINALIZE) { - return; - } - - LM_GGML_TENSOR_BINARY_OP_LOCALS; - - LM_GGML_ASSERT(nb00 == sizeof(lm_ggml_fp16_t)); - LM_GGML_ASSERT(nb10 == sizeof(lm_ggml_fp16_t)); - LM_GGML_ASSERT(nb0 == sizeof(float)); - - const int N = ne12; - const int OL = ne11; - - const int OC = ne02; - const int IC = ne01; - const int K = ne00; - - const int ith = params->ith; - const int nth = params->nth; - - int64_t m = OC; - int64_t n = OL; - int64_t k = IC * K; - - // [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K] - for (int i = 0; i < N; i++) { - lm_ggml_fp16_t * A = (lm_ggml_fp16_t *)src0->data; // [m, k] - lm_ggml_fp16_t * B = (lm_ggml_fp16_t *)src1->data + i * m * k; // [n, k] - float * C = (float *)dst->data + i * m * n; // [m, n] - - gemm_f16_out_f32(m, n, k, A, B, C, ith, nth); - } -} - -static void lm_ggml_compute_forward_conv_1d( - const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, - struct lm_ggml_tensor * dst) { - switch(src0->type) { - case LM_GGML_TYPE_F16: - { - lm_ggml_compute_forward_conv_1d_f16_f32(params, src0, src1, dst); - } break; - case LM_GGML_TYPE_F32: - { - 
lm_ggml_compute_forward_conv_1d_f32(params, src0, src1, dst); - } break; - default: - { - LM_GGML_ASSERT(false); - } break; - } -} - -static void lm_ggml_compute_forward_conv_1d_stage_0( - const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, - struct lm_ggml_tensor * dst) { - switch(src0->type) { - case LM_GGML_TYPE_F16: - { - lm_ggml_compute_forward_conv_1d_stage_0_f32(params, src0, src1, dst); - } break; - default: - { - LM_GGML_ASSERT(false); - } break; - } -} - -static void lm_ggml_compute_forward_conv_1d_stage_1( - const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, - struct lm_ggml_tensor * dst) { - switch(src0->type) { - case LM_GGML_TYPE_F16: - { - lm_ggml_compute_forward_conv_1d_stage_1_f16(params, src0, src1, dst); - } break; - default: - { - LM_GGML_ASSERT(false); - } break; - } -} - -// lm_ggml_compute_forward_conv_transpose_1d - -static void lm_ggml_compute_forward_conv_transpose_1d_f16_f32( - const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, - struct lm_ggml_tensor * dst) { - LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F16); - LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F32); - LM_GGML_ASSERT( dst->type == LM_GGML_TYPE_F32); - - int64_t t0 = lm_ggml_perf_time_us(); - UNUSED(t0); - - LM_GGML_TENSOR_BINARY_OP_LOCALS - - const int ith = params->ith; - const int nth = params->nth; - - const int nk = ne00*ne01*ne02; - - LM_GGML_ASSERT(nb00 == sizeof(lm_ggml_fp16_t)); - LM_GGML_ASSERT(nb10 == sizeof(float)); - - if (params->type == LM_GGML_TASK_INIT) { - memset(params->wdata, 0, params->wsize); - - // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout) - { - lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) params->wdata + 0; - - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - const lm_ggml_fp16_t * const src = (lm_ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01); - lm_ggml_fp16_t * dst_data = wdata + i01*ne00*ne02; - for (int64_t i00 = 0; i00 < ne00; i00++) { - dst_data[i00*ne02 + i02] = src[i00]; - } - } - } - } - - // permute source data (src1) from (L x Cin) to (Cin x L) - { - lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) params->wdata + nk; - lm_ggml_fp16_t * dst_data = wdata; - - for (int64_t i11 = 0; i11 < ne11; i11++) { - const float * const src = (float *)((char *) src1->data + i11*nb11); - for (int64_t i10 = 0; i10 < ne10; i10++) { - dst_data[i10*ne11 + i11] = LM_GGML_FP32_TO_FP16(src[i10]); - } - } - } - - // need to zero dst since we are accumulating into it - memset(dst->data, 0, lm_ggml_nbytes(dst)); - - return; - } - - if (params->type == LM_GGML_TASK_FINALIZE) { - return; - } - - const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; - - // total rows in dst - const int nr = ne1; - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) params->wdata + 0; - lm_ggml_fp16_t * const wdata_src = wdata + nk; - - for (int i1 = ir0; i1 < ir1; i1++) { - float * dst_data = (float *)((char *) dst->data + i1*nb1); - lm_ggml_fp16_t * wdata_kernel = wdata + i1*ne02*ne00; - for (int i10 = 0; i10 < ne10; i10++) { - const int i1n = i10*ne11; - for (int i00 = 0; i00 < ne00; i00++) { - float v = 0; - lm_ggml_vec_dot_f16(ne02, &v, - (lm_ggml_fp16_t *) 
wdata_src + i1n, - (lm_ggml_fp16_t *) wdata_kernel + i00*ne02); - dst_data[i10*s0 + i00] += v; - } - } - } -} - -static void lm_ggml_compute_forward_conv_transpose_1d_f32( - const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, - struct lm_ggml_tensor * dst) { - LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F32); - LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F32); - LM_GGML_ASSERT( dst->type == LM_GGML_TYPE_F32); - - int64_t t0 = lm_ggml_perf_time_us(); - UNUSED(t0); - - LM_GGML_TENSOR_BINARY_OP_LOCALS - - const int ith = params->ith; - const int nth = params->nth; - - const int nk = ne00*ne01*ne02; - - LM_GGML_ASSERT(nb00 == sizeof(float)); - LM_GGML_ASSERT(nb10 == sizeof(float)); - - if (params->type == LM_GGML_TASK_INIT) { - memset(params->wdata, 0, params->wsize); - - // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout) - { - float * const wdata = (float *) params->wdata + 0; - - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01); - float * dst_data = wdata + i01*ne00*ne02; - for (int64_t i00 = 0; i00 < ne00; i00++) { - dst_data[i00*ne02 + i02] = src[i00]; - } - } - } - } - - // prepare source data (src1) - { - float * const wdata = (float *) params->wdata + nk; - float * dst_data = wdata; - - for (int64_t i11 = 0; i11 < ne11; i11++) { - const float * const src = (float *)((char *) src1->data + i11*nb11); - for (int64_t i10 = 0; i10 < ne10; i10++) { - dst_data[i10*ne11 + i11] = src[i10]; - } - } - } - - // need to zero dst since we are accumulating into it - memset(dst->data, 0, lm_ggml_nbytes(dst)); - - return; - } - - if (params->type == LM_GGML_TASK_FINALIZE) { - return; - } - - const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; - - // total rows in dst - const int nr = ne1; - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - float * const wdata = (float *) params->wdata + 0; - float * const wdata_src = wdata + nk; - - for (int i1 = ir0; i1 < ir1; i1++) { - float * dst_data = (float *)((char *) dst->data + i1*nb1); - float * wdata_kernel = wdata + i1*ne02*ne00; - for (int i10 = 0; i10 < ne10; i10++) { - const int i1n = i10*ne11; - for (int i00 = 0; i00 < ne00; i00++) { - float v = 0; - lm_ggml_vec_dot_f32(ne02, &v, - wdata_src + i1n, - wdata_kernel + i00*ne02); - dst_data[i10*s0 + i00] += v; - } - } - } -} - -static void lm_ggml_compute_forward_conv_transpose_1d( - const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, - struct lm_ggml_tensor * dst) { - switch (src0->type) { - case LM_GGML_TYPE_F16: - { - lm_ggml_compute_forward_conv_transpose_1d_f16_f32(params, src0, src1, dst); - } break; - case LM_GGML_TYPE_F32: - { - lm_ggml_compute_forward_conv_transpose_1d_f32(params, src0, src1, dst); - } break; - default: - { - LM_GGML_ASSERT(false); - } break; - } -} - -// lm_ggml_compute_forward_conv_2d - -// src0: kernel [OC, IC, KH, KW] -// src1: image [N, IC, IH, IW] -// dst: result [N, OH, OW, IC*KH*KW] -static void lm_ggml_compute_forward_conv_2d_stage_0_f32( - const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, - struct lm_ggml_tensor * dst) { - LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F16); - LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F32); 
- LM_GGML_ASSERT( dst->type == LM_GGML_TYPE_F16); - - int64_t t0 = lm_ggml_perf_time_us(); - UNUSED(t0); - - LM_GGML_TENSOR_BINARY_OP_LOCALS; - - const int64_t N = ne13; - const int64_t IC = ne12; - const int64_t IH = ne11; + const int64_t N = is_2D ? ne13 : ne12; + const int64_t IC = is_2D ? ne12 : ne11; + const int64_t IH = is_2D ? ne11 : 1; const int64_t IW = ne10; - // const int64_t OC = ne03; - // const int64_t IC = ne02; - const int64_t KH = ne01; + const int64_t KH = is_2D ? ne01 : 1; const int64_t KW = ne00; - const int64_t OH = ne2; - const int64_t OW = ne1; - - const int ith = params->ith; - const int nth = params->nth; + const int64_t OH = is_2D ? ne2 : 1; + const int64_t OW = ne1; - const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; - const int32_t s1 = ((const int32_t*)(dst->op_params))[1]; - const int32_t p0 = ((const int32_t*)(dst->op_params))[2]; - const int32_t p1 = ((const int32_t*)(dst->op_params))[3]; - const int32_t d0 = ((const int32_t*)(dst->op_params))[4]; - const int32_t d1 = ((const int32_t*)(dst->op_params))[5]; + int ofs0 = is_2D ? nb13 : nb12; + int ofs1 = is_2D ? nb12 : nb11; LM_GGML_ASSERT(nb00 == sizeof(lm_ggml_fp16_t)); LM_GGML_ASSERT(nb10 == sizeof(float)); if (params->type == LM_GGML_TASK_INIT) { - memset(dst->data, 0, lm_ggml_nbytes(dst)); return; } @@ -12020,20 +11806,22 @@ static void lm_ggml_compute_forward_conv_2d_stage_0_f32( lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) dst->data; for (int64_t in = 0; in < N; in++) { - for (int64_t ioh = 0; ioh < OH; ioh++) { + for (int64_t ioh = 0; ioh < OH; ioh++) { // 1 for (int64_t iow = 0; iow < OW; iow++) { - for (int64_t iic = ith; iic < IC; iic+=nth) { + for (int64_t iic = ith; iic < IC; iic += nth) { // micro kernel lm_ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW] - const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW] + const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW] - for (int64_t ikh = 0; ikh < KH; ikh++) { + for (int64_t ikh = 0; ikh < KH; ikh++) { // 1 for (int64_t ikw = 0; ikw < KW; ikw++) { const int64_t iiw = iow*s0 + ikw*d0 - p0; const int64_t iih = ioh*s1 + ikh*d1 - p1; - if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) { + if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) { + dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0; + } else { dst_data[iic*(KH*KW) + ikh*KW + ikw] = LM_GGML_FP32_TO_FP16(src_data[iih*IW + iiw]); } } @@ -12045,223 +11833,7 @@ static void lm_ggml_compute_forward_conv_2d_stage_0_f32( } } -// gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] -// src0: [OC, IC, KH, KW] -// src1: [N, OH, OW, IC * KH * KW] -// result: [N, OC, OH, OW] -static void lm_ggml_compute_forward_conv_2d_stage_1_f16( - const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, - struct lm_ggml_tensor * dst) { - LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F16); - LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F16); - LM_GGML_ASSERT( dst->type == LM_GGML_TYPE_F32); - - int64_t t0 = lm_ggml_perf_time_us(); - UNUSED(t0); - - if (params->type == LM_GGML_TASK_INIT) { - return; - } - - if (params->type == LM_GGML_TASK_FINALIZE) { - return; - } - - LM_GGML_TENSOR_BINARY_OP_LOCALS; - - LM_GGML_ASSERT(nb00 == sizeof(lm_ggml_fp16_t)); - LM_GGML_ASSERT(nb10 == sizeof(lm_ggml_fp16_t)); - LM_GGML_ASSERT(nb0 == sizeof(float)); - - const int N = ne13; - const int OH = ne12; - const int OW = ne11; - - const int 
OC = ne03; - const int IC = ne02; - const int KH = ne01; - const int KW = ne00; - - const int ith = params->ith; - const int nth = params->nth; - - int64_t m = OC; - int64_t n = OH * OW; - int64_t k = IC * KH * KW; - - // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] - for (int i = 0; i < N; i++) { - lm_ggml_fp16_t * A = (lm_ggml_fp16_t *)src0->data; // [m, k] - lm_ggml_fp16_t * B = (lm_ggml_fp16_t *)src1->data + i * m * k; // [n, k] - float * C = (float *)dst->data + i * m * n; // [m, n] - - gemm_f16_out_f32(m, n, k, A, B, C, ith, nth); - } -} - -static void lm_ggml_compute_forward_conv_2d_f16_f32( - const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, - struct lm_ggml_tensor * dst) { - LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F16); - LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F32); - LM_GGML_ASSERT( dst->type == LM_GGML_TYPE_F32); - - int64_t t0 = lm_ggml_perf_time_us(); - UNUSED(t0); - - LM_GGML_TENSOR_BINARY_OP_LOCALS - - // src1: image [N, IC, IH, IW] - // src0: kernel [OC, IC, KH, KW] - // dst: result [N, OC, OH, OW] - // ne12: IC - // ne0: OW - // ne1: OH - // nk0: KW - // nk1: KH - // ne13: N - - const int N = ne13; - const int IC = ne12; - const int IH = ne11; - const int IW = ne10; - - const int OC = ne03; - // const int IC = ne02; - const int KH = ne01; - const int KW = ne00; - - const int OH = ne1; - const int OW = ne0; - - const int ith = params->ith; - const int nth = params->nth; - - // const int nk0 = ne00; - // const int nk1 = ne01; - - // size of the convolution row - the kernel size unrolled across all channels - // const int ew0 = nk0*nk1*ne02; - // ew0: IC*KH*KW - - const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; - const int32_t s1 = ((const int32_t*)(dst->op_params))[1]; - const int32_t p0 = ((const int32_t*)(dst->op_params))[2]; - const int32_t p1 = ((const int32_t*)(dst->op_params))[3]; - const int32_t d0 = ((const int32_t*)(dst->op_params))[4]; - const int32_t d1 = ((const int32_t*)(dst->op_params))[5]; - - LM_GGML_ASSERT(nb00 == sizeof(lm_ggml_fp16_t)); - LM_GGML_ASSERT(nb10 == sizeof(float)); - - if (params->type == LM_GGML_TASK_INIT) { - memset(params->wdata, 0, params->wsize); - - // prepare source data (src1) - // im2col: [N, IC, IH, IW] => [N*OH*OW, IC*KH*KW] - - { - lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) params->wdata + 0; - - for (int in = 0; in < N; in++) { - for (int iic = 0; iic < IC; iic++) { - for (int ioh = 0; ioh < OH; ioh++) { - for (int iow = 0; iow < OW; iow++) { - - // micro kernel - lm_ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW] - const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW] - - for (int ikh = 0; ikh < KH; ikh++) { - for (int ikw = 0; ikw < KW; ikw++) { - const int iiw = iow*s0 + ikw*d0 - p0; - const int iih = ioh*s1 + ikh*d1 - p1; - - if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) { - dst_data[iic*(KH*KW) + ikh*KW + ikw] = LM_GGML_FP32_TO_FP16(src_data[iih*IW + iiw]); - } - } - } - } - } - } - } - } - - return; - } - - if (params->type == LM_GGML_TASK_FINALIZE) { - return; - } - - lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) params->wdata + 0; - // wdata: [N*OH*OW, IC*KH*KW] - // dst: result [N, OC, OH, OW] - // src0: kernel [OC, IC, KH, KW] - - int64_t m = OC; - int64_t n = OH * OW; - int64_t k = IC * KH * KW; - - // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] - for (int i = 0; i < N; i++) { - lm_ggml_fp16_t * A = 
(lm_ggml_fp16_t *)src0->data; // [m, k] - lm_ggml_fp16_t * B = (lm_ggml_fp16_t *)wdata + i * m * k; // [n, k] - float * C = (float *)dst->data + i * m * n; // [m * k] - - gemm_f16_out_f32(m, n, k, A, B, C, ith, nth); - } -} - -static void lm_ggml_compute_forward_conv_2d( - const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, - struct lm_ggml_tensor * dst) { - switch (src0->type) { - case LM_GGML_TYPE_F16: - { - lm_ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, dst); - } break; - case LM_GGML_TYPE_F32: - { - //lm_ggml_compute_forward_conv_2d_f32(params, src0, src1, dst); - LM_GGML_ASSERT(false); - } break; - default: - { - LM_GGML_ASSERT(false); - } break; - } -} - -static void lm_ggml_compute_forward_conv_2d_stage_0( - const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, - struct lm_ggml_tensor * dst) { - switch (src0->type) { - case LM_GGML_TYPE_F16: - { - lm_ggml_compute_forward_conv_2d_stage_0_f32(params, src0, src1, dst); - } break; - case LM_GGML_TYPE_F32: - { - LM_GGML_ASSERT(false); - } break; - default: - { - LM_GGML_ASSERT(false); - } break; - } -} - -static void lm_ggml_compute_forward_conv_2d_stage_1( +static void lm_ggml_compute_forward_im2col( const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, const struct lm_ggml_tensor * src1, @@ -12269,7 +11841,7 @@ static void lm_ggml_compute_forward_conv_2d_stage_1( switch (src0->type) { case LM_GGML_TYPE_F16: { - lm_ggml_compute_forward_conv_2d_stage_1_f16(params, src0, src1, dst); + lm_ggml_compute_forward_im2col_f16(params, src0, src1, dst); } break; case LM_GGML_TYPE_F32: { @@ -12454,14 +12026,11 @@ static void lm_ggml_compute_forward_pool_1d( lm_ggml_compute_forward_pool_1d_sk_p0(params, op, src0, k0, dst); } -// lm_ggml_compute_forward_pool_2d_sk_p0 +// lm_ggml_compute_forward_pool_2d -static void lm_ggml_compute_forward_pool_2d_sk_p0( +static void lm_ggml_compute_forward_pool_2d( const struct lm_ggml_compute_params * params, - const enum lm_ggml_op_pool op, const struct lm_ggml_tensor * src, - const int k0, - const int k1, struct lm_ggml_tensor * dst) { assert(src->type == LM_GGML_TYPE_F32); assert(params->ith == 0); @@ -12470,6 +12039,14 @@ static void lm_ggml_compute_forward_pool_2d_sk_p0( return; } + const int32_t * opts = (const int32_t *)dst->op_params; + enum lm_ggml_op_pool op = opts[0]; + const int k0 = opts[1]; + const int k1 = opts[2]; + const int s0 = opts[3]; + const int s1 = opts[4]; + const int p0 = opts[5]; + const int p1 = opts[6]; const char * cdata = (const char*)src->data; const char * const data_end = cdata + lm_ggml_nbytes(src); @@ -12480,6 +12057,8 @@ static void lm_ggml_compute_forward_pool_2d_sk_p0( float * dplane = (float *)dst->data; const int ka = k0 * k1; + const int offset0 = -p0; + const int offset1 = -p1; while (cdata < data_end) { for (int oy = 0; oy < py; ++oy) { @@ -12492,13 +12071,15 @@ static void lm_ggml_compute_forward_pool_2d_sk_p0( case LM_GGML_OP_POOL_COUNT: LM_GGML_ASSERT(false); break; } - const int ix = ox * k0; - const int iy = oy * k1; + const int ix = offset0 + ox * s0; + const int iy = offset1 + oy * s1; for (int ky = 0; ky < k1; ++ky) { + if (iy + ky < 0 || iy + ky >= src->ne[1]) continue; const float * const srow = (const float *)(cdata + src->nb[1] * (iy + ky)); for (int kx = 0; kx < k0; ++kx) { int j = ix + kx; + if (j < 0 || j >= src->ne[0]) continue; switch (op) { case LM_GGML_OP_POOL_AVG: *out += 
srow[j]; break; case LM_GGML_OP_POOL_MAX: if (srow[j] > *out) *out = srow[j]; break; @@ -12519,29 +12100,6 @@ static void lm_ggml_compute_forward_pool_2d_sk_p0( } } -// lm_ggml_compute_forward_pool_2d - -static void lm_ggml_compute_forward_pool_2d( - const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - struct lm_ggml_tensor * dst) { - - const int32_t * opts = (const int32_t *)dst->op_params; - enum lm_ggml_op_pool op = opts[0]; - const int k0 = opts[1]; - const int k1 = opts[2]; - const int s0 = opts[3]; - const int s1 = opts[4]; - const int p0 = opts[5]; - const int p1 = opts[6]; - LM_GGML_ASSERT(p0 == 0); - LM_GGML_ASSERT(p1 == 0); // padding not supported - LM_GGML_ASSERT(k0 == s0); - LM_GGML_ASSERT(k1 == s1); // only s = k supported - - lm_ggml_compute_forward_pool_2d_sk_p0(params, op, src0, k0, k1, dst); -} - // lm_ggml_compute_forward_upscale static void lm_ggml_compute_forward_upscale_f32( @@ -12597,6 +12155,67 @@ static void lm_ggml_compute_forward_upscale( } } +// lm_ggml_compute_forward_argsort + +static void lm_ggml_compute_forward_argsort_f32( + const struct lm_ggml_compute_params * params, + const struct lm_ggml_tensor * src0, + struct lm_ggml_tensor * dst) { + + if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { + return; + } + + LM_GGML_TENSOR_UNARY_OP_LOCALS + + LM_GGML_ASSERT(nb0 == sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t nr = lm_ggml_nrows(src0); + + enum lm_ggml_sort_order order = (enum lm_ggml_sort_order) lm_ggml_get_op_params_i32(dst, 0); + + for (int64_t i = ith; i < nr; i += nth) { + int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1); + const float * src_data = (float *)((char *) src0->data + i*nb01); + + for (int64_t j = 0; j < ne0; j++) { + dst_data[j] = j; + } + + // C doesn't have a functional sort, so we do a bubble sort instead + for (int64_t j = 0; j < ne0; j++) { + for (int64_t k = j + 1; k < ne0; k++) { + if ((order == LM_GGML_SORT_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) || + (order == LM_GGML_SORT_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) { + int32_t tmp = dst_data[j]; + dst_data[j] = dst_data[k]; + dst_data[k] = tmp; + } + } + } + } +} + +static void lm_ggml_compute_forward_argsort( + const struct lm_ggml_compute_params * params, + const struct lm_ggml_tensor * src0, + struct lm_ggml_tensor * dst) { + + switch (src0->type) { + case LM_GGML_TYPE_F32: + { + lm_ggml_compute_forward_argsort_f32(params, src0, dst); + } break; + default: + { + LM_GGML_ASSERT(false); + } break; + } +} + // lm_ggml_compute_forward_flash_attn static void lm_ggml_compute_forward_flash_attn_f32( @@ -13743,6 +13362,10 @@ static void lm_ggml_compute_forward_unary( { lm_ggml_compute_forward_silu(params, src0, dst); } break; + case LM_GGML_UNARY_OP_LEAKY: + { + lm_ggml_compute_forward_leaky(params, src0, dst); + } break; default: { LM_GGML_ASSERT(false); @@ -14416,6 +14039,10 @@ static void lm_ggml_compute_forward(struct lm_ggml_compute_params * params, stru { lm_ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor); } break; + case LM_GGML_OP_MUL_MAT_ID: + { + lm_ggml_compute_forward_mul_mat_id(params, tensor); + } break; case LM_GGML_OP_OUT_PROD: { lm_ggml_compute_forward_out_prod(params, tensor->src[0], tensor->src[1], tensor); @@ -14474,7 +14101,7 @@ static void lm_ggml_compute_forward(struct lm_ggml_compute_params * params, stru } break; case LM_GGML_OP_SOFT_MAX: { - 
lm_ggml_compute_forward_soft_max(params, tensor->src[0], tensor); + lm_ggml_compute_forward_soft_max(params, tensor->src[0], tensor->src[1], tensor); } break; case LM_GGML_OP_SOFT_MAX_BACK: { @@ -14496,33 +14123,13 @@ static void lm_ggml_compute_forward(struct lm_ggml_compute_params * params, stru { lm_ggml_compute_forward_clamp(params, tensor->src[0], tensor); } break; - case LM_GGML_OP_CONV_1D: - { - lm_ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor); - } break; - case LM_GGML_OP_CONV_1D_STAGE_0: - { - lm_ggml_compute_forward_conv_1d_stage_0(params, tensor->src[0], tensor->src[1], tensor); - } break; - case LM_GGML_OP_CONV_1D_STAGE_1: - { - lm_ggml_compute_forward_conv_1d_stage_1(params, tensor->src[0], tensor->src[1], tensor); - } break; case LM_GGML_OP_CONV_TRANSPOSE_1D: { lm_ggml_compute_forward_conv_transpose_1d(params, tensor->src[0], tensor->src[1], tensor); } break; - case LM_GGML_OP_CONV_2D: - { - lm_ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor); - } break; - case LM_GGML_OP_CONV_2D_STAGE_0: - { - lm_ggml_compute_forward_conv_2d_stage_0(params, tensor->src[0], tensor->src[1], tensor); - } break; - case LM_GGML_OP_CONV_2D_STAGE_1: + case LM_GGML_OP_IM2COL: { - lm_ggml_compute_forward_conv_2d_stage_1(params, tensor->src[0], tensor->src[1], tensor); + lm_ggml_compute_forward_im2col(params, tensor->src[0], tensor->src[1], tensor); } break; case LM_GGML_OP_CONV_TRANSPOSE_2D: { @@ -14540,6 +14147,10 @@ static void lm_ggml_compute_forward(struct lm_ggml_compute_params * params, stru { lm_ggml_compute_forward_upscale(params, tensor->src[0], tensor); } break; + case LM_GGML_OP_ARGSORT: + { + lm_ggml_compute_forward_argsort(params, tensor->src[0], tensor); + } break; case LM_GGML_OP_FLASH_ATTN: { const int32_t t = lm_ggml_get_op_params_i32(tensor, 0); @@ -14651,62 +14262,109 @@ static void lm_ggml_compute_forward(struct lm_ggml_compute_params * params, stru //////////////////////////////////////////////////////////////////////////////// -static_assert(LM_GGML_GRAPH_HASHTABLE_SIZE > LM_GGML_MAX_NODES * 2, "LM_GGML_GRAPH_HT_SIZE is too small"); +static size_t lm_ggml_hash_size(size_t min_sz) { + // next primes after powers of two + static const size_t primes[] = { + 2, 3, 5, 11, 17, 37, 67, 131, 257, 521, 1031, + 2053, 4099, 8209, 16411, 32771, 65537, 131101, + 262147, 524309, 1048583, 2097169, 4194319, 8388617, + 16777259, 33554467, 67108879, 134217757, 268435459, + 536870923, 1073741827, 2147483659 + }; + static const size_t n_primes = sizeof(primes)/sizeof(primes[0]); + + // find the smallest prime that is larger or equal to min_sz + size_t l = 0; + size_t r = n_primes; + while (l < r) { + size_t m = (l + r)/2; + if (primes[m] < min_sz) { + l = m + 1; + } else { + r = m; + } + } + size_t sz = l < n_primes ? 
primes[l] : min_sz | 1; + return sz; +} -static size_t hash(void * p) { - return (size_t)p % LM_GGML_GRAPH_HASHTABLE_SIZE; +static size_t lm_ggml_hash(const void * p) { + return (size_t)p; } -static size_t hash_find(void * hash_table[], void * p) { - size_t h = hash(p); +size_t lm_ggml_hash_find(const struct lm_ggml_hash_set hash_set, struct lm_ggml_tensor * key) { + size_t h = lm_ggml_hash(key) % hash_set.size; // linear probing size_t i = h; - while (hash_table[i] != NULL && hash_table[i] != p) { - i = (i + 1) % LM_GGML_GRAPH_HASHTABLE_SIZE; + while (hash_set.keys[i] != NULL && hash_set.keys[i] != key) { + i = (i + 1) % hash_set.size; if (i == h) { // visited all hash table entries -> not found - return LM_GGML_GRAPH_HASHTABLE_SIZE; + return LM_GGML_HASHTABLE_FULL; } } return i; } -static bool hash_insert(void * hash_table[], void * p) { - size_t i = hash_find(hash_table, p); +bool lm_ggml_hash_contains(struct lm_ggml_hash_set hash_set, struct lm_ggml_tensor * key) { + size_t i = lm_ggml_hash_find(hash_set, key); + return i != LM_GGML_HASHTABLE_FULL && hash_set.keys[i] == key; +} - LM_GGML_ASSERT(i < LM_GGML_GRAPH_HASHTABLE_SIZE); // assert that not full +size_t lm_ggml_hash_insert(struct lm_ggml_hash_set hash_set, struct lm_ggml_tensor * key) { + size_t i = lm_ggml_hash_find(hash_set, key); - if (hash_table[i] == p) { - return true; + LM_GGML_ASSERT(i != LM_GGML_HASHTABLE_FULL); + + if (hash_set.keys[i] == key) { + return LM_GGML_HASHTABLE_ALREADY_EXISTS; } // insert - LM_GGML_ASSERT(hash_table[i] == NULL); - hash_table[i] = p; - return false; + LM_GGML_ASSERT(hash_set.keys[i] == NULL); + hash_set.keys[i] = key; + return i; +} + +size_t lm_ggml_hash_find_or_insert(struct lm_ggml_hash_set hash_set, struct lm_ggml_tensor * key) { + size_t i = lm_ggml_hash_find(hash_set, key); + + LM_GGML_ASSERT(i != LM_GGML_HASHTABLE_FULL); + + hash_set.keys[i] = key; + return i; +} + +static struct lm_ggml_hash_set lm_ggml_hash_set_new(size_t size) { + size = lm_ggml_hash_size(size); + struct lm_ggml_hash_set result; + result.size = size; + result.keys = malloc(sizeof(struct lm_ggml_tensor *) * size); + memset(result.keys, 0, sizeof(struct lm_ggml_tensor *) * size); + return result; } -static bool hash_contains(void * hash_table[], void * p) { - size_t i = hash_find(hash_table, p); - return (i < LM_GGML_GRAPH_HASHTABLE_SIZE) && (hash_table[i] == p); +static void lm_ggml_hash_set_free(struct lm_ggml_hash_set hash_set) { + free(hash_set.keys); } struct hash_map { - void * keys[LM_GGML_GRAPH_HASHTABLE_SIZE]; - void * vals[LM_GGML_GRAPH_HASHTABLE_SIZE]; + struct lm_ggml_hash_set set; + struct lm_ggml_tensor ** vals; }; -static struct hash_map * new_hash_map(void) { +static struct hash_map * lm_ggml_new_hash_map(size_t size) { struct hash_map * result = malloc(sizeof(struct hash_map)); - for (int i=0; ikeys[i] = NULL; - result->vals[i] = NULL; - } + result->set = lm_ggml_hash_set_new(size); + result->vals = malloc(sizeof(struct lm_ggml_tensor *) * result->set.size); + memset(result->vals, 0, sizeof(struct lm_ggml_tensor *) * result->set.size); return result; } -static void free_hash_map(struct hash_map * map) { +static void lm_ggml_hash_map_free(struct hash_map * map) { + lm_ggml_hash_set_free(map->set); + free(map->vals); free(map); } @@ -14726,7 +14384,7 @@ static struct lm_ggml_tensor * lm_ggml_recompute_graph_node( return node; } - if (!hash_contains(graph->visited_hash_table, node)) { + if (!lm_ggml_hash_contains(graph->visited_hash_table, node)) { return node; } @@ -14741,17 +14399,17 @@ static struct 
lm_ggml_tensor * lm_ggml_recompute_graph_node( return node; } - size_t i = hash_find(replacements->keys, node); - LM_GGML_ASSERT(i < LM_GGML_GRAPH_HASHTABLE_SIZE); // assert that not full - if (replacements->keys[i] == node) { - return (struct lm_ggml_tensor *) replacements->vals[i]; + size_t i = lm_ggml_hash_find(replacements->set, node); + LM_GGML_ASSERT(i != LM_GGML_HASHTABLE_FULL); // assert that not full + if (replacements->set.keys[i] == node) { + return replacements->vals[i]; } struct lm_ggml_tensor * clone = lm_ggml_new_tensor(ctx, node->type, node->n_dims, node->ne); // insert clone into replacements - LM_GGML_ASSERT(replacements->keys[i] == NULL); // assert that we don't overwrite - replacements->keys[i] = node; + LM_GGML_ASSERT(replacements->set.keys[i] == NULL); // assert that we don't overwrite + replacements->set.keys[i] = node; replacements->vals[i] = clone; clone->op = node->op; @@ -14788,26 +14446,26 @@ void lm_ggml_build_backward_gradient_checkpointing( struct lm_ggml_cgraph * gb_tmp, struct lm_ggml_tensor * * checkpoints, int n_checkpoints) { - *gb_tmp = *gf; + lm_ggml_graph_cpy(gf, gb_tmp); lm_ggml_build_backward_expand(ctx, gf, gb_tmp, true); if (n_checkpoints <= 0) { - *gb = *gb_tmp; + lm_ggml_graph_cpy(gb_tmp, gb); return; } - struct hash_map * replacements = new_hash_map(); + struct hash_map * replacements = lm_ggml_new_hash_map(gf->n_nodes + gf->n_leafs + n_checkpoints); // insert checkpoints in replacements for (int i = 0; i < n_checkpoints; ++i) { - size_t k = hash_find(replacements->keys, checkpoints[i]); - LM_GGML_ASSERT(k < LM_GGML_GRAPH_HASHTABLE_SIZE); // assert that not full - LM_GGML_ASSERT(replacements->keys[k] == NULL); // assert that we don't overwrite - replacements->keys[k] = checkpoints[i]; - replacements->vals[k] = checkpoints[i]; + size_t k = lm_ggml_hash_find(replacements->set, checkpoints[i]); + LM_GGML_ASSERT(k != LM_GGML_HASHTABLE_FULL); // assert that not full + LM_GGML_ASSERT(replacements->set.keys[k] == NULL); // assert that we don't overwrite + replacements->set.keys[k] = checkpoints[i]; + replacements->vals[k] = checkpoints[i]; } - *gb = *gf; + lm_ggml_graph_cpy(gf, gb); // rewrite gb_tmp->nodes[gf->n_nodes:gb_tmp->n_nodes], // replacing references to gb_tmp->nodes[0:gf->n_nodes] ( == gf->nodes[0:gf->n_nodes]), // by recomputing them from checkpoints @@ -14824,21 +14482,21 @@ void lm_ggml_build_backward_gradient_checkpointing( lm_ggml_build_forward_expand(gb, node); } - free_hash_map(replacements); + lm_ggml_hash_map_free(replacements); } // functions to change gradients considering the case that input a might be initial gradient with zero value -static struct lm_ggml_tensor * lm_ggml_add_or_set(struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, struct lm_ggml_tensor * b, void * zero_table[]) { - if (hash_contains(zero_table, a)) { +static struct lm_ggml_tensor * lm_ggml_add_or_set(struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, struct lm_ggml_tensor * b, struct lm_ggml_hash_set zero_table) { + if (lm_ggml_hash_contains(zero_table, a)) { return b; } else { return lm_ggml_add_impl(ctx, a, b, false); } } -static struct lm_ggml_tensor * lm_ggml_acc_or_set(struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, struct lm_ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, void * zero_table[]) { - if (hash_contains(zero_table, a)) { +static struct lm_ggml_tensor * lm_ggml_acc_or_set(struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, struct lm_ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t 
offset, struct lm_ggml_hash_set zero_table) { + if (lm_ggml_hash_contains(zero_table, a)) { struct lm_ggml_tensor * a_zero = lm_ggml_scale(ctx, a, lm_ggml_new_f32(ctx, 0)); return lm_ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false); } else { @@ -14846,23 +14504,23 @@ static struct lm_ggml_tensor * lm_ggml_acc_or_set(struct lm_ggml_context * ctx, } } -static struct lm_ggml_tensor * lm_ggml_add1_or_set(struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, struct lm_ggml_tensor * b, void * zero_table[]) { - if (hash_contains(zero_table, a)) { +static struct lm_ggml_tensor * lm_ggml_add1_or_set(struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, struct lm_ggml_tensor * b, struct lm_ggml_hash_set zero_table) { + if (lm_ggml_hash_contains(zero_table, a)) { return lm_ggml_repeat(ctx, b, a); } else { return lm_ggml_add1_impl(ctx, a, b, false); } } -static struct lm_ggml_tensor * lm_ggml_sub_or_set(struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, struct lm_ggml_tensor * b, void * zero_table[]) { - if (hash_contains(zero_table, a)) { +static struct lm_ggml_tensor * lm_ggml_sub_or_set(struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, struct lm_ggml_tensor * b, struct lm_ggml_hash_set zero_table) { + if (lm_ggml_hash_contains(zero_table, a)) { return lm_ggml_neg(ctx, b); } else { return lm_ggml_sub_impl(ctx, a, b, false); } } -static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggml_tensor * tensor, void * zero_table[]) { +static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggml_tensor * tensor, struct lm_ggml_hash_set zero_table) { struct lm_ggml_tensor * src0 = tensor->src[0]; struct lm_ggml_tensor * src1 = tensor->src[1]; @@ -15143,6 +14801,10 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm zero_table); } } break; + case LM_GGML_OP_MUL_MAT_ID: + { + LM_GGML_ASSERT(false); // TODO: not implemented + } break; case LM_GGML_OP_OUT_PROD: { LM_GGML_ASSERT(false); // TODO: not implemented @@ -15449,23 +15111,11 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm zero_table); } } break; - case LM_GGML_OP_ALIBI: - { - LM_GGML_ASSERT(false); // TODO: not implemented - } break; - case LM_GGML_OP_CLAMP: - { - LM_GGML_ASSERT(false); // TODO: not implemented - } break; - case LM_GGML_OP_CONV_1D: - { - LM_GGML_ASSERT(false); // TODO: not implemented - } break; - case LM_GGML_OP_CONV_1D_STAGE_0: + case LM_GGML_OP_ALIBI: { LM_GGML_ASSERT(false); // TODO: not implemented } break; - case LM_GGML_OP_CONV_1D_STAGE_1: + case LM_GGML_OP_CLAMP: { LM_GGML_ASSERT(false); // TODO: not implemented } break; @@ -15473,15 +15123,7 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm { LM_GGML_ASSERT(false); // TODO: not implemented } break; - case LM_GGML_OP_CONV_2D: - { - LM_GGML_ASSERT(false); // TODO: not implemented - } break; - case LM_GGML_OP_CONV_2D_STAGE_0: - { - LM_GGML_ASSERT(false); // TODO: not implemented - } break; - case LM_GGML_OP_CONV_2D_STAGE_1: + case LM_GGML_OP_IM2COL: { LM_GGML_ASSERT(false); // TODO: not implemented } break; @@ -15501,6 +15143,10 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm { LM_GGML_ASSERT(false); // TODO: not implemented } break; + case LM_GGML_OP_ARGSORT: + { + LM_GGML_ASSERT(false); // TODO: not implemented + } break; case LM_GGML_OP_FLASH_ATTN: { struct lm_ggml_tensor * flash_grad = NULL; @@ -15695,7 +15341,7 @@ static void lm_ggml_visit_parents(struct lm_ggml_cgraph 
* cgraph, struct lm_ggml } // check if already visited - if (hash_insert(cgraph->visited_hash_table, node)) { + if (lm_ggml_hash_insert(cgraph->visited_hash_table, node) == LM_GGML_HASHTABLE_ALREADY_EXISTS) { return; } @@ -15711,7 +15357,7 @@ static void lm_ggml_visit_parents(struct lm_ggml_cgraph * cgraph, struct lm_ggml if (node->op == LM_GGML_OP_NONE && node->grad == NULL) { // reached a leaf node, not part of the gradient graph (e.g. a constant) - LM_GGML_ASSERT(cgraph->n_leafs < LM_GGML_MAX_NODES); + LM_GGML_ASSERT(cgraph->n_leafs < cgraph->size); if (strlen(node->name) == 0) { lm_ggml_format_name(node, "leaf_%d", cgraph->n_leafs); @@ -15720,22 +15366,24 @@ static void lm_ggml_visit_parents(struct lm_ggml_cgraph * cgraph, struct lm_ggml cgraph->leafs[cgraph->n_leafs] = node; cgraph->n_leafs++; } else { - LM_GGML_ASSERT(cgraph->n_nodes < LM_GGML_MAX_NODES); + LM_GGML_ASSERT(cgraph->n_nodes < cgraph->size); if (strlen(node->name) == 0) { lm_ggml_format_name(node, "node_%d", cgraph->n_nodes); } cgraph->nodes[cgraph->n_nodes] = node; - cgraph->grads[cgraph->n_nodes] = node->grad; + if (cgraph->grads) { + cgraph->grads[cgraph->n_nodes] = node->grad; + } cgraph->n_nodes++; } } static void lm_ggml_build_forward_impl(struct lm_ggml_cgraph * cgraph, struct lm_ggml_tensor * tensor, bool expand) { if (!expand) { - cgraph->n_nodes = 0; - cgraph->n_leafs = 0; + // TODO: this branch isn't accessible anymore, maybe move this to lm_ggml_build_forward_expand + lm_ggml_graph_clear(cgraph); } const int n0 = cgraph->n_nodes; @@ -15756,25 +15404,6 @@ void lm_ggml_build_forward_expand(struct lm_ggml_cgraph * cgraph, struct lm_ggml lm_ggml_build_forward_impl(cgraph, tensor, true); } -struct lm_ggml_cgraph lm_ggml_build_forward(struct lm_ggml_tensor * tensor) { - struct lm_ggml_cgraph result = { - /*.n_nodes =*/ 0, - /*.n_leafs =*/ 0, - /*.nodes =*/ { NULL }, - /*.grads =*/ { NULL }, - /*.leafs =*/ { NULL }, - /*.hash_table =*/ { NULL }, - /*.order =*/ LM_GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT, - /*.perf_runs =*/ 0, - /*.perf_cycles =*/ 0, - /*.perf_time_us =*/ 0, - }; - - lm_ggml_build_forward_impl(&result, tensor, false); - - return result; -} - void lm_ggml_build_backward_expand(struct lm_ggml_context * ctx, struct lm_ggml_cgraph * gf, struct lm_ggml_cgraph * gb, bool keep) { LM_GGML_ASSERT(gf->n_nodes > 0); @@ -15791,11 +15420,10 @@ void lm_ggml_build_backward_expand(struct lm_ggml_context * ctx, struct lm_ggml_ } // remember original gradients which start with zero values - void ** zero_table = malloc(sizeof(void *) * LM_GGML_GRAPH_HASHTABLE_SIZE); - memset(zero_table, 0, sizeof(void*) * LM_GGML_GRAPH_HASHTABLE_SIZE); + struct lm_ggml_hash_set zero_table = lm_ggml_hash_set_new(gf->size); for (int i = 0; i < gf->n_nodes; i++) { if (gf->grads[i]) { - hash_insert(zero_table, gf->grads[i]); + lm_ggml_hash_insert(zero_table, gf->grads[i]); } } @@ -15818,26 +15446,54 @@ void lm_ggml_build_backward_expand(struct lm_ggml_context * ctx, struct lm_ggml_ } } - free(zero_table); + lm_ggml_hash_set_free(zero_table); } -struct lm_ggml_cgraph lm_ggml_build_backward(struct lm_ggml_context * ctx, struct lm_ggml_cgraph * gf, bool keep) { - struct lm_ggml_cgraph result = *gf; - lm_ggml_build_backward_expand(ctx, gf, &result, keep); - return result; +static size_t lm_ggml_graph_nbytes(size_t size, bool grads) { + size_t nbytes = sizeof(struct lm_ggml_cgraph); + nbytes += size * sizeof(struct lm_ggml_tensor *) * 2; // leafs + nodes + if (grads) { + nbytes += size * sizeof(struct lm_ggml_tensor *); // grads + } + nbytes += 
lm_ggml_hash_size(size * 2) * sizeof(struct lm_ggml_tensor *); // hash set + return nbytes; } -struct lm_ggml_cgraph * lm_ggml_new_graph(struct lm_ggml_context * ctx) { - struct lm_ggml_object * obj = lm_ggml_new_object(ctx, LM_GGML_OBJECT_GRAPH, LM_GGML_GRAPH_SIZE); +size_t lm_ggml_graph_overhead_custom(size_t size, bool grads) { + return LM_GGML_OBJECT_SIZE + LM_GGML_PAD(lm_ggml_graph_nbytes(size, grads), LM_GGML_MEM_ALIGN); +} + +size_t lm_ggml_graph_overhead(void) { + return lm_ggml_graph_overhead_custom(LM_GGML_DEFAULT_GRAPH_SIZE, false); +} + +struct lm_ggml_cgraph * lm_ggml_new_graph_custom(struct lm_ggml_context * ctx, size_t size, bool grads) { + const size_t obj_size = lm_ggml_graph_nbytes(size, grads); + struct lm_ggml_object * obj = lm_ggml_new_object(ctx, LM_GGML_OBJECT_GRAPH, obj_size); struct lm_ggml_cgraph * cgraph = (struct lm_ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs); + struct lm_ggml_tensor ** data_start = (struct lm_ggml_tensor **) (cgraph + 1); + + size_t hash_size = lm_ggml_hash_size(size * 2); + struct lm_ggml_tensor ** nodes_ptr = data_start; + struct lm_ggml_tensor ** leafs_ptr = nodes_ptr + size; + struct lm_ggml_tensor ** hash_keys_ptr = leafs_ptr + size; + struct lm_ggml_tensor ** grads_ptr = grads ? hash_keys_ptr + hash_size : NULL; + + // check that we allocated the correct amount of memory + assert(obj_size == (size_t) ( + (grads ? (char *)(grads_ptr + size) : (char *)(hash_keys_ptr + hash_size)) - (char *)cgraph)); + + memset(hash_keys_ptr, 0, hash_size * sizeof(struct lm_ggml_tensor *)); + *cgraph = (struct lm_ggml_cgraph) { + /*.size =*/ size, /*.n_nodes =*/ 0, /*.n_leafs =*/ 0, - /*.nodes =*/ { NULL }, - /*.grads =*/ { NULL }, - /*.leafs =*/ { NULL }, - /*.hash_table =*/ { NULL }, + /*.nodes =*/ nodes_ptr, + /*.grads =*/ grads_ptr, + /*.leafs =*/ leafs_ptr, + /*.hash_table =*/ { hash_size, hash_keys_ptr }, /*.order =*/ LM_GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT, /*.perf_runs =*/ 0, /*.perf_cycles =*/ 0, @@ -15847,14 +15503,81 @@ struct lm_ggml_cgraph * lm_ggml_new_graph(struct lm_ggml_context * ctx) { return cgraph; } -struct lm_ggml_cgraph * lm_ggml_build_forward_ctx(struct lm_ggml_context * ctx, struct lm_ggml_tensor * tensor) { - struct lm_ggml_cgraph * cgraph = lm_ggml_new_graph(ctx); - lm_ggml_build_forward_impl(cgraph, tensor, false); +struct lm_ggml_cgraph * lm_ggml_new_graph(struct lm_ggml_context * ctx) { + return lm_ggml_new_graph_custom(ctx, LM_GGML_DEFAULT_GRAPH_SIZE, false); +} + +struct lm_ggml_cgraph lm_ggml_graph_view(struct lm_ggml_cgraph * cgraph0, int i0, int i1) { + struct lm_ggml_cgraph cgraph = { + /*.size =*/ 0, + /*.n_nodes =*/ i1 - i0, + /*.n_leafs =*/ 0, + /*.nodes =*/ cgraph0->nodes + i0, + /*.grads =*/ cgraph0->grads ? 
cgraph0->grads + i0 : NULL, + /*.leafs =*/ NULL, + /*.hash_table =*/ { 0, NULL }, + /*.order =*/ cgraph0->order, + /*.perf_runs =*/ 0, + /*.perf_cycles =*/ 0, + /*.perf_time_us =*/ 0, + }; + return cgraph; } -size_t lm_ggml_graph_overhead(void) { - return LM_GGML_OBJECT_SIZE + LM_GGML_PAD(LM_GGML_GRAPH_SIZE, LM_GGML_MEM_ALIGN); +void lm_ggml_graph_cpy(struct lm_ggml_cgraph * src, struct lm_ggml_cgraph * dst) { + LM_GGML_ASSERT(dst->size >= src->n_leafs); + LM_GGML_ASSERT(dst->size >= src->n_nodes); + LM_GGML_ASSERT(dst->visited_hash_table.size >= src->visited_hash_table.size); + + dst->n_leafs = src->n_leafs; + dst->n_nodes = src->n_nodes; + dst->order = src->order; + + for (int i = 0; i < src->n_leafs; ++i) { + dst->leafs[i] = src->leafs[i]; + } + + for (int i = 0; i < src->n_nodes; ++i) { + dst->nodes[i] = src->nodes[i]; + } + + if (src->grads) { + LM_GGML_ASSERT(dst->grads != NULL); + for (int i = 0; i < src->n_nodes; ++i) { + dst->grads[i] = src->grads[i]; + } + } + + for (size_t i = 0; i < src->visited_hash_table.size; ++i) { + if (src->visited_hash_table.keys[i]) { + lm_ggml_hash_insert(dst->visited_hash_table, src->visited_hash_table.keys[i]); + } + } +} + +struct lm_ggml_cgraph * lm_ggml_graph_dup(struct lm_ggml_context * ctx, struct lm_ggml_cgraph * cgraph) { + struct lm_ggml_cgraph * result = lm_ggml_new_graph_custom(ctx, cgraph->size, cgraph->grads != NULL); + lm_ggml_graph_cpy(cgraph, result); + return result; +} + +void lm_ggml_graph_reset(struct lm_ggml_cgraph * cgraph) { + LM_GGML_ASSERT(cgraph->grads != NULL); + + for (int i = 0; i < cgraph->n_nodes; i++) { + struct lm_ggml_tensor * grad = cgraph->grads[i]; + + if (grad) { + lm_ggml_set_zero(grad); + } + } +} + +void lm_ggml_graph_clear(struct lm_ggml_cgraph * cgraph) { + cgraph->n_leafs = 0; + cgraph->n_nodes = 0; + memset(cgraph->visited_hash_table.keys, 0, cgraph->visited_hash_table.size * sizeof(struct lm_ggml_tensor *)); } // @@ -15966,45 +15689,285 @@ static void clear_numa_thread_affinity(void) { strerror(rv)); } - CPU_FREE(cpus); -} -#else -// TODO: Windows etc. -// (the linux implementation may also work on BSD, someone should test) -static void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); } -static void clear_numa_thread_affinity(void) {} -#endif - -struct lm_ggml_compute_state_shared { - const struct lm_ggml_cgraph * cgraph; - const struct lm_ggml_cplan * cplan; - - int64_t perf_node_start_cycles; - int64_t perf_node_start_time_us; - - const int n_threads; - - // synchronization primitives - atomic_int n_active; // num active threads - atomic_int node_n; // active graph node - - bool (*abort_callback)(void * data); // abort lm_ggml_graph_compute when true - void * abort_callback_data; -}; - -struct lm_ggml_compute_state { - lm_ggml_thread_t thrd; - int ith; - struct lm_ggml_compute_state_shared * shared; -}; - -static void lm_ggml_graph_compute_perf_stats_node(struct lm_ggml_tensor * node, const struct lm_ggml_compute_state_shared * st) { - int64_t cycles_cur = lm_ggml_perf_cycles() - st->perf_node_start_cycles; - int64_t time_us_cur = lm_ggml_perf_time_us() - st->perf_node_start_time_us; + CPU_FREE(cpus); +} +#else +// TODO: Windows etc. 
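// (on non-Linux builds both affinity helpers below compile to no-ops)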
+// (the linux implementation may also work on BSD, someone should test) +static void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); } +static void clear_numa_thread_affinity(void) {} +#endif + +struct lm_ggml_compute_state_shared { + const struct lm_ggml_cgraph * cgraph; + const struct lm_ggml_cplan * cplan; + + int64_t perf_node_start_cycles; + int64_t perf_node_start_time_us; + + const int n_threads; + + // synchronization primitives + atomic_int n_active; // num active threads + atomic_int node_n; // active graph node + + bool (*abort_callback)(void * data); // abort lm_ggml_graph_compute when true + void * abort_callback_data; +}; + +struct lm_ggml_compute_state { + lm_ggml_thread_t thrd; + int ith; + struct lm_ggml_compute_state_shared * shared; +}; + +static void lm_ggml_graph_compute_perf_stats_node(struct lm_ggml_tensor * node, const struct lm_ggml_compute_state_shared * st) { + int64_t cycles_cur = lm_ggml_perf_cycles() - st->perf_node_start_cycles; + int64_t time_us_cur = lm_ggml_perf_time_us() - st->perf_node_start_time_us; + + node->perf_runs++; + node->perf_cycles += cycles_cur; + node->perf_time_us += time_us_cur; +} + +static int lm_ggml_get_n_tasks(struct lm_ggml_tensor * node, int n_threads) { + int n_tasks = 0; + + switch (node->op) { + case LM_GGML_OP_CPY: + case LM_GGML_OP_DUP: + case LM_GGML_OP_ADD: + case LM_GGML_OP_ADD1: + case LM_GGML_OP_ACC: + { + n_tasks = n_threads; + } break; + case LM_GGML_OP_SUB: + case LM_GGML_OP_SQR: + case LM_GGML_OP_SQRT: + case LM_GGML_OP_LOG: + case LM_GGML_OP_SUM: + case LM_GGML_OP_SUM_ROWS: + case LM_GGML_OP_MEAN: + case LM_GGML_OP_ARGMAX: + case LM_GGML_OP_REPEAT: + case LM_GGML_OP_REPEAT_BACK: + { + n_tasks = 1; + } break; + case LM_GGML_OP_UNARY: + switch (lm_ggml_get_unary_op(node)) { + case LM_GGML_UNARY_OP_ABS: + case LM_GGML_UNARY_OP_SGN: + case LM_GGML_UNARY_OP_NEG: + case LM_GGML_UNARY_OP_STEP: + case LM_GGML_UNARY_OP_TANH: + case LM_GGML_UNARY_OP_ELU: + case LM_GGML_UNARY_OP_RELU: + case LM_GGML_UNARY_OP_LEAKY: + { + n_tasks = 1; + } break; + + case LM_GGML_UNARY_OP_GELU: + case LM_GGML_UNARY_OP_GELU_QUICK: + case LM_GGML_UNARY_OP_SILU: + { + n_tasks = n_threads; + } break; + default: + LM_GGML_ASSERT(false); + } + break; + case LM_GGML_OP_SILU_BACK: + case LM_GGML_OP_MUL: + case LM_GGML_OP_DIV: + case LM_GGML_OP_NORM: + case LM_GGML_OP_RMS_NORM: + case LM_GGML_OP_RMS_NORM_BACK: + case LM_GGML_OP_GROUP_NORM: + case LM_GGML_OP_CONCAT: + { + n_tasks = n_threads; + } break; + case LM_GGML_OP_MUL_MAT: + { + n_tasks = n_threads; + + // TODO: use different scheduling for different matrix sizes + //const int nr0 = lm_ggml_nrows(node->src[0]); + //const int nr1 = lm_ggml_nrows(node->src[1]); + + //n_tasks = MIN(n_threads, MAX(1, nr0/128)); + //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks); + +#if defined(LM_GGML_USE_CUBLAS) + if (lm_ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) { + n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + } +#elif defined(LM_GGML_USE_CLBLAST) + if (lm_ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) { + n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + } +#endif +#if defined(LM_GGML_USE_ACCELERATE) || defined(LM_GGML_USE_OPENBLAS) + if (lm_ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) { + n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + } +#endif + } break; 
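Note: lm_ggml_get_n_tasks() above takes over the per-node thread counts that lm_ggml_graph_plan() used to record in cplan.n_tasks, so they are now derived on demand from the op type. A minimal caller-side sketch, illustrative only; the helper name and the thread count of 4 are assumptions, and the caller only has to supply the work buffer reported by the plan:

#include <stdint.h>
#include <stdlib.h>
#include "ggml.h"

static int example_compute(struct lm_ggml_cgraph * gf) {
    struct lm_ggml_cplan plan = lm_ggml_graph_plan(gf, /*n_threads =*/ 4);

    uint8_t * work = NULL;
    if (plan.work_size > 0) {
        work = (uint8_t *) malloc(plan.work_size); // caller must provide work_data
        plan.work_data = work;
    }

    const int status = lm_ggml_graph_compute(gf, &plan);

    free(work);
    return status;
}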
+ case LM_GGML_OP_MUL_MAT_ID: + { + // FIXME: blas + n_tasks = n_threads; + } break; + case LM_GGML_OP_OUT_PROD: + { + n_tasks = n_threads; + } break; + case LM_GGML_OP_SCALE: + case LM_GGML_OP_SET: + case LM_GGML_OP_CONT: + case LM_GGML_OP_RESHAPE: + case LM_GGML_OP_VIEW: + case LM_GGML_OP_PERMUTE: + case LM_GGML_OP_TRANSPOSE: + case LM_GGML_OP_GET_ROWS: + case LM_GGML_OP_GET_ROWS_BACK: + case LM_GGML_OP_DIAG: + { + n_tasks = 1; + } break; + case LM_GGML_OP_DIAG_MASK_ZERO: + case LM_GGML_OP_DIAG_MASK_INF: + case LM_GGML_OP_SOFT_MAX_BACK: + case LM_GGML_OP_ROPE: + case LM_GGML_OP_ROPE_BACK: + case LM_GGML_OP_ADD_REL_POS: + { + n_tasks = n_threads; + } break; + case LM_GGML_OP_ALIBI: + { + n_tasks = 1; //TODO + } break; + case LM_GGML_OP_CLAMP: + { + n_tasks = 1; //TODO + } break; + case LM_GGML_OP_SOFT_MAX: + { + n_tasks = MIN(MIN(4, n_threads), lm_ggml_nrows(node->src[0])); + } break; + case LM_GGML_OP_CONV_TRANSPOSE_1D: + { + n_tasks = n_threads; + } break; + case LM_GGML_OP_IM2COL: + { + n_tasks = n_threads; + } break; + case LM_GGML_OP_CONV_TRANSPOSE_2D: + { + n_tasks = n_threads; + } break; + case LM_GGML_OP_POOL_1D: + case LM_GGML_OP_POOL_2D: + { + n_tasks = 1; + } break; + case LM_GGML_OP_UPSCALE: + { + n_tasks = n_threads; + } break; + case LM_GGML_OP_ARGSORT: + { + n_tasks = n_threads; + } break; + case LM_GGML_OP_FLASH_ATTN: + { + n_tasks = n_threads; + } break; + case LM_GGML_OP_FLASH_FF: + { + n_tasks = n_threads; + } break; + case LM_GGML_OP_FLASH_ATTN_BACK: + { + n_tasks = n_threads; + } break; + case LM_GGML_OP_WIN_PART: + case LM_GGML_OP_WIN_UNPART: + case LM_GGML_OP_GET_REL_POS: + case LM_GGML_OP_MAP_UNARY: + case LM_GGML_OP_MAP_BINARY: + case LM_GGML_OP_MAP_CUSTOM1_F32: + case LM_GGML_OP_MAP_CUSTOM2_F32: + case LM_GGML_OP_MAP_CUSTOM3_F32: + { + n_tasks = 1; + } break; + case LM_GGML_OP_MAP_CUSTOM1: + { + struct lm_ggml_map_custom1_op_params * p = (struct lm_ggml_map_custom1_op_params *) node->op_params; + if (p->n_tasks == LM_GGML_N_TASKS_MAX) { + n_tasks = n_threads; + } else { + n_tasks = MIN(p->n_tasks, n_threads); + } + } break; + case LM_GGML_OP_MAP_CUSTOM2: + { + struct lm_ggml_map_custom2_op_params * p = (struct lm_ggml_map_custom2_op_params *) node->op_params; + if (p->n_tasks == LM_GGML_N_TASKS_MAX) { + n_tasks = n_threads; + } else { + n_tasks = MIN(p->n_tasks, n_threads); + } + } break; + case LM_GGML_OP_MAP_CUSTOM3: + { + struct lm_ggml_map_custom3_op_params * p = (struct lm_ggml_map_custom3_op_params *) node->op_params; + if (p->n_tasks == LM_GGML_N_TASKS_MAX) { + n_tasks = n_threads; + } else { + n_tasks = MIN(p->n_tasks, n_threads); + } + } break; + case LM_GGML_OP_CROSS_ENTROPY_LOSS: + { + n_tasks = n_threads; + } break; + case LM_GGML_OP_CROSS_ENTROPY_LOSS_BACK: + { + n_tasks = n_threads; + } break; + case LM_GGML_OP_NONE: + { + n_tasks = 1; + } break; + case LM_GGML_OP_COUNT: + { + LM_GGML_ASSERT(false); + } break; + default: + { + fprintf(stderr, "%s: op not implemented: ", __func__); + if (node->op < LM_GGML_OP_COUNT) { + fprintf(stderr, "%s\n", lm_ggml_op_name(node->op)); + } else { + fprintf(stderr, "%d\n", node->op); + } + LM_GGML_ASSERT(false); + } break; + } + + assert(n_tasks > 0); - node->perf_runs++; - node->perf_cycles += cycles_cur; - node->perf_time_us += time_us_cur; + return n_tasks; } static thread_ret_t lm_ggml_graph_compute_thread(void * data) { @@ -16013,7 +15976,6 @@ static thread_ret_t lm_ggml_graph_compute_thread(void * data) { const struct lm_ggml_cgraph * cgraph = state->shared->cgraph; const struct lm_ggml_cplan * cplan = 
state->shared->cplan; - const int * n_tasks_arr = cplan->n_tasks; const int n_threads = state->shared->n_threads; set_numa_thread_affinity(state->ith, n_threads); @@ -16038,9 +16000,9 @@ static thread_ret_t lm_ggml_graph_compute_thread(void * data) { if (node_n != -1) { /* FINALIZE */ - struct lm_ggml_tensor * node = state->shared->cgraph->nodes[node_n]; + struct lm_ggml_tensor * node = cgraph->nodes[node_n]; if (LM_GGML_OP_HAS_FINALIZE[node->op]) { - params.nth = n_tasks_arr[node_n]; + params.nth = lm_ggml_get_n_tasks(node, n_threads); lm_ggml_compute_forward(¶ms, node); } lm_ggml_graph_compute_perf_stats_node(node, state->shared); @@ -16051,7 +16013,7 @@ static thread_ret_t lm_ggml_graph_compute_thread(void * data) { LM_GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes); struct lm_ggml_tensor * node = cgraph->nodes[node_n]; - const int n_tasks = n_tasks_arr[node_n]; + const int n_tasks = lm_ggml_get_n_tasks(node, n_threads); state->shared->perf_node_start_cycles = lm_ggml_perf_cycles(); state->shared->perf_node_start_time_us = lm_ggml_perf_time_us(); @@ -16109,7 +16071,7 @@ static thread_ret_t lm_ggml_graph_compute_thread(void * data) { /* COMPUTE */ struct lm_ggml_tensor * node = cgraph->nodes[node_n]; - const int n_tasks = n_tasks_arr[node_n]; + const int n_tasks = lm_ggml_get_n_tasks(node, n_threads); struct lm_ggml_compute_params params = { /*.type =*/ LM_GGML_TASK_COMPUTE, @@ -16139,125 +16101,44 @@ struct lm_ggml_cplan lm_ggml_graph_plan(struct lm_ggml_cgraph * cgraph, int n_th // thread scheduling for the different operations + work buffer size estimation for (int i = 0; i < cgraph->n_nodes; i++) { - int n_tasks = 1; - struct lm_ggml_tensor * node = cgraph->nodes[i]; + const int n_tasks = lm_ggml_get_n_tasks(node, n_threads); + + size_t cur = 0; + switch (node->op) { case LM_GGML_OP_CPY: case LM_GGML_OP_DUP: { - n_tasks = n_threads; - - size_t cur = 0; if (lm_ggml_is_quantized(node->type)) { cur = lm_ggml_type_size(LM_GGML_TYPE_F32) * node->ne[0] * n_tasks; } - - work_size = MAX(work_size, cur); } break; case LM_GGML_OP_ADD: case LM_GGML_OP_ADD1: { - n_tasks = n_threads; - - size_t cur = 0; - if (lm_ggml_is_quantized(node->src[0]->type)) { cur = lm_ggml_type_size(LM_GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks; } - - work_size = MAX(work_size, cur); } break; case LM_GGML_OP_ACC: { - n_tasks = n_threads; - - size_t cur = 0; - if (lm_ggml_is_quantized(node->src[0]->type)) { cur = lm_ggml_type_size(LM_GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks; } - - work_size = MAX(work_size, cur); - } break; - case LM_GGML_OP_SUB: - case LM_GGML_OP_DIV: - case LM_GGML_OP_SQR: - case LM_GGML_OP_SQRT: - case LM_GGML_OP_LOG: - case LM_GGML_OP_SUM: - case LM_GGML_OP_SUM_ROWS: - case LM_GGML_OP_MEAN: - case LM_GGML_OP_ARGMAX: - case LM_GGML_OP_REPEAT: - case LM_GGML_OP_REPEAT_BACK: - { - n_tasks = 1; - } break; - - case LM_GGML_OP_UNARY: - { - switch (lm_ggml_get_unary_op(node)) { - case LM_GGML_UNARY_OP_ABS: - case LM_GGML_UNARY_OP_SGN: - case LM_GGML_UNARY_OP_NEG: - case LM_GGML_UNARY_OP_STEP: - case LM_GGML_UNARY_OP_TANH: - case LM_GGML_UNARY_OP_ELU: - case LM_GGML_UNARY_OP_RELU: - { - n_tasks = 1; - } break; - - case LM_GGML_UNARY_OP_GELU: - case LM_GGML_UNARY_OP_GELU_QUICK: - case LM_GGML_UNARY_OP_SILU: - { - n_tasks = n_threads; - } break; - } - } break; - case LM_GGML_OP_SILU_BACK: - case LM_GGML_OP_MUL: - case LM_GGML_OP_NORM: - case LM_GGML_OP_RMS_NORM: - case LM_GGML_OP_RMS_NORM_BACK: - case LM_GGML_OP_GROUP_NORM: - { - n_tasks = n_threads; } break; - case 
LM_GGML_OP_CONCAT: case LM_GGML_OP_MUL_MAT: { - n_tasks = n_threads; - - // TODO: use different scheduling for different matrix sizes - //const int nr0 = lm_ggml_nrows(node->src[0]); - //const int nr1 = lm_ggml_nrows(node->src[1]); - - //n_tasks = MIN(n_threads, MAX(1, nr0/128)); - //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks); - - size_t cur = 0; const enum lm_ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type; -#if defined(LM_GGML_USE_CUBLAS) - if (lm_ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) { - n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning - } else -#elif defined(LM_GGML_USE_CLBLAST) +#if defined(LM_GGML_USE_CLBLAST) if (lm_ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) { - n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning cur = lm_ggml_cl_mul_mat_get_wsize(node->src[0], node->src[1], node); } else #endif #if defined(LM_GGML_USE_ACCELERATE) || defined(LM_GGML_USE_OPENBLAS) if (lm_ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) { - n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning if (node->src[0]->type != LM_GGML_TYPE_F32) { // here we need memory just for single 2D matrix from src0 cur = lm_ggml_type_size(LM_GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]); @@ -16266,108 +16147,37 @@ struct lm_ggml_cplan lm_ggml_graph_plan(struct lm_ggml_cgraph * cgraph, int n_th #endif if (node->src[1]->type != vec_dot_type) { cur = lm_ggml_type_size(vec_dot_type)*lm_ggml_nelements(node->src[1])/lm_ggml_blck_size(vec_dot_type); - } else { - cur = 0; } - - work_size = MAX(work_size, cur); + } break; + case LM_GGML_OP_MUL_MAT_ID: + { + const struct lm_ggml_tensor * a = node->src[2]; + const struct lm_ggml_tensor * b = node->src[1]; + const enum lm_ggml_type vec_dot_type = type_traits[a->type].vec_dot_type; +#if defined(LM_GGML_USE_ACCELERATE) || defined(LM_GGML_USE_OPENBLAS) + if (lm_ggml_compute_forward_mul_mat_use_blas(a, b, node)) { + if (a->type != LM_GGML_TYPE_F32) { + // here we need memory just for single 2D matrix from src0 + cur = lm_ggml_type_size(LM_GGML_TYPE_F32)*(a->ne[0]*a->ne[1]); + } + } else +#endif + if (b->type != vec_dot_type) { + cur = lm_ggml_type_size(vec_dot_type)*lm_ggml_nelements(b)/lm_ggml_blck_size(vec_dot_type); + } } break; case LM_GGML_OP_OUT_PROD: { - n_tasks = n_threads; - - size_t cur = 0; - if (lm_ggml_is_quantized(node->src[0]->type)) { cur = lm_ggml_type_size(LM_GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks; } - - work_size = MAX(work_size, cur); - } break; - case LM_GGML_OP_SCALE: - { - n_tasks = 1; } break; - case LM_GGML_OP_SET: - case LM_GGML_OP_CONT: - case LM_GGML_OP_RESHAPE: - case LM_GGML_OP_VIEW: - case LM_GGML_OP_PERMUTE: - case LM_GGML_OP_TRANSPOSE: - case LM_GGML_OP_GET_ROWS: - case LM_GGML_OP_GET_ROWS_BACK: - case LM_GGML_OP_DIAG: - { - n_tasks = 1; - } break; - case LM_GGML_OP_DIAG_MASK_ZERO: - case LM_GGML_OP_DIAG_MASK_INF: case LM_GGML_OP_SOFT_MAX: - case LM_GGML_OP_SOFT_MAX_BACK: - case LM_GGML_OP_ROPE: - case LM_GGML_OP_ROPE_BACK: - case LM_GGML_OP_ADD_REL_POS: - { - n_tasks = n_threads; - } break; - case LM_GGML_OP_ALIBI: - { - n_tasks = 1; //TODO - } break; - case LM_GGML_OP_CLAMP: - { - n_tasks = 1; //TODO - } break; - case LM_GGML_OP_CONV_1D: - { - n_tasks = n_threads; - - LM_GGML_ASSERT(node->src[0]->ne[3] == 1); - LM_GGML_ASSERT(node->src[1]->ne[2] == 1); - LM_GGML_ASSERT(node->src[1]->ne[3] == 1); - - 
const int64_t ne00 = node->src[0]->ne[0]; - const int64_t ne01 = node->src[0]->ne[1]; - const int64_t ne02 = node->src[0]->ne[2]; - - const int64_t ne10 = node->src[1]->ne[0]; - const int64_t ne11 = node->src[1]->ne[1]; - - const int64_t ne0 = node->ne[0]; - const int64_t ne1 = node->ne[1]; - const int64_t nk = ne00; - const int64_t ew0 = nk * ne01; - - UNUSED(ne02); - UNUSED(ne10); - UNUSED(ne11); - - size_t cur = 0; - - if (node->src[0]->type == LM_GGML_TYPE_F16 && - node->src[1]->type == LM_GGML_TYPE_F32) { - cur = sizeof(lm_ggml_fp16_t)*(ne0*ne1*ew0); - } else if (node->src[0]->type == LM_GGML_TYPE_F32 && - node->src[1]->type == LM_GGML_TYPE_F32) { - cur = sizeof(float)*(ne0*ne1*ew0); - } else { - LM_GGML_ASSERT(false); - } - - work_size = MAX(work_size, cur); - } break; - case LM_GGML_OP_CONV_1D_STAGE_0: - { - n_tasks = n_threads; - } break; - case LM_GGML_OP_CONV_1D_STAGE_1: { - n_tasks = n_threads; + cur = lm_ggml_type_size(LM_GGML_TYPE_F32) * node->ne[0] * n_tasks; } break; case LM_GGML_OP_CONV_TRANSPOSE_1D: { - n_tasks = n_threads; - LM_GGML_ASSERT(node->src[0]->ne[3] == 1); LM_GGML_ASSERT(node->src[1]->ne[2] == 1); LM_GGML_ASSERT(node->src[1]->ne[3] == 1); @@ -16379,7 +16189,6 @@ struct lm_ggml_cplan lm_ggml_graph_plan(struct lm_ggml_cgraph * cgraph, int n_th const int64_t ne10 = node->src[1]->ne[0]; // L const int64_t ne11 = node->src[1]->ne[1]; // Cin - size_t cur = 0; if (node->src[0]->type == LM_GGML_TYPE_F16 && node->src[1]->type == LM_GGML_TYPE_F32) { cur += sizeof(lm_ggml_fp16_t)*ne00*ne01*ne02; @@ -16391,59 +16200,9 @@ struct lm_ggml_cplan lm_ggml_graph_plan(struct lm_ggml_cgraph * cgraph, int n_th } else { LM_GGML_ASSERT(false); } - - work_size = MAX(work_size, cur); - } break; - case LM_GGML_OP_CONV_2D: - { - n_tasks = n_threads; - - const int64_t ne00 = node->src[0]->ne[0]; // W - const int64_t ne01 = node->src[0]->ne[1]; // H - const int64_t ne02 = node->src[0]->ne[2]; // C - const int64_t ne03 = node->src[0]->ne[3]; // N - - const int64_t ne10 = node->src[1]->ne[0]; // W - const int64_t ne11 = node->src[1]->ne[1]; // H - const int64_t ne12 = node->src[1]->ne[2]; // C - - const int64_t ne0 = node->ne[0]; - const int64_t ne1 = node->ne[1]; - const int64_t ne2 = node->ne[2]; - const int64_t ne3 = node->ne[3]; - const int64_t nk = ne00*ne01; - const int64_t ew0 = nk * ne02; - - UNUSED(ne03); - UNUSED(ne2); - - size_t cur = 0; - - if (node->src[0]->type == LM_GGML_TYPE_F16 && - node->src[1]->type == LM_GGML_TYPE_F32) { - // im2col: [N*OH*OW, IC*KH*KW] - cur = sizeof(lm_ggml_fp16_t)*(ne3*ne0*ne1*ew0); - } else if (node->src[0]->type == LM_GGML_TYPE_F32 && - node->src[1]->type == LM_GGML_TYPE_F32) { - cur = sizeof(float)* (ne10*ne11*ne12); - } else { - LM_GGML_ASSERT(false); - } - - work_size = MAX(work_size, cur); - } break; - case LM_GGML_OP_CONV_2D_STAGE_0: - { - n_tasks = n_threads; - } break; - case LM_GGML_OP_CONV_2D_STAGE_1: - { - n_tasks = n_threads; } break; case LM_GGML_OP_CONV_TRANSPOSE_2D: { - n_tasks = n_threads; - const int64_t ne00 = node->src[0]->ne[0]; // W const int64_t ne01 = node->src[0]->ne[1]; // H const int64_t ne02 = node->src[0]->ne[2]; // Channels Out @@ -16453,141 +16212,58 @@ struct lm_ggml_cplan lm_ggml_graph_plan(struct lm_ggml_cgraph * cgraph, int n_th const int64_t ne11 = node->src[1]->ne[1]; // H const int64_t ne12 = node->src[1]->ne[2]; // Channels In - size_t cur = 0; cur += sizeof(lm_ggml_fp16_t)*ne00*ne01*ne02*ne03; cur += sizeof(lm_ggml_fp16_t)*ne10*ne11*ne12; - - work_size = MAX(work_size, cur); - } break; - case LM_GGML_OP_POOL_1D: - 
case LM_GGML_OP_POOL_2D: - { - n_tasks = 1; - } break; - case LM_GGML_OP_UPSCALE: - { - n_tasks = n_threads; } break; case LM_GGML_OP_FLASH_ATTN: { - n_tasks = n_threads; - - size_t cur = 0; - const int64_t ne11 = lm_ggml_up(node->src[1]->ne[1], LM_GGML_SOFT_MAX_UNROLL); if (node->src[1]->type == LM_GGML_TYPE_F32) { cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1) cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 - } - - if (node->src[1]->type == LM_GGML_TYPE_F16) { + } else if (node->src[1]->type == LM_GGML_TYPE_F16) { cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1) cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 } - - work_size = MAX(work_size, cur); } break; case LM_GGML_OP_FLASH_FF: { - n_tasks = n_threads; - - size_t cur = 0; - if (node->src[1]->type == LM_GGML_TYPE_F32) { cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2 - } - - if (node->src[1]->type == LM_GGML_TYPE_F16) { + } else if (node->src[1]->type == LM_GGML_TYPE_F16) { cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2 } - - work_size = MAX(work_size, cur); } break; case LM_GGML_OP_FLASH_ATTN_BACK: { - n_tasks = n_threads; - - size_t cur = 0; - const int64_t D = node->src[0]->ne[0]; const int64_t ne11 = lm_ggml_up(node->src[1]->ne[1], LM_GGML_SOFT_MAX_UNROLL); const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in lm_ggml_compute_forward_flash_attn_back if (node->src[1]->type == LM_GGML_TYPE_F32) { cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 - } - - if (node->src[1]->type == LM_GGML_TYPE_F16) { + } else if (node->src[1]->type == LM_GGML_TYPE_F16) { cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 } - - work_size = MAX(work_size, cur); - } break; - case LM_GGML_OP_WIN_PART: - case LM_GGML_OP_WIN_UNPART: - case LM_GGML_OP_GET_REL_POS: - case LM_GGML_OP_MAP_UNARY: - case LM_GGML_OP_MAP_BINARY: - case LM_GGML_OP_MAP_CUSTOM1_F32: - case LM_GGML_OP_MAP_CUSTOM2_F32: - case LM_GGML_OP_MAP_CUSTOM3_F32: - { - n_tasks = 1; - } break; - case LM_GGML_OP_MAP_CUSTOM1: - { - struct lm_ggml_map_custom1_op_params * p = (struct lm_ggml_map_custom1_op_params *) node->op_params; - if (p->n_tasks == LM_GGML_N_TASKS_MAX) { - n_tasks = n_threads; - } else { - n_tasks = MIN(p->n_tasks, n_threads); - } - } break; - case LM_GGML_OP_MAP_CUSTOM2: - { - struct lm_ggml_map_custom2_op_params * p = (struct lm_ggml_map_custom2_op_params *) node->op_params; - if (p->n_tasks == LM_GGML_N_TASKS_MAX) { - n_tasks = n_threads; - } else { - n_tasks = MIN(p->n_tasks, n_threads); - } - } break; - case LM_GGML_OP_MAP_CUSTOM3: - { - struct lm_ggml_map_custom3_op_params * p = (struct lm_ggml_map_custom3_op_params *) node->op_params; - if (p->n_tasks == LM_GGML_N_TASKS_MAX) { - n_tasks = n_threads; - } else { - n_tasks = MIN(p->n_tasks, n_threads); - } } break; - case LM_GGML_OP_CROSS_ENTROPY_LOSS: - { - n_tasks = n_threads; - size_t cur = lm_ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks); - - work_size = MAX(work_size, cur); - } break; - case LM_GGML_OP_CROSS_ENTROPY_LOSS_BACK: - { - n_tasks = n_threads; - } break; - case LM_GGML_OP_NONE: + case 
LM_GGML_OP_CROSS_ENTROPY_LOSS: { - n_tasks = 1; + cur = lm_ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks); } break; case LM_GGML_OP_COUNT: { LM_GGML_ASSERT(false); } break; + default: + break; } - cplan.n_tasks[i] = n_tasks; + work_size = MAX(work_size, cur); } if (work_size > 0) { @@ -16609,12 +16285,6 @@ int lm_ggml_graph_compute(struct lm_ggml_cgraph * cgraph, struct lm_ggml_cplan * if (cplan->work_size > 0) { LM_GGML_ASSERT(cplan->work_data); } - - for (int i = 0; i < cgraph->n_nodes; ++i) { - if (cgraph->nodes[i]->op != LM_GGML_OP_NONE) { - LM_GGML_ASSERT(cplan->n_tasks[i] > 0); - } - } } const int n_threads = cplan->n_threads; @@ -16687,16 +16357,6 @@ int lm_ggml_graph_compute(struct lm_ggml_cgraph * cgraph, struct lm_ggml_cplan * return compute_status; } -void lm_ggml_graph_reset(struct lm_ggml_cgraph * cgraph) { - for (int i = 0; i < cgraph->n_nodes; i++) { - struct lm_ggml_tensor * grad = cgraph->grads[i]; - - if (grad) { - lm_ggml_set_zero(grad); - } - } -} - void lm_ggml_graph_compute_with_ctx(struct lm_ggml_context * ctx, struct lm_ggml_cgraph * cgraph, int n_threads) { struct lm_ggml_cplan cplan = lm_ggml_graph_plan(cgraph, n_threads); @@ -16823,12 +16483,12 @@ void lm_ggml_graph_export(const struct lm_ggml_cgraph * cgraph, const char * fna const uint32_t magic = LM_GGML_FILE_MAGIC; const uint32_t version = LM_GGML_FILE_VERSION; const uint32_t n_leafs = cgraph->n_leafs; - const uint32_t nodes = cgraph->n_nodes; + const uint32_t n_nodes = cgraph->n_nodes; fwrite(&magic, sizeof(uint32_t), 1, fout); fwrite(&version, sizeof(uint32_t), 1, fout); fwrite(&n_leafs, sizeof(uint32_t), 1, fout); - fwrite(&nodes, sizeof(uint32_t), 1, fout); + fwrite(&n_nodes, sizeof(uint32_t), 1, fout); fwrite(&size_eval, sizeof(uint64_t), 1, fout); } @@ -16916,7 +16576,7 @@ void lm_ggml_graph_export(const struct lm_ggml_cgraph * cgraph, const char * fna if (idx == -1) { for (int k = 0; k < cgraph->n_nodes; ++k) { if (args[j] == cgraph->nodes[k]) { - idx = LM_GGML_MAX_NODES + k; + idx = cgraph->n_leafs + k; break; } } @@ -16943,11 +16603,11 @@ void lm_ggml_graph_export(const struct lm_ggml_cgraph * cgraph, const char * fna } } -struct lm_ggml_cgraph lm_ggml_graph_import(const char * fname, struct lm_ggml_context ** ctx_data, struct lm_ggml_context ** ctx_eval) { +struct lm_ggml_cgraph * lm_ggml_graph_import(const char * fname, struct lm_ggml_context ** ctx_data, struct lm_ggml_context ** ctx_eval) { assert(*ctx_data == NULL); assert(*ctx_eval == NULL); - struct lm_ggml_cgraph result = { 0 }; + struct lm_ggml_cgraph * result = NULL; struct lm_ggml_tensor * data = NULL; @@ -17019,13 +16679,11 @@ struct lm_ggml_cgraph lm_ggml_graph_import(const char * fname, struct lm_ggml_co const uint32_t n_leafs = *(const uint32_t *) ptr; ptr += sizeof(n_leafs); const uint32_t n_nodes = *(const uint32_t *) ptr; ptr += sizeof(n_nodes); const uint64_t size_eval = *(const uint64_t *) ptr; ptr += sizeof(size_eval); - - result.n_leafs = n_leafs; - result.n_nodes = n_nodes; + const int graph_size = MAX(n_leafs, n_nodes); // create the data context { - const size_t overhead = (n_leafs + n_nodes)*lm_ggml_tensor_overhead(); + const size_t overhead = (n_leafs + n_nodes)*lm_ggml_tensor_overhead() + lm_ggml_graph_overhead_custom(graph_size, false); struct lm_ggml_init_params params = { .mem_size = size_eval + overhead, @@ -17041,6 +16699,12 @@ struct lm_ggml_cgraph lm_ggml_graph_import(const char * fname, struct lm_ggml_co } } + result = lm_ggml_new_graph_custom(*ctx_eval, graph_size, false); + + result->n_leafs = 
n_leafs; + result->n_nodes = n_nodes; + + // leafs { uint32_t type; @@ -17079,7 +16743,7 @@ struct lm_ggml_cgraph lm_ggml_graph_import(const char * fname, struct lm_ggml_co tensor->nb[j] = nb[j]; } - result.leafs[i] = tensor; + result->leafs[i] = tensor; ptr += lm_ggml_nbytes(tensor); @@ -17131,10 +16795,10 @@ struct lm_ggml_cgraph lm_ggml_graph_import(const char * fname, struct lm_ggml_co continue; } - if (arg_idx < LM_GGML_MAX_NODES) { - args[j] = result.leafs[arg_idx]; + if (arg_idx < result->n_leafs) { + args[j] = result->leafs[arg_idx]; } else { - args[j] = result.nodes[arg_idx - LM_GGML_MAX_NODES]; + args[j] = result->nodes[arg_idx - result->n_leafs]; } } @@ -17186,7 +16850,7 @@ struct lm_ggml_cgraph lm_ggml_graph_import(const char * fname, struct lm_ggml_co tensor->src[j] = args[j]; } - result.nodes[i] = tensor; + result->nodes[i] = tensor; fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, lm_ggml_nbytes(tensor)); } @@ -18091,10 +17755,11 @@ struct lm_ggml_opt_params lm_ggml_opt_default_params(enum lm_ggml_opt_type type) case LM_GGML_OPT_ADAM: { result = (struct lm_ggml_opt_params) { - .type = LM_GGML_OPT_ADAM, - .n_threads = 1, - .past = 0, - .delta = 1e-5f, + .type = LM_GGML_OPT_ADAM, + .graph_size = LM_GGML_DEFAULT_GRAPH_SIZE, + .n_threads = 1, // FIXME: LM_GGML_DEFAULT_N_THREADS ? + .past = 0, + .delta = 1e-5f, .max_no_improvement = 100, @@ -18121,10 +17786,11 @@ struct lm_ggml_opt_params lm_ggml_opt_default_params(enum lm_ggml_opt_type type) case LM_GGML_OPT_LBFGS: { result = (struct lm_ggml_opt_params) { - .type = LM_GGML_OPT_LBFGS, - .n_threads = 1, - .past = 0, - .delta = 1e-5f, + .type = LM_GGML_OPT_LBFGS, + .graph_size = LM_GGML_DEFAULT_GRAPH_SIZE, + .n_threads = 1, + .past = 0, + .delta = 1e-5f, .max_no_improvement = 0, @@ -18266,14 +17932,11 @@ enum lm_ggml_opt_result lm_ggml_opt_resume( struct lm_ggml_tensor * f) { // build forward + backward compute graphs - struct lm_ggml_tensor * gfbuf = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_I32, sizeof(struct lm_ggml_cgraph) / lm_ggml_type_size(LM_GGML_TYPE_I32)+ (sizeof(struct lm_ggml_cgraph) % lm_ggml_type_size(LM_GGML_TYPE_I32) ? 1 : 0)); - struct lm_ggml_tensor * gbbuf = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_I32, sizeof(struct lm_ggml_cgraph) / lm_ggml_type_size(LM_GGML_TYPE_I32)+ (sizeof(struct lm_ggml_cgraph) % lm_ggml_type_size(LM_GGML_TYPE_I32) ? 
1 : 0)); - - struct lm_ggml_cgraph * gf = (struct lm_ggml_cgraph *) gfbuf->data; - struct lm_ggml_cgraph * gb = (struct lm_ggml_cgraph *) gbbuf->data; + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx, opt->params.graph_size, true); + lm_ggml_build_forward_expand(gf, f); - *gf = lm_ggml_build_forward (f); - *gb = lm_ggml_build_backward(ctx, gf, true); + struct lm_ggml_cgraph * gb = lm_ggml_graph_dup(ctx, gf); + lm_ggml_build_backward_expand(ctx, gf, gb, true); return lm_ggml_opt_resume_g(ctx, opt, f, gf, gb, NULL, NULL); } @@ -18376,8 +18039,8 @@ size_t lm_ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_ memcpy(&qh, &y[i].qh, sizeof(qh)); for (int j = 0; j < QK5_0; j += 2) { - const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; - const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12)); + const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4; + const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12)); // cast to 16 bins const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2; @@ -18406,8 +18069,8 @@ size_t lm_ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_ memcpy(&qh, &y[i].qh, sizeof(qh)); for (int j = 0; j < QK5_1; j += 2) { - const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; - const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12)); + const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4; + const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12)); // cast to 16 bins const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2; @@ -18597,6 +18260,7 @@ struct lm_gguf_kv { struct lm_gguf_header { char magic[4]; + uint32_t version; uint64_t n_tensors; // GGUFv2 uint64_t n_kv; // GGUFv2 @@ -18686,7 +18350,7 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg for (uint32_t i = 0; i < sizeof(magic); i++) { if (magic[i] != LM_GGUF_MAGIC[i]) { - fprintf(stderr, "%s: invalid magic characters %s.\n", __func__, magic); + fprintf(stderr, "%s: invalid magic characters '%c%c%c%c'\n", __func__, magic[0], magic[1], magic[2], magic[3]); fclose(file); return NULL; } @@ -18701,7 +18365,6 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg { strncpy(ctx->header.magic, magic, 4); - ctx->kv = NULL; ctx->infos = NULL; ctx->data = NULL; @@ -18729,7 +18392,7 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg { ctx->kv = malloc(ctx->header.n_kv * sizeof(struct lm_gguf_kv)); - for (uint32_t i = 0; i < ctx->header.n_kv; ++i) { + for (uint64_t i = 0; i < ctx->header.n_kv; ++i) { struct lm_gguf_kv * kv = &ctx->kv[i]; //fprintf(stderr, "%s: reading kv %d\n", __func__, i); @@ -18776,7 +18439,7 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg case LM_GGUF_TYPE_STRING: { kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct lm_gguf_str)); - for (uint32_t j = 0; j < kv->value.arr.n; ++j) { + for (uint64_t j = 0; j < kv->value.arr.n; ++j) { ok = ok && lm_gguf_fread_str(file, &((struct lm_gguf_str *) kv->value.arr.data)[j], &offset); } } break; @@ -18804,7 +18467,7 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg { ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct lm_gguf_tensor_info)); - for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) { struct lm_gguf_tensor_info * info = &ctx->infos[i]; for (int j = 0; j < LM_GGML_MAX_DIMS; ++j) { @@ -18851,7 +18514,7 @@ struct 
lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg // compute the total size of the data section, taking into account the alignment { ctx->size = 0; - for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) { struct lm_gguf_tensor_info * info = &ctx->infos[i]; const int64_t ne = @@ -18920,7 +18583,7 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg lm_ggml_set_no_alloc(ctx_data, true); // create the tensors - for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) { const int64_t ne[LM_GGML_MAX_DIMS] = { ctx->infos[i].ne[0], ctx->infos[i].ne[1], @@ -19055,24 +18718,29 @@ int lm_gguf_find_key(const struct lm_gguf_context * ctx, const char * key) { } const char * lm_gguf_get_key(const struct lm_gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(key_id >= 0 && key_id < lm_gguf_get_n_kv(ctx)); return ctx->kv[key_id].key.data; } enum lm_gguf_type lm_gguf_get_kv_type(const struct lm_gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(key_id >= 0 && key_id < lm_gguf_get_n_kv(ctx)); return ctx->kv[key_id].type; } enum lm_gguf_type lm_gguf_get_arr_type(const struct lm_gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(key_id >= 0 && key_id < lm_gguf_get_n_kv(ctx)); LM_GGML_ASSERT(ctx->kv[key_id].type == LM_GGUF_TYPE_ARRAY); return ctx->kv[key_id].value.arr.type; } const void * lm_gguf_get_arr_data(const struct lm_gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(key_id >= 0 && key_id < lm_gguf_get_n_kv(ctx)); LM_GGML_ASSERT(ctx->kv[key_id].type == LM_GGUF_TYPE_ARRAY); return ctx->kv[key_id].value.arr.data; } const char * lm_gguf_get_arr_str(const struct lm_gguf_context * ctx, int key_id, int i) { + LM_GGML_ASSERT(key_id >= 0 && key_id < lm_gguf_get_n_kv(ctx)); LM_GGML_ASSERT(ctx->kv[key_id].type == LM_GGUF_TYPE_ARRAY); struct lm_gguf_kv * kv = &ctx->kv[key_id]; struct lm_gguf_str * str = &((struct lm_gguf_str *) kv->value.arr.data)[i]; @@ -19080,70 +18748,90 @@ const char * lm_gguf_get_arr_str(const struct lm_gguf_context * ctx, int key_id, } int lm_gguf_get_arr_n(const struct lm_gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(key_id >= 0 && key_id < lm_gguf_get_n_kv(ctx)); LM_GGML_ASSERT(ctx->kv[key_id].type == LM_GGUF_TYPE_ARRAY); return ctx->kv[key_id].value.arr.n; } uint8_t lm_gguf_get_val_u8(const struct lm_gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(key_id >= 0 && key_id < lm_gguf_get_n_kv(ctx)); LM_GGML_ASSERT(ctx->kv[key_id].type == LM_GGUF_TYPE_UINT8); return ctx->kv[key_id].value.uint8; } int8_t lm_gguf_get_val_i8(const struct lm_gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(key_id >= 0 && key_id < lm_gguf_get_n_kv(ctx)); LM_GGML_ASSERT(ctx->kv[key_id].type == LM_GGUF_TYPE_INT8); return ctx->kv[key_id].value.int8; } uint16_t lm_gguf_get_val_u16(const struct lm_gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(key_id >= 0 && key_id < lm_gguf_get_n_kv(ctx)); LM_GGML_ASSERT(ctx->kv[key_id].type == LM_GGUF_TYPE_UINT16); return ctx->kv[key_id].value.uint16; } int16_t lm_gguf_get_val_i16(const struct lm_gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(key_id >= 0 && key_id < lm_gguf_get_n_kv(ctx)); LM_GGML_ASSERT(ctx->kv[key_id].type == LM_GGUF_TYPE_INT16); return ctx->kv[key_id].value.int16; } uint32_t lm_gguf_get_val_u32(const struct lm_gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(key_id >= 0 && key_id < lm_gguf_get_n_kv(ctx)); LM_GGML_ASSERT(ctx->kv[key_id].type == LM_GGUF_TYPE_UINT32); return 
ctx->kv[key_id].value.uint32; } int32_t lm_gguf_get_val_i32(const struct lm_gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(key_id >= 0 && key_id < lm_gguf_get_n_kv(ctx)); LM_GGML_ASSERT(ctx->kv[key_id].type == LM_GGUF_TYPE_INT32); return ctx->kv[key_id].value.int32; } float lm_gguf_get_val_f32(const struct lm_gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(key_id >= 0 && key_id < lm_gguf_get_n_kv(ctx)); LM_GGML_ASSERT(ctx->kv[key_id].type == LM_GGUF_TYPE_FLOAT32); return ctx->kv[key_id].value.float32; } uint64_t lm_gguf_get_val_u64(const struct lm_gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(key_id >= 0 && key_id < lm_gguf_get_n_kv(ctx)); LM_GGML_ASSERT(ctx->kv[key_id].type == LM_GGUF_TYPE_UINT64); return ctx->kv[key_id].value.uint64; } int64_t lm_gguf_get_val_i64(const struct lm_gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(key_id >= 0 && key_id < lm_gguf_get_n_kv(ctx)); LM_GGML_ASSERT(ctx->kv[key_id].type == LM_GGUF_TYPE_INT64); return ctx->kv[key_id].value.int64; } double lm_gguf_get_val_f64(const struct lm_gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(key_id >= 0 && key_id < lm_gguf_get_n_kv(ctx)); LM_GGML_ASSERT(ctx->kv[key_id].type == LM_GGUF_TYPE_FLOAT64); return ctx->kv[key_id].value.float64; } bool lm_gguf_get_val_bool(const struct lm_gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(key_id >= 0 && key_id < lm_gguf_get_n_kv(ctx)); LM_GGML_ASSERT(ctx->kv[key_id].type == LM_GGUF_TYPE_BOOL); return ctx->kv[key_id].value.bool_; } const char * lm_gguf_get_val_str(const struct lm_gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(key_id >= 0 && key_id < lm_gguf_get_n_kv(ctx)); LM_GGML_ASSERT(ctx->kv[key_id].type == LM_GGUF_TYPE_STRING); return ctx->kv[key_id].value.str.data; } +const void * lm_gguf_get_val_data(const struct lm_gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(key_id >= 0 && key_id < lm_gguf_get_n_kv(ctx)); + LM_GGML_ASSERT(ctx->kv[key_id].type != LM_GGUF_TYPE_ARRAY); + LM_GGML_ASSERT(ctx->kv[key_id].type != LM_GGUF_TYPE_STRING); + return &ctx->kv[key_id].value; +} + int lm_gguf_get_n_tensors(const struct lm_gguf_context * ctx) { return ctx->header.n_tensors; } diff --git a/cpp/ggml.h b/cpp/ggml.h index 3cbd5d8..2000b8f 100644 --- a/cpp/ggml.h +++ b/cpp/ggml.h @@ -58,7 +58,8 @@ // { // ... 
// -// struct lm_ggml_cgraph gf = lm_ggml_build_forward(f); +// struct lm_ggml_cgraph * gf = lm_ggml_new_graph(ctx); +// lm_ggml_build_forward_expand(gf, f); // // // set the input variable and parameter values // lm_ggml_set_f32(x, 2.0f); @@ -213,15 +214,14 @@ #define LM_GGML_QNT_VERSION 2 // bump this on quantization format changes #define LM_GGML_QNT_VERSION_FACTOR 1000 // do not change this -#define LM_GGML_MAX_DIMS 4 -#define LM_GGML_MAX_NODES 16384 -#define LM_GGML_MAX_PARAMS 1024 -#define LM_GGML_MAX_CONTEXTS 64 -#define LM_GGML_MAX_SRC 6 -#define LM_GGML_MAX_NAME 64 -#define LM_GGML_MAX_OP_PARAMS 64 -#define LM_GGML_DEFAULT_N_THREADS 4 - +#define LM_GGML_MAX_DIMS 4 +#define LM_GGML_MAX_PARAMS 1024 +#define LM_GGML_MAX_CONTEXTS 64 +#define LM_GGML_MAX_SRC 6 +#define LM_GGML_MAX_NAME 64 +#define LM_GGML_MAX_OP_PARAMS 64 +#define LM_GGML_DEFAULT_N_THREADS 4 +#define LM_GGML_DEFAULT_GRAPH_SIZE 2048 #if UINTPTR_MAX == 0xFFFFFFFF #define LM_GGML_MEM_ALIGN 4 #else @@ -244,7 +244,9 @@ #define LM_GGML_ASSERT(x) \ do { \ if (!(x)) { \ + fflush(stdout); \ fprintf(stderr, "LM_GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \ + lm_ggml_print_backtrace(); \ abort(); \ } \ } while (0) @@ -281,6 +283,20 @@ const type prefix##3 = (pointer)->array[3]; \ LM_GGML_UNUSED(prefix##3); +#define LM_GGML_TENSOR_UNARY_OP_LOCALS \ + LM_GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ + LM_GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ + LM_GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \ + LM_GGML_TENSOR_LOCALS(size_t, nb, dst, nb) + +#define LM_GGML_TENSOR_BINARY_OP_LOCALS \ + LM_GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ + LM_GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ + LM_GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \ + LM_GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \ + LM_GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \ + LM_GGML_TENSOR_LOCALS(size_t, nb, dst, nb) + #ifdef __cplusplus extern "C" { #endif @@ -379,6 +395,7 @@ extern "C" { LM_GGML_OP_GROUP_NORM, LM_GGML_OP_MUL_MAT, + LM_GGML_OP_MUL_MAT_ID, LM_GGML_OP_OUT_PROD, LM_GGML_OP_SCALE, @@ -400,18 +417,13 @@ extern "C" { LM_GGML_OP_ROPE_BACK, LM_GGML_OP_ALIBI, LM_GGML_OP_CLAMP, - LM_GGML_OP_CONV_1D, - LM_GGML_OP_CONV_1D_STAGE_0, // internal - LM_GGML_OP_CONV_1D_STAGE_1, // internal LM_GGML_OP_CONV_TRANSPOSE_1D, - LM_GGML_OP_CONV_2D, - LM_GGML_OP_CONV_2D_STAGE_0, // internal - LM_GGML_OP_CONV_2D_STAGE_1, // internal + LM_GGML_OP_IM2COL, LM_GGML_OP_CONV_TRANSPOSE_2D, LM_GGML_OP_POOL_1D, LM_GGML_OP_POOL_2D, - LM_GGML_OP_UPSCALE, // nearest interpolate + LM_GGML_OP_ARGSORT, LM_GGML_OP_FLASH_ATTN, LM_GGML_OP_FLASH_FF, @@ -451,6 +463,9 @@ extern "C" { LM_GGML_UNARY_OP_GELU, LM_GGML_UNARY_OP_GELU_QUICK, LM_GGML_UNARY_OP_SILU, + LM_GGML_UNARY_OP_LEAKY, + + LM_GGML_UNARY_OP_COUNT, }; enum lm_ggml_object_type { @@ -531,37 +546,33 @@ extern "C" { int n_threads; - // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes - int n_tasks[LM_GGML_MAX_NODES]; - // abort lm_ggml_graph_compute when true bool (*abort_callback)(void * data); void * abort_callback_data; }; - // next prime after LM_GGML_MAX_NODES - // #define LM_GGML_GRAPH_HASHTABLE_SIZE 4099 - // next prime after LM_GGML_MAX_NODES * 2 (nodes + leafs) - // #define LM_GGML_GRAPH_HASHTABLE_SIZE 8273 - // #define LM_GGML_GRAPH_HASHTABLE_SIZE 16411 - #define LM_GGML_GRAPH_HASHTABLE_SIZE 32771 - enum lm_ggml_cgraph_eval_order { LM_GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0, LM_GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT, LM_GGML_CGRAPH_EVAL_ORDER_COUNT }; + struct lm_ggml_hash_set { + size_t size; + struct lm_ggml_tensor ** keys; + }; + 
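Note: the usage example at the top of ggml.h (rewritten in the hunk above) reflects the removal of the value-returning graph builders. A hedged before/after sketch of that migration; the wrapper function and the thread count are illustrative, and f is assumed to be a tensor built in ctx as in the header's own example:

#include "ggml.h"

static void example_forward(struct lm_ggml_context * ctx, struct lm_ggml_tensor * f) {
    // old:  struct lm_ggml_cgraph gf = lm_ggml_build_forward(f);
    // new: allocate the graph in the context, then expand it in place
    struct lm_ggml_cgraph * gf = lm_ggml_new_graph(ctx); // LM_GGML_DEFAULT_GRAPH_SIZE nodes, no grads
    lm_ggml_build_forward_expand(gf, f);

    // the work buffer is allocated as part of the context by this helper
    lm_ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 4);
}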
// computation graph struct lm_ggml_cgraph { + int size; int n_nodes; int n_leafs; - struct lm_ggml_tensor * nodes[LM_GGML_MAX_NODES]; - struct lm_ggml_tensor * grads[LM_GGML_MAX_NODES]; - struct lm_ggml_tensor * leafs[LM_GGML_MAX_NODES]; + struct lm_ggml_tensor ** nodes; + struct lm_ggml_tensor ** grads; + struct lm_ggml_tensor ** leafs; - void * visited_hash_table[LM_GGML_GRAPH_HASHTABLE_SIZE]; + struct lm_ggml_hash_set visited_hash_table; enum lm_ggml_cgraph_eval_order order; @@ -571,8 +582,6 @@ extern "C" { int64_t perf_time_us; }; - static const size_t LM_GGML_GRAPH_SIZE = sizeof(struct lm_ggml_cgraph); - // scratch buffer struct lm_ggml_scratch { size_t offs; @@ -617,6 +626,8 @@ extern "C" { LM_GGML_API int64_t lm_ggml_cycles(void); LM_GGML_API int64_t lm_ggml_cycles_per_ms(void); + LM_GGML_API void lm_ggml_print_backtrace(void); + LM_GGML_API void lm_ggml_numa_init(void); // call once for better performance on NUMA systems LM_GGML_API bool lm_ggml_is_numa(void); // true if init detected that system has >1 NUMA node @@ -637,6 +648,9 @@ extern "C" { LM_GGML_API const char * lm_ggml_op_name (enum lm_ggml_op op); LM_GGML_API const char * lm_ggml_op_symbol(enum lm_ggml_op op); + LM_GGML_API const char * lm_ggml_unary_op_name(enum lm_ggml_unary_op op); + LM_GGML_API const char * lm_ggml_op_desc(const struct lm_ggml_tensor * t); // unary or op name + LM_GGML_API size_t lm_ggml_element_size(const struct lm_ggml_tensor * tensor); LM_GGML_API bool lm_ggml_is_quantized(enum lm_ggml_type type); @@ -709,7 +723,7 @@ extern "C" { // Context tensor enumeration and lookup LM_GGML_API struct lm_ggml_tensor * lm_ggml_get_first_tensor(struct lm_ggml_context * ctx); LM_GGML_API struct lm_ggml_tensor * lm_ggml_get_next_tensor (struct lm_ggml_context * ctx, struct lm_ggml_tensor * tensor); - LM_GGML_API struct lm_ggml_tensor * lm_ggml_get_tensor (struct lm_ggml_context * ctx, const char * name); + LM_GGML_API struct lm_ggml_tensor * lm_ggml_get_tensor(struct lm_ggml_context * ctx, const char * name); LM_GGML_API struct lm_ggml_tensor * lm_ggml_set_zero(struct lm_ggml_tensor * tensor); LM_GGML_API struct lm_ggml_tensor * lm_ggml_set_i32 (struct lm_ggml_tensor * tensor, int32_t value); @@ -943,6 +957,10 @@ extern "C" { struct lm_ggml_context * ctx, struct lm_ggml_tensor * a); + LM_GGML_API struct lm_ggml_tensor * lm_ggml_leaky( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a); + LM_GGML_API struct lm_ggml_tensor * lm_ggml_relu_inplace( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a); @@ -1029,6 +1047,15 @@ extern "C" { struct lm_ggml_tensor * a, struct lm_ggml_tensor * b); + // indirect matrix multiplication + // lm_ggml_mul_mat_id(ctx, as, ids, id, b) ~= lm_ggml_mul_mat(as[ids[id]], b) + LM_GGML_API struct lm_ggml_tensor * lm_ggml_mul_mat_id( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * as[], + struct lm_ggml_tensor * ids, + int id, + struct lm_ggml_tensor * b); + // A: m columns, n rows, // B: p columns, n rows, // result is m columns, p rows @@ -1284,6 +1311,14 @@ extern "C" { struct lm_ggml_context * ctx, struct lm_ggml_tensor * a); + // fused soft_max(a*scale + mask) + // mask is optional + LM_GGML_API struct lm_ggml_tensor * lm_ggml_soft_max_ext( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a, + struct lm_ggml_tensor * mask, + float scale); + LM_GGML_API struct lm_ggml_tensor * lm_ggml_soft_max_back( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, @@ -1399,6 +1434,18 @@ extern "C" { float min, float max); + LM_GGML_API struct lm_ggml_tensor * 
lm_ggml_im2col( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a, + struct lm_ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1, + bool is_2D); + LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_1d( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, @@ -1482,6 +1529,8 @@ extern "C" { int s0, // stride int p0); // padding + // the result will have 2*p0 padding for the first dimension + // and 2*p1 padding for the second dimension LM_GGML_API struct lm_ggml_tensor * lm_ggml_pool_2d( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, @@ -1490,8 +1539,8 @@ extern "C" { int k1, int s0, int s1, - int p0, - int p1); + float p0, + float p1); // nearest interpolate // used in stable-diffusion @@ -1500,6 +1549,23 @@ extern "C" { struct lm_ggml_tensor * a, int scale_factor); + // sort rows + enum lm_ggml_sort_order { + LM_GGML_SORT_ASC, + LM_GGML_SORT_DESC, + }; + + LM_GGML_API struct lm_ggml_tensor * lm_ggml_argsort( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a, + enum lm_ggml_sort_order order); + + // top k elements per row + LM_GGML_API struct lm_ggml_tensor * lm_ggml_top_k( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a, + int k); + LM_GGML_API struct lm_ggml_tensor * lm_ggml_flash_attn( struct lm_ggml_context * ctx, struct lm_ggml_tensor * q, @@ -1561,7 +1627,6 @@ extern "C" { int kh); // used in sam - LM_GGML_API struct lm_ggml_tensor * lm_ggml_add_rel_pos( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, @@ -1732,19 +1797,22 @@ extern "C" { LM_GGML_API void lm_ggml_build_forward_expand (struct lm_ggml_cgraph * cgraph, struct lm_ggml_tensor * tensor); LM_GGML_API void lm_ggml_build_backward_expand(struct lm_ggml_context * ctx, struct lm_ggml_cgraph * gf, struct lm_ggml_cgraph * gb, bool keep); - LM_GGML_API struct lm_ggml_cgraph lm_ggml_build_forward (struct lm_ggml_tensor * tensor); - LM_GGML_API struct lm_ggml_cgraph lm_ggml_build_backward(struct lm_ggml_context * ctx, struct lm_ggml_cgraph * gf, bool keep); - // graph allocation in a context - LM_GGML_API struct lm_ggml_cgraph * lm_ggml_new_graph (struct lm_ggml_context * ctx); - LM_GGML_API struct lm_ggml_cgraph * lm_ggml_build_forward_ctx(struct lm_ggml_context * ctx, struct lm_ggml_tensor * tensor); + LM_GGML_API struct lm_ggml_cgraph * lm_ggml_new_graph (struct lm_ggml_context * ctx); // size = LM_GGML_DEFAULT_GRAPH_SIZE, grads = false + LM_GGML_API struct lm_ggml_cgraph * lm_ggml_new_graph_custom (struct lm_ggml_context * ctx, size_t size, bool grads); + LM_GGML_API struct lm_ggml_cgraph * lm_ggml_graph_dup (struct lm_ggml_context * ctx, struct lm_ggml_cgraph * cgraph); + LM_GGML_API struct lm_ggml_cgraph lm_ggml_graph_view (struct lm_ggml_cgraph * cgraph, int i0, int i1); + LM_GGML_API void lm_ggml_graph_cpy (struct lm_ggml_cgraph * src, struct lm_ggml_cgraph * dst); + LM_GGML_API void lm_ggml_graph_reset (struct lm_ggml_cgraph * cgraph); // zero grads + LM_GGML_API void lm_ggml_graph_clear (struct lm_ggml_cgraph * cgraph); + LM_GGML_API size_t lm_ggml_graph_overhead(void); + LM_GGML_API size_t lm_ggml_graph_overhead_custom(size_t size, bool grads); // lm_ggml_graph_plan() has to be called before lm_ggml_graph_compute() // when plan.work_size > 0, caller must allocate memory for plan.work_data LM_GGML_API struct lm_ggml_cplan lm_ggml_graph_plan (struct lm_ggml_cgraph * cgraph, int n_threads /*= LM_GGML_DEFAULT_N_THREADS*/); - LM_GGML_API int lm_ggml_graph_compute(struct lm_ggml_cgraph * cgraph, struct lm_ggml_cplan * cplan); - LM_GGML_API void 
lm_ggml_graph_reset (struct lm_ggml_cgraph * cgraph); + LM_GGML_API int lm_ggml_graph_compute(struct lm_ggml_cgraph * cgraph, struct lm_ggml_cplan * cplan); // same as lm_ggml_graph_compute() but the work data is allocated as a part of the context // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data @@ -1752,8 +1820,8 @@ extern "C" { LM_GGML_API struct lm_ggml_tensor * lm_ggml_graph_get_tensor(struct lm_ggml_cgraph * cgraph, const char * name); - LM_GGML_API void lm_ggml_graph_export(const struct lm_ggml_cgraph * cgraph, const char * fname); - LM_GGML_API struct lm_ggml_cgraph lm_ggml_graph_import(const char * fname, struct lm_ggml_context ** ctx_data, struct lm_ggml_context ** ctx_eval); + LM_GGML_API void lm_ggml_graph_export(const struct lm_ggml_cgraph * cgraph, const char * fname); + LM_GGML_API struct lm_ggml_cgraph * lm_ggml_graph_import(const char * fname, struct lm_ggml_context ** ctx_data, struct lm_ggml_context ** ctx_eval); // print info and performance information for the graph LM_GGML_API void lm_ggml_graph_print(const struct lm_ggml_cgraph * cgraph); @@ -1816,6 +1884,8 @@ extern "C" { struct lm_ggml_opt_params { enum lm_ggml_opt_type type; + size_t graph_size; + int n_threads; // delta-based convergence test @@ -2027,6 +2097,7 @@ extern "C" { LM_GGML_API double lm_gguf_get_val_f64 (const struct lm_gguf_context * ctx, int key_id); LM_GGML_API bool lm_gguf_get_val_bool(const struct lm_gguf_context * ctx, int key_id); LM_GGML_API const char * lm_gguf_get_val_str (const struct lm_gguf_context * ctx, int key_id); + LM_GGML_API const void * lm_gguf_get_val_data(const struct lm_gguf_context * ctx, int key_id); LM_GGML_API int lm_gguf_get_arr_n (const struct lm_gguf_context * ctx, int key_id); LM_GGML_API const void * lm_gguf_get_arr_data(const struct lm_gguf_context * ctx, int key_id); LM_GGML_API const char * lm_gguf_get_arr_str (const struct lm_gguf_context * ctx, int key_id, int i); diff --git a/cpp/grammar-parser.cpp b/cpp/grammar-parser.cpp index ff51cc8..bf89a96 100644 --- a/cpp/grammar-parser.cpp +++ b/cpp/grammar-parser.cpp @@ -190,7 +190,7 @@ namespace grammar_parser { pos = parse_space(pos + 1, is_nested); } else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator if (last_sym_start == out_elements.size()) { - throw std::runtime_error(std::string("expecting preceeding item to */+/? at ") + pos); + throw std::runtime_error(std::string("expecting preceding item to */+/? at ") + pos); } // apply transformation to previous symbol (last_sym_start to end) according to diff --git a/cpp/llama.cpp b/cpp/llama.cpp index b62b25a..f1de553 100644 --- a/cpp/llama.cpp +++ b/cpp/llama.cpp @@ -46,7 +46,6 @@ #endif #include #include - #include // for _fseeki64 #endif #include @@ -75,6 +74,7 @@ #include #include #include +#include #include #if defined(_MSC_VER) @@ -91,6 +91,8 @@ #define LLAMA_ATTRIBUTE_FORMAT(...) 
#endif +#define LLAMA_MAX_NODES 8192 + // // logging // @@ -201,6 +203,8 @@ enum llm_arch { LLM_ARCH_PERSIMMON, LLM_ARCH_REFACT, LLM_ARCH_BLOOM, + LLM_ARCH_STABLELM, + LLM_ARCH_QWEN, LLM_ARCH_UNKNOWN, }; @@ -216,6 +220,8 @@ static std::map LLM_ARCH_NAMES = { { LLM_ARCH_PERSIMMON, "persimmon" }, { LLM_ARCH_REFACT, "refact" }, { LLM_ARCH_BLOOM, "bloom" }, + { LLM_ARCH_STABLELM, "stablelm" }, + { LLM_ARCH_QWEN, "qwen" }, }; enum llm_kv { @@ -262,6 +268,8 @@ enum llm_kv { LLM_KV_TOKENIZER_UNK_ID, LLM_KV_TOKENIZER_SEP_ID, LLM_KV_TOKENIZER_PAD_ID, + LLM_KV_TOKENIZER_ADD_BOS, + LLM_KV_TOKENIZER_ADD_EOS, LLM_KV_TOKENIZER_HF_JSON, LLM_KV_TOKENIZER_RWKV, }; @@ -310,6 +318,8 @@ static std::map LLM_KV_NAMES = { { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" }, { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" }, { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" }, + { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" }, + { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" }, { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" }, { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" }, }; @@ -504,6 +514,41 @@ static std::map> LLM_TENSOR_NAMES = { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, }, }, + { + LLM_ARCH_STABLELM, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, + { + LLM_ARCH_QWEN, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, + { LLM_ARCH_UNKNOWN, { @@ -557,21 +602,6 @@ struct LLM_TN { // gguf helpers // -#define LM_GGUF_GET_KEY(ctx, dst, func, type, req, key) \ -do { \ - const std::string skey(key); \ - const int kid = lm_gguf_find_key(ctx, skey.c_str()); \ - if (kid >= 0) { \ - enum lm_gguf_type ktype = lm_gguf_get_kv_type(ctx, kid); \ - if (ktype != (type)) { \ - throw std::runtime_error(format("key %s has wrong type: %s", skey.c_str(), lm_gguf_type_name(ktype))); \ - } \ - (dst) = func(ctx, kid); \ - } else if (req) { \ - throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \ - } \ -} while (0) - static std::map LLAMA_ROPE_SCALING_TYPES = { { LLAMA_ROPE_SCALING_NONE, "none" }, { LLAMA_ROPE_SCALING_LINEAR, "linear" }, @@ -588,6 +618,60 @@ static int8_t llama_rope_scaling_type_from_string(const std::string & name) { return LLAMA_ROPE_SCALING_UNSPECIFIED; } +static std::string lm_gguf_data_to_str(enum lm_gguf_type type, const void * data, int i) { + switch (type) { + case LM_GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]); + case LM_GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]); + case LM_GGUF_TYPE_UINT16: 
return std::to_string(((const uint16_t *)data)[i]); + case LM_GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]); + case LM_GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]); + case LM_GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]); + case LM_GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]); + case LM_GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]); + case LM_GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]); + case LM_GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]); + case LM_GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false"; + default: return format("unknown type %d", type); + } +} + +static std::string lm_gguf_kv_to_str(const struct lm_gguf_context * ctx_gguf, int i) { + const enum lm_gguf_type type = lm_gguf_get_kv_type(ctx_gguf, i); + + switch (type) { + case LM_GGUF_TYPE_STRING: + return lm_gguf_get_val_str(ctx_gguf, i); + case LM_GGUF_TYPE_ARRAY: + { + const enum lm_gguf_type arr_type = lm_gguf_get_arr_type(ctx_gguf, i); + int arr_n = lm_gguf_get_arr_n(ctx_gguf, i); + const void * data = lm_gguf_get_arr_data(ctx_gguf, i); + std::stringstream ss; + ss << "["; + for (int j = 0; j < arr_n; j++) { + if (arr_type == LM_GGUF_TYPE_STRING) { + std::string val = lm_gguf_get_arr_str(ctx_gguf, i, j); + // escape quotes + replace_all(val, "\\", "\\\\"); + replace_all(val, "\"", "\\\""); + ss << '"' << val << '"'; + } else if (arr_type == LM_GGUF_TYPE_ARRAY) { + ss << "???"; + } else { + ss << lm_gguf_data_to_str(arr_type, data, j); + } + if (j < arr_n - 1) { + ss << ", "; + } + } + ss << "]"; + return ss.str(); + } + default: + return lm_gguf_data_to_str(type, lm_gguf_get_val_data(ctx_gguf, i), 0); + } +} + // // ggml helpers // @@ -1048,6 +1132,12 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_ // struct llama_state { + llama_state() { +#ifdef LM_GGML_USE_METAL + lm_ggml_metal_log_set_callback(log_callback, log_callback_user_data); +#endif + } + // We save the log callback globally lm_ggml_log_callback log_callback = llama_log_callback_default; void * log_callback_user_data = nullptr; @@ -1071,9 +1161,9 @@ enum e_model { MODEL_70B, }; -static const size_t kB = 1024; -static const size_t MB = 1024*kB; -static const size_t GB = 1024*MB; +static const size_t kiB = 1024; +static const size_t MiB = 1024*kiB; +static const size_t GiB = 1024*MiB; struct llama_hparams { bool vocab_only; @@ -1152,6 +1242,7 @@ struct llama_cparams { float yarn_beta_slow; bool mul_mat_q; + bool offload_kqv; }; struct llama_layer { @@ -1173,6 +1264,9 @@ struct llama_layer { struct lm_ggml_tensor * wqkv; // attention bias + struct lm_ggml_tensor * bq; + struct lm_ggml_tensor * bk; + struct lm_ggml_tensor * bv; struct lm_ggml_tensor * bo; struct lm_ggml_tensor * bqkv; @@ -1210,14 +1304,15 @@ struct llama_kv_cache { // cannot be freely changed after a slot has been allocated. uint32_t head = 0; uint32_t size = 0; + uint32_t used = 0; // used cells (i.e. 
at least one seq_id) // computed before each graph build uint32_t n = 0; std::vector cells; - struct lm_ggml_tensor * k = NULL; - struct lm_ggml_tensor * v = NULL; + std::vector k_l; // per layer + std::vector v_l; struct lm_ggml_context * ctx = NULL; @@ -1230,8 +1325,10 @@ struct llama_kv_cache { #ifdef LM_GGML_USE_CUBLAS if (lm_ggml_cublas_loaded()) { - lm_ggml_cuda_free_data(k); - lm_ggml_cuda_free_data(v); + for (size_t i = 0; i < k_l.size(); ++i) { + lm_ggml_cuda_free_data(k_l[i]); + lm_ggml_cuda_free_data(v_l[i]); + } } #endif } @@ -1264,6 +1361,9 @@ struct llama_vocab { id special_sep_id = -1; id special_pad_id = -1; + int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add. + int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add. + id linefeed_id = 13; id special_prefix_id = 32007; id special_middle_id = 32009; @@ -1308,6 +1408,9 @@ struct llama_model { int n_gpu_layers; + // gguf metadata + std::unordered_map lm_gguf_kv; + // context struct lm_ggml_context * ctx = NULL; @@ -1415,9 +1518,11 @@ struct llama_context { static bool llama_kv_cache_init( const struct llama_hparams & hparams, struct llama_kv_cache & cache, - lm_ggml_type wtype, + lm_ggml_type ktype, + lm_ggml_type vtype, uint32_t n_ctx, - int n_gpu_layers) { + int n_gpu_layers, + bool offload) { const uint32_t n_embd = hparams.n_embd_gqa(); const uint32_t n_layer = hparams.n_layer; @@ -1428,11 +1533,12 @@ static bool llama_kv_cache_init( cache.head = 0; cache.size = n_ctx; + cache.used = 0; cache.cells.clear(); cache.cells.resize(n_ctx); - cache.buf.resize(2u*n_elements*lm_ggml_type_size(wtype) + 2u*lm_ggml_tensor_overhead()); + cache.buf.resize(n_elements*(lm_ggml_type_sizef(ktype) + lm_ggml_type_sizef(vtype)) + 2u*n_layer*lm_ggml_tensor_overhead()); memset(cache.buf.data, 0, cache.buf.size); struct lm_ggml_init_params params; @@ -1442,37 +1548,44 @@ static bool llama_kv_cache_init( cache.ctx = lm_ggml_init(params); + size_t vram_kv_cache = 0; + if (!cache.ctx) { LLAMA_LOG_ERROR("%s: failed to allocate memory for kv cache\n", __func__); return false; } - cache.k = lm_ggml_new_tensor_1d(cache.ctx, wtype, n_elements); - cache.v = lm_ggml_new_tensor_1d(cache.ctx, wtype, n_elements); - lm_ggml_set_name(cache.k, "cache_k"); - lm_ggml_set_name(cache.v, "cache_v"); + cache.k_l.reserve(n_layer); + cache.v_l.reserve(n_layer); - (void) n_gpu_layers; + const int i_gpu_start = (int) n_layer - n_gpu_layers; LM_GGML_UNUSED(i_gpu_start); -#ifdef LM_GGML_USE_CUBLAS - if (lm_ggml_cublas_loaded()) { - size_t vram_kv_cache = 0; + LM_GGML_UNUSED(offload); - if (n_gpu_layers > (int)n_layer + 1) { - lm_ggml_cuda_assign_buffers_no_scratch(cache.v); - LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__); - vram_kv_cache += lm_ggml_nbytes(cache.v); - } - if (n_gpu_layers > (int)n_layer + 2) { - lm_ggml_cuda_assign_buffers_no_scratch(cache.k); - LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__); - vram_kv_cache += lm_ggml_nbytes(cache.k); - } - if (vram_kv_cache > 0) { - LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0); + for (int i = 0; i < (int) n_layer; i++) { + lm_ggml_tensor * k = lm_ggml_new_tensor_1d(cache.ctx, ktype, n_embd*n_ctx); + lm_ggml_tensor * v = lm_ggml_new_tensor_1d(cache.ctx, vtype, n_embd*n_ctx); + lm_ggml_format_name(k, "cache_k_l%d", i); + lm_ggml_format_name(v, "cache_v_l%d", i); + cache.k_l.push_back(k); + cache.v_l.push_back(v); +#ifdef LM_GGML_USE_CUBLAS + if (i >= i_gpu_start) { + if (offload) { + lm_ggml_cuda_assign_buffers_no_scratch(k); + 
vram_kv_cache += lm_ggml_nbytes(k); + lm_ggml_cuda_assign_buffers_no_scratch(v); + vram_kv_cache += lm_ggml_nbytes(v); + } } +#endif // LM_GGML_USE_CUBLAS } -#endif + + if (vram_kv_cache > 0) { + LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0); + } + + LM_GGML_UNUSED(n_gpu_layers); return true; } @@ -1529,6 +1642,8 @@ static bool llama_kv_cache_find_slot( } } + cache.used += n_tokens; + return true; } @@ -1549,6 +1664,7 @@ static void llama_kv_cache_clear(struct llama_kv_cache & cache) { cache.cells[i].seq_id.clear(); } cache.head = 0; + cache.used = 0; } static void llama_kv_cache_seq_rm( @@ -1571,6 +1687,9 @@ static void llama_kv_cache_seq_rm( continue; } if (cache.cells[i].seq_id.empty()) { + // keep count of the number of used cells + if (cache.cells[i].pos >= 0) cache.used--; + cache.cells[i].pos = -1; if (new_head == cache.size) new_head = i; } @@ -1578,7 +1697,7 @@ static void llama_kv_cache_seq_rm( } // If we freed up a slot, set head to it so searching can start there. - if (new_head != cache.size) cache.head = new_head; + if (new_head != cache.size && new_head < cache.head) cache.head = new_head; } static void llama_kv_cache_seq_cp( @@ -1604,6 +1723,7 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id for (uint32_t i = 0; i < cache.size; ++i) { if (!cache.cells[i].has_seq_id(seq_id)) { + if (cache.cells[i].pos >= 0) cache.used--; cache.cells[i].pos = -1; cache.cells[i].seq_id.clear(); if (new_head == cache.size) new_head = i; @@ -1614,7 +1734,7 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id } // If we freed up a slot, set head to it so searching can start there. - if (new_head != cache.size) cache.head = new_head; + if (new_head != cache.size && new_head < cache.head) cache.head = new_head; } static void llama_kv_cache_seq_shift( @@ -1635,6 +1755,7 @@ static void llama_kv_cache_seq_shift( cache.cells[i].delta += delta; if (cache.cells[i].pos < 0) { + if (!cache.cells[i].seq_id.empty()) cache.used--; cache.cells[i].pos = -1; cache.cells[i].seq_id.clear(); if (new_head == cache.size) new_head = i; @@ -1685,6 +1806,169 @@ static std::string llama_format_tensor_shape(const struct lm_ggml_tensor * t) { return buf; } +namespace GGUFMeta { + template + struct GKV_Base_Type { + static constexpr lm_gguf_type gt = gt_; + + static T getter(const lm_gguf_context * ctx, const int kid) { + return gfun(ctx, kid); + } + }; + + template struct GKV_Base; + + template<> struct GKV_Base: GKV_Base_Type {}; + template<> struct GKV_Base: GKV_Base_Type {}; + template<> struct GKV_Base: GKV_Base_Type {}; + template<> struct GKV_Base: GKV_Base_Type {}; + template<> struct GKV_Base: GKV_Base_Type {}; + template<> struct GKV_Base: GKV_Base_Type {}; + template<> struct GKV_Base: GKV_Base_Type {}; + template<> struct GKV_Base: GKV_Base_Type {}; + template<> struct GKV_Base: GKV_Base_Type {}; + template<> struct GKV_Base: GKV_Base_Type {}; + template<> struct GKV_Base: GKV_Base_Type {}; + template<> struct GKV_Base: GKV_Base_Type {}; + + template<> struct GKV_Base { + static constexpr lm_gguf_type gt = LM_GGUF_TYPE_STRING; + + static std::string getter(const lm_gguf_context * ctx, const int kid) { + return lm_gguf_get_val_str(ctx, kid); + } + }; + + struct ArrayInfo{ + const lm_gguf_type gt; + const size_t length; + const void * data; + }; + + template<> struct GKV_Base { + public: + static constexpr lm_gguf_type gt = LM_GGUF_TYPE_ARRAY; + static ArrayInfo getter(const lm_gguf_context *ctx, const int 
k) { + return ArrayInfo { + lm_gguf_get_arr_type(ctx, k), + size_t(lm_gguf_get_arr_n(ctx, k)), + lm_gguf_get_arr_data(ctx, k), + }; + } + }; + + template + class GKV: public GKV_Base { + GKV() = delete; + + public: + static T get_kv(const lm_gguf_context * ctx, const int k) { + const enum lm_gguf_type kt = lm_gguf_get_kv_type(ctx, k); + + if (kt != GKV::gt) { + throw std::runtime_error(format("key %s has wrong type %s but expected type %s", + lm_gguf_get_key(ctx, k), lm_gguf_type_name(kt), lm_gguf_type_name(GKV::gt))); + } + return GKV::getter(ctx, k); + } + + static const char * override_type_to_str(const llama_model_kv_override_type ty) { + switch (ty) { + case LLAMA_KV_OVERRIDE_BOOL: return "bool"; + case LLAMA_KV_OVERRIDE_INT: return "int"; + case LLAMA_KV_OVERRIDE_FLOAT: return "float"; + } + return "unknown"; + } + + static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override *override) { + if (!override) { return false; } + if (override->tag == expected_type) { + LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ", + __func__, override_type_to_str(override->tag), override->key); + switch (override->tag) { + case LLAMA_KV_OVERRIDE_BOOL: { + printf("%s\n", override->bool_value ? "true" : "false"); + } break; + case LLAMA_KV_OVERRIDE_INT: { + printf("%" PRId64 "\n", override->int_value); + } break; + case LLAMA_KV_OVERRIDE_FLOAT: { + printf("%.6f\n", override->float_value); + } break; + default: + // Shouldn't be possible to end up here, but just in case... + throw std::runtime_error( + format("Unsupported attempt to override %s type for metadata key %s\n", + override_type_to_str(override->tag), override->key)); + } + return true; + } + LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n", + __func__, override->key, override_type_to_str(expected_type), override_type_to_str(override->tag)); + return false; + } + + template + static typename std::enable_if::value, bool>::type + try_override(OT & target, const struct llama_model_kv_override *override) { + if (validate_override(LLAMA_KV_OVERRIDE_BOOL, override)) { + target = override->bool_value; + return true; + } + return true; + } + + template + static typename std::enable_if::value && std::is_integral::value, bool>::type + try_override(OT & target, const struct llama_model_kv_override *override) { + if (validate_override(LLAMA_KV_OVERRIDE_INT, override)) { + target = override->int_value; + return true; + } + return false; + } + + template + static typename std::enable_if::value, bool>::type + try_override(T & target, const struct llama_model_kv_override *override) { + if (validate_override(LLAMA_KV_OVERRIDE_FLOAT, override)) { + target = override->float_value; + return true; + } + return false; + } + + template + static typename std::enable_if::value, bool>::type + try_override(T & target, const struct llama_model_kv_override *override) { + (void)target; + (void)override; + if (!override) { return false; } + // Currently, we should never end up here so it would be a bug if we do. + throw std::runtime_error(format("Unsupported attempt to override string type for metadata key %s\n", + override ? 
override->key : "NULL")); + } + + static bool set(const lm_gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override *override = nullptr) { + if (try_override(target, override)) { + return true; + } + if (k < 0) { return false; } + target = get_kv(ctx, k); + return true; + } + + static bool set(const lm_gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override *override = nullptr) { + return set(ctx, lm_gguf_find_key(ctx, key), target, override); + } + + static bool set(const lm_gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override *override = nullptr) { + return set(ctx, key.c_str(), target, override); + } + }; +} + struct llama_model_loader { int n_kv = 0; int n_tensors = 0; @@ -1700,21 +1984,34 @@ struct llama_model_loader { llama_fver fver; std::unique_ptr mapping; + std::unordered_map kv_overrides; struct lm_gguf_context * ctx_gguf = NULL; struct lm_ggml_context * ctx_meta = NULL; - llama_model_loader(const std::string & fname, bool use_mmap) : file(fname.c_str(), "rb") { + std::string arch_name; + LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN); + + llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) : file(fname.c_str(), "rb") { struct lm_gguf_init_params params = { /*.no_alloc = */ true, /*.ctx = */ &ctx_meta, }; + if (param_overrides_p != nullptr) { + for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) { + kv_overrides.insert({std::string(p->key), *p}); + } + } + ctx_gguf = lm_gguf_init_from_file(fname.c_str(), params); if (!ctx_gguf) { throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str())); } + get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); + llm_kv = LLM_KV(llm_arch_from_string(arch_name)); + n_kv = lm_gguf_get_n_kv(ctx_gguf); n_tensors = lm_gguf_get_n_tensors(ctx_gguf); @@ -1766,10 +2063,10 @@ struct llama_model_loader { case LM_GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break; case LM_GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break; default: - { - LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, lm_ggml_type_name(type_max)); - ftype = LLAMA_FTYPE_ALL_F32; - } break; + { + LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, lm_ggml_type_name(type_max)); + ftype = LLAMA_FTYPE_ALL_F32; + } break; } // this is a way to mark that we have "guessed" the file type @@ -1782,11 +2079,23 @@ struct llama_model_loader { } } + LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__); for (int i = 0; i < n_kv; i++) { - const char * name = lm_gguf_get_key(ctx_gguf, i); - const enum lm_gguf_type type = lm_gguf_get_kv_type(ctx_gguf, i); + const char * name = lm_gguf_get_key(ctx_gguf, i); + const enum lm_gguf_type type = lm_gguf_get_kv_type(ctx_gguf, i); + const std::string type_name = + type == LM_GGUF_TYPE_ARRAY + ? 
format("%s[%s,%d]", lm_gguf_type_name(type), lm_gguf_type_name(lm_gguf_get_arr_type(ctx_gguf, i)), lm_gguf_get_arr_n(ctx_gguf, i)) + : lm_gguf_type_name(type); + + std::string value = lm_gguf_kv_to_str(ctx_gguf, i); + const size_t MAX_VALUE_LEN = 40; + if (value.size() > MAX_VALUE_LEN) { + value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str()); + } + replace_all(value, "\n", "\\n"); - LLAMA_LOG_INFO("%s: - kv %3d: %42s %-8s\n", __func__, i, name, lm_gguf_type_name(type)); + LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str()); } // print type counts @@ -1816,19 +2125,59 @@ struct llama_model_loader { } } - std::string get_arch_name() const { - const auto kv = LLM_KV(LLM_ARCH_UNKNOWN); + template + typename std::enable_if::value, bool>::type + get_arr_n(const std::string & key, T & result, const bool required = true) { + const int kid = lm_gguf_find_key(ctx_gguf, key.c_str()); + + if (kid < 0) { + if (required) { + throw std::runtime_error(format("key not found in model: %s", key.c_str())); + } + return false; + } + + struct GGUFMeta::ArrayInfo arr_info = + GGUFMeta::GKV::get_kv(ctx_gguf, kid); + + + result = arr_info.length; + return true; + } + + template + typename std::enable_if::value, bool>::type + get_arr_n(const enum llm_kv kid, T & result, const bool required = true) { + return get_arr_n(llm_kv(kid), result, required); + } + + template + bool get_key(const std::string & key, T & result, const bool required = true) { + auto it = kv_overrides.find(key); + + const struct llama_model_kv_override * override = + it != kv_overrides.end() ? &it->second : nullptr; + + const bool found = GGUFMeta::GKV::set(ctx_gguf, key, result, override); + + if (required && !found) { + throw std::runtime_error(format("key not found in model: %s", key.c_str())); + } + + return found; + } - std::string arch_name; - LM_GGUF_GET_KEY(ctx_gguf, arch_name, lm_gguf_get_val_str, LM_GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_ARCHITECTURE)); + template + bool get_key(const enum llm_kv kid, T & result, const bool required = true) { + return get_key(llm_kv(kid), result, required); + } + std::string get_arch_name() const { return arch_name; } enum llm_arch get_arch() const { - const std::string arch_name = get_arch_name(); - - return llm_arch_from_string(arch_name); + return llm_kv.arch; } const char * get_tensor_name(int i) const { @@ -1868,10 +2217,13 @@ struct llama_model_loader { return tensor; } - struct lm_ggml_tensor * create_tensor(struct lm_ggml_context * ctx, const std::string & name, const std::vector & ne, lm_ggml_backend_type backend) { + struct lm_ggml_tensor * create_tensor(struct lm_ggml_context * ctx, const std::string & name, const std::vector & ne, lm_ggml_backend_type backend, bool required = true) { struct lm_ggml_tensor * cur = lm_ggml_get_tensor(ctx_meta, name.c_str()); if (cur == NULL) { + if (!required) { + return NULL; + } throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str())); } @@ -2075,49 +2427,56 @@ static void llm_load_arch(llama_model_loader & ml, llama_model & model) { static void llm_load_hparams( llama_model_loader & ml, llama_model & model) { - struct lm_gguf_context * ctx = ml.ctx_gguf; - - const auto kv = LLM_KV(model.arch); - auto & hparams = model.hparams; + const lm_gguf_context * ctx = ml.ctx_gguf; + + // get metadata as string + for (int i = 0; i < lm_gguf_get_n_kv(ctx); i++) { + enum lm_gguf_type type = lm_gguf_get_kv_type(ctx, i); + if (type == LM_GGUF_TYPE_ARRAY) { + continue; + } + 
const char * name = lm_gguf_get_key(ctx, i); + const std::string value = lm_gguf_kv_to_str(ctx, i); + model.lm_gguf_kv.emplace(name, value); + } // get general kv - LM_GGUF_GET_KEY(ctx, model.name, lm_gguf_get_val_str, LM_GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME)); + ml.get_key(LLM_KV_GENERAL_NAME, model.name, false); // get hparams kv - LM_GGUF_GET_KEY(ctx, hparams.n_vocab, lm_gguf_get_arr_n, LM_GGUF_TYPE_ARRAY, true, kv(LLM_KV_TOKENIZER_LIST)); - LM_GGUF_GET_KEY(ctx, hparams.n_ctx_train, lm_gguf_get_val_u32, LM_GGUF_TYPE_UINT32, true, kv(LLM_KV_CONTEXT_LENGTH)); - LM_GGUF_GET_KEY(ctx, hparams.n_embd, lm_gguf_get_val_u32, LM_GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH)); - LM_GGUF_GET_KEY(ctx, hparams.n_ff, lm_gguf_get_val_u32, LM_GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH)); - LM_GGUF_GET_KEY(ctx, hparams.n_head, lm_gguf_get_val_u32, LM_GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT)); - LM_GGUF_GET_KEY(ctx, hparams.n_layer, lm_gguf_get_val_u32, LM_GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT)); + ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab); + ml.get_key (LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train); + ml.get_key (LLM_KV_EMBEDDING_LENGTH, hparams.n_embd); + ml.get_key (LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff); + ml.get_key (LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head); + ml.get_key (LLM_KV_BLOCK_COUNT, hparams.n_layer); // n_head_kv is optional, default to n_head hparams.n_head_kv = hparams.n_head; - LM_GGUF_GET_KEY(ctx, hparams.n_head_kv, lm_gguf_get_val_u32, LM_GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV)); + ml.get_key(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv, false); - hparams.rope_finetuned = false; - LM_GGUF_GET_KEY(ctx, hparams.rope_finetuned, lm_gguf_get_val_bool, LM_GGUF_TYPE_BOOL, false, - kv(LLM_KV_ROPE_SCALING_FINETUNED)); + bool rope_finetuned = false; + ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false); + hparams.rope_finetuned = rope_finetuned; hparams.n_yarn_orig_ctx = hparams.n_ctx_train; - LM_GGUF_GET_KEY(ctx, hparams.n_yarn_orig_ctx, lm_gguf_get_val_u32, LM_GGUF_TYPE_UINT32, false, - kv(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN)); + ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_yarn_orig_ctx, false); // rope_freq_base (optional) hparams.rope_freq_base_train = 10000.0f; - LM_GGUF_GET_KEY(ctx, hparams.rope_freq_base_train, lm_gguf_get_val_f32, LM_GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE)); + ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false); std::string rope_scaling("linear"); - LM_GGUF_GET_KEY(ctx, rope_scaling, lm_gguf_get_val_str, LM_GGUF_TYPE_STRING, false, kv(LLM_KV_ROPE_SCALING_TYPE)); + ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false); hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling); LM_GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_UNSPECIFIED); // rope_freq_scale (inverse of the kv) is optional float ropescale = 0.0f; - LM_GGUF_GET_KEY(ctx, ropescale, lm_gguf_get_val_f32, LM_GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALING_FACTOR)); - if (ropescale == 0.0f) { // try the old key name - LM_GGUF_GET_KEY(ctx, ropescale, lm_gguf_get_val_f32, LM_GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); + if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) { + // try the old key name + ml.get_key(LLM_KV_ROPE_SCALE_LINEAR, ropescale, false); } hparams.rope_freq_scale_train = ropescale == 0.0f ? 
1.0f : 1.0f/ropescale; @@ -2125,7 +2484,7 @@ static void llm_load_hparams( { hparams.n_rot = hparams.n_embd / hparams.n_head; - LM_GGUF_GET_KEY(ctx, hparams.n_rot, lm_gguf_get_val_u32, LM_GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT)); + ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false); if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) { if (hparams.n_rot != hparams.n_embd / hparams.n_head) { @@ -2140,7 +2499,7 @@ static void llm_load_hparams( switch (model.arch) { case LLM_ARCH_LLAMA: { - LM_GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, lm_gguf_get_val_f32, LM_GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); switch (hparams.n_layer) { case 26: model.type = e_model::MODEL_3B; break; @@ -2154,7 +2513,7 @@ static void llm_load_hparams( } break; case LLM_ARCH_FALCON: { - LM_GGUF_GET_KEY(ctx, hparams.f_norm_eps, lm_gguf_get_val_f32, LM_GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS)); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); switch (hparams.n_layer) { case 32: model.type = e_model::MODEL_7B; break; @@ -2164,7 +2523,7 @@ static void llm_load_hparams( } break; case LLM_ARCH_BAICHUAN: { - LM_GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, lm_gguf_get_val_f32, LM_GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); switch (hparams.n_layer) { case 32: model.type = e_model::MODEL_7B; break; case 40: model.type = e_model::MODEL_13B; break; @@ -2173,7 +2532,7 @@ static void llm_load_hparams( } break; case LLM_ARCH_STARCODER: { - LM_GGUF_GET_KEY(ctx, hparams.f_norm_eps, lm_gguf_get_val_f32, LM_GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS)); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); switch (hparams.n_layer) { case 24: model.type = e_model::MODEL_1B; break; case 36: model.type = e_model::MODEL_3B; break; @@ -2184,7 +2543,7 @@ static void llm_load_hparams( } break; case LLM_ARCH_PERSIMMON: { - LM_GGUF_GET_KEY(ctx, hparams.f_norm_eps, lm_gguf_get_val_f32, LM_GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS)); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); switch (hparams.n_layer) { case 36: model.type = e_model::MODEL_8B; break; default: model.type = e_model::MODEL_UNKNOWN; @@ -2192,7 +2551,7 @@ static void llm_load_hparams( } break; case LLM_ARCH_REFACT: { - LM_GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, lm_gguf_get_val_f32, LM_GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); switch (hparams.n_layer) { case 32: model.type = e_model::MODEL_1B; break; default: model.type = e_model::MODEL_UNKNOWN; @@ -2200,7 +2559,7 @@ static void llm_load_hparams( } break; case LLM_ARCH_BLOOM: { - LM_GGUF_GET_KEY(ctx, hparams.f_norm_eps, lm_gguf_get_val_f32, LM_GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS)); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); switch (hparams.n_layer) { case 24: model.type = e_model::MODEL_1B; break; @@ -2215,9 +2574,9 @@ static void llm_load_hparams( { hparams.f_clamp_kqv = 0.0f; - LM_GGUF_GET_KEY(ctx, hparams.f_norm_eps, lm_gguf_get_val_f32, LM_GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS)); - LM_GGUF_GET_KEY(ctx, hparams.f_clamp_kqv, lm_gguf_get_val_f32, LM_GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_CLAMP_KQV)); - LM_GGUF_GET_KEY(ctx, 
hparams.f_max_alibi_bias, lm_gguf_get_val_f32, LM_GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS)); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false); + ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias); switch (hparams.n_layer) { case 32: model.type = e_model::MODEL_7B; break; @@ -2225,6 +2584,26 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_STABLELM: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + + switch (hparams.n_layer) { + case 32: model.type = e_model::MODEL_3B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; + case LLM_ARCH_QWEN: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + + switch (hparams.n_layer) { + case 32: model.type = e_model::MODEL_7B; break; + case 40: model.type = e_model::MODEL_13B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; + default: (void)0; } @@ -2265,7 +2644,7 @@ static void llm_load_vocab( { std::string tokenizer_name; - LM_GGUF_GET_KEY(ctx, tokenizer_name, lm_gguf_get_val_str, LM_GGUF_TYPE_STRING, true, kv(LLM_KV_TOKENIZER_MODEL)); + ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_name); if (tokenizer_name == "llama") { vocab.type = LLAMA_VOCAB_TYPE_SPM; @@ -2355,16 +2734,30 @@ static void llm_load_vocab( }; for (const auto & it : special_token_types) { const std::string & key = kv(std::get<0>(it)); - int32_t & id = std::get<1>(it), old_id = id; + int32_t & id = std::get<1>(it); + + uint32_t new_id; + if (!ml.get_key(std::get<0>(it), new_id, false)) { + continue; + } + if (new_id >= vocab.id_to_token.size()) { + LLAMA_LOG_WARN("%s: bad special token: '%s' = %ud, using default id %d\n", + __func__, key.c_str(), new_id, id); + } else { + id = new_id; + } + + } - LM_GGUF_GET_KEY(ctx, id, lm_gguf_get_val_u32, LM_GGUF_TYPE_UINT32, false, key); - // Must be >= -1 and < vocab size. Since the key is unsigned, -1 - // can only come from the default value, so there's no point in - // validating that. 
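The reworked special-token loop reads the id into a separate unsigned value and only accepts it when it falls inside the vocabulary, keeping the built-in default otherwise. The same check reduced to a standalone helper (name is illustrative, not part of the patch):

```cpp
#include <cstddef>
#include <cstdint>

// keep the default id when the metadata value is out of range
static int32_t pick_special_id(uint32_t new_id, int32_t default_id, size_t n_vocab) {
    return new_id < n_vocab ? (int32_t) new_id : default_id;
}
```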
- if (size_t(id + 1) > vocab.id_to_token.size()) { - LLAMA_LOG_WARN("%s: bad special token: '%s' = %d, using default id %d\n", - __func__, key.c_str(), id, old_id); - id = old_id; + // Handle add_bos_token and add_eos_token + { + bool temp = true; + + if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) { + vocab.special_add_bos = int(temp); + } + if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) { + vocab.special_add_eos = int(temp); } } } @@ -2497,22 +2890,22 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type)); LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str()); LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9); - if (ml.n_bytes < GB) { - LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements); + if (ml.n_bytes < GiB) { + LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements); } else { LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements); } // general kv - LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str()); + LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str()); // special tokens - if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); } - if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); } - if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); } - if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); } - if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); } - if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); } + if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); } + if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); } + if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); } + if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); } + if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); } + if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); } } static void llm_load_tensors( @@ -2536,7 +2929,7 @@ static void 
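The new special_add_bos / special_add_eos fields are tri-state (-1 unknown, 0 don't add, 1 add), filled from LLM_KV_TOKENIZER_ADD_BOS / LLM_KV_TOKENIZER_ADD_EOS when present. A minimal sketch of how such a flag might be resolved against a per-vocab default at tokenization time; the helper and its default parameter are assumptions:

```cpp
// hypothetical resolution of the tri-state add-BOS flag against a vocab default
static bool should_add_bos(int special_add_bos, bool default_add_bos) {
    return special_add_bos < 0 ? default_add_bos : special_add_bos != 0;
}
```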
llm_load_tensors( ml.calc_sizes(ctx_size, mmapped_size); - LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0); + LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0); // create the ggml context { @@ -2598,14 +2991,7 @@ static void llm_load_tensors( lm_ggml_backend_type backend_output; if (n_gpu_layers > int(n_layer)) { - // norm is not performance relevant on its own but keeping it in VRAM reduces data copying - // on Windows however this is detrimental unless everything is on the GPU -#ifndef _WIN32 - backend_norm = llama_backend_offload; -#else - backend_norm = n_gpu_layers <= (int) n_layer + 2 ? LM_GGML_BACKEND_CPU : llama_backend_offload; -#endif // _WIN32 - + backend_norm = llama_backend_offload; backend_output = llama_backend_offload_split; } else { backend_norm = LM_GGML_BACKEND_CPU; @@ -2642,6 +3028,12 @@ static void llm_load_tensors( layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split); layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + // optional bias tensors + layer.bq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, backend, false); + layer.bk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, backend, false); + layer.bv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, backend, false); + layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend, false); + layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); @@ -2650,9 +3042,14 @@ static void llm_load_tensors( if (backend == LM_GGML_BACKEND_GPU) { vram_weights += - lm_ggml_nbytes(layer.attn_norm) + lm_ggml_nbytes(layer.wq) + lm_ggml_nbytes(layer.wk) + - lm_ggml_nbytes(layer.wv) + lm_ggml_nbytes(layer.wo) + lm_ggml_nbytes(layer.ffn_norm) + - lm_ggml_nbytes(layer.ffn_gate) + lm_ggml_nbytes(layer.ffn_down) + lm_ggml_nbytes(layer.ffn_up); + lm_ggml_nbytes(layer.attn_norm) + lm_ggml_nbytes(layer.wq) + lm_ggml_nbytes(layer.wk) + + lm_ggml_nbytes(layer.wv) + lm_ggml_nbytes(layer.wo) + + (layer.bq ? lm_ggml_nbytes(layer.bq) : 0) + + (layer.bk ? lm_ggml_nbytes(layer.bk) : 0) + + (layer.bv ? lm_ggml_nbytes(layer.bv) : 0) + + (layer.bo ? lm_ggml_nbytes(layer.bo) : 0) + + lm_ggml_nbytes(layer.ffn_norm) + lm_ggml_nbytes(layer.ffn_gate) + + lm_ggml_nbytes(layer.ffn_down) + lm_ggml_nbytes(layer.ffn_up); } } } break; @@ -2664,14 +3061,7 @@ static void llm_load_tensors( lm_ggml_backend_type backend_output; if (n_gpu_layers > int(n_layer)) { - // norm is not performance relevant on its own but keeping it in VRAM reduces data copying - // on Windows however this is detrimental unless everything is on the GPU -#ifndef _WIN32 - backend_norm = llama_backend_offload; -#else - backend_norm = n_gpu_layers <= (int) n_layer + 2 ? 
LM_GGML_BACKEND_CPU : llama_backend_offload; -#endif // _WIN32 - + backend_norm = llama_backend_offload; backend_output = llama_backend_offload_split; } else { backend_norm = LM_GGML_BACKEND_CPU; @@ -2734,14 +3124,7 @@ static void llm_load_tensors( lm_ggml_backend_type backend_output; if (n_gpu_layers > int(n_layer)) { - // norm is not performance relevant on its own but keeping it in VRAM reduces data copying - // on Windows however this is detrimental unless everything is on the GPU -#ifndef _WIN32 - backend_norm = llama_backend_offload; -#else - backend_norm = n_gpu_layers <= (int) n_layer + 2 ? LM_GGML_BACKEND_CPU : llama_backend_offload; -#endif // _WIN32 - + backend_norm = llama_backend_offload; backend_output = llama_backend_offload_split; } else { backend_norm = LM_GGML_BACKEND_CPU; @@ -2811,14 +3194,7 @@ static void llm_load_tensors( lm_ggml_backend_type backend_output; if (n_gpu_layers > int(n_layer)) { - // norm is not performance relevant on its own but keeping it in VRAM reduces data copying - // on Windows however this is detrimental unless everything is on the GPU -#ifndef _WIN32 - backend_norm = llama_backend_offload; -#else - backend_norm = n_gpu_layers <= (int) n_layer + 2 ? LM_GGML_BACKEND_CPU : llama_backend_offload; -#endif // _WIN32 - + backend_norm = llama_backend_offload; backend_output = llama_backend_offload_split; } else { backend_norm = LM_GGML_BACKEND_CPU; @@ -2888,14 +3264,7 @@ static void llm_load_tensors( lm_ggml_backend_type backend_output; if (n_gpu_layers > int(n_layer)) { - // norm is not performance relevant on its own but keeping it in VRAM reduces data copying - // on Windows however this is detrimental unless everything is on the GPU -#ifndef _WIN32 - backend_norm = llama_backend_offload; -#else - backend_norm = n_gpu_layers <= (int) n_layer + 2 ? LM_GGML_BACKEND_CPU : llama_backend_offload; -#endif // _WIN32 - + backend_norm = llama_backend_offload; backend_output = llama_backend_offload_split; } else { backend_norm = LM_GGML_BACKEND_CPU; @@ -2954,14 +3323,7 @@ static void llm_load_tensors( lm_ggml_backend_type backend_output; if (n_gpu_layers > int(n_layer)) { - // norm is not performance relevant on its own but keeping it in VRAM reduces data copying - // on Windows however this is detrimental unless everything is on the GPU -#ifndef _WIN32 - backend_norm = llama_backend_offload; -#else - backend_norm = n_gpu_layers <= (int) n_layer + 2 ? LM_GGML_BACKEND_CPU : llama_backend_offload; -#endif // _WIN32 - + backend_norm = llama_backend_offload; backend_output = llama_backend_offload_split; } else { backend_norm = LM_GGML_BACKEND_CPU; @@ -3032,14 +3394,7 @@ static void llm_load_tensors( lm_ggml_backend_type backend_output; if (n_gpu_layers > int(n_layer)) { - // norm is not performance relevant on its own but keeping it in VRAM reduces data copying - // on Windows however this is detrimental unless everything is on the GPU -#ifndef _WIN32 - backend_norm = llama_backend_offload; -#else - backend_norm = n_gpu_layers <= (int) n_layer + 2 ? 
LM_GGML_BACKEND_CPU : llama_backend_offload; -#endif // _WIN32 - + backend_norm = llama_backend_offload; backend_output = llama_backend_offload_split; } else { backend_norm = LM_GGML_BACKEND_CPU; @@ -3089,41 +3444,167 @@ static void llm_load_tensors( } } } break; - default: - throw std::runtime_error("unknown architecture"); - } - } + case LLM_ARCH_STABLELM: + { + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, LM_GGML_BACKEND_CPU); - ml.done_getting_tensors(); + // output + { + lm_ggml_backend_type backend_norm; + lm_ggml_backend_type backend_output; - // print memory requirements - { - // this is the total memory required to run the inference - size_t mem_required = - ctx_size + - mmapped_size - vram_weights; // weights in VRAM not in memory + if (n_gpu_layers > int(n_layer)) { + backend_norm = llama_backend_offload; + backend_output = llama_backend_offload_split; + } else { + backend_norm = LM_GGML_BACKEND_CPU; + backend_output = LM_GGML_BACKEND_CPU; + } - LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0); + model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); + model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); + model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); -#if defined(LM_GGML_USE_CUBLAS) || defined(LM_GGML_USE_CLBLAST) - const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); + if (backend_norm == LM_GGML_BACKEND_GPU) { + vram_weights += lm_ggml_nbytes(model.output_norm); + } + if (backend_output == LM_GGML_BACKEND_GPU_SPLIT) { + vram_weights += lm_ggml_nbytes(model.output); + } + } - LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu); - if (n_gpu_layers > (int) hparams.n_layer) { - LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__); - } + const uint32_t n_ff = hparams.n_ff; -#ifdef LM_GGML_USE_CUBLAS - const int max_backend_supported_layers = hparams.n_layer + 3; - const int max_offloadable_layers = hparams.n_layer + 3; -#elif LM_GGML_USE_CLBLAST - const int max_backend_supported_layers = hparams.n_layer + 1; - const int max_offloadable_layers = hparams.n_layer + 1; -#endif // LM_GGML_USE_CUBLAS + const int i_gpu_start = n_layer - n_gpu_layers; - LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers); - LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0); -#else + model.layers.resize(n_layer); + + for (uint32_t i = 0; i < n_layer; ++i) { + /* + llama_model_loader: - tensor 4: blk.0.attn_output.weight f16 [ 2560, 2560, 1, 1 ] + */ + const lm_ggml_backend_type backend = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload; // NOLINT + const lm_ggml_backend_type backend_split = int(i) < i_gpu_start ? 
LM_GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + + layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split); + layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split); + layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split); + layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + + layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + + layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + + if (backend == LM_GGML_BACKEND_GPU) { + vram_weights += + lm_ggml_nbytes(layer.attn_norm) + lm_ggml_nbytes(layer.wq) + lm_ggml_nbytes(layer.wk) + + lm_ggml_nbytes(layer.wv) + lm_ggml_nbytes(layer.wo) + lm_ggml_nbytes(layer.ffn_norm) + + lm_ggml_nbytes(layer.ffn_gate) + lm_ggml_nbytes(layer.ffn_down) + lm_ggml_nbytes(layer.ffn_up); + } + } + } break; + case LLM_ARCH_QWEN: + { + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, LM_GGML_BACKEND_CPU); + { + lm_ggml_backend_type backend_norm; + lm_ggml_backend_type backend_output; + + if (n_gpu_layers > int(n_layer)) { + backend_norm = llama_backend_offload; + backend_output = llama_backend_offload_split; + } else { + backend_norm = LM_GGML_BACKEND_CPU; + backend_output = LM_GGML_BACKEND_CPU; + } + + model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); + model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + + if (backend_norm == LM_GGML_BACKEND_GPU) { + vram_weights += lm_ggml_nbytes(model.output_norm); + } + if (backend_output == LM_GGML_BACKEND_GPU_SPLIT) { + vram_weights += lm_ggml_nbytes(model.output); + } + } + + const uint32_t n_ff = hparams.n_ff / 2; + + const int i_gpu_start = n_layer - n_gpu_layers; + + model.layers.resize(n_layer); + + for (uint32_t i = 0; i < n_layer; ++i) { + const lm_ggml_backend_type backend = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload; // NOLINT + const lm_ggml_backend_type backend_split = int(i) < i_gpu_start ? 
LM_GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + + layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd * 3}, backend_split); + layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd * 3}, backend); + layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + + layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + + layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + + if (backend == LM_GGML_BACKEND_GPU) { + vram_weights += + lm_ggml_nbytes(layer.attn_norm) + lm_ggml_nbytes(layer.wqkv) + lm_ggml_nbytes(layer.bqkv) + + lm_ggml_nbytes(layer.wo) + lm_ggml_nbytes(layer.ffn_norm) + lm_ggml_nbytes(layer.ffn_gate) + + lm_ggml_nbytes(layer.ffn_down) + lm_ggml_nbytes(layer.ffn_up); + } + } + } break; + + default: + throw std::runtime_error("unknown architecture"); + } + } + + ml.done_getting_tensors(); + + // print memory requirements + { + // this is the total memory required to run the inference + size_t mem_required = + ctx_size + + mmapped_size - vram_weights; // weights in VRAM not in memory + + LLAMA_LOG_INFO("%s: mem required = %7.2f MiB\n", __func__, mem_required / 1024.0 / 1024.0); + +#if defined(LM_GGML_USE_CUBLAS) || defined(LM_GGML_USE_CLBLAST) + const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); + + LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu); + if (n_gpu_layers > (int) hparams.n_layer) { + LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__); + } + +#ifdef LM_GGML_USE_CUBLAS + const int max_backend_supported_layers = hparams.n_layer + 1; + const int max_offloadable_layers = hparams.n_layer + 1; +#elif LM_GGML_USE_CLBLAST + const int max_backend_supported_layers = hparams.n_layer + 1; + const int max_offloadable_layers = hparams.n_layer + 1; +#endif // LM_GGML_USE_CUBLAS + + LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers); + LLAMA_LOG_INFO("%s: VRAM used: %.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0); +#else (void) n_gpu_layers; #endif // defined(LM_GGML_USE_CUBLAS) || defined(LM_GGML_USE_CLBLAST) } @@ -3156,7 +3637,7 @@ static void llm_load_tensors( static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) { try { - llama_model_loader ml(fname, params.use_mmap); + llama_model_loader ml(fname, params.use_mmap, params.kv_overrides); model.hparams.vocab_only = params.vocab_only; @@ -3252,7 +3733,7 @@ static void llm_build_k_shift( struct lm_ggml_cgraph * graph, llm_rope_type type, int64_t n_ctx, - int64_t n_rot, + int n_rot, float freq_base, float freq_scale, const llm_build_cb & cb) { @@ -3283,11 +3764,11 @@ static void llm_build_k_shift( struct lm_ggml_tensor * tmp = // we rotate only the first n_rot dimensions lm_ggml_rope_custom_inplace(ctx, - lm_ggml_view_3d(ctx, kv.k, - n_rot, n_head_kv, n_ctx, - lm_ggml_element_size(kv.k)*n_embd_head, - lm_ggml_element_size(kv.k)*n_embd_gqa, - 
lm_ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il), + lm_ggml_view_3d(ctx, kv.k_l[il], + n_embd_head, n_head_kv, n_ctx, + lm_ggml_type_sizef(kv.k_l[il]->type)*n_embd_head, + lm_ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa, + 0), K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(tmp, "K_shifted", il); @@ -3314,13 +3795,13 @@ static void llm_build_kv_store( //struct lm_ggml_tensor * v_cur_t = lm_ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed cb(v_cur_t, "v_cur_t", il); - struct lm_ggml_tensor * k_cache_view = lm_ggml_view_1d(ctx, kv.k, n_tokens*n_embd_gqa, - (lm_ggml_element_size(kv.k)*n_embd_gqa)*(il*n_ctx + kv_head)); + struct lm_ggml_tensor * k_cache_view = lm_ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_gqa, + (lm_ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa)*kv_head); cb(k_cache_view, "k_cache_view", il); - struct lm_ggml_tensor * v_cache_view = lm_ggml_view_2d(ctx, kv.v, n_tokens, n_embd_gqa, - ( n_ctx)*lm_ggml_element_size(kv.v), - (il*n_ctx)*lm_ggml_element_size(kv.v)*n_embd_gqa + kv_head*lm_ggml_element_size(kv.v)); + struct lm_ggml_tensor * v_cache_view = lm_ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_gqa, + ( n_ctx)*lm_ggml_element_size(kv.v_l[il]), + (kv_head)*lm_ggml_element_size(kv.v_l[il])); cb(v_cache_view, "v_cache_view", il); // important: storing RoPE-ed version of K in the KV cache! @@ -3472,40 +3953,46 @@ static struct lm_ggml_tensor * llm_build_kqv( cb(q, "q", il); struct lm_ggml_tensor * k = - lm_ggml_view_3d(ctx, kv.k, + lm_ggml_view_3d(ctx, kv.k_l[il], n_embd_head, n_kv, n_head_kv, - lm_ggml_element_size(kv.k)*n_embd_gqa, - lm_ggml_element_size(kv.k)*n_embd_head, - lm_ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il); + lm_ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa, + lm_ggml_type_sizef(kv.k_l[il]->type)*n_embd_head, + 0); cb(k, "k", il); struct lm_ggml_tensor * kq = lm_ggml_mul_mat(ctx, k, q); cb(kq, "kq", il); - kq = lm_ggml_scale(ctx, kq, kq_scale); - cb(kq, "kq_scaled", il); - if (max_alibi_bias > 0.0f) { - // TODO: n_head or n_head_kv - // TODO: K-shift is likely not working - // TODO: change to lm_ggml_add - kq = lm_ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias); - cb(kq, "kq_scaled_alibi", il); - } + // temporary branch until we figure out how to handle lm_ggml_alibi through lm_ggml_add + kq = lm_ggml_scale(ctx, kq, kq_scale); + cb(kq, "kq_scaled", il); + + if (max_alibi_bias > 0.0f) { + // TODO: n_head or n_head_kv + // TODO: K-shift is likely not working + // TODO: change to lm_ggml_add + kq = lm_ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias); + cb(kq, "kq_scaled_alibi", il); + } - kq = lm_ggml_add(ctx, kq, kq_mask); - cb(kq, "kq_masked", il); + kq = lm_ggml_add(ctx, kq, kq_mask); + cb(kq, "kq_masked", il); - kq = lm_ggml_soft_max(ctx, kq); - cb(kq, "kq_soft_max", il); + kq = lm_ggml_soft_max(ctx, kq); + cb(kq, "kq_soft_max", il); + } else { + kq = lm_ggml_soft_max_ext(ctx, kq, kq_mask, 1.0f/sqrtf(float(n_embd_head))); + cb(kq, "kq_soft_max_ext", il); + } // split cached v into n_head heads struct lm_ggml_tensor * v = - lm_ggml_view_3d(ctx, kv.v, + lm_ggml_view_3d(ctx, kv.v_l[il], n_kv, n_embd_head, n_head_kv, - lm_ggml_element_size(kv.v)*n_ctx, - lm_ggml_element_size(kv.v)*n_ctx*n_embd_head, - lm_ggml_element_size(kv.v)*n_ctx*n_embd_gqa*il); + lm_ggml_element_size(kv.v_l[il])*n_ctx, + lm_ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head, + 0); cb(v, "v", il); struct lm_ggml_tensor * kqv = lm_ggml_mul_mat(ctx, v, kq); @@ -3622,7 +4109,7 @@ struct 
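In llm_build_kqv, the unfused scale, mask-add and softmax steps are kept only for the ALiBi branch; otherwise they collapse into a single lm_ggml_soft_max_ext call. A plain C++ sketch of the per-row semantics of that fused op, for reference only (not a ggml call):

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

// reference semantics: softmax(scale * kq + mask), computed row-wise;
// mask is assumed to have the same length as kq
static std::vector<float> soft_max_ext_row(std::vector<float> kq,
                                           const std::vector<float> & mask,
                                           float scale) {
    float max_val = -INFINITY;
    for (size_t i = 0; i < kq.size(); ++i) {
        kq[i] = scale * kq[i] + mask[i];
        max_val = std::max(max_val, kq[i]);
    }
    float sum = 0.0f;
    for (float & x : kq) { x = std::exp(x - max_val); sum += x; }
    for (float & x : kq) { x /= sum; }
    return kq;
}
```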
llm_build_context { } struct lm_ggml_cgraph * build_llama() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph(ctx0); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); LM_GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -3663,12 +4150,24 @@ struct llm_build_context { // compute Q and K and RoPE them struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } Qcur = lm_ggml_rope_custom( ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, @@ -3687,7 +4186,7 @@ struct llm_build_context { llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); cur = llm_build_kqv(ctx0, hparams, kv_self, - model.layers[il].wo, NULL, + model.layers[il].wo, model.layers[il].bo, Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); cb(cur, "kqv_out", il); } @@ -3734,7 +4233,7 @@ struct llm_build_context { } struct lm_ggml_cgraph * build_baichuan() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph(ctx0); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); struct lm_ggml_tensor * cur; struct lm_ggml_tensor * inpL; @@ -3854,7 +4353,7 @@ struct llm_build_context { } struct lm_ggml_cgraph * build_falcon() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph(ctx0); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); struct lm_ggml_tensor * cur; struct lm_ggml_tensor * inpL; @@ -3976,7 +4475,7 @@ struct llm_build_context { } struct lm_ggml_cgraph * build_starcoder() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph(ctx0); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); struct lm_ggml_tensor * cur; struct lm_ggml_tensor * pos; @@ -4075,7 +4574,7 @@ struct llm_build_context { } struct lm_ggml_cgraph * build_persimmon() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph(ctx0); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); const int64_t n_rot = n_embd_head / 2; @@ -4085,6 +4584,7 @@ struct llm_build_context { inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); cb(inpL, "imp_embd", -1); + // inp_pos - contains the positions struct lm_ggml_tensor * inp_pos = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); cb(inp_pos, "inp_pos", -1); @@ -4092,6 +4592,7 @@ struct llm_build_context { struct lm_ggml_tensor * KQ_scale = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, 1); cb(KQ_scale, "KQ_scale", -1); + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); cb(KQ_mask, "KQ_mask", -1); @@ -4220,7 +4721,7 @@ struct llm_build_context { struct lm_ggml_tensor * Kcur = lm_ggml_concat(ctx0, krotated, kpass); cb(Kcur, "Kcur", il); - struct lm_ggml_tensor * Q = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, Qcur, 1, 2, 0, 3)); + struct lm_ggml_tensor * Q = lm_ggml_cont(ctx0, 
lm_ggml_permute(ctx0, Qcur, 2, 1, 0, 3)); cb(Q, "Q", il); Kcur = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, Kcur, 2, 1, 0, 3)); @@ -4285,7 +4786,7 @@ struct llm_build_context { } struct lm_ggml_cgraph * build_refact() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph(ctx0); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); struct lm_ggml_tensor * cur; struct lm_ggml_tensor * inpL; @@ -4330,25 +4831,218 @@ struct llm_build_context { cur = llm_build_kqv(ctx0, hparams, kv_self, model.layers[il].wo, NULL, - Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, cb, il); + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, cb, il); + cb(cur, "kqv_out", il); + } + + struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + } + + cur = lm_ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = lm_ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + lm_ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct lm_ggml_cgraph * build_bloom() { + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + struct lm_ggml_tensor * cur; + struct lm_ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + cb(inpL, "inp_embd", -1); + + // KQ_scale + struct lm_ggml_tensor * KQ_scale = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); + + inpL = llm_build_norm(ctx0, inpL, hparams, + model.tok_norm, + model.tok_norm_b, + LLM_NORM, cb, -1); + cb(inpL, "inp_norm", -1); + + for (int il = 0; il < n_layer; ++il) { + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + cur = lm_ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + cur = lm_ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + + struct lm_ggml_tensor * Qcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct lm_ggml_tensor * Kcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct lm_ggml_tensor * Vcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + + llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); + + cur = llm_build_kqv(ctx0, hparams, kv_self, + model.layers[il].wo, model.layers[il].bo, + Qcur, KQ_scale, KQ_mask, n_ctx, 
n_tokens, n_kv, 8.0f, cb, il); + cb(cur, "kqv_out", il); + } + + // Add the input + struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); + + // FF + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_out", il); + } + + inpL = lm_ggml_add(ctx0, cur, ffn_inp); + cb(inpL, "l_out", il); + } + + cur = llm_build_norm(ctx0, inpL, hparams, + model.output_norm, + model.output_norm_b, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); + + cur = lm_ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + lm_ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct lm_ggml_cgraph * build_mpt() { + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + struct lm_ggml_tensor * cur; + struct lm_ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + cb(inpL, "inp_embd", -1); + + // KQ_scale + struct lm_ggml_tensor * KQ_scale = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); + + for (int il = 0; il < n_layer; ++il) { + struct lm_ggml_tensor * attn_norm; + + attn_norm = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, + NULL, + LLM_NORM, cb, il); + cb(attn_norm, "attn_norm", il); + + // self-attention + { + cur = attn_norm; + + cur = lm_ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + if (hparams.f_clamp_kqv > 0.0f) { + cur = lm_ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); + cb(cur, "wqkv_clamped", il); + } + + struct lm_ggml_tensor * Qcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct lm_ggml_tensor * Kcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct lm_ggml_tensor * Vcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + + llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); + + cur = llm_build_kqv(ctx0, hparams, kv_self, + model.layers[il].wo, NULL, + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, cb, il); cb(cur, "kqv_out", il); } - struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA); + // Add the input + struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpL); cb(ffn_inp, "ffn_inp", il); - // feed-forward network + // feed forward { cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + model.layers[il].ffn_norm, + NULL, + LLM_NORM, cb, il); cb(cur, "ffn_norm", il); cur = llm_build_ffn(ctx0, cur, model.layers[il].ffn_up, NULL, - model.layers[il].ffn_gate, NULL, + NULL, NULL, model.layers[il].ffn_down, NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + LLM_FFN_GELU, 
LLM_FFN_SEQ, cb, il); cb(cur, "ffn_out", il); } @@ -4362,11 +5056,11 @@ struct llm_build_context { cur = inpL; cur = llm_build_norm(ctx0, cur, hparams, - model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + model.output_norm, + NULL, + LLM_NORM, cb, -1); cb(cur, "result_norm", -1); - // lm_head cur = lm_ggml_mul_mat(ctx0, model.output, cur); cb(cur, "result_output", -1); @@ -4375,7 +5069,7 @@ struct llm_build_context { return gf; } - struct lm_ggml_cgraph * build_bloom() { + struct lm_ggml_cgraph * build_stablelm() { struct lm_ggml_cgraph * gf = lm_ggml_new_graph(ctx0); struct lm_ggml_tensor * cur; @@ -4384,6 +5078,10 @@ struct llm_build_context { inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); cb(inpL, "inp_embd", -1); + // inp_pos - contains the positions + struct lm_ggml_tensor * inp_pos = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos", -1); + // KQ_scale struct lm_ggml_tensor * KQ_scale = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, 1); cb(KQ_scale, "KQ_scale", -1); @@ -4392,13 +5090,15 @@ struct llm_build_context { struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); cb(KQ_mask, "KQ_mask", -1); - inpL = llm_build_norm(ctx0, inpL, hparams, - model.tok_norm, - model.tok_norm_b, - LLM_NORM, cb, -1); - cb(inpL, "inp_norm", -1); + // shift the entire K-cache if needed + if (do_rope_shift) { + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, hparams.n_rot, freq_base, freq_scale, cb); + } for (int il = 0; il < n_layer; ++il) { + struct lm_ggml_tensor * inpSA = inpL; + + // norm cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, model.layers[il].attn_norm_b, @@ -4407,35 +5107,42 @@ struct llm_build_context { // self-attention { - cur = lm_ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); - cb(cur, "wqkv", il); + // compute Q and K and RoPE them + struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); - cur = lm_ggml_add(ctx0, cur, model.layers[il].bqkv); - cb(cur, "bqkv", il); + struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); - struct lm_ggml_tensor * Qcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - struct lm_ggml_tensor * Kcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - struct lm_ggml_tensor * Vcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + Qcur = lm_ggml_rope_custom( + ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = lm_ggml_rope_custom( + ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); cur = llm_build_kqv(ctx0, hparams, kv_self, - model.layers[il].wo, model.layers[il].bo, - Qcur, 
KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, cb, il); + model.layers[il].wo, NULL, + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); cb(cur, "kqv_out", il); } - // Add the input - struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpL); + struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); - // FF + // feed-forward network { cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, @@ -4444,23 +5151,29 @@ struct llm_build_context { cb(cur, "ffn_norm", il); cur = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, - NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, - LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); } - inpL = lm_ggml_add(ctx0, cur, ffn_inp); - cb(inpL, "l_out", il); + cur = lm_ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; } - cur = llm_build_norm(ctx0, inpL, hparams, + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1); cb(cur, "result_norm", -1); + // lm_head cur = lm_ggml_mul_mat(ctx0, model.output, cur); cb(cur, "result_output", -1); @@ -4469,8 +5182,8 @@ struct llm_build_context { return gf; } - struct lm_ggml_cgraph * build_mpt() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph(ctx0); + struct lm_ggml_cgraph * build_qwen() { + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); struct lm_ggml_tensor * cur; struct lm_ggml_tensor * inpL; @@ -4478,70 +5191,86 @@ struct llm_build_context { inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); cb(inpL, "inp_embd", -1); + // inp_pos - contains the positions + struct lm_ggml_tensor * inp_pos= lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos", -1); + // KQ_scale - struct lm_ggml_tensor * KQ_scale = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, 1); + struct lm_ggml_tensor * KQ_scale= lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, 1); cb(KQ_scale, "KQ_scale", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); + struct lm_ggml_tensor * KQ_mask= lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); cb(KQ_mask, "KQ_mask", -1); + // shift the entire K-cache if needed + if (do_rope_shift) { + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb); + } + for (int il = 0; il < n_layer; ++il) { - struct lm_ggml_tensor * attn_norm; + struct lm_ggml_tensor * inpSA = inpL; - attn_norm = llm_build_norm(ctx0, inpL, hparams, - model.layers[il].attn_norm, - NULL, - LLM_NORM, cb, il); - cb(attn_norm, "attn_norm", il); + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); // self-attention { - cur = attn_norm; - cur = lm_ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); - if (hparams.f_clamp_kqv > 0.0f) { - cur = lm_ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); - cb(cur, "wqkv_clamped", il); - } + cur = lm_ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); - struct lm_ggml_tensor * Qcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd, n_tokens, 
cur->nb[1], 0*sizeof(float)*(n_embd))); - struct lm_ggml_tensor * Kcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - struct lm_ggml_tensor * Vcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + struct lm_ggml_tensor * Qcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct lm_ggml_tensor * Kcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct lm_ggml_tensor * Vcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd))); cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + + // using mode = 2 for neox mode + Qcur = lm_ggml_rope_custom( + ctx0, Qcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = lm_ggml_rope_custom( + ctx0, Kcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); cur = llm_build_kqv(ctx0, hparams, kv_self, model.layers[il].wo, NULL, - Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, cb, il); + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); cb(cur, "kqv_out", il); } - // Add the input - struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpL); + struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); - // feed forward + // feed-forward forward { cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].ffn_norm, - NULL, - LLM_NORM, cb, il); + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); cur = llm_build_ffn(ctx0, cur, model.layers[il].ffn_up, NULL, - NULL, NULL, + model.layers[il].ffn_gate, NULL, model.layers[il].ffn_down, NULL, - LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); } @@ -4555,11 +5284,11 @@ struct llm_build_context { cur = inpL; cur = llm_build_norm(ctx0, cur, hparams, - model.output_norm, - NULL, - LLM_NORM, cb, -1); + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); cb(cur, "result_norm", -1); + // lm_head cur = lm_ggml_mul_mat(ctx0, model.output, cur); cb(cur, "result_output", -1); @@ -4577,8 +5306,8 @@ struct llm_build_context { enum llm_offload_func_e { OFFLOAD_FUNC_NOP, OFFLOAD_FUNC, - OFFLOAD_FUNC_KQ, - OFFLOAD_FUNC_V, + OFFLOAD_FUNC_FRC, // force offload + OFFLOAD_FUNC_KQV, OFFLOAD_FUNC_NR, OFFLOAD_FUNC_EMB, OFFLOAD_FUNC_OUT, @@ -4664,11 +5393,12 @@ static const std::unordered_map k_offload_map //{ "inp_embd", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel { "pos_embd", OFFLOAD_FUNC_NR }, - { "inp_pos", OFFLOAD_FUNC_KQ }, // this is often used for KQ ops (e.g. rope) - { "KQ_scale", OFFLOAD_FUNC_KQ }, - { "KQ_mask", OFFLOAD_FUNC_KQ }, - { "K_shift", OFFLOAD_FUNC_KQ }, - { "K_shifted", OFFLOAD_FUNC_KQ }, + { "inp_pos", OFFLOAD_FUNC_FRC }, // this is often used for KQ ops (e.g. 
rope) + { "KQ_scale", OFFLOAD_FUNC_FRC }, + { "KQ_mask", OFFLOAD_FUNC_FRC }, + { "K_shift", OFFLOAD_FUNC_FRC }, + + { "K_shifted", OFFLOAD_FUNC }, { "inp_norm", OFFLOAD_FUNC_NR }, { "inp_norm_w", OFFLOAD_FUNC_NR }, @@ -4681,37 +5411,38 @@ static const std::unordered_map k_offload_map { "attn_norm", OFFLOAD_FUNC }, { "attn_norm_2", OFFLOAD_FUNC }, - { "wqkv", OFFLOAD_FUNC_KQ }, - { "bqkv", OFFLOAD_FUNC_KQ }, - { "wqkv_clamped", OFFLOAD_FUNC_KQ }, - - { "tmpk", OFFLOAD_FUNC_KQ }, - { "tmpq", OFFLOAD_FUNC_KQ }, - { "tmpv", OFFLOAD_FUNC_V }, - { "Kcur", OFFLOAD_FUNC_KQ }, - { "Qcur", OFFLOAD_FUNC_KQ }, - { "Vcur", OFFLOAD_FUNC_V }, - - { "krot", OFFLOAD_FUNC_KQ }, - { "qrot", OFFLOAD_FUNC_KQ }, - { "kpass", OFFLOAD_FUNC_KQ }, - { "qpass", OFFLOAD_FUNC_KQ }, - { "krotated", OFFLOAD_FUNC_KQ }, - { "qrotated", OFFLOAD_FUNC_KQ }, - - { "q", OFFLOAD_FUNC_KQ }, - { "k", OFFLOAD_FUNC_KQ }, - { "kq", OFFLOAD_FUNC_KQ }, - { "kq_scaled", OFFLOAD_FUNC_KQ }, - { "kq_scaled_alibi", OFFLOAD_FUNC_KQ }, - { "kq_masked", OFFLOAD_FUNC_KQ }, - { "kq_soft_max", OFFLOAD_FUNC_V }, - { "v", OFFLOAD_FUNC_V }, - { "kqv", OFFLOAD_FUNC_V }, - { "kqv_merged", OFFLOAD_FUNC_V }, - { "kqv_merged_cont", OFFLOAD_FUNC_V }, - { "kqv_wo", OFFLOAD_FUNC_V }, - { "kqv_out", OFFLOAD_FUNC_V }, + { "wqkv", OFFLOAD_FUNC_KQV }, + { "bqkv", OFFLOAD_FUNC_KQV }, + { "wqkv_clamped", OFFLOAD_FUNC_KQV }, + + { "tmpk", OFFLOAD_FUNC_KQV }, + { "tmpq", OFFLOAD_FUNC_KQV }, + { "tmpv", OFFLOAD_FUNC_KQV }, + { "Kcur", OFFLOAD_FUNC_KQV }, + { "Qcur", OFFLOAD_FUNC_KQV }, + { "Vcur", OFFLOAD_FUNC_KQV }, + + { "krot", OFFLOAD_FUNC_KQV }, + { "qrot", OFFLOAD_FUNC_KQV }, + { "kpass", OFFLOAD_FUNC_KQV }, + { "qpass", OFFLOAD_FUNC_KQV }, + { "krotated", OFFLOAD_FUNC_KQV }, + { "qrotated", OFFLOAD_FUNC_KQV }, + + { "q", OFFLOAD_FUNC_KQV }, + { "k", OFFLOAD_FUNC_KQV }, + { "kq", OFFLOAD_FUNC_KQV }, + { "kq_scaled", OFFLOAD_FUNC_KQV }, + { "kq_scaled_alibi", OFFLOAD_FUNC_KQV }, + { "kq_masked", OFFLOAD_FUNC_KQV }, + { "kq_soft_max", OFFLOAD_FUNC_KQV }, + { "kq_soft_max_ext", OFFLOAD_FUNC_KQV }, + { "v", OFFLOAD_FUNC_KQV }, + { "kqv", OFFLOAD_FUNC_KQV }, + { "kqv_merged", OFFLOAD_FUNC_KQV }, + { "kqv_merged_cont", OFFLOAD_FUNC_KQV }, + { "kqv_wo", OFFLOAD_FUNC_KQV }, + { "kqv_out", OFFLOAD_FUNC_KQV }, { "ffn_inp", OFFLOAD_FUNC }, { "ffn_norm", OFFLOAD_FUNC }, @@ -4903,15 +5634,15 @@ static struct lm_ggml_cgraph * llama_build_graph( { OFFLOAD_FUNC_NOP, "CPU" }, { OFFLOAD_FUNC_OUT, "CPU" }, #ifdef LM_GGML_USE_CUBLAS - { OFFLOAD_FUNC, "GPU (CUDA)" }, - { OFFLOAD_FUNC_KQ, "GPU (CUDA) KQ" }, - { OFFLOAD_FUNC_V, "GPU (CUDA) V" }, - { OFFLOAD_FUNC_NR, "GPU (CUDA) NR" }, + { OFFLOAD_FUNC, "GPU (CUDA)" }, + { OFFLOAD_FUNC_FRC, "GPU (CUDA) FRC" }, + { OFFLOAD_FUNC_KQV, "GPU (CUDA) KQV" }, + { OFFLOAD_FUNC_NR, "GPU (CUDA) NR" }, { OFFLOAD_FUNC_EMB, "GPU (CUDA) EMB" }, #else { OFFLOAD_FUNC, "CPU" }, - { OFFLOAD_FUNC_KQ, "CPU" }, - { OFFLOAD_FUNC_V, "CPU" }, + { OFFLOAD_FUNC_FRC, "CPU" }, + { OFFLOAD_FUNC_KQV, "CPU" }, { OFFLOAD_FUNC_NR, "CPU" }, { OFFLOAD_FUNC_EMB, "CPU" }, #endif // LM_GGML_USE_CUBLAS @@ -4944,18 +5675,23 @@ static struct lm_ggml_cgraph * llama_build_graph( } } break; - case OFFLOAD_FUNC_NR: - if (n_gpu_layers <= n_layer + 0) { + case OFFLOAD_FUNC_FRC: + if (!lctx.cparams.offload_kqv) { func_e = OFFLOAD_FUNC_NOP; - } - break; - case OFFLOAD_FUNC_V: - if (n_gpu_layers <= n_layer + 1) { + } break; + case OFFLOAD_FUNC_KQV: + if (!lctx.cparams.offload_kqv) { func_e = OFFLOAD_FUNC_NOP; + } else { + if (n_gpu_layers < n_layer) { + if (il < i_gpu_start) { + 
func_e = OFFLOAD_FUNC_NOP; + } + } } break; - case OFFLOAD_FUNC_KQ: - if (n_gpu_layers <= n_layer + 2) { + case OFFLOAD_FUNC_NR: + if (n_gpu_layers <= n_layer + 0) { func_e = OFFLOAD_FUNC_NOP; } break; @@ -4980,8 +5716,8 @@ static struct lm_ggml_cgraph * llama_build_graph( case OFFLOAD_FUNC_NOP: case OFFLOAD_FUNC_OUT: func = lm_ggml_offload_nop; break; case OFFLOAD_FUNC: - case OFFLOAD_FUNC_KQ: - case OFFLOAD_FUNC_V: + case OFFLOAD_FUNC_KQV: + case OFFLOAD_FUNC_FRC: case OFFLOAD_FUNC_NR: case OFFLOAD_FUNC_EMB: func = lm_ggml_offload_gpu; break; default: LM_GGML_ASSERT(false); @@ -5036,6 +5772,14 @@ static struct lm_ggml_cgraph * llama_build_graph( { result = llm.build_mpt(); } break; + case LLM_ARCH_STABLELM: + { + result = llm.build_stablelm(); + } break; + case LLM_ARCH_QWEN: + { + result = llm.build_qwen(); + } break; default: LM_GGML_ASSERT(false); } @@ -5145,6 +5889,12 @@ static int llama_decode_internal( batch.seq_id = seq_id_arr.data(); } + // if we have enough unused cells before the current head -> + // better to start searching from the beginning of the cache, hoping to fill it + if (kv_self.head > kv_self.used + 2*n_tokens) { + kv_self.head = 0; + } + if (!llama_kv_cache_find_slot(kv_self, batch)) { return 1; } @@ -5152,10 +5902,10 @@ static int llama_decode_internal( // a heuristic, to avoid attending the full cache if it is not yet utilized // after enough generations, the benefit from this heuristic disappears // if we start defragmenting the cache, the benefit from this will be more important - //kv_self.n = std::max(32, LM_GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)); // TODO: this might be better for CUDA? - kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self))); + kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, LM_GGML_PAD(llama_kv_cache_cell_max(kv_self), 32))); + //kv_self.n = llama_kv_cache_cell_max(kv_self); - //printf("kv_self.n = %d\n", kv_self.n); + //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); lm_ggml_allocr_reset(lctx.alloc); @@ -5204,17 +5954,8 @@ static int llama_decode_internal( n_threads = std::min(4, n_threads); } - // If all tensors can be run on the GPU then using more than 1 thread is detrimental. - const bool full_offload_supported = - model.arch == LLM_ARCH_LLAMA || - model.arch == LLM_ARCH_BAICHUAN || - model.arch == LLM_ARCH_FALCON || - model.arch == LLM_ARCH_REFACT || - model.arch == LLM_ARCH_MPT || - model.arch == LLM_ARCH_STARCODER; - - const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3; - if (lm_ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) { + const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 1; + if (lm_ggml_cpu_has_cublas() && fully_offloaded) { n_threads = 1; } @@ -6003,7 +6744,10 @@ static std::vector llama_tokenize_internal(const llama_vocab & // by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer // and passing 'add space prefix' as bool argument // - auto raw_text = (special ? 
"" : " ") + fragment.raw_text.substr(fragment.offset, fragment.length); + auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); + if (&fragment == &fragment_buffer.front()) { + raw_text = " " + raw_text; // prefix with space if the first token is not special + } #ifdef PRETOKENIZERDEBUG fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); @@ -6069,11 +6813,13 @@ struct llama_grammar_candidate { // Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`. static std::pair, llama_partial_utf8> decode_utf8( - const char * src, + const std::string & src, llama_partial_utf8 partial_start) { static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 }; - const char * pos = src; + const char * pos = src.c_str(); std::vector code_points; + // common english strings have the same number of codepoints and bytes. `+ 1` for the terminating 0. + code_points.reserve(src.size() + 1); uint32_t value = partial_start.value; int n_remain = partial_start.n_remain; @@ -6677,6 +7423,7 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c // Replace the data in candidates with the new_candidates data std::copy(new_candidates.begin(), new_candidates.end(), candidates->data); candidates->size = new_candidates.size(); + candidates->sorted = false; if (ctx) { ctx->t_sample_us += lm_ggml_time_us() - t_start_sample_us; @@ -6761,7 +7508,9 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c const llama_token eos = llama_token_eos(&ctx->model); std::vector, llama_partial_utf8>> candidates_decoded; + candidates_decoded.reserve(candidates->size); std::vector candidates_grammar; + candidates_grammar.reserve(candidates->size); for (size_t i = 0; i < candidates->size; ++i) { const llama_token id = candidates->data[i].id; @@ -6773,7 +7522,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c } else if (piece.empty() || piece[0] == 0) { candidates->data[i].logit = -INFINITY; } else { - candidates_decoded.push_back(decode_utf8(piece.c_str(), grammar->partial_utf8)); + candidates_decoded.push_back(decode_utf8(piece, grammar->partial_utf8)); candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second }); } } @@ -6980,7 +7729,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar const std::string piece = llama_token_to_piece(ctx, token); // Note terminating 0 in decoded string - const auto decoded = decode_utf8(piece.c_str(), grammar->partial_utf8); + const auto decoded = decode_utf8(piece, grammar->partial_utf8); const auto & code_points = decoded.first; for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) { grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it); @@ -7298,18 +8047,21 @@ static void llama_convert_tensor_internal( return; } - auto block_size = tensor->type == LM_GGML_TYPE_F16 ? 1 : (size_t)lm_ggml_blck_size(tensor->type); - auto block_size_bytes = lm_ggml_type_size(tensor->type); + size_t block_size = tensor->type == LM_GGML_TYPE_F16 ? 
1 : (size_t)lm_ggml_blck_size(tensor->type); + size_t block_size_bytes = lm_ggml_type_size(tensor->type); LM_GGML_ASSERT(nelements % block_size == 0); - auto nblocks = nelements / block_size; - auto blocks_per_thread = nblocks / nthread; - auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count + size_t nblocks = nelements / block_size; + size_t blocks_per_thread = nblocks / nthread; + size_t spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count - for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) { - auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread - auto thr_elems = thr_blocks * block_size; // number of elements for this thread - auto thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread + size_t in_buff_offs = 0; + size_t out_buff_offs = 0; + + for (int tnum = 0; tnum < nthread; tnum++) { + size_t thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread + size_t thr_elems = thr_blocks * block_size; // number of elements for this thread + size_t thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread auto compute = [qtype] (lm_ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) { if (typ == LM_GGML_TYPE_F16) { @@ -7479,7 +8231,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s constexpr bool use_mmap = false; #endif - llama_model_loader ml(fname_inp, use_mmap); + llama_model_loader ml(fname_inp, use_mmap, NULL); if (ml.use_mmap) { ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, lm_ggml_is_numa())); } @@ -7655,7 +8407,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s workers.clear(); } - LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", lm_ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0); + LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB | hist: ", lm_ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0); int64_t tot_count = 0; for (size_t i = 0; i < hist_cur.size(); i++) { hist_all[i] += hist_cur[i]; @@ -7775,7 +8527,7 @@ static int llama_apply_lora_from_file_internal( std::vector base_buf; if (path_base_model) { LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model); - ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true)); + ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ NULL)); size_t ctx_size; size_t mmapped_size; @@ -8003,6 +8755,7 @@ struct llama_model_params llama_model_default_params() { /*.tensor_split =*/ nullptr, /*.progress_callback =*/ nullptr, /*.progress_callback_user_data =*/ nullptr, + /*.kv_overrides =*/ nullptr, /*.vocab_only =*/ false, /*.use_mmap =*/ true, /*.use_mlock =*/ false, @@ -8030,10 +8783,12 @@ struct llama_context_params llama_context_default_params() { /*.yarn_beta_fast =*/ 32.0f, /*.yarn_beta_slow =*/ 1.0f, /*.yarn_orig_ctx =*/ 0, + /*.type_k =*/ LM_GGML_TYPE_F16, + /*.type_v =*/ LM_GGML_TYPE_F16, /*.mul_mat_q =*/ true, - /*.f16_kv =*/ true, /*.logits_all =*/ false, /*.embedding =*/ false, + /*.offload_kqv =*/ true, }; return result; @@ -8150,6 +8905,7 @@ struct llama_context * llama_new_context_with_model( cparams.yarn_beta_fast = params.yarn_beta_fast; cparams.yarn_beta_slow = params.yarn_beta_slow; cparams.mul_mat_q = params.mul_mat_q; + cparams.offload_kqv = 
params.offload_kqv; cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; @@ -8183,19 +8939,36 @@ struct llama_context * llama_new_context_with_model( ctx->rng = std::mt19937(params.seed); ctx->logits_all = params.logits_all; - lm_ggml_type memory_type = params.f16_kv ? LM_GGML_TYPE_F16 : LM_GGML_TYPE_F32; + const lm_ggml_type type_k = params.type_k; + const lm_ggml_type type_v = params.type_v; + + LM_GGML_ASSERT(hparams.n_embd_head() % lm_ggml_blck_size(type_k) == 0); + LM_GGML_ASSERT(hparams.n_embd_head() % lm_ggml_blck_size(type_v) == 0); // reserve memory for context buffers if (!hparams.vocab_only) { - if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, cparams.n_ctx, model->n_gpu_layers)) { + if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, type_k, type_v, cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv)) { LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); llama_free(ctx); return nullptr; } { - const size_t memory_size = lm_ggml_nbytes(ctx->kv_self.k) + lm_ggml_nbytes(ctx->kv_self.v); - LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0); + size_t memory_size_k = 0; + size_t memory_size_v = 0; + + for (auto & k : ctx->kv_self.k_l) { + memory_size_k += lm_ggml_nbytes(k); + } + + for (auto & v : ctx->kv_self.v_l) { + memory_size_v += lm_ggml_nbytes(v); + } + + LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, + (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), + lm_ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f), + lm_ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); } // resized during inference @@ -8212,7 +8985,7 @@ struct llama_context * llama_new_context_with_model( { static const size_t tensor_alignment = 32; // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data - ctx->buf_compute.resize(lm_ggml_tensor_overhead()*LM_GGML_MAX_NODES + lm_ggml_graph_overhead()); + ctx->buf_compute.resize(lm_ggml_tensor_overhead()*LLAMA_MAX_NODES + lm_ggml_graph_overhead()); // create measure allocator ctx->alloc = lm_ggml_allocr_new_measure(tensor_alignment); @@ -8225,8 +8998,6 @@ struct llama_context * llama_new_context_with_model( #ifdef LM_GGML_USE_METAL if (model->n_gpu_layers > 0) { - lm_ggml_metal_log_set_callback(llama_log_callback_default, NULL); - ctx->ctx_metal = lm_ggml_metal_init(1); if (!ctx->ctx_metal) { LLAMA_LOG_ERROR("%s: lm_ggml_metal_init() failed\n", __func__); @@ -8240,7 +9011,7 @@ struct llama_context * llama_new_context_with_model( // measure memory requirements for the graph size_t alloc_size = lm_ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment; - LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0); + LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0); // recreate allocator with exact memory requirements lm_ggml_allocr_free(ctx->alloc); @@ -8254,7 +9025,7 @@ struct llama_context * llama_new_context_with_model( #endif #ifdef LM_GGML_USE_CUBLAS lm_ggml_cuda_set_scratch_size(alloc_size); - LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0); + LLAMA_LOG_INFO("%s: VRAM scratch 
buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0); // calculate total VRAM usage auto add_tensor = [](const lm_ggml_tensor * t, size_t & size) { @@ -8268,16 +9039,20 @@ struct llama_context * llama_new_context_with_model( } size_t kv_vram_size = 0; - add_tensor(ctx->kv_self.k, kv_vram_size); - add_tensor(ctx->kv_self.v, kv_vram_size); + for (auto & k : ctx->kv_self.k_l) { + add_tensor(k, kv_vram_size); + } + for (auto & v : ctx->kv_self.v_l) { + add_tensor(v, kv_vram_size); + } size_t ctx_vram_size = alloc_size + kv_vram_size; size_t total_vram_size = model_vram_size + ctx_vram_size; - LLAMA_LOG_INFO("%s: total VRAM used: %.2f MB (model: %.2f MB, context: %.2f MB)\n", __func__, + LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__, total_vram_size / 1024.0 / 1024.0, model_vram_size / 1024.0 / 1024.0, - ctx_vram_size / 1024.0 / 1024.0); + ctx_vram_size / 1024.0 / 1024.0); #endif } @@ -8298,7 +9073,7 @@ struct llama_context * llama_new_context_with_model( const size_t max_size = lm_ggml_get_max_tensor_size(ctx->model.ctx); - LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0); + LLAMA_LOG_INFO("%s: max tensor size = %8.2f MiB\n", __func__, max_size/1024.0/1024.0); #define LLAMA_METAL_CHECK_BUF(result) \ if (!(result)) { \ @@ -8364,6 +9139,45 @@ float llama_rope_freq_scale_train(const struct llama_model * model) { return model->hparams.rope_freq_scale_train; } +int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) { + const auto & it = model->lm_gguf_kv.find(key); + if (it == model->lm_gguf_kv.end()) { + if (buf_size > 0) { + buf[0] = '\0'; + } + return -1; + } + return snprintf(buf, buf_size, "%s", it->second.c_str()); +} + +int llama_model_meta_count(const struct llama_model * model) { + return (int)model->lm_gguf_kv.size(); +} + +int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) { + if (i < 0 || i >= (int)model->lm_gguf_kv.size()) { + if (buf_size > 0) { + buf[0] = '\0'; + } + return -1; + } + auto it = model->lm_gguf_kv.begin(); + std::advance(it, i); + return snprintf(buf, buf_size, "%s", it->first.c_str()); +} + +int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) { + if (i < 0 || i >= (int)model->lm_gguf_kv.size()) { + if (buf_size > 0) { + buf[0] = '\0'; + } + return -1; + } + auto it = model->lm_gguf_kv.begin(); + std::advance(it, i); + return snprintf(buf, buf_size, "%s", it->second.c_str()); +} + int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) { return snprintf(buf, buf_size, "%s %s %s", llama_model_arch_name(model->arch).c_str(), @@ -8422,8 +9236,107 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha } } +struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq) { + struct llama_kv_cache_view result = { + /*.n_cells = */ 0, + /*.n_max_seq = */ n_max_seq, + /*.token_count = */ 0, + /*.used_cells = */ llama_get_kv_cache_used_cells(ctx), + /*.max_contiguous = */ 0, + /*.max_contiguous_idx = */ -1, + /*.cells = */ nullptr, + /*.cells_sequences = */ nullptr, + }; + return result; +} + +void llama_kv_cache_view_free(struct llama_kv_cache_view * view) { + if (view->cells != nullptr) { + free(view->cells); + view->cells = nullptr; + } + if (view->cells_sequences != nullptr) { + free(view->cells_sequences); + 
view->cells_sequences = nullptr; + } +} + +void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view) { + if (uint32_t(view->n_cells) < ctx->kv_self.size || view->cells == nullptr) { + view->n_cells = int32_t(ctx->kv_self.size); + void * p = realloc(view->cells, sizeof(struct llama_kv_cache_view_cell) * view->n_cells); + LM_GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells"); + view->cells = (struct llama_kv_cache_view_cell *)p; + p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_max_seq * view->n_cells); + LM_GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences"); + view->cells_sequences = (llama_seq_id *)p; + } + + const std::vector & kv_cells = ctx->kv_self.cells; + llama_kv_cache_view_cell * c_curr = view->cells; + llama_seq_id * cs_curr = view->cells_sequences; + int32_t used_cells = 0; + int32_t token_count = 0; + int32_t curr_contig_idx = -1; + uint32_t max_contig = 0; + int32_t max_contig_idx = -1; + + for (int32_t i = 0; i < int32_t(ctx->kv_self.size); i++, c_curr++, cs_curr += view->n_max_seq) { + const size_t curr_size = kv_cells[i].seq_id.size(); + token_count += curr_size; + c_curr->pos = kv_cells[i].pos + kv_cells[i].delta; + + if (curr_size > 0) { + if (curr_contig_idx >= 0 && uint32_t(i - curr_contig_idx) > max_contig) { + max_contig = i - curr_contig_idx; + max_contig_idx = curr_contig_idx; + } + curr_contig_idx = -1; + } else if (curr_contig_idx < 0) { + curr_contig_idx = i; + } + + int seq_idx = 0; + for (const llama_seq_id it : kv_cells[i].seq_id) { + if (seq_idx >= view->n_max_seq) { + break; + } + cs_curr[seq_idx] = it; + seq_idx++; + } + if (seq_idx != 0) { + used_cells++; + } + for (; seq_idx < view->n_max_seq; seq_idx++) { + cs_curr[seq_idx] = -1; + } + } + if (curr_contig_idx >= 0 && kv_cells.size() - curr_contig_idx > max_contig) { + max_contig_idx = curr_contig_idx; + max_contig = kv_cells.size() - curr_contig_idx; + } + view->max_contiguous = max_contig; + view->max_contiguous_idx = max_contig_idx; + view->token_count = token_count; + view->used_cells = used_cells; + if (uint32_t(used_cells) != ctx->kv_self.used) { + LLAMA_LOG_ERROR("%s: used cells mismatch. 
kv_cache says %d but we calculated %d\n", + __func__, ctx->kv_self.used, used_cells); + } +} + int llama_get_kv_cache_token_count(const struct llama_context * ctx) { - return ctx->kv_self.head; + int result = 0; + + for (uint32_t i = 0; i < ctx->kv_self.size; i++) { + result += ctx->kv_self.cells[i].seq_id.size(); + } + + return result; +} + +int llama_get_kv_cache_used_cells(const struct llama_context * ctx) { + return ctx->kv_self.used; } void llama_kv_cache_clear(struct llama_context * ctx) { @@ -8593,43 +9506,53 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat const size_t kv_buf_size = kv_self.buf.size; const uint32_t kv_head = kv_self.head; const uint32_t kv_size = kv_self.size; + const uint32_t kv_used = kv_self.used; data_ctx->write(&kv_buf_size, sizeof(kv_buf_size)); data_ctx->write(&kv_head, sizeof(kv_head)); data_ctx->write(&kv_size, sizeof(kv_size)); + data_ctx->write(&kv_used, sizeof(kv_used)); if (kv_buf_size) { - const size_t elt_size = lm_ggml_element_size(kv_self.k); + const size_t elt_size = lm_ggml_element_size(kv_self.k_l[0]); - lm_ggml_context * cpy_ctx = lm_ggml_init({ 4096, NULL, /* no_alloc */ true }); - lm_ggml_cgraph gf{}; + lm_ggml_context * cpy_ctx = lm_ggml_init({ 6*n_layer*lm_ggml_tensor_overhead() + lm_ggml_graph_overhead(), NULL, /* no_alloc */ true }); + lm_ggml_cgraph * gf = lm_ggml_new_graph(cpy_ctx); - lm_ggml_tensor * kout3d = lm_ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer); - std::vector kout3d_data(lm_ggml_nbytes(kout3d), 0); - kout3d->data = kout3d_data.data(); + std::vector> kout2d_data(n_layer); + std::vector> vout2d_data(n_layer); - lm_ggml_tensor * vout3d = lm_ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer); - std::vector vout3d_data(lm_ggml_nbytes(vout3d), 0); - vout3d->data = vout3d_data.data(); + for (int il = 0; il < (int) n_layer; ++il) { + lm_ggml_tensor * kout2d = lm_ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head); + kout2d_data[il].resize(lm_ggml_nbytes(kout2d)); + kout2d->data = kout2d_data[il].data(); - lm_ggml_tensor * k3d = lm_ggml_view_3d(cpy_ctx, kv_self.k, - n_embd, kv_head, n_layer, - elt_size*n_embd, elt_size*n_embd*n_ctx, 0); + lm_ggml_tensor * vout2d = lm_ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd); + vout2d_data[il].resize(lm_ggml_nbytes(vout2d)); + vout2d->data = vout2d_data[il].data(); - lm_ggml_tensor * v3d = lm_ggml_view_3d(cpy_ctx, kv_self.v, - kv_head, n_embd, n_layer, - elt_size*n_ctx, elt_size*n_ctx*n_embd, 0); + lm_ggml_tensor * k2d = lm_ggml_view_2d(cpy_ctx, kv_self.k_l[il], + n_embd, kv_head, + elt_size*n_embd, 0); - lm_ggml_build_forward_expand(&gf, lm_ggml_cpy(cpy_ctx, k3d, kout3d)); - lm_ggml_build_forward_expand(&gf, lm_ggml_cpy(cpy_ctx, v3d, vout3d)); - lm_ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1); + lm_ggml_tensor * v2d = lm_ggml_view_2d(cpy_ctx, kv_self.v_l[il], + kv_head, n_embd, + elt_size*n_ctx, 0); + + lm_ggml_build_forward_expand(gf, lm_ggml_cpy(cpy_ctx, k2d, kout2d)); + lm_ggml_build_forward_expand(gf, lm_ggml_cpy(cpy_ctx, v2d, vout2d)); + } + + lm_ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1); lm_ggml_free(cpy_ctx); - // our data is now in the kout3d_data and vout3d_data buffers + // our data is now in the kout2d_data and vout2d_data buffers // write them to file - data_ctx->write(kout3d_data.data(), kout3d_data.size()); - data_ctx->write(vout3d_data.data(), vout3d_data.size()); + for (uint32_t il = 0; il < n_layer; ++il) { + 
data_ctx->write(kout2d_data[il].data(), kout2d_data[il].size()); + data_ctx->write(vout2d_data[il].data(), vout2d_data[il].size()); + } } for (uint32_t i = 0; i < kv_size; ++i) { @@ -8719,44 +9642,50 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { size_t kv_buf_size; uint32_t kv_head; uint32_t kv_size; + uint32_t kv_used; memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size); memcpy(&kv_head, inp, sizeof(kv_head)); inp += sizeof(kv_head); memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size); + memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used); if (kv_buf_size) { LM_GGML_ASSERT(kv_self.buf.size == kv_buf_size); - const size_t elt_size = lm_ggml_element_size(kv_self.k); + const size_t elt_size = lm_ggml_element_size(kv_self.k_l[0]); - lm_ggml_context * cpy_ctx = lm_ggml_init({ 4096, NULL, /* no_alloc */ true }); - lm_ggml_cgraph gf{}; + lm_ggml_context * cpy_ctx = lm_ggml_init({ 6*n_layer*lm_ggml_tensor_overhead() + lm_ggml_graph_overhead(), NULL, /* no_alloc */ true }); + lm_ggml_cgraph * gf = lm_ggml_new_graph(cpy_ctx); - lm_ggml_tensor * kin3d = lm_ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer); - kin3d->data = (void *) inp; - inp += lm_ggml_nbytes(kin3d); + for (int il = 0; il < n_layer; ++il) { + lm_ggml_tensor * kin2d = lm_ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head); + kin2d->data = (void *) inp; + inp += lm_ggml_nbytes(kin2d); - lm_ggml_tensor * vin3d = lm_ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer); - vin3d->data = (void *) inp; - inp += lm_ggml_nbytes(vin3d); + lm_ggml_tensor * vin2d = lm_ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd); + vin2d->data = (void *) inp; + inp += lm_ggml_nbytes(vin2d); - lm_ggml_tensor * k3d = lm_ggml_view_3d(cpy_ctx, kv_self.k, - n_embd, kv_head, n_layer, - elt_size*n_embd, elt_size*n_embd*n_ctx, 0); + lm_ggml_tensor * k2d = lm_ggml_view_2d(cpy_ctx, kv_self.k_l[il], + n_embd, kv_head, + elt_size*n_embd, 0); - lm_ggml_tensor * v3d = lm_ggml_view_3d(cpy_ctx, kv_self.v, - kv_head, n_embd, n_layer, - elt_size*n_ctx, elt_size*n_ctx*n_embd, 0); + lm_ggml_tensor * v2d = lm_ggml_view_2d(cpy_ctx, kv_self.v_l[il], + kv_head, n_embd, + elt_size*n_ctx, 0); - lm_ggml_build_forward_expand(&gf, lm_ggml_cpy(cpy_ctx, kin3d, k3d)); - lm_ggml_build_forward_expand(&gf, lm_ggml_cpy(cpy_ctx, vin3d, v3d)); - lm_ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1); + lm_ggml_build_forward_expand(gf, lm_ggml_cpy(cpy_ctx, kin2d, k2d)); + lm_ggml_build_forward_expand(gf, lm_ggml_cpy(cpy_ctx, vin2d, v2d)); + } + + lm_ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1); lm_ggml_free(cpy_ctx); } ctx->kv_self.head = kv_head; ctx->kv_self.size = kv_size; + ctx->kv_self.used = kv_used; ctx->kv_self.cells.resize(kv_size); @@ -9005,6 +9934,14 @@ llama_token llama_token_nl(const struct llama_model * model) { return model->vocab.linefeed_id; } +int llama_add_bos_token(const struct llama_model * model) { + return model->vocab.special_add_bos; +} + +int llama_add_eos_token(const struct llama_model * model) { + return model->vocab.special_add_eos; +} + llama_token llama_token_prefix(const struct llama_model * model) { return model->vocab.special_prefix_id; } @@ -9211,6 +10148,9 @@ const std::vector> & llama_inter void llama_log_set(lm_ggml_log_callback log_callback, void * user_data) { g_state.log_callback = log_callback ? 
log_callback : llama_log_callback_default; g_state.log_callback_user_data = user_data; +#ifdef LM_GGML_USE_METAL + lm_ggml_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data); +#endif } static void llama_log_internal_v(lm_ggml_log_level level, const char * format, va_list args) { diff --git a/cpp/llama.h b/cpp/llama.h index d806999..df8ff8c 100644 --- a/cpp/llama.h +++ b/cpp/llama.h @@ -42,7 +42,7 @@ #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN -#define LLAMA_SESSION_VERSION 2 +#define LLAMA_SESSION_VERSION 3 #if defined(LM_GGML_USE_CUBLAS) || defined(LM_GGML_USE_CLBLAST) || defined(LM_GGML_USE_METAL) // Defined when llama.cpp is compiled with support for offloading model layers to GPU. @@ -158,6 +158,22 @@ extern "C" { llama_seq_id all_seq_id; // used if seq_id == NULL } llama_batch; + enum llama_model_kv_override_type { + LLAMA_KV_OVERRIDE_INT, + LLAMA_KV_OVERRIDE_FLOAT, + LLAMA_KV_OVERRIDE_BOOL, + }; + + struct llama_model_kv_override { + char key[128]; + enum llama_model_kv_override_type tag; + union { + int64_t int_value; + double float_value; + bool bool_value; + }; + }; + struct llama_model_params { int32_t n_gpu_layers; // number of layers to store in VRAM int32_t main_gpu; // the GPU that is used for scratch and small tensors @@ -165,9 +181,13 @@ extern "C" { // called with a progress value between 0 and 1, pass NULL to disable llama_progress_callback progress_callback; + // context pointer passed to the progress callback void * progress_callback_user_data; + // override key-value pairs of the model meta data + const struct llama_model_kv_override * kv_overrides; + // Keep the booleans together to avoid misalignment during copy-by-value. bool vocab_only; // only load the vocabulary, no weights bool use_mmap; // use mmap if possible @@ -185,17 +205,20 @@ extern "C" { // ref: https://github.com/ggerganov/llama.cpp/pull/2054 float rope_freq_base; // RoPE base frequency, 0 = from model float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model - float yarn_ext_factor; // YaRN extrapolation mix factor, NaN = from model + float yarn_ext_factor; // YaRN extrapolation mix factor, negative = from model float yarn_attn_factor; // YaRN magnitude scaling factor float yarn_beta_fast; // YaRN low correction dim float yarn_beta_slow; // YaRN high correction dim uint32_t yarn_orig_ctx; // YaRN original context size + enum lm_ggml_type type_k; // data type for K cache + enum lm_ggml_type type_v; // data type for V cache + // Keep the booleans together to avoid misalignment during copy-by-value. 
- bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true) - bool f16_kv; // use fp16 for KV cache, fp32 otherwise - bool logits_all; // the llama_eval() call computes all logits, not just the last one - bool embedding; // embedding mode only + bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true) + bool logits_all; // the llama_eval() call computes all logits, not just the last one + bool embedding; // embedding mode only + bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU }; // model quantization parameters @@ -301,6 +324,23 @@ extern "C" { // Get the model's RoPE frequency scaling factor LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model); + // Functions to access the model's GGUF metadata scalar values + // - The functions return the length of the string on success, or -1 on failure + // - The output string is always null-terminated and cleared on failure + // - GGUF array values are not supported by these functions + + // Get metadata value as a string by key name + LLAMA_API int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size); + + // Get the number of metadata key/value pairs + LLAMA_API int llama_model_meta_count(const struct llama_model * model); + + // Get metadata key name by index + LLAMA_API int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size); + + // Get metadata value as a string by index + LLAMA_API int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size); + // Get a string describing the model type LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size); @@ -344,9 +384,60 @@ extern "C" { // KV cache // - // Returns the number of tokens in the KV cache - LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx), - "avoid using this, it will be removed in the future, instead - count the tokens in user code"); + // Information associated with an individual cell in the KV cache view. + struct llama_kv_cache_view_cell { + // The position for this cell. Takes KV cache shifts into account. + // May be negative if the cell is not populated. + llama_pos pos; + }; + + // An updateable view of the KV cache. + struct llama_kv_cache_view { + // Number of KV cache cells. This will be the same as the context size. + int32_t n_cells; + + // Maximum number of sequences that can exist in a cell. It's not an error + // if there are more sequences in a cell than this value, however they will + // not be visible in the view cells_sequences. + int32_t n_max_seq; + + // Number of tokens in the cache. For example, if there are two populated + // cells, the first with 1 sequence id in it and the second with 2 sequence + // ids then you'll have 3 tokens. + int32_t token_count; + + // Number of populated cache cells. + int32_t used_cells; + + // Maximum contiguous empty slots in the cache. + int32_t max_contiguous; + + // Index to the start of the max_contiguous slot range. Can be negative + // when cache is full. + int32_t max_contiguous_idx; + + // Information for an individual cell. + struct llama_kv_cache_view_cell * cells; + + // The sequences for each cell. There will be n_max_seq items per cell. + llama_seq_id * cells_sequences; + }; + + // Create an empty KV cache view. 
(use only for debugging purposes) + LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq); + + // Free a KV cache view. (use only for debugging purposes) + LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view); + + // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes) + LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view); + + // Returns the number of tokens in the KV cache (slow, use only for debug) + // If a KV cell has multiple sequences assigned to it, it will be counted multiple times + LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx); + + // Returns the number of used KV cells (i.e. have at least one sequence assigned to them) + LLAMA_API int llama_get_kv_cache_used_cells(const struct llama_context * ctx); // Clear the KV cache LLAMA_API void llama_kv_cache_clear( @@ -517,6 +608,12 @@ extern "C" { LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line + // Returns -1 if unknown, 1 for true or 0 for false. + LLAMA_API int llama_add_bos_token(const struct llama_model * model); + + // Returns -1 if unknown, 1 for true or 0 for false. + LLAMA_API int llama_add_eos_token(const struct llama_model * model); + // codellama infill tokens LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle diff --git a/cpp/sampling.cpp b/cpp/sampling.cpp index 1317024..f4e76df 100644 --- a/cpp/sampling.cpp +++ b/cpp/sampling.cpp @@ -99,6 +99,56 @@ std::string llama_sampling_print(const llama_sampling_params & params) { return std::string(result); } +std::string llama_sampling_order_print(const llama_sampling_params & params) { + std::string result = "CFG -> Penalties "; + if (params.mirostat == 0) { + for (auto s : params.samplers_sequence) { + switch (s) { + case 'k': result += "-> top_k "; break; + case 'f': result += "-> tfs_z "; break; + case 'y': result += "-> typical_p "; break; + case 'p': result += "-> top_p "; break; + case 'm': result += "-> min_p "; break; + case 't': result += "-> temp "; break; + default : break; + } + } + } else { + result += "-> mirostat "; + } + + return result; +} + +// no reasons to expose this function in header +static void sampler_queue( + struct llama_context * ctx_main, + const llama_sampling_params & params, + llama_token_data_array & cur_p, + size_t & min_keep) { + const int n_vocab = llama_n_vocab(llama_get_model(ctx_main)); + + const float temp = params.temp; + const int32_t top_k = params.top_k <= 0 ? 
n_vocab : params.top_k; + const float top_p = params.top_p; + const float min_p = params.min_p; + const float tfs_z = params.tfs_z; + const float typical_p = params.typical_p; + const std::string & samplers_sequence = params.samplers_sequence; + + for (auto s : samplers_sequence) { + switch (s){ + case 'k': llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); break; + case 'f': llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); break; + case 'y': llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break; + case 'p': llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break; + case 'm': llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break; + case 't': llama_sample_temp (ctx_main, &cur_p, temp); break; + default : break; + } + } +} + llama_token llama_sampling_sample( struct llama_sampling_context * ctx_sampling, struct llama_context * ctx_main, @@ -109,11 +159,6 @@ llama_token llama_sampling_sample( const int n_vocab = llama_n_vocab(llama_get_model(ctx_main)); const float temp = params.temp; - const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k; - const float top_p = params.top_p; - const float min_p = params.min_p; - const float tfs_z = params.tfs_z; - const float typical_p = params.typical_p; const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n; const float penalty_repeat = params.penalty_repeat; const float penalty_freq = params.penalty_freq; @@ -188,12 +233,7 @@ llama_token llama_sampling_sample( // temperature sampling size_t min_keep = std::max(1, params.n_probs); - llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); - llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); - llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); - llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); - llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); - llama_sample_temp (ctx_main, &cur_p, temp); + sampler_queue(ctx_main, params, cur_p, min_keep); id = llama_sample_token(ctx_main, &cur_p); diff --git a/cpp/sampling.h b/cpp/sampling.h index 7c9b8dc..fdfa9ee 100644 --- a/cpp/sampling.h +++ b/cpp/sampling.h @@ -10,22 +10,23 @@ // sampling parameters typedef struct llama_sampling_params { - int32_t n_prev = 64; // number of previous tokens to remember - int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. - int32_t top_k = 40; // <= 0 to use vocab size - float top_p = 0.95f; // 1.0 = disabled - float min_p = 0.05f; // 0.0 = disabled - float tfs_z = 1.00f; // 1.0 = disabled - float typical_p = 1.00f; // 1.0 = disabled - float temp = 0.80f; // 1.0 = disabled - int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) - float penalty_repeat = 1.10f; // 1.0 = disabled - float penalty_freq = 0.00f; // 0.0 = disabled - float penalty_present = 0.00f; // 0.0 = disabled - int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 - float mirostat_tau = 5.00f; // target entropy - float mirostat_eta = 0.10f; // learning rate - bool penalize_nl = true; // consider newlines as a repeatable token + int32_t n_prev = 64; // number of previous tokens to remember + int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. 
+ int32_t top_k = 40; // <= 0 to use vocab size + float top_p = 0.95f; // 1.0 = disabled + float min_p = 0.05f; // 0.0 = disabled + float tfs_z = 1.00f; // 1.0 = disabled + float typical_p = 1.00f; // 1.0 = disabled + float temp = 0.80f; // 1.0 = disabled + int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) + float penalty_repeat = 1.10f; // 1.0 = disabled + float penalty_freq = 0.00f; // 0.0 = disabled + float penalty_present = 0.00f; // 0.0 = disabled + int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 + float mirostat_tau = 5.00f; // target entropy + float mirostat_eta = 0.10f; // learning rate + bool penalize_nl = true; // consider newlines as a repeatable token + std::string samplers_sequence = "kfypmt"; // top_k, tail_free, typical_p, top_p, min_p, temp std::string grammar; // optional BNF-like grammar to constrain sampling @@ -80,6 +81,9 @@ std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama // Print sampling parameters into a string std::string llama_sampling_print(const llama_sampling_params & params); +// Print sampling order into a string +std::string llama_sampling_order_print(const llama_sampling_params & params); + // this is a common sampling function used across the examples for convenience // it can serve as a starting point for implementing your own sampling function // Note: When using multiple sequences, it is the caller's responsibility to call diff --git a/example/ios/Podfile.lock b/example/ios/Podfile.lock index 8bf07c5..3758c71 100644 --- a/example/ios/Podfile.lock +++ b/example/ios/Podfile.lock @@ -8,7 +8,7 @@ PODS: - hermes-engine/Pre-built (= 0.72.3) - hermes-engine/Pre-built (0.72.3) - libevent (2.1.12) - - llama-rn (0.3.0-rc.4): + - llama-rn (0.3.0-rc.5): - RCT-Folly - RCTRequired - RCTTypeSafety @@ -1242,7 +1242,7 @@ SPEC CHECKSUMS: glog: 04b94705f318337d7ead9e6d17c019bd9b1f6b1b hermes-engine: 10fbd3f62405c41ea07e71973ea61e1878d07322 libevent: 4049cae6c81cdb3654a443be001fb9bdceff7913 - llama-rn: 0abcf4f4f58615499974d3a30876ddf8ca986012 + llama-rn: 0a0f4d56e8c2ca348c77847cd18709330314042a RCT-Folly: 424b8c9a7a0b9ab2886ffe9c3b041ef628fd4fb1 RCTRequired: a2faf4bad4e438ca37b2040cb8f7799baa065c18 RCTTypeSafety: cb09f3e4747b6d18331a15eb05271de7441ca0b3 diff --git a/ios/RNLlamaContext.mm b/ios/RNLlamaContext.mm index f9c6035..55adae2 100644 --- a/ios/RNLlamaContext.mm +++ b/ios/RNLlamaContext.mm @@ -55,7 +55,6 @@ + (instancetype)initWithParams:(NSDictionary *)params { } if (params[@"n_batch"]) defaultParams.n_batch = [params[@"n_batch"] intValue]; if (params[@"use_mmap"]) defaultParams.use_mmap = [params[@"use_mmap"] boolValue]; - if (params[@"memory_f16"]) defaultParams.memory_f16 = [params[@"memory_f16"] boolValue]; if (params[@"lora"]) { float lora_scaled = 1.0f; diff --git a/llama.cpp b/llama.cpp index a75fa57..8a7b2fa 160000 --- a/llama.cpp +++ b/llama.cpp @@ -1 +1 @@ -Subproject commit a75fa576abba9d37f463580c379e4bbf1e1ad03c +Subproject commit 8a7b2fa528f130631a5f43648481596ab320ed5a diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index 7fd9bc8..8d2a965 100755 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -12,6 +12,7 @@ cp ./llama.cpp/ggml-alloc.h ./cpp/ggml-alloc.h cp ./llama.cpp/ggml-alloc.c ./cpp/ggml-alloc.c cp ./llama.cpp/ggml-backend.h ./cpp/ggml-backend.h cp ./llama.cpp/ggml-backend.c ./cpp/ggml-backend.c +cp ./llama.cpp/ggml-backend-impl.h ./cpp/ggml-backend-impl.h cp ./llama.cpp/ggml-impl.h ./cpp/ggml-impl.h cp ./llama.cpp/llama.h 
./cpp/llama.h cp ./llama.cpp/llama.cpp ./cpp/llama.cpp @@ -41,6 +42,7 @@ files=( "./cpp/ggml-alloc.c" "./cpp/ggml-backend.h" "./cpp/ggml-backend.c" + "./cpp/ggml-backend-impl.h" "./cpp/ggml-impl.h" ) diff --git a/scripts/common.cpp.patch b/scripts/common.cpp.patch index 782ccbe..b1bfd37 100644 --- a/scripts/common.cpp.patch +++ b/scripts/common.cpp.patch @@ -1,6 +1,6 @@ ---- common.cpp.orig 2023-11-07 10:50:44 -+++ common.cpp 2023-11-07 10:50:46 -@@ -1225,8 +1225,6 @@ +--- common.cpp.orig 2023-12-12 10:50:18 ++++ common.cpp 2023-12-12 10:50:19 +@@ -1385,8 +1385,6 @@ const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc) { const llama_sampling_params & sparams = params.sparams; diff --git a/scripts/ggml-metal.m.patch b/scripts/ggml-metal.m.patch index c160468..368b05e 100644 --- a/scripts/ggml-metal.m.patch +++ b/scripts/ggml-metal.m.patch @@ -1,11 +1,11 @@ ---- ggml-metal.m.orig 2023-11-02 10:42:43 -+++ ggml-metal.m 2023-11-02 10:43:38 -@@ -209,7 +209,7 @@ - } else { - LM_GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__); - -- NSString * sourcePath = [bundle pathForResource:@"ggml-metal" ofType:@"metal"]; -+ NSString * sourcePath = [bundle pathForResource:@"ggml-metal-llama" ofType:@"metal"]; +--- ggml-metal.m.orig 2023-12-12 10:46:04 ++++ ggml-metal.m 2023-12-12 10:46:43 +@@ -241,7 +241,7 @@ + if (ggmlMetalPathResources) { + sourcePath = [ggmlMetalPathResources stringByAppendingPathComponent:@"ggml-metal.metal"]; + } else { +- sourcePath = [bundle pathForResource:@"ggml-metal" ofType:@"metal"]; ++ sourcePath = [bundle pathForResource:@"ggml-metal-llama" ofType:@"metal"]; + } if (sourcePath == nil) { LM_GGML_METAL_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__); - sourcePath = @"ggml-metal.metal"; diff --git a/scripts/llama.cpp.patch b/scripts/llama.cpp.patch index 649b186..f88e643 100644 --- a/scripts/llama.cpp.patch +++ b/scripts/llama.cpp.patch @@ -1,9 +1,9 @@ ---- llama.cpp.orig 2023-11-10 13:36:00 -+++ llama.cpp 2023-11-10 13:36:02 -@@ -103,6 +103,17 @@ +--- llama.cpp.orig 2023-12-12 10:46:04 ++++ llama.cpp 2023-12-12 10:46:05 +@@ -105,6 +105,17 @@ #define LLAMA_LOG_WARN(...) llama_log_internal(LM_GGML_LOG_LEVEL_WARN , __VA_ARGS__) #define LLAMA_LOG_ERROR(...) llama_log_internal(LM_GGML_LOG_LEVEL_ERROR, __VA_ARGS__) - + +#if defined(__ANDROID__) && defined(RNLLAMA_ANDROID_ENABLE_LOGGING) +#include +#define LLAMA_ANDROID_TAG "RNLLAMA_LOG_ANDROID" @@ -18,8 +18,8 @@ // // helpers // -@@ -779,16 +790,16 @@ - +@@ -863,16 +874,16 @@ + if (prefetch > 0) { // Advise the kernel to preload the mapped memory - if (posix_madvise(addr, std::min(file->size, prefetch), POSIX_MADV_WILLNEED)) { diff --git a/src/NativeRNLlama.ts b/src/NativeRNLlama.ts index 6cb88e6..d1b1a75 100644 --- a/src/NativeRNLlama.ts +++ b/src/NativeRNLlama.ts @@ -16,8 +16,6 @@ export type NativeContextParams = { use_mlock?: boolean use_mmap?: boolean - memory_f16?: boolean - lora?: string // lora_adaptor lora_scaled?: number lora_base?: string
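For reference, a minimal sketch of how the reworked llama_context_params from cpp/llama.h are used once this diff is applied: the K and V caches now take independent lm_ggml types, and offload_kqv decides whether the cache and the KQV ops stay on the GPU (it is the same flag consulted by the new OFFLOAD_FUNC_FRC / OFFLOAD_FUNC_KQV cases in llama_build_graph). The quantized K-cache type and the context size below are illustrative choices only, constrained by the n_embd_head() % lm_ggml_blck_size(type_k) == 0 assert added in llama_new_context_with_model.

```cpp
// Sketch only: creating a context with the new per-cache KV types.
// Assumes a llama_model * loaded elsewhere and the vendored cpp/llama.h header.
#include "llama.h"

llama_context * make_context(llama_model * model) {
    llama_context_params cparams = llama_context_default_params();

    cparams.n_ctx       = 2048;              // arbitrary context size for this example
    cparams.type_k      = LM_GGML_TYPE_Q8_0; // illustrative quantized K cache; must satisfy the block-size assert
    cparams.type_v      = LM_GGML_TYPE_F16;  // default V cache type
    cparams.offload_kqv = true;              // keep the KV cache and KQV ops on the GPU when layers are offloaded

    return llama_new_context_with_model(model, cparams);
}
```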
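A similar sketch for the new GGUF metadata accessors declared in cpp/llama.h; the "general.architecture" key is only an example of a metadata key and is not something this patch guarantees to exist.

```cpp
// Sketch only: enumerating model metadata with the new llama_model_meta_* functions.
// Assumes a loaded llama_model * model.
#include <cstdio>
#include "llama.h"

void print_model_metadata(const llama_model * model) {
    char key[256];
    char val[256];

    const int n_kv = llama_model_meta_count(model);
    for (int i = 0; i < n_kv; ++i) {
        // both calls return the string length on success, -1 on failure,
        // and always null-terminate the output buffer
        if (llama_model_meta_key_by_index(model, i, key, sizeof(key)) < 0) continue;
        if (llama_model_meta_val_str_by_index(model, i, val, sizeof(val)) < 0) continue;
        printf("%s = %s\n", key, val);
    }

    // direct lookup by key name (example key)
    if (llama_model_meta_val_str(model, "general.architecture", val, sizeof(val)) >= 0) {
        printf("architecture: %s\n", val);
    }
}
```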
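The debug-only KV cache view added above follows an init / update / free pattern; a minimal sketch, assuming an existing llama_context and an arbitrary n_max_seq of 4:

```cpp
// Sketch only: inspecting the KV cache with the new view helpers (debug use, per the header comments).
#include <cstdio>
#include "llama.h"

void dump_kv_cache_stats(const llama_context * ctx) {
    llama_kv_cache_view view = llama_kv_cache_view_init(ctx, /*n_max_seq*/ 4);
    llama_kv_cache_view_update(ctx, &view); // allocates cells / cells_sequences on first use

    printf("cells: %d, used: %d, tokens: %d, longest free run: %d starting at %d\n",
           view.n_cells, view.used_cells, view.token_count,
           view.max_contiguous, view.max_contiguous_idx);

    // token_count counts a populated cell once per sequence id, while
    // llama_get_kv_cache_used_cells() reports cells with at least one sequence
    printf("tokens: %d, used cells: %d\n",
           llama_get_kv_cache_token_count(ctx),
           llama_get_kv_cache_used_cells(ctx));

    llama_kv_cache_view_free(&view);
}
```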
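Finally, a sketch of the new sampler-order control in cpp/sampling.h: samplers_sequence maps the letters k, f, y, p, m, t to top_k, tail_free, typical_p, top_p, min_p and temp (see sampler_queue above), and letters left out of the string are simply not applied. The shortened "mkt" sequence here is only an example.

```cpp
// Sketch only: customizing the sampler order via llama_sampling_params::samplers_sequence.
#include <cstdio>
#include "sampling.h"

void configure_sampling(llama_sampling_params & sparams) {
    sparams.top_k             = 40;
    sparams.min_p             = 0.05f;
    sparams.temp              = 0.80f;
    sparams.samplers_sequence = "mkt"; // min_p -> top_k -> temp

    // llama_sampling_order_print() reports the effective order, e.g. for startup logging
    printf("%s\n", llama_sampling_order_print(sparams).c_str());
}
```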