diff --git a/android/src/main/java/com/rnllama/LlamaContext.java b/android/src/main/java/com/rnllama/LlamaContext.java index bab6803..4cdb55a 100644 --- a/android/src/main/java/com/rnllama/LlamaContext.java +++ b/android/src/main/java/com/rnllama/LlamaContext.java @@ -52,8 +52,6 @@ public LlamaContext(int id, ReactApplicationContext reactContext, ReadableMap pa params.hasKey("use_mlock") ? params.getBoolean("use_mlock") : true, // boolean use_mmap, params.hasKey("use_mmap") ? params.getBoolean("use_mmap") : true, - // boolean memory_f16, - params.hasKey("memory_f16") ? params.getBoolean("memory_f16") : true, // String lora, params.hasKey("lora") ? params.getString("lora") : "", // float lora_scaled, @@ -285,7 +283,6 @@ protected static native long initContext( int n_gpu_layers, // TODO: Support this boolean use_mlock, boolean use_mmap, - boolean memory_f16, String lora, float lora_scaled, String lora_base, diff --git a/android/src/main/jni.cpp b/android/src/main/jni.cpp index bd06d1f..414ce02 100644 --- a/android/src/main/jni.cpp +++ b/android/src/main/jni.cpp @@ -129,7 +129,6 @@ Java_com_rnllama_LlamaContext_initContext( jint n_gpu_layers, // TODO: Support this jboolean use_mlock, jboolean use_mmap, - jboolean memory_f16, jstring lora_str, jfloat lora_scaled, jstring lora_base_str, @@ -158,8 +157,6 @@ Java_com_rnllama_LlamaContext_initContext( defaultParams.use_mlock = use_mlock; defaultParams.use_mmap = use_mmap; - defaultParams.memory_f16 = memory_f16; - const char *lora_chars = env->GetStringUTFChars(lora_str, nullptr); const char *lora_base_chars = env->GetStringUTFChars(lora_base_str, nullptr); if (lora_chars) { diff --git a/cpp/common.cpp b/cpp/common.cpp index 252af3f..dcde2be 100644 --- a/cpp/common.cpp +++ b/cpp/common.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -277,8 +278,18 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } params.yarn_beta_slow = std::stof(argv[i]); - } else if (arg == "--memory-f32") { - params.memory_f16 = false; + } else if (arg == "--samplers") { + if (++i >= argc) { + invalid_param = true; + break; + } + sparams.samplers_sequence = parse_samplers_input(argv[i]); + } else if (arg == "--sampling-seq") { + if (++i >= argc) { + invalid_param = true; + break; + } + sparams.samplers_sequence = argv[i]; } else if (arg == "--top-p") { if (++i >= argc) { invalid_param = true; @@ -491,8 +502,18 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { params.interactive_first = true; } else if (arg == "-ins" || arg == "--instruct") { params.instruct = true; + } else if (arg == "-cml" || arg == "--chatml") { + params.chatml = true; } else if (arg == "--infill") { params.infill = true; + } else if (arg == "-dkvc" || arg == "--dump-kv-cache") { + params.dump_kv_cache = true; + } else if (arg == "-nkvo" || arg == "--no-kv-offload") { + params.no_kv_offload = true; + } else if (arg == "-ctk" || arg == "--cache-type-k") { + params.cache_type_k = argv[++i]; + } else if (arg == "-ctv" || arg == "--cache-type-v") { + params.cache_type_v = argv[++i]; } else if (arg == "--multiline-input") { params.multiline_input = true; } else if (arg == "--simple-io") { @@ -673,6 +694,47 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { std::istreambuf_iterator(), std::back_inserter(sparams.grammar) ); + } else if (arg == "--override-kv") { + if (++i >= argc) { + invalid_param = true; + break; + } + char * sep = strchr(argv[i], '='); + if (sep == nullptr || sep - argv[i] 
>= 128) { + fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]); + invalid_param = true; + break; + } + struct llama_model_kv_override kvo; + std::strncpy(kvo.key, argv[i], sep - argv[i]); + kvo.key[sep - argv[i]] = 0; + sep++; + if (strncmp(sep, "int:", 4) == 0) { + sep += 4; + kvo.tag = LLAMA_KV_OVERRIDE_INT; + kvo.int_value = std::atol(sep); + } else if (strncmp(sep, "float:", 6) == 0) { + sep += 6; + kvo.tag = LLAMA_KV_OVERRIDE_FLOAT; + kvo.float_value = std::atof(sep); + } else if (strncmp(sep, "bool:", 5) == 0) { + sep += 5; + kvo.tag = LLAMA_KV_OVERRIDE_BOOL; + if (std::strcmp(sep, "true") == 0) { + kvo.bool_value = true; + } else if (std::strcmp(sep, "false") == 0) { + kvo.bool_value = false; + } else { + fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]); + invalid_param = true; + break; + } + } else { + fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]); + invalid_param = true; + break; + } + params.kv_overrides.push_back(kvo); #ifndef LOG_DISABLE_LOGS // Parse args for logging parameters } else if ( log_param_single_parse( argv[i] ) ) { @@ -716,6 +778,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { } } + if (!params.kv_overrides.empty()) { + params.kv_overrides.emplace_back(llama_model_kv_override()); + params.kv_overrides.back().key[0] = 0; + } + return true; } @@ -730,6 +797,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" -i, --interactive run in interactive mode\n"); printf(" --interactive-first run in interactive mode and wait for input right away\n"); printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n"); + printf(" -cml, --chatml run in chatml mode (use with ChatML-compatible models)\n"); printf(" --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n"); printf(" -r PROMPT, --reverse-prompt PROMPT\n"); printf(" halt generation at PROMPT, return control in interactive mode\n"); @@ -755,6 +823,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict); printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx); printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); + printf(" --samplers samplers that will be used for generation in the order, separated by \';\', for example: \"top_k;tfs;typical;top_p;min_p;temp\"\n"); + printf(" --sampling-seq simplified sequence for samplers that will be used (default: %s)\n", sparams.samplers_sequence.c_str()); printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k); printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p); printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p); @@ -792,8 +862,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast); printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); printf(" --no-penalize-nl do not penalize newline token\n"); - printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); - printf(" not recommended: doubles context memory required and no 
measurable increase in quality\n"); printf(" --temp N temperature (default: %.1f)\n", (double)sparams.temp); printf(" --logits-all return logits for all tokens in the batch (default: disabled)\n"); printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n"); @@ -832,6 +900,14 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { #endif // LM_GGML_USE_CUBLAS #endif printf(" --verbose-prompt print prompt before generation\n"); + printf(" -dkvc, --dump-kv-cache\n"); + printf(" verbose print of the KV cache\n"); + printf(" -nkvo, --no-kv-offload\n"); + printf(" disable KV offload\n"); + printf(" -ctk TYPE, --cache-type-k TYPE\n"); + printf(" KV cache data type for K (default: %s)\n", params.cache_type_k.c_str()); + printf(" -ctv TYPE, --cache-type-v TYPE\n"); + printf(" KV cache data type for V (default: %s)\n", params.cache_type_v.c_str()); printf(" --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n"); printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n"); @@ -842,6 +918,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" draft model for speculative decoding (default: %s)\n", params.model.c_str()); printf(" -ld LOGDIR, --logdir LOGDIR\n"); printf(" path under which to save YAML logs (no logging if unset)\n"); + printf(" --override-kv KEY=TYPE:VALUE\n"); + printf(" advanced option to override model metadata by key. may be specified multiple times.\n"); + printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n"); printf("\n"); #ifndef LOG_DISABLE_LOGS log_print_usage(); @@ -878,6 +957,48 @@ std::string gpt_random_prompt(std::mt19937 & rng) { LM_GGML_UNREACHABLE(); } +// +// String parsing +// + +std::string parse_samplers_input(std::string input) { + std::string output = ""; + // since samplers names are written multiple ways + // make it ready for both system names and input names + std::unordered_map samplers_symbols { + {"top_k", 'k'}, + {"top-k", 'k'}, + {"top_p", 'p'}, + {"top-p", 'p'}, + {"nucleus", 'p'}, + {"typical_p", 'y'}, + {"typical-p", 'y'}, + {"typical", 'y'}, + {"min_p", 'm'}, + {"min-p", 'm'}, + {"tfs_z", 'f'}, + {"tfs-z", 'f'}, + {"tfs", 'f'}, + {"temp", 't'}, + {"temperature",'t'} + }; + // expected format example: "temp;top_k;tfs_z;typical_p;top_p;min_p" + size_t separator = input.find(';'); + while (separator != input.npos) { + std::string name = input.substr(0,separator); + input = input.substr(separator+1); + separator = input.find(';'); + + if (samplers_symbols.find(name) != samplers_symbols.end()) { + output += samplers_symbols[name]; + } + } + if (samplers_symbols.find(input) != samplers_symbols.end()) { + output += samplers_symbols[input]; + } + return output; +} + // // Model utils // @@ -892,10 +1013,39 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & mparams.tensor_split = params.tensor_split; mparams.use_mmap = params.use_mmap; mparams.use_mlock = params.use_mlock; + if (params.kv_overrides.empty()) { + mparams.kv_overrides = NULL; + } else { + LM_GGML_ASSERT(params.kv_overrides.back().key[0] == 0 && "KV overrides not terminated with empty key"); + mparams.kv_overrides = params.kv_overrides.data(); + } return mparams; } +static lm_ggml_type kv_cache_type_from_str(const std::string & s) { + if (s == "f16") { + return LM_GGML_TYPE_F16; + } 
+ if (s == "q8_0") { + return LM_GGML_TYPE_Q8_0; + } + if (s == "q4_0") { + return LM_GGML_TYPE_Q4_0; + } + if (s == "q4_1") { + return LM_GGML_TYPE_Q4_1; + } + if (s == "q5_0") { + return LM_GGML_TYPE_Q5_0; + } + if (s == "q5_1") { + return LM_GGML_TYPE_Q5_1; + } + + throw std::runtime_error("Invalid cache type: " + s); +} + struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) { auto cparams = llama_context_default_params(); @@ -905,7 +1055,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; cparams.mul_mat_q = params.mul_mat_q; cparams.seed = params.seed; - cparams.f16_kv = params.memory_f16; cparams.logits_all = params.logits_all; cparams.embedding = params.embedding; cparams.rope_scaling_type = params.rope_scaling_type; @@ -916,6 +1065,10 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param cparams.yarn_beta_fast = params.yarn_beta_fast; cparams.yarn_beta_slow = params.yarn_beta_slow; cparams.yarn_orig_ctx = params.yarn_orig_ctx; + cparams.offload_kqv = !params.no_kv_offload; + + cparams.type_k = kv_cache_type_from_str(params.cache_type_k); + cparams.type_v = kv_cache_type_from_str(params.cache_type_v); return cparams; } @@ -931,7 +1084,7 @@ void llama_batch_add( const std::vector & seq_ids, bool logits) { batch.token [batch.n_tokens] = id; - batch.pos [batch.n_tokens] = pos, + batch.pos [batch.n_tokens] = pos; batch.n_seq_id[batch.n_tokens] = seq_ids.size(); for (size_t i = 0; i < seq_ids.size(); ++i) { batch.seq_id[batch.n_tokens][i] = seq_ids[i]; @@ -1072,6 +1225,12 @@ std::string llama_detokenize_bpe(llama_context * ctx, const std::vector= 0) { seq_count++; } + } + putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]); + } + + printf("\n=== Done dumping\n"); +} + +void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) { + static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; + + printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n", + view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx); + + std::unordered_map seqs; + llama_kv_cache_view_cell * c_curr = view.cells; + llama_seq_id * cs_curr = view.cells_sequences; + + for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) { + for (int j = 0; j < view.n_max_seq; j++) { + if (cs_curr[j] < 0) { continue; } + if (seqs.find(cs_curr[j]) == seqs.end()) { + if (seqs.size() + 1 >= sizeof(slot_chars)) { break; } + seqs[cs_curr[j]] = seqs.size(); + } + } + if (seqs.size() + 1 >= sizeof(slot_chars)) { break; } + } + + printf("=== Sequence legend: "); + for (const auto & it : seqs) { + printf("%zu=%d, ", it.second, it.first); + } + printf("'+'=other sequence ids"); + + c_curr = view.cells; + cs_curr = view.cells_sequences; + for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) { + if (i % row_size == 0) { + printf("\n%5d: ", i); + } + for (int j = 0; j < view.n_max_seq; j++) { + if (cs_curr[j] >= 0) { + const auto & it = seqs.find(cs_curr[j]); + putchar(it != seqs.end() ? 
int(slot_chars[it->second]) : '+'); + } else { + putchar('.'); + } + } + putchar(' '); + } + + printf("\n=== Done dumping\n"); +} diff --git a/cpp/common.h b/cpp/common.h index dd6b002..e87ce11 100644 --- a/cpp/common.h +++ b/cpp/common.h @@ -86,6 +86,8 @@ struct gpt_params { std::vector antiprompt; // string upon seeing which more user input is prompted std::string logdir = ""; // directory in which to save YAML log files + std::vector kv_overrides; + // TODO: avoid tuple, use struct std::vector> lora_adapter; // lora adapter path with user defined scale std::string lora_base = ""; // base model path for the lora adapter @@ -98,10 +100,10 @@ struct gpt_params { size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS - bool memory_f16 = true; // use f16 instead of f32 for memory kv bool random_prompt = false; // do not randomize prompt if none provided bool use_color = false; // use color to distinguish generations and inputs bool interactive = false; // interactive mode + bool chatml = false; // chatml mode (used for models trained on chatml syntax) bool prompt_cache_all = false; // save user input and generations to prompt cache bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it @@ -121,10 +123,15 @@ struct gpt_params { bool numa = false; // attempt optimizations that help on some NUMA systems bool verbose_prompt = false; // print prompt tokens before generation bool infill = false; // use infill mode + bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes + bool no_kv_offload = false; // disable KV offloading + + std::string cache_type_k = "f16"; // KV cache data type for the K + std::string cache_type_v = "f16"; // KV cache data type for the V // multimodal models (see examples/llava) std::string mmproj = ""; // path to multimodal projector - std::string image = ""; // path to an image file + std::string image = ""; // path to an image file }; bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params); @@ -139,6 +146,12 @@ std::string gpt_random_prompt(std::mt19937 & rng); void process_escapes(std::string& input); +// +// String parsing +// + +std::string parse_samplers_input(std::string input); + // // Model utils // @@ -200,6 +213,10 @@ std::string llama_detokenize_bpe( llama_context * ctx, const std::vector & tokens); +// Uses the value from the model metadata if possible, otherwise +// defaults to true when model type is SPM, otherwise false. +bool llama_should_add_bos_token(const llama_model * model); + // // YAML utils // @@ -213,3 +230,13 @@ std::string get_sortable_timestamp(); void dump_non_result_info_yaml( FILE * stream, const gpt_params & params, const llama_context * lctx, const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc); + +// +// KV cache utils +// + +// Dump the KV cache view with the number of sequences per cell. +void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80); + +// Dump the KV cache view showing individual sequences in each cell (long output). 
+void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40); diff --git a/cpp/ggml-alloc.c b/cpp/ggml-alloc.c index 2dbd6b5..ac293c2 100644 --- a/cpp/ggml-alloc.c +++ b/cpp/ggml-alloc.c @@ -1,51 +1,21 @@ #include "ggml-alloc.h" -#include "ggml-backend.h" +#include "ggml-backend-impl.h" #include "ggml.h" +#include "ggml-impl.h" #include +#include #include #include #include #include - -#define UNUSED(x) (void)(x) #define MAX(a, b) ((a) > (b) ? (a) : (b)) -#define LM_GGML_MAX_CONCUR (2*LM_GGML_MAX_NODES) +#define MAX_FREE_BLOCKS 256 //#define LM_GGML_ALLOCATOR_DEBUG -//#define AT_PRINTF printf -#define AT_PRINTF(...) ((void)0) - -struct hash_node { - struct lm_ggml_tensor * t; - int n_children; - int n_views; -}; - -static size_t hash(void * p) { - return (size_t)p % LM_GGML_GRAPH_HASHTABLE_SIZE; -} - -static struct hash_node * hash_get(struct hash_node hash_table[], struct lm_ggml_tensor * t) { - size_t h = hash(t); - - // linear probing - size_t i = h; - while (hash_table[i].t != NULL) { - if (hash_table[i].t == t) { - return &hash_table[i]; - } - i = (i + 1) % LM_GGML_GRAPH_HASHTABLE_SIZE; - if (i == h) { - // hash table is full - LM_GGML_ASSERT(false); - } - } - - hash_table[i].t = t; - return &hash_table[i]; -} +//#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__) +#define AT_PRINTF(...) // TODO: LM_GGML_PAD ? static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) { @@ -59,20 +29,18 @@ struct free_block { size_t size; }; -#define MAX_FREE_BLOCKS 256 - -struct lm_ggml_allocr { +struct lm_ggml_tallocr { struct lm_ggml_backend_buffer * buffer; bool buffer_owned; - void * data; + void * base; size_t alignment; + int n_free_blocks; struct free_block free_blocks[MAX_FREE_BLOCKS]; - struct hash_node hash_table[LM_GGML_GRAPH_HASHTABLE_SIZE]; + size_t max_size; + bool measure; - int parse_seq[LM_GGML_MAX_CONCUR]; - int parse_seq_len; #ifdef LM_GGML_ALLOCATOR_DEBUG struct lm_ggml_tensor * allocated_tensors[1024]; @@ -80,7 +48,7 @@ struct lm_ggml_allocr { }; #ifdef LM_GGML_ALLOCATOR_DEBUG -static void add_allocated_tensor(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor * tensor) { +static void add_allocated_tensor(lm_ggml_tallocr_t alloc, struct lm_ggml_tensor * tensor) { for (int i = 0; i < 1024; i++) { if (alloc->allocated_tensors[i] == NULL) { alloc->allocated_tensors[i] = tensor; @@ -89,7 +57,7 @@ static void add_allocated_tensor(struct lm_ggml_allocr * alloc, struct lm_ggml_t } LM_GGML_ASSERT(!"out of allocated_tensors"); } -static void remove_allocated_tensor(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor * tensor) { +static void remove_allocated_tensor(lm_ggml_tallocr_t alloc, struct lm_ggml_tensor * tensor) { for (int i = 0; i < 1024; i++) { if (alloc->allocated_tensors[i] == tensor || (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) { @@ -103,7 +71,7 @@ static void remove_allocated_tensor(struct lm_ggml_allocr * alloc, struct lm_ggm #endif // check if a tensor is allocated by this buffer -static bool lm_ggml_allocr_is_own(struct lm_ggml_allocr * alloc, const struct lm_ggml_tensor * tensor) { +static bool lm_ggml_tallocr_is_own(lm_ggml_tallocr_t alloc, const struct lm_ggml_tensor * tensor) { return tensor->buffer == alloc->buffer; } @@ -111,7 +79,7 @@ static bool lm_ggml_is_view(struct lm_ggml_tensor * t) { return t->view_src != NULL; } -void lm_ggml_allocr_alloc(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor * tensor) { +void lm_ggml_tallocr_alloc(lm_ggml_tallocr_t alloc, struct 
lm_ggml_tensor * tensor) { LM_GGML_ASSERT(!lm_ggml_is_view(tensor)); // views generally get data pointer from one of their sources LM_GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated @@ -162,13 +130,14 @@ void lm_ggml_allocr_alloc(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor * } tensor->data = addr; - AT_PRINTF("%s: allocated data at %p\n", __func__, tensor->data); tensor->buffer = alloc->buffer; - lm_ggml_backend_buffer_init_tensor(alloc->buffer, tensor); + if (!alloc->measure) { + lm_ggml_backend_buffer_init_tensor(alloc->buffer, tensor); + } #ifdef LM_GGML_ALLOCATOR_DEBUG add_allocated_tensor(alloc, tensor); - size_t cur_max = (char*)addr - (char*)alloc->data + size; + size_t cur_max = (char*)addr - (char*)alloc->base + size; if (cur_max > alloc->max_size) { printf("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0); for (int i = 0; i < 1024; i++) { @@ -180,16 +149,16 @@ void lm_ggml_allocr_alloc(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor * } #endif - alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->data + size); + alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->base + size); } // this is a very naive implementation, but for our case the number of free blocks should be very small -static void lm_ggml_allocr_free_tensor(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor * tensor) { - if (lm_ggml_allocr_is_own(alloc, tensor) == false) { +static void lm_ggml_tallocr_free_tensor(lm_ggml_tallocr_t alloc, struct lm_ggml_tensor * tensor) { + if (lm_ggml_tallocr_is_own(alloc, tensor) == false) { // the tensor was not allocated in this buffer // this can happen because the graph allocator will try to free weights and other tensors from different buffers // the easiest way to deal with this is just to ignore it - AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer); + // AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer); return; } @@ -199,8 +168,6 @@ static void lm_ggml_allocr_free_tensor(struct lm_ggml_allocr * alloc, struct lm_ size = aligned_offset(NULL, size, alloc->alignment); AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks); - lm_ggml_backend_buffer_free_tensor(alloc->buffer, tensor); - #ifdef LM_GGML_ALLOCATOR_DEBUG remove_allocated_tensor(alloc, tensor); #endif @@ -253,91 +220,180 @@ static void lm_ggml_allocr_free_tensor(struct lm_ggml_allocr * alloc, struct lm_ alloc->n_free_blocks++; } -void lm_ggml_allocr_set_parse_seq(struct lm_ggml_allocr * alloc, const int * list, int n) { - for (int i = 0; i < n; i++) { - alloc->parse_seq[i] = list[i]; - } - alloc->parse_seq_len = n; -} - -void lm_ggml_allocr_reset(struct lm_ggml_allocr * alloc) { +void lm_ggml_tallocr_reset(lm_ggml_tallocr_t alloc) { alloc->n_free_blocks = 1; - size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment); - alloc->free_blocks[0].addr = (char *)alloc->data + align_offset; - alloc->free_blocks[0].size = lm_ggml_backend_buffer_get_size(alloc->buffer) - align_offset; + size_t align_offset = aligned_offset(alloc->base, 0, alloc->alignment); + alloc->free_blocks[0].addr = (char *)alloc->base + align_offset; + + if (alloc->measure) { + alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows + } else { + 
alloc->free_blocks[0].size = lm_ggml_backend_buffer_get_size(alloc->buffer) - align_offset; + } } -struct lm_ggml_allocr * lm_ggml_allocr_new(void * data, size_t size, size_t alignment) { - struct lm_ggml_backend_buffer * buffer = lm_ggml_backend_cpu_buffer_from_ptr(NULL, data, size); +lm_ggml_tallocr_t lm_ggml_tallocr_new(void * data, size_t size, size_t alignment) { + struct lm_ggml_backend_buffer * buffer = lm_ggml_backend_cpu_buffer_from_ptr(data, size); - struct lm_ggml_allocr * alloc = (struct lm_ggml_allocr *)malloc(sizeof(struct lm_ggml_allocr)); + lm_ggml_tallocr_t alloc = (lm_ggml_tallocr_t)malloc(sizeof(struct lm_ggml_tallocr)); - *alloc = (struct lm_ggml_allocr){ + *alloc = (struct lm_ggml_tallocr) { /*.buffer = */ buffer, /*.buffer_owned = */ true, /*.base = */ lm_ggml_backend_buffer_get_base(buffer), /*.alignment = */ alignment, /*.n_free_blocks = */ 0, /*.free_blocks = */ {{0}}, - /*.hash_table = */ {{0}}, /*.max_size = */ 0, /*.measure = */ false, - /*.parse_seq = */ {0}, - /*.parse_seq_len = */ 0, #ifdef LM_GGML_ALLOCATOR_DEBUG /*.allocated_tensors = */ {0}, #endif }; - lm_ggml_allocr_reset(alloc); + lm_ggml_tallocr_reset(alloc); return alloc; } -struct lm_ggml_allocr * lm_ggml_allocr_new_measure(size_t alignment) { - struct lm_ggml_allocr * alloc = lm_ggml_allocr_new((void *)0x1000, (size_t)-0x1001, alignment); +lm_ggml_tallocr_t lm_ggml_tallocr_new_measure(size_t alignment) { + lm_ggml_tallocr_t alloc = lm_ggml_tallocr_new((void *)0x1000, SIZE_MAX/2, alignment); + alloc->measure = true; + + return alloc; +} + +lm_ggml_tallocr_t lm_ggml_tallocr_new_measure_from_backend(struct lm_ggml_backend * backend) { + // create a backend buffer to get the correct tensor allocation sizes + lm_ggml_backend_buffer_t buffer = lm_ggml_backend_alloc_buffer(backend, 1); + + // TODO: move alloc initialization to a common lm_ggml_tallocr_new_impl function + lm_ggml_tallocr_t alloc = lm_ggml_tallocr_new_from_buffer(buffer); + alloc->buffer_owned = true; alloc->measure = true; + lm_ggml_tallocr_reset(alloc); + return alloc; +} +lm_ggml_tallocr_t lm_ggml_tallocr_new_from_backend(struct lm_ggml_backend * backend, size_t size) { + lm_ggml_backend_buffer_t buffer = lm_ggml_backend_alloc_buffer(backend, size); + lm_ggml_tallocr_t alloc = lm_ggml_tallocr_new_from_buffer(buffer); + alloc->buffer_owned = true; return alloc; } -struct lm_ggml_allocr * lm_ggml_allocr_new_from_buffer(struct lm_ggml_backend_buffer * buffer) { - struct lm_ggml_allocr * alloc = (struct lm_ggml_allocr *)malloc(sizeof(struct lm_ggml_allocr)); +lm_ggml_tallocr_t lm_ggml_tallocr_new_from_buffer(struct lm_ggml_backend_buffer * buffer) { + lm_ggml_tallocr_t alloc = (lm_ggml_tallocr_t)malloc(sizeof(struct lm_ggml_tallocr)); - *alloc = (struct lm_ggml_allocr){ + *alloc = (struct lm_ggml_tallocr) { /*.buffer = */ buffer, /*.buffer_owned = */ false, /*.base = */ lm_ggml_backend_buffer_get_base(buffer), /*.alignment = */ lm_ggml_backend_buffer_get_alignment(buffer), /*.n_free_blocks = */ 0, /*.free_blocks = */ {{0}}, - /*.hash_table = */ {{0}}, /*.max_size = */ 0, /*.measure = */ false, - /*.parse_seq = */ {0}, - /*.parse_seq_len = */ 0, #ifdef LM_GGML_ALLOCATOR_DEBUG /*.allocated_tensors = */ {0}, #endif }; - lm_ggml_allocr_reset(alloc); + lm_ggml_tallocr_reset(alloc); return alloc; } -void lm_ggml_allocr_free(struct lm_ggml_allocr * alloc) { +struct lm_ggml_backend_buffer * lm_ggml_tallocr_get_buffer(lm_ggml_tallocr_t alloc) { + return alloc->buffer; +} + +void lm_ggml_tallocr_free(lm_ggml_tallocr_t alloc) { + if (alloc == NULL) { + 
return; + } + if (alloc->buffer_owned) { lm_ggml_backend_buffer_free(alloc->buffer); } free(alloc); } -bool lm_ggml_allocr_is_measure(struct lm_ggml_allocr * alloc) { +bool lm_ggml_tallocr_is_measure(lm_ggml_tallocr_t alloc) { return alloc->measure; } -//////////// compute graph allocator +size_t lm_ggml_tallocr_max_size(lm_ggml_tallocr_t alloc) { + return alloc->max_size; +} + +// graph allocator + +struct hash_node { + int n_children; + int n_views; +}; + +struct lm_ggml_gallocr { + lm_ggml_tallocr_t talloc; + struct lm_ggml_hash_set hash_set; + struct hash_node * hash_values; + size_t hash_values_size; + lm_ggml_tallocr_t * hash_allocs; + int * parse_seq; + int parse_seq_len; +}; + +lm_ggml_gallocr_t lm_ggml_gallocr_new(void) { + lm_ggml_gallocr_t galloc = (lm_ggml_gallocr_t)malloc(sizeof(struct lm_ggml_gallocr)); + + *galloc = (struct lm_ggml_gallocr) { + /*.talloc = */ NULL, + /*.hash_set = */ {0}, + /*.hash_values = */ NULL, + /*.hash_values_size = */ 0, + /*.hash_allocs = */ NULL, + /*.parse_seq = */ NULL, + /*.parse_seq_len = */ 0, + }; + + return galloc; +} + +void lm_ggml_gallocr_free(lm_ggml_gallocr_t galloc) { + if (galloc == NULL) { + return; + } + + if (galloc->hash_set.keys != NULL) { + free(galloc->hash_set.keys); + } + if (galloc->hash_values != NULL) { + free(galloc->hash_values); + } + if (galloc->hash_allocs != NULL) { + free(galloc->hash_allocs); + } + if (galloc->parse_seq != NULL) { + free(galloc->parse_seq); + } + free(galloc); +} + +void lm_ggml_gallocr_set_parse_seq(lm_ggml_gallocr_t galloc, const int * list, int n) { + free(galloc->parse_seq); + galloc->parse_seq = malloc(sizeof(int) * n); + + for (int i = 0; i < n; i++) { + galloc->parse_seq[i] = list[i]; + } + galloc->parse_seq_len = n; +} + +static struct hash_node * hash_get(lm_ggml_gallocr_t galloc, struct lm_ggml_tensor * t) { + size_t i = lm_ggml_hash_find_or_insert(galloc->hash_set, t); + return &galloc->hash_values[i]; +} static bool lm_ggml_are_same_layout(const struct lm_ggml_tensor * a, const struct lm_ggml_tensor * b) { if (a->type != b->type) { @@ -378,27 +434,39 @@ static bool lm_ggml_op_can_inplace(enum lm_ggml_op op) { } } -static void init_view(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor * view, bool update_backend) { - assert(view->view_src != NULL && view->view_src->data != NULL); +static lm_ggml_tallocr_t node_tallocr(lm_ggml_gallocr_t galloc, struct lm_ggml_tensor * node) { + if (galloc->talloc != NULL) { + return galloc->talloc; + } + + return galloc->hash_allocs[lm_ggml_hash_find_or_insert(galloc->hash_set, node)]; +} + +static void init_view(lm_ggml_gallocr_t galloc, struct lm_ggml_tensor * view, bool update_backend) { + lm_ggml_tallocr_t alloc = node_tallocr(galloc, view); + LM_GGML_ASSERT(view->view_src != NULL && view->view_src->data != NULL); if (update_backend) { view->backend = view->view_src->backend; } - view->buffer = view->view_src->buffer; view->data = (char *)view->view_src->data + view->view_offs; // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend // due to the lm_ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras - assert(lm_ggml_allocr_is_measure(alloc) || !view->buffer || view->buffer->backend == alloc->buffer->backend); - lm_ggml_backend_buffer_init_tensor(alloc->buffer, view); + assert(lm_ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft); + + if (!alloc->measure) { + lm_ggml_backend_buffer_init_tensor(alloc->buffer, view); + } } -static void 
allocate_node(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor * node) { - struct hash_node * ht = alloc->hash_table; +static void allocate_node(lm_ggml_gallocr_t galloc, struct lm_ggml_tensor * node) { + lm_ggml_tallocr_t alloc = node_tallocr(galloc, node); + if (node->data == NULL) { if (lm_ggml_is_view(node)) { - init_view(alloc, node, true); + init_view(galloc, node, true); } else { // see if we can reuse a parent's buffer (inplace) if (lm_ggml_op_can_inplace(node->op)) { @@ -409,16 +477,16 @@ static void allocate_node(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor * } // if the node's data is external, then we cannot re-use it - if (lm_ggml_allocr_is_own(alloc, parent) == false) { + if (lm_ggml_tallocr_is_own(alloc, parent) == false) { AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data); continue; } - struct hash_node * p_hn = hash_get(ht, parent); + struct hash_node * p_hn = hash_get(galloc, parent); if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && lm_ggml_are_same_layout(node, parent)) { if (lm_ggml_is_view(parent)) { struct lm_ggml_tensor * view_src = parent->view_src; - struct hash_node * view_src_hn = hash_get(ht, view_src); + struct hash_node * view_src_hn = hash_get(galloc, view_src); if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) { // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite // the parent's data that it will need later (same layout requirement). the problem is that then @@ -428,170 +496,307 @@ static void allocate_node(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor * AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name); node->view_src = view_src; view_src_hn->n_views += 1; - init_view(alloc, node, false); + init_view(galloc, node, false); return; } } else { AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name); node->view_src = parent; p_hn->n_views += 1; - init_view(alloc, node, false); + init_view(galloc, node, false); return; } } } } - lm_ggml_allocr_alloc(alloc, node); + lm_ggml_tallocr_alloc(alloc, node); } } } -size_t lm_ggml_allocr_alloc_graph_n( - struct lm_ggml_allocr * alloc, - struct lm_ggml_cgraph ** graphs, int n_graphs, - struct lm_ggml_tensor *** inputs, struct lm_ggml_tensor *** outputs) { +static void free_node(lm_ggml_gallocr_t galloc, struct lm_ggml_tensor * node) { + lm_ggml_tallocr_t alloc = node_tallocr(galloc, node); - // reset hash table - struct hash_node * ht = alloc->hash_table; - memset(ht, 0, sizeof(struct hash_node) * LM_GGML_GRAPH_HASHTABLE_SIZE); + lm_ggml_tallocr_free_tensor(alloc, node); +} + +static void lm_ggml_tallocr_alloc_graph_impl(lm_ggml_gallocr_t galloc, struct lm_ggml_cgraph * gf) { + const int * parse_seq = galloc->parse_seq; + int parse_seq_len = galloc->parse_seq_len; // count number of children and views - for (int g = 0; g < n_graphs; g++) { - struct lm_ggml_cgraph * gf = graphs[g]; - for (int i = 0; i < gf->n_nodes; i++) { + for (int i = 0; i < gf->n_nodes; i++) { + struct lm_ggml_tensor * node = gf->nodes[i]; + + if (lm_ggml_is_view(node)) { + struct lm_ggml_tensor * view_src = node->view_src; + hash_get(galloc, view_src)->n_views += 1; + if (node->buffer == NULL && node->data != NULL) { + // view of a pre-allocated tensor, didn't call init_view() yet + init_view(galloc, node, true); + } + } + + for (int j = 0; j < LM_GGML_MAX_SRC; j++) { + struct lm_ggml_tensor * parent = node->src[j]; + 
if (parent == NULL) { + break; + } + hash_get(galloc, parent)->n_children += 1; + if (lm_ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) { + init_view(galloc, parent, true); + } + } + } + + // allocate tensors + // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers + int last_barrier_pos = 0; + int n_nodes = parse_seq_len ? parse_seq_len : gf->n_nodes; + + for (int ind = 0; ind < n_nodes; ind++) { + // allocate a node if there is no parse_seq or this is not a barrier + if (parse_seq_len == 0 || parse_seq[ind] != -1) { + int i = parse_seq_len ? parse_seq[ind] : ind; struct lm_ggml_tensor * node = gf->nodes[i]; - if (lm_ggml_is_view(node)) { - struct lm_ggml_tensor * view_src = node->view_src; - hash_get(ht, view_src)->n_views += 1; - if (node->buffer == NULL && node->data != NULL) { - // view of a pre-allocated tensor, didn't call init_view() yet - init_view(alloc, node, true); + // allocate parents (leafs) + for (int j = 0; j < LM_GGML_MAX_SRC; j++) { + struct lm_ggml_tensor * parent = node->src[j]; + if (parent == NULL) { + break; } + allocate_node(galloc, parent); } + // allocate node + allocate_node(galloc, node); + + AT_PRINTF("exec: %s (%s) <= ", lm_ggml_op_name(node->op), node->name); for (int j = 0; j < LM_GGML_MAX_SRC; j++) { struct lm_ggml_tensor * parent = node->src[j]; if (parent == NULL) { break; } - hash_get(ht, parent)->n_children += 1; - if (lm_ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) { - init_view(alloc, parent, true); + AT_PRINTF("%s", parent->name); + if (j < LM_GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) { + AT_PRINTF(", "); } } + AT_PRINTF("\n"); } - } - // allocate tensors - for (int g = 0; g < n_graphs; g++) { - struct lm_ggml_cgraph * gf = graphs[g]; - AT_PRINTF("####### graph %d/%d\n", g, n_graphs); - // graph inputs are allocated first to ensure that they are not overwritten by each other - if (inputs != NULL && inputs[g] != NULL) { - for (int i = 0; inputs[g][i] != NULL; i++) { - struct lm_ggml_tensor * input = inputs[g][i]; - AT_PRINTF("input: %s\n", input->name); - allocate_node(alloc, input); - } - } - // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers - int last_barrier_pos = 0; - int n_nodes = alloc->parse_seq_len ? alloc->parse_seq_len : gf->n_nodes; + // update parents + // update immediately if there is no parse_seq + // update only at barriers if there is parse_seq + if ((parse_seq_len == 0) || parse_seq[ind] == -1) { + int update_start = parse_seq_len ? last_barrier_pos : ind; + int update_end = parse_seq_len ? ind : ind + 1; + for (int i = update_start; i < update_end; i++) { + int node_i = parse_seq_len ? parse_seq[i] : i; + struct lm_ggml_tensor * node = gf->nodes[node_i]; - for (int ind = 0; ind < n_nodes; ind++) { - // allocate a node if there is no parse_seq or this is not a barrier - if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] != -1) { - int i = alloc->parse_seq_len ? 
alloc->parse_seq[ind] : ind; - struct lm_ggml_tensor * node = gf->nodes[i]; - - // allocate parents (leafs) for (int j = 0; j < LM_GGML_MAX_SRC; j++) { struct lm_ggml_tensor * parent = node->src[j]; if (parent == NULL) { break; } - allocate_node(alloc, parent); - } + struct hash_node * p_hn = hash_get(galloc, parent); + p_hn->n_children -= 1; - // allocate node - allocate_node(alloc, node); + //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views); - AT_PRINTF("exec: %s (%s) <= ", lm_ggml_op_name(node->op), node->name); - for (int j = 0; j < LM_GGML_MAX_SRC; j++) { - struct lm_ggml_tensor * parent = node->src[j]; - if (parent == NULL) { - break; - } - AT_PRINTF("%s", parent->name); - if (j < LM_GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) { - AT_PRINTF(", "); - } - } - AT_PRINTF("\n"); - } - - // update parents - // update immediately if there is no parse_seq - // update only at barriers if there is parse_seq - if ((alloc->parse_seq_len == 0) || alloc->parse_seq[ind] == -1) { - int update_start = alloc->parse_seq_len ? last_barrier_pos : ind; - int update_end = alloc->parse_seq_len ? ind : ind + 1; - for (int i = update_start; i < update_end; i++) { - int node_i = alloc->parse_seq_len ? alloc->parse_seq[i] : i; - struct lm_ggml_tensor * node = gf->nodes[node_i]; - - for (int j = 0; j < LM_GGML_MAX_SRC; j++) { - struct lm_ggml_tensor * parent = node->src[j]; - if (parent == NULL) { - break; - } - struct hash_node * p_hn = hash_get(ht, parent); - p_hn->n_children -= 1; - - //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views); - - if (p_hn->n_children == 0 && p_hn->n_views == 0) { - if (lm_ggml_is_view(parent)) { - struct lm_ggml_tensor * view_src = parent->view_src; - struct hash_node * view_src_hn = hash_get(ht, view_src); - view_src_hn->n_views -= 1; - AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views); - if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) { - lm_ggml_allocr_free_tensor(alloc, view_src); - } - } - else { - if (parent->data != node->data) { - lm_ggml_allocr_free_tensor(alloc, parent); - } + if (p_hn->n_children == 0 && p_hn->n_views == 0) { + if (lm_ggml_is_view(parent)) { + struct lm_ggml_tensor * view_src = parent->view_src; + struct hash_node * view_src_hn = hash_get(galloc, view_src); + view_src_hn->n_views -= 1; + AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views); + if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0) { + free_node(galloc, view_src); } } + else { + free_node(galloc, parent); + } } } - AT_PRINTF("\n"); - if (alloc->parse_seq_len) { - last_barrier_pos = ind + 1; - } } - } - // free graph outputs here that wouldn't be freed otherwise because they have no children - if (outputs != NULL && outputs[g] != NULL) { - for (int i = 0; outputs[g][i] != NULL; i++) { - struct lm_ggml_tensor * output = outputs[g][i]; - AT_PRINTF("output: %s\n", output->name); - lm_ggml_allocr_free_tensor(alloc, output); + AT_PRINTF("\n"); + if (parse_seq_len) { + last_barrier_pos = ind + 1; } } } +} - return alloc->max_size; +size_t lm_ggml_gallocr_alloc_graph(lm_ggml_gallocr_t galloc, lm_ggml_tallocr_t talloc, struct lm_ggml_cgraph * graph) { + size_t hash_size = graph->visited_hash_table.size; + + // check if the hash table is initialized and large enough + if (galloc->hash_set.size < hash_size) { + if 
(galloc->hash_set.keys != NULL) { + free(galloc->hash_set.keys); + } + if (galloc->hash_values != NULL) { + free(galloc->hash_values); + } + galloc->hash_set.keys = malloc(sizeof(struct lm_ggml_tensor *) * hash_size); + galloc->hash_set.size = hash_size; + galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size); + } + + // reset hash table + memset(galloc->hash_set.keys, 0, sizeof(struct lm_ggml_tensor *) * hash_size); + memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size); + + galloc->talloc = talloc; + lm_ggml_tallocr_alloc_graph_impl(galloc, graph); + galloc->talloc = NULL; + + size_t max_size = lm_ggml_tallocr_max_size(talloc); + + return max_size; } -size_t lm_ggml_allocr_alloc_graph(struct lm_ggml_allocr * alloc, struct lm_ggml_cgraph * graph) { - return lm_ggml_allocr_alloc_graph_n(alloc, &graph, 1, NULL, NULL); +void lm_ggml_gallocr_alloc_graph_n(lm_ggml_gallocr_t galloc, struct lm_ggml_cgraph * graph, struct lm_ggml_hash_set hash_set, lm_ggml_tallocr_t * hash_node_talloc) { + const size_t hash_size = hash_set.size; + + LM_GGML_ASSERT(hash_size >= (size_t)(graph->n_nodes + graph->n_leafs)); + + galloc->talloc = NULL; + + // alloc hash_values if needed + if (galloc->hash_values == NULL || galloc->hash_values_size < hash_size) { + free(galloc->hash_values); + galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size); + galloc->hash_values_size = hash_size; + } + + // free hash_set.keys if needed + if (galloc->hash_set.keys != NULL) { + free(galloc->hash_set.keys); + } + galloc->hash_set = hash_set; + + // reset hash values + memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size); + + galloc->hash_allocs = hash_node_talloc; + + lm_ggml_tallocr_alloc_graph_impl(galloc, graph); + + // remove unowned resources + galloc->hash_set.keys = NULL; + galloc->hash_allocs = NULL; } -size_t lm_ggml_allocr_max_size(struct lm_ggml_allocr * alloc) { - return alloc->max_size; +// legacy API wrapper + +struct lm_ggml_allocr { + lm_ggml_tallocr_t talloc; + lm_ggml_gallocr_t galloc; +}; + +static lm_ggml_allocr_t lm_ggml_allocr_new_impl(lm_ggml_tallocr_t talloc) { + lm_ggml_allocr_t alloc = (lm_ggml_allocr_t)malloc(sizeof(struct lm_ggml_allocr)); + *alloc = (struct lm_ggml_allocr) { + /*.talloc = */ talloc, + /*.galloc = */ lm_ggml_gallocr_new(), + }; + return alloc; +} + +lm_ggml_allocr_t lm_ggml_allocr_new(void * data, size_t size, size_t alignment) { + return lm_ggml_allocr_new_impl(lm_ggml_tallocr_new(data, size, alignment)); +} + +lm_ggml_allocr_t lm_ggml_allocr_new_measure(size_t alignment) { + return lm_ggml_allocr_new_impl(lm_ggml_tallocr_new_measure(alignment)); +} + +lm_ggml_allocr_t lm_ggml_allocr_new_from_buffer(struct lm_ggml_backend_buffer * buffer) { + return lm_ggml_allocr_new_impl(lm_ggml_tallocr_new_from_buffer(buffer)); +} + +lm_ggml_allocr_t lm_ggml_allocr_new_from_backend(struct lm_ggml_backend * backend, size_t size) { + return lm_ggml_allocr_new_impl(lm_ggml_tallocr_new_from_backend(backend, size)); +} + +lm_ggml_allocr_t lm_ggml_allocr_new_measure_from_backend(struct lm_ggml_backend * backend) { + return lm_ggml_allocr_new_impl(lm_ggml_tallocr_new_measure_from_backend(backend)); +} + +struct lm_ggml_backend_buffer * lm_ggml_allocr_get_buffer(lm_ggml_allocr_t alloc) { + return lm_ggml_tallocr_get_buffer(alloc->talloc); +} + +void lm_ggml_allocr_set_parse_seq(lm_ggml_allocr_t alloc, const int * list, int n) { + lm_ggml_gallocr_set_parse_seq(alloc->galloc, list, n); +} + +void lm_ggml_allocr_free(lm_ggml_allocr_t alloc) { + 
lm_ggml_gallocr_free(alloc->galloc); + lm_ggml_tallocr_free(alloc->talloc); + free(alloc); +} + +bool lm_ggml_allocr_is_measure(lm_ggml_allocr_t alloc) { + return lm_ggml_tallocr_is_measure(alloc->talloc); +} + +void lm_ggml_allocr_reset(lm_ggml_allocr_t alloc) { + lm_ggml_tallocr_reset(alloc->talloc); +} + +void lm_ggml_allocr_alloc(lm_ggml_allocr_t alloc, struct lm_ggml_tensor * tensor) { + lm_ggml_tallocr_alloc(alloc->talloc, tensor); +} + +size_t lm_ggml_allocr_max_size(lm_ggml_allocr_t alloc) { + return lm_ggml_tallocr_max_size(alloc->talloc); +} + +size_t lm_ggml_allocr_alloc_graph(lm_ggml_allocr_t alloc, struct lm_ggml_cgraph * graph) { + return lm_ggml_gallocr_alloc_graph(alloc->galloc, alloc->talloc, graph); +} + +// utils +lm_ggml_backend_buffer_t lm_ggml_backend_alloc_ctx_tensors_from_buft(struct lm_ggml_context * ctx, lm_ggml_backend_buffer_type_t buft) { + LM_GGML_ASSERT(lm_ggml_get_no_alloc(ctx) == true); + + size_t alignment = lm_ggml_backend_buft_get_alignment(buft); + + size_t nbytes = 0; + for (struct lm_ggml_tensor * t = lm_ggml_get_first_tensor(ctx); t != NULL; t = lm_ggml_get_next_tensor(ctx, t)) { + if (t->data == NULL && t->view_src == NULL) { + nbytes += LM_GGML_PAD(lm_ggml_backend_buft_get_alloc_size(buft, t), alignment); + } + } + + if (nbytes == 0) { + fprintf(stderr, "%s: no tensors to allocate\n", __func__); + return NULL; + } + + lm_ggml_backend_buffer_t buffer = lm_ggml_backend_buft_alloc_buffer(buft, nbytes); + lm_ggml_tallocr_t tallocr = lm_ggml_tallocr_new_from_buffer(buffer); + + for (struct lm_ggml_tensor * t = lm_ggml_get_first_tensor(ctx); t != NULL; t = lm_ggml_get_next_tensor(ctx, t)) { + if (t->data == NULL) { + if (t->view_src == NULL) { + lm_ggml_tallocr_alloc(tallocr, t); + } else { + lm_ggml_backend_view_init(buffer, t); + } + } + } + + lm_ggml_tallocr_free(tallocr); + + return buffer; +} + +lm_ggml_backend_buffer_t lm_ggml_backend_alloc_ctx_tensors(struct lm_ggml_context * ctx, lm_ggml_backend_t backend) { + return lm_ggml_backend_alloc_ctx_tensors_from_buft(ctx, lm_ggml_backend_get_default_buffer_type(backend)); } diff --git a/cpp/ggml-alloc.h b/cpp/ggml-alloc.h index 565e0b1..8f5f880 100644 --- a/cpp/ggml-alloc.h +++ b/cpp/ggml-alloc.h @@ -6,27 +6,86 @@ extern "C" { #endif +struct lm_ggml_backend; struct lm_ggml_backend_buffer; +struct lm_ggml_backend_buffer_type; -LM_GGML_API struct lm_ggml_allocr * lm_ggml_allocr_new(void * data, size_t size, size_t alignment); -LM_GGML_API struct lm_ggml_allocr * lm_ggml_allocr_new_measure(size_t alignment); -LM_GGML_API struct lm_ggml_allocr * lm_ggml_allocr_new_from_buffer(struct lm_ggml_backend_buffer * buffer); +// +// Legacy API +// + +typedef struct lm_ggml_allocr * lm_ggml_allocr_t; + +// initialize allocator for use with CPU backend only +LM_GGML_API lm_ggml_allocr_t lm_ggml_allocr_new(void * data, size_t size, size_t alignment); +LM_GGML_API lm_ggml_allocr_t lm_ggml_allocr_new_measure(size_t alignment); + +// initialize allocator for use with ggml-backend +LM_GGML_API lm_ggml_allocr_t lm_ggml_allocr_new_from_buffer(struct lm_ggml_backend_buffer * buffer); +LM_GGML_API lm_ggml_allocr_t lm_ggml_allocr_new_from_backend(struct lm_ggml_backend * backend, size_t size); // allocates an owned buffer +LM_GGML_API lm_ggml_allocr_t lm_ggml_allocr_new_measure_from_backend(struct lm_ggml_backend * backend); + +LM_GGML_API struct lm_ggml_backend_buffer * lm_ggml_allocr_get_buffer(lm_ggml_allocr_t alloc); // tell the allocator to parse nodes following the order described in the list // you should call this if 
your graph are optimized to execute out-of-order -LM_GGML_API void lm_ggml_allocr_set_parse_seq(struct lm_ggml_allocr * alloc, const int * list, int n); - -LM_GGML_API void lm_ggml_allocr_free (struct lm_ggml_allocr * alloc); -LM_GGML_API bool lm_ggml_allocr_is_measure (struct lm_ggml_allocr * alloc); -LM_GGML_API void lm_ggml_allocr_reset (struct lm_ggml_allocr * alloc); -LM_GGML_API void lm_ggml_allocr_alloc (struct lm_ggml_allocr * alloc, struct lm_ggml_tensor * tensor); -LM_GGML_API size_t lm_ggml_allocr_alloc_graph(struct lm_ggml_allocr * alloc, struct lm_ggml_cgraph * graph); -LM_GGML_API size_t lm_ggml_allocr_max_size (struct lm_ggml_allocr * alloc); - -LM_GGML_API size_t lm_ggml_allocr_alloc_graph_n( - struct lm_ggml_allocr * alloc, - struct lm_ggml_cgraph ** graphs, int n_graphs, - struct lm_ggml_tensor *** inputs, struct lm_ggml_tensor *** outputs); +LM_GGML_API void lm_ggml_allocr_set_parse_seq(lm_ggml_allocr_t alloc, const int * list, int n); + +LM_GGML_API void lm_ggml_allocr_free (lm_ggml_allocr_t alloc); +LM_GGML_API bool lm_ggml_allocr_is_measure (lm_ggml_allocr_t alloc); +LM_GGML_API void lm_ggml_allocr_reset (lm_ggml_allocr_t alloc); +LM_GGML_API void lm_ggml_allocr_alloc (lm_ggml_allocr_t alloc, struct lm_ggml_tensor * tensor); +LM_GGML_API size_t lm_ggml_allocr_max_size (lm_ggml_allocr_t alloc); + +LM_GGML_API size_t lm_ggml_allocr_alloc_graph(lm_ggml_allocr_t alloc, struct lm_ggml_cgraph * graph); + +// +// ggml-backend v2 API +// + +// Seperate tensor and graph allocator objects +// This is necessary for multi-backend allocation because the graph allocator needs to use multiple tensor allocators +// The original API is kept as a wrapper around the new API + +// Tensor allocator +typedef struct lm_ggml_tallocr * lm_ggml_tallocr_t; + +LM_GGML_API lm_ggml_tallocr_t lm_ggml_tallocr_new(void * data, size_t size, size_t alignment); +LM_GGML_API lm_ggml_tallocr_t lm_ggml_tallocr_new_measure(size_t alignment); +LM_GGML_API lm_ggml_tallocr_t lm_ggml_tallocr_new_from_buffer(struct lm_ggml_backend_buffer * buffer); +LM_GGML_API lm_ggml_tallocr_t lm_ggml_tallocr_new_from_backend(struct lm_ggml_backend * backend, size_t size); // allocates an owned buffer +LM_GGML_API lm_ggml_tallocr_t lm_ggml_tallocr_new_measure_from_backend(struct lm_ggml_backend * backend); + +LM_GGML_API struct lm_ggml_backend_buffer * lm_ggml_tallocr_get_buffer(lm_ggml_tallocr_t talloc); + +LM_GGML_API void lm_ggml_tallocr_free (lm_ggml_tallocr_t talloc); +LM_GGML_API bool lm_ggml_tallocr_is_measure (lm_ggml_tallocr_t talloc); +LM_GGML_API void lm_ggml_tallocr_reset (lm_ggml_tallocr_t talloc); +LM_GGML_API void lm_ggml_tallocr_alloc (lm_ggml_tallocr_t talloc, struct lm_ggml_tensor * tensor); +LM_GGML_API size_t lm_ggml_tallocr_max_size (lm_ggml_tallocr_t talloc); + + +// Graph allocator +typedef struct lm_ggml_gallocr * lm_ggml_gallocr_t; + +LM_GGML_API lm_ggml_gallocr_t lm_ggml_gallocr_new(void); +LM_GGML_API void lm_ggml_gallocr_free(lm_ggml_gallocr_t galloc); + +LM_GGML_API void lm_ggml_gallocr_set_parse_seq(lm_ggml_gallocr_t galloc, const int * list, int n); +LM_GGML_API size_t lm_ggml_gallocr_alloc_graph(lm_ggml_gallocr_t galloc, lm_ggml_tallocr_t talloc, struct lm_ggml_cgraph * graph); + +// Allocate tensors from the allocators given by the hash table +LM_GGML_API void lm_ggml_gallocr_alloc_graph_n( + lm_ggml_gallocr_t galloc, + struct lm_ggml_cgraph * graph, + struct lm_ggml_hash_set hash_set, + lm_ggml_tallocr_t * hash_node_talloc); + + +// Utils +// Create a buffer and allocate all the tensors in a 
lm_ggml_context +LM_GGML_API struct lm_ggml_backend_buffer * lm_ggml_backend_alloc_ctx_tensors_from_buft(struct lm_ggml_context * ctx, struct lm_ggml_backend_buffer_type * buft); +LM_GGML_API struct lm_ggml_backend_buffer * lm_ggml_backend_alloc_ctx_tensors(struct lm_ggml_context * ctx, struct lm_ggml_backend * backend); #ifdef __cplusplus } diff --git a/cpp/ggml-backend-impl.h b/cpp/ggml-backend-impl.h new file mode 100644 index 0000000..fd83f84 --- /dev/null +++ b/cpp/ggml-backend-impl.h @@ -0,0 +1,112 @@ +#pragma once + +// ggml-backend internal header + +#include "ggml-backend.h" + +#ifdef __cplusplus +extern "C" { +#endif + + // + // Backend buffer + // + + // buffer type + typedef void * lm_ggml_backend_buffer_type_context_t; + + struct lm_ggml_backend_buffer_type_i { + lm_ggml_backend_buffer_t (*alloc_buffer) (lm_ggml_backend_buffer_type_t buft, size_t size); + size_t (*get_alignment) (lm_ggml_backend_buffer_type_t buft); // tensor alignment + size_t (*get_alloc_size) (lm_ggml_backend_buffer_type_t buft, struct lm_ggml_tensor * tensor); // data size needed to allocate the tensor, including padding + bool (*supports_backend)(lm_ggml_backend_buffer_type_t buft, lm_ggml_backend_t backend); // check if the buffer type is usable by the backend + }; + + struct lm_ggml_backend_buffer_type { + struct lm_ggml_backend_buffer_type_i iface; + lm_ggml_backend_buffer_type_context_t context; + }; + + // buffer + typedef void * lm_ggml_backend_buffer_context_t; + + struct lm_ggml_backend_buffer_i { + void (*free_buffer)(lm_ggml_backend_buffer_t buffer); + //void (*reset) (lm_ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras + void * (*get_base) (lm_ggml_backend_buffer_t buffer); + void (*init_tensor)(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor); + void (*set_tensor) (lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size); + void (*get_tensor) (lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size); + // (optional) copy tensor between different buffer-type, allow for single-copy tranfers + void (*cpy_tensor_from)(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst); + void (*cpy_tensor_to) (lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst); + }; + + struct lm_ggml_backend_buffer { + struct lm_ggml_backend_buffer_i iface; + lm_ggml_backend_buffer_type_t buft; + lm_ggml_backend_buffer_context_t context; + size_t size; + }; + + lm_ggml_backend_buffer_t lm_ggml_backend_buffer_init( + lm_ggml_backend_buffer_type_t buft, + struct lm_ggml_backend_buffer_i iface, + lm_ggml_backend_buffer_context_t context, + size_t size); + + + // + // Backend + // + + typedef void * lm_ggml_backend_context_t; + + struct lm_ggml_backend_i { + const char * (*get_name)(lm_ggml_backend_t backend); + + void (*free)(lm_ggml_backend_t backend); + + // buffer allocation + lm_ggml_backend_buffer_type_t (*get_default_buffer_type)(lm_ggml_backend_t backend); + + // (optional) asynchroneous tensor data access + void (*set_tensor_async)(lm_ggml_backend_t backend, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size); + void (*get_tensor_async)(lm_ggml_backend_t backend, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size); + + // (optional) asynchroneous tensor copy + void (*cpy_tensor_from_async)(lm_ggml_backend_t 
backend, struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst); + void (*cpy_tensor_to_async) (lm_ggml_backend_t backend, struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst); + + void (*synchronize) (lm_ggml_backend_t backend); + + // compute graph with a plan + lm_ggml_backend_graph_plan_t (*graph_plan_create) (lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph); + void (*graph_plan_free) (lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan); + void (*graph_plan_compute)(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan); + + // compute graph without a plan + void (*graph_compute)(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph); + + // check if the backend supports an operation + bool (*supports_op)(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op); + }; + + struct lm_ggml_backend { + struct lm_ggml_backend_i iface; + + lm_ggml_backend_context_t context; + }; + + + // + // Backend registry + // + + typedef lm_ggml_backend_t (*lm_ggml_backend_init_fn)(const char * params, void * user_data); + + void lm_ggml_backend_register(const char * name, lm_ggml_backend_init_fn init_fn, lm_ggml_backend_buffer_type_t default_buffer_type, void * user_data); + +#ifdef __cplusplus +} +#endif diff --git a/cpp/ggml-backend.c b/cpp/ggml-backend.c index 2f83068..dea815d 100644 --- a/cpp/ggml-backend.c +++ b/cpp/ggml-backend.c @@ -1,20 +1,44 @@ -#include "ggml-backend.h" +#include "ggml-backend-impl.h" #include "ggml-alloc.h" +#include "ggml-impl.h" #include +#include #include #include #include #include -#define UNUSED LM_GGML_UNUSED #define MAX(a, b) ((a) > (b) ? (a) : (b)) + +// backend buffer type + +lm_ggml_backend_buffer_t lm_ggml_backend_buft_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) { + return buft->iface.alloc_buffer(buft, size); +} + +size_t lm_ggml_backend_buft_get_alignment(lm_ggml_backend_buffer_type_t buft) { + return buft->iface.get_alignment(buft); +} + +size_t lm_ggml_backend_buft_get_alloc_size(lm_ggml_backend_buffer_type_t buft, struct lm_ggml_tensor * tensor) { + // get_alloc_size is optional, defaults to lm_ggml_nbytes + if (buft->iface.get_alloc_size) { + return buft->iface.get_alloc_size(buft, tensor); + } + return lm_ggml_nbytes(tensor); +} + +bool lm_ggml_backend_buft_supports_backend(lm_ggml_backend_buffer_type_t buft, lm_ggml_backend_t backend) { + return buft->iface.supports_backend(buft, backend); +} + // backend buffer lm_ggml_backend_buffer_t lm_ggml_backend_buffer_init( - struct lm_ggml_backend * backend, + lm_ggml_backend_buffer_type_t buft, struct lm_ggml_backend_buffer_i iface, lm_ggml_backend_buffer_context_t context, size_t size) { @@ -24,7 +48,7 @@ lm_ggml_backend_buffer_t lm_ggml_backend_buffer_init( (*buffer) = (struct lm_ggml_backend_buffer) { /* .interface = */ iface, - /* .backend = */ backend, + /* .buft = */ buft, /* .context = */ context, /* .size = */ size, }; @@ -33,84 +57,111 @@ lm_ggml_backend_buffer_t lm_ggml_backend_buffer_init( } void lm_ggml_backend_buffer_free(lm_ggml_backend_buffer_t buffer) { + if (buffer == NULL) { + return; + } + if (buffer->iface.free_buffer != NULL) { buffer->iface.free_buffer(buffer); } free(buffer); } -size_t lm_ggml_backend_buffer_get_alignment(lm_ggml_backend_buffer_t buffer) { - return lm_ggml_backend_get_alignment(buffer->backend); +size_t lm_ggml_backend_buffer_get_size(lm_ggml_backend_buffer_t buffer) { + return buffer->size; } void * lm_ggml_backend_buffer_get_base(lm_ggml_backend_buffer_t buffer) { - return buffer->iface.get_base(buffer); -} + 
void * base = buffer->iface.get_base(buffer); -size_t lm_ggml_backend_buffer_get_size(lm_ggml_backend_buffer_t buffer) { - return buffer->size; -} + LM_GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL"); -size_t lm_ggml_backend_buffer_get_alloc_size(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor) { - if (buffer->iface.get_alloc_size) { - return buffer->iface.get_alloc_size(buffer, tensor); - } - return lm_ggml_nbytes(tensor); + return base; } void lm_ggml_backend_buffer_init_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor) { + // init_tensor is optional if (buffer->iface.init_tensor) { buffer->iface.init_tensor(buffer, tensor); } } -void lm_ggml_backend_buffer_free_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor) { - if (buffer->iface.free_tensor) { - buffer->iface.free_tensor(buffer, tensor); - } +size_t lm_ggml_backend_buffer_get_alignment (lm_ggml_backend_buffer_t buffer) { + return lm_ggml_backend_buft_get_alignment(lm_ggml_backend_buffer_type(buffer)); } -// backend +size_t lm_ggml_backend_buffer_get_alloc_size(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor) { + return lm_ggml_backend_buft_get_alloc_size(lm_ggml_backend_buffer_type(buffer), tensor); +} -lm_ggml_backend_t lm_ggml_get_backend(const struct lm_ggml_tensor * tensor) { - return tensor->buffer->backend; +lm_ggml_backend_buffer_type_t lm_ggml_backend_buffer_type(lm_ggml_backend_buffer_t buffer) { + return buffer->buft; } +// backend + const char * lm_ggml_backend_name(lm_ggml_backend_t backend) { + if (backend == NULL) { + return "NULL"; + } return backend->iface.get_name(backend); } void lm_ggml_backend_free(lm_ggml_backend_t backend) { + if (backend == NULL) { + return; + } + backend->iface.free(backend); } +lm_ggml_backend_buffer_type_t lm_ggml_backend_get_default_buffer_type(lm_ggml_backend_t backend) { + return backend->iface.get_default_buffer_type(backend); +} + lm_ggml_backend_buffer_t lm_ggml_backend_alloc_buffer(lm_ggml_backend_t backend, size_t size) { - return backend->iface.alloc_buffer(backend, size); + return lm_ggml_backend_buft_alloc_buffer(lm_ggml_backend_get_default_buffer_type(backend), size); } size_t lm_ggml_backend_get_alignment(lm_ggml_backend_t backend) { - return backend->iface.get_alignment(backend); + return lm_ggml_backend_buft_get_alignment(lm_ggml_backend_get_default_buffer_type(backend)); } -void lm_ggml_backend_tensor_set_async(struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - lm_ggml_get_backend(tensor)->iface.set_tensor_async(lm_ggml_get_backend(tensor), tensor, data, offset, size); +void lm_ggml_backend_tensor_set_async(lm_ggml_backend_t backend, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor write out of bounds"); + + backend->iface.set_tensor_async(backend, tensor, data, offset, size); } -void lm_ggml_backend_tensor_get_async(const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) { - lm_ggml_get_backend(tensor)->iface.get_tensor_async(lm_ggml_get_backend(tensor), tensor, data, offset, size); +void lm_ggml_backend_tensor_get_async(lm_ggml_backend_t backend, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) { + LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor read 
out of bounds"); + + backend->iface.get_tensor_async(backend, tensor, data, offset, size); } void lm_ggml_backend_tensor_set(struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - lm_ggml_get_backend(tensor)->iface.set_tensor_async(lm_ggml_get_backend(tensor), tensor, data, offset, size); - lm_ggml_get_backend(tensor)->iface.synchronize(lm_ggml_get_backend(tensor)); + LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + LM_GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set"); + LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor write out of bounds"); + + tensor->buffer->iface.set_tensor(tensor->buffer, tensor, data, offset, size); } void lm_ggml_backend_tensor_get(const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) { - lm_ggml_get_backend(tensor)->iface.get_tensor_async(lm_ggml_get_backend(tensor), tensor, data, offset, size); - lm_ggml_get_backend(tensor)->iface.synchronize(lm_ggml_get_backend(tensor)); + LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + LM_GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set"); + LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor read out of bounds"); + + tensor->buffer->iface.get_tensor(tensor->buffer, tensor, data, offset, size); } void lm_ggml_backend_synchronize(lm_ggml_backend_t backend) { + if (backend->iface.synchronize == NULL) { + return; + } + backend->iface.synchronize(backend); } @@ -124,10 +175,16 @@ void lm_ggml_backend_graph_plan_free(lm_ggml_backend_t backend, lm_ggml_backend_ void lm_ggml_backend_graph_plan_compute(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) { backend->iface.graph_plan_compute(backend, plan); + + // TODO: optional sync + lm_ggml_backend_synchronize(backend); } void lm_ggml_backend_graph_compute(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) { backend->iface.graph_compute(backend, cgraph); + + // TODO: optional sync + lm_ggml_backend_synchronize(backend); } bool lm_ggml_backend_supports_op(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op) { @@ -156,7 +213,7 @@ void lm_ggml_backend_tensor_copy(struct lm_ggml_tensor * src, struct lm_ggml_ten //printf("dst: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", dst->name, (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], (int)dst->nb[0], (int)dst->nb[1], (int)dst->nb[2], (int)dst->nb[3]); LM_GGML_ASSERT(lm_ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts"); - // printf("cpy tensor %s from %s to %s (%lu bytes)\n", src->name, lm_ggml_backend_name(src->backend), lm_ggml_backend_name(dst->backend), lm_ggml_nbytes(src)); + // fprintf(stderr, "cpy tensor %s from %s to %s (%lu bytes)\n", src->name, lm_ggml_backend_name(src->backend), lm_ggml_backend_name(dst->backend), lm_ggml_nbytes(src)); if (src == dst) { return; @@ -164,14 +221,15 @@ void lm_ggml_backend_tensor_copy(struct lm_ggml_tensor * src, struct lm_ggml_ten // TODO: allow backends to support copy to/from same backend - if (lm_ggml_get_backend(dst)->iface.cpy_tensor_from != NULL) { - lm_ggml_get_backend(dst)->iface.cpy_tensor_from(lm_ggml_get_backend(dst)->context, src, dst); - } else if (lm_ggml_get_backend(src)->iface.cpy_tensor_to != NULL) { - lm_ggml_get_backend(src)->iface.cpy_tensor_to(lm_ggml_get_backend(src)->context, src, dst); + if (dst->buffer->iface.cpy_tensor_from != NULL) { + dst->buffer->iface.cpy_tensor_from(dst->buffer, src, dst); + } else if (src->buffer->iface.cpy_tensor_to != NULL) { + 
src->buffer->iface.cpy_tensor_to(src->buffer, src, dst); } else { // shouldn't be hit when copying from/to CPU #ifndef NDEBUG - fprintf(stderr, "lm_ggml_backend_tensor_copy: neither cpy_tensor_from nor cpy_tensor_to are implemented for backends %s and %s, falling back to get/set\n", lm_ggml_backend_name(src->buffer->backend), lm_ggml_backend_name(dst->buffer->backend)); + fprintf(stderr, "lm_ggml_backend_tensor_copy: neither cpy_tensor_from nor cpy_tensor_to " + "are implemented for %s and %s, falling back to get/set\n", src->name, dst->name); #endif size_t nbytes = lm_ggml_nbytes(src); void * data = malloc(nbytes); @@ -181,100 +239,259 @@ void lm_ggml_backend_tensor_copy(struct lm_ggml_tensor * src, struct lm_ggml_ten } } -// backend CPU +// backend registry -struct lm_ggml_backend_cpu_context { - int n_threads; - void * work_data; - size_t work_size; +#define LM_GGML_MAX_BACKENDS_REG 16 + +struct lm_ggml_backend_reg { + char name[128]; + lm_ggml_backend_init_fn init_fn; + lm_ggml_backend_buffer_type_t default_buffer_type; + void * user_data; }; -static const char * lm_ggml_backend_cpu_name(lm_ggml_backend_t backend) { - return "CPU"; +static struct lm_ggml_backend_reg lm_ggml_backend_registry[LM_GGML_MAX_BACKENDS_REG]; +static size_t lm_ggml_backend_registry_count = 0; + +static lm_ggml_backend_t lm_ggml_backend_reg_cpu_init(const char * params, void * user_data); + +static void lm_ggml_backend_registry_init(void) { + static bool initialized = false; + + if (initialized) { + return; + } - UNUSED(backend); + initialized = true; + + lm_ggml_backend_register("CPU", lm_ggml_backend_reg_cpu_init, lm_ggml_backend_cpu_buffer_type(), NULL); + + // add forward decls here to avoid including the backend headers +#ifdef LM_GGML_USE_CUBLAS + extern void lm_ggml_backend_cuda_reg_devices(void); + lm_ggml_backend_cuda_reg_devices(); +#endif + +#ifdef LM_GGML_USE_METAL + extern lm_ggml_backend_t lm_ggml_backend_reg_metal_init(const char * params, void * user_data); + extern lm_ggml_backend_buffer_type_t lm_ggml_backend_metal_buffer_type(void); + lm_ggml_backend_register("Metal", lm_ggml_backend_reg_metal_init, lm_ggml_backend_metal_buffer_type(), NULL); +#endif } -static void lm_ggml_backend_cpu_free(lm_ggml_backend_t backend) { - struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context; - free(cpu_ctx->work_data); - free(cpu_ctx); - free(backend); +void lm_ggml_backend_register(const char * name, lm_ggml_backend_init_fn init_fn, lm_ggml_backend_buffer_type_t default_buffer_type, void * user_data) { + LM_GGML_ASSERT(lm_ggml_backend_registry_count < LM_GGML_MAX_BACKENDS_REG); + + int id = lm_ggml_backend_registry_count; + + lm_ggml_backend_registry[id] = (struct lm_ggml_backend_reg) { + /* .name = */ {0}, + /* .fn = */ init_fn, + /* .default_buffer_type = */ default_buffer_type, + /* .user_data = */ user_data, + }; + + snprintf(lm_ggml_backend_registry[id].name, sizeof(lm_ggml_backend_registry[id].name), "%s", name); + +#ifndef NDEBUG + fprintf(stderr, "%s: registered backend %s\n", __func__, name); +#endif + + lm_ggml_backend_registry_count++; +} + +size_t lm_ggml_backend_reg_get_count(void) { + lm_ggml_backend_registry_init(); + + return lm_ggml_backend_registry_count; +} + +size_t lm_ggml_backend_reg_find_by_name(const char * name) { + lm_ggml_backend_registry_init(); + + for (size_t i = 0; i < lm_ggml_backend_registry_count; i++) { + // TODO: case insensitive in a portable way + if (strcmp(lm_ggml_backend_registry[i].name, name) == 0) { + return i; + } + } 
+ return SIZE_MAX; } +// init from backend:params string +lm_ggml_backend_t lm_ggml_backend_reg_init_backend_from_str(const char * backend_str) { + lm_ggml_backend_registry_init(); + + const char * params = strchr(backend_str, ':'); + char backend_name[128]; + if (params == NULL) { + strcpy(backend_name, backend_str); + params = ""; + } else { + strncpy(backend_name, backend_str, params - backend_str); + backend_name[params - backend_str] = '\0'; + params++; + } + + size_t backend_i = lm_ggml_backend_reg_find_by_name(backend_name); + if (backend_i == SIZE_MAX) { + fprintf(stderr, "%s: backend %s not found\n", __func__, backend_name); + return NULL; + } + + return lm_ggml_backend_reg_init_backend(backend_i, params); +} + +const char * lm_ggml_backend_reg_get_name(size_t i) { + lm_ggml_backend_registry_init(); + + LM_GGML_ASSERT(i < lm_ggml_backend_registry_count); + return lm_ggml_backend_registry[i].name; +} + +lm_ggml_backend_t lm_ggml_backend_reg_init_backend(size_t i, const char * params) { + lm_ggml_backend_registry_init(); + + LM_GGML_ASSERT(i < lm_ggml_backend_registry_count); + return lm_ggml_backend_registry[i].init_fn(params, lm_ggml_backend_registry[i].user_data); +} + +lm_ggml_backend_buffer_type_t lm_ggml_backend_reg_get_default_buffer_type(size_t i) { + lm_ggml_backend_registry_init(); + + LM_GGML_ASSERT(i < lm_ggml_backend_registry_count); + return lm_ggml_backend_registry[i].default_buffer_type; +} + +lm_ggml_backend_buffer_t lm_ggml_backend_reg_alloc_buffer(size_t i, size_t size) { + lm_ggml_backend_registry_init(); + + LM_GGML_ASSERT(i < lm_ggml_backend_registry_count); + return lm_ggml_backend_buft_alloc_buffer(lm_ggml_backend_registry[i].default_buffer_type, size); +} + +// backend CPU + static void * lm_ggml_backend_cpu_buffer_get_base(lm_ggml_backend_buffer_t buffer) { return (void *)buffer->context; } static void lm_ggml_backend_cpu_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) { free(buffer->context); - UNUSED(buffer); + LM_GGML_UNUSED(buffer); +} + +static void lm_ggml_backend_cpu_buffer_set_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor write out of bounds"); + LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + + memcpy((char *)tensor->data + offset, data, size); + + LM_GGML_UNUSED(buffer); +} + +static void lm_ggml_backend_cpu_buffer_get_tensor(lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) { + LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor read out of bounds"); + LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + + memcpy(data, (const char *)tensor->data + offset, size); + + LM_GGML_UNUSED(buffer); +} + +static void lm_ggml_backend_cpu_buffer_cpy_tensor_from(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) { + lm_ggml_backend_tensor_get(src, dst->data, 0, lm_ggml_nbytes(src)); + + LM_GGML_UNUSED(buffer); +} + +static void lm_ggml_backend_cpu_buffer_cpy_tensor_to(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) { + lm_ggml_backend_tensor_set(dst, src->data, 0, lm_ggml_nbytes(src)); + + LM_GGML_UNUSED(buffer); } static struct lm_ggml_backend_buffer_i cpu_backend_buffer_i = { - /* .free_buffer = */ lm_ggml_backend_cpu_buffer_free_buffer, - /* .get_base = */ lm_ggml_backend_cpu_buffer_get_base, - /* .get_alloc_size = */ NULL, 
// defaults to lm_ggml_nbytes - /* .init_tensor = */ NULL, // no initialization required - /* .free_tensor = */ NULL, // no cleanup required + /* .free_buffer = */ lm_ggml_backend_cpu_buffer_free_buffer, + /* .get_base = */ lm_ggml_backend_cpu_buffer_get_base, + /* .init_tensor = */ NULL, // no initialization required + /* .set_tensor = */ lm_ggml_backend_cpu_buffer_set_tensor, + /* .get_tensor = */ lm_ggml_backend_cpu_buffer_get_tensor, + /* .cpy_tensor_from = */ lm_ggml_backend_cpu_buffer_cpy_tensor_from, + /* .cpy_tensor_to = */ lm_ggml_backend_cpu_buffer_cpy_tensor_to, }; // for buffers from ptr, free is not called static struct lm_ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = { - /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed - /* .get_base = */ lm_ggml_backend_cpu_buffer_get_base, - /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes - /* .init_tensor = */ NULL, - /* .free_tensor = */ NULL, + /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed + /* .get_base = */ lm_ggml_backend_cpu_buffer_get_base, + /* .init_tensor = */ NULL, // no initialization required + /* .set_tensor = */ lm_ggml_backend_cpu_buffer_set_tensor, + /* .get_tensor = */ lm_ggml_backend_cpu_buffer_get_tensor, + /* .cpy_tensor_from = */ lm_ggml_backend_cpu_buffer_cpy_tensor_from, + /* .cpy_tensor_to = */ lm_ggml_backend_cpu_buffer_cpy_tensor_to, }; static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512 -static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_alloc_buffer(lm_ggml_backend_t backend, size_t size) { +static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) { size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned void * data = malloc(size); // TODO: maybe use LM_GGML_ALIGNED_MALLOC? 
- return lm_ggml_backend_buffer_init(backend, cpu_backend_buffer_i, data, size); + LM_GGML_ASSERT(data != NULL && "failed to allocate buffer"); + + return lm_ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size); } -static size_t lm_ggml_backend_cpu_get_alignment(lm_ggml_backend_t backend) { +static size_t lm_ggml_backend_cpu_buffer_type_get_alignment(lm_ggml_backend_buffer_type_t buft) { return TENSOR_ALIGNMENT; - UNUSED(backend); -} -static void lm_ggml_backend_cpu_set_tensor_async(lm_ggml_backend_t backend, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor write out of bounds"); - LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + LM_GGML_UNUSED(buft); +} - memcpy((char *)tensor->data + offset, data, size); +static bool lm_ggml_backend_cpu_buffer_type_supports_backend(lm_ggml_backend_buffer_type_t buft, lm_ggml_backend_t backend) { + return lm_ggml_backend_is_cpu(backend); - UNUSED(backend); + LM_GGML_UNUSED(buft); } -static void lm_ggml_backend_cpu_get_tensor_async(lm_ggml_backend_t backend, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) { - LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor read out of bounds"); - LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); - - memcpy(data, (const char *)tensor->data + offset, size); +lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_buffer_type(void) { + static struct lm_ggml_backend_buffer_type lm_ggml_backend_buffer_type_cpu = { + /* .iface = */ { + /* .alloc_buffer = */ lm_ggml_backend_cpu_buffer_type_alloc_buffer, + /* .get_alignment = */ lm_ggml_backend_cpu_buffer_type_get_alignment, + /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes + /* .supports_backend = */ lm_ggml_backend_cpu_buffer_type_supports_backend, + }, + /* .context = */ NULL, + }; - UNUSED(backend); + return &lm_ggml_backend_buffer_type_cpu; } -static void lm_ggml_backend_cpu_synchronize(lm_ggml_backend_t backend) { - UNUSED(backend); -} +struct lm_ggml_backend_cpu_context { + int n_threads; + void * work_data; + size_t work_size; +}; -static void lm_ggml_backend_cpu_cpy_tensor_from(lm_ggml_backend_t backend, struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) { - lm_ggml_backend_tensor_get(src, dst->data, 0, lm_ggml_nbytes(src)); +static const char * lm_ggml_backend_cpu_name(lm_ggml_backend_t backend) { + return "CPU"; - UNUSED(backend); + LM_GGML_UNUSED(backend); } -static void lm_ggml_backend_cpu_cpy_tensor_to(lm_ggml_backend_t backend, struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) { - // for a backend such as CUDA that can queue async calls, it is ok to do this asynchronously, but it may not be the case for other backends - lm_ggml_backend_tensor_set_async(dst, src->data, 0, lm_ggml_nbytes(src)); +static void lm_ggml_backend_cpu_free(lm_ggml_backend_t backend) { + struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context; + free(cpu_ctx->work_data); + free(cpu_ctx); + free(backend); +} - UNUSED(backend); +static lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_get_default_buffer_type(lm_ggml_backend_t backend) { + return lm_ggml_backend_cpu_buffer_type(); + + LM_GGML_UNUSED(backend); } struct lm_ggml_backend_plan_cpu { @@ -303,7 +520,7 @@ static void lm_ggml_backend_cpu_graph_plan_free(lm_ggml_backend_t backend, lm_gg free(cpu_plan->cplan.work_data); free(cpu_plan); - UNUSED(backend); + LM_GGML_UNUSED(backend); } 
static void lm_ggml_backend_cpu_graph_plan_compute(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) { @@ -311,7 +528,7 @@ static void lm_ggml_backend_cpu_graph_plan_compute(lm_ggml_backend_t backend, lm lm_ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan); - UNUSED(backend); + LM_GGML_UNUSED(backend); } static void lm_ggml_backend_cpu_graph_compute(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) { @@ -332,25 +549,25 @@ static void lm_ggml_backend_cpu_graph_compute(lm_ggml_backend_t backend, struct static bool lm_ggml_backend_cpu_supports_op(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op) { return true; - UNUSED(backend); - UNUSED(op); + + LM_GGML_UNUSED(backend); + LM_GGML_UNUSED(op); } static struct lm_ggml_backend_i cpu_backend_i = { - /* .get_name = */ lm_ggml_backend_cpu_name, - /* .free = */ lm_ggml_backend_cpu_free, - /* .alloc_buffer = */ lm_ggml_backend_cpu_alloc_buffer, - /* .get_alignment = */ lm_ggml_backend_cpu_get_alignment, - /* .set_tensor_async = */ lm_ggml_backend_cpu_set_tensor_async, - /* .get_tensor_async = */ lm_ggml_backend_cpu_get_tensor_async, - /* .synchronize = */ lm_ggml_backend_cpu_synchronize, - /* .cpy_tensor_from = */ lm_ggml_backend_cpu_cpy_tensor_from, - /* .cpy_tensor_to = */ lm_ggml_backend_cpu_cpy_tensor_to, - /* .graph_plan_create = */ lm_ggml_backend_cpu_graph_plan_create, - /* .graph_plan_free = */ lm_ggml_backend_cpu_graph_plan_free, - /* .graph_plan_compute = */ lm_ggml_backend_cpu_graph_plan_compute, - /* .graph_compute = */ lm_ggml_backend_cpu_graph_compute, - /* .supports_op = */ lm_ggml_backend_cpu_supports_op, + /* .get_name = */ lm_ggml_backend_cpu_name, + /* .free = */ lm_ggml_backend_cpu_free, + /* .get_default_buffer_type = */ lm_ggml_backend_cpu_get_default_buffer_type, + /* .set_tensor_async = */ NULL, + /* .get_tensor_async = */ NULL, + /* .cpy_tensor_from_async = */ NULL, + /* .cpy_tensor_to_async = */ NULL, + /* .synchronize = */ NULL, + /* .graph_plan_create = */ lm_ggml_backend_cpu_graph_plan_create, + /* .graph_plan_free = */ lm_ggml_backend_cpu_graph_plan_free, + /* .graph_plan_compute = */ lm_ggml_backend_cpu_graph_plan_compute, + /* .graph_compute = */ lm_ggml_backend_cpu_graph_compute, + /* .supports_op = */ lm_ggml_backend_cpu_supports_op, }; lm_ggml_backend_t lm_ggml_backend_cpu_init(void) { @@ -380,6 +597,761 @@ void lm_ggml_backend_cpu_set_n_threads(lm_ggml_backend_t backend_cpu, int n_thre ctx->n_threads = n_threads; } -lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_from_ptr(lm_ggml_backend_t backend_cpu, void * ptr, size_t size) { - return lm_ggml_backend_buffer_init(backend_cpu, cpu_backend_buffer_i_from_ptr, ptr, size); +lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) { + return lm_ggml_backend_buffer_init(lm_ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size); +} + +static lm_ggml_backend_t lm_ggml_backend_reg_cpu_init(const char * params, void * user_data) { + return lm_ggml_backend_cpu_init(); + + LM_GGML_UNUSED(params); + LM_GGML_UNUSED(user_data); +} + + +// scheduler + +#define LM_GGML_MAX_BACKENDS 4 +#define LM_GGML_MAX_SPLITS 256 +#define LM_GGML_MAX_SPLIT_INPUTS 16 + +struct lm_ggml_backend_sched_split { + lm_ggml_tallocr_t tallocr; + int i_start; + int i_end; + struct lm_ggml_tensor * inputs[LM_GGML_MAX_SPLIT_INPUTS]; + int n_inputs; + struct lm_ggml_cgraph graph; +}; + +struct lm_ggml_backend_sched { + int n_backends; + lm_ggml_backend_t backends[LM_GGML_MAX_BACKENDS]; + lm_ggml_tallocr_t 
tallocs[LM_GGML_MAX_BACKENDS]; + + lm_ggml_gallocr_t galloc; + + struct lm_ggml_hash_set hash_set; + lm_ggml_tallocr_t * node_talloc; // [hash_set.size] + struct lm_ggml_tensor * (* node_copies)[LM_GGML_MAX_BACKENDS]; // [hash_set.size][LM_GGML_MAX_BACKENDS] + + struct lm_ggml_cgraph * graph; + struct lm_ggml_backend_sched_split splits[LM_GGML_MAX_SPLITS]; + int n_splits; + + struct lm_ggml_context * ctx; + + // align context_buffer to LM_GGML_MEM_ALIGN + #ifdef _MSC_VER + __declspec(align(LM_GGML_MEM_ALIGN)) + #else + __attribute__((aligned(LM_GGML_MEM_ALIGN))) + #endif + char context_buffer[LM_GGML_MAX_SPLITS*LM_GGML_MAX_SPLIT_INPUTS*sizeof(struct lm_ggml_tensor) + sizeof(struct lm_ggml_cgraph)]; +}; + +#define hash_id(node) lm_ggml_hash_find_or_insert(sched->hash_set, node) +#define node_allocr(node) sched->node_talloc[hash_id(node)] + +static bool lm_ggml_is_view_op(enum lm_ggml_op op) { + return op == LM_GGML_OP_VIEW || op == LM_GGML_OP_RESHAPE || op == LM_GGML_OP_PERMUTE || op == LM_GGML_OP_TRANSPOSE; +} + +// returns the priority of the backend, lower is better +static int sched_backend_prio(lm_ggml_backend_sched_t sched, lm_ggml_backend_t backend) { + for (int i = 0; i < sched->n_backends; i++) { + if (sched->backends[i] == backend) { + return i; + } + } + return INT_MAX; +} + +static int sched_allocr_prio(lm_ggml_backend_sched_t sched, lm_ggml_tallocr_t allocr) { + for (int i = 0; i < sched->n_backends; i++) { + if (sched->tallocs[i] == allocr) { + return i; + } + } + return INT_MAX; +} + +static lm_ggml_backend_t get_buffer_backend(lm_ggml_backend_sched_t sched, lm_ggml_backend_buffer_t buffer) { + if (buffer == NULL) { + return NULL; + } + // find highest prio backend that supports the buffer type + for (int i = 0; i < sched->n_backends; i++) { + if (lm_ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) { + return sched->backends[i]; + } + } + LM_GGML_ASSERT(false && "tensor buffer type not supported by any backend"); +} + +static lm_ggml_backend_t get_allocr_backend(lm_ggml_backend_sched_t sched, lm_ggml_tallocr_t allocr) { + if (allocr == NULL) { + return NULL; + } + // find highest prio backend that supports the buffer type + for (int i = 0; i < sched->n_backends; i++) { + if (sched->tallocs[i] == allocr) { + return sched->backends[i]; + } + } + LM_GGML_UNREACHABLE(); +} + +#if 0 +static char causes[LM_GGML_DEFAULT_GRAPH_SIZE*8 + LM_GGML_MAX_SPLITS*LM_GGML_MAX_SPLIT_INPUTS][128]; // debug, remove +#define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__) +#define GET_CAUSE(node) causes[hash_id(node)] +#else +#define SET_CAUSE(node, ...) +#define GET_CAUSE(node) "" +#endif + +// returns the backend that should be used for the node based on the current locations +static lm_ggml_backend_t sched_backend_from_cur(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node) { + // if the dst tensor is already allocated in a buffer, we must assume that it is critical to keep it there + // ie. kv cache updates + // note that this doesn't allow fallback to CPU. need to add output tensors to the splits to copy the data back to the original backend. 
+ // dst + lm_ggml_backend_t cur_backend = get_buffer_backend(sched, node->buffer); + if (cur_backend != NULL) { + SET_CAUSE(node, "1.dst"); + return cur_backend; + } + + // view_src + if (node->view_src != NULL && get_buffer_backend(sched, node->view_src->buffer) != NULL) { + SET_CAUSE(node, "1.vsrc"); + return get_buffer_backend(sched, node->view_src->buffer); + } + + // src + int cur_prio = INT_MAX; + size_t cur_size = 0; + + for (int i = 0; i < LM_GGML_MAX_SRC; i++) { + const struct lm_ggml_tensor * src = node->src[i]; + if (src == NULL) { + break; + } + lm_ggml_backend_t src_backend = get_buffer_backend(sched, src->buffer); + if (src_backend != NULL) { + int src_prio = sched_backend_prio(sched, src_backend); + size_t src_size = lm_ggml_nbytes(src); + if (src_prio < cur_prio && src_size >= cur_size) { + cur_prio = src_prio; + cur_size = src_size; + cur_backend = src_backend; + SET_CAUSE(node, "1.src%d", i); + } + } + } + return cur_backend; +} + +static char * fmt_size(size_t size) { + static char buffer[128]; + if (size >= 1024*1024) { + sprintf(buffer, "%zuM", size/1024/1024); + } else { + sprintf(buffer, "%zuK", size/1024); + } + return buffer; +} + +static void sched_print_assignments(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) { + int cur_split = 0; + for (int i = 0; i < graph->n_nodes; i++) { + if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) { + lm_ggml_backend_t split_backend = get_allocr_backend(sched, sched->splits[cur_split].tallocr); + fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, lm_ggml_backend_name(split_backend), + sched->splits[cur_split].n_inputs); + for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) { + fprintf(stderr, "[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name, + fmt_size(lm_ggml_nbytes(sched->splits[cur_split].inputs[j]))); + } + fprintf(stderr, "\n"); + cur_split++; + } + struct lm_ggml_tensor * node = graph->nodes[i]; + if (lm_ggml_is_view_op(node->op)) { + continue; + } + lm_ggml_tallocr_t node_allocr = node_allocr(node); + lm_ggml_backend_t node_backend = node_allocr ? get_allocr_backend(sched, node_allocr) : NULL; // FIXME: + fprintf(stderr, "node #%3d (%10.10s): %20.20s (%4.4s) [%4.4s %8.8s]:", i, lm_ggml_op_name(node->op), node->name, + fmt_size(lm_ggml_nbytes(node)), node_allocr ? lm_ggml_backend_name(node_backend) : "NULL", GET_CAUSE(node)); + for (int j = 0; j < LM_GGML_MAX_SRC; j++) { + struct lm_ggml_tensor * src = node->src[j]; + if (src == NULL) { + break; + } + lm_ggml_tallocr_t src_allocr = node_allocr(src); + lm_ggml_backend_t src_backend = src_allocr ? get_allocr_backend(sched, src_allocr) : NULL; + fprintf(stderr, " %20.20s (%4.4s) [%4.4s %8.8s]", src->name, + fmt_size(lm_ggml_nbytes(src)), src_backend ? 
lm_ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src)); + } + fprintf(stderr, "\n"); + } +} + +// creates a copy of the tensor with the same memory layout +static struct lm_ggml_tensor * lm_ggml_dup_tensor_layout(struct lm_ggml_context * ctx, const struct lm_ggml_tensor * tensor) { + struct lm_ggml_tensor * dup = lm_ggml_dup_tensor(ctx, tensor); + for (int i = 0; i < LM_GGML_MAX_DIMS; i++) { + dup->nb[i] = tensor->nb[i]; + } + return dup; +} + +// assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend +// TODO: merge passes +static void sched_split_graph(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) { + // reset state + size_t hash_size = sched->hash_set.size; + memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); + memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size); + memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size); + sched->n_splits = 0; + + struct lm_ggml_init_params params = { + /* .mem_size = */ sizeof(sched->context_buffer), + /* .mem_buffer = */ sched->context_buffer, + /* .no_alloc = */ true + }; + + if (sched->ctx != NULL) { + lm_ggml_free(sched->ctx); + } + + sched->ctx = lm_ggml_init(params); + + // pass 1: assign backends to ops with allocated inputs + for (int i = 0; i < graph->n_leafs; i++) { + struct lm_ggml_tensor * leaf = graph->leafs[i]; + if (node_allocr(leaf) != NULL) { + // do not overwrite user assignments + continue; + } + lm_ggml_backend_t leaf_backend = get_buffer_backend(sched, leaf->buffer); + if (leaf_backend == NULL && leaf->view_src != NULL) { + leaf_backend = get_buffer_backend(sched, leaf->view_src->buffer); + } + if (leaf_backend != NULL) { + node_allocr(leaf) = lm_ggml_backend_sched_get_tallocr(sched, leaf_backend); + } + } + + for (int i = 0; i < graph->n_nodes; i++) { + struct lm_ggml_tensor * node = graph->nodes[i]; + if (node_allocr(node) != NULL) { + // do not overwrite user assignments + continue; + } + lm_ggml_backend_t node_backend = sched_backend_from_cur(sched, node); + if (node_backend != NULL) { + node_allocr(node) = lm_ggml_backend_sched_get_tallocr(sched, node_backend); + } + } + //printf("PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); + + // pass 2: assign backends to ops from current assignments + // TODO: + // - reuse sched_backend_from_cur + for (int i = 0; i < graph->n_nodes; i++) { + struct lm_ggml_tensor * node = graph->nodes[i]; + lm_ggml_tallocr_t node_allocr = node_allocr(node); + if (node_allocr == NULL) { + int cur_prio = INT_MAX; + size_t cur_size = 0; + for (int j = 0; j < LM_GGML_MAX_SRC; j++) { + struct lm_ggml_tensor * src = node->src[j]; + if (src == NULL) { + break; + } + lm_ggml_tallocr_t src_allocr = node_allocr(src); + if (src_allocr != NULL) { + int src_prio = sched_allocr_prio(sched, src_allocr); + size_t src_size = lm_ggml_nbytes(src); + if (src_prio < cur_prio && src_size >= cur_size) { + cur_prio = src_prio; + cur_size = src_size; + node_allocr = src_allocr; + SET_CAUSE(node, "2.src%d", j); + } + } + } + if (node_allocr != NULL) { + node_allocr(node) = node_allocr; + } + } + } + //printf("PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); + + // pass 3: assign backends to remaining src from dst (should only be leafs) + for (int i = 0; i < graph->n_nodes; i++) { + struct lm_ggml_tensor * node = graph->nodes[i]; + lm_ggml_tallocr_t node_allocr = node_allocr(node); + for (int j = 0; j < LM_GGML_MAX_SRC; j++) { + struct lm_ggml_tensor * src = node->src[j]; 
+ if (src == NULL) { + break; + } + lm_ggml_tallocr_t src_allocr = node_allocr(src); + if (src_allocr == NULL) { + node_allocr(src) = node_allocr; + } + } + } + //printf("PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); + + // pass 4: split graph, find tensors that need to be copied + // TODO: + // - when switching from a less preferred backend to a more preferred backend, check if it is possible to move the switch to an earlier point for the same cost + // find first backend + int cur_split = 0; + for (int i = 0; i < graph->n_nodes; i++) { + struct lm_ggml_tensor * node = graph->nodes[i]; + if (node->view_src == NULL) { + sched->splits[0].tallocr = node_allocr(node); + break; + } + } + sched->splits[0].i_start = 0; + sched->splits[0].n_inputs = 0; + memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK + lm_ggml_tallocr_t cur_allocr = sched->splits[0].tallocr; + size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr); + for (int i = 0; i < graph->n_nodes; i++) { + struct lm_ggml_tensor * node = graph->nodes[i]; + + if (lm_ggml_is_view_op(node->op)) { + continue; + } + + lm_ggml_tallocr_t node_allocr = node_allocr(node); + + if (node_allocr != cur_allocr) { + sched->splits[cur_split].i_end = i; + cur_split++; + LM_GGML_ASSERT(cur_split < LM_GGML_MAX_SPLITS); + sched->splits[cur_split].tallocr = node_allocr; + sched->splits[cur_split].i_start = i; + sched->splits[cur_split].n_inputs = 0; + memset(sched->splits[cur_split].inputs, 0, sizeof(sched->splits[cur_split].inputs)); //HACK + cur_allocr = node_allocr; + cur_backend_id = sched_allocr_prio(sched, cur_allocr); + } + + // find inputs that are not on the same backend + for (int j = 0; j < LM_GGML_MAX_SRC; j++) { + struct lm_ggml_tensor * src = node->src[j]; + if (src == NULL) { + break; + } + lm_ggml_tallocr_t src_allocr = node_allocr(src); + if (src_allocr != node_allocr) { + int n_inputs = sched->splits[cur_split].n_inputs++; + LM_GGML_ASSERT(n_inputs < LM_GGML_MAX_SPLIT_INPUTS); + sched->splits[cur_split].inputs[n_inputs] = (struct lm_ggml_tensor *)src; + + // create copies + size_t id = hash_id(src); + if (sched->node_copies[id][cur_backend_id] == NULL) { + struct lm_ggml_tensor * tensor_copy = lm_ggml_dup_tensor_layout(sched->ctx, src); + sched->node_copies[id][cur_backend_id] = tensor_copy; + node_allocr(tensor_copy) = cur_allocr; + lm_ggml_backend_t backend = get_allocr_backend(sched, cur_allocr); + lm_ggml_format_name(tensor_copy, "%s#%s", lm_ggml_backend_name(backend), src->name); + } + node->src[j] = sched->node_copies[id][cur_backend_id]; + } + } + } + sched->splits[cur_split].i_end = graph->n_nodes; + sched->n_splits = cur_split + 1; + + //fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); fflush(stdout); + +#if 1 + // sanity check: all sources should have the same backend as the node + for (int i = 0; i < graph->n_nodes; i++) { + struct lm_ggml_tensor * node = graph->nodes[i]; + lm_ggml_tallocr_t node_allocr = node_allocr(node); + if (node_allocr == NULL) { + fprintf(stderr, "!!!!!!! %s has no backend\n", node->name); + } + for (int j = 0; j < LM_GGML_MAX_SRC; j++) { + struct lm_ggml_tensor * src = node->src[j]; + if (src == NULL) { + break; + } + lm_ggml_tallocr_t src_allocr = node_allocr(src); + if (src_allocr != node_allocr /* && src_backend != NULL */) { // ignore nulls for now + fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n", + node->name, node_allocr ? 
lm_ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL", + j, src->name, src_allocr ? lm_ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL"); + } + } + } +#endif + + // create copies of the graph for each split + // FIXME: avoid this copy, pass split inputs to lm_ggml_gallocr_alloc_graph_n in some other way + struct lm_ggml_cgraph * graph_copy = lm_ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*LM_GGML_MAX_SPLIT_INPUTS, false); + for (int i = 0; i < sched->n_splits; i++) { + struct lm_ggml_backend_sched_split * split = &sched->splits[i]; + split->graph = lm_ggml_graph_view(graph, split->i_start, split->i_end); + + // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split + for (int j = 0; j < split->n_inputs; j++) { + struct lm_ggml_tensor * input = split->inputs[j]; + struct lm_ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_allocr_prio(sched, split->tallocr)]; + input_cpy->src[0] = input; + graph_copy->nodes[graph_copy->n_nodes++] = input_cpy; + } + + for (int j = split->i_start; j < split->i_end; j++) { + graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j]; + } + } + sched->graph = graph_copy; +} + +static void sched_alloc_splits(lm_ggml_backend_sched_t sched) { + lm_ggml_gallocr_alloc_graph_n( + sched->galloc, + sched->graph, + sched->hash_set, + sched->node_talloc); +} + +static void sched_compute_splits(lm_ggml_backend_sched_t sched) { + uint64_t copy_us[LM_GGML_MAX_BACKENDS] = {0}; + uint64_t compute_us[LM_GGML_MAX_BACKENDS] = {0}; + + struct lm_ggml_backend_sched_split * splits = sched->splits; + + for (int i = 0; i < sched->n_splits; i++) { + struct lm_ggml_backend_sched_split * split = &splits[i]; + lm_ggml_backend_t split_backend = get_allocr_backend(sched, split->tallocr); + int split_backend_id = sched_backend_prio(sched, split_backend); + + // copy the input tensors to the split backend + uint64_t copy_start_us = lm_ggml_time_us(); + for (int j = 0; j < split->n_inputs; j++) { + struct lm_ggml_tensor * input = split->inputs[j]; + struct lm_ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_backend_prio(sched, split_backend)]; + if (input->buffer == NULL) { + if (input->view_src == NULL) { + fprintf(stderr, "input %s has no buffer and no view_src\n", input->name); + exit(1); + } + // FIXME: may need to use the sched buffer instead + lm_ggml_backend_view_init(input->view_src->buffer, input); + } + if (input_cpy->buffer == NULL) { + fprintf(stderr, "input_cpy %s has no buffer\n", input_cpy->name); + exit(1); + } + //LM_GGML_ASSERT(input->buffer->backend != input_cpy->buffer->backend); + //LM_GGML_ASSERT(input_cpy->buffer->backend == split_backend); + lm_ggml_backend_tensor_copy(input, input_cpy); + } + // lm_ggml_backend_synchronize(split_backend); + int64_t copy_end_us = lm_ggml_time_us(); + copy_us[split_backend_id] += copy_end_us - copy_start_us; + +#if 0 + char split_filename[LM_GGML_MAX_NAME]; + snprintf(split_filename, LM_GGML_MAX_NAME, "split_%i_%s.dot", i, lm_ggml_backend_name(split_backend)); + lm_ggml_graph_dump_dot(split->graph, NULL, split_filename); +#endif + + uint64_t compute_start_us = lm_ggml_time_us(); + lm_ggml_backend_graph_compute(split_backend, &split->graph); + // lm_ggml_backend_synchronize(split_backend); + uint64_t compute_end_us = lm_ggml_time_us(); + compute_us[split_backend_id] += compute_end_us - compute_start_us; + } + +#if 0 + // per-backend timings + fprintf(stderr, "sched_compute_splits times (%d splits):\n", 
sched->n_splits); + for (int i = 0; i < sched->n_backends; i++) { + if (copy_us[i] > 0 || compute_us[i] > 0) { + fprintf(stderr, "\t%5.5s: %lu us copy, %lu us compute\n", lm_ggml_backend_name(sched->backends[i]), copy_us[i], compute_us[i]); + } + } +#endif +} + +static void sched_reset(lm_ggml_backend_sched_t sched) { + for (int i = 0; i < sched->n_backends; i++) { + lm_ggml_tallocr_reset(sched->tallocs[i]); + } +} + +lm_ggml_backend_sched_t lm_ggml_backend_sched_new(lm_ggml_backend_t * backends, int n_backends) { + LM_GGML_ASSERT(n_backends <= LM_GGML_MAX_BACKENDS); + + struct lm_ggml_backend_sched * sched = malloc(sizeof(struct lm_ggml_backend_sched)); + memset(sched, 0, sizeof(struct lm_ggml_backend_sched)); + + sched->n_backends = n_backends; + for (int i = 0; i < n_backends; i++) { + sched->backends[i] = backends[i]; + } + + sched->galloc = lm_ggml_gallocr_new(); + + // init measure allocs for each backend + for (int i = 0; i < n_backends; i++) { + sched->tallocs[i] = lm_ggml_tallocr_new_measure_from_backend(backends[i]); + } + + return sched; +} + +void lm_ggml_backend_sched_free(lm_ggml_backend_sched_t sched) { + if (sched == NULL) { + return; + } + for (int i = 0; i < sched->n_backends; i++) { + lm_ggml_tallocr_free(sched->tallocs[i]); + } + lm_ggml_gallocr_free(sched->galloc); + free(sched->hash_set.keys); + free(sched->node_talloc); + free(sched->node_copies); + free(sched); +} + +void lm_ggml_backend_sched_init_measure(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * measure_graph) { + // initialize hash tables + size_t hash_size = measure_graph->visited_hash_table.size + LM_GGML_MAX_SPLITS*LM_GGML_MAX_SPLIT_INPUTS; + sched->hash_set.size = hash_size; + sched->hash_set.keys = malloc(sizeof(sched->hash_set.keys[0]) * hash_size); + sched->node_talloc = malloc(sizeof(sched->node_talloc[0]) * hash_size); + sched->node_copies = malloc(sizeof(sched->node_copies[0]) * hash_size); + + sched_split_graph(sched, measure_graph); + sched_alloc_splits(sched); + + // allocate buffers and reset allocators + for (int i = 0; i < sched->n_backends; i++) { + size_t size = lm_ggml_tallocr_max_size(sched->tallocs[i]); + lm_ggml_tallocr_free(sched->tallocs[i]); + sched->tallocs[i] = lm_ggml_tallocr_new_from_backend(sched->backends[i], size); + } + + sched_reset(sched); +} + +void lm_ggml_backend_sched_graph_compute(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) { + LM_GGML_ASSERT(sched->hash_set.size >= graph->visited_hash_table.size + LM_GGML_MAX_SPLITS*LM_GGML_MAX_SPLIT_INPUTS); + + sched_split_graph(sched, graph); + sched_alloc_splits(sched); + sched_compute_splits(sched); + sched_reset(sched); +} + +lm_ggml_tallocr_t lm_ggml_backend_sched_get_tallocr(lm_ggml_backend_sched_t sched, lm_ggml_backend_t backend) { + int backend_index = sched_backend_prio(sched, backend); + return sched->tallocs[backend_index]; +} + +lm_ggml_backend_buffer_t lm_ggml_backend_sched_get_buffer(lm_ggml_backend_sched_t sched, lm_ggml_backend_t backend) { + int backend_index = sched_backend_prio(sched, backend); + return lm_ggml_tallocr_get_buffer(sched->tallocs[backend_index]); +} + +void lm_ggml_backend_sched_set_node_backend(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node, lm_ggml_backend_t backend) { + int backend_index = sched_backend_prio(sched, backend); + LM_GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); + node_allocr(node) = sched->tallocs[backend_index]; +} + +// utils +void lm_ggml_backend_view_init(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * 
tensor) { + LM_GGML_ASSERT(tensor->buffer == NULL); + LM_GGML_ASSERT(tensor->data == NULL); + LM_GGML_ASSERT(tensor->view_src != NULL); + LM_GGML_ASSERT(tensor->view_src->buffer != NULL); + LM_GGML_ASSERT(tensor->view_src->data != NULL); + + tensor->buffer = buffer; + tensor->data = (char *)tensor->view_src->data + tensor->view_offs; + tensor->backend = tensor->view_src->backend; + lm_ggml_backend_buffer_init_tensor(buffer, tensor); +} + +void lm_ggml_backend_tensor_alloc(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, void * addr) { + LM_GGML_ASSERT(tensor->buffer == NULL); + LM_GGML_ASSERT(tensor->data == NULL); + LM_GGML_ASSERT(tensor->view_src == NULL); + LM_GGML_ASSERT(addr >= lm_ggml_backend_buffer_get_base(buffer)); + LM_GGML_ASSERT((char *)addr + lm_ggml_backend_buffer_get_alloc_size(buffer, tensor) <= + (char *)lm_ggml_backend_buffer_get_base(buffer) + lm_ggml_backend_buffer_get_size(buffer)); + + tensor->buffer = buffer; + tensor->data = addr; + lm_ggml_backend_buffer_init_tensor(buffer, tensor); +} + +static struct lm_ggml_tensor * graph_dup_tensor(struct lm_ggml_hash_set hash_set, struct lm_ggml_tensor ** node_copies, + struct lm_ggml_context * ctx_allocated, struct lm_ggml_context * ctx_unallocated, struct lm_ggml_tensor * src) { + + LM_GGML_ASSERT(src != NULL); + LM_GGML_ASSERT(src->data && "graph must be allocated"); + + size_t id = lm_ggml_hash_insert(hash_set, src); + if (id == LM_GGML_HASHTABLE_ALREADY_EXISTS) { + return node_copies[lm_ggml_hash_find(hash_set, src)]; + } + + struct lm_ggml_tensor * dst = lm_ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src); + if (src->view_src != NULL) { + dst->view_src = graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src); + dst->view_offs = src->view_offs; + } + dst->op = src->op; + memcpy(dst->op_params, src->op_params, sizeof(dst->op_params)); + lm_ggml_set_name(dst, src->name); + + // copy src + for (int i = 0; i < LM_GGML_MAX_SRC; i++) { + struct lm_ggml_tensor * s = src->src[i]; + if (s == NULL) { + break; + } + dst->src[i] = graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s); + } + + node_copies[id] = dst; + return dst; +} + +static void graph_init_tensor(struct lm_ggml_hash_set hash_set, struct lm_ggml_tensor ** node_copies, bool * node_init, struct lm_ggml_tensor * src) { + size_t id = lm_ggml_hash_find(hash_set, src); + if (node_init[id]) { + return; + } + node_init[id] = true; + + struct lm_ggml_tensor * dst = node_copies[id]; + if (dst->view_src != NULL) { + lm_ggml_backend_view_init(dst->view_src->buffer, dst); + } + else { + lm_ggml_backend_tensor_copy(src, dst); + } + + // init src + for (int i = 0; i < LM_GGML_MAX_SRC; i++) { + struct lm_ggml_tensor * s = src->src[i]; + if (s == NULL) { + break; + } + graph_init_tensor(hash_set, node_copies, node_init, s); + } +} + +struct lm_ggml_backend_graph_copy lm_ggml_backend_graph_copy(lm_ggml_backend_t backend, struct lm_ggml_cgraph * graph) { + struct lm_ggml_hash_set hash_set = { + /* .size = */ graph->visited_hash_table.size, + /* .keys = */ calloc(sizeof(hash_set.keys[0]) * graph->visited_hash_table.size, 1) + }; + struct lm_ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]) * hash_set.size, 1); + bool * node_init = calloc(sizeof(node_init[0]) * hash_set.size, 1); + + struct lm_ggml_init_params params = { + /* .mem_size = */ lm_ggml_tensor_overhead()*hash_set.size + lm_ggml_graph_overhead_custom(graph->size, false), + /* .mem_buffer = */ NULL, + /* 
.no_alloc = */ true + }; + + struct lm_ggml_context * ctx_allocated = lm_ggml_init(params); + struct lm_ggml_context * ctx_unallocated = lm_ggml_init(params); + + // dup nodes + for (int i = 0; i < graph->n_nodes; i++) { + struct lm_ggml_tensor * node = graph->nodes[i]; + graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node); + } + + // allocate nodes + lm_ggml_backend_buffer_t buffer = lm_ggml_backend_alloc_ctx_tensors(ctx_allocated, backend); + + //printf("copy buffer size: %zu MB\n", lm_ggml_backend_buffer_get_size(buffer) / 1024 / 1024); + + // copy data and init views + for (int i = 0; i < graph->n_nodes; i++) { + struct lm_ggml_tensor * node = graph->nodes[i]; + graph_init_tensor(hash_set, node_copies, node_init, node); + } + + // build graph copy + struct lm_ggml_cgraph * graph_copy = lm_ggml_new_graph_custom(ctx_allocated, graph->size, false); + for (int i = 0; i < graph->n_nodes; i++) { + struct lm_ggml_tensor * node = graph->nodes[i]; + struct lm_ggml_tensor * node_copy = node_copies[lm_ggml_hash_find(hash_set, node)]; + graph_copy->nodes[i] = node_copy; + } + graph_copy->n_nodes = graph->n_nodes; + + free(hash_set.keys); + free(node_copies); + free(node_init); + + return (struct lm_ggml_backend_graph_copy) { + /* .buffer = */ buffer, + /* .ctx_allocated = */ ctx_allocated, + /* .ctx_unallocated = */ ctx_unallocated, + /* .graph = */ graph_copy, + }; +} + +void lm_ggml_backend_graph_copy_free(struct lm_ggml_backend_graph_copy copy) { + lm_ggml_backend_buffer_free(copy.buffer); + lm_ggml_free(copy.ctx_allocated); + lm_ggml_free(copy.ctx_unallocated); +} + +void lm_ggml_backend_compare_graph_backend(lm_ggml_backend_t backend1, lm_ggml_backend_t backend2, struct lm_ggml_cgraph * graph, lm_ggml_backend_eval_callback callback, void * user_data) { + struct lm_ggml_backend_graph_copy copy = lm_ggml_backend_graph_copy(backend2, graph); + struct lm_ggml_cgraph * g1 = graph; + struct lm_ggml_cgraph * g2 = copy.graph; + + assert(g1->n_nodes == g2->n_nodes); + + for (int i = 0; i < g1->n_nodes; i++) { + //printf("eval %d/%d\n", i, g1->n_nodes); + struct lm_ggml_tensor * t1 = g1->nodes[i]; + struct lm_ggml_tensor * t2 = g2->nodes[i]; + + assert(t1->op == t2->op && lm_ggml_are_same_layout(t1, t2)); + + struct lm_ggml_cgraph g1v = lm_ggml_graph_view(g1, i, i + 1); + struct lm_ggml_cgraph g2v = lm_ggml_graph_view(g2, i, i + 1); + + lm_ggml_backend_graph_compute(backend1, &g1v); + lm_ggml_backend_graph_compute(backend2, &g2v); + + if (lm_ggml_is_view_op(t1->op)) { + continue; + } + + // compare results, calculate rms etc + if (!callback(i, t1, t2, user_data)) { + break; + } + } + + lm_ggml_backend_graph_copy_free(copy); } diff --git a/cpp/ggml-backend.h b/cpp/ggml-backend.h index 33cc8a5..63fa013 100644 --- a/cpp/ggml-backend.h +++ b/cpp/ggml-backend.h @@ -1,115 +1,50 @@ #pragma once #include "ggml.h" +#include "ggml-alloc.h" #ifdef __cplusplus extern "C" { #endif - struct lm_ggml_backend; - struct lm_ggml_backend_buffer; - // type-erased backend-specific types / wrappers - typedef void * lm_ggml_backend_context_t; - typedef void * lm_ggml_backend_graph_plan_t; - typedef void * lm_ggml_backend_buffer_context_t; - - // avoid accessing internals of these types - typedef struct lm_ggml_backend * lm_ggml_backend_t; + typedef struct lm_ggml_backend_buffer_type * lm_ggml_backend_buffer_type_t; typedef struct lm_ggml_backend_buffer * lm_ggml_backend_buffer_t; + typedef struct lm_ggml_backend * lm_ggml_backend_t; + typedef void * lm_ggml_backend_graph_plan_t; // - // backend 
buffer + // Backend buffer // - struct lm_ggml_backend_buffer_i { - void (*free_buffer) (lm_ggml_backend_buffer_t buffer); - void * (*get_base) (lm_ggml_backend_buffer_t buffer); // get base pointer - size_t (*get_alloc_size)(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor); // pre-allocation callback - void (*init_tensor) (lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor); // post-allocation callback - void (*free_tensor) (lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor); // pre-free callback - }; - - // TODO: hide behind API - struct lm_ggml_backend_buffer { - struct lm_ggml_backend_buffer_i iface; - - lm_ggml_backend_t backend; - lm_ggml_backend_buffer_context_t context; - - size_t size; - }; - - // backend buffer functions - LM_GGML_API lm_ggml_backend_buffer_t lm_ggml_backend_buffer_init( - struct lm_ggml_backend * backend, - struct lm_ggml_backend_buffer_i iface, - lm_ggml_backend_buffer_context_t context, - size_t size); + // buffer type + LM_GGML_API lm_ggml_backend_buffer_t lm_ggml_backend_buft_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size); + LM_GGML_API size_t lm_ggml_backend_buft_get_alignment (lm_ggml_backend_buffer_type_t buft); + LM_GGML_API size_t lm_ggml_backend_buft_get_alloc_size(lm_ggml_backend_buffer_type_t buft, struct lm_ggml_tensor * tensor); + LM_GGML_API bool lm_ggml_backend_buft_supports_backend(lm_ggml_backend_buffer_type_t buft, lm_ggml_backend_t backend); + // buffer LM_GGML_API void lm_ggml_backend_buffer_free (lm_ggml_backend_buffer_t buffer); - LM_GGML_API size_t lm_ggml_backend_buffer_get_alignment (lm_ggml_backend_buffer_t buffer); LM_GGML_API void * lm_ggml_backend_buffer_get_base (lm_ggml_backend_buffer_t buffer); LM_GGML_API size_t lm_ggml_backend_buffer_get_size (lm_ggml_backend_buffer_t buffer); - LM_GGML_API size_t lm_ggml_backend_buffer_get_alloc_size(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor); LM_GGML_API void lm_ggml_backend_buffer_init_tensor (lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor); - LM_GGML_API void lm_ggml_backend_buffer_free_tensor (lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor); + LM_GGML_API size_t lm_ggml_backend_buffer_get_alignment (lm_ggml_backend_buffer_t buffer); + LM_GGML_API size_t lm_ggml_backend_buffer_get_alloc_size(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor); + LM_GGML_API lm_ggml_backend_buffer_type_t lm_ggml_backend_buffer_type(lm_ggml_backend_buffer_t buffer); // - // backend + // Backend // - struct lm_ggml_backend_i { - const char * (*get_name)(lm_ggml_backend_t backend); - - void (*free)(lm_ggml_backend_t backend); - - // buffer allocation - lm_ggml_backend_buffer_t (*alloc_buffer)(lm_ggml_backend_t backend, size_t size); - - // get buffer alignment - size_t (*get_alignment)(lm_ggml_backend_t backend); - - // tensor data access - // these functions can be asynchronous, helper functions are provided for synchronous access that automatically call synchronize - void (*set_tensor_async)(lm_ggml_backend_t backend, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size); - void (*get_tensor_async)(lm_ggml_backend_t backend, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size); - void (*synchronize) (lm_ggml_backend_t backend); - - // (optional) copy tensor between different backends, allow for single-copy tranfers - void (*cpy_tensor_from)(lm_ggml_backend_t backend, struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst); - void 
(*cpy_tensor_to) (lm_ggml_backend_t backend, struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst); - - // compute graph with a plan - lm_ggml_backend_graph_plan_t (*graph_plan_create) (lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph); - void (*graph_plan_free) (lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan); - void (*graph_plan_compute)(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan); - - // compute graph without a plan - void (*graph_compute)(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph); - - // check if the backend supports an operation - bool (*supports_op)(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op); - }; - - // TODO: hide behind API - struct lm_ggml_backend { - struct lm_ggml_backend_i iface; - - lm_ggml_backend_context_t context; - }; - - // backend helper functions - LM_GGML_API lm_ggml_backend_t lm_ggml_get_backend(const struct lm_ggml_tensor * tensor); LM_GGML_API const char * lm_ggml_backend_name(lm_ggml_backend_t backend); LM_GGML_API void lm_ggml_backend_free(lm_ggml_backend_t backend); - LM_GGML_API lm_ggml_backend_buffer_t lm_ggml_backend_alloc_buffer(lm_ggml_backend_t backend, size_t size); + LM_GGML_API lm_ggml_backend_buffer_type_t lm_ggml_backend_get_default_buffer_type(lm_ggml_backend_t backend); + LM_GGML_API lm_ggml_backend_buffer_t lm_ggml_backend_alloc_buffer(lm_ggml_backend_t backend, size_t size); + LM_GGML_API size_t lm_ggml_backend_get_alignment(lm_ggml_backend_t backend); - LM_GGML_API size_t lm_ggml_backend_get_alignment(lm_ggml_backend_t backend); - - LM_GGML_API void lm_ggml_backend_tensor_set_async( struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size); - LM_GGML_API void lm_ggml_backend_tensor_get_async(const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size); + LM_GGML_API void lm_ggml_backend_tensor_set_async(lm_ggml_backend_t backend, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size); + LM_GGML_API void lm_ggml_backend_tensor_get_async(lm_ggml_backend_t backend, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size); LM_GGML_API void lm_ggml_backend_tensor_set( struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size); LM_GGML_API void lm_ggml_backend_tensor_get(const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size); @@ -125,6 +60,7 @@ extern "C" { // tensor copy between different backends LM_GGML_API void lm_ggml_backend_tensor_copy(struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst); + LM_GGML_API void lm_ggml_backend_tensor_copy_async(lm_ggml_backend_t backend, struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst); // automatic fallback to sync copy // // CPU backend @@ -133,10 +69,112 @@ extern "C" { LM_GGML_API lm_ggml_backend_t lm_ggml_backend_cpu_init(void); LM_GGML_API bool lm_ggml_backend_is_cpu(lm_ggml_backend_t backend); - LM_GGML_API void lm_ggml_backend_cpu_set_n_threads(lm_ggml_backend_t backend_cpu, int n_threads); - LM_GGML_API lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_from_ptr(lm_ggml_backend_t backend_cpu, void * ptr, size_t size); + // Create a backend buffer from an existing pointer + LM_GGML_API lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size); + + LM_GGML_API lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_buffer_type(void); + + // + // Backend registry + // + + // The backend registry is a registry of all the available backends, and allows initializing 
backends in a generic way + + LM_GGML_API size_t lm_ggml_backend_reg_get_count(void); + LM_GGML_API size_t lm_ggml_backend_reg_find_by_name(const char * name); + LM_GGML_API lm_ggml_backend_t lm_ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is name[:params] + LM_GGML_API const char * lm_ggml_backend_reg_get_name(size_t i); + LM_GGML_API lm_ggml_backend_t lm_ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific + LM_GGML_API lm_ggml_backend_buffer_type_t lm_ggml_backend_reg_get_default_buffer_type(size_t i); + LM_GGML_API lm_ggml_backend_buffer_t lm_ggml_backend_reg_alloc_buffer(size_t i, size_t size); + + // + // Backend scheduler + // + + // The backend scheduler allows for multiple backends to be used together + // Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends + // The backends are selected based on: + // - the backend that supports the operation + // - the location of the pre-allocated tensors (e.g. the weights) + /* + Example usage: + + sched = lm_ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, num_backends); + // sched is initialized with measure allocators and cannot be used until allocated with a measure graph + + // initialize buffers from a measure graph + measure_graph = build_graph(sched); // use the allocr to allocate inputs as needed + + // in build_graph: + build_graph(...) { + // allocating tensors in a specific backend (optional, recommended: pre-allocate inputs in a different buffer) + alloc_cpu = lm_ggml_backend_sched_get_allocr(sched, backend_cpu); + lm_ggml_allocr_alloc(alloc_cpu, tensor); + + // manually assigning nodes to a backend (optional, shouldn't be needed in most cases) + struct lm_ggml_tensor * node = lm_ggml_mul_mat(ctx, ...); + lm_ggml_backend_sched_set_node_backend(sched, node, backend_gpu); + } + + // allocate backend buffers from measure graph + lm_ggml_backend_sched_init_measure(sched, measure_graph); + + // the scheduler is now ready to compute graphs + + // compute + graph = build_graph(sched); + lm_ggml_backend_sched_graph_compute(sched, graph); + */ + + struct lm_ggml_backend_sched; + typedef struct lm_ggml_backend_sched * lm_ggml_backend_sched_t; + + // Initialize a backend scheduler + LM_GGML_API lm_ggml_backend_sched_t lm_ggml_backend_sched_new(lm_ggml_backend_t * backends, int n_backends); + + LM_GGML_API void lm_ggml_backend_sched_free(lm_ggml_backend_sched_t sched); + + // Initialize backend buffers from a measure graph + LM_GGML_API void lm_ggml_backend_sched_init_measure(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * measure_graph); + + LM_GGML_API lm_ggml_tallocr_t lm_ggml_backend_sched_get_tallocr(lm_ggml_backend_sched_t sched, lm_ggml_backend_t backend); + LM_GGML_API lm_ggml_backend_buffer_t lm_ggml_backend_sched_get_buffer (lm_ggml_backend_sched_t sched, lm_ggml_backend_t backend); + + LM_GGML_API void lm_ggml_backend_sched_set_node_backend(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node, lm_ggml_backend_t backend); + + // Allocate a graph on the backend scheduler + LM_GGML_API void lm_ggml_backend_sched_graph_compute( + lm_ggml_backend_sched_t sched, + struct lm_ggml_cgraph * graph); + + + // + // Utils + // + + struct lm_ggml_backend_graph_copy { + lm_ggml_backend_buffer_t buffer; + struct lm_ggml_context * ctx_allocated; + struct lm_ggml_context * ctx_unallocated; + struct lm_ggml_cgraph * graph; + }; + + // Copy a graph to a different backend + LM_GGML_API struct 
lm_ggml_backend_graph_copy lm_ggml_backend_graph_copy(lm_ggml_backend_t backend, struct lm_ggml_cgraph * graph); + LM_GGML_API void lm_ggml_backend_graph_copy_free(struct lm_ggml_backend_graph_copy copy); + + typedef bool (*lm_ggml_backend_eval_callback)(int node_index, struct lm_ggml_tensor * t1, struct lm_ggml_tensor * t2, void * user_data); + + // Compare the output of two backends + LM_GGML_API void lm_ggml_backend_compare_graph_backend(lm_ggml_backend_t backend1, lm_ggml_backend_t backend2, struct lm_ggml_cgraph * graph, lm_ggml_backend_eval_callback callback, void * user_data); + + // Tensor initialization + LM_GGML_API void lm_ggml_backend_tensor_alloc(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, void * addr); + LM_GGML_API void lm_ggml_backend_view_init(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor); + #ifdef __cplusplus } diff --git a/cpp/ggml-impl.h b/cpp/ggml-impl.h index 07881b7..998c0ba 100644 --- a/cpp/ggml-impl.h +++ b/cpp/ggml-impl.h @@ -39,12 +39,6 @@ extern "C" { #endif #endif -#undef MIN -#undef MAX - -#define MIN(a, b) ((a) < (b) ? (a) : (b)) -#define MAX(a, b) ((a) > (b) ? (a) : (b)) - // 16-bit float // on Arm, we use __fp16 // on x86, we use uint16_t @@ -230,7 +224,19 @@ inline static float lm_ggml_lookup_fp16_to_fp32(lm_ggml_fp16_t f) { #endif - // TODO: backend v2 PR +#define LM_GGML_HASHTABLE_FULL ((size_t)-1) +#define LM_GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2) + +bool lm_ggml_hash_contains (const struct lm_ggml_hash_set hash_set, struct lm_ggml_tensor * key); + +// returns LM_GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted +size_t lm_ggml_hash_find (const struct lm_ggml_hash_set hash_set, struct lm_ggml_tensor * key); + +// returns LM_GGML_HASHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full +size_t lm_ggml_hash_insert ( struct lm_ggml_hash_set hash_set, struct lm_ggml_tensor * key); + +// return index, asserts if table is full +size_t lm_ggml_hash_find_or_insert( struct lm_ggml_hash_set hash_set, struct lm_ggml_tensor * key); #ifdef __cplusplus } diff --git a/cpp/ggml-metal-llama.metal b/cpp/ggml-metal-llama.metal index 7c35f23..2f8ea22 100644 --- a/cpp/ggml-metal-llama.metal +++ b/cpp/ggml-metal-llama.metal @@ -3,6 +3,8 @@ using namespace metal; #define MAX(x, y) ((x) > (y) ? (x) : (y)) +#define MIN(x, y) ((x) < (y) ? 
(x) : (y)) +#define SWAP(x, y) { auto tmp = (x); (x) = (y); (y) = tmp; } #define QK4_0 32 #define QR4_0 2 @@ -39,8 +41,15 @@ typedef struct { int8_t qs[QK8_0]; // quants } block_q8_0; -// general-purpose kernel for addition of two tensors -// pros: works for non-contiguous tensors, supports broadcast across dims 1, 2 and 3 +#define N_SIMDWIDTH 32 // assuming SIMD group size is 32 + +enum ggml_sort_order { + GGML_SORT_ASC, + GGML_SORT_DESC, +}; + +// general-purpose kernel for addition, multiplication and division of two tensors +// pros: works for non-contiguous tensors, supports broadcast across all dims // cons: not very efficient kernel void kernel_add( device const char * src0, @@ -81,16 +90,111 @@ kernel void kernel_add( const int64_t i12 = i02 % ne12; const int64_t i11 = i01 % ne11; - device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01 + tpitg.x*nb00; - device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11 + tpitg.x*nb10; - device char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1 + tpitg.x*nb0; + device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01; + device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11; + device char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1; for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) { - ((device float *)dst_ptr)[0] = ((device float *)src0_ptr)[0] + ((device float *)src1_ptr)[0]; + const int i10 = i0 % ne10; + *((device float *)(dst_ptr + i0*nb0)) = *((device float *)(src0_ptr + i0*nb00)) + *((device float *)(src1_ptr + i10*nb10)); + } +} + +kernel void kernel_mul( + device const char * src0, + device const char * src1, + device char * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant int64_t & nb00, + constant int64_t & nb01, + constant int64_t & nb02, + constant int64_t & nb03, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t & ne13, + constant int64_t & nb10, + constant int64_t & nb11, + constant int64_t & nb12, + constant int64_t & nb13, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant int64_t & nb0, + constant int64_t & nb1, + constant int64_t & nb2, + constant int64_t & nb3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i03 = tgpig.z; + const int64_t i02 = tgpig.y; + const int64_t i01 = tgpig.x; + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + + device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01; + device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11; + device char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1; - src0_ptr += ntg.x*nb00; - src1_ptr += ntg.x*nb10; - dst_ptr += ntg.x*nb0; + for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) { + const int i10 = i0 % ne10; + *((device float *)(dst_ptr + i0*nb0)) = *((device float *)(src0_ptr + i0*nb00)) * *((device float *)(src1_ptr + i10*nb10)); + } +} + +kernel void kernel_div( + device const char * src0, + device const char * src1, + device char * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant int64_t & nb00, + constant int64_t & nb01, + constant int64_t & nb02, + constant int64_t & nb03, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t 
& ne13, + constant int64_t & nb10, + constant int64_t & nb11, + constant int64_t & nb12, + constant int64_t & nb13, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant int64_t & nb0, + constant int64_t & nb1, + constant int64_t & nb2, + constant int64_t & nb3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i03 = tgpig.z; + const int64_t i02 = tgpig.y; + const int64_t i01 = tgpig.x; + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + + device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01; + device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11; + device char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1; + + for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) { + const int i10 = i0 % ne10; + *((device float *)(dst_ptr + i0*nb0)) = *((device float *)(src0_ptr + i0*nb00)) / *((device float *)(src1_ptr + i10*nb10)); } } @@ -105,23 +209,22 @@ kernel void kernel_add_row( dst[tpig] = src0[tpig] + src1[tpig % nb]; } -kernel void kernel_mul( +kernel void kernel_mul_row( device const float4 * src0, device const float4 * src1, device float4 * dst, + constant int64_t & nb [[buffer(27)]], uint tpig[[thread_position_in_grid]]) { - dst[tpig] = src0[tpig] * src1[tpig]; + dst[tpig] = src0[tpig] * src1[tpig % nb]; } -// assumption: src1 is a row -// broadcast src1 into src0 -kernel void kernel_mul_row( +kernel void kernel_div_row( device const float4 * src0, device const float4 * src1, device float4 * dst, - constant int64_t & nb, + constant int64_t & nb [[buffer(27)]], uint tpig[[thread_position_in_grid]]) { - dst[tpig] = src0[tpig] * src1[tpig % nb]; + dst[tpig] = src0[tpig] / src1[tpig % nb]; } kernel void kernel_scale( @@ -162,6 +265,54 @@ kernel void kernel_sqr( dst[tpig] = src0[tpig] * src0[tpig]; } +kernel void kernel_sum_rows( + device const float * src0, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant int64_t & nb00, + constant int64_t & nb01, + constant int64_t & nb02, + constant int64_t & nb03, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t & ne13, + constant int64_t & nb10, + constant int64_t & nb11, + constant int64_t & nb12, + constant int64_t & nb13, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant int64_t & nb0, + constant int64_t & nb1, + constant int64_t & nb2, + constant int64_t & nb3, + uint3 tpig[[thread_position_in_grid]]) { + int64_t i3 = tpig.z; + int64_t i2 = tpig.y; + int64_t i1 = tpig.x; + + if (i3 >= ne03 || i2 >= ne02 || i1 >= ne01) { + return; + } + + device const float * src_row = (device const float *) ((device const char *) src0 + i1*nb01 + i2*nb02 + i3*nb03); + device float * dst_row = (device float *) ((device char *) dst + i1*nb1 + i2*nb2 + i3*nb3); + + float row_sum = 0; + + for (int64_t i0 = 0; i0 < ne00; i0++) { + row_sum += src_row[i0]; + } + + dst_row[0] = row_sum; +} + constant float GELU_COEF_A = 0.044715f; constant float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; @@ -180,10 +331,12 @@ kernel void kernel_gelu( kernel void kernel_soft_max( device const float * src0, + device const float * src1, device float * dst, constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, + constant 
float & scale, threadgroup float * buf [[threadgroup(0)]], uint tgpig[[threadgroup_position_in_grid]], uint tpitg[[thread_position_in_threadgroup]], @@ -194,73 +347,77 @@ kernel void kernel_soft_max( const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01; const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01); - device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; - device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; + device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; + device const float * pmask = src1 ? src1 + i01*ne00 : nullptr; + device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; // parallel max - float lmax = tpitg < ne00 ? psrc0[tpitg] : -INFINITY; + float lmax = -INFINITY; - for (int i00 = tpitg + ntg; i00 < ne00; i00 += ntg) { - lmax = MAX(lmax, psrc0[i00]); + for (int i00 = tpitg; i00 < ne00; i00 += ntg) { + lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f)); } - float max = simd_max(lmax); - if (tiisg == 0) { - buf[sgitg] = max; - } + // find the max value in the block + float max_val = simd_max(lmax); + if (ntg > N_SIMDWIDTH) { + if (sgitg == 0) { + buf[tiisg] = -INFINITY; + } - threadgroup_barrier(mem_flags::mem_threadgroup); + threadgroup_barrier(mem_flags::mem_threadgroup); - // broadcast, simd group number is ntg / 32 - for (uint i = ntg / 32 / 2; i > 0; i /= 2) { - if (tpitg < i) { - buf[tpitg] = MAX(buf[tpitg], buf[tpitg + i]); - } - } + if (tiisg == 0) { + buf[sgitg] = max_val; + } - threadgroup_barrier(mem_flags::mem_threadgroup); + threadgroup_barrier(mem_flags::mem_threadgroup); - max = buf[0]; + max_val = buf[tiisg]; + max_val = simd_max(max_val); + } // parallel sum float lsum = 0.0f; for (int i00 = tpitg; i00 < ne00; i00 += ntg) { - const float exp_psrc0 = exp(psrc0[i00] - max); + const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f)) - max_val); lsum += exp_psrc0; - // Remember the result of exp here. exp is expensive, so we really do not - // wish to compute it twice. 
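        // each thread accumulates exp((x*scale + mask) - max_val) over its strided slice of
        // the row (the mask term contributes 0 when src1 is null) and caches the exponentials
        // in dst, so the final pass below only multiplies by 1/sum instead of recomputing exp:
        //   dst[i] = exp(scale*x[i] + mask[i] - max) / sum_j exp(scale*x[j] + mask[j] - max)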
pdst[i00] = exp_psrc0; } float sum = simd_sum(lsum); - if (tiisg == 0) { - buf[sgitg] = sum; - } + if (ntg > N_SIMDWIDTH) { + if (sgitg == 0) { + buf[tiisg] = 0.0f; + } - threadgroup_barrier(mem_flags::mem_threadgroup); + threadgroup_barrier(mem_flags::mem_threadgroup); - // broadcast, simd group number is ntg / 32 - for (uint i = ntg / 32 / 2; i > 0; i /= 2) { - if (tpitg < i) { - buf[tpitg] += buf[tpitg + i]; - } - } + if (tiisg == 0) { + buf[sgitg] = sum; + } - threadgroup_barrier(mem_flags::mem_threadgroup); + threadgroup_barrier(mem_flags::mem_threadgroup); - sum = buf[0]; + sum = buf[tiisg]; + sum = simd_sum(sum); + } + + const float inv_sum = 1.0f/sum; for (int i00 = tpitg; i00 < ne00; i00 += ntg) { - pdst[i00] /= sum; + pdst[i00] *= inv_sum; } } kernel void kernel_soft_max_4( device const float * src0, + device const float * src1, device float * dst, constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, + constant float & scale, threadgroup float * buf [[threadgroup(0)]], uint tgpig[[threadgroup_position_in_grid]], uint tpitg[[thread_position_in_threadgroup]], @@ -271,64 +428,68 @@ kernel void kernel_soft_max_4( const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01; const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01); - device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); - device float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); + device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); + device const float4 * pmask = src1 ? (device const float4 *)(src1 + i01*ne00) : nullptr; + device float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); // parallel max - float4 lmax4 = tpitg < ne00/4 ? psrc4[tpitg] : -INFINITY; + float4 lmax4 = -INFINITY; - for (int i00 = tpitg + ntg; i00 < ne00/4; i00 += ntg) { - lmax4 = fmax(lmax4, psrc4[i00]); + for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) { + lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f)); } const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3])); - float max = simd_max(lmax); - if (tiisg == 0) { - buf[sgitg] = max; - } - threadgroup_barrier(mem_flags::mem_threadgroup); + float max_val = simd_max(lmax); + if (ntg > N_SIMDWIDTH) { + if (sgitg == 0) { + buf[tiisg] = -INFINITY; + } - // broadcast, simd group number is ntg / 32 - for (uint i = ntg / 32 / 2; i > 0; i /= 2) { - if (tpitg < i) { - buf[tpitg] = MAX(buf[tpitg], buf[tpitg + i]); - } - } + threadgroup_barrier(mem_flags::mem_threadgroup); - threadgroup_barrier(mem_flags::mem_threadgroup); + if (tiisg == 0) { + buf[sgitg] = max_val; + } - max = buf[0]; + threadgroup_barrier(mem_flags::mem_threadgroup); + + max_val = buf[tiisg]; + max_val = simd_max(max_val); + } // parallel sum float4 lsum4 = 0.0f; for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) { - const float4 exp_psrc4 = exp(psrc4[i00] - max); + const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? 
pmask[i00] : 0.0f)) - max_val); lsum4 += exp_psrc4; pdst4[i00] = exp_psrc4; } const float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3]; float sum = simd_sum(lsum); - if (tiisg == 0) { - buf[sgitg] = sum; - } + if (ntg > N_SIMDWIDTH) { + if (sgitg == 0) { + buf[tiisg] = 0.0f; + } - threadgroup_barrier(mem_flags::mem_threadgroup); + threadgroup_barrier(mem_flags::mem_threadgroup); - // broadcast, simd group number is ntg / 32 - for (uint i = ntg / 32 / 2; i > 0; i /= 2) { - if (tpitg < i) { - buf[tpitg] += buf[tpitg + i]; - } - } + if (tiisg == 0) { + buf[sgitg] = sum; + } - threadgroup_barrier(mem_flags::mem_threadgroup); + threadgroup_barrier(mem_flags::mem_threadgroup); - sum = buf[0]; + sum = buf[tiisg]; + sum = simd_sum(sum); + } + + const float inv_sum = 1.0f/sum; for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) { - pdst4[i00] /= sum; + pdst4[i00] *= inv_sum; } } @@ -435,14 +596,13 @@ kernel void kernel_rms_norm( constant int64_t & ne00, constant uint64_t & nb01, constant float & eps, - threadgroup float * sum [[threadgroup(0)]], + threadgroup float * buf [[threadgroup(0)]], uint tgpig[[threadgroup_position_in_grid]], uint tpitg[[thread_position_in_threadgroup]], uint sgitg[[simdgroup_index_in_threadgroup]], uint tiisg[[thread_index_in_simdgroup]], uint ntg[[threads_per_threadgroup]]) { - device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01); - device const float * x_scalar = (device const float *) x; + device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01); float4 sumf = 0; float all_sum = 0; @@ -453,40 +613,30 @@ kernel void kernel_rms_norm( } all_sum = sumf[0] + sumf[1] + sumf[2] + sumf[3]; all_sum = simd_sum(all_sum); - if (tiisg == 0) { - sum[sgitg] = all_sum; - } + if (ntg > N_SIMDWIDTH) { + if (sgitg == 0) { + buf[tiisg] = 0.0f; + } - threadgroup_barrier(mem_flags::mem_threadgroup); + threadgroup_barrier(mem_flags::mem_threadgroup); - // broadcast, simd group number is ntg / 32 - for (uint i = ntg / 32 / 2; i > 0; i /= 2) { - if (tpitg < i) { - sum[tpitg] += sum[tpitg + i]; - } - } - if (tpitg == 0) { - for (int i = 4 * (ne00 / 4); i < ne00; i++) { - sum[0] += x_scalar[i]; + if (tiisg == 0) { + buf[sgitg] = all_sum; } - sum[0] /= ne00; - } - threadgroup_barrier(mem_flags::mem_threadgroup); + threadgroup_barrier(mem_flags::mem_threadgroup); - const float mean = sum[0]; + all_sum = buf[tiisg]; + all_sum = simd_sum(all_sum); + } + + const float mean = all_sum/ne00; const float scale = 1.0f/sqrt(mean + eps); device float4 * y = (device float4 *) (dst + tgpig*ne00); - device float * y_scalar = (device float *) y; for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) { y[i00] = x[i00] * scale; } - if (tpitg == 0) { - for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) { - y_scalar[i00] = x_scalar[i00] * scale; - } - } } // function for calculate inner product between half a q4_0 block and 16 floats (yl), sumy is SUM(yl[i]) @@ -576,15 +726,25 @@ inline float block_q_n_dot_y(device const block_q5_1 * qb_curr, float sumy, thre // putting them in the kernel cause a significant performance penalty #define N_DST 4 // each SIMD group works on 4 rows #define N_SIMDGROUP 2 // number of SIMD groups in a thread group -#define N_SIMDWIDTH 32 // assuming SIMD group size is 32 //Note: This is a template, but strictly speaking it only applies to // quantizations where the block size is 32. 
It also does not // giard against the number of rows not being divisible by // N_DST, so this is another explicit assumption of the implementation. template -void mul_vec_q_n_f32(device const void * src0, device const float * src1, device float * dst, - int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne10, int64_t ne12, int64_t ne0, int64_t ne1, uint gqa, - uint3 tgpig, uint tiisg, uint sgitg) { +void mul_vec_q_n_f32( + device const void * src0, + device const float * src1, + device float * dst, + int64_t ne00, + int64_t ne01, + int64_t ne02, + int64_t ne10, + int64_t ne12, + int64_t ne0, + int64_t ne1, + uint r2, + uint r3, + uint3 tgpig, uint tiisg, uint sgitg) { const int nb = ne00/QK4_0; const int r0 = tgpig.x; @@ -593,7 +753,10 @@ void mul_vec_q_n_f32(device const void * src0, device const float * src1, device const int first_row = (r0 * nsg + sgitg) * nr; - const uint offset0 = first_row * nb + im/gqa*(nb*ne0); + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); device const block_q_type * x = (device const block_q_type *) src0 + offset0; device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; @@ -643,13 +806,14 @@ kernel void kernel_mul_mv_q4_0_f32( constant int64_t & ne02[[buffer(5)]], constant int64_t & ne10[[buffer(9)]], constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0[[buffer(15)]], - constant int64_t & ne1[[buffer(16)]], - constant uint & gqa[[buffer(17)]], + constant int64_t & ne0 [[buffer(15)]], + constant int64_t & ne1 [[buffer(16)]], + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - mul_vec_q_n_f32(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg); + mul_vec_q_n_f32(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,tgpig,tiisg,sgitg); } kernel void kernel_mul_mv_q4_1_f32( @@ -661,13 +825,14 @@ kernel void kernel_mul_mv_q4_1_f32( constant int64_t & ne02[[buffer(5)]], constant int64_t & ne10[[buffer(9)]], constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0[[buffer(15)]], - constant int64_t & ne1[[buffer(16)]], - constant uint & gqa[[buffer(17)]], + constant int64_t & ne0 [[buffer(15)]], + constant int64_t & ne1 [[buffer(16)]], + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - mul_vec_q_n_f32(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg); + mul_vec_q_n_f32(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,tgpig,tiisg,sgitg); } kernel void kernel_mul_mv_q5_0_f32( @@ -679,13 +844,14 @@ kernel void kernel_mul_mv_q5_0_f32( constant int64_t & ne02[[buffer(5)]], constant int64_t & ne10[[buffer(9)]], constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0[[buffer(15)]], - constant int64_t & ne1[[buffer(16)]], - constant uint & gqa[[buffer(17)]], + constant int64_t & ne0 [[buffer(15)]], + constant int64_t & ne1 [[buffer(16)]], + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - mul_vec_q_n_f32(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg); + 
mul_vec_q_n_f32(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,tgpig,tiisg,sgitg); } kernel void kernel_mul_mv_q5_1_f32( @@ -697,13 +863,14 @@ kernel void kernel_mul_mv_q5_1_f32( constant int64_t & ne02[[buffer(5)]], constant int64_t & ne10[[buffer(9)]], constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0[[buffer(15)]], - constant int64_t & ne1[[buffer(16)]], - constant uint & gqa[[buffer(17)]], + constant int64_t & ne0 [[buffer(15)]], + constant int64_t & ne1 [[buffer(16)]], + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - mul_vec_q_n_f32(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg); + mul_vec_q_n_f32(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,tgpig,tiisg,sgitg); } @@ -718,9 +885,10 @@ kernel void kernel_mul_mv_q8_0_f32( constant int64_t & ne02[[buffer(5)]], constant int64_t & ne10[[buffer(9)]], constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0[[buffer(15)]], - constant int64_t & ne1[[buffer(16)]], - constant uint & gqa[[buffer(17)]], + constant int64_t & ne0 [[buffer(15)]], + constant int64_t & ne1 [[buffer(16)]], + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -732,8 +900,14 @@ kernel void kernel_mul_mv_q8_0_f32( const int r0 = tgpig.x; const int r1 = tgpig.y; const int im = tgpig.z; + const int first_row = (r0 * nsg + sgitg) * nr; - const uint offset0 = first_row * nb + im/gqa*(nb*ne0); + + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + device const block_q8_0 * x = (device const block_q8_0 *) src0 + offset0; device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; @@ -791,14 +965,21 @@ kernel void kernel_mul_mv_f32_f32( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], - uint tiisg[[thread_index_in_simdgroup]]) { + uint tiisg[[thread_index_in_simdgroup]]) { const int64_t r0 = tgpig.x; const int64_t rb = tgpig.y*N_F32_F32; const int64_t im = tgpig.z; - device const float * x = (device const float *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02); + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb02*ne02; + + device const float * x = (device const float *) (src0 + offset0); if (ne00 < 128) { for (int row = 0; row < N_F32_F32; ++row) { @@ -844,6 +1025,86 @@ kernel void kernel_mul_mv_f32_f32( } } +#define N_F16_F16 4 + +kernel void kernel_mul_mv_f16_f16( + device const char * src0, + device const char * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]]) { + + const int64_t r0 = 
tgpig.x; + const int64_t rb = tgpig.y*N_F16_F16; + const int64_t im = tgpig.z; + + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb02*ne02; + + device const half * x = (device const half *) (src0 + offset0); + + if (ne00 < 128) { + for (int row = 0; row < N_F16_F16; ++row) { + int r1 = rb + row; + if (r1 >= ne11) { + break; + } + + device const half * y = (device const half *) (src1 + r1*nb11 + im*nb12); + + float sumf = 0; + for (int i = tiisg; i < ne00; i += 32) { + sumf += (half) x[i] * (half) y[i]; + } + + float all_sum = simd_sum(sumf); + if (tiisg == 0) { + dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum; + } + } + } else { + device const half4 * x4 = (device const half4 *)x; + for (int row = 0; row < N_F16_F16; ++row) { + int r1 = rb + row; + if (r1 >= ne11) { + break; + } + + device const half * y = (device const half *) (src1 + r1*nb11 + im*nb12); + device const half4 * y4 = (device const half4 *) y; + + float sumf = 0; + for (int i = tiisg; i < ne00/4; i += 32) { + for (int k = 0; k < 4; ++k) sumf += (half) x4[i][k] * y4[i][k]; + } + + float all_sum = simd_sum(sumf); + if (tiisg == 0) { + for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (half) x[i] * y[i]; + dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum; + } + } + } +} + kernel void kernel_mul_mv_f16_f32_1row( device const char * src0, device const char * src1, @@ -862,6 +1123,8 @@ kernel void kernel_mul_mv_f16_f32_1row( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]]) { @@ -869,7 +1132,12 @@ kernel void kernel_mul_mv_f16_f32_1row( const int64_t r1 = tgpig.y; const int64_t im = tgpig.z; - device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02); + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb02*ne02; + + device const half * x = (device const half *) (src0 + offset0); device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12); float sumf = 0; @@ -916,6 +1184,8 @@ kernel void kernel_mul_mv_f16_f32( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]]) { @@ -923,7 +1193,12 @@ kernel void kernel_mul_mv_f16_f32( const int64_t rb = tgpig.y*N_F16_F32; const int64_t im = tgpig.z; - device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02); + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb02*ne02; + + device const half * x = (device const half *) (src0 + offset0); if (ne00 < 128) { for (int row = 0; row < N_F16_F32; ++row) { @@ -988,6 +1263,8 @@ kernel void kernel_mul_mv_f16_f32_l4( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]]) { @@ -995,7 +1272,12 @@ kernel void kernel_mul_mv_f16_f32_l4( const int64_t r0 = tgpig.x; const int64_t im = tgpig.z; - device const half4 * x4 = (device const half4 *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02); + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = r0*nb01 + 
(i12/r2)*nb02 + (i13/r3)*nb02*ne02; + + device const half4 * x4 = (device const half4 *) (src0 + offset0); for (int r1 = 0; r1 < nrows; ++r1) { device const float4 * y4 = (device const float4 *) (src1 + r1*nb11 + im*nb12); @@ -1047,17 +1329,21 @@ kernel void kernel_alibi_f32( const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0); const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0; const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0); + const int64_t k = i3*ne3 + i2; - device float * dst_data = (device float *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); float m_k; - if (i2 < n_heads_log2_floor) { - m_k = pow(m0, i2 + 1); + if (k < n_heads_log2_floor) { + m_k = pow(m0, k + 1); } else { - m_k = pow(m1, 2 * (i2 - n_heads_log2_floor) + 1); + m_k = pow(m1, 2 * (k - n_heads_log2_floor) + 1); } + + device char * dst_row = (device char *) dst + i3*nb3 + i2*nb2 + i1*nb1; + device const char * src_row = (device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01; for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) { - device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00); - dst_data[i00] = src[0] + m_k * (i00 - ne00 + 1); + const float src_v = *(device float *)(src_row + i00*nb00); + device float * dst_v = (device float *)(dst_row + i00*nb0); + *dst_v = i00 * m_k + src_v; } } @@ -1201,33 +1487,118 @@ kernel void kernel_rope( dst_data[1] = x0*sin_theta + x1*cos_theta; } } else { - for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { - for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) { + for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { + for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) { + + // simplified from `(ib * n_dims + ic) * inv_ndims` + const float cur_rot = inv_ndims*ic - ib; + + const float theta = theta_0 * pow(freq_base, cur_rot); + float cos_theta, sin_theta; + rope_yarn(theta, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta); + + const int64_t i0 = ib*n_dims + ic/2; + + device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + const float x0 = src[0]; + const float x1 = src[n_dims/2]; + + dst_data[0] = x0*cos_theta - x1*sin_theta; + dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta; + } + } + } +} + +template [[host_name("kernel_rope_f32")]] kernel rope_t kernel_rope; +template [[host_name("kernel_rope_f16")]] kernel rope_t kernel_rope; + +kernel void kernel_im2col_f16( + device const float * x, + device half * dst, + constant int32_t & ofs0, + constant int32_t & ofs1, + constant int32_t & IW, + constant int32_t & IH, + constant int32_t & CHW, + constant int32_t & s0, + constant int32_t & s1, + constant int32_t & p0, + constant int32_t & p1, + constant int32_t & d0, + constant int32_t & d1, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tgpg[[threadgroups_per_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int32_t iiw = tgpig[2] * s0 + tpitg[2] * d0 - p0; + const int32_t iih = tgpig[1] * s1 + tpitg[1] * d1 - p1; + + const int32_t offset_dst = + (tpitg[0] * tgpg[1] * tgpg[2] + tgpig[1] * tgpg[2] + tgpig[2]) * CHW + + (tgpig[0] * (ntg[1] * ntg[2]) + tpitg[1] * ntg[2] + tpitg[2]); + + if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) { + dst[offset_dst] = 0.0f; + } else { + const int32_t offset_src = tpitg[0] * ofs0 + tgpig[0] * ofs1; + dst[offset_dst] = x[offset_src 
+ iih * IW + iiw]; + } +} - // simplified from `(ib * n_dims + ic) * inv_ndims` - const float cur_rot = inv_ndims*ic - ib; +// bitonic sort implementation following the CUDA kernels as reference +typedef void (argsort_t)( + device const float * x, + device int32_t * dst, + constant int64_t & ncols, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]]); - const float theta = theta_0 * pow(freq_base, cur_rot); - float cos_theta, sin_theta; - rope_yarn(theta, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta); +template +kernel void kernel_argsort_f32_i32( + device const float * x, + device int32_t * dst, + constant int64_t & ncols, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]]) { + // bitonic sort + int col = tpitg[0]; + int row = tgpig[1]; - const int64_t i0 = ib*n_dims + ic/2; + if (col >= ncols) return; - device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + device const float * x_row = x + row * ncols; + device int32_t * dst_row = dst + row * ncols; - const float x0 = src[0]; - const float x1 = src[n_dims/2]; + // initialize indices + if (col < ncols) { + dst_row[col] = col; + } + threadgroup_barrier(mem_flags::mem_threadgroup); - dst_data[0] = x0*cos_theta - x1*sin_theta; - dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta; + for (int k = 2; k <= ncols; k *= 2) { + for (int j = k / 2; j > 0; j /= 2) { + int ixj = col ^ j; + if (ixj > col) { + if ((col & k) == 0) { + if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) { + SWAP(dst_row[col], dst_row[ixj]); + } + } else { + if (order == GGML_SORT_ASC ? 
x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) { + SWAP(dst_row[col], dst_row[ixj]); + } + } } + threadgroup_barrier(mem_flags::mem_threadgroup); } } } -template [[host_name("kernel_rope_f32")]] kernel rope_t kernel_rope; -template [[host_name("kernel_rope_f16")]] kernel rope_t kernel_rope; +template [[host_name("kernel_argsort_f32_i32_asc")]] kernel argsort_t kernel_argsort_f32_i32; +template [[host_name("kernel_argsort_f32_i32_desc")]] kernel argsort_t kernel_argsort_f32_i32; kernel void kernel_cpy_f16_f16( device const half * src0, @@ -1354,6 +1725,197 @@ kernel void kernel_cpy_f32_f32( } } +kernel void kernel_cpy_f32_q8_0( + device const float * src0, + device void * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i03 = tgpig[2]; + const int64_t i02 = tgpig[1]; + const int64_t i01 = tgpig[0]; + + const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; + + const int64_t i3 = n / (ne2*ne1*ne0); + const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0); + const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0; + const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0)/QK8_0; + + device block_q8_0 * dst_data = (device block_q8_0 *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + for (int64_t i00 = tpitg.x*QK8_0; i00 < ne00; i00 += ntg.x*QK8_0) { + device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00); + + float amax = 0.0f; // absolute max + + for (int j = 0; j < QK8_0; j++) { + const float v = src[j]; + amax = MAX(amax, fabs(v)); + } + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 
1.0f/d : 0.0f; + + dst_data[i00/QK8_0].d = d; + + for (int j = 0; j < QK8_0; ++j) { + const float x0 = src[j]*id; + + dst_data[i00/QK8_0].qs[j] = round(x0); + } + } +} + +kernel void kernel_cpy_f32_q4_0( + device const float * src0, + device void * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i03 = tgpig[2]; + const int64_t i02 = tgpig[1]; + const int64_t i01 = tgpig[0]; + + const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; + + const int64_t i3 = n / (ne2*ne1*ne0); + const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0); + const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0; + const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0)/QK4_0; + + device block_q4_0 * dst_data = (device block_q4_0 *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + for (int64_t i00 = tpitg.x*QK4_0; i00 < ne00; i00 += ntg.x*QK4_0) { + device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00); + + float amax = 0.0f; // absolute max + float max = 0.0f; + + for (int j = 0; j < QK4_0; j++) { + const float v = src[j]; + if (amax < fabs(v)) { + amax = fabs(v); + max = v; + } + } + + const float d = max / -8; + const float id = d ? 1.0f/d : 0.0f; + + dst_data[i00/QK4_0].d = d; + + for (int j = 0; j < QK4_0/2; ++j) { + const float x0 = src[0 + j]*id; + const float x1 = src[QK4_0/2 + j]*id; + + const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f)); + const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f)); + + dst_data[i00/QK4_0].qs[j] = xi0; + dst_data[i00/QK4_0].qs[j] |= xi1 << 4; + } + } +} + +kernel void kernel_cpy_f32_q4_1( + device const float * src0, + device void * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i03 = tgpig[2]; + const int64_t i02 = tgpig[1]; + const int64_t i01 = tgpig[0]; + + const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; + + const int64_t i3 = n / (ne2*ne1*ne0); + const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0); + const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0; + const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0)/QK4_1; + + device block_q4_1 * dst_data = (device block_q4_1 *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + for (int64_t i00 = tpitg.x*QK4_1; i00 < ne00; i00 += ntg.x*QK4_1) { + device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00); + + float min = FLT_MAX; + float max = -FLT_MAX; + + for (int j = 0; j < QK4_1; j++) { + 
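            // scan the QK4_1 floats of this block for their min and max; Q4_1 encodes the
            // block with scale d = (max - min)/15 and offset m = min, and each value is
            // stored as a 4-bit index round((x - min)/d), packed two indices per byte
            // (low nibble from the first half of the block, high nibble from the second)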
const float v = src[j]; + if (min > v) min = v; + if (max < v) max = v; + } + + const float d = (max - min) / ((1 << 4) - 1); + const float id = d ? 1.0f/d : 0.0f; + + dst_data[i00/QK4_1].d = d; + dst_data[i00/QK4_1].m = min; + + for (int j = 0; j < QK4_1/2; ++j) { + const float x0 = (src[0 + j] - min)*id; + const float x1 = (src[QK4_1/2 + j] - min)*id; + + const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f)); + const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f)); + + dst_data[i00/QK4_1].qs[j] = xi0; + dst_data[i00/QK4_1].qs[j] |= xi1 << 4; + } + } +} + kernel void kernel_concat( device const char * src0, device const char * src1, @@ -1511,23 +2073,30 @@ kernel void kernel_mul_mv_q2_K_f32( constant int64_t & ne02[[buffer(5)]], constant int64_t & ne10[[buffer(9)]], constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0[[buffer(15)]], - constant int64_t & ne1[[buffer(16)]], - constant uint & gqa[[buffer(17)]], + constant int64_t & ne0 [[buffer(15)]], + constant int64_t & ne1 [[buffer(16)]], + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { const int nb = ne00/QK_K; const int r0 = tgpig.x; const int r1 = tgpig.y; - const int r2 = tgpig.z; + const int im = tgpig.z; const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; const int ib_row = first_row * nb; - const uint offset0 = r2/gqa*(nb*ne0); + + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + device const block_q2_K * x = (device const block_q2_K *) src0 + ib_row + offset0; - device const float * y = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; + device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + float yl[32]; float sumf[N_DST]={0.f}, all_sum; @@ -1536,11 +2105,11 @@ kernel void kernel_mul_mv_q2_K_f32( #if QK_K == 256 const int ix = tiisg/8; // 0...3 const int it = tiisg%8; // 0...7 - const int im = it/4; // 0 or 1 + const int iq = it/4; // 0 or 1 const int ir = it%4; // 0...3 const int is = (8*ir)/16;// 0 or 1 - device const float * y4 = y + ix * QK_K + 128 * im + 8 * ir; + device const float * y4 = y + ix * QK_K + 128 * iq + 8 * ir; for (int ib = ix; ib < nb; ib += 4) { @@ -1552,8 +2121,8 @@ kernel void kernel_mul_mv_q2_K_f32( yl[i+24] = y4[i+96]; sumy[3] += yl[i+24]; } - device const uint8_t * sc = (device const uint8_t *)x[ib].scales + 8*im + is; - device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 16 * im + 4 * ir; + device const uint8_t * sc = (device const uint8_t *)x[ib].scales + 8*iq + is; + device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 16 * iq + 4 * ir; device const half * dh = &x[ib].d; for (int row = 0; row < N_DST; row++) { @@ -1640,7 +2209,7 @@ kernel void kernel_mul_mv_q2_K_f32( for (int row = 0; row < N_DST; ++row) { all_sum = simd_sum(sumf[row]); if (tiisg == 0) { - dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = all_sum; + dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum; } } } @@ -1655,9 +2224,10 @@ kernel void kernel_mul_mv_q3_K_f32( constant int64_t & ne02[[buffer(5)]], constant int64_t & ne10[[buffer(9)]], constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0[[buffer(15)]], - constant int64_t & ne1[[buffer(16)]], - constant uint & gqa[[buffer(17)]], + constant int64_t & ne0 [[buffer(15)]], + constant int64_t 
& ne1 [[buffer(16)]], + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -1666,12 +2236,17 @@ kernel void kernel_mul_mv_q3_K_f32( const int64_t r0 = tgpig.x; const int64_t r1 = tgpig.y; - const int64_t r2 = tgpig.z; + const int64_t im = tgpig.z; const int first_row = (r0 * N_SIMDGROUP + sgitg) * 2; - const uint offset0 = r2/gqa*(nb*ne0); + + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + device const block_q3_K * x = (device const block_q3_K *) src0 + first_row*nb + offset0; - device const float * yy = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; + device const float * yy = (device const float *) src1 + r1*ne10 + im*ne00*ne1; float yl[32]; @@ -1793,7 +2368,7 @@ kernel void kernel_mul_mv_q3_K_f32( } if (tiisg == 0) { for (int row = 0; row < 2; ++row) { - dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = sumf1[row]; + dst[r1*ne0 + im*ne0*ne1 + first_row + row] = sumf1[row]; } } } @@ -1807,26 +2382,33 @@ kernel void kernel_mul_mv_q3_K_f32( constant int64_t & ne02[[buffer(5)]], constant int64_t & ne10[[buffer(9)]], constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0[[buffer(15)]], - constant int64_t & ne1[[buffer(16)]], - constant uint & gqa[[buffer(17)]], + constant int64_t & ne0 [[buffer(15)]], + constant int64_t & ne1 [[buffer(16)]], + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { const int nb = ne00/QK_K; const int64_t r0 = tgpig.x; const int64_t r1 = tgpig.y; - const int64_t r2 = tgpig.z; + const int64_t im = tgpig.z; const int row = 2 * r0 + sgitg; - const uint offset0 = r2/gqa*(nb*ne0); + + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + device const block_q3_K * x = (device const block_q3_K *) src0 + row*nb + offset0; - device const float * yy = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; + device const float * yy = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + const int ix = tiisg/4; const int il = 4 * (tiisg%4);// 0, 4, 8, 12 - const int im = il/8; // 0, 0, 1, 1 + const int iq = il/8; // 0, 0, 1, 1 const int in = il%8; // 0, 4, 0, 4 float2 sum = {0.f, 0.f}; @@ -1846,7 +2428,7 @@ kernel void kernel_mul_mv_q3_K_f32( const float d4 = d_all * ((int32_t)(s[0] & 0xF000) - 32768) * 1.f/262144.f; for (int l = 0; l < 4; l += 2) { - const uint16_t hm = h[l/2] >> im; + const uint16_t hm = h[l/2] >> iq; sum[0] += y[l+ 0] * d1 * ((int32_t)(q[l/2] & 0x0003) - ((hm & 0x0001) ? 0 : 4)) + y[l+16] * d2 * ((int32_t)(q[l/2] & 0x000c) - ((hm & 0x0004) ? 0 : 16)) + y[l+32] * d3 * ((int32_t)(q[l/2] & 0x0030) - ((hm & 0x0010) ? 
0 : 64)) @@ -1862,7 +2444,7 @@ kernel void kernel_mul_mv_q3_K_f32( const float tot = simd_sum(sumf); if (tiisg == 0) { - dst[r1*ne0 + r2*ne0*ne1 + row] = tot; + dst[r1*ne0 + im*ne0*ne1 + row] = tot; } } @@ -1880,10 +2462,11 @@ kernel void kernel_mul_mv_q4_K_f32( constant int64_t & ne12 [[buffer(11)]], constant int64_t & ne0 [[buffer(15)]], constant int64_t & ne1 [[buffer(16)]], - constant uint & gqa [[buffer(17)]], + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { const uint16_t kmask1 = 0x3f3f; const uint16_t kmask2 = 0x0f0f; @@ -1891,26 +2474,32 @@ kernel void kernel_mul_mv_q4_K_f32( const int ix = tiisg/8; // 0...3 const int it = tiisg%8; // 0...7 - const int im = it/4; // 0 or 1 + const int iq = it/4; // 0 or 1 const int ir = it%4; // 0...3 const int nb = ne00/QK_K; const int r0 = tgpig.x; const int r1 = tgpig.y; - const int r2 = tgpig.z; + const int im = tgpig.z; //const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; const int first_row = r0 * N_DST; const int ib_row = first_row * nb; - const uint offset0 = r2/gqa*(nb*ne0); + + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0; - device const float * y = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; + device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + float yl[16]; float yh[16]; float sumf[N_DST]={0.f}, all_sum; const int step = sizeof(block_q4_K) * nb / 2; - device const float * y4 = y + ix * QK_K + 64 * im + 8 * ir; + device const float * y4 = y + ix * QK_K + 64 * iq + 8 * ir; uint16_t sc16[4]; thread const uint8_t * sc8 = (thread const uint8_t *)sc16; @@ -1925,8 +2514,8 @@ kernel void kernel_mul_mv_q4_K_f32( yh[i+8] = y4[i+160]; sumy[3] += yh[i+8]; } - device const uint16_t * sc = (device const uint16_t *)x[ib].scales + im; - device const uint16_t * q1 = (device const uint16_t *)x[ib].qs + 16 * im + 4 * ir; + device const uint16_t * sc = (device const uint16_t *)x[ib].scales + iq; + device const uint16_t * q1 = (device const uint16_t *)x[ib].qs + 16 * iq + 4 * ir; device const half * dh = &x[ib].d; for (int row = 0; row < N_DST; row++) { @@ -1970,7 +2559,7 @@ kernel void kernel_mul_mv_q4_K_f32( for (int row = 0; row < N_DST; ++row) { all_sum = simd_sum(sumf[row]); if (tiisg == 0) { - dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = all_sum; + dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum; } } } @@ -1984,9 +2573,10 @@ kernel void kernel_mul_mv_q4_K_f32( constant int64_t & ne02[[buffer(5)]], constant int64_t & ne10[[buffer(9)]], constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0[[buffer(15)]], - constant int64_t & ne1[[buffer(16)]], - constant uint & gqa[[buffer(17)]], + constant int64_t & ne0 [[buffer(15)]], + constant int64_t & ne1 [[buffer(16)]], + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -1997,12 +2587,18 @@ kernel void kernel_mul_mv_q4_K_f32( const int nb = ne00/QK_K; const int r0 = tgpig.x; const int r1 = tgpig.y; - const int r2 = tgpig.z; + const int im = tgpig.z; const int first_row = (r0 * 
N_SIMDGROUP + sgitg) * N_DST; const int ib_row = first_row * nb; - const uint offset0 = r2/gqa*(nb*ne0); + + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0; - device const float * y = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; + device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + float yl[8]; float yh[8]; float sumf[N_DST]={0.f}, all_sum; @@ -2058,7 +2654,7 @@ kernel void kernel_mul_mv_q4_K_f32( for (int row = 0; row < N_DST; ++row) { all_sum = simd_sum(sumf[row]); if (tiisg == 0) { - dst[r1*ne0+ r2*ne0*ne1 + first_row + row] = all_sum; + dst[r1*ne0+ im*ne0*ne1 + first_row + row] = all_sum; } } } @@ -2073,9 +2669,10 @@ kernel void kernel_mul_mv_q5_K_f32( constant int64_t & ne02[[buffer(5)]], constant int64_t & ne10[[buffer(9)]], constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0[[buffer(15)]], - constant int64_t & ne1[[buffer(16)]], - constant uint & gqa[[buffer(17)]], + constant int64_t & ne0 [[buffer(15)]], + constant int64_t & ne1 [[buffer(16)]], + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -2084,12 +2681,17 @@ kernel void kernel_mul_mv_q5_K_f32( const int64_t r0 = tgpig.x; const int64_t r1 = tgpig.y; - const int r2 = tgpig.z; + const int im = tgpig.z; const int first_row = (r0 * N_SIMDGROUP + sgitg) * 2; - const uint offset0 = r2/gqa*(nb*ne0); + + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + device const block_q5_K * x = (device const block_q5_K *) src0 + first_row*nb + offset0; - device const float * yy = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; + device const float * yy = (device const float *) src1 + r1*ne10 + im*ne00*ne1; float sumf[2]={0.f}; @@ -2105,15 +2707,15 @@ kernel void kernel_mul_mv_q5_K_f32( const int tid = tiisg/4; const int ix = tiisg%4; - const int im = tid/4; + const int iq = tid/4; const int ir = tid%4; const int n = 8; const int l0 = n*ir; - const int q_offset = 32*im + l0; - const int y_offset = 64*im + l0; + const int q_offset = 32*iq + l0; + const int y_offset = 64*iq + l0; - const uint8_t hm1 = 1u << (2*im); + const uint8_t hm1 = 1u << (2*iq); const uint8_t hm2 = hm1 << 1; const uint8_t hm3 = hm1 << 4; const uint8_t hm4 = hm2 << 4; @@ -2128,7 +2730,7 @@ kernel void kernel_mul_mv_q5_K_f32( device const uint8_t * q1 = x[i].qs + q_offset; device const uint8_t * qh = x[i].qh + l0; device const half * dh = &x[i].d; - device const uint16_t * a = (device const uint16_t *)x[i].scales + im; + device const uint16_t * a = (device const uint16_t *)x[i].scales + iq; device const float * y2 = y1 + 128; float4 sumy = {0.f, 0.f, 0.f, 0.f}; @@ -2184,7 +2786,7 @@ kernel void kernel_mul_mv_q5_K_f32( const int il = 4 * (tiisg/8); // 0, 4, 8, 12 const int ix = tiisg%8; - const int im = il/8; // 0, 0, 1, 1 + const int iq = il/8; // 0, 0, 1, 1 const int in = il%8; // 0, 4, 0, 4 device const float * y = yy + ix*QK_K + il; @@ -2209,7 +2811,7 @@ kernel void kernel_mul_mv_q5_K_f32( float2 acc = {0.f, 0.f}; for (int l = 0; l < 4; ++l) { - const uint8_t hl = h[l] >> im; + const uint8_t hl = h[l] >> iq; acc[0] += yl[l+0] * s[0] * ((int16_t)(q[l+ 0] & 0x0F) - (hl & 0x01 ? 
0 : 16)) + yl[l+4] * s[1] * ((int16_t)(q[l+16] & 0x0F) - (hl & 0x04 ? 0 : 16)); acc[1] += yh[l+0] * s[2] * ((int16_t)(q[l+ 0] & 0xF0) - (hl & 0x10 ? 0 : 256)) @@ -2231,7 +2833,7 @@ kernel void kernel_mul_mv_q5_K_f32( for (int row = 0; row < 2; ++row) { const float tot = simd_sum(sumf[row]); if (tiisg == 0) { - dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = tot; + dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot; } } @@ -2246,9 +2848,10 @@ kernel void kernel_mul_mv_q6_K_f32( constant int64_t & ne02[[buffer(5)]], constant int64_t & ne10[[buffer(9)]], constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0[[buffer(15)]], - constant int64_t & ne1[[buffer(16)]], - constant uint & gqa[[buffer(17)]], + constant int64_t & ne0 [[buffer(15)]], + constant int64_t & ne1 [[buffer(16)]], + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -2262,12 +2865,17 @@ kernel void kernel_mul_mv_q6_K_f32( const int64_t r0 = tgpig.x; const int64_t r1 = tgpig.y; - const int r2 = tgpig.z; + const int im = tgpig.z; const int row = 2 * r0 + sgitg; - const uint offset0 = r2/gqa*(nb*ne0); + + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + device const block_q6_K * x = (device const block_q6_K *) src0 + row * nb + offset0; - device const float * yy = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; + device const float * yy = (device const float *) src1 + r1*ne10 + im*ne00*ne1; float sumf = 0; @@ -2333,7 +2941,7 @@ kernel void kernel_mul_mv_q6_K_f32( const float tot = simd_sum(sumf); if (tiisg == 0) { - dst[r1*ne0 + r2*ne0*ne1 + row] = tot; + dst[r1*ne0 + im*ne0*ne1 + row] = tot; } } @@ -2643,24 +3251,25 @@ kernel void kernel_get_rows( // each block_q contains 16*nl weights template -kernel void kernel_mul_mm(device const uchar * src0, - device const uchar * src1, - device float * dst, - constant int64_t & ne00, - constant int64_t & ne02, - constant int64_t & nb01, - constant int64_t & nb02, - constant int64_t & ne12, - constant int64_t & nb10, - constant int64_t & nb11, - constant int64_t & nb12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint & gqa, - threadgroup uchar * shared_memory [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiitg[[thread_index_in_threadgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { +void kernel_mul_mm_impl(device const uchar * src0, + device const uchar * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne02, + constant int64_t & nb01, + constant int64_t & nb02, + constant int64_t & ne12, + constant int64_t & nb10, + constant int64_t & nb11, + constant int64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + threadgroup uchar * shared_memory [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { threadgroup half * sa = (threadgroup half *)(shared_memory); threadgroup float * sb = (threadgroup float *)(shared_memory + 4096); @@ -2686,7 +3295,10 @@ kernel void kernel_mul_mm(device const uchar * src0, short il = (tiitg % THREAD_PER_ROW); - uint offset0 = im/gqa*nb02; + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + uint offset0 = (i12/r2)*nb02 + (i13/r3)*(nb02*ne02); ushort offset1 = il/nl; device 
const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01 + offset0) + offset1; @@ -2770,14 +3382,116 @@ kernel void kernel_mul_mm(device const uchar * src0, } } +template +kernel void kernel_mul_mm(device const uchar * src0, + device const uchar * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne02, + constant int64_t & nb01, + constant int64_t & nb02, + constant int64_t & ne12, + constant int64_t & nb10, + constant int64_t & nb11, + constant int64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + threadgroup uchar * shared_memory [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + kernel_mul_mm_impl( + src0, + src1, + dst, + ne00, + ne02, + nb01, + nb02, + ne12, + nb10, + nb11, + nb12, + ne0, + ne1, + r2, + r3, + shared_memory, + tgpig, + tiitg, + sgitg); +} + +template +kernel void kernel_mul_mm_id( + device const int32_t * ids, + device const uchar * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne02, + constant int64_t & nb01, + constant int64_t & nb02, + constant int64_t & ne12, + constant int64_t & nb10, + constant int64_t & nb11, + constant int64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + constant int & idx, + device const uchar * src00, + device const uchar * src01, + device const uchar * src02, + device const uchar * src03, + device const uchar * src04, + device const uchar * src05, + device const uchar * src06, + device const uchar * src07, + threadgroup uchar * shared_memory [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + device const uchar * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; + + kernel_mul_mm_impl( + src0[ids[idx]], + src1, + dst, + ne00, + ne02, + nb01, + nb02, + ne12, + nb10, + nb11, + nb12, + ne0, + ne1, + r2, + r3, + shared_memory, + tgpig, + tiitg, + sgitg); +} + #if QK_K == 256 #define QK_NL 16 #else #define QK_NL 4 #endif -typedef void (get_rows_t)(device const void *, device const int *, device float *, constant int64_t &, \ - constant uint64_t &, constant uint64_t &, uint, uint, uint); +typedef void (get_rows_t)( + device const void * src0, + device const int * src1, + device float * dst, + constant int64_t & ne00, + constant uint64_t & nb01, + constant uint64_t & nb1, + uint, uint, uint); template [[host_name("kernel_get_rows_f32")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_f16")]] kernel get_rows_t kernel_get_rows; @@ -2806,8 +3520,10 @@ typedef void (mat_mm_t)( constant int64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, - constant uint & gqa, - threadgroup uchar *, uint3, uint, uint); + constant uint & r2, + constant uint & r3, + threadgroup uchar *, + uint3, uint, uint); template [[host_name("kernel_mul_mm_f32_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_f16_f32")]] kernel mat_mm_t kernel_mul_mm; @@ -2821,3 +3537,44 @@ template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_q5_K_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mat_mm_t kernel_mul_mm; + +typedef void (mat_mm_id_t)( + device const int32_t * ids, + 
device const uchar * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne02, + constant int64_t & nb01, + constant int64_t & nb02, + constant int64_t & ne12, + constant int64_t & nb10, + constant int64_t & nb11, + constant int64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + constant int & idx, + device const uchar * src00, + device const uchar * src01, + device const uchar * src02, + device const uchar * src03, + device const uchar * src04, + device const uchar * src05, + device const uchar * src06, + device const uchar * src07, + threadgroup uchar *, + uint3, uint, uint); + +template [[host_name("kernel_mul_mm_id_f32_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_f16_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q4_0_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q4_1_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q5_0_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q5_1_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q8_0_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q2_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q3_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q4_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q5_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q6_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; diff --git a/cpp/ggml-metal.h b/cpp/ggml-metal.h index f650977..6e5291d 100644 --- a/cpp/ggml-metal.h +++ b/cpp/ggml-metal.h @@ -26,7 +26,7 @@ #include // max memory buffers that can be mapped to the device -#define LM_GGML_METAL_MAX_BUFFERS 16 +#define LM_GGML_METAL_MAX_BUFFERS 64 #define LM_GGML_METAL_MAX_COMMAND_BUFFERS 32 struct lm_ggml_tensor; @@ -99,6 +99,12 @@ LM_GGML_API lm_ggml_backend_t lm_ggml_backend_metal_init(void); LM_GGML_API bool lm_ggml_backend_is_metal(lm_ggml_backend_t backend); LM_GGML_API void lm_ggml_backend_metal_set_n_cb(lm_ggml_backend_t backend, int n_cb); +LM_GGML_API lm_ggml_backend_buffer_type_t lm_ggml_backend_metal_buffer_type(void); + +// helper to check if the device supports a specific family +// ideally, the user code should be doing these checks +// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf +LM_GGML_API bool lm_ggml_backend_metal_supports_family(lm_ggml_backend_t backend, int family); #ifdef __cplusplus } diff --git a/cpp/ggml-metal.m b/cpp/ggml-metal.m index 4c68f3d..32c1019 100644 --- a/cpp/ggml-metal.m +++ b/cpp/ggml-metal.m @@ -1,5 +1,6 @@ #import "ggml-metal.h" +#import "ggml-backend-impl.h" #import "ggml.h" #import @@ -23,7 +24,7 @@ #define UNUSED(x) (void)(x) -#define LM_GGML_MAX_CONCUR (2*LM_GGML_MAX_NODES) +#define LM_GGML_MAX_CONCUR (2*LM_GGML_DEFAULT_GRAPH_SIZE) struct lm_ggml_metal_buffer { const char * name; @@ -61,6 +62,8 @@ LM_GGML_METAL_DECL_KERNEL(add_row); // TODO: avoid this extra kernel, instead extend the "add" kernel to support broadcast LM_GGML_METAL_DECL_KERNEL(mul); LM_GGML_METAL_DECL_KERNEL(mul_row); // TODO: avoid this extra kernel, instead extend the "mul" kernel to support broadcast + LM_GGML_METAL_DECL_KERNEL(div); + LM_GGML_METAL_DECL_KERNEL(div_row); 
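The kernels above drop the single `gqa` broadcast factor in favour of two ratios, `r2 = ne12/ne02` and `r3 = ne13/ne03`, and split the flat threadgroup z-index `im` into `i12 = im%ne12` and `i13 = im/ne12`, so each src1 matrix in dims 2/3 is matched to the src0 matrix it broadcasts over; the destination write moves from `r2*ne0*ne1` to `im*ne0*ne1` accordingly. Below is a minimal host-side C++ sketch of that offset math only, using the kernels' own shape names; the helper function and the example shapes are illustrative, not part of the patch.

// Sketch of the broadcast indexing used by the updated mul_mv/mul_mm kernels.
#include <cstdint>
#include <cstdio>

// offset (in blocks) into src0 for the src1 matrix selected by `im`,
// mirroring: offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)
static int64_t src0_block_offset(uint32_t im, uint32_t ne12,
                                 uint32_t r2, uint32_t r3,
                                 int64_t ne01, int64_t ne02, int64_t nb) {
    const uint32_t i12 = im % ne12;  // src1 index in dim 2
    const uint32_t i13 = im / ne12;  // src1 index in dim 3
    return (int64_t)(i12/r2)*(nb*ne01) + (int64_t)(i13/r3)*(nb*ne01*ne02);
}

int main() {
    // GQA-style example: 8 src1 heads broadcasting over 2 src0 heads
    const int64_t ne01 = 64, ne02 = 2, ne03 = 1;  // src0 rows / matrices
    const int64_t ne12 = 8,  ne13 = 1;            // src1 matrices
    const int64_t nb   = 4096/256;                // blocks per row, e.g. ne00/QK_K
    const uint32_t r2  = (uint32_t)(ne12/ne02);   // = 4
    const uint32_t r3  = (uint32_t)(ne13/ne03);   // = 1
    // heads 0..3 land on src0 matrix 0, heads 4..7 on src0 matrix 1
    for (uint32_t im = 0; im < (uint32_t)(ne12*ne13); ++im) {
        printf("im=%u -> src0 block offset %lld\n", im,
               (long long) src0_block_offset(im, (uint32_t) ne12, r2, r3, ne01, ne02, nb));
    }
    return 0;
}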
LM_GGML_METAL_DECL_KERNEL(scale); LM_GGML_METAL_DECL_KERNEL(scale_4); LM_GGML_METAL_DECL_KERNEL(silu); @@ -85,6 +88,7 @@ LM_GGML_METAL_DECL_KERNEL(rms_norm); LM_GGML_METAL_DECL_KERNEL(norm); LM_GGML_METAL_DECL_KERNEL(mul_mv_f32_f32); + LM_GGML_METAL_DECL_KERNEL(mul_mv_f16_f16); LM_GGML_METAL_DECL_KERNEL(mul_mv_f16_f32); LM_GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_1row); LM_GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_l4); @@ -110,14 +114,35 @@ LM_GGML_METAL_DECL_KERNEL(mul_mm_q4_K_f32); LM_GGML_METAL_DECL_KERNEL(mul_mm_q5_K_f32); LM_GGML_METAL_DECL_KERNEL(mul_mm_q6_K_f32); + LM_GGML_METAL_DECL_KERNEL(mul_mm_id_f32_f32); + LM_GGML_METAL_DECL_KERNEL(mul_mm_id_f16_f32); + LM_GGML_METAL_DECL_KERNEL(mul_mm_id_q4_0_f32); + LM_GGML_METAL_DECL_KERNEL(mul_mm_id_q4_1_f32); + LM_GGML_METAL_DECL_KERNEL(mul_mm_id_q5_0_f32); + LM_GGML_METAL_DECL_KERNEL(mul_mm_id_q5_1_f32); + LM_GGML_METAL_DECL_KERNEL(mul_mm_id_q8_0_f32); + LM_GGML_METAL_DECL_KERNEL(mul_mm_id_q2_K_f32); + LM_GGML_METAL_DECL_KERNEL(mul_mm_id_q3_K_f32); + LM_GGML_METAL_DECL_KERNEL(mul_mm_id_q4_K_f32); + LM_GGML_METAL_DECL_KERNEL(mul_mm_id_q5_K_f32); + LM_GGML_METAL_DECL_KERNEL(mul_mm_id_q6_K_f32); LM_GGML_METAL_DECL_KERNEL(rope_f32); LM_GGML_METAL_DECL_KERNEL(rope_f16); LM_GGML_METAL_DECL_KERNEL(alibi_f32); + LM_GGML_METAL_DECL_KERNEL(im2col_f16); + LM_GGML_METAL_DECL_KERNEL(argsort_f32_i32_asc); + LM_GGML_METAL_DECL_KERNEL(argsort_f32_i32_desc); LM_GGML_METAL_DECL_KERNEL(cpy_f32_f16); LM_GGML_METAL_DECL_KERNEL(cpy_f32_f32); + LM_GGML_METAL_DECL_KERNEL(cpy_f32_q8_0); + LM_GGML_METAL_DECL_KERNEL(cpy_f32_q4_0); + LM_GGML_METAL_DECL_KERNEL(cpy_f32_q4_1); + //LM_GGML_METAL_DECL_KERNEL(cpy_f32_q5_0); + //LM_GGML_METAL_DECL_KERNEL(cpy_f32_q5_1); LM_GGML_METAL_DECL_KERNEL(cpy_f16_f16); LM_GGML_METAL_DECL_KERNEL(concat); LM_GGML_METAL_DECL_KERNEL(sqr); + LM_GGML_METAL_DECL_KERNEL(sum_rows); #undef LM_GGML_METAL_DECL_KERNEL }; @@ -125,7 +150,7 @@ // MSL code // TODO: move the contents here when ready // for now it is easier to work in a separate file -static NSString * const msl_library_source = @"see metal.metal"; +//static NSString * const msl_library_source = @"see metal.metal"; // Here to assist with NSBundle Path Hack @interface LMGGMLMetalClass : NSObject @@ -141,7 +166,8 @@ void lm_ggml_metal_log_set_callback(lm_ggml_log_callback log_callback, void * us lm_ggml_metal_log_user_data = user_data; } -static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char* format, ...){ +LM_GGML_ATTRIBUTE_FORMAT(2, 3) +static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char * format, ...){ if (lm_ggml_metal_log_callback != NULL) { va_list args; va_start(args, format); @@ -160,12 +186,10 @@ static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char* format, } } - - struct lm_ggml_metal_context * lm_ggml_metal_init(int n_cb) { LM_GGML_METAL_LOG_INFO("%s: allocating\n", __func__); - id device; + id device; NSString * s; #if TARGET_OS_OSX @@ -209,7 +233,16 @@ static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char* format, } else { LM_GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__); - NSString * sourcePath = [bundle pathForResource:@"ggml-metal-llama" ofType:@"metal"]; + NSString * sourcePath; + NSString * ggmlMetalPathResources = [[NSProcessInfo processInfo].environment objectForKey:@"LM_GGML_METAL_PATH_RESOURCES"]; + + LM_GGML_METAL_LOG_INFO("%s: LM_GGML_METAL_PATH_RESOURCES = %s\n", __func__, ggmlMetalPathResources ? 
[ggmlMetalPathResources UTF8String] : "nil"); + + if (ggmlMetalPathResources) { + sourcePath = [ggmlMetalPathResources stringByAppendingPathComponent:@"ggml-metal.metal"]; + } else { + sourcePath = [bundle pathForResource:@"ggml-metal-llama" ofType:@"metal"]; + } if (sourcePath == nil) { LM_GGML_METAL_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__); sourcePath = @"ggml-metal.metal"; @@ -235,6 +268,29 @@ static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char* format, } } +#if TARGET_OS_OSX + // print MTL GPU family: + LM_GGML_METAL_LOG_INFO("%s: GPU name: %s\n", __func__, [[ctx->device name] UTF8String]); + + // determine max supported GPU family + // https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf + // https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf + for (int i = MTLGPUFamilyApple1 + 20; i >= MTLGPUFamilyApple1; --i) { + if ([ctx->device supportsFamily:i]) { + LM_GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d (%d)\n", __func__, i - (int) MTLGPUFamilyApple1 + 1, i); + break; + } + } + + LM_GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false"); + LM_GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6); + if (ctx->device.maxTransferRate != 0) { + LM_GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1e6); + } else { + LM_GGML_METAL_LOG_INFO("%s: maxTransferRate = built-in GPU\n", __func__); + } +#endif + // load kernels { NSError * error = nil; @@ -256,6 +312,8 @@ static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char* format, LM_GGML_METAL_ADD_KERNEL(add_row); LM_GGML_METAL_ADD_KERNEL(mul); LM_GGML_METAL_ADD_KERNEL(mul_row); + LM_GGML_METAL_ADD_KERNEL(div); + LM_GGML_METAL_ADD_KERNEL(div_row); LM_GGML_METAL_ADD_KERNEL(scale); LM_GGML_METAL_ADD_KERNEL(scale_4); LM_GGML_METAL_ADD_KERNEL(silu); @@ -280,6 +338,7 @@ static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char* format, LM_GGML_METAL_ADD_KERNEL(rms_norm); LM_GGML_METAL_ADD_KERNEL(norm); LM_GGML_METAL_ADD_KERNEL(mul_mv_f32_f32); + LM_GGML_METAL_ADD_KERNEL(mul_mv_f16_f16); LM_GGML_METAL_ADD_KERNEL(mul_mv_f16_f32); LM_GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_1row); LM_GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_l4); @@ -306,42 +365,40 @@ static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char* format, LM_GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32); LM_GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32); LM_GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32); + LM_GGML_METAL_ADD_KERNEL(mul_mm_id_f32_f32); + LM_GGML_METAL_ADD_KERNEL(mul_mm_id_f16_f32); + LM_GGML_METAL_ADD_KERNEL(mul_mm_id_q4_0_f32); + LM_GGML_METAL_ADD_KERNEL(mul_mm_id_q4_1_f32); + LM_GGML_METAL_ADD_KERNEL(mul_mm_id_q5_0_f32); + LM_GGML_METAL_ADD_KERNEL(mul_mm_id_q5_1_f32); + LM_GGML_METAL_ADD_KERNEL(mul_mm_id_q8_0_f32); + LM_GGML_METAL_ADD_KERNEL(mul_mm_id_q2_K_f32); + LM_GGML_METAL_ADD_KERNEL(mul_mm_id_q3_K_f32); + LM_GGML_METAL_ADD_KERNEL(mul_mm_id_q4_K_f32); + LM_GGML_METAL_ADD_KERNEL(mul_mm_id_q5_K_f32); + LM_GGML_METAL_ADD_KERNEL(mul_mm_id_q6_K_f32); } LM_GGML_METAL_ADD_KERNEL(rope_f32); LM_GGML_METAL_ADD_KERNEL(rope_f16); LM_GGML_METAL_ADD_KERNEL(alibi_f32); + LM_GGML_METAL_ADD_KERNEL(im2col_f16); + LM_GGML_METAL_ADD_KERNEL(argsort_f32_i32_asc); + LM_GGML_METAL_ADD_KERNEL(argsort_f32_i32_desc); LM_GGML_METAL_ADD_KERNEL(cpy_f32_f16); 
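The `cpy_f32_q8_0` / `cpy_f32_q4_0` / `cpy_f32_q4_1` pipelines registered in this hunk let CPY/CONT write f32 data straight into quantized blocks. As a rough reference for what the q8_0 variant has to produce, here is a host-side C++ sketch of q8_0 row quantization under the usual ggml layout (blocks of 32 values, one scale `d = max|x|/127`, int8 quants). The struct and function names below are illustrative only; the real block stores `d` as fp16 and the Metal kernel works per threadgroup rather than in a scalar loop.

#include <cmath>
#include <cstdint>
#include <cstdio>

constexpr int QK8_0 = 32;

// Simplified q8_0 block: upstream keeps `d` as fp16, kept as float here.
struct block_q8_0_ref {
    float  d;            // per-block scale
    int8_t qs[QK8_0];    // quantized values
};

// Quantize a row of n floats (n must be a multiple of QK8_0) into q8_0 blocks.
static void quantize_row_q8_0_ref(const float * x, block_q8_0_ref * y, int n) {
    for (int ib = 0; ib < n/QK8_0; ++ib) {
        float amax = 0.0f;
        for (int j = 0; j < QK8_0; ++j) {
            amax = std::fmax(amax, std::fabs(x[ib*QK8_0 + j]));
        }
        const float d  = amax / 127.0f;
        const float id = d != 0.0f ? 1.0f/d : 0.0f;
        y[ib].d = d;
        for (int j = 0; j < QK8_0; ++j) {
            y[ib].qs[j] = (int8_t) std::round(x[ib*QK8_0 + j] * id);
        }
    }
}

int main() {
    float x[QK8_0];
    for (int j = 0; j < QK8_0; ++j) x[j] = 0.01f*j - 0.15f;
    block_q8_0_ref y[1];
    quantize_row_q8_0_ref(x, y, QK8_0);
    printf("d = %f, qs[0] = %d, qs[31] = %d\n", y[0].d, y[0].qs[0], y[0].qs[31]);
    return 0;
}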
LM_GGML_METAL_ADD_KERNEL(cpy_f32_f32); + LM_GGML_METAL_ADD_KERNEL(cpy_f32_q8_0); + LM_GGML_METAL_ADD_KERNEL(cpy_f32_q4_0); + LM_GGML_METAL_ADD_KERNEL(cpy_f32_q4_1); + //LM_GGML_METAL_ADD_KERNEL(cpy_f32_q5_0); + //LM_GGML_METAL_ADD_KERNEL(cpy_f32_q5_1); LM_GGML_METAL_ADD_KERNEL(cpy_f16_f16); LM_GGML_METAL_ADD_KERNEL(concat); LM_GGML_METAL_ADD_KERNEL(sqr); + LM_GGML_METAL_ADD_KERNEL(sum_rows); #undef LM_GGML_METAL_ADD_KERNEL } -#if TARGET_OS_OSX - // print MTL GPU family: - LM_GGML_METAL_LOG_INFO("%s: GPU name: %s\n", __func__, [[ctx->device name] UTF8String]); - - // determine max supported GPU family - // https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf - // https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf - for (int i = MTLGPUFamilyApple1 + 20; i >= MTLGPUFamilyApple1; --i) { - if ([ctx->device supportsFamily:i]) { - LM_GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d (%d)\n", __func__, i - MTLGPUFamilyApple1 + 1, i); - break; - } - } - - LM_GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false"); - LM_GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); - if (ctx->device.maxTransferRate != 0) { - LM_GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0); - } else { - LM_GGML_METAL_LOG_INFO("%s: maxTransferRate = built-in GPU\n", __func__); - } -#endif - return ctx; } @@ -355,6 +412,8 @@ void lm_ggml_metal_free(struct lm_ggml_metal_context * ctx) { LM_GGML_METAL_DEL_KERNEL(add_row); LM_GGML_METAL_DEL_KERNEL(mul); LM_GGML_METAL_DEL_KERNEL(mul_row); + LM_GGML_METAL_DEL_KERNEL(div); + LM_GGML_METAL_DEL_KERNEL(div_row); LM_GGML_METAL_DEL_KERNEL(scale); LM_GGML_METAL_DEL_KERNEL(scale_4); LM_GGML_METAL_DEL_KERNEL(silu); @@ -379,6 +438,7 @@ void lm_ggml_metal_free(struct lm_ggml_metal_context * ctx) { LM_GGML_METAL_DEL_KERNEL(rms_norm); LM_GGML_METAL_DEL_KERNEL(norm); LM_GGML_METAL_DEL_KERNEL(mul_mv_f32_f32); + LM_GGML_METAL_DEL_KERNEL(mul_mv_f16_f16); LM_GGML_METAL_DEL_KERNEL(mul_mv_f16_f32); LM_GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_1row); LM_GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_l4); @@ -405,15 +465,36 @@ void lm_ggml_metal_free(struct lm_ggml_metal_context * ctx) { LM_GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32); LM_GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32); LM_GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32); + LM_GGML_METAL_DEL_KERNEL(mul_mm_id_f32_f32); + LM_GGML_METAL_DEL_KERNEL(mul_mm_id_f16_f32); + LM_GGML_METAL_DEL_KERNEL(mul_mm_id_q4_0_f32); + LM_GGML_METAL_DEL_KERNEL(mul_mm_id_q4_1_f32); + LM_GGML_METAL_DEL_KERNEL(mul_mm_id_q5_0_f32); + LM_GGML_METAL_DEL_KERNEL(mul_mm_id_q5_1_f32); + LM_GGML_METAL_DEL_KERNEL(mul_mm_id_q8_0_f32); + LM_GGML_METAL_DEL_KERNEL(mul_mm_id_q2_K_f32); + LM_GGML_METAL_DEL_KERNEL(mul_mm_id_q3_K_f32); + LM_GGML_METAL_DEL_KERNEL(mul_mm_id_q4_K_f32); + LM_GGML_METAL_DEL_KERNEL(mul_mm_id_q5_K_f32); + LM_GGML_METAL_DEL_KERNEL(mul_mm_id_q6_K_f32); } LM_GGML_METAL_DEL_KERNEL(rope_f32); LM_GGML_METAL_DEL_KERNEL(rope_f16); LM_GGML_METAL_DEL_KERNEL(alibi_f32); + LM_GGML_METAL_DEL_KERNEL(im2col_f16); + LM_GGML_METAL_DEL_KERNEL(argsort_f32_i32_asc); + LM_GGML_METAL_DEL_KERNEL(argsort_f32_i32_desc); LM_GGML_METAL_DEL_KERNEL(cpy_f32_f16); LM_GGML_METAL_DEL_KERNEL(cpy_f32_f32); + LM_GGML_METAL_DEL_KERNEL(cpy_f32_q8_0); + LM_GGML_METAL_DEL_KERNEL(cpy_f32_q4_0); + LM_GGML_METAL_DEL_KERNEL(cpy_f32_q4_1); + //LM_GGML_METAL_DEL_KERNEL(cpy_f32_q5_0); + 
//LM_GGML_METAL_DEL_KERNEL(cpy_f32_q5_1); LM_GGML_METAL_DEL_KERNEL(cpy_f16_f16); LM_GGML_METAL_DEL_KERNEL(concat); LM_GGML_METAL_DEL_KERNEL(sqr); + LM_GGML_METAL_DEL_KERNEL(sum_rows); #undef LM_GGML_METAL_DEL_KERNEL @@ -457,6 +538,13 @@ int lm_ggml_metal_if_optimized(struct lm_ggml_metal_context * ctx) { return ctx->concur_list; } +// temporarily defined here for compatibility between ggml-backend and the old API +struct lm_ggml_backend_metal_buffer_context { + void * data; + + id metal; +}; + // finds the Metal buffer that contains the tensor data on the GPU device // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the // Metal buffer based on the host memory pointer @@ -466,6 +554,19 @@ int lm_ggml_metal_if_optimized(struct lm_ggml_metal_context * ctx) { const int64_t tsize = lm_ggml_nbytes(t); + // compatibility with ggml-backend + if (t->buffer && t->buffer->buft == lm_ggml_backend_metal_buffer_type()) { + struct lm_ggml_backend_metal_buffer_context * buf_ctx = (struct lm_ggml_backend_metal_buffer_context *) t->buffer->context; + + const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->data; + + LM_GGML_ASSERT(ioffs >= 0 && ioffs + tsize <= (int64_t) t->buffer->size); + + *offs = (size_t) ioffs; + + return buf_ctx->metal; + } + // find the view that contains the tensor fully for (int i = 0; i < ctx->n_buffers; ++i) { const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data; @@ -523,11 +624,11 @@ bool lm_ggml_metal_add_buffer( ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; if (ctx->buffers[ctx->n_buffers].metal == nil) { - LM_GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0); + LM_GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MiB\n", __func__, name, size_aligned / 1024.0 / 1024.0); return false; } - LM_GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0); + LM_GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MiB", __func__, name, size_aligned / 1024.0 / 1024.0); ++ctx->n_buffers; } else { @@ -547,11 +648,11 @@ bool lm_ggml_metal_add_buffer( ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; if (ctx->buffers[ctx->n_buffers].metal == nil) { - LM_GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0); + LM_GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MiB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0); return false; } - LM_GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i); + LM_GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MiB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i); if (i + size_step < size) { LM_GGML_METAL_LOG_INFO("\n"); } @@ -566,7 +667,7 @@ bool lm_ggml_metal_add_buffer( ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) { - LM_GGML_METAL_LOG_WARN(", warning: current allocated size is greater than the recommended max 
working set size\n", __func__); + LM_GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__); } else { LM_GGML_METAL_LOG_INFO("\n"); } @@ -688,6 +789,51 @@ void lm_ggml_metal_graph_find_concurrency( } } +static bool lm_ggml_metal_supports_op(const struct lm_ggml_tensor * op) { + switch (op->op) { + case LM_GGML_OP_UNARY: + switch (lm_ggml_get_unary_op(op)) { + case LM_GGML_UNARY_OP_SILU: + case LM_GGML_UNARY_OP_RELU: + case LM_GGML_UNARY_OP_GELU: + return true; + default: + return false; + } + case LM_GGML_OP_NONE: + case LM_GGML_OP_RESHAPE: + case LM_GGML_OP_VIEW: + case LM_GGML_OP_TRANSPOSE: + case LM_GGML_OP_PERMUTE: + case LM_GGML_OP_CONCAT: + case LM_GGML_OP_ADD: + case LM_GGML_OP_MUL: + case LM_GGML_OP_DIV: + case LM_GGML_OP_SCALE: + case LM_GGML_OP_SQR: + case LM_GGML_OP_SUM_ROWS: + case LM_GGML_OP_SOFT_MAX: + case LM_GGML_OP_RMS_NORM: + case LM_GGML_OP_NORM: + case LM_GGML_OP_ALIBI: + case LM_GGML_OP_ROPE: + case LM_GGML_OP_IM2COL: + case LM_GGML_OP_ARGSORT: + case LM_GGML_OP_DUP: + case LM_GGML_OP_CPY: + case LM_GGML_OP_CONT: + case LM_GGML_OP_MUL_MAT: + case LM_GGML_OP_MUL_MAT_ID: + return true; + case LM_GGML_OP_DIAG_MASK_INF: + case LM_GGML_OP_GET_ROWS: + { + return op->ne[0] % 4 == 0; + } + default: + return false; + } +} void lm_ggml_metal_graph_compute( struct lm_ggml_metal_context * ctx, struct lm_ggml_cgraph * gf) { @@ -744,6 +890,22 @@ void lm_ggml_metal_graph_compute( struct lm_ggml_tensor * src1 = gf->nodes[i]->src[1]; struct lm_ggml_tensor * dst = gf->nodes[i]; + switch (dst->op) { + case LM_GGML_OP_NONE: + case LM_GGML_OP_RESHAPE: + case LM_GGML_OP_VIEW: + case LM_GGML_OP_TRANSPOSE: + case LM_GGML_OP_PERMUTE: + { + // noop -> next node + } continue; + default: + { + } break; + } + + LM_GGML_ASSERT(lm_ggml_metal_supports_op(dst)); + const int64_t ne00 = src0 ? src0->ne[0] : 0; const int64_t ne01 = src0 ? src0->ne[1] : 0; const int64_t ne02 = src0 ? 
src0->ne[2] : 0; @@ -797,14 +959,6 @@ void lm_ggml_metal_graph_compute( //} switch (dst->op) { - case LM_GGML_OP_NONE: - case LM_GGML_OP_RESHAPE: - case LM_GGML_OP_VIEW: - case LM_GGML_OP_TRANSPOSE: - case LM_GGML_OP_PERMUTE: - { - // noop - } break; case LM_GGML_OP_CONCAT: { const int64_t nb = ne00; @@ -844,6 +998,8 @@ void lm_ggml_metal_graph_compute( [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; case LM_GGML_OP_ADD: + case LM_GGML_OP_MUL: + case LM_GGML_OP_DIV: { LM_GGML_ASSERT(lm_ggml_is_contiguous(src0)); LM_GGML_ASSERT(lm_ggml_is_contiguous(src1)); @@ -857,11 +1013,21 @@ void lm_ggml_metal_graph_compute( LM_GGML_ASSERT(ne11 == 1); nb = ne00 / 4; - [encoder setComputePipelineState:ctx->pipeline_add_row]; + switch (dst->op) { + case LM_GGML_OP_ADD: [encoder setComputePipelineState:ctx->pipeline_add_row]; break; + case LM_GGML_OP_MUL: [encoder setComputePipelineState:ctx->pipeline_mul_row]; break; + case LM_GGML_OP_DIV: [encoder setComputePipelineState:ctx->pipeline_div_row]; break; + default: LM_GGML_ASSERT(false); + } bcast_row = true; } else { - [encoder setComputePipelineState:ctx->pipeline_add]; + switch (dst->op) { + case LM_GGML_OP_ADD: [encoder setComputePipelineState:ctx->pipeline_add]; break; + case LM_GGML_OP_MUL: [encoder setComputePipelineState:ctx->pipeline_mul]; break; + case LM_GGML_OP_DIV: [encoder setComputePipelineState:ctx->pipeline_div]; break; + default: LM_GGML_ASSERT(false); + } } [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; @@ -902,31 +1068,6 @@ void lm_ggml_metal_graph_compute( [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } } break; - case LM_GGML_OP_MUL: - { - LM_GGML_ASSERT(lm_ggml_is_contiguous(src0)); - LM_GGML_ASSERT(lm_ggml_is_contiguous(src1)); - - // utilize float4 - LM_GGML_ASSERT(ne00 % 4 == 0); - const int64_t nb = ne00/4; - - if (lm_ggml_nelements(src1) == ne10) { - // src1 is a row - LM_GGML_ASSERT(ne11 == 1); - [encoder setComputePipelineState:ctx->pipeline_mul_row]; - } else { - [encoder setComputePipelineState:ctx->pipeline_mul]; - } - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&nb length:sizeof(nb) atIndex:3]; - - const int64_t n = lm_ggml_nelements(dst)/4; - - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; case LM_GGML_OP_SCALE: { LM_GGML_ASSERT(lm_ggml_is_contiguous(src0)); @@ -999,25 +1140,66 @@ void lm_ggml_metal_graph_compute( const int64_t n = lm_ggml_nelements(dst); [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; + case LM_GGML_OP_SUM_ROWS: + { + LM_GGML_ASSERT(src0->nb[0] == lm_ggml_type_size(src0->type)); + + [encoder setComputePipelineState:ctx->pipeline_sum_rows]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; + [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5]; + [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8]; + 
[encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9]; + [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:10]; + [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:11]; + [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12]; + [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:13]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16]; + [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:17]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:18]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:19]; + [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:20]; + [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:21]; + [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:22]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:23]; + [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:24]; + [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:25]; + + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; case LM_GGML_OP_SOFT_MAX: { int nth = 32; // SIMD width if (ne00%4 == 0) { + while (nth < ne00/4 && nth < 256) { + nth *= 2; + } [encoder setComputePipelineState:ctx->pipeline_soft_max_4]; } else { - do { + while (nth < ne00 && nth < 1024) { nth *= 2; - } while (nth <= ne00 && nth <= 1024); - nth /= 2; + } [encoder setComputePipelineState:ctx->pipeline_soft_max]; } - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; - [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; - [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; - [encoder setThreadgroupMemoryLength:MAX(16, nth/32*sizeof(float)) atIndex:0]; + + const float scale = ((float *) dst->op_params)[0]; + + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; + [encoder setBytes:&scale length:sizeof(scale) atIndex:6]; + [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; @@ -1046,9 +1228,13 @@ void lm_ggml_metal_graph_compute( case LM_GGML_OP_MUL_MAT: { LM_GGML_ASSERT(ne00 == ne10); - LM_GGML_ASSERT(ne03 == ne13); - const uint gqa = ne12/ne02; + // TODO: assert that dim2 and dim3 are contiguous + LM_GGML_ASSERT(ne12 % ne02 == 0); + LM_GGML_ASSERT(ne13 % ne03 == 0); + + const uint r2 = ne12/ne02; + const uint r3 = ne13/ne03; // find the break-even point where the matrix-matrix kernel becomes more efficient compared // to the matrix-vector kernel @@ -1083,7 +1269,7 @@ void lm_ggml_metal_graph_compute( !lm_ggml_is_transposed(src1) && src1t == LM_GGML_TYPE_F32 && ne00 % 32 == 0 && ne00 >= 64 && - ne11 > ne11_mm_min) { + (ne11 > ne11_mm_min || (lm_ggml_is_quantized(src0t) && ne12 > 1))) { //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12); switch (src0->type) { case LM_GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f32_f32]; break; @@ -1113,9 +1299,10 @@ void lm_ggml_metal_graph_compute( [encoder setBytes:&nb12 length:sizeof(nb12) 
atIndex:10]; [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:11]; [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:12]; - [encoder setBytes:&gqa length:sizeof(gqa) atIndex:13]; + [encoder setBytes:&r2 length:sizeof(r2) atIndex:13]; + [encoder setBytes:&r3 length:sizeof(r3) atIndex:14]; [encoder setThreadgroupMemoryLength:8192 atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne01 + 63)/64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne01 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; } else { int nth0 = 32; int nth1 = 1; @@ -1126,6 +1313,7 @@ void lm_ggml_metal_graph_compute( switch (src0t) { case LM_GGML_TYPE_F32: { + LM_GGML_ASSERT(src1t == LM_GGML_TYPE_F32); [encoder setComputePipelineState:ctx->pipeline_mul_mv_f32_f32]; nrows = 4; } break; @@ -1133,102 +1321,77 @@ void lm_ggml_metal_graph_compute( { nth0 = 32; nth1 = 1; - if (ne11 * ne12 < 4) { - [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_1row]; - } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) { - [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_l4]; - nrows = ne11; + if (src1t == LM_GGML_TYPE_F32) { + if (ne11 * ne12 < 4) { + [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_1row]; + } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) { + [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_l4]; + nrows = ne11; + } else { + [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32]; + nrows = 4; + } } else { - [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32]; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f16]; nrows = 4; } } break; case LM_GGML_TYPE_Q4_0: { - LM_GGML_ASSERT(ne02 == 1); - LM_GGML_ASSERT(ne12 == 1); - nth0 = 8; nth1 = 8; [encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_0_f32]; } break; case LM_GGML_TYPE_Q4_1: { - LM_GGML_ASSERT(ne02 == 1); - LM_GGML_ASSERT(ne12 == 1); - nth0 = 8; nth1 = 8; [encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_1_f32]; } break; case LM_GGML_TYPE_Q5_0: { - LM_GGML_ASSERT(ne02 == 1); - LM_GGML_ASSERT(ne12 == 1); - nth0 = 8; nth1 = 8; [encoder setComputePipelineState:ctx->pipeline_mul_mv_q5_0_f32]; } break; case LM_GGML_TYPE_Q5_1: { - LM_GGML_ASSERT(ne02 == 1); - LM_GGML_ASSERT(ne12 == 1); - nth0 = 8; nth1 = 8; [encoder setComputePipelineState:ctx->pipeline_mul_mv_q5_1_f32]; } break; case LM_GGML_TYPE_Q8_0: { - LM_GGML_ASSERT(ne02 == 1); - LM_GGML_ASSERT(ne12 == 1); - nth0 = 8; nth1 = 8; [encoder setComputePipelineState:ctx->pipeline_mul_mv_q8_0_f32]; } break; case LM_GGML_TYPE_Q2_K: { - LM_GGML_ASSERT(ne02 == 1); - LM_GGML_ASSERT(ne12 == 1); - nth0 = 2; nth1 = 32; [encoder setComputePipelineState:ctx->pipeline_mul_mv_q2_K_f32]; } break; case LM_GGML_TYPE_Q3_K: { - LM_GGML_ASSERT(ne02 == 1); - LM_GGML_ASSERT(ne12 == 1); - nth0 = 2; nth1 = 32; [encoder setComputePipelineState:ctx->pipeline_mul_mv_q3_K_f32]; } break; case LM_GGML_TYPE_Q4_K: { - LM_GGML_ASSERT(ne02 == 1); - LM_GGML_ASSERT(ne12 == 1); - nth0 = 4; //1; nth1 = 8; //32; [encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_K_f32]; } break; case LM_GGML_TYPE_Q5_K: { - LM_GGML_ASSERT(ne02 == 1); - LM_GGML_ASSERT(ne12 == 1); - nth0 = 2; nth1 = 32; [encoder setComputePipelineState:ctx->pipeline_mul_mv_q5_K_f32]; } break; case LM_GGML_TYPE_Q6_K: { - LM_GGML_ASSERT(ne02 == 1); - LM_GGML_ASSERT(ne12 == 1); - nth0 = 2; nth1 = 32; [encoder setComputePipelineState:ctx->pipeline_mul_mv_q6_K_f32]; @@ -1257,32 +1420,125 @@ 
void lm_ggml_metal_graph_compute( [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14]; [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:15]; [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:16]; - [encoder setBytes:&gqa length:sizeof(gqa) atIndex:17]; + [encoder setBytes:&r2 length:sizeof(r2) atIndex:17]; + [encoder setBytes:&r3 length:sizeof(r3) atIndex:18]; if (src0t == LM_GGML_TYPE_Q4_0 || src0t == LM_GGML_TYPE_Q4_1 || src0t == LM_GGML_TYPE_Q5_0 || src0t == LM_GGML_TYPE_Q5_1 || src0t == LM_GGML_TYPE_Q8_0 || src0t == LM_GGML_TYPE_Q2_K) { // || src0t == LM_GGML_TYPE_Q4_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else if (src0t == LM_GGML_TYPE_Q4_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else if (src0t == LM_GGML_TYPE_Q3_K) { #ifdef LM_GGML_QKK_64 - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; #else - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; #endif } else if (src0t == LM_GGML_TYPE_Q5_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else if (src0t == LM_GGML_TYPE_Q6_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else { int64_t ny = (ne11 + nrows - 1)/nrows; - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + } + } break; + case LM_GGML_OP_MUL_MAT_ID: + { + //LM_GGML_ASSERT(ne00 == ne10); + //LM_GGML_ASSERT(ne03 == ne13); + + LM_GGML_ASSERT(src0t == LM_GGML_TYPE_I32); + + const int n_as = ne00; + + // TODO: make this more general + LM_GGML_ASSERT(n_as <= 8); + + struct lm_ggml_tensor * src2 = gf->nodes[i]->src[2]; + + const int64_t ne20 = src2 ? src2->ne[0] : 0; + const int64_t ne21 = src2 ? src2->ne[1] : 0; + const int64_t ne22 = src2 ? src2->ne[2] : 0; + const int64_t ne23 = src2 ? src2->ne[3] : 0; LM_GGML_UNUSED(ne23); + + const uint64_t nb20 = src2 ? src2->nb[0] : 0; LM_GGML_UNUSED(nb20); + const uint64_t nb21 = src2 ? src2->nb[1] : 0; + const uint64_t nb22 = src2 ? src2->nb[2] : 0; + const uint64_t nb23 = src2 ? src2->nb[3] : 0; LM_GGML_UNUSED(nb23); + + const enum lm_ggml_type src2t = src2 ? 
src2->type : LM_GGML_TYPE_COUNT; LM_GGML_UNUSED(src2t); + + LM_GGML_ASSERT(!lm_ggml_is_transposed(src2)); + LM_GGML_ASSERT(!lm_ggml_is_transposed(src1)); + + LM_GGML_ASSERT(ne20 % 32 == 0); + // !!!!!!!!! TODO: this assert is probably required but not sure! + //LM_GGML_ASSERT(ne20 >= 64); + LM_GGML_ASSERT(src1t == LM_GGML_TYPE_F32); + + const uint r2 = ne12/ne22; + const uint r3 = ne13/ne23; + + // find the break-even point where the matrix-matrix kernel becomes more efficient compared + // to the matrix-vector kernel + int ne11_mm_min = 0; + + const int idx = ((int32_t *) dst->op_params)[0]; + + // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs + // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel + if ([ctx->device supportsFamily:MTLGPUFamilyApple7] && + ne11 > ne11_mm_min) { + switch (src2->type) { + case LM_GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_f32_f32]; break; + case LM_GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_f16_f32]; break; + case LM_GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q4_0_f32]; break; + case LM_GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q4_1_f32]; break; + case LM_GGML_TYPE_Q5_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q5_0_f32]; break; + case LM_GGML_TYPE_Q5_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q5_1_f32]; break; + case LM_GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q8_0_f32]; break; + case LM_GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q2_K_f32]; break; + case LM_GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q3_K_f32]; break; + case LM_GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q4_K_f32]; break; + case LM_GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q5_K_f32]; break; + case LM_GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q6_K_f32]; break; + default: LM_GGML_ASSERT(false && "MUL_MAT_ID not implemented"); } + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne20 length:sizeof(ne20) atIndex:3]; + [encoder setBytes:&ne22 length:sizeof(ne22) atIndex:4]; + [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:5]; + [encoder setBytes:&nb22 length:sizeof(nb22) atIndex:6]; + [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:7]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:8]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:9]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:10]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:11]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:12]; + [encoder setBytes:&r2 length:sizeof(r2) atIndex:13]; + [encoder setBytes:&r3 length:sizeof(r3) atIndex:14]; + [encoder setBytes:&idx length:sizeof(idx) atIndex:15]; + // TODO: how to make this an array? 
read Metal docs + for (int j = 0; j < n_as; ++j) { + struct lm_ggml_tensor * src_cur = dst->src[2 + j]; + + size_t offs_src_cur = 0; + id id_src_cur = lm_ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur); + + [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:16 + j]; + } + + [encoder setThreadgroupMemoryLength:8192 atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne21 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; } } break; case LM_GGML_OP_GET_ROWS: @@ -1321,15 +1577,19 @@ void lm_ggml_metal_graph_compute( float eps; memcpy(&eps, dst->op_params, sizeof(float)); - const int nth = MIN(512, ne00); + int nth = 32; // SIMD width + + while (nth < ne00/4 && nth < 1024) { + nth *= 2; + } [encoder setComputePipelineState:ctx->pipeline_rms_norm]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3]; - [encoder setBytes:&eps length:sizeof( float) atIndex:4]; - [encoder setThreadgroupMemoryLength:nth/32*sizeof(float) atIndex:0]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3]; + [encoder setBytes:&eps length:sizeof( float) atIndex:4]; + [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; const int64_t nrows = lm_ggml_nrows(src0); @@ -1348,7 +1608,7 @@ void lm_ggml_metal_graph_compute( [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3]; [encoder setBytes:&eps length:sizeof( float) atIndex:4]; - [encoder setThreadgroupMemoryLength:MAX(16, nth*sizeof(float)) atIndex:0]; + [encoder setThreadgroupMemoryLength:LM_GGML_PAD(nth*sizeof(float), 16) atIndex:0]; const int64_t nrows = lm_ggml_nrows(src0); @@ -1452,18 +1712,100 @@ void lm_ggml_metal_graph_compute( [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; + case LM_GGML_OP_IM2COL: + { + LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F16); + LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F32); + LM_GGML_ASSERT( dst->type == LM_GGML_TYPE_F16); + + const int32_t s0 = ((const int32_t *)(dst->op_params))[0]; + const int32_t s1 = ((const int32_t *)(dst->op_params))[1]; + const int32_t p0 = ((const int32_t *)(dst->op_params))[2]; + const int32_t p1 = ((const int32_t *)(dst->op_params))[3]; + const int32_t d0 = ((const int32_t *)(dst->op_params))[4]; + const int32_t d1 = ((const int32_t *)(dst->op_params))[5]; + const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1; + + const int32_t N = src1->ne[is_2D ? 3 : 2]; + const int32_t IC = src1->ne[is_2D ? 2 : 1]; + const int32_t IH = is_2D ? src1->ne[1] : 1; + const int32_t IW = src1->ne[0]; + + const int32_t KH = is_2D ? src0->ne[1] : 1; + const int32_t KW = src0->ne[0]; + + const int32_t OH = is_2D ? dst->ne[2] : 1; + const int32_t OW = dst->ne[1]; + + const int32_t CHW = IC * KH * KW; + + const int32_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4; + const int32_t ofs1 = src1->nb[is_2D ? 
2 : 1] / 4; + + switch (src0->type) { + case LM_GGML_TYPE_F32: LM_GGML_ASSERT(false && "not implemented"); break; + case LM_GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_im2col_f16]; break; + default: LM_GGML_ASSERT(false); + }; + + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ofs0 length:sizeof( int32_t) atIndex:2]; + [encoder setBytes:&ofs1 length:sizeof( int32_t) atIndex:3]; + [encoder setBytes:&IW length:sizeof( int32_t) atIndex:4]; + [encoder setBytes:&IH length:sizeof( int32_t) atIndex:5]; + [encoder setBytes:&CHW length:sizeof( int32_t) atIndex:6]; + [encoder setBytes:&s0 length:sizeof( int32_t) atIndex:7]; + [encoder setBytes:&s1 length:sizeof( int32_t) atIndex:8]; + [encoder setBytes:&p0 length:sizeof( int32_t) atIndex:9]; + [encoder setBytes:&p1 length:sizeof( int32_t) atIndex:10]; + [encoder setBytes:&d0 length:sizeof( int32_t) atIndex:11]; + [encoder setBytes:&d1 length:sizeof( int32_t) atIndex:12]; + + [encoder dispatchThreadgroups:MTLSizeMake(IC, OH, OW) threadsPerThreadgroup:MTLSizeMake(N, KH, KW)]; + } break; + case LM_GGML_OP_ARGSORT: + { + LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F32); + LM_GGML_ASSERT( dst->type == LM_GGML_TYPE_I32); + + const int nrows = lm_ggml_nrows(src0); + + enum lm_ggml_sort_order order = (enum lm_ggml_sort_order) dst->op_params[0]; + + switch (order) { + case LM_GGML_SORT_ASC: [encoder setComputePipelineState:ctx->pipeline_argsort_f32_i32_asc]; break; + case LM_GGML_SORT_DESC: [encoder setComputePipelineState:ctx->pipeline_argsort_f32_i32_desc]; break; + default: LM_GGML_ASSERT(false); + }; + + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + + [encoder dispatchThreadgroups:MTLSizeMake(1, nrows, 1) threadsPerThreadgroup:MTLSizeMake(ne00, 1, 1)]; + } break; case LM_GGML_OP_DUP: case LM_GGML_OP_CPY: case LM_GGML_OP_CONT: { - const int nth = MIN(1024, ne00); + LM_GGML_ASSERT(ne00 % lm_ggml_blck_size(src0->type) == 0); + + int nth = MIN(1024, ne00/lm_ggml_blck_size(src0->type)); switch (src0t) { case LM_GGML_TYPE_F32: { + LM_GGML_ASSERT(ne0 % lm_ggml_blck_size(dst->type) == 0); + switch (dstt) { - case LM_GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f16]; break; - case LM_GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f32]; break; + case LM_GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f16]; break; + case LM_GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f32]; break; + case LM_GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q8_0]; break; + case LM_GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q4_0]; break; + case LM_GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q4_1]; break; + //case LM_GGML_TYPE_Q5_0: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q5_0]; break; + //case LM_GGML_TYPE_Q5_1: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q5_1]; break; default: LM_GGML_ASSERT(false && "not implemented"); }; } break; @@ -1538,81 +1880,150 @@ void lm_ggml_metal_graph_compute( // backend interface -static const char * lm_ggml_backend_metal_name(lm_ggml_backend_t backend) { - return "Metal"; +static id g_backend_device = nil; +static int g_backend_device_ref_count = 0; - UNUSED(backend); +static id lm_ggml_backend_metal_get_device(void) { + if 
(g_backend_device == nil) { + g_backend_device = MTLCreateSystemDefaultDevice(); + } + + g_backend_device_ref_count++; + + return g_backend_device; } -static void lm_ggml_backend_metal_free(lm_ggml_backend_t backend) { - struct lm_ggml_metal_context * ctx = (struct lm_ggml_metal_context *)backend->context; - lm_ggml_metal_free(ctx); - free(backend); +static void lm_ggml_backend_metal_free_device(void) { + assert(g_backend_device_ref_count > 0); + + g_backend_device_ref_count--; + + if (g_backend_device_ref_count == 0) { + [g_backend_device release]; + g_backend_device = nil; + } } static void * lm_ggml_backend_metal_buffer_get_base(lm_ggml_backend_buffer_t buffer) { - return (void *)buffer->context; + struct lm_ggml_backend_metal_buffer_context * ctx = (struct lm_ggml_backend_metal_buffer_context *)buffer->context; + + return ctx->data; } static void lm_ggml_backend_metal_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) { - free(buffer->context); + struct lm_ggml_backend_metal_buffer_context * ctx = (struct lm_ggml_backend_metal_buffer_context *)buffer->context; + + [ctx->metal release]; + lm_ggml_backend_metal_free_device(); + + free(ctx->data); + free(ctx); + + UNUSED(buffer); +} + +static void lm_ggml_backend_metal_buffer_set_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor write out of bounds"); + LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + + memcpy((char *)tensor->data + offset, data, size); + + UNUSED(buffer); +} + +static void lm_ggml_backend_metal_buffer_get_tensor(lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) { + LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor read out of bounds"); + LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + + memcpy(data, (const char *)tensor->data + offset, size); + + UNUSED(buffer); +} + +static void lm_ggml_backend_metal_buffer_cpy_tensor_from(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) { + lm_ggml_backend_tensor_get(src, dst->data, 0, lm_ggml_nbytes(src)); + + UNUSED(buffer); +} + +static void lm_ggml_backend_metal_buffer_cpy_tensor_to(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) { + lm_ggml_backend_tensor_set(dst, src->data, 0, lm_ggml_nbytes(src)); + UNUSED(buffer); } static struct lm_ggml_backend_buffer_i metal_backend_buffer_i = { - /* .free_buffer = */ lm_ggml_backend_metal_buffer_free_buffer, - /* .get_base = */ lm_ggml_backend_metal_buffer_get_base, - /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes - /* .init_tensor = */ NULL, // no initialization required - /* .free_tensor = */ NULL, // no cleanup required + /* .free_buffer = */ lm_ggml_backend_metal_buffer_free_buffer, + /* .get_base = */ lm_ggml_backend_metal_buffer_get_base, + /* .init_tensor = */ NULL, + /* .set_tensor = */ lm_ggml_backend_metal_buffer_set_tensor, + /* .get_tensor = */ lm_ggml_backend_metal_buffer_get_tensor, + /* .cpy_tensor_from = */ lm_ggml_backend_metal_buffer_cpy_tensor_from, + /* .cpy_tensor_to = */ lm_ggml_backend_metal_buffer_cpy_tensor_to, }; -static lm_ggml_backend_buffer_t lm_ggml_backend_metal_alloc_buffer(lm_ggml_backend_t backend, size_t size) { - struct lm_ggml_metal_context * ctx = (struct lm_ggml_metal_context *)backend->context; +static lm_ggml_backend_buffer_t 
lm_ggml_backend_metal_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) { + struct lm_ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct lm_ggml_backend_metal_buffer_context)); - void * data = lm_ggml_metal_host_malloc(size); + const size_t size_page = sysconf(_SC_PAGESIZE); - // TODO: set proper name of the buffers - lm_ggml_metal_add_buffer(ctx, "backend", data, size, 0); + size_t size_aligned = size; + if ((size_aligned % size_page) != 0) { + size_aligned += (size_page - (size_aligned % size_page)); + } - return lm_ggml_backend_buffer_init(backend, metal_backend_buffer_i, data, size); + ctx->data = lm_ggml_metal_host_malloc(size); + ctx->metal = [lm_ggml_backend_metal_get_device() newBufferWithBytesNoCopy:ctx->data + length:size_aligned + options:MTLResourceStorageModeShared + deallocator:nil]; + + return lm_ggml_backend_buffer_init(buft, metal_backend_buffer_i, ctx, size); } -static size_t lm_ggml_backend_metal_get_alignment(lm_ggml_backend_t backend) { +static size_t lm_ggml_backend_metal_buffer_type_get_alignment(lm_ggml_backend_buffer_type_t buft) { return 32; - UNUSED(backend); + UNUSED(buft); } -static void lm_ggml_backend_metal_set_tensor_async(lm_ggml_backend_t backend, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor write out of bounds"); - LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); - - memcpy((char *)tensor->data + offset, data, size); +static bool lm_ggml_backend_metal_buffer_type_supports_backend(lm_ggml_backend_buffer_type_t buft, lm_ggml_backend_t backend) { + return lm_ggml_backend_is_metal(backend) || lm_ggml_backend_is_cpu(backend); - UNUSED(backend); + LM_GGML_UNUSED(buft); } -static void lm_ggml_backend_metal_get_tensor_async(lm_ggml_backend_t backend, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) { - LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor read out of bounds"); - LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); - - memcpy(data, (const char *)tensor->data + offset, size); +lm_ggml_backend_buffer_type_t lm_ggml_backend_metal_buffer_type(void) { + static struct lm_ggml_backend_buffer_type lm_ggml_backend_buffer_type_metal = { + /* .iface = */ { + /* .alloc_buffer = */ lm_ggml_backend_metal_buffer_type_alloc_buffer, + /* .get_alignment = */ lm_ggml_backend_metal_buffer_type_get_alignment, + /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes + /* .supports_backend = */ lm_ggml_backend_metal_buffer_type_supports_backend, + }, + /* .context = */ NULL, + }; - UNUSED(backend); + return &lm_ggml_backend_buffer_type_metal; } -static void lm_ggml_backend_metal_synchronize(lm_ggml_backend_t backend) { +static const char * lm_ggml_backend_metal_name(lm_ggml_backend_t backend) { + return "Metal"; + UNUSED(backend); } -static void lm_ggml_backend_metal_cpy_tensor_from(lm_ggml_backend_t backend, struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) { - lm_ggml_backend_tensor_get(src, dst->data, 0, lm_ggml_nbytes(src)); +static void lm_ggml_backend_metal_free(lm_ggml_backend_t backend) { + struct lm_ggml_metal_context * ctx = (struct lm_ggml_metal_context *)backend->context; + lm_ggml_metal_free(ctx); + free(backend); +} +static void lm_ggml_backend_metal_synchronize(lm_ggml_backend_t backend) { UNUSED(backend); } -static void lm_ggml_backend_metal_cpy_tensor_to(lm_ggml_backend_t backend, struct lm_ggml_tensor * src, struct lm_ggml_tensor * 
dst) { - lm_ggml_backend_tensor_set_async(dst, src->data, 0, lm_ggml_nbytes(src)); +static lm_ggml_backend_buffer_type_t lm_ggml_backend_metal_get_default_buffer_type(lm_ggml_backend_t backend) { + return lm_ggml_backend_metal_buffer_type(); UNUSED(backend); } @@ -1624,32 +2035,43 @@ static void lm_ggml_backend_metal_graph_compute(lm_ggml_backend_t backend, struc } static bool lm_ggml_backend_metal_supports_op(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op) { - return true; + return lm_ggml_metal_supports_op(op); + UNUSED(backend); - UNUSED(op); } static struct lm_ggml_backend_i metal_backend_i = { - /* .get_name = */ lm_ggml_backend_metal_name, - /* .free = */ lm_ggml_backend_metal_free, - /* .alloc_buffer = */ lm_ggml_backend_metal_alloc_buffer, - /* .get_alignment = */ lm_ggml_backend_metal_get_alignment, - /* .set_tensor_async = */ lm_ggml_backend_metal_set_tensor_async, - /* .get_tensor_async = */ lm_ggml_backend_metal_get_tensor_async, - /* .synchronize = */ lm_ggml_backend_metal_synchronize, - /* .cpy_tensor_from = */ lm_ggml_backend_metal_cpy_tensor_from, - /* .cpy_tensor_to = */ lm_ggml_backend_metal_cpy_tensor_to, - /* .graph_plan_create = */ NULL, // the metal implementation does not require creating graph plans atm - /* .graph_plan_free = */ NULL, - /* .graph_plan_compute = */ NULL, - /* .graph_compute = */ lm_ggml_backend_metal_graph_compute, - /* .supports_op = */ lm_ggml_backend_metal_supports_op, + /* .get_name = */ lm_ggml_backend_metal_name, + /* .free = */ lm_ggml_backend_metal_free, + /* .get_default_buffer_type = */ lm_ggml_backend_metal_get_default_buffer_type, + /* .set_tensor_async = */ NULL, + /* .get_tensor_async = */ NULL, + /* .cpy_tensor_from_async = */ NULL, + /* .cpy_tensor_to_async = */ NULL, + /* .synchronize = */ lm_ggml_backend_metal_synchronize, + /* .graph_plan_create = */ NULL, // the metal implementation does not require creating graph plans atm + /* .graph_plan_free = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ lm_ggml_backend_metal_graph_compute, + /* .supports_op = */ lm_ggml_backend_metal_supports_op, }; +// TODO: make a common log callback for all backends in ggml-backend +static void lm_ggml_backend_log_callback(enum lm_ggml_log_level level, const char * msg, void * user_data) { + fprintf(stderr, "%s", msg); + + UNUSED(level); + UNUSED(user_data); +} + lm_ggml_backend_t lm_ggml_backend_metal_init(void) { - struct lm_ggml_metal_context * ctx = malloc(sizeof(struct lm_ggml_metal_context)); + lm_ggml_metal_log_set_callback(lm_ggml_backend_log_callback, NULL); - ctx = lm_ggml_metal_init(LM_GGML_DEFAULT_N_THREADS); + struct lm_ggml_metal_context * ctx = lm_ggml_metal_init(LM_GGML_DEFAULT_N_THREADS); + + if (ctx == NULL) { + return NULL; + } lm_ggml_backend_t metal_backend = malloc(sizeof(struct lm_ggml_backend)); @@ -1666,7 +2088,26 @@ bool lm_ggml_backend_is_metal(lm_ggml_backend_t backend) { } void lm_ggml_backend_metal_set_n_cb(lm_ggml_backend_t backend, int n_cb) { + LM_GGML_ASSERT(lm_ggml_backend_is_metal(backend)); + struct lm_ggml_metal_context * ctx = (struct lm_ggml_metal_context *)backend->context; lm_ggml_metal_set_n_cb(ctx, n_cb); } + +bool lm_ggml_backend_metal_supports_family(lm_ggml_backend_t backend, int family) { + LM_GGML_ASSERT(lm_ggml_backend_is_metal(backend)); + + struct lm_ggml_metal_context * ctx = (struct lm_ggml_metal_context *)backend->context; + + return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)]; +} + +lm_ggml_backend_t lm_ggml_backend_reg_metal_init(const char * 
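A behavioural change worth noting in the hunk above: supports_op no longer reports true unconditionally but defers to lm_ggml_metal_supports_op, so the graph scheduler can route unsupported operations to another backend instead of hitting a missing kernel at run time. A toy sketch of that capability-table idea; the op list and coverage below are illustrative, not the real Metal support matrix:

#include <stdbool.h>

enum op { OP_ADD, OP_MUL_MAT, OP_ARGSORT, OP_COUNT };

// Per-backend capability check; the scheduler asks this before assigning a node.
static bool backend_supports(enum op op) {
    switch (op) {
        case OP_ADD:
        case OP_MUL_MAT:
            return true;    // assumed supported, for the sketch
        default:
            return false;   // scheduler falls back to a backend that does support it
    }
}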
params, void * user_data); // silence warning + +lm_ggml_backend_t lm_ggml_backend_reg_metal_init(const char * params, void * user_data) { + return lm_ggml_backend_metal_init(); + + LM_GGML_UNUSED(params); + LM_GGML_UNUSED(user_data); +} diff --git a/cpp/ggml-quants.c b/cpp/ggml-quants.c index abc8dbb..30bebbf 100644 --- a/cpp/ggml-quants.c +++ b/cpp/ggml-quants.c @@ -14,32 +14,12 @@ // #include -#if !defined(__aarch64__) -inline static int32_t vaddvq_s16(int16x8_t v) { - return - (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) + - (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) + - (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) + - (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7); -} - -inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) { - int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a)); - int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b)); - return vcombine_s16(a0, b0); -} - -inline static int32_t vaddvq_s32(int32x4_t v) { - return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3); -} -#endif - #else #ifdef __wasm_simd128__ #include #else -#ifdef __POWER9_VECTOR__ +#if defined(__POWER9_VECTOR__) || defined(__powerpc64__) #include #undef bool #define bool _Bool @@ -47,13 +27,15 @@ inline static int32_t vaddvq_s32(int32x4_t v) { #if defined(_MSC_VER) || defined(__MINGW32__) #include #else -#if !defined(__riscv) && !defined(__s390__) +#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) +#if !defined(__riscv) #include #endif #endif #endif #endif #endif +#endif #ifdef __riscv_v_intrinsic #include @@ -61,6 +43,7 @@ inline static int32_t vaddvq_s32(int32x4_t v) { #undef MIN #undef MAX + #define MIN(a, b) ((a) < (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? 
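In ggml-quants.c the x86 intrinsics header is now pulled in only when the compiler actually advertises an x86 SIMD target, instead of being included on everything that is "not riscv and not s390". The guard shape is restated below, with <immintrin.h> named as the header upstream ggml protects at this spot; only the structure of the conditional is the point of the sketch:

// Include x86 intrinsics only on x86 SIMD targets; other architectures
// (ARM, POWER, RISC-V, WASM) take their own include paths above.
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || \
    defined(__SSSE3__) || defined(__SSE3__)
#if !defined(__riscv)
#include <immintrin.h>
#endif
#endif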
(a) : (b)) @@ -283,9 +266,31 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 #endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) #if defined(__ARM_NEON) - #if !defined(__aarch64__) +// 64-bit compatibility + +// vaddvq_s16 +// vpaddq_s16 +// vaddvq_s32 +// vaddvq_f32 +// vmaxvq_f32 +// vcvtnq_s32_f32 + +inline static int32_t vaddvq_s16(int16x8_t v) { + return + (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) + + (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) + + (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) + + (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7); +} + +inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) { + int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a)); + int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b)); + return vcombine_s16(a0, b0); +} + inline static int32_t vaddvq_s32(int32x4_t v) { return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3); } @@ -311,6 +316,96 @@ inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) { return res; } +// vld1q_s16_x2 +// vld1q_u8_x2 +// vld1q_u8_x4 +// vld1q_s8_x2 +// vld1q_s8_x4 +// TODO: double-check these work correctly + +typedef struct lm_ggml_int16x8x2_t { + int16x8_t val[2]; +} lm_ggml_int16x8x2_t; + +inline static lm_ggml_int16x8x2_t lm_ggml_vld1q_s16_x2(const int16_t * ptr) { + lm_ggml_int16x8x2_t res; + + res.val[0] = vld1q_s16(ptr + 0); + res.val[1] = vld1q_s16(ptr + 8); + + return res; +} + +typedef struct lm_ggml_uint8x16x2_t { + uint8x16_t val[2]; +} lm_ggml_uint8x16x2_t; + +inline static lm_ggml_uint8x16x2_t lm_ggml_vld1q_u8_x2(const uint8_t * ptr) { + lm_ggml_uint8x16x2_t res; + + res.val[0] = vld1q_u8(ptr + 0); + res.val[1] = vld1q_u8(ptr + 16); + + return res; +} + +typedef struct lm_ggml_uint8x16x4_t { + uint8x16_t val[4]; +} lm_ggml_uint8x16x4_t; + +inline static lm_ggml_uint8x16x4_t lm_ggml_vld1q_u8_x4(const uint8_t * ptr) { + lm_ggml_uint8x16x4_t res; + + res.val[0] = vld1q_u8(ptr + 0); + res.val[1] = vld1q_u8(ptr + 16); + res.val[2] = vld1q_u8(ptr + 32); + res.val[3] = vld1q_u8(ptr + 48); + + return res; +} + +typedef struct lm_ggml_int8x16x2_t { + int8x16_t val[2]; +} lm_ggml_int8x16x2_t; + +inline static lm_ggml_int8x16x2_t lm_ggml_vld1q_s8_x2(const int8_t * ptr) { + lm_ggml_int8x16x2_t res; + + res.val[0] = vld1q_s8(ptr + 0); + res.val[1] = vld1q_s8(ptr + 16); + + return res; +} + +typedef struct lm_ggml_int8x16x4_t { + int8x16_t val[4]; +} lm_ggml_int8x16x4_t; + +inline static lm_ggml_int8x16x4_t lm_ggml_vld1q_s8_x4(const int8_t * ptr) { + lm_ggml_int8x16x4_t res; + + res.val[0] = vld1q_s8(ptr + 0); + res.val[1] = vld1q_s8(ptr + 16); + res.val[2] = vld1q_s8(ptr + 32); + res.val[3] = vld1q_s8(ptr + 48); + + return res; +} + +#else + +#define lm_ggml_int16x8x2_t int16x8x2_t +#define lm_ggml_uint8x16x2_t uint8x16x2_t +#define lm_ggml_uint8x16x4_t uint8x16x4_t +#define lm_ggml_int8x16x2_t int8x16x2_t +#define lm_ggml_int8x16x4_t int8x16x4_t + +#define lm_ggml_vld1q_s16_x2 vld1q_s16_x2 +#define lm_ggml_vld1q_u8_x2 vld1q_u8_x2 +#define lm_ggml_vld1q_u8_x4 vld1q_u8_x4 +#define lm_ggml_vld1q_s8_x2 vld1q_s8_x2 +#define lm_ggml_vld1q_s8_x4 vld1q_s8_x4 + #endif #endif @@ -1273,7 +1368,12 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f float max = x[0]; float sum_w = weights[0]; float sum_x = sum_w * x[0]; +#ifdef HAVE_BUGGY_APPLE_LINKER + // use 'volatile' to prevent unroll and work 
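The block above backfills NEON intrinsics that only exist on aarch64: the horizontal adds (vaddvq_*) and the multi-register loads (vld1q_*_x2/_x4) are re-implemented for 32-bit ARM, while on aarch64 the lm_ggml_* names simply alias the native intrinsics. A scalar, architecture-independent analogue of one load wrapper, to show the intent without requiring arm_neon.h:

#include <stdint.h>
#include <string.h>

// Plain-C analogue of lm_ggml_vld1q_u8_x2: a 32-byte load expressed as two
// 16-byte loads into a two-lane struct, mirroring the 32-bit ARM fallback
// that builds uint8x16x2_t from two vld1q_u8 calls.
typedef struct { uint8_t val[2][16]; } u8x16x2;

static inline u8x16x2 load_u8x16x2(const uint8_t * ptr) {
    u8x16x2 res;
    memcpy(res.val[0], ptr +  0, 16);
    memcpy(res.val[1], ptr + 16, 16);
    return res;
}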
around a bug in Apple ld64 1015.7 + for (volatile int i = 1; i < n; ++i) { +#else for (int i = 1; i < n; ++i) { +#endif if (x[i] < min) min = x[i]; if (x[i] > max) max = x[i]; float w = weights[i]; @@ -3557,7 +3657,7 @@ void lm_ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * res const int32x4_t vzero = vdupq_n_s32(0); #endif - int8x16x2_t q2bytes; + lm_ggml_int8x16x2_t q2bytes; uint8_t aux[16]; float sum = 0; @@ -3576,8 +3676,8 @@ void lm_ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * res vst1q_u8(aux, scales); const uint8x16_t mins = vshrq_n_u8(mins_and_scales, 4); - const int16x8x2_t q8sums = vld1q_s16_x2(y[i].bsums); - const int16x8x2_t mins16 = {vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))}; + const lm_ggml_int16x8x2_t q8sums = lm_ggml_vld1q_s16_x2(y[i].bsums); + const lm_ggml_int16x8x2_t mins16 = {vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))}; const int32x4_t s0 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[0]), vget_low_s16 (q8sums.val[0])), vmull_s16(vget_high_s16(mins16.val[0]), vget_high_s16(q8sums.val[0]))); const int32x4_t s1 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[1]), vget_low_s16 (q8sums.val[1])), @@ -3605,7 +3705,7 @@ void lm_ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * res #endif #define SHIFT_MULTIPLY_ACCUM_WITH_SCALE(shift, index)\ - q8bytes = vld1q_s8_x2(q8); q8 += 32;\ + q8bytes = lm_ggml_vld1q_s8_x2(q8); q8 += 32;\ q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[0], (shift)), m3));\ q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[1], (shift)), m3));\ MULTIPLY_ACCUM_WITH_SCALE((index)); @@ -3613,9 +3713,9 @@ void lm_ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * res for (int j = 0; j < QK_K/128; ++j) { - const uint8x16x2_t q2bits = vld1q_u8_x2(q2); q2 += 32; + const lm_ggml_uint8x16x2_t q2bits = lm_ggml_vld1q_u8_x2(q2); q2 += 32; - int8x16x2_t q8bytes = vld1q_s8_x2(q8); q8 += 32; + lm_ggml_int8x16x2_t q8bytes = lm_ggml_vld1q_s8_x2(q8); q8 += 32; q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[0], m3)); q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[1], m3)); MULTIPLY_ACCUM_WITH_SCALE(0); @@ -3949,7 +4049,7 @@ void lm_ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * res const int32x4_t vzero = vdupq_n_s32(0); #endif - int8x16x4_t q2bytes; + lm_ggml_int8x16x4_t q2bytes; uint32_t aux32[2]; const uint8_t * scales = (const uint8_t *)aux32; @@ -3974,7 +4074,7 @@ void lm_ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * res const uint8x16_t q2bits = vld1q_u8(q2); - const int8x16x4_t q8bytes = vld1q_s8_x4(q8); + const lm_ggml_int8x16x4_t q8bytes = lm_ggml_vld1q_s8_x4(q8); q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(q2bits, m3)); q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 2), m3)); @@ -4238,7 +4338,7 @@ void lm_ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * res const uint8x16_t m3 = vshlq_n_u8(m0, 3); const int8_t m32 = 32; - int8x16x4_t q3bytes; + lm_ggml_int8x16x4_t q3bytes; float sum = 0; @@ -4250,9 +4350,9 @@ void lm_ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * res const uint8_t * restrict qh = x[i].hmask; const int8_t * restrict q8 = y[i].qs; - uint8x16x2_t qhbits = vld1q_u8_x2(qh); + lm_ggml_uint8x16x2_t qhbits = lm_ggml_vld1q_u8_x2(qh); - uint8x16x4_t q3h; + lm_ggml_uint8x16x4_t q3h; int32_t isum 
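The HAVE_BUGGY_APPLE_LINKER branch in make_qkx2_quants works around a miscompilation seen with Apple ld64 1015.7 by making the loop counter volatile, which keeps the compiler from unrolling the loop. A stripped-down sketch of the pattern; only the anti-unroll trick is shown, the specific linker bug is out of scope here:

// The volatile induction variable forces one plain iteration per element,
// suppressing unrolling/vectorization of the loop body.
static float min_of(const float * x, int n) {
    float m = x[0];
#ifdef HAVE_BUGGY_APPLE_LINKER
    for (volatile int i = 1; i < n; ++i) {
#else
    for (int i = 1; i < n; ++i) {
#endif
        if (x[i] < m) m = x[i];
    }
    return m;
}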
= 0; @@ -4268,9 +4368,9 @@ void lm_ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * res for (int j = 0; j < QK_K/128; ++j) { - const uint8x16x2_t q3bits = vld1q_u8_x2(q3); q3 += 32; - const int8x16x4_t q8bytes_1 = vld1q_s8_x4(q8); q8 += 64; - const int8x16x4_t q8bytes_2 = vld1q_s8_x4(q8); q8 += 64; + const lm_ggml_uint8x16x2_t q3bits = lm_ggml_vld1q_u8_x2(q3); q3 += 32; + const lm_ggml_int8x16x4_t q8bytes_1 = lm_ggml_vld1q_s8_x4(q8); q8 += 64; + const lm_ggml_int8x16x4_t q8bytes_2 = lm_ggml_vld1q_s8_x4(q8); q8 += 64; q3h.val[0] = vshlq_n_u8(vbicq_u8(m0, qhbits.val[0]), 2); q3h.val[1] = vshlq_n_u8(vbicq_u8(m0, qhbits.val[1]), 2); @@ -4772,7 +4872,7 @@ void lm_ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * res const uint8x16_t m3b = vdupq_n_u8(0x3); const uint8x16_t mh = vdupq_n_u8(4); - int8x16x4_t q3bytes; + lm_ggml_int8x16x4_t q3bytes; uint16_t aux16[2]; int8_t * scales = (int8_t *)aux16; @@ -4781,11 +4881,11 @@ void lm_ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * res for (int i = 0; i < nb; ++i) { - uint8x16x4_t q3h; + lm_ggml_uint8x16x4_t q3h; const uint8x8_t hbits = vld1_u8(x[i].hmask); const uint8x16_t q3bits = vld1q_u8(x[i].qs); - const int8x16x4_t q8bytes = vld1q_s8_x4(y[i].qs); + const lm_ggml_int8x16x4_t q8bytes = lm_ggml_vld1q_s8_x4(y[i].qs); const uint16_t a = *(const uint16_t *)x[i].scales; aux16[0] = a & 0x0f0f; @@ -5134,8 +5234,8 @@ void lm_ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * res const int32x4_t mzero = vdupq_n_s32(0); #endif - int8x16x2_t q4bytes; - int8x16x2_t q8bytes; + lm_ggml_int8x16x2_t q4bytes; + lm_ggml_int8x16x2_t q8bytes; float sumf = 0; @@ -5170,17 +5270,17 @@ void lm_ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * res for (int j = 0; j < QK_K/64; ++j) { - const uint8x16x2_t q4bits = vld1q_u8_x2(q4); q4 += 32; + const lm_ggml_uint8x16x2_t q4bits = lm_ggml_vld1q_u8_x2(q4); q4 += 32; #ifdef __ARM_FEATURE_DOTPROD - q8bytes = vld1q_s8_x2(q8); q8 += 32; + q8bytes = lm_ggml_vld1q_s8_x2(q8); q8 += 32; q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b)); q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b)); const int32x4_t p1 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]); sumi1 += vaddvq_s32(p1) * scales[2*j+0]; - q8bytes = vld1q_s8_x2(q8); q8 += 32; + q8bytes = lm_ggml_vld1q_s8_x2(q8); q8 += 32; q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4)); q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4)); @@ -5188,7 +5288,7 @@ void lm_ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * res sumi2 += vaddvq_s32(p2) * scales[2*j+1]; #else - q8bytes = vld1q_s8_x2(q8); q8 += 32; + q8bytes = lm_ggml_vld1q_s8_x2(q8); q8 += 32; q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b)); q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b)); const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])), @@ -5197,7 +5297,7 @@ void lm_ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * res vmull_s8(vget_high_s8(q4bytes.val[1]), vget_high_s8(q8bytes.val[1]))); sumi1 += vaddvq_s16(vaddq_s16(p0, p1)) * scales[2*j+0]; - q8bytes = vld1q_s8_x2(q8); q8 += 32; + q8bytes = lm_ggml_vld1q_s8_x2(q8); q8 += 32; q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4)); q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4)); const int16x8_t p2 = 
vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])), @@ -5512,8 +5612,8 @@ void lm_ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * res float sumf = 0; - int8x16x2_t q4bytes; - int8x16x4_t q8bytes; + lm_ggml_int8x16x2_t q4bytes; + lm_ggml_int8x16x4_t q8bytes; float sum_mins = 0.f; @@ -5534,10 +5634,10 @@ void lm_ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * res const float d = y[i].d * (float)x[i].d[0]; - const uint8x16x2_t q4bits = vld1q_u8_x2(q4); + const lm_ggml_uint8x16x2_t q4bits = lm_ggml_vld1q_u8_x2(q4); #ifdef __ARM_FEATURE_DOTPROD - q8bytes = vld1q_s8_x4(q8); + q8bytes = lm_ggml_vld1q_s8_x4(q8); q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b)); q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b)); @@ -5551,7 +5651,7 @@ void lm_ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * res const int32_t sumi2 = vaddvq_s32(p2) * scales[1]; #else - q8bytes = vld1q_s8_x4(q8); + q8bytes = lm_ggml_vld1q_s8_x4(q8); q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b)); q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b)); const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])), @@ -5785,7 +5885,7 @@ void lm_ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * res const int32x4_t mzero = vdupq_n_s32(0); #endif - int8x16x4_t q5bytes; + lm_ggml_int8x16x4_t q5bytes; float sumf = 0; @@ -5815,16 +5915,16 @@ void lm_ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * res const uint8_t * restrict qh = x[i].qh; const int8_t * restrict q8 = y[i].qs; - uint8x16x2_t qhbits = vld1q_u8_x2(qh); + lm_ggml_uint8x16x2_t qhbits = lm_ggml_vld1q_u8_x2(qh); - uint8x16x4_t q5h; + lm_ggml_uint8x16x4_t q5h; int32_t sumi = 0; for (int j = 0; j < QK_K/64; ++j) { - const uint8x16x2_t q5bits = vld1q_u8_x2(q5); q5 += 32; - const int8x16x4_t q8bytes = vld1q_s8_x4(q8); q8 += 64; + const lm_ggml_uint8x16x2_t q5bits = lm_ggml_vld1q_u8_x2(q5); q5 += 32; + const lm_ggml_int8x16x4_t q8bytes = lm_ggml_vld1q_s8_x4(q8); q8 += 64; q5h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits.val[0]), 4); q5h.val[1] = vshlq_n_u8(vandq_u8(mone, qhbits.val[1]), 4); @@ -6218,8 +6318,8 @@ void lm_ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * res const int32x4_t mzero = vdupq_n_s32(0); #endif - int8x16x4_t q5bytes; - uint8x16x4_t q5h; + lm_ggml_int8x16x4_t q5bytes; + lm_ggml_uint8x16x4_t q5h; float sumf = 0; @@ -6234,8 +6334,8 @@ void lm_ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * res const uint8x8_t qhbits = vld1_u8(qh); - const uint8x16x2_t q5bits = vld1q_u8_x2(q5); - const int8x16x4_t q8bytes = vld1q_s8_x4(q8); + const lm_ggml_uint8x16x2_t q5bits = lm_ggml_vld1q_u8_x2(q5); + const lm_ggml_int8x16x4_t q8bytes = lm_ggml_vld1q_s8_x4(q8); const uint8x16_t htmp = vcombine_u8(qhbits, vshr_n_u8(qhbits, 1)); q5h.val[0] = vbicq_u8(mh, vshlq_n_u8(htmp, 4)); @@ -6511,8 +6611,8 @@ void lm_ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * res const uint8x16_t mone = vdupq_n_u8(3); - int8x16x4_t q6bytes; - uint8x16x4_t q6h; + lm_ggml_int8x16x4_t q6bytes; + lm_ggml_uint8x16x4_t q6h; for (int i = 0; i < nb; ++i) { @@ -6524,9 +6624,9 @@ void lm_ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * res const int8_t * restrict scale = x[i].scales; - const int16x8x2_t q8sums = vld1q_s16_x2(y[i].bsums); + const lm_ggml_int16x8x2_t q8sums = 
lm_ggml_vld1q_s16_x2(y[i].bsums); const int8x16_t scales = vld1q_s8(scale); - const int16x8x2_t q6scales = {vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))}; + const lm_ggml_int16x8x2_t q6scales = {vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))}; const int32x4_t prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[0]), vget_low_s16 (q6scales.val[0])), vmull_s16(vget_high_s16(q8sums.val[0]), vget_high_s16(q6scales.val[0]))), @@ -6538,9 +6638,9 @@ void lm_ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * res for (int j = 0; j < QK_K/128; ++j) { - uint8x16x2_t qhbits = vld1q_u8_x2(qh); qh += 32; - uint8x16x4_t q6bits = vld1q_u8_x4(q6); q6 += 64; - int8x16x4_t q8bytes = vld1q_s8_x4(q8); q8 += 64; + lm_ggml_uint8x16x2_t qhbits = lm_ggml_vld1q_u8_x2(qh); qh += 32; + lm_ggml_uint8x16x4_t q6bits = lm_ggml_vld1q_u8_x4(q6); q6 += 64; + lm_ggml_int8x16x4_t q8bytes = lm_ggml_vld1q_s8_x4(q8); q8 += 64; q6h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits.val[0]), 4); q6h.val[1] = vshlq_n_u8(vandq_u8(mone, qhbits.val[1]), 4); @@ -6583,7 +6683,7 @@ void lm_ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * res scale += 2; #endif - q8bytes = vld1q_s8_x4(q8); q8 += 64; + q8bytes = lm_ggml_vld1q_s8_x4(q8); q8 += 64; shifted = vshrq_n_u8(qhbits.val[0], 4); q6h.val[0] = vshlq_n_u8(vandq_u8(mone, shifted), 4); @@ -6987,8 +7087,8 @@ void lm_ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * res const uint8x16_t mone = vdupq_n_u8(3); - int8x16x4_t q6bytes; - uint8x16x4_t q6h; + lm_ggml_int8x16x4_t q6bytes; + lm_ggml_uint8x16x4_t q6h; for (int i = 0; i < nb; ++i) { @@ -7002,9 +7102,9 @@ void lm_ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * res int32_t isum = 0; - uint8x16_t qhbits = vld1q_u8(qh); - uint8x16x2_t q6bits = vld1q_u8_x2(q6); - int8x16x4_t q8bytes = vld1q_s8_x4(q8); + uint8x16_t qhbits = vld1q_u8(qh); + lm_ggml_uint8x16x2_t q6bits = lm_ggml_vld1q_u8_x2(q6); + lm_ggml_int8x16x4_t q8bytes = lm_ggml_vld1q_s8_x4(q8); q6h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits), 4); uint8x16_t shifted = vshrq_n_u8(qhbits, 2); diff --git a/cpp/ggml.c b/cpp/ggml.c index c0d6ff3..2dda9e7 100644 --- a/cpp/ggml.c +++ b/cpp/ggml.c @@ -100,6 +100,49 @@ typedef void * thread_ret_t; #include #endif +#if defined(__APPLE__) +#include +#endif + +#if (defined(__linux__) || defined(__APPLE__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)) && \ + (!defined(TARGET_OS_TV) && !defined(TARGET_OS_WATCH)) + +#include + +void lm_ggml_print_backtrace(void) { + /* + #include + #include + + void * trace[100]; + + int nptrs = backtrace(trace, sizeof(trace)/sizeof(trace[0])); + + backtrace_symbols_fd(trace, nptrs, STDERR_FILENO); + */ + + // backtrack_symbols does not show line numbers, use gdb instead + char attach[32]; + snprintf(attach, sizeof(attach), "attach %d", getpid()); + int pid = fork(); + if (pid == 0) { + execlp("gdb", "gdb", "--batch", + "-ex", "set style enabled on", + "-ex", attach, + "-ex", "bt -frame-info source-and-location", + "-ex", "detach", + "-ex", "quit", + NULL); + } else { + waitpid(pid, NULL, 0); + } +} +#else +void lm_ggml_print_backtrace(void) { + // platform not supported +} +#endif + /*#define LM_GGML_PERF*/ #define LM_GGML_DEBUG 0 #define LM_GGML_GELU_FP16 @@ -190,24 +233,6 @@ inline static void * lm_ggml_aligned_malloc(size_t size) { #define UNUSED LM_GGML_UNUSED #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0) -// -// tensor access macros -// - -#define 
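lm_ggml_print_backtrace gets a symbolized, line-numbered stack trace by forking a child that attaches gdb to the parent, rather than relying on backtrace_symbols (which only prints raw addresses). A condensed sketch of the fork-and-attach pattern; it assumes a gdb binary on PATH and that the OS permits ptrace attachment to the parent:

#include <stdio.h>
#include <unistd.h>
#include <sys/wait.h>

static void print_backtrace_via_gdb(void) {
    char attach[32];
    snprintf(attach, sizeof(attach), "attach %d", (int) getpid());

    pid_t pid = fork();
    if (pid == 0) {
        // child: run gdb in batch mode against the parent and print a backtrace
        execlp("gdb", "gdb", "--batch",
               "-ex", attach,
               "-ex", "bt",
               "-ex", "detach",
               "-ex", "quit",
               (char *) NULL);
        _exit(1);   // exec failed
    } else if (pid > 0) {
        waitpid(pid, NULL, 0);   // parent blocks until gdb detaches
    }
}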
LM_GGML_TENSOR_UNARY_OP_LOCALS \ - LM_GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ - LM_GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ - LM_GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \ - LM_GGML_TENSOR_LOCALS(size_t, nb, dst, nb) - -#define LM_GGML_TENSOR_BINARY_OP_LOCALS \ - LM_GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ - LM_GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ - LM_GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \ - LM_GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \ - LM_GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \ - LM_GGML_TENSOR_LOCALS(size_t, nb, dst, nb) - #if defined(LM_GGML_USE_ACCELERATE) #include #if defined(LM_GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions @@ -228,6 +253,12 @@ inline static void * lm_ggml_aligned_malloc(size_t size) { // floating point type used to accumulate sums typedef double lm_ggml_float; +#undef MIN +#undef MAX + +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + // // global data // @@ -561,6 +592,18 @@ lm_ggml_type_traits_t lm_ggml_internal_get_type_traits(enum lm_ggml_type type) { // simd mappings // +#if defined(__ARM_NEON) +#if !defined(__aarch64__) + +// 64-bit compatibility + +inline static float vaddvq_f32(float32x4_t v) { + return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3); +} + +#endif +#endif + // we define a common set of C macros which map to specific intrinsics based on the current architecture // we then implement the fundamental computation operations below using only these macros // adding support for new architectures requires to define the corresponding SIMD macros @@ -1352,6 +1395,7 @@ inline static void lm_ggml_vec_step_f32 (const int n, float * y, const float * x inline static void lm_ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); } inline static void lm_ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; } inline static void lm_ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; } +inline static void lm_ggml_vec_leaky_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 
x[i] : 0.1f*x[i]; } static const float GELU_COEF_A = 0.044715f; static const float GELU_QUICK_COEF = -1.702f; @@ -1551,6 +1595,7 @@ static const char * LM_GGML_OP_NAME[LM_GGML_OP_COUNT] = { "GROUP_NORM", "MUL_MAT", + "MUL_MAT_ID", "OUT_PROD", "SCALE", @@ -1572,17 +1617,13 @@ static const char * LM_GGML_OP_NAME[LM_GGML_OP_COUNT] = { "ROPE_BACK", "ALIBI", "CLAMP", - "CONV_1D", - "CONV_1D_STAGE_0", - "CONV_1D_STAGE_1", "CONV_TRANSPOSE_1D", - "CONV_2D", - "CONV_2D_STAGE_0", - "CONV_2D_STAGE_1", + "IM2COL", "CONV_TRANSPOSE_2D", "POOL_1D", "POOL_2D", "UPSCALE", + "ARGSORT", "FLASH_ATTN", "FLASH_FF", @@ -1609,7 +1650,7 @@ static const char * LM_GGML_OP_NAME[LM_GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(LM_GGML_OP_COUNT == 73, "LM_GGML_OP_COUNT != 73"); +static_assert(LM_GGML_OP_COUNT == 70, "LM_GGML_OP_COUNT != 70"); static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = { "none", @@ -1638,6 +1679,7 @@ static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = { "group_norm(x)", "X*Y", + "X[i]*Y", "X*Y", "x*v", @@ -1659,17 +1701,13 @@ static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = { "rope_back(x)", "alibi(x)", "clamp(x)", - "conv_1d(x)", - "conv_1d_stage_0(x)", - "conv_1d_stage_1(x)", "conv_transpose_1d(x)", - "conv_2d(x)", - "conv_2d_stage_0(x)", - "conv_2d_stage_1(x)", + "im2col(x)", "conv_transpose_2d(x)", "pool_1d(x)", "pool_2d(x)", "upscale(x)", + "argsort(x)", "flash_attn(x)", "flash_ff(x)", @@ -1696,10 +1734,28 @@ static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(LM_GGML_OP_COUNT == 73, "LM_GGML_OP_COUNT != 73"); +static_assert(LM_GGML_OP_COUNT == 70, "LM_GGML_OP_COUNT != 70"); static_assert(LM_GGML_OP_POOL_COUNT == 2, "LM_GGML_OP_POOL_COUNT != 2"); + +static const char * LM_GGML_UNARY_OP_NAME[LM_GGML_UNARY_OP_COUNT] = { + "ABS", + "SGN", + "NEG", + "STEP", + "TANH", + "ELU", + "RELU", + "GELU", + "GELU_QUICK", + "SILU", + "LEAKY", +}; + +static_assert(LM_GGML_UNARY_OP_COUNT == 11, "LM_GGML_UNARY_OP_COUNT != 11"); + + static_assert(sizeof(struct lm_ggml_object)%LM_GGML_MEM_ALIGN == 0, "lm_ggml_object size must be a multiple of LM_GGML_MEM_ALIGN"); static_assert(sizeof(struct lm_ggml_tensor)%LM_GGML_MEM_ALIGN == 0, "lm_ggml_tensor size must be a multiple of LM_GGML_MEM_ALIGN"); @@ -1719,18 +1775,13 @@ static void lm_ggml_setup_op_has_task_pass(void) { p[LM_GGML_OP_ACC ] = true; p[LM_GGML_OP_MUL_MAT ] = true; + p[LM_GGML_OP_MUL_MAT_ID ] = true; p[LM_GGML_OP_OUT_PROD ] = true; p[LM_GGML_OP_SET ] = true; p[LM_GGML_OP_GET_ROWS_BACK ] = true; p[LM_GGML_OP_DIAG_MASK_INF ] = true; p[LM_GGML_OP_DIAG_MASK_ZERO ] = true; - p[LM_GGML_OP_CONV_1D ] = true; - p[LM_GGML_OP_CONV_1D_STAGE_0 ] = true; - p[LM_GGML_OP_CONV_1D_STAGE_1 ] = true; p[LM_GGML_OP_CONV_TRANSPOSE_1D ] = true; - p[LM_GGML_OP_CONV_2D ] = true; - p[LM_GGML_OP_CONV_2D_STAGE_0 ] = true; - p[LM_GGML_OP_CONV_2D_STAGE_1 ] = true; p[LM_GGML_OP_CONV_TRANSPOSE_2D ] = true; p[LM_GGML_OP_FLASH_ATTN_BACK ] = true; p[LM_GGML_OP_CROSS_ENTROPY_LOSS ] = true; @@ -1977,6 +2028,20 @@ const char * lm_ggml_op_symbol(enum lm_ggml_op op) { return LM_GGML_OP_SYMBOL[op]; } +const char * lm_ggml_unary_op_name(enum lm_ggml_unary_op op) { + return LM_GGML_UNARY_OP_NAME[op]; +} + +const char * lm_ggml_op_desc(const struct lm_ggml_tensor * t) { + if (t->op == LM_GGML_OP_UNARY) { + enum lm_ggml_unary_op uop = lm_ggml_get_unary_op(t); + return lm_ggml_unary_op_name(uop); + } + else { + return lm_ggml_op_name(t->op); + } +} + size_t lm_ggml_element_size(const struct 
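The new lm_ggml_op_desc helper makes debug output readable for fused unary nodes: a tensor whose op is UNARY reports its sub-op name (for example "LEAKY") instead of the generic "UNARY". A toy sketch of that two-level lookup; the enums and tables below are illustrative, not the real ggml definitions:

enum op       { OP_NONE, OP_MUL_MAT, OP_UNARY };
enum unary_op { UNARY_RELU, UNARY_GELU, UNARY_LEAKY };

static const char * OP_NAME[]    = { "NONE", "MUL_MAT", "UNARY" };
static const char * UNARY_NAME[] = { "RELU", "GELU", "LEAKY" };

struct tensor { enum op op; enum unary_op uop; };

// Resolve the descriptive name: UNARY nodes report their stored sub-op.
static const char * op_desc(const struct tensor * t) {
    return t->op == OP_UNARY ? UNARY_NAME[t->uop] : OP_NAME[t->op];
}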
lm_ggml_tensor * tensor) { return lm_ggml_type_size(tensor->type); } @@ -3108,9 +3173,7 @@ static struct lm_ggml_tensor * lm_ggml_add_impl( struct lm_ggml_tensor * a, struct lm_ggml_tensor * b, bool inplace) { - // TODO: support less-strict constraint - // LM_GGML_ASSERT(lm_ggml_can_repeat(b, a)); - LM_GGML_ASSERT(lm_ggml_can_repeat_rows(b, a)); + LM_GGML_ASSERT(lm_ggml_can_repeat(b, a)); bool is_node = false; @@ -3325,9 +3388,7 @@ static struct lm_ggml_tensor * lm_ggml_mul_impl( struct lm_ggml_tensor * a, struct lm_ggml_tensor * b, bool inplace) { - // TODO: support less-strict constraint - // LM_GGML_ASSERT(lm_ggml_can_repeat(b, a)); - LM_GGML_ASSERT(lm_ggml_can_repeat_rows(b, a)); + LM_GGML_ASSERT(lm_ggml_can_repeat(b, a)); bool is_node = false; @@ -3372,7 +3433,7 @@ static struct lm_ggml_tensor * lm_ggml_div_impl( struct lm_ggml_tensor * a, struct lm_ggml_tensor * b, bool inplace) { - LM_GGML_ASSERT(lm_ggml_are_same_shape(a, b)); + LM_GGML_ASSERT(lm_ggml_can_repeat(b, a)); bool is_node = false; @@ -3769,6 +3830,14 @@ struct lm_ggml_tensor * lm_ggml_relu_inplace( return lm_ggml_unary_inplace(ctx, a, LM_GGML_UNARY_OP_RELU); } +// lm_ggml_leaky + +struct lm_ggml_tensor * lm_ggml_leaky( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a) { + return lm_ggml_unary(ctx, a, LM_GGML_UNARY_OP_LEAKY); +} + // lm_ggml_gelu struct lm_ggml_tensor * lm_ggml_gelu( @@ -4002,6 +4071,49 @@ struct lm_ggml_tensor * lm_ggml_mul_mat( return result; } +// lm_ggml_mul_mat_id + +struct lm_ggml_tensor * lm_ggml_mul_mat_id( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * as[], + struct lm_ggml_tensor * ids, + int id, + struct lm_ggml_tensor * b) { + + int64_t n_as = ids->ne[0]; + + LM_GGML_ASSERT(ids->type == LM_GGML_TYPE_I32); + LM_GGML_ASSERT(lm_ggml_is_vector(ids)); + LM_GGML_ASSERT(n_as > 0 && n_as <= LM_GGML_MAX_SRC - 2); + LM_GGML_ASSERT(id >= 0 && id < n_as); + + bool is_node = false; + + if (as[0]->grad || b->grad) { + is_node = true; + } + + const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] }; + struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F32, MAX(as[0]->n_dims, b->n_dims), ne); + + lm_ggml_set_op_params_i32(result, 0, id); + + result->op = LM_GGML_OP_MUL_MAT_ID; + result->grad = is_node ? 
lm_ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = ids; + result->src[1] = b; + + for (int64_t i = 0; i < n_as; i++) { + struct lm_ggml_tensor * a = as[i]; + LM_GGML_ASSERT(lm_ggml_are_same_shape(as[0], a)); + LM_GGML_ASSERT(lm_ggml_can_mul_mat(a, b)); + LM_GGML_ASSERT(!lm_ggml_is_transposed(a)); + result->src[i + 2] = a; + } + + return result; +} + // lm_ggml_out_prod struct lm_ggml_tensor * lm_ggml_out_prod( @@ -4155,7 +4267,7 @@ struct lm_ggml_tensor * lm_ggml_set_2d_inplace( struct lm_ggml_tensor * b, size_t nb1, size_t offset) { - return lm_ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false); + return lm_ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, true); } // lm_ggml_cpy @@ -4772,7 +4884,17 @@ struct lm_ggml_tensor * lm_ggml_diag_mask_zero_inplace( static struct lm_ggml_tensor * lm_ggml_soft_max_impl( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, + struct lm_ggml_tensor * mask, + float scale, bool inplace) { + LM_GGML_ASSERT(lm_ggml_is_contiguous(a)); + if (mask) { + LM_GGML_ASSERT(lm_ggml_is_contiguous(mask)); + LM_GGML_ASSERT(mask->ne[2] == 1); + LM_GGML_ASSERT(mask->ne[3] == 1); + LM_GGML_ASSERT(lm_ggml_can_repeat_rows(mask, a)); + } + bool is_node = false; if (a->grad) { @@ -4781,9 +4903,13 @@ static struct lm_ggml_tensor * lm_ggml_soft_max_impl( struct lm_ggml_tensor * result = inplace ? lm_ggml_view_tensor(ctx, a) : lm_ggml_dup_tensor(ctx, a); + float params[] = { scale }; + lm_ggml_set_op_params(result, params, sizeof(params)); + result->op = LM_GGML_OP_SOFT_MAX; result->grad = is_node ? lm_ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; + result->src[1] = mask; return result; } @@ -4791,13 +4917,21 @@ static struct lm_ggml_tensor * lm_ggml_soft_max_impl( struct lm_ggml_tensor * lm_ggml_soft_max( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a) { - return lm_ggml_soft_max_impl(ctx, a, false); + return lm_ggml_soft_max_impl(ctx, a, NULL, 1.0f, false); } struct lm_ggml_tensor * lm_ggml_soft_max_inplace( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a) { - return lm_ggml_soft_max_impl(ctx, a, true); + return lm_ggml_soft_max_impl(ctx, a, NULL, 1.0f, true); +} + +struct lm_ggml_tensor * lm_ggml_soft_max_ext( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a, + struct lm_ggml_tensor * mask, + float scale) { + return lm_ggml_soft_max_impl(ctx, a, mask, scale, false); } // lm_ggml_soft_max_back @@ -5076,82 +5210,6 @@ static int64_t lm_ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int return (ins + 2 * p - d * (ks - 1) - 1) / s + 1; } -// im2col: [N, IC, IL] => [N, OL, IC*K] -// a: [OC,IC, K] -// b: [N, IC, IL] -// result: [N, OL, IC*K] -static struct lm_ggml_tensor * lm_ggml_conv_1d_stage_0( - struct lm_ggml_context * ctx, - struct lm_ggml_tensor * a, - struct lm_ggml_tensor * b, - int s0, - int p0, - int d0) { - LM_GGML_ASSERT(a->ne[1] == b->ne[1]); - bool is_node = false; - - if (a->grad || b->grad) { - LM_GGML_ASSERT(false); // TODO: implement backward - is_node = true; - } - - const int64_t OL = lm_ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0); - - const int64_t ne[4] = { - a->ne[1] * a->ne[0], - OL, - b->ne[2], - 1, - }; - struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F16, 4, ne); - - int32_t params[] = { s0, p0, d0 }; - lm_ggml_set_op_params(result, params, sizeof(params)); - - result->op = LM_GGML_OP_CONV_1D_STAGE_0; - result->grad = is_node ? 
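lm_ggml_mul_mat_id is the routing primitive for mixture-of-experts style layers: all candidate matrices are attached as src[2..], the I32 ids vector is src[0], and the op parameter says which entry of ids selects the matrix that actually multiplies b. A shape-free sketch of how the forward pass resolves the selected expert (plain C, no ggml types):

#include <assert.h>
#include <stdint.h>

// Mirror of the selection logic in lm_ggml_compute_forward_mul_mat_id:
// `id` indexes the ids vector, and the value stored there picks the expert.
static int resolve_expert(const int32_t * ids, int n_as, int id) {
    assert(id >= 0 && id < n_as);
    const int a_id = ids[id];
    assert(a_id >= 0 && a_id < n_as);
    return a_id;   // the caller then runs an ordinary mat-mul with as[a_id]
}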
lm_ggml_dup_tensor(ctx, result) : NULL; - result->src[0] = a; - result->src[1] = b; - - return result; -} - -// lm_ggml_conv_1d_stage_1 - -// gemm: [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K] -// a: [OC, IC, K] -// b: [N, OL, IC * K] -// result: [N, OC, OL] -static struct lm_ggml_tensor * lm_ggml_conv_1d_stage_1( - struct lm_ggml_context * ctx, - struct lm_ggml_tensor * a, - struct lm_ggml_tensor * b) { - - bool is_node = false; - - if (a->grad || b->grad) { - LM_GGML_ASSERT(false); // TODO: implement backward - is_node = true; - } - - const int64_t ne[4] = { - b->ne[1], - a->ne[2], - b->ne[2], - 1, - }; - struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F32, 4, ne); - - result->op = LM_GGML_OP_CONV_1D_STAGE_1; - result->grad = is_node ? lm_ggml_dup_tensor(ctx, result) : NULL; - result->src[0] = a; - result->src[1] = b; - - return result; -} - -// lm_ggml_conv_1d - LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_1d( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, @@ -5159,43 +5217,17 @@ LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_1d( int s0, int p0, int d0) { - struct lm_ggml_tensor * result = lm_ggml_conv_1d_stage_0(ctx, a, b, s0, p0, d0); - result = lm_ggml_conv_1d_stage_1(ctx, a, result); - return result; -} + struct lm_ggml_tensor * im2col = lm_ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false); // [N, OL, IC * K] -// LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_1d( -// struct lm_ggml_context * ctx, -// struct lm_ggml_tensor * a, -// struct lm_ggml_tensor * b, -// int s0, -// int p0, -// int d0) { -// LM_GGML_ASSERT(lm_ggml_is_matrix(b)); -// LM_GGML_ASSERT(a->ne[1] == b->ne[1]); -// bool is_node = false; + struct lm_ggml_tensor * result = + lm_ggml_mul_mat(ctx, + lm_ggml_reshape_2d(ctx, im2col, im2col->ne[0], (im2col->ne[2] * im2col->ne[1])), // [N, OL, IC * K] => [N*OL, IC * K] + lm_ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1]), a->ne[2])); // [OC,IC, K] => [OC, IC * K] -// if (a->grad || b->grad) { -// LM_GGML_ASSERT(false); // TODO: implement backward -// is_node = true; -// } + result = lm_ggml_reshape_3d(ctx, result, im2col->ne[1], a->ne[2], im2col->ne[2]); // [N, OC, OL] -// const int64_t ne[4] = { -// lm_ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0), -// a->ne[2], 1, 1, -// }; -// struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F32, 2, ne); - -// int32_t params[] = { s0, p0, d0 }; -// lm_ggml_set_op_params(result, params, sizeof(params)); - -// result->op = LM_GGML_OP_CONV_1D; -// result->grad = is_node ? 
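conv_1d is no longer a dedicated two-stage operator; it is now expressed as im2col followed by a regular mat-mul and a reshape, so the convolution rides on the existing mat-mul kernels. A tiny, unoptimized end-to-end illustration of that decomposition (stride 1, no padding or dilation, fixed sizes, scalar C):

#include <stdio.h>

#define IC 2
#define IL 5
#define K  3
#define OC 1
#define OL (IL - K + 1)

int main(void) {
    const float x[IC][IL]    = { {1, 2, 3, 4, 5}, {5, 4, 3, 2, 1} };
    const float w[OC][IC][K] = { { {1, 0, -1}, {0, 1, 0} } };

    // im2col: each output position gathers its receptive field -> [OL][IC*K]
    float cols[OL][IC*K];
    for (int ol = 0; ol < OL; ++ol)
        for (int ic = 0; ic < IC; ++ic)
            for (int k = 0; k < K; ++k)
                cols[ol][ic*K + k] = x[ic][ol + k];

    // mat-mul against the kernel flattened to [OC][IC*K]
    float y[OC][OL] = {{0}};
    for (int oc = 0; oc < OC; ++oc)
        for (int ol = 0; ol < OL; ++ol)
            for (int j = 0; j < IC*K; ++j)
                y[oc][ol] += ((const float *) w[oc])[j] * cols[ol][j];

    for (int ol = 0; ol < OL; ++ol) printf("%g ", y[0][ol]);   // prints: 2 1 0
    printf("\n");
    return 0;
}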
lm_ggml_dup_tensor(ctx, result) : NULL; -// result->src[0] = a; -// result->src[1] = b; - -// return result; -// } + return result; +} // lm_ggml_conv_1d_ph @@ -5258,7 +5290,7 @@ LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_transpose_1d( // a: [OC,IC, KH, KW] // b: [N, IC, IH, IW] // result: [N, OH, OW, IC*KH*KW] -static struct lm_ggml_tensor * lm_ggml_conv_2d_stage_0( +struct lm_ggml_tensor * lm_ggml_im2col( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, struct lm_ggml_tensor * b, @@ -5267,9 +5299,14 @@ static struct lm_ggml_tensor * lm_ggml_conv_2d_stage_0( int p0, int p1, int d0, - int d1) { + int d1, + bool is_2D) { - LM_GGML_ASSERT(a->ne[2] == b->ne[2]); + if(is_2D) { + LM_GGML_ASSERT(a->ne[2] == b->ne[2]); + } else { + LM_GGML_ASSERT(a->ne[1] == b->ne[1]); + } bool is_node = false; if (a->grad || b->grad) { @@ -5277,81 +5314,51 @@ static struct lm_ggml_tensor * lm_ggml_conv_2d_stage_0( is_node = true; } - const int64_t OH = lm_ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1); - const int64_t OW = lm_ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0); + const int64_t OH = is_2D ? lm_ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1) : 0; + const int64_t OW = lm_ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0); const int64_t ne[4] = { - a->ne[2] * a->ne[1] * a->ne[0], + is_2D ? (a->ne[2] * a->ne[1] * a->ne[0]) : a->ne[1] * a->ne[0], OW, - OH, - b->ne[3], + is_2D ? OH : b->ne[2], + is_2D ? b->ne[3] : 1, }; - struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F16, 4, ne); - int32_t params[] = { s0, s1, p0, p1, d0, d1 }; + struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F16, 4, ne); + int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) }; lm_ggml_set_op_params(result, params, sizeof(params)); - result->op = LM_GGML_OP_CONV_2D_STAGE_0; - result->grad = is_node ? lm_ggml_dup_tensor(ctx, result) : NULL; - result->src[0] = a; - result->src[1] = b; - - return result; - -} - -// gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] -// a: [OC, IC, KH, KW] -// b: [N, OH, OW, IC * KH * KW] -// result: [N, OC, OH, OW] -static struct lm_ggml_tensor * lm_ggml_conv_2d_stage_1( - struct lm_ggml_context * ctx, - struct lm_ggml_tensor * a, - struct lm_ggml_tensor * b) { - - bool is_node = false; - - if (a->grad || b->grad) { - LM_GGML_ASSERT(false); // TODO: implement backward - is_node = true; - } - - const int64_t ne[4] = { - b->ne[1], - b->ne[2], - a->ne[3], - b->ne[3], - }; - struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F32, 4, ne); - - result->op = LM_GGML_OP_CONV_2D_STAGE_1; + result->op = LM_GGML_OP_IM2COL; result->grad = is_node ? 
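The generalized im2col sizes its output with the usual convolution arithmetic, OL = (IL + 2*p - d*(K - 1) - 1)/s + 1, via lm_ggml_calc_conv_output_size. A one-function restatement with a worked example:

#include <stdint.h>

// Output length of a convolution along one axis: input ins, kernel ks,
// stride s, padding p, dilation d (same arithmetic as lm_ggml_calc_conv_output_size).
static int64_t conv_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
    return (ins + 2*p - d*(ks - 1) - 1) / s + 1;
}

// Example: ins = 224, ks = 3, s = 2, p = 1, d = 1
//   (224 + 2 - 2 - 1)/2 + 1 = 111 + 1 = 112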
lm_ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; return result; - } // a: [OC,IC, KH, KW] // b: [N, IC, IH, IW] // result: [N, OC, OH, OW] struct lm_ggml_tensor * lm_ggml_conv_2d( - struct lm_ggml_context * ctx, - struct lm_ggml_tensor * a, - struct lm_ggml_tensor * b, - int s0, - int s1, - int p0, - int p1, - int d0, - int d1) { + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a, + struct lm_ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1) { + struct lm_ggml_tensor * im2col = lm_ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true); // [N, OH, OW, IC * KH * KW] - struct lm_ggml_tensor * result = lm_ggml_conv_2d_stage_0(ctx, a, b, s0, s1, p0, p1, d0, d1); // [N, OH, OW, IC * KH * KW] - result = lm_ggml_conv_2d_stage_1(ctx, a, result); + struct lm_ggml_tensor * result = + lm_ggml_mul_mat(ctx, + lm_ggml_reshape_2d(ctx, im2col, im2col->ne[0], im2col->ne[3] * im2col->ne[2] * im2col->ne[1]), // [N, OH, OW, IC * KH * KW] => [N*OH*OW, IC * KH * KW] + lm_ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1] * a->ne[2]), a->ne[3])); // [OC,IC, KH, KW] => [OC, IC * KH * KW] - return result; + result = lm_ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], a->ne[3], im2col->ne[3]); // [N, OC, OH, OW] + return result; } // lm_ggml_conv_2d_sk_p0 @@ -5411,7 +5418,7 @@ struct lm_ggml_tensor * lm_ggml_conv_transpose_2d_p0( // lm_ggml_pool_* -static int64_t lm_ggml_calc_pool_output_size(int64_t ins, int ks, int s, int p) { +static int64_t lm_ggml_calc_pool_output_size(int64_t ins, int ks, int s, float p) { return (ins + 2 * p - ks) / s + 1; } @@ -5458,8 +5465,8 @@ struct lm_ggml_tensor * lm_ggml_pool_2d( int k1, int s0, int s1, - int p0, - int p1) { + float p0, + float p1) { bool is_node = false; @@ -5519,6 +5526,43 @@ struct lm_ggml_tensor * lm_ggml_upscale( return lm_ggml_upscale_impl(ctx, a, scale_factor); } +// lm_ggml_argsort + +struct lm_ggml_tensor * lm_ggml_argsort( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a, + enum lm_ggml_sort_order order) { + bool is_node = false; + + struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_I32, a->n_dims, a->ne); + + lm_ggml_set_op_params_i32(result, 0, (int32_t) order); + + result->op = LM_GGML_OP_ARGSORT; + result->grad = is_node ? 
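conv_2d follows the same recipe in four dimensions: im2col produces [N, OH, OW, IC*KH*KW], both operands are flattened to 2-D for the mat-mul, and the result is reshaped back to [N, OC, OH, OW]. A comment-only shape walk-through with illustrative sizes (N = 1, IC = 3, OC = 8, KH = KW = 3, OH = OW = 32), mirroring the bracket notation used in the comments above:

// im2col:  b [N, IC, IH, IW]          -> [N, OH, OW, IC*KH*KW]  = [1, 32, 32, 27]
// reshape: kernel a [OC, IC, KH, KW]  -> [OC, IC*KH*KW]         = [8, 27]
// reshape: im2col                     -> [N*OH*OW, IC*KH*KW]    = [1024, 27]
// mat-mul: contract over IC*KH*KW     -> [N*OH*OW, OC]
// reshape: back to [N, OC, OH, OW]                              = [1, 8, 32, 32]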
lm_ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// lm_ggml_top_k + +struct lm_ggml_tensor * lm_ggml_top_k( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a, + int k) { + LM_GGML_ASSERT(a->ne[0] >= k); + + struct lm_ggml_tensor * result = lm_ggml_argsort(ctx, a, LM_GGML_SORT_DESC); + + result = lm_ggml_view_4d(ctx, result, + k, result->ne[1], result->ne[2], result->ne[3], + result->nb[1], result->nb[2], result->nb[3], + 0); + + return result; +} + // lm_ggml_flash_attn struct lm_ggml_tensor * lm_ggml_flash_attn( @@ -6878,7 +6922,7 @@ static void lm_ggml_compute_forward_add_f32( const struct lm_ggml_tensor * src0, const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { - LM_GGML_ASSERT(lm_ggml_can_repeat_rows(src1, src0) && lm_ggml_are_same_shape(src0, dst)); + LM_GGML_ASSERT(lm_ggml_can_repeat(src1, src0) && lm_ggml_are_same_shape(src0, dst)); if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { return; @@ -6911,16 +6955,19 @@ static void lm_ggml_compute_forward_add_f32( const int64_t i13 = i03 % ne13; const int64_t i12 = i02 % ne12; const int64_t i11 = i01 % ne11; + const int64_t nr0 = ne00 / ne10; float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); + for (int64_t r = 0; r < nr0; ++r) { #ifdef LM_GGML_USE_ACCELERATE - vDSP_vadd(src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00); + vDSP_vadd(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10); #else - lm_ggml_vec_add_f32(ne00, dst_ptr, src0_ptr, src1_ptr); + lm_ggml_vec_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); #endif + } } } else { // src1 is not contiguous @@ -6937,8 +6984,9 @@ static void lm_ggml_compute_forward_add_f32( float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); - for (int i0 = 0; i0 < ne0; i0++) { - float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10); + for (int64_t i0 = 0; i0 < ne0; ++i0) { + const int64_t i10 = i0 % ne10; + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10); dst_ptr[i0] = src0_ptr[i0] + *src1_ptr; } @@ -7658,7 +7706,7 @@ static void lm_ggml_compute_forward_mul_f32( const struct lm_ggml_tensor * src0, const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { - LM_GGML_ASSERT(lm_ggml_can_repeat_rows(src1, src0) && lm_ggml_are_same_shape(src0, dst)); + LM_GGML_ASSERT(lm_ggml_can_repeat(src1, src0) && lm_ggml_are_same_shape(src0, dst)); if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { return; @@ -7681,7 +7729,6 @@ static void lm_ggml_compute_forward_mul_f32( LM_GGML_ASSERT( nb0 == sizeof(float)); LM_GGML_ASSERT(nb00 == sizeof(float)); - LM_GGML_ASSERT(ne00 == ne10); if (nb10 == sizeof(float)) { for (int64_t ir = ith; ir < nr; ir += nth) { @@ -7693,20 +7740,21 @@ static void lm_ggml_compute_forward_mul_f32( const int64_t i13 = i03 % ne13; const int64_t i12 = i02 % ne12; const int64_t i11 = i01 % ne11; + const int64_t nr0 = ne00 / ne10; float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 
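lm_ggml_top_k is simply a descending argsort followed by a view of the first k columns, so it yields the indices (not the values) of the k largest entries per row. A scalar sketch of the same idea using qsort; the file-scope comparator state is a shortcut for the sketch only and is not thread-safe:

#include <stdlib.h>

static const float * g_vals;   // values being argsorted (sketch-only global)

static int cmp_desc(const void * pa, const void * pb) {
    const int a = *(const int *) pa, b = *(const int *) pb;
    return (g_vals[a] < g_vals[b]) - (g_vals[a] > g_vals[b]);   // larger value first
}

// Fill idx[0..k) with the indices of the k largest elements of x[0..n).
static void top_k_indices(const float * x, int n, int k, int * idx) {
    int * order = malloc(n * sizeof(int));
    for (int i = 0; i < n; ++i) order[i] = i;
    g_vals = x;
    qsort(order, n, sizeof(int), cmp_desc);          // the "argsort" step
    for (int i = 0; i < k; ++i) idx[i] = order[i];   // the "view of the first k" step
    free(order);
}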
+ i11*nb11); + for (int64_t r = 0 ; r < nr0; ++r) { #ifdef LM_GGML_USE_ACCELERATE - UNUSED(lm_ggml_vec_mul_f32); + UNUSED(lm_ggml_vec_mul_f32); - vDSP_vmul( src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00); + vDSP_vmul(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10); #else - lm_ggml_vec_mul_f32(ne00, dst_ptr, src0_ptr, src1_ptr); + lm_ggml_vec_mul_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); #endif - // } - // } + } } } else { // src1 is not contiguous @@ -7724,8 +7772,9 @@ static void lm_ggml_compute_forward_mul_f32( float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); - for (int64_t i0 = 0; i0 < ne00; i0++) { - float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10); + for (int64_t i0 = 0; i0 < ne00; ++i0) { + const int64_t i10 = i0 % ne10; + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10); dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr); } @@ -7759,14 +7808,16 @@ static void lm_ggml_compute_forward_div_f32( const struct lm_ggml_tensor * src0, const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { - assert(params->ith == 0); - assert(lm_ggml_are_same_shape(src0, src1) && lm_ggml_are_same_shape(src0, dst)); + LM_GGML_ASSERT(lm_ggml_can_repeat(src1, src0) && lm_ggml_are_same_shape(src0, dst)); if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { return; } - const int nr = lm_ggml_nrows(src0); + const int ith = params->ith; + const int nth = params->nth; + + const int64_t nr = lm_ggml_nrows(src0); LM_GGML_TENSOR_BINARY_OP_LOCALS @@ -7774,41 +7825,50 @@ static void lm_ggml_compute_forward_div_f32( LM_GGML_ASSERT(nb00 == sizeof(float)); if (nb10 == sizeof(float)) { - for (int ir = 0; ir < nr; ++ir) { - // src0, src1 and dst are same shape => same indices - const int i3 = ir/(ne2*ne1); - const int i2 = (ir - i3*ne2*ne1)/ne1; - const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + for (int64_t ir = ith; ir < nr; ir += nth) { + // src0 and dst are same shape => same indices + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + const int64_t nr0 = ne00 / ne10; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); + for (int64_t r = 0; r < nr0; ++r) { #ifdef LM_GGML_USE_ACCELERATE - UNUSED(lm_ggml_vec_div_f32); + UNUSED(lm_ggml_vec_div_f32); - vDSP_vdiv( - (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1, - (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1, - (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1, - ne0); + vDSP_vdiv(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10); #else - lm_ggml_vec_div_f32(ne0, - (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), - (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), - (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); + lm_ggml_vec_div_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); #endif - // } - // } + } } } else { // src1 is not contiguous - for (int ir = 0; ir < nr; ++ir) { - // src0, src1 and dst are same shape => 
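The add/mul/div kernels above now broadcast src1 along dimension 0 as well: lm_ggml_can_repeat guarantees ne00 is a multiple of ne10, and the inner loop applies the same length-ne10 src1 row nr0 = ne00/ne10 times across each src0 row. A stripped-down scalar version of that inner loop for the add case, contiguous data only:

#include <assert.h>

// dst[i] = src0[i] + src1[i % ne10] for one row, written as the kernels do it:
// nr0 repeats of a length-ne10 vector add against the same src1 row.
static void add_row_broadcast(float * dst, const float * src0, const float * src1,
                              int ne00, int ne10) {
    assert(ne00 % ne10 == 0);
    const int nr0 = ne00 / ne10;
    for (int r = 0; r < nr0; ++r) {
        for (int i = 0; i < ne10; ++i) {
            dst[r*ne10 + i] = src0[r*ne10 + i] + src1[i];
        }
    }
}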
same indices - const int i3 = ir/(ne2*ne1); - const int i2 = (ir - i3*ne2*ne1)/ne1; - const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - - float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); - float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); - for (int i0 = 0; i0 < ne0; i0++) { - float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10); + for (int64_t ir = ith; ir < nr; ir += nth) { + // src0 and dst are same shape => same indices + // src1 is broadcastable across src0 and dst in i1, i2, i3 + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + + for (int64_t i0 = 0; i0 < ne00; ++i0) { + const int64_t i10 = i0 % ne10; + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10); dst_ptr[i0] = src0_ptr[i0] / (*src1_ptr); } @@ -8254,7 +8314,7 @@ static void lm_ggml_compute_forward_repeat_f16( return; } - LM_GGML_TENSOR_UNARY_OP_LOCALS; + LM_GGML_TENSOR_UNARY_OP_LOCALS // guaranteed to be an integer due to the check in lm_ggml_can_repeat const int nr0 = (int)(ne0/ne00); @@ -8399,6 +8459,7 @@ static void lm_ggml_compute_forward_concat_f32( LM_GGML_ASSERT(src0->nb[0] == sizeof(float)); const int ith = params->ith; + const int nth = params->nth; LM_GGML_TENSOR_BINARY_OP_LOCALS @@ -8408,7 +8469,7 @@ static void lm_ggml_compute_forward_concat_f32( LM_GGML_ASSERT(nb10 == sizeof(float)); for (int i3 = 0; i3 < ne3; i3++) { - for (int i2 = ith; i2 < ne2; i2++) { + for (int i2 = ith; i2 < ne2; i2 += nth) { if (i2 < ne02) { // src0 for (int i1 = 0; i1 < ne1; i1++) { for (int i0 = 0; i0 < ne0; i0++) { @@ -8921,6 +8982,48 @@ static void lm_ggml_compute_forward_silu( } } +// lm_ggml_compute_forward_leaky + +static void lm_ggml_compute_forward_leaky_f32( + const struct lm_ggml_compute_params * params, + const struct lm_ggml_tensor * src0, + struct lm_ggml_tensor * dst) { + assert(params->ith == 0); + assert(lm_ggml_are_same_shape(src0, dst)); + + if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { + return; + } + + const int n = lm_ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert(dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + lm_ggml_vec_leaky_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void lm_ggml_compute_forward_leaky( + const struct lm_ggml_compute_params * params, + const struct lm_ggml_tensor * src0, + struct lm_ggml_tensor * dst) { + switch (src0->type) { + case LM_GGML_TYPE_F32: + { + lm_ggml_compute_forward_leaky_f32(params, src0, dst); + } break; + default: + { + LM_GGML_ASSERT(false); + } break; + } +} + // lm_ggml_compute_forward_silu_back static void lm_ggml_compute_forward_silu_back_f32( @@ -9404,6 +9507,8 @@ static bool lm_ggml_compute_forward_mul_mat_use_blas( // TODO: find the optimal values for these if (lm_ggml_is_contiguous(src0) && lm_ggml_is_contiguous(src1) && + //src0->type == LM_GGML_TYPE_F32 && + src1->type == LM_GGML_TYPE_F32 && (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) { /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, 
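The concat change above is a straightforward work-partitioning fix: each of nth threads starts at its own index ith and strides by nth, so every i2 plane has exactly one owner instead of being touched by several threads. The same round-robin split is used throughout the compute kernels; a minimal sketch:

// Round-robin partitioning: thread `ith` of `nth` handles items
// ith, ith + nth, ith + 2*nth, ... and no item is processed twice.
static void for_each_owned_plane(int ith, int nth, int ne2) {
    for (int i2 = ith; i2 < ne2; i2 += nth) {
        // work on plane i2
        (void) i2;
    }
}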
ne00, ne01);*/ @@ -9442,7 +9547,7 @@ static void lm_ggml_compute_forward_mul_mat( // we don't support permuted src0 or src1 LM_GGML_ASSERT(nb00 == lm_ggml_type_size(type)); - LM_GGML_ASSERT(nb10 == sizeof(float)); + LM_GGML_ASSERT(nb10 == lm_ggml_type_size(src1->type)); // dst cannot be transposed or permuted LM_GGML_ASSERT(nb0 == sizeof(float)); @@ -9524,6 +9629,8 @@ static void lm_ggml_compute_forward_mul_mat( char * wdata = params->wdata; const size_t row_size = ne10*lm_ggml_type_size(vec_dot_type)/lm_ggml_blck_size(vec_dot_type); + assert(params->wsize >= ne11*ne12*ne13*row_size); + for (int64_t i13 = 0; i13 < ne13; ++i13) { for (int64_t i12 = 0; i12 < ne12; ++i12) { for (int64_t i11 = 0; i11 < ne11; ++i11) { @@ -9625,6 +9732,26 @@ static void lm_ggml_compute_forward_mul_mat( } } +// lm_ggml_compute_forward_mul_mat_id + +static void lm_ggml_compute_forward_mul_mat_id( + const struct lm_ggml_compute_params * params, + struct lm_ggml_tensor * dst) { + + const struct lm_ggml_tensor * ids = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + + const int id = lm_ggml_get_op_params_i32(dst, 0); + + const int a_id = ((int32_t *)ids->data)[id]; + + LM_GGML_ASSERT(a_id >= 0 && a_id < ids->ne[0]); + + const struct lm_ggml_tensor * src0 = dst->src[a_id + 2]; + + lm_ggml_compute_forward_mul_mat(params, src0, src1, dst); +} + // lm_ggml_compute_forward_out_prod static void lm_ggml_compute_forward_out_prod_f32( @@ -9640,10 +9767,12 @@ static void lm_ggml_compute_forward_out_prod_f32( const int ith = params->ith; const int nth = params->nth; + LM_GGML_ASSERT(ne0 == ne00); + LM_GGML_ASSERT(ne1 == ne10); + LM_GGML_ASSERT(ne2 == ne02); LM_GGML_ASSERT(ne02 == ne12); - LM_GGML_ASSERT(ne03 == ne13); - LM_GGML_ASSERT(ne2 == ne12); LM_GGML_ASSERT(ne3 == ne13); + LM_GGML_ASSERT(ne03 == ne13); // we don't support permuted src0 or src1 LM_GGML_ASSERT(nb00 == sizeof(float)); @@ -9654,18 +9783,25 @@ static void lm_ggml_compute_forward_out_prod_f32( // LM_GGML_ASSERT(nb1 <= nb2); // LM_GGML_ASSERT(nb2 <= nb3); - LM_GGML_ASSERT(ne0 == ne00); - LM_GGML_ASSERT(ne1 == ne10); - LM_GGML_ASSERT(ne2 == ne02); - LM_GGML_ASSERT(ne3 == ne03); - // nb01 >= nb00 - src0 is not transposed // compute by src0 rows // TODO: #if defined(LM_GGML_USE_CUBLAS) lm_ggml_cuda_out_prod - // TODO: #if defined(LM_GGML_USE_ACCELERATE) || defined(LM_GGML_USE_OPENBLAS) || defined(LM_GGML_USE_CLBLAST) + // TODO: #if defined(LM_GGML_USE_CLBLAST) + +#if defined(LM_GGML_USE_ACCELERATE) || defined(LM_GGML_USE_OPENBLAS) + bool use_blas = lm_ggml_is_matrix(src0) && + lm_ggml_is_matrix(src1) && + lm_ggml_is_contiguous(src0) && + (lm_ggml_is_contiguous(src1) || lm_ggml_is_transposed(src1)); +#endif if (params->type == LM_GGML_TASK_INIT) { +#if defined(LM_GGML_USE_ACCELERATE) || defined(LM_GGML_USE_OPENBLAS) // gemm beta will zero dst + if (use_blas) { + return; + } +#endif lm_ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); return; } @@ -9674,6 +9810,50 @@ static void lm_ggml_compute_forward_out_prod_f32( return; } +#if defined(LM_GGML_USE_ACCELERATE) || defined(LM_GGML_USE_OPENBLAS) + if (use_blas) { + if (params->ith != 0) { // All threads other than the first do no work. 
+ return; + } + // Arguments to lm_ggml_compute_forward_out_prod (expressed as major,minor) + // src0: (k,n) + // src1: (k,m) + // dst: (m,n) + // + // Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f) + // Also expressed as (major,minor) + // a: (m,k): so src1 transposed + // b: (k,n): so src0 + // c: (m,n) + // + // However, if lm_ggml_is_transposed(src1) is true, then + // src1->data already contains a transposed version, so sgemm mustn't + // transpose it further. + + int n = src0->ne[0]; + int k = src0->ne[1]; + int m = src1->ne[0]; + + int transposeA, lda; + + if (!lm_ggml_is_transposed(src1)) { + transposeA = CblasTrans; + lda = m; + } else { + transposeA = CblasNoTrans; + lda = k; + } + + float * a = (float *) ((char *) src1->data); + float * b = (float *) ((char *) src0->data); + float * c = (float *) ((char *) dst->data); + + cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n); + + return; + } +#endif + // dst[:,:,:,:] = 0 // for i2,i3: // for i1: @@ -10527,20 +10707,25 @@ static void lm_ggml_compute_forward_diag_mask_zero( static void lm_ggml_compute_forward_soft_max_f32( const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, - struct lm_ggml_tensor * dst) { - LM_GGML_ASSERT(lm_ggml_is_contiguous(src0)); - LM_GGML_ASSERT(lm_ggml_is_contiguous(dst)); - LM_GGML_ASSERT(lm_ggml_are_same_shape(src0, dst)); + const struct lm_ggml_tensor * src1, + struct lm_ggml_tensor * dst) { + assert(lm_ggml_is_contiguous(dst)); + assert(lm_ggml_are_same_shape(src0, dst)); if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { return; } + float scale = 1.0f; + memcpy(&scale, (float *) dst->op_params + 0, sizeof(float)); + // TODO: handle transposed/permuted matrices const int ith = params->ith; const int nth = params->nth; + const int64_t ne11 = src1 ? src1->ne[1] : 1; + const int nc = src0->ne[0]; const int nr = lm_ggml_nrows(src0); @@ -10551,29 +10736,40 @@ static void lm_ggml_compute_forward_soft_max_f32( const int ir0 = dr*ith; const int ir1 = MIN(ir0 + dr, nr); + float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith; + for (int i1 = ir0; i1 < ir1; i1++) { - float *sp = (float *)((char *) src0->data + i1*src0->nb[1]); - float *dp = (float *)((char *) dst->data + i1*dst->nb[1]); + float * sp = (float *)((char *) src0->data + i1*src0->nb[1]); + float * dp = (float *)((char *) dst->data + i1*dst->nb[1]); + + // broadcast the mask across rows + float * mp = src1 ? (float *)((char *) src1->data + (i1%ne11)*src1->nb[1]) : NULL; + + lm_ggml_vec_cpy_f32 (nc, wp, sp); + lm_ggml_vec_scale_f32(nc, wp, scale); + if (mp) { + lm_ggml_vec_acc_f32(nc, wp, mp); + } #ifndef NDEBUG for (int i = 0; i < nc; ++i) { //printf("p[%d] = %f\n", i, p[i]); - assert(!isnan(sp[i])); + assert(!isnan(wp[i])); } #endif float max = -INFINITY; - lm_ggml_vec_max_f32(nc, &max, sp); + lm_ggml_vec_max_f32(nc, &max, wp); lm_ggml_float sum = 0.0; uint16_t scvt; for (int i = 0; i < nc; i++) { - if (sp[i] == -INFINITY) { + if (wp[i] == -INFINITY) { dp[i] = 0.0f; } else { - // const float val = (sp[i] == -INFINITY) ? 0.0 : exp(sp[i] - max); - lm_ggml_fp16_t s = LM_GGML_FP32_TO_FP16(sp[i] - max); + // const float val = (wp[i] == -INFINITY) ? 
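The new BLAS fast path maps lm_ggml_out_prod onto one sgemm call. Using the (major, minor) convention from the comment above, src0 is (k, n), src1 is (k, m) and dst is (m, n), so the product is dst = src1^T * src0; the transpose flag and lda switch cover the case where src1 is already stored transposed. A hedged restatement of the non-transposed branch against the standard CBLAS interface (any CBLAS provider such as OpenBLAS or Accelerate):

#include <cblas.h>

// dst(m, n) = src1(k, m)^T * src0(k, n), row-major, as in the out_prod fast path.
static void out_prod_sgemm(const float * src0, const float * src1, float * dst,
                           int n, int k, int m) {
    cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
                m, n, k,
                1.0f, src1, m,    // A = src1 stored (k, m): lda = m, TransA applies
                      src0, n,    // B = src0 stored (k, n): ldb = n
                0.0f, dst,  n);   // C = dst  stored (m, n): ldc = n
}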
0.0 : exp(wp[i] - max); + lm_ggml_fp16_t s = LM_GGML_FP32_TO_FP16(wp[i] - max); memcpy(&scvt, &s, sizeof(scvt)); const float val = LM_GGML_FP16_TO_FP32(lm_ggml_table_exp_f16[scvt]); sum += (lm_ggml_float)val; @@ -10598,11 +10794,12 @@ static void lm_ggml_compute_forward_soft_max_f32( static void lm_ggml_compute_forward_soft_max( const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, - struct lm_ggml_tensor * dst) { + const struct lm_ggml_tensor * src1, + struct lm_ggml_tensor * dst) { switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_soft_max_f32(params, src0, dst); + lm_ggml_compute_forward_soft_max_f32(params, src0, src1, dst); } break; default: { @@ -11340,9 +11537,9 @@ static void lm_ggml_compute_forward_rope_back( } } -// lm_ggml_compute_forward_conv_1d +// lm_ggml_compute_forward_conv_transpose_1d -static void lm_ggml_compute_forward_conv_1d_f16_f32( +static void lm_ggml_compute_forward_conv_transpose_1d_f16_f32( const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, const struct lm_ggml_tensor * src1, @@ -11359,14 +11556,7 @@ static void lm_ggml_compute_forward_conv_1d_f16_f32( const int ith = params->ith; const int nth = params->nth; - const int nk = ne00; - - // size of the convolution row - the kernel size unrolled across all input channels - const int ew0 = nk*ne01; - - const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; - const int32_t p0 = ((const int32_t*)(dst->op_params))[1]; - const int32_t d0 = ((const int32_t*)(dst->op_params))[2]; + const int nk = ne00*ne01*ne02; LM_GGML_ASSERT(nb00 == sizeof(lm_ggml_fp16_t)); LM_GGML_ASSERT(nb10 == sizeof(float)); @@ -11374,23 +11564,37 @@ static void lm_ggml_compute_forward_conv_1d_f16_f32( if (params->type == LM_GGML_TASK_INIT) { memset(params->wdata, 0, params->wsize); - lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) params->wdata + 0; + // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout) + { + lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) params->wdata + 0; - for (int64_t i11 = 0; i11 < ne11; i11++) { - const float * const src = (float *)((char *) src1->data + i11*nb11); - lm_ggml_fp16_t * dst_data = wdata; + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + const lm_ggml_fp16_t * const src = (lm_ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01); + lm_ggml_fp16_t * dst_data = wdata + i01*ne00*ne02; + for (int64_t i00 = 0; i00 < ne00; i00++) { + dst_data[i00*ne02 + i02] = src[i00]; + } + } + } + } - for (int64_t i0 = 0; i0 < ne0; i0++) { - for (int64_t ik = 0; ik < nk; ik++) { - const int idx0 = i0*s0 + ik*d0 - p0; + // permute source data (src1) from (L x Cin) to (Cin x L) + { + lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) params->wdata + nk; + lm_ggml_fp16_t * dst_data = wdata; - if(!(idx0 < 0 || idx0 >= ne10)) { - dst_data[i0*ew0 + i11*nk + ik] = LM_GGML_FP32_TO_FP16(src[idx0]); - } + for (int64_t i11 = 0; i11 < ne11; i11++) { + const float * const src = (float *)((char *) src1->data + i11*nb11); + for (int64_t i10 = 0; i10 < ne10; i10++) { + dst_data[i10*ne11 + i11] = LM_GGML_FP32_TO_FP16(src[i10]); } } } + // need to zero dst since we are accumulating into it + memset(dst->data, 0, lm_ggml_nbytes(dst)); + return; } @@ -11398,8 +11602,10 @@ static void lm_ggml_compute_forward_conv_1d_f16_f32( return; } + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + // total rows in dst - const int nr = ne2; + const int nr = ne1; // rows per thread const int dr = (nr + nth 
- 1)/nth; @@ -11408,22 +11614,26 @@ static void lm_ggml_compute_forward_conv_1d_f16_f32( const int ir0 = dr*ith; const int ir1 = MIN(ir0 + dr, nr); - lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) params->wdata + 0; - - for (int i2 = 0; i2 < ne2; i2++) { - for (int i1 = ir0; i1 < ir1; i1++) { - float * dst_data = (float *)((char *) dst->data + i2*nb2 + i1*nb1); + lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) params->wdata + 0; + lm_ggml_fp16_t * const wdata_src = wdata + nk; - for (int i0 = 0; i0 < ne0; i0++) { - lm_ggml_vec_dot_f16(ew0, dst_data + i0, - (lm_ggml_fp16_t *) ((char *) src0->data + i1*nb02), - (lm_ggml_fp16_t *) wdata + i2*nb2 + i0*ew0); + for (int i1 = ir0; i1 < ir1; i1++) { + float * dst_data = (float *)((char *) dst->data + i1*nb1); + lm_ggml_fp16_t * wdata_kernel = wdata + i1*ne02*ne00; + for (int i10 = 0; i10 < ne10; i10++) { + const int i1n = i10*ne11; + for (int i00 = 0; i00 < ne00; i00++) { + float v = 0; + lm_ggml_vec_dot_f16(ne02, &v, + (lm_ggml_fp16_t *) wdata_src + i1n, + (lm_ggml_fp16_t *) wdata_kernel + i00*ne02); + dst_data[i10*s0 + i00] += v; } } } } -static void lm_ggml_compute_forward_conv_1d_f32( +static void lm_ggml_compute_forward_conv_transpose_1d_f32( const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, const struct lm_ggml_tensor * src1, @@ -11440,13 +11650,7 @@ static void lm_ggml_compute_forward_conv_1d_f32( const int ith = params->ith; const int nth = params->nth; - const int nk = ne00; - - const int ew0 = nk*ne01; - - const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; - const int32_t p0 = ((const int32_t*)(dst->op_params))[1]; - const int32_t d0 = ((const int32_t*)(dst->op_params))[2]; + const int nk = ne00*ne01*ne02; LM_GGML_ASSERT(nb00 == sizeof(float)); LM_GGML_ASSERT(nb10 == sizeof(float)); @@ -11454,23 +11658,37 @@ static void lm_ggml_compute_forward_conv_1d_f32( if (params->type == LM_GGML_TASK_INIT) { memset(params->wdata, 0, params->wsize); - float * const wdata = (float *) params->wdata + 0; + // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout) + { + float * const wdata = (float *) params->wdata + 0; + + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01); + float * dst_data = wdata + i01*ne00*ne02; + for (int64_t i00 = 0; i00 < ne00; i00++) { + dst_data[i00*ne02 + i02] = src[i00]; + } + } + } + } - for (int64_t i11 = 0; i11 < ne11; i11++) { - const float * const src = (float *)((char *) src1->data + i11*nb11); + // prepare source data (src1) + { + float * const wdata = (float *) params->wdata + nk; float * dst_data = wdata; - for (int64_t i0 = 0; i0 < ne0; i0++) { - for (int64_t ik = 0; ik < nk; ik++) { - const int idx0 = i0*s0 + ik*d0 - p0; - - if(!(idx0 < 0 || idx0 >= ne10)) { - dst_data[i0*ew0 + i11*nk + ik] = src[idx0]; - } + for (int64_t i11 = 0; i11 < ne11; i11++) { + const float * const src = (float *)((char *) src1->data + i11*nb11); + for (int64_t i10 = 0; i10 < ne10; i10++) { + dst_data[i10*ne11 + i11] = src[i10]; } } } + // need to zero dst since we are accumulating into it + memset(dst->data, 0, lm_ggml_nbytes(dst)); + return; } @@ -11478,8 +11696,10 @@ static void lm_ggml_compute_forward_conv_1d_f32( return; } + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + // total rows in dst - const int nr = ne02; + const int nr = ne1; // rows per thread const int dr = (nr + nth - 1)/nth; @@ -11488,94 +11708,50 @@ static void 
lm_ggml_compute_forward_conv_1d_f32( const int ir0 = dr*ith; const int ir1 = MIN(ir0 + dr, nr); - float * const wdata = (float *) params->wdata + 0; - - for (int i2 = 0; i2 < ne2; i2++) { - for (int i1 = ir0; i1 < ir1; i1++) { - float * dst_data = (float *)((char *) dst->data + i2*nb2 + i1*nb1); + float * const wdata = (float *) params->wdata + 0; + float * const wdata_src = wdata + nk; - for (int i0 = 0; i0 < ne0; i0++) { - lm_ggml_vec_dot_f32(ew0, dst_data + i0, - (float *) ((char *) src0->data + i1*nb02), - (float *) wdata + i2*nb2 + i0*ew0); + for (int i1 = ir0; i1 < ir1; i1++) { + float * dst_data = (float *)((char *) dst->data + i1*nb1); + float * wdata_kernel = wdata + i1*ne02*ne00; + for (int i10 = 0; i10 < ne10; i10++) { + const int i1n = i10*ne11; + for (int i00 = 0; i00 < ne00; i00++) { + float v = 0; + lm_ggml_vec_dot_f32(ne02, &v, + wdata_src + i1n, + wdata_kernel + i00*ne02); + dst_data[i10*s0 + i00] += v; } } } } -// TODO: reuse lm_ggml_mul_mat or implement lm_ggml_im2col and remove stage_0 and stage_1 -static void gemm_f16_out_f32(int64_t m, int64_t n, int64_t k, - lm_ggml_fp16_t * A, - lm_ggml_fp16_t * B, - float * C, - const int ith, const int nth) { - // does not seem to make a difference - int64_t m0, m1, n0, n1; - // patches per thread - if (m > n) { - n0 = 0; - n1 = n; - - // total patches in dst - const int np = m; - - // patches per thread - const int dp = (np + nth - 1)/nth; - - // patch range for this thread - m0 = dp*ith; - m1 = MIN(m0 + dp, np); - } else { - m0 = 0; - m1 = m; - - // total patches in dst - const int np = n; - - // patches per thread - const int dp = (np + nth - 1)/nth; - - // patch range for this thread - n0 = dp*ith; - n1 = MIN(n0 + dp, np); - } - - // block-tiling attempt - int64_t blck_n = 16; - int64_t blck_m = 16; - - // int64_t CACHE_SIZE = 2 * 1024 * 1024; // 2MB - // int64_t blck_size = CACHE_SIZE / (sizeof(float) + 2 * sizeof(lm_ggml_fp16_t) * K); - // if (blck_size > 0) { - // blck_0 = 4; - // blck_1 = blck_size / blck_0; - // if (blck_1 < 0) { - // blck_1 = 1; - // } - // // blck_0 = (int64_t)sqrt(blck_size); - // // blck_1 = blck_0; - // } - // // printf("%zd %zd %zd %zd\n", blck_size, K, blck_0, blck_1); - - for (int j = n0; j < n1; j+=blck_n) { - for (int i = m0; i < m1; i+=blck_m) { - // printf("i j k => %d %d %d\n", i, j, K); - for (int ii = i; ii < i + blck_m && ii < m1; ii++) { - for (int jj = j; jj < j + blck_n && jj < n1; jj++) { - lm_ggml_vec_dot_f16(k, - C + ii*n + jj, - A + ii * k, - B + jj * k); - } - } - } +static void lm_ggml_compute_forward_conv_transpose_1d( + const struct lm_ggml_compute_params * params, + const struct lm_ggml_tensor * src0, + const struct lm_ggml_tensor * src1, + struct lm_ggml_tensor * dst) { + switch (src0->type) { + case LM_GGML_TYPE_F16: + { + lm_ggml_compute_forward_conv_transpose_1d_f16_f32(params, src0, src1, dst); + } break; + case LM_GGML_TYPE_F32: + { + lm_ggml_compute_forward_conv_transpose_1d_f32(params, src0, src1, dst); + } break; + default: + { + LM_GGML_ASSERT(false); + } break; } } -// src0: kernel [OC, IC, K] -// src1: signal [N, IC, IL] -// dst: result [N, OL, IC*K] -static void lm_ggml_compute_forward_conv_1d_stage_0_f32( +// src0: kernel [OC, IC, KH, KW] +// src1: image [N, IC, IH, IW] +// dst: result [N, OH, OW, IC*KH*KW] +static void lm_ggml_compute_forward_im2col_f16( const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, const struct lm_ggml_tensor * src1, @@ -11589,425 +11765,35 @@ static void lm_ggml_compute_forward_conv_1d_stage_0_f32( 
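    // In effect, for each output position this builds one unrolled row of the
    // receptive field:
    //   dst[in, ioh, iow, iic*KH*KW + ikh*KW + ikw] =
    //       src1[in, iic, ioh*s1 + ikh*d1 - p1, iow*s0 + ikw*d0 - p0]
    // Taps that fall outside the input are written as 0, so the GEMM that consumes
    // this buffer sees zero padding. In the 1D case (is_2D == false) KH, IH and OH
    // all collapse to 1.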
LM_GGML_TENSOR_BINARY_OP_LOCALS; - const int64_t N = ne12; - const int64_t IC = ne11; - const int64_t IL = ne10; - - const int64_t K = ne00; - - const int64_t OL = ne1; + const int32_t s0 = ((const int32_t *)(dst->op_params))[0]; + const int32_t s1 = ((const int32_t *)(dst->op_params))[1]; + const int32_t p0 = ((const int32_t *)(dst->op_params))[2]; + const int32_t p1 = ((const int32_t *)(dst->op_params))[3]; + const int32_t d0 = ((const int32_t *)(dst->op_params))[4]; + const int32_t d1 = ((const int32_t *)(dst->op_params))[5]; + const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1; const int ith = params->ith; const int nth = params->nth; - const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; - const int32_t p0 = ((const int32_t*)(dst->op_params))[1]; - const int32_t d0 = ((const int32_t*)(dst->op_params))[2]; - - LM_GGML_ASSERT(nb00 == sizeof(lm_ggml_fp16_t)); - LM_GGML_ASSERT(nb10 == sizeof(float)); - - if (params->type == LM_GGML_TASK_INIT) { - memset(dst->data, 0, lm_ggml_nbytes(dst)); - return; - } - - if (params->type == LM_GGML_TASK_FINALIZE) { - return; - } - - // im2col: [N, IC, IL] => [N, OL, IC*K] - { - lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) dst->data; - - for (int64_t in = 0; in < N; in++) { - for (int64_t iol = 0; iol < OL; iol++) { - for (int64_t iic = ith; iic < IC; iic+=nth) { - - // micro kernel - lm_ggml_fp16_t * dst_data = wdata + (in*OL + iol)*(IC*K); // [IC, K] - const float * const src_data = (float *)((char *) src1->data + in*nb12 + iic*nb11); // [IL] - - for (int64_t ik = 0; ik < K; ik++) { - const int64_t iil = iol*s0 + ik*d0 - p0; - - if (!(iil < 0 || iil >= IL)) { - dst_data[iic*K + ik] = LM_GGML_FP32_TO_FP16(src_data[iil]); - } - } - } - } - } - } -} - -// gemm: [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K] -// src0: [OC, IC, K] -// src1: [N, OL, IC * K] -// result: [N, OC, OL] -static void lm_ggml_compute_forward_conv_1d_stage_1_f16( - const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, - struct lm_ggml_tensor * dst) { - LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F16); - LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F16); - LM_GGML_ASSERT( dst->type == LM_GGML_TYPE_F32); - - int64_t t0 = lm_ggml_perf_time_us(); - UNUSED(t0); - - if (params->type == LM_GGML_TASK_INIT) { - return; - } - - if (params->type == LM_GGML_TASK_FINALIZE) { - return; - } - - LM_GGML_TENSOR_BINARY_OP_LOCALS; - - LM_GGML_ASSERT(nb00 == sizeof(lm_ggml_fp16_t)); - LM_GGML_ASSERT(nb10 == sizeof(lm_ggml_fp16_t)); - LM_GGML_ASSERT(nb0 == sizeof(float)); - - const int N = ne12; - const int OL = ne11; - - const int OC = ne02; - const int IC = ne01; - const int K = ne00; - - const int ith = params->ith; - const int nth = params->nth; - - int64_t m = OC; - int64_t n = OL; - int64_t k = IC * K; - - // [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K] - for (int i = 0; i < N; i++) { - lm_ggml_fp16_t * A = (lm_ggml_fp16_t *)src0->data; // [m, k] - lm_ggml_fp16_t * B = (lm_ggml_fp16_t *)src1->data + i * m * k; // [n, k] - float * C = (float *)dst->data + i * m * n; // [m, n] - - gemm_f16_out_f32(m, n, k, A, B, C, ith, nth); - } -} - -static void lm_ggml_compute_forward_conv_1d( - const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, - struct lm_ggml_tensor * dst) { - switch(src0->type) { - case LM_GGML_TYPE_F16: - { - lm_ggml_compute_forward_conv_1d_f16_f32(params, src0, src1, dst); - } break; - case LM_GGML_TYPE_F32: - { - 
lm_ggml_compute_forward_conv_1d_f32(params, src0, src1, dst); - } break; - default: - { - LM_GGML_ASSERT(false); - } break; - } -} - -static void lm_ggml_compute_forward_conv_1d_stage_0( - const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, - struct lm_ggml_tensor * dst) { - switch(src0->type) { - case LM_GGML_TYPE_F16: - { - lm_ggml_compute_forward_conv_1d_stage_0_f32(params, src0, src1, dst); - } break; - default: - { - LM_GGML_ASSERT(false); - } break; - } -} - -static void lm_ggml_compute_forward_conv_1d_stage_1( - const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, - struct lm_ggml_tensor * dst) { - switch(src0->type) { - case LM_GGML_TYPE_F16: - { - lm_ggml_compute_forward_conv_1d_stage_1_f16(params, src0, src1, dst); - } break; - default: - { - LM_GGML_ASSERT(false); - } break; - } -} - -// lm_ggml_compute_forward_conv_transpose_1d - -static void lm_ggml_compute_forward_conv_transpose_1d_f16_f32( - const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, - struct lm_ggml_tensor * dst) { - LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F16); - LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F32); - LM_GGML_ASSERT( dst->type == LM_GGML_TYPE_F32); - - int64_t t0 = lm_ggml_perf_time_us(); - UNUSED(t0); - - LM_GGML_TENSOR_BINARY_OP_LOCALS - - const int ith = params->ith; - const int nth = params->nth; - - const int nk = ne00*ne01*ne02; - - LM_GGML_ASSERT(nb00 == sizeof(lm_ggml_fp16_t)); - LM_GGML_ASSERT(nb10 == sizeof(float)); - - if (params->type == LM_GGML_TASK_INIT) { - memset(params->wdata, 0, params->wsize); - - // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout) - { - lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) params->wdata + 0; - - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - const lm_ggml_fp16_t * const src = (lm_ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01); - lm_ggml_fp16_t * dst_data = wdata + i01*ne00*ne02; - for (int64_t i00 = 0; i00 < ne00; i00++) { - dst_data[i00*ne02 + i02] = src[i00]; - } - } - } - } - - // permute source data (src1) from (L x Cin) to (Cin x L) - { - lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) params->wdata + nk; - lm_ggml_fp16_t * dst_data = wdata; - - for (int64_t i11 = 0; i11 < ne11; i11++) { - const float * const src = (float *)((char *) src1->data + i11*nb11); - for (int64_t i10 = 0; i10 < ne10; i10++) { - dst_data[i10*ne11 + i11] = LM_GGML_FP32_TO_FP16(src[i10]); - } - } - } - - // need to zero dst since we are accumulating into it - memset(dst->data, 0, lm_ggml_nbytes(dst)); - - return; - } - - if (params->type == LM_GGML_TASK_FINALIZE) { - return; - } - - const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; - - // total rows in dst - const int nr = ne1; - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) params->wdata + 0; - lm_ggml_fp16_t * const wdata_src = wdata + nk; - - for (int i1 = ir0; i1 < ir1; i1++) { - float * dst_data = (float *)((char *) dst->data + i1*nb1); - lm_ggml_fp16_t * wdata_kernel = wdata + i1*ne02*ne00; - for (int i10 = 0; i10 < ne10; i10++) { - const int i1n = i10*ne11; - for (int i00 = 0; i00 < ne00; i00++) { - float v = 0; - lm_ggml_vec_dot_f16(ne02, &v, - (lm_ggml_fp16_t *) 
wdata_src + i1n, - (lm_ggml_fp16_t *) wdata_kernel + i00*ne02); - dst_data[i10*s0 + i00] += v; - } - } - } -} - -static void lm_ggml_compute_forward_conv_transpose_1d_f32( - const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, - struct lm_ggml_tensor * dst) { - LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F32); - LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F32); - LM_GGML_ASSERT( dst->type == LM_GGML_TYPE_F32); - - int64_t t0 = lm_ggml_perf_time_us(); - UNUSED(t0); - - LM_GGML_TENSOR_BINARY_OP_LOCALS - - const int ith = params->ith; - const int nth = params->nth; - - const int nk = ne00*ne01*ne02; - - LM_GGML_ASSERT(nb00 == sizeof(float)); - LM_GGML_ASSERT(nb10 == sizeof(float)); - - if (params->type == LM_GGML_TASK_INIT) { - memset(params->wdata, 0, params->wsize); - - // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout) - { - float * const wdata = (float *) params->wdata + 0; - - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01); - float * dst_data = wdata + i01*ne00*ne02; - for (int64_t i00 = 0; i00 < ne00; i00++) { - dst_data[i00*ne02 + i02] = src[i00]; - } - } - } - } - - // prepare source data (src1) - { - float * const wdata = (float *) params->wdata + nk; - float * dst_data = wdata; - - for (int64_t i11 = 0; i11 < ne11; i11++) { - const float * const src = (float *)((char *) src1->data + i11*nb11); - for (int64_t i10 = 0; i10 < ne10; i10++) { - dst_data[i10*ne11 + i11] = src[i10]; - } - } - } - - // need to zero dst since we are accumulating into it - memset(dst->data, 0, lm_ggml_nbytes(dst)); - - return; - } - - if (params->type == LM_GGML_TASK_FINALIZE) { - return; - } - - const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; - - // total rows in dst - const int nr = ne1; - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - float * const wdata = (float *) params->wdata + 0; - float * const wdata_src = wdata + nk; - - for (int i1 = ir0; i1 < ir1; i1++) { - float * dst_data = (float *)((char *) dst->data + i1*nb1); - float * wdata_kernel = wdata + i1*ne02*ne00; - for (int i10 = 0; i10 < ne10; i10++) { - const int i1n = i10*ne11; - for (int i00 = 0; i00 < ne00; i00++) { - float v = 0; - lm_ggml_vec_dot_f32(ne02, &v, - wdata_src + i1n, - wdata_kernel + i00*ne02); - dst_data[i10*s0 + i00] += v; - } - } - } -} - -static void lm_ggml_compute_forward_conv_transpose_1d( - const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, - struct lm_ggml_tensor * dst) { - switch (src0->type) { - case LM_GGML_TYPE_F16: - { - lm_ggml_compute_forward_conv_transpose_1d_f16_f32(params, src0, src1, dst); - } break; - case LM_GGML_TYPE_F32: - { - lm_ggml_compute_forward_conv_transpose_1d_f32(params, src0, src1, dst); - } break; - default: - { - LM_GGML_ASSERT(false); - } break; - } -} - -// lm_ggml_compute_forward_conv_2d - -// src0: kernel [OC, IC, KH, KW] -// src1: image [N, IC, IH, IW] -// dst: result [N, OH, OW, IC*KH*KW] -static void lm_ggml_compute_forward_conv_2d_stage_0_f32( - const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, - struct lm_ggml_tensor * dst) { - LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F16); - LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F32); 
- LM_GGML_ASSERT( dst->type == LM_GGML_TYPE_F16); - - int64_t t0 = lm_ggml_perf_time_us(); - UNUSED(t0); - - LM_GGML_TENSOR_BINARY_OP_LOCALS; - - const int64_t N = ne13; - const int64_t IC = ne12; - const int64_t IH = ne11; + const int64_t N = is_2D ? ne13 : ne12; + const int64_t IC = is_2D ? ne12 : ne11; + const int64_t IH = is_2D ? ne11 : 1; const int64_t IW = ne10; - // const int64_t OC = ne03; - // const int64_t IC = ne02; - const int64_t KH = ne01; + const int64_t KH = is_2D ? ne01 : 1; const int64_t KW = ne00; - const int64_t OH = ne2; - const int64_t OW = ne1; - - const int ith = params->ith; - const int nth = params->nth; + const int64_t OH = is_2D ? ne2 : 1; + const int64_t OW = ne1; - const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; - const int32_t s1 = ((const int32_t*)(dst->op_params))[1]; - const int32_t p0 = ((const int32_t*)(dst->op_params))[2]; - const int32_t p1 = ((const int32_t*)(dst->op_params))[3]; - const int32_t d0 = ((const int32_t*)(dst->op_params))[4]; - const int32_t d1 = ((const int32_t*)(dst->op_params))[5]; + int ofs0 = is_2D ? nb13 : nb12; + int ofs1 = is_2D ? nb12 : nb11; LM_GGML_ASSERT(nb00 == sizeof(lm_ggml_fp16_t)); LM_GGML_ASSERT(nb10 == sizeof(float)); if (params->type == LM_GGML_TASK_INIT) { - memset(dst->data, 0, lm_ggml_nbytes(dst)); return; } @@ -12020,20 +11806,22 @@ static void lm_ggml_compute_forward_conv_2d_stage_0_f32( lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) dst->data; for (int64_t in = 0; in < N; in++) { - for (int64_t ioh = 0; ioh < OH; ioh++) { + for (int64_t ioh = 0; ioh < OH; ioh++) { // 1 for (int64_t iow = 0; iow < OW; iow++) { - for (int64_t iic = ith; iic < IC; iic+=nth) { + for (int64_t iic = ith; iic < IC; iic += nth) { // micro kernel lm_ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW] - const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW] + const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW] - for (int64_t ikh = 0; ikh < KH; ikh++) { + for (int64_t ikh = 0; ikh < KH; ikh++) { // 1 for (int64_t ikw = 0; ikw < KW; ikw++) { const int64_t iiw = iow*s0 + ikw*d0 - p0; const int64_t iih = ioh*s1 + ikh*d1 - p1; - if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) { + if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) { + dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0; + } else { dst_data[iic*(KH*KW) + ikh*KW + ikw] = LM_GGML_FP32_TO_FP16(src_data[iih*IW + iiw]); } } @@ -12045,223 +11833,7 @@ static void lm_ggml_compute_forward_conv_2d_stage_0_f32( } } -// gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] -// src0: [OC, IC, KH, KW] -// src1: [N, OH, OW, IC * KH * KW] -// result: [N, OC, OH, OW] -static void lm_ggml_compute_forward_conv_2d_stage_1_f16( - const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, - struct lm_ggml_tensor * dst) { - LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F16); - LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F16); - LM_GGML_ASSERT( dst->type == LM_GGML_TYPE_F32); - - int64_t t0 = lm_ggml_perf_time_us(); - UNUSED(t0); - - if (params->type == LM_GGML_TASK_INIT) { - return; - } - - if (params->type == LM_GGML_TASK_FINALIZE) { - return; - } - - LM_GGML_TENSOR_BINARY_OP_LOCALS; - - LM_GGML_ASSERT(nb00 == sizeof(lm_ggml_fp16_t)); - LM_GGML_ASSERT(nb10 == sizeof(lm_ggml_fp16_t)); - LM_GGML_ASSERT(nb0 == sizeof(float)); - - const int N = ne13; - const int OH = ne12; - const int OW = ne11; - - const int 
OC = ne03; - const int IC = ne02; - const int KH = ne01; - const int KW = ne00; - - const int ith = params->ith; - const int nth = params->nth; - - int64_t m = OC; - int64_t n = OH * OW; - int64_t k = IC * KH * KW; - - // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] - for (int i = 0; i < N; i++) { - lm_ggml_fp16_t * A = (lm_ggml_fp16_t *)src0->data; // [m, k] - lm_ggml_fp16_t * B = (lm_ggml_fp16_t *)src1->data + i * m * k; // [n, k] - float * C = (float *)dst->data + i * m * n; // [m, n] - - gemm_f16_out_f32(m, n, k, A, B, C, ith, nth); - } -} - -static void lm_ggml_compute_forward_conv_2d_f16_f32( - const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, - struct lm_ggml_tensor * dst) { - LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F16); - LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F32); - LM_GGML_ASSERT( dst->type == LM_GGML_TYPE_F32); - - int64_t t0 = lm_ggml_perf_time_us(); - UNUSED(t0); - - LM_GGML_TENSOR_BINARY_OP_LOCALS - - // src1: image [N, IC, IH, IW] - // src0: kernel [OC, IC, KH, KW] - // dst: result [N, OC, OH, OW] - // ne12: IC - // ne0: OW - // ne1: OH - // nk0: KW - // nk1: KH - // ne13: N - - const int N = ne13; - const int IC = ne12; - const int IH = ne11; - const int IW = ne10; - - const int OC = ne03; - // const int IC = ne02; - const int KH = ne01; - const int KW = ne00; - - const int OH = ne1; - const int OW = ne0; - - const int ith = params->ith; - const int nth = params->nth; - - // const int nk0 = ne00; - // const int nk1 = ne01; - - // size of the convolution row - the kernel size unrolled across all channels - // const int ew0 = nk0*nk1*ne02; - // ew0: IC*KH*KW - - const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; - const int32_t s1 = ((const int32_t*)(dst->op_params))[1]; - const int32_t p0 = ((const int32_t*)(dst->op_params))[2]; - const int32_t p1 = ((const int32_t*)(dst->op_params))[3]; - const int32_t d0 = ((const int32_t*)(dst->op_params))[4]; - const int32_t d1 = ((const int32_t*)(dst->op_params))[5]; - - LM_GGML_ASSERT(nb00 == sizeof(lm_ggml_fp16_t)); - LM_GGML_ASSERT(nb10 == sizeof(float)); - - if (params->type == LM_GGML_TASK_INIT) { - memset(params->wdata, 0, params->wsize); - - // prepare source data (src1) - // im2col: [N, IC, IH, IW] => [N*OH*OW, IC*KH*KW] - - { - lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) params->wdata + 0; - - for (int in = 0; in < N; in++) { - for (int iic = 0; iic < IC; iic++) { - for (int ioh = 0; ioh < OH; ioh++) { - for (int iow = 0; iow < OW; iow++) { - - // micro kernel - lm_ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW] - const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW] - - for (int ikh = 0; ikh < KH; ikh++) { - for (int ikw = 0; ikw < KW; ikw++) { - const int iiw = iow*s0 + ikw*d0 - p0; - const int iih = ioh*s1 + ikh*d1 - p1; - - if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) { - dst_data[iic*(KH*KW) + ikh*KW + ikw] = LM_GGML_FP32_TO_FP16(src_data[iih*IW + iiw]); - } - } - } - } - } - } - } - } - - return; - } - - if (params->type == LM_GGML_TASK_FINALIZE) { - return; - } - - lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) params->wdata + 0; - // wdata: [N*OH*OW, IC*KH*KW] - // dst: result [N, OC, OH, OW] - // src0: kernel [OC, IC, KH, KW] - - int64_t m = OC; - int64_t n = OH * OW; - int64_t k = IC * KH * KW; - - // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] - for (int i = 0; i < N; i++) { - lm_ggml_fp16_t * A = 
(lm_ggml_fp16_t *)src0->data; // [m, k] - lm_ggml_fp16_t * B = (lm_ggml_fp16_t *)wdata + i * m * k; // [n, k] - float * C = (float *)dst->data + i * m * n; // [m * k] - - gemm_f16_out_f32(m, n, k, A, B, C, ith, nth); - } -} - -static void lm_ggml_compute_forward_conv_2d( - const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, - struct lm_ggml_tensor * dst) { - switch (src0->type) { - case LM_GGML_TYPE_F16: - { - lm_ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, dst); - } break; - case LM_GGML_TYPE_F32: - { - //lm_ggml_compute_forward_conv_2d_f32(params, src0, src1, dst); - LM_GGML_ASSERT(false); - } break; - default: - { - LM_GGML_ASSERT(false); - } break; - } -} - -static void lm_ggml_compute_forward_conv_2d_stage_0( - const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, - struct lm_ggml_tensor * dst) { - switch (src0->type) { - case LM_GGML_TYPE_F16: - { - lm_ggml_compute_forward_conv_2d_stage_0_f32(params, src0, src1, dst); - } break; - case LM_GGML_TYPE_F32: - { - LM_GGML_ASSERT(false); - } break; - default: - { - LM_GGML_ASSERT(false); - } break; - } -} - -static void lm_ggml_compute_forward_conv_2d_stage_1( +static void lm_ggml_compute_forward_im2col( const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, const struct lm_ggml_tensor * src1, @@ -12269,7 +11841,7 @@ static void lm_ggml_compute_forward_conv_2d_stage_1( switch (src0->type) { case LM_GGML_TYPE_F16: { - lm_ggml_compute_forward_conv_2d_stage_1_f16(params, src0, src1, dst); + lm_ggml_compute_forward_im2col_f16(params, src0, src1, dst); } break; case LM_GGML_TYPE_F32: { @@ -12454,14 +12026,11 @@ static void lm_ggml_compute_forward_pool_1d( lm_ggml_compute_forward_pool_1d_sk_p0(params, op, src0, k0, dst); } -// lm_ggml_compute_forward_pool_2d_sk_p0 +// lm_ggml_compute_forward_pool_2d -static void lm_ggml_compute_forward_pool_2d_sk_p0( +static void lm_ggml_compute_forward_pool_2d( const struct lm_ggml_compute_params * params, - const enum lm_ggml_op_pool op, const struct lm_ggml_tensor * src, - const int k0, - const int k1, struct lm_ggml_tensor * dst) { assert(src->type == LM_GGML_TYPE_F32); assert(params->ith == 0); @@ -12470,6 +12039,14 @@ static void lm_ggml_compute_forward_pool_2d_sk_p0( return; } + const int32_t * opts = (const int32_t *)dst->op_params; + enum lm_ggml_op_pool op = opts[0]; + const int k0 = opts[1]; + const int k1 = opts[2]; + const int s0 = opts[3]; + const int s1 = opts[4]; + const int p0 = opts[5]; + const int p1 = opts[6]; const char * cdata = (const char*)src->data; const char * const data_end = cdata + lm_ggml_nbytes(src); @@ -12480,6 +12057,8 @@ static void lm_ggml_compute_forward_pool_2d_sk_p0( float * dplane = (float *)dst->data; const int ka = k0 * k1; + const int offset0 = -p0; + const int offset1 = -p1; while (cdata < data_end) { for (int oy = 0; oy < py; ++oy) { @@ -12492,13 +12071,15 @@ static void lm_ggml_compute_forward_pool_2d_sk_p0( case LM_GGML_OP_POOL_COUNT: LM_GGML_ASSERT(false); break; } - const int ix = ox * k0; - const int iy = oy * k1; + const int ix = offset0 + ox * s0; + const int iy = offset1 + oy * s1; for (int ky = 0; ky < k1; ++ky) { + if (iy + ky < 0 || iy + ky >= src->ne[1]) continue; const float * const srow = (const float *)(cdata + src->nb[1] * (iy + ky)); for (int kx = 0; kx < k0; ++kx) { int j = ix + kx; + if (j < 0 || j >= src->ne[0]) continue; switch (op) { case LM_GGML_OP_POOL_AVG: *out += 
srow[j]; break; case LM_GGML_OP_POOL_MAX: if (srow[j] > *out) *out = srow[j]; break; @@ -12519,29 +12100,6 @@ static void lm_ggml_compute_forward_pool_2d_sk_p0( } } -// lm_ggml_compute_forward_pool_2d - -static void lm_ggml_compute_forward_pool_2d( - const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - struct lm_ggml_tensor * dst) { - - const int32_t * opts = (const int32_t *)dst->op_params; - enum lm_ggml_op_pool op = opts[0]; - const int k0 = opts[1]; - const int k1 = opts[2]; - const int s0 = opts[3]; - const int s1 = opts[4]; - const int p0 = opts[5]; - const int p1 = opts[6]; - LM_GGML_ASSERT(p0 == 0); - LM_GGML_ASSERT(p1 == 0); // padding not supported - LM_GGML_ASSERT(k0 == s0); - LM_GGML_ASSERT(k1 == s1); // only s = k supported - - lm_ggml_compute_forward_pool_2d_sk_p0(params, op, src0, k0, k1, dst); -} - // lm_ggml_compute_forward_upscale static void lm_ggml_compute_forward_upscale_f32( @@ -12597,6 +12155,67 @@ static void lm_ggml_compute_forward_upscale( } } +// lm_ggml_compute_forward_argsort + +static void lm_ggml_compute_forward_argsort_f32( + const struct lm_ggml_compute_params * params, + const struct lm_ggml_tensor * src0, + struct lm_ggml_tensor * dst) { + + if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { + return; + } + + LM_GGML_TENSOR_UNARY_OP_LOCALS + + LM_GGML_ASSERT(nb0 == sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t nr = lm_ggml_nrows(src0); + + enum lm_ggml_sort_order order = (enum lm_ggml_sort_order) lm_ggml_get_op_params_i32(dst, 0); + + for (int64_t i = ith; i < nr; i += nth) { + int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1); + const float * src_data = (float *)((char *) src0->data + i*nb01); + + for (int64_t j = 0; j < ne0; j++) { + dst_data[j] = j; + } + + // C doesn't have a functional sort, so we do a bubble sort instead + for (int64_t j = 0; j < ne0; j++) { + for (int64_t k = j + 1; k < ne0; k++) { + if ((order == LM_GGML_SORT_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) || + (order == LM_GGML_SORT_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) { + int32_t tmp = dst_data[j]; + dst_data[j] = dst_data[k]; + dst_data[k] = tmp; + } + } + } + } +} + +static void lm_ggml_compute_forward_argsort( + const struct lm_ggml_compute_params * params, + const struct lm_ggml_tensor * src0, + struct lm_ggml_tensor * dst) { + + switch (src0->type) { + case LM_GGML_TYPE_F32: + { + lm_ggml_compute_forward_argsort_f32(params, src0, dst); + } break; + default: + { + LM_GGML_ASSERT(false); + } break; + } +} + // lm_ggml_compute_forward_flash_attn static void lm_ggml_compute_forward_flash_attn_f32( @@ -13743,6 +13362,10 @@ static void lm_ggml_compute_forward_unary( { lm_ggml_compute_forward_silu(params, src0, dst); } break; + case LM_GGML_UNARY_OP_LEAKY: + { + lm_ggml_compute_forward_leaky(params, src0, dst); + } break; default: { LM_GGML_ASSERT(false); @@ -14416,6 +14039,10 @@ static void lm_ggml_compute_forward(struct lm_ggml_compute_params * params, stru { lm_ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor); } break; + case LM_GGML_OP_MUL_MAT_ID: + { + lm_ggml_compute_forward_mul_mat_id(params, tensor); + } break; case LM_GGML_OP_OUT_PROD: { lm_ggml_compute_forward_out_prod(params, tensor->src[0], tensor->src[1], tensor); @@ -14474,7 +14101,7 @@ static void lm_ggml_compute_forward(struct lm_ggml_compute_params * params, stru } break; case LM_GGML_OP_SOFT_MAX: { - 
lm_ggml_compute_forward_soft_max(params, tensor->src[0], tensor); + lm_ggml_compute_forward_soft_max(params, tensor->src[0], tensor->src[1], tensor); } break; case LM_GGML_OP_SOFT_MAX_BACK: { @@ -14496,33 +14123,13 @@ static void lm_ggml_compute_forward(struct lm_ggml_compute_params * params, stru { lm_ggml_compute_forward_clamp(params, tensor->src[0], tensor); } break; - case LM_GGML_OP_CONV_1D: - { - lm_ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor); - } break; - case LM_GGML_OP_CONV_1D_STAGE_0: - { - lm_ggml_compute_forward_conv_1d_stage_0(params, tensor->src[0], tensor->src[1], tensor); - } break; - case LM_GGML_OP_CONV_1D_STAGE_1: - { - lm_ggml_compute_forward_conv_1d_stage_1(params, tensor->src[0], tensor->src[1], tensor); - } break; case LM_GGML_OP_CONV_TRANSPOSE_1D: { lm_ggml_compute_forward_conv_transpose_1d(params, tensor->src[0], tensor->src[1], tensor); } break; - case LM_GGML_OP_CONV_2D: - { - lm_ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor); - } break; - case LM_GGML_OP_CONV_2D_STAGE_0: - { - lm_ggml_compute_forward_conv_2d_stage_0(params, tensor->src[0], tensor->src[1], tensor); - } break; - case LM_GGML_OP_CONV_2D_STAGE_1: + case LM_GGML_OP_IM2COL: { - lm_ggml_compute_forward_conv_2d_stage_1(params, tensor->src[0], tensor->src[1], tensor); + lm_ggml_compute_forward_im2col(params, tensor->src[0], tensor->src[1], tensor); } break; case LM_GGML_OP_CONV_TRANSPOSE_2D: { @@ -14540,6 +14147,10 @@ static void lm_ggml_compute_forward(struct lm_ggml_compute_params * params, stru { lm_ggml_compute_forward_upscale(params, tensor->src[0], tensor); } break; + case LM_GGML_OP_ARGSORT: + { + lm_ggml_compute_forward_argsort(params, tensor->src[0], tensor); + } break; case LM_GGML_OP_FLASH_ATTN: { const int32_t t = lm_ggml_get_op_params_i32(tensor, 0); @@ -14651,62 +14262,109 @@ static void lm_ggml_compute_forward(struct lm_ggml_compute_params * params, stru //////////////////////////////////////////////////////////////////////////////// -static_assert(LM_GGML_GRAPH_HASHTABLE_SIZE > LM_GGML_MAX_NODES * 2, "LM_GGML_GRAPH_HT_SIZE is too small"); +static size_t lm_ggml_hash_size(size_t min_sz) { + // next primes after powers of two + static const size_t primes[] = { + 2, 3, 5, 11, 17, 37, 67, 131, 257, 521, 1031, + 2053, 4099, 8209, 16411, 32771, 65537, 131101, + 262147, 524309, 1048583, 2097169, 4194319, 8388617, + 16777259, 33554467, 67108879, 134217757, 268435459, + 536870923, 1073741827, 2147483659 + }; + static const size_t n_primes = sizeof(primes)/sizeof(primes[0]); + + // find the smallest prime that is larger or equal to min_sz + size_t l = 0; + size_t r = n_primes; + while (l < r) { + size_t m = (l + r)/2; + if (primes[m] < min_sz) { + l = m + 1; + } else { + r = m; + } + } + size_t sz = l < n_primes ? 
primes[l] : min_sz | 1; + return sz; +} -static size_t hash(void * p) { - return (size_t)p % LM_GGML_GRAPH_HASHTABLE_SIZE; +static size_t lm_ggml_hash(const void * p) { + return (size_t)p; } -static size_t hash_find(void * hash_table[], void * p) { - size_t h = hash(p); +size_t lm_ggml_hash_find(const struct lm_ggml_hash_set hash_set, struct lm_ggml_tensor * key) { + size_t h = lm_ggml_hash(key) % hash_set.size; // linear probing size_t i = h; - while (hash_table[i] != NULL && hash_table[i] != p) { - i = (i + 1) % LM_GGML_GRAPH_HASHTABLE_SIZE; + while (hash_set.keys[i] != NULL && hash_set.keys[i] != key) { + i = (i + 1) % hash_set.size; if (i == h) { // visited all hash table entries -> not found - return LM_GGML_GRAPH_HASHTABLE_SIZE; + return LM_GGML_HASHTABLE_FULL; } } return i; } -static bool hash_insert(void * hash_table[], void * p) { - size_t i = hash_find(hash_table, p); +bool lm_ggml_hash_contains(struct lm_ggml_hash_set hash_set, struct lm_ggml_tensor * key) { + size_t i = lm_ggml_hash_find(hash_set, key); + return i != LM_GGML_HASHTABLE_FULL && hash_set.keys[i] == key; +} - LM_GGML_ASSERT(i < LM_GGML_GRAPH_HASHTABLE_SIZE); // assert that not full +size_t lm_ggml_hash_insert(struct lm_ggml_hash_set hash_set, struct lm_ggml_tensor * key) { + size_t i = lm_ggml_hash_find(hash_set, key); - if (hash_table[i] == p) { - return true; + LM_GGML_ASSERT(i != LM_GGML_HASHTABLE_FULL); + + if (hash_set.keys[i] == key) { + return LM_GGML_HASHTABLE_ALREADY_EXISTS; } // insert - LM_GGML_ASSERT(hash_table[i] == NULL); - hash_table[i] = p; - return false; + LM_GGML_ASSERT(hash_set.keys[i] == NULL); + hash_set.keys[i] = key; + return i; +} + +size_t lm_ggml_hash_find_or_insert(struct lm_ggml_hash_set hash_set, struct lm_ggml_tensor * key) { + size_t i = lm_ggml_hash_find(hash_set, key); + + LM_GGML_ASSERT(i != LM_GGML_HASHTABLE_FULL); + + hash_set.keys[i] = key; + return i; +} + +static struct lm_ggml_hash_set lm_ggml_hash_set_new(size_t size) { + size = lm_ggml_hash_size(size); + struct lm_ggml_hash_set result; + result.size = size; + result.keys = malloc(sizeof(struct lm_ggml_tensor *) * size); + memset(result.keys, 0, sizeof(struct lm_ggml_tensor *) * size); + return result; } -static bool hash_contains(void * hash_table[], void * p) { - size_t i = hash_find(hash_table, p); - return (i < LM_GGML_GRAPH_HASHTABLE_SIZE) && (hash_table[i] == p); +static void lm_ggml_hash_set_free(struct lm_ggml_hash_set hash_set) { + free(hash_set.keys); } struct hash_map { - void * keys[LM_GGML_GRAPH_HASHTABLE_SIZE]; - void * vals[LM_GGML_GRAPH_HASHTABLE_SIZE]; + struct lm_ggml_hash_set set; + struct lm_ggml_tensor ** vals; }; -static struct hash_map * new_hash_map(void) { +static struct hash_map * lm_ggml_new_hash_map(size_t size) { struct hash_map * result = malloc(sizeof(struct hash_map)); - for (int i=0; ikeys[i] = NULL; - result->vals[i] = NULL; - } + result->set = lm_ggml_hash_set_new(size); + result->vals = malloc(sizeof(struct lm_ggml_tensor *) * result->set.size); + memset(result->vals, 0, sizeof(struct lm_ggml_tensor *) * result->set.size); return result; } -static void free_hash_map(struct hash_map * map) { +static void lm_ggml_hash_map_free(struct hash_map * map) { + lm_ggml_hash_set_free(map->set); + free(map->vals); free(map); } @@ -14726,7 +14384,7 @@ static struct lm_ggml_tensor * lm_ggml_recompute_graph_node( return node; } - if (!hash_contains(graph->visited_hash_table, node)) { + if (!lm_ggml_hash_contains(graph->visited_hash_table, node)) { return node; } @@ -14741,17 +14399,17 @@ static struct 
lm_ggml_tensor * lm_ggml_recompute_graph_node( return node; } - size_t i = hash_find(replacements->keys, node); - LM_GGML_ASSERT(i < LM_GGML_GRAPH_HASHTABLE_SIZE); // assert that not full - if (replacements->keys[i] == node) { - return (struct lm_ggml_tensor *) replacements->vals[i]; + size_t i = lm_ggml_hash_find(replacements->set, node); + LM_GGML_ASSERT(i != LM_GGML_HASHTABLE_FULL); // assert that not full + if (replacements->set.keys[i] == node) { + return replacements->vals[i]; } struct lm_ggml_tensor * clone = lm_ggml_new_tensor(ctx, node->type, node->n_dims, node->ne); // insert clone into replacements - LM_GGML_ASSERT(replacements->keys[i] == NULL); // assert that we don't overwrite - replacements->keys[i] = node; + LM_GGML_ASSERT(replacements->set.keys[i] == NULL); // assert that we don't overwrite + replacements->set.keys[i] = node; replacements->vals[i] = clone; clone->op = node->op; @@ -14788,26 +14446,26 @@ void lm_ggml_build_backward_gradient_checkpointing( struct lm_ggml_cgraph * gb_tmp, struct lm_ggml_tensor * * checkpoints, int n_checkpoints) { - *gb_tmp = *gf; + lm_ggml_graph_cpy(gf, gb_tmp); lm_ggml_build_backward_expand(ctx, gf, gb_tmp, true); if (n_checkpoints <= 0) { - *gb = *gb_tmp; + lm_ggml_graph_cpy(gb_tmp, gb); return; } - struct hash_map * replacements = new_hash_map(); + struct hash_map * replacements = lm_ggml_new_hash_map(gf->n_nodes + gf->n_leafs + n_checkpoints); // insert checkpoints in replacements for (int i = 0; i < n_checkpoints; ++i) { - size_t k = hash_find(replacements->keys, checkpoints[i]); - LM_GGML_ASSERT(k < LM_GGML_GRAPH_HASHTABLE_SIZE); // assert that not full - LM_GGML_ASSERT(replacements->keys[k] == NULL); // assert that we don't overwrite - replacements->keys[k] = checkpoints[i]; - replacements->vals[k] = checkpoints[i]; + size_t k = lm_ggml_hash_find(replacements->set, checkpoints[i]); + LM_GGML_ASSERT(k != LM_GGML_HASHTABLE_FULL); // assert that not full + LM_GGML_ASSERT(replacements->set.keys[k] == NULL); // assert that we don't overwrite + replacements->set.keys[k] = checkpoints[i]; + replacements->vals[k] = checkpoints[i]; } - *gb = *gf; + lm_ggml_graph_cpy(gf, gb); // rewrite gb_tmp->nodes[gf->n_nodes:gb_tmp->n_nodes], // replacing references to gb_tmp->nodes[0:gf->n_nodes] ( == gf->nodes[0:gf->n_nodes]), // by recomputing them from checkpoints @@ -14824,21 +14482,21 @@ void lm_ggml_build_backward_gradient_checkpointing( lm_ggml_build_forward_expand(gb, node); } - free_hash_map(replacements); + lm_ggml_hash_map_free(replacements); } // functions to change gradients considering the case that input a might be initial gradient with zero value -static struct lm_ggml_tensor * lm_ggml_add_or_set(struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, struct lm_ggml_tensor * b, void * zero_table[]) { - if (hash_contains(zero_table, a)) { +static struct lm_ggml_tensor * lm_ggml_add_or_set(struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, struct lm_ggml_tensor * b, struct lm_ggml_hash_set zero_table) { + if (lm_ggml_hash_contains(zero_table, a)) { return b; } else { return lm_ggml_add_impl(ctx, a, b, false); } } -static struct lm_ggml_tensor * lm_ggml_acc_or_set(struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, struct lm_ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, void * zero_table[]) { - if (hash_contains(zero_table, a)) { +static struct lm_ggml_tensor * lm_ggml_acc_or_set(struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, struct lm_ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t 
offset, struct lm_ggml_hash_set zero_table) { + if (lm_ggml_hash_contains(zero_table, a)) { struct lm_ggml_tensor * a_zero = lm_ggml_scale(ctx, a, lm_ggml_new_f32(ctx, 0)); return lm_ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false); } else { @@ -14846,23 +14504,23 @@ static struct lm_ggml_tensor * lm_ggml_acc_or_set(struct lm_ggml_context * ctx, } } -static struct lm_ggml_tensor * lm_ggml_add1_or_set(struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, struct lm_ggml_tensor * b, void * zero_table[]) { - if (hash_contains(zero_table, a)) { +static struct lm_ggml_tensor * lm_ggml_add1_or_set(struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, struct lm_ggml_tensor * b, struct lm_ggml_hash_set zero_table) { + if (lm_ggml_hash_contains(zero_table, a)) { return lm_ggml_repeat(ctx, b, a); } else { return lm_ggml_add1_impl(ctx, a, b, false); } } -static struct lm_ggml_tensor * lm_ggml_sub_or_set(struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, struct lm_ggml_tensor * b, void * zero_table[]) { - if (hash_contains(zero_table, a)) { +static struct lm_ggml_tensor * lm_ggml_sub_or_set(struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, struct lm_ggml_tensor * b, struct lm_ggml_hash_set zero_table) { + if (lm_ggml_hash_contains(zero_table, a)) { return lm_ggml_neg(ctx, b); } else { return lm_ggml_sub_impl(ctx, a, b, false); } } -static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggml_tensor * tensor, void * zero_table[]) { +static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggml_tensor * tensor, struct lm_ggml_hash_set zero_table) { struct lm_ggml_tensor * src0 = tensor->src[0]; struct lm_ggml_tensor * src1 = tensor->src[1]; @@ -15143,6 +14801,10 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm zero_table); } } break; + case LM_GGML_OP_MUL_MAT_ID: + { + LM_GGML_ASSERT(false); // TODO: not implemented + } break; case LM_GGML_OP_OUT_PROD: { LM_GGML_ASSERT(false); // TODO: not implemented @@ -15449,23 +15111,11 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm zero_table); } } break; - case LM_GGML_OP_ALIBI: - { - LM_GGML_ASSERT(false); // TODO: not implemented - } break; - case LM_GGML_OP_CLAMP: - { - LM_GGML_ASSERT(false); // TODO: not implemented - } break; - case LM_GGML_OP_CONV_1D: - { - LM_GGML_ASSERT(false); // TODO: not implemented - } break; - case LM_GGML_OP_CONV_1D_STAGE_0: + case LM_GGML_OP_ALIBI: { LM_GGML_ASSERT(false); // TODO: not implemented } break; - case LM_GGML_OP_CONV_1D_STAGE_1: + case LM_GGML_OP_CLAMP: { LM_GGML_ASSERT(false); // TODO: not implemented } break; @@ -15473,15 +15123,7 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm { LM_GGML_ASSERT(false); // TODO: not implemented } break; - case LM_GGML_OP_CONV_2D: - { - LM_GGML_ASSERT(false); // TODO: not implemented - } break; - case LM_GGML_OP_CONV_2D_STAGE_0: - { - LM_GGML_ASSERT(false); // TODO: not implemented - } break; - case LM_GGML_OP_CONV_2D_STAGE_1: + case LM_GGML_OP_IM2COL: { LM_GGML_ASSERT(false); // TODO: not implemented } break; @@ -15501,6 +15143,10 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm { LM_GGML_ASSERT(false); // TODO: not implemented } break; + case LM_GGML_OP_ARGSORT: + { + LM_GGML_ASSERT(false); // TODO: not implemented + } break; case LM_GGML_OP_FLASH_ATTN: { struct lm_ggml_tensor * flash_grad = NULL; @@ -15695,7 +15341,7 @@ static void lm_ggml_visit_parents(struct lm_ggml_cgraph 
* cgraph, struct lm_ggml } // check if already visited - if (hash_insert(cgraph->visited_hash_table, node)) { + if (lm_ggml_hash_insert(cgraph->visited_hash_table, node) == LM_GGML_HASHTABLE_ALREADY_EXISTS) { return; } @@ -15711,7 +15357,7 @@ static void lm_ggml_visit_parents(struct lm_ggml_cgraph * cgraph, struct lm_ggml if (node->op == LM_GGML_OP_NONE && node->grad == NULL) { // reached a leaf node, not part of the gradient graph (e.g. a constant) - LM_GGML_ASSERT(cgraph->n_leafs < LM_GGML_MAX_NODES); + LM_GGML_ASSERT(cgraph->n_leafs < cgraph->size); if (strlen(node->name) == 0) { lm_ggml_format_name(node, "leaf_%d", cgraph->n_leafs); @@ -15720,22 +15366,24 @@ static void lm_ggml_visit_parents(struct lm_ggml_cgraph * cgraph, struct lm_ggml cgraph->leafs[cgraph->n_leafs] = node; cgraph->n_leafs++; } else { - LM_GGML_ASSERT(cgraph->n_nodes < LM_GGML_MAX_NODES); + LM_GGML_ASSERT(cgraph->n_nodes < cgraph->size); if (strlen(node->name) == 0) { lm_ggml_format_name(node, "node_%d", cgraph->n_nodes); } cgraph->nodes[cgraph->n_nodes] = node; - cgraph->grads[cgraph->n_nodes] = node->grad; + if (cgraph->grads) { + cgraph->grads[cgraph->n_nodes] = node->grad; + } cgraph->n_nodes++; } } static void lm_ggml_build_forward_impl(struct lm_ggml_cgraph * cgraph, struct lm_ggml_tensor * tensor, bool expand) { if (!expand) { - cgraph->n_nodes = 0; - cgraph->n_leafs = 0; + // TODO: this branch isn't accessible anymore, maybe move this to lm_ggml_build_forward_expand + lm_ggml_graph_clear(cgraph); } const int n0 = cgraph->n_nodes; @@ -15756,25 +15404,6 @@ void lm_ggml_build_forward_expand(struct lm_ggml_cgraph * cgraph, struct lm_ggml lm_ggml_build_forward_impl(cgraph, tensor, true); } -struct lm_ggml_cgraph lm_ggml_build_forward(struct lm_ggml_tensor * tensor) { - struct lm_ggml_cgraph result = { - /*.n_nodes =*/ 0, - /*.n_leafs =*/ 0, - /*.nodes =*/ { NULL }, - /*.grads =*/ { NULL }, - /*.leafs =*/ { NULL }, - /*.hash_table =*/ { NULL }, - /*.order =*/ LM_GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT, - /*.perf_runs =*/ 0, - /*.perf_cycles =*/ 0, - /*.perf_time_us =*/ 0, - }; - - lm_ggml_build_forward_impl(&result, tensor, false); - - return result; -} - void lm_ggml_build_backward_expand(struct lm_ggml_context * ctx, struct lm_ggml_cgraph * gf, struct lm_ggml_cgraph * gb, bool keep) { LM_GGML_ASSERT(gf->n_nodes > 0); @@ -15791,11 +15420,10 @@ void lm_ggml_build_backward_expand(struct lm_ggml_context * ctx, struct lm_ggml_ } // remember original gradients which start with zero values - void ** zero_table = malloc(sizeof(void *) * LM_GGML_GRAPH_HASHTABLE_SIZE); - memset(zero_table, 0, sizeof(void*) * LM_GGML_GRAPH_HASHTABLE_SIZE); + struct lm_ggml_hash_set zero_table = lm_ggml_hash_set_new(gf->size); for (int i = 0; i < gf->n_nodes; i++) { if (gf->grads[i]) { - hash_insert(zero_table, gf->grads[i]); + lm_ggml_hash_insert(zero_table, gf->grads[i]); } } @@ -15818,26 +15446,54 @@ void lm_ggml_build_backward_expand(struct lm_ggml_context * ctx, struct lm_ggml_ } } - free(zero_table); + lm_ggml_hash_set_free(zero_table); } -struct lm_ggml_cgraph lm_ggml_build_backward(struct lm_ggml_context * ctx, struct lm_ggml_cgraph * gf, bool keep) { - struct lm_ggml_cgraph result = *gf; - lm_ggml_build_backward_expand(ctx, gf, &result, keep); - return result; +static size_t lm_ggml_graph_nbytes(size_t size, bool grads) { + size_t nbytes = sizeof(struct lm_ggml_cgraph); + nbytes += size * sizeof(struct lm_ggml_tensor *) * 2; // leafs + nodes + if (grads) { + nbytes += size * sizeof(struct lm_ggml_tensor *); // grads + } + nbytes += 
lm_ggml_hash_size(size * 2) * sizeof(struct lm_ggml_tensor *); // hash set + return nbytes; } -struct lm_ggml_cgraph * lm_ggml_new_graph(struct lm_ggml_context * ctx) { - struct lm_ggml_object * obj = lm_ggml_new_object(ctx, LM_GGML_OBJECT_GRAPH, LM_GGML_GRAPH_SIZE); +size_t lm_ggml_graph_overhead_custom(size_t size, bool grads) { + return LM_GGML_OBJECT_SIZE + LM_GGML_PAD(lm_ggml_graph_nbytes(size, grads), LM_GGML_MEM_ALIGN); +} + +size_t lm_ggml_graph_overhead(void) { + return lm_ggml_graph_overhead_custom(LM_GGML_DEFAULT_GRAPH_SIZE, false); +} + +struct lm_ggml_cgraph * lm_ggml_new_graph_custom(struct lm_ggml_context * ctx, size_t size, bool grads) { + const size_t obj_size = lm_ggml_graph_nbytes(size, grads); + struct lm_ggml_object * obj = lm_ggml_new_object(ctx, LM_GGML_OBJECT_GRAPH, obj_size); struct lm_ggml_cgraph * cgraph = (struct lm_ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs); + struct lm_ggml_tensor ** data_start = (struct lm_ggml_tensor **) (cgraph + 1); + + size_t hash_size = lm_ggml_hash_size(size * 2); + struct lm_ggml_tensor ** nodes_ptr = data_start; + struct lm_ggml_tensor ** leafs_ptr = nodes_ptr + size; + struct lm_ggml_tensor ** hash_keys_ptr = leafs_ptr + size; + struct lm_ggml_tensor ** grads_ptr = grads ? hash_keys_ptr + hash_size : NULL; + + // check that we allocated the correct amount of memory + assert(obj_size == (size_t) ( + (grads ? (char *)(grads_ptr + size) : (char *)(hash_keys_ptr + hash_size)) - (char *)cgraph)); + + memset(hash_keys_ptr, 0, hash_size * sizeof(struct lm_ggml_tensor *)); + *cgraph = (struct lm_ggml_cgraph) { + /*.size =*/ size, /*.n_nodes =*/ 0, /*.n_leafs =*/ 0, - /*.nodes =*/ { NULL }, - /*.grads =*/ { NULL }, - /*.leafs =*/ { NULL }, - /*.hash_table =*/ { NULL }, + /*.nodes =*/ nodes_ptr, + /*.grads =*/ grads_ptr, + /*.leafs =*/ leafs_ptr, + /*.hash_table =*/ { hash_size, hash_keys_ptr }, /*.order =*/ LM_GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT, /*.perf_runs =*/ 0, /*.perf_cycles =*/ 0, @@ -15847,14 +15503,81 @@ struct lm_ggml_cgraph * lm_ggml_new_graph(struct lm_ggml_context * ctx) { return cgraph; } -struct lm_ggml_cgraph * lm_ggml_build_forward_ctx(struct lm_ggml_context * ctx, struct lm_ggml_tensor * tensor) { - struct lm_ggml_cgraph * cgraph = lm_ggml_new_graph(ctx); - lm_ggml_build_forward_impl(cgraph, tensor, false); +struct lm_ggml_cgraph * lm_ggml_new_graph(struct lm_ggml_context * ctx) { + return lm_ggml_new_graph_custom(ctx, LM_GGML_DEFAULT_GRAPH_SIZE, false); +} + +struct lm_ggml_cgraph lm_ggml_graph_view(struct lm_ggml_cgraph * cgraph0, int i0, int i1) { + struct lm_ggml_cgraph cgraph = { + /*.size =*/ 0, + /*.n_nodes =*/ i1 - i0, + /*.n_leafs =*/ 0, + /*.nodes =*/ cgraph0->nodes + i0, + /*.grads =*/ cgraph0->grads ? 
cgraph0->grads + i0 : NULL, + /*.leafs =*/ NULL, + /*.hash_table =*/ { 0, NULL }, + /*.order =*/ cgraph0->order, + /*.perf_runs =*/ 0, + /*.perf_cycles =*/ 0, + /*.perf_time_us =*/ 0, + }; + return cgraph; } -size_t lm_ggml_graph_overhead(void) { - return LM_GGML_OBJECT_SIZE + LM_GGML_PAD(LM_GGML_GRAPH_SIZE, LM_GGML_MEM_ALIGN); +void lm_ggml_graph_cpy(struct lm_ggml_cgraph * src, struct lm_ggml_cgraph * dst) { + LM_GGML_ASSERT(dst->size >= src->n_leafs); + LM_GGML_ASSERT(dst->size >= src->n_nodes); + LM_GGML_ASSERT(dst->visited_hash_table.size >= src->visited_hash_table.size); + + dst->n_leafs = src->n_leafs; + dst->n_nodes = src->n_nodes; + dst->order = src->order; + + for (int i = 0; i < src->n_leafs; ++i) { + dst->leafs[i] = src->leafs[i]; + } + + for (int i = 0; i < src->n_nodes; ++i) { + dst->nodes[i] = src->nodes[i]; + } + + if (src->grads) { + LM_GGML_ASSERT(dst->grads != NULL); + for (int i = 0; i < src->n_nodes; ++i) { + dst->grads[i] = src->grads[i]; + } + } + + for (size_t i = 0; i < src->visited_hash_table.size; ++i) { + if (src->visited_hash_table.keys[i]) { + lm_ggml_hash_insert(dst->visited_hash_table, src->visited_hash_table.keys[i]); + } + } +} + +struct lm_ggml_cgraph * lm_ggml_graph_dup(struct lm_ggml_context * ctx, struct lm_ggml_cgraph * cgraph) { + struct lm_ggml_cgraph * result = lm_ggml_new_graph_custom(ctx, cgraph->size, cgraph->grads != NULL); + lm_ggml_graph_cpy(cgraph, result); + return result; +} + +void lm_ggml_graph_reset(struct lm_ggml_cgraph * cgraph) { + LM_GGML_ASSERT(cgraph->grads != NULL); + + for (int i = 0; i < cgraph->n_nodes; i++) { + struct lm_ggml_tensor * grad = cgraph->grads[i]; + + if (grad) { + lm_ggml_set_zero(grad); + } + } +} + +void lm_ggml_graph_clear(struct lm_ggml_cgraph * cgraph) { + cgraph->n_leafs = 0; + cgraph->n_nodes = 0; + memset(cgraph->visited_hash_table.keys, 0, cgraph->visited_hash_table.size * sizeof(struct lm_ggml_tensor *)); } // @@ -15966,45 +15689,285 @@ static void clear_numa_thread_affinity(void) { strerror(rv)); } - CPU_FREE(cpus); -} -#else -// TODO: Windows etc. -// (the linux implementation may also work on BSD, someone should test) -static void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); } -static void clear_numa_thread_affinity(void) {} -#endif - -struct lm_ggml_compute_state_shared { - const struct lm_ggml_cgraph * cgraph; - const struct lm_ggml_cplan * cplan; - - int64_t perf_node_start_cycles; - int64_t perf_node_start_time_us; - - const int n_threads; - - // synchronization primitives - atomic_int n_active; // num active threads - atomic_int node_n; // active graph node - - bool (*abort_callback)(void * data); // abort lm_ggml_graph_compute when true - void * abort_callback_data; -}; - -struct lm_ggml_compute_state { - lm_ggml_thread_t thrd; - int ith; - struct lm_ggml_compute_state_shared * shared; -}; - -static void lm_ggml_graph_compute_perf_stats_node(struct lm_ggml_tensor * node, const struct lm_ggml_compute_state_shared * st) { - int64_t cycles_cur = lm_ggml_perf_cycles() - st->perf_node_start_cycles; - int64_t time_us_cur = lm_ggml_perf_time_us() - st->perf_node_start_time_us; + CPU_FREE(cpus); +} +#else +// TODO: Windows etc. 
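// (on non-Linux builds both affinity helpers below compile to no-ops)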
+// (the linux implementation may also work on BSD, someone should test) +static void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); } +static void clear_numa_thread_affinity(void) {} +#endif + +struct lm_ggml_compute_state_shared { + const struct lm_ggml_cgraph * cgraph; + const struct lm_ggml_cplan * cplan; + + int64_t perf_node_start_cycles; + int64_t perf_node_start_time_us; + + const int n_threads; + + // synchronization primitives + atomic_int n_active; // num active threads + atomic_int node_n; // active graph node + + bool (*abort_callback)(void * data); // abort lm_ggml_graph_compute when true + void * abort_callback_data; +}; + +struct lm_ggml_compute_state { + lm_ggml_thread_t thrd; + int ith; + struct lm_ggml_compute_state_shared * shared; +}; + +static void lm_ggml_graph_compute_perf_stats_node(struct lm_ggml_tensor * node, const struct lm_ggml_compute_state_shared * st) { + int64_t cycles_cur = lm_ggml_perf_cycles() - st->perf_node_start_cycles; + int64_t time_us_cur = lm_ggml_perf_time_us() - st->perf_node_start_time_us; + + node->perf_runs++; + node->perf_cycles += cycles_cur; + node->perf_time_us += time_us_cur; +} + +static int lm_ggml_get_n_tasks(struct lm_ggml_tensor * node, int n_threads) { + int n_tasks = 0; + + switch (node->op) { + case LM_GGML_OP_CPY: + case LM_GGML_OP_DUP: + case LM_GGML_OP_ADD: + case LM_GGML_OP_ADD1: + case LM_GGML_OP_ACC: + { + n_tasks = n_threads; + } break; + case LM_GGML_OP_SUB: + case LM_GGML_OP_SQR: + case LM_GGML_OP_SQRT: + case LM_GGML_OP_LOG: + case LM_GGML_OP_SUM: + case LM_GGML_OP_SUM_ROWS: + case LM_GGML_OP_MEAN: + case LM_GGML_OP_ARGMAX: + case LM_GGML_OP_REPEAT: + case LM_GGML_OP_REPEAT_BACK: + { + n_tasks = 1; + } break; + case LM_GGML_OP_UNARY: + switch (lm_ggml_get_unary_op(node)) { + case LM_GGML_UNARY_OP_ABS: + case LM_GGML_UNARY_OP_SGN: + case LM_GGML_UNARY_OP_NEG: + case LM_GGML_UNARY_OP_STEP: + case LM_GGML_UNARY_OP_TANH: + case LM_GGML_UNARY_OP_ELU: + case LM_GGML_UNARY_OP_RELU: + case LM_GGML_UNARY_OP_LEAKY: + { + n_tasks = 1; + } break; + + case LM_GGML_UNARY_OP_GELU: + case LM_GGML_UNARY_OP_GELU_QUICK: + case LM_GGML_UNARY_OP_SILU: + { + n_tasks = n_threads; + } break; + default: + LM_GGML_ASSERT(false); + } + break; + case LM_GGML_OP_SILU_BACK: + case LM_GGML_OP_MUL: + case LM_GGML_OP_DIV: + case LM_GGML_OP_NORM: + case LM_GGML_OP_RMS_NORM: + case LM_GGML_OP_RMS_NORM_BACK: + case LM_GGML_OP_GROUP_NORM: + case LM_GGML_OP_CONCAT: + { + n_tasks = n_threads; + } break; + case LM_GGML_OP_MUL_MAT: + { + n_tasks = n_threads; + + // TODO: use different scheduling for different matrix sizes + //const int nr0 = lm_ggml_nrows(node->src[0]); + //const int nr1 = lm_ggml_nrows(node->src[1]); + + //n_tasks = MIN(n_threads, MAX(1, nr0/128)); + //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks); + +#if defined(LM_GGML_USE_CUBLAS) + if (lm_ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) { + n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + } +#elif defined(LM_GGML_USE_CLBLAST) + if (lm_ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) { + n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + } +#endif +#if defined(LM_GGML_USE_ACCELERATE) || defined(LM_GGML_USE_OPENBLAS) + if (lm_ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) { + n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + } +#endif + } break; 
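Note: lm_ggml_get_n_tasks() above takes over the per-node thread counts that lm_ggml_graph_plan() used to record in cplan.n_tasks, so they are now derived on demand from the op type. A minimal caller-side sketch, illustrative only; the helper name and the thread count of 4 are assumptions, and the caller only has to supply the work buffer reported by the plan:

#include <stdint.h>
#include <stdlib.h>
#include "ggml.h"

static int example_compute(struct lm_ggml_cgraph * gf) {
    struct lm_ggml_cplan plan = lm_ggml_graph_plan(gf, /*n_threads =*/ 4);

    uint8_t * work = NULL;
    if (plan.work_size > 0) {
        work = (uint8_t *) malloc(plan.work_size); // caller must provide work_data
        plan.work_data = work;
    }

    const int status = lm_ggml_graph_compute(gf, &plan);

    free(work);
    return status;
}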
+ case LM_GGML_OP_MUL_MAT_ID: + { + // FIXME: blas + n_tasks = n_threads; + } break; + case LM_GGML_OP_OUT_PROD: + { + n_tasks = n_threads; + } break; + case LM_GGML_OP_SCALE: + case LM_GGML_OP_SET: + case LM_GGML_OP_CONT: + case LM_GGML_OP_RESHAPE: + case LM_GGML_OP_VIEW: + case LM_GGML_OP_PERMUTE: + case LM_GGML_OP_TRANSPOSE: + case LM_GGML_OP_GET_ROWS: + case LM_GGML_OP_GET_ROWS_BACK: + case LM_GGML_OP_DIAG: + { + n_tasks = 1; + } break; + case LM_GGML_OP_DIAG_MASK_ZERO: + case LM_GGML_OP_DIAG_MASK_INF: + case LM_GGML_OP_SOFT_MAX_BACK: + case LM_GGML_OP_ROPE: + case LM_GGML_OP_ROPE_BACK: + case LM_GGML_OP_ADD_REL_POS: + { + n_tasks = n_threads; + } break; + case LM_GGML_OP_ALIBI: + { + n_tasks = 1; //TODO + } break; + case LM_GGML_OP_CLAMP: + { + n_tasks = 1; //TODO + } break; + case LM_GGML_OP_SOFT_MAX: + { + n_tasks = MIN(MIN(4, n_threads), lm_ggml_nrows(node->src[0])); + } break; + case LM_GGML_OP_CONV_TRANSPOSE_1D: + { + n_tasks = n_threads; + } break; + case LM_GGML_OP_IM2COL: + { + n_tasks = n_threads; + } break; + case LM_GGML_OP_CONV_TRANSPOSE_2D: + { + n_tasks = n_threads; + } break; + case LM_GGML_OP_POOL_1D: + case LM_GGML_OP_POOL_2D: + { + n_tasks = 1; + } break; + case LM_GGML_OP_UPSCALE: + { + n_tasks = n_threads; + } break; + case LM_GGML_OP_ARGSORT: + { + n_tasks = n_threads; + } break; + case LM_GGML_OP_FLASH_ATTN: + { + n_tasks = n_threads; + } break; + case LM_GGML_OP_FLASH_FF: + { + n_tasks = n_threads; + } break; + case LM_GGML_OP_FLASH_ATTN_BACK: + { + n_tasks = n_threads; + } break; + case LM_GGML_OP_WIN_PART: + case LM_GGML_OP_WIN_UNPART: + case LM_GGML_OP_GET_REL_POS: + case LM_GGML_OP_MAP_UNARY: + case LM_GGML_OP_MAP_BINARY: + case LM_GGML_OP_MAP_CUSTOM1_F32: + case LM_GGML_OP_MAP_CUSTOM2_F32: + case LM_GGML_OP_MAP_CUSTOM3_F32: + { + n_tasks = 1; + } break; + case LM_GGML_OP_MAP_CUSTOM1: + { + struct lm_ggml_map_custom1_op_params * p = (struct lm_ggml_map_custom1_op_params *) node->op_params; + if (p->n_tasks == LM_GGML_N_TASKS_MAX) { + n_tasks = n_threads; + } else { + n_tasks = MIN(p->n_tasks, n_threads); + } + } break; + case LM_GGML_OP_MAP_CUSTOM2: + { + struct lm_ggml_map_custom2_op_params * p = (struct lm_ggml_map_custom2_op_params *) node->op_params; + if (p->n_tasks == LM_GGML_N_TASKS_MAX) { + n_tasks = n_threads; + } else { + n_tasks = MIN(p->n_tasks, n_threads); + } + } break; + case LM_GGML_OP_MAP_CUSTOM3: + { + struct lm_ggml_map_custom3_op_params * p = (struct lm_ggml_map_custom3_op_params *) node->op_params; + if (p->n_tasks == LM_GGML_N_TASKS_MAX) { + n_tasks = n_threads; + } else { + n_tasks = MIN(p->n_tasks, n_threads); + } + } break; + case LM_GGML_OP_CROSS_ENTROPY_LOSS: + { + n_tasks = n_threads; + } break; + case LM_GGML_OP_CROSS_ENTROPY_LOSS_BACK: + { + n_tasks = n_threads; + } break; + case LM_GGML_OP_NONE: + { + n_tasks = 1; + } break; + case LM_GGML_OP_COUNT: + { + LM_GGML_ASSERT(false); + } break; + default: + { + fprintf(stderr, "%s: op not implemented: ", __func__); + if (node->op < LM_GGML_OP_COUNT) { + fprintf(stderr, "%s\n", lm_ggml_op_name(node->op)); + } else { + fprintf(stderr, "%d\n", node->op); + } + LM_GGML_ASSERT(false); + } break; + } + + assert(n_tasks > 0); - node->perf_runs++; - node->perf_cycles += cycles_cur; - node->perf_time_us += time_us_cur; + return n_tasks; } static thread_ret_t lm_ggml_graph_compute_thread(void * data) { @@ -16013,7 +15976,6 @@ static thread_ret_t lm_ggml_graph_compute_thread(void * data) { const struct lm_ggml_cgraph * cgraph = state->shared->cgraph; const struct lm_ggml_cplan * cplan = 
state->shared->cplan; - const int * n_tasks_arr = cplan->n_tasks; const int n_threads = state->shared->n_threads; set_numa_thread_affinity(state->ith, n_threads); @@ -16038,9 +16000,9 @@ static thread_ret_t lm_ggml_graph_compute_thread(void * data) { if (node_n != -1) { /* FINALIZE */ - struct lm_ggml_tensor * node = state->shared->cgraph->nodes[node_n]; + struct lm_ggml_tensor * node = cgraph->nodes[node_n]; if (LM_GGML_OP_HAS_FINALIZE[node->op]) { - params.nth = n_tasks_arr[node_n]; + params.nth = lm_ggml_get_n_tasks(node, n_threads); lm_ggml_compute_forward(¶ms, node); } lm_ggml_graph_compute_perf_stats_node(node, state->shared); @@ -16051,7 +16013,7 @@ static thread_ret_t lm_ggml_graph_compute_thread(void * data) { LM_GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes); struct lm_ggml_tensor * node = cgraph->nodes[node_n]; - const int n_tasks = n_tasks_arr[node_n]; + const int n_tasks = lm_ggml_get_n_tasks(node, n_threads); state->shared->perf_node_start_cycles = lm_ggml_perf_cycles(); state->shared->perf_node_start_time_us = lm_ggml_perf_time_us(); @@ -16109,7 +16071,7 @@ static thread_ret_t lm_ggml_graph_compute_thread(void * data) { /* COMPUTE */ struct lm_ggml_tensor * node = cgraph->nodes[node_n]; - const int n_tasks = n_tasks_arr[node_n]; + const int n_tasks = lm_ggml_get_n_tasks(node, n_threads); struct lm_ggml_compute_params params = { /*.type =*/ LM_GGML_TASK_COMPUTE, @@ -16139,125 +16101,44 @@ struct lm_ggml_cplan lm_ggml_graph_plan(struct lm_ggml_cgraph * cgraph, int n_th // thread scheduling for the different operations + work buffer size estimation for (int i = 0; i < cgraph->n_nodes; i++) { - int n_tasks = 1; - struct lm_ggml_tensor * node = cgraph->nodes[i]; + const int n_tasks = lm_ggml_get_n_tasks(node, n_threads); + + size_t cur = 0; + switch (node->op) { case LM_GGML_OP_CPY: case LM_GGML_OP_DUP: { - n_tasks = n_threads; - - size_t cur = 0; if (lm_ggml_is_quantized(node->type)) { cur = lm_ggml_type_size(LM_GGML_TYPE_F32) * node->ne[0] * n_tasks; } - - work_size = MAX(work_size, cur); } break; case LM_GGML_OP_ADD: case LM_GGML_OP_ADD1: { - n_tasks = n_threads; - - size_t cur = 0; - if (lm_ggml_is_quantized(node->src[0]->type)) { cur = lm_ggml_type_size(LM_GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks; } - - work_size = MAX(work_size, cur); } break; case LM_GGML_OP_ACC: { - n_tasks = n_threads; - - size_t cur = 0; - if (lm_ggml_is_quantized(node->src[0]->type)) { cur = lm_ggml_type_size(LM_GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks; } - - work_size = MAX(work_size, cur); - } break; - case LM_GGML_OP_SUB: - case LM_GGML_OP_DIV: - case LM_GGML_OP_SQR: - case LM_GGML_OP_SQRT: - case LM_GGML_OP_LOG: - case LM_GGML_OP_SUM: - case LM_GGML_OP_SUM_ROWS: - case LM_GGML_OP_MEAN: - case LM_GGML_OP_ARGMAX: - case LM_GGML_OP_REPEAT: - case LM_GGML_OP_REPEAT_BACK: - { - n_tasks = 1; - } break; - - case LM_GGML_OP_UNARY: - { - switch (lm_ggml_get_unary_op(node)) { - case LM_GGML_UNARY_OP_ABS: - case LM_GGML_UNARY_OP_SGN: - case LM_GGML_UNARY_OP_NEG: - case LM_GGML_UNARY_OP_STEP: - case LM_GGML_UNARY_OP_TANH: - case LM_GGML_UNARY_OP_ELU: - case LM_GGML_UNARY_OP_RELU: - { - n_tasks = 1; - } break; - - case LM_GGML_UNARY_OP_GELU: - case LM_GGML_UNARY_OP_GELU_QUICK: - case LM_GGML_UNARY_OP_SILU: - { - n_tasks = n_threads; - } break; - } - } break; - case LM_GGML_OP_SILU_BACK: - case LM_GGML_OP_MUL: - case LM_GGML_OP_NORM: - case LM_GGML_OP_RMS_NORM: - case LM_GGML_OP_RMS_NORM_BACK: - case LM_GGML_OP_GROUP_NORM: - { - n_tasks = n_threads; } break; - case 
LM_GGML_OP_CONCAT: case LM_GGML_OP_MUL_MAT: { - n_tasks = n_threads; - - // TODO: use different scheduling for different matrix sizes - //const int nr0 = lm_ggml_nrows(node->src[0]); - //const int nr1 = lm_ggml_nrows(node->src[1]); - - //n_tasks = MIN(n_threads, MAX(1, nr0/128)); - //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks); - - size_t cur = 0; const enum lm_ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type; -#if defined(LM_GGML_USE_CUBLAS) - if (lm_ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) { - n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning - } else -#elif defined(LM_GGML_USE_CLBLAST) +#if defined(LM_GGML_USE_CLBLAST) if (lm_ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) { - n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning cur = lm_ggml_cl_mul_mat_get_wsize(node->src[0], node->src[1], node); } else #endif #if defined(LM_GGML_USE_ACCELERATE) || defined(LM_GGML_USE_OPENBLAS) if (lm_ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) { - n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning if (node->src[0]->type != LM_GGML_TYPE_F32) { // here we need memory just for single 2D matrix from src0 cur = lm_ggml_type_size(LM_GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]); @@ -16266,108 +16147,37 @@ struct lm_ggml_cplan lm_ggml_graph_plan(struct lm_ggml_cgraph * cgraph, int n_th #endif if (node->src[1]->type != vec_dot_type) { cur = lm_ggml_type_size(vec_dot_type)*lm_ggml_nelements(node->src[1])/lm_ggml_blck_size(vec_dot_type); - } else { - cur = 0; } - - work_size = MAX(work_size, cur); + } break; + case LM_GGML_OP_MUL_MAT_ID: + { + const struct lm_ggml_tensor * a = node->src[2]; + const struct lm_ggml_tensor * b = node->src[1]; + const enum lm_ggml_type vec_dot_type = type_traits[a->type].vec_dot_type; +#if defined(LM_GGML_USE_ACCELERATE) || defined(LM_GGML_USE_OPENBLAS) + if (lm_ggml_compute_forward_mul_mat_use_blas(a, b, node)) { + if (a->type != LM_GGML_TYPE_F32) { + // here we need memory just for single 2D matrix from src0 + cur = lm_ggml_type_size(LM_GGML_TYPE_F32)*(a->ne[0]*a->ne[1]); + } + } else +#endif + if (b->type != vec_dot_type) { + cur = lm_ggml_type_size(vec_dot_type)*lm_ggml_nelements(b)/lm_ggml_blck_size(vec_dot_type); + } } break; case LM_GGML_OP_OUT_PROD: { - n_tasks = n_threads; - - size_t cur = 0; - if (lm_ggml_is_quantized(node->src[0]->type)) { cur = lm_ggml_type_size(LM_GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks; } - - work_size = MAX(work_size, cur); - } break; - case LM_GGML_OP_SCALE: - { - n_tasks = 1; } break; - case LM_GGML_OP_SET: - case LM_GGML_OP_CONT: - case LM_GGML_OP_RESHAPE: - case LM_GGML_OP_VIEW: - case LM_GGML_OP_PERMUTE: - case LM_GGML_OP_TRANSPOSE: - case LM_GGML_OP_GET_ROWS: - case LM_GGML_OP_GET_ROWS_BACK: - case LM_GGML_OP_DIAG: - { - n_tasks = 1; - } break; - case LM_GGML_OP_DIAG_MASK_ZERO: - case LM_GGML_OP_DIAG_MASK_INF: case LM_GGML_OP_SOFT_MAX: - case LM_GGML_OP_SOFT_MAX_BACK: - case LM_GGML_OP_ROPE: - case LM_GGML_OP_ROPE_BACK: - case LM_GGML_OP_ADD_REL_POS: - { - n_tasks = n_threads; - } break; - case LM_GGML_OP_ALIBI: - { - n_tasks = 1; //TODO - } break; - case LM_GGML_OP_CLAMP: - { - n_tasks = 1; //TODO - } break; - case LM_GGML_OP_CONV_1D: - { - n_tasks = n_threads; - - LM_GGML_ASSERT(node->src[0]->ne[3] == 1); - LM_GGML_ASSERT(node->src[1]->ne[2] == 1); - LM_GGML_ASSERT(node->src[1]->ne[3] == 1); - - 
const int64_t ne00 = node->src[0]->ne[0]; - const int64_t ne01 = node->src[0]->ne[1]; - const int64_t ne02 = node->src[0]->ne[2]; - - const int64_t ne10 = node->src[1]->ne[0]; - const int64_t ne11 = node->src[1]->ne[1]; - - const int64_t ne0 = node->ne[0]; - const int64_t ne1 = node->ne[1]; - const int64_t nk = ne00; - const int64_t ew0 = nk * ne01; - - UNUSED(ne02); - UNUSED(ne10); - UNUSED(ne11); - - size_t cur = 0; - - if (node->src[0]->type == LM_GGML_TYPE_F16 && - node->src[1]->type == LM_GGML_TYPE_F32) { - cur = sizeof(lm_ggml_fp16_t)*(ne0*ne1*ew0); - } else if (node->src[0]->type == LM_GGML_TYPE_F32 && - node->src[1]->type == LM_GGML_TYPE_F32) { - cur = sizeof(float)*(ne0*ne1*ew0); - } else { - LM_GGML_ASSERT(false); - } - - work_size = MAX(work_size, cur); - } break; - case LM_GGML_OP_CONV_1D_STAGE_0: - { - n_tasks = n_threads; - } break; - case LM_GGML_OP_CONV_1D_STAGE_1: { - n_tasks = n_threads; + cur = lm_ggml_type_size(LM_GGML_TYPE_F32) * node->ne[0] * n_tasks; } break; case LM_GGML_OP_CONV_TRANSPOSE_1D: { - n_tasks = n_threads; - LM_GGML_ASSERT(node->src[0]->ne[3] == 1); LM_GGML_ASSERT(node->src[1]->ne[2] == 1); LM_GGML_ASSERT(node->src[1]->ne[3] == 1); @@ -16379,7 +16189,6 @@ struct lm_ggml_cplan lm_ggml_graph_plan(struct lm_ggml_cgraph * cgraph, int n_th const int64_t ne10 = node->src[1]->ne[0]; // L const int64_t ne11 = node->src[1]->ne[1]; // Cin - size_t cur = 0; if (node->src[0]->type == LM_GGML_TYPE_F16 && node->src[1]->type == LM_GGML_TYPE_F32) { cur += sizeof(lm_ggml_fp16_t)*ne00*ne01*ne02; @@ -16391,59 +16200,9 @@ struct lm_ggml_cplan lm_ggml_graph_plan(struct lm_ggml_cgraph * cgraph, int n_th } else { LM_GGML_ASSERT(false); } - - work_size = MAX(work_size, cur); - } break; - case LM_GGML_OP_CONV_2D: - { - n_tasks = n_threads; - - const int64_t ne00 = node->src[0]->ne[0]; // W - const int64_t ne01 = node->src[0]->ne[1]; // H - const int64_t ne02 = node->src[0]->ne[2]; // C - const int64_t ne03 = node->src[0]->ne[3]; // N - - const int64_t ne10 = node->src[1]->ne[0]; // W - const int64_t ne11 = node->src[1]->ne[1]; // H - const int64_t ne12 = node->src[1]->ne[2]; // C - - const int64_t ne0 = node->ne[0]; - const int64_t ne1 = node->ne[1]; - const int64_t ne2 = node->ne[2]; - const int64_t ne3 = node->ne[3]; - const int64_t nk = ne00*ne01; - const int64_t ew0 = nk * ne02; - - UNUSED(ne03); - UNUSED(ne2); - - size_t cur = 0; - - if (node->src[0]->type == LM_GGML_TYPE_F16 && - node->src[1]->type == LM_GGML_TYPE_F32) { - // im2col: [N*OH*OW, IC*KH*KW] - cur = sizeof(lm_ggml_fp16_t)*(ne3*ne0*ne1*ew0); - } else if (node->src[0]->type == LM_GGML_TYPE_F32 && - node->src[1]->type == LM_GGML_TYPE_F32) { - cur = sizeof(float)* (ne10*ne11*ne12); - } else { - LM_GGML_ASSERT(false); - } - - work_size = MAX(work_size, cur); - } break; - case LM_GGML_OP_CONV_2D_STAGE_0: - { - n_tasks = n_threads; - } break; - case LM_GGML_OP_CONV_2D_STAGE_1: - { - n_tasks = n_threads; } break; case LM_GGML_OP_CONV_TRANSPOSE_2D: { - n_tasks = n_threads; - const int64_t ne00 = node->src[0]->ne[0]; // W const int64_t ne01 = node->src[0]->ne[1]; // H const int64_t ne02 = node->src[0]->ne[2]; // Channels Out @@ -16453,141 +16212,58 @@ struct lm_ggml_cplan lm_ggml_graph_plan(struct lm_ggml_cgraph * cgraph, int n_th const int64_t ne11 = node->src[1]->ne[1]; // H const int64_t ne12 = node->src[1]->ne[2]; // Channels In - size_t cur = 0; cur += sizeof(lm_ggml_fp16_t)*ne00*ne01*ne02*ne03; cur += sizeof(lm_ggml_fp16_t)*ne10*ne11*ne12; - - work_size = MAX(work_size, cur); - } break; - case LM_GGML_OP_POOL_1D: - 
case LM_GGML_OP_POOL_2D: - { - n_tasks = 1; - } break; - case LM_GGML_OP_UPSCALE: - { - n_tasks = n_threads; } break; case LM_GGML_OP_FLASH_ATTN: { - n_tasks = n_threads; - - size_t cur = 0; - const int64_t ne11 = lm_ggml_up(node->src[1]->ne[1], LM_GGML_SOFT_MAX_UNROLL); if (node->src[1]->type == LM_GGML_TYPE_F32) { cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1) cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 - } - - if (node->src[1]->type == LM_GGML_TYPE_F16) { + } else if (node->src[1]->type == LM_GGML_TYPE_F16) { cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1) cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 } - - work_size = MAX(work_size, cur); } break; case LM_GGML_OP_FLASH_FF: { - n_tasks = n_threads; - - size_t cur = 0; - if (node->src[1]->type == LM_GGML_TYPE_F32) { cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2 - } - - if (node->src[1]->type == LM_GGML_TYPE_F16) { + } else if (node->src[1]->type == LM_GGML_TYPE_F16) { cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2 } - - work_size = MAX(work_size, cur); } break; case LM_GGML_OP_FLASH_ATTN_BACK: { - n_tasks = n_threads; - - size_t cur = 0; - const int64_t D = node->src[0]->ne[0]; const int64_t ne11 = lm_ggml_up(node->src[1]->ne[1], LM_GGML_SOFT_MAX_UNROLL); const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in lm_ggml_compute_forward_flash_attn_back if (node->src[1]->type == LM_GGML_TYPE_F32) { cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 - } - - if (node->src[1]->type == LM_GGML_TYPE_F16) { + } else if (node->src[1]->type == LM_GGML_TYPE_F16) { cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 } - - work_size = MAX(work_size, cur); - } break; - case LM_GGML_OP_WIN_PART: - case LM_GGML_OP_WIN_UNPART: - case LM_GGML_OP_GET_REL_POS: - case LM_GGML_OP_MAP_UNARY: - case LM_GGML_OP_MAP_BINARY: - case LM_GGML_OP_MAP_CUSTOM1_F32: - case LM_GGML_OP_MAP_CUSTOM2_F32: - case LM_GGML_OP_MAP_CUSTOM3_F32: - { - n_tasks = 1; - } break; - case LM_GGML_OP_MAP_CUSTOM1: - { - struct lm_ggml_map_custom1_op_params * p = (struct lm_ggml_map_custom1_op_params *) node->op_params; - if (p->n_tasks == LM_GGML_N_TASKS_MAX) { - n_tasks = n_threads; - } else { - n_tasks = MIN(p->n_tasks, n_threads); - } - } break; - case LM_GGML_OP_MAP_CUSTOM2: - { - struct lm_ggml_map_custom2_op_params * p = (struct lm_ggml_map_custom2_op_params *) node->op_params; - if (p->n_tasks == LM_GGML_N_TASKS_MAX) { - n_tasks = n_threads; - } else { - n_tasks = MIN(p->n_tasks, n_threads); - } - } break; - case LM_GGML_OP_MAP_CUSTOM3: - { - struct lm_ggml_map_custom3_op_params * p = (struct lm_ggml_map_custom3_op_params *) node->op_params; - if (p->n_tasks == LM_GGML_N_TASKS_MAX) { - n_tasks = n_threads; - } else { - n_tasks = MIN(p->n_tasks, n_threads); - } } break; - case LM_GGML_OP_CROSS_ENTROPY_LOSS: - { - n_tasks = n_threads; - size_t cur = lm_ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks); - - work_size = MAX(work_size, cur); - } break; - case LM_GGML_OP_CROSS_ENTROPY_LOSS_BACK: - { - n_tasks = n_threads; - } break; - case LM_GGML_OP_NONE: + case 
LM_GGML_OP_CROSS_ENTROPY_LOSS: { - n_tasks = 1; + cur = lm_ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks); } break; case LM_GGML_OP_COUNT: { LM_GGML_ASSERT(false); } break; + default: + break; } - cplan.n_tasks[i] = n_tasks; + work_size = MAX(work_size, cur); } if (work_size > 0) { @@ -16609,12 +16285,6 @@ int lm_ggml_graph_compute(struct lm_ggml_cgraph * cgraph, struct lm_ggml_cplan * if (cplan->work_size > 0) { LM_GGML_ASSERT(cplan->work_data); } - - for (int i = 0; i < cgraph->n_nodes; ++i) { - if (cgraph->nodes[i]->op != LM_GGML_OP_NONE) { - LM_GGML_ASSERT(cplan->n_tasks[i] > 0); - } - } } const int n_threads = cplan->n_threads; @@ -16687,16 +16357,6 @@ int lm_ggml_graph_compute(struct lm_ggml_cgraph * cgraph, struct lm_ggml_cplan * return compute_status; } -void lm_ggml_graph_reset(struct lm_ggml_cgraph * cgraph) { - for (int i = 0; i < cgraph->n_nodes; i++) { - struct lm_ggml_tensor * grad = cgraph->grads[i]; - - if (grad) { - lm_ggml_set_zero(grad); - } - } -} - void lm_ggml_graph_compute_with_ctx(struct lm_ggml_context * ctx, struct lm_ggml_cgraph * cgraph, int n_threads) { struct lm_ggml_cplan cplan = lm_ggml_graph_plan(cgraph, n_threads); @@ -16823,12 +16483,12 @@ void lm_ggml_graph_export(const struct lm_ggml_cgraph * cgraph, const char * fna const uint32_t magic = LM_GGML_FILE_MAGIC; const uint32_t version = LM_GGML_FILE_VERSION; const uint32_t n_leafs = cgraph->n_leafs; - const uint32_t nodes = cgraph->n_nodes; + const uint32_t n_nodes = cgraph->n_nodes; fwrite(&magic, sizeof(uint32_t), 1, fout); fwrite(&version, sizeof(uint32_t), 1, fout); fwrite(&n_leafs, sizeof(uint32_t), 1, fout); - fwrite(&nodes, sizeof(uint32_t), 1, fout); + fwrite(&n_nodes, sizeof(uint32_t), 1, fout); fwrite(&size_eval, sizeof(uint64_t), 1, fout); } @@ -16916,7 +16576,7 @@ void lm_ggml_graph_export(const struct lm_ggml_cgraph * cgraph, const char * fna if (idx == -1) { for (int k = 0; k < cgraph->n_nodes; ++k) { if (args[j] == cgraph->nodes[k]) { - idx = LM_GGML_MAX_NODES + k; + idx = cgraph->n_leafs + k; break; } } @@ -16943,11 +16603,11 @@ void lm_ggml_graph_export(const struct lm_ggml_cgraph * cgraph, const char * fna } } -struct lm_ggml_cgraph lm_ggml_graph_import(const char * fname, struct lm_ggml_context ** ctx_data, struct lm_ggml_context ** ctx_eval) { +struct lm_ggml_cgraph * lm_ggml_graph_import(const char * fname, struct lm_ggml_context ** ctx_data, struct lm_ggml_context ** ctx_eval) { assert(*ctx_data == NULL); assert(*ctx_eval == NULL); - struct lm_ggml_cgraph result = { 0 }; + struct lm_ggml_cgraph * result = NULL; struct lm_ggml_tensor * data = NULL; @@ -17019,13 +16679,11 @@ struct lm_ggml_cgraph lm_ggml_graph_import(const char * fname, struct lm_ggml_co const uint32_t n_leafs = *(const uint32_t *) ptr; ptr += sizeof(n_leafs); const uint32_t n_nodes = *(const uint32_t *) ptr; ptr += sizeof(n_nodes); const uint64_t size_eval = *(const uint64_t *) ptr; ptr += sizeof(size_eval); - - result.n_leafs = n_leafs; - result.n_nodes = n_nodes; + const int graph_size = MAX(n_leafs, n_nodes); // create the data context { - const size_t overhead = (n_leafs + n_nodes)*lm_ggml_tensor_overhead(); + const size_t overhead = (n_leafs + n_nodes)*lm_ggml_tensor_overhead() + lm_ggml_graph_overhead_custom(graph_size, false); struct lm_ggml_init_params params = { .mem_size = size_eval + overhead, @@ -17041,6 +16699,12 @@ struct lm_ggml_cgraph lm_ggml_graph_import(const char * fname, struct lm_ggml_co } } + result = lm_ggml_new_graph_custom(*ctx_eval, graph_size, false); + + result->n_leafs = 
n_leafs; + result->n_nodes = n_nodes; + + // leafs { uint32_t type; @@ -17079,7 +16743,7 @@ struct lm_ggml_cgraph lm_ggml_graph_import(const char * fname, struct lm_ggml_co tensor->nb[j] = nb[j]; } - result.leafs[i] = tensor; + result->leafs[i] = tensor; ptr += lm_ggml_nbytes(tensor); @@ -17131,10 +16795,10 @@ struct lm_ggml_cgraph lm_ggml_graph_import(const char * fname, struct lm_ggml_co continue; } - if (arg_idx < LM_GGML_MAX_NODES) { - args[j] = result.leafs[arg_idx]; + if (arg_idx < result->n_leafs) { + args[j] = result->leafs[arg_idx]; } else { - args[j] = result.nodes[arg_idx - LM_GGML_MAX_NODES]; + args[j] = result->nodes[arg_idx - result->n_leafs]; } } @@ -17186,7 +16850,7 @@ struct lm_ggml_cgraph lm_ggml_graph_import(const char * fname, struct lm_ggml_co tensor->src[j] = args[j]; } - result.nodes[i] = tensor; + result->nodes[i] = tensor; fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, lm_ggml_nbytes(tensor)); } @@ -18091,10 +17755,11 @@ struct lm_ggml_opt_params lm_ggml_opt_default_params(enum lm_ggml_opt_type type) case LM_GGML_OPT_ADAM: { result = (struct lm_ggml_opt_params) { - .type = LM_GGML_OPT_ADAM, - .n_threads = 1, - .past = 0, - .delta = 1e-5f, + .type = LM_GGML_OPT_ADAM, + .graph_size = LM_GGML_DEFAULT_GRAPH_SIZE, + .n_threads = 1, // FIXME: LM_GGML_DEFAULT_N_THREADS ? + .past = 0, + .delta = 1e-5f, .max_no_improvement = 100, @@ -18121,10 +17786,11 @@ struct lm_ggml_opt_params lm_ggml_opt_default_params(enum lm_ggml_opt_type type) case LM_GGML_OPT_LBFGS: { result = (struct lm_ggml_opt_params) { - .type = LM_GGML_OPT_LBFGS, - .n_threads = 1, - .past = 0, - .delta = 1e-5f, + .type = LM_GGML_OPT_LBFGS, + .graph_size = LM_GGML_DEFAULT_GRAPH_SIZE, + .n_threads = 1, + .past = 0, + .delta = 1e-5f, .max_no_improvement = 0, @@ -18266,14 +17932,11 @@ enum lm_ggml_opt_result lm_ggml_opt_resume( struct lm_ggml_tensor * f) { // build forward + backward compute graphs - struct lm_ggml_tensor * gfbuf = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_I32, sizeof(struct lm_ggml_cgraph) / lm_ggml_type_size(LM_GGML_TYPE_I32)+ (sizeof(struct lm_ggml_cgraph) % lm_ggml_type_size(LM_GGML_TYPE_I32) ? 1 : 0)); - struct lm_ggml_tensor * gbbuf = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_I32, sizeof(struct lm_ggml_cgraph) / lm_ggml_type_size(LM_GGML_TYPE_I32)+ (sizeof(struct lm_ggml_cgraph) % lm_ggml_type_size(LM_GGML_TYPE_I32) ? 
1 : 0)); - - struct lm_ggml_cgraph * gf = (struct lm_ggml_cgraph *) gfbuf->data; - struct lm_ggml_cgraph * gb = (struct lm_ggml_cgraph *) gbbuf->data; + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx, opt->params.graph_size, true); + lm_ggml_build_forward_expand(gf, f); - *gf = lm_ggml_build_forward (f); - *gb = lm_ggml_build_backward(ctx, gf, true); + struct lm_ggml_cgraph * gb = lm_ggml_graph_dup(ctx, gf); + lm_ggml_build_backward_expand(ctx, gf, gb, true); return lm_ggml_opt_resume_g(ctx, opt, f, gf, gb, NULL, NULL); } @@ -18376,8 +18039,8 @@ size_t lm_ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_ memcpy(&qh, &y[i].qh, sizeof(qh)); for (int j = 0; j < QK5_0; j += 2) { - const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; - const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12)); + const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4; + const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12)); // cast to 16 bins const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2; @@ -18406,8 +18069,8 @@ size_t lm_ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_ memcpy(&qh, &y[i].qh, sizeof(qh)); for (int j = 0; j < QK5_1; j += 2) { - const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; - const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12)); + const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4; + const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12)); // cast to 16 bins const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2; @@ -18597,6 +18260,7 @@ struct lm_gguf_kv { struct lm_gguf_header { char magic[4]; + uint32_t version; uint64_t n_tensors; // GGUFv2 uint64_t n_kv; // GGUFv2 @@ -18686,7 +18350,7 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg for (uint32_t i = 0; i < sizeof(magic); i++) { if (magic[i] != LM_GGUF_MAGIC[i]) { - fprintf(stderr, "%s: invalid magic characters %s.\n", __func__, magic); + fprintf(stderr, "%s: invalid magic characters '%c%c%c%c'\n", __func__, magic[0], magic[1], magic[2], magic[3]); fclose(file); return NULL; } @@ -18701,7 +18365,6 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg { strncpy(ctx->header.magic, magic, 4); - ctx->kv = NULL; ctx->infos = NULL; ctx->data = NULL; @@ -18729,7 +18392,7 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg { ctx->kv = malloc(ctx->header.n_kv * sizeof(struct lm_gguf_kv)); - for (uint32_t i = 0; i < ctx->header.n_kv; ++i) { + for (uint64_t i = 0; i < ctx->header.n_kv; ++i) { struct lm_gguf_kv * kv = &ctx->kv[i]; //fprintf(stderr, "%s: reading kv %d\n", __func__, i); @@ -18776,7 +18439,7 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg case LM_GGUF_TYPE_STRING: { kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct lm_gguf_str)); - for (uint32_t j = 0; j < kv->value.arr.n; ++j) { + for (uint64_t j = 0; j < kv->value.arr.n; ++j) { ok = ok && lm_gguf_fread_str(file, &((struct lm_gguf_str *) kv->value.arr.data)[j], &offset); } } break; @@ -18804,7 +18467,7 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg { ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct lm_gguf_tensor_info)); - for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) { struct lm_gguf_tensor_info * info = &ctx->infos[i]; for (int j = 0; j < LM_GGML_MAX_DIMS; ++j) { @@ -18851,7 +18514,7 @@ struct 
lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg // compute the total size of the data section, taking into account the alignment { ctx->size = 0; - for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) { struct lm_gguf_tensor_info * info = &ctx->infos[i]; const int64_t ne = @@ -18920,7 +18583,7 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg lm_ggml_set_no_alloc(ctx_data, true); // create the tensors - for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) { const int64_t ne[LM_GGML_MAX_DIMS] = { ctx->infos[i].ne[0], ctx->infos[i].ne[1], @@ -19055,24 +18718,29 @@ int lm_gguf_find_key(const struct lm_gguf_context * ctx, const char * key) { } const char * lm_gguf_get_key(const struct lm_gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(key_id >= 0 && key_id < lm_gguf_get_n_kv(ctx)); return ctx->kv[key_id].key.data; } enum lm_gguf_type lm_gguf_get_kv_type(const struct lm_gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(key_id >= 0 && key_id < lm_gguf_get_n_kv(ctx)); return ctx->kv[key_id].type; } enum lm_gguf_type lm_gguf_get_arr_type(const struct lm_gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(key_id >= 0 && key_id < lm_gguf_get_n_kv(ctx)); LM_GGML_ASSERT(ctx->kv[key_id].type == LM_GGUF_TYPE_ARRAY); return ctx->kv[key_id].value.arr.type; } const void * lm_gguf_get_arr_data(const struct lm_gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(key_id >= 0 && key_id < lm_gguf_get_n_kv(ctx)); LM_GGML_ASSERT(ctx->kv[key_id].type == LM_GGUF_TYPE_ARRAY); return ctx->kv[key_id].value.arr.data; } const char * lm_gguf_get_arr_str(const struct lm_gguf_context * ctx, int key_id, int i) { + LM_GGML_ASSERT(key_id >= 0 && key_id < lm_gguf_get_n_kv(ctx)); LM_GGML_ASSERT(ctx->kv[key_id].type == LM_GGUF_TYPE_ARRAY); struct lm_gguf_kv * kv = &ctx->kv[key_id]; struct lm_gguf_str * str = &((struct lm_gguf_str *) kv->value.arr.data)[i]; @@ -19080,70 +18748,90 @@ const char * lm_gguf_get_arr_str(const struct lm_gguf_context * ctx, int key_id, } int lm_gguf_get_arr_n(const struct lm_gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(key_id >= 0 && key_id < lm_gguf_get_n_kv(ctx)); LM_GGML_ASSERT(ctx->kv[key_id].type == LM_GGUF_TYPE_ARRAY); return ctx->kv[key_id].value.arr.n; } uint8_t lm_gguf_get_val_u8(const struct lm_gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(key_id >= 0 && key_id < lm_gguf_get_n_kv(ctx)); LM_GGML_ASSERT(ctx->kv[key_id].type == LM_GGUF_TYPE_UINT8); return ctx->kv[key_id].value.uint8; } int8_t lm_gguf_get_val_i8(const struct lm_gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(key_id >= 0 && key_id < lm_gguf_get_n_kv(ctx)); LM_GGML_ASSERT(ctx->kv[key_id].type == LM_GGUF_TYPE_INT8); return ctx->kv[key_id].value.int8; } uint16_t lm_gguf_get_val_u16(const struct lm_gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(key_id >= 0 && key_id < lm_gguf_get_n_kv(ctx)); LM_GGML_ASSERT(ctx->kv[key_id].type == LM_GGUF_TYPE_UINT16); return ctx->kv[key_id].value.uint16; } int16_t lm_gguf_get_val_i16(const struct lm_gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(key_id >= 0 && key_id < lm_gguf_get_n_kv(ctx)); LM_GGML_ASSERT(ctx->kv[key_id].type == LM_GGUF_TYPE_INT16); return ctx->kv[key_id].value.int16; } uint32_t lm_gguf_get_val_u32(const struct lm_gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(key_id >= 0 && key_id < lm_gguf_get_n_kv(ctx)); LM_GGML_ASSERT(ctx->kv[key_id].type == LM_GGUF_TYPE_UINT32); return 
ctx->kv[key_id].value.uint32; } int32_t lm_gguf_get_val_i32(const struct lm_gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(key_id >= 0 && key_id < lm_gguf_get_n_kv(ctx)); LM_GGML_ASSERT(ctx->kv[key_id].type == LM_GGUF_TYPE_INT32); return ctx->kv[key_id].value.int32; } float lm_gguf_get_val_f32(const struct lm_gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(key_id >= 0 && key_id < lm_gguf_get_n_kv(ctx)); LM_GGML_ASSERT(ctx->kv[key_id].type == LM_GGUF_TYPE_FLOAT32); return ctx->kv[key_id].value.float32; } uint64_t lm_gguf_get_val_u64(const struct lm_gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(key_id >= 0 && key_id < lm_gguf_get_n_kv(ctx)); LM_GGML_ASSERT(ctx->kv[key_id].type == LM_GGUF_TYPE_UINT64); return ctx->kv[key_id].value.uint64; } int64_t lm_gguf_get_val_i64(const struct lm_gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(key_id >= 0 && key_id < lm_gguf_get_n_kv(ctx)); LM_GGML_ASSERT(ctx->kv[key_id].type == LM_GGUF_TYPE_INT64); return ctx->kv[key_id].value.int64; } double lm_gguf_get_val_f64(const struct lm_gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(key_id >= 0 && key_id < lm_gguf_get_n_kv(ctx)); LM_GGML_ASSERT(ctx->kv[key_id].type == LM_GGUF_TYPE_FLOAT64); return ctx->kv[key_id].value.float64; } bool lm_gguf_get_val_bool(const struct lm_gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(key_id >= 0 && key_id < lm_gguf_get_n_kv(ctx)); LM_GGML_ASSERT(ctx->kv[key_id].type == LM_GGUF_TYPE_BOOL); return ctx->kv[key_id].value.bool_; } const char * lm_gguf_get_val_str(const struct lm_gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(key_id >= 0 && key_id < lm_gguf_get_n_kv(ctx)); LM_GGML_ASSERT(ctx->kv[key_id].type == LM_GGUF_TYPE_STRING); return ctx->kv[key_id].value.str.data; } +const void * lm_gguf_get_val_data(const struct lm_gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(key_id >= 0 && key_id < lm_gguf_get_n_kv(ctx)); + LM_GGML_ASSERT(ctx->kv[key_id].type != LM_GGUF_TYPE_ARRAY); + LM_GGML_ASSERT(ctx->kv[key_id].type != LM_GGUF_TYPE_STRING); + return &ctx->kv[key_id].value; +} + int lm_gguf_get_n_tensors(const struct lm_gguf_context * ctx) { return ctx->header.n_tensors; } diff --git a/cpp/ggml.h b/cpp/ggml.h index 3cbd5d8..2000b8f 100644 --- a/cpp/ggml.h +++ b/cpp/ggml.h @@ -58,7 +58,8 @@ // { // ... 
// -// struct lm_ggml_cgraph gf = lm_ggml_build_forward(f); +// struct lm_ggml_cgraph * gf = lm_ggml_new_graph(ctx); +// lm_ggml_build_forward_expand(gf, f); // // // set the input variable and parameter values // lm_ggml_set_f32(x, 2.0f); @@ -213,15 +214,14 @@ #define LM_GGML_QNT_VERSION 2 // bump this on quantization format changes #define LM_GGML_QNT_VERSION_FACTOR 1000 // do not change this -#define LM_GGML_MAX_DIMS 4 -#define LM_GGML_MAX_NODES 16384 -#define LM_GGML_MAX_PARAMS 1024 -#define LM_GGML_MAX_CONTEXTS 64 -#define LM_GGML_MAX_SRC 6 -#define LM_GGML_MAX_NAME 64 -#define LM_GGML_MAX_OP_PARAMS 64 -#define LM_GGML_DEFAULT_N_THREADS 4 - +#define LM_GGML_MAX_DIMS 4 +#define LM_GGML_MAX_PARAMS 1024 +#define LM_GGML_MAX_CONTEXTS 64 +#define LM_GGML_MAX_SRC 6 +#define LM_GGML_MAX_NAME 64 +#define LM_GGML_MAX_OP_PARAMS 64 +#define LM_GGML_DEFAULT_N_THREADS 4 +#define LM_GGML_DEFAULT_GRAPH_SIZE 2048 #if UINTPTR_MAX == 0xFFFFFFFF #define LM_GGML_MEM_ALIGN 4 #else @@ -244,7 +244,9 @@ #define LM_GGML_ASSERT(x) \ do { \ if (!(x)) { \ + fflush(stdout); \ fprintf(stderr, "LM_GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \ + lm_ggml_print_backtrace(); \ abort(); \ } \ } while (0) @@ -281,6 +283,20 @@ const type prefix##3 = (pointer)->array[3]; \ LM_GGML_UNUSED(prefix##3); +#define LM_GGML_TENSOR_UNARY_OP_LOCALS \ + LM_GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ + LM_GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ + LM_GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \ + LM_GGML_TENSOR_LOCALS(size_t, nb, dst, nb) + +#define LM_GGML_TENSOR_BINARY_OP_LOCALS \ + LM_GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ + LM_GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ + LM_GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \ + LM_GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \ + LM_GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \ + LM_GGML_TENSOR_LOCALS(size_t, nb, dst, nb) + #ifdef __cplusplus extern "C" { #endif @@ -379,6 +395,7 @@ extern "C" { LM_GGML_OP_GROUP_NORM, LM_GGML_OP_MUL_MAT, + LM_GGML_OP_MUL_MAT_ID, LM_GGML_OP_OUT_PROD, LM_GGML_OP_SCALE, @@ -400,18 +417,13 @@ extern "C" { LM_GGML_OP_ROPE_BACK, LM_GGML_OP_ALIBI, LM_GGML_OP_CLAMP, - LM_GGML_OP_CONV_1D, - LM_GGML_OP_CONV_1D_STAGE_0, // internal - LM_GGML_OP_CONV_1D_STAGE_1, // internal LM_GGML_OP_CONV_TRANSPOSE_1D, - LM_GGML_OP_CONV_2D, - LM_GGML_OP_CONV_2D_STAGE_0, // internal - LM_GGML_OP_CONV_2D_STAGE_1, // internal + LM_GGML_OP_IM2COL, LM_GGML_OP_CONV_TRANSPOSE_2D, LM_GGML_OP_POOL_1D, LM_GGML_OP_POOL_2D, - LM_GGML_OP_UPSCALE, // nearest interpolate + LM_GGML_OP_ARGSORT, LM_GGML_OP_FLASH_ATTN, LM_GGML_OP_FLASH_FF, @@ -451,6 +463,9 @@ extern "C" { LM_GGML_UNARY_OP_GELU, LM_GGML_UNARY_OP_GELU_QUICK, LM_GGML_UNARY_OP_SILU, + LM_GGML_UNARY_OP_LEAKY, + + LM_GGML_UNARY_OP_COUNT, }; enum lm_ggml_object_type { @@ -531,37 +546,33 @@ extern "C" { int n_threads; - // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes - int n_tasks[LM_GGML_MAX_NODES]; - // abort lm_ggml_graph_compute when true bool (*abort_callback)(void * data); void * abort_callback_data; }; - // next prime after LM_GGML_MAX_NODES - // #define LM_GGML_GRAPH_HASHTABLE_SIZE 4099 - // next prime after LM_GGML_MAX_NODES * 2 (nodes + leafs) - // #define LM_GGML_GRAPH_HASHTABLE_SIZE 8273 - // #define LM_GGML_GRAPH_HASHTABLE_SIZE 16411 - #define LM_GGML_GRAPH_HASHTABLE_SIZE 32771 - enum lm_ggml_cgraph_eval_order { LM_GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0, LM_GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT, LM_GGML_CGRAPH_EVAL_ORDER_COUNT }; + struct lm_ggml_hash_set { + size_t size; + struct lm_ggml_tensor ** keys; + }; + 
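Note: the usage example at the top of ggml.h (rewritten in the hunk above) reflects the removal of the value-returning graph builders. A hedged before/after sketch of that migration; the wrapper function and the thread count are illustrative, and f is assumed to be a tensor built in ctx as in the header's own example:

#include "ggml.h"

static void example_forward(struct lm_ggml_context * ctx, struct lm_ggml_tensor * f) {
    // old:  struct lm_ggml_cgraph gf = lm_ggml_build_forward(f);
    // new: allocate the graph in the context, then expand it in place
    struct lm_ggml_cgraph * gf = lm_ggml_new_graph(ctx); // LM_GGML_DEFAULT_GRAPH_SIZE nodes, no grads
    lm_ggml_build_forward_expand(gf, f);

    // the work buffer is allocated as part of the context by this helper
    lm_ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 4);
}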
// computation graph struct lm_ggml_cgraph { + int size; int n_nodes; int n_leafs; - struct lm_ggml_tensor * nodes[LM_GGML_MAX_NODES]; - struct lm_ggml_tensor * grads[LM_GGML_MAX_NODES]; - struct lm_ggml_tensor * leafs[LM_GGML_MAX_NODES]; + struct lm_ggml_tensor ** nodes; + struct lm_ggml_tensor ** grads; + struct lm_ggml_tensor ** leafs; - void * visited_hash_table[LM_GGML_GRAPH_HASHTABLE_SIZE]; + struct lm_ggml_hash_set visited_hash_table; enum lm_ggml_cgraph_eval_order order; @@ -571,8 +582,6 @@ extern "C" { int64_t perf_time_us; }; - static const size_t LM_GGML_GRAPH_SIZE = sizeof(struct lm_ggml_cgraph); - // scratch buffer struct lm_ggml_scratch { size_t offs; @@ -617,6 +626,8 @@ extern "C" { LM_GGML_API int64_t lm_ggml_cycles(void); LM_GGML_API int64_t lm_ggml_cycles_per_ms(void); + LM_GGML_API void lm_ggml_print_backtrace(void); + LM_GGML_API void lm_ggml_numa_init(void); // call once for better performance on NUMA systems LM_GGML_API bool lm_ggml_is_numa(void); // true if init detected that system has >1 NUMA node @@ -637,6 +648,9 @@ extern "C" { LM_GGML_API const char * lm_ggml_op_name (enum lm_ggml_op op); LM_GGML_API const char * lm_ggml_op_symbol(enum lm_ggml_op op); + LM_GGML_API const char * lm_ggml_unary_op_name(enum lm_ggml_unary_op op); + LM_GGML_API const char * lm_ggml_op_desc(const struct lm_ggml_tensor * t); // unary or op name + LM_GGML_API size_t lm_ggml_element_size(const struct lm_ggml_tensor * tensor); LM_GGML_API bool lm_ggml_is_quantized(enum lm_ggml_type type); @@ -709,7 +723,7 @@ extern "C" { // Context tensor enumeration and lookup LM_GGML_API struct lm_ggml_tensor * lm_ggml_get_first_tensor(struct lm_ggml_context * ctx); LM_GGML_API struct lm_ggml_tensor * lm_ggml_get_next_tensor (struct lm_ggml_context * ctx, struct lm_ggml_tensor * tensor); - LM_GGML_API struct lm_ggml_tensor * lm_ggml_get_tensor (struct lm_ggml_context * ctx, const char * name); + LM_GGML_API struct lm_ggml_tensor * lm_ggml_get_tensor(struct lm_ggml_context * ctx, const char * name); LM_GGML_API struct lm_ggml_tensor * lm_ggml_set_zero(struct lm_ggml_tensor * tensor); LM_GGML_API struct lm_ggml_tensor * lm_ggml_set_i32 (struct lm_ggml_tensor * tensor, int32_t value); @@ -943,6 +957,10 @@ extern "C" { struct lm_ggml_context * ctx, struct lm_ggml_tensor * a); + LM_GGML_API struct lm_ggml_tensor * lm_ggml_leaky( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a); + LM_GGML_API struct lm_ggml_tensor * lm_ggml_relu_inplace( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a); @@ -1029,6 +1047,15 @@ extern "C" { struct lm_ggml_tensor * a, struct lm_ggml_tensor * b); + // indirect matrix multiplication + // lm_ggml_mul_mat_id(ctx, as, ids, id, b) ~= lm_ggml_mul_mat(as[ids[id]], b) + LM_GGML_API struct lm_ggml_tensor * lm_ggml_mul_mat_id( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * as[], + struct lm_ggml_tensor * ids, + int id, + struct lm_ggml_tensor * b); + // A: m columns, n rows, // B: p columns, n rows, // result is m columns, p rows @@ -1284,6 +1311,14 @@ extern "C" { struct lm_ggml_context * ctx, struct lm_ggml_tensor * a); + // fused soft_max(a*scale + mask) + // mask is optional + LM_GGML_API struct lm_ggml_tensor * lm_ggml_soft_max_ext( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a, + struct lm_ggml_tensor * mask, + float scale); + LM_GGML_API struct lm_ggml_tensor * lm_ggml_soft_max_back( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, @@ -1399,6 +1434,18 @@ extern "C" { float min, float max); + LM_GGML_API struct lm_ggml_tensor * 
lm_ggml_im2col( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a, + struct lm_ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1, + bool is_2D); + LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_1d( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, @@ -1482,6 +1529,8 @@ extern "C" { int s0, // stride int p0); // padding + // the result will have 2*p0 padding for the first dimension + // and 2*p1 padding for the second dimension LM_GGML_API struct lm_ggml_tensor * lm_ggml_pool_2d( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, @@ -1490,8 +1539,8 @@ extern "C" { int k1, int s0, int s1, - int p0, - int p1); + float p0, + float p1); // nearest interpolate // used in stable-diffusion @@ -1500,6 +1549,23 @@ extern "C" { struct lm_ggml_tensor * a, int scale_factor); + // sort rows + enum lm_ggml_sort_order { + LM_GGML_SORT_ASC, + LM_GGML_SORT_DESC, + }; + + LM_GGML_API struct lm_ggml_tensor * lm_ggml_argsort( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a, + enum lm_ggml_sort_order order); + + // top k elements per row + LM_GGML_API struct lm_ggml_tensor * lm_ggml_top_k( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a, + int k); + LM_GGML_API struct lm_ggml_tensor * lm_ggml_flash_attn( struct lm_ggml_context * ctx, struct lm_ggml_tensor * q, @@ -1561,7 +1627,6 @@ extern "C" { int kh); // used in sam - LM_GGML_API struct lm_ggml_tensor * lm_ggml_add_rel_pos( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, @@ -1732,19 +1797,22 @@ extern "C" { LM_GGML_API void lm_ggml_build_forward_expand (struct lm_ggml_cgraph * cgraph, struct lm_ggml_tensor * tensor); LM_GGML_API void lm_ggml_build_backward_expand(struct lm_ggml_context * ctx, struct lm_ggml_cgraph * gf, struct lm_ggml_cgraph * gb, bool keep); - LM_GGML_API struct lm_ggml_cgraph lm_ggml_build_forward (struct lm_ggml_tensor * tensor); - LM_GGML_API struct lm_ggml_cgraph lm_ggml_build_backward(struct lm_ggml_context * ctx, struct lm_ggml_cgraph * gf, bool keep); - // graph allocation in a context - LM_GGML_API struct lm_ggml_cgraph * lm_ggml_new_graph (struct lm_ggml_context * ctx); - LM_GGML_API struct lm_ggml_cgraph * lm_ggml_build_forward_ctx(struct lm_ggml_context * ctx, struct lm_ggml_tensor * tensor); + LM_GGML_API struct lm_ggml_cgraph * lm_ggml_new_graph (struct lm_ggml_context * ctx); // size = LM_GGML_DEFAULT_GRAPH_SIZE, grads = false + LM_GGML_API struct lm_ggml_cgraph * lm_ggml_new_graph_custom (struct lm_ggml_context * ctx, size_t size, bool grads); + LM_GGML_API struct lm_ggml_cgraph * lm_ggml_graph_dup (struct lm_ggml_context * ctx, struct lm_ggml_cgraph * cgraph); + LM_GGML_API struct lm_ggml_cgraph lm_ggml_graph_view (struct lm_ggml_cgraph * cgraph, int i0, int i1); + LM_GGML_API void lm_ggml_graph_cpy (struct lm_ggml_cgraph * src, struct lm_ggml_cgraph * dst); + LM_GGML_API void lm_ggml_graph_reset (struct lm_ggml_cgraph * cgraph); // zero grads + LM_GGML_API void lm_ggml_graph_clear (struct lm_ggml_cgraph * cgraph); + LM_GGML_API size_t lm_ggml_graph_overhead(void); + LM_GGML_API size_t lm_ggml_graph_overhead_custom(size_t size, bool grads); // lm_ggml_graph_plan() has to be called before lm_ggml_graph_compute() // when plan.work_size > 0, caller must allocate memory for plan.work_data LM_GGML_API struct lm_ggml_cplan lm_ggml_graph_plan (struct lm_ggml_cgraph * cgraph, int n_threads /*= LM_GGML_DEFAULT_N_THREADS*/); - LM_GGML_API int lm_ggml_graph_compute(struct lm_ggml_cgraph * cgraph, struct lm_ggml_cplan * cplan); - LM_GGML_API void 
lm_ggml_graph_reset (struct lm_ggml_cgraph * cgraph); + LM_GGML_API int lm_ggml_graph_compute(struct lm_ggml_cgraph * cgraph, struct lm_ggml_cplan * cplan); // same as lm_ggml_graph_compute() but the work data is allocated as a part of the context // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data @@ -1752,8 +1820,8 @@ extern "C" { LM_GGML_API struct lm_ggml_tensor * lm_ggml_graph_get_tensor(struct lm_ggml_cgraph * cgraph, const char * name); - LM_GGML_API void lm_ggml_graph_export(const struct lm_ggml_cgraph * cgraph, const char * fname); - LM_GGML_API struct lm_ggml_cgraph lm_ggml_graph_import(const char * fname, struct lm_ggml_context ** ctx_data, struct lm_ggml_context ** ctx_eval); + LM_GGML_API void lm_ggml_graph_export(const struct lm_ggml_cgraph * cgraph, const char * fname); + LM_GGML_API struct lm_ggml_cgraph * lm_ggml_graph_import(const char * fname, struct lm_ggml_context ** ctx_data, struct lm_ggml_context ** ctx_eval); // print info and performance information for the graph LM_GGML_API void lm_ggml_graph_print(const struct lm_ggml_cgraph * cgraph); @@ -1816,6 +1884,8 @@ extern "C" { struct lm_ggml_opt_params { enum lm_ggml_opt_type type; + size_t graph_size; + int n_threads; // delta-based convergence test @@ -2027,6 +2097,7 @@ extern "C" { LM_GGML_API double lm_gguf_get_val_f64 (const struct lm_gguf_context * ctx, int key_id); LM_GGML_API bool lm_gguf_get_val_bool(const struct lm_gguf_context * ctx, int key_id); LM_GGML_API const char * lm_gguf_get_val_str (const struct lm_gguf_context * ctx, int key_id); + LM_GGML_API const void * lm_gguf_get_val_data(const struct lm_gguf_context * ctx, int key_id); LM_GGML_API int lm_gguf_get_arr_n (const struct lm_gguf_context * ctx, int key_id); LM_GGML_API const void * lm_gguf_get_arr_data(const struct lm_gguf_context * ctx, int key_id); LM_GGML_API const char * lm_gguf_get_arr_str (const struct lm_gguf_context * ctx, int key_id, int i); diff --git a/cpp/grammar-parser.cpp b/cpp/grammar-parser.cpp index ff51cc8..bf89a96 100644 --- a/cpp/grammar-parser.cpp +++ b/cpp/grammar-parser.cpp @@ -190,7 +190,7 @@ namespace grammar_parser { pos = parse_space(pos + 1, is_nested); } else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator if (last_sym_start == out_elements.size()) { - throw std::runtime_error(std::string("expecting preceeding item to */+/? at ") + pos); + throw std::runtime_error(std::string("expecting preceding item to */+/? at ") + pos); } // apply transformation to previous symbol (last_sym_start to end) according to diff --git a/cpp/llama.cpp b/cpp/llama.cpp index b62b25a..f1de553 100644 --- a/cpp/llama.cpp +++ b/cpp/llama.cpp @@ -46,7 +46,6 @@ #endif #include #include - #include // for _fseeki64 #endif #include @@ -75,6 +74,7 @@ #include #include #include +#include #include #if defined(_MSC_VER) @@ -91,6 +91,8 @@ #define LLAMA_ATTRIBUTE_FORMAT(...) 
#endif +#define LLAMA_MAX_NODES 8192 + // // logging // @@ -201,6 +203,8 @@ enum llm_arch { LLM_ARCH_PERSIMMON, LLM_ARCH_REFACT, LLM_ARCH_BLOOM, + LLM_ARCH_STABLELM, + LLM_ARCH_QWEN, LLM_ARCH_UNKNOWN, }; @@ -216,6 +220,8 @@ static std::map LLM_ARCH_NAMES = { { LLM_ARCH_PERSIMMON, "persimmon" }, { LLM_ARCH_REFACT, "refact" }, { LLM_ARCH_BLOOM, "bloom" }, + { LLM_ARCH_STABLELM, "stablelm" }, + { LLM_ARCH_QWEN, "qwen" }, }; enum llm_kv { @@ -262,6 +268,8 @@ enum llm_kv { LLM_KV_TOKENIZER_UNK_ID, LLM_KV_TOKENIZER_SEP_ID, LLM_KV_TOKENIZER_PAD_ID, + LLM_KV_TOKENIZER_ADD_BOS, + LLM_KV_TOKENIZER_ADD_EOS, LLM_KV_TOKENIZER_HF_JSON, LLM_KV_TOKENIZER_RWKV, }; @@ -310,6 +318,8 @@ static std::map LLM_KV_NAMES = { { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" }, { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" }, { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" }, + { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" }, + { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" }, { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" }, { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" }, }; @@ -504,6 +514,41 @@ static std::map> LLM_TENSOR_NAMES = { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, }, }, + { + LLM_ARCH_STABLELM, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, + { + LLM_ARCH_QWEN, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, + { LLM_ARCH_UNKNOWN, { @@ -557,21 +602,6 @@ struct LLM_TN { // gguf helpers // -#define LM_GGUF_GET_KEY(ctx, dst, func, type, req, key) \ -do { \ - const std::string skey(key); \ - const int kid = lm_gguf_find_key(ctx, skey.c_str()); \ - if (kid >= 0) { \ - enum lm_gguf_type ktype = lm_gguf_get_kv_type(ctx, kid); \ - if (ktype != (type)) { \ - throw std::runtime_error(format("key %s has wrong type: %s", skey.c_str(), lm_gguf_type_name(ktype))); \ - } \ - (dst) = func(ctx, kid); \ - } else if (req) { \ - throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \ - } \ -} while (0) - static std::map LLAMA_ROPE_SCALING_TYPES = { { LLAMA_ROPE_SCALING_NONE, "none" }, { LLAMA_ROPE_SCALING_LINEAR, "linear" }, @@ -588,6 +618,60 @@ static int8_t llama_rope_scaling_type_from_string(const std::string & name) { return LLAMA_ROPE_SCALING_UNSPECIFIED; } +static std::string lm_gguf_data_to_str(enum lm_gguf_type type, const void * data, int i) { + switch (type) { + case LM_GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]); + case LM_GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]); + case LM_GGUF_TYPE_UINT16: 
return std::to_string(((const uint16_t *)data)[i]); + case LM_GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]); + case LM_GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]); + case LM_GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]); + case LM_GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]); + case LM_GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]); + case LM_GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]); + case LM_GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]); + case LM_GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false"; + default: return format("unknown type %d", type); + } +} + +static std::string lm_gguf_kv_to_str(const struct lm_gguf_context * ctx_gguf, int i) { + const enum lm_gguf_type type = lm_gguf_get_kv_type(ctx_gguf, i); + + switch (type) { + case LM_GGUF_TYPE_STRING: + return lm_gguf_get_val_str(ctx_gguf, i); + case LM_GGUF_TYPE_ARRAY: + { + const enum lm_gguf_type arr_type = lm_gguf_get_arr_type(ctx_gguf, i); + int arr_n = lm_gguf_get_arr_n(ctx_gguf, i); + const void * data = lm_gguf_get_arr_data(ctx_gguf, i); + std::stringstream ss; + ss << "["; + for (int j = 0; j < arr_n; j++) { + if (arr_type == LM_GGUF_TYPE_STRING) { + std::string val = lm_gguf_get_arr_str(ctx_gguf, i, j); + // escape quotes + replace_all(val, "\\", "\\\\"); + replace_all(val, "\"", "\\\""); + ss << '"' << val << '"'; + } else if (arr_type == LM_GGUF_TYPE_ARRAY) { + ss << "???"; + } else { + ss << lm_gguf_data_to_str(arr_type, data, j); + } + if (j < arr_n - 1) { + ss << ", "; + } + } + ss << "]"; + return ss.str(); + } + default: + return lm_gguf_data_to_str(type, lm_gguf_get_val_data(ctx_gguf, i), 0); + } +} + // // ggml helpers // @@ -1048,6 +1132,12 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_ // struct llama_state { + llama_state() { +#ifdef LM_GGML_USE_METAL + lm_ggml_metal_log_set_callback(log_callback, log_callback_user_data); +#endif + } + // We save the log callback globally lm_ggml_log_callback log_callback = llama_log_callback_default; void * log_callback_user_data = nullptr; @@ -1071,9 +1161,9 @@ enum e_model { MODEL_70B, }; -static const size_t kB = 1024; -static const size_t MB = 1024*kB; -static const size_t GB = 1024*MB; +static const size_t kiB = 1024; +static const size_t MiB = 1024*kiB; +static const size_t GiB = 1024*MiB; struct llama_hparams { bool vocab_only; @@ -1152,6 +1242,7 @@ struct llama_cparams { float yarn_beta_slow; bool mul_mat_q; + bool offload_kqv; }; struct llama_layer { @@ -1173,6 +1264,9 @@ struct llama_layer { struct lm_ggml_tensor * wqkv; // attention bias + struct lm_ggml_tensor * bq; + struct lm_ggml_tensor * bk; + struct lm_ggml_tensor * bv; struct lm_ggml_tensor * bo; struct lm_ggml_tensor * bqkv; @@ -1210,14 +1304,15 @@ struct llama_kv_cache { // cannot be freely changed after a slot has been allocated. uint32_t head = 0; uint32_t size = 0; + uint32_t used = 0; // used cells (i.e. 
at least one seq_id) // computed before each graph build uint32_t n = 0; std::vector cells; - struct lm_ggml_tensor * k = NULL; - struct lm_ggml_tensor * v = NULL; + std::vector k_l; // per layer + std::vector v_l; struct lm_ggml_context * ctx = NULL; @@ -1230,8 +1325,10 @@ struct llama_kv_cache { #ifdef LM_GGML_USE_CUBLAS if (lm_ggml_cublas_loaded()) { - lm_ggml_cuda_free_data(k); - lm_ggml_cuda_free_data(v); + for (size_t i = 0; i < k_l.size(); ++i) { + lm_ggml_cuda_free_data(k_l[i]); + lm_ggml_cuda_free_data(v_l[i]); + } } #endif } @@ -1264,6 +1361,9 @@ struct llama_vocab { id special_sep_id = -1; id special_pad_id = -1; + int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add. + int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add. + id linefeed_id = 13; id special_prefix_id = 32007; id special_middle_id = 32009; @@ -1308,6 +1408,9 @@ struct llama_model { int n_gpu_layers; + // gguf metadata + std::unordered_map lm_gguf_kv; + // context struct lm_ggml_context * ctx = NULL; @@ -1415,9 +1518,11 @@ struct llama_context { static bool llama_kv_cache_init( const struct llama_hparams & hparams, struct llama_kv_cache & cache, - lm_ggml_type wtype, + lm_ggml_type ktype, + lm_ggml_type vtype, uint32_t n_ctx, - int n_gpu_layers) { + int n_gpu_layers, + bool offload) { const uint32_t n_embd = hparams.n_embd_gqa(); const uint32_t n_layer = hparams.n_layer; @@ -1428,11 +1533,12 @@ static bool llama_kv_cache_init( cache.head = 0; cache.size = n_ctx; + cache.used = 0; cache.cells.clear(); cache.cells.resize(n_ctx); - cache.buf.resize(2u*n_elements*lm_ggml_type_size(wtype) + 2u*lm_ggml_tensor_overhead()); + cache.buf.resize(n_elements*(lm_ggml_type_sizef(ktype) + lm_ggml_type_sizef(vtype)) + 2u*n_layer*lm_ggml_tensor_overhead()); memset(cache.buf.data, 0, cache.buf.size); struct lm_ggml_init_params params; @@ -1442,37 +1548,44 @@ static bool llama_kv_cache_init( cache.ctx = lm_ggml_init(params); + size_t vram_kv_cache = 0; + if (!cache.ctx) { LLAMA_LOG_ERROR("%s: failed to allocate memory for kv cache\n", __func__); return false; } - cache.k = lm_ggml_new_tensor_1d(cache.ctx, wtype, n_elements); - cache.v = lm_ggml_new_tensor_1d(cache.ctx, wtype, n_elements); - lm_ggml_set_name(cache.k, "cache_k"); - lm_ggml_set_name(cache.v, "cache_v"); + cache.k_l.reserve(n_layer); + cache.v_l.reserve(n_layer); - (void) n_gpu_layers; + const int i_gpu_start = (int) n_layer - n_gpu_layers; LM_GGML_UNUSED(i_gpu_start); -#ifdef LM_GGML_USE_CUBLAS - if (lm_ggml_cublas_loaded()) { - size_t vram_kv_cache = 0; + LM_GGML_UNUSED(offload); - if (n_gpu_layers > (int)n_layer + 1) { - lm_ggml_cuda_assign_buffers_no_scratch(cache.v); - LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__); - vram_kv_cache += lm_ggml_nbytes(cache.v); - } - if (n_gpu_layers > (int)n_layer + 2) { - lm_ggml_cuda_assign_buffers_no_scratch(cache.k); - LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__); - vram_kv_cache += lm_ggml_nbytes(cache.k); - } - if (vram_kv_cache > 0) { - LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0); + for (int i = 0; i < (int) n_layer; i++) { + lm_ggml_tensor * k = lm_ggml_new_tensor_1d(cache.ctx, ktype, n_embd*n_ctx); + lm_ggml_tensor * v = lm_ggml_new_tensor_1d(cache.ctx, vtype, n_embd*n_ctx); + lm_ggml_format_name(k, "cache_k_l%d", i); + lm_ggml_format_name(v, "cache_v_l%d", i); + cache.k_l.push_back(k); + cache.v_l.push_back(v); +#ifdef LM_GGML_USE_CUBLAS + if (i >= i_gpu_start) { + if (offload) { + lm_ggml_cuda_assign_buffers_no_scratch(k); + 
vram_kv_cache += lm_ggml_nbytes(k); + lm_ggml_cuda_assign_buffers_no_scratch(v); + vram_kv_cache += lm_ggml_nbytes(v); + } } +#endif // LM_GGML_USE_CUBLAS } -#endif + + if (vram_kv_cache > 0) { + LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0); + } + + LM_GGML_UNUSED(n_gpu_layers); return true; } @@ -1529,6 +1642,8 @@ static bool llama_kv_cache_find_slot( } } + cache.used += n_tokens; + return true; } @@ -1549,6 +1664,7 @@ static void llama_kv_cache_clear(struct llama_kv_cache & cache) { cache.cells[i].seq_id.clear(); } cache.head = 0; + cache.used = 0; } static void llama_kv_cache_seq_rm( @@ -1571,6 +1687,9 @@ static void llama_kv_cache_seq_rm( continue; } if (cache.cells[i].seq_id.empty()) { + // keep count of the number of used cells + if (cache.cells[i].pos >= 0) cache.used--; + cache.cells[i].pos = -1; if (new_head == cache.size) new_head = i; } @@ -1578,7 +1697,7 @@ static void llama_kv_cache_seq_rm( } // If we freed up a slot, set head to it so searching can start there. - if (new_head != cache.size) cache.head = new_head; + if (new_head != cache.size && new_head < cache.head) cache.head = new_head; } static void llama_kv_cache_seq_cp( @@ -1604,6 +1723,7 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id for (uint32_t i = 0; i < cache.size; ++i) { if (!cache.cells[i].has_seq_id(seq_id)) { + if (cache.cells[i].pos >= 0) cache.used--; cache.cells[i].pos = -1; cache.cells[i].seq_id.clear(); if (new_head == cache.size) new_head = i; @@ -1614,7 +1734,7 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id } // If we freed up a slot, set head to it so searching can start there. - if (new_head != cache.size) cache.head = new_head; + if (new_head != cache.size && new_head < cache.head) cache.head = new_head; } static void llama_kv_cache_seq_shift( @@ -1635,6 +1755,7 @@ static void llama_kv_cache_seq_shift( cache.cells[i].delta += delta; if (cache.cells[i].pos < 0) { + if (!cache.cells[i].seq_id.empty()) cache.used--; cache.cells[i].pos = -1; cache.cells[i].seq_id.clear(); if (new_head == cache.size) new_head = i; @@ -1685,6 +1806,169 @@ static std::string llama_format_tensor_shape(const struct lm_ggml_tensor * t) { return buf; } +namespace GGUFMeta { + template + struct GKV_Base_Type { + static constexpr lm_gguf_type gt = gt_; + + static T getter(const lm_gguf_context * ctx, const int kid) { + return gfun(ctx, kid); + } + }; + + template struct GKV_Base; + + template<> struct GKV_Base: GKV_Base_Type {}; + template<> struct GKV_Base: GKV_Base_Type {}; + template<> struct GKV_Base: GKV_Base_Type {}; + template<> struct GKV_Base: GKV_Base_Type {}; + template<> struct GKV_Base: GKV_Base_Type {}; + template<> struct GKV_Base: GKV_Base_Type {}; + template<> struct GKV_Base: GKV_Base_Type {}; + template<> struct GKV_Base: GKV_Base_Type {}; + template<> struct GKV_Base: GKV_Base_Type {}; + template<> struct GKV_Base: GKV_Base_Type {}; + template<> struct GKV_Base: GKV_Base_Type {}; + template<> struct GKV_Base: GKV_Base_Type {}; + + template<> struct GKV_Base { + static constexpr lm_gguf_type gt = LM_GGUF_TYPE_STRING; + + static std::string getter(const lm_gguf_context * ctx, const int kid) { + return lm_gguf_get_val_str(ctx, kid); + } + }; + + struct ArrayInfo{ + const lm_gguf_type gt; + const size_t length; + const void * data; + }; + + template<> struct GKV_Base { + public: + static constexpr lm_gguf_type gt = LM_GGUF_TYPE_ARRAY; + static ArrayInfo getter(const lm_gguf_context *ctx, const int 
k) { + return ArrayInfo { + lm_gguf_get_arr_type(ctx, k), + size_t(lm_gguf_get_arr_n(ctx, k)), + lm_gguf_get_arr_data(ctx, k), + }; + } + }; + + template + class GKV: public GKV_Base { + GKV() = delete; + + public: + static T get_kv(const lm_gguf_context * ctx, const int k) { + const enum lm_gguf_type kt = lm_gguf_get_kv_type(ctx, k); + + if (kt != GKV::gt) { + throw std::runtime_error(format("key %s has wrong type %s but expected type %s", + lm_gguf_get_key(ctx, k), lm_gguf_type_name(kt), lm_gguf_type_name(GKV::gt))); + } + return GKV::getter(ctx, k); + } + + static const char * override_type_to_str(const llama_model_kv_override_type ty) { + switch (ty) { + case LLAMA_KV_OVERRIDE_BOOL: return "bool"; + case LLAMA_KV_OVERRIDE_INT: return "int"; + case LLAMA_KV_OVERRIDE_FLOAT: return "float"; + } + return "unknown"; + } + + static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override *override) { + if (!override) { return false; } + if (override->tag == expected_type) { + LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ", + __func__, override_type_to_str(override->tag), override->key); + switch (override->tag) { + case LLAMA_KV_OVERRIDE_BOOL: { + printf("%s\n", override->bool_value ? "true" : "false"); + } break; + case LLAMA_KV_OVERRIDE_INT: { + printf("%" PRId64 "\n", override->int_value); + } break; + case LLAMA_KV_OVERRIDE_FLOAT: { + printf("%.6f\n", override->float_value); + } break; + default: + // Shouldn't be possible to end up here, but just in case... + throw std::runtime_error( + format("Unsupported attempt to override %s type for metadata key %s\n", + override_type_to_str(override->tag), override->key)); + } + return true; + } + LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n", + __func__, override->key, override_type_to_str(expected_type), override_type_to_str(override->tag)); + return false; + } + + template + static typename std::enable_if::value, bool>::type + try_override(OT & target, const struct llama_model_kv_override *override) { + if (validate_override(LLAMA_KV_OVERRIDE_BOOL, override)) { + target = override->bool_value; + return true; + } + return true; + } + + template + static typename std::enable_if::value && std::is_integral::value, bool>::type + try_override(OT & target, const struct llama_model_kv_override *override) { + if (validate_override(LLAMA_KV_OVERRIDE_INT, override)) { + target = override->int_value; + return true; + } + return false; + } + + template + static typename std::enable_if::value, bool>::type + try_override(T & target, const struct llama_model_kv_override *override) { + if (validate_override(LLAMA_KV_OVERRIDE_FLOAT, override)) { + target = override->float_value; + return true; + } + return false; + } + + template + static typename std::enable_if::value, bool>::type + try_override(T & target, const struct llama_model_kv_override *override) { + (void)target; + (void)override; + if (!override) { return false; } + // Currently, we should never end up here so it would be a bug if we do. + throw std::runtime_error(format("Unsupported attempt to override string type for metadata key %s\n", + override ? 
override->key : "NULL")); + } + + static bool set(const lm_gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override *override = nullptr) { + if (try_override(target, override)) { + return true; + } + if (k < 0) { return false; } + target = get_kv(ctx, k); + return true; + } + + static bool set(const lm_gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override *override = nullptr) { + return set(ctx, lm_gguf_find_key(ctx, key), target, override); + } + + static bool set(const lm_gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override *override = nullptr) { + return set(ctx, key.c_str(), target, override); + } + }; +} + struct llama_model_loader { int n_kv = 0; int n_tensors = 0; @@ -1700,21 +1984,34 @@ struct llama_model_loader { llama_fver fver; std::unique_ptr mapping; + std::unordered_map kv_overrides; struct lm_gguf_context * ctx_gguf = NULL; struct lm_ggml_context * ctx_meta = NULL; - llama_model_loader(const std::string & fname, bool use_mmap) : file(fname.c_str(), "rb") { + std::string arch_name; + LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN); + + llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) : file(fname.c_str(), "rb") { struct lm_gguf_init_params params = { /*.no_alloc = */ true, /*.ctx = */ &ctx_meta, }; + if (param_overrides_p != nullptr) { + for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) { + kv_overrides.insert({std::string(p->key), *p}); + } + } + ctx_gguf = lm_gguf_init_from_file(fname.c_str(), params); if (!ctx_gguf) { throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str())); } + get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); + llm_kv = LLM_KV(llm_arch_from_string(arch_name)); + n_kv = lm_gguf_get_n_kv(ctx_gguf); n_tensors = lm_gguf_get_n_tensors(ctx_gguf); @@ -1766,10 +2063,10 @@ struct llama_model_loader { case LM_GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break; case LM_GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break; default: - { - LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, lm_ggml_type_name(type_max)); - ftype = LLAMA_FTYPE_ALL_F32; - } break; + { + LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, lm_ggml_type_name(type_max)); + ftype = LLAMA_FTYPE_ALL_F32; + } break; } // this is a way to mark that we have "guessed" the file type @@ -1782,11 +2079,23 @@ struct llama_model_loader { } } + LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__); for (int i = 0; i < n_kv; i++) { - const char * name = lm_gguf_get_key(ctx_gguf, i); - const enum lm_gguf_type type = lm_gguf_get_kv_type(ctx_gguf, i); + const char * name = lm_gguf_get_key(ctx_gguf, i); + const enum lm_gguf_type type = lm_gguf_get_kv_type(ctx_gguf, i); + const std::string type_name = + type == LM_GGUF_TYPE_ARRAY + ? 
format("%s[%s,%d]", lm_gguf_type_name(type), lm_gguf_type_name(lm_gguf_get_arr_type(ctx_gguf, i)), lm_gguf_get_arr_n(ctx_gguf, i)) + : lm_gguf_type_name(type); + + std::string value = lm_gguf_kv_to_str(ctx_gguf, i); + const size_t MAX_VALUE_LEN = 40; + if (value.size() > MAX_VALUE_LEN) { + value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str()); + } + replace_all(value, "\n", "\\n"); - LLAMA_LOG_INFO("%s: - kv %3d: %42s %-8s\n", __func__, i, name, lm_gguf_type_name(type)); + LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str()); } // print type counts @@ -1816,19 +2125,59 @@ struct llama_model_loader { } } - std::string get_arch_name() const { - const auto kv = LLM_KV(LLM_ARCH_UNKNOWN); + template + typename std::enable_if::value, bool>::type + get_arr_n(const std::string & key, T & result, const bool required = true) { + const int kid = lm_gguf_find_key(ctx_gguf, key.c_str()); + + if (kid < 0) { + if (required) { + throw std::runtime_error(format("key not found in model: %s", key.c_str())); + } + return false; + } + + struct GGUFMeta::ArrayInfo arr_info = + GGUFMeta::GKV::get_kv(ctx_gguf, kid); + + + result = arr_info.length; + return true; + } + + template + typename std::enable_if::value, bool>::type + get_arr_n(const enum llm_kv kid, T & result, const bool required = true) { + return get_arr_n(llm_kv(kid), result, required); + } + + template + bool get_key(const std::string & key, T & result, const bool required = true) { + auto it = kv_overrides.find(key); + + const struct llama_model_kv_override * override = + it != kv_overrides.end() ? &it->second : nullptr; + + const bool found = GGUFMeta::GKV::set(ctx_gguf, key, result, override); + + if (required && !found) { + throw std::runtime_error(format("key not found in model: %s", key.c_str())); + } + + return found; + } - std::string arch_name; - LM_GGUF_GET_KEY(ctx_gguf, arch_name, lm_gguf_get_val_str, LM_GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_ARCHITECTURE)); + template + bool get_key(const enum llm_kv kid, T & result, const bool required = true) { + return get_key(llm_kv(kid), result, required); + } + std::string get_arch_name() const { return arch_name; } enum llm_arch get_arch() const { - const std::string arch_name = get_arch_name(); - - return llm_arch_from_string(arch_name); + return llm_kv.arch; } const char * get_tensor_name(int i) const { @@ -1868,10 +2217,13 @@ struct llama_model_loader { return tensor; } - struct lm_ggml_tensor * create_tensor(struct lm_ggml_context * ctx, const std::string & name, const std::vector & ne, lm_ggml_backend_type backend) { + struct lm_ggml_tensor * create_tensor(struct lm_ggml_context * ctx, const std::string & name, const std::vector & ne, lm_ggml_backend_type backend, bool required = true) { struct lm_ggml_tensor * cur = lm_ggml_get_tensor(ctx_meta, name.c_str()); if (cur == NULL) { + if (!required) { + return NULL; + } throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str())); } @@ -2075,49 +2427,56 @@ static void llm_load_arch(llama_model_loader & ml, llama_model & model) { static void llm_load_hparams( llama_model_loader & ml, llama_model & model) { - struct lm_gguf_context * ctx = ml.ctx_gguf; - - const auto kv = LLM_KV(model.arch); - auto & hparams = model.hparams; + const lm_gguf_context * ctx = ml.ctx_gguf; + + // get metadata as string + for (int i = 0; i < lm_gguf_get_n_kv(ctx); i++) { + enum lm_gguf_type type = lm_gguf_get_kv_type(ctx, i); + if (type == LM_GGUF_TYPE_ARRAY) { + continue; + } + 
const char * name = lm_gguf_get_key(ctx, i); + const std::string value = lm_gguf_kv_to_str(ctx, i); + model.lm_gguf_kv.emplace(name, value); + } // get general kv - LM_GGUF_GET_KEY(ctx, model.name, lm_gguf_get_val_str, LM_GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME)); + ml.get_key(LLM_KV_GENERAL_NAME, model.name, false); // get hparams kv - LM_GGUF_GET_KEY(ctx, hparams.n_vocab, lm_gguf_get_arr_n, LM_GGUF_TYPE_ARRAY, true, kv(LLM_KV_TOKENIZER_LIST)); - LM_GGUF_GET_KEY(ctx, hparams.n_ctx_train, lm_gguf_get_val_u32, LM_GGUF_TYPE_UINT32, true, kv(LLM_KV_CONTEXT_LENGTH)); - LM_GGUF_GET_KEY(ctx, hparams.n_embd, lm_gguf_get_val_u32, LM_GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH)); - LM_GGUF_GET_KEY(ctx, hparams.n_ff, lm_gguf_get_val_u32, LM_GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH)); - LM_GGUF_GET_KEY(ctx, hparams.n_head, lm_gguf_get_val_u32, LM_GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT)); - LM_GGUF_GET_KEY(ctx, hparams.n_layer, lm_gguf_get_val_u32, LM_GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT)); + ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab); + ml.get_key (LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train); + ml.get_key (LLM_KV_EMBEDDING_LENGTH, hparams.n_embd); + ml.get_key (LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff); + ml.get_key (LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head); + ml.get_key (LLM_KV_BLOCK_COUNT, hparams.n_layer); // n_head_kv is optional, default to n_head hparams.n_head_kv = hparams.n_head; - LM_GGUF_GET_KEY(ctx, hparams.n_head_kv, lm_gguf_get_val_u32, LM_GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV)); + ml.get_key(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv, false); - hparams.rope_finetuned = false; - LM_GGUF_GET_KEY(ctx, hparams.rope_finetuned, lm_gguf_get_val_bool, LM_GGUF_TYPE_BOOL, false, - kv(LLM_KV_ROPE_SCALING_FINETUNED)); + bool rope_finetuned = false; + ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false); + hparams.rope_finetuned = rope_finetuned; hparams.n_yarn_orig_ctx = hparams.n_ctx_train; - LM_GGUF_GET_KEY(ctx, hparams.n_yarn_orig_ctx, lm_gguf_get_val_u32, LM_GGUF_TYPE_UINT32, false, - kv(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN)); + ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_yarn_orig_ctx, false); // rope_freq_base (optional) hparams.rope_freq_base_train = 10000.0f; - LM_GGUF_GET_KEY(ctx, hparams.rope_freq_base_train, lm_gguf_get_val_f32, LM_GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE)); + ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false); std::string rope_scaling("linear"); - LM_GGUF_GET_KEY(ctx, rope_scaling, lm_gguf_get_val_str, LM_GGUF_TYPE_STRING, false, kv(LLM_KV_ROPE_SCALING_TYPE)); + ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false); hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling); LM_GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_UNSPECIFIED); // rope_freq_scale (inverse of the kv) is optional float ropescale = 0.0f; - LM_GGUF_GET_KEY(ctx, ropescale, lm_gguf_get_val_f32, LM_GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALING_FACTOR)); - if (ropescale == 0.0f) { // try the old key name - LM_GGUF_GET_KEY(ctx, ropescale, lm_gguf_get_val_f32, LM_GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); + if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) { + // try the old key name + ml.get_key(LLM_KV_ROPE_SCALE_LINEAR, ropescale, false); } hparams.rope_freq_scale_train = ropescale == 0.0f ? 
1.0f : 1.0f/ropescale; @@ -2125,7 +2484,7 @@ static void llm_load_hparams( { hparams.n_rot = hparams.n_embd / hparams.n_head; - LM_GGUF_GET_KEY(ctx, hparams.n_rot, lm_gguf_get_val_u32, LM_GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT)); + ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false); if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) { if (hparams.n_rot != hparams.n_embd / hparams.n_head) { @@ -2140,7 +2499,7 @@ static void llm_load_hparams( switch (model.arch) { case LLM_ARCH_LLAMA: { - LM_GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, lm_gguf_get_val_f32, LM_GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); switch (hparams.n_layer) { case 26: model.type = e_model::MODEL_3B; break; @@ -2154,7 +2513,7 @@ static void llm_load_hparams( } break; case LLM_ARCH_FALCON: { - LM_GGUF_GET_KEY(ctx, hparams.f_norm_eps, lm_gguf_get_val_f32, LM_GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS)); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); switch (hparams.n_layer) { case 32: model.type = e_model::MODEL_7B; break; @@ -2164,7 +2523,7 @@ static void llm_load_hparams( } break; case LLM_ARCH_BAICHUAN: { - LM_GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, lm_gguf_get_val_f32, LM_GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); switch (hparams.n_layer) { case 32: model.type = e_model::MODEL_7B; break; case 40: model.type = e_model::MODEL_13B; break; @@ -2173,7 +2532,7 @@ static void llm_load_hparams( } break; case LLM_ARCH_STARCODER: { - LM_GGUF_GET_KEY(ctx, hparams.f_norm_eps, lm_gguf_get_val_f32, LM_GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS)); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); switch (hparams.n_layer) { case 24: model.type = e_model::MODEL_1B; break; case 36: model.type = e_model::MODEL_3B; break; @@ -2184,7 +2543,7 @@ static void llm_load_hparams( } break; case LLM_ARCH_PERSIMMON: { - LM_GGUF_GET_KEY(ctx, hparams.f_norm_eps, lm_gguf_get_val_f32, LM_GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS)); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); switch (hparams.n_layer) { case 36: model.type = e_model::MODEL_8B; break; default: model.type = e_model::MODEL_UNKNOWN; @@ -2192,7 +2551,7 @@ static void llm_load_hparams( } break; case LLM_ARCH_REFACT: { - LM_GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, lm_gguf_get_val_f32, LM_GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); switch (hparams.n_layer) { case 32: model.type = e_model::MODEL_1B; break; default: model.type = e_model::MODEL_UNKNOWN; @@ -2200,7 +2559,7 @@ static void llm_load_hparams( } break; case LLM_ARCH_BLOOM: { - LM_GGUF_GET_KEY(ctx, hparams.f_norm_eps, lm_gguf_get_val_f32, LM_GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS)); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); switch (hparams.n_layer) { case 24: model.type = e_model::MODEL_1B; break; @@ -2215,9 +2574,9 @@ static void llm_load_hparams( { hparams.f_clamp_kqv = 0.0f; - LM_GGUF_GET_KEY(ctx, hparams.f_norm_eps, lm_gguf_get_val_f32, LM_GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS)); - LM_GGUF_GET_KEY(ctx, hparams.f_clamp_kqv, lm_gguf_get_val_f32, LM_GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_CLAMP_KQV)); - LM_GGUF_GET_KEY(ctx, 
hparams.f_max_alibi_bias, lm_gguf_get_val_f32, LM_GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS)); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false); + ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias); switch (hparams.n_layer) { case 32: model.type = e_model::MODEL_7B; break; @@ -2225,6 +2584,26 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_STABLELM: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + + switch (hparams.n_layer) { + case 32: model.type = e_model::MODEL_3B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; + case LLM_ARCH_QWEN: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + + switch (hparams.n_layer) { + case 32: model.type = e_model::MODEL_7B; break; + case 40: model.type = e_model::MODEL_13B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; + default: (void)0; } @@ -2265,7 +2644,7 @@ static void llm_load_vocab( { std::string tokenizer_name; - LM_GGUF_GET_KEY(ctx, tokenizer_name, lm_gguf_get_val_str, LM_GGUF_TYPE_STRING, true, kv(LLM_KV_TOKENIZER_MODEL)); + ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_name); if (tokenizer_name == "llama") { vocab.type = LLAMA_VOCAB_TYPE_SPM; @@ -2355,16 +2734,30 @@ static void llm_load_vocab( }; for (const auto & it : special_token_types) { const std::string & key = kv(std::get<0>(it)); - int32_t & id = std::get<1>(it), old_id = id; + int32_t & id = std::get<1>(it); + + uint32_t new_id; + if (!ml.get_key(std::get<0>(it), new_id, false)) { + continue; + } + if (new_id >= vocab.id_to_token.size()) { + LLAMA_LOG_WARN("%s: bad special token: '%s' = %ud, using default id %d\n", + __func__, key.c_str(), new_id, id); + } else { + id = new_id; + } + + } - LM_GGUF_GET_KEY(ctx, id, lm_gguf_get_val_u32, LM_GGUF_TYPE_UINT32, false, key); - // Must be >= -1 and < vocab size. Since the key is unsigned, -1 - // can only come from the default value, so there's no point in - // validating that. 
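The reworked special-token loop reads the id into a separate unsigned value and only accepts it when it falls inside the vocabulary, keeping the built-in default otherwise. The same check reduced to a standalone helper (name is illustrative, not part of the patch):

```cpp
#include <cstddef>
#include <cstdint>

// keep the default id when the metadata value is out of range
static int32_t pick_special_id(uint32_t new_id, int32_t default_id, size_t n_vocab) {
    return new_id < n_vocab ? (int32_t) new_id : default_id;
}
```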
- if (size_t(id + 1) > vocab.id_to_token.size()) { - LLAMA_LOG_WARN("%s: bad special token: '%s' = %d, using default id %d\n", - __func__, key.c_str(), id, old_id); - id = old_id; + // Handle add_bos_token and add_eos_token + { + bool temp = true; + + if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) { + vocab.special_add_bos = int(temp); + } + if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) { + vocab.special_add_eos = int(temp); } } } @@ -2497,22 +2890,22 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type)); LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str()); LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9); - if (ml.n_bytes < GB) { - LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements); + if (ml.n_bytes < GiB) { + LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements); } else { LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements); } // general kv - LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str()); + LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str()); // special tokens - if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); } - if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); } - if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); } - if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); } - if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); } - if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); } + if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); } + if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); } + if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); } + if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); } + if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); } + if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); } } static void llm_load_tensors( @@ -2536,7 +2929,7 @@ static void 
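The new special_add_bos / special_add_eos fields are tri-state (-1 unknown, 0 don't add, 1 add), filled from LLM_KV_TOKENIZER_ADD_BOS / LLM_KV_TOKENIZER_ADD_EOS when present. A minimal sketch of how such a flag might be resolved against a per-vocab default at tokenization time; the helper and its default parameter are assumptions:

```cpp
// hypothetical resolution of the tri-state add-BOS flag against a vocab default
static bool should_add_bos(int special_add_bos, bool default_add_bos) {
    return special_add_bos < 0 ? default_add_bos : special_add_bos != 0;
}
```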
llm_load_tensors( ml.calc_sizes(ctx_size, mmapped_size); - LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0); + LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0); // create the ggml context { @@ -2598,14 +2991,7 @@ static void llm_load_tensors( lm_ggml_backend_type backend_output; if (n_gpu_layers > int(n_layer)) { - // norm is not performance relevant on its own but keeping it in VRAM reduces data copying - // on Windows however this is detrimental unless everything is on the GPU -#ifndef _WIN32 - backend_norm = llama_backend_offload; -#else - backend_norm = n_gpu_layers <= (int) n_layer + 2 ? LM_GGML_BACKEND_CPU : llama_backend_offload; -#endif // _WIN32 - + backend_norm = llama_backend_offload; backend_output = llama_backend_offload_split; } else { backend_norm = LM_GGML_BACKEND_CPU; @@ -2642,6 +3028,12 @@ static void llm_load_tensors( layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split); layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + // optional bias tensors + layer.bq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, backend, false); + layer.bk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, backend, false); + layer.bv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, backend, false); + layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend, false); + layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); @@ -2650,9 +3042,14 @@ static void llm_load_tensors( if (backend == LM_GGML_BACKEND_GPU) { vram_weights += - lm_ggml_nbytes(layer.attn_norm) + lm_ggml_nbytes(layer.wq) + lm_ggml_nbytes(layer.wk) + - lm_ggml_nbytes(layer.wv) + lm_ggml_nbytes(layer.wo) + lm_ggml_nbytes(layer.ffn_norm) + - lm_ggml_nbytes(layer.ffn_gate) + lm_ggml_nbytes(layer.ffn_down) + lm_ggml_nbytes(layer.ffn_up); + lm_ggml_nbytes(layer.attn_norm) + lm_ggml_nbytes(layer.wq) + lm_ggml_nbytes(layer.wk) + + lm_ggml_nbytes(layer.wv) + lm_ggml_nbytes(layer.wo) + + (layer.bq ? lm_ggml_nbytes(layer.bq) : 0) + + (layer.bk ? lm_ggml_nbytes(layer.bk) : 0) + + (layer.bv ? lm_ggml_nbytes(layer.bv) : 0) + + (layer.bo ? lm_ggml_nbytes(layer.bo) : 0) + + lm_ggml_nbytes(layer.ffn_norm) + lm_ggml_nbytes(layer.ffn_gate) + + lm_ggml_nbytes(layer.ffn_down) + lm_ggml_nbytes(layer.ffn_up); } } } break; @@ -2664,14 +3061,7 @@ static void llm_load_tensors( lm_ggml_backend_type backend_output; if (n_gpu_layers > int(n_layer)) { - // norm is not performance relevant on its own but keeping it in VRAM reduces data copying - // on Windows however this is detrimental unless everything is on the GPU -#ifndef _WIN32 - backend_norm = llama_backend_offload; -#else - backend_norm = n_gpu_layers <= (int) n_layer + 2 ? 
LM_GGML_BACKEND_CPU : llama_backend_offload; -#endif // _WIN32 - + backend_norm = llama_backend_offload; backend_output = llama_backend_offload_split; } else { backend_norm = LM_GGML_BACKEND_CPU; @@ -2734,14 +3124,7 @@ static void llm_load_tensors( lm_ggml_backend_type backend_output; if (n_gpu_layers > int(n_layer)) { - // norm is not performance relevant on its own but keeping it in VRAM reduces data copying - // on Windows however this is detrimental unless everything is on the GPU -#ifndef _WIN32 - backend_norm = llama_backend_offload; -#else - backend_norm = n_gpu_layers <= (int) n_layer + 2 ? LM_GGML_BACKEND_CPU : llama_backend_offload; -#endif // _WIN32 - + backend_norm = llama_backend_offload; backend_output = llama_backend_offload_split; } else { backend_norm = LM_GGML_BACKEND_CPU; @@ -2811,14 +3194,7 @@ static void llm_load_tensors( lm_ggml_backend_type backend_output; if (n_gpu_layers > int(n_layer)) { - // norm is not performance relevant on its own but keeping it in VRAM reduces data copying - // on Windows however this is detrimental unless everything is on the GPU -#ifndef _WIN32 - backend_norm = llama_backend_offload; -#else - backend_norm = n_gpu_layers <= (int) n_layer + 2 ? LM_GGML_BACKEND_CPU : llama_backend_offload; -#endif // _WIN32 - + backend_norm = llama_backend_offload; backend_output = llama_backend_offload_split; } else { backend_norm = LM_GGML_BACKEND_CPU; @@ -2888,14 +3264,7 @@ static void llm_load_tensors( lm_ggml_backend_type backend_output; if (n_gpu_layers > int(n_layer)) { - // norm is not performance relevant on its own but keeping it in VRAM reduces data copying - // on Windows however this is detrimental unless everything is on the GPU -#ifndef _WIN32 - backend_norm = llama_backend_offload; -#else - backend_norm = n_gpu_layers <= (int) n_layer + 2 ? LM_GGML_BACKEND_CPU : llama_backend_offload; -#endif // _WIN32 - + backend_norm = llama_backend_offload; backend_output = llama_backend_offload_split; } else { backend_norm = LM_GGML_BACKEND_CPU; @@ -2954,14 +3323,7 @@ static void llm_load_tensors( lm_ggml_backend_type backend_output; if (n_gpu_layers > int(n_layer)) { - // norm is not performance relevant on its own but keeping it in VRAM reduces data copying - // on Windows however this is detrimental unless everything is on the GPU -#ifndef _WIN32 - backend_norm = llama_backend_offload; -#else - backend_norm = n_gpu_layers <= (int) n_layer + 2 ? LM_GGML_BACKEND_CPU : llama_backend_offload; -#endif // _WIN32 - + backend_norm = llama_backend_offload; backend_output = llama_backend_offload_split; } else { backend_norm = LM_GGML_BACKEND_CPU; @@ -3032,14 +3394,7 @@ static void llm_load_tensors( lm_ggml_backend_type backend_output; if (n_gpu_layers > int(n_layer)) { - // norm is not performance relevant on its own but keeping it in VRAM reduces data copying - // on Windows however this is detrimental unless everything is on the GPU -#ifndef _WIN32 - backend_norm = llama_backend_offload; -#else - backend_norm = n_gpu_layers <= (int) n_layer + 2 ? 
LM_GGML_BACKEND_CPU : llama_backend_offload; -#endif // _WIN32 - + backend_norm = llama_backend_offload; backend_output = llama_backend_offload_split; } else { backend_norm = LM_GGML_BACKEND_CPU; @@ -3089,41 +3444,167 @@ static void llm_load_tensors( } } } break; - default: - throw std::runtime_error("unknown architecture"); - } - } + case LLM_ARCH_STABLELM: + { + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, LM_GGML_BACKEND_CPU); - ml.done_getting_tensors(); + // output + { + lm_ggml_backend_type backend_norm; + lm_ggml_backend_type backend_output; - // print memory requirements - { - // this is the total memory required to run the inference - size_t mem_required = - ctx_size + - mmapped_size - vram_weights; // weights in VRAM not in memory + if (n_gpu_layers > int(n_layer)) { + backend_norm = llama_backend_offload; + backend_output = llama_backend_offload_split; + } else { + backend_norm = LM_GGML_BACKEND_CPU; + backend_output = LM_GGML_BACKEND_CPU; + } - LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0); + model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); + model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); + model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); -#if defined(LM_GGML_USE_CUBLAS) || defined(LM_GGML_USE_CLBLAST) - const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); + if (backend_norm == LM_GGML_BACKEND_GPU) { + vram_weights += lm_ggml_nbytes(model.output_norm); + } + if (backend_output == LM_GGML_BACKEND_GPU_SPLIT) { + vram_weights += lm_ggml_nbytes(model.output); + } + } - LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu); - if (n_gpu_layers > (int) hparams.n_layer) { - LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__); - } + const uint32_t n_ff = hparams.n_ff; -#ifdef LM_GGML_USE_CUBLAS - const int max_backend_supported_layers = hparams.n_layer + 3; - const int max_offloadable_layers = hparams.n_layer + 3; -#elif LM_GGML_USE_CLBLAST - const int max_backend_supported_layers = hparams.n_layer + 1; - const int max_offloadable_layers = hparams.n_layer + 1; -#endif // LM_GGML_USE_CUBLAS + const int i_gpu_start = n_layer - n_gpu_layers; - LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers); - LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0); -#else + model.layers.resize(n_layer); + + for (uint32_t i = 0; i < n_layer; ++i) { + /* + llama_model_loader: - tensor 4: blk.0.attn_output.weight f16 [ 2560, 2560, 1, 1 ] + */ + const lm_ggml_backend_type backend = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload; // NOLINT + const lm_ggml_backend_type backend_split = int(i) < i_gpu_start ? 
LM_GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + + layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split); + layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split); + layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split); + layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + + layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + + layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + + if (backend == LM_GGML_BACKEND_GPU) { + vram_weights += + lm_ggml_nbytes(layer.attn_norm) + lm_ggml_nbytes(layer.wq) + lm_ggml_nbytes(layer.wk) + + lm_ggml_nbytes(layer.wv) + lm_ggml_nbytes(layer.wo) + lm_ggml_nbytes(layer.ffn_norm) + + lm_ggml_nbytes(layer.ffn_gate) + lm_ggml_nbytes(layer.ffn_down) + lm_ggml_nbytes(layer.ffn_up); + } + } + } break; + case LLM_ARCH_QWEN: + { + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, LM_GGML_BACKEND_CPU); + { + lm_ggml_backend_type backend_norm; + lm_ggml_backend_type backend_output; + + if (n_gpu_layers > int(n_layer)) { + backend_norm = llama_backend_offload; + backend_output = llama_backend_offload_split; + } else { + backend_norm = LM_GGML_BACKEND_CPU; + backend_output = LM_GGML_BACKEND_CPU; + } + + model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); + model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + + if (backend_norm == LM_GGML_BACKEND_GPU) { + vram_weights += lm_ggml_nbytes(model.output_norm); + } + if (backend_output == LM_GGML_BACKEND_GPU_SPLIT) { + vram_weights += lm_ggml_nbytes(model.output); + } + } + + const uint32_t n_ff = hparams.n_ff / 2; + + const int i_gpu_start = n_layer - n_gpu_layers; + + model.layers.resize(n_layer); + + for (uint32_t i = 0; i < n_layer; ++i) { + const lm_ggml_backend_type backend = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload; // NOLINT + const lm_ggml_backend_type backend_split = int(i) < i_gpu_start ? 
LM_GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + + layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd * 3}, backend_split); + layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd * 3}, backend); + layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + + layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + + layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + + if (backend == LM_GGML_BACKEND_GPU) { + vram_weights += + lm_ggml_nbytes(layer.attn_norm) + lm_ggml_nbytes(layer.wqkv) + lm_ggml_nbytes(layer.bqkv) + + lm_ggml_nbytes(layer.wo) + lm_ggml_nbytes(layer.ffn_norm) + lm_ggml_nbytes(layer.ffn_gate) + + lm_ggml_nbytes(layer.ffn_down) + lm_ggml_nbytes(layer.ffn_up); + } + } + } break; + + default: + throw std::runtime_error("unknown architecture"); + } + } + + ml.done_getting_tensors(); + + // print memory requirements + { + // this is the total memory required to run the inference + size_t mem_required = + ctx_size + + mmapped_size - vram_weights; // weights in VRAM not in memory + + LLAMA_LOG_INFO("%s: mem required = %7.2f MiB\n", __func__, mem_required / 1024.0 / 1024.0); + +#if defined(LM_GGML_USE_CUBLAS) || defined(LM_GGML_USE_CLBLAST) + const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); + + LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu); + if (n_gpu_layers > (int) hparams.n_layer) { + LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__); + } + +#ifdef LM_GGML_USE_CUBLAS + const int max_backend_supported_layers = hparams.n_layer + 1; + const int max_offloadable_layers = hparams.n_layer + 1; +#elif LM_GGML_USE_CLBLAST + const int max_backend_supported_layers = hparams.n_layer + 1; + const int max_offloadable_layers = hparams.n_layer + 1; +#endif // LM_GGML_USE_CUBLAS + + LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers); + LLAMA_LOG_INFO("%s: VRAM used: %.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0); +#else (void) n_gpu_layers; #endif // defined(LM_GGML_USE_CUBLAS) || defined(LM_GGML_USE_CLBLAST) } @@ -3156,7 +3637,7 @@ static void llm_load_tensors( static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) { try { - llama_model_loader ml(fname, params.use_mmap); + llama_model_loader ml(fname, params.use_mmap, params.kv_overrides); model.hparams.vocab_only = params.vocab_only; @@ -3252,7 +3733,7 @@ static void llm_build_k_shift( struct lm_ggml_cgraph * graph, llm_rope_type type, int64_t n_ctx, - int64_t n_rot, + int n_rot, float freq_base, float freq_scale, const llm_build_cb & cb) { @@ -3283,11 +3764,11 @@ static void llm_build_k_shift( struct lm_ggml_tensor * tmp = // we rotate only the first n_rot dimensions lm_ggml_rope_custom_inplace(ctx, - lm_ggml_view_3d(ctx, kv.k, - n_rot, n_head_kv, n_ctx, - lm_ggml_element_size(kv.k)*n_embd_head, - lm_ggml_element_size(kv.k)*n_embd_gqa, - 
lm_ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il), + lm_ggml_view_3d(ctx, kv.k_l[il], + n_embd_head, n_head_kv, n_ctx, + lm_ggml_type_sizef(kv.k_l[il]->type)*n_embd_head, + lm_ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa, + 0), K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(tmp, "K_shifted", il); @@ -3314,13 +3795,13 @@ static void llm_build_kv_store( //struct lm_ggml_tensor * v_cur_t = lm_ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed cb(v_cur_t, "v_cur_t", il); - struct lm_ggml_tensor * k_cache_view = lm_ggml_view_1d(ctx, kv.k, n_tokens*n_embd_gqa, - (lm_ggml_element_size(kv.k)*n_embd_gqa)*(il*n_ctx + kv_head)); + struct lm_ggml_tensor * k_cache_view = lm_ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_gqa, + (lm_ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa)*kv_head); cb(k_cache_view, "k_cache_view", il); - struct lm_ggml_tensor * v_cache_view = lm_ggml_view_2d(ctx, kv.v, n_tokens, n_embd_gqa, - ( n_ctx)*lm_ggml_element_size(kv.v), - (il*n_ctx)*lm_ggml_element_size(kv.v)*n_embd_gqa + kv_head*lm_ggml_element_size(kv.v)); + struct lm_ggml_tensor * v_cache_view = lm_ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_gqa, + ( n_ctx)*lm_ggml_element_size(kv.v_l[il]), + (kv_head)*lm_ggml_element_size(kv.v_l[il])); cb(v_cache_view, "v_cache_view", il); // important: storing RoPE-ed version of K in the KV cache! @@ -3472,40 +3953,46 @@ static struct lm_ggml_tensor * llm_build_kqv( cb(q, "q", il); struct lm_ggml_tensor * k = - lm_ggml_view_3d(ctx, kv.k, + lm_ggml_view_3d(ctx, kv.k_l[il], n_embd_head, n_kv, n_head_kv, - lm_ggml_element_size(kv.k)*n_embd_gqa, - lm_ggml_element_size(kv.k)*n_embd_head, - lm_ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il); + lm_ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa, + lm_ggml_type_sizef(kv.k_l[il]->type)*n_embd_head, + 0); cb(k, "k", il); struct lm_ggml_tensor * kq = lm_ggml_mul_mat(ctx, k, q); cb(kq, "kq", il); - kq = lm_ggml_scale(ctx, kq, kq_scale); - cb(kq, "kq_scaled", il); - if (max_alibi_bias > 0.0f) { - // TODO: n_head or n_head_kv - // TODO: K-shift is likely not working - // TODO: change to lm_ggml_add - kq = lm_ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias); - cb(kq, "kq_scaled_alibi", il); - } + // temporary branch until we figure out how to handle lm_ggml_alibi through lm_ggml_add + kq = lm_ggml_scale(ctx, kq, kq_scale); + cb(kq, "kq_scaled", il); + + if (max_alibi_bias > 0.0f) { + // TODO: n_head or n_head_kv + // TODO: K-shift is likely not working + // TODO: change to lm_ggml_add + kq = lm_ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias); + cb(kq, "kq_scaled_alibi", il); + } - kq = lm_ggml_add(ctx, kq, kq_mask); - cb(kq, "kq_masked", il); + kq = lm_ggml_add(ctx, kq, kq_mask); + cb(kq, "kq_masked", il); - kq = lm_ggml_soft_max(ctx, kq); - cb(kq, "kq_soft_max", il); + kq = lm_ggml_soft_max(ctx, kq); + cb(kq, "kq_soft_max", il); + } else { + kq = lm_ggml_soft_max_ext(ctx, kq, kq_mask, 1.0f/sqrtf(float(n_embd_head))); + cb(kq, "kq_soft_max_ext", il); + } // split cached v into n_head heads struct lm_ggml_tensor * v = - lm_ggml_view_3d(ctx, kv.v, + lm_ggml_view_3d(ctx, kv.v_l[il], n_kv, n_embd_head, n_head_kv, - lm_ggml_element_size(kv.v)*n_ctx, - lm_ggml_element_size(kv.v)*n_ctx*n_embd_head, - lm_ggml_element_size(kv.v)*n_ctx*n_embd_gqa*il); + lm_ggml_element_size(kv.v_l[il])*n_ctx, + lm_ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head, + 0); cb(v, "v", il); struct lm_ggml_tensor * kqv = lm_ggml_mul_mat(ctx, v, kq); @@ -3622,7 +4109,7 @@ struct 
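In llm_build_kqv, the unfused scale, mask-add and softmax steps are kept only for the ALiBi branch; otherwise they collapse into a single lm_ggml_soft_max_ext call. A plain C++ sketch of the per-row semantics of that fused op, for reference only (not a ggml call):

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

// reference semantics: softmax(scale * kq + mask), computed row-wise;
// mask is assumed to have the same length as kq
static std::vector<float> soft_max_ext_row(std::vector<float> kq,
                                           const std::vector<float> & mask,
                                           float scale) {
    float max_val = -INFINITY;
    for (size_t i = 0; i < kq.size(); ++i) {
        kq[i] = scale * kq[i] + mask[i];
        max_val = std::max(max_val, kq[i]);
    }
    float sum = 0.0f;
    for (float & x : kq) { x = std::exp(x - max_val); sum += x; }
    for (float & x : kq) { x /= sum; }
    return kq;
}
```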
llm_build_context { } struct lm_ggml_cgraph * build_llama() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph(ctx0); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); LM_GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -3663,12 +4150,24 @@ struct llm_build_context { // compute Q and K and RoPE them struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } Qcur = lm_ggml_rope_custom( ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, @@ -3687,7 +4186,7 @@ struct llm_build_context { llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); cur = llm_build_kqv(ctx0, hparams, kv_self, - model.layers[il].wo, NULL, + model.layers[il].wo, model.layers[il].bo, Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); cb(cur, "kqv_out", il); } @@ -3734,7 +4233,7 @@ struct llm_build_context { } struct lm_ggml_cgraph * build_baichuan() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph(ctx0); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); struct lm_ggml_tensor * cur; struct lm_ggml_tensor * inpL; @@ -3854,7 +4353,7 @@ struct llm_build_context { } struct lm_ggml_cgraph * build_falcon() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph(ctx0); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); struct lm_ggml_tensor * cur; struct lm_ggml_tensor * inpL; @@ -3976,7 +4475,7 @@ struct llm_build_context { } struct lm_ggml_cgraph * build_starcoder() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph(ctx0); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); struct lm_ggml_tensor * cur; struct lm_ggml_tensor * pos; @@ -4075,7 +4574,7 @@ struct llm_build_context { } struct lm_ggml_cgraph * build_persimmon() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph(ctx0); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); const int64_t n_rot = n_embd_head / 2; @@ -4085,6 +4584,7 @@ struct llm_build_context { inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); cb(inpL, "imp_embd", -1); + // inp_pos - contains the positions struct lm_ggml_tensor * inp_pos = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); cb(inp_pos, "inp_pos", -1); @@ -4092,6 +4592,7 @@ struct llm_build_context { struct lm_ggml_tensor * KQ_scale = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, 1); cb(KQ_scale, "KQ_scale", -1); + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); cb(KQ_mask, "KQ_mask", -1); @@ -4220,7 +4721,7 @@ struct llm_build_context { struct lm_ggml_tensor * Kcur = lm_ggml_concat(ctx0, krotated, kpass); cb(Kcur, "Kcur", il); - struct lm_ggml_tensor * Q = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, Qcur, 1, 2, 0, 3)); + struct lm_ggml_tensor * Q = lm_ggml_cont(ctx0, 
lm_ggml_permute(ctx0, Qcur, 2, 1, 0, 3)); cb(Q, "Q", il); Kcur = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, Kcur, 2, 1, 0, 3)); @@ -4285,7 +4786,7 @@ struct llm_build_context { } struct lm_ggml_cgraph * build_refact() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph(ctx0); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); struct lm_ggml_tensor * cur; struct lm_ggml_tensor * inpL; @@ -4330,25 +4831,218 @@ struct llm_build_context { cur = llm_build_kqv(ctx0, hparams, kv_self, model.layers[il].wo, NULL, - Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, cb, il); + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, cb, il); + cb(cur, "kqv_out", il); + } + + struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + } + + cur = lm_ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = lm_ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + lm_ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct lm_ggml_cgraph * build_bloom() { + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + struct lm_ggml_tensor * cur; + struct lm_ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + cb(inpL, "inp_embd", -1); + + // KQ_scale + struct lm_ggml_tensor * KQ_scale = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); + + inpL = llm_build_norm(ctx0, inpL, hparams, + model.tok_norm, + model.tok_norm_b, + LLM_NORM, cb, -1); + cb(inpL, "inp_norm", -1); + + for (int il = 0; il < n_layer; ++il) { + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + cur = lm_ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + cur = lm_ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + + struct lm_ggml_tensor * Qcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct lm_ggml_tensor * Kcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct lm_ggml_tensor * Vcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + + llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); + + cur = llm_build_kqv(ctx0, hparams, kv_self, + model.layers[il].wo, model.layers[il].bo, + Qcur, KQ_scale, KQ_mask, n_ctx, 
n_tokens, n_kv, 8.0f, cb, il); + cb(cur, "kqv_out", il); + } + + // Add the input + struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); + + // FF + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_out", il); + } + + inpL = lm_ggml_add(ctx0, cur, ffn_inp); + cb(inpL, "l_out", il); + } + + cur = llm_build_norm(ctx0, inpL, hparams, + model.output_norm, + model.output_norm_b, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); + + cur = lm_ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + lm_ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct lm_ggml_cgraph * build_mpt() { + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + struct lm_ggml_tensor * cur; + struct lm_ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + cb(inpL, "inp_embd", -1); + + // KQ_scale + struct lm_ggml_tensor * KQ_scale = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); + + for (int il = 0; il < n_layer; ++il) { + struct lm_ggml_tensor * attn_norm; + + attn_norm = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, + NULL, + LLM_NORM, cb, il); + cb(attn_norm, "attn_norm", il); + + // self-attention + { + cur = attn_norm; + + cur = lm_ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + if (hparams.f_clamp_kqv > 0.0f) { + cur = lm_ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); + cb(cur, "wqkv_clamped", il); + } + + struct lm_ggml_tensor * Qcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct lm_ggml_tensor * Kcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct lm_ggml_tensor * Vcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + + llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); + + cur = llm_build_kqv(ctx0, hparams, kv_self, + model.layers[il].wo, NULL, + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, cb, il); cb(cur, "kqv_out", il); } - struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA); + // Add the input + struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpL); cb(ffn_inp, "ffn_inp", il); - // feed-forward network + // feed forward { cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + model.layers[il].ffn_norm, + NULL, + LLM_NORM, cb, il); cb(cur, "ffn_norm", il); cur = llm_build_ffn(ctx0, cur, model.layers[il].ffn_up, NULL, - model.layers[il].ffn_gate, NULL, + NULL, NULL, model.layers[il].ffn_down, NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + LLM_FFN_GELU, 
LLM_FFN_SEQ, cb, il); cb(cur, "ffn_out", il); } @@ -4362,11 +5056,11 @@ struct llm_build_context { cur = inpL; cur = llm_build_norm(ctx0, cur, hparams, - model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + model.output_norm, + NULL, + LLM_NORM, cb, -1); cb(cur, "result_norm", -1); - // lm_head cur = lm_ggml_mul_mat(ctx0, model.output, cur); cb(cur, "result_output", -1); @@ -4375,7 +5069,7 @@ struct llm_build_context { return gf; } - struct lm_ggml_cgraph * build_bloom() { + struct lm_ggml_cgraph * build_stablelm() { struct lm_ggml_cgraph * gf = lm_ggml_new_graph(ctx0); struct lm_ggml_tensor * cur; @@ -4384,6 +5078,10 @@ struct llm_build_context { inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); cb(inpL, "inp_embd", -1); + // inp_pos - contains the positions + struct lm_ggml_tensor * inp_pos = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos", -1); + // KQ_scale struct lm_ggml_tensor * KQ_scale = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, 1); cb(KQ_scale, "KQ_scale", -1); @@ -4392,13 +5090,15 @@ struct llm_build_context { struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); cb(KQ_mask, "KQ_mask", -1); - inpL = llm_build_norm(ctx0, inpL, hparams, - model.tok_norm, - model.tok_norm_b, - LLM_NORM, cb, -1); - cb(inpL, "inp_norm", -1); + // shift the entire K-cache if needed + if (do_rope_shift) { + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, hparams.n_rot, freq_base, freq_scale, cb); + } for (int il = 0; il < n_layer; ++il) { + struct lm_ggml_tensor * inpSA = inpL; + + // norm cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, model.layers[il].attn_norm_b, @@ -4407,35 +5107,42 @@ struct llm_build_context { // self-attention { - cur = lm_ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); - cb(cur, "wqkv", il); + // compute Q and K and RoPE them + struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); - cur = lm_ggml_add(ctx0, cur, model.layers[il].bqkv); - cb(cur, "bqkv", il); + struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); - struct lm_ggml_tensor * Qcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - struct lm_ggml_tensor * Kcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - struct lm_ggml_tensor * Vcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + Qcur = lm_ggml_rope_custom( + ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = lm_ggml_rope_custom( + ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); cur = llm_build_kqv(ctx0, hparams, kv_self, - model.layers[il].wo, model.layers[il].bo, - Qcur, 
KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, cb, il); + model.layers[il].wo, NULL, + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); cb(cur, "kqv_out", il); } - // Add the input - struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpL); + struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); - // FF + // feed-forward network { cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, @@ -4444,23 +5151,29 @@ struct llm_build_context { cb(cur, "ffn_norm", il); cur = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, - NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, - LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); } - inpL = lm_ggml_add(ctx0, cur, ffn_inp); - cb(inpL, "l_out", il); + cur = lm_ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; } - cur = llm_build_norm(ctx0, inpL, hparams, + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1); cb(cur, "result_norm", -1); + // lm_head cur = lm_ggml_mul_mat(ctx0, model.output, cur); cb(cur, "result_output", -1); @@ -4469,8 +5182,8 @@ struct llm_build_context { return gf; } - struct lm_ggml_cgraph * build_mpt() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph(ctx0); + struct lm_ggml_cgraph * build_qwen() { + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); struct lm_ggml_tensor * cur; struct lm_ggml_tensor * inpL; @@ -4478,70 +5191,86 @@ struct llm_build_context { inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); cb(inpL, "inp_embd", -1); + // inp_pos - contains the positions + struct lm_ggml_tensor * inp_pos= lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos", -1); + // KQ_scale - struct lm_ggml_tensor * KQ_scale = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, 1); + struct lm_ggml_tensor * KQ_scale= lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, 1); cb(KQ_scale, "KQ_scale", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); + struct lm_ggml_tensor * KQ_mask= lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); cb(KQ_mask, "KQ_mask", -1); + // shift the entire K-cache if needed + if (do_rope_shift) { + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb); + } + for (int il = 0; il < n_layer; ++il) { - struct lm_ggml_tensor * attn_norm; + struct lm_ggml_tensor * inpSA = inpL; - attn_norm = llm_build_norm(ctx0, inpL, hparams, - model.layers[il].attn_norm, - NULL, - LLM_NORM, cb, il); - cb(attn_norm, "attn_norm", il); + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); // self-attention { - cur = attn_norm; - cur = lm_ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); - if (hparams.f_clamp_kqv > 0.0f) { - cur = lm_ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); - cb(cur, "wqkv_clamped", il); - } + cur = lm_ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); - struct lm_ggml_tensor * Qcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd, n_tokens, 
cur->nb[1], 0*sizeof(float)*(n_embd))); - struct lm_ggml_tensor * Kcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - struct lm_ggml_tensor * Vcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + struct lm_ggml_tensor * Qcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct lm_ggml_tensor * Kcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct lm_ggml_tensor * Vcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd))); cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + + // using mode = 2 for neox mode + Qcur = lm_ggml_rope_custom( + ctx0, Qcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = lm_ggml_rope_custom( + ctx0, Kcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); cur = llm_build_kqv(ctx0, hparams, kv_self, model.layers[il].wo, NULL, - Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, cb, il); + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); cb(cur, "kqv_out", il); } - // Add the input - struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpL); + struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); - // feed forward + // feed-forward forward { cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].ffn_norm, - NULL, - LLM_NORM, cb, il); + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); cur = llm_build_ffn(ctx0, cur, model.layers[il].ffn_up, NULL, - NULL, NULL, + model.layers[il].ffn_gate, NULL, model.layers[il].ffn_down, NULL, - LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); } @@ -4555,11 +5284,11 @@ struct llm_build_context { cur = inpL; cur = llm_build_norm(ctx0, cur, hparams, - model.output_norm, - NULL, - LLM_NORM, cb, -1); + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); cb(cur, "result_norm", -1); + // lm_head cur = lm_ggml_mul_mat(ctx0, model.output, cur); cb(cur, "result_output", -1); @@ -4577,8 +5306,8 @@ struct llm_build_context { enum llm_offload_func_e { OFFLOAD_FUNC_NOP, OFFLOAD_FUNC, - OFFLOAD_FUNC_KQ, - OFFLOAD_FUNC_V, + OFFLOAD_FUNC_FRC, // force offload + OFFLOAD_FUNC_KQV, OFFLOAD_FUNC_NR, OFFLOAD_FUNC_EMB, OFFLOAD_FUNC_OUT, @@ -4664,11 +5393,12 @@ static const std::unordered_map k_offload_map //{ "inp_embd", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel { "pos_embd", OFFLOAD_FUNC_NR }, - { "inp_pos", OFFLOAD_FUNC_KQ }, // this is often used for KQ ops (e.g. rope) - { "KQ_scale", OFFLOAD_FUNC_KQ }, - { "KQ_mask", OFFLOAD_FUNC_KQ }, - { "K_shift", OFFLOAD_FUNC_KQ }, - { "K_shifted", OFFLOAD_FUNC_KQ }, + { "inp_pos", OFFLOAD_FUNC_FRC }, // this is often used for KQ ops (e.g. 
rope) + { "KQ_scale", OFFLOAD_FUNC_FRC }, + { "KQ_mask", OFFLOAD_FUNC_FRC }, + { "K_shift", OFFLOAD_FUNC_FRC }, + + { "K_shifted", OFFLOAD_FUNC }, { "inp_norm", OFFLOAD_FUNC_NR }, { "inp_norm_w", OFFLOAD_FUNC_NR }, @@ -4681,37 +5411,38 @@ static const std::unordered_map k_offload_map { "attn_norm", OFFLOAD_FUNC }, { "attn_norm_2", OFFLOAD_FUNC }, - { "wqkv", OFFLOAD_FUNC_KQ }, - { "bqkv", OFFLOAD_FUNC_KQ }, - { "wqkv_clamped", OFFLOAD_FUNC_KQ }, - - { "tmpk", OFFLOAD_FUNC_KQ }, - { "tmpq", OFFLOAD_FUNC_KQ }, - { "tmpv", OFFLOAD_FUNC_V }, - { "Kcur", OFFLOAD_FUNC_KQ }, - { "Qcur", OFFLOAD_FUNC_KQ }, - { "Vcur", OFFLOAD_FUNC_V }, - - { "krot", OFFLOAD_FUNC_KQ }, - { "qrot", OFFLOAD_FUNC_KQ }, - { "kpass", OFFLOAD_FUNC_KQ }, - { "qpass", OFFLOAD_FUNC_KQ }, - { "krotated", OFFLOAD_FUNC_KQ }, - { "qrotated", OFFLOAD_FUNC_KQ }, - - { "q", OFFLOAD_FUNC_KQ }, - { "k", OFFLOAD_FUNC_KQ }, - { "kq", OFFLOAD_FUNC_KQ }, - { "kq_scaled", OFFLOAD_FUNC_KQ }, - { "kq_scaled_alibi", OFFLOAD_FUNC_KQ }, - { "kq_masked", OFFLOAD_FUNC_KQ }, - { "kq_soft_max", OFFLOAD_FUNC_V }, - { "v", OFFLOAD_FUNC_V }, - { "kqv", OFFLOAD_FUNC_V }, - { "kqv_merged", OFFLOAD_FUNC_V }, - { "kqv_merged_cont", OFFLOAD_FUNC_V }, - { "kqv_wo", OFFLOAD_FUNC_V }, - { "kqv_out", OFFLOAD_FUNC_V }, + { "wqkv", OFFLOAD_FUNC_KQV }, + { "bqkv", OFFLOAD_FUNC_KQV }, + { "wqkv_clamped", OFFLOAD_FUNC_KQV }, + + { "tmpk", OFFLOAD_FUNC_KQV }, + { "tmpq", OFFLOAD_FUNC_KQV }, + { "tmpv", OFFLOAD_FUNC_KQV }, + { "Kcur", OFFLOAD_FUNC_KQV }, + { "Qcur", OFFLOAD_FUNC_KQV }, + { "Vcur", OFFLOAD_FUNC_KQV }, + + { "krot", OFFLOAD_FUNC_KQV }, + { "qrot", OFFLOAD_FUNC_KQV }, + { "kpass", OFFLOAD_FUNC_KQV }, + { "qpass", OFFLOAD_FUNC_KQV }, + { "krotated", OFFLOAD_FUNC_KQV }, + { "qrotated", OFFLOAD_FUNC_KQV }, + + { "q", OFFLOAD_FUNC_KQV }, + { "k", OFFLOAD_FUNC_KQV }, + { "kq", OFFLOAD_FUNC_KQV }, + { "kq_scaled", OFFLOAD_FUNC_KQV }, + { "kq_scaled_alibi", OFFLOAD_FUNC_KQV }, + { "kq_masked", OFFLOAD_FUNC_KQV }, + { "kq_soft_max", OFFLOAD_FUNC_KQV }, + { "kq_soft_max_ext", OFFLOAD_FUNC_KQV }, + { "v", OFFLOAD_FUNC_KQV }, + { "kqv", OFFLOAD_FUNC_KQV }, + { "kqv_merged", OFFLOAD_FUNC_KQV }, + { "kqv_merged_cont", OFFLOAD_FUNC_KQV }, + { "kqv_wo", OFFLOAD_FUNC_KQV }, + { "kqv_out", OFFLOAD_FUNC_KQV }, { "ffn_inp", OFFLOAD_FUNC }, { "ffn_norm", OFFLOAD_FUNC }, @@ -4903,15 +5634,15 @@ static struct lm_ggml_cgraph * llama_build_graph( { OFFLOAD_FUNC_NOP, "CPU" }, { OFFLOAD_FUNC_OUT, "CPU" }, #ifdef LM_GGML_USE_CUBLAS - { OFFLOAD_FUNC, "GPU (CUDA)" }, - { OFFLOAD_FUNC_KQ, "GPU (CUDA) KQ" }, - { OFFLOAD_FUNC_V, "GPU (CUDA) V" }, - { OFFLOAD_FUNC_NR, "GPU (CUDA) NR" }, + { OFFLOAD_FUNC, "GPU (CUDA)" }, + { OFFLOAD_FUNC_FRC, "GPU (CUDA) FRC" }, + { OFFLOAD_FUNC_KQV, "GPU (CUDA) KQV" }, + { OFFLOAD_FUNC_NR, "GPU (CUDA) NR" }, { OFFLOAD_FUNC_EMB, "GPU (CUDA) EMB" }, #else { OFFLOAD_FUNC, "CPU" }, - { OFFLOAD_FUNC_KQ, "CPU" }, - { OFFLOAD_FUNC_V, "CPU" }, + { OFFLOAD_FUNC_FRC, "CPU" }, + { OFFLOAD_FUNC_KQV, "CPU" }, { OFFLOAD_FUNC_NR, "CPU" }, { OFFLOAD_FUNC_EMB, "CPU" }, #endif // LM_GGML_USE_CUBLAS @@ -4944,18 +5675,23 @@ static struct lm_ggml_cgraph * llama_build_graph( } } break; - case OFFLOAD_FUNC_NR: - if (n_gpu_layers <= n_layer + 0) { + case OFFLOAD_FUNC_FRC: + if (!lctx.cparams.offload_kqv) { func_e = OFFLOAD_FUNC_NOP; - } - break; - case OFFLOAD_FUNC_V: - if (n_gpu_layers <= n_layer + 1) { + } break; + case OFFLOAD_FUNC_KQV: + if (!lctx.cparams.offload_kqv) { func_e = OFFLOAD_FUNC_NOP; + } else { + if (n_gpu_layers < n_layer) { + if (il < i_gpu_start) { + 
func_e = OFFLOAD_FUNC_NOP; + } + } } break; - case OFFLOAD_FUNC_KQ: - if (n_gpu_layers <= n_layer + 2) { + case OFFLOAD_FUNC_NR: + if (n_gpu_layers <= n_layer + 0) { func_e = OFFLOAD_FUNC_NOP; } break; @@ -4980,8 +5716,8 @@ static struct lm_ggml_cgraph * llama_build_graph( case OFFLOAD_FUNC_NOP: case OFFLOAD_FUNC_OUT: func = lm_ggml_offload_nop; break; case OFFLOAD_FUNC: - case OFFLOAD_FUNC_KQ: - case OFFLOAD_FUNC_V: + case OFFLOAD_FUNC_KQV: + case OFFLOAD_FUNC_FRC: case OFFLOAD_FUNC_NR: case OFFLOAD_FUNC_EMB: func = lm_ggml_offload_gpu; break; default: LM_GGML_ASSERT(false); @@ -5036,6 +5772,14 @@ static struct lm_ggml_cgraph * llama_build_graph( { result = llm.build_mpt(); } break; + case LLM_ARCH_STABLELM: + { + result = llm.build_stablelm(); + } break; + case LLM_ARCH_QWEN: + { + result = llm.build_qwen(); + } break; default: LM_GGML_ASSERT(false); } @@ -5145,6 +5889,12 @@ static int llama_decode_internal( batch.seq_id = seq_id_arr.data(); } + // if we have enough unused cells before the current head -> + // better to start searching from the beginning of the cache, hoping to fill it + if (kv_self.head > kv_self.used + 2*n_tokens) { + kv_self.head = 0; + } + if (!llama_kv_cache_find_slot(kv_self, batch)) { return 1; } @@ -5152,10 +5902,10 @@ static int llama_decode_internal( // a heuristic, to avoid attending the full cache if it is not yet utilized // after enough generations, the benefit from this heuristic disappears // if we start defragmenting the cache, the benefit from this will be more important - //kv_self.n = std::max(32, LM_GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)); // TODO: this might be better for CUDA? - kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self))); + kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, LM_GGML_PAD(llama_kv_cache_cell_max(kv_self), 32))); + //kv_self.n = llama_kv_cache_cell_max(kv_self); - //printf("kv_self.n = %d\n", kv_self.n); + //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); lm_ggml_allocr_reset(lctx.alloc); @@ -5204,17 +5954,8 @@ static int llama_decode_internal( n_threads = std::min(4, n_threads); } - // If all tensors can be run on the GPU then using more than 1 thread is detrimental. - const bool full_offload_supported = - model.arch == LLM_ARCH_LLAMA || - model.arch == LLM_ARCH_BAICHUAN || - model.arch == LLM_ARCH_FALCON || - model.arch == LLM_ARCH_REFACT || - model.arch == LLM_ARCH_MPT || - model.arch == LLM_ARCH_STARCODER; - - const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3; - if (lm_ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) { + const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 1; + if (lm_ggml_cpu_has_cublas() && fully_offloaded) { n_threads = 1; } @@ -6003,7 +6744,10 @@ static std::vector llama_tokenize_internal(const llama_vocab & // by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer // and passing 'add space prefix' as bool argument // - auto raw_text = (special ? 
"" : " ") + fragment.raw_text.substr(fragment.offset, fragment.length); + auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); + if (&fragment == &fragment_buffer.front()) { + raw_text = " " + raw_text; // prefix with space if the first token is not special + } #ifdef PRETOKENIZERDEBUG fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); @@ -6069,11 +6813,13 @@ struct llama_grammar_candidate { // Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`. static std::pair, llama_partial_utf8> decode_utf8( - const char * src, + const std::string & src, llama_partial_utf8 partial_start) { static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 }; - const char * pos = src; + const char * pos = src.c_str(); std::vector code_points; + // common english strings have the same number of codepoints and bytes. `+ 1` for the terminating 0. + code_points.reserve(src.size() + 1); uint32_t value = partial_start.value; int n_remain = partial_start.n_remain; @@ -6677,6 +7423,7 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c // Replace the data in candidates with the new_candidates data std::copy(new_candidates.begin(), new_candidates.end(), candidates->data); candidates->size = new_candidates.size(); + candidates->sorted = false; if (ctx) { ctx->t_sample_us += lm_ggml_time_us() - t_start_sample_us; @@ -6761,7 +7508,9 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c const llama_token eos = llama_token_eos(&ctx->model); std::vector, llama_partial_utf8>> candidates_decoded; + candidates_decoded.reserve(candidates->size); std::vector candidates_grammar; + candidates_grammar.reserve(candidates->size); for (size_t i = 0; i < candidates->size; ++i) { const llama_token id = candidates->data[i].id; @@ -6773,7 +7522,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c } else if (piece.empty() || piece[0] == 0) { candidates->data[i].logit = -INFINITY; } else { - candidates_decoded.push_back(decode_utf8(piece.c_str(), grammar->partial_utf8)); + candidates_decoded.push_back(decode_utf8(piece, grammar->partial_utf8)); candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second }); } } @@ -6980,7 +7729,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar const std::string piece = llama_token_to_piece(ctx, token); // Note terminating 0 in decoded string - const auto decoded = decode_utf8(piece.c_str(), grammar->partial_utf8); + const auto decoded = decode_utf8(piece, grammar->partial_utf8); const auto & code_points = decoded.first; for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) { grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it); @@ -7298,18 +8047,21 @@ static void llama_convert_tensor_internal( return; } - auto block_size = tensor->type == LM_GGML_TYPE_F16 ? 1 : (size_t)lm_ggml_blck_size(tensor->type); - auto block_size_bytes = lm_ggml_type_size(tensor->type); + size_t block_size = tensor->type == LM_GGML_TYPE_F16 ? 
1 : (size_t)lm_ggml_blck_size(tensor->type); + size_t block_size_bytes = lm_ggml_type_size(tensor->type); LM_GGML_ASSERT(nelements % block_size == 0); - auto nblocks = nelements / block_size; - auto blocks_per_thread = nblocks / nthread; - auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count + size_t nblocks = nelements / block_size; + size_t blocks_per_thread = nblocks / nthread; + size_t spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count - for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) { - auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread - auto thr_elems = thr_blocks * block_size; // number of elements for this thread - auto thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread + size_t in_buff_offs = 0; + size_t out_buff_offs = 0; + + for (int tnum = 0; tnum < nthread; tnum++) { + size_t thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread + size_t thr_elems = thr_blocks * block_size; // number of elements for this thread + size_t thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread auto compute = [qtype] (lm_ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) { if (typ == LM_GGML_TYPE_F16) { @@ -7479,7 +8231,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s constexpr bool use_mmap = false; #endif - llama_model_loader ml(fname_inp, use_mmap); + llama_model_loader ml(fname_inp, use_mmap, NULL); if (ml.use_mmap) { ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, lm_ggml_is_numa())); } @@ -7655,7 +8407,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s workers.clear(); } - LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", lm_ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0); + LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB | hist: ", lm_ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0); int64_t tot_count = 0; for (size_t i = 0; i < hist_cur.size(); i++) { hist_all[i] += hist_cur[i]; @@ -7775,7 +8527,7 @@ static int llama_apply_lora_from_file_internal( std::vector base_buf; if (path_base_model) { LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model); - ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true)); + ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ NULL)); size_t ctx_size; size_t mmapped_size; @@ -8003,6 +8755,7 @@ struct llama_model_params llama_model_default_params() { /*.tensor_split =*/ nullptr, /*.progress_callback =*/ nullptr, /*.progress_callback_user_data =*/ nullptr, + /*.kv_overrides =*/ nullptr, /*.vocab_only =*/ false, /*.use_mmap =*/ true, /*.use_mlock =*/ false, @@ -8030,10 +8783,12 @@ struct llama_context_params llama_context_default_params() { /*.yarn_beta_fast =*/ 32.0f, /*.yarn_beta_slow =*/ 1.0f, /*.yarn_orig_ctx =*/ 0, + /*.type_k =*/ LM_GGML_TYPE_F16, + /*.type_v =*/ LM_GGML_TYPE_F16, /*.mul_mat_q =*/ true, - /*.f16_kv =*/ true, /*.logits_all =*/ false, /*.embedding =*/ false, + /*.offload_kqv =*/ true, }; return result; @@ -8150,6 +8905,7 @@ struct llama_context * llama_new_context_with_model( cparams.yarn_beta_fast = params.yarn_beta_fast; cparams.yarn_beta_slow = params.yarn_beta_slow; cparams.mul_mat_q = params.mul_mat_q; + cparams.offload_kqv = 
params.offload_kqv; cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; @@ -8183,19 +8939,36 @@ struct llama_context * llama_new_context_with_model( ctx->rng = std::mt19937(params.seed); ctx->logits_all = params.logits_all; - lm_ggml_type memory_type = params.f16_kv ? LM_GGML_TYPE_F16 : LM_GGML_TYPE_F32; + const lm_ggml_type type_k = params.type_k; + const lm_ggml_type type_v = params.type_v; + + LM_GGML_ASSERT(hparams.n_embd_head() % lm_ggml_blck_size(type_k) == 0); + LM_GGML_ASSERT(hparams.n_embd_head() % lm_ggml_blck_size(type_v) == 0); // reserve memory for context buffers if (!hparams.vocab_only) { - if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, cparams.n_ctx, model->n_gpu_layers)) { + if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, type_k, type_v, cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv)) { LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); llama_free(ctx); return nullptr; } { - const size_t memory_size = lm_ggml_nbytes(ctx->kv_self.k) + lm_ggml_nbytes(ctx->kv_self.v); - LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0); + size_t memory_size_k = 0; + size_t memory_size_v = 0; + + for (auto & k : ctx->kv_self.k_l) { + memory_size_k += lm_ggml_nbytes(k); + } + + for (auto & v : ctx->kv_self.v_l) { + memory_size_v += lm_ggml_nbytes(v); + } + + LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, + (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), + lm_ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f), + lm_ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); } // resized during inference @@ -8212,7 +8985,7 @@ struct llama_context * llama_new_context_with_model( { static const size_t tensor_alignment = 32; // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data - ctx->buf_compute.resize(lm_ggml_tensor_overhead()*LM_GGML_MAX_NODES + lm_ggml_graph_overhead()); + ctx->buf_compute.resize(lm_ggml_tensor_overhead()*LLAMA_MAX_NODES + lm_ggml_graph_overhead()); // create measure allocator ctx->alloc = lm_ggml_allocr_new_measure(tensor_alignment); @@ -8225,8 +8998,6 @@ struct llama_context * llama_new_context_with_model( #ifdef LM_GGML_USE_METAL if (model->n_gpu_layers > 0) { - lm_ggml_metal_log_set_callback(llama_log_callback_default, NULL); - ctx->ctx_metal = lm_ggml_metal_init(1); if (!ctx->ctx_metal) { LLAMA_LOG_ERROR("%s: lm_ggml_metal_init() failed\n", __func__); @@ -8240,7 +9011,7 @@ struct llama_context * llama_new_context_with_model( // measure memory requirements for the graph size_t alloc_size = lm_ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment; - LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0); + LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0); // recreate allocator with exact memory requirements lm_ggml_allocr_free(ctx->alloc); @@ -8254,7 +9025,7 @@ struct llama_context * llama_new_context_with_model( #endif #ifdef LM_GGML_USE_CUBLAS lm_ggml_cuda_set_scratch_size(alloc_size); - LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0); + LLAMA_LOG_INFO("%s: VRAM scratch 
buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0); // calculate total VRAM usage auto add_tensor = [](const lm_ggml_tensor * t, size_t & size) { @@ -8268,16 +9039,20 @@ struct llama_context * llama_new_context_with_model( } size_t kv_vram_size = 0; - add_tensor(ctx->kv_self.k, kv_vram_size); - add_tensor(ctx->kv_self.v, kv_vram_size); + for (auto & k : ctx->kv_self.k_l) { + add_tensor(k, kv_vram_size); + } + for (auto & v : ctx->kv_self.v_l) { + add_tensor(v, kv_vram_size); + } size_t ctx_vram_size = alloc_size + kv_vram_size; size_t total_vram_size = model_vram_size + ctx_vram_size; - LLAMA_LOG_INFO("%s: total VRAM used: %.2f MB (model: %.2f MB, context: %.2f MB)\n", __func__, + LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__, total_vram_size / 1024.0 / 1024.0, model_vram_size / 1024.0 / 1024.0, - ctx_vram_size / 1024.0 / 1024.0); + ctx_vram_size / 1024.0 / 1024.0); #endif } @@ -8298,7 +9073,7 @@ struct llama_context * llama_new_context_with_model( const size_t max_size = lm_ggml_get_max_tensor_size(ctx->model.ctx); - LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0); + LLAMA_LOG_INFO("%s: max tensor size = %8.2f MiB\n", __func__, max_size/1024.0/1024.0); #define LLAMA_METAL_CHECK_BUF(result) \ if (!(result)) { \ @@ -8364,6 +9139,45 @@ float llama_rope_freq_scale_train(const struct llama_model * model) { return model->hparams.rope_freq_scale_train; } +int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) { + const auto & it = model->lm_gguf_kv.find(key); + if (it == model->lm_gguf_kv.end()) { + if (buf_size > 0) { + buf[0] = '\0'; + } + return -1; + } + return snprintf(buf, buf_size, "%s", it->second.c_str()); +} + +int llama_model_meta_count(const struct llama_model * model) { + return (int)model->lm_gguf_kv.size(); +} + +int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) { + if (i < 0 || i >= (int)model->lm_gguf_kv.size()) { + if (buf_size > 0) { + buf[0] = '\0'; + } + return -1; + } + auto it = model->lm_gguf_kv.begin(); + std::advance(it, i); + return snprintf(buf, buf_size, "%s", it->first.c_str()); +} + +int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) { + if (i < 0 || i >= (int)model->lm_gguf_kv.size()) { + if (buf_size > 0) { + buf[0] = '\0'; + } + return -1; + } + auto it = model->lm_gguf_kv.begin(); + std::advance(it, i); + return snprintf(buf, buf_size, "%s", it->second.c_str()); +} + int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) { return snprintf(buf, buf_size, "%s %s %s", llama_model_arch_name(model->arch).c_str(), @@ -8422,8 +9236,107 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha } } +struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq) { + struct llama_kv_cache_view result = { + /*.n_cells = */ 0, + /*.n_max_seq = */ n_max_seq, + /*.token_count = */ 0, + /*.used_cells = */ llama_get_kv_cache_used_cells(ctx), + /*.max_contiguous = */ 0, + /*.max_contiguous_idx = */ -1, + /*.cells = */ nullptr, + /*.cells_sequences = */ nullptr, + }; + return result; +} + +void llama_kv_cache_view_free(struct llama_kv_cache_view * view) { + if (view->cells != nullptr) { + free(view->cells); + view->cells = nullptr; + } + if (view->cells_sequences != nullptr) { + free(view->cells_sequences); + 
view->cells_sequences = nullptr; + } +} + +void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view) { + if (uint32_t(view->n_cells) < ctx->kv_self.size || view->cells == nullptr) { + view->n_cells = int32_t(ctx->kv_self.size); + void * p = realloc(view->cells, sizeof(struct llama_kv_cache_view_cell) * view->n_cells); + LM_GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells"); + view->cells = (struct llama_kv_cache_view_cell *)p; + p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_max_seq * view->n_cells); + LM_GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences"); + view->cells_sequences = (llama_seq_id *)p; + } + + const std::vector & kv_cells = ctx->kv_self.cells; + llama_kv_cache_view_cell * c_curr = view->cells; + llama_seq_id * cs_curr = view->cells_sequences; + int32_t used_cells = 0; + int32_t token_count = 0; + int32_t curr_contig_idx = -1; + uint32_t max_contig = 0; + int32_t max_contig_idx = -1; + + for (int32_t i = 0; i < int32_t(ctx->kv_self.size); i++, c_curr++, cs_curr += view->n_max_seq) { + const size_t curr_size = kv_cells[i].seq_id.size(); + token_count += curr_size; + c_curr->pos = kv_cells[i].pos + kv_cells[i].delta; + + if (curr_size > 0) { + if (curr_contig_idx >= 0 && uint32_t(i - curr_contig_idx) > max_contig) { + max_contig = i - curr_contig_idx; + max_contig_idx = curr_contig_idx; + } + curr_contig_idx = -1; + } else if (curr_contig_idx < 0) { + curr_contig_idx = i; + } + + int seq_idx = 0; + for (const llama_seq_id it : kv_cells[i].seq_id) { + if (seq_idx >= view->n_max_seq) { + break; + } + cs_curr[seq_idx] = it; + seq_idx++; + } + if (seq_idx != 0) { + used_cells++; + } + for (; seq_idx < view->n_max_seq; seq_idx++) { + cs_curr[seq_idx] = -1; + } + } + if (curr_contig_idx >= 0 && kv_cells.size() - curr_contig_idx > max_contig) { + max_contig_idx = curr_contig_idx; + max_contig = kv_cells.size() - curr_contig_idx; + } + view->max_contiguous = max_contig; + view->max_contiguous_idx = max_contig_idx; + view->token_count = token_count; + view->used_cells = used_cells; + if (uint32_t(used_cells) != ctx->kv_self.used) { + LLAMA_LOG_ERROR("%s: used cells mismatch. 
kv_cache says %d but we calculated %d\n", + __func__, ctx->kv_self.used, used_cells); + } +} + int llama_get_kv_cache_token_count(const struct llama_context * ctx) { - return ctx->kv_self.head; + int result = 0; + + for (uint32_t i = 0; i < ctx->kv_self.size; i++) { + result += ctx->kv_self.cells[i].seq_id.size(); + } + + return result; +} + +int llama_get_kv_cache_used_cells(const struct llama_context * ctx) { + return ctx->kv_self.used; } void llama_kv_cache_clear(struct llama_context * ctx) { @@ -8593,43 +9506,53 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat const size_t kv_buf_size = kv_self.buf.size; const uint32_t kv_head = kv_self.head; const uint32_t kv_size = kv_self.size; + const uint32_t kv_used = kv_self.used; data_ctx->write(&kv_buf_size, sizeof(kv_buf_size)); data_ctx->write(&kv_head, sizeof(kv_head)); data_ctx->write(&kv_size, sizeof(kv_size)); + data_ctx->write(&kv_used, sizeof(kv_used)); if (kv_buf_size) { - const size_t elt_size = lm_ggml_element_size(kv_self.k); + const size_t elt_size = lm_ggml_element_size(kv_self.k_l[0]); - lm_ggml_context * cpy_ctx = lm_ggml_init({ 4096, NULL, /* no_alloc */ true }); - lm_ggml_cgraph gf{}; + lm_ggml_context * cpy_ctx = lm_ggml_init({ 6*n_layer*lm_ggml_tensor_overhead() + lm_ggml_graph_overhead(), NULL, /* no_alloc */ true }); + lm_ggml_cgraph * gf = lm_ggml_new_graph(cpy_ctx); - lm_ggml_tensor * kout3d = lm_ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer); - std::vector kout3d_data(lm_ggml_nbytes(kout3d), 0); - kout3d->data = kout3d_data.data(); + std::vector> kout2d_data(n_layer); + std::vector> vout2d_data(n_layer); - lm_ggml_tensor * vout3d = lm_ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer); - std::vector vout3d_data(lm_ggml_nbytes(vout3d), 0); - vout3d->data = vout3d_data.data(); + for (int il = 0; il < (int) n_layer; ++il) { + lm_ggml_tensor * kout2d = lm_ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head); + kout2d_data[il].resize(lm_ggml_nbytes(kout2d)); + kout2d->data = kout2d_data[il].data(); - lm_ggml_tensor * k3d = lm_ggml_view_3d(cpy_ctx, kv_self.k, - n_embd, kv_head, n_layer, - elt_size*n_embd, elt_size*n_embd*n_ctx, 0); + lm_ggml_tensor * vout2d = lm_ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd); + vout2d_data[il].resize(lm_ggml_nbytes(vout2d)); + vout2d->data = vout2d_data[il].data(); - lm_ggml_tensor * v3d = lm_ggml_view_3d(cpy_ctx, kv_self.v, - kv_head, n_embd, n_layer, - elt_size*n_ctx, elt_size*n_ctx*n_embd, 0); + lm_ggml_tensor * k2d = lm_ggml_view_2d(cpy_ctx, kv_self.k_l[il], + n_embd, kv_head, + elt_size*n_embd, 0); - lm_ggml_build_forward_expand(&gf, lm_ggml_cpy(cpy_ctx, k3d, kout3d)); - lm_ggml_build_forward_expand(&gf, lm_ggml_cpy(cpy_ctx, v3d, vout3d)); - lm_ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1); + lm_ggml_tensor * v2d = lm_ggml_view_2d(cpy_ctx, kv_self.v_l[il], + kv_head, n_embd, + elt_size*n_ctx, 0); + + lm_ggml_build_forward_expand(gf, lm_ggml_cpy(cpy_ctx, k2d, kout2d)); + lm_ggml_build_forward_expand(gf, lm_ggml_cpy(cpy_ctx, v2d, vout2d)); + } + + lm_ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1); lm_ggml_free(cpy_ctx); - // our data is now in the kout3d_data and vout3d_data buffers + // our data is now in the kout2d_data and vout2d_data buffers // write them to file - data_ctx->write(kout3d_data.data(), kout3d_data.size()); - data_ctx->write(vout3d_data.data(), vout3d_data.size()); + for (uint32_t il = 0; il < n_layer; ++il) { + 
data_ctx->write(kout2d_data[il].data(), kout2d_data[il].size()); + data_ctx->write(vout2d_data[il].data(), vout2d_data[il].size()); + } } for (uint32_t i = 0; i < kv_size; ++i) { @@ -8719,44 +9642,50 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { size_t kv_buf_size; uint32_t kv_head; uint32_t kv_size; + uint32_t kv_used; memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size); memcpy(&kv_head, inp, sizeof(kv_head)); inp += sizeof(kv_head); memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size); + memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used); if (kv_buf_size) { LM_GGML_ASSERT(kv_self.buf.size == kv_buf_size); - const size_t elt_size = lm_ggml_element_size(kv_self.k); + const size_t elt_size = lm_ggml_element_size(kv_self.k_l[0]); - lm_ggml_context * cpy_ctx = lm_ggml_init({ 4096, NULL, /* no_alloc */ true }); - lm_ggml_cgraph gf{}; + lm_ggml_context * cpy_ctx = lm_ggml_init({ 6*n_layer*lm_ggml_tensor_overhead() + lm_ggml_graph_overhead(), NULL, /* no_alloc */ true }); + lm_ggml_cgraph * gf = lm_ggml_new_graph(cpy_ctx); - lm_ggml_tensor * kin3d = lm_ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer); - kin3d->data = (void *) inp; - inp += lm_ggml_nbytes(kin3d); + for (int il = 0; il < n_layer; ++il) { + lm_ggml_tensor * kin2d = lm_ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head); + kin2d->data = (void *) inp; + inp += lm_ggml_nbytes(kin2d); - lm_ggml_tensor * vin3d = lm_ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer); - vin3d->data = (void *) inp; - inp += lm_ggml_nbytes(vin3d); + lm_ggml_tensor * vin2d = lm_ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd); + vin2d->data = (void *) inp; + inp += lm_ggml_nbytes(vin2d); - lm_ggml_tensor * k3d = lm_ggml_view_3d(cpy_ctx, kv_self.k, - n_embd, kv_head, n_layer, - elt_size*n_embd, elt_size*n_embd*n_ctx, 0); + lm_ggml_tensor * k2d = lm_ggml_view_2d(cpy_ctx, kv_self.k_l[il], + n_embd, kv_head, + elt_size*n_embd, 0); - lm_ggml_tensor * v3d = lm_ggml_view_3d(cpy_ctx, kv_self.v, - kv_head, n_embd, n_layer, - elt_size*n_ctx, elt_size*n_ctx*n_embd, 0); + lm_ggml_tensor * v2d = lm_ggml_view_2d(cpy_ctx, kv_self.v_l[il], + kv_head, n_embd, + elt_size*n_ctx, 0); - lm_ggml_build_forward_expand(&gf, lm_ggml_cpy(cpy_ctx, kin3d, k3d)); - lm_ggml_build_forward_expand(&gf, lm_ggml_cpy(cpy_ctx, vin3d, v3d)); - lm_ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1); + lm_ggml_build_forward_expand(gf, lm_ggml_cpy(cpy_ctx, kin2d, k2d)); + lm_ggml_build_forward_expand(gf, lm_ggml_cpy(cpy_ctx, vin2d, v2d)); + } + + lm_ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1); lm_ggml_free(cpy_ctx); } ctx->kv_self.head = kv_head; ctx->kv_self.size = kv_size; + ctx->kv_self.used = kv_used; ctx->kv_self.cells.resize(kv_size); @@ -9005,6 +9934,14 @@ llama_token llama_token_nl(const struct llama_model * model) { return model->vocab.linefeed_id; } +int llama_add_bos_token(const struct llama_model * model) { + return model->vocab.special_add_bos; +} + +int llama_add_eos_token(const struct llama_model * model) { + return model->vocab.special_add_eos; +} + llama_token llama_token_prefix(const struct llama_model * model) { return model->vocab.special_prefix_id; } @@ -9211,6 +10148,9 @@ const std::vector> & llama_inter void llama_log_set(lm_ggml_log_callback log_callback, void * user_data) { g_state.log_callback = log_callback ? 
log_callback : llama_log_callback_default; g_state.log_callback_user_data = user_data; +#ifdef LM_GGML_USE_METAL + lm_ggml_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data); +#endif } static void llama_log_internal_v(lm_ggml_log_level level, const char * format, va_list args) { diff --git a/cpp/llama.h b/cpp/llama.h index d806999..df8ff8c 100644 --- a/cpp/llama.h +++ b/cpp/llama.h @@ -42,7 +42,7 @@ #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN -#define LLAMA_SESSION_VERSION 2 +#define LLAMA_SESSION_VERSION 3 #if defined(LM_GGML_USE_CUBLAS) || defined(LM_GGML_USE_CLBLAST) || defined(LM_GGML_USE_METAL) // Defined when llama.cpp is compiled with support for offloading model layers to GPU. @@ -158,6 +158,22 @@ extern "C" { llama_seq_id all_seq_id; // used if seq_id == NULL } llama_batch; + enum llama_model_kv_override_type { + LLAMA_KV_OVERRIDE_INT, + LLAMA_KV_OVERRIDE_FLOAT, + LLAMA_KV_OVERRIDE_BOOL, + }; + + struct llama_model_kv_override { + char key[128]; + enum llama_model_kv_override_type tag; + union { + int64_t int_value; + double float_value; + bool bool_value; + }; + }; + struct llama_model_params { int32_t n_gpu_layers; // number of layers to store in VRAM int32_t main_gpu; // the GPU that is used for scratch and small tensors @@ -165,9 +181,13 @@ extern "C" { // called with a progress value between 0 and 1, pass NULL to disable llama_progress_callback progress_callback; + // context pointer passed to the progress callback void * progress_callback_user_data; + // override key-value pairs of the model meta data + const struct llama_model_kv_override * kv_overrides; + // Keep the booleans together to avoid misalignment during copy-by-value. bool vocab_only; // only load the vocabulary, no weights bool use_mmap; // use mmap if possible @@ -185,17 +205,20 @@ extern "C" { // ref: https://github.com/ggerganov/llama.cpp/pull/2054 float rope_freq_base; // RoPE base frequency, 0 = from model float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model - float yarn_ext_factor; // YaRN extrapolation mix factor, NaN = from model + float yarn_ext_factor; // YaRN extrapolation mix factor, negative = from model float yarn_attn_factor; // YaRN magnitude scaling factor float yarn_beta_fast; // YaRN low correction dim float yarn_beta_slow; // YaRN high correction dim uint32_t yarn_orig_ctx; // YaRN original context size + enum lm_ggml_type type_k; // data type for K cache + enum lm_ggml_type type_v; // data type for V cache + // Keep the booleans together to avoid misalignment during copy-by-value. 
- bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true) - bool f16_kv; // use fp16 for KV cache, fp32 otherwise - bool logits_all; // the llama_eval() call computes all logits, not just the last one - bool embedding; // embedding mode only + bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true) + bool logits_all; // the llama_eval() call computes all logits, not just the last one + bool embedding; // embedding mode only + bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU }; // model quantization parameters @@ -301,6 +324,23 @@ extern "C" { // Get the model's RoPE frequency scaling factor LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model); + // Functions to access the model's GGUF metadata scalar values + // - The functions return the length of the string on success, or -1 on failure + // - The output string is always null-terminated and cleared on failure + // - GGUF array values are not supported by these functions + + // Get metadata value as a string by key name + LLAMA_API int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size); + + // Get the number of metadata key/value pairs + LLAMA_API int llama_model_meta_count(const struct llama_model * model); + + // Get metadata key name by index + LLAMA_API int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size); + + // Get metadata value as a string by index + LLAMA_API int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size); + // Get a string describing the model type LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size); @@ -344,9 +384,60 @@ extern "C" { // KV cache // - // Returns the number of tokens in the KV cache - LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx), - "avoid using this, it will be removed in the future, instead - count the tokens in user code"); + // Information associated with an individual cell in the KV cache view. + struct llama_kv_cache_view_cell { + // The position for this cell. Takes KV cache shifts into account. + // May be negative if the cell is not populated. + llama_pos pos; + }; + + // An updateable view of the KV cache. + struct llama_kv_cache_view { + // Number of KV cache cells. This will be the same as the context size. + int32_t n_cells; + + // Maximum number of sequences that can exist in a cell. It's not an error + // if there are more sequences in a cell than this value, however they will + // not be visible in the view cells_sequences. + int32_t n_max_seq; + + // Number of tokens in the cache. For example, if there are two populated + // cells, the first with 1 sequence id in it and the second with 2 sequence + // ids then you'll have 3 tokens. + int32_t token_count; + + // Number of populated cache cells. + int32_t used_cells; + + // Maximum contiguous empty slots in the cache. + int32_t max_contiguous; + + // Index to the start of the max_contiguous slot range. Can be negative + // when cache is full. + int32_t max_contiguous_idx; + + // Information for an individual cell. + struct llama_kv_cache_view_cell * cells; + + // The sequences for each cell. There will be n_max_seq items per cell. + llama_seq_id * cells_sequences; + }; + + // Create an empty KV cache view. 
(use only for debugging purposes) + LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq); + + // Free a KV cache view. (use only for debugging purposes) + LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view); + + // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes) + LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view); + + // Returns the number of tokens in the KV cache (slow, use only for debug) + // If a KV cell has multiple sequences assigned to it, it will be counted multiple times + LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx); + + // Returns the number of used KV cells (i.e. have at least one sequence assigned to them) + LLAMA_API int llama_get_kv_cache_used_cells(const struct llama_context * ctx); // Clear the KV cache LLAMA_API void llama_kv_cache_clear( @@ -517,6 +608,12 @@ extern "C" { LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line + // Returns -1 if unknown, 1 for true or 0 for false. + LLAMA_API int llama_add_bos_token(const struct llama_model * model); + + // Returns -1 if unknown, 1 for true or 0 for false. + LLAMA_API int llama_add_eos_token(const struct llama_model * model); + // codellama infill tokens LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle diff --git a/cpp/sampling.cpp b/cpp/sampling.cpp index 1317024..f4e76df 100644 --- a/cpp/sampling.cpp +++ b/cpp/sampling.cpp @@ -99,6 +99,56 @@ std::string llama_sampling_print(const llama_sampling_params & params) { return std::string(result); } +std::string llama_sampling_order_print(const llama_sampling_params & params) { + std::string result = "CFG -> Penalties "; + if (params.mirostat == 0) { + for (auto s : params.samplers_sequence) { + switch (s) { + case 'k': result += "-> top_k "; break; + case 'f': result += "-> tfs_z "; break; + case 'y': result += "-> typical_p "; break; + case 'p': result += "-> top_p "; break; + case 'm': result += "-> min_p "; break; + case 't': result += "-> temp "; break; + default : break; + } + } + } else { + result += "-> mirostat "; + } + + return result; +} + +// no reasons to expose this function in header +static void sampler_queue( + struct llama_context * ctx_main, + const llama_sampling_params & params, + llama_token_data_array & cur_p, + size_t & min_keep) { + const int n_vocab = llama_n_vocab(llama_get_model(ctx_main)); + + const float temp = params.temp; + const int32_t top_k = params.top_k <= 0 ? 
n_vocab : params.top_k; + const float top_p = params.top_p; + const float min_p = params.min_p; + const float tfs_z = params.tfs_z; + const float typical_p = params.typical_p; + const std::string & samplers_sequence = params.samplers_sequence; + + for (auto s : samplers_sequence) { + switch (s){ + case 'k': llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); break; + case 'f': llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); break; + case 'y': llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break; + case 'p': llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break; + case 'm': llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break; + case 't': llama_sample_temp (ctx_main, &cur_p, temp); break; + default : break; + } + } +} + llama_token llama_sampling_sample( struct llama_sampling_context * ctx_sampling, struct llama_context * ctx_main, @@ -109,11 +159,6 @@ llama_token llama_sampling_sample( const int n_vocab = llama_n_vocab(llama_get_model(ctx_main)); const float temp = params.temp; - const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k; - const float top_p = params.top_p; - const float min_p = params.min_p; - const float tfs_z = params.tfs_z; - const float typical_p = params.typical_p; const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n; const float penalty_repeat = params.penalty_repeat; const float penalty_freq = params.penalty_freq; @@ -188,12 +233,7 @@ llama_token llama_sampling_sample( // temperature sampling size_t min_keep = std::max(1, params.n_probs); - llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); - llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); - llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); - llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); - llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); - llama_sample_temp (ctx_main, &cur_p, temp); + sampler_queue(ctx_main, params, cur_p, min_keep); id = llama_sample_token(ctx_main, &cur_p); diff --git a/cpp/sampling.h b/cpp/sampling.h index 7c9b8dc..fdfa9ee 100644 --- a/cpp/sampling.h +++ b/cpp/sampling.h @@ -10,22 +10,23 @@ // sampling parameters typedef struct llama_sampling_params { - int32_t n_prev = 64; // number of previous tokens to remember - int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. - int32_t top_k = 40; // <= 0 to use vocab size - float top_p = 0.95f; // 1.0 = disabled - float min_p = 0.05f; // 0.0 = disabled - float tfs_z = 1.00f; // 1.0 = disabled - float typical_p = 1.00f; // 1.0 = disabled - float temp = 0.80f; // 1.0 = disabled - int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) - float penalty_repeat = 1.10f; // 1.0 = disabled - float penalty_freq = 0.00f; // 0.0 = disabled - float penalty_present = 0.00f; // 0.0 = disabled - int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 - float mirostat_tau = 5.00f; // target entropy - float mirostat_eta = 0.10f; // learning rate - bool penalize_nl = true; // consider newlines as a repeatable token + int32_t n_prev = 64; // number of previous tokens to remember + int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. 
+ int32_t top_k = 40; // <= 0 to use vocab size + float top_p = 0.95f; // 1.0 = disabled + float min_p = 0.05f; // 0.0 = disabled + float tfs_z = 1.00f; // 1.0 = disabled + float typical_p = 1.00f; // 1.0 = disabled + float temp = 0.80f; // 1.0 = disabled + int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) + float penalty_repeat = 1.10f; // 1.0 = disabled + float penalty_freq = 0.00f; // 0.0 = disabled + float penalty_present = 0.00f; // 0.0 = disabled + int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 + float mirostat_tau = 5.00f; // target entropy + float mirostat_eta = 0.10f; // learning rate + bool penalize_nl = true; // consider newlines as a repeatable token + std::string samplers_sequence = "kfypmt"; // top_k, tail_free, typical_p, top_p, min_p, temp std::string grammar; // optional BNF-like grammar to constrain sampling @@ -80,6 +81,9 @@ std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama // Print sampling parameters into a string std::string llama_sampling_print(const llama_sampling_params & params); +// Print sampling order into a string +std::string llama_sampling_order_print(const llama_sampling_params & params); + // this is a common sampling function used across the examples for convenience // it can serve as a starting point for implementing your own sampling function // Note: When using multiple sequences, it is the caller's responsibility to call diff --git a/example/ios/Podfile.lock b/example/ios/Podfile.lock index 8bf07c5..3758c71 100644 --- a/example/ios/Podfile.lock +++ b/example/ios/Podfile.lock @@ -8,7 +8,7 @@ PODS: - hermes-engine/Pre-built (= 0.72.3) - hermes-engine/Pre-built (0.72.3) - libevent (2.1.12) - - llama-rn (0.3.0-rc.4): + - llama-rn (0.3.0-rc.5): - RCT-Folly - RCTRequired - RCTTypeSafety @@ -1242,7 +1242,7 @@ SPEC CHECKSUMS: glog: 04b94705f318337d7ead9e6d17c019bd9b1f6b1b hermes-engine: 10fbd3f62405c41ea07e71973ea61e1878d07322 libevent: 4049cae6c81cdb3654a443be001fb9bdceff7913 - llama-rn: 0abcf4f4f58615499974d3a30876ddf8ca986012 + llama-rn: 0a0f4d56e8c2ca348c77847cd18709330314042a RCT-Folly: 424b8c9a7a0b9ab2886ffe9c3b041ef628fd4fb1 RCTRequired: a2faf4bad4e438ca37b2040cb8f7799baa065c18 RCTTypeSafety: cb09f3e4747b6d18331a15eb05271de7441ca0b3 diff --git a/ios/RNLlamaContext.mm b/ios/RNLlamaContext.mm index f9c6035..55adae2 100644 --- a/ios/RNLlamaContext.mm +++ b/ios/RNLlamaContext.mm @@ -55,7 +55,6 @@ + (instancetype)initWithParams:(NSDictionary *)params { } if (params[@"n_batch"]) defaultParams.n_batch = [params[@"n_batch"] intValue]; if (params[@"use_mmap"]) defaultParams.use_mmap = [params[@"use_mmap"] boolValue]; - if (params[@"memory_f16"]) defaultParams.memory_f16 = [params[@"memory_f16"] boolValue]; if (params[@"lora"]) { float lora_scaled = 1.0f; diff --git a/llama.cpp b/llama.cpp index a75fa57..8a7b2fa 160000 --- a/llama.cpp +++ b/llama.cpp @@ -1 +1 @@ -Subproject commit a75fa576abba9d37f463580c379e4bbf1e1ad03c +Subproject commit 8a7b2fa528f130631a5f43648481596ab320ed5a diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index 7fd9bc8..8d2a965 100755 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -12,6 +12,7 @@ cp ./llama.cpp/ggml-alloc.h ./cpp/ggml-alloc.h cp ./llama.cpp/ggml-alloc.c ./cpp/ggml-alloc.c cp ./llama.cpp/ggml-backend.h ./cpp/ggml-backend.h cp ./llama.cpp/ggml-backend.c ./cpp/ggml-backend.c +cp ./llama.cpp/ggml-backend-impl.h ./cpp/ggml-backend-impl.h cp ./llama.cpp/ggml-impl.h ./cpp/ggml-impl.h cp ./llama.cpp/llama.h 
./cpp/llama.h cp ./llama.cpp/llama.cpp ./cpp/llama.cpp @@ -41,6 +42,7 @@ files=( "./cpp/ggml-alloc.c" "./cpp/ggml-backend.h" "./cpp/ggml-backend.c" + "./cpp/ggml-backend-impl.h" "./cpp/ggml-impl.h" ) diff --git a/scripts/common.cpp.patch b/scripts/common.cpp.patch index 782ccbe..b1bfd37 100644 --- a/scripts/common.cpp.patch +++ b/scripts/common.cpp.patch @@ -1,6 +1,6 @@ ---- common.cpp.orig 2023-11-07 10:50:44 -+++ common.cpp 2023-11-07 10:50:46 -@@ -1225,8 +1225,6 @@ +--- common.cpp.orig 2023-12-12 10:50:18 ++++ common.cpp 2023-12-12 10:50:19 +@@ -1385,8 +1385,6 @@ const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc) { const llama_sampling_params & sparams = params.sparams; diff --git a/scripts/ggml-metal.m.patch b/scripts/ggml-metal.m.patch index c160468..368b05e 100644 --- a/scripts/ggml-metal.m.patch +++ b/scripts/ggml-metal.m.patch @@ -1,11 +1,11 @@ ---- ggml-metal.m.orig 2023-11-02 10:42:43 -+++ ggml-metal.m 2023-11-02 10:43:38 -@@ -209,7 +209,7 @@ - } else { - LM_GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__); - -- NSString * sourcePath = [bundle pathForResource:@"ggml-metal" ofType:@"metal"]; -+ NSString * sourcePath = [bundle pathForResource:@"ggml-metal-llama" ofType:@"metal"]; +--- ggml-metal.m.orig 2023-12-12 10:46:04 ++++ ggml-metal.m 2023-12-12 10:46:43 +@@ -241,7 +241,7 @@ + if (ggmlMetalPathResources) { + sourcePath = [ggmlMetalPathResources stringByAppendingPathComponent:@"ggml-metal.metal"]; + } else { +- sourcePath = [bundle pathForResource:@"ggml-metal" ofType:@"metal"]; ++ sourcePath = [bundle pathForResource:@"ggml-metal-llama" ofType:@"metal"]; + } if (sourcePath == nil) { LM_GGML_METAL_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__); - sourcePath = @"ggml-metal.metal"; diff --git a/scripts/llama.cpp.patch b/scripts/llama.cpp.patch index 649b186..f88e643 100644 --- a/scripts/llama.cpp.patch +++ b/scripts/llama.cpp.patch @@ -1,9 +1,9 @@ ---- llama.cpp.orig 2023-11-10 13:36:00 -+++ llama.cpp 2023-11-10 13:36:02 -@@ -103,6 +103,17 @@ +--- llama.cpp.orig 2023-12-12 10:46:04 ++++ llama.cpp 2023-12-12 10:46:05 +@@ -105,6 +105,17 @@ #define LLAMA_LOG_WARN(...) llama_log_internal(LM_GGML_LOG_LEVEL_WARN , __VA_ARGS__) #define LLAMA_LOG_ERROR(...) llama_log_internal(LM_GGML_LOG_LEVEL_ERROR, __VA_ARGS__) - + +#if defined(__ANDROID__) && defined(RNLLAMA_ANDROID_ENABLE_LOGGING) +#include +#define LLAMA_ANDROID_TAG "RNLLAMA_LOG_ANDROID" @@ -18,8 +18,8 @@ // // helpers // -@@ -779,16 +790,16 @@ - +@@ -863,16 +874,16 @@ + if (prefetch > 0) { // Advise the kernel to preload the mapped memory - if (posix_madvise(addr, std::min(file->size, prefetch), POSIX_MADV_WILLNEED)) { diff --git a/src/NativeRNLlama.ts b/src/NativeRNLlama.ts index 6cb88e6..d1b1a75 100644 --- a/src/NativeRNLlama.ts +++ b/src/NativeRNLlama.ts @@ -16,8 +16,6 @@ export type NativeContextParams = { use_mlock?: boolean use_mmap?: boolean - memory_f16?: boolean - lora?: string // lora_adaptor lora_scaled?: number lora_base?: string
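For reference, a minimal sketch of how the reworked llama_context_params from cpp/llama.h are used once this diff is applied: the K and V caches now take independent lm_ggml types, and offload_kqv decides whether the cache and the KQV ops stay on the GPU (it is the same flag consulted by the new OFFLOAD_FUNC_FRC / OFFLOAD_FUNC_KQV cases in llama_build_graph). The quantized K-cache type and the context size below are illustrative choices only, constrained by the n_embd_head() % lm_ggml_blck_size(type_k) == 0 assert added in llama_new_context_with_model.

```cpp
// Sketch only: creating a context with the new per-cache KV types.
// Assumes a llama_model * loaded elsewhere and the vendored cpp/llama.h header.
#include "llama.h"

llama_context * make_context(llama_model * model) {
    llama_context_params cparams = llama_context_default_params();

    cparams.n_ctx       = 2048;              // arbitrary context size for this example
    cparams.type_k      = LM_GGML_TYPE_Q8_0; // illustrative quantized K cache; must satisfy the block-size assert
    cparams.type_v      = LM_GGML_TYPE_F16;  // default V cache type
    cparams.offload_kqv = true;              // keep the KV cache and KQV ops on the GPU when layers are offloaded

    return llama_new_context_with_model(model, cparams);
}
```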
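A similar sketch for the new GGUF metadata accessors declared in cpp/llama.h; the "general.architecture" key is only an example of a metadata key and is not something this patch guarantees to exist.

```cpp
// Sketch only: enumerating model metadata with the new llama_model_meta_* functions.
// Assumes a loaded llama_model * model.
#include <cstdio>
#include "llama.h"

void print_model_metadata(const llama_model * model) {
    char key[256];
    char val[256];

    const int n_kv = llama_model_meta_count(model);
    for (int i = 0; i < n_kv; ++i) {
        // both calls return the string length on success, -1 on failure,
        // and always null-terminate the output buffer
        if (llama_model_meta_key_by_index(model, i, key, sizeof(key)) < 0) continue;
        if (llama_model_meta_val_str_by_index(model, i, val, sizeof(val)) < 0) continue;
        printf("%s = %s\n", key, val);
    }

    // direct lookup by key name (example key)
    if (llama_model_meta_val_str(model, "general.architecture", val, sizeof(val)) >= 0) {
        printf("architecture: %s\n", val);
    }
}
```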
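The debug-only KV cache view added above follows an init / update / free pattern; a minimal sketch, assuming an existing llama_context and an arbitrary n_max_seq of 4:

```cpp
// Sketch only: inspecting the KV cache with the new view helpers (debug use, per the header comments).
#include <cstdio>
#include "llama.h"

void dump_kv_cache_stats(const llama_context * ctx) {
    llama_kv_cache_view view = llama_kv_cache_view_init(ctx, /*n_max_seq*/ 4);
    llama_kv_cache_view_update(ctx, &view); // allocates cells / cells_sequences on first use

    printf("cells: %d, used: %d, tokens: %d, longest free run: %d starting at %d\n",
           view.n_cells, view.used_cells, view.token_count,
           view.max_contiguous, view.max_contiguous_idx);

    // token_count counts a populated cell once per sequence id, while
    // llama_get_kv_cache_used_cells() reports cells with at least one sequence
    printf("tokens: %d, used cells: %d\n",
           llama_get_kv_cache_token_count(ctx),
           llama_get_kv_cache_used_cells(ctx));

    llama_kv_cache_view_free(&view);
}
```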
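Finally, a sketch of the new sampler-order control in cpp/sampling.h: samplers_sequence maps the letters k, f, y, p, m, t to top_k, tail_free, typical_p, top_p, min_p and temp (see sampler_queue above), and letters left out of the string are simply not applied. The shortened "mkt" sequence here is only an example.

```cpp
// Sketch only: customizing the sampler order via llama_sampling_params::samplers_sequence.
#include <cstdio>
#include "sampling.h"

void configure_sampling(llama_sampling_params & sparams) {
    sparams.top_k             = 40;
    sparams.min_p             = 0.05f;
    sparams.temp              = 0.80f;
    sparams.samplers_sequence = "mkt"; // min_p -> top_k -> temp

    // llama_sampling_order_print() reports the effective order, e.g. for startup logging
    printf("%s\n", llama_sampling_order_print(sparams).c_str());
}
```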