From 9df193626cd6e404791579a3ff131db042cf4364 Mon Sep 17 00:00:00 2001 From: jhen Date: Fri, 23 Aug 2024 14:12:28 +0800 Subject: [PATCH 1/4] feat: sync llama.cpp --- cpp/common.cpp | 199 ++- cpp/common.h | 33 +- cpp/ggml-aarch64.c | 2 + cpp/ggml-backend.c | 38 +- cpp/ggml-metal.h | 2 + cpp/ggml-metal.m | 77 +- cpp/ggml.c | 308 ++--- cpp/ggml.h | 22 +- cpp/grammar-parser.cpp | 3 + cpp/llama-impl.h | 15 + cpp/llama-sampling.cpp | 4 +- cpp/llama-vocab.cpp | 46 +- cpp/llama-vocab.h | 4 +- cpp/llama.cpp | 2690 +++++++++++++++++++++++++----------- cpp/llama.h | 25 +- llama.cpp | 2 +- scripts/common.h.patch | 6 +- scripts/ggml-metal.m.patch | 6 +- scripts/ggml.c.patch | 6 +- scripts/llama.cpp.patch | 6 +- 20 files changed, 2379 insertions(+), 1115 deletions(-) diff --git a/cpp/common.cpp b/cpp/common.cpp index 9efc83e..725a557 100644 --- a/cpp/common.cpp +++ b/cpp/common.cpp @@ -83,6 +83,41 @@ char const *LLAMA_BUILD_TARGET = "unknown"; using json = nlohmann::ordered_json; +// +// Environment variable utils +// + +template +static typename std::enable_if::value, void>::type +get_env(std::string name, T & target) { + char * value = std::getenv(name.c_str()); + target = value ? std::string(value) : target; +} + +template +static typename std::enable_if::value && std::is_integral::value, void>::type +get_env(std::string name, T & target) { + char * value = std::getenv(name.c_str()); + target = value ? std::stoi(value) : target; +} + +template +static typename std::enable_if::value, void>::type +get_env(std::string name, T & target) { + char * value = std::getenv(name.c_str()); + target = value ? std::stof(value) : target; +} + +template +static typename std::enable_if::value, void>::type +get_env(std::string name, T & target) { + char * value = std::getenv(name.c_str()); + if (value) { + std::string val(value); + target = val == "1" || val == "true"; + } +} + // // CPU utils // @@ -116,8 +151,34 @@ int32_t cpu_get_num_physical_cores() { if (result == 0) { return num_physical_cores; } -#elif defined(_WIN32) - //TODO: Implement +#elif defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later + // TODO: windows + arm64 + mingw64 + unsigned int n_threads_win = std::thread::hardware_concurrency(); + unsigned int default_threads = n_threads_win > 0 ? (n_threads_win <= 4 ? n_threads_win : n_threads_win / 2) : 4; + + DWORD buffer_size = 0; + if (!GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &buffer_size)) { + if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) { + return default_threads; + } + } + + std::vector buffer(buffer_size); + if (!GetLogicalProcessorInformationEx(RelationProcessorCore, reinterpret_cast(buffer.data()), &buffer_size)) { + return default_threads; + } + + int32_t num_physical_cores = 0; + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info = reinterpret_cast(buffer.data()); + while (buffer_size > 0) { + if (info->Relationship == RelationProcessorCore) { + num_physical_cores += info->Processor.GroupCount; + } + buffer_size -= info->Size; + info = reinterpret_cast(reinterpret_cast(info) + info->Size); + } + + return num_physical_cores > 0 ? num_physical_cores : default_threads; #endif unsigned int n_threads = std::thread::hardware_concurrency(); return n_threads > 0 ? (n_threads <= 4 ? 
n_threads : n_threads / 2) : 4; @@ -200,12 +261,6 @@ int32_t cpu_get_num_math() { // CLI argument parsing // -void gpt_params_handle_hf_token(gpt_params & params) { - if (params.hf_token.empty() && std::getenv("HF_TOKEN")) { - params.hf_token = std::getenv("HF_TOKEN"); - } -} - void gpt_params_handle_model_default(gpt_params & params) { if (!params.hf_repo.empty()) { // short-hand to avoid specifying --hf-file -> default it to --model @@ -253,7 +308,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { gpt_params_handle_model_default(params); - gpt_params_handle_hf_token(params); + if (params.hf_token.empty()) { + get_env("HF_TOKEN", params.hf_token); + } if (params.escape) { string_process_escapes(params.prompt); @@ -273,6 +330,25 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { return true; } +void gpt_params_parse_from_env(gpt_params & params) { + // we only care about server-related params for now + get_env("LLAMA_ARG_MODEL", params.model); + get_env("LLAMA_ARG_THREADS", params.n_threads); + get_env("LLAMA_ARG_CTX_SIZE", params.n_ctx); + get_env("LLAMA_ARG_N_PARALLEL", params.n_parallel); + get_env("LLAMA_ARG_BATCH", params.n_batch); + get_env("LLAMA_ARG_UBATCH", params.n_ubatch); + get_env("LLAMA_ARG_N_GPU_LAYERS", params.n_gpu_layers); + get_env("LLAMA_ARG_THREADS_HTTP", params.n_threads_http); + get_env("LLAMA_ARG_CHAT_TEMPLATE", params.chat_template); + get_env("LLAMA_ARG_N_PREDICT", params.n_predict); + get_env("LLAMA_ARG_ENDPOINT_METRICS", params.endpoint_metrics); + get_env("LLAMA_ARG_ENDPOINT_SLOTS", params.endpoint_slots); + get_env("LLAMA_ARG_EMBEDDINGS", params.embedding); + get_env("LLAMA_ARG_FLASH_ATTN", params.flash_attn); + get_env("LLAMA_ARG_DEFRAG_THOLD", params.defrag_thold); +} + bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { const auto params_org = params; // the example can modify the default params @@ -690,14 +766,24 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } if (arg == "--lora") { CHECK_ARG - params.lora_adapter.emplace_back(argv[i], 1.0f); + params.lora_adapters.push_back({ + std::string(argv[i]), + 1.0, + }); return true; } if (arg == "--lora-scaled") { CHECK_ARG - const char* lora_adapter = argv[i]; + std::string lora_adapter = argv[i]; CHECK_ARG - params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i])); + params.lora_adapters.push_back({ + lora_adapter, + std::stof(argv[i]), + }); + return true; + } + if (arg == "--lora-init-without-apply") { + params.lora_init_without_apply = true; return true; } if (arg == "--control-vector") { @@ -1660,6 +1746,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" }); options.push_back({ "server", "-sps, --slot-prompt-similarity SIMILARITY", "how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity }); + options.push_back({ "server", " --lora-init-without-apply", "load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? 
"enabled" : "disabled"}); #ifndef LOG_DISABLE_LOGS options.push_back({ "logging" }); @@ -1722,7 +1809,13 @@ std::string gpt_params_get_system_info(const gpt_params & params) { if (params.n_threads_batch != -1) { os << " (n_threads_batch = " << params.n_threads_batch << ")"; } +#if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later + // TODO: windows + arm64 + mingw64 + DWORD logicalProcessorCount = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS); + os << " / " << logicalProcessorCount << " | " << llama_print_system_info(); +#else os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info(); +#endif return os.str(); } @@ -1772,6 +1865,17 @@ std::string string_get_sortable_timestamp() { return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns); } +void string_replace_all(std::string & s, const std::string & search, const std::string & replace) { + if (search.empty()) { + return; // Avoid infinite loop if 'search' is an empty string + } + size_t pos = 0; + while ((pos = s.find(search, pos)) != std::string::npos) { + s.replace(pos, search.length(), replace); + pos += replace.length(); + } +} + void string_process_escapes(std::string & input) { std::size_t input_len = input.length(); std::size_t output_idx = 0; @@ -2045,8 +2149,8 @@ std::string fs_get_cache_file(const std::string & filename) { // // Model utils // - -std::tuple llama_init_from_gpt_params(gpt_params & params) { +struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { + llama_init_result iparams; auto mparams = llama_model_params_from_gpt_params(params); llama_model * model = nullptr; @@ -2061,7 +2165,7 @@ std::tuple llama_init_from_gpt_par if (model == NULL) { fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); - return std::make_tuple(nullptr, nullptr); + return iparams; } auto cparams = llama_context_params_from_gpt_params(params); @@ -2070,7 +2174,7 @@ std::tuple llama_init_from_gpt_par if (lctx == NULL) { fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str()); llama_free_model(model); - return std::make_tuple(nullptr, nullptr); + return iparams; } if (!params.control_vectors.empty()) { @@ -2081,7 +2185,7 @@ std::tuple llama_init_from_gpt_par if (cvec.n_embd == -1) { llama_free(lctx); llama_free_model(model); - return std::make_tuple(nullptr, nullptr); + return iparams; } int err = llama_control_vector_apply(lctx, @@ -2093,21 +2197,26 @@ std::tuple llama_init_from_gpt_par if (err) { llama_free(lctx); llama_free_model(model); - return std::make_tuple(nullptr, nullptr); + return iparams; } } - for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) { - const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]); - float lora_scale = std::get<1>(params.lora_adapter[i]); - auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str()); - if (adapter == nullptr) { - fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); + // load and optionally apply lora adapters + for (auto & la : params.lora_adapters) { + llama_lora_adapter_container loaded_la; + loaded_la.path = la.path; + loaded_la.scale = la.scale; + loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str()); + if (loaded_la.adapter == nullptr) { + fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str()); llama_free(lctx); llama_free_model(model); - return std::make_tuple(nullptr, nullptr); + return iparams; 
} - llama_lora_adapter_set(lctx, adapter, lora_scale); + iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters + } + if (!params.lora_init_without_apply) { + llama_lora_adapters_apply(lctx, iparams.lora_adapters); } if (params.ignore_eos) { @@ -2135,13 +2244,26 @@ std::tuple llama_init_from_gpt_par tmp.clear(); tmp.push_back(decoder_start_token_id); } - llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0)); + if (llama_model_has_decoder(model)) { + llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0)); + } llama_kv_cache_clear(lctx); llama_synchronize(lctx); llama_reset_timings(lctx); } - return std::make_tuple(model, lctx); + iparams.model = model; + iparams.context = lctx; + return iparams; +} + +void llama_lora_adapters_apply(struct llama_context * ctx, std::vector & lora_adapters) { + llama_lora_adapter_clear(ctx); + for (auto & la : lora_adapters) { + if (la.scale != 0.0f) { + llama_lora_adapter_set(ctx, la.adapter, la.scale); + } + } } struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) { @@ -2668,12 +2790,6 @@ std::string llama_detokenize(llama_context * ctx, const std::vector return text; } -bool llama_should_add_bos_token(const llama_model * model) { - const int add_bos = llama_add_bos_token(model); - - return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM); -} - // // Chat template utils // @@ -3166,19 +3282,18 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l } fprintf(stream, "lora:\n"); - for (std::tuple la : params.lora_adapter) { - if (std::get<1>(la) != 1.0f) { - continue; + for (auto & la : params.lora_adapters) { + if (la.scale == 1.0f) { + fprintf(stream, " - %s\n", la.path.c_str()); } - fprintf(stream, " - %s\n", std::get<0>(la).c_str()); } fprintf(stream, "lora_scaled:\n"); - for (std::tuple la : params.lora_adapter) { - if (std::get<1>(la) == 1.0f) { - continue; + for (auto & la : params.lora_adapters) { + if (la.scale != 1.0f) { + fprintf(stream, " - %s: %f\n", la.path.c_str(), la.scale); } - fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la)); } + fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false"); fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu); fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep); fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat); diff --git a/cpp/common.h b/cpp/common.h index f905274..18b027b 100644 --- a/cpp/common.h +++ b/cpp/common.h @@ -33,6 +33,15 @@ #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf" +struct llama_lora_adapter_info { + std::string path; + float scale; +}; + +struct llama_lora_adapter_container : llama_lora_adapter_info { + struct llama_lora_adapter * adapter; +}; + // build info extern int LLAMA_BUILD_NUMBER; extern char const * LLAMA_COMMIT; @@ -137,8 +146,8 @@ struct gpt_params { std::vector antiprompt; // strings upon which more user input is prompted (a.k.a. 
reverse prompts) std::vector kv_overrides; - // TODO: avoid tuple, use struct - std::vector> lora_adapter; // lora adapter path with user defined scale + bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply) + std::vector lora_adapters; // lora adapter path with user defined scale std::vector control_vectors; // control vector with user defined scale @@ -269,7 +278,7 @@ struct gpt_params { std::string lora_outfile = "ggml-lora-merged-f16.gguf"; }; -void gpt_params_handle_hf_token(gpt_params & params); +void gpt_params_parse_from_env(gpt_params & params); void gpt_params_handle_model_default(gpt_params & params); bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params); @@ -288,6 +297,8 @@ std::vector string_split(std::string input, char separator); std::string string_strip(const std::string & str); std::string string_get_sortable_timestamp(); +void string_replace_all(std::string & s, const std::string & search, const std::string & replace); + template static std::vector string_split(const std::string & str, char delim) { std::vector values; @@ -319,8 +330,13 @@ std::string fs_get_cache_file(const std::string & filename); // Model utils // -// TODO: avoid tuplue, use struct -std::tuple llama_init_from_gpt_params(gpt_params & params); +struct llama_init_result { + struct llama_model * model = nullptr; + struct llama_context * context = nullptr; + std::vector lora_adapters; +}; + +struct llama_init_result llama_init_from_gpt_params(gpt_params & params); struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params); struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params); @@ -328,6 +344,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params); struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params); +// clear LoRA adapters from context, then apply new list of adapters +void llama_lora_adapters_apply(struct llama_context * ctx, std::vector & lora_adapters); + // Batch utils void llama_batch_clear(struct llama_batch & batch); @@ -372,10 +391,6 @@ std::string llama_detokenize( const std::vector & tokens, bool special = true); -// Uses the value from the model metadata if possible, otherwise -// defaults to true when model type is SPM, otherwise false. 
-bool llama_should_add_bos_token(const llama_model * model); - // // Chat template utils // diff --git a/cpp/ggml-aarch64.c b/cpp/ggml-aarch64.c index b6b05a4..9c507d4 100644 --- a/cpp/ggml-aarch64.c +++ b/cpp/ggml-aarch64.c @@ -16,6 +16,8 @@ #if defined(__GNUC__) #pragma GCC diagnostic ignored "-Woverlength-strings" +#elif defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data #endif #define UNUSED LM_GGML_UNUSED diff --git a/cpp/ggml-backend.c b/cpp/ggml-backend.c index 6853e08..12c540c 100644 --- a/cpp/ggml-backend.c +++ b/cpp/ggml-backend.c @@ -351,15 +351,10 @@ void lm_ggml_backend_tensor_copy_async(lm_ggml_backend_t backend_src, lm_ggml_ba } // an async copy would normally happen after all the queued operations on both backends are completed - // sync src, set_async dst - if (lm_ggml_backend_buffer_is_host(src->buffer)) { - lm_ggml_backend_synchronize(backend_src); - lm_ggml_backend_tensor_set_async(backend_dst, dst, src->data, 0, lm_ggml_nbytes(src)); - } else { - lm_ggml_backend_synchronize(backend_src); - lm_ggml_backend_tensor_copy(src, dst); - lm_ggml_backend_synchronize(backend_dst); - } + // to simulate the same behavior, we need to synchronize both backends first, and do a blocking copy + lm_ggml_backend_synchronize(backend_src); + lm_ggml_backend_synchronize(backend_dst); + lm_ggml_backend_tensor_copy(src, dst); } // events @@ -1023,10 +1018,6 @@ static bool lm_ggml_is_view_op(enum lm_ggml_op op) { #define LM_GGML_SCHED_MAX_BACKENDS 16 #endif -#ifndef LM_GGML_SCHED_MAX_SPLITS -#define LM_GGML_SCHED_MAX_SPLITS 2048 -#endif - #ifndef LM_GGML_SCHED_MAX_SPLIT_INPUTS #define LM_GGML_SCHED_MAX_SPLIT_INPUTS LM_GGML_MAX_SRC #endif @@ -1130,7 +1121,8 @@ static int lm_ggml_backend_sched_backend_from_buffer(lm_ggml_backend_sched_t sch } #if 0 -static char causes[LM_GGML_DEFAULT_GRAPH_SIZE*16 + LM_GGML_SCHED_MAX_SPLITS*LM_GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only +#define LM_GGML_SCHED_MAX_SPLITS_DEBUG 4096 +static char causes[LM_GGML_DEFAULT_GRAPH_SIZE*16 + LM_GGML_SCHED_MAX_SPLITS_DEBUG*LM_GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only #define SET_CAUSE(node, ...) 
sprintf(causes[hash_id(node)], __VA_ARGS__) #define GET_CAUSE(node) causes[hash_id(node)] #else @@ -1554,7 +1546,6 @@ static void lm_ggml_backend_sched_split_graph(lm_ggml_backend_sched_t sched, str sched->splits = realloc(sched->splits, sched->splits_capacity * sizeof(struct lm_ggml_backend_sched_split)); LM_GGML_ASSERT(sched->splits != NULL); } - LM_GGML_ASSERT(i_split < LM_GGML_SCHED_MAX_SPLITS); split = &sched->splits[i_split]; split->backend_id = node_backend_id; split->i_start = i; @@ -1782,7 +1773,17 @@ static enum lm_ggml_status lm_ggml_backend_sched_compute_splits(lm_ggml_backend_ } else { lm_ggml_backend_synchronize(split_backend); } - lm_ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy); + // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events + // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface + if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) { + lm_ggml_backend_synchronize(input_backend); + if (sched->events[split_backend_id][sched->cur_copy] != NULL) { + lm_ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]); + } else { + lm_ggml_backend_synchronize(split_backend); + } + lm_ggml_backend_tensor_copy(input, input_cpy); + } } } @@ -1860,13 +1861,14 @@ lm_ggml_backend_sched_t lm_ggml_backend_sched_new( sched->hv_tensor_backend_ids = malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0])); sched->hv_tensor_copies = malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct lm_ggml_tensor *)); - const size_t nodes_size = graph_size + LM_GGML_SCHED_MAX_SPLITS*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2; + const size_t lm_ggml_sched_max_splits = graph_size; // at most there is one split for each node in the graph + const size_t nodes_size = graph_size + lm_ggml_sched_max_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2; sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0])); sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0])); sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0])); sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0])); - sched->context_buffer_size = LM_GGML_SCHED_MAX_SPLITS*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct lm_ggml_tensor) + lm_ggml_graph_overhead_custom(graph_size, false); + sched->context_buffer_size = lm_ggml_sched_max_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct lm_ggml_tensor) + lm_ggml_graph_overhead_custom(graph_size, false); sched->context_buffer = malloc(sched->context_buffer_size); const int initial_splits_capacity = 16; diff --git a/cpp/ggml-metal.h b/cpp/ggml-metal.h index b8d0909..7bc21a9 100644 --- a/cpp/ggml-metal.h +++ b/cpp/ggml-metal.h @@ -50,6 +50,8 @@ LM_GGML_API LM_GGML_CALL lm_ggml_backend_buffer_t lm_ggml_backend_metal_buffer_f LM_GGML_API void lm_ggml_backend_metal_set_n_cb(lm_ggml_backend_t backend, int n_cb); +LM_GGML_API void lm_ggml_backend_metal_set_abort_callback(lm_ggml_backend_t backend, lm_ggml_abort_callback abort_callback, void * user_data); + LM_GGML_API LM_GGML_CALL lm_ggml_backend_buffer_type_t lm_ggml_backend_metal_buffer_type(void); // helper to check if the device supports a specific family diff --git a/cpp/ggml-metal.m 
b/cpp/ggml-metal.m index f6a8d06..8568c96 100644 --- a/cpp/ggml-metal.m +++ b/cpp/ggml-metal.m @@ -210,7 +210,7 @@ LM_GGML_METAL_KERNEL_TYPE_COUNT }; -struct lm_ggml_metal_context { +struct lm_ggml_backend_metal_context { int n_cb; id device; @@ -224,6 +224,10 @@ bool support_simdgroup_mm; bool should_capture_next_compute; + + // abort lm_ggml_metal_graph_compute if callback returns true + lm_ggml_abort_callback abort_callback; + void * abort_callback_data; }; // MSL code @@ -289,7 +293,7 @@ static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char * format, return data; } -static struct lm_ggml_metal_context * lm_ggml_metal_init(int n_cb) { +static struct lm_ggml_backend_metal_context * lm_ggml_metal_init(int n_cb) { LM_GGML_METAL_LOG_INFO("%s: allocating\n", __func__); #if TARGET_OS_OSX && !LM_GGML_METAL_NDEBUG @@ -306,7 +310,7 @@ static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char * format, LM_GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]); // Configure context - struct lm_ggml_metal_context * ctx = malloc(sizeof(struct lm_ggml_metal_context)); + struct lm_ggml_backend_metal_context * ctx = calloc(1, sizeof(struct lm_ggml_backend_metal_context)); ctx->device = device; ctx->n_cb = MIN(n_cb, LM_GGML_METAL_MAX_BUFFERS); ctx->queue = [ctx->device newCommandQueue]; @@ -668,7 +672,7 @@ static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char * format, return ctx; } -static void lm_ggml_metal_free(struct lm_ggml_metal_context * ctx) { +static void lm_ggml_metal_free(struct lm_ggml_backend_metal_context * ctx) { LM_GGML_METAL_LOG_INFO("%s: deallocating\n", __func__); for (int i = 0; i < LM_GGML_METAL_KERNEL_TYPE_COUNT; ++i) { @@ -734,7 +738,7 @@ static void lm_ggml_metal_free(struct lm_ggml_metal_context * ctx) { return nil; } -static bool lm_ggml_metal_supports_op(const struct lm_ggml_metal_context * ctx, const struct lm_ggml_tensor * op) { +static bool lm_ggml_metal_supports_op(const struct lm_ggml_backend_metal_context * ctx, const struct lm_ggml_tensor * op) { for (size_t i = 0, n = 3; i < n; ++i) { if (op->src[i] != NULL && op->src[i]->type == LM_GGML_TYPE_BF16) { return false; @@ -845,7 +849,7 @@ static bool lm_ggml_metal_supports_op(const struct lm_ggml_metal_context * ctx, } static enum lm_ggml_status lm_ggml_metal_graph_compute( - struct lm_ggml_metal_context * ctx, + struct lm_ggml_backend_metal_context * ctx, struct lm_ggml_cgraph * gf) { @autoreleasepool { @@ -878,8 +882,11 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute( id command_buffer = [ctx->queue commandBufferWithUnretainedReferences]; command_buffer_builder[cb_idx] = command_buffer; - // enqueue the command buffers in order to specify their execution order - [command_buffer enqueue]; + // always enqueue the first two command buffers + // enqueue all of the command buffers if we don't need to abort + if (cb_idx < 2 || ctx->abort_callback == NULL) { + [command_buffer enqueue]; + } } const id *command_buffers = command_buffer_builder; @@ -2229,10 +2236,8 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute( LM_GGML_ASSERT(ne00 % 4 == 0); LM_GGML_ASSERT(lm_ggml_is_contiguous(src0)); - //float eps; - //memcpy(&eps, dst->op_params, sizeof(float)); - - const float eps = 1e-6f; // TODO: temporarily hardcoded + float eps; + memcpy(&eps, dst->op_params + 1, sizeof(float)); const int32_t n_groups = ((int32_t *) dst->op_params)[0]; @@ -2308,7 +2313,7 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute( memcpy(&beta_fast, 
(int32_t *) dst->op_params + 9, sizeof(float)); memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); - const bool is_neox = mode & 2; + const bool is_neox = mode & LM_GGML_ROPE_TYPE_NEOX; id pipeline = nil; @@ -2829,7 +2834,9 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute( [encoder endEncoding]; - [command_buffer commit]; + if (cb_idx < 2 || ctx->abort_callback == NULL) { + [command_buffer commit]; + } }); // Wait for completion and check status of each command buffer @@ -2849,6 +2856,23 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute( return LM_GGML_STATUS_FAILED; } + + id next_buffer = (i + 1 < n_cb ? command_buffers[i + 1] : nil); + if (!next_buffer) { + continue; + } + + bool next_queued = ([next_buffer status] != MTLCommandBufferStatusNotEnqueued); + if (next_queued) { + continue; + } + + if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) { + LM_GGML_METAL_LOG_INFO("%s: command buffer %d aborted", __func__, i); + return LM_GGML_STATUS_ABORTED; + } + + [next_buffer commit]; } if (should_capture) { @@ -3152,7 +3176,7 @@ LM_GGML_CALL lm_ggml_backend_buffer_t lm_ggml_backend_metal_buffer_from_ptr(void } LM_GGML_CALL static void lm_ggml_backend_metal_free(lm_ggml_backend_t backend) { - struct lm_ggml_metal_context * ctx = (struct lm_ggml_metal_context *)backend->context; + struct lm_ggml_backend_metal_context * ctx = (struct lm_ggml_backend_metal_context *)backend->context; lm_ggml_metal_free(ctx); free(backend); } @@ -3164,13 +3188,13 @@ LM_GGML_CALL static lm_ggml_backend_buffer_type_t lm_ggml_backend_metal_get_defa } LM_GGML_CALL static enum lm_ggml_status lm_ggml_backend_metal_graph_compute(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) { - struct lm_ggml_metal_context * metal_ctx = (struct lm_ggml_metal_context *)backend->context; + struct lm_ggml_backend_metal_context * metal_ctx = (struct lm_ggml_backend_metal_context *)backend->context; return lm_ggml_metal_graph_compute(metal_ctx, cgraph); } LM_GGML_CALL static bool lm_ggml_backend_metal_supports_op(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op) { - struct lm_ggml_metal_context * metal_ctx = (struct lm_ggml_metal_context *)backend->context; + struct lm_ggml_backend_metal_context * metal_ctx = (struct lm_ggml_backend_metal_context *)backend->context; return lm_ggml_metal_supports_op(metal_ctx, op); } @@ -3215,9 +3239,9 @@ static lm_ggml_guid_t lm_ggml_backend_metal_guid(void) { } lm_ggml_backend_t lm_ggml_backend_metal_init(void) { - struct lm_ggml_metal_context * ctx = lm_ggml_metal_init(LM_GGML_DEFAULT_N_THREADS); - + struct lm_ggml_backend_metal_context * ctx = lm_ggml_metal_init(LM_GGML_DEFAULT_N_THREADS); if (ctx == NULL) { + LM_GGML_METAL_LOG_ERROR("%s: error: failed to allocate context\n", __func__); return NULL; } @@ -3239,15 +3263,24 @@ bool lm_ggml_backend_is_metal(lm_ggml_backend_t backend) { void lm_ggml_backend_metal_set_n_cb(lm_ggml_backend_t backend, int n_cb) { LM_GGML_ASSERT(lm_ggml_backend_is_metal(backend)); - struct lm_ggml_metal_context * ctx = (struct lm_ggml_metal_context *)backend->context; + struct lm_ggml_backend_metal_context * ctx = (struct lm_ggml_backend_metal_context *)backend->context; ctx->n_cb = MIN(n_cb, LM_GGML_METAL_MAX_BUFFERS); } +void lm_ggml_backend_metal_set_abort_callback(lm_ggml_backend_t backend, lm_ggml_abort_callback abort_callback, void * user_data) { + LM_GGML_ASSERT(lm_ggml_backend_is_metal(backend)); + + struct lm_ggml_backend_metal_context * ctx = (struct lm_ggml_backend_metal_context 
*)backend->context; + + ctx->abort_callback = abort_callback; + ctx->abort_callback_data = user_data; +} + bool lm_ggml_backend_metal_supports_family(lm_ggml_backend_t backend, int family) { LM_GGML_ASSERT(lm_ggml_backend_is_metal(backend)); - struct lm_ggml_metal_context * ctx = (struct lm_ggml_metal_context *)backend->context; + struct lm_ggml_backend_metal_context * ctx = (struct lm_ggml_backend_metal_context *)backend->context; return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)]; } @@ -3255,7 +3288,7 @@ bool lm_ggml_backend_metal_supports_family(lm_ggml_backend_t backend, int family void lm_ggml_backend_metal_capture_next_compute(lm_ggml_backend_t backend) { LM_GGML_ASSERT(lm_ggml_backend_is_metal(backend)); - struct lm_ggml_metal_context * ctx = (struct lm_ggml_metal_context *)backend->context; + struct lm_ggml_backend_metal_context * ctx = (struct lm_ggml_backend_metal_context *)backend->context; ctx->should_capture_next_compute = true; } diff --git a/cpp/ggml.c b/cpp/ggml.c index 84740ab..f125d44 100644 --- a/cpp/ggml.c +++ b/cpp/ggml.c @@ -56,6 +56,9 @@ int lm_ggml_sve_cnt_b = 0; // disable POSIX deprecation warnings // these functions are never going away, anyway #pragma warning(disable: 4996) + +// unreachable code because of multiple instances of code after LM_GGML_ABORT +#pragma warning(disable: 4702) #endif #if defined(_WIN32) @@ -3721,7 +3724,8 @@ static struct lm_ggml_tensor * lm_ggml_new_tensor_impl( struct lm_ggml_tensor * view_src, size_t view_offs) { - assert(n_dims >= 1 && n_dims <= LM_GGML_MAX_DIMS); + LM_GGML_ASSERT(type >= 0 && type < LM_GGML_TYPE_COUNT); + LM_GGML_ASSERT(n_dims >= 1 && n_dims <= LM_GGML_MAX_DIMS); // find the base tensor and absolute offset if (view_src != NULL && view_src->view_src != NULL) { @@ -5374,6 +5378,7 @@ static struct lm_ggml_tensor * lm_ggml_group_norm_impl( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, int n_groups, + float eps, bool inplace) { bool is_node = false; @@ -5384,7 +5389,8 @@ static struct lm_ggml_tensor * lm_ggml_group_norm_impl( struct lm_ggml_tensor * result = inplace ? lm_ggml_view_tensor(ctx, a) : lm_ggml_dup_tensor(ctx, a); - result->op_params[0] = n_groups; + lm_ggml_set_op_params_i32(result, 0, n_groups); + lm_ggml_set_op_params_f32(result, 1, eps); result->op = LM_GGML_OP_GROUP_NORM; result->grad = is_node ? 
lm_ggml_dup_tensor(ctx, result) : NULL; @@ -5396,15 +5402,17 @@ static struct lm_ggml_tensor * lm_ggml_group_norm_impl( struct lm_ggml_tensor * lm_ggml_group_norm( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, - int n_groups) { - return lm_ggml_group_norm_impl(ctx, a, n_groups, false); + int n_groups, + float eps) { + return lm_ggml_group_norm_impl(ctx, a, n_groups, eps, false); } struct lm_ggml_tensor * lm_ggml_group_norm_inplace( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, - int n_groups) { - return lm_ggml_group_norm_impl(ctx, a, n_groups, true); + int n_groups, + float eps) { + return lm_ggml_group_norm_impl(ctx, a, n_groups, eps, true); } // lm_ggml_mul_mat @@ -7221,43 +7229,34 @@ struct lm_ggml_tensor * lm_ggml_flash_attn_back( struct lm_ggml_tensor * lm_ggml_ssm_conv( struct lm_ggml_context * ctx, - struct lm_ggml_tensor * s, - struct lm_ggml_tensor * x, - struct lm_ggml_tensor * c, - struct lm_ggml_tensor * sq) { - LM_GGML_ASSERT(lm_ggml_is_3d(s)); - LM_GGML_ASSERT(lm_ggml_is_matrix(x)); + struct lm_ggml_tensor * sx, + struct lm_ggml_tensor * c) { + LM_GGML_ASSERT(lm_ggml_is_3d(sx)); LM_GGML_ASSERT(lm_ggml_is_matrix(c)); - LM_GGML_ASSERT(lm_ggml_is_matrix(sq)); - LM_GGML_ASSERT(sq->type == LM_GGML_TYPE_I32); - const int64_t d_conv = c->ne[0]; - const int64_t d_inner = c->ne[1]; - const int64_t n_tokens = x->ne[1]; - const int64_t n_kv = s->ne[2]; + const int64_t d_conv = c->ne[0]; + const int64_t d_inner = c->ne[1]; + const int64_t n_t = sx->ne[0] - d_conv + 1; // tokens per sequence + const int64_t n_s = sx->ne[2]; - LM_GGML_ASSERT( s->ne[0] == d_conv - 1); - LM_GGML_ASSERT( s->ne[1] == d_inner); - LM_GGML_ASSERT( x->ne[0] == d_inner); - LM_GGML_ASSERT(sq->ne[0] == n_kv); - LM_GGML_ASSERT(sq->ne[1] == n_tokens); + // TODO: maybe support other strides than 1? + LM_GGML_ASSERT(sx->ne[0] == d_conv - 1 + n_t); + LM_GGML_ASSERT(sx->ne[1] == d_inner); + LM_GGML_ASSERT(n_t >= 0); bool is_node = false; - if (s->grad || x->grad || c->grad || sq->grad) { + if (sx->grad || c->grad) { LM_GGML_ABORT("fatal error"); // TODO: implement is_node = true; } - // 2-in-1 concatenated x and conv_states, {d_inner, n_tokens} with {d_conv, d_inner, n_kv} - struct lm_ggml_tensor * result = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_F32, (d_inner*n_tokens) + (d_conv*d_inner*n_kv)); + struct lm_ggml_tensor * result = lm_ggml_new_tensor_3d(ctx, LM_GGML_TYPE_F32, d_inner, n_t, n_s); result->op = LM_GGML_OP_SSM_CONV; result->grad = is_node ? 
lm_ggml_dup_tensor(ctx, result) : NULL; - result->src[0] = s; - result->src[1] = x; - result->src[2] = c; - result->src[3] = sq; + result->src[0] = sx; + result->src[1] = c; return result; } @@ -7271,39 +7270,42 @@ struct lm_ggml_tensor * lm_ggml_ssm_scan( struct lm_ggml_tensor * dt, struct lm_ggml_tensor * A, struct lm_ggml_tensor * B, - struct lm_ggml_tensor * C, - struct lm_ggml_tensor * sq) { + struct lm_ggml_tensor * C) { LM_GGML_ASSERT(lm_ggml_is_contiguous(s)); LM_GGML_ASSERT(lm_ggml_is_contiguous(x)); LM_GGML_ASSERT(lm_ggml_is_contiguous(dt)); LM_GGML_ASSERT(lm_ggml_is_contiguous(A)); - LM_GGML_ASSERT(sq->type == LM_GGML_TYPE_I32); + LM_GGML_ASSERT(lm_ggml_is_matrix(A)); + LM_GGML_ASSERT(lm_ggml_is_3d(B)); + LM_GGML_ASSERT(lm_ggml_is_3d(s)); LM_GGML_ASSERT(B->nb[0] == lm_ggml_type_size(B->type)); LM_GGML_ASSERT(C->nb[0] == lm_ggml_type_size(C->type)); LM_GGML_ASSERT(lm_ggml_are_same_shape(x, dt)); + LM_GGML_ASSERT(lm_ggml_are_same_shape(B, C)); { - const int64_t d_state = s->ne[0]; - const int64_t d_inner = s->ne[1]; - const int64_t n_tokens = x->ne[1]; + const int64_t d_state = s->ne[0]; + const int64_t d_inner = s->ne[1]; + const int64_t n_seq_tokens = x->ne[1]; + const int64_t n_seqs = x->ne[2]; + LM_GGML_ASSERT(s->ne[2] == n_seqs); LM_GGML_ASSERT(x->ne[0] == d_inner); LM_GGML_ASSERT(A->ne[0] == d_state); LM_GGML_ASSERT(A->ne[1] == d_inner); LM_GGML_ASSERT(B->ne[0] == d_state); - LM_GGML_ASSERT(B->ne[1] == n_tokens); - LM_GGML_ASSERT(C->ne[0] == d_state); - LM_GGML_ASSERT(C->ne[1] == n_tokens); + LM_GGML_ASSERT(B->ne[1] == n_seq_tokens); + LM_GGML_ASSERT(B->ne[2] == n_seqs); } bool is_node = false; - if (s->grad || x->grad || dt->grad || A->grad || B->grad || C->grad || sq->grad) { + if (s->grad || x->grad || dt->grad || A->grad || B->grad || C->grad) { LM_GGML_ABORT("fatal error"); // TODO: implement is_node = true; } - // 2-in-1 concatenated y and ssm_states, {d_inner, n_tokens} with {d_state, d_inner, n_kv} + // concatenated y + ssm_states struct lm_ggml_tensor * result = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_F32, lm_ggml_nelements(x) + lm_ggml_nelements(s)); result->op = LM_GGML_OP_SSM_SCAN; @@ -7314,7 +7316,6 @@ struct lm_ggml_tensor * lm_ggml_ssm_scan( result->src[3] = A; result->src[4] = B; result->src[5] = C; - result->src[6] = sq; return result; } @@ -10987,11 +10988,6 @@ static void lm_ggml_compute_forward_concat_f32( LM_GGML_TENSOR_BINARY_OP_LOCALS - // TODO: support for transposed / permuted tensors - LM_GGML_ASSERT(nb0 == sizeof(float)); - LM_GGML_ASSERT(nb00 == sizeof(float)); - LM_GGML_ASSERT(nb10 == sizeof(float)); - const int32_t dim = lm_ggml_get_op_params_i32(dst, 0); LM_GGML_ASSERT(dim >= 0 && dim < 4); @@ -12095,10 +12091,11 @@ static void lm_ggml_compute_forward_group_norm_f32( LM_GGML_TENSOR_UNARY_OP_LOCALS - const float eps = 1e-6f; // TODO: make this a parameter - // TODO: optimize + float eps; + memcpy(&eps, dst->op_params + 1, sizeof(float)); + int n_channels = src0->ne[2]; int n_groups = dst->op_params[0]; int n_channels_per_group = (n_channels + n_groups - 1) / n_groups; @@ -14085,7 +14082,7 @@ static void lm_ggml_compute_forward_rope_f32( float corr_dims[2]; lm_ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); - const bool is_neox = mode & 2; + const bool is_neox = mode & LM_GGML_ROPE_TYPE_NEOX; const float * freq_factors = NULL; if (src2 != NULL) { @@ -14210,7 +14207,7 @@ static void lm_ggml_compute_forward_rope_f16( float corr_dims[2]; lm_ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, 
beta_slow, corr_dims); - const bool is_neox = mode & 2; + const bool is_neox = mode & LM_GGML_ROPE_TYPE_NEOX; const float * freq_factors = NULL; if (src2 != NULL) { @@ -15773,27 +15770,22 @@ static void lm_ggml_compute_forward_flash_attn_back( static void lm_ggml_compute_forward_ssm_conv_f32( const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst) { - const struct lm_ggml_tensor * src0 = dst->src[0]; // conv_state - const struct lm_ggml_tensor * src1 = dst->src[1]; // x - const struct lm_ggml_tensor * src2 = dst->src[2]; // conv1d.weight - const struct lm_ggml_tensor * src3 = dst->src[3]; // state_seq + const struct lm_ggml_tensor * src0 = dst->src[0]; // conv_x + const struct lm_ggml_tensor * src1 = dst->src[1]; // conv1d.weight const int ith = params->ith; const int nth = params->nth; - const int nc = src2->ne[0]; // d_conv - const int nr = src0->ne[1]; // d_inner - const int n_t = src1->ne[1]; // n_tokens - const int n_kv = src0->ne[2]; // max number of sequences in the batch + const int nc = src1->ne[0]; // d_conv + const int ncs = src0->ne[0]; // d_conv - 1 + n_t + const int nr = src0->ne[1]; // d_inner + const int n_t = dst->ne[1]; // tokens per sequence + const int n_s = dst->ne[2]; // number of sequences in the batch - LM_GGML_ASSERT((nr*n_t) + (nc*nr*n_kv) == lm_ggml_nelements(dst)); + LM_GGML_ASSERT( dst->ne[0] == nr); LM_GGML_ASSERT(src0->nb[0] == sizeof(float)); LM_GGML_ASSERT(src1->nb[0] == sizeof(float)); - LM_GGML_ASSERT(src2->nb[0] == sizeof(float)); - LM_GGML_ASSERT(src3->nb[0] == sizeof(int32_t)); LM_GGML_ASSERT(src0->nb[1] == src0->ne[0]*sizeof(float)); - // for use with the destination state offset between sequences - LM_GGML_ASSERT(src2->nb[2] == src2->ne[1]*src2->ne[0]*sizeof(float)); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -15803,74 +15795,27 @@ static void lm_ggml_compute_forward_ssm_conv_f32( const int ir1 = MIN(ir0 + dr, nr); const int ir = ir1 - ir0; - if (n_kv > 1) { - // multiple sequences means it's hard to know when it's the first time a state is read, - // so copy them all over to the destination, just to be sure. 
- for (int i3 = 0; i3 < n_kv; ++i3) { - float * s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2])); - float * s = (float *) ((char *) dst->data + ir0*(src2->nb[1]) + i3*(src2->nb[2]) + nr*n_t*sizeof(float)); - // can't use memcpy because of d_conv vs d_conv - 1 - for (int i1 = 0; i1 < ir; ++i1) { - for (int i0 = 0; i0 < nc - 1; ++i0) { - // copy s0 to last (d_conv - 1) columns of s - s[1 + i0 + i1*nc] = s0[i0 + i1*(nc - 1)]; - } - } - } - } - - for (int i2 = 0; i2 < n_t; ++i2) { - int32_t * sq = (int32_t *) ((char *) src3->data + i2*(src3->nb[1])); // {n_kv, n_tokens} - float * x = (float *) ((char *) dst->data + ir0*sizeof(float) + i2*(nr*sizeof(float))); // {d_inner, n_tokens} - float * s = (float *) ((char *) dst->data + ir0*(src2->nb[1]) + sq[0]*(src2->nb[2]) + nr*n_t*sizeof(float)); // {d_conv, d_inner, n_kv} - float * s0; // {d_conv - 1, d_inner, n_kv} - float * x0 = (float *) ((char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1])); // {d_inner, n_tokens} - float * c = (float *) ((char *) src2->data + ir0*(src2->nb[1])); // {d_conv, d_inner} - int ne0s0; + for (int i3 = 0; i3 < n_s; ++i3) { + for (int i2 = 0; i2 < n_t; ++i2) { + // {d_conv - 1 + n_t, d_inner, n_seqs} + // sliding window + const float * s = (const float *) ((const char *) src0->data + ir0*(src0->nb[1]) + i2*(src0->nb[0]) + i3*(src0->nb[2])); // {d_conv, d_inner, n_s} + const float * c = (const float *) ((const char *) src1->data + ir0*(src1->nb[1])); // {d_conv, d_inner} + float * x = (float *) ((char *) dst->data + ir0*(dst->nb[0]) + i2*(dst->nb[1]) + i3*(dst->nb[2])); // {d_inner, n_t, n_s} - LM_GGML_ASSERT(0 <= sq[0] && sq[0] < n_kv); - - // avoid needing to copy the state for the first token - if (i2 == 0) { - s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1]) + sq[0]*(src0->nb[2])); // {d_conv - 1, d_inner, n_kv} - ne0s0 = src0->ne[0]; - } else { - // the source is the last (d_conv - 1) columns of the destination - s0 = s + 1; - ne0s0 = nc; - } - - // d_inner - for (int i1 = 0; i1 < ir; ++i1) { - // shift state left - for (int i0 = 0; i0 < nc - 1; ++i0) { - s[i0 + i1*nc] = s0[i0 + i1*ne0s0]; - } - // insert x on the last column - s[(nc - 1) + i1*nc] = x0[i1]; - } - - // handle copies when there are multiple output states - for (int i3 = 1; i3 < n_kv; ++i3) { - int32_t seq = sq[i3]; - if (0 <= seq && seq < n_kv) { - float * s1 = s + (seq - sq[0])*nc*nr; - memcpy(s1, s, nc*ir*sizeof(float)); - } else { - // stop at negative or too big seq_ids - break; - } - } + // TODO: transpose the output for smaller strides for big batches? 
+ // d_inner + for (int i1 = 0; i1 < ir; ++i1) { + // rowwise dot product + // NOTE: not using lm_ggml_vec_dot_f32, because its sum is in double precision + float sumf = 0.0f; - // it seems a little faster when this is separate from the state shift - for (int i1 = 0; i1 < ir; ++i1) { - // rowwise dot product - float sumf = 0.0f; - for (int i0 = 0; i0 < nc; ++i0) { - int i = i0 + i1*nc; - sumf += s[i] * c[i]; + // d_conv + for (int i0 = 0; i0 < nc; ++i0) { + sumf += s[i0 + i1*ncs] * c[i0 + i1*nc]; + } + x[i1] = sumf; } - x[i1] = sumf; } } } @@ -15901,15 +15846,14 @@ static void lm_ggml_compute_forward_ssm_scan_f32( const struct lm_ggml_tensor * src3 = dst->src[3]; // A const struct lm_ggml_tensor * src4 = dst->src[4]; // B const struct lm_ggml_tensor * src5 = dst->src[5]; // C - const struct lm_ggml_tensor * src6 = dst->src[6]; // sq const int ith = params->ith; const int nth = params->nth; - const int64_t nc = src0->ne[0]; // d_state - const int64_t nr = src0->ne[1]; // d_inner - const int64_t n_t = src1->ne[1]; // number of tokens in the batch - const int64_t n_kv = src0->ne[2]; // max number of sequences in the batch + const int64_t nc = src0->ne[0]; // d_state + const int64_t nr = src0->ne[1]; // d_inner + const int64_t n_t = src1->ne[1]; // number of tokens per sequence + const int64_t n_s = src0->ne[2]; // number of sequences in the batch LM_GGML_ASSERT(lm_ggml_nelements(src1) + lm_ggml_nelements(src0) == lm_ggml_nelements(dst)); LM_GGML_ASSERT(src0->nb[0] == sizeof(float)); @@ -15918,12 +15862,12 @@ static void lm_ggml_compute_forward_ssm_scan_f32( LM_GGML_ASSERT(src3->nb[0] == sizeof(float)); LM_GGML_ASSERT(src4->nb[0] == sizeof(float)); LM_GGML_ASSERT(src5->nb[0] == sizeof(float)); - // required for the dot product between s and C, and when copying the states + // required for the dot product between s and C LM_GGML_ASSERT(src0->nb[1] == src0->ne[0]*sizeof(float)); // required for per-sequence offsets for states LM_GGML_ASSERT(src0->nb[2] == src0->ne[0]*src0->ne[1]*sizeof(float)); - // required to get correct offset for state destination (i.e. src1->nb[2]) - LM_GGML_ASSERT(src1->nb[2] == src1->ne[0]*src1->ne[1]*sizeof(float)); + // required to get correct offset for state destination (i.e. src1->nb[3]) + LM_GGML_ASSERT(src1->nb[3] == src1->ne[0]*src1->ne[1]*src1->ne[2]*sizeof(float)); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -15933,64 +15877,36 @@ static void lm_ggml_compute_forward_ssm_scan_f32( const int ir1 = MIN(ir0 + dr, nr); const int ir = ir1 - ir0; - if (n_kv > 1) { - // it's hard to know if the source states have already been copied - // when there are multiple, so copy them already. 
- for (int i3 = 0; i3 < n_kv; ++i3) { - float * s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2])); - float * s = (float *) ((char *) dst->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]) + src1->nb[2]); - memcpy(s, s0, nc*ir*sizeof(float)); - } - } - - for (int i2 = 0; i2 < n_t; ++i2) { - int32_t * sq = (int32_t *) ((char *) src6->data + i2*(src6->nb[1])); // {n_kv, n_tokens} - float * y = (float *) ((char *) dst->data + ir0*(src1->nb[0]) + i2*(src1->nb[1])); // {d_inner, n_tokens} - float * s = (float *) ((char *) dst->data + ir0*(src0->nb[1]) + sq[0]*(src0->nb[2]) + src1->nb[2]); // {d_state, d_inner, n_kv} - float * s0; - float * x = (float *) ((char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1])); // {d_inner, n_tokens} - float * dt = (float *) ((char *) src2->data + ir0*(src2->nb[0]) + i2*(src2->nb[1])); // {d_inner, n_tokens} - float * A = (float *) ((char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner} - float * B = (float *) ((char *) src4->data + i2*(src4->nb[1])); // {d_state, n_tokens} - float * C = (float *) ((char *) src5->data + i2*(src5->nb[1])); // {d_state, n_tokens} - - LM_GGML_ASSERT(0 <= sq[0] && sq[0] < n_kv); - - // avoid needing to copy the state for the first token - if (i2 == 0) { - s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1]) + sq[0]*(src0->nb[2])); // {d_state, d_inner, n_kv} - } else { - // otherwise the source is the same as the destination - s0 = s; - } - - // d_inner - for (int i1 = 0; i1 < ir; ++i1) { - // ref: https://github.com/state-spaces/mamba/blob/34076d664838588a3c97727b263478ab9f621a07/mamba_ssm/ops/triton/selective_state_update.py#L78 - float dt_soft_plus = dt[i1] <= 20.0f ? log1pf(expf(dt[i1])) : dt[i1]; - float x_dt = x[i1] * dt_soft_plus; - float sumf = 0.0f; - // d_state - for (int i0 = 0; i0 < nc; ++i0) { - int i = i0 + i1*nc; - // state = prev_state * dA + dB * x - float state = (s0[i] * expf(dt_soft_plus * A[i])) + (B[i0] * x_dt); - // y = rowwise_dotprod(state, C) - sumf += state * C[i0]; - s[i] = state; - } - y[i1] = sumf; - } - - // handle copies when there are multiple output states - for (int i3 = 1; i3 < n_kv; ++i3) { - int32_t seq = sq[i3]; - if (0 <= seq && seq < n_kv) { - float * s1 = s + (seq - sq[0])*nc*nr; - memcpy(s1, s, nc*ir*sizeof(float)); - } else { - // stop at negative or too big seq_ids - break; + for (int i3 = 0; i3 < n_s; ++i3) { + for (int i2 = 0; i2 < n_t; ++i2) { + const float * s0 = (const float *) ((const char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2])); // {d_state, d_inner, n_s} + const float * x = (const float *) ((const char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s} + const float * dt = (const float *) ((const char *) src2->data + ir0*(src2->nb[0]) + i2*(src2->nb[1]) + i3*(src2->nb[2])); // {d_inner, n_t, n_s} + const float * A = (const float *) ((const char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner} + const float * B = (const float *) ((const char *) src4->data + i2*(src4->nb[1]) + i3*(src4->nb[2])); // {d_state, n_t, n_s} + const float * C = (const float *) ((const char *) src5->data + i2*(src5->nb[1]) + i3*(src5->nb[2])); // {d_state, n_t, n_s} + float * y = (float *) ((char *) dst->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s} + float * s = (float *) ((char *) dst->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]) + src1->nb[3]); // {d_state, d_inner, n_s} + + // use the output as the source for the next token-wise iterations + if (i2 > 0) { s0 = 
s; } + + // d_inner + for (int i1 = 0; i1 < ir; ++i1) { + // ref: https://github.com/state-spaces/mamba/blob/34076d664838588a3c97727b263478ab9f621a07/mamba_ssm/ops/triton/selective_state_update.py#L78 + float dt_soft_plus = dt[i1] <= 20.0f ? log1pf(expf(dt[i1])) : dt[i1]; + float x_dt = x[i1] * dt_soft_plus; + float sumf = 0.0f; + // d_state + for (int i0 = 0; i0 < nc; ++i0) { + int i = i0 + i1*nc; + // state = prev_state * dA + dB * x + float state = (s0[i] * expf(dt_soft_plus * A[i])) + (B[i0] * x_dt); + // y = rowwise_dotprod(state, C) + sumf += state * C[i0]; + s[i] = state; + } + y[i1] = sumf; } } } @@ -21120,7 +21036,7 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg (int64_t) info->ne[2] * (int64_t) info->ne[3]; - if (ne % lm_ggml_blck_size(info->type) != 0) { + if (lm_ggml_blck_size(info->type) == 0 || ne % lm_ggml_blck_size(info->type) != 0) { fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n", __func__, info->name.data, (int) info->type, lm_ggml_type_name(info->type), ne, lm_ggml_blck_size(info->type)); fclose(file); diff --git a/cpp/ggml.h b/cpp/ggml.h index e473313..1c52da2 100644 --- a/cpp/ggml.h +++ b/cpp/ggml.h @@ -244,6 +244,8 @@ #define LM_GGML_EXIT_SUCCESS 0 #define LM_GGML_EXIT_ABORTED 1 +#define LM_GGML_ROPE_TYPE_NEOX 2 + #define LM_GGUF_MAGIC "GGUF" #define LM_GGUF_VERSION 3 @@ -1140,16 +1142,17 @@ extern "C" { // group normalize along ne0*ne1*n_groups // used in stable-diffusion - // TODO: eps is hardcoded to 1e-6 for now LM_GGML_API struct lm_ggml_tensor * lm_ggml_group_norm( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, - int n_groups); + int n_groups, + float eps); LM_GGML_API struct lm_ggml_tensor * lm_ggml_group_norm_inplace( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, - int n_groups); + int n_groups, + float eps); // a - x // b - dy @@ -1452,8 +1455,8 @@ extern "C" { struct lm_ggml_tensor * b); // rotary position embedding - // if mode & 1 == 1, skip n_past elements (NOT SUPPORTED) - // if mode & 2 == 1, GPT-NeoX style + // if (mode & 1) - skip n_past elements (NOT SUPPORTED) + // if (mode & LM_GGML_ROPE_TYPE_NEOX) - GPT-NeoX style // // b is an int32 vector with size a->ne[2], it contains the positions LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope( @@ -1774,10 +1777,8 @@ extern "C" { LM_GGML_API struct lm_ggml_tensor * lm_ggml_ssm_conv( struct lm_ggml_context * ctx, - struct lm_ggml_tensor * s, - struct lm_ggml_tensor * x, - struct lm_ggml_tensor * c, - struct lm_ggml_tensor * sq); + struct lm_ggml_tensor * sx, + struct lm_ggml_tensor * c); LM_GGML_API struct lm_ggml_tensor * lm_ggml_ssm_scan( struct lm_ggml_context * ctx, @@ -1786,8 +1787,7 @@ extern "C" { struct lm_ggml_tensor * dt, struct lm_ggml_tensor * A, struct lm_ggml_tensor * B, - struct lm_ggml_tensor * C, - struct lm_ggml_tensor * sq); + struct lm_ggml_tensor * C); // partition into non-overlapping windows with padding if needed // example: diff --git a/cpp/grammar-parser.cpp b/cpp/grammar-parser.cpp index a518b76..438452e 100644 --- a/cpp/grammar-parser.cpp +++ b/cpp/grammar-parser.cpp @@ -369,6 +369,9 @@ namespace grammar_parser { } // Validate the state to ensure that all rules are defined for (const auto & rule : state.rules) { + if (rule.empty()) { + throw std::runtime_error("Undefined rule"); + } for (const auto & elem : rule) { if (elem.type == LLAMA_GRETYPE_RULE_REF) { // Ensure that the rule at that location exists diff --git a/cpp/llama-impl.h 
b/cpp/llama-impl.h index d734f27..ac6ce52 100644 --- a/cpp/llama-impl.h +++ b/cpp/llama-impl.h @@ -24,3 +24,18 @@ void llama_log_callback_default(lm_ggml_log_level level, const char * text, void #define LLAMA_LOG_INFO(...) llama_log_internal(LM_GGML_LOG_LEVEL_INFO , __VA_ARGS__) #define LLAMA_LOG_WARN(...) llama_log_internal(LM_GGML_LOG_LEVEL_WARN , __VA_ARGS__) #define LLAMA_LOG_ERROR(...) llama_log_internal(LM_GGML_LOG_LEVEL_ERROR, __VA_ARGS__) + +// +// helpers +// + +static void replace_all(std::string & s, const std::string & search, const std::string & replace) { + if (search.empty()) { + return; // Avoid infinite loop if 'search' is an empty string + } + size_t pos = 0; + while ((pos = s.find(search, pos)) != std::string::npos) { + s.replace(pos, search.length(), replace); + pos += replace.length(); + } +} diff --git a/cpp/llama-sampling.cpp b/cpp/llama-sampling.cpp index 3d7d0ce..c76a75b 100644 --- a/cpp/llama-sampling.cpp +++ b/cpp/llama-sampling.cpp @@ -85,14 +85,14 @@ void llama_sample_top_k_impl(struct llama_sampling * smpl, llama_token_data_arra constexpr float bucket_low = -10.0f; constexpr float bucket_high = 10.0f; constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low); - constexpr float bucker_inter = -bucket_low * bucket_scale; + constexpr float bucket_inter = -bucket_low * bucket_scale; std::vector bucket_idx(candidates->size); std::vector histo(nbuckets, 0); for (int i = 0; i < (int)candidates->size; ++i) { const float val = candidates->data[i].logit; - int ib = int(bucket_scale * val + bucker_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low); + int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low); ib = std::max(0, std::min(nbuckets-1, ib)); bucket_idx[i] = ib; ++histo[ib]; diff --git a/cpp/llama-vocab.cpp b/cpp/llama-vocab.cpp index 1a0e1a0..0a70ef3 100644 --- a/cpp/llama-vocab.cpp +++ b/cpp/llama-vocab.cpp @@ -16,20 +16,6 @@ // helpers // -static void replace_all(std::string & s, const std::string & search, const std::string & replace) { - std::string result; - for (size_t pos = 0; ; pos += search.length()) { - auto new_pos = s.find(search, pos); - if (new_pos == std::string::npos) { - result += s.substr(pos, s.size() - pos); - break; - } - result += s.substr(pos, new_pos - pos) + replace; - pos = new_pos; - } - s = std::move(result); -} - LLAMA_ATTRIBUTE_FORMAT(1, 2) static std::string format(const char * fmt, ...) 
{ va_list ap; @@ -335,6 +321,21 @@ struct llm_tokenizer_spm { // TODO: there are a lot of common parts between spm and bpe tokenizers, should be refactored and reused +template, typename Compare = std::less> +class llama_priority_queue : public std::priority_queue { +public: + using std::priority_queue::priority_queue; + + T pop_move() { + T item = std::move(this->c.front()); + std::pop_heap(this->c.begin(), this->c.end(), this->comp); + this->c.pop_back(); + return item; + } + + void pop() = delete; +}; + struct llm_bigram_bpe { struct comparator { bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const { @@ -343,7 +344,7 @@ struct llm_bigram_bpe { }; using queue_storage = std::vector; - using queue = std::priority_queue; + using queue = llama_priority_queue; llm_symbol::index left; llm_symbol::index right; std::string text; @@ -402,6 +403,7 @@ struct llm_tokenizer_bpe { case LLAMA_VOCAB_PRE_TYPE_COMMAND_R: case LLAMA_VOCAB_PRE_TYPE_SMOLLM: case LLAMA_VOCAB_PRE_TYPE_CODESHELL: + case LLAMA_VOCAB_PRE_TYPE_EXAONE: regex_exprs = { "\\p{N}", "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", @@ -424,6 +426,8 @@ struct llm_tokenizer_bpe { }; break; case LLAMA_VOCAB_PRE_TYPE_PORO: + case LLAMA_VOCAB_PRE_TYPE_BLOOM: + case LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH: regex_exprs = { " ?[^(\\s|.,!?…。,、।۔،)]+", }; @@ -531,8 +535,7 @@ struct llm_tokenizer_bpe { // build token(s) while (!work_queue.empty()) { - auto bigram = work_queue.top(); - work_queue.pop(); + auto bigram = work_queue.pop_move(); auto & left_symbol = symbols[bigram.left]; auto & right_symbol = symbols[bigram.right]; @@ -816,6 +819,9 @@ struct llm_tokenizer_ugm { * the best tokenization. */ void tokenize(const std::string & text, std::vector & output) { + // get current size of output (for reversal later) + size_t output_size = output.size(); + // normalize the input first std::string normalized; normalize(text, &normalized); @@ -895,7 +901,7 @@ struct llm_tokenizer_ugm { } // reverse the output since we added tokens starting from the end of the input - std::reverse(output.begin(), output.end()); + std::reverse(output.begin() + output_size, output.end()); } private: @@ -1477,11 +1483,11 @@ llama_token llama_token_pad_impl(const struct llama_vocab & vocab) { return vocab.special_pad_id; } -int32_t llama_add_bos_token_impl(const struct llama_vocab & vocab) { +bool llama_add_bos_token_impl(const struct llama_vocab & vocab) { return vocab.tokenizer_add_bos; } -int32_t llama_add_eos_token_impl(const struct llama_vocab & vocab) { +bool llama_add_eos_token_impl(const struct llama_vocab & vocab) { return vocab.tokenizer_add_eos; } diff --git a/cpp/llama-vocab.h b/cpp/llama-vocab.h index 7adfc16..6e8f30b 100644 --- a/cpp/llama-vocab.h +++ b/cpp/llama-vocab.h @@ -95,8 +95,8 @@ llama_token llama_token_sep_impl(const struct llama_vocab & vocab); llama_token llama_token_nl_impl (const struct llama_vocab & vocab); llama_token llama_token_pad_impl(const struct llama_vocab & vocab); -int32_t llama_add_bos_token_impl(const struct llama_vocab & vocab); -int32_t llama_add_eos_token_impl(const struct llama_vocab & vocab); +bool llama_add_bos_token_impl(const struct llama_vocab & vocab); +bool llama_add_eos_token_impl(const struct llama_vocab & vocab); llama_token llama_token_prefix_impl(const struct llama_vocab & vocab); llama_token llama_token_middle_impl(const struct llama_vocab & vocab); diff --git a/cpp/llama.cpp b/cpp/llama.cpp index 1d99f94..79e0076 100644 --- a/cpp/llama.cpp +++ b/cpp/llama.cpp @@ -132,17 
+132,6 @@ static std::string trim(const std::string & str) { return str.substr(start, end - start); } -static void replace_all(std::string & s, const std::string & search, const std::string & replace) { - if (search.empty()) { - return; // Avoid infinite loop if 'search' is an empty string - } - size_t pos = 0; - while ((pos = s.find(search, pos)) != std::string::npos) { - s.replace(pos, search.length(), replace); - pos += replace.length(); - } -} - static bool is_float_close(float a, float b, float abs_tol) { // Check for non-negative tolerance if (abs_tol < 0.0) { @@ -230,7 +219,10 @@ enum llm_arch { LLM_ARCH_CHATGLM, LLM_ARCH_BITNET, LLM_ARCH_T5, + LLM_ARCH_T5ENCODER, LLM_ARCH_JAIS, + LLM_ARCH_NEMOTRON, + LLM_ARCH_EXAONE, LLM_ARCH_UNKNOWN, }; @@ -274,7 +266,10 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_CHATGLM, "chatglm" }, { LLM_ARCH_BITNET, "bitnet" }, { LLM_ARCH_T5, "t5" }, + { LLM_ARCH_T5ENCODER, "t5encoder" }, { LLM_ARCH_JAIS, "jais" }, + { LLM_ARCH_NEMOTRON, "nemotron" }, + { LLM_ARCH_EXAONE, "exaone" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -344,6 +339,7 @@ enum llm_kv { LLM_KV_SSM_CONV_KERNEL, LLM_KV_SSM_STATE_SIZE, LLM_KV_SSM_TIME_STEP_RANK, + LLM_KV_SSM_DT_B_C_RMS, LLM_KV_TOKENIZER_MODEL, LLM_KV_TOKENIZER_PRE, @@ -442,6 +438,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" }, { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" }, { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" }, + { LLM_KV_SSM_DT_B_C_RMS, "%s.ssm.dt_b_c_rms" }, { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" }, @@ -1283,6 +1280,24 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_ENC_FFN_UP, "enc.blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_T5ENCODER, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ENC_OUTPUT_NORM, "enc.output_norm" }, + { LLM_TENSOR_ENC_ATTN_NORM, "enc.blk.%d.attn_norm" }, + { LLM_TENSOR_ENC_ATTN_Q, "enc.blk.%d.attn_q" }, + { LLM_TENSOR_ENC_ATTN_K, "enc.blk.%d.attn_k" }, + { LLM_TENSOR_ENC_ATTN_V, "enc.blk.%d.attn_v" }, + { LLM_TENSOR_ENC_ATTN_OUT, "enc.blk.%d.attn_o" }, + { LLM_TENSOR_ENC_ATTN_REL_B, "enc.blk.%d.attn_rel_b" }, + { LLM_TENSOR_ENC_FFN_NORM, "enc.blk.%d.ffn_norm" }, + { LLM_TENSOR_ENC_FFN_GATE, "enc.blk.%d.ffn_gate" }, + { LLM_TENSOR_ENC_FFN_DOWN, "enc.blk.%d.ffn_down" }, + { LLM_TENSOR_ENC_FFN_UP, "enc.blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_JAIS, { @@ -1298,6 +1313,43 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, }, }, + { + LLM_ARCH_NEMOTRON, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, + { + LLM_ARCH_EXAONE, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { 
LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -2198,6 +2250,7 @@ struct llama_hparams { uint32_t ssm_d_inner = 0; uint32_t ssm_d_state = 0; uint32_t ssm_dt_rank = 0; + bool ssm_dt_b_c_rms = false; float f_clamp_kqv = 0.0f; float f_max_alibi_bias = 0.0f; @@ -2247,6 +2300,7 @@ struct llama_hparams { if (this->ssm_d_inner != other.ssm_d_inner) return true; if (this->ssm_d_state != other.ssm_d_state) return true; if (this->ssm_dt_rank != other.ssm_dt_rank) return true; + if (this->ssm_dt_b_c_rms != other.ssm_dt_b_c_rms) return true; if (this->dec_start_token_id != other.dec_start_token_id) return true; @@ -2473,10 +2527,29 @@ struct llama_layer { struct lm_ggml_tensor * ffn_down_scale; }; +// very similar to llama_batch, +// but has more metadata about sequences +struct llama_ubatch { + bool equal_seqs; + // TODO: whole_seqs for embeddings? + + uint32_t n_tokens; // total tokens (n_seq_tokens * n_seqs) + uint32_t n_seq_tokens; // tokens per sequence + uint32_t n_seqs; + + llama_token * token; // [n_tokens] + float * embd; // [n_embd, n_tokens] + llama_pos * pos; // [n_tokens] + int32_t * n_seq_id; // [n_seqs] + llama_seq_id ** seq_id; // [n_seqs] + int8_t * output; // [n_tokens] +}; + struct llama_kv_cell { llama_pos pos = -1; llama_pos delta = 0; - int32_t src = 0; // used by recurrent state models to copy states + int32_t src = -1; // used by recurrent state models to copy states + int32_t tail = -1; std::set seq_id; @@ -2497,7 +2570,6 @@ struct llama_kv_cell { struct llama_kv_cache { bool has_shift = false; bool do_defrag = false; - bool do_copy = false; bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token bool v_trans = true; // the value tensor is transposed @@ -2660,6 +2732,340 @@ struct llama_model { } }; +struct llama_sbatch_seq { + int32_t n_seq_id; + llama_seq_id * seq_id; + size_t offset; + size_t length; + + // helper for smoother batch API transition -- can be deprecated in the future + llama_seq_id all_seq_id; // used if seq_id == NULL +}; + +// sequence-length-aware batch splitting +struct llama_sbatch { + // tokens left in this batch + size_t n_tokens; + + size_t n_embd; + + bool logits_all; // TODO: remove once lctx.logits_all is removed too + + // sorted indices into the batch + std::vector ids; + // batch indices of the output + std::vector out_ids; + std::vector seq; + const llama_batch * batch = nullptr; + + // buffers for the ubatch + std::vector ubatch_token; + std::vector ubatch_embd; + std::vector ubatch_pos; + std::vector ubatch_n_seq_id; + std::vector ubatch_seq_id; + std::vector ubatch_output; + + llama_ubatch reserve_ubatch(size_t n_ubatch, bool has_embd = false) { + // clear empty sequences + // the previous ubatch is assumed to be gone, + // so nothing should refer to values in these sequences anymore. + for (size_t i = seq.size(); i-- > 0;) { + if (seq[i].length == 0) { + seq.pop_back(); + } else { + break; + } + } + ubatch_token.resize(!has_embd ? n_ubatch : 0); + ubatch_embd.resize(has_embd ? 
n_embd * n_ubatch : 0); + ubatch_pos.resize(n_ubatch); + ubatch_n_seq_id.resize(n_ubatch); + ubatch_seq_id.resize(n_ubatch); + ubatch_output.resize(n_ubatch); + llama_ubatch ubatch = { + /*equal_seqs =*/ true, + /*n_tokens =*/ 0, + /*n_seq_tokens =*/ 0, + /*n_seqs =*/ 0, + /*token =*/ !has_embd ? ubatch_token.data() : nullptr, + /*embd =*/ has_embd ? ubatch_embd.data() : nullptr, + /*pos =*/ ubatch_pos.data(), + /*n_seq_id =*/ ubatch_n_seq_id.data(), + /*seq_id =*/ ubatch_seq_id.data(), + /*output =*/ ubatch_output.data(), + }; + return ubatch; + } + + void add_seq_to_ubatch(llama_ubatch & ubatch, llama_sbatch_seq & seq, size_t length) { + LM_GGML_ASSERT(batch != nullptr); + LM_GGML_ASSERT(length <= seq.length); + // Can only add sequences of equal lengths to a batch, + // otherwise it isn't clear to which sequence a token belongs + LM_GGML_ASSERT(seq.n_seq_id == 0 || ubatch.n_seqs == 0 || length == (size_t) ubatch.n_tokens / ubatch.n_seqs); + LM_GGML_ASSERT((seq.n_seq_id != 0) == ubatch.equal_seqs); + // NOTE: loops are separated for cache-friendliness + if (batch->token) { + if (ubatch.equal_seqs) { + for (size_t i = 0; i < length; ++i) { + ubatch.token[ubatch.n_tokens + i] = batch->token[ids[seq.offset + i]]; + } + } else { + // simple split + ubatch.token = batch->token + seq.offset; + } + } else { + ubatch.token = nullptr; + } + if (batch->embd) { + if (ubatch.equal_seqs) { + for (size_t i = 0; i < length; ++i) { + memcpy( + ubatch.embd + n_embd * (ubatch.n_tokens + i), + batch->embd + n_embd * ids[seq.offset + i], + n_embd * sizeof(float) + ); + } + } else { + // simple split + ubatch.embd = batch->embd + (n_embd * seq.offset); + } + } else { + ubatch.embd = nullptr; + } + // from here on, the else branches are deprecated; + // they are helpers for smoother batch API transition + if (batch->pos) { + if (ubatch.equal_seqs) { + for (size_t i = 0; i < length; ++i) { + ubatch.pos[ubatch.n_tokens + i] = batch->pos[ids[seq.offset + i]]; + } + } else { + // simple split + ubatch.pos = batch->pos + seq.offset; + } + } else { + for (size_t i = 0; i < length; ++i) { + llama_pos bi = ids[seq.offset + i]; + ubatch.pos[ubatch.n_tokens + i] = batch->all_pos_0 + (bi * batch->all_pos_1); + } + } + if (ubatch.equal_seqs) { + ubatch.n_seq_id[ubatch.n_seqs] = seq.n_seq_id; + if (seq.seq_id) { + ubatch.seq_id[ubatch.n_seqs] = seq.seq_id; + } else { + LM_GGML_ASSERT(seq.n_seq_id == 1); + ubatch.seq_id[ubatch.n_seqs] = &seq.all_seq_id; + } + } else { + // simple split + if (batch->n_seq_id) { + for (size_t i = 0; i < length; ++i) { + ubatch.n_seq_id = batch->n_seq_id + seq.offset; + } + } else { + for (size_t i = 0; i < length; ++i) { + ubatch.n_seq_id[ubatch.n_seqs + i] = 1; + } + } + if (batch->seq_id) { + for (size_t i = 0; i < length; ++i) { + ubatch.seq_id = batch->seq_id + seq.offset; + } + } else { + for (size_t i = 0; i < length; ++i) { + ubatch.seq_id[ubatch.n_seqs + i] = &seq.all_seq_id; + } + } + } + if (logits_all) { + for (size_t i = 0; i < length; ++i) { + ubatch.output[ubatch.n_tokens + i] = 1; + out_ids.push_back(ids[seq.offset + i]); + } + } else if (batch->logits) { + if (ubatch.equal_seqs) { + for (size_t i = 0; i < length; ++i) { + size_t id = ids[seq.offset + i]; + int8_t is_output = batch->logits[id]; + ubatch.output[ubatch.n_tokens + i] = is_output; + if (is_output) { out_ids.push_back(id); } + } + } else { + // simple split + ubatch.output = batch->logits + seq.offset; + for (size_t i = 0; i < length; ++i) { + if (ubatch.output[i] != 0) { out_ids.push_back(seq.offset + i); } + } + 
} + } else { + // only get last output + for (size_t i = 0; i < length; ++i) { + size_t id = ids[seq.offset + i]; + int8_t is_last = id == ids.size() - 1; + ubatch.output[ubatch.n_tokens + i] = is_last; + if (is_last) { out_ids.push_back(id); } + } + } + if (ubatch.n_tokens == 0 && ubatch.n_seqs == 0) { + ubatch.n_seq_tokens = ubatch.equal_seqs ? length : 1; + } + ubatch.n_tokens += length; + ubatch.n_seqs += ubatch.equal_seqs ? 1 : length; // virtual sequences for simple splits + seq.offset += length; + seq.length -= length; + n_tokens -= length; + LM_GGML_ASSERT(ubatch.n_tokens == ubatch.n_seq_tokens * ubatch.n_seqs); + } + + // simple split, unknown number of sequences of unequal lengths + llama_ubatch split_simple(size_t n_ubatch) { + n_ubatch = n_tokens < n_ubatch ? n_tokens : n_ubatch; + llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr); + ubatch.equal_seqs = false; + if (!seq.empty()) { + llama_sbatch_seq & s = seq[0]; + size_t length = s.length < n_ubatch ? s.length : n_ubatch; + LM_GGML_ASSERT(seq.size() == 1 && s.n_seq_id == 0); // don't mix with other splits + add_seq_to_ubatch(ubatch, s, length); + } + return ubatch; + } + + // make batches of equal-length sequences + llama_ubatch split_equal(size_t n_ubatch) { + n_ubatch = n_tokens < n_ubatch ? n_tokens : n_ubatch; + llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr); + if (!seq.empty()) { + size_t length = 0; + size_t n_tokens_in_ubatch = 0; + LM_GGML_ASSERT(seq[0].n_seq_id > 0); // should not be mixed with simple splits + // smallest first, because it's easier to split this way; + // starting from the end to pop in constant time. + for (size_t i = seq.size(); i-- > 0;) { + llama_sbatch_seq & s = seq[i]; + LM_GGML_ASSERT(s.length > 0); + if (length == 0) { + length = s.length < n_ubatch ? s.length : n_ubatch; + } + add_seq_to_ubatch(ubatch, s, length); + n_tokens_in_ubatch += length; + // shared prompts can't be mixed with any of their sequences, + // so it's safer to compute them in their own ubatch + if (s.n_seq_id > 1) { break; } + // stop when there isn't enough space for another sequence + if (length + n_tokens_in_ubatch > n_ubatch) { break; } + } + } + return ubatch; + } + + // sequence-wise split + llama_ubatch split_seq(size_t n_ubatch) { + n_ubatch = n_tokens < n_ubatch ? n_tokens : n_ubatch; + llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr); + if (!seq.empty()) { + llama_sbatch_seq & s = seq[seq.size() - 1]; + size_t length = s.length < n_ubatch ? s.length : n_ubatch; + LM_GGML_ASSERT(s.n_seq_id > 0); // should not be mixed with simple splits + add_seq_to_ubatch(ubatch, s, length); + } + return ubatch; + } + + void from_batch(const llama_batch & batch, const size_t n_embd, const bool simple_split = false, const bool logits_all = false) { + LM_GGML_ASSERT(batch.n_tokens >= 0); + this->batch = &batch; + this->n_embd = n_embd; + this->logits_all = logits_all; + + n_tokens = batch.n_tokens; + ids.resize(n_tokens); + out_ids.clear(); + // TODO: reserve out_ids and seq + + for (size_t i = 0; i < n_tokens; ++i) { + ids[i] = i; + } + if (simple_split) { + seq.resize(1); + llama_sbatch_seq & s = seq[0]; + s.n_seq_id = 0; + s.seq_id = nullptr; + s.offset = 0; + s.length = n_tokens; + s.all_seq_id = batch.all_seq_id; + return; + } + std::sort(ids.begin(), ids.end(), + [&batch](size_t a, size_t b) { + int32_t n_seq_a = batch.n_seq_id ? batch.n_seq_id[a] : 1; + int32_t n_seq_b = batch.n_seq_id ? 
batch.n_seq_id[b] : 1; + // sort by seq_id, then by pos + if (n_seq_a == n_seq_b) { + if (batch.seq_id) { + for (int32_t i = 0; i < n_seq_a; ++i) { + llama_seq_id seq_id_a = batch.seq_id[a][i]; + llama_seq_id seq_id_b = batch.seq_id[b][i]; + // smaller seq_ids go first + if (seq_id_a != seq_id_b) { + return seq_id_a < seq_id_b; + } + } + } + // when all else is equal, sort by pos + if (batch.pos) { + return batch.pos[a] < batch.pos[b]; + } + // no pos, sort by id (assuming batch.all_pos_1 is positive) + return a < b; + } + // shared prompts go first + return n_seq_a > n_seq_b; + } + ); + // init seq + llama_sbatch_seq * last_seq = nullptr; + + if (batch.n_seq_id != nullptr && batch.seq_id != nullptr) { + for (size_t i = 0; i < n_tokens; ++i) { + const size_t bi = ids[i]; + const int32_t n_seqs = batch.n_seq_id[bi]; + llama_seq_id * seq_ids = batch.seq_id[bi]; + if (last_seq != nullptr) { + bool same = n_seqs == last_seq->n_seq_id; + for (int32_t j = 0; same && j < n_seqs; ++j) { + if (seq_ids[j] != last_seq->seq_id[j]) { + same = false; + } + } + if (same) { + last_seq->length += 1; + continue; + } + } + llama_sbatch_seq new_seq = {n_seqs, seq_ids, i, 1, batch.all_seq_id}; + seq.push_back(new_seq); + last_seq = &seq.back(); + } + } else { + llama_sbatch_seq new_seq = {1, nullptr, 0, n_tokens, batch.all_seq_id}; + seq.push_back(new_seq); + } + // keep shared prompts first at the end, then sort by length descending. + std::sort(seq.begin(), seq.end(), + [](llama_sbatch_seq & a, llama_sbatch_seq & b) { + if (a.n_seq_id == b.n_seq_id) { + return a.length > b.length; + } + return a.n_seq_id < b.n_seq_id; + } + ); + } +}; + struct llama_context { llama_context(const llama_model & model) : model(model) @@ -2681,6 +3087,7 @@ struct llama_context { struct llama_cparams cparams; struct llama_sampling sampling; + struct llama_sbatch sbatch; struct llama_kv_cache kv_self; struct llama_control_vector cvec; @@ -2941,8 +3348,7 @@ static bool llama_kv_cache_init( cache.has_shift = false; - // TODO: find a nicer way to add other recurrent model architectures - cache.recurrent = model.arch == LLM_ARCH_MAMBA; + cache.recurrent = llama_model_is_recurrent(&model); cache.v_trans = !cache.recurrent && !cparams.flash_attn; cache.head = 0; @@ -2955,13 +3361,6 @@ static bool llama_kv_cache_init( cache.cells.clear(); cache.cells.resize(kv_size); - if (cache.recurrent) { - // init state copy sources - for (uint32_t i = 0; i < cache.size; ++i) { - cache.cells[i].src = i; - } - } - // count used buffer types std::map buft_layer_count; if (offload) { @@ -3029,54 +3428,170 @@ static bool llama_kv_cache_init( // to the first cell of the slot. static bool llama_kv_cache_find_slot( struct llama_kv_cache & cache, - const struct llama_batch & batch) { + const struct llama_ubatch & batch) { const uint32_t n_tokens = batch.n_tokens; + const uint32_t n_seqs = batch.n_seqs; + const uint32_t n_seq_tokens = batch.n_seq_tokens; if (cache.recurrent) { // For recurrent state architectures (like Mamba), - // each KV cache cell can store the state for a whole sequence. - - llama_seq_id min = cache.size - 1; - llama_seq_id max = 0; - - for (uint32_t i = 0; i < n_tokens; ++i) { - for (int32_t j = 0; j < batch.n_seq_id[i]; ++j) { - llama_seq_id seq_id = batch.seq_id[i][j]; - // make sure it's a valid seq_id - if ((uint32_t) seq_id < cache.size) { - if (seq_id > max) { - max = seq_id; - } - if (seq_id < min) { - min = seq_id; + // each cache cell can store the state for a whole sequence. + // A slot should be always be contiguous. 
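For readers following the new sequence-aware batching, here is a minimal standalone sketch (not the llama.cpp API; the tok record and the values are made up) of the ordering rule the std::sort above applies to the batch indices: tokens that belong to more sequences (shared prompts) come first, ties break by sequence id and then by position, so each sequence ends up as one contiguous run in ids.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <numeric>
#include <vector>

struct tok {
    std::vector<int32_t> seq_ids; // sorted seq_ids this token belongs to (hypothetical record)
    int32_t pos;                  // position within its sequence(s)
};

int main() {
    std::vector<tok> batch = {
        {{1}, 0}, {{0}, 1}, {{0, 1}, 0}, {{0}, 0}, {{1}, 1},
    };
    std::vector<size_t> ids(batch.size());
    std::iota(ids.begin(), ids.end(), 0);

    std::sort(ids.begin(), ids.end(), [&](size_t a, size_t b) {
        const tok & ta = batch[a];
        const tok & tb = batch[b];
        if (ta.seq_ids.size() != tb.seq_ids.size()) {
            return ta.seq_ids.size() > tb.seq_ids.size(); // shared prompts go first
        }
        if (ta.seq_ids != tb.seq_ids) {
            return ta.seq_ids < tb.seq_ids;               // then smaller seq_ids first
        }
        return ta.pos < tb.pos;                           // finally by position
    });

    for (size_t i : ids) {
        printf("token %zu: pos %d\n", i, batch[i].pos);
    }
    return 0;
}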
+ + // can only process batches with an equal number of new tokens in each sequence + LM_GGML_ASSERT(batch.equal_seqs); + + int32_t min = cache.size - 1; + int32_t max = 0; + + // everything should fit if all seq_ids are smaller than the max + for (uint32_t s = 0; s < n_seqs; ++s) { + const uint32_t n_seq_id = batch.n_seq_id[s]; + for (uint32_t j = 0; j < n_seq_id; ++j) { + const llama_seq_id seq_id = batch.seq_id[s][j]; + + if (seq_id < 0 || (uint32_t) seq_id >= cache.size) { + // too big seq_id + // TODO: would it be possible to resize the cache instead? + LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, cache.size); + return false; + } + if (j > 0) { + llama_kv_cell & seq = cache.cells[seq_id]; + if (seq.tail >= 0) { + llama_kv_cell & cell = cache.cells[seq.tail]; + // clear cells from seq_ids that become shared + // (should not normally happen, but let's handle it anyway) + cell.seq_id.erase(seq_id); + seq.tail = -1; + if (cell.seq_id.empty()) { + cell.pos = -1; + cell.src = -1; + cache.used -= 1; + } } - // Assuming the tokens are in-order - if (batch.pos[i] != cache.cells[seq_id].pos + 1) { - // What should happen when the pos backtracks or skips a value? - // Clearing the state mid-batch would require special-casing which isn't done. - LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d\n", - __func__, batch.pos[i], cache.cells[seq_id].pos, seq_id); + } + } + } + +#ifndef NDEBUG + { + std::vector tails_verif; + tails_verif.assign(cache.size, -1); + for (uint32_t i = 0; i < cache.size; ++i) { + llama_kv_cell & cell = cache.cells[i]; + for (llama_seq_id seq_id : cell.seq_id) { + if (tails_verif[seq_id] != -1) { + LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tails_verif[seq_id]); } - if (cache.cells[seq_id].pos < 0 && 0 <= batch.pos[i]) { - cache.used += 1; + tails_verif[seq_id] = i; + } + } + for (uint32_t i = 0; i < cache.size; ++i) { + if (tails_verif[i] != cache.cells[i].tail) { + LLAMA_LOG_ERROR("%s: wrong tail for seq_id %d, (%d instead of %d)\n", __func__, i, cache.cells[i].tail, tails_verif[i]); + } + } + } +#endif + + // find next empty cell + uint32_t next_empty_cell = cache.head; + + for (uint32_t i = 0; i < cache.size; ++i) { + if (next_empty_cell >= cache.size) { next_empty_cell -= cache.size; } + llama_kv_cell & cell = cache.cells[next_empty_cell]; + if (cell.is_empty()) { break; } + next_empty_cell += 1; + } + + // find usable cell range + for (uint32_t s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = batch.seq_id[s][0]; + llama_kv_cell & seq_meta = cache.cells[seq_id]; + bool has_cell = false; + if (seq_meta.tail >= 0) { + llama_kv_cell & cell = cache.cells[seq_meta.tail]; + LM_GGML_ASSERT(cell.has_seq_id(seq_id)); + // does this seq_id "own" the cell? 
+ if (cell.seq_id.size() == 1) { has_cell = true; } + } + if (!has_cell) { + llama_kv_cell & empty_cell = cache.cells[next_empty_cell]; + LM_GGML_ASSERT(empty_cell.is_empty()); + // copy old tail into the empty cell + if (seq_meta.tail >= 0) { + llama_kv_cell & orig_cell = cache.cells[seq_meta.tail]; + empty_cell.pos = orig_cell.pos; + empty_cell.src = orig_cell.src; + orig_cell.seq_id.erase(seq_id); + empty_cell.seq_id.insert(seq_id); // will be overwritten + } + seq_meta.tail = next_empty_cell; + // find next empty cell + if (s + 1 < n_seqs) { + next_empty_cell += 1; + for (uint32_t i = 0; i < cache.size; ++i) { + if (next_empty_cell >= cache.size) { next_empty_cell -= cache.size; } + llama_kv_cell & cell = cache.cells[next_empty_cell]; + if (cell.is_empty()) { break; } + next_empty_cell += 1; } - cache.cells[seq_id].pos = batch.pos[i]; - // NOTE: seq_ids are not inserted here; they are handled when the input tensors are set - } else { - // too big seq_id - // TODO: would it be possible to resize the KV cache size instead? - LLAMA_LOG_ERROR("%s: seq_id=%d >= kv_size=%d Try using a bigger --parallel value\n", __func__, seq_id, cache.size); - return false; + } + } + if (min > seq_meta.tail) { min = seq_meta.tail; } + if (max < seq_meta.tail) { max = seq_meta.tail; } + } + + // gather and re-order + for (uint32_t s = 0; s < n_seqs; ++s) { + int32_t dst_id = s + min; + int32_t src_id = cache.cells[batch.seq_id[s][0]].tail; + if (dst_id != src_id) { + llama_kv_cell & dst_cell = cache.cells[dst_id]; + llama_kv_cell & src_cell = cache.cells[src_id]; + + std::swap(dst_cell.pos, src_cell.pos); + std::swap(dst_cell.src, src_cell.src); + std::swap(dst_cell.seq_id, src_cell.seq_id); + + // swap tails (assuming they NEVER overlap) + for (const llama_seq_id seq_id : src_cell.seq_id) { + cache.cells[seq_id].tail = src_id; + } + for (const llama_seq_id seq_id : dst_cell.seq_id) { + cache.cells[seq_id].tail = dst_id; } } } + // update the pos of the used seqs + for (uint32_t s = 0; s < n_seqs; ++s) { + const llama_pos last_pos = batch.pos[n_seq_tokens * s + n_seq_tokens - 1]; + int32_t cell_id = s + min; + llama_kv_cell & cell = cache.cells[cell_id]; + + if (cell.pos >= 0 && last_pos != cell.pos + (llama_pos) n_seq_tokens) { + // What should happen when the pos backtracks or skips a value? + // Clearing the state mid-batch would require special-casing which isn't done. + LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d with %u new tokens\n", + __func__, last_pos, cell.pos, batch.seq_id[s][0], n_seq_tokens); + } + cell.pos = last_pos; + cell.seq_id.clear(); + for (int32_t j = 0; j < batch.n_seq_id[s]; ++j) { + const llama_seq_id seq_id = batch.seq_id[s][j]; + cell.seq_id.insert(seq_id); + cache.cells[seq_id].tail = cell_id; + } + } + // allow getting the range of used cells, from head to head + n cache.head = min; cache.n = max - min + 1; // sanity check - return max >= min; + return cache.n >= n_seqs; } // otherwise, one cell per token. 
@@ -3114,11 +3629,14 @@ static bool llama_kv_cache_find_slot( } } - for (uint32_t i = 0; i < n_tokens; i++) { - cache.cells[cache.head + i].pos = batch.pos[i]; + for (uint32_t s = 0; s < n_seqs; s++) { + for (uint32_t i = 0; i < n_seq_tokens; ++i) { + uint32_t k = s*n_seq_tokens + i; + cache.cells[cache.head + k].pos = batch.pos[k]; - for (int32_t j = 0; j < batch.n_seq_id[i]; j++) { - cache.cells[cache.head + i].seq_id.insert(batch.seq_id[i][j]); + for (int32_t j = 0; j < batch.n_seq_id[s]; j++) { + cache.cells[cache.head + k].seq_id.insert(batch.seq_id[s][j]); + } } } @@ -3144,6 +3662,8 @@ static void llama_kv_cache_clear(struct llama_kv_cache & cache) { for (int32_t i = 0; i < (int32_t) cache.size; ++i) { cache.cells[i].pos = -1; cache.cells[i].seq_id.clear(); + cache.cells[i].src = -1; + cache.cells[i].tail = -1; } cache.head = 0; cache.used = 0; @@ -3170,9 +3690,16 @@ static bool llama_kv_cache_seq_rm( return false; } if (0 <= seq_id) { - // partial intersection is invalid - if ((0 < p0 && p0 <= cache.cells[seq_id].pos) || (0 < p1 && p1 <= cache.cells[seq_id].pos)) { - return false; + int32_t & tail_id = cache.cells[seq_id].tail; + if (tail_id >= 0) { + const llama_kv_cell & cell = cache.cells[tail_id]; + // partial intersection is invalid + if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) { + return false; + } + if (p0 <= cell.pos && p1 < cell.pos) { + tail_id = -1; + } } } else { // seq_id is negative, then the range should include everything or nothing @@ -3196,6 +3723,7 @@ static bool llama_kv_cache_seq_rm( if (cache.cells[i].pos >= 0) cache.used--; cache.cells[i].pos = -1; + cache.cells[i].src = -1; if (new_head == cache.size) new_head = i; } } @@ -3218,23 +3746,29 @@ static void llama_kv_cache_seq_cp( if (cache.recurrent) { if ((uint32_t) seq_id_dst < cache.size && (uint32_t) seq_id_src < cache.size) { - seq_id_src = cache.cells[seq_id_src].src; - LM_GGML_ASSERT((uint32_t) seq_id_src < cache.size); - // intent to "copy from" - // supports copy chains thanks to taking the source of the source - cache.cells[seq_id_dst].src = seq_id_src; - - // preserve the "keep or clear" status of the copied sequence - if (cache.cells[seq_id_src].has_seq_id(seq_id_src)) { - cache.cells[seq_id_dst].seq_id.insert(seq_id_dst); - } else { - cache.cells[seq_id_dst].seq_id.erase(seq_id_dst); + llama_kv_cell & tail_src = cache.cells[seq_id_src]; + llama_kv_cell & tail_dst = cache.cells[seq_id_dst]; + if (tail_dst.tail >= 0) { + // clear destination seq_id if it wasn't empty + llama_kv_cell & cell_dst = cache.cells[tail_dst.tail]; + + cell_dst.seq_id.erase(seq_id_dst); + tail_dst.tail = -1; + if (cell_dst.seq_id.empty()) { + cell_dst.pos = -1; + cell_dst.delta = -1; + cell_dst.src = -1; + cache.used -= 1; + } } + if (tail_src.tail >= 0) { + llama_kv_cell & cell_src = cache.cells[tail_src.tail]; - cache.do_copy = true; - - cache.cells[seq_id_dst].pos = cache.cells[seq_id_src].pos; + cell_src.seq_id.insert(seq_id_dst); + tail_dst.tail = tail_src.tail; + } } + return; } // otherwise, this is the KV cache of a Transformer-like model @@ -3252,9 +3786,13 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id uint32_t new_head = cache.size; for (uint32_t i = 0; i < cache.size; ++i) { + if (cache.recurrent && (llama_seq_id) i != seq_id) { + cache.cells[i].tail = -1; + } if (!cache.cells[i].has_seq_id(seq_id)) { if (cache.cells[i].pos >= 0) cache.used--; cache.cells[i].pos = -1; + cache.cells[i].src = -1; cache.cells[i].seq_id.clear(); if (new_head == cache.size) new_head 
= i; } else { @@ -3283,9 +3821,12 @@ static void llama_kv_cache_seq_add( if (cache.recurrent) { // for Mamba-like models, only the pos needs to be shifted if (0 <= seq_id && seq_id < (int64_t) cache.size) { - llama_kv_cell & cell = cache.cells[seq_id]; - if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { - cell.pos += delta; + const int32_t tail_id = cache.cells[seq_id].tail; + if (tail_id >= 0) { + llama_kv_cell & cell = cache.cells[tail_id]; + if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { + cell.pos += delta; + } } } return; @@ -3329,9 +3870,12 @@ static void llama_kv_cache_seq_div( if (cache.recurrent) { // for Mamba-like models, only the pos needs to be changed if (0 <= seq_id && seq_id < (int64_t) cache.size) { - llama_kv_cell & cell = cache.cells[seq_id]; - if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { - cell.pos /= d; + const int32_t tail_id = cache.cells[seq_id].tail; + if (tail_id >= 0) { + llama_kv_cell & cell = cache.cells[tail_id]; + if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { + cell.pos /= d; + } } } return; @@ -3363,7 +3907,9 @@ static llama_pos llama_kv_cache_seq_pos_max(struct llama_kv_cache & cache, llama } static void llama_kv_cache_defrag(struct llama_kv_cache & cache) { - cache.do_defrag = true; + if (!cache.recurrent) { + cache.do_defrag = true; + } } static uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams) { @@ -3577,13 +4123,8 @@ namespace GGUFMeta { using llama_buf_map = std::unordered_map; -// TODO: update when needed or think of some clever automatic way to do this -static size_t llama_model_max_nodes(const llama_model & /*model*/) { - //if (model.arch == LLM_ARCH_LLAMA && model.hparams.n_layer > ??) { // llama-3 405B - // return 32768; - //} - - return 8192; +static size_t llama_model_max_nodes(const llama_model & model) { + return std::max(8192, model.tensors_by_name.size()*5); } struct llama_model_loader { @@ -4903,7 +5444,6 @@ static void llm_load_hparams( } break; case LLM_ARCH_PHI3: { - ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); switch (hparams.n_layer) { @@ -4912,6 +5452,22 @@ static void llm_load_hparams( case 40: model.type = e_model::MODEL_14B; break; default: model.type = e_model::MODEL_UNKNOWN; } + + // for backward compatibility ; see: https://github.com/ggerganov/llama.cpp/pull/8931 + if ((hparams.n_layer == 32 || hparams.n_layer == 40) && hparams.n_ctx_train == 4096) { + // default value for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct + hparams.n_swa = 2047; + } else if (hparams.n_layer == 32 && hparams.n_head_kv(0) == 32 && hparams.n_ctx_train == 131072) { + // default value for Phi-3-mini-128k-instruct + hparams.n_swa = 262144; + } else if (hparams.n_layer == 40 && hparams.n_ctx_train == 131072) { + // default value for Phi-3-medium-128k-instruct + hparams.n_swa = 131072; + } + bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); + if (!found_swa && hparams.n_swa == 0) { + throw std::runtime_error("invalid value for sliding_window"); + } } break; case LLM_ARCH_PLAMO: { @@ -5003,6 +5559,7 @@ static void llm_load_hparams( ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner); ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state); ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank); + ml.get_key(LLM_KV_SSM_DT_B_C_RMS, hparams.ssm_dt_b_c_rms, false); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, 
hparams.f_norm_rms_eps); @@ -5209,6 +5766,12 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_T5ENCODER: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts); + model.type = e_model::MODEL_UNKNOWN; + } break; case LLM_ARCH_JAIS: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); @@ -5221,6 +5784,23 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_NEMOTRON: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + switch (hparams.n_layer) { + case 32: model.type = e_model::MODEL_4B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; + case LLM_ARCH_EXAONE: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + + switch (hparams.n_layer) { + case 32: model.type = e_model::MODEL_8B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; default: (void)0; } @@ -5453,6 +6033,15 @@ static void llm_load_vocab( } else if ( tokenizer_pre == "codeshell") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL; + } else if ( + tokenizer_pre == "bloom") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_BLOOM; + } else if ( + tokenizer_pre == "gpt3-finnish") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH; + } else if ( + tokenizer_pre == "exaone") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_EXAONE; } else { throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); } @@ -5826,6 +6415,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner); LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state); LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank); + LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms); } LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type)); @@ -6091,9 +6681,9 @@ static bool llm_load_tensors( layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); // optional MLP bias - layer.ffn_gate_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED); - layer.ffn_down_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); - layer.ffn_up_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.ffn_gate_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED); } else { layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}); @@ -6417,7 +7007,7 @@ static bool llm_load_tensors( layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}); layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); //output_dens - layer.bo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); //output_dens + 
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); //output_dens layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); //output_norm layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}); @@ -7443,6 +8033,42 @@ static bool llm_load_tensors( layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_FFN_UP, "weight", i), {n_embd, n_ff}); } } break; + case LLM_ARCH_T5ENCODER: + { + const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts; + + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output + { + model.output_norm_enc = ml.create_tensor(ctx_output, tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); + // if output is NULL, init from the input tok embed + if (model.output == NULL) { + model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); + } + } + + for (int i = 0; i < n_layer; ++i) { + lm_ggml_context * ctx_layer = ctx_for_layer(i); + lm_ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm_enc = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_rel_b_enc = ml.create_tensor(ctx_input, tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED); + + layer.wq_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}); + layer.wk_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}); + layer.wv_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}); + layer.wo_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}); + + layer.ffn_norm_enc = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_gate_enc = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.ffn_down_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}); + } + } break; case LLM_ARCH_JAIS: { model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); @@ -7512,18 +8138,90 @@ static bool llm_load_tensors( layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); } } break; - default: - throw std::runtime_error("unknown architecture"); - } - } + case LLM_ARCH_NEMOTRON: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); - ml.done_getting_tensors(); + // output + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + } - ml.init_mappings(true, use_mlock ? 
&model.mlock_mmaps : nullptr); - model.mappings.reserve(ml.mappings.size()); + for (int i = 0; i < n_layer; ++i) { + lm_ggml_context * ctx_layer = ctx_for_layer(i); + lm_ggml_context * ctx_split = ctx_for_layer_split(i); - // create the backend buffers - std::vector> ctx_bufs; + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); + + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + + // optional bias tensors + layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); + + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); + + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + + // optional MLP bias + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED); + } + } break; + case LLM_ARCH_EXAONE: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + } + + for (int i = 0; i < n_layer; ++i) { + lm_ggml_context * ctx_layer = ctx_for_layer(i); + lm_ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}); + + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_embd/n_head/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? 
llama_model_loader::TENSOR_DUPLICATED : 0)); + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + } + } break; + default: + throw std::runtime_error("unknown architecture"); + } + } + + ml.done_getting_tensors(); + + ml.init_mappings(true, use_mlock ? &model.mlock_mmaps : nullptr); + model.mappings.reserve(ml.mappings.size()); + + // create the backend buffers + std::vector> ctx_bufs; ctx_bufs.reserve(ctx_map.size()); // Ensure we have enough capacity for the maximum backend buffer we will potentially create @@ -7753,7 +8451,7 @@ static struct lm_ggml_tensor * llm_build_inp_embd( struct lm_ggml_context * ctx, struct llama_context & lctx, const llama_hparams & hparams, - const llama_batch & batch, + const llama_ubatch & batch, struct lm_ggml_tensor * tok_embd, const llm_build_cb & cb) { const int64_t n_embd = hparams.n_embd; @@ -8198,7 +8896,7 @@ static struct lm_ggml_tensor * llm_build_kqv( struct lm_ggml_tensor * kq = lm_ggml_mul_mat(ctx, k, q); cb(kq, "kq", il); - if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_QWEN2) { + if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_QWEN2 || model.arch == LLM_ARCH_NEMOTRON) { // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847 lm_ggml_mul_mat_set_prec(kq, LM_GGML_PREC_F32); @@ -8302,12 +9000,180 @@ static struct lm_ggml_tensor * llm_build_kv( return cur; } +static struct lm_ggml_tensor * llm_build_copy_mask_state( + struct lm_ggml_context * ctx, + struct lm_ggml_cgraph * graph, + struct lm_ggml_tensor * s, + struct lm_ggml_tensor * state_copy, + struct lm_ggml_tensor * state_mask, + int32_t n_state, + int32_t kv_size, + int32_t kv_head, + int32_t n_kv, + int32_t n_seqs) { + struct lm_ggml_tensor * states = lm_ggml_reshape_2d(ctx, s, n_state, kv_size); + + // copy states + // NOTE: assuming the copy destinations are ALL contained between kv_head and kv_head + n_kv + // this shrinks the tensors's ne[1] to n_kv + states = lm_ggml_get_rows(ctx, states, state_copy); + + // clear states of sequences which are starting at the beginning of this batch + // FIXME: zero-out NANs? 
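The graph code around this point gathers and masks the recurrent states; as a rough CPU analogue (plain vectors standing in for ggml tensors, sizes and values made up), the per-cell states are gathered through the copy indices and then multiplied by a 0/1 mask so that sequences starting in this ubatch begin from a zero state.

#include <cstdio>
#include <vector>

int main() {
    const int n_state = 4;
    // one state row per cache cell (illustrative values)
    std::vector<std::vector<float>> cell_states = {
        {1, 1, 1, 1}, {2, 2, 2, 2}, {3, 3, 3, 3},
    };
    std::vector<int>   copy_ids = {2, 0};     // which source cell feeds each slot
    std::vector<float> mask     = {1.f, 0.f}; // 0 => sequence starts in this batch

    std::vector<std::vector<float>> states(copy_ids.size(), std::vector<float>(n_state));
    for (size_t i = 0; i < copy_ids.size(); ++i) {
        for (int j = 0; j < n_state; ++j) {
            states[i][j] = cell_states[copy_ids[i]][j] * mask[i]; // gather, then mask
        }
    }
    for (const auto & row : states) {
        for (float v : row) printf("%g ", v);
        printf("\n");
    }
    return 0;
}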
+ states = lm_ggml_mul(ctx, states, state_mask); + + // copy states which won't be changed further (between n_seqs and n_rs) + lm_ggml_build_forward_expand(graph, + lm_ggml_cpy(ctx, + lm_ggml_view_1d(ctx, states, n_state*(n_kv - n_seqs), n_seqs*n_state*lm_ggml_element_size(states)), + lm_ggml_view_1d(ctx, s, n_state*(n_kv - n_seqs), (kv_head + n_seqs)*n_state*lm_ggml_element_size(s)))); + + // the part of the states that will be used and modified + return lm_ggml_view_2d(ctx, states, n_state, n_seqs, states->nb[1], 0); +} + +// TODO: split +static struct lm_ggml_tensor * llm_build_mamba( + struct lm_ggml_context * ctx, + struct llama_context & lctx, + const llama_ubatch & batch, + struct lm_ggml_cgraph * graph, + struct lm_ggml_tensor * cur, + struct lm_ggml_tensor * state_copy, + struct lm_ggml_tensor * state_mask, + int32_t kv_head, + int32_t n_kv, + const llm_build_cb & cb, + int il) { + const llama_model & model = lctx.model; + const llama_hparams & hparams = model.hparams; + const llama_kv_cache & kv = lctx.kv_self; + const int64_t d_conv = hparams.ssm_d_conv; + const int64_t d_inner = hparams.ssm_d_inner; + const int64_t d_state = hparams.ssm_d_state; + const int64_t dt_rank = hparams.ssm_dt_rank; + const int64_t n_seqs = batch.n_seqs; + // Some variants of Mamba arch (e.g. FalconMamba do apply layer norm on B and Dt layers) + const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms; + // Use the same RMS norm as the final layer norm + const float norm_rms_eps = hparams.f_norm_rms_eps; + + const int64_t n_seq_tokens = batch.n_seq_tokens; + + LM_GGML_ASSERT(n_seqs != 0); + LM_GGML_ASSERT(batch.equal_seqs); + LM_GGML_ASSERT(batch.n_tokens == n_seq_tokens * n_seqs); + + struct lm_ggml_tensor * conv_states_all = kv.k_l[il]; + struct lm_ggml_tensor * ssm_states_all = kv.v_l[il]; + + // (ab)using the KV cache to store the states + struct lm_ggml_tensor * conv = llm_build_copy_mask_state(ctx, + graph, conv_states_all, state_copy, state_mask, + hparams.n_embd_k_s(), kv.size, kv_head, n_kv, n_seqs); + conv = lm_ggml_reshape_3d(ctx, conv, d_conv - 1, d_inner, n_seqs); + struct lm_ggml_tensor * ssm = llm_build_copy_mask_state(ctx, + graph, ssm_states_all, state_copy, state_mask, + hparams.n_embd_v_s(), kv.size, kv_head, n_kv, n_seqs); + ssm = lm_ggml_reshape_3d(ctx, ssm, d_state, d_inner, n_seqs); + + // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs} + cur = lm_ggml_reshape_3d(ctx, cur, cur->ne[0], n_seq_tokens, n_seqs); + + // {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs} + struct lm_ggml_tensor * xz = llm_build_lora_mm(lctx, ctx, model.layers[il].ssm_in, cur); + // split the above in two + // => {d_inner, n_seq_tokens, n_seqs} + struct lm_ggml_tensor * x = lm_ggml_view_3d(ctx, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], 0); + struct lm_ggml_tensor * z = lm_ggml_view_3d(ctx, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], d_inner*lm_ggml_element_size(xz)); + + // conv + { + // => {d_conv - 1 + n_seq_tokens, d_inner, n_seqs} + struct lm_ggml_tensor * conv_x = lm_ggml_concat(ctx, conv, lm_ggml_transpose(ctx, x), 0); + + // copy last (d_conv - 1) columns back into the state cache + struct lm_ggml_tensor * last_conv = lm_ggml_view_3d(ctx, conv_x, d_conv - 1, d_inner, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0])); + + lm_ggml_build_forward_expand(graph, + lm_ggml_cpy(ctx, last_conv, + lm_ggml_view_1d(ctx, conv_states_all, + (d_conv - 1)*(d_inner)*(n_seqs), + kv_head*(d_conv - 
1)*(d_inner)*lm_ggml_element_size(conv_states_all)))); + + // 1D convolution + // The equivalent is to make a self-overlapping view of conv_x + // over d_conv columns at each stride in the 3rd dimension, + // then element-wise multiply that with the conv1d weight, + // then sum the elements of each row, + // (the last two steps are a dot product over rows (also doable with mul_mat)) + // then permute away the ne[0] dimension, + // and then you're left with the resulting x tensor. + // For simultaneous sequences, all sequences need to have the same length. + x = lm_ggml_ssm_conv(ctx, conv_x, model.layers[il].ssm_conv1d); + + // bias + x = lm_ggml_add(ctx, x, model.layers[il].ssm_conv1d_b); + + x = lm_ggml_silu(ctx, x); + } + + // ssm + { + // {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs} + struct lm_ggml_tensor * x_db = llm_build_lora_mm(lctx, ctx, model.layers[il].ssm_x, x); + // split + struct lm_ggml_tensor * dt = lm_ggml_view_3d(ctx, x_db, dt_rank, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], 0); + struct lm_ggml_tensor * B = lm_ggml_view_3d(ctx, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], lm_ggml_element_size(x_db)*dt_rank); + struct lm_ggml_tensor * C = lm_ggml_view_3d(ctx, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], lm_ggml_element_size(x_db)*(dt_rank+d_state)); + + // Some Mamba variants (e.g. FalconMamba) apply RMS norm in B, C & Dt layers + if (ssm_dt_b_c_rms) { + dt = lm_ggml_rms_norm(ctx, dt, norm_rms_eps); + B = lm_ggml_rms_norm(ctx, B, norm_rms_eps); + C = lm_ggml_rms_norm(ctx, C, norm_rms_eps); + } + + // {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs} + dt = llm_build_lora_mm(lctx, ctx, model.layers[il].ssm_dt, dt); + dt = lm_ggml_add(ctx, dt, model.layers[il].ssm_dt_b); + + // Custom operator to optimize the parallel associative scan + // as described in the Annex D of the Mamba paper. 
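Conceptually, the scan operator referenced above evaluates the discretized state-space recurrence token by token. Below is a scalar reference version for a single inner channel, with illustrative values only; the real kernel fuses this over all channels and sequences, and dt, B and C are produced per token with further transforms applied to dt.

#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const int d_state = 4;
    std::vector<float> h(d_state, 0.0f);         // hidden state, persisted in the cache
    std::vector<float> A(d_state, -1.0f);        // per-state coefficients
    std::vector<float> x  = {0.5f, 1.0f, 0.25f}; // inputs for 3 tokens
    std::vector<float> dt = {0.1f, 0.2f, 0.1f};  // per-token step sizes
    std::vector<float> B(d_state, 1.0f), C(d_state, 1.0f); // per-token in the real model

    for (size_t t = 0; t < x.size(); ++t) {
        float y = 0.0f;
        for (int s = 0; s < d_state; ++s) {
            h[s] = std::exp(dt[t] * A[s]) * h[s] + dt[t] * B[s] * x[t]; // state update
            y   += C[s] * h[s];                                         // readout
        }
        printf("y[%zu] = %f\n", t, y);
    }
    return 0;
}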
+ // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs} + struct lm_ggml_tensor * y_ssm = lm_ggml_ssm_scan(ctx, ssm, x, dt, model.layers[il].ssm_a, B, C); + + // store last states + lm_ggml_build_forward_expand(graph, + lm_ggml_cpy(ctx, + lm_ggml_view_1d(ctx, y_ssm, d_state*d_inner*n_seqs, x->nb[3]), + lm_ggml_view_1d(ctx, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*lm_ggml_element_size(ssm_states_all)))); + + struct lm_ggml_tensor * y = lm_ggml_view_3d(ctx, y_ssm, d_inner, n_seq_tokens, n_seqs, x->nb[1], x->nb[2], 0); + + // TODO: skip computing output earlier for unused tokens + + // {d_inner, n_seq_tokens, n_seqs} * {d_inner} => {d_inner, n_seq_tokens, n_seqs} + y = lm_ggml_add(ctx, y, lm_ggml_mul(ctx, x, model.layers[il].ssm_d)); + y = lm_ggml_mul(ctx, y, lm_ggml_silu(ctx, lm_ggml_cont(ctx, z))); + + // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs} + cur = llm_build_lora_mm(lctx, ctx, model.layers[il].ssm_out, y); + } + + // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens} + cur = lm_ggml_reshape_2d(ctx, cur, cur->ne[0], n_seq_tokens * n_seqs); + cb(cur, "mamba_out", il); + + return cur; +} + struct llm_build_context { const llama_model & model; llama_context & lctx; const llama_hparams & hparams; const llama_cparams & cparams; - const llama_batch & batch; + const llama_ubatch & batch; const llama_kv_cache & kv_self; const int64_t n_embd; @@ -8353,7 +9219,7 @@ struct llm_build_context { // TODO: consider making the entire interface noexcept llm_build_context( llama_context & lctx, - const llama_batch & batch, + const llama_ubatch & batch, const llm_build_cb & cb, bool worst_case) : model (lctx.model), @@ -8460,29 +9326,6 @@ struct llm_build_context { return gf; } - struct lm_ggml_cgraph * build_s_copy() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); - - LM_GGML_ASSERT(kv_self.recurrent); - - struct lm_ggml_tensor * state_copy = build_inp_s_copy(); - - for (int il = 0; il < n_layer; ++il) { - struct lm_ggml_tensor * conv_states = lm_ggml_reshape_2d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s(), kv_self.size); - struct lm_ggml_tensor * ssm_states = lm_ggml_reshape_2d(ctx0, kv_self.v_l[il], hparams.n_embd_v_s(), kv_self.size); - - conv_states = lm_ggml_get_rows(ctx0, conv_states, state_copy); - ssm_states = lm_ggml_get_rows(ctx0, ssm_states, state_copy); - - // TODO: name the intermediate tensors with cb() - - lm_ggml_build_forward_expand(gf, lm_ggml_cpy(ctx0, conv_states, kv_self.k_l[il])); - lm_ggml_build_forward_expand(gf, lm_ggml_cpy(ctx0, ssm_states, kv_self.v_l[il])); - } - - return gf; - } - struct lm_ggml_cgraph * build_defrag(const std::vector & ids) { struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); @@ -8617,7 +9460,7 @@ struct llm_build_context { } struct lm_ggml_tensor * build_inp_s_copy() { - lctx.inp_s_copy = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, kv_self.size); + lctx.inp_s_copy = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_kv); cb(lctx.inp_s_copy, "inp_s_copy", -1); lm_ggml_set_input(lctx.inp_s_copy); return lctx.inp_s_copy; @@ -8630,13 +9473,6 @@ struct llm_build_context { return lctx.inp_s_mask; } - struct lm_ggml_tensor * build_inp_s_seq() { - lctx.inp_s_seq = lm_ggml_new_tensor_2d(ctx0, LM_GGML_TYPE_I32, n_kv, n_tokens); - cb(lctx.inp_s_seq, "inp_s_seq", -1); - lm_ggml_set_input(lctx.inp_s_seq); - return lctx.inp_s_seq; - } - struct lm_ggml_cgraph * append_pooling(struct 
lm_ggml_cgraph * gf) { // find result_norm tensor for input struct lm_ggml_tensor * inp = nullptr; @@ -11966,125 +12802,31 @@ struct llm_build_context { struct lm_ggml_cgraph * build_mamba() { struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); - const int64_t d_model = n_embd; - const int64_t d_conv = hparams.ssm_d_conv; - const int64_t d_inner = hparams.ssm_d_inner; - LM_GGML_ASSERT(2 * d_model == d_inner); - const int64_t d_state = hparams.ssm_d_state; - const int64_t dt_rank = hparams.ssm_dt_rank; - struct lm_ggml_tensor * cur; struct lm_ggml_tensor * inpL; // {n_embd, n_tokens} inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + struct lm_ggml_tensor * state_copy = build_inp_s_copy(); struct lm_ggml_tensor * state_mask = build_inp_s_mask(); - struct lm_ggml_tensor * state_seq = build_inp_s_seq(); for (int il = 0; il < n_layer; ++il) { - // (ab)using the KV cache to store the states - struct lm_ggml_tensor * conv_states = lm_ggml_reshape_2d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s(), kv_self.size); - struct lm_ggml_tensor * ssm_states = lm_ggml_reshape_2d(ctx0, kv_self.v_l[il], hparams.n_embd_v_s(), kv_self.size); - - // clear states of sequences which are starting at the beginning of this batch - { - conv_states = lm_ggml_mul(ctx0, - lm_ggml_view_2d(ctx0, conv_states, conv_states->ne[0], n_kv, conv_states->nb[1], kv_head*conv_states->nb[1]), - state_mask); - ssm_states = lm_ggml_mul(ctx0, - lm_ggml_view_2d(ctx0, ssm_states, ssm_states->ne[0], n_kv, ssm_states->nb[1], kv_head*ssm_states->nb[1]), - state_mask); - } - - conv_states = lm_ggml_reshape_3d(ctx0, conv_states, d_conv - 1, d_inner, n_kv); - ssm_states = lm_ggml_reshape_3d(ctx0, ssm_states, d_state, d_inner, n_kv); - // norm cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, cb, il); cb(cur, "attn_norm", il); - // {n_embd, 2*d_inner} * {n_embd, n_tokens} => {2*d_inner, n_tokens} - struct lm_ggml_tensor * xz = llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_in, cur); - // split the above in two - // => {d_inner, n_tokens} - struct lm_ggml_tensor * x = lm_ggml_view_2d(ctx0, xz, d_inner, xz->ne[1], xz->nb[1], 0); - struct lm_ggml_tensor * z = lm_ggml_view_2d(ctx0, xz, d_inner, xz->ne[1], xz->nb[1], lm_ggml_element_size(xz)*d_inner); - - // conv - { - // Custom operator which is needed only to ease simultaneous sequence processing. - // For a single sequence, the equivalent is to concatenate the columns of conv_states and x, - // then make a self-overlapping view of that over d_conv columns at each stride in the 3rd dimension, - // then element-wise multiply that with the conv1d weigth, - // then sum the elements of each row, - // (the last two steps are a dot product over rows (also doable with mul_mat)) - // then permute away the ne[0] dimension, - // and then you're left with the resulting x tensor. - // The new conv_states is the last (d_conv - 1) columns - // of the last 3rd dimensional "layer" of the self-overlapping view. - // For simultaneous sequences, it's more complicated. 
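The removed comment above describes the old fused ssm_conv; the new builder forms the same causal depthwise convolution from a concat of the cached columns and the new tokens. A per-channel sketch with made-up sizes and weights:

#include <cstdio>
#include <vector>

int main() {
    const int d_conv = 4;
    std::vector<float> state = {0.f, 0.f, 0.f};          // last d_conv - 1 inputs (from the cache)
    std::vector<float> w     = {0.1f, 0.2f, 0.3f, 0.4f}; // conv1d weights for this channel
    std::vector<float> x     = {1.f, 2.f, 3.f};          // new tokens for this channel

    std::vector<float> window(state);                    // d_conv - 1 + n_tokens columns
    window.insert(window.end(), x.begin(), x.end());

    for (size_t t = 0; t < x.size(); ++t) {
        float y = 0.f;
        for (int k = 0; k < d_conv; ++k) {
            y += w[k] * window[t + k];                   // dot product over a d_conv-wide window
        }
        printf("y[%zu] = %f\n", t, y);
    }
    // the last d_conv - 1 columns of the window become the new conv state
    return 0;
}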
- struct lm_ggml_tensor * x_conv = lm_ggml_ssm_conv(ctx0, conv_states, x, model.layers[il].ssm_conv1d, state_seq); - - // store last (d_conv - 1) columns of the conv_state part of x_conv back into the KV cache - lm_ggml_build_forward_expand(gf, - lm_ggml_cpy(ctx0, - lm_ggml_view_2d(ctx0, x_conv, d_conv - 1, d_inner*n_kv, d_conv*lm_ggml_element_size(x_conv), (1+d_inner*n_tokens)*lm_ggml_element_size(x_conv)), - lm_ggml_view_1d(ctx0, kv_self.k_l[il], (d_conv - 1)*(d_inner)*(n_kv), kv_head*(d_conv - 1)*(d_inner)*lm_ggml_element_size(x_conv)))); - - // extract x from x_conv - x = lm_ggml_view_2d(ctx0, x_conv, d_inner, n_tokens, d_inner*lm_ggml_element_size(x_conv), 0); - - // bias - x = lm_ggml_add(ctx0, x, model.layers[il].ssm_conv1d_b); - - x = lm_ggml_silu(ctx0, x); - } - - // ssm - { - // {d_inner, dt_rank + 2*d_state} * {d_inner, n_tokens} => {dt_rank + 2*d_state, n_tokens} - struct lm_ggml_tensor * x_db = llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_x, x); - // split - struct lm_ggml_tensor * dt = lm_ggml_view_2d(ctx0, x_db, dt_rank, n_tokens, x_db->nb[1], 0); - struct lm_ggml_tensor * B = lm_ggml_view_2d(ctx0, x_db, d_state, n_tokens, x_db->nb[1], lm_ggml_element_size(x_db)*dt_rank); - struct lm_ggml_tensor * C = lm_ggml_view_2d(ctx0, x_db, d_state, n_tokens, x_db->nb[1], lm_ggml_element_size(x_db)*(dt_rank+d_state)); - - // {dt_rank, d_inner} * {dt_rank, n_tokens} => {d_inner, n_tokens} - dt = llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_dt, dt); - dt = lm_ggml_add(ctx0, dt, model.layers[il].ssm_dt_b); - - // Custom operator to optimize the parallel associative scan - // as described in the Annex D of the Mamba paper. - // => {d_inner, n_tokens} and {d_state, d_inner, n_kv} combined, - // because only a single tensor can be returned. 
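After the scan, both the old and the new builders mix the scan output with the skip connection x*D and gate it with silu(z). Element-wise, for a single value (illustrative numbers only):

#include <cmath>
#include <cstdio>

static float silu(float v) { return v / (1.0f + std::exp(-v)); }

int main() {
    float x = 0.5f, y = 0.8f, z = -0.2f; // one element of the d_inner vectors
    const float D = 1.0f;                // per-channel skip weight
    float out = (y + x * D) * silu(z);   // gated output fed into ssm_out
    printf("%f\n", out);
    return 0;
}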
- struct lm_ggml_tensor * y_ssm_states = lm_ggml_ssm_scan(ctx0, ssm_states, x, dt, model.layers[il].ssm_a, B, C, state_seq); - - // store last states (the second part of y_ssm_states) - lm_ggml_build_forward_expand(gf, - lm_ggml_cpy(ctx0, - lm_ggml_view_1d(ctx0, y_ssm_states, d_state*d_inner*n_kv, d_inner*n_tokens*lm_ggml_element_size(y_ssm_states)), - lm_ggml_view_1d(ctx0, kv_self.v_l[il], d_state*d_inner*n_kv, kv_head*d_state*d_inner*lm_ggml_element_size(ssm_states)))); - - struct lm_ggml_tensor * y = lm_ggml_view_2d(ctx0, y_ssm_states, d_inner, n_tokens, d_inner*lm_ggml_element_size(y_ssm_states), 0); - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids(); - x = lm_ggml_get_rows(ctx0, x, inp_out_ids); - y = lm_ggml_get_rows(ctx0, y, inp_out_ids); - z = lm_ggml_get_rows(ctx0, z, inp_out_ids); - inpL = lm_ggml_get_rows(ctx0, inpL, inp_out_ids); - } - - // {d_inner, n_tokens} * {d_inner} => {d_inner, n_tokens} - y = lm_ggml_add(ctx0, y, lm_ggml_mul(ctx0, x, model.layers[il].ssm_d)); - y = lm_ggml_mul(ctx0, y, lm_ggml_silu(ctx0, z)); + cur = llm_build_mamba(ctx0, lctx, batch, gf, cur, + state_copy, state_mask, + kv_head, n_kv, cb, il); - // {d_inner, n_embd} * {d_inner, n_tokens} => {n_embd, n_tokens} - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_out, y); + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = lm_ggml_get_rows(ctx0, inpL, inp_out_ids); } // residual @@ -13157,7 +13899,7 @@ struct llm_build_context { return gf; } - struct lm_ggml_cgraph * build_t5() { + struct lm_ggml_cgraph * build_t5_encoder() { struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); // mutable variable, needed during the last layer of the computation to skip unused tokens @@ -13172,303 +13914,323 @@ struct llm_build_context { inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); - if (lctx.is_encoding) { - struct lm_ggml_tensor * pos_bucket_enc = llm_build_pos_bucket(false); - - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct lm_ggml_tensor * KQ_mask_enc = build_inp_KQ_mask(false); + LM_GGML_ASSERT(lctx.is_encoding); + struct lm_ggml_tensor * pos_bucket_enc = llm_build_pos_bucket(false); - for (int il = 0; il < n_layer; ++il) { - struct lm_ggml_tensor * inpSA = inpL; + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct lm_ggml_tensor * KQ_mask_enc = build_inp_KQ_mask(false); - // norm - cur = llm_build_norm(ctx0, inpL, hparams, - model.layers[il].attn_norm_enc, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "attn_norm", il); + for (int il = 0; il < n_layer; ++il) { + struct lm_ggml_tensor * inpSA = inpL; - // self-attention - { - struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq_enc, cur); - cb(Qcur, "Qcur", il); + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm_enc, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); - struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk_enc, cur); - cb(Kcur, "Kcur", il); + // self-attention + { + struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_enc, cur); + cb(Qcur, "Qcur", il); - struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv_enc, cur); - cb(Vcur, "Vcur", il); + struct lm_ggml_tensor * Kcur = 
llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_enc, cur); + cb(Kcur, "Kcur", il); - Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_enc, cur); + cb(Vcur, "Vcur", il); - struct lm_ggml_tensor * q = lm_ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - struct lm_ggml_tensor * k = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); + Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - struct lm_ggml_tensor * kq = lm_ggml_mul_mat(ctx0, k, q); - cb(kq, "kq", il); + struct lm_ggml_tensor * q = lm_ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + struct lm_ggml_tensor * k = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); - struct lm_ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc; - struct lm_ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_enc, attn_rel_b); - struct lm_ggml_tensor * kq_b = lm_ggml_add(ctx0, kq, pos_bias); - cb(kq_b, "kq_b", il); + struct lm_ggml_tensor * kq = lm_ggml_mul_mat(ctx0, k, q); + cb(kq, "kq", il); - kq = lm_ggml_soft_max_ext(ctx0, kq_b, KQ_mask_enc, 1.0f, hparams.f_max_alibi_bias); - cb(kq, "kq_soft_max_ext", il); + struct lm_ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc; + struct lm_ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_enc, attn_rel_b); + struct lm_ggml_tensor * kq_b = lm_ggml_add(ctx0, kq, pos_bias); + cb(kq_b, "kq_b", il); - struct lm_ggml_tensor * v = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, lm_ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); - cb(v, "v", il); + kq = lm_ggml_soft_max_ext(ctx0, kq_b, KQ_mask_enc, 1.0f, hparams.f_max_alibi_bias); + cb(kq, "kq_soft_max_ext", il); - struct lm_ggml_tensor * kqv = lm_ggml_mul_mat(ctx0, lm_ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq); - cb(kqv, "kqv", il); + struct lm_ggml_tensor * v = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, lm_ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); + cb(v, "v", il); - struct lm_ggml_tensor * kqv_merged = lm_ggml_permute(ctx0, kqv, 0, 2, 1, 3); - cb(kqv_merged, "kqv_merged", il); + struct lm_ggml_tensor * kqv = lm_ggml_mul_mat(ctx0, lm_ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq); + cb(kqv, "kqv", il); - cur = lm_ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); - cb(cur, "kqv_merged_cont", il); + struct lm_ggml_tensor * kqv_merged = lm_ggml_permute(ctx0, kqv, 0, 2, 1, 3); + cb(kqv_merged, "kqv_merged", il); - lm_ggml_build_forward_expand(gf, cur); + cur = lm_ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); + cb(cur, "kqv_merged_cont", il); - cur = lm_ggml_mul_mat(ctx0, model.layers[il].wo_enc, cur); - cb(cur, "kqv_out", il); - } + lm_ggml_build_forward_expand(gf, cur); - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; - cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids); - } + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_enc, cur); + cb(cur, "kqv_out", il); + } - struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); + if (il == n_layer - 1) { + // skip computing output for 
unused tokens + struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids(); + n_tokens = n_outputs; + cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids); + } - // feed-forward network - { - cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].ffn_norm_enc, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); + struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); - // T5 uses relu, flan-T5 uses gelu-gated - cur = llm_build_ffn(ctx0, lctx, cur, - model.layers[il].ffn_up_enc, NULL, NULL, - model.layers[il].ffn_gate_enc, NULL, NULL, - model.layers[il].ffn_down_enc, NULL, NULL, - NULL, - model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, - model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ, - cb, il); - cb(cur, "ffn_out", il); - } + // feed-forward network + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm_enc, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); - cur = lm_ggml_add(ctx0, cur, ffn_inp); + // T5 uses relu, flan-T5 uses gelu-gated + cur = llm_build_ffn(ctx0, lctx, cur, + model.layers[il].ffn_up_enc, NULL, NULL, + model.layers[il].ffn_gate_enc, NULL, NULL, + model.layers[il].ffn_down_enc, NULL, NULL, + NULL, + model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, + model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ, + cb, il); cb(cur, "ffn_out", il); + } - lm_ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); - if (layer_dir != nullptr) { - cur = lm_ggml_add(ctx0, cur, layer_dir); - } - cb(cur, "l_out", il); + cur = lm_ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); - // input for next layer - inpL = cur; + lm_ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); + if (layer_dir != nullptr) { + cur = lm_ggml_add(ctx0, cur, layer_dir); } + cb(cur, "l_out", il); - cur = inpL; - cb(cur, "result_embd", -1); + // input for next layer + inpL = cur; + } - cur = llm_build_norm(ctx0, cur, hparams, - model.output_norm_enc, NULL, - LLM_NORM_RMS, cb, -1); - cb(cur, "result_norm", -1); - } else { - LM_GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first"); + cur = inpL; + cb(cur, "result_embd", -1); - struct lm_ggml_tensor * embd_enc = llm_build_inp_embd_enc(); - struct lm_ggml_tensor * pos_bucket_dec = llm_build_pos_bucket(true); + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm_enc, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); - struct lm_ggml_tensor * KQ_mask_dec = build_inp_KQ_mask(); - struct lm_ggml_tensor * KQ_mask_cross = llm_build_inp_KQ_mask_cross(); + lm_ggml_build_forward_expand(gf, cur); - for (int il = 0; il < n_layer; ++il) { - struct lm_ggml_tensor * inpSA = inpL; + return gf; + } - // norm - cur = llm_build_norm(ctx0, inpL, hparams, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "attn_norm", il); + struct lm_ggml_cgraph * build_t5_decoder() { + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); - // self-attention - { - struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); + // mutable variable, needed during the last layer of the computation to skip unused tokens + int32_t n_tokens = this->n_tokens; - struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + LM_GGML_ASSERT(n_embd_head == 
hparams.n_embd_head_k); - struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); + struct lm_ggml_tensor * cur; + struct lm_ggml_tensor * inpL; - llm_build_kv_store(ctx0, hparams, cparams, kv_self, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); - struct lm_ggml_tensor * k = - lm_ggml_view_3d(ctx0, kv_self.k_l[il], - n_embd_head_k, n_kv, n_head_kv, - lm_ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - lm_ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), - 0); - cb(k, "k", il); + LM_GGML_ASSERT(!lctx.is_encoding); + LM_GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first"); - struct lm_ggml_tensor * v = - lm_ggml_view_3d(ctx0, kv_self.v_l[il], - n_kv, n_embd_head_v, n_head_kv, - lm_ggml_element_size(kv_self.v_l[il])*n_ctx, - lm_ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v, - 0); - cb(v, "v", il); + struct lm_ggml_tensor * embd_enc = llm_build_inp_embd_enc(); + struct lm_ggml_tensor * pos_bucket_dec = llm_build_pos_bucket(true); - Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + struct lm_ggml_tensor * KQ_mask_dec = build_inp_KQ_mask(); + struct lm_ggml_tensor * KQ_mask_cross = llm_build_inp_KQ_mask_cross(); - struct lm_ggml_tensor * q = lm_ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + for (int il = 0; il < n_layer; ++il) { + struct lm_ggml_tensor * inpSA = inpL; - struct lm_ggml_tensor * kq = lm_ggml_mul_mat(ctx0, k, q); - cb(kq, "kq", il); + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); - struct lm_ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b; - struct lm_ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_dec, attn_rel_b); - struct lm_ggml_tensor * kq_b = lm_ggml_add(ctx0, kq, pos_bias); - cb(kq_b, "kq_b", il); + // self-attention + { + struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); - kq = lm_ggml_soft_max_ext(ctx0, kq_b, KQ_mask_dec, 1.0f, hparams.f_max_alibi_bias); - cb(kq, "kq_soft_max_ext", il); + struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); - struct lm_ggml_tensor * kqv = lm_ggml_mul_mat(ctx0, v, kq); - cb(kqv, "kqv", il); + struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); - struct lm_ggml_tensor * kqv_merged = lm_ggml_permute(ctx0, kqv, 0, 2, 1, 3); - cb(kqv_merged, "kqv_merged", il); + llm_build_kv_store(ctx0, hparams, cparams, kv_self, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - cur = lm_ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); - cb(cur, "kqv_merged_cont", il); + struct lm_ggml_tensor * k = + lm_ggml_view_3d(ctx0, kv_self.k_l[il], + n_embd_head_k, n_kv, n_head_kv, + lm_ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + lm_ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), + 0); + cb(k, "k", il); - lm_ggml_build_forward_expand(gf, cur); + struct lm_ggml_tensor * v = + lm_ggml_view_3d(ctx0, kv_self.v_l[il], + n_kv, n_embd_head_v, n_head_kv, + lm_ggml_element_size(kv_self.v_l[il])*n_ctx, + lm_ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v, + 0); + cb(v, "v", il); - cur = lm_ggml_mul_mat(ctx0, model.layers[il].wo, cur); - cb(cur, "kqv_out", il); - } + Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = 
lm_ggml_add(ctx0, cur, inpSA); - cb(cur, "cross_inp", il); + struct lm_ggml_tensor * q = lm_ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - struct lm_ggml_tensor * inpCA = cur; + struct lm_ggml_tensor * kq = lm_ggml_mul_mat(ctx0, k, q); + cb(kq, "kq", il); - // norm - cur = llm_build_norm(ctx0, cur, hparams, - model.layers[il].attn_norm_cross, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "attn_norm_cross", il); + struct lm_ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b; + struct lm_ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_dec, attn_rel_b); + struct lm_ggml_tensor * kq_b = lm_ggml_add(ctx0, kq, pos_bias); + cb(kq_b, "kq_b", il); - // cross-attention - { - struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq_cross, cur); - cb(Qcur, "Qcur", il); + kq = lm_ggml_soft_max_ext(ctx0, kq_b, KQ_mask_dec, 1.0f, hparams.f_max_alibi_bias); + cb(kq, "kq_soft_max_ext", il); - struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk_cross, embd_enc); - cb(Kcur, "Kcur", il); + struct lm_ggml_tensor * kqv = lm_ggml_mul_mat(ctx0, v, kq); + cb(kqv, "kqv", il); - struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv_cross, embd_enc); - cb(Vcur, "Vcur", il); + struct lm_ggml_tensor * kqv_merged = lm_ggml_permute(ctx0, kqv, 0, 2, 1, 3); + cb(kqv_merged, "kqv_merged", il); - Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc); + cur = lm_ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); + cb(cur, "kqv_merged_cont", il); - struct lm_ggml_tensor * q = lm_ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - struct lm_ggml_tensor * k = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); + lm_ggml_build_forward_expand(gf, cur); - struct lm_ggml_tensor * kq = lm_ggml_mul_mat(ctx0, k, q); - cb(kq, "kq", il); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur); + cb(cur, "kqv_out", il); + } - kq = lm_ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias); - cb(kq, "kq_soft_max_ext", il); + cur = lm_ggml_add(ctx0, cur, inpSA); + cb(cur, "cross_inp", il); - struct lm_ggml_tensor * v = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, lm_ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc))); - cb(v, "v", il); + struct lm_ggml_tensor * inpCA = cur; - struct lm_ggml_tensor * kqv = lm_ggml_mul_mat(ctx0, lm_ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq); - cb(kqv, "kqv", il); + // norm + cur = llm_build_norm(ctx0, cur, hparams, + model.layers[il].attn_norm_cross, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm_cross", il); - struct lm_ggml_tensor * kqv_merged = lm_ggml_permute(ctx0, kqv, 0, 2, 1, 3); - cb(kqv_merged, "kqv_merged", il); + // cross-attention + { + struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_cross, cur); + cb(Qcur, "Qcur", il); - cur = lm_ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); - cb(cur, "kqv_merged_cont", il); + struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_cross, embd_enc); + cb(Kcur, "Kcur", il); - lm_ggml_build_forward_expand(gf, cur); + struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_cross, embd_enc); + cb(Vcur, "Vcur", il); - cur = lm_ggml_mul_mat(ctx0, model.layers[il].wo_cross, cur); - cb(cur, "kqv_out", il); - } + Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = lm_ggml_reshape_3d(ctx0, 
Kcur, n_embd_head, n_head_kv, n_outputs_enc); - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; - cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids); - inpCA = lm_ggml_get_rows(ctx0, inpCA, inp_out_ids); - } + struct lm_ggml_tensor * q = lm_ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + struct lm_ggml_tensor * k = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); - struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpCA); - cb(ffn_inp, "ffn_inp", il); + struct lm_ggml_tensor * kq = lm_ggml_mul_mat(ctx0, k, q); + cb(kq, "kq", il); - // feed-forward network - { - cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); + kq = lm_ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias); + cb(kq, "kq_soft_max_ext", il); - // T5 uses relu, flan-T5 uses gelu-gated - cur = llm_build_ffn(ctx0, lctx, cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, - model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ, - cb, il); - cb(cur, "ffn_out", il); - } + struct lm_ggml_tensor * v = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, lm_ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc))); + cb(v, "v", il); - cur = lm_ggml_add(ctx0, cur, ffn_inp); - cb(cur, "ffn_out", il); + struct lm_ggml_tensor * kqv = lm_ggml_mul_mat(ctx0, lm_ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq); + cb(kqv, "kqv", il); - lm_ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); - if (layer_dir != nullptr) { - cur = lm_ggml_add(ctx0, cur, layer_dir); - } - cb(cur, "l_out", il); + struct lm_ggml_tensor * kqv_merged = lm_ggml_permute(ctx0, kqv, 0, 2, 1, 3); + cb(kqv_merged, "kqv_merged", il); - // input for next layer - inpL = cur; + cur = lm_ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); + cb(cur, "kqv_merged_cont", il); + + lm_ggml_build_forward_expand(gf, cur); + + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_cross, cur); + cb(cur, "kqv_out", il); } - cur = inpL; - cb(cur, "result_embd", -1); + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids(); + n_tokens = n_outputs; + cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids); + inpCA = lm_ggml_get_rows(ctx0, inpCA, inp_out_ids); + } - cur = llm_build_norm(ctx0, cur, hparams, - model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); - cb(cur, "result_norm", -1); + struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpCA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + // T5 uses relu, flan-T5 uses gelu-gated + cur = llm_build_ffn(ctx0, lctx, cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, + model.layers[il].ffn_gate_enc ? 
LLM_FFN_PAR : LLM_FFN_SEQ, + cb, il); + cb(cur, "ffn_out", il); + } + + cur = lm_ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); - // lm_head - cur = lm_ggml_mul_mat(ctx0, model.output, cur); - cb(cur, "result_output", -1); + lm_ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); + if (layer_dir != nullptr) { + cur = lm_ggml_add(ctx0, cur, layer_dir); + } + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; } + cur = inpL; + cb(cur, "result_embd", -1); + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cb(cur, "result_output", -1); + lm_ggml_build_forward_expand(gf, cur); return gf; @@ -13666,12 +14428,260 @@ struct llm_build_context { cb(inpL, "l_out", il); } - cur = llm_build_norm(ctx0, inpL, hparams, - model.output_norm, - NULL, + cur = llm_build_norm(ctx0, inpL, hparams, + model.output_norm, + NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cb(cur, "result_output", -1); + + lm_ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct lm_ggml_cgraph * build_nemotron() { + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + //LM_GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct lm_ggml_tensor * cur; + struct lm_ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + + // inp_pos - contains the positions + struct lm_ggml_tensor * inp_pos = build_inp_pos(); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask(); + + for (int il = 0; il < n_layer; ++il) { + struct lm_ggml_tensor * inpSA = inpL; + + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = lm_ggml_rope_ext( + ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = lm_ggml_rope_ext( + ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = llm_build_kv(ctx0, lctx, kv_self, gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + } 
+ + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, lctx, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + NULL, + LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il); + + cur = lm_ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + + cur = lctx.cvec.apply_to(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, model.output_norm_b, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cb(cur, "result_output", -1); + + lm_ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct lm_ggml_cgraph * build_exaone() { + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); + + // mutable variable, needed during the last layer of the computation to skip unused tokens + int32_t n_tokens = this->n_tokens; + + const int64_t n_embd_head = hparams.n_embd_head_v; + LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + LM_GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct lm_ggml_tensor * cur; + struct lm_ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + + // inp_pos - contains the positions + struct lm_ggml_tensor * inp_pos = build_inp_pos(); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask(); + + for (int il = 0; il < n_layer; ++il) { + struct lm_ggml_tensor * inpSA = inpL; + + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // rope freq factors for llama3; may return nullptr for llama2 and other models + struct lm_ggml_tensor * rope_factors = build_rope_factors(il); + + // compute Q and K and RoPE them + struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = lm_ggml_rope_ext( + ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = lm_ggml_rope_ext( + ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, 
rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = llm_build_kv(ctx0, lctx, kv_self, gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids(); + n_tokens = n_outputs; + cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, lctx, cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + + cur = lm_ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + + cur = lctx.cvec.apply_to(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, LLM_NORM_RMS, cb, -1); cb(cur, "result_norm", -1); + // lm_head cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); @@ -13682,8 +14692,8 @@ struct llm_build_context { }; static struct lm_ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector & ids) { - llama_batch dummy; - dummy.n_tokens = 0; + llama_ubatch dummy = {}; + dummy.equal_seqs = true; llm_build_cb cb = [&](struct lm_ggml_tensor * , const char * , int ) { }; @@ -13699,8 +14709,8 @@ static struct lm_ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, co } static struct lm_ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) { - llama_batch dummy; - dummy.n_tokens = 0; + llama_ubatch dummy = {}; + dummy.equal_seqs = true; llm_build_cb cb = [&](struct lm_ggml_tensor * , const char * , int ) { }; @@ -13715,26 +14725,9 @@ static struct lm_ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) { return result; } -static struct lm_ggml_cgraph * llama_build_graph_s_copy(llama_context & lctx) { - llama_batch dummy; - dummy.n_tokens = 0; - - llm_build_cb cb = [&](struct lm_ggml_tensor * , const char * , int ) { }; - - struct llm_build_context llm(lctx, dummy, cb, false); - - llm.init(); - - struct lm_ggml_cgraph * result = llm.build_s_copy(); - - llm.free(); - - return result; -} - static struct lm_ggml_cgraph * llama_build_graph( llama_context & lctx, - const llama_batch & batch, + const llama_ubatch & batch, bool worst_case) { const auto & model = lctx.model; @@ -13920,12 +14913,28 @@ static struct lm_ggml_cgraph * llama_build_graph( } break; case LLM_ARCH_T5: { - result = llm.build_t5(); + if (lctx.is_encoding) { + result = llm.build_t5_encoder(); + } else { + result = llm.build_t5_decoder(); + } + } break; + case LLM_ARCH_T5ENCODER: + { + result = llm.build_t5_encoder(); } break; case LLM_ARCH_JAIS: { result = llm.build_jais(); } break; + case LLM_ARCH_NEMOTRON: + { + result = llm.build_nemotron(); + } break; + case LLM_ARCH_EXAONE: + { + result = llm.build_exaone(); + } break; default: LM_GGML_ABORT("fatal error"); } @@ -13988,7 +14997,7 @@ static int32_t llama_relative_position_bucket(llama_pos x, llama_pos 
y, uint64_t return relative_bucket; } -static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { +static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) { // // set input data // @@ -14027,10 +15036,10 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { for (int i = 0; i < n_tokens; ++i) { data[i] = i; } - } else if (batch.logits) { + } else if (batch.output) { int32_t n_outputs = 0; for (int i = 0; i < n_tokens; ++i) { - if (batch.logits[i]) { + if (batch.output[i]) { data[n_outputs++] = i; } } @@ -14054,8 +15063,10 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { if (lctx.inp_KQ_mask || lctx.inp_KQ_mask_swa) { // NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache. if (cparams.causal_attn && !lctx.is_encoding) { - const int64_t n_kv = kv_self.n; - const int64_t n_tokens = batch.n_tokens; + const int64_t n_kv = kv_self.n; + const int64_t n_tokens = batch.n_tokens; + const int64_t n_seq_tokens = batch.n_seq_tokens; + const int64_t n_seqs = batch.n_seqs; float * data = nullptr; @@ -14075,32 +15086,35 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { // of the correct sequence for each token of the batch. // It's assumed that if a token in the batch has multiple sequences, they are equivalent. for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j][0]; + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = batch.seq_id[s][0]; - for (int i = 0; i < n_kv; ++i) { - float f; - if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) { - f = -INFINITY; - } else { - if (hparams.use_alibi) { - f = -std::abs(lctx.kv_self.cells[i].pos - pos); + for (int j = 0; j < n_seq_tokens; ++j) { + const llama_pos pos = batch.pos[s*n_seq_tokens + j]; + + for (int i = 0; i < n_kv; ++i) { + float f; + if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { + f = -INFINITY; } else { - f = 0.0f; + if (hparams.use_alibi) { + f = -std::abs(kv_self.cells[i].pos - pos); + } else { + f = 0.0f; + } } - } - if (data) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = f; - } + if (data) { + data[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f; + } - // may need to cut off old tokens for sliding window - if (data_swa) { - if (pos - lctx.kv_self.cells[i].pos >= (int32_t)hparams.n_swa) { - f = -INFINITY; + // may need to cut off old tokens for sliding window + if (data_swa) { + if (pos - kv_self.cells[i].pos >= (int32_t)hparams.n_swa) { + f = -INFINITY; + } + data_swa[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f; } - data_swa[h*(n_kv*n_tokens) + j*n_kv + i] = f; } } } @@ -14122,8 +15136,10 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } } else { + const int64_t n_tokens = batch.n_tokens; + const int64_t n_seq_tokens = batch.n_seq_tokens; + const int64_t n_seqs = batch.n_seqs; // when using kv cache, the mask needs to match the kv cache size - const int64_t n_tokens = batch.n_tokens; const int64_t n_stride = hparams.causal_attn && !lctx.is_encoding ? 
kv_self.n : n_tokens; LM_GGML_ASSERT(lm_ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer)); @@ -14131,27 +15147,35 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { float * data = (float *) lctx.inp_KQ_mask->data; for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_seq_id seq_id = batch.seq_id[j][0]; - - for (int i = 0; i < n_tokens; ++i) { - float f = -INFINITY; - for (int s = 0; s < batch.n_seq_id[i]; ++s) { - if (batch.seq_id[i][s] == seq_id) { - if (hparams.use_alibi) { - f = -std::abs(batch.pos[i] - batch.pos[j]); - } else { - f = 0.0f; + for (int s1 = 0; s1 < n_seqs; ++s1) { + const llama_seq_id seq_id = batch.seq_id[s1][0]; + + for (int j = 0; j < n_seq_tokens; ++j) { + const int32_t tj = s1*n_seq_tokens + j; + + for (int s0 = 0; s0 < n_seqs; ++s0) { + for (int i = 0; i < n_seq_tokens; ++i) { + const int32_t ti = s0*n_seq_tokens + i; + float f = -INFINITY; + + for (int s = 0; s < batch.n_seq_id[s0]; ++s) { + if (batch.seq_id[s0][s] == seq_id) { + if (hparams.use_alibi) { + f = -std::abs(batch.pos[ti] - batch.pos[tj]); + } else { + f = 0.0f; + } + break; + } } - break; + + data[h*(n_tokens*n_tokens) + tj*n_stride + ti] = f; } } - data[h*(n_tokens*n_tokens) + j*n_stride + i] = f; - } - - for (int i = n_tokens; i < n_stride; ++i) { - data[h*(n_tokens*n_tokens) + j*n_stride + i] = -INFINITY; + for (int i = n_tokens; i < n_stride; ++i) { + data[h*(n_tokens*n_tokens) + tj*n_stride + i] = -INFINITY; + } } } } @@ -14159,7 +15183,9 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) { - const int64_t n_tokens = batch.n_tokens; + const int64_t n_tokens = batch.n_tokens; + const int64_t n_seq_tokens = batch.n_seq_tokens; + const int64_t n_seqs = batch.n_seqs; LM_GGML_ASSERT(lctx.inp_mean); LM_GGML_ASSERT(lm_ggml_backend_buffer_is_host(lctx.inp_mean->buffer)); @@ -14168,12 +15194,14 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * lm_ggml_element_size(lctx.inp_mean)); std::vector sum(n_tokens, 0); - for (int i = 0; i < n_tokens; ++i) { - const llama_seq_id seq_id = batch.seq_id[i][0]; + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = batch.seq_id[s][0]; + + // TODO: adapt limits to n_seqs when batch.equal_seqs is true LM_GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN"); - sum[seq_id] += 1; + sum[seq_id] += batch.n_seq_tokens; } std::vector div(n_tokens, 0.0f); @@ -14184,14 +15212,19 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } - for (int i = 0; i < n_tokens; ++i) { - const llama_seq_id seq_id = batch.seq_id[i][0]; - data[seq_id*n_tokens + i] = div[seq_id]; + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = batch.seq_id[s][0]; + + for (int i = 0; i < n_seq_tokens; ++i) { + data[seq_id*n_tokens + s*n_seq_tokens + i] = div[seq_id]; + } } } if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_CLS) { - const int64_t n_tokens = batch.n_tokens; + const int64_t n_tokens = batch.n_tokens; + const int64_t n_seq_tokens = batch.n_seq_tokens; + const int64_t n_seqs = batch.n_seqs; LM_GGML_ASSERT(lctx.inp_cls); LM_GGML_ASSERT(lm_ggml_backend_buffer_is_host(lctx.inp_cls->buffer)); @@ -14199,20 +15232,26 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { uint32_t * data = (uint32_t 
*) lctx.inp_cls->data; memset(lctx.inp_cls->data, 0, n_tokens * lm_ggml_element_size(lctx.inp_cls)); - for (int i = 0; i < n_tokens; ++i) { - const llama_seq_id seq_id = batch.seq_id[i][0]; - const llama_pos pos = batch.pos[i]; + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = batch.seq_id[s][0]; + // TODO: adapt limits to n_seqs when batch.equal_seqs is true LM_GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS"); - if (pos == 0) { - data[seq_id] = i; + for (int i = 0; i < n_seq_tokens; ++i) { + const llama_pos pos = batch.pos[s*n_seq_tokens + i]; + + if (pos == 0) { + data[seq_id] = s*n_seq_tokens + i; + } } } } if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) { - const int64_t n_tokens = batch.n_tokens; + const int64_t n_tokens = batch.n_tokens; + const int64_t n_seq_tokens = batch.n_seq_tokens; + const int64_t n_seqs = batch.n_seqs; LM_GGML_ASSERT(lctx.inp_cls); LM_GGML_ASSERT(lm_ggml_backend_buffer_is_host(lctx.inp_cls->buffer)); @@ -14223,15 +15262,19 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { std::vector last_pos(n_tokens, -1); std::vector last_row(n_tokens, -1); - for (int i = 0; i < n_tokens; ++i) { - const llama_seq_id seq_id = batch.seq_id[i][0]; - const llama_pos pos = batch.pos[i]; + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = batch.seq_id[s][0]; + // TODO: adapt limits to n_seqs when batch.equal_seqs is true LM_GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST"); - if (pos >= last_pos[seq_id]) { - last_pos[seq_id] = pos; - last_row[seq_id] = i; + for (int i = 0; i < n_seq_tokens; ++i) { + const llama_pos pos = batch.pos[s*n_seq_tokens + i]; + + if (pos >= last_pos[seq_id]) { + last_pos[seq_id] = pos; + last_row[seq_id] = s*n_seq_tokens + i; + } } } @@ -14249,41 +15292,39 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { LM_GGML_ASSERT(lm_ggml_backend_buffer_is_host(lctx.inp_s_mask->buffer)); float * data = (float *) lctx.inp_s_mask->data; - // states which are not affected by the current batch are left untouched + // clear unused states for (int i = 0; i < n_kv; ++i) { - llama_seq_id seq_id = i + lctx.kv_self.head; - llama_kv_cell & kv_cell = lctx.kv_self.cells[seq_id]; - bool has_self_seq = kv_cell.has_seq_id(seq_id); + uint32_t cell_id = i + kv_self.head; + llama_kv_cell & kv_cell = lctx.kv_self.cells[cell_id]; - data[i] = (float) has_self_seq; + data[i] = (float) (kv_cell.src >= 0); - // ensure current sequences will be kept - if (!has_self_seq && kv_cell.pos >= 0) { - kv_cell.seq_id.insert(seq_id); + // only clear once + if (kv_cell.src < 0) { + kv_cell.src = cell_id; } } } - // For Mamba (and other recurrent architectures), - // update the correct state(s)/sequence(s) for each token of the batch. - // Like with the KQ_mask, if a token in the batch has multiple sequences, - // they are assumed to be equivalent (not here, but in lm_ggml_ssm_scan and lm_ggml_ssm_conv). 
- if (lctx.inp_s_seq) { - const int64_t n_tokens = batch.n_tokens; - LM_GGML_ASSERT(lm_ggml_backend_buffer_is_host(lctx.inp_s_seq->buffer)); - int32_t * data = (int32_t *) lctx.inp_s_seq->data; + if (lctx.inp_s_copy) { + LM_GGML_ASSERT(lm_ggml_backend_buffer_is_host(lctx.inp_s_copy->buffer)); + int32_t * data = (int32_t *) lctx.inp_s_copy->data; - for (int j = 0; j < n_tokens; ++j) { - const int32_t n_seq = batch.n_seq_id[j]; - LM_GGML_ASSERT(0 < n_seq); // a token should be part of at least 1 sequence + // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n + for (uint32_t i = 0; i < n_kv; ++i) { + const uint32_t cell_id = i + kv_self.head; + llama_kv_cell & kv_cell = lctx.kv_self.cells[cell_id]; - for (int i = 0; i < n_kv; ++i) { - if (i < n_seq) { - // for this type of model, the head is the minimum seq_id of the batch - data[j*n_kv + i] = batch.seq_id[j][i] - kv_self.head; - } else { - data[j*n_kv + i] = -1; - } + // prevent out-of-bound sources + if (kv_cell.src < 0 || (uint32_t) kv_cell.src >= kv_self.size) { + kv_cell.src = cell_id; + } + + data[i] = kv_cell.src; + + // ensure copy only happens once + if (kv_cell.src != (int32_t) cell_id) { + kv_cell.src = cell_id; } } } @@ -14293,6 +15334,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { const int64_t n_tokens = batch.n_tokens; LM_GGML_ASSERT(lm_ggml_backend_buffer_is_host(lctx.inp_pos_bucket->buffer)); + LM_GGML_ASSERT(!batch.equal_seqs); // TODO: use batch.n_seqs instead of failing int32_t * data = (int32_t *) lctx.inp_pos_bucket->data; @@ -14328,6 +15370,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { const int64_t n_tokens = batch.n_tokens; LM_GGML_ASSERT(lm_ggml_backend_buffer_is_host(lctx.inp_KQ_mask_cross->buffer)); + LM_GGML_ASSERT(!batch.equal_seqs); // TODO: use batch.n_seqs instead of failing float * data = (float *) lctx.inp_KQ_mask_cross->data; @@ -14368,7 +15411,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) { // TODO: use a per-batch flag for logits presence instead const bool has_logits = !cparams.embeddings; - const bool has_embd = lctx.is_encoding || (cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE)); + const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE); const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0; const size_t embd_size = has_embd ? n_embd*n_outputs_max : 0; @@ -14421,6 +15464,43 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) { return n_outputs_max; } +// make the outputs have the same order they had in the user-provided batch +static void llama_output_reorder(struct llama_context * ctx) { + std::vector & out_ids = ctx->sbatch.out_ids; + if (!out_ids.empty()) { + uint32_t n_vocab = ctx->model.hparams.n_vocab; + uint32_t n_embd = ctx->model.hparams.n_embd; + int32_t n_outputs = ctx->n_outputs; + LM_GGML_ASSERT((size_t) n_outputs == out_ids.size()); + // TODO: is there something more efficient which also minimizes swaps? 
+ // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort) + for (int32_t i = 0; i < n_outputs - 1; ++i) { + int32_t j_min = i; + for (int32_t j = i + 1; j < n_outputs; ++j) { + if (out_ids[j] < out_ids[j_min]) { + j_min = j; + } + } + if (j_min == i) { continue; } + std::swap(out_ids[i], out_ids[j_min]); + if (ctx->logits_size > 0) { + for (uint32_t k = 0; k < n_vocab; k++) { + std::swap(ctx->logits[i*n_vocab + k], ctx->logits[j_min*n_vocab + k]); + } + } + if (ctx->embd_size > 0) { + for (uint32_t k = 0; k < n_embd; k++) { + std::swap(ctx->embd[i*n_embd + k], ctx->embd[j_min*n_embd + k]); + } + } + } + std::fill(ctx->output_ids.begin(), ctx->output_ids.end(), -1); + for (int32_t i = 0; i < n_outputs; ++i) { + ctx->output_ids[out_ids[i]] = i; + } + out_ids.clear(); + } +} static void llama_graph_compute( llama_context & lctx, @@ -14493,15 +15573,11 @@ static int llama_decode_internal( const auto n_ubatch = cparams.n_ubatch; - // TODO: simplify or deprecate - std::vector pos; - std::vector n_seq_id; - std::vector seq_id_arr; - std::vector> seq_id; - // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE; + lctx.embd_seq.clear(); + // count outputs if (batch_all.logits && !embd_pooled) { for (uint32_t i = 0; i < n_tokens_all; ++i) { @@ -14514,55 +15590,42 @@ static int llama_decode_internal( n_outputs = 1; } + lctx.sbatch.from_batch(batch_all, n_embd, + /* simple_split */ !kv_self.recurrent, + /* logits_all */ n_outputs == n_tokens_all); + // reserve output buffer if (llama_output_reserve(lctx, n_outputs) < n_outputs) { LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_outputs); return -2; }; - // set output mappings - if (batch_all.logits) { - int32_t i_logits = 0; - for (uint32_t i = 0; i < n_tokens_all; ++i) { - if (batch_all.logits[i]) { - lctx.output_ids[i] = i_logits++; + while (lctx.sbatch.n_tokens > 0) { + llama_ubatch ubatch; + if (kv_self.recurrent) { + if (embd_pooled) { + // Pooled embeddings cannot be split across ubatches (yet) + ubatch = lctx.sbatch.split_seq(n_ubatch); + } else { + // recurrent model architectures are easier to implement + // with equal-length sequences + ubatch = lctx.sbatch.split_equal(n_ubatch); } + } else { + ubatch = lctx.sbatch.split_simple(n_ubatch); } - } else { - for (uint32_t i = 0; i < n_outputs; ++i) { - lctx.output_ids[i] = i; - } - } - - for (uint32_t cur_token = 0; cur_token < n_tokens_all; cur_token += n_ubatch) { - const uint32_t n_tokens = std::min(n_ubatch, n_tokens_all - cur_token); - llama_batch u_batch = { - /* .n_tokens = */ (int32_t) n_tokens, - /* .token = */ batch_all.token ? batch_all.token + cur_token : nullptr, - /* .embd = */ batch_all.embd ? batch_all.embd + cur_token*n_embd : nullptr, - /* .pos = */ batch_all.pos ? batch_all.pos + cur_token : nullptr, - /* .n_seq_id = */ batch_all.n_seq_id ? batch_all.n_seq_id + cur_token : nullptr, - /* .seq_id = */ batch_all.seq_id ? batch_all.seq_id + cur_token : nullptr, - /* .logits = */ batch_all.logits ? 
batch_all.logits + cur_token : nullptr, - /* .all_pos_0 = */ batch_all.all_pos_0 + (llama_pos) cur_token*batch_all.all_pos_1, - /* .all_pos_1 = */ batch_all.all_pos_1, - /* .all_seq_id = */ batch_all.all_seq_id, - }; + const uint32_t n_tokens = ubatch.n_tokens; // count the outputs in this u_batch { int32_t n_outputs_new = 0; - if (u_batch.logits && !embd_pooled) { - for (uint32_t i = 0; i < n_tokens; i++) { - n_outputs_new += u_batch.logits[i] != 0; - } - } else if (n_outputs == n_tokens_all) { + if (n_outputs == n_tokens_all) { n_outputs_new = n_tokens; } else { - // keep last output only - if (cur_token + n_tokens >= n_tokens_all) { - n_outputs_new = 1; + LM_GGML_ASSERT(ubatch.output); + for (uint32_t i = 0; i < n_tokens; i++) { + n_outputs_new += (int32_t) (ubatch.output[i] != 0); } } @@ -14573,32 +15636,6 @@ static int llama_decode_internal( int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch; LM_GGML_ASSERT(n_threads > 0); - // helpers for smoother batch API transition - // after deprecating the llama_eval calls, these will be removed - if (u_batch.pos == nullptr) { - pos.resize(n_tokens); - for (uint32_t i = 0; i < n_tokens; i++) { - pos[i] = u_batch.all_pos_0 + i*u_batch.all_pos_1; - } - - u_batch.pos = pos.data(); - } - - if (u_batch.seq_id == nullptr) { - n_seq_id.resize(n_tokens); - seq_id.resize(n_tokens); - seq_id_arr.resize(n_tokens); - for (uint32_t i = 0; i < n_tokens; i++) { - n_seq_id[i] = 1; - seq_id[i].resize(1); - seq_id[i][0] = u_batch.all_seq_id; - seq_id_arr[i] = seq_id[i].data(); - } - - u_batch.n_seq_id = n_seq_id.data(); - u_batch.seq_id = seq_id_arr.data(); - } - // non-causal masks do not use the KV cache if (hparams.causal_attn) { llama_kv_cache_update(&lctx); @@ -14609,7 +15646,7 @@ static int llama_decode_internal( kv_self.head = 0; } - if (!llama_kv_cache_find_slot(kv_self, u_batch)) { + if (!llama_kv_cache_find_slot(kv_self, ubatch)) { return 1; } @@ -14628,7 +15665,7 @@ static int llama_decode_internal( lm_ggml_backend_sched_reset(lctx.sched); lm_ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); - lm_ggml_cgraph * gf = llama_build_graph(lctx, u_batch, false); + lm_ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false); // the output is always the last tensor in the graph struct lm_ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; @@ -14639,12 +15676,15 @@ static int llama_decode_internal( res = nullptr; embd = nullptr; } else if (cparams.embeddings) { - res = nullptr; // do not extract logits for embedding case - embd = gf->nodes[gf->n_nodes - 1]; - if (strcmp(embd->name, "result_embd_pooled") != 0) { - embd = gf->nodes[gf->n_nodes - 2]; + res = nullptr; // do not extract logits for embedding case + embd = nullptr; + for (int i = gf->n_nodes - 1; i >= 0; --i) { + if (strcmp(gf->nodes[i]->name, "result_embd_pooled") == 0) { + embd = gf->nodes[i]; + break; + } } - LM_GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor"); + LM_GGML_ASSERT(embd != nullptr && "missing embeddings tensor"); } else { embd = nullptr; // do not extract embeddings when not needed LM_GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor"); @@ -14653,7 +15693,7 @@ static int llama_decode_internal( lm_ggml_backend_sched_alloc_graph(lctx.sched, gf); - llama_set_inputs(lctx, u_batch); + llama_set_inputs(lctx, ubatch); llama_graph_compute(lctx, gf, n_threads); @@ -14711,12 +15751,11 @@ static int llama_decode_internal( case 
LLAMA_POOLING_TYPE_CLS: case LLAMA_POOLING_TYPE_LAST: { - // extract sequence embeddings + // extract sequence embeddings (cleared before processing each batch) auto & embd_seq_out = lctx.embd_seq; - embd_seq_out.clear(); - for (uint32_t i = 0; i < n_tokens; i++) { - const llama_seq_id seq_id = u_batch.seq_id[i][0]; + for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { continue; } @@ -14733,6 +15772,25 @@ static int llama_decode_internal( n_outputs_prev += lctx.n_outputs; } + // set output mappings + { + bool sorted_output = true; + + LM_GGML_ASSERT(lctx.sbatch.out_ids.size() == n_outputs); + + for (size_t i = 0; i < n_outputs; ++i) { + size_t out_id = lctx.sbatch.out_ids[i]; + lctx.output_ids[out_id] = i; + if (out_id != i) { + sorted_output = false; + } + } + + if (sorted_output) { + lctx.sbatch.out_ids.clear(); + } + } + // set to total number of outputs in the batch, for use in llama_get_logits_ith lctx.n_outputs = n_outputs; @@ -14797,11 +15855,9 @@ static int llama_encode_internal( const int64_t n_embd = hparams.n_embd; - // TODO: simplify or deprecate - std::vector pos; - std::vector n_seq_id; - std::vector seq_id_arr; - std::vector> seq_id; + lctx.sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true); + + const llama_ubatch ubatch = lctx.sbatch.split_simple(n_tokens); // reserve output buffer if (llama_output_reserve(lctx, n_tokens) < n_tokens) { @@ -14819,45 +15875,34 @@ static int llama_encode_internal( const int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch; LM_GGML_ASSERT(n_threads > 0); - // helpers for smoother batch API transition - // after deprecating the llama_eval calls, these will be removed - if (batch.pos == nullptr) { - pos.resize(n_tokens); - for (uint32_t i = 0; i < n_tokens; i++) { - pos[i] = batch.all_pos_0 + i*batch.all_pos_1; - } - - batch.pos = pos.data(); - } - - if (batch.seq_id == nullptr) { - n_seq_id.resize(n_tokens); - seq_id.resize(n_tokens); - seq_id_arr.resize(n_tokens); - for (uint32_t i = 0; i < n_tokens; i++) { - n_seq_id[i] = 1; - seq_id[i].resize(1); - seq_id[i][0] = batch.all_seq_id; - seq_id_arr[i] = seq_id[i].data(); - } - - batch.n_seq_id = n_seq_id.data(); - batch.seq_id = seq_id_arr.data(); - } - lm_ggml_backend_sched_reset(lctx.sched); lm_ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); - lm_ggml_cgraph * gf = llama_build_graph(lctx, batch, false); + lm_ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false); // the output embeddings after the final encoder normalization - struct lm_ggml_tensor * embd = gf->nodes[gf->n_nodes - 1]; + struct lm_ggml_tensor * embd = nullptr; - LM_GGML_ASSERT(strcmp(embd->name, "result_norm") == 0); + // there are two cases here + if (llama_model_has_decoder(&lctx.model)) { + // first case is an encoder-decoder T5 model where embeddings are passed to decoder + embd = gf->nodes[gf->n_nodes - 1]; + LM_GGML_ASSERT(strcmp(embd->name, "result_norm") == 0 && "missing result_output tensor"); + } else { + // second case is an encoder-only T5 model + if (cparams.embeddings) { + // only output embeddings if required + embd = gf->nodes[gf->n_nodes - 1]; + if (strcmp(embd->name, "result_embd_pooled") != 0) { + embd = gf->nodes[gf->n_nodes - 2]; + } + LM_GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor"); + } + } lm_ggml_backend_sched_alloc_graph(lctx.sched, gf); - 
llama_set_inputs(lctx, batch); + llama_set_inputs(lctx, ubatch); llama_graph_compute(lctx, gf, n_threads); @@ -14866,20 +15911,57 @@ static int llama_encode_internal( lm_ggml_backend_t backend_embd = lm_ggml_backend_sched_get_tensor_backend(lctx.sched, embd); LM_GGML_ASSERT(backend_embd != nullptr); - // extract token embeddings - LM_GGML_ASSERT(lctx.embd != nullptr); + if (llama_model_has_decoder(&lctx.model)) { + lctx.embd_enc.resize(n_tokens*n_embd); + float * embd_out = lctx.embd_enc.data(); - lctx.embd_enc.resize(n_tokens*n_embd); - float * embd_out = lctx.embd_enc.data(); + lm_ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); + LM_GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits - lm_ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); + // remember the sequence ids used during the encoding - needed for cross attention later + lctx.seq_ids_enc.resize(n_tokens); + for (uint32_t i = 0; i < n_tokens; i++) { + for (int s = 0; s < ubatch.n_seq_id[i]; s++) { + llama_seq_id seq_id = ubatch.seq_id[i][s]; + lctx.seq_ids_enc[i].insert(seq_id); + } + } + } else { + LM_GGML_ASSERT(lctx.embd != nullptr); - // remember the sequence ids used during the encoding - needed for cross attention later - lctx.seq_ids_enc.resize(n_tokens); - for (uint32_t i = 0; i < n_tokens; i++) { - for (int s = 0; s < batch.n_seq_id[i]; s++) { - llama_seq_id seq_id = batch.seq_id[i][s]; - lctx.seq_ids_enc[i].insert(seq_id); + switch (cparams.pooling_type) { + case LLAMA_POOLING_TYPE_NONE: + { + // extract token embeddings + LM_GGML_ASSERT(lctx.embd != nullptr); + float * embd_out = lctx.embd; + + LM_GGML_ASSERT(n_tokens*n_embd <= (int64_t) lctx.embd_size); + lm_ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); + } break; + case LLAMA_POOLING_TYPE_MEAN: + case LLAMA_POOLING_TYPE_CLS: + case LLAMA_POOLING_TYPE_LAST: + { + // extract sequence embeddings + auto & embd_seq_out = lctx.embd_seq; + embd_seq_out.clear(); + + LM_GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits + + for (uint32_t i = 0; i < n_tokens; i++) { + const llama_seq_id seq_id = ubatch.seq_id[i][0]; + if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { + continue; + } + embd_seq_out[seq_id].resize(n_embd); + lm_ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); + } + } break; + case LLAMA_POOLING_TYPE_UNSPECIFIED: + { + LM_GGML_ABORT("unknown pooling type"); + } } } } @@ -15146,32 +16228,6 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) { } } - if (lctx.kv_self.recurrent && lctx.kv_self.do_copy) { - { - lm_ggml_backend_sched_reset(lctx.sched); - - lm_ggml_cgraph * gf = llama_build_graph_s_copy(lctx); - - lm_ggml_backend_sched_alloc_graph(lctx.sched, gf); - - llama_set_s_copy(lctx); - - llama_graph_compute(lctx, gf, lctx.cparams.n_threads); - - need_reserve = true; - } - - { - auto & kv_self = lctx.kv_self; - - kv_self.do_copy = false; - - for (uint32_t i = 0; i < kv_self.size; ++i) { - kv_self.cells[i].src = i; - } - } - } - // defragment the KV cache if needed if (lctx.kv_self.do_defrag) { llama_kv_cache_defrag_internal(lctx); @@ -15185,10 +16241,11 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) { if (need_reserve) { // TODO: extract to a function // build worst-case graph - int n_tokens = (int)std::min(lctx.cparams.n_ctx, lctx.cparams.n_ubatch); - int n_past = 
lctx.cparams.n_ctx - n_tokens; + uint32_t n_seqs = 1; // TODO: worst-case number of sequences + uint32_t n_tokens = std::min(lctx.cparams.n_ctx, lctx.cparams.n_ubatch); llama_token token = llama_token_bos(&lctx.model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - lm_ggml_cgraph * gf = llama_build_graph(lctx, llama_batch_get_one(&token, n_tokens, n_past, 0), true); + llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + lm_ggml_cgraph * gf = llama_build_graph(lctx, ubatch, true); // initialize scheduler with the worst-case graph lm_ggml_backend_sched_reset(lctx.sched); @@ -15315,7 +16372,7 @@ static lm_ggml_type llama_tensor_get_type(quantize_state_internal & qs, lm_ggml_ const int n_expert = std::max(1, (int)qs.model.hparams.n_expert); auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) { if (n_expert > 1) { - // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but iccasionally randomly + // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work // for getting the current layer as I initially thought, and we need to resort to parsing the // tensor name. @@ -15580,6 +16637,9 @@ static lm_ggml_type llama_tensor_get_type(quantize_state_internal & qs, lm_ggml_ case LM_GGML_TYPE_Q6_K: new_type = LM_GGML_TYPE_Q8_0; break; default: throw std::runtime_error("\nUnsupported tensor size encountered\n"); } + if (tensor->ne[0] % lm_ggml_blck_size(new_type) != 0) { + new_type = LM_GGML_TYPE_F16; + } LLAMA_LOG_WARN(" - using fallback quantization %s\n", lm_ggml_type_name(new_type)); ++qs.n_fallback; } @@ -15781,12 +16841,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer; // sanity checks - // - // - qs.n_attention_wv == 0 for Mamba models - // - qs.n_attention_wv == model.hparams.n_layer for Transformer models - // - qs.n_attention_wv == 3 * model.hparams.n_layer for Encoder-Decoder models - // - LM_GGML_ASSERT((qs.n_attention_wv == 0 || qs.n_attention_wv == (int)model.hparams.n_layer || qs.n_attention_wv == 3 * (int)model.hparams.n_layer) && "n_attention_wv is unexpected"); + { + const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin(); + // attention layers have a non-zero number of kv heads + int32_t n_attn_layer = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0); + if (llama_model_has_encoder(&model)) { + n_attn_layer *= 3; + } + LM_GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected"); + } size_t total_size_org = 0; size_t total_size_new = 0; @@ -15908,8 +16971,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // do not quantize Mamba's small yet 2D weights // NOTE: can't use LLM_TN here because the layer number is not known quantize &= name.find("ssm_conv1d.weight") == std::string::npos; - quantize &= name.find("ssm_x.weight") == std::string::npos; - quantize &= name.find("ssm_dt.weight") == std::string::npos; // do not quantize relative position bias (T5) quantize &= name.find("attn_rel_b.weight") == std::string::npos; @@ -16589,13 +17650,15 @@ struct llama_context * llama_new_context_with_model( ctx->sampling.rng = std::mt19937(params.seed); ctx->logits_all = 
params.logits_all; + // build worst-case graph for encoder if a model contains encoder + ctx->is_encoding = llama_model_has_encoder(model); uint32_t kv_size = cparams.n_ctx; lm_ggml_type type_k = params.type_k; lm_ggml_type type_v = params.type_v; // Mamba only needs a constant number of KV cache cells per sequence - if (model->arch == LLM_ARCH_MAMBA) { + if (llama_model_is_recurrent(model)) { // Mamba needs at least as many KV cells as there are sequences kept at any time kv_size = std::max((uint32_t) 1, params.n_seq_max); // it's probably best to keep as much precision as possible for the states @@ -16827,10 +17890,11 @@ struct llama_context * llama_new_context_with_model( } // build worst-case graph - int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_ubatch); - int n_past = cparams.n_ctx - n_tokens; + uint32_t n_seqs = 1; // TODO: worst-case number of sequences + uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - lm_ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0), true); + llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + lm_ggml_cgraph * gf = llama_build_graph(*ctx, ubatch, true); // initialize scheduler with the worst-case graph if (!lm_ggml_backend_sched_reserve(ctx->sched, gf)) { @@ -16903,6 +17967,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_MAMBA: case LLM_ARCH_JINA_BERT_V2: case LLM_ARCH_T5: + case LLM_ARCH_T5ENCODER: case LLM_ARCH_JAIS: return LLAMA_ROPE_TYPE_NONE; @@ -16941,6 +18006,8 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_OPENELM: case LLM_ARCH_GPTNEOX: case LLM_ARCH_CODESHELL: + case LLM_ARCH_NEMOTRON: + case LLM_ARCH_EXAONE: return LLAMA_ROPE_TYPE_NEOX; // all model arches should be listed explicitly here @@ -17050,8 +18117,16 @@ struct lm_ggml_tensor * llama_get_model_tensor(struct llama_model * model, const bool llama_model_has_encoder(const struct llama_model * model) { switch (model->arch) { - case LLM_ARCH_T5: return true; - default: return false; + case LLM_ARCH_T5: return true; + case LLM_ARCH_T5ENCODER: return true; + default: return false; + } +} + +bool llama_model_has_decoder(const struct llama_model * model) { + switch (model->arch) { + case LLM_ARCH_T5ENCODER: return false; + default: return true; } } @@ -17059,6 +18134,13 @@ llama_token llama_model_decoder_start_token(const struct llama_model * model) { return model->hparams.dec_start_token_id; } +bool llama_model_is_recurrent(const struct llama_model * model) { + switch (model->arch) { + case LLM_ARCH_MAMBA: return true; + default: return false; + } +} + uint32_t llama_model_quantize( const char * fname_inp, const char * fname_out, @@ -17354,6 +18436,7 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi // TODO: replace all non-fatal assertions with returned errors or exceptions struct llama_data_write { virtual void write(const void * src, size_t size) = 0; + virtual void write_tensor_data(const struct lm_ggml_tensor * tensor, size_t offset, size_t size) = 0; virtual size_t get_size_written() = 0; virtual ~llama_data_write() = default; @@ -17379,7 +18462,9 @@ struct llama_data_write { write_string(rng_str); } - void write_output_ids(const struct llama_context * ctx) { + void 
write_output_ids(struct llama_context * ctx) { + llama_output_reorder(ctx); + const uint32_t n_outputs = ctx->n_outputs; std::vector output_pos; @@ -17476,9 +18561,8 @@ struct llama_data_write { // Read each range of cells of k_size length each into tmp_buf and write out for (const auto & range : cell_ranges) { const size_t range_size = range.second - range.first; - tmp_buf.resize(range_size * k_size_row); - lm_ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), range.first * k_size_row, range_size * k_size_row); - write(tmp_buf.data(), tmp_buf.size()); + const size_t buf_size = range_size * k_size_row; + write_tensor_data(kv_self.k_l[il], range.first * k_size_row, buf_size); } } @@ -17497,9 +18581,8 @@ struct llama_data_write { // Read each range of cells of v_size length each into tmp_buf and write out for (const auto & range : cell_ranges) { const size_t range_size = range.second - range.first; - tmp_buf.resize(range_size * v_size_row); - lm_ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), range.first * v_size_row, range_size * v_size_row); - write(tmp_buf.data(), tmp_buf.size()); + const size_t buf_size = range_size * v_size_row; + write_tensor_data(kv_self.v_l[il], range.first * v_size_row, buf_size); } } } else { @@ -17525,9 +18608,8 @@ struct llama_data_write { for (const auto & range : cell_ranges) { const size_t range_size = range.second - range.first; const size_t src_offset = (range.first + j * kv_size) * v_size_el; - tmp_buf.resize(range_size * v_size_el); - lm_ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size()); - write(tmp_buf.data(), tmp_buf.size()); + const size_t buf_size = range_size * v_size_el; + write_tensor_data(kv_self.v_l[il], src_offset, buf_size); } } } @@ -17670,8 +18752,11 @@ struct llama_data_read { llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1); - llama_batch batch = llama_batch_init(cell_count, 0, 1); + llama_ubatch batch = ctx->sbatch.reserve_ubatch(cell_count, /* has_embd */ false); batch.n_tokens = cell_count; + batch.n_seq_tokens = cell_count; + batch.n_seqs = 1; + for (uint32_t i = 0; i < cell_count; ++i) { llama_pos pos; uint32_t n_seq_id; @@ -17685,11 +18770,10 @@ struct llama_data_read { } batch.pos[i] = pos; - batch.n_seq_id[i] = 1; - batch.seq_id[i][0] = dest_seq_id; } + batch.n_seq_id[0] = 1; + batch.seq_id[0] = &dest_seq_id; if (!llama_kv_cache_find_slot(kv_self, batch)) { - llama_batch_free(batch); LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__); return false; } @@ -17701,9 +18785,6 @@ struct llama_data_read { LM_GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].pos == batch.pos[cell_count - 1]); LM_GGML_ASSERT(kv_self.cells[kv_self.head].has_seq_id(dest_seq_id)); LM_GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].has_seq_id(dest_seq_id)); - - // Cleanup - llama_batch_free(batch); } else { // whole KV cache restore @@ -17735,6 +18816,15 @@ struct llama_data_read { } cell.seq_id.insert(seq_id); + + if (kv_self.recurrent) { + int32_t & tail = kv_self.cells[seq_id].tail; + if (tail != -1) { + LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tail); + return false; + } + tail = i; + } } } @@ -17742,6 +18832,14 @@ struct llama_data_read { kv_self.used = cell_count; } + if (kv_self.recurrent) { + for (uint32_t i = 0; i < cell_count; ++i) { + uint32_t cell_id = kv_self.head + i; + // make sure the recurrent states will keep their restored state + kv_self.cells[cell_id].src = cell_id; + } + } + return true; } @@ 
-17886,12 +18984,14 @@ struct llama_data_write_dummy : llama_data_write { llama_data_write_dummy() {} - // TODO: avoid unnecessary calls to lm_ggml_backend_tensor_get in a dummy context - void write(const void * /* src */, size_t size) override { size_written += size; } + void write_tensor_data(const struct lm_ggml_tensor * /* tensor */, size_t /* offset */, size_t size) override { + size_written += size; + } + size_t get_size_written() override { return size_written; } @@ -17914,6 +19014,16 @@ struct llama_data_write_buffer : llama_data_write { buf_size -= size; } + void write_tensor_data(const struct lm_ggml_tensor * tensor, size_t offset, size_t size) override { + if (size > buf_size) { + throw std::runtime_error("unexpectedly reached end of buffer"); + } + lm_ggml_backend_tensor_get(tensor, ptr, offset, size); + ptr += size; + size_written += size; + buf_size -= size; + } + size_t get_size_written() override { return size_written; } @@ -17949,6 +19059,7 @@ struct llama_data_read_buffer : llama_data_read { struct llama_data_write_file : llama_data_write { llama_file * file; size_t size_written = 0; + std::vector temp_buffer; llama_data_write_file(llama_file * f) : file(f) {} @@ -17957,6 +19068,12 @@ struct llama_data_write_file : llama_data_write { size_written += size; } + void write_tensor_data(const struct lm_ggml_tensor * tensor, size_t offset, size_t size) override { + temp_buffer.resize(size); + lm_ggml_backend_tensor_get(tensor, temp_buffer.data(), offset, size); + write(temp_buffer.data(), temp_buffer.size()); + } + size_t get_size_written() override { return size_written; } @@ -18310,7 +19427,18 @@ struct llama_batch llama_batch_get_one( } struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_t n_seq_max) { - llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, }; + llama_batch batch = { + /*n_tokens =*/ 0, + /*tokens =*/ nullptr, + /*embd =*/ nullptr, + /*pos =*/ nullptr, + /*n_seq_id =*/ nullptr, + /*seq_id =*/ nullptr, + /*logits =*/ nullptr, + /*all_pos_0 =*/ 0, + /*all_pos_1 =*/ 0, + /*all_seq_id =*/ 0, + }; if (embd) { batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd); @@ -18396,6 +19524,10 @@ void llama_synchronize(struct llama_context * ctx) { float * llama_get_logits(struct llama_context * ctx) { llama_synchronize(ctx); + // reorder logits for backward compatibility + // TODO: maybe deprecate this + llama_output_reorder(ctx); + return ctx->logits; } @@ -18440,6 +19572,10 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) { float * llama_get_embeddings(struct llama_context * ctx) { llama_synchronize(ctx); + // reorder embeddings for backward compatibility + // TODO: maybe deprecate this + llama_output_reorder(ctx); + return ctx->embd; } @@ -18541,11 +19677,11 @@ llama_token llama_token_pad(const struct llama_model * model) { return llama_token_pad_impl(model->vocab); } -int32_t llama_add_bos_token(const struct llama_model * model) { +bool llama_add_bos_token(const struct llama_model * model) { return llama_add_bos_token_impl(model->vocab); } -int32_t llama_add_eos_token(const struct llama_model * model) { +bool llama_add_eos_token(const struct llama_model * model) { return llama_add_eos_token_impl(model->vocab); } @@ -18846,6 +19982,22 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "Assistant:"; } + } else if (tmpl == "exaone3" || (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]"))) { + // ref: 
https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb + // EXAONE-3.0-7.8B-Instruct + for (auto message : chat) { + std::string role(message->role); + if (role == "system") { + ss << "[|system|]" << trim(message->content) << "[|endofturn|]\n"; + } else if (role == "user") { + ss << "[|user|]" << trim(message->content) << "\n"; + } else if (role == "assistant") { + ss << "[|assistant|]" << trim(message->content) << "[|endofturn|]\n"; + } + } + if (add_ass) { + ss << "[|assistant|]"; + } } else { // template not supported return -1; diff --git a/cpp/llama.h b/cpp/llama.h index fe1e581..f27bd4b 100644 --- a/cpp/llama.h +++ b/cpp/llama.h @@ -93,15 +93,15 @@ extern "C" { LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20, LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21, LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22, + LLAMA_VOCAB_PRE_TYPE_BLOOM = 23, + LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24, + LLAMA_VOCAB_PRE_TYPE_EXAONE = 25, }; - // note: these values should be synchronized with lm_ggml_rope - // TODO: maybe move this enum to ggml.h (lm_ggml_rope_type) enum llama_rope_type { LLAMA_ROPE_TYPE_NONE = -1, - LLAMA_ROPE_TYPE_NORM = 0, - LLAMA_ROPE_TYPE_NEOX = 2, - LLAMA_ROPE_TYPE_GLM = 4, + LLAMA_ROPE_TYPE_NORM = 0, + LLAMA_ROPE_TYPE_NEOX = LM_GGML_ROPE_TYPE_NEOX, }; enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file @@ -345,7 +345,7 @@ extern "C" { int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() enum llama_ftype ftype; // quantize to this llama_ftype enum lm_ggml_type output_tensor_type; // output tensor type - enum lm_ggml_type token_embedding_type; // itoken embeddings tensor type + enum lm_ggml_type token_embedding_type; // token embeddings tensor type bool allow_requantize; // allow quantizing non-f32/f16 tensors bool quantize_output_tensor; // quantize output.weight bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored @@ -504,10 +504,16 @@ extern "C" { // Returns true if the model contains an encoder that requires llama_encode() call LLAMA_API bool llama_model_has_encoder(const struct llama_model * model); + // Returns true if the model contains a decoder that requires llama_decode() call + LLAMA_API bool llama_model_has_decoder(const struct llama_model * model); + // For encoder-decoder models, this function returns id of the token that must be provided // to the decoder to start generating output sequence. For other models, it returns -1. LLAMA_API llama_token llama_model_decoder_start_token(const struct llama_model * model); + // Returns true if the model is recurrent (like Mamba, RWKV, etc.) + LLAMA_API bool llama_model_is_recurrent(const struct llama_model * model); + // Returns 0 on success LLAMA_API uint32_t llama_model_quantize( const char * fname_inp, @@ -912,11 +918,8 @@ extern "C" { LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line LLAMA_API llama_token llama_token_pad(const struct llama_model * model); // padding - // Returns -1 if unknown, 1 for true or 0 for false. - LLAMA_API int32_t llama_add_bos_token(const struct llama_model * model); - - // Returns -1 if unknown, 1 for true or 0 for false. 
- LLAMA_API int32_t llama_add_eos_token(const struct llama_model * model); + LLAMA_API bool llama_add_bos_token(const struct llama_model * model); + LLAMA_API bool llama_add_eos_token(const struct llama_model * model); // Codellama infill tokens LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix diff --git a/llama.cpp b/llama.cpp index bc0f887..11b84eb 160000 --- a/llama.cpp +++ b/llama.cpp @@ -1 +1 @@ -Subproject commit bc0f887e159c0d78c28121e2c8b5c58094170875 +Subproject commit 11b84eb4578864827afcf956db5b571003f18180 diff --git a/scripts/common.h.patch b/scripts/common.h.patch index 8e33f28..cd020e7 100644 --- a/scripts/common.h.patch +++ b/scripts/common.h.patch @@ -1,6 +1,6 @@ ---- common.h.orig -+++ common.h -@@ -41,6 +41,17 @@ extern char const * LLAMA_BUILD_TARGET; +--- common.h.orig 2024-08-23 14:12:33 ++++ common.h 2024-08-23 14:12:34 +@@ -50,6 +50,17 @@ struct llama_control_vector_load_info; diff --git a/scripts/ggml-metal.m.patch b/scripts/ggml-metal.m.patch index 00d9061..a1266ac 100644 --- a/scripts/ggml-metal.m.patch +++ b/scripts/ggml-metal.m.patch @@ -1,6 +1,6 @@ ---- ggml-metal.m.orig -+++ ggml-metal.m -@@ -336,7 +336,7 @@ static struct lm_ggml_metal_context * lm_ggml_metal_init(int n_cb) { +--- ggml-metal.m.orig 2024-08-23 14:12:33 ++++ ggml-metal.m 2024-08-23 14:12:34 +@@ -340,7 +340,7 @@ const bool try_metallib = true; #endif diff --git a/scripts/ggml.c.patch b/scripts/ggml.c.patch index bd535d5..984f9a6 100644 --- a/scripts/ggml.c.patch +++ b/scripts/ggml.c.patch @@ -1,6 +1,6 @@ ---- ggml.c.orig -+++ ggml.c -@@ -191,9 +191,9 @@ static void lm_ggml_print_backtrace_symbols(void) { +--- ggml.c.orig 2024-08-23 14:12:33 ++++ ggml.c 2024-08-23 14:12:34 +@@ -194,9 +194,9 @@ #elif defined(__linux__) && defined(__GLIBC__) #include static void lm_ggml_print_backtrace_symbols(void) { diff --git a/scripts/llama.cpp.patch b/scripts/llama.cpp.patch index 9453529..f5d26b8 100644 --- a/scripts/llama.cpp.patch +++ b/scripts/llama.cpp.patch @@ -1,5 +1,5 @@ ---- llama.cpp.orig -+++ llama.cpp +--- llama.cpp.orig 2024-08-23 14:12:33 ++++ llama.cpp 2024-08-23 14:12:34 @@ -104,6 +104,17 @@ #define LLAMA_MAX_LAYERS 512 #define LLAMA_MAX_EXPERTS 160 // DeepSeekV2 @@ -18,7 +18,7 @@ // // helpers // -@@ -1689,16 +1700,16 @@ +@@ -1741,16 +1752,16 @@ if (prefetch > 0) { // advise the kernel to preload the mapped memory From 2ed24b9b449dfd5c58e289f64d3ed8f3e52becc2 Mon Sep 17 00:00:00 2001 From: jhen Date: Fri, 23 Aug 2024 14:59:33 +0800 Subject: [PATCH 2/4] fix: migrate from api changes --- android/src/main/jni.cpp | 2 +- cpp/rn-llama.hpp | 4 +++- ios/RNLlamaContext.mm | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/android/src/main/jni.cpp b/android/src/main/jni.cpp index 71662c5..5377aea 100644 --- a/android/src/main/jni.cpp +++ b/android/src/main/jni.cpp @@ -168,7 +168,7 @@ Java_com_rnllama_LlamaContext_initContext( const char *lora_chars = env->GetStringUTFChars(lora_str, nullptr); if (lora_chars != nullptr && lora_chars[0] != '\0') { - defaultParams.lora_adapter.push_back({lora_chars, lora_scaled}); + defaultParams.lora_adapters.push_back({lora_chars, lora_scaled}); defaultParams.use_mmap = false; } diff --git a/cpp/rn-llama.hpp b/cpp/rn-llama.hpp index 00a04c3..537f0a1 100644 --- a/cpp/rn-llama.hpp +++ b/cpp/rn-llama.hpp @@ -219,7 +219,9 @@ struct llama_rn_context bool loadModel(gpt_params ¶ms_) { params = params_; - std::tie(model, ctx) = llama_init_from_gpt_params(params); + llama_init_result llama_init = 
llama_init_from_gpt_params(params); + model = llama_init.model; + ctx = llama_init.context; if (model == nullptr) { LOG_ERROR("unable to load model: %s", params_.model.c_str()); diff --git a/ios/RNLlamaContext.mm b/ios/RNLlamaContext.mm index ef494aa..f7fbc95 100644 --- a/ios/RNLlamaContext.mm +++ b/ios/RNLlamaContext.mm @@ -59,7 +59,7 @@ + (instancetype)initWithParams:(NSDictionary *)params { if (params[@"lora"]) { float lora_scaled = 1.0f; if (params[@"lora_scaled"]) lora_scaled = [params[@"lora_scaled"] floatValue]; - defaultParams.lora_adapter.push_back({[params[@"lora"] UTF8String], lora_scaled}); + defaultParams.lora_adapters.push_back({[params[@"lora"] UTF8String], lora_scaled}); defaultParams.use_mmap = false; } From f2f4ba72edbb35d74c040ec6a4ba10dda7402413 Mon Sep 17 00:00:00 2001 From: jhen Date: Sun, 25 Aug 2024 11:09:32 +0800 Subject: [PATCH 3/4] feat: sync llama.cpp --- cpp/ggml-metal.m | 9 +++++++++ cpp/ggml.c | 33 +++++++++++++++++++++++---------- cpp/ggml.h | 3 ++- cpp/llama.cpp | 11 +++-------- llama.cpp | 2 +- 5 files changed, 38 insertions(+), 20 deletions(-) diff --git a/cpp/ggml-metal.m b/cpp/ggml-metal.m index 8568c96..a04db3f 100644 --- a/cpp/ggml-metal.m +++ b/cpp/ggml-metal.m @@ -802,6 +802,15 @@ static bool lm_ggml_metal_supports_op(const struct lm_ggml_backend_metal_context if (op->src[0]->ne[0] == 256) { return false; } + { + float logit_softcap; + + memcpy(&logit_softcap, ((const float *) op->op_params) + 2, sizeof(logit_softcap)); + + if (logit_softcap != 0.0f) { + return false; + } + } return ctx->support_simdgroup_mm; // TODO: over-restricted for vec-kernels case LM_GGML_OP_MUL_MAT: case LM_GGML_OP_MUL_MAT_ID: diff --git a/cpp/ggml.c b/cpp/ggml.c index f125d44..1c27476 100644 --- a/cpp/ggml.c +++ b/cpp/ggml.c @@ -7095,7 +7095,8 @@ struct lm_ggml_tensor * lm_ggml_flash_attn_ext( struct lm_ggml_tensor * v, struct lm_ggml_tensor * mask, float scale, - float max_bias) { + float max_bias, + float logit_softcap) { LM_GGML_ASSERT(lm_ggml_can_mul_mat(k, q)); // TODO: check if vT can be multiplied by (k*qT) @@ -7122,7 +7123,7 @@ struct lm_ggml_tensor * lm_ggml_flash_attn_ext( int64_t ne[4] = { q->ne[0], q->ne[2], q->ne[1], q->ne[3] }; struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F32, 4, ne); - float params[] = { scale, max_bias }; + float params[] = { scale, max_bias, logit_softcap }; lm_ggml_set_op_params(result, params, sizeof(params)); result->op = LM_GGML_OP_FLASH_ATTN_EXT; @@ -7142,7 +7143,7 @@ void lm_ggml_flash_attn_ext_set_prec( const int32_t prec_i32 = (int32_t) prec; - lm_ggml_set_op_params_i32(a, 2, prec_i32); // scale is on first pos, max_bias on second + lm_ggml_set_op_params_i32(a, 3, prec_i32); // scale is on first pos, max_bias on second } // lm_ggml_flash_attn_back @@ -15271,11 +15272,17 @@ static void lm_ggml_compute_forward_flash_attn_ext_f16( const int ir0 = dr*ith; const int ir1 = MIN(ir0 + dr, nr); - float scale = 1.0f; - float max_bias = 0.0f; + float scale = 1.0f; + float max_bias = 0.0f; + float logit_softcap = 0.0f; - memcpy(&scale, (float *) dst->op_params + 0, sizeof(float)); - memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float)); + memcpy(&scale, (float *) dst->op_params + 0, sizeof(float)); + memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float)); + memcpy(&logit_softcap, (float *) dst->op_params + 2, sizeof(float)); + + if (logit_softcap != 0) { + scale /= logit_softcap; + } const uint32_t n_head = neq2; const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head)); @@ -15339,7 +15346,13 @@ 
static void lm_ggml_compute_forward_flash_attn_ext_f16( const char * k_data = (const char *) k->data + ( ic*nbk1 + ik2*nbk2 + ik3*nbk3); kq_vec_dot(D, &s, 0, k_data, 0, Q_q, 0, 1); - s = s*scale + mv; // scale KQ value and apply mask + s = s*scale; // scale KQ value + + if (logit_softcap != 0.0f) { + s = logit_softcap*tanhf(s); + } + + s += mv; // apply mask const float Mold = M; @@ -15348,7 +15361,7 @@ static void lm_ggml_compute_forward_flash_attn_ext_f16( const char * v_data = ((const char *) v->data + (ic*nbv1 + iv2*nbv2 + iv3*nbv3)); - if (v->type== LM_GGML_TYPE_F16) { + if (v->type == LM_GGML_TYPE_F16) { if (s > M) { // s is new maximum, ms < 1.0f, vs == expf(s - s) == 1.0f M = s; @@ -15415,7 +15428,7 @@ static void lm_ggml_compute_forward_flash_attn_ext( const struct lm_ggml_tensor * v, const struct lm_ggml_tensor * mask, struct lm_ggml_tensor * dst) { - switch (dst->op_params[2]) { + switch (dst->op_params[3]) { case LM_GGML_PREC_DEFAULT: case LM_GGML_PREC_F32: { diff --git a/cpp/ggml.h b/cpp/ggml.h index 1c52da2..46342f5 100644 --- a/cpp/ggml.h +++ b/cpp/ggml.h @@ -1760,7 +1760,8 @@ extern "C" { struct lm_ggml_tensor * v, struct lm_ggml_tensor * mask, float scale, - float max_bias); + float max_bias, + float logit_softcap); LM_GGML_API void lm_ggml_flash_attn_ext_set_prec( struct lm_ggml_tensor * a, diff --git a/cpp/llama.cpp b/cpp/llama.cpp index 79e0076..57c13ef 100644 --- a/cpp/llama.cpp +++ b/cpp/llama.cpp @@ -8885,7 +8885,8 @@ static struct lm_ggml_tensor * llm_build_kqv( 0); cb(v, "v", il); - cur = lm_ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias); + cur = lm_ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias, + hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f); if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) { lm_ggml_flash_attn_ext_set_prec(cur, LM_GGML_PREC_F32); @@ -8896,7 +8897,7 @@ static struct lm_ggml_tensor * llm_build_kqv( struct lm_ggml_tensor * kq = lm_ggml_mul_mat(ctx, k, q); cb(kq, "kq", il); - if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_QWEN2 || model.arch == LLM_ARCH_NEMOTRON) { + if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_QWEN2 || model.arch == LLM_ARCH_NEMOTRON || model.arch == LLM_ARCH_CHATGLM) { // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847 lm_ggml_mul_mat_set_prec(kq, LM_GGML_PREC_F32); @@ -17544,12 +17545,6 @@ struct llama_context * llama_new_context_with_model( params.flash_attn = false; } - if (params.flash_attn && model->hparams.attn_soft_cap) { - LLAMA_LOG_WARN("%s: flash_attn is not compatible with attn_soft_cap - forcing off\n", __func__); - params.flash_attn = false; - } - - if (params.flash_attn && model->hparams.n_embd_head_k != model->hparams.n_embd_head_v) { LLAMA_LOG_WARN("%s: flash_attn requires n_embd_head_k == n_embd_head_v - forcing off\n", __func__); params.flash_attn = false; diff --git a/llama.cpp b/llama.cpp index 11b84eb..e11bd85 160000 --- a/llama.cpp +++ b/llama.cpp @@ -1 +1 @@ -Subproject commit 11b84eb4578864827afcf956db5b571003f18180 +Subproject commit e11bd856d538e44d24d8cad4b0381fba0984d162 From 958ab56f4f68e194a5d22a1d8bb4fea2eeb47330 Mon Sep 17 00:00:00 2001 From: jhen Date: Mon, 26 Aug 2024 10:39:20 +0800 
Subject: [PATCH 4/4] feat: sync llama.cpp

---
 cpp/common.cpp           | 2 +-
 cpp/llama.cpp            | 2 +-
 example/ios/Podfile.lock | 6 +++---
 llama.cpp                | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/cpp/common.cpp b/cpp/common.cpp
index 725a557..94cb0f1 100644
--- a/cpp/common.cpp
+++ b/cpp/common.cpp
@@ -907,7 +907,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         }
         return true;
     }
-    if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--gpu-layers-draft") {
+    if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--n-gpu-layers-draft") {
         CHECK_ARG
         params.n_gpu_layers_draft = std::stoi(argv[i]);
         if (!llama_supports_gpu_offload()) {
diff --git a/cpp/llama.cpp b/cpp/llama.cpp
index 57c13ef..02ad32f 100644
--- a/cpp/llama.cpp
+++ b/cpp/llama.cpp
@@ -8888,7 +8888,7 @@ static struct lm_ggml_tensor * llm_build_kqv(
         cur = lm_ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias,
                                      hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);
 
-        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
+        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_GEMMA2) {
             lm_ggml_flash_attn_ext_set_prec(cur, LM_GGML_PREC_F32);
         }
 
diff --git a/example/ios/Podfile.lock b/example/ios/Podfile.lock
index ed89623..f9cf151 100644
--- a/example/ios/Podfile.lock
+++ b/example/ios/Podfile.lock
@@ -8,7 +8,7 @@ PODS:
     - hermes-engine/Pre-built (= 0.72.3)
   - hermes-engine/Pre-built (0.72.3)
   - libevent (2.1.12)
-  - llama-rn (0.3.6):
+  - llama-rn (0.3.10):
     - RCT-Folly
     - RCTRequired
     - RCTTypeSafety
@@ -1261,7 +1261,7 @@ SPEC CHECKSUMS:
   glog: 04b94705f318337d7ead9e6d17c019bd9b1f6b1b
   hermes-engine: 10fbd3f62405c41ea07e71973ea61e1878d07322
   libevent: 4049cae6c81cdb3654a443be001fb9bdceff7913
-  llama-rn: fbc4775cb24796c7cc59bf04855a669955508f30
+  llama-rn: bb4447eb237c895e526b1adbb2b88700093005ac
  RCT-Folly: 424b8c9a7a0b9ab2886ffe9c3b041ef628fd4fb1
  RCTRequired: a2faf4bad4e438ca37b2040cb8f7799baa065c18
  RCTTypeSafety: cb09f3e4747b6d18331a15eb05271de7441ca0b3
@@ -1307,4 +1307,4 @@ SPEC CHECKSUMS:
 
 PODFILE CHECKSUM: e85f4f0de2f3382c406a75702d5f126a993a8dde
 
-COCOAPODS: 1.14.3
+COCOAPODS: 1.15.2
diff --git a/llama.cpp b/llama.cpp
index e11bd85..93bc383 160000
--- a/llama.cpp
+++ b/llama.cpp
@@ -1 +1 @@
-Subproject commit e11bd856d538e44d24d8cad4b0381fba0984d162
+Subproject commit 93bc3839f980ff14be86efe408b4cd7e89b26835
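
The first patch in this series extends the public model-capability queries in cpp/llama.h: llama_model_has_encoder() now also reports true for T5ENCODER, and llama_model_has_decoder() and llama_model_is_recurrent() are new. The sketch below shows one way a caller might combine them with llama_encode()/llama_decode(); run_prompt is a hypothetical helper, not part of the patches, and it assumes the 4-argument llama_batch_get_one() still used elsewhere in this revision.

    #include "llama.h"

    // Hypothetical helper: route a prompt through llama_encode()/llama_decode()
    // based on the capability queries added/extended in this sync.
    static int run_prompt(llama_context * ctx, const llama_model * model,
                          llama_token * tokens, int32_t n_tokens) {
        if (llama_model_has_encoder(model)) {
            // T5 and T5ENCODER models: run the encoder over the prompt first
            if (llama_encode(ctx, llama_batch_get_one(tokens, n_tokens, 0, 0)) != 0) {
                return -1;
            }
            if (!llama_model_has_decoder(model)) {
                // encoder-only model (LLM_ARCH_T5ENCODER): the embeddings
                // produced by llama_encode() are the final output
                return 0;
            }
            // encoder-decoder models start generation from the decoder start token;
            // llama_model_decoder_start_token() returns -1 for other models
            llama_token dec_start = llama_model_decoder_start_token(model);
            if (dec_start == -1) {
                dec_start = llama_token_bos(model); // defensive fallback
            }
            return llama_decode(ctx, llama_batch_get_one(&dec_start, 1, 0, 0));
        }
        // decoder-only models: feed the prompt to the decoder directly
        return llama_decode(ctx, llama_batch_get_one(tokens, n_tokens, 0, 0));
    }

llama_model_is_recurrent() is not needed in this flow; in the patched llama_new_context_with_model it replaces the direct LLM_ARCH_MAMBA check when sizing the KV cache for recurrent models.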
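
The second patch migrates the React Native bindings to two common.h changes: the gpt_params::lora_adapter field is replaced by a lora_adapters list of {path, scale} entries, and llama_init_from_gpt_params() now returns a llama_init_result instead of a model/context pair. A condensed, hypothetical host-side sketch of the same migration follows; "model.gguf" and "adapter.gguf" are placeholder paths.

    #include "common.h"
    #include "llama.h"

    // Hypothetical init mirroring the jni.cpp / rn-llama.hpp / RNLlamaContext.mm hunks.
    static bool init_runtime(llama_model *& model, llama_context *& ctx) {
        gpt_params params;
        params.model = "model.gguf";
        params.lora_adapters.push_back({ "adapter.gguf", 1.0f }); // was: params.lora_adapter
        params.use_mmap = false;                                  // disabled when a LoRA is attached

        llama_init_result init = llama_init_from_gpt_params(params); // was: std::tie(model, ctx) = ...
        model = init.model;
        ctx   = init.context;
        return model != nullptr && ctx != nullptr;
    }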
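
The third patch threads Gemma-2 style logit soft-capping through lm_ggml_flash_attn_ext(): the op takes a third float parameter (0.0f disables capping), the CPU kernel rescales and applies tanh before adding the mask, and the Metal backend declines the fused op when a non-zero softcap is requested. A scalar restatement of the per-value math from the patched lm_ggml_compute_forward_flash_attn_ext_f16 path is shown below; the function name is illustrative.

    #include <cmath>

    // Soft-capped KQ update: pre-divide the scale, cap the scaled logit with
    // tanh, then apply the mask bias, as in the patched CPU kernel.
    static float softcapped_kq(float s, float scale, float logit_softcap, float mask_bias) {
        if (logit_softcap != 0.0f) {
            scale /= logit_softcap;             // done once per op in the kernel
        }
        s = s * scale;                          // scale KQ value
        if (logit_softcap != 0.0f) {
            s = logit_softcap * std::tanh(s);   // bound the logit to (-cap, +cap)
        }
        return s + mask_bias;                   // apply mask afterwards
    }

Callers such as llm_build_kqv pass hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f, which is why the earlier "flash_attn is not compatible with attn_soft_cap - forcing off" guard in llama_new_context_with_model could be removed in the same patch.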