Commit 7044072

ggml-backend : add load_tensor() to backend API

Add a new backend API that allows loading a tensor's data using a precomputed hash stored in the model KV. ref: #12954

1 parent 7c727fb
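The new call is meant to be paired with a fallback, as in the src/llama-model-loader.cpp hunk below. A minimal sketch of the call pattern, reusing the loader's variable names:

// hash_val is the precomputed FNV-1a hash read from the model KV;
// if the backend does not implement load_tensor(), or has no data
// matching the hash, fall back to uploading the bytes directly
if (!ggml_backend_tensor_load(cur, /*offset =*/ 0, hash_val)) {
    ggml_backend_tensor_set(cur, data, 0, n_size);
}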

File tree: 6 files changed (+82, -3 lines)

examples/gguf-hash/gguf-hash.cpp

Lines changed: 37 additions & 2 deletions
@@ -55,6 +55,7 @@ typedef enum {
 
 struct hash_params {
     std::string input;
+    bool fnv = false;
     bool xxh64 = false;
     bool sha1 = false;
     bool sha256 = false;
@@ -103,6 +104,7 @@ static void hash_print_usage(const char * executable) {
     printf("\n");
     printf("options:\n");
     printf("  -h, --help            show this help message and exit\n");
+    printf("      --fnv             use FNV-1a hash\n");
     printf("      --xxh64           use xxh64 hash\n");
     printf("      --sha1            use sha1 hash\n");
     printf("      --sha256          use sha256 hash\n");
@@ -131,6 +133,11 @@ static void hash_params_parse_ex(int argc, const char ** argv, hash_params & params) {
         exit(0);
     }
 
+    if (arg == "--fnv") {
+        arg_found = true;
+        params.fnv = true;
+    }
+
     if (arg == "--xxh64") {
         arg_found = true;
         params.xxh64 = true;
@@ -283,6 +290,18 @@ static void generate_uuidv5(const unsigned char sha1_digest[20], unsigned char uuid[16]) {
     uuid[ 8] |= (0x8 << 4);
 }
 
+// Computes FNV-1a hash of the data
+static uint64_t fnv_hash(const uint8_t * data, size_t len) {
+    const uint64_t fnv_prime = 0x100000001b3ULL;
+    uint64_t hash = 0xcbf29ce484222325ULL;
+
+    for (size_t i = 0; i < len; ++i) {
+        hash ^= data[i];
+        hash *= fnv_prime;
+    }
+    return hash;
+}
+
 static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
     const std::string & fname = hash_params.input;
     struct ggml_context * ctx_data = NULL;
@@ -326,7 +345,11 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
         SHA1Update( &sha1_for_uuid_ctx, (unsigned char const *)uuidv5_namespace, sizeof(uuidv5_namespace));
     }
 
+    struct gguf_context * ctx_out = gguf_init_empty();
     struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
+
+    gguf_set_kv(ctx_out, ctx);
+
     const int n_tensors = gguf_get_n_tensors(ctx);
     bool tensor_layer_in_manifest = false;
     bool model_in_manifest = false;
@@ -335,10 +358,19 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
     for (int i = 0; i < n_tensors; ++i) {
         const char * name = gguf_get_tensor_name(ctx, i);
         struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
+        gguf_add_tensor(ctx_out, cur);
         auto n_bytes = ggml_nbytes(cur);
         auto *raw_data = cur->data;
         const std::string tensor_layer_name = fname + ":" + name;
 
+        if (hash_params.fnv) {
+            uint64_t hash = fnv_hash((const uint8_t *)raw_data, n_bytes);
+            printf("%016lx %s\n", hash, tensor_layer_name.c_str());
+            char hash_key[128];
+            snprintf(hash_key, sizeof(hash_key), "%s_hash", name);
+            gguf_set_val_u64(ctx_out, hash_key, hash);
+        }
+
         if (hash_params.xxh64) {
 
             if (!hash_params.no_layer) {
@@ -580,6 +612,9 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
         }
     }
 
+    auto fname_out = fname + ".rpc";
+    gguf_write_to_file(ctx_out, fname_out.c_str(), false);
+    gguf_free(ctx_out);
 
     ggml_free(ctx_data);
     gguf_free(ctx);
@@ -663,7 +698,7 @@ int main(int argc, const char ** argv) {
 
     // Autoselect the highest security hash if manifest is provided but
    // the user has not specifically defined the hash they care about
-    if (!params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) {
+    if (!params.fnv && !params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) {
        // User has not selected a specific value, pick most secure hash
        if (manifest_check.sha256) {
            params.sha256 = true;
@@ -680,7 +715,7 @@ int main(int argc, const char ** argv) {
     }
 
     // By default if no switch argument provided, assume xxh64
-    if (!params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) {
+    if (!params.fnv && !params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) {
         params.xxh64 = true;
     }

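As a quick sanity check of the FNV-1a constants used above, the routine can be exercised against the standard 64-bit test vector for "a" (af63dc4c8601ec8c). This self-contained check program is not part of the commit:

#include <cinttypes>
#include <cstdint>
#include <cstdio>
#include <cstring>

// same constants as fnv_hash() in the diff above
static uint64_t fnv1a64(const uint8_t * data, size_t len) {
    uint64_t hash = 0xcbf29ce484222325ULL;   // FNV offset basis
    for (size_t i = 0; i < len; ++i) {
        hash ^= data[i];
        hash *= 0x100000001b3ULL;            // FNV prime
    }
    return hash;
}

int main() {
    const char * s = "a";
    // prints af63dc4c8601ec8c, the well-known FNV-1a 64-bit value for "a"
    printf("%016" PRIx64 "\n", fnv1a64((const uint8_t *) s, strlen(s)));
    return 0;
}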
ggml/include/ggml-backend.h

Lines changed: 1 addition & 0 deletions
@@ -88,6 +88,7 @@ extern "C" {
 
     // "offset" refers to the offset in tensor->data for setting/getting data
     GGML_API void ggml_backend_tensor_set(   struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+    GGML_API bool ggml_backend_tensor_load(  struct ggml_tensor * tensor, size_t offset, uint64_t hash);
     GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
     GGML_API void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);

ggml/src/ggml-backend-impl.h

Lines changed: 2 additions & 0 deletions
@@ -49,6 +49,8 @@ extern "C" {
     void (*memset_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
     void (*set_tensor)   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
     void (*get_tensor)   (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+    // (optional) load tensor with data which has the specified hash
+    bool (*load_tensor)  (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, size_t offset, uint64_t hash);
     // (optional) tensor copy: dst is in the buffer, src may be in any buffer, including buffers from a different backend (return false if not supported)
     bool (*cpy_tensor)   (ggml_backend_buffer_t buffer, const const struct ggml_tensor * src, struct ggml_tensor * dst);
     // clear the entire buffer
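In this commit only the RPC backend implements the new callback; the CPU and multi-buffer interfaces leave it NULL. Purely as an illustration of the contract (not code from this commit; every name below is made up, and the usual ggml-backend-impl.h / <cstring> includes are assumed), a backend-local implementation could look roughly like this:

// hypothetical store of previously seen blobs, keyed by their hash
struct cache_entry {
    uint64_t     hash;
    const void * data;
    size_t       size;
};

static cache_entry local_cache[64];
static int         local_cache_n = 0;

static const cache_entry * local_cache_find(uint64_t hash) {
    for (int i = 0; i < local_cache_n; ++i) {
        if (local_cache[i].hash == hash) {
            return &local_cache[i];
        }
    }
    return nullptr;
}

// possible shape of the optional callback: copy the cached bytes into the
// tensor on a hit, or return false so the caller falls back to set_tensor()
static bool example_buffer_load_tensor(ggml_backend_buffer_t buffer,
        struct ggml_tensor * tensor, size_t offset, uint64_t hash) {
    GGML_UNUSED(buffer);
    const cache_entry * e = local_cache_find(hash);
    if (e == nullptr) {
        return false;
    }
    memcpy((char *) tensor->data + offset, e->data, e->size);
    return true;
}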

ggml/src/ggml-backend.cpp

Lines changed: 15 additions & 0 deletions
@@ -268,6 +268,18 @@ void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     buf->iface.set_tensor(buf, tensor, data, offset, size);
 }
 
+bool ggml_backend_tensor_load(struct ggml_tensor * tensor, size_t offset, uint64_t hash) {
+    GGML_ASSERT(tensor);
+    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
+    GGML_ASSERT(buf != NULL && "tensor buffer not set");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+
+    if (buf->iface.load_tensor == NULL) {
+        return false;
+    }
+    return buf->iface.load_tensor(buf, tensor, offset, hash);
+}
+
 void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     GGML_ASSERT(tensor);
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
@@ -560,6 +572,7 @@ static const struct ggml_backend_buffer_i ggml_backend_multi_buffer_i = {
     /* .memset_tensor = */ NULL,
     /* .set_tensor    = */ NULL,
     /* .get_tensor    = */ NULL,
+    /* .load_tensor   = */ NULL,
     /* .cpy_tensor    = */ NULL,
     /* .clear         = */ ggml_backend_multi_buffer_clear,
     /* .reset         = */ NULL,
@@ -1908,6 +1921,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
     /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
     /* .set_tensor    = */ ggml_backend_cpu_buffer_set_tensor,
     /* .get_tensor    = */ ggml_backend_cpu_buffer_get_tensor,
+    /* .load_tensor   = */ NULL,
     /* .cpy_tensor    = */ ggml_backend_cpu_buffer_cpy_tensor,
     /* .clear         = */ ggml_backend_cpu_buffer_clear,
     /* .reset         = */ NULL,
@@ -1920,6 +1934,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
     /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
     /* .set_tensor    = */ ggml_backend_cpu_buffer_set_tensor,
     /* .get_tensor    = */ ggml_backend_cpu_buffer_get_tensor,
+    /* .load_tensor   = */ NULL,
     /* .cpy_tensor    = */ ggml_backend_cpu_buffer_cpy_tensor,
     /* .clear         = */ ggml_backend_cpu_buffer_clear,
     /* .reset         = */ NULL,

ggml/src/ggml-rpc/ggml-rpc.cpp

Lines changed: 16 additions & 0 deletions
@@ -569,6 +569,21 @@ static void ggml_backend_rpc_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     GGML_ASSERT(status);
 }
 
+static bool ggml_backend_rpc_buffer_load_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, size_t offset, uint64_t hash) {
+    ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
+    rpc_tensor rpc_tensor = serialize_tensor(tensor);
+    // input serialization format: | rpc_tensor | offset (8 bytes) | hash (8 bytes) |
+    size_t input_size = sizeof(rpc_tensor) + sizeof(uint64_t) + sizeof(uint64_t);
+    std::vector<uint8_t> input(input_size, 0);
+    memcpy(input.data(), &rpc_tensor, sizeof(rpc_tensor));
+    memcpy(input.data() + sizeof(rpc_tensor), &offset, sizeof(offset));
+    memcpy(input.data() + sizeof(rpc_tensor) + sizeof(offset), &hash, sizeof(hash));
+    rpc_msg_set_tensor_hash_rsp response;
+    bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR_HASH, input.data(), input.size(), &response, sizeof(response));
+    GGML_ASSERT(status);
+    return response.result;
+}
+
 static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
     // check if src and dst are on the same server
     ggml_backend_buffer_t src_buffer = src->buffer;
@@ -602,6 +617,7 @@ static ggml_backend_buffer_i ggml_backend_rpc_buffer_interface = {
     /* .memset_tensor = */ NULL,
     /* .set_tensor    = */ ggml_backend_rpc_buffer_set_tensor,
     /* .get_tensor    = */ ggml_backend_rpc_buffer_get_tensor,
+    /* .load_tensor   = */ ggml_backend_rpc_buffer_load_tensor,
     /* .cpy_tensor    = */ ggml_backend_rpc_buffer_cpy_tensor,
     /* .clear         = */ ggml_backend_rpc_buffer_clear,
     /* .reset         = */ NULL,
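The matching server-side handler for RPC_CMD_SET_TENSOR_HASH is not shown in this diff. A sketch of how the fixed-size payload built above would be parsed on the receiving end (the function name is an assumption; the layout mirrors the client serialization):

// wire layout, as serialized by the client above:
// | rpc_tensor | offset (8 bytes) | hash (8 bytes) |
static bool deserialize_set_tensor_hash(const std::vector<uint8_t> & input,
        rpc_tensor & tensor, uint64_t & offset, uint64_t & hash) {
    if (input.size() != sizeof(rpc_tensor) + 2 * sizeof(uint64_t)) {
        return false; // malformed request
    }
    memcpy(&tensor, input.data(), sizeof(rpc_tensor));
    memcpy(&offset, input.data() + sizeof(rpc_tensor), sizeof(offset));
    memcpy(&hash,   input.data() + sizeof(rpc_tensor) + sizeof(offset), sizeof(hash));
    return true;
}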

src/llama-model-loader.cpp

Lines changed: 11 additions & 1 deletion
@@ -376,6 +376,7 @@ namespace GGUFMeta {
 template bool llama_model_loader::get_key<bool>       (enum llm_kv kid, bool & result, bool required);
 template bool llama_model_loader::get_key<float>      (enum llm_kv kid, float & result, bool required);
 template bool llama_model_loader::get_key<uint32_t>   (enum llm_kv kid, uint32_t & result, bool required);
+template bool llama_model_loader::get_key<uint64_t>   (enum llm_kv kid, uint64_t & result, bool required);
 template bool llama_model_loader::get_key<std::string>(enum llm_kv kid, std::string & result, bool required);
 
 template<>
@@ -1022,7 +1023,16 @@ bool llama_model_loader::load_all_data(
                 mmap_used.first  = std::min(mmap_used.first,  weight->offs);
                 mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
             } else {
-                ggml_backend_tensor_set(cur, data, 0, n_size);
+                char hash_key[128];
+                snprintf(hash_key, sizeof(hash_key), "%s_hash", ggml_get_name(cur));
+                uint64_t hash_val = 0;
+                if (get_key(hash_key, hash_val, false)) {
+                    if (!ggml_backend_tensor_load(cur, 0, hash_val)) {
+                        ggml_backend_tensor_set(cur, data, 0, n_size);
+                    }
+                } else {
+                    ggml_backend_tensor_set(cur, data, 0, n_size);
+                }
             }
         } else {
             const auto & file = files.at(weight->idx);
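Taken together: running gguf-hash with the new --fnv switch writes a <model>.rpc sidecar whose KV store carries one u64 entry per tensor under the key "<tensor name>_hash". At load time, llama_model_loader looks up that same key and, when present, first offers the hash to the backend via ggml_backend_tensor_load(), uploading the raw bytes with ggml_backend_tensor_set() only when the backend reports a miss.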
