diff --git a/common/arg.cpp b/common/arg.cpp
index 231de227a9122..70fbc50d19fd4 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2280,12 +2280,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             "- distribute: spread execution evenly over all nodes\n"
             "- isolate: only spawn threads on CPUs on the node that execution started on\n"
             "- numactl: use the CPU map provided by numactl\n"
+#ifdef GGML_USE_NUMA_MIGRATE
+            "- migrate: pin threads to cores and migrate model pages evenly across NUMA nodes\n"
+#endif
             "if run without this previously, it is recommended to drop the system page cache before using this\n"
             "see https://github.com/ggml-org/llama.cpp/issues/1437",
         [](common_params & params, const std::string & value) {
             /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
             else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
             else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
+#ifdef GGML_USE_NUMA_MIGRATE
+            else if (value == "migrate") { params.numa = GGML_NUMA_STRATEGY_MIGRATE; }
+#endif
             else { throw std::invalid_argument("invalid value"); }
         }
     ).set_env("LLAMA_ARG_NUMA"));
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index 7b398ae8e30ed..35ac888bc192b 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -152,6 +152,11 @@ set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
                                             "ggml: BLAS library vendor")
 option(GGML_LLAMAFILE "ggml: use LLAMAFILE" ${GGML_LLAMAFILE_DEFAULT})
 
+option(GGML_NUMA_MIGRATE "ggml: enable NUMA page migration" OFF)
+set(GGML_NUMA_MIGRATE_NODES "2" CACHE STRING
+    "ggml: number of NUMA nodes to spread migrated pages across")
+option(GGML_NUMA_MIGRATE_DEBUG "ggml: enable debugging of NUMA_MIGRATE" OFF)
+
 option(GGML_CUDA "ggml: use CUDA" OFF)
 option(GGML_MUSA "ggml: use MUSA" OFF)
 option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 778927f68217a..6b02d5f24a54a 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -348,6 +348,9 @@ extern "C" {
     // CPU buffer types are always available
     GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
     GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
+#ifdef GGML_USE_NUMA_MIGRATE
+    GGML_API size_t ggml_backend_get_page_size(void);
+#endif
 
 #ifdef __cplusplus
 }
diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h
index de77a875ec533..79f93910e0d88 100644
--- a/ggml/include/ggml-cpu.h
+++ b/ggml/include/ggml-cpu.h
@@ -12,6 +12,9 @@ extern "C" {
     struct ggml_cplan {
         size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
         uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
+#ifdef GGML_USE_NUMA_MIGRATE
+        uint8_t * work_data_numa[GGML_NUMA_MIGRATE_NODES];
+#endif
 
         int n_threads;
         struct ggml_threadpool * threadpool;
@@ -28,6 +31,9 @@ extern "C" {
         GGML_NUMA_STRATEGY_ISOLATE = 2,
         GGML_NUMA_STRATEGY_NUMACTL = 3,
         GGML_NUMA_STRATEGY_MIRROR  = 4,
+#ifdef GGML_USE_NUMA_MIGRATE
+        GGML_NUMA_STRATEGY_MIGRATE = 5,
+#endif
         GGML_NUMA_STRATEGY_COUNT
     };
 
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 17c9366f4a3cf..cc7d6f80d5b0b 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -381,3 +381,27 @@ if (BUILD_SHARED_LIBS)
         target_compile_definitions(${target} PUBLIC GGML_SHARED)
     endforeach()
 endif()
+
+if (GGML_NUMA_MIGRATE)
+    find_path(NUMA_ROOT_DIR
+        NAMES include/numa.h
+        PATHS ENV NUMA_ROOT
+        DOC "NUMA root directory")
+
+    find_library(NUMA_LIBRARY
+        NAMES numa
+        HINTS ${NUMA_ROOT_DIR}
+        DOC "NUMA library")
+
+    if (NOT NUMA_LIBRARY)
+        message(FATAL_ERROR "Could NOT find NUMA library.")
+    endif()
+
+    if (GGML_NUMA_MIGRATE_DEBUG)
+        target_compile_definitions(ggml-base PUBLIC GGML_USE_NUMA_MIGRATE GGML_NUMA_MIGRATE_NODES=${GGML_NUMA_MIGRATE_NODES} GGML_USE_NUMA_MIGRATE_DEBUG)
+    else()
+        target_compile_definitions(ggml-base PUBLIC GGML_USE_NUMA_MIGRATE GGML_NUMA_MIGRATE_NODES=${GGML_NUMA_MIGRATE_NODES})
+    endif()
+
+    target_link_libraries(ggml-base PRIVATE ${NUMA_LIBRARY})
+endif()
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index 5fd379f6a9461..5d0d1331a38af 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -948,6 +948,22 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
         ggml_backend_buffer_type_t buft, size_t size,
         ggml_backend_buffer_t ** buffers, size_t * n_buffers) {
+#ifdef GGML_USE_NUMA_MIGRATE
+    size_t num_of_tensors = 0;
+    for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
+        if (t->data == NULL) {
+            if (t->view_src == NULL) {
+                num_of_tensors++;
+            }
+        }
+    }
+    size_t ps = ggml_backend_get_page_size();
+    size_t original_size = size;
+    size += ps * num_of_tensors;
+    GGML_LOG_DEBUG("alloc buffer for NUMA page migration, num of tensors: %zu, size increased from %zu to %zu, increased %zu MiB\n",
+                   num_of_tensors, original_size, size, (size - original_size) / 1024 / 1024);
+#endif
+
     ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
     if (buffer == NULL) {
         GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index b1050ad59c26a..7bebdd27f9817 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -22,12 +22,53 @@
 #include
 #include
 #include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifdef GGML_USE_NUMA_MIGRATE
+#include <numa.h>
+#include <numaif.h>
+#endif
 
 #ifdef __APPLE__
 #include
 #include
 #endif
 
+#ifdef GGML_USE_NUMA_MIGRATE
+class numa_migrate_mapping_cache {
+public:
+    void * addr;
+    size_t size;
+    numa_migrate_mapping_cache(void * addr, size_t size): addr(addr), size(size) { }
+
+    bool operator<(const numa_migrate_mapping_cache& other) const {
+        if (addr != other.addr) {
+            return addr < other.addr;
+        } else {
+            return size < other.size;
+        }
+    }
+
+    bool operator==(const numa_migrate_mapping_cache& other) const {
+        return (addr == other.addr && size == other.size);
+    }
+
+};
+
+static std::set<numa_migrate_mapping_cache> ggml_mapping_cache;
+static size_t ggml_backend_page_size = 0;
+static std::mutex ggml_mapping_mutex;
+#endif
 
 // backend buffer type
@@ -1666,6 +1707,244 @@ enum ggml_status ggml_backend_view_init(struct ggml_tensor * tensor) {
     return ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
 }
 
+#ifdef GGML_USE_NUMA_MIGRATE
+#ifdef GGML_USE_NUMA_MIGRATE_DEBUG
+static int check_numa_pages_migration(void *addr, size_t total_size) {
+    if (total_size % ggml_backend_page_size != 0) {
+        return -1;
+    }
+
+    size_t offset = 0; // Offset in bytes from the start of the allocated memory
+    int num_nodes = GGML_NUMA_MIGRATE_NODES;
+
+    for (int i = 0; i < num_nodes; ++i) {
+        int target_node = i;
+        size_t size_to_migrate = total_size / num_nodes;
+
+        if (size_to_migrate > total_size - offset) {
+            GGML_LOG_ERROR(
+                "Error: Size to migrate to node %d exceeds remaining memory, "
"size_to_migrate: %ld, total: %ld\n", + target_node, size_to_migrate, total_size); + return -1; + } + + size_t num_pages_to_migrate = size_to_migrate / ggml_backend_page_size; + if (size_to_migrate % ggml_backend_page_size != 0) { + GGML_LOG_WARN("Warning: Size to migrate to node %ld is not a " + "multiple of page size, total: %ld size_to_migrate: " + "%ld, ggml_backend_page_size: %ld.\n", + target_node, total_size, size_to_migrate, + ggml_backend_page_size); + return -1; + } + + if (num_pages_to_migrate == 0) { + GGML_LOG_WARN("Warning: No pages to migrate to node %d.\n", + target_node); + continue; + } + + void *migrate_start_addr = (char *)addr + (i)*size_to_migrate; + + int *status = (int *)malloc(num_pages_to_migrate * sizeof(int)); + if (!status) { + GGML_LOG_ERROR("malloc for status failed"); + return -1; + } + memset(status, 0, num_pages_to_migrate * sizeof(int)); + + int *nodes = (int *)malloc(num_pages_to_migrate * sizeof(int)); + if (!nodes) { + GGML_LOG_ERROR("malloc for nodes failed"); + return -1; + } + memset(nodes, 0, num_pages_to_migrate * sizeof(int)); + + void **addr_to_migrate = + (void **)malloc(num_pages_to_migrate * sizeof(void *)); + for (size_t j = 0; j < num_pages_to_migrate; j++) { + status[j] = 0; + nodes[j] = i; + addr_to_migrate[j] = (void *)((char *)migrate_start_addr + + j * ggml_backend_page_size); + } + + // check if pages are migrated + int ret = move_pages(0, num_pages_to_migrate, addr_to_migrate, NULL, + status, MPOL_MF_MOVE); + if (ret < 0) { + GGML_LOG_ERROR("check pages failed"); + free(status); + free(nodes); + return -1; + } + + for (size_t j = 0; j < num_pages_to_migrate; ++j) { + if (status[j] != target_node) { + GGML_LOG_WARN("Warning: Page %zu migration status to node %d: " + "%d, ret: %d, addr: %p\n", + j, target_node, status[j], ret, + addr_to_migrate[j]); + if (status[j] == -ENODEV) { + GGML_LOG_ERROR( + " - Error: No such device (NUMA node problem)\n"); + } else if (status[j] == -EPERM) { + GGML_LOG_ERROR( + " - Error: Operation not permitted (permissions)\n"); + } else if (status[j] == -ENOENT) { + GGML_LOG_ERROR(" - Error: ENOENT\n"); + } else if (status[j] == -EFAULT) { + GGML_LOG_ERROR(" - Error: Bad address\n"); + } else if (status[j] == -EINVAL) { + GGML_LOG_ERROR(" - Error: Invalid argument\n"); + } else if (status[j] == -ENOMEM) { + GGML_LOG_ERROR(" - Error: Out of memory\n"); + } else if (status[j] == -EACCES) { + GGML_LOG_ERROR(" - Error: access\n"); + } else if (status[j] == -ESRCH) { + GGML_LOG_ERROR(" - Error: access\n"); + } else { + GGML_LOG_ERROR(" - Error: Unknown status code at j: %ld: " + "%d, total_size: %ld\n", + j, status[j], total_size); + } + + exit(0); + return -1; + } + } + + free(status); + free(nodes); + free(addr_to_migrate); + + offset += size_to_migrate; + } + + GGML_LOG_INFO( + "page migration check passed at %p, size: %ld, num nodes: %d\n", addr, + total_size, num_nodes); + return 0; +} +#endif + +// Function to migrate pages to multiple NUMA nodes. +static int migrate_pages_multiple_nodes(void *addr, size_t total_size) { + + if (total_size % ggml_backend_page_size != 0) { + GGML_LOG_WARN("Warning: Total size is not a multiple of page size. 
" + "Some memory may not be migrated.\n"); + return -1; + } + + size_t offset = 0; // Offset in bytes from the start of the allocated memory + int num_nodes = GGML_NUMA_MIGRATE_NODES; + + for (int i = 0; i < num_nodes; ++i) { + int target_node = i; + size_t size_to_migrate = total_size / num_nodes; + + if (size_to_migrate > total_size - offset) { + GGML_LOG_ERROR( + "Error: Size to migrate to node %d exceeds remaining memory, " + "size_to_migrate: %ld, total: %ld\n", + target_node, size_to_migrate, total_size); + return -1; + } + + size_t num_pages_to_migrate = size_to_migrate / ggml_backend_page_size; + if (size_to_migrate % ggml_backend_page_size != 0) { +#ifdef GGML_USE_NUMA_MIGRATE_DEBUG + GGML_LOG_WARN("Warning: Size to migrate to node %ld is not a " + "multiple of page size, total: %ld size_to_migrate: " + "%ld, ggml_backend_page_size: %ld.\n", + target_node, total_size, size_to_migrate, + ggml_backend_page_size); +#endif + return -1; + } + + if (num_pages_to_migrate == 0) { + GGML_LOG_WARN("Warning: No pages to migrate to node %d.\n", + target_node); + continue; + } + + void *migrate_start_addr = (char *)addr + (i)*size_to_migrate; + + int *status = (int *)malloc(num_pages_to_migrate * sizeof(int)); + if (!status) { + GGML_LOG_ERROR("malloc for status failed"); + return -1; + } + memset(status, 0, num_pages_to_migrate * sizeof(int)); + + int *nodes = (int *)malloc(num_pages_to_migrate * sizeof(int)); + if (!nodes) { + GGML_LOG_ERROR("malloc for nodes failed"); + return -1; + } + memset(nodes, 0, num_pages_to_migrate * sizeof(int)); + + void **addr_to_migrate = + (void **)malloc(num_pages_to_migrate * sizeof(void *)); + for (size_t j = 0; j < num_pages_to_migrate; j++) { + status[j] = 0; + nodes[j] = i; + addr_to_migrate[j] = (void *)((char *)migrate_start_addr + + j * ggml_backend_page_size); + } + + int ret = move_pages(0, num_pages_to_migrate, addr_to_migrate, nodes, + status, MPOL_MF_MOVE); + if (ret < 0) { + GGML_LOG_ERROR("move_pages failed"); + free(status); + free(nodes); + return -1; + } + + free(status); + free(nodes); + free(addr_to_migrate); + + offset += size_to_migrate; + } + + return 0; +} + +static void migrate_pages_with_cache(void *addr, size_t size, + bool force_memset) { + if (size >= GGML_NUMA_MIGRATE_NODES * ggml_backend_page_size) { + numa_migrate_mapping_cache current_addr(addr, size); + std::lock_guard lock(ggml_mapping_mutex); + auto it = ggml_mapping_cache.find(current_addr); + if (it == ggml_mapping_cache.end()) { + GGML_ASSERT(((uint64_t)(addr) & (ggml_backend_page_size - 1)) == 0); + int num_pages = + size / ggml_backend_page_size / GGML_NUMA_MIGRATE_NODES; + if (num_pages && ((size % ggml_backend_page_size) == 0)) { + if (force_memset) { + memset(addr, 0, size); // force to allocate memory + } + if (migrate_pages_multiple_nodes(addr, size) != 0) { + GGML_LOG_DEBUG("Migration to multiple nodes failed, addr: " + "%p, size: %ld\n", + addr, size); + } else { +#ifdef GGML_USE_NUMA_MIGRATE_DEBUG + check_numa_pages_migration(addr, size); +#endif + } + ggml_mapping_cache.insert(current_addr); + } + } + } +} +#endif + enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) { GGML_ASSERT(tensor->buffer == NULL); GGML_ASSERT(tensor->data == NULL); @@ -1676,6 +1955,11 @@ enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct tensor->buffer = buffer; tensor->data = addr; + +#ifdef GGML_USE_NUMA_MIGRATE + size_t size = ggml_backend_buffer_get_alloc_size(buffer, tensor); + 
migrate_pages_with_cache(tensor->data, size, true); +#endif return ggml_backend_buffer_init_tensor(buffer, tensor); } @@ -1869,16 +2153,28 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { uintptr_t data = (uintptr_t)buffer->context; +#ifdef GGML_USE_NUMA_MIGRATE + // align the buffer + if (data % ggml_backend_page_size != 0) { + data = GGML_PAD(data, ggml_backend_page_size); + } +#else // align the buffer if (data % TENSOR_ALIGNMENT != 0) { data = GGML_PAD(data, TENSOR_ALIGNMENT); } +#endif return (void *)data; } static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { +#ifdef GGML_USE_NUMA_MIGRATE + numa_free(buffer->context, buffer->size); +#else ggml_aligned_free(buffer->context, buffer->size); +#endif + } static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { @@ -1947,8 +2243,22 @@ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_ty GGML_UNUSED(buft); } +#ifdef GGML_USE_NUMA_MIGRATE +size_t ggml_backend_get_page_size(void) { + if (ggml_backend_page_size == 0) { + ggml_backend_page_size = sysconf(_SC_PAGE_SIZE); + } + return ggml_backend_page_size; +} +#endif + static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +#ifdef GGML_USE_NUMA_MIGRATE + ggml_backend_get_page_size(); + void * data = numa_alloc_onnode(size, 0); +#else void * data = ggml_aligned_malloc(size); +#endif if (data == NULL) { GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size); @@ -1959,7 +2269,11 @@ static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_back } static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +#ifdef GGML_USE_NUMA_MIGRATE + return ggml_backend_get_page_size(); +#else return TENSOR_ALIGNMENT; +#endif GGML_UNUSED(buft); } diff --git a/ggml/src/ggml-cpu/amx/amx.cpp b/ggml/src/ggml-cpu/amx/amx.cpp index 258857b00754a..b57ec45025946 100644 --- a/ggml/src/ggml-cpu/amx/amx.cpp +++ b/ggml/src/ggml-cpu/amx/amx.cpp @@ -133,7 +133,11 @@ static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_back } static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +#ifdef GGML_USE_NUMA_MIGRATE + return ggml_backend_get_page_size(); +#else return TENSOR_ALIGNMENT; +#endif GGML_UNUSED(buft); } diff --git a/ggml/src/ggml-cpu/ggml-cpu-impl.h b/ggml/src/ggml-cpu/ggml-cpu-impl.h index ae68cd006336d..ee4353771565b 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-impl.h +++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h @@ -22,6 +22,9 @@ struct ggml_compute_params { // work buffer for all threads size_t wsize; void * wdata; +#ifdef GGML_USE_NUMA_MIGRATE + void * wdata_numa[GGML_NUMA_MIGRATE_NODES]; +#endif struct ggml_threadpool * threadpool; }; @@ -502,6 +505,17 @@ static __m256 __lasx_xvreplfr2vr_s(const float val) { // TODO: move to ggml-threading void ggml_barrier(struct ggml_threadpool * tp); +#ifdef GGML_USE_NUMA_MIGRATE +enum ggml_barrier_node_index { + GGML_BARRIER_NODE_PING = 0, + GGML_BARRIER_NODE_PONG = 1, + GGML_BARRIER_NODE_LAST = 2, + GGML_BARRIER_NODE_CNTS = 3 +}; +void ggml_barrier_numa_aware(struct ggml_threadpool * tp, int ith, int node_n); +int ggml_cores_per_numa(void); +int ggml_get_node_from_cpu(int cpu); +#endif void ggml_threadpool_chunk_set(struct 
ggml_threadpool * tp, int value); int ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value); diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 2c12e493bc9b0..204aea79de94c 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -42,6 +42,10 @@ #include #endif +#ifdef GGML_USE_NUMA_MIGRATE +#include +#endif + #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8) #undef GGML_USE_LLAMAFILE #endif @@ -444,6 +448,13 @@ struct ggml_threadpool { atomic_int n_graph; // incremented when there is work to be done (i.e each graph) atomic_int GGML_CACHE_ALIGN n_barrier; atomic_int GGML_CACHE_ALIGN n_barrier_passed; + +#ifdef GGML_USE_NUMA_MIGRATE + atomic_int GGML_CACHE_ALIGN *n_barrier_node[GGML_NUMA_MIGRATE_NODES]; + atomic_int GGML_CACHE_ALIGN *n_barrier_passed_node[GGML_NUMA_MIGRATE_NODES]; + atomic_int GGML_CACHE_ALIGN *n_barrier_passed_last[GGML_BARRIER_NODE_CNTS]; +#endif + atomic_int GGML_CACHE_ALIGN current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads. // these are atomic as an annotation for thread-sanitizer @@ -509,6 +520,10 @@ struct ggml_numa_nodes { #else uint32_t cpuset; // no NUMA support outside of Linux at this time. Use a portable datatype #endif + +#ifdef GGML_USE_NUMA_MIGRATE + bool even_distributed; +#endif }; // @@ -567,6 +582,67 @@ int ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value) { return atomic_fetch_add_explicit(&tp->current_chunk, value, memory_order_relaxed); } +#ifdef GGML_USE_NUMA_MIGRATE +int ggml_get_node_from_cpu(int cpu) { + return cpu / g_state.numa.nodes[0].n_cpus; +} + +int ggml_cores_per_numa(void) { + return g_state.numa.nodes[0].n_cpus; +} + +void ggml_barrier_numa_aware(struct ggml_threadpool * tp, int ith, int node_n) { + if ((g_state.numa.numa_strategy != GGML_NUMA_STRATEGY_MIGRATE) || !g_state.numa.even_distributed) { + ggml_barrier(tp); + return; + } + + int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed); + if (n_threads == 1) { + return; + } + + int cores_per_numa = ggml_cores_per_numa(); + int numa_nodes = n_threads / cores_per_numa; + int remaining_cores = n_threads % cores_per_numa; + if ((numa_nodes != GGML_NUMA_MIGRATE_NODES) || remaining_cores) { + ggml_barrier(tp); + return; + } + + int node = ggml_get_node_from_cpu(ith); + + int n_passed = atomic_load_explicit(tp->n_barrier_passed_node[node], memory_order_relaxed); + + // enter barrier (full seq-cst fence) + int n_barrier = atomic_fetch_add_explicit(tp->n_barrier_node[node], 1, memory_order_seq_cst); + + if (n_barrier == (cores_per_numa - 1)) { + // last thread of current numa node + atomic_store_explicit(tp->n_barrier_node[node], 0, memory_order_seq_cst); + + int n_passed_node = atomic_fetch_add_explicit(tp->n_barrier_passed_last[node_n], 1, memory_order_seq_cst); + + if (n_passed_node == (numa_nodes - 1)) { // last numa node cpu + atomic_fetch_add_explicit(tp->n_barrier_passed_node[node], 1, memory_order_seq_cst); + atomic_store_explicit(tp->n_barrier_passed_last[node_n], 0, memory_order_seq_cst); + } else { + while (atomic_load_explicit(tp->n_barrier_passed_last[node_n], memory_order_relaxed)) { + ggml_thread_cpu_relax(); + } + atomic_fetch_add_explicit(tp->n_barrier_passed_node[node], 1, memory_order_seq_cst); + } + + return; + } + + // wait for other threads + while (atomic_load_explicit(tp->n_barrier_passed_node[node], memory_order_seq_cst) == n_passed) { + ggml_thread_cpu_relax(); + } +} +#endif + #if defined(__gnu_linux__) 
static cpu_set_t ggml_get_numa_affinity(void) {
     cpu_set_t cpuset;
@@ -643,6 +719,10 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
         struct ggml_numa_node * node = &g_state.numa.nodes[n];
         GGML_PRINT_DEBUG("CPUs on node %u:", n);
         node->n_cpus = 0;
+
+#ifdef GGML_USE_NUMA_MIGRATE
+        g_state.numa.even_distributed = true;
+#endif
         for (uint32_t c = 0; c < g_state.numa.total_cpus; ++c) {
             rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c);
             GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
@@ -652,6 +732,11 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
             }
         }
         GGML_PRINT_DEBUG("\n");
+#ifdef GGML_USE_NUMA_MIGRATE
+        if ((n != 0) && (g_state.numa.nodes[n].n_cpus != g_state.numa.nodes[0].n_cpus)) {
+            g_state.numa.even_distributed = false;
+        }
+#endif
     }
 
     if (ggml_is_numa()) {
@@ -2082,6 +2167,30 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
 
 // Android's libc implementation "bionic" does not support setting affinity
 #if defined(__gnu_linux__)
+
+#ifdef GGML_USE_NUMA_MIGRATE
+static void set_numa_migrate_affinity(int core_no) {
+    // Check that the requested core number is valid
+    if (core_no < 0 || core_no >= (int)g_state.numa.total_cpus) {
+        printf("%s: warning: core_no %d is outside [0, %u), falling back to default affinity.\n", __func__, core_no, g_state.numa.total_cpus);
+        return;
+    }
+
+    cpu_set_t cpuset;
+    CPU_ZERO(&cpuset);         // Initialize the CPU set
+
+    CPU_SET(core_no, &cpuset); // Set the specified core
+
+    // Set the thread's CPU affinity
+    int result = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
+    if (result != 0) {
+        fprintf(stderr, "%s: failed to set affinity for core %d: %s\n",
+                __func__, core_no, strerror(result));
+        exit(1);
+    }
+}
+#endif
+
 static void set_numa_thread_affinity(int thread_n) {
     if (!ggml_is_numa()) {
         return;
@@ -2107,6 +2216,11 @@ static void set_numa_thread_affinity(int thread_n) {
                 fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",strerror(rv));
             }
             return;
+#ifdef GGML_USE_NUMA_MIGRATE
+        case GGML_NUMA_STRATEGY_MIGRATE:
+            set_numa_migrate_affinity(thread_n);
+            return;
+#endif
         default:
             return;
     }
@@ -2855,6 +2969,11 @@ struct ggml_cplan ggml_graph_plan(
     cplan.n_threads = MIN(max_tasks, n_threads);
     cplan.work_size = work_size;
     cplan.work_data = NULL;
+#ifdef GGML_USE_NUMA_MIGRATE
+    for (int i = 0; i < GGML_NUMA_MIGRATE_NODES; i++) {
+        cplan.work_data_numa[i] = NULL;
+    }
+#endif
 
     return cplan;
 }
@@ -2873,12 +2992,20 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         /*.nth   =*/ atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed),
         /*.wsize =*/ cplan->work_size,
         /*.wdata =*/ cplan->work_data,
+#ifdef GGML_USE_NUMA_MIGRATE
+        /*.wdata_numa =*/ {NULL},
+#endif
         /*.threadpool=*/ tp,
     };
 
+#ifdef GGML_USE_NUMA_MIGRATE
+    for (int i = 0; i < GGML_NUMA_MIGRATE_NODES; i++) {
+        params.wdata_numa[i] = cplan->work_data_numa[ggml_get_node_from_cpu(state->ith)];
+    }
+#endif
+
     for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
         struct ggml_tensor * node = cgraph->nodes[node_n];
-
         ggml_compute_forward(&params, node);
 
         if (state->ith == 0 && cplan->abort_callback &&
@@ -2888,11 +3015,19 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         }
 
         if (node_n + 1 < cgraph->n_nodes) {
-            ggml_barrier(state->threadpool);
+#ifdef GGML_USE_NUMA_MIGRATE
+            ggml_barrier_numa_aware(state->threadpool, state->ith, node_n % GGML_BARRIER_NODE_LAST);
+#else
+            ggml_barrier(tp);
+#endif
         }
     }
 
-    ggml_barrier(state->threadpool);
+#ifdef GGML_USE_NUMA_MIGRATE
+ ggml_barrier_numa_aware(state->threadpool, state->ith, GGML_BARRIER_NODE_LAST); +#else + ggml_barrier(tp); +#endif return 0; } @@ -3057,6 +3192,21 @@ static struct ggml_threadpool * ggml_threadpool_new_impl( threadpool->n_graph = 0; threadpool->n_barrier = 0; threadpool->n_barrier_passed = 0; + +#ifdef GGML_USE_NUMA_MIGRATE + for (int node = 0; node < GGML_NUMA_MIGRATE_NODES; node++) { + threadpool->n_barrier_node[node] = (atomic_int *)numa_alloc_onnode(sizeof(atomic_int), node); + *threadpool->n_barrier_node[node] = 0; + threadpool->n_barrier_passed_node[node] = (atomic_int *)numa_alloc_onnode(sizeof(atomic_int), node); + *threadpool->n_barrier_passed_node[node] = 0; + } + + for (int i = 0; i < GGML_BARRIER_NODE_CNTS; i++) { + threadpool->n_barrier_passed_last[i] = (atomic_int *)malloc(sizeof(atomic_int)); + *threadpool->n_barrier_passed_last[i] = 0; + } +#endif + threadpool->current_chunk = 0; threadpool->stop = false; threadpool->pause = tpp->paused; diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index 735ef3f015c13..f87aaea514525 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -9,6 +9,9 @@ #include #include #include +#ifdef GGML_USE_NUMA_MIGRATE +#include +#endif #ifdef GGML_USE_CPU_HBM # include "hbm.h" @@ -87,6 +90,9 @@ struct ggml_backend_cpu_context { ggml_threadpool_t threadpool; uint8_t * work_data; +#ifdef GGML_USE_NUMA_MIGRATE + uint8_t * work_data_numa[GGML_NUMA_MIGRATE_NODES]; +#endif size_t work_size; ggml_abort_callback abort_callback; @@ -102,6 +108,11 @@ static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) { static void ggml_backend_cpu_free(ggml_backend_t backend) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; delete[] cpu_ctx->work_data; +#ifdef GGML_USE_NUMA_MIGRATE + for (int i = 0; i < GGML_NUMA_MIGRATE_NODES; i++) { + numa_free(cpu_ctx->work_data_numa[i], cpu_ctx->work_size); + } +#endif delete cpu_ctx; delete backend; } @@ -162,9 +173,24 @@ static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, s cpu_ctx->work_size = 0; return GGML_STATUS_ALLOC_FAILED; } + +#ifdef GGML_USE_NUMA_MIGRATE + for (int i = 0; i < GGML_NUMA_MIGRATE_NODES; i++) { + cpu_ctx->work_data_numa[i] = (uint8_t *)numa_alloc_onnode(cplan.work_size, i); + if (cpu_ctx->work_data_numa[i] == NULL) { + cpu_ctx->work_size = 0; + return GGML_STATUS_ALLOC_FAILED; + } + } +#endif cpu_ctx->work_size = cplan.work_size; } cplan.work_data = (uint8_t *)cpu_ctx->work_data; +#ifdef GGML_USE_NUMA_MIGRATE + for (int i = 0; i < GGML_NUMA_MIGRATE_NODES; i++) { + cplan.work_data_numa[i] = (uint8_t *)(cpu_ctx->work_data_numa[i]); + } +#endif cplan.abort_callback = cpu_ctx->abort_callback; cplan.abort_callback_data = cpu_ctx->abort_callback_data; @@ -205,6 +231,11 @@ ggml_backend_t ggml_backend_cpu_init(void) { ctx->n_threads = GGML_DEFAULT_N_THREADS; ctx->threadpool = NULL; ctx->work_data = NULL; +#ifdef GGML_USE_NUMA_MIGRATE + for (int i = 0; i < GGML_NUMA_MIGRATE_NODES; i++) { + ctx->work_data_numa[i] = NULL; + } +#endif ctx->work_size = 0; ctx->abort_callback = NULL; ctx->abort_callback_data = NULL; diff --git a/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp b/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp index fafe45e6c5c51..dea79898bbac4 100644 --- a/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +++ b/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp @@ -413,7 +413,11 @@ static ggml_backend_buffer_t ggml_backend_cpu_kleidiai_buffer_type_alloc_buffer( } static size_t 
ggml_backend_cpu_kleidiai_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+#ifdef GGML_USE_NUMA_MIGRATE
+    return ggml_backend_get_page_size();
+#else
     return TENSOR_ALIGNMENT;
+#endif
 
     GGML_UNUSED(buft);
 }
diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp
index 604ccee907843..520ce191a3fd2 100644
--- a/ggml/src/ggml-cpu/repack.cpp
+++ b/ggml/src/ggml-cpu/repack.cpp
@@ -17,6 +17,9 @@
 #include
 #include    // for qsort
 #include    // for GGML_ASSERT
+#ifdef GGML_USE_NUMA_MIGRATE
+#include <numa.h>
+#endif
 
 #include "repack.h"
@@ -1232,8 +1235,12 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
     //    GGML_ASSERT(ggml_n_dims(op->src[0]) == 2);
     //    GGML_ASSERT(ggml_n_dims(op->src[1]) == 2);
-
+#ifdef GGML_USE_NUMA_MIGRATE
+    int node_id = ggml_get_node_from_cpu(ith);
+    char * wdata = static_cast<char *>(params->wdata_numa[node_id]);
+#else
     char * wdata = static_cast<char *>(params->wdata);
+#endif
     const size_t nbw1 = ggml_row_size(PARAM_TYPE, ne10);
 
     assert(params->wsize >= nbw1 * ne11);
@@ -1241,18 +1248,31 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
     const ggml_from_float_t from_float = ggml_get_type_traits_cpu(PARAM_TYPE)->from_float;
 
     int64_t i11_processed = 0;
-    for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
+#ifdef GGML_USE_NUMA_MIGRATE
+    int round_cnts = ggml_cores_per_numa();
+    int start_id = ith - round_cnts * node_id;
+    if (round_cnts == 0) {
+        round_cnts = nth;
+        start_id = ith;
+    }
+#else
+    int round_cnts = nth;
+    int start_id = ith;
+#endif
+    for (int64_t i11 = start_id * 4; i11 < ne11 - ne11 % 4; i11 += round_cnts * 4) {
         ggml_quantize_mat_t<INTER_SIZE, PARAM_TYPE>((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), 4, ne10);
     }
 
     i11_processed = ne11 - ne11 % 4;
-    for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
+    for (int64_t i11 = i11_processed + start_id; i11 < ne11; i11 += round_cnts) {
         from_float((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), ne10);
     }
 
+#ifdef GGML_USE_NUMA_MIGRATE
+    ggml_barrier_numa_aware(params->threadpool, ith, GGML_BARRIER_NODE_LAST);
+#else
     ggml_barrier(params->threadpool);
-
-    const void * src1_wdata = params->wdata;
+#endif
     const size_t src1_col_stride = ggml_row_size(PARAM_TYPE, ne10);
     int64_t src0_start = (ith * ne01) / nth;
     int64_t src0_end = ((ith + 1) * ne01) / nth;
@@ -1267,13 +1287,13 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
         gemm<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00, (float *) ((char *) dst->data) + src0_start, ne01,
                 (const char *) src0->data + src0_start * nb01,
-                (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start);
+                (const char *) wdata, ne11 - ne11 % 4, src0_end - src0_start);
     }
     for (int iter = ne11 - ne11 % 4; iter < ne11; iter++) {
         gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00, (float *) ((char *) dst->data + (iter * nb1)) + src0_start, ne01,
                  (const char *) src0->data + src0_start * nb01,
-                 (const char *) src1_wdata + (src1_col_stride * iter), 1,
+                 (const char *) wdata + (src1_col_stride * iter), 1,
                  src0_end - src0_start);
     }
 }
@@ -1498,7 +1518,11 @@ static ggml_backend_buffer_t ggml_backend_cpu_repack_buffer_type_alloc_buffer(gg
 }
 
 static size_t ggml_backend_cpu_repack_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+#ifdef GGML_USE_NUMA_MIGRATE
+    return ggml_backend_get_page_size();
+#else
     return TENSOR_ALIGNMENT;
+#endif
 
     GGML_UNUSED(buft);
 }
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index bd9e6da8832b7..6ebda9250087d 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -711,6 +711,10 @@ llama_model_loader::llama_model_loader(
         use_mmap = false;
     }
 
+#ifdef GGML_USE_NUMA_MIGRATE
+    use_mmap = false;
+#endif
+
     this->use_mmap = use_mmap;
     this->check_tensors = check_tensors;
 }
diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp
index e59d61f195675..7891ddb2c3a78 100644
--- a/tools/llama-bench/llama-bench.cpp
+++ b/tools/llama-bench/llama-bench.cpp
@@ -312,7 +312,12 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("\n");
     printf("options:\n");
     printf("  -h, --help\n");
+#ifdef GGML_USE_NUMA_MIGRATE
+    printf("  --numa <distribute|isolate|numactl|migrate>\n");
+    printf("                                            numa mode (default: disabled)\n");
+#else
     printf("  --numa <distribute|isolate|numactl>       numa mode (default: disabled)\n");
+#endif
     printf("  -r, --repetitions <n>                     number of times to repeat each test (default: %d)\n",
            cmd_params_defaults.reps);
     printf("  --prio <-1|0|1|2|3>                       process/thread priority (default: %d)\n",
@@ -628,6 +633,10 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 params.numa = GGML_NUMA_STRATEGY_ISOLATE;
             } else if (value == "numactl") {
                 params.numa = GGML_NUMA_STRATEGY_NUMACTL;
+#ifdef GGML_USE_NUMA_MIGRATE
+            } else if (value == "migrate") {
+                params.numa = GGML_NUMA_STRATEGY_MIGRATE;
+#endif
             } else {
                 invalid_param = true;
                 break;
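
Illustration (not part of the patch): the core mechanism this change adds to ggml-backend.cpp — a page-aligned allocation via numa_alloc_onnode() followed by move_pages() spreading whole pages across nodes — can be exercised standalone. The sketch below is a minimal example of that pattern only; the file name, buffer size, and the hard-coded two-node split are illustrative assumptions, and it requires libnuma headers and at least two NUMA nodes.

// numa_page_spread.c -- minimal sketch of the move_pages() pattern used by
// migrate_pages_multiple_nodes() in this patch; illustrative only.
#include <numa.h>
#include <numaif.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void) {
    if (numa_available() < 0) {
        fprintf(stderr, "no NUMA support on this system\n");
        return 1;
    }

    const size_t page  = (size_t) sysconf(_SC_PAGE_SIZE);
    const int    nodes = 2;                 // matches the GGML_NUMA_MIGRATE_NODES default
    const size_t size  = 64 * page * nodes; // multiple of page * nodes, as the patch requires

    // page-aligned allocation, first-touch on node 0 (analogous to the CPU buffer allocation path)
    unsigned char * buf = numa_alloc_onnode(size, 0);
    if (!buf) { perror("numa_alloc_onnode"); return 1; }
    memset(buf, 0, size); // force the pages to actually be allocated

    size_t pages_per_node = size / page / nodes;
    void ** pages  = malloc(pages_per_node * sizeof(*pages));
    int  *  target = malloc(pages_per_node * sizeof(*target));
    int  *  status = malloc(pages_per_node * sizeof(*status));
    if (!pages || !target || !status) { return 1; }

    // chunk 0 stays on node 0; each later chunk is migrated page by page to its node
    for (int n = 1; n < nodes; n++) {
        for (size_t i = 0; i < pages_per_node; i++) {
            pages[i]  = buf + ((size_t) n * pages_per_node + i) * page;
            target[i] = n;
        }
        if (move_pages(0 /* self */, pages_per_node, pages, target, status, MPOL_MF_MOVE) < 0) {
            perror("move_pages");
        }
    }

    printf("spread %zu pages across %d nodes\n", size / page, nodes);

    free(pages); free(target); free(status);
    numa_free(buf, size);
    return 0;
}

Build the sketch with: cc -O2 numa_page_spread.c -o numa_page_spread -lnuma. In llama.cpp itself the equivalent path is enabled at build time with -DGGML_NUMA_MIGRATE=ON (optionally -DGGML_NUMA_MIGRATE_NODES=<n>) and selected at run time with --numa migrate.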