diff --git a/common/arg.cpp b/common/arg.cpp
index 231de227a9122..70fbc50d19fd4 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2280,12 +2280,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             "- distribute: spread execution evenly over all nodes\n"
             "- isolate: only spawn threads on CPUs on the node that execution started on\n"
             "- numactl: use the CPU map provided by numactl\n"
+#ifdef GGML_USE_NUMA_MIGRATE
+            "- migrate: pin threads to cores and migrate model pages evenly across NUMA nodes\n"
+#endif
             "if run without this previously, it is recommended to drop the system page cache before using this\n"
             "see https://github.com/ggml-org/llama.cpp/issues/1437",
         [](common_params & params, const std::string & value) {
             /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
             else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
             else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
+#ifdef GGML_USE_NUMA_MIGRATE
+            else if (value == "migrate") { params.numa = GGML_NUMA_STRATEGY_MIGRATE; }
+#endif
             else { throw std::invalid_argument("invalid value"); }
         }
     ).set_env("LLAMA_ARG_NUMA"));
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index 7b398ae8e30ed..35ac888bc192b 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -152,6 +152,11 @@ set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
                                             "ggml: BLAS library vendor")
 option(GGML_LLAMAFILE "ggml: use LLAMAFILE" ${GGML_LLAMAFILE_DEFAULT})
 
+option(GGML_NUMA_MIGRATE "ggml: enable NUMA page migration" OFF)
+set(GGML_NUMA_MIGRATE_NODES "2" CACHE STRING
+    "ggml: number of NUMA nodes to spread migrated pages across")
+option(GGML_NUMA_MIGRATE_DEBUG "ggml: enable debugging of NUMA_MIGRATE" OFF)
+
 option(GGML_CUDA "ggml: use CUDA" OFF)
 option(GGML_MUSA "ggml: use MUSA" OFF)
 option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 778927f68217a..6b02d5f24a54a 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -348,6 +348,9 @@ extern "C" {
     // CPU buffer types are always available
     GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
     GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
+#ifdef GGML_USE_NUMA_MIGRATE
+    GGML_API size_t ggml_backend_get_page_size(void);
+#endif
 
 #ifdef __cplusplus
 }
diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h
index de77a875ec533..79f93910e0d88 100644
--- a/ggml/include/ggml-cpu.h
+++ b/ggml/include/ggml-cpu.h
@@ -12,6 +12,9 @@ extern "C" {
     struct ggml_cplan {
         size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
         uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
+#ifdef GGML_USE_NUMA_MIGRATE
+        uint8_t * work_data_numa[GGML_NUMA_MIGRATE_NODES];
+#endif
 
         int n_threads;
         struct ggml_threadpool * threadpool;
@@ -28,6 +31,9 @@ extern "C" {
         GGML_NUMA_STRATEGY_ISOLATE = 2,
         GGML_NUMA_STRATEGY_NUMACTL = 3,
         GGML_NUMA_STRATEGY_MIRROR  = 4,
+#ifdef GGML_USE_NUMA_MIGRATE
+        GGML_NUMA_STRATEGY_MIGRATE = 5,
+#endif
         GGML_NUMA_STRATEGY_COUNT
     };
 
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 17c9366f4a3cf..cc7d6f80d5b0b 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -381,3 +381,27 @@ if (BUILD_SHARED_LIBS)
         target_compile_definitions(${target} PUBLIC GGML_SHARED)
     endforeach()
 endif()
+
+if (GGML_NUMA_MIGRATE)
+    find_path(NUMA_ROOT_DIR
+        NAMES include/numa.h
+        PATHS ENV NUMA_ROOT
+        DOC "NUMA root directory")
+
+    find_library(NUMA_LIBRARY
+        NAMES numa
+        HINTS ${NUMA_ROOT_DIR}
+        DOC "NUMA library")
+
+    if (NOT NUMA_LIBRARY)
+        message(FATAL_ERROR "Could NOT find NUMA library.")
+    endif()
+
+    if (GGML_NUMA_MIGRATE_DEBUG)
+        target_compile_definitions(ggml-base PUBLIC GGML_USE_NUMA_MIGRATE GGML_NUMA_MIGRATE_NODES=${GGML_NUMA_MIGRATE_NODES} GGML_USE_NUMA_MIGRATE_DEBUG)
+    else()
+        target_compile_definitions(ggml-base PUBLIC GGML_USE_NUMA_MIGRATE GGML_NUMA_MIGRATE_NODES=${GGML_NUMA_MIGRATE_NODES})
+    endif()
+
+    target_link_libraries(ggml-base PRIVATE ${NUMA_LIBRARY})
+endif()
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index 5fd379f6a9461..5d0d1331a38af 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -948,6 +948,22 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
         ggml_backend_buffer_type_t buft, size_t size,
         ggml_backend_buffer_t ** buffers, size_t * n_buffers) {
+#ifdef GGML_USE_NUMA_MIGRATE
+    size_t num_of_tensors = 0;
+    for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
+        if (t->data == NULL) {
+            if (t->view_src == NULL) {
+                num_of_tensors++;
+            }
+        }
+    }
+    size_t ps = ggml_backend_get_page_size();
+    size_t original_size = size;
+    size += ps * num_of_tensors;
+    GGML_LOG_DEBUG("alloc buffer for NUMA page migration, num of tensors: %zu, size increased from %zu to %zu, increased %zu MiB\n",
+                   num_of_tensors, original_size, size, (size - original_size) / 1024 / 1024);
+#endif
+
     ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
     if (buffer == NULL) {
         GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index b1050ad59c26a..7bebdd27f9817 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -22,12 +22,53 @@
 #include
 #include
 #include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifdef GGML_USE_NUMA_MIGRATE
+#include <numa.h>
+#include <numaif.h>
+#endif
 
 #ifdef __APPLE__
 #include
 #include
 #endif
 
+#ifdef GGML_USE_NUMA_MIGRATE
+class numa_migrate_mapping_cache {
+public:
+    void * addr;
+    size_t size;
+    numa_migrate_mapping_cache(void * addr, size_t size): addr(addr), size(size) { }
+
+    bool operator<(const numa_migrate_mapping_cache& other) const {
+        if (addr != other.addr) {
+            return addr < other.addr;
+        } else {
+            return size < other.size;
+        }
+    }
+
+    bool operator==(const numa_migrate_mapping_cache& other) const {
+        return (addr == other.addr && size == other.size);
+    }
+
+};
+
+static std::set<numa_migrate_mapping_cache> ggml_mapping_cache;
+static size_t ggml_backend_page_size = 0;
+static std::mutex ggml_mapping_mutex;
+#endif
 
 // backend buffer type
@@ -1666,6 +1707,244 @@ enum ggml_status ggml_backend_view_init(struct ggml_tensor * tensor) {
     return ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
 }
 
+#ifdef GGML_USE_NUMA_MIGRATE
+#ifdef GGML_USE_NUMA_MIGRATE_DEBUG
+static int check_numa_pages_migration(void *addr, size_t total_size) {
+    if (total_size % ggml_backend_page_size != 0) {
+        return -1;
+    }
+
+    size_t offset = 0; // Offset in bytes from the start of the allocated memory
+    int num_nodes = GGML_NUMA_MIGRATE_NODES;
+
+    for (int i = 0; i < num_nodes; ++i) {
+        int target_node = i;
+        size_t size_to_migrate = total_size / num_nodes;
+
+        if (size_to_migrate > total_size - offset) {
+            GGML_LOG_ERROR(
+                "Error: Size to migrate to node %d exceeds remaining memory, "
"size_to_migrate: %ld, total: %ld\n", + target_node, size_to_migrate, total_size); + return -1; + } + + size_t num_pages_to_migrate = size_to_migrate / ggml_backend_page_size; + if (size_to_migrate % ggml_backend_page_size != 0) { + GGML_LOG_WARN("Warning: Size to migrate to node %ld is not a " + "multiple of page size, total: %ld size_to_migrate: " + "%ld, ggml_backend_page_size: %ld.\n", + target_node, total_size, size_to_migrate, + ggml_backend_page_size); + return -1; + } + + if (num_pages_to_migrate == 0) { + GGML_LOG_WARN("Warning: No pages to migrate to node %d.\n", + target_node); + continue; + } + + void *migrate_start_addr = (char *)addr + (i)*size_to_migrate; + + int *status = (int *)malloc(num_pages_to_migrate * sizeof(int)); + if (!status) { + GGML_LOG_ERROR("malloc for status failed"); + return -1; + } + memset(status, 0, num_pages_to_migrate * sizeof(int)); + + int *nodes = (int *)malloc(num_pages_to_migrate * sizeof(int)); + if (!nodes) { + GGML_LOG_ERROR("malloc for nodes failed"); + return -1; + } + memset(nodes, 0, num_pages_to_migrate * sizeof(int)); + + void **addr_to_migrate = + (void **)malloc(num_pages_to_migrate * sizeof(void *)); + for (size_t j = 0; j < num_pages_to_migrate; j++) { + status[j] = 0; + nodes[j] = i; + addr_to_migrate[j] = (void *)((char *)migrate_start_addr + + j * ggml_backend_page_size); + } + + // check if pages are migrated + int ret = move_pages(0, num_pages_to_migrate, addr_to_migrate, NULL, + status, MPOL_MF_MOVE); + if (ret < 0) { + GGML_LOG_ERROR("check pages failed"); + free(status); + free(nodes); + return -1; + } + + for (size_t j = 0; j < num_pages_to_migrate; ++j) { + if (status[j] != target_node) { + GGML_LOG_WARN("Warning: Page %zu migration status to node %d: " + "%d, ret: %d, addr: %p\n", + j, target_node, status[j], ret, + addr_to_migrate[j]); + if (status[j] == -ENODEV) { + GGML_LOG_ERROR( + " - Error: No such device (NUMA node problem)\n"); + } else if (status[j] == -EPERM) { + GGML_LOG_ERROR( + " - Error: Operation not permitted (permissions)\n"); + } else if (status[j] == -ENOENT) { + GGML_LOG_ERROR(" - Error: ENOENT\n"); + } else if (status[j] == -EFAULT) { + GGML_LOG_ERROR(" - Error: Bad address\n"); + } else if (status[j] == -EINVAL) { + GGML_LOG_ERROR(" - Error: Invalid argument\n"); + } else if (status[j] == -ENOMEM) { + GGML_LOG_ERROR(" - Error: Out of memory\n"); + } else if (status[j] == -EACCES) { + GGML_LOG_ERROR(" - Error: access\n"); + } else if (status[j] == -ESRCH) { + GGML_LOG_ERROR(" - Error: access\n"); + } else { + GGML_LOG_ERROR(" - Error: Unknown status code at j: %ld: " + "%d, total_size: %ld\n", + j, status[j], total_size); + } + + exit(0); + return -1; + } + } + + free(status); + free(nodes); + free(addr_to_migrate); + + offset += size_to_migrate; + } + + GGML_LOG_INFO( + "page migration check passed at %p, size: %ld, num nodes: %d\n", addr, + total_size, num_nodes); + return 0; +} +#endif + +// Function to migrate pages to multiple NUMA nodes. +static int migrate_pages_multiple_nodes(void *addr, size_t total_size) { + + if (total_size % ggml_backend_page_size != 0) { + GGML_LOG_WARN("Warning: Total size is not a multiple of page size. 
" + "Some memory may not be migrated.\n"); + return -1; + } + + size_t offset = 0; // Offset in bytes from the start of the allocated memory + int num_nodes = GGML_NUMA_MIGRATE_NODES; + + for (int i = 0; i < num_nodes; ++i) { + int target_node = i; + size_t size_to_migrate = total_size / num_nodes; + + if (size_to_migrate > total_size - offset) { + GGML_LOG_ERROR( + "Error: Size to migrate to node %d exceeds remaining memory, " + "size_to_migrate: %ld, total: %ld\n", + target_node, size_to_migrate, total_size); + return -1; + } + + size_t num_pages_to_migrate = size_to_migrate / ggml_backend_page_size; + if (size_to_migrate % ggml_backend_page_size != 0) { +#ifdef GGML_USE_NUMA_MIGRATE_DEBUG + GGML_LOG_WARN("Warning: Size to migrate to node %ld is not a " + "multiple of page size, total: %ld size_to_migrate: " + "%ld, ggml_backend_page_size: %ld.\n", + target_node, total_size, size_to_migrate, + ggml_backend_page_size); +#endif + return -1; + } + + if (num_pages_to_migrate == 0) { + GGML_LOG_WARN("Warning: No pages to migrate to node %d.\n", + target_node); + continue; + } + + void *migrate_start_addr = (char *)addr + (i)*size_to_migrate; + + int *status = (int *)malloc(num_pages_to_migrate * sizeof(int)); + if (!status) { + GGML_LOG_ERROR("malloc for status failed"); + return -1; + } + memset(status, 0, num_pages_to_migrate * sizeof(int)); + + int *nodes = (int *)malloc(num_pages_to_migrate * sizeof(int)); + if (!nodes) { + GGML_LOG_ERROR("malloc for nodes failed"); + return -1; + } + memset(nodes, 0, num_pages_to_migrate * sizeof(int)); + + void **addr_to_migrate = + (void **)malloc(num_pages_to_migrate * sizeof(void *)); + for (size_t j = 0; j < num_pages_to_migrate; j++) { + status[j] = 0; + nodes[j] = i; + addr_to_migrate[j] = (void *)((char *)migrate_start_addr + + j * ggml_backend_page_size); + } + + int ret = move_pages(0, num_pages_to_migrate, addr_to_migrate, nodes, + status, MPOL_MF_MOVE); + if (ret < 0) { + GGML_LOG_ERROR("move_pages failed"); + free(status); + free(nodes); + return -1; + } + + free(status); + free(nodes); + free(addr_to_migrate); + + offset += size_to_migrate; + } + + return 0; +} + +static void migrate_pages_with_cache(void *addr, size_t size, + bool force_memset) { + if (size >= GGML_NUMA_MIGRATE_NODES * ggml_backend_page_size) { + numa_migrate_mapping_cache current_addr(addr, size); + std::lock_guard lock(ggml_mapping_mutex); + auto it = ggml_mapping_cache.find(current_addr); + if (it == ggml_mapping_cache.end()) { + GGML_ASSERT(((uint64_t)(addr) & (ggml_backend_page_size - 1)) == 0); + int num_pages = + size / ggml_backend_page_size / GGML_NUMA_MIGRATE_NODES; + if (num_pages && ((size % ggml_backend_page_size) == 0)) { + if (force_memset) { + memset(addr, 0, size); // force to allocate memory + } + if (migrate_pages_multiple_nodes(addr, size) != 0) { + GGML_LOG_DEBUG("Migration to multiple nodes failed, addr: " + "%p, size: %ld\n", + addr, size); + } else { +#ifdef GGML_USE_NUMA_MIGRATE_DEBUG + check_numa_pages_migration(addr, size); +#endif + } + ggml_mapping_cache.insert(current_addr); + } + } + } +} +#endif + enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) { GGML_ASSERT(tensor->buffer == NULL); GGML_ASSERT(tensor->data == NULL); @@ -1676,6 +1955,11 @@ enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct tensor->buffer = buffer; tensor->data = addr; + +#ifdef GGML_USE_NUMA_MIGRATE + size_t size = ggml_backend_buffer_get_alloc_size(buffer, tensor); + 
migrate_pages_with_cache(tensor->data, size, true); +#endif return ggml_backend_buffer_init_tensor(buffer, tensor); } @@ -1869,16 +2153,28 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { uintptr_t data = (uintptr_t)buffer->context; +#ifdef GGML_USE_NUMA_MIGRATE + // align the buffer + if (data % ggml_backend_page_size != 0) { + data = GGML_PAD(data, ggml_backend_page_size); + } +#else // align the buffer if (data % TENSOR_ALIGNMENT != 0) { data = GGML_PAD(data, TENSOR_ALIGNMENT); } +#endif return (void *)data; } static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { +#ifdef GGML_USE_NUMA_MIGRATE + numa_free(buffer->context, buffer->size); +#else ggml_aligned_free(buffer->context, buffer->size); +#endif + } static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { @@ -1947,8 +2243,22 @@ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_ty GGML_UNUSED(buft); } +#ifdef GGML_USE_NUMA_MIGRATE +size_t ggml_backend_get_page_size(void) { + if (ggml_backend_page_size == 0) { + ggml_backend_page_size = sysconf(_SC_PAGE_SIZE); + } + return ggml_backend_page_size; +} +#endif + static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +#ifdef GGML_USE_NUMA_MIGRATE + ggml_backend_get_page_size(); + void * data = numa_alloc_onnode(size, 0); +#else void * data = ggml_aligned_malloc(size); +#endif if (data == NULL) { GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size); @@ -1959,7 +2269,11 @@ static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_back } static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +#ifdef GGML_USE_NUMA_MIGRATE + return ggml_backend_get_page_size(); +#else return TENSOR_ALIGNMENT; +#endif GGML_UNUSED(buft); } diff --git a/ggml/src/ggml-cpu/amx/amx.cpp b/ggml/src/ggml-cpu/amx/amx.cpp index 258857b00754a..b57ec45025946 100644 --- a/ggml/src/ggml-cpu/amx/amx.cpp +++ b/ggml/src/ggml-cpu/amx/amx.cpp @@ -133,7 +133,11 @@ static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_back } static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +#ifdef GGML_USE_NUMA_MIGRATE + return ggml_backend_get_page_size(); +#else return TENSOR_ALIGNMENT; +#endif GGML_UNUSED(buft); } diff --git a/ggml/src/ggml-cpu/ggml-cpu-impl.h b/ggml/src/ggml-cpu/ggml-cpu-impl.h index ae68cd006336d..ee4353771565b 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-impl.h +++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h @@ -22,6 +22,9 @@ struct ggml_compute_params { // work buffer for all threads size_t wsize; void * wdata; +#ifdef GGML_USE_NUMA_MIGRATE + void * wdata_numa[GGML_NUMA_MIGRATE_NODES]; +#endif struct ggml_threadpool * threadpool; }; @@ -502,6 +505,17 @@ static __m256 __lasx_xvreplfr2vr_s(const float val) { // TODO: move to ggml-threading void ggml_barrier(struct ggml_threadpool * tp); +#ifdef GGML_USE_NUMA_MIGRATE +enum ggml_barrier_node_index { + GGML_BARRIER_NODE_PING = 0, + GGML_BARRIER_NODE_PONG = 1, + GGML_BARRIER_NODE_LAST = 2, + GGML_BARRIER_NODE_CNTS = 3 +}; +void ggml_barrier_numa_aware(struct ggml_threadpool * tp, int ith, int node_n); +int ggml_cores_per_numa(void); +int ggml_get_node_from_cpu(int cpu); +#endif void ggml_threadpool_chunk_set(struct 
ggml_threadpool * tp, int value); int ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value); diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 2c12e493bc9b0..204aea79de94c 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -42,6 +42,10 @@ #include #endif +#ifdef GGML_USE_NUMA_MIGRATE +#include +#endif + #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8) #undef GGML_USE_LLAMAFILE #endif @@ -444,6 +448,13 @@ struct ggml_threadpool { atomic_int n_graph; // incremented when there is work to be done (i.e each graph) atomic_int GGML_CACHE_ALIGN n_barrier; atomic_int GGML_CACHE_ALIGN n_barrier_passed; + +#ifdef GGML_USE_NUMA_MIGRATE + atomic_int GGML_CACHE_ALIGN *n_barrier_node[GGML_NUMA_MIGRATE_NODES]; + atomic_int GGML_CACHE_ALIGN *n_barrier_passed_node[GGML_NUMA_MIGRATE_NODES]; + atomic_int GGML_CACHE_ALIGN *n_barrier_passed_last[GGML_BARRIER_NODE_CNTS]; +#endif + atomic_int GGML_CACHE_ALIGN current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads. // these are atomic as an annotation for thread-sanitizer @@ -509,6 +520,10 @@ struct ggml_numa_nodes { #else uint32_t cpuset; // no NUMA support outside of Linux at this time. Use a portable datatype #endif + +#ifdef GGML_USE_NUMA_MIGRATE + bool even_distributed; +#endif }; // @@ -567,6 +582,67 @@ int ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value) { return atomic_fetch_add_explicit(&tp->current_chunk, value, memory_order_relaxed); } +#ifdef GGML_USE_NUMA_MIGRATE +int ggml_get_node_from_cpu(int cpu) { + return cpu / g_state.numa.nodes[0].n_cpus; +} + +int ggml_cores_per_numa(void) { + return g_state.numa.nodes[0].n_cpus; +} + +void ggml_barrier_numa_aware(struct ggml_threadpool * tp, int ith, int node_n) { + if ((g_state.numa.numa_strategy != GGML_NUMA_STRATEGY_MIGRATE) || !g_state.numa.even_distributed) { + ggml_barrier(tp); + return; + } + + int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed); + if (n_threads == 1) { + return; + } + + int cores_per_numa = ggml_cores_per_numa(); + int numa_nodes = n_threads / cores_per_numa; + int remaining_cores = n_threads % cores_per_numa; + if ((numa_nodes != GGML_NUMA_MIGRATE_NODES) || remaining_cores) { + ggml_barrier(tp); + return; + } + + int node = ggml_get_node_from_cpu(ith); + + int n_passed = atomic_load_explicit(tp->n_barrier_passed_node[node], memory_order_relaxed); + + // enter barrier (full seq-cst fence) + int n_barrier = atomic_fetch_add_explicit(tp->n_barrier_node[node], 1, memory_order_seq_cst); + + if (n_barrier == (cores_per_numa - 1)) { + // last thread of current numa node + atomic_store_explicit(tp->n_barrier_node[node], 0, memory_order_seq_cst); + + int n_passed_node = atomic_fetch_add_explicit(tp->n_barrier_passed_last[node_n], 1, memory_order_seq_cst); + + if (n_passed_node == (numa_nodes - 1)) { // last numa node cpu + atomic_fetch_add_explicit(tp->n_barrier_passed_node[node], 1, memory_order_seq_cst); + atomic_store_explicit(tp->n_barrier_passed_last[node_n], 0, memory_order_seq_cst); + } else { + while (atomic_load_explicit(tp->n_barrier_passed_last[node_n], memory_order_relaxed)) { + ggml_thread_cpu_relax(); + } + atomic_fetch_add_explicit(tp->n_barrier_passed_node[node], 1, memory_order_seq_cst); + } + + return; + } + + // wait for other threads + while (atomic_load_explicit(tp->n_barrier_passed_node[node], memory_order_seq_cst) == n_passed) { + ggml_thread_cpu_relax(); + } +} +#endif + #if defined(__gnu_linux__) 
static cpu_set_t ggml_get_numa_affinity(void) {
     cpu_set_t cpuset;
@@ -643,6 +719,10 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
         struct ggml_numa_node * node = &g_state.numa.nodes[n];
         GGML_PRINT_DEBUG("CPUs on node %u:", n);
         node->n_cpus = 0;
+
+#ifdef GGML_USE_NUMA_MIGRATE
+        g_state.numa.even_distributed = true;
+#endif
         for (uint32_t c = 0; c < g_state.numa.total_cpus; ++c) {
             rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c);
             GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
@@ -652,6 +732,11 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
             }
         }
         GGML_PRINT_DEBUG("\n");
+#ifdef GGML_USE_NUMA_MIGRATE
+        if ((n != 0) && (g_state.numa.nodes[n].n_cpus != g_state.numa.nodes[0].n_cpus)) {
+            g_state.numa.even_distributed = false;
+        }
+#endif
     }
 
     if (ggml_is_numa()) {
@@ -2082,6 +2167,30 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
 
 // Android's libc implementation "bionic" does not support setting affinity
 #if defined(__gnu_linux__)
+
+#ifdef GGML_USE_NUMA_MIGRATE
+static void set_numa_migrate_affinity(int core_no) {
+    // Check that the requested core number is valid
+    if (core_no < 0 || core_no >= (int)g_state.numa.total_cpus) {
+        printf("%s: warning: core_no %d is outside [0, %u), falling back to default affinity.\n", __func__, core_no, g_state.numa.total_cpus);
+        return;
+    }
+
+    cpu_set_t cpuset;
+    CPU_ZERO(&cpuset);         // Initialize the CPU set
+
+    CPU_SET(core_no, &cpuset); // Set the specified core
+
+    // Set the thread's CPU affinity
+    int result = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
+    if (result != 0) {
+        fprintf(stderr, "%s: failed to set affinity for core %d: %s\n",
+                __func__, core_no, strerror(result));
+        exit(1);
+    }
+}
+#endif
+
 static void set_numa_thread_affinity(int thread_n) {
     if (!ggml_is_numa()) {
         return;
@@ -2107,6 +2216,11 @@ static void set_numa_thread_affinity(int thread_n) {
                 fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",strerror(rv));
             }
             return;
+#ifdef GGML_USE_NUMA_MIGRATE
+        case GGML_NUMA_STRATEGY_MIGRATE:
+            set_numa_migrate_affinity(thread_n);
+            return;
+#endif
         default:
             return;
     }
@@ -2855,6 +2969,11 @@ struct ggml_cplan ggml_graph_plan(
     cplan.n_threads = MIN(max_tasks, n_threads);
     cplan.work_size = work_size;
     cplan.work_data = NULL;
+#ifdef GGML_USE_NUMA_MIGRATE
+    for (int i = 0; i < GGML_NUMA_MIGRATE_NODES; i++) {
+        cplan.work_data_numa[i] = NULL;
+    }
+#endif
 
     return cplan;
 }
@@ -2873,12 +2992,20 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         /*.nth   =*/ atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed),
         /*.wsize =*/ cplan->work_size,
         /*.wdata =*/ cplan->work_data,
+#ifdef GGML_USE_NUMA_MIGRATE
+        /*.wdata_numa =*/ {NULL},
+#endif
         /*.threadpool=*/ tp,
     };
 
+#ifdef GGML_USE_NUMA_MIGRATE
+    for (int i = 0; i < GGML_NUMA_MIGRATE_NODES; i++) {
+        params.wdata_numa[i] = cplan->work_data_numa[ggml_get_node_from_cpu(state->ith)];
+    }
+#endif
+
     for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
         struct ggml_tensor * node = cgraph->nodes[node_n];
-
         ggml_compute_forward(&params, node);
 
         if (state->ith == 0 && cplan->abort_callback &&
@@ -2888,11 +3015,19 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         }
 
         if (node_n + 1 < cgraph->n_nodes) {
-            ggml_barrier(state->threadpool);
+#ifdef GGML_USE_NUMA_MIGRATE
+            ggml_barrier_numa_aware(state->threadpool, state->ith, node_n % GGML_BARRIER_NODE_LAST);
+#else
+            ggml_barrier(tp);
+#endif
         }
     }
 
-    ggml_barrier(state->threadpool);
+#ifdef GGML_USE_NUMA_MIGRATE
+ ggml_barrier_numa_aware(state->threadpool, state->ith, GGML_BARRIER_NODE_LAST); +#else + ggml_barrier(tp); +#endif return 0; } @@ -3057,6 +3192,21 @@ static struct ggml_threadpool * ggml_threadpool_new_impl( threadpool->n_graph = 0; threadpool->n_barrier = 0; threadpool->n_barrier_passed = 0; + +#ifdef GGML_USE_NUMA_MIGRATE + for (int node = 0; node < GGML_NUMA_MIGRATE_NODES; node++) { + threadpool->n_barrier_node[node] = (atomic_int *)numa_alloc_onnode(sizeof(atomic_int), node); + *threadpool->n_barrier_node[node] = 0; + threadpool->n_barrier_passed_node[node] = (atomic_int *)numa_alloc_onnode(sizeof(atomic_int), node); + *threadpool->n_barrier_passed_node[node] = 0; + } + + for (int i = 0; i < GGML_BARRIER_NODE_CNTS; i++) { + threadpool->n_barrier_passed_last[i] = (atomic_int *)malloc(sizeof(atomic_int)); + *threadpool->n_barrier_passed_last[i] = 0; + } +#endif + threadpool->current_chunk = 0; threadpool->stop = false; threadpool->pause = tpp->paused; diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index 735ef3f015c13..f87aaea514525 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -9,6 +9,9 @@ #include #include #include +#ifdef GGML_USE_NUMA_MIGRATE +#include +#endif #ifdef GGML_USE_CPU_HBM # include "hbm.h" @@ -87,6 +90,9 @@ struct ggml_backend_cpu_context { ggml_threadpool_t threadpool; uint8_t * work_data; +#ifdef GGML_USE_NUMA_MIGRATE + uint8_t * work_data_numa[GGML_NUMA_MIGRATE_NODES]; +#endif size_t work_size; ggml_abort_callback abort_callback; @@ -102,6 +108,11 @@ static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) { static void ggml_backend_cpu_free(ggml_backend_t backend) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; delete[] cpu_ctx->work_data; +#ifdef GGML_USE_NUMA_MIGRATE + for (int i = 0; i < GGML_NUMA_MIGRATE_NODES; i++) { + numa_free(cpu_ctx->work_data_numa[i], cpu_ctx->work_size); + } +#endif delete cpu_ctx; delete backend; } @@ -162,9 +173,24 @@ static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, s cpu_ctx->work_size = 0; return GGML_STATUS_ALLOC_FAILED; } + +#ifdef GGML_USE_NUMA_MIGRATE + for (int i = 0; i < GGML_NUMA_MIGRATE_NODES; i++) { + cpu_ctx->work_data_numa[i] = (uint8_t *)numa_alloc_onnode(cplan.work_size, i); + if (cpu_ctx->work_data_numa[i] == NULL) { + cpu_ctx->work_size = 0; + return GGML_STATUS_ALLOC_FAILED; + } + } +#endif cpu_ctx->work_size = cplan.work_size; } cplan.work_data = (uint8_t *)cpu_ctx->work_data; +#ifdef GGML_USE_NUMA_MIGRATE + for (int i = 0; i < GGML_NUMA_MIGRATE_NODES; i++) { + cplan.work_data_numa[i] = (uint8_t *)(cpu_ctx->work_data_numa[i]); + } +#endif cplan.abort_callback = cpu_ctx->abort_callback; cplan.abort_callback_data = cpu_ctx->abort_callback_data; @@ -205,6 +231,11 @@ ggml_backend_t ggml_backend_cpu_init(void) { ctx->n_threads = GGML_DEFAULT_N_THREADS; ctx->threadpool = NULL; ctx->work_data = NULL; +#ifdef GGML_USE_NUMA_MIGRATE + for (int i = 0; i < GGML_NUMA_MIGRATE_NODES; i++) { + ctx->work_data_numa[i] = NULL; + } +#endif ctx->work_size = 0; ctx->abort_callback = NULL; ctx->abort_callback_data = NULL; diff --git a/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp b/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp index fafe45e6c5c51..dea79898bbac4 100644 --- a/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +++ b/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp @@ -413,7 +413,11 @@ static ggml_backend_buffer_t ggml_backend_cpu_kleidiai_buffer_type_alloc_buffer( } static size_t 
ggml_backend_cpu_kleidiai_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+#ifdef GGML_USE_NUMA_MIGRATE
+    return ggml_backend_get_page_size();
+#else
     return TENSOR_ALIGNMENT;
+#endif
 
     GGML_UNUSED(buft);
 }
diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp
index 604ccee907843..520ce191a3fd2 100644
--- a/ggml/src/ggml-cpu/repack.cpp
+++ b/ggml/src/ggml-cpu/repack.cpp
@@ -17,6 +17,9 @@
 #include
 #include    // for qsort
 #include    // for GGML_ASSERT
+#ifdef GGML_USE_NUMA_MIGRATE
+#include <numa.h>
+#endif
 
 #include "repack.h"
@@ -1232,8 +1235,12 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
     //    GGML_ASSERT(ggml_n_dims(op->src[0]) == 2);
     //    GGML_ASSERT(ggml_n_dims(op->src[1]) == 2);
-
+#ifdef GGML_USE_NUMA_MIGRATE
+    int node_id = ggml_get_node_from_cpu(ith);
+    char * wdata = static_cast<char *>(params->wdata_numa[node_id]);
+#else
     char * wdata = static_cast<char *>(params->wdata);
+#endif
     const size_t nbw1 = ggml_row_size(PARAM_TYPE, ne10);
 
     assert(params->wsize >= nbw1 * ne11);
@@ -1241,18 +1248,31 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
     const ggml_from_float_t from_float = ggml_get_type_traits_cpu(PARAM_TYPE)->from_float;
 
     int64_t i11_processed = 0;
-    for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
+#ifdef GGML_USE_NUMA_MIGRATE
+    int round_cnts = ggml_cores_per_numa();
+    int start_id = ith - round_cnts * node_id;
+    if (round_cnts == 0) {
+        round_cnts = nth;
+        start_id = ith;
+    }
+#else
+    int round_cnts = nth;
+    int start_id = ith;
+#endif
+    for (int64_t i11 = start_id * 4; i11 < ne11 - ne11 % 4; i11 += round_cnts * 4) {
         ggml_quantize_mat_t<INTER_SIZE, PARAM_TYPE>((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), 4, ne10);
     }
 
     i11_processed = ne11 - ne11 % 4;
-    for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
+    for (int64_t i11 = i11_processed + start_id; i11 < ne11; i11 += round_cnts) {
         from_float((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), ne10);
     }
 
+#ifdef GGML_USE_NUMA_MIGRATE
+    ggml_barrier_numa_aware(params->threadpool, ith, GGML_BARRIER_NODE_LAST);
+#else
     ggml_barrier(params->threadpool);
-
-    const void * src1_wdata = params->wdata;
+#endif
     const size_t src1_col_stride = ggml_row_size(PARAM_TYPE, ne10);
     int64_t src0_start = (ith * ne01) / nth;
     int64_t src0_end = ((ith + 1) * ne01) / nth;
@@ -1267,13 +1287,13 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
         gemm<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00, (float *) ((char *) dst->data) + src0_start, ne01,
                 (const char *) src0->data + src0_start * nb01,
-                (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start);
+                (const char *) wdata, ne11 - ne11 % 4, src0_end - src0_start);
     }
     for (int iter = ne11 - ne11 % 4; iter < ne11; iter++) {
         gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00, (float *) ((char *) dst->data + (iter * nb1)) + src0_start, ne01,
                  (const char *) src0->data + src0_start * nb01,
-                 (const char *) src1_wdata + (src1_col_stride * iter), 1,
+                 (const char *) wdata + (src1_col_stride * iter), 1,
                  src0_end - src0_start);
     }
 }
@@ -1498,7 +1518,11 @@ static ggml_backend_buffer_t ggml_backend_cpu_repack_buffer_type_alloc_buffer(gg
 }
 
 static size_t ggml_backend_cpu_repack_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+#ifdef GGML_USE_NUMA_MIGRATE
+    return ggml_backend_get_page_size();
+#else
     return TENSOR_ALIGNMENT;
+#endif
 
     GGML_UNUSED(buft);
 }
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index bd9e6da8832b7..6ebda9250087d 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -711,6 +711,10 @@ llama_model_loader::llama_model_loader(
         use_mmap = false;
     }
 
+#ifdef GGML_USE_NUMA_MIGRATE
+    use_mmap = false;
+#endif
+
     this->use_mmap = use_mmap;
     this->check_tensors = check_tensors;
 }
diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp
index e59d61f195675..7891ddb2c3a78 100644
--- a/tools/llama-bench/llama-bench.cpp
+++ b/tools/llama-bench/llama-bench.cpp
@@ -312,7 +312,12 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("\n");
     printf("options:\n");
     printf("  -h, --help\n");
+#ifdef GGML_USE_NUMA_MIGRATE
+    printf("  --numa <distribute|isolate|numactl|migrate>\n");
+    printf("                                            numa mode (default: disabled)\n");
+#else
     printf("  --numa <distribute|isolate|numactl>       numa mode (default: disabled)\n");
+#endif
     printf("  -r, --repetitions <n>                     number of times to repeat each test (default: %d)\n",
            cmd_params_defaults.reps);
     printf("  --prio <-1|0|1|2|3>                       process/thread priority (default: %d)\n",
@@ -628,6 +633,10 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 params.numa = GGML_NUMA_STRATEGY_ISOLATE;
             } else if (value == "numactl") {
                 params.numa = GGML_NUMA_STRATEGY_NUMACTL;
+#ifdef GGML_USE_NUMA_MIGRATE
+            } else if (value == "migrate") {
+                params.numa = GGML_NUMA_STRATEGY_MIGRATE;
+#endif
             } else {
                 invalid_param = true;
                 break;
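
Illustration (not part of the patch): the core mechanism this change adds to ggml-backend.cpp — a page-aligned allocation via numa_alloc_onnode() followed by move_pages() spreading whole pages across nodes — can be exercised standalone. The sketch below is a minimal example of that pattern only; the file name, buffer size, and the hard-coded two-node split are illustrative assumptions, and it requires libnuma headers and at least two NUMA nodes.

// numa_page_spread.c -- minimal sketch of the move_pages() pattern used by
// migrate_pages_multiple_nodes() in this patch; illustrative only.
#include <numa.h>
#include <numaif.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void) {
    if (numa_available() < 0) {
        fprintf(stderr, "no NUMA support on this system\n");
        return 1;
    }

    const size_t page  = (size_t) sysconf(_SC_PAGE_SIZE);
    const int    nodes = 2;                 // matches the GGML_NUMA_MIGRATE_NODES default
    const size_t size  = 64 * page * nodes; // multiple of page * nodes, as the patch requires

    // page-aligned allocation, first-touch on node 0 (analogous to the CPU buffer allocation path)
    unsigned char * buf = numa_alloc_onnode(size, 0);
    if (!buf) { perror("numa_alloc_onnode"); return 1; }
    memset(buf, 0, size); // force the pages to actually be allocated

    size_t pages_per_node = size / page / nodes;
    void ** pages  = malloc(pages_per_node * sizeof(*pages));
    int  *  target = malloc(pages_per_node * sizeof(*target));
    int  *  status = malloc(pages_per_node * sizeof(*status));
    if (!pages || !target || !status) { return 1; }

    // chunk 0 stays on node 0; each later chunk is migrated page by page to its node
    for (int n = 1; n < nodes; n++) {
        for (size_t i = 0; i < pages_per_node; i++) {
            pages[i]  = buf + ((size_t) n * pages_per_node + i) * page;
            target[i] = n;
        }
        if (move_pages(0 /* self */, pages_per_node, pages, target, status, MPOL_MF_MOVE) < 0) {
            perror("move_pages");
        }
    }

    printf("spread %zu pages across %d nodes\n", size / page, nodes);

    free(pages); free(target); free(status);
    numa_free(buf, size);
    return 0;
}

Build the sketch with: cc -O2 numa_page_spread.c -o numa_page_spread -lnuma. In llama.cpp itself the equivalent path is enabled at build time with -DGGML_NUMA_MIGRATE=ON (optionally -DGGML_NUMA_MIGRATE_NODES=<n>) and selected at run time with --numa migrate.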