
ggml: introduce GGML_NUMA_MIGRATE to optimize cross NUMA op computation #14232


Open
wants to merge 1 commit into base: master
6 changes: 6 additions & 0 deletions common/arg.cpp
@@ -2280,12 +2280,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"- distribute: spread execution evenly over all nodes\n"
"- isolate: only spawn threads on CPUs on the node that execution started on\n"
"- numactl: use the CPU map provided by numactl\n"
#ifdef GGML_USE_NUMA_MIGRATE
"- migrate: for affinity threads with page migration across NUMA nodes\n"
#endif
"if run without this previously, it is recommended to drop the system page cache before using this\n"
"see https://github.com/ggml-org/llama.cpp/issues/1437",
[](common_params & params, const std::string & value) {
/**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
#ifdef GGML_USE_NUMA_MIGRATE
else if (value == "migrate") { params.numa = GGML_NUMA_STRATEGY_MIGRATE; }
#endif
else { throw std::invalid_argument("invalid value"); }
}
).set_env("LLAMA_ARG_NUMA"));
5 changes: 5 additions & 0 deletions ggml/CMakeLists.txt
@@ -152,6 +152,11 @@ set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
"ggml: BLAS library vendor")
option(GGML_LLAMAFILE "ggml: use LLAMAFILE" ${GGML_LLAMAFILE_DEFAULT})

option(GGML_NUMA_MIGRATE "ggml: use NUMA_MIGRATE" OFF)
Member: I think this should become GGML_CPU_NUMA. AFAIU, these changes require linking libnuma, is that correct?

Author: Yeah, I'm OK with the name change since it's targeted for CPU :)

set(GGML_NUMA_MIGRATE_NODES "2" CACHE STRING
"ggml: the number of NUMA nodes during page migration")
option(GGML_NUMA_MIGRATE_DEBUG "ggml: enable debugging of NUMA_MIGRATE" OFF)
Member: Avoid this by checking an environment variable instead. See GGML_SCHED_DEBUG for an example.
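
A minimal sketch of the suggested approach, assuming the debug output would instead be gated at runtime; the helper name ggml_numa_migrate_debug and its call site are illustrative, not part of this PR:

// Sketch: gate NUMA-migrate debug logging on an environment variable,
// similar in spirit to GGML_SCHED_DEBUG, instead of a CMake option.
#include <stdlib.h>

static int ggml_numa_migrate_debug(void) {
    static int debug = -1;                               // -1 = not read yet
    if (debug < 0) {
        const char * env = getenv("GGML_NUMA_MIGRATE_DEBUG");
        debug = env ? atoi(env) : 0;                     // read once and cache
    }
    return debug;
}

// at a call site:
//     if (ggml_numa_migrate_debug()) { GGML_LOG_DEBUG("..."); }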


option(GGML_CUDA "ggml: use CUDA" OFF)
option(GGML_MUSA "ggml: use MUSA" OFF)
option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
3 changes: 3 additions & 0 deletions ggml/include/ggml-backend.h
@@ -348,6 +348,9 @@ extern "C" {
// CPU buffer types are always available
GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
#ifdef GGML_USE_NUMA_MIGRATE
GGML_API size_t ggml_backend_get_page_size(void);
#endif
Comment on lines +351 to +353
Member: This seems like it should be moved to ggml-cpu-impl.h.

Author: OK

Author: I tried to move the function definition from ggml-backend.cpp to ggml-cpu.c, but there were lots of link issues, since ggml-cpu.c lives in the ggml-cpu subdirectory and can't be referenced by files in the parent directories under ggml:
./ggml/src/{files needs to reference the function}
./ggml/src/ggml-cpu

[ 18%] Linking CXX shared library ../bin/libllama.so
/usr/bin/ld: ../../bin/libggml-base.so: undefined reference to `ggml_cpu_get_page_size'

moved here:

--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -583,6 +583,14 @@ int ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value) {
 }
 
 #ifdef GGML_USE_NUMA_MIGRATE
+int ggml_cpu_page_size = 0;
+size_t ggml_cpu_get_page_size(void) {
+    if (ggml_cpu_page_size == 0) {
+        ggml_cpu_page_size = sysconf(_SC_PAGE_SIZE);
+    }
+    return ggml_cpu_page_size;
+}
+


#ifdef __cplusplus
}
6 changes: 6 additions & 0 deletions ggml/include/ggml-cpu.h
@@ -12,6 +12,9 @@ extern "C" {
struct ggml_cplan {
size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
#ifdef GGML_USE_NUMA_MIGRATE
uint8_t * work_data_numa[GGML_NUMA_MIGRATE_NODES];
#endif
Comment on lines +15 to +17
Member: We have to find a way to avoid so many #ifdefs throughout the codebase. Not sure exactly how, but in this specific case we could maybe wrap the work_data logic into a type and let the underlying implementation decide how many buffers to create and whether they should be NUMA-aware, etc.

Author: Yeah, it's a valid concern, I'll need to find a better way.
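
One possible shape for such a wrapper, as a sketch only; the type ggml_work_buffers and the accessor ggml_work_buffers_get are illustrative names, not part of this PR:

// Sketch: hide the per-NUMA-node work buffers behind a single type, so the
// #ifdef lives in one place and callers always go through the accessor.
#include <stddef.h>
#include <stdint.h>

struct ggml_work_buffers {
    uint8_t * data;                                   // default work buffer
#ifdef GGML_USE_NUMA_MIGRATE
    uint8_t * data_numa[GGML_NUMA_MIGRATE_NODES];     // one buffer per NUMA node
#endif
};

static inline uint8_t * ggml_work_buffers_get(struct ggml_work_buffers * wb, int node) {
#ifdef GGML_USE_NUMA_MIGRATE
    if (node >= 0 && node < GGML_NUMA_MIGRATE_NODES && wb->data_numa[node] != NULL) {
        return wb->data_numa[node];                   // NUMA-local buffer, if present
    }
#endif
    (void) node;                                      // unused when built without NUMA support
    return wb->data;
}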


int n_threads;
struct ggml_threadpool * threadpool;
@@ -28,6 +31,9 @@ extern "C" {
GGML_NUMA_STRATEGY_ISOLATE = 2,
GGML_NUMA_STRATEGY_NUMACTL = 3,
GGML_NUMA_STRATEGY_MIRROR = 4,
#ifdef GGML_USE_NUMA_MIGRATE
GGML_NUMA_STRATEGY_MIGRATE = 5,
#endif
GGML_NUMA_STRATEGY_COUNT
};

24 changes: 24 additions & 0 deletions ggml/src/CMakeLists.txt
@@ -381,3 +381,27 @@ if (BUILD_SHARED_LIBS)
target_compile_definitions(${target} PUBLIC GGML_SHARED)
endforeach()
endif()

if (GGML_NUMA_MIGRATE)
find_path(NUMA_ROOT_DIR
NAMES include/numa.h
PATHS ENV NUMA_ROOT
DOC "NUMA root directory")

find_library(NUMA_LIBRARY
NAMES numa
HINTS ${NUMA_ROOT_DIR}
DOC "NUMA library")

if (NOT NUMA_LIBRARY)
message(FATAL_ERROR "Could NOT find NUMA library.")
endif()

if (GGML_NUMA_MIGRATE_DEBUG)
target_compile_definitions(ggml-base PUBLIC GGML_USE_NUMA_MIGRATE GGML_NUMA_MIGRATE_NODES=${GGML_NUMA_MIGRATE_NODES} GGML_USE_NUMA_MIGRATE_DEBUG)
else()
target_compile_definitions(ggml-base PUBLIC GGML_USE_NUMA_MIGRATE GGML_NUMA_MIGRATE_NODES=${GGML_NUMA_MIGRATE_NODES})
endif()

target_link_libraries(ggml-base PRIVATE ${NUMA_LIBRARY})
endif()
16 changes: 16 additions & 0 deletions ggml/src/ggml-alloc.c
@@ -948,6 +948,22 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
ggml_backend_buffer_type_t buft, size_t size,
ggml_backend_buffer_t ** buffers, size_t * n_buffers) {

#ifdef GGML_USE_NUMA_MIGRATE
size_t num_of_tensors = 0;
for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
if (t->data == NULL) {
if (t->view_src == NULL) {
num_of_tensors++;
}
}
}
size_t ps = ggml_backend_get_page_size();
size_t original_size = size;
size += ps * num_of_tensors;
GGML_LOG_DEBUG("alloc buffer for NUMA page migration, num of tensors: %ld, size increased from %ld to %ld, increased %ld MiB\n",
num_of_tensors, original_size, size, (size - original_size) / 1024 / 1024);
#endif

ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
if (buffer == NULL) {
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
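
The extra page of headroom reserved per tensor in the hunk above is presumably there so that each tensor's data can start on its own page boundary, letting its pages be migrated to a specific NUMA node without disturbing neighboring tensors that might otherwise share a page. A minimal sketch of that kind of rounding (illustrative only, not the PR's code; assumes the page size is a power of two):

// Round an address up to the next page boundary so one tensor's pages do not
// overlap with the previous tensor's pages.
#include <stddef.h>
#include <stdint.h>

static uintptr_t align_up_to_page(uintptr_t addr, size_t page_size) {
    return (addr + page_size - 1) & ~(uintptr_t)(page_size - 1);
}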