
Commit 6adc3c3

slaren and ggerganov authored
llama : add thread safety test (#14035)
* llama : add thread safety test
* llamafile : remove global state
* llama : better LLAMA_SPLIT_MODE_NONE logic: when main_gpu < 0, GPU devices are not used

Co-authored-by: Georgi Gerganov <[email protected]>

1 parent 0dbcabd commit 6adc3c3

9 files changed: +192 additions, −18 deletions

.github/workflows/build.yml

Lines changed: 1 addition & 0 deletions

@@ -778,6 +778,7 @@ jobs:
           cmake -S . -B build ${{ matrix.defines }} `
             -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
           cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
+          cp $env:CURL_PATH/bin/libcurl-*.dll build/bin/Release

       - name: Add libopenblas.dll
         id: add_libopenblas_dll

ci/run.sh

Lines changed: 1 addition & 1 deletion

@@ -39,7 +39,7 @@ sd=`dirname $0`
 cd $sd/../
 SRC=`pwd`

-CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=OFF"
+CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON"

 if [ ! -z ${GG_BUILD_METAL} ]; then
     CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_USE_BF16=ON"

common/common.cpp

Lines changed: 12 additions & 4 deletions

@@ -767,6 +767,9 @@ bool fs_validate_filename(const std::string & filename) {
     return true;
 }

+#include <iostream>
+
+
 // returns true if successful, false otherwise
 bool fs_create_directory_with_parents(const std::string & path) {
 #ifdef _WIN32
@@ -784,9 +787,16 @@ bool fs_create_directory_with_parents(const std::string & path) {
     // process path from front to back, procedurally creating directories
     while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
         const std::wstring subpath = wpath.substr(0, pos_slash);
-        const wchar_t * test = subpath.c_str();

-        const bool success = CreateDirectoryW(test, NULL);
+        pos_slash += 1;
+
+        // skip the drive letter, in some systems it can return an access denied error
+        if (subpath.length() == 2 && subpath[1] == ':') {
+            continue;
+        }
+
+        const bool success = CreateDirectoryW(subpath.c_str(), NULL);
+
         if (!success) {
             const DWORD error = GetLastError();

@@ -800,8 +810,6 @@ bool fs_create_directory_with_parents(const std::string & path) {
                 return false;
             }
         }
-
-        pos_slash += 1;
     }

     return true;
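
Note (not part of the commit): the reworked loop advances pos_slash past each separator first and skips a bare drive prefix such as C:, because, as the added comment says, CreateDirectoryW can return an access-denied error for it on some systems. Below is a standalone sketch of the same prefix walk, using a plain std::string and a hypothetical example path instead of the wide-string Windows API:

#include <cstdio>
#include <string>

int main() {
    // hypothetical Windows-style cache path, used only to show which prefixes are visited
    const std::string path = "C:\\Users\\me\\.cache\\llama.cpp\\";

    size_t pos_slash = 1;
    while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
        const std::string subpath = path.substr(0, pos_slash);
        pos_slash += 1;

        // mirrors the new check: a two-character prefix ending in ':' is the drive letter
        if (subpath.length() == 2 && subpath[1] == ':') {
            std::printf("skip   %s\n", subpath.c_str()); // "C:" is never passed to CreateDirectoryW
            continue;
        }
        std::printf("create %s\n", subpath.c_str());     // would be handed to CreateDirectoryW
    }
    return 0;
}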

ggml/src/ggml-cpu/ggml-cpu-impl.h

Lines changed: 3 additions & 0 deletions

@@ -503,6 +503,9 @@ static __m256 __lasx_xvreplfr2vr_s(const float val) {
 // TODO: move to ggml-threading
 void ggml_barrier(struct ggml_threadpool * tp);

+void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value);
+int ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value);
+
 #ifdef __cplusplus
 }
 #endif

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 8 additions & 0 deletions

@@ -559,6 +559,14 @@ void ggml_barrier(struct ggml_threadpool * tp) {
 #endif
 }

+void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value) {
+    atomic_store_explicit(&tp->current_chunk, value, memory_order_relaxed);
+}
+
+int ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value) {
+    return atomic_fetch_add_explicit(&tp->current_chunk, value, memory_order_relaxed);
+}
+
 #if defined(__gnu_linux__)
 static cpu_set_t ggml_get_numa_affinity(void) {
     cpu_set_t cpuset;

ggml/src/ggml-cpu/llamafile/sgemm.cpp

Lines changed: 2 additions & 6 deletions

@@ -53,7 +53,6 @@
 #include "ggml-cpu-impl.h"
 #include "ggml-quants.h"

-#include <atomic>
 #include <array>
 #include <type_traits>

@@ -394,8 +393,6 @@ class tinyBLAS {

     template <int RM, int RN, int BM>
     NOINLINE void gemm(int64_t m, int64_t n, int64_t BN) {
-        static std::atomic<int64_t> current_chunk;
-
         GGML_ASSERT(m % (RM * BM) == 0);
         const int64_t ytiles = m / (RM * BM);
         const int64_t xtiles = (n + RN -1) / RN;
@@ -410,7 +407,7 @@ class tinyBLAS {
         if (params->ith == 0) {
             GGML_ASSERT( jj_BN * SIZE_BN + (NB_BN - jj_BN) * (SIZE_BN - 1) == xtiles);
             // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
-            std::atomic_store_explicit(&current_chunk, (int64_t)params->nth, std::memory_order_relaxed);
+            ggml_threadpool_chunk_set(params->threadpool, params->nth);
         }

         ggml_barrier(params->threadpool);
@@ -439,8 +436,7 @@ class tinyBLAS {
                GGML_ASSERT(jj == jj2);
            }

-           // next step.
-           job = std::atomic_fetch_add_explicit(&current_chunk, (int64_t)1, std::memory_order_relaxed);
+           job = ggml_threadpool_chunk_add(params->threadpool, 1);
        }

        ggml_barrier(params->threadpool);
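
Note (not part of the commit): moving the counter out of the function-local static std::atomic and into the threadpool removes global state shared across unrelated sgemm calls, which could run concurrently from different threadpools. The scheduling idea itself is unchanged: thread ith implicitly owns chunk ith, thread 0 seeds the shared counter with nth, and every further chunk is claimed with a relaxed fetch-add, whose return value is the previous counter value and therefore a unique chunk index. A self-contained sketch of that pattern (a plain std::atomic stands in for tp->current_chunk; the thread and chunk counts are made up):

#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

int main() {
    const int nth     = 4;   // number of worker threads (stand-in for params->nth)
    const int nchunks = 16;  // total work chunks to process

    // stand-in for tp->current_chunk; seeded like ggml_threadpool_chunk_set(tp, nth)
    std::atomic<int> current_chunk{nth};

    std::vector<std::thread> workers;
    for (int ith = 0; ith < nth; ++ith) {
        workers.emplace_back([&, ith]() {
            // first chunk is implicit: thread ith starts on chunk ith,
            // so the counter can start at nth with no coordination
            for (int job = ith; job < nchunks;
                 // like ggml_threadpool_chunk_add(tp, 1): returns the previous
                 // value, so each call claims exactly one unprocessed chunk
                 job = current_chunk.fetch_add(1, std::memory_order_relaxed)) {
                std::printf("thread %d processes chunk %d\n", ith, job);
            }
        });
    }

    for (auto & w : workers) {
        w.join();
    }
    return 0;
}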

src/llama.cpp

Lines changed: 11 additions & 7 deletions

@@ -198,14 +198,18 @@ static struct llama_model * llama_model_load_from_file_impl(

     // if using single GPU mode, remove all except the main GPU
     if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
-        if (params.main_gpu < 0 || params.main_gpu >= (int)model->devices.size()) {
-            LLAMA_LOG_ERROR("%s: invalid value for main_gpu: %d (available devices: %d)\n", __func__, params.main_gpu, (int)model->devices.size());
-            llama_model_free(model);
-            return nullptr;
+        if (params.main_gpu < 0) {
+            model->devices.clear();
+        } else {
+            if (params.main_gpu >= (int)model->devices.size()) {
+                LLAMA_LOG_ERROR("%s: invalid value for main_gpu: %d (available devices: %zu)\n", __func__, params.main_gpu, model->devices.size());
+                llama_model_free(model);
+                return nullptr;
+            }
+            ggml_backend_dev_t main_gpu = model->devices[params.main_gpu];
+            model->devices.clear();
+            model->devices.push_back(main_gpu);
         }
-        ggml_backend_dev_t main_gpu = model->devices[params.main_gpu];
-        model->devices.clear();
-        model->devices.push_back(main_gpu);
     }

     for (auto * dev : model->devices) {
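
Note (not part of the commit): with this change, LLAMA_SPLIT_MODE_NONE combined with a negative main_gpu now means "use no GPU devices" instead of being rejected as an error; the thread safety test below relies on this to load its CPU-only model copy. A minimal sketch of that combination (placeholder model path, error handling trimmed):

#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    mparams.split_mode = LLAMA_SPLIT_MODE_NONE;
    mparams.main_gpu   = -1;  // previously an error; now: keep no GPU devices, run on CPU

    // "model.gguf" is a placeholder path
    llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (model == NULL) {
        llama_backend_free();
        return 1;
    }

    llama_model_free(model);
    llama_backend_free();
    return 0;
}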

tests/CMakeLists.txt

Lines changed: 2 additions & 0 deletions

@@ -185,6 +185,8 @@ llama_build_and_test(test-json-partial.cpp)
 llama_build_and_test(test-log.cpp)
 llama_build_and_test(test-regex-partial.cpp)

+llama_build_and_test(test-thread-safety.cpp ARGS -hf ggml-org/models -hff tinyllamas/stories15M-q4_0.gguf -ngl 99 -p "The meaning of life is" -n 128 -c 256 -ub 32 -np 4)
+
 # this fails on windows (github hosted runner) due to curl DLL not found (exit code 0xc0000135)
 if (NOT WIN32)
     llama_build_and_test(test-arg-parser.cpp)
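
Note (not part of the commit): llama_build_and_test registers the binary as a ctest target, so after a build the test can presumably be run with something like `ctest -R thread-safety --verbose` from the build directory; the ARGS above fetch a small model via -hf/-hff and exercise 4 parallel contexts (-np 4) per loaded model.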

tests/test-thread-safety.cpp

Lines changed: 152 additions & 0 deletions

@@ -0,0 +1,152 @@
+// thread safety test
+// - Loads a copy of the same model on each GPU, plus a copy on the CPU
+// - Creates n_parallel (--parallel) contexts per model
+// - Runs inference in parallel on each context
+
+#include <thread>
+#include <vector>
+#include <atomic>
+#include "llama.h"
+#include "arg.h"
+#include "common.h"
+#include "log.h"
+#include "sampling.h"
+
+int main(int argc, char ** argv) {
+    common_params params;
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
+        return 1;
+    }
+
+    common_init();
+
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    LOG_INF("%s\n", common_params_get_system_info(params).c_str());
+
+    //llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
+    //    if (level == GGML_LOG_LEVEL_ERROR) {
+    //        common_log_add(common_log_main(), level, "%s", text);
+    //    }
+    //}, NULL);
+
+    auto cparams = common_context_params_to_llama(params);
+
+    int dev_count = ggml_backend_dev_count();
+    int gpu_dev_count = 0;
+    for (int i = 0; i < dev_count; ++i) {
+        auto * dev = ggml_backend_dev_get(i);
+        if (dev && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
+            gpu_dev_count++;
+        }
+    }
+    const int num_models = gpu_dev_count + 1 + 1; // GPUs + 1 CPU model + 1 layer split
+    //const int num_models = std::max(1, gpu_dev_count);
+    const int num_contexts = std::max(1, params.n_parallel);
+
+    std::vector<llama_model_ptr> models;
+    std::vector<std::thread> threads;
+    std::atomic<bool> failed = false;
+
+    for (int m = 0; m < num_models; ++m) {
+        auto mparams = common_model_params_to_llama(params);
+
+        if (m < gpu_dev_count) {
+            mparams.split_mode = LLAMA_SPLIT_MODE_NONE;
+            mparams.main_gpu = m;
+        } else if (m == gpu_dev_count) {
+            mparams.split_mode = LLAMA_SPLIT_MODE_NONE;
+            mparams.main_gpu = -1; // CPU model
+        } else {
+            mparams.split_mode = LLAMA_SPLIT_MODE_LAYER;;
+        }
+
+        llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
+        if (model == NULL) {
+            LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
+            return 1;
+        }
+
+        models.emplace_back(model);
+    }
+
+    for (int m = 0; m < num_models; ++m) {
+        auto * model = models[m].get();
+        for (int c = 0; c < num_contexts; ++c) {
+            threads.emplace_back([&, m, c, model]() {
+                LOG_INF("Creating context %d/%d for model %d/%d\n", c + 1, num_contexts, m + 1, num_models);
+
+                llama_context_ptr ctx { llama_init_from_model(model, cparams) };
+                if (ctx == NULL) {
+                    LOG_ERR("failed to create context\n");
+                    failed.store(true);
+                    return;
+                }
+
+                std::unique_ptr<common_sampler, decltype(&common_sampler_free)> sampler { common_sampler_init(model, params.sampling), common_sampler_free };
+                if (sampler == NULL) {
+                    LOG_ERR("failed to create sampler\n");
+                    failed.store(true);
+                    return;
+                }
+
+                llama_batch batch = {};
+                {
+                    auto prompt = common_tokenize(ctx.get(), params.prompt, true);
+                    if (prompt.empty()) {
+                        LOG_ERR("failed to tokenize prompt\n");
+                        failed.store(true);
+                        return;
+                    }
+                    batch = llama_batch_get_one(prompt.data(), prompt.size());
+                    if (llama_decode(ctx.get(), batch)) {
+                        LOG_ERR("failed to decode prompt\n");
+                        failed.store(true);
+                        return;
+                    }
+                }
+
+                const auto * vocab = llama_model_get_vocab(model);
+                std::string result = params.prompt;
+
+                for (int i = 0; i < params.n_predict; i++) {
+                    llama_token token;
+                    if (batch.n_tokens > 0) {
+                        token = common_sampler_sample(sampler.get(), ctx.get(), batch.n_tokens - 1);
+                    } else {
+                        token = llama_vocab_bos(vocab);
+                    }
+
+                    result += common_token_to_piece(ctx.get(), token);
+
+                    if (llama_vocab_is_eog(vocab, token)) {
+                        break;
+                    }
+
+                    batch = llama_batch_get_one(&token, 1);
+                    if (llama_decode(ctx.get(), batch)) {
+                        LOG_ERR("Model %d/%d, Context %d/%d: failed to decode\n", m + 1, num_models, c + 1, num_contexts);
+                        failed.store(true);
+                        return;
+                    }
+                }
+
+                LOG_INF("Model %d/%d, Context %d/%d: %s\n\n", m + 1, num_models, c + 1, num_contexts, result.c_str());
+            });
+        }
+    }
+
+    for (auto & thread : threads) {
+        thread.join();
+    }
+
+    if (failed) {
+        LOG_ERR("One or more threads failed.\n");
+        return 1;
+    }
+
+    LOG_INF("All threads finished without errors.\n");
+    return 0;
+}
