Open
Description
Is this a duplicate?
- I confirmed there appear to be no duplicate issues for this bug and that I agree to the Code of Conduct
Type of Bug
Performance
Component
cuda.bindings
Describe the bug
When using cuda.bindings.nvrtc to compile code from multiple threads, the compilations actually run sequentially. This is clearly a GIL issue: the GIL is not released around the NVRTC calls, even though NVRTC itself supports multi-threaded compilation.
How to Reproduce
import cuda.bindings.nvrtc as nvrtc
import concurrent.futures
import threading
import time

num_workers = 8
pools = [
    (concurrent.futures.ThreadPoolExecutor(max_workers=num_workers), "thread"),
    (concurrent.futures.ProcessPoolExecutor(max_workers=num_workers), "process")
]

def task(start_time):
    code = f"""
    extern "C" __global__
    void test(float a, float *x, float *y, float *out, size_t n)
    {{
        size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
        if (tid < n) {{
            float val = x[tid];
            for (int i = 0; i < 1000; ++i) {{
                val = val * a + y[tid];
                val = sinf(val) * cosf(y[tid] + (float)i);
                val = val / (1.0f + sqrtf(val * val + (float)i));
            }}
            for (int i = 0; i < 1000; ++i) {{
                val = val + a + y[tid];
                val = sinf(val) * cosf(y[tid] + (float)i);
                val = val / (1.0f + sqrtf(val * val + (float)i));
            }}
            for (int i = 0; i < 1000; ++i) {{
                float divisor_a = a;
                if (fabsf(divisor_a) < 1e-7f) {{
                    divisor_a = copysignf(1e-7f, a);
                }}
                val = val / divisor_a + y[tid];
                val = sinf(val) * cosf(y[tid] + (float)i);
                val = val * (1.0f + sqrtf(val * val + (float)i));
            }}
            for (int j = 0; j < 40; ++j) {{
                val = val * sinf(((float)j + 1.0f) * 0.1f + y[tid] * a) + cosf(val - ((float)j + 1.0f) * 0.05f);
                val = sqrtf(fabsf(val) + 0.1f + ((float)j + 1.0f) * 0.01f) / (0.1f + fabsf(y[tid] - ((float)j + 1.0f) * 0.2f + a));
                val = val + expf(0.001f * (val - a + y[tid] - ((float)j + 1.0f) * 0.15f));
                val = logf(fabsf(val) + 0.1f + fabsf(a) + ((float)j + 1.0f) * 0.02f) * tanhf(y[tid] / (1.0f + fabsf(val) + ((float)j + 1.0f) * 0.03f));
                val = val - a * cosf(val + ((float)j + 1.0f) * 0.08f) + y[tid] * sinf(((float)j + 1.0f) * 0.03f - val);
            }}
            out[tid] = val;
        }}
    }}
    """
    current_time_start = time.time()
    print(f"[{threading.get_native_id()}] Start at {current_time_start-start_time}")
    _, program = nvrtc.nvrtcCreateProgram(bytes(code, "utf-8"), bytes("test.cu", "utf-8"), 0, [], [])
    options = [b"--gpu-architecture=sm_90"]
    _ = nvrtc.nvrtcCompileProgram(program, len(options), options)
    current_time_end = time.time()
    print(f"[{threading.get_native_id()}] End at {current_time_end-start_time}")

for pool, name in pools:
    print(f"Using {name} pool")
    futures = []
    start_time = time.time()
    for i in range(num_workers):
        future = pool.submit(task, start_time)
        futures.append(future)
    for future in concurrent.futures.as_completed(futures):
        future.result()
A sample output is as follows:
Using thread pool
[237132] Start at 0.0005047321319580078
[237133] Start at 0.01818108558654785
[237134] Start at 0.8023033142089844
[237132] End at 0.802753210067749
[237177] Start at 0.8028905391693115
[237132] Start at 1.0601465702056885
[237134] End at 1.0604710578918457
[237132] End at 1.3186068534851074
[237208] Start at 1.3188414573669434
[237134] Start at 1.3195312023162842
[237177] End at 1.5766639709472656
[237178] Start at 1.3193464279174805
[237133] End at 1.0606462955474854
[237208] End at 1.8349909782409668
[237134] End at 2.3467905521392822
[237178] End at 2.3475685119628906
Using process pool
[237225] Start at 0.025279998779296875
[237226] Start at 0.025611162185668945
[237227] Start at 0.025903940200805664
[237228] Start at 0.026208162307739258
[237229] Start at 0.026407718658447266
[237230] Start at 0.026773691177368164
[237231] Start at 0.027005672454833984
[237232] Start at 0.027050018310546875
[237229] End at 0.27670884132385254
[237232] End at 0.28432679176330566
[237228] End at 0.28521203994750977
[237227] End at 0.28534388542175293
[237226] End at 0.28702807426452637
[237231] End at 0.28710436820983887
[237230] End at 0.28752636909484863
[237225] End at 0.28855133056640625
This clearly shows that with ThreadPoolExecutor, cuda.bindings.nvrtc is serialized by the GIL and runs sequentially, while the process pool finishes all eight compilations in parallel.
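As a cross-check that NVRTC itself handles concurrent compilation, here is a minimal sketch (not part of the reproducer above) that calls libnvrtc directly through ctypes, which releases the GIL around each foreign call, so the threaded compilations can overlap. The soname libnvrtc.so.12 and the trivial kernel are assumptions for illustration; adjust them to the installed CUDA version.

import concurrent.futures
import ctypes
import time

# Assumed soname for CUDA 12 on Linux; pick the one that matches your install.
libnvrtc = ctypes.CDLL("libnvrtc.so.12")

SRC = b'extern "C" __global__ void test(float *x) { x[threadIdx.x] += 1.0f; }'

def compile_once():
    # ctypes.CDLL drops the GIL for the duration of each foreign call,
    # so these NVRTC calls can run concurrently across threads.
    prog = ctypes.c_void_p()
    libnvrtc.nvrtcCreateProgram(ctypes.byref(prog), SRC, b"test.cu", 0, None, None)
    options = (ctypes.c_char_p * 1)(b"--gpu-architecture=sm_90")
    libnvrtc.nvrtcCompileProgram(prog, 1, options)
    libnvrtc.nvrtcDestroyProgram(ctypes.byref(prog))

start = time.time()
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as pool:
    for future in [pool.submit(compile_once) for _ in range(8)]:
        future.result()
print(f"8 threaded compilations took {time.time() - start:.3f}s")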
Expected behavior
cuda.bindings.nvrtc should not be affected by the GIL and should run in parallel when used from multiple threads.
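One rough way to quantify the expected behavior, reusing the task function from the reproducer above (serial_time and threaded_time are hypothetical measurements, not from the original report): if the GIL were released inside nvrtcCompileProgram, threaded_time should stay close to serial_time instead of scaling with the number of threads as it does today.

import concurrent.futures
import time

# Baseline: one compilation on the main thread.
t0 = time.time()
task(t0)
serial_time = time.time() - t0

# Eight compilations across eight threads.
t0 = time.time()
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as pool:
    for f in [pool.submit(task, t0) for _ in range(8)]:
        f.result()
threaded_time = time.time() - t0

# Expected once the GIL is released: threaded_time close to serial_time.
# Observed today (see the sample output above): threaded_time is roughly 8x serial_time.
print(serial_time, threaded_time)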
Operating System
Ubuntu Linux 24.04
nvidia-smi output
No response