Skip to content

Commit 25e1b55

Browse files
authored
[UR][L0] Avoid calls to destroy interop data structures given loader instability (#17543)
- When an application transfers ownership of an L0 handle to the UR, this does not prevent the OS from releasing the memory backing the handle when the interop library is torn down. This leads to a situation where the UR tries to destroy interop data structures that have already been destroyed by the OS.
- The UR has never known the lifetime of such a handle, but with the static L0 Loader, the loader's workaround for this situation (which "pretends" the memory was freed) is impossible to use.
- To fix this issue, avoid calls to destroy interop data structures when the runtime is being torn down.
- This avoids calling a function that already did not perform the intended operation, and prevents segfaults when the UR tries to destroy interop data structures that have already been destroyed.
--------- Signed-off-by: Neil R. Spruit <[email protected]>
1 parent 8418a42 commit 25e1b55

19 files changed

+186
-59
lines changed

unified-runtime/source/adapters/level_zero/common.hpp

+72
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,13 @@
1818
#include <unordered_map>
1919
#include <vector>
2020

21+
#ifdef _WIN32
22+
#include "windows.h"
23+
#else
24+
#include <dlfcn.h>
25+
#include <unistd.h>
26+
#endif
27+
2128
#include <ur/ur.hpp>
2229
#include <ur_ddi.h>
2330
#include <ze_api.h>
@@ -30,6 +37,68 @@
3037

3138
struct _ur_platform_handle_t;
3239

40+
[[maybe_unused]] static bool checkL0LoaderTeardown() {
41+
bool loaderStable = true;
42+
#ifdef _WIN32
43+
uint32_t ZeDriverCount = 0;
44+
HMODULE zeLoader = LoadLibrary("ze_loader.dll");
45+
if (zeLoader) {
46+
typedef ze_result_t (*zeDriverGet_t)(uint32_t *, ze_driver_handle_t *);
47+
zeDriverGet_t zeDriverGetLoader =
48+
(zeDriverGet_t)GetProcAddress(zeLoader, "zeDriverGet");
49+
if (zeDriverGetLoader) {
50+
ze_result_t result = zeDriverGetLoader(&ZeDriverCount, nullptr);
51+
logger::debug(
52+
"ZE ---> checkL0LoaderTeardown result = {} driver count = {}", result,
53+
ZeDriverCount);
54+
if (result != ZE_RESULT_SUCCESS || ZeDriverCount == 0) {
55+
loaderStable = false;
56+
}
57+
} else {
58+
logger::debug("ZE ---> checkL0LoaderTeardown: Failed to get address of "
59+
"zeDriverGet");
60+
loaderStable = false;
61+
}
62+
FreeLibrary(zeLoader);
63+
} else {
64+
logger::debug(
65+
"ZE ---> checkL0LoaderTeardown: Failed to load ze_loader.dll");
66+
loaderStable = false;
67+
}
68+
#else
69+
uint32_t ZeDriverCount = 0;
70+
void *zeLoader = dlopen("libze_loader.so.1", RTLD_LAZY);
71+
if (zeLoader) {
72+
typedef ze_result_t (*zeDriverGet_t)(uint32_t *, ze_driver_handle_t *);
73+
zeDriverGet_t zeDriverGetLoader =
74+
(zeDriverGet_t)dlsym(zeLoader, "zeDriverGet");
75+
if (zeDriverGetLoader) {
76+
ze_result_t result = zeDriverGetLoader(&ZeDriverCount, nullptr);
77+
logger::debug(
78+
"ZE ---> checkL0LoaderTeardown result = {} driver count = {}", result,
79+
ZeDriverCount);
80+
if (result != ZE_RESULT_SUCCESS || ZeDriverCount == 0) {
81+
loaderStable = false;
82+
}
83+
} else {
84+
logger::debug("ZE ---> checkL0LoaderTeardown: Failed to get address of "
85+
"zeDriverGet");
86+
loaderStable = false;
87+
}
88+
dlclose(zeLoader);
89+
} else {
90+
logger::debug(
91+
"ZE ---> checkL0LoaderTeardown: Failed to load libze_loader.so.1");
92+
loaderStable = false;
93+
}
94+
#endif
95+
if (!loaderStable) {
96+
logger::debug(
97+
"ZE ---> checkL0LoaderTeardown: Loader is not stable, returning false");
98+
}
99+
return loaderStable;
100+
}
101+
33102
static auto getUrResultString = [](ur_result_t Result) {
34103
switch (Result) {
35104
case UR_RESULT_SUCCESS:
@@ -435,6 +504,9 @@ struct _ur_object {
435504
// Indicates if we own the native handle or it came from interop that
436505
// asked to not transfer the ownership to SYCL RT.
437506
bool OwnNativeHandle = false;
507+
508+
// Indicates if this object is an interop handle.
509+
bool IsInteropNativeHandle = false;
438510
};
439511

440512
// Record for a memory allocation. This structure is used to keep information

unified-runtime/source/adapters/level_zero/context.cpp

+13-5
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,7 @@ ur_result_t urContextCreateWithNativeHandle(
152152
ur_context_handle_t_ *UrContext = new ur_context_handle_t_(
153153
ZeContext, NumDevices, Devices, OwnNativeHandle);
154154
UrContext->initialize();
155+
UrContext->IsInteropNativeHandle = true;
155156
*Context = reinterpret_cast<ur_context_handle_t>(UrContext);
156157
} catch (const std::bad_alloc &) {
157158
return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
@@ -263,7 +264,11 @@ ur_result_t ContextReleaseHelper(ur_context_handle_t Context) {
263264
Contexts.erase(It);
264265
}
265266
ze_context_handle_t DestroyZeContext =
266-
Context->OwnNativeHandle ? Context->ZeContext : nullptr;
267+
((Context->OwnNativeHandle && !Context->IsInteropNativeHandle) ||
268+
(Context->OwnNativeHandle && Context->IsInteropNativeHandle &&
269+
checkL0LoaderTeardown()))
270+
? Context->ZeContext
271+
: nullptr;
267272

268273
// Clean up any live memory associated with Context
269274
ur_result_t Result = Context->finalize();
@@ -302,11 +307,14 @@ ur_result_t ur_context_handle_t_::finalize() {
302307
std::scoped_lock<ur_mutex> Lock(EventCacheMutex);
303308
for (auto &EventCache : EventCaches) {
304309
for (auto &Event : EventCache) {
305-
auto ZeResult = ZE_CALL_NOCHECK(zeEventDestroy, (Event->ZeEvent));
310+
if (!Event->IsInteropNativeHandle ||
311+
(Event->IsInteropNativeHandle && checkL0LoaderTeardown())) {
312+
auto ZeResult = ZE_CALL_NOCHECK(zeEventDestroy, (Event->ZeEvent));
313+
// Gracefully handle the case that L0 was already unloaded.
314+
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
315+
return ze2urResult(ZeResult);
316+
}
306317
Event->ZeEvent = nullptr;
307-
// Gracefully handle the case that L0 was already unloaded.
308-
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
309-
return ze2urResult(ZeResult);
310318
delete Event;
311319
}
312320
EventCache.clear();

unified-runtime/source/adapters/level_zero/device.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -1528,6 +1528,7 @@ ur_result_t urDeviceCreateWithNativeHandle(
15281528
if (Dev == nullptr)
15291529
return UR_RESULT_ERROR_INVALID_VALUE;
15301530

1531+
Dev->IsInteropNativeHandle = true;
15311532
*Device = Dev;
15321533
return UR_RESULT_SUCCESS;
15331534
}

unified-runtime/source/adapters/level_zero/event.cpp

+8-4
Original file line numberDiff line numberDiff line change
@@ -1001,6 +1001,7 @@ ur_result_t urEventCreateWithNativeHandle(
10011001
UREvent->CleanedUp = true;
10021002

10031003
*Event = reinterpret_cast<ur_event_handle_t>(UREvent);
1004+
UREvent->IsInteropNativeHandle = true;
10041005

10051006
return UR_RESULT_SUCCESS;
10061007
}
@@ -1120,11 +1121,14 @@ ur_result_t urEventReleaseInternal(ur_event_handle_t Event) {
11201121
}
11211122
if (Event->OwnNativeHandle) {
11221123
if (DisableEventsCaching) {
1123-
auto ZeResult = ZE_CALL_NOCHECK(zeEventDestroy, (Event->ZeEvent));
1124+
if (!Event->IsInteropNativeHandle ||
1125+
(Event->IsInteropNativeHandle && checkL0LoaderTeardown())) {
1126+
auto ZeResult = ZE_CALL_NOCHECK(zeEventDestroy, (Event->ZeEvent));
1127+
// Gracefully handle the case that L0 was already unloaded.
1128+
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
1129+
return ze2urResult(ZeResult);
1130+
}
11241131
Event->ZeEvent = nullptr;
1125-
// Gracefully handle the case that L0 was already unloaded.
1126-
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
1127-
return ze2urResult(ZeResult);
11281132
auto Context = Event->Context;
11291133
if (auto Res = Context->decrementUnreleasedEventsInPool(Event))
11301134
return Res;

unified-runtime/source/adapters/level_zero/kernel.cpp

+8-4
Original file line numberDiff line numberDiff line change
@@ -940,10 +940,13 @@ ur_result_t urKernelRelease(
940940
auto KernelProgram = Kernel->Program;
941941
if (Kernel->OwnNativeHandle) {
942942
for (auto &ZeKernel : Kernel->ZeKernels) {
943-
auto ZeResult = ZE_CALL_NOCHECK(zeKernelDestroy, (ZeKernel));
944-
// Gracefully handle the case that L0 was already unloaded.
945-
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
946-
return ze2urResult(ZeResult);
943+
if (!Kernel->IsInteropNativeHandle ||
944+
(Kernel->IsInteropNativeHandle && checkL0LoaderTeardown())) {
945+
auto ZeResult = ZE_CALL_NOCHECK(zeKernelDestroy, (ZeKernel));
946+
// Gracefully handle the case that L0 was already unloaded.
947+
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
948+
return ze2urResult(ZeResult);
949+
}
947950
}
948951
}
949952
Kernel->ZeKernelMap.clear();
@@ -1154,6 +1157,7 @@ ur_result_t urKernelCreateWithNativeHandle(
11541157
}
11551158

11561159
Kernel->Program = Program;
1160+
Kernel->IsInteropNativeHandle = true;
11571161

11581162
UR_CALL(Kernel->initialize());
11591163

unified-runtime/source/adapters/level_zero/memory.cpp

+10-5
Original file line numberDiff line numberDiff line change
@@ -1563,6 +1563,7 @@ ur_result_t urMemImageCreateWithNativeHandle(
15631563
auto OwnNativeHandle = Properties ? Properties->isNativeHandleOwned : false;
15641564
UR_CALL(createUrMemFromZeImage(Context, ZeHImage, OwnNativeHandle,
15651565
ZeImageDesc, Mem));
1566+
(*Mem)->IsInteropNativeHandle = true;
15661567

15671568
return UR_RESULT_SUCCESS;
15681569
}
@@ -1662,11 +1663,14 @@ ur_result_t urMemRelease(
16621663
if (Image->OwnNativeHandle) {
16631664
UR_CALL(Mem->getZeHandle(ZeHandleImage, ur_mem_handle_t_::write_only,
16641665
nullptr, nullptr, 0u));
1665-
auto ZeResult = ZE_CALL_NOCHECK(
1666-
zeImageDestroy, (ur_cast<ze_image_handle_t>(ZeHandleImage)));
1667-
// Gracefully handle the case that L0 was already unloaded.
1668-
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
1669-
return ze2urResult(ZeResult);
1666+
if (!Image->IsInteropNativeHandle ||
1667+
(Image->IsInteropNativeHandle && checkL0LoaderTeardown())) {
1668+
auto ZeResult = ZE_CALL_NOCHECK(
1669+
zeImageDestroy, (ur_cast<ze_image_handle_t>(ZeHandleImage)));
1670+
// Gracefully handle the case that L0 was already unloaded.
1671+
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
1672+
return ze2urResult(ZeResult);
1673+
}
16701674
}
16711675
delete Image;
16721676
} else {
@@ -1772,6 +1776,7 @@ ur_result_t urMemBufferCreateWithNativeHandle(
17721776
Buffer = new _ur_buffer(Context, Size, Device, ur_cast<char *>(NativeMem),
17731777
OwnNativeHandle);
17741778
*Mem = reinterpret_cast<ur_mem_handle_t>(Buffer);
1779+
(*Mem)->IsInteropNativeHandle = true;
17751780
} catch (const std::bad_alloc &) {
17761781
return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
17771782
} catch (...) {

unified-runtime/source/adapters/level_zero/program.cpp

+7-7
Original file line numberDiff line numberDiff line change
@@ -956,7 +956,6 @@ ur_result_t urProgramCreateWithNativeHandle(
956956
UR_ASSERT(Context && NativeProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
957957
UR_ASSERT(Program, UR_RESULT_ERROR_INVALID_NULL_POINTER);
958958
auto ZeModule = ur_cast<ze_module_handle_t>(NativeProgram);
959-
960959
// We assume here that programs created from a native handle always
961960
// represent a fully linked executable (state Exe) and not an unlinked
962961
// executable (state Object).
@@ -966,6 +965,7 @@ ur_result_t urProgramCreateWithNativeHandle(
966965
ur_program_handle_t_::Exe, Context, ZeModule,
967966
Properties ? Properties->isNativeHandleOwned : false);
968967
*Program = reinterpret_cast<ur_program_handle_t>(UrProgram);
968+
(*Program)->IsInteropNativeHandle = true;
969969
} catch (const std::bad_alloc &) {
970970
return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
971971
} catch (...) {
@@ -1036,15 +1036,15 @@ ur_program_handle_t_::ur_program_handle_t_(ur_context_handle_t Context)
10361036
ur_program_handle_t_::ur_program_handle_t_(state, ur_context_handle_t Context,
10371037
ze_module_handle_t InteropZeModule)
10381038
: Context{Context}, NativeProperties{nullptr}, OwnZeModule{true},
1039-
AssociatedDevices({Context->getDevices()[0]}),
1040-
InteropZeModule{InteropZeModule} {}
1039+
AssociatedDevices({Context->getDevices()[0]}), InteropZeModule{
1040+
InteropZeModule} {}
10411041

10421042
ur_program_handle_t_::ur_program_handle_t_(state, ur_context_handle_t Context,
10431043
ze_module_handle_t InteropZeModule,
10441044
bool OwnZeModule)
10451045
: Context{Context}, NativeProperties{nullptr}, OwnZeModule{OwnZeModule},
1046-
AssociatedDevices({Context->getDevices()[0]}),
1047-
InteropZeModule{InteropZeModule} {
1046+
AssociatedDevices({Context->getDevices()[0]}), InteropZeModule{
1047+
InteropZeModule} {
10481048
// TODO: Currently it is not possible to understand the device associated
10491049
// with provided ZeModule. So we can't set the state on that device to Exe.
10501050
}
@@ -1080,10 +1080,10 @@ void ur_program_handle_t_::ur_release_program_resources(bool deletion) {
10801080
if (DeviceData.ZeBuildLog)
10811081
ZE_CALL_NOCHECK(zeModuleBuildLogDestroy, (DeviceData.ZeBuildLog));
10821082
}
1083-
10841083
// interop api
1085-
if (InteropZeModule && OwnZeModule)
1084+
if (InteropZeModule && OwnZeModule && checkL0LoaderTeardown()) {
10861085
ZE_CALL_NOCHECK(zeModuleDestroy, (InteropZeModule));
1086+
}
10871087

10881088
for (auto &[ZeDevice, DeviceData] : this->DeviceDataMap)
10891089
if (DeviceData.ZeModule)

unified-runtime/source/adapters/level_zero/queue.cpp

+8-4
Original file line numberDiff line numberDiff line change
@@ -800,6 +800,7 @@ ur_result_t urQueueCreateWithNativeHandle(
800800
ur_queue_handle_t_ *Queue = new ur_queue_handle_t_(
801801
ComputeQueues, CopyQueues, Context, UrDevice, OwnNativeHandle, Flags);
802802
*RetQueue = reinterpret_cast<ur_queue_handle_t>(Queue);
803+
(*RetQueue)->IsInteropNativeHandle = true;
803804
} catch (const std::bad_alloc &) {
804805
return UR_RESULT_ERROR_OUT_OF_RESOURCES;
805806
} catch (...) {
@@ -1604,10 +1605,13 @@ ur_result_t urQueueReleaseInternal(ur_queue_handle_t Queue) {
16041605
for (auto &QueueGroup : QueueMap)
16051606
for (auto &ZeQueue : QueueGroup.second.ZeQueues)
16061607
if (ZeQueue) {
1607-
auto ZeResult = ZE_CALL_NOCHECK(zeCommandQueueDestroy, (ZeQueue));
1608-
// Gracefully handle the case that L0 was already unloaded.
1609-
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
1610-
return ze2urResult(ZeResult);
1608+
if (!Queue->IsInteropNativeHandle ||
1609+
(Queue->IsInteropNativeHandle && checkL0LoaderTeardown())) {
1610+
auto ZeResult = ZE_CALL_NOCHECK(zeCommandQueueDestroy, (ZeQueue));
1611+
// Gracefully handle the case that L0 was already unloaded.
1612+
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
1613+
return ze2urResult(ZeResult);
1614+
}
16111615
}
16121616
}
16131617

unified-runtime/source/adapters/level_zero/usm.cpp

+11-6
Original file line numberDiff line numberDiff line change
@@ -683,13 +683,18 @@ ur_result_t UR_APICALL urUSMPoolTrimToExp(ur_context_handle_t,
683683
} // namespace ur::level_zero
684684

685685
static ur_result_t USMFreeImpl(ur_context_handle_t Context, void *Ptr) {
686-
auto ZeResult = ZE_CALL_NOCHECK(zeMemFree, (Context->ZeContext, Ptr));
687-
// Handle When the driver is already released
688-
if (ZeResult == ZE_RESULT_ERROR_UNINITIALIZED) {
689-
return UR_RESULT_SUCCESS;
690-
} else {
691-
return ze2urResult(ZeResult);
686+
ur_result_t Res = UR_RESULT_SUCCESS;
687+
if (!Context->IsInteropNativeHandle ||
688+
(Context->IsInteropNativeHandle && checkL0LoaderTeardown())) {
689+
auto ZeResult = ZE_CALL_NOCHECK(zeMemFree, (Context->ZeContext, Ptr));
690+
// Handle When the driver is already released
691+
if (ZeResult == ZE_RESULT_ERROR_UNINITIALIZED) {
692+
Res = UR_RESULT_SUCCESS;
693+
} else {
694+
Res = ze2urResult(ZeResult);
695+
}
692696
}
697+
return Res;
693698
}
694699

695700
static ur_result_t USMQueryPageSize(ur_context_handle_t Context, void *Ptr,

unified-runtime/source/adapters/level_zero/v2/common.hpp

+3-1
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,8 @@ struct ze_handle_wrapper {
7979
return;
8080
}
8181

82-
if (ownZeHandle) {
82+
if ((ownZeHandle && !IsInteropNativeHandle) ||
83+
(ownZeHandle && IsInteropNativeHandle && checkL0LoaderTeardown())) {
8384
auto zeResult = destroy(handle);
8485
// Gracefully handle the case that L0 was already unloaded.
8586
if (zeResult && zeResult != ZE_RESULT_ERROR_UNINITIALIZED)
@@ -102,6 +103,7 @@ struct ze_handle_wrapper {
102103
private:
103104
ZeHandleT handle;
104105
bool ownZeHandle;
106+
bool IsInteropNativeHandle = false;
105107
};
106108

107109
using ze_kernel_handle_t = HANDLE_WRAPPER_TYPE(::ze_kernel_handle_t,

unified-runtime/source/adapters/level_zero/v2/context.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,7 @@ ur_result_t urContextCreateWithNativeHandle(
152152

153153
*phContext =
154154
new ur_context_handle_t_(zeContext, numDevices, phDevices, ownZeHandle);
155+
(*phContext)->IsInteropNativeHandle = true;
155156
return UR_RESULT_SUCCESS;
156157
} catch (...) {
157158
return exceptionToResult(std::current_exception());

unified-runtime/source/adapters/level_zero/v2/event.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -398,6 +398,7 @@ urEventCreateWithNativeHandle(ur_native_handle_t hNativeEvent,
398398
ZE2UR_CALL(zeEventHostSignal, ((*phEvent)->getZeEvent()));
399399
} else {
400400
*phEvent = new ur_event_handle_t_(hContext, hNativeEvent, pProperties);
401+
(*phEvent)->IsInteropNativeHandle = true;
401402
}
402403
return UR_RESULT_SUCCESS;
403404
} catch (...) {

unified-runtime/source/adapters/level_zero/v2/kernel.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -361,6 +361,7 @@ urKernelCreateWithNativeHandle(ur_native_handle_t hNativeKernel,
361361

362362
*phKernel =
363363
new ur_kernel_handle_t_(hNativeKernel, hProgram, hContext, pProperties);
364+
(*phKernel)->IsInteropNativeHandle = true;
364365
return UR_RESULT_SUCCESS;
365366
} catch (...) {
366367
return exceptionToResult(std::current_exception());

0 commit comments

Comments
 (0)