Skip to content

Commit 844d7b6

Browse files
authored
[SYCL] Experimental support for L0 host pointer import (#4891)
This change adds support for an experimental L0 API for host pointer import into USM. Signed-off-by: Rajiv Deodhar <[email protected]>
1 parent b48be20 commit 844d7b6

File tree

3 files changed

+132
-25
lines changed

3 files changed

+132
-25
lines changed

sycl/doc/EnvironmentVariables.md

+1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ compiler and runtime.
2121
| `SYCL_CACHE_MIN_DEVICE_IMAGE_SIZE` | Positive integer | Minimum size, in bytes, of a device code image that is worth caching on disk, because reading it back from disk may take longer than JIT-compiling it. Default value is 0, which caches all images. |
2222
| `SYCL_CACHE_MAX_DEVICE_IMAGE_SIZE` | Positive integer | Maximum size, in bytes, of a device image that is cached. Very large kernels may fill up the disk too quickly. Default value is 1 GB. |
2323
| `SYCL_ENABLE_DEFAULT_CONTEXTS` | '1' or '0' | Enable ('1') or disable ('0') creation of default platform contexts in SYCL runtime. The default context for each platform contains all devices in the platform. Refer to [Platform Default Contexts](extensions/PlatformContext/PlatformContext.adoc) extension to learn more. Enabled by default on Linux and disabled on Windows. |
24+
| `SYCL_USM_HOSTPTR_IMPORT` | Integer | Enabled by specifying a non-zero value. Buffers created with a host pointer will have their host data promoted to USM, improving data transfer performance. To use this feature, also set `SYCL_HOST_UNIFIED_MEMORY=1`. |
2425

2526
`(*) Note: Any means this environment variable is effective when set to any non-null value.`
2627

sycl/plugins/level_zero/pi_level_zero.cpp

+121-20
Original file line numberDiff line numberDiff line change
@@ -1700,6 +1700,54 @@ static bool setEnvVar(const char *name, const char *value) {
17001700
return true;
17011701
}
17021702

1703+
static class ZeUSMImportExtension {
1704+
// Pointers to functions that import/release host memory into USM
1705+
ze_result_t (*zexDriverImportExternalPointer)(ze_driver_handle_t hDriver,
1706+
void *, size_t);
1707+
ze_result_t (*zexDriverReleaseImportedPointer)(ze_driver_handle_t, void *);
1708+
1709+
public:
1710+
// Whether user has requested Import/Release, and platform supports it.
1711+
bool Enabled;
1712+
1713+
ZeUSMImportExtension() : Enabled{false} {}
1714+
1715+
void setZeUSMImport(pi_platform Platform) {
1716+
// Whether env var SYCL_USM_HOSTPTR_IMPORT has been set requesting
1717+
// host ptr import during buffer creation.
1718+
const char *USMHostPtrImportStr = std::getenv("SYCL_USM_HOSTPTR_IMPORT");
1719+
if (!USMHostPtrImportStr || std::atoi(USMHostPtrImportStr) == 0)
1720+
return;
1721+
1722+
// Check if USM hostptr import feature is available.
1723+
ze_driver_handle_t driverHandle = Platform->ZeDriver;
1724+
if (ZE_CALL_NOCHECK(zeDriverGetExtensionFunctionAddress,
1725+
(driverHandle, "zexDriverImportExternalPointer",
1726+
reinterpret_cast<void **>(
1727+
&zexDriverImportExternalPointer))) == 0) {
1728+
ZE_CALL_NOCHECK(
1729+
zeDriverGetExtensionFunctionAddress,
1730+
(driverHandle, "zexDriverReleaseImportedPointer",
1731+
reinterpret_cast<void **>(&zexDriverReleaseImportedPointer)));
1732+
// Hostptr import/release is turned on because it has been requested
1733+
// by the env var, and this platform supports the APIs.
1734+
Enabled = true;
1735+
// Hostptr import is only possible if piMemBufferCreate receives a
1736+
// hostptr as an argument. The SYCL runtime passes a host ptr
1737+
// only when SYCL_HOST_UNIFIED_MEMORY is enabled. Therefore we turn it on.
1738+
setEnvVar("SYCL_HOST_UNIFIED_MEMORY", "1");
1739+
}
1740+
}
1741+
void doZeUSMImport(ze_driver_handle_t driverHandle, void *HostPtr,
1742+
size_t Size) {
1743+
ZE_CALL_NOCHECK(zexDriverImportExternalPointer,
1744+
(driverHandle, HostPtr, Size));
1745+
}
1746+
void doZeUSMRelease(ze_driver_handle_t driverHandle, void *HostPtr) {
1747+
ZE_CALL_NOCHECK(zexDriverReleaseImportedPointer, (driverHandle, HostPtr));
1748+
}
1749+
} ZeUSMImport;
1750+
17031751
pi_result _pi_platform::initialize() {
17041752
// Cache driver properties
17051753
ZeStruct<ze_driver_properties_t> ZeDriverProperties;
@@ -1745,6 +1793,10 @@ pi_result _pi_platform::initialize() {
17451793
zeDriverExtensionMap[extension.name] = extension.version;
17461794
}
17471795

1796+
// Check if import user ptr into USM feature has been requested.
1797+
// If yes, then set up L0 API pointers if the platform supports it.
1798+
ZeUSMImport.setZeUSMImport(this);
1799+
17481800
return PI_SUCCESS;
17491801
}
17501802

@@ -1854,8 +1906,9 @@ pi_result piPlatformsGet(pi_uint32 NumEntries, pi_platform *Platforms,
18541906
std::copy_n(PiPlatformsCache->begin(), NumEntries, Platforms);
18551907
}
18561908

1857-
if (NumPlatforms)
1909+
if (NumPlatforms) {
18581910
*NumPlatforms = PiPlatformsCache->size();
1911+
}
18591912

18601913
zePrint("Using events scope: %s\n",
18611914
EventsScope == AllHostVisible ? "all host-visible"
@@ -3360,32 +3413,69 @@ pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size,
33603413
else
33613414
Alignment = 1UL;
33623415

3363-
pi_result Result = PI_SUCCESS;
3416+
// If USM Import feature is enabled and hostptr is supplied,
3417+
// import the hostptr if not already imported into USM.
3418+
// Data transfer rate is maximized when both source and destination
3419+
// are USM pointers. Promotion of the host pointer to USM thus
3420+
// optimizes data transfer performance.
3421+
bool HostPtrImported = false;
3422+
if (ZeUSMImport.Enabled && HostPtr != nullptr &&
3423+
(Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0) {
3424+
// Query memory type of the host pointer
3425+
ze_device_handle_t ZeDeviceHandle;
3426+
ZeStruct<ze_memory_allocation_properties_t> ZeMemoryAllocationProperties;
3427+
ZE_CALL(zeMemGetAllocProperties,
3428+
(Context->ZeContext, HostPtr, &ZeMemoryAllocationProperties,
3429+
&ZeDeviceHandle));
3430+
3431+
// If not shared of any type, we can import the ptr
3432+
if (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_UNKNOWN) {
3433+
// Promote the host ptr to USM host memory
3434+
ze_driver_handle_t driverHandle = Context->Devices[0]->Platform->ZeDriver;
3435+
ZeUSMImport.doZeUSMImport(driverHandle, HostPtr, Size);
3436+
HostPtrImported = true;
3437+
}
3438+
}
3439+
3440+
pi_result Result;
33643441
if (DeviceIsIntegrated) {
3365-
if (enableBufferPooling()) {
3366-
PI_CALL(piextUSMHostAlloc(&Ptr, Context, nullptr, Size, Alignment));
3367-
} else
3368-
Result = ZeHostMemAllocHelper(&Ptr, Context, Size);
3442+
if (HostPtrImported) {
3443+
// When HostPtr is imported we use it for the buffer.
3444+
Ptr = HostPtr;
3445+
} else {
3446+
if (enableBufferPooling()) {
3447+
PI_CALL(piextUSMHostAlloc(&Ptr, Context, nullptr, Size, Alignment));
3448+
} else {
3449+
Result = ZeHostMemAllocHelper(&Ptr, Context, Size);
3450+
}
3451+
}
33693452
} else if (Context->SingleRootDevice) {
33703453
// If we have a single discrete device or all devices in the context are
33713454
// sub-devices of the same device then we can allocate on device
33723455
if (enableBufferPooling()) {
33733456
PI_CALL(piextUSMDeviceAlloc(&Ptr, Context, Context->SingleRootDevice,
33743457
nullptr, Size, Alignment));
3375-
} else
3458+
} else {
33763459
Result = ZeDeviceMemAllocHelper(&Ptr, Context, Context->SingleRootDevice,
33773460
Size);
3461+
}
33783462
} else {
33793463
// Context with several gpu cards. Temporarily use host allocation because
33803464
// it is accessible by all devices. But it is not good in terms of
33813465
// performance.
33823466
// TODO: We need to either allow remote access to device memory using IPC,
33833467
// or do explicit memory transfers from one device to another using host
33843468
// resources as backing buffers to allow those transfers.
3385-
if (enableBufferPooling()) {
3386-
PI_CALL(piextUSMHostAlloc(&Ptr, Context, nullptr, Size, Alignment));
3387-
} else
3388-
Result = ZeHostMemAllocHelper(&Ptr, Context, Size);
3469+
if (HostPtrImported) {
3470+
// When HostPtr is imported we use it for the buffer.
3471+
Ptr = HostPtr;
3472+
} else {
3473+
if (enableBufferPooling()) {
3474+
PI_CALL(piextUSMHostAlloc(&Ptr, Context, nullptr, Size, Alignment));
3475+
} else {
3476+
Result = ZeHostMemAllocHelper(&Ptr, Context, Size);
3477+
}
3478+
}
33893479
}
33903480

33913481
if (Result != PI_SUCCESS)
@@ -3396,8 +3486,10 @@ pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size,
33963486
(Flags & PI_MEM_FLAGS_HOST_PTR_COPY) != 0) {
33973487
// Initialize the buffer with user data
33983488
if (DeviceIsIntegrated) {
3399-
// Do a host to host copy
3400-
memcpy(Ptr, HostPtr, Size);
3489+
// Do a host to host copy.
3490+
// For an imported HostPtr the copy is unneeded.
3491+
if (!HostPtrImported)
3492+
memcpy(Ptr, HostPtr, Size);
34013493
} else if (Context->SingleRootDevice) {
34023494
// Initialize the buffer synchronously with immediate offload
34033495
ZE_CALL(zeCommandListAppendMemoryCopy,
@@ -3406,7 +3498,9 @@ pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size,
34063498
} else {
34073499
// Multiple root devices, do a host to host copy because we use a host
34083500
// allocation for this case.
3409-
memcpy(Ptr, HostPtr, Size);
3501+
// For an imported HostPtr the copy is unneeded.
3502+
if (!HostPtrImported)
3503+
memcpy(Ptr, HostPtr, Size);
34103504
}
34113505
} else if (Flags == 0 || (Flags == PI_MEM_FLAGS_ACCESS_RW)) {
34123506
// Nothing more to do.
@@ -3421,7 +3515,7 @@ pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size,
34213515
*RetMem = new _pi_buffer(
34223516
Context, pi_cast<char *>(Ptr) /* Level Zero Memory Handle */,
34233517
HostPtrOrNull, nullptr, 0, 0,
3424-
DeviceIsIntegrated /* allocation in host memory */);
3518+
DeviceIsIntegrated /* allocation in host memory */, HostPtrImported);
34253519
} catch (const std::bad_alloc &) {
34263520
return PI_OUT_OF_HOST_MEMORY;
34273521
} catch (...) {
@@ -3491,11 +3585,17 @@ pi_result piMemRelease(pi_mem Mem) {
34913585
} else {
34923586
auto Buf = static_cast<_pi_buffer *>(Mem);
34933587
if (!Buf->isSubBuffer()) {
3494-
if (enableBufferPooling()) {
3495-
PI_CALL(piextUSMFree(Mem->Context, Mem->getZeHandle()));
3588+
if (Mem->HostPtrImported) {
3589+
ze_driver_handle_t driverHandle =
3590+
Mem->Context->Devices[0]->Platform->ZeDriver;
3591+
ZeUSMImport.doZeUSMRelease(driverHandle, Mem->MapHostPtr);
34963592
} else {
3497-
if (auto Res = ZeMemFreeHelper(Mem->Context, Mem->getZeHandle()))
3498-
return Res;
3593+
if (enableBufferPooling()) {
3594+
PI_CALL(piextUSMFree(Mem->Context, Mem->getZeHandle()));
3595+
} else {
3596+
if (auto Res = ZeMemFreeHelper(Mem->Context, Mem->getZeHandle()))
3597+
return Res;
3598+
}
34993599
}
35003600
}
35013601
}
@@ -6020,7 +6120,8 @@ pi_result piEnqueueMemBufferMap(pi_queue Queue, pi_mem Buffer,
60206120

60216121
if (Buffer->MapHostPtr) {
60226122
*RetMap = Buffer->MapHostPtr + Offset;
6023-
if (!(MapFlags & PI_MAP_WRITE_INVALIDATE_REGION))
6123+
if (!Buffer->HostPtrImported &&
6124+
!(MapFlags & PI_MAP_WRITE_INVALIDATE_REGION))
60246125
memcpy(*RetMap, pi_cast<char *>(Buffer->getZeHandle()) + Offset, Size);
60256126
} else {
60266127
*RetMap = pi_cast<char *>(Buffer->getZeHandle()) + Offset;

sycl/plugins/level_zero/pi_level_zero.hpp

+10-5
Original file line numberDiff line numberDiff line change
@@ -811,6 +811,9 @@ struct _pi_mem : _pi_object {
811811
// Flag to indicate that this memory is allocated in host memory
812812
bool OnHost;
813813

814+
// Flag to indicate that the host ptr has been imported into USM
815+
bool HostPtrImported;
816+
814817
// Supplementary data to keep track of the mappings of this memory
815818
// created with piEnqueueMemBufferMap and piEnqueueMemImageMap.
816819
struct Mapping {
@@ -838,8 +841,10 @@ struct _pi_mem : _pi_object {
838841
pi_result removeMapping(void *MappedTo, Mapping &MapInfo);
839842

840843
protected:
841-
_pi_mem(pi_context Ctx, char *HostPtr, bool MemOnHost = false)
842-
: Context{Ctx}, MapHostPtr{HostPtr}, OnHost{MemOnHost}, Mappings{} {}
844+
_pi_mem(pi_context Ctx, char *HostPtr, bool MemOnHost = false,
845+
bool ImportedHostPtr = false)
846+
: Context{Ctx}, MapHostPtr{HostPtr}, OnHost{MemOnHost},
847+
HostPtrImported{ImportedHostPtr}, Mappings{} {}
843848

844849
private:
845850
// The key is the host pointer representing an active mapping.
@@ -856,9 +861,9 @@ struct _pi_buffer final : _pi_mem {
856861
// Buffer/Sub-buffer constructor
857862
_pi_buffer(pi_context Ctx, char *Mem, char *HostPtr,
858863
_pi_mem *Parent = nullptr, size_t Origin = 0, size_t Size = 0,
859-
bool MemOnHost = false)
860-
: _pi_mem(Ctx, HostPtr, MemOnHost), ZeMem{Mem}, SubBuffer{Parent, Origin,
861-
Size} {}
864+
bool MemOnHost = false, bool ImportedHostPtr = false)
865+
: _pi_mem(Ctx, HostPtr, MemOnHost, ImportedHostPtr), ZeMem{Mem},
866+
SubBuffer{Parent, Origin, Size} {}
862867

863868
void *getZeHandle() override { return ZeMem; }
864869

0 commit comments

Comments
 (0)