@@ -1700,6 +1700,54 @@ static bool setEnvVar(const char *name, const char *value) {
1700
1700
return true ;
1701
1701
}
1702
1702
1703
+ static class ZeUSMImportExtension {
1704
+ // Pointers to functions that import/release host memory into USM
1705
+ ze_result_t (*zexDriverImportExternalPointer)(ze_driver_handle_t hDriver,
1706
+ void *, size_t );
1707
+ ze_result_t (*zexDriverReleaseImportedPointer)(ze_driver_handle_t , void *);
1708
+
1709
+ public:
1710
+ // Whether user has requested Import/Release, and platform supports it.
1711
+ bool Enabled;
1712
+
1713
+ ZeUSMImportExtension () : Enabled{false } {}
1714
+
1715
+ void setZeUSMImport (pi_platform Platform) {
1716
+ // Whether env var SYCL_USM_HOSTPTR_IMPORT has been set requesting
1717
+ // host ptr import during buffer creation.
1718
+ const char *USMHostPtrImportStr = std::getenv (" SYCL_USM_HOSTPTR_IMPORT" );
1719
+ if (!USMHostPtrImportStr || std::atoi (USMHostPtrImportStr) == 0 )
1720
+ return ;
1721
+
1722
+ // Check if USM hostptr import feature is available.
1723
+ ze_driver_handle_t driverHandle = Platform->ZeDriver ;
1724
+ if (ZE_CALL_NOCHECK (zeDriverGetExtensionFunctionAddress,
1725
+ (driverHandle, " zexDriverImportExternalPointer" ,
1726
+ reinterpret_cast <void **>(
1727
+ &zexDriverImportExternalPointer))) == 0 ) {
1728
+ ZE_CALL_NOCHECK (
1729
+ zeDriverGetExtensionFunctionAddress,
1730
+ (driverHandle, " zexDriverReleaseImportedPointer" ,
1731
+ reinterpret_cast <void **>(&zexDriverReleaseImportedPointer)));
1732
+ // Hostptr import/release is turned on because it has been requested
1733
+ // by the env var, and this platform supports the APIs.
1734
+ Enabled = true ;
1735
+ // Hostptr import is only possible if piMemBufferCreate receives a
1736
+ // hostptr as an argument. The SYCL runtime passes a host ptr
1737
+ // only when SYCL_HOST_UNIFIED_MEMORY is enabled. Therefore we turn it on.
1738
+ setEnvVar (" SYCL_HOST_UNIFIED_MEMORY" , " 1" );
1739
+ }
1740
+ }
1741
+ void doZeUSMImport (ze_driver_handle_t driverHandle, void *HostPtr,
1742
+ size_t Size ) {
1743
+ ZE_CALL_NOCHECK (zexDriverImportExternalPointer,
1744
+ (driverHandle, HostPtr, Size ));
1745
+ }
1746
+ void doZeUSMRelease (ze_driver_handle_t driverHandle, void *HostPtr) {
1747
+ ZE_CALL_NOCHECK (zexDriverReleaseImportedPointer, (driverHandle, HostPtr));
1748
+ }
1749
+ } ZeUSMImport;
1750
+
1703
1751
pi_result _pi_platform::initialize () {
1704
1752
// Cache driver properties
1705
1753
ZeStruct<ze_driver_properties_t > ZeDriverProperties;
@@ -1745,6 +1793,10 @@ pi_result _pi_platform::initialize() {
1745
1793
zeDriverExtensionMap[extension.name ] = extension.version ;
1746
1794
}
1747
1795
1796
+ // Check if import user ptr into USM feature has been requested.
1797
+ // If yes, then set up L0 API pointers if the platform supports it.
1798
+ ZeUSMImport.setZeUSMImport (this );
1799
+
1748
1800
return PI_SUCCESS;
1749
1801
}
1750
1802
@@ -1854,8 +1906,9 @@ pi_result piPlatformsGet(pi_uint32 NumEntries, pi_platform *Platforms,
1854
1906
std::copy_n (PiPlatformsCache->begin (), NumEntries, Platforms);
1855
1907
}
1856
1908
1857
- if (NumPlatforms)
1909
+ if (NumPlatforms) {
1858
1910
*NumPlatforms = PiPlatformsCache->size ();
1911
+ }
1859
1912
1860
1913
zePrint (" Using events scope: %s\n " ,
1861
1914
EventsScope == AllHostVisible ? " all host-visible"
@@ -3360,32 +3413,69 @@ pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size,
3360
3413
else
3361
3414
Alignment = 1UL ;
3362
3415
3363
- pi_result Result = PI_SUCCESS;
3416
+ // If USM Import feature is enabled and hostptr is supplied,
3417
+ // import the hostptr if not already imported into USM.
3418
+ // Data transfer rate is maximized when both source and destination
3419
+ // are USM pointers. Promotion of the host pointer to USM thus
3420
+ // optimizes data transfer performance.
3421
+ bool HostPtrImported = false ;
3422
+ if (ZeUSMImport.Enabled && HostPtr != nullptr &&
3423
+ (Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0 ) {
3424
+ // Query memory type of the host pointer
3425
+ ze_device_handle_t ZeDeviceHandle;
3426
+ ZeStruct<ze_memory_allocation_properties_t > ZeMemoryAllocationProperties;
3427
+ ZE_CALL (zeMemGetAllocProperties,
3428
+ (Context->ZeContext , HostPtr, &ZeMemoryAllocationProperties,
3429
+ &ZeDeviceHandle));
3430
+
3431
+ // If not shared of any type, we can import the ptr
3432
+ if (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_UNKNOWN) {
3433
+ // Promote the host ptr to USM host memory
3434
+ ze_driver_handle_t driverHandle = Context->Devices [0 ]->Platform ->ZeDriver ;
3435
+ ZeUSMImport.doZeUSMImport (driverHandle, HostPtr, Size );
3436
+ HostPtrImported = true ;
3437
+ }
3438
+ }
3439
+
3440
+ pi_result Result;
3364
3441
if (DeviceIsIntegrated) {
3365
- if (enableBufferPooling ()) {
3366
- PI_CALL (piextUSMHostAlloc (&Ptr , Context, nullptr , Size , Alignment));
3367
- } else
3368
- Result = ZeHostMemAllocHelper (&Ptr , Context, Size );
3442
+ if (HostPtrImported) {
3443
+ // When HostPtr is imported we use it for the buffer.
3444
+ Ptr = HostPtr;
3445
+ } else {
3446
+ if (enableBufferPooling ()) {
3447
+ PI_CALL (piextUSMHostAlloc (&Ptr , Context, nullptr , Size , Alignment));
3448
+ } else {
3449
+ Result = ZeHostMemAllocHelper (&Ptr , Context, Size );
3450
+ }
3451
+ }
3369
3452
} else if (Context->SingleRootDevice ) {
3370
3453
// If we have a single discrete device or all devices in the context are
3371
3454
// sub-devices of the same device then we can allocate on device
3372
3455
if (enableBufferPooling ()) {
3373
3456
PI_CALL (piextUSMDeviceAlloc (&Ptr , Context, Context->SingleRootDevice ,
3374
3457
nullptr , Size , Alignment));
3375
- } else
3458
+ } else {
3376
3459
Result = ZeDeviceMemAllocHelper (&Ptr , Context, Context->SingleRootDevice ,
3377
3460
Size );
3461
+ }
3378
3462
} else {
3379
3463
// Context with several gpu cards. Temporarily use host allocation because
3380
3464
// it is accessible by all devices. But it is not good in terms of
3381
3465
// performance.
3382
3466
// TODO: We need to either allow remote access to device memory using IPC,
3383
3467
// or do explicit memory transfers from one device to another using host
3384
3468
// resources as backing buffers to allow those transfers.
3385
- if (enableBufferPooling ()) {
3386
- PI_CALL (piextUSMHostAlloc (&Ptr , Context, nullptr , Size , Alignment));
3387
- } else
3388
- Result = ZeHostMemAllocHelper (&Ptr , Context, Size );
3469
+ if (HostPtrImported) {
3470
+ // When HostPtr is imported we use it for the buffer.
3471
+ Ptr = HostPtr;
3472
+ } else {
3473
+ if (enableBufferPooling ()) {
3474
+ PI_CALL (piextUSMHostAlloc (&Ptr , Context, nullptr , Size , Alignment));
3475
+ } else {
3476
+ Result = ZeHostMemAllocHelper (&Ptr , Context, Size );
3477
+ }
3478
+ }
3389
3479
}
3390
3480
3391
3481
if (Result != PI_SUCCESS)
@@ -3396,8 +3486,10 @@ pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size,
3396
3486
(Flags & PI_MEM_FLAGS_HOST_PTR_COPY) != 0 ) {
3397
3487
// Initialize the buffer with user data
3398
3488
if (DeviceIsIntegrated) {
3399
- // Do a host to host copy
3400
- memcpy (Ptr , HostPtr, Size );
3489
+ // Do a host to host copy.
3490
+ // For an imported HostPtr the copy is unneeded.
3491
+ if (!HostPtrImported)
3492
+ memcpy (Ptr , HostPtr, Size );
3401
3493
} else if (Context->SingleRootDevice ) {
3402
3494
// Initialize the buffer synchronously with immediate offload
3403
3495
ZE_CALL (zeCommandListAppendMemoryCopy,
@@ -3406,7 +3498,9 @@ pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size,
3406
3498
} else {
3407
3499
// Multiple root devices, do a host to host copy because we use a host
3408
3500
// allocation for this case.
3409
- memcpy (Ptr , HostPtr, Size );
3501
+ // For an imported HostPtr the copy is unneeded.
3502
+ if (!HostPtrImported)
3503
+ memcpy (Ptr , HostPtr, Size );
3410
3504
}
3411
3505
} else if (Flags == 0 || (Flags == PI_MEM_FLAGS_ACCESS_RW)) {
3412
3506
// Nothing more to do.
@@ -3421,7 +3515,7 @@ pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size,
3421
3515
*RetMem = new _pi_buffer (
3422
3516
Context, pi_cast<char *>(Ptr ) /* Level Zero Memory Handle */ ,
3423
3517
HostPtrOrNull, nullptr , 0 , 0 ,
3424
- DeviceIsIntegrated /* allocation in host memory */ );
3518
+ DeviceIsIntegrated /* allocation in host memory */ , HostPtrImported );
3425
3519
} catch (const std::bad_alloc &) {
3426
3520
return PI_OUT_OF_HOST_MEMORY;
3427
3521
} catch (...) {
@@ -3491,11 +3585,17 @@ pi_result piMemRelease(pi_mem Mem) {
3491
3585
} else {
3492
3586
auto Buf = static_cast <_pi_buffer *>(Mem);
3493
3587
if (!Buf->isSubBuffer ()) {
3494
- if (enableBufferPooling ()) {
3495
- PI_CALL (piextUSMFree (Mem->Context , Mem->getZeHandle ()));
3588
+ if (Mem->HostPtrImported ) {
3589
+ ze_driver_handle_t driverHandle =
3590
+ Mem->Context ->Devices [0 ]->Platform ->ZeDriver ;
3591
+ ZeUSMImport.doZeUSMRelease (driverHandle, Mem->MapHostPtr );
3496
3592
} else {
3497
- if (auto Res = ZeMemFreeHelper (Mem->Context , Mem->getZeHandle ()))
3498
- return Res;
3593
+ if (enableBufferPooling ()) {
3594
+ PI_CALL (piextUSMFree (Mem->Context , Mem->getZeHandle ()));
3595
+ } else {
3596
+ if (auto Res = ZeMemFreeHelper (Mem->Context , Mem->getZeHandle ()))
3597
+ return Res;
3598
+ }
3499
3599
}
3500
3600
}
3501
3601
}
@@ -6020,7 +6120,8 @@ pi_result piEnqueueMemBufferMap(pi_queue Queue, pi_mem Buffer,
6020
6120
6021
6121
if (Buffer->MapHostPtr ) {
6022
6122
*RetMap = Buffer->MapHostPtr + Offset;
6023
- if (!(MapFlags & PI_MAP_WRITE_INVALIDATE_REGION))
6123
+ if (!Buffer->HostPtrImported &&
6124
+ !(MapFlags & PI_MAP_WRITE_INVALIDATE_REGION))
6024
6125
memcpy (*RetMap, pi_cast<char *>(Buffer->getZeHandle ()) + Offset, Size );
6025
6126
} else {
6026
6127
*RetMap = pi_cast<char *>(Buffer->getZeHandle ()) + Offset;
0 commit comments