diff --git a/bin/hipify-perl b/bin/hipify-perl index 82c0923c..703e1530 100755 --- a/bin/hipify-perl +++ b/bin/hipify-perl @@ -6236,6 +6236,7 @@ sub simpleSubstitutions { subst("__bfloat1622float2", "__bfloat1622float2", "device_function"); subst("__bfloat162bfloat162", "__bfloat162bfloat162", "device_function"); subst("__bfloat162float", "__bfloat162float", "device_function"); + subst("__bfloat16_as_short", "__bfloat16_as_short", "device_function"); subst("__brev", "__brev", "device_function"); subst("__brevll", "__brevll", "device_function"); subst("__byte_perm", "__byte_perm", "device_function"); @@ -6353,6 +6354,7 @@ sub simpleSubstitutions { subst("__half2ushort_rz", "__half2ushort_rz", "device_function"); subst("__half_as_short", "__half_as_short", "device_function"); subst("__half_as_ushort", "__half_as_ushort", "device_function"); + subst("__halves2bfloat162", "__halves2bfloat162", "device_function"); subst("__halves2half2", "__halves2half2", "device_function"); subst("__hbeq2", "__hbeq2", "device_function"); subst("__hbequ2", "__hbequ2", "device_function"); @@ -6384,6 +6386,7 @@ sub simpleSubstitutions { subst("__hgtu", "__hgtu", "device_function"); subst("__hgtu2", "__hgtu2", "device_function"); subst("__high2bfloat16", "__high2bfloat16", "device_function"); + subst("__high2bfloat162", "__high2bfloat162", "device_function"); subst("__high2float", "__high2float", "device_function"); subst("__high2half", "__high2half", "device_function"); subst("__high2half2", "__high2half2", "device_function"); @@ -6402,8 +6405,10 @@ sub simpleSubstitutions { subst("__hltu", "__hltu", "device_function"); subst("__hltu2", "__hltu2", "device_function"); subst("__hmax", "__hmax", "device_function"); + subst("__hmax2", "__hmax2", "device_function"); subst("__hmax_nan", "__hmax_nan", "device_function"); subst("__hmin", "__hmin", "device_function"); + subst("__hmin2", "__hmin2", "device_function"); subst("__hmin_nan", "__hmin_nan", "device_function"); subst("__hmul", "__hmul", "device_function"); subst("__hmul2", "__hmul2", "device_function"); @@ -6449,6 +6454,8 @@ sub simpleSubstitutions { subst("__log2f", "__log2f", "device_function"); subst("__logf", "__logf", "device_function"); subst("__longlong_as_double", "__longlong_as_double", "device_function"); + subst("__low2bfloat16", "__low2bfloat16", "device_function"); + subst("__low2bfloat162", "__low2bfloat162", "device_function"); subst("__low2float", "__low2float", "device_function"); subst("__low2half", "__low2half", "device_function"); subst("__low2half2", "__low2half2", "device_function"); @@ -6687,6 +6694,7 @@ sub simpleSubstitutions { subst("lrintf", "lrintf", "device_function"); subst("lround", "lround", "device_function"); subst("lroundf", "lroundf", "device_function"); + subst("make_half2", "make_half2", "device_function"); subst("max", "max", "device_function"); subst("min", "min", "device_function"); subst("modf", "modf", "device_function"); @@ -9114,6 +9122,7 @@ sub transformHostFunctions { "modf", "min", "max", + "make_half2", "lroundf", "lround", "lrintf", @@ -9352,6 +9361,8 @@ sub transformHostFunctions { "__low2half2", "__low2half", "__low2float", + "__low2bfloat162", + "__low2bfloat16", "__longlong_as_double", "__logf", "__log2f", @@ -9397,8 +9408,10 @@ sub transformHostFunctions { "__hmul2", "__hmul", "__hmin_nan", + "__hmin2", "__hmin", "__hmax_nan", + "__hmax2", "__hmax", "__hltu2", "__hltu", @@ -9417,6 +9430,7 @@ sub transformHostFunctions { "__high2half2", "__high2half", "__high2float", + "__high2bfloat162", "__high2bfloat16", "__hgtu2", "__hgtu", @@ -9448,6 +9462,7 @@ sub transformHostFunctions { "__hbequ2", "__hbeq2", "__halves2half2", + "__halves2bfloat162", "__half_as_ushort", "__half_as_short", "__half2ushort_rz", @@ -9565,6 +9580,7 @@ sub transformHostFunctions { "__byte_perm", "__brevll", "__brev", + "__bfloat16_as_short", "__bfloat162float", "__bfloat162bfloat162", "__bfloat1622float2", @@ -9606,7 +9622,6 @@ sub countSupportedDeviceFunctions { "mulhi", "mul64hi", "mul24", - "make_half2", "make_bfloat162", "llmin", "llmax", @@ -9729,8 +9744,6 @@ sub countSupportedDeviceFunctions { "__pm2", "__pm1", "__pm0", - "__low2bfloat162", - "__low2bfloat16", "__ll2bfloat16_rz", "__ll2bfloat16_ru", "__ll2bfloat16_rn", @@ -9754,14 +9767,11 @@ sub countSupportedDeviceFunctions { "__hmul_rn", "__hmul2_rn", "__hmin2_nan", - "__hmin2", "__hmax2_nan", - "__hmax2", "__hltu2_mask", "__hlt2_mask", "__hleu2_mask", "__hle2_mask", - "__high2bfloat162", "__hgtu2_mask", "__hgt2_mask", "__hgeu2_mask", @@ -9771,7 +9781,6 @@ sub countSupportedDeviceFunctions { "__hequ2_mask", "__heq2_mask", "__hcmadd", - "__halves2bfloat162", "__half2uchar_rz", "__half2char_rz", "__hadd_rn", @@ -9831,7 +9840,6 @@ sub countSupportedDeviceFunctions { "__dadd_rd", "__brkpt", "__bfloat16_as_ushort", - "__bfloat16_as_short", "__bfloat162ushort_rz", "__bfloat162ushort_ru", "__bfloat162ushort_rn", diff --git a/docs/tables/CUDA_Device_API_supported_by_HIP.md b/docs/tables/CUDA_Device_API_supported_by_HIP.md index d63ea033..5e70cb82 100644 --- a/docs/tables/CUDA_Device_API_supported_by_HIP.md +++ b/docs/tables/CUDA_Device_API_supported_by_HIP.md @@ -47,7 +47,7 @@ |`__bfloat162ushort_rn`|11.0| | | | | | | | | | |`__bfloat162ushort_ru`|11.0| | | | | | | | | | |`__bfloat162ushort_rz`|11.0| | | | | | | | | | -|`__bfloat16_as_short`|11.0| | | | | | | | | | +|`__bfloat16_as_short`|11.0| | | |`__bfloat16_as_short`|5.7.0| | | | | |`__bfloat16_as_ushort`|11.0| | | | | | | | | | |`__brev`| | | | |`__brev`|1.6.0| | | | | |`__brevll`| | | | |`__brevll`|1.6.0| | | | | @@ -224,7 +224,7 @@ |`__half2ushort_rz`| | | | |`__half2ushort_rz`|1.6.0| | | | | |`__half_as_short`| | | | |`__half_as_short`|1.6.0| | | | | |`__half_as_ushort`| | | | |`__half_as_ushort`|1.6.0| | | | | -|`__halves2bfloat162`|11.0| | | | | | | | | | +|`__halves2bfloat162`|11.0| | | |`__halves2bfloat162`|5.7.0| | | | | |`__halves2half2`| | | | |`__halves2half2`|1.6.0| | | | | |`__hbeq2`| | | | |`__hbeq2`|1.6.0| | | | | |`__hbequ2`| | | | |`__hbequ2`|1.9.0| | | | | @@ -265,7 +265,7 @@ |`__hgtu2`| | | | |`__hgtu2`|1.9.0| | | | | |`__hgtu2_mask`|12.0| | | | | | | | | | |`__high2bfloat16`|11.0| | | |`__high2bfloat16`|5.7.0| | | | | -|`__high2bfloat162`|11.0| | | | | | | | | | +|`__high2bfloat162`|11.0| | | |`__high2bfloat162`|5.7.0| | | | | |`__high2float`| | | | |`__high2float`|1.6.0| | | | | |`__high2half`| | | | |`__high2half`|1.6.0| | | | | |`__high2half2`| | | | |`__high2half2`|1.6.0| | | | | @@ -288,11 +288,11 @@ |`__hltu2`| | | | |`__hltu2`|1.9.0| | | | | |`__hltu2_mask`|12.0| | | | | | | | | | |`__hmax`|11.0| | | |`__hmax`|5.5.0| | | | | -|`__hmax2`|11.0| | | | | | | | | | +|`__hmax2`|11.0| | | |`__hmax2`|5.7.0| | | | | |`__hmax2_nan`|11.0| | | | | | | | | | |`__hmax_nan`|11.0| | | |`__hmax_nan`|5.5.0| | | | | |`__hmin`|11.0| | | |`__hmin`|5.5.0| | | | | -|`__hmin2`|11.0| | | | | | | | | | +|`__hmin2`|11.0| | | |`__hmin2`|5.7.0| | | | | |`__hmin2_nan`|11.0| | | | | | | | | | |`__hmin_nan`|11.0| | | |`__hmin_nan`|5.5.0| | | | | |`__hmul`| | | | |`__hmul`|1.6.0| | | | | @@ -361,8 +361,8 @@ |`__log2f`| | | | |`__log2f`|1.6.0| | | | | |`__logf`| | | | |`__logf`|1.6.0| | | | | |`__longlong_as_double`| | | | |`__longlong_as_double`|1.6.0| | | | | -|`__low2bfloat16`|11.0| | | | | | | | | | -|`__low2bfloat162`|11.0| | | | | | | | | | +|`__low2bfloat16`|11.0| | | |`__low2bfloat16`|5.7.0| | | | | +|`__low2bfloat162`|11.0| | | |`__low2bfloat162`|5.7.0| | | | | |`__low2float`| | | | |`__low2float`|1.6.0| | | | | |`__low2half`| | | | |`__low2half`|1.6.0| | | | | |`__low2half2`| | | | |`__low2half2`|1.6.0| | | | | @@ -723,7 +723,7 @@ |`lround`| | | | |`lround`|1.6.0| | | | | |`lroundf`| | | | |`lroundf`|1.6.0| | | | | |`make_bfloat162`|12.2| | | | | | | | | | -|`make_half2`|12.2| | | | | | | | | | +|`make_half2`|12.2| | | |`make_half2`|4.5.0| | | | | |`max`| | | | |`max`|1.6.0| | | | | |`min`| | | | |`min`|1.6.0| | | | | |`modf`| | | | |`modf`|1.9.0| | | | | diff --git a/src/CUDA2HIP_Device_functions.cpp b/src/CUDA2HIP_Device_functions.cpp index 84090a9b..5ed4739c 100644 --- a/src/CUDA2HIP_Device_functions.cpp +++ b/src/CUDA2HIP_Device_functions.cpp @@ -687,11 +687,11 @@ const std::map CUDA_DEVICE_FUNCTION_MAP { {"__double2half", {"__double2half", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, {"__hmax", {"__hmax", "", CONV_DEVICE_FUNC, API_RUNTIME, 1}}, {"__hmax_nan", {"__hmax_nan", "", CONV_DEVICE_FUNC, API_RUNTIME, 1}}, - {"__hmax2", {"__hmax2", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, + {"__hmax2", {"__hmax2", "", CONV_DEVICE_FUNC, API_RUNTIME, 1}}, {"__hmax2_nan", {"__hmax2_nan", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, {"__hmin", {"__hmin", "", CONV_DEVICE_FUNC, API_RUNTIME, 1}}, {"__hmin_nan", {"__hmin_nan", "", CONV_DEVICE_FUNC, API_RUNTIME, 1}}, - {"__hmin2", {"__hmin2", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, + {"__hmin2", {"__hmin2", "", CONV_DEVICE_FUNC, API_RUNTIME, 1}}, {"__hmin2_nan", {"__hmin2_nan", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, {"__stwb", {"__stwb", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, {"__stcg", {"__stcg", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, @@ -710,7 +710,7 @@ const std::map CUDA_DEVICE_FUNCTION_MAP { {"__hgeu2_mask", {"__hgeu2_mask", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, {"__hltu2_mask", {"__hltu2_mask", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, {"__hgtu2_mask", {"__hgtu2_mask", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, - {"make_half2", {"make_half2", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, + {"make_half2", {"make_half2", "", CONV_DEVICE_FUNC, API_RUNTIME, 1}}, {"__half2char_rz", {"__half2char_rz", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, {"__half2uchar_rz", {"__half2uchar_rz", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, // bfp16 functions @@ -776,11 +776,11 @@ const std::map CUDA_DEVICE_FUNCTION_MAP { {"__lows2bfloat162", {"__lows2bfloat162", "", CONV_DEVICE_FUNC, API_RUNTIME, 1}}, {"__highs2bfloat162", {"__highs2bfloat162", "", CONV_DEVICE_FUNC, API_RUNTIME, 1}}, {"__high2bfloat16", {"__high2bfloat16", "", CONV_DEVICE_FUNC, API_RUNTIME, 1}}, - {"__low2bfloat16", {"__low2bfloat16", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, - {"__halves2bfloat162", {"__halves2bfloat162", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, - {"__low2bfloat162", {"__low2bfloat162", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, - {"__high2bfloat162", {"__high2bfloat162", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, - {"__bfloat16_as_short", {"__bfloat16_as_short", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, + {"__low2bfloat16", {"__low2bfloat16", "", CONV_DEVICE_FUNC, API_RUNTIME, 1}}, + {"__halves2bfloat162", {"__halves2bfloat162", "", CONV_DEVICE_FUNC, API_RUNTIME, 1}}, + {"__low2bfloat162", {"__low2bfloat162", "", CONV_DEVICE_FUNC, API_RUNTIME, 1}}, + {"__high2bfloat162", {"__high2bfloat162", "", CONV_DEVICE_FUNC, API_RUNTIME, 1}}, + {"__bfloat16_as_short", {"__bfloat16_as_short", "", CONV_DEVICE_FUNC, API_RUNTIME, 1}}, {"__bfloat16_as_ushort", {"__bfloat16_as_ushort", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, {"__short_as_bfloat16", {"__short_as_bfloat16", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, {"__ushort_as_bfloat16", {"__ushort_as_bfloat16", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, @@ -1516,6 +1516,14 @@ const std::map HIP_DEVICE_FUNCTION_VER_MAP { {"__lows2bfloat162", {HIP_5070, HIP_0, HIP_0 }}, {"__highs2bfloat162", {HIP_5070, HIP_0, HIP_0 }}, {"__high2bfloat16", {HIP_5070, HIP_0, HIP_0 }}, + {"__hmin2", {HIP_5070, HIP_0, HIP_0 }}, + {"__hmax2", {HIP_5070, HIP_0, HIP_0 }}, + {"make_half2", {HIP_4050, HIP_0, HIP_0 }}, + {"__low2bfloat16", {HIP_5070, HIP_0, HIP_0 }}, + {"__halves2bfloat162", {HIP_5070, HIP_0, HIP_0 }}, + {"__low2bfloat162", {HIP_5070, HIP_0, HIP_0 }}, + {"__high2bfloat162", {HIP_5070, HIP_0, HIP_0 }}, + {"__bfloat16_as_short", {HIP_5070, HIP_0, HIP_0 }}, }; const std::map CUDA_DEVICE_FUNCTION_API_SECTION_MAP { diff --git a/tests/unit_tests/synthetic/libraries/cudevice2hipdevice.cu b/tests/unit_tests/synthetic/libraries/cudevice2hipdevice.cu index 48885a8f..f6fcc756 100644 --- a/tests/unit_tests/synthetic/libraries/cudevice2hipdevice.cu +++ b/tests/unit_tests/synthetic/libraries/cudevice2hipdevice.cu @@ -15,14 +15,22 @@ int main() { double dx = 0.0f; float fa = 0.0f; float fx = 0.0f; + short int shi = 0; double2 d2 = { 0.0f, 0.0f }; float2 f2 = { 0.0f, 0.0f }; + __half hx = { 0.0f }; + __half hy = { 0.0f }; + __half2 h2 = { 0.0f, 0.0f }; __half_raw hrx = { 0 }; __half2_raw h2rx = { 0, 0 }; #if CUDA_VERSION >= 11000 - // CHECK: __hip_bfloat16 bf16 = { 0 }; - __nv_bfloat16 bf16 = { 0 }; + // CHECK: __hip_bfloat16 bf16 = { 0.0f }; + // CHECK-NEXT: __hip_bfloat16 bf16a = { 0.0f }; + // CHECK-NEXT: __hip_bfloat16 bf16b = { 0.0f }; + __nv_bfloat16 bf16 = { 0.0f }; + __nv_bfloat16 bf16a = { 0.0f }; + __nv_bfloat16 bf16b = { 0.0f }; // CHECK: __hip_bfloat16_raw bf16r = { 0 }; __nv_bfloat16_raw bf16r = { 0 }; @@ -56,6 +64,41 @@ int main() { // HIP: __BF16_HOST_DEVICE_STATIC__ float2 __bfloat1622float2(const __hip_bfloat162 a); // CHECK: f2 = __bfloat1622float2(bf162); f2 = __bfloat1622float2(bf162); + + // CUDA: __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmax2(const __nv_bfloat162 a, const __nv_bfloat162 b); + // HIP: __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hmax2(const __hip_bfloat162 a, const __hip_bfloat162 b); + // CHECK: bf162 = __hmax2(bf162a, bf162b); + bf162 = __hmax2(bf162a, bf162b); + + // CUDA: __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmin2(const __nv_bfloat162 a, const __nv_bfloat162 b); + // HIP: __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hmin2(const __hip_bfloat162 a, const __hip_bfloat162 b); + // CHECK: bf162 = __hmin2(bf162a, bf162b); + bf162 = __hmin2(bf162a, bf162b); + + // CUDA: __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __low2bfloat16(const __nv_bfloat162 a); + // HIP: __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __low2bfloat16(const __hip_bfloat162 a); + // CHECK: bf16 = __low2bfloat16(bf162a); + bf16 = __low2bfloat16(bf162a); + + // CUDA: __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __halves2bfloat162(const __nv_bfloat16 a, const __nv_bfloat16 b); + // HIP: __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __halves2bfloat162(const __hip_bfloat16 a, const __hip_bfloat16 b); + // CHECK: bf162 = __halves2bfloat162(bf16a, bf16b); + bf162 = __halves2bfloat162(bf16a, bf16b); + + // CUDA: __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __low2bfloat162(const __nv_bfloat162 a); + // HIP: __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __low2bfloat162(const __hip_bfloat162 a); + // CHECK: bf162 = __low2bfloat162(bf162a); + bf162 = __low2bfloat162(bf162a); + + // CUDA: __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __high2bfloat162(const __nv_bfloat162 a); + // HIP: __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __high2bfloat162(const __hip_bfloat162 a); + // CHECK: bf162 = __high2bfloat162(bf162a); + bf162 = __high2bfloat162(bf162a); + + // CUDA: __CUDA_HOSTDEVICE_BF16_DECL__ short int __bfloat16_as_short(const __nv_bfloat16 h); + // HIP: __BF16_HOST_DEVICE_STATIC__ short int __bfloat16_as_short(const __hip_bfloat16 h); + // CHECK: shi = __bfloat16_as_short(bf16); + shi = __bfloat16_as_short(bf16); #endif #if CUDA_VERSION >= 11080 @@ -146,10 +189,17 @@ int main() { hrx = __nv_cvt_fp8_to_halfraw(fp8_storage_t, fp8_interpretation_t); // CUDA: __CUDA_HOSTDEVICE_FP8_DECL__ __half2_raw __nv_cvt_fp8x2_to_halfraw2(const __nv_fp8x2_storage_t x, const __nv_fp8_interpretation_t fp8_interpretation); - // HIP: __FP8_HOST_DEVICE_STATIC__ __half2_raw __hip_cvt_fp8x2_to_halfraw2(const __hip_fp8x2_storage_t x, const __hip_fp8_interpretation_t type); + // HIP: __FP8_HOST_DEVICE_STATIC__ __half2_raw __hip_cvt_fp8x2_to_halfraw2(const __hip_fp8x2_storage_t x, const __hip_fp8_interpretation_t type); // CHECK: h2rx = __hip_cvt_fp8x2_to_halfraw2(fp8x2_storage_t, fp8_interpretation_t); h2rx = __nv_cvt_fp8x2_to_halfraw2(fp8x2_storage_t, fp8_interpretation_t); #endif +#if CUDA_VERSION >= 12020 + // CUDA: __CUDA_HOSTDEVICE_FP16_DECL__ __half2 make_half2(const __half x, const __half y); + // HIP: __HOST_DEVICE__ __half2 make_half2(__half x, __half y); + // CHECK: h2 = make_half2(hx, hy); + h2 = make_half2(hx, hy); +#endif + return 0; }