diff --git a/docs/Device_API_functions.md b/docs/Device_API_functions.md new file mode 100644 index 000000000..19eddbb7a --- /dev/null +++ b/docs/Device_API_functions.md @@ -0,0 +1,504 @@ + +# List of CUDA/HIP device-side functions supported by CHIP-SPV + +## Double precision intrinsics + +| **CUDA** | **HIP** | **CHIP-SPV**| +|-----------------------------------------------------------|-----------------------------------|:------------:| +| double \_\_dadd_rd ( double x, double y ) | N | N | +| double \_\_dadd_rn ( double x, double y ) | N | N | +| double \_\_dadd_ru ( double x, double y ) | N | N | +| double \_\_dadd_rz ( double x, double y ) | N | N | +| double \_\_ddiv_rd ( double x, double y ) | N | N | +| double \_\_ddiv_rn ( double x, double y ) | N | N | +| double \_\_ddiv_ru ( double x, double y ) | N | N | +| double \_\_ddiv_rz ( double x, double y ) | N | N | +| double \_\_dmul_rd ( double x, double y ) | N | N | +| double \_\_dmul_rn ( double x, double y ) | N | N | +| double \_\_dmul_ru ( double x, double y ) | N | N | +| double \_\_dmul_rz ( double x, double y ) | N | N | +| double \_\_drcp_rd ( double x ) | N | N | +| double \_\_drcp_rn ( double x ) | N | N | +| double \_\_drcp_ru ( double x ) | N | N | +| double \_\_drcp_rz ( double x ) | N | N | +| double \_\_dsqrt_rd ( double x ) | N | N | +| double \_\_dsqrt_rn ( double x ) | Y | N | +| double \_\_dsqrt_ru ( double x ) | N | N | +| double \_\_dsqrt_rz ( double x ) | N | N | +| double \_\_dsub_rd ( double x, double y ) | N | N | +| double \_\_dsub_rn ( double x, double y ) | N | N | +| double \_\_dsub_ru ( double x, double y ) | N | N | +| double \_\_dsub_rz ( double x, double y ) | N | N | +| double \_\_fma_rd ( double x, double y, double z ) | N | N | +| double \_\_fma_rn ( double x, double y, double z ) | N | N | +| double \_\_fma_ru ( double x, double y, double z ) | N | N | +| double \_\_fma_rz ( double x, double y, double z ) | N | N | + +## Double precision math library + +| **CUDA** | **HIP** | **CHIP-SPV**| +|-----------------------------------------------------------|-----------------------------------|:----------------:| +| double acos(double x) | Y | Y | +| double acosh ( double x ) | Y | Y | +| double asin ( double x ) | Y | Y | +| double asinh ( double x ) | Y | Y | +| double atan2 ( double y, double x ) | Y | Y | +| double atan ( double x ) | Y | Y | +| double atanh ( double x ) | Y | Y | +| double cbrt ( double x ) | Y | Y | +| double ceil ( double x ) | Y | Y | +| long convert_long(double x); | Y | Y | +| double copysign ( double x, double y ) | Y | Y | +| double cos ( double x ) | Y | Y | +| double cosh ( double x ) | Y | Y | +| double cospi ( double x ) | Y | Y | +| double cyl_bessel_i0 ( double x ) | Y | Y | +| double cyl_bessel_i1 ( double x ) | Y | Y | +| double erfc ( double x ) | Y | Y | +| double erfcinv ( double x ) | Y | Y | +| double erfcx ( double x ) | Y | Y | +| double erf ( double x ) | Y | Y | +| double erfinv ( double x ) | Y | Y | +| double exp10 ( double x ) | Y | Y | +| double exp2 ( double x ) | Y | Y | +| double exp ( double x ) | Y | Y | +| double expm1 ( double x ) | Y | Y | +| double fabs ( double x ) | Y | Y | +| double fdim ( double x, double y ) | Y | Y | +| double floor ( double x ) | Y | Y | +| double fma ( double x, double y, double z ) | Y | Y | +| double fmax ( double , double ) | Y | Y | +| double fmin ( double x, double y ) | Y | Y | +| double fmod ( double x, double y ) | Y | Y | +| double frexp ( double x, int* nptr ) | Y | Y | +| double hypot ( double x, double y ) | Y | Y | +| int ilogb ( double x ) | Y | Y | +| bool isfinite ( double a ) | Y | Y | +| bool isinf ( double a ) | Y | Y | +| bool isnan ( double a ) | Y | Y | +| double j0 ( double x ) | Y | Y | +| double j1 ( double x ) | Y | Y | +| double jn ( int n, double x ) | Y | Y | +| double ldexp ( double x, int exp ) | Y | Y | +| double lgamma(double x); | Y | Y | +| double log10 ( double x ) | Y | Y | +| double log1p ( double x ) | Y | Y | +| double log2 ( double x ) | Y | Y | +| double logb ( double x ) | Y | Y | +| double log ( double x ) | Y | Y | +| long lrint(double x) { | Y | Y | +| long lround(double x) { | Y | Y | +| long llrint(double x) { return lrint(x); } | Y | Y | +| long llround(double x) { return lround(x); } | Y | Y | +| double max ( const double a, const double b ) | Y | Y | +| double max ( const double a, const float b ) | Y | Y | +| double max ( const float a, const double b ) | Y | Y | +| double min ( const double a, const double b ) | Y | Y | +| double min ( const double a, const float b ) | Y | Y | +| double min ( const float a, const double b ) | Y | Y | +| double modf ( double x, double* iptr ) | Y | Y | +| double nan ( const char* tagp ) | Y | Y | +| double nearbyint ( double x ) | Y | Y | +| double nextafter ( double x, double y ) | Y | Y | +| double norm3d ( double a, double b, double c ) | Y | Y | +| double norm4d ( double a, double b, double c, double d) | Y | Y | +| double normcdf ( double x ) | Y | Y | +| double normcdfinv ( double x ) | Y | Y | +| double norm ( int dim, const double* p ) | Y | Y | +| double pow ( double x, double y ) | Y | Y | +| double rcbrt ( double x ) | Y | Y | +| double remainder ( double x, double y ) | Y | Y | +| double remquo ( double x, double y, int* quo ) | Y | Y | +| double rhypot ( double x, double y ) | Y | Y | +| double rint(double x); | Y | Y | +| double rnorm3d(double a, double b, double c); | Y | Y | +| double rnorm4d(double a, double b, double c, double d); | Y | Y | +| double rnorm ( int dim, const double* p ) | Y | Y | +| double round(double x); | Y | Y | +| double rsqrt ( double x ) | Y | Y | +| double scalbln ( double x, long int n ) | Y | Y | +| double scalbn ( double x, int n ) | Y | Y | +| bool signbit ( double a ) | Y | Y | +| void sincos ( double x, double* sptr, double* cptr ) | Y | Y | +| void sincospi ( double x, double* sptr, double* cptr ) | Y | Y | +| double sin ( double x ) | Y | Y | +| double sinh ( double x ) | Y | Y | +| double sinpi ( double x ) | Y | Y | +| double sqrt ( double x ) | Y | Y | +| double tan ( double x ) | Y | Y | +| double tanh ( double x ) | Y | Y | +| double tgamma ( double x ) | Y | Y | +| double trunc ( double x ) | Y | Y | +| double y0 ( double x ) | Y | Y | +| double y1 ( double x ) | Y | Y | +| double yn ( int n, double x ) | Y | Y | + +## Single precision intrinsics + +| **CUDA** | **HIP** | **CHIP-SPV**| +|-----------------------------------------------------------|-----------------------------------|:----------------:| +| \_\_cosf ( float x ) | Y | N | +| \_\_sincosf ( float x, float* sptr, float* cptr ) | N | Y | +| \_\_exp10f ( float x ) | N | N | +| \_\_expf ( float x ) | Y | N | +| \_\_fadd_rd ( float x, float y ) | N | N | +| \_\_fadd_rn ( float x, float y ) | N | N | +| \_\_fadd_ru ( float x, float y ) | N | N | +| \_\_fadd_rz ( float x, float y ) | N | N | +| \_\_fdividef ( float x, float y ) | N | N | +| \_\_fdiv_rd ( float x, float y ) | N | N | +| \_\_fdiv_rn ( float x, float y ) | N | N | +| \_\_fdiv_ru ( float x, float y ) | N | N | +| \_\_fdiv_rz ( float x, float y ) | N | N | +| \_\_fmaf_ieee_rd ( float x, float y, float z ) | N | N | +| \_\_fmaf_ieee_rn ( float x, float y, float z ) | N | N | +| \_\_fmaf_ieee_ru ( float x, float y, float z ) | N | N | +| \_\_fmaf_ieee_rz ( float x, float y, float z ) | N | N | +| \_\_fmaf_rd ( float x, float y, float z ) | N | N | +| \_\_fmaf_rn ( float x, float y, float z ) | N | N | +| \_\_fmaf_ru ( float x, float y, float z ) | N | N | +| \_\_fmaf_rz ( float x, float y, float z ) | N | N | +| \_\_fmul_rd ( float x, float y ) | N | N | +| \_\_fmul_rn ( float x, float y ) | N | N | +| \_\_fmul_ru ( float x, float y ) | N | N | +| \_\_fmul_rz ( float x, float y ) | N | N | +| \_\_frcp_rd ( float x ) | N | N | +| \_\_frcp_rn ( float x ) | N | N | +| \_\_frcp_ru ( float x ) | N | N | +| \_\_frcp_rz ( float x ) | N | N | +| \_\_frsqrt_rn ( float x ) | Y | N | +| \_\_fsqrt_rd ( float x ) | N | N | +| \_\_fsqrt_rn ( float x ) | N | N | +| \_\_fsqrt_ru ( float x ) | N | N | +| \_\_fsqrt_rz ( float x ) | N | N | +| \_\_fsub_rd ( float x, float y ) | N | N | +| \_\_fsub_rn ( float x, float y ) | N | N | +| \_\_fsub_ru ( float x, float y ) | N | N | +| \_\_fsub_rz ( float x, float y ) | N | N | +| \_\_log10f ( float x ) | Y | N | +| \_\_log2f ( float x ) | Y | N | +| \_\_logf ( float x ) | Y | N | +| \_\_powf ( float x, float y ) | Y | Y | +| \_\_saturatef ( float x ) | N | N | +| \_\_sinf ( float x ) | Y | N | +| \_\_tanf ( float x ) | Y | N | + +## Single precision math library + +| **CUDA** | **HIP** | **CHIP-SPV**| +|-----------------------------------------------------------|-----------------------------------|:----------------:| +| float acosf(float x) | Y | Y | +| float acoshf ( float x ) | Y | Y | +| float asinf ( float x ) | Y | Y | +| float asinhf ( float x ) | Y | Y | +| float atan2f ( float y, float x ) | Y | Y | +| float atanf ( float x ) | Y | Y | +| float atanhf ( float x ) | Y | Y | +| float cbrtf ( float x ) | Y | Y | +| float ceilf ( float x ) | Y | Y | +| float copysignf ( float x, float y ) | Y | Y | +| long convert_long(float x); | Y | Y | +| float cosf ( float x ) | Y | Y | +| float coshf ( float x ) | Y | Y | +| float cospif ( float x ) | Y | Y | +| float cyl_bessel_i0f ( float x ) | Y | Y | +| float cyl_bessel_i1f ( float x ) | Y | Y | +| float erfcf ( float x ) | Y | Y | +| float erfcinvf ( float x ) | Y | Y | +| float erfcxf ( float x ) | Y | Y | +| float erff ( float x ) | Y | Y | +| float erfinvf ( float x ) | Y | Y | +| float exp10f ( float x ) | Y | Y | +| float exp2f ( float x ) | Y | Y | +| float expf ( float x ) | Y | Y | +| float expm1f ( float x ) | Y | Y | +| float fabsf ( float x ) | Y | Y | +| float fdimf ( float x, float y ) | Y | Y | +| float fdividef ( float x, float y ) | Y | Y | +| float floorf ( float x ) | Y | Y | +| float fmaf ( float x, float y, float z ) | Y | Y | +| float fmaxf ( float x, float y ) | Y | Y | +| float fminf ( float x, float y ) | Y | Y | +| float fmodf ( float x, float y ) | Y | Y | +| float frexpf ( float x, int* nptr ) | Y | Y | +| float hypotf ( float x, float y ) | Y | Y | +| int ilogbf ( float x ) | Y | Y | +| bool isfinite ( float a ) | Y | Y | +| bool isinf ( float a ) | Y | Y | +| bool isnan ( float a ) | Y | Y | +| float j0f ( float x ) | Y | Y | +| float j1f ( float x ) | Y | Y | +| float jnf ( int n, float x ) | Y | Y | +| float ldexpf ( float x, int exp ) | Y | Y | +| float lgammaf(float x) { return (lgamma(x)); }; | Y | Y | +| float lgamma(float x); | Y | Y | +| float log10f ( float x ) | Y | Y | +| float log1pf ( float x ) | Y | Y | +| float log2f ( float x ) | Y | Y | +| float logbf ( float x ) | Y | Y | +| float logf ( float x ) | Y | Y | +| long lrintf(float x) OK | Y | Y | +| long lroundf(float x) OK | Y | Y | +| long llrintf(float x) OK | Y | Y | +| long llroundf(float x) OK | Y | Y | +| float max ( const float a, const float b ) | Y | Y | +| float min ( const float a, const float b ) | Y | Y | +| float modff ( float x, float* iptr ) | Y | Y | +| float nanf ( const char* tagp ) | Y | Y | +| float nearbyintf ( float x ) | Y | Y | +| float nextafterf ( float x, float y ) | Y | Y | +| float norm3df ( float a, float b, float c ) | Y | Y | +| float norm4df ( float a, float b, float c, float d ) | Y | Y | +| float normcdff ( float x ) | Y | Y | +| float normcdfinvf ( float x ) | Y | Y | +| float normf ( int dim, const float* p ) | Y | Y | +| float powf ( float x, float y ) | Y | Y | +| float rcbrtf ( float x ) | Y | Y | +| float remainderf ( float x, float y ) | Y | Y | +| float remquof ( float x, float y, int* quo ) | Y | Y | +| float rhypotf ( float x, float y ) | Y | Y | +| float rintf(float x) { return rint(x); } | Y | Y | +| float rint(float x); | Y | Y | +| float rnorm3df(float a, float b, float c); | Y | Y | +| float rnorm4df(float a, float b, float c, float d); | Y | Y | +| float rnormf ( int dim, const float* p ) | Y | Y | +| float roundf(float x) { return round(x); } | Y | Y | +| float round(float x); | Y | Y | +| float rsqrtf ( float x ) | Y | Y | +| float scalblnf ( float x, long int n ) | Y | Y | +| float scalbnf ( float x, int n ) | Y | Y | +| bool signbit ( float a ) | Y | Y | +| float sinf ( float x ) | Y | Y | +| float sinhf ( float x ) | Y | Y | +| float sinpif ( float x ) | Y | Y | +| float sqrtf ( float x ) | Y | Y | +| float tanf ( float x ) | Y | Y | +| float tanhf ( float x ) | Y | Y | +| float tgammaf ( float x ) | Y | Y | +| float truncf ( float x ) | Y | Y | +| float y0f ( float x ) | Y | Y | +| float y1f ( float x ) | Y | Y | +| float ynf ( int n, float x ) | Y | Y | +| void sincosf ( float x, float* sptr, float* cptr ) | Y | Y | +| void sincospif ( float x, float* sptr, float* cptr ) | Y | Y | + +## Half precision intrinsics + math library + +| **CUDA** | **HIP** | **CHIP-SPV**| +|-----------------------------------------------------------|-----------------------------------|:----------------:| +| \_\_device\_\_ \_\_half \_\_habs | Y | Y | +| \_\_device\_\_ \_\_half \_\_hadd | Y | Y | +| \_\_device\_\_ \_\_half \_\_hadd_rn | Y | N | +| \_\_device\_\_ \_\_half \_\_hadd_sat | Y | Y | +| \_\_device\_\_ \_\_half \_\_hdiv | Y | Y | +| \_\_device\_\_ \_\_half \_\_hfma | Y | Y | +| \_\_device\_\_ \_\_half \_\_hfma_relu | Y | N | +| \_\_device\_\_ \_\_half \_\_hfma_sat | Y | Y | +| \_\_device\_\_ \_\_half \_\_hmul | Y | Y | +| \_\_device\_\_ \_\_half \_\_hmul_rn | Y | N | +| \_\_device\_\_ \_\_half \_\_hmul_sat | Y | Y | +| \_\_device\_\_ \_\_half \_\_hneg | Y | Y | +| \_\_device\_\_ \_\_half \_\_hsub | Y | Y | +| \_\_device\_\_ \_\_half \_\_hsub_rn | Y | N | +| \_\_device\_\_ \_\_half \_\_hsub_sat | Y | Y | +| \_\_device\_\_ \_\_half atomicAdd | Y | N | +| \_\_device\_\_ bool \_\_heq | Y | Y | +| \_\_device\_\_ bool \_\_hequ | Y | Y | +| \_\_device\_\_ bool \_\_hge | Y | Y | +| \_\_device\_\_ bool \_\_hgeu | Y | Y | +| \_\_device\_\_ bool \_\_hgt | Y | Y | +| \_\_device\_\_ bool \_\_hgtu | Y | Y | +| \_\_device\_\_ bool \_\_hne | Y | Y | +| \_\_device\_\_ bool \_\_hneu | Y | Y | +| \_\_device\_\_ bool \_\_hle | Y | Y | +| \_\_device\_\_ bool \_\_hleu | Y | Y | +| \_\_device\_\_ bool \_\_hlt | Y | Y | +| \_\_device\_\_ bool \_\_hltu | Y | Y | +| \_\_device\_\_ int \_\_hisinf | Y | Y | +| \_\_device\_\_ bool \_\_hisnan | Y | Y | +| \_\_device\_\_ \_\_half \_\_hmax | Y | Y | +| \_\_device\_\_ \_\_half \_\_hmax_nan | Y | N | +| \_\_device\_\_ \_\_half \_\_hmin | Y | Y | +| \_\_device\_\_ \_\_half \_\_hmin_nan | Y | N | +| \_\_host\_\_ \_\_device\_\_ \_\_half \_\_double2half | Y | N | +| \_\_host\_\_ \_\_device\_\_ \_\_half \_\_float2half | Y | Y | +| \_\_host\_\_ \_\_device\_\_ \_\_half \_\_float2half_rd | Y | Y | +| \_\_host\_\_ \_\_device\_\_ \_\_half \_\_float2half_rn | Y | Y | +| \_\_host\_\_ \_\_device\_\_ \_\_half \_\_float2half_ru | Y | Y | +| \_\_host\_\_ \_\_device\_\_ \_\_half \_\_float2half_rz | Y | Y | +| \_\_device\_\_ short int \_\_half_as_short | Y | Y | +| \_\_device\_\_ unsigned short int \_\_half_as_ushort | Y | Y | +| \_\_device\_\_ \_\_half \_\_int2half_rd | Y | Y | +| \_\_host\_\_ \_\_device\_\_ \_\_half \_\_int2half_rn | Y | Y | +| \_\_device\_\_ \_\_half \_\_int2half_ru | Y | Y | +| \_\_device\_\_ \_\_half \_\_int2half_rz | Y | Y | +| \_\_device\_\_ \_\_half \_\_ldca | Y | Y | +| \_\_device\_\_ \_\_half \_\_ldcs | Y | Y | +| \_\_device\_\_ \_\_half \_\_ldcv | Y | N | +| \_\_device\_\_ \_\_half \_\_ldg | Y | Y | +| \_\_device\_\_ \_\_half \_\_ldlu | Y | N | +| \_\_device\_\_ \_\_half \_\_ll2half_rd | Y | Y | +| \_\_host\_\_ \_\_device\_\_ \_\_half \_\_ll2half_rn | Y | Y | +| \_\_device\_\_ \_\_half \_\_ll2half_ru | Y | Y | +| \_\_device\_\_ \_\_half \_\_ll2half_rz | Y | Y | +| \_\_device\_\_ \_\_half \_\_shfl_down_sync | Y | N | +| \_\_device\_\_ \_\_half \_\_shfl_sync | Y | N | +| \_\_device\_\_ \_\_half \_\_shfl_up_sync | Y | N | +| \_\_device\_\_ \_\_half \_\_shfl_xor_sync | Y | N | +| \_\_device\_\_ \_\_half \_\_short2half_rd | Y | Y | +| \_\_host\_\_ \_\_device\_\_ \_\_half \_\_short2half_rn | Y | Y | +| \_\_device\_\_ \_\_half \_\_short2half_ru | Y | Y | +| \_\_device\_\_ \_\_half \_\_short2half_rz | Y | Y | +| \_\_device\_\_ \_\_half \_\_short_as_half | Y | Y | +| \_\_device\_\_ void \_\_stcg | Y | Y | +| \_\_device\_\_ void \_\_stcs | Y | Y | +| \_\_device\_\_ void \_\_stwb | Y | N | +| \_\_device\_\_ void \_\_stwt | Y | N | +| \_\_device\_\_ \_\_half \_\_uint2half_rd | Y | Y | +| \_\_host\_\_ \_\_device\_\_ \_\_half \_\_uint2half_rn | Y | Y | +| \_\_device\_\_ \_\_half \_\_uint2half_ru | Y | Y | +| \_\_device\_\_ \_\_half \_\_uint2half_rz | Y | Y | +| \_\_device\_\_ \_\_half \_\_ull2half_rd | Y | Y | +| \_\_host\_\_ \_\_device\_\_ \_\_half \_\_ull2half_rn | Y | Y | +| \_\_device\_\_ \_\_half \_\_ull2half_ru | Y | Y | +| \_\_device\_\_ \_\_half \_\_ull2half_rz | Y | Y | +| \_\_device\_\_ \_\_half \_\_ushort2half_rd | Y | Y | +| \_\_host\_\_ \_\_device\_\_ \_\_half \_\_ushort2half_rn | Y | Y | +| \_\_device\_\_ \_\_half \_\_ushort2half_ru | Y | Y | +| \_\_device\_\_ \_\_half \_\_ushort2half_rz | Y | Y | +| \_\_device\_\_ \_\_half \_\_ushort_as_half | Y | Y | +| \_\_device\_\_ \_\_half hceil | Y | Y | +| \_\_device\_\_ \_\_half hcos | Y | Y | +| \_\_device\_\_ \_\_half hexp | Y | Y | +| \_\_device\_\_ \_\_half hexp10 | Y | Y | +| \_\_device\_\_ \_\_half hexp2 | Y | Y | +| \_\_device\_\_ \_\_half hfloor | Y | Y | +| \_\_device\_\_ \_\_half hlog | Y | Y | +| \_\_device\_\_ \_\_half hlog10 | Y | Y | +| \_\_device\_\_ \_\_half hlog2 | Y | Y | +| \_\_device\_\_ \_\_half hrcp | Y | Y | +| \_\_device\_\_ \_\_half hrint | Y | Y | +| \_\_device\_\_ \_\_half hrsqrt | Y | Y | +| \_\_device\_\_ \_\_half hsin | Y | Y | +| \_\_device\_\_ \_\_half hsqrt | Y | Y | +| \_\_device\_\_ \_\_half htrunc | Y | Y | + +## Half2 precision intrinsics + math library + +| **CUDA** | **HIP** | **CHIP-SPV**| +|-----------------------------------------------------------|-----------------------------------|:----------------:| +| \_\_device\_\_ \_\_half2 \_\_h2div | Y | Y | +| \_\_device\_\_ \_\_half2 \_\_habs2 | Y | Y | +| \_\_device\_\_ \_\_half2 \_\_hadd2 | Y | Y | +| \_\_device\_\_ \_\_half2 \_\_hadd2_rn | Y | N | +| \_\_device\_\_ \_\_half2 \_\_hadd2_sat | Y | Y | +| \_\_device\_\_ \_\_half2 \_\_hcmadd | Y | N | +| \_\_device\_\_ \_\_half2 \_\_hfma2 | Y | Y | +| \_\_device\_\_ \_\_half2 \_\_hfma2_relu | Y | N | +| \_\_device\_\_ \_\_half2 \_\_hfma2_sat | Y | Y | +| \_\_device\_\_ \_\_half2 \_\_hmul2 | Y | Y | +| \_\_device\_\_ \_\_half2 \_\_hmul2_rn | Y | N | +| \_\_device\_\_ \_\_half2 \_\_hmul2_sat | Y | Y | +| \_\_device\_\_ \_\_half2 \_\_hneg2 | Y | Y | +| \_\_device\_\_ \_\_half2 \_\_hsub2 | Y | Y | +| \_\_device\_\_ \_\_half2 \_\_hsub2_rn | Y | N | +| \_\_device\_\_ \_\_half2 \_\_hsub2_sat | Y | Y | +| \_\_device\_\_ \_\_half2 atomicAdd | Y | Y | +| \_\_device\_\_ bool \_\_hbeq2 | Y | Y | +| \_\_device\_\_ bool \_\_hbequ2 | Y | Y | +| \_\_device\_\_ bool \_\_hbge2 | Y | Y | +| \_\_device\_\_ bool \_\_hbgeu2 | Y | Y | +| \_\_device\_\_ bool \_\_hbgt2 | Y | Y | +| \_\_device\_\_ bool \_\_hbgtu2 | Y | Y | +| \_\_device\_\_ bool \_\_hble2 | Y | Y | +| \_\_device\_\_ bool \_\_hbleu2 | Y | Y | +| \_\_device\_\_ bool \_\_hblt2 | Y | Y | +| \_\_device\_\_ bool \_\_hbltu2 | Y | Y | +| \_\_device\_\_ bool \_\_hbne2 | Y | Y | +| \_\_device\_\_ bool \_\_hbneu2 | Y | Y | +| \_\_device\_\_ \_\_half2 \_\_heq2 | Y | Y | +| \_\_device\_\_ \_\_half2 \_\_hequ2 | Y | Y | +| \_\_device\_\_ \_\_half2 \_\_hge2 | Y | Y | +| \_\_device\_\_ \_\_half2 \_\_hgeu2 | Y | Y | +| \_\_device\_\_ \_\_half2 \_\_hgt2 | Y | Y | +| \_\_device\_\_ \_\_half2 \_\_hgtu2 | Y | Y | +| \_\_device\_\_ \_\_half2 \_\_hle2 | Y | Y | +| \_\_device\_\_ \_\_half2 \_\_hleu2 | Y | Y | +| \_\_device\_\_ \_\_half2 \_\_hlt2 | Y | Y | +| \_\_device\_\_ \_\_half2 \_\_hltu2 | Y | Y | +| \_\_device\_\_ \_\_half2 \_\_hne2 | Y | Y | +| \_\_device\_\_ \_\_half2 \_\_hneu2 | Y | Y | +| \_\_device\_\_ \_\_half2 \_\_hisnan2 | Y | Y | +| \_\_device\_\_ \_\_half2 \_\_hmax2 | Y | Y | +| \_\_device\_\_ \_\_half2 \_\_hmax2_nan | Y | N | +| \_\_device\_\_ \_\_half2 \_\_hmin2 | Y | Y | +| \_\_device\_\_ \_\_half2 \_\_hmin2_nan | Y | N | +| \_\_device\_\_ \_\_half2 h2ceil | Y | Y | +| \_\_device\_\_ \_\_half2 h2cos | Y | Y | +| \_\_device\_\_ \_\_half2 h2exp | Y | Y | +| \_\_device\_\_ \_\_half2 h2exp10 | Y | Y | +| \_\_device\_\_ \_\_half2 h2exp2 | Y | Y | +| \_\_device\_\_ \_\_half2 h2floor | Y | Y | +| \_\_device\_\_ \_\_half2 h2log | Y | Y | +| \_\_device\_\_ \_\_half2 h2log10 | Y | Y | +| \_\_device\_\_ \_\_half2 h2log2 | Y | Y | +| \_\_device\_\_ \_\_half2 h2rcp | Y | Y | +| \_\_device\_\_ \_\_half2 h2rint | Y | Y | +| \_\_device\_\_ \_\_half2 h2rsqrt | Y | Y | +| \_\_device\_\_ \_\_half2 h2sin | Y | Y | +| \_\_device\_\_ \_\_half2 h2sqrt | Y | Y | +| \_\_device\_\_ \_\_half2 h2trunc | Y | Y | +| \_\_host\_\_ \_\_device\_\_ \_\_half2 \_\_float22half2_rn | Y | Y | +| \_\_host\_\_ \_\_device\_\_ \_\_half2 \_\_float2half2_rn | Y | Y | +| \_\_host\_\_ \_\_device\_\_ \_\_half2 \_\_floats2half2_rn | Y | Y | +| \_\_host\_\_ \_\_device\_\_ float2 \_\_half22float2 | Y | Y | +| \_\_host\_\_ \_\_device\_\_ float \_\_half2float | Y | Y | +| \_\_device\_\_ \_\_half2 \_\_half2half2 | Y | Y | +| \_\_device\_\_ int \_\_half2int_rd | Y | Y | +| \_\_device\_\_ int \_\_half2int_rn | Y | Y | +| \_\_device\_\_ int \_\_half2int_ru | Y | Y | +| \_\_host\_\_ \_\_device\_\_ int \_\_half2int_rz | Y | Y | +| \_\_device\_\_ long long int \_\_half2ll_rd | Y | Y | +| \_\_device\_\_ long long int \_\_half2ll_rn | Y | Y | +| \_\_device\_\_ long long int \_\_half2ll_ru | Y | Y | +| \_\_host\_\_ \_\_device\_\_ long long int \_\_half2ll_rz | Y | Y | +| \_\_device\_\_ short int \_\_half2short_rd | Y | Y | +| \_\_device\_\_ short int \_\_half2short_rn | Y | Y | +| \_\_device\_\_ short int \_\_half2short_ru | Y | Y | +| \_\_host\_\_ \_\_device\_\_ short int \_\_half2short_rz | Y | Y | +| \_\_device\_\_ unsigned int \_\_half2uint_rd | Y | Y | +| \_\_device\_\_ unsigned int \_\_half2uint_rn | Y | Y | +| \_\_device\_\_ unsigned int \_\_half2uint_ru | Y | Y | +| \_\_host\_\_ \_\_device\_\_ unsigned int \_\_half2uint_rz | Y | Y | +| \_\_device\_\_ unsigned long long int \_\_half2ull_rd | Y | Y | +| \_\_device\_\_ unsigned long long int \_\_half2ull_rn | Y | Y | +| \_\_device\_\_ unsigned long long int \_\_half2ull_ru | Y | Y | +| \_\_host\_\_ \_\_device\_\_ unsigned long long int \_\_half2ull_rz | Y | Y | +| \_\_device\_\_ unsigned short int \_\_half2ushort_rd | Y | Y | +| \_\_device\_\_ unsigned short int \_\_half2ushort_rn | Y | Y | +| \_\_device\_\_ unsigned short int \_\_half2ushort_ru | Y | Y | +| \_\_host\_\_ \_\_device\_\_ unsigned short int \_\_half2ushort_rz | Y | Y | +| \_\_device\_\_ \_\_half2 \_\_halves2half2 | Y | Y | +| \_\_host\_\_ \_\_device\_\_ float \_\_high2float | Y | Y | +| \_\_device\_\_ \_\_half \_\_high2half | Y | Y | +| \_\_device\_\_ \_\_half2 \_\_high2half2 | Y | Y | +| \_\_device\_\_ \_\_half2 \_\_highs2half2 | Y | Y | +| \_\_device\_\_ \_\_half2 \_\_ldca | Y | Y | +| \_\_device\_\_ \_\_half2 \_\_ldcg | Y | Y | +| \_\_device\_\_ \_\_half2 \_\_ldcs | Y | Y | +| \_\_device\_\_ \_\_half2 \_\_ldcv | Y | N | +| \_\_device\_\_ \_\_half2 \_\_ldg | Y | Y | +| \_\_device\_\_ \_\_half2 \_\_ldlu | Y | N | +| \_\_host\_\_ \_\_device\_\_ float \_\_low2float | Y | Y | +| \_\_device\_\_ \_\_half \_\_low2half | Y | Y | +| \_\_device\_\_ \_\_half2 \_\_low2half2 | Y | Y | +| \_\_device\_\_ \_\_half2 \_\_lowhigh2highlow | Y | Y | +| \_\_device\_\_ \_\_half2 \_\_lows2half2 | Y | Y | +| \_\_device\_\_ \_\_half2 \_\_shfl_down_sync | Y | N | +| \_\_device\_\_ \_\_half2 \_\_shfl_sync | Y | N | +| \_\_device\_\_ \_\_half2 \_\_shfl_up_sync | Y | N | +| \_\_device\_\_ \_\_half2 \_\_shfl_xor_sync | Y | N | +| \_\_device\_\_ void \_\_stcg | Y | Y | +| \_\_device\_\_ void \_\_stcs | Y | Y | +| \_\_device\_\_ void \_\_stwb | Y | N | +| \_\_device\_\_ void \_\_stwt | Y | N | diff --git a/docs/Device_API_support_matrix.md b/docs/Device_API_support_matrix.md new file mode 100644 index 000000000..4c95fc60c --- /dev/null +++ b/docs/Device_API_support_matrix.md @@ -0,0 +1,39 @@ + +## CHIP-SPV support matrix for device side + + * categories are roughly from HIP kernel guide: + * https://github.com/ROCm-Developer-Tools/HIP/blob/develop/docs/markdown/hip_kernel_language.md + +| Feature | HIP API # of funcs | # of impl in CHIP-SPV | CHIP-SPV notes | +|-------------------------------|---------------------|-----------------------|---------------------------| +| Coordinate Built-Ins | 12 | 12 | | +| Warp Size variable | supported | unsupported | CHIP-SPV support probably low effort, but requires guarantee from driver side to respect warpSize (cl_intel_required_subgroup_size) | +| Timer functions | 2 | 0 | missing: clock, clock64; seems already available in intel GPU hardware & driver (TODO: unclear about HW clock bit width), possibly needs software (SPIR-V) support | +| Atomic functions | ~30 | ~30 | all supported, but a few (on float/double types) are emulated, proper impl requires OpenCL/driver/HW support | +| Vector Types | 48 | 48 | | +| Memory-Fence Instructions | 3 | 2 | \_\_threadfence_system is unsupported | +| Synchronization Functions | 4 | 4 | | +| Float math functions | 94 | 94 | | +| Float math intrinsics | 9 | 2 | 45 in CUDA, 9 in HIP; what's currently possible, is mapped to OpenCL's native_XYZ functions; the rest requires an OpenCL extension + SPIR-V + HW + driver support | +| Double math functions | 94 | 94 | | +| Double math intrinsics | 1 | 0 | 28 in CUDA, 1 in HIP; same as float intrinsics | +| Integer Intrinsics | 14 | 14 | | +| Half math funcs + intrin | 96 | 81 | atomicAdd, __hadd_rn, __hfma_relu, __hmul_rn, __hsub_rn, __hmax_nan, __hmin_nan, __shfl{down,up,xor,sync}, ldcv, ldlu, stwb, stwt | +| Half2 math funcs + intrin | 115 | 99 | same as ^^ + double2half | +| Texture Functions | ? | ? | partially supported (1D/2D texture types, other types unsupported) | +| Surface Functions | unsupported | unsupported | unsupported in both HIP & CHIP-SPV | +| Cooperative Groups Functions | ~30 | 0 | all missing, pathfinding effort and HW features required for efficient support | +| Warp Vote & Ballot | 3 | 3 | | +| Warp Shuffle | 8 | 8 | Supported in some circumstances (on Intel GPUs, when warp/subgroup=32 and ids map to lanes correctly). Also "width" argument is ignored. | +| Device-Side Dynamic Global Memory Allocation | 3 | 0 | medium difficulty to implement, likely no special hardware/software stack support required except atomics | +| In-Line Assembly | supports GCN asm | unsupported | requires SPIR-V and driver support | +| Warp Matrix Functions | 5, unsupported | unsupported | unsupported in both HIP & CHIP-SPV | +| Profiler Counter Function | 1, unsupported | unsupported | unsupported in both HIP & CHIP-SPV | +| Independent Thread Scheduling | unsupported | unsupported | unsupported in both HIP & CHIP-SPV | +| Pragma Unroll | supported | supported | Clang feature | +| Assert | supported | unsupported | abort is supported, assert is not| +| Printf | 1 | 1 | fully supported | +| advanced C++ features (RTTI, virtual, exceptions) | unsupported | unsupported | | + + +| Total (countable) | 733 | approx 495 | ~67% | diff --git a/docs/Features.md b/docs/Features.md index 90e4d6dbf..5588e638b 100644 --- a/docs/Features.md +++ b/docs/Features.md @@ -1,49 +1,111 @@ - ## Support status -This is a (non-exhaustive) list of features currently (un)supported by CHIP-SPV. +This is a (non-exhaustive) list of HIP features currently (un)supported by CHIP-SPV. + +For a more detailed matrix of supported functions and features, see the files +Host_API_support_matrix.md and Device_API_support_matrix.md. + +For an even more detailed per-function support status, see the files +Host_API_functions.md and Device_API_functions.md. + +CUDA features not present in HIP are unsupported unless explicitly stated otherwise. ### Host side #### Unsupported / unimplemented APIs * hipGraph API -* hipIpc API + +* hipIpc API (hipIpcGetMemHandle etc) + * hipModuleOccupancy API -* hipTexRef (texture reference) API -* surface object / reference APIs -* hipMemcpyPeer, hipMemcpyPeerAsync, hipMemRangeGetAttribute, hipFuncGetAttributes, - hipDeviceSetCacheConfig, hipDeviceGetCacheConfig, - hipDeviceSetSharedMemConfig, hipDeviceGetSharedMemConfig, - hipSetDeviceFlags, hipGetDeviceFlags, + +* texture reference API (hipTexRef, (DEPRECATED in CUDA) + +* surface reference API (DEPRECATED in CUDA) + +* surface object API (hipCreateSurfaceObject, hipDestroySurfaceObject) + +* peer access (hipMemcpyPeer, hipMemcpyPeerAsync, hipDeviceCanAccessPeer, hipDeviceEnablePeerAccess, - hipDeviceDisablePeerAccess, hipDeviceGetStreamPriorityRange, - hipDevicePrimaryCtxRelease, hipDevicePrimaryCtxRetain, - hipDevicePrimaryCtxSetFlags, hipMemPrefetchAsync, hipMemAdvise, - hipModuleLoadData, hipModuleUnload, hipModuleLaunchKernel + hipDeviceDisablePeerAccess, etc) + +* hipMemRangeGetAttribute + +* some config APIs (hipDeviceSetCacheConfig, hipDeviceGetCacheConfig + hipDeviceSetSharedMemConfig, hipDeviceGetSharedMemConfig, + hipSetDeviceFlags, hipGetDeviceFlags, hipFuncSetCacheConfig) + +* primary context API (hipDevicePrimaryCtxRelease, + hipDevicePrimaryCtxRetain, hipDevicePrimaryCtxSetFlags) + +* few memory APIs (hipMemPrefetchAsync, hipMemAdvise) + +* few module APIs (hipModuleLoadData, hipModuleUnload, hipModuleLaunchKernel) #### partially supported + * Texture Objects of 1D/2D type are supported; 3D, LOD, Grad, - Cubemap, Gather and Mipmapped textures are not supported. + Cubemap, Gather and Mipmapped textures are not supported + * hiprtc: Referring global device variables, constants and texture references in the name expressions are not supported. +* hipFuncGetAttributes - not all attributes are supported, depends on backend + +* hipDeviceGetLimit - only some limits are supported + +------------------------------------------------------------------- + + ### Device side #### Unsupported / unavailable -* __syncwarp(), __activemask() -* cooperative_groups.h header -* Address Space Predicate Functions, Address Space Conversion Functions -* alloca(), malloc(), free() -* Warp Reduce Functions, Warp Matrix Functions + +* _syncwarp(), __activemask(), threadfence_system(), warpSize variable + +* clock(), clock64() + +* Cooperative Groups Functions + +* Warp Matrix Functions + +* Independent Thread Scheduling + +* Profiler Counter Function + +* Address Space Predicate Functions + +* Address Space Conversion Functions + +* Device-Side Dynamic Global Memory Allocation + +* In-Line Assembly + +* __trap(), __brkpt(), assert() + +* surface functions #### Partially supported +* math library: almost all single/double functions are available, + half/half2 functions are available but untested + +* functions that specify rounding (e.g. __fdiv_rd), except conversion functions (as rounding cannot be selected in OpenCL) + +* device intrinsics are mapped to OpenCL's native_* functions where possible, otherwise are unsupported / emulated + * Warp functions (__all, __any, __ballot): only the non-sync versions are supported + * Shuffle functions (__shfl_{up,down,xor}): only the non-sync versions are supported -* assert(), __trap, __brkpt are not available but abort() is -* mathematical library: almost all single/double functions are available, - but half-precision variants are not available + +* abort() + +* texture functions: only with certain image types + +* atomic functions: supported but atomics on float/double is emulated using CAS loop + +------------------------------------------------------------------- ### Known issues diff --git a/docs/Host_API_functions.md b/docs/Host_API_functions.md new file mode 100644 index 000000000..a09e77858 --- /dev/null +++ b/docs/Host_API_functions.md @@ -0,0 +1,307 @@ + +# List of HIP runtime API functions supported by CHIP-SPV + +## **1. Device Management** + +| **CUDA** | **HIP** | **CHIP-SPV**| +|-----------------------------------------------------------|-----------------------------------|:----------------:| + +## DEVICE API + +| `cudaChooseDevice` | `hipChooseDevice` | Y | +| `cudaDeviceGetAttribute` | `hipDeviceGetAttribute` | Y | +| `cudaDeviceGetByPCIBusId` | `hipDeviceGetByPCIBusId` | Y | +| `cudaDeviceGetCacheConfig` | `hipDeviceGetCacheConfig` | Y | +| `cudaDeviceGetLimit` | `hipDeviceGetLimit` | Y* | + +| `cudaDeviceGetPCIBusId` | `hipDeviceGetPCIBusId` | Y | +| `cudaDeviceGetSharedMemConfig` | `hipDeviceGetSharedMemConfig` | N | +| `cudaDeviceGetStreamPriorityRange` | `hipDeviceGetStreamPriorityRange` | Y | +| `cudaDeviceReset` | `hipDeviceReset` | Y | +| `cudaDeviceSetCacheConfig` | `hipDeviceSetCacheConfig` | Y | + +| `cudaDeviceSetLimit` | `hipDeviceSetLimit` | Y | +| `cudaDeviceSetSharedMemConfig` | `hipDeviceSetSharedMemConfig` | N | +| `cudaDeviceSynchronize` | `hipDeviceSynchronize` | Y | +| `cudaGetDevice` | `hipGetDevice` | Y | +| `cudaGetDeviceCount` | `hipGetDeviceCount` | Y | + +| `cudaGetDeviceFlags` | `hipGetDeviceFlags` | N | +| `cudaGetDeviceProperties` | `hipGetDeviceProperties` | Y | +| `cudaSetDevice` | `hipSetDevice` | Y | +| `cudaSetDeviceFlags` | `hipSetDeviceFlags` | N | +| `cudaThreadSynchronize` | `hipDeviceSynchronize` | Y | + +| `cudaThreadExit` | `hipDeviceReset` | Y | +| `cudaThreadGetCacheConfig` | `hipDeviceGetCacheConfig` | N | +| `cudaThreadSetCacheConfig` | `hipDeviceSetCacheConfig` | N | + + +############## IPC API + +| `cudaIpcCloseMemHandle` | `hipIpcCloseMemHandle` | N | +| `cudaIpcGetEventHandle` | `hipIpcGetEventHandle` | N | +| `cudaIpcGetMemHandle` | `hipIpcGetMemHandle` | N | +| `cudaIpcOpenEventHandle` | `hipIpcOpenEventHandle` | N | +| `cudaIpcOpenMemHandle` | `hipIpcOpenMemHandle` | N | + +## **3. Error Handling** + +| **CUDA** | **HIP** | **CHIP-SPV**| +|-----------------------------------------------------------|-------------------------------|:----------------:| +| `cudaGetErrorName` | `hipGetErrorName` | Y | +| `cudaGetErrorString` | `hipGetErrorString` | Y | +| `cudaGetLastError` | `hipGetLastError` | Y | +| `cudaPeekAtLastError` | `hipPeekAtLastError` | Y | + +## **4. Stream Management** + +| **CUDA** | **HIP** | **CHIP-SPV** | +|-----------------------------------------------------------|-------------------------------|:----------------:| +| `cudaStreamAddCallback` | `hipStreamAddCallback` | Y | +| `cudaStreamCreate` | `hipStreamCreate` | Y | +| `cudaStreamCreateWithFlags` | `hipStreamCreateWithFlags` | Y | +| `cudaStreamCreateWithPriority` | `hipStreamCreateWithPriority` | Y | +| `cudaStreamDestroy` | `hipStreamDestroy` | Y | + +| `cudaStreamGetFlags` | `hipStreamGetFlags` | Y | +| `cudaStreamGetPriority` | `hipStreamGetPriority` | Y | +| `cudaStreamQuery` | `hipStreamQuery` | Y | +| `cudaStreamSynchronize` | `hipStreamSynchronize` | Y | +| `cudaStreamWaitEvent` | `hipStreamWaitEvent` | Y | + +## **5. Event Management** + +| **CUDA** | **HIP** | **CHIP-SPV** | +|-----------------------------------------------------------|-------------------------------|:----------------:| +| `cudaEventCreate` | `hipEventCreate` | Y | +| `cudaEventCreateWithFlags` | `hipEventCreateWithFlags` | Y | +| `cudaEventDestroy` | `hipEventDestroy` | Y | +| `cudaEventElapsedTime` | `hipEventElapsedTime` | Y | +| `cudaEventQuery` | `hipEventQuery` | Y | +| `cudaEventRecord` | `hipEventRecord` | Y | +| `cudaEventSynchronize` | `hipEventSynchronize` | Y | + + +## **7. Execution Control** + +| **CUDA** | **HIP** | **CHIP-SPV** | +|-----------------------------------------------------------|---------------------------------------|:----------------:| +| `cudaFuncGetAttributes` |`hipFuncGetAttributes` | Y*| +| `cudaFuncSetAttribute` |`hipFuncSetAttribute` | Y | +| `cudaFuncSetCacheConfig` |`hipFuncSetCacheConfig` | N | +| `cudaFuncSetSharedMemConfig` |`hipFuncSetSharedMemConfig` | N | +| `cudaLaunchKernel` |`hipLaunchKernel` | Y | + +| `cudaLaunchCooperativeKernel` |`hipLaunchCooperativeKernel` | N | +| `cudaLaunchCooperativeKernelMultiDevice` |`hipLaunchCooperativeKernelMultiDevice`| N | +| `cudaConfigureCall` | `hipConfigureCall` | Y | +| `cudaLaunch` | `hipLaunchByPtr` | Y | +| `cudaSetupArgument` | `hipSetupArgument` | Y | + +## **8. Occupancy** + +| **CUDA** | **HIP** | **CHIP-SPV**| +|-----------------------------------------------------------|-------------------------------------------------------|:----------------:| +| `cudaOccupancyMaxActiveBlocksPerMultiprocessor` | `hipOccupancyMaxActiveBlocksPerMultiprocessor` | N | +| `cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags` | `hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags`| N | +| ? | `hipModuleOccupancyMaxPotentialBlockSize` | N | +| ? | `hipModuleOccupancyMaxPotentialBlockSizeWithFlags` | N | +| ? | `hipModuleOccupancyMaxActiveBlocksPerMultiprocessor` | N | +| ? | `hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags` N | +| ? | `hipOccupancyMaxPotentialBlockSize` | N | + + +## **9. Memory Management** + +| **CUDA** | **HIP** | **CHIP-SPV**| +|-----------------------------------------------------------|-------------------------------|:----------------:| +| `cudaArrayGetInfo` | | ? | +| `cudaFree` | `hipFree` | Y | +| `cudaFreeArray` | `hipFreeArray` | Y | +| `cudaFreeHost` | `hipHostFree` | Y | +| `cudaGetSymbolAddress` | `hipGetSymbolAddress` | Y | + +| `cudaGetSymbolSize` | `hipGetSymbolSize` | Y | +| `cudaHostAlloc` | `hipHostMalloc` | Y | +| `cudaHostGetDevicePointer` | `hipHostGetDevicePointer` | Y | +| `cudaHostGetFlags` | `hipHostGetFlags` | Y | +| `cudaHostRegister` | `hipHostRegister` | Y | + +| `cudaHostUnregister` | `hipHostUnregister` | Y | +| `cudaMalloc` | `hipMalloc` | Y | +| `cudaMalloc3D` | `hipMalloc3D` | Y | +| `cudaMalloc3DArray` | `hipMalloc3DArray` | Y | +| `cudaMallocArray` | `hipMallocArray` | Y | + +| `cudaMallocHost` | `hipHostMalloc` | Y | +| `cudaMallocManaged` | `hipMallocManaged` | Y | +| `cudaMemGetInfo` | `hipMemGetInfo` | Y | +| `cudaMemcpy` | `hipMemcpy` | Y | +| `cudaMemcpy2D` | `hipMemcpy2D` | Y | + +| `cudaMemcpy2DAsync` | `hipMemcpy2DAsync` | Y | +| `cudaMemcpy2DFromArray` | `hipMemcpy2DFromArray` | Y | +| `cudaMemcpy2DFromArrayAsync` | `hipMemcpy2DFromArrayAsync` | Y | +| `cudaMemcpy2DToArray` | `hipMemcpy2DToArray` | Y | +| `cudaMemcpy3D` | `hipMemcpy3D` | Y | + +| `cudaMemcpy3DAsync` | `hipMemcpy3DAsync` | Y | +| `cudaMemcpyAsync` | `hipMemcpyAsync` | Y | +| `cudaMemcpyFromSymbol` | `hipMemcpyFromSymbol` | Y | +| `cudaMemcpyFromSymbolAsync` | `hipMemcpyFromSymbolAsync` | Y | +| `cudaMemcpyPeer` | `hipMemcpyPeer` | N | + +| `cudaMemcpyPeerAsync` | `hipMemcpyPeerAsync` | N | +| `cudaMemcpyToSymbol` | `hipMemcpyToSymbol` | Y | +| `cudaMemcpyToSymbolAsync` | `hipMemcpyToSymbolAsync` | Y | +| `cudaMemset` | `hipMemset` | Y | +| `cudaMemset2D` | `hipMemset2D` | Y | + +| `cudaMemset2DAsync` | `hipMemset2DAsync` | Y | +| `cudaMemset3D` | `hipMemset3D` | Y | +| `cudaMemset3DAsync` | `hipMemset3DAsync` | Y | +| `cudaMemsetAsync` | `hipMemsetAsync` | Y | +| `make_cudaExtent` | `make_hipExtent` | Y | + +| `make_cudaPitchedPtr` | `make_hipPitchedPtr` | Y | +| `make_cudaPos` | `make_hipPos` | Y | +| `cudaMemcpyFromArray` | `hipMemcpyFromArray` | Y | +| `cudaMemcpyToArray` | `hipMemcpyToArray` | Y | + +| ? | `hipMemPrefetchAsync` | N | +| ? | `hipMemAdvise` | N | +| ? | `hipMemRangeGetAttribute` | N | + +## **11. Unified Addressing** + +| **CUDA** | **HIP** | **CHIP-SPV**| +|-----------------------------------------------------------|-------------------------------|:----------------:| +| `cudaPointerGetAttributes` | `hipPointerGetAttributes` | Y | + +## **12. Peer Device Memory Access** + +| **CUDA** | **HIP** | **CHIP-SPV**| +|-----------------------------------------------------------|-------------------------------|:----------------:| +| `cudaDeviceCanAccessPeer` | `hipDeviceCanAccessPeer` | N | +| `cudaDeviceDisablePeerAccess` | `hipDeviceDisablePeerAccess` | N | +| `cudaDeviceEnablePeerAccess` | `hipDeviceEnablePeerAccess` | N | + +## **24. Texture Reference Management [DEPRECATED]** + +| **CUDA** | **HIP** | **CHIP-SPV**| +|-----------------------------------------------------------|----------------------------------|:----------------:| +| `cudaBindTexture` | `hipBindTexture` | Y | +| `cudaBindTexture2D` | `hipBindTexture2D` | Y | +| `cudaBindTextureToArray` | `hipBindTextureToArray` | Y | +| `cudaBindTextureToMipmappedArray` | `hipBindTextureToMipmappedArray` | Y | +| `cudaCreateChannelDesc` | `hipCreateChannelDesc` | Y | + +| `cudaGetChannelDesc` | `hipGetChannelDesc` | Y | +| `cudaGetTextureAlignmentOffset` | `hipGetTextureAlignmentOffset` | Y | +| `cudaGetTextureReference` | `hipGetTextureReference` | Y | +| `cudaUnbindTexture` | `hipUnbindTexture` | Y | + +## **26. Texture Object Management** + +| **CUDA** | **HIP** | **CHIP-SPV**| +|-----------------------------------------------------------|--------------------------------------|:----------------:| +| `cudaCreateTextureObject` |`hipCreateTextureObject` | Y | +| `cudaDestroyTextureObject` |`hipDestroyTextureObject` | Y | +| `cudaGetTextureObjectResourceDesc` |`hipGetTextureObjectResourceDesc` | Y | +| `cudaGetTextureObjectResourceViewDesc` |`hipGetTextureObjectResourceViewDesc` | N | +| `cudaGetTextureObjectTextureDesc` |`hipGetTextureObjectTextureDesc` | N | + +## **27. Surface Object Management** + +| **CUDA** | **HIP** | **CHIP-SPV**| +|-----------------------------------------------------------|-------------------------------|:----------------:| +| `cudaCreateSurfaceObject` | `hipCreateSurfaceObject` | N | +| `cudaDestroySurfaceObject` | `hipDestroySurfaceObject` | N | + +## **28. Version Management** + +| **CUDA** | **HIP** | **CHIP-SPV**| +|-----------------------------------------------------------|-------------------------------|:----------------:| +| `cudaDriverGetVersion` | `hipDriverGetVersion` | Y | +| `cudaRuntimeGetVersion` | `hipRuntimeGetVersion` | Y | + +## **29. Graph Management** + +| **CUDA** | **HIP** | **CHIP-SPV**| +|-----------------------------------------------------------|-------------------------------|:----------------:| +| `cudaGraphAddChildGraphNode` | | N | +| `cudaGraphAddDependencies` | | N | +| `cudaGraphAddEmptyNode` | | N | +| `cudaGraphAddHostNode` | | N | +| `cudaGraphAddKernelNode` | | N | + +| `cudaGraphAddMemcpyNode` | | N | +| `cudaGraphAddMemsetNode` | | N | +| `cudaGraphChildGraphNodeGetGraph` | | N | +| `cudaGraphClone` | | N | +| `cudaGraphCreate` | | N | + +| `cudaGraphDestroy` | | N | +| `cudaGraphDestroyNode` | | N | +| `cudaGraphExecDestroy` | | N | +| `cudaGraphGetEdges` | | N | +| `cudaGraphGetNodes` | | N | + +| `cudaGraphGetRootNodes` | | N | +| `cudaGraphHostNodeGetParams` | | N | +| `cudaGraphHostNodeSetParams` | | N | +| `cudaGraphInstantiate` | | N | +| `cudaGraphExecKernelNodeSetParams` | | N | + +| `cudaGraphExecMemcpyNodeSetParams` | | N | +| `cudaGraphExecMemsetNodeSetParams` | | N | +| `cudaGraphExecHostNodeSetParams` | | N | +| `cudaGraphExecUpdate` | | N | +| `cudaGraphKernelNodeGetParams` | | N | + +| `cudaGraphKernelNodeSetParams` | | N | +| `cudaGraphLaunch` | | N | +| `cudaGraphMemcpyNodeGetParams` | | N | +| `cudaGraphMemcpyNodeSetParams` | | N | +| `cudaGraphMemsetNodeGetParams` | | N | + +| `cudaGraphMemsetNodeSetParams` | | N | +| `cudaGraphNodeFindInClone` | | N | +| `cudaGraphNodeGetDependencies` | | N | +| `cudaGraphNodeGetDependentNodes` | | N | +| `cudaGraphNodeGetType` | | N | + +| `cudaGraphRemoveDependencies` | | N | + +... INCOMPLETE, there are 55 Graph API functions in CHIPBindings.cc + + +## **32. Profiler Control** + +| **CUDA** | **HIP** | **CHIP-SPV**| +|-----------------------------------------------------------|-------------------------------|:----------------:| +| `cudaProfilerStart` | `hipProfilerStart` | N | +| `cudaProfilerStop` | `hipProfilerStop` | N | + + +#### Primary Context API + +| **CUDA** | **HIP** | **CHIP-SPV**| +|-----------------------------------------------------------|-------------------------------|:----------------:| +| ? | `hipDevicePrimaryCtxGetState` | Y | +| ? | `hipDevicePrimaryCtxRelease` | N | +| ? | `hipDevicePrimaryCtxRetain` | N | +| ? | `hipDevicePrimaryCtxReset` | Y | +| ? | `hipDevicePrimaryCtxSetFlags` | N | + + +#### Module API + +| **CUDA** | **HIP** | **CHIP-SPV**| +|-----------------------------------------------------------|-------------------------------|:----------------:| +| ? | `hipModuleLoadData` | Y | +| ? | `hipModuleUnload` | Y | +| ? | `hipModuleLaunchKernel` | Y* | + +* partially supported (with some caveats) diff --git a/docs/Host_API_support_matrix.md b/docs/Host_API_support_matrix.md new file mode 100644 index 000000000..deec5b6c1 --- /dev/null +++ b/docs/Host_API_support_matrix.md @@ -0,0 +1,25 @@ + +## CHIP-SPV support matrix for host side + +| Feature | HIP API # of funcs | # of impl in CHIP-SPV | CHIP-SPV missing / notes | +|-------------------------------|-----------|-----------|---------------------------| +| Device API | 23 | 17 | hipDeviceSetCacheConfig, hipDeviceGetCacheConfig, hipDeviceSetSharedMemConfig, hipDeviceGetSharedMemConfig, hipSetDeviceFlags, hipGetDeviceFlags | +| IPC API | 5 | 0 | hipIpcCloseMemHandle, hipIpcGetEventHandle, hipIpcGetMemHandle, hipIpcOpenEventHandle, hipIpcOpenMemHandle | +| Error API | 4 | 4 | | +| Stream API | 10 | 10 | | +| Event API | 7 | 7 | | +| Execution API | 10 | 7 | hipFuncSetSharedMemConfig, hipFuncSetCacheConfig, hipFuncGetAttributes only partially | +| Occupancy API | 7 | 0 | hipModuleOccupancyMaxPotentialBlockSize, hipModuleOccupancyMaxPotentialBlockSizeWithFlags, hipModuleOccupancyMaxActiveBlocksPerMultiprocessor, hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, hipOccupancyMaxActiveBlocksPerMultiprocessor, hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, hipOccupancyMaxPotentialBlockSize | +| Mem Manag API | 47 | 42 | hipMemcpyPeer, hipMemcpyPeerAsync, hipMemPrefetchAsync, hipMemAdvise, hipMemRangeGetAttribute | +| Unified Addressing API | 1 | 1 | | +| Peer Mem Access API | 3 | 0 | hipDeviceCanAccessPeer, hipDeviceEnablePeerAccess, hipDeviceDisablePeerAccess | +| Texture Reference API (DEPR.) | 9 | 0 | ..all missing | +| Texture Object API | 5 | 3 | hipGetTextureObjectResourceViewDesc, hipGetTextureObjectTextureDesc ; Texture Objects of 1D/2D type are supported; 3D, LOD, Grad, Cubemap, Gather and Mipmapped textures are not supported | +| Surface Object API | 2 | 0 | hipCreateSurfaceObject, hipDestroySurfaceObject | +| Version API | 2 | 2 | | +| Graph API | 55 | 0 | ..all missing | +| Profiler API | 2 | 0 | hipProfilerStart, hipProfilerStop | +| Primary Context API | 5 | 2 | hipDevicePrimaryCtxRelease, hipDevicePrimaryCtxRetain, hipDevicePrimaryCtxSetFlags | +| Module API | 3 | 3 | hipModuleLaunchKernel has some caveats | +| | | | | +| Total | 200 | 98 | 49% |