diff --git a/src/trans/gpu/algor/buffered_allocator_mod.F90 b/src/trans/gpu/algor/buffered_allocator_mod.F90 index ba613fe7..346b8597 100644 --- a/src/trans/gpu/algor/buffered_allocator_mod.F90 +++ b/src/trans/gpu/algor/buffered_allocator_mod.F90 @@ -12,7 +12,9 @@ MODULE BUFFERED_ALLOCATOR_MOD USE ABORT_TRANS_MOD, ONLY: ABORT_TRANS USE ISO_C_BINDING, ONLY: C_INT8_T, C_SIZE_T, C_LOC, C_F_POINTER USE GROWING_ALLOCATOR_MOD, ONLY: GROWING_ALLOCATION_TYPE +#ifdef ACCGPU USE OPENACC, ONLY: ACC_ASYNC_SYNC +#endif IMPLICIT NONE @@ -142,16 +144,28 @@ SUBROUTINE ASSIGN_PTR_FLOAT(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES, SET_VALUE IF (PRESENT(SET_STREAM)) THEN SET_STREAM_EFF = SET_STREAM ELSE +#ifdef ACCGPU SET_STREAM_EFF = ACC_ASYNC_SYNC +#endif +#ifdef OMPGPU +#endif ENDIF IF (SET_VALUE_EFF .AND. LENGTH_IN_BYTES > 0) THEN ! This option is turned off by default, but for experimentation we can turn it on. This is ! setting all bits to 1 (meaning NaN in floating point) +#ifdef ACCGPU !$ACC PARALLEL PRESENT(SRC) ASYNC(SET_STREAM_EFF) +#endif +#ifdef OMPGPU +#endif DO J=1_C_SIZE_T,LENGTH_IN_BYTES SRC(J) = -1 ENDDO +#ifdef ACCGPU !$ACC END PARALLEL +#endif +#ifdef OMPGPU +#endif ENDIF CALL C_F_POINTER(C_LOC(SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES-1)), DST, & & [C_SIZEOF(SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES-1))/C_SIZEOF(DST(0))]) @@ -180,17 +194,29 @@ SUBROUTINE ASSIGN_PTR_DOUBLE(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES, SET_VALU IF (PRESENT(SET_STREAM)) THEN SET_STREAM_EFF = SET_STREAM ELSE +#ifdef ACCGPU SET_STREAM_EFF = ACC_ASYNC_SYNC +#endif +#ifdef OMPGPU +#endif ENDIF IF (SET_VALUE_EFF .AND. LENGTH_IN_BYTES > 0) THEN ! This option is turned off by default, but for experimentation we can turn it on. This is ! setting all bits to 1 (meaning NaN in floating point) END_IN_BYTES=START_IN_BYTES+LENGTH_IN_BYTES-1 +#ifdef ACCGPU !$ACC PARALLEL PRESENT(SRC) ASYNC(SET_STREAM_EFF) +#endif +#ifdef OMPGPU +#endif DO J=1_C_SIZE_T,LENGTH_IN_BYTES SRC(J) = -1 ENDDO +#ifdef ACCGPU !$ACC END PARALLEL +#endif +#ifdef OMPGPU +#endif ENDIF CALL C_F_POINTER(C_LOC(SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES-1)), DST, & & [C_SIZEOF(SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES-1))/C_SIZEOF(DST(0))]) diff --git a/src/trans/gpu/algor/ext_acc.F90 b/src/trans/gpu/algor/ext_acc.F90 index a1f4db89..296c5902 100644 --- a/src/trans/gpu/algor/ext_acc.F90 +++ b/src/trans/gpu/algor/ext_acc.F90 @@ -18,13 +18,22 @@ module openacc_ext_type end module module openacc_ext use iso_c_binding, only: c_ptr, c_size_t, c_loc, c_sizeof +#ifdef ACCGPU use openacc, only: acc_handle_kind +#endif +#ifdef OMPGPU +#endif use openacc_ext_type, only: ext_acc_arr_desc implicit none private public :: ext_acc_pass, ext_acc_create, ext_acc_copyin, ext_acc_copyout, & - & ext_acc_delete, ext_acc_arr_desc, acc_handle_kind +#ifdef ACCGPU + & ext_acc_delete, ext_acc_arr_desc, acc_handle_kind +#endif +#ifdef OMPGPU + & ext_acc_delete, ext_acc_arr_desc +#endif type common_pointer_descr type(c_ptr) :: ptr @@ -247,94 +256,132 @@ function get_common_pointers(in_ptrs, out_ptrs) result(num_ranges) enddo end function subroutine ext_acc_create(ptrs, stream) +#ifdef ACCGPU use openacc, only: acc_async_sync +#endif use iso_fortran_env, only: int32 implicit none type(ext_acc_arr_desc), intent(in) :: ptrs(:) +#ifdef ACCGPU integer(acc_handle_kind), optional :: stream +#endif +#ifdef OMPGPU + integer(4), optional :: stream +#endif type(common_pointer_descr), allocatable :: common_ptrs(:) integer :: i, num_ranges integer(kind=int32), pointer :: pp(:) +#ifdef ACCGPU integer(acc_handle_kind) :: stream_act - if (present(stream)) then stream_act = stream else stream_act = acc_async_sync endif +#endif allocate(common_ptrs(size(ptrs))) num_ranges = get_common_pointers(ptrs, common_ptrs) do i = 1, num_ranges call c_f_pointer(common_ptrs(i)%ptr, pp, [common_ptrs(i)%sz/c_sizeof(pp(1))]) +#ifdef ACCGPU !$acc enter data create(pp) async(stream_act) +#endif +#ifdef OMPGPU +#endif enddo end subroutine subroutine ext_acc_copyin(ptrs, stream) +#ifdef ACCGPU use openacc, only: acc_async_sync +#endif implicit none type(ext_acc_arr_desc), intent(in) :: ptrs(:) +#ifdef ACCGPU integer(acc_handle_kind), optional :: stream +#endif +#ifdef OMPGPU + integer(4), optional :: stream +#endif type(common_pointer_descr), allocatable :: common_ptrs(:) integer :: i, num_ranges integer(4), pointer :: pp(:) - +#ifdef ACCGPU integer(acc_handle_kind) :: stream_act - if (present(stream)) then stream_act = stream else stream_act = acc_async_sync endif +#endif allocate(common_ptrs(size(ptrs))) num_ranges = get_common_pointers(ptrs, common_ptrs) do i = 1, num_ranges call c_f_pointer(common_ptrs(i)%ptr, pp, [common_ptrs(i)%sz/c_sizeof(pp(1))]) +#ifdef ACCGPU !$acc enter data copyin(pp) async(stream_act) +#endif +#ifdef OMPGPU +#endif enddo end subroutine subroutine ext_acc_copyout(ptrs, stream) +#ifdef ACCGPU use openacc, only: acc_async_sync +#endif implicit none type(ext_acc_arr_desc), intent(in) :: ptrs(:) +#ifdef ACCGPU integer(acc_handle_kind), optional :: stream - +#endif +#ifdef OMPGPU + integer(4), optional :: stream +#endif type(common_pointer_descr), allocatable :: common_ptrs(:) integer :: i, num_ranges integer(4), pointer :: pp(:) - +#ifdef ACCGPU integer(acc_handle_kind) :: stream_act - if (present(stream)) then stream_act = stream else stream_act = acc_async_sync - endif + endif +#endif allocate(common_ptrs(size(ptrs))) num_ranges = get_common_pointers(ptrs, common_ptrs) do i = 1, num_ranges call c_f_pointer(common_ptrs(i)%ptr, pp, [common_ptrs(i)%sz/c_sizeof(pp(1))]) +#ifdef ACCGPU !$acc exit data copyout(pp) async(stream_act) +#endif +#ifdef OMPGPU +#endif enddo end subroutine subroutine ext_acc_delete(ptrs, stream) +#ifdef ACCGPU use openacc, only: acc_async_sync +#endif implicit none type(ext_acc_arr_desc), intent(in) :: ptrs(:) +#ifdef ACCGPU integer(acc_handle_kind), optional :: stream - +#else + integer(4), optional :: stream +#endif type(common_pointer_descr), allocatable :: common_ptrs(:) integer :: i, num_ranges integer(4), pointer :: pp(:) - +#ifdef ACCGPU integer(acc_handle_kind) :: stream_act if (present(stream)) then @@ -342,12 +389,17 @@ subroutine ext_acc_delete(ptrs, stream) else stream_act = acc_async_sync endif +#endif allocate(common_ptrs(size(ptrs))) num_ranges = get_common_pointers(ptrs, common_ptrs) do i = 1, num_ranges call c_f_pointer(common_ptrs(i)%ptr, pp, [common_ptrs(i)%sz/c_sizeof(pp(1))]) +#ifdef ACCGPU !$acc exit data delete(pp) async(stream_act) +#endif +#ifdef OMPGPU +#endif enddo end subroutine end module diff --git a/src/trans/gpu/algor/growing_allocator_mod.F90 b/src/trans/gpu/algor/growing_allocator_mod.F90 index 844194e0..283db018 100644 --- a/src/trans/gpu/algor/growing_allocator_mod.F90 +++ b/src/trans/gpu/algor/growing_allocator_mod.F90 @@ -43,7 +43,7 @@ SUBROUTINE REALLOCATE_GROWING_ALLOCATION(ALLOC, SZ) IF (.NOT. ASSOCIATED(ALLOC%PTR)) THEN ALLOCATE(ALLOC%PTR(SZ)) -#ifdef OMPGU +#ifdef OMPGPU !$OMP TARGET ENTER DATA MAP(ALLOC:ALLOC%PTR) #endif #ifdef ACCGPU @@ -98,7 +98,12 @@ SUBROUTINE DESTROY_GROWING_ALLOCATOR(ALLOC) CALL ALLOC%FREE_FUNCS(I)%FUNC(ALLOC%PTR, & SIZE(ALLOC%PTR, 1, C_SIZE_T)) ENDDO +#ifdef OMPGPU + !$OMP TARGET EXIT DATA MAP(DELETE:ALLOC%PTR) +#endif +#ifdef ACCGPU !$ACC EXIT DATA DELETE(ALLOC%PTR) +#endif DEALLOCATE(ALLOC%PTR) NULLIFY(ALLOC%PTR) ENDIF diff --git a/src/trans/gpu/algor/hicblas_mod.F90 b/src/trans/gpu/algor/hicblas_mod.F90 index 528680f6..da0790ca 100644 --- a/src/trans/gpu/algor/hicblas_mod.F90 +++ b/src/trans/gpu/algor/hicblas_mod.F90 @@ -16,7 +16,11 @@ MODULE HICBLAS_MOD USE EC_PARKIND, ONLY: JPIM, JPRM, JPRD, JPIB USE GROWING_ALLOCATOR_MOD, ONLY: GROWING_ALLOCATION_TYPE +#ifdef ACCGPU USE OPENACC_LIB, ONLY: ACC_GET_HIP_STREAM +#endif +#ifdef OMPGPU +#endif IMPLICIT NONE @@ -147,7 +151,11 @@ SUBROUTINE HIP_DGEMM_BATCHED_OVERLOAD( & INTEGER(KIND=C_LONG) :: HIP_STREAM +#ifdef ACCGPU HIP_STREAM = INT(ACC_GET_HIP_STREAM(STREAM), C_LONG) +#endif +#ifdef OMPGPU +#endif #if defined(_CRAYFTN) !$ACC HOST_DATA USE_DEVICE(AARRAY,BARRAY,CARRAY) @@ -197,7 +205,11 @@ SUBROUTINE HIP_SGEMM_BATCHED_OVERLOAD( & INTEGER(KIND=C_LONG) :: HIP_STREAM +#ifdef ACCGPU HIP_STREAM = INT(ACC_GET_HIP_STREAM(STREAM), C_LONG) +#endif +#ifdef OMPGPU +#endif CALL HIP_SGEMM_BATCHED( & & TRANSA, TRANSB, & @@ -243,7 +255,11 @@ SUBROUTINE HIP_DGEMM_GROUPED_OVERLOAD( & INTEGER(KIND=C_LONG) :: HIP_STREAM +#ifdef ACCGPU HIP_STREAM = INT(ACC_GET_HIP_STREAM(STREAM), C_LONG) +#endif +#ifdef OMPGPU +#endif CALL HIP_DGEMM_GROUPED( & & RESOL_ID, BLAS_ID, TRANSA, TRANSB, & @@ -290,7 +306,11 @@ SUBROUTINE HIP_SGEMM_GROUPED_OVERLOAD(& INTEGER(KIND=C_LONG) :: HIP_STREAM +#ifdef ACCGPU HIP_STREAM = INT(ACC_GET_HIP_STREAM(STREAM), C_LONG) +#endif +#ifdef OMPGPU +#endif #if defined(_CRAYFTN) !$ACC HOST_DATA USE_DEVICE(AARRAY,BARRAY,CARRAY) diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index 055534cb..95188f37 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -124,7 +124,9 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION USE OPENACC_EXT, ONLY: EXT_ACC_ARR_DESC, EXT_ACC_PASS, EXT_ACC_CREATE, & & EXT_ACC_DELETE +#ifdef ACCGPU USE OPENACC, ONLY: ACC_HANDLE_KIND +#endif USE ABORT_TRANS_MOD, ONLY: ABORT_TRANS IMPLICIT NONE @@ -380,7 +382,15 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1 ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGP3B) ENDIF - IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_CREATE(ACC_POINTERS(1:ACC_POINTERS_CNT),STREAM=1_ACC_HANDLE_KIND) + + IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_CREATE(ACC_POINTERS(1:ACC_POINTERS_CNT), & +#ifdef ACCGPU + & STREAM=1_ACC_HANDLE_KIND) +#endif +#ifdef OMPGPU + & STREAM=1) +#endif + #ifdef ACCGPU !$ACC WAIT(1) #endif @@ -595,8 +605,12 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, !$ACC HOST_DATA USE_DEVICE(ZCOMBUFR,ZCOMBUFS) #endif #else +#ifdef OMPGPU +#endif +#ifdef ACCGPU !! this is safe-but-slow fallback for running without GPU-aware MPI !$ACC UPDATE HOST(ZCOMBUFS) IF(ISEND_COUNTS > 0) +#endif #endif ! Skip the own contribution because this is ok to overflow @@ -789,7 +803,14 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, !$ACC END DATA !PGPUV !$ACC END DATA !PGP #endif - IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_DELETE(ACC_POINTERS(1:ACC_POINTERS_CNT),STREAM=1_ACC_HANDLE_KIND) + + IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_DELETE(ACC_POINTERS(1:ACC_POINTERS_CNT), & +#ifdef ACCGPU + & STREAM=1_ACC_HANDLE_KIND) +#endif +#ifdef OMPGPU + & STREAM=1) +#endif IF (LHOOK) CALL DR_HOOK('TRGTOL',1,ZHOOK_HANDLE) END SUBROUTINE TRGTOL diff --git a/src/trans/gpu/internal/trltog_mod.F90 b/src/trans/gpu/internal/trltog_mod.F90 index d27cc74a..7df708b0 100755 --- a/src/trans/gpu/internal/trltog_mod.F90 +++ b/src/trans/gpu/internal/trltog_mod.F90 @@ -125,7 +125,9 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, USE ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF USE OPENACC_EXT, ONLY: EXT_ACC_ARR_DESC, EXT_ACC_PASS, EXT_ACC_CREATE, & & EXT_ACC_DELETE +#ifdef ACCGPU USE OPENACC, ONLY: ACC_HANDLE_KIND +#endif IMPLICIT NONE @@ -517,7 +519,15 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1 ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGP3B) ENDIF - IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_CREATE(ACC_POINTERS(1:ACC_POINTERS_CNT),STREAM=1_ACC_HANDLE_KIND) + + IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_CREATE(ACC_POINTERS(1:ACC_POINTERS_CNT), & +#ifdef ACCGPU + & STREAM=1_ACC_HANDLE_KIND) +#endif +#ifdef OMPGPU + & STREAM=1) +#endif + #ifdef OMPGPU #endif #ifdef ACCGPU @@ -711,8 +721,12 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, !$ACC HOST_DATA USE_DEVICE(ZCOMBUFS,ZCOMBUFR) #endif #else +#ifdef OMPGPU +#endif +#ifdef ACCGPU !! this is safe-but-slow fallback for running without GPU-aware MPI !$ACC UPDATE HOST(ZCOMBUFS) IF(ISEND_COUNTS > 0) +#endif #endif ! Skip the own contribution because this is ok to overflow @@ -766,8 +780,12 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, !$ACC END HOST_DATA #endif #else +#ifdef OMPGPU +#endif +#ifdef ACCGPU !! this is safe-but-slow fallback for running without GPU-aware MPI !$ACC UPDATE DEVICE(ZCOMBUFR) IF(IRECV_COUNTS > 0) +#endif #endif IF (LSYNC_TRANS) THEN @@ -922,7 +940,14 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, !$ACC UPDATE HOST(PGP3B) ASYNC(1) #endif ENDIF - IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_DELETE(ACC_POINTERS(1:ACC_POINTERS_CNT),STREAM=1_ACC_HANDLE_KIND) + IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_DELETE(ACC_POINTERS(1:ACC_POINTERS_CNT), & +#ifdef ACCGPU + & STREAM=1_ACC_HANDLE_KIND) +#endif +#ifdef OMPGPU + & STREAM=1) +#endif + IF (LSYNC_TRANS) THEN #ifdef ACCGPU !$ACC WAIT(1)