Skip to content

Commit

Permalink
Merge pull request #16 from PaulMullowney/refresh_openmp_missing_accgpu
Browse files Browse the repository at this point in the history
Adding missing ACCGPU protections
  • Loading branch information
samhatfield authored Jan 20, 2025
2 parents 85bfeb1 + 709ec0f commit 1e83795
Show file tree
Hide file tree
Showing 6 changed files with 164 additions and 15 deletions.
26 changes: 26 additions & 0 deletions src/trans/gpu/algor/buffered_allocator_mod.F90
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@ MODULE BUFFERED_ALLOCATOR_MOD
USE ABORT_TRANS_MOD, ONLY: ABORT_TRANS
USE ISO_C_BINDING, ONLY: C_INT8_T, C_SIZE_T, C_LOC, C_F_POINTER
USE GROWING_ALLOCATOR_MOD, ONLY: GROWING_ALLOCATION_TYPE
#ifdef ACCGPU
USE OPENACC, ONLY: ACC_ASYNC_SYNC
#endif

IMPLICIT NONE

Expand Down Expand Up @@ -142,16 +144,28 @@ SUBROUTINE ASSIGN_PTR_FLOAT(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES, SET_VALUE
IF (PRESENT(SET_STREAM)) THEN
SET_STREAM_EFF = SET_STREAM
ELSE
#ifdef ACCGPU
SET_STREAM_EFF = ACC_ASYNC_SYNC
#endif
#ifdef OMPGPU
#endif
ENDIF
IF (SET_VALUE_EFF .AND. LENGTH_IN_BYTES > 0) THEN
! This option is turned off by default, but for experimentation we can turn it on. This is
! setting all bits to 1 (meaning NaN in floating point)
#ifdef ACCGPU
!$ACC PARALLEL PRESENT(SRC) ASYNC(SET_STREAM_EFF)
#endif
#ifdef OMPGPU
#endif
DO J=1_C_SIZE_T,LENGTH_IN_BYTES
SRC(J) = -1
ENDDO
#ifdef ACCGPU
!$ACC END PARALLEL
#endif
#ifdef OMPGPU
#endif
ENDIF
CALL C_F_POINTER(C_LOC(SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES-1)), DST, &
& [C_SIZEOF(SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES-1))/C_SIZEOF(DST(0))])
Expand Down Expand Up @@ -180,17 +194,29 @@ SUBROUTINE ASSIGN_PTR_DOUBLE(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES, SET_VALU
IF (PRESENT(SET_STREAM)) THEN
SET_STREAM_EFF = SET_STREAM
ELSE
#ifdef ACCGPU
SET_STREAM_EFF = ACC_ASYNC_SYNC
#endif
#ifdef OMPGPU
#endif
ENDIF
IF (SET_VALUE_EFF .AND. LENGTH_IN_BYTES > 0) THEN
! This option is turned off by default, but for experimentation we can turn it on. This is
! setting all bits to 1 (meaning NaN in floating point)
END_IN_BYTES=START_IN_BYTES+LENGTH_IN_BYTES-1
#ifdef ACCGPU
!$ACC PARALLEL PRESENT(SRC) ASYNC(SET_STREAM_EFF)
#endif
#ifdef OMPGPU
#endif
DO J=1_C_SIZE_T,LENGTH_IN_BYTES
SRC(J) = -1
ENDDO
#ifdef ACCGPU
!$ACC END PARALLEL
#endif
#ifdef OMPGPU
#endif
ENDIF
CALL C_F_POINTER(C_LOC(SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES-1)), DST, &
& [C_SIZEOF(SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES-1))/C_SIZEOF(DST(0))])
Expand Down
72 changes: 62 additions & 10 deletions src/trans/gpu/algor/ext_acc.F90
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,22 @@ module openacc_ext_type
end module
module openacc_ext
use iso_c_binding, only: c_ptr, c_size_t, c_loc, c_sizeof
#ifdef ACCGPU
use openacc, only: acc_handle_kind
#endif
#ifdef OMPGPU
#endif
use openacc_ext_type, only: ext_acc_arr_desc
implicit none

private
public :: ext_acc_pass, ext_acc_create, ext_acc_copyin, ext_acc_copyout, &
& ext_acc_delete, ext_acc_arr_desc, acc_handle_kind
#ifdef ACCGPU
& ext_acc_delete, ext_acc_arr_desc, acc_handle_kind
#endif
#ifdef OMPGPU
& ext_acc_delete, ext_acc_arr_desc
#endif

type common_pointer_descr
type(c_ptr) :: ptr
Expand Down Expand Up @@ -247,107 +256,150 @@ function get_common_pointers(in_ptrs, out_ptrs) result(num_ranges)
enddo
end function
subroutine ext_acc_create(ptrs, stream)
  ! Create (without copying) device storage for every memory range that
  ! get_common_pointers derives from the array descriptors in ptrs.
  !
  ! Arguments:
  !   ptrs   - array descriptors to create on the device.
  !   stream - optional async queue. With ACCGPU it defaults to
  !            acc_async_sync when absent. Without ACCGPU it is accepted
  !            only for interface compatibility and is not used.
  !
  ! Note: no OpenMP offload mapping is implemented in this routine. Under
  ! OMPGPU the loop only rebuilds the Fortran pointer for each range.
#ifdef ACCGPU
  use openacc, only: acc_async_sync
#endif
  ! c_f_pointer is not part of the module-level iso_c_binding only-list,
  ! so bring it into scope here instead of relying on an implicit interface.
  use iso_c_binding, only: c_f_pointer
  use iso_fortran_env, only: int32
  implicit none
  type(ext_acc_arr_desc), intent(in) :: ptrs(:)
#ifdef ACCGPU
  integer(acc_handle_kind), intent(in), optional :: stream
#else
  ! Same fallback declaration as ext_acc_delete, so that the dummy is
  ! always declared, whichever offload macro is (or is not) defined.
  integer(4), intent(in), optional :: stream
#endif

  type(common_pointer_descr), allocatable :: common_ptrs(:)

  integer :: i, num_ranges
  ! Scratch view of one range as 32-bit integers. Only its address and
  ! extent matter to the data directive.
  integer(kind=int32), pointer :: pp(:)
#ifdef ACCGPU
  integer(acc_handle_kind) :: stream_act

  if (present(stream)) then
    stream_act = stream
  else
    stream_act = acc_async_sync
  endif
#endif
  allocate(common_ptrs(size(ptrs)))
  num_ranges = get_common_pointers(ptrs, common_ptrs)

  do i = 1, num_ranges
    ! sz is divided by the element size to get an element count
    ! (sz is presumably a byte count - confirm against get_common_pointers).
    call c_f_pointer(common_ptrs(i)%ptr, pp, [common_ptrs(i)%sz/c_sizeof(pp(1))])
#ifdef ACCGPU
    !$acc enter data create(pp) async(stream_act)
#endif
#ifdef OMPGPU
    ! Placeholder: no OpenMP target mapping yet.
#endif
  enddo
end subroutine ext_acc_create
subroutine ext_acc_copyin(ptrs, stream)
  ! Copy to the device every memory range that get_common_pointers derives
  ! from the array descriptors in ptrs.
  !
  ! Arguments:
  !   ptrs   - array descriptors to copy to the device.
  !   stream - optional async queue. With ACCGPU it defaults to
  !            acc_async_sync when absent. Without ACCGPU it is accepted
  !            only for interface compatibility and is not used.
  !
  ! Note: no OpenMP offload mapping is implemented in this routine. Under
  ! OMPGPU the loop only rebuilds the Fortran pointer for each range.
#ifdef ACCGPU
  use openacc, only: acc_async_sync
#endif
  ! c_f_pointer is not part of the module-level iso_c_binding only-list,
  ! so bring it into scope here instead of relying on an implicit interface.
  use iso_c_binding, only: c_f_pointer
  use iso_fortran_env, only: int32
  implicit none
  type(ext_acc_arr_desc), intent(in) :: ptrs(:)
#ifdef ACCGPU
  integer(acc_handle_kind), intent(in), optional :: stream
#else
  ! Same fallback declaration as ext_acc_delete, so that the dummy is
  ! always declared, whichever offload macro is (or is not) defined.
  integer(4), intent(in), optional :: stream
#endif

  type(common_pointer_descr), allocatable :: common_ptrs(:)

  integer :: i, num_ranges
  ! Scratch view of one range as 32-bit integers (int32, matching
  ! ext_acc_create, rather than the non-portable kind number 4).
  integer(kind=int32), pointer :: pp(:)

#ifdef ACCGPU
  integer(acc_handle_kind) :: stream_act

  if (present(stream)) then
    stream_act = stream
  else
    stream_act = acc_async_sync
  endif
#endif
  allocate(common_ptrs(size(ptrs)))
  num_ranges = get_common_pointers(ptrs, common_ptrs)

  do i = 1, num_ranges
    ! sz is divided by the element size to get an element count
    ! (sz is presumably a byte count - confirm against get_common_pointers).
    call c_f_pointer(common_ptrs(i)%ptr, pp, [common_ptrs(i)%sz/c_sizeof(pp(1))])
#ifdef ACCGPU
    !$acc enter data copyin(pp) async(stream_act)
#endif
#ifdef OMPGPU
    ! Placeholder: no OpenMP target mapping yet.
#endif
  enddo
end subroutine ext_acc_copyin
subroutine ext_acc_copyout(ptrs, stream)
  ! Copy back from the device (exit data copyout) every memory range that
  ! get_common_pointers derives from the array descriptors in ptrs.
  !
  ! Arguments:
  !   ptrs   - array descriptors to copy back to the host.
  !   stream - optional async queue. With ACCGPU it defaults to
  !            acc_async_sync when absent. Without ACCGPU it is accepted
  !            only for interface compatibility and is not used.
  !
  ! Note: no OpenMP offload mapping is implemented in this routine. Under
  ! OMPGPU the loop only rebuilds the Fortran pointer for each range.
#ifdef ACCGPU
  use openacc, only: acc_async_sync
#endif
  ! c_f_pointer is not part of the module-level iso_c_binding only-list,
  ! so bring it into scope here instead of relying on an implicit interface.
  use iso_c_binding, only: c_f_pointer
  use iso_fortran_env, only: int32
  implicit none
  type(ext_acc_arr_desc), intent(in) :: ptrs(:)
#ifdef ACCGPU
  integer(acc_handle_kind), intent(in), optional :: stream
#else
  ! Same fallback declaration as ext_acc_delete, so that the dummy is
  ! always declared, whichever offload macro is (or is not) defined.
  integer(4), intent(in), optional :: stream
#endif

  type(common_pointer_descr), allocatable :: common_ptrs(:)

  integer :: i, num_ranges
  ! Scratch view of one range as 32-bit integers (int32, matching
  ! ext_acc_create, rather than the non-portable kind number 4).
  integer(kind=int32), pointer :: pp(:)

#ifdef ACCGPU
  integer(acc_handle_kind) :: stream_act

  ! Exactly one endif closes this construct. A second one would be a
  ! syntax error.
  if (present(stream)) then
    stream_act = stream
  else
    stream_act = acc_async_sync
  endif
#endif
  allocate(common_ptrs(size(ptrs)))
  num_ranges = get_common_pointers(ptrs, common_ptrs)

  do i = 1, num_ranges
    ! sz is divided by the element size to get an element count
    ! (sz is presumably a byte count - confirm against get_common_pointers).
    call c_f_pointer(common_ptrs(i)%ptr, pp, [common_ptrs(i)%sz/c_sizeof(pp(1))])
#ifdef ACCGPU
    !$acc exit data copyout(pp) async(stream_act)
#endif
#ifdef OMPGPU
    ! Placeholder: no OpenMP target mapping yet.
#endif
  enddo
end subroutine ext_acc_copyout
subroutine ext_acc_delete(ptrs, stream)
  ! Remove from the device (exit data delete, no copy back) every memory
  ! range that get_common_pointers derives from the array descriptors in ptrs.
  !
  ! Arguments:
  !   ptrs   - array descriptors whose device copies are deleted.
  !   stream - optional async queue. With ACCGPU it defaults to
  !            acc_async_sync when absent. Without ACCGPU it is accepted
  !            only for interface compatibility and is not used.
  !
  ! Note: no OpenMP offload mapping is implemented in this routine. Under
  ! OMPGPU the loop only rebuilds the Fortran pointer for each range.
#ifdef ACCGPU
  use openacc, only: acc_async_sync
#endif
  ! c_f_pointer is not part of the module-level iso_c_binding only-list,
  ! so bring it into scope here instead of relying on an implicit interface.
  use iso_c_binding, only: c_f_pointer
  use iso_fortran_env, only: int32
  implicit none
  type(ext_acc_arr_desc), intent(in) :: ptrs(:)
#ifdef ACCGPU
  integer(acc_handle_kind), intent(in), optional :: stream
#else
  integer(4), intent(in), optional :: stream
#endif

  type(common_pointer_descr), allocatable :: common_ptrs(:)

  integer :: i, num_ranges
  ! Scratch view of one range as 32-bit integers (int32, matching
  ! ext_acc_create, rather than the non-portable kind number 4).
  integer(kind=int32), pointer :: pp(:)

#ifdef ACCGPU
  integer(acc_handle_kind) :: stream_act

  if (present(stream)) then
    stream_act = stream
  else
    stream_act = acc_async_sync
  endif
#endif
  allocate(common_ptrs(size(ptrs)))
  num_ranges = get_common_pointers(ptrs, common_ptrs)

  do i = 1, num_ranges
    ! sz is divided by the element size to get an element count
    ! (sz is presumably a byte count - confirm against get_common_pointers).
    call c_f_pointer(common_ptrs(i)%ptr, pp, [common_ptrs(i)%sz/c_sizeof(pp(1))])
#ifdef ACCGPU
    !$acc exit data delete(pp) async(stream_act)
#endif
#ifdef OMPGPU
    ! Placeholder: no OpenMP target mapping yet.
#endif
  enddo
end subroutine ext_acc_delete
end module
7 changes: 6 additions & 1 deletion src/trans/gpu/algor/growing_allocator_mod.F90
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ SUBROUTINE REALLOCATE_GROWING_ALLOCATION(ALLOC, SZ)

IF (.NOT. ASSOCIATED(ALLOC%PTR)) THEN
ALLOCATE(ALLOC%PTR(SZ))
#ifdef OMPGU
#ifdef OMPGPU
!$OMP TARGET ENTER DATA MAP(ALLOC:ALLOC%PTR)
#endif
#ifdef ACCGPU
Expand Down Expand Up @@ -98,7 +98,12 @@ SUBROUTINE DESTROY_GROWING_ALLOCATOR(ALLOC)
CALL ALLOC%FREE_FUNCS(I)%FUNC(ALLOC%PTR, &
SIZE(ALLOC%PTR, 1, C_SIZE_T))
ENDDO
#ifdef OMPGPU
!$OMP TARGET EXIT DATA MAP(DELETE:ALLOC%PTR)
#endif
#ifdef ACCGPU
!$ACC EXIT DATA DELETE(ALLOC%PTR)
#endif
DEALLOCATE(ALLOC%PTR)
NULLIFY(ALLOC%PTR)
ENDIF
Expand Down
20 changes: 20 additions & 0 deletions src/trans/gpu/algor/hicblas_mod.F90
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,11 @@ MODULE HICBLAS_MOD

USE EC_PARKIND, ONLY: JPIM, JPRM, JPRD, JPIB
USE GROWING_ALLOCATOR_MOD, ONLY: GROWING_ALLOCATION_TYPE
#ifdef ACCGPU
USE OPENACC_LIB, ONLY: ACC_GET_HIP_STREAM
#endif
#ifdef OMPGPU
#endif

IMPLICIT NONE

Expand Down Expand Up @@ -147,7 +151,11 @@ SUBROUTINE HIP_DGEMM_BATCHED_OVERLOAD( &

INTEGER(KIND=C_LONG) :: HIP_STREAM

#ifdef ACCGPU
HIP_STREAM = INT(ACC_GET_HIP_STREAM(STREAM), C_LONG)
#endif
#ifdef OMPGPU
#endif

#if defined(_CRAYFTN)
!$ACC HOST_DATA USE_DEVICE(AARRAY,BARRAY,CARRAY)
Expand Down Expand Up @@ -197,7 +205,11 @@ SUBROUTINE HIP_SGEMM_BATCHED_OVERLOAD( &

INTEGER(KIND=C_LONG) :: HIP_STREAM

#ifdef ACCGPU
HIP_STREAM = INT(ACC_GET_HIP_STREAM(STREAM), C_LONG)
#endif
#ifdef OMPGPU
#endif

CALL HIP_SGEMM_BATCHED( &
& TRANSA, TRANSB, &
Expand Down Expand Up @@ -243,7 +255,11 @@ SUBROUTINE HIP_DGEMM_GROUPED_OVERLOAD( &

INTEGER(KIND=C_LONG) :: HIP_STREAM

#ifdef ACCGPU
HIP_STREAM = INT(ACC_GET_HIP_STREAM(STREAM), C_LONG)
#endif
#ifdef OMPGPU
#endif

CALL HIP_DGEMM_GROUPED( &
& RESOL_ID, BLAS_ID, TRANSA, TRANSB, &
Expand Down Expand Up @@ -290,7 +306,11 @@ SUBROUTINE HIP_SGEMM_GROUPED_OVERLOAD(&

INTEGER(KIND=C_LONG) :: HIP_STREAM

#ifdef ACCGPU
HIP_STREAM = INT(ACC_GET_HIP_STREAM(STREAM), C_LONG)
#endif
#ifdef OMPGPU
#endif

#if defined(_CRAYFTN)
!$ACC HOST_DATA USE_DEVICE(AARRAY,BARRAY,CARRAY)
Expand Down
25 changes: 23 additions & 2 deletions src/trans/gpu/internal/trgtol_mod.F90
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,9 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,
USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION
USE OPENACC_EXT, ONLY: EXT_ACC_ARR_DESC, EXT_ACC_PASS, EXT_ACC_CREATE, &
& EXT_ACC_DELETE
#ifdef ACCGPU
USE OPENACC, ONLY: ACC_HANDLE_KIND
#endif
USE ABORT_TRANS_MOD, ONLY: ABORT_TRANS

IMPLICIT NONE
Expand Down Expand Up @@ -380,7 +382,15 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,
ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1
ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGP3B)
ENDIF
IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_CREATE(ACC_POINTERS(1:ACC_POINTERS_CNT),STREAM=1_ACC_HANDLE_KIND)

IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_CREATE(ACC_POINTERS(1:ACC_POINTERS_CNT), &
#ifdef ACCGPU
& STREAM=1_ACC_HANDLE_KIND)
#endif
#ifdef OMPGPU
& STREAM=1)
#endif

#ifdef ACCGPU
!$ACC WAIT(1)
#endif
Expand Down Expand Up @@ -595,8 +605,12 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,
!$ACC HOST_DATA USE_DEVICE(ZCOMBUFR,ZCOMBUFS)
#endif
#else
#ifdef OMPGPU
#endif
#ifdef ACCGPU
!! this is safe-but-slow fallback for running without GPU-aware MPI
!$ACC UPDATE HOST(ZCOMBUFS) IF(ISEND_COUNTS > 0)
#endif
#endif

! Skip the own contribution because this is ok to overflow
Expand Down Expand Up @@ -789,7 +803,14 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,
!$ACC END DATA !PGPUV
!$ACC END DATA !PGP
#endif
IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_DELETE(ACC_POINTERS(1:ACC_POINTERS_CNT),STREAM=1_ACC_HANDLE_KIND)

IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_DELETE(ACC_POINTERS(1:ACC_POINTERS_CNT), &
#ifdef ACCGPU
& STREAM=1_ACC_HANDLE_KIND)
#endif
#ifdef OMPGPU
& STREAM=1)
#endif

IF (LHOOK) CALL DR_HOOK('TRGTOL',1,ZHOOK_HANDLE)
END SUBROUTINE TRGTOL
Expand Down
Loading

0 comments on commit 1e83795

Please sign in to comment.