Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow GPU version to compile with OpenMP #180

Merged
merged 24 commits into from
Jan 21, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
b74250b
Refresh OpenMP CMake presence
samhatfield Nov 5, 2024
f4cf1c1
Refresh setup_trans.F90
samhatfield Nov 5, 2024
2ae9b5f
Allow gpnorm_trans_gpu.F90 to compile with OpenMP
samhatfield Nov 5, 2024
4e73316
Allow fsc_mod.F90 to compile with OpenMP
samhatfield Nov 28, 2024
4e914a2
Allow growing_allocator_mod.F90 to compile with OpenMP
samhatfield Nov 28, 2024
7de1003
Allow gpnorm_trans.F90 to compile with OpenMP
samhatfield Nov 28, 2024
b3efb4b
Allow ltinv_mod.F90 to compile with OpenMP
samhatfield Nov 28, 2024
2e08a7f
Allow prfi1b_mod.F90 to compile with OpenMP
samhatfield Nov 28, 2024
ba54d15
Allow spnsde_mod.F90 to compile with OpenMP
samhatfield Nov 28, 2024
d1e318c
Allow trgtol_mod.F90 to compile with OpenMP
samhatfield Nov 28, 2024
ab23a9d
Remove unnecessary mpif.h import
samhatfield Nov 28, 2024
287d97c
Allow vdtuv_mod.F90 to compile with OpenMP
samhatfield Nov 28, 2024
20e9a11
Allo uvtvd_mod.F90 to compile with OpenMP
samhatfield Dec 4, 2024
f85fc55
Allow updspb_mod.F90 to compile with OpenMP
samhatfield Jan 20, 2025
a71b64b
Tidy up gpu/SETUP_TRANS
samhatfield Dec 4, 2024
3002c3f
Properly handle LDUSEFFTW option for GPUs
samhatfield Dec 4, 2024
0cae7c0
Minor case change
samhatfield Jan 15, 2025
9324bed
Changes to get compliance with flang-new. This builds and runs on 8 H…
PaulMullowney Jan 16, 2025
2f6d01e
Add back FIRSTPRIVATE
samhatfield Jan 17, 2025
ce53d97
Adding missing ACCGPU protections
PaulMullowney Jan 16, 2025
c6bd239
slight fix
PaulMullowney Jan 16, 2025
079a464
Requested fixes
PaulMullowney Jan 17, 2025
26e33e2
Fix indentation
samhatfield Jan 21, 2025
388fd00
Add some more #ifdef ACCGPU guards
samhatfield Jan 21, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion src/programs/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ foreach( program ectrans-benchmark )
fiat
parkind_${prec}
trans_gpu_${prec}
OpenACC::OpenACC_Fortran
$<${HAVE_ACC}:OpenACC::OpenACC_Fortran>
$<${HAVE_OMP}:OpenMP::OpenMP_Fortran>
DEFINITIONS
VERSION="gpu"
$<$<COMPILE_LANG_AND_ID:Fortran,NVHPC>:USE_PINNED>
Expand Down
1 change: 1 addition & 0 deletions src/trans/gpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ ecbuild_add_library(
PUBLIC_LIBS fiat ectrans_common
PRIVATE_LIBS ${ECTRANS_GPU_HIP_LIBRARIES}
$<${HAVE_ACC}:OpenACC::OpenACC_Fortran>
$<${HAVE_OMP}:OpenMP::OpenMP_Fortran>
$<${HAVE_CUTLASS}:nvidia::cutlass::cutlass>
PRIVATE_DEFINITIONS ${GPU_RUNTIME}GPU ${GPU_OFFLOAD}GPU
$<${HAVE_CUTLASS}:USE_CUTLASS>
Expand Down
26 changes: 26 additions & 0 deletions src/trans/gpu/algor/buffered_allocator_mod.F90
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@ MODULE BUFFERED_ALLOCATOR_MOD
USE ABORT_TRANS_MOD, ONLY: ABORT_TRANS
USE ISO_C_BINDING, ONLY: C_INT8_T, C_SIZE_T, C_LOC, C_F_POINTER
USE GROWING_ALLOCATOR_MOD, ONLY: GROWING_ALLOCATION_TYPE
#ifdef ACCGPU
USE OPENACC, ONLY: ACC_ASYNC_SYNC
#endif

IMPLICIT NONE

Expand Down Expand Up @@ -142,16 +144,28 @@ SUBROUTINE ASSIGN_PTR_FLOAT(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES, SET_VALUE
IF (PRESENT(SET_STREAM)) THEN
SET_STREAM_EFF = SET_STREAM
ELSE
#ifdef ACCGPU
SET_STREAM_EFF = ACC_ASYNC_SYNC
#endif
#ifdef OMPGPU
#endif
ENDIF
IF (SET_VALUE_EFF .AND. LENGTH_IN_BYTES > 0) THEN
! This option is turned off by default, but for experimentation we can turn it on. This is
! setting all bits to 1 (meaning NaN in floating point)
#ifdef ACCGPU
!$ACC PARALLEL PRESENT(SRC) ASYNC(SET_STREAM_EFF)
#endif
#ifdef OMPGPU
#endif
DO J=1_C_SIZE_T,LENGTH_IN_BYTES
SRC(J) = -1
ENDDO
#ifdef ACCGPU
!$ACC END PARALLEL
#endif
#ifdef OMPGPU
#endif
ENDIF
CALL C_F_POINTER(C_LOC(SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES-1)), DST, &
& [C_SIZEOF(SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES-1))/C_SIZEOF(DST(0))])
Expand Down Expand Up @@ -180,17 +194,29 @@ SUBROUTINE ASSIGN_PTR_DOUBLE(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES, SET_VALU
IF (PRESENT(SET_STREAM)) THEN
SET_STREAM_EFF = SET_STREAM
ELSE
#ifdef ACCGPU
SET_STREAM_EFF = ACC_ASYNC_SYNC
#endif
#ifdef OMPGPU
#endif
ENDIF
IF (SET_VALUE_EFF .AND. LENGTH_IN_BYTES > 0) THEN
! This option is turned off by default, but for experimentation we can turn it on. This is
! setting all bits to 1 (meaning NaN in floating point)
END_IN_BYTES=START_IN_BYTES+LENGTH_IN_BYTES-1
#ifdef ACCGPU
!$ACC PARALLEL PRESENT(SRC) ASYNC(SET_STREAM_EFF)
#endif
#ifdef OMPGPU
#endif
DO J=1_C_SIZE_T,LENGTH_IN_BYTES
SRC(J) = -1
ENDDO
#ifdef ACCGPU
!$ACC END PARALLEL
#endif
#ifdef OMPGPU
#endif
ENDIF
CALL C_F_POINTER(C_LOC(SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES-1)), DST, &
& [C_SIZEOF(SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES-1))/C_SIZEOF(DST(0))])
Expand Down
78 changes: 65 additions & 13 deletions src/trans/gpu/algor/ext_acc.F90
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,22 @@ module openacc_ext_type
end module
module openacc_ext
use iso_c_binding, only: c_ptr, c_size_t, c_loc, c_sizeof
#ifdef ACCGPU
use openacc, only: acc_handle_kind
#endif
#ifdef OMPGPU
#endif
use openacc_ext_type, only: ext_acc_arr_desc
implicit none

private
public :: ext_acc_pass, ext_acc_create, ext_acc_copyin, ext_acc_copyout, &
& ext_acc_delete, ext_acc_arr_desc, acc_handle_kind
#ifdef ACCGPU
& ext_acc_delete, ext_acc_arr_desc, acc_handle_kind
#endif
#ifdef OMPGPU
& ext_acc_delete, ext_acc_arr_desc
#endif

type common_pointer_descr
type(c_ptr) :: ptr
Expand Down Expand Up @@ -247,107 +256,150 @@ function get_common_pointers(in_ptrs, out_ptrs) result(num_ranges)
enddo
end function
subroutine ext_acc_create(ptrs, stream)
#ifdef ACCGPU
use openacc, only: acc_async_sync
#endif
use iso_fortran_env, only: int32
implicit none
type(ext_acc_arr_desc), intent(in) :: ptrs(:)
#ifdef ACCGPU
integer(acc_handle_kind), optional :: stream
#endif
#ifdef OMPGPU
integer(4), optional :: stream
#endif

type(common_pointer_descr), allocatable :: common_ptrs(:)

integer :: i, num_ranges
integer(kind=int32), pointer :: pp(:)
#ifdef ACCGPU
integer(acc_handle_kind) :: stream_act

if (present(stream)) then
stream_act = stream
else
stream_act = acc_async_sync
endif
#endif
allocate(common_ptrs(size(ptrs)))
num_ranges = get_common_pointers(ptrs, common_ptrs)

do i = 1, num_ranges
call c_f_pointer(common_ptrs(i)%ptr, pp, shape=[common_ptrs(i)%sz/c_sizeof(pp(1))])
call c_f_pointer(common_ptrs(i)%ptr, pp, [common_ptrs(i)%sz/c_sizeof(pp(1))])
#ifdef ACCGPU
!$acc enter data create(pp) async(stream_act)
#endif
#ifdef OMPGPU
#endif
enddo
end subroutine
subroutine ext_acc_copyin(ptrs, stream)
#ifdef ACCGPU
use openacc, only: acc_async_sync
#endif
implicit none
type(ext_acc_arr_desc), intent(in) :: ptrs(:)
#ifdef ACCGPU
integer(acc_handle_kind), optional :: stream
#endif
#ifdef OMPGPU
integer(4), optional :: stream
#endif

type(common_pointer_descr), allocatable :: common_ptrs(:)

integer :: i, num_ranges
integer(4), pointer :: pp(:)

#ifdef ACCGPU
integer(acc_handle_kind) :: stream_act

if (present(stream)) then
stream_act = stream
else
stream_act = acc_async_sync
endif
#endif
allocate(common_ptrs(size(ptrs)))
num_ranges = get_common_pointers(ptrs, common_ptrs)

do i = 1, num_ranges
call c_f_pointer(common_ptrs(i)%ptr, pp, shape=[common_ptrs(i)%sz/c_sizeof(pp(1))])
call c_f_pointer(common_ptrs(i)%ptr, pp, [common_ptrs(i)%sz/c_sizeof(pp(1))])
#ifdef ACCGPU
!$acc enter data copyin(pp) async(stream_act)
#endif
#ifdef OMPGPU
#endif
enddo
end subroutine
subroutine ext_acc_copyout(ptrs, stream)
#ifdef ACCGPU
use openacc, only: acc_async_sync
#endif
implicit none
type(ext_acc_arr_desc), intent(in) :: ptrs(:)
#ifdef ACCGPU
integer(acc_handle_kind), optional :: stream

#endif
#ifdef OMPGPU
integer(4), optional :: stream
#endif
type(common_pointer_descr), allocatable :: common_ptrs(:)

integer :: i, num_ranges
integer(4), pointer :: pp(:)

#ifdef ACCGPU
integer(acc_handle_kind) :: stream_act

if (present(stream)) then
stream_act = stream
else
stream_act = acc_async_sync
endif
#endif
allocate(common_ptrs(size(ptrs)))
num_ranges = get_common_pointers(ptrs, common_ptrs)

do i = 1, num_ranges
call c_f_pointer(common_ptrs(i)%ptr, pp, shape=[common_ptrs(i)%sz/c_sizeof(pp(1))])
call c_f_pointer(common_ptrs(i)%ptr, pp, [common_ptrs(i)%sz/c_sizeof(pp(1))])
#ifdef ACCGPU
!$acc exit data copyout(pp) async(stream_act)
#endif
#ifdef OMPGPU
#endif
enddo
end subroutine
subroutine ext_acc_delete(ptrs, stream)
#ifdef ACCGPU
use openacc, only: acc_async_sync
#endif
implicit none
type(ext_acc_arr_desc), intent(in) :: ptrs(:)
#ifdef ACCGPU
integer(acc_handle_kind), optional :: stream

#else
integer(4), optional :: stream
#endif
type(common_pointer_descr), allocatable :: common_ptrs(:)

integer :: i, num_ranges
integer(4), pointer :: pp(:)

#ifdef ACCGPU
integer(acc_handle_kind) :: stream_act

if (present(stream)) then
stream_act = stream
else
stream_act = acc_async_sync
endif
#endif
allocate(common_ptrs(size(ptrs)))
num_ranges = get_common_pointers(ptrs, common_ptrs)

do i = 1, num_ranges
call c_f_pointer(common_ptrs(i)%ptr, pp, shape=[common_ptrs(i)%sz/c_sizeof(pp(1))])
call c_f_pointer(common_ptrs(i)%ptr, pp, [common_ptrs(i)%sz/c_sizeof(pp(1))])
#ifdef ACCGPU
!$acc exit data delete(pp) async(stream_act)
#endif
#ifdef OMPGPU
#endif
enddo
end subroutine
end module
10 changes: 10 additions & 0 deletions src/trans/gpu/algor/growing_allocator_mod.F90
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,12 @@ SUBROUTINE REALLOCATE_GROWING_ALLOCATION(ALLOC, SZ)

IF (.NOT. ASSOCIATED(ALLOC%PTR)) THEN
ALLOCATE(ALLOC%PTR(SZ))
#ifdef OMPGPU
!$OMP TARGET ENTER DATA MAP(ALLOC:ALLOC%PTR)
#endif
#ifdef ACCGPU
!$ACC ENTER DATA CREATE(ALLOC%PTR)
#endif
ALLOC%FREE_FUNCS_SZ = 0
ENDIF
END SUBROUTINE
Expand Down Expand Up @@ -93,7 +98,12 @@ SUBROUTINE DESTROY_GROWING_ALLOCATOR(ALLOC)
CALL ALLOC%FREE_FUNCS(I)%FUNC(ALLOC%PTR, &
SIZE(ALLOC%PTR, 1, C_SIZE_T))
ENDDO
#ifdef OMPGPU
!$OMP TARGET EXIT DATA MAP(DELETE:ALLOC%PTR)
#endif
#ifdef ACCGPU
!$ACC EXIT DATA DELETE(ALLOC%PTR)
#endif
DEALLOCATE(ALLOC%PTR)
NULLIFY(ALLOC%PTR)
ENDIF
Expand Down
Loading
Loading