From b74250be621e518d0c59d9856ef1ae2ca9ee0396 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Tue, 5 Nov 2024 16:28:57 +0200 Subject: [PATCH 01/24] Refresh OpenMP CMake presence Now we can at least run the configure with OpenMP enabled. --- src/programs/CMakeLists.txt | 3 ++- src/trans/gpu/CMakeLists.txt | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/programs/CMakeLists.txt b/src/programs/CMakeLists.txt index d7f6013cf..29d58b9c6 100644 --- a/src/programs/CMakeLists.txt +++ b/src/programs/CMakeLists.txt @@ -35,7 +35,8 @@ foreach( program ectrans-benchmark ) fiat parkind_${prec} trans_gpu_${prec} - OpenACC::OpenACC_Fortran + $<${HAVE_ACC}:OpenACC::OpenACC_Fortran> + $<${HAVE_OMP}:OpenMP::OpenMP_Fortran> DEFINITIONS VERSION="gpu" $<$:USE_PINNED> diff --git a/src/trans/gpu/CMakeLists.txt b/src/trans/gpu/CMakeLists.txt index af4e07e57..97579f52b 100644 --- a/src/trans/gpu/CMakeLists.txt +++ b/src/trans/gpu/CMakeLists.txt @@ -58,6 +58,7 @@ ecbuild_add_library( PUBLIC_LIBS fiat ectrans_common PRIVATE_LIBS ${ECTRANS_GPU_HIP_LIBRARIES} $<${HAVE_ACC}:OpenACC::OpenACC_Fortran> + $<${HAVE_OMP}:OpenMP::OpenMP_Fortran> $<${HAVE_CUTLASS}:nvidia::cutlass::cutlass> PRIVATE_DEFINITIONS ${GPU_RUNTIME}GPU ${GPU_OFFLOAD}GPU $<${HAVE_CUTLASS}:USE_CUTLASS> From f4cf1c1fa042290d3429d9a67235761f18543266 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Tue, 5 Nov 2024 16:48:43 +0200 Subject: [PATCH 02/24] Refresh setup_trans.F90 --- src/trans/gpu/external/setup_trans.F90 | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/trans/gpu/external/setup_trans.F90 b/src/trans/gpu/external/setup_trans.F90 index d5a2d63ac..6b5398197 100755 --- a/src/trans/gpu/external/setup_trans.F90 +++ b/src/trans/gpu/external/setup_trans.F90 @@ -130,8 +130,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& & ACC_SET_DEVICE_NUM, ACC_GET_DEVICE_NUM #endif #ifdef OMPGPU -! TODO: add OMP equivalents to ACC library routines -!USE OMP_LIB +USE OMP_LIB #endif !endif INTERFACE From 2ae9b5fb23edd959a151787c5ae8226149168560 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Tue, 5 Nov 2024 17:01:33 +0200 Subject: [PATCH 03/24] Allow gpnorm_trans_gpu.F90 to compile with OpenMP --- src/trans/gpu/external/gpnorm_trans_gpu.F90 | 53 +++++++++++---------- 1 file changed, 28 insertions(+), 25 deletions(-) diff --git a/src/trans/gpu/external/gpnorm_trans_gpu.F90 b/src/trans/gpu/external/gpnorm_trans_gpu.F90 index e6b280ca0..23f6e0f70 100755 --- a/src/trans/gpu/external/gpnorm_trans_gpu.F90 +++ b/src/trans/gpu/external/gpnorm_trans_gpu.F90 @@ -162,12 +162,18 @@ SUBROUTINE GPNORM_TRANS_GPU(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) ZMAXGPN = 0._JPRBT #ifdef ACCGPU !$ACC ENTER DATA COPYIN(ZAVE,ZMINGL,ZMAXGL,ZMINGPN,ZMAXGPN) +#endif +#ifdef OMPGPU + !$OMP TARGET ENTER DATA MAP(TO:ZAVE,ZMINGL,ZMAXGL,ZMINGPN,ZMAXGPN) #endif IF (.NOT. ALLOCATED(ZGTF)) THEN ALLOCATE(ZGTF(IF_FS*D%NLENGTF)) WRITE(NOUT,*)'ZGTF :',SIZE(ZGTF) #ifdef ACCGPU !$ACC ENTER DATA CREATE(ZGTF) +#endif +#ifdef OMPGPU + !$OMP TARGET ENTER DATA MAP(ALLOC:ZGTF) #endif ENDIF ENDIF @@ -203,34 +209,31 @@ SUBROUTINE GPNORM_TRANS_GPU(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) IF( IF_FS > 0 )THEN #ifdef ACCGPU - !$ACC DATA & - !$ACC& COPY(D,D_NSTAGTF,D_NPTRLS,G_NLOEN,F,F_RW) & - !$ACC& PRESENT(ZGTF,ZAVE,ZMINGL,ZMAXGL,ZMINGPN,ZMAXGPN) + !$ACC DATA & + !$ACC& COPY(D,D_NSTAGTF,D_NPTRLS,G_NLOEN,F,F_RW) & + !$ACC& PRESENT(ZGTF,ZAVE,ZMINGL,ZMAXGL,ZMINGPN,ZMAXGPN) + !$ACC KERNELS #endif #ifdef OMPGPU - !$OMP TARGET DATA MAP(TO:F,D,D_NSTAGTF,D_NPTRLS,G_NLOEN) & - !$OMP& MAP(PRESENT,ALLOC:ZGTF,ZAVE,ZMINGL,ZMAXGL,ZMINGPN,ZMAXGPN) - !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO + !$OMP TARGET DATA MAP(TO:F,D,D_NSTAGTF,D_NPTRLS,G_NLOEN) & + !$OMP& MAP(PRESENT,ALLOC:ZGTF,ZAVE,ZMINGL,ZMAXGL,ZMINGPN,ZMAXGPN) + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO #endif + DO JF=1,IF_FS + V = ZGTF(IF_FS*D_NSTAGTF(1)+(JF-1)*(D%NSTAGTF(2)-D%NSTAGTF(1))) + ZMINGL(JF,IBEG:IEND)=HUGE(1_JPRBT) + ZMAXGL(JF,IBEG:IEND)=-HUGE(1_JPRBT) + ENDDO #ifdef ACCGPU - !$ACC KERNELS + !$ACC END KERNELS #endif - DO JF=1,IF_FS - V = ZGTF(IF_FS*D_NSTAGTF(1)+(JF-1)*(D%NSTAGTF(2)-D%NSTAGTF(1))) - ZMINGL(JF,IBEG:IEND)=HUGE(1_JPRBT) - ZMAXGL(JF,IBEG:IEND)=-HUGE(1_JPRBT) - ENDDO + + ! FIRST DO SUMS IN EACH FULL LATITUDE #ifdef ACCGPU - !$ACC END KERNELS + !$ACC KERNELS #endif - -! FIRST DO SUMS IN EACH FULL LATITUDE - #ifdef OMPGPU !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO -#endif -#ifdef ACCGPU - !$ACC KERNELS #endif DO JGL=1,D%NDGL_FS IGL = D_NPTRLS(MYSETW) + JGL - 1 @@ -251,11 +254,11 @@ SUBROUTINE GPNORM_TRANS_GPU(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) !$ACC END KERNELS #endif -#ifdef OMPGPU - !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO -#endif #ifdef ACCGPU !$ACC KERNELS +#endif +#ifdef OMPGPU + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO #endif DO JF=1,IF_FS ZMINGPN(JF)=MINVAL(ZMINGL(JF,IBEG:IEND)) @@ -265,11 +268,11 @@ SUBROUTINE GPNORM_TRANS_GPU(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) !$ACC END KERNELS #endif -#ifdef OMPGPU - !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO -#endif #ifdef ACCGPU !$ACC KERNELS +#endif +#ifdef OMPGPU + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO #endif DO JGL=IBEG,IEND IGL = D_NPTRLS(MYSETW) + JGL - 1 From 4e733163cd42ffeeb7c174787d2c5dac76281c76 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Thu, 28 Nov 2024 16:43:42 +0200 Subject: [PATCH 04/24] Allow fsc_mod.F90 to compile with OpenMP --- src/trans/gpu/internal/fsc_mod.F90 | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/src/trans/gpu/internal/fsc_mod.F90 b/src/trans/gpu/internal/fsc_mod.F90 index b3c8be8f9..0122306a6 100755 --- a/src/trans/gpu/internal/fsc_mod.F90 +++ b/src/trans/gpu/internal/fsc_mod.F90 @@ -108,8 +108,9 @@ SUBROUTINE FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSE !$ACC& PRESENT(D,D_NPTRLS,D_NSTAGTF,PREEL_COMPLEX,F,F_RACTHE,G,G_NMEN,G_NLOEN,R,R_NSMAX) #endif #ifdef OMPGPU -!$OMP TARGET DATA MAP(PRESENT,ALLOC:ZGTF) & -!$OMP& MAP(ALLOC:PUV,PSCALAR,PNSDERS,PEWDERS,PUVDERS) +!$OMP TARGET DATA & +!$OMP& MAP(PRESENT,ALLOC:D_NSTAGTF,PREEL_COMPLEX,F_RACTHE,G_NMEN,G_NLOEN) & +!$OMP& MAP(TO:R_NSMAX) #endif ! ------------------------------------------------------------------ @@ -121,10 +122,13 @@ SUBROUTINE FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSE !* 1.1 U AND V. #ifdef OMPGPU - !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO DEFAULT(NONE) SHARED(KF_UV,PUV,ZACHTE2) +!$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) & +!$OMP& PRIVATE(IGLG,IOFF_LAT,IOFF_UV,ZACHTE2,JM,JF,KGL) & +!$OMP& FIRSTPRIVATE(IBEG,IEND,IINC,OFFSET_VAR,KF_UV,KUV_OFFSET,KF_FS) #endif #ifdef ACCGPU -!$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(IGLG,IOFF_LAT,IOFF_UV,ZACHTE2,JM,JF,KGL) & +!$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) & +!$ACC& PRIVATE(IGLG,IOFF_LAT,IOFF_UV,ZACHTE2,JM,JF,KGL) & !$ACC& FIRSTPRIVATE(IBEG,IEND,IINC,OFFSET_VAR,KF_UV,KUV_OFFSET,KF_FS) ASYNC(1) #endif DO KGL=IBEG,IEND,IINC @@ -150,7 +154,9 @@ SUBROUTINE FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSE IF (KSCALARS_NSDER_OFFSET >= 0) THEN #ifdef OMPGPU - !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO DEFAULT(NONE) SHARED(KF_SCALARS,PNSDERS,ZACHTE2) + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) & + !$OMP& PRIVATE(IGLG,IOFF_LAT,IOFF_KSCALARS_NSDER,ZACHTE2,KGL,JF,JM) & + !$OMP& FIRSTPRIVATE(IBEG,IEND,IINC,OFFSET_VAR,KF_SCALARS,KSCALARS_NSDER_OFFSET,KF_FS) #endif #ifdef ACCGPU !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(IGLG,IOFF_LAT,IOFF_KSCALARS_NSDER,ZACHTE2,KGL,JF,JM) & @@ -186,7 +192,9 @@ SUBROUTINE FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSE ILOEN_MAX = MAXVAL(G_NLOEN) IF (KUV_EWDER_OFFSET >= 0) THEN #ifdef OMPGPU - !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO DEFAULT(NONE) SHARED(KF_UV,PUVDERS,ZACHTE2,PUV) + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) & + !$OMP& PRIVATE(IGLG,IOFF_LAT,IOFF_UV,IOFF_UV_EWDER,RET_REAL,RET_COMPLEX,ZACHTE2,JM,JF,KGL) & + !$OMP& FIRSTPRIVATE(IBEG,IEND,IINC,OFFSET_VAR,KF_UV,KUV_EWDER_OFFSET,KUV_OFFSET,KF_FS) #endif #ifdef ACCGPU !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(IGLG,IOFF_LAT,IOFF_UV,IOFF_UV_EWDER,RET_REAL,RET_COMPLEX,ZACHTE2,JM,JF,KGL) & @@ -227,7 +235,9 @@ SUBROUTINE FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSE IF (KSCALARS_EWDER_OFFSET > 0) THEN #ifdef OMPGPU - !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO DEFAULT(NONE) SHARED(KF_SCALARS,PEWDERS,ZACHTE2,PSCALAR) + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) & + !$OMP& PRIVATE(IGLG,IOFF_LAT,IOFF_SCALARS_EWDER,IOFF_SCALARS,ZACHTE2,RET_REAL,RET_COMPLEX) & + !$OMP& FIRSTPRIVATE(IBEG,IEND,IINC,KF_SCALARS,OFFSET_VAR,KSCALARS_EWDER_OFFSET,KSCALARS_OFFSET,KF_FS) #endif #ifdef ACCGPU !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(IGLG,IOFF_LAT,IOFF_SCALARS_EWDER,IOFF_SCALARS,ZACHTE2,RET_REAL,RET_COMPLEX) & From 4e914a23279f7f169b5dde3e1934e153e78b1a59 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Thu, 28 Nov 2024 17:05:46 +0200 Subject: [PATCH 05/24] Allow growing_allocator_mod.F90 to compile with OpenMP --- src/trans/gpu/algor/growing_allocator_mod.F90 | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/trans/gpu/algor/growing_allocator_mod.F90 b/src/trans/gpu/algor/growing_allocator_mod.F90 index db869e480..844194e0c 100644 --- a/src/trans/gpu/algor/growing_allocator_mod.F90 +++ b/src/trans/gpu/algor/growing_allocator_mod.F90 @@ -43,7 +43,12 @@ SUBROUTINE REALLOCATE_GROWING_ALLOCATION(ALLOC, SZ) IF (.NOT. ASSOCIATED(ALLOC%PTR)) THEN ALLOCATE(ALLOC%PTR(SZ)) +#ifdef OMPGU + !$OMP TARGET ENTER DATA MAP(ALLOC:ALLOC%PTR) +#endif +#ifdef ACCGPU !$ACC ENTER DATA CREATE(ALLOC%PTR) +#endif ALLOC%FREE_FUNCS_SZ = 0 ENDIF END SUBROUTINE From 7de100333a04982f7e4868488bb201f018977637 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Thu, 28 Nov 2024 17:07:11 +0200 Subject: [PATCH 06/24] Allow gpnorm_trans.F90 to compile with OpenMP --- src/trans/gpu/external/gpnorm_trans.F90 | 54 ++++++++++++++++++------- 1 file changed, 40 insertions(+), 14 deletions(-) diff --git a/src/trans/gpu/external/gpnorm_trans.F90 b/src/trans/gpu/external/gpnorm_trans.F90 index 711cd7f25..460ace10e 100755 --- a/src/trans/gpu/external/gpnorm_trans.F90 +++ b/src/trans/gpu/external/gpnorm_trans.F90 @@ -163,7 +163,13 @@ SUBROUTINE GPNORM_TRANS(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) ZMAXGL = 0._JPRBT ZMINGPN = 0._JPRBT ZMAXGPN = 0._JPRBT + +#ifdef OMPGPU +!$OMP TARGET DATA MAP(TOFROM:ZAVE,ZMINGL,ZMAXGL,ZMINGPN,ZMAXGPN) +#endif +#ifdef ACCGPU !$ACC DATA COPY(ZAVE,ZMINGL,ZMAXGL,ZMINGPN,ZMAXGPN) +#endif ALLOCATE(IVSETS(NPRTRV)) IVSETS(:)=0 @@ -199,19 +205,29 @@ SUBROUTINE GPNORM_TRANS(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) CALL GSTATS(1429,0) IF( IF_FS > 0 )THEN - !$ACC DATA & - !$ACC& PRESENT(F,F_RW) & - !$ACC& PRESENT(D,D_NSTAGTF,D_NPTRLS,G_NLOEN) - - !$ACC KERNELS - DO JF=1,IF_FS - V = PREEL_REAL(IF_FS*D_NSTAGTF(1)+(JF-1)*(D%NSTAGTF(2)-D%NSTAGTF(1))) - ZMINGL(JF,IBEG:IEND)=HUGE(1_JPRBT) - ZMAXGL(JF,IBEG:IEND)=-HUGE(1_JPRBT) - ENDDO - !$ACC END KERNELS +#ifdef OMPGPU + !$OMP TARGET DATA MAP(PRESENT,ALLOC:F,F_RW,D,D_NSTAGTF,D_NPTRLS,G_NLOEN) +#endif +#ifdef ACCGPU + !$ACC DATA PRESENT(F,F_RW,D,D_NSTAGTF,D_NPTRLS,G_NLOEN) +#endif + +#ifdef OMPGPU + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO +#endif +#ifdef ACCGPU + !$ACC KERNELS +#endif + DO JF=1,IF_FS + V = PREEL_REAL(IF_FS*D_NSTAGTF(1)+(JF-1)*(D%NSTAGTF(2)-D%NSTAGTF(1))) + ZMINGL(JF,IBEG:IEND)=HUGE(1_JPRBT) + ZMAXGL(JF,IBEG:IEND)=-HUGE(1_JPRBT) + ENDDO +#ifdef ACCGPU + !$ACC END KERNELS +#endif -! FIRST DO SUMS IN EACH FULL LATITUDE + ! FIRST DO SUMS IN EACH FULL LATITUDE !$ACC KERNELS DO JGL=1,D%NDGL_FS @@ -246,10 +262,20 @@ SUBROUTINE GPNORM_TRANS(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) ENDDO !$ACC END KERNELS -!$ACC end data +#ifdef OMPGPU + !$OMP END TARGET DATA +#endif +#ifdef ACCGPU + !$ACC end data +#endif ENDIF -!$ACC end data +#ifdef OMPGPU +!$OMP END TARGET DATA +#endif +#ifdef ACCGPU +!$ACC END DATA +#endif CALL GSTATS(1429,1) END ASSOCIATE From b3efb4bfe652668bd4662d4e147716d6afee6cdb Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Thu, 28 Nov 2024 17:08:00 +0200 Subject: [PATCH 07/24] Allow ltinv_mod.F90 to compile with OpenMP --- src/trans/gpu/internal/ltinv_mod.F90 | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/trans/gpu/internal/ltinv_mod.F90 b/src/trans/gpu/internal/ltinv_mod.F90 index fe3fba0ea..301e55248 100755 --- a/src/trans/gpu/internal/ltinv_mod.F90 +++ b/src/trans/gpu/internal/ltinv_mod.F90 @@ -324,6 +324,11 @@ SUBROUTINE LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& ENDIF CALL GSTATS(422,0) #ifdef OMPGPU + !$OMP TARGET DATA MAP(TO:PSPVOR,PSPDIV) IF(KF_UV > 0) + !$OMP TARGET DATA MAP(TO:PSPSCALAR) IF(PRESENT(PSPSCALAR) .AND. KF_SCALARS > 0) + !$OMP TARGET DATA MAP(TO:PSPSC2) IF(NF_SC2 > 0) + !$OMP TARGET DATA MAP(TO:PSPSC3A) IF(NF_SC3A > 0) + !$OMP TARGET DATA MAP(TO:PSPSC3B) IF(NF_SC3B > 0) #endif #ifdef ACCGPU !$ACC DATA COPYIN(PSPVOR,PSPDIV) IF(KF_UV > 0) From 2e08a7f220a840afd488dbe78eefbf4cc17fc2d0 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Thu, 28 Nov 2024 17:08:16 +0200 Subject: [PATCH 08/24] Allow prfi1b_mod.F90 to compile with OpenMP --- src/trans/gpu/internal/prfi1b_mod.F90 | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/trans/gpu/internal/prfi1b_mod.F90 b/src/trans/gpu/internal/prfi1b_mod.F90 index 584575782..489683f8a 100755 --- a/src/trans/gpu/internal/prfi1b_mod.F90 +++ b/src/trans/gpu/internal/prfi1b_mod.F90 @@ -102,11 +102,12 @@ SUBROUTINE PRFI1B(PIA,PSPEC,KFIELDS,KDIM,KFLDPTR) !loop over wavenumber #ifdef OMPGPU - !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) PRIVATE(KM,ILCM,IFLD,IASM0,IR,II,INM) + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) PRIVATE(KM,ILCM,IFLD,IASM0,IR,II,INM) & + !$OMP& FIRSTPRIVATE(KFIELDS) #endif #ifdef ACCGPU !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(KM,ILCM,IFLD,IASM0,IR,II,INM) & - !$ACC& FIRSTPRIVATE(KFIELDS) ASYNC(1) + !$ACC& FIRSTPRIVATE(KFIELDS) ASYNC(1) #endif DO KMLOC=1,D_NUMP DO JN=1,R_NSMAX+1 @@ -129,11 +130,10 @@ SUBROUTINE PRFI1B(PIA,PSPEC,KFIELDS,KDIM,KFLDPTR) ENDDO #ifdef OMPGPU - !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(2) PRIVATE(KM,ILCM) + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(2) PRIVATE(KM,ILCM) FIRSTPRIVATE(KFIELDS) #endif #ifdef ACCGPU - !$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(2) PRIVATE(KM,ILCM) & - !$ACC& FIRSTPRIVATE(KFIELDS) ASYNC(1) + !$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(2) PRIVATE(KM,ILCM) FIRSTPRIVATE(KFIELDS) ASYNC(1) #endif DO KMLOC=1,D_NUMP DO JFLD=1,2*KFIELDS @@ -151,7 +151,8 @@ SUBROUTINE PRFI1B(PIA,PSPEC,KFIELDS,KDIM,KFLDPTR) !loop over wavenumber #ifdef OMPGPU - !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) PRIVATE(KM,ILCM,IOFF,INM,IR,II) + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) PRIVATE(KM,IASM0,INM) & + !$OMP& FIRSTPRIVATE(KFIELDS,KDIM) #endif #ifdef ACCGPU !$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(3) PRIVATE(KM,IASM0,INM) FIRSTPRIVATE(KFIELDS,KDIM) & @@ -189,7 +190,7 @@ SUBROUTINE PRFI1B(PIA,PSPEC,KFIELDS,KDIM,KFLDPTR) !$ACC END DATA #endif #ifdef OMPGPU - !$OMP END TARGET DATA +!$OMP END TARGET DATA #endif ! ------------------------------------------------------------------ From ba54d1544212f4f661a7b5dc98e1e858799d838e Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Thu, 28 Nov 2024 17:08:34 +0200 Subject: [PATCH 09/24] Allow spnsde_mod.F90 to compile with OpenMP --- src/trans/gpu/internal/spnsde_mod.F90 | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/src/trans/gpu/internal/spnsde_mod.F90 b/src/trans/gpu/internal/spnsde_mod.F90 index 73681f3ef..d7fdcd282 100755 --- a/src/trans/gpu/internal/spnsde_mod.F90 +++ b/src/trans/gpu/internal/spnsde_mod.F90 @@ -88,10 +88,6 @@ SUBROUTINE SPNSDE(KF_SCALARS,PEPSNM,PF,PNSD) !$ACC& PRESENT (R,R_NTMAX, D,D_MYMS) & !$ACC& PRESENT (D_NUMP,PEPSNM, PF, PNSD) ASYNC(1) #endif -#ifdef OMPGPU -!$OMP TARGET DATA & -!$OMP& MAP(PRESENT,PEPSNM,ALLOC:ZN) -#endif ! ------------------------------------------------------------------ @@ -102,16 +98,15 @@ SUBROUTINE SPNSDE(KF_SCALARS,PEPSNM,PF,PNSD) !* 1.1 COMPUTE #ifdef OMPGPU - !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO - !! DEFAULT(NONE) PRIVATE(IJ) & - !!$OMP& SHARED(KM,ZN,KMLOC) +!$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) PRIVATE(KM,IR,II,JI) MAP(TO:KF_SCALARS) & +!$OMP& SHARED(D,R) #endif #ifdef ACCGPU - !$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(3) PRIVATE(KM,IR,II,JI) FIRSTPRIVATE(KMLOC,KF_SCALARS) & +!$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(3) PRIVATE(KM,IR,II,JI) FIRSTPRIVATE(KF_SCALARS) & #ifndef _CRAYFTN - !$ACC& ASYNC(1) +!$ACC& ASYNC(1) #else - !$ACC& +!$ACC& #endif #endif DO KMLOC=1,D_NUMP @@ -139,9 +134,6 @@ SUBROUTINE SPNSDE(KF_SCALARS,PEPSNM,PF,PNSD) ENDDO END DO -#ifdef OMPGPU -!$OMP END TARGET DATA -#endif #ifdef ACCGPU !$ACC END DATA #endif From d1e318cae66e7d5b1605f83067ac8d01ca59c436 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Thu, 28 Nov 2024 17:08:51 +0200 Subject: [PATCH 10/24] Allow trgtol_mod.F90 to compile with OpenMP --- src/trans/gpu/internal/trgtol_mod.F90 | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index b2bebb587..055534cbf 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -339,6 +339,8 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, end block #ifdef OMPGPU + !$OMP TARGET DATA MAP(TO:IRECV_BUFR_TO_OUT) MAP(ALLOC:PREEL_REAL) IF (KF_FS > 0) + !$OMP TARGET DATA MAP(TO:PGP_INDICES) #endif #ifdef ACCGPU !$ACC DATA COPYIN(IRECV_BUFR_TO_OUT) PRESENT(PREEL_REAL) IF (KF_FS > 0) ASYNC(1) @@ -379,7 +381,9 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGP3B) ENDIF IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_CREATE(ACC_POINTERS(1:ACC_POINTERS_CNT),STREAM=1_ACC_HANDLE_KIND) +#ifdef ACCGPU !$ACC WAIT(1) +#endif IF (PRESENT(PGP)) THEN #ifdef OMPGPU #endif @@ -771,6 +775,8 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, CALL GSTATS(1603,1) #ifdef OMPGPU + !$OMP END TARGET DATA ! PGP_INDICES + !$OMP END TARGET DATA ! IRECV_BUFR_TO_OUT #endif #ifdef ACCGPU !$ACC END DATA ! ZCOMBUFR From ab23a9dba89754f4fda3fa3eeede904f0e1f144e Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Thu, 28 Nov 2024 17:09:25 +0200 Subject: [PATCH 11/24] Remove unnecessary mpif.h import --- src/trans/gpu/internal/trltog_mod.F90 | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/trans/gpu/internal/trltog_mod.F90 b/src/trans/gpu/internal/trltog_mod.F90 index 905967a6f..d27cc74a9 100755 --- a/src/trans/gpu/internal/trltog_mod.F90 +++ b/src/trans/gpu/internal/trltog_mod.F90 @@ -129,11 +129,6 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, IMPLICIT NONE -#ifdef OMPGPU - include 'mpif.h' -#endif - - REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PREEL_REAL(:) INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS,KF_GP INTEGER(KIND=JPIM),INTENT(IN) :: KF_UV_G, KF_SCALARS_G From 287d97c9793c4d9279da93e53d67706c10c7d49d Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Thu, 28 Nov 2024 17:09:40 +0200 Subject: [PATCH 12/24] Allow vdtuv_mod.F90 to compile with OpenMP --- src/trans/gpu/internal/vdtuv_mod.F90 | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/trans/gpu/internal/vdtuv_mod.F90 b/src/trans/gpu/internal/vdtuv_mod.F90 index 6cfc5f1dc..e548abe96 100755 --- a/src/trans/gpu/internal/vdtuv_mod.F90 +++ b/src/trans/gpu/internal/vdtuv_mod.F90 @@ -94,11 +94,7 @@ SUBROUTINE VDTUV(KFIELD,PEPSNM,PVOR,PDIV,PU,PV) !$ACC& PRESENT(PU, PV) #endif #ifdef OMPGPU -!$OMP TARGET DATA & -!$OMP& MAP (PRESENT,ALLOC:ZEPSNM, ZN, ZLAPIN) & -!$OMP& MAP (TO:R_NSMAX, D_MYMS,D_NUMP,F_RLAPIN) & -!$OMP& MAP(PRESENT,ALLOC:ZEPSNM, PVOR, PDIV) & -!$OMP& MAP(PRESENT,ALLOC:PU, PV) +!$OMP TARGET DATA MAP(PRESENT,ALLOC:R_NTMAX,D_MYMS,D_NUMP,F_RLAPIN,PEPSNM,PVOR,PDIV,PU,PV) #endif ! ------------------------------------------------------------------ @@ -107,6 +103,8 @@ SUBROUTINE VDTUV(KFIELD,PEPSNM,PVOR,PDIV,PU,PV) ! ------------------------------------------ #ifdef OMPGPU +!$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) PRIVATE(IR,II,KM,ZKM,JI) & +!$OMP& FIRSTPRIVATE(KFIELD,KMLOC) #endif #ifdef ACCGPU !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(IR,II,KM,ZKM,JI) FIRSTPRIVATE(KFIELD,KMLOC) & From 20e9a111f298c273212869b8b6da0f9bb87c0566 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Wed, 4 Dec 2024 16:42:47 +0200 Subject: [PATCH 13/24] Allo uvtvd_mod.F90 to compile with OpenMP --- src/trans/gpu/internal/uvtvd_mod.F90 | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/src/trans/gpu/internal/uvtvd_mod.F90 b/src/trans/gpu/internal/uvtvd_mod.F90 index 93323f92f..6362e8119 100755 --- a/src/trans/gpu/internal/uvtvd_mod.F90 +++ b/src/trans/gpu/internal/uvtvd_mod.F90 @@ -89,17 +89,12 @@ SUBROUTINE UVTVD(KF_UV,PU,PV,PVOR,PDIV) !$ACC& PRESENT(D,D_MYMS,D_NUMP,R,R_NTMAX) & !$ACC& PRESENT(FG,ZEPSNM,PU,PV,PVOR,PDIV) ASYNC(1) #endif -#ifdef OMPGPU -!WARNING: following line should be PRESENT,ALLOC but causes issues with AMD compiler! -!$OMP TARGET DATA& -!$OMP& MAP(TO:D_MYMS,D_NUMP,R_NTMAX) & -!$OMP& MAP(ALLOC:ZEPSNM,PU,PV,PVOR,PDIV) -#endif !* 1.1 SET N=KM-1 COMPONENT TO 0 FOR U AND V #ifdef OMPGPU -!$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(2) +!$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(2) PRIVATE(KM) SHARED(D,KF_UV,R,PU,PV) & +!$OMP& MAP(TO:KF_UV) DEFAULT(NONE) #endif #ifdef ACCGPU !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM) FIRSTPRIVATE(KF_UV) DEFAULT(NONE) & @@ -120,11 +115,11 @@ SUBROUTINE UVTVD(KF_UV,PU,PV,PVOR,PDIV) !* 1.2 COMPUTE VORTICITY AND DIVERGENCE. #ifdef OMPGPU -!$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) PRIVATE(IR,II,IN,KM,ZKM,JN,ZJN) DEFAULT(NONE) & -!$OMP& SHARED(D_NUMP,R_NTMAX,KF_UV,D_MYMS,PVOR,PV,PU,PDIV,ZEPSNM) +!$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) PRIVATE(IR,II,IN,KM,ZKM,ZJN) & +!$OMP& SHARED(D,R,KF_UV,FG,PVOR,PV,PU,PDIV) DEFAULT(NONE) #endif #ifdef ACCGPU -!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(IR,II,IN,KM,ZKM,JN,ZJN) FIRSTPRIVATE(KF_UV) DEFAULT(NONE) & +!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(IR,II,IN,KM,ZKM,ZJN) DEFAULT(NONE) & #ifndef _CRAYFTN !$ACC& ASYNC(1) #else @@ -172,9 +167,7 @@ SUBROUTINE UVTVD(KF_UV,PU,PV,PVOR,PDIV) ENDDO ENDDO ENDDO -#ifdef OMPGPU -!$OMP END TARGET DATA -#endif + #ifdef ACCGPU !$ACC END DATA #endif From f85fc552e14553e74e67cd0071a9be1306a6f789 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Mon, 20 Jan 2025 17:30:13 +0200 Subject: [PATCH 14/24] Allow updspb_mod.F90 to compile with OpenMP --- src/trans/gpu/internal/updspb_mod.F90 | 35 +++++++++++++++------------ 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/src/trans/gpu/internal/updspb_mod.F90 b/src/trans/gpu/internal/updspb_mod.F90 index 418810f44..32a12660b 100755 --- a/src/trans/gpu/internal/updspb_mod.F90 +++ b/src/trans/gpu/internal/updspb_mod.F90 @@ -87,28 +87,30 @@ SUBROUTINE UPDSPB(KFIELD,POA,PSPEC,KFLDPTR) ! and nn=NTMAX+2-n from NTMAX+2-m to NTMAX+2-NSMAX. ! NLTN(m)=NTMAX+2-m : n=NLTN(nn),nn=NLTN(n) ! nn is the loop index. - ASSOCIATE(D_NUMP=>D%NUMP, D_MYMS=>D%MYMS, D_NASM0=>D%NASM0, R_NTMAX=>R%NTMAX) - - IF(PRESENT(KFLDPTR)) THEN - CALL ABORT_TRANS('UPDSPB: Code path not (yet) supported in GPU version') - ENDIF + ASSOCIATE(D_NUMP=>D%NUMP, D_MYMS=>D%MYMS, D_NASM0=>D%NASM0, R_NTMAX=>R%NTMAX) + + IF(PRESENT(KFLDPTR)) THEN + CALL ABORT_TRANS('UPDSPB: Code path not (yet) supported in GPU version') + ENDIF !* 1. UPDATE SPECTRAL FIELDS. ! ----------------------- - !loop over wavenumber +#ifdef OMPGPU + !$OMP TARGET DATA MAP(TO:KFIELD) +#endif #ifdef ACCGPU !$ACC DATA PRESENT(PSPEC,POA,R,R_NTMAX,D,D_NUMP,D_MYMS,D_NASM0) ASYNC(1) #endif + +! Directive incomplete -> putting more variables in SHARED() triggers internal compiler error +! ftn-7991: INTERNAL COMPILER ERROR: "Too few arguments on the stack" #ifdef OMPGPU -!WARNING: following line should be PRESENT,ALLOC but causes issues with AMD compiler! - !$OMP TARGET DATA MAP(ALLOC:PSPEC,POA) & - !$OMP& MAP(TO:R_NTMAX,D_NUMP,D_MYMS,D_NASM0) - !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) PRIVATE(KM,IASM0,INM,IR,II) DEFAULT(NONE) & - !$OMP& SHARED(R_NTMAX,D_NUMP,D_MYMS,D_NASM0,PSPEC,KFIELD,POA) + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO PRIVATE(KM,IASM0,INM) & + !$OMP& SHARED(KFIELD) #endif #ifdef ACCGPU - !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(KM,IASM0,INM) DEFAULT(NONE) FIRSTPRIVATE(KFIELD) & + !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(KM,IASM0,INM) DEFAULT(NONE) COPYIN(KFIELD) & #ifndef _CRAYFTN !$ACC& ASYNC(1) #else @@ -135,14 +137,15 @@ SUBROUTINE UPDSPB(KFIELD,POA,PSPEC,KFLDPTR) ENDDO ENDDO ENDDO -#ifdef OMPGPU - !$OMP END TARGET DATA -#endif + #ifdef ACCGPU !$ACC END DATA #endif +#ifdef OMPGPU + !$OMP END TARGET DATA +#endif -END ASSOCIATE + END ASSOCIATE ! ------------------------------------------------------------------ END SUBROUTINE UPDSPB From a71b64bef1b2ac38ad9c3cc63e8c50ce084134ab Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Wed, 4 Dec 2024 19:22:12 +0200 Subject: [PATCH 15/24] Tidy up gpu/SETUP_TRANS --- src/trans/gpu/external/setup_trans.F90 | 168 ++++++++++++------------- 1 file changed, 80 insertions(+), 88 deletions(-) diff --git a/src/trans/gpu/external/setup_trans.F90 b/src/trans/gpu/external/setup_trans.F90 index 6b5398197..1013f8ab8 100755 --- a/src/trans/gpu/external/setup_trans.F90 +++ b/src/trans/gpu/external/setup_trans.F90 @@ -103,12 +103,11 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& !ifndef INTERFACE -USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_PTR, C_INT, C_ASSOCIATED, C_SIZE_T, C_SIZEOF -USE EC_ENV_MOD, ONLY: EC_GETENV +USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_PTR, C_ASSOCIATED, C_SIZE_T, C_SIZEOF USE TPM_GEN, ONLY: NOUT, MSETUP0, NCUR_RESOL, NDEF_RESOL, & & NMAX_RESOL, NPRINTLEV, LENABLED, NERR USE TPM_DIM, ONLY: R, DIM_RESOL -USE TPM_DISTR, ONLY: D, DISTR_RESOL, NPROC, NPRTRV, MYPROC +USE TPM_DISTR, ONLY: D, DISTR_RESOL, NPROC, MYPROC USE TPM_GEOMETRY, ONLY: G, GEOM_RESOL USE TPM_FIELDS, ONLY: FIELDS_RESOL, F USE TPM_FIELDS_GPU, ONLY: FIELDS_GPU_RESOL, FG @@ -129,9 +128,6 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& USE OPENACC, ONLY: ACC_DEVICE_KIND, ACC_GET_DEVICE_TYPE, ACC_GET_NUM_DEVICES, & & ACC_SET_DEVICE_NUM, ACC_GET_DEVICE_NUM #endif -#ifdef OMPGPU -USE OMP_LIB -#endif !endif INTERFACE @@ -164,21 +160,17 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& !ifndef INTERFACE ! Local variables -INTEGER(KIND=JPIM) :: JGL,JRES,IDEF_RESOL -INTEGER(KIND=JPIM) :: JMLOC, KM, ILA, ILS, KMLOC, KDGLU, JK, I, J - -INTEGER(KIND=JPIM) :: IPROC, IPROCS, ISTAN, ISTAS, ISL, IGLS, JFLD, IMLOC0(1) +INTEGER(KIND=JPIM) :: JGL, JRES, IDEF_RESOL +INTEGER(KIND=JPIM) :: JMLOC, KM, ILA, ILS, KDGLU +INTEGER(KIND=JPIM) :: IMLOC0(1) -LOGICAL :: LLP1,LLP2, LLSPSETUPONLY -REAL(KIND=JPRD) :: ZTIME0,ZTIME1,ZTIME2 +LOGICAL :: LLP1, LLP2, LLSPSETUPONLY REAL(KIND=JPHOOK) :: ZHOOK_HANDLE -CHARACTER(LEN=8) :: CENV - #ifdef ACCGPU INTEGER(ACC_DEVICE_KIND) :: IDEVTYPE #endif -INTEGER :: INUMDEVS, IUNIT, ISTAT, IDEV, MYGPU +INTEGER :: INUMDEVS, IDEV, MYGPU #include "user_clock.intfb.h" ! ------------------------------------------------------------------ @@ -256,8 +248,6 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& IF(LLP1) WRITE(NOUT,*) '=== DEFINING RESOLUTION ',NCUR_RESOL - - ! Defaults for optional arguments @@ -333,7 +323,8 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& ! Optional arguments ALLOCATE(G%NLOEN(R%NDGL)) -IF(LLP2)WRITE(NOUT,9) 'NLOEN ',SIZE(G%NLOEN ),SHAPE(G%NLOEN ) +IF (LLP2) WRITE(NOUT,'("ARRAY NLOEN ALLOCATED",8I8)') SIZE(G%NLOEN ),SHAPE(G%NLOEN ) + IF(PRESENT(KLOEN)) THEN IF( MINVAL(KLOEN(:)) <= 0 )THEN CALL ABORT_TRANS ('SETUP_TRANS: KLOEN INVALID (ONE or MORE POINTS <= 0)') @@ -444,6 +435,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& IF(PRESENT(LDKEEPRPNM)) THEN S%LKEEPRPNM=LDKEEPRPNM ENDIF + ! Setup resolution dependent structures ! ------------------------------------- @@ -452,14 +444,14 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& IF( .NOT.LLSPSETUPONLY ) THEN -! Compute Legendre polonomial and Gaussian Latitudes and Weights + ! Compute Legendre polonomial and Gaussian Latitudes and Weights CALL SULEG -! Second part of setup of distributed environment + ! Second part of setup of distributed environment CALL SUMP_TRANS CALL GSTATS(1802,0) -! Initialize Fast Fourier Transform package + ! Initialize Fast Fourier Transform package IF (.NOT.D%LCPNMONLY) CALL SUFFT CALL GSTATS(1802,1) ELSE @@ -472,111 +464,111 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& IF (LHOOK) CALL DR_HOOK('SETUP_TRANS',1,ZHOOK_HANDLE) ! ------------------------------------------------------------------ -9 FORMAT(1X,'ARRAY ',A10,' ALLOCATED ',8I8) -IF( .NOT.D%LGRIDONLY ) THEN -IUNIT=300+MYPROC +IF( .NOT.D%LGRIDONLY ) THEN #ifdef ACCGPU -!!IDEVTYPE=ACC_DEVICE_NVIDIA -IDEVTYPE=ACC_GET_DEVICE_TYPE() -INUMDEVS = ACC_GET_NUM_DEVICES(IDEVTYPE) -MYGPU = MOD(MYPROC-1,INUMDEVS) -CALL ACC_SET_DEVICE_NUM(MYGPU, IDEVTYPE) -MYGPU = ACC_GET_DEVICE_NUM(IDEVTYPE) -!ISTAT = CUDA_GETDEVICE(IDEV) + IDEVTYPE = ACC_GET_DEVICE_TYPE() + INUMDEVS = ACC_GET_NUM_DEVICES(IDEVTYPE) + MYGPU = MOD(MYPROC-1, INUMDEVS) + CALL ACC_SET_DEVICE_NUM(MYGPU, IDEVTYPE) #endif -WRITE(NOUT,*) 'R%NTMAX=',R%NTMAX -WRITE(NOUT,*) 'R%NSMAX=',R%NSMAX + WRITE(NOUT,*) 'R%NTMAX=',R%NTMAX + WRITE(NOUT,*) 'R%NSMAX=',R%NSMAX -! Initialize A arrays + ! Initialize A arrays -ALLOCATE(FG%ZAA(ALIGN(R%NDGNH,8),ALIGN((R%NTMAX+2)/2,8),D%NUMP)) -ALLOCATE(FG%ZAS(ALIGN(R%NDGNH,8),ALIGN((R%NTMAX+3)/2,8),D%NUMP)) + ALLOCATE(FG%ZAA(ALIGN(R%NDGNH,8),ALIGN((R%NTMAX+2)/2,8),D%NUMP)) + ALLOCATE(FG%ZAS(ALIGN(R%NDGNH,8),ALIGN((R%NTMAX+3)/2,8),D%NUMP)) -FG%ZAA(:,:,:) = 0._JPRBT -FG%ZAS(:,:,:) = 0._JPRBT + FG%ZAA(:,:,:) = 0._JPRBT + FG%ZAS(:,:,:) = 0._JPRBT -DO JMLOC=1,D%NUMP - KM = D%MYMS(JMLOC) - KDGLU = G%NDGLU(KM) - ILA = (R%NSMAX-KM+2)/2 - ILS = (R%NSMAX-KM+3)/2 + DO JMLOC=1,D%NUMP + KM = D%MYMS(JMLOC) + KDGLU = G%NDGLU(KM) + ILA = (R%NSMAX-KM+2)/2 + ILS = (R%NSMAX-KM+3)/2 - FG%ZAA(1:KDGLU,1:ILA,JMLOC)=S%FA(JMLOC)%RPNMA(1:KDGLU,1:ILA) - FG%ZAS(1:KDGLU,1:ILS,JMLOC)=S%FA(JMLOC)%RPNMS(1:KDGLU,1:ILS) -ENDDO + FG%ZAA(1:KDGLU,1:ILA,JMLOC)=S%FA(JMLOC)%RPNMA(1:KDGLU,1:ILA) + FG%ZAS(1:KDGLU,1:ILS,JMLOC)=S%FA(JMLOC)%RPNMS(1:KDGLU,1:ILS) + ENDDO -! arrays for m=0 in ledir_mod: -IMLOC0 = FINDLOC(D%MYMS,0) -IF(IMLOC0(1) > 0) THEN - ALLOCATE(FG%ZAA0(SIZE(FG%ZAA,1),SIZE(FG%ZAA,2))) - ALLOCATE(FG%ZAS0(SIZE(FG%ZAS,1),SIZE(FG%ZAS,2))) - FG%ZAA0 = FG%ZAA(:,:,IMLOC0(1)) - FG%ZAS0 = FG%ZAS(:,:,IMLOC0(1)) -ENDIF + ! arrays for m=0 in ledir_mod: + IMLOC0 = FINDLOC(D%MYMS,0) + IF(IMLOC0(1) > 0) THEN + ALLOCATE(FG%ZAA0(SIZE(FG%ZAA,1),SIZE(FG%ZAA,2))) + ALLOCATE(FG%ZAS0(SIZE(FG%ZAS,1),SIZE(FG%ZAS,2))) + FG%ZAA0 = FG%ZAA(:,:,IMLOC0(1)) + FG%ZAS0 = FG%ZAS(:,:,IMLOC0(1)) + ENDIF -ALLOCATE(FG%ZEPSNM(D%NUMP,0:R%NTMAX+2)) -FG%ZEPSNM = 0._JPRBT -CALL PREPSNM !Initialize on the host + ALLOCATE(FG%ZEPSNM(D%NUMP,0:R%NTMAX+2)) + FG%ZEPSNM = 0._JPRBT + CALL PREPSNM !Initialize on the host -WRITE(NOUT,*)'setup_trans: sizes1 NUMP=',D%NUMP + WRITE(NOUT,*)'setup_trans: sizes1 NUMP=',D%NUMP #ifdef ACCGPU WRITE(NOUT,*) 'Using OpenACC' #endif #ifdef OMPGPU WRITE(NOUT,*) 'Using OpenMP offloading' #endif -WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAS', C_SIZEOF(FG%ZAS(1,1,1))*SIZE(FG%ZAS) -WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAA', C_SIZEOF(FG%ZAA(1,1,1))*SIZE(FG%ZAA) -WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAS0', C_SIZEOF(FG%ZAS0(1,1))*SIZE(FG%ZAS0) -WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAA0', C_SIZEOF(FG%ZAA0(1,1))*SIZE(FG%ZAA0) -WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZEPSNM', C_SIZEOF(FG%ZEPSNM(1,1))*SIZE(FG%ZEPSNM) + WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAS', C_SIZEOF(FG%ZAS(1,1,1))*SIZE(FG%ZAS) + WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAA', C_SIZEOF(FG%ZAA(1,1,1))*SIZE(FG%ZAA) + WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAS0', C_SIZEOF(FG%ZAS0(1,1))*SIZE(FG%ZAS0) + WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAA0', C_SIZEOF(FG%ZAA0(1,1))*SIZE(FG%ZAA0) + WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZEPSNM', C_SIZEOF(FG%ZEPSNM(1,1))*SIZE(FG%ZEPSNM) -IF (IMLOC0(1) > 0) THEN + IF (IMLOC0(1) > 0) THEN #ifdef ACCGPU - !$ACC ENTER DATA COPYIN(FG%ZAA0,FG%ZAS0) ASYNC(1) + !$ACC ENTER DATA COPYIN(FG%ZAA0,FG%ZAS0) ASYNC(1) #endif #ifdef OMPGPU - !$OMP TARGET ENTER DATA MAP(TO:FG%ZAA0,FG%ZAS0) + !$OMP TARGET ENTER DATA MAP(TO:FG%ZAA0,FG%ZAS0) #endif -ENDIF + ENDIF + #ifdef ACCGPU #ifdef _CRAYFTN -!$ACC ENTER DATA COPYIN(R,R%NSMAX,R%NTMAX,R%NDGL,R%NDGNH) ASYNC(1) + !$ACC ENTER DATA COPYIN(R,R%NSMAX,R%NTMAX,R%NDGL,R%NDGNH) ASYNC(1) #else -!$ACC ENTER DATA COPYIN(R) ASYNC(1) + !$ACC ENTER DATA COPYIN(R) ASYNC(1) #endif -!$ACC ENTER DATA COPYIN(F,F%RLAPIN,F%RACTHE,F%RW) ASYNC(1) -!$ACC ENTER DATA COPYIN(FG,FG%ZAA,FG%ZAS,FG%ZEPSNM) ASYNC(1) + !$ACC ENTER DATA COPYIN(F,F%RLAPIN,F%RACTHE,F%RW) ASYNC(1) + !$ACC ENTER DATA COPYIN(FG,FG%ZAA,FG%ZAS,FG%ZEPSNM) ASYNC(1) #ifdef _CRAYFTN -!$ACC ENTER DATA COPYIN(D,D%NUMP,D%MYMS,D%NPNTGTB0,D%NPNTGTB1,D%NSTAGT0B,D%NSTAGT1B,D%NSTAGTF,D%NPROCM,D%NPROCL)& -!$ACC& COPYIN(D%NPTRLS,D%MSTABF,D%NASM0,D%OFFSETS_GEMM1,D%OFFSETS_GEMM2,D%NDGL_FS) ASYNC(1) + !$ACC ENTER DATA COPYIN(D,D%NUMP,D%MYMS,D%NPNTGTB0,D%NPNTGTB1,D%NSTAGT0B,D%NSTAGT1B,D%NSTAGTF,D%NPROCM,D%NPROCL)& + !$ACC& COPYIN(D%NPTRLS,D%MSTABF,D%NASM0,D%OFFSETS_GEMM1,D%OFFSETS_GEMM2,D%NDGL_FS) ASYNC(1) #else -!$ACC ENTER DATA COPYIN(D,D%MYMS,D%NPNTGTB0,D%NPNTGTB1,D%NSTAGT0B,D%NSTAGT1B,D%NSTAGTF,D%NPROCM,D%NPROCL)& -!$ACC& COPYIN(D%NPTRLS,D%MSTABF,D%NASM0,D%OFFSETS_GEMM1,D%OFFSETS_GEMM2) ASYNC(1) + !$ACC ENTER DATA COPYIN(D,D%MYMS,D%NPNTGTB0,D%NPNTGTB1,D%NSTAGT0B,D%NSTAGT1B,D%NSTAGTF,D%NPROCM,D%NPROCL)& + !$ACC& COPYIN(D%NPTRLS,D%MSTABF,D%NASM0,D%OFFSETS_GEMM1,D%OFFSETS_GEMM2) ASYNC(1) #endif -!$ACC ENTER DATA COPYIN(G,G%NDGLU,G%NMEN,G%NLOEN) ASYNC(1) -!$ACC WAIT(1) + !$ACC ENTER DATA COPYIN(G,G%NDGLU,G%NMEN,G%NLOEN) ASYNC(1) + !$ACC WAIT(1) #endif #ifdef OMPGPU -!$OMP TARGET ENTER DATA MAP(ALLOC:FG%ZAA,FG%ZAS) -!$OMP TARGET ENTER DATA MAP(TO:FG,F,S,D,R,G) -!$OMP BARRIER + !$OMP TARGET ENTER DATA MAP(TO:R) + !$OMP TARGET ENTER DATA MAP(TO:F%RLAPIN,F%RACTHE,F%RW) + !$OMP TARGET ENTER DATA MAP(TO:FG%ZAA,FG%ZAS,FG%ZEPSNM) + !$OMP TARGET ENTER DATA MAP(TO:D%MYMS,D%NPNTGTB0,D%NPNTGTB1,D%NSTAGT0B,D%NSTAGT1B,D%NSTAGTF,& + !$OMP& D%NPROCM,D%NPROCL,D%NPTRLS,D%MSTABF,D%NASM0,D%OFFSETS_GEMM1,& + !$OMP& D%OFFSETS_GEMM2) + !$OMP TARGET ENTER DATA MAP(TO:G%NDGLU,G%NMEN,G%NLOEN) #endif -WRITE(NOUT,*) '===GPU arrays successfully allocated' - -! TODO: This might be good idea - those polynomials are not needed -!DO JMLOC=1,D%NUMP -! DEALLOCATE(S%FA(JMLOC)%RPNMA) -! DEALLOCATE(S%FA(JMLOC)%RPNMS) -!ENDDO + WRITE(NOUT,*) '===GPU arrays successfully allocated' -!endif INTERFACE + ! TODO: This might be good idea - those polynomials are not needed + !DO JMLOC=1,D%NUMP + ! DEALLOCATE(S%FA(JMLOC)%RPNMA) + ! DEALLOCATE(S%FA(JMLOC)%RPNMS) + !ENDDO ENDIF ! D%LGRIDONLY +!endif INTERFACE + END SUBROUTINE SETUP_TRANS From 3002c3f27f3bae38da933da2216715b4c8a8fd71 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Wed, 4 Dec 2024 19:27:37 +0200 Subject: [PATCH 16/24] Properly handle LDUSEFFTW option for GPUs --- src/trans/gpu/external/setup_trans.F90 | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/trans/gpu/external/setup_trans.F90 b/src/trans/gpu/external/setup_trans.F90 index 1013f8ab8..b1a44fec3 100755 --- a/src/trans/gpu/external/setup_trans.F90 +++ b/src/trans/gpu/external/setup_trans.F90 @@ -387,6 +387,10 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& D%LCPNMONLY=LDPNMONLY ENDIF +IF(PRESENT(LDUSEFFTW)) THEN + WRITE(NOUT,*) 'SETUP_TRANS: LDUSEFFTW option is not relevant for GPUs' +ENDIF + ! Setup distribution independent dimensions CALL SETUP_DIMS From 0cae7c0bcc0256051ac4bb894e958446ce53374f Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Wed, 15 Jan 2025 10:17:50 +0000 Subject: [PATCH 17/24] Minor case change --- src/trans/gpu/external/gpnorm_trans.F90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/trans/gpu/external/gpnorm_trans.F90 b/src/trans/gpu/external/gpnorm_trans.F90 index 460ace10e..ff86aa325 100755 --- a/src/trans/gpu/external/gpnorm_trans.F90 +++ b/src/trans/gpu/external/gpnorm_trans.F90 @@ -266,7 +266,7 @@ SUBROUTINE GPNORM_TRANS(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) !$OMP END TARGET DATA #endif #ifdef ACCGPU - !$ACC end data + !$ACC END DATA #endif ENDIF From 9324bed0813dc90dc54dd3753a5ac50c0c2e3c6c Mon Sep 17 00:00:00 2001 From: Paul Mullowney Date: Thu, 16 Jan 2025 14:17:16 -0600 Subject: [PATCH 18/24] Changes to get compliance with flang-new. This builds and runs on 8 H100s (truncation 319) --- src/trans/gpu/algor/ext_acc.F90 | 8 ++++---- src/trans/gpu/algor/hicblas_mod.F90 | 8 ++++---- src/trans/gpu/internal/tpm_hicfft.F90 | 8 ++++---- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/trans/gpu/algor/ext_acc.F90 b/src/trans/gpu/algor/ext_acc.F90 index 49043ab90..a1f4db89a 100644 --- a/src/trans/gpu/algor/ext_acc.F90 +++ b/src/trans/gpu/algor/ext_acc.F90 @@ -268,7 +268,7 @@ subroutine ext_acc_create(ptrs, stream) num_ranges = get_common_pointers(ptrs, common_ptrs) do i = 1, num_ranges - call c_f_pointer(common_ptrs(i)%ptr, pp, shape=[common_ptrs(i)%sz/c_sizeof(pp(1))]) + call c_f_pointer(common_ptrs(i)%ptr, pp, [common_ptrs(i)%sz/c_sizeof(pp(1))]) !$acc enter data create(pp) async(stream_act) enddo end subroutine @@ -294,7 +294,7 @@ subroutine ext_acc_copyin(ptrs, stream) num_ranges = get_common_pointers(ptrs, common_ptrs) do i = 1, num_ranges - call c_f_pointer(common_ptrs(i)%ptr, pp, shape=[common_ptrs(i)%sz/c_sizeof(pp(1))]) + call c_f_pointer(common_ptrs(i)%ptr, pp, [common_ptrs(i)%sz/c_sizeof(pp(1))]) !$acc enter data copyin(pp) async(stream_act) enddo end subroutine @@ -320,7 +320,7 @@ subroutine ext_acc_copyout(ptrs, stream) num_ranges = get_common_pointers(ptrs, common_ptrs) do i = 1, num_ranges - call c_f_pointer(common_ptrs(i)%ptr, pp, shape=[common_ptrs(i)%sz/c_sizeof(pp(1))]) + call c_f_pointer(common_ptrs(i)%ptr, pp, [common_ptrs(i)%sz/c_sizeof(pp(1))]) !$acc exit data copyout(pp) async(stream_act) enddo end subroutine @@ -346,7 +346,7 @@ subroutine ext_acc_delete(ptrs, stream) num_ranges = get_common_pointers(ptrs, common_ptrs) do i = 1, num_ranges - call c_f_pointer(common_ptrs(i)%ptr, pp, shape=[common_ptrs(i)%sz/c_sizeof(pp(1))]) + call c_f_pointer(common_ptrs(i)%ptr, pp, [common_ptrs(i)%sz/c_sizeof(pp(1))]) !$acc exit data delete(pp) async(stream_act) enddo end subroutine diff --git a/src/trans/gpu/algor/hicblas_mod.F90 b/src/trans/gpu/algor/hicblas_mod.F90 index f7528a11f..528680f66 100644 --- a/src/trans/gpu/algor/hicblas_mod.F90 +++ b/src/trans/gpu/algor/hicblas_mod.F90 @@ -143,7 +143,7 @@ SUBROUTINE HIP_DGEMM_BATCHED_OVERLOAD( & INTEGER(KIND=JPIM) :: STRIDEC INTEGER(KIND=JPIM) :: BATCHCOUNT INTEGER(KIND=C_INT) :: STREAM - TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN) :: ALLOC + TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN), POINTER :: ALLOC INTEGER(KIND=C_LONG) :: HIP_STREAM @@ -193,7 +193,7 @@ SUBROUTINE HIP_SGEMM_BATCHED_OVERLOAD( & INTEGER(KIND=JPIM) :: STRIDEC INTEGER(KIND=JPIM) :: BATCHCOUNT INTEGER(KIND=C_INT) :: STREAM - TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN) :: ALLOC + TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN), POINTER :: ALLOC INTEGER(KIND=C_LONG) :: HIP_STREAM @@ -239,7 +239,7 @@ SUBROUTINE HIP_DGEMM_GROUPED_OVERLOAD( & INTEGER(KIND=JPIB) :: OFFSETC(:) INTEGER(KIND=JPIM) :: BATCHCOUNT INTEGER(KIND=C_INT) :: STREAM - TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN) :: ALLOC + TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN), POINTER :: ALLOC INTEGER(KIND=C_LONG) :: HIP_STREAM @@ -286,7 +286,7 @@ SUBROUTINE HIP_SGEMM_GROUPED_OVERLOAD(& INTEGER(KIND=JPIB) :: OFFSETC(:) INTEGER(KIND=JPIM) :: BATCHCOUNT INTEGER(KIND=C_INT) :: STREAM - TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN) :: ALLOC + TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN), POINTER :: ALLOC INTEGER(KIND=C_LONG) :: HIP_STREAM diff --git a/src/trans/gpu/internal/tpm_hicfft.F90 b/src/trans/gpu/internal/tpm_hicfft.F90 index 30c3bf20d..019071635 100755 --- a/src/trans/gpu/internal/tpm_hicfft.F90 +++ b/src/trans/gpu/internal/tpm_hicfft.F90 @@ -63,7 +63,7 @@ SUBROUTINE EXECUTE_DIR_FFT_FLOAT(PREEL_REAL,PREEL_COMPLEX,RESOL_ID,KFIELD,LOENS, INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD INTEGER(KIND=JPIM),INTENT(IN) :: LOENS(:) INTEGER(KIND=JPIB),INTENT(IN) :: OFFSETS(:) - TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN) :: ALLOC + TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN), POINTER :: ALLOC INTERFACE SUBROUTINE EXECUTE_DIR_FFT_FLOAT_C(PREEL_REAL,PREEL_COMPLEX,RESOL_ID,KFIELD,LOENS,OFFSETS,NFFT,ALLOC) & & BIND(C, NAME="execute_dir_fft_float") @@ -99,7 +99,7 @@ SUBROUTINE EXECUTE_DIR_FFT_DOUBLE(PREEL_REAL,PREEL_COMPLEX,RESOL_ID,KFIELD,LOENS INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD INTEGER(KIND=JPIM),INTENT(IN) :: LOENS(:) INTEGER(KIND=JPIB),INTENT(IN) :: OFFSETS(:) - TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN) :: ALLOC + TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN), POINTER :: ALLOC INTERFACE SUBROUTINE EXECUTE_DIR_FFT_DOUBLE_C(PREEL_REAL,PREEL_COMPLEX,RESOL_ID,KFIELD,LOENS,OFFSETS,NFFT,ALLOC) & & BIND(C, NAME="execute_dir_fft_double") @@ -136,7 +136,7 @@ SUBROUTINE EXECUTE_INV_FFT_FLOAT(PREEL_COMPLEX,PREEL_REAL,RESOL_ID,KFIELD,LOENS, INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD INTEGER(KIND=JPIM),INTENT(IN) :: LOENS(:) INTEGER(KIND=JPIB),INTENT(IN) :: OFFSETS(:) - TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN) :: ALLOC + TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN), POINTER :: ALLOC INTERFACE SUBROUTINE EXECUTE_INV_FFT_FLOAT_C(PREEL_COMPLEX,PREEL_REAL,RESOL_ID,KFIELD,LOENS,OFFSETS,NFFT,ALLOC) & & BIND(C, NAME="execute_inv_fft_float") @@ -172,7 +172,7 @@ SUBROUTINE EXECUTE_INV_FFT_DOUBLE(PREEL_COMPLEX,PREEL_REAL,RESOL_ID,KFIELD,LOENS INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD INTEGER(KIND=JPIM),INTENT(IN) :: LOENS(:) INTEGER(KIND=JPIB),INTENT(IN) :: OFFSETS(:) - TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN) :: ALLOC + TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN), POINTER :: ALLOC INTERFACE SUBROUTINE EXECUTE_INV_FFT_DOUBLE_C(PREEL_COMPLEX,PREEL_REAL,RESOL_ID,KFIELD,LOENS,OFFSETS,NFFT,ALLOC) & & BIND(C, NAME="execute_inv_fft_double") From 2f6d01e809088f59c64325c200569e6fe0f50ab8 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Fri, 17 Jan 2025 11:57:11 +0000 Subject: [PATCH 19/24] Add back FIRSTPRIVATE This was removed by mistake. --- src/trans/gpu/internal/uvtvd_mod.F90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/trans/gpu/internal/uvtvd_mod.F90 b/src/trans/gpu/internal/uvtvd_mod.F90 index 6362e8119..739592c49 100755 --- a/src/trans/gpu/internal/uvtvd_mod.F90 +++ b/src/trans/gpu/internal/uvtvd_mod.F90 @@ -119,7 +119,7 @@ SUBROUTINE UVTVD(KF_UV,PU,PV,PVOR,PDIV) !$OMP& SHARED(D,R,KF_UV,FG,PVOR,PV,PU,PDIV) DEFAULT(NONE) #endif #ifdef ACCGPU -!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(IR,II,IN,KM,ZKM,ZJN) DEFAULT(NONE) & +!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(IR,II,IN,KM,ZKM,ZJN) FIRSTPRIVATE(KF_UV) DEFAULT(NONE) & #ifndef _CRAYFTN !$ACC& ASYNC(1) #else From ce53d97bb987b18cd9ed7e37990aaa8d7af0e0af Mon Sep 17 00:00:00 2001 From: Paul Mullowney Date: Thu, 16 Jan 2025 12:24:18 -0600 Subject: [PATCH 20/24] Adding missing ACCGPU protections --- .../gpu/algor/buffered_allocator_mod.F90 | 26 +++++++ src/trans/gpu/algor/ext_acc.F90 | 74 ++++++++++++++++--- src/trans/gpu/algor/growing_allocator_mod.F90 | 8 +- src/trans/gpu/algor/hicblas_mod.F90 | 20 +++++ src/trans/gpu/internal/trgtol_mod.F90 | 25 ++++++- src/trans/gpu/internal/trltog_mod.F90 | 29 +++++++- 6 files changed, 166 insertions(+), 16 deletions(-) diff --git a/src/trans/gpu/algor/buffered_allocator_mod.F90 b/src/trans/gpu/algor/buffered_allocator_mod.F90 index ba613fe7a..346b85977 100644 --- a/src/trans/gpu/algor/buffered_allocator_mod.F90 +++ b/src/trans/gpu/algor/buffered_allocator_mod.F90 @@ -12,7 +12,9 @@ MODULE BUFFERED_ALLOCATOR_MOD USE ABORT_TRANS_MOD, ONLY: ABORT_TRANS USE ISO_C_BINDING, ONLY: C_INT8_T, C_SIZE_T, C_LOC, C_F_POINTER USE GROWING_ALLOCATOR_MOD, ONLY: GROWING_ALLOCATION_TYPE +#ifdef ACCGPU USE OPENACC, ONLY: ACC_ASYNC_SYNC +#endif IMPLICIT NONE @@ -142,16 +144,28 @@ SUBROUTINE ASSIGN_PTR_FLOAT(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES, SET_VALUE IF (PRESENT(SET_STREAM)) THEN SET_STREAM_EFF = SET_STREAM ELSE +#ifdef ACCGPU SET_STREAM_EFF = ACC_ASYNC_SYNC +#endif +#ifdef OMPGPU +#endif ENDIF IF (SET_VALUE_EFF .AND. LENGTH_IN_BYTES > 0) THEN ! This option is turned off by default, but for experimentation we can turn it on. This is ! setting all bits to 1 (meaning NaN in floating point) +#ifdef ACCGPU !$ACC PARALLEL PRESENT(SRC) ASYNC(SET_STREAM_EFF) +#endif +#ifdef OMPGPU +#endif DO J=1_C_SIZE_T,LENGTH_IN_BYTES SRC(J) = -1 ENDDO +#ifdef ACCGPU !$ACC END PARALLEL +#endif +#ifdef OMPGPU +#endif ENDIF CALL C_F_POINTER(C_LOC(SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES-1)), DST, & & [C_SIZEOF(SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES-1))/C_SIZEOF(DST(0))]) @@ -180,17 +194,29 @@ SUBROUTINE ASSIGN_PTR_DOUBLE(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES, SET_VALU IF (PRESENT(SET_STREAM)) THEN SET_STREAM_EFF = SET_STREAM ELSE +#ifdef ACCGPU SET_STREAM_EFF = ACC_ASYNC_SYNC +#endif +#ifdef OMPGPU +#endif ENDIF IF (SET_VALUE_EFF .AND. LENGTH_IN_BYTES > 0) THEN ! This option is turned off by default, but for experimentation we can turn it on. This is ! setting all bits to 1 (meaning NaN in floating point) END_IN_BYTES=START_IN_BYTES+LENGTH_IN_BYTES-1 +#ifdef ACCGPU !$ACC PARALLEL PRESENT(SRC) ASYNC(SET_STREAM_EFF) +#endif +#ifdef OMPGPU +#endif DO J=1_C_SIZE_T,LENGTH_IN_BYTES SRC(J) = -1 ENDDO +#ifdef ACCGPU !$ACC END PARALLEL +#endif +#ifdef OMPGPU +#endif ENDIF CALL C_F_POINTER(C_LOC(SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES-1)), DST, & & [C_SIZEOF(SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES-1))/C_SIZEOF(DST(0))]) diff --git a/src/trans/gpu/algor/ext_acc.F90 b/src/trans/gpu/algor/ext_acc.F90 index a1f4db89a..f9c5bf5a6 100644 --- a/src/trans/gpu/algor/ext_acc.F90 +++ b/src/trans/gpu/algor/ext_acc.F90 @@ -18,13 +18,22 @@ module openacc_ext_type end module module openacc_ext use iso_c_binding, only: c_ptr, c_size_t, c_loc, c_sizeof +#ifdef ACCGPU use openacc, only: acc_handle_kind +#endif +#ifdef OMPGPU +#endif use openacc_ext_type, only: ext_acc_arr_desc implicit none private public :: ext_acc_pass, ext_acc_create, ext_acc_copyin, ext_acc_copyout, & - & ext_acc_delete, ext_acc_arr_desc, acc_handle_kind + & ext_acc_delete, ext_acc_arr_desc, & +#ifdef ACCGPU + acc_handle_kind +#endif +#ifdef OMPGPU +#endif type common_pointer_descr type(c_ptr) :: ptr @@ -247,107 +256,150 @@ function get_common_pointers(in_ptrs, out_ptrs) result(num_ranges) enddo end function subroutine ext_acc_create(ptrs, stream) +#ifdef ACCGPU use openacc, only: acc_async_sync +#endif use iso_fortran_env, only: int32 implicit none type(ext_acc_arr_desc), intent(in) :: ptrs(:) +#ifdef ACCGPU integer(acc_handle_kind), optional :: stream +#endif +#ifdef OMPGPU + integer(4), optional :: stream +#endif type(common_pointer_descr), allocatable :: common_ptrs(:) integer :: i, num_ranges integer(kind=int32), pointer :: pp(:) +#ifdef ACCGPU integer(acc_handle_kind) :: stream_act - if (present(stream)) then stream_act = stream else stream_act = acc_async_sync endif +#endif allocate(common_ptrs(size(ptrs))) num_ranges = get_common_pointers(ptrs, common_ptrs) do i = 1, num_ranges call c_f_pointer(common_ptrs(i)%ptr, pp, [common_ptrs(i)%sz/c_sizeof(pp(1))]) +#ifdef ACCGPU !$acc enter data create(pp) async(stream_act) +#endif +#ifdef OMPGPU +#endif enddo end subroutine subroutine ext_acc_copyin(ptrs, stream) +#ifdef ACCGPU use openacc, only: acc_async_sync +#endif implicit none type(ext_acc_arr_desc), intent(in) :: ptrs(:) +#ifdef ACCGPU integer(acc_handle_kind), optional :: stream +#endif +#ifdef OMPGPU + integer(4), optional :: stream +#endif type(common_pointer_descr), allocatable :: common_ptrs(:) integer :: i, num_ranges integer(4), pointer :: pp(:) - +#ifdef ACCGPU integer(acc_handle_kind) :: stream_act - if (present(stream)) then stream_act = stream else stream_act = acc_async_sync endif +#endif allocate(common_ptrs(size(ptrs))) num_ranges = get_common_pointers(ptrs, common_ptrs) do i = 1, num_ranges call c_f_pointer(common_ptrs(i)%ptr, pp, [common_ptrs(i)%sz/c_sizeof(pp(1))]) +#ifdef ACCGPU !$acc enter data copyin(pp) async(stream_act) +#endif +#ifdef OMPGPU +#endif enddo end subroutine subroutine ext_acc_copyout(ptrs, stream) +#ifdef ACCGPU use openacc, only: acc_async_sync +#endif implicit none type(ext_acc_arr_desc), intent(in) :: ptrs(:) +#ifdef ACCGPU integer(acc_handle_kind), optional :: stream - +#endif +#ifdef OMPGPU + integer(4), optional :: stream +#endif type(common_pointer_descr), allocatable :: common_ptrs(:) integer :: i, num_ranges integer(4), pointer :: pp(:) - +#ifdef ACCGPU integer(acc_handle_kind) :: stream_act - if (present(stream)) then stream_act = stream else stream_act = acc_async_sync - endif + endif +#endif allocate(common_ptrs(size(ptrs))) num_ranges = get_common_pointers(ptrs, common_ptrs) do i = 1, num_ranges call c_f_pointer(common_ptrs(i)%ptr, pp, [common_ptrs(i)%sz/c_sizeof(pp(1))]) +#ifdef ACCGPU !$acc exit data copyout(pp) async(stream_act) +#endif +#ifdef OMPGPU +#endif enddo end subroutine subroutine ext_acc_delete(ptrs, stream) +#ifdef ACCGPU use openacc, only: acc_async_sync +#endif implicit none type(ext_acc_arr_desc), intent(in) :: ptrs(:) +#ifdef ACCGPU integer(acc_handle_kind), optional :: stream - +#else + integer(4), optional :: stream +#endif type(common_pointer_descr), allocatable :: common_ptrs(:) integer :: i, num_ranges integer(4), pointer :: pp(:) - +#ifdef ACCGPU integer(acc_handle_kind) :: stream_act if (present(stream)) then stream_act = stream else stream_act = acc_async_sync - endif + endif +#endif allocate(common_ptrs(size(ptrs))) num_ranges = get_common_pointers(ptrs, common_ptrs) do i = 1, num_ranges call c_f_pointer(common_ptrs(i)%ptr, pp, [common_ptrs(i)%sz/c_sizeof(pp(1))]) +#ifdef ACCGPU !$acc exit data delete(pp) async(stream_act) +#endif +#ifdef OMPGPU +#endif enddo end subroutine end module diff --git a/src/trans/gpu/algor/growing_allocator_mod.F90 b/src/trans/gpu/algor/growing_allocator_mod.F90 index 844194e0c..cf4788339 100644 --- a/src/trans/gpu/algor/growing_allocator_mod.F90 +++ b/src/trans/gpu/algor/growing_allocator_mod.F90 @@ -43,7 +43,7 @@ SUBROUTINE REALLOCATE_GROWING_ALLOCATION(ALLOC, SZ) IF (.NOT. ASSOCIATED(ALLOC%PTR)) THEN ALLOCATE(ALLOC%PTR(SZ)) -#ifdef OMPGU +#ifdef OMPGPU !$OMP TARGET ENTER DATA MAP(ALLOC:ALLOC%PTR) #endif #ifdef ACCGPU @@ -98,6 +98,12 @@ SUBROUTINE DESTROY_GROWING_ALLOCATOR(ALLOC) CALL ALLOC%FREE_FUNCS(I)%FUNC(ALLOC%PTR, & SIZE(ALLOC%PTR, 1, C_SIZE_T)) ENDDO +#ifdef OMPGPU + !$OMP TARGET EXIT DATA MAP(DELETE:ALLOC%PTR) +#endif +#ifdef ACCGPU + !$ACC EXIT DATA DELETE(ALLOC%PTR) +#endif !$ACC EXIT DATA DELETE(ALLOC%PTR) DEALLOCATE(ALLOC%PTR) NULLIFY(ALLOC%PTR) diff --git a/src/trans/gpu/algor/hicblas_mod.F90 b/src/trans/gpu/algor/hicblas_mod.F90 index 528680f66..da0790ca0 100644 --- a/src/trans/gpu/algor/hicblas_mod.F90 +++ b/src/trans/gpu/algor/hicblas_mod.F90 @@ -16,7 +16,11 @@ MODULE HICBLAS_MOD USE EC_PARKIND, ONLY: JPIM, JPRM, JPRD, JPIB USE GROWING_ALLOCATOR_MOD, ONLY: GROWING_ALLOCATION_TYPE +#ifdef ACCGPU USE OPENACC_LIB, ONLY: ACC_GET_HIP_STREAM +#endif +#ifdef OMPGPU +#endif IMPLICIT NONE @@ -147,7 +151,11 @@ SUBROUTINE HIP_DGEMM_BATCHED_OVERLOAD( & INTEGER(KIND=C_LONG) :: HIP_STREAM +#ifdef ACCGPU HIP_STREAM = INT(ACC_GET_HIP_STREAM(STREAM), C_LONG) +#endif +#ifdef OMPGPU +#endif #if defined(_CRAYFTN) !$ACC HOST_DATA USE_DEVICE(AARRAY,BARRAY,CARRAY) @@ -197,7 +205,11 @@ SUBROUTINE HIP_SGEMM_BATCHED_OVERLOAD( & INTEGER(KIND=C_LONG) :: HIP_STREAM +#ifdef ACCGPU HIP_STREAM = INT(ACC_GET_HIP_STREAM(STREAM), C_LONG) +#endif +#ifdef OMPGPU +#endif CALL HIP_SGEMM_BATCHED( & & TRANSA, TRANSB, & @@ -243,7 +255,11 @@ SUBROUTINE HIP_DGEMM_GROUPED_OVERLOAD( & INTEGER(KIND=C_LONG) :: HIP_STREAM +#ifdef ACCGPU HIP_STREAM = INT(ACC_GET_HIP_STREAM(STREAM), C_LONG) +#endif +#ifdef OMPGPU +#endif CALL HIP_DGEMM_GROUPED( & & RESOL_ID, BLAS_ID, TRANSA, TRANSB, & @@ -290,7 +306,11 @@ SUBROUTINE HIP_SGEMM_GROUPED_OVERLOAD(& INTEGER(KIND=C_LONG) :: HIP_STREAM +#ifdef ACCGPU HIP_STREAM = INT(ACC_GET_HIP_STREAM(STREAM), C_LONG) +#endif +#ifdef OMPGPU +#endif #if defined(_CRAYFTN) !$ACC HOST_DATA USE_DEVICE(AARRAY,BARRAY,CARRAY) diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index 055534cbf..95188f37b 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -124,7 +124,9 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION USE OPENACC_EXT, ONLY: EXT_ACC_ARR_DESC, EXT_ACC_PASS, EXT_ACC_CREATE, & & EXT_ACC_DELETE +#ifdef ACCGPU USE OPENACC, ONLY: ACC_HANDLE_KIND +#endif USE ABORT_TRANS_MOD, ONLY: ABORT_TRANS IMPLICIT NONE @@ -380,7 +382,15 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1 ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGP3B) ENDIF - IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_CREATE(ACC_POINTERS(1:ACC_POINTERS_CNT),STREAM=1_ACC_HANDLE_KIND) + + IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_CREATE(ACC_POINTERS(1:ACC_POINTERS_CNT), & +#ifdef ACCGPU + & STREAM=1_ACC_HANDLE_KIND) +#endif +#ifdef OMPGPU + & STREAM=1) +#endif + #ifdef ACCGPU !$ACC WAIT(1) #endif @@ -595,8 +605,12 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, !$ACC HOST_DATA USE_DEVICE(ZCOMBUFR,ZCOMBUFS) #endif #else +#ifdef OMPGPU +#endif +#ifdef ACCGPU !! this is safe-but-slow fallback for running without GPU-aware MPI !$ACC UPDATE HOST(ZCOMBUFS) IF(ISEND_COUNTS > 0) +#endif #endif ! Skip the own contribution because this is ok to overflow @@ -789,7 +803,14 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, !$ACC END DATA !PGPUV !$ACC END DATA !PGP #endif - IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_DELETE(ACC_POINTERS(1:ACC_POINTERS_CNT),STREAM=1_ACC_HANDLE_KIND) + + IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_DELETE(ACC_POINTERS(1:ACC_POINTERS_CNT), & +#ifdef ACCGPU + & STREAM=1_ACC_HANDLE_KIND) +#endif +#ifdef OMPGPU + & STREAM=1) +#endif IF (LHOOK) CALL DR_HOOK('TRGTOL',1,ZHOOK_HANDLE) END SUBROUTINE TRGTOL diff --git a/src/trans/gpu/internal/trltog_mod.F90 b/src/trans/gpu/internal/trltog_mod.F90 index d27cc74a9..7df708b0c 100755 --- a/src/trans/gpu/internal/trltog_mod.F90 +++ b/src/trans/gpu/internal/trltog_mod.F90 @@ -125,7 +125,9 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, USE ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF USE OPENACC_EXT, ONLY: EXT_ACC_ARR_DESC, EXT_ACC_PASS, EXT_ACC_CREATE, & & EXT_ACC_DELETE +#ifdef ACCGPU USE OPENACC, ONLY: ACC_HANDLE_KIND +#endif IMPLICIT NONE @@ -517,7 +519,15 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1 ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGP3B) ENDIF - IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_CREATE(ACC_POINTERS(1:ACC_POINTERS_CNT),STREAM=1_ACC_HANDLE_KIND) + + IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_CREATE(ACC_POINTERS(1:ACC_POINTERS_CNT), & +#ifdef ACCGPU + & STREAM=1_ACC_HANDLE_KIND) +#endif +#ifdef OMPGPU + & STREAM=1) +#endif + #ifdef OMPGPU #endif #ifdef ACCGPU @@ -711,8 +721,12 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, !$ACC HOST_DATA USE_DEVICE(ZCOMBUFS,ZCOMBUFR) #endif #else +#ifdef OMPGPU +#endif +#ifdef ACCGPU !! this is safe-but-slow fallback for running without GPU-aware MPI !$ACC UPDATE HOST(ZCOMBUFS) IF(ISEND_COUNTS > 0) +#endif #endif ! Skip the own contribution because this is ok to overflow @@ -766,8 +780,12 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, !$ACC END HOST_DATA #endif #else +#ifdef OMPGPU +#endif +#ifdef ACCGPU !! this is safe-but-slow fallback for running without GPU-aware MPI !$ACC UPDATE DEVICE(ZCOMBUFR) IF(IRECV_COUNTS > 0) +#endif #endif IF (LSYNC_TRANS) THEN @@ -922,7 +940,14 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, !$ACC UPDATE HOST(PGP3B) ASYNC(1) #endif ENDIF - IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_DELETE(ACC_POINTERS(1:ACC_POINTERS_CNT),STREAM=1_ACC_HANDLE_KIND) + IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_DELETE(ACC_POINTERS(1:ACC_POINTERS_CNT), & +#ifdef ACCGPU + & STREAM=1_ACC_HANDLE_KIND) +#endif +#ifdef OMPGPU + & STREAM=1) +#endif + IF (LSYNC_TRANS) THEN #ifdef ACCGPU !$ACC WAIT(1) From c6bd2391ed3d0494aac499cceca22e52dfb8a32b Mon Sep 17 00:00:00 2001 From: Paul Mullowney Date: Thu, 16 Jan 2025 13:07:40 -0600 Subject: [PATCH 21/24] slight fix --- src/trans/gpu/algor/ext_acc.F90 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/trans/gpu/algor/ext_acc.F90 b/src/trans/gpu/algor/ext_acc.F90 index f9c5bf5a6..6d9823f75 100644 --- a/src/trans/gpu/algor/ext_acc.F90 +++ b/src/trans/gpu/algor/ext_acc.F90 @@ -28,11 +28,11 @@ module openacc_ext private public :: ext_acc_pass, ext_acc_create, ext_acc_copyin, ext_acc_copyout, & - & ext_acc_delete, ext_acc_arr_desc, & #ifdef ACCGPU - acc_handle_kind + & ext_acc_delete, ext_acc_arr_desc, acc_handle_kind #endif #ifdef OMPGPU + & ext_acc_delete, ext_acc_arr_desc #endif type common_pointer_descr From 079a464e1e69666274d689a8349a9587ea68681c Mon Sep 17 00:00:00 2001 From: Paul Mullowney Date: Fri, 17 Jan 2025 10:31:56 -0600 Subject: [PATCH 22/24] Requested fixes --- src/trans/gpu/algor/ext_acc.F90 | 2 +- src/trans/gpu/algor/growing_allocator_mod.F90 | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/trans/gpu/algor/ext_acc.F90 b/src/trans/gpu/algor/ext_acc.F90 index 6d9823f75..296c5902e 100644 --- a/src/trans/gpu/algor/ext_acc.F90 +++ b/src/trans/gpu/algor/ext_acc.F90 @@ -388,7 +388,7 @@ subroutine ext_acc_delete(ptrs, stream) stream_act = stream else stream_act = acc_async_sync - endif + endif #endif allocate(common_ptrs(size(ptrs))) num_ranges = get_common_pointers(ptrs, common_ptrs) diff --git a/src/trans/gpu/algor/growing_allocator_mod.F90 b/src/trans/gpu/algor/growing_allocator_mod.F90 index cf4788339..283db018d 100644 --- a/src/trans/gpu/algor/growing_allocator_mod.F90 +++ b/src/trans/gpu/algor/growing_allocator_mod.F90 @@ -104,7 +104,6 @@ SUBROUTINE DESTROY_GROWING_ALLOCATOR(ALLOC) #ifdef ACCGPU !$ACC EXIT DATA DELETE(ALLOC%PTR) #endif - !$ACC EXIT DATA DELETE(ALLOC%PTR) DEALLOCATE(ALLOC%PTR) NULLIFY(ALLOC%PTR) ENDIF From 26e33e201417aff61fe1756d9412dab40396d9bd Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Tue, 21 Jan 2025 10:48:47 +0000 Subject: [PATCH 23/24] Fix indentation --- src/trans/gpu/algor/ext_acc.F90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/trans/gpu/algor/ext_acc.F90 b/src/trans/gpu/algor/ext_acc.F90 index 296c5902e..ffc760e61 100644 --- a/src/trans/gpu/algor/ext_acc.F90 +++ b/src/trans/gpu/algor/ext_acc.F90 @@ -352,7 +352,7 @@ subroutine ext_acc_copyout(ptrs, stream) stream_act = stream else stream_act = acc_async_sync - endif + endif #endif allocate(common_ptrs(size(ptrs))) num_ranges = get_common_pointers(ptrs, common_ptrs) From 388fd006b7371ef982d200e6f1cac89dc4c88857 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Tue, 21 Jan 2025 13:33:55 +0200 Subject: [PATCH 24/24] Add some more #ifdef ACCGPU guards --- src/trans/gpu/algor/hicblas_mod.F90 | 8 ++++++++ src/trans/gpu/external/gpnorm_trans.F90 | 23 +++++++++++++++++++++++ src/trans/gpu/internal/ltdir_mod.F90 | 2 ++ 3 files changed, 33 insertions(+) diff --git a/src/trans/gpu/algor/hicblas_mod.F90 b/src/trans/gpu/algor/hicblas_mod.F90 index da0790ca0..6ffc90812 100644 --- a/src/trans/gpu/algor/hicblas_mod.F90 +++ b/src/trans/gpu/algor/hicblas_mod.F90 @@ -158,7 +158,9 @@ SUBROUTINE HIP_DGEMM_BATCHED_OVERLOAD( & #endif #if defined(_CRAYFTN) +#ifdef ACCGPU !$ACC HOST_DATA USE_DEVICE(AARRAY,BARRAY,CARRAY) +#endif #endif CALL HIP_DGEMM_BATCHED( & & TRANSA, TRANSB, & @@ -170,8 +172,10 @@ SUBROUTINE HIP_DGEMM_BATCHED_OVERLOAD( & & CARRAY, LDC, STRIDEC, & & BATCHCOUNT, HIP_STREAM, C_LOC(ALLOC)) #if defined(_CRAYFTN) +#ifdef ACCGPU !$ACC END HOST_DATA #endif +#endif END SUBROUTINE HIP_DGEMM_BATCHED_OVERLOAD SUBROUTINE HIP_SGEMM_BATCHED_OVERLOAD( & @@ -313,7 +317,9 @@ SUBROUTINE HIP_SGEMM_GROUPED_OVERLOAD(& #endif #if defined(_CRAYFTN) +#ifdef ACCGPU !$ACC HOST_DATA USE_DEVICE(AARRAY,BARRAY,CARRAY) +#endif #endif CALL HIP_SGEMM_GROUPED( & & RESOL_ID, BLAS_ID, TRANSA, TRANSB, & @@ -325,8 +331,10 @@ SUBROUTINE HIP_SGEMM_GROUPED_OVERLOAD(& & CARRAY, LDC, OFFSETC, & & BATCHCOUNT, HIP_STREAM, C_LOC(ALLOC)) #if defined(_CRAYFTN) +#ifdef ACCGPU !$ACC END HOST_DATA #endif +#endif END SUBROUTINE HIP_SGEMM_GROUPED_OVERLOAD diff --git a/src/trans/gpu/external/gpnorm_trans.F90 b/src/trans/gpu/external/gpnorm_trans.F90 index ff86aa325..80ad505b7 100755 --- a/src/trans/gpu/external/gpnorm_trans.F90 +++ b/src/trans/gpu/external/gpnorm_trans.F90 @@ -229,12 +229,19 @@ SUBROUTINE GPNORM_TRANS(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) ! FIRST DO SUMS IN EACH FULL LATITUDE +#ifdef OMPGPU + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO +#endif +#ifdef ACCGPU !$ACC KERNELS +#endif DO JGL=1,D%NDGL_FS IGL = D_NPTRLS(MYSETW) + JGL - 1 DO JF=1,IF_FS ZAVE(JF,JGL)=0.0_JPRB +#ifdef ACCGPU !$ACC loop +#endif DO JL=1,G_NLOEN(IGL) V = PREEL_REAL(IF_FS*D%NSTAGTF(JGL)+(JF-1)*(D%NSTAGTF(JGL+1)-D%NSTAGTF(JGL))+JL) ZAVE(JF,JGL)=ZAVE(JF,JGL)+V @@ -243,16 +250,30 @@ SUBROUTINE GPNORM_TRANS(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) ENDDO ENDDO ENDDO +#ifdef ACCGPU !$ACC END KERNELS +#endif +#ifdef OMPGPU + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO +#endif +#ifdef ACCGPU !$ACC KERNELS +#endif DO JF=1,IF_FS ZMINGPN(JF)=MINVAL(ZMINGL(JF,IBEG:IEND)) ZMAXGPN(JF)=MAXVAL(ZMAXGL(JF,IBEG:IEND)) ENDDO +#ifdef ACCGPU !$ACC END KERNELS +#endif +#ifdef OMPGPU + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO +#endif +#ifdef ACCGPU !$ACC KERNELS +#endif DO JGL=IBEG,IEND IGL = D_NPTRLS(MYSETW) + JGL - 1 DO JF=1,IF_FS @@ -260,7 +281,9 @@ SUBROUTINE GPNORM_TRANS(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) !write(iunit,*) 'aver inside ',JF,IF_FS,IGL,ZAVE(JF,JGL), F_RW(IGL), G_NLOEN(IGL),ZMINGPN(JF),ZMAXGPN(JF) ENDDO ENDDO +#ifdef ACCGPU !$ACC END KERNELS +#endif #ifdef OMPGPU !$OMP END TARGET DATA diff --git a/src/trans/gpu/internal/ltdir_mod.F90 b/src/trans/gpu/internal/ltdir_mod.F90 index 567e76a89..a087c8e9f 100755 --- a/src/trans/gpu/internal/ltdir_mod.F90 +++ b/src/trans/gpu/internal/ltdir_mod.F90 @@ -261,7 +261,9 @@ SUBROUTINE LTDIR(ALLOCATOR,HLTDIR,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV,KF_SCALA & PSPSC3A,PSPSC3B,PSPSC2 , & & KFLDPTRUV,KFLDPTRSC) +#ifdef ACCGPU !$ACC WAIT(1) +#endif IF (LSYNC_TRANS) THEN CALL GSTATS(430,0)