diff --git a/src/trans/common/internal/tpm_distr.F90 b/src/trans/common/internal/tpm_distr.F90 index 6a151192f..5587be6b8 100755 --- a/src/trans/common/internal/tpm_distr.F90 +++ b/src/trans/common/internal/tpm_distr.F90 @@ -171,7 +171,8 @@ MODULE TPM_DISTR REAL(KIND=JPRD) ,ALLOCATABLE :: RWEIGHT(:) ! Weight per grid-point (if weighted distribution) INTEGER(KIND=JPIM) ,ALLOCATABLE :: NPROCA_GP(:) ! Number of grid-points per a-set -INTEGER(KIND=JPIM), ALLOCATABLE :: OFFSETS_GEMM1(:), OFFSETS_GEMM2(:) +INTEGER(KIND=JPIM), ALLOCATABLE :: OFFSETS_GEMM1(:), OFFSETS_GEMM2(:), OFFSETS_GEMM_MATRIX(:) +INTEGER(KIND=JPIM), ALLOCATABLE :: LEGENDRE_MATRIX_STRIDES(:) END TYPE DISTR_TYPE diff --git a/src/trans/gpu/algor/hicblas_cutlass.cuda.h b/src/trans/gpu/algor/hicblas_cutlass.cuda.h index 7a842a808..f0848f106 100644 --- a/src/trans/gpu/algor/hicblas_cutlass.cuda.h +++ b/src/trans/gpu/algor/hicblas_cutlass.cuda.h @@ -151,7 +151,7 @@ class cutlass_sgemm_grouped { template void cutlass_sgemm_wrapper_grouped_op(int blas_id, int m, int *n, int *k, float alpha, const float *A, int lda, - int *offsetsA, const float *B, int ldb, + int *offsetsA, const float *B, int *ldb, int *offsetsB, float beta, float *C, int ldc, int *offsetsC, int batchCount, cudaStream_t stream, @@ -179,7 +179,7 @@ void cutlass_sgemm_wrapper_grouped_op(int blas_id, int m, int *n, int *k, void cutlass_sgemm_wrapper_grouped(int blas_id, char transa, char transb, int m, int *n, int *k, float alpha, const float *A, int lda, int *offsetsA, - const float *B, int ldb, int *offsetsB, float beta, + const float *B, int *ldb, int *offsetsB, float beta, float *C, int ldc, int *offsetsC, int batchCount, cudaStream_t stream, void *growing_allocator) { diff --git a/src/trans/gpu/algor/hicblas_gemm.hip.cpp b/src/trans/gpu/algor/hicblas_gemm.hip.cpp index 9d6178bed..82ef7588c 100644 --- a/src/trans/gpu/algor/hicblas_gemm.hip.cpp +++ b/src/trans/gpu/algor/hicblas_gemm.hip.cpp @@ -63,11 +63,12 @@ template void free_gemm_cache(float *, size_t) { // this version is using graphs and caches the graphs template -void run_group_graph(Gemm &&gemm, int m, int *n, int *k, Real alpha, - const Real *A, int lda, int *offsetsA, const Real *B, - int ldb, int *offsetsB, Real beta, Real *C, int ldc, - int *offsetsC, int batchCount, hipStream_t stream, - int blas_id, void *growing_allocator) { +void run_group_graph(Gemm &&gemm, int const m, int const *n, int const *k, + Real alpha, const Real *A, int lda, int const *offsetsA, + const Real *B, int const *ldb, int const *offsetsB, Real beta, + Real *C, int ldc, int const *offsetsC, int batchCount, + hipStream_t stream, int blas_id, + void *growing_allocator) { growing_allocator_register_free_c(growing_allocator, free_gemm_cache); @@ -86,7 +87,7 @@ void run_group_graph(Gemm &&gemm, int m, int *n, int *k, Real alpha, // the plan is cached, but the pointers are not correct. we remove and // delete the graph, but we keep the hipblas handles, if this happens more // often, we should cache this... - std::cout << "WARNING GEMM: POINTER CHANGE - Graph recreation might be slow." << std::endl; + std::cout << "WARNING GEMM: POINTER CHANGE - Graph recreation might be slow.\n"; std::cout << "We have an entry with key {m=" << m << ", blas_id=" << blas_id << "}\n"; std::cout << "Pointers: " << std::get<0>(ptrs->second) << ", " @@ -111,7 +112,7 @@ void run_group_graph(Gemm &&gemm, int m, int *n, int *k, Real alpha, HIC_CHECK(hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal)); gemm(stream, m, n[i], k[i], alpha, A + offsetsA[i], lda, B + offsetsB[i], - ldb, beta, C + offsetsC[i], ldc); + ldb[i], beta, C + offsetsC[i], ldc); hipGraph_t my_graph; HIC_CHECK(hipStreamEndCapture(stream, &my_graph)); hipGraphNode_t my_node; @@ -133,14 +134,14 @@ void run_group_graph(Gemm &&gemm, int m, int *n, int *k, Real alpha, // stupid simple gemm calls template void run_group(Gemm &&gemm, int m, int *n, int *k, Real alpha, const Real *A, - int lda, int *offsetsA, const Real *B, int ldb, int *offsetsB, + int lda, int *offsetsA, const Real *B, int *ldb, int *offsetsB, Real beta, Real *C, int ldc, int *offsetsC, int batchCount, hipStream_t stream, int = -1) { for (int i = 0; i < batchCount; ++i) { if (m == 0 || n[i] == 0 || k[i] == 0) continue; - gemm(stream, m, n[i], k[i], alpha, A + offsetsA[i], lda, B + offsetsB[i], ldb, - beta, C + offsetsC[i], ldc); + gemm(stream, m, n[i], k[i], alpha, A + offsetsA[i], lda, B + offsetsB[i], + ldb[i], beta, C + offsetsC[i], ldc); } } @@ -187,7 +188,7 @@ template struct hipblas_gemm_grouped { void hipblas_sgemm_wrapper_grouped(int blas_id, char transa, char transb, int m, int *n, int *k, float alpha, const float *A, int lda, int *offsetsA, - const float *B, int ldb, int *offsetsB, float beta, + const float *B, int *ldb, int *offsetsB, float beta, float *C, int ldc, int *offsetsC, int batchCount, hipStream_t stream, void *growing_allocator) { @@ -216,7 +217,7 @@ void hipblas_dgemm_wrapper_grouped(int blas_id, char transa, char transb, int m, int *n, int *k, double alpha, const double *A, int lda, int *offsetsA, - const double *B, int ldb, int *offsetsB, + const double *B, int *ldb, int *offsetsB, double beta, double *C, int ldc, int *offsetsC, int batchCount, hipStream_t stream, void *) { @@ -293,7 +294,7 @@ void hipblas_sgemm_wrapper (char transa, char transb, void hipblas_sgemm_wrapper_grouped(int blas_id, char transa, char transb, int m, int *n, int *k, float alpha, const float *A, int lda, int *offsetsA, - const float *B, int ldb, int *offsetsB, float beta, + const float *B, int *ldb, int *offsetsB, float beta, float *C, int ldc, int *offsetsC, int batchCount, size_t stream, void *growing_allocator) { @@ -314,7 +315,7 @@ void hipblas_sgemm_wrapper_grouped(int blas_id, char transa, char transb, void hipblas_dgemm_wrapper_grouped(int blas_id, char transa, char transb, int m, int *n, int *k, double alpha, const double *A, int lda, int *offsetsA, - const double *B, int ldb, int *offsetsB, double beta, + const double *B, int *ldb, int *offsetsB, double beta, double *C, int ldc, int *offsetsC, int batchCount, size_t stream, void *growing_allocator) { diff --git a/src/trans/gpu/algor/hicblas_mod.F90 b/src/trans/gpu/algor/hicblas_mod.F90 index 988e1b3ef..1ba88bc5d 100644 --- a/src/trans/gpu/algor/hicblas_mod.F90 +++ b/src/trans/gpu/algor/hicblas_mod.F90 @@ -77,8 +77,8 @@ SUBROUTINE HIP_DGEMM_GROUPED( & &) BIND(C, NAME='hipblas_dgemm_wrapper_grouped') USE ISO_C_BINDING, ONLY: C_CHAR, C_INT, C_DOUBLE, C_SIZE_T, C_PTR CHARACTER(1,C_CHAR), VALUE :: CTA, CTB - INTEGER(C_INT), VALUE :: BLAS_ID, M, LDA, LDB, LDC, BATCHCOUNT - INTEGER(C_INT) :: N(*), K(*), OFFSETA(*), OFFSETB(*), OFFSETC(*) + INTEGER(C_INT), VALUE :: BLAS_ID, M, LDA, LDC, BATCHCOUNT + INTEGER(C_INT) :: N(*), K(*), LDB(*), OFFSETA(*), OFFSETB(*), OFFSETC(*) REAL(C_DOUBLE), VALUE :: ALPHA,BETA REAL(C_DOUBLE) :: A(*), B(*), C(*) INTEGER(KIND=C_SIZE_T) :: STREAM @@ -97,8 +97,8 @@ SUBROUTINE HIP_SGEMM_GROUPED( & &) BIND(C, NAME='hipblas_sgemm_wrapper_grouped') USE ISO_C_BINDING, ONLY: C_CHAR, C_INT, C_FLOAT, C_SIZE_T, C_PTR CHARACTER(1,C_CHAR), VALUE :: CTA, CTB - INTEGER(C_INT), VALUE :: BLAS_ID, M, LDA, LDB, LDC, BATCHCOUNT - INTEGER(C_INT) :: N(*), K(*), OFFSETA(*), OFFSETB(*), OFFSETC(*) + INTEGER(C_INT), VALUE :: BLAS_ID, M, LDA, LDC, BATCHCOUNT + INTEGER(C_INT) :: N(*), K(*), LDB(*), OFFSETA(*), OFFSETB(*), OFFSETC(*) REAL(C_FLOAT), VALUE :: ALPHA,BETA REAL(C_FLOAT) :: A(*), B(*), C(*) INTEGER(KIND=C_SIZE_T) :: STREAM @@ -221,8 +221,8 @@ SUBROUTINE HIP_DGEMM_GROUPED_OVERLOAD( & REAL(KIND=JPRD), DIMENSION(:) :: AARRAY INTEGER(KIND=JPIM) :: LDA INTEGER(KIND=JPIM) :: OFFSETA(:) - REAL(KIND=JPRD), DIMENSION(*) :: BARRAY - INTEGER(KIND=JPIM) :: LDB + REAL(KIND=JPRD), DIMENSION(:) :: BARRAY + INTEGER(KIND=JPIM) :: LDB(:) INTEGER(KIND=JPIM) :: OFFSETB(:) REAL(KIND=JPRD) :: BETA REAL(KIND=JPRD), DIMENSION(:) :: CARRAY @@ -267,8 +267,8 @@ SUBROUTINE HIP_SGEMM_GROUPED_OVERLOAD(& REAL(KIND=JPRM), DIMENSION(:) :: AARRAY INTEGER(KIND=JPIM) :: LDA INTEGER(KIND=JPIM) :: OFFSETA(:) - REAL(KIND=JPRM), DIMENSION(:,:,:) :: BARRAY - INTEGER(KIND=JPIM) :: LDB + REAL(KIND=JPRM), DIMENSION(:) :: BARRAY + INTEGER(KIND=JPIM) :: LDB(:) INTEGER(KIND=JPIM) :: OFFSETB(:) REAL(KIND=JPRM) :: BETA REAL(KIND=JPRM), DIMENSION(:) :: CARRAY diff --git a/src/trans/gpu/external/setup_trans.F90 b/src/trans/gpu/external/setup_trans.F90 index bf1cec9db..c330eabbe 100755 --- a/src/trans/gpu/external/setup_trans.F90 +++ b/src/trans/gpu/external/setup_trans.F90 @@ -98,7 +98,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& ! R. El Khatib 07-Mar-2016 Better flexibility for Legendre polynomials computation in stretched mode ! ------------------------------------------------------------------ -USE PARKIND1, ONLY: JPIM, JPRB, JPRD +USE PARKIND1, ONLY: JPIM, JPRB, JPRD, JPIB USE PARKIND_ECTRANS, ONLY: JPRBT !ifndef INTERFACE @@ -186,6 +186,8 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& #endif INTEGER :: INUMDEVS, IUNIT, ISTAT, IDEV, MYGPU +REAL(KIND=JPRBT), POINTER :: LOCAL_ARR(:,:) + #include "user_clock.intfb.h" ! ------------------------------------------------------------------ @@ -499,33 +501,49 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& ! Initialize A arrays -ALLOCATE(ZAA(ALIGN(R%NDGNH,8),ALIGN((R%NTMAX+2)/2,8),D%NUMP)) -ALLOCATE(ZAS(ALIGN(R%NDGNH,8),ALIGN((R%NTMAX+3)/2,8),D%NUMP)) +ALLOCATE(ZAA(D%OFFSETS_GEMM_MATRIX(D%NUMP+1))) +ALLOCATE(ZAS(D%OFFSETS_GEMM_MATRIX(D%NUMP+1))) WRITE(NOUT,*)'setup_trans: sizes1 NUMP=',D%NUMP -WRITE(NOUT,*)'ZAS:',size(ZAS) -WRITE(NOUT,*)'ZAA:',size(ZAA) +WRITE(NOUT,'("ZAA: ", I, " B")') SIZE(ZAA,KIND=JPIB)*STORAGE_SIZE(ZAA(1))/8 +WRITE(NOUT,'("ZAS: ", I, " B")') SIZE(ZAS,KIND=JPIB)*STORAGE_SIZE(ZAS(1))/8 + +! Be very careful here not to touch ZAA, ZAS, ZAA0 and ZAS0 on the host! -ZAA(:,:,:) = 0._JPRBT -ZAS(:,:,:) = 0._JPRBT +ZAA(:) = 0._JPRBT +ZAS(:) = 0._JPRBT +KMLOC0 = 0 DO JMLOC=1,D%NUMP KM = D%MYMS(JMLOC) KDGLU = G%NDGLU(KM) ILA = (R%NSMAX-KM+2)/2 ILS = (R%NSMAX-KM+3)/2 - ZAA(1:KDGLU,1:ILA,JMLOC)=S%FA(JMLOC)%RPNMA(1:KDGLU,1:ILA) - ZAS(1:KDGLU,1:ILS,JMLOC)=S%FA(JMLOC)%RPNMS(1:KDGLU,1:ILS) -ENDDO - -! permanent copy of Legendre polynomials into device + IF (KM /= 0) THEN + CALL C_F_POINTER(C_LOC(ZAA(1+D%OFFSETS_GEMM_MATRIX(JMLOC))), LOCAL_ARR, & + & (/D%LEGENDRE_MATRIX_STRIDES(JMLOC),ILA/)) + LOCAL_ARR(1:KDGLU,1:ILA) = S%FA(JMLOC)%RPNMA(1:KDGLU,1:ILA) -#ifdef ACCGPU -!$ACC ENTER DATA COPYIN(ZAA,ZAS) -#endif -#ifdef OMPGPU -#endif + CALL C_F_POINTER(C_LOC(ZAS(1+D%OFFSETS_GEMM_MATRIX(JMLOC))), LOCAL_ARR, & + & (/D%LEGENDRE_MATRIX_STRIDES(JMLOC),ILS/)) + LOCAL_ARR(1:KDGLU,1:ILS) = S%FA(JMLOC)%RPNMS(1:KDGLU,1:ILS) + ELSE + KMLOC0 = JMLOC + ALLOCATE(ZAA0(ALIGN(KDGLU,8),ILA)) + ALLOCATE(ZAS0(ALIGN(KDGLU,8),ILS)) + + ZAA0(:,:) = 0 + ZAS0(:,:) = 0 + ZAA0(1:KDGLU,1:ILA)=S%FA(JMLOC)%RPNMA(1:KDGLU,1:ILA) + ZAS0(1:KDGLU,1:ILS)=S%FA(JMLOC)%RPNMS(1:KDGLU,1:ILS) + WRITE(NOUT,'("ZAA0: ", I, " B")') SIZE(ZAA,KIND=JPIB)*STORAGE_SIZE(ZAA0(1,1))/8 + WRITE(NOUT,'("ZAS0: ", I, " B")') SIZE(ZAA,KIND=JPIB)*STORAGE_SIZE(ZAS0(1,1))/8 + !$ACC ENTER DATA COPYIN(ZAA0,ZAS0) ASYNC(1) + ENDIF +ENDDO +!$ACC ENTER DATA COPYIN(ZAA,ZAS) ASYNC(1) +!$ACC WAIT(1) ALLOCATE(ZEPSNM(D%NUMP,0:R%NTMAX+2)) WRITE(NOUT,*)'ZEPSNM :',SIZE(ZEPSNM) @@ -653,33 +671,9 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& D_NUMP=D%NUMP D_NDGL_FS=D%NDGL_FS -KMLOC0 = -1 DO I=1,SIZE(D%MYMS) D_MYMS(I)=D%MYMS(I) - IF(D_MYMS(I) == 0) KMLOC0 = I end DO - -! arrays for m=0 in ledir_mod: -IF(KMLOC0 >= 0) THEN - ALLOCATE(ZAA0(SIZE(ZAA,1),SIZE(ZAA,2))) - ALLOCATE(ZAS0(SIZE(ZAS,1),SIZE(ZAS,2))) - ZAA0 = ZAA(:,:,KMLOC0) - ZAS0 = ZAS(:,:,KMLOC0) -#ifdef ACCGPU - !$ACC ENTER DATA COPYIN(ZAA0,ZAS0) -#endif -#ifdef OMPGPU - !$OMP TARGET ENTER DATA MAP(TO:ZAA0,ZAS0) -#endif - WRITE(NOUT,*) 'GPU arrays for m=0 successfully allocated' -#ifdef ACCGPU - WRITE(NOUT,*) 'Using OpenACC' -#endif -#ifdef OMPGPU - WRITE(NOUT,*) 'Using OpenMP offloading' -#endif -ENDIF - DO I=1,SIZE(F%RW) F_RW(I)=F%RW(I) END DO diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index e12f89afb..7079d1ac2 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -99,7 +99,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) USE TPM_DIM, ONLY: R_NDGNH,R_NSMAX,R_NTMAX,R_NDGL USE TPM_GEOMETRY, ONLY: G_NDGLU USE TPM_FIELDS_FLAT, ONLY: ZAA,ZAS,ZAA0,ZAS0,KMLOC0 - USE TPM_DISTR, ONLY: D_NUMP, D_MYMS, D_OFFSETS_GEMM1, D_OFFSETS_GEMM2 + USE TPM_DISTR, ONLY: D_NUMP, D_MYMS, D_OFFSETS_GEMM1, D_OFFSETS_GEMM2, D USE HICBLAS_MOD, ONLY: HIP_DGEMM_BATCHED_OVERLOAD, & & HIP_DGEMM_GROUPED_OVERLOAD, HIP_SGEMM_GROUPED_OVERLOAD USE MPL_MODULE, ONLY: MPL_BARRIER,MPL_ALL_MS_COMM @@ -196,7 +196,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) NS(KMLOC) = (R_NSMAX-KM+2)/2 KS(KMLOC) = G_NDGLU(KM) AOFFSETS(KMLOC) = IIN_STRIDES0*D_OFFSETS_GEMM1(KMLOC) - BOFFSETS(KMLOC) = SIZE(ZAA,1)*SIZE(ZAA,2)*(KMLOC-1) + BOFFSETS(KMLOC) = D%OFFSETS_GEMM_MATRIX(KMLOC) COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM2(KMLOC) ENDDO IF(KMLOC0 > 0) THEN @@ -215,7 +215,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) & 2*KF_FS, NS(:), KS(:), & & 1.0_JPRBT, & & ZINPA, IIN_STRIDES0, AOFFSETS, & - & ZAA, SIZE(ZAA,1), BOFFSETS, & + & ZAA, D%LEGENDRE_MATRIX_STRIDES, BOFFSETS, & & 0.0_JPRBT, & & ZOUT, IOUT_STRIDES0, COFFSETS, & & D_NUMP, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR) @@ -311,7 +311,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) NS(KMLOC) = (R_NSMAX-KM+3)/2 KS(KMLOC) = G_NDGLU(KM) AOFFSETS(KMLOC) = IIN_STRIDES0*D_OFFSETS_GEMM1(KMLOC) - BOFFSETS(KMLOC) = SIZE(ZAS,1)*SIZE(ZAS,2)*(KMLOC-1) + BOFFSETS(KMLOC) = D%OFFSETS_GEMM_MATRIX(KMLOC) COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM2(KMLOC) ENDDO IF(KMLOC0 > 0) THEN @@ -330,7 +330,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) & 2*KF_FS, NS(:), KS(:), & & 1.0_JPRBT, & & ZINPS, IIN_STRIDES0, AOFFSETS, & - & ZAS, SIZE(ZAS,1), BOFFSETS, & + & ZAS, D%LEGENDRE_MATRIX_STRIDES, BOFFSETS, & & 0.0_JPRBT, & & ZOUT, IOUT_STRIDES0, COFFSETS, & & D_NUMP, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR) diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index 70a729ac0..4b9a354d2 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -98,7 +98,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) USE TPM_DIM, ONLY: R_NDGNH, R_NSMAX, R_NDGL USE TPM_GEOMETRY, ONLY: G_NDGLU USE TPM_FIELDS_FLAT, ONLY: ZAA, ZAS, ZAA0, ZAS0, KMLOC0 - USE TPM_DISTR, ONLY: D_NUMP, D_MYMS, MYPROC, D_OFFSETS_GEMM1, D_OFFSETS_GEMM2 + USE TPM_DISTR, ONLY: D_NUMP, D_MYMS, MYPROC, D_OFFSETS_GEMM1, D_OFFSETS_GEMM2, D USE HICBLAS_MOD, ONLY: HIP_DGEMM_BATCHED_OVERLOAD, & & HIP_DGEMM_GROUPED_OVERLOAD, HIP_SGEMM_GROUPED_OVERLOAD USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_INT @@ -252,7 +252,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) KS(KMLOC) = (R_NSMAX-KM+2)/2 NS(KMLOC) = G_NDGLU(KM) AOFFSETS(KMLOC) = IIN_STRIDES0*D_OFFSETS_GEMM2(KMLOC) - BOFFSETS(KMLOC) = SIZE(ZAA,1)*SIZE(ZAA,2)*(KMLOC-1) + BOFFSETS(KMLOC) = D%OFFSETS_GEMM_MATRIX(KMLOC) COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM1(KMLOC) ENDDO IF(KMLOC0 > 0) THEN @@ -271,7 +271,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) & 2*KF_LEG, NS(:), KS(:), & & 1.0_JPRBT, & & ZINP, IIN_STRIDES0, AOFFSETS, & - & ZAA, SIZE(ZAA,1), BOFFSETS, & + & ZAA, D%LEGENDRE_MATRIX_STRIDES, BOFFSETS, & & 0.0_JPRBT, & & ZOUTA, IOUT_STRIDES0, COFFSETS, & & D_NUMP, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR) @@ -386,7 +386,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) KS(KMLOC) = (R_NSMAX-KM+3)/2 NS(KMLOC) = G_NDGLU(KM) AOFFSETS(KMLOC) = IIN_STRIDES0*D_OFFSETS_GEMM2(KMLOC) - BOFFSETS(KMLOC) = SIZE(ZAS,1)*SIZE(ZAS,2)*(KMLOC-1) + BOFFSETS(KMLOC) = D%OFFSETS_GEMM_MATRIX(KMLOC) COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM1(KMLOC) ENDDO IF(KMLOC0 > 0) THEN @@ -405,7 +405,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) & 2*KF_LEG, NS(:), KS(:), & & 1.0_JPRBT, & & ZINP, IIN_STRIDES0, AOFFSETS, & - & ZAS, SIZE(ZAS,1), BOFFSETS, & + & ZAS, D%LEGENDRE_MATRIX_STRIDES, BOFFSETS, & & 0.0_JPRBT, & & ZOUTS, IOUT_STRIDES0, COFFSETS, & & D_NUMP, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR) diff --git a/src/trans/gpu/internal/sump_trans_mod.F90 b/src/trans/gpu/internal/sump_trans_mod.F90 index a0f2260b4..571140a0c 100755 --- a/src/trans/gpu/internal/sump_trans_mod.F90 +++ b/src/trans/gpu/internal/sump_trans_mod.F90 @@ -40,7 +40,7 @@ SUBROUTINE SUMP_TRANS INTEGER(KIND=JPIM) :: JM INTEGER(KIND=JPIM) :: JGL,IGL,IPLAT,ISENDSET,IRECVSET,JML,IPOS,IM -INTEGER(KIND=JPIM) :: IGPTOT,IMEDIAP,IRESTM,JA,JB,IOFF,OFFSET1,OFFSET2,KMLOC,KM +INTEGER(KIND=JPIM) :: IGPTOT,IMEDIAP,IRESTM,JA,JB,IOFF,OFFSET1,OFFSET2,OFFSET3,KMLOC,KM INTEGER(KIND=JPIM),ALLOCATABLE :: IGPTOTL(:,:) REAL(KIND=JPRD),ALLOCATABLE :: ZDUM(:) @@ -271,23 +271,33 @@ SUBROUTINE SUMP_TRANS ALLOCATE(D%OFFSETS_GEMM1(D%NUMP+1)) ALLOCATE(D%OFFSETS_GEMM2(D%NUMP+1)) +ALLOCATE(D%OFFSETS_GEMM_MATRIX(D%NUMP+1)) +ALLOCATE(D%LEGENDRE_MATRIX_STRIDES(D%NUMP)) OFFSET1 = 0 OFFSET2 = 0 +OFFSET3 = 0 DO KMLOC=1,D%NUMP KM = D%MYMS(KMLOC) D%OFFSETS_GEMM1(KMLOC) = OFFSET1 D%OFFSETS_GEMM2(KMLOC) = OFFSET2 + D%OFFSETS_GEMM_MATRIX(KMLOC) = OFFSET3 !KM=0 is transformed in double precision, no need to store here IF (KM /= 0) THEN OFFSET1 = OFFSET1 + ALIGN(G%NDGLU(KM),8) ! N_OFFSET takes the max of the two GEMMs OFFSET2 = OFFSET2 + ALIGN((R%NSMAX-KM+3)/2,8) + + D%LEGENDRE_MATRIX_STRIDES(KMLOC) = ALIGN(G%NDGLU(KM),8) + ! Note that both sizes have to be aligned because we make the GEMMs + ! multiples of 8 + OFFSET3 = OFFSET3 + ALIGN((R%NSMAX-KM+3)/2,8) * D%LEGENDRE_MATRIX_STRIDES(KMLOC) ENDIF ENDDO D%OFFSETS_GEMM1(D%NUMP+1) = OFFSET1 D%OFFSETS_GEMM2(D%NUMP+1) = OFFSET2 +D%OFFSETS_GEMM_MATRIX(D%NUMP+1) = OFFSET3 ! ------------------------------------------------------------------ 9 FORMAT(1X,'ARRAY ',A10,' ALLOCATED ',8I8) diff --git a/src/trans/gpu/internal/tpm_fields_flat.F90 b/src/trans/gpu/internal/tpm_fields_flat.F90 index 780d0629c..95e69a058 100755 --- a/src/trans/gpu/internal/tpm_fields_flat.F90 +++ b/src/trans/gpu/internal/tpm_fields_flat.F90 @@ -24,8 +24,8 @@ MODULE TPM_FIELDS_FLAT ! scratch arrays for ltinv and ltdir and associated dimension variables -REAL(KIND=JPRBT),ALLOCATABLE :: ZAA(:,:,:) !! JPRL for 1/2 -REAL(KIND=JPRBT),ALLOCATABLE :: ZAS(:,:,:) !! JPRL for 1/2 +REAL(KIND=JPRBT),ALLOCATABLE :: ZAA(:) !! JPRL for 1/2 +REAL(KIND=JPRBT),ALLOCATABLE :: ZAS(:) !! JPRL for 1/2 ! for m=0 in ledir_mod: REAL(KIND=JPRD),ALLOCATABLE :: ZAA0(:,:)