From d0d2349bd47dc7015e75428690b020fc8ce5b837 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Mon, 16 Dec 2024 00:10:58 -0800 Subject: [PATCH] compact legendre polynomials --- src/trans/common/internal/tpm_distr.F90 | 3 +- src/trans/gpu/algor/hicblas_cutlass.cuda.h | 16 +++--- src/trans/gpu/algor/hicblas_gemm.hip.cpp | 54 ++++++++++---------- src/trans/gpu/algor/hicblas_mod.F90 | 14 +++--- src/trans/gpu/external/setup_trans.F90 | 57 ++++++++++++---------- src/trans/gpu/internal/ledir_mod.F90 | 8 +-- src/trans/gpu/internal/leinv_mod.F90 | 8 +-- src/trans/gpu/internal/sump_trans_mod.F90 | 12 ++++- src/trans/gpu/internal/tpm_fields_gpu.F90 | 4 +- 9 files changed, 96 insertions(+), 80 deletions(-) diff --git a/src/trans/common/internal/tpm_distr.F90 b/src/trans/common/internal/tpm_distr.F90 index 6ae338445..eddb16843 100755 --- a/src/trans/common/internal/tpm_distr.F90 +++ b/src/trans/common/internal/tpm_distr.F90 @@ -171,7 +171,8 @@ MODULE TPM_DISTR REAL(KIND=JPRD) ,ALLOCATABLE :: RWEIGHT(:) ! Weight per grid-point (if weighted distribution) INTEGER(KIND=JPIM) ,ALLOCATABLE :: NPROCA_GP(:) ! Number of grid-points per a-set -INTEGER(KIND=JPIB), ALLOCATABLE :: OFFSETS_GEMM1(:), OFFSETS_GEMM2(:) +INTEGER(KIND=JPIB), ALLOCATABLE :: OFFSETS_GEMM1(:), OFFSETS_GEMM2(:), OFFSETS_GEMM_MATRIX(:) +INTEGER(KIND=JPIM), ALLOCATABLE :: LEGENDRE_MATRIX_STRIDES(:) END TYPE DISTR_TYPE diff --git a/src/trans/gpu/algor/hicblas_cutlass.cuda.h b/src/trans/gpu/algor/hicblas_cutlass.cuda.h index 9a42bb2f6..8b0eacf88 100644 --- a/src/trans/gpu/algor/hicblas_cutlass.cuda.h +++ b/src/trans/gpu/algor/hicblas_cutlass.cuda.h @@ -154,11 +154,11 @@ class cutlass_sgemm_grouped { } // namespace detail template -void cutlass_sgemm_wrapper_grouped_op(int resol_id, int blas_id, int m, int *n, int *k, +void cutlass_sgemm_wrapper_grouped_op(int resol_id, int blas_id, int m, const int *n, const int *k, float alpha, const float *A, int lda, - int64_t *offsetsA, const float *B, int ldb, - int64_t *offsetsB, float beta, float *C, - int ldc, int64_t *offsetsC, int batchCount, + const int64_t *offsetsA, const float *B, const int *ldb, + const int64_t *offsetsB, float beta, float *C, + int ldc, const int64_t *offsetsC, int batchCount, cudaStream_t stream, void *growing_allocator) { using namespace detail; @@ -182,10 +182,10 @@ void cutlass_sgemm_wrapper_grouped_op(int resol_id, int blas_id, int m, int *n, } void cutlass_sgemm_wrapper_grouped(int resol_id, int blas_id, char transa, char transb, - int m, int *n, int *k, float alpha, - const float *A, int lda, int64_t *offsetsA, - const float *B, int ldb, int64_t *offsetsB, float beta, - float *C, int ldc, int64_t *offsetsC, + int m, const int *n, const int *k, float alpha, + const float *A, int lda, const int64_t *offsetsA, + const float *B, const int *ldb, const int64_t *offsetsB, float beta, + float *C, int ldc, const int64_t *offsetsC, int batchCount, cudaStream_t stream, void *growing_allocator) { diff --git a/src/trans/gpu/algor/hicblas_gemm.hip.cpp b/src/trans/gpu/algor/hicblas_gemm.hip.cpp index 252dc7dd4..5346667ef 100644 --- a/src/trans/gpu/algor/hicblas_gemm.hip.cpp +++ b/src/trans/gpu/algor/hicblas_gemm.hip.cpp @@ -89,10 +89,10 @@ template void erase_from_caches(int resol_id) { // this version is using graphs and caches the graphs template -void run_group_graph(Gemm &&gemm, int resol_id, int m, int *n, int *k, - Real alpha, const Real *A, int lda, int64_t *offsetsA, - const Real *B, int ldb, int64_t *offsetsB, Real beta, - Real *C, int ldc, int64_t *offsetsC, int batchCount, +void run_group_graph(Gemm &&gemm, int resol_id, int m, const int *n, const int *k, + Real alpha, const Real *A, int lda, const int64_t *offsetsA, + const Real *B, const int *ldb, const int64_t *offsetsB, Real beta, + Real *C, int ldc, const int64_t *offsetsC, int batchCount, hipStream_t stream, int blas_id, void *growing_allocator) { growing_allocator_register_free_c(growing_allocator, free_gemm_graph_cache); @@ -138,7 +138,7 @@ void run_group_graph(Gemm &&gemm, int resol_id, int m, int *n, int *k, HIC_CHECK(hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal)); gemm(stream, m, n[i], k[i], alpha, A + offsetsA[i], lda, B + offsetsB[i], - ldb, beta, C + offsetsC[i], ldc); + ldb[i], beta, C + offsetsC[i], ldc); hipGraph_t my_graph; HIC_CHECK(hipStreamEndCapture(stream, &my_graph)); hipGraphNode_t my_node; @@ -163,16 +163,16 @@ void run_group_graph(Gemm &&gemm, int resol_id, int m, int *n, int *k, // stupid simple gemm calls template -void run_group(Gemm &&gemm, int resol_id, int m, int *n, int *k, Real alpha, - const Real *A, int lda, int64_t *offsetsA, const Real *B, - int ldb, int64_t *offsetsB, Real beta, Real *C, int ldc, - int64_t *offsetsC, int batchCount, hipStream_t stream, +void run_group(Gemm &&gemm, int resol_id, int m, const int *n, const int *k, Real alpha, + const Real *A, int lda, const int64_t *offsetsA, const Real *B, + const int *ldb, const int64_t *offsetsB, Real beta, Real *C, int ldc, + const int64_t *offsetsC, int batchCount, hipStream_t stream, int = -1) { for (int i = 0; i < batchCount; ++i) { if (m == 0 || n[i] == 0 || k[i] == 0) continue; gemm(stream, m, n[i], k[i], alpha, A + offsetsA[i], lda, B + offsetsB[i], - ldb, beta, C + offsetsC[i], ldc); + ldb[i], beta, C + offsetsC[i], ldc); } } @@ -216,11 +216,11 @@ template struct hipblas_gemm_grouped { #ifndef USE_CUTLASS void hipblas_sgemm_wrapper_grouped(int resol_id, int blas_id, char transa, - char transb, int m, int *n, int *k, + char transb, int m, const int *n, const int *k, float alpha, const float *A, int lda, - int64_t *offsetsA, const float *B, int ldb, - int64_t *offsetsB, float beta, float *C, - int ldc, int64_t *offsetsC, int batchCount, + const int64_t *offsetsA, const float *B, const int *ldb, + const int64_t *offsetsB, float beta, float *C, + int ldc, const int64_t *offsetsC, int batchCount, hipStream_t stream, void *growing_allocator) { @@ -244,11 +244,11 @@ void hipblas_sgemm_wrapper_grouped(int resol_id, int blas_id, char transa, #endif void hipblas_dgemm_wrapper_grouped(int resol_id, int blas_id, char transa, - char transb, int m, int *n, int *k, + char transb, int m, const int *n, const int *k, double alpha, const double *A, int lda, - int64_t *offsetsA, const double *B, int ldb, - int64_t *offsetsB, double beta, double *C, - int ldc, int64_t *offsetsC, int batchCount, + const int64_t *offsetsA, double const *B, const int *ldb, + const int64_t *offsetsB, double beta, double *C, + int ldc, const int64_t *offsetsC, int batchCount, hipStream_t stream, void *) { hipblasOperation_t op_t1 = HIPBLAS_OP_N, op_t2 = HIPBLAS_OP_N; @@ -314,11 +314,11 @@ void hipblas_sgemm_wrapper(char transa, char transb, int m, int n, int k, } void hipblas_sgemm_wrapper_grouped(int resol_id, int blas_id, char transa, - char transb, int m, int *n, int *k, + char transb, int m, const int *n, const int *k, float alpha, const float *A, int lda, - int64_t *offsetsA, const float *B, int ldb, - int64_t *offsetsB, float beta, float *C, - int ldc, int64_t *offsetsC, int batchCount, + const int64_t *offsetsA, const float *B, const int *ldb, + const int64_t *offsetsB, float beta, float *C, + int ldc, const int64_t *offsetsC, int batchCount, size_t stream, void *growing_allocator) { #ifdef USE_CUTLASS cutlass_sgemm_wrapper_grouped(resol_id, blas_id, transa, transb, m, n, k, @@ -334,11 +334,11 @@ void hipblas_sgemm_wrapper_grouped(int resol_id, int blas_id, char transa, } void hipblas_dgemm_wrapper_grouped(int resol_id, int blas_id, char transa, - char transb, int m, int *n, int *k, - double alpha, const double *A, int lda, - int64_t *offsetsA, const double *B, int ldb, - int64_t *offsetsB, double beta, double *C, - int ldc, int64_t *offsetsC, int batchCount, + char transb, int m, const int *n, const int *k, + double alpha, double const *A, int lda, + const int64_t *offsetsA, double const *B, const int *ldb, + const int64_t *offsetsB, double beta, double *C, + int ldc, const int64_t *offsetsC, int batchCount, size_t stream, void *growing_allocator) { hipblas_dgemm_wrapper_grouped(resol_id, blas_id, transa, transb, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, diff --git a/src/trans/gpu/algor/hicblas_mod.F90 b/src/trans/gpu/algor/hicblas_mod.F90 index f7528a11f..5949db685 100644 --- a/src/trans/gpu/algor/hicblas_mod.F90 +++ b/src/trans/gpu/algor/hicblas_mod.F90 @@ -83,8 +83,8 @@ SUBROUTINE HIP_DGEMM_GROUPED( & &) BIND(C, NAME='hipblas_dgemm_wrapper_grouped') USE ISO_C_BINDING, ONLY: C_CHAR, C_INT, C_DOUBLE, C_SIZE_T, C_PTR, C_INT64_T CHARACTER(1,C_CHAR), VALUE :: CTA, CTB - INTEGER(C_INT), VALUE :: RESOL_ID, BLAS_ID, M, LDA, LDB, LDC, BATCHCOUNT - INTEGER(C_INT) :: N(*), K(*) + INTEGER(C_INT), VALUE :: RESOL_ID, BLAS_ID, M, LDA, LDC, BATCHCOUNT + INTEGER(C_INT) :: N(*), K(*), LDB(*) INTEGER(C_INT64_T) :: OFFSETA(*), OFFSETB(*), OFFSETC(*) REAL(C_DOUBLE), VALUE :: ALPHA,BETA REAL(C_DOUBLE) :: A(*), B(*), C(*) @@ -104,8 +104,8 @@ SUBROUTINE HIP_SGEMM_GROUPED( & &) BIND(C, NAME='hipblas_sgemm_wrapper_grouped') USE ISO_C_BINDING, ONLY: C_CHAR, C_INT, C_FLOAT, C_SIZE_T, C_PTR, C_INT64_T CHARACTER(1,C_CHAR), VALUE :: CTA, CTB - INTEGER(C_INT), VALUE :: RESOL_ID, BLAS_ID, M, LDA, LDB, LDC, BATCHCOUNT - INTEGER(C_INT) :: N(*), K(*) + INTEGER(C_INT), VALUE :: RESOL_ID, BLAS_ID, M, LDA, LDC, BATCHCOUNT + INTEGER(C_INT) :: N(*), K(*), LDB(*) INTEGER(C_INT64_T) :: OFFSETA(*), OFFSETB(*), OFFSETC(*) REAL(C_FLOAT), VALUE :: ALPHA,BETA REAL(C_FLOAT) :: A(*), B(*), C(*) @@ -231,7 +231,7 @@ SUBROUTINE HIP_DGEMM_GROUPED_OVERLOAD( & INTEGER(KIND=JPIM) :: LDA INTEGER(KIND=JPIB) :: OFFSETA(:) REAL(KIND=JPRD), DIMENSION(*) :: BARRAY - INTEGER(KIND=JPIM) :: LDB + INTEGER(KIND=JPIM) :: LDB(:) INTEGER(KIND=JPIB) :: OFFSETB(:) REAL(KIND=JPRD) :: BETA REAL(KIND=JPRD), DIMENSION(:) :: CARRAY @@ -277,8 +277,8 @@ SUBROUTINE HIP_SGEMM_GROUPED_OVERLOAD(& REAL(KIND=JPRM), DIMENSION(:) :: AARRAY INTEGER(KIND=JPIM) :: LDA INTEGER(KIND=JPIB) :: OFFSETA(:) - REAL(KIND=JPRM), DIMENSION(:,:,:) :: BARRAY - INTEGER(KIND=JPIM) :: LDB + REAL(KIND=JPRM), DIMENSION(*) :: BARRAY + INTEGER(KIND=JPIM) :: LDB(:) INTEGER(KIND=JPIB) :: OFFSETB(:) REAL(KIND=JPRM) :: BETA REAL(KIND=JPRM), DIMENSION(:) :: CARRAY diff --git a/src/trans/gpu/external/setup_trans.F90 b/src/trans/gpu/external/setup_trans.F90 index b1fdfc880..46087f28a 100755 --- a/src/trans/gpu/external/setup_trans.F90 +++ b/src/trans/gpu/external/setup_trans.F90 @@ -98,7 +98,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& ! R. El Khatib 07-Mar-2016 Better flexibility for Legendre polynomials computation in stretched mode ! ------------------------------------------------------------------ -USE PARKIND1, ONLY: JPIM, JPRB, JPRD +USE PARKIND1, ONLY: JPIM, JPRB, JPRD, JPIB USE PARKIND_ECTRANS, ONLY: JPRBT !ifndef INTERFACE @@ -166,12 +166,10 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& ! Local variables INTEGER(KIND=JPIM) :: JGL,JRES,IDEF_RESOL -INTEGER(KIND=JPIM) :: JMLOC, KM, ILA, ILS, KMLOC, KDGLU, JK, I, J - -INTEGER(KIND=JPIM) :: IPROC, IPROCS, ISTAN, ISTAS, ISL, IGLS, JFLD, IMLOC0(1) +INTEGER(KIND=JPIM) :: JMLOC, KM, ILA, ILS, KDGLU +INTEGER(KIND=JPIM) :: IMLOC0 LOGICAL :: LLP1,LLP2, LLSPSETUPONLY -REAL(KIND=JPRD) :: ZTIME0,ZTIME1,ZTIME2 REAL(KIND=JPHOOK) :: ZHOOK_HANDLE CHARACTER(LEN=8) :: CENV @@ -181,7 +179,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& #endif INTEGER :: INUMDEVS, IUNIT, ISTAT, IDEV, MYGPU -#include "user_clock.intfb.h" +REAL(KIND=JPRBT), POINTER :: LOCAL_ARR(:,:) ! ------------------------------------------------------------------ IF (LHOOK) CALL DR_HOOK('SETUP_TRANS',0,ZHOOK_HANDLE) @@ -470,35 +468,42 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& ! Initialize A arrays -ALLOCATE(FG%ZAA(ALIGN(R%NDGNH,8),ALIGN((R%NTMAX+2)/2,8),D%NUMP)) -ALLOCATE(FG%ZAS(ALIGN(R%NDGNH,8),ALIGN((R%NTMAX+3)/2,8),D%NUMP)) +ALLOCATE(FG%ZAA(D%OFFSETS_GEMM_MATRIX(D%NUMP+1))) +ALLOCATE(FG%ZAS(D%OFFSETS_GEMM_MATRIX(D%NUMP+1))) -FG%ZAA(:,:,:) = 0._JPRBT -FG%ZAS(:,:,:) = 0._JPRBT +FG%ZAA(:) = 0._JPRBT +FG%ZAS(:) = 0._JPRBT +IMLOC0 = 0 DO JMLOC=1,D%NUMP KM = D%MYMS(JMLOC) KDGLU = G%NDGLU(KM) ILA = (R%NSMAX-KM+2)/2 ILS = (R%NSMAX-KM+3)/2 - FG%ZAA(1:KDGLU,1:ILA,JMLOC)=S%FA(JMLOC)%RPNMA(1:KDGLU,1:ILA) - FG%ZAS(1:KDGLU,1:ILS,JMLOC)=S%FA(JMLOC)%RPNMS(1:KDGLU,1:ILS) -ENDDO + IF (KM /= 0) THEN + CALL C_F_POINTER(C_LOC(FG%ZAA(1+D%OFFSETS_GEMM_MATRIX(JMLOC))), LOCAL_ARR, & + & (/D%LEGENDRE_MATRIX_STRIDES(JMLOC),ILA/)) + LOCAL_ARR(1:KDGLU,1:ILA) = S%FA(JMLOC)%RPNMA(1:KDGLU,1:ILA) -! arrays for m=0 in ledir_mod: -IMLOC0 = FINDLOC(D%MYMS,0) -IF(IMLOC0(1) > 0) THEN - ALLOCATE(FG%ZAA0(SIZE(FG%ZAA,1),SIZE(FG%ZAA,2))) - ALLOCATE(FG%ZAS0(SIZE(FG%ZAS,1),SIZE(FG%ZAS,2))) - FG%ZAA0 = FG%ZAA(:,:,IMLOC0(1)) - FG%ZAS0 = FG%ZAS(:,:,IMLOC0(1)) -ENDIF + CALL C_F_POINTER(C_LOC(FG%ZAS(1+D%OFFSETS_GEMM_MATRIX(JMLOC))), LOCAL_ARR, & + & (/D%LEGENDRE_MATRIX_STRIDES(JMLOC),ILS/)) + LOCAL_ARR(1:KDGLU,1:ILS) = S%FA(JMLOC)%RPNMS(1:KDGLU,1:ILS) + ELSE + IMLOC0 = JMLOC + ALLOCATE(FG%ZAA0(ALIGN(KDGLU,8),ILA)) + ALLOCATE(FG%ZAS0(ALIGN(KDGLU,8),ILS)) + + FG%ZAA0(:,:) = 0 + FG%ZAS0(:,:) = 0 + FG%ZAA0(1:KDGLU,1:ILA)=S%FA(JMLOC)%RPNMA(1:KDGLU,1:ILA) + FG%ZAS0(1:KDGLU,1:ILS)=S%FA(JMLOC)%RPNMS(1:KDGLU,1:ILS) + ENDIF +ENDDO ALLOCATE(FG%ZEPSNM(D%NUMP,0:R%NTMAX+2)) FG%ZEPSNM = 0._JPRBT -CALL PREPSNM !Initialize on the host - +CALL PREPSNM WRITE(NOUT,*)'setup_trans: sizes1 NUMP=',D%NUMP #ifdef ACCGPU WRITE(NOUT,*) 'Using OpenACC' @@ -506,13 +511,13 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& #ifdef OMPGPU WRITE(NOUT,*) 'Using OpenMP offloading' #endif -WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAS', C_SIZEOF(FG%ZAS(1,1,1))*SIZE(FG%ZAS) -WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAA', C_SIZEOF(FG%ZAA(1,1,1))*SIZE(FG%ZAA) +WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAS', C_SIZEOF(FG%ZAS(1))*SIZE(FG%ZAS) +WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAA', C_SIZEOF(FG%ZAA(1))*SIZE(FG%ZAA) WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAS0', C_SIZEOF(FG%ZAS0(1,1))*SIZE(FG%ZAS0) WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAA0', C_SIZEOF(FG%ZAA0(1,1))*SIZE(FG%ZAA0) WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZEPSNM', C_SIZEOF(FG%ZEPSNM(1,1))*SIZE(FG%ZEPSNM) -IF (IMLOC0(1) > 0) THEN +IF (IMLOC0 > 0) THEN #ifdef ACCGPU !$ACC ENTER DATA COPYIN(FG%ZAA0,FG%ZAS0) ASYNC(1) #endif diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index c834cb993..e7a1fb18a 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -211,7 +211,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) NS(KMLOC) = (R_NSMAX-KM+2)/2 KS(KMLOC) = G_NDGLU(KM) AOFFSETS(KMLOC) = IIN_STRIDES0*D_OFFSETS_GEMM1(KMLOC) - BOFFSETS(KMLOC) = SIZE(ZAA,1)*SIZE(ZAA,2)*(KMLOC-1) + BOFFSETS(KMLOC) = D%OFFSETS_GEMM_MATRIX(KMLOC) COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM2(KMLOC) ENDDO IF(IMLOC0(1) > 0) THEN @@ -230,7 +230,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) & 2*KF_FS, NS(:), KS(:), & & 1.0_JPRBT, & & ZINPA, IIN_STRIDES0, AOFFSETS, & - & ZAA, SIZE(ZAA,1), BOFFSETS, & + & ZAA, D%LEGENDRE_MATRIX_STRIDES, BOFFSETS, & & 0.0_JPRBT, & & ZOUT, IOUT_STRIDES0, COFFSETS, & & D_NUMP, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR) @@ -331,7 +331,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) NS(KMLOC) = (R_NSMAX-KM+3)/2 KS(KMLOC) = G_NDGLU(KM) AOFFSETS(KMLOC) = IIN_STRIDES0*D_OFFSETS_GEMM1(KMLOC) - BOFFSETS(KMLOC) = SIZE(ZAS,1)*SIZE(ZAS,2)*(KMLOC-1) + BOFFSETS(KMLOC) = D%OFFSETS_GEMM_MATRIX(KMLOC) COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM2(KMLOC) ENDDO IF(IMLOC0(1) > 0) THEN @@ -350,7 +350,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) & 2*KF_FS, NS(:), KS(:), & & 1.0_JPRBT, & & ZINPS, IIN_STRIDES0, AOFFSETS, & - & ZAS, SIZE(ZAS,1), BOFFSETS, & + & ZAS, D%LEGENDRE_MATRIX_STRIDES, BOFFSETS, & & 0.0_JPRBT, & & ZOUT, IOUT_STRIDES0, COFFSETS, & & D_NUMP, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR) diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index f2738f44b..8bfc2ac0e 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -271,7 +271,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) KS(KMLOC) = (R_NSMAX-KM+2)/2 NS(KMLOC) = G_NDGLU(KM) AOFFSETS(KMLOC) = IIN_STRIDES0*D_OFFSETS_GEMM2(KMLOC) - BOFFSETS(KMLOC) = SIZE(ZAA,1)*SIZE(ZAA,2)*(KMLOC-1) + BOFFSETS(KMLOC) = D%OFFSETS_GEMM_MATRIX(KMLOC) COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM1(KMLOC) ENDDO IF(IMLOC0(1) > 0) THEN @@ -290,7 +290,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) & 2*KF_LEG, NS(:), KS(:), & & 1.0_JPRBT, & & ZINP, IIN_STRIDES0, AOFFSETS, & - & ZAA, SIZE(ZAA,1), BOFFSETS, & + & ZAA, D%LEGENDRE_MATRIX_STRIDES, BOFFSETS, & & 0.0_JPRBT, & & ZOUTA, IOUT_STRIDES0, COFFSETS, & & D_NUMP, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR) @@ -411,7 +411,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) KS(KMLOC) = (R_NSMAX-KM+3)/2 NS(KMLOC) = G_NDGLU(KM) AOFFSETS(KMLOC) = IIN_STRIDES0*D_OFFSETS_GEMM2(KMLOC) - BOFFSETS(KMLOC) = SIZE(ZAS,1)*SIZE(ZAS,2)*(KMLOC-1) + BOFFSETS(KMLOC) = D%OFFSETS_GEMM_MATRIX(KMLOC) COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM1(KMLOC) ENDDO IF(IMLOC0(1) > 0) THEN @@ -430,7 +430,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) & 2*KF_LEG, NS(:), KS(:), & & 1.0_JPRBT, & & ZINP, IIN_STRIDES0, AOFFSETS, & - & ZAS, SIZE(ZAS,1), BOFFSETS, & + & ZAS, D%LEGENDRE_MATRIX_STRIDES, BOFFSETS, & & 0.0_JPRBT, & & ZOUTS, IOUT_STRIDES0, COFFSETS, & & D_NUMP, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR) diff --git a/src/trans/gpu/internal/sump_trans_mod.F90 b/src/trans/gpu/internal/sump_trans_mod.F90 index 5a1de3028..3c3b94d69 100755 --- a/src/trans/gpu/internal/sump_trans_mod.F90 +++ b/src/trans/gpu/internal/sump_trans_mod.F90 @@ -41,7 +41,7 @@ SUBROUTINE SUMP_TRANS INTEGER(KIND=JPIM) :: JM INTEGER(KIND=JPIM) :: JGL,IGL,IPLAT,ISENDSET,IRECVSET,JML,IPOS,IM INTEGER(KIND=JPIM) :: IGPTOT,IMEDIAP,IRESTM,JA,JB,IOFF,KMLOC,KM -INTEGER(KIND=JPIB) :: OFFSET1,OFFSET2 +INTEGER(KIND=JPIB) :: OFFSET1,OFFSET2,OFFSET3 INTEGER(KIND=JPIM),ALLOCATABLE :: IGPTOTL(:,:) REAL(KIND=JPRD),ALLOCATABLE :: ZDUM(:) @@ -272,23 +272,33 @@ SUBROUTINE SUMP_TRANS ALLOCATE(D%OFFSETS_GEMM1(D%NUMP+1)) ALLOCATE(D%OFFSETS_GEMM2(D%NUMP+1)) +ALLOCATE(D%OFFSETS_GEMM_MATRIX(D%NUMP+1)) +ALLOCATE(D%LEGENDRE_MATRIX_STRIDES(D%NUMP)) OFFSET1 = 0 OFFSET2 = 0 +OFFSET3 = 0 DO KMLOC=1,D%NUMP KM = D%MYMS(KMLOC) D%OFFSETS_GEMM1(KMLOC) = OFFSET1 D%OFFSETS_GEMM2(KMLOC) = OFFSET2 + D%OFFSETS_GEMM_MATRIX(KMLOC) = OFFSET3 !KM=0 is transformed in double precision, no need to store here IF (KM /= 0) THEN OFFSET1 = OFFSET1 + ALIGN(G%NDGLU(KM),8) ! N_OFFSET takes the max of the two GEMMs OFFSET2 = OFFSET2 + ALIGN((R%NSMAX-KM+3)/2,8) + + D%LEGENDRE_MATRIX_STRIDES(KMLOC) = ALIGN(G%NDGLU(KM),8) + ! Note that both sizes have to be aligned because we make the GEMMs + ! multiples of 8 + OFFSET3 = OFFSET3 + ALIGN((R%NSMAX-KM+3)/2,8) * D%LEGENDRE_MATRIX_STRIDES(KMLOC) ENDIF ENDDO D%OFFSETS_GEMM1(D%NUMP+1) = OFFSET1 D%OFFSETS_GEMM2(D%NUMP+1) = OFFSET2 +D%OFFSETS_GEMM_MATRIX(D%NUMP+1) = OFFSET3 ! ------------------------------------------------------------------ 9 FORMAT(1X,'ARRAY ',A10,' ALLOCATED ',8I8) diff --git a/src/trans/gpu/internal/tpm_fields_gpu.F90 b/src/trans/gpu/internal/tpm_fields_gpu.F90 index 09baef270..7f0694b57 100644 --- a/src/trans/gpu/internal/tpm_fields_gpu.F90 +++ b/src/trans/gpu/internal/tpm_fields_gpu.F90 @@ -19,8 +19,8 @@ MODULE TPM_FIELDS_GPU TYPE FIELDS_GPU_TYPE ! scratch arrays for ltinv and ltdir and associated dimension variables -REAL(KIND=JPRBT),ALLOCATABLE :: ZAA(:,:,:) !! JPRL for 1/2 -REAL(KIND=JPRBT),ALLOCATABLE :: ZAS(:,:,:) !! JPRL for 1/2 +REAL(KIND=JPRBT),ALLOCATABLE :: ZAA(:) !! JPRL for 1/2 +REAL(KIND=JPRBT),ALLOCATABLE :: ZAS(:) !! JPRL for 1/2 ! for m=0 in ledir_mod: REAL(KIND=JPRD),ALLOCATABLE :: ZAA0(:,:)