ecmwf-ifs · samhatfield · Jan 23, 2025 · Dec 16, 2024 · Jan 10, 2025 · Jan 20, 2025
diff --git a/src/trans/common/internal/tpm_distr.F90 b/src/trans/common/internal/tpm_distr.F90
@@ -171,7 +171,8 @@ MODULE TPM_DISTR
 REAL(KIND=JPRD) ,ALLOCATABLE :: RWEIGHT(:) ! Weight per grid-point (if weighted distribution)
 INTEGER(KIND=JPIM) ,ALLOCATABLE :: NPROCA_GP(:) ! Number of grid-points per a-set
 
-INTEGER(KIND=JPIB), ALLOCATABLE :: OFFSETS_GEMM1(:), OFFSETS_GEMM2(:)
+INTEGER(KIND=JPIB), ALLOCATABLE :: OFFSETS_GEMM1(:), OFFSETS_GEMM2(:), OFFSETS_GEMM_MATRIX(:)
+INTEGER(KIND=JPIM), ALLOCATABLE :: LEGENDRE_MATRIX_STRIDES(:)
 
 END TYPE DISTR_TYPE
 

diff --git a/src/trans/gpu/algor/hicblas_cutlass.cuda.h b/src/trans/gpu/algor/hicblas_cutlass.cuda.h
@@ -154,11 +154,11 @@ class cutlass_sgemm_grouped<CutlassType::cutlass_fp32, TransA, TransB> {
 
 } // namespace detail
 template <cublasOperation_t TransA, cublasOperation_t TransB>
-void cutlass_sgemm_wrapper_grouped_op(int resol_id, int blas_id, int m, int *n, int *k,
+void cutlass_sgemm_wrapper_grouped_op(int resol_id, int blas_id, int m, const int *n, const int *k,
                                       float alpha, const float *A, int lda,
-                                      int64_t *offsetsA, const float *B, int ldb,
-                                      int64_t *offsetsB, float beta, float *C,
-                                      int ldc, int64_t *offsetsC, int batchCount,
+                                      const int64_t *offsetsA, const float *B, const int *ldb,
+                                      const int64_t *offsetsB, float beta, float *C,
+                                      int ldc, const int64_t *offsetsC, int batchCount,
                                       cudaStream_t stream,
                                       void *growing_allocator) {
   using namespace detail;
@@ -182,10 +182,10 @@ void cutlass_sgemm_wrapper_grouped_op(int resol_id, int blas_id, int m, int *n,
 }
 
 void cutlass_sgemm_wrapper_grouped(int resol_id, int blas_id, char transa, char transb,
-                                   int m, int *n, int *k, float alpha,
-                                   const float *A, int lda, int64_t *offsetsA,
-                                   const float *B, int ldb, int64_t *offsetsB, float beta,
-                                   float *C, int ldc, int64_t *offsetsC,
+                                   int m, const int *n, const int *k, float alpha,
+                                   const float *A, int lda, const int64_t *offsetsA,
+                                   const float *B, const int *ldb, const int64_t *offsetsB, float beta,
+                                   float *C, int ldc, const int64_t *offsetsC,
                                    int batchCount, cudaStream_t stream,
                                    void *growing_allocator) {
 

diff --git a/src/trans/gpu/algor/hicblas_gemm.hip.cpp b/src/trans/gpu/algor/hicblas_gemm.hip.cpp
@@ -61,7 +61,7 @@ template <typename Gemm> auto &get_graph_cache() {
 template <typename Gemm> auto &get_ptr_cache() {
   using real_t = typename Gemm::real_type;
   static std::unordered_map<
-      cache_key, std::tuple<real_t const *, real_t const *, real_t const *>>
+      cache_key, std::tuple<const real_t *, const real_t *, const real_t *>>
       ptrCache;
   return ptrCache;
 }
@@ -89,10 +89,11 @@ template <typename Gemm> void erase_from_caches(int resol_id) {
 
 // this version is using graphs and caches the graphs
 template <typename Gemm, typename Real>
-void run_group_graph(Gemm &&gemm, int resol_id, int m, int *n, int *k,
-                     Real alpha, const Real *A, int lda, int64_t *offsetsA,
-                     const Real *B, int ldb, int64_t *offsetsB, Real beta,
-                     Real *C, int ldc, int64_t *offsetsC, int batchCount,
+void run_group_graph(Gemm &&gemm, int resol_id, int m, const int *n,
+                     const int *k, Real alpha, const Real *A, int lda,
+                     const int64_t *offsetsA, const Real *B, const int *ldb,
+                     const int64_t *offsetsB, Real beta, Real *C, int ldc,
+                     const int64_t *offsetsC, int batchCount,
                      hipStream_t stream, int blas_id, void *growing_allocator) {
   growing_allocator_register_free_c(growing_allocator,
                                     free_gemm_graph_cache<Gemm>);
@@ -138,7 +139,7 @@ void run_group_graph(Gemm &&gemm, int resol_id, int m, int *n, int *k,
 
       HIC_CHECK(hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal));
       gemm(stream, m, n[i], k[i], alpha, A + offsetsA[i], lda, B + offsetsB[i],
-           ldb, beta, C + offsetsC[i], ldc);
+           ldb[i], beta, C + offsetsC[i], ldc);
       hipGraph_t my_graph;
       HIC_CHECK(hipStreamEndCapture(stream, &my_graph));
       hipGraphNode_t my_node;
@@ -163,16 +164,16 @@ void run_group_graph(Gemm &&gemm, int resol_id, int m, int *n, int *k,
 
 // stupid simple gemm calls
 template <typename Gemm, typename Real>
-void run_group(Gemm &&gemm, int resol_id, int m, int *n, int *k, Real alpha,
-               const Real *A, int lda, int64_t *offsetsA, const Real *B,
-               int ldb, int64_t *offsetsB, Real beta, Real *C, int ldc,
-               int64_t *offsetsC, int batchCount, hipStream_t stream,
-               int = -1) {
+void run_group(Gemm &&gemm, int resol_id, int m, const int *n, const int *k,
+               Real alpha, const Real *A, int lda, const int64_t *offsetsA,
+               const Real *B, const int *ldb, const int64_t *offsetsB,
+               Real beta, Real *C, int ldc, const int64_t *offsetsC,
+               int batchCount, hipStream_t stream, int = -1) {
   for (int i = 0; i < batchCount; ++i) {
     if (m == 0 || n[i] == 0 || k[i] == 0)
       continue;
     gemm(stream, m, n[i], k[i], alpha, A + offsetsA[i], lda, B + offsetsB[i],
-         ldb, beta, C + offsetsC[i], ldc);
+         ldb[i], beta, C + offsetsC[i], ldc);
   }
 }
 
@@ -215,14 +216,12 @@ template <typename Real> struct hipblas_gemm_grouped {
 
 #ifndef USE_CUTLASS
 
-void hipblas_sgemm_wrapper_grouped(int resol_id, int blas_id, char transa,
-                                   char transb, int m, int *n, int *k,
-                                   float alpha, const float *A, int lda,
-                                   int64_t *offsetsA, const float *B, int ldb,
-                                   int64_t *offsetsB, float beta, float *C,
-                                   int ldc, int64_t *offsetsC, int batchCount,
-                                   hipStream_t stream,
-                                   void *growing_allocator) {
+void hipblas_sgemm_wrapper_grouped(
+    int resol_id, int blas_id, char transa, char transb, int m, const int *n,
+    const int *k, float alpha, const float *A, int lda, const int64_t *offsetsA,
+    const float *B, const int *ldb, const int64_t *offsetsB, float beta,
+    float *C, int ldc, const int64_t *offsetsC, int batchCount,
+    hipStream_t stream, void *growing_allocator) {
 
   hipblasOperation_t op_t1 = HIPBLAS_OP_N, op_t2 = HIPBLAS_OP_N;
   if (transa == 'T' || transa == 't')
@@ -244,12 +243,13 @@ void hipblas_sgemm_wrapper_grouped(int resol_id, int blas_id, char transa,
 #endif
 
 void hipblas_dgemm_wrapper_grouped(int resol_id, int blas_id, char transa,
-                                   char transb, int m, int *n, int *k,
-                                   double alpha, const double *A, int lda,
-                                   int64_t *offsetsA, const double *B, int ldb,
-                                   int64_t *offsetsB, double beta, double *C,
-                                   int ldc, int64_t *offsetsC, int batchCount,
-                                   hipStream_t stream, void *) {
+                                   char transb, int m, const int *n,
+                                   const int *k, double alpha, const double *A,
+                                   int lda, const int64_t *offsetsA,
+                                   const double *B, const int *ldb,
+                                   const int64_t *offsetsB, double beta,
+                                   double *C, int ldc, const int64_t *offsetsC,
+                                   int batchCount, hipStream_t stream, void *) {
 
   hipblasOperation_t op_t1 = HIPBLAS_OP_N, op_t2 = HIPBLAS_OP_N;
   if (transa == 'T' || transa == 't')
@@ -313,13 +313,12 @@ void hipblas_sgemm_wrapper(char transa, char transb, int m, int n, int k,
       batchCount));
 }
 
-void hipblas_sgemm_wrapper_grouped(int resol_id, int blas_id, char transa,
-                                   char transb, int m, int *n, int *k,
-                                   float alpha, const float *A, int lda,
-                                   int64_t *offsetsA, const float *B, int ldb,
-                                   int64_t *offsetsB, float beta, float *C,
-                                   int ldc, int64_t *offsetsC, int batchCount,
-                                   size_t stream, void *growing_allocator) {
+void hipblas_sgemm_wrapper_grouped(
+    int resol_id, int blas_id, char transa, char transb, int m, const int *n,
+    const int *k, float alpha, const float *A, int lda, const int64_t *offsetsA,
+    const float *B, const int *ldb, const int64_t *offsetsB, float beta,
+    float *C, int ldc, const int64_t *offsetsC, int batchCount, size_t stream,
+    void *growing_allocator) {
 #ifdef USE_CUTLASS
   cutlass_sgemm_wrapper_grouped(resol_id, blas_id, transa, transb, m, n, k,
                                 alpha, A, lda, offsetsA, B, ldb, offsetsB, beta,
@@ -334,12 +333,14 @@ void hipblas_sgemm_wrapper_grouped(int resol_id, int blas_id, char transa,
 }
 
 void hipblas_dgemm_wrapper_grouped(int resol_id, int blas_id, char transa,
-                                   char transb, int m, int *n, int *k,
-                                   double alpha, const double *A, int lda,
-                                   int64_t *offsetsA, const double *B, int ldb,
-                                   int64_t *offsetsB, double beta, double *C,
-                                   int ldc, int64_t *offsetsC, int batchCount,
-                                   size_t stream, void *growing_allocator) {
+                                   char transb, int m, const int *n,
+                                   const int *k, double alpha, const double *A,
+                                   int lda, const int64_t *offsetsA,
+                                   const double *B, const int *ldb,
+                                   const int64_t *offsetsB, double beta,
+                                   double *C, int ldc, const int64_t *offsetsC,
+                                   int batchCount, size_t stream,
+                                   void *growing_allocator) {
   hipblas_dgemm_wrapper_grouped(resol_id, blas_id, transa, transb, m, n, k,
                                 alpha, A, lda, offsetsA, B, ldb, offsetsB, beta,
                                 C, ldc, offsetsC, batchCount,

diff --git a/src/trans/gpu/algor/hicblas_mod.F90 b/src/trans/gpu/algor/hicblas_mod.F90
@@ -87,8 +87,8 @@ SUBROUTINE HIP_DGEMM_GROUPED( &
   &) BIND(C, NAME='hipblas_dgemm_wrapper_grouped')
     USE ISO_C_BINDING, ONLY: C_CHAR, C_INT, C_DOUBLE, C_SIZE_T, C_PTR, C_INT64_T
     CHARACTER(1,C_CHAR), VALUE :: CTA, CTB
-    INTEGER(C_INT), VALUE  :: RESOL_ID, BLAS_ID, M, LDA, LDB, LDC, BATCHCOUNT
-    INTEGER(C_INT)         :: N(*), K(*)
+    INTEGER(C_INT), VALUE  :: RESOL_ID, BLAS_ID, M, LDA, LDC, BATCHCOUNT
+    INTEGER(C_INT)         :: N(*), K(*), LDB(*)
     INTEGER(C_INT64_T)     :: OFFSETA(*), OFFSETB(*), OFFSETC(*)
     REAL(C_DOUBLE), VALUE  :: ALPHA,BETA
     REAL(C_DOUBLE)         :: A(*), B(*), C(*)
@@ -108,8 +108,8 @@ SUBROUTINE HIP_SGEMM_GROUPED( &
   &) BIND(C, NAME='hipblas_sgemm_wrapper_grouped')
     USE ISO_C_BINDING, ONLY: C_CHAR, C_INT, C_FLOAT, C_SIZE_T, C_PTR, C_INT64_T
     CHARACTER(1,C_CHAR), VALUE :: CTA, CTB
-    INTEGER(C_INT), VALUE :: RESOL_ID, BLAS_ID, M, LDA, LDB, LDC, BATCHCOUNT
-    INTEGER(C_INT)        :: N(*), K(*)
+    INTEGER(C_INT), VALUE :: RESOL_ID, BLAS_ID, M, LDA, LDC, BATCHCOUNT
+    INTEGER(C_INT)        :: N(*), K(*), LDB(*)
     INTEGER(C_INT64_T)    :: OFFSETA(*), OFFSETB(*), OFFSETC(*)
     REAL(C_FLOAT), VALUE  :: ALPHA,BETA
     REAL(C_FLOAT)         :: A(*), B(*), C(*)
@@ -247,7 +247,7 @@ SUBROUTINE HIP_DGEMM_GROUPED_OVERLOAD( &
   INTEGER(KIND=JPIM) :: LDA
   INTEGER(KIND=JPIB) :: OFFSETA(:)
   REAL(KIND=JPRD), DIMENSION(*) :: BARRAY
-  INTEGER(KIND=JPIM) :: LDB
+  INTEGER(KIND=JPIM) :: LDB(:)
   INTEGER(KIND=JPIB) :: OFFSETB(:)
   REAL(KIND=JPRD) :: BETA
   REAL(KIND=JPRD), DIMENSION(:) :: CARRAY
@@ -297,8 +297,8 @@ SUBROUTINE HIP_SGEMM_GROUPED_OVERLOAD(&
   REAL(KIND=JPRM), DIMENSION(:) :: AARRAY
   INTEGER(KIND=JPIM) :: LDA
   INTEGER(KIND=JPIB) :: OFFSETA(:)
-  REAL(KIND=JPRM), DIMENSION(:,:,:) :: BARRAY
-  INTEGER(KIND=JPIM) :: LDB
+  REAL(KIND=JPRM), DIMENSION(*) :: BARRAY
+  INTEGER(KIND=JPIM) :: LDB(:)
   INTEGER(KIND=JPIB) :: OFFSETB(:)
   REAL(KIND=JPRM) :: BETA
   REAL(KIND=JPRM), DIMENSION(:) :: CARRAY

diff --git a/src/trans/gpu/external/setup_trans.F90 b/src/trans/gpu/external/setup_trans.F90
@@ -172,7 +172,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,&
 #endif
 INTEGER :: INUMDEVS, IDEV, MYGPU
 
-#include "user_clock.intfb.h"
+REAL(KIND=JPRBT), POINTER :: LOCAL_ARR(:,:)
 !     ------------------------------------------------------------------
 
 IF (LHOOK) CALL DR_HOOK('SETUP_TRANS',0,ZHOOK_HANDLE)
@@ -484,49 +484,54 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,&
 
   ! Initialize A arrays
 
-  ALLOCATE(FG%ZAA(ALIGN(R%NDGNH,8),ALIGN((R%NTMAX+2)/2,8),D%NUMP))
-  ALLOCATE(FG%ZAS(ALIGN(R%NDGNH,8),ALIGN((R%NTMAX+3)/2,8),D%NUMP))
+  ALLOCATE(FG%ZAA(D%OFFSETS_GEMM_MATRIX(D%NUMP+1)))
+  ALLOCATE(FG%ZAS(D%OFFSETS_GEMM_MATRIX(D%NUMP+1)))
 
-  FG%ZAA(:,:,:) = 0._JPRBT
-  FG%ZAS(:,:,:) = 0._JPRBT
+  FG%ZAA(:) = 0._JPRBT
+  FG%ZAS(:) = 0._JPRBT
 
   DO JMLOC=1,D%NUMP
     KM = D%MYMS(JMLOC)
     KDGLU = G%NDGLU(KM)
     ILA = (R%NSMAX-KM+2)/2
     ILS = (R%NSMAX-KM+3)/2
 
-    FG%ZAA(1:KDGLU,1:ILA,JMLOC)=S%FA(JMLOC)%RPNMA(1:KDGLU,1:ILA)
-    FG%ZAS(1:KDGLU,1:ILS,JMLOC)=S%FA(JMLOC)%RPNMS(1:KDGLU,1:ILS)
+    IF (KM /= 0) THEN
+      CALL C_F_POINTER(C_LOC(FG%ZAA(1+D%OFFSETS_GEMM_MATRIX(JMLOC))), LOCAL_ARR, &
+          & (/D%LEGENDRE_MATRIX_STRIDES(JMLOC),ILA/))
+      LOCAL_ARR(1:KDGLU,1:ILA) = S%FA(JMLOC)%RPNMA(1:KDGLU,1:ILA)
+
+      CALL C_F_POINTER(C_LOC(FG%ZAS(1+D%OFFSETS_GEMM_MATRIX(JMLOC))), LOCAL_ARR, &
+          & (/D%LEGENDRE_MATRIX_STRIDES(JMLOC),ILS/))
+      LOCAL_ARR(1:KDGLU,1:ILS) = S%FA(JMLOC)%RPNMS(1:KDGLU,1:ILS)
+    ELSE
+      ALLOCATE(FG%ZAA0(ALIGN(KDGLU,8),ILA))
+      ALLOCATE(FG%ZAS0(ALIGN(KDGLU,8),ILS))
+
+      FG%ZAA0(:,:) = 0
+      FG%ZAS0(:,:) = 0
+      FG%ZAA0(1:KDGLU,1:ILA)=S%FA(JMLOC)%RPNMA(1:KDGLU,1:ILA)
+      FG%ZAS0(1:KDGLU,1:ILS)=S%FA(JMLOC)%RPNMS(1:KDGLU,1:ILS)
+    ENDIF
   ENDDO
-
-  ! arrays for m=0 in ledir_mod:
-  IMLOC0 = FINDLOC(D%MYMS,0)
-  IF(IMLOC0(1) > 0) THEN
-    ALLOCATE(FG%ZAA0(SIZE(FG%ZAA,1),SIZE(FG%ZAA,2)))
-    ALLOCATE(FG%ZAS0(SIZE(FG%ZAS,1),SIZE(FG%ZAS,2)))
-    FG%ZAA0 = FG%ZAA(:,:,IMLOC0(1))
-    FG%ZAS0 = FG%ZAS(:,:,IMLOC0(1))
-  ENDIF
-
+
   ALLOCATE(FG%ZEPSNM(D%NUMP,0:R%NTMAX+2))
   FG%ZEPSNM = 0._JPRBT
-  CALL PREPSNM !Initialize on the host
-
+  CALL PREPSNM
   WRITE(NOUT,*)'setup_trans: sizes1 NUMP=',D%NUMP
 #ifdef ACCGPU
   WRITE(NOUT,*) 'Using OpenACC'
 #endif
 #ifdef OMPGPU
   WRITE(NOUT,*) 'Using OpenMP offloading'
 #endif
-  WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAS', C_SIZEOF(FG%ZAS(1,1,1))*SIZE(FG%ZAS)
-  WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAA', C_SIZEOF(FG%ZAA(1,1,1))*SIZE(FG%ZAA)
-  WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAS0', C_SIZEOF(FG%ZAS0(1,1))*SIZE(FG%ZAS0)
-  WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAA0', C_SIZEOF(FG%ZAA0(1,1))*SIZE(FG%ZAA0)
-  WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZEPSNM', C_SIZEOF(FG%ZEPSNM(1,1))*SIZE(FG%ZEPSNM)
+  WRITE(NOUT,'(A10,":",I11,"B")') 'FG%ZAS', C_SIZEOF(FG%ZAS(1))*SIZE(FG%ZAS)
+  WRITE(NOUT,'(A10,":",I11,"B")') 'FG%ZAA', C_SIZEOF(FG%ZAA(1))*SIZE(FG%ZAA)
+  WRITE(NOUT,'(A10,":",I11,"B")') 'FG%ZAS0', C_SIZEOF(FG%ZAS0(1,1))*SIZE(FG%ZAS0)
+  WRITE(NOUT,'(A10,":",I11,"B")') 'FG%ZAA0', C_SIZEOF(FG%ZAA0(1,1))*SIZE(FG%ZAA0)
+  WRITE(NOUT,'(A10,":",I11,"B")') 'FG%ZEPSNM', C_SIZEOF(FG%ZEPSNM(1,1))*SIZE(FG%ZEPSNM)
 
-  IF (IMLOC0(1) > 0) THEN
+  IF (ANY(D%MYMS == 0)) THEN
 #ifdef ACCGPU
     !$ACC ENTER DATA COPYIN(FG%ZAA0,FG%ZAS0) ASYNC(1)
 #endif

diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90
@@ -211,7 +211,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS)
       NS(KMLOC) = (R_NSMAX-KM+2)/2
       KS(KMLOC) = G_NDGLU(KM)
       AOFFSETS(KMLOC) = IIN_STRIDES0*D_OFFSETS_GEMM1(KMLOC)
-      BOFFSETS(KMLOC) = SIZE(ZAA,1)*SIZE(ZAA,2)*(KMLOC-1)
+      BOFFSETS(KMLOC) = D%OFFSETS_GEMM_MATRIX(KMLOC)
       COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM2(KMLOC)
     ENDDO
     IF(IMLOC0(1) > 0) THEN
@@ -230,7 +230,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS)
       & 2*KF_FS, NS(:), KS(:), &
       & 1.0_JPRBT, &
       & ZINPA, IIN_STRIDES0, AOFFSETS, &
-      & ZAA, SIZE(ZAA,1), BOFFSETS, &
+      & ZAA, D%LEGENDRE_MATRIX_STRIDES, BOFFSETS, &
       & 0.0_JPRBT, &
       & ZOUT, IOUT_STRIDES0, COFFSETS, &
       & D_NUMP, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR)
@@ -331,7 +331,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS)
       NS(KMLOC) = (R_NSMAX-KM+3)/2
       KS(KMLOC) = G_NDGLU(KM)
       AOFFSETS(KMLOC) = IIN_STRIDES0*D_OFFSETS_GEMM1(KMLOC)
-      BOFFSETS(KMLOC) = SIZE(ZAS,1)*SIZE(ZAS,2)*(KMLOC-1)
+      BOFFSETS(KMLOC) = D%OFFSETS_GEMM_MATRIX(KMLOC)
       COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM2(KMLOC)
     ENDDO
     IF(IMLOC0(1) > 0) THEN
@@ -350,7 +350,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS)
       & 2*KF_FS, NS(:), KS(:), &
       & 1.0_JPRBT, &
       & ZINPS, IIN_STRIDES0, AOFFSETS, &
-      & ZAS, SIZE(ZAS,1), BOFFSETS, &
+      & ZAS, D%LEGENDRE_MATRIX_STRIDES, BOFFSETS, &
       & 0.0_JPRBT, &
       & ZOUT, IOUT_STRIDES0, COFFSETS, &
       & D_NUMP, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR)

diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90
@@ -271,7 +271,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG)
       KS(KMLOC) = (R_NSMAX-KM+2)/2
       NS(KMLOC) = G_NDGLU(KM)
       AOFFSETS(KMLOC) = IIN_STRIDES0*D_OFFSETS_GEMM2(KMLOC)
-      BOFFSETS(KMLOC) = SIZE(ZAA,1)*SIZE(ZAA,2)*(KMLOC-1)
+      BOFFSETS(KMLOC) = D%OFFSETS_GEMM_MATRIX(KMLOC)
       COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM1(KMLOC)
     ENDDO
     IF(IMLOC0(1) > 0) THEN
@@ -290,7 +290,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG)
         & 2*KF_LEG, NS(:), KS(:), &
         & 1.0_JPRBT, &
         & ZINP, IIN_STRIDES0, AOFFSETS, &
-        & ZAA, SIZE(ZAA,1), BOFFSETS, &
+        & ZAA, D%LEGENDRE_MATRIX_STRIDES, BOFFSETS, &
         & 0.0_JPRBT, &
         & ZOUTA, IOUT_STRIDES0, COFFSETS, &
         & D_NUMP, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR)
@@ -411,7 +411,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG)
       KS(KMLOC) = (R_NSMAX-KM+3)/2
       NS(KMLOC) = G_NDGLU(KM)
       AOFFSETS(KMLOC) = IIN_STRIDES0*D_OFFSETS_GEMM2(KMLOC)
-      BOFFSETS(KMLOC) = SIZE(ZAS,1)*SIZE(ZAS,2)*(KMLOC-1)
+      BOFFSETS(KMLOC) = D%OFFSETS_GEMM_MATRIX(KMLOC)
       COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM1(KMLOC)
     ENDDO
     IF(IMLOC0(1) > 0) THEN
@@ -430,7 +430,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG)
       & 2*KF_LEG, NS(:), KS(:), &
       & 1.0_JPRBT, &
       & ZINP, IIN_STRIDES0, AOFFSETS, &
-      & ZAS, SIZE(ZAS,1), BOFFSETS, &
+      & ZAS, D%LEGENDRE_MATRIX_STRIDES, BOFFSETS, &
       & 0.0_JPRBT, &
       & ZOUTS, IOUT_STRIDES0, COFFSETS, &
       & D_NUMP, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR)