diff --git a/src/trans/common/internal/tpm_distr.F90 b/src/trans/common/internal/tpm_distr.F90
index 6ae33844..eddb1684 100755
--- a/src/trans/common/internal/tpm_distr.F90
+++ b/src/trans/common/internal/tpm_distr.F90
@@ -171,7 +171,8 @@ MODULE TPM_DISTR
 REAL(KIND=JPRD) ,ALLOCATABLE :: RWEIGHT(:) ! Weight per grid-point (if weighted distribution)
 INTEGER(KIND=JPIM) ,ALLOCATABLE :: NPROCA_GP(:) ! Number of grid-points per a-set
 
-INTEGER(KIND=JPIB), ALLOCATABLE :: OFFSETS_GEMM1(:), OFFSETS_GEMM2(:)
+INTEGER(KIND=JPIB), ALLOCATABLE :: OFFSETS_GEMM1(:), OFFSETS_GEMM2(:), OFFSETS_GEMM_MATRIX(:)
+INTEGER(KIND=JPIM), ALLOCATABLE :: LEGENDRE_MATRIX_STRIDES(:)
 
 END TYPE DISTR_TYPE
 
diff --git a/src/trans/gpu/algor/hicblas_cutlass.cuda.h b/src/trans/gpu/algor/hicblas_cutlass.cuda.h
index 9a42bb2f..8b0eacf8 100644
--- a/src/trans/gpu/algor/hicblas_cutlass.cuda.h
+++ b/src/trans/gpu/algor/hicblas_cutlass.cuda.h
@@ -154,11 +154,11 @@ class cutlass_sgemm_grouped<CutlassType::cutlass_fp32, TransA, TransB> {
 
 } // namespace detail
 template <cublasOperation_t TransA, cublasOperation_t TransB>
-void cutlass_sgemm_wrapper_grouped_op(int resol_id, int blas_id, int m, int *n, int *k,
+void cutlass_sgemm_wrapper_grouped_op(int resol_id, int blas_id, int m, const int *n, const int *k,
                                       float alpha, const float *A, int lda,
-                                      int64_t *offsetsA, const float *B, int ldb,
-                                      int64_t *offsetsB, float beta, float *C,
-                                      int ldc, int64_t *offsetsC, int batchCount,
+                                      const int64_t *offsetsA, const float *B, const int *ldb,
+                                      const int64_t *offsetsB, float beta, float *C,
+                                      int ldc, const int64_t *offsetsC, int batchCount,
                                       cudaStream_t stream,
                                       void *growing_allocator) {
   using namespace detail;
@@ -182,10 +182,10 @@ void cutlass_sgemm_wrapper_grouped_op(int resol_id, int blas_id, int m, int *n,
 }
 
 void cutlass_sgemm_wrapper_grouped(int resol_id, int blas_id, char transa, char transb,
-                                   int m, int *n, int *k, float alpha,
-                                   const float *A, int lda, int64_t *offsetsA,
-                                   const float *B, int ldb, int64_t *offsetsB, float beta,
-                                   float *C, int ldc, int64_t *offsetsC,
+                                   int m, const int *n, const int *k, float alpha,
+                                   const float *A, int lda, const int64_t *offsetsA,
+                                   const float *B, const int *ldb, const int64_t *offsetsB, float beta,
+                                   float *C, int ldc, const int64_t *offsetsC,
                                    int batchCount, cudaStream_t stream,
                                    void *growing_allocator) {
 
diff --git a/src/trans/gpu/algor/hicblas_gemm.hip.cpp b/src/trans/gpu/algor/hicblas_gemm.hip.cpp
index 252dc7dd..5346667e 100644
--- a/src/trans/gpu/algor/hicblas_gemm.hip.cpp
+++ b/src/trans/gpu/algor/hicblas_gemm.hip.cpp
@@ -89,10 +89,10 @@ template <typename Gemm> void erase_from_caches(int resol_id) {
 
 // this version is using graphs and caches the graphs
 template <typename Gemm, typename Real>
-void run_group_graph(Gemm &&gemm, int resol_id, int m, int *n, int *k,
-                     Real alpha, const Real *A, int lda, int64_t *offsetsA,
-                     const Real *B, int ldb, int64_t *offsetsB, Real beta,
-                     Real *C, int ldc, int64_t *offsetsC, int batchCount,
+void run_group_graph(Gemm &&gemm, int resol_id, int m, const int *n, const int *k,
+                     Real alpha, const Real *A, int lda, const int64_t *offsetsA,
+                     const Real *B, const int *ldb, const int64_t *offsetsB, Real beta,
+                     Real *C, int ldc, const int64_t *offsetsC, int batchCount,
                      hipStream_t stream, int blas_id, void *growing_allocator) {
   growing_allocator_register_free_c(growing_allocator,
                                     free_gemm_graph_cache<Gemm>);
@@ -138,7 +138,7 @@ void run_group_graph(Gemm &&gemm, int resol_id, int m, int *n, int *k,
 
       HIC_CHECK(hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal));
       gemm(stream, m, n[i], k[i], alpha, A + offsetsA[i], lda, B + offsetsB[i],
-           ldb, beta, C + offsetsC[i], ldc);
+           ldb[i], beta, C + offsetsC[i], ldc);
       hipGraph_t my_graph;
       HIC_CHECK(hipStreamEndCapture(stream, &my_graph));
       hipGraphNode_t my_node;
@@ -163,16 +163,16 @@ void run_group_graph(Gemm &&gemm, int resol_id, int m, int *n, int *k,
 
 // stupid simple gemm calls
 template <typename Gemm, typename Real>
-void run_group(Gemm &&gemm, int resol_id, int m, int *n, int *k, Real alpha,
-               const Real *A, int lda, int64_t *offsetsA, const Real *B,
-               int ldb, int64_t *offsetsB, Real beta, Real *C, int ldc,
-               int64_t *offsetsC, int batchCount, hipStream_t stream,
+void run_group(Gemm &&gemm, int resol_id, int m, const int *n, const int *k, Real alpha,
+               const Real *A, int lda, const int64_t *offsetsA, const Real *B,
+               const int *ldb, const int64_t *offsetsB, Real beta, Real *C, int ldc,
+               const int64_t *offsetsC, int batchCount, hipStream_t stream,
                int = -1) {
   for (int i = 0; i < batchCount; ++i) {
     if (m == 0 || n[i] == 0 || k[i] == 0)
       continue;
     gemm(stream, m, n[i], k[i], alpha, A + offsetsA[i], lda, B + offsetsB[i],
-         ldb, beta, C + offsetsC[i], ldc);
+         ldb[i], beta, C + offsetsC[i], ldc);
   }
 }
 
@@ -216,11 +216,11 @@ template <typename Real> struct hipblas_gemm_grouped {
 #ifndef USE_CUTLASS
 
 void hipblas_sgemm_wrapper_grouped(int resol_id, int blas_id, char transa,
-                                   char transb, int m, int *n, int *k,
+                                   char transb, int m, const int *n, const int *k,
                                    float alpha, const float *A, int lda,
-                                   int64_t *offsetsA, const float *B, int ldb,
-                                   int64_t *offsetsB, float beta, float *C,
-                                   int ldc, int64_t *offsetsC, int batchCount,
+                                   const int64_t *offsetsA, const float *B, const int *ldb,
+                                   const int64_t *offsetsB, float beta, float *C,
+                                   int ldc, const int64_t *offsetsC, int batchCount,
                                    hipStream_t stream,
                                    void *growing_allocator) {
 
@@ -244,11 +244,11 @@ void hipblas_sgemm_wrapper_grouped(int resol_id, int blas_id, char transa,
 #endif
 
 void hipblas_dgemm_wrapper_grouped(int resol_id, int blas_id, char transa,
-                                   char transb, int m, int *n, int *k,
+                                   char transb, int m, const int *n, const int *k,
                                    double alpha, const double *A, int lda,
-                                   int64_t *offsetsA, const double *B, int ldb,
-                                   int64_t *offsetsB, double beta, double *C,
-                                   int ldc, int64_t *offsetsC, int batchCount,
+                                   const int64_t *offsetsA, double const *B, const int *ldb,
+                                   const int64_t *offsetsB, double beta, double *C,
+                                   int ldc, const int64_t *offsetsC, int batchCount,
                                    hipStream_t stream, void *) {
 
   hipblasOperation_t op_t1 = HIPBLAS_OP_N, op_t2 = HIPBLAS_OP_N;
@@ -314,11 +314,11 @@ void hipblas_sgemm_wrapper(char transa, char transb, int m, int n, int k,
 }
 
 void hipblas_sgemm_wrapper_grouped(int resol_id, int blas_id, char transa,
-                                   char transb, int m, int *n, int *k,
+                                   char transb, int m, const int *n, const int *k,
                                    float alpha, const float *A, int lda,
-                                   int64_t *offsetsA, const float *B, int ldb,
-                                   int64_t *offsetsB, float beta, float *C,
-                                   int ldc, int64_t *offsetsC, int batchCount,
+                                   const int64_t *offsetsA, const float *B, const int *ldb,
+                                   const int64_t *offsetsB, float beta, float *C,
+                                   int ldc, const int64_t *offsetsC, int batchCount,
                                    size_t stream, void *growing_allocator) {
 #ifdef USE_CUTLASS
   cutlass_sgemm_wrapper_grouped(resol_id, blas_id, transa, transb, m, n, k,
@@ -334,11 +334,11 @@ void hipblas_sgemm_wrapper_grouped(int resol_id, int blas_id, char transa,
 }
 
 void hipblas_dgemm_wrapper_grouped(int resol_id, int blas_id, char transa,
-                                   char transb, int m, int *n, int *k,
-                                   double alpha, const double *A, int lda,
-                                   int64_t *offsetsA, const double *B, int ldb,
-                                   int64_t *offsetsB, double beta, double *C,
-                                   int ldc, int64_t *offsetsC, int batchCount,
+                                   char transb, int m, const int *n, const int *k,
+                                   double alpha, double const *A, int lda,
+                                   const int64_t *offsetsA, double const *B, const int *ldb,
+                                   const int64_t *offsetsB, double beta, double *C,
+                                   int ldc, const int64_t *offsetsC, int batchCount,
                                    size_t stream, void *growing_allocator) {
   hipblas_dgemm_wrapper_grouped(resol_id, blas_id, transa, transb, m, n, k,
                                 alpha, A, lda, offsetsA, B, ldb, offsetsB, beta,
diff --git a/src/trans/gpu/algor/hicblas_mod.F90 b/src/trans/gpu/algor/hicblas_mod.F90
index f7528a11..5949db68 100644
--- a/src/trans/gpu/algor/hicblas_mod.F90
+++ b/src/trans/gpu/algor/hicblas_mod.F90
@@ -83,8 +83,8 @@ SUBROUTINE HIP_DGEMM_GROUPED( &
   &) BIND(C, NAME='hipblas_dgemm_wrapper_grouped')
     USE ISO_C_BINDING, ONLY: C_CHAR, C_INT, C_DOUBLE, C_SIZE_T, C_PTR, C_INT64_T
     CHARACTER(1,C_CHAR), VALUE :: CTA, CTB
-    INTEGER(C_INT), VALUE  :: RESOL_ID, BLAS_ID, M, LDA, LDB, LDC, BATCHCOUNT
-    INTEGER(C_INT)         :: N(*), K(*)
+    INTEGER(C_INT), VALUE  :: RESOL_ID, BLAS_ID, M, LDA, LDC, BATCHCOUNT
+    INTEGER(C_INT)         :: N(*), K(*), LDB(*)
     INTEGER(C_INT64_T)     :: OFFSETA(*), OFFSETB(*), OFFSETC(*)
     REAL(C_DOUBLE), VALUE  :: ALPHA,BETA
     REAL(C_DOUBLE)         :: A(*), B(*), C(*)
@@ -104,8 +104,8 @@ SUBROUTINE HIP_SGEMM_GROUPED( &
   &) BIND(C, NAME='hipblas_sgemm_wrapper_grouped')
     USE ISO_C_BINDING, ONLY: C_CHAR, C_INT, C_FLOAT, C_SIZE_T, C_PTR, C_INT64_T
     CHARACTER(1,C_CHAR), VALUE :: CTA, CTB
-    INTEGER(C_INT), VALUE :: RESOL_ID, BLAS_ID, M, LDA, LDB, LDC, BATCHCOUNT
-    INTEGER(C_INT)        :: N(*), K(*)
+    INTEGER(C_INT), VALUE :: RESOL_ID, BLAS_ID, M, LDA, LDC, BATCHCOUNT
+    INTEGER(C_INT)        :: N(*), K(*), LDB(*)
     INTEGER(C_INT64_T)    :: OFFSETA(*), OFFSETB(*), OFFSETC(*)
     REAL(C_FLOAT), VALUE  :: ALPHA,BETA
     REAL(C_FLOAT)         :: A(*), B(*), C(*)
@@ -231,7 +231,7 @@ SUBROUTINE HIP_DGEMM_GROUPED_OVERLOAD( &
   INTEGER(KIND=JPIM) :: LDA
   INTEGER(KIND=JPIB) :: OFFSETA(:)
   REAL(KIND=JPRD), DIMENSION(*) :: BARRAY
-  INTEGER(KIND=JPIM) :: LDB
+  INTEGER(KIND=JPIM) :: LDB(:)
   INTEGER(KIND=JPIB) :: OFFSETB(:)
   REAL(KIND=JPRD) :: BETA
   REAL(KIND=JPRD), DIMENSION(:) :: CARRAY
@@ -277,8 +277,8 @@ SUBROUTINE HIP_SGEMM_GROUPED_OVERLOAD(&
   REAL(KIND=JPRM), DIMENSION(:) :: AARRAY
   INTEGER(KIND=JPIM) :: LDA
   INTEGER(KIND=JPIB) :: OFFSETA(:)
-  REAL(KIND=JPRM), DIMENSION(:,:,:) :: BARRAY
-  INTEGER(KIND=JPIM) :: LDB
+  REAL(KIND=JPRM), DIMENSION(*) :: BARRAY
+  INTEGER(KIND=JPIM) :: LDB(:)
   INTEGER(KIND=JPIB) :: OFFSETB(:)
   REAL(KIND=JPRM) :: BETA
   REAL(KIND=JPRM), DIMENSION(:) :: CARRAY
diff --git a/src/trans/gpu/external/setup_trans.F90 b/src/trans/gpu/external/setup_trans.F90
index b1fdfc88..46087f28 100755
--- a/src/trans/gpu/external/setup_trans.F90
+++ b/src/trans/gpu/external/setup_trans.F90
@@ -98,7 +98,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,&
 !        R. El Khatib 07-Mar-2016 Better flexibility for Legendre polynomials computation in stretched mode
 !     ------------------------------------------------------------------
 
-USE PARKIND1,        ONLY: JPIM, JPRB, JPRD
+USE PARKIND1,        ONLY: JPIM, JPRB, JPRD, JPIB
 USE PARKIND_ECTRANS, ONLY: JPRBT
 
 !ifndef INTERFACE
@@ -166,12 +166,10 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,&
 
 ! Local variables
 INTEGER(KIND=JPIM) :: JGL,JRES,IDEF_RESOL
-INTEGER(KIND=JPIM) :: JMLOC, KM, ILA, ILS, KMLOC, KDGLU, JK, I, J
-
-INTEGER(KIND=JPIM) :: IPROC, IPROCS, ISTAN, ISTAS, ISL, IGLS, JFLD, IMLOC0(1)
+INTEGER(KIND=JPIM) :: JMLOC, KM, ILA, ILS, KDGLU
+INTEGER(KIND=JPIM) :: IMLOC0
 
 LOGICAL :: LLP1,LLP2, LLSPSETUPONLY
-REAL(KIND=JPRD)    :: ZTIME0,ZTIME1,ZTIME2
 REAL(KIND=JPHOOK) :: ZHOOK_HANDLE
 
 CHARACTER(LEN=8)  :: CENV
@@ -181,7 +179,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,&
 #endif
 INTEGER :: INUMDEVS, IUNIT, ISTAT, IDEV, MYGPU
 
-#include "user_clock.intfb.h"
+REAL(KIND=JPRBT), POINTER :: LOCAL_ARR(:,:)
 !     ------------------------------------------------------------------
 
 IF (LHOOK) CALL DR_HOOK('SETUP_TRANS',0,ZHOOK_HANDLE)
@@ -470,35 +468,42 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,&
 
 ! Initialize A arrays
 
-ALLOCATE(FG%ZAA(ALIGN(R%NDGNH,8),ALIGN((R%NTMAX+2)/2,8),D%NUMP))
-ALLOCATE(FG%ZAS(ALIGN(R%NDGNH,8),ALIGN((R%NTMAX+3)/2,8),D%NUMP))
+ALLOCATE(FG%ZAA(D%OFFSETS_GEMM_MATRIX(D%NUMP+1)))
+ALLOCATE(FG%ZAS(D%OFFSETS_GEMM_MATRIX(D%NUMP+1)))
 
-FG%ZAA(:,:,:) = 0._JPRBT
-FG%ZAS(:,:,:) = 0._JPRBT
+FG%ZAA(:) = 0._JPRBT
+FG%ZAS(:) = 0._JPRBT
 
+IMLOC0 = 0
 DO JMLOC=1,D%NUMP
   KM = D%MYMS(JMLOC)
   KDGLU = G%NDGLU(KM)
   ILA = (R%NSMAX-KM+2)/2
   ILS = (R%NSMAX-KM+3)/2
 
-  FG%ZAA(1:KDGLU,1:ILA,JMLOC)=S%FA(JMLOC)%RPNMA(1:KDGLU,1:ILA)
-  FG%ZAS(1:KDGLU,1:ILS,JMLOC)=S%FA(JMLOC)%RPNMS(1:KDGLU,1:ILS)
-ENDDO
+  IF (KM /= 0) THEN
+    CALL C_F_POINTER(C_LOC(FG%ZAA(1+D%OFFSETS_GEMM_MATRIX(JMLOC))), LOCAL_ARR, &
+        & (/D%LEGENDRE_MATRIX_STRIDES(JMLOC),ILA/))
+    LOCAL_ARR(1:KDGLU,1:ILA) = S%FA(JMLOC)%RPNMA(1:KDGLU,1:ILA)
 
-! arrays for m=0 in ledir_mod:
-IMLOC0 = FINDLOC(D%MYMS,0)
-IF(IMLOC0(1) > 0) THEN
-  ALLOCATE(FG%ZAA0(SIZE(FG%ZAA,1),SIZE(FG%ZAA,2)))
-  ALLOCATE(FG%ZAS0(SIZE(FG%ZAS,1),SIZE(FG%ZAS,2)))
-  FG%ZAA0 = FG%ZAA(:,:,IMLOC0(1))
-  FG%ZAS0 = FG%ZAS(:,:,IMLOC0(1))
-ENDIF
+    CALL C_F_POINTER(C_LOC(FG%ZAS(1+D%OFFSETS_GEMM_MATRIX(JMLOC))), LOCAL_ARR, &
+        & (/D%LEGENDRE_MATRIX_STRIDES(JMLOC),ILS/))
+    LOCAL_ARR(1:KDGLU,1:ILS) = S%FA(JMLOC)%RPNMS(1:KDGLU,1:ILS)
+  ELSE
+    IMLOC0 = JMLOC
+    ALLOCATE(FG%ZAA0(ALIGN(KDGLU,8),ILA))
+    ALLOCATE(FG%ZAS0(ALIGN(KDGLU,8),ILS))
+
+    FG%ZAA0(:,:) = 0
+    FG%ZAS0(:,:) = 0
+    FG%ZAA0(1:KDGLU,1:ILA)=S%FA(JMLOC)%RPNMA(1:KDGLU,1:ILA)
+    FG%ZAS0(1:KDGLU,1:ILS)=S%FA(JMLOC)%RPNMS(1:KDGLU,1:ILS)
+  ENDIF
+ENDDO
 
 ALLOCATE(FG%ZEPSNM(D%NUMP,0:R%NTMAX+2))
 FG%ZEPSNM = 0._JPRBT
-CALL PREPSNM !Initialize on the host
-
+CALL PREPSNM
 WRITE(NOUT,*)'setup_trans: sizes1 NUMP=',D%NUMP
 #ifdef ACCGPU
   WRITE(NOUT,*) 'Using OpenACC'
@@ -506,13 +511,13 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,&
 #ifdef OMPGPU
   WRITE(NOUT,*) 'Using OpenMP offloading'
 #endif
-WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAS', C_SIZEOF(FG%ZAS(1,1,1))*SIZE(FG%ZAS)
-WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAA', C_SIZEOF(FG%ZAA(1,1,1))*SIZE(FG%ZAA)
+WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAS', C_SIZEOF(FG%ZAS(1))*SIZE(FG%ZAS)
+WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAA', C_SIZEOF(FG%ZAA(1))*SIZE(FG%ZAA)
 WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAS0', C_SIZEOF(FG%ZAS0(1,1))*SIZE(FG%ZAS0)
 WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAA0', C_SIZEOF(FG%ZAA0(1,1))*SIZE(FG%ZAA0)
 WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZEPSNM', C_SIZEOF(FG%ZEPSNM(1,1))*SIZE(FG%ZEPSNM)
 
-IF (IMLOC0(1) > 0) THEN
+IF (IMLOC0 > 0) THEN
 #ifdef ACCGPU
   !$ACC ENTER DATA COPYIN(FG%ZAA0,FG%ZAS0) ASYNC(1)
 #endif
diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90
index c834cb99..e7a1fb18 100755
--- a/src/trans/gpu/internal/ledir_mod.F90
+++ b/src/trans/gpu/internal/ledir_mod.F90
@@ -211,7 +211,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS)
       NS(KMLOC) = (R_NSMAX-KM+2)/2
       KS(KMLOC) = G_NDGLU(KM)
       AOFFSETS(KMLOC) = IIN_STRIDES0*D_OFFSETS_GEMM1(KMLOC)
-      BOFFSETS(KMLOC) = SIZE(ZAA,1)*SIZE(ZAA,2)*(KMLOC-1)
+      BOFFSETS(KMLOC) = D%OFFSETS_GEMM_MATRIX(KMLOC)
       COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM2(KMLOC)
     ENDDO
     IF(IMLOC0(1) > 0) THEN
@@ -230,7 +230,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS)
       & 2*KF_FS, NS(:), KS(:), &
       & 1.0_JPRBT, &
       & ZINPA, IIN_STRIDES0, AOFFSETS, &
-      & ZAA, SIZE(ZAA,1), BOFFSETS, &
+      & ZAA, D%LEGENDRE_MATRIX_STRIDES, BOFFSETS, &
       & 0.0_JPRBT, &
       & ZOUT, IOUT_STRIDES0, COFFSETS, &
       & D_NUMP, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR)
@@ -331,7 +331,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS)
       NS(KMLOC) = (R_NSMAX-KM+3)/2
       KS(KMLOC) = G_NDGLU(KM)
       AOFFSETS(KMLOC) = IIN_STRIDES0*D_OFFSETS_GEMM1(KMLOC)
-      BOFFSETS(KMLOC) = SIZE(ZAS,1)*SIZE(ZAS,2)*(KMLOC-1)
+      BOFFSETS(KMLOC) = D%OFFSETS_GEMM_MATRIX(KMLOC)
       COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM2(KMLOC)
     ENDDO
     IF(IMLOC0(1) > 0) THEN
@@ -350,7 +350,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS)
       & 2*KF_FS, NS(:), KS(:), &
       & 1.0_JPRBT, &
       & ZINPS, IIN_STRIDES0, AOFFSETS, &
-      & ZAS, SIZE(ZAS,1), BOFFSETS, &
+      & ZAS, D%LEGENDRE_MATRIX_STRIDES, BOFFSETS, &
       & 0.0_JPRBT, &
       & ZOUT, IOUT_STRIDES0, COFFSETS, &
       & D_NUMP, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR)
diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90
index f2738f44..8bfc2ac0 100755
--- a/src/trans/gpu/internal/leinv_mod.F90
+++ b/src/trans/gpu/internal/leinv_mod.F90
@@ -271,7 +271,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG)
       KS(KMLOC) = (R_NSMAX-KM+2)/2
       NS(KMLOC) = G_NDGLU(KM)
       AOFFSETS(KMLOC) = IIN_STRIDES0*D_OFFSETS_GEMM2(KMLOC)
-      BOFFSETS(KMLOC) = SIZE(ZAA,1)*SIZE(ZAA,2)*(KMLOC-1)
+      BOFFSETS(KMLOC) = D%OFFSETS_GEMM_MATRIX(KMLOC)
       COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM1(KMLOC)
     ENDDO
     IF(IMLOC0(1) > 0) THEN
@@ -290,7 +290,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG)
         & 2*KF_LEG, NS(:), KS(:), &
         & 1.0_JPRBT, &
         & ZINP, IIN_STRIDES0, AOFFSETS, &
-        & ZAA, SIZE(ZAA,1), BOFFSETS, &
+        & ZAA, D%LEGENDRE_MATRIX_STRIDES, BOFFSETS, &
         & 0.0_JPRBT, &
         & ZOUTA, IOUT_STRIDES0, COFFSETS, &
         & D_NUMP, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR)
@@ -411,7 +411,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG)
       KS(KMLOC) = (R_NSMAX-KM+3)/2
       NS(KMLOC) = G_NDGLU(KM)
       AOFFSETS(KMLOC) = IIN_STRIDES0*D_OFFSETS_GEMM2(KMLOC)
-      BOFFSETS(KMLOC) = SIZE(ZAS,1)*SIZE(ZAS,2)*(KMLOC-1)
+      BOFFSETS(KMLOC) = D%OFFSETS_GEMM_MATRIX(KMLOC)
       COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM1(KMLOC)
     ENDDO
     IF(IMLOC0(1) > 0) THEN
@@ -430,7 +430,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG)
       & 2*KF_LEG, NS(:), KS(:), &
       & 1.0_JPRBT, &
       & ZINP, IIN_STRIDES0, AOFFSETS, &
-      & ZAS, SIZE(ZAS,1), BOFFSETS, &
+      & ZAS, D%LEGENDRE_MATRIX_STRIDES, BOFFSETS, &
       & 0.0_JPRBT, &
       & ZOUTS, IOUT_STRIDES0, COFFSETS, &
       & D_NUMP, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR)
diff --git a/src/trans/gpu/internal/sump_trans_mod.F90 b/src/trans/gpu/internal/sump_trans_mod.F90
index 5a1de302..3c3b94d6 100755
--- a/src/trans/gpu/internal/sump_trans_mod.F90
+++ b/src/trans/gpu/internal/sump_trans_mod.F90
@@ -41,7 +41,7 @@ SUBROUTINE SUMP_TRANS
 INTEGER(KIND=JPIM) :: JM
 INTEGER(KIND=JPIM) :: JGL,IGL,IPLAT,ISENDSET,IRECVSET,JML,IPOS,IM
 INTEGER(KIND=JPIM) :: IGPTOT,IMEDIAP,IRESTM,JA,JB,IOFF,KMLOC,KM
-INTEGER(KIND=JPIB) :: OFFSET1,OFFSET2
+INTEGER(KIND=JPIB) :: OFFSET1,OFFSET2,OFFSET3
 INTEGER(KIND=JPIM),ALLOCATABLE :: IGPTOTL(:,:)
 
 REAL(KIND=JPRD),ALLOCATABLE :: ZDUM(:)
@@ -272,23 +272,33 @@ SUBROUTINE SUMP_TRANS
 
 ALLOCATE(D%OFFSETS_GEMM1(D%NUMP+1))
 ALLOCATE(D%OFFSETS_GEMM2(D%NUMP+1))
+ALLOCATE(D%OFFSETS_GEMM_MATRIX(D%NUMP+1))
+ALLOCATE(D%LEGENDRE_MATRIX_STRIDES(D%NUMP))
 
 OFFSET1 = 0
 OFFSET2 = 0
+OFFSET3 = 0
 DO KMLOC=1,D%NUMP
   KM = D%MYMS(KMLOC)
   D%OFFSETS_GEMM1(KMLOC) = OFFSET1
   D%OFFSETS_GEMM2(KMLOC) = OFFSET2
+  D%OFFSETS_GEMM_MATRIX(KMLOC) = OFFSET3
 
   !KM=0 is transformed in double precision, no need to store here
   IF (KM /= 0) THEN
     OFFSET1 = OFFSET1 + ALIGN(G%NDGLU(KM),8)
     ! N_OFFSET takes the max of the two GEMMs
     OFFSET2 = OFFSET2 + ALIGN((R%NSMAX-KM+3)/2,8)
+
+    D%LEGENDRE_MATRIX_STRIDES(KMLOC) = ALIGN(G%NDGLU(KM),8)
+    ! Note that both sizes have to be aligned because we make the GEMMs
+    ! multiples of 8
+    OFFSET3 = OFFSET3 + ALIGN((R%NSMAX-KM+3)/2,8) * D%LEGENDRE_MATRIX_STRIDES(KMLOC)
   ENDIF
 ENDDO
 D%OFFSETS_GEMM1(D%NUMP+1) = OFFSET1
 D%OFFSETS_GEMM2(D%NUMP+1) = OFFSET2
+D%OFFSETS_GEMM_MATRIX(D%NUMP+1) = OFFSET3
 
 !     ------------------------------------------------------------------
 9 FORMAT(1X,'ARRAY ',A10,' ALLOCATED ',8I8)
diff --git a/src/trans/gpu/internal/tpm_fields_gpu.F90 b/src/trans/gpu/internal/tpm_fields_gpu.F90
index 09baef27..7f0694b5 100644
--- a/src/trans/gpu/internal/tpm_fields_gpu.F90
+++ b/src/trans/gpu/internal/tpm_fields_gpu.F90
@@ -19,8 +19,8 @@ MODULE TPM_FIELDS_GPU
 
 TYPE FIELDS_GPU_TYPE
 ! scratch arrays for ltinv and ltdir and associated dimension variables
-REAL(KIND=JPRBT),ALLOCATABLE :: ZAA(:,:,:)  !! JPRL for 1/2
-REAL(KIND=JPRBT),ALLOCATABLE :: ZAS(:,:,:)  !! JPRL for 1/2
+REAL(KIND=JPRBT),ALLOCATABLE :: ZAA(:)  !! JPRL for 1/2
+REAL(KIND=JPRBT),ALLOCATABLE :: ZAS(:)  !! JPRL for 1/2
 
 ! for m=0 in ledir_mod:
 REAL(KIND=JPRD),ALLOCATABLE :: ZAA0(:,:)