Skip to content

Commit

Permalink
compact legendre polynomials
Browse files Browse the repository at this point in the history
  • Loading branch information
lukasm91 committed Dec 16, 2024
1 parent 094eca2 commit 2a87188
Show file tree
Hide file tree
Showing 9 changed files with 111 additions and 94 deletions.
3 changes: 2 additions & 1 deletion src/trans/common/internal/tpm_distr.F90
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,8 @@ MODULE TPM_DISTR
REAL(KIND=JPRD) ,ALLOCATABLE :: RWEIGHT(:) ! Weight per grid-point (if weighted distribution)
INTEGER(KIND=JPIM) ,ALLOCATABLE :: NPROCA_GP(:) ! Number of grid-points per a-set

INTEGER(KIND=JPIB), ALLOCATABLE :: OFFSETS_GEMM1(:), OFFSETS_GEMM2(:)
INTEGER(KIND=JPIB), ALLOCATABLE :: OFFSETS_GEMM1(:), OFFSETS_GEMM2(:), OFFSETS_GEMM_MATRIX(:)
INTEGER(KIND=JPIM), ALLOCATABLE :: LEGENDRE_MATRIX_STRIDES(:)

END TYPE DISTR_TYPE

Expand Down
16 changes: 8 additions & 8 deletions src/trans/gpu/algor/hicblas_cutlass.cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -154,11 +154,11 @@ class cutlass_sgemm_grouped<CutlassType::cutlass_fp32, TransA, TransB> {

} // namespace detail
template <cublasOperation_t TransA, cublasOperation_t TransB>
void cutlass_sgemm_wrapper_grouped_op(int resol_id, int blas_id, int m, int *n, int *k,
void cutlass_sgemm_wrapper_grouped_op(int resol_id, int blas_id, int m, const int *n, const int *k,
float alpha, const float *A, int lda,
int64_t *offsetsA, const float *B, int ldb,
int64_t *offsetsB, float beta, float *C,
int ldc, int64_t *offsetsC, int batchCount,
const int64_t *offsetsA, const float *B, const int *ldb,
const int64_t *offsetsB, float beta, float *C,
int ldc, const int64_t *offsetsC, int batchCount,
cudaStream_t stream,
void *growing_allocator) {
using namespace detail;
Expand All @@ -182,10 +182,10 @@ void cutlass_sgemm_wrapper_grouped_op(int resol_id, int blas_id, int m, int *n,
}

void cutlass_sgemm_wrapper_grouped(int resol_id, int blas_id, char transa, char transb,
int m, int *n, int *k, float alpha,
const float *A, int lda, int64_t *offsetsA,
const float *B, int ldb, int64_t *offsetsB, float beta,
float *C, int ldc, int64_t *offsetsC,
int m, const int *n, const int *k, float alpha,
const float *A, int lda, const int64_t *offsetsA,
const float *B, const int *ldb, const int64_t *offsetsB, float beta,
float *C, int ldc, const int64_t *offsetsC,
int batchCount, cudaStream_t stream,
void *growing_allocator) {

Expand Down
77 changes: 39 additions & 38 deletions src/trans/gpu/algor/hicblas_gemm.hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,11 @@ template <typename Gemm> void erase_from_caches(int resol_id) {

// this version is using graphs and caches the graphs
template <typename Gemm, typename Real>
void run_group_graph(Gemm &&gemm, int resol_id, int m, int *n, int *k,
Real alpha, const Real *A, int lda, int64_t *offsetsA,
const Real *B, int ldb, int64_t *offsetsB, Real beta,
Real *C, int ldc, int64_t *offsetsC, int batchCount,
void run_group_graph(Gemm &&gemm, int resol_id, int m, const int *n,
const int *k, Real alpha, const Real *A, int lda,
const int64_t *offsetsA, const Real *B, const int *ldb,
const int64_t *offsetsB, Real beta, Real *C, int ldc,
const int64_t *offsetsC, int batchCount,
hipStream_t stream, int blas_id, void *growing_allocator) {
growing_allocator_register_free_c(growing_allocator,
free_gemm_graph_cache<Gemm>);
Expand Down Expand Up @@ -138,7 +139,7 @@ void run_group_graph(Gemm &&gemm, int resol_id, int m, int *n, int *k,

HIC_CHECK(hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal));
gemm(stream, m, n[i], k[i], alpha, A + offsetsA[i], lda, B + offsetsB[i],
ldb, beta, C + offsetsC[i], ldc);
ldb[i], beta, C + offsetsC[i], ldc);
hipGraph_t my_graph;
HIC_CHECK(hipStreamEndCapture(stream, &my_graph));
hipGraphNode_t my_node;
Expand All @@ -163,16 +164,16 @@ void run_group_graph(Gemm &&gemm, int resol_id, int m, int *n, int *k,

// stupid simple gemm calls
template <typename Gemm, typename Real>
void run_group(Gemm &&gemm, int resol_id, int m, int *n, int *k, Real alpha,
const Real *A, int lda, int64_t *offsetsA, const Real *B,
int ldb, int64_t *offsetsB, Real beta, Real *C, int ldc,
int64_t *offsetsC, int batchCount, hipStream_t stream,
int = -1) {
void run_group(Gemm &&gemm, int resol_id, int m, const int *n, const int *k,
Real alpha, const Real *A, int lda, const int64_t *offsetsA,
const Real *B, const int *ldb, const int64_t *offsetsB,
Real beta, Real *C, int ldc, const int64_t *offsetsC,
int batchCount, hipStream_t stream, int = -1) {
for (int i = 0; i < batchCount; ++i) {
if (m == 0 || n[i] == 0 || k[i] == 0)
continue;
gemm(stream, m, n[i], k[i], alpha, A + offsetsA[i], lda, B + offsetsB[i],
ldb, beta, C + offsetsC[i], ldc);
ldb[i], beta, C + offsetsC[i], ldc);
}
}

Expand Down Expand Up @@ -215,14 +216,12 @@ template <typename Real> struct hipblas_gemm_grouped {

#ifndef USE_CUTLASS

void hipblas_sgemm_wrapper_grouped(int resol_id, int blas_id, char transa,
char transb, int m, int *n, int *k,
float alpha, const float *A, int lda,
int64_t *offsetsA, const float *B, int ldb,
int64_t *offsetsB, float beta, float *C,
int ldc, int64_t *offsetsC, int batchCount,
hipStream_t stream,
void *growing_allocator) {
void hipblas_sgemm_wrapper_grouped(
int resol_id, int blas_id, char transa, char transb, int m, const int *n,
const int *k, float alpha, const float *A, int lda, const int64_t *offsetsA,
const float *B, const int *ldb, const int64_t *offsetsB, float beta,
float *C, int ldc, const int64_t *offsetsC, int batchCount,
hipStream_t stream, void *growing_allocator) {

hipblasOperation_t op_t1 = HIPBLAS_OP_N, op_t2 = HIPBLAS_OP_N;
if (transa == 'T' || transa == 't')
Expand All @@ -244,12 +243,13 @@ void hipblas_sgemm_wrapper_grouped(int resol_id, int blas_id, char transa,
#endif

void hipblas_dgemm_wrapper_grouped(int resol_id, int blas_id, char transa,
char transb, int m, int *n, int *k,
double alpha, const double *A, int lda,
int64_t *offsetsA, const double *B, int ldb,
int64_t *offsetsB, double beta, double *C,
int ldc, int64_t *offsetsC, int batchCount,
hipStream_t stream, void *) {
char transb, int m, const int *n,
const int *k, double alpha, const double *A,
int lda, const int64_t *offsetsA,
double const *B, const int *ldb,
const int64_t *offsetsB, double beta,
double *C, int ldc, const int64_t *offsetsC,
int batchCount, hipStream_t stream, void *) {

hipblasOperation_t op_t1 = HIPBLAS_OP_N, op_t2 = HIPBLAS_OP_N;
if (transa == 'T' || transa == 't')
Expand Down Expand Up @@ -313,13 +313,12 @@ void hipblas_sgemm_wrapper(char transa, char transb, int m, int n, int k,
batchCount));
}

void hipblas_sgemm_wrapper_grouped(int resol_id, int blas_id, char transa,
char transb, int m, int *n, int *k,
float alpha, const float *A, int lda,
int64_t *offsetsA, const float *B, int ldb,
int64_t *offsetsB, float beta, float *C,
int ldc, int64_t *offsetsC, int batchCount,
size_t stream, void *growing_allocator) {
void hipblas_sgemm_wrapper_grouped(
int resol_id, int blas_id, char transa, char transb, int m, const int *n,
const int *k, float alpha, const float *A, int lda, const int64_t *offsetsA,
const float *B, const int *ldb, const int64_t *offsetsB, float beta,
float *C, int ldc, const int64_t *offsetsC, int batchCount, size_t stream,
void *growing_allocator) {
#ifdef USE_CUTLASS
cutlass_sgemm_wrapper_grouped(resol_id, blas_id, transa, transb, m, n, k,
alpha, A, lda, offsetsA, B, ldb, offsetsB, beta,
Expand All @@ -334,12 +333,14 @@ void hipblas_sgemm_wrapper_grouped(int resol_id, int blas_id, char transa,
}

void hipblas_dgemm_wrapper_grouped(int resol_id, int blas_id, char transa,
char transb, int m, int *n, int *k,
double alpha, const double *A, int lda,
int64_t *offsetsA, const double *B, int ldb,
int64_t *offsetsB, double beta, double *C,
int ldc, int64_t *offsetsC, int batchCount,
size_t stream, void *growing_allocator) {
char transb, int m, const int *n,
const int *k, double alpha, double const *A,
int lda, const int64_t *offsetsA,
double const *B, const int *ldb,
const int64_t *offsetsB, double beta,
double *C, int ldc, const int64_t *offsetsC,
int batchCount, size_t stream,
void *growing_allocator) {
hipblas_dgemm_wrapper_grouped(resol_id, blas_id, transa, transb, m, n, k,
alpha, A, lda, offsetsA, B, ldb, offsetsB, beta,
C, ldc, offsetsC, batchCount,
Expand Down
14 changes: 7 additions & 7 deletions src/trans/gpu/algor/hicblas_mod.F90
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,8 @@ SUBROUTINE HIP_DGEMM_GROUPED( &
&) BIND(C, NAME='hipblas_dgemm_wrapper_grouped')
USE ISO_C_BINDING, ONLY: C_CHAR, C_INT, C_DOUBLE, C_SIZE_T, C_PTR, C_INT64_T
CHARACTER(1,C_CHAR), VALUE :: CTA, CTB
INTEGER(C_INT), VALUE :: RESOL_ID, BLAS_ID, M, LDA, LDB, LDC, BATCHCOUNT
INTEGER(C_INT) :: N(*), K(*)
INTEGER(C_INT), VALUE :: RESOL_ID, BLAS_ID, M, LDA, LDC, BATCHCOUNT
INTEGER(C_INT) :: N(*), K(*), LDB(*)
INTEGER(C_INT64_T) :: OFFSETA(*), OFFSETB(*), OFFSETC(*)
REAL(C_DOUBLE), VALUE :: ALPHA,BETA
REAL(C_DOUBLE) :: A(*), B(*), C(*)
Expand All @@ -104,8 +104,8 @@ SUBROUTINE HIP_SGEMM_GROUPED( &
&) BIND(C, NAME='hipblas_sgemm_wrapper_grouped')
USE ISO_C_BINDING, ONLY: C_CHAR, C_INT, C_FLOAT, C_SIZE_T, C_PTR, C_INT64_T
CHARACTER(1,C_CHAR), VALUE :: CTA, CTB
INTEGER(C_INT), VALUE :: RESOL_ID, BLAS_ID, M, LDA, LDB, LDC, BATCHCOUNT
INTEGER(C_INT) :: N(*), K(*)
INTEGER(C_INT), VALUE :: RESOL_ID, BLAS_ID, M, LDA, LDC, BATCHCOUNT
INTEGER(C_INT) :: N(*), K(*), LDB(*)
INTEGER(C_INT64_T) :: OFFSETA(*), OFFSETB(*), OFFSETC(*)
REAL(C_FLOAT), VALUE :: ALPHA,BETA
REAL(C_FLOAT) :: A(*), B(*), C(*)
Expand Down Expand Up @@ -231,7 +231,7 @@ SUBROUTINE HIP_DGEMM_GROUPED_OVERLOAD( &
INTEGER(KIND=JPIM) :: LDA
INTEGER(KIND=JPIB) :: OFFSETA(:)
REAL(KIND=JPRD), DIMENSION(*) :: BARRAY
INTEGER(KIND=JPIM) :: LDB
INTEGER(KIND=JPIM) :: LDB(:)
INTEGER(KIND=JPIB) :: OFFSETB(:)
REAL(KIND=JPRD) :: BETA
REAL(KIND=JPRD), DIMENSION(:) :: CARRAY
Expand Down Expand Up @@ -277,8 +277,8 @@ SUBROUTINE HIP_SGEMM_GROUPED_OVERLOAD(&
REAL(KIND=JPRM), DIMENSION(:) :: AARRAY
INTEGER(KIND=JPIM) :: LDA
INTEGER(KIND=JPIB) :: OFFSETA(:)
REAL(KIND=JPRM), DIMENSION(:,:,:) :: BARRAY
INTEGER(KIND=JPIM) :: LDB
REAL(KIND=JPRM), DIMENSION(*) :: BARRAY
INTEGER(KIND=JPIM) :: LDB(:)
INTEGER(KIND=JPIB) :: OFFSETB(:)
REAL(KIND=JPRM) :: BETA
REAL(KIND=JPRM), DIMENSION(:) :: CARRAY
Expand Down
63 changes: 34 additions & 29 deletions src/trans/gpu/external/setup_trans.F90
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,&
! R. El Khatib 07-Mar-2016 Better flexibility for Legendre polynomials computation in stretched mode
! ------------------------------------------------------------------

USE PARKIND1, ONLY: JPIM, JPRB, JPRD
USE PARKIND1, ONLY: JPIM, JPRB, JPRD, JPIB
USE PARKIND_ECTRANS, ONLY: JPRBT

!ifndef INTERFACE
Expand Down Expand Up @@ -166,12 +166,10 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,&

! Local variables
INTEGER(KIND=JPIM) :: JGL,JRES,IDEF_RESOL
INTEGER(KIND=JPIM) :: JMLOC, KM, ILA, ILS, KMLOC, KDGLU, JK, I, J

INTEGER(KIND=JPIM) :: IPROC, IPROCS, ISTAN, ISTAS, ISL, IGLS, JFLD, IMLOC0(1)
INTEGER(KIND=JPIM) :: JMLOC, KM, ILA, ILS, KDGLU
INTEGER(KIND=JPIM) :: IMLOC0

LOGICAL :: LLP1,LLP2, LLSPSETUPONLY
REAL(KIND=JPRD) :: ZTIME0,ZTIME1,ZTIME2
REAL(KIND=JPHOOK) :: ZHOOK_HANDLE

CHARACTER(LEN=8) :: CENV
Expand All @@ -181,7 +179,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,&
#endif
INTEGER :: INUMDEVS, IUNIT, ISTAT, IDEV, MYGPU

#include "user_clock.intfb.h"
REAL(KIND=JPRBT), POINTER :: LOCAL_ARR(:,:)
! ------------------------------------------------------------------

IF (LHOOK) CALL DR_HOOK('SETUP_TRANS',0,ZHOOK_HANDLE)
Expand Down Expand Up @@ -470,49 +468,56 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,&

! Initialize A arrays

ALLOCATE(FG%ZAA(ALIGN(R%NDGNH,8),ALIGN((R%NTMAX+2)/2,8),D%NUMP))
ALLOCATE(FG%ZAS(ALIGN(R%NDGNH,8),ALIGN((R%NTMAX+3)/2,8),D%NUMP))
ALLOCATE(FG%ZAA(D%OFFSETS_GEMM_MATRIX(D%NUMP+1)))
ALLOCATE(FG%ZAS(D%OFFSETS_GEMM_MATRIX(D%NUMP+1)))

FG%ZAA(:,:,:) = 0._JPRBT
FG%ZAS(:,:,:) = 0._JPRBT
FG%ZAA(:) = 0._JPRBT
FG%ZAS(:) = 0._JPRBT

IMLOC0 = 0
DO JMLOC=1,D%NUMP
KM = D%MYMS(JMLOC)
KDGLU = G%NDGLU(KM)
ILA = (R%NSMAX-KM+2)/2
ILS = (R%NSMAX-KM+3)/2

FG%ZAA(1:KDGLU,1:ILA,JMLOC)=S%FA(JMLOC)%RPNMA(1:KDGLU,1:ILA)
FG%ZAS(1:KDGLU,1:ILS,JMLOC)=S%FA(JMLOC)%RPNMS(1:KDGLU,1:ILS)
ENDDO
IF (KM /= 0) THEN
CALL C_F_POINTER(C_LOC(FG%ZAA(1+D%OFFSETS_GEMM_MATRIX(JMLOC))), LOCAL_ARR, &
& (/D%LEGENDRE_MATRIX_STRIDES(JMLOC),ILA/))
LOCAL_ARR(1:KDGLU,1:ILA) = S%FA(JMLOC)%RPNMA(1:KDGLU,1:ILA)

! arrays for m=0 in ledir_mod:
IMLOC0 = FINDLOC(D%MYMS,0)
IF(IMLOC0(1) > 0) THEN
ALLOCATE(FG%ZAA0(SIZE(FG%ZAA,1),SIZE(FG%ZAA,2)))
ALLOCATE(FG%ZAS0(SIZE(FG%ZAS,1),SIZE(FG%ZAS,2)))
FG%ZAA0 = FG%ZAA(:,:,IMLOC0(1))
FG%ZAS0 = FG%ZAS(:,:,IMLOC0(1))
ENDIF
CALL C_F_POINTER(C_LOC(FG%ZAS(1+D%OFFSETS_GEMM_MATRIX(JMLOC))), LOCAL_ARR, &
& (/D%LEGENDRE_MATRIX_STRIDES(JMLOC),ILS/))
LOCAL_ARR(1:KDGLU,1:ILS) = S%FA(JMLOC)%RPNMS(1:KDGLU,1:ILS)
ELSE
IMLOC0 = JMLOC
ALLOCATE(FG%ZAA0(ALIGN(KDGLU,8),ILA))
ALLOCATE(FG%ZAS0(ALIGN(KDGLU,8),ILS))

FG%ZAA0(:,:) = 0
FG%ZAS0(:,:) = 0
FG%ZAA0(1:KDGLU,1:ILA)=S%FA(JMLOC)%RPNMA(1:KDGLU,1:ILA)
FG%ZAS0(1:KDGLU,1:ILS)=S%FA(JMLOC)%RPNMS(1:KDGLU,1:ILS)
ENDIF
ENDDO

ALLOCATE(FG%ZEPSNM(D%NUMP,0:R%NTMAX+2))
FG%ZEPSNM = 0._JPRBT
CALL PREPSNM !Initialize on the host

CALL PREPSNM
WRITE(NOUT,*)'setup_trans: sizes1 NUMP=',D%NUMP
#ifdef ACCGPU
WRITE(NOUT,*) 'Using OpenACC'
#endif
#ifdef OMPGPU
WRITE(NOUT,*) 'Using OpenMP offloading'
#endif
WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAS', C_SIZEOF(FG%ZAS(1,1,1))*SIZE(FG%ZAS)
WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAA', C_SIZEOF(FG%ZAA(1,1,1))*SIZE(FG%ZAA)
WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAS0', C_SIZEOF(FG%ZAS0(1,1))*SIZE(FG%ZAS0)
WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAA0', C_SIZEOF(FG%ZAA0(1,1))*SIZE(FG%ZAA0)
WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZEPSNM', C_SIZEOF(FG%ZEPSNM(1,1))*SIZE(FG%ZEPSNM)
WRITE(NOUT,'(A10,":",I11,"B")') 'FG%ZAS', C_SIZEOF(FG%ZAS(1))*SIZE(FG%ZAS)
WRITE(NOUT,'(A10,":",I11,"B")') 'FG%ZAA', C_SIZEOF(FG%ZAA(1))*SIZE(FG%ZAA)
WRITE(NOUT,'(A10,":",I11,"B")') 'FG%ZAS0', C_SIZEOF(FG%ZAS0(1,1))*SIZE(FG%ZAS0)
WRITE(NOUT,'(A10,":",I11,"B")') 'FG%ZAA0', C_SIZEOF(FG%ZAA0(1,1))*SIZE(FG%ZAA0)
WRITE(NOUT,'(A10,":",I11,"B")') 'FG%ZEPSNM', C_SIZEOF(FG%ZEPSNM(1,1))*SIZE(FG%ZEPSNM)

IF (IMLOC0(1) > 0) THEN
IF (IMLOC0 > 0) THEN
#ifdef ACCGPU
!$ACC ENTER DATA COPYIN(FG%ZAA0,FG%ZAS0) ASYNC(1)
#endif
Expand Down
8 changes: 4 additions & 4 deletions src/trans/gpu/internal/ledir_mod.F90
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS)
NS(KMLOC) = (R_NSMAX-KM+2)/2
KS(KMLOC) = G_NDGLU(KM)
AOFFSETS(KMLOC) = IIN_STRIDES0*D_OFFSETS_GEMM1(KMLOC)
BOFFSETS(KMLOC) = SIZE(ZAA,1)*SIZE(ZAA,2)*(KMLOC-1)
BOFFSETS(KMLOC) = D%OFFSETS_GEMM_MATRIX(KMLOC)
COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM2(KMLOC)
ENDDO
IF(IMLOC0(1) > 0) THEN
Expand All @@ -230,7 +230,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS)
& 2*KF_FS, NS(:), KS(:), &
& 1.0_JPRBT, &
& ZINPA, IIN_STRIDES0, AOFFSETS, &
& ZAA, SIZE(ZAA,1), BOFFSETS, &
& ZAA, D%LEGENDRE_MATRIX_STRIDES, BOFFSETS, &
& 0.0_JPRBT, &
& ZOUT, IOUT_STRIDES0, COFFSETS, &
& D_NUMP, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR)
Expand Down Expand Up @@ -331,7 +331,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS)
NS(KMLOC) = (R_NSMAX-KM+3)/2
KS(KMLOC) = G_NDGLU(KM)
AOFFSETS(KMLOC) = IIN_STRIDES0*D_OFFSETS_GEMM1(KMLOC)
BOFFSETS(KMLOC) = SIZE(ZAS,1)*SIZE(ZAS,2)*(KMLOC-1)
BOFFSETS(KMLOC) = D%OFFSETS_GEMM_MATRIX(KMLOC)
COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM2(KMLOC)
ENDDO
IF(IMLOC0(1) > 0) THEN
Expand All @@ -350,7 +350,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS)
& 2*KF_FS, NS(:), KS(:), &
& 1.0_JPRBT, &
& ZINPS, IIN_STRIDES0, AOFFSETS, &
& ZAS, SIZE(ZAS,1), BOFFSETS, &
& ZAS, D%LEGENDRE_MATRIX_STRIDES, BOFFSETS, &
& 0.0_JPRBT, &
& ZOUT, IOUT_STRIDES0, COFFSETS, &
& D_NUMP, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR)
Expand Down
8 changes: 4 additions & 4 deletions src/trans/gpu/internal/leinv_mod.F90
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG)
KS(KMLOC) = (R_NSMAX-KM+2)/2
NS(KMLOC) = G_NDGLU(KM)
AOFFSETS(KMLOC) = IIN_STRIDES0*D_OFFSETS_GEMM2(KMLOC)
BOFFSETS(KMLOC) = SIZE(ZAA,1)*SIZE(ZAA,2)*(KMLOC-1)
BOFFSETS(KMLOC) = D%OFFSETS_GEMM_MATRIX(KMLOC)
COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM1(KMLOC)
ENDDO
IF(IMLOC0(1) > 0) THEN
Expand All @@ -290,7 +290,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG)
& 2*KF_LEG, NS(:), KS(:), &
& 1.0_JPRBT, &
& ZINP, IIN_STRIDES0, AOFFSETS, &
& ZAA, SIZE(ZAA,1), BOFFSETS, &
& ZAA, D%LEGENDRE_MATRIX_STRIDES, BOFFSETS, &
& 0.0_JPRBT, &
& ZOUTA, IOUT_STRIDES0, COFFSETS, &
& D_NUMP, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR)
Expand Down Expand Up @@ -411,7 +411,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG)
KS(KMLOC) = (R_NSMAX-KM+3)/2
NS(KMLOC) = G_NDGLU(KM)
AOFFSETS(KMLOC) = IIN_STRIDES0*D_OFFSETS_GEMM2(KMLOC)
BOFFSETS(KMLOC) = SIZE(ZAS,1)*SIZE(ZAS,2)*(KMLOC-1)
BOFFSETS(KMLOC) = D%OFFSETS_GEMM_MATRIX(KMLOC)
COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM1(KMLOC)
ENDDO
IF(IMLOC0(1) > 0) THEN
Expand All @@ -430,7 +430,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG)
& 2*KF_LEG, NS(:), KS(:), &
& 1.0_JPRBT, &
& ZINP, IIN_STRIDES0, AOFFSETS, &
& ZAS, SIZE(ZAS,1), BOFFSETS, &
& ZAS, D%LEGENDRE_MATRIX_STRIDES, BOFFSETS, &
& 0.0_JPRBT, &
& ZOUTS, IOUT_STRIDES0, COFFSETS, &
& D_NUMP, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR)
Expand Down
Loading

0 comments on commit 2a87188

Please sign in to comment.