Skip to content

Commit

Permalink
compact legendre polynomials
Browse files Browse the repository at this point in the history
  • Loading branch information
lukasm91 committed Oct 16, 2024
1 parent ed88c5a commit 058dd37
Show file tree
Hide file tree
Showing 9 changed files with 85 additions and 79 deletions.
3 changes: 2 additions & 1 deletion src/trans/common/internal/tpm_distr.F90
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,8 @@ MODULE TPM_DISTR
REAL(KIND=JPRD) ,ALLOCATABLE :: RWEIGHT(:) ! Weight per grid-point (if weighted distribution)
INTEGER(KIND=JPIM) ,ALLOCATABLE :: NPROCA_GP(:) ! Number of grid-points per a-set

INTEGER(KIND=JPIM), ALLOCATABLE :: OFFSETS_GEMM1(:), OFFSETS_GEMM2(:)
INTEGER(KIND=JPIM), ALLOCATABLE :: OFFSETS_GEMM1(:), OFFSETS_GEMM2(:), OFFSETS_GEMM_MATRIX(:)
INTEGER(KIND=JPIM), ALLOCATABLE :: LEGENDRE_MATRIX_STRIDES(:)

END TYPE DISTR_TYPE

Expand Down
4 changes: 2 additions & 2 deletions src/trans/gpu/algor/hicblas_cutlass.cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ class cutlass_sgemm_grouped<CutlassType::cutlass_fp32, TransA, TransB> {
template <cublasOperation_t TransA, cublasOperation_t TransB>
void cutlass_sgemm_wrapper_grouped_op(int blas_id, int m, int *n, int *k,
float alpha, const float *A, int lda,
int *offsetsA, const float *B, int ldb,
int *offsetsA, const float *B, int *ldb,
int *offsetsB, float beta, float *C,
int ldc, int *offsetsC, int batchCount,
cudaStream_t stream,
Expand Down Expand Up @@ -179,7 +179,7 @@ void cutlass_sgemm_wrapper_grouped_op(int blas_id, int m, int *n, int *k,
void cutlass_sgemm_wrapper_grouped(int blas_id, char transa, char transb,
int m, int *n, int *k, float alpha,
const float *A, int lda, int *offsetsA,
const float *B, int ldb, int *offsetsB, float beta,
const float *B, int *ldb, int *offsetsB, float beta,
float *C, int ldc, int *offsetsC,
int batchCount, cudaStream_t stream,
void *growing_allocator) {
Expand Down
29 changes: 15 additions & 14 deletions src/trans/gpu/algor/hicblas_gemm.hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,11 +63,12 @@ template <typename Gemm, typename Real> void free_gemm_cache(float *, size_t) {

// this version is using graphs and caches the graphs
template <typename Gemm, typename Real>
void run_group_graph(Gemm &&gemm, int m, int *n, int *k, Real alpha,
const Real *A, int lda, int *offsetsA, const Real *B,
int ldb, int *offsetsB, Real beta, Real *C, int ldc,
int *offsetsC, int batchCount, hipStream_t stream,
int blas_id, void *growing_allocator) {
void run_group_graph(Gemm &&gemm, int const m, int const *n, int const *k,
Real alpha, const Real *A, int lda, int const *offsetsA,
const Real *B, int const *ldb, int const *offsetsB, Real beta,
Real *C, int ldc, int const *offsetsC, int batchCount,
hipStream_t stream, int blas_id,
void *growing_allocator) {
growing_allocator_register_free_c(growing_allocator,
free_gemm_cache<Gemm, Real>);

Expand All @@ -86,7 +87,7 @@ void run_group_graph(Gemm &&gemm, int m, int *n, int *k, Real alpha,
// the plan is cached, but the pointers are not correct. we remove and
// delete the graph, but we keep the hipblas handles, if this happens more
// often, we should cache this...
std::cout << "WARNING GEMM: POINTER CHANGE - Graph recreation might be slow." << std::endl;
std::cout << "WARNING GEMM: POINTER CHANGE - Graph recreation might be slow.\n";
std::cout << "We have an entry with key {m=" << m << ", blas_id=" << blas_id
<< "}\n";
std::cout << "Pointers: " << std::get<0>(ptrs->second) << ", "
Expand All @@ -111,7 +112,7 @@ void run_group_graph(Gemm &&gemm, int m, int *n, int *k, Real alpha,

HIC_CHECK(hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal));
gemm(stream, m, n[i], k[i], alpha, A + offsetsA[i], lda, B + offsetsB[i],
ldb, beta, C + offsetsC[i], ldc);
ldb[i], beta, C + offsetsC[i], ldc);
hipGraph_t my_graph;
HIC_CHECK(hipStreamEndCapture(stream, &my_graph));
hipGraphNode_t my_node;
Expand All @@ -133,14 +134,14 @@ void run_group_graph(Gemm &&gemm, int m, int *n, int *k, Real alpha,
// stupid simple gemm calls
template <typename Gemm, typename Real>
void run_group(Gemm &&gemm, int m, int *n, int *k, Real alpha, const Real *A,
int lda, int *offsetsA, const Real *B, int ldb, int *offsetsB,
int lda, int *offsetsA, const Real *B, int *ldb, int *offsetsB,
Real beta, Real *C, int ldc, int *offsetsC, int batchCount,
hipStream_t stream, int = -1) {
for (int i = 0; i < batchCount; ++i) {
if (m == 0 || n[i] == 0 || k[i] == 0)
continue;
gemm(stream, m, n[i], k[i], alpha, A + offsetsA[i], lda, B + offsetsB[i], ldb,
beta, C + offsetsC[i], ldc);
gemm(stream, m, n[i], k[i], alpha, A + offsetsA[i], lda, B + offsetsB[i],
ldb[i], beta, C + offsetsC[i], ldc);
}
}

Expand Down Expand Up @@ -187,7 +188,7 @@ template <typename Real> struct hipblas_gemm_grouped {
void hipblas_sgemm_wrapper_grouped(int blas_id, char transa, char transb,
int m, int *n, int *k, float alpha,
const float *A, int lda, int *offsetsA,
const float *B, int ldb, int *offsetsB, float beta,
const float *B, int *ldb, int *offsetsB, float beta,
float *C, int ldc, int *offsetsC,
int batchCount, hipStream_t stream,
void *growing_allocator) {
Expand Down Expand Up @@ -216,7 +217,7 @@ void hipblas_dgemm_wrapper_grouped(int blas_id, char transa, char transb,
int m, int *n, int *k,
double alpha,
const double *A, int lda, int *offsetsA,
const double *B, int ldb, int *offsetsB,
const double *B, int *ldb, int *offsetsB,
double beta,
double *C, int ldc, int *offsetsC,
int batchCount, hipStream_t stream, void *) {
Expand Down Expand Up @@ -293,7 +294,7 @@ void hipblas_sgemm_wrapper (char transa, char transb,
void hipblas_sgemm_wrapper_grouped(int blas_id, char transa, char transb,
int m, int *n, int *k, float alpha,
const float *A, int lda, int *offsetsA,
const float *B, int ldb, int *offsetsB, float beta,
const float *B, int *ldb, int *offsetsB, float beta,
float *C, int ldc, int *offsetsC,
int batchCount, size_t stream,
void *growing_allocator) {
Expand All @@ -314,7 +315,7 @@ void hipblas_sgemm_wrapper_grouped(int blas_id, char transa, char transb,
void hipblas_dgemm_wrapper_grouped(int blas_id, char transa, char transb,
int m, int *n, int *k, double alpha,
const double *A, int lda, int *offsetsA,
const double *B, int ldb, int *offsetsB, double beta,
const double *B, int *ldb, int *offsetsB, double beta,
double *C, int ldc, int *offsetsC,
int batchCount, size_t stream,
void *growing_allocator) {
Expand Down
16 changes: 8 additions & 8 deletions src/trans/gpu/algor/hicblas_mod.F90
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,8 @@ SUBROUTINE HIP_DGEMM_GROUPED( &
&) BIND(C, NAME='hipblas_dgemm_wrapper_grouped')
USE ISO_C_BINDING, ONLY: C_CHAR, C_INT, C_DOUBLE, C_SIZE_T, C_PTR
CHARACTER(1,C_CHAR), VALUE :: CTA, CTB
INTEGER(C_INT), VALUE :: BLAS_ID, M, LDA, LDB, LDC, BATCHCOUNT
INTEGER(C_INT) :: N(*), K(*), OFFSETA(*), OFFSETB(*), OFFSETC(*)
INTEGER(C_INT), VALUE :: BLAS_ID, M, LDA, LDC, BATCHCOUNT
INTEGER(C_INT) :: N(*), K(*), LDB(*), OFFSETA(*), OFFSETB(*), OFFSETC(*)
REAL(C_DOUBLE), VALUE :: ALPHA,BETA
REAL(C_DOUBLE) :: A(*), B(*), C(*)
INTEGER(KIND=C_SIZE_T) :: STREAM
Expand All @@ -97,8 +97,8 @@ SUBROUTINE HIP_SGEMM_GROUPED( &
&) BIND(C, NAME='hipblas_sgemm_wrapper_grouped')
USE ISO_C_BINDING, ONLY: C_CHAR, C_INT, C_FLOAT, C_SIZE_T, C_PTR
CHARACTER(1,C_CHAR), VALUE :: CTA, CTB
INTEGER(C_INT), VALUE :: BLAS_ID, M, LDA, LDB, LDC, BATCHCOUNT
INTEGER(C_INT) :: N(*), K(*), OFFSETA(*), OFFSETB(*), OFFSETC(*)
INTEGER(C_INT), VALUE :: BLAS_ID, M, LDA, LDC, BATCHCOUNT
INTEGER(C_INT) :: N(*), K(*), LDB(*), OFFSETA(*), OFFSETB(*), OFFSETC(*)
REAL(C_FLOAT), VALUE :: ALPHA,BETA
REAL(C_FLOAT) :: A(*), B(*), C(*)
INTEGER(KIND=C_SIZE_T) :: STREAM
Expand Down Expand Up @@ -221,8 +221,8 @@ SUBROUTINE HIP_DGEMM_GROUPED_OVERLOAD( &
REAL(KIND=JPRD), DIMENSION(:) :: AARRAY
INTEGER(KIND=JPIM) :: LDA
INTEGER(KIND=JPIM) :: OFFSETA(:)
REAL(KIND=JPRD), DIMENSION(*) :: BARRAY
INTEGER(KIND=JPIM) :: LDB
REAL(KIND=JPRD), DIMENSION(:) :: BARRAY
INTEGER(KIND=JPIM) :: LDB(:)
INTEGER(KIND=JPIM) :: OFFSETB(:)
REAL(KIND=JPRD) :: BETA
REAL(KIND=JPRD), DIMENSION(:) :: CARRAY
Expand Down Expand Up @@ -267,8 +267,8 @@ SUBROUTINE HIP_SGEMM_GROUPED_OVERLOAD(&
REAL(KIND=JPRM), DIMENSION(:) :: AARRAY
INTEGER(KIND=JPIM) :: LDA
INTEGER(KIND=JPIM) :: OFFSETA(:)
REAL(KIND=JPRM), DIMENSION(:,:,:) :: BARRAY
INTEGER(KIND=JPIM) :: LDB
REAL(KIND=JPRM), DIMENSION(:) :: BARRAY
INTEGER(KIND=JPIM) :: LDB(:)
INTEGER(KIND=JPIM) :: OFFSETB(:)
REAL(KIND=JPRM) :: BETA
REAL(KIND=JPRM), DIMENSION(:) :: CARRAY
Expand Down
76 changes: 35 additions & 41 deletions src/trans/gpu/external/setup_trans.F90
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,&
! R. El Khatib 07-Mar-2016 Better flexibility for Legendre polynomials computation in stretched mode
! ------------------------------------------------------------------

USE PARKIND1, ONLY: JPIM, JPRB, JPRD
USE PARKIND1, ONLY: JPIM, JPRB, JPRD, JPIB
USE PARKIND_ECTRANS, ONLY: JPRBT

!ifndef INTERFACE
Expand Down Expand Up @@ -186,6 +186,8 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,&
#endif
INTEGER :: INUMDEVS, IUNIT, ISTAT, IDEV, MYGPU

REAL(KIND=JPRBT), POINTER :: LOCAL_ARR(:,:)

#include "user_clock.intfb.h"
! ------------------------------------------------------------------

Expand Down Expand Up @@ -499,33 +501,49 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,&

! Initialize A arrays

ALLOCATE(ZAA(ALIGN(R%NDGNH,8),ALIGN((R%NTMAX+2)/2,8),D%NUMP))
ALLOCATE(ZAS(ALIGN(R%NDGNH,8),ALIGN((R%NTMAX+3)/2,8),D%NUMP))
ALLOCATE(ZAA(D%OFFSETS_GEMM_MATRIX(D%NUMP+1)))
ALLOCATE(ZAS(D%OFFSETS_GEMM_MATRIX(D%NUMP+1)))

WRITE(NOUT,*)'setup_trans: sizes1 NUMP=',D%NUMP
WRITE(NOUT,*)'ZAS:',size(ZAS)
WRITE(NOUT,*)'ZAA:',size(ZAA)
WRITE(NOUT,'("ZAA: ", I, " B")') SIZE(ZAA,KIND=JPIB)*STORAGE_SIZE(ZAA(1))/8
WRITE(NOUT,'("ZAS: ", I, " B")') SIZE(ZAS,KIND=JPIB)*STORAGE_SIZE(ZAS(1))/8

! Be very careful here not to touch ZAA, ZAS, ZAA0 and ZAS0 on the host!

ZAA(:,:,:) = 0._JPRBT
ZAS(:,:,:) = 0._JPRBT
ZAA(:) = 0._JPRBT
ZAS(:) = 0._JPRBT

KMLOC0 = 0
DO JMLOC=1,D%NUMP
KM = D%MYMS(JMLOC)
KDGLU = G%NDGLU(KM)
ILA = (R%NSMAX-KM+2)/2
ILS = (R%NSMAX-KM+3)/2

ZAA(1:KDGLU,1:ILA,JMLOC)=S%FA(JMLOC)%RPNMA(1:KDGLU,1:ILA)
ZAS(1:KDGLU,1:ILS,JMLOC)=S%FA(JMLOC)%RPNMS(1:KDGLU,1:ILS)
ENDDO

! permanent copy of Legendre polynomials into device
IF (KM /= 0) THEN
CALL C_F_POINTER(C_LOC(ZAA(1+D%OFFSETS_GEMM_MATRIX(JMLOC))), LOCAL_ARR, &
& (/D%LEGENDRE_MATRIX_STRIDES(JMLOC),ILA/))
LOCAL_ARR(1:KDGLU,1:ILA) = S%FA(JMLOC)%RPNMA(1:KDGLU,1:ILA)

#ifdef ACCGPU
!$ACC ENTER DATA COPYIN(ZAA,ZAS)
#endif
#ifdef OMPGPU
#endif
CALL C_F_POINTER(C_LOC(ZAS(1+D%OFFSETS_GEMM_MATRIX(JMLOC))), LOCAL_ARR, &
& (/D%LEGENDRE_MATRIX_STRIDES(JMLOC),ILS/))
LOCAL_ARR(1:KDGLU,1:ILS) = S%FA(JMLOC)%RPNMS(1:KDGLU,1:ILS)
ELSE
KMLOC0 = JMLOC
ALLOCATE(ZAA0(ALIGN(KDGLU,8),ILA))
ALLOCATE(ZAS0(ALIGN(KDGLU,8),ILS))

ZAA0(:,:) = 0
ZAS0(:,:) = 0
ZAA0(1:KDGLU,1:ILA)=S%FA(JMLOC)%RPNMA(1:KDGLU,1:ILA)
ZAS0(1:KDGLU,1:ILS)=S%FA(JMLOC)%RPNMS(1:KDGLU,1:ILS)
WRITE(NOUT,'("ZAA0: ", I, " B")') SIZE(ZAA,KIND=JPIB)*STORAGE_SIZE(ZAA0(1,1))/8
WRITE(NOUT,'("ZAS0: ", I, " B")') SIZE(ZAA,KIND=JPIB)*STORAGE_SIZE(ZAS0(1,1))/8
!$ACC ENTER DATA COPYIN(ZAA0,ZAS0) ASYNC(1)
ENDIF
ENDDO
!$ACC ENTER DATA COPYIN(ZAA,ZAS) ASYNC(1)
!$ACC WAIT(1)

ALLOCATE(ZEPSNM(D%NUMP,0:R%NTMAX+2))
WRITE(NOUT,*)'ZEPSNM :',SIZE(ZEPSNM)
Expand Down Expand Up @@ -653,33 +671,9 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,&
D_NUMP=D%NUMP
D_NDGL_FS=D%NDGL_FS

KMLOC0 = -1
DO I=1,SIZE(D%MYMS)
D_MYMS(I)=D%MYMS(I)
IF(D_MYMS(I) == 0) KMLOC0 = I
end DO

! arrays for m=0 in ledir_mod:
IF(KMLOC0 >= 0) THEN
ALLOCATE(ZAA0(SIZE(ZAA,1),SIZE(ZAA,2)))
ALLOCATE(ZAS0(SIZE(ZAS,1),SIZE(ZAS,2)))
ZAA0 = ZAA(:,:,KMLOC0)
ZAS0 = ZAS(:,:,KMLOC0)
#ifdef ACCGPU
!$ACC ENTER DATA COPYIN(ZAA0,ZAS0)
#endif
#ifdef OMPGPU
!$OMP TARGET ENTER DATA MAP(TO:ZAA0,ZAS0)
#endif
WRITE(NOUT,*) 'GPU arrays for m=0 successfully allocated'
#ifdef ACCGPU
WRITE(NOUT,*) 'Using OpenACC'
#endif
#ifdef OMPGPU
WRITE(NOUT,*) 'Using OpenMP offloading'
#endif
ENDIF

DO I=1,SIZE(F%RW)
F_RW(I)=F%RW(I)
END DO
Expand Down
10 changes: 5 additions & 5 deletions src/trans/gpu/internal/ledir_mod.F90
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS)
USE TPM_DIM, ONLY: R_NDGNH,R_NSMAX,R_NTMAX,R_NDGL
USE TPM_GEOMETRY, ONLY: G_NDGLU
USE TPM_FIELDS_FLAT, ONLY: ZAA,ZAS,ZAA0,ZAS0,KMLOC0
USE TPM_DISTR, ONLY: D_NUMP, D_MYMS, D_OFFSETS_GEMM1, D_OFFSETS_GEMM2
USE TPM_DISTR, ONLY: D_NUMP, D_MYMS, D_OFFSETS_GEMM1, D_OFFSETS_GEMM2, D
USE HICBLAS_MOD, ONLY: HIP_DGEMM_BATCHED_OVERLOAD, &
& HIP_DGEMM_GROUPED_OVERLOAD, HIP_SGEMM_GROUPED_OVERLOAD
USE MPL_MODULE, ONLY: MPL_BARRIER,MPL_ALL_MS_COMM
Expand Down Expand Up @@ -196,7 +196,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS)
NS(KMLOC) = (R_NSMAX-KM+2)/2
KS(KMLOC) = G_NDGLU(KM)
AOFFSETS(KMLOC) = IIN_STRIDES0*D_OFFSETS_GEMM1(KMLOC)
BOFFSETS(KMLOC) = SIZE(ZAA,1)*SIZE(ZAA,2)*(KMLOC-1)
BOFFSETS(KMLOC) = D%OFFSETS_GEMM_MATRIX(KMLOC)
COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM2(KMLOC)
ENDDO
IF(KMLOC0 > 0) THEN
Expand All @@ -215,7 +215,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS)
& 2*KF_FS, NS(:), KS(:), &
& 1.0_JPRBT, &
& ZINPA, IIN_STRIDES0, AOFFSETS, &
& ZAA, SIZE(ZAA,1), BOFFSETS, &
& ZAA, D%LEGENDRE_MATRIX_STRIDES, BOFFSETS, &
& 0.0_JPRBT, &
& ZOUT, IOUT_STRIDES0, COFFSETS, &
& D_NUMP, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR)
Expand Down Expand Up @@ -311,7 +311,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS)
NS(KMLOC) = (R_NSMAX-KM+3)/2
KS(KMLOC) = G_NDGLU(KM)
AOFFSETS(KMLOC) = IIN_STRIDES0*D_OFFSETS_GEMM1(KMLOC)
BOFFSETS(KMLOC) = SIZE(ZAS,1)*SIZE(ZAS,2)*(KMLOC-1)
BOFFSETS(KMLOC) = D%OFFSETS_GEMM_MATRIX(KMLOC)
COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM2(KMLOC)
ENDDO
IF(KMLOC0 > 0) THEN
Expand All @@ -330,7 +330,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS)
& 2*KF_FS, NS(:), KS(:), &
& 1.0_JPRBT, &
& ZINPS, IIN_STRIDES0, AOFFSETS, &
& ZAS, SIZE(ZAS,1), BOFFSETS, &
& ZAS, D%LEGENDRE_MATRIX_STRIDES, BOFFSETS, &
& 0.0_JPRBT, &
& ZOUT, IOUT_STRIDES0, COFFSETS, &
& D_NUMP, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR)
Expand Down
10 changes: 5 additions & 5 deletions src/trans/gpu/internal/leinv_mod.F90
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG)
USE TPM_DIM, ONLY: R_NDGNH, R_NSMAX, R_NDGL
USE TPM_GEOMETRY, ONLY: G_NDGLU
USE TPM_FIELDS_FLAT, ONLY: ZAA, ZAS, ZAA0, ZAS0, KMLOC0
USE TPM_DISTR, ONLY: D_NUMP, D_MYMS, MYPROC, D_OFFSETS_GEMM1, D_OFFSETS_GEMM2
USE TPM_DISTR, ONLY: D_NUMP, D_MYMS, MYPROC, D_OFFSETS_GEMM1, D_OFFSETS_GEMM2, D
USE HICBLAS_MOD, ONLY: HIP_DGEMM_BATCHED_OVERLOAD, &
& HIP_DGEMM_GROUPED_OVERLOAD, HIP_SGEMM_GROUPED_OVERLOAD
USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_INT
Expand Down Expand Up @@ -252,7 +252,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG)
KS(KMLOC) = (R_NSMAX-KM+2)/2
NS(KMLOC) = G_NDGLU(KM)
AOFFSETS(KMLOC) = IIN_STRIDES0*D_OFFSETS_GEMM2(KMLOC)
BOFFSETS(KMLOC) = SIZE(ZAA,1)*SIZE(ZAA,2)*(KMLOC-1)
BOFFSETS(KMLOC) = D%OFFSETS_GEMM_MATRIX(KMLOC)
COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM1(KMLOC)
ENDDO
IF(KMLOC0 > 0) THEN
Expand All @@ -271,7 +271,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG)
& 2*KF_LEG, NS(:), KS(:), &
& 1.0_JPRBT, &
& ZINP, IIN_STRIDES0, AOFFSETS, &
& ZAA, SIZE(ZAA,1), BOFFSETS, &
& ZAA, D%LEGENDRE_MATRIX_STRIDES, BOFFSETS, &
& 0.0_JPRBT, &
& ZOUTA, IOUT_STRIDES0, COFFSETS, &
& D_NUMP, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR)
Expand Down Expand Up @@ -386,7 +386,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG)
KS(KMLOC) = (R_NSMAX-KM+3)/2
NS(KMLOC) = G_NDGLU(KM)
AOFFSETS(KMLOC) = IIN_STRIDES0*D_OFFSETS_GEMM2(KMLOC)
BOFFSETS(KMLOC) = SIZE(ZAS,1)*SIZE(ZAS,2)*(KMLOC-1)
BOFFSETS(KMLOC) = D%OFFSETS_GEMM_MATRIX(KMLOC)
COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM1(KMLOC)
ENDDO
IF(KMLOC0 > 0) THEN
Expand All @@ -405,7 +405,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG)
& 2*KF_LEG, NS(:), KS(:), &
& 1.0_JPRBT, &
& ZINP, IIN_STRIDES0, AOFFSETS, &
& ZAS, SIZE(ZAS,1), BOFFSETS, &
& ZAS, D%LEGENDRE_MATRIX_STRIDES, BOFFSETS, &
& 0.0_JPRBT, &
& ZOUTS, IOUT_STRIDES0, COFFSETS, &
& D_NUMP, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR)
Expand Down
Loading

0 comments on commit 058dd37

Please sign in to comment.