Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Compact legendre polynomials #164

Merged
merged 6 commits into from
Jan 23, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion src/trans/common/internal/tpm_distr.F90
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,8 @@ MODULE TPM_DISTR
REAL(KIND=JPRD) ,ALLOCATABLE :: RWEIGHT(:) ! Weight per grid-point (if weighted distribution)
INTEGER(KIND=JPIM) ,ALLOCATABLE :: NPROCA_GP(:) ! Number of grid-points per a-set

INTEGER(KIND=JPIB), ALLOCATABLE :: OFFSETS_GEMM1(:), OFFSETS_GEMM2(:)
INTEGER(KIND=JPIB), ALLOCATABLE :: OFFSETS_GEMM1(:), OFFSETS_GEMM2(:), OFFSETS_GEMM_MATRIX(:)
INTEGER(KIND=JPIM), ALLOCATABLE :: LEGENDRE_MATRIX_STRIDES(:)

END TYPE DISTR_TYPE

Expand Down
16 changes: 8 additions & 8 deletions src/trans/gpu/algor/hicblas_cutlass.cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -154,11 +154,11 @@ class cutlass_sgemm_grouped<CutlassType::cutlass_fp32, TransA, TransB> {

} // namespace detail
template <cublasOperation_t TransA, cublasOperation_t TransB>
void cutlass_sgemm_wrapper_grouped_op(int resol_id, int blas_id, int m, int *n, int *k,
void cutlass_sgemm_wrapper_grouped_op(int resol_id, int blas_id, int m, const int *n, const int *k,
float alpha, const float *A, int lda,
int64_t *offsetsA, const float *B, int ldb,
int64_t *offsetsB, float beta, float *C,
int ldc, int64_t *offsetsC, int batchCount,
const int64_t *offsetsA, const float *B, const int *ldb,
samhatfield marked this conversation as resolved.
Show resolved Hide resolved
const int64_t *offsetsB, float beta, float *C,
int ldc, const int64_t *offsetsC, int batchCount,
cudaStream_t stream,
void *growing_allocator) {
using namespace detail;
Expand All @@ -182,10 +182,10 @@ void cutlass_sgemm_wrapper_grouped_op(int resol_id, int blas_id, int m, int *n,
}

void cutlass_sgemm_wrapper_grouped(int resol_id, int blas_id, char transa, char transb,
int m, int *n, int *k, float alpha,
const float *A, int lda, int64_t *offsetsA,
const float *B, int ldb, int64_t *offsetsB, float beta,
float *C, int ldc, int64_t *offsetsC,
int m, const int *n, const int *k, float alpha,
const float *A, int lda, const int64_t *offsetsA,
const float *B, const int *ldb, const int64_t *offsetsB, float beta,
float *C, int ldc, const int64_t *offsetsC,
int batchCount, cudaStream_t stream,
void *growing_allocator) {

Expand Down
79 changes: 40 additions & 39 deletions src/trans/gpu/algor/hicblas_gemm.hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ template <typename Gemm> auto &get_graph_cache() {
template <typename Gemm> auto &get_ptr_cache() {
using real_t = typename Gemm::real_type;
static std::unordered_map<
cache_key, std::tuple<real_t const *, real_t const *, real_t const *>>
cache_key, std::tuple<const real_t *, const real_t *, const real_t *>>
ptrCache;
return ptrCache;
}
Expand Down Expand Up @@ -89,10 +89,11 @@ template <typename Gemm> void erase_from_caches(int resol_id) {

// this version is using graphs and caches the graphs
template <typename Gemm, typename Real>
void run_group_graph(Gemm &&gemm, int resol_id, int m, int *n, int *k,
Real alpha, const Real *A, int lda, int64_t *offsetsA,
const Real *B, int ldb, int64_t *offsetsB, Real beta,
Real *C, int ldc, int64_t *offsetsC, int batchCount,
void run_group_graph(Gemm &&gemm, int resol_id, int m, const int *n,
const int *k, Real alpha, const Real *A, int lda,
const int64_t *offsetsA, const Real *B, const int *ldb,
const int64_t *offsetsB, Real beta, Real *C, int ldc,
const int64_t *offsetsC, int batchCount,
hipStream_t stream, int blas_id, void *growing_allocator) {
growing_allocator_register_free_c(growing_allocator,
free_gemm_graph_cache<Gemm>);
Expand Down Expand Up @@ -138,7 +139,7 @@ void run_group_graph(Gemm &&gemm, int resol_id, int m, int *n, int *k,

HIC_CHECK(hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal));
gemm(stream, m, n[i], k[i], alpha, A + offsetsA[i], lda, B + offsetsB[i],
ldb, beta, C + offsetsC[i], ldc);
ldb[i], beta, C + offsetsC[i], ldc);
hipGraph_t my_graph;
HIC_CHECK(hipStreamEndCapture(stream, &my_graph));
hipGraphNode_t my_node;
Expand All @@ -163,16 +164,16 @@ void run_group_graph(Gemm &&gemm, int resol_id, int m, int *n, int *k,

// stupid simple gemm calls
template <typename Gemm, typename Real>
void run_group(Gemm &&gemm, int resol_id, int m, int *n, int *k, Real alpha,
const Real *A, int lda, int64_t *offsetsA, const Real *B,
int ldb, int64_t *offsetsB, Real beta, Real *C, int ldc,
int64_t *offsetsC, int batchCount, hipStream_t stream,
int = -1) {
void run_group(Gemm &&gemm, int resol_id, int m, const int *n, const int *k,
Real alpha, const Real *A, int lda, const int64_t *offsetsA,
const Real *B, const int *ldb, const int64_t *offsetsB,
Real beta, Real *C, int ldc, const int64_t *offsetsC,
int batchCount, hipStream_t stream, int = -1) {
for (int i = 0; i < batchCount; ++i) {
if (m == 0 || n[i] == 0 || k[i] == 0)
continue;
gemm(stream, m, n[i], k[i], alpha, A + offsetsA[i], lda, B + offsetsB[i],
ldb, beta, C + offsetsC[i], ldc);
ldb[i], beta, C + offsetsC[i], ldc);
}
}

Expand Down Expand Up @@ -215,14 +216,12 @@ template <typename Real> struct hipblas_gemm_grouped {

#ifndef USE_CUTLASS

void hipblas_sgemm_wrapper_grouped(int resol_id, int blas_id, char transa,
char transb, int m, int *n, int *k,
float alpha, const float *A, int lda,
int64_t *offsetsA, const float *B, int ldb,
int64_t *offsetsB, float beta, float *C,
int ldc, int64_t *offsetsC, int batchCount,
hipStream_t stream,
void *growing_allocator) {
void hipblas_sgemm_wrapper_grouped(
int resol_id, int blas_id, char transa, char transb, int m, const int *n,
const int *k, float alpha, const float *A, int lda, const int64_t *offsetsA,
const float *B, const int *ldb, const int64_t *offsetsB, float beta,
float *C, int ldc, const int64_t *offsetsC, int batchCount,
hipStream_t stream, void *growing_allocator) {

hipblasOperation_t op_t1 = HIPBLAS_OP_N, op_t2 = HIPBLAS_OP_N;
if (transa == 'T' || transa == 't')
Expand All @@ -244,12 +243,13 @@ void hipblas_sgemm_wrapper_grouped(int resol_id, int blas_id, char transa,
#endif

void hipblas_dgemm_wrapper_grouped(int resol_id, int blas_id, char transa,
char transb, int m, int *n, int *k,
double alpha, const double *A, int lda,
int64_t *offsetsA, const double *B, int ldb,
int64_t *offsetsB, double beta, double *C,
int ldc, int64_t *offsetsC, int batchCount,
hipStream_t stream, void *) {
char transb, int m, const int *n,
const int *k, double alpha, const double *A,
int lda, const int64_t *offsetsA,
const double *B, const int *ldb,
const int64_t *offsetsB, double beta,
double *C, int ldc, const int64_t *offsetsC,
int batchCount, hipStream_t stream, void *) {

hipblasOperation_t op_t1 = HIPBLAS_OP_N, op_t2 = HIPBLAS_OP_N;
if (transa == 'T' || transa == 't')
Expand Down Expand Up @@ -313,13 +313,12 @@ void hipblas_sgemm_wrapper(char transa, char transb, int m, int n, int k,
batchCount));
}

void hipblas_sgemm_wrapper_grouped(int resol_id, int blas_id, char transa,
char transb, int m, int *n, int *k,
float alpha, const float *A, int lda,
int64_t *offsetsA, const float *B, int ldb,
int64_t *offsetsB, float beta, float *C,
int ldc, int64_t *offsetsC, int batchCount,
size_t stream, void *growing_allocator) {
void hipblas_sgemm_wrapper_grouped(
int resol_id, int blas_id, char transa, char transb, int m, const int *n,
const int *k, float alpha, const float *A, int lda, const int64_t *offsetsA,
const float *B, const int *ldb, const int64_t *offsetsB, float beta,
float *C, int ldc, const int64_t *offsetsC, int batchCount, size_t stream,
void *growing_allocator) {
#ifdef USE_CUTLASS
cutlass_sgemm_wrapper_grouped(resol_id, blas_id, transa, transb, m, n, k,
alpha, A, lda, offsetsA, B, ldb, offsetsB, beta,
Expand All @@ -334,12 +333,14 @@ void hipblas_sgemm_wrapper_grouped(int resol_id, int blas_id, char transa,
}

void hipblas_dgemm_wrapper_grouped(int resol_id, int blas_id, char transa,
char transb, int m, int *n, int *k,
double alpha, const double *A, int lda,
int64_t *offsetsA, const double *B, int ldb,
int64_t *offsetsB, double beta, double *C,
int ldc, int64_t *offsetsC, int batchCount,
size_t stream, void *growing_allocator) {
char transb, int m, const int *n,
const int *k, double alpha, const double *A,
int lda, const int64_t *offsetsA,
const double *B, const int *ldb,
const int64_t *offsetsB, double beta,
double *C, int ldc, const int64_t *offsetsC,
int batchCount, size_t stream,
void *growing_allocator) {
hipblas_dgemm_wrapper_grouped(resol_id, blas_id, transa, transb, m, n, k,
alpha, A, lda, offsetsA, B, ldb, offsetsB, beta,
C, ldc, offsetsC, batchCount,
Expand Down
14 changes: 7 additions & 7 deletions src/trans/gpu/algor/hicblas_mod.F90
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,8 @@ SUBROUTINE HIP_DGEMM_GROUPED( &
&) BIND(C, NAME='hipblas_dgemm_wrapper_grouped')
USE ISO_C_BINDING, ONLY: C_CHAR, C_INT, C_DOUBLE, C_SIZE_T, C_PTR, C_INT64_T
CHARACTER(1,C_CHAR), VALUE :: CTA, CTB
INTEGER(C_INT), VALUE :: RESOL_ID, BLAS_ID, M, LDA, LDB, LDC, BATCHCOUNT
INTEGER(C_INT) :: N(*), K(*)
INTEGER(C_INT), VALUE :: RESOL_ID, BLAS_ID, M, LDA, LDC, BATCHCOUNT
INTEGER(C_INT) :: N(*), K(*), LDB(*)
INTEGER(C_INT64_T) :: OFFSETA(*), OFFSETB(*), OFFSETC(*)
REAL(C_DOUBLE), VALUE :: ALPHA,BETA
REAL(C_DOUBLE) :: A(*), B(*), C(*)
Expand All @@ -108,8 +108,8 @@ SUBROUTINE HIP_SGEMM_GROUPED( &
&) BIND(C, NAME='hipblas_sgemm_wrapper_grouped')
USE ISO_C_BINDING, ONLY: C_CHAR, C_INT, C_FLOAT, C_SIZE_T, C_PTR, C_INT64_T
CHARACTER(1,C_CHAR), VALUE :: CTA, CTB
INTEGER(C_INT), VALUE :: RESOL_ID, BLAS_ID, M, LDA, LDB, LDC, BATCHCOUNT
INTEGER(C_INT) :: N(*), K(*)
INTEGER(C_INT), VALUE :: RESOL_ID, BLAS_ID, M, LDA, LDC, BATCHCOUNT
INTEGER(C_INT) :: N(*), K(*), LDB(*)
INTEGER(C_INT64_T) :: OFFSETA(*), OFFSETB(*), OFFSETC(*)
REAL(C_FLOAT), VALUE :: ALPHA,BETA
REAL(C_FLOAT) :: A(*), B(*), C(*)
Expand Down Expand Up @@ -247,7 +247,7 @@ SUBROUTINE HIP_DGEMM_GROUPED_OVERLOAD( &
INTEGER(KIND=JPIM) :: LDA
INTEGER(KIND=JPIB) :: OFFSETA(:)
REAL(KIND=JPRD), DIMENSION(*) :: BARRAY
INTEGER(KIND=JPIM) :: LDB
INTEGER(KIND=JPIM) :: LDB(:)
INTEGER(KIND=JPIB) :: OFFSETB(:)
REAL(KIND=JPRD) :: BETA
REAL(KIND=JPRD), DIMENSION(:) :: CARRAY
Expand Down Expand Up @@ -297,8 +297,8 @@ SUBROUTINE HIP_SGEMM_GROUPED_OVERLOAD(&
REAL(KIND=JPRM), DIMENSION(:) :: AARRAY
INTEGER(KIND=JPIM) :: LDA
INTEGER(KIND=JPIB) :: OFFSETA(:)
REAL(KIND=JPRM), DIMENSION(:,:,:) :: BARRAY
INTEGER(KIND=JPIM) :: LDB
REAL(KIND=JPRM), DIMENSION(*) :: BARRAY
INTEGER(KIND=JPIM) :: LDB(:)
INTEGER(KIND=JPIB) :: OFFSETB(:)
REAL(KIND=JPRM) :: BETA
REAL(KIND=JPRM), DIMENSION(:) :: CARRAY
Expand Down
55 changes: 30 additions & 25 deletions src/trans/gpu/external/setup_trans.F90
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,&
#endif
INTEGER :: INUMDEVS, IDEV, MYGPU

#include "user_clock.intfb.h"
REAL(KIND=JPRBT), POINTER :: LOCAL_ARR(:,:)
! ------------------------------------------------------------------

IF (LHOOK) CALL DR_HOOK('SETUP_TRANS',0,ZHOOK_HANDLE)
Expand Down Expand Up @@ -484,49 +484,54 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,&

! Initialize A arrays

ALLOCATE(FG%ZAA(ALIGN(R%NDGNH,8),ALIGN((R%NTMAX+2)/2,8),D%NUMP))
ALLOCATE(FG%ZAS(ALIGN(R%NDGNH,8),ALIGN((R%NTMAX+3)/2,8),D%NUMP))
ALLOCATE(FG%ZAA(D%OFFSETS_GEMM_MATRIX(D%NUMP+1)))
ALLOCATE(FG%ZAS(D%OFFSETS_GEMM_MATRIX(D%NUMP+1)))

FG%ZAA(:,:,:) = 0._JPRBT
FG%ZAS(:,:,:) = 0._JPRBT
FG%ZAA(:) = 0._JPRBT
FG%ZAS(:) = 0._JPRBT

DO JMLOC=1,D%NUMP
KM = D%MYMS(JMLOC)
KDGLU = G%NDGLU(KM)
ILA = (R%NSMAX-KM+2)/2
ILS = (R%NSMAX-KM+3)/2

FG%ZAA(1:KDGLU,1:ILA,JMLOC)=S%FA(JMLOC)%RPNMA(1:KDGLU,1:ILA)
FG%ZAS(1:KDGLU,1:ILS,JMLOC)=S%FA(JMLOC)%RPNMS(1:KDGLU,1:ILS)
IF (KM /= 0) THEN
CALL C_F_POINTER(C_LOC(FG%ZAA(1+D%OFFSETS_GEMM_MATRIX(JMLOC))), LOCAL_ARR, &
& (/D%LEGENDRE_MATRIX_STRIDES(JMLOC),ILA/))
LOCAL_ARR(1:KDGLU,1:ILA) = S%FA(JMLOC)%RPNMA(1:KDGLU,1:ILA)

CALL C_F_POINTER(C_LOC(FG%ZAS(1+D%OFFSETS_GEMM_MATRIX(JMLOC))), LOCAL_ARR, &
& (/D%LEGENDRE_MATRIX_STRIDES(JMLOC),ILS/))
LOCAL_ARR(1:KDGLU,1:ILS) = S%FA(JMLOC)%RPNMS(1:KDGLU,1:ILS)
ELSE
ALLOCATE(FG%ZAA0(ALIGN(KDGLU,8),ILA))
ALLOCATE(FG%ZAS0(ALIGN(KDGLU,8),ILS))

FG%ZAA0(:,:) = 0
FG%ZAS0(:,:) = 0
FG%ZAA0(1:KDGLU,1:ILA)=S%FA(JMLOC)%RPNMA(1:KDGLU,1:ILA)
FG%ZAS0(1:KDGLU,1:ILS)=S%FA(JMLOC)%RPNMS(1:KDGLU,1:ILS)
ENDIF
ENDDO

! arrays for m=0 in ledir_mod:
IMLOC0 = FINDLOC(D%MYMS,0)
IF(IMLOC0(1) > 0) THEN
ALLOCATE(FG%ZAA0(SIZE(FG%ZAA,1),SIZE(FG%ZAA,2)))
ALLOCATE(FG%ZAS0(SIZE(FG%ZAS,1),SIZE(FG%ZAS,2)))
FG%ZAA0 = FG%ZAA(:,:,IMLOC0(1))
FG%ZAS0 = FG%ZAS(:,:,IMLOC0(1))
ENDIF


ALLOCATE(FG%ZEPSNM(D%NUMP,0:R%NTMAX+2))
FG%ZEPSNM = 0._JPRBT
CALL PREPSNM !Initialize on the host

CALL PREPSNM
WRITE(NOUT,*)'setup_trans: sizes1 NUMP=',D%NUMP
#ifdef ACCGPU
WRITE(NOUT,*) 'Using OpenACC'
#endif
#ifdef OMPGPU
WRITE(NOUT,*) 'Using OpenMP offloading'
#endif
WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAS', C_SIZEOF(FG%ZAS(1,1,1))*SIZE(FG%ZAS)
WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAA', C_SIZEOF(FG%ZAA(1,1,1))*SIZE(FG%ZAA)
WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAS0', C_SIZEOF(FG%ZAS0(1,1))*SIZE(FG%ZAS0)
WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAA0', C_SIZEOF(FG%ZAA0(1,1))*SIZE(FG%ZAA0)
WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZEPSNM', C_SIZEOF(FG%ZEPSNM(1,1))*SIZE(FG%ZEPSNM)
WRITE(NOUT,'(A10,":",I11,"B")') 'FG%ZAS', C_SIZEOF(FG%ZAS(1))*SIZE(FG%ZAS)
WRITE(NOUT,'(A10,":",I11,"B")') 'FG%ZAA', C_SIZEOF(FG%ZAA(1))*SIZE(FG%ZAA)
WRITE(NOUT,'(A10,":",I11,"B")') 'FG%ZAS0', C_SIZEOF(FG%ZAS0(1,1))*SIZE(FG%ZAS0)
WRITE(NOUT,'(A10,":",I11,"B")') 'FG%ZAA0', C_SIZEOF(FG%ZAA0(1,1))*SIZE(FG%ZAA0)
WRITE(NOUT,'(A10,":",I11,"B")') 'FG%ZEPSNM', C_SIZEOF(FG%ZEPSNM(1,1))*SIZE(FG%ZEPSNM)

IF (IMLOC0(1) > 0) THEN
IF (ANY(D%MYMS == 0)) THEN
#ifdef ACCGPU
!$ACC ENTER DATA COPYIN(FG%ZAA0,FG%ZAS0) ASYNC(1)
#endif
Expand Down
8 changes: 4 additions & 4 deletions src/trans/gpu/internal/ledir_mod.F90
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS)
NS(KMLOC) = (R_NSMAX-KM+2)/2
KS(KMLOC) = G_NDGLU(KM)
AOFFSETS(KMLOC) = IIN_STRIDES0*D_OFFSETS_GEMM1(KMLOC)
BOFFSETS(KMLOC) = SIZE(ZAA,1)*SIZE(ZAA,2)*(KMLOC-1)
BOFFSETS(KMLOC) = D%OFFSETS_GEMM_MATRIX(KMLOC)
COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM2(KMLOC)
ENDDO
IF(IMLOC0(1) > 0) THEN
Expand All @@ -230,7 +230,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS)
& 2*KF_FS, NS(:), KS(:), &
& 1.0_JPRBT, &
& ZINPA, IIN_STRIDES0, AOFFSETS, &
& ZAA, SIZE(ZAA,1), BOFFSETS, &
& ZAA, D%LEGENDRE_MATRIX_STRIDES, BOFFSETS, &
& 0.0_JPRBT, &
& ZOUT, IOUT_STRIDES0, COFFSETS, &
& D_NUMP, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR)
Expand Down Expand Up @@ -331,7 +331,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS)
NS(KMLOC) = (R_NSMAX-KM+3)/2
KS(KMLOC) = G_NDGLU(KM)
AOFFSETS(KMLOC) = IIN_STRIDES0*D_OFFSETS_GEMM1(KMLOC)
BOFFSETS(KMLOC) = SIZE(ZAS,1)*SIZE(ZAS,2)*(KMLOC-1)
BOFFSETS(KMLOC) = D%OFFSETS_GEMM_MATRIX(KMLOC)
COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM2(KMLOC)
ENDDO
IF(IMLOC0(1) > 0) THEN
Expand All @@ -350,7 +350,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS)
& 2*KF_FS, NS(:), KS(:), &
& 1.0_JPRBT, &
& ZINPS, IIN_STRIDES0, AOFFSETS, &
& ZAS, SIZE(ZAS,1), BOFFSETS, &
& ZAS, D%LEGENDRE_MATRIX_STRIDES, BOFFSETS, &
& 0.0_JPRBT, &
& ZOUT, IOUT_STRIDES0, COFFSETS, &
& D_NUMP, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR)
Expand Down
8 changes: 4 additions & 4 deletions src/trans/gpu/internal/leinv_mod.F90
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG)
KS(KMLOC) = (R_NSMAX-KM+2)/2
NS(KMLOC) = G_NDGLU(KM)
AOFFSETS(KMLOC) = IIN_STRIDES0*D_OFFSETS_GEMM2(KMLOC)
BOFFSETS(KMLOC) = SIZE(ZAA,1)*SIZE(ZAA,2)*(KMLOC-1)
BOFFSETS(KMLOC) = D%OFFSETS_GEMM_MATRIX(KMLOC)
COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM1(KMLOC)
ENDDO
IF(IMLOC0(1) > 0) THEN
Expand All @@ -290,7 +290,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG)
& 2*KF_LEG, NS(:), KS(:), &
& 1.0_JPRBT, &
& ZINP, IIN_STRIDES0, AOFFSETS, &
& ZAA, SIZE(ZAA,1), BOFFSETS, &
& ZAA, D%LEGENDRE_MATRIX_STRIDES, BOFFSETS, &
& 0.0_JPRBT, &
& ZOUTA, IOUT_STRIDES0, COFFSETS, &
& D_NUMP, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR)
Expand Down Expand Up @@ -411,7 +411,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG)
KS(KMLOC) = (R_NSMAX-KM+3)/2
NS(KMLOC) = G_NDGLU(KM)
AOFFSETS(KMLOC) = IIN_STRIDES0*D_OFFSETS_GEMM2(KMLOC)
BOFFSETS(KMLOC) = SIZE(ZAS,1)*SIZE(ZAS,2)*(KMLOC-1)
BOFFSETS(KMLOC) = D%OFFSETS_GEMM_MATRIX(KMLOC)
COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM1(KMLOC)
ENDDO
IF(IMLOC0(1) > 0) THEN
Expand All @@ -430,7 +430,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG)
& 2*KF_LEG, NS(:), KS(:), &
& 1.0_JPRBT, &
& ZINP, IIN_STRIDES0, AOFFSETS, &
& ZAS, SIZE(ZAS,1), BOFFSETS, &
& ZAS, D%LEGENDRE_MATRIX_STRIDES, BOFFSETS, &
& 0.0_JPRBT, &
& ZOUTS, IOUT_STRIDES0, COFFSETS, &
& D_NUMP, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR)
Expand Down
Loading
Loading