Skip to content

Commit

Permalink
Merge pull request #597 from ThePortlandGroup/nv_stage
Browse files Browse the repository at this point in the history
Pull 2018-09-30T14-55 Recent NVIDIA Changes
  • Loading branch information
sscalpone authored Oct 1, 2018
2 parents 498be06 + 1af4225 commit 1dce76d
Show file tree
Hide file tree
Showing 82 changed files with 2,545 additions and 3,325 deletions.
27 changes: 26 additions & 1 deletion runtime/flang/directives.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@
#define F3 % xmm2
#define F4 % xmm3

#else
#elif defined(LINUX_ELF) || defined(TARGET_LINUX_X86) || defined(TARGET_LINUX_X8664)
#define ENT(n) n
#define ALN_WORD .align 4
#define ALN_FUNC .align 16
Expand All @@ -71,6 +71,31 @@
#define F3 % xmm2
#define F4 % xmm3

#elif defined(TARGET_OSX_X8664)
#define ENT(n) ASM_CONCAT(_,n)
#define ALN_WORD .align 2
#define ALN_FUNC .align 4
#define ALN_DBLE .align 3
#define ALN_QUAD .align 4
#define ELF_FUNC(s)
#define ELF_OBJ(s)
#define ELF_SIZE(s)
#define AS_VER
#define I1 % rdi
#define I1W % edi
#define I2 % rsi
#define I2W % esi
#define I3 % rdx
#define I3W % edx
#define I4 % rcx
#define F1 % xmm0
#define F2 % xmm1
#define F3 % xmm2
#define F4 % xmm3

#else
#error X8664 TARGET platform not defined.
#error TARGET must be one of TARGET_LINUX_X8664, TARGET_OSX_X8664, or TARGET_WIN_X8664.
#endif

/* macros for handling pic and non-pic code */
Expand Down
4 changes: 3 additions & 1 deletion runtime/flang/fmtconv.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 1995-2017, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -563,6 +563,7 @@ __fortio_fmt_g(__BIGREAL_T val, int w, int d, int e, int sf, int type,
{
int sign_char;
int newd;
#if defined(TARGET_X8664)
/*
* the following guarded IF may look like a no-op, but is
* needed when val is a denorm and DAZ is enabled. In this case, the
Expand All @@ -580,6 +581,7 @@ __fortio_fmt_g(__BIGREAL_T val, int w, int d, int e, int sf, int type,
((int *)&val)[1] |= 0x80000000;
}
}
#endif
field_overflow = FALSE;
/*
fp_canon(val, type, round);
Expand Down
4 changes: 4 additions & 0 deletions runtime/flang/fortDt.h
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,10 @@ typedef __INT_T dtype;
* which can be either a 64-bit or 32-bit type depending on DESC_I8
*/

/* Select the integer type behind __NELEM_T: __INT8_T when TARGET_X8664 is
 * defined, __INT_T on every other target. */
#if defined(TARGET_X8664)
#define __NELEM_T __INT8_T
#else
#define __NELEM_T __INT_T
#endif

#endif /*_PGHPF_TYPES_H_*/
7 changes: 6 additions & 1 deletion runtime/flang/ftncharsup.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 1993-2018, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -384,8 +384,13 @@ Ftn_str_free(char **first)

#define __HAVE_LONGLONG_T

/* _LONGLONG_T / _ULONGLONG_T are plain long on LINUX8664 and OSX8664 builds
 * and long long everywhere else.
 * NOTE(review): presumably long is already 64 bits wide on those two
 * targets (LP64) -- confirm before adding further platforms here. */
#if defined(LINUX8664) || defined(OSX8664)
typedef long _LONGLONG_T;
typedef unsigned long _ULONGLONG_T;
#else
typedef long long _LONGLONG_T;
typedef unsigned long long _ULONGLONG_T;
#endif

/* ***********************************************************************/
/** \brief
Expand Down
23 changes: 22 additions & 1 deletion runtime/flang/ftni64.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 1997-2018, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -24,8 +24,13 @@

#define __HAVE_LONGLONG_T

/* _LONGLONG_T / _ULONGLONG_T are plain long on LINUX8664 and OSX8664 builds
 * and long long everywhere else.
 * NOTE(review): presumably long is already 64 bits wide on those two
 * targets (LP64) -- confirm before adding further platforms here. */
#if defined(LINUX8664) || defined(OSX8664)
typedef long _LONGLONG_T;
typedef unsigned long _ULONGLONG_T;
#else
typedef long long _LONGLONG_T;
typedef unsigned long long _ULONGLONG_T;
#endif

/* now defined if BaseTsd10.h included */
typedef int INT64[2];
Expand All @@ -44,6 +49,7 @@ typedef union {
_LONGLONG_T lv;
} INT64D;

#if defined(LINUX8664) || defined(OSX8664)
#define __I8RET_T long
#define UTL_I_I64RET(m, l) \
{ \
Expand All @@ -52,3 +58,18 @@ typedef union {
I64_LSH(int64d.i) = l; \
return int64d.lv; \
}
#elif defined(WIN64)
/* Someday, should only care if TM_I8 is defined */
#define __I8RET_T long long
#define UTL_I_I64RET(m, l) \
{ \
INT64D int64d; \
I64_MSH(int64d.i) = m; \
I64_LSH(int64d.i) = l; \
return int64d.lv; \
}
#else
#define __I8RET_T void
#define UTL_I_I64RET __utl_i_i64ret
extern VOID UTL_I_I64RET();
#endif
7 changes: 6 additions & 1 deletion runtime/flang/ftnncharsup.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 1993-2018, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -160,8 +160,13 @@ int a2_len; /* length of a2 */

#define __HAVE_LONGLONG_T

/* _LONGLONG_T / _ULONGLONG_T are plain long on LINUX8664 and OSX8664 builds
 * and long long everywhere else.
 * NOTE(review): presumably long is already 64 bits wide on those two
 * targets (LP64) -- confirm before adding further platforms here. */
#if defined(LINUX8664) || defined(OSX8664)
typedef long _LONGLONG_T;
typedef unsigned long _ULONGLONG_T;
#else
typedef long long _LONGLONG_T;
typedef unsigned long long _ULONGLONG_T;
#endif


/* ***********************************************************************/
Expand Down
14 changes: 14 additions & 0 deletions runtime/flang/miscsup_com.c
Original file line number Diff line number Diff line change
Expand Up @@ -816,7 +816,11 @@ ENTFTN(SYSCLK, sysclk)(__STAT_T *count, __STAT_T *count_rate,

if (resol == 0) {
int def;
#if defined(TARGET_X8664)
def = 1000000;
#else
def = sizeof(__STAT_T) < 8 ? 1000 : 1000000;
#endif
resol = __fort_getoptn("-system_clock_rate", def);
if (resol <= 0)
__fort_abort("invalid value given for system_clock rate");
Expand Down Expand Up @@ -2880,6 +2884,7 @@ ENTF90(TRIMA, trima)
i = CLEN(expr);
while (i > 0) {
if (CADR(expr)[i - 1] != ' ') {
#if defined(TARGET_X8664)
if (i <= 11) {
int *rptr = ((int *)CADR(res));
int *eptr = ((int *)CADR(expr));
Expand All @@ -2899,6 +2904,11 @@ ENTF90(TRIMA, trima)
}
rcptr = (char *)rptr;
ecptr = (char *)eptr;
#else
if (i <= 3) {
rcptr = ((char *)CADR(res));
ecptr = ((char *)CADR(expr));
#endif
j = i & 3;
if (j > 2)
*rcptr++ = *ecptr++;
Expand Down Expand Up @@ -4823,7 +4833,11 @@ ENTF90(SPACINGD, spacingd)(__REAL8_T *d)

#ifndef DESC_I8

/* SZ_T is __INT8_T when TARGET_X8664 is defined and __INT4_T on every
 * other target. */
#if defined(TARGET_X8664)
typedef __INT8_T SZ_T;
#else
typedef __INT4_T SZ_T;
#endif

#undef _MZERO
#define _MZERO(n, t) \
Expand Down
24 changes: 24 additions & 0 deletions runtime/flang/pgf90_mmul_cmplx16.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,11 @@
!
! Global variables
!
! Integer arguments are integer*8 when TARGET_X8664 is defined and default
! integer otherwise. lda, ldb and ldc are the first extents of a, b and c.
#ifdef TARGET_X8664
integer*8 :: mra, ncb, kab, lda, ldb, ldc
#else
integer :: mra, ncb, kab, lda, ldb, ldc
#endif
complex*16, dimension( lda, * )::a
complex*16, dimension( ldb, * )::b
complex*16, dimension( ldc, * )::c
Expand All @@ -26,6 +30,7 @@
!
! local variables
!
#ifdef TARGET_X8664
integer*8 :: colsa, rowsa, rowsb, colsb
integer*8 :: i, j, jb, k, ak, bk, jend
integer*8 :: ar, ar_sav, ac, ac_sav, br, bc
Expand All @@ -36,6 +41,18 @@
integer*8 :: colsb_chunk, colsb_chunks, colsb_strt, colsb_end
integer*8 :: colsa_chunk, colsa_chunks, colsa_strt, colsa_end
integer*8 :: bufr, bufr_sav, bufca, bufca_sav, bufcb, bufcb_sav
#else
integer :: colsa, rowsa, rowsb, colsb
integer :: i, j, jb, k, ak, bk, jend
integer :: ar, ar_sav, ac, ac_sav, br, bc
integer :: ndxa, ndxasav
integer :: ndxb, ndxbsav, ndxb0, ndxb1, ndxb2, ndxb3
integer :: colachunk, colachunks, colbchunk, colbchunks
integer :: rowchunk, rowchunks
integer :: colsb_chunk, colsb_chunks, colsb_strt, colsb_end
integer :: colsa_chunk, colsa_chunks, colsa_strt, colsa_end
integer :: bufr, bufr_sav, bufca, bufca_sav, bufcb, bufcb_sav
#endif
integer :: ta, tb
complex*16 :: temp, temp0, temp1, temp2, temp3
real*8 :: temprr0, temprr1, temprr2, temprr3
Expand All @@ -52,7 +69,14 @@
complex*16, allocatable, dimension(:) :: buffera, bufferb

! Minimum number of multiplications needed to activate the blocked optimization.
! The threshold is chosen per target; a target without a tuned value reuses the
! x86-64 setting and raises a preprocessor warning so the gap is visible.
! Targets are tested with defined() so that a macro defined without a value
! (or as 0) still selects its branch instead of breaking the #elif expression.
#if defined(TARGET_X8664)
integer, parameter :: min_blocked_mult = 15000
#elif defined(TARGET_LINUX_POWER)
integer, parameter :: min_blocked_mult = 15000 !Complex calculations not vectorized on OpenPower.
#else
#warning untuned matrix multiplication parameter
integer, parameter :: min_blocked_mult = 15000
#endif

#undef DCMPLX
#define DCMPLX(r,i) cmplx(r,i,kind=8)
24 changes: 24 additions & 0 deletions runtime/flang/pgf90_mmul_cmplx8.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,11 @@
!
! Global variables
!
! Integer arguments are integer*8 when TARGET_X8664 is defined and default
! integer otherwise. lda, ldb and ldc are the first extents of a, b and c.
#ifdef TARGET_X8664
integer*8 :: mra, ncb, kab, lda, ldb, ldc
#else
integer :: mra, ncb, kab, lda, ldb, ldc
#endif
complex*8, dimension( lda, * )::a
complex*8, dimension( ldb, * )::b
complex*8, dimension( ldc, * )::c
Expand All @@ -26,6 +30,7 @@
!
! local variables
!
#ifdef TARGET_X8664
integer*8 :: colsa, rowsa, rowsb, colsb
integer*8 :: i, j, jb, k, ak, bk, jend
integer*8 :: ar, ar_sav, ac, ac_sav, br, bc
Expand All @@ -36,6 +41,18 @@
integer*8 :: colsb_chunk, colsb_chunks, colsb_strt, colsb_end
integer*8 :: colsa_chunk, colsa_chunks, colsa_strt, colsa_end
integer*8 :: bufr, bufr_sav, bufca, bufca_sav, bufcb, bufcb_sav
#else
integer :: colsa, rowsa, rowsb, colsb
integer :: i, j, jb, k, ak, bk, jend
integer :: ar, ar_sav, ac, ac_sav, br, bc
integer :: ndxa, ndxasav
integer :: ndxb, ndxbsav, ndxb0, ndxb1, ndxb2, ndxb3
integer :: colachunk, colachunks, colbchunk, colbchunks
integer :: rowchunk, rowchunks
integer :: colsb_chunk, colsb_chunks, colsb_strt, colsb_end
integer :: colsa_chunk, colsa_chunks, colsa_strt, colsa_end
integer :: bufr, bufr_sav, bufca, bufca_sav, bufcb, bufcb_sav
#endif
integer :: ta, tb
complex*8 :: temp, temp0, temp1, temp2, temp3
real*4 :: temprr0, temprr1, temprr2, temprr3
Expand All @@ -52,5 +69,12 @@
complex*8, allocatable, dimension(:) :: buffera, bufferb

! Minimum number of multiplications needed to activate the blocked optimization.
! The threshold is chosen per target; a target without a tuned value reuses the
! x86-64 setting and raises a preprocessor warning so the gap is visible.
! Targets are tested with defined() so that a macro defined without a value
! (or as 0) still selects its branch instead of breaking the #elif expression.
#if defined(TARGET_X8664)
integer, parameter :: min_blocked_mult = 1750
#elif defined(TARGET_LINUX_POWER)
integer, parameter :: min_blocked_mult = 1750 !Complex calculations not vectorized on OpenPower.
#else
#warning untuned matrix multiplication parameter
integer, parameter :: min_blocked_mult = 1750
#endif

24 changes: 24 additions & 0 deletions runtime/flang/pgf90_mmul_real4.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,11 @@
!
! Global variables
!
! Integer arguments are integer*8 when TARGET_X8664 is defined and default
! integer otherwise. lda, ldb and ldc are the first extents of a, b and c.
#ifdef TARGET_X8664
integer*8 :: mra, ncb, kab, lda, ldb, ldc
#else
integer :: mra, ncb, kab, lda, ldb, ldc
#endif
real*4, dimension( lda, * )::a
real*4, dimension( ldb, * )::b
real*4, dimension( ldc, * )::c
Expand All @@ -30,6 +34,7 @@
!
! local variables
!
#ifdef TARGET_X8664
integer*8 :: colsa, rowsa, rowsb, colsb
integer*8 :: i, j, jb, k, ak, bk, jend
integer*8 :: ar, ar_sav, ac, ac_sav, br, bc
Expand All @@ -40,6 +45,18 @@
integer*8 :: colsb_chunk, colsb_chunks, colsb_strt, colsb_end
integer*8 :: colsa_chunk, colsa_chunks, colsa_strt, colsa_end
integer*8 :: bufr, bufr_sav, bufca, bufca_sav, bufcb, bufcb_sav
#else
integer :: colsa, rowsa, rowsb, colsb
integer :: i, j, jb, k, ak, bk, jend
integer :: ar, ar_sav, ac, ac_sav, br, bc
integer :: ndxa, ndxasav
integer :: ndxb, ndxbsav, ndxb0, ndxb1, ndxb2, ndxb3
integer :: colachunk, colachunks, colbchunk, colbchunks
integer :: rowchunk, rowchunks
integer :: colsb_chunk, colsb_chunks, colsb_strt, colsb_end
integer :: colsa_chunk, colsa_chunks, colsa_strt, colsa_end
integer :: bufr, bufr_sav, bufca, bufca_sav, bufcb, bufcb_sav
#endif
real*4 :: temp, temp0, temp1, temp2, temp3
real*4 :: bufatemp, bufbtemp
real*8 :: time_start, time_end, ttime, all_time
Expand All @@ -50,4 +67,11 @@
real*4, allocatable, dimension(:) :: buffera, bufferb

! Minimum number of multiplications needed to activate the blocked optimization.
! The threshold is chosen per target; a target without a tuned value reuses the
! x86-64 setting and raises a preprocessor warning so the gap is visible.
! Targets are tested with defined() so that a macro defined without a value
! (or as 0) still selects its branch instead of breaking the #elif expression.
#if defined(TARGET_X8664)
integer, parameter :: min_blocked_mult = 5000
#elif defined(TARGET_LINUX_POWER)
integer, parameter :: min_blocked_mult = 10000
#else
#warning untuned matrix multiplication parameter
integer, parameter :: min_blocked_mult = 5000
#endif
Loading

0 comments on commit 1dce76d

Please sign in to comment.