Merge pull request #597 from ThePortlandGroup/nv_stage

Pull 2018-09-30T14-55 Recent NVIDIA Changes
flang-compiler · Oct 1, 2018 · 1dce76d · 1dce76d
2 parents 498be06 + 1af4225
commit 1dce76d
Show file tree

Hide file tree

Showing 82 changed files with 2,545 additions and 3,325 deletions.
diff --git a/runtime/flang/directives.h b/runtime/flang/directives.h
@@ -49,7 +49,7 @@
 #define F3 % xmm2
 #define F4 % xmm3
 
-#else
+#elif defined(LINUX_ELF) || defined(TARGET_LINUX_X86) || defined(TARGET_LINUX_X8664)
 #define ENT(n) n
 #define ALN_WORD .align 4
 #define ALN_FUNC .align 16
@@ -71,6 +71,31 @@
 #define F3 % xmm2
 #define F4 % xmm3
 
+#elif defined(TARGET_OSX_X8664)
+#define ENT(n) ASM_CONCAT(_,n)
+#define ALN_WORD .align 2
+#define ALN_FUNC .align 4
+#define ALN_DBLE .align 3
+#define ALN_QUAD .align 4
+#define ELF_FUNC(s)
+#define ELF_OBJ(s)
+#define ELF_SIZE(s)
+#define AS_VER
+#define I1 % rdi
+#define I1W % edi
+#define I2 % rsi
+#define I2W % esi
+#define I3 % rdx
+#define I3W % edx
+#define I4 % rcx
+#define F1 % xmm0
+#define F2 % xmm1
+#define F3 % xmm2
+#define F4 % xmm3
+
+#else
+#error	X8664 TARGET platform not defined.
+#error	TARGET must be one of TARGET_LINUX_X8664, TARGET_OSX_X8664, or TARGET_WIN_X8664.
 #endif
 
 /* macros for handling pic and non-pic code */

diff --git a/runtime/flang/fmtconv.c b/runtime/flang/fmtconv.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 1995-2017, NVIDIA CORPORATION.  All rights reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -563,6 +563,7 @@ __fortio_fmt_g(__BIGREAL_T val, int w, int d, int e, int sf, int type,
 {
   int sign_char;
   int newd;
+#if defined(TARGET_X8664)
   /*
    * the following guarded IF may look like a no-op, but is
    * needed when val is a denorm and DAZ is enabled.  In this case, the
@@ -580,6 +581,7 @@ __fortio_fmt_g(__BIGREAL_T val, int w, int d, int e, int sf, int type,
       ((int *)&val)[1] |= 0x80000000;
     }
   }
+#endif
   field_overflow = FALSE;
   /*
       fp_canon(val, type, round);

diff --git a/runtime/flang/fortDt.h b/runtime/flang/fortDt.h
@@ -307,6 +307,10 @@ typedef __INT_T dtype;
  * which can be either a 64-bit or 32-bit type depending on DESC_I8
  */
 
+#if defined(TARGET_X8664)
 #define __NELEM_T __INT8_T
+#else
+#define __NELEM_T __INT_T
+#endif
 
 #endif /*_PGHPF_TYPES_H_*/
diff --git a/runtime/flang/ftncharsup.c b/runtime/flang/ftncharsup.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 1993-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -384,8 +384,13 @@ Ftn_str_free(char **first)
 
 #define __HAVE_LONGLONG_T
 
+#if defined(LINUX8664) || defined(OSX8664)
 typedef long _LONGLONG_T;
 typedef unsigned long _ULONGLONG_T;
+#else
+typedef long long _LONGLONG_T;
+typedef unsigned long long _ULONGLONG_T;
+#endif
 
 /* ***********************************************************************/
 /** \brief

diff --git a/runtime/flang/ftni64.h b/runtime/flang/ftni64.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 1997-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,8 +24,13 @@
 
 #define __HAVE_LONGLONG_T
 
+#if defined(LINUX8664) || defined(OSX8664)
 typedef long _LONGLONG_T;
 typedef unsigned long _ULONGLONG_T;
+#else
+typedef long long _LONGLONG_T;
+typedef unsigned long long _ULONGLONG_T;
+#endif
 
 /* now defined if BaseTsd10.h included */
 typedef int INT64[2];
@@ -44,6 +49,7 @@ typedef union {
   _LONGLONG_T lv;
 } INT64D;
 
+#if defined(LINUX8664) || defined(OSX8664)
 #define __I8RET_T long
 #define UTL_I_I64RET(m, l)                                                     \
   {                                                                            \
@@ -52,3 +58,18 @@ typedef union {
     I64_LSH(int64d.i) = l;                                                     \
     return int64d.lv;                                                          \
   }
+#elif defined(WIN64)
+/* Someday, should only care if TM_I8 is defined */
+#define __I8RET_T long long
+#define UTL_I_I64RET(m, l)                                                     \
+  {                                                                            \
+    INT64D int64d;                                                             \
+    I64_MSH(int64d.i) = m;                                                     \
+    I64_LSH(int64d.i) = l;                                                     \
+    return int64d.lv;                                                          \
+  }
+#else
+#define __I8RET_T void
+#define UTL_I_I64RET __utl_i_i64ret
+extern VOID UTL_I_I64RET();
+#endif
diff --git a/runtime/flang/ftnncharsup.c b/runtime/flang/ftnncharsup.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 1993-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -160,8 +160,13 @@ int a2_len;                        /* length of a2 */
 
 #define __HAVE_LONGLONG_T
 
+#if defined(LINUX8664) || defined(OSX8664)
 typedef long _LONGLONG_T;
 typedef unsigned long _ULONGLONG_T;
+#else
+typedef long long _LONGLONG_T;
+typedef unsigned long long _ULONGLONG_T;
+#endif
 
 
 /* ***********************************************************************/

diff --git a/runtime/flang/miscsup_com.c b/runtime/flang/miscsup_com.c
@@ -816,7 +816,11 @@ ENTFTN(SYSCLK, sysclk)(__STAT_T *count, __STAT_T *count_rate,
 
   if (resol == 0) {
     int def;
+#if defined(TARGET_X8664)
     def = 1000000;
+#else
+    def = sizeof(__STAT_T) < 8 ? 1000 : 1000000;
+#endif
     resol = __fort_getoptn("-system_clock_rate", def);
     if (resol <= 0)
       __fort_abort("invalid value given for system_clock rate");
@@ -2880,6 +2884,7 @@ ENTF90(TRIMA, trima)
   i = CLEN(expr);
   while (i > 0) {
     if (CADR(expr)[i - 1] != ' ') {
+#if defined(TARGET_X8664)
       if (i <= 11) {
         int *rptr = ((int *)CADR(res));
         int *eptr = ((int *)CADR(expr));
@@ -2899,6 +2904,11 @@ ENTF90(TRIMA, trima)
         }
         rcptr = (char *)rptr;
         ecptr = (char *)eptr;
+#else
+      if (i <= 3) {
+        rcptr = ((char *)CADR(res));
+        ecptr = ((char *)CADR(expr));
+#endif
         j = i & 3;
         if (j > 2)
           *rcptr++ = *ecptr++;
@@ -4823,7 +4833,11 @@ ENTF90(SPACINGD, spacingd)(__REAL8_T *d)
 
 #ifndef DESC_I8
 
+#if defined(TARGET_X8664)
 typedef __INT8_T SZ_T;
+#else
+typedef __INT4_T SZ_T;
+#endif
 
 #undef _MZERO
 #define _MZERO(n, t)                                                    \

diff --git a/runtime/flang/pgf90_mmul_cmplx16.h b/runtime/flang/pgf90_mmul_cmplx16.h
@@ -17,7 +17,11 @@
   !
   ! Global variables
   !
+#ifdef TARGET_X8664
   integer*8 :: mra, ncb, kab, lda, ldb, ldc
+#else
+  integer   :: mra, ncb, kab, lda, ldb, ldc
+#endif
   complex*16, dimension( lda, * )::a
   complex*16, dimension( ldb, * )::b
   complex*16, dimension( ldc, * )::c
@@ -26,6 +30,7 @@
     !
     ! local variables
   !
+#ifdef TARGET_X8664
   integer*8  :: colsa, rowsa, rowsb, colsb
   integer*8  :: i, j, jb, k, ak, bk, jend
   integer*8  :: ar, ar_sav,  ac, ac_sav, br, bc
@@ -36,6 +41,18 @@
   integer*8  :: colsb_chunk, colsb_chunks, colsb_strt, colsb_end
   integer*8  :: colsa_chunk, colsa_chunks, colsa_strt, colsa_end
   integer*8  :: bufr, bufr_sav, bufca, bufca_sav, bufcb, bufcb_sav
+#else
+  integer  :: colsa, rowsa, rowsb, colsb
+  integer  :: i, j, jb, k, ak, bk, jend
+  integer  :: ar, ar_sav,  ac, ac_sav, br, bc
+  integer  :: ndxa, ndxasav 
+  integer  :: ndxb, ndxbsav, ndxb0, ndxb1, ndxb2, ndxb3
+  integer  :: colachunk, colachunks, colbchunk, colbchunks
+  integer  :: rowchunk, rowchunks
+  integer  :: colsb_chunk, colsb_chunks, colsb_strt, colsb_end
+  integer  :: colsa_chunk, colsa_chunks, colsa_strt, colsa_end
+  integer  :: bufr, bufr_sav, bufca, bufca_sav, bufcb, bufcb_sav
+#endif
   integer  :: ta, tb
   complex*16   :: temp, temp0, temp1, temp2, temp3 
     real*8   :: temprr0, temprr1, temprr2, temprr3
@@ -52,7 +69,14 @@
     complex*16, allocatable, dimension(:) :: buffera, bufferb
 
 !Minimum number of multiplications needed to activate the blocked optimization.
+#ifdef TARGET_X8664
   integer, parameter :: min_blocked_mult = 15000 
+#elif TARGET_LINUX_POWER
+  integer, parameter :: min_blocked_mult = 15000 !Complex calculations not vectorized on OpenPower.
+#else
+  #warning untuned matrix multiplication parameter
+  integer, parameter :: min_blocked_mult = 15000 
+#endif
 
 #undef DCMPLX
 #define DCMPLX(r,i) cmplx(r,i,kind=8)
diff --git a/runtime/flang/pgf90_mmul_cmplx8.h b/runtime/flang/pgf90_mmul_cmplx8.h
@@ -17,7 +17,11 @@
   !
   ! Global variables
   !
+#ifdef TARGET_X8664
   integer*8 :: mra, ncb, kab, lda, ldb, ldc
+#else
+  integer   :: mra, ncb, kab, lda, ldb, ldc
+#endif
   complex*8, dimension( lda, * )::a
   complex*8, dimension( ldb, * )::b
   complex*8, dimension( ldc, * )::c
@@ -26,6 +30,7 @@
     !
     ! local variables
   !
+#ifdef TARGET_X8664
   integer*8  :: colsa, rowsa, rowsb, colsb
   integer*8  :: i, j, jb, k, ak, bk, jend
   integer*8  :: ar, ar_sav,  ac, ac_sav, br, bc
@@ -36,6 +41,18 @@
   integer*8  :: colsb_chunk, colsb_chunks, colsb_strt, colsb_end
   integer*8  :: colsa_chunk, colsa_chunks, colsa_strt, colsa_end
   integer*8  :: bufr, bufr_sav, bufca, bufca_sav, bufcb, bufcb_sav
+#else
+  integer  :: colsa, rowsa, rowsb, colsb
+  integer  :: i, j, jb, k, ak, bk, jend
+  integer  :: ar, ar_sav,  ac, ac_sav, br, bc
+  integer  :: ndxa, ndxasav 
+  integer  :: ndxb, ndxbsav, ndxb0, ndxb1, ndxb2, ndxb3
+  integer  :: colachunk, colachunks, colbchunk, colbchunks
+  integer  :: rowchunk, rowchunks
+  integer  :: colsb_chunk, colsb_chunks, colsb_strt, colsb_end
+  integer  :: colsa_chunk, colsa_chunks, colsa_strt, colsa_end
+  integer  :: bufr, bufr_sav, bufca, bufca_sav, bufcb, bufcb_sav
+#endif
   integer  :: ta, tb
   complex*8   :: temp, temp0, temp1, temp2, temp3 
     real*4   :: temprr0, temprr1, temprr2, temprr3
@@ -52,5 +69,12 @@
     complex*8, allocatable, dimension(:) :: buffera, bufferb
 
   !Minimum number of multiplications needed to activate the blocked optimization.
+#ifdef TARGET_X8664
   integer, parameter :: min_blocked_mult = 1750
+#elif TARGET_LINUX_POWER
+  integer, parameter :: min_blocked_mult = 1750  !Complex calculations not vectorized on OpenPower.
+#else
+  #warning untuned matrix multiplication parameter
+  integer, parameter :: min_blocked_mult = 1750 
+#endif
 
diff --git a/runtime/flang/pgf90_mmul_real4.h b/runtime/flang/pgf90_mmul_real4.h
@@ -21,7 +21,11 @@
   !
   ! Global variables
   !
+#ifdef TARGET_X8664
   integer*8 :: mra, ncb, kab, lda, ldb, ldc
+#else
+  integer   :: mra, ncb, kab, lda, ldb, ldc
+#endif
   real*4, dimension( lda, * )::a
   real*4, dimension( ldb, * )::b
   real*4, dimension( ldc, * )::c
@@ -30,6 +34,7 @@
   !
   ! local variables
   !
+#ifdef TARGET_X8664
   integer*8  :: colsa, rowsa, rowsb, colsb
   integer*8  :: i, j, jb, k, ak, bk, jend
   integer*8  :: ar, ar_sav,  ac, ac_sav, br, bc
@@ -40,6 +45,18 @@
   integer*8  :: colsb_chunk, colsb_chunks, colsb_strt, colsb_end
   integer*8  :: colsa_chunk, colsa_chunks, colsa_strt, colsa_end
   integer*8  :: bufr, bufr_sav, bufca, bufca_sav, bufcb, bufcb_sav
+#else
+  integer  :: colsa, rowsa, rowsb, colsb
+  integer  :: i, j, jb, k, ak, bk, jend
+  integer  :: ar, ar_sav,  ac, ac_sav, br, bc
+  integer  :: ndxa, ndxasav 
+  integer  :: ndxb, ndxbsav, ndxb0, ndxb1, ndxb2, ndxb3
+  integer  :: colachunk, colachunks, colbchunk, colbchunks
+  integer  :: rowchunk, rowchunks
+  integer  :: colsb_chunk, colsb_chunks, colsb_strt, colsb_end
+  integer  :: colsa_chunk, colsa_chunks, colsa_strt, colsa_end
+  integer  :: bufr, bufr_sav, bufca, bufca_sav, bufcb, bufcb_sav
+#endif
   real*4   :: temp, temp0, temp1, temp2, temp3 
   real*4   :: bufatemp, bufbtemp
   real*8   :: time_start, time_end, ttime, all_time
@@ -50,4 +67,11 @@
   real*4, allocatable, dimension(:) :: buffera, bufferb
 
   !Minimum number of multiplications needed to activate the blocked optimization.
+#ifdef TARGET_X8664
   integer, parameter :: min_blocked_mult = 5000
+#elif TARGET_LINUX_POWER
+  integer, parameter :: min_blocked_mult = 10000
+#else
+  #warning untuned matrix multiplication parameter
+  integer, parameter :: min_blocked_mult = 5000 
+#endif