Merge branch 'master' of jsoftware.com:jsource
HenryHRich committed Nov 20, 2024
2 parents 40e5427 + 98f1867 commit 469fd88
Showing 10 changed files with 141 additions and 109 deletions.
5 changes: 5 additions & 0 deletions jsrc/avxintrin-emu.h
@@ -1818,6 +1818,11 @@ static __emu_inline __emu__m256i __emu_mm256_sllv_epi64(__emu__m256i a, __emu__m
#define _mm256_xor_pd __emu_mm256_xor_pd
#define _mm256_xor_ps __emu_mm256_xor_ps

/* clang 19 defines these macros */
#undef _mm_cmp_pd
#undef _mm_cmp_ps
#undef _mm_cmp_sd
#undef _mm_cmp_ss

#define _mm_cmp_pd __emu_mm_cmp_pd
#define _mm256_cmp_pd __emu_mm256_cmp_pd
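
A note on this hunk: clang 19's headers define _mm_cmp_pd, _mm_cmp_ps, _mm_cmp_sd, and _mm_cmp_ss as macros, so redefining them to the __emu_ emulation names without a prior #undef would draw macro-redefinition warnings. A minimal standalone sketch of the pattern, with hypothetical names (cmp, emu_cmp) standing in for the real ones:

#include <stdio.h>

#define cmp(a, b) ((a) < (b))            /* stands in for the compiler's definition */

static int emu_cmp(double a, double b);  /* hypothetical emulation routine */

#undef cmp                               /* clear the prior definition first */
#define cmp(a, b) emu_cmp((a), (b))      /* redefine without a warning */

static int emu_cmp(double a, double b) { return a < b; }

int main(void) {
    printf("%d\n", cmp(1.0, 2.0));       /* expands to emu_cmp(1.0, 2.0) -> 1 */
    return 0;
}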
5 changes: 5 additions & 0 deletions jsrc/cu.c
@@ -6,6 +6,9 @@
#include "j.h"
#include "ve.h"

#ifdef BOXEDSPARSE
extern UC fboxedsparse;
#endif

static A jteverysp(J jt,A w,A fs){A*wv,x,z,*zv;P*wp,*zp;
ARGCHK1(w);
@@ -124,6 +127,8 @@ A jtevery(J jt, A w, A fs){A * RESTRICT wv,x,z,* RESTRICT zv;
}
#ifndef BOXEDSPARSE
ASSERT(!ISSPARSE(AT(x)),EVNONCE);
#else
ASSERT(fboxedsparse||!ISSPARSE(AT(x)),EVNONCE);
#endif
// Store result & advance to next cell
*zv++=x;
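
The same two-level gate recurs in v1.c and vo.c below: built without BOXEDSPARSE, sparse contents are always rejected with EVNONCE; built with it, the new runtime flag fboxedsparse (defined in j.c, on by default) decides. A minimal sketch of the gate, with a plain assert standing in for J's ASSERT/EVNONCE machinery:

#include <assert.h>

#define BOXEDSPARSE                      /* assumed build-time option */
static unsigned char fboxedsparse = 1;   /* runtime switch, as in j.c */

static void check_boxable(int is_sparse) {
#ifndef BOXEDSPARSE
    assert(!is_sparse);                  /* sparse contents always rejected */
#else
    assert(fboxedsparse || !is_sparse);  /* rejected only when the flag is off */
#endif
}

int main(void) {
    check_boxable(1);                    /* passes: the flag is on */
    return 0;
}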
3 changes: 3 additions & 0 deletions jsrc/j.c
@@ -125,6 +125,9 @@ uint64_t g_cpuFeatures2; // fsgsbase
int numberOfCores; // number of cpu cores
UC hwaes=0; // hardware aes support
UC hwfma=0; // blis cpu tuning
#ifdef BOXEDSPARSE
UC fboxedsparse=1; // enable boxed sparse
#endif
I fortesting=0; // used for measurements
// globals end

2 changes: 2 additions & 0 deletions jsrc/j.h
@@ -145,6 +145,8 @@

#if defined(__aarch64__)||defined(_M_ARM64)
#if EMU_AVX2
#undef SSE2NEON_SUPPRESS_WARNINGS
#define SSE2NEON_SUPPRESS_WARNINGS
#include <stdint.h>
#include <string.h>
#include "sse2neon.h"
1 change: 1 addition & 0 deletions jsrc/je.h
@@ -27,6 +27,7 @@ extern F1(jtbehead);
extern F1(jtbinrep1);
// extern F1(jtbitadv);
extern F1(jtbox);
extern F1(jtboxedsparse);
extern F1(jtboxopen);
extern F1(jtboxq);
extern F1(jtboxs);
124 changes: 52 additions & 72 deletions jsrc/sse2neon.h
@@ -106,28 +106,23 @@
#pragma message("Macro name collisions may happen with unsupported compilers.")
#endif


#if defined(__GNUC__) && !defined(__clang__)
#pragma push_macro("FORCE_INLINE_OPTNONE")
#define FORCE_INLINE_OPTNONE static inline __attribute__((optimize("O0")))
#elif defined(__clang__)
#pragma push_macro("FORCE_INLINE_OPTNONE")
#define FORCE_INLINE_OPTNONE static inline __attribute__((optnone))
#else
#define FORCE_INLINE_OPTNONE FORCE_INLINE
#endif
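
The block above picks a per-compiler attribute for compiling a function without optimization (GCC spells it optimize("O0"), clang spells it optnone); the rest of this commit appears to drop most uses of it, since the fenv.h-based rounding code below no longer needs optimization suppressed. A standalone sketch of the dispatch, with a hypothetical macro name:

#if defined(__GNUC__) && !defined(__clang__)
#define NO_OPT static inline __attribute__((optimize("O0")))
#elif defined(__clang__)
#define NO_OPT static inline __attribute__((optnone))
#else
#define NO_OPT static inline             /* no suppression available */
#endif

#include <stdio.h>

NO_OPT int add_unoptimized(int a, int b) { return a + b; }

int main(void) {
    printf("%d\n", add_unoptimized(2, 3));  /* 5, computed without optimization */
    return 0;
}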

#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 10
#warning "GCC versions earlier than 10 are not supported."
#endif

#if defined(__OPTIMIZE__) && !defined(SSE2NEON_SUPPRESS_WARNINGS)
#warning \
"Report any potential compiler optimization issues when using SSE2NEON. See the 'Optimization' section at https://github.com/DLTcollab/sse2neon."
#endif

/* C does not allow initializing a file-scope variable with a function call. */
#ifdef __cplusplus
#define _sse2neon_const static const
#else
#define _sse2neon_const const
#endif

#include <fenv.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@@ -160,6 +155,7 @@ FORCE_INLINE int64_t sse2neon_recast_f64_s64(double f64)
#include <windows.h>
#endif

/* the && !defined(_MSC_VER) clause is for Windows ARM64 */
#if !defined(__cplusplus) && !defined(_MSC_VER)
#error SSE2NEON only supports C++ compilation with this compiler
#endif
@@ -339,6 +335,15 @@ FORCE_INLINE void _sse2neon_smp_mb(void)
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
(((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))

/**
* Macro that builds the shuffle parameter for _mm_shuffle_pd().
* fp1 is a single bit (0 or 1) selecting which element of argument "b"
* is placed in element 1 of the result; fp0 likewise selects the element
* of argument "a" that is placed in element 0 of the result.
*/
#define _MM_SHUFFLE2(fp1, fp0) (((fp1) << 1) | (fp0))
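
A tiny standalone check of the bit layout the comment describes (not part of the diff):

#include <stdio.h>

#define _MM_SHUFFLE2(fp1, fp0) (((fp1) << 1) | (fp0))

int main(void) {
    /* bit 1 selects from "b" into result element 1,
       bit 0 selects from "a" into result element 0 */
    printf("%d\n", _MM_SHUFFLE2(1, 0));  /* 2, i.e. binary 10 */
    printf("%d\n", _MM_SHUFFLE2(1, 1));  /* 3, i.e. binary 11 */
    return 0;
}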

#if __has_builtin(__builtin_shufflevector)
#define _sse2neon_shuffle(type, a, b, ...) \
__builtin_shufflevector(a, b, __VA_ARGS__)
@@ -604,8 +609,8 @@ FORCE_INLINE __m128d _mm_ceil_pd(__m128d);
FORCE_INLINE __m128 _mm_ceil_ps(__m128);
FORCE_INLINE __m128d _mm_floor_pd(__m128d);
FORCE_INLINE __m128 _mm_floor_ps(__m128);
FORCE_INLINE_OPTNONE __m128d _mm_round_pd(__m128d, int);
FORCE_INLINE_OPTNONE __m128 _mm_round_ps(__m128, int);
FORCE_INLINE __m128d _mm_round_pd(__m128d, int);
FORCE_INLINE __m128 _mm_round_ps(__m128, int);
// SSE4.2
FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);

@@ -1846,25 +1851,20 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode(void)
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE
FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void)
{
union {
fpcr_bitfield field;
#if defined(__aarch64__) || defined(_M_ARM64)
uint64_t value;
#else
uint32_t value;
#endif
} r;

#if defined(__aarch64__) || defined(_M_ARM64)
r.value = _sse2neon_get_fpcr();
#else
__asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

if (r.field.bit22) {
return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP;
} else {
return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST;
switch (fegetround()) {
case FE_TONEAREST:
return _MM_ROUND_NEAREST;
case FE_DOWNWARD:
return _MM_ROUND_DOWN;
case FE_UPWARD:
return _MM_ROUND_UP;
case FE_TOWARDZERO:
return _MM_ROUND_TOWARD_ZERO;
default:
// fegetround() should return one of FE_TONEAREST, FE_DOWNWARD, FE_UPWARD,
// or FE_TOWARDZERO on success; any other (error) value is treated as
// round-toward-zero (truncate).
return _MM_ROUND_TOWARD_ZERO;
}
}
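
The rewrite above replaces direct FPCR/FPSCR register reads with the portable fegetround() from fenv.h. A standalone sketch of the same query:

#include <fenv.h>
#include <stdio.h>

int main(void) {
    switch (fegetround()) {
    case FE_TONEAREST:  puts("round to nearest");  break;
    case FE_DOWNWARD:   puts("round down");        break;
    case FE_UPWARD:     puts("round up");          break;
    case FE_TOWARDZERO: puts("round toward zero"); break;
    default:            puts("unknown mode");      break;
    }
    return 0;
}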

@@ -2458,46 +2458,28 @@ FORCE_INLINE __m128 _mm_set_ps1(float _w)
// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
// _MM_ROUND_TOWARD_ZERO
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE
FORCE_INLINE_OPTNONE void _MM_SET_ROUNDING_MODE(int rounding)
FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
{
union {
fpcr_bitfield field;
#if defined(__aarch64__) || defined(_M_ARM64)
uint64_t value;
#else
uint32_t value;
#endif
} r;

#if defined(__aarch64__) || defined(_M_ARM64)
r.value = _sse2neon_get_fpcr();
#else
__asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

switch (rounding) {
case _MM_ROUND_TOWARD_ZERO:
r.field.bit22 = 1;
r.field.bit23 = 1;
case _MM_ROUND_NEAREST:
rounding = FE_TONEAREST;
break;
case _MM_ROUND_DOWN:
r.field.bit22 = 0;
r.field.bit23 = 1;
rounding = FE_DOWNWARD;
break;
case _MM_ROUND_UP:
r.field.bit22 = 1;
r.field.bit23 = 0;
rounding = FE_UPWARD;
break;
default: //_MM_ROUND_NEAREST
r.field.bit22 = 0;
r.field.bit23 = 0;
case _MM_ROUND_TOWARD_ZERO:
rounding = FE_TOWARDZERO;
break;
default:
// rounding must be one of _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
// or _MM_ROUND_TOWARD_ZERO; any other (invalid) value is treated as
// FE_TOWARDZERO (truncate).
rounding = FE_TOWARDZERO;
}

#if defined(__aarch64__) || defined(_M_ARM64)
_sse2neon_set_fpcr(r.value);
#else
__asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
#endif
fesetround(rounding);
}
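
_MM_SET_ROUNDING_MODE likewise now maps its argument onto fesetround() instead of writing control-register bits. A standalone round-trip that saves and restores the mode:

#include <fenv.h>
#include <stdio.h>

int main(void) {
    int saved = fegetround();            /* remember the current mode */
    if (fesetround(FE_DOWNWARD) == 0)    /* returns 0 on success */
        printf("round-down active: %d\n", fegetround() == FE_DOWNWARD);
    fesetround(saved);                   /* restore the previous mode */
    return 0;
}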

// Copy single-precision (32-bit) floating-point element a to the lower element
@@ -3899,7 +3881,7 @@ FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
// Convert packed double-precision (64-bit) floating-point elements in a to
// packed 32-bit integers, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32
FORCE_INLINE_OPTNONE __m128i _mm_cvtpd_epi32(__m128d a)
FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
{
// vrnd32xq_f64 not supported on clang
#if defined(__ARM_FEATURE_FRINT) && !defined(__clang__)
@@ -3921,7 +3903,7 @@ FORCE_INLINE_OPTNONE __m128i _mm_cvtpd_epi32(__m128d a)
// Convert packed double-precision (64-bit) floating-point elements in a to
// packed 32-bit integers, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_pi32
FORCE_INLINE_OPTNONE __m64 _mm_cvtpd_pi32(__m128d a)
FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
{
__m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
double d0, d1;
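
An aside on these converters: they round using the current rounding mode (note the _MM_FROUND_CUR_DIRECTION above), which is why the fenv-based mode accessors matter. A standalone illustration of _mm_cvtpd_epi32 using x86 <emmintrin.h>; on ARM, sse2neon.h provides the same name:

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
    __m128d a = _mm_set_pd(2.5, 1.5);    /* high lane 2.5, low lane 1.5 */
    __m128i r = _mm_cvtpd_epi32(a);      /* rounds with the current mode */
    int out[4];
    _mm_storeu_si128((__m128i *)out, r);
    /* round-to-nearest-even: 1.5 -> 2 and 2.5 -> 2 */
    printf("%d %d\n", out[0], out[1]);
    return 0;
}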
@@ -4217,7 +4199,7 @@ FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
// Convert packed double-precision (64-bit) floating-point elements in a to
// packed 32-bit integers with truncation, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32
FORCE_INLINE_OPTNONE __m64 _mm_cvttpd_pi32(__m128d a)
FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
{
double a0, a1;
a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
@@ -7559,7 +7541,7 @@ FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
// the rounding parameter, and store the results as packed double-precision
// floating-point elements in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd
FORCE_INLINE_OPTNONE __m128d _mm_round_pd(__m128d a, int rounding)
FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
{
#if defined(__aarch64__) || defined(_M_ARM64)
switch (rounding) {
@@ -7628,7 +7610,7 @@ FORCE_INLINE_OPTNONE __m128d _mm_round_pd(__m128d a, int rounding)
// the rounding parameter, and store the results as packed single-precision
// floating-point elements in dst.
// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
FORCE_INLINE_OPTNONE __m128 _mm_round_ps(__m128 a, int rounding)
FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
{
#if (defined(__aarch64__) || defined(_M_ARM64)) || \
defined(__ARM_FEATURE_DIRECTED_ROUNDING)
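
An aside: _mm_round_pd and _mm_round_ps also accept an explicit direction instead of _MM_FROUND_CUR_DIRECTION. A standalone illustration with an explicit flag (SSE4.1 <smmintrin.h> on x86; sse2neon supplies the intrinsic on ARM):

#include <smmintrin.h>
#include <stdio.h>

int main(void) {
    __m128d a = _mm_set_pd(-1.5, 2.5);   /* high lane -1.5, low lane 2.5 */
    __m128d f = _mm_round_pd(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
    double out[2];
    _mm_storeu_pd(out, f);
    printf("%g %g\n", out[0], out[1]);   /* 2 -2: both rounded toward -inf */
    return 0;
}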
@@ -9346,8 +9328,7 @@ FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
#endif
}

FORCE_INLINE_OPTNONE void _sse2neon_mm_set_denormals_zero_mode(
unsigned int flag)
FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
{
// AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
// regardless of the value of the FZ bit.
@@ -9419,7 +9400,6 @@ FORCE_INLINE uint64_t _rdtsc(void)
#if defined(__GNUC__) || defined(__clang__)
#pragma pop_macro("ALIGN_STRUCT")
#pragma pop_macro("FORCE_INLINE")
#pragma pop_macro("FORCE_INLINE_OPTNONE")
#endif

#if defined(__GNUC__) && !defined(__clang__)
6 changes: 5 additions & 1 deletion jsrc/v1.c
@@ -7,6 +7,10 @@
#include "vcomp.h"
#include "ve.h"

#ifdef BOXEDSPARSE
extern UC fboxedsparse;
#endif

#ifdef MMSC_VER
#pragma warning(disable: 4244)
#endif
@@ -405,7 +409,7 @@ static B jtmatchsub(J jt,A a,A w,B* RESTRICT x,I af,I wf,I m,I n,I b1){C*av,*wv;
if(unlikely(t&FUNC))R (!eqf(a,w))^(x==0?1:b1); // true value, but switch if return is not 'match'
if(unlikely(t&NAME))R (!eqname(a,w))^(x==0?1:b1); // true value, but switch if return is not 'match'
#ifdef BOXEDSPARSE
if(unlikely(ISSPARSE(at|wt)))R num(1)==matchs(a,w);
if(fboxedsparse) if(unlikely(ISSPARSE(at|wt)))R num(1)==matchs(a,w);
#endif
// If the types mismatch, convert as needed to the common (unsafe) type calculated earlier
if(at!=wt) {
10 changes: 9 additions & 1 deletion jsrc/vo.c
@@ -6,6 +6,10 @@
#define ZZDEFN
#include "result.h"

#ifdef BOXEDSPARSE
extern UC fboxedsparse;
#endif

I level(J jt,A w){A*wv;I d,j;
ARGCHK1(w);
if((-AN(w)&-(AT(w)&BOX))>=0)R 0;
@@ -31,6 +35,8 @@ F1(jtbox){A y,z,*zv;C*wv;I f,k,m,n,r,wr,*ws;
F1PREFIP;ARGCHK1(w);I wt=AT(w); FLAGT waf=AFLAG(w);
#ifndef BOXEDSPARSE
ASSERTF(!ISSPARSE(wt),EVNONCE,"can't box sparse arrays");
#else
ASSERTF(fboxedsparse||!ISSPARSE(wt),EVNONCE,"can't box sparse arrays");
#endif
wr=AR(w); r=(RANKT)jt->ranks; r=wr<r?wr:r; f=wr-r; // no RESETRANK because we call no primitives
if(likely(!f)){
@@ -54,7 +60,7 @@ F1(jtbox){A y,z,*zv;C*wv;I f,k,m,n,r,wr,*ws;
} else {
// <"r
#ifdef BOXEDSPARSE
ASSERTF(!ISSPARSE(wt),EVNONCE,"can't box sparse arrays"); // <"r not implemented
ASSERTF(fboxedsparse||!ISSPARSE(wt),EVNONCE,"can't box sparse arrays"); // <"r not implemented
#endif
ws=AS(w);
CPROD(AN(w),n,f,ws); CPROD(AN(w),m,r,f+ws);
Expand Down Expand Up @@ -125,6 +131,8 @@ F2PREFIP;ARGCHK2(a,w);
#endif
#ifndef BOXEDSPARSE
ASSERTF(!ISSPARSE(AT(a)|AT(w)),EVNONCE,"can't box sparse arrays");
#else
ASSERTF(fboxedsparse||!ISSPARSE(AT(a)|AT(w)),EVNONCE,"can't box sparse arrays");
#endif
I optype=FAV(self)->localuse.lu1.linkvb; // flag: sign set if (,<) or ,&< or (;<) which will always box w; bit 0 set if (,<)
optype|=((I)jtinplace&JTWILLBEOPENED)<<(BOXX-JTWILLBEOPENEDX); // fold in BOX flag that tells us to allow virtual boxed results