Merge branch 'master' of jsoftware.com:jsource
HenryHRich committed Nov 20, 2024
2 parents 40e5427 + 98f1867 commit 469fd88
Showing 10 changed files with 141 additions and 109 deletions.
5 changes: 5 additions & 0 deletions jsrc/avxintrin-emu.h
@@ -1818,6 +1818,11 @@ static __emu_inline __emu__m256i __emu_mm256_sllv_epi64(__emu__m256i a, __emu__m
#define _mm256_xor_pd __emu_mm256_xor_pd
#define _mm256_xor_ps __emu_mm256_xor_ps

/* clang 19 defines these macros */
#undef _mm_cmp_pd
#undef _mm_cmp_ps
#undef _mm_cmp_sd
#undef _mm_cmp_ss

#define _mm_cmp_pd __emu_mm_cmp_pd
#define _mm256_cmp_pd __emu_mm256_cmp_pd
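
A note on this hunk: clang 19's headers define _mm_cmp_pd, _mm_cmp_ps, _mm_cmp_sd, and _mm_cmp_ss as macros, so redefining them to the __emu_ emulation names without a prior #undef would draw macro-redefinition warnings. A minimal standalone sketch of the pattern, with hypothetical names (cmp, emu_cmp) standing in for the real ones:

#include <stdio.h>

#define cmp(a, b) ((a) < (b))            /* stands in for the compiler's definition */

static int emu_cmp(double a, double b);  /* hypothetical emulation routine */

#undef cmp                               /* clear the prior definition first */
#define cmp(a, b) emu_cmp((a), (b))      /* redefine without a warning */

static int emu_cmp(double a, double b) { return a < b; }

int main(void) {
    printf("%d\n", cmp(1.0, 2.0));       /* expands to emu_cmp(1.0, 2.0) -> 1 */
    return 0;
}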
5 changes: 5 additions & 0 deletions jsrc/cu.c
@@ -6,6 +6,9 @@
#include "j.h"
#include "ve.h"

#ifdef BOXEDSPARSE
extern UC fboxedsparse;
#endif

static A jteverysp(J jt,A w,A fs){A*wv,x,z,*zv;P*wp,*zp;
ARGCHK1(w);
@@ -124,6 +127,8 @@ A jtevery(J jt, A w, A fs){A * RESTRICT wv,x,z,* RESTRICT zv;
}
#ifndef BOXEDSPARSE
ASSERT(!ISSPARSE(AT(x)),EVNONCE);
#else
ASSERT(fboxedsparse||!ISSPARSE(AT(x)),EVNONCE);
#endif
// Store result & advance to next cell
*zv++=x;
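
The same two-level gate recurs in v1.c and vo.c below: built without BOXEDSPARSE, sparse contents are always rejected with EVNONCE; built with it, the new runtime flag fboxedsparse (defined in j.c, on by default) decides. A minimal sketch of the gate, with a plain assert standing in for J's ASSERT/EVNONCE machinery:

#include <assert.h>

#define BOXEDSPARSE                      /* assumed build-time option */
static unsigned char fboxedsparse = 1;   /* runtime switch, as in j.c */

static void check_boxable(int is_sparse) {
#ifndef BOXEDSPARSE
    assert(!is_sparse);                  /* sparse contents always rejected */
#else
    assert(fboxedsparse || !is_sparse);  /* rejected only when the flag is off */
#endif
}

int main(void) {
    check_boxable(1);                    /* passes: the flag is on */
    return 0;
}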
3 changes: 3 additions & 0 deletions jsrc/j.c
@@ -125,6 +125,9 @@ uint64_t g_cpuFeatures2; // fsgsbase
int numberOfCores; // number of cpu cores
UC hwaes=0; // hardware aes support
UC hwfma=0; // blis cpu tuning
#ifdef BOXEDSPARSE
UC fboxedsparse=1; // enable boxed sparse
#endif
I fortesting=0; // used for measurements
// globals end

2 changes: 2 additions & 0 deletions jsrc/j.h
@@ -145,6 +145,8 @@

#if defined(__aarch64__)||defined(_M_ARM64)
#if EMU_AVX2
#undef SSE2NEON_SUPPRESS_WARNINGS
#define SSE2NEON_SUPPRESS_WARNINGS
#include <stdint.h>
#include <string.h>
#include "sse2neon.h"
1 change: 1 addition & 0 deletions jsrc/je.h
@@ -27,6 +27,7 @@ extern F1(jtbehead);
extern F1(jtbinrep1);
// extern F1(jtbitadv);
extern F1(jtbox);
extern F1(jtboxedsparse);
extern F1(jtboxopen);
extern F1(jtboxq);
extern F1(jtboxs);
124 changes: 52 additions & 72 deletions jsrc/sse2neon.h
@@ -106,28 +106,23 @@
#pragma message("Macro name collisions may happen with unsupported compilers.")
#endif


#if defined(__GNUC__) && !defined(__clang__)
#pragma push_macro("FORCE_INLINE_OPTNONE")
#define FORCE_INLINE_OPTNONE static inline __attribute__((optimize("O0")))
#elif defined(__clang__)
#pragma push_macro("FORCE_INLINE_OPTNONE")
#define FORCE_INLINE_OPTNONE static inline __attribute__((optnone))
#else
#define FORCE_INLINE_OPTNONE FORCE_INLINE
#endif
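
The block above picks a per-compiler attribute for compiling a function without optimization (GCC spells it optimize("O0"), clang spells it optnone); the rest of this commit appears to drop most uses of it, since the fenv.h-based rounding code below no longer needs optimization suppressed. A standalone sketch of the dispatch, with a hypothetical macro name:

#if defined(__GNUC__) && !defined(__clang__)
#define NO_OPT static inline __attribute__((optimize("O0")))
#elif defined(__clang__)
#define NO_OPT static inline __attribute__((optnone))
#else
#define NO_OPT static inline             /* no suppression available */
#endif

#include <stdio.h>

NO_OPT int add_unoptimized(int a, int b) { return a + b; }

int main(void) {
    printf("%d\n", add_unoptimized(2, 3));  /* 5, computed without optimization */
    return 0;
}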

#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 10
#warning "GCC versions earlier than 10 are not supported."
#endif

#if defined(__OPTIMIZE__) && !defined(SSE2NEON_SUPPRESS_WARNINGS)
#warning \
"Report any potential compiler optimization issues when using SSE2NEON. See the 'Optimization' section at https://github.com/DLTcollab/sse2neon."
#endif

/* C does not allow initializing a file-scope variable with a function call. */
#ifdef __cplusplus
#define _sse2neon_const static const
#else
#define _sse2neon_const const
#endif

#include <fenv.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@@ -160,6 +155,7 @@ FORCE_INLINE int64_t sse2neon_recast_f64_s64(double f64)
#include <windows.h>
#endif

/* the && !defined(_MSC_VER) clause is for Windows ARM64 */
#if !defined(__cplusplus) && !defined(_MSC_VER)
#error SSE2NEON only supports C++ compilation with this compiler
#endif
@@ -339,6 +335,15 @@ FORCE_INLINE void _sse2neon_smp_mb(void)
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
(((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))

/**
* Macro that builds the shuffle parameter for _mm_shuffle_pd().
* fp1 is a single bit (0 or 1) selecting which element of argument "b"
* is placed in element 1 of the result; fp0 likewise selects the element
* of argument "a" that is placed in element 0 of the result.
*/
#define _MM_SHUFFLE2(fp1, fp0) (((fp1) << 1) | (fp0))
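
A tiny standalone check of the bit layout the comment describes (not part of the diff):

#include <stdio.h>

#define _MM_SHUFFLE2(fp1, fp0) (((fp1) << 1) | (fp0))

int main(void) {
    /* bit 1 selects from "b" into result element 1,
       bit 0 selects from "a" into result element 0 */
    printf("%d\n", _MM_SHUFFLE2(1, 0));  /* 2, i.e. binary 10 */
    printf("%d\n", _MM_SHUFFLE2(1, 1));  /* 3, i.e. binary 11 */
    return 0;
}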

#if __has_builtin(__builtin_shufflevector)
#define _sse2neon_shuffle(type, a, b, ...) \
__builtin_shufflevector(a, b, __VA_ARGS__)
@@ -604,8 +609,8 @@ FORCE_INLINE __m128d _mm_ceil_pd(__m128d);
FORCE_INLINE __m128 _mm_ceil_ps(__m128);
FORCE_INLINE __m128d _mm_floor_pd(__m128d);
FORCE_INLINE __m128 _mm_floor_ps(__m128);
FORCE_INLINE_OPTNONE __m128d _mm_round_pd(__m128d, int);
FORCE_INLINE_OPTNONE __m128 _mm_round_ps(__m128, int);
FORCE_INLINE __m128d _mm_round_pd(__m128d, int);
FORCE_INLINE __m128 _mm_round_ps(__m128, int);
// SSE4.2
FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);

@@ -1846,25 +1851,20 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode(void)
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE
FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void)
{
union {
fpcr_bitfield field;
#if defined(__aarch64__) || defined(_M_ARM64)
uint64_t value;
#else
uint32_t value;
#endif
} r;

#if defined(__aarch64__) || defined(_M_ARM64)
r.value = _sse2neon_get_fpcr();
#else
__asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

if (r.field.bit22) {
return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP;
} else {
return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST;
switch (fegetround()) {
case FE_TONEAREST:
return _MM_ROUND_NEAREST;
case FE_DOWNWARD:
return _MM_ROUND_DOWN;
case FE_UPWARD:
return _MM_ROUND_UP;
case FE_TOWARDZERO:
return _MM_ROUND_TOWARD_ZERO;
default:
// fegetround() should return one of FE_TONEAREST, FE_DOWNWARD, FE_UPWARD,
// or FE_TOWARDZERO on success; any other (error) value is treated as
// round-toward-zero (truncate).
return _MM_ROUND_TOWARD_ZERO;
}
}
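
The rewrite above replaces direct FPCR/FPSCR register reads with the portable fegetround() from fenv.h. A standalone sketch of the same query:

#include <fenv.h>
#include <stdio.h>

int main(void) {
    switch (fegetround()) {
    case FE_TONEAREST:  puts("round to nearest");  break;
    case FE_DOWNWARD:   puts("round down");        break;
    case FE_UPWARD:     puts("round up");          break;
    case FE_TOWARDZERO: puts("round toward zero"); break;
    default:            puts("unknown mode");      break;
    }
    return 0;
}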

@@ -2458,46 +2458,28 @@ FORCE_INLINE __m128 _mm_set_ps1(float _w)
// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
// _MM_ROUND_TOWARD_ZERO
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE
FORCE_INLINE_OPTNONE void _MM_SET_ROUNDING_MODE(int rounding)
FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
{
union {
fpcr_bitfield field;
#if defined(__aarch64__) || defined(_M_ARM64)
uint64_t value;
#else
uint32_t value;
#endif
} r;

#if defined(__aarch64__) || defined(_M_ARM64)
r.value = _sse2neon_get_fpcr();
#else
__asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

switch (rounding) {
case _MM_ROUND_TOWARD_ZERO:
r.field.bit22 = 1;
r.field.bit23 = 1;
case _MM_ROUND_NEAREST:
rounding = FE_TONEAREST;
break;
case _MM_ROUND_DOWN:
r.field.bit22 = 0;
r.field.bit23 = 1;
rounding = FE_DOWNWARD;
break;
case _MM_ROUND_UP:
r.field.bit22 = 1;
r.field.bit23 = 0;
rounding = FE_UPWARD;
break;
default: //_MM_ROUND_NEAREST
r.field.bit22 = 0;
r.field.bit23 = 0;
case _MM_ROUND_TOWARD_ZERO:
rounding = FE_TOWARDZERO;
break;
default:
// rounding must be one of _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
// or _MM_ROUND_TOWARD_ZERO; any other (invalid) value is treated as
// FE_TOWARDZERO (truncate).
rounding = FE_TOWARDZERO;
}

#if defined(__aarch64__) || defined(_M_ARM64)
_sse2neon_set_fpcr(r.value);
#else
__asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
#endif
fesetround(rounding);
}
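
_MM_SET_ROUNDING_MODE likewise now maps its argument onto fesetround() instead of writing control-register bits. A standalone round-trip that saves and restores the mode:

#include <fenv.h>
#include <stdio.h>

int main(void) {
    int saved = fegetround();            /* remember the current mode */
    if (fesetround(FE_DOWNWARD) == 0)    /* returns 0 on success */
        printf("round-down active: %d\n", fegetround() == FE_DOWNWARD);
    fesetround(saved);                   /* restore the previous mode */
    return 0;
}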

// Copy single-precision (32-bit) floating-point element a to the lower element
@@ -3899,7 +3881,7 @@ FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
// Convert packed double-precision (64-bit) floating-point elements in a to
// packed 32-bit integers, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32
FORCE_INLINE_OPTNONE __m128i _mm_cvtpd_epi32(__m128d a)
FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
{
// vrnd32xq_f64 not supported on clang
#if defined(__ARM_FEATURE_FRINT) && !defined(__clang__)
@@ -3921,7 +3903,7 @@ FORCE_INLINE_OPTNONE __m128i _mm_cvtpd_epi32(__m128d a)
// Convert packed double-precision (64-bit) floating-point elements in a to
// packed 32-bit integers, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_pi32
FORCE_INLINE_OPTNONE __m64 _mm_cvtpd_pi32(__m128d a)
FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
{
__m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
double d0, d1;
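
An aside on these converters: they round using the current rounding mode (note the _MM_FROUND_CUR_DIRECTION above), which is why the fenv-based mode accessors matter. A standalone illustration of _mm_cvtpd_epi32 using x86 <emmintrin.h>; on ARM, sse2neon.h provides the same name:

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
    __m128d a = _mm_set_pd(2.5, 1.5);    /* high lane 2.5, low lane 1.5 */
    __m128i r = _mm_cvtpd_epi32(a);      /* rounds with the current mode */
    int out[4];
    _mm_storeu_si128((__m128i *)out, r);
    /* round-to-nearest-even: 1.5 -> 2 and 2.5 -> 2 */
    printf("%d %d\n", out[0], out[1]);
    return 0;
}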
@@ -4217,7 +4199,7 @@ FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
// Convert packed double-precision (64-bit) floating-point elements in a to
// packed 32-bit integers with truncation, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32
FORCE_INLINE_OPTNONE __m64 _mm_cvttpd_pi32(__m128d a)
FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
{
double a0, a1;
a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
@@ -7559,7 +7541,7 @@ FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
// the rounding parameter, and store the results as packed double-precision
// floating-point elements in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd
FORCE_INLINE_OPTNONE __m128d _mm_round_pd(__m128d a, int rounding)
FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
{
#if defined(__aarch64__) || defined(_M_ARM64)
switch (rounding) {
@@ -7628,7 +7610,7 @@ FORCE_INLINE_OPTNONE __m128d _mm_round_pd(__m128d a, int rounding)
// the rounding parameter, and store the results as packed single-precision
// floating-point elements in dst.
// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
FORCE_INLINE_OPTNONE __m128 _mm_round_ps(__m128 a, int rounding)
FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
{
#if (defined(__aarch64__) || defined(_M_ARM64)) || \
defined(__ARM_FEATURE_DIRECTED_ROUNDING)
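
An aside: _mm_round_pd and _mm_round_ps also accept an explicit direction instead of _MM_FROUND_CUR_DIRECTION. A standalone illustration with an explicit flag (SSE4.1 <smmintrin.h> on x86; sse2neon supplies the intrinsic on ARM):

#include <smmintrin.h>
#include <stdio.h>

int main(void) {
    __m128d a = _mm_set_pd(-1.5, 2.5);   /* high lane -1.5, low lane 2.5 */
    __m128d f = _mm_round_pd(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
    double out[2];
    _mm_storeu_pd(out, f);
    printf("%g %g\n", out[0], out[1]);   /* 2 -2: both rounded toward -inf */
    return 0;
}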
@@ -9346,8 +9328,7 @@ FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
#endif
}

FORCE_INLINE_OPTNONE void _sse2neon_mm_set_denormals_zero_mode(
unsigned int flag)
FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
{
// AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
// regardless of the value of the FZ bit.
@@ -9419,7 +9400,6 @@ FORCE_INLINE uint64_t _rdtsc(void)
#if defined(__GNUC__) || defined(__clang__)
#pragma pop_macro("ALIGN_STRUCT")
#pragma pop_macro("FORCE_INLINE")
#pragma pop_macro("FORCE_INLINE_OPTNONE")
#endif

#if defined(__GNUC__) && !defined(__clang__)
6 changes: 5 additions & 1 deletion jsrc/v1.c
@@ -7,6 +7,10 @@
#include "vcomp.h"
#include "ve.h"

#ifdef BOXEDSPARSE
extern UC fboxedsparse;
#endif

#ifdef MMSC_VER
#pragma warning(disable: 4244)
#endif
@@ -405,7 +409,7 @@ static B jtmatchsub(J jt,A a,A w,B* RESTRICT x,I af,I wf,I m,I n,I b1){C*av,*wv;
if(unlikely(t&FUNC))R (!eqf(a,w))^(x==0?1:b1); // true value, but switch if return is not 'match'
if(unlikely(t&NAME))R (!eqname(a,w))^(x==0?1:b1); // true value, but switch if return is not 'match'
#ifdef BOXEDSPARSE
if(unlikely(ISSPARSE(at|wt)))R num(1)==matchs(a,w);
if(fboxedsparse) if(unlikely(ISSPARSE(at|wt)))R num(1)==matchs(a,w);
#endif
// If the types mismatch, convert as needed to the common (unsafe) type calculated earlier
if(at!=wt) {
10 changes: 9 additions & 1 deletion jsrc/vo.c
@@ -6,6 +6,10 @@
#define ZZDEFN
#include "result.h"

#ifdef BOXEDSPARSE
extern UC fboxedsparse;
#endif

I level(J jt,A w){A*wv;I d,j;
ARGCHK1(w);
if((-AN(w)&-(AT(w)&BOX))>=0)R 0;
@@ -31,6 +35,8 @@ F1(jtbox){A y,z,*zv;C*wv;I f,k,m,n,r,wr,*ws;
F1PREFIP;ARGCHK1(w);I wt=AT(w); FLAGT waf=AFLAG(w);
#ifndef BOXEDSPARSE
ASSERTF(!ISSPARSE(wt),EVNONCE,"can't box sparse arrays");
#else
ASSERTF(fboxedsparse||!ISSPARSE(wt),EVNONCE,"can't box sparse arrays");
#endif
wr=AR(w); r=(RANKT)jt->ranks; r=wr<r?wr:r; f=wr-r; // no RESETRANK because we call no primitives
if(likely(!f)){
@@ -54,7 +60,7 @@ F1(jtbox){A y,z,*zv;C*wv;I f,k,m,n,r,wr,*ws;
} else {
// <"r
#ifdef BOXEDSPARSE
ASSERTF(!ISSPARSE(wt),EVNONCE,"can't box sparse arrays"); // <"r not implemented
ASSERTF(fboxedsparse||!ISSPARSE(wt),EVNONCE,"can't box sparse arrays"); // <"r not implemented
#endif
ws=AS(w);
CPROD(AN(w),n,f,ws); CPROD(AN(w),m,r,f+ws);
Expand Down Expand Up @@ -125,6 +131,8 @@ F2PREFIP;ARGCHK2(a,w);
#endif
#ifndef BOXEDSPARSE
ASSERTF(!ISSPARSE(AT(a)|AT(w)),EVNONCE,"can't box sparse arrays");
#else
ASSERTF(fboxedsparse||!ISSPARSE(AT(a)|AT(w)),EVNONCE,"can't box sparse arrays");
#endif
I optype=FAV(self)->localuse.lu1.linkvb; // flag: sign set if (,<) or ,&< or (;<) which will always box w; bit 0 set if (,<)
optype|=((I)jtinplace&JTWILLBEOPENED)<<(BOXX-JTWILLBEOPENEDX); // fold in BOX flag that tells us to allow virtual boxed results