diff --git a/build2/kram.xcodeproj/project.pbxproj b/build2/kram.xcodeproj/project.pbxproj
index 09ec898..8d2762e 100644
--- a/build2/kram.xcodeproj/project.pbxproj
+++ b/build2/kram.xcodeproj/project.pbxproj
@@ -396,6 +396,8 @@
 		70B563A82C857B360089A64F /* KramZipStream.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70B563A52C857B360089A64F /* KramZipStream.cpp */; };
 		70B563A92C857B360089A64F /* KramZipStream.h in Headers */ = {isa = PBXBuildFile; fileRef = 70B563A62C857B360089A64F /* KramZipStream.h */; };
 		70B563AA2C857B360089A64F /* KramZipStream.h in Headers */ = {isa = PBXBuildFile; fileRef = 70B563A62C857B360089A64F /* KramZipStream.h */; };
+		70B686E32CAA3409007ACA58 /* long234.h in Headers */ = {isa = PBXBuildFile; fileRef = 70B686E22CAA3405007ACA58 /* long234.h */; };
+		70B686E42CAA3409007ACA58 /* long234.h in Headers */ = {isa = PBXBuildFile; fileRef = 70B686E22CAA3405007ACA58 /* long234.h */; };
 		70CDB65027A1382700A546C1 /* KramDDSHelper.h in Headers */ = {isa = PBXBuildFile; fileRef = 70CDB64E27A1382600A546C1 /* KramDDSHelper.h */; };
 		70CDB65127A1382700A546C1 /* KramDDSHelper.h in Headers */ = {isa = PBXBuildFile; fileRef = 70CDB64E27A1382600A546C1 /* KramDDSHelper.h */; };
 		70CDB65227A1382700A546C1 /* KramDDSHelper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70CDB64F27A1382600A546C1 /* KramDDSHelper.cpp */; };
@@ -764,6 +766,7 @@
 		70A7BD2F27092A1200DBCCF7 /* hdr_encode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = hdr_encode.h; sourceTree = "<group>"; };
 		70B563A52C857B360089A64F /* KramZipStream.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = KramZipStream.cpp; sourceTree = "<group>"; };
 		70B563A62C857B360089A64F /* KramZipStream.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = KramZipStream.h; sourceTree = "<group>"; };
+		70B686E22CAA3405007ACA58 /* long234.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = long234.h; sourceTree = "<group>"; };
 		70CDB64E27A1382600A546C1 /* KramDDSHelper.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = KramDDSHelper.h; sourceTree = "<group>"; };
 		70CDB64F27A1382600A546C1 /* KramDDSHelper.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = KramDDSHelper.cpp; sourceTree = "<group>"; };
 		70D222D62AC800AC00B9EA23 /* json11.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = json11.h; sourceTree = "<group>"; };
@@ -807,6 +810,7 @@
 				705F7EFD2C9FF42700E377B7 /* sse2neon.h */,
 				705F7EFE2C9FF42700E377B7 /* sse2neon-arm64.h */,
 				705F7F022C9FF42700E377B7 /* vectormath++.h */,
+				70B686E22CAA3405007ACA58 /* long234.h */,
 				7013AD5D2CAA0E18007E5554 /* half234.h */,
 				7013AD602CAA0E21007E5554 /* int234.h */,
 				7013AD4E2CAA0818007E5554 /* float234.h */,
@@ -1513,6 +1517,7 @@
 				707789F32881BCE2008A51BC /* rdo_bc_encoder.h in Headers */,
 				70D222D82AC800AC00B9EA23 /* json11.h in Headers */,
 				706EF01726D15985001C950E /* colourset.h in Headers */,
+				70B686E42CAA3409007ACA58 /* long234.h in Headers */,
 				708A6AA42708CE4700BA5410 /* bc6h_utils.h in Headers */,
 				706EF01826D15985001C950E /* colourblock.h in Headers */,
 				706EF01926D15985001C950E /* rangefit.h in Headers */,
@@ -1632,6 +1637,7 @@
 				707789F42881BCE2008A51BC /* rdo_bc_encoder.h in Headers */,
 				70D222D92AC800AC00B9EA23 /* json11.h in Headers */,
 				706EF19126D166C5001C950E /* colourset.h in Headers */,
+				70B686E32CAA3409007ACA58 /* long234.h in Headers */,
 				708A6AA52708CE4700BA5410 /* bc6h_utils.h in Headers */,
 				706EF19226D166C5001C950E /* colourblock.h in Headers */,
 				706EF19326D166C5001C950E /* rangefit.h in Headers */,
diff --git a/libkram/vectormath/double234.h b/libkram/vectormath/double234.h
index af6c7e7..f8bedc4 100644
--- a/libkram/vectormath/double234.h
+++ b/libkram/vectormath/double234.h
@@ -122,16 +122,16 @@ SIMD_CALL double2 sqrt(double2 x) {
     return vsqrtq_f64(x);
 }
 SIMD_CALL double4 sqrt(double4 x) {
-    return double4m(sqrt(x.lo,x.lo), sqrt(x.hi,x.hi));
+    return double4m(sqrt(x.lo), sqrt(x.hi));
 }
 
 // use sse2neon to port this for now
 SIMD_CALL double4 reduce_addv(double4 x) {
     // 4:1 reduction
-    x = _mm_hadd_pd(x.lo, x.lo);
-    x = _mm_hadd_pd(x.hi, x.hi);
-    x = _mm_hadd_pd(x.lo, x.hi);
-    return x.x; // repeat x to all values
+    x.lo = _mm_hadd_pd(x.lo, x.hi); // lo = (x+y, z+w)
+    x.lo = _mm_hadd_pd(x.lo, x.lo); // lo = (x+y+z+w, x+y+z+w)
+    x.hi = x.lo;                    // copy the total into the upper pair
+    return x; // sum repeated in all four lanes
 }
 
 SIMD_CALL double reduce_add(double4 x) {
@@ -179,16 +179,24 @@ SIMD_CALL double4 muladd(double4 x, double4 y, double4 t) {
 #endif
 }
 
-SIMD_CALL double4 sqrt(double4 x) {
+SIMD_CALL double2 sqrt(double2 x) {
     return _mm_sqrt_pd(x);
 }
+SIMD_CALL double4 sqrt(double4 x) {
+    return double4m(sqrt(x.lo), sqrt(x.hi));
+}
+
+SIMD_CALL double2 reduce_addv(double2 x) {
+    x = _mm_hadd_pd(x, x); // hadd takes whole vectors: result is (x+y, x+y)
+    return x; // sum repeated in both lanes
+}
 
 SIMD_CALL double4 reduce_addv(double4 x) {
     // 4:1 reduction
-    x = _mm_hadd_pd(x.lo, x.lo);
-    x = _mm_hadd_pd(x.hi, x.hi);
-    x = _mm_hadd_pd(x.lo, x.hi);
-    return x.x; // repeat x to all values
+    x.lo = _mm_hadd_pd(x.lo, x.hi); // lo = (x+y, z+w)
+    x.lo = _mm_hadd_pd(x.lo, x.lo); // lo = (x+y+z+w, x+y+z+w)
+    x.hi = x.lo;                    // copy the total into the upper pair
+    return x; // sum repeated in all four lanes
 }
 
 SIMD_CALL double reduce_add(double4 x) {
@@ -247,34 +255,26 @@ SIMD_CALL double4 zeroext(double3 x) {
     return (double4){x.x,x.y,x.z,0};
 }
 
-// any
-SIMD_CALL bool any(long3 x) {
-    return any(vec3to4(x));
-}
-SIMD_CALL bool all(long4 x) {
-    return all(vec3to4(x));
-}
-
 // min
-SIMD_CALL double2 min(double2 x, double2 y) {
-    return vec4to2(min(vec2to4(x), vec2to4(y)));
-}
+//SIMD_CALL double2 min(double2 x, double2 y) {
+//    return vec4to2(min(vec2to4(x), vec2to4(y)));
+//}
 SIMD_CALL double3 min(double3 x, double3 y) {
     return vec4to3(min(vec3to4(x), vec3to4(y)));
 }
 
 // max
-SIMD_CALL double2 max(double2 x, double2 y) {
-    return vec4to2(max(vec2to4(x), vec2to4(y)));
-}
+//SIMD_CALL double2 max(double2 x, double2 y) {
+//    return vec4to2(max(vec2to4(x), vec2to4(y)));
+//}
 SIMD_CALL double3 max(double3 x, double3 y) {
     return vec4to3(max(vec3to4(x), vec3to4(y)));
 }
 
 // sqrt
-SIMD_CALL double2 sqrt(double2 x) {
-    return vec4to2(sqrt(vec2to4(x)));
-}
+//SIMD_CALL double2 sqrt(double2 x) {
+//    return vec4to2(sqrt(vec2to4(x)));
+//}
 SIMD_CALL double3 sqrt(double3 x) {
     return vec4to3(sqrt(vec3to4(x)));
 }
@@ -315,18 +315,18 @@ SIMD_CALL double reduce_add(double3 x) {
 }
 
 // reduce_min - arm has double2 op
-SIMD_CALL double reduce_min(double2 x) {
-    return reduce_min(vec2to4(x));
-}
+//SIMD_CALL double reduce_min(double2 x) {
+//    return reduce_min(vec2to4(x));
+//}
 
 SIMD_CALL double reduce_min(double3 x) {
     return reduce_min(vec3to4(x));
 }
 
 // reduce_max
-SIMD_CALL double reduce_max(double2 x) {
-    return reduce_max(vec2to4(x));
-}
+//SIMD_CALL double reduce_max(double2 x) {
+//    return reduce_max(vec2to4(x));
+//}
 
 SIMD_CALL double reduce_max(double3 x) {
     return reduce_max(vec3to4(x));
@@ -361,9 +361,9 @@ double3 saturate(double3 x);
 double4 saturate(double4 x);
 
 // muladd - arm has double2 op
-SIMD_CALL double2 muladd(double2 x, double2 y, double2 t) {
-    return vec4to2(muladd(vec2to4(x), vec2to4(y), vec2to4(t)));
-}
+//SIMD_CALL double2 muladd(double2 x, double2 y, double2 t) {
+//    return vec4to2(muladd(vec2to4(x), vec2to4(y), vec2to4(t)));
+//}
 SIMD_CALL double3 muladd(double3 x, double3 y, double3 t) {
     return vec4to3(muladd(vec3to4(x), vec3to4(y), vec3to4(t)));
 }
diff --git a/libkram/vectormath/float234.h b/libkram/vectormath/float234.h
index a70ba6a..99c7a6c 100644
--- a/libkram/vectormath/float234.h
+++ b/libkram/vectormath/float234.h
@@ -186,14 +186,6 @@ SIMD_CALL float4 zeroext(float3 x) {
     return (float4){x.x,x.y,x.z,0};
 }
 
-// any
-SIMD_CALL bool any(int3 x) {
-    return any(vec3to4(x));
-}
-SIMD_CALL bool all(int3 x) {
-    return all(vec3to4(x));
-}
-
 // min
 SIMD_CALL float2 min(float2 x, float2 y) {
     return vec4to2(min(vec2to4(x), vec2to4(y)));
diff --git a/libkram/vectormath/int234.h b/libkram/vectormath/int234.h
index 25fc33d..c542ea2 100644
--- a/libkram/vectormath/int234.h
+++ b/libkram/vectormath/int234.h
@@ -66,6 +66,13 @@ SIMD_CALL bool all(int4 x) {
 }
 #endif // SIMD_SSE
 
+// any-all
+SIMD_CALL bool any(int3 x) {
+    return any(vec3to4(x));
+}
+SIMD_CALL bool all(int3 x) {
+    return all(vec3to4(x));
+}
 
 // end of implementation
 //-----------------------------------
@@ -94,7 +101,7 @@ SIMD_CALL int3 int3m(int x) {
 SIMD_CALL int3 int3m(int x, int y, int z) {
     return {x,y,z};
 }
-SIMD_CALL int3 int3m(int2 v, float z) {
+SIMD_CALL int3 int3m(int2 v, int z) {
     int3 r; r.xy = v; r.z = z; return r;
 }
 
@@ -108,7 +115,7 @@ SIMD_CALL int4 int4m(int2 xy, int2 zw) {
 SIMD_CALL int4 int4m(int x, int y, int z, int w) {
     return {x,y,z,w};
 }
-SIMD_CALL int4 int4m(int3 v, float w) {
+SIMD_CALL int4 int4m(int3 v, int w) {
     int4 r; r.xyz = v; r.w = w; return r;
 }
 
diff --git a/libkram/vectormath/long234.h b/libkram/vectormath/long234.h
new file mode 100644
index 0000000..f9f9ace
--- /dev/null
+++ b/libkram/vectormath/long234.h
@@ -0,0 +1,132 @@
+// kram - Copyright 2020-2024 by Alec Miller. - MIT License
+// The license and copyright notice shall be included
+// in all copies or substantial portions of the Software.
+
+#pragma once
+
+// This is not yet standalone. vectormath++.h includes it.
+#if USE_SIMDLIB && SIMD_LONG
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+// define c vector types
+// Apple uses long type here (64-bit) instead of int64_t
+macroVector4TypesStorage(long, long)
+macroVector4TypesPacked(long, long)
+
+#if SIMD_RENAME_TO_SIMD_NAMESPACE
+macroVector4TypesStorageRenames(long, simd_long)
+#endif // SIMD_RENAME_TO_SIMD_NAMESPACE
+
+#ifdef __cplusplus
+}
+
+namespace SIMD_NAMESPACE {
+
+macroVector4TypesStorageRenames(long, long)
+
+//-----------------------------------
+// implementation - only code simd arch specific
+
+#if SIMD_NEON
+
+SIMD_CALL bool any(long2 x) {
+    return (x.x | x.y) & 0x8000000000000000U;
+}
+SIMD_CALL bool any(long3 x) {
+    return (x.x | x.y | x.z) & 0x8000000000000000U;
+}
+SIMD_CALL bool any(long4 x) {
+    return any(x.lo | x.hi);
+}
+
+SIMD_CALL bool all(long2 x) {
+    return (x.x & x.y) & 0x8000000000000000U;
+}
+SIMD_CALL bool all(long3 x) {
+    return (x.x & x.y & x.z) & 0x8000000000000000U;
+}
+SIMD_CALL bool all(long4 x) {
+    return all(x.lo & x.hi);
+}
+
+#endif // SIMD_NEON
+
+// These take in long types, this is what comparison gens from a < b, etc.
+#if SIMD_SSE
+
+SIMD_CALL bool any(long2 x) {
+    return _mm_movemask_pd(x) & 0x3; // 2 bits
+}
+SIMD_CALL bool any(long3 x) {
+    // avx/2 have double4 op
+    return (x.x | x.y | x.z) & 0x8000000000000000U; // sign bit of any of the 3 lanes
+}
+SIMD_CALL bool any(long4 x) {
+    // avx/2 have double4 op
+    return any(x.lo | x.hi);
+}
+
+SIMD_CALL bool all(long2 x) {
+    return (_mm_movemask_pd(x) & 0x3) == 0x3; // 2 bits
+}
+SIMD_CALL bool all(long3 x) {
+    // avx/2 have double4 op
+    return (x.x & x.y & x.z) & 0x8000000000000000U;
+}
+SIMD_CALL bool all(long4 x) {
+    // avx/2 have double4 op
+    return all(x.lo & x.hi); // and the halves, then require both remaining lanes set
+}
+#endif // SIMD_SSE
+
+// end of implementation
+//-----------------------------------
+
+// bitselect
+SIMD_CALL long2 bitselect(long2 x, long2 y, long2 mask) {
+    return (x & ~mask) | (y & mask);
+}
+SIMD_CALL long3 bitselect(long3 x, long3 y, long3 mask) {
+    return (x & ~mask) | (y & mask);
+}
+SIMD_CALL long4 bitselect(long4 x, long4 y, long4 mask) {
+    return (x & ~mask) | (y & mask);
+}
+
+SIMD_CALL long2 long2m(long x) {
+    return x;
+}
+SIMD_CALL long2 long2m(long x, long y) {
+    return {x,y};
+}
+
+SIMD_CALL long3 long3m(long x) {
+    return x;
+}
+SIMD_CALL long3 long3m(long x, long y, long z) {
+    return {x,y,z};
+}
+SIMD_CALL long3 long3m(long2 v, long z) {
+    long3 r; r.xy = v; r.z = z; return r;
+}
+
+
+SIMD_CALL long4 long4m(long x) {
+    return x;
+}
+SIMD_CALL long4 long4m(long2 xy, long2 zw) {
+    long4 r; r.xy = xy; r.zw = zw; return r;
+}
+SIMD_CALL long4 long4m(long x, long y, long z, long w) {
+    return {x,y,z,w};
+}
+SIMD_CALL long4 long4m(long3 v, long w) {
+    long4 r; r.xyz = v; r.w = w; return r;
+}
+
+}
+#endif // __cplusplus
+#endif // USE_SIMDLIB && SIMD_LONG
diff --git a/libkram/vectormath/vectormath++.cpp b/libkram/vectormath/vectormath++.cpp
index bb08dd0..d7a2e9e 100644
--- a/libkram/vectormath/vectormath++.cpp
+++ b/libkram/vectormath/vectormath++.cpp
@@ -1444,15 +1444,19 @@ string vecf::str(const double4x4& m) const {
 // textbook transpose
 double2x2 transpose(const double2x2& x)
 {
-    double4 x0, x1;
+    double2 x0, x1;
     x0.xy = x[0];
     x1.xy = x[1];
+
+    // std::swap would seem faster here?
 #if SIMD_SSE
-    double4 r01 = _mm_unpacklo_pd(x0, x1); // required AVX2
+    double2 r0 = { x0[0], x1[0] };
+    double2 r1 = { x0[1], x1[1] };
 #else
-    double4 r01 = vzip1q_f64(x0, x1);
+    double2 r0 = vzip1q_f64(x0, x1);
+    double2 r1 = vzip2q_f64(x0, x1);
 #endif
-    return (double2x2){r01.lo, r01.hi};
+    return (double2x2){r0, r1};
 }
 
 double3x3 transpose(const double3x3& x) {
diff --git a/libkram/vectormath/vectormath++.h b/libkram/vectormath/vectormath++.h
index b410c28..0237048 100644
--- a/libkram/vectormath/vectormath++.h
+++ b/libkram/vectormath/vectormath++.h
@@ -136,22 +136,24 @@
 // a define to override setings from prefix file
 #ifndef SIMD_CONFIG
 
+// fp comparisons gen a corresponding signed integer type
 #define SIMD_INT 1
 #define SIMD_LONG 1
 
+// don't need these yet, doing math, not string processing
+#define SIMD_CHAR 0
+#define SIMD_SHORT 0
+//#define SIMD_UCHAR 0
+//#define SIMD_USHORT 0
+//#define SIMD_ULONG 0
+
 // Vector and matrix types. Currently only matrix types for SIMD_FLOAT, SIMD_DOUBLE.
 // SIMD_INT must be kept on for conditional tests.
 // SIMD_HALF for bitselect would need SIMD_SHORT or SIMD_INT?
+// #define SIMD_HALF (1 && SIMD_SHORT)
 #define SIMD_HALF (1)
 #define SIMD_FLOAT (1 && SIMD_INT)
-#define SIMD_DOUBLE (0 && SIMD_LONG)
-
-
-#define SIMD_CHAR 0
-//#define SIMD_UCHAR 0
-#define SIMD_SHORT 0
-//#define SIMD_USHORT 0
-//#define SIMD_ULONG 0
+#define SIMD_DOUBLE (1 && SIMD_LONG)
 
 // Whether to support > 4 length vecs with some ops
 #define SIMD_FLOAT_EXT 0
@@ -339,6 +341,8 @@ SIMD_CALL type::column_t operator*(const type& x, const type::column_t& v) { ret
 
 // moved vec/matrix ops into secondary headers
 #include "int234.h"
+#include "long234.h"
+
 #include "half234.h"
 #include "float234.h"
 #include "double234.h"
@@ -392,30 +396,6 @@ macroVector2TypesStorageRenames(short, short)
 #endif // __cplusplus
 #endif // SIMD_SHORT
 
-//------------
-#if SIMD_LONG
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// define c vector types
-macroVector8TypesStorage(long, long)
-macroVector8TypesPacked(long, long)
-
-#if SIMD_RENAME_TO_SIMD_NAMESPACE
-macroVector8TypesStorageRenames(long, simd_long)
-#endif // SIMD_RENAME_TO_SIMD_NAMESPACE
-
-#ifdef __cplusplus
-}
-
-namespace SIMD_NAMESPACE {
-macroVector8TypesStorageRenames(long, long)
-}
-#endif // __cplusplus
-#endif // SIMD_LONG
-
 //-------------------
 
 #ifdef __cplusplus