kram - simd - add double and long mask support

This is only using 16B ops, where AVX/2 expose 32B ops.
alecazam · Sep 30, 2024 · 3ae9626 · 3ae9626
1 parent 85d4148
commit 3ae9626
Show file tree

Hide file tree

Showing 7 changed files with 203 additions and 82 deletions.
diff --git a/build2/kram.xcodeproj/project.pbxproj b/build2/kram.xcodeproj/project.pbxproj
@@ -396,6 +396,8 @@
 		70B563A82C857B360089A64F /* KramZipStream.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70B563A52C857B360089A64F /* KramZipStream.cpp */; };
 		70B563A92C857B360089A64F /* KramZipStream.h in Headers */ = {isa = PBXBuildFile; fileRef = 70B563A62C857B360089A64F /* KramZipStream.h */; };
 		70B563AA2C857B360089A64F /* KramZipStream.h in Headers */ = {isa = PBXBuildFile; fileRef = 70B563A62C857B360089A64F /* KramZipStream.h */; };
+		70B686E32CAA3409007ACA58 /* long234.h in Headers */ = {isa = PBXBuildFile; fileRef = 70B686E22CAA3405007ACA58 /* long234.h */; };
+		70B686E42CAA3409007ACA58 /* long234.h in Headers */ = {isa = PBXBuildFile; fileRef = 70B686E22CAA3405007ACA58 /* long234.h */; };
 		70CDB65027A1382700A546C1 /* KramDDSHelper.h in Headers */ = {isa = PBXBuildFile; fileRef = 70CDB64E27A1382600A546C1 /* KramDDSHelper.h */; };
 		70CDB65127A1382700A546C1 /* KramDDSHelper.h in Headers */ = {isa = PBXBuildFile; fileRef = 70CDB64E27A1382600A546C1 /* KramDDSHelper.h */; };
 		70CDB65227A1382700A546C1 /* KramDDSHelper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70CDB64F27A1382600A546C1 /* KramDDSHelper.cpp */; };
@@ -764,6 +766,7 @@
 		70A7BD2F27092A1200DBCCF7 /* hdr_encode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = hdr_encode.h; sourceTree = "<group>"; };
 		70B563A52C857B360089A64F /* KramZipStream.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = KramZipStream.cpp; sourceTree = "<group>"; };
 		70B563A62C857B360089A64F /* KramZipStream.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = KramZipStream.h; sourceTree = "<group>"; };
+		70B686E22CAA3405007ACA58 /* long234.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = long234.h; sourceTree = "<group>"; };
 		70CDB64E27A1382600A546C1 /* KramDDSHelper.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = KramDDSHelper.h; sourceTree = "<group>"; };
 		70CDB64F27A1382600A546C1 /* KramDDSHelper.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = KramDDSHelper.cpp; sourceTree = "<group>"; };
 		70D222D62AC800AC00B9EA23 /* json11.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = json11.h; sourceTree = "<group>"; };
@@ -807,6 +810,7 @@
 				705F7EFD2C9FF42700E377B7 /* sse2neon.h */,
 				705F7EFE2C9FF42700E377B7 /* sse2neon-arm64.h */,
 				705F7F022C9FF42700E377B7 /* vectormath++.h */,
+				70B686E22CAA3405007ACA58 /* long234.h */,
 				7013AD5D2CAA0E18007E5554 /* half234.h */,
 				7013AD602CAA0E21007E5554 /* int234.h */,
 				7013AD4E2CAA0818007E5554 /* float234.h */,
@@ -1513,6 +1517,7 @@
 				707789F32881BCE2008A51BC /* rdo_bc_encoder.h in Headers */,
 				70D222D82AC800AC00B9EA23 /* json11.h in Headers */,
 				706EF01726D15985001C950E /* colourset.h in Headers */,
+				70B686E42CAA3409007ACA58 /* long234.h in Headers */,
 				708A6AA42708CE4700BA5410 /* bc6h_utils.h in Headers */,
 				706EF01826D15985001C950E /* colourblock.h in Headers */,
 				706EF01926D15985001C950E /* rangefit.h in Headers */,
@@ -1632,6 +1637,7 @@
 				707789F42881BCE2008A51BC /* rdo_bc_encoder.h in Headers */,
 				70D222D92AC800AC00B9EA23 /* json11.h in Headers */,
 				706EF19126D166C5001C950E /* colourset.h in Headers */,
+				70B686E32CAA3409007ACA58 /* long234.h in Headers */,
 				708A6AA52708CE4700BA5410 /* bc6h_utils.h in Headers */,
 				706EF19226D166C5001C950E /* colourblock.h in Headers */,
 				706EF19326D166C5001C950E /* rangefit.h in Headers */,

diff --git a/libkram/vectormath/double234.h b/libkram/vectormath/double234.h
@@ -122,16 +122,16 @@ SIMD_CALL double2 sqrt(double2 x) {
     return vsqrtq_f64(x);
 }
 SIMD_CALL double4 sqrt(double4 x) {
-    return double4m(sqrt(x.lo,x.lo), sqrt(x.hi,x.hi));
+    return double4m(sqrt(x.lo), sqrt(x.hi));
 }
 
 // use sse2neon to port this for now
 SIMD_CALL double4 reduce_addv(double4 x) {
     // 4:1 reduction
-    x = _mm_hadd_pd(x.lo, x.lo);
-    x = _mm_hadd_pd(x.hi, x.hi);
-    x = _mm_hadd_pd(x.lo, x.hi);
-    return x.x; // repeat x to all values
+    // hadd(a, b) = (a0+a1, b0+b1), so pair lo with hi first, then fold once more
+    x.lo = _mm_hadd_pd(x.lo, x.hi); // (x+y, z+w)
+    x.lo = _mm_hadd_pd(x.lo, x.lo); // (x+y+z+w, x+y+z+w)
+    return x.lo.x; // repeat x to all values
 }
 
 SIMD_CALL double reduce_add(double4 x) {
@@ -179,16 +179,24 @@ SIMD_CALL double4 muladd(double4 x, double4 y, double4 t) {
 #endif
 }
 
-SIMD_CALL double4 sqrt(double4 x) {
+SIMD_CALL double2 sqrt(double2 x) {
     return _mm_sqrt_pd(x);
 }
+SIMD_CALL double4 sqrt(double4 x) {
+    return double4m(sqrt(x.lo), sqrt(x.hi));
+}
+
+SIMD_CALL double2 reduce_addv(double2 x) {
+    x = _mm_hadd_pd(x, x); // 2:1 reduction on the whole vector: (x+y, x+y)
+    return x.x; // repeat the sum to all values
+}
 
 SIMD_CALL double4 reduce_addv(double4 x) {
     // 4:1 reduction
-    x = _mm_hadd_pd(x.lo, x.lo);
-    x = _mm_hadd_pd(x.hi, x.hi);
-    x = _mm_hadd_pd(x.lo, x.hi);
-    return x.x; // repeat x to all values
+    // hadd(a, b) = (a0+a1, b0+b1), so pair lo with hi first, then fold once more
+    x.lo = _mm_hadd_pd(x.lo, x.hi); // (x+y, z+w)
+    x.lo = _mm_hadd_pd(x.lo, x.lo); // (x+y+z+w, x+y+z+w)
+    return x.lo.x; // repeat x to all values
 }
 
 SIMD_CALL double reduce_add(double4 x) {
@@ -247,34 +255,26 @@ SIMD_CALL double4 zeroext(double3 x) {
     return (double4){x.x,x.y,x.z,0};
 }
 
-// any
-SIMD_CALL bool any(long3 x) {
-    return any(vec3to4(x));
-}
-SIMD_CALL bool all(long4 x) {
-    return all(vec3to4(x));
-}
-
 // min
-SIMD_CALL double2 min(double2 x, double2 y) {
-    return vec4to2(min(vec2to4(x), vec2to4(y)));
-}
+//SIMD_CALL double2 min(double2 x, double2 y) {
+//    return vec4to2(min(vec2to4(x), vec2to4(y)));
+//}
 SIMD_CALL double3 min(double3 x, double3 y) {
     return vec4to3(min(vec3to4(x), vec3to4(y)));
 }
 
 // max
-SIMD_CALL double2 max(double2 x, double2 y) {
-    return vec4to2(max(vec2to4(x), vec2to4(y)));
-}
+//SIMD_CALL double2 max(double2 x, double2 y) {
+//    return vec4to2(max(vec2to4(x), vec2to4(y)));
+//}
 SIMD_CALL double3 max(double3 x, double3 y) {
     return vec4to3(max(vec3to4(x), vec3to4(y)));
 }
 
 // sqrt
-SIMD_CALL double2 sqrt(double2 x) {
-    return vec4to2(sqrt(vec2to4(x)));
-}
+//SIMD_CALL double2 sqrt(double2 x) {
+//    return vec4to2(sqrt(vec2to4(x)));
+//}
 SIMD_CALL double3 sqrt(double3 x) {
     return vec4to3(sqrt(vec3to4(x)));
 }
@@ -315,18 +315,18 @@ SIMD_CALL double reduce_add(double3 x) {
 }
 
 // reduce_min - arm has double2 op
-SIMD_CALL double reduce_min(double2 x) {
-    return reduce_min(vec2to4(x));
-}
+//SIMD_CALL double reduce_min(double2 x) {
+//    return reduce_min(vec2to4(x));
+//}
 
 SIMD_CALL double reduce_min(double3 x) {
     return reduce_min(vec3to4(x));
 }
 
 // reduce_max
-SIMD_CALL double reduce_max(double2 x) {
-    return reduce_max(vec2to4(x));
-}
+//SIMD_CALL double reduce_max(double2 x) {
+//    return reduce_max(vec2to4(x));
+//}
 
 SIMD_CALL double reduce_max(double3 x) {
     return reduce_max(vec3to4(x));
@@ -361,9 +361,9 @@ double3 saturate(double3 x);
 double4 saturate(double4 x);
 
 // muladd - arm has double2 op
-SIMD_CALL double2 muladd(double2 x, double2 y, double2 t) {
-    return vec4to2(muladd(vec2to4(x), vec2to4(y), vec2to4(t)));
-}
+//SIMD_CALL double2 muladd(double2 x, double2 y, double2 t) {
+//    return vec4to2(muladd(vec2to4(x), vec2to4(y), vec2to4(t)));
+//}
 SIMD_CALL double3 muladd(double3 x, double3 y, double3 t) {
     return vec4to3(muladd(vec3to4(x), vec3to4(y), vec3to4(t)));
 }

diff --git a/libkram/vectormath/float234.h b/libkram/vectormath/float234.h
@@ -186,14 +186,6 @@ SIMD_CALL float4 zeroext(float3 x) {
     return (float4){x.x,x.y,x.z,0};
 }
 
-// any
-SIMD_CALL bool any(int3 x) {
-    return any(vec3to4(x));
-}
-SIMD_CALL bool all(int3 x) {
-    return all(vec3to4(x));
-}
-
 // min
 SIMD_CALL float2 min(float2 x, float2 y) {
     return vec4to2(min(vec2to4(x), vec2to4(y)));

diff --git a/libkram/vectormath/int234.h b/libkram/vectormath/int234.h
@@ -66,6 +66,13 @@ SIMD_CALL bool all(int4 x) {
 }
 #endif // SIMD_SSE
 
+// any-all
+SIMD_CALL bool any(int3 x) {
+    return any(vec3to4(x));
+}
+SIMD_CALL bool all(int3 x) {
+    return all(vec3to4(x));
+}
 
 // end of implementation
 //-----------------------------------
@@ -94,7 +101,7 @@ SIMD_CALL int3 int3m(int x) {
 SIMD_CALL int3 int3m(int x, int y, int z) {
     return {x,y,z};
 }
-SIMD_CALL int3 int3m(int2 v, float z) {
+SIMD_CALL int3 int3m(int2 v, int z) {
     int3 r; r.xy = v; r.z = z; return r;
 }
 
@@ -108,7 +115,7 @@ SIMD_CALL int4 int4m(int2 xy, int2 zw) {
 SIMD_CALL int4 int4m(int x, int y, int z, int w) {
     return {x,y,z,w};
 }
-SIMD_CALL int4 int4m(int3 v, float w) {
+SIMD_CALL int4 int4m(int3 v, int w) {
     int4 r; r.xyz = v; r.w = w; return r;
 }
 

diff --git a/libkram/vectormath/long234.h b/libkram/vectormath/long234.h
@@ -0,0 +1,132 @@
+// kram - Copyright 2020-2024 by Alec Miller. - MIT License
+// The license and copyright notice shall be included
+// in all copies or substantial portions of the Software.
+
+#pragma once
+
+// This is not yet standalone.  vectormath++.h includes it.
+#if USE_SIMDLIB && SIMD_LONG
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+// define c vector types
+// Apple uses long type here (64-bit) instead of int64_t
+macroVector4TypesStorage(long, long)
+macroVector4TypesPacked(long, long)
+
+#if SIMD_RENAME_TO_SIMD_NAMESPACE
+macroVector4TypesStorageRenames(long, simd_long)
+#endif // SIMD_RENAME_TO_SIMD_NAMESPACE
+
+#ifdef __cplusplus
+}
+
+namespace SIMD_NAMESPACE {
+
+macroVector4TypesStorageRenames(long, long)
+
+//-----------------------------------
+// implementation - only code simd arch specific
+
+#if SIMD_NEON
+
+SIMD_CALL bool any(long2 x) {
+    return (x.x | x.y) & 0x8000000000000000U;
+}
+SIMD_CALL bool any(long3 x) {
+    return (x.x | x.y | x.z) & 0x8000000000000000U;
+}
+SIMD_CALL bool any(long4 x) {
+    return any(x.lo | x.hi);
+}
+
+SIMD_CALL bool all(long2 x) {
+    return (x.x & x.y) & 0x8000000000000000U;
+}
+SIMD_CALL bool all(long3 x) {
+    return (x.x & x.y & x.z) & 0x8000000000000000U;
+}
+SIMD_CALL bool all(long4 x) {
+    return all(x.lo & x.hi);
+}
+
+#endif // SIMD_NEON
+
+// These take in long types, this is what comparison gens from a < b, etc.
+#if SIMD_SSE
+
+SIMD_CALL bool any(long2 x) {
+    return _mm_movemask_pd(x) & 0x3; // 2 bits
+}
+SIMD_CALL bool any(long3 x) {
+    // avx/2 have double4 op
+    return (x.x | x.y | x.z) & 0x8000000000000000U; // sign bit of any lane, z included
+}
+SIMD_CALL bool any(long4 x) {
+    // avx/2 have double4 op
+    return any(x.lo | x.hi);
+}
+
+SIMD_CALL bool all(long2 x) {
+    return (_mm_movemask_pd(x) & 0x3) == 0x3; // 2 bits
+}
+SIMD_CALL bool all(long3 x) {
+    // avx/2 have double4 op
+    return (x.x & x.y & x.z) & 0x8000000000000000U;
+}
+SIMD_CALL bool all(long4 x) {
+    // avx/2 have double4 op
+    return all(x.lo & x.hi); // and the halves, then require both remaining lanes set
+}
+#endif // SIMD_SSE
+
+// end of implementation
+//-----------------------------------
+
+// bitselect
+SIMD_CALL long2 bitselect(long2 x, long2 y, long2 mask) {
+    return (x & ~mask) | (y & mask);
+}
+SIMD_CALL long3 bitselect(long3 x, long3 y, long3 mask) {
+    return (x & ~mask) | (y & mask);
+}
+SIMD_CALL long4 bitselect(long4 x, long4 y, long4 mask) {
+    return (x & ~mask) | (y & mask);
+}
+
+SIMD_CALL long2 long2m(long x) {
+    return x;
+}
+SIMD_CALL long2 long2m(long x, long y) {
+    return {x,y};
+}
+
+SIMD_CALL long3 long3m(long x) {
+    return x;
+}
+SIMD_CALL long3 long3m(long x, long y, long z) {
+    return {x,y,z};
+}
+SIMD_CALL long3 long3m(long2 v, long z) {
+    long3 r; r.xy = v; r.z = z; return r;
+}
+
+
+SIMD_CALL long4 long4m(long x) {
+    return x;
+}
+SIMD_CALL long4 long4m(long2 xy, long2 zw) {
+    long4 r; r.xy = xy; r.zw = zw; return r;
+}
+SIMD_CALL long4 long4m(long x, long y, long z, long w) {
+    return {x,y,z,w};
+}
+SIMD_CALL long4 long4m(long3 v, long w) {
+    long4 r; r.xyz = v; r.w = w; return r;
+}
+
+}
+#endif // __cplusplus
+#endif // USE_SIMDLIB && SIMD_LONG
diff --git a/libkram/vectormath/vectormath++.cpp b/libkram/vectormath/vectormath++.cpp
@@ -1444,15 +1444,19 @@ string vecf::str(const double4x4& m) const {
 
 // textbook transpose 
 double2x2 transpose(const double2x2& x) {
-    double4 x0, x1;
+    double2 x0, x1;
     x0.xy = x[0];
     x1.xy = x[1];
+
+    // std::swap would seem faster here?
 #if SIMD_SSE
-    double4 r01 = _mm_unpacklo_pd(x0, x1); // required AVX2
+    double2 r0 = { x0[0], x1[0] };
+    double2 r1 = { x0[1], x1[1] };
 #else
-    double4 r01 = vzip1q_f64(x0, x1);
+    double2 r0 = vzip1q_f64(x0, x1);
+    double2 r1 = vzip2q_f64(x0, x1);
 #endif
-    return (double2x2){r01.lo, r01.hi};
+    return (double2x2){r0, r1};
 }
 
 double3x3 transpose(const double3x3& x) {