Skip to content

Commit

Permalink
kram - simd - add double and long mask support
Browse files Browse the repository at this point in the history
This is only using 16B ops, where AVX/2 expose 32B ops.
  • Loading branch information
alecazam committed Sep 30, 2024
1 parent 85d4148 commit 3ae9626
Show file tree
Hide file tree
Showing 7 changed files with 203 additions and 82 deletions.
6 changes: 6 additions & 0 deletions build2/kram.xcodeproj/project.pbxproj
Original file line number Diff line number Diff line change
Expand Up @@ -396,6 +396,8 @@
70B563A82C857B360089A64F /* KramZipStream.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70B563A52C857B360089A64F /* KramZipStream.cpp */; };
70B563A92C857B360089A64F /* KramZipStream.h in Headers */ = {isa = PBXBuildFile; fileRef = 70B563A62C857B360089A64F /* KramZipStream.h */; };
70B563AA2C857B360089A64F /* KramZipStream.h in Headers */ = {isa = PBXBuildFile; fileRef = 70B563A62C857B360089A64F /* KramZipStream.h */; };
70B686E32CAA3409007ACA58 /* long234.h in Headers */ = {isa = PBXBuildFile; fileRef = 70B686E22CAA3405007ACA58 /* long234.h */; };
70B686E42CAA3409007ACA58 /* long234.h in Headers */ = {isa = PBXBuildFile; fileRef = 70B686E22CAA3405007ACA58 /* long234.h */; };
70CDB65027A1382700A546C1 /* KramDDSHelper.h in Headers */ = {isa = PBXBuildFile; fileRef = 70CDB64E27A1382600A546C1 /* KramDDSHelper.h */; };
70CDB65127A1382700A546C1 /* KramDDSHelper.h in Headers */ = {isa = PBXBuildFile; fileRef = 70CDB64E27A1382600A546C1 /* KramDDSHelper.h */; };
70CDB65227A1382700A546C1 /* KramDDSHelper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70CDB64F27A1382600A546C1 /* KramDDSHelper.cpp */; };
Expand Down Expand Up @@ -764,6 +766,7 @@
70A7BD2F27092A1200DBCCF7 /* hdr_encode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = hdr_encode.h; sourceTree = "<group>"; };
70B563A52C857B360089A64F /* KramZipStream.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = KramZipStream.cpp; sourceTree = "<group>"; };
70B563A62C857B360089A64F /* KramZipStream.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = KramZipStream.h; sourceTree = "<group>"; };
70B686E22CAA3405007ACA58 /* long234.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = long234.h; sourceTree = "<group>"; };
70CDB64E27A1382600A546C1 /* KramDDSHelper.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = KramDDSHelper.h; sourceTree = "<group>"; };
70CDB64F27A1382600A546C1 /* KramDDSHelper.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = KramDDSHelper.cpp; sourceTree = "<group>"; };
70D222D62AC800AC00B9EA23 /* json11.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = json11.h; sourceTree = "<group>"; };
Expand Down Expand Up @@ -807,6 +810,7 @@
705F7EFD2C9FF42700E377B7 /* sse2neon.h */,
705F7EFE2C9FF42700E377B7 /* sse2neon-arm64.h */,
705F7F022C9FF42700E377B7 /* vectormath++.h */,
70B686E22CAA3405007ACA58 /* long234.h */,
7013AD5D2CAA0E18007E5554 /* half234.h */,
7013AD602CAA0E21007E5554 /* int234.h */,
7013AD4E2CAA0818007E5554 /* float234.h */,
Expand Down Expand Up @@ -1513,6 +1517,7 @@
707789F32881BCE2008A51BC /* rdo_bc_encoder.h in Headers */,
70D222D82AC800AC00B9EA23 /* json11.h in Headers */,
706EF01726D15985001C950E /* colourset.h in Headers */,
70B686E42CAA3409007ACA58 /* long234.h in Headers */,
708A6AA42708CE4700BA5410 /* bc6h_utils.h in Headers */,
706EF01826D15985001C950E /* colourblock.h in Headers */,
706EF01926D15985001C950E /* rangefit.h in Headers */,
Expand Down Expand Up @@ -1632,6 +1637,7 @@
707789F42881BCE2008A51BC /* rdo_bc_encoder.h in Headers */,
70D222D92AC800AC00B9EA23 /* json11.h in Headers */,
706EF19126D166C5001C950E /* colourset.h in Headers */,
70B686E32CAA3409007ACA58 /* long234.h in Headers */,
708A6AA52708CE4700BA5410 /* bc6h_utils.h in Headers */,
706EF19226D166C5001C950E /* colourblock.h in Headers */,
706EF19326D166C5001C950E /* rangefit.h in Headers */,
Expand Down
72 changes: 36 additions & 36 deletions libkram/vectormath/double234.h
Original file line number Diff line number Diff line change
Expand Up @@ -122,16 +122,16 @@ SIMD_CALL double2 sqrt(double2 x) {
return vsqrtq_f64(x);
}
SIMD_CALL double4 sqrt(double4 x) {
return double4m(sqrt(x.lo,x.lo), sqrt(x.hi,x.hi));
return double4m(sqrt(x.lo), sqrt(x.hi));
}

// use sse2neon to port this for now
SIMD_CALL double4 reduce_addv(double4 x) {
// 4:1 reduction
x = _mm_hadd_pd(x.lo, x.lo);
x = _mm_hadd_pd(x.hi, x.hi);
x = _mm_hadd_pd(x.lo, x.hi);
return x.x; // repeat x to all values
x.lo = _mm_hadd_pd(x.lo, x.lo);
x.hi = _mm_hadd_pd(x.hi, x.hi);
x.lo = _mm_hadd_pd(x.lo, x.hi);
return x.lo.x; // repeat x to all values
}

SIMD_CALL double reduce_add(double4 x) {
Expand Down Expand Up @@ -179,16 +179,24 @@ SIMD_CALL double4 muladd(double4 x, double4 y, double4 t) {
#endif
}

SIMD_CALL double4 sqrt(double4 x) {
SIMD_CALL double2 sqrt(double2 x) {
return _mm_sqrt_pd(x);
}
SIMD_CALL double4 sqrt(double4 x) {
return double4m(sqrt(x.lo), sqrt(x.hi));
}

// Horizontal sum of a double2, returned with the sum repeated in every lane.
SIMD_CALL double2 reduce_addv(double2 x) {
    // Feed the whole 16B vector to hadd: {x0+x1, x0+x1}.
    // x.lo of a double2 is only one lane, not a 2-lane vector.
    x = _mm_hadd_pd(x, x);
    return x.x; // repeat x to all values
}

SIMD_CALL double4 reduce_addv(double4 x) {
// 4:1 reduction
x = _mm_hadd_pd(x.lo, x.lo);
x = _mm_hadd_pd(x.hi, x.hi);
x = _mm_hadd_pd(x.lo, x.hi);
return x.x; // repeat x to all values
x.lo = _mm_hadd_pd(x.lo, x.lo); // TODO: fix
x.hi = _mm_hadd_pd(x.hi, x.hi);
x.lo = _mm_hadd_pd(x.lo, x.hi);
return x.lo.x; // repeat x to all values
}

SIMD_CALL double reduce_add(double4 x) {
Expand Down Expand Up @@ -247,34 +255,26 @@ SIMD_CALL double4 zeroext(double3 x) {
return (double4){x.x,x.y,x.z,0};
}

// any
SIMD_CALL bool any(long3 x) {
return any(vec3to4(x));
}
SIMD_CALL bool all(long4 x) {
return all(vec3to4(x));
}

// min
SIMD_CALL double2 min(double2 x, double2 y) {
return vec4to2(min(vec2to4(x), vec2to4(y)));
}
//SIMD_CALL double2 min(double2 x, double2 y) {
// return vec4to2(min(vec2to4(x), vec2to4(y)));
//}
// min for double3, routed through the 4-lane min via vec3to4/vec4to3.
SIMD_CALL double3 min(double3 x, double3 y) {
    const auto x4 = vec3to4(x);
    const auto y4 = vec3to4(y);
    return vec4to3(min(x4, y4));
}

// max
SIMD_CALL double2 max(double2 x, double2 y) {
return vec4to2(max(vec2to4(x), vec2to4(y)));
}
//SIMD_CALL double2 max(double2 x, double2 y) {
// return vec4to2(max(vec2to4(x), vec2to4(y)));
//}
// max for double3, routed through the 4-lane max via vec3to4/vec4to3.
SIMD_CALL double3 max(double3 x, double3 y) {
    const auto x4 = vec3to4(x);
    const auto y4 = vec3to4(y);
    return vec4to3(max(x4, y4));
}

// sqrt
SIMD_CALL double2 sqrt(double2 x) {
return vec4to2(sqrt(vec2to4(x)));
}
//SIMD_CALL double2 sqrt(double2 x) {
// return vec4to2(sqrt(vec2to4(x)));
//}
// sqrt for double3, routed through the 4-lane sqrt via vec3to4/vec4to3.
SIMD_CALL double3 sqrt(double3 x) {
    const auto x4 = vec3to4(x);
    return vec4to3(sqrt(x4));
}
Expand Down Expand Up @@ -315,18 +315,18 @@ SIMD_CALL double reduce_add(double3 x) {
}

// reduce_min - arm has double2 op
SIMD_CALL double reduce_min(double2 x) {
return reduce_min(vec2to4(x));
}
//SIMD_CALL double reduce_min(double2 x) {
// return reduce_min(vec2to4(x));
//}

// reduce_min for double3, forwarded to the 4-lane reduction.
// NOTE(review): the 4-lane reduce also sees whatever vec3to4 puts in w -
// confirm that lane cannot win the min.
SIMD_CALL double reduce_min(double3 x) {
    const auto x4 = vec3to4(x);
    return reduce_min(x4);
}

// reduce_max
SIMD_CALL double reduce_max(double2 x) {
return reduce_max(vec2to4(x));
}
//SIMD_CALL double reduce_max(double2 x) {
// return reduce_max(vec2to4(x));
//}

// reduce_max for double3, forwarded to the 4-lane reduction.
// NOTE(review): the 4-lane reduce also sees whatever vec3to4 puts in w -
// confirm that lane cannot win the max.
SIMD_CALL double reduce_max(double3 x) {
    const auto x4 = vec3to4(x);
    return reduce_max(x4);
}
double4 saturate(double4 x);

// muladd - arm has double2 op
SIMD_CALL double2 muladd(double2 x, double2 y, double2 t) {
return vec4to2(muladd(vec2to4(x), vec2to4(y), vec2to4(t)));
}
//SIMD_CALL double2 muladd(double2 x, double2 y, double2 t) {
// return vec4to2(muladd(vec2to4(x), vec2to4(y), vec2to4(t)));
//}
// muladd for double3, routed through the 4-lane muladd via vec3to4/vec4to3.
SIMD_CALL double3 muladd(double3 x, double3 y, double3 t) {
    const auto x4 = vec3to4(x);
    const auto y4 = vec3to4(y);
    const auto t4 = vec3to4(t);
    return vec4to3(muladd(x4, y4, t4));
}
Expand Down
8 changes: 0 additions & 8 deletions libkram/vectormath/float234.h
Original file line number Diff line number Diff line change
Expand Up @@ -186,14 +186,6 @@ SIMD_CALL float4 zeroext(float3 x) {
return (float4){x.x,x.y,x.z,0};
}

// any
SIMD_CALL bool any(int3 x) {
return any(vec3to4(x));
}
SIMD_CALL bool all(int3 x) {
return all(vec3to4(x));
}

// min
SIMD_CALL float2 min(float2 x, float2 y) {
return vec4to2(min(vec2to4(x), vec2to4(y)));
Expand Down
11 changes: 9 additions & 2 deletions libkram/vectormath/int234.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,13 @@ SIMD_CALL bool all(int4 x) {
}
#endif // SIMD_SSE

// any-all
// any for int3, forwarded to the int4 overload.
// NOTE(review): result also depends on what vec3to4 places in w - confirm.
SIMD_CALL bool any(int3 x) {
    const auto x4 = vec3to4(x);
    return any(x4);
}
// all for int3, forwarded to the int4 overload.
// NOTE(review): result also depends on what vec3to4 places in w - confirm.
SIMD_CALL bool all(int3 x) {
    const auto x4 = vec3to4(x);
    return all(x4);
}

// end of implementation
//-----------------------------------
Expand Down Expand Up @@ -94,7 +101,7 @@ SIMD_CALL int3 int3m(int x) {
// Build an int3 from three scalars.
SIMD_CALL int3 int3m(int x, int y, int z) {
    int3 r = {x, y, z};
    return r;
}
SIMD_CALL int3 int3m(int2 v, float z) {
SIMD_CALL int3 int3m(int2 v, int z) {
int3 r; r.xy = v; r.z = z; return r;
}

Expand All @@ -108,7 +115,7 @@ SIMD_CALL int4 int4m(int2 xy, int2 zw) {
// Build an int4 from four scalars.
SIMD_CALL int4 int4m(int x, int y, int z, int w) {
    int4 r = {x, y, z, w};
    return r;
}
SIMD_CALL int4 int4m(int3 v, float w) {
SIMD_CALL int4 int4m(int3 v, int w) {
int4 r; r.xyz = v; r.w = w; return r;
}

Expand Down
132 changes: 132 additions & 0 deletions libkram/vectormath/long234.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
// kram - Copyright 2020-2024 by Alec Miller. - MIT License
// The license and copyright notice shall be included
// in all copies or substantial portions of the Software.

#pragma once

// This is not yet standalone. vectormath++.h includes it.
#if USE_SIMDLIB && SIMD_LONG

#ifdef __cplusplus
extern "C" {
#endif // __cplusplus

// define c vector types
// Apple uses long type here (64-bit on LP64 targets) instead of int64_t
macroVector4TypesStorage(long, long)
macroVector4TypesPacked(long, long)

#if SIMD_RENAME_TO_SIMD_NAMESPACE
macroVector4TypesStorageRenames(long, simd_long)
#endif // SIMD_RENAME_TO_SIMD_NAMESPACE

#ifdef __cplusplus
}

namespace SIMD_NAMESPACE {

macroVector4TypesStorageRenames(long, long)

//-----------------------------------
// implementation - only code simd arch specific

#if SIMD_NEON

// True when either lane has its sign (top) bit set.
SIMD_CALL bool any(long2 x) {
    const auto merged = x.x | x.y;
    return merged & 0x8000000000000000U;
}
// True when any of the three lanes has its sign (top) bit set.
SIMD_CALL bool any(long3 x) {
    const auto merged = x.x | x.y | x.z;
    return merged & 0x8000000000000000U;
}
// any for long4: OR the two halves lane-wise, then test the long2 result.
SIMD_CALL bool any(long4 x) {
    const long2 merged = x.lo | x.hi;
    return any(merged);
}

// True only when both lanes have their sign (top) bit set.
SIMD_CALL bool all(long2 x) {
    const auto common = x.x & x.y;
    return common & 0x8000000000000000U;
}
// True only when all three lanes have their sign (top) bit set.
SIMD_CALL bool all(long3 x) {
    const auto common = x.x & x.y & x.z;
    return common & 0x8000000000000000U;
}
// all for long4: AND the two halves lane-wise, then test the long2 result.
SIMD_CALL bool all(long4 x) {
    const long2 common = x.lo & x.hi;
    return all(common);
}

#endif // SIMD_NEON

// These take long types, which is what comparisons such as a < b generate.
#if SIMD_SSE

// True when either lane's sign bit is set, read via the 2-bit movemask.
// NOTE(review): x is passed straight to _mm_movemask_pd, which takes a
// double vector - this relies on the compiler accepting the conversion.
SIMD_CALL bool any(long2 x) {
    const int signBits = _mm_movemask_pd(x) & 0x3; // 2 bits
    return signBits != 0;
}
// True when any of the three lanes has its sign (top) bit set.
SIMD_CALL bool any(long3 x) {
    // avx/2 have double4 op
    // z has to take part in the OR, same as the NEON path and all(long3)
    return (x.x | x.y | x.z) & 0x8000000000000000U;
}
// any for long4: OR the two 16B halves lane-wise, then test the long2 result.
SIMD_CALL bool any(long4 x) {
    // avx/2 have double4 op
    const long2 merged = x.lo | x.hi;
    return any(merged);
}

// True only when both lanes' sign bits are set, read via the 2-bit movemask.
// NOTE(review): x is passed straight to _mm_movemask_pd, which takes a
// double vector - this relies on the compiler accepting the conversion.
SIMD_CALL bool all(long2 x) {
    const int signBits = _mm_movemask_pd(x) & 0x3; // 2 bits
    return signBits == 0x3;
}
// True only when all three lanes have their sign (top) bit set.
SIMD_CALL bool all(long3 x) {
    // avx/2 have double4 op
    const auto common = x.x & x.y & x.z;
    return common & 0x8000000000000000U;
}
// all for long4: AND the two 16B halves lane-wise, then require both
// remaining lanes to be set.
SIMD_CALL bool all(long4 x) {
    // avx/2 have double4 op
    // must reduce with all(), not any(): any() would pass when only one
    // lane pair is set
    return all(x.lo & x.hi);
}
#endif // SIMD_SSE

// end of implementation
//-----------------------------------

// bitselect
// Per-bit select: bits set in mask come from y, cleared bits come from x.
SIMD_CALL long2 bitselect(long2 x, long2 y, long2 mask) {
    const long2 fromX = x & ~mask;
    const long2 fromY = y & mask;
    return fromX | fromY;
}
// Per-bit select: bits set in mask come from y, cleared bits come from x.
SIMD_CALL long3 bitselect(long3 x, long3 y, long3 mask) {
    const long3 fromX = x & ~mask;
    const long3 fromY = y & mask;
    return fromX | fromY;
}
// Per-bit select: bits set in mask come from y, cleared bits come from x.
SIMD_CALL long4 bitselect(long4 x, long4 y, long4 mask) {
    const long4 fromX = x & ~mask;
    const long4 fromY = y & mask;
    return fromX | fromY;
}

// Build a long2 from a single scalar.
SIMD_CALL long2 long2m(long x) {
    long2 r = x;
    return r;
}
// Build a long2 from two scalars.
SIMD_CALL long2 long2m(long x, long y) {
    long2 r = {x, y};
    return r;
}

// Build a long3 from a single scalar.
SIMD_CALL long3 long3m(long x) {
    long3 r = x;
    return r;
}
// Build a long3 from three scalars.
SIMD_CALL long3 long3m(long x, long y, long z) {
    long3 r = {x, y, z};
    return r;
}
// Build a long3 from a long2 (xy) plus a scalar z.
SIMD_CALL long3 long3m(long2 v, long z) {
    long3 out;
    out.xy = v;
    out.z = z;
    return out;
}


// Build a long4 from a single scalar.
SIMD_CALL long4 long4m(long x) {
    long4 r = x;
    return r;
}
// Build a long4 from two long2 halves: xy then zw.
SIMD_CALL long4 long4m(long2 xy, long2 zw) {
    long4 out;
    out.xy = xy;
    out.zw = zw;
    return out;
}
// Build a long4 from four scalars.
SIMD_CALL long4 long4m(long x, long y, long z, long w) {
    long4 r = {x, y, z, w};
    return r;
}
// Build a long4 from a long3 (xyz) plus a scalar w.
SIMD_CALL long4 long4m(long3 v, long w) {
    long4 out;
    out.xyz = v;
    out.w = w;
    return out;
}

}
#endif // __cplusplus
#endif // USE_SIMDLIB && SIMD_LONG
12 changes: 8 additions & 4 deletions libkram/vectormath/vectormath++.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1444,15 +1444,19 @@ string vecf::str(const double4x4& m) const {

// textbook transpose
double2x2 transpose(const double2x2& x) {
double4 x0, x1;
double2 x0, x1;
x0.xy = x[0];
x1.xy = x[1];

// std::swap would seem faster here?
#if SIMD_SSE
double4 r01 = _mm_unpacklo_pd(x0, x1); // required AVX2
double2 r0 = { x0[0], x1[0] };
double2 r1 = { x0[1], x1[1] };
#else
double4 r01 = vzip1q_f64(x0, x1);
double2 r0 = vzip1q_f64(x0, x1);
double2 r1 = vzip2q_f64(x0, x1);
#endif
return (double2x2){r01.lo, r01.hi};
return (double2x2){r0, r1};
}

double3x3 transpose(const double3x3& x) {
Expand Down
Loading

0 comments on commit 3ae9626

Please sign in to comment.