Skip to content

Commit

Permalink
Enable fast Huffman & Huffman zig-zag transform for Arm Neon (#1323)
Browse files Browse the repository at this point in the history
* Enable fast Huffman decoding on macOS

Enable fast Huffman decoding for macOS (x86 and Apple silicon)

Signed-off-by: Developer Ecosystem Engineering <[email protected]>

* Implement Huffman zig-zag transform

Implements the Huffman zig-zag transform and 32-bit to 16-bit floating-point conversion for Arm Neon

Signed-off-by: Developer Ecosystem Engineering <[email protected]>

Signed-off-by: Developer Ecosystem Engineering <[email protected]>
  • Loading branch information
Developer-Ecosystem-Engineering committed Jan 13, 2023
1 parent 71bffa3 commit 436fcd2
Show file tree
Hide file tree
Showing 5 changed files with 184 additions and 34 deletions.
7 changes: 7 additions & 0 deletions src/lib/OpenEXR/ImfDwaCompressor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2786,6 +2786,13 @@ DwaCompressor::initializeFuncs ()
fromHalfZigZag = fromHalfZigZag_f16c;
}

#ifdef IMF_HAVE_NEON
{
convertFloatToHalf64 = convertFloatToHalf64_neon;
fromHalfZigZag = fromHalfZigZag_neon;
}
#endif

//
// Setup inverse DCT implementations
//
Expand Down
49 changes: 49 additions & 0 deletions src/lib/OpenEXR/ImfDwaCompressorSimd.h
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,18 @@ convertFloatToHalf64_scalar (unsigned short* dst, float* src)
dst[i] = ((half) src[i]).bits ();
}

#ifdef IMF_HAVE_NEON

//
// Neon conversion - converts 64 floats at src into 64 half bit
// patterns at dst, eight values per iteration.
//
// IMF_HAVE_NEON can also be set on 32-bit Arm targets, where the
// float-to-half vector conversion is not guaranteed to be available.
// The vector path is therefore limited to AArch64; every other Neon
// target forwards to the scalar implementation so that the function
// exists (and callers keep compiling) wherever IMF_HAVE_NEON is set.
//

void
convertFloatToHalf64_neon (unsigned short* dst, float* src)
{
#    if defined(__aarch64__)
    for (int i = 0; i < 64; i += 8)
    {
        // Two plain 128-bit loads instead of vld1q_f32_x2: the result
        // is the same, and the _x2 form is missing from some toolchains.
        const float32x4_t lowerFloats = vld1q_f32 (src + i);
        const float32x4_t upperFloats = vld1q_f32 (src + i + 4);

        // Narrow each group of four floats to four halfs and view the
        // result as raw 16-bit patterns.
        const uint16x4_t lowerBits =
            vreinterpret_u16_f16 (vcvt_f16_f32 (lowerFloats));
        const uint16x4_t upperBits =
            vreinterpret_u16_f16 (vcvt_f16_f32 (upperFloats));

        vst1q_u16 (dst + i, vcombine_u16 (lowerBits, upperBits));
    }
#    else
    convertFloatToHalf64_scalar (dst, src);
#    endif
}
#endif

//
// F16C conversion - Assumes aligned src and dst
//
Expand Down Expand Up @@ -809,6 +821,43 @@ fromHalfZigZag_f16c (unsigned short* src, float* dst)
#endif /* defined IMF_HAVE_GCC_INLINEASM_X86_64 */
}

#ifdef IMF_HAVE_NEON

//
// Neon version of fromHalfZigZag: reads 64 half bit patterns from src,
// reorders them and writes them to dst as 64 floats, such that
//
//     dst[k] = float (src[zigZagIndex[k]])
//
// The 64 halfs are split into a plane of low bytes and a plane of
// high bytes (64 bytes each), so that one 64-byte table lookup per
// plane can gather any 16 values; the two byte planes are then zipped
// back into 16-bit values and widened to float.
//
// The table-lookup, zip and high-half widening intrinsics used here
// exist only on AArch64, while IMF_HAVE_NEON can also be set on 32-bit
// Arm. Other Neon targets therefore forward to the scalar version so
// the function is defined wherever IMF_HAVE_NEON is.
//

void
fromHalfZigZag_neon (unsigned short* __restrict__ src, float* __restrict__ dst)
{
#    if defined(__aarch64__)
    // Kept as a plain byte array and loaded with vld1q_u8, so that no
    // compiler-specific vector initializer syntax is needed.
    static const uint8_t zigZagIndex[64] = {
        0,  1,  5,  6,  14, 15, 27, 28, 2,  4,  7,  13, 16, 26, 29, 42,
        3,  8,  12, 17, 25, 30, 41, 43, 9,  11, 18, 24, 31, 40, 44, 53,
        10, 19, 23, 32, 39, 45, 52, 54, 20, 22, 33, 38, 46, 51, 55, 60,
        21, 34, 37, 47, 50, 56, 59, 61, 35, 36, 48, 49, 57, 58, 62, 63};

    uint8x16x4_t srcLowBytes, srcHighBytes;

    // De-interleave: even bytes (low halves of each 16-bit value) go to
    // one plane, odd bytes (high halves) to the other.
    for (int i = 0; i < 4; i++)
    {
        const uint8x16x2_t split =
            vld2q_u8 ((const unsigned char*) (src + 16 * i));
        srcLowBytes.val[i]  = split.val[0];
        srcHighBytes.val[i] = split.val[1];
    }

    // The unroll hint is a clang pragma; other compilers would only
    // warn about it.
#    if defined(__clang__)
#        pragma unroll(4)
#    endif
    for (int i = 0; i < 4; i++)
    {
        const uint8x16_t index = vld1q_u8 (zigZagIndex + 16 * i);

        // Gather the 16 reordered values of this output row, one byte
        // plane at a time.
        const uint8x16_t lowBytes  = vqtbl4q_u8 (srcLowBytes, index);
        const uint8x16_t highBytes = vqtbl4q_u8 (srcHighBytes, index);

        // Re-interleave low/high bytes into 16-bit halfs: zip1 yields
        // values 0-7 of the row, zip2 yields values 8-15.
        const float16x8_t firstEight =
            vreinterpretq_f16_u8 (vzip1q_u8 (lowBytes, highBytes));
        const float16x8_t lastEight =
            vreinterpretq_f16_u8 (vzip2q_u8 (lowBytes, highBytes));

        float* row = dst + 16 * i;
        vst1q_f32 (row, vcvt_f32_f16 (vget_low_f16 (firstEight)));
        vst1q_f32 (row + 4, vcvt_high_f32_f16 (firstEight));
        vst1q_f32 (row + 8, vcvt_f32_f16 (vget_low_f16 (lastEight)));
        vst1q_f32 (row + 12, vcvt_high_f32_f16 (lastEight));
    }
#    else
    fromHalfZigZag_scalar (src, dst);
#    endif
}

#endif // IMF_HAVE_NEON

//
// Inverse 8x8 DCT, only inverting the DC. This assumes that
// all AC frequencies are 0.
Expand Down
83 changes: 49 additions & 34 deletions src/lib/OpenEXR/ImfFastHuf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,40 @@
#include <string.h>
#include <vector>

// Static enabling/disabling the fast huffman decode


//
// Each branch below selects by compiler family, then by target.
//
// The Apple test is combined with the clang test on purpose: clang
// also defines __GNUC__ (and clang-cl defines _MSC_VER), so clang
// builds for non-Apple targets must be allowed to fall through to the
// branches below instead of being stopped by a bare __clang__ check.
//

#if defined(__clang__) && defined(__APPLE__)
//
// Enabled for clang on Apple platforms (tested), x86 and Apple silicon.
//

#    define OPENEXR_IMF_ENABLE_FAST_HUF_DECODER

#elif defined(__INTEL_COMPILER) || defined(__GNUC__)
//
// Enabled for ICC, GCC (and compilers that identify as GCC):
//       __i386__   -> x86
//       __x86_64__ -> 64-bit x86
//       __e2k__    -> e2k (MCST Elbrus 2000)

#    if defined(__i386__) || defined(__x86_64__) || defined(__e2k__)
#        define OPENEXR_IMF_ENABLE_FAST_HUF_DECODER
#    endif

#elif defined(_MSC_VER)
//
// Enabled for Visual Studio:
//        _M_IX86 -> x86
//        _M_X64  -> 64bit x86

#    if defined(_M_IX86) || defined(_M_X64)
#        define OPENEXR_IMF_ENABLE_FAST_HUF_DECODER
#    endif
#endif

OPENEXR_IMF_INTERNAL_NAMESPACE_SOURCE_ENTER

//
Expand Down Expand Up @@ -274,50 +308,31 @@ FastHufDecoder::~FastHufDecoder ()
((uint64_t) (c)[4] << 24) | ((uint64_t) (c)[5] << 16) | \
((uint64_t) (c)[6] << 8) | ((uint64_t) (c)[7])

#ifdef __INTEL_COMPILER // ICC built-in swap for LE hosts
# if defined(__i386__) || defined(__x86_64__)
# undef READ64
# define READ64(c) _bswap64 (*(const uint64_t*) (c))
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
# ifdef __INTEL_COMPILER // ICC built-in swap for LE hosts
# if defined(__i386__) || defined(__x86_64__)
# undef READ64
# define READ64(c) _bswap64 (*(const uint64_t*) (c))
# endif

# else
# ifdef __has_builtin
# if __has_builtin(__builtin_bswap64)
# undef READ64
# define READ64(c) __builtin_bswap64 (*(const uint64_t*) (c))
# endif
# endif
# endif
#endif

bool
FastHufDecoder::enabled ()
{
#if defined(__INTEL_COMPILER) || defined(__GNUC__)

//
// Enabled for ICC, GCC:
// __i386__ -> x86
// __x86_64__ -> 64-bit x86
// __e2k__ -> e2k (MCST Elbrus 2000)

# if defined(__i386__) || defined(__x86_64__) || defined(__e2k__)
return true;
# else
return false;
# endif

#elif defined(_MSC_VER)

//
// Enabled for Visual Studio:
// _M_IX86 -> x86
// _M_X64 -> 64bit x86

# if defined(_M_IX86) || defined(_M_X64)
# ifdef OPENEXR_IMF_ENABLE_FAST_HUF_DECODER
return true;
# else
return false;
# endif

#else

//
// Unknown compiler - Be safe and disable.
//
return false;
#endif
}

//
Expand Down
9 changes: 9 additions & 0 deletions src/lib/OpenEXR/ImfSimd.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@
# define IMF_HAVE_F16C 1
#endif

//
// Arm Neon: IMF_HAVE_NEON is defined whenever the compiler reports
// Neon support through __ARM_NEON.
//
// NOTE: __ARM_NEON is not specific to AArch64 - 32-bit Arm builds with
// Neon enabled set it as well. Code guarded by IMF_HAVE_NEON that uses
// AArch64-only intrinsics needs an additional __aarch64__ check.
//
#if defined(__ARM_NEON)
# define IMF_HAVE_NEON
#endif

extern "C" {
#ifdef IMF_HAVE_SSE2
# include <emmintrin.h>
Expand All @@ -51,6 +55,11 @@ extern "C" {
#ifdef IMF_HAVE_SSE4_1
# include <smmintrin.h>
#endif

#ifdef IMF_HAVE_NEON
# include <arm_neon.h>
#endif

}

#endif
70 changes: 70 additions & 0 deletions src/test/OpenEXRTest/testDwaCompressorSimd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -407,6 +407,37 @@ testFloatToHalf ()
}
}
}

#ifdef IMF_HAVE_NEON
{
cout << " convertFloatToHalf64_neon()" << endl;
for (int iter = 0; iter < numIter; ++iter)
{
for (int i = 0; i < 64; ++i)
{
if (i < 32)
{
src._buffer[i] = (float) 140000 * (rand48.nextf () - .5);
}
else { src._buffer[i] = (float) (rand48.nextf () - .5); }
dst._buffer[i] = 0;
}

convertFloatToHalf64_neon (dst._buffer, src._buffer);

for (int i = 0; i < 64; ++i)
{
half value = (half) src._buffer[i];
if (value.bits () != dst._buffer[i])
{
cout << src._buffer[i] << " -> " << dst._buffer[i]
<< " expected " << value.bits () << endl;
assert (false);
}
}
}
}
#endif // IMF_HAVE_NEON
}

//
Expand Down Expand Up @@ -488,6 +519,45 @@ testFromHalfZigZag ()
}
} // iter
} // f16c

#ifdef IMF_HAVE_NEON
{
const int numIter = 1000000;
Rand48 rand48 (0);
half h;
SimdAlignedBuffer64f dstF16c;

cout << " fromHalfZigZag_neon()" << endl;

for (int iter = 0; iter < numIter; ++iter)
{
for (int i = 0; i < 64; ++i)
{
if (i < 32) { h = (half) (140000. * (rand48.nextf () - .5)); }
else
{
h = (half) (rand48.nextf () - .5);
}
src._buffer[i] = h.bits ();
}

fromHalfZigZag_scalar (src._buffer, dst._buffer);
fromHalfZigZag_neon (src._buffer, dstF16c._buffer);

for (int i = 0; i < 64; ++i)
{
if (fabsf (dst._buffer[i] - dstF16c._buffer[i]) > 1e-5)
{
cout << "At index " << i << ": ";
cout << "expecting " << dst._buffer[i] << "; got "
<< dstF16c._buffer[i] << endl;
assert (false);
}
}
} // iter
} // neon

#endif // IMF_HAVE_NEON
}

} // namespace
Expand Down

1 comment on commit 436fcd2

@mandree
Copy link
Contributor

@mandree mandree commented on 436fcd2 Mar 21, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This causes a regression and breaks the build on ARMv7 without 64-bit NEON extensions. The relevant code requires an #ifdef __aarch64__ guard - see #1365, which showed similar issues in other code. New bug #1367 filed.

Please sign in to comment.