aklomp · lucshi · Aug 18, 2022 · Sep 26, 2022 · Sep 29, 2022 · Sep 29, 2022
diff --git a/README.md b/README.md
@@ -3,7 +3,7 @@
 [![Build Status](https://github.com/aklomp/base64/actions/workflows/test.yml/badge.svg)](https://github.com/aklomp/base64/actions/workflows/test.yml)
 
 This is an implementation of a base64 stream encoding/decoding library in C99
-with SIMD (AVX2, NEON, AArch64/NEON, SSSE3, SSE4.1, SSE4.2, AVX) and
+with SIMD (AVX2, AVX512, NEON, AArch64/NEON, SSSE3, SSE4.1, SSE4.2, AVX) and
 [OpenMP](http://www.openmp.org) acceleration. It also contains wrapper functions
 to encode/decode simple length-delimited strings. This library aims to be:
 
@@ -19,6 +19,10 @@ will pick an optimized codec that lets it encode/decode 12 or 24 bytes at a
 time, which gives a speedup of four or more times compared to the "plain"
 bytewise codec.
 
+AVX512 support is only for encoding at present, utilizing the AVX512 VL and VBMI
+instructions. Decoding part reused AVX2 implementations. For CPUs later than
+Cannonlake (manufactured in 2018) supports these instructions.
+
 NEON support is hardcoded to on or off at compile time, because portable
 runtime feature detection is unavailable on ARM.
 
@@ -59,6 +63,9 @@ optimizations described by Wojciech Muła in a
 [articles](http://0x80.pl/notesen/2016-01-17-sse-base64-decoding.html).
 His own code is [here](https://github.com/WojciechMula/toys/tree/master/base64).
 
+The AVX512 encoder code is also referenced from the project of Wojciech Muła and
+the project code is [here](https://github.com/WojciechMula/base64-avx512)
+
 The OpenMP implementation was added by Ferry Toth (@htot) from [Exalon Delft](http://www.exalondelft.nl).
 
 ## Building
@@ -76,8 +83,8 @@ To compile just the "plain" library without SIMD codecs, type:
 make lib/libbase64.o
 ```
 
-Optional SIMD codecs can be included by specifying the `AVX2_CFLAGS`, `NEON32_CFLAGS`, `NEON64_CFLAGS`,
-`SSSE3_CFLAGS`, `SSE41_CFLAGS`, `SSE42_CFLAGS` and/or `AVX_CFLAGS` environment variables.
+Optional SIMD codecs can be included by specifying the `AVX2_CFLAGS`, `AVX512_CFLAGS`, 
+`NEON32_CFLAGS`, `NEON64_CFLAGS`, `SSSE3_CFLAGS`, `SSE41_CFLAGS`, `SSE42_CFLAGS` and/or `AVX_CFLAGS` environment variables.
 A typical build invocation on x86 looks like this:
 
 ```sh
@@ -93,6 +100,15 @@ Example:
 AVX2_CFLAGS=-mavx2 make
 ```
 
+### AVX512
+
+To build and include the AVX512 codec, set the `AVX512_CFLAGS` environment variable to a value that will turn on AVX512 support in your compiler, typically `-mavx512vl -mavx512vbmi`.
+Example:
+
+```sh
+AVX512_CFLAGS="-mavx512vl -mavx512vbmi" make
+```
+
 The codec will only be used if runtime feature detection shows that the target machine supports AVX2.
 
 ### SSSE3
@@ -208,6 +224,7 @@ Mainly there for testing purposes, this is also useful on ARM where the only way
 The following constants can be used:
 
 - `BASE64_FORCE_AVX2`
+- `BASE64_FORCE_AVX512`
 - `BASE64_FORCE_NEON32`
 - `BASE64_FORCE_NEON64`
 - `BASE64_FORCE_PLAIN`

diff --git a/lib/arch/avx512/chromiumbase64.c b/lib/arch/avx512/chromiumbase64.c
diff --git a/lib/arch/avx512/chromiumbase64.h b/lib/arch/avx512/chromiumbase64.h
diff --git a/lib/arch/avx512/codec.c b/lib/arch/avx512/codec.c
@@ -11,30 +11,32 @@
 #if HAVE_AVX512
 #include <immintrin.h>
 
+#include "dec_reshuffle.c"
+#include "dec_loop.c"
+#include "enc_reshuffle_translate.c"
 #include "enc_loop.c"
-#include "chromiumbase64.c"
-#include "../avx2/dec_reshuffle.c"
-#include "../avx2/dec_loop.c"
 
 #endif	// HAVE_AVX512
 
 BASE64_ENC_FUNCTION(avx512)
 {
-#if HAVE_AVX512
-	enc_loop_avx512(src, srclen, out, outlen);
+#if HAVE_AVX2
+	#include "../generic/enc_head.c"
+	enc_loop_avx512(&s, &slen, &o, &olen);
+	#include "../generic/enc_tail.c"
 #else
 	BASE64_ENC_STUB
 #endif
 }
 
+// Reuse AVX2 decoding. Not supporting AVX512 at present
 BASE64_DEC_FUNCTION(avx512)
 {
-// avx512 decode is not implemented yet, reuse avx2 version
-#if HAVE_AVX512
+#if HAVE_AVX2
 	#include "../generic/dec_head.c"
 	dec_loop_avx2(&s, &slen, &o, &olen);
 	#include "../generic/dec_tail.c"
 #else
 	BASE64_DEC_STUB
 #endif
-}
+}
diff --git a/lib/arch/avx512/dec_loop.c b/lib/arch/avx512/dec_loop.c
@@ -0,0 +1,110 @@
+static inline int
+dec_loop_avx2_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
+{
+	const __m256i lut_lo = _mm256_setr_epi8(
+		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
+		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A,
+		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
+		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);
+
+	const __m256i lut_hi = _mm256_setr_epi8(
+		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
+		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
+		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
+		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10);
+
+	const __m256i lut_roll = _mm256_setr_epi8(
+		0,  16,  19,   4, -65, -65, -71, -71,
+		0,   0,   0,   0,   0,   0,   0,   0,
+		0,  16,  19,   4, -65, -65, -71, -71,
+		0,   0,   0,   0,   0,   0,   0,   0);
+
+	const __m256i mask_2F = _mm256_set1_epi8(0x2F);
+
+	// Load input:
+	__m256i str = _mm256_loadu_si256((__m256i *) *s);
+
+	// See the SSSE3 decoder for an explanation of the algorithm.
+	const __m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi32(str, 4), mask_2F);
+	const __m256i lo_nibbles = _mm256_and_si256(str, mask_2F);
+	const __m256i hi         = _mm256_shuffle_epi8(lut_hi, hi_nibbles);
+	const __m256i lo         = _mm256_shuffle_epi8(lut_lo, lo_nibbles);
+
+	if (!_mm256_testz_si256(lo, hi)) {
+		return 0;
+	}
+
+	const __m256i eq_2F = _mm256_cmpeq_epi8(str, mask_2F);
+	const __m256i roll  = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2F, hi_nibbles));
+
+	// Now simply add the delta values to the input:
+	str = _mm256_add_epi8(str, roll);
+
+	// Reshuffle the input to packed 12-byte output format:
+	str = dec_reshuffle(str);
+
+	// Store the output:
+	_mm256_storeu_si256((__m256i *) *o, str);
+
+	*s += 32;
+	*o += 24;
+	*rounds -= 1;
+
+	return 1;
+}
+
+static inline void
+dec_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+	if (*slen < 45) {
+		return;
+	}
+
+	// Process blocks of 32 bytes per round. Because 8 extra zero bytes are
+	// written after the output, ensure that there will be at least 13
+	// bytes of input data left to cover the gap. (11 data bytes and up to
+	// two end-of-string markers.)
+	size_t rounds = (*slen - 13) / 32;
+
+	*slen -= rounds * 32;	// 32 bytes consumed per round
+	*olen += rounds * 24;	// 24 bytes produced per round
+
+	do {
+		if (rounds >= 8) {
+			if (dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds)) {
+				continue;
+			}
+			break;
+		}
+		if (rounds >= 4) {
+			if (dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds)) {
+				continue;
+			}
+			break;
+		}
+		if (rounds >= 2) {
+			if (dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds)) {
+				continue;
+			}
+			break;
+		}
+		dec_loop_avx2_inner(s, o, &rounds);
+		break;
+
+	} while (rounds > 0);
+
+	// Adjust for any rounds that were skipped:
+	*slen += rounds * 32;
+	*olen -= rounds * 24;
+}