diff --git a/CMakeLists.txt b/CMakeLists.txt index dcca17f6..c6020190 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -62,6 +62,8 @@ cmake_dependent_option(BASE64_WITH_AVX "add AVX codepath" ON ${_IS_X86} OFF) add_feature_info(AVX BASE64_WITH_AVX "add AVX codepath") cmake_dependent_option(BASE64_WITH_AVX2 "add AVX 2 codepath" ON ${_IS_X86} OFF) add_feature_info(AVX2 BASE64_WITH_AVX2 "add AVX2 codepath") +cmake_dependent_option(BASE64_WITH_AVX512 "add AVX 512 codepath" ON ${_IS_X86} OFF) +add_feature_info(AVX2 BASE64_WITH_AVX512 "add AVX512 codepath") cmake_dependent_option(BASE64_WITH_NEON32 "add NEON32 codepath" OFF _TARGET_ARCH_arm OFF) add_feature_info(NEON32 BASE64_WITH_NEON32 "add NEON32 codepath") @@ -118,6 +120,7 @@ add_library(base64 lib/arch/sse42/codec.c lib/arch/avx/codec.c lib/arch/avx2/codec.c + lib/arch/avx512/codec.c lib/arch/neon32/codec.c lib/arch/neon64/codec.c @@ -206,6 +209,7 @@ if (_TARGET_ARCH STREQUAL "x86" OR _TARGET_ARCH STREQUAL "x64") configure_codec(SSE42 __SSSE4_2__) configure_codec(AVX) configure_codec(AVX2) + configure_codec(AVX512) elseif (_TARGET_ARCH STREQUAL "arm") set(BASE64_NEON32_CFLAGS "${COMPILE_FLAGS_NEON32}" CACHE STRING "the NEON32 compile flags (for 'lib/arch/neon32/codec.c')") diff --git a/Makefile b/Makefile index 2bb01e20..8dd55388 100644 --- a/Makefile +++ b/Makefile @@ -4,6 +4,7 @@ CFLAGS += -std=c99 -O3 -Wall -Wextra -pedantic OBJCOPY ?= objcopy OBJS = \ + lib/arch/avx512/codec.o \ lib/arch/avx2/codec.o \ lib/arch/generic/codec.o \ lib/arch/neon32/codec.o \ @@ -16,6 +17,7 @@ OBJS = \ lib/codec_choose.o \ lib/tables/tables.o +HAVE_AVX512 = 0 HAVE_AVX2 = 0 HAVE_NEON32 = 0 HAVE_NEON64 = 0 @@ -26,6 +28,9 @@ HAVE_AVX = 0 # The user should supply compiler flags for the codecs they want to build. # Check which codecs we're going to include: +ifdef AVX512_CFLAGS + HAVE_AVX512 = 1 +endif ifdef AVX2_CFLAGS HAVE_AVX2 = 1 endif @@ -64,7 +69,8 @@ lib/libbase64.o: $(OBJS) $(OBJCOPY) --keep-global-symbols=lib/exports.txt $@ lib/config.h: - @echo "#define HAVE_AVX2 $(HAVE_AVX2)" > $@ + @echo "#define HAVE_AVX512 $(HAVE_AVX512)" > $@ + @echo "#define HAVE_AVX2 $(HAVE_AVX2)" >> $@ @echo "#define HAVE_NEON32 $(HAVE_NEON32)" >> $@ @echo "#define HAVE_NEON64 $(HAVE_NEON64)" >> $@ @echo "#define HAVE_SSSE3 $(HAVE_SSSE3)" >> $@ @@ -75,6 +81,7 @@ lib/config.h: $(OBJS): lib/config.h $(OBJS): CFLAGS += -Ilib +lib/arch/avx512/codec.o: CFLAGS += $(AVX512_CFLAGS) lib/arch/avx2/codec.o: CFLAGS += $(AVX2_CFLAGS) lib/arch/neon32/codec.o: CFLAGS += $(NEON32_CFLAGS) lib/arch/neon64/codec.o: CFLAGS += $(NEON64_CFLAGS) diff --git a/README.md b/README.md index b953c324..a99ef540 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ [![Build Status](https://github.com/aklomp/base64/actions/workflows/test.yml/badge.svg)](https://github.com/aklomp/base64/actions/workflows/test.yml) This is an implementation of a base64 stream encoding/decoding library in C99 -with SIMD (AVX2, NEON, AArch64/NEON, SSSE3, SSE4.1, SSE4.2, AVX) and +with SIMD (AVX2, AVX512, NEON, AArch64/NEON, SSSE3, SSE4.1, SSE4.2, AVX) and [OpenMP](http://www.openmp.org) acceleration. It also contains wrapper functions to encode/decode simple length-delimited strings. This library aims to be: @@ -19,6 +19,10 @@ will pick an optimized codec that lets it encode/decode 12 or 24 bytes at a time, which gives a speedup of four or more times compared to the "plain" bytewise codec. +AVX512 support is only for encoding at present, utilizing the AVX512 VL and VBMI +instructions. Decoding part reused AVX2 implementations. For CPUs later than +Cannonlake (manufactured in 2018) supports these instructions. + NEON support is hardcoded to on or off at compile time, because portable runtime feature detection is unavailable on ARM. @@ -59,6 +63,9 @@ optimizations described by Wojciech Muła in a [articles](http://0x80.pl/notesen/2016-01-17-sse-base64-decoding.html). His own code is [here](https://github.com/WojciechMula/toys/tree/master/base64). +The AVX512 encoder code is also referenced from the project of Wojciech Muła and +the project code is [here](https://github.com/WojciechMula/base64-avx512) + The OpenMP implementation was added by Ferry Toth (@htot) from [Exalon Delft](http://www.exalondelft.nl). ## Building @@ -76,8 +83,8 @@ To compile just the "plain" library without SIMD codecs, type: make lib/libbase64.o ``` -Optional SIMD codecs can be included by specifying the `AVX2_CFLAGS`, `NEON32_CFLAGS`, `NEON64_CFLAGS`, -`SSSE3_CFLAGS`, `SSE41_CFLAGS`, `SSE42_CFLAGS` and/or `AVX_CFLAGS` environment variables. +Optional SIMD codecs can be included by specifying the `AVX2_CFLAGS`, `AVX512_CFLAGS`, +`NEON32_CFLAGS`, `NEON64_CFLAGS`, `SSSE3_CFLAGS`, `SSE41_CFLAGS`, `SSE42_CFLAGS` and/or `AVX_CFLAGS` environment variables. A typical build invocation on x86 looks like this: ```sh @@ -93,6 +100,15 @@ Example: AVX2_CFLAGS=-mavx2 make ``` +### AVX512 + +To build and include the AVX512 codec, set the `AVX512_CFLAGS` environment variable to a value that will turn on AVX512 support in your compiler, typically `-mavx512vl -mavx512vbmi`. +Example: + +```sh +AVX512_CFLAGS="-mavx512vl -mavx512vbmi" make +``` + The codec will only be used if runtime feature detection shows that the target machine supports AVX2. ### SSSE3 @@ -208,6 +224,7 @@ Mainly there for testing purposes, this is also useful on ARM where the only way The following constants can be used: - `BASE64_FORCE_AVX2` +- `BASE64_FORCE_AVX512` - `BASE64_FORCE_NEON32` - `BASE64_FORCE_NEON64` - `BASE64_FORCE_PLAIN` diff --git a/cmake/Modules/TargetSIMDInstructionSet.cmake b/cmake/Modules/TargetSIMDInstructionSet.cmake index ba1f6e51..48508090 100644 --- a/cmake/Modules/TargetSIMDInstructionSet.cmake +++ b/cmake/Modules/TargetSIMDInstructionSet.cmake @@ -21,6 +21,7 @@ macro(define_SIMD_compile_flags) set(COMPILE_FLAGS_SSE42 "-msse4.2") set(COMPILE_FLAGS_AVX "-mavx") set(COMPILE_FLAGS_AVX2 "-mavx2") + set(COMPILE_FLAGS_AVX512 "-mavx512vl -mavx512vbmi") #arm set(COMPILE_FLAGS_NEON32 "-mfpu=neon") @@ -30,5 +31,6 @@ macro(define_SIMD_compile_flags) set(COMPILE_FLAGS_SSE42 " ") set(COMPILE_FLAGS_AVX "/arch:AVX") set(COMPILE_FLAGS_AVX2 "/arch:AVX2") + set(COMPILE_FLAGS_AVX512 "/arch:AVX512") endif() endmacro(define_SIMD_compile_flags) diff --git a/cmake/config.h.in b/cmake/config.h.in index 8530d1e1..c7faa94b 100644 --- a/cmake/config.h.in +++ b/cmake/config.h.in @@ -16,6 +16,9 @@ #cmakedefine01 BASE64_WITH_AVX2 #define HAVE_AVX2 BASE64_WITH_AVX2 +#cmakedefine01 BASE64_WITH_AVX512 +#define HAVE_AVX512 BASE64_WITH_AVX512 + #cmakedefine01 BASE64_WITH_NEON32 #define HAVE_NEON32 BASE64_WITH_NEON32 diff --git a/include/libbase64.h b/include/libbase64.h index d470a82f..c5908973 100644 --- a/include/libbase64.h +++ b/include/libbase64.h @@ -53,6 +53,7 @@ extern "C" { #define BASE64_FORCE_SSE41 (1 << 5) #define BASE64_FORCE_SSE42 (1 << 6) #define BASE64_FORCE_AVX (1 << 7) +#define BASE64_FORCE_AVX512 (1 << 8) struct base64_state { int eof; diff --git a/lib/arch/avx512/codec.c b/lib/arch/avx512/codec.c new file mode 100644 index 00000000..3fd73521 --- /dev/null +++ b/lib/arch/avx512/codec.c @@ -0,0 +1,42 @@ +#include +#include +#include + +#include "../../../include/libbase64.h" +#include "../../tables/tables.h" +#include "../../codecs.h" +#include "config.h" +#include "../../env.h" + +#if HAVE_AVX512 +#include + +#include "dec_reshuffle.c" +#include "dec_loop.c" +#include "enc_reshuffle_translate.c" +#include "enc_loop.c" + +#endif // HAVE_AVX512 + +BASE64_ENC_FUNCTION(avx512) +{ +#if HAVE_AVX2 + #include "../generic/enc_head.c" + enc_loop_avx512(&s, &slen, &o, &olen); + #include "../generic/enc_tail.c" +#else + BASE64_ENC_STUB +#endif +} + +// Reuse AVX2 decoding. Not supporting AVX512 at present +BASE64_DEC_FUNCTION(avx512) +{ +#if HAVE_AVX2 + #include "../generic/dec_head.c" + dec_loop_avx2(&s, &slen, &o, &olen); + #include "../generic/dec_tail.c" +#else + BASE64_DEC_STUB +#endif +} diff --git a/lib/arch/avx512/dec_loop.c b/lib/arch/avx512/dec_loop.c new file mode 100644 index 00000000..f959fc4b --- /dev/null +++ b/lib/arch/avx512/dec_loop.c @@ -0,0 +1,110 @@ +static inline int +dec_loop_avx2_inner (const uint8_t **s, uint8_t **o, size_t *rounds) +{ + const __m256i lut_lo = _mm256_setr_epi8( + 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, + 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A, + 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, + 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A); + + const __m256i lut_hi = _mm256_setr_epi8( + 0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10); + + const __m256i lut_roll = _mm256_setr_epi8( + 0, 16, 19, 4, -65, -65, -71, -71, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 16, 19, 4, -65, -65, -71, -71, + 0, 0, 0, 0, 0, 0, 0, 0); + + const __m256i mask_2F = _mm256_set1_epi8(0x2F); + + // Load input: + __m256i str = _mm256_loadu_si256((__m256i *) *s); + + // See the SSSE3 decoder for an explanation of the algorithm. + const __m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi32(str, 4), mask_2F); + const __m256i lo_nibbles = _mm256_and_si256(str, mask_2F); + const __m256i hi = _mm256_shuffle_epi8(lut_hi, hi_nibbles); + const __m256i lo = _mm256_shuffle_epi8(lut_lo, lo_nibbles); + + if (!_mm256_testz_si256(lo, hi)) { + return 0; + } + + const __m256i eq_2F = _mm256_cmpeq_epi8(str, mask_2F); + const __m256i roll = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2F, hi_nibbles)); + + // Now simply add the delta values to the input: + str = _mm256_add_epi8(str, roll); + + // Reshuffle the input to packed 12-byte output format: + str = dec_reshuffle(str); + + // Store the output: + _mm256_storeu_si256((__m256i *) *o, str); + + *s += 32; + *o += 24; + *rounds -= 1; + + return 1; +} + +static inline void +dec_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen) +{ + if (*slen < 45) { + return; + } + + // Process blocks of 32 bytes per round. Because 8 extra zero bytes are + // written after the output, ensure that there will be at least 13 + // bytes of input data left to cover the gap. (11 data bytes and up to + // two end-of-string markers.) + size_t rounds = (*slen - 13) / 32; + + *slen -= rounds * 32; // 32 bytes consumed per round + *olen += rounds * 24; // 24 bytes produced per round + + do { + if (rounds >= 8) { + if (dec_loop_avx2_inner(s, o, &rounds) && + dec_loop_avx2_inner(s, o, &rounds) && + dec_loop_avx2_inner(s, o, &rounds) && + dec_loop_avx2_inner(s, o, &rounds) && + dec_loop_avx2_inner(s, o, &rounds) && + dec_loop_avx2_inner(s, o, &rounds) && + dec_loop_avx2_inner(s, o, &rounds) && + dec_loop_avx2_inner(s, o, &rounds)) { + continue; + } + break; + } + if (rounds >= 4) { + if (dec_loop_avx2_inner(s, o, &rounds) && + dec_loop_avx2_inner(s, o, &rounds) && + dec_loop_avx2_inner(s, o, &rounds) && + dec_loop_avx2_inner(s, o, &rounds)) { + continue; + } + break; + } + if (rounds >= 2) { + if (dec_loop_avx2_inner(s, o, &rounds) && + dec_loop_avx2_inner(s, o, &rounds)) { + continue; + } + break; + } + dec_loop_avx2_inner(s, o, &rounds); + break; + + } while (rounds > 0); + + // Adjust for any rounds that were skipped: + *slen += rounds * 32; + *olen -= rounds * 24; +} diff --git a/lib/arch/avx512/dec_reshuffle.c b/lib/arch/avx512/dec_reshuffle.c new file mode 100644 index 00000000..f3518098 --- /dev/null +++ b/lib/arch/avx512/dec_reshuffle.c @@ -0,0 +1,34 @@ +static inline __m256i +dec_reshuffle (const __m256i in) +{ + // in, lower lane, bits, upper case are most significant bits, lower + // case are least significant bits: + // 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ + // 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG + // 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD + // 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA + + const __m256i merge_ab_and_bc = _mm256_maddubs_epi16(in, _mm256_set1_epi32(0x01400140)); + // 0000kkkk LLllllll 0000JJJJ JJjjKKKK + // 0000hhhh IIiiiiii 0000GGGG GGggHHHH + // 0000eeee FFffffff 0000DDDD DDddEEEE + // 0000bbbb CCcccccc 0000AAAA AAaaBBBB + + __m256i out = _mm256_madd_epi16(merge_ab_and_bc, _mm256_set1_epi32(0x00011000)); + // 00000000 JJJJJJjj KKKKkkkk LLllllll + // 00000000 GGGGGGgg HHHHhhhh IIiiiiii + // 00000000 DDDDDDdd EEEEeeee FFffffff + // 00000000 AAAAAAaa BBBBbbbb CCcccccc + + // Pack bytes together in each lane: + out = _mm256_shuffle_epi8(out, _mm256_setr_epi8( + 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1, + 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1)); + // 00000000 00000000 00000000 00000000 + // LLllllll KKKKkkkk JJJJJJjj IIiiiiii + // HHHHhhhh GGGGGGgg FFffffff EEEEeeee + // DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa + + // Pack lanes: + return _mm256_permutevar8x32_epi32(out, _mm256_setr_epi32(0, 1, 2, 4, 5, 6, -1, -1)); +} diff --git a/lib/arch/avx512/enc_loop.c b/lib/arch/avx512/enc_loop.c new file mode 100644 index 00000000..a6daca66 --- /dev/null +++ b/lib/arch/avx512/enc_loop.c @@ -0,0 +1,61 @@ +static inline void +enc_loop_avx512_inner (const uint8_t **s, uint8_t **o) +{ + // Load input: + __m512i src = _mm512_loadu_si512((__m512i *) *s); + + // Reshuffle, translate, store: + src = enc_reshuffle_translate(src); + _mm512_storeu_si512((__m512i *) *o, src); + + *s += 48; + *o += 64; +} + +static inline void +enc_loop_avx512 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen) +{ + if (*slen < 64) { + return; + } + + // Process blocks of 48 bytes at a time. Because blocks are loaded 64 + // bytes at a time, ensure that there will be at least 24 remaining bytes + // after the last round, so that the final read will not pass beyond the + // bounds of the input buffer: + size_t rounds = (*slen - 24) / 48; + + *slen -= rounds * 48; // 48 bytes consumed per round + *olen += rounds * 64; // 64 bytes produced per round + + while (rounds > 0) { + if (rounds >= 8) { + enc_loop_avx512_inner(s, o); + enc_loop_avx512_inner(s, o); + enc_loop_avx512_inner(s, o); + enc_loop_avx512_inner(s, o); + enc_loop_avx512_inner(s, o); + enc_loop_avx512_inner(s, o); + enc_loop_avx512_inner(s, o); + enc_loop_avx512_inner(s, o); + rounds -= 8; + continue; + } + if (rounds >= 4) { + enc_loop_avx512_inner(s, o); + enc_loop_avx512_inner(s, o); + enc_loop_avx512_inner(s, o); + enc_loop_avx512_inner(s, o); + rounds -= 4; + continue; + } + if (rounds >= 2) { + enc_loop_avx512_inner(s, o); + enc_loop_avx512_inner(s, o); + rounds -= 2; + continue; + } + enc_loop_avx512_inner(s, o); + break; + } +} \ No newline at end of file diff --git a/lib/arch/avx512/enc_reshuffle_translate.c b/lib/arch/avx512/enc_reshuffle_translate.c new file mode 100644 index 00000000..c033d81b --- /dev/null +++ b/lib/arch/avx512/enc_reshuffle_translate.c @@ -0,0 +1,56 @@ +// AVX512 algorithm is based on permutevar and multishift. The code is +// referenced from https://github.com/WojciechMula/base64-avx512 which +// is under BSD-3 license + +static inline __m512i +enc_reshuffle_translate (const __m512i input) +{ + // 32-bit input + // [ 0 0 0 0 0 0 0 0|c1 c0 d5 d4 d3 d2 d1 d0| + // b3 b2 b1 b0 c5 c4 c3 c2|a5 a4 a3 a2 a1 a0 b5 b4] + // output order [1, 2, 0, 1] + // [b3 b2 b1 b0 c5 c4 c3 c2|c1 c0 d5 d4 d3 d2 d1 d0| + // a5 a4 a3 a2 a1 a0 b5 b4|b3 b2 b1 b0 c3 c2 c1 c0] + + const __m512i shuffle_input = _mm512_setr_epi32(0x01020001, + 0x04050304, + 0x07080607, + 0x0a0b090a, + 0x0d0e0c0d, + 0x10110f10, + 0x13141213, + 0x16171516, + 0x191a1819, + 0x1c1d1b1c, + 0x1f201e1f, + 0x22232122, + 0x25262425, + 0x28292728, + 0x2b2c2a2b, + 0x2e2f2d2e); + + // Reorder bytes + // [b3 b2 b1 b0 c5 c4 c3 c2|c1 c0 d5 d4 d3 d2 d1 d0| + // a5 a4 a3 a2 a1 a0 b5 b4|b3 b2 b1 b0 c3 c2 c1 c0] + const __m512i in = _mm512_permutexvar_epi8(shuffle_input, input); + + + // After multishift a single 32-bit lane has following layout + // [c1 c0 d5 d4 d3 d2 d1 d0|b1 b0 c5 c4 c3 c2 c1 c0| + // a1 a0 b5 b4 b3 b2 b1 b0|d1 d0 a5 a4 a3 a2 a1 a0] + // (a = [10:17], b = [4:11], c = [22:27], d = [16:21]) + + // 48, 54, 36, 42, 16, 22, 4, 10 + const __m512i shifts = _mm512_set1_epi64(0x3036242a1016040alu); + __m512i shuffled_in = _mm512_multishift_epi64_epi8(shifts, in); + + // Translate immediatedly after reshuffled + static char base64_table[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "0123456789+/"; + const char* lookup_tbl = base64_table; + const __m512i lookup = _mm512_loadu_si512(lookup_tbl); + + // Translation 6-bit values to ASCII. + return _mm512_permutexvar_epi8(shuffled_in, lookup); +} \ No newline at end of file diff --git a/lib/codec_choose.c b/lib/codec_choose.c index 6a07d6a7..a1b2c187 100644 --- a/lib/codec_choose.c +++ b/lib/codec_choose.c @@ -2,6 +2,7 @@ #include #include #include +#include #include "../include/libbase64.h" #include "codecs.h" @@ -10,7 +11,7 @@ #if (__x86_64__ || __i386__ || _M_X86 || _M_X64) #define BASE64_X86 - #if (HAVE_SSSE3 || HAVE_SSE41 || HAVE_SSE42 || HAVE_AVX || HAVE_AVX2) + #if (HAVE_SSSE3 || HAVE_SSE41 || HAVE_SSE42 || HAVE_AVX || HAVE_AVX2 || HAVE_AVX512) #define BASE64_X86_SIMD #endif #endif @@ -31,7 +32,7 @@ __cpuid_count(__level, 0, __eax, __ebx, __ecx, __edx) #else #include - #if HAVE_AVX2 || HAVE_AVX + #if HAVE_AVX512 || HAVE_AVX2 || HAVE_AVX #if ((__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 2) || (__clang_major__ >= 3)) static inline uint64_t _xgetbv (uint32_t index) { @@ -45,6 +46,10 @@ #endif #endif +#ifndef bit_AVX512 +#define bit_AVX512vl (1 << 31) +#define bit_AVX512vbmi (1 << 1) +#endif #ifndef bit_AVX2 #define bit_AVX2 (1 << 5) #endif @@ -75,6 +80,7 @@ BASE64_ENC_FUNCTION(arch); \ BASE64_DEC_FUNCTION(arch); \ +BASE64_CODEC_FUNCS(avx512) BASE64_CODEC_FUNCS(avx2) BASE64_CODEC_FUNCS(neon32) BASE64_CODEC_FUNCS(neon64) @@ -91,9 +97,10 @@ codec_choose_forced (struct codec *codec, int flags) // always allow it, even if the codec is a no-op. // For testing purposes. - if (!(flags & 0xFF)) { + if (!(flags & 0xFFFF)) { return false; } + if (flags & BASE64_FORCE_AVX2) { codec->enc = base64_stream_encode_avx2; codec->dec = base64_stream_decode_avx2; @@ -134,6 +141,11 @@ codec_choose_forced (struct codec *codec, int flags) codec->dec = base64_stream_decode_avx; return true; } + if (flags & BASE64_FORCE_AVX512) { + codec->enc = base64_stream_encode_avx512; + codec->dec = base64_stream_decode_avx512; + return true; + } return false; } @@ -178,8 +190,8 @@ codec_choose_x86 (struct codec *codec) max_level = __get_cpuid_max(0, NULL); #endif - #if HAVE_AVX2 || HAVE_AVX - // Check for AVX/AVX2 support: + #if HAVE_AVX512 || HAVE_AVX2 || HAVE_AVX + // Check for AVX/AVX2/AVX512 support: // Checking for AVX requires 3 things: // 1) CPUID indicates that the OS uses XSAVE and XRSTORE instructions // (allowing saving YMM registers on context switch) @@ -195,6 +207,16 @@ codec_choose_x86 (struct codec *codec) uint64_t xcr_mask; xcr_mask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK); if (xcr_mask & _XCR_XMM_AND_YMM_STATE_ENABLED_BY_OS) { + #if HAVE_AVX512 + if (max_level >= 7) { + __cpuid_count(7, 0, eax, ebx, ecx, edx); + if (ebx & bit_AVX512vl && ecx & bit_AVX512VBMI) { + codec->enc = base64_stream_encode_avx512; + codec->dec = base64_stream_decode_avx512; + return true; + } + } + #endif #if HAVE_AVX2 if (max_level >= 7) { __cpuid_count(7, 0, eax, ebx, ecx, edx); diff --git a/lib/lib.c b/lib/lib.c index 4703512b..053931a9 100644 --- a/lib/lib.c +++ b/lib/lib.c @@ -68,7 +68,7 @@ void base64_stream_decode_init (struct base64_state *state, int flags) { // If any of the codec flags are set, redo choice: - if (codec.dec == NULL || flags & 0xFF) { + if (codec.dec == NULL || flags & 0xFFFF) { codec_choose(&codec, flags); } state->eof = 0; diff --git a/test/ci/test.sh b/test/ci/test.sh index 066a49f4..a296bfcf 100755 --- a/test/ci/test.sh +++ b/test/ci/test.sh @@ -7,9 +7,10 @@ if [ "${MACHINE}" == "x86_64" ]; then export SSE41_CFLAGS=-msse4.1 export SSE42_CFLAGS=-msse4.2 export AVX_CFLAGS=-mavx - # no AVX2 on GHA macOS + # no AVX2 or AVX512 on GHA macOS if [ "$(uname -s)" != "Darwin" ]; then - export AVX2_CFLAGS=-mavx2 + export AVX2_CFLAGS=-mavx2 + export AVX512_CFLAGS="-mavx512vl -mavx512vbmi" fi elif [ "${MACHINE}" == "aarch64" ]; then export NEON64_CFLAGS="-march=armv8-a" diff --git a/test/codec_supported.c b/test/codec_supported.c index a027b994..f68c7668 100644 --- a/test/codec_supported.c +++ b/test/codec_supported.c @@ -11,6 +11,7 @@ static char *_codecs[] = , "SSE41" , "SSE42" , "AVX" +, "AVX512" , NULL } ;