diff --git a/CMakeLists.txt b/CMakeLists.txt
index dcca17f6..c6020190 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -62,6 +62,8 @@ cmake_dependent_option(BASE64_WITH_AVX "add AVX codepath" ON ${_IS_X86} OFF)
 add_feature_info(AVX BASE64_WITH_AVX "add AVX codepath")
 cmake_dependent_option(BASE64_WITH_AVX2 "add AVX 2 codepath" ON ${_IS_X86} OFF)
 add_feature_info(AVX2 BASE64_WITH_AVX2 "add AVX2 codepath")
+cmake_dependent_option(BASE64_WITH_AVX512 "add AVX 512 codepath" ON ${_IS_X86} OFF)
+add_feature_info(AVX512 BASE64_WITH_AVX512 "add AVX512 codepath")
 
 cmake_dependent_option(BASE64_WITH_NEON32 "add NEON32 codepath" OFF _TARGET_ARCH_arm OFF)
 add_feature_info(NEON32 BASE64_WITH_NEON32 "add NEON32 codepath")
@@ -118,6 +120,7 @@ add_library(base64
     lib/arch/sse42/codec.c
     lib/arch/avx/codec.c
     lib/arch/avx2/codec.c
+    lib/arch/avx512/codec.c
 
     lib/arch/neon32/codec.c
     lib/arch/neon64/codec.c
@@ -206,6 +209,7 @@ if (_TARGET_ARCH STREQUAL "x86" OR _TARGET_ARCH STREQUAL "x64")
     configure_codec(SSE42 __SSSE4_2__)
     configure_codec(AVX)
     configure_codec(AVX2)
+    configure_codec(AVX512)
 
 elseif (_TARGET_ARCH STREQUAL "arm")
     set(BASE64_NEON32_CFLAGS "${COMPILE_FLAGS_NEON32}" CACHE STRING "the NEON32 compile flags (for 'lib/arch/neon32/codec.c')")
diff --git a/Makefile b/Makefile
index 2bb01e20..8dd55388 100644
--- a/Makefile
+++ b/Makefile
@@ -4,6 +4,7 @@ CFLAGS += -std=c99 -O3 -Wall -Wextra -pedantic
 OBJCOPY ?= objcopy
 
 OBJS = \
+  lib/arch/avx512/codec.o \
   lib/arch/avx2/codec.o \
   lib/arch/generic/codec.o \
   lib/arch/neon32/codec.o \
@@ -16,6 +17,7 @@ OBJS = \
   lib/codec_choose.o \
   lib/tables/tables.o
 
+HAVE_AVX512 = 0
 HAVE_AVX2   = 0
 HAVE_NEON32 = 0
 HAVE_NEON64 = 0
@@ -26,6 +28,9 @@ HAVE_AVX    = 0
 
 # The user should supply compiler flags for the codecs they want to build.
 # Check which codecs we're going to include:
+ifdef AVX512_CFLAGS
+  HAVE_AVX512 = 1
+endif
 ifdef AVX2_CFLAGS
   HAVE_AVX2 = 1
 endif
@@ -64,7 +69,8 @@ lib/libbase64.o: $(OBJS)
 	$(OBJCOPY) --keep-global-symbols=lib/exports.txt $@
 
 lib/config.h:
-	@echo "#define HAVE_AVX2   $(HAVE_AVX2)"    > $@
+	@echo "#define HAVE_AVX512 $(HAVE_AVX512)" > $@
+	@echo "#define HAVE_AVX2   $(HAVE_AVX2)"   >> $@
 	@echo "#define HAVE_NEON32 $(HAVE_NEON32)" >> $@
 	@echo "#define HAVE_NEON64 $(HAVE_NEON64)" >> $@
 	@echo "#define HAVE_SSSE3  $(HAVE_SSSE3)"  >> $@
@@ -75,6 +81,7 @@ lib/config.h:
 $(OBJS): lib/config.h
 $(OBJS): CFLAGS += -Ilib
 
+lib/arch/avx512/codec.o: CFLAGS += $(AVX512_CFLAGS)
 lib/arch/avx2/codec.o:   CFLAGS += $(AVX2_CFLAGS)
 lib/arch/neon32/codec.o: CFLAGS += $(NEON32_CFLAGS)
 lib/arch/neon64/codec.o: CFLAGS += $(NEON64_CFLAGS)
diff --git a/README.md b/README.md
index b953c324..a99ef540 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
 [![Build Status](https://github.com/aklomp/base64/actions/workflows/test.yml/badge.svg)](https://github.com/aklomp/base64/actions/workflows/test.yml)
 
 This is an implementation of a base64 stream encoding/decoding library in C99
-with SIMD (AVX2, NEON, AArch64/NEON, SSSE3, SSE4.1, SSE4.2, AVX) and
+with SIMD (AVX2, AVX512, NEON, AArch64/NEON, SSSE3, SSE4.1, SSE4.2, AVX) and
 [OpenMP](http://www.openmp.org) acceleration. It also contains wrapper functions
 to encode/decode simple length-delimited strings. This library aims to be:
 
@@ -19,6 +19,10 @@ will pick an optimized codec that lets it encode/decode 12 or 24 bytes at a
 time, which gives a speedup of four or more times compared to the "plain"
 bytewise codec.
 
+AVX512 support currently covers encoding only, using the AVX512 VL and VBMI
+instructions; decoding reuses the AVX2 implementation. These instructions are
+supported by Cannon Lake (2018) and later CPUs.
+
 NEON support is hardcoded to on or off at compile time, because portable
 runtime feature detection is unavailable on ARM.
 
@@ -59,6 +63,9 @@ optimizations described by Wojciech Muła in a
 [articles](http://0x80.pl/notesen/2016-01-17-sse-base64-decoding.html).
 His own code is [here](https://github.com/WojciechMula/toys/tree/master/base64).
 
+The AVX512 encoder is based on code from Wojciech Muła's
+[base64-avx512](https://github.com/WojciechMula/base64-avx512) project.
+
 The OpenMP implementation was added by Ferry Toth (@htot) from [Exalon Delft](http://www.exalondelft.nl).
 
 ## Building
@@ -76,8 +83,8 @@ To compile just the "plain" library without SIMD codecs, type:
 make lib/libbase64.o
 ```
 
-Optional SIMD codecs can be included by specifying the `AVX2_CFLAGS`, `NEON32_CFLAGS`, `NEON64_CFLAGS`,
-`SSSE3_CFLAGS`, `SSE41_CFLAGS`, `SSE42_CFLAGS` and/or `AVX_CFLAGS` environment variables.
+Optional SIMD codecs can be included by specifying the `AVX2_CFLAGS`, `AVX512_CFLAGS`,
+`NEON32_CFLAGS`, `NEON64_CFLAGS`, `SSSE3_CFLAGS`, `SSE41_CFLAGS`, `SSE42_CFLAGS` and/or `AVX_CFLAGS` environment variables.
 A typical build invocation on x86 looks like this:
 
 ```sh
@@ -93,6 +100,17 @@ Example:
 AVX2_CFLAGS=-mavx2 make
 ```
 
 The codec will only be used if runtime feature detection shows that the target machine supports AVX2.
 
+### AVX512
+
+To build and include the AVX512 codec, set the `AVX512_CFLAGS` environment variable to a value that will turn on AVX512 support in your compiler, typically `-mavx512vl -mavx512vbmi`.
+Example:
+
+```sh
+AVX512_CFLAGS="-mavx512vl -mavx512vbmi" make
+```
+
+The codec will only be used if runtime feature detection shows that the target machine supports AVX512 VL and VBMI.
+
 ### SSSE3
@@ -208,6 +226,7 @@ Mainly there for testing purposes, this is also useful on ARM where the only way
 The following constants can be used:
 
 - `BASE64_FORCE_AVX2`
+- `BASE64_FORCE_AVX512`
 - `BASE64_FORCE_NEON32`
 - `BASE64_FORCE_NEON64`
 - `BASE64_FORCE_PLAIN`
diff --git a/cmake/Modules/TargetSIMDInstructionSet.cmake b/cmake/Modules/TargetSIMDInstructionSet.cmake
index ba1f6e51..48508090 100644
--- a/cmake/Modules/TargetSIMDInstructionSet.cmake
+++ b/cmake/Modules/TargetSIMDInstructionSet.cmake
@@ -21,6 +21,7 @@ macro(define_SIMD_compile_flags)
         set(COMPILE_FLAGS_SSE42 "-msse4.2")
         set(COMPILE_FLAGS_AVX "-mavx")
         set(COMPILE_FLAGS_AVX2 "-mavx2")
+        set(COMPILE_FLAGS_AVX512 "-mavx512vl -mavx512vbmi")
 
         #arm
         set(COMPILE_FLAGS_NEON32 "-mfpu=neon")
@@ -30,5 +31,6 @@ macro(define_SIMD_compile_flags)
         set(COMPILE_FLAGS_SSE42 " ")
         set(COMPILE_FLAGS_AVX "/arch:AVX")
         set(COMPILE_FLAGS_AVX2 "/arch:AVX2")
+        set(COMPILE_FLAGS_AVX512 "/arch:AVX512")
     endif()
 endmacro(define_SIMD_compile_flags)
diff --git a/cmake/config.h.in b/cmake/config.h.in
index 8530d1e1..c7faa94b 100644
--- a/cmake/config.h.in
+++ b/cmake/config.h.in
@@ -16,6 +16,9 @@
 #cmakedefine01 BASE64_WITH_AVX2
 #define HAVE_AVX2 BASE64_WITH_AVX2
 
+#cmakedefine01 BASE64_WITH_AVX512
+#define HAVE_AVX512 BASE64_WITH_AVX512
+
 #cmakedefine01 BASE64_WITH_NEON32
 #define HAVE_NEON32 BASE64_WITH_NEON32
 
diff --git a/include/libbase64.h b/include/libbase64.h
index d470a82f..c5908973 100644
--- a/include/libbase64.h
+++ b/include/libbase64.h
@@ -53,6 +53,7 @@ extern "C" {
 #define BASE64_FORCE_SSE41	(1 << 5)
 #define BASE64_FORCE_SSE42	(1 << 6)
 #define BASE64_FORCE_AVX	(1 << 7)
+#define BASE64_FORCE_AVX512	(1 << 8)
 
 struct base64_state {
 	int eof;
diff --git a/lib/arch/avx512/codec.c b/lib/arch/avx512/codec.c
new file mode 100644
index 00000000..3fd73521
--- /dev/null
+++ b/lib/arch/avx512/codec.c
@@ -0,0 +1,47 @@
+#include <stdint.h>
+#include <stddef.h>
+#include <stdlib.h>
+
+#include "../../../include/libbase64.h"
+#include "../../tables/tables.h"
+#include "../../codecs.h"
+#include "config.h"
+#include "../../env.h"
+
+#if HAVE_AVX512
+#include <immintrin.h>
+
+#include "dec_reshuffle.c"
+#include "dec_loop.c"
+#include "enc_reshuffle_translate.c"
+#include "enc_loop.c"
+
+#endif	// HAVE_AVX512
+
+// AVX512 stream encoder. enc_loop_avx512() is only pulled in by the
+// HAVE_AVX512 block above, so that is the macro that must gate the body;
+// otherwise the function falls back to BASE64_ENC_STUB.
+BASE64_ENC_FUNCTION(avx512)
+{
+#if HAVE_AVX512
+	#include "../generic/enc_head.c"
+	enc_loop_avx512(&s, &slen, &o, &olen);
+	#include "../generic/enc_tail.c"
+#else
+	BASE64_ENC_STUB
+#endif
+}
+
+// AVX512 stream decoder. There is no 512-bit decoding loop at present; the
+// 256-bit dec_loop_avx2() from this directory's dec_loop.c is used instead.
+// That file is only included when HAVE_AVX512 is set, hence the guard.
+BASE64_DEC_FUNCTION(avx512)
+{
+#if HAVE_AVX512
+	#include "../generic/dec_head.c"
+	dec_loop_avx2(&s, &slen, &o, &olen);
+	#include "../generic/dec_tail.c"
+#else
+	BASE64_DEC_STUB
+#endif
+}
diff --git a/lib/arch/avx512/dec_loop.c b/lib/arch/avx512/dec_loop.c
new file mode 100644
index 00000000..f959fc4b
--- /dev/null
+++ b/lib/arch/avx512/dec_loop.c
@@ -0,0 +1,119 @@
+// Decode one 32-byte block of base64 input at *s into 24 output bytes at *o.
+// Returns 0 without touching the pointers or the round counter if the block
+// fails the lookup-table validity check; otherwise advances *s by 32 and
+// *o by 24, decrements *rounds and returns 1. Note that the store writes a
+// full 32 bytes at *o.
+static inline int
+dec_loop_avx2_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
+{
+	const __m256i lut_lo = _mm256_setr_epi8(
+		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
+		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A,
+		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
+		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);
+
+	const __m256i lut_hi = _mm256_setr_epi8(
+		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
+		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
+		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
+		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10);
+
+	const __m256i lut_roll = _mm256_setr_epi8(
+		0,  16,  19,   4, -65, -65, -71, -71,
+		0,   0,   0,   0,   0,   0,   0,   0,
+		0,  16,  19,   4, -65, -65, -71, -71,
+		0,   0,   0,   0,   0,   0,   0,   0);
+
+	const __m256i mask_2F = _mm256_set1_epi8(0x2F);
+
+	// Load input:
+	__m256i str = _mm256_loadu_si256((__m256i *) *s);
+
+	// See the SSSE3 decoder for an explanation of the algorithm.
+	const __m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi32(str, 4), mask_2F);
+	const __m256i lo_nibbles = _mm256_and_si256(str, mask_2F);
+	const __m256i hi         = _mm256_shuffle_epi8(lut_hi, hi_nibbles);
+	const __m256i lo         = _mm256_shuffle_epi8(lut_lo, lo_nibbles);
+
+	if (!_mm256_testz_si256(lo, hi)) {
+		return 0;
+	}
+
+	const __m256i eq_2F = _mm256_cmpeq_epi8(str, mask_2F);
+	const __m256i roll  = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2F, hi_nibbles));
+
+	// Now simply add the delta values to the input:
+	str = _mm256_add_epi8(str, roll);
+
+	// Reshuffle the input to packed 12-byte output format:
+	str = dec_reshuffle(str);
+
+	// Store the output:
+	_mm256_storeu_si256((__m256i *) *o, str);
+
+	*s += 32;
+	*o += 24;
+	*rounds -= 1;
+
+	return 1;
+}
+
+// Bulk decoder: runs as many 32-byte rounds as the input allows, unrolled in
+// groups of 8, 4, 2 and 1. *slen and *olen are adjusted up front for all
+// planned rounds and corrected afterwards for rounds that did not run because
+// an inner call returned 0. Inputs shorter than 45 bytes are left untouched.
+static inline void
+dec_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+	if (*slen < 45) {
+		return;
+	}
+
+	// Process blocks of 32 bytes per round. Because 8 extra zero bytes are
+	// written after the output, ensure that there will be at least 13
+	// bytes of input data left to cover the gap. (11 data bytes and up to
+	// two end-of-string markers.)
+	size_t rounds = (*slen - 13) / 32;
+
+	*slen -= rounds * 32;	// 32 bytes consumed per round
+	*olen += rounds * 24;	// 24 bytes produced per round
+
+	do {
+		if (rounds >= 8) {
+			if (dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds)) {
+				continue;
+			}
+			break;
+		}
+		if (rounds >= 4) {
+			if (dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds)) {
+				continue;
+			}
+			break;
+		}
+		if (rounds >= 2) {
+			if (dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds)) {
+				continue;
+			}
+			break;
+		}
+		dec_loop_avx2_inner(s, o, &rounds);
+		break;
+
+	} while (rounds > 0);
+
+	// Adjust for any rounds that were skipped:
+	*slen += rounds * 32;
+	*olen -= rounds * 24;
+}
diff --git a/lib/arch/avx512/dec_reshuffle.c b/lib/arch/avx512/dec_reshuffle.c
new file mode 100644
index 00000000..f3518098
--- /dev/null
+++ b/lib/arch/avx512/dec_reshuffle.c
@@ -0,0 +1,37 @@
+// Pack 32 decoded 6-bit values (one per input byte, top two bits clear) into
+// 24 contiguous bytes at the low end of the returned vector. The remaining
+// upper 8 bytes come from positions the byte shuffle below has zeroed.
+static inline __m256i
+dec_reshuffle (const __m256i in)
+{
+	// in, lower lane, bits, upper case are most significant bits, lower
+	// case are least significant bits:
+	// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
+	// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
+	// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
+	// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
+
+	const __m256i merge_ab_and_bc = _mm256_maddubs_epi16(in, _mm256_set1_epi32(0x01400140));
+	// 0000kkkk LLllllll 0000JJJJ JJjjKKKK
+	// 0000hhhh IIiiiiii 0000GGGG GGggHHHH
+	// 0000eeee FFffffff 0000DDDD DDddEEEE
+	// 0000bbbb CCcccccc 0000AAAA AAaaBBBB
+
+	__m256i out = _mm256_madd_epi16(merge_ab_and_bc, _mm256_set1_epi32(0x00011000));
+	// 00000000 JJJJJJjj KKKKkkkk LLllllll
+	// 00000000 GGGGGGgg HHHHhhhh IIiiiiii
+	// 00000000 DDDDDDdd EEEEeeee FFffffff
+	// 00000000 AAAAAAaa BBBBbbbb CCcccccc
+
+	// Pack bytes together in each lane:
+	out = _mm256_shuffle_epi8(out, _mm256_setr_epi8(
+		2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1,
+		2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1));
+	// 00000000 00000000 00000000 00000000
+	// LLllllll KKKKkkkk JJJJJJjj IIiiiiii
+	// HHHHhhhh GGGGGGgg FFffffff EEEEeeee
+	// DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa
+
+	// Pack lanes:
+	return _mm256_permutevar8x32_epi32(out, _mm256_setr_epi32(0, 1, 2, 4, 5, 6, -1, -1));
+}
diff --git a/lib/arch/avx512/enc_loop.c b/lib/arch/avx512/enc_loop.c
new file mode 100644
index 00000000..a6daca66
--- /dev/null
+++ b/lib/arch/avx512/enc_loop.c
@@ -0,0 +1,68 @@
+// Encode a single block: load 64 bytes from *s, convert them with
+// enc_reshuffle_translate() and store the 64 resulting bytes at *o.
+// Only 48 input bytes are consumed per call, so *s advances by 48 while
+// *o advances by 64.
+static inline void
+enc_loop_avx512_inner (const uint8_t **s, uint8_t **o)
+{
+	// Load input:
+	__m512i src = _mm512_loadu_si512((__m512i *) *s);
+
+	// Reshuffle, translate, store:
+	src = enc_reshuffle_translate(src);
+	_mm512_storeu_si512((__m512i *) *o, src);
+
+	*s += 48;
+	*o += 64;
+}
+
+// Bulk encoder: consumes input in 48-byte rounds, unrolled in groups of 8, 4,
+// 2 and 1, and updates *slen / *olen for everything it processes. Inputs
+// shorter than 64 bytes are left entirely to the caller.
+static inline void
+enc_loop_avx512 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+	if (*slen < 64) {
+		return;
+	}
+
+	// Process blocks of 48 bytes at a time. Because blocks are loaded 64
+	// bytes at a time, ensure that there will be at least 24 remaining bytes
+	// after the last round, so that the final read will not pass beyond the
+	// bounds of the input buffer:
+	size_t rounds = (*slen - 24) / 48;
+
+	*slen -= rounds * 48;   // 48 bytes consumed per round
+	*olen += rounds * 64;   // 64 bytes produced per round
+
+	while (rounds > 0) {
+		if (rounds >= 8) {
+			enc_loop_avx512_inner(s, o);
+			enc_loop_avx512_inner(s, o);
+			enc_loop_avx512_inner(s, o);
+			enc_loop_avx512_inner(s, o);
+			enc_loop_avx512_inner(s, o);
+			enc_loop_avx512_inner(s, o);
+			enc_loop_avx512_inner(s, o);
+			enc_loop_avx512_inner(s, o);
+			rounds -= 8;
+			continue;
+		}
+		if (rounds >= 4) {
+			enc_loop_avx512_inner(s, o);
+			enc_loop_avx512_inner(s, o);
+			enc_loop_avx512_inner(s, o);
+			enc_loop_avx512_inner(s, o);
+			rounds -= 4;
+			continue;
+		}
+		if (rounds >= 2) {
+			enc_loop_avx512_inner(s, o);
+			enc_loop_avx512_inner(s, o);
+			rounds -= 2;
+			continue;
+		}
+		// Exactly one round is left at this point:
+		enc_loop_avx512_inner(s, o);
+		break;
+	}
+}
diff --git a/lib/arch/avx512/enc_reshuffle_translate.c b/lib/arch/avx512/enc_reshuffle_translate.c
new file mode 100644
index 00000000..c033d81b
--- /dev/null
+++ b/lib/arch/avx512/enc_reshuffle_translate.c
@@ -0,0 +1,50 @@
+// AVX512 algorithm is based on permutevar and multishift. The code is
+// referenced from https://github.com/WojciechMula/base64-avx512 which
+// is under BSD-3 license
+
+// Convert the first 48 bytes of `input` into 64 base64 characters: a byte
+// permute reorders/duplicates the input bytes within each 32-bit lane, a
+// multishift moves every 6-bit field to the bottom of its own byte, and a
+// final byte permute looks those values up in the base64 alphabet.
+static inline __m512i
+enc_reshuffle_translate (const __m512i input)
+{
+	// 32-bit input
+	// [ 0  0  0  0  0  0  0  0|c1 c0 d5 d4 d3 d2 d1 d0|
+	//  b3 b2 b1 b0 c5 c4 c3 c2|a5 a4 a3 a2 a1 a0 b5 b4]
+	// output order  [1, 2, 0, 1]
+	// [b3 b2 b1 b0 c5 c4 c3 c2|c1 c0 d5 d4 d3 d2 d1 d0|
+	//  a5 a4 a3 a2 a1 a0 b5 b4|b3 b2 b1 b0 c3 c2 c1 c0]
+
+	const __m512i shuffle_input = _mm512_setr_epi32(
+		0x01020001, 0x04050304, 0x07080607, 0x0a0b090a,
+		0x0d0e0c0d, 0x10110f10, 0x13141213, 0x16171516,
+		0x191a1819, 0x1c1d1b1c, 0x1f201e1f, 0x22232122,
+		0x25262425, 0x28292728, 0x2b2c2a2b, 0x2e2f2d2e);
+
+	// Reorder bytes
+	// [b3 b2 b1 b0 c5 c4 c3 c2|c1 c0 d5 d4 d3 d2 d1 d0|
+	//  a5 a4 a3 a2 a1 a0 b5 b4|b3 b2 b1 b0 c3 c2 c1 c0]
+	const __m512i in = _mm512_permutexvar_epi8(shuffle_input, input);
+
+	// After multishift a single 32-bit lane has following layout
+	// [c1 c0 d5 d4 d3 d2 d1 d0|b1 b0 c5 c4 c3 c2 c1 c0|
+	//  a1 a0 b5 b4 b3 b2 b1 b0|d1 d0 a5 a4 a3 a2 a1 a0]
+	// (a = [10:17], b = [4:11], c = [22:27], d = [16:21])
+
+	// 48, 54, 36, 42, 16, 22, 4, 10
+	const __m512i shifts = _mm512_set1_epi64(0x3036242a1016040aULL);
+	const __m512i shuffled_in = _mm512_multishift_epi64_epi8(shifts, in);
+
+	// Translate immediately after reshuffling. The table is never written,
+	// so it is const; the 64-byte load covers exactly the 64 alphabet
+	// characters (the string's trailing NUL is not read).
+	static const char base64_table[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+	                                   "abcdefghijklmnopqrstuvwxyz"
+	                                   "0123456789+/";
+	const __m512i lookup = _mm512_loadu_si512(base64_table);
+
+	// Translate 6-bit values to ASCII.
+	return _mm512_permutexvar_epi8(shuffled_in, lookup);
+}
diff --git a/lib/codec_choose.c b/lib/codec_choose.c
index 6a07d6a7..a1b2c187 100644
--- a/lib/codec_choose.c
+++ b/lib/codec_choose.c
@@ -2,6 +2,7 @@
 #include <stdint.h>
 #include <stddef.h>
 #include <stdint.h>
+#include <stdio.h>
 
 #include "../include/libbase64.h"
 #include "codecs.h"
@@ -10,7 +11,7 @@
 
 #if (__x86_64__ || __i386__ || _M_X86 || _M_X64)
   #define BASE64_X86
-  #if (HAVE_SSSE3 || HAVE_SSE41 || HAVE_SSE42 || HAVE_AVX || HAVE_AVX2)
+  #if (HAVE_SSSE3 || HAVE_SSE41 || HAVE_SSE42 || HAVE_AVX || HAVE_AVX2 || HAVE_AVX512)
     #define BASE64_X86_SIMD
   #endif
 #endif
@@ -31,7 +32,7 @@
 		__cpuid_count(__level, 0, __eax, __ebx, __ecx, __edx)
 #else
 	#include <cpuid.h>
-	#if HAVE_AVX2 || HAVE_AVX
+	#if HAVE_AVX512 || HAVE_AVX2 || HAVE_AVX
 		#if ((__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 2) || (__clang_major__ >= 3))
 			static inline uint64_t _xgetbv (uint32_t index)
 			{
@@ -45,6 +46,10 @@
 	#endif
 #endif
 
+#ifndef bit_AVX512vl	// CPUID leaf 7, subleaf 0: EBX bit 31 / ECX bit 1
+#define bit_AVX512vl (1u << 31)
+#define bit_AVX512vbmi (1u << 1)
+#endif
 #ifndef bit_AVX2
 #define bit_AVX2 (1 << 5)
 #endif
@@ -75,6 +80,7 @@
 	BASE64_ENC_FUNCTION(arch);	\
 	BASE64_DEC_FUNCTION(arch);	\
 
+BASE64_CODEC_FUNCS(avx512)
 BASE64_CODEC_FUNCS(avx2)
 BASE64_CODEC_FUNCS(neon32)
 BASE64_CODEC_FUNCS(neon64)
@@ -91,9 +97,10 @@ codec_choose_forced (struct codec *codec, int flags)
 	// always allow it, even if the codec is a no-op.
 	// For testing purposes.
 
-	if (!(flags & 0xFF)) {
+	if (!(flags & 0xFFFF)) {
 		return false;
 	}
+
 	if (flags & BASE64_FORCE_AVX2) {
 		codec->enc = base64_stream_encode_avx2;
 		codec->dec = base64_stream_decode_avx2;
@@ -134,6 +141,11 @@ codec_choose_forced (struct codec *codec, int flags)
 		codec->dec = base64_stream_decode_avx;
 		return true;
 	}
+	if (flags & BASE64_FORCE_AVX512) {
+		codec->enc = base64_stream_encode_avx512;
+		codec->dec = base64_stream_decode_avx512;
+		return true;
+	}
 	return false;
 }
 
@@ -178,8 +190,8 @@ codec_choose_x86 (struct codec *codec)
 	max_level = __get_cpuid_max(0, NULL);
 	#endif
 
-	#if HAVE_AVX2 || HAVE_AVX
-	// Check for AVX/AVX2 support:
+	#if HAVE_AVX512 || HAVE_AVX2 || HAVE_AVX
+	// Check for AVX/AVX2/AVX512 support:
 	// Checking for AVX requires 3 things:
 	// 1) CPUID indicates that the OS uses XSAVE and XRSTORE instructions
 	//    (allowing saving YMM registers on context switch)
@@ -195,6 +207,19 @@ codec_choose_x86 (struct codec *codec)
 			uint64_t xcr_mask;
 			xcr_mask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
 			if (xcr_mask & _XCR_XMM_AND_YMM_STATE_ENABLED_BY_OS) {
+				#if HAVE_AVX512
+				// 512-bit code additionally needs the OS to save the opmask,
+				// ZMM_Hi256 and Hi16_ZMM state (XCR0 bits 5-7). VL is reported
+				// in EBX and VBMI in ECX of CPUID leaf 7, subleaf 0.
+				if (max_level >= 7 && (xcr_mask & 0xE0) == 0xE0) {
+					__cpuid_count(7, 0, eax, ebx, ecx, edx);
+					if ((ebx & bit_AVX512vl) && (ecx & bit_AVX512vbmi)) {
+						codec->enc = base64_stream_encode_avx512;
+						codec->dec = base64_stream_decode_avx512;
+						return true;
+					}
+				}
+				#endif
 				#if HAVE_AVX2
 				if (max_level >= 7) {
 					__cpuid_count(7, 0, eax, ebx, ecx, edx);
diff --git a/lib/lib.c b/lib/lib.c
index 4703512b..053931a9 100644
--- a/lib/lib.c
+++ b/lib/lib.c
@@ -68,7 +68,7 @@ void
 base64_stream_decode_init (struct base64_state *state, int flags)
 {
 	// If any of the codec flags are set, redo choice:
-	if (codec.dec == NULL || flags & 0xFF) {
+	if (codec.dec == NULL || flags & 0xFFFF) {
 		codec_choose(&codec, flags);
 	}
 	state->eof = 0;
diff --git a/test/ci/test.sh b/test/ci/test.sh
index 066a49f4..a296bfcf 100755
--- a/test/ci/test.sh
+++ b/test/ci/test.sh
@@ -7,9 +7,10 @@ if [ "${MACHINE}" == "x86_64" ]; then
 	export SSE41_CFLAGS=-msse4.1
 	export SSE42_CFLAGS=-msse4.2
 	export AVX_CFLAGS=-mavx
-	# no AVX2 on GHA macOS
+	# no AVX2 or AVX512 on GHA macOS
 	if [ "$(uname -s)" != "Darwin" ]; then
-		export AVX2_CFLAGS=-mavx2
+	    export AVX2_CFLAGS=-mavx2
+		export AVX512_CFLAGS="-mavx512vl -mavx512vbmi"
 	fi
 elif [ "${MACHINE}" == "aarch64" ]; then
 	export NEON64_CFLAGS="-march=armv8-a"
diff --git a/test/codec_supported.c b/test/codec_supported.c
index a027b994..f68c7668 100644
--- a/test/codec_supported.c
+++ b/test/codec_supported.c
@@ -11,6 +11,7 @@ static char *_codecs[] =
 , "SSE41"
 , "SSE42"
 , "AVX"
+, "AVX512"
 , NULL
 } ;