From 5b07bd0864de2053ce79adf4a157fc96bf7509a4 Mon Sep 17 00:00:00 2001
From: Jeremie Vandenplas <jeremie.vandenplas@gmail.com>
Date: Sun, 7 Jan 2024 21:31:27 +0100
Subject: [PATCH 1/8] Move common.fypp to include/ and adapt CMakeLists.txt

---
 CMakeLists.txt               | 1 +
 {src => include}/common.fypp | 0
 test/CMakeLists.txt          | 5 -----
 3 files changed, 1 insertion(+), 5 deletions(-)
 rename {src => include}/common.fypp (100%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 471ce6eda..b10e1f73d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -65,6 +65,7 @@ list(
   "-DPROJECT_VERSION_MAJOR=${PROJECT_VERSION_MAJOR}"
   "-DPROJECT_VERSION_MINOR=${PROJECT_VERSION_MINOR}"
   "-DPROJECT_VERSION_PATCH=${PROJECT_VERSION_PATCH}"
+  "-I${PROJECT_SOURCE_DIR}/include"
 )
 
 add_subdirectory(src)
diff --git a/src/common.fypp b/include/common.fypp
similarity index 100%
rename from src/common.fypp
rename to include/common.fypp
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 7acdfba1c..8e199182d 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -10,11 +10,6 @@ macro(ADDTEST name)
              WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
 endmacro(ADDTEST)
 
-list(
-  APPEND fyppFlags
-  "-I${PROJECT_SOURCE_DIR}/src"
-)
-
 add_subdirectory(array)
 add_subdirectory(ascii)
 add_subdirectory(bitsets)

From ddfd419a94a4528aa75ce237612e146a01710e03 Mon Sep 17 00:00:00 2001
From: Jeremie Vandenplas <jeremie.vandenplas@gmail.com>
Date: Sun, 7 Jan 2024 21:41:24 +0100
Subject: [PATCH 2/8] Adapt fpm deployment script for the include/ directory

---
 ci/fpm-deployment.sh                    | 3 +--
 test/string/test_string_assignment.fypp | 1 +
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/ci/fpm-deployment.sh b/ci/fpm-deployment.sh
index c88c3a227..fd81d1258 100644
--- a/ci/fpm-deployment.sh
+++ b/ci/fpm-deployment.sh
@@ -30,14 +30,13 @@ prune=(
   "$destdir/test/test_always_fail.f90"
   "$destdir/test/test_always_skip.f90"
   "$destdir/test/test_hash_functions.f90"
-  "$destdir/src/common.f90"
   "$destdir/src/f18estop.f90"
 )
 
 major=$(cut -d. -f1 VERSION)
 minor=$(cut -d. -f2 VERSION)
 patch=$(cut -d. -f3 VERSION)
-fyflags="${fyflags} -DPROJECT_VERSION_MAJOR=${major} -DPROJECT_VERSION_MINOR=${minor} -DPROJECT_VERSION_PATCH=${patch}"
+fyflags="${fyflags} -DPROJECT_VERSION_MAJOR=${major} -DPROJECT_VERSION_MINOR=${minor} -DPROJECT_VERSION_PATCH=${patch} -I include"
 
 mkdir -p "$destdir/src" "$destdir/test" "$destdir/example"
 
diff --git a/test/string/test_string_assignment.fypp b/test/string/test_string_assignment.fypp
index e64bcc754..4e934599e 100644
--- a/test/string/test_string_assignment.fypp
+++ b/test/string/test_string_assignment.fypp
@@ -1,3 +1,4 @@
+#:include "common.fypp"
 ! SPDX-Identifier: MIT
 module test_string_assignment
     use testdrive, only : new_unittest, unittest_type, error_type, check

From 7a2bffacf70e91c99085c00e9e6010caab76c60f Mon Sep 17 00:00:00 2001
From: Jeremie Vandenplas <jeremie.vandenplas@gmail.com>
Date: Sun, 7 Jan 2024 21:51:02 +0100
Subject: [PATCH 3/8] Modify test_mean_f03 following #675 by @arteebraina

---
 test/stats/test_mean_f03.fypp | 92 ++++++++++++++++++++++++++++++++++-
 1 file changed, 91 insertions(+), 1 deletion(-)

diff --git a/test/stats/test_mean_f03.fypp b/test/stats/test_mean_f03.fypp
index 786438d98..c01d2fee2 100644
--- a/test/stats/test_mean_f03.fypp
+++ b/test/stats/test_mean_f03.fypp
@@ -4,7 +4,7 @@
 #:set NRANK = 4
 
 module test_stats_meanf03
-    use testdrive, only : new_unittest, unittest_type, error_type, check
+    use testdrive, only : new_unittest, unittest_type, error_type, check, skip_test
     use stdlib_stats, only: mean
     use stdlib_kinds, only : int8, int16, int32, int64, sp, dp, xdp, qp
     use, intrinsic :: ieee_arithmetic, only : ieee_is_nan
@@ -65,25 +65,36 @@ contains
         !> Error handling
         type(error_type), allocatable, intent(out) :: error
 
+        #:if MAXRANK > 7
         call check(error, mean(d8_${k1}$), sum(real(d8_${k1}$, dp))/real(size(d8_${k1}$), dp)&
                     , 'mean(d8_${k1}$): uncorrect answer'&
                     , thr = dptol)
         if (allocated(error)) return
+
+        #:else
+        call skip_test(error, "Rank > 7 is not supported")
+        #:endif
     end subroutine
 
     subroutine test_stats_meanf03_all_optmask_${k1}$(error)
         !> Error handling
         type(error_type), allocatable, intent(out) :: error
 
+        #:if MAXRANK > 7
         call check(error, ieee_is_nan(mean(d8_${k1}$, .false.))&
                     , 'mean(d8_${k1}$, .false.): uncorrect answer')
         if (allocated(error)) return
+
+        #:else
+        call skip_test(error, "Rank > 7 is not supported")
+        #:endif
     end subroutine
 
     subroutine test_stats_meanf03_${k1}$(error)
         !> Error handling
         type(error_type), allocatable, intent(out) :: error
 
+        #:if MAXRANK > 7
         #:for dim in range(1, 9)
         call check(error&
                     , sum(abs(mean(d8_${k1}$, ${dim}$) -&
@@ -92,12 +103,17 @@ contains
                     )
         if (allocated(error)) return
         #:endfor
+
+        #:else
+        call skip_test(error, "Rank > 7 is not supported")
+        #:endif
     end subroutine
 
     subroutine test_stats_meanf03_optmask_${k1}$(error)
         !> Error handling
         type(error_type), allocatable, intent(out) :: error
 
+        #:if MAXRANK > 7
         call check(error, ieee_is_nan(mean(d1_${k1}$, 1, .false.))&
                     , 'mean(d1_${k1}$, 1, .false.): uncorrect answer'&
                     )
@@ -108,23 +124,33 @@ contains
                     , 'mean(d8_${k1}$, ${dim}$, .false.): uncorrect answer')
         if (allocated(error)) return
         #:endfor
+
+        #:else
+        call skip_test(error, "Rank > 7 is not supported")
+        #:endif
     end subroutine
 
     subroutine test_stats_meanf03_mask_all_${k1}$(error)
         !> Error handling
         type(error_type), allocatable, intent(out) :: error
 
+        #:if MAXRANK > 7
         call check(error, mean(d8_${k1}$, d8_${k1}$ > 0)&
                     , sum(real(d8_${k1}$, dp), d8_${k1}$ > 0)/real(count(d8_${k1}$ > 0), dp)&
                     , 'mean(d8_${k1}$, d8_${k1}$ > 0): uncorrect answer'&
                     , thr = dptol)
         if (allocated(error)) return
+
+        #:else
+        call skip_test(error, "Rank > 7 is not supported")
+        #:endif
     end subroutine
 
     subroutine test_stats_meanf03_mask_${k1}$(error)
         !> Error handling
         type(error_type), allocatable, intent(out) :: error
 
+        #:if MAXRANK > 7
         #:for dim in range(1, 9)
         call check(error&
                     , sum(abs(mean(d8_${k1}$, ${dim}$, d8_${k1}$ > 0) -&
@@ -133,6 +159,10 @@ contains
                     )
         if (allocated(error)) return
         #:endfor
+
+        #:else
+        call skip_test(error, "Rank > 7 is not supported")
+        #:endif
     end subroutine
     #:endfor
 
@@ -141,25 +171,36 @@ contains
         !> Error handling
         type(error_type), allocatable, intent(out) :: error
 
+        #:if MAXRANK > 7
         call check(error, mean(d8_${k1}$), sum(d8_${k1}$)/real(size(d8_${k1}$), ${k1}$)&
                     , 'mean(d8_${k1}$): uncorrect answer'&
                     , thr = ${k1}$tol)
         if (allocated(error)) return
+
+        #:else
+        call skip_test(error, "Rank > 7 is not supported")
+        #:endif
     end subroutine
 
     subroutine test_stats_meanf03_all_optmask_${k1}$(error)
         !> Error handling
         type(error_type), allocatable, intent(out) :: error
 
+        #:if MAXRANK > 7
         call check(error, ieee_is_nan(mean(d8_${k1}$, .false.))&
                     , 'mean(d8_${k1}$, .false.): uncorrect answer')
         if (allocated(error)) return
+
+        #:else
+        call skip_test(error, "Rank > 7 is not supported")
+        #:endif
     end subroutine
 
     subroutine test_stats_meanf03_${k1}$(error)
         !> Error handling
         type(error_type), allocatable, intent(out) :: error
 
+        #:if MAXRANK > 7
         #:for dim in range(1, 9)
         call check(error&
                     , sum(abs(mean(d8_${k1}$, ${dim}$) -&
@@ -168,34 +209,49 @@ contains
                     )
         if (allocated(error)) return
         #:endfor
+
+        #:else
+        call skip_test(error, "Rank > 7 is not supported")
+        #:endif
     end subroutine
 
     subroutine test_stats_meanf03_optmask_${k1}$(error)
         !> Error handling
         type(error_type), allocatable, intent(out) :: error
 
+        #:if MAXRANK > 7
         #:for dim in range(1, 9)
         call check(error, any(ieee_is_nan(mean(d8_${k1}$, ${dim}$, .false.)))&
                     , 'mean(d8_${k1}$, ${dim}$, .false.): uncorrect answer')
         if (allocated(error)) return
         #:endfor
+
+        #:else
+        call skip_test(error, "Rank > 7 is not supported")
+        #:endif
     end subroutine
 
     subroutine test_stats_meanf03_mask_all_${k1}$(error)
         !> Error handling
         type(error_type), allocatable, intent(out) :: error
 
+        #:if MAXRANK > 7
         call check(error, mean(d8_${k1}$, d8_${k1}$ > 0)&
                     , sum(d8_${k1}$, d8_${k1}$ > 0)/real(count(d8_${k1}$ > 0), ${k1}$)&
                     , 'mean(d8_${k1}$, d8_${k1}$ > 0): uncorrect answer'&
                     , thr = ${k1}$tol)
         if (allocated(error)) return
+
+        #:else
+        call skip_test(error, "Rank > 7 is not supported")
+        #:endif
     end subroutine
 
     subroutine test_stats_meanf03_mask_${k1}$(error)
         !> Error handling
         type(error_type), allocatable, intent(out) :: error
 
+        #:if MAXRANK > 7
         #:for dim in range(1, 9)
         call check(error&
                     , sum(abs(mean(d8_${k1}$, ${dim}$, d8_${k1}$ > 0) -&
@@ -204,6 +260,10 @@ contains
                     )
         if (allocated(error)) return
         #:endfor
+
+        #:else
+        call skip_test(error, "Rank > 7 is not supported")
+        #:endif
     end subroutine
     #:endfor
 
@@ -212,25 +272,36 @@ contains
         !> Error handling
         type(error_type), allocatable, intent(out) :: error
 
+        #:if MAXRANK > 7
         call check(error, mean(d8_c${k1}$), sum(d8_c${k1}$)/real(size(d8_c${k1}$), ${k1}$)&
                     , 'mean(d8_c${k1}$): uncorrect answer'&
                     , thr = ${k1}$tol)
         if (allocated(error)) return
+
+        #:else
+        call skip_test(error, "Rank > 7 is not supported")
+        #:endif
     end subroutine
 
     subroutine test_stats_meanf03_all_optmask_c${k1}$(error)
         !> Error handling
         type(error_type), allocatable, intent(out) :: error
 
+        #:if MAXRANK > 7
         call check(error, ieee_is_nan(real(mean(d8_c${k1}$, .false.)))&
                     , 'mean(d8_c${k1}$, .false.): uncorrect answer')
         if (allocated(error)) return
+
+        #:else
+        call skip_test(error, "Rank > 7 is not supported")
+        #:endif
     end subroutine
 
     subroutine test_stats_meanf03_c${k1}$(error)
         !> Error handling
         type(error_type), allocatable, intent(out) :: error
 
+        #:if MAXRANK > 7
         #:for dim in range(1, 9)
         call check(error&
                     , sum(abs(mean(d8_c${k1}$, ${dim}$) -&
@@ -239,34 +310,49 @@ contains
                     )
         if (allocated(error)) return
         #:endfor
+
+        #:else
+        call skip_test(error, "Rank > 7 is not supported")
+        #:endif
     end subroutine
 
     subroutine test_stats_meanf03_optmask_c${k1}$(error)
         !> Error handling
         type(error_type), allocatable, intent(out) :: error
 
+        #:if MAXRANK > 7
         #:for dim in range(1, 9)
         call check(error, any(ieee_is_nan(real(mean(d8_c${k1}$, ${dim}$, .false.))))&
                     , 'mean(d8_c${k1}$, ${dim}$, .false.): uncorrect answer')
         if (allocated(error)) return
         #:endfor
+
+        #:else
+        call skip_test(error, "Rank > 7 is not supported")
+        #:endif
     end subroutine
 
     subroutine test_stats_meanf03_mask_all_c${k1}$(error)
         !> Error handling
         type(error_type), allocatable, intent(out) :: error
 
+        #:if MAXRANK > 7
         call check(error, mean(d8_c${k1}$, d8_c${k1}$%re > 0)&
                     , sum(d8_c${k1}$, d8_c${k1}$%re > 0)/real(count(d8_c${k1}$%re > 0), ${k1}$)&
                     , 'mean(d8_c${k1}$, d8_c${k1}$%re > 0): uncorrect answer'&
                     , thr = ${k1}$tol)
         if (allocated(error)) return
+
+        #:else
+        call skip_test(error, "Rank > 7 is not supported")
+        #:endif
     end subroutine
 
     subroutine test_stats_meanf03_mask_c${k1}$(error)
         !> Error handling
         type(error_type), allocatable, intent(out) :: error
 
+        #:if MAXRANK > 7
         #:for dim in range(1, 9)
         call check(error&
                     , sum(abs(mean(d8_c${k1}$, ${dim}$, d8_c${k1}$%re > 0) -&
@@ -275,6 +361,10 @@ contains
                     )
         if (allocated(error)) return
         #:endfor
+
+        #:else
+        call skip_test(error, "Rank > 7 is not supported")
+        #:endif
     end subroutine
     #:endfor
 

From 9e6409e989209669b773d20726f770a7995ad88b Mon Sep 17 00:00:00 2001
From: Jeremie Vandenplas <jeremie.vandenplas@gmail.com>
Date: Sun, 7 Jan 2024 22:02:19 +0100
Subject: [PATCH 4/8] Add include/ to the FORD include paths

---
 API-doc-FORD-file.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/API-doc-FORD-file.md b/API-doc-FORD-file.md
index 0db3e0a21..8aa447b3a 100644
--- a/API-doc-FORD-file.md
+++ b/API-doc-FORD-file.md
@@ -3,6 +3,7 @@ project: Fortran-lang/stdlib
 summary: A community driven standard library for (modern) Fortran
 src_dir: src
 include: src
+         include
 exclude_dir: src/tests
 output_dir: API-doc
 page_dir: doc

From c550f6c30bb742ea29947de82e502b6ca4cf19b9 Mon Sep 17 00:00:00 2001
From: Jeremie Vandenplas <jeremie.vandenplas@gmail.com>
Date: Sun, 7 Jan 2024 22:27:59 +0100
Subject: [PATCH 5/8] Rename estop submodules to f08estop and f18estop

---
 src/f08estop.f90 | 4 ++--
 src/f18estop.f90 | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/f08estop.f90 b/src/f08estop.f90
index 81c77f4a9..cbfa473d7 100644
--- a/src/f08estop.f90
+++ b/src/f08estop.f90
@@ -1,4 +1,4 @@
-submodule (stdlib_error) estop
+submodule (stdlib_error) f08estop
 
 implicit none
 
@@ -38,4 +38,4 @@
 endif
 end procedure
 
-end submodule
+end submodule f08estop
diff --git a/src/f18estop.f90 b/src/f18estop.f90
index 59fd0c97f..241665a36 100644
--- a/src/f18estop.f90
+++ b/src/f18estop.f90
@@ -1,4 +1,4 @@
-submodule (stdlib_error) estop
+submodule (stdlib_error) f18estop
 
 implicit none
 
@@ -26,4 +26,4 @@
 endif
 end procedure
 
-end submodule estop
+end submodule f18estop

From 2d042fd481270e4681e0d964d240217eadba7017 Mon Sep 17 00:00:00 2001
From: Jeremie Vandenplas <jeremie.vandenplas@gmail.com>
Date: Sun, 7 Jan 2024 22:35:36 +0100
Subject: [PATCH 6/8] Remove duplicate nmhash.c and nmhash.h files

---
 test/hash_functions/nmhash.c |   8 -
 test/hash_functions/nmhash.h | 833 -----------------------------------
 2 files changed, 841 deletions(-)
 delete mode 100644 test/hash_functions/nmhash.c
 delete mode 100644 test/hash_functions/nmhash.h

diff --git a/test/hash_functions/nmhash.c b/test/hash_functions/nmhash.c
deleted file mode 100644
index 987bc568c..000000000
--- a/test/hash_functions/nmhash.c
+++ /dev/null
@@ -1,8 +0,0 @@
-#include "nmhash.h"
-int32_t nmhash32_test ( const void * key, size_t len, uint32_t seed ) {
-  return NMHASH32 (key, (const size_t) len, seed);
-}
-
-int32_t nmhash32x_test ( const void * key, size_t len, uint32_t seed ) {
-  return NMHASH32X (key, (const size_t) len, seed);
-}
diff --git a/test/hash_functions/nmhash.h b/test/hash_functions/nmhash.h
deleted file mode 100644
index 85f9cf8a1..000000000
--- a/test/hash_functions/nmhash.h
+++ /dev/null
@@ -1,833 +0,0 @@
-/*
- * verification:
- * NMHASH32:
- *   rurban/smhasher: 0x12A30553
- *   demerphq/smhasher: 0x3D8F6C47
- * NMHASH32X:
- *   rurban/smhasher: 0xA8580227
- *   demerphq/smhasher: 0x40B451B3
- */
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifndef _nmhash_h_
-#define _nmhash_h_
-
-#define NMH_VERSION 2
-
-#ifdef _MSC_VER
-#  pragma warning(push, 3)
-#endif
-
-#if defined(__cplusplus) && __cplusplus < 201103L
-#  define __STDC_CONSTANT_MACROS 1
-#endif
-
-#include <stdint.h>
-#include <string.h>
-
-#if defined(__GNUC__)
-#  if defined(__AVX2__)
-#    include <immintrin.h>
-#  elif defined(__SSE2__)
-#    include <emmintrin.h>
-#  endif
-#elif defined(_MSC_VER)
-#  include <intrin.h>
-#endif
-
-#ifdef _MSC_VER
-#  pragma warning(pop)
-#endif
-
-#if (defined(__GNUC__) && (__GNUC__ >= 3))  \
-  || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \
-  || defined(__clang__)
-#    define NMH_likely(x) __builtin_expect(x, 1)
-#else
-#    define NMH_likely(x) (x)
-#endif
-
-#if defined(__has_builtin)
-#  if __has_builtin(__builtin_rotateleft32) \
-    && !(defined(__INTEL_COMPILER) && defined(__APPLE__))
-#    define NMH_rotl32 __builtin_rotateleft32 /* clang */
-#  endif
-#endif
-#if !defined(NMH_rotl32)
-#  if defined(_MSC_VER)
-     /* Note: although _rotl exists for minGW (GCC under windows), performance seems poor */
-#    define NMH_rotl32(x,r) _rotl(x,r)
-#  else
-#    define NMH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r))))
-#  endif
-#endif
-
-#if ((defined(sun) || defined(__sun)) && __cplusplus) /* Solaris includes __STDC_VERSION__ with C++. Tested with GCC 5.5 */
-#  define NMH_RESTRICT /* disable */
-#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* >= C99 */
-#  define NMH_RESTRICT   restrict
-#elif defined(__cplusplus) && (defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER))
-#  define NMH_RESTRICT __restrict__
-#elif defined(__cplusplus) && defined(_MSC_VER)
-#  define NMH_RESTRICT __restrict
-#else
-#  define NMH_RESTRICT   /* disable */
-#endif
-
-/* endian macros */
-#ifndef NMHASH_LITTLE_ENDIAN
-#  if defined(_WIN32) || defined(__LITTLE_ENDIAN__) || defined(__x86_64__) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) || defined(__SDCC)
-#    define NMHASH_LITTLE_ENDIAN 1
-#  elif defined(__BIG_ENDIAN__) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
-#    define NMHASH_LITTLE_ENDIAN 0
-#  else
-#    warning could not determine endianness! Falling back to little endian.
-#    define NMHASH_LITTLE_ENDIAN 1
-#  endif
-#endif
-
-/* vector macros */
-#define NMH_SCALAR 0
-#define NMH_SSE2   1
-#define NMH_AVX2   2
-#define NMH_AVX512 3
-
-#ifndef NMH_VECTOR    /* can be defined on command line */
-#  if defined(__AVX512BW__)
-#    define NMH_VECTOR NMH_AVX512 /* _mm512_mullo_epi16 requires AVX512BW */
-#  elif defined(__AVX2__)
-#    define NMH_VECTOR NMH_AVX2  /* add '-mno-avx256-split-unaligned-load' and '-mn-oavx256-split-unaligned-store' for gcc */
-#  elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
-#    define NMH_VECTOR NMH_SSE2
-#  else
-#    define NMH_VECTOR NMH_SCALAR
-#  endif
-#endif
-
-/* align macros */
-#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)   /* C11+ */
-#  include <stdalign.h>
-#  define NMH_ALIGN(n)      alignas(n)
-#elif defined(__GNUC__)
-#  define NMH_ALIGN(n)      __attribute__ ((aligned(n)))
-#elif defined(_MSC_VER)
-#  define NMH_ALIGN(n)      __declspec(align(n))
-#else
-#  define NMH_ALIGN(n)   /* disabled */
-#endif
-
-#if NMH_VECTOR > 0
-#  define NMH_ACC_ALIGN 64
-#elif defined(__BIGGEST_ALIGNMENT__)
-#  define NMH_ACC_ALIGN __BIGGEST_ALIGNMENT__
-#elif defined(__SDCC)
-#  define NMH_ACC_ALIGN 1
-#else
-#  define NMH_ACC_ALIGN 16
-#endif
-
-/* constants */
-
-/* primes from xxh */
-#define NMH_PRIME32_1  UINT32_C(0x9E3779B1)
-#define NMH_PRIME32_2  UINT32_C(0x85EBCA77)
-#define NMH_PRIME32_3  UINT32_C(0xC2B2AE3D)
-#define NMH_PRIME32_4  UINT32_C(0x27D4EB2F)
-
-/*! Pseudorandom secret taken directly from FARSH. */
-NMH_ALIGN(NMH_ACC_ALIGN) static const uint32_t NMH_ACC_INIT[32] = {
-	UINT32_C(0xB8FE6C39), UINT32_C(0x23A44BBE), UINT32_C(0x7C01812C), UINT32_C(0xF721AD1C),
-	UINT32_C(0xDED46DE9), UINT32_C(0x839097DB), UINT32_C(0x7240A4A4), UINT32_C(0xB7B3671F),
-	UINT32_C(0xCB79E64E), UINT32_C(0xCCC0E578), UINT32_C(0x825AD07D), UINT32_C(0xCCFF7221),
-	UINT32_C(0xB8084674), UINT32_C(0xF743248E), UINT32_C(0xE03590E6), UINT32_C(0x813A264C),
-
-	UINT32_C(0x3C2852BB), UINT32_C(0x91C300CB), UINT32_C(0x88D0658B), UINT32_C(0x1B532EA3),
-	UINT32_C(0x71644897), UINT32_C(0xA20DF94E), UINT32_C(0x3819EF46), UINT32_C(0xA9DEACD8),
-	UINT32_C(0xA8FA763F), UINT32_C(0xE39C343F), UINT32_C(0xF9DCBBC7), UINT32_C(0xC70B4F1D),
-	UINT32_C(0x8A51E04B), UINT32_C(0xCDB45931), UINT32_C(0xC89F7EC9), UINT32_C(0xD9787364),
-};
-
-#if defined(_MSC_VER) && _MSC_VER >= 1914
-#  pragma warning(push)
-#  pragma warning(disable: 5045)
-#endif
-#ifdef __SDCC
-#  define const
-#  pragma save
-#  pragma disable_warning 110
-#  pragma disable_warning 126
-#endif
-
-/* read functions */
-static inline
-uint32_t
-NMH_readLE32(const void *const p)
-{
-	uint32_t v;
-	memcpy(&v, p, 4);
-#	if (NMHASH_LITTLE_ENDIAN)
-	return v;
-#	elif defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__clang__)
-	return __builtin_bswap32(v);
-#	elif defined(_MSC_VER)
-	return _byteswap_ulong(v);
-#	else
-	return ((v >> 24) & 0xff) | ((v >> 8) & 0xff00) | ((v << 8) & 0xff0000) | ((v << 24) & 0xff000000);
-#	endif
-}
-
-static inline
-uint16_t
-NMH_readLE16(const void *const p)
-{
-	uint16_t v;
-	memcpy(&v, p, 2);
-#	if (NMHASH_LITTLE_ENDIAN)
-	return v;
-#	else
-	return (uint16_t)((v << 8) | (v >> 8));
-#	endif
-}
-
-static inline
-uint32_t
-NMHASH32_0to8(uint32_t const x, uint32_t const seed2)
-{
-	/* base mixer: [-6 -12 776bf593 -19 11 3fb39c65 -15 -9 e9139917 -11 16] = 0.027071104091278835 */
-	const uint32_t m1 = UINT32_C(0x776BF593);
-	const uint32_t m2 = UINT32_C(0x3FB39C65);
-	const uint32_t m3 = UINT32_C(0xE9139917);
-
-#	if NMH_VECTOR == NMH_SCALAR
-	{
-		union { uint32_t u32; uint16_t u16[2]; } vx;
-		vx.u32 = x;
-		vx.u32 ^= (vx.u32 >> 12) ^ (vx.u32 >> 6);
-		vx.u16[0] *= (uint16_t)m1;
-		vx.u16[1] *= (uint16_t)(m1 >> 16);
-		vx.u32 ^= (vx.u32 << 11) ^ ( vx.u32 >> 19);
-		vx.u16[0] *= (uint16_t)m2;
-		vx.u16[1] *= (uint16_t)(m2 >> 16);
-		vx.u32 ^= seed2;
-		vx.u32 ^= (vx.u32 >> 15) ^ ( vx.u32 >> 9);
-		vx.u16[0] *= (uint16_t)m3;
-		vx.u16[1] *= (uint16_t)(m3 >> 16);
-		vx.u32 ^= (vx.u32 << 16) ^ ( vx.u32 >> 11);
-		return vx.u32;
-	}
-#	else /* at least NMH_SSE2 */
-	{
-		__m128i hv = _mm_setr_epi32((int)x, 0, 0, 0);
-		const __m128i sv = _mm_setr_epi32((int)seed2, 0, 0, 0);
-		const uint32_t *const result = (const uint32_t*)&hv;
-
-		hv = _mm_xor_si128(_mm_xor_si128(hv, _mm_srli_epi32(hv, 12)), _mm_srli_epi32(hv, 6));
-		hv = _mm_mullo_epi16(hv, _mm_setr_epi32((int)m1, 0, 0, 0));
-		hv = _mm_xor_si128(_mm_xor_si128(hv, _mm_slli_epi32(hv, 11)), _mm_srli_epi32(hv, 19));
-		hv = _mm_mullo_epi16(hv, _mm_setr_epi32((int)m2, 0, 0, 0));
-
-		hv = _mm_xor_si128(hv, sv);
-
-		hv = _mm_xor_si128(_mm_xor_si128(hv, _mm_srli_epi32(hv, 15)), _mm_srli_epi32(hv, 9));
-		hv = _mm_mullo_epi16(hv, _mm_setr_epi32((int)m3, 0, 0, 0));
-		hv = _mm_xor_si128(_mm_xor_si128(hv, _mm_slli_epi32(hv, 16)), _mm_srli_epi32(hv, 11));
-
-		return *result;
-	}
-#	endif
-}
-
-#define __NMH_M1 UINT32_C(0xF0D9649B)
-#define __NMH_M2 UINT32_C(0x29A7935D)
-#define __NMH_M3 UINT32_C(0x55D35831)
-
-NMH_ALIGN(NMH_ACC_ALIGN) static const uint32_t __NMH_M1_V[32] = {
-	__NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1,
-	__NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1,
-	__NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1,
-	__NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1,
-};
-NMH_ALIGN(NMH_ACC_ALIGN) static const uint32_t __NMH_M2_V[32] = {
-	__NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2,
-	__NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2,
-	__NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2,
-	__NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2,
-};
-NMH_ALIGN(NMH_ACC_ALIGN) static const uint32_t __NMH_M3_V[32] = {
-	__NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3,
-	__NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3,
-	__NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3,
-	__NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3,
-};
-
-static inline
-uint32_t
-NMHASH32_9to255(const uint8_t* const NMH_RESTRICT p, size_t const len, uint32_t const seed, int const type)
-{
-	/* base mixer: [f0d9649b  5 -13 29a7935d -9 11 55d35831 -20 -10 ] = 0.93495901789135362 */
-	uint32_t result = 0;
-#	if NMH_VECTOR == NMH_SCALAR
-	{
-		union { uint32_t u32; uint16_t u16[2]; } x[4], y[4];
-		uint32_t const sl = seed + (uint32_t)len;
-		size_t j;
-		x[0].u32 = NMH_PRIME32_1;
-		x[1].u32 = NMH_PRIME32_2;
-		x[2].u32 = NMH_PRIME32_3;
-		x[3].u32 = NMH_PRIME32_4;
-		for (j = 0; j < 4; ++j) y[j].u32 = sl;
-
-		if (type) {
-			/* 33 to 255 bytes */
-			size_t const r = (len - 1) / 32;
-			size_t i;
-			for (i = 0; i < r; ++i) {
-				for (j = 0; j < 4; ++j) x[j].u32 ^= NMH_readLE32(p + i * 32 + j * 4);
-				for (j = 0; j < 4; ++j) y[j].u32 ^= NMH_readLE32(p + i * 32 + j * 4 + 16);
-				for (j = 0; j < 4; ++j) x[j].u32 += y[j].u32;
-
-				for (j = 0; j < 4; ++j) {
-					x[j].u16[0] *= (uint16_t)(__NMH_M1 & 0xFFFF);
-					x[j].u16[1] *= (uint16_t)(__NMH_M1 >> 16);
-				}
-				for (j = 0; j < 4; ++j) x[j].u32 ^= (x[j].u32 << 5) ^ (x[j].u32 >> 13);
-				for (j = 0; j < 4; ++j) {
-					x[j].u16[0] *= (uint16_t)(__NMH_M2 & 0xFFFF);
-					x[j].u16[1] *= (uint16_t)(__NMH_M2 >> 16);
-				}
-
-				for (j = 0; j < 4; ++j) x[j].u32 ^= y[j].u32;
-
-				for (j = 0; j < 4; ++j) x[j].u32 ^= (x[j].u32 << 11) ^ (x[j].u32 >> 9);
-				for (j = 0; j < 4; ++j) {
-					x[j].u16[0] *= (uint16_t)(__NMH_M3 & 0xFFFF);
-					x[j].u16[1] *= (uint16_t)(__NMH_M3 >> 16);
-				}
-				for (j = 0; j < 4; ++j) x[j].u32 ^= (x[j].u32 >> 10) ^ (x[j].u32 >> 20);
-			}
-			for (j = 0; j < 4; ++j) x[j].u32 ^= NMH_readLE32(p + len - 32 + j * 4);
-			for (j = 0; j < 4; ++j) y[j].u32 ^= NMH_readLE32(p + len - 16 + j * 4);
-		} else {
-			/* 9 to 32 bytes */
-			x[0].u32 ^= NMH_readLE32(p);
-			x[1].u32 ^= NMH_readLE32(p + ((len>>4)<<3));
-			x[2].u32 ^= NMH_readLE32(p + len - 8);
-			x[3].u32 ^= NMH_readLE32(p + len - 8 - ((len>>4)<<3));
-			y[0].u32 ^= NMH_readLE32(p + 4);
-			y[1].u32 ^= NMH_readLE32(p + ((len>>4)<<3) + 4);
-			y[2].u32 ^= NMH_readLE32(p + len - 8 + 4);
-			y[3].u32 ^= NMH_readLE32(p + len - 8 - ((len>>4)<<3) + 4);
-		}
-
-		for (j = 0; j < 4; ++j) x[j].u32 += y[j].u32;
-		for (j = 0; j < 4; ++j) y[j].u32 ^= (y[j].u32 << 17) ^ (y[j].u32 >> 6);
-
-		for (j = 0; j < 4; ++j) {
-			x[j].u16[0] *= (uint16_t)(__NMH_M1 & 0xFFFF);
-			x[j].u16[1] *= (uint16_t)(__NMH_M1 >> 16);
-		}
-		for (j = 0; j < 4; ++j) x[j].u32 ^= (x[j].u32 << 5) ^ (x[j].u32 >> 13);
-		for (j = 0; j < 4; ++j) {
-			x[j].u16[0] *= (uint16_t)(__NMH_M2 & 0xFFFF);
-			x[j].u16[1] *= (uint16_t)(__NMH_M2 >> 16);
-		}
-
-		for (j = 0; j < 4; ++j) x[j].u32 ^= y[j].u32;
-
-		for (j = 0; j < 4; ++j) x[j].u32 ^= (x[j].u32 << 11) ^ (x[j].u32 >> 9);
-		for (j = 0; j < 4; ++j) {
-			x[j].u16[0] *= (uint16_t)(__NMH_M3 & 0xFFFF);
-			x[j].u16[1] *= (uint16_t)(__NMH_M3 >> 16);
-		}
-		for (j = 0; j < 4; ++j) x[j].u32 ^= (x[j].u32 >> 10) ^ (x[j].u32 >> 20);
-
-		x[0].u32 ^= NMH_PRIME32_1;
-		x[1].u32 ^= NMH_PRIME32_2;
-		x[2].u32 ^= NMH_PRIME32_3;
-		x[3].u32 ^= NMH_PRIME32_4;
-
-		for (j = 1; j < 4; ++j) x[0].u32 += x[j].u32;
-
-		x[0].u32 ^= sl + (sl >> 5);
-		x[0].u16[0] *= (uint16_t)(__NMH_M3 & 0xFFFF);
-		x[0].u16[1] *= (uint16_t)(__NMH_M3 >> 16);
-		x[0].u32 ^= (x[0].u32 >> 10) ^ (x[0].u32 >> 20);
-
-		result = x[0].u32;
-	}
-#	else /* at least NMH_SSE2 */
-	{
-		__m128i const h0 = _mm_setr_epi32((int)NMH_PRIME32_1, (int)NMH_PRIME32_2, (int)NMH_PRIME32_3, (int)NMH_PRIME32_4);
-		__m128i const sl = _mm_set1_epi32((int)seed + (int)len);
-		__m128i const m1 = _mm_set1_epi32((int)__NMH_M1);
-		__m128i const m2 = _mm_set1_epi32((int)__NMH_M2);
-		__m128i const m3 = _mm_set1_epi32((int)__NMH_M3);
-		__m128i       x = h0;
-		__m128i       y = sl;
-		const uint32_t *const px = (const uint32_t*)&x;
-
-		if (type) {
-			/* 32 to 127 bytes */
-			size_t const r = (len - 1) / 32;
-			size_t i;
-			for (i = 0; i < r; ++i) {
-				x = _mm_xor_si128(x, _mm_loadu_si128((const __m128i *)(p + i * 32)));
-				y = _mm_xor_si128(y, _mm_loadu_si128((const __m128i *)(p + i * 32 + 16)));
-				x = _mm_add_epi32(x, y);
-				x = _mm_mullo_epi16(x, m1);
-				x = _mm_xor_si128(_mm_xor_si128(x, _mm_slli_epi32(x, 5)), _mm_srli_epi32(x, 13));
-				x = _mm_mullo_epi16(x, m2);
-				x = _mm_xor_si128(x, y);
-				x = _mm_xor_si128(_mm_xor_si128(x, _mm_slli_epi32(x, 11)), _mm_srli_epi32(x, 9));
-				x = _mm_mullo_epi16(x, m3);
-				x = _mm_xor_si128(_mm_xor_si128(x, _mm_srli_epi32(x, 10)), _mm_srli_epi32(x, 20));
-			}
-			x = _mm_xor_si128(x, _mm_loadu_si128((const __m128i *)(p + len - 32)));
-			y = _mm_xor_si128(y, _mm_loadu_si128((const __m128i *)(p + len - 16)));
-		} else {
-			/* 9 to 32 bytes */
-			x = _mm_xor_si128(x, _mm_setr_epi32((int)NMH_readLE32(p), (int)NMH_readLE32(p + ((len>>4)<<3)), (int)NMH_readLE32(p + len - 8), (int)NMH_readLE32(p + len - 8 - ((len>>4)<<3))));
-			y = _mm_xor_si128(y, _mm_setr_epi32((int)NMH_readLE32(p + 4), (int)NMH_readLE32(p + ((len>>4)<<3) + 4), (int)NMH_readLE32(p + len - 8 + 4), (int)NMH_readLE32(p + len - 8 - ((len>>4)<<3) + 4)));
-		}
-
-		x = _mm_add_epi32(x, y);
-
-		y = _mm_xor_si128(_mm_xor_si128(y, _mm_slli_epi32(y, 17)), _mm_srli_epi32(y, 6));
-
-		x = _mm_mullo_epi16(x, m1);
-		x = _mm_xor_si128(_mm_xor_si128(x, _mm_slli_epi32(x, 5)), _mm_srli_epi32(x, 13));
-		x = _mm_mullo_epi16(x, m2);
-		x = _mm_xor_si128(x, y);
-		x = _mm_xor_si128(_mm_xor_si128(x, _mm_slli_epi32(x, 11)), _mm_srli_epi32(x, 9));
-		x = _mm_mullo_epi16(x, m3);
-		x = _mm_xor_si128(_mm_xor_si128(x, _mm_srli_epi32(x, 10)), _mm_srli_epi32(x, 20));
-
-		x = _mm_xor_si128(x, h0);
-		x = _mm_add_epi32(x, _mm_srli_si128(x, 4));
-		x = _mm_add_epi32(x, _mm_srli_si128(x, 8));
-
-		x = _mm_xor_si128(x, _mm_add_epi32(sl, _mm_srli_epi32(sl, 5)));
-		x = _mm_mullo_epi16(x, m3);
-		x = _mm_xor_si128(_mm_xor_si128(x, _mm_srli_epi32(x, 10)), _mm_srli_epi32(x, 20));
-
-		result = *px;
-	}
-#	endif
-	return *&result;
-}
-#define NMHASH32_9to32(p, len, seed) NMHASH32_9to255(p, len, seed, 0)
-#define NMHASH32_33to255(p, len, seed) NMHASH32_9to255(p, len, seed, 1)
-
-#undef __NMH_M1
-#undef __NMH_M2
-#undef __NMH_M3
-
-#if NMH_VECTOR == NMH_SCALAR
-#define NMHASH32_long_round NMHASH32_long_round_scalar
-static inline
-void
-NMHASH32_long_round_scalar(uint32_t *const NMH_RESTRICT accX, uint32_t *const NMH_RESTRICT accY, const uint8_t* const NMH_RESTRICT p)
-{
-	/* breadth first calculation will hint some compiler to auto vectorize the code
-	 * on gcc, the performance becomes 10x than the depth first, and about 80% of the manually vectorized code
-	 */
-	const size_t nbGroups = sizeof(NMH_ACC_INIT) / sizeof(*NMH_ACC_INIT);
-	size_t i;
-
-	for (i = 0; i < nbGroups; ++i) {
-		accX[i] ^= NMH_readLE32(p + i * 4);
-	}
-	for (i = 0; i < nbGroups; ++i) {
-		accY[i] ^= NMH_readLE32(p + i * 4 + sizeof(NMH_ACC_INIT));
-	}
-	for (i = 0; i < nbGroups; ++i) {
-		accX[i] += accY[i];
-	}
-	for (i = 0; i < nbGroups; ++i) {
-		accY[i] ^= accX[i] >> 1;
-	}
-	for (i = 0; i < nbGroups * 2; ++i) {
-		((uint16_t*)accX)[i] *= ((uint16_t*)__NMH_M1_V)[i];
-	}
-	for (i = 0; i < nbGroups; ++i) {
-		accX[i] ^= accX[i] << 5 ^ accX[i] >> 13;
-	}
-	for (i = 0; i < nbGroups * 2; ++i) {
-		((uint16_t*)accX)[i] *= ((uint16_t*)__NMH_M2_V)[i];
-	}
-	for (i = 0; i < nbGroups; ++i) {
-		accX[i] ^= accY[i];
-	}
-	for (i = 0; i < nbGroups; ++i) {
-		accX[i] ^= accX[i] << 11 ^ accX[i] >> 9;
-	}
-	for (i = 0; i < nbGroups * 2; ++i) {
-		((uint16_t*)accX)[i] *= ((uint16_t*)__NMH_M3_V)[i];
-	}
-	for (i = 0; i < nbGroups; ++i) {
-		accX[i] ^= accX[i] >> 10 ^ accX[i] >> 20;
-	}
-}
-#endif
-
-#if NMH_VECTOR == NMH_SSE2
-#  define _NMH_MM_(F) _mm_ ## F
-#  define _NMH_MMW_(F) _mm_ ## F ## 128
-#  define _NMH_MM_T __m128i
-#elif NMH_VECTOR == NMH_AVX2
-#  define _NMH_MM_(F) _mm256_ ## F
-#  define _NMH_MMW_(F) _mm256_ ## F ## 256
-#  define _NMH_MM_T __m256i
-#elif NMH_VECTOR == NMH_AVX512
-#  define _NMH_MM_(F) _mm512_ ## F
-#  define _NMH_MMW_(F) _mm512_ ## F ## 512
-#  define _NMH_MM_T __m512i
-#endif
-
-#if NMH_VECTOR == NMH_SSE2 || NMH_VECTOR == NMH_AVX2 || NMH_VECTOR == NMH_AVX512
-#  define NMHASH32_long_round NMHASH32_long_round_sse
-#  define NMH_VECTOR_NB_GROUP (sizeof(NMH_ACC_INIT) / sizeof(*NMH_ACC_INIT) / (sizeof(_NMH_MM_T) / sizeof(*NMH_ACC_INIT)))
-static inline
-void
-NMHASH32_long_round_sse(uint32_t *const NMH_RESTRICT accX, uint32_t *const NMH_RESTRICT accY, const uint8_t* const NMH_RESTRICT p)
-{
-	const _NMH_MM_T *const NMH_RESTRICT m1    = (const _NMH_MM_T * NMH_RESTRICT)__NMH_M1_V;
-	const _NMH_MM_T *const NMH_RESTRICT m2    = (const _NMH_MM_T * NMH_RESTRICT)__NMH_M2_V;
-	const _NMH_MM_T *const NMH_RESTRICT m3    = (const _NMH_MM_T * NMH_RESTRICT)__NMH_M3_V;
-	      _NMH_MM_T *const              xaccX = (      _NMH_MM_T *             )accX;
-	      _NMH_MM_T *const              xaccY = (      _NMH_MM_T *             )accY;
-	      _NMH_MM_T *const              xp    = (      _NMH_MM_T *             )p;
-	size_t i;
-
-	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
-		xaccX[i] = _NMH_MMW_(xor_si)(xaccX[i], _NMH_MMW_(loadu_si)(xp + i));
-	}
-	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
-		xaccY[i] = _NMH_MMW_(xor_si)(xaccY[i], _NMH_MMW_(loadu_si)(xp + i + NMH_VECTOR_NB_GROUP));
-	}
-	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
-		xaccX[i] = _NMH_MM_(add_epi32)(xaccX[i], xaccY[i]);
-	}
-	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
-		xaccY[i] = _NMH_MMW_(xor_si)(xaccY[i], _NMH_MM_(srli_epi32)(xaccX[i], 1));
-	}
-	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
-		xaccX[i] = _NMH_MM_(mullo_epi16)(xaccX[i], *m1);
-	}
-	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
-		xaccX[i] = _NMH_MMW_(xor_si)(_NMH_MMW_(xor_si)(xaccX[i], _NMH_MM_(slli_epi32)(xaccX[i], 5)), _NMH_MM_(srli_epi32)(xaccX[i], 13));
-	}
-	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
-		xaccX[i] = _NMH_MM_(mullo_epi16)(xaccX[i], *m2);
-	}
-	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
-		xaccX[i] = _NMH_MMW_(xor_si)(xaccX[i], xaccY[i]);
-	}
-	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
-		xaccX[i] = _NMH_MMW_(xor_si)(_NMH_MMW_(xor_si)(xaccX[i], _NMH_MM_(slli_epi32)(xaccX[i], 11)), _NMH_MM_(srli_epi32)(xaccX[i], 9));
-	}
-	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
-		xaccX[i] = _NMH_MM_(mullo_epi16)(xaccX[i], *m3);
-	}
-	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
-		xaccX[i] = _NMH_MMW_(xor_si)(_NMH_MMW_(xor_si)(xaccX[i], _NMH_MM_(srli_epi32)(xaccX[i], 10)), _NMH_MM_(srli_epi32)(xaccX[i], 20));
-	}
-}
-#  undef _NMH_MM_
-#  undef _NMH_MMW_
-#  undef _NMH_MM_T
-#  undef NMH_VECTOR_NB_GROUP
-#endif
-
-static
-uint32_t
-NMHASH32_long(const uint8_t* const NMH_RESTRICT p, size_t const len, uint32_t const seed)
-{
-	NMH_ALIGN(NMH_ACC_ALIGN) uint32_t accX[sizeof(NMH_ACC_INIT)/sizeof(*NMH_ACC_INIT)];
-	NMH_ALIGN(NMH_ACC_ALIGN) uint32_t accY[sizeof(accX)/sizeof(*accX)];
-	size_t const nbRounds = (len - 1) / (sizeof(accX) + sizeof(accY));
-	size_t i;
-	uint32_t sum = 0;
-
-	/* init */
-	for (i = 0; i < sizeof(accX)/sizeof(*accX); ++i) accX[i] = NMH_ACC_INIT[i];
-	for (i = 0; i < sizeof(accY)/sizeof(*accY); ++i) accY[i] = seed;
-
-	for (i = 0; i < nbRounds; ++i) {
-		NMHASH32_long_round(accX, accY, p + i * (sizeof(accX) + sizeof(accY)));
-	}
-	NMHASH32_long_round(accX, accY, p + len - (sizeof(accX) + sizeof(accY)));
-
-	/* merge acc */
-	for (i = 0; i < sizeof(accX)/sizeof(*accX); ++i) accX[i] ^= NMH_ACC_INIT[i];
-	for (i = 0; i < sizeof(accX)/sizeof(*accX); ++i) sum += accX[i];
-
-#	if SIZE_MAX > UINT32_C(-1)
-	sum += (uint32_t)(len >> 32);
-#	endif
-	return sum ^ (uint32_t)len;
-}
-
-static inline
-uint32_t
-NMHASH32_avalanche32(uint32_t const x)
-{
-	/* [-21 -8 cce5196d 12 -7 464be229 -21 -8] = 3.2267098842182733 */
-	const uint32_t m1 = UINT32_C(0xCCE5196D);
-	const uint32_t m2 = UINT32_C(0x464BE229);
-	union { uint32_t u32; uint16_t u16[2]; } vx;
-	vx.u32    = x;
-	vx.u32   ^= (vx.u32 >> 8) ^ (vx.u32 >> 21);
-	vx.u16[0] = (uint16_t)(vx.u16[0] * (uint16_t)m1);
-	vx.u16[1] = (uint16_t)(vx.u16[1] * (uint16_t)(m1 >> 16));
-	vx.u32   ^= (vx.u32 << 12) ^ (vx.u32 >> 7);
-	vx.u16[0] = (uint16_t)(vx.u16[0] * (uint16_t)m2);
-	vx.u16[1] = (uint16_t)(vx.u16[1] * (uint16_t)(m2 >> 16));
-	return vx.u32 ^ (vx.u32 >> 8) ^ (vx.u32 >> 21);
-}
-
-static inline
-uint32_t
-NMHASH32(const void* const NMH_RESTRICT input, size_t const len, uint32_t seed)
-{
-	const uint8_t *const p = (const uint8_t *)input;
-	if (NMH_likely(len <= 32)) {
-		if(NMH_likely(len > 8)) {
-			return NMHASH32_9to32(p, len, seed);
-		}
-		if(NMH_likely(len > 4)) {
-			uint32_t x = NMH_readLE32(p);
-			uint32_t y = NMH_readLE32(p + len - 4) ^ (NMH_PRIME32_4 + 2 + seed);
-			x += y;
-			x ^= x << (len + 7);
-			return NMHASH32_0to8(x, NMH_rotl32(y, 5));
-		} else {
-			union { uint32_t u32; uint16_t u16[2]; uint8_t u8[4]; } data;
-			switch (len) {
-				case 0: seed += NMH_PRIME32_2;
-					data.u32 = 0;
-					break;
-				case 1: seed += NMH_PRIME32_2 + (UINT32_C(1) << 24) + (1 << 1);
-					data.u32 = p[0];
-					break;
-				case 2: seed += NMH_PRIME32_2 + (UINT32_C(2) << 24) + (2 << 1);
-					data.u32 = NMH_readLE16(p);
-					break;
-				case 3: seed += NMH_PRIME32_2 + (UINT32_C(3) << 24) + (3 << 1);
-					data.u16[1] = p[2];
-					data.u16[0] = NMH_readLE16(p);
-					break;
-				case 4: seed += NMH_PRIME32_3;
-					data.u32 = NMH_readLE32(p);
-					break;
-				default: return 0;
-			}
-			return NMHASH32_0to8(data.u32 + seed, NMH_rotl32(seed, 5));
-		}
-	}
-	if (NMH_likely(len < 256)) {
-		return NMHASH32_33to255(p, len, seed);
-	}
-	return NMHASH32_avalanche32(NMHASH32_long(p, len, seed));
-}
-
-static inline
-uint32_t
-NMHASH32X_0to4(uint32_t x, uint32_t const seed)
-{
-	/* [bdab1ea9 18 a7896a1b 12 83796a2d 16] = 0.092922873297662509 */
-	x ^= seed;
-	x *= UINT32_C(0xBDAB1EA9);
-	x += NMH_rotl32(seed, 31);
-	x ^= x >> 18;
-	x *= UINT32_C(0xA7896A1B);
-	x ^= x >> 12;
-	x *= UINT32_C(0x83796A2D);
-	x ^= x >> 16;
-	return x;
-}
-
-static inline
-uint32_t
-NMHASH32X_5to8(const uint8_t* const NMH_RESTRICT p, size_t const len, uint32_t const seed)
-{
-	/* - 5 to 9 bytes
-	 * - mixer: [11049a7d 23 bcccdc7b 12 065e9dad 12] = 0.16577596555667246 */
-
-	uint32_t       x = NMH_readLE32(p) ^ NMH_PRIME32_3;
-	uint32_t const y = NMH_readLE32(p + len - 4) ^ seed;
-	x += y;
-	x ^= x >> len;
-	x *= UINT32_C(0x11049A7D);
-	x ^= x >> 23;
-	x *= UINT32_C(0xBCCCDC7B);
-	x ^= NMH_rotl32(y, 3);
-	x ^= x >> 12;
-	x *= UINT32_C(0x065E9DAD);
-	x ^= x >> 12;
-	return x;
-}
-
-static inline
-uint32_t
-NMHASH32X_9to255(const uint8_t* const NMH_RESTRICT p, size_t const len, uint32_t const seed)
-{
-	/* - at least 9 bytes
-	 * - base mixer: [11049a7d 23 bcccdc7b 12 065e9dad 12] = 0.16577596555667246
-	 * - tail mixer: [16 a52fb2cd 15 551e4d49 16] = 0.17162579707098322
-	 */
-
-	uint32_t x = NMH_PRIME32_3;
-	uint32_t y = seed;
-	uint32_t a = NMH_PRIME32_4;
-	uint32_t b = seed;
-	size_t i, r = (len - 1) / 16;
-
-	for (i = 0; i < r; ++i) {
-		x ^= NMH_readLE32(p + i * 16 + 0);
-		y ^= NMH_readLE32(p + i * 16 + 4);
-		x ^= y;
-		x *= UINT32_C(0x11049A7D);
-		x ^= x >> 23;
-		x *= UINT32_C(0xBCCCDC7B);
-		y  = NMH_rotl32(y, 4);
-		x ^= y;
-		x ^= x >> 12;
-		x *= UINT32_C(0x065E9DAD);
-		x ^= x >> 12;
-
-		a ^= NMH_readLE32(p + i * 16 + 8);
-		b ^= NMH_readLE32(p + i * 16 + 12);
-		a ^= b;
-		a *= UINT32_C(0x11049A7D);
-		a ^= a >> 23;
-		a *= UINT32_C(0xBCCCDC7B);
-		b  = NMH_rotl32(b, 3);
-		a ^= b;
-		a ^= a >> 12;
-		a *= UINT32_C(0x065E9DAD);
-		a ^= a >> 12;
-	}
-
-	if (NMH_likely(((uint8_t)len-1) & 8)) {
-		if (NMH_likely(((uint8_t)len-1) & 4)) {
-			a ^= NMH_readLE32(p + r * 16 + 0);
-			b ^= NMH_readLE32(p + r * 16 + 4);
-			a ^= b;
-			a *= UINT32_C(0x11049A7D);
-			a ^= a >> 23;
-			a *= UINT32_C(0xBCCCDC7B);
-			a ^= NMH_rotl32(b, 4);
-			a ^= a >> 12;
-			a *= UINT32_C(0x065E9DAD);
-		} else {
-			a ^= NMH_readLE32(p + r * 16) + b;
-			a ^= a >> 16;
-			a *= UINT32_C(0xA52FB2CD);
-			a ^= a >> 15;
-			a *= UINT32_C(0x551E4D49);
-		}
-
-		x ^= NMH_readLE32(p + len - 8);
-		y ^= NMH_readLE32(p + len - 4);
-		x ^= y;
-		x *= UINT32_C(0x11049A7D);
-		x ^= x >> 23;
-		x *= UINT32_C(0xBCCCDC7B);
-		x ^= NMH_rotl32(y, 3);
-		x ^= x >> 12;
-		x *= UINT32_C(0x065E9DAD);
-	} else {
-		if (NMH_likely(((uint8_t)len-1) & 4)) {
-			a ^= NMH_readLE32(p + r * 16) + b;
-			a ^= a >> 16;
-			a *= UINT32_C(0xA52FB2CD);
-			a ^= a >> 15;
-			a *= UINT32_C(0x551E4D49);
-		}
-		x ^= NMH_readLE32(p + len - 4) + y;
-		x ^= x >> 16;
-		x *= UINT32_C(0xA52FB2CD);
-		x ^= x >> 15;
-		x *= UINT32_C(0x551E4D49);
-	}
-
-	x ^= (uint32_t)len;
-	x ^= NMH_rotl32(a, 27); /* rotate one lane to pass Diff test */
-	x ^= x >> 14;
-	x *= UINT32_C(0x141CC535);
-
-	return x;
-}
-
-static inline
-uint32_t
-NMHASH32X_avalanche32(uint32_t x)
-{
-	/* mixer with 2 mul from skeeto/hash-prospector:
-	 * [15 d168aaad 15 af723597 15] = 0.15983776156606694
-	 */
-	x ^= x >> 15;
-	x *= UINT32_C(0xD168AAAD);
-	x ^= x >> 15;
-	x *= UINT32_C(0xAF723597);
-	x ^= x >> 15;
-	return x;
-}
-
-/* use 32*32->32 multiplication for short hash */
-static inline
-uint32_t
-NMHASH32X(const void* const NMH_RESTRICT input, size_t const len, uint32_t seed)
-{
-	const uint8_t *const p = (const uint8_t *)input;
-	if (NMH_likely(len <= 8)) {
-		if (NMH_likely(len > 4)) {
-			return NMHASH32X_5to8(p, len, seed);
-		} else {
-			/* 0-4 bytes */
-			union { uint32_t u32; uint16_t u16[2]; uint8_t u8[4]; } data;
-			switch (len) {
-				case 0: seed += NMH_PRIME32_2;
-					data.u32 = 0;
-					break;
-				case 1: seed += NMH_PRIME32_2 + (UINT32_C(1) << 24) + (1 << 1);
-					data.u32 = p[0];
-					break;
-				case 2: seed += NMH_PRIME32_2 + (UINT32_C(2) << 24) + (2 << 1);
-					data.u32 = NMH_readLE16(p);
-					break;
-				case 3: seed += NMH_PRIME32_2 + (UINT32_C(3) << 24) + (3 << 1);
-					data.u16[1] = p[2];
-					data.u16[0] = NMH_readLE16(p);
-					break;
-				case 4: seed += NMH_PRIME32_1;
-					data.u32 = NMH_readLE32(p);
-					break;
-				default: return 0;
-			}
-			return NMHASH32X_0to4(data.u32, seed);
-		}
-	}
-	if (NMH_likely(len < 256)) {
-		return NMHASH32X_9to255(p, len, seed);
-	}
-	return NMHASH32X_avalanche32(NMHASH32_long(p, len, seed));
-}
-
-#if defined(_MSC_VER) && _MSC_VER >= 1914
-#  pragma warning(pop)
-#endif
-#ifdef __SDCC
-#  pragma restore
-#  undef const
-#endif
-
-#endif /* _nmhash_h_ */
-
-#ifdef __cplusplus
-}
-#endif

From b9c44fe7d2d55aba8816ef1fd33c812b33cb3c12 Mon Sep 17 00:00:00 2001
From: Jeremie Vandenplas <jeremie.vandenplas@gmail.com>
Date: Sun, 7 Jan 2024 22:37:52 +0100
Subject: [PATCH 7/8] Revert "Remove duplicate nmhash.c and nmhash.h files"

This reverts commit 2d042fd481270e4681e0d964d240217eadba7017.
---
 test/hash_functions/nmhash.c |   8 +
 test/hash_functions/nmhash.h | 833 +++++++++++++++++++++++++++++++++++
 2 files changed, 841 insertions(+)
 create mode 100644 test/hash_functions/nmhash.c
 create mode 100644 test/hash_functions/nmhash.h

diff --git a/test/hash_functions/nmhash.c b/test/hash_functions/nmhash.c
new file mode 100644
index 000000000..987bc568c
--- /dev/null
+++ b/test/hash_functions/nmhash.c
@@ -0,0 +1,8 @@
+#include "nmhash.h"
+int32_t nmhash32_test ( const void * key, size_t len, uint32_t seed ) {
+  return NMHASH32 (key, (const size_t) len, seed);
+}
+
+int32_t nmhash32x_test ( const void * key, size_t len, uint32_t seed ) {
+  return NMHASH32X (key, (const size_t) len, seed);
+}
diff --git a/test/hash_functions/nmhash.h b/test/hash_functions/nmhash.h
new file mode 100644
index 000000000..85f9cf8a1
--- /dev/null
+++ b/test/hash_functions/nmhash.h
@@ -0,0 +1,833 @@
+/*
+ * verification:
+ * NMHASH32:
+ *   rurban/smhasher: 0x12A30553
+ *   demerphq/smhasher: 0x3D8F6C47
+ * NMHASH32X:
+ *   rurban/smhasher: 0xA8580227
+ *   demerphq/smhasher: 0x40B451B3
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef _nmhash_h_
+#define _nmhash_h_
+
+#define NMH_VERSION 2
+
+#ifdef _MSC_VER
+#  pragma warning(push, 3)
+#endif
+
+#if defined(__cplusplus) && __cplusplus < 201103L
+#  define __STDC_CONSTANT_MACROS 1
+#endif
+
+#include <stdint.h>
+#include <string.h>
+
+#if defined(__GNUC__)
+#  if defined(__AVX2__)
+#    include <immintrin.h>
+#  elif defined(__SSE2__)
+#    include <emmintrin.h>
+#  endif
+#elif defined(_MSC_VER)
+#  include <intrin.h>
+#endif
+
+#ifdef _MSC_VER
+#  pragma warning(pop)
+#endif
+
+#if (defined(__GNUC__) && (__GNUC__ >= 3))  \
+  || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \
+  || defined(__clang__)
+#    define NMH_likely(x) __builtin_expect(x, 1)
+#else
+#    define NMH_likely(x) (x)
+#endif
+
+#if defined(__has_builtin)
+#  if __has_builtin(__builtin_rotateleft32) \
+    && !(defined(__INTEL_COMPILER) && defined(__APPLE__))
+#    define NMH_rotl32 __builtin_rotateleft32 /* clang */
+#  endif
+#endif
+#if !defined(NMH_rotl32)
+#  if defined(_MSC_VER)
+     /* Note: although _rotl exists for minGW (GCC under windows), performance seems poor */
+#    define NMH_rotl32(x,r) _rotl(x,r)
+#  else
+#    define NMH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r))))
+#  endif
+#endif
+
+#if ((defined(sun) || defined(__sun)) && __cplusplus) /* Solaris includes __STDC_VERSION__ with C++. Tested with GCC 5.5 */
+#  define NMH_RESTRICT /* disable */
+#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* >= C99 */
+#  define NMH_RESTRICT   restrict
+#elif defined(__cplusplus) && (defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER))
+#  define NMH_RESTRICT __restrict__
+#elif defined(__cplusplus) && defined(_MSC_VER)
+#  define NMH_RESTRICT __restrict
+#else
+#  define NMH_RESTRICT   /* disable */
+#endif
+
+/* endian macros */
+#ifndef NMHASH_LITTLE_ENDIAN
+#  if defined(_WIN32) || defined(__LITTLE_ENDIAN__) || defined(__x86_64__) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) || defined(__SDCC)
+#    define NMHASH_LITTLE_ENDIAN 1
+#  elif defined(__BIG_ENDIAN__) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#    define NMHASH_LITTLE_ENDIAN 0
+#  else
+#    warning could not determine endianness! Falling back to little endian.
+#    define NMHASH_LITTLE_ENDIAN 1
+#  endif
+#endif
+
+/* vector macros */
+#define NMH_SCALAR 0
+#define NMH_SSE2   1
+#define NMH_AVX2   2
+#define NMH_AVX512 3
+
+#ifndef NMH_VECTOR    /* can be defined on command line */
+#  if defined(__AVX512BW__)
+#    define NMH_VECTOR NMH_AVX512 /* _mm512_mullo_epi16 requires AVX512BW */
+#  elif defined(__AVX2__)
+#    define NMH_VECTOR NMH_AVX2  /* add '-mno-avx256-split-unaligned-load' and '-mno-avx256-split-unaligned-store' for gcc */
+#  elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
+#    define NMH_VECTOR NMH_SSE2
+#  else
+#    define NMH_VECTOR NMH_SCALAR
+#  endif
+#endif
+
+/* align macros */
+#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)   /* C11+ */
+#  include <stdalign.h>
+#  define NMH_ALIGN(n)      alignas(n)
+#elif defined(__GNUC__)
+#  define NMH_ALIGN(n)      __attribute__ ((aligned(n)))
+#elif defined(_MSC_VER)
+#  define NMH_ALIGN(n)      __declspec(align(n))
+#else
+#  define NMH_ALIGN(n)   /* disabled */
+#endif
+
+#if NMH_VECTOR > 0
+#  define NMH_ACC_ALIGN 64
+#elif defined(__BIGGEST_ALIGNMENT__)
+#  define NMH_ACC_ALIGN __BIGGEST_ALIGNMENT__
+#elif defined(__SDCC)
+#  define NMH_ACC_ALIGN 1
+#else
+#  define NMH_ACC_ALIGN 16
+#endif
+
+/* constants */
+
+/* primes from xxh */
+#define NMH_PRIME32_1  UINT32_C(0x9E3779B1)
+#define NMH_PRIME32_2  UINT32_C(0x85EBCA77)
+#define NMH_PRIME32_3  UINT32_C(0xC2B2AE3D)
+#define NMH_PRIME32_4  UINT32_C(0x27D4EB2F)
+
+/*! Pseudorandom secret taken directly from FARSH. */
+NMH_ALIGN(NMH_ACC_ALIGN) static const uint32_t NMH_ACC_INIT[32] = {
+	UINT32_C(0xB8FE6C39), UINT32_C(0x23A44BBE), UINT32_C(0x7C01812C), UINT32_C(0xF721AD1C),
+	UINT32_C(0xDED46DE9), UINT32_C(0x839097DB), UINT32_C(0x7240A4A4), UINT32_C(0xB7B3671F),
+	UINT32_C(0xCB79E64E), UINT32_C(0xCCC0E578), UINT32_C(0x825AD07D), UINT32_C(0xCCFF7221),
+	UINT32_C(0xB8084674), UINT32_C(0xF743248E), UINT32_C(0xE03590E6), UINT32_C(0x813A264C),
+
+	UINT32_C(0x3C2852BB), UINT32_C(0x91C300CB), UINT32_C(0x88D0658B), UINT32_C(0x1B532EA3),
+	UINT32_C(0x71644897), UINT32_C(0xA20DF94E), UINT32_C(0x3819EF46), UINT32_C(0xA9DEACD8),
+	UINT32_C(0xA8FA763F), UINT32_C(0xE39C343F), UINT32_C(0xF9DCBBC7), UINT32_C(0xC70B4F1D),
+	UINT32_C(0x8A51E04B), UINT32_C(0xCDB45931), UINT32_C(0xC89F7EC9), UINT32_C(0xD9787364),
+};
+
+#if defined(_MSC_VER) && _MSC_VER >= 1914
+#  pragma warning(push)
+#  pragma warning(disable: 5045)
+#endif
+#ifdef __SDCC
+#  define const
+#  pragma save
+#  pragma disable_warning 110
+#  pragma disable_warning 126
+#endif
+
+/* read functions */
+static inline
+uint32_t
+NMH_readLE32(const void *const p) /* reads 4 bytes at p and returns them interpreted as a little-endian uint32_t */
+{
+	uint32_t v;
+	memcpy(&v, p, 4); /* byte copy, so p does not have to be aligned for uint32_t */
+#	if (NMHASH_LITTLE_ENDIAN)
+	return v;
+#	elif defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__clang__)
+	return __builtin_bswap32(v);
+#	elif defined(_MSC_VER)
+	return _byteswap_ulong(v);
+#	else
+	return ((v >> 24) & 0xff) | ((v >> 8) & 0xff00) | ((v << 8) & 0xff0000) | ((v << 24) & 0xff000000); /* portable byte swap */
+#	endif
+}
+
+static inline
+uint16_t
+NMH_readLE16(const void *const p) /* reads 2 bytes at p and returns them interpreted as a little-endian uint16_t */
+{
+	uint16_t v;
+	memcpy(&v, p, 2); /* byte copy, so p does not have to be aligned for uint16_t */
+#	if (NMHASH_LITTLE_ENDIAN)
+	return v;
+#	else
+	return (uint16_t)((v << 8) | (v >> 8)); /* swap the two bytes on non-little-endian hosts */
+#	endif
+}
+
+static inline
+uint32_t
+NMHASH32_0to8(uint32_t const x, uint32_t const seed2)
+{
+	/* mixes x into a 32-bit result, XORing in seed2 after the second multiply; base mixer: [-6 -12 776bf593 -19 11 3fb39c65 -15 -9 e9139917 -11 16] = 0.027071104091278835 */
+	const uint32_t m1 = UINT32_C(0x776BF593);
+	const uint32_t m2 = UINT32_C(0x3FB39C65);
+	const uint32_t m3 = UINT32_C(0xE9139917);
+
+#	if NMH_VECTOR == NMH_SCALAR
+	{
+		union { uint32_t u32; uint16_t u16[2]; } vx; /* u16 view: the multiplies below act on each 16-bit half separately, as _mm_mullo_epi16 does in the vector path */
+		vx.u32 = x;
+		vx.u32 ^= (vx.u32 >> 12) ^ (vx.u32 >> 6);
+		vx.u16[0] *= (uint16_t)m1;
+		vx.u16[1] *= (uint16_t)(m1 >> 16);
+		vx.u32 ^= (vx.u32 << 11) ^ ( vx.u32 >> 19);
+		vx.u16[0] *= (uint16_t)m2;
+		vx.u16[1] *= (uint16_t)(m2 >> 16);
+		vx.u32 ^= seed2;
+		vx.u32 ^= (vx.u32 >> 15) ^ ( vx.u32 >> 9);
+		vx.u16[0] *= (uint16_t)m3;
+		vx.u16[1] *= (uint16_t)(m3 >> 16);
+		vx.u32 ^= (vx.u32 << 16) ^ ( vx.u32 >> 11);
+		return vx.u32;
+	}
+#	else /* at least NMH_SSE2 */
+	{
+		__m128i hv = _mm_setr_epi32((int)x, 0, 0, 0); /* only lane 0 carries data; lanes 1..3 are set to zero */
+		const __m128i sv = _mm_setr_epi32((int)seed2, 0, 0, 0);
+		const uint32_t *const result = (const uint32_t*)&hv; /* view of lane 0 of hv, read once mixing is done */
+
+		hv = _mm_xor_si128(_mm_xor_si128(hv, _mm_srli_epi32(hv, 12)), _mm_srli_epi32(hv, 6));
+		hv = _mm_mullo_epi16(hv, _mm_setr_epi32((int)m1, 0, 0, 0));
+		hv = _mm_xor_si128(_mm_xor_si128(hv, _mm_slli_epi32(hv, 11)), _mm_srli_epi32(hv, 19));
+		hv = _mm_mullo_epi16(hv, _mm_setr_epi32((int)m2, 0, 0, 0));
+
+		hv = _mm_xor_si128(hv, sv);
+
+		hv = _mm_xor_si128(_mm_xor_si128(hv, _mm_srli_epi32(hv, 15)), _mm_srli_epi32(hv, 9));
+		hv = _mm_mullo_epi16(hv, _mm_setr_epi32((int)m3, 0, 0, 0));
+		hv = _mm_xor_si128(_mm_xor_si128(hv, _mm_slli_epi32(hv, 16)), _mm_srli_epi32(hv, 11));
+
+		return *result;
+	}
+#	endif
+}
+
+#define __NMH_M1 UINT32_C(0xF0D9649B)
+#define __NMH_M2 UINT32_C(0x29A7935D)
+#define __NMH_M3 UINT32_C(0x55D35831)
+
+NMH_ALIGN(NMH_ACC_ALIGN) static const uint32_t __NMH_M1_V[32] = {
+	__NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1,
+	__NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1,
+	__NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1,
+	__NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1,
+};
+NMH_ALIGN(NMH_ACC_ALIGN) static const uint32_t __NMH_M2_V[32] = {
+	__NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2,
+	__NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2,
+	__NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2,
+	__NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2,
+};
+NMH_ALIGN(NMH_ACC_ALIGN) static const uint32_t __NMH_M3_V[32] = {
+	__NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3,
+	__NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3,
+	__NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3,
+	__NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3,
+};
+
+static inline
+uint32_t
+NMHASH32_9to255(const uint8_t* const NMH_RESTRICT p, size_t const len, uint32_t const seed, int const type)
+{
+	/* shared body for 9..255-byte inputs: type == 0 is the 9..32-byte path, non-zero the 33..255-byte path; base mixer: [f0d9649b  5 -13 29a7935d -9 11 55d35831 -20 -10 ] = 0.93495901789135362 */
+	uint32_t result = 0;
+#	if NMH_VECTOR == NMH_SCALAR
+	{
+		union { uint32_t u32; uint16_t u16[2]; } x[4], y[4];
+		uint32_t const sl = seed + (uint32_t)len;
+		size_t j;
+		x[0].u32 = NMH_PRIME32_1;
+		x[1].u32 = NMH_PRIME32_2;
+		x[2].u32 = NMH_PRIME32_3;
+		x[3].u32 = NMH_PRIME32_4;
+		for (j = 0; j < 4; ++j) y[j].u32 = sl;
+
+		if (type) {
+			/* 33 to 255 bytes */
+			size_t const r = (len - 1) / 32;
+			size_t i;
+			for (i = 0; i < r; ++i) {
+				for (j = 0; j < 4; ++j) x[j].u32 ^= NMH_readLE32(p + i * 32 + j * 4);
+				for (j = 0; j < 4; ++j) y[j].u32 ^= NMH_readLE32(p + i * 32 + j * 4 + 16);
+				for (j = 0; j < 4; ++j) x[j].u32 += y[j].u32;
+
+				for (j = 0; j < 4; ++j) {
+					x[j].u16[0] *= (uint16_t)(__NMH_M1 & 0xFFFF);
+					x[j].u16[1] *= (uint16_t)(__NMH_M1 >> 16);
+				}
+				for (j = 0; j < 4; ++j) x[j].u32 ^= (x[j].u32 << 5) ^ (x[j].u32 >> 13);
+				for (j = 0; j < 4; ++j) {
+					x[j].u16[0] *= (uint16_t)(__NMH_M2 & 0xFFFF);
+					x[j].u16[1] *= (uint16_t)(__NMH_M2 >> 16);
+				}
+
+				for (j = 0; j < 4; ++j) x[j].u32 ^= y[j].u32;
+
+				for (j = 0; j < 4; ++j) x[j].u32 ^= (x[j].u32 << 11) ^ (x[j].u32 >> 9);
+				for (j = 0; j < 4; ++j) {
+					x[j].u16[0] *= (uint16_t)(__NMH_M3 & 0xFFFF);
+					x[j].u16[1] *= (uint16_t)(__NMH_M3 >> 16);
+				}
+				for (j = 0; j < 4; ++j) x[j].u32 ^= (x[j].u32 >> 10) ^ (x[j].u32 >> 20);
+			}
+			for (j = 0; j < 4; ++j) x[j].u32 ^= NMH_readLE32(p + len - 32 + j * 4);
+			for (j = 0; j < 4; ++j) y[j].u32 ^= NMH_readLE32(p + len - 16 + j * 4);
+		} else {
+			/* 9 to 32 bytes */
+			x[0].u32 ^= NMH_readLE32(p);
+			x[1].u32 ^= NMH_readLE32(p + ((len>>4)<<3));
+			x[2].u32 ^= NMH_readLE32(p + len - 8);
+			x[3].u32 ^= NMH_readLE32(p + len - 8 - ((len>>4)<<3));
+			y[0].u32 ^= NMH_readLE32(p + 4);
+			y[1].u32 ^= NMH_readLE32(p + ((len>>4)<<3) + 4);
+			y[2].u32 ^= NMH_readLE32(p + len - 8 + 4);
+			y[3].u32 ^= NMH_readLE32(p + len - 8 - ((len>>4)<<3) + 4);
+		}
+
+		for (j = 0; j < 4; ++j) x[j].u32 += y[j].u32;
+		for (j = 0; j < 4; ++j) y[j].u32 ^= (y[j].u32 << 17) ^ (y[j].u32 >> 6);
+
+		for (j = 0; j < 4; ++j) {
+			x[j].u16[0] *= (uint16_t)(__NMH_M1 & 0xFFFF);
+			x[j].u16[1] *= (uint16_t)(__NMH_M1 >> 16);
+		}
+		for (j = 0; j < 4; ++j) x[j].u32 ^= (x[j].u32 << 5) ^ (x[j].u32 >> 13);
+		for (j = 0; j < 4; ++j) {
+			x[j].u16[0] *= (uint16_t)(__NMH_M2 & 0xFFFF);
+			x[j].u16[1] *= (uint16_t)(__NMH_M2 >> 16);
+		}
+
+		for (j = 0; j < 4; ++j) x[j].u32 ^= y[j].u32;
+
+		for (j = 0; j < 4; ++j) x[j].u32 ^= (x[j].u32 << 11) ^ (x[j].u32 >> 9);
+		for (j = 0; j < 4; ++j) {
+			x[j].u16[0] *= (uint16_t)(__NMH_M3 & 0xFFFF);
+			x[j].u16[1] *= (uint16_t)(__NMH_M3 >> 16);
+		}
+		for (j = 0; j < 4; ++j) x[j].u32 ^= (x[j].u32 >> 10) ^ (x[j].u32 >> 20);
+
+		x[0].u32 ^= NMH_PRIME32_1;
+		x[1].u32 ^= NMH_PRIME32_2;
+		x[2].u32 ^= NMH_PRIME32_3;
+		x[3].u32 ^= NMH_PRIME32_4;
+
+		for (j = 1; j < 4; ++j) x[0].u32 += x[j].u32;
+
+		x[0].u32 ^= sl + (sl >> 5);
+		x[0].u16[0] *= (uint16_t)(__NMH_M3 & 0xFFFF);
+		x[0].u16[1] *= (uint16_t)(__NMH_M3 >> 16);
+		x[0].u32 ^= (x[0].u32 >> 10) ^ (x[0].u32 >> 20);
+
+		result = x[0].u32;
+	}
+#	else /* at least NMH_SSE2 */
+	{
+		__m128i const h0 = _mm_setr_epi32((int)NMH_PRIME32_1, (int)NMH_PRIME32_2, (int)NMH_PRIME32_3, (int)NMH_PRIME32_4);
+		__m128i const sl = _mm_set1_epi32((int)(seed + (uint32_t)len)); /* sum as unsigned, like the scalar path, so it wraps instead of overflowing a signed int */
+		__m128i const m1 = _mm_set1_epi32((int)__NMH_M1);
+		__m128i const m2 = _mm_set1_epi32((int)__NMH_M2);
+		__m128i const m3 = _mm_set1_epi32((int)__NMH_M3);
+		__m128i       x = h0;
+		__m128i       y = sl;
+		const uint32_t *const px = (const uint32_t*)&x;
+
+		if (type) {
+			/* 33 to 255 bytes */
+			size_t const r = (len - 1) / 32;
+			size_t i;
+			for (i = 0; i < r; ++i) {
+				x = _mm_xor_si128(x, _mm_loadu_si128((const __m128i *)(p + i * 32)));
+				y = _mm_xor_si128(y, _mm_loadu_si128((const __m128i *)(p + i * 32 + 16)));
+				x = _mm_add_epi32(x, y);
+				x = _mm_mullo_epi16(x, m1);
+				x = _mm_xor_si128(_mm_xor_si128(x, _mm_slli_epi32(x, 5)), _mm_srli_epi32(x, 13));
+				x = _mm_mullo_epi16(x, m2);
+				x = _mm_xor_si128(x, y);
+				x = _mm_xor_si128(_mm_xor_si128(x, _mm_slli_epi32(x, 11)), _mm_srli_epi32(x, 9));
+				x = _mm_mullo_epi16(x, m3);
+				x = _mm_xor_si128(_mm_xor_si128(x, _mm_srli_epi32(x, 10)), _mm_srli_epi32(x, 20));
+			}
+			x = _mm_xor_si128(x, _mm_loadu_si128((const __m128i *)(p + len - 32)));
+			y = _mm_xor_si128(y, _mm_loadu_si128((const __m128i *)(p + len - 16)));
+		} else {
+			/* 9 to 32 bytes */
+			x = _mm_xor_si128(x, _mm_setr_epi32((int)NMH_readLE32(p), (int)NMH_readLE32(p + ((len>>4)<<3)), (int)NMH_readLE32(p + len - 8), (int)NMH_readLE32(p + len - 8 - ((len>>4)<<3))));
+			y = _mm_xor_si128(y, _mm_setr_epi32((int)NMH_readLE32(p + 4), (int)NMH_readLE32(p + ((len>>4)<<3) + 4), (int)NMH_readLE32(p + len - 8 + 4), (int)NMH_readLE32(p + len - 8 - ((len>>4)<<3) + 4)));
+		}
+
+		x = _mm_add_epi32(x, y);
+
+		y = _mm_xor_si128(_mm_xor_si128(y, _mm_slli_epi32(y, 17)), _mm_srli_epi32(y, 6));
+
+		x = _mm_mullo_epi16(x, m1);
+		x = _mm_xor_si128(_mm_xor_si128(x, _mm_slli_epi32(x, 5)), _mm_srli_epi32(x, 13));
+		x = _mm_mullo_epi16(x, m2);
+		x = _mm_xor_si128(x, y);
+		x = _mm_xor_si128(_mm_xor_si128(x, _mm_slli_epi32(x, 11)), _mm_srli_epi32(x, 9));
+		x = _mm_mullo_epi16(x, m3);
+		x = _mm_xor_si128(_mm_xor_si128(x, _mm_srli_epi32(x, 10)), _mm_srli_epi32(x, 20));
+
+		x = _mm_xor_si128(x, h0);
+		x = _mm_add_epi32(x, _mm_srli_si128(x, 4));
+		x = _mm_add_epi32(x, _mm_srli_si128(x, 8));
+
+		x = _mm_xor_si128(x, _mm_add_epi32(sl, _mm_srli_epi32(sl, 5)));
+		x = _mm_mullo_epi16(x, m3);
+		x = _mm_xor_si128(_mm_xor_si128(x, _mm_srli_epi32(x, 10)), _mm_srli_epi32(x, 20));
+
+		result = *px;
+	}
+#	endif
+	return *&result;
+}
+#define NMHASH32_9to32(p, len, seed) NMHASH32_9to255(p, len, seed, 0)
+#define NMHASH32_33to255(p, len, seed) NMHASH32_9to255(p, len, seed, 1)
+
+#undef __NMH_M1
+#undef __NMH_M2
+#undef __NMH_M3
+
+#if NMH_VECTOR == NMH_SCALAR
+#define NMHASH32_long_round NMHASH32_long_round_scalar
+static inline
+void
+NMHASH32_long_round_scalar(uint32_t *const NMH_RESTRICT accX, uint32_t *const NMH_RESTRICT accY, const uint8_t* const NMH_RESTRICT p)
+{
+	/* Breadth-first evaluation (each step is applied to every lane before the next step starts) hints some compilers to auto-vectorize this code.
+	 * On gcc this is reported to be about 10x faster than the depth-first form, and about 80% of the speed of the manually vectorized code.
+	 */
+	const size_t nbGroups = sizeof(NMH_ACC_INIT) / sizeof(*NMH_ACC_INIT);
+	size_t i;
+
+	for (i = 0; i < nbGroups; ++i) {
+		accX[i] ^= NMH_readLE32(p + i * 4);
+	}
+	for (i = 0; i < nbGroups; ++i) {
+		accY[i] ^= NMH_readLE32(p + i * 4 + sizeof(NMH_ACC_INIT));
+	}
+	for (i = 0; i < nbGroups; ++i) {
+		accX[i] += accY[i];
+	}
+	for (i = 0; i < nbGroups; ++i) {
+		accY[i] ^= accX[i] >> 1;
+	}
+	for (i = 0; i < nbGroups * 2; ++i) {
+		((uint16_t*)accX)[i] *= ((uint16_t*)__NMH_M1_V)[i];
+	}
+	for (i = 0; i < nbGroups; ++i) {
+		accX[i] ^= accX[i] << 5 ^ accX[i] >> 13;
+	}
+	for (i = 0; i < nbGroups * 2; ++i) {
+		((uint16_t*)accX)[i] *= ((uint16_t*)__NMH_M2_V)[i];
+	}
+	for (i = 0; i < nbGroups; ++i) {
+		accX[i] ^= accY[i];
+	}
+	for (i = 0; i < nbGroups; ++i) {
+		accX[i] ^= accX[i] << 11 ^ accX[i] >> 9;
+	}
+	for (i = 0; i < nbGroups * 2; ++i) {
+		((uint16_t*)accX)[i] *= ((uint16_t*)__NMH_M3_V)[i];
+	}
+	for (i = 0; i < nbGroups; ++i) {
+		accX[i] ^= accX[i] >> 10 ^ accX[i] >> 20;
+	}
+}
+#endif
+
+#if NMH_VECTOR == NMH_SSE2
+#  define _NMH_MM_(F) _mm_ ## F
+#  define _NMH_MMW_(F) _mm_ ## F ## 128
+#  define _NMH_MM_T __m128i
+#elif NMH_VECTOR == NMH_AVX2
+#  define _NMH_MM_(F) _mm256_ ## F
+#  define _NMH_MMW_(F) _mm256_ ## F ## 256
+#  define _NMH_MM_T __m256i
+#elif NMH_VECTOR == NMH_AVX512
+#  define _NMH_MM_(F) _mm512_ ## F
+#  define _NMH_MMW_(F) _mm512_ ## F ## 512
+#  define _NMH_MM_T __m512i
+#endif
+
+#if NMH_VECTOR == NMH_SSE2 || NMH_VECTOR == NMH_AVX2 || NMH_VECTOR == NMH_AVX512
+#  define NMHASH32_long_round NMHASH32_long_round_sse
+#  define NMH_VECTOR_NB_GROUP (sizeof(NMH_ACC_INIT) / sizeof(*NMH_ACC_INIT) / (sizeof(_NMH_MM_T) / sizeof(*NMH_ACC_INIT)))
+static inline
+void
+NMHASH32_long_round_sse(uint32_t *const NMH_RESTRICT accX, uint32_t *const NMH_RESTRICT accY, const uint8_t* const NMH_RESTRICT p)
+{
+	const _NMH_MM_T *const NMH_RESTRICT m1    = (const _NMH_MM_T * NMH_RESTRICT)__NMH_M1_V;
+	const _NMH_MM_T *const NMH_RESTRICT m2    = (const _NMH_MM_T * NMH_RESTRICT)__NMH_M2_V;
+	const _NMH_MM_T *const NMH_RESTRICT m3    = (const _NMH_MM_T * NMH_RESTRICT)__NMH_M3_V;
+	      _NMH_MM_T *const              xaccX = (      _NMH_MM_T *             )accX;
+	      _NMH_MM_T *const              xaccY = (      _NMH_MM_T *             )accY;
+	      _NMH_MM_T *const              xp    = (      _NMH_MM_T *             )p;
+	size_t i;
+
+	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
+		xaccX[i] = _NMH_MMW_(xor_si)(xaccX[i], _NMH_MMW_(loadu_si)(xp + i));
+	}
+	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
+		xaccY[i] = _NMH_MMW_(xor_si)(xaccY[i], _NMH_MMW_(loadu_si)(xp + i + NMH_VECTOR_NB_GROUP));
+	}
+	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
+		xaccX[i] = _NMH_MM_(add_epi32)(xaccX[i], xaccY[i]);
+	}
+	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
+		xaccY[i] = _NMH_MMW_(xor_si)(xaccY[i], _NMH_MM_(srli_epi32)(xaccX[i], 1));
+	}
+	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
+		xaccX[i] = _NMH_MM_(mullo_epi16)(xaccX[i], *m1);
+	}
+	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
+		xaccX[i] = _NMH_MMW_(xor_si)(_NMH_MMW_(xor_si)(xaccX[i], _NMH_MM_(slli_epi32)(xaccX[i], 5)), _NMH_MM_(srli_epi32)(xaccX[i], 13));
+	}
+	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
+		xaccX[i] = _NMH_MM_(mullo_epi16)(xaccX[i], *m2);
+	}
+	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
+		xaccX[i] = _NMH_MMW_(xor_si)(xaccX[i], xaccY[i]);
+	}
+	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
+		xaccX[i] = _NMH_MMW_(xor_si)(_NMH_MMW_(xor_si)(xaccX[i], _NMH_MM_(slli_epi32)(xaccX[i], 11)), _NMH_MM_(srli_epi32)(xaccX[i], 9));
+	}
+	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
+		xaccX[i] = _NMH_MM_(mullo_epi16)(xaccX[i], *m3);
+	}
+	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
+		xaccX[i] = _NMH_MMW_(xor_si)(_NMH_MMW_(xor_si)(xaccX[i], _NMH_MM_(srli_epi32)(xaccX[i], 10)), _NMH_MM_(srli_epi32)(xaccX[i], 20));
+	}
+}
+#  undef _NMH_MM_
+#  undef _NMH_MMW_
+#  undef _NMH_MM_T
+#  undef NMH_VECTOR_NB_GROUP
+#endif
+
+static
+uint32_t
+NMHASH32_long(const uint8_t* const NMH_RESTRICT p, size_t const len, uint32_t const seed)
+{
+	NMH_ALIGN(NMH_ACC_ALIGN) uint32_t accX[sizeof(NMH_ACC_INIT)/sizeof(*NMH_ACC_INIT)];
+	NMH_ALIGN(NMH_ACC_ALIGN) uint32_t accY[sizeof(accX)/sizeof(*accX)];
+	size_t const nbRounds = (len - 1) / (sizeof(accX) + sizeof(accY)); /* whole blocks consumed by the loop; (len - 1) always leaves at least one byte for the final round */
+	size_t i;
+	uint32_t sum = 0;
+
+	/* init: accX from the NMH_ACC_INIT table, every accY lane from the seed */
+	for (i = 0; i < sizeof(accX)/sizeof(*accX); ++i) accX[i] = NMH_ACC_INIT[i];
+	for (i = 0; i < sizeof(accY)/sizeof(*accY); ++i) accY[i] = seed;
+
+	for (i = 0; i < nbRounds; ++i) {
+		NMHASH32_long_round(accX, accY, p + i * (sizeof(accX) + sizeof(accY)));
+	}
+	NMHASH32_long_round(accX, accY, p + len - (sizeof(accX) + sizeof(accY))); /* final block is taken from the end of the input and may overlap the previous one; requires len >= one block */
+
+	/* merge acc: XOR each accX lane with its NMH_ACC_INIT constant, then add all lanes into sum */
+	for (i = 0; i < sizeof(accX)/sizeof(*accX); ++i) accX[i] ^= NMH_ACC_INIT[i];
+	for (i = 0; i < sizeof(accX)/sizeof(*accX); ++i) sum += accX[i];
+
+#	if SIZE_MAX > UINT32_C(-1) /* NOTE(review): #if arithmetic is evaluated at (u)intmax_t width, so this test may never be true -- confirm whether the high-bits fold below is ever compiled */
+	sum += (uint32_t)(len >> 32);
+#	endif
+	return sum ^ (uint32_t)len;
+}
+
+static inline
+uint32_t
+NMHASH32_avalanche32(uint32_t const x)
+{ /* Final mixer that NMHASH32 applies to the NMHASH32_long result: xor-shift steps alternating with 16-bit multiplies. */
+	/* [-21 -8 cce5196d 12 -7 464be229 -21 -8] = 3.2267098842182733 */
+	const uint32_t m1 = UINT32_C(0xCCE5196D);
+	const uint32_t m2 = UINT32_C(0x464BE229);
+	union { uint32_t u32; uint16_t u16[2]; } vx; /* gives access to the word as two separate 16-bit halves */
+	vx.u32    = x;
+	vx.u32   ^= (vx.u32 >> 8) ^ (vx.u32 >> 21);
+	vx.u16[0] = (uint16_t)(vx.u16[0] * (uint16_t)m1); /* times the low 16 bits of m1, result truncated to 16 bits */
+	vx.u16[1] = (uint16_t)(vx.u16[1] * (uint16_t)(m1 >> 16)); /* times the high 16 bits of m1 */
+	vx.u32   ^= (vx.u32 << 12) ^ (vx.u32 >> 7);
+	vx.u16[0] = (uint16_t)(vx.u16[0] * (uint16_t)m2); /* same pattern with the low 16 bits of m2 */
+	vx.u16[1] = (uint16_t)(vx.u16[1] * (uint16_t)(m2 >> 16)); /* and the high 16 bits of m2 */
+	return vx.u32 ^ (vx.u32 >> 8) ^ (vx.u32 >> 21);
+}
+
+static inline
+uint32_t
+NMHASH32(const void* const NMH_RESTRICT input, size_t const len, uint32_t seed)
+{ /* Entry point: returns a 32-bit hash of len bytes at input, dispatching on len to a size-specific routine. */
+	const uint8_t *const p = (const uint8_t *)input;
+	if (NMH_likely(len <= 32)) { /* 0..32 bytes */
+		if(NMH_likely(len > 8)) { /* 9..32 bytes */
+			return NMHASH32_9to32(p, len, seed);
+		}
+		if(NMH_likely(len > 4)) { /* 5..8 bytes: two LE32 reads that overlap when len < 8 */
+			uint32_t x = NMH_readLE32(p);
+			uint32_t y = NMH_readLE32(p + len - 4) ^ (NMH_PRIME32_4 + 2 + seed);
+			x += y;
+			x ^= x << (len + 7); /* shift count is 12..15 here because len is 5..8 */
+			return NMHASH32_0to8(x, NMH_rotl32(y, 5));
+		} else { /* 0..4 bytes: load the input into data and perturb seed with a per-length constant */
+			union { uint32_t u32; uint16_t u16[2]; uint8_t u8[4]; } data;
+			switch (len) {
+				case 0: seed += NMH_PRIME32_2;
+					data.u32 = 0;
+					break;
+				case 1: seed += NMH_PRIME32_2 + (UINT32_C(1) << 24) + (1 << 1); /* the added terms encode the length (1) */
+					data.u32 = p[0];
+					break;
+				case 2: seed += NMH_PRIME32_2 + (UINT32_C(2) << 24) + (2 << 1); /* same form with length 2 */
+					data.u32 = NMH_readLE16(p);
+					break;
+				case 3: seed += NMH_PRIME32_2 + (UINT32_C(3) << 24) + (3 << 1); /* same form with length 3 */
+					data.u16[1] = p[2]; /* third byte goes into the second 16-bit half */
+					data.u16[0] = NMH_readLE16(p); /* first two bytes go into the first 16-bit half */
+					break;
+				case 4: seed += NMH_PRIME32_3;
+					data.u32 = NMH_readLE32(p);
+					break;
+				default: return 0; /* not reachable on this branch: len is already known to be <= 4 */
+			}
+			return NMHASH32_0to8(data.u32 + seed, NMH_rotl32(seed, 5));
+		}
+	}
+	if (NMH_likely(len < 256)) { /* 33..255 bytes */
+		return NMHASH32_33to255(p, len, seed);
+	}
+	return NMHASH32_avalanche32(NMHASH32_long(p, len, seed)); /* 256 bytes and up: long-input core plus final mixer */
+}
+
+static inline
+uint32_t
+NMHASH32X_0to4(uint32_t x, uint32_t const seed)
+{ /* Mixes one 32-bit word x with seed; NMHASH32X uses it for inputs of 0..4 bytes. */
+	/* [bdab1ea9 18 a7896a1b 12 83796a2d 16] = 0.092922873297662509 */
+	x ^= seed;
+	x *= UINT32_C(0xBDAB1EA9);
+	x += NMH_rotl32(seed, 31); /* seed is injected a second time, rotated, between the multiplies */
+	x ^= x >> 18;
+	x *= UINT32_C(0xA7896A1B);
+	x ^= x >> 12;
+	x *= UINT32_C(0x83796A2D);
+	x ^= x >> 16;
+	return x;
+}
+
+static inline
+uint32_t
+NMHASH32X_5to8(const uint8_t* const NMH_RESTRICT p, size_t const len, uint32_t const seed)
+{
+	/* - 5 to 8 bytes (NMHASH32X calls this only for 4 < len <= 8); the two LE32 reads overlap when len < 8
+	 * - mixer: [11049a7d 23 bcccdc7b 12 065e9dad 12] = 0.16577596555667246 */
+
+	uint32_t       x = NMH_readLE32(p) ^ NMH_PRIME32_3;
+	uint32_t const y = NMH_readLE32(p + len - 4) ^ seed;
+	x += y;
+	x ^= x >> len; /* length-dependent shift: 5..8 bits */
+	x *= UINT32_C(0x11049A7D);
+	x ^= x >> 23;
+	x *= UINT32_C(0xBCCCDC7B);
+	x ^= NMH_rotl32(y, 3); /* y (last four bytes xor seed) is mixed in again, rotated */
+	x ^= x >> 12;
+	x *= UINT32_C(0x065E9DAD);
+	x ^= x >> 12;
+	return x;
+}
+
+static inline
+uint32_t
+NMHASH32X_9to255(const uint8_t* const NMH_RESTRICT p, size_t const len, uint32_t const seed)
+{
+	/* - at least 9 bytes (NMHASH32X calls this for 9..255); lanes x/y and a/b each take 8 of every 16 loop bytes
+	 * - base mixer: [11049a7d 23 bcccdc7b 12 065e9dad 12] = 0.16577596555667246
+	 * - tail mixer: [16 a52fb2cd 15 551e4d49 16] = 0.17162579707098322
+	 * - the tail handling after the loop is selected by bits 3 and 2 of (len - 1) */
+
+	uint32_t x = NMH_PRIME32_3;
+	uint32_t y = seed;
+	uint32_t a = NMH_PRIME32_4;
+	uint32_t b = seed;
+	size_t i, r = (len - 1) / 16; /* r full 16-byte blocks, leaving len - 16 * r = 1..16 tail bytes */
+
+	for (i = 0; i < r; ++i) {
+		x ^= NMH_readLE32(p + i * 16 + 0); /* x/y lane: bytes 0..7 of the block */
+		y ^= NMH_readLE32(p + i * 16 + 4);
+		x ^= y;
+		x *= UINT32_C(0x11049A7D);
+		x ^= x >> 23;
+		x *= UINT32_C(0xBCCCDC7B);
+		y  = NMH_rotl32(y, 4);
+		x ^= y;
+		x ^= x >> 12;
+		x *= UINT32_C(0x065E9DAD);
+		x ^= x >> 12;
+
+		a ^= NMH_readLE32(p + i * 16 + 8); /* a/b lane: bytes 8..15 of the block, same steps but b rotates by 3 */
+		b ^= NMH_readLE32(p + i * 16 + 12);
+		a ^= b;
+		a *= UINT32_C(0x11049A7D);
+		a ^= a >> 23;
+		a *= UINT32_C(0xBCCCDC7B);
+		b  = NMH_rotl32(b, 3);
+		a ^= b;
+		a ^= a >> 12;
+		a *= UINT32_C(0x065E9DAD);
+		a ^= a >> 12;
+	}
+
+	if (NMH_likely(((uint8_t)len-1) & 8)) { /* 9..16 tail bytes (for len < 256) */
+		if (NMH_likely(((uint8_t)len-1) & 4)) { /* 13..16 tail bytes: a/b take the first 8 tail bytes through the base mixer */
+			a ^= NMH_readLE32(p + r * 16 + 0);
+			b ^= NMH_readLE32(p + r * 16 + 4);
+			a ^= b;
+			a *= UINT32_C(0x11049A7D);
+			a ^= a >> 23;
+			a *= UINT32_C(0xBCCCDC7B);
+			a ^= NMH_rotl32(b, 4);
+			a ^= a >> 12;
+			a *= UINT32_C(0x065E9DAD);
+		} else { /* 9..12 tail bytes: a takes the first 4 tail bytes through the tail mixer */
+			a ^= NMH_readLE32(p + r * 16) + b;
+			a ^= a >> 16;
+			a *= UINT32_C(0xA52FB2CD);
+			a ^= a >> 15;
+			a *= UINT32_C(0x551E4D49);
+		}
+
+		x ^= NMH_readLE32(p + len - 8); /* x/y take the last 8 bytes of the input */
+		y ^= NMH_readLE32(p + len - 4);
+		x ^= y;
+		x *= UINT32_C(0x11049A7D);
+		x ^= x >> 23;
+		x *= UINT32_C(0xBCCCDC7B);
+		x ^= NMH_rotl32(y, 3);
+		x ^= x >> 12;
+		x *= UINT32_C(0x065E9DAD);
+	} else { /* 1..8 tail bytes (for len < 256) */
+		if (NMH_likely(((uint8_t)len-1) & 4)) { /* 5..8 tail bytes: a takes the first 4 tail bytes */
+			a ^= NMH_readLE32(p + r * 16) + b;
+			a ^= a >> 16;
+			a *= UINT32_C(0xA52FB2CD);
+			a ^= a >> 15;
+			a *= UINT32_C(0x551E4D49);
+		}
+		x ^= NMH_readLE32(p + len - 4) + y; /* x takes the last 4 bytes of the input */
+		x ^= x >> 16;
+		x *= UINT32_C(0xA52FB2CD);
+		x ^= x >> 15;
+		x *= UINT32_C(0x551E4D49);
+	}
+
+	x ^= (uint32_t)len;
+	x ^= NMH_rotl32(a, 27); /* rotate one lane to pass Diff test */
+	x ^= x >> 14;
+	x *= UINT32_C(0x141CC535);
+
+	return x;
+}
+
+static inline
+uint32_t
+NMHASH32X_avalanche32(uint32_t x)
+{
+	/* Final mixer that NMHASH32X applies to the NMHASH32_long result; 2-mul mixer from skeeto/hash-prospector:
+	 * [15 d168aaad 15 af723597 15] = 0.15983776156606694
+	 */
+	x ^= x >> 15;
+	x *= UINT32_C(0xD168AAAD);
+	x ^= x >> 15;
+	x *= UINT32_C(0xAF723597);
+	x ^= x >> 15;
+	return x;
+}
+
+/* NMHASH32X entry point: uses 32*32->32 multiplication for short hash (len < 256); longer inputs go through NMHASH32_long */
+static inline
+uint32_t
+NMHASH32X(const void* const NMH_RESTRICT input, size_t const len, uint32_t seed)
+{
+	const uint8_t *const p = (const uint8_t *)input;
+	if (NMH_likely(len <= 8)) { /* 0..8 bytes */
+		if (NMH_likely(len > 4)) { /* 5..8 bytes */
+			return NMHASH32X_5to8(p, len, seed);
+		} else {
+			/* 0-4 bytes: load the input into data and perturb seed with a per-length constant */
+			union { uint32_t u32; uint16_t u16[2]; uint8_t u8[4]; } data;
+			switch (len) {
+				case 0: seed += NMH_PRIME32_2;
+					data.u32 = 0;
+					break;
+				case 1: seed += NMH_PRIME32_2 + (UINT32_C(1) << 24) + (1 << 1); /* the added terms encode the length (1) */
+					data.u32 = p[0];
+					break;
+				case 2: seed += NMH_PRIME32_2 + (UINT32_C(2) << 24) + (2 << 1); /* same form with length 2 */
+					data.u32 = NMH_readLE16(p);
+					break;
+				case 3: seed += NMH_PRIME32_2 + (UINT32_C(3) << 24) + (3 << 1); /* same form with length 3 */
+					data.u16[1] = p[2]; /* third byte goes into the second 16-bit half */
+					data.u16[0] = NMH_readLE16(p); /* first two bytes go into the first 16-bit half */
+					break;
+				case 4: seed += NMH_PRIME32_1;
+					data.u32 = NMH_readLE32(p);
+					break;
+				default: return 0; /* not reachable on this branch: len is already known to be <= 4 */
+			}
+			return NMHASH32X_0to4(data.u32, seed);
+		}
+	}
+	if (NMH_likely(len < 256)) { /* 9..255 bytes */
+		return NMHASH32X_9to255(p, len, seed);
+	}
+	return NMHASH32X_avalanche32(NMHASH32_long(p, len, seed)); /* 256 bytes and up: shared long-input core plus final mixer */
+}
+
+#if defined(_MSC_VER) && _MSC_VER >= 1914
+#  pragma warning(pop)
+#endif
+#ifdef __SDCC
+#  pragma restore
+#  undef const
+#endif
+
+#endif /* _nmhash_h_ */
+
+#ifdef __cplusplus
+}
+#endif

From fd91311f1e4db1dd6ba605493b2ed85219e93246 Mon Sep 17 00:00:00 2001
From: Jeremie Vandenplas <jeremie.vandenplas@gmail.com>
Date: Sun, 7 Jan 2024 22:56:18 +0100
Subject: [PATCH 8/8] Remove nmhash_scalar.[c,h] because they are duplicates of
 nmhash.[c,h]

---
 test/hash_functions/CMakeLists.txt           |   2 +-
 test/hash_functions/generate_hash_arrays.cpp |   1 -
 test/hash_functions/nmhash_scalar.c          |   8 -
 test/hash_functions/nmhash_scalar.h          | 825 -------------------
 4 files changed, 1 insertion(+), 835 deletions(-)
 delete mode 100644 test/hash_functions/nmhash_scalar.c
 delete mode 100644 test/hash_functions/nmhash_scalar.h

diff --git a/test/hash_functions/CMakeLists.txt b/test/hash_functions/CMakeLists.txt
index eacdd727e..c46a09e4c 100755
--- a/test/hash_functions/CMakeLists.txt
+++ b/test/hash_functions/CMakeLists.txt
@@ -7,7 +7,7 @@ ADDTEST(hash_functions)
 target_sources(
   test_hash_functions
   PRIVATE
-  nmhash_scalar.c
+  nmhash.c
   pengyhash.c
   SpookyV2.cpp
   SpookyV2Test.cpp
diff --git a/test/hash_functions/generate_hash_arrays.cpp b/test/hash_functions/generate_hash_arrays.cpp
index 11aa9778d..e1aa4f42c 100644
--- a/test/hash_functions/generate_hash_arrays.cpp
+++ b/test/hash_functions/generate_hash_arrays.cpp
@@ -3,7 +3,6 @@
 
 extern "C" {
   #include "nmhash.h"
-  #include "nmhash_scalar.h"
   #include "pengyhash.h"
   #include "waterhash.h"
   int generate_all_c_hash();
diff --git a/test/hash_functions/nmhash_scalar.c b/test/hash_functions/nmhash_scalar.c
deleted file mode 100644
index 051a65d5f..000000000
--- a/test/hash_functions/nmhash_scalar.c
+++ /dev/null
@@ -1,8 +0,0 @@
-#include "nmhash_scalar.h"
-int32_t nmhash32_test ( const void * key, size_t len, uint32_t seed ) {
-  return NMHASH32 (key, (const size_t) len, seed);
-}
-
-int32_t nmhash32x_test ( const void * key, size_t len, uint32_t seed ) {
-  return NMHASH32X (key, (const size_t) len, seed);
-}
diff --git a/test/hash_functions/nmhash_scalar.h b/test/hash_functions/nmhash_scalar.h
deleted file mode 100644
index a2a1a897e..000000000
--- a/test/hash_functions/nmhash_scalar.h
+++ /dev/null
@@ -1,825 +0,0 @@
-/*
- * verification:
- * NMHASH32:
- *   rurban/smhasher: 0x12A30553
- *   demerphq/smhasher: 0x3D8F6C47
- * NMHASH32X:
- *   rurban/smhasher: 0xA8580227
- *   demerphq/smhasher: 0x40B451B3
- */
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifndef _nmhash_h_
-#define _nmhash_h_
-
-#define NMH_VERSION 2
-
-#ifdef _MSC_VER
-#  pragma warning(push, 3)
-#endif
-
-#if defined(__cplusplus) && __cplusplus < 201103L
-#  define __STDC_CONSTANT_MACROS 1
-#endif
-
-#include <stdint.h>
-#include <string.h>
-
-#if defined(__GNUC__)
-#  if defined(__AVX2__)
-#    include <immintrin.h>
-#  elif defined(__SSE2__)
-#    include <emmintrin.h>
-#  endif
-#elif defined(_MSC_VER)
-#  include <intrin.h>
-#endif
-
-#ifdef _MSC_VER
-#  pragma warning(pop)
-#endif
-
-#if (defined(__GNUC__) && (__GNUC__ >= 3))  \
-  || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \
-  || defined(__clang__)
-#    define NMH_likely(x) __builtin_expect(x, 1)
-#else
-#    define NMH_likely(x) (x)
-#endif
-
-#if defined(__has_builtin)
-#  if __has_builtin(__builtin_rotateleft32) \
-    && !(defined(__INTEL_COMPILER) && defined(__APPLE__))
-#    define NMH_rotl32 __builtin_rotateleft32 /* clang */
-#  endif
-#endif
-#if !defined(NMH_rotl32)
-#  if defined(_MSC_VER)
-     /* Note: although _rotl exists for minGW (GCC under windows), performance seems poor */
-#    define NMH_rotl32(x,r) _rotl(x,r)
-#  else
-#    define NMH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r))))
-#  endif
-#endif
-
-#if ((defined(sun) || defined(__sun)) && __cplusplus) /* Solaris includes __STDC_VERSION__ with C++. Tested with GCC 5.5 */
-#  define NMH_RESTRICT /* disable */
-#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* >= C99 */
-#  define NMH_RESTRICT   restrict
-#elif defined(__cplusplus) && (defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER))
-#  define NMH_RESTRICT __restrict__
-#elif defined(__cplusplus) && defined(_MSC_VER)
-#  define NMH_RESTRICT __restrict
-#else
-#  define NMH_RESTRICT   /* disable */
-#endif
-
-/* endian macros */
-#ifndef NMHASH_LITTLE_ENDIAN
-#  if defined(_WIN32) || defined(__LITTLE_ENDIAN__) || defined(__x86_64__) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) || defined(__SDCC)
-#    define NMHASH_LITTLE_ENDIAN 1
-#  elif defined(__BIG_ENDIAN__) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
-#    define NMHASH_LITTLE_ENDIAN 0
-#  else
-#    warning could not determine endianness! Falling back to little endian.
-#    define NMHASH_LITTLE_ENDIAN 1
-#  endif
-#endif
-
-/* vector macros */
-#define NMH_SCALAR 0
-#define NMH_SSE2   1
-#define NMH_AVX2   2
-#define NMH_AVX512 3
-
-#ifndef NMH_VECTOR    /* can be defined on command line */
-#    define NMH_VECTOR NMH_SCALAR
-#endif
-
-/* align macros */
-#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)   /* C11+ */
-#  include <stdalign.h>
-#  define NMH_ALIGN(n)      alignas(n)
-#elif defined(__GNUC__)
-#  define NMH_ALIGN(n)      __attribute__ ((aligned(n)))
-#elif defined(_MSC_VER)
-#  define NMH_ALIGN(n)      __declspec(align(n))
-#else
-#  define NMH_ALIGN(n)   /* disabled */
-#endif
-
-#if NMH_VECTOR > 0
-#  define NMH_ACC_ALIGN 64
-#elif defined(__BIGGEST_ALIGNMENT__)
-#  define NMH_ACC_ALIGN __BIGGEST_ALIGNMENT__
-#elif defined(__SDCC)
-#  define NMH_ACC_ALIGN 1
-#else
-#  define NMH_ACC_ALIGN 16
-#endif
-
-/* constants */
-
-/* primes from xxh */
-#define NMH_PRIME32_1  UINT32_C(0x9E3779B1)
-#define NMH_PRIME32_2  UINT32_C(0x85EBCA77)
-#define NMH_PRIME32_3  UINT32_C(0xC2B2AE3D)
-#define NMH_PRIME32_4  UINT32_C(0x27D4EB2F)
-
-/*! Pseudorandom secret taken directly from FARSH. */
-NMH_ALIGN(NMH_ACC_ALIGN) static const uint32_t NMH_ACC_INIT[32] = {
-	UINT32_C(0xB8FE6C39), UINT32_C(0x23A44BBE), UINT32_C(0x7C01812C), UINT32_C(0xF721AD1C),
-	UINT32_C(0xDED46DE9), UINT32_C(0x839097DB), UINT32_C(0x7240A4A4), UINT32_C(0xB7B3671F),
-	UINT32_C(0xCB79E64E), UINT32_C(0xCCC0E578), UINT32_C(0x825AD07D), UINT32_C(0xCCFF7221),
-	UINT32_C(0xB8084674), UINT32_C(0xF743248E), UINT32_C(0xE03590E6), UINT32_C(0x813A264C),
-
-	UINT32_C(0x3C2852BB), UINT32_C(0x91C300CB), UINT32_C(0x88D0658B), UINT32_C(0x1B532EA3),
-	UINT32_C(0x71644897), UINT32_C(0xA20DF94E), UINT32_C(0x3819EF46), UINT32_C(0xA9DEACD8),
-	UINT32_C(0xA8FA763F), UINT32_C(0xE39C343F), UINT32_C(0xF9DCBBC7), UINT32_C(0xC70B4F1D),
-	UINT32_C(0x8A51E04B), UINT32_C(0xCDB45931), UINT32_C(0xC89F7EC9), UINT32_C(0xD9787364),
-};
-
-#if defined(_MSC_VER) && _MSC_VER >= 1914
-#  pragma warning(push)
-#  pragma warning(disable: 5045)
-#endif
-#ifdef __SDCC
-#  define const
-#  pragma save
-#  pragma disable_warning 110
-#  pragma disable_warning 126
-#endif
-
-/* read functions */
-static inline
-uint32_t
-NMH_readLE32(const void *const p)
-{
-	uint32_t v;
-	memcpy(&v, p, 4);
-#	if (NMHASH_LITTLE_ENDIAN)
-	return v;
-#	elif defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__clang__)
-	return __builtin_bswap32(v);
-#	elif defined(_MSC_VER)
-	return _byteswap_ulong(v);
-#	else
-	return ((v >> 24) & 0xff) | ((v >> 8) & 0xff00) | ((v << 8) & 0xff0000) | ((v << 24) & 0xff000000);
-#	endif
-}
-
-static inline
-uint16_t
-NMH_readLE16(const void *const p)
-{
-	uint16_t v;
-	memcpy(&v, p, 2);
-#	if (NMHASH_LITTLE_ENDIAN)
-	return v;
-#	else
-	return (uint16_t)((v << 8) | (v >> 8));
-#	endif
-}
-
-static inline
-uint32_t
-NMHASH32_0to8(uint32_t const x, uint32_t const seed2)
-{
-	/* base mixer: [-6 -12 776bf593 -19 11 3fb39c65 -15 -9 e9139917 -11 16] = 0.027071104091278835 */
-	const uint32_t m1 = UINT32_C(0x776BF593);
-	const uint32_t m2 = UINT32_C(0x3FB39C65);
-	const uint32_t m3 = UINT32_C(0xE9139917);
-
-#	if NMH_VECTOR == NMH_SCALAR
-	{
-		union { uint32_t u32; uint16_t u16[2]; } vx;
-		vx.u32 = x;
-		vx.u32 ^= (vx.u32 >> 12) ^ (vx.u32 >> 6);
-		vx.u16[0] *= (uint16_t)m1;
-		vx.u16[1] *= (uint16_t)(m1 >> 16);
-		vx.u32 ^= (vx.u32 << 11) ^ ( vx.u32 >> 19);
-		vx.u16[0] *= (uint16_t)m2;
-		vx.u16[1] *= (uint16_t)(m2 >> 16);
-		vx.u32 ^= seed2;
-		vx.u32 ^= (vx.u32 >> 15) ^ ( vx.u32 >> 9);
-		vx.u16[0] *= (uint16_t)m3;
-		vx.u16[1] *= (uint16_t)(m3 >> 16);
-		vx.u32 ^= (vx.u32 << 16) ^ ( vx.u32 >> 11);
-		return vx.u32;
-	}
-#	else /* at least NMH_SSE2 */
-	{
-		__m128i hv = _mm_setr_epi32((int)x, 0, 0, 0);
-		const __m128i sv = _mm_setr_epi32((int)seed2, 0, 0, 0);
-		const uint32_t *const result = (const uint32_t*)&hv;
-
-		hv = _mm_xor_si128(_mm_xor_si128(hv, _mm_srli_epi32(hv, 12)), _mm_srli_epi32(hv, 6));
-		hv = _mm_mullo_epi16(hv, _mm_setr_epi32((int)m1, 0, 0, 0));
-		hv = _mm_xor_si128(_mm_xor_si128(hv, _mm_slli_epi32(hv, 11)), _mm_srli_epi32(hv, 19));
-		hv = _mm_mullo_epi16(hv, _mm_setr_epi32((int)m2, 0, 0, 0));
-
-		hv = _mm_xor_si128(hv, sv);
-
-		hv = _mm_xor_si128(_mm_xor_si128(hv, _mm_srli_epi32(hv, 15)), _mm_srli_epi32(hv, 9));
-		hv = _mm_mullo_epi16(hv, _mm_setr_epi32((int)m3, 0, 0, 0));
-		hv = _mm_xor_si128(_mm_xor_si128(hv, _mm_slli_epi32(hv, 16)), _mm_srli_epi32(hv, 11));
-
-		return *result;
-	}
-#	endif
-}
-
-#define __NMH_M1 UINT32_C(0xF0D9649B)
-#define __NMH_M2 UINT32_C(0x29A7935D)
-#define __NMH_M3 UINT32_C(0x55D35831)
-
-NMH_ALIGN(NMH_ACC_ALIGN) static const uint32_t __NMH_M1_V[32] = {
-	__NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1,
-	__NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1,
-	__NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1,
-	__NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1,
-};
-NMH_ALIGN(NMH_ACC_ALIGN) static const uint32_t __NMH_M2_V[32] = {
-	__NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2,
-	__NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2,
-	__NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2,
-	__NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2,
-};
-NMH_ALIGN(NMH_ACC_ALIGN) static const uint32_t __NMH_M3_V[32] = {
-	__NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3,
-	__NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3,
-	__NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3,
-	__NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3,
-};
-
-static inline
-uint32_t
-NMHASH32_9to255(const uint8_t* const NMH_RESTRICT p, size_t const len, uint32_t const seed, int const type)
-{
-	/* base mixer: [f0d9649b  5 -13 29a7935d -9 11 55d35831 -20 -10 ] = 0.93495901789135362 */
-	uint32_t result = 0;
-#	if NMH_VECTOR == NMH_SCALAR
-	{
-		union { uint32_t u32; uint16_t u16[2]; } x[4], y[4];
-		uint32_t const sl = seed + (uint32_t)len;
-		size_t j;
-		x[0].u32 = NMH_PRIME32_1;
-		x[1].u32 = NMH_PRIME32_2;
-		x[2].u32 = NMH_PRIME32_3;
-		x[3].u32 = NMH_PRIME32_4;
-		for (j = 0; j < 4; ++j) y[j].u32 = sl;
-
-		if (type) {
-			/* 33 to 255 bytes */
-			size_t const r = (len - 1) / 32;
-			size_t i;
-			for (i = 0; i < r; ++i) {
-				for (j = 0; j < 4; ++j) x[j].u32 ^= NMH_readLE32(p + i * 32 + j * 4);
-				for (j = 0; j < 4; ++j) y[j].u32 ^= NMH_readLE32(p + i * 32 + j * 4 + 16);
-				for (j = 0; j < 4; ++j) x[j].u32 += y[j].u32;
-
-				for (j = 0; j < 4; ++j) {
-					x[j].u16[0] *= (uint16_t)(__NMH_M1 & 0xFFFF);
-					x[j].u16[1] *= (uint16_t)(__NMH_M1 >> 16);
-				}
-				for (j = 0; j < 4; ++j) x[j].u32 ^= (x[j].u32 << 5) ^ (x[j].u32 >> 13);
-				for (j = 0; j < 4; ++j) {
-					x[j].u16[0] *= (uint16_t)(__NMH_M2 & 0xFFFF);
-					x[j].u16[1] *= (uint16_t)(__NMH_M2 >> 16);
-				}
-
-				for (j = 0; j < 4; ++j) x[j].u32 ^= y[j].u32;
-
-				for (j = 0; j < 4; ++j) x[j].u32 ^= (x[j].u32 << 11) ^ (x[j].u32 >> 9);
-				for (j = 0; j < 4; ++j) {
-					x[j].u16[0] *= (uint16_t)(__NMH_M3 & 0xFFFF);
-					x[j].u16[1] *= (uint16_t)(__NMH_M3 >> 16);
-				}
-				for (j = 0; j < 4; ++j) x[j].u32 ^= (x[j].u32 >> 10) ^ (x[j].u32 >> 20);
-			}
-			for (j = 0; j < 4; ++j) x[j].u32 ^= NMH_readLE32(p + len - 32 + j * 4);
-			for (j = 0; j < 4; ++j) y[j].u32 ^= NMH_readLE32(p + len - 16 + j * 4);
-		} else {
-			/* 9 to 32 bytes */
-			x[0].u32 ^= NMH_readLE32(p);
-			x[1].u32 ^= NMH_readLE32(p + ((len>>4)<<3));
-			x[2].u32 ^= NMH_readLE32(p + len - 8);
-			x[3].u32 ^= NMH_readLE32(p + len - 8 - ((len>>4)<<3));
-			y[0].u32 ^= NMH_readLE32(p + 4);
-			y[1].u32 ^= NMH_readLE32(p + ((len>>4)<<3) + 4);
-			y[2].u32 ^= NMH_readLE32(p + len - 8 + 4);
-			y[3].u32 ^= NMH_readLE32(p + len - 8 - ((len>>4)<<3) + 4);
-		}
-
-		for (j = 0; j < 4; ++j) x[j].u32 += y[j].u32;
-		for (j = 0; j < 4; ++j) y[j].u32 ^= (y[j].u32 << 17) ^ (y[j].u32 >> 6);
-
-		for (j = 0; j < 4; ++j) {
-			x[j].u16[0] *= (uint16_t)(__NMH_M1 & 0xFFFF);
-			x[j].u16[1] *= (uint16_t)(__NMH_M1 >> 16);
-		}
-		for (j = 0; j < 4; ++j) x[j].u32 ^= (x[j].u32 << 5) ^ (x[j].u32 >> 13);
-		for (j = 0; j < 4; ++j) {
-			x[j].u16[0] *= (uint16_t)(__NMH_M2 & 0xFFFF);
-			x[j].u16[1] *= (uint16_t)(__NMH_M2 >> 16);
-		}
-
-		for (j = 0; j < 4; ++j) x[j].u32 ^= y[j].u32;
-
-		for (j = 0; j < 4; ++j) x[j].u32 ^= (x[j].u32 << 11) ^ (x[j].u32 >> 9);
-		for (j = 0; j < 4; ++j) {
-			x[j].u16[0] *= (uint16_t)(__NMH_M3 & 0xFFFF);
-			x[j].u16[1] *= (uint16_t)(__NMH_M3 >> 16);
-		}
-		for (j = 0; j < 4; ++j) x[j].u32 ^= (x[j].u32 >> 10) ^ (x[j].u32 >> 20);
-
-		x[0].u32 ^= NMH_PRIME32_1;
-		x[1].u32 ^= NMH_PRIME32_2;
-		x[2].u32 ^= NMH_PRIME32_3;
-		x[3].u32 ^= NMH_PRIME32_4;
-
-		for (j = 1; j < 4; ++j) x[0].u32 += x[j].u32;
-
-		x[0].u32 ^= sl + (sl >> 5);
-		x[0].u16[0] *= (uint16_t)(__NMH_M3 & 0xFFFF);
-		x[0].u16[1] *= (uint16_t)(__NMH_M3 >> 16);
-		x[0].u32 ^= (x[0].u32 >> 10) ^ (x[0].u32 >> 20);
-
-		result = x[0].u32;
-	}
-#	else /* at least NMH_SSE2 */
-	{
-		__m128i const h0 = _mm_setr_epi32((int)NMH_PRIME32_1, (int)NMH_PRIME32_2, (int)NMH_PRIME32_3, (int)NMH_PRIME32_4);
-		__m128i const sl = _mm_set1_epi32((int)seed + (int)len);
-		__m128i const m1 = _mm_set1_epi32((int)__NMH_M1);
-		__m128i const m2 = _mm_set1_epi32((int)__NMH_M2);
-		__m128i const m3 = _mm_set1_epi32((int)__NMH_M3);
-		__m128i       x = h0;
-		__m128i       y = sl;
-		const uint32_t *const px = (const uint32_t*)&x;
-
-		if (type) {
-			/* 32 to 127 bytes */
-			size_t const r = (len - 1) / 32;
-			size_t i;
-			for (i = 0; i < r; ++i) {
-				x = _mm_xor_si128(x, _mm_loadu_si128((const __m128i *)(p + i * 32)));
-				y = _mm_xor_si128(y, _mm_loadu_si128((const __m128i *)(p + i * 32 + 16)));
-				x = _mm_add_epi32(x, y);
-				x = _mm_mullo_epi16(x, m1);
-				x = _mm_xor_si128(_mm_xor_si128(x, _mm_slli_epi32(x, 5)), _mm_srli_epi32(x, 13));
-				x = _mm_mullo_epi16(x, m2);
-				x = _mm_xor_si128(x, y);
-				x = _mm_xor_si128(_mm_xor_si128(x, _mm_slli_epi32(x, 11)), _mm_srli_epi32(x, 9));
-				x = _mm_mullo_epi16(x, m3);
-				x = _mm_xor_si128(_mm_xor_si128(x, _mm_srli_epi32(x, 10)), _mm_srli_epi32(x, 20));
-			}
-			x = _mm_xor_si128(x, _mm_loadu_si128((const __m128i *)(p + len - 32)));
-			y = _mm_xor_si128(y, _mm_loadu_si128((const __m128i *)(p + len - 16)));
-		} else {
-			/* 9 to 32 bytes */
-			x = _mm_xor_si128(x, _mm_setr_epi32((int)NMH_readLE32(p), (int)NMH_readLE32(p + ((len>>4)<<3)), (int)NMH_readLE32(p + len - 8), (int)NMH_readLE32(p + len - 8 - ((len>>4)<<3))));
-			y = _mm_xor_si128(y, _mm_setr_epi32((int)NMH_readLE32(p + 4), (int)NMH_readLE32(p + ((len>>4)<<3) + 4), (int)NMH_readLE32(p + len - 8 + 4), (int)NMH_readLE32(p + len - 8 - ((len>>4)<<3) + 4)));
-		}
-
-		x = _mm_add_epi32(x, y);
-
-		y = _mm_xor_si128(_mm_xor_si128(y, _mm_slli_epi32(y, 17)), _mm_srli_epi32(y, 6));
-
-		x = _mm_mullo_epi16(x, m1);
-		x = _mm_xor_si128(_mm_xor_si128(x, _mm_slli_epi32(x, 5)), _mm_srli_epi32(x, 13));
-		x = _mm_mullo_epi16(x, m2);
-		x = _mm_xor_si128(x, y);
-		x = _mm_xor_si128(_mm_xor_si128(x, _mm_slli_epi32(x, 11)), _mm_srli_epi32(x, 9));
-		x = _mm_mullo_epi16(x, m3);
-		x = _mm_xor_si128(_mm_xor_si128(x, _mm_srli_epi32(x, 10)), _mm_srli_epi32(x, 20));
-
-		x = _mm_xor_si128(x, h0);
-		x = _mm_add_epi32(x, _mm_srli_si128(x, 4));
-		x = _mm_add_epi32(x, _mm_srli_si128(x, 8));
-
-		x = _mm_xor_si128(x, _mm_add_epi32(sl, _mm_srli_epi32(sl, 5)));
-		x = _mm_mullo_epi16(x, m3);
-		x = _mm_xor_si128(_mm_xor_si128(x, _mm_srli_epi32(x, 10)), _mm_srli_epi32(x, 20));
-
-		result = *px;
-	}
-#	endif
-	return *&result;
-}
-#define NMHASH32_9to32(p, len, seed) NMHASH32_9to255(p, len, seed, 0)
-#define NMHASH32_33to255(p, len, seed) NMHASH32_9to255(p, len, seed, 1)
-
-#undef __NMH_M1
-#undef __NMH_M2
-#undef __NMH_M3
-
-#if NMH_VECTOR == NMH_SCALAR
-#define NMHASH32_long_round NMHASH32_long_round_scalar
-static inline
-void
-NMHASH32_long_round_scalar(uint32_t *const NMH_RESTRICT accX, uint32_t *const NMH_RESTRICT accY, const uint8_t* const NMH_RESTRICT p)
-{
-	/* breadth first calculation will hint some compiler to auto vectorize the code
-	 * on gcc, the performance becomes 10x than the depth first, and about 80% of the manually vectorized code
-	 */
-	const size_t nbGroups = sizeof(NMH_ACC_INIT) / sizeof(*NMH_ACC_INIT);
-	size_t i;
-
-	for (i = 0; i < nbGroups; ++i) {
-		accX[i] ^= NMH_readLE32(p + i * 4);
-	}
-	for (i = 0; i < nbGroups; ++i) {
-		accY[i] ^= NMH_readLE32(p + i * 4 + sizeof(NMH_ACC_INIT));
-	}
-	for (i = 0; i < nbGroups; ++i) {
-		accX[i] += accY[i];
-	}
-	for (i = 0; i < nbGroups; ++i) {
-		accY[i] ^= accX[i] >> 1;
-	}
-	for (i = 0; i < nbGroups * 2; ++i) {
-		((uint16_t*)accX)[i] *= ((uint16_t*)__NMH_M1_V)[i];
-	}
-	for (i = 0; i < nbGroups; ++i) {
-		accX[i] ^= accX[i] << 5 ^ accX[i] >> 13;
-	}
-	for (i = 0; i < nbGroups * 2; ++i) {
-		((uint16_t*)accX)[i] *= ((uint16_t*)__NMH_M2_V)[i];
-	}
-	for (i = 0; i < nbGroups; ++i) {
-		accX[i] ^= accY[i];
-	}
-	for (i = 0; i < nbGroups; ++i) {
-		accX[i] ^= accX[i] << 11 ^ accX[i] >> 9;
-	}
-	for (i = 0; i < nbGroups * 2; ++i) {
-		((uint16_t*)accX)[i] *= ((uint16_t*)__NMH_M3_V)[i];
-	}
-	for (i = 0; i < nbGroups; ++i) {
-		accX[i] ^= accX[i] >> 10 ^ accX[i] >> 20;
-	}
-}
-#endif
-
-#if NMH_VECTOR == NMH_SSE2
-#  define _NMH_MM_(F) _mm_ ## F
-#  define _NMH_MMW_(F) _mm_ ## F ## 128
-#  define _NMH_MM_T __m128i
-#elif NMH_VECTOR == NMH_AVX2
-#  define _NMH_MM_(F) _mm256_ ## F
-#  define _NMH_MMW_(F) _mm256_ ## F ## 256
-#  define _NMH_MM_T __m256i
-#elif NMH_VECTOR == NMH_AVX512
-#  define _NMH_MM_(F) _mm512_ ## F
-#  define _NMH_MMW_(F) _mm512_ ## F ## 512
-#  define _NMH_MM_T __m512i
-#endif
-
-#if NMH_VECTOR == NMH_SSE2 || NMH_VECTOR == NMH_AVX2 || NMH_VECTOR == NMH_AVX512
-#  define NMHASH32_long_round NMHASH32_long_round_sse
-#  define NMH_VECTOR_NB_GROUP (sizeof(NMH_ACC_INIT) / sizeof(*NMH_ACC_INIT) / (sizeof(_NMH_MM_T) / sizeof(*NMH_ACC_INIT)))
-static inline
-void
-NMHASH32_long_round_sse(uint32_t *const NMH_RESTRICT accX, uint32_t *const NMH_RESTRICT accY, const uint8_t* const NMH_RESTRICT p)
-{
-	const _NMH_MM_T *const NMH_RESTRICT m1    = (const _NMH_MM_T * NMH_RESTRICT)__NMH_M1_V;
-	const _NMH_MM_T *const NMH_RESTRICT m2    = (const _NMH_MM_T * NMH_RESTRICT)__NMH_M2_V;
-	const _NMH_MM_T *const NMH_RESTRICT m3    = (const _NMH_MM_T * NMH_RESTRICT)__NMH_M3_V;
-	      _NMH_MM_T *const              xaccX = (      _NMH_MM_T *             )accX;
-	      _NMH_MM_T *const              xaccY = (      _NMH_MM_T *             )accY;
-	      _NMH_MM_T *const              xp    = (      _NMH_MM_T *             )p;
-	size_t i;
-
-	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
-		xaccX[i] = _NMH_MMW_(xor_si)(xaccX[i], _NMH_MMW_(loadu_si)(xp + i));
-	}
-	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
-		xaccY[i] = _NMH_MMW_(xor_si)(xaccY[i], _NMH_MMW_(loadu_si)(xp + i + NMH_VECTOR_NB_GROUP));
-	}
-	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
-		xaccX[i] = _NMH_MM_(add_epi32)(xaccX[i], xaccY[i]);
-	}
-	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
-		xaccY[i] = _NMH_MMW_(xor_si)(xaccY[i], _NMH_MM_(srli_epi32)(xaccX[i], 1));
-	}
-	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
-		xaccX[i] = _NMH_MM_(mullo_epi16)(xaccX[i], *m1);
-	}
-	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
-		xaccX[i] = _NMH_MMW_(xor_si)(_NMH_MMW_(xor_si)(xaccX[i], _NMH_MM_(slli_epi32)(xaccX[i], 5)), _NMH_MM_(srli_epi32)(xaccX[i], 13));
-	}
-	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
-		xaccX[i] = _NMH_MM_(mullo_epi16)(xaccX[i], *m2);
-	}
-	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
-		xaccX[i] = _NMH_MMW_(xor_si)(xaccX[i], xaccY[i]);
-	}
-	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
-		xaccX[i] = _NMH_MMW_(xor_si)(_NMH_MMW_(xor_si)(xaccX[i], _NMH_MM_(slli_epi32)(xaccX[i], 11)), _NMH_MM_(srli_epi32)(xaccX[i], 9));
-	}
-	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
-		xaccX[i] = _NMH_MM_(mullo_epi16)(xaccX[i], *m3);
-	}
-	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
-		xaccX[i] = _NMH_MMW_(xor_si)(_NMH_MMW_(xor_si)(xaccX[i], _NMH_MM_(srli_epi32)(xaccX[i], 10)), _NMH_MM_(srli_epi32)(xaccX[i], 20));
-	}
-}
-#  undef _NMH_MM_
-#  undef _NMH_MMW_
-#  undef _NMH_MM_T
-#  undef NMH_VECTOR_NB_GROUP
-#endif
-
-static
-uint32_t
-NMHASH32_long(const uint8_t* const NMH_RESTRICT p, size_t const len, uint32_t const seed)
-{
-	NMH_ALIGN(NMH_ACC_ALIGN) uint32_t accX[sizeof(NMH_ACC_INIT)/sizeof(*NMH_ACC_INIT)];
-	NMH_ALIGN(NMH_ACC_ALIGN) uint32_t accY[sizeof(accX)/sizeof(*accX)];
-	size_t const nbRounds = (len - 1) / (sizeof(accX) + sizeof(accY));
-	size_t i;
-	uint32_t sum = 0;
-
-	/* init */
-	for (i = 0; i < sizeof(accX)/sizeof(*accX); ++i) accX[i] = NMH_ACC_INIT[i];
-	for (i = 0; i < sizeof(accY)/sizeof(*accY); ++i) accY[i] = seed;
-
-	for (i = 0; i < nbRounds; ++i) {
-		NMHASH32_long_round(accX, accY, p + i * (sizeof(accX) + sizeof(accY)));
-	}
-	NMHASH32_long_round(accX, accY, p + len - (sizeof(accX) + sizeof(accY)));
-
-	/* merge acc */
-	for (i = 0; i < sizeof(accX)/sizeof(*accX); ++i) accX[i] ^= NMH_ACC_INIT[i];
-	for (i = 0; i < sizeof(accX)/sizeof(*accX); ++i) sum += accX[i];
-
-#	if SIZE_MAX > UINT32_C(-1)
-	sum += (uint32_t)(len >> 32);
-#	endif
-	return sum ^ (uint32_t)len;
-}
-
-static inline
-uint32_t
-NMHASH32_avalanche32(uint32_t const x)
-{
-	/* [-21 -8 cce5196d 12 -7 464be229 -21 -8] = 3.2267098842182733 */
-	const uint32_t m1 = UINT32_C(0xCCE5196D);
-	const uint32_t m2 = UINT32_C(0x464BE229);
-	union { uint32_t u32; uint16_t u16[2]; } vx;
-	vx.u32    = x;
-	vx.u32   ^= (vx.u32 >> 8) ^ (vx.u32 >> 21);
-	vx.u16[0] = (uint16_t)(vx.u16[0] * (uint16_t)m1);
-	vx.u16[1] = (uint16_t)(vx.u16[1] * (uint16_t)(m1 >> 16));
-	vx.u32   ^= (vx.u32 << 12) ^ (vx.u32 >> 7);
-	vx.u16[0] = (uint16_t)(vx.u16[0] * (uint16_t)m2);
-	vx.u16[1] = (uint16_t)(vx.u16[1] * (uint16_t)(m2 >> 16));
-	return vx.u32 ^ (vx.u32 >> 8) ^ (vx.u32 >> 21);
-}
-
-static inline
-uint32_t
-NMHASH32(const void* const NMH_RESTRICT input, size_t const len, uint32_t seed)
-{
-	const uint8_t *const p = (const uint8_t *)input;
-	if (NMH_likely(len <= 32)) {
-		if(NMH_likely(len > 8)) {
-			return NMHASH32_9to32(p, len, seed);
-		}
-		if(NMH_likely(len > 4)) {
-			uint32_t x = NMH_readLE32(p);
-			uint32_t y = NMH_readLE32(p + len - 4) ^ (NMH_PRIME32_4 + 2 + seed);
-			x += y;
-			x ^= x << (len + 7);
-			return NMHASH32_0to8(x, NMH_rotl32(y, 5));
-		} else {
-			union { uint32_t u32; uint16_t u16[2]; uint8_t u8[4]; } data;
-			switch (len) {
-				case 0: seed += NMH_PRIME32_2;
-					data.u32 = 0;
-					break;
-				case 1: seed += NMH_PRIME32_2 + (UINT32_C(1) << 24) + (1 << 1);
-					data.u32 = p[0];
-					break;
-				case 2: seed += NMH_PRIME32_2 + (UINT32_C(2) << 24) + (2 << 1);
-					data.u32 = NMH_readLE16(p);
-					break;
-				case 3: seed += NMH_PRIME32_2 + (UINT32_C(3) << 24) + (3 << 1);
-					data.u16[1] = p[2];
-					data.u16[0] = NMH_readLE16(p);
-					break;
-				case 4: seed += NMH_PRIME32_3;
-					data.u32 = NMH_readLE32(p);
-					break;
-				default: return 0;
-			}
-			return NMHASH32_0to8(data.u32 + seed, NMH_rotl32(seed, 5));
-		}
-	}
-	if (NMH_likely(len < 256)) {
-		return NMHASH32_33to255(p, len, seed);
-	}
-	return NMHASH32_avalanche32(NMHASH32_long(p, len, seed));
-}
-
-static inline
-uint32_t
-NMHASH32X_0to4(uint32_t x, uint32_t const seed)
-{
-	/* [bdab1ea9 18 a7896a1b 12 83796a2d 16] = 0.092922873297662509 */
-	x ^= seed;
-	x *= UINT32_C(0xBDAB1EA9);
-	x += NMH_rotl32(seed, 31);
-	x ^= x >> 18;
-	x *= UINT32_C(0xA7896A1B);
-	x ^= x >> 12;
-	x *= UINT32_C(0x83796A2D);
-	x ^= x >> 16;
-	return x;
-}
-
-static inline
-uint32_t
-NMHASH32X_5to8(const uint8_t* const NMH_RESTRICT p, size_t const len, uint32_t const seed)
-{
-	/* - 5 to 9 bytes
-	 * - mixer: [11049a7d 23 bcccdc7b 12 065e9dad 12] = 0.16577596555667246 */
-
-	uint32_t       x = NMH_readLE32(p) ^ NMH_PRIME32_3;
-	uint32_t const y = NMH_readLE32(p + len - 4) ^ seed;
-	x += y;
-	x ^= x >> len;
-	x *= UINT32_C(0x11049A7D);
-	x ^= x >> 23;
-	x *= UINT32_C(0xBCCCDC7B);
-	x ^= NMH_rotl32(y, 3);
-	x ^= x >> 12;
-	x *= UINT32_C(0x065E9DAD);
-	x ^= x >> 12;
-	return x;
-}
-
-static inline
-uint32_t
-NMHASH32X_9to255(const uint8_t* const NMH_RESTRICT p, size_t const len, uint32_t const seed)
-{
-	/* - at least 9 bytes
-	 * - base mixer: [11049a7d 23 bcccdc7b 12 065e9dad 12] = 0.16577596555667246
-	 * - tail mixer: [16 a52fb2cd 15 551e4d49 16] = 0.17162579707098322
-	 */
-
-	uint32_t x = NMH_PRIME32_3;
-	uint32_t y = seed;
-	uint32_t a = NMH_PRIME32_4;
-	uint32_t b = seed;
-	size_t i, r = (len - 1) / 16;
-
-	for (i = 0; i < r; ++i) {
-		x ^= NMH_readLE32(p + i * 16 + 0);
-		y ^= NMH_readLE32(p + i * 16 + 4);
-		x ^= y;
-		x *= UINT32_C(0x11049A7D);
-		x ^= x >> 23;
-		x *= UINT32_C(0xBCCCDC7B);
-		y  = NMH_rotl32(y, 4);
-		x ^= y;
-		x ^= x >> 12;
-		x *= UINT32_C(0x065E9DAD);
-		x ^= x >> 12;
-
-		a ^= NMH_readLE32(p + i * 16 + 8);
-		b ^= NMH_readLE32(p + i * 16 + 12);
-		a ^= b;
-		a *= UINT32_C(0x11049A7D);
-		a ^= a >> 23;
-		a *= UINT32_C(0xBCCCDC7B);
-		b  = NMH_rotl32(b, 3);
-		a ^= b;
-		a ^= a >> 12;
-		a *= UINT32_C(0x065E9DAD);
-		a ^= a >> 12;
-	}
-
-	if (NMH_likely(((uint8_t)len-1) & 8)) {
-		if (NMH_likely(((uint8_t)len-1) & 4)) {
-			a ^= NMH_readLE32(p + r * 16 + 0);
-			b ^= NMH_readLE32(p + r * 16 + 4);
-			a ^= b;
-			a *= UINT32_C(0x11049A7D);
-			a ^= a >> 23;
-			a *= UINT32_C(0xBCCCDC7B);
-			a ^= NMH_rotl32(b, 4);
-			a ^= a >> 12;
-			a *= UINT32_C(0x065E9DAD);
-		} else {
-			a ^= NMH_readLE32(p + r * 16) + b;
-			a ^= a >> 16;
-			a *= UINT32_C(0xA52FB2CD);
-			a ^= a >> 15;
-			a *= UINT32_C(0x551E4D49);
-		}
-
-		x ^= NMH_readLE32(p + len - 8);
-		y ^= NMH_readLE32(p + len - 4);
-		x ^= y;
-		x *= UINT32_C(0x11049A7D);
-		x ^= x >> 23;
-		x *= UINT32_C(0xBCCCDC7B);
-		x ^= NMH_rotl32(y, 3);
-		x ^= x >> 12;
-		x *= UINT32_C(0x065E9DAD);
-	} else {
-		if (NMH_likely(((uint8_t)len-1) & 4)) {
-			a ^= NMH_readLE32(p + r * 16) + b;
-			a ^= a >> 16;
-			a *= UINT32_C(0xA52FB2CD);
-			a ^= a >> 15;
-			a *= UINT32_C(0x551E4D49);
-		}
-		x ^= NMH_readLE32(p + len - 4) + y;
-		x ^= x >> 16;
-		x *= UINT32_C(0xA52FB2CD);
-		x ^= x >> 15;
-		x *= UINT32_C(0x551E4D49);
-	}
-
-	x ^= (uint32_t)len;
-	x ^= NMH_rotl32(a, 27); /* rotate one lane to pass Diff test */
-	x ^= x >> 14;
-	x *= UINT32_C(0x141CC535);
-
-	return x;
-}
-
-static inline
-uint32_t
-NMHASH32X_avalanche32(uint32_t x)
-{
-	/* mixer with 2 mul from skeeto/hash-prospector:
-	 * [15 d168aaad 15 af723597 15] = 0.15983776156606694
-	 */
-	x ^= x >> 15;
-	x *= UINT32_C(0xD168AAAD);
-	x ^= x >> 15;
-	x *= UINT32_C(0xAF723597);
-	x ^= x >> 15;
-	return x;
-}
-
-/* use 32*32->32 multiplication for short hash */
-static inline
-uint32_t
-NMHASH32X(const void* const NMH_RESTRICT input, size_t const len, uint32_t seed)
-{
-	const uint8_t *const p = (const uint8_t *)input;
-	if (NMH_likely(len <= 8)) {
-		if (NMH_likely(len > 4)) {
-			return NMHASH32X_5to8(p, len, seed);
-		} else {
-			/* 0-4 bytes */
-			union { uint32_t u32; uint16_t u16[2]; uint8_t u8[4]; } data;
-			switch (len) {
-				case 0: seed += NMH_PRIME32_2;
-					data.u32 = 0;
-					break;
-				case 1: seed += NMH_PRIME32_2 + (UINT32_C(1) << 24) + (1 << 1);
-					data.u32 = p[0];
-					break;
-				case 2: seed += NMH_PRIME32_2 + (UINT32_C(2) << 24) + (2 << 1);
-					data.u32 = NMH_readLE16(p);
-					break;
-				case 3: seed += NMH_PRIME32_2 + (UINT32_C(3) << 24) + (3 << 1);
-					data.u16[1] = p[2];
-					data.u16[0] = NMH_readLE16(p);
-					break;
-				case 4: seed += NMH_PRIME32_1;
-					data.u32 = NMH_readLE32(p);
-					break;
-				default: return 0;
-			}
-			return NMHASH32X_0to4(data.u32, seed);
-		}
-	}
-	if (NMH_likely(len < 256)) {
-		return NMHASH32X_9to255(p, len, seed);
-	}
-	return NMHASH32X_avalanche32(NMHASH32_long(p, len, seed));
-}
-
-#if defined(_MSC_VER) && _MSC_VER >= 1914
-#  pragma warning(pop)
-#endif
-#ifdef __SDCC
-#  pragma restore
-#  undef const
-#endif
-
-#endif /* _nmhash_h_ */
-
-#ifdef __cplusplus
-}
-#endif