diff --git a/src/Makefile b/src/Makefile
index 3e714988973..87e93cb581b 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -37,6 +37,8 @@ else
     INSTALL_EXE := stockfish
 endif
 
+KERNEL := $(shell uname -s)
+
 strip:
	-@test -f stockfish && strip stockfish
	-@test -f stockfish.exe && strip stockfish.exe
@@ -47,7 +49,7 @@ install: strip
	    $(INSTALL_PATH)/$(INSTALL_EXE)
 
 clean: objclean profileclean
-	@rm -f .depend *~ core
+	@rm -f .depend
 
 objclean:
	@rm -f stockfish stockfish.exe $(OBJS) *.o.tmp
@@ -108,8 +110,8 @@ ifeq ($(call test-compiler-macro,__GNUC__),1)
 else ifeq ($(call test-compiler-macro,__clang__),1)
     $(info Using LLVM C/C++ Compiler (Clang)) $(info )
     COMP := clang
-    CLANG_MAJOR := $(call get-compiler-macro,__clang_major__)
-    LLVM_PROFDATA := $(shell command -v llvm-profdata-$(CLANG_MAJOR) 2> /dev/null || \
+    CLANG_VERSION := $(call get-compiler-macro,__clang_major__)
+    LLVM_PROFDATA := $(shell command -v llvm-profdata-$(CLANG_VERSION) 2> /dev/null || \
                          command -v llvm-profdata 2> /dev/null)
     profile_make = clang-profile-make
     profile_use = clang-profile-use
@@ -117,6 +119,7 @@ ifeq ($(call test-compiler-macro,__GNUC__),1)
 else
     $(info Using GNU C/C++ Compiler) $(info )
     COMP := gcc
+    GCC_VERSION := $(call get-compiler-macro,__GNUC__)
     profile_make = gcc-profile-make
     profile_use = gcc-profile-use
 endif
@@ -171,12 +174,20 @@ ifeq ($(optimize),yes)
     SF_CXXFLAGS += -O3
 
     ifeq ($(COMP),gcc)
-        SF_CXXFLAGS += -funroll-loops -flto=jobserver -flto-partition=one
-        SF_LDFLAGS += -flto=jobserver -flto-partition=one
+        SF_CXXFLAGS += -funroll-loops
+        ifeq ($(shell expr $(GCC_VERSION) \< 12),1)
+            SF_CXXFLAGS += -flto
+            SF_LDFLAGS += -flto
+        else
+            SF_CXXFLAGS += -flto=jobserver
+            SF_LDFLAGS += -flto=jobserver
+        endif
+        SF_CXXFLAGS += -flto-partition=one
+        SF_LDFLAGS += -flto-partition=one
     else ifeq ($(COMP),clang)
         SF_CXXFLAGS += -funroll-loops -flto=full
         SF_LDFLAGS += -flto=full
-        ifeq ($(shell expr $(CLANG_MAJOR) \< 16),1)
+        ifeq ($(shell expr $(CLANG_VERSION) \< 16),1)
            SF_CXXFLAGS += -fexperimental-new-pass-manager
         endif
     else ifeq ($(COMP),icx)
@@ -220,6 +231,8 @@ endif
 endif # CXX_REQUIRED_RULES
 
 ### 3. Add flags from architecture-specific Makefile
+### Note that this section is not enclosed in the CXX_REQUIRED_RULES block;
+### users should still be able to see the help text even when there is no compiler.
 
 ifeq ($(ARCH),)
     override ARCH := native
@@ -244,6 +257,28 @@ export ARCH
 SF_CXXFLAGS += -DARCH=$(ARCH)
 
+### 4. Extra flags for cross-compilation
+### Information about the target architecture is needed here.
+
+ifneq ($(filter $(MAKECMDGOALS),$(CXX_REQUIRED_RULES)),)
+
+# Android NDK
+ifneq ($(filter $(ARCH_FAMILY),i386 arm),)
+    ifeq ($(call test-compiler-macro,__ANDROID__),1)
+        SF_CXXFLAGS += -stdlib=libc++ -fPIE
+        SF_LDFLAGS += -static-libstdc++ -pie
+        SF_LIBS += m atomic
+    endif
+endif
+
+# Link atomic library if not i386/arm family
+ifneq ($(ARCH_NATIVE),y)
+    ifeq ($(filter $(ARCH_FAMILY),i386 arm),)
+        SF_LIBS += atomic
+    endif
+endif
+
+endif # CXX_REQUIRED_RULES
 
 endif # MAKELEVEL=0
 
 SF_CXXFLAGS := $(strip $(SF_CXXFLAGS) $(CXXFLAGS))
@@ -320,7 +355,7 @@ profile-build: config-sanity objclean profileclean
	@echo "Step 1/4. Building instrumented executable ..."
	@$(MAKE) --no-print-directory CXXFLAGS="" LDFLAGS="" $(profile_make)
	@printf "\n%s\n" "Step 2/4. Running benchmark for pgo-build ..."
-	@./$(EXE) bench > PGOBENCH.out 2>&1
+	@$(EMULATE) ./$(EXE) bench > PGOBENCH.out 2>&1
	@tail -n 4 PGOBENCH.out
	@printf "\n%s\n" "Step 3/4. Building optimized executable ..."
	@$(MAKE) --no-print-directory objclean
@@ -358,7 +393,7 @@ clang-profile-make:
	    CXXFLAGS="-fprofile-generate" LDFLAGS="-fprofile-generate" all
 
 clang-profile-use:
-	@$(XCRUN) $(LLVM_PROFDATA) merge -output=stockfish.profdata *.profraw
+	$(XCRUN) $(LLVM_PROFDATA) merge -output=stockfish.profdata *.profraw
	@$(MAKE) --no-print-directory \
	    CXXFLAGS="-fprofile-use=stockfish.profdata" \
	    LDFLAGS="-fprofile-use=stockfish.profdata" \
diff --git a/src/arch/arm/arch.h b/src/arch/arm/arch.h
index 74d91e1a4b3..30f50451aaf 100644
--- a/src/arch/arm/arch.h
+++ b/src/arch/arm/arch.h
@@ -58,15 +58,16 @@ inline void vdotq_s32_v(int32x4_t& acc, int8x16_t in, int8x16_t col) {
 #ifdef __ARM_FEATURE_DOTPROD
     acc = vdotq_s32(acc, in, col);
 #elif __ARM_ARCH >= 8
-    int16x8_t product0 = vmull_s8(vget_low_s8(a), vget_low_s8(b));
-    int16x8_t product1 = vmull_high_s8(a, b);
+    int16x8_t product0 = vmull_s8(vget_low_s8(in), vget_low_s8(col));
+    int16x8_t product1 = vmull_high_s8(in, col);
     int16x8_t sum      = vpaddq_s16(product0, product1);
     acc                = vpadalq_s16(acc, sum);
 #else
-    int16x8_t product0 = vmull_s8(vget_low_s8(a), vget_low_s8(b));
-    int16x8_t product1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
-    int16x8_t sum      = vpaddq_s16(product0, product1);
-    acc                = vpadalq_s16(acc, sum);
+    int16x8_t product0 = vmull_s8(vget_low_s8(in), vget_low_s8(col));
+    int16x8_t product1 = vmull_s8(vget_high_s8(in), vget_high_s8(col));
+    int16x8_t sum =
+      vcombine_s16(vqmovn_s32(vpaddlq_s16(product0)), vqmovn_s32(vpaddlq_s16(product1)));
+    acc = vpadalq_s16(acc, sum);
 #endif
 }
 
diff --git a/src/arch/arm/nnue/layers/affine_transform.h b/src/arch/arm/nnue/layers/affine_transform.h
index 44cafb3c3db..3fb69bce2d8 100644
--- a/src/arch/arm/nnue/layers/affine_transform.h
+++ b/src/arch/arm/nnue/layers/affine_transform.h
@@ -85,8 +85,8 @@ void AffineTransform<InDims, OutDims>::propagate(const InputType* input, OutputT
 
         for (IndexType j = 0; j < NumChunks; ++j)
         {
-            const int32x4_t in  = reinterpret_cast<const int32x4_t*>(input)[j];
-            const int32x4_t row = reinterpret_cast<const int32x4_t*>(weights)[j];
+            const int8x16_t in  = reinterpret_cast<const int8x16_t*>(input)[j];
+            const int8x16_t row = reinterpret_cast<const int8x16_t*>(weights)[j];
             vdotq_s32_v(sum, in, row);
         }
 
diff --git a/src/arch/arm/nnue/layers/affine_transform_sparse_input.h b/src/arch/arm/nnue/layers/affine_transform_sparse_input.h
index af21f3036de..546002c256f 100644
--- a/src/arch/arm/nnue/layers/affine_transform_sparse_input.h
+++ b/src/arch/arm/nnue/layers/affine_transform_sparse_input.h
@@ -33,7 +33,7 @@
 
 namespace Stockfish::Eval::NNUE::Layers {
 
-#ifdef __ARM_NEON
+#if __ARM_ARCH >= 8
 
 alignas(CacheLineSize) static const std::array<std::array<std::uint16_t, 8>, 256> lookupIndices =
   [] {
@@ -70,13 +70,13 @@ class AffineTransformSparseInput: public AffineTransform<InDims, OutDims> {
            const int32x4_t chunk0 = in[i * 2];
            const int32x4_t chunk1 = in[i * 2 + 1];
 
-            static const int32x4_t movemask = [] {
-                const std::int32_t n[4] = {1, 2, 4, 8};
-                return vld1q_s32(n);
+            static const uint32x4_t movemask = [] {
+                const std::uint32_t n[4] = {1, 2, 4, 8};
+                return vld1q_u32(n);
             }();
 
-            const std::uint32_t nnz = vaddvq_u32(vandq_s32(vtstq_s32(chunk0, chunk0), movemask))
-                                    | vaddvq_u32(vandq_s32(vtstq_s32(chunk1, chunk1), movemask))
+            const std::uint32_t nnz = vaddvq_u32(vandq_u32(vtstq_s32(chunk0, chunk0), movemask))
+                                    | vaddvq_u32(vandq_u32(vtstq_s32(chunk1, chunk1), movemask))
                                         << 4;
             const uint16x8_t offsets = *reinterpret_cast<const uint16x8_t*>(&lookupIndices[nnz]);
             *reinterpret_cast<uint16x8_t*>(indices + count) = vaddq_u16(base, offsets);
@@ -105,9 +106,10 @@ void AffineTransformSparseInput<InDims, OutDims>::propagate(const InputType* inp
 
        for (IndexType j = 0; j < count; ++j)
        {
-            const auto      i   = nnz[j];
-            const int32x4_t in  = vdupq_n_s32(reinterpret_cast<const std::int32_t*>(input)[i]);
-            const auto      col = reinterpret_cast<const int8x16_t*>(&weights[i * OutputDimensions * 4]);
+            const auto i = nnz[j];
+            const int8x16_t in =
+              vreinterpretq_s8_s32(vdupq_n_s32(reinterpret_cast<const std::int32_t*>(input)[i]));
+            const auto col = reinterpret_cast<const int8x16_t*>(&weights[i * OutputDimensions * 4]);
            for (std::size_t k = 0; k < array_size(acc); ++k)
                vdotq_s32_v(acc[k], in, col[k]);
        }
@@ -121,7 +122,7 @@ void AffineTransformSparseInput<InDims, OutDims>::propagate(const InputType* inp
 template<IndexType InDims, IndexType OutDims>
 using AffineTransformSparseInput = AffineTransform<InDims, OutDims>;
 
-#endif // __ARM_NEON
+#endif // __ARM_ARCH >= 8
 
 } // namespace Stockfish::Eval::NNUE::Layers
 
diff --git a/src/arch/arm/nnue/layers/clipped_relu.h b/src/arch/arm/nnue/layers/clipped_relu.h
index ef01b684948..7e644ec5060 100644
--- a/src/arch/arm/nnue/layers/clipped_relu.h
+++ b/src/arch/arm/nnue/layers/clipped_relu.h
@@ -60,7 +60,7 @@ void ClippedReLU<InDims>::propagate(const InputType* input, OutputType* output)
            words.tuple.val[0] = vqshrn_n_s32(in[i * 2 + 0], WeightScaleBits);
            words.tuple.val[1] = vqshrn_n_s32(in[i * 2 + 1], WeightScaleBits);
 
-            out[i] = vmax_s8(vqmovn_s16(words), vdup_n_s8(0));
+            out[i] = vmax_s8(vqmovn_s16(words.all), vdup_n_s8(0));
 #endif
        }
    }
diff --git a/src/arch/arm/nnue/nnue_feature_transformer.h b/src/arch/arm/nnue/nnue_feature_transformer.h
index df3ec0071fc..0427507d224 100644
--- a/src/arch/arm/nnue/nnue_feature_transformer.h
+++ b/src/arch/arm/nnue/nnue_feature_transformer.h
@@ -346,6 +346,6 @@ void FeatureTransformer::convert_accumulat
 
 } // namespace Stockfish::Eval::NNUE
 
-#endif // !__SSE2__
+#endif // !__ARM_NEON
 
 #endif // ARM_NNUE_FEATURE_TRANSFORMER_H_INCLUDED
diff --git a/src/arch/i386/arch.h b/src/arch/i386/arch.h
index aa98c2f21a6..1ef79088240 100644
--- a/src/arch/i386/arch.h
+++ b/src/arch/i386/arch.h
@@ -193,8 +193,27 @@ inline int tzcnt(T n) {
 #endif
 }
 
+#ifdef __SSE2__
+
+template<typename T>
+struct is_valid_vector {
+    static constexpr bool value = sizeof(T) == 16
+#ifdef __AVX2__
+                               || sizeof(T) == 32
+#endif
+#ifdef __AVX512F__
+                               || sizeof(T) == 64
+#endif
+      ;
+};
+
+template<typename T>
+inline constexpr bool is_valid_vector_v = is_valid_vector<T>::value;
+
 template<typename T>
 inline T _mm_setzero_v() {
+    static_assert(is_valid_vector_v<T>);
+
 #ifdef __AVX512F__
    if constexpr (sizeof(T) == 64)
        return _mm512_setzero_si512();
@@ -205,14 +224,14 @@ inline T _mm_setzero_v() {
        return _mm256_setzero_si256();
 #endif
 
-#ifdef __SSE2__
    if constexpr (sizeof(T) == 16)
        return _mm_setzero_si128();
-#endif
 }
 
 template<typename T>
 inline T _mm_set1_epi16_v(std::uint16_t n) {
+    static_assert(is_valid_vector_v<T>);
+
 #ifdef __AVX512F__
    if constexpr (sizeof(T) == 64)
        return _mm512_set1_epi16(n);
@@ -223,14 +242,14 @@ inline T _mm_set1_epi16_v(std::uint16_t n) {
        return _mm256_set1_epi16(n);
 #endif
 
-#ifdef __SSE2__
    if constexpr (sizeof(T) == 16)
        return _mm_set1_epi16(n);
-#endif
 }
 
 template<typename T>
 inline T _mm_set1_epi32_v(std::uint32_t n) {
+    static_assert(is_valid_vector_v<T>);
+
 #ifdef __AVX512F__
    if constexpr (sizeof(T) == 64)
        return _mm512_set1_epi32(n);
@@ -241,14 +260,14 @@ inline T _mm_set1_epi32_v(std::uint32_t n) {
        return _mm256_set1_epi32(n);
 #endif
 
-#ifdef __SSE2__
    if constexpr (sizeof(T) == 16)
        return _mm_set1_epi32(n);
-#endif
 }
 
 template<typename T>
 inline T _mm_packus_epi16_v(T a, T b) {
+    static_assert(is_valid_vector_v<T>);
+
 #ifdef __AVX512F__
    if constexpr (sizeof(T) == 64)
 #ifdef __AVX512BW__
@@ -267,14 +286,14 @@ inline T _mm_packus_epi16_v(T a, T b) {
 #endif
 #endif
 
-#ifdef __SSE2__
    if constexpr (sizeof(T) == 16)
        return _mm_packus_epi16(a, b);
-#endif
 }
 
 template<typename T>
 inline T _mm_add_epi16_v(T a, T b) {
+    static_assert(is_valid_vector_v<T>);
+
 #ifdef __AVX512F__
    if constexpr (sizeof(T) == 64)
 #ifdef __AVX512BW__
@@ -293,14 +312,14 @@ inline T _mm_add_epi16_v(T a, T b) {
 #endif
 #endif
 
-#ifdef __SSE2__
    if constexpr (sizeof(T) == 16)
        return _mm_add_epi16(a, b);
-#endif
 }
 
 template<typename T>
 inline T _mm_add_epi32_v(T a, T b) {
+    static_assert(is_valid_vector_v<T>);
+
 #ifdef __AVX512F__
    if constexpr (sizeof(T) == 64)
        return _mm512_add_epi32(a, b);
@@ -315,14 +334,14 @@ inline T _mm_add_epi32_v(T a, T b) {
 #endif
 #endif
 
-#ifdef __SSE2__
    if constexpr (sizeof(T) == 16)
        return _mm_add_epi32(a, b);
-#endif
 }
 
 template<typename T>
 inline T _mm_sub_epi16_v(T a, T b) {
+    static_assert(is_valid_vector_v<T>);
+
 #ifdef __AVX512F__
    if constexpr (sizeof(T) == 64)
 #ifdef __AVX512BW__
@@ -341,14 +360,14 @@ inline T _mm_sub_epi16_v(T a, T b) {
 #endif
 #endif
 
-#ifdef __SSE2__
    if constexpr (sizeof(T) == 16)
        return _mm_sub_epi16(a, b);
-#endif
 }
 
 template<typename T>
 inline T _mm_sub_epi32_v(T a, T b) {
+    static_assert(is_valid_vector_v<T>);
+
 #ifdef __AVX512F__
    if constexpr (sizeof(T) == 64)
        return _mm512_sub_epi32(a, b);
@@ -363,14 +382,14 @@ inline T _mm_sub_epi32_v(T a, T b) {
 #endif
 #endif
 
-#ifdef __SSE2__
    if constexpr (sizeof(T) == 16)
        return _mm_sub_epi32(a, b);
-#endif
 }
 
 template<typename T>
 inline T _mm_mulhi_epi16_v(T a, T b) {
+    static_assert(is_valid_vector_v<T>);
+
 #ifdef __AVX512F__
    if constexpr (sizeof(T) == 64)
 #ifdef __AVX512BW__
@@ -389,14 +408,14 @@ inline T _mm_mulhi_epi16_v(T a, T b) {
 #endif
 #endif
 
-#ifdef __SSE2__
    if constexpr (sizeof(T) == 16)
        return _mm_mulhi_epi16(a, b);
-#endif
 }
 
 template<typename T>
 inline T _mm_slli_epi16_v(T a, int n) {
+    static_assert(is_valid_vector_v<T>);
+
 #ifdef __AVX512F__
    if constexpr (sizeof(T) == 64)
 #ifdef __AVX512BW__
@@ -415,14 +434,14 @@ inline T _mm_slli_epi16_v(T a, int n) {
 #endif
 #endif
 
-#ifdef __SSE2__
    if constexpr (sizeof(T) == 16)
        return _mm_slli_epi16(a, n);
-#endif
 }
 
 template<typename T>
 inline T _mm_max_epi16_v(T a, T b) {
+    static_assert(is_valid_vector_v<T>);
+
 #ifdef __AVX512F__
    if constexpr (sizeof(T) == 64)
 #ifdef __AVX512BW__
@@ -441,14 +460,14 @@ inline T _mm_max_epi16_v(T a, T b) {
 #endif
 #endif
 
-#ifdef __SSE2__
    if constexpr (sizeof(T) == 16)
        return _mm_max_epi16(a, b);
-#endif
 }
 
 template<typename T>
 inline T _mm_min_epi16_v(T a, T b) {
+    static_assert(is_valid_vector_v<T>);
+
 #ifdef __AVX512F__
    if constexpr (sizeof(T) == 64)
 #ifdef __AVX512BW__
@@ -467,14 +486,14 @@ inline T _mm_min_epi16_v(T a, T b) {
 #endif
 #endif
 
-#ifdef __SSE2__
    if constexpr (sizeof(T) == 16)
        return _mm_min_epi16(a, b);
-#endif
 }
 
 template<typename T>
 inline std::int32_t _mm_reduce_add_epi32_v(T a) {
+    static_assert(is_valid_vector_v<T>);
+
 #ifdef __AVX512F__
    if constexpr (sizeof(T) == 64)
        return _mm512_reduce_add_epi32(a);
@@ -490,14 +509,12 @@ inline std::int32_t _mm_reduce_add_epi32_v(T a) {
    }
 #endif
 
-#ifdef __SSE2__
    if constexpr (sizeof(T) == 16)
    {
        a = _mm_add_epi32(a, _mm_shuffle_epi32(a, 0x4E));  // _MM_PERM_BADC
        a = _mm_add_epi32(a, _mm_shuffle_epi32(a, 0xB1));  // _MM_PERM_CDAB
        return _mm_cvtsi128_si32(a);
    }
-#endif
 }
 
 // Non-VNNI implementation of dpbusd works even with type saturation, only
@@ -505,6 +522,8 @@ inline std::int32_t _mm_reduce_add_epi32_v(T a) {
 // AffineTransform layer. Do not use this without VNNI for general purpose.
 template<typename T>
 inline void _mm_dpbusd_epi32_v(T& acc, T a, T b) {
+    static_assert(is_valid_vector_v<T>);
+
 #ifdef __AVX512F__
    if constexpr (sizeof(T) == 64)
    {
@@ -542,7 +561,6 @@ inline void _mm_dpbusd_epi32_v(T& acc, T a, T b) {
    }
 #endif
 
-#ifdef __SSE2__
    if constexpr (sizeof(T) == 16)
    {
 #if (defined(__AVX512VL__) && defined(__AVX512VNNI__)) || defined(__AVXVNNI__)
@@ -568,9 +586,10 @@ inline void _mm_dpbusd_epi32_v(T& acc, T a, T b) {
 #endif
    }
-#endif
 }
 
+#endif // __SSE2__
+
 } // namespace Stockfish
 
 #endif // I386_ARCH_H_INCLUDED
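Note (not part of the patch): the three code paths in vdotq_s32_v() above (DOTPROD, the ARMv8 fallback, and the new ARMv7 fallback) are all meant to compute the same accumulate of 4-element dot products. A minimal scalar reference for spot-checking them is sketched below; the helper name dot4_reference and the array typedefs are illustrative only, and intermediate 16-bit overflow (which the NNUE quantization scheme is assumed to avoid) is ignored.

// Scalar reference for vdotq_s32_v(): every 32-bit lane of `acc` accumulates
// the dot product of the corresponding four signed 8-bit elements of `in`
// and `col`. Illustration only; not part of the patch.
#include <array>
#include <cstdint>

using Vec16i8 = std::array<std::int8_t, 16>;
using Vec4i32 = std::array<std::int32_t, 4>;

inline void dot4_reference(Vec4i32& acc, const Vec16i8& in, const Vec16i8& col) {
    for (int lane = 0; lane < 4; ++lane)
        for (int k = 0; k < 4; ++k)
            acc[lane] += std::int32_t(in[lane * 4 + k]) * std::int32_t(col[lane * 4 + k]);
}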
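Note (not part of the patch): the movemask/vtstq_s32 sequence changed in affine_transform_sparse_input.h builds an 8-bit mask of the non-zero 32-bit lanes in two input chunks and uses it to index lookupIndices, whose rows hold the positions of the set bits. A scalar sketch of that bookkeeping follows, under the hypothetical helper name append_nnz_indices; the real SIMD code stores the whole precomputed 8-entry row at once and advances count by the popcount of the mask.

// Scalar sketch of the non-zero-index extraction used by the sparse affine
// transform: bit i of `mask` is set when the i-th 32-bit chunk is non-zero,
// and the positions of the set bits, offset by `base`, are appended to
// `indices`. Illustration only; names are hypothetical.
#include <cstddef>
#include <cstdint>

inline std::size_t append_nnz_indices(const std::int32_t* chunks,  // 8 values
                                      std::uint16_t       base,
                                      std::uint16_t*      indices,
                                      std::size_t         count) {
    std::uint32_t mask = 0;
    for (int i = 0; i < 8; ++i)
        if (chunks[i] != 0)
            mask |= 1u << i;  // corresponds to the vtstq_s32/vandq_u32/vaddvq_u32 step

    for (int i = 0; i < 8; ++i)
        if (mask & (1u << i))
            indices[count++] = std::uint16_t(base + i);

    return count;
}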