Skip to content

Commit

Permalink
Update
Browse files Browse the repository at this point in the history
* Added preliminary Android NDK support.
* Use -flto=jobserver for GCC 12 and later.
* [i386] Fixed GCC warnings related to SFINAE.
* [ARM] Disabled AffineTransformSparseInput in ARMv7.
* [ARM] Fixed ClippedReLU in ARMv7.
* [ARM] Fixed wrong variable types.
  • Loading branch information
MinetaS committed Sep 1, 2024
1 parent 4b4f7b1 commit 71dda9b
Show file tree
Hide file tree
Showing 7 changed files with 112 additions and 56 deletions.
51 changes: 43 additions & 8 deletions src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ else
INSTALL_EXE := stockfish
endif

KERNEL := $(shell uname -s)

strip:
-@test -f stockfish && strip stockfish
-@test -f stockfish.exe && strip stockfish.exe
Expand All @@ -47,7 +49,7 @@ install:
strip $(INSTALL_PATH)/$(INSTALL_EXE)

clean: objclean profileclean
@rm -f .depend *~ core
@rm -f .depend

objclean:
@rm -f stockfish stockfish.exe $(OBJS) *.o.tmp
Expand Down Expand Up @@ -108,15 +110,16 @@ ifeq ($(call test-compiler-macro,__GNUC__),1)
else ifeq ($(call test-compiler-macro,__clang__),1)
$(info Using LLVM C/C++ Compiler (Clang)) $(info )
COMP := clang
CLANG_MAJOR := $(call get-compiler-macro,__clang_major__)
LLVM_PROFDATA := $(shell command -v llvm-profdata-$(CLANG_MAJOR) 2> /dev/null || \
CLANG_VERSION := $(call get-compiler-macro,__clang_major__)
LLVM_PROFDATA := $(shell command -v llvm-profdata-$(CLANG_VERSION) 2> /dev/null || \
command -v llvm-profdata 2> /dev/null)
profile_make = clang-profile-make
profile_use = clang-profile-use
export LLVM_PROFDATA
else
$(info Using GNU C/C++ Compiler) $(info )
COMP := gcc
GCC_VERSION := $(call get-compiler-macro,__GNUC__)
profile_make = gcc-profile-make
profile_use = gcc-profile-use
endif
Expand Down Expand Up @@ -171,12 +174,20 @@ ifeq ($(optimize),yes)
SF_CXXFLAGS += -O3

ifeq ($(COMP),gcc)
SF_CXXFLAGS += -funroll-loops -flto=jobserver -flto-partition=one
SF_LDFLAGS += -flto=jobserver -flto-partition=one
SF_CXXFLAGS += -funroll-loops
ifeq ($(shell expr $(GCC_VERSION) \< 12),1)
SF_CXXFLAGS += -flto
SF_LDFLAGS += -flto
else
SF_CXXFLAGS += -flto=jobserver
SF_LDFLAGS += -flto=jobserver
endif
SF_CXXFLAGS += -flto-partition=one
SF_LDFLAGS += -flto-partition=one
else ifeq ($(COMP),clang)
SF_CXXFLAGS += -funroll-loops -flto=full
SF_LDFLAGS += -flto=full
ifeq ($(shell expr $(CLANG_MAJOR) \< 16),1)
ifeq ($(shell expr $(CLANG_VERSION) \< 16),1)
SF_CXXFLAGS += -fexperimental-new-pass-manager
endif
else ifeq ($(COMP),icx)
Expand Down Expand Up @@ -220,6 +231,8 @@ endif
endif # CXX_REQUIRED_RULES

### 3. Add flags from architecture-specific Makefile
### Note that this section is not enclosed in the CXX_REQUIRED_RULES block;
### Users must still be able to see the help text even when no compiler is available.

ifeq ($(ARCH),)
override ARCH := native
Expand All @@ -244,6 +257,28 @@ export ARCH

SF_CXXFLAGS += -DARCH=$(ARCH)

### 4. Extra flags for cross-compilation
### Information about the target architecture is needed here.

ifneq ($(filter $(MAKECMDGOALS),$(CXX_REQUIRED_RULES)),)

# Android NDK
ifneq ($(filter $(ARCH_FAMILY),i386 arm),)
ifeq ($(call test-compiler-macro,__ANDROID__),1)
SF_CXXFLAGS += -stdlib=libc++ -fPIE
SF_LDFLAGS += -static-libstdc++ -pie
SF_LIBS += m atomic
endif
endif

# Link atomic library if not i386/arm family
ifneq ($(ARCH_NATIVE),y)
ifeq ($(filter $(ARCH_FAMILY),i386 arm),)
SF_LIBS += atomic
endif
endif

endif # CXX_REQUIRED_RULES
endif # MAKELEVEL=0

SF_CXXFLAGS := $(strip $(SF_CXXFLAGS) $(CXXFLAGS))
Expand Down Expand Up @@ -320,7 +355,7 @@ profile-build: config-sanity objclean profileclean
@echo "Step 1/4. Building instrumented executable ..."
@$(MAKE) --no-print-directory CXXFLAGS="" LDFLAGS="" $(profile_make)
@printf "\n%s\n" "Step 2/4. Running benchmark for pgo-build ..."
@./$(EXE) bench > PGOBENCH.out 2>&1
@$(EMULATE) ./$(EXE) bench > PGOBENCH.out 2>&1
@tail -n 4 PGOBENCH.out
@printf "\n%s\n" "Step 3/4. Building optimized executable ..."
@$(MAKE) --no-print-directory objclean
Expand Down Expand Up @@ -358,7 +393,7 @@ clang-profile-make:
CXXFLAGS="-fprofile-generate" LDFLAGS="-fprofile-generate" all

clang-profile-use:
@$(XCRUN) $(LLVM_PROFDATA) merge -output=stockfish.profdata *.profraw
$(XCRUN) $(LLVM_PROFDATA) merge -output=stockfish.profdata *.profraw
@$(MAKE) --no-print-directory \
CXXFLAGS="-fprofile-use=stockfish.profdata" \
LDFLAGS="-fprofile-use=stockfish.profdata" \
Expand Down
13 changes: 7 additions & 6 deletions src/arch/arm/arch.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,15 +58,16 @@ inline void vdotq_s32_v(int32x4_t& acc, int8x16_t in, int8x16_t col) {
#ifdef __ARM_FEATURE_DOTPROD
acc = vdotq_s32(acc, in, col);
#elif __ARM_ARCH >= 8
int16x8_t product0 = vmull_s8(vget_low_s8(a), vget_low_s8(b));
int16x8_t product1 = vmull_high_s8(a, b);
int16x8_t product0 = vmull_s8(vget_low_s8(in), vget_low_s8(col));
int16x8_t product1 = vmull_high_s8(in, col);
int16x8_t sum = vpaddq_s16(product0, product1);
acc = vpadalq_s16(acc, sum);
#else
int16x8_t product0 = vmull_s8(vget_low_s8(a), vget_low_s8(b));
int16x8_t product1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
int16x8_t sum = vpaddq_s16(product0, product1);
acc = vpadalq_s16(acc, sum);
int16x8_t product0 = vmull_s8(vget_low_s8(in), vget_low_s8(col));
int16x8_t product1 = vmull_s8(vget_high_s8(in), vget_high_s8(col));
int16x8_t sum =
vcombine_s16(vqmovn_s32(vpaddlq_s16(product0)), vqmovn_s32(vpaddlq_s16(product1)));
acc = vpadalq_s16(acc, sum);
#endif
}

Expand Down
4 changes: 2 additions & 2 deletions src/arch/arm/nnue/layers/affine_transform.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,8 +85,8 @@ void AffineTransform<InDims, OutDims>::propagate(const InputType* input, OutputT

for (IndexType j = 0; j < NumChunks; ++j)
{
const int32x4_t in = reinterpret_cast<const int32x4_t*>(input)[j];
const int32x4_t row = reinterpret_cast<const int32x4_t*>(weights)[j];
const int8x16_t in = reinterpret_cast<const int8x16_t*>(input)[j];
const int8x16_t row = reinterpret_cast<const int8x16_t*>(weights)[j];
vdotq_s32_v(sum, in, row);
}

Expand Down
21 changes: 11 additions & 10 deletions src/arch/arm/nnue/layers/affine_transform_sparse_input.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@

namespace Stockfish::Eval::NNUE::Layers {

#ifdef __ARM_NEON
#if __ARM_ARCH >= 8

alignas(CacheLineSize) static const std::array<std::array<std::uint16_t, 8>, 256> lookupIndices =
[] {
Expand Down Expand Up @@ -70,13 +70,13 @@ class AffineTransformSparseInput: public AffineTransform<InDims, OutDims> {
const int32x4_t chunk0 = in[i * 2];
const int32x4_t chunk1 = in[i * 2 + 1];

static const int32x4_t movemask = [] {
const std::int32_t n[4] = {1, 2, 4, 8};
return vld1q_s32(n);
static const uint32x4_t movemask = [] {
const std::uint32_t n[4] = {1, 2, 4, 8};
return vld1q_u32(n);
}();

const std::uint32_t nnz = vaddvq_u32(vandq_s32(vtstq_s32(chunk0, chunk0), movemask))
| vaddvq_u32(vandq_s32(vtstq_s32(chunk1, chunk1), movemask))
const std::uint32_t nnz = vaddvq_u32(vandq_u32(vtstq_s32(chunk0, chunk0), movemask))
| vaddvq_u32(vandq_u32(vtstq_s32(chunk1, chunk1), movemask))
<< 4;
const uint16x8_t offsets = *reinterpret_cast<const uint16x8_t*>(&lookupIndices[nnz]);
*reinterpret_cast<uint16x8_t*>(indices + count) = vaddq_u16(base, offsets);
Expand Down Expand Up @@ -105,9 +105,10 @@ void AffineTransformSparseInput<InDims, OutDims>::propagate(const InputType* inp

for (IndexType j = 0; j < count; ++j)
{
const auto i = nnz[j];
const int32x4_t in = vdupq_n_s32(reinterpret_cast<const std::int32_t*>(input)[i]);
const auto col = reinterpret_cast<const int32x4_t*>(&weights[i * OutputDimensions * 4]);
const auto i = nnz[j];
const int8x16_t in =
vreinterpretq_s8_s32(vdupq_n_s32(reinterpret_cast<const std::int32_t*>(input)[i]));
const auto col = reinterpret_cast<const int8x16_t*>(&weights[i * OutputDimensions * 4]);
for (std::size_t k = 0; k < array_size(acc); ++k)
vdotq_s32_v(acc[k], in, col[k]);
}
Expand All @@ -121,7 +122,7 @@ void AffineTransformSparseInput<InDims, OutDims>::propagate(const InputType* inp
template<IndexType InDims, IndexType OutDims>
using AffineTransformSparseInput = AffineTransform<InDims, OutDims>;

#endif // __ARM_NEON
#endif // __ARM_ARCH >= 8

} // namespace Stockfish::Eval::NNUE::Layers

Expand Down
2 changes: 1 addition & 1 deletion src/arch/arm/nnue/layers/clipped_relu.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ void ClippedReLU<InDims>::propagate(const InputType* input, OutputType* output)

words.tuple.val[0] = vqshrn_n_s32(in[i * 2 + 0], WeightScaleBits);
words.tuple.val[1] = vqshrn_n_s32(in[i * 2 + 1], WeightScaleBits);
out[i] = vmax_s8(vqmovn_s16(words), vdup_n_s8(0));
out[i] = vmax_s8(vqmovn_s16(words.all), vdup_n_s8(0));
#endif
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/arch/arm/nnue/nnue_feature_transformer.h
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,6 @@ void FeatureTransformer<TransformedFeatureDimensions, accPtr>::convert_accumulat

} // namespace Stockfish::Eval::NNUE

#endif // !__SSE2__
#endif // !__ARM_NEON

#endif // ARM_NNUE_FEATURE_TRANSFORMER_H_INCLUDED
Loading

0 comments on commit 71dda9b

Please sign in to comment.