diff --git a/src/Makefile b/src/Makefile
index 3e714988973..87e93cb581b 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -37,6 +37,8 @@ else
     INSTALL_EXE := stockfish
 endif
 
+KERNEL := $(shell uname -s)
+
 strip:
	-@test -f stockfish && strip stockfish
	-@test -f stockfish.exe && strip stockfish.exe
@@ -47,7 +49,7 @@ install: strip
	    $(INSTALL_PATH)/$(INSTALL_EXE)
 
 clean: objclean profileclean
-	@rm -f .depend *~ core
+	@rm -f .depend
 
 objclean:
	@rm -f stockfish stockfish.exe $(OBJS) *.o.tmp
@@ -108,8 +110,8 @@ ifeq ($(call test-compiler-macro,__GNUC__),1)
 else ifeq ($(call test-compiler-macro,__clang__),1)
     $(info Using LLVM C/C++ Compiler (Clang)) $(info )
     COMP := clang
-    CLANG_MAJOR := $(call get-compiler-macro,__clang_major__)
-    LLVM_PROFDATA := $(shell command -v llvm-profdata-$(CLANG_MAJOR) 2> /dev/null || \
+    CLANG_VERSION := $(call get-compiler-macro,__clang_major__)
+    LLVM_PROFDATA := $(shell command -v llvm-profdata-$(CLANG_VERSION) 2> /dev/null || \
                          command -v llvm-profdata 2> /dev/null)
     profile_make = clang-profile-make
     profile_use = clang-profile-use
@@ -117,6 +119,7 @@ ifeq ($(call test-compiler-macro,__GNUC__),1)
 else
     $(info Using GNU C/C++ Compiler) $(info )
     COMP := gcc
+    GCC_VERSION := $(call get-compiler-macro,__GNUC__)
     profile_make = gcc-profile-make
     profile_use = gcc-profile-use
 endif
@@ -171,12 +174,20 @@ ifeq ($(optimize),yes)
     SF_CXXFLAGS += -O3
 
     ifeq ($(COMP),gcc)
-        SF_CXXFLAGS += -funroll-loops -flto=jobserver -flto-partition=one
-        SF_LDFLAGS += -flto=jobserver -flto-partition=one
+        SF_CXXFLAGS += -funroll-loops
+        ifeq ($(shell expr $(GCC_VERSION) \< 12),1)
+            SF_CXXFLAGS += -flto
+            SF_LDFLAGS += -flto
+        else
+            SF_CXXFLAGS += -flto=jobserver
+            SF_LDFLAGS += -flto=jobserver
+        endif
+        SF_CXXFLAGS += -flto-partition=one
+        SF_LDFLAGS += -flto-partition=one
     else ifeq ($(COMP),clang)
         SF_CXXFLAGS += -funroll-loops -flto=full
         SF_LDFLAGS += -flto=full
-        ifeq ($(shell expr $(CLANG_MAJOR) \< 16),1)
+        ifeq ($(shell expr $(CLANG_VERSION) \< 16),1)
            SF_CXXFLAGS += -fexperimental-new-pass-manager
         endif
     else ifeq ($(COMP),icx)
@@ -220,6 +231,8 @@ endif
 endif # CXX_REQUIRED_RULES
 
 ### 3. Add flags from architecture-specific Makefile
+### Note that this section is not enclosed in the CXX_REQUIRED_RULES block;
+### users should still be able to see the help text even when there is no compiler.
 
 ifeq ($(ARCH),)
     override ARCH := native
@@ -244,6 +257,28 @@ export ARCH
 SF_CXXFLAGS += -DARCH=$(ARCH)
 
+### 4. Extra flags for cross-compilation
+### Information about the target architecture is needed here.
+
+ifneq ($(filter $(MAKECMDGOALS),$(CXX_REQUIRED_RULES)),)
+
+# Android NDK
+ifneq ($(filter $(ARCH_FAMILY),i386 arm),)
+    ifeq ($(call test-compiler-macro,__ANDROID__),1)
+        SF_CXXFLAGS += -stdlib=libc++ -fPIE
+        SF_LDFLAGS += -static-libstdc++ -pie
+        SF_LIBS += m atomic
+    endif
+endif
+
+# Link atomic library if not i386/arm family
+ifneq ($(ARCH_NATIVE),y)
+    ifeq ($(filter $(ARCH_FAMILY),i386 arm),)
+        SF_LIBS += atomic
+    endif
+endif
+
+endif # CXX_REQUIRED_RULES
 
 endif # MAKELEVEL=0
 
 SF_CXXFLAGS := $(strip $(SF_CXXFLAGS) $(CXXFLAGS))
@@ -320,7 +355,7 @@ profile-build: config-sanity objclean profileclean
	@echo "Step 1/4. Building instrumented executable ..."
	@$(MAKE) --no-print-directory CXXFLAGS="" LDFLAGS="" $(profile_make)
	@printf "\n%s\n" "Step 2/4. Running benchmark for pgo-build ..."
-	@./$(EXE) bench > PGOBENCH.out 2>&1
+	@$(EMULATE) ./$(EXE) bench > PGOBENCH.out 2>&1
	@tail -n 4 PGOBENCH.out
	@printf "\n%s\n" "Step 3/4. Building optimized executable ..."
	@$(MAKE) --no-print-directory objclean
@@ -358,7 +393,7 @@ clang-profile-make:
	    CXXFLAGS="-fprofile-generate" LDFLAGS="-fprofile-generate" all
 
 clang-profile-use:
-	@$(XCRUN) $(LLVM_PROFDATA) merge -output=stockfish.profdata *.profraw
+	$(XCRUN) $(LLVM_PROFDATA) merge -output=stockfish.profdata *.profraw
	@$(MAKE) --no-print-directory \
	    CXXFLAGS="-fprofile-use=stockfish.profdata" \
	    LDFLAGS="-fprofile-use=stockfish.profdata" \
diff --git a/src/arch/arm/arch.h b/src/arch/arm/arch.h
index 74d91e1a4b3..30f50451aaf 100644
--- a/src/arch/arm/arch.h
+++ b/src/arch/arm/arch.h
@@ -58,15 +58,16 @@ inline void vdotq_s32_v(int32x4_t& acc, int8x16_t in, int8x16_t col) {
 #ifdef __ARM_FEATURE_DOTPROD
     acc = vdotq_s32(acc, in, col);
 #elif __ARM_ARCH >= 8
-    int16x8_t product0 = vmull_s8(vget_low_s8(a), vget_low_s8(b));
-    int16x8_t product1 = vmull_high_s8(a, b);
+    int16x8_t product0 = vmull_s8(vget_low_s8(in), vget_low_s8(col));
+    int16x8_t product1 = vmull_high_s8(in, col);
     int16x8_t sum      = vpaddq_s16(product0, product1);
     acc                = vpadalq_s16(acc, sum);
 #else
-    int16x8_t product0 = vmull_s8(vget_low_s8(a), vget_low_s8(b));
-    int16x8_t product1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
-    int16x8_t sum      = vpaddq_s16(product0, product1);
-    acc                = vpadalq_s16(acc, sum);
+    int16x8_t product0 = vmull_s8(vget_low_s8(in), vget_low_s8(col));
+    int16x8_t product1 = vmull_s8(vget_high_s8(in), vget_high_s8(col));
+    int16x8_t sum =
+      vcombine_s16(vqmovn_s32(vpaddlq_s16(product0)), vqmovn_s32(vpaddlq_s16(product1)));
+    acc = vpadalq_s16(acc, sum);
 #endif
 }
 
diff --git a/src/arch/arm/nnue/layers/affine_transform.h b/src/arch/arm/nnue/layers/affine_transform.h
index 44cafb3c3db..3fb69bce2d8 100644
--- a/src/arch/arm/nnue/layers/affine_transform.h
+++ b/src/arch/arm/nnue/layers/affine_transform.h
@@ -85,8 +85,8 @@ void AffineTransform<InDims, OutDims>::propagate(const InputType* input, OutputT
 
         for (IndexType j = 0; j < NumChunks; ++j)
         {
-            const int32x4_t in  = reinterpret_cast<const int32x4_t*>(input)[j];
-            const int32x4_t row = reinterpret_cast<const int32x4_t*>(weights)[j];
+            const int8x16_t in  = reinterpret_cast<const int8x16_t*>(input)[j];
+            const int8x16_t row = reinterpret_cast<const int8x16_t*>(weights)[j];
             vdotq_s32_v(sum, in, row);
         }
 
diff --git a/src/arch/arm/nnue/layers/affine_transform_sparse_input.h b/src/arch/arm/nnue/layers/affine_transform_sparse_input.h
index af21f3036de..546002c256f 100644
--- a/src/arch/arm/nnue/layers/affine_transform_sparse_input.h
+++ b/src/arch/arm/nnue/layers/affine_transform_sparse_input.h
@@ -33,7 +33,7 @@
 
 namespace Stockfish::Eval::NNUE::Layers {
 
-#ifdef __ARM_NEON
+#if __ARM_ARCH >= 8
 
 alignas(CacheLineSize) static const std::array<std::array<std::uint16_t, 8>, 256> lookupIndices =
   [] {
@@ -70,13 +70,13 @@ class AffineTransformSparseInput: public AffineTransform<InDims, OutDims> {
            const int32x4_t chunk0 = in[i * 2];
            const int32x4_t chunk1 = in[i * 2 + 1];
 
-            static const int32x4_t movemask = [] {
-                const std::int32_t n[4] = {1, 2, 4, 8};
-                return vld1q_s32(n);
+            static const uint32x4_t movemask = [] {
+                const std::uint32_t n[4] = {1, 2, 4, 8};
+                return vld1q_u32(n);
             }();
 
-            const std::uint32_t nnz = vaddvq_u32(vandq_s32(vtstq_s32(chunk0, chunk0), movemask))
-                                    | vaddvq_u32(vandq_s32(vtstq_s32(chunk1, chunk1), movemask))
+            const std::uint32_t nnz = vaddvq_u32(vandq_u32(vtstq_s32(chunk0, chunk0), movemask))
+                                    | vaddvq_u32(vandq_u32(vtstq_s32(chunk1, chunk1), movemask))
                                         << 4;
             const uint16x8_t offsets = *reinterpret_cast<const uint16x8_t*>(&lookupIndices[nnz]);
             *reinterpret_cast<uint16x8_t*>(indices + count) = vaddq_u16(base, offsets);
@@ -105,9 +106,10 @@ void AffineTransformSparseInput<InDims, OutDims>::propagate(const InputType* inp
 
        for (IndexType j = 0; j < count; ++j)
        {
-            const auto      i   = nnz[j];
-            const int32x4_t in  = vdupq_n_s32(reinterpret_cast<const std::int32_t*>(input)[i]);
-            const auto      col = reinterpret_cast<const int8x16_t*>(&weights[i * OutputDimensions * 4]);
+            const auto i = nnz[j];
+            const int8x16_t in =
+              vreinterpretq_s8_s32(vdupq_n_s32(reinterpret_cast<const std::int32_t*>(input)[i]));
+            const auto col = reinterpret_cast<const int8x16_t*>(&weights[i * OutputDimensions * 4]);
            for (std::size_t k = 0; k < array_size(acc); ++k)
                vdotq_s32_v(acc[k], in, col[k]);
        }
@@ -121,7 +122,7 @@ void AffineTransformSparseInput<InDims, OutDims>::propagate(const InputType* inp
 template<IndexType InDims, IndexType OutDims>
 using AffineTransformSparseInput = AffineTransform<InDims, OutDims>;
 
-#endif // __ARM_NEON
+#endif // __ARM_ARCH >= 8
 
 } // namespace Stockfish::Eval::NNUE::Layers
 
diff --git a/src/arch/arm/nnue/layers/clipped_relu.h b/src/arch/arm/nnue/layers/clipped_relu.h
index ef01b684948..7e644ec5060 100644
--- a/src/arch/arm/nnue/layers/clipped_relu.h
+++ b/src/arch/arm/nnue/layers/clipped_relu.h
@@ -60,7 +60,7 @@ void ClippedReLU<InDims>::propagate(const InputType* input, OutputType* output)
            words.tuple.val[0] = vqshrn_n_s32(in[i * 2 + 0], WeightScaleBits);
            words.tuple.val[1] = vqshrn_n_s32(in[i * 2 + 1], WeightScaleBits);
 
-            out[i] = vmax_s8(vqmovn_s16(words), vdup_n_s8(0));
+            out[i] = vmax_s8(vqmovn_s16(words.all), vdup_n_s8(0));
 #endif
        }
    }
diff --git a/src/arch/arm/nnue/nnue_feature_transformer.h b/src/arch/arm/nnue/nnue_feature_transformer.h
index df3ec0071fc..0427507d224 100644
--- a/src/arch/arm/nnue/nnue_feature_transformer.h
+++ b/src/arch/arm/nnue/nnue_feature_transformer.h
@@ -346,6 +346,6 @@ void FeatureTransformer::convert_accumulat
 
 } // namespace Stockfish::Eval::NNUE
 
-#endif // !__SSE2__
+#endif // !__ARM_NEON
 
 #endif // ARM_NNUE_FEATURE_TRANSFORMER_H_INCLUDED
diff --git a/src/arch/i386/arch.h b/src/arch/i386/arch.h
index aa98c2f21a6..1ef79088240 100644
--- a/src/arch/i386/arch.h
+++ b/src/arch/i386/arch.h
@@ -193,8 +193,27 @@ inline int tzcnt(T n) {
 #endif
 }
 
+#ifdef __SSE2__
+
+template<typename T>
+struct is_valid_vector {
+    static constexpr bool value = sizeof(T) == 16
+#ifdef __AVX2__
+                               || sizeof(T) == 32
+#endif
+#ifdef __AVX512F__
+                               || sizeof(T) == 64
+#endif
+      ;
+};
+
+template<typename T>
+inline constexpr bool is_valid_vector_v = is_valid_vector<T>::value;
+
 template<typename T>
 inline T _mm_setzero_v() {
+    static_assert(is_valid_vector_v<T>);
+
 #ifdef __AVX512F__
    if constexpr (sizeof(T) == 64)
        return _mm512_setzero_si512();
@@ -205,14 +224,14 @@ inline T _mm_setzero_v() {
        return _mm256_setzero_si256();
 #endif
 
-#ifdef __SSE2__
    if constexpr (sizeof(T) == 16)
        return _mm_setzero_si128();
-#endif
 }
 
 template<typename T>
 inline T _mm_set1_epi16_v(std::uint16_t n) {
+    static_assert(is_valid_vector_v<T>);
+
 #ifdef __AVX512F__
    if constexpr (sizeof(T) == 64)
        return _mm512_set1_epi16(n);
@@ -223,14 +242,14 @@ inline T _mm_set1_epi16_v(std::uint16_t n) {
        return _mm256_set1_epi16(n);
 #endif
 
-#ifdef __SSE2__
    if constexpr (sizeof(T) == 16)
        return _mm_set1_epi16(n);
-#endif
 }
 
 template<typename T>
 inline T _mm_set1_epi32_v(std::uint32_t n) {
+    static_assert(is_valid_vector_v<T>);
+
 #ifdef __AVX512F__
    if constexpr (sizeof(T) == 64)
        return _mm512_set1_epi32(n);
@@ -241,14 +260,14 @@ inline T _mm_set1_epi32_v(std::uint32_t n) {
        return _mm256_set1_epi32(n);
 #endif
 
-#ifdef __SSE2__
    if constexpr (sizeof(T) == 16)
        return _mm_set1_epi32(n);
-#endif
 }
 
 template<typename T>
 inline T _mm_packus_epi16_v(T a, T b) {
+    static_assert(is_valid_vector_v<T>);
+
 #ifdef __AVX512F__
    if constexpr (sizeof(T) == 64)
 #ifdef __AVX512BW__
@@ -267,14 +286,14 @@ inline T _mm_packus_epi16_v(T a, T b) {
 #endif
 #endif
 
-#ifdef __SSE2__
    if constexpr (sizeof(T) == 16)
        return _mm_packus_epi16(a, b);
-#endif
 }
 
 template<typename T>
 inline T _mm_add_epi16_v(T a, T b) {
+    static_assert(is_valid_vector_v<T>);
+
 #ifdef __AVX512F__
    if constexpr (sizeof(T) == 64)
 #ifdef __AVX512BW__
@@ -293,14 +312,14 @@ inline T _mm_add_epi16_v(T a, T b) {
 #endif
 #endif
 
-#ifdef __SSE2__
    if constexpr (sizeof(T) == 16)
        return _mm_add_epi16(a, b);
-#endif
 }
 
 template<typename T>
 inline T _mm_add_epi32_v(T a, T b) {
+    static_assert(is_valid_vector_v<T>);
+
 #ifdef __AVX512F__
    if constexpr (sizeof(T) == 64)
        return _mm512_add_epi32(a, b);
@@ -315,14 +334,14 @@ inline T _mm_add_epi32_v(T a, T b) {
 #endif
 #endif
 
-#ifdef __SSE2__
    if constexpr (sizeof(T) == 16)
        return _mm_add_epi32(a, b);
-#endif
 }
 
 template<typename T>
 inline T _mm_sub_epi16_v(T a, T b) {
+    static_assert(is_valid_vector_v<T>);
+
 #ifdef __AVX512F__
    if constexpr (sizeof(T) == 64)
 #ifdef __AVX512BW__
@@ -341,14 +360,14 @@ inline T _mm_sub_epi16_v(T a, T b) {
 #endif
 #endif
 
-#ifdef __SSE2__
    if constexpr (sizeof(T) == 16)
        return _mm_sub_epi16(a, b);
-#endif
 }
 
 template<typename T>
 inline T _mm_sub_epi32_v(T a, T b) {
+    static_assert(is_valid_vector_v<T>);
+
 #ifdef __AVX512F__
    if constexpr (sizeof(T) == 64)
        return _mm512_sub_epi32(a, b);
@@ -363,14 +382,14 @@ inline T _mm_sub_epi32_v(T a, T b) {
 #endif
 #endif
 
-#ifdef __SSE2__
    if constexpr (sizeof(T) == 16)
        return _mm_sub_epi32(a, b);
-#endif
 }
 
 template<typename T>
 inline T _mm_mulhi_epi16_v(T a, T b) {
+    static_assert(is_valid_vector_v<T>);
+
 #ifdef __AVX512F__
    if constexpr (sizeof(T) == 64)
 #ifdef __AVX512BW__
@@ -389,14 +408,14 @@ inline T _mm_mulhi_epi16_v(T a, T b) {
 #endif
 #endif
 
-#ifdef __SSE2__
    if constexpr (sizeof(T) == 16)
        return _mm_mulhi_epi16(a, b);
-#endif
 }
 
 template<typename T>
 inline T _mm_slli_epi16_v(T a, int n) {
+    static_assert(is_valid_vector_v<T>);
+
 #ifdef __AVX512F__
    if constexpr (sizeof(T) == 64)
 #ifdef __AVX512BW__
@@ -415,14 +434,14 @@ inline T _mm_slli_epi16_v(T a, int n) {
 #endif
 #endif
 
-#ifdef __SSE2__
    if constexpr (sizeof(T) == 16)
        return _mm_slli_epi16(a, n);
-#endif
 }
 
 template<typename T>
 inline T _mm_max_epi16_v(T a, T b) {
+    static_assert(is_valid_vector_v<T>);
+
 #ifdef __AVX512F__
    if constexpr (sizeof(T) == 64)
 #ifdef __AVX512BW__
@@ -441,14 +460,14 @@ inline T _mm_max_epi16_v(T a, T b) {
 #endif
 #endif
 
-#ifdef __SSE2__
    if constexpr (sizeof(T) == 16)
        return _mm_max_epi16(a, b);
-#endif
 }
 
 template<typename T>
 inline T _mm_min_epi16_v(T a, T b) {
+    static_assert(is_valid_vector_v<T>);
+
 #ifdef __AVX512F__
    if constexpr (sizeof(T) == 64)
 #ifdef __AVX512BW__
@@ -467,14 +486,14 @@ inline T _mm_min_epi16_v(T a, T b) {
 #endif
 #endif
 
-#ifdef __SSE2__
    if constexpr (sizeof(T) == 16)
        return _mm_min_epi16(a, b);
-#endif
 }
 
 template<typename T>
 inline std::int32_t _mm_reduce_add_epi32_v(T a) {
+    static_assert(is_valid_vector_v<T>);
+
 #ifdef __AVX512F__
    if constexpr (sizeof(T) == 64)
        return _mm512_reduce_add_epi32(a);
@@ -490,14 +509,12 @@ inline std::int32_t _mm_reduce_add_epi32_v(T a) {
    }
 #endif
 
-#ifdef __SSE2__
    if constexpr (sizeof(T) == 16)
    {
        a = _mm_add_epi32(a, _mm_shuffle_epi32(a, 0x4E));  // _MM_PERM_BADC
        a = _mm_add_epi32(a, _mm_shuffle_epi32(a, 0xB1));  // _MM_PERM_CDAB
        return _mm_cvtsi128_si32(a);
    }
-#endif
 }
 
 // Non-VNNI implementation of dpbusd works even with type saturation, only
@@ -505,6 +522,8 @@ inline std::int32_t _mm_reduce_add_epi32_v(T a) {
 // AffineTransform layer. Do not use this without VNNI for general purpose.
 template<typename T>
 inline void _mm_dpbusd_epi32_v(T& acc, T a, T b) {
+    static_assert(is_valid_vector_v<T>);
+
 #ifdef __AVX512F__
    if constexpr (sizeof(T) == 64)
    {
@@ -542,7 +561,6 @@ inline void _mm_dpbusd_epi32_v(T& acc, T a, T b) {
    }
 #endif
 
-#ifdef __SSE2__
    if constexpr (sizeof(T) == 16)
    {
 #if (defined(__AVX512VL__) && defined(__AVX512VNNI__)) || defined(__AVXVNNI__)
@@ -568,9 +586,10 @@ inline void _mm_dpbusd_epi32_v(T& acc, T a, T b) {
 #endif
    }
-#endif
 }
 
+#endif // __SSE2__
+
 } // namespace Stockfish
 
 #endif // I386_ARCH_H_INCLUDED
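Note (not part of the patch): the three code paths in vdotq_s32_v() above (DOTPROD, the ARMv8 fallback, and the new ARMv7 fallback) are all meant to compute the same accumulate of 4-element dot products. A minimal scalar reference for spot-checking them is sketched below; the helper name dot4_reference and the array typedefs are illustrative only, and intermediate 16-bit overflow (which the NNUE quantization scheme is assumed to avoid) is ignored.

// Scalar reference for vdotq_s32_v(): every 32-bit lane of `acc` accumulates
// the dot product of the corresponding four signed 8-bit elements of `in`
// and `col`. Illustration only; not part of the patch.
#include <array>
#include <cstdint>

using Vec16i8 = std::array<std::int8_t, 16>;
using Vec4i32 = std::array<std::int32_t, 4>;

inline void dot4_reference(Vec4i32& acc, const Vec16i8& in, const Vec16i8& col) {
    for (int lane = 0; lane < 4; ++lane)
        for (int k = 0; k < 4; ++k)
            acc[lane] += std::int32_t(in[lane * 4 + k]) * std::int32_t(col[lane * 4 + k]);
}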
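Note (not part of the patch): the movemask/vtstq_s32 sequence changed in affine_transform_sparse_input.h builds an 8-bit mask of the non-zero 32-bit lanes in two input chunks and uses it to index lookupIndices, whose rows hold the positions of the set bits. A scalar sketch of that bookkeeping follows, under the hypothetical helper name append_nnz_indices; the real SIMD code stores the whole precomputed 8-entry row at once and advances count by the popcount of the mask.

// Scalar sketch of the non-zero-index extraction used by the sparse affine
// transform: bit i of `mask` is set when the i-th 32-bit chunk is non-zero,
// and the positions of the set bits, offset by `base`, are appended to
// `indices`. Illustration only; names are hypothetical.
#include <cstddef>
#include <cstdint>

inline std::size_t append_nnz_indices(const std::int32_t* chunks,  // 8 values
                                      std::uint16_t       base,
                                      std::uint16_t*      indices,
                                      std::size_t         count) {
    std::uint32_t mask = 0;
    for (int i = 0; i < 8; ++i)
        if (chunks[i] != 0)
            mask |= 1u << i;  // corresponds to the vtstq_s32/vandq_u32/vaddvq_u32 step

    for (int i = 0; i < 8; ++i)
        if (mask & (1u << i))
            indices[count++] = std::uint16_t(base + i);

    return count;
}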