Skip to content

Commit

Permalink
Update
Browse files Browse the repository at this point in the history
* Added preliminary Android NDK support.
* Use -flto=jobserver for GCC 12 and later.
* [i386] Fixed GCC warnings related to SFINAE.
* [ARM] Disabled AffineTransformSparseInput in ARMv7.
* [ARM] Fixed ClippedReLU in ARMv7.
* [ARM] Fixed wrong variable types.
  • Loading branch information
MinetaS committed Sep 1, 2024
1 parent 4b4f7b1 commit 71dda9b
Show file tree
Hide file tree
Showing 7 changed files with 112 additions and 56 deletions.
51 changes: 43 additions & 8 deletions src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ else
INSTALL_EXE := stockfish
endif

KERNEL := $(shell uname -s)

strip:
-@test -f stockfish && strip stockfish
-@test -f stockfish.exe && strip stockfish.exe
Expand All @@ -47,7 +49,7 @@ install:
strip $(INSTALL_PATH)/$(INSTALL_EXE)

clean: objclean profileclean
@rm -f .depend *~ core
@rm -f .depend

objclean:
@rm -f stockfish stockfish.exe $(OBJS) *.o.tmp
Expand Down Expand Up @@ -108,15 +110,16 @@ ifeq ($(call test-compiler-macro,__GNUC__),1)
else ifeq ($(call test-compiler-macro,__clang__),1)
$(info Using LLVM C/C++ Compiler (Clang)) $(info )
COMP := clang
CLANG_MAJOR := $(call get-compiler-macro,__clang_major__)
LLVM_PROFDATA := $(shell command -v llvm-profdata-$(CLANG_MAJOR) 2> /dev/null || \
CLANG_VERSION := $(call get-compiler-macro,__clang_major__)
LLVM_PROFDATA := $(shell command -v llvm-profdata-$(CLANG_VERSION) 2> /dev/null || \
command -v llvm-profdata 2> /dev/null)
profile_make = clang-profile-make
profile_use = clang-profile-use
export LLVM_PROFDATA
else
$(info Using GNU C/C++ Compiler) $(info )
COMP := gcc
GCC_VERSION := $(call get-compiler-macro,__GNUC__)
profile_make = gcc-profile-make
profile_use = gcc-profile-use
endif
Expand Down Expand Up @@ -171,12 +174,20 @@ ifeq ($(optimize),yes)
SF_CXXFLAGS += -O3

ifeq ($(COMP),gcc)
SF_CXXFLAGS += -funroll-loops -flto=jobserver -flto-partition=one
SF_LDFLAGS += -flto=jobserver -flto-partition=one
SF_CXXFLAGS += -funroll-loops
ifeq ($(shell expr $(GCC_VERSION) \< 12),1)
SF_CXXFLAGS += -flto
SF_LDFLAGS += -flto
else
SF_CXXFLAGS += -flto=jobserver
SF_LDFLAGS += -flto=jobserver
endif
SF_CXXFLAGS += -flto-partition=one
SF_LDFLAGS += -flto-partition=one
else ifeq ($(COMP),clang)
SF_CXXFLAGS += -funroll-loops -flto=full
SF_LDFLAGS += -flto=full
ifeq ($(shell expr $(CLANG_MAJOR) \< 16),1)
ifeq ($(shell expr $(CLANG_VERSION) \< 16),1)
SF_CXXFLAGS += -fexperimental-new-pass-manager
endif
else ifeq ($(COMP),icx)
Expand Down Expand Up @@ -220,6 +231,8 @@ endif
endif # CXX_REQUIRED_RULES

### 3. Add flags from architecture-specific Makefile
### Note that this section is not enclosed in the CXX_REQUIRED_RULES block;
### Users must still be able to see the help text even when no compiler is available.

ifeq ($(ARCH),)
override ARCH := native
Expand All @@ -244,6 +257,28 @@ export ARCH

SF_CXXFLAGS += -DARCH=$(ARCH)

### 4. Extra flags for cross-compilation
### Information about the target architecture is needed here.

ifneq ($(filter $(MAKECMDGOALS),$(CXX_REQUIRED_RULES)),)

# Android NDK
ifneq ($(filter $(ARCH_FAMILY),i386 arm),)
ifeq ($(call test-compiler-macro,__ANDROID__),1)
SF_CXXFLAGS += -stdlib=libc++ -fPIE
SF_LDFLAGS += -static-libstdc++ -pie
SF_LIBS += m atomic
endif
endif

# Link atomic library if not i386/arm family
ifneq ($(ARCH_NATIVE),y)
ifeq ($(filter $(ARCH_FAMILY),i386 arm),)
SF_LIBS += atomic
endif
endif

endif # CXX_REQUIRED_RULES
endif # MAKELEVEL=0

SF_CXXFLAGS := $(strip $(SF_CXXFLAGS) $(CXXFLAGS))
Expand Down Expand Up @@ -320,7 +355,7 @@ profile-build: config-sanity objclean profileclean
@echo "Step 1/4. Building instrumented executable ..."
@$(MAKE) --no-print-directory CXXFLAGS="" LDFLAGS="" $(profile_make)
@printf "\n%s\n" "Step 2/4. Running benchmark for pgo-build ..."
@./$(EXE) bench > PGOBENCH.out 2>&1
@$(EMULATE) ./$(EXE) bench > PGOBENCH.out 2>&1
@tail -n 4 PGOBENCH.out
@printf "\n%s\n" "Step 3/4. Building optimized executable ..."
@$(MAKE) --no-print-directory objclean
Expand Down Expand Up @@ -358,7 +393,7 @@ clang-profile-make:
CXXFLAGS="-fprofile-generate" LDFLAGS="-fprofile-generate" all

clang-profile-use:
@$(XCRUN) $(LLVM_PROFDATA) merge -output=stockfish.profdata *.profraw
$(XCRUN) $(LLVM_PROFDATA) merge -output=stockfish.profdata *.profraw
@$(MAKE) --no-print-directory \
CXXFLAGS="-fprofile-use=stockfish.profdata" \
LDFLAGS="-fprofile-use=stockfish.profdata" \
Expand Down
13 changes: 7 additions & 6 deletions src/arch/arm/arch.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,15 +58,16 @@ inline void vdotq_s32_v(int32x4_t& acc, int8x16_t in, int8x16_t col) {
#ifdef __ARM_FEATURE_DOTPROD
acc = vdotq_s32(acc, in, col);
#elif __ARM_ARCH >= 8
int16x8_t product0 = vmull_s8(vget_low_s8(a), vget_low_s8(b));
int16x8_t product1 = vmull_high_s8(a, b);
int16x8_t product0 = vmull_s8(vget_low_s8(in), vget_low_s8(col));
int16x8_t product1 = vmull_high_s8(in, col);
int16x8_t sum = vpaddq_s16(product0, product1);
acc = vpadalq_s16(acc, sum);
#else
int16x8_t product0 = vmull_s8(vget_low_s8(a), vget_low_s8(b));
int16x8_t product1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
int16x8_t sum = vpaddq_s16(product0, product1);
acc = vpadalq_s16(acc, sum);
int16x8_t product0 = vmull_s8(vget_low_s8(in), vget_low_s8(col));
int16x8_t product1 = vmull_s8(vget_high_s8(in), vget_high_s8(col));
int16x8_t sum =
vcombine_s16(vqmovn_s32(vpaddlq_s16(product0)), vqmovn_s32(vpaddlq_s16(product1)));
acc = vpadalq_s16(acc, sum);
#endif
}

Expand Down
4 changes: 2 additions & 2 deletions src/arch/arm/nnue/layers/affine_transform.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,8 +85,8 @@ void AffineTransform<InDims, OutDims>::propagate(const InputType* input, OutputT

for (IndexType j = 0; j < NumChunks; ++j)
{
const int32x4_t in = reinterpret_cast<const int32x4_t*>(input)[j];
const int32x4_t row = reinterpret_cast<const int32x4_t*>(weights)[j];
const int8x16_t in = reinterpret_cast<const int8x16_t*>(input)[j];
const int8x16_t row = reinterpret_cast<const int8x16_t*>(weights)[j];
vdotq_s32_v(sum, in, row);
}

Expand Down
21 changes: 11 additions & 10 deletions src/arch/arm/nnue/layers/affine_transform_sparse_input.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@

namespace Stockfish::Eval::NNUE::Layers {

#ifdef __ARM_NEON
#if __ARM_ARCH >= 8

alignas(CacheLineSize) static const std::array<std::array<std::uint16_t, 8>, 256> lookupIndices =
[] {
Expand Down Expand Up @@ -70,13 +70,13 @@ class AffineTransformSparseInput: public AffineTransform<InDims, OutDims> {
const int32x4_t chunk0 = in[i * 2];
const int32x4_t chunk1 = in[i * 2 + 1];

static const int32x4_t movemask = [] {
const std::int32_t n[4] = {1, 2, 4, 8};
return vld1q_s32(n);
static const uint32x4_t movemask = [] {
const std::uint32_t n[4] = {1, 2, 4, 8};
return vld1q_u32(n);
}();

const std::uint32_t nnz = vaddvq_u32(vandq_s32(vtstq_s32(chunk0, chunk0), movemask))
| vaddvq_u32(vandq_s32(vtstq_s32(chunk1, chunk1), movemask))
const std::uint32_t nnz = vaddvq_u32(vandq_u32(vtstq_s32(chunk0, chunk0), movemask))
| vaddvq_u32(vandq_u32(vtstq_s32(chunk1, chunk1), movemask))
<< 4;
const uint16x8_t offsets = *reinterpret_cast<const uint16x8_t*>(&lookupIndices[nnz]);
*reinterpret_cast<uint16x8_t*>(indices + count) = vaddq_u16(base, offsets);
Expand Down Expand Up @@ -105,9 +105,10 @@ void AffineTransformSparseInput<InDims, OutDims>::propagate(const InputType* inp

for (IndexType j = 0; j < count; ++j)
{
const auto i = nnz[j];
const int32x4_t in = vdupq_n_s32(reinterpret_cast<const std::int32_t*>(input)[i]);
const auto col = reinterpret_cast<const int32x4_t*>(&weights[i * OutputDimensions * 4]);
const auto i = nnz[j];
const int8x16_t in =
vreinterpretq_s8_s32(vdupq_n_s32(reinterpret_cast<const std::int32_t*>(input)[i]));
const auto col = reinterpret_cast<const int8x16_t*>(&weights[i * OutputDimensions * 4]);
for (std::size_t k = 0; k < array_size(acc); ++k)
vdotq_s32_v(acc[k], in, col[k]);
}
Expand All @@ -121,7 +122,7 @@ void AffineTransformSparseInput<InDims, OutDims>::propagate(const InputType* inp
template<IndexType InDims, IndexType OutDims>
using AffineTransformSparseInput = AffineTransform<InDims, OutDims>;

#endif // __ARM_NEON
#endif // __ARM_ARCH >= 8

} // namespace Stockfish::Eval::NNUE::Layers

Expand Down
2 changes: 1 addition & 1 deletion src/arch/arm/nnue/layers/clipped_relu.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ void ClippedReLU<InDims>::propagate(const InputType* input, OutputType* output)

words.tuple.val[0] = vqshrn_n_s32(in[i * 2 + 0], WeightScaleBits);
words.tuple.val[1] = vqshrn_n_s32(in[i * 2 + 1], WeightScaleBits);
out[i] = vmax_s8(vqmovn_s16(words), vdup_n_s8(0));
out[i] = vmax_s8(vqmovn_s16(words.all), vdup_n_s8(0));
#endif
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/arch/arm/nnue/nnue_feature_transformer.h
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,6 @@ void FeatureTransformer<TransformedFeatureDimensions, accPtr>::convert_accumulat

} // namespace Stockfish::Eval::NNUE

#endif // !__SSE2__
#endif // !__ARM_NEON

#endif // ARM_NNUE_FEATURE_TRANSFORMER_H_INCLUDED
Loading

0 comments on commit 71dda9b

Please sign in to comment.