Skip to content

Commit

Permalink
Add AVX512 and ARM SVE pre-sieving (#154)
Browse files Browse the repository at this point in the history
* Add AVX512 pre-sieving

* Fix typo

* Update ChangeLog

* Fix typo

* Add AVX512 BW info

* Fix macro name

* Add ARM SVE support

* Fix preprocessor logic

* Remove comments

* Add ARM SVE pre-sieving

* Fix undefined behavior

* Update ChangeLog
  • Loading branch information
kimwalisch authored Nov 11, 2024
1 parent cdfd3e1 commit 99536a8
Show file tree
Hide file tree
Showing 9 changed files with 526 additions and 70 deletions.
7 changes: 6 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -86,9 +86,14 @@ set(LIB_SRC src/api-c.cpp

if(WITH_MULTIARCH)
include("${PROJECT_SOURCE_DIR}/cmake/multiarch_x86_popcnt.cmake")
include("${PROJECT_SOURCE_DIR}/cmake/multiarch_avx512_bw.cmake")
include("${PROJECT_SOURCE_DIR}/cmake/multiarch_avx512_vbmi2.cmake")

if(multiarch_x86_popcnt OR multiarch_avx512_vbmi2)
if(NOT multiarch_avx512_bw)
include("${PROJECT_SOURCE_DIR}/cmake/multiarch_sve_arm.cmake")
endif()

if(multiarch_x86_popcnt OR multiarch_avx512_bw OR multiarch_avx512_vbmi2)
set(LIB_SRC ${LIB_SRC} src/x86/cpuid.cpp)
endif()
endif()
Expand Down
3 changes: 2 additions & 1 deletion ChangeLog
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
Changes in version 12.6, 10/11/2024
Changes in version 12.6, 11/11/2024
===================================

* CpuInfo.cpp: Correctly detect Intel Arrow Lake CPU cache
topology on Windows and Linux.
* PreSieve.cpp: Increased pre-sieving from primes <= 100 to
primes <= 163. Memory usage of pre-sieve lookup tables has been
reduced from 210 kilobytes to 123 kilobytes.
* PreSieve.cpp: Added AVX512 and ARM SVE multiarch support.

Changes in version 12.5, 22/10/2024
===================================
Expand Down
80 changes: 80 additions & 0 deletions cmake/multiarch_avx512_bw.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# We use GCC/Clang's function multi-versioning for AVX512
# support. This code will automatically dispatch to the
# AVX512 BW algorithm if the CPU supports it and use the
# default (portable) algorithm otherwise.

include(CheckCXXSourceCompiles)
include(CMakePushCheckState)

cmake_push_check_state()
set(CMAKE_REQUIRED_INCLUDES "${PROJECT_SOURCE_DIR}")

check_cxx_source_compiles("
// GCC/Clang function multiversioning for AVX512 is not needed if
// the user compiles with -mavx512f -mavx512bw.
// GCC/Clang function multiversioning generally causes a minor
// overhead, hence we disable it if it is not needed.
#if defined(__AVX512F__) && \
defined(__AVX512BW__)
Error: AVX512BW multiarch not needed!
#endif
#include <src/x86/cpuid.cpp>
#include <immintrin.h>
#include <stdint.h>
#include <cstddef>
__attribute__ ((target (\"avx512f,avx512bw\")))
void AND_PreSieveTables_avx512(const uint8_t* __restrict preSieve0,
const uint8_t* __restrict preSieve1,
uint8_t* __restrict sieve,
std::size_t bytes)
{
std::size_t i = 0;
for (; i + 64 <= bytes; i += sizeof(__m512i))
{
_mm512_storeu_epi8((__m512i*) &sieve[i],
_mm512_and_si512(_mm512_loadu_epi8((const __m512i*) &preSieve0[i]),
_mm512_loadu_epi8((const __m512i*) &preSieve1[i])));
}
if (i < bytes)
{
__mmask64 mask = 0xffffffffffffffffull >> (i + 64 - bytes);
_mm512_mask_storeu_epi8((__m512i*) &sieve[i], mask,
_mm512_and_si512(_mm512_maskz_loadu_epi8(mask, (const __m512i*) &preSieve0[i]),
_mm512_maskz_loadu_epi8(mask, (const __m512i*) &preSieve1[i])));
}
}
void AND_PreSieveTables_default(const uint8_t* __restrict preSieved0,
const uint8_t* __restrict preSieved1,
uint8_t* __restrict sieve,
std::size_t bytes)
{
for (std::size_t i = 0; i < bytes; i++)
sieve[i] = preSieved0[i] & preSieved1[i];
}
int main()
{
uint8_t sieve[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
uint8_t PreSieveTable1[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
uint8_t PreSieveTable2[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
if (primesieve::has_cpuid_avx512_bw())
AND_PreSieveTables_avx512(&PreSieveTable1[0], &PreSieveTable2[1], &sieve[0], 10);
else
AND_PreSieveTables_default(&PreSieveTable1[0], &PreSieveTable2[1], &sieve[0], 10);
return (sieve[0] == 0) ? 0 : 1;
}
" multiarch_avx512_bw)

if(multiarch_avx512_bw)
list(APPEND PRIMESIEVE_COMPILE_DEFINITIONS "ENABLE_MULTIARCH_AVX512_BW")
endif()

cmake_pop_check_state()
78 changes: 78 additions & 0 deletions cmake/multiarch_sve_arm.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# We use GCC/Clang's function multi-versioning for ARM SVE
# support. This code will automatically dispatch to the
# ARM SVE algorithm if the CPU supports it and use the default
# (portable) algorithm otherwise.

include(CheckCXXSourceCompiles)
include(CMakePushCheckState)

cmake_push_check_state()
set(CMAKE_REQUIRED_INCLUDES "${PROJECT_SOURCE_DIR}/include")

check_cxx_source_compiles("
// GCC/Clang function multiversioning for ARM SVE is not needed
// if the user compiles with -march=armv8-a+sve. GCC/Clang
// function multiversioning generally causes a minor overhead,
// hence we disable it if it is not needed.
#if defined(__ARM_FEATURE_SVE) && \
__has_include(<arm_sve.h>)
Error: ARM SVE multiarch not needed!
#endif
#include <primesieve/cpu_supports_arm_sve.hpp>
#include <arm_sve.h>
#include <stdint.h>
#include <cstddef>
__attribute__ ((target (\"arch=armv8-a+sve\")))
void AND_PreSieveTables_arm_sve(const uint8_t* __restrict preSieved0,
const uint8_t* __restrict preSieved1,
const uint8_t* __restrict preSieved2,
const uint8_t* __restrict preSieved3,
uint8_t* __restrict sieve,
std::size_t bytes)
{
for (std::size_t i = 0; i < bytes; i += svcntb())
{
svbool_t pg = svwhilelt_b8(i, bytes);
svst1_u8(pg, &sieve[i],
svand_u8_x(svptrue_b64(),
svand_u8_z(pg, svld1_u8(pg, &preSieved0[i]), svld1_u8(pg, &preSieved1[i])),
svand_u8_z(pg, svld1_u8(pg, &preSieved2[i]), svld1_u8(pg, &preSieved3[i]))));
}
}
void AND_PreSieveTables_default(const uint8_t* __restrict preSieved0,
const uint8_t* __restrict preSieved1,
const uint8_t* __restrict preSieved2,
const uint8_t* __restrict preSieved3,
uint8_t* __restrict sieve,
std::size_t bytes)
{
for (std::size_t i = 0; i < bytes; i++)
sieve[i] = preSieved0[i] & preSieved1[i] & preSieved2[i] & preSieved3[i];
}
int main()
{
uint8_t sieve[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
uint8_t PreSieveTable1[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
uint8_t PreSieveTable2[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
uint8_t PreSieveTable3[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
uint8_t PreSieveTable4[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
if (cpu_supports_sve)
AND_PreSieveTables_arm_sve(&PreSieveTable1[0], &PreSieveTable2[1], &PreSieveTable3[1], &PreSieveTable4[1], &sieve[0], 10);
else
AND_PreSieveTables_default(&PreSieveTable1[0], &PreSieveTable2[1], &PreSieveTable3[1], &PreSieveTable4[1], &sieve[0], 10);
return (sieve[0] == 0) ? 0 : 1;
}
" multiarch_sve_arm)

if(multiarch_sve_arm)
list(APPEND PRIMESIEVE_COMPILE_DEFINITIONS "ENABLE_MULTIARCH_ARM_SVE")
endif()

cmake_pop_check_state()
27 changes: 27 additions & 0 deletions include/primesieve/cpu_supports_arm_sve.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
///
/// @file cpu_supports_arm_sve.hpp
/// Check if the CPU supports the ARM SVE instruction set.
///
/// Copyright (C) 2024 Kim Walisch, <[email protected]>
///
/// This file is distributed under the BSD License. See the COPYING
/// file in the top level directory.
///

#ifndef CPU_SUPPORTS_ARM_SVE_HPP
#define CPU_SUPPORTS_ARM_SVE_HPP

#include "macros.hpp"

#if __has_builtin(__builtin_cpu_supports)

namespace {

/// Initialized at startup
const bool cpu_supports_sve = __builtin_cpu_supports("sve");

} // namespace

#endif // __builtin_cpu_supports

#endif
27 changes: 27 additions & 0 deletions include/primesieve/cpu_supports_avx512_bw.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
///
/// @file cpu_supports_avx512_bw.hpp
/// @brief Detect if the x86 CPU supports AVX512 BW.
///
/// Copyright (C) 2024 Kim Walisch, <[email protected]>
///
/// This file is distributed under the BSD License. See the COPYING
/// file in the top level directory.
///

#ifndef CPU_SUPPORTS_AVX512_BW_HPP
#define CPU_SUPPORTS_AVX512_BW_HPP

namespace primesieve {

bool has_cpuid_avx512_bw();

} // namespace

namespace {

/// Initialized at startup
const bool cpu_supports_avx512_bw = primesieve::has_cpuid_avx512_bw();

} // namespace

#endif
Loading

0 comments on commit 99536a8

Please sign in to comment.