-
-
Notifications
You must be signed in to change notification settings - Fork 123
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add AVX512 and ARM SVE pre-sieving (#154)
* Add AVX512 pre-sieving * Fix typo * Update ChangeLog * Fix typo * Add AVX512 BW info * Fix macro name * Add ARM SVE support * Fix preprocessor logic * Remove comments * Add ARM SVE pre-sieving * Fix undefined behavior * Update ChangeLog
- Loading branch information
1 parent
cdfd3e1
commit 99536a8
Showing
9 changed files
with
526 additions
and
70 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
# We use GCC/Clang's function multi-versioning for AVX512 | ||
# support. This code will automatically dispatch to the | ||
# AVX512 BW algorithm if the CPU supports it and use the | ||
# default (portable) algorithm otherwise. | ||
|
||
include(CheckCXXSourceCompiles) | ||
include(CMakePushCheckState) | ||
|
||
cmake_push_check_state() | ||
set(CMAKE_REQUIRED_INCLUDES "${PROJECT_SOURCE_DIR}") | ||
|
||
check_cxx_source_compiles(" | ||
// GCC/Clang function multiversioning for AVX512 is not needed if | ||
// the user compiles with -mavx512f -mavx512bw. | ||
// GCC/Clang function multiversioning generally causes a minor | ||
// overhead, hence we disable it if it is not needed. | ||
#if defined(__AVX512F__) && \ | ||
defined(__AVX512BW__) | ||
Error: AVX512BW multiarch not needed! | ||
#endif | ||
#include <src/x86/cpuid.cpp> | ||
#include <immintrin.h> | ||
#include <stdint.h> | ||
#include <cstddef> | ||
__attribute__ ((target (\"avx512f,avx512bw\"))) | ||
void AND_PreSieveTables_avx512(const uint8_t* __restrict preSieve0, | ||
const uint8_t* __restrict preSieve1, | ||
uint8_t* __restrict sieve, | ||
std::size_t bytes) | ||
{ | ||
std::size_t i = 0; | ||
for (; i + 64 <= bytes; i += sizeof(__m512i)) | ||
{ | ||
_mm512_storeu_epi8((__m512i*) &sieve[i], | ||
_mm512_and_si512(_mm512_loadu_epi8((const __m512i*) &preSieve0[i]), | ||
_mm512_loadu_epi8((const __m512i*) &preSieve1[i]))); | ||
} | ||
if (i < bytes) | ||
{ | ||
__mmask64 mask = 0xffffffffffffffffull >> (i + 64 - bytes); | ||
_mm512_mask_storeu_epi8((__m512i*) &sieve[i], mask, | ||
_mm512_and_si512(_mm512_maskz_loadu_epi8(mask, (const __m512i*) &preSieve0[i]), | ||
_mm512_maskz_loadu_epi8(mask, (const __m512i*) &preSieve1[i]))); | ||
} | ||
} | ||
void AND_PreSieveTables_default(const uint8_t* __restrict preSieved0, | ||
const uint8_t* __restrict preSieved1, | ||
uint8_t* __restrict sieve, | ||
std::size_t bytes) | ||
{ | ||
for (std::size_t i = 0; i < bytes; i++) | ||
sieve[i] = preSieved0[i] & preSieved1[i]; | ||
} | ||
int main() | ||
{ | ||
uint8_t sieve[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; | ||
uint8_t PreSieveTable1[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; | ||
uint8_t PreSieveTable2[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; | ||
if (primesieve::has_cpuid_avx512_bw()) | ||
AND_PreSieveTables_avx512(&PreSieveTable1[0], &PreSieveTable2[1], &sieve[0], 10); | ||
else | ||
AND_PreSieveTables_default(&PreSieveTable1[0], &PreSieveTable2[1], &sieve[0], 10); | ||
return (sieve[0] == 0) ? 0 : 1; | ||
} | ||
" multiarch_avx512_bw) | ||
|
||
if(multiarch_avx512_bw) | ||
list(APPEND PRIMESIEVE_COMPILE_DEFINITIONS "ENABLE_MULTIARCH_AVX512_BW") | ||
endif() | ||
|
||
cmake_pop_check_state() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
# We use GCC/Clang's function multi-versioning for ARM SVE | ||
# support. This code will automatically dispatch to the | ||
# ARM SVE algorithm if the CPU supports it and use the default | ||
# (portable) algorithm otherwise. | ||
|
||
include(CheckCXXSourceCompiles) | ||
include(CMakePushCheckState) | ||
|
||
cmake_push_check_state() | ||
set(CMAKE_REQUIRED_INCLUDES "${PROJECT_SOURCE_DIR}/include") | ||
|
||
check_cxx_source_compiles(" | ||
// GCC/Clang function multiversioning for ARM SVE is not needed | ||
// if the user compiles with -march=armv8-a+sve. GCC/Clang | ||
// function multiversioning generally causes a minor overhead, | ||
// hence we disable it if it is not needed. | ||
#if defined(__ARM_FEATURE_SVE) && \ | ||
__has_include(<arm_sve.h>) | ||
Error: ARM SVE multiarch not needed! | ||
#endif | ||
#include <primesieve/cpu_supports_arm_sve.hpp> | ||
#include <arm_sve.h> | ||
#include <stdint.h> | ||
#include <cstddef> | ||
__attribute__ ((target (\"arch=armv8-a+sve\"))) | ||
void AND_PreSieveTables_arm_sve(const uint8_t* __restrict preSieved0, | ||
const uint8_t* __restrict preSieved1, | ||
const uint8_t* __restrict preSieved2, | ||
const uint8_t* __restrict preSieved3, | ||
uint8_t* __restrict sieve, | ||
std::size_t bytes) | ||
{ | ||
for (std::size_t i = 0; i < bytes; i += svcntb()) | ||
{ | ||
svbool_t pg = svwhilelt_b8(i, bytes); | ||
svst1_u8(pg, &sieve[i], | ||
svand_u8_x(svptrue_b64(), | ||
svand_u8_z(pg, svld1_u8(pg, &preSieved0[i]), svld1_u8(pg, &preSieved1[i])), | ||
svand_u8_z(pg, svld1_u8(pg, &preSieved2[i]), svld1_u8(pg, &preSieved3[i])))); | ||
} | ||
} | ||
void AND_PreSieveTables_default(const uint8_t* __restrict preSieved0, | ||
const uint8_t* __restrict preSieved1, | ||
const uint8_t* __restrict preSieved2, | ||
const uint8_t* __restrict preSieved3, | ||
uint8_t* __restrict sieve, | ||
std::size_t bytes) | ||
{ | ||
for (std::size_t i = 0; i < bytes; i++) | ||
sieve[i] = preSieved0[i] & preSieved1[i] & preSieved2[i] & preSieved3[i]; | ||
} | ||
int main() | ||
{ | ||
uint8_t sieve[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; | ||
uint8_t PreSieveTable1[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; | ||
uint8_t PreSieveTable2[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; | ||
uint8_t PreSieveTable3[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; | ||
uint8_t PreSieveTable4[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; | ||
if (cpu_supports_sve) | ||
AND_PreSieveTables_arm_sve(&PreSieveTable1[0], &PreSieveTable2[1], &PreSieveTable3[1], &PreSieveTable4[1], &sieve[0], 10); | ||
else | ||
AND_PreSieveTables_default(&PreSieveTable1[0], &PreSieveTable2[1], &PreSieveTable3[1], &PreSieveTable4[1], &sieve[0], 10); | ||
return (sieve[0] == 0) ? 0 : 1; | ||
} | ||
" multiarch_sve_arm) | ||
|
||
if(multiarch_sve_arm) | ||
list(APPEND PRIMESIEVE_COMPILE_DEFINITIONS "ENABLE_MULTIARCH_ARM_SVE") | ||
endif() | ||
|
||
cmake_pop_check_state() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
/// | ||
/// @file cpu_supports_arm_sve.hpp | ||
/// Check if the CPU supports the ARM SVE instruction set. | ||
/// | ||
/// Copyright (C) 2024 Kim Walisch, <[email protected]> | ||
/// | ||
/// This file is distributed under the BSD License. See the COPYING | ||
/// file in the top level directory. | ||
/// | ||
|
||
#ifndef CPU_SUPPORTS_ARM_SVE_HPP | ||
#define CPU_SUPPORTS_ARM_SVE_HPP | ||
|
||
#include "macros.hpp" | ||
|
||
#if __has_builtin(__builtin_cpu_supports) | ||
|
||
namespace { | ||
|
||
/// Initialized at startup | ||
const bool cpu_supports_sve = __builtin_cpu_supports("sve"); | ||
|
||
} // namespace | ||
|
||
#endif // __builtin_cpu_supports | ||
|
||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
/// | ||
/// @file cpu_supports_avx512_bw.hpp | ||
/// @brief Detect if the x86 CPU supports AVX512 BW. | ||
/// | ||
/// Copyright (C) 2024 Kim Walisch, <[email protected]> | ||
/// | ||
/// This file is distributed under the BSD License. See the COPYING | ||
/// file in the top level directory. | ||
/// | ||
|
||
#ifndef CPU_SUPPORTS_AVX512_BW_HPP | ||
#define CPU_SUPPORTS_AVX512_BW_HPP | ||
|
||
namespace primesieve { | ||
|
||
bool has_cpuid_avx512_bw(); | ||
|
||
} // namespace | ||
|
||
namespace { | ||
|
||
/// Initialized at startup | ||
const bool cpu_supports_avx512_bw = primesieve::has_cpuid_avx512_bw(); | ||
|
||
} // namespace | ||
|
||
#endif |
Oops, something went wrong.