Skip to content

Commit

Permalink
Improve large buffer crc32c performance on Arm [3/3]
Browse files Browse the repository at this point in the history
Summary: Integrate neon and neon+eor3+sha3 crc32c implementations into the Folly hash library.

Reviewed By: skrueger

Differential Revision: D59322056

fbshipit-source-id: cb6fba0ec6677f439235d90e2d95d248ba7c47e2
  • Loading branch information
Michael van der Westhuizen authored and facebook-github-bot committed Sep 29, 2024
1 parent e7499ed commit 4ab180a
Show file tree
Hide file tree
Showing 5 changed files with 159 additions and 0 deletions.
2 changes: 2 additions & 0 deletions folly/hash/BUCK
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ cpp_library(
"//folly:cpu_id",
"//folly/detail:traponavx512",
"//folly/external/fast-crc32:avx512_crc32c_v8s3x4", # @manual
"//folly/external/fast-crc32:neon_crc32c_v3s4x2e_v2", # @manual
"//folly/external/fast-crc32:neon_eor3_crc32c_v8s2x4_s3", # @manual
"//folly/external/fast-crc32:sse_crc32c_v8s3x3", # @manual
"//folly/hash/detail:checksum_detail",
],
Expand Down
38 changes: 38 additions & 0 deletions folly/hash/Checksum.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
#include <folly/CpuId.h>
#include <folly/detail/TrapOnAvx512.h>
#include <folly/external/fast-crc32/avx512_crc32c_v8s3x4.h> // @manual
#include <folly/external/fast-crc32/neon_crc32c_v3s4x2e_v2.h> // @manual
#include <folly/external/fast-crc32/neon_eor3_crc32c_v8s2x4_s3.h> // @manual
#include <folly/external/fast-crc32/sse_crc32c_v8s3x3.h> // @manual
#include <folly/hash/detail/ChecksumDetail.h>

Expand Down Expand Up @@ -90,6 +92,14 @@ bool crc32_hw_supported() {
return id.sse42();
}

// Reports NEON CRC-32C availability for the preprocessor branch that precedes
// `#elif FOLLY_ARM_FEATURE_CRC32`: this definition never enables the kernel.
bool crc32c_hw_supported_neon() {
  constexpr bool kNeonCrc32cAvailable = false;
  return kNeonCrc32cAvailable;
}

// Reports NEON+EOR3+SHA3 CRC-32C availability for the preprocessor branch
// that precedes `#elif FOLLY_ARM_FEATURE_CRC32`: always "not supported".
bool crc32c_hw_supported_neon_eor3_sha3() {
  constexpr bool kNeonEor3Sha3Available = false;
  return kNeonEor3Sha3Available;
}

#elif FOLLY_ARM_FEATURE_CRC32

// crc32_hw is defined in folly/external/nvidia/hash/Checksum.cpp
Expand All @@ -106,6 +116,16 @@ bool crc32c_hw_supported_avx512() {
return false;
}

// Reports whether the NEON CRC-32C kernel (neon_crc32c_v3s4x2e_v2) can be
// used. has_neon_crc32c_v3s4x2e_v2() is evaluated once, on the first call;
// the function-local static keeps that answer for every later call.
bool crc32c_hw_supported_neon() {
  static const bool kProbeResult = has_neon_crc32c_v3s4x2e_v2();
  return kProbeResult;
}

// Reports whether the NEON+EOR3+SHA3 CRC-32C kernel
// (neon_eor3_crc32c_v8s2x4_s3) can be used. The probe
// has_neon_eor3_crc32c_v8s2x4_s3() is evaluated once, on the first call; the
// function-local static keeps that answer for every later call.
bool crc32c_hw_supported_neon_eor3_sha3() {
  static const bool kProbeResult = has_neon_eor3_crc32c_v8s2x4_s3();
  return kProbeResult;
}

// Hardware CRC-32 is reported as available unconditionally in this
// (FOLLY_ARM_FEATURE_CRC32) branch.
bool crc32_hw_supported() {
  constexpr bool kCrc32HwAvailable = true;
  return kCrc32HwAvailable;
}
Expand Down Expand Up @@ -134,6 +154,14 @@ bool crc32c_hw_supported_avx512() {
// Hardware CRC-32 is reported as unavailable in this branch (the last one
// before `#endif`; presumably the no-acceleration fallback -- confirm against
// the full file).
bool crc32_hw_supported() {
  constexpr bool kCrc32HwAvailable = false;
  return kCrc32HwAvailable;
}

// NEON CRC-32C is reported as unavailable in this branch (the last one before
// `#endif`).
bool crc32c_hw_supported_neon() {
  constexpr bool kNeonCrc32cAvailable = false;
  return kNeonCrc32cAvailable;
}

// NEON+EOR3+SHA3 CRC-32C is reported as unavailable in this branch (the last
// one before `#endif`).
bool crc32c_hw_supported_neon_eor3_sha3() {
  constexpr bool kNeonEor3Sha3Available = false;
  return kNeonEor3Sha3Available;
}
#endif

template <uint32_t CRC_POLYNOMIAL>
Expand Down Expand Up @@ -179,6 +207,16 @@ uint32_t crc32c(const uint8_t* data, size_t nbytes, uint32_t startingChecksum) {
}
#endif

#if FOLLY_AARCH64
if (nbytes >= 2048 && detail::crc32c_hw_supported_neon_eor3_sha3()) {
return detail::neon_eor3_crc32c_v8s2x4_s3(data, nbytes, startingChecksum);
}

if (nbytes >= 4096 && detail::crc32c_hw_supported_neon()) {
return detail::neon_crc32c_v3s4x2e_v2(data, nbytes, startingChecksum);
}
#endif

if (detail::crc32c_hw_supported()) {
#if defined(FOLLY_ENABLE_SSE42_CRC32C_V8S3X3)
if (nbytes > 4096) {
Expand Down
12 changes: 12 additions & 0 deletions folly/hash/detail/ChecksumDetail.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,18 @@ bool crc32c_hw_supported();
*/
bool crc32c_hw_supported_avx512();

/**
 * Check whether a NEON hardware-accelerated CRC-32C implementation is
 * supported on the current CPU.
 *
 * folly::crc32c() consults this on AArch64 builds before handing buffers of
 * at least 4096 bytes to detail::neon_crc32c_v3s4x2e_v2.
 *
 * @return true if the NEON implementation may be used, false otherwise.
 */
bool crc32c_hw_supported_neon();

/**
 * Check whether a NEON+EOR3+SHA3 hardware-accelerated CRC-32C implementation
 * is supported on the current CPU.
 *
 * folly::crc32c() consults this on AArch64 builds before handing buffers of
 * at least 2048 bytes to detail::neon_eor3_crc32c_v8s2x4_s3; that check is
 * made ahead of the plain NEON one.
 *
 * @return true if the NEON+EOR3+SHA3 implementation may be used, false
 *         otherwise.
 */
bool crc32c_hw_supported_neon_eor3_sha3();

/**
* Compute a CRC-32C checksum of a buffer using a portable,
* software-only implementation.
Expand Down
3 changes: 3 additions & 0 deletions folly/hash/test/BUCK
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,11 @@ cpp_unittest(
headers = [],
deps = [
"//folly:benchmark",
"//folly:portability",
"//folly:random",
"//folly/external/fast-crc32:avx512_crc32c_v8s3x4",
"//folly/external/fast-crc32:neon_crc32c_v3s4x2e_v2",
"//folly/external/fast-crc32:neon_eor3_crc32c_v8s2x4_s3",
"//folly/external/fast-crc32:sse_crc32c_v8s3x3",
"//folly/hash:checksum",
"//folly/hash:hash",
Expand Down
104 changes: 104 additions & 0 deletions folly/hash/test/ChecksumTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,11 @@
#include <boost/crc.hpp>

#include <folly/Benchmark.h>
#include <folly/Portability.h>
#include <folly/Random.h>
#include <folly/external/fast-crc32/avx512_crc32c_v8s3x4.h>
#include <folly/external/fast-crc32/neon_crc32c_v3s4x2e_v2.h>
#include <folly/external/fast-crc32/neon_eor3_crc32c_v8s2x4_s3.h>
#include <folly/external/fast-crc32/sse_crc32c_v8s3x3.h>
#include <folly/hash/Hash.h>
#include <folly/hash/detail/ChecksumDetail.h>
Expand Down Expand Up @@ -119,8 +122,10 @@ TEST(Checksum, crc32cHardware) {
if (folly::detail::crc32c_hw_supported()) {
testCRC32C(folly::detail::crc32c_hw);
} else {
#if FOLLY_X64
LOG(WARNING) << "skipping hardware-accelerated CRC-32C tests"
<< " (not supported on this CPU)";
#endif
}
}

Expand All @@ -132,26 +137,32 @@ TEST(Checksum, crc32cHardwareEq) {
EXPECT_EQ(sw, hw);
}
} else {
#if FOLLY_X64
LOG(WARNING) << "skipping hardware-accelerated CRC-32C tests"
<< " (not supported on this CPU)";
#endif
}
}

// Runs the shared continuation checks against crc32c_hw when the CPU
// supports it; otherwise the test body is a no-op.
TEST(Checksum, crc32cContinuationHardware) {
  if (!folly::detail::crc32c_hw_supported()) {
#if FOLLY_X64
    // The skip is only reported on x86-64 builds.
    LOG(WARNING) << "skipping hardware-accelerated CRC-32C tests"
                 << " (not supported on this CPU)";
#endif
    return;
  }

  testCRC32CContinuation(folly::detail::crc32c_hw);
}

// Runs the shared CRC-32C checks against sse_crc32c_v8s3x3 when SSE4.2
// support is reported; otherwise the test body is a no-op.
TEST(Checksum, crc32cHardwareSse42) {
  if (!folly::detail::crc32c_hw_supported_sse42()) {
#if FOLLY_X64
    // The skip is only reported on x86-64 builds.
    LOG(WARNING) << "skipping SSE4.2 hardware-accelerated CRC-32C tests"
                 << " (not supported on this CPU)";
#endif
    return;
  }

  testCRC32C(folly::detail::sse_crc32c_v8s3x3);
}

Expand All @@ -163,26 +174,32 @@ TEST(Checksum, crc32cHardwareEqSse42) {
ASSERT_EQ(sw, hw);
}
} else {
#if FOLLY_X64
LOG(WARNING) << "skipping SSE4.2 hardware-accelerated CRC-32C tests"
<< " (not supported on this CPU)";
#endif
}
}

// Runs the shared continuation checks against sse_crc32c_v8s3x3 when SSE4.2
// support is reported; otherwise the test body is a no-op.
TEST(Checksum, crc32cContinuationHardwareSse42) {
  if (!folly::detail::crc32c_hw_supported_sse42()) {
#if FOLLY_X64
    // The skip is only reported on x86-64 builds.
    LOG(WARNING) << "skipping SSE4.2 hardware-accelerated CRC-32C tests"
                 << " (not supported on this CPU)";
#endif
    return;
  }

  testCRC32CContinuation(folly::detail::sse_crc32c_v8s3x3);
}

// Runs the shared CRC-32C checks against avx512_crc32c_v8s3x4 when AVX512
// support is reported; otherwise the test body is a no-op.
TEST(Checksum, crc32cHardwareAvx512) {
  if (!folly::detail::crc32c_hw_supported_avx512()) {
#if FOLLY_X64
    // The skip is only reported on x86-64 builds.
    LOG(WARNING) << "skipping AVX512 hardware-accelerated CRC-32C tests"
                 << " (not supported on this CPU)";
#endif
    return;
  }

  testCRC32C(folly::detail::avx512_crc32c_v8s3x4);
}

Expand All @@ -194,17 +211,95 @@ TEST(Checksum, crc32cHardwareEqAvx512) {
ASSERT_EQ(sw, hw);
}
} else {
#if FOLLY_X64
LOG(WARNING) << "skipping AVX512 hardware-accelerated CRC-32C tests"
<< " (not supported on this CPU)";
#endif
}
}

// Runs the shared continuation checks against avx512_crc32c_v8s3x4 when
// AVX512 support is reported; otherwise the test body is a no-op.
TEST(Checksum, crc32cContinuationHardwareAvx512) {
  if (!folly::detail::crc32c_hw_supported_avx512()) {
#if FOLLY_X64
    // The skip is only reported on x86-64 builds.
    LOG(WARNING) << "skipping AVX512 hardware-accelerated CRC-32C tests"
                 << " (not supported on this CPU)";
#endif
    return;
  }

  testCRC32CContinuation(folly::detail::avx512_crc32c_v8s3x4);
}

// Runs the shared CRC-32C checks against neon_crc32c_v3s4x2e_v2 when NEON
// support is reported; otherwise the test body is a no-op.
TEST(Checksum, crc32cHardwareNeon) {
  if (!folly::detail::crc32c_hw_supported_neon()) {
#if FOLLY_AARCH64
    // The skip is only reported on AArch64 builds.
    LOG(WARNING) << "skipping NEON hardware-accelerated CRC-32C tests"
                 << " (not supported on this CPU)";
#endif
    return;
  }

  testCRC32C(folly::detail::neon_crc32c_v3s4x2e_v2);
}

// Cross-checks neon_crc32c_v3s4x2e_v2 against the software implementation
// for every length in [0, 1000) over `buffer`, starting from checksum 0.
// The first mismatch aborts the test. No-op when NEON is not reported.
TEST(Checksum, crc32cHardwareEqNeon) {
  if (!folly::detail::crc32c_hw_supported_neon()) {
#if FOLLY_AARCH64
    // The skip is only reported on AArch64 builds.
    LOG(WARNING) << "skipping NEON hardware-accelerated CRC-32C tests"
                 << " (not supported on this CPU)";
#endif
    return;
  }

  constexpr size_t kLengthLimit = 1000;
  for (size_t len = 0; len < kLengthLimit; ++len) {
    const auto sw = folly::detail::crc32c_sw(buffer, len, 0);
    const auto hw = folly::detail::neon_crc32c_v3s4x2e_v2(buffer, len, 0);
    ASSERT_EQ(sw, hw);
  }
}

// Runs the shared continuation checks against neon_crc32c_v3s4x2e_v2 when
// NEON support is reported; otherwise the test body is a no-op.
TEST(Checksum, crc32cContinuationHardwareNeon) {
  if (!folly::detail::crc32c_hw_supported_neon()) {
#if FOLLY_AARCH64
    // The skip is only reported on AArch64 builds.
    LOG(WARNING) << "skipping NEON hardware-accelerated CRC-32C tests"
                 << " (not supported on this CPU)";
#endif
    return;
  }

  testCRC32CContinuation(folly::detail::neon_crc32c_v3s4x2e_v2);
}

// Runs the shared CRC-32C checks against neon_eor3_crc32c_v8s2x4_s3 when
// NEON+EOR3+SHA3 support is reported; otherwise the test body is a no-op.
TEST(Checksum, crc32cHardwareNeonEor3Sha3) {
  if (!folly::detail::crc32c_hw_supported_neon_eor3_sha3()) {
#if FOLLY_AARCH64
    // The skip is only reported on AArch64 builds.
    LOG(WARNING) << "skipping NEON+EOR3+SHA3 hardware-accelerated CRC-32C tests"
                 << " (not supported on this CPU)";
#endif
    return;
  }

  testCRC32C(folly::detail::neon_eor3_crc32c_v8s2x4_s3);
}

// Cross-checks neon_eor3_crc32c_v8s2x4_s3 against the software
// implementation for every length in [0, 1000) over `buffer`, starting from
// checksum 0. The first mismatch aborts the test. No-op when NEON+EOR3+SHA3
// is not reported.
TEST(Checksum, crc32cHardwareEqNeonEor3Sha3) {
  if (!folly::detail::crc32c_hw_supported_neon_eor3_sha3()) {
#if FOLLY_AARCH64
    // The skip is only reported on AArch64 builds.
    LOG(WARNING) << "skipping NEON+EOR3+SHA3 hardware-accelerated CRC-32C tests"
                 << " (not supported on this CPU)";
#endif
    return;
  }

  constexpr size_t kLengthLimit = 1000;
  for (size_t len = 0; len < kLengthLimit; ++len) {
    const auto sw = folly::detail::crc32c_sw(buffer, len, 0);
    const auto hw = folly::detail::neon_eor3_crc32c_v8s2x4_s3(buffer, len, 0);
    ASSERT_EQ(sw, hw);
  }
}

// Runs the shared continuation checks against neon_eor3_crc32c_v8s2x4_s3
// when NEON+EOR3+SHA3 support is reported; otherwise the test body is a
// no-op.
TEST(Checksum, crc32cContinuationHardwareNeonEor3Sha3) {
  if (!folly::detail::crc32c_hw_supported_neon_eor3_sha3()) {
#if FOLLY_AARCH64
    // The skip is only reported on AArch64 builds.
    LOG(WARNING) << "skipping NEON+EOR3+SHA3 hardware-accelerated CRC-32C tests"
                 << " (not supported on this CPU)";
#endif
    return;
  }

  testCRC32CContinuation(folly::detail::neon_eor3_crc32c_v8s2x4_s3);
}

Expand All @@ -230,6 +325,15 @@ TEST(Checksum, crc32clargeBuffers) {
auto crcAvx = folly::detail::avx512_crc32c_v8s3x4(bufp, kLargeBufSz, ~0);
ASSERT_EQ(kCrc, crcAvx);
}
if (folly::detail::crc32c_hw_supported_neon()) {
auto crcHw = folly::detail::neon_crc32c_v3s4x2e_v2(bufp, kLargeBufSz, ~0);
ASSERT_EQ(kCrc, crcHw);
}
if (folly::detail::crc32c_hw_supported_neon_eor3_sha3()) {
auto crcHw =
folly::detail::neon_eor3_crc32c_v8s2x4_s3(bufp, kLargeBufSz, ~0);
ASSERT_EQ(kCrc, crcHw);
}
}
#endif

Expand Down

0 comments on commit 4ab180a

Please sign in to comment.