From 71d883510787d65fae816ac711551c8957838391 Mon Sep 17 00:00:00 2001 From: "hengjiang.ly" Date: Tue, 6 Aug 2024 14:35:18 +0800 Subject: [PATCH 1/3] add simd strstr --- velox/common/base/SimdUtil.cpp | 137 +++++++++++++++++++++++ velox/common/base/SimdUtil.h | 2 + velox/common/base/tests/SimdUtilTest.cpp | 21 ++++ 3 files changed, 160 insertions(+) diff --git a/velox/common/base/SimdUtil.cpp b/velox/common/base/SimdUtil.cpp index 03576ac31ec4..3f7d0de91d5a 100644 --- a/velox/common/base/SimdUtil.cpp +++ b/velox/common/base/SimdUtil.cpp @@ -112,4 +112,141 @@ bool initializeSimdUtil() { static bool FB_ANONYMOUS_VARIABLE(g_simdConstants) = initializeSimdUtil(); +namespace detail { + +#if XSIMD_WITH_SSE4_2 +using CharVector = xsimd::batch; +#elif XSIMD_WITH_NEON +using CharVector = xsimd::batch; +#endif + +const int kPageSize = sysconf(_SC_PAGESIZE); +FOLLY_ALWAYS_INLINE bool pageSafe(const void* const ptr) { + return ((kPageSize - 1) & reinterpret_cast(ptr)) <= + kPageSize - CharVector::size; +} + +template +size_t FOLLY_ALWAYS_INLINE smidStrstrMemcmp( + const char* s, + size_t n, + const char* needle, + size_t needleSize) { + static_assert(compiledNeedleSize >= 2); + VELOX_CHECK_GT(needleSize, 1); + VELOX_CHECK_GT(n, 0); + auto first = CharVector::broadcast(needle[0]); + auto last = CharVector::broadcast(needle[needleSize - 1]); + size_t i = 0; + // Fast path for page-safe data. + // It`s safe to over-read CharVector if all-data are in same page. + // see: https://mudongliang.github.io/x86/html/file_module_x86_id_208.html + // While executing in 16-bit addressing mode, a linear address for a 128-bit + // data access that overlaps the end of a 16-bit segment is not allowed and is + // defined as reserved behavior. A specific processor implementation may or + // may not generate a general-protection exception (#GP) in this situation, + // and the address that spans the end of the segment may or may not wrap + // around to the beginning of the segment. 
+ for (; i <= n - needleSize && pageSafe(s + i + needleSize - 1) && + pageSafe(s + i); + i += CharVector::size) { + auto blockFirst = CharVector::load_unaligned(s + i); + auto blockLast = CharVector::load_unaligned(s + i + needleSize - 1); + + const auto eqFirst = (first == blockFirst); + const auto eqLast = (last == blockLast); + + auto mask = toBitMask(eqFirst && eqLast); + + while (mask != 0) { + const auto bitpos = __builtin_ctz(mask); + if constexpr (compiled) { + if constexpr (compiledNeedleSize == 2) { + return i + bitpos; + } + if (memcmp(s + i + bitpos + 1, needle + 1, compiledNeedleSize - 2) == + 0) { + return i + bitpos; + } + } else { + if (memcmp(s + i + bitpos + 1, needle + 1, needleSize - 2) == 0) { + return i + bitpos; + } + } + mask = mask & (mask - 1); + } + } + // Fallback path for generic path. + for (; i <= n - needleSize; ++i) { + if constexpr (compiled) { + if (memcmp(s + i, needle, compiledNeedleSize) == 0) { + return i; + } + } else { + if (memcmp(s + i, needle, needleSize) == 0) { + return i; + } + } + } + + return std::string::npos; +}; + +} // namespace detail + +/// A faster implementation for c_strstr(), about 2x faster than string_view`s +/// find(), proved by TpchLikeBenchmark. Use xsmid-batch to compare first&&last +/// char first, use fixed-memcmp to compare left chars. Inline in header file +/// will be a little faster. +size_t simdStrstr(const char* s, size_t n, const char* needle, size_t k) { + size_t result = std::string::npos; + + if (n < k) { + return result; + } + + switch (k) { + case 0: + return 0; + + case 1: { + const char* res = strchr(s, needle[0]); + + return (res != nullptr) ? 
res - s : std::string::npos; + } +#define FIXED_MEM_STRSTR(size) \ + case size: \ + result = detail::smidStrstrMemcmp(s, n, needle, size); \ + break; + FIXED_MEM_STRSTR(2) + FIXED_MEM_STRSTR(3) + FIXED_MEM_STRSTR(4) + FIXED_MEM_STRSTR(5) + FIXED_MEM_STRSTR(6) + FIXED_MEM_STRSTR(7) + FIXED_MEM_STRSTR(8) + FIXED_MEM_STRSTR(9) + FIXED_MEM_STRSTR(10) + FIXED_MEM_STRSTR(11) + FIXED_MEM_STRSTR(12) + FIXED_MEM_STRSTR(13) + FIXED_MEM_STRSTR(14) + FIXED_MEM_STRSTR(15) + FIXED_MEM_STRSTR(16) + FIXED_MEM_STRSTR(17) + FIXED_MEM_STRSTR(18) + default: + result = detail::smidStrstrMemcmp(s, n, needle, k); + break; + } +#undef FIXED_MEM_STRSTR + // load_unaligned is used for better performance, so result maybe bigger than + // n-k. + if (result <= n - k) { + return result; + } else { + return std::string::npos; + } +} + } // namespace facebook::velox::simd diff --git a/velox/common/base/SimdUtil.h b/velox/common/base/SimdUtil.h index 9a6ad0c37425..ba63d3c1d237 100644 --- a/velox/common/base/SimdUtil.h +++ b/velox/common/base/SimdUtil.h @@ -497,6 +497,8 @@ xsimd::batch reinterpretBatch(xsimd::batch, const A& = {}); template inline bool memEqualUnsafe(const void* x, const void* y, int32_t size); +size_t simdStrstr(const char* s, size_t n, const char* needle, size_t k); + } // namespace facebook::velox::simd #include "velox/common/base/SimdUtil-inl.h" diff --git a/velox/common/base/tests/SimdUtilTest.cpp b/velox/common/base/tests/SimdUtilTest.cpp index ba389780b1cb..9dbebc060fb3 100644 --- a/velox/common/base/tests/SimdUtilTest.cpp +++ b/velox/common/base/tests/SimdUtilTest.cpp @@ -491,4 +491,25 @@ TEST_F(SimdUtilTest, memcpyTime) { LOG(INFO) << "simd=" << simd << " sys=" << sys; } +TEST_F(SimdUtilTest, testSimdStrStr) { + // 48 chars. 
+ std::string s1 = "aabbccddeeffgghhiijjkkllmmnnooppqqrrssttuuvvwwxxyyzz"; + std::string s2 = "aabbccddeeffgghhiijjkkllmmnnooppqqrrssttuuvvwwxxyyzz"; + std::string s3 = "xxx"; + auto test = [](char* text, size_t size, char* needle, size_t k) { + ASSERT_EQ( + simd::simdStrstr(text, size, needle, k), + std::string_view(text, size).find(std::string_view(needle, k))); + }; + // Match cases : substrings in s2 should be a substring in s1. + for (int i = 0; i < 20; i++) { + for (int k = 0; k < 28; k++) { + char* data = s2.data() + i; + test(s1.data(), s1.size(), data, k); + } + } + // Not match case : "xxx" not in s1. + test(s1.data(), s1.size(), s3.data(), s3.size()); +} + } // namespace From a61a4540e2182416aa1bd617e634155698f5af5b Mon Sep 17 00:00:00 2001 From: "hengjiang.ly" Date: Wed, 28 Aug 2024 16:56:41 +0800 Subject: [PATCH 2/3] add ut / benchmark --- velox/common/base/SimdUtil-inl.h | 168 +++++++++++ velox/common/base/SimdUtil.cpp | 138 +-------- velox/common/base/SimdUtil.h | 3 +- velox/common/base/benchmarks/CMakeLists.txt | 6 + .../base/benchmarks/StringSearchBenchmark.cpp | 269 ++++++++++++++++++ velox/common/base/tests/SimdUtilTest.cpp | 155 +++++++++- 6 files changed, 595 insertions(+), 144 deletions(-) create mode 100644 velox/common/base/benchmarks/StringSearchBenchmark.cpp diff --git a/velox/common/base/SimdUtil-inl.h b/velox/common/base/SimdUtil-inl.h index 87ff71f8b181..dddd1de3836b 100644 --- a/velox/common/base/SimdUtil-inl.h +++ b/velox/common/base/SimdUtil-inl.h @@ -1436,4 +1436,172 @@ inline bool memEqualUnsafe(const void* x, const void* y, int32_t size) { return true; } +namespace detail { + +/// NOTE: SSE_4_2`s the performance of simdStrStr is a little slower than +/// std::find in first-char-unmatch(read only one char per match.) Use AVX2 the +/// performance will be better than std::find in that case. 
+#if XSIMD_WITH_AVX2 +using CharVector = xsimd::batch; +#define VELOX_SIMD_STRSTR 1 +#elif XSIMD_WITH_NEON +using CharVector = xsimd::batch; +#define VELOX_SIMD_STRSTR 1 +#else +#define VELOX_SIMD_STRSTR 0 +#endif + +extern const int kPageSize; + +FOLLY_ALWAYS_INLINE bool pageSafe(const void* const ptr) { + return ((kPageSize - 1) & reinterpret_cast(ptr)) <= + kPageSize - CharVector::size; +} + +template +size_t FOLLY_ALWAYS_INLINE smidStrstrMemcmp( + const char* s, + size_t n, + const char* needle, + size_t needleSize) { + static_assert(kNeedleSize >= 2); + VELOX_DCHECK_GT(needleSize, 1); + VELOX_DCHECK_GT(n, 0); + auto first = CharVector::broadcast(needle[0]); + auto last = CharVector::broadcast(needle[needleSize - 1]); + size_t i = 0; + // Fast path for page-safe data. + // It`s safe to over-read CharVector if all-data are in same page. + // see: https://mudongliang.github.io/x86/html/file_module_x86_id_208.html + // While executing in 16-bit addressing mode, a linear address for a 128-bit + // data access that overlaps the end of a 16-bit segment is not allowed and is + // defined as reserved behavior. A specific processor implementation may or + // may not generate a general-protection exception (#GP) in this situation, + // and the address that spans the end of the segment may or may not wrap + // around to the beginning of the segment. + for (; i <= n - needleSize && pageSafe(s + i) && + pageSafe(s + i + needleSize - 1); + i += CharVector::size) { + auto blockFirst = CharVector::load_unaligned(s + i); + const auto eqFirst = (first == blockFirst); + /// std:find handle the fast-path for first-char-unmatch, so we also need + /// to handle eqFirst. 
+ if (eqFirst.mask() == 0) { + continue; + } + auto blockLast = CharVector::load_unaligned(s + i + needleSize - 1); + const auto eqLast = (last == blockLast); + auto mask = (eqFirst && eqLast).mask(); + while (mask != 0) { + const auto bitpos = __builtin_ctz(mask); + if constexpr (compiled) { + if constexpr (kNeedleSize == 2) { + return i + bitpos; + } + if (memcmp(s + i + bitpos + 1, needle + 1, kNeedleSize - 2) == 0) { + return i + bitpos; + } + } else { + if (memcmp(s + i + bitpos + 1, needle + 1, needleSize - 2) == 0) { + return i + bitpos; + } + } + mask = mask & (mask - 1); + } + } + // Scalar fallback for whatever the SIMD loop did not cover. + for (; i <= n - needleSize; ++i) { + if constexpr (compiled) { + if (memcmp(s + i, needle, kNeedleSize) == 0) { + return i; + } + } else { + if (memcmp(s + i, needle, needleSize) == 0) { + return i; + } + } + } + + return std::string::npos; +}; + +} // namespace detail + +/// A faster alternative to std::string_view::find(), about 2x faster in most +/// cases per StringSearchBenchmark.cpp. Only the first n bytes of s are searched +/// (no NUL terminator needed): xsimd batches compare the first and last needle +/// chars, memcmp checks the rest. Inlining in the header is about 30% faster. +FOLLY_ALWAYS_INLINE size_t +simdStrstr(const char* s, size_t n, const char* needle, size_t k) { +#if VELOX_SIMD_STRSTR + size_t result = std::string::npos; + + if (n < k) { + return result; + } + + switch (k) { + case 0: + return 0; + + case 1: { + const char* res = static_cast<const char*>(memchr(s, needle[0], n)); + + return (res != nullptr) ? 
res - s : std::string::npos; + } +#define VELOX_SIMD_STRSTR_CASE(size) \ + case size: \ + result = detail::smidStrstrMemcmp(s, n, needle, size); \ + break; + VELOX_SIMD_STRSTR_CASE(2) + VELOX_SIMD_STRSTR_CASE(3) + VELOX_SIMD_STRSTR_CASE(4) + VELOX_SIMD_STRSTR_CASE(5) + VELOX_SIMD_STRSTR_CASE(6) + VELOX_SIMD_STRSTR_CASE(7) + VELOX_SIMD_STRSTR_CASE(8) + VELOX_SIMD_STRSTR_CASE(9) + VELOX_SIMD_STRSTR_CASE(10) + VELOX_SIMD_STRSTR_CASE(11) + VELOX_SIMD_STRSTR_CASE(12) + VELOX_SIMD_STRSTR_CASE(13) + VELOX_SIMD_STRSTR_CASE(14) + VELOX_SIMD_STRSTR_CASE(15) + VELOX_SIMD_STRSTR_CASE(16) + VELOX_SIMD_STRSTR_CASE(17) + VELOX_SIMD_STRSTR_CASE(18) +#if XSIMD_WITH_AVX2 + VELOX_SIMD_STRSTR_CASE(19) + VELOX_SIMD_STRSTR_CASE(20) + VELOX_SIMD_STRSTR_CASE(21) + VELOX_SIMD_STRSTR_CASE(22) + VELOX_SIMD_STRSTR_CASE(23) + VELOX_SIMD_STRSTR_CASE(24) + VELOX_SIMD_STRSTR_CASE(25) + VELOX_SIMD_STRSTR_CASE(26) + VELOX_SIMD_STRSTR_CASE(27) + VELOX_SIMD_STRSTR_CASE(28) + VELOX_SIMD_STRSTR_CASE(29) + VELOX_SIMD_STRSTR_CASE(30) + VELOX_SIMD_STRSTR_CASE(31) + VELOX_SIMD_STRSTR_CASE(32) + VELOX_SIMD_STRSTR_CASE(33) + VELOX_SIMD_STRSTR_CASE(34) +#endif + default: + result = detail::smidStrstrMemcmp(s, n, needle, k); + break; + } +#undef VELOX_SIMD_STRSTR_CASE + // load_unaligned is used for better performance, so result maybe bigger than + // n-k. 
+ if (result <= n - k) { + return result; + } else { + return std::string::npos; + } +#endif + return std::string_view(s, n).find(std::string_view(needle, k)); +} + } // namespace facebook::velox::simd diff --git a/velox/common/base/SimdUtil.cpp b/velox/common/base/SimdUtil.cpp index 3f7d0de91d5a..18a70d118ebd 100644 --- a/velox/common/base/SimdUtil.cpp +++ b/velox/common/base/SimdUtil.cpp @@ -62,6 +62,7 @@ const LeadingMask leadingMask64; const FromBitMask fromBitMask32; const FromBitMask fromBitMask64; +const int kPageSize = sysconf(_SC_PAGESIZE); } // namespace detail namespace { @@ -112,141 +113,4 @@ bool initializeSimdUtil() { static bool FB_ANONYMOUS_VARIABLE(g_simdConstants) = initializeSimdUtil(); -namespace detail { - -#if XSIMD_WITH_SSE4_2 -using CharVector = xsimd::batch; -#elif XSIMD_WITH_NEON -using CharVector = xsimd::batch; -#endif - -const int kPageSize = sysconf(_SC_PAGESIZE); -FOLLY_ALWAYS_INLINE bool pageSafe(const void* const ptr) { - return ((kPageSize - 1) & reinterpret_cast(ptr)) <= - kPageSize - CharVector::size; -} - -template -size_t FOLLY_ALWAYS_INLINE smidStrstrMemcmp( - const char* s, - size_t n, - const char* needle, - size_t needleSize) { - static_assert(compiledNeedleSize >= 2); - VELOX_CHECK_GT(needleSize, 1); - VELOX_CHECK_GT(n, 0); - auto first = CharVector::broadcast(needle[0]); - auto last = CharVector::broadcast(needle[needleSize - 1]); - size_t i = 0; - // Fast path for page-safe data. - // It`s safe to over-read CharVector if all-data are in same page. - // see: https://mudongliang.github.io/x86/html/file_module_x86_id_208.html - // While executing in 16-bit addressing mode, a linear address for a 128-bit - // data access that overlaps the end of a 16-bit segment is not allowed and is - // defined as reserved behavior. 
A specific processor implementation may or - // may not generate a general-protection exception (#GP) in this situation, - // and the address that spans the end of the segment may or may not wrap - // around to the beginning of the segment. - for (; i <= n - needleSize && pageSafe(s + i + needleSize - 1) && - pageSafe(s + i); - i += CharVector::size) { - auto blockFirst = CharVector::load_unaligned(s + i); - auto blockLast = CharVector::load_unaligned(s + i + needleSize - 1); - - const auto eqFirst = (first == blockFirst); - const auto eqLast = (last == blockLast); - - auto mask = toBitMask(eqFirst && eqLast); - - while (mask != 0) { - const auto bitpos = __builtin_ctz(mask); - if constexpr (compiled) { - if constexpr (compiledNeedleSize == 2) { - return i + bitpos; - } - if (memcmp(s + i + bitpos + 1, needle + 1, compiledNeedleSize - 2) == - 0) { - return i + bitpos; - } - } else { - if (memcmp(s + i + bitpos + 1, needle + 1, needleSize - 2) == 0) { - return i + bitpos; - } - } - mask = mask & (mask - 1); - } - } - // Fallback path for generic path. - for (; i <= n - needleSize; ++i) { - if constexpr (compiled) { - if (memcmp(s + i, needle, compiledNeedleSize) == 0) { - return i; - } - } else { - if (memcmp(s + i, needle, needleSize) == 0) { - return i; - } - } - } - - return std::string::npos; -}; - -} // namespace detail - -/// A faster implementation for c_strstr(), about 2x faster than string_view`s -/// find(), proved by TpchLikeBenchmark. Use xsmid-batch to compare first&&last -/// char first, use fixed-memcmp to compare left chars. Inline in header file -/// will be a little faster. -size_t simdStrstr(const char* s, size_t n, const char* needle, size_t k) { - size_t result = std::string::npos; - - if (n < k) { - return result; - } - - switch (k) { - case 0: - return 0; - - case 1: { - const char* res = strchr(s, needle[0]); - - return (res != nullptr) ? 
res - s : std::string::npos; - } -#define FIXED_MEM_STRSTR(size) \ - case size: \ - result = detail::smidStrstrMemcmp(s, n, needle, size); \ - break; - FIXED_MEM_STRSTR(2) - FIXED_MEM_STRSTR(3) - FIXED_MEM_STRSTR(4) - FIXED_MEM_STRSTR(5) - FIXED_MEM_STRSTR(6) - FIXED_MEM_STRSTR(7) - FIXED_MEM_STRSTR(8) - FIXED_MEM_STRSTR(9) - FIXED_MEM_STRSTR(10) - FIXED_MEM_STRSTR(11) - FIXED_MEM_STRSTR(12) - FIXED_MEM_STRSTR(13) - FIXED_MEM_STRSTR(14) - FIXED_MEM_STRSTR(15) - FIXED_MEM_STRSTR(16) - FIXED_MEM_STRSTR(17) - FIXED_MEM_STRSTR(18) - default: - result = detail::smidStrstrMemcmp(s, n, needle, k); - break; - } -#undef FIXED_MEM_STRSTR - // load_unaligned is used for better performance, so result maybe bigger than - // n-k. - if (result <= n - k) { - return result; - } else { - return std::string::npos; - } -} - } // namespace facebook::velox::simd diff --git a/velox/common/base/SimdUtil.h b/velox/common/base/SimdUtil.h index ba63d3c1d237..5230abe6ff1f 100644 --- a/velox/common/base/SimdUtil.h +++ b/velox/common/base/SimdUtil.h @@ -497,7 +497,8 @@ xsimd::batch reinterpretBatch(xsimd::batch, const A& = {}); template inline bool memEqualUnsafe(const void* x, const void* y, int32_t size); -size_t simdStrstr(const char* s, size_t n, const char* needle, size_t k); +FOLLY_ALWAYS_INLINE size_t +simdStrstr(const char* s, size_t n, const char* needle, size_t k); } // namespace facebook::velox::simd diff --git a/velox/common/base/benchmarks/CMakeLists.txt b/velox/common/base/benchmarks/CMakeLists.txt index 065db83c3672..a8e956f8542a 100644 --- a/velox/common/base/benchmarks/CMakeLists.txt +++ b/velox/common/base/benchmarks/CMakeLists.txt @@ -17,3 +17,9 @@ target_link_libraries( velox_common_base_benchmarks PUBLIC ${FOLLY_BENCHMARK} PRIVATE velox_common_base Folly::folly) + +add_executable(velox_common_stringsearch_benchmarks StringSearchBenchmark.cpp) +target_link_libraries( + velox_common_stringsearch_benchmarks + PUBLIC ${FOLLY_BENCHMARK} + PRIVATE velox_common_base Folly::folly) 
diff --git a/velox/common/base/benchmarks/StringSearchBenchmark.cpp b/velox/common/base/benchmarks/StringSearchBenchmark.cpp new file mode 100644 index 000000000000..010d330d1585 --- /dev/null +++ b/velox/common/base/benchmarks/StringSearchBenchmark.cpp @@ -0,0 +1,269 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include "velox/common/base/SimdUtil.h" + +/// Copy Part code from +/// https://github.com/facebook/folly/blob/ce5edfb9b08ead9e78cb46879e7b9499861f7cd2/folly/test/FBStringTestBenchmarks.cpp.h +using namespace std; +using namespace folly; +/// Fixed seed for stable benchmark result, simdStrStr is always faster than +/// std::find with different seeds. 
+static const int seed = 123456; +static std::mt19937 rng(seed); + +namespace facebook::velox { +template +Integral2 random(Integral1 low, Integral2 up) { + std::uniform_int_distribution<> range(low, up); + return range(rng); +} + +enum ALG { SIMD, STD, KMP, BOYER_MOORE }; + +class KmpSearcher { + public: + KmpSearcher(const std::string& needle) : needle_(std::move(needle)) { + next_ = new int[1 + needle.size()]; + initNextArr(needle); + } + + ~KmpSearcher() { + delete[] next_; + } + + size_t search(const char* heyStack, size_t heyStackSize) const { + int i = 0, j = 0; + while ((i < (int32_t)heyStackSize) && (j < (int32_t)needle_.size())) { + if (j == -1 || heyStack[i] == needle_[j]) { + i++; + j++; + } else { + j = next_[j]; + } + } + if (j >= needle_.size()) { + return (i - needle_.size()); + }; + return (std::string::npos); + } + + private: + void initNextArr(const string& needle) { + int j = 0, k = -1; + next_[0] = -1; + for (; j < needle.length();) { + if (k == -1 || needle[j] == needle[k]) { + j++; + k++; + next_[j] = k; + } else + k = next_[k]; + } + } + std::string needle_; + int* next_; +}; + +class TestStringSearch { + public: + TestStringSearch(const std::string& heyStack, const std::string& needle) + : heyStack_(std::move(heyStack)), + needle_(std::move(needle)), + searher_(needle_.begin(), needle_.end()), + kmpSearcher_(needle_) {} + + template + void runSearching(size_t iters) const { + if constexpr (alg == SIMD) { + FOR_EACH_RANGE (i, 0, iters) + doNotOptimizeAway(simd::simdStrstr( + heyStack_.data(), + heyStack_.size(), + needle_.data(), + needle_.size())); + } else if constexpr (alg == STD) { + FOR_EACH_RANGE (i, 0, iters) + doNotOptimizeAway( + std::string_view(heyStack_.data(), heyStack_.size()) + .find(std::string_view(needle_.data(), needle_.size()))); + } else if constexpr (alg == BOYER_MOORE) { + FOR_EACH_RANGE (i, 0, iters) + doNotOptimizeAway( + std::search(heyStack_.begin(), heyStack_.end(), searher_)); + } else if constexpr (alg == KMP) { 
+ FOR_EACH_RANGE (i, 0, iters) + doNotOptimizeAway( + kmpSearcher_.search(heyStack_.data(), heyStack_.size())); + } + } + + private: + std::string heyStack_; + std::string needle_; + std::boyer_moore_searcher searher_; + KmpSearcher kmpSearcher_; +}; + +TestStringSearch generateTest(int hayStackSize, int needleSize) { + // Text courtesy (ahem) of + // http://www.psychologytoday.com/blog/career-transitions/200906/ + // the-dreaded-writing-sample + // 1028chars + static const std::string s = + "\ +Even if you've mastered the art of the cover letter and the resume, \ +another part of the job search process can trip up an otherwise \ +qualified candidate: the writing sample.\n\ +\n\ +Strong writing and communication skills are highly sought after by \ +most employers. Whether crafting short emails or lengthy annual \ +reports, many workers use their writing skills every day. And for an \ +employer seeking proof behind that ubiquitous candidate \ +phrase,\"excellent communication skills\", a required writing sample \ +is invaluable.\n\ +\n\ +Writing samples need the same care and attention given to cover \ +letters and resumes. Candidates with otherwise impeccable credentials \ +are routinely eliminated by a poorly chosen writing sample. Notice I \ +said \"poorly chosen\" not \"poorly written.\" Because that's the rub: \ +a writing sample not only reveals the individual's writing skills, it \ +also offers a peek into what they consider important or relevant for \ +the position. 
If you miss that mark with your writing sample, don't \ +expect to get a call for an interview."; + auto pos = random(0, s.size() - hayStackSize); + auto needlePos = random(2, hayStackSize - needleSize); + std::string haystack = s.substr(pos, hayStackSize); + std::string needle = haystack.substr(needlePos, needleSize); + return TestStringSearch(std::move(haystack), std::move(needle)); +} + +void findSuccessful( + unsigned /*arg*/, + ALG alg, + size_t iters, + const TestStringSearch& testdata) { + switch (alg) { + case KMP: + testdata.runSearching(iters); + break; + case STD: + testdata.runSearching(iters); + break; + case SIMD: + testdata.runSearching(iters); + break; + case BOYER_MOORE: + testdata.runSearching(iters); + break; + } +} + +/// Folly uses random test data for each iteration, but this cannot guarantee +/// that the data for each test of different algorithms is the same, so we use +/// the same random data for each comparison benchmark here. +#define STRING_SEARCH_BENCHMARK(name, start, end, iters) \ + TestStringSearch test##start##end = generateTest(start, end); \ + BENCHMARK_NAMED_PARAM( \ + name, simd_##start##_to_##end, SIMD, iters, test##start##end); \ + BENCHMARK_NAMED_PARAM( \ + name, std_##start##_to_##end, STD, iters, test##start##end); \ + BENCHMARK_NAMED_PARAM( \ + name, \ + std_boyer_moore_##start##_to_##end, \ + BOYER_MOORE, \ + iters, \ + test##start##end); \ + \ + BENCHMARK_NAMED_PARAM( \ + name, kmp_##start##_to_##end, KMP, iters, test##start##end); + +STRING_SEARCH_BENCHMARK(findSuccessful, 50, 5, 52428800) +STRING_SEARCH_BENCHMARK(findSuccessful, 100, 10, 52428800) +STRING_SEARCH_BENCHMARK(findSuccessful, 100, 20, 52428800) +STRING_SEARCH_BENCHMARK(findSuccessful, 1000, 10, 52428800) +STRING_SEARCH_BENCHMARK(findSuccessful, 1000, 100, 5242880) +STRING_SEARCH_BENCHMARK(findSuccessful, 1000, 200, 5242880) + +/// std::find only handle fast-path for prefix-unmatch-char, if there is a +/// prefix-match-char(in practice, it is a high 
probability event that a +/// first char match is successful.), the performance of std::find drops +/// significantly in such a scenario. +TestStringSearch prefixMatch = { + "luffily close dugouts wake about the pinto beans. pending, ironic dependencies", + "b???"}; + +TestStringSearch prefixUnMatch = { + "luffily close dugouts wake about the pinto beans. pending, ironic dependencies", + "????"}; +void findUnsuccessful( + size_t /*arg*/, + bool useStd, + size_t iters, + const TestStringSearch& test) { + if (useStd) { + test.runSearching(iters); + } else { + test.runSearching(iters); + } +} + +BENCHMARK_NAMED_PARAM( + findUnsuccessful, + std_first_char_match, + true, + 52428800, + prefixMatch) +BENCHMARK_NAMED_PARAM( + findUnsuccessful, + opt_first_char_match, + false, + 52428800, + prefixMatch) +BENCHMARK_NAMED_PARAM( + findUnsuccessful, + std_first_char_unmatch, + true, + 52428800, + prefixUnMatch) +BENCHMARK_NAMED_PARAM( + findUnsuccessful, + opt_first_char_unmatch, + false, + 52428800, + prefixUnMatch) +} // namespace facebook::velox + +int main(int argc, char** argv) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + folly::runBenchmarks(); + return 0; +} diff --git a/velox/common/base/tests/SimdUtilTest.cpp b/velox/common/base/tests/SimdUtilTest.cpp index 9dbebc060fb3..2e0d26416e73 100644 --- a/velox/common/base/tests/SimdUtilTest.cpp +++ b/velox/common/base/tests/SimdUtilTest.cpp @@ -109,6 +109,20 @@ class SimdUtilTest : public testing::Test { EXPECT_EQ(reference, target); } + template + Integral2 random(Integral1 low, Integral2 up) { + std::uniform_int_distribution<> range(low, up); + return range(rng_); + } + + void randomString(std::string* toFill, unsigned int maxSize = 1000) { + assert(toFill); + toFill->resize(random(0, maxSize)); + for (int i = 0; i < toFill->size(); i++) { + (*toFill)[i] = random('a', 'z'); + } + } + folly::Random::DefaultGenerator rng_; }; @@ -491,25 +505,154 @@ TEST_F(SimdUtilTest, memcpyTime) { LOG(INFO) << "simd=" << simd << 
" sys=" << sys; } -TEST_F(SimdUtilTest, testSimdStrStr) { - // 48 chars. +/// Copy from std::boyer_moore_searcher proposal: +/// https://github.com/mclow/search-library/blob/master/basic_tests.cpp +/// Basic sanity checking. It makes sure that all the algorithms work. +TEST_F(SimdUtilTest, basicSimdStrStr) { + auto checkOne = [](const std::string& text, const std::string& needle) { + auto size = text.size(); + auto k = needle.size(); + ASSERT_EQ( + simd::simdStrstr(text.data(), size, needle.data(), k), + text.find(needle)); + }; + std::string haystack1("NOW AN FOWE\220ER ANNMAN THE ANPANMANEND"); + std::string needle1("ANPANMAN"); + std::string needle2("MAN THE"); + std::string needle3("WE\220ER"); + // At the beginning + std::string needle4("NOW "); + // At the end + std::string needle5("NEND"); + // Nowhere + std::string needle6("NOT FOUND"); + // Nowhere + std::string needle7("NOT FO\340ND"); + + std::string haystack2("ABC ABCDAB ABCDABCDABDE"); + std::string needle11("ABCDABD"); + + std::string haystack3("abra abracad abracadabra"); + std::string needle12("abracadabra"); + + std::string needle13(""); + std::string haystack4(""); + + checkOne(haystack1, needle1); + checkOne(haystack1, needle2); + checkOne(haystack1, needle3); + checkOne(haystack1, needle4); + checkOne(haystack1, needle5); + checkOne(haystack1, needle6); + checkOne(haystack1, needle7); + + // Cant find long pattern in short corpus + checkOne(needle1, haystack1); + // Find something in itself + checkOne(haystack1, haystack1); + // Find something in itself + checkOne(haystack2, haystack2); + + checkOne(haystack2, needle11); + checkOne(haystack3, needle12); + // Find the empty string + checkOne(haystack1, needle13); + // Can't find in an empty haystack + checkOne(haystack4, needle1); + + // Comment copy from the origin code. + // Mikhail Levin found a problem, and this was the + // test that triggered it. 
+ const std::string mikhailPattern = + "GATACACCTACCTTCACCAGTTACTCTATGCACTAGGTGCGCCAGGCCCATGCACAAGGGCTTGAGTGGATGGGAAGGA" + "TGTGCCCTAGTGATGGCAGCATAAGCTACGCAGAGAAGTTCCAGGGCAGAGTCACCATGACCAGGGACACATCCACGAG" + "CACAGCCTACATGGAGCTGAGCAGCCTGAGATCTGAAGACACGGCCATGTATTACTGTGGGAGAGATGTCTGGAGTGGT" + "TATTATTGCCCCGGTAATATTACTACTACTACTACTACATGGACGTCTGGGGCAAAGGGACCACG"; + const std::string mikhailCorpus = std::string(8, 'a') + mikhailPattern; + + checkOne(mikhailCorpus, mikhailPattern); +} + +TEST_F(SimdUtilTest, variableNeedleSize) { std::string s1 = "aabbccddeeffgghhiijjkkllmmnnooppqqrrssttuuvvwwxxyyzz"; std::string s2 = "aabbccddeeffgghhiijjkkllmmnnooppqqrrssttuuvvwwxxyyzz"; - std::string s3 = "xxx"; + std::string s3 = "01234567890123456789"; auto test = [](char* text, size_t size, char* needle, size_t k) { + if (simd::simdStrstr(text, size, needle, k) != + std::string_view(text, size).find(std::string_view(needle, k))) { + LOG(ERROR) << "text: " << std::string(text, size) + << " needle :" << std::string(needle, k); + } ASSERT_EQ( simd::simdStrstr(text, size, needle, k), std::string_view(text, size).find(std::string_view(needle, k))); }; - // Match cases : substrings in s2 should be a substring in s1. + // Match cases (prefix/middle/suffix): substrings in s2 should be a substring + // in s1. Choose different needle-size left from s2, testing prefix-match in + // s1. + for (int k = 0; k < s2.size(); k++) { + test(s1.data(), s1.size(), s2.data(), k); + } + // Choose different needle-size left from s2, testing middle-match in s1. for (int i = 0; i < 20; i++) { for (int k = 0; k < 28; k++) { char* data = s2.data() + i; test(s1.data(), s1.size(), data, k); } } - // Not match case : "xxx" not in s1. - test(s1.data(), s1.size(), s3.data(), s3.size()); + // Choose different needle-size right from s2, testing suffix-match in s1. 
+ for (int k = 0; k < s2.size(); k++) { + char* data = s2.data() + s2.size() - k; + test(s1.data(), s1.size(), data, k); + } + // Not match case : substring in s3 not in s1. + for (auto k = 0; k < s3.size(); k++) { + test(s1.data(), s1.size(), s3.data(), k); + } + + // FirstBlock match + for (auto k = 0; k < s3.size(); k++) { + std::string somePrefix = "xxxxxx"; + std::string matchString = "a" + std::string(k, 'x'); + std::string someSuffix = "yyyyyyyy"; + std::string text = somePrefix + matchString + someSuffix; + auto s = "a" + std::string(k, '9'); + test(text.data(), text.size(), s.data(), s.size()); + } + // FirstBlock and LastBlock match + for (auto k = 0; k < s3.size(); k++) { + std::string somePrefix = "xxxxxx"; + std::string matchString = "a" + std::string(k, 'x') + "b"; + std::string someSuffix = "yyyyyyyy"; + std::string text = somePrefix + matchString + someSuffix; + auto s = "a" + std::string(k, '9') + "b"; + test(text.data(), text.size(), s.data(), s.size()); + } +} + +/// Copy from +/// https://github.com/facebook/folly/blob/ce5edfb9b08ead9e78cb46879e7b9499861f7cd2/folly/test/FBStringTest.cpp#L1277 +/// clause11_21_4_7_2_a1 +TEST_F(SimdUtilTest, randomStringStrStr) { + std::string test; + const int kTestLoop = 1; + auto checkOne = + [](const std::string& text, const std::string& needle, size_t pos) { + auto size = text.length() - pos; + auto textPtr = text.data() + pos; + auto k = needle.size(); + ASSERT_EQ( + simd::simdStrstr(textPtr, size, needle.data(), k), + text.substr(pos).find(needle)); + }; + for (int i = 0; i < kTestLoop; i++) { + // clause11_21_4_7_2_a1 + randomString(&test); + auto from = random(0, test.size()); + auto length = random(0, test.size() - from); + std::string str = test.substr(from, length); + checkOne(test, str, random(0, test.size())); + } } } // namespace From cb6e208d7e327e9dc8587a676a06e207e277e2dc Mon Sep 17 00:00:00 2001 From: "hengjiang.ly" Date: Thu, 26 Sep 2024 10:52:51 +0800 Subject: [PATCH 3/3] check the last page 
--- velox/common/base/SimdUtil-inl.h | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/velox/common/base/SimdUtil-inl.h b/velox/common/base/SimdUtil-inl.h index dddd1de3836b..04d0628338c9 100644 --- a/velox/common/base/SimdUtil-inl.h +++ b/velox/common/base/SimdUtil-inl.h @@ -1470,18 +1470,16 @@ size_t FOLLY_ALWAYS_INLINE smidStrstrMemcmp( auto first = CharVector::broadcast(needle[0]); auto last = CharVector::broadcast(needle[needleSize - 1]); size_t i = 0; - // Fast path for page-safe data. - // It`s safe to over-read CharVector if all-data are in same page. - // see: https://mudongliang.github.io/x86/html/file_module_x86_id_208.html - // While executing in 16-bit addressing mode, a linear address for a 128-bit - // data access that overlaps the end of a 16-bit segment is not allowed and is - // defined as reserved behavior. A specific processor implementation may or - // may not generate a general-protection exception (#GP) in this situation, - // and the address that spans the end of the segment may or may not wrap - // around to the beginning of the segment. - for (; i <= n - needleSize && pageSafe(s + i) && - pageSafe(s + i + needleSize - 1); - i += CharVector::size) { + + for (; i <= n - needleSize; i += CharVector::size) { + // Suppose the input spans virtual pages VP1..VP3 and VP4 is not mapped. A + // CharVector load at lastPos that runs past the input must not touch an + // unmapped page, so stop the SIMD loop unless that over-read is page-safe. 
+ const auto lastPos = i + needleSize - 1; + + if (lastPos + CharVector::size > n && !pageSafe(s + lastPos)) { + break; + } auto blockFirst = CharVector::load_unaligned(s + i); const auto eqFirst = (first == blockFirst); /// std:find handle the fast-path for first-char-unmatch, so we also need @@ -1489,7 +1487,7 @@ size_t FOLLY_ALWAYS_INLINE smidStrstrMemcmp( if (eqFirst.mask() == 0) { continue; } - auto blockLast = CharVector::load_unaligned(s + i + needleSize - 1); + auto blockLast = CharVector::load_unaligned(s + lastPos); const auto eqLast = (last == blockLast); auto mask = (eqFirst && eqLast).mask(); while (mask != 0) {