diff --git a/velox/common/base/SimdUtil-inl.h b/velox/common/base/SimdUtil-inl.h
index 87ff71f8b181..04d0628338c9 100644
--- a/velox/common/base/SimdUtil-inl.h
+++ b/velox/common/base/SimdUtil-inl.h
@@ -1436,4 +1436,170 @@ inline bool memEqualUnsafe(const void* x, const void* y, int32_t size) {
   return true;
 }
 
+namespace detail {
+
+/// NOTE: With SSE4.2 only, simdStrstr is a little slower than std::find when
+/// the first character does not match (std::find then reads only one char per
+/// candidate). With AVX2 it is faster than std::find in that case too, so the
+/// SIMD path is compiled only for AVX2 and NEON.
+// NOTE(review): in this copy of the patch the angle-bracket argument lists of
+// 'xsimd::batch', 'reinterpret_cast', the 'template' header and the
+// 'smidStrstrMemcmp' calls below are missing; restore them from the upstream
+// sources before applying.
+#if XSIMD_WITH_AVX2
+using CharVector = xsimd::batch;
+#define VELOX_SIMD_STRSTR 1
+#elif XSIMD_WITH_NEON
+using CharVector = xsimd::batch;
+#define VELOX_SIMD_STRSTR 1
+#else
+#define VELOX_SIMD_STRSTR 0
+#endif
+
+extern const int kPageSize;
+
+/// Returns true if a CharVector-sized load starting at 'ptr' stays inside the
+/// page that contains 'ptr', i.e. the offset of 'ptr' within its page is at
+/// most kPageSize - CharVector::size. The mask arithmetic assumes kPageSize is
+/// a power of two.
+FOLLY_ALWAYS_INLINE bool pageSafe(const void* const ptr) {
+  return ((kPageSize - 1) & reinterpret_cast(ptr)) <=
+      kPageSize - CharVector::size;
+}
+
+/// Searches s[0, n) for 'needle' and returns the offset of a candidate match,
+/// or std::string::npos. The first and last needle characters are compared
+/// against CharVector::size haystack positions at a time; candidates are then
+/// confirmed with memcmp on the characters in between. Because whole vectors
+/// are loaded, the returned offset can be larger than n - needleSize; the
+/// caller must reject such results.
+/// When 'compiled' is set, the compile-time kNeedleSize is used as the memcmp
+/// length, otherwise the runtime 'needleSize' is used.
+/// Requires needleSize >= 2 and n >= needleSize (n - needleSize is evaluated
+/// in unsigned arithmetic).
+template
+size_t FOLLY_ALWAYS_INLINE smidStrstrMemcmp(
+    const char* s,
+    size_t n,
+    const char* needle,
+    size_t needleSize) {
+  static_assert(kNeedleSize >= 2);
+  VELOX_DCHECK_GT(needleSize, 1);
+  VELOX_DCHECK_GT(n, 0);
+  auto first = CharVector::broadcast(needle[0]);
+  auto last = CharVector::broadcast(needle[needleSize - 1]);
+  size_t i = 0;
+
+  for (; i <= n - needleSize; i += CharVector::size) {
+    // The load at s + lastPos may read past the end of the input. Suppose the
+    // input occupies virtual pages VP1..VP3 and the following page VP4 has not
+    // been allocated: the over-read is only safe while it stays inside the
+    // page holding s + lastPos. Otherwise leave the rest to the scalar loop.
+    const auto lastPos = i + needleSize - 1;
+
+    if (lastPos + CharVector::size > n && !pageSafe(s + lastPos)) {
+      break;
+    }
+    auto blockFirst = CharVector::load_unaligned(s + i);
+    const auto eqFirst = (first == blockFirst);
+    /// std::find has a fast path for a first-char mismatch, so skip the block
+    /// right away when no position matches the first needle character.
+    if (eqFirst.mask() == 0) {
+      continue;
+    }
+    auto blockLast = CharVector::load_unaligned(s + lastPos);
+    const auto eqLast = (last == blockLast);
+    auto mask = (eqFirst && eqLast).mask();
+    // Visit the candidate positions from the lowest set bit upwards.
+    while (mask != 0) {
+      const auto bitpos = __builtin_ctz(mask);
+      if constexpr (compiled) {
+        // A two-character needle is fully covered by the first/last compare.
+        if constexpr (kNeedleSize == 2) {
+          return i + bitpos;
+        }
+        if (memcmp(s + i + bitpos + 1, needle + 1, kNeedleSize - 2) == 0) {
+          return i + bitpos;
+        }
+      } else {
+        if (memcmp(s + i + bitpos + 1, needle + 1, needleSize - 2) == 0) {
+          return i + bitpos;
+        }
+      }
+      // Clear the lowest set bit.
+      mask = mask & (mask - 1);
+    }
+  }
+  // Scalar fallback for the tail that the vector loop did not cover.
+  for (; i <= n - needleSize; ++i) {
+    if constexpr (compiled) {
+      if (memcmp(s + i, needle, kNeedleSize) == 0) {
+        return i;
+      }
+    } else {
+      if (memcmp(s + i, needle, needleSize) == 0) {
+        return i;
+      }
+    }
+  }
+
+  return std::string::npos;
+};
+
+} // namespace detail
+
+/// Returns the offset of the first occurrence of needle[0, k) in s[0, n), or
+/// std::string::npos if there is none. Neither range needs to be
+/// NUL-terminated.
+/// A faster replacement for std::find: about 2x faster than string_view's
+/// find() in almost all cases according to StringSearchBenchmark.cpp. Uses an
+/// xsimd batch to compare the first and last needle characters first and a
+/// fixed-size memcmp for the characters in between. Inlining it in the header
+/// is about 30% faster.
+FOLLY_ALWAYS_INLINE size_t
+simdStrstr(const char* s, size_t n, const char* needle, size_t k) {
+#if VELOX_SIMD_STRSTR
+  size_t result = std::string::npos;
+
+  if (n < k) {
+    return result;
+  }
+
+  switch (k) {
+    case 0:
+      return 0;
+
+    case 1: {
+      // The haystack is a (pointer, length) range: it need not be
+      // NUL-terminated and may contain embedded NULs, so search exactly n
+      // bytes with memchr instead of using strchr.
+      const auto* res = static_cast<const char*>(memchr(s, needle[0], n));
+
+      return (res != nullptr) ? res - s : std::string::npos;
+    }
+#define VELOX_SIMD_STRSTR_CASE(size) \
+  case size: \
+    result = detail::smidStrstrMemcmp(s, n, needle, size); \
+    break;
+      VELOX_SIMD_STRSTR_CASE(2)
+      VELOX_SIMD_STRSTR_CASE(3)
+      VELOX_SIMD_STRSTR_CASE(4)
+      VELOX_SIMD_STRSTR_CASE(5)
+      VELOX_SIMD_STRSTR_CASE(6)
+      VELOX_SIMD_STRSTR_CASE(7)
+      VELOX_SIMD_STRSTR_CASE(8)
+      VELOX_SIMD_STRSTR_CASE(9)
+      VELOX_SIMD_STRSTR_CASE(10)
+      VELOX_SIMD_STRSTR_CASE(11)
+      VELOX_SIMD_STRSTR_CASE(12)
+      VELOX_SIMD_STRSTR_CASE(13)
+      VELOX_SIMD_STRSTR_CASE(14)
+      VELOX_SIMD_STRSTR_CASE(15)
+      VELOX_SIMD_STRSTR_CASE(16)
+      VELOX_SIMD_STRSTR_CASE(17)
+      VELOX_SIMD_STRSTR_CASE(18)
+#if XSIMD_WITH_AVX2
+      VELOX_SIMD_STRSTR_CASE(19)
+      VELOX_SIMD_STRSTR_CASE(20)
+      VELOX_SIMD_STRSTR_CASE(21)
+      VELOX_SIMD_STRSTR_CASE(22)
+      VELOX_SIMD_STRSTR_CASE(23)
+      VELOX_SIMD_STRSTR_CASE(24)
+      VELOX_SIMD_STRSTR_CASE(25)
+      VELOX_SIMD_STRSTR_CASE(26)
+      VELOX_SIMD_STRSTR_CASE(27)
+      VELOX_SIMD_STRSTR_CASE(28)
+      VELOX_SIMD_STRSTR_CASE(29)
+      VELOX_SIMD_STRSTR_CASE(30)
+      VELOX_SIMD_STRSTR_CASE(31)
+      VELOX_SIMD_STRSTR_CASE(32)
+      VELOX_SIMD_STRSTR_CASE(33)
+      VELOX_SIMD_STRSTR_CASE(34)
+#endif
+    default:
+      result = detail::smidStrstrMemcmp(s, n, needle, k);
+      break;
+  }
+#undef VELOX_SIMD_STRSTR_CASE
+  // load_unaligned is used for better performance, so the helper may report an
+  // offset bigger than n - k (a match that extends past the end of the
+  // haystack); treat that as "not found".
+  if (result <= n - k) {
+    return result;
+  } else {
+    return std::string::npos;
+  }
+#endif
+  return std::string_view(s, n).find(std::string_view(needle, k));
+}
+
 } // namespace facebook::velox::simd
diff --git a/velox/common/base/SimdUtil.cpp b/velox/common/base/SimdUtil.cpp
index 03576ac31ec4..18a70d118ebd 100644
--- a/velox/common/base/SimdUtil.cpp
+++ b/velox/common/base/SimdUtil.cpp
@@ -62,6 +62,7 @@ const LeadingMask leadingMask64;
 const FromBitMask fromBitMask32;
 const FromBitMask fromBitMask64;
 
+const int kPageSize = sysconf(_SC_PAGESIZE);
 } // namespace detail
 
 namespace {
diff --git a/velox/common/base/SimdUtil.h b/velox/common/base/SimdUtil.h
index 9a6ad0c37425..5230abe6ff1f 100644
--- a/velox/common/base/SimdUtil.h
+++ b/velox/common/base/SimdUtil.h
@@ -497,6 +497,9 @@ xsimd::batch reinterpretBatch(xsimd::batch, const A& = {});
 template
 inline bool memEqualUnsafe(const void* x, const void* y, int32_t size);
 
+FOLLY_ALWAYS_INLINE size_t
+simdStrstr(const char* s, size_t n, const char* needle, size_t k);
+
 } // namespace facebook::velox::simd
 
 #include "velox/common/base/SimdUtil-inl.h"
diff --git a/velox/common/base/benchmarks/CMakeLists.txt b/velox/common/base/benchmarks/CMakeLists.txt
index 065db83c3672..a8e956f8542a 100644
--- a/velox/common/base/benchmarks/CMakeLists.txt
+++ b/velox/common/base/benchmarks/CMakeLists.txt
@@ -17,3 +17,9 @@ target_link_libraries(
   velox_common_base_benchmarks
   PUBLIC ${FOLLY_BENCHMARK}
   PRIVATE velox_common_base Folly::folly)
+
+add_executable(velox_common_stringsearch_benchmarks StringSearchBenchmark.cpp)
+target_link_libraries(
+  velox_common_stringsearch_benchmarks
+  PUBLIC ${FOLLY_BENCHMARK}
+  PRIVATE velox_common_base Folly::folly)
diff --git a/velox/common/base/benchmarks/StringSearchBenchmark.cpp b/velox/common/base/benchmarks/StringSearchBenchmark.cpp
new file mode 100644
index 000000000000..010d330d1585
--- /dev/null
+++ b/velox/common/base/benchmarks/StringSearchBenchmark.cpp
@@ -0,0 +1,269 @@
+/*
+ * Copyright
(c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include "velox/common/base/SimdUtil.h" + +/// Copy Part code from +/// https://github.com/facebook/folly/blob/ce5edfb9b08ead9e78cb46879e7b9499861f7cd2/folly/test/FBStringTestBenchmarks.cpp.h +using namespace std; +using namespace folly; +/// Fixed seed for stable benchmark result, simdStrStr is always faster than +/// std::find with different seeds. 
+static const int seed = 123456;
+static std::mt19937 rng(seed);
+
+namespace facebook::velox {
+/// Returns a random integer drawn uniformly from the closed range [low, up]
+/// using the fixed-seed generator above.
+// NOTE(review): the parameter lists of the 'template' headers and the explicit
+// template arguments (e.g. on runSearching and std::boyer_moore_searcher) are
+// missing in this copy of the patch; restore them from the upstream sources.
+template
+Integral2 random(Integral1 low, Integral2 up) {
+  std::uniform_int_distribution<> range(low, up);
+  return range(rng);
+}
+
+/// Search algorithms compared by the benchmarks.
+enum ALG { SIMD, STD, KMP, BOYER_MOORE };
+
+/// Knuth-Morris-Pratt searcher for a fixed needle, used as a benchmark
+/// baseline.
+class KmpSearcher {
+ public:
+  explicit KmpSearcher(const std::string& needle) : needle_(needle) {
+    next_ = new int[1 + needle.size()];
+    initNextArr(needle);
+  }
+
+  // 'next_' is an owning raw pointer, so a copy would delete the same array
+  // twice. Forbid copying (this also suppresses the implicit moves).
+  KmpSearcher(const KmpSearcher&) = delete;
+  KmpSearcher& operator=(const KmpSearcher&) = delete;
+
+  ~KmpSearcher() {
+    delete[] next_;
+  }
+
+  /// Returns the offset of the first occurrence of the needle in
+  /// haystack[0, haystackSize), or std::string::npos if there is none.
+  size_t search(const char* haystack, size_t haystackSize) const {
+    const auto needleSize = static_cast<int32_t>(needle_.size());
+    int i = 0, j = 0;
+    while ((i < (int32_t)haystackSize) && (j < needleSize)) {
+      if (j == -1 || haystack[i] == needle_[j]) {
+        i++;
+        j++;
+      } else {
+        // Mismatch: fall back to the longest border of the matched prefix.
+        j = next_[j];
+      }
+    }
+    if (j >= needleSize) {
+      return (i - needle_.size());
+    }
+    return (std::string::npos);
+  }
+
+ private:
+  /// Fills next_[0 .. needle.size()] with the KMP failure table; next_[0] is
+  /// the sentinel -1.
+  void initNextArr(const std::string& needle) {
+    int j = 0, k = -1;
+    next_[0] = -1;
+    while (j < static_cast<int>(needle.length())) {
+      if (k == -1 || needle[j] == needle[k]) {
+        j++;
+        k++;
+        next_[j] = k;
+      } else {
+        k = next_[k];
+      }
+    }
+  }
+  std::string needle_;
+  int* next_;
+};
+
+/// Bundles one haystack/needle pair together with the pre-built searchers so
+/// that every algorithm is benchmarked on exactly the same data.
+class TestStringSearch {
+ public:
+  TestStringSearch(const std::string& haystack, const std::string& needle)
+      : haystack_(haystack),
+        needle_(needle),
+        searcher_(needle_.begin(), needle_.end()),
+        kmpSearcher_(needle_) {}
+
+  // 'searcher_' is built from iterators into this object's 'needle_', so a
+  // copied or moved instance would refer to another object's string.
+  TestStringSearch(const TestStringSearch&) = delete;
+  TestStringSearch& operator=(const TestStringSearch&) = delete;
+
+  /// Runs the search selected by 'alg' 'iters' times, passing each result to
+  /// doNotOptimizeAway so the calls are not elided.
+  template
+  void runSearching(size_t iters) const {
+    if constexpr (alg == SIMD) {
+      FOR_EACH_RANGE (i, 0, iters)
+        doNotOptimizeAway(simd::simdStrstr(
+            haystack_.data(),
+            haystack_.size(),
+            needle_.data(),
+            needle_.size()));
+    } else if constexpr (alg == STD) {
+      FOR_EACH_RANGE (i, 0, iters)
+        doNotOptimizeAway(
+            std::string_view(haystack_.data(), haystack_.size())
+                .find(std::string_view(needle_.data(), needle_.size())));
+    } else if constexpr (alg == BOYER_MOORE) {
+      FOR_EACH_RANGE (i, 0, iters)
+        doNotOptimizeAway(
+            std::search(haystack_.begin(), haystack_.end(), searcher_));
+    } else if constexpr (alg == KMP) {
+      FOR_EACH_RANGE (i, 0, iters)
+        doNotOptimizeAway(
+            kmpSearcher_.search(haystack_.data(), haystack_.size()));
+    }
+  }
+
+ private:
+  // Declaration order matters: 'searcher_' and 'kmpSearcher_' are initialized
+  // from 'needle_'.
+  std::string haystack_;
+  std::string needle_;
+  std::boyer_moore_searcher searcher_;
+  KmpSearcher kmpSearcher_;
+};
+
+/// Builds a test case whose haystack is a random window of 'hayStackSize'
+/// characters of the sample text and whose needle is 'needleSize' characters
+/// taken from that haystack at a random offset in
+/// [2, hayStackSize - needleSize], so the search always succeeds.
+TestStringSearch generateTest(int hayStackSize, int needleSize) {
+  // Text courtesy (ahem) of
+  // http://www.psychologytoday.com/blog/career-transitions/200906/
+  // the-dreaded-writing-sample
+  // 1028chars
+  static const std::string s =
+      "\
+Even if you've mastered the art of the cover letter and the resume, \
+another part of the job search process can trip up an otherwise \
+qualified candidate: the writing sample.\n\
+\n\
+Strong writing and communication skills are highly sought after by \
+most employers. Whether crafting short emails or lengthy annual \
+reports, many workers use their writing skills every day. And for an \
+employer seeking proof behind that ubiquitous candidate \
+phrase,\"excellent communication skills\", a required writing sample \
+is invaluable.\n\
+\n\
+Writing samples need the same care and attention given to cover \
+letters and resumes. Candidates with otherwise impeccable credentials \
+are routinely eliminated by a poorly chosen writing sample. Notice I \
+said \"poorly chosen\" not \"poorly written.\" Because that's the rub: \
+a writing sample not only reveals the individual's writing skills, it \
+also offers a peek into what they consider important or relevant for \
+the position. If you miss that mark with your writing sample, don't \
+expect to get a call for an interview.";
+  auto pos = random(0, s.size() - hayStackSize);
+  auto needlePos = random(2, hayStackSize - needleSize);
+  std::string haystack = s.substr(pos, hayStackSize);
+  std::string needle = haystack.substr(needlePos, needleSize);
+  // Returned as a prvalue: TestStringSearch is not copyable, so this relies on
+  // C++17 guaranteed copy elision.
+  return TestStringSearch(std::move(haystack), std::move(needle));
+}
+
+/// Benchmark body for successful searches: runs 'testdata' 'iters' times with
+/// the algorithm selected by 'alg'. The first parameter is ignored.
+void findSuccessful(
+    unsigned /*arg*/,
+    ALG alg,
+    size_t iters,
+    const TestStringSearch& testdata) {
+  switch (alg) {
+    case KMP:
+      testdata.runSearching(iters);
+      break;
+    case STD:
+      testdata.runSearching(iters);
+      break;
+    case SIMD:
+      testdata.runSearching(iters);
+      break;
+    case BOYER_MOORE:
+      testdata.runSearching(iters);
+      break;
+  }
+}
+
+/// Folly uses random test data for each iteration, but this cannot guarantee
+/// that the data for each test of different algorithms is the same, so we use
+/// the same random data for each comparison benchmark here.
+/// 'start' is the haystack size and 'end' the needle size passed to
+/// generateTest; both also become part of the generated benchmark names.
+#define STRING_SEARCH_BENCHMARK(name, start, end, iters) \
+  TestStringSearch test##start##end = generateTest(start, end); \
+  BENCHMARK_NAMED_PARAM( \
+      name, simd_##start##_to_##end, SIMD, iters, test##start##end); \
+  BENCHMARK_NAMED_PARAM( \
+      name, std_##start##_to_##end, STD, iters, test##start##end); \
+  BENCHMARK_NAMED_PARAM( \
+      name, \
+      std_boyer_moore_##start##_to_##end, \
+      BOYER_MOORE, \
+      iters, \
+      test##start##end); \
+  \
+  BENCHMARK_NAMED_PARAM( \
+      name, kmp_##start##_to_##end, KMP, iters, test##start##end);
+
+STRING_SEARCH_BENCHMARK(findSuccessful, 50, 5, 52428800)
+STRING_SEARCH_BENCHMARK(findSuccessful, 100, 10, 52428800)
+STRING_SEARCH_BENCHMARK(findSuccessful, 100, 20, 52428800)
+STRING_SEARCH_BENCHMARK(findSuccessful, 1000, 10, 52428800)
+STRING_SEARCH_BENCHMARK(findSuccessful, 1000, 100, 5242880)
+STRING_SEARCH_BENCHMARK(findSuccessful, 1000, 200, 5242880)
+
+/// std::find only handle fast-path for prefix-unmatch-char, if there is a
+/// prefix-match-char(in practice, it is a high
probability event that a
+/// first char match is successful.), the performance of std::find drops
+/// significantly in such a scenario.
+// Unsuccessful search whose needle starts with 'b', a character that occurs in
+// the haystack, while the rest of the needle ('?') never does.
+TestStringSearch prefixMatch = {
+    "luffily close dugouts wake about the pinto beans. pending, ironic dependencies",
+    "b???"};
+
+// Unsuccessful search whose first needle character ('?') never occurs in the
+// haystack.
+TestStringSearch prefixUnMatch = {
+    "luffily close dugouts wake about the pinto beans. pending, ironic dependencies",
+    "????"};
+/// Benchmark body for unsuccessful searches: runs 'test' 'iters' times with
+/// one of two algorithms chosen by 'useStd'. The first parameter is ignored.
+// NOTE(review): the template arguments of the two runSearching calls are not
+// visible in this copy of the patch (presumably STD when 'useStd' is true and
+// SIMD otherwise) -- confirm against the upstream sources.
+void findUnsuccessful(
+    size_t /*arg*/,
+    bool useStd,
+    size_t iters,
+    const TestStringSearch& test) {
+  if (useStd) {
+    test.runSearching(iters);
+  } else {
+    test.runSearching(iters);
+  }
+}
+
+BENCHMARK_NAMED_PARAM(
+    findUnsuccessful,
+    std_first_char_match,
+    true,
+    52428800,
+    prefixMatch)
+BENCHMARK_NAMED_PARAM(
+    findUnsuccessful,
+    opt_first_char_match,
+    false,
+    52428800,
+    prefixMatch)
+BENCHMARK_NAMED_PARAM(
+    findUnsuccessful,
+    std_first_char_unmatch,
+    true,
+    52428800,
+    prefixUnMatch)
+BENCHMARK_NAMED_PARAM(
+    findUnsuccessful,
+    opt_first_char_unmatch,
+    false,
+    52428800,
+    prefixUnMatch)
+} // namespace facebook::velox
+
+// Parses the command-line flags and runs all registered benchmarks.
+int main(int argc, char** argv) {
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
+  folly::runBenchmarks();
+  return 0;
+}
diff --git a/velox/common/base/tests/SimdUtilTest.cpp b/velox/common/base/tests/SimdUtilTest.cpp
index ba389780b1cb..2e0d26416e73 100644
--- a/velox/common/base/tests/SimdUtilTest.cpp
+++ b/velox/common/base/tests/SimdUtilTest.cpp
@@ -109,6 +109,20 @@ class SimdUtilTest : public testing::Test {
     EXPECT_EQ(reference, target);
   }
 
+  // Returns a random integer drawn uniformly from the closed range [low, up]
+  // using the fixture's generator 'rng_'.
+  template
+  Integral2 random(Integral1 low, Integral2 up) {
+    std::uniform_int_distribution<> range(low, up);
+    return range(rng_);
+  }
+
+  // Resizes '*toFill' to a random length in [0, maxSize] and fills it with
+  // random lowercase letters 'a'..'z'.
+  void randomString(std::string* toFill, unsigned int maxSize = 1000) {
+    assert(toFill);
+    toFill->resize(random(0, maxSize));
+    for (int i = 0; i < toFill->size(); i++) {
+      (*toFill)[i] = random('a', 'z');
+    }
+  }
+
   folly::Random::DefaultGenerator rng_;
 };
 
@@ -491,4 +505,154 @@ TEST_F(SimdUtilTest, memcpyTime) {
   LOG(INFO) << "simd=" << simd << " sys=" << sys;
 }
 
+/// Copied from the std::boyer_moore_searcher proposal:
+/// https://github.com/mclow/search-library/blob/master/basic_tests.cpp
+/// Basic sanity checking. It makes sure that all the algorithms work.
+TEST_F(SimdUtilTest, basicSimdStrStr) {
+  // Checks simdStrstr against std::string::find for one text/needle pair.
+  auto checkOne = [](const std::string& text, const std::string& needle) {
+    auto size = text.size();
+    auto k = needle.size();
+    ASSERT_EQ(
+        simd::simdStrstr(text.data(), size, needle.data(), k),
+        text.find(needle));
+  };
+  std::string haystack1("NOW AN FOWE\220ER ANNMAN THE ANPANMANEND");
+  std::string needle1("ANPANMAN");
+  std::string needle2("MAN THE");
+  std::string needle3("WE\220ER");
+  // At the beginning
+  std::string needle4("NOW ");
+  // At the end
+  std::string needle5("NEND");
+  // Nowhere
+  std::string needle6("NOT FOUND");
+  // Nowhere
+  std::string needle7("NOT FO\340ND");
+
+  std::string haystack2("ABC ABCDAB ABCDABCDABDE");
+  std::string needle11("ABCDABD");
+
+  std::string haystack3("abra abracad abracadabra");
+  std::string needle12("abracadabra");
+
+  std::string needle13("");
+  std::string haystack4("");
+
+  checkOne(haystack1, needle1);
+  checkOne(haystack1, needle2);
+  checkOne(haystack1, needle3);
+  checkOne(haystack1, needle4);
+  checkOne(haystack1, needle5);
+  checkOne(haystack1, needle6);
+  checkOne(haystack1, needle7);
+
+  // Can't find a long pattern in a short corpus
+  checkOne(needle1, haystack1);
+  // Find something in itself
+  checkOne(haystack1, haystack1);
+  // Find something in itself
+  checkOne(haystack2, haystack2);
+
+  checkOne(haystack2, needle11);
+  checkOne(haystack3, needle12);
+  // Find the empty string
+  checkOne(haystack1, needle13);
+  // Can't find in an empty haystack
+  checkOne(haystack4, needle1);
+
+  // Comment copied from the original code.
+  // Mikhail Levin found a problem, and this was the
+  // test that triggered it.
+  const std::string mikhailPattern =
+      "GATACACCTACCTTCACCAGTTACTCTATGCACTAGGTGCGCCAGGCCCATGCACAAGGGCTTGAGTGGATGGGAAGGA"
+      "TGTGCCCTAGTGATGGCAGCATAAGCTACGCAGAGAAGTTCCAGGGCAGAGTCACCATGACCAGGGACACATCCACGAG"
+      "CACAGCCTACATGGAGCTGAGCAGCCTGAGATCTGAAGACACGGCCATGTATTACTGTGGGAGAGATGTCTGGAGTGGT"
+      "TATTATTGCCCCGGTAATATTACTACTACTACTACTACATGGACGTCTGGGGCAAAGGGACCACG";
+  const std::string mikhailCorpus = std::string(8, 'a') + mikhailPattern;
+
+  checkOne(mikhailCorpus, mikhailPattern);
+}
+
+// Exercises needle sizes from 0 upwards (covering the per-size switch cases
+// and the generic default) for prefix, middle, suffix and non-matching needles.
+TEST_F(SimdUtilTest, variableNeedleSize) {
+  std::string s1 = "aabbccddeeffgghhiijjkkllmmnnooppqqrrssttuuvvwwxxyyzz";
+  std::string s2 = "aabbccddeeffgghhiijjkkllmmnnooppqqrrssttuuvvwwxxyyzz";
+  std::string s3 = "01234567890123456789";
+  // Compares simdStrstr with std::string_view::find; logs the inputs first on
+  // a mismatch so the failing case can be identified.
+  auto test = [](char* text, size_t size, char* needle, size_t k) {
+    if (simd::simdStrstr(text, size, needle, k) !=
+        std::string_view(text, size).find(std::string_view(needle, k))) {
+      LOG(ERROR) << "text: " << std::string(text, size)
+                 << " needle :" << std::string(needle, k);
+    }
+    ASSERT_EQ(
+        simd::simdStrstr(text, size, needle, k),
+        std::string_view(text, size).find(std::string_view(needle, k)));
+  };
+  // Match cases (prefix/middle/suffix): substrings in s2 should be a substring
+  // in s1. Choose different needle-size left from s2, testing prefix-match in
+  // s1.
+  for (int k = 0; k < s2.size(); k++) {
+    test(s1.data(), s1.size(), s2.data(), k);
+  }
+  // Choose different needle-size left from s2, testing middle-match in s1.
+  for (int i = 0; i < 20; i++) {
+    for (int k = 0; k < 28; k++) {
+      char* data = s2.data() + i;
+      test(s1.data(), s1.size(), data, k);
+    }
+  }
+  // Choose different needle-size right from s2, testing suffix-match in s1.
+  for (int k = 0; k < s2.size(); k++) {
+    char* data = s2.data() + s2.size() - k;
+    test(s1.data(), s1.size(), data, k);
+  }
+  // Not match case : substring in s3 not in s1.
+  for (auto k = 0; k < s3.size(); k++) {
+    test(s1.data(), s1.size(), s3.data(), k);
+  }
+
+  // FirstBlock match: the needle's first character occurs in the text but,
+  // for k > 0, the following '9' characters do not.
+  for (auto k = 0; k < s3.size(); k++) {
+    std::string somePrefix = "xxxxxx";
+    std::string matchString = "a" + std::string(k, 'x');
+    std::string someSuffix = "yyyyyyyy";
+    std::string text = somePrefix + matchString + someSuffix;
+    auto s = "a" + std::string(k, '9');
+    test(text.data(), text.size(), s.data(), s.size());
+  }
+  // FirstBlock and LastBlock match: the first and last needle characters line
+  // up with the text but, for k > 0, the characters in between differ.
+  for (auto k = 0; k < s3.size(); k++) {
+    std::string somePrefix = "xxxxxx";
+    std::string matchString = "a" + std::string(k, 'x') + "b";
+    std::string someSuffix = "yyyyyyyy";
+    std::string text = somePrefix + matchString + someSuffix;
+    auto s = "a" + std::string(k, '9') + "b";
+    test(text.data(), text.size(), s.data(), s.size());
+  }
+}
+
+/// Copied from
+/// https://github.com/facebook/folly/blob/ce5edfb9b08ead9e78cb46879e7b9499861f7cd2/folly/test/FBStringTest.cpp#L1277
+/// clause11_21_4_7_2_a1
+/// Searches a random string for a random substring of itself, starting at a
+/// random position, and compares the result with std::string::find.
+TEST_F(SimdUtilTest, randomStringStrStr) {
+  std::string test;
+  const int kTestLoop = 1;
+  // Searches text[pos, end) for 'needle' and expects the same offset that
+  // find() reports on the corresponding substring.
+  auto checkOne =
+      [](const std::string& text, const std::string& needle, size_t pos) {
+        auto size = text.length() - pos;
+        auto textPtr = text.data() + pos;
+        auto k = needle.size();
+        ASSERT_EQ(
+            simd::simdStrstr(textPtr, size, needle.data(), k),
+            text.substr(pos).find(needle));
+      };
+  for (int i = 0; i < kTestLoop; i++) {
+    // clause11_21_4_7_2_a1
+    randomString(&test);
+    auto from = random(0, test.size());
+    auto length = random(0, test.size() - from);
+    std::string str = test.substr(from, length);
+    checkOne(test, str, random(0, test.size()));
+  }
+}
+
 } // namespace