From 71d883510787d65fae816ac711551c8957838391 Mon Sep 17 00:00:00 2001
From: "hengjiang.ly" <hengjiang.ly@alibaba-inc.com>
Date: Tue, 6 Aug 2024 14:35:18 +0800
Subject: [PATCH 1/3] add simd strstr

---
 velox/common/base/SimdUtil.cpp           | 137 +++++++++++++++++++++++
 velox/common/base/SimdUtil.h             |   2 +
 velox/common/base/tests/SimdUtilTest.cpp |  21 ++++
 3 files changed, 160 insertions(+)
diff --git a/velox/common/base/SimdUtil.cpp b/velox/common/base/SimdUtil.cpp
index 03576ac31ec4..3f7d0de91d5a 100644
--- a/velox/common/base/SimdUtil.cpp
+++ b/velox/common/base/SimdUtil.cpp
@@ -112,4 +112,141 @@ bool initializeSimdUtil() {
 
 static bool FB_ANONYMOUS_VARIABLE(g_simdConstants) = initializeSimdUtil();
 
+namespace detail {
+
+#if XSIMD_WITH_SSE4_2
+using CharVector = xsimd::batch<uint8_t, xsimd::sse4_2>;
+#elif XSIMD_WITH_NEON
+using CharVector = xsimd::batch<uint8_t, xsimd::neon>;
+#endif
+
+const int kPageSize = sysconf(_SC_PAGESIZE);
+FOLLY_ALWAYS_INLINE bool pageSafe(const void* const ptr) {
+  return ((kPageSize - 1) & reinterpret_cast<std::uintptr_t>(ptr)) <=
+      kPageSize - CharVector::size;
+}
+
+template <bool compiled, size_t compiledNeedleSize>
+size_t FOLLY_ALWAYS_INLINE smidStrstrMemcmp(
+    const char* s,
+    size_t n,
+    const char* needle,
+    size_t needleSize) {
+  static_assert(compiledNeedleSize >= 2);
+  VELOX_CHECK_GT(needleSize, 1);
+  VELOX_CHECK_GT(n, 0);
+  auto first = CharVector::broadcast(needle[0]);
+  auto last = CharVector::broadcast(needle[needleSize - 1]);
+  size_t i = 0;
+  // Fast path for page-safe data.
+  // It`s safe to over-read CharVector if all-data are in same page.
+  // see: https://mudongliang.github.io/x86/html/file_module_x86_id_208.html
+  // While executing in 16-bit addressing mode, a linear address for a 128-bit
+  // data access that overlaps the end of a 16-bit segment is not allowed and is
+  // defined as reserved behavior. A specific processor implementation may or
+  // may not generate a general-protection exception (#GP) in this situation,
+  // and the address that spans the end of the segment may or may not wrap
+  // around to the beginning of the segment.
+  for (; i <= n - needleSize && pageSafe(s + i + needleSize - 1) &&
+       pageSafe(s + i);
+       i += CharVector::size) {
+    auto blockFirst = CharVector::load_unaligned(s + i);
+    auto blockLast = CharVector::load_unaligned(s + i + needleSize - 1);
+
+    const auto eqFirst = (first == blockFirst);
+    const auto eqLast = (last == blockLast);
+
+    auto mask = toBitMask(eqFirst && eqLast);
+
+    while (mask != 0) {
+      const auto bitpos = __builtin_ctz(mask);
+      if constexpr (compiled) {
+        if constexpr (compiledNeedleSize == 2) {
+          return i + bitpos;
+        }
+        if (memcmp(s + i + bitpos + 1, needle + 1, compiledNeedleSize - 2) ==
+            0) {
+          return i + bitpos;
+        }
+      } else {
+        if (memcmp(s + i + bitpos + 1, needle + 1, needleSize - 2) == 0) {
+          return i + bitpos;
+        }
+      }
+      mask = mask & (mask - 1);
+    }
+  }
+  // Fallback path for generic path.
+  for (; i <= n - needleSize; ++i) {
+    if constexpr (compiled) {
+      if (memcmp(s + i, needle, compiledNeedleSize) == 0) {
+        return i;
+      }
+    } else {
+      if (memcmp(s + i, needle, needleSize) == 0) {
+        return i;
+      }
+    }
+  }
+
+  return std::string::npos;
+};
+
+} // namespace detail
+
+/// A faster implementation for c_strstr(), about 2x faster than string_view`s
+/// find(), proved by TpchLikeBenchmark. Use xsmid-batch to compare first&&last
+/// char first, use fixed-memcmp to compare left chars. Inline in header file
+/// will be a little faster.
+size_t simdStrstr(const char* s, size_t n, const char* needle, size_t k) {
+  size_t result = std::string::npos;
+
+  if (n < k) {
+    return result;
+  }
+
+  switch (k) {
+    case 0:
+      return 0;
+
+    case 1: {
+      const char* res = strchr(s, needle[0]);
+
+      return (res != nullptr) ? res - s : std::string::npos;
+    }
+#define FIXED_MEM_STRSTR(size)                                         \
+  case size:                                                           \
+    result = detail::smidStrstrMemcmp<true, size>(s, n, needle, size); \
+    break;
+      FIXED_MEM_STRSTR(2)
+      FIXED_MEM_STRSTR(3)
+      FIXED_MEM_STRSTR(4)
+      FIXED_MEM_STRSTR(5)
+      FIXED_MEM_STRSTR(6)
+      FIXED_MEM_STRSTR(7)
+      FIXED_MEM_STRSTR(8)
+      FIXED_MEM_STRSTR(9)
+      FIXED_MEM_STRSTR(10)
+      FIXED_MEM_STRSTR(11)
+      FIXED_MEM_STRSTR(12)
+      FIXED_MEM_STRSTR(13)
+      FIXED_MEM_STRSTR(14)
+      FIXED_MEM_STRSTR(15)
+      FIXED_MEM_STRSTR(16)
+      FIXED_MEM_STRSTR(17)
+      FIXED_MEM_STRSTR(18)
+    default:
+      result = detail::smidStrstrMemcmp<false, 2>(s, n, needle, k);
+      break;
+  }
+#undef FIXED_MEM_STRSTR
+  // load_unaligned is used for better performance, so result maybe bigger than
+  // n-k.
+  if (result <= n - k) {
+    return result;
+  } else {
+    return std::string::npos;
+  }
+}
+
 } // namespace facebook::velox::simd
diff --git a/velox/common/base/SimdUtil.h b/velox/common/base/SimdUtil.h
index 9a6ad0c37425..ba63d3c1d237 100644
--- a/velox/common/base/SimdUtil.h
+++ b/velox/common/base/SimdUtil.h
@@ -497,6 +497,8 @@ xsimd::batch<T, A> reinterpretBatch(xsimd::batch<U, A>, const A& = {});
 template <typename A = xsimd::default_arch>
 inline bool memEqualUnsafe(const void* x, const void* y, int32_t size);
 
+size_t simdStrstr(const char* s, size_t n, const char* needle, size_t k);
+
 } // namespace facebook::velox::simd
 
 #include "velox/common/base/SimdUtil-inl.h"
diff --git a/velox/common/base/tests/SimdUtilTest.cpp b/velox/common/base/tests/SimdUtilTest.cpp
index ba389780b1cb..9dbebc060fb3 100644
--- a/velox/common/base/tests/SimdUtilTest.cpp
+++ b/velox/common/base/tests/SimdUtilTest.cpp
@@ -491,4 +491,25 @@ TEST_F(SimdUtilTest, memcpyTime) {
   LOG(INFO) << "simd=" << simd << " sys=" << sys;
 }
 
+TEST_F(SimdUtilTest, testSimdStrStr) {
+  // 48 chars.
+  std::string s1 = "aabbccddeeffgghhiijjkkllmmnnooppqqrrssttuuvvwwxxyyzz";
+  std::string s2 = "aabbccddeeffgghhiijjkkllmmnnooppqqrrssttuuvvwwxxyyzz";
+  std::string s3 = "xxx";
+  auto test = [](char* text, size_t size, char* needle, size_t k) {
+    ASSERT_EQ(
+        simd::simdStrstr(text, size, needle, k),
+        std::string_view(text, size).find(std::string_view(needle, k)));
+  };
+  // Match cases : substrings in s2 should be a substring in s1.
+  for (int i = 0; i < 20; i++) {
+    for (int k = 0; k < 28; k++) {
+      char* data = s2.data() + i;
+      test(s1.data(), s1.size(), data, k);
+    }
+  }
+  // Not match case : "xxx" not in s1.
+  test(s1.data(), s1.size(), s3.data(), s3.size());
+}
+
 } // namespace

From a61a4540e2182416aa1bd617e634155698f5af5b Mon Sep 17 00:00:00 2001
From: "hengjiang.ly" <hengjiang.ly@alibaba-inc.com>
Date: Wed, 28 Aug 2024 16:56:41 +0800
Subject: [PATCH 2/3] add ut / benchmark

---
 velox/common/base/SimdUtil-inl.h              | 168 +++++++++++
 velox/common/base/SimdUtil.cpp                | 138 +--------
 velox/common/base/SimdUtil.h                  |   3 +-
 velox/common/base/benchmarks/CMakeLists.txt   |   6 +
 .../base/benchmarks/StringSearchBenchmark.cpp | 269 ++++++++++++++++++
 velox/common/base/tests/SimdUtilTest.cpp      | 155 +++++++++-
 6 files changed, 595 insertions(+), 144 deletions(-)
 create mode 100644 velox/common/base/benchmarks/StringSearchBenchmark.cpp

diff --git a/velox/common/base/SimdUtil-inl.h b/velox/common/base/SimdUtil-inl.h
index 87ff71f8b181..dddd1de3836b 100644
--- a/velox/common/base/SimdUtil-inl.h
+++ b/velox/common/base/SimdUtil-inl.h
@@ -1436,4 +1436,172 @@ inline bool memEqualUnsafe(const void* x, const void* y, int32_t size) {
   return true;
 }
 
+namespace detail {
+
+/// NOTE: With SSE4.2, simdStrstr is slightly slower than std::find when the
+/// first char does not match (std::find reads only one char per candidate).
+/// With AVX2, simdStrstr is faster than std::find in that case as well.
+#if XSIMD_WITH_AVX2
+using CharVector = xsimd::batch<uint8_t, xsimd::avx2>;
+#define VELOX_SIMD_STRSTR 1
+#elif XSIMD_WITH_NEON
+using CharVector = xsimd::batch<uint8_t, xsimd::neon>;
+#define VELOX_SIMD_STRSTR 1
+#else
+#define VELOX_SIMD_STRSTR 0
+#endif
+
+extern const int kPageSize;
+
+FOLLY_ALWAYS_INLINE bool pageSafe(const void* const ptr) {
+  return ((kPageSize - 1) & reinterpret_cast<std::uintptr_t>(ptr)) <=
+      kPageSize - CharVector::size;
+}
+
+/// Returns the offset of the first occurrence of 'needle' in 's', or npos.
+/// The result may exceed 'n - needleSize'; the caller must range-check it.
+/// 'compiled' selects the compile-time kNeedleSize as the memcmp length.
+/// Loop bounds use 'n - needleSize' on size_t: pass n >= needleSize only.
+template <bool compiled, size_t kNeedleSize>
+size_t FOLLY_ALWAYS_INLINE smidStrstrMemcmp(
+    const char* s,
+    size_t n,
+    const char* needle,
+    size_t needleSize) {
+  static_assert(kNeedleSize >= 2);
+  VELOX_DCHECK_GT(needleSize, 1);
+  VELOX_DCHECK_GT(n, 0);
+  auto first = CharVector::broadcast(needle[0]);
+  auto last = CharVector::broadcast(needle[needleSize - 1]);
+  size_t i = 0;
+  // Fast path: test CharVector::size candidate offsets per iteration while
+  // both vector loads stay inside one page (see pageSafe()), so over-reading
+  // past the end of 's' never touches another page. Unaligned-load reference:
+  // https://mudongliang.github.io/x86/html/file_module_x86_id_208.html
+  for (; i <= n - needleSize && pageSafe(s + i) &&
+       pageSafe(s + i + needleSize - 1);
+       i += CharVector::size) {
+    auto blockFirst = CharVector::load_unaligned(s + i);
+    const auto eqFirst = (first == blockFirst);
+    // Skip the second load when no byte of this block equals needle[0], so
+    // the first-char-mismatch case stays cheap (std::find has such a path).
+    if (eqFirst.mask() == 0) {
+      continue;
+    }
+    auto blockLast = CharVector::load_unaligned(s + i + needleSize - 1);
+    const auto eqLast = (last == blockLast);
+    auto mask = (eqFirst && eqLast).mask();
+    while (mask != 0) {
+      const auto bitpos = __builtin_ctz(mask);
+      if constexpr (compiled) {
+        if constexpr (kNeedleSize == 2) {
+          return i + bitpos;
+        }
+        if (memcmp(s + i + bitpos + 1, needle + 1, kNeedleSize - 2) == 0) {
+          return i + bitpos;
+        }
+      } else {
+        if (memcmp(s + i + bitpos + 1, needle + 1, needleSize - 2) == 0) {
+          return i + bitpos;
+        }
+      }
+      mask = mask & (mask - 1);
+    }
+  }
+  // Scalar fallback for the remaining offsets, entered once the tail is
+  // reached or a vector load would no longer pass the pageSafe() check.
+  for (; i <= n - needleSize; ++i) {
+    if constexpr (compiled) {
+      if (memcmp(s + i, needle, kNeedleSize) == 0) {
+        return i;
+      }
+    } else {
+      if (memcmp(s + i, needle, needleSize) == 0) {
+        return i;
+      }
+    }
+  }
+
+  return std::string::npos;
+};
+
+} // namespace detail
+
+/// Returns the offset of the first occurrence of needle[0, k) in s[0, n), or
+/// std::string::npos. Compares the first and last needle bytes with an xsimd
+/// batch and memcmp()s the middle; falls back to std::string_view::find when
+/// VELOX_SIMD_STRSTR is 0. Benchmarked in StringSearchBenchmark.cpp.
+FOLLY_ALWAYS_INLINE size_t
+simdStrstr(const char* s, size_t n, const char* needle, size_t k) {
+#if VELOX_SIMD_STRSTR
+  size_t result = std::string::npos;
+
+  if (n < k) {
+    return result;
+  }
+
+  switch (k) {
+    case 0:
+      return 0;
+
+    case 1: {
+      // Bounded search: 's' need not be NUL-terminated and may contain NULs.
+      const auto* res = static_cast<const char*>(memchr(s, needle[0], n));
+      return (res != nullptr) ? res - s : std::string::npos;
+    }
+#define VELOX_SIMD_STRSTR_CASE(size)                                   \
+  case size:                                                           \
+    result = detail::smidStrstrMemcmp<true, size>(s, n, needle, size); \
+    break;
+      VELOX_SIMD_STRSTR_CASE(2)
+      VELOX_SIMD_STRSTR_CASE(3)
+      VELOX_SIMD_STRSTR_CASE(4)
+      VELOX_SIMD_STRSTR_CASE(5)
+      VELOX_SIMD_STRSTR_CASE(6)
+      VELOX_SIMD_STRSTR_CASE(7)
+      VELOX_SIMD_STRSTR_CASE(8)
+      VELOX_SIMD_STRSTR_CASE(9)
+      VELOX_SIMD_STRSTR_CASE(10)
+      VELOX_SIMD_STRSTR_CASE(11)
+      VELOX_SIMD_STRSTR_CASE(12)
+      VELOX_SIMD_STRSTR_CASE(13)
+      VELOX_SIMD_STRSTR_CASE(14)
+      VELOX_SIMD_STRSTR_CASE(15)
+      VELOX_SIMD_STRSTR_CASE(16)
+      VELOX_SIMD_STRSTR_CASE(17)
+      VELOX_SIMD_STRSTR_CASE(18)
+#if XSIMD_WITH_AVX2
+      VELOX_SIMD_STRSTR_CASE(19)
+      VELOX_SIMD_STRSTR_CASE(20)
+      VELOX_SIMD_STRSTR_CASE(21)
+      VELOX_SIMD_STRSTR_CASE(22)
+      VELOX_SIMD_STRSTR_CASE(23)
+      VELOX_SIMD_STRSTR_CASE(24)
+      VELOX_SIMD_STRSTR_CASE(25)
+      VELOX_SIMD_STRSTR_CASE(26)
+      VELOX_SIMD_STRSTR_CASE(27)
+      VELOX_SIMD_STRSTR_CASE(28)
+      VELOX_SIMD_STRSTR_CASE(29)
+      VELOX_SIMD_STRSTR_CASE(30)
+      VELOX_SIMD_STRSTR_CASE(31)
+      VELOX_SIMD_STRSTR_CASE(32)
+      VELOX_SIMD_STRSTR_CASE(33)
+      VELOX_SIMD_STRSTR_CASE(34)
+#endif
+    default:
+      result = detail::smidStrstrMemcmp<false, 2>(s, n, needle, k);
+      break;
+  }
+#undef VELOX_SIMD_STRSTR_CASE
+  // The vectorized scan may report a candidate that starts past n - k (its
+  // loads read beyond the haystack), so reject offsets outside that range.
+  if (result <= n - k) {
+    return result;
+  } else {
+    return std::string::npos;
+  }
+#endif
+  return std::string_view(s, n).find(std::string_view(needle, k));
+}
+
 } // namespace facebook::velox::simd
diff --git a/velox/common/base/SimdUtil.cpp b/velox/common/base/SimdUtil.cpp
index 3f7d0de91d5a..18a70d118ebd 100644
--- a/velox/common/base/SimdUtil.cpp
+++ b/velox/common/base/SimdUtil.cpp
@@ -62,6 +62,7 @@ const LeadingMask<int64_t, xsimd::default_arch> leadingMask64;
 const FromBitMask<int32_t, xsimd::default_arch> fromBitMask32;
 const FromBitMask<int64_t, xsimd::default_arch> fromBitMask64;
 
+const int kPageSize = sysconf(_SC_PAGESIZE);
 } // namespace detail
 
 namespace {
@@ -112,141 +113,4 @@ bool initializeSimdUtil() {
 
 static bool FB_ANONYMOUS_VARIABLE(g_simdConstants) = initializeSimdUtil();
 
-namespace detail {
-
-#if XSIMD_WITH_SSE4_2
-using CharVector = xsimd::batch<uint8_t, xsimd::sse4_2>;
-#elif XSIMD_WITH_NEON
-using CharVector = xsimd::batch<uint8_t, xsimd::neon>;
-#endif
-
-const int kPageSize = sysconf(_SC_PAGESIZE);
-FOLLY_ALWAYS_INLINE bool pageSafe(const void* const ptr) {
-  return ((kPageSize - 1) & reinterpret_cast<std::uintptr_t>(ptr)) <=
-      kPageSize - CharVector::size;
-}
-
-template <bool compiled, size_t compiledNeedleSize>
-size_t FOLLY_ALWAYS_INLINE smidStrstrMemcmp(
-    const char* s,
-    size_t n,
-    const char* needle,
-    size_t needleSize) {
-  static_assert(compiledNeedleSize >= 2);
-  VELOX_CHECK_GT(needleSize, 1);
-  VELOX_CHECK_GT(n, 0);
-  auto first = CharVector::broadcast(needle[0]);
-  auto last = CharVector::broadcast(needle[needleSize - 1]);
-  size_t i = 0;
-  // Fast path for page-safe data.
-  // It`s safe to over-read CharVector if all-data are in same page.
-  // see: https://mudongliang.github.io/x86/html/file_module_x86_id_208.html
-  // While executing in 16-bit addressing mode, a linear address for a 128-bit
-  // data access that overlaps the end of a 16-bit segment is not allowed and is
-  // defined as reserved behavior. A specific processor implementation may or
-  // may not generate a general-protection exception (#GP) in this situation,
-  // and the address that spans the end of the segment may or may not wrap
-  // around to the beginning of the segment.
-  for (; i <= n - needleSize && pageSafe(s + i + needleSize - 1) &&
-       pageSafe(s + i);
-       i += CharVector::size) {
-    auto blockFirst = CharVector::load_unaligned(s + i);
-    auto blockLast = CharVector::load_unaligned(s + i + needleSize - 1);
-
-    const auto eqFirst = (first == blockFirst);
-    const auto eqLast = (last == blockLast);
-
-    auto mask = toBitMask(eqFirst && eqLast);
-
-    while (mask != 0) {
-      const auto bitpos = __builtin_ctz(mask);
-      if constexpr (compiled) {
-        if constexpr (compiledNeedleSize == 2) {
-          return i + bitpos;
-        }
-        if (memcmp(s + i + bitpos + 1, needle + 1, compiledNeedleSize - 2) ==
-            0) {
-          return i + bitpos;
-        }
-      } else {
-        if (memcmp(s + i + bitpos + 1, needle + 1, needleSize - 2) == 0) {
-          return i + bitpos;
-        }
-      }
-      mask = mask & (mask - 1);
-    }
-  }
-  // Fallback path for generic path.
-  for (; i <= n - needleSize; ++i) {
-    if constexpr (compiled) {
-      if (memcmp(s + i, needle, compiledNeedleSize) == 0) {
-        return i;
-      }
-    } else {
-      if (memcmp(s + i, needle, needleSize) == 0) {
-        return i;
-      }
-    }
-  }
-
-  return std::string::npos;
-};
-
-} // namespace detail
-
-/// A faster implementation for c_strstr(), about 2x faster than string_view`s
-/// find(), proved by TpchLikeBenchmark. Use xsmid-batch to compare first&&last
-/// char first, use fixed-memcmp to compare left chars. Inline in header file
-/// will be a little faster.
-size_t simdStrstr(const char* s, size_t n, const char* needle, size_t k) {
-  size_t result = std::string::npos;
-
-  if (n < k) {
-    return result;
-  }
-
-  switch (k) {
-    case 0:
-      return 0;
-
-    case 1: {
-      const char* res = strchr(s, needle[0]);
-
-      return (res != nullptr) ? res - s : std::string::npos;
-    }
-#define FIXED_MEM_STRSTR(size)                                         \
-  case size:                                                           \
-    result = detail::smidStrstrMemcmp<true, size>(s, n, needle, size); \
-    break;
-      FIXED_MEM_STRSTR(2)
-      FIXED_MEM_STRSTR(3)
-      FIXED_MEM_STRSTR(4)
-      FIXED_MEM_STRSTR(5)
-      FIXED_MEM_STRSTR(6)
-      FIXED_MEM_STRSTR(7)
-      FIXED_MEM_STRSTR(8)
-      FIXED_MEM_STRSTR(9)
-      FIXED_MEM_STRSTR(10)
-      FIXED_MEM_STRSTR(11)
-      FIXED_MEM_STRSTR(12)
-      FIXED_MEM_STRSTR(13)
-      FIXED_MEM_STRSTR(14)
-      FIXED_MEM_STRSTR(15)
-      FIXED_MEM_STRSTR(16)
-      FIXED_MEM_STRSTR(17)
-      FIXED_MEM_STRSTR(18)
-    default:
-      result = detail::smidStrstrMemcmp<false, 2>(s, n, needle, k);
-      break;
-  }
-#undef FIXED_MEM_STRSTR
-  // load_unaligned is used for better performance, so result maybe bigger than
-  // n-k.
-  if (result <= n - k) {
-    return result;
-  } else {
-    return std::string::npos;
-  }
-}
-
 } // namespace facebook::velox::simd
diff --git a/velox/common/base/SimdUtil.h b/velox/common/base/SimdUtil.h
index ba63d3c1d237..5230abe6ff1f 100644
--- a/velox/common/base/SimdUtil.h
+++ b/velox/common/base/SimdUtil.h
@@ -497,7 +497,8 @@ xsimd::batch<T, A> reinterpretBatch(xsimd::batch<U, A>, const A& = {});
 template <typename A = xsimd::default_arch>
 inline bool memEqualUnsafe(const void* x, const void* y, int32_t size);
 
-size_t simdStrstr(const char* s, size_t n, const char* needle, size_t k);
+FOLLY_ALWAYS_INLINE size_t
+simdStrstr(const char* s, size_t n, const char* needle, size_t k);
 
 } // namespace facebook::velox::simd
 
diff --git a/velox/common/base/benchmarks/CMakeLists.txt b/velox/common/base/benchmarks/CMakeLists.txt
index 065db83c3672..a8e956f8542a 100644
--- a/velox/common/base/benchmarks/CMakeLists.txt
+++ b/velox/common/base/benchmarks/CMakeLists.txt
@@ -17,3 +17,9 @@ target_link_libraries(
   velox_common_base_benchmarks
   PUBLIC ${FOLLY_BENCHMARK}
   PRIVATE velox_common_base Folly::folly)
+
+add_executable(velox_common_stringsearch_benchmarks StringSearchBenchmark.cpp)
+target_link_libraries(
+  velox_common_stringsearch_benchmarks
+  PUBLIC ${FOLLY_BENCHMARK}
+  PRIVATE velox_common_base Folly::folly)
diff --git a/velox/common/base/benchmarks/StringSearchBenchmark.cpp b/velox/common/base/benchmarks/StringSearchBenchmark.cpp
new file mode 100644
index 000000000000..010d330d1585
--- /dev/null
+++ b/velox/common/base/benchmarks/StringSearchBenchmark.cpp
@@ -0,0 +1,269 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <folly/FBString.h>
+
+#include <cstdlib>
+#include <fstream>
+#include <list>
+#include <random>
+#include <sstream>
+
+#include <folly/Benchmark.h>
+#include <folly/Random.h>
+#include <folly/container/Foreach.h>
+#include <folly/portability/GFlags.h>
+
+#include <iostream>
+
+#include "velox/common/base/SimdUtil.h"
+
+/// Partially copied from
+/// https://github.com/facebook/folly/blob/ce5edfb9b08ead9e78cb46879e7b9499861f7cd2/folly/test/FBStringTestBenchmarks.cpp.h
+using namespace std;
+using namespace folly;
+/// Fixed seed for stable benchmark results; simdStrstr is consistently faster
+/// than std::find with other seeds as well.
+static const int seed = 123456;
+static std::mt19937 rng(seed);
+
+namespace facebook::velox {
+template <class Integral1, class Integral2>
+Integral2 random(Integral1 low, Integral2 up) {
+  std::uniform_int_distribution<> range(low, up);
+  return range(rng);
+}
+
+enum ALG { SIMD, STD, KMP, BOYER_MOORE };
+
+class KmpSearcher {
+ public:
+  KmpSearcher(const std::string& needle) : needle_(std::move(needle)) {
+    next_ = new int[1 + needle.size()];
+    initNextArr(needle);
+  }
+
+  ~KmpSearcher() {
+    delete[] next_;
+  }
+
+  size_t search(const char* heyStack, size_t heyStackSize) const {
+    int i = 0, j = 0;
+    while ((i < (int32_t)heyStackSize) && (j < (int32_t)needle_.size())) {
+      if (j == -1 || heyStack[i] == needle_[j]) {
+        i++;
+        j++;
+      } else {
+        j = next_[j];
+      }
+    }
+    if (j >= needle_.size()) {
+      return (i - needle_.size());
+    };
+    return (std::string::npos);
+  }
+
+ private:
+  void initNextArr(const string& needle) {
+    int j = 0, k = -1;
+    next_[0] = -1;
+    for (; j < needle.length();) {
+      if (k == -1 || needle[j] == needle[k]) {
+        j++;
+        k++;
+        next_[j] = k;
+      } else
+        k = next_[k];
+    }
+  }
+  std::string needle_;
+  int* next_;
+};
+
+class TestStringSearch {
+ public:
+  TestStringSearch(const std::string& heyStack, const std::string& needle)
+      : heyStack_(std::move(heyStack)),
+        needle_(std::move(needle)),
+        searher_(needle_.begin(), needle_.end()),
+        kmpSearcher_(needle_) {}
+
+  template <ALG alg>
+  void runSearching(size_t iters) const {
+    if constexpr (alg == SIMD) {
+      FOR_EACH_RANGE (i, 0, iters)
+        doNotOptimizeAway(simd::simdStrstr(
+            heyStack_.data(),
+            heyStack_.size(),
+            needle_.data(),
+            needle_.size()));
+    } else if constexpr (alg == STD) {
+      FOR_EACH_RANGE (i, 0, iters)
+        doNotOptimizeAway(
+            std::string_view(heyStack_.data(), heyStack_.size())
+                .find(std::string_view(needle_.data(), needle_.size())));
+    } else if constexpr (alg == BOYER_MOORE) {
+      FOR_EACH_RANGE (i, 0, iters)
+        doNotOptimizeAway(
+            std::search(heyStack_.begin(), heyStack_.end(), searher_));
+    } else if constexpr (alg == KMP) {
+      FOR_EACH_RANGE (i, 0, iters)
+        doNotOptimizeAway(
+            kmpSearcher_.search(heyStack_.data(), heyStack_.size()));
+    }
+  }
+
+ private:
+  std::string heyStack_;
+  std::string needle_;
+  std::boyer_moore_searcher<std::string::iterator> searher_;
+  KmpSearcher kmpSearcher_;
+};
+
+TestStringSearch generateTest(int hayStackSize, int needleSize) {
+  // Text courtesy (ahem) of
+  // http://www.psychologytoday.com/blog/career-transitions/200906/
+  // the-dreaded-writing-sample
+  // 1028chars
+  static const std::string s =
+      "\
+Even if you've mastered the art of the cover letter and the resume, \
+another part of the job search process can trip up an otherwise \
+qualified candidate: the writing sample.\n\
+\n\
+Strong writing and communication skills are highly sought after by \
+most employers. Whether crafting short emails or lengthy annual \
+reports, many workers use their writing skills every day. And for an \
+employer seeking proof behind that ubiquitous candidate \
+phrase,\"excellent communication skills\", a required writing sample \
+is invaluable.\n\
+\n\
+Writing samples need the same care and attention given to cover \
+letters and resumes. Candidates with otherwise impeccable credentials \
+are routinely eliminated by a poorly chosen writing sample. Notice I \
+said \"poorly chosen\" not \"poorly written.\" Because that's the rub: \
+a writing sample not only reveals the individual's writing skills, it \
+also offers a peek into what they consider important or relevant for \
+the position. If you miss that mark with your writing sample, don't \
+expect to get a call for an interview.";
+  auto pos = random(0, s.size() - hayStackSize);
+  auto needlePos = random(2, hayStackSize - needleSize);
+  std::string haystack = s.substr(pos, hayStackSize);
+  std::string needle = haystack.substr(needlePos, needleSize);
+  return TestStringSearch(std::move(haystack), std::move(needle));
+}
+
+void findSuccessful(
+    unsigned /*arg*/,
+    ALG alg,
+    size_t iters,
+    const TestStringSearch& testdata) {
+  switch (alg) {
+    case KMP:
+      testdata.runSearching<KMP>(iters);
+      break;
+    case STD:
+      testdata.runSearching<STD>(iters);
+      break;
+    case SIMD:
+      testdata.runSearching<SIMD>(iters);
+      break;
+    case BOYER_MOORE:
+      testdata.runSearching<BOYER_MOORE>(iters);
+      break;
+  }
+}
+
+/// Folly uses random test data for each iteration, but this cannot guarantee
+/// that the data for each test of different algorithms is the same, so we use
+/// the same random data for each comparison benchmark here.
+#define STRING_SEARCH_BENCHMARK(name, start, end, iters)             \
+  TestStringSearch test##start##end = generateTest(start, end);      \
+  BENCHMARK_NAMED_PARAM(                                             \
+      name, simd_##start##_to_##end, SIMD, iters, test##start##end); \
+  BENCHMARK_NAMED_PARAM(                                             \
+      name, std_##start##_to_##end, STD, iters, test##start##end);   \
+  BENCHMARK_NAMED_PARAM(                                             \
+      name,                                                          \
+      std_boyer_moore_##start##_to_##end,                            \
+      BOYER_MOORE,                                                   \
+      iters,                                                         \
+      test##start##end);                                             \
+                                                                     \
+  BENCHMARK_NAMED_PARAM(                                             \
+      name, kmp_##start##_to_##end, KMP, iters, test##start##end);
+
+STRING_SEARCH_BENCHMARK(findSuccessful, 50, 5, 52428800)
+STRING_SEARCH_BENCHMARK(findSuccessful, 100, 10, 52428800)
+STRING_SEARCH_BENCHMARK(findSuccessful, 100, 20, 52428800)
+STRING_SEARCH_BENCHMARK(findSuccessful, 1000, 10, 52428800)
+STRING_SEARCH_BENCHMARK(findSuccessful, 1000, 100, 5242880)
+STRING_SEARCH_BENCHMARK(findSuccessful, 1000, 200, 5242880)
+
+/// std::find only has a fast path for a mismatching first char. When the
+/// first char matches (which is highly likely in practice) but the rest of
+/// the needle does not, the performance of std::find drops significantly.
+/// The two inputs below cover the first-char-match and -mismatch scenarios.
+TestStringSearch prefixMatch = {
+    "luffily close dugouts wake about the pinto beans. pending, ironic dependencies",
+    "b???"};
+
+TestStringSearch prefixUnMatch = {
+    "luffily close dugouts wake about the pinto beans. pending, ironic dependencies",
+    "????"};
+void findUnsuccessful(
+    size_t /*arg*/,
+    bool useStd,
+    size_t iters,
+    const TestStringSearch& test) {
+  if (useStd) {
+    test.runSearching<STD>(iters);
+  } else {
+    test.runSearching<SIMD>(iters);
+  }
+}
+
+BENCHMARK_NAMED_PARAM(
+    findUnsuccessful,
+    std_first_char_match,
+    true,
+    52428800,
+    prefixMatch)
+BENCHMARK_NAMED_PARAM(
+    findUnsuccessful,
+    opt_first_char_match,
+    false,
+    52428800,
+    prefixMatch)
+BENCHMARK_NAMED_PARAM(
+    findUnsuccessful,
+    std_first_char_unmatch,
+    true,
+    52428800,
+    prefixUnMatch)
+BENCHMARK_NAMED_PARAM(
+    findUnsuccessful,
+    opt_first_char_unmatch,
+    false,
+    52428800,
+    prefixUnMatch)
+} // namespace facebook::velox
+
+int main(int argc, char** argv) {
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
+  folly::runBenchmarks();
+  return 0;
+}
diff --git a/velox/common/base/tests/SimdUtilTest.cpp b/velox/common/base/tests/SimdUtilTest.cpp
index 9dbebc060fb3..2e0d26416e73 100644
--- a/velox/common/base/tests/SimdUtilTest.cpp
+++ b/velox/common/base/tests/SimdUtilTest.cpp
@@ -109,6 +109,20 @@ class SimdUtilTest : public testing::Test {
     EXPECT_EQ(reference, target);
   }
 
+  template <class Integral1, class Integral2>
+  Integral2 random(Integral1 low, Integral2 up) {
+    std::uniform_int_distribution<> range(low, up);
+    return range(rng_);
+  }
+
+  void randomString(std::string* toFill, unsigned int maxSize = 1000) {
+    assert(toFill);
+    toFill->resize(random(0, maxSize));
+    for (int i = 0; i < toFill->size(); i++) {
+      (*toFill)[i] = random('a', 'z');
+    }
+  }
+
   folly::Random::DefaultGenerator rng_;
 };
 
@@ -491,25 +505,154 @@ TEST_F(SimdUtilTest, memcpyTime) {
   LOG(INFO) << "simd=" << simd << " sys=" << sys;
 }
 
-TEST_F(SimdUtilTest, testSimdStrStr) {
-  // 48 chars.
+/// Adapted from the tests of the std::boyer_moore_searcher proposal:
+/// https://github.com/mclow/search-library/blob/master/basic_tests.cpp
+/// Basic sanity checks: simdStrstr must agree with std::string::find.
+TEST_F(SimdUtilTest, basicSimdStrStr) {
+  auto checkOne = [](const std::string& text, const std::string& needle) {
+    auto size = text.size();
+    auto k = needle.size();
+    ASSERT_EQ(
+        simd::simdStrstr(text.data(), size, needle.data(), k),
+        text.find(needle));
+  };
+  std::string haystack1("NOW AN FOWE\220ER ANNMAN THE ANPANMANEND");
+  std::string needle1("ANPANMAN");
+  std::string needle2("MAN THE");
+  std::string needle3("WE\220ER");
+  // At the beginning
+  std::string needle4("NOW ");
+  // At the end
+  std::string needle5("NEND");
+  // Nowhere
+  std::string needle6("NOT FOUND");
+  // Nowhere
+  std::string needle7("NOT FO\340ND");
+
+  std::string haystack2("ABC ABCDAB ABCDABCDABDE");
+  std::string needle11("ABCDABD");
+
+  std::string haystack3("abra abracad abracadabra");
+  std::string needle12("abracadabra");
+
+  std::string needle13("");
+  std::string haystack4("");
+
+  checkOne(haystack1, needle1);
+  checkOne(haystack1, needle2);
+  checkOne(haystack1, needle3);
+  checkOne(haystack1, needle4);
+  checkOne(haystack1, needle5);
+  checkOne(haystack1, needle6);
+  checkOne(haystack1, needle7);
+
+  // Can't find a long pattern in a short corpus
+  checkOne(needle1, haystack1);
+  // Find something in itself
+  checkOne(haystack1, haystack1);
+  // Find something in itself
+  checkOne(haystack2, haystack2);
+
+  checkOne(haystack2, needle11);
+  checkOne(haystack3, needle12);
+  // Find the empty string
+  checkOne(haystack1, needle13);
+  // Can't find in an empty haystack
+  checkOne(haystack4, needle1);
+
+  // Comment copied from the original code.
+  // Mikhail Levin <svarneticist@gmail.com> found a problem, and this was the
+  // test that triggered it.
+  const std::string mikhailPattern =
+      "GATACACCTACCTTCACCAGTTACTCTATGCACTAGGTGCGCCAGGCCCATGCACAAGGGCTTGAGTGGATGGGAAGGA"
+      "TGTGCCCTAGTGATGGCAGCATAAGCTACGCAGAGAAGTTCCAGGGCAGAGTCACCATGACCAGGGACACATCCACGAG"
+      "CACAGCCTACATGGAGCTGAGCAGCCTGAGATCTGAAGACACGGCCATGTATTACTGTGGGAGAGATGTCTGGAGTGGT"
+      "TATTATTGCCCCGGTAATATTACTACTACTACTACTACATGGACGTCTGGGGCAAAGGGACCACG";
+  const std::string mikhailCorpus = std::string(8, 'a') + mikhailPattern;
+
+  checkOne(mikhailCorpus, mikhailPattern);
+}
+
+TEST_F(SimdUtilTest, variableNeedleSize) {
   std::string s1 = "aabbccddeeffgghhiijjkkllmmnnooppqqrrssttuuvvwwxxyyzz";
   std::string s2 = "aabbccddeeffgghhiijjkkllmmnnooppqqrrssttuuvvwwxxyyzz";
-  std::string s3 = "xxx";
+  std::string s3 = "01234567890123456789";
   auto test = [](char* text, size_t size, char* needle, size_t k) {
+    if (simd::simdStrstr(text, size, needle, k) !=
+        std::string_view(text, size).find(std::string_view(needle, k))) {
+      LOG(ERROR) << "text: " << std::string(text, size)
+                 << " needle :" << std::string(needle, k);
+    }
     ASSERT_EQ(
         simd::simdStrstr(text, size, needle, k),
         std::string_view(text, size).find(std::string_view(needle, k)));
   };
-  // Match cases : substrings in s2 should be a substring in s1.
+  // Match cases (prefix/middle/suffix): every substring of s2 is also a
+  // substring of s1. First take needles of increasing size from the start of
+  // s2, which match at the beginning of s1.
+  for (size_t k = 0; k < s2.size(); k++) {
+    test(s1.data(), s1.size(), s2.data(), k);
+  }
+  // Take needles of varying size from the middle of s2 (middle match in s1).
   for (int i = 0; i < 20; i++) {
     for (int k = 0; k < 28; k++) {
       char* data = s2.data() + i;
       test(s1.data(), s1.size(), data, k);
     }
   }
-  // Not match case : "xxx" not in s1.
-  test(s1.data(), s1.size(), s3.data(), s3.size());
+  // Take needles of increasing size from the end of s2 (suffix match in s1).
+  for (size_t k = 0; k < s2.size(); k++) {
+    char* data = s2.data() + s2.size() - k;
+    test(s1.data(), s1.size(), data, k);
+  }
+  // No-match case: non-empty prefixes of s3 (digits) never occur in s1.
+  for (size_t k = 0; k < s3.size(); k++) {
+    test(s1.data(), s1.size(), s3.data(), k);
+  }
+
+  // First char matches, the rest does not (full match only when k == 0).
+  for (size_t k = 0; k < s3.size(); k++) {
+    std::string somePrefix = "xxxxxx";
+    std::string matchString = "a" + std::string(k, 'x');
+    std::string someSuffix = "yyyyyyyy";
+    std::string text = somePrefix + matchString + someSuffix;
+    auto s = "a" + std::string(k, '9');
+    test(text.data(), text.size(), s.data(), s.size());
+  }
+  // First and last chars match, the middle does not (full match when k == 0).
+  for (size_t k = 0; k < s3.size(); k++) {
+    std::string somePrefix = "xxxxxx";
+    std::string matchString = "a" + std::string(k, 'x') + "b";
+    std::string someSuffix = "yyyyyyyy";
+    std::string text = somePrefix + matchString + someSuffix;
+    auto s = "a" + std::string(k, '9') + "b";
+    test(text.data(), text.size(), s.data(), s.size());
+  }
+}
+
+/// Adapted from folly's FBStringTest, case clause11_21_4_7_2_a1:
+/// https://github.com/facebook/folly/blob/ce5edfb9b08ead9e78cb46879e7b9499861f7cd2/folly/test/FBStringTest.cpp#L1277
+/// Searches a random substring of a random string from a random offset.
+TEST_F(SimdUtilTest, randomStringStrStr) {
+  std::string test;
+  const int kTestLoop = 1;
+  auto checkOne =
+      [](const std::string& text, const std::string& needle, size_t pos) {
+        auto size = text.length() - pos;
+        auto textPtr = text.data() + pos;
+        auto k = needle.size();
+        ASSERT_EQ(
+            simd::simdStrstr(textPtr, size, needle.data(), k),
+            text.substr(pos).find(needle));
+      };
+  for (int i = 0; i < kTestLoop; i++) {
+    // The needle is a random substring of `test`; the offset is random too.
+    randomString(&test);
+    auto from = random(0, test.size());
+    auto length = random(0, test.size() - from);
+    std::string str = test.substr(from, length);
+    checkOne(test, str, random(0, test.size()));
+  }
 }
 
 } // namespace

From cb6e208d7e327e9dc8587a676a06e207e277e2dc Mon Sep 17 00:00:00 2001
From: "hengjiang.ly" <hengjiang.ly@alibaba-inc.com>
Date: Thu, 26 Sep 2024 10:52:51 +0800
Subject: [PATCH 3/3] check the last page

---
 velox/common/base/SimdUtil-inl.h | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/velox/common/base/SimdUtil-inl.h b/velox/common/base/SimdUtil-inl.h
index dddd1de3836b..04d0628338c9 100644
--- a/velox/common/base/SimdUtil-inl.h
+++ b/velox/common/base/SimdUtil-inl.h
@@ -1470,18 +1470,16 @@ size_t FOLLY_ALWAYS_INLINE smidStrstrMemcmp(
   auto first = CharVector::broadcast(needle[0]);
   auto last = CharVector::broadcast(needle[needleSize - 1]);
   size_t i = 0;
-  // Fast path for page-safe data.
-  // It`s safe to over-read CharVector if all-data are in same page.
-  // see: https://mudongliang.github.io/x86/html/file_module_x86_id_208.html
-  // While executing in 16-bit addressing mode, a linear address for a 128-bit
-  // data access that overlaps the end of a 16-bit segment is not allowed and is
-  // defined as reserved behavior. A specific processor implementation may or
-  // may not generate a general-protection exception (#GP) in this situation,
-  // and the address that spans the end of the segment may or may not wrap
-  // around to the beginning of the segment.
-  for (; i <= n - needleSize && pageSafe(s + i) &&
-       pageSafe(s + i + needleSize - 1);
-       i += CharVector::size) {
+
+  for (; i <= n - needleSize; i += CharVector::size) {
+    // Suppose the input string occupies virtual pages VP1, VP2 and VP3, and VP4
+    // is not mapped. Before loading a CharVector that extends past the end of
+    // the input, check that the load does not cross into the next page.
+    const auto lastPos = i + needleSize - 1;
+
+    if (lastPos + CharVector::size > n && !pageSafe(s + lastPos)) {
+      break;
+    }
     auto blockFirst = CharVector::load_unaligned(s + i);
     const auto eqFirst = (first == blockFirst);
     /// std:find handle the fast-path for first-char-unmatch, so we also need
@@ -1489,7 +1487,7 @@ size_t FOLLY_ALWAYS_INLINE smidStrstrMemcmp(
     if (eqFirst.mask() == 0) {
       continue;
     }
-    auto blockLast = CharVector::load_unaligned(s + i + needleSize - 1);
+    auto blockLast = CharVector::load_unaligned(s + lastPos);
     const auto eqLast = (last == blockLast);
     auto mask = (eqFirst && eqLast).mask();
     while (mask != 0) {