diff --git a/velox/common/base/SimdUtil-inl.h b/velox/common/base/SimdUtil-inl.h
index 87ff71f8b181..6a5bc78c2039 100644
--- a/velox/common/base/SimdUtil-inl.h
+++ b/velox/common/base/SimdUtil-inl.h
@@ -1436,4 +1436,144 @@ inline bool memEqualUnsafe(const void* x, const void* y, int32_t size) {
   return true;
 }
 
+namespace detail {
+
+/// NOTE: SSE4.2 is not used because it is much slower when nothing matches.
+#if XSIMD_WITH_AVX2
+using CharVector = xsimd::batch<uint8_t, xsimd::avx2>;
+#elif XSIMD_WITH_NEON
+using CharVector = xsimd::batch<uint8_t, xsimd::neon>;
+#endif // NOTE(review): no alias is defined here when neither macro is set.
+const int kPageSize = sysconf(_SC_PAGESIZE);
+/// True if CharVector::size + length bytes from 'ptr' stay inside its page.
+FOLLY_ALWAYS_INLINE bool pageSafe(const void* const ptr, size_t length) {
+  const size_t off = (kPageSize - 1) & reinterpret_cast<std::uintptr_t>(ptr);
+  return off + CharVector::size + length <= static_cast<size_t>(kPageSize);
+}
+
+/// Returns the index of the first 'needle' match found in 's', or
+/// std::string::npos. The vector path can report an index greater than
+/// n - needleSize (found in over-read bytes); the caller must reject those.
+template <bool compiled, size_t compiledNeedleSize>
+size_t FOLLY_ALWAYS_INLINE smidStrstrMemcmp(
+    const char* s,
+    size_t n,
+    const char* needle,
+    size_t needleSize) {
+  static_assert(compiledNeedleSize >= 2);
+  VELOX_CHECK_GT(needleSize, 1);
+  VELOX_CHECK_GT(n, 0);
+  auto first = CharVector::broadcast(needle[0]);
+  auto last = CharVector::broadcast(needle[needleSize - 1]);
+  size_t i = 0;
+  // Fast path: each iteration tests CharVector::size candidate start offsets
+  // by comparing the first and last needle bytes with two unaligned loads.
+  // The loads read past the end of 's', so this path is taken only when
+  // pageSafe() says the over-read stays inside the page that holds s + n.
+  // x86 reference the original author cites for over-read behavior:
+  // https://mudongliang.github.io/x86/html/file_module_x86_id_208.html
+  if (pageSafe(s + n, needleSize)) {
+    for (; i <= n - needleSize; i += CharVector::size) {
+      auto blockFirst = CharVector::load_unaligned(s + i);
+      const auto eqFirst = (first == blockFirst);
+      // Early-out: when no lane equals the first needle byte, skip the second
+      // load and the candidate loop for this block.
+      if (eqFirst.mask() == 0) {
+        continue;
+      }
+      auto blockLast = CharVector::load_unaligned(s + i + needleSize - 1);
+      const auto eqLast = (last == blockLast);
+      auto mask = (eqFirst && eqLast).mask();
+      while (mask != 0) {
+        const auto bitpos = __builtin_ctz(mask); // Next candidate lane.
+        if constexpr (compiled) {
+          if constexpr (compiledNeedleSize == 2) {
+            return i + bitpos;
+          }
+          if (memcmp(s + i + bitpos + 1, needle + 1, compiledNeedleSize - 2) ==
+              0) {
+            return i + bitpos;
+          }
+        } else {
+          if (memcmp(s + i + bitpos + 1, needle + 1, needleSize - 2) == 0) {
+            return i + bitpos;
+          }
+        }
+        mask = mask & (mask - 1); // Clear the lowest set bit.
+      }
+    }
+  }
+  // Scalar fallback; only does work when the fast path above was skipped.
+  for (; i <= n - needleSize; ++i) {
+    if constexpr (compiled) {
+      if (memcmp(s + i, needle, compiledNeedleSize) == 0) {
+        return i;
+      }
+    } else {
+      if (memcmp(s + i, needle, needleSize) == 0) {
+        return i;
+      }
+    }
+  }
+
+  return std::string::npos;
+};
+
+} // namespace detail
+
+/// Returns the offset of the first occurrence of needle[0, k) in s[0, n), or
+/// std::string::npos if there is none. An empty needle matches at offset 0.
+/// Candidates are found by comparing the first and last needle bytes with an
+/// xsimd batch; memcmp then checks the bytes in between, with a compile-time
+/// length for needles of 2..18 bytes. The original author reports roughly 2x
+/// the speed of string_view::find() in most cases (StringSearchBenchmark.cpp)
+/// and a further 30% from defining the function inline in this header.
+FOLLY_ALWAYS_INLINE size_t
+simdStrstr(const char* s, size_t n, const char* needle, size_t k) {
+  size_t result = std::string::npos;
+
+  if (n < k) {
+    return result;
+  }
+
+  switch (k) {
+    case 0:
+      return 0;
+
+    case 1: {
+      // memchr rather than strchr: 's' need not be NUL-terminated, and only
+      // its first n bytes may be searched.
+      const auto* res = static_cast<const char*>(memchr(s, needle[0], n));
+      return (res != nullptr) ? res - s : std::string::npos;
+    }
+#define FIXED_MEM_STRSTR(size)                                         \
+  case size:                                                           \
+    result = detail::smidStrstrMemcmp<true, size>(s, n, needle, size); \
+    break;
+      FIXED_MEM_STRSTR(2)
+      FIXED_MEM_STRSTR(3)
+      FIXED_MEM_STRSTR(4)
+      FIXED_MEM_STRSTR(5)
+      FIXED_MEM_STRSTR(6)
+      FIXED_MEM_STRSTR(7)
+      FIXED_MEM_STRSTR(8)
+      FIXED_MEM_STRSTR(9)
+      FIXED_MEM_STRSTR(10)
+      FIXED_MEM_STRSTR(11)
+      FIXED_MEM_STRSTR(12)
+      FIXED_MEM_STRSTR(13)
+      FIXED_MEM_STRSTR(14)
+      FIXED_MEM_STRSTR(15)
+      FIXED_MEM_STRSTR(16)
+      FIXED_MEM_STRSTR(17)
+      FIXED_MEM_STRSTR(18)
+    default:
+      result = detail::smidStrstrMemcmp<false, 2>(s, n, needle, k);
+      break;
+  }
+#undef FIXED_MEM_STRSTR
+  // The vector path over-reads past s + n, so it can report a match that
+  // starts after n - k; such a result is not a real occurrence.
+  return result <= n - k ? result : std::string::npos;
+}
 } // namespace facebook::velox::simd
diff --git a/velox/common/base/SimdUtil.cpp b/velox/common/base/SimdUtil.cpp
index 3f7d0de91d5a..f59ad20b7a3a 100644
--- a/velox/common/base/SimdUtil.cpp
+++ b/velox/common/base/SimdUtil.cpp
@@ -111,142 +111,4 @@ bool initializeSimdUtil() {
 }
 
 static bool FB_ANONYMOUS_VARIABLE(g_simdConstants) = initializeSimdUtil();
-
-namespace detail {
-
-#if XSIMD_WITH_SSE4_2
-using CharVector = xsimd::batch<uint8_t, xsimd::sse4_2>;
-#elif XSIMD_WITH_NEON
-using CharVector = xsimd::batch<uint8_t, xsimd::neon>;
-#endif
-
-const int kPageSize = sysconf(_SC_PAGESIZE);
-FOLLY_ALWAYS_INLINE bool pageSafe(const void* const ptr) {
-  return ((kPageSize - 1) & reinterpret_cast<std::uintptr_t>(ptr)) <=
-      kPageSize - CharVector::size;
-}
-
-template <bool compiled, size_t compiledNeedleSize>
-size_t FOLLY_ALWAYS_INLINE smidStrstrMemcmp(
-    const char* s,
-    size_t n,
-    const char* needle,
-    size_t needleSize) {
-  static_assert(compiledNeedleSize >= 2);
-  VELOX_CHECK_GT(needleSize, 1);
-  VELOX_CHECK_GT(n, 0);
-  auto first = CharVector::broadcast(needle[0]);
-  auto last = CharVector::broadcast(needle[needleSize - 1]);
-  size_t i = 0;
-  // Fast path for page-safe data.
-  // It`s safe to over-read CharVector if all-data are in same page.
-  // see: https://mudongliang.github.io/x86/html/file_module_x86_id_208.html
-  // While executing in 16-bit addressing mode, a linear address for a 128-bit
-  // data access that overlaps the end of a 16-bit segment is not allowed and is
-  // defined as reserved behavior. A specific processor implementation may or
-  // may not generate a general-protection exception (#GP) in this situation,
-  // and the address that spans the end of the segment may or may not wrap
-  // around to the beginning of the segment.
-  for (; i <= n - needleSize && pageSafe(s + i + needleSize - 1) &&
-       pageSafe(s + i);
-       i += CharVector::size) {
-    auto blockFirst = CharVector::load_unaligned(s + i);
-    auto blockLast = CharVector::load_unaligned(s + i + needleSize - 1);
-
-    const auto eqFirst = (first == blockFirst);
-    const auto eqLast = (last == blockLast);
-
-    auto mask = toBitMask(eqFirst && eqLast);
-
-    while (mask != 0) {
-      const auto bitpos = __builtin_ctz(mask);
-      if constexpr (compiled) {
-        if constexpr (compiledNeedleSize == 2) {
-          return i + bitpos;
-        }
-        if (memcmp(s + i + bitpos + 1, needle + 1, compiledNeedleSize - 2) ==
-            0) {
-          return i + bitpos;
-        }
-      } else {
-        if (memcmp(s + i + bitpos + 1, needle + 1, needleSize - 2) == 0) {
-          return i + bitpos;
-        }
-      }
-      mask = mask & (mask - 1);
-    }
-  }
-  // Fallback path for generic path.
-  for (; i <= n - needleSize; ++i) {
-    if constexpr (compiled) {
-      if (memcmp(s + i, needle, compiledNeedleSize) == 0) {
-        return i;
-      }
-    } else {
-      if (memcmp(s + i, needle, needleSize) == 0) {
-        return i;
-      }
-    }
-  }
-
-  return std::string::npos;
-};
-
-} // namespace detail
-
-/// A faster implementation for c_strstr(), about 2x faster than string_view`s
-/// find(), proved by TpchLikeBenchmark. Use xsmid-batch to compare first&&last
-/// char first, use fixed-memcmp to compare left chars. Inline in header file
-/// will be a little faster.
-size_t simdStrstr(const char* s, size_t n, const char* needle, size_t k) {
-  size_t result = std::string::npos;
-
-  if (n < k) {
-    return result;
-  }
-
-  switch (k) {
-    case 0:
-      return 0;
-
-    case 1: {
-      const char* res = strchr(s, needle[0]);
-
-      return (res != nullptr) ? res - s : std::string::npos;
-    }
-#define FIXED_MEM_STRSTR(size)                                         \
-  case size:                                                           \
-    result = detail::smidStrstrMemcmp<true, size>(s, n, needle, size); \
-    break;
-      FIXED_MEM_STRSTR(2)
-      FIXED_MEM_STRSTR(3)
-      FIXED_MEM_STRSTR(4)
-      FIXED_MEM_STRSTR(5)
-      FIXED_MEM_STRSTR(6)
-      FIXED_MEM_STRSTR(7)
-      FIXED_MEM_STRSTR(8)
-      FIXED_MEM_STRSTR(9)
-      FIXED_MEM_STRSTR(10)
-      FIXED_MEM_STRSTR(11)
-      FIXED_MEM_STRSTR(12)
-      FIXED_MEM_STRSTR(13)
-      FIXED_MEM_STRSTR(14)
-      FIXED_MEM_STRSTR(15)
-      FIXED_MEM_STRSTR(16)
-      FIXED_MEM_STRSTR(17)
-      FIXED_MEM_STRSTR(18)
-    default:
-      result = detail::smidStrstrMemcmp<false, 2>(s, n, needle, k);
-      break;
-  }
-#undef FIXED_MEM_STRSTR
-  // load_unaligned is used for better performance, so result maybe bigger than
-  // n-k.
-  if (result <= n - k) {
-    return result;
-  } else {
-    return std::string::npos;
-  }
-}
-
 } // namespace facebook::velox::simd
diff --git a/velox/common/base/SimdUtil.h b/velox/common/base/SimdUtil.h
index ba63d3c1d237..5230abe6ff1f 100644
--- a/velox/common/base/SimdUtil.h
+++ b/velox/common/base/SimdUtil.h
@@ -497,7 +497,8 @@ xsimd::batch<T, A> reinterpretBatch(xsimd::batch<U, A>, const A& = {});
 template <typename A = xsimd::default_arch>
 inline bool memEqualUnsafe(const void* x, const void* y, int32_t size);
 
-size_t simdStrstr(const char* s, size_t n, const char* needle, size_t k);
+FOLLY_ALWAYS_INLINE size_t
+simdStrstr(const char* s, size_t n, const char* needle, size_t k);
 
 } // namespace facebook::velox::simd
 
diff --git a/velox/common/base/benchmarks/CMakeLists.txt b/velox/common/base/benchmarks/CMakeLists.txt
index 065db83c3672..a8e956f8542a 100644
--- a/velox/common/base/benchmarks/CMakeLists.txt
+++ b/velox/common/base/benchmarks/CMakeLists.txt
@@ -17,3 +17,9 @@ target_link_libraries(
   velox_common_base_benchmarks
   PUBLIC ${FOLLY_BENCHMARK}
   PRIVATE velox_common_base Folly::folly)
+
+add_executable(velox_common_stringsearch_benchmarks StringSearchBenchmark.cpp)
+target_link_libraries(
+  velox_common_stringsearch_benchmarks
+  PUBLIC ${FOLLY_BENCHMARK}
+  PRIVATE velox_common_base Folly::folly)
diff --git a/velox/common/base/benchmarks/StringSearchBenchmark.cpp b/velox/common/base/benchmarks/StringSearchBenchmark.cpp
new file mode 100644
index 000000000000..562c1fe4c8be
--- /dev/null
+++ b/velox/common/base/benchmarks/StringSearchBenchmark.cpp
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <folly/FBString.h>
+
+#include <cstdlib>
+#include <fstream>
+#include <list>
+#include <random>
+#include <sstream>
+
+#include <folly/Benchmark.h>
+#include <folly/Random.h>
+#include <folly/container/Foreach.h>
+#include <folly/portability/GFlags.h>
+#include "velox/common/base/SimdUtil.h"
+
+/// Parts of this file are copied from
+/// https://github.com/facebook/folly/blob/ce5edfb9b08ead9e78cb46879e7b9499861f7cd2/folly/test/FBStringTestBenchmarks.cpp.h
+using namespace std;
+using namespace folly;
+static const int seed = folly::randomNumberSeed();
+using RandomT = std::mt19937;
+static RandomT rng(seed);
+
+namespace facebook::velox {
+/// Returns a uniformly distributed integer in [low, up], drawn from 'rng'.
+template <class Integral1, class Integral2>
+Integral2 random(Integral1 low, Integral2 up) {
+  return std::uniform_int_distribution<>(low, up)(rng);
+}
+
+struct TestData {
+  std::string heyStack; // Text that is searched (the "haystack").
+  std::string needle; // Substring that is searched for.
+};
+
+TestData generateTestData(int hayStackSize, int needleSize) {
+  // Returns a random hayStackSize-char window of the text below as haystack
+  // and a random needleSize-char slice of that window as needle.
+  // Text (1028 chars) courtesy (ahem) of http://www.psychologytoday.com/blog/
+  // career-transitions/200906/the-dreaded-writing-sample
+  const std::string s =
+      "\
+Even if you've mastered the art of the cover letter and the resume, \
+another part of the job search process can trip up an otherwise \
+qualified candidate: the writing sample.\n\
+\n\
+Strong writing and communication skills are highly sought after by \
+most employers. Whether crafting short emails or lengthy annual \
+reports, many workers use their writing skills every day. And for an \
+employer seeking proof behind that ubiquitous candidate \
+phrase,\"excellent communication skills\", a required writing sample \
+is invaluable.\n\
+\n\
+Writing samples need the same care and attention given to cover \
+letters and resumes. Candidates with otherwise impeccable credentials \
+are routinely eliminated by a poorly chosen writing sample. Notice I \
+said \"poorly chosen\" not \"poorly written.\" Because that's the rub: \
+a writing sample not only reveals the individual's writing skills, it \
+also offers a peek into what they consider important or relevant for \
+the position. If you miss that mark with your writing sample, don't \
+expect to get a call for an interview.";
+  auto pos = random(0, s.size() - hayStackSize);
+  std::string haystack = s.substr(pos, hayStackSize);
+  auto needlePos = random(0, hayStackSize - needleSize);
+  std::string needle = haystack.substr(needlePos, needleSize);
+  return TestData{std::move(haystack), std::move(needle)};
+}
+
+/// Runs 'iters' searches using std::string::find (useStd) or simdStrstr.
+void findSuccessful(
+    size_t /*arg*/,
+    bool useStd,
+    size_t iters,
+    const TestData& testdata) {
+  const std::string& text = testdata.heyStack;
+  const std::string& pattern = testdata.needle;
+  if (useStd) {
+    for (size_t round = 0; round < iters; ++round) {
+      doNotOptimizeAway(text.find(pattern));
+    }
+  } else {
+    for (size_t round = 0; round < iters; ++round) {
+      doNotOptimizeAway(simd::simdStrstr(
+          text.data(), text.size(), pattern.data(), pattern.size()));
+    }
+  }
+}
+
+/// Folly uses random test data for each iteration, but this cannot guarantee
+/// that the data for each test of different algorithms is the same, so we use
+/// the same random data for each comparison benchmark here.
+TestData data50to5 = generateTestData(50, 5);
+TestData data100to10 = generateTestData(100, 10);
+TestData data100to20 = generateTestData(100, 20);
+TestData data1000to10 = generateTestData(1000, 10);
+TestData data1000to100 = generateTestData(1000, 100);
+
+BENCHMARK_NAMED_PARAM(findSuccessful, opt_50_5, false, 5242880, data50to5)
+BENCHMARK_NAMED_PARAM(findSuccessful, opt_100_10, false, 5242880, data100to10)
+BENCHMARK_NAMED_PARAM(findSuccessful, opt_100_20, false, 5242880, data100to20)
+BENCHMARK_NAMED_PARAM(findSuccessful, opt_1k_10, false, 5242880, data1000to10)
+BENCHMARK_NAMED_PARAM(findSuccessful, opt_1k_100, false, 5242880, data1000to100)
+BENCHMARK_NAMED_PARAM(findSuccessful, std_50_5, true, 5242880, data50to5)
+BENCHMARK_NAMED_PARAM(findSuccessful, std_100_10, true, 5242880, data100to10)
+BENCHMARK_NAMED_PARAM(findSuccessful, std_100_20, true, 5242880, data100to20)
+BENCHMARK_NAMED_PARAM(findSuccessful, std_1k_10, true, 5242880, data1000to10)
+BENCHMARK_NAMED_PARAM(findSuccessful, std_1k_100, true, 5242880, data1000to100)
+
+/// std::find only has a fast path for a first char that does not match. When
+/// the first char does match (which in practice happens with high
+/// probability), its performance drops significantly, so both cases are
+/// benchmarked here.
+TestData prefixMatch = {
+    "luffily close dugouts wake about the pinto beans. pending, ironic dependencies",
+    "b???"};
+
+TestData prefixUnMatch = {
+    "luffily close dugouts wake about the pinto beans. pending, ironic dependencies",
+    "????"};
+
+/// Runs 'iters' searches through std::string_view::find when 'useStd' is
+/// set, otherwise through simd::simdStrstr. Intended for needles that do not
+/// occur in the haystack; the first argument is ignored.
+void findUnsuccessful(
+    size_t /*arg*/,
+    bool useStd,
+    size_t iters,
+    const TestData& test) {
+  const std::string_view text(test.heyStack);
+  const std::string_view pattern(test.needle);
+  if (useStd) {
+    for (size_t round = 0; round < iters; ++round) {
+      doNotOptimizeAway(text.find(pattern));
+    }
+  } else {
+    for (size_t round = 0; round < iters; ++round) {
+      doNotOptimizeAway(simd::simdStrstr(
+          text.data(), text.size(), pattern.data(), pattern.size()));
+    }
+  }
+}
+BENCHMARK_NAMED_PARAM(
+    findUnsuccessful,
+    std_first_char_match,
+    true,
+    52428800,
+    prefixMatch)
+BENCHMARK_NAMED_PARAM(
+    findUnsuccessful,
+    opt_first_char_match,
+    false,
+    52428800,
+    prefixMatch)
+BENCHMARK_NAMED_PARAM(
+    findUnsuccessful,
+    std_first_char_unmatch,
+    true,
+    52428800,
+    prefixUnMatch)
+BENCHMARK_NAMED_PARAM(
+    findUnsuccessful,
+    opt_first_char_unmatch,
+    false,
+    52428800,
+    prefixUnMatch)
+} // namespace facebook::velox
+int main(int argc, char** argv) {
+  gflags::ParseCommandLineFlags(&argc, &argv, true); // Parse gflags options.
+  folly::runBenchmarks(); // Run the registered folly benchmarks.
+  return 0;
+}
diff --git a/velox/common/base/tests/SimdUtilTest.cpp b/velox/common/base/tests/SimdUtilTest.cpp
index 9dbebc060fb3..e8ea748299cb 100644
--- a/velox/common/base/tests/SimdUtilTest.cpp
+++ b/velox/common/base/tests/SimdUtilTest.cpp
@@ -109,6 +109,20 @@ class SimdUtilTest : public testing::Test {
     EXPECT_EQ(reference, target);
   }
 
+  /// Returns a uniformly distributed integer in [low, up], drawn from rng_.
+  template <class Integral1, class Integral2>
+  Integral2 random(Integral1 low, Integral2 up) {
+    return std::uniform_int_distribution<>(low, up)(rng_);
+  }
+
+  void randomString(std::string* toFill, unsigned int maxSize = 1000) {
+    assert(toFill);
+    toFill->resize(random(0, maxSize)); // Random length in [0, maxSize].
+    for (char& ch : *toFill) {
+      ch = random('a', 'z'); // Each char is drawn from 'a'..'z'.
+    }
+  }
+
   folly::Random::DefaultGenerator rng_;
 };
 
@@ -491,25 +505,154 @@ TEST_F(SimdUtilTest, memcpyTime) {
   LOG(INFO) << "simd=" << simd << " sys=" << sys;
 }
 
-TEST_F(SimdUtilTest, testSimdStrStr) {
-  // 48 chars.
+/// Copied from the std::boyer_moore_searcher proposal's tests:
+/// https://github.com/mclow/search-library/blob/master/basic_tests.cpp
+/// Basic sanity check: simdStrstr must agree with std::string::find.
+TEST_F(SimdUtilTest, basicSimdStrStr) {
+  auto checkOne = [](const std::string& text, const std::string& needle) {
+    auto size = text.size();
+    auto k = needle.size();
+    ASSERT_EQ(
+        simd::simdStrstr(text.data(), size, needle.data(), k),
+        text.find(needle));
+  };
+  std::string haystack1("NOW AN FOWE\220ER ANNMAN THE ANPANMANEND");
+  std::string needle1("ANPANMAN");
+  std::string needle2("MAN THE");
+  std::string needle3("WE\220ER");
+  // At the beginning
+  std::string needle4("NOW ");
+  // At the end
+  std::string needle5("NEND");
+  // Nowhere
+  std::string needle6("NOT FOUND");
+  // Nowhere
+  std::string needle7("NOT FO\340ND");
+
+  std::string haystack2("ABC ABCDAB ABCDABCDABDE");
+  std::string needle11("ABCDABD");
+
+  std::string haystack3("abra abracad abracadabra");
+  std::string needle12("abracadabra");
+
+  std::string needle13("");
+  std::string haystack4("");
+
+  checkOne(haystack1, needle1);
+  checkOne(haystack1, needle2);
+  checkOne(haystack1, needle3);
+  checkOne(haystack1, needle4);
+  checkOne(haystack1, needle5);
+  checkOne(haystack1, needle6);
+  checkOne(haystack1, needle7);
+
+  // Can't find a long pattern in a short corpus
+  checkOne(needle1, haystack1);
+  // Find something in itself
+  checkOne(haystack1, haystack1);
+  // Find something in itself
+  checkOne(haystack2, haystack2);
+
+  checkOne(haystack2, needle11);
+  checkOne(haystack3, needle12);
+  // Find the empty string
+  checkOne(haystack1, needle13);
+  // Can't find in an empty haystack
+  checkOne(haystack4, needle1);
+
+  // Comment copied from the original code:
+  // Mikhail Levin <svarneticist@gmail.com> found a problem, and this was the
+  // test that triggered it.
+  const std::string mikhailPattern =
+      "GATACACCTACCTTCACCAGTTACTCTATGCACTAGGTGCGCCAGGCCCATGCACAAGGGCTTGAGTGGATGGGAAGGA"
+      "TGTGCCCTAGTGATGGCAGCATAAGCTACGCAGAGAAGTTCCAGGGCAGAGTCACCATGACCAGGGACACATCCACGAG"
+      "CACAGCCTACATGGAGCTGAGCAGCCTGAGATCTGAAGACACGGCCATGTATTACTGTGGGAGAGATGTCTGGAGTGGT"
+      "TATTATTGCCCCGGTAATATTACTACTACTACTACTACATGGACGTCTGGGGCAAAGGGACCACG";
+  const std::string mikhailCorpus = std::string(8, 'a') + mikhailPattern;
+
+  checkOne(mikhailCorpus, mikhailPattern);
+}
+
+TEST_F(SimdUtilTest, variableNeedleSize) {
   std::string s1 = "aabbccddeeffgghhiijjkkllmmnnooppqqrrssttuuvvwwxxyyzz";
   std::string s2 = "aabbccddeeffgghhiijjkkllmmnnooppqqrrssttuuvvwwxxyyzz";
-  std::string s3 = "xxx";
+  std::string s3 = "01234567890123456789";
   auto test = [](char* text, size_t size, char* needle, size_t k) {
+    if (simd::simdStrstr(text, size, needle, k) !=
+        std::string_view(text, size).find(std::string_view(needle, k))) {
+      LOG(ERROR) << "text: " << std::string(text, size)
+                 << " needle :" << std::string(needle, k);
+    }
     ASSERT_EQ(
         simd::simdStrstr(text, size, needle, k),
         std::string_view(text, size).find(std::string_view(needle, k)));
   };
-  // Match cases : substrings in s2 should be a substring in s1.
+  // Match cases (prefix/middle/suffix): every substring of s2 is also a
+  // substring of s1. First take needles of each size from the start of s2,
+  // which match at the start of s1.
+  for (int k = 0; k < s2.size(); k++) {
+    test(s1.data(), s1.size(), s2.data(), k);
+  }
+  // Needles of sizes 0..27 starting at offsets 0..19 of s2 (middle matches).
   for (int i = 0; i < 20; i++) {
     for (int k = 0; k < 28; k++) {
       char* data = s2.data() + i;
       test(s1.data(), s1.size(), data, k);
     }
   }
-  // Not match case : "xxx" not in s1.
-  test(s1.data(), s1.size(), s3.data(), s3.size());
+  // Needles of each size taken from the end of s2, matching the tail of s1.
+  for (int k = 0; k < s2.size(); k++) {
+    char* data = s2.data() + s2.size() - k;
+    test(s1.data(), s1.size(), data, k);
+  }
+  // No-match case: non-empty prefixes of s3 (digits) do not occur in s1.
+  for (auto k = 0; k < s3.size(); k++) {
+    test(s1.data(), s1.size(), s3.data(), k);
+  }
+
+  // First needle char matches the text; for k > 0 the remaining chars do not.
+  for (auto k = 0; k < s3.size(); k++) {
+    std::string somePrefix = "xxxxxx";
+    std::string matchString = "a" + std::string(k, 'x');
+    std::string someSuffix = "yyyyyyyy";
+    std::string text = somePrefix + matchString + someSuffix;
+    auto s = "a" + std::string(k, '9');
+    test(text.data(), text.size(), s.data(), s.size());
+  }
+  // First and last needle chars match; for k > 0 the chars between do not.
+  for (auto k = 0; k < s3.size(); k++) {
+    std::string somePrefix = "xxxxxx";
+    std::string matchString = "a" + std::string(k, 'x') + "b";
+    std::string someSuffix = "yyyyyyyy";
+    std::string text = somePrefix + matchString + someSuffix;
+    auto s = "a" + std::string(k, '9') + "b";
+    test(text.data(), text.size(), s.data(), s.size());
+  }
+}
+
+/// Adapted from folly's FBStringTest case clause11_21_4_7_2_a1:
+/// https://github.com/facebook/folly/blob/ce5edfb9b08ead9e78cb46879e7b9499861f7cd2/folly/test/FBStringTest.cpp#L1277
+/// simdStrstr on a random text suffix must agree with std::string::find.
+TEST_F(SimdUtilTest, randomStringStrStr) {
+  std::string test;
+  const int kTestLoop = 1000;
+  auto checkOne =
+      [](const std::string& text, const std::string& needle, size_t pos) {
+        auto size = text.length() - pos;
+        auto textPtr = text.data() + pos;
+        auto k = needle.size();
+        ASSERT_EQ(
+            simd::simdStrstr(textPtr, size, needle.data(), k),
+            text.substr(pos).find(needle));
+      };
+  for (int i = 0; i < kTestLoop; i++) {
+    // Random text, a random substring of it as needle, a random start offset.
+    randomString(&test);
+    auto from = random(0, test.size());
+    auto length = random(0, test.size() - from);
+    std::string str = test.substr(from, length);
+    checkOne(test, str, random(0, test.size()));
+  }
 }
 
 } // namespace