Commit

add ut / benchmark

skadilover committed Aug 29, 2024
1 parent a5994bc commit 457860e
Showing 6 changed files with 483 additions and 145 deletions.
140 changes: 140 additions & 0 deletions velox/common/base/SimdUtil-inl.h
@@ -1436,4 +1436,144 @@ inline bool memEqualUnsafe(const void* x, const void* y, int32_t size) {
return true;
}

namespace detail {

/// NOTE: SSE4.2 loses a lot of performance in the unmatched case, so prefer
/// AVX2 when it is available.
#if XSIMD_WITH_AVX2
using CharVector = xsimd::batch<uint8_t, xsimd::avx2>;
#elif XSIMD_WITH_NEON
using CharVector = xsimd::batch<uint8_t, xsimd::neon>;
#endif

const int kPageSize = sysconf(_SC_PAGESIZE);
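// Returns true if a CharVector-sized load starting anywhere in
// [ptr, ptr + length] stays within the page containing ptr, i.e. the
// over-read cannot cross a page boundary.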
FOLLY_ALWAYS_INLINE bool pageSafe(const void* const ptr, size_t length) {
return ((kPageSize - 1) & reinterpret_cast<std::uintptr_t>(ptr)) <=
kPageSize - CharVector::size - length;
}

template <bool compiled, size_t compiledNeedleSize>
size_t FOLLY_ALWAYS_INLINE smidStrstrMemcmp(
const char* s,
size_t n,
const char* needle,
size_t needleSize) {
static_assert(compiledNeedleSize >= 2);
VELOX_CHECK_GT(needleSize, 1);
VELOX_CHECK_GT(n, 0);
auto first = CharVector::broadcast(needle[0]);
auto last = CharVector::broadcast(needle[needleSize - 1]);
size_t i = 0;
// Fast path for page-safe data.
// It's safe to over-read a CharVector as long as all of the data read stays
// within the same page; see
// https://mudongliang.github.io/x86/html/file_module_x86_id_208.html:
// While executing in 16-bit addressing mode, a linear address for a 128-bit
// data access that overlaps the end of a 16-bit segment is not allowed and is
// defined as reserved behavior. A specific processor implementation may or
// may not generate a general-protection exception (#GP) in this situation,
// and the address that spans the end of the segment may or may not wrap
// around to the beginning of the segment.
if (pageSafe(s + n, needleSize)) {
for (; i <= n - needleSize; i += CharVector::size) {
auto blockFirst = CharVector::load_unaligned(s + i);
const auto eqFirst = (first == blockFirst);
/// std::find handles the first-character-mismatch fast path on its own, so we
/// likewise check eqFirst by itself and skip the block early when it is empty.
if (eqFirst.mask() == 0) {
continue;
}
auto blockLast = CharVector::load_unaligned(s + i + needleSize - 1);
const auto eqLast = (last == blockLast);
auto mask = (eqFirst && eqLast).mask();
while (mask != 0) {
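// The lowest set bit is the leftmost candidate position in this block.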
const auto bitpos = __builtin_ctz(mask);
if constexpr (compiled) {
if constexpr (compiledNeedleSize == 2) {
return i + bitpos;
}
if (memcmp(s + i + bitpos + 1, needle + 1, compiledNeedleSize - 2) ==
0) {
return i + bitpos;
}
} else {
if (memcmp(s + i + bitpos + 1, needle + 1, needleSize - 2) == 0) {
return i + bitpos;
}
}
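// Clear the lowest set bit and move on to the next candidate.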
mask = mask & (mask - 1);
}
}
}
// Scalar fallback for the remaining tail and for non-page-safe data.
for (; i <= n - needleSize; ++i) {
if constexpr (compiled) {
if (memcmp(s + i, needle, compiledNeedleSize) == 0) {
return i;
}
} else {
if (memcmp(s + i, needle, needleSize) == 0) {
return i;
}
}
}

return std::string::npos;
}

} // namespace detail

/// A faster substring search than std::string_view's find(), about 2x faster
/// in almost all cases as measured by StringSearchBenchmark.cpp. Uses an xsimd
/// batch to compare the first and last characters first, then a fixed-size
/// memcmp for the remaining characters. Defining it inline in the header is
/// about 30% faster.
FOLLY_ALWAYS_INLINE size_t
simdStrstr(const char* s, size_t n, const char* needle, size_t k) {
size_t result = std::string::npos;

if (n < k) {
return result;
}

switch (k) {
case 0:
return 0;

case 1: {
const char* res = strchr(s, needle[0]);

return (res != nullptr) ? res - s : std::string::npos;
}
#define FIXED_MEM_STRSTR(size) \
case size: \
result = detail::smidStrstrMemcmp<true, size>(s, n, needle, size); \
break;
FIXED_MEM_STRSTR(2)
FIXED_MEM_STRSTR(3)
FIXED_MEM_STRSTR(4)
FIXED_MEM_STRSTR(5)
FIXED_MEM_STRSTR(6)
FIXED_MEM_STRSTR(7)
FIXED_MEM_STRSTR(8)
FIXED_MEM_STRSTR(9)
FIXED_MEM_STRSTR(10)
FIXED_MEM_STRSTR(11)
FIXED_MEM_STRSTR(12)
FIXED_MEM_STRSTR(13)
FIXED_MEM_STRSTR(14)
FIXED_MEM_STRSTR(15)
FIXED_MEM_STRSTR(16)
FIXED_MEM_STRSTR(17)
FIXED_MEM_STRSTR(18)
default:
result = detail::smidStrstrMemcmp<false, 2>(s, n, needle, k);
break;
}
#undef FIXED_MEM_STRSTR
// load_unaligned over-reads for performance, so result may be larger than
// n - k; treat anything past the last valid position as a miss.
if (result <= n - k) {
return result;
} else {
return std::string::npos;
}
}
} // namespace facebook::velox::simd
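
The core filtering idea above, restated as a scalar sketch for clarity (illustrative only, not part of this commit): a position i is a candidate only when both the first and last needle characters match, and only candidates pay for the full memcmp. The SIMD path evaluates CharVector::size such positions per iteration and records the survivors as set bits in mask.

#include <cstring>
#include <string>

// Scalar model of the first/last-character filter (assumes k >= 2).
size_t scalarFirstLastStrstr(
    const char* s, size_t n, const char* needle, size_t k) {
  for (size_t i = 0; i + k <= n; ++i) {
    // Cheap filter: both endpoint characters must match...
    if (s[i] != needle[0] || s[i + k - 1] != needle[k - 1]) {
      continue;
    }
    // ...and only survivors run the full middle comparison.
    if (std::memcmp(s + i + 1, needle + 1, k - 2) == 0) {
      return i;
    }
  }
  return std::string::npos;
}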
138 changes: 0 additions & 138 deletions velox/common/base/SimdUtil.cpp
@@ -111,142 +111,4 @@ bool initializeSimdUtil() {
}

static bool FB_ANONYMOUS_VARIABLE(g_simdConstants) = initializeSimdUtil();

namespace detail {

#if XSIMD_WITH_SSE4_2
using CharVector = xsimd::batch<uint8_t, xsimd::sse4_2>;
#elif XSIMD_WITH_NEON
using CharVector = xsimd::batch<uint8_t, xsimd::neon>;
#endif

const int kPageSize = sysconf(_SC_PAGESIZE);
FOLLY_ALWAYS_INLINE bool pageSafe(const void* const ptr) {
return ((kPageSize - 1) & reinterpret_cast<std::uintptr_t>(ptr)) <=
kPageSize - CharVector::size;
}

template <bool compiled, size_t compiledNeedleSize>
size_t FOLLY_ALWAYS_INLINE smidStrstrMemcmp(
const char* s,
size_t n,
const char* needle,
size_t needleSize) {
static_assert(compiledNeedleSize >= 2);
VELOX_CHECK_GT(needleSize, 1);
VELOX_CHECK_GT(n, 0);
auto first = CharVector::broadcast(needle[0]);
auto last = CharVector::broadcast(needle[needleSize - 1]);
size_t i = 0;
// Fast path for page-safe data.
// It's safe to over-read a CharVector as long as all of the data read stays
// within the same page; see
// https://mudongliang.github.io/x86/html/file_module_x86_id_208.html:
// While executing in 16-bit addressing mode, a linear address for a 128-bit
// data access that overlaps the end of a 16-bit segment is not allowed and is
// defined as reserved behavior. A specific processor implementation may or
// may not generate a general-protection exception (#GP) in this situation,
// and the address that spans the end of the segment may or may not wrap
// around to the beginning of the segment.
for (; i <= n - needleSize && pageSafe(s + i + needleSize - 1) &&
pageSafe(s + i);
i += CharVector::size) {
auto blockFirst = CharVector::load_unaligned(s + i);
auto blockLast = CharVector::load_unaligned(s + i + needleSize - 1);

const auto eqFirst = (first == blockFirst);
const auto eqLast = (last == blockLast);

auto mask = toBitMask(eqFirst && eqLast);

while (mask != 0) {
const auto bitpos = __builtin_ctz(mask);
if constexpr (compiled) {
if constexpr (compiledNeedleSize == 2) {
return i + bitpos;
}
if (memcmp(s + i + bitpos + 1, needle + 1, compiledNeedleSize - 2) ==
0) {
return i + bitpos;
}
} else {
if (memcmp(s + i + bitpos + 1, needle + 1, needleSize - 2) == 0) {
return i + bitpos;
}
}
mask = mask & (mask - 1);
}
}
// Scalar fallback for the remaining tail and for non-page-safe data.
for (; i <= n - needleSize; ++i) {
if constexpr (compiled) {
if (memcmp(s + i, needle, compiledNeedleSize) == 0) {
return i;
}
} else {
if (memcmp(s + i, needle, needleSize) == 0) {
return i;
}
}
}

return std::string::npos;
}

} // namespace detail

/// A faster substring search than c_strstr(), about 2x faster than
/// std::string_view's find() as measured by TpchLikeBenchmark. Uses an xsimd
/// batch to compare the first and last characters first, then a fixed-size
/// memcmp for the remaining characters. Defining it inline in the header
/// would be slightly faster.
size_t simdStrstr(const char* s, size_t n, const char* needle, size_t k) {
size_t result = std::string::npos;

if (n < k) {
return result;
}

switch (k) {
case 0:
return 0;

case 1: {
const char* res = strchr(s, needle[0]);

return (res != nullptr) ? res - s : std::string::npos;
}
#define FIXED_MEM_STRSTR(size) \
case size: \
result = detail::smidStrstrMemcmp<true, size>(s, n, needle, size); \
break;
FIXED_MEM_STRSTR(2)
FIXED_MEM_STRSTR(3)
FIXED_MEM_STRSTR(4)
FIXED_MEM_STRSTR(5)
FIXED_MEM_STRSTR(6)
FIXED_MEM_STRSTR(7)
FIXED_MEM_STRSTR(8)
FIXED_MEM_STRSTR(9)
FIXED_MEM_STRSTR(10)
FIXED_MEM_STRSTR(11)
FIXED_MEM_STRSTR(12)
FIXED_MEM_STRSTR(13)
FIXED_MEM_STRSTR(14)
FIXED_MEM_STRSTR(15)
FIXED_MEM_STRSTR(16)
FIXED_MEM_STRSTR(17)
FIXED_MEM_STRSTR(18)
default:
result = detail::smidStrstrMemcmp<false, 2>(s, n, needle, k);
break;
}
#undef FIXED_MEM_STRSTR
// load_unaligned over-reads for performance, so result may be larger than
// n - k; treat anything past the last valid position as a miss.
if (result <= n - k) {
return result;
} else {
return std::string::npos;
}
}

} // namespace facebook::velox::simd
3 changes: 2 additions & 1 deletion velox/common/base/SimdUtil.h
@@ -497,7 +497,8 @@ xsimd::batch<T, A> reinterpretBatch(xsimd::batch<U, A>, const A& = {});
template <typename A = xsimd::default_arch>
inline bool memEqualUnsafe(const void* x, const void* y, int32_t size);

size_t simdStrstr(const char* s, size_t n, const char* needle, size_t k);
FOLLY_ALWAYS_INLINE size_t
simdStrstr(const char* s, size_t n, const char* needle, size_t k);

} // namespace facebook::velox::simd
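
For reference, a minimal caller-side sketch (not part of this commit; the helper name is illustrative): simdStrstr is intended to mirror std::string_view::find semantics, returning std::string::npos on a miss.

#include <string_view>

#include "velox/common/base/SimdUtil.h"

// Hypothetical check: simdStrstr should agree with std::string_view::find
// for any haystack/needle pair.
bool matchesStdFind(std::string_view haystack, std::string_view needle) {
  size_t expected = haystack.find(needle);
  size_t actual = facebook::velox::simd::simdStrstr(
      haystack.data(), haystack.size(), needle.data(), needle.size());
  return expected == actual;
}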

6 changes: 6 additions & 0 deletions velox/common/base/benchmarks/CMakeLists.txt
@@ -17,3 +17,9 @@ target_link_libraries(
velox_common_base_benchmarks
PUBLIC ${FOLLY_BENCHMARK}
PRIVATE velox_common_base Folly::folly)

add_executable(velox_common_stringsearch_benchmarks StringSearchBenchmark.cpp)
target_link_libraries(
velox_common_stringsearch_benchmarks
PUBLIC ${FOLLY_BENCHMARK}
PRIVATE velox_common_base Folly::folly)
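
StringSearchBenchmark.cpp itself is not shown in this excerpt. A minimal folly-based sketch of what such a benchmark could look like (the benchmark names and inputs here are assumptions, not the committed code):

#include <folly/Benchmark.h>
#include <folly/init/Init.h>
#include <string>
#include <string_view>

#include "velox/common/base/SimdUtil.h"

// Hypothetical inputs; the committed benchmark may use different data.
static const std::string kHaystack(10'000, 'a');
static const std::string kNeedle = "ab";

BENCHMARK(StdStringViewFind, n) {
  for (unsigned i = 0; i < n; ++i) {
    folly::doNotOptimizeAway(std::string_view(kHaystack).find(kNeedle));
  }
}

BENCHMARK_RELATIVE(SimdStrstr, n) {
  for (unsigned i = 0; i < n; ++i) {
    folly::doNotOptimizeAway(facebook::velox::simd::simdStrstr(
        kHaystack.data(), kHaystack.size(), kNeedle.data(), kNeedle.size()));
  }
}

int main(int argc, char** argv) {
  folly::Init init(&argc, &argv);
  folly::runBenchmarks();
  return 0;
}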