Skip to content

Commit

Permalink
Add substrings search path for constant like pattern (#10731)
Browse files Browse the repository at this point in the history
Summary:
We noticed that some like constant-patterns can be equivalent to substrings search such as tpch queries Q13 and Q16: "%special%requests%" "%Customer%Complaints%"

1.  Add substring-search fast path for like with search pattern : "((%+[^%#_\\]+)+%+)"
2. Some OLAP engine supports "\_" for substring-search. In order to obtain the best performance, "\_" is not supported here, '#' is not support for the same reason too.
3. This PR will also affect single substring-search like tpch Q9,  if a fast-simd-strstr to speedup string-search will be added as follow-up, changes nothing right now.

benchmark result :
before:

tpchQuery9                                                 23.96ms     41.73
tpchQuery13                                               156.02ms      6.41
tpchQuery16Part                                             9.02ms    110.83
tpchQuery16Supplier                                       192.13ms      5.20

after:

tpchQuery9                                                 24.30ms     41.14
tpchQuery13                                                55.03ms     18.17
tpchQuery16Part                                             8.92ms    112.07
tpchQuery16Supplier                                        17.64ms     56.69

Pull Request resolved: #10731

Reviewed By: xiaoxmeng

Differential Revision: D63261743

Pulled By: Yuhta

fbshipit-source-id: 3822d581790db32286bd7b9756f53401684c1012
  • Loading branch information
skadilover authored and facebook-github-bot committed Sep 24, 2024
1 parent 35b79eb commit 11e4047
Show file tree
Hide file tree
Showing 4 changed files with 115 additions and 18 deletions.
10 changes: 6 additions & 4 deletions velox/docs/functions/presto/regexp.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,12 @@ limited to 20 different expressions per instance and thread of execution.
Note: Each function instance allow for a maximum of 20 regular expressions to
be compiled per thread of execution. Not all patterns require
compilation of regular expressions. Patterns 'hello', 'hello%', '_hello__%',
'%hello', '%__hello_', '%hello%', where 'hello', 'velox'
contains only regular characters and '_' wildcards are evaluated without
using regular expressions. Only those patterns that require the compilation of
regular expressions are counted towards the limit.
'%hello', '%__hello_', '%hello%' where 'hello', 'velox' contains only regular
characters and '_' wildcards are evaluated without using regular expressions,
and constant pattern '%hello%velox%' where 'hello', 'velox' contains only regular
characters(not contains '_' '#' wildcards) is evaluated with substrings-searching.
Only those patterns that require the compilation of regular expressions are
counted towards the limit.

SELECT like('abc', '%b%'); -- true
SELECT like('a_c', '%#_%', '#'); -- true
Expand Down
79 changes: 67 additions & 12 deletions velox/functions/lib/Re2Functions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -650,6 +650,22 @@ bool matchSubstringPattern(
std::string::npos);
}

bool matchSubstringsPattern(
const StringView& input,
const std::vector<std::string>& patterns) {
const char* data = input.data();
for (int i = 0; i < patterns.size(); i++) {
auto curPos =
std::string_view(data, input.end() - data)
.find(std::string_view(patterns[i].c_str(), patterns[i].size()));
if (curPos == std::string::npos) {
return false;
}
data = data + curPos + patterns[i].size();
}
return true;
}

// Return true if the input VARCHAR argument is all-ASCII for the specified
// rows.
FOLLY_ALWAYS_INLINE static bool isAsciiArg(
Expand Down Expand Up @@ -705,6 +721,8 @@ class OptimizedLike final : public exec::VectorFunction {
.first;
case PatternKind::kSubstring:
return matchSubstringPattern(input, patternMetadata.fixedPattern());
case PatternKind::kSubstrings:
return matchSubstringsPattern(input, patternMetadata.substrings());
}
} else {
switch (P) {
Expand Down Expand Up @@ -739,6 +757,8 @@ class OptimizedLike final : public exec::VectorFunction {
input, patternMetadata, input.size() - 1);
case PatternKind::kSubstring:
return matchSubstringPattern(input, patternMetadata.fixedPattern());
case PatternKind::kSubstrings:
return matchSubstringsPattern(input, patternMetadata.substrings());
}
}
}
Expand Down Expand Up @@ -1688,19 +1708,19 @@ std::vector<std::shared_ptr<exec::FunctionSignature>> re2ExtractSignatures() {
}

PatternMetadata PatternMetadata::generic() {
return {PatternKind::kGeneric, 0, "", {}};
return {PatternKind::kGeneric, 0, "", {}, {}};
}

PatternMetadata PatternMetadata::atLeastN(size_t length) {
return {PatternKind::kAtLeastN, length, "", {}};
return {PatternKind::kAtLeastN, length, "", {}, {}};
}

PatternMetadata PatternMetadata::exactlyN(size_t length) {
return {PatternKind::kExactlyN, length, "", {}};
return {PatternKind::kExactlyN, length, "", {}, {}};
}

PatternMetadata PatternMetadata::fixed(const std::string& fixedPattern) {
return {PatternKind::kFixed, fixedPattern.length(), fixedPattern, {}};
return {PatternKind::kFixed, fixedPattern.length(), fixedPattern, {}, {}};
}

PatternMetadata PatternMetadata::relaxedFixed(
Expand All @@ -1711,11 +1731,12 @@ PatternMetadata PatternMetadata::relaxedFixed(
PatternKind::kRelaxedFixed,
fixedLength,
std::move(fixedPattern),
std::move(subPatterns)};
std::move(subPatterns),
{}};
}

PatternMetadata PatternMetadata::prefix(const std::string& fixedPattern) {
return {PatternKind::kPrefix, fixedPattern.length(), fixedPattern, {}};
return {PatternKind::kPrefix, fixedPattern.length(), fixedPattern, {}, {}};
}

PatternMetadata PatternMetadata::relaxedPrefix(
Expand All @@ -1726,11 +1747,12 @@ PatternMetadata PatternMetadata::relaxedPrefix(
PatternKind::kRelaxedPrefix,
fixedLength,
std::move(fixedPattern),
std::move(subPatterns)};
std::move(subPatterns),
{}};
}

PatternMetadata PatternMetadata::suffix(const std::string& fixedPattern) {
return {PatternKind::kSuffix, fixedPattern.length(), fixedPattern, {}};
return {PatternKind::kSuffix, fixedPattern.length(), fixedPattern, {}, {}};
}

PatternMetadata PatternMetadata::relaxedSuffix(
Expand All @@ -1741,22 +1763,47 @@ PatternMetadata PatternMetadata::relaxedSuffix(
PatternKind::kRelaxedSuffix,
fixedLength,
std::move(fixedPattern),
std::move(subPatterns)};
std::move(subPatterns),
{}};
}

PatternMetadata PatternMetadata::substring(const std::string& fixedPattern) {
return {PatternKind::kSubstring, fixedPattern.length(), fixedPattern, {}};
return {PatternKind::kSubstring, fixedPattern.length(), fixedPattern, {}, {}};
}

PatternMetadata PatternMetadata::substrings(
std::vector<std::string> substrings) {
return {PatternKind::kSubstrings, 0, "", {}, std::move(substrings)};
}

std::vector<std::string> PatternMetadata::parseSubstrings(
const std::string_view& pattern) {
// Not support substrings-search with '_' for best performance.
static const re2::RE2 fullPattern(R"((%+[^%_#\\]+)+%+)");
static const re2::RE2 subPattern(R"((?:%+)([^%_#\\]+))");
re2::StringPiece full(pattern);
re2::StringPiece cur;
std::vector<std::string> substrings;
if (RE2::FullMatch(full, fullPattern)) {
while (RE2::PartialMatch(full, subPattern, &cur)) {
substrings.push_back(cur.as_string());
full.set(cur.end(), full.end() - cur.end());
}
}
return substrings;
}

PatternMetadata::PatternMetadata(
PatternKind patternKind,
size_t length,
std::string fixedPattern,
std::vector<SubPatternMetadata> subPatterns)
std::vector<SubPatternMetadata> subPatterns,
std::vector<std::string> substrings)
: patternKind_{patternKind},
length_{length},
fixedPattern_(std::move(fixedPattern)),
subPatterns_(std::move(subPatterns)) {}
subPatterns_(std::move(subPatterns)),
substrings_(std::move(substrings)) {}

// Iterates through a pattern string. Transparently handles escape sequences.
class PatternStringIterator {
Expand Down Expand Up @@ -2154,6 +2201,14 @@ std::shared_ptr<exec::VectorFunction> makeLike(

PatternMetadata patternMetadata = PatternMetadata::generic();
try {
// Fast path for substrings search.
auto substrings =
PatternMetadata::parseSubstrings(std::string_view(pattern));
if (substrings.size() > 0) {
patternMetadata = PatternMetadata::substrings(std::move(substrings));
return std::make_shared<OptimizedLike<PatternKind::kSubstrings>>(
patternMetadata);
}
patternMetadata =
determinePatternKind(std::string_view(pattern), escapeChar);
} catch (...) {
Expand Down
19 changes: 17 additions & 2 deletions velox/functions/lib/Re2Functions.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
#include <vector>

#include <re2/re2.h>

#include "velox/expression/VectorFunction.h"
#include "velox/functions/Udf.h"
#include "velox/vector/BaseVector.h"
Expand Down Expand Up @@ -50,6 +49,10 @@ enum class PatternKind {
kRelaxedSuffix,
/// Patterns matching '%{c0}%', such as '%foo%%', '%%%hello%'.
kSubstring,
/// Patterns matching '%{c0}%{c1}%', such as '%%foo%%bar%%', '%foo%bar%'.
/// Note: Unlike kSubstring, kSubstrings applies only to constant patterns
/// as pattern parsing is expensive.
kSubstrings,
/// Patterns which do not fit any of the above types, such as 'hello_world',
/// '_presto%'.
kGeneric,
Expand Down Expand Up @@ -101,6 +104,11 @@ class PatternMetadata {

static PatternMetadata substring(const std::string& fixedPattern);

static PatternMetadata substrings(std::vector<std::string> substrings);

static std::vector<std::string> parseSubstrings(
const std::string_view& pattern);

PatternKind patternKind() const {
return patternKind_;
}
Expand All @@ -117,12 +125,17 @@ class PatternMetadata {
return fixedPattern_;
}

const std::vector<std::string>& substrings() const {
return substrings_;
}

private:
PatternMetadata(
PatternKind patternKind,
size_t length,
std::string fixedPattern,
std::vector<SubPatternMetadata> subPatterns);
std::vector<SubPatternMetadata> subPatterns,
std::vector<std::string> substrings);

PatternKind patternKind_;

Expand All @@ -140,6 +153,8 @@ class PatternMetadata {
/// used for kRelaxedXxx patterns. e.g. If the pattern is: _pr_sto%, we will
/// have four sub-patterns here: _, pr, _ and sto.
std::vector<SubPatternMetadata> subPatterns_;

std::vector<std::string> substrings_;
};

inline const int kMaxCompiledRegexes = 20;
Expand Down
25 changes: 25 additions & 0 deletions velox/functions/lib/tests/Re2FunctionsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -460,6 +460,9 @@ TEST_F(Re2FunctionsTest, likePattern) {
false);

testLike("abc", "MEDIUM POLISHED%", false);

testLike("aabbccddeeff", "%aa%bb%", true);
testLike("aaccddeeff", "%aa%bb%", false);
}

TEST_F(Re2FunctionsTest, likeDeterminePatternKind) {
Expand Down Expand Up @@ -1500,5 +1503,27 @@ TEST_F(Re2FunctionsTest, split) {
assertEqualVectors(expected, result);
}

TEST_F(Re2FunctionsTest, parseSubstrings) {
auto test = [&](const std::string& input,
const std::vector<std::string>& expected) {
ASSERT_EQ(PatternMetadata::parseSubstrings(input), expected);
};
// Cases that not supported by substrings-search.
// Note: we always return LikeGeneric for escape-case, see makeLike().
test("%%", {});
// Not supports prefix.
test("aa%bb%%", {});
// Not supports sufix.
test("%aa%bb", {});
// Not supports '_'.
test("%aa_%", {});
// Not supports '#'.
test("%aa#%", {});

// Cases that supported by substrings-search.
test("%aa%", {"aa"});
test("%aa%bb%%", {"aa", "bb"});
test("%aa%bb%%%cc%", {"aa", "bb", "cc"});
}
} // namespace
} // namespace facebook::velox::functions

0 comments on commit 11e4047

Please sign in to comment.