Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add substrings search path for constant like pattern #10731

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions velox/docs/functions/presto/regexp.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,12 @@ limited to 20 different expressions per instance and thread of execution.
Note: Each function instance allow for a maximum of 20 regular expressions to
be compiled per thread of execution. Not all patterns require
compilation of regular expressions. Patterns 'hello', 'hello%', '_hello__%',
'%hello', '%__hello_', '%hello%', where 'hello', 'velox'
contains only regular characters and '_' wildcards are evaluated without
using regular expressions. Only those patterns that require the compilation of
regular expressions are counted towards the limit.
'%hello', '%__hello_', '%hello%' where 'hello', 'velox' contains only regular
characters and '_' wildcards are evaluated without using regular expressions,
and constant pattern '%hello%velox%' where 'hello', 'velox' contains only regular
characters(not contains '_' '#' wildcards) is evaluated with substrings-searching.
Only those patterns that require the compilation of regular expressions are
counted towards the limit.

SELECT like('abc', '%b%'); -- true
SELECT like('a_c', '%#_%', '#'); -- true
Expand Down
79 changes: 67 additions & 12 deletions velox/functions/lib/Re2Functions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -650,6 +650,22 @@ bool matchSubstringPattern(
std::string::npos);
}

bool matchSubstringsPattern(
const StringView& input,
const std::vector<std::string>& patterns) {
const char* data = input.data();
for (int i = 0; i < patterns.size(); i++) {
auto curPos =
std::string_view(data, input.end() - data)
.find(std::string_view(patterns[i].c_str(), patterns[i].size()));
if (curPos == std::string::npos) {
return false;
}
data = data + curPos + patterns[i].size();
}
return true;
}

// Return true if the input VARCHAR argument is all-ASCII for the specified
// rows.
FOLLY_ALWAYS_INLINE static bool isAsciiArg(
Expand Down Expand Up @@ -705,6 +721,8 @@ class OptimizedLike final : public exec::VectorFunction {
.first;
case PatternKind::kSubstring:
return matchSubstringPattern(input, patternMetadata.fixedPattern());
case PatternKind::kSubstrings:
return matchSubstringsPattern(input, patternMetadata.substrings());
}
} else {
switch (P) {
Expand Down Expand Up @@ -739,6 +757,8 @@ class OptimizedLike final : public exec::VectorFunction {
input, patternMetadata, input.size() - 1);
case PatternKind::kSubstring:
return matchSubstringPattern(input, patternMetadata.fixedPattern());
case PatternKind::kSubstrings:
return matchSubstringsPattern(input, patternMetadata.substrings());
}
}
}
Expand Down Expand Up @@ -1688,19 +1708,19 @@ std::vector<std::shared_ptr<exec::FunctionSignature>> re2ExtractSignatures() {
}

PatternMetadata PatternMetadata::generic() {
return {PatternKind::kGeneric, 0, "", {}};
return {PatternKind::kGeneric, 0, "", {}, {}};
}

PatternMetadata PatternMetadata::atLeastN(size_t length) {
return {PatternKind::kAtLeastN, length, "", {}};
return {PatternKind::kAtLeastN, length, "", {}, {}};
}

PatternMetadata PatternMetadata::exactlyN(size_t length) {
return {PatternKind::kExactlyN, length, "", {}};
return {PatternKind::kExactlyN, length, "", {}, {}};
}

PatternMetadata PatternMetadata::fixed(const std::string& fixedPattern) {
return {PatternKind::kFixed, fixedPattern.length(), fixedPattern, {}};
return {PatternKind::kFixed, fixedPattern.length(), fixedPattern, {}, {}};
}

PatternMetadata PatternMetadata::relaxedFixed(
Expand All @@ -1711,11 +1731,12 @@ PatternMetadata PatternMetadata::relaxedFixed(
PatternKind::kRelaxedFixed,
fixedLength,
std::move(fixedPattern),
std::move(subPatterns)};
std::move(subPatterns),
{}};
}

PatternMetadata PatternMetadata::prefix(const std::string& fixedPattern) {
return {PatternKind::kPrefix, fixedPattern.length(), fixedPattern, {}};
return {PatternKind::kPrefix, fixedPattern.length(), fixedPattern, {}, {}};
}

PatternMetadata PatternMetadata::relaxedPrefix(
Expand All @@ -1726,11 +1747,12 @@ PatternMetadata PatternMetadata::relaxedPrefix(
PatternKind::kRelaxedPrefix,
fixedLength,
std::move(fixedPattern),
std::move(subPatterns)};
std::move(subPatterns),
{}};
}

PatternMetadata PatternMetadata::suffix(const std::string& fixedPattern) {
return {PatternKind::kSuffix, fixedPattern.length(), fixedPattern, {}};
return {PatternKind::kSuffix, fixedPattern.length(), fixedPattern, {}, {}};
}

PatternMetadata PatternMetadata::relaxedSuffix(
Expand All @@ -1741,22 +1763,47 @@ PatternMetadata PatternMetadata::relaxedSuffix(
PatternKind::kRelaxedSuffix,
fixedLength,
std::move(fixedPattern),
std::move(subPatterns)};
std::move(subPatterns),
{}};
}

PatternMetadata PatternMetadata::substring(const std::string& fixedPattern) {
return {PatternKind::kSubstring, fixedPattern.length(), fixedPattern, {}};
return {PatternKind::kSubstring, fixedPattern.length(), fixedPattern, {}, {}};
}

PatternMetadata PatternMetadata::substrings(
std::vector<std::string> substrings) {
return {PatternKind::kSubstrings, 0, "", {}, std::move(substrings)};
}

std::vector<std::string> PatternMetadata::parseSubstrings(
const std::string_view& pattern) {
// Not support substrings-search with '_' for best performance.
static const re2::RE2 fullPattern(R"((%+[^%_#\\]+)+%+)");
static const re2::RE2 subPattern(R"((?:%+)([^%_#\\]+))");
re2::StringPiece full(pattern);
re2::StringPiece cur;
std::vector<std::string> substrings;
skadilover marked this conversation as resolved.
Show resolved Hide resolved
if (RE2::FullMatch(full, fullPattern)) {
while (RE2::PartialMatch(full, subPattern, &cur)) {
substrings.push_back(cur.as_string());
full.set(cur.end(), full.end() - cur.end());
}
}
return substrings;
}

PatternMetadata::PatternMetadata(
PatternKind patternKind,
size_t length,
std::string fixedPattern,
std::vector<SubPatternMetadata> subPatterns)
std::vector<SubPatternMetadata> subPatterns,
std::vector<std::string> substrings)
: patternKind_{patternKind},
length_{length},
fixedPattern_(std::move(fixedPattern)),
subPatterns_(std::move(subPatterns)) {}
subPatterns_(std::move(subPatterns)),
substrings_(std::move(substrings)) {}

// Iterates through a pattern string. Transparently handles escape sequences.
class PatternStringIterator {
Expand Down Expand Up @@ -2154,6 +2201,14 @@ std::shared_ptr<exec::VectorFunction> makeLike(

PatternMetadata patternMetadata = PatternMetadata::generic();
try {
// Fast path for substrings search.
auto substrings =
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After picking this PR, Gluten will be a unit test that fails.

Incorrect evaluation: ///__ LIKE %//%/% ESCAPE '/', actual: true, expected: false

It seems that this implementation did not take into account the ESCAPE. In this example, the last slash (/) is the ESCAPE, which causes the last percent (%) to be a regular character, not a wildcard. Therefore, the result should be false, but Velox returns true.
@skadilover @mbasmanova @Yuhta @xiaoxmeng Do you have any input?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So sorry for the mistake, I mis-understand the code here:

    BaseVector* escape = inputArgs[2].constantValue.get();
    if (!escape) {
      return std::make_shared<LikeGeneric>();
    }

and the ut just skiped by '#'

  testLike("a%c", "%#%%", '#', true);
  testLike("%cd", "%#%%", '#', true);

fix it here : #11091

PatternMetadata::parseSubstrings(std::string_view(pattern));
if (substrings.size() > 0) {
patternMetadata = PatternMetadata::substrings(std::move(substrings));
return std::make_shared<OptimizedLike<PatternKind::kSubstrings>>(
patternMetadata);
}
patternMetadata =
determinePatternKind(std::string_view(pattern), escapeChar);
} catch (...) {
Expand Down
19 changes: 17 additions & 2 deletions velox/functions/lib/Re2Functions.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
#include <vector>

#include <re2/re2.h>

#include "velox/expression/VectorFunction.h"
#include "velox/functions/Udf.h"
#include "velox/vector/BaseVector.h"
Expand Down Expand Up @@ -50,6 +49,10 @@ enum class PatternKind {
kRelaxedSuffix,
/// Patterns matching '%{c0}%', such as '%foo%%', '%%%hello%'.
kSubstring,
/// Patterns matching '%{c0}%{c1}%', such as '%%foo%%bar%%', '%foo%bar%'.
/// Note: Unlike kSubstring, kSubstrings applies only to constant patterns
/// as pattern parsing is expensive.
kSubstrings,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need both kSubstring and kSubstrings ? Can we combine these two?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

just for respect old code.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's combine these options.

Copy link
Contributor Author

@skadilover skadilover Aug 26, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Update : kSubstring is for non-constant case now.

I will add benchmark to measure determinePatternKind(), and disscuss it again, I simplely assume that parseSubstrings will add perf regression and only add it for constant-case.

/// Patterns which do not fit any of the above types, such as 'hello_world',
/// '_presto%'.
kGeneric,
Expand Down Expand Up @@ -101,6 +104,11 @@ class PatternMetadata {

static PatternMetadata substring(const std::string& fixedPattern);

static PatternMetadata substrings(std::vector<std::string> substrings);

static std::vector<std::string> parseSubstrings(
const std::string_view& pattern);

PatternKind patternKind() const {
return patternKind_;
}
Expand All @@ -117,12 +125,17 @@ class PatternMetadata {
return fixedPattern_;
}

const std::vector<std::string>& substrings() const {
return substrings_;
}

private:
PatternMetadata(
PatternKind patternKind,
size_t length,
std::string fixedPattern,
std::vector<SubPatternMetadata> subPatterns);
std::vector<SubPatternMetadata> subPatterns,
std::vector<std::string> substrings);

PatternKind patternKind_;

Expand All @@ -140,6 +153,8 @@ class PatternMetadata {
/// used for kRelaxedXxx patterns. e.g. If the pattern is: _pr_sto%, we will
/// have four sub-patterns here: _, pr, _ and sto.
std::vector<SubPatternMetadata> subPatterns_;

std::vector<std::string> substrings_;
};

inline const int kMaxCompiledRegexes = 20;
Expand Down
25 changes: 25 additions & 0 deletions velox/functions/lib/tests/Re2FunctionsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -460,6 +460,9 @@ TEST_F(Re2FunctionsTest, likePattern) {
false);

testLike("abc", "MEDIUM POLISHED%", false);

testLike("aabbccddeeff", "%aa%bb%", true);
testLike("aaccddeeff", "%aa%bb%", false);
}

TEST_F(Re2FunctionsTest, likeDeterminePatternKind) {
Expand Down Expand Up @@ -1500,5 +1503,27 @@ TEST_F(Re2FunctionsTest, split) {
assertEqualVectors(expected, result);
}

TEST_F(Re2FunctionsTest, parseSubstrings) {
auto test = [&](const std::string& input,
const std::vector<std::string>& expected) {
ASSERT_EQ(PatternMetadata::parseSubstrings(input), expected);
};
// Cases that not supported by substrings-search.
// Note: we always return LikeGeneric for escape-case, see makeLike().
test("%%", {});
// Not supports prefix.
test("aa%bb%%", {});
// Not supports sufix.
test("%aa%bb", {});
skadilover marked this conversation as resolved.
Show resolved Hide resolved
// Not supports '_'.
test("%aa_%", {});
// Not supports '#'.
test("%aa#%", {});

// Cases that supported by substrings-search.
test("%aa%", {"aa"});
test("%aa%bb%%", {"aa", "bb"});
test("%aa%bb%%%cc%", {"aa", "bb", "cc"});
}
} // namespace
} // namespace facebook::velox::functions
Loading