Skip to content

Commit

Permalink
Merge pull request #459 from VeriFIT/re2parser-bug-begin-end-line-mar…
Browse files Browse the repository at this point in the history
…kers

RE2Parser bug with regex begin line and end line markers #patch
  • Loading branch information
Adda0 authored Nov 18, 2024
2 parents 8b885af + 459c6cd commit e806b8d
Show file tree
Hide file tree
Showing 2 changed files with 183 additions and 21 deletions.
50 changes: 30 additions & 20 deletions src/re2parser.cc
Original file line number Diff line number Diff line change
Expand Up @@ -136,11 +136,14 @@ namespace {
this->outgoingEdges = std::vector<std::vector<std::pair<mata::Symbol, mata::nfa::State>>> (prog_size);

// We traverse all the states and create corresponding states and edges in Nfa
for (mata::nfa::State current_state = start_state; current_state < prog_size; current_state++) {
re2::Prog::Inst *inst = prog->inst(static_cast<int>(current_state));
for (State current_state = start_state, re2_state = start_state; re2_state < prog_size; ++
re2_state) {
/// Whether to increment the current state @c current_state when the @c re2_state increments.
bool increment_current_state{true};
re2::Prog::Inst *inst = prog->inst(static_cast<int>(re2_state));
// Every type of state can be final (due to epsilon transition), so we check it regardless of its type
if (this->state_cache.is_final_state[current_state]) {
this->make_state_final(current_state, explicit_nfa);
if (this->state_cache.is_final_state[re2_state]) {
this->make_state_final(re2_state, explicit_nfa);
}
switch (inst->opcode()) {
default:
Expand All @@ -164,33 +167,35 @@ namespace {
empty_flag = static_cast<int>(inst->empty());
// ^ - beginning of line
if (empty_flag & re2::kEmptyBeginLine) {
// TODO Symbol?
symbols.push_back(300);
increment_current_state = false;
}
// $ - end of line
if (empty_flag & re2::kEmptyEndLine) {
// TODO Symbol?
symbols.push_back(10);
// TODO How to handle?
// symbols.push_back(301);
increment_current_state = false;
}
// \A - beginning of text
if (empty_flag & re2::kEmptyBeginText) {
// TODO Symbol?
symbols.push_back(301);
increment_current_state = false;
}
// \z - end of text
if (empty_flag & re2::kEmptyEndText) {
// TODO Symbol?
symbols.push_back(302);
// TODO How to handle?
// symbols.push_back(302);
increment_current_state = false;
}
// \b - word boundary
if (empty_flag & re2::kEmptyWordBoundary) {
// TODO Symbol?
symbols.push_back(303);
// TODO How to handle?
// symbols.push_back(303);
increment_current_state = false;
}
// \B - not \b
if (empty_flag & re2::kEmptyNonWordBoundary) {
// TODO Symbol?
symbols.push_back(304);
// TODO How to handle?
// symbols.push_back(304);
increment_current_state = false;
}
break;
// kInstByteRange represents states with a "byte range" on the outgoing transition(s)
Expand All @@ -212,15 +217,17 @@ namespace {
if (!use_epsilon) {
// There is an epsilon transition to currentState+1, so we will need to copy the transitions of
// currentState+1 to currentState.
if (!this->state_cache.is_last[current_state]) {
for (auto state: this->state_cache.state_mapping[current_state + 1]) {
copyEdgesFromTo.emplace_back(state, current_state);
if (!this->state_cache.is_last[re2_state]) {
for (auto state: this->state_cache.state_mapping[re2_state + 1]) {
copyEdgesFromTo.emplace_back(state, re2_state);
}
}
}
symbols.clear();
break;
}

if (increment_current_state) { ++current_state; }
}
if (!use_epsilon) {
// We will traverse the vector in reversed order. Like that, we will also handle chains of epsilon transitions
Expand Down Expand Up @@ -419,7 +426,8 @@ namespace {
if (inst->last()) {
this->state_cache.is_last[state] = true;
}
if (inst->opcode() == re2::kInstMatch) {
if (inst->opcode() == re2::kInstMatch ||
(inst->opcode() == re2::kInstEmptyWidth && inst->empty() & re2::kEmptyEndText)) {
this->state_cache.is_final_state[state] = true;
}
}
Expand Down Expand Up @@ -505,6 +513,8 @@ void mata::parser::create_nfa(nfa::Nfa* nfa, const std::string& pattern, bool us
RegexParser regexParser{};
auto parsed_regex = regexParser.parse_regex_string(pattern);
auto program = parsed_regex->CompileToProg(regexParser.options.max_mem() * 2 / 3);
// FIXME: use_epsilon = false completely breaks the method convert_pro_to_nfa(). This needs to be fixed before
// the argument use_epsilon can be passed through to convert_pro_to_nfa().
regexParser.convert_pro_to_nfa(nfa, program, true, epsilon_value);
delete program;
// Decrements reference count and deletes object if the count reaches 0
Expand Down
154 changes: 153 additions & 1 deletion tests/re2parser.cc
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
#include <catch2/catch_test_macros.hpp>
#include <catch2/matchers/catch_matchers_string.hpp>

#include "mata/nfa/nfa.hh"
#include "mata/parser/re2parser.hh"
#include "mata/nfa/builder.hh"
#include "mata/nfa/nfa.hh"

using namespace mata::nfa;

using Symbol = mata::Symbol;
Expand Down Expand Up @@ -1333,3 +1335,153 @@ TEST_CASE("mata::Parser bug epsilon")
CHECK(x.is_in_lang(Run{Word{'a', 'a', 'a', 'a'}, {}}));
}
} // }}}

TEST_CASE("mata::parser Parsing regexes with ^ and $") {
    // `nfa` receives the automaton built by the regex parser; `expected` is the reference automaton
    // that each section constructs and compares against.
    Nfa nfa;
    Nfa expected{};

    // Builds in `expected` the two-state automaton with transitions 0 -a-> 1 and 0 -b-> 1,
    // initial state 0 and final state 1.
    const auto expect_a_or_b = [&expected]() {
        expected.initial.insert(0);
        expected.delta.add(0, 'a', 1);
        expected.delta.add(0, 'b', 1);
        expected.final.insert(1);
    };

    // Builds in `expected` the automaton with the two paths 0 -a-> 1 -e-> 2 -d-> 3 and
    // 0 -b-> 4 -a-> 5 -b-> 3, initial state 0 and final state 3.
    const auto expect_aed_or_bab = [&expected]() {
        expected.initial.insert(0);
        expected.delta.add(0, 'a', 1);
        expected.delta.add(1, 'e', 2);
        expected.delta.add(2, 'd', 3);
        expected.delta.add(0, 'b', 4);
        expected.delta.add(4, 'a', 5);
        expected.delta.add(5, 'b', 3);
        expected.final.insert(3);
    };

    SECTION("Handling of '\\'") {
        mata::parser::create_nfa(&nfa, "a\\\\b");
        // Reference automaton: 'a' (97), then a literal backslash (92), then 'b' (98).
        const std::string expected_description{ R"(
@NFA-explicit
%Alphabet-auto
%Initial q0
%Final q3
q0 97 q1
q1 92 q2
q2 98 q3)"
        };
        expected = mata::nfa::builder::parse_from_mata(expected_description);
        CHECK(mata::nfa::are_equivalent(nfa, expected));
    }

    SECTION("a|b$, a simple OR example with end marker") {
        mata::parser::create_nfa(&nfa, "a|b$");
        expect_a_or_b();
        CHECK(mata::nfa::are_equivalent(nfa, expected));
    }

    SECTION("^a|b, a simple OR example with begin marker") {
        mata::parser::create_nfa(&nfa, "^a|b");
        expect_a_or_b();
        CHECK(mata::nfa::are_equivalent(nfa, expected));
    }

    SECTION("^a|b$, a simple OR example with begin and end marker") {
        mata::parser::create_nfa(&nfa, "^a|b$");
        expect_a_or_b();
        CHECK(mata::nfa::are_equivalent(nfa, expected));
    }

    SECTION("^(a|b)$, a simple OR example with begin and end marker around capture group") {
        mata::parser::create_nfa(&nfa, "^(a|b)$");
        expect_a_or_b();
        CHECK(mata::nfa::are_equivalent(nfa, expected));
    }

    SECTION("a$|b, a simple OR example with end marker on the left side") {
        mata::parser::create_nfa(&nfa, "a$|b");
        expect_a_or_b();
        CHECK(mata::nfa::are_equivalent(nfa, expected));
    }

    SECTION("^a$|^b$, a simple OR example with multiple begin and end markers") {
        mata::parser::create_nfa(&nfa, "^a$|^b$");
        expect_a_or_b();
        CHECK(mata::nfa::are_equivalent(nfa, expected));
    }

    SECTION("aed|(bab)$, a simple OR example with trailing end marker") {
        mata::parser::create_nfa(&nfa, "aed|(bab)$");
        expect_aed_or_bab();
        CHECK(mata::nfa::are_equivalent(nfa, expected));
    }

    SECTION("aed|bab$, a simple OR example with trailing end marker") {
        mata::parser::create_nfa(&nfa, "aed|bab$");
        expect_aed_or_bab();
        CHECK(mata::nfa::are_equivalent(nfa, expected));
    }

    SECTION("^systempath\\=https|ftp$ correct parentheses") {
        mata::parser::create_nfa(&nfa, "^[sS][yY][sS][tT][eE][mM][pP][aA][tT][hH]\\\\=(([hH][tT]{2}[pP][sS]?)|([fF][tT][pP]))$");
        // Reference automaton: each letter of the pattern is accepted in upper or lower case, followed by
        // a backslash (92) and '=' (61); it then branches at q12 into the "htt p [s]" and "ftp" paths.
        const std::string expected_description{ R"(
@NFA-explicit
%Alphabet-auto
%Initial q0
%Final q16 q17
q0 83 q1
q0 115 q1
q1 89 q2
q1 121 q2
q2 83 q3
q2 115 q3
q3 84 q4
q3 116 q4
q4 69 q5
q4 101 q5
q5 77 q6
q5 109 q6
q6 80 q7
q6 112 q7
q7 65 q8
q7 97 q8
q8 84 q9
q8 116 q9
q9 72 q10
q9 104 q10
q10 92 q11
q11 61 q12
q12 70 q18
q12 72 q13
q12 102 q18
q12 104 q13
q13 84 q14
q13 116 q14
q14 84 q15
q14 116 q15
q15 80 q16
q15 112 q16
q16 83 q17
q16 115 q17
q18 84 q19
q18 116 q19
q19 80 q17
q19 112 q17
)"
        };
        expected = mata::nfa::builder::parse_from_mata(expected_description);
        CHECK(mata::nfa::are_equivalent(nfa, expected));
    }
}

0 comments on commit e806b8d

Please sign in to comment.