diff --git a/src/re2parser.cc b/src/re2parser.cc index bca480be..b92ffb5f 100644 --- a/src/re2parser.cc +++ b/src/re2parser.cc @@ -136,11 +136,14 @@ namespace { this->outgoingEdges = std::vector>> (prog_size); // We traverse all the states and create corresponding states and edges in Nfa - for (mata::nfa::State current_state = start_state; current_state < prog_size; current_state++) { - re2::Prog::Inst *inst = prog->inst(static_cast(current_state)); + for (State current_state = start_state, re2_state = start_state; re2_state < prog_size; ++ + re2_state) { + /// Whether to increment the current state @c current_state when the @c re2_state increments. + bool increment_current_state{true}; + re2::Prog::Inst *inst = prog->inst(static_cast(re2_state)); // Every type of state can be final (due to epsilon transition), so we check it regardless of its type - if (this->state_cache.is_final_state[current_state]) { - this->make_state_final(current_state, explicit_nfa); + if (this->state_cache.is_final_state[re2_state]) { + this->make_state_final(re2_state, explicit_nfa); } switch (inst->opcode()) { default: @@ -164,33 +167,35 @@ namespace { empty_flag = static_cast(inst->empty()); // ^ - beginning of line if (empty_flag & re2::kEmptyBeginLine) { - // TODO Symbol? - symbols.push_back(300); + increment_current_state = false; } // $ - end of line if (empty_flag & re2::kEmptyEndLine) { - // TODO Symbol? - symbols.push_back(10); + // TODO How to handle? + // symbols.push_back(301); + increment_current_state = false; } // \A - beginning of text if (empty_flag & re2::kEmptyBeginText) { - // TODO Symbol? - symbols.push_back(301); + increment_current_state = false; } // \z - end of text if (empty_flag & re2::kEmptyEndText) { - // TODO Symbol? - symbols.push_back(302); + // TODO How to handle? + // symbols.push_back(302); + increment_current_state = false; } // \b - word boundary if (empty_flag & re2::kEmptyWordBoundary) { - // TODO Symbol? 
- symbols.push_back(303); + // TODO How to handle? + // symbols.push_back(303); + increment_current_state = false; } // \B - not \b if (empty_flag & re2::kEmptyNonWordBoundary) { - // TODO Symbol? - symbols.push_back(304); + // TODO How to handle? + // symbols.push_back(304); + increment_current_state = false; } break; // kInstByteRange represents states with a "byte range" on the outgoing transition(s) @@ -212,15 +217,17 @@ namespace { if (!use_epsilon) { // There is an epsilon transition to the currentState+1 we will need to copy transitions of // the currentState+1 to the currentState. - if (!this->state_cache.is_last[current_state]) { - for (auto state: this->state_cache.state_mapping[current_state + 1]) { - copyEdgesFromTo.emplace_back(state, current_state); + if (!this->state_cache.is_last[re2_state]) { + for (auto state: this->state_cache.state_mapping[re2_state + 1]) { + copyEdgesFromTo.emplace_back(state, re2_state); } } } symbols.clear(); break; } + + if (increment_current_state) { ++current_state; } } if (!use_epsilon) { // We will traverse the vector in reversed order. Like that, we will also handle chains of epsilon transitions @@ -419,7 +426,8 @@ namespace { if (inst->last()) { this->state_cache.is_last[state] = true; } - if (inst->opcode() == re2::kInstMatch) { + if (inst->opcode() == re2::kInstMatch || + (inst->opcode() == re2::kInstEmptyWidth && inst->empty() & re2::kEmptyEndText)) { this->state_cache.is_final_state[state] = true; } } @@ -505,6 +513,8 @@ void mata::parser::create_nfa(nfa::Nfa* nfa, const std::string& pattern, bool us RegexParser regexParser{}; auto parsed_regex = regexParser.parse_regex_string(pattern); auto program = parsed_regex->CompileToProg(regexParser.options.max_mem() * 2 / 3); + // FIXME: use_epsilon = false completely breaks the method convert_pro_to_nfa(). This must be fixed before the + // use_epsilon argument can be passed on to convert_pro_to_nfa(); until then, true is passed unconditionally.
regexParser.convert_pro_to_nfa(nfa, program, true, epsilon_value); delete program; // Decrements reference count and deletes object if the count reaches 0 diff --git a/tests/re2parser.cc b/tests/re2parser.cc index 3aa91d14..d6fdde63 100644 --- a/tests/re2parser.cc +++ b/tests/re2parser.cc @@ -1,8 +1,10 @@ #include #include -#include "mata/nfa/nfa.hh" #include "mata/parser/re2parser.hh" +#include "mata/nfa/builder.hh" +#include "mata/nfa/nfa.hh" + using namespace mata::nfa; using Symbol = mata::Symbol; @@ -1333,3 +1335,153 @@ TEST_CASE("mata::Parser bug epsilon") CHECK(x.is_in_lang(Run{Word{'a', 'a', 'a', 'a'}, {}})); } } // }}} + +TEST_CASE("mata::parser Parsing regexes with ^ and $") { + Nfa nfa; + Nfa expected{}; + + SECTION("Handling of '\\'") { + mata::parser::create_nfa(&nfa, "a\\\\b"); + expected = mata::nfa::builder::parse_from_mata( + std::string{ R"( + @NFA-explicit + %Alphabet-auto + %Initial q0 + %Final q3 + q0 97 q1 + q1 92 q2 + q2 98 q3)" + }); + CHECK(mata::nfa::are_equivalent(nfa, expected)); + } + + SECTION("a|b$, a simple OR example with end marker") { + mata::parser::create_nfa(&nfa, "a|b$"); + expected.initial.insert(0); + expected.delta.add(0, 'a', 1); + expected.delta.add(0, 'b', 1); + expected.final.insert(1); + CHECK(mata::nfa::are_equivalent(nfa, expected)); + } + + SECTION("^a|b, a simple OR example with begin marker") { + mata::parser::create_nfa(&nfa, "^a|b"); + expected.initial.insert(0); + expected.delta.add(0, 'a', 1); + expected.delta.add(0, 'b', 1); + expected.final.insert(1); + CHECK(mata::nfa::are_equivalent(nfa, expected)); + } + + SECTION("^a|b$, a simple OR example with begin and end marker") { + mata::parser::create_nfa(&nfa, "^a|b$"); + expected.initial.insert(0); + expected.delta.add(0, 'a', 1); + expected.delta.add(0, 'b', 1); + expected.final.insert(1); + CHECK(mata::nfa::are_equivalent(nfa, expected)); + } + + SECTION("^(a|b)$, a simple OR example with begin and end marker around capture group") { + 
mata::parser::create_nfa(&nfa, "^(a|b)$"); + expected.initial.insert(0); + expected.delta.add(0, 'a', 1); + expected.delta.add(0, 'b', 1); + expected.final.insert(1); + CHECK(mata::nfa::are_equivalent(nfa, expected)); + } + + SECTION("a$|b, a simple OR example with end marker on the left side") { + mata::parser::create_nfa(&nfa, "a$|b"); + expected.initial.insert(0); + expected.delta.add(0, 'a', 1); + expected.delta.add(0, 'b', 1); + expected.final.insert(1); + CHECK(mata::nfa::are_equivalent(nfa, expected)); + } + + SECTION("^a$|^b$, a simple OR example with multiple begin and end markers") { + mata::parser::create_nfa(&nfa, "^a$|^b$"); + expected.initial.insert(0); + expected.delta.add(0, 'a', 1); + expected.delta.add(0, 'b', 1); + expected.final.insert(1); + CHECK(mata::nfa::are_equivalent(nfa, expected)); + } + + SECTION("aed|(bab)$, a simple OR example with trailing end marker") { + mata::parser::create_nfa(&nfa, "aed|(bab)$"); + expected.initial.insert(0); + expected.delta.add(0, 'a', 1); + expected.delta.add(1, 'e', 2); + expected.delta.add(2, 'd', 3); + expected.delta.add(0, 'b', 4); + expected.delta.add(4, 'a', 5); + expected.delta.add(5, 'b', 3); + expected.final.insert(3); + CHECK(mata::nfa::are_equivalent(nfa, expected)); + } + + SECTION("aed|bab$, a simple OR example with trailing end marker") { + mata::parser::create_nfa(&nfa, "aed|bab$"); + expected.initial.insert(0); + expected.delta.add(0, 'a', 1); + expected.delta.add(1, 'e', 2); + expected.delta.add(2, 'd', 3); + expected.delta.add(0, 'b', 4); + expected.delta.add(4, 'a', 5); + expected.delta.add(5, 'b', 3); + expected.final.insert(3); + CHECK(mata::nfa::are_equivalent(nfa, expected)); + } + + SECTION("^systempath\\=https|ftp$ correct parentheses") { + mata::parser::create_nfa(&nfa, "^[sS][yY][sS][tT][eE][mM][pP][aA][tT][hH]\\\\=(([hH][tT]{2}[pP][sS]?)|([fF][tT][pP]))$"); + expected = mata::nfa::builder::parse_from_mata(std::string{ R"( + @NFA-explicit + %Alphabet-auto + %Initial q0 + %Final q16 
q17 + q0 83 q1 + q0 115 q1 + q1 89 q2 + q1 121 q2 + q2 83 q3 + q2 115 q3 + q3 84 q4 + q3 116 q4 + q4 69 q5 + q4 101 q5 + q5 77 q6 + q5 109 q6 + q6 80 q7 + q6 112 q7 + q7 65 q8 + q7 97 q8 + q8 84 q9 + q8 116 q9 + q9 72 q10 + q9 104 q10 + q10 92 q11 + q11 61 q12 + q12 70 q18 + q12 72 q13 + q12 102 q18 + q12 104 q13 + q13 84 q14 + q13 116 q14 + q14 84 q15 + q14 116 q15 + q15 80 q16 + q15 112 q16 + q16 83 q17 + q16 115 q17 + q18 84 q19 + q18 116 q19 + q19 80 q17 + q19 112 q17 + )" + }); + CHECK(mata::nfa::are_equivalent(nfa, expected)); + } +}