From 9abe8bb3325a4526c37611533126727d60640c79 Mon Sep 17 00:00:00 2001 From: Elton Lee Date: Mon, 16 Sep 2024 12:08:55 +0800 Subject: [PATCH 1/2] feat: provide group struct for Capture Expr --- src/compile.zig | 2 +- src/debug.zig | 2 +- src/parse.zig | 23 ++++++++++++++++++++--- src/parse_test.zig | 2 +- 4 files changed, 23 insertions(+), 6 deletions(-) diff --git a/src/compile.zig b/src/compile.zig index 1ffdb8b..eaa6639 100644 --- a/src/compile.zig +++ b/src/compile.zig @@ -356,7 +356,7 @@ pub const Compiler = struct { const index = c.nextCaptureIndex(); try c.pushCompiled(Instruction.new(entry + 1, InstructionData{ .Save = index })); - const p = try c.compileInternal(subexpr); + const p = try c.compileInternal(subexpr.expr); c.fillToNext(p.hole); const h = try c.pushHole(InstHole{ .Save = index + 1 }); diff --git a/src/debug.zig b/src/debug.zig index 842595a..feafcf3 100644 --- a/src/debug.zig +++ b/src/debug.zig @@ -56,7 +56,7 @@ fn dumpExprIndent(e: Expr, indent: usize) void { }, Expr.Capture => |subexpr| { debug.print("{s}\n", .{@tagName(e)}); - dumpExprIndent(subexpr.*, indent + 1); + dumpExprIndent(subexpr.*.expr.*, indent + 1); }, Expr.Repeat => |repeat| { debug.print("{s}(min={d}, max={?d}, greedy={any})\n", .{ @tagName(e), repeat.min, repeat.max, repeat.greedy }); diff --git a/src/parse.zig b/src/parse.zig index c1052b7..cd59849 100644 --- a/src/parse.zig +++ b/src/parse.zig @@ -57,7 +57,7 @@ pub const Expr = union(enum) { // . character AnyCharNotNL, // Capture group - Capture: *Expr, + Capture: *Group, // *, +, ? Repeat: Repeater, // Character class [a-z0-9] @@ -95,6 +95,13 @@ pub const Expr = union(enum) { } }; +/// A single node of a group. The group could include different modifiers +/// by Perl flag for further features like non-capturing group. +pub const Group = struct { + expr: *Expr, + capturing: bool, +}; + // Private in fmt. 
fn charToDigit(c: u8, radix: u8) !u8 { const value = switch (c) { @@ -326,6 +333,10 @@ pub const Parser = struct { return try p.arena.allocator().create(Expr); } + fn createGroup(p: *Parser) !*Group { + return try p.arena.allocator().create(Group); + } + pub fn parse(p: *Parser, re: []const u8) !*Expr { p.it = StringIterator.init(re); // Shorter alias @@ -438,8 +449,11 @@ pub const Parser = struct { // pop the left parentheses that must now exist debug.assert(p.stack.pop().* == Expr.PseudoLeftParen); + const group = try p.createGroup(); + group.* = Group{ .expr = e, .capturing = true }; + const r = try p.createExpr(); - r.* = Expr{ .Capture = e }; + r.* = Expr{ .Capture = group }; try p.stack.append(r); break; }, @@ -458,8 +472,11 @@ pub const Parser = struct { ra.* = Expr{ .Concat = concat }; } + const group = try p.createGroup(); + group.* = Group{ .expr = ra, .capturing = true }; + const r = try p.createExpr(); - r.* = Expr{ .Capture = ra }; + r.* = Expr{ .Capture = group }; try p.stack.append(r); break; }, diff --git a/src/parse_test.zig b/src/parse_test.zig index fbd79fc..1883476 100644 --- a/src/parse_test.zig +++ b/src/parse_test.zig @@ -84,7 +84,7 @@ fn reprIndent(out: *StaticWriter, e: *Expr, indent: usize) anyerror!void { }, Expr.Capture => |subexpr| { try out.writer().print("cap\n", .{}); - try reprIndent(out, subexpr, indent + 1); + try reprIndent(out, subexpr.*.expr, indent + 1); }, Expr.Repeat => |repeat| { try out.writer().print("rep(", .{}); From f14d10cb61fc1ae1ac3f86eb6bebb25203367af0 Mon Sep 17 00:00:00 2001 From: Elton Lee Date: Tue, 17 Sep 2024 18:05:11 +0800 Subject: [PATCH 2/2] feat: add support for non-capturing group syntax - Add logic to check the group expression modifiers - Implement non-capturing group logic, which returns the `Patch` without changing the parts responsible for capturing groups Resolve: #34 --- src/compile.zig | 8 ++++++++ src/parse.zig | 49 +++++++++++++++++++++++++++++++++++++++------- src/regex_test.zig | 20 
+++++++++++++++++++ 3 files changed, 70 insertions(+), 7 deletions(-) diff --git a/src/compile.zig b/src/compile.zig index eaa6639..ec6dc04 100644 --- a/src/compile.zig +++ b/src/compile.zig @@ -187,6 +187,8 @@ pub const Compiler = struct { fn nextCaptureIndex(c: *Compiler) usize { const s = c.capture_index; + // each capture contains start and end pos, hence add two for + // each iteration c.capture_index += 2; return s; } @@ -350,6 +352,12 @@ pub const Compiler = struct { // 3: restore 1, 4 // ... + if (!subexpr.capturing) { + const p = try c.compileInternal(subexpr.expr); + const hole = p.hole; + return Patch{ .hole = hole, .entry = p.entry }; + } + // Create a partial instruction with a hole outgoing at the current location. const entry = c.insts.items.len; diff --git a/src/parse.zig b/src/parse.zig index cd59849..1d722d9 100644 --- a/src/parse.zig +++ b/src/parse.zig @@ -48,6 +48,11 @@ pub const Assertion = enum { NotWordBoundaryAscii, }; +/// Extra attributes for group expression. +pub const GroupAttributes = struct { + capturing: bool, +}; + /// A single node of an expression tree. pub const Expr = union(enum) { // Empty match (\w assertion) @@ -67,7 +72,7 @@ pub const Expr = union(enum) { // | Alternate: ArrayList(*Expr), // Pseudo stack operator to define start of a capture - PseudoLeftParen, + PseudoLeftParen: GroupAttributes, pub fn isByteClass(re: *const Expr) bool { switch (re.*) { @@ -250,6 +255,7 @@ pub const ParseError = error{ InvalidHexDigit, InvalidOctalDigit, UnrecognizedEscapeCode, + UnimplementedModifier, }; pub const ParserOptions = struct { @@ -405,8 +411,24 @@ pub const Parser = struct { // Don't handle alternation just yet, parentheses group together arguments into // a sub-expression only. 
'(' => { + var capturing = true; + if (it.peekIs('?')) { + // Advance and discard + _ = it.next(); + if (it.peekIs(':')) { + // Advance and discard + _ = it.next(); + capturing = false; + } else { + // NOTE: Other modifiers are considered not implemented + return error.UnimplementedModifier; + } + } + const r = try p.createExpr(); - r.* = Expr{ .PseudoLeftParen = undefined }; + r.* = Expr{ .PseudoLeftParen = .{ + .capturing = capturing, + } }; try p.stack.append(r); }, ')' => { @@ -446,11 +468,21 @@ pub const Parser = struct { return error.UnopenedParentheses; } - // pop the left parentheses that must now exist - debug.assert(p.stack.pop().* == Expr.PseudoLeftParen); + const next_e = p.stack.pop().*; + var capturing: bool = undefined; + switch (next_e) { + // pop the left parentheses that must now exist + .PseudoLeftParen => |e_paren| { + capturing = e_paren.capturing; + }, + else => unreachable, + } const group = try p.createGroup(); - group.* = Group{ .expr = e, .capturing = true }; + group.* = Group{ + .expr = e, + .capturing = capturing, + }; const r = try p.createExpr(); r.* = Expr{ .Capture = group }; @@ -458,7 +490,7 @@ pub const Parser = struct { break; }, // Existing parentheses, push new alternation - .PseudoLeftParen => { + .PseudoLeftParen => |e_paren| { mem.reverse(*Expr, concat.items); const ra = try p.createExpr(); @@ -473,7 +505,10 @@ pub const Parser = struct { } const group = try p.createGroup(); - group.* = Group{ .expr = ra, .capturing = true }; + group.* = Group{ + .expr = ra, + .capturing = e_paren.capturing, + }; const r = try p.createExpr(); r.* = Expr{ .Capture = group }; diff --git a/src/regex_test.zig b/src/regex_test.zig index ae09c47..dfa9450 100644 --- a/src/regex_test.zig +++ b/src/regex_test.zig @@ -141,6 +141,26 @@ test "regex captures" { debug.assert(mem.eql(u8, "ab0123", caps.sliceAt(0).?)); debug.assert(mem.eql(u8, "0123", caps.sliceAt(1).?)); + + var r_non_capturing_1 = try Regex.compile(std.testing.allocator, "ab(?:\\d+)"); + 
defer r_non_capturing_1.deinit(); + + debug.assert(try r_non_capturing_1.partialMatch("xxxxab0123a")); + + var caps_non_capturing_1 = (try r_non_capturing_1.captures("xxxxab0123a")).?; + defer caps_non_capturing_1.deinit(); + + debug.assert(mem.eql(u8, "ab0123", caps_non_capturing_1.sliceAt(0).?)); + debug.assert(caps_non_capturing_1.slots.len == 2); + + var r_non_capturing_2 = try Regex.compile(std.testing.allocator, "(?:ab(cd))"); + defer r_non_capturing_2.deinit(); + + var caps_non_capturing_2 = (try r_non_capturing_2.captures("xabcdx")).?; + defer caps_non_capturing_2.deinit(); + + debug.assert(mem.eql(u8, "abcd", caps_non_capturing_2.sliceAt(0).?)); + debug.assert(mem.eql(u8, "cd", caps_non_capturing_2.sliceAt(1).?)); } test "regex memory leaks" {