Skip to content

Commit

Permalink
Merge pull request dlclark#52 from mstoykov/ecmascriptUnicodeEscape
Browse files Browse the repository at this point in the history
Support \u{HEX} syntax with ECMAScript with Unicode flag
  • Loading branch information
dlclark authored Jul 17, 2022
2 parents 304ee33 + e8de5ea commit 3511044
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 1 deletion.
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,10 @@ if isMatch, _ := re.MatchString(`Something to match`); isMatch {

This feature is a work in progress and I'm open to ideas for more things to put here (maybe more relaxed character escaping rules?).

## ECMAScript compatibility mode
In this mode the engine provides compatibility with the [regex engine](https://tc39.es/ecma262/multipage/text-processing.html#sec-regexp-regular-expression-objects) described in the ECMAScript specification.

Additionally, a Unicode mode is provided which allows parsing of the `\u{CodePoint}` syntax; this syntax is only enabled when both the ECMAScript and Unicode options are provided.

## Library features that I'm still working on
- Regex split
Expand Down
1 change: 1 addition & 0 deletions regexp.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ const (
Debug = 0x0080 // "d"
ECMAScript = 0x0100 // "e"
RE2 = 0x0200 // RE2 (regexp package) compatibility mode
Unicode = 0x0400 // "u"
)

func (re *Regexp) RightToLeft() bool {
Expand Down
14 changes: 14 additions & 0 deletions regexp_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -858,6 +858,20 @@ func TestECMAScriptXCurlyBraceEscape(t *testing.T) {
}
}

// TestEcmaScriptUnicodeRange checks that a character class whose bounds are
// written with the \u{HEX} escape compiles when both the ECMAScript and
// Unicode options are set, and that the resulting expression finds a match
// in plain ASCII input.
func TestEcmaScriptUnicodeRange(t *testing.T) {
	r, err := Compile(`([\u{001a}-\u{ffff}]+)`, ECMAScript|Unicode)
	if err != nil {
		// Report through t rather than panicking so the remaining tests
		// in the binary still run and the failure is attributed here.
		t.Fatalf("unexpected compile error: %v", err)
	}
	m, err := r.FindStringMatch("qqqq")
	if err != nil {
		t.Fatalf("unexpected match error: %v", err)
	}
	if m == nil {
		t.Fatal("Expected non-nil, got nil")
	}
}

func TestNegateRange(t *testing.T) {
re := MustCompile(`[\D]`, 0)
if m, err := re.MatchString("A"); err != nil {
Expand Down
16 changes: 15 additions & 1 deletion syntax/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ const (
Debug = 0x0080 // "d"
ECMAScript = 0x0100 // "e"
RE2 = 0x0200 // RE2 compat mode
Unicode = 0x0400 // "u"
)

func optionFromCode(ch rune) RegexOptions {
Expand All @@ -43,6 +44,8 @@ func optionFromCode(ch rune) RegexOptions {
return Debug
case 'e', 'E':
return ECMAScript
case 'u', 'U':
return Unicode
default:
return 0
}
Expand Down Expand Up @@ -1695,7 +1698,13 @@ func (p *parser) scanCharEscape() (r rune, err error) {
r, err = p.scanHex(2)
}
case 'u':
r, err = p.scanHex(4)
// ECMAScript supports \u{HEX} only if `u` is also set
if p.useOptionE() && p.useOptionU() && p.charsRight() > 0 && p.rightChar(0) == '{' {
p.moveRight(1)
return p.scanHexUntilBrace()
} else {
r, err = p.scanHex(4)
}
case 'a':
return '\u0007', nil
case 'b':
Expand Down Expand Up @@ -1972,6 +1981,11 @@ func (p *parser) useRE2() bool {
return (p.options & RE2) != 0
}

// useOptionU reports whether the Unicode ("u") option bit is set in the
// parser's current options. That option turns on ECMAScript's Unicode
// behavior.
func (p *parser) useOptionU() bool {
	unicodeBit := p.options & Unicode
	return unicodeBit != 0
}

// True if options stack is empty.
func (p *parser) emptyOptionsStack() bool {
return len(p.optionsStack) == 0
Expand Down

0 comments on commit 3511044

Please sign in to comment.