Replace checks for valid UTF-8 in strings with go-maintained calls (#…

…1094)
google · Dec 10, 2024 · 0091f8d · 0091f8d
1 parent bd1ec92
commit 0091f8d
Show file tree

Hide file tree

Showing 3 changed files with 29 additions and 22 deletions.
diff --git a/parser/parser.go b/parser/parser.go
@@ -756,7 +756,7 @@ func (p *parser) VisitDouble(ctx *gen.DoubleContext) any {
 
 // Visit a parse tree produced by CELParser#String.
 func (p *parser) VisitString(ctx *gen.StringContext) any {
-	s := p.unquote(ctx, ctx.GetText(), false)
+	s := p.unquote(ctx, ctx.GetTok().GetText(), false)
 	return p.helper.newLiteralString(ctx, s)
 }
 

diff --git a/parser/parser_test.go b/parser/parser_test.go
@@ -1780,6 +1780,13 @@ var testCases = []testInfo{
 		 | ..................^
 		`,
 	},
+	{
+		I: `'\udead' == '\ufffd'`,
+		E: `
+		ERROR: <input>:1:1: invalid unicode code point
+         | '\udead' == '\ufffd'
+         | ^`,
+	},
 }
 
 type testInfo struct {

diff --git a/parser/unescape.go b/parser/unescape.go
@@ -15,7 +15,7 @@
 package parser
 
 import (
-	"fmt"
+	"errors"
 	"strings"
 	"unicode/utf8"
 )
@@ -30,7 +30,7 @@ func unescape(value string, isBytes bool) (string, error) {
 
 	// Nothing to unescape / decode.
 	if n < 2 {
-		return value, fmt.Errorf("unable to unescape string")
+		return value, errors.New("unable to unescape string")
 	}
 
 	// Raw string preceded by the 'r|R' prefix.
@@ -43,20 +43,20 @@ func unescape(value string, isBytes bool) (string, error) {
 
 	// Quoted string of some form, must have same first and last char.
 	if value[0] != value[n-1] || (value[0] != '"' && value[0] != '\'') {
-		return value, fmt.Errorf("unable to unescape string")
+		return value, errors.New("unable to unescape string")
 	}
 
 	// Normalize the multi-line CEL string representation to a standard
 	// Go quoted string.
 	if n >= 6 {
 		if strings.HasPrefix(value, "'''") {
 			if !strings.HasSuffix(value, "'''") {
-				return value, fmt.Errorf("unable to unescape string")
+				return value, errors.New("unable to unescape string")
 			}
 			value = "\"" + value[3:n-3] + "\""
 		} else if strings.HasPrefix(value, `"""`) {
 			if !strings.HasSuffix(value, `"""`) {
-				return value, fmt.Errorf("unable to unescape string")
+				return value, errors.New("unable to unescape string")
 			}
 			value = "\"" + value[3:n-3] + "\""
 		}
@@ -90,10 +90,10 @@ func unescape(value string, isBytes bool) (string, error) {
 
 // unescapeChar takes a string input and returns the following info:
 //
-//   value - the escaped unicode rune at the front of the string.
-//   encode - the value should be unicode-encoded
-//   tail - the remainder of the input string.
-//   err - error value, if the character could not be unescaped.
+//	value - the escaped unicode rune at the front of the string.
+//	encode - the value should be unicode-encoded
+//	tail - the remainder of the input string.
+//	err - error value, if the character could not be unescaped.
 //
 // When encode is true the return value may still fit within a single byte,
 // but unicode encoding is attempted which is more expensive than when the
@@ -113,7 +113,7 @@ func unescapeChar(s string, isBytes bool) (value rune, encode bool, tail string,
 
 	// 2. Last character is the start of an escape sequence.
 	if len(s) <= 1 {
-		err = fmt.Errorf("unable to unescape string, found '\\' as last character")
+		err = errors.New("unable to unescape string, found '\\' as last character")
 		return
 	}
 
@@ -157,53 +157,53 @@ func unescapeChar(s string, isBytes bool) (value rune, encode bool, tail string,
 		case 'u':
 			n = 4
 			if isBytes {
-				err = fmt.Errorf("unable to unescape string")
+				err = errors.New("unable to unescape string")
 				return
 			}
 		case 'U':
 			n = 8
 			if isBytes {
-				err = fmt.Errorf("unable to unescape string")
+				err = errors.New("unable to unescape string")
 				return
 			}
 		}
 		var v rune
 		if len(s) < n {
-			err = fmt.Errorf("unable to unescape string")
+			err = errors.New("unable to unescape string")
 			return
 		}
 		for j := 0; j < n; j++ {
 			x, ok := unhex(s[j])
 			if !ok {
-				err = fmt.Errorf("unable to unescape string")
+				err = errors.New("unable to unescape string")
 				return
 			}
 			v = v<<4 | x
 		}
 		s = s[n:]
-		if !isBytes && v > utf8.MaxRune {
-			err = fmt.Errorf("unable to unescape string")
+		if !isBytes && !utf8.ValidRune(v) {
+			err = errors.New("invalid unicode code point")
 			return
 		}
 		value = v
 
 	// 5. Octal escape sequences, must be three digits \[0-3][0-7][0-7]
 	case '0', '1', '2', '3':
 		if len(s) < 2 {
-			err = fmt.Errorf("unable to unescape octal sequence in string")
+			err = errors.New("unable to unescape octal sequence in string")
 			return
 		}
 		v := rune(c - '0')
 		for j := 0; j < 2; j++ {
 			x := s[j]
 			if x < '0' || x > '7' {
-				err = fmt.Errorf("unable to unescape octal sequence in string")
+				err = errors.New("unable to unescape octal sequence in string")
 				return
 			}
 			v = v*8 + rune(x-'0')
 		}
-		if !isBytes && v > utf8.MaxRune {
-			err = fmt.Errorf("unable to unescape string")
+		if !isBytes && !utf8.ValidRune(v) {
+			err = errors.New("invalid unicode code point")
 			return
 		}
 		value = v
@@ -212,7 +212,7 @@ func unescapeChar(s string, isBytes bool) (value rune, encode bool, tail string,
 
 		// Unknown escape sequence.
 	default:
-		err = fmt.Errorf("unable to unescape string")
+		err = errors.New("unable to unescape string")
 	}
 
 	tail = s