Merge pull request #10 from LukeShu/lukeshu/html-opt

Lukeshu/html opt
Unity-and-wireless-communications · Dec 13, 2024 · 79a78d2 · 79a78d2
2 parents 0921112 + 1e4aea6
commit 79a78d2
Show file tree

Hide file tree

Showing 6 changed files with 2,533 additions and 2,338 deletions.
diff --git a/html/entity.go b/html/entity.go
diff --git a/html/entity_test.go b/html/entity_test.go
@@ -6,24 +6,26 @@ package html
 
 import (
 	"testing"
-	"unicode/utf8"
 )
 
+func init() {
+	UnescapeString("") // force load of entity maps
+}
+
 func TestEntityLength(t *testing.T) {
-	// We verify that the length of UTF-8 encoding of each value is <= 1 + len(key).
-	// The +1 comes from the leading "&". This property implies that the length of
-	// unescaped text is <= the length of escaped text.
+	if len(entity) == 0 {
+		t.Fatal("maps not loaded")
+	}
+
+	// We verify that the length of UTF-8 encoding of each value
+	// is no more than 1 + len("&"+key), which is an assumption
+	// made in unescapeEntity.
 	for k, v := range entity {
-		if 1+len(k) < utf8.RuneLen(v) {
-			t.Error("escaped entity &" + k + " is shorter than its UTF-8 encoding " + string(v))
+		if 2+len(k) < int(v[0]) {
+			t.Error("escaped entity &" + k + " is more than 1 byte shorter than its UTF-8 encoding " + string(v[1:1+v[0]]))
 		}
 		if len(k) > longestEntityWithoutSemicolon && k[len(k)-1] != ';' {
 			t.Errorf("entity name %s is %d characters, but longestEntityWithoutSemicolon=%d", k, len(k), longestEntityWithoutSemicolon)
 		}
 	}
-	for k, v := range entity2 {
-		if 1+len(k) < utf8.RuneLen(v[0])+utf8.RuneLen(v[1]) {
-			t.Error("escaped entity &" + k + " is shorter than its UTF-8 encoding " + string(v[0]) + string(v[1]))
-		}
-	}
 }
diff --git a/html/escape.go b/html/escape.go
@@ -12,7 +12,7 @@ import (
 
 // These replacements permit compatibility with old numeric entities that
 // assumed Windows-1252 encoding.
-// https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference
+// https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
 var replacementTable = [...]rune{
 	'\u20AC', // First entry is what 0x80 should be replaced with.
 	'\u0081',
@@ -50,26 +50,32 @@ var replacementTable = [...]rune{
 	// 0x0D->'\u000D' is a no-op.
 }
 
-// unescapeEntity reads an entity like "&lt;" from b[src:] and writes the
-// corresponding "<" to b[dst:], returning the incremented dst and src cursors.
-// Precondition: b[src] == '&' && dst <= src.
+// unescapeEntity reads an entity like "&lt;" from src[srcPos:] and
+// writes the corresponding "<" to dst[dstPos:], returning dst and the
+// incremented dstPos and srcPos cursors.
+//
+// Usually, the returned dst is the dst argument, but in the event
+// that dstPos>srcPos it may be a copy.
+//
+// Precondition: src[srcPos] == '&'.
+//
 // attribute should be true if parsing an attribute value.
-func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
-	// https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference
+func unescapeEntity[S ~[]byte | string](dst []byte, src S, dstPos, srcPos int, attribute bool) (dst1 []byte, dstPos1, srcPos1 int) {
+	var dstIsSrc = len(dst) == len(src)
+
+	// https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state
 
 	// i starts at 1 because we already know that s[0] == '&'.
-	i, s := 1, b[src:]
+	i, s := 1, src[srcPos:]
 
-	if len(s) <= 1 {
-		b[dst] = b[src]
-		return dst + 1, src + 1
+	// shortest possible entities are all 3 bytes:
+	// "&GT", "&LT", "&gt", "&lt", "&#0" ... "&#9"
+	if len(s) < 3 {
+		dst[dstPos] = src[srcPos]
+		return dst, dstPos + 1, srcPos + 1
 	}
 
 	if s[i] == '#' {
-		if len(s) <= 3 { // We need to have at least "&#.".
-			b[dst] = b[src]
-			return dst + 1, src + 1
-		}
 		i++
 		c := s[i]
 		hex := false
@@ -79,9 +85,17 @@ func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
 		}
 
 		x := '\x00'
+		overflowed := false
 		for i < len(s) {
 			c = s[i]
 			i++
+			if x > 0x10FFFF {
+				// Make a note that we're above the maximum
+				// value, in case later we overflow the integer.
+				// Don't `break` though, we still want to
+				// consume the characters.
+				overflowed = true
+			}
 			if hex {
 				if '0' <= c && c <= '9' {
 					x = 16*x + rune(c) - '0'
@@ -102,10 +116,13 @@ func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
 			}
 			break
 		}
+		if overflowed {
+			x = 0x110000
+		}
 
-		if i <= 3 { // No characters matched.
-			b[dst] = b[src]
-			return dst + 1, src + 1
+		if i < 3 || (hex && i < 4) { // No characters matched.
+			dst[dstPos] = src[srcPos]
+			return dst, dstPos + 1, srcPos + 1
 		}
 
 		if 0x80 <= x && x <= 0x9F {
@@ -116,7 +133,7 @@ func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
 			x = '\uFFFD'
 		}
 
-		return dst + utf8.EncodeRune(b[dst:], x), src + i
+		return dst, dstPos + utf8.EncodeRune(dst[dstPos:], x), srcPos + i
 	}
 
 	// Consume the maximum number of characters possible, with the
@@ -135,52 +152,70 @@ func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
 		break
 	}
 
-	entityName := string(s[1:i])
-	if entityName == "" {
+	entityName := s[1:i]
+	if len(entityName) == 0 {
 		// No-op.
 	} else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
 		// No-op.
-	} else if x := entity[entityName]; x != 0 {
-		return dst + utf8.EncodeRune(b[dst:], x), src + i
-	} else if x := entity2[entityName]; x[0] != 0 {
-		dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
-		return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
+	} else if x := entity[string(entityName)]; x[0] != 0 {
+		if int(x[0]) > i {
+			// This assumes that it only ever has to grow by 1 byte per entity.
+			if dstPos == srcPos && dstIsSrc {
+				// make a copy + grow
+				dst = append(dst[:len(dst):len(dst)], 0)
+			} else if dstPos+int(x[0]) >= len(dst) {
+				// grow, but don't necessarily make a copy
+				dst = append(dst, 0)
+			}
+		}
+		return dst, dstPos + copy(dst[dstPos:], x[1:1+x[0]]), srcPos + i
 	} else if !attribute {
 		maxLen := len(entityName) - 1
 		if maxLen > longestEntityWithoutSemicolon {
 			maxLen = longestEntityWithoutSemicolon
 		}
 		for j := maxLen; j > 1; j-- {
-			if x := entity[entityName[:j]]; x != 0 {
-				return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
+			if x := entity[string(entityName[:j])]; x[0] != 0 {
+				// This assumes that no entity without a semicolon
+				// has a value that is wider than its name.
+				return dst, dstPos + copy(dst[dstPos:], x[1:1+x[0]]), srcPos + j + 1
 			}
 		}
 	}
 
-	dst1, src1 = dst+i, src+i
-	copy(b[dst:dst1], b[src:src1])
-	return dst1, src1
+	dstPos1, srcPos1 = dstPos+i, srcPos+i
+	copy(dst[dstPos:dstPos1], src[srcPos:srcPos1])
+	return dst, dstPos1, srcPos1
 }
 
 // unescape unescapes b's entities in-place, so that "a&lt;b" becomes "a<b".
 // attribute should be true if parsing an attribute value.
 func unescape(b []byte, attribute bool) []byte {
-	for i, c := range b {
-		if c == '&' {
-			dst, src := unescapeEntity(b, i, i, attribute)
-			for src < len(b) {
-				c := b[src]
-				if c == '&' {
-					dst, src = unescapeEntity(b, dst, src, attribute)
-				} else {
-					b[dst] = c
-					dst, src = dst+1, src+1
-				}
-			}
-			return b[0:dst]
+	populateMapOnce.Do(populateMap)
+	i := bytes.IndexByte(b, '&')
+
+	if i < 0 {
+		return b
+	}
+
+	b1, dst, src := unescapeEntity(b, b, i, i, attribute)
+	for len(b[src:]) > 0 {
+		if b[src] == '&' {
+			i = 0
+		} else {
+			i = bytes.IndexByte(b[src:], '&')
+		}
+		if i < 0 {
+			dst += copy(b1[dst:], b[src:])
+			break
+		}
+
+		if i > 0 {
+			copy(b1[dst:], b[src:src+i])
 		}
+		b1, dst, src = unescapeEntity(b1, b, dst+i, src+i, attribute)
 	}
-	return b
+	return b1[:dst]
 }
 
 // lower lower-cases the A-Z bytes in b in-place, so that "aBc" becomes "abc".
@@ -269,45 +304,22 @@ func escapeCommentString(s string) string {
 	if strings.IndexAny(s, "&>") == -1 {
 		return s
 	}
-	var buf bytes.Buffer
+	var buf strings.Builder
 	escapeComment(&buf, s)
 	return buf.String()
 }
 
-const escapedChars = "&'<>\"\r"
+var htmlEscaper = strings.NewReplacer(
+	`&`, "&amp;",
+	`'`, "&#39;", // "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
+	`<`, "&lt;",
+	`>`, "&gt;",
+	`"`, "&#34;", // "&#34;" is shorter than "&quot;".
+	"\r", "&#13;",
+)
 
 func escape(w writer, s string) error {
-	i := strings.IndexAny(s, escapedChars)
-	for i != -1 {
-		if _, err := w.WriteString(s[:i]); err != nil {
-			return err
-		}
-		var esc string
-		switch s[i] {
-		case '&':
-			esc = "&amp;"
-		case '\'':
-			// "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
-			esc = "&#39;"
-		case '<':
-			esc = "&lt;"
-		case '>':
-			esc = "&gt;"
-		case '"':
-			// "&#34;" is shorter than "&quot;".
-			esc = "&#34;"
-		case '\r':
-			esc = "&#13;"
-		default:
-			panic("unrecognized escape character")
-		}
-		s = s[i+1:]
-		if _, err := w.WriteString(esc); err != nil {
-			return err
-		}
-		i = strings.IndexAny(s, escapedChars)
-	}
-	_, err := w.WriteString(s)
+	_, err := htmlEscaper.WriteString(w, s)
 	return err
 }
 
@@ -316,24 +328,41 @@ func escape(w writer, s string) error {
 // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
 // always true.
 func EscapeString(s string) string {
-	if strings.IndexAny(s, escapedChars) == -1 {
-		return s
-	}
-	var buf bytes.Buffer
-	escape(&buf, s)
-	return buf.String()
+	return htmlEscaper.Replace(s)
 }
 
 // UnescapeString unescapes entities like "&lt;" to become "<". It unescapes a
 // larger range of entities than EscapeString escapes. For example, "&aacute;"
-// unescapes to "á", as does "&#225;" and "&xE1;".
+// unescapes to "á", as does "&#225;" and "&#xE1;".
 // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
 // always true.
 func UnescapeString(s string) string {
-	for _, c := range s {
-		if c == '&' {
-			return string(unescape([]byte(s), false))
+	populateMapOnce.Do(populateMap)
+	i := strings.IndexByte(s, '&')
+
+	if i < 0 {
+		return s
+	}
+
+	// The +1 is just so that dstIsSrc=false.
+	b := make([]byte, len(s)+1)
+	copy(b, s[:i])
+	b, dst, src := unescapeEntity(b, s, i, i, false)
+	for len(s[src:]) > 0 {
+		if s[src] == '&' {
+			i = 0
+		} else {
+			i = strings.IndexByte(s[src:], '&')
+		}
+		if i < 0 {
+			dst += copy(b[dst:], s[src:])
+			break
+		}
+
+		if i > 0 {
+			copy(b[dst:], s[src:src+i])
 		}
+		b, dst, src = unescapeEntity(b, s, dst+i, src+i, false)
 	}
-	return s
+	return string(b[:dst])
 }
diff --git a/html/escape_example_test.go b/html/escape_example_test.go
@@ -0,0 +1,22 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package html_test
+
+import (
+	"fmt"
+	"html"
+)
+
+func ExampleEscapeString() {
+	const s = `"Fran & Freddie's Diner" <tasty@example.com>`
+	fmt.Println(html.EscapeString(s))
+	// Output: &#34;Fran &amp; Freddie&#39;s Diner&#34; &lt;tasty@example.com&gt;
+}
+
+func ExampleUnescapeString() {
+	const s = `&quot;Fran &amp; Freddie&#39;s Diner&quot; &lt;tasty@example.com&gt;`
+	fmt.Println(html.UnescapeString(s))
+	// Output: "Fran & Freddie's Diner" <tasty@example.com>
+}