golang · LukeShu · Jul 8, 2024
diff --git a/html/entity.go b/html/entity.go
diff --git a/html/entity_test.go b/html/entity_test.go
@@ -9,7 +9,15 @@ import (
 	"unicode/utf8"
 )
 
+func init() {
+	UnescapeString("") // runs the one-time map setup so entity and entity2 are filled before the tests read them
+}
+
 func TestEntityLength(t *testing.T) {
+	if len(entity) == 0 || len(entity2) == 0 {
+		t.Fatal("maps not loaded")
+	}
+
 	// We verify that the length of UTF-8 encoding of each value is <= 1 + len(key).
 	// The +1 comes from the leading "&". This property implies that the length of
 	// unescaped text is <= the length of escaped text.

diff --git a/html/escape.go b/html/escape.go
@@ -12,7 +12,7 @@ import (
 
 // These replacements permit compatibility with old numeric entities that
 // assumed Windows-1252 encoding.
-// https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference
+// https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
 var replacementTable = [...]rune{
 	'\u20AC', // First entry is what 0x80 should be replaced with.
 	'\u0081',
@@ -135,14 +135,14 @@ func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
 		break
 	}
 
-	entityName := string(s[1:i])
-	if entityName == "" {
+	entityName := s[1:i]
+	if len(entityName) == 0 {
 		// No-op.
 	} else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
 		// No-op.
-	} else if x := entity[entityName]; x != 0 {
+	} else if x := entity[string(entityName)]; x != 0 {
 		return dst + utf8.EncodeRune(b[dst:], x), src + i
-	} else if x := entity2[entityName]; x[0] != 0 {
+	} else if x := entity2[string(entityName)]; x[0] != 0 {
 		dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
 		return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
 	} else if !attribute {
@@ -151,7 +151,7 @@ func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
 			maxLen = longestEntityWithoutSemicolon
 		}
 		for j := maxLen; j > 1; j-- {
-			if x := entity[entityName[:j]]; x != 0 {
+			if x := entity[string(entityName[:j])]; x != 0 {
 				return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
 			}
 		}
@@ -165,24 +165,34 @@ func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
 // unescape unescapes b's entities in-place, so that "a&lt;b" becomes "a<b".
 // attribute should be true if parsing an attribute value.
 func unescape(b []byte, attribute bool) []byte {
-	for i, c := range b {
-		if c == '&' {
-			dst, src := unescapeEntity(b, i, i, attribute)
-			for src < len(b) {
-				c := b[src]
-				if c == '&' {
-					dst, src = unescapeEntity(b, dst, src, attribute)
-				} else {
-					b[dst] = c
-					dst, src = dst+1, src+1
-				}
-			}
-			return b[0:dst]
-		}
+	populateMapsOnce.Do(populateMaps) // one-time setup before any lookup; entity_test.go relies on this filling entity and entity2
+	if i := bytes.IndexByte(b, '&'); i >= 0 {
+		return unescapeInner(b, i, attribute) // i is the first '&'; b[:i] is left untouched
 	}
 	return b
 }
 
+func unescapeInner(b []byte, i int, attribute bool) []byte { // rewrites b in place from b[i] on; callers pass the index of the first '&'
+	dst, src := unescapeEntity(b, i, i, attribute)
+	for len(b[src:]) > 0 {
+		if b[src] == '&' {
+			i = 0 // an entity candidate starts right at src, so no search is needed
+		} else {
+			i = bytes.IndexByte(b[src:], '&') // offset of the next '&' relative to src, or -1
+		}
+		if i < 0 {
+			dst += copy(b[dst:], b[src:]) // no '&' left: move the remaining bytes down and stop
+			break
+		}
+
+		if i > 0 {
+			copy(b[dst:], b[src:src+i]) // move the plain bytes that precede the next '&'
+		}
+		dst, src = unescapeEntity(b, dst+i, src+i, attribute) // both cursors step over the i plain bytes first
+	}
+	return b[:dst]
+}
+
 // lower lower-cases the A-Z bytes in b in-place, so that "aBc" becomes "abc".
 func lower(b []byte) []byte {
 	for i, c := range b {
@@ -274,66 +284,37 @@ func escapeCommentString(s string) string {
 	return buf.String()
 }
 
-const escapedChars = "&'<>\"\r"
+var htmlEscaper = strings.NewReplacer( // old/new pairs; used by both escape and EscapeString
+	`&`, "&amp;",
+	`'`, "&#39;", // "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
+	`<`, "&lt;",
+	`>`, "&gt;",
+	`"`, "&#34;", // "&#34;" is shorter than "&quot;".
+	"\r", "&#13;",
+)
 
 func escape(w writer, s string) error {
-	i := strings.IndexAny(s, escapedChars)
-	for i != -1 {
-		if _, err := w.WriteString(s[:i]); err != nil {
-			return err
-		}
-		var esc string
-		switch s[i] {
-		case '&':
-			esc = "&amp;"
-		case '\'':
-			// "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
-			esc = "&#39;"
-		case '<':
-			esc = "&lt;"
-		case '>':
-			esc = "&gt;"
-		case '"':
-			// "&#34;" is shorter than "&quot;".
-			esc = "&#34;"
-		case '\r':
-			esc = "&#13;"
-		default:
-			panic("unrecognized escape character")
-		}
-		s = s[i+1:]
-		if _, err := w.WriteString(esc); err != nil {
-			return err
-		}
-		i = strings.IndexAny(s, escapedChars)
-	}
-	_, err := w.WriteString(s)
+	_, err := htmlEscaper.WriteString(w, s) // writes s to w with the replacements applied; the byte count is dropped
 	return err
 }
 
 // EscapeString escapes special characters like "<" to become "&lt;". It
-// escapes only five such characters: <, >, &, ' and ".
+// escapes only six such characters: <, >, &, ', ", and carriage return (\r).
 // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
 // always true.
 func EscapeString(s string) string {
-	if strings.IndexAny(s, escapedChars) == -1 {
-		return s
-	}
-	var buf bytes.Buffer
-	escape(&buf, s)
-	return buf.String()
+	return htmlEscaper.Replace(s) // same replacement table that escape writes through
 }
 
 // UnescapeString unescapes entities like "&lt;" to become "<". It unescapes a
 // larger range of entities than EscapeString escapes. For example, "&aacute;"
-// unescapes to "á", as does "&#225;" and "&xE1;".
+// unescapes to "á", as does "&#225;" and "&#xE1;".
 // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
 // always true.
 func UnescapeString(s string) string {
-	for _, c := range s {
-		if c == '&' {
-			return string(unescape([]byte(s), false))
-		}
+	populateMapsOnce.Do(populateMaps) // same one-time setup that unescape performs
+	if i := strings.IndexByte(s, '&'); i >= 0 {
+		return string(unescapeInner([]byte(s), i, false)) // decode in a byte copy of s; false = not an attribute value
 	}
 	return s
 }
diff --git a/html/escape_example_test.go b/html/escape_example_test.go
@@ -0,0 +1,22 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package html_test
+
+import (
+	"fmt"
+	"html"
+)
+
+func ExampleEscapeString() { // NOTE(review): this file imports bare "html" — confirm that resolves to the package this patch changes
+	const s = `"Fran & Freddie's Diner" <tasty@example.com>`
+	fmt.Println(html.EscapeString(s))
+	// Output: &#34;Fran &amp; Freddie&#39;s Diner&#34; &lt;tasty@example.com&gt;
+}
+
+func ExampleUnescapeString() { // NOTE(review): this file imports bare "html" — confirm that resolves to the package this patch changes
+	const s = `&quot;Fran &amp; Freddie&#39;s Diner&quot; &lt;tasty@example.com&gt;`
+	fmt.Println(html.UnescapeString(s))
+	// Output: "Fran & Freddie's Diner" <tasty@example.com>
+}
diff --git a/html/escape_test.go b/html/escape_test.go
@@ -4,7 +4,10 @@
 
 package html
 
-import "testing"
+import (
+	"strings"
+	"testing"
+)
 
 type unescapeTest struct {
 	// A short description of the test case.
@@ -64,6 +67,24 @@ var unescapeTests = []unescapeTest{
 		"Footnote&#x87;",
 		"Footnote‡",
 	},
+	// Handle single ampersand.
+	{
+		"copySingleAmpersand",
+		"&",
+		"&",
+	},
+	// Handle ampersand followed by non-entity.
+	{
+		"copyAmpersandNonEntity",
+		"text &test",
+		"text &test",
+	},
+	// Handle "&#".
+	{
+		"copyAmpersandHash",
+		"text &#",
+		"text &#",
+	},
 }
 
 func TestUnescape(t *testing.T) {
@@ -95,3 +116,54 @@ func TestUnescapeEscape(t *testing.T) {
 		}
 	}
 }
+
+var (
+	benchEscapeData     = strings.Repeat("AAAAA < BBBBB > CCCCC & DDDDD ' EEEEE \" ", 100) // five escapable characters per 40-byte unit
+	benchEscapeNone     = strings.Repeat("AAAAA x BBBBB x CCCCC x DDDDD x EEEEE x ", 100)
+	benchUnescapeSparse = strings.Repeat(strings.Repeat("AAAAA x BBBBB x CCCCC x DDDDD x EEEEE x ", 10)+"&amp;", 10) // ten runs of 400 plain bytes, each followed by one entity
+	benchUnescapeDense  = strings.Repeat("&amp;&lt; &amp; &lt;", 100)
+)
+
+func BenchmarkEscape(b *testing.B) {
+	n := 0
+	for i := 0; i < b.N; i++ {
+		n += len(EscapeString(benchEscapeData)) // n is never read afterwards; it only consumes the result
+	}
+}
+
+func BenchmarkEscapeNone(b *testing.B) {
+	n := 0
+	for i := 0; i < b.N; i++ {
+		n += len(EscapeString(benchEscapeNone)) // benchEscapeNone has no character that htmlEscaper maps
+	}
+}
+
+func BenchmarkUnescape(b *testing.B) {
+	s := EscapeString(benchEscapeData) // escaped once before the loop; each iteration decodes the same string
+	n := 0
+	for i := 0; i < b.N; i++ {
+		n += len(UnescapeString(s))
+	}
+}
+
+func BenchmarkUnescapeNone(b *testing.B) {
+	s := EscapeString(benchEscapeNone) // nothing in benchEscapeNone gets escaped, so s contains no '&'
+	n := 0
+	for i := 0; i < b.N; i++ {
+		n += len(UnescapeString(s))
+	}
+}
+
+func BenchmarkUnescapeSparse(b *testing.B) {
+	n := 0
+	for i := 0; i < b.N; i++ {
+		n += len(UnescapeString(benchUnescapeSparse)) // input is mostly plain text with an "&amp;" after every 400 bytes
+	}
+}
+
+func BenchmarkUnescapeDense(b *testing.B) {
+	n := 0
+	for i := 0; i < b.N; i++ {
+		n += len(UnescapeString(benchUnescapeDense)) // input is almost entirely entities
+	}
+}
diff --git a/html/fuzz_test.go b/html/fuzz_test.go
@@ -0,0 +1,22 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package html
+
+import "testing"
+
+func FuzzEscapeUnescape(f *testing.F) {
+	f.Fuzz(func(t *testing.T, v string) { // no f.Add seeds are registered in this function
+		e := EscapeString(v)
+		u := UnescapeString(e)
+		if u != v { // escaping then unescaping must reproduce the input exactly
+			t.Errorf("EscapeString(%q) = %q, UnescapeString(%q) = %q, want %q", v, e, e, u, v)
+		}
+
+		// As per the documentation, this isn't always equal to v, so it makes
+		// no sense to check for equality. It can still be interesting to find
+		// panics in it though.
+		EscapeString(UnescapeString(v))
+	})
+}