Skip to content

Commit

Permalink
Merge pull request #10 from LukeShu/lukeshu/html-opt
Browse files Browse the repository at this point in the history
Lukeshu/html opt
  • Loading branch information
meslubi2021 authored Dec 13, 2024
2 parents 0921112 + 1e4aea6 commit 79a78d2
Show file tree
Hide file tree
Showing 6 changed files with 2,533 additions and 2,338 deletions.
4,489 changes: 2,252 additions & 2,237 deletions html/entity.go

Large diffs are not rendered by default.

24 changes: 13 additions & 11 deletions html/entity_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,24 +6,26 @@ package html

import (
"testing"
"unicode/utf8"
)

func init() {
UnescapeString("") // force load of entity maps
}

func TestEntityLength(t *testing.T) {
// We verify that the length of UTF-8 encoding of each value is <= 1 + len(key).
// The +1 comes from the leading "&". This property implies that the length of
// unescaped text is <= the length of escaped text.
if len(entity) == 0 {
t.Fatal("maps not loaded")
}

// We verify that the length of UTF-8 encoding of each value
// is no more than 1 + len("&"+key), which is an assumption
// made in unescapeEntity.
for k, v := range entity {
if 1+len(k) < utf8.RuneLen(v) {
t.Error("escaped entity &" + k + " is shorter than its UTF-8 encoding " + string(v))
if 2+len(k) < int(v[0]) {
t.Error("escaped entity &" + k + " is more than 1 byte shorter than its UTF-8 encoding " + string(v[1:1+v[0]]))
}
if len(k) > longestEntityWithoutSemicolon && k[len(k)-1] != ';' {
t.Errorf("entity name %s is %d characters, but longestEntityWithoutSemicolon=%d", k, len(k), longestEntityWithoutSemicolon)
}
}
for k, v := range entity2 {
if 1+len(k) < utf8.RuneLen(v[0])+utf8.RuneLen(v[1]) {
t.Error("escaped entity &" + k + " is shorter than its UTF-8 encoding " + string(v[0]) + string(v[1]))
}
}
}
205 changes: 117 additions & 88 deletions html/escape.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ import (

// These replacements permit compatibility with old numeric entities that
// assumed Windows-1252 encoding.
// https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference
// https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
var replacementTable = [...]rune{
'\u20AC', // First entry is what 0x80 should be replaced with.
'\u0081',
Expand Down Expand Up @@ -50,26 +50,32 @@ var replacementTable = [...]rune{
// 0x0D->'\u000D' is a no-op.
}

// unescapeEntity reads an entity like "&lt;" from b[src:] and writes the
// corresponding "<" to b[dst:], returning the incremented dst and src cursors.
// Precondition: b[src] == '&' && dst <= src.
// unescapeEntity reads an entity like "&lt;" from src[srcPos:] and
// writes the corresponding "<" to dst[dstPos:], returning dst and the
// incremented dstPos and srcPos cursors.
//
// Usually, the returned dst is the dst argument, but in the event
// that dstPos>srcPos it may be a copy.
//
// Precondition: src[srcPos] == '&'.
//
// attribute should be true if parsing an attribute value.
func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
// https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference
func unescapeEntity[S ~[]byte | string](dst []byte, src S, dstPos, srcPos int, attribute bool) (dst1 []byte, dstPos1, srcPos1 int) {
var dstIsSrc = len(dst) == len(src)

// https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state

// i starts at 1 because we already know that s[0] == '&'.
i, s := 1, b[src:]
i, s := 1, src[srcPos:]

if len(s) <= 1 {
b[dst] = b[src]
return dst + 1, src + 1
// shortest possible entities are all 3 bytes:
// "&GT", "&LT", "&gt", "&lt", "&#0" ... "&#9"
if len(s) < 3 {
dst[dstPos] = src[srcPos]
return dst, dstPos + 1, srcPos + 1
}

if s[i] == '#' {
if len(s) <= 3 { // We need to have at least "&#.".
b[dst] = b[src]
return dst + 1, src + 1
}
i++
c := s[i]
hex := false
Expand All @@ -79,9 +85,17 @@ func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
}

x := '\x00'
overflowed := false
for i < len(s) {
c = s[i]
i++
if x > 0x10FFFF {
// Make a note that we're above the maximum
// value, in case later we overflow the integer.
// Don't `break` though, we still want to
// consume the characters.
overflowed = true
}
if hex {
if '0' <= c && c <= '9' {
x = 16*x + rune(c) - '0'
Expand All @@ -102,10 +116,13 @@ func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
}
break
}
if overflowed {
x = 0x110000
}

if i <= 3 { // No characters matched.
b[dst] = b[src]
return dst + 1, src + 1
if i < 3 || (hex && i < 4) { // No characters matched.
dst[dstPos] = src[srcPos]
return dst, dstPos + 1, srcPos + 1
}

if 0x80 <= x && x <= 0x9F {
Expand All @@ -116,7 +133,7 @@ func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
x = '\uFFFD'
}

return dst + utf8.EncodeRune(b[dst:], x), src + i
return dst, dstPos + utf8.EncodeRune(dst[dstPos:], x), srcPos + i
}

// Consume the maximum number of characters possible, with the
Expand All @@ -135,52 +152,70 @@ func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
break
}

entityName := string(s[1:i])
if entityName == "" {
entityName := s[1:i]
if len(entityName) == 0 {
// No-op.
} else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
// No-op.
} else if x := entity[entityName]; x != 0 {
return dst + utf8.EncodeRune(b[dst:], x), src + i
} else if x := entity2[entityName]; x[0] != 0 {
dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
} else if x := entity[string(entityName)]; x[0] != 0 {
if int(x[0]) > i {
// This assumes that it only ever has to grow by 1 byte per entity.
if dstPos == srcPos && dstIsSrc {
// make a copy + grow
dst = append(dst[:len(dst):len(dst)], 0)
} else if dstPos+int(x[0]) >= len(dst) {
// grow, but don't necessarily make a copy
dst = append(dst, 0)
}
}
return dst, dstPos + copy(dst[dstPos:], x[1:1+x[0]]), srcPos + i
} else if !attribute {
maxLen := len(entityName) - 1
if maxLen > longestEntityWithoutSemicolon {
maxLen = longestEntityWithoutSemicolon
}
for j := maxLen; j > 1; j-- {
if x := entity[entityName[:j]]; x != 0 {
return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
if x := entity[string(entityName[:j])]; x[0] != 0 {
// This assumes that no entity without a semicolon
// has a value that is wider than its name.
return dst, dstPos + copy(dst[dstPos:], x[1:1+x[0]]), srcPos + j + 1
}
}
}

dst1, src1 = dst+i, src+i
copy(b[dst:dst1], b[src:src1])
return dst1, src1
dstPos1, srcPos1 = dstPos+i, srcPos+i
copy(dst[dstPos:dstPos1], src[srcPos:srcPos1])
return dst, dstPos1, srcPos1
}

// unescape unescapes b's entities in-place, so that "a&lt;b" becomes "a<b".
// attribute should be true if parsing an attribute value.
func unescape(b []byte, attribute bool) []byte {
for i, c := range b {
if c == '&' {
dst, src := unescapeEntity(b, i, i, attribute)
for src < len(b) {
c := b[src]
if c == '&' {
dst, src = unescapeEntity(b, dst, src, attribute)
} else {
b[dst] = c
dst, src = dst+1, src+1
}
}
return b[0:dst]
populateMapOnce.Do(populateMap)
i := bytes.IndexByte(b, '&')

if i < 0 {
return b
}

b1, dst, src := unescapeEntity(b, b, i, i, attribute)
for len(b[src:]) > 0 {
if b[src] == '&' {
i = 0
} else {
i = bytes.IndexByte(b[src:], '&')
}
if i < 0 {
dst += copy(b1[dst:], b[src:])
break
}

if i > 0 {
copy(b1[dst:], b[src:src+i])
}
b1, dst, src = unescapeEntity(b1, b, dst+i, src+i, attribute)
}
return b
return b1[:dst]
}

// lower lower-cases the A-Z bytes in b in-place, so that "aBc" becomes "abc".
Expand Down Expand Up @@ -269,45 +304,22 @@ func escapeCommentString(s string) string {
if strings.IndexAny(s, "&>") == -1 {
return s
}
var buf bytes.Buffer
var buf strings.Builder
escapeComment(&buf, s)
return buf.String()
}

const escapedChars = "&'<>\"\r"
var htmlEscaper = strings.NewReplacer(
`&`, "&amp;",
`'`, "&#39;", // "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
`<`, "&lt;",
`>`, "&gt;",
`"`, "&#34;", // "&#34;" is shorter than "&quot;".
"\r", "&#13;",
)

func escape(w writer, s string) error {
i := strings.IndexAny(s, escapedChars)
for i != -1 {
if _, err := w.WriteString(s[:i]); err != nil {
return err
}
var esc string
switch s[i] {
case '&':
esc = "&amp;"
case '\'':
// "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
esc = "&#39;"
case '<':
esc = "&lt;"
case '>':
esc = "&gt;"
case '"':
// "&#34;" is shorter than "&quot;".
esc = "&#34;"
case '\r':
esc = "&#13;"
default:
panic("unrecognized escape character")
}
s = s[i+1:]
if _, err := w.WriteString(esc); err != nil {
return err
}
i = strings.IndexAny(s, escapedChars)
}
_, err := w.WriteString(s)
_, err := htmlEscaper.WriteString(w, s)
return err
}

Expand All @@ -316,24 +328,41 @@ func escape(w writer, s string) error {
// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
// always true.
func EscapeString(s string) string {
if strings.IndexAny(s, escapedChars) == -1 {
return s
}
var buf bytes.Buffer
escape(&buf, s)
return buf.String()
return htmlEscaper.Replace(s)
}

// UnescapeString unescapes entities like "&lt;" to become "<". It unescapes a
// larger range of entities than EscapeString escapes. For example, "&aacute;"
// unescapes to "á", as does "&#225;" and "&xE1;".
// unescapes to "á", as does "&#225;" and "&#xE1;".
// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
// always true.
func UnescapeString(s string) string {
for _, c := range s {
if c == '&' {
return string(unescape([]byte(s), false))
populateMapOnce.Do(populateMap)
i := strings.IndexByte(s, '&')

if i < 0 {
return s
}

// The +1 is just so that dstIsSrc=false.
b := make([]byte, len(s)+1)
copy(b, s[:i])
b, dst, src := unescapeEntity(b, s, i, i, false)
for len(s[src:]) > 0 {
if s[src] == '&' {
i = 0
} else {
i = strings.IndexByte(s[src:], '&')
}
if i < 0 {
dst += copy(b[dst:], s[src:])
break
}

if i > 0 {
copy(b[dst:], s[src:src+i])
}
b, dst, src = unescapeEntity(b, s, dst+i, src+i, false)
}
return s
return string(b[:dst])
}
22 changes: 22 additions & 0 deletions html/escape_example_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package html_test

import (
"fmt"
"html"
)

// ExampleEscapeString shows html.EscapeString applied to a string that
// contains double quotes, an ampersand, an apostrophe and angle brackets.
// The expected escaped form is recorded in the trailing Output comment.
func ExampleEscapeString() {
	// The address is a plain placeholder; only the surrounding '<' and '>'
	// are affected by escaping.
	const s = `"Fran & Freddie's Diner" <tasty@example.com>`
	fmt.Println(html.EscapeString(s))
	// Output: &#34;Fran &amp; Freddie&#39;s Diner&#34; &lt;tasty@example.com&gt;
}

// ExampleUnescapeString shows html.UnescapeString applied to a string that
// mixes named entities (&quot;, &amp;, &lt;, &gt;) with a numeric one (&#39;).
// The expected unescaped form is recorded in the trailing Output comment.
func ExampleUnescapeString() {
	// The address is a plain placeholder; only the surrounding entities
	// are affected by unescaping.
	const s = `&quot;Fran &amp; Freddie&#39;s Diner&quot; &lt;tasty@example.com&gt;`
	fmt.Println(html.UnescapeString(s))
	// Output: "Fran & Freddie's Diner" <tasty@example.com>
}
Loading

0 comments on commit 79a78d2

Please sign in to comment.