Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

html: sync changes from std #208

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4,484 changes: 2,248 additions & 2,236 deletions html/entity.go

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions html/entity_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,15 @@ import (
"unicode/utf8"
)

// init unescapes an empty string once at package load. The call's result is
// irrelevant; it is made only so that the entity maps are loaded before the
// tests in this file inspect them.
func init() {
	_ = UnescapeString("")
}

func TestEntityLength(t *testing.T) {
if len(entity) == 0 || len(entity2) == 0 {
t.Fatal("maps not loaded")
}

// We verify that the length of UTF-8 encoding of each value is <= 1 + len(key).
// The +1 comes from the leading "&". This property implies that the length of
// unescaped text is <= the length of escaped text.
Expand Down
109 changes: 45 additions & 64 deletions html/escape.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ import (

// These replacements permit compatibility with old numeric entities that
// assumed Windows-1252 encoding.
// https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference
// https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
var replacementTable = [...]rune{
'\u20AC', // First entry is what 0x80 should be replaced with.
'\u0081',
Expand Down Expand Up @@ -135,14 +135,14 @@ func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
break
}

entityName := string(s[1:i])
if entityName == "" {
entityName := s[1:i]
if len(entityName) == 0 {
// No-op.
} else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
// No-op.
} else if x := entity[entityName]; x != 0 {
} else if x := entity[string(entityName)]; x != 0 {
return dst + utf8.EncodeRune(b[dst:], x), src + i
} else if x := entity2[entityName]; x[0] != 0 {
} else if x := entity2[string(entityName)]; x[0] != 0 {
dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
} else if !attribute {
Expand All @@ -151,7 +151,7 @@ func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
maxLen = longestEntityWithoutSemicolon
}
for j := maxLen; j > 1; j-- {
if x := entity[entityName[:j]]; x != 0 {
if x := entity[string(entityName[:j])]; x != 0 {
return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
}
}
Expand All @@ -165,24 +165,34 @@ func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
// unescape unescapes b's entities in-place, so that "a&lt;b" becomes "a<b".
// attribute should be true if parsing an attribute value.
func unescape(b []byte, attribute bool) []byte {
for i, c := range b {
if c == '&' {
dst, src := unescapeEntity(b, i, i, attribute)
for src < len(b) {
c := b[src]
if c == '&' {
dst, src = unescapeEntity(b, dst, src, attribute)
} else {
b[dst] = c
dst, src = dst+1, src+1
}
}
return b[0:dst]
}
populateMapsOnce.Do(populateMaps)
if i := bytes.IndexByte(b, '&'); i >= 0 {
return unescapeInner(b, i, attribute)
}
return b
}

// unescapeInner unescapes the entities of b in place and returns the
// shortened slice. i is the index of an '&' in b (callers pass the first
// one), so everything before i is already in its final position.
// attribute is forwarded unchanged to unescapeEntity.
//
// Two cursors walk b: src is the next byte still to be read and dst is the
// next byte to be written. unescapeEntity handles whatever starts at an '&'
// and reports both cursors' new positions; the literal text between two
// ampersands is moved down with copy.
func unescapeInner(b []byte, i int, attribute bool) []byte {
	dst, src := unescapeEntity(b, i, i, attribute)
	for {
		rest := b[src:]
		if len(rest) == 0 {
			break
		}

		// i becomes the distance from src to the next '&', or -1 if none.
		i = 0
		if rest[0] != '&' {
			i = bytes.IndexByte(rest, '&')
		}
		if i < 0 {
			// No further entities: move the whole tail down and finish.
			dst += copy(b[dst:], rest)
			break
		}

		// Move the literal run that precedes the '&' (if any) into place.
		if i > 0 {
			copy(b[dst:], rest[:i])
		}
		dst, src = unescapeEntity(b, dst+i, src+i, attribute)
	}
	return b[:dst]
}

// lower lower-cases the A-Z bytes in b in-place, so that "aBc" becomes "abc".
func lower(b []byte) []byte {
for i, c := range b {
Expand Down Expand Up @@ -274,66 +284,37 @@ func escapeCommentString(s string) string {
return buf.String()
}

const escapedChars = "&'<>\"\r"
// htmlEscaper replaces each of the six characters &, ', <, >, " and \r with
// its HTML character reference. All other text passes through unchanged.
var htmlEscaper = strings.NewReplacer(
	"&", "&amp;",
	"'", "&#39;", // numeric form: shorter than "&apos;", and apos was not in HTML until HTML5.
	"<", "&lt;",
	">", "&gt;",
	"\"", "&#34;", // numeric form: shorter than "&quot;".
	"\r", "&#13;",
)

func escape(w writer, s string) error {
i := strings.IndexAny(s, escapedChars)
for i != -1 {
if _, err := w.WriteString(s[:i]); err != nil {
return err
}
var esc string
switch s[i] {
case '&':
esc = "&amp;"
case '\'':
// "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
esc = "&#39;"
case '<':
esc = "&lt;"
case '>':
esc = "&gt;"
case '"':
// "&#34;" is shorter than "&quot;".
esc = "&#34;"
case '\r':
esc = "&#13;"
default:
panic("unrecognized escape character")
}
s = s[i+1:]
if _, err := w.WriteString(esc); err != nil {
return err
}
i = strings.IndexAny(s, escapedChars)
}
_, err := w.WriteString(s)
_, err := htmlEscaper.WriteString(w, s)
return err
}

// EscapeString escapes special characters like "<" to become "&lt;". It
// escapes only five such characters: <, >, &, ' and ".
// escapes only six such characters: <, >, &, ', ", and \r.
// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
// always true.
func EscapeString(s string) string {
if strings.IndexAny(s, escapedChars) == -1 {
return s
}
var buf bytes.Buffer
escape(&buf, s)
return buf.String()
return htmlEscaper.Replace(s)
}

// UnescapeString unescapes entities like "&lt;" to become "<". It unescapes a
// larger range of entities than EscapeString escapes. For example, "&aacute;"
// unescapes to "á", as does "&#225;" and "&xE1;".
// unescapes to "á", as does "&#225;" and "&#xE1;".
// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
// always true.
func UnescapeString(s string) string {
for _, c := range s {
if c == '&' {
return string(unescape([]byte(s), false))
}
populateMapsOnce.Do(populateMaps)
if i := strings.IndexByte(s, '&'); i >= 0 {
return string(unescapeInner([]byte(s), i, false))
}
return s
}
22 changes: 22 additions & 0 deletions html/escape_example_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package html_test

import (
"fmt"
"html"
)

// ExampleEscapeString shows EscapeString replacing the quote, ampersand,
// apostrophe and angle-bracket characters of a display name and address with
// character references.
func ExampleEscapeString() {
	// The address had been mangled into "[email protected]" by e-mail
	// obfuscation on the page this was copied from; it is restored here.
	const s = `"Fran & Freddie's Diner" <tasty@example.com>`
	fmt.Println(html.EscapeString(s))
	// Output: &#34;Fran &amp; Freddie&#39;s Diner&#34; &lt;tasty@example.com&gt;
}

// ExampleUnescapeString shows UnescapeString turning named and numeric
// character references back into the characters they stand for.
func ExampleUnescapeString() {
	// The address had been mangled into "[email protected]" by e-mail
	// obfuscation on the page this was copied from; it is restored here.
	const s = `&quot;Fran &amp; Freddie&#39;s Diner&quot; &lt;tasty@example.com&gt;`
	fmt.Println(html.UnescapeString(s))
	// Output: "Fran & Freddie's Diner" <tasty@example.com>
}
74 changes: 73 additions & 1 deletion html/escape_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@

package html

import "testing"
import (
"strings"
"testing"
)

type unescapeTest struct {
// A short description of the test case.
Expand Down Expand Up @@ -64,6 +67,24 @@ var unescapeTests = []unescapeTest{
"Footnote&#x87;",
"Footnote‡",
},
// Handle single ampersand.
{
"copySingleAmpersand",
"&",
"&",
},
// Handle ampersand followed by non-entity.
{
"copyAmpersandNonEntity",
"text &test",
"text &test",
},
// Handle "&#".
{
"copyAmpersandHash",
"text &#",
"text &#",
},
}

func TestUnescape(t *testing.T) {
Expand Down Expand Up @@ -95,3 +116,54 @@ func TestUnescapeEscape(t *testing.T) {
}
}
}

// Inputs shared by the escape and unescape benchmarks.

// benchEscapeData repeats a 40-byte unit containing <, >, &, ' and ".
var benchEscapeData = strings.Repeat("AAAAA < BBBBB > CCCCC & DDDDD ' EEEEE \" ", 100)

// benchEscapeNone has the same shape as benchEscapeData but contains no
// character that needs escaping.
var benchEscapeNone = strings.Repeat("AAAAA x BBBBB x CCCCC x DDDDD x EEEEE x ", 100)

// benchUnescapeSparse is mostly plain text: each 400-byte plain run is
// followed by a single "&amp;" entity.
var benchUnescapeSparse = strings.Repeat(strings.Repeat("AAAAA x BBBBB x CCCCC x DDDDD x EEEEE x ", 10)+"&amp;", 10)

// benchUnescapeDense consists almost entirely of entities.
var benchUnescapeDense = strings.Repeat("&amp;&lt; &amp; &lt;", 100)

// BenchmarkEscape times EscapeString on benchEscapeData, an input that
// contains characters requiring escaping.
func BenchmarkEscape(b *testing.B) {
	var total int
	for iter := 0; iter < b.N; iter++ {
		escaped := EscapeString(benchEscapeData)
		total += len(escaped)
	}
}

// BenchmarkEscapeNone times EscapeString on benchEscapeNone, an input in
// which nothing needs escaping.
func BenchmarkEscapeNone(b *testing.B) {
	var total int
	for iter := 0; iter < b.N; iter++ {
		escaped := EscapeString(benchEscapeNone)
		total += len(escaped)
	}
}

// BenchmarkUnescape times UnescapeString on the escaped form of
// benchEscapeData. The input is escaped once, before the loop.
func BenchmarkUnescape(b *testing.B) {
	input := EscapeString(benchEscapeData)
	var total int
	for iter := 0; iter < b.N; iter++ {
		unescaped := UnescapeString(input)
		total += len(unescaped)
	}
}

// BenchmarkUnescapeNone times UnescapeString on the escaped form of
// benchEscapeNone. The input is escaped once, before the loop.
func BenchmarkUnescapeNone(b *testing.B) {
	input := EscapeString(benchEscapeNone)
	var total int
	for iter := 0; iter < b.N; iter++ {
		unescaped := UnescapeString(input)
		total += len(unescaped)
	}
}

// BenchmarkUnescapeSparse times UnescapeString on benchUnescapeSparse, where
// entities are separated by long runs of plain text.
func BenchmarkUnescapeSparse(b *testing.B) {
	var total int
	for iter := 0; iter < b.N; iter++ {
		unescaped := UnescapeString(benchUnescapeSparse)
		total += len(unescaped)
	}
}

// BenchmarkUnescapeDense times UnescapeString on benchUnescapeDense, where
// nearly every byte belongs to an entity.
func BenchmarkUnescapeDense(b *testing.B) {
	var total int
	for iter := 0; iter < b.N; iter++ {
		unescaped := UnescapeString(benchUnescapeDense)
		total += len(unescaped)
	}
}
22 changes: 22 additions & 0 deletions html/fuzz_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package html

import "testing"

// FuzzEscapeUnescape fuzzes the round trip through EscapeString and
// UnescapeString: for every input v, unescaping the escaped form must give
// back v exactly.
func FuzzEscapeUnescape(f *testing.F) {
	roundTrip := func(t *testing.T, v string) {
		escaped := EscapeString(v)
		if restored := UnescapeString(escaped); restored != v {
			t.Errorf("EscapeString(%q) = %q, UnescapeString(%q) = %q, want %q", v, escaped, escaped, restored, v)
		}

		// The opposite order is documented as not always reproducing v, so
		// its result is not compared; it is run only so the fuzzer can
		// surface panics in it.
		EscapeString(UnescapeString(v))
	}
	f.Fuzz(roundTrip)
}