Skip to content

Commit

Permalink
[utils] Unescape HTML5 named character references (with no ;)
Browse files Browse the repository at this point in the history
  • Loading branch information
dirkf committed Mar 11, 2024
1 parent 059ef5b commit 80cb917
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 8 deletions.
5 changes: 4 additions & 1 deletion test/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,9 @@ def test_unescape_html(self):
self.assertEqual(unescapeHTML('&a"'), '&a"')
# HTML5 entities
self.assertEqual(unescapeHTML('&period;&apos;'), '.\'')
# non-semicolon HTML5 (bah!) entities
self.assertEqual(unescapeHTML('&amp&AMPetc'), '&&etc')
self.assertEqual(unescapeHTML('&pound&POUNDetc'), '£&POUNDetc')

def test_date_from_str(self):
self.assertEqual(date_from_str('yesterday'), date_from_str('now-1day'))
Expand Down Expand Up @@ -1251,7 +1254,7 @@ def test_intlist_to_bytes(self):
def test_args_to_str(self):
    # The expected quoting depends on the platform name: double quotes when
    # compat_os_name is 'nt' or 'ce', single quotes everywhere else.
    self.assertEqual(
        args_to_str(['foo', 'ba/r', '-baz', '2 be', '']),
        'foo ba/r -baz \'2 be\' \'\'' if compat_os_name not in ('nt', 'ce') else 'foo ba/r -baz "2 be" ""'
    )

def test_parse_filesize(self):
Expand Down
35 changes: 28 additions & 7 deletions youtube_dl/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2232,25 +2232,25 @@ def orderedSet(iterable):
return res


def _htmlentity_transform(entity_with_semicolon):
def _htmlentity_transform(entity):
"""Transforms an HTML entity to a character."""
entity_with_semicolon = entity if entity[-1] == ';' else (entity + ';')
entity = entity_with_semicolon[:-1]

# Known non-numeric HTML entity
if entity in compat_html_entities.name2codepoint:
return compat_chr(compat_html_entities.name2codepoint[entity])

# TODO: HTML5 allows entities without a semicolon. For example,
# '&Eacuteric' should be decoded as 'Éric'.
if entity_with_semicolon in compat_html_entities_html5:
return compat_html_entities_html5[entity_with_semicolon]

mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
# numeric entity
mobj = re.match(r'(?i)#(x[0-9a-f]+|[0-9]+)', entity)
if mobj is not None:
numstr = mobj.group(1)
if numstr.startswith('x'):
if numstr[0] in 'xX':
base = 16
numstr = '0%s' % numstr
numstr = '0%s' % numstr.lower()
else:
base = 10
# See https://github.com/ytdl-org/youtube-dl/issues/7518
Expand All @@ -2263,13 +2263,34 @@ def _htmlentity_transform(entity_with_semicolon):
return '&%s;' % entity


# Based on https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references#cite_note-semicolon-2
# (someone else read WHATWG so we didn't have to)
#
# HTML5 allows a fixed set of legacy named character references to be written
# without the terminating ';'.  Every item in the tuples below is a regex
# fragment that matches one or more of those names.
_html5_CI_non_semicolon_entities = (
    # canonically use lower-case REs; each of these also exists in an
    # all-upper-case spelling, which is generated below with str.upper()
    'quot', 'amp', '[lg]t', 'copy', 'reg', 'eth', 'thorn',
)
# A real tuple (not a one-shot iterator) so the name can be iterated again
# after it has been joined into the pattern below.
_html5_non_semicolon_entities = (
    _html5_CI_non_semicolon_entities
    + tuple(e.upper() for e in _html5_CI_non_semicolon_entities)
    + ('nbsp', 'i(?:excl|quest)', 'cent', 'pound', 'curren', 'yen', 'brvbar',
       'sect', 'ord[fm]', '[lr]aquo', 'not', 'shy', 'macr', 'deg',
       'plusmn', 'sup[231]', 'micro', 'para', 'middot', '[cC]?cedil',
       'frac(?:12|[13]4)', '[aAeEiIoOuUyY]?(?:acute|uml)',
       '[aAeEiIoOuU](?:grave|circ)', '[aA]ring', '[aAnNoO]tilde',
       '(?:ae|AE|sz)lig', '[oO]slash', 'divide', 'times', )
)
# Group 1 is either a generic reference body including its ';'
# (e.g. 'eacute;', '#x2F;') or one of the semicolon-less names above.
_html5_entities_re = '&([^&;]+;|%s)' % '|'.join(_html5_non_semicolon_entities)


def unescapeHTML(s):
    """Replace HTML character references in s with the text they stand for.

    Every match of _html5_entities_re is passed (without its leading '&')
    to _htmlentity_transform and replaced by that function's return value.

    Returns None if s is None; otherwise s must be a compat_str.
    """
    if s is None:
        return None
    assert isinstance(s, compat_str)

    return re.sub(
        # match generic &xxx;, &nnn; entities, and also
        # HTML5 "named character references" with *omitted* final ;
        _html5_entities_re, lambda m: _htmlentity_transform(m.group(1)), s)


def process_communicate_or_kill(p, *args, **kwargs):
Expand Down

0 comments on commit 80cb917

Please sign in to comment.