Skip to content

Commit bb6cf6e

Browse files
authored
♻️ REFACTOR: Port mdurl and punycode for URL normalisation (#171)
This port brings markdown-it-py closer in line with markdown-it, and fixes the outstanding CommonMark compliance tests.
1 parent 73763b3 commit bb6cf6e

File tree

7 files changed

+120
-156
lines changed

7 files changed

+120
-156
lines changed

markdown_it/_punycode.py

+66
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
# Copyright 2014 Mathias Bynens <https://mathiasbynens.be/>
2+
# Copyright 2021 Taneli Hukkinen
3+
#
4+
# Permission is hereby granted, free of charge, to any person obtaining
5+
# a copy of this software and associated documentation files (the
6+
# "Software"), to deal in the Software without restriction, including
7+
# without limitation the rights to use, copy, modify, merge, publish,
8+
# distribute, sublicense, and/or sell copies of the Software, and to
9+
# permit persons to whom the Software is furnished to do so, subject to
10+
# the following conditions:
11+
#
12+
# The above copyright notice and this permission notice shall be
13+
# included in all copies or substantial portions of the Software.
14+
#
15+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22+
23+
import codecs
24+
import re
25+
26+
# Characters accepted as separators between domain labels: U+002E (full stop)
# plus the dot-like code points U+3002, U+FF0E and U+FF61. `map_domain` splits
# the domain on this pattern before mapping each label.
REGEX_SEPARATORS = re.compile(r"[\x2E\u3002\uFF0E\uFF61]")
27+
# Matches any character outside the ASCII range U+0000-U+007F. `to_ascii` uses
# it to decide whether a label has to be punycoded. The upper bound is \x7F so
# that DEL (U+007F), which is an ASCII character, does not cause an otherwise
# pure-ASCII label to be given an "xn--" prefix.
REGEX_NON_ASCII = re.compile(r"[^\0-\x7F]")
28+
29+
30+
def encode(uni: str) -> str:
    """Return the Punycode encoding of ``uni`` as text.

    The conversion is delegated to the standard-library ``punycode`` codec,
    whose bytes output is decoded back into a ``str``. No ``xn--`` prefix is
    added here.
    """
    punycoded = uni.encode("punycode")
    return punycoded.decode()
32+
33+
34+
def decode(ascii: str) -> str:
    """Return the Unicode text represented by the Punycode string ``ascii``.

    The conversion is delegated to the standard-library ``punycode`` codec.
    The input is expected without any ``xn--`` prefix.
    """
    # The typeshed overloads do not cover a ``str`` input for this codec,
    # hence the ignore marker.
    unicode_text = codecs.decode(ascii, encoding="punycode")  # type: ignore[call-overload]
    return unicode_text
36+
37+
38+
def map_domain(string, fn):
    """Apply ``fn`` to every label of a domain name or email address.

    Only the domain is processed. When ``string`` contains ``@``, everything
    up to and including the last ``@`` is copied to the result unchanged, so
    the local part of an email address is left intact. The domain is split on
    the separators matched by ``REGEX_SEPARATORS`` and the mapped labels are
    re-joined with ``"."``.

    :param string: domain name or email address
    :param fn: callable that takes one label and returns its replacement
    :return: ``string`` with each domain label replaced by ``fn(label)``
    """
    # Partition on the last "@" instead of splitting on every "@": input that
    # contains more than one "@" is then kept whole rather than being
    # truncated after the second piece.
    local_part, at_sign, domain = string.rpartition("@")
    labels = REGEX_SEPARATORS.split(domain)
    return local_part + at_sign + ".".join(fn(label) for label in labels)
49+
50+
51+
def to_unicode(obj: str) -> str:
    """Convert the Punycode labels of a domain name or email address to Unicode.

    Each label that starts with ``xn--`` has that prefix removed, is
    lower-cased and then Punycode-decoded; every other label is returned
    unchanged. Splitting into labels and re-joining is done by ``map_domain``.
    """
    ace_prefix = "xn--"

    def decode_label(label: str) -> str:
        # Labels without the prefix are not Punycode and pass through as-is.
        if not label.startswith(ace_prefix):
            return label
        return decode(label[len(ace_prefix) :].lower())

    return map_domain(obj, decode_label)
58+
59+
60+
def to_ascii(obj: str) -> str:
    """Convert the Unicode labels of a domain name or email address to Punycode.

    Each label in which ``REGEX_NON_ASCII`` finds a match is replaced by
    ``"xn--"`` followed by its Punycode encoding; every other label is
    returned unchanged. Splitting into labels and re-joining is done by
    ``map_domain``.
    """

    def encode_label(label: str) -> str:
        needs_encoding = REGEX_NON_ASCII.search(label) is not None
        return "xn--" + encode(label) if needs_encoding else label

    return map_domain(obj, encode_label)

markdown_it/common/normalize_url.py

+39-138
Original file line numberDiff line numberDiff line change
@@ -1,70 +1,13 @@
1-
import html
21
import re
32
from typing import Callable, Optional
43
from urllib.parse import urlparse, urlunparse, quote, unquote # noqa: F401
54

6-
from .utils import ESCAPABLE
5+
import mdurl
76

8-
# TODO below we port the use of the JS packages:
9-
# var mdurl = require('mdurl')
10-
# var punycode = require('punycode')
11-
#
12-
# e.g. mdurl: parsed = mdurl.parse(url, True)
13-
#
14-
# but need to check these fixes from https://www.npmjs.com/package/mdurl:
15-
#
16-
# Parse url string. Similar to node's url.parse,
17-
# but without any normalizations and query string parse.
18-
# url - input url (string)
19-
# slashesDenoteHost - if url starts with //, expect a hostname after it. Optional, false.
20-
# Difference with node's url:
7+
from .. import _punycode
218

22-
# No leading slash in paths, e.g. in url.parse('http://foo?bar') pathname is ``, not /
23-
# Backslashes are not replaced with slashes, so http:\\example.org\ is treated like a relative path
24-
# Trailing colon is treated like a part of the path, i.e. in http://example.org:foo pathname is :foo
25-
# Nothing is URL-encoded in the resulting object,
26-
# (in joyent/node some chars in auth and paths are encoded)
27-
# url.parse() does not have parseQueryString argument
28-
# Removed extraneous result properties: host, path, query, etc.,
29-
# which can be constructed using other parts of the url.
309

31-
32-
# ################# Copied from Commonmark.py #################
33-
34-
ENTITY = "&(?:#x[a-f0-9]{1,6}|#[0-9]{1,7}|[a-z][a-z0-9]{1,31});"
35-
reBackslashOrAmp = re.compile(r"[\\&]")
36-
reEntityOrEscapedChar = re.compile(
37-
"\\\\" + "[" + ESCAPABLE + "]|" + ENTITY, re.IGNORECASE
38-
)
39-
40-
41-
def unescape_char(s: str) -> str:
42-
if s[0] == "\\":
43-
return s[1]
44-
else:
45-
return html.unescape(s)
46-
47-
48-
def unescape_string(s: str) -> str:
49-
"""Replace entities and backslash escapes with literal characters."""
50-
if re.search(reBackslashOrAmp, s):
51-
return re.sub(reEntityOrEscapedChar, lambda m: unescape_char(m.group()), s)
52-
else:
53-
return s
54-
55-
56-
def normalize_uri(uri: str) -> str:
57-
return quote(uri, safe="/@:+?=&()%#*,")
58-
59-
60-
##################
61-
62-
63-
RECODE_HOSTNAME_FOR = ("http", "https", "mailto")
64-
65-
66-
def unescape_normalize_uri(x: str) -> str:
67-
return normalize_uri(unescape_string(x))
10+
RECODE_HOSTNAME_FOR = ("http:", "https:", "mailto:")
6811

6912

7013
def normalizeLink(url: str) -> str:
@@ -75,91 +18,49 @@ def normalizeLink(url: str) -> str:
7518
[label]: destination 'title'
7619
^^^^^^^^^^^
7720
"""
78-
(scheme, netloc, path, params, query, fragment) = urlparse(url)
79-
if scheme in RECODE_HOSTNAME_FOR:
80-
url = urlunparse(
81-
(
82-
scheme,
83-
unescape_normalize_uri(netloc),
84-
normalize_uri(path),
85-
unescape_normalize_uri(params),
86-
normalize_uri(query),
87-
unescape_normalize_uri(fragment),
88-
)
89-
)
90-
else:
91-
url = unescape_normalize_uri(url)
92-
93-
return url
94-
95-
# TODO the selective encoding below should probably be done here,
96-
# something like:
97-
# url_check = urllib.parse.urlparse(destination)
98-
# if url_check.scheme in RECODE_HOSTNAME_FOR: ...
99-
100-
# parsed = urlparse(url)
101-
# if parsed.hostname:
102-
# # Encode hostnames in urls like:
103-
# # `http:#host/`, `https:#host/`, `mailto:user@host`, `#host/`
104-
# #
105-
# # We don't encode unknown schemas, because it's likely that we encode
106-
# # something we shouldn't (e.g. `skype:name` treated as `skype:host`)
107-
# #
108-
# if (not parsed.scheme) or parsed.scheme in RECODE_HOSTNAME_FOR:
109-
# try:
110-
# parsed.hostname = punycode.toASCII(parsed.hostname)
111-
# except Exception:
112-
# pass
113-
# return quote(urlunparse(parsed))
114-
115-
116-
def unescape_unquote(x: str) -> str:
117-
return unquote(unescape_string(x))
118-
119-
120-
def normalizeLinkText(link: str) -> str:
21+
parsed = mdurl.parse(url, slashes_denote_host=True)
22+
23+
if parsed.hostname:
24+
# Encode hostnames in urls like:
25+
# `http://host/`, `https://host/`, `mailto:user@host`, `//host/`
26+
#
27+
# We don't encode unknown schemas, because it's likely that we encode
28+
# something we shouldn't (e.g. `skype:name` treated as `skype:host`)
29+
#
30+
if not parsed.protocol or parsed.protocol in RECODE_HOSTNAME_FOR:
31+
try:
32+
parsed = parsed._replace(hostname=_punycode.to_ascii(parsed.hostname))
33+
except Exception:
34+
pass
35+
36+
return mdurl.encode(mdurl.format(parsed))
37+
38+
39+
def normalizeLinkText(url: str) -> str:
12140
"""Normalize autolink content
12241
12342
::
12443
12544
<destination>
12645
~~~~~~~~~~~
12746
"""
128-
(scheme, netloc, path, params, query, fragment) = urlparse(link)
129-
if scheme in RECODE_HOSTNAME_FOR:
130-
url = urlunparse(
131-
(
132-
scheme,
133-
unescape_unquote(netloc),
134-
unquote(path),
135-
unescape_unquote(params),
136-
unquote(query),
137-
unescape_unquote(fragment),
138-
)
139-
)
140-
else:
141-
url = unescape_unquote(link)
142-
return url
143-
144-
# TODO the selective encoding below should probably be done here,
145-
# something like:
146-
# url_check = urllib.parse.urlparse(destination)
147-
# if url_check.scheme in RECODE_HOSTNAME_FOR: ...
148-
149-
# parsed = urlparse(url)
150-
# if parsed.hostname:
151-
# # Encode hostnames in urls like:
152-
# # `http:#host/`, `https:#host/`, `mailto:user@host`, `#host/`
153-
# #
154-
# # We don't encode unknown schemas, because it's likely that we encode
155-
# # something we shouldn't (e.g. `skype:name` treated as `skype:host`)
156-
# #
157-
# if (not parsed.protocol) or parsed.protocol in RECODE_HOSTNAME_FOR:
158-
# try:
159-
# parsed.hostname = punycode.toUnicode(parsed.hostname)
160-
# except Exception:
161-
# pass
162-
# return unquote(urlunparse(parsed))
47+
parsed = mdurl.parse(url, slashes_denote_host=True)
48+
49+
if parsed.hostname:
50+
# Decode punycoded hostnames in urls like:
51+
# `http://host/`, `https://host/`, `mailto:user@host`, `//host/`
52+
#
53+
# We don't encode unknown schemas, because it's likely that we encode
54+
# something we shouldn't (e.g. `skype:name` treated as `skype:host`)
55+
#
56+
if not parsed.protocol or parsed.protocol in RECODE_HOSTNAME_FOR:
57+
try:
58+
parsed = parsed._replace(hostname=_punycode.to_unicode(parsed.hostname))
59+
except Exception:
60+
pass
61+
62+
# add '%' to exclude list because of https://github.com/markdown-it/markdown-it/issues/720
63+
return mdurl.decode(mdurl.format(parsed), mdurl.DECODE_DEFAULT_CHARS + "%")
16364

16465

16566
BAD_PROTO_RE = re.compile(r"^(vbscript|javascript|file|data):")

markdown_it/common/utils.py

+11-4
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,6 @@
66

77
from .entities import entities
88

9-
# from .normalize_url import unescape_string
10-
119

1210
def charCodeAt(src: str, pos: int) -> Any:
1311
"""
@@ -105,7 +103,7 @@ def fromCodePoint(c: int) -> str:
105103
UNESCAPE_MD_RE = re.compile(r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])')
106104
# ENTITY_RE_g = re.compile(r'&([a-z#][a-z0-9]{1,31})', re.IGNORECASE)
107105
UNESCAPE_ALL_RE = re.compile(
108-
r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])' + "|" + r"&([a-z#][a-z0-9]{1,31})",
106+
r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])' + "|" + r"&([a-z#][a-z0-9]{1,31});",
109107
re.IGNORECASE,
110108
)
111109
DIGITAL_ENTITY_TEST_RE = re.compile(r"^#((?:x[a-f0-9]{1,8}|[0-9]{1,8}))", re.IGNORECASE)
@@ -146,7 +144,16 @@ def unescapeMd(string: str) -> str:
146144

147145

148146
def unescapeAll(string: str) -> str:
149-
return html.unescape(string)
147+
def replacer_func(match):
148+
escaped = match.group(1)
149+
if escaped:
150+
return escaped
151+
entity = match.group(2)
152+
return replaceEntityPattern(match.group(), entity)
153+
154+
if "\\" not in string and "&" not in string:
155+
return string
156+
return UNESCAPE_ALL_RE.sub(replacer_func, string)
150157

151158

152159
ESCAPABLE = r"""\\!"#$%&'()*+,./:;<=>?@\[\]^`{}|_~-"""

markdown_it/helpers/parse_link_title.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""Parse link title
22
"""
3-
from ..common.utils import unescapeAll, charCodeAt, stripEscape
3+
from ..common.utils import unescapeAll, charCodeAt
44

55

66
class _Result:
@@ -40,7 +40,7 @@ def parseLinkTitle(string: str, pos: int, maximum: int) -> _Result:
4040
code = charCodeAt(string, pos)
4141
if code == marker:
4242
title = string[start + 1 : pos]
43-
title = unescapeAll(stripEscape(title))
43+
title = unescapeAll(title)
4444
result.pos = pos + 1
4545
result.lines = lines
4646
result.str = title

markdown_it/rules_block/fence.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
# fences (``` lang, ~~~ lang)
22
import logging
33

4-
from ..common.utils import stripEscape
54
from .state_block import StateBlock
65

76
LOGGER = logging.getLogger(__name__)
@@ -97,7 +96,7 @@ def fence(state: StateBlock, startLine: int, endLine: int, silent: bool):
9796
state.line = nextLine + (1 if haveEndMarker else 0)
9897

9998
token = state.push("fence", "code", 0)
100-
token.info = stripEscape(params)
99+
token.info = params
101100
token.content = state.getLines(startLine + 1, nextLine, length, True)
102101
token.markup = markup
103102
token.map = [startLine, state.line]

setup.cfg

+1
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ project_urls =
3030
[options]
3131
packages = find:
3232
install_requires =
33+
mdurl
3334
attrs>=19,<22
3435
typing_extensions>=3.7.4;python_version<'3.8'
3536
python_requires = ~=3.6

tests/test_port/test_fixtures.py

-10
Original file line numberDiff line numberDiff line change
@@ -64,13 +64,6 @@ def test_table(line, title, input, expected):
6464
read_fixture_file(FIXTURE_PATH.joinpath("commonmark_extras.md")),
6565
)
6666
def test_commonmark_extras(line, title, input, expected):
67-
if title in {
68-
"Escaping entities in links:",
69-
"Checking combination of replaceEntities and unescapeMd:",
70-
}:
71-
# TODO fix failing escaping tests
72-
# probably requires a fix of common.utils.stripEscape
73-
pytest.xfail("escaping entities in link titles / fence.info")
7467
md = MarkdownIt("commonmark")
7568
md.options["langPrefix"] = ""
7669
text = md.render(input)
@@ -99,9 +92,6 @@ def test_normalize_url(line, title, input, expected):
9992
"line,title,input,expected", read_fixture_file(FIXTURE_PATH.joinpath("fatal.md"))
10093
)
10194
def test_fatal(line, title, input, expected):
102-
if line in [1, 17]:
103-
# TODO fix failing url escaping tests
104-
pytest.xfail("url normalisation")
10595
md = MarkdownIt("commonmark").enable("replacements")
10696
md.options["typographer"] = True
10797
text = md.render(input)

0 commit comments

Comments
 (0)