1
- import html
2
1
import re
3
2
from typing import Callable , Optional
4
3
from urllib .parse import urlparse , urlunparse , quote , unquote # noqa: F401
5
4
6
- from . utils import ESCAPABLE
5
+ import mdurl
7
6
8
- # TODO below we port the use of the JS packages:
9
- # var mdurl = require('mdurl')
10
- # var punycode = require('punycode')
11
- #
12
- # e.g. mdurl: parsed = mdurl.parse(url, True)
13
- #
14
- # but need to check these fixes from https://www.npmjs.com/package/mdurl:
15
- #
16
- # Parse url string. Similar to node's url.parse,
17
- # but without any normalizations and query string parse.
18
- # url - input url (string)
19
- # slashesDenoteHost - if url starts with //, expect a hostname after it. Optional, false.
20
- # Difference with node's url:
7
+ from .. import _punycode
21
8
22
- # No leading slash in paths, e.g. in url.parse('http://foo?bar') pathname is ``, not /
23
- # Backslashes are not replaced with slashes, so http:\\example.org\ is treated like a relative path
24
- # Trailing colon is treated like a part of the path, i.e. in http://example.org:foo pathname is :foo
25
- # Nothing is URL-encoded in the resulting object,
26
- # (in joyent/node some chars in auth and paths are encoded)
27
- # url.parse() does not have parseQueryString argument
28
- # Removed extraneous result properties: host, path, query, etc.,
29
- # which can be constructed using other parts of the url.
30
9
31
-
32
- # ################# Copied from Commonmark.py #################
33
-
34
- ENTITY = "&(?:#x[a-f0-9]{1,6}|#[0-9]{1,7}|[a-z][a-z0-9]{1,31});"
35
- reBackslashOrAmp = re .compile (r"[\\&]" )
36
- reEntityOrEscapedChar = re .compile (
37
- "\\ \\ " + "[" + ESCAPABLE + "]|" + ENTITY , re .IGNORECASE
38
- )
39
-
40
-
41
- def unescape_char (s : str ) -> str :
42
- if s [0 ] == "\\ " :
43
- return s [1 ]
44
- else :
45
- return html .unescape (s )
46
-
47
-
48
- def unescape_string (s : str ) -> str :
49
- """Replace entities and backslash escapes with literal characters."""
50
- if re .search (reBackslashOrAmp , s ):
51
- return re .sub (reEntityOrEscapedChar , lambda m : unescape_char (m .group ()), s )
52
- else :
53
- return s
54
-
55
-
56
- def normalize_uri (uri : str ) -> str :
57
- return quote (uri , safe = "/@:+?=&()%#*," )
58
-
59
-
60
- ##################
61
-
62
-
63
- RECODE_HOSTNAME_FOR = ("http" , "https" , "mailto" )
64
-
65
-
66
- def unescape_normalize_uri (x : str ) -> str :
67
- return normalize_uri (unescape_string (x ))
10
# Protocols whose hostnames are recoded with punycode by the normalize
# functions. The values carry a trailing colon because they are compared
# directly against ``parsed.protocol``.
RECODE_HOSTNAME_FOR = ("http:", "https:", "mailto:")
68
11
69
12
70
13
def normalizeLink (url : str ) -> str :
@@ -75,91 +18,49 @@ def normalizeLink(url: str) -> str:
75
18
[label]: destination 'title'
76
19
^^^^^^^^^^^
77
20
"""
78
- (scheme , netloc , path , params , query , fragment ) = urlparse (url )
79
- if scheme in RECODE_HOSTNAME_FOR :
80
- url = urlunparse (
81
- (
82
- scheme ,
83
- unescape_normalize_uri (netloc ),
84
- normalize_uri (path ),
85
- unescape_normalize_uri (params ),
86
- normalize_uri (query ),
87
- unescape_normalize_uri (fragment ),
88
- )
89
- )
90
- else :
91
- url = unescape_normalize_uri (url )
92
-
93
- return url
94
-
95
- # TODO the selective encoding below should probably be done here,
96
- # something like:
97
- # url_check = urllib.parse.urlparse(destination)
98
- # if url_check.scheme in RECODE_HOSTNAME_FOR: ...
99
-
100
- # parsed = urlparse(url)
101
- # if parsed.hostname:
102
- # # Encode hostnames in urls like:
103
- # # `http:#host/`, `https:#host/`, `mailto:user@host`, `#host/`
104
- # #
105
- # # We don't encode unknown schemas, because it's likely that we encode
106
- # # something we shouldn't (e.g. `skype:name` treated as `skype:host`)
107
- # #
108
- # if (not parsed.scheme) or parsed.scheme in RECODE_HOSTNAME_FOR:
109
- # try:
110
- # parsed.hostname = punycode.toASCII(parsed.hostname)
111
- # except Exception:
112
- # pass
113
- # return quote(urlunparse(parsed))
114
-
115
-
116
- def unescape_unquote (x : str ) -> str :
117
- return unquote (unescape_string (x ))
118
-
119
-
120
- def normalizeLinkText (link : str ) -> str :
21
+ parsed = mdurl .parse (url , slashes_denote_host = True )
22
+
23
+ if parsed .hostname :
24
+ # Encode hostnames in urls like:
25
+ # `http://host/`, `https://host/`, `mailto:user@host`, `//host/`
26
+ #
27
+ # We don't encode unknown schemas, because it's likely that we encode
28
+ # something we shouldn't (e.g. `skype:name` treated as `skype:host`)
29
+ #
30
+ if not parsed .protocol or parsed .protocol in RECODE_HOSTNAME_FOR :
31
+ try :
32
+ parsed = parsed ._replace (hostname = _punycode .to_ascii (parsed .hostname ))
33
+ except Exception :
34
+ pass
35
+
36
+ return mdurl .encode (mdurl .format (parsed ))
37
+
38
+
39
def normalizeLinkText(url: str) -> str:
    """Normalize autolink content

    ::

        <destination>
         ~~~~~~~~~~~

    The URL is parsed with ``mdurl.parse``. When it has a hostname and either
    no protocol or one listed in ``RECODE_HOSTNAME_FOR``, the hostname is
    passed through ``_punycode.to_unicode``. The re-formatted URL is then run
    through ``mdurl.decode``.

    :param url: the autolink destination text.
    :return: the normalized link text.
    """
    parsed = mdurl.parse(url, slashes_denote_host=True)

    if parsed.hostname:
        # Convert hostnames to Unicode in urls like:
        # `http://host/`, `https://host/`, `mailto:user@host`, `//host/`
        #
        # We don't recode unknown schemes, because it's likely that we recode
        # something we shouldn't (e.g. `skype:name` treated as `skype:host`)
        #
        if not parsed.protocol or parsed.protocol in RECODE_HOSTNAME_FOR:
            try:
                parsed = parsed._replace(
                    hostname=_punycode.to_unicode(parsed.hostname)
                )
            except Exception:
                # Best effort: keep the hostname exactly as parsed when the
                # punycode conversion fails.
                pass

    # add '%' to exclude list because of https://github.com/markdown-it/markdown-it/issues/720
    return mdurl.decode(mdurl.format(parsed), mdurl.DECODE_DEFAULT_CHARS + "%")
163
64
164
65
165
66
# Matches text that starts with one of the ``vbscript:``, ``javascript:``,
# ``file:`` or ``data:`` protocols. The pattern is case-sensitive as written;
# the code that uses it is outside this view.
BAD_PROTO_RE = re .compile (r"^(vbscript|javascript|file|data):" )
0 commit comments