From 1655a7601198c657dcd484881710d6b3973bbbb3 Mon Sep 17 00:00:00 2001 From: Anthony Ramine Date: Fri, 19 Jul 2019 10:32:59 +0200 Subject: [PATCH 01/15] Update tests from wpt The two json files were taken from web-platform-tests/wpt@e69af8258d25011f3bdb7577323dcb98880445ea > test result: FAILED. 624 passed; 89 failed; 0 ignored; 0 measured --- tests/setters_tests.json | 425 +++++++++++++++++++++++++--- tests/urltestdata.json | 596 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 946 insertions(+), 75 deletions(-) diff --git a/tests/setters_tests.json b/tests/setters_tests.json index a45171bf3..db23d9247 100644 --- a/tests/setters_tests.json +++ b/tests/setters_tests.json @@ -27,7 +27,7 @@ "href": "a://example.net", "new_value": "", "expected": { - "href": "a://example.net/", + "href": "a://example.net", "protocol": "a:" } }, @@ -35,16 +35,24 @@ "href": "a://example.net", "new_value": "b", "expected": { - "href": "b://example.net/", + "href": "b://example.net", "protocol": "b:" } }, + { + "href": "javascript:alert(1)", + "new_value": "defuse", + "expected": { + "href": "defuse:alert(1)", + "protocol": "defuse:" + } + }, { "comment": "Upper-case ASCII is lower-cased", "href": "a://example.net", "new_value": "B", "expected": { - "href": "b://example.net/", + "href": "b://example.net", "protocol": "b:" } }, @@ -53,7 +61,7 @@ "href": "a://example.net", "new_value": "é", "expected": { - "href": "a://example.net/", + "href": "a://example.net", "protocol": "a:" } }, @@ -62,7 +70,7 @@ "href": "a://example.net", "new_value": "0b", "expected": { - "href": "a://example.net/", + "href": "a://example.net", "protocol": "a:" } }, @@ -71,7 +79,7 @@ "href": "a://example.net", "new_value": "+b", "expected": { - "href": "a://example.net/", + "href": "a://example.net", "protocol": "a:" } }, @@ -79,7 +87,7 @@ "href": "a://example.net", "new_value": "bC0+-.", "expected": { - "href": "bc0+-.://example.net/", + "href": "bc0+-.://example.net", "protocol": "bc0+-.:" } }, @@ -88,7 
+96,7 @@ "href": "a://example.net", "new_value": "b,c", "expected": { - "href": "a://example.net/", + "href": "a://example.net", "protocol": "a:" } }, @@ -97,10 +105,35 @@ "href": "a://example.net", "new_value": "bé", "expected": { - "href": "a://example.net/", + "href": "a://example.net", "protocol": "a:" } }, + { + "comment": "Can’t switch from URL containing username/password/port to file", + "href": "http://test@example.net", + "new_value": "file", + "expected": { + "href": "http://test@example.net/", + "protocol": "http:" + } + }, + { + "href": "gopher://example.net:1234", + "new_value": "file", + "expected": { + "href": "gopher://example.net:1234/", + "protocol": "gopher:" + } + }, + { + "href": "wss://x:x@example.net:1234", + "new_value": "file", + "expected": { + "href": "wss://x:x@example.net:1234/", + "protocol": "wss:" + } + }, { "comment": "Can’t switch from file URL with no host", "href": "file://localhost/", @@ -127,12 +160,36 @@ } }, { - "comment": "Spec deviation: from special scheme to not is not problematic. https://github.com/whatwg/url/issues/104", + "comment": "Can’t switch from special scheme to non-special", "href": "http://example.net", "new_value": "b", "expected": { - "href": "b://example.net/", - "protocol": "b:" + "href": "http://example.net/", + "protocol": "http:" + } + }, + { + "href": "file://hi/path", + "new_value": "s", + "expected": { + "href": "file://hi/path", + "protocol": "file:" + } + }, + { + "href": "https://example.net", + "new_value": "s", + "expected": { + "href": "https://example.net/", + "protocol": "https:" + } + }, + { + "href": "ftp://example.net", + "new_value": "test", + "expected": { + "href": "ftp://example.net/", + "protocol": "ftp:" } }, { @@ -145,12 +202,44 @@ } }, { - "comment": "Spec deviation: from non-special scheme with a host to special is not problematic. 
https://github.com/whatwg/url/issues/104", + "comment": "Can’t switch from non-special scheme to special", "href": "ssh://me@example.net", "new_value": "http", "expected": { - "href": "http://me@example.net/", - "protocol": "http:" + "href": "ssh://me@example.net", + "protocol": "ssh:" + } + }, + { + "href": "ssh://me@example.net", + "new_value": "gopher", + "expected": { + "href": "ssh://me@example.net", + "protocol": "ssh:" + } + }, + { + "href": "ssh://me@example.net", + "new_value": "file", + "expected": { + "href": "ssh://me@example.net", + "protocol": "ssh:" + } + }, + { + "href": "ssh://example.net", + "new_value": "file", + "expected": { + "href": "ssh://example.net", + "protocol": "ssh:" + } + }, + { + "href": "nonsense:///test", + "new_value": "https", + "expected": { + "href": "nonsense:///test", + "protocol": "nonsense:" } }, { @@ -170,6 +259,16 @@ "href": "view-source+data:text/html,

Test", "protocol": "view-source+data:" } + }, + { + "comment": "Port is set to null if it is the default for new scheme.", + "href": "http://foo.com:443/", + "new_value": "https", + "expected": { + "href": "https://foo.com/", + "protocol": "https:", + "port": "" + } } ], "username": [ @@ -266,14 +365,6 @@ "username": "" } }, - { - "href": "file://test/", - "new_value": "test", - "expected": { - "href": "file://test/", - "username": "" - } - }, { "href": "javascript://x/", "new_value": "wario", @@ -281,6 +372,14 @@ "href": "javascript://wario@x/", "username": "wario" } + }, + { + "href": "file://test/", + "new_value": "test", + "expected": { + "href": "file://test/", + "username": "" + } } ], "password": [ @@ -369,14 +468,6 @@ "password": "" } }, - { - "href": "file://test/", - "new_value": "test", - "expected": { - "href": "file://test/", - "password": "" - } - }, { "href": "javascript://x/", "new_value": "bowser", @@ -384,9 +475,27 @@ "href": "javascript://:bowser@x/", "password": "bowser" } + }, + { + "href": "file://test/", + "new_value": "test", + "expected": { + "href": "file://test/", + "password": "" + } } ], "host": [ + { + "comment": "Non-special scheme", + "href": "sc://x/", + "new_value": "\u0000", + "expected": { + "href": "sc://x/", + "host": "x", + "hostname": "x" + } + }, { "href": "sc://x/", "new_value": "\u0009", @@ -414,6 +523,15 @@ "hostname": "" } }, + { + "href": "sc://x/", + "new_value": " ", + "expected": { + "href": "sc://x/", + "host": "x", + "hostname": "x" + } + }, { "href": "sc://x/", "new_value": "#", @@ -459,6 +577,16 @@ "hostname": "%C3%9F" } }, + { + "comment": "IDNA Nontransitional_Processing", + "href": "https://x/", + "new_value": "ß", + "expected": { + "href": "https://xn--zca/", + "host": "xn--zca", + "hostname": "xn--zca" + } + }, { "comment": "Cannot-be-a-base means no host", "href": "mailto:me@example.net", @@ -499,14 +627,14 @@ } }, { - "comment": "Port number is removed if empty in the new value: 
https://github.com/whatwg/url/pull/113", + "comment": "Port number is unchanged if not specified", "href": "http://example.net:8080", "new_value": "example.com:", "expected": { - "href": "http://example.com/", - "host": "example.com", + "href": "http://example.com:8080/", + "host": "example.com:8080", "hostname": "example.com", - "port": "" + "port": "8080" } }, { @@ -591,6 +719,17 @@ "port": "80" } }, + { + "comment": "Port number is removed if new port is scheme default and existing URL has a non-default port", + "href": "http://example.net:8080", + "new_value": "example.com:80", + "expected": { + "href": "http://example.com/", + "host": "example.com", + "hostname": "example.com", + "port": "" + } + }, { "comment": "Stuff after a / delimiter is ignored", "href": "http://example.net/path", @@ -790,9 +929,69 @@ "host": "example.net", "hostname": "example.net" } + }, + { + "href": "file://y/", + "new_value": "x:123", + "expected": { + "href": "file://y/", + "host": "y", + "hostname": "y", + "port": "" + } + }, + { + "href": "file://y/", + "new_value": "loc%41lhost", + "expected": { + "href": "file:///", + "host": "", + "hostname": "", + "port": "" + } + }, + { + "href": "file://hi/x", + "new_value": "", + "expected": { + "href": "file:///x", + "host": "", + "hostname": "", + "port": "" + } + }, + { + "href": "sc://test@test/", + "new_value": "", + "expected": { + "href": "sc://test@test/", + "host": "test", + "hostname": "test", + "username": "test" + } + }, + { + "href": "sc://test:12/", + "new_value": "", + "expected": { + "href": "sc://test:12/", + "host": "test:12", + "hostname": "test", + "port": "12" + } } ], "hostname": [ + { + "comment": "Non-special scheme", + "href": "sc://x/", + "new_value": "\u0000", + "expected": { + "href": "sc://x/", + "host": "x", + "hostname": "x" + } + }, { "href": "sc://x/", "new_value": "\u0009", @@ -820,6 +1019,15 @@ "hostname": "" } }, + { + "href": "sc://x/", + "new_value": " ", + "expected": { + "href": "sc://x/", + "host": 
"x", + "hostname": "x" + } + }, { "href": "sc://x/", "new_value": "#", @@ -1055,6 +1263,56 @@ "host": "example.net", "hostname": "example.net" } + }, + { + "href": "file://y/", + "new_value": "x:123", + "expected": { + "href": "file://y/", + "host": "y", + "hostname": "y", + "port": "" + } + }, + { + "href": "file://y/", + "new_value": "loc%41lhost", + "expected": { + "href": "file:///", + "host": "", + "hostname": "", + "port": "" + } + }, + { + "href": "file://hi/x", + "new_value": "", + "expected": { + "href": "file:///x", + "host": "", + "hostname": "", + "port": "" + } + }, + { + "href": "sc://test@test/", + "new_value": "", + "expected": { + "href": "sc://test@test/", + "host": "test", + "hostname": "test", + "username": "test" + } + }, + { + "href": "sc://test:12/", + "new_value": "", + "expected": { + "href": "sc://test:12/", + "host": "test:12", + "hostname": "test", + "port": "12" + } } ], "port": [ @@ -1324,12 +1582,12 @@ } }, { - "comment": "UTF-8 percent encoding with the default encode set. Tabs and newlines are removed. Leading or training C0 controls and space are removed.", + "comment": "UTF-8 percent encoding with the default encode set. 
Tabs and newlines are removed.", "href": "a:/", - "new_value": "\u0000\u0001\t\n\r\u001f !\u0000\u0001\t\n\r\u001f !\"#$%&'()*+,-./09:;<=>?@AZ[\\]^_`az{|}~\u007f\u0080\u0081Éé", + "new_value": "\u0000\u0001\t\n\r\u001f !\"#$%&'()*+,-./09:;<=>?@AZ[\\]^_`az{|}~\u007f\u0080\u0081Éé", "expected": { - "href": "a:/!%00%01%1F%20!%22%23$%&'()*+,-./09:;%3C=%3E%3F@AZ[\\]^_%60az%7B|%7D~%7F%C2%80%C2%81%C3%89%C3%A9", - "pathname": "/!%00%01%1F%20!%22%23$%&'()*+,-./09:;%3C=%3E%3F@AZ[\\]^_%60az%7B|%7D~%7F%C2%80%C2%81%C3%89%C3%A9" + "href": "a:/%00%01%1F%20!%22%23$%&'()*+,-./09:;%3C=%3E%3F@AZ[\\]^_%60az%7B|%7D~%7F%C2%80%C2%81%C3%89%C3%A9", + "pathname": "/%00%01%1F%20!%22%23$%&'()*+,-./09:;%3C=%3E%3F@AZ[\\]^_%60az%7B|%7D~%7F%C2%80%C2%81%C3%89%C3%A9" } }, { @@ -1376,6 +1634,33 @@ "href": "sc://example.net/%23", "pathname": "/%23" } + }, + { + "comment": "File URLs and (back)slashes", + "href": "file://monkey/", + "new_value": "\\\\", + "expected": { + "href": "file://monkey/", + "pathname": "/" + } + }, + { + "comment": "File URLs and (back)slashes", + "href": "file:///unicorn", + "new_value": "//\\/", + "expected": { + "href": "file:///", + "pathname": "/" + } + }, + { + "comment": "File URLs and (back)slashes", + "href": "file:///unicorn", + "new_value": "//monkey/..//", + "expected": { + "href": "file:///", + "pathname": "/" + } } ], "search": [ @@ -1444,12 +1729,12 @@ } }, { - "comment": "UTF-8 percent encoding with the query encode set. Tabs and newlines are removed. Leading or training C0 controls and space are removed.", + "comment": "UTF-8 percent encoding with the query encode set. 
Tabs and newlines are removed.", "href": "a:/", - "new_value": "\u0000\u0001\t\n\r\u001f !\u0000\u0001\t\n\r\u001f !\"#$%&'()*+,-./09:;<=>?@AZ[\\]^_`az{|}~\u007f\u0080\u0081Éé", + "new_value": "\u0000\u0001\t\n\r\u001f !\"#$%&'()*+,-./09:;<=>?@AZ[\\]^_`az{|}~\u007f\u0080\u0081Éé", "expected": { - "href": "a:/?!%00%01%1F%20!%22%23$%&'()*+,-./09:;%3C=%3E?@AZ[\\]^_`az{|}~%7F%C2%80%C2%81%C3%89%C3%A9", - "search": "?!%00%01%1F%20!%22%23$%&'()*+,-./09:;%3C=%3E?@AZ[\\]^_`az{|}~%7F%C2%80%C2%81%C3%89%C3%A9" + "href": "a:/?%00%01%1F%20!%22%23$%&'()*+,-./09:;%3C=%3E?@AZ[\\]^_`az{|}~%7F%C2%80%C2%81%C3%89%C3%A9", + "search": "?%00%01%1F%20!%22%23$%&'()*+,-./09:;%3C=%3E?@AZ[\\]^_`az{|}~%7F%C2%80%C2%81%C3%89%C3%A9" } }, { @@ -1511,13 +1796,53 @@ "hash": "" } }, + { + "href": "http://example.net", + "new_value": "#foo bar", + "expected": { + "href": "http://example.net/#foo%20bar", + "hash": "#foo%20bar" + } + }, + { + "href": "http://example.net", + "new_value": "#foo\"bar", + "expected": { + "href": "http://example.net/#foo%22bar", + "hash": "#foo%22bar" + } + }, + { + "href": "http://example.net", + "new_value": "#foo<bar", + "expected": { + "href": "http://example.net/#foo%3Cbar", + "hash": "#foo%3Cbar" + } + }, + { + "href": "http://example.net", + "new_value": "#foo>bar", + "expected": { + "href": "http://example.net/#foo%3Ebar", + "hash": "#foo%3Ebar" + } + }, + { + "href": "http://example.net", + "new_value": "#foo`bar", + "expected": { + "href": "http://example.net/#foo%60bar", + "hash": "#foo%60bar" + } + }, { "comment": "Simple percent-encoding; nuls, tabs, and newlines are removed", "href": "a:/", - "new_value": "\u0000\u0001\t\n\r\u001f !\u0000\u0001\t\n\r\u001f !\"#$%&'()*+,-./09:;<=>?@AZ[\\]^_`az{|}~\u007f\u0080\u0081Éé", + "new_value": "\u0000\u0001\t\n\r\u001f !\"#$%&'()*+,-./09:;<=>?@AZ[\\]^_`az{|}~\u007f\u0080\u0081Éé", "expected": { - "href": "a:/#!%01%1F !\"#$%&'()*+,-./09:;<=>?@AZ[\\]^_`az{|}~%7F%C2%80%C2%81%C3%89%C3%A9", - "hash": "#!%01%1F 
!\"#$%&'()*+,-./09:;<=>?@AZ[\\]^_`az{|}~%7F%C2%80%C2%81%C3%89%C3%A9" + "href": "a:/#%01%1F%20!%22#$%&'()*+,-./09:;%3C=%3E?@AZ[\\]^_%60az{|}~%7F%C2%80%C2%81%C3%89%C3%A9", +
"hash": "#%01%1F%20!%22#$%&'()*+,-./09:;%3C=%3E?@AZ[\\]^_%60az{|}~%7F%C2%80%C2%81%C3%89%C3%A9" } }, { @@ -1528,6 +1853,14 @@ "href": "http://example.net/#%c3%89t%C3%A9", "hash": "#%c3%89t%C3%A9" } + }, + { + "href": "javascript:alert(1)", + "new_value": "castle", + "expected": { + "href": "javascript:alert(1)#castle", + "hash": "#castle" + } } ] } diff --git a/tests/urltestdata.json b/tests/urltestdata.json index 5565c938f..bf4e2a783 100644 --- a/tests/urltestdata.json +++ b/tests/urltestdata.json @@ -153,7 +153,7 @@ { "input": "http://f:21/ b ? d # e ", "base": "http://example.org/foo/bar", - "href": "http://f:21/%20b%20?%20d%20# e", + "href": "http://f:21/%20b%20?%20d%20#%20e", "origin": "http://f:21", "protocol": "http:", "username": "", @@ -163,12 +163,12 @@ "port": "21", "pathname": "/%20b%20", "search": "?%20d%20", - "hash": "# e" + "hash": "#%20e" }, { "input": "lolscheme:x x#x x", "base": "about:blank", - "href": "lolscheme:x x#x x", + "href": "lolscheme:x x#x%20x", "protocol": "lolscheme:", "username": "", "password": "", @@ -177,7 +177,7 @@ "port": "", "pathname": "x x", "search": "", - "hash": "#x x" + "hash": "#x%20x" }, { "input": "http://f:/c", @@ -572,7 +572,7 @@ { "input": "foo://", "base": "http://example.org/foo/bar", - "href": "foo:///", + "href": "foo://", "origin": "null", "protocol": "foo:", "username": "", @@ -580,7 +580,7 @@ "host": "", "hostname": "", "port": "", - "pathname": "/", + "pathname": "", "search": "", "hash": "" }, @@ -1433,6 +1433,22 @@ "search": "", "hash": "" }, + "# Based on https://felixfbecker.github.io/whatwg-url-custom-host-repro/", + { + "input": "ssh://example.com/foo/bar.git", + "base": "http://example.org/", + "href": "ssh://example.com/foo/bar.git", + "origin": "null", + "protocol": "ssh:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/foo/bar.git", + "search": "", + "hash": "" + }, "# Based on 
http://trac.webkit.org/browser/trunk/LayoutTests/fast/url/file.html", { "input": "file:c:\\foo\\bar.html", @@ -2260,7 +2276,7 @@ { "input": "http://www.google.com/foo?bar=baz# »", "base": "about:blank", - "href": "http://www.google.com/foo?bar=baz# %C2%BB", + "href": "http://www.google.com/foo?bar=baz#%20%C2%BB", "origin": "http://www.google.com", "protocol": "http:", "username": "", @@ -2270,12 +2286,12 @@ "port": "", "pathname": "/foo", "search": "?bar=baz", - "hash": "# %C2%BB" + "hash": "#%20%C2%BB" }, { "input": "data:test# »", "base": "about:blank", - "href": "data:test# %C2%BB", + "href": "data:test#%20%C2%BB", "origin": "null", "protocol": "data:", "username": "", @@ -2285,7 +2301,7 @@ "port": "", "pathname": "test", "search": "", - "hash": "# %C2%BB" + "hash": "#%20%C2%BB" }, { "input": "http://www.google.com", @@ -4015,6 +4031,37 @@ "search": "?`{}", "hash": "" }, + "byte is ' and url is special", + { + "input": "http://host/?'", + "base": "about:blank", + "href": "http://host/?%27", + "origin": "http://host", + "protocol": "http:", + "username": "", + "password": "", + "host": "host", + "hostname": "host", + "port": "", + "pathname": "/", + "search": "?%27", + "hash": "" + }, + { + "input": "notspecial://host/?'", + "base": "about:blank", + "href": "notspecial://host/?'", + "origin": "null", + "protocol": "notspecial:", + "username": "", + "password": "", + "host": "host", + "hostname": "host", + "port": "", + "pathname": "/", + "search": "?'", + "hash": "" + }, "# Credentials in base", { "input": "/some/path", @@ -4473,6 +4520,26 @@ "search": "", "hash": "" }, + { + "input": "sc://@/", + "base": "about:blank", + "failure": true + }, + { + "input": "sc://te@s:t@/", + "base": "about:blank", + "failure": true + }, + { + "input": "sc://:/", + "base": "about:blank", + "failure": true + }, + { + "input": "sc://:12/", + "base": "about:blank", + "failure": true + }, { "input": "sc://[/", "base": "about:blank", @@ -4566,6 +4633,22 @@ "search": "", "hash": "" }, 
+ "# unknown scheme with non-URL characters in the path", + { + "input": "wow:\uFFFF", + "base": "about:blank", + "href": "wow:%EF%BF%BF", + "origin": "null", + "protocol": "wow:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "%EF%BF%BF", + "search": "", + "hash": "" + }, "# Hosts and percent-encoding", { "input": "ftp://example.com%80/", @@ -4767,6 +4850,70 @@ "searchParams": "qux=", "hash": "#foo%08bar" }, + { + "input": "http://foo.bar/baz?qux#foo\"bar", + "base": "about:blank", + "href": "http://foo.bar/baz?qux#foo%22bar", + "origin": "http://foo.bar", + "protocol": "http:", + "username": "", + "password": "", + "host": "foo.bar", + "hostname": "foo.bar", + "port": "", + "pathname": "/baz", + "search": "?qux", + "searchParams": "qux=", + "hash": "#foo%22bar" + }, + { + "input": "http://foo.bar/baz?qux#foo<bar", + "base": "about:blank", + "href": "http://foo.bar/baz?qux#foo%3Cbar", + "origin": "http://foo.bar", + "protocol": "http:", + "username": "", + "password": "", + "host": "foo.bar", + "hostname": "foo.bar", + "port": "", + "pathname": "/baz", + "search": "?qux", + "searchParams": "qux=", + "hash": "#foo%3Cbar" + }, + { + "input": "http://foo.bar/baz?qux#foo>bar", + "base": "about:blank", + "href": "http://foo.bar/baz?qux#foo%3Ebar", + "origin": "http://foo.bar", + "protocol": "http:", + "username": "", + "password": "", + "host": "foo.bar", + "hostname": "foo.bar", + "port": "", + "pathname": "/baz", + "search": "?qux", + "searchParams": "qux=", + "hash": "#foo%3Ebar" + }, + { + "input": "http://foo.bar/baz?qux#foo`bar", + "base": "about:blank", + "href": "http://foo.bar/baz?qux#foo%60bar", + "origin": "http://foo.bar", + "protocol": "http:", + "username": "", + "password": "", + "host": "foo.bar", + "hostname": "foo.bar", + "port": "", + "pathname": "/baz", + "search": "?qux", + "searchParams": "qux=", + "hash": "#foo%60bar" + }, "# IPv4 parsing (via https://github.com/nodejs/node/pull/10317)", { 
"input": "http://192.168.257", @@ -4954,6 +5101,11 @@ "hash": "" }, "More IPv4 parsing (via https://github.com/jsdom/whatwg-url/issues/92)", + { + "input": "https://0x100000000/test", + "base": "about:blank", + "failure": true + }, { "input": "https://256.0.0.1/test", "base": "about:blank", @@ -5187,6 +5339,90 @@ "hash": "#x" }, "# File URLs and many (back)slashes", + {
+ "input": "file:\\\\//", + "base": "about:blank", + "href": "file:///", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "file:\\\\\\\\", + "base": "about:blank", + "href": "file:///", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "file:\\\\\\\\?fox", + "base": "about:blank", + "href": "file:///?fox", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/", + "search": "?fox", + "hash": "" + }, + { + "input": "file:\\\\\\\\#guppy", + "base": "about:blank", + "href": "file:///#guppy", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/", + "search": "", + "hash": "#guppy" + }, + { + "input": "file://spider///", + "base": "about:blank", + "href": "file://spider/", + "protocol": "file:", + "username": "", + "password": "", + "host": "spider", + "hostname": "spider", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "file:\\\\localhost//", + "base": "about:blank", + "href": "file:///", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, { "input": "file:///localhost//cat", "base": "about:blank", @@ -5201,6 +5437,48 @@ "search": "", "hash": "" }, + { + "input": "file://\\/localhost//cat", + "base": "about:blank", + "href": "file:///localhost//cat", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/localhost//cat", + "search": "", + "hash": "" + }, + { + "input": "file://localhost//a//../..//", + "base": "about:blank", + "href": "file:///", + "protocol": "file:", + "username": "", + 
"password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "/////mouse", + "base": "file:///elephant", + "href": "file:///mouse", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/mouse", + "search": "", + "hash": "" + }, { "input": "\\//pig", "base": "file://lion/", @@ -5215,6 +5493,48 @@ "search": "", "hash": "" }, + { + "input": "\\/localhost//pig", + "base": "file://lion/", + "href": "file:///pig", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/pig", + "search": "", + "hash": "" + }, + { + "input": "//localhost//pig", + "base": "file://lion/", + "href": "file:///pig", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/pig", + "search": "", + "hash": "" + }, + { + "input": "/..//localhost//pig", + "base": "file://lion/", + "href": "file://lion/localhost//pig", + "protocol": "file:", + "username": "", + "password": "", + "host": "lion", + "hostname": "lion", + "port": "", + "pathname": "/localhost//pig", + "search": "", + "hash": "" + }, { "input": "file://", "base": "file://ape/", @@ -5229,7 +5549,50 @@ "search": "", "hash": "" }, + "# File URLs with non-empty hosts", + { + "input": "/rooibos", + "base": "file://tea/", + "href": "file://tea/rooibos", + "protocol": "file:", + "username": "", + "password": "", + "host": "tea", + "hostname": "tea", + "port": "", + "pathname": "/rooibos", + "search": "", + "hash": "" + }, + { + "input": "/?chai", + "base": "file://tea/", + "href": "file://tea/?chai", + "protocol": "file:", + "username": "", + "password": "", + "host": "tea", + "hostname": "tea", + "port": "", + "pathname": "/", + "search": "?chai", + "hash": "" + }, "# Windows drive letter handling with the 'file:' base URL", + { + "input": "C|", + "base": "file://host/dir/file", + 
"href": "file:///C:", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/C:", + "search": "", + "hash": "" + }, { "input": "C|#", "base": "file://host/dir/file", @@ -5329,6 +5692,48 @@ "hash": "" }, "# Windows drive letter quirk in the file slash state", + { + "input": "/c:/foo/bar", + "base": "file:///c:/baz/qux", + "href": "file:///c:/foo/bar", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/c:/foo/bar", + "search": "", + "hash": "" + }, + { + "input": "/c|/foo/bar", + "base": "file:///c:/baz/qux", + "href": "file:///c:/foo/bar", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/c:/foo/bar", + "search": "", + "hash": "" + }, + { + "input": "file:\\c:\\foo\\bar", + "base": "file:///c:/baz/qux", + "href": "file:///c:/foo/bar", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/c:/foo/bar", + "search": "", + "hash": "" + }, { "input": "/c:/foo/bar", "base": "file://host/path", @@ -5343,9 +5748,9 @@ "search": "", "hash": "" }, - "# Windows drive letter quirk (no host)", + "# Windows drive letter quirk with not empty host", { - "input": "file:/C|/", + "input": "file://example.net/C:/", "base": "about:blank", "href": "file:///C:/", "protocol": "file:", @@ -5359,7 +5764,7 @@ "hash": "" }, { - "input": "file://C|/", + "input": "file://1.2.3.4/C:/", "base": "about:blank", "href": "file:///C:/", "protocol": "file:", @@ -5372,9 +5777,8 @@ "search": "", "hash": "" }, - "# Windows drive letter quirk with not empty host", { - "input": "file://example.net/C:/", + "input": "file://[1::8]/C:/", "base": "about:blank", "href": "file:///C:/", "protocol": "file:", @@ -5387,8 +5791,9 @@ "search": "", "hash": "" }, + "# Windows drive letter quirk (no host)", { - "input": "file://1.2.3.4/C:/", + "input": 
"file:/C|/", "base": "about:blank", "href": "file:///C:/", "protocol": "file:", @@ -5402,7 +5807,7 @@ "hash": "" }, { - "input": "file://[1::8]/C:/", + "input": "file://C|/", "base": "about:blank", "href": "file:///C:/", "protocol": "file:", @@ -5544,6 +5949,109 @@ "failure": true }, "# Non-special-URL path tests", + { + "input": "sc://ñ", + "base": "about:blank", + "href": "sc://%C3%B1", + "origin": "null", + "protocol": "sc:", + "username": "", + "password": "", + "host": "%C3%B1", + "hostname": "%C3%B1", + "port": "", + "pathname": "", + "search": "", + "hash": "" + }, + { + "input": "sc://ñ?x", + "base": "about:blank", + "href": "sc://%C3%B1?x", + "origin": "null", + "protocol": "sc:", + "username": "", + "password": "", + "host": "%C3%B1", + "hostname": "%C3%B1", + "port": "", + "pathname": "", + "search": "?x", + "hash": "" + }, + { + "input": "sc://ñ#x", + "base": "about:blank", + "href": "sc://%C3%B1#x", + "origin": "null", + "protocol": "sc:", + "username": "", + "password": "", + "host": "%C3%B1", + "hostname": "%C3%B1", + "port": "", + "pathname": "", + "search": "", + "hash": "#x" + }, + { + "input": "#x", + "base": "sc://ñ", + "href": "sc://%C3%B1#x", + "origin": "null", + "protocol": "sc:", + "username": "", + "password": "", + "host": "%C3%B1", + "hostname": "%C3%B1", + "port": "", + "pathname": "", + "search": "", + "hash": "#x" + }, + { + "input": "?x", + "base": "sc://ñ", + "href": "sc://%C3%B1?x", + "origin": "null", + "protocol": "sc:", + "username": "", + "password": "", + "host": "%C3%B1", + "hostname": "%C3%B1", + "port": "", + "pathname": "", + "search": "?x", + "hash": "" + }, + { + "input": "sc://?", + "base": "about:blank", + "href": "sc://?", + "protocol": "sc:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "", + "search": "", + "hash": "" + }, + { + "input": "sc://#", + "base": "about:blank", + "href": "sc://#", + "protocol": "sc:", + "username": "", + "password": "", + "host": "", + 
"hostname": "", + "port": "", + "pathname": "", + "search": "", + "hash": "" + }, { "input": "///", "base": "sc://x/", @@ -5558,6 +6066,34 @@ "search": "", "hash": "" }, + { + "input": "////", + "base": "sc://x/", + "href": "sc:////", + "protocol": "sc:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "//", + "search": "", + "hash": "" + }, + { + "input": "////x/", + "base": "sc://x/", + "href": "sc:////x/", + "protocol": "sc:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "//x/", + "search": "", + "hash": "" + }, { "input": "tftp://foobar.com/someconfig;mode=netascii", "base": "about:blank", @@ -6048,27 +6584,34 @@ "search": "?a", "hash": "#%GH" }, - "Bad bases", + "URLs that require a non-about:blank base. (Also serve as invalid base tests.)", { - "input": "test-a.html", - "base": "a", + "input": "a", + "base": "about:blank", "failure": true }, { - "input": "test-a-slash.html", - "base": "a/", + "input": "a/", + "base": "about:blank", "failure": true }, { - "input": "test-a-slash-slash.html", - "base": "a//", + "input": "a//", + "base": "about:blank", "failure": true }, + "Bases that don't fail to parse but fail to be bases", { "input": "test-a-colon.html", "base": "a:", "failure": true }, + { + "input": "test-a-colon-b.html", + "base": "a:b", + "failure": true + }, + "Other base URL tests, that must succeed", { "input": "test-a-colon-slash.html", "base": "a:/", @@ -6097,11 +6640,6 @@ "search": "", "hash": "" }, - { - "input": "test-a-colon-b.html", - "base": "a:b", - "failure": true - }, { "input": "test-a-colon-slash-b.html", "base": "a:/b", From fa9f04487b3f3404281ae42dcbbab2d88a5144de Mon Sep 17 00:00:00 2001 From: Anthony Ramine Date: Fri, 19 Jul 2019 10:35:24 +0200 Subject: [PATCH 02/15] Fix percent encoding of fragments (closes #491) > test result: FAILED. 
637 passed; 76 failed; 0 ignored; 0 measured --- src/parser.rs | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/parser.rs b/src/parser.rs index e2ea36bfa..6c1417ddb 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -1329,14 +1329,8 @@ impl<'a> Parser<'a> { self.log_violation(SyntaxViolation::NullInFragment) } else { self.check_url_code_point(c, &input); - self.serialization.extend(utf8_percent_encode( - utf8_c, - // FIXME: tests fail when we use the FRAGMENT set here - // as defined in the spec as of 2019-07-17, - // likely because tests are out of date. - // See https://github.com/servo/rust-url/issues/290 - CONTROLS, - )); + self.serialization + .extend(utf8_percent_encode(utf8_c, FRAGMENT)); } } } From 412266a2f838ff009d2c1103ad7b93bea0a43478 Mon Sep 17 00:00:00 2001 From: Anthony Ramine Date: Sat, 20 Jul 2019 11:49:14 +0200 Subject: [PATCH 03/15] Refactor parse_file to look more like the spec --- src/parser.rs | 338 ++++++++++++++++++++++---------------------------- 1 file changed, 148 insertions(+), 190 deletions(-) diff --git a/src/parser.rs b/src/parser.rs index 6c1417ddb..537492459 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -488,15 +488,93 @@ impl<'a> Parser<'a> { mut self, input: Input, scheme_type: SchemeType, - mut base_file_url: Option<&Url>, + base_file_url: Option<&Url>, ) -> ParseResult { use SyntaxViolation::Backslash; // file state debug_assert!(self.serialization.is_empty()); let (first_char, input_after_first_char) = input.split_first(); - match first_char { - None => { + if matches!(first_char, Some('/') | Some('\\')) { + self.log_violation_if(SyntaxViolation::Backslash, || first_char == Some('\\')); + // file slash state + let (next_char, input_after_next_char) = input_after_first_char.split_first(); + if matches!(next_char, Some('/') | Some('\\')) { + self.log_violation_if(Backslash, || next_char == Some('\\')); + // file host state + self.serialization.push_str("file://"); + let scheme_end = "file".len() 
as u32; + let host_start = "file://".len() as u32; + let (path_start, mut host, remaining) = + self.parse_file_host(input_after_next_char)?; + let mut host_end = to_u32(self.serialization.len())?; + let mut has_host = !matches!(host, HostInternal::None); + let remaining = if path_start { + self.parse_path_start(SchemeType::File, &mut has_host, remaining) + } else { + let path_start = self.serialization.len(); + self.serialization.push('/'); + self.parse_path(SchemeType::File, &mut has_host, path_start, remaining) + }; + // For file URLs that have a host and whose path starts + // with the windows drive letter we just remove the host. + if !has_host { + self.serialization + .drain(host_start as usize..host_end as usize); + host_end = host_start; + host = HostInternal::None; + } + let (query_start, fragment_start) = + self.parse_query_and_fragment(scheme_type, scheme_end, remaining)?; + return Ok(Url { + serialization: self.serialization, + scheme_end: scheme_end, + username_end: host_start, + host_start: host_start, + host_end: host_end, + host: host, + port: None, + path_start: host_end, + query_start: query_start, + fragment_start: fragment_start, + }); + } else { + self.serialization.push_str("file:///"); + let scheme_end = "file".len() as u32; + let path_start = "file://".len(); if let Some(base_url) = base_file_url { + let first_segment = base_url.path_segments().unwrap().next().unwrap(); + // FIXME: *normalized* drive letter + if is_windows_drive_letter(first_segment) { + self.serialization.push_str(first_segment); + self.serialization.push('/'); + } + } + let remaining = self.parse_path( + SchemeType::File, + &mut false, + path_start, + input_after_first_char, + ); + let (query_start, fragment_start) = + self.parse_query_and_fragment(scheme_type, scheme_end, remaining)?; + let path_start = path_start as u32; + return Ok(Url { + serialization: self.serialization, + scheme_end: scheme_end, + username_end: path_start, + host_start: path_start, + host_end: 
path_start, + host: HostInternal::None, + port: None, + path_start: path_start, + query_start: query_start, + fragment_start: fragment_start, + }); + } + } + if let Some(base_url) = base_file_url { + match first_char { + None => { // Copy everything except the fragment let before_fragment = match base_url.fragment_start { Some(i) => &base_url.serialization[..i as usize], @@ -508,26 +586,8 @@ impl<'a> Parser<'a> { fragment_start: None, ..*base_url }) - } else { - self.serialization.push_str("file:///"); - let scheme_end = "file".len() as u32; - let path_start = "file://".len() as u32; - Ok(Url { - serialization: self.serialization, - scheme_end, - username_end: path_start, - host_start: path_start, - host_end: path_start, - host: HostInternal::None, - port: None, - path_start, - query_start: None, - fragment_start: None, - }) } - } - Some('?') => { - if let Some(base_url) = base_file_url { + Some('?') => { // Copy everything up to the query string let before_query = match (base_url.query_start, base_url.fragment_start) { (None, None) => &*base_url.serialization, @@ -542,179 +602,77 @@ impl<'a> Parser<'a> { fragment_start, ..*base_url }) - } else { - self.serialization.push_str("file:///"); - let scheme_end = "file".len() as u32; - let path_start = "file://".len() as u32; - let (query_start, fragment_start) = - self.parse_query_and_fragment(scheme_type, scheme_end, input)?; - Ok(Url { - serialization: self.serialization, - scheme_end, - username_end: path_start, - host_start: path_start, - host_end: path_start, - host: HostInternal::None, - port: None, - path_start, - query_start, - fragment_start, - }) } - } - Some('#') => { - if let Some(base_url) = base_file_url { - self.fragment_only(base_url, input) - } else { - self.serialization.push_str("file:///"); - let scheme_end = "file".len() as u32; - let path_start = "file://".len() as u32; - let fragment_start = "file:///".len() as u32; - self.serialization.push('#'); - self.parse_fragment(input_after_first_char); - 
Ok(Url { - serialization: self.serialization, - scheme_end, - username_end: path_start, - host_start: path_start, - host_end: path_start, - host: HostInternal::None, - port: None, - path_start, - query_start: None, - fragment_start: Some(fragment_start), - }) - } - } - Some('/') | Some('\\') => { - self.log_violation_if(Backslash, || first_char == Some('\\')); - // file slash state - let (next_char, input_after_next_char) = input_after_first_char.split_first(); - self.log_violation_if(Backslash, || next_char == Some('\\')); - if matches!(next_char, Some('/') | Some('\\')) { - // file host state - self.serialization.push_str("file://"); - let scheme_end = "file".len() as u32; - let host_start = "file://".len() as u32; - let (path_start, mut host, remaining) = - self.parse_file_host(input_after_next_char)?; - let mut host_end = to_u32(self.serialization.len())?; - let mut has_host = !matches!(host, HostInternal::None); - let remaining = if path_start { - self.parse_path_start(SchemeType::File, &mut has_host, remaining) + Some('#') => self.fragment_only(base_url, input), + _ => { + if !starts_with_windows_drive_letter_segment(&input) { + let before_query = match (base_url.query_start, base_url.fragment_start) { + (None, None) => &*base_url.serialization, + (Some(i), _) | (None, Some(i)) => base_url.slice(..i), + }; + self.serialization.push_str(before_query); + self.pop_path(SchemeType::File, base_url.path_start as usize); + let remaining = self.parse_path( + SchemeType::File, + &mut true, + base_url.path_start as usize, + input, + ); + self.with_query_and_fragment( + SchemeType::File, + base_url.scheme_end, + base_url.username_end, + base_url.host_start, + base_url.host_end, + base_url.host, + base_url.port, + base_url.path_start, + remaining, + ) } else { - let path_start = self.serialization.len(); - self.serialization.push('/'); - self.parse_path(SchemeType::File, &mut has_host, path_start, remaining) - }; - // For file URLs that have a host and whose path starts 
- // with the windows drive letter we just remove the host. - if !has_host { - self.serialization - .drain(host_start as usize..host_end as usize); - host_end = host_start; - host = HostInternal::None; + self.serialization.push_str("file:///"); + let scheme_end = "file".len() as u32; + let path_start = "file://".len(); + let remaining = + self.parse_path(SchemeType::File, &mut false, path_start, input); + let (query_start, fragment_start) = + self.parse_query_and_fragment(SchemeType::File, scheme_end, remaining)?; + let path_start = path_start as u32; + Ok(Url { + serialization: self.serialization, + scheme_end: scheme_end, + username_end: path_start, + host_start: path_start, + host_end: path_start, + host: HostInternal::None, + port: None, + path_start: path_start, + query_start: query_start, + fragment_start: fragment_start, + }) } - let (query_start, fragment_start) = - self.parse_query_and_fragment(scheme_type, scheme_end, remaining)?; - Ok(Url { - serialization: self.serialization, - scheme_end, - username_end: host_start, - host_start, - host_end, - host, - port: None, - path_start: host_end, - query_start, - fragment_start, - }) - } else { - self.serialization.push_str("file:///"); - let scheme_end = "file".len() as u32; - let path_start = "file://".len(); - if let Some(base_url) = base_file_url { - let first_segment = base_url.path_segments().unwrap().next().unwrap(); - // FIXME: *normalized* drive letter - if is_windows_drive_letter(first_segment) { - self.serialization.push_str(first_segment); - self.serialization.push('/'); - } - } - let remaining = self.parse_path( - SchemeType::File, - &mut false, - path_start, - input_after_first_char, - ); - let (query_start, fragment_start) = - self.parse_query_and_fragment(scheme_type, scheme_end, remaining)?; - let path_start = path_start as u32; - Ok(Url { - serialization: self.serialization, - scheme_end, - username_end: path_start, - host_start: path_start, - host_end: path_start, - host: HostInternal::None, - 
port: None, - path_start, - query_start, - fragment_start, - }) - } - } - _ => { - if starts_with_windows_drive_letter_segment(&input) { - base_file_url = None; - } - if let Some(base_url) = base_file_url { - let before_query = match (base_url.query_start, base_url.fragment_start) { - (None, None) => &*base_url.serialization, - (Some(i), _) | (None, Some(i)) => base_url.slice(..i), - }; - self.serialization.push_str(before_query); - self.pop_path(SchemeType::File, base_url.path_start as usize); - let remaining = self.parse_path( - SchemeType::File, - &mut true, - base_url.path_start as usize, - input, - ); - self.with_query_and_fragment( - SchemeType::File, - base_url.scheme_end, - base_url.username_end, - base_url.host_start, - base_url.host_end, - base_url.host, - base_url.port, - base_url.path_start, - remaining, - ) - } else { - self.serialization.push_str("file:///"); - let scheme_end = "file".len() as u32; - let path_start = "file://".len(); - let remaining = - self.parse_path(SchemeType::File, &mut false, path_start, input); - let (query_start, fragment_start) = - self.parse_query_and_fragment(SchemeType::File, scheme_end, remaining)?; - let path_start = path_start as u32; - Ok(Url { - serialization: self.serialization, - scheme_end, - username_end: path_start, - host_start: path_start, - host_end: path_start, - host: HostInternal::None, - port: None, - path_start, - query_start, - fragment_start, - }) } } + } else { + self.serialization.push_str("file:///"); + let scheme_end = "file".len() as u32; + let path_start = "file://".len(); + let remaining = self.parse_path(SchemeType::File, &mut false, path_start, input); + let (query_start, fragment_start) = + self.parse_query_and_fragment(SchemeType::File, scheme_end, remaining)?; + let path_start = path_start as u32; + Ok(Url { + serialization: self.serialization, + scheme_end: scheme_end, + username_end: path_start, + host_start: path_start, + host_end: path_start, + host: HostInternal::None, + port: None, + 
path_start: path_start, + query_start: query_start, + fragment_start: fragment_start, + }) } } From e93f999dc5f6f8de61ccf6fe79cc8d11774f08b8 Mon Sep 17 00:00:00 2001 From: Anthony Ramine Date: Sat, 20 Jul 2019 12:09:54 +0200 Subject: [PATCH 04/15] Fix a Windows quirk > test result: FAILED. 640 passed; 73 failed; 0 ignored; 0 measured --- src/parser.rs | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/parser.rs b/src/parser.rs index 537492459..5805afb41 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -541,12 +541,13 @@ impl<'a> Parser<'a> { self.serialization.push_str("file:///"); let scheme_end = "file".len() as u32; let path_start = "file://".len(); - if let Some(base_url) = base_file_url { - let first_segment = base_url.path_segments().unwrap().next().unwrap(); - // FIXME: *normalized* drive letter - if is_windows_drive_letter(first_segment) { - self.serialization.push_str(first_segment); - self.serialization.push('/'); + if !starts_with_windows_drive_letter_segment(&input_after_first_char) { + if let Some(base_url) = base_file_url { + let first_segment = base_url.path_segments().unwrap().next().unwrap(); + if is_normalized_windows_drive_letter(first_segment) { + self.serialization.push_str(first_segment); + self.serialization.push('/'); + } } } let remaining = self.parse_path( @@ -1361,6 +1362,10 @@ pub fn to_u32(i: usize) -> ParseResult { } } +fn is_normalized_windows_drive_letter(segment: &str) -> bool { + is_windows_drive_letter(segment) && segment.as_bytes()[1] == b':' +} + /// Wether the scheme is file:, the path has a single segment, and that segment /// is a Windows drive letter fn is_windows_drive_letter(segment: &str) -> bool { From efe9ab98888e0229315c56b684f0e3fa8d40ca0d Mon Sep 17 00:00:00 2001 From: Anthony Ramine Date: Sat, 20 Jul 2019 13:21:45 +0200 Subject: [PATCH 05/15] Properly copy hosts of base file:// URLs when needed > test result: FAILED. 
642 passed; 71 failed; 0 ignored; 0 measured --- src/parser.rs | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/src/parser.rs b/src/parser.rs index 5805afb41..8daac86df 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -538,36 +538,44 @@ impl<'a> Parser<'a> { fragment_start: fragment_start, }); } else { - self.serialization.push_str("file:///"); + self.serialization.push_str("file://"); let scheme_end = "file".len() as u32; - let path_start = "file://".len(); + let host_start = "file://".len(); + let mut host_end = host_start; + let mut host = HostInternal::None; if !starts_with_windows_drive_letter_segment(&input_after_first_char) { if let Some(base_url) = base_file_url { let first_segment = base_url.path_segments().unwrap().next().unwrap(); if is_normalized_windows_drive_letter(first_segment) { - self.serialization.push_str(first_segment); self.serialization.push('/'); + self.serialization.push_str(first_segment); + } else if let Some(host_str) = base_url.host_str() { + self.serialization.push_str(host_str); + host_end = self.serialization.len(); + host = base_url.host.clone(); } } } + self.serialization.push('/'); let remaining = self.parse_path( SchemeType::File, &mut false, - path_start, + host_end, input_after_first_char, ); let (query_start, fragment_start) = self.parse_query_and_fragment(scheme_type, scheme_end, remaining)?; - let path_start = path_start as u32; + let host_start = host_start as u32; + let host_end = host_end as u32; return Ok(Url { serialization: self.serialization, scheme_end: scheme_end, - username_end: path_start, - host_start: path_start, - host_end: path_start, - host: HostInternal::None, + username_end: host_start, + host_start, + host_end, + host, port: None, - path_start: path_start, + path_start: host_end, query_start: query_start, fragment_start: fragment_start, }); From 54a158b7a239a4cca5c1c142aaafe4474a18ed4a Mon Sep 17 00:00:00 2001 From: Jeremy Lempereur Date: Tue, 30 Jul 2019 
13:52:13 +0200 Subject: [PATCH 06/15] Path and file parsing. --- src/host.rs | 6 +- src/lib.rs | 41 +++++-- src/parser.rs | 256 +++++++++++++++++++++++++++++++++++-------- src/path_segments.rs | 15 ++- src/quirks.rs | 27 ++++- tests/unit.rs | 18 ++- 6 files changed, 300 insertions(+), 63 deletions(-) diff --git a/src/host.rs b/src/host.rs index 9afc6d8e7..238d523ed 100644 --- a/src/host.rs +++ b/src/host.rs @@ -24,9 +24,13 @@ pub(crate) enum HostInternal { Ipv6(Ipv6Addr), } -impl From> for HostInternal { +impl From> for HostInternal +where + S: ToString, +{ fn from(host: Host) -> HostInternal { match host { + Host::Domain(ref s) if s.to_string().is_empty() => HostInternal::None, Host::Domain(_) => HostInternal::Domain, Host::Ipv4(address) => HostInternal::Ipv4(address), Host::Ipv6(address) => HostInternal::Ipv6(address), diff --git a/src/lib.rs b/src/lib.rs index d60935c29..536cc1199 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -456,13 +456,15 @@ impl Url { if self.slice(self.scheme_end + 1..).starts_with("//") { // URL with authority - match self.byte_at(self.username_end) { - b':' => { - assert!(self.host_start >= self.username_end + 2); - assert_eq!(self.byte_at(self.host_start - 1), b'@'); + if self.username_end != self.serialization.len() as u32 { + match self.byte_at(self.username_end) { + b':' => { + assert!(self.host_start >= self.username_end + 2); + assert_eq!(self.byte_at(self.host_start - 1), b'@'); + } + b'@' => assert!(self.host_start == self.username_end + 1), + _ => assert_eq!(self.username_end, self.scheme_end + 3), } - b'@' => assert!(self.host_start == self.username_end + 1), - _ => assert_eq!(self.username_end, self.scheme_end + 3), } assert!(self.host_start >= self.username_end); assert!(self.host_end >= self.host_start); @@ -490,7 +492,10 @@ impl Url { Some(port_str.parse::().expect("Couldn't parse port?")) ); } - assert_eq!(self.byte_at(self.path_start), b'/'); + assert!( + self.path_start as usize == self.serialization.len() + || 
matches!(self.byte_at(self.path_start), b'/' | b'#' | b'?') + ); } else { // Anarchist URL (no authority) assert_eq!(self.username_end, self.scheme_end + 1); @@ -501,11 +506,11 @@ impl Url { assert_eq!(self.path_start, self.scheme_end + 1); } if let Some(start) = self.query_start { - assert!(start > self.path_start); + assert!(start >= self.path_start); assert_eq!(self.byte_at(start), b'?'); } if let Some(start) = self.fragment_start { - assert!(start > self.path_start); + assert!(start >= self.path_start); assert_eq!(self.byte_at(start), b'#'); } if let (Some(query_start), Some(fragment_start)) = (self.query_start, self.fragment_start) { @@ -745,7 +750,10 @@ impl Url { pub fn password(&self) -> Option<&str> { // This ':' is not the one marking a port number since a host can not be empty. // (Except for file: URLs, which do not have port numbers.) - if self.has_authority() && self.byte_at(self.username_end) == b':' { + if self.has_authority() + && self.username_end != self.serialization.len() as u32 + && self.byte_at(self.username_end) == b':' + { debug_assert!(self.byte_at(self.host_start - 1) == b'@'); Some(self.slice(self.username_end + 1..self.host_start - 1)) } else { @@ -1226,7 +1234,7 @@ impl Url { if let Some(input) = fragment { self.fragment_start = Some(to_u32(self.serialization.len()).unwrap()); self.serialization.push('#'); - self.mutate(|parser| parser.parse_fragment(parser::Input::new(input))) + self.mutate(|parser| parser.parse_fragment(parser::Input::no_trim(input))) } else { self.fragment_start = None } @@ -1284,7 +1292,12 @@ impl Url { let scheme_type = SchemeType::from(self.scheme()); let scheme_end = self.scheme_end; self.mutate(|parser| { - parser.parse_query(scheme_type, scheme_end, parser::Input::new(input)) + let vfn = parser.violation_fn; + parser.parse_query( + scheme_type, + scheme_end, + parser::Input::trim_tab_and_newlines(input, vfn), + ) }); } @@ -1390,8 +1403,12 @@ impl Url { } 
parser.parse_cannot_be_a_base_path(parser::Input::new(path)); } else { + let path_start = parser.serialization.len(); let mut has_host = true; // FIXME parser.parse_path_start(scheme_type, &mut has_host, parser::Input::new(path)); + if scheme_type.is_file() { + parser::trim_path(&mut parser.serialization, path_start); + } } }); self.restore_after_path(old_after_path_pos, &after_path); diff --git a/src/parser.rs b/src/parser.rs index 8daac86df..7cd1dbd8b 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -201,6 +201,30 @@ impl<'i> Input<'i> { Input::with_log(input, None) } + pub fn no_trim(input: &'i str) -> Self { + Input { + chars: input.chars(), + } + } + + pub fn trim_tab_and_newlines( + original_input: &'i str, + vfn: Option<&dyn Fn(SyntaxViolation)>, + ) -> Self { + let input = original_input.trim_matches(ascii_tab_or_new_line); + if let Some(vfn) = vfn { + if input.len() < original_input.len() { + vfn(SyntaxViolation::C0SpaceIgnored) + } + if input.chars().any(|c| matches!(c, '\t' | '\n' | '\r')) { + vfn(SyntaxViolation::TabOrNewlineIgnored) + } + } + Input { + chars: input.chars(), + } + } + pub fn with_log(original_input: &'i str, vfn: Option<&dyn Fn(SyntaxViolation)>) -> Self { let input = original_input.trim_matches(c0_control_or_space); if let Some(vfn) = vfn { @@ -515,6 +539,8 @@ impl<'a> Parser<'a> { self.serialization.push('/'); self.parse_path(SchemeType::File, &mut has_host, path_start, remaining) }; + + trim_path(&mut self.serialization, host_end as usize); // For file URLs that have a host and whose path starts // with the windows drive letter we just remove the host. 
if !has_host { @@ -556,16 +582,27 @@ impl<'a> Parser<'a> { } } } - self.serialization.push('/'); - let remaining = self.parse_path( - SchemeType::File, - &mut false, - host_end, - input_after_first_char, - ); + // If c is the EOF code point, U+002F (/), U+005C (\), U+003F (?), or U+0023 (#), then decrease pointer by one + let parse_path_input = if let Some(c) = first_char { + if c == '/' || c == '\\' || c == '?' || c == '#' { + input + } else { + input_after_first_char + } + } else { + input_after_first_char + }; + + let remaining = + self.parse_path(SchemeType::File, &mut false, host_end, parse_path_input); + + let host_start = host_start as u32; + + trim_path(&mut self.serialization, host_end); + let (query_start, fragment_start) = self.parse_query_and_fragment(scheme_type, scheme_end, remaining)?; - let host_start = host_start as u32; + let host_end = host_end as u32; return Ok(Url { serialization: self.serialization, @@ -620,7 +657,7 @@ impl<'a> Parser<'a> { (Some(i), _) | (None, Some(i)) => base_url.slice(..i), }; self.serialization.push_str(before_query); - self.pop_path(SchemeType::File, base_url.path_start as usize); + self.shorten_path(SchemeType::File, base_url.path_start as usize); let remaining = self.parse_path( SchemeType::File, &mut true, @@ -739,12 +776,14 @@ impl<'a> Parser<'a> { debug_assert!(base_url.byte_at(scheme_end) == b':'); self.serialization .push_str(base_url.slice(..scheme_end + 1)); + if let Some(after_prefix) = input.split_prefix("//") { + return self.after_double_slash(after_prefix, scheme_type, scheme_end); + } return self.after_double_slash(remaining, scheme_type, scheme_end); } let path_start = base_url.path_start; - debug_assert!(base_url.byte_at(path_start) == b'/'); - self.serialization - .push_str(base_url.slice(..path_start + 1)); + self.serialization.push_str(base_url.slice(..path_start)); + self.serialization.push_str("/"); let remaining = self.parse_path( scheme_type, &mut true, @@ -771,8 +810,24 @@ impl<'a> Parser<'a> { 
self.serialization.push_str(before_query); // FIXME spec says just "remove last entry", not the "pop" algorithm self.pop_path(scheme_type, base_url.path_start as usize); - let remaining = - self.parse_path(scheme_type, &mut true, base_url.path_start as usize, input); + // A special url always has a path. + // A path always starts with '/' + if self.serialization.len() == base_url.path_start as usize { + if SchemeType::from(base_url.scheme()).is_special() || !input.is_empty() { + self.serialization.push('/'); + } + } + let remaining = match input.split_first() { + (Some('/'), remaining) => self.parse_path( + scheme_type, + &mut true, + base_url.path_start as usize, + remaining, + ), + _ => { + self.parse_path(scheme_type, &mut true, base_url.path_start as usize, input) + } + }; self.with_query_and_fragment( scheme_type, base_url.scheme_end, @@ -946,7 +1001,7 @@ impl<'a> Parser<'a> { host_str = &input_str[..bytes] } } - if scheme_type.is_special() && host_str.is_empty() { + if scheme_type == SchemeType::SpecialNotFile && host_str.is_empty() { return Err(ParseError::EmptyHost); } if !scheme_type.is_special() { @@ -1040,21 +1095,34 @@ impl<'a> Parser<'a> { &mut self, scheme_type: SchemeType, has_host: &mut bool, - mut input: Input<'i>, + input: Input<'i>, ) -> Input<'i> { - // Path start state - match input.split_first() { - (Some('/'), remaining) => input = remaining, - (Some('\\'), remaining) => { - if scheme_type.is_special() { - self.log_violation(SyntaxViolation::Backslash); - input = remaining + let path_start = self.serialization.len(); + let (maybe_c, remaining) = input.split_first(); + // If url is special, then: + if scheme_type.is_special() { + if maybe_c == Some('\\') { + // If c is U+005C (\), validation error. + self.log_violation(SyntaxViolation::Backslash); + } + // A special URL always has a non-empty path. + if !self.serialization.ends_with("/") { + self.serialization.push('/'); + // We have already made sure the forward slash is present. 
+ if maybe_c == Some('/') || maybe_c == Some('\\') { + return self.parse_path(scheme_type, has_host, path_start, remaining); } } - _ => {} + return self.parse_path(scheme_type, has_host, path_start, input); + } else if maybe_c == Some('?') || maybe_c == Some('#') { + // Otherwise, if state override is not given and c is U+003F (?), + // set url’s query to the empty string and state to query state. + // Otherwise, if state override is not given and c is U+0023 (#), + // set url’s fragment to the empty string and state to fragment state. + // The query and path states will be handled by the caller. + return input; } - let path_start = self.serialization.len(); - self.serialization.push('/'); + // Otherwise, if c is not the EOF code point: self.parse_path(scheme_type, has_host, path_start, input) } @@ -1066,7 +1134,6 @@ impl<'a> Parser<'a> { mut input: Input<'i>, ) -> Input<'i> { // Relative path state - debug_assert!(self.serialization.ends_with('/')); loop { let segment_start = self.serialization.len(); let mut ends_with_slash = false; @@ -1079,6 +1146,7 @@ impl<'a> Parser<'a> { }; match c { '/' if self.context != Context::PathSegmentSetter => { + self.serialization.push(c); ends_with_slash = true; break; } @@ -1086,6 +1154,7 @@ impl<'a> Parser<'a> { && scheme_type.is_special() => { self.log_violation(SyntaxViolation::Backslash); + self.serialization.push('/'); ends_with_slash = true; break; } @@ -1109,35 +1178,57 @@ impl<'a> Parser<'a> { } } } - match &self.serialization[segment_start..] { + + let segment_before_slash = if ends_with_slash { + &self.serialization[segment_start..self.serialization.len() - 1] + } else { + &self.serialization[segment_start..self.serialization.len()] + }; + match segment_before_slash { + // If buffer is a double-dot path segment, shorten url’s path, ".." | "%2e%2e" | "%2e%2E" | "%2E%2e" | "%2E%2E" | "%2e." | "%2E." 
| ".%2e" | ".%2E" => { debug_assert!(self.serialization.as_bytes()[segment_start - 1] == b'/'); - self.serialization.truncate(segment_start - 1); // Truncate "/.." - self.pop_path(scheme_type, path_start); - if !self.serialization[path_start..].ends_with('/') { - self.serialization.push('/') + self.serialization.truncate(segment_start); + if self.serialization.ends_with("/") + && Parser::last_slash_can_be_removed(&self.serialization, path_start) + { + self.serialization.pop(); + } + self.shorten_path(scheme_type, path_start); + + // and then if neither c is U+002F (/), nor url is special and c is U+005C (\), append the empty string to url’s path. + if ends_with_slash && !self.serialization.ends_with("/") { + self.serialization.push('/'); } } + // Otherwise, if buffer is a single-dot path segment and if neither c is U+002F (/), + // nor url is special and c is U+005C (\), append the empty string to url’s path. "." | "%2e" | "%2E" => { self.serialization.truncate(segment_start); + if !self.serialization.ends_with("/") { + self.serialization.push('/'); + } } _ => { - if scheme_type.is_file() - && is_windows_drive_letter(&self.serialization[path_start + 1..]) - { - if self.serialization.ends_with('|') { - self.serialization.pop(); + // If url’s scheme is "file", url’s path is empty, and buffer is a Windows drive letter, then + if scheme_type.is_file() && is_windows_drive_letter(segment_before_slash) { + // Replace the second code point in buffer with U+003A (:). + if let Some(c) = segment_before_slash.chars().nth(0) { + self.serialization.truncate(segment_start); + self.serialization.push(c); self.serialization.push(':'); + if ends_with_slash { + self.serialization.push('/'); + } } + // If url’s host is neither the empty string nor null, + // validation error, set url’s host to the empty string. 
if *has_host { self.log_violation(SyntaxViolation::FileWithHostAndWindowsDrive); *has_host = false; // FIXME account for this in callers } } - if ends_with_slash { - self.serialization.push('/') - } } } if !ends_with_slash { @@ -1147,6 +1238,39 @@ impl<'a> Parser<'a> { input } + fn last_slash_can_be_removed(serialization: &String, path_start: usize) -> bool { + let url_before_segment = &serialization[..serialization.len() - 1]; + if let Some(segment_before_start) = url_before_segment.rfind("/") { + // Do not remove the root slash + segment_before_start >= path_start + // Or a windows drive letter slash + && !path_starts_with_windows_drive_letter(&serialization[segment_before_start..]) + } else { + false + } + } + + /// https://url.spec.whatwg.org/#shorten-a-urls-path + fn shorten_path(&mut self, scheme_type: SchemeType, path_start: usize) { + // If path is empty, then return. + if self.serialization.len() == path_start { + return; + } + // If url’s scheme is "file", path’s size is 1, and path[0] is a normalized Windows drive letter, then return. + let segments: Vec<&str> = self.serialization[path_start..] + .split('/') + .filter(|s| !s.is_empty()) + .collect(); + if scheme_type.is_file() + && segments.len() == 1 + && is_normalized_windows_drive_letter(segments[0]) + { + return; + } + // Remove path’s last item. + self.pop_path(scheme_type, path_start); + } + /// https://url.spec.whatwg.org/#pop-a-urls-path fn pop_path(&mut self, scheme_type: SchemeType, path_start: usize) { if self.serialization.len() > path_start { @@ -1154,9 +1278,8 @@ impl<'a> Parser<'a> { // + 1 since rfind returns the position before the slash. 
let segment_start = path_start + slash_position + 1; // Don’t pop a Windows drive letter - // FIXME: *normalized* Windows drive letter if !(scheme_type.is_file() - && is_windows_drive_letter(&self.serialization[segment_start..])) + && is_normalized_windows_drive_letter(&self.serialization[segment_start..])) { self.serialization.truncate(segment_start); } @@ -1318,6 +1441,18 @@ impl<'a> Parser<'a> { } } +// Trim path start forward slashes when no authority is present +// https://github.com/whatwg/url/issues/232 +pub fn trim_path(serialization: &mut String, path_start: usize) { + let path = serialization.split_off(path_start); + if path.starts_with("/") { + serialization.push('/'); + serialization.push_str(&path.trim_start_matches("/")); + } else { + serialization.push_str(&path); + } +} + #[inline] fn is_ascii_hex_digit(c: char) -> bool { matches!(c, 'a'..='f' | 'A'..='F' | '0'..='9') @@ -1355,6 +1490,12 @@ fn c0_control_or_space(ch: char) -> bool { ch <= ' ' // U+0000 to U+0020 } +/// https://infra.spec.whatwg.org/#ascii-tab-or-newline +#[inline] +pub fn ascii_tab_or_new_line(ch: char) -> bool { + matches!(ch, '\t' | '\r' | '\n') +} + /// https://url.spec.whatwg.org/#ascii-alpha #[inline] pub fn ascii_alpha(ch: char) -> bool { @@ -1380,12 +1521,37 @@ fn is_windows_drive_letter(segment: &str) -> bool { segment.len() == 2 && starts_with_windows_drive_letter(segment) } +/// Wether path starts with a root slash +/// and a windows drive letter eg: "/c:" or "/a:/" +fn path_starts_with_windows_drive_letter(s: &str) -> bool { + if let Some(c) = s.as_bytes().get(0) { + matches!(c, b'/' | b'\\' | b'?' 
| b'#') && starts_with_windows_drive_letter(&s[1..]) + } else { + false + } +} + fn starts_with_windows_drive_letter(s: &str) -> bool { - ascii_alpha(s.as_bytes()[0] as char) && matches!(s.as_bytes()[1], b':' | b'|') + s.len() >= 2 + && ascii_alpha(s.as_bytes()[0] as char) + && matches!(s.as_bytes()[1], b':' | b'|') + && (s.len() == 2 || matches!(s.as_bytes()[2], b'/' | b'\\' | b'?' | b'#')) } +/// https://url.spec.whatwg.org/#start-with-a-windows-drive-letter fn starts_with_windows_drive_letter_segment(input: &Input) -> bool { let mut input = input.clone(); - matches!((input.next(), input.next(), input.next()), (Some(a), Some(b), Some(c)) - if ascii_alpha(a) && matches!(b, ':' | '|') && matches!(c, '/' | '\\' | '?' | '#')) + match (input.next(), input.next(), input.next()) { + // its first two code points are a Windows drive letter + // its third code point is U+002F (/), U+005C (\), U+003F (?), or U+0023 (#). + (Some(a), Some(b), Some(c)) + if ascii_alpha(a) && matches!(b, ':' | '|') && matches!(c, '/' | '\\' | '?' 
| '#') => + { + true + } + // its first two code points are a Windows drive letter + // its length is 2 + (Some(a), Some(b), None) if ascii_alpha(a) && matches!(b, ':' | '|') => true, + _ => false, + } } diff --git a/src/path_segments.rs b/src/path_segments.rs index 97055e777..6f5679887 100644 --- a/src/path_segments.rs +++ b/src/path_segments.rs @@ -45,7 +45,15 @@ pub struct PathSegmentsMut<'a> { pub fn new(url: &mut Url) -> PathSegmentsMut { let after_path = url.take_after_path(); let old_after_path_position = to_u32(url.serialization.len()).unwrap(); - debug_assert!(url.byte_at(url.path_start) == b'/'); + // Special urls always have a non empty path + if SchemeType::from(url.scheme()).is_special() { + debug_assert!(url.byte_at(url.path_start) == b'/'); + } else { + debug_assert!( + url.serialization.len() == url.path_start as usize + || url.byte_at(url.path_start) == b'/' + ); + } PathSegmentsMut { after_first_slash: url.path_start as usize + "/".len(), url, @@ -212,7 +220,10 @@ impl<'a> PathSegmentsMut<'a> { if matches!(segment, "." | "..") { continue; } - if parser.serialization.len() > path_start + 1 { + if parser.serialization.len() > path_start + 1 + // Non special url's path might still be empty + || parser.serialization.len() == path_start + { parser.serialization.push('/'); } let mut has_host = true; // FIXME account for this? diff --git a/src/quirks.rs b/src/quirks.rs index 285ee21b6..ded278565 100644 --- a/src/quirks.rs +++ b/src/quirks.rs @@ -99,9 +99,13 @@ pub fn host(url: &Url) -> &str { /// Setter for https://url.spec.whatwg.org/#dom-url-host pub fn set_host(url: &mut Url, new_host: &str) -> Result<(), ()> { + // If context object’s url’s cannot-be-a-base-URL flag is set, then return. 
if url.cannot_be_a_base() { return Err(()); } + // Host parsing rules are strict, + // We don't want to trim the input + let input = Input::no_trim(new_host); let host; let opt_port; { @@ -121,6 +125,20 @@ pub fn set_host(url: &mut Url, new_host: &str) -> Result<(), ()> { Err(_) => return Err(()), } } + // Make sure we won't set an empty host to a url with a username or a port + if host == Host::Domain("".to_string()) { + if !username(&url).is_empty() { + return Err(()); + } + if let Some(p) = opt_port { + if let Some(_) = p { + return Err(()); + } + } + if url.port().is_some() { + return Err(()); + } + } url.set_host_internal(host, opt_port); Ok(()) } @@ -182,7 +200,14 @@ pub fn pathname(url: &Url) -> &str { /// Setter for https://url.spec.whatwg.org/#dom-url-pathname pub fn set_pathname(url: &mut Url, new_pathname: &str) { - if !url.cannot_be_a_base() { + if url.cannot_be_a_base() { + return; + } + if Some('/') == new_pathname.chars().nth(0) + || SchemeType::from(url.scheme()).is_special() + // \ is a segment delimiter for 'special' URLs" + && Some('\\') == new_pathname.chars().nth(0) + { url.set_path(new_pathname) } } diff --git a/tests/unit.rs b/tests/unit.rs index 9918ea316..82493828c 100644 --- a/tests/unit.rs +++ b/tests/unit.rs @@ -23,6 +23,20 @@ fn size() { assert_eq!(size_of::(), size_of::>()); } +#[test] +fn test_relative() { + let base: Url = "sc://%C3%B1".parse().unwrap(); + let url = base.join("/resources/testharness.js").unwrap(); + assert_eq!(url.as_str(), "sc://%C3%B1/resources/testharness.js"); +} + +#[test] +fn test_relative_empty() { + let base: Url = "sc://%C3%B1".parse().unwrap(); + let url = base.join("").unwrap(); + assert_eq!(url.as_str(), "sc://%C3%B1"); +} + macro_rules! 
assert_from_file_path { ($path: expr) => { assert_from_file_path!($path, $path) @@ -413,9 +427,9 @@ fn test_set_host() { assert_eq!(url.as_str(), "foobar:/hello"); let mut url = Url::parse("foo://ș").unwrap(); - assert_eq!(url.as_str(), "foo://%C8%99/"); + assert_eq!(url.as_str(), "foo://%C8%99"); url.set_host(Some("goșu.ro")).unwrap(); - assert_eq!(url.as_str(), "foo://go%C8%99u.ro/"); + assert_eq!(url.as_str(), "foo://go%C8%99u.ro"); } #[test] From 0586854c8b778c9d14bccd1a0213bc2263e9345a Mon Sep 17 00:00:00 2001 From: Jeremy Lempereur Date: Sat, 20 Jul 2019 23:31:39 +0200 Subject: [PATCH 07/15] Host parsing rules. --- src/host.rs | 9 ++---- src/lib.rs | 21 +++++++++++-- src/parser.rs | 85 ++++++++++++++++++++++++++++++++++++++++----------- src/quirks.rs | 50 ++++++++++++++++++++++-------- 4 files changed, 127 insertions(+), 38 deletions(-) diff --git a/src/host.rs b/src/host.rs index 238d523ed..02bae9e25 100644 --- a/src/host.rs +++ b/src/host.rs @@ -24,13 +24,10 @@ pub(crate) enum HostInternal { Ipv6(Ipv6Addr), } -impl From> for HostInternal -where - S: ToString, -{ - fn from(host: Host) -> HostInternal { +impl From> for HostInternal { + fn from(host: Host) -> HostInternal { match host { - Host::Domain(ref s) if s.to_string().is_empty() => HostInternal::None, + Host::Domain(ref s) if s.is_empty() => HostInternal::None, Host::Domain(_) => HostInternal::Domain, Host::Ipv4(address) => HostInternal::Ipv4(address), Host::Ipv6(address) => HostInternal::Ipv6(address), diff --git a/src/lib.rs b/src/lib.rs index 536cc1199..4969a6672 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -690,7 +690,7 @@ impl Url { /// ``` #[inline] pub fn cannot_be_a_base(&self) -> bool { - !self.slice(self.path_start..).starts_with('/') + !self.slice(self.scheme_end + 1..).starts_with('/') } /// Return the username for this URL (typically the empty string) @@ -1642,10 +1642,25 @@ impl Url { if host == "" && SchemeType::from(self.scheme()).is_special() { return Err(ParseError::EmptyHost); } + let 
mut host_substr = host; + // Otherwise, if c is U+003A (:) and the [] flag is unset, then + if !host.starts_with('[') || !host.ends_with(']') { + match host.find(':') { + Some(0) => { + // If buffer is the empty string, validation error, return failure. + return Err(ParseError::InvalidDomainCharacter); + } + // Let host be the result of host parsing buffer + Some(colon_index) => { + host_substr = &host[..colon_index]; + } + None => {} + } + } if SchemeType::from(self.scheme()).is_special() { - self.set_host_internal(Host::parse(host)?, None) + self.set_host_internal(Host::parse(host_substr)?, None); } else { - self.set_host_internal(Host::parse_opaque(host)?, None) + self.set_host_internal(Host::parse_opaque(host_substr)?, None); } } else if self.has_host() { if SchemeType::from(self.scheme()).is_special() { diff --git a/src/parser.rs b/src/parser.rs index 7cd1dbd8b..b90cd7c7c 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -156,7 +156,7 @@ impl fmt::Display for SyntaxViolation { } } -#[derive(Copy, Clone)] +#[derive(Copy, Clone, PartialEq)] pub enum SchemeType { File, SpecialNotFile, @@ -852,11 +852,16 @@ impl<'a> Parser<'a> { self.serialization.push('/'); self.serialization.push('/'); // authority state + let before_authority = self.serialization.len(); let (username_end, remaining) = self.parse_userinfo(input, scheme_type)?; + let has_authority = before_authority != self.serialization.len(); // host state let host_start = to_u32(self.serialization.len())?; let (host_end, host, port, remaining) = self.parse_host_and_port(remaining, scheme_end, scheme_type)?; + if host == HostInternal::None && has_authority { + return Err(ParseError::EmptyHost); + } // path state let path_start = to_u32(self.serialization.len())?; let remaining = self.parse_path_start(scheme_type, &mut true, remaining); @@ -900,7 +905,18 @@ impl<'a> Parser<'a> { } let (mut userinfo_char_count, remaining) = match last_at { None => return Ok((to_u32(self.serialization.len())?, input)), - Some((0, 
remaining)) => return Ok((to_u32(self.serialization.len())?, remaining)), + Some((0, remaining)) => { + // Otherwise, if one of the following is true + // c is the EOF code point, U+002F (/), U+003F (?), or U+0023 (#) + // url is special and c is U+005C (\) + // If @ flag is set and buffer is the empty string, validation error, return failure. + if let (Some(c), _) = remaining.split_first() { + if c == '/' || c == '?' || c == '#' || scheme_type.is_special() && c == '\\' { + return Err(ParseError::EmptyHost); + } + } + return Ok((to_u32(self.serialization.len())?, remaining)); + } Some(x) => x, }; @@ -946,6 +962,18 @@ impl<'a> Parser<'a> { let (host, remaining) = Parser::parse_host(input, scheme_type)?; write!(&mut self.serialization, "{}", host).unwrap(); let host_end = to_u32(self.serialization.len())?; + if let Host::Domain(h) = &host { + if h.is_empty() { + // Port with an empty host + if remaining.starts_with(":") { + return Err(ParseError::EmptyHost); + } + if scheme_type.is_special() { + return Err(ParseError::EmptyHost); + } + } + }; + let (port, remaining) = if let Some(remaining) = remaining.split_prefix(':') { let scheme = || default_port(&self.serialization[..scheme_end as usize]); Parser::parse_port(remaining, scheme, self.context)? @@ -962,6 +990,9 @@ impl<'a> Parser<'a> { mut input: Input, scheme_type: SchemeType, ) -> ParseResult<(Host, Input)> { + if scheme_type.is_file() { + return Parser::get_file_host(input); + } // Undo the Input abstraction here to avoid allocating in the common case // where the host part of the input does not contain any tab or newline let input_str = input.chars.as_str(); @@ -1012,10 +1043,41 @@ impl<'a> Parser<'a> { Ok((host, input)) } - pub(crate) fn parse_file_host<'i>( + fn get_file_host<'i>(input: Input<'i>) -> ParseResult<(Host, Input)> { + let (_, host_str, remaining) = Parser::file_host(input)?; + let host = match Host::parse(&host_str)? 
{ + Host::Domain(ref d) if d == "localhost" => Host::Domain("".to_string()), + host => host, + }; + Ok((host, remaining)) + } + + fn parse_file_host<'i>( &mut self, input: Input<'i>, ) -> ParseResult<(bool, HostInternal, Input<'i>)> { + let has_host; + let (_, host_str, remaining) = Parser::file_host(input)?; + let host = if host_str.is_empty() { + has_host = false; + HostInternal::None + } else { + match Host::parse(&host_str)? { + Host::Domain(ref d) if d == "localhost" => { + has_host = false; + HostInternal::None + } + host => { + write!(&mut self.serialization, "{}", host).unwrap(); + has_host = true; + host.into() + } + } + }; + Ok((has_host, host, remaining)) + } + + pub fn file_host<'i>(input: Input<'i>) -> ParseResult<(bool, String, Input<'i>)> { // Undo the Input abstraction here to avoid allocating in the common case // where the host part of the input does not contain any tab or newline let input_str = input.chars.as_str(); @@ -1044,20 +1106,9 @@ impl<'a> Parser<'a> { } } if is_windows_drive_letter(host_str) { - return Ok((false, HostInternal::None, input)); + return Ok((false, "".to_string(), input)); } - let host = if host_str.is_empty() { - HostInternal::None - } else { - match Host::parse(host_str)? { - Host::Domain(ref d) if d == "localhost" => HostInternal::None, - host => { - write!(&mut self.serialization, "{}", host).unwrap(); - host.into() - } - } - }; - Ok((true, host, remaining)) + Ok((true, host_str.to_string(), remaining)) } pub fn parse_port

( @@ -1492,7 +1543,7 @@ fn c0_control_or_space(ch: char) -> bool { /// https://infra.spec.whatwg.org/#ascii-tab-or-newline #[inline] -pub fn ascii_tab_or_new_line(ch: char) -> bool { +fn ascii_tab_or_new_line(ch: char) -> bool { matches!(ch, '\t' | '\r' | '\n') } diff --git a/src/quirks.rs b/src/quirks.rs index ded278565..b3ea3681f 100644 --- a/src/quirks.rs +++ b/src/quirks.rs @@ -12,6 +12,8 @@ //! you probably want to use `Url` method instead. use parser::{default_port, Context, Input, Parser, SchemeType}; +use std::cell::RefCell; +use SyntaxViolation; use {idna, Host, ParseError, Position, Url}; /// https://url.spec.whatwg.org/#dom-url-domaintoascii @@ -110,19 +112,22 @@ pub fn set_host(url: &mut Url, new_host: &str) -> Result<(), ()> { let opt_port; { let scheme = url.scheme(); - let result = Parser::parse_host(Input::new(new_host), SchemeType::from(scheme)); - match result { - Ok((h, remaining)) => { - host = h; - opt_port = if let Some(remaining) = remaining.split_prefix(':') { + let scheme_type = SchemeType::from(scheme); + if let Ok((h, remaining)) = Parser::parse_host(input, scheme_type) { + host = h; + opt_port = if let Some(remaining) = remaining.split_prefix(':') { + if remaining.is_empty() { + None + } else { Parser::parse_port(remaining, || default_port(scheme), Context::Setter) .ok() .map(|(port, _remaining)| port) - } else { - None - }; - } - Err(_) => return Err(()), + } + } else { + None + }; + } else { + return Err(()); } } // Make sure we won't set an empty host to a url with a username or a port @@ -154,8 +159,25 @@ pub fn set_hostname(url: &mut Url, new_hostname: &str) -> Result<(), ()> { if url.cannot_be_a_base() { return Err(()); } - let result = Parser::parse_host(Input::new(new_hostname), SchemeType::from(url.scheme())); - if let Ok((host, _remaining)) = result { + // Host parsing rules are strict, + // We don't want to trim the input + let input = Input::no_trim(new_hostname); + let scheme_type = SchemeType::from(url.scheme()); + if let 
Ok((host, _remaining)) = Parser::parse_host(input, scheme_type) { + if let Host::Domain(h) = &host { + if h.is_empty() { + // Empty host on special not file url + if SchemeType::from(url.scheme()) == SchemeType::SpecialNotFile + // Port with an empty host + ||!port(&url).is_empty() + // Empty host with includes credentials + || !url.username().is_empty() + || !url.password().unwrap_or(&"").is_empty() + { + return Err(()); + } + } + } url.set_host_internal(host, None); Ok(()) } else { @@ -209,6 +231,10 @@ pub fn set_pathname(url: &mut Url, new_pathname: &str) { && Some('\\') == new_pathname.chars().nth(0) { url.set_path(new_pathname) + } else { + let mut path_to_set = String::from("/"); + path_to_set.push_str(new_pathname); + url.set_path(&path_to_set) } } From 26ccc0d6ea46b9d244f6015177f29958fb0f84c3 Mon Sep 17 00:00:00 2001 From: Jeremy Lempereur Date: Sun, 21 Jul 2019 00:19:35 +0200 Subject: [PATCH 08/15] Hash getter and setter. --- src/quirks.rs | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/quirks.rs b/src/quirks.rs index b3ea3681f..c7d163533 100644 --- a/src/quirks.rs +++ b/src/quirks.rs @@ -259,13 +259,14 @@ pub fn hash(url: &Url) -> &str { /// Setter for https://url.spec.whatwg.org/#dom-url-hash pub fn set_hash(url: &mut Url, new_hash: &str) { - if url.scheme() != "javascript" { - url.set_fragment(match new_hash { - "" => None, - _ if new_hash.starts_with('#') => Some(&new_hash[1..]), - _ => Some(new_hash), - }) - } + url.set_fragment(match new_hash { + // If the given value is the empty string, + // then set context object’s url’s fragment to null and return. + "" => None, + // Let input be the given value with a single leading U+0023 (#) removed, if any. 
+ _ if new_hash.starts_with('#') => Some(&new_hash[1..]), + _ => Some(new_hash), + }) } fn trim(s: &str) -> &str { From 7efdc53193adfdfd65c1d39bc7ad4762dd4c272b Mon Sep 17 00:00:00 2001 From: Jeremy Lempereur Date: Sat, 20 Jul 2019 12:36:32 +0200 Subject: [PATCH 09/15] Fix scheme setter > test result: FAILED. 650 passed; 63 failed; 0 ignored; 0 measured --- src/lib.rs | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 72 insertions(+), 5 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 4969a6672..822e31091 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1663,8 +1663,13 @@ impl Url { self.set_host_internal(Host::parse_opaque(host_substr)?, None); } } else if self.has_host() { - if SchemeType::from(self.scheme()).is_special() { + let scheme_type = SchemeType::from(self.scheme()); + if scheme_type.is_special() { return Err(ParseError::EmptyHost); + } else { + if self.serialization.len() == self.path_start as usize { + self.serialization.push('/'); + } } debug_assert!(self.byte_at(self.scheme_end) == b':'); debug_assert!(self.byte_at(self.path_start) == b'/'); @@ -1967,14 +1972,28 @@ impl Url { /// /// # fn run() -> Result<(), ParseError> { /// let mut url = Url::parse("https://example.net")?; - /// let result = url.set_scheme("foo"); - /// assert_eq!(url.as_str(), "foo://example.net/"); + /// let result = url.set_scheme("http"); + /// assert_eq!(url.as_str(), "http://example.net/"); /// assert!(result.is_ok()); /// # Ok(()) /// # } /// # run().unwrap(); /// ``` + /// Change the URL’s scheme from `foo` to `bar`: /// + /// ``` + /// use url::Url; + /// # use url::ParseError; + /// + /// # fn run() -> Result<(), ParseError> { + /// let mut url = Url::parse("foo://example.net")?; + /// let result = url.set_scheme("bar"); + /// assert_eq!(url.as_str(), "bar://example.net"); + /// assert!(result.is_ok()); + /// # Ok(()) + /// # } + /// # run().unwrap(); + /// ``` /// /// Cannot change URL’s scheme from `https` to `foõ`: /// @@ -2007,14 +2026,55 
@@ impl Url { /// # } /// # run().unwrap(); /// ``` + /// Cannot change the URL’s scheme from `foo` to `https`: + /// + /// ``` + /// use url::Url; + /// # use url::ParseError; + /// + /// # fn run() -> Result<(), ParseError> { + /// let mut url = Url::parse("foo://example.net")?; + /// let result = url.set_scheme("https"); + /// assert_eq!(url.as_str(), "foo://example.net"); + /// assert!(result.is_err()); + /// # Ok(()) + /// # } + /// # run().unwrap(); + /// ``` + /// Cannot change the URL’s scheme from `http` to `foo`: + /// + /// ``` + /// use url::Url; + /// # use url::ParseError; + /// + /// # fn run() -> Result<(), ParseError> { + /// let mut url = Url::parse("http://example.net")?; + /// let result = url.set_scheme("foo"); + /// assert_eq!(url.as_str(), "http://example.net/"); + /// assert!(result.is_err()); + /// # Ok(()) + /// # } + /// # run().unwrap(); + /// ``` pub fn set_scheme(&mut self, scheme: &str) -> Result<(), ()> { let mut parser = Parser::for_setter(String::new()); let remaining = parser.parse_scheme(parser::Input::new(scheme))?; - if !remaining.is_empty() - || (!self.has_host() && SchemeType::from(&parser.serialization).is_special()) + let new_scheme_type = SchemeType::from(&parser.serialization); + let old_scheme_type = SchemeType::from(self.scheme()); + // If url’s scheme is a special scheme and buffer is not a special scheme, then return. + if new_scheme_type.is_special() && !old_scheme_type.is_special() || + // If url’s scheme is not a special scheme and buffer is a special scheme, then return. + !new_scheme_type.is_special() && old_scheme_type.is_special() || + // If url includes credentials or has a non-null port, and buffer is "file", then return. + // If url’s scheme is "file" and its host is an empty host or null, then return. 
+ new_scheme_type.is_file() && self.has_authority() { return Err(()); } + + if !remaining.is_empty() || (!self.has_host() && new_scheme_type.is_special()) { + return Err(()); + } let old_scheme_end = self.scheme_end; let new_scheme_end = to_u32(parser.serialization.len()).unwrap(); let adjust = |index: &mut u32| { @@ -2036,6 +2096,13 @@ impl Url { parser.serialization.push_str(self.slice(old_scheme_end..)); self.serialization = parser.serialization; + + // Update the port so it can be removed + // If it is the scheme's default + // We don't mind it silently failing + // If there was no port in the first place + let _ = self.set_port(self.port()); + Ok(()) } From 736d7bc7c305e2ea9f4b152d67b852d346a64ca8 Mon Sep 17 00:00:00 2001 From: o0Ignition0o Date: Sun, 4 Aug 2019 00:27:20 +0200 Subject: [PATCH 10/15] removing unused imports. --- src/quirks.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/quirks.rs b/src/quirks.rs index c7d163533..5104317d6 100644 --- a/src/quirks.rs +++ b/src/quirks.rs @@ -12,8 +12,6 @@ //! you probably want to use `Url` method instead. use parser::{default_port, Context, Input, Parser, SchemeType}; -use std::cell::RefCell; -use SyntaxViolation; use {idna, Host, ParseError, Position, Url}; /// https://url.spec.whatwg.org/#dom-url-domaintoascii From a9ca033439001d05154dc4afd053570f4bfc0928 Mon Sep 17 00:00:00 2001 From: o0Ignition0o Date: Sun, 4 Aug 2019 01:02:45 +0200 Subject: [PATCH 11/15] Pleasing the 1.33.0 borrow checker. 
--- src/lib.rs | 3 ++- src/parser.rs | 29 ++++++++++++++++------------- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 822e31091..95a369599 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2101,7 +2101,8 @@ impl Url { // If it is the scheme's default // We don't mind it silently failing // If there was no port in the first place - let _ = self.set_port(self.port()); + let previous_port = self.port(); + let _ = self.set_port(previous_port); Ok(()) } diff --git a/src/parser.rs b/src/parser.rs index b90cd7c7c..2749e5dbc 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -1229,12 +1229,13 @@ impl<'a> Parser<'a> { } } } - - let segment_before_slash = if ends_with_slash { - &self.serialization[segment_start..self.serialization.len() - 1] + // Going from &str to String to &str to please the 1.33.0 borrow checker + let before_slash_string = if ends_with_slash { + self.serialization[segment_start..self.serialization.len() - 1].to_owned() } else { - &self.serialization[segment_start..self.serialization.len()] + self.serialization[segment_start..self.serialization.len()].to_owned() }; + let segment_before_slash: &str = &before_slash_string; match segment_before_slash { // If buffer is a double-dot path segment, shorten url’s path, ".." | "%2e%2e" | "%2e%2E" | "%2E%2e" | "%2E%2E" | "%2e." | "%2E." | ".%2e" @@ -1307,16 +1308,18 @@ impl<'a> Parser<'a> { if self.serialization.len() == path_start { return; } - // If url’s scheme is "file", path’s size is 1, and path[0] is a normalized Windows drive letter, then return. - let segments: Vec<&str> = self.serialization[path_start..] - .split('/') - .filter(|s| !s.is_empty()) - .collect(); - if scheme_type.is_file() - && segments.len() == 1 - && is_normalized_windows_drive_letter(segments[0]) { - return; + // If url’s scheme is "file", path’s size is 1, and path[0] is a normalized Windows drive letter, then return. + let segments: Vec<&str> = self.serialization[path_start..] 
+ .split('/') + .filter(|s| !s.is_empty()) + .collect(); + if scheme_type.is_file() + && segments.len() == 1 + && is_normalized_windows_drive_letter(segments[0]) + { + return; + } } // Remove path’s last item. self.pop_path(scheme_type, path_start); From 8ef48471a8f82658fbb2eddad4a785ba54122d2e Mon Sep 17 00:00:00 2001 From: o0Ignition0o Date: Mon, 5 Aug 2019 13:23:45 +0200 Subject: [PATCH 12/15] Make sure a windows drive letter segment always ends with a slash. --- src/lib.rs | 18 ++++++++++++++++++ src/parser.rs | 3 ++- tests/unit.rs | 26 ++++++++++++++++++++++++++ 3 files changed, 46 insertions(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 95a369599..05d4e56d9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2508,6 +2508,7 @@ fn path_to_file_url_segments_windows( } let mut components = path.components(); + let host_start = serialization.len() + 1; let host_end; let host_internal; match components.next() { @@ -2534,15 +2535,24 @@ fn path_to_file_url_segments_windows( _ => return Err(()), } + let mut path_only_has_prefix = true; for component in components { if component == Component::RootDir { continue; } + path_only_has_prefix = false; // FIXME: somehow work with non-unicode? let component = component.as_os_str().to_str().ok_or(())?; serialization.push('/'); serialization.extend(percent_encode(component.as_bytes(), PATH_SEGMENT)); } + // A windows drive letter must end with a slash. + if serialization.len() > host_start + && parser::is_windows_drive_letter(&serialization[host_start..]) + && path_only_has_prefix + { + serialization.push('/'); + } Ok((host_end, host_internal)) } @@ -2567,6 +2577,14 @@ fn file_url_segments_to_pathbuf( bytes.push(b'/'); bytes.extend(percent_decode(segment.as_bytes())); } + // A windows drive letter must end with a slash. 
+ if bytes.len() > 2 { + if matches!(bytes[bytes.len() -2], b'a'..=b'z' | b'A'..=b'Z') + && matches!(bytes[bytes.len() - 1], b':' | b'|') + { + bytes.push(b'/'); + } + } let os_str = OsStr::from_bytes(&bytes); let path = PathBuf::from(os_str); debug_assert!( diff --git a/src/parser.rs b/src/parser.rs index 2749e5dbc..179caba67 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -1571,7 +1571,8 @@ fn is_normalized_windows_drive_letter(segment: &str) -> bool { /// Wether the scheme is file:, the path has a single segment, and that segment /// is a Windows drive letter -fn is_windows_drive_letter(segment: &str) -> bool { +#[inline] +pub fn is_windows_drive_letter(segment: &str) -> bool { segment.len() == 2 && starts_with_windows_drive_letter(segment) } diff --git a/tests/unit.rs b/tests/unit.rs index 82493828c..41d0b8268 100644 --- a/tests/unit.rs +++ b/tests/unit.rs @@ -564,3 +564,29 @@ fn test_options_reuse() { assert_eq!(url.as_str(), "http://mozilla.org/sub/path"); assert_eq!(*violations.borrow(), vec!(ExpectedDoubleSlash, Backslash)); } + +/// https://github.com/servo/rust-url/issues/505 +#[cfg(windows)] +#[test] +fn test_url_from_file_path() { + use std::path::PathBuf; + use url::Url; + + let p = PathBuf::from("c:///"); + let u = Url::from_file_path(p).unwrap(); + let path = u.to_file_path().unwrap(); + assert_eq!("C:\\", path.to_str().unwrap()); +} + +/// https://github.com/servo/rust-url/issues/505 +#[cfg(not(windows))] +#[test] +fn test_url_from_file_path() { + use std::path::PathBuf; + use url::Url; + + let p = PathBuf::from("/c:/"); + let u = Url::from_file_path(p).unwrap(); + let path = u.to_file_path().unwrap(); + assert_eq!("/c:/", path.to_str().unwrap()); +} From aeef54febed316165625321a6aaf54eabbd906d8 Mon Sep 17 00:00:00 2001 From: Jeremy Lempereur Date: Sun, 10 Nov 2019 16:10:55 +0100 Subject: [PATCH 13/15] trim file paths if needed. 
--- src/lib.rs | 4 ---- src/parser.rs | 24 +++++++++--------------- 2 files changed, 9 insertions(+), 19 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 05d4e56d9..2d432cc3b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1403,12 +1403,8 @@ impl Url { } parser.parse_cannot_be_a_base_path(parser::Input::new(path)); } else { - let path_start = parser.serialization.len(); let mut has_host = true; // FIXME parser.parse_path_start(scheme_type, &mut has_host, parser::Input::new(path)); - if scheme_type.is_file() { - parser::trim_path(&mut parser.serialization, path_start); - } } }); self.restore_after_path(old_after_path_pos, &after_path); diff --git a/src/parser.rs b/src/parser.rs index 179caba67..54369adf8 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -540,7 +540,6 @@ impl<'a> Parser<'a> { self.parse_path(SchemeType::File, &mut has_host, path_start, remaining) }; - trim_path(&mut self.serialization, host_end as usize); // For file URLs that have a host and whose path starts // with the windows drive letter we just remove the host. if !has_host { @@ -598,8 +597,6 @@ impl<'a> Parser<'a> { let host_start = host_start as u32; - trim_path(&mut self.serialization, host_end); - let (query_start, fragment_start) = self.parse_query_and_fragment(scheme_type, scheme_end, remaining)?; @@ -1287,6 +1284,15 @@ impl<'a> Parser<'a> { break; } } + if scheme_type.is_file() { + // while url’s path’s size is greater than 1 + // and url’s path[0] is the empty string, + // validation error, remove the first item from url’s path. 
+ //FIXME: log violation + let path = self.serialization.split_off(path_start); + self.serialization.push('/'); + self.serialization.push_str(&path.trim_start_matches("/")); + } input } @@ -1495,18 +1501,6 @@ impl<'a> Parser<'a> { } } -// Trim path start forward slashes when no authority is present -// https://github.com/whatwg/url/issues/232 -pub fn trim_path(serialization: &mut String, path_start: usize) { - let path = serialization.split_off(path_start); - if path.starts_with("/") { - serialization.push('/'); - serialization.push_str(&path.trim_start_matches("/")); - } else { - serialization.push_str(&path); - } -} - #[inline] fn is_ascii_hex_digit(c: char) -> bool { matches!(c, 'a'..='f' | 'A'..='F' | '0'..='9') From 925ec94a6d0e2c3e66289f9922f90726d6b21e7f Mon Sep 17 00:00:00 2001 From: Jeremy Lempereur Date: Sun, 10 Nov 2019 16:41:37 +0100 Subject: [PATCH 14/15] Avoid allocation when checking for windows drive letters. --- src/parser.rs | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/src/parser.rs b/src/parser.rs index 54369adf8..00458365a 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -1314,18 +1314,11 @@ impl<'a> Parser<'a> { if self.serialization.len() == path_start { return; } + // If url’s scheme is "file", path’s size is 1, and path[0] is a normalized Windows drive letter, then return. + if scheme_type.is_file() + && is_normalized_windows_drive_letter(&self.serialization[path_start..]) { - // If url’s scheme is "file", path’s size is 1, and path[0] is a normalized Windows drive letter, then return. - let segments: Vec<&str> = self.serialization[path_start..] - .split('/') - .filter(|s| !s.is_empty()) - .collect(); - if scheme_type.is_file() - && segments.len() == 1 - && is_normalized_windows_drive_letter(segments[0]) - { - return; - } + return; } // Remove path’s last item. 
self.pop_path(scheme_type, path_start); From 446484009e49caca8147ba32cd7125c260d0246a Mon Sep 17 00:00:00 2001 From: o0Ignition0o Date: Sat, 7 Dec 2019 11:15:18 +0100 Subject: [PATCH 15/15] Comments and nits fixups. --- src/lib.rs | 12 ++++++------ src/parser.rs | 2 +- src/quirks.rs | 11 +++++------ tests/unit.rs | 29 +++++++++++++++++++++++++++++ 4 files changed, 41 insertions(+), 13 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 2d432cc3b..2ad421d08 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2058,12 +2058,12 @@ impl Url { let new_scheme_type = SchemeType::from(&parser.serialization); let old_scheme_type = SchemeType::from(self.scheme()); // If url’s scheme is a special scheme and buffer is not a special scheme, then return. - if new_scheme_type.is_special() && !old_scheme_type.is_special() || + if (new_scheme_type.is_special() && !old_scheme_type.is_special()) || // If url’s scheme is not a special scheme and buffer is a special scheme, then return. - !new_scheme_type.is_special() && old_scheme_type.is_special() || + (!new_scheme_type.is_special() && old_scheme_type.is_special()) || // If url includes credentials or has a non-null port, and buffer is "file", then return. // If url’s scheme is "file" and its host is an empty host or null, then return. - new_scheme_type.is_file() && self.has_authority() + (new_scheme_type.is_file() && self.has_authority()) { return Err(()); } @@ -2095,8 +2095,8 @@ impl Url { // Update the port so it can be removed // If it is the scheme's default - // We don't mind it silently failing - // If there was no port in the first place + // we don't mind it silently failing + // if there was no port in the first place let previous_port = self.port(); let _ = self.set_port(previous_port); @@ -2575,7 +2575,7 @@ fn file_url_segments_to_pathbuf( } // A windows drive letter must end with a slash. 
if bytes.len() > 2 { - if matches!(bytes[bytes.len() -2], b'a'..=b'z' | b'A'..=b'Z') + if matches!(bytes[bytes.len() - 2], b'a'..=b'z' | b'A'..=b'Z') && matches!(bytes[bytes.len() - 1], b':' | b'|') { bytes.push(b'/'); diff --git a/src/parser.rs b/src/parser.rs index 00458365a..6c84ba412 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -908,7 +908,7 @@ impl<'a> Parser<'a> { // url is special and c is U+005C (\) // If @ flag is set and buffer is the empty string, validation error, return failure. if let (Some(c), _) = remaining.split_first() { - if c == '/' || c == '?' || c == '#' || scheme_type.is_special() && c == '\\' { + if c == '/' || c == '?' || c == '#' || (scheme_type.is_special() && c == '\\') { return Err(ParseError::EmptyHost); } } diff --git a/src/quirks.rs b/src/quirks.rs index 5104317d6..caab354cc 100644 --- a/src/quirks.rs +++ b/src/quirks.rs @@ -157,8 +157,7 @@ pub fn set_hostname(url: &mut Url, new_hostname: &str) -> Result<(), ()> { if url.cannot_be_a_base() { return Err(()); } - // Host parsing rules are strict, - // We don't want to trim the input + // Host parsing rules are strict we don't want to trim the input let input = Input::no_trim(new_hostname); let scheme_type = SchemeType::from(url.scheme()); if let Ok((host, _remaining)) = Parser::parse_host(input, scheme_type) { @@ -168,7 +167,7 @@ pub fn set_hostname(url: &mut Url, new_hostname: &str) -> Result<(), ()> { if SchemeType::from(url.scheme()) == SchemeType::SpecialNotFile // Port with an empty host ||!port(&url).is_empty() - // Empty host with includes credentials + // Empty host that includes credentials || !url.username().is_empty() || !url.password().unwrap_or(&"").is_empty() { @@ -224,9 +223,9 @@ pub fn set_pathname(url: &mut Url, new_pathname: &str) { return; } if Some('/') == new_pathname.chars().nth(0) - || SchemeType::from(url.scheme()).is_special() - // \ is a segment delimiter for 'special' URLs" - && Some('\\') == new_pathname.chars().nth(0) + || 
(SchemeType::from(url.scheme()).is_special() + // \ is a segment delimiter for 'special' URLs" + && Some('\\') == new_pathname.chars().nth(0)) { url.set_path(new_pathname) } else { diff --git a/tests/unit.rs b/tests/unit.rs index 41d0b8268..9cc7c53fe 100644 --- a/tests/unit.rs +++ b/tests/unit.rs @@ -37,6 +37,35 @@ fn test_relative_empty() { assert_eq!(url.as_str(), "sc://%C3%B1"); } +#[test] +fn test_set_empty_host() { + let mut base: Url = "moz://foo:bar@servo/baz".parse().unwrap(); + base.set_username("").unwrap(); + assert_eq!(base.as_str(), "moz://:bar@servo/baz"); + base.set_host(None).unwrap(); + assert_eq!(base.as_str(), "moz:/baz"); + base.set_host(Some("servo")).unwrap(); + assert_eq!(base.as_str(), "moz://servo/baz"); +} + +#[test] +fn test_set_empty_hostname() { + use url::quirks; + let mut base: Url = "moz://foo@servo/baz".parse().unwrap(); + assert!( + quirks::set_hostname(&mut base, "").is_err(), + "setting an empty hostname to a url with a username should fail" + ); + base = "moz://:pass@servo/baz".parse().unwrap(); + assert!( + quirks::set_hostname(&mut base, "").is_err(), + "setting an empty hostname to a url with a password should fail" + ); + base = "moz://servo/baz".parse().unwrap(); + quirks::set_hostname(&mut base, "").unwrap(); + assert_eq!(base.as_str(), "moz:///baz"); +} + macro_rules! assert_from_file_path { ($path: expr) => { assert_from_file_path!($path, $path)