'
+ )
assert clean(html, styles=styles) == expected
diff --git a/tests/test_links.py b/tests/test_links.py
index ac38ee70..6b7a77eb 100644
--- a/tests/test_links.py
+++ b/tests/test_links.py
@@ -3,7 +3,7 @@
except ImportError:
from urllib import quote_plus
-from html5lib.tokenizer import HTMLTokenizer
+# FIXME(willkg): from html5lib.tokenizer import HTMLTokenizer
import pytest
from bleach import linkify, url_re, DEFAULT_CALLBACKS as DC
@@ -406,6 +406,7 @@ def test_end_of_clause():
)
+@pytest.mark.xfail(reason='html5lib >= 0.99999999: changed API')
def test_sarcasm():
"""Jokes should crash."""
assert linkify('Yeah right <sarcasm/>') == 'Yeah right &lt;sarcasm/&gt;'
@@ -498,6 +499,7 @@ def test_ports(data, expected_data):
assert linkify(data) == out.format(*expected_data)
+@pytest.mark.xfail(reason='html5lib >= 0.99999999: no access to tokenizer')
def test_tokenizer():
"""Linkify doesn't always have to sanitize."""
raw = '<em>test<x></x></em>'
diff --git a/tests/test_security.py b/tests/test_security.py
index 6ffaf449..4fb30207 100644
--- a/tests/test_security.py
+++ b/tests/test_security.py
@@ -74,7 +74,9 @@ def test_invalid_href_attr():
def test_invalid_filter_attr():
IMG = ['img', ]
- IMG_ATTR = {'img': lambda n, v: n == 'src' and v == "http://example.com/"}
+ IMG_ATTR = {
+ 'img': lambda n, v: n == 'src' and v == "http://example.com/"
+ }
assert (
clean('', tags=IMG, attributes=IMG_ATTR) ==
@@ -145,7 +147,11 @@ def test_feed_protocol():
def get_tests():
- """Retrieves regression tests from data/ directory"""
+ """Retrieves regression tests from data/ directory
+
+ :returns: list of ``(filename, filedata)`` tuples
+
+ """
datadir = os.path.join(os.path.dirname(__file__), 'data')
tests = [
os.path.join(datadir, fn) for fn in os.listdir(datadir)
@@ -153,19 +159,23 @@ def get_tests():
]
# Sort numerically which makes it easier to iterate through them
tests.sort(key=lambda x: int(os.path.basename(x).split('.', 1)[0]))
- return tests
+
+ testcases = [
+ (fn, open(fn, 'r').read()) for fn in tests
+ ]
+
+ return testcases
-@pytest.mark.parametrize('fn', get_tests())
-def test_regressions(fn):
+@pytest.mark.parametrize('fn, text', get_tests())
+def test_regressions(fn, text):
"""Regression tests for clean so we can see if there are issues"""
- s = open(fn, 'r').read()
expected = six.text_type(open(fn + '.out', 'r').read())
# NOTE(willkg): This strips input and expected which makes it easier to
# maintain the files. If there comes a time when the input needs whitespace
# at the beginning or end, then we'll have to figure out something else.
- assert clean(s.strip()) == expected.strip()
+ assert clean(text.strip()) == expected.strip()
def test_regression_manually():
diff --git a/tox.ini b/tox.ini
index 09ed488f..53c175c9 100644
--- a/tox.ini
+++ b/tox.ini
@@ -4,7 +4,7 @@
# and then run "tox" from this directory.
[tox]
-envlist = py{27,33,34,35,36}-html5lib{999,999999,9999999},pypy-html5lib9999999
+envlist = py{27,33,34,35,36}-html5lib{99999999,999999999},pypy-html5lib99999999
[testenv]
basepython =
@@ -15,8 +15,7 @@ basepython =
py36: python3.6
deps =
-rrequirements.txt
- html5lib999: html5lib==0.999
- html5lib999999: html5lib==0.999999
- html5lib9999999: html5lib==0.9999999
+ html5lib99999999: html5lib==0.99999999
+ html5lib999999999: html5lib==0.999999999
commands =
py.test {posargs:-v}
From 3db588a5ed8b43da8324547b23ad5670b459ee9e Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Tue, 21 Feb 2017 11:45:01 -0500
Subject: [PATCH 03/12] Update security regression tests
I also moved test_nasty to the regression tests because it's just like those.
---
tests/data/14.test.out | 2 +-
tests/data/15.test.out | 2 +-
tests/data/16.test.out | 2 +-
tests/data/17.test.out | 2 +-
tests/data/3.test.out | 2 +-
tests/data/4.test | 1 +
tests/data/4.test.out | 1 +
tests/data/5.test.out | 2 +-
tests/data/9.test.out | 2 +-
tests/test_security.py | 35 +++++++++++++++++------------------
10 files changed, 26 insertions(+), 25 deletions(-)
create mode 100644 tests/data/4.test
create mode 100644 tests/data/4.test.out
diff --git a/tests/data/14.test.out b/tests/data/14.test.out
index 16445739..8e5ff754 100644
--- a/tests/data/14.test.out
+++ b/tests/data/14.test.out
@@ -1 +1 @@
-<imgsrc=java&<wbr>#115;crip&<wbr>#116;:a
\ No newline at end of file
+<imgsrc=java&<wbr>#115;crip&<wbr></wbr>#116;:a</imgsrc=java&<wbr>
\ No newline at end of file
diff --git a/tests/data/15.test.out b/tests/data/15.test.out
index 334f916b..8b90245f 100644
--- a/tests/data/15.test.out
+++ b/tests/data/15.test.out
@@ -1 +1 @@
-le&amp;&lt;wbr&gt;#114;t('XS&lt;wbr&gt;;S')&gt;
\ No newline at end of file
+le&amp;&lt;wbr&gt;&lt;/wbr&gt;#114;t('XS&lt;wbr&gt;&lt;/wbr&gt;;S')&gt;
\ No newline at end of file
diff --git a/tests/data/16.test.out b/tests/data/16.test.out
index 9c6ca965..1ecb332b 100644
--- a/tests/data/16.test.out
+++ b/tests/data/16.test.out
@@ -1 +1 @@
-<imgsrc=ja&<wbr>#0000118as&<wbr>#0000099ri&<wbr>#0000112t:&<wbr>#0000097le&<wbr>#0000114t(&<wbr>#0000039XS&<wbr>#0000083')>
\ No newline at end of file
+<imgsrc=ja&<wbr>#0000118as&<wbr></wbr>#0000099ri&<wbr></wbr>#0000112t:&<wbr></wbr>#0000097le&<wbr></wbr>#0000114t(&<wbr></wbr>#0000039XS&<wbr></wbr>#0000083')></imgsrc=ja&<wbr>
\ No newline at end of file
diff --git a/tests/data/17.test.out b/tests/data/17.test.out
index dabfaa2d..ae928a99 100644
--- a/tests/data/17.test.out
+++ b/tests/data/17.test.out
@@ -1 +1 @@
-<imgsrc=javas&<wbr>#x63ript:&<wbr>#x61lert(&<wbr>#x27XSS')>
\ No newline at end of file
+<imgsrc=javas&<wbr>#x63ript:&<wbr></wbr>#x61lert(&<wbr></wbr>#x27XSS')></imgsrc=javas&<wbr>
\ No newline at end of file
diff --git a/tests/data/3.test.out b/tests/data/3.test.out
index 20c3d0d4..f0d69629 100644
--- a/tests/data/3.test.out
+++ b/tests/data/3.test.out
@@ -1 +1 @@
->"'><img%20src%3d%26%23x6a;%26%23x61;%26%23x76;%26%23x61;%26%23x73;%26%23x63;%26%23x72;%26%23x69;%26%23x70;%26%23x74;%26%23x3a;alert(%26quot;%26%23x20;xss%26%23x20;test%26%23x20;successful%26quot;)>
\ No newline at end of file
+>"'><img%20src%3d%26%23x6a;%26%23x61;%26%23x76;%26%23x61;%26%23x73;%26%23x63;%26%23x72;%26%23x69;%26%23x70;%26%23x74;%26%23x3a;alert(%26quot;%26%23x20;xss%26%23x20;test%26%23x20;successful%26quot;)></img%20src%3d%26%23x6a;%26%23x61;%26%23x76;%26%23x61;%26%23x73;%26%23x63;%26%23x72;%26%23x69;%26%23x70;%26%23x74;%26%23x3a;alert(%26quot;%26%23x20;xss%26%23x20;test%26%23x20;successful%26quot;)>
\ No newline at end of file
diff --git a/tests/data/4.test b/tests/data/4.test
new file mode 100644
index 00000000..c4cf51cd
--- /dev/null
+++ b/tests/data/4.test
@@ -0,0 +1 @@
+<scr<script></script>ipt type="text/javascript">alert("foo");</script>script<del></del>>
diff --git a/tests/data/4.test.out b/tests/data/4.test.out
new file mode 100644
index 00000000..88ea86b2
--- /dev/null
+++ b/tests/data/4.test.out
@@ -0,0 +1 @@
+<scr<script>ipt type="text/javascript">alert("foo");script<del></del>></scr<script>
diff --git a/tests/data/5.test.out b/tests/data/5.test.out
index 1782eafb..0d88a88a 100644
--- a/tests/data/5.test.out
+++ b/tests/data/5.test.out
@@ -1 +1 @@
-&gt;%22%27&gt;&lt;img%20src%3d%22javascript:alert(%27%20xss%27)%22&gt;
\ No newline at end of file
+&gt;%22%27&gt;&lt;img%20src%3d%22javascript:alert(%27%20xss%27)%22&gt;&lt;/img%20src%3d%22javascript:alert(%27%20xss%27)%22&gt;
\ No newline at end of file
diff --git a/tests/data/9.test.out b/tests/data/9.test.out
index 3a4d9b6c..5c5eb6ba 100644
--- a/tests/data/9.test.out
+++ b/tests/data/9.test.out
@@ -1 +1 @@
-'';!--"&lt;xss&gt;=&amp;{()}
\ No newline at end of file
+'';!--"&lt;xss&gt;=&amp;{()}&lt;/xss&gt;
\ No newline at end of file
diff --git a/tests/test_security.py b/tests/test_security.py
index 4fb30207..356b1292 100644
--- a/tests/test_security.py
+++ b/tests/test_security.py
@@ -22,7 +22,7 @@ def test_nested_script_tag():
def test_nested_script_tag_r():
assert (
clean('>') ==
- '&lt;script&lt;script&gt;&gt;evil()&lt;/script&lt;&gt;&gt;'
+ '&lt;script&lt;script&gt;&gt;evil()&gt;&lt;/script&lt;script&gt;'
)
@@ -90,8 +90,11 @@ def test_invalid_filter_attr():
def test_invalid_tag_char():
assert (
- clean('<script/xss src="http://xx.com/xss.js"></script>') ==
- '&lt;script xss="" src="http://xx.com/xss.js"&gt;&lt;/script&gt;'
+ clean('<script/xss src="http://xx.com/xss.js"></script>') in
+ [
+ '&lt;script src="http://xx.com/xss.js" xss=""&gt;&lt;/script&gt;',
+ '&lt;script xss="" src="http://xx.com/xss.js"&gt;&lt;/script&gt;'
+ ]
)
assert (
clean('') ==
@@ -102,15 +105,21 @@ def test_invalid_tag_char():
def test_unclosed_tag():
assert (
clean('<scr<script></script>ipt type="text/javascript">alert("foo");'
- '</script>script<del></del>>')
- expect = ('&lt;scr&lt;script&gt;&lt;/script&gt;ipt type="text/javascript"'
- '&gt;alert("foo");&lt;/script&gt;script&lt;del&gt;&lt;/del&gt;'
- '&gt;')
- assert clean(test) == expect
-
-
def test_poster_attribute():
"""Poster attributes should not allow javascript."""
tags = ['video']
From 02facd34b02cd9d9e16b547e07b07ddd3db40455 Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Tue, 21 Feb 2017 13:23:00 -0500
Subject: [PATCH 04/12] Minor code cleanup
* address FIXMEs
* minor cleanup to code and comments
---
bleach/__init__.py | 8 ++++----
bleach/sanitizer.py | 14 ++++++--------
tests/test_basics.py | 15 ++++++++++-----
tests/test_css.py | 6 +++---
tests/test_links.py | 9 ---------
tests/test_security.py | 2 +-
6 files changed, 24 insertions(+), 30 deletions(-)
diff --git a/bleach/__init__.py b/bleach/__init__.py
index c54dc72b..d1a82cde 100644
--- a/bleach/__init__.py
+++ b/bleach/__init__.py
@@ -120,9 +120,11 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
:arg strip: whether or not to strip disallowed elements
:arg strip_comments: whether or not to strip HTML comments
+ :returns: cleaned text as unicode
+
"""
if not text:
- return ''
+ return u''
text = force_unicode(text)
@@ -152,7 +154,6 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False,
- # FIXME(willkg): parse_email=False, tokenizer=HTMLSanitizer):
parse_email=False):
"""Convert URL-like strings in an HTML fragment to links
@@ -170,9 +171,8 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False,
text = force_unicode(text)
if not text:
- return ''
+ return u''
- # FIXME(willkg): parser = html5lib.HTMLParser(tokenizer=tokenizer)
parser = html5lib.HTMLParser()
forest = parser.parseFragment(text)
diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py
index fb502b85..0701def2 100644
--- a/bleach/sanitizer.py
+++ b/bleach/sanitizer.py
@@ -63,9 +63,7 @@ def allow_token(self, token):
attrs = {}
for namespaced_name, val in token['data'].items():
namespace, name = namespaced_name
- # FIXME(willkg): "name" used to be something like "xlink:href"
- # but it's now (namespace['xlink'], 'href'). we should fix the
- # name here so it's what the callable would expect.
+
if callable(allowed_attributes):
if allowed_attributes(name, val):
attrs[namespaced_name] = val
@@ -73,12 +71,14 @@ def allow_token(self, token):
elif name in allowed_attributes:
attrs[namespaced_name] = val
- # Go through all the uri-type attributes
+ # Handle attributes that have uri values
for attr in self.attr_val_is_uri:
if attr not in attrs:
continue
+
val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
unescape(attrs[attr])).lower()
+
# Remove replacement characters from unescaped characters.
val_unescaped = val_unescaped.replace("\ufffd", "")
@@ -87,17 +87,15 @@ def allow_token(self, token):
# It has a protocol, but it's not allowed--so drop it
del attrs[attr]
- # FIXME(willkg): is this right?
for attr in self.svg_attr_val_allows_ref:
if attr in attrs:
attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
' ',
unescape(attrs[attr]))
- # FIXME(willkg): is this right?
if (token['name'] in self.svg_allow_local_href and
- (namespace['xlink'], 'href') in attrs and
- re.search(r'^\s*[^#\s].*', attrs[(namespace['xlink'], 'href')])):
+ (namespaces['xlink'], 'href') in attrs and
+ re.search(r'^\s*[^#\s].*', attrs[(namespaces['xlink'], 'href')])):
del attrs[(namespace['xlink'], 'href')]
# Sanitize css in style attribute
diff --git a/tests/test_basics.py b/tests/test_basics.py
index 8e293ca6..49148592 100644
--- a/tests/test_basics.py
+++ b/tests/test_basics.py
@@ -71,12 +71,17 @@ def test_function_arguments():
def test_named_arguments():
ATTRS = {'a': ['rel', 'href']}
- s = ('<a href="http://xx.com" rel="alternate">xx.com</a>',
- '<a rel="alternate" href="http://xx.com">xx.com</a>')
- assert bleach.clean(s[0]) == '<a href="http://xx.com">xx.com</a>'
- # FIXME: This might not be needed if attribute order is stable now.
- assert bleach.clean(s[0], attributes=ATTRS) in s
+ text = '<a href="http://xx.com" rel="alternate">xx.com</a>'
+
+ assert bleach.clean(text) == '<a href="http://xx.com">xx.com</a>'
+ assert (
+ bleach.clean(text, attributes=ATTRS) in
+ [
+ '<a href="http://xx.com" rel="alternate">xx.com</a>',
+ '<a rel="alternate" href="http://xx.com">xx.com</a>'
+ ]
+ )
def test_disallowed_html():
diff --git a/tests/test_css.py b/tests/test_css.py
index 3d224fac..d8880d78 100644
--- a/tests/test_css.py
+++ b/tests/test_css.py
@@ -119,8 +119,7 @@ def test_style_hang():
'background-color: rgb(246, 246, 242);',
'overflow-x: auto;',
'overflow-y: auto;',
- # FIXME(willkg): This fails the first regxp gauntlet in sanitize_css.
- # 'font: italic small-caps bolder condensed 16px/3 cursive;',
+ 'font: italic small-caps bolder condensed 16px/3 cursive;',
'background-position: initial initial;',
'background-repeat: initial initial;'
]
@@ -146,7 +145,8 @@ def test_style_hang():
'padding-right: 15px; '
'padding-bottom: 15px; '
'padding-left: 15px; '
- 'background-color: rgb(246, 246, 242);'
+ 'background-color: rgb(246, 246, 242); '
+ 'font: italic small-caps bolder condensed 16px/3 cursive;'
Hello world">
'">Hello world</p>'
)
diff --git a/tests/test_links.py b/tests/test_links.py
index 6b7a77eb..53d60e5c 100644
--- a/tests/test_links.py
+++ b/tests/test_links.py
@@ -3,7 +3,6 @@
except ImportError:
from urllib import quote_plus
-# FIXME(willkg): from html5lib.tokenizer import HTMLTokenizer
import pytest
from bleach import linkify, url_re, DEFAULT_CALLBACKS as DC
@@ -499,14 +498,6 @@ def test_ports(data, expected_data):
assert linkify(data) == out.format(*expected_data)
-@pytest.mark.xfail(reason='html5lib >= 0.99999999: no access to tokenizer')
-def test_tokenizer():
- """Linkify doesn't always have to sanitize."""
- raw = '<em>test<x></x></em>'
- assert linkify(raw) == '<em>test&lt;x&gt;&lt;/x&gt;</em>'
- assert linkify(raw, tokenizer=HTMLTokenizer) == raw
-
-
def test_ignore_bad_protocols():
assert (
linkify('foohttp://bar') ==
diff --git a/tests/test_security.py b/tests/test_security.py
index 356b1292..2aac0200 100644
--- a/tests/test_security.py
+++ b/tests/test_security.py
@@ -75,7 +75,7 @@ def test_invalid_href_attr():
def test_invalid_filter_attr():
IMG = ['img', ]
IMG_ATTR = {
- 'img': lambda n, v: n == 'src' and v == "http://example.com/"
+ 'img': lambda attr, val: attr == 'src' and val == "http://example.com/"
}
assert (
From 10852231012ae3eece9c0a2af5c6c7c8e2e5212f Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Tue, 21 Feb 2017 15:37:00 -0500
Subject: [PATCH 05/12] Update CHANGES and docs
---
CHANGES | 19 +++++++++++++++----
README.rst | 2 --
docs/clean.rst | 5 ++++-
docs/conf.py | 3 +--
docs/dev.rst | 4 ++--
docs/goals.rst | 4 ++--
docs/linkify.rst | 17 +----------------
7 files changed, 25 insertions(+), 29 deletions(-)
diff --git a/CHANGES b/CHANGES
index ec3bc992..e8e49a8d 100644
--- a/CHANGES
+++ b/CHANGES
@@ -8,10 +8,19 @@ Version 2.0 (in development)
- Removed support for Python 2.6. #206
- Removed support for Python 3.2. #224
+- Bleach no longer supports html5lib < 0.99999999 (8 9s).
+
+ This version represents a rewrite to use the new sanitizing API since
+ the old one was dropped in html5lib 0.99999999 (8 9s).
+
+- linkify no longer accepts a tokenizer argument.
+- clean output is different from previous versions; in particular, this
+  version adds end tags even if the tag will be escaped.
**Changes**
-- Added testing for Python 3.6.
+- Supports Python 3.6.
+- Supports html5lib >= 0.99999999 (8 9s).
Version 1.5 (November 4th, 2016)
@@ -20,9 +29,11 @@ Version 1.5 (November 4th, 2016)
**Backwards incompatible changes**
- clean: The list of ``ALLOWED_PROTOCOLS`` now defaults to http, https and
- mailto. Previously it was a long list of protocols something like ed2k, ftp,
- http, https, irc, mailto, news, gopher, nntp, telnet, webcal, xmpp, callto,
- feed, urn, aim, rsync, tag, ssh, sftp, rtsp, afs, data. #149
+ mailto.
+
+ Previously it was a long list of protocols something like ed2k, ftp, http,
+ https, irc, mailto, news, gopher, nntp, telnet, webcal, xmpp, callto, feed,
+ urn, aim, rsync, tag, ssh, sftp, rtsp, afs, data. #149
**Changes**
diff --git a/README.rst b/README.rst
index 8f9ce05d..3bd87573 100644
--- a/README.rst
+++ b/README.rst
@@ -101,5 +101,3 @@ The simplest way to use Bleach is:
.. _GitHub: https://github.com/mozilla/bleach
.. _ReadTheDocs: https://bleach.readthedocs.io/
.. _PyPI: http://pypi.python.org/pypi/bleach
-
-
diff --git a/docs/clean.rst b/docs/clean.rst
index ebd82055..a988a81a 100644
--- a/docs/clean.rst
+++ b/docs/clean.rst
@@ -162,7 +162,7 @@ For example, this sets allowed protocols to http, https and smb:
u'allowed protocol'
-This adds smb to the bleach-specified set of allowed protocols:
+This adds smb to the Bleach-specified set of allowed protocols:
.. doctest::
@@ -187,6 +187,7 @@ whitelist and invalid markup. For example:
.. doctest::
>>> import bleach
+
>>> bleach.clean('<span>is not allowed</span>')
u'&lt;span&gt;is not allowed&lt;/span&gt;'
>>> bleach.clean('<b><span>is not allowed</span></b>', tags=['b'])
@@ -199,6 +200,7 @@ If you would rather Bleach stripped this markup entirely, you can pass
.. doctest::
>>> import bleach
+
>>> bleach.clean('<span>is not allowed</span>', strip=True)
u'is not allowed'
>>> bleach.clean('<b><span>is not allowed</span></b>', tags=['b'], strip=True)
@@ -214,6 +216,7 @@ By default, Bleach will strip out HTML comments. To disable this behavior, set
.. doctest::
>>> import bleach
+
>>> html = 'my<!-- commented --> html'
>>> bleach.clean(html)
diff --git a/docs/conf.py b/docs/conf.py
index 00b9c239..e186c827 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -27,8 +27,7 @@
# Add any Sphinx extension module names here, as strings. They can be extensions
# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
-extensions = ['sphinx.ext.autodoc', 'sphinx.ext.pngmath', 'sphinx.ext.viewcode',
- 'sphinx.ext.doctest']
+extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode', 'sphinx.ext.doctest']
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
diff --git a/docs/dev.rst b/docs/dev.rst
index 027a0a76..02f8d44a 100644
--- a/docs/dev.rst
+++ b/docs/dev.rst
@@ -5,7 +5,7 @@ Bleach development
Docs
====
-Docs are in ``docs/``. We use Sphinx. Docs are pushed to readthedocs
+Docs are in ``docs/``. We use Sphinx. Docs are pushed to ReadTheDocs
via a GitHub webhook.
@@ -16,7 +16,7 @@ Run::
$ tox
-That'll run bleach tests in all the supported Python environments. Note
+That'll run Bleach tests in all the supported Python environments. Note
that you need the necessary Python binaries for them all to be tested.
Tests are run in Travis CI via a GitHub webhook.
diff --git a/docs/goals.rst b/docs/goals.rst
index 01f63a94..632c222c 100644
--- a/docs/goals.rst
+++ b/docs/goals.rst
@@ -91,10 +91,10 @@ Make malicious content look pretty or sane
------------------------------------------
Malicious content is designed to be malicious. Making it safe is a design goal
-of bleach. Making it pretty or sane-looking is not.
+of Bleach. Making it pretty or sane-looking is not.
If you want your malicious content to look pretty, you should pass it through
-bleach to make it safe and then do your own transform afterwards.
+Bleach to make it safe and then do your own transform afterwards.
Allow arbitrary styling
diff --git a/docs/linkify.rst b/docs/linkify.rst
index b7449c34..705000c2 100644
--- a/docs/linkify.rst
+++ b/docs/linkify.rst
@@ -9,7 +9,7 @@
control how and when those links are rendered::
def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False,
- parse_email=False, tokenizer=HTMLSanitizer):
+ parse_email=False):
"""Convert URL-like strings in an HTML fragment to links.
``linkify()`` works by building a document tree, so it's guaranteed never to do
@@ -194,19 +194,4 @@ they are newly created or already in the text, so be careful when writing
callbacks that may need to behave differently if the protocol is ``mailto:``.
-``tokenizer``
-=============
-
-``linkify()`` uses the ``html5lib.sanitizer.HTMLSanitizer`` tokenizer by
-default. This has the effect of scrubbing some tags and attributes. To use a
-more lenient, or totally different, tokenizer, you can specify the tokenizer
-class here. (See the implementation of :ref:`clean() ` for an
-example of building a custom tokenizer.)
-
-::
-
- from html5lib.tokenizer import HTMLTokenizer
- linked_text = linkify(text, tokenizer=HTMLTokenizer)
-
-
.. _Crate: https://crate.io/
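Before the CI updates that follow, one note to make the CHANGES entry above
concrete: clean() now emits an explicit (escaped) end tag for a disallowed
element even when the input never closed it. A doctest-style sketch using
9.test from the regression data (assuming this branch with
html5lib >= 0.99999999; the expected string mirrors the updated 9.test.out):

    >>> from bleach import clean
    >>> clean('\'\';!--"<xss>=&{()}')
    u'\'\';!--"&lt;xss&gt;=&amp;{()}&lt;/xss&gt;'

The parser infers the missing </xss> end tag, and the serializer now renders
it (escaped) instead of dropping it.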
From 066631af96ee9c16ddcf9d132bd8537a4af17da6 Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Tue, 21 Feb 2017 20:49:21 -0500
Subject: [PATCH 06/12] Update .travis.yml
---
.travis.yml | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/.travis.yml b/.travis.yml
index 4e66cf1d..318dfa7d 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -11,9 +11,8 @@ python:
- "3.6"
- "pypy"
env:
-- HTML5LIB=0.999 # 3
-- HTML5LIB=0.999999 # 6
-- HTML5LIB=0.9999999 # 7
+- HTML5LIB=0.99999999 # 8
+- HTML5LIB=0.999999999 # 9
install:
- pip install -r requirements.txt
- pip install html5lib==$HTML5LIB
From 30772dd20f16a8ec75f7c1ddb7b76c9ff6fd97d2 Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Tue, 21 Feb 2017 20:57:13 -0500
Subject: [PATCH 07/12] Another .travis.yml fix
---
.travis.yml | 1 +
1 file changed, 1 insertion(+)
diff --git a/.travis.yml b/.travis.yml
index 318dfa7d..a5db65b1 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -14,6 +14,7 @@ env:
- HTML5LIB=0.99999999 # 8
- HTML5LIB=0.999999999 # 9
install:
+- pip install -U pip
- pip install -r requirements.txt
- pip install html5lib==$HTML5LIB
script:
From 85cc802584f4290556b14e50384c2ed6bb6959b6 Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Tue, 21 Feb 2017 21:05:59 -0500
Subject: [PATCH 08/12] One more .travis.yml fix
---
.travis.yml | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/.travis.yml b/.travis.yml
index a5db65b1..14015378 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -14,9 +14,11 @@ env:
- HTML5LIB=0.99999999 # 8
- HTML5LIB=0.999999999 # 9
install:
-- pip install -U pip
-- pip install -r requirements.txt
-- pip install html5lib==$HTML5LIB
+ # html5lib 0.99999999 (8 9s) requires at least setuptools 18.5
+ - pip install -U pip 'setuptools>=18.5'
+ - pip install -r requirements.txt
+ # stomp on html5lib install with the specified one
+ - pip install html5lib==$HTML5LIB
script:
- py.test
- flake8 bleach/
From 4cca43b5f41a4cc157fa2e6f5dd5185f70cb7a6c Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Tue, 21 Feb 2017 21:10:40 -0500
Subject: [PATCH 09/12] Fix flake8 issues
---
bleach/sanitizer.py | 4 ++--
setup.cfg | 1 +
2 files changed, 3 insertions(+), 2 deletions(-)
diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py
index 0701def2..68438e9a 100644
--- a/bleach/sanitizer.py
+++ b/bleach/sanitizer.py
@@ -1,6 +1,6 @@
from __future__ import unicode_literals
import re
-from xml.sax.saxutils import escape, unescape
+from xml.sax.saxutils import unescape
from html5lib.constants import namespaces
from html5lib.filters import sanitizer
@@ -83,7 +83,7 @@ def allow_token(self, token):
val_unescaped = val_unescaped.replace("\ufffd", "")
if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped) and
- (val_unescaped.split(':')[0] not in self.allowed_protocols)):
+ (val_unescaped.split(':')[0] not in self.allowed_protocols)):
# It has a protocol, but it's not allowed--so drop it
del attrs[attr]
diff --git a/setup.cfg b/setup.cfg
index f3a416e4..950364a7 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -3,6 +3,7 @@ test=pytest
[flake8]
ignore = E731,W503
+max-line-length = 100
[wheel]
universal=1
From 49f35dd27317019e47febed1c0c507177297a8af Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Fri, 24 Feb 2017 12:21:38 -0500
Subject: [PATCH 10/12] Add tests, fix alphabetizing, code cleanup
* this adds some missing tests for more coverage
* html5lib 0.99999999 and 0.999999999 have an alphabeticalattributes filter that
  doesn't work in Python 3 when the attribute set mixes keys that have a
  namespace with keys that don't; this rolls alphabetizing into the Bleach
  sanitizer (see the sketch after this message)
* remove some dead code and clean some other code up
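
The Python 3 failure and the fix are easy to see in isolation. A minimal
sketch (the attrs dict is hypothetical; _attr_key mirrors the helper added in
bleach/sanitizer.py below):

    # Attribute keys are (namespace, name) tuples; namespace is None or a str.
    attrs = {
        (None, 'id'): 'patt1',
        ('http://www.w3.org/1999/xlink', 'href'): '#patt2',
    }

    # sorted(attrs.items()) raises TypeError on Python 3: '<' is not
    # supported between None and str. Treating a None namespace as the
    # empty string makes the keys comparable:
    def _attr_key(attr):
        return (attr[0][0] or ''), attr[0][1]

    print(sorted(attrs.items(), key=_attr_key))
    # [((None, 'id'), 'patt1'),
    #  (('http://www.w3.org/1999/xlink', 'href'), '#patt2')]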
---
bleach/__init__.py | 17 ++++-----
bleach/sanitizer.py | 43 +++++++++++++++++-----
tests/test_basics.py | 86 ++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 129 insertions(+), 17 deletions(-)
diff --git a/bleach/__init__.py b/bleach/__init__.py
index d1a82cde..ae96a925 100644
--- a/bleach/__init__.py
+++ b/bleach/__init__.py
@@ -61,8 +61,6 @@
# Make sure that .com doesn't get matched by .co first
TLDS.reverse()
-PROTOCOLS = allowed_protocols
-
url_re = re.compile(
r"""\(* # Match any opening parentheses.
\b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)?  # http://
([\w-]+\.)+(?:{1})(?:\:[0-9]+)?(?!\.\w)\b   # xx.yy.tld(:##)?
(?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
# /path/zz (excluding "unsafe" chars from RFC 1738,
# except for # and ~, which happen in practice)
- """.format('|'.join(PROTOCOLS), '|'.join(TLDS)),
+ """.format('|'.join(allowed_protocols), '|'.join(TLDS)),
re.IGNORECASE | re.VERBOSE | re.UNICODE)
proto_re = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE)
@@ -87,8 +85,6 @@
""",
re.IGNORECASE | re.MULTILINE | re.VERBOSE)
-NODE_TEXT = 4 # The numeric ID of a text node in simpletree.
-
ETREE_TAG = lambda x: "".join(['{http://www.w3.org/1999/xhtml}', x])
# a simple routine that returns the tag name with the namespace prefix
# as returned by etree's Element.tag attribute
@@ -147,8 +143,13 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
)
s = HTMLSerializer(
quote_attr_values='always',
- alphabetical_attributes=True,
- omit_optional_tags=False
+ omit_optional_tags=False,
+
+ # Bleach has its own sanitizer, so don't use the html5lib one
+ sanitize=False,
+
+ # Bleach sanitizer alphabetizes already, so don't use the html5lib one
+ alphabetical_attributes=False,
)
return s.render(filtered)
@@ -176,7 +177,7 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False,
parser = html5lib.HTMLParser()
forest = parser.parseFragment(text)
- _seen = set([])
+ _seen = set()
def replace_nodes(tree, new_frag, node, index=0):
"""Doesn't really replace nodes, but inserts the nodes contained in
diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py
index 68438e9a..c12b8d24 100644
--- a/bleach/sanitizer.py
+++ b/bleach/sanitizer.py
@@ -1,4 +1,5 @@
from __future__ import unicode_literals
+from collections import OrderedDict
import re
from xml.sax.saxutils import unescape
@@ -6,6 +7,19 @@
from html5lib.filters import sanitizer
+def _attr_key(attr):
+ """Returns appropriate key for sorting attribute names
+
+ Attribute names are a tuple of ``(namespace, name)`` where namespace can be
+ ``None`` or a string. These can't be compared in Python 3, so we convert the
+ ``None`` to an empty string.
+
+ """
+ key = (attr[0][0] or ''), attr[0][1]
+ return key
+
+
class BleachSanitizerFilter(sanitizer.Filter):
def __init__(self, source, allowed_attributes_map,
strip_disallowed_elements=False, strip_html_comments=True,
@@ -87,22 +101,33 @@ def allow_token(self, token):
# It has a protocol, but it's not allowed--so drop it
del attrs[attr]
+ # Drop values in svg attrs with non-local IRIs
for attr in self.svg_attr_val_allows_ref:
if attr in attrs:
- attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
- ' ',
- unescape(attrs[attr]))
-
- if (token['name'] in self.svg_allow_local_href and
- (namespaces['xlink'], 'href') in attrs and
- re.search(r'^\s*[^#\s].*', attrs[(namespaces['xlink'], 'href')])):
- del attrs[(namespace['xlink'], 'href')]
+ new_val = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
+ ' ',
+ unescape(attrs[attr]))
+ new_val = new_val.strip()
+ if not new_val:
+ del attrs[attr]
+ else:
+ attrs[attr] = new_val
+
+ # Drop href and xlink:href attr for svg elements with non-local IRIs
+ if (None, token['name']) in self.svg_allow_local_href:
+ for href_attr in [(None, 'href'), (namespaces['xlink'], 'href')]:
+ if href_attr in attrs:
+ if re.search(r'^\s*[^#\s]', attrs[href_attr]):
+ del attrs[href_attr]
# Sanitize css in style attribute
if (None, u'style') in attrs:
attrs[(None, u'style')] = self.sanitize_css(attrs[(None, u'style')])
- token['data'] = attrs
+ # Alphabetize attributes
+ token['data'] = OrderedDict(
+ [(key, val) for key, val in sorted(attrs.items(), key=_attr_key)]
+ )
return token
def sanitize_css(self, style):
diff --git a/tests/test_basics.py b/tests/test_basics.py
index 49148592..bae1506e 100644
--- a/tests/test_basics.py
+++ b/tests/test_basics.py
@@ -240,6 +240,92 @@ def test_wildcard_attributes():
assert bleach.clean(dirty, tags=TAG, attributes=ATTR) in clean
+def test_callable_attributes():
+ """Verify callable attributes work and get correct arg values"""
+ def img_test(attr, val):
+ return attr == 'src' and val.startswith('https')
+
+ ATTR = {
+ 'img': img_test,
+ }
+ TAGS = ['img']
+
+ assert (
+ bleach.clean('foo <img src="http://example.com"> baz', tags=TAGS, attributes=ATTR) ==
+ u'foo <img> baz'
+ )
+ assert (
+ bleach.clean('foo <img src="https://example.com"> baz', tags=TAGS, attributes=ATTR) ==
+ u'foo <img src="https://example.com"> baz'
+ )
+
+
+def test_svg_attr_val_allows_ref():
+ """Unescape values in svg attrs that allow url references"""
+ # Local IRI, so keep it
+ text = '<svg><rect fill="url(#foo)" /></svg>'
+ TAGS = ['svg', 'rect']
+ ATTRS = {
+ 'rect': ['fill'],
+ }
+ assert (
+ bleach.clean(text, tags=TAGS, attributes=ATTRS) ==
+ '<svg><rect fill="url(#foo)"></rect></svg>'
+ )
+
+ # Non-local IRI, so drop it
+ text = '<svg><rect fill="url(http://example.com/foo)" /></svg>'
+ TAGS = ['svg', 'rect']
+ ATTRS = {
+ 'rect': ['fill'],
+ }
+ assert (
+ bleach.clean(text, tags=TAGS, attributes=ATTRS) ==
+ '<svg><rect></rect></svg>'
+ )
+
+
+@pytest.mark.parametrize('text, expected', [
+ (
+ '<svg><pattern id="patt1" href="#patt2"></pattern></svg>',
+ '<svg><pattern href="#patt2" id="patt1"></pattern></svg>'
+ ),
+ (
+ '<svg><pattern id="patt1" xlink:href="#patt2"></pattern></svg>',
+ # NOTE(willkg): Bug in html5lib serializer drops the xlink part
+ '<svg><pattern id="patt1" href="#patt2"></pattern></svg>'
+ ),
+])
+def test_svg_allow_local_href(text, expected):
+ """Keep local hrefs for svg elements"""
+ TAGS = ['svg', 'pattern']
+ ATTRS = {
+ 'pattern': ['id', 'href'],
+ }
+ assert bleach.clean(text, tags=TAGS, attributes=ATTRS) == expected
+
+
+@pytest.mark.parametrize('text, expected', [
+ (
+ '<svg><pattern id="patt1" href="https://example.com/patt"></pattern></svg>',
+ '<svg><pattern id="patt1"></pattern></svg>'
+ ),
+ (
+ '<svg><pattern id="patt1" xlink:href="https://example.com/patt"></pattern></svg>',
+ '<svg><pattern id="patt1"></pattern></svg>'
+ ),
+])
+def test_svg_allow_local_href_nonlocal(text, expected):
+ """Drop non-local hrefs for svg elements"""
+ TAGS = ['svg', 'pattern']
+ ATTRS = {
+ 'pattern': ['id', 'href'],
+ }
+ assert bleach.clean(text, tags=TAGS, attributes=ATTRS) == expected
+
+
+
+
@pytest.mark.xfail(reason='html5lib >= 0.99999999: changed API')
def test_sarcasm():
"""Jokes should crash."""
From 70d96e390c1525d8508436d8582daa49357fdf3e Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Fri, 24 Feb 2017 12:49:12 -0500
Subject: [PATCH 11/12] Alphabetize before escaping disallowed tokens
Making this change means the output is stable since attributes will always
appear in the same order (see the sketch after this message). Maybe it's not a
great idea, but stable seems good. If it turns out this is terrible, someone
will complain with a compelling use case and we can undo it.
I also went through and removed a bunch of the "the output is either this or
that" assertions in the tests.
---
bleach/sanitizer.py | 6 ++++++
tests/test_basics.py | 31 +++++++++++++------------------
tests/test_unicode.py | 7 ++-----
3 files changed, 21 insertions(+), 23 deletions(-)
diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py
index c12b8d24..789d89e6 100644
--- a/bleach/sanitizer.py
+++ b/bleach/sanitizer.py
@@ -58,6 +58,12 @@ def sanitize_token(self, token):
pass
else:
+ if 'data' in token:
+ # Alphabetize the attributes before calling .disallowed_token()
+ # so that the resulting string is stable
+ token['data'] = OrderedDict(
+ [(key, val) for key, val in sorted(token['data'].items(), key=_attr_key)]
+ )
return self.disallowed_token(token)
elif token_type == 'Comment':
diff --git a/tests/test_basics.py b/tests/test_basics.py
index bae1506e..c42ccc62 100644
--- a/tests/test_basics.py
+++ b/tests/test_basics.py
@@ -76,11 +76,8 @@ def test_named_arguments():
assert bleach.clean(text) == '<a href="http://xx.com">xx.com</a>'
assert (
- bleach.clean(text, attributes=ATTRS) in
- [
- '<a href="http://xx.com" rel="alternate">xx.com</a>',
- '<a rel="alternate" href="http://xx.com">xx.com</a>'
- ]
+ bleach.clean(text, attributes=ATTRS) ==
+ '<a href="http://xx.com" rel="alternate">xx.com</a>'
)
@@ -199,25 +196,22 @@ def test_idempotent():
clean = bleach.clean(dirty)
assert bleach.clean(clean) == clean
- possible_outs = (
- 'invalid &amp; &lt; extra <a rel="nofollow" href="http://link.com">http://link.com</a>',
+ linked = bleach.linkify(dirty)
+ assert (
+ bleach.linkify(linked) ==
'invalid &amp; &lt; extra <a href="http://link.com" rel="nofollow">http://link.com</a>'
)
- linked = bleach.linkify(dirty)
- assert bleach.linkify(linked) in possible_outs
def test_rel_already_there():
"""Make sure rel attribute is updated not replaced"""
linked = ('Click <a href="http://example.com" rel="tooltip">'
'here</a>.')
- link_good = (('Click <a href="http://example.com" rel="tooltip nofollow">'
- 'here</a>.'),
- ('Click <a rel="tooltip nofollow" href="http://example.com">'
- 'here</a>.'))
- assert bleach.linkify(linked) in link_good
- assert bleach.linkify(link_good[0]) in link_good
+ link_good = 'Click <a href="http://example.com" rel="tooltip nofollow">here</a>.'
+
+ assert bleach.linkify(linked) == link_good
+ assert bleach.linkify(link_good) == link_good
def test_lowercase_html():
@@ -235,9 +229,10 @@ def test_wildcard_attributes():
TAG = ['img', 'em']
dirty = ('both <em id="foo">can</em> have '
'<img id="bar" src="foo" />')
- clean = ('both <em id="foo">can</em> have <img id="bar" src="foo">',
- 'both <em id="foo">can</em> have <img src="foo" id="bar">')
- assert bleach.clean(dirty, tags=TAG, attributes=ATTR) in clean
+ assert (
+ bleach.clean(dirty, tags=TAG, attributes=ATTR) ==
+ 'both <em id="foo">can</em> have <img id="bar" src="foo">'
+ )
def test_callable_attributes():
diff --git a/tests/test_unicode.py b/tests/test_unicode.py
index b8b670e8..08ab3f4e 100644
--- a/tests/test_unicode.py
+++ b/tests/test_unicode.py
@@ -27,11 +27,8 @@ def test_mixed():
def test_mixed_linkify():
assert (
- linkify('Домашняя http://example.com ヘルプとチュートリアル') in
- (
- 'Домашняя <a href="http://example.com" rel="nofollow">http://example.com</a> ヘルプとチュートリアル',
- 'Домашняя <a rel="nofollow" href="http://example.com">http://example.com</a> ヘルプとチュートリアル'
- )
+ linkify('Домашняя http://example.com ヘルプとチュートリアル') ==
+ 'Домашняя <a href="http://example.com" rel="nofollow">http://example.com</a> ヘルプとチュートリアル'
)
From 86dea93c17d96a237acd2543264ba27d83f15d79 Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Fri, 24 Feb 2017 13:09:20 -0500
Subject: [PATCH 12/12] Cosmetic fix for readability
---
bleach/__init__.py | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/bleach/__init__.py b/bleach/__init__.py
index ae96a925..a645ac7e 100644
--- a/bleach/__init__.py
+++ b/bleach/__init__.py
@@ -130,16 +130,18 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
walker = html5lib.getTreeWalker('etree')
filtered = BleachSanitizerFilter(
source=walker(dom),
+
+ # Bleach-sanitizer-specific things
allowed_attributes_map=attributes,
+ strip_disallowed_elements=strip,
+ strip_html_comments=strip_comments,
+ # html5lib-sanitizer things
allowed_elements=tags,
allowed_css_properties=styles,
allowed_protocols=protocols,
-
allowed_svg_properties=[],
- strip_disallowed_elements=strip,
- strip_html_comments=strip_comments
)
s = HTMLSerializer(
quote_attr_values='always',
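
Taken together, the series leaves clean() with this pipeline (a sketch
stitched from the hunks above; argument names match bleach/__init__.py and
bleach/sanitizer.py on this branch, and the HTMLSerializer import path is the
html5lib >= 0.99999999 one -- clean() itself remains the authoritative
version):

    import html5lib
    from html5lib.serializer import HTMLSerializer

    from bleach.sanitizer import BleachSanitizerFilter

    def clean_sketch(text, tags, attributes, styles, protocols,
                     strip=False, strip_comments=True):
        # Parse the fragment and walk the tree, as clean() does.
        parser = html5lib.HTMLParser()
        dom = parser.parseFragment(text)
        walker = html5lib.getTreeWalker('etree')

        filtered = BleachSanitizerFilter(
            source=walker(dom),
            # Bleach-sanitizer-specific things
            allowed_attributes_map=attributes,
            strip_disallowed_elements=strip,
            strip_html_comments=strip_comments,
            # html5lib-sanitizer things
            allowed_elements=tags,
            allowed_css_properties=styles,
            allowed_protocols=protocols,
            allowed_svg_properties=[],
        )
        serializer = HTMLSerializer(
            quote_attr_values='always',
            omit_optional_tags=False,
            # Bleach's filter already sanitizes and alphabetizes, so both
            # html5lib serializer passes stay off.
            sanitize=False,
            alphabetical_attributes=False,
        )
        return serializer.render(filtered)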