From ef0c48765c160f5724d915770060a095cbc69dda Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Fri, 17 Feb 2017 14:55:55 -0500
Subject: [PATCH 01/12] Update dev requirements

---
 requirements.txt | 7 ++++---
 setup.py         | 2 +-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index a026c46e..6ec6bd90 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,12 +1,13 @@
 -e .

 # Requirements to run the test suite:
-pytest==3.0.3
-flake8==3.0.4
+pytest==3.0.6
+pytest-wholenodeid
+flake8==3.3.0
 tox==2.4.1

 # Requirements for building docs
-Sphinx==1.4.8
+Sphinx==1.5.2

 # Requirements for updating package
 twine==1.8.1
diff --git a/setup.py b/setup.py
index 167658ff..6c627a38 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@
 setup_requires.append('pytest-runner>=2.0,<3dev')

 tests_require = [
-    'pytest==3.0.3',
+    'pytest>=3.0.0',
 ]

 install_requires = [

From 567eebb53e0716f6d267fa8951c548d63bf78a70 Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Mon, 23 Jan 2017 17:02:03 -0500
Subject: [PATCH 02/12] Overhaul bleach to use html5lib >= 0.99999999

This is a bleach rewrite to use the new sanitizer API in html5lib
0.99999999. The new API happens as a filter when emitting the tree rather
than in the tokenizer. Because of that, the output of .clean() and
.linkify() is different than in previous versions of bleach.
---
 bleach/__init__.py     |  53 +++++++-----
 bleach/sanitizer.py    | 189 ++++++++++++++++++++---------------------
 setup.py               |   5 +-
 tests/test_basics.py   |   8 +-
 tests/test_css.py      |  74 ++++++++++------
 tests/test_links.py    |   4 +-
 tests/test_security.py |  24 ++++--
 tox.ini                |   7 +-
 8 files changed, 204 insertions(+), 160 deletions(-)

diff --git a/bleach/__init__.py b/bleach/__init__.py
index 09dad637..c54dc72b 100644
--- a/bleach/__init__.py
+++ b/bleach/__init__.py
@@ -5,13 +5,14 @@
 import re

 import html5lib
-from html5lib.sanitizer import HTMLSanitizer
-from html5lib.serializer.htmlserializer import HTMLSerializer
+from html5lib.filters import sanitizer
+from html5lib.filters.sanitizer import allowed_protocols
+from html5lib.serializer import HTMLSerializer

-from . import callbacks as linkify_callbacks
-from .encoding import force_unicode
-from .sanitizer import BleachSanitizer
-from .version import __version__, VERSION  # flake8: noqa
+from bleach import callbacks as linkify_callbacks
+from bleach.encoding import force_unicode
+from bleach.sanitizer import BleachSanitizerFilter
+from bleach.version import __version__, VERSION  # flake8: noqa


 __all__ = ['clean', 'linkify']

@@ -60,7 +61,7 @@
 # Make sure that .com doesn't get matched by .co first
 TLDS.reverse()

-PROTOCOLS = HTMLSanitizer.acceptable_protocols
+PROTOCOLS = allowed_protocols

 url_re = re.compile(
     r"""\(*  # Match any opening parentheses.
@@ -125,21 +126,34 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,

     text = force_unicode(text)

-    class s(BleachSanitizer):
-        allowed_elements = tags
-        allowed_attributes = attributes
-        allowed_css_properties = styles
-        allowed_protocols = protocols
-        strip_disallowed_elements = strip
-        strip_html_comments = strip_comments
+    parser = html5lib.HTMLParser(namespaceHTMLElements=False)
+    dom = parser.parseFragment(text)

-    parser = html5lib.HTMLParser(tokenizer=s)
+    walker = html5lib.getTreeWalker('etree')
+    filtered = BleachSanitizerFilter(
+        source=walker(dom),
+        allowed_attributes_map=attributes,
-    return _render(parser.parseFragment(text))
+        allowed_elements=tags,
+        allowed_css_properties=styles,
+        allowed_protocols=protocols,
+
+        allowed_svg_properties=[],
+
+        strip_disallowed_elements=strip,
+        strip_html_comments=strip_comments
+    )
+    s = HTMLSerializer(
+        quote_attr_values='always',
+        alphabetical_attributes=True,
+        omit_optional_tags=False
+    )
+    return s.render(filtered)


 def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False,
-            parse_email=False, tokenizer=HTMLSanitizer):
+            # FIXME(willkg): parse_email=False, tokenizer=HTMLSanitizer):
+            parse_email=False):
     """Convert URL-like strings in an HTML fragment to links

     ``linkify()`` converts strings that look like URLs, domain names and email
@@ -158,7 +172,8 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False,
     if not text:
         return ''

-    parser = html5lib.HTMLParser(tokenizer=tokenizer)
+    # FIXME(willkg): parser = html5lib.HTMLParser(tokenizer=tokenizer)
+    parser = html5lib.HTMLParser()
     forest = parser.parseFragment(text)

     _seen = set([])
@@ -427,7 +442,7 @@ def _render(tree):

 def _serialize(domtree):
     walker = html5lib.treewalkers.getTreeWalker('etree')
     stream = walker(domtree)
-    serializer = HTMLSerializer(quote_attr_values=True,
+    serializer = HTMLSerializer(quote_attr_values='always',
                                 alphabetical_attributes=True,
                                 omit_optional_tags=False)
     return serializer.render(stream)
diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py
index eec6659b..fb502b85 100644
--- a/bleach/sanitizer.py
+++ b/bleach/sanitizer.py
@@ -2,118 +2,125 @@
 import re
 from xml.sax.saxutils import escape, unescape

-from html5lib.constants import tokenTypes
-from html5lib.sanitizer import HTMLSanitizerMixin
-from html5lib.tokenizer import HTMLTokenizer
+from html5lib.constants import namespaces
+from html5lib.filters import sanitizer


-PROTOS = HTMLSanitizerMixin.acceptable_protocols
-PROTOS.remove('feed')
+class BleachSanitizerFilter(sanitizer.Filter):
+    def __init__(self, source, allowed_attributes_map,
+                 strip_disallowed_elements=False, strip_html_comments=True,
+                 **kwargs):
+        if isinstance(allowed_attributes_map, dict):
+            self.wildcard_attributes = allowed_attributes_map.get('*', [])
+            self.allowed_attributes_map = allowed_attributes_map
+        else:
+            self.wildcard_attributes = allowed_attributes_map
+            self.allowed_attributes_map = {}

-
-class BleachSanitizerMixin(HTMLSanitizerMixin):
-    """Mixin to replace sanitize_token() and sanitize_css()."""
-
-    allowed_svg_properties = []
+        self.strip_disallowed_elements = strip_disallowed_elements
+        self.strip_html_comments = strip_html_comments

+        return super(BleachSanitizerFilter, self).__init__(source, **kwargs)

     def sanitize_token(self, token):
         """Sanitize a token either by HTML-encoding or dropping.

-        Unlike HTMLSanitizerMixin.sanitize_token, allowed_attributes can be
-        a dict of {'tag': ['attribute', 'pairs'], 'tag': callable}.
+        Unlike sanitizer.Filter, allowed_attributes can be a dict of {'tag':
+        ['attribute', 'pairs'], 'tag': callable}.

-        Here callable is a function with two arguments of attribute name
-        and value. It should return true of false.
+        Here callable is a function with two arguments of attribute name and
+        value. It should return True or False.

         Also gives the option to strip tags instead of encoding.

         """
-        if (getattr(self, 'wildcard_attributes', None) is None and
-                isinstance(self.allowed_attributes, dict)):
-            self.wildcard_attributes = self.allowed_attributes.get('*', [])
-
-        if token['type'] in (tokenTypes['StartTag'], tokenTypes['EndTag'],
-                             tokenTypes['EmptyTag']):
+        token_type = token['type']
+        if token_type in ['StartTag', 'EndTag', 'EmptyTag']:
             if token['name'] in self.allowed_elements:
-                if 'data' in token:
-                    if isinstance(self.allowed_attributes, dict):
-                        allowed_attributes = self.allowed_attributes.get(
-                            token['name'], [])
-                        if not callable(allowed_attributes):
-                            allowed_attributes += self.wildcard_attributes
-                    else:
-                        allowed_attributes = self.allowed_attributes
-                    attrs = dict([(name, val) for name, val in
-                                  token['data'][::-1]
-                                  if (allowed_attributes(name, val)
-                                      if callable(allowed_attributes)
-                                      else name in allowed_attributes)])
-                    for attr in self.attr_val_is_uri:
-                        if attr not in attrs:
-                            continue
-                        val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
-                                               unescape(attrs[attr])).lower()
-                        # Remove replacement characters from unescaped
-                        # characters.
-                        val_unescaped = val_unescaped.replace("\ufffd", "")
-                        if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped)
-                            and (val_unescaped.split(':')[0] not in
-                                 self.allowed_protocols)):
-                            del attrs[attr]
-                    for attr in self.svg_attr_val_allows_ref:
-                        if attr in attrs:
-                            attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
-                                                 ' ',
-                                                 unescape(attrs[attr]))
-                    if (token['name'] in self.svg_allow_local_href and
-                            'xlink:href' in attrs and
-                            re.search(r'^\s*[^#\s].*', attrs['xlink:href'])):
-                        del attrs['xlink:href']
-                    if 'style' in attrs:
-                        attrs['style'] = self.sanitize_css(attrs['style'])
-                    token['data'] = [(name, val) for name, val in
-                                     attrs.items()]
-                return token
+                return self.allow_token(token)
+
             elif self.strip_disallowed_elements:
                 pass
+
             else:
-                if token['type'] == tokenTypes['EndTag']:
-                    token['data'] = '</{0!s}>'.format(token['name'])
-                elif token['data']:
-                    attr = ' {0!s}="{1!s}"'
-                    attrs = ''.join([attr.format(k, escape(v)) for k, v in
-                                     token['data']])
-                    token['data'] = '<{0!s}{1!s}>'.format(token['name'], attrs)
-                else:
-                    token['data'] = '<{0!s}>'.format(token['name'])
-                if token['selfClosing']:
-                    token['data'] = token['data'][:-1] + '/>'
-                token['type'] = tokenTypes['Characters']
-                del token["name"]
-                return token
-        elif token['type'] == tokenTypes['Comment']:
+                return self.disallowed_token(token)
+
+        elif token_type == 'Comment':
             if not self.strip_html_comments:
                 return token
+
         else:
             return token

-    def sanitize_css(self, style):
-        """HTMLSanitizerMixin.sanitize_css replacement.
-
-        HTMLSanitizerMixin.sanitize_css always whitelists background-*,
-        border-*, margin-*, and padding-*. We only whitelist what's in
-        the whitelist.
+    def allow_token(self, token):
+        if 'data' in token:
+            allowed_attributes = self.allowed_attributes_map.get(token['name'], [])
+            if not callable(allowed_attributes):
+                allowed_attributes += self.wildcard_attributes
+
+            # Drop any attributes that aren't allowed
+            attrs = {}
+            for namespaced_name, val in token['data'].items():
+                namespace, name = namespaced_name
+                # FIXME(willkg): "name" used to be something like "xlink:href"
+                # but it's now (namespace['xlink'], 'href'). we should fix the
+                # name here so it's what the callable would expect.
+                if callable(allowed_attributes):
+                    if allowed_attributes(name, val):
+                        attrs[namespaced_name] = val
+
+                elif name in allowed_attributes:
+                    attrs[namespaced_name] = val
+
+            # Go through all the uri-type attributes
+            for attr in self.attr_val_is_uri:
+                if attr not in attrs:
+                    continue
+                val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
+                                       unescape(attrs[attr])).lower()
+                # Remove replacement characters from unescaped characters.
+                val_unescaped = val_unescaped.replace("\ufffd", "")
+
+                if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped) and
+                        (val_unescaped.split(':')[0] not in self.allowed_protocols)):
+                    # It has a protocol, but it's not allowed--so drop it
+                    del attrs[attr]
+
+            # FIXME(willkg): is this right?
+            for attr in self.svg_attr_val_allows_ref:
+                if attr in attrs:
+                    attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
+                                         ' ',
+                                         unescape(attrs[attr]))
+
+            # FIXME(willkg): is this right?
+            if (token['name'] in self.svg_allow_local_href and
+                    (namespace['xlink'], 'href') in attrs and
+                    re.search(r'^\s*[^#\s].*', attrs[(namespace['xlink'], 'href')])):
+                del attrs[(namespace['xlink'], 'href')]
+
+            # Sanitize css in style attribute
+            if (None, u'style') in attrs:
+                attrs[(None, u'style')] = self.sanitize_css(attrs[(None, u'style')])
+
+            token['data'] = attrs

         return token

-    """
+    def sanitize_css(self, style):
+        """html5lib sanitizer filter replacement to fix issues"""
         # disallow urls
         style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)

         # gauntlet
-        # TODO: Make sure this does what it's meant to - I *think* it wants to
-        # validate style attribute contents.
+
+        # Validate the css in the style tag and if it's not valid, then drop
+        # the whole thing.
         parts = style.split(';')
-        gauntlet = re.compile("""^([-/:,#%.'"\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'"""
-                              """\s*|"[\s\w]+"|\([\d,%\.\s]+\))*$""")
+        gauntlet = re.compile(
+            r"""^([-/:,#%.'"\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'\s*|"[\s\w]+"|\([\d,%\.\s]+\))*$"""
+        )
+
         for part in parts:
             if not gauntlet.match(part):
                 return ''

@@ -125,23 +132,11 @@ def sanitize_css(self, style):
         for prop, value in re.findall('([-\w]+)\s*:\s*([^:;]*)', style):
             if not value:
                 continue
+
             if prop.lower() in self.allowed_css_properties:
                 clean.append(prop + ': ' + value + ';')
+
             elif prop.lower() in self.allowed_svg_properties:
                 clean.append(prop + ': ' + value + ';')

         return ' '.join(clean)
-
-
-class BleachSanitizer(HTMLTokenizer, BleachSanitizerMixin):
-    def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
-                 lowercaseElementName=True, lowercaseAttrName=True, **kwargs):
-        HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
-                               lowercaseElementName, lowercaseAttrName,
-                               **kwargs)
-
-    def __iter__(self):
-        for token in HTMLTokenizer.__iter__(self):
-            token = self.sanitize_token(token)
-            if token:
-                yield token
diff --git a/setup.py b/setup.py
index 6c627a38..39fbb370 100644
--- a/setup.py
+++ b/setup.py
@@ -15,9 +15,8 @@
 install_requires = [
     'six',
-    # 3 9s up to but not including 8 9s, but not 4 9s or 5 9s because they're
-    # busted
-    'html5lib>=0.999,!=0.9999,!=0.99999,<0.99999999',
+    # >= 8 9s because of breaking API change
+    'html5lib>=0.99999999',
 ]
diff --git a/tests/test_basics.py b/tests/test_basics.py
index 07d4d918..8e293ca6 100644
--- a/tests/test_basics.py
+++ b/tests/test_basics.py
@@ -1,5 +1,6 @@
-import six
 import html5lib
+import pytest
+import six

 import bleach

@@ -234,6 +235,7 @@ def test_wildcard_attributes():
     assert bleach.clean(dirty, tags=TAG, attributes=ATTR) in clean


+@pytest.mark.xfail(reason='html5lib >= 0.99999999: changed API')
 def test_sarcasm():
     """Jokes should crash."""
     dirty = 'Yeah right <sarcasm/>'
@@ -242,8 +244,8 @@

 def test_user_defined_protocols_valid():
-    valid_href = '<a href="my_protocol://more_text">allowed href</a>'
-    assert bleach.clean(valid_href, protocols=['my_protocol']) == valid_href
+    valid_href = '<a href="myprotocol://more_text">allowed href</a>'
+    assert bleach.clean(valid_href, protocols=['myprotocol']) == valid_href


 def test_user_defined_protocols_invalid():
diff --git a/tests/test_css.py b/tests/test_css.py
index 0b92f40b..3d224fac 100644
--- a/tests/test_css.py
+++ b/tests/test_css.py
@@ -8,7 +8,7 @@
 clean = partial(clean, tags=['p'], attributes=['style'])


-@pytest.mark.parametrize('data,styles,expected', [
+@pytest.mark.parametrize('data, styles, expected', [
     (
         'font-family: Arial; color: red; float: left; background-color: red;',
         ['color'],
@@ -91,24 +91,40 @@ def test_valid_css():

 def test_style_hang():
     """The sanitizer should not hang on any inline styles"""
-    # TODO: Neaten this up. It's copypasta from MDN/Kuma to repro the bug
-    style = ("""margin-top: 0px; margin-right: 0px; margin-bottom: 1.286em; """
-             """margin-left: 0px; padding-top: 15px; padding-right: 15px; """
-             """padding-bottom: 15px; padding-left: 15px; border-top-width: """
-             """1px; border-right-width: 1px; border-bottom-width: 1px; """
-             """border-left-width: 1px; border-top-style: dotted; """
-             """border-right-style: dotted; border-bottom-style: dotted; """
-             """border-left-style: dotted; border-top-color: rgb(203, 200, """
-             """185); border-right-color: rgb(203, 200, 185); """
-             """border-bottom-color: rgb(203, 200, 185); border-left-color: """
-             """rgb(203, 200, 185); background-image: initial; """
-             """background-attachment: initial; background-origin: initial; """
-             """background-clip: initial; background-color: """
-             """rgb(246, 246, 242); overflow-x: auto; overflow-y: auto; """
-             """font: normal normal normal 100%/normal 'Courier New', """
-             """'Andale Mono', monospace; background-position: initial """
-             """initial; background-repeat: initial initial;""")
-    html = '<p style="{0!s}">Hello world</p>'.format(style)
+    style = [
+        'margin-top: 0px;',
+        'margin-right: 0px;',
+        'margin-bottom: 1.286em;',
+        'margin-left: 0px;',
+        'padding-top: 15px;',
+        'padding-right: 15px;',
+        'padding-bottom: 15px;',
+        'padding-left: 15px;',
+        'border-top-width: 1px;',
+        'border-right-width: 1px;',
+        'border-bottom-width: 1px;',
+        'border-left-width: 1px;',
+        'border-top-style: dotted;',
+        'border-right-style: dotted;',
+        'border-bottom-style: dotted;',
+        'border-left-style: dotted;',
+        'border-top-color: rgb(203, 200, 185);',
+        'border-right-color: rgb(203, 200, 185);',
+        'border-bottom-color: rgb(203, 200, 185);',
+        'border-left-color: rgb(203, 200, 185);',
+        'background-image: initial;',
+        'background-attachment: initial;',
+        'background-origin: initial;',
+        'background-clip: initial;',
+        'background-color: rgb(246, 246, 242);',
+        'overflow-x: auto;',
+        'overflow-y: auto;',
+        # FIXME(willkg): This fails the first regxp gauntlet in sanitize_css.
+        # 'font: italic small-caps bolder condensed 16px/3 cursive;',
+        'background-position: initial initial;',
+        'background-repeat: initial initial;'
+    ]
+    html = '<p style="%s">Hello world</p>' % ' '.join(style)

     styles = [
         'border', 'float', 'overflow', 'min-height', 'vertical-align',
         'white-space',
         'margin', 'margin-left', 'margin-top', 'margin-bottom', 'margin-right',
         'padding', 'padding-left', 'padding-top', 'padding-bottom',
         'padding-right',
         'background',
         'background-color',
         'font', 'font-size', 'font-weight', 'text-align', 'text-transform',
     ]

-    expected = ("""<p style="margin-top: 0px; margin-right: 0px; """
-                """margin-bottom: 1.286em; margin-left: 0px; """
-                """padding-top: 15px; padding-right: 15px; """
-                """padding-bottom: 15px; padding-left: 15px; """
-                """background-color: rgb(246, 246, 242);">"""
-                """Hello world</p>""")
+    expected = (
+        '<p style="'
+        'margin-top: 0px; '
+        'margin-right: 0px; '
+        'margin-bottom: 1.286em; '
+        'margin-left: 0px; '
+        'padding-top: 15px; '
+        'padding-right: 15px; '
+        'padding-bottom: 15px; '
+        'padding-left: 15px; '
+        'background-color: rgb(246, 246, 242);'
+        '">Hello world</p>
' + ) assert clean(html, styles=styles) == expected diff --git a/tests/test_links.py b/tests/test_links.py index ac38ee70..6b7a77eb 100644 --- a/tests/test_links.py +++ b/tests/test_links.py @@ -3,7 +3,7 @@ except ImportError: from urllib import quote_plus -from html5lib.tokenizer import HTMLTokenizer +# FIXME(willkg): from html5lib.tokenizer import HTMLTokenizer import pytest from bleach import linkify, url_re, DEFAULT_CALLBACKS as DC @@ -406,6 +406,7 @@ def test_end_of_clause(): ) +@pytest.mark.xfail(reason='html5lib >= 0.99999999: changed API') def test_sarcasm(): """Jokes should crash.""" assert linkify('Yeah right ') == 'Yeah right <sarcasm/>' @@ -498,6 +499,7 @@ def test_ports(data, expected_data): assert linkify(data) == out.format(*expected_data) +@pytest.mark.xfail(reason='html5lib >= 0.99999999: no access to tokenizer') def test_tokenizer(): """Linkify doesn't always have to sanitize.""" raw = 'test' diff --git a/tests/test_security.py b/tests/test_security.py index 6ffaf449..4fb30207 100644 --- a/tests/test_security.py +++ b/tests/test_security.py @@ -74,7 +74,9 @@ def test_invalid_href_attr(): def test_invalid_filter_attr(): IMG = ['img', ] - IMG_ATTR = {'img': lambda n, v: n == 'src' and v == "http://example.com/"} + IMG_ATTR = { + 'img': lambda n, v: n == 'src' and v == "http://example.com/" + } assert ( clean('', tags=IMG, attributes=IMG_ATTR) == @@ -145,7 +147,11 @@ def test_feed_protocol(): def get_tests(): - """Retrieves regression tests from data/ directory""" + """Retrieves regression tests from data/ directory + + :returns: list of ``(filename, filedata)`` tuples + + """ datadir = os.path.join(os.path.dirname(__file__), 'data') tests = [ os.path.join(datadir, fn) for fn in os.listdir(datadir) @@ -153,19 +159,23 @@ def get_tests(): ] # Sort numerically which makes it easier to iterate through them tests.sort(key=lambda x: int(os.path.basename(x).split('.', 1)[0])) - return tests + + testcases = [ + (fn, open(fn, 'r').read()) for fn in tests + ] + + return testcases -@pytest.mark.parametrize('fn', get_tests()) -def test_regressions(fn): +@pytest.mark.parametrize('fn, text', get_tests()) +def test_regressions(fn, text): """Regression tests for clean so we can see if there are issues""" - s = open(fn, 'r').read() expected = six.text_type(open(fn + '.out', 'r').read()) # NOTE(willkg): This strips input and expected which makes it easier to # maintain the files. If there comes a time when the input needs whitespace # at the beginning or end, then we'll have to figure out something else. - assert clean(s.strip()) == expected.strip() + assert clean(text.strip()) == expected.strip() def test_regression_manually(): diff --git a/tox.ini b/tox.ini index 09ed488f..53c175c9 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ # and then run "tox" from this directory. 
[tox] -envlist = py{27,33,34,35,36}-html5lib{999,999999,9999999},pypy-html5lib9999999 +envlist = py{27,33,34,35,36}-html5lib{99999999,999999999},pypy-html5lib99999999 [testenv] basepython = @@ -15,8 +15,7 @@ basepython = py36: python3.6 deps = -rrequirements.txt - html5lib999: html5lib==0.999 - html5lib999999: html5lib==0.999999 - html5lib9999999: html5lib==0.9999999 + html5lib99999999: html5lib==0.99999999 + html5lib999999999: html5lib==0.999999999 commands = py.test {posargs:-v} From 3db588a5ed8b43da8324547b23ad5670b459ee9e Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Tue, 21 Feb 2017 11:45:01 -0500 Subject: [PATCH 03/12] Update security regression tests I also moved test_nasty to the regression tests because it's just like those. --- tests/data/14.test.out | 2 +- tests/data/15.test.out | 2 +- tests/data/16.test.out | 2 +- tests/data/17.test.out | 2 +- tests/data/3.test.out | 2 +- tests/data/4.test | 1 + tests/data/4.test.out | 1 + tests/data/5.test.out | 2 +- tests/data/9.test.out | 2 +- tests/test_security.py | 35 +++++++++++++++++------------------ 10 files changed, 26 insertions(+), 25 deletions(-) create mode 100644 tests/data/4.test create mode 100644 tests/data/4.test.out diff --git a/tests/data/14.test.out b/tests/data/14.test.out index 16445739..8e5ff754 100644 --- a/tests/data/14.test.out +++ b/tests/data/14.test.out @@ -1 +1 @@ -<imgsrc=&#106;&#97;&#118;&#97;&<wbr>#115;crip&<wbr>#116;:a \ No newline at end of file +<imgsrc=&#106;&#97;&#118;&#97;&<wbr>#115;crip&<wbr></wbr>#116;:a</imgsrc=&#106;&#97;&#118;&#97;&<wbr> \ No newline at end of file diff --git a/tests/data/15.test.out b/tests/data/15.test.out index 334f916b..8b90245f 100644 --- a/tests/data/15.test.out +++ b/tests/data/15.test.out @@ -1 +1 @@ -le&<wbr>#114;t('XS<wbr>;S')> \ No newline at end of file +le&<wbr></wbr>#114;t('XS<wbr></wbr>;S')> \ No newline at end of file diff --git a/tests/data/16.test.out b/tests/data/16.test.out index 9c6ca965..1ecb332b 100644 --- a/tests/data/16.test.out +++ b/tests/data/16.test.out @@ -1 +1 @@ -<imgsrc=&#0000106&#0000097&<wbr>#0000118as&<wbr>#0000099ri&<wbr>#0000112t:&<wbr>#0000097le&<wbr>#0000114t(&<wbr>#0000039XS&<wbr>#0000083')> \ No newline at end of file +<imgsrc=&#0000106&#0000097&<wbr>#0000118as&<wbr></wbr>#0000099ri&<wbr></wbr>#0000112t:&<wbr></wbr>#0000097le&<wbr></wbr>#0000114t(&<wbr></wbr>#0000039XS&<wbr></wbr>#0000083')></imgsrc=&#0000106&#0000097&<wbr> \ No newline at end of file diff --git a/tests/data/17.test.out b/tests/data/17.test.out index dabfaa2d..ae928a99 100644 --- a/tests/data/17.test.out +++ b/tests/data/17.test.out @@ -1 +1 @@ -<imgsrc=&#x6a&#x61&#x76&#x61&#x73&<wbr>#x63ript:&<wbr>#x61lert(&<wbr>#x27XSS')> \ No newline at end of file +<imgsrc=&#x6a&#x61&#x76&#x61&#x73&<wbr>#x63ript:&<wbr></wbr>#x61lert(&<wbr></wbr>#x27XSS')></imgsrc=&#x6a&#x61&#x76&#x61&#x73&<wbr> \ No newline at end of file diff --git a/tests/data/3.test.out b/tests/data/3.test.out index 20c3d0d4..f0d69629 100644 --- a/tests/data/3.test.out +++ b/tests/data/3.test.out @@ -1 +1 @@ ->"'><img%20src%3d%26%23x6a;%26%23x61;%26%23x76;%26%23x61;%26%23x73;%26%23x63;%26%23x72;%26%23x69;%26%23x70;%26%23x74;%26%23x3a;alert(%26quot;%26%23x20;xss%26%23x20;test%26%23x20;successful%26quot;)> \ No newline at end of file 
+>"'><img%20src%3d%26%23x6a;%26%23x61;%26%23x76;%26%23x61;%26%23x73;%26%23x63;%26%23x72;%26%23x69;%26%23x70;%26%23x74;%26%23x3a;alert(%26quot;%26%23x20;xss%26%23x20;test%26%23x20;successful%26quot;)></img%20src%3d%26%23x6a;%26%23x61;%26%23x76;%26%23x61;%26%23x73;%26%23x63;%26%23x72;%26%23x69;%26%23x70;%26%23x74;%26%23x3a;alert(%26quot;%26%23x20;xss%26%23x20;test%26%23x20;successful%26quot;)> \ No newline at end of file diff --git a/tests/data/4.test b/tests/data/4.test new file mode 100644 index 00000000..c4cf51cd --- /dev/null +++ b/tests/data/4.test @@ -0,0 +1 @@ +ipt type="text/javascript">alert("foo");script> diff --git a/tests/data/4.test.out b/tests/data/4.test.out new file mode 100644 index 00000000..88ea86b2 --- /dev/null +++ b/tests/data/4.test.out @@ -0,0 +1 @@ +<scr<script>ipt type="text/javascript">alert("foo");script<del></del>></scr<script> diff --git a/tests/data/5.test.out b/tests/data/5.test.out index 1782eafb..0d88a88a 100644 --- a/tests/data/5.test.out +++ b/tests/data/5.test.out @@ -1 +1 @@ ->%22%27><img%20src%3d%22javascript:alert(%27%20xss%27)%22> \ No newline at end of file +>%22%27><img%20src%3d%22javascript:alert(%27%20xss%27)%22></img%20src%3d%22javascript:alert(%27%20xss%27)%22> \ No newline at end of file diff --git a/tests/data/9.test.out b/tests/data/9.test.out index 3a4d9b6c..5c5eb6ba 100644 --- a/tests/data/9.test.out +++ b/tests/data/9.test.out @@ -1 +1 @@ -'';!--"<xss>=&{()} \ No newline at end of file +'';!--"<xss>=&{()}</xss> \ No newline at end of file diff --git a/tests/test_security.py b/tests/test_security.py index 4fb30207..356b1292 100644 --- a/tests/test_security.py +++ b/tests/test_security.py @@ -22,7 +22,7 @@ def test_nested_script_tag(): def test_nested_script_tag_r(): assert ( clean('>evil()>') == - '<script<script>>evil()</script<>>' + '<script<script>>evil()></script<script>' ) @@ -90,8 +90,11 @@ def test_invalid_filter_attr(): def test_invalid_tag_char(): assert ( - clean('') == - '<script xss="" src="http://xx.com/xss.js"></script>' + clean('') in + [ + '<script src="http://xx.com/xss.js" xss=""></script>', + '<script xss="" src="http://xx.com/xss.js"></script>' + ] ) assert ( clean('') == @@ -102,15 +105,21 @@ def test_invalid_tag_char(): def test_unclosed_tag(): assert ( clean('ipt type="text/javascript">alert("foo");script>') - expect = ('<scr<script></script>ipt type="text/javascript"' - '>alert("foo");</script>script<del></del>' - '>') - assert clean(test) == expect - - def test_poster_attribute(): """Poster attributes should not allow javascript.""" tags = ['video'] From 02facd34b02cd9d9e16b547e07b07ddd3db40455 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Tue, 21 Feb 2017 13:23:00 -0500 Subject: [PATCH 04/12] Minor code cleanup * address FIXMEs * minor cleanup to code and comments --- bleach/__init__.py | 8 ++++---- bleach/sanitizer.py | 14 ++++++-------- tests/test_basics.py | 15 ++++++++++----- tests/test_css.py | 6 +++--- tests/test_links.py | 9 --------- tests/test_security.py | 2 +- 6 files changed, 24 insertions(+), 30 deletions(-) diff --git a/bleach/__init__.py b/bleach/__init__.py index c54dc72b..d1a82cde 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -120,9 +120,11 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, :arg strip: whether or not to strip disallowed elements :arg strip_comments: whether or not to strip HTML comments + :returns: cleaned text as unicode + """ if not text: - return '' + return u'' text = force_unicode(text) @@ -152,7 +154,6 @@ def clean(text, 
tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, - # FIXME(willkg): parse_email=False, tokenizer=HTMLSanitizer): parse_email=False): """Convert URL-like strings in an HTML fragment to links @@ -170,9 +171,8 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, text = force_unicode(text) if not text: - return '' + return u'' - # FIXME(willkg): parser = html5lib.HTMLParser(tokenizer=tokenizer) parser = html5lib.HTMLParser() forest = parser.parseFragment(text) diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index fb502b85..0701def2 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -63,9 +63,7 @@ def allow_token(self, token): attrs = {} for namespaced_name, val in token['data'].items(): namespace, name = namespaced_name - # FIXME(willkg): "name" used to be something like "xlink:href" - # but it's now (namespace['xlink'], 'href'). we should fix the - # name here so it's what the callable would expect. + if callable(allowed_attributes): if allowed_attributes(name, val): attrs[namespaced_name] = val @@ -73,12 +71,14 @@ def allow_token(self, token): elif name in allowed_attributes: attrs[namespaced_name] = val - # Go through all the uri-type attributes + # Handle attributes that have uri values for attr in self.attr_val_is_uri: if attr not in attrs: continue + val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', unescape(attrs[attr])).lower() + # Remove replacement characters from unescaped characters. val_unescaped = val_unescaped.replace("\ufffd", "") @@ -87,17 +87,15 @@ def allow_token(self, token): # It has a protocol, but it's not allowed--so drop it del attrs[attr] - # FIXME(willkg): is this right? for attr in self.svg_attr_val_allows_ref: if attr in attrs: attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)', ' ', unescape(attrs[attr])) - # FIXME(willkg): is this right? if (token['name'] in self.svg_allow_local_href and - (namespace['xlink'], 'href') in attrs and - re.search(r'^\s*[^#\s].*', attrs[(namespace['xlink'], 'href')])): + (namespaces['xlink'], 'href') in attrs and + re.search(r'^\s*[^#\s].*', attrs[(namespaces['xlink'], 'href')])): del attrs[(namespace['xlink'], 'href')] # Sanitize css in style attribute diff --git a/tests/test_basics.py b/tests/test_basics.py index 8e293ca6..49148592 100644 --- a/tests/test_basics.py +++ b/tests/test_basics.py @@ -71,12 +71,17 @@ def test_function_arguments(): def test_named_arguments(): ATTRS = {'a': ['rel', 'href']} - s = ('xx.com', - 'xx.com') - assert bleach.clean(s[0]) == 'xx.com' - # FIXME: This might not be needed if attribute order is stable now. - assert bleach.clean(s[0], attributes=ATTRS) in s + text = 'xx.com' + + assert bleach.clean(text) == 'xx.com' + assert ( + bleach.clean(text, attributes=ATTRS) in + [ + 'xx.com', + 'xx.com' + ] + ) def test_disallowed_html(): diff --git a/tests/test_css.py b/tests/test_css.py index 3d224fac..d8880d78 100644 --- a/tests/test_css.py +++ b/tests/test_css.py @@ -119,8 +119,7 @@ def test_style_hang(): 'background-color: rgb(246, 246, 242);', 'overflow-x: auto;', 'overflow-y: auto;', - # FIXME(willkg): This fails the first regxp gauntlet in sanitize_css. 
-        # 'font: italic small-caps bolder condensed 16px/3 cursive;',
+        'font: italic small-caps bolder condensed 16px/3 cursive;',
         'background-position: initial initial;',
         'background-repeat: initial initial;'
     ]
@@ -146,7 +145,8 @@ def test_style_hang():
         'padding-right: 15px; '
         'padding-bottom: 15px; '
         'padding-left: 15px; '
-        'background-color: rgb(246, 246, 242);'
+        'background-color: rgb(246, 246, 242); '
+        'font: italic small-caps bolder condensed 16px/3 cursive;'
         '">Hello world</p>
' ) diff --git a/tests/test_links.py b/tests/test_links.py index 6b7a77eb..53d60e5c 100644 --- a/tests/test_links.py +++ b/tests/test_links.py @@ -3,7 +3,6 @@ except ImportError: from urllib import quote_plus -# FIXME(willkg): from html5lib.tokenizer import HTMLTokenizer import pytest from bleach import linkify, url_re, DEFAULT_CALLBACKS as DC @@ -499,14 +498,6 @@ def test_ports(data, expected_data): assert linkify(data) == out.format(*expected_data) -@pytest.mark.xfail(reason='html5lib >= 0.99999999: no access to tokenizer') -def test_tokenizer(): - """Linkify doesn't always have to sanitize.""" - raw = 'test' - assert linkify(raw) == 'test<x></x>' - assert linkify(raw, tokenizer=HTMLTokenizer) == raw - - def test_ignore_bad_protocols(): assert ( linkify('foohttp://bar') == diff --git a/tests/test_security.py b/tests/test_security.py index 356b1292..2aac0200 100644 --- a/tests/test_security.py +++ b/tests/test_security.py @@ -75,7 +75,7 @@ def test_invalid_href_attr(): def test_invalid_filter_attr(): IMG = ['img', ] IMG_ATTR = { - 'img': lambda n, v: n == 'src' and v == "http://example.com/" + 'img': lambda attr, val: attr == 'src' and val == "http://example.com/" } assert ( From 10852231012ae3eece9c0a2af5c6c7c8e2e5212f Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Tue, 21 Feb 2017 15:37:00 -0500 Subject: [PATCH 05/12] Update CHANGES and docs --- CHANGES | 19 +++++++++++++++---- README.rst | 2 -- docs/clean.rst | 5 ++++- docs/conf.py | 3 +-- docs/dev.rst | 4 ++-- docs/goals.rst | 4 ++-- docs/linkify.rst | 17 +---------------- 7 files changed, 25 insertions(+), 29 deletions(-) diff --git a/CHANGES b/CHANGES index ec3bc992..e8e49a8d 100644 --- a/CHANGES +++ b/CHANGES @@ -8,10 +8,19 @@ Version 2.0 (in development) - Removed support for Python 2.6. #206 - Removed support for Python 3.2. #224 +- Bleach no longer supports html5lib < 0.99999999 (8 9s). + + This version represents a rewrite to use the new sanitizing API since + the old one was dropped in html5lib 0.99999999 (8 9s). + +- linkify no longer accepts a tokenizer argument. +- clean output is different than in previous versions; particularly this version + will add end tags even if the tag will be escaped. **Changes** -- Added testing for Python 3.6. +- Supports Python 3.6. +- Supports html5lib >= 0.99999999 (8 9s). Version 1.5 (November 4th, 2016) @@ -20,9 +29,11 @@ Version 1.5 (November 4th, 2016) **Backwards incompatible changes** - clean: The list of ``ALLOWED_PROTOCOLS`` now defaults to http, https and - mailto. Previously it was a long list of protocols something like ed2k, ftp, - http, https, irc, mailto, news, gopher, nntp, telnet, webcal, xmpp, callto, - feed, urn, aim, rsync, tag, ssh, sftp, rtsp, afs, data. #149 + mailto. + + Previously it was a long list of protocols something like ed2k, ftp, http, + https, irc, mailto, news, gopher, nntp, telnet, webcal, xmpp, callto, feed, + urn, aim, rsync, tag, ssh, sftp, rtsp, afs, data. #149 **Changes** diff --git a/README.rst b/README.rst index 8f9ce05d..3bd87573 100644 --- a/README.rst +++ b/README.rst @@ -101,5 +101,3 @@ The simplest way to use Bleach is: .. _GitHub: https://github.com/mozilla/bleach .. _ReadTheDocs: https://bleach.readthedocs.io/ .. 
_PyPI: http://pypi.python.org/pypi/bleach - - diff --git a/docs/clean.rst b/docs/clean.rst index ebd82055..a988a81a 100644 --- a/docs/clean.rst +++ b/docs/clean.rst @@ -162,7 +162,7 @@ For example, this sets allowed protocols to http, https and smb: u'allowed protocol' -This adds smb to the bleach-specified set of allowed protocols: +This adds smb to the Bleach-specified set of allowed protocols: .. doctest:: @@ -187,6 +187,7 @@ whitelist and invalid markup. For example: .. doctest:: >>> import bleach + >>> bleach.clean('is not allowed') u'<span>is not allowed</span>' >>> bleach.clean('is not allowed', tags=['b']) @@ -199,6 +200,7 @@ If you would rather Bleach stripped this markup entirely, you can pass .. doctest:: >>> import bleach + >>> bleach.clean('is not allowed', strip=True) u'is not allowed' >>> bleach.clean('is not allowed', tags=['b'], strip=True) @@ -214,6 +216,7 @@ By default, Bleach will strip out HTML comments. To disable this behavior, set .. doctest:: >>> import bleach + >>> html = 'my html' >>> bleach.clean(html) diff --git a/docs/conf.py b/docs/conf.py index 00b9c239..e186c827 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -27,8 +27,7 @@ # Add any Sphinx extension module names here, as strings. They can be extensions # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. -extensions = ['sphinx.ext.autodoc', 'sphinx.ext.pngmath', 'sphinx.ext.viewcode', - 'sphinx.ext.doctest'] +extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode', 'sphinx.ext.doctest'] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] diff --git a/docs/dev.rst b/docs/dev.rst index 027a0a76..02f8d44a 100644 --- a/docs/dev.rst +++ b/docs/dev.rst @@ -5,7 +5,7 @@ Bleach development Docs ==== -Docs are in ``docs/``. We use Sphinx. Docs are pushed to readthedocs +Docs are in ``docs/``. We use Sphinx. Docs are pushed to ReadTheDocs via a GitHub webhook. @@ -16,7 +16,7 @@ Run:: $ tox -That'll run bleach tests in all the supported Python environments. Note +That'll run Bleach tests in all the supported Python environments. Note that you need the necessary Python binaries for them all to be tested. Tests are run in Travis CI via a GitHub webhook. diff --git a/docs/goals.rst b/docs/goals.rst index 01f63a94..632c222c 100644 --- a/docs/goals.rst +++ b/docs/goals.rst @@ -91,10 +91,10 @@ Make malicious content look pretty or sane ------------------------------------------ Malicious content is designed to be malicious. Making it safe is a design goal -of bleach. Making it pretty or sane-looking is not. +of Bleach. Making it pretty or sane-looking is not. If you want your malicious content to look pretty, you should pass it through -bleach to make it safe and then do your own transform afterwards. +Bleach to make it safe and then do your own transform afterwards. Allow arbitrary styling diff --git a/docs/linkify.rst b/docs/linkify.rst index b7449c34..705000c2 100644 --- a/docs/linkify.rst +++ b/docs/linkify.rst @@ -9,7 +9,7 @@ control how and when those links are rendered:: def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, - parse_email=False, tokenizer=HTMLSanitizer): + parse_email=False): """Convert URL-like strings in an HTML fragment to links. ``linkify()`` works by building a document tree, so it's guaranteed never to do @@ -194,19 +194,4 @@ they are newly created or already in the text, so be careful when writing callbacks that may need to behave differently if the protocol is ``mailto:``. 
-``tokenizer`` -============= - -``linkify()`` uses the ``html5lib.sanitizer.HTMLSanitizer`` tokenizer by -default. This has the effect of scrubbing some tags and attributes. To use a -more lenient, or totally different, tokenizer, you can specify the tokenizer -class here. (See the implementation of :ref:`clean() ` for an -example of building a custom tokenizer.) - -:: - - from html5lib.tokenizer import HTMLTokenizer - linked_text = linkify(text, tokenizer=HTMLTokenizer) - - .. _Crate: https://crate.io/ From 066631af96ee9c16ddcf9d132bd8537a4af17da6 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Tue, 21 Feb 2017 20:49:21 -0500 Subject: [PATCH 06/12] Update .travis.yml --- .travis.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 4e66cf1d..318dfa7d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,9 +11,8 @@ python: - "3.6" - "pypy" env: -- HTML5LIB=0.999 # 3 -- HTML5LIB=0.999999 # 6 -- HTML5LIB=0.9999999 # 7 +- HTML5LIB=0.99999999 # 8 +- HTML5LIB=0.999999999 # 9 install: - pip install -r requirements.txt - pip install html5lib==$HTML5LIB From 30772dd20f16a8ec75f7c1ddb7b76c9ff6fd97d2 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Tue, 21 Feb 2017 20:57:13 -0500 Subject: [PATCH 07/12] Another .travis.yml fix --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 318dfa7d..a5db65b1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,6 +14,7 @@ env: - HTML5LIB=0.99999999 # 8 - HTML5LIB=0.999999999 # 9 install: +- pip install -U pip - pip install -r requirements.txt - pip install html5lib==$HTML5LIB script: From 85cc802584f4290556b14e50384c2ed6bb6959b6 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Tue, 21 Feb 2017 21:05:59 -0500 Subject: [PATCH 08/12] One more .travis.yml fix --- .travis.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index a5db65b1..14015378 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,9 +14,11 @@ env: - HTML5LIB=0.99999999 # 8 - HTML5LIB=0.999999999 # 9 install: -- pip install -U pip -- pip install -r requirements.txt -- pip install html5lib==$HTML5LIB + # html5lib 0.99999999 (8 9s) requires at least setuptools 18.5 + - pip install -U pip setuptools>=18.5 + - pip install -r requirements.txt + # stomp on html5lib install with the specified one + - pip install html5lib==$HTML5LIB script: - py.test - flake8 bleach/ From 4cca43b5f41a4cc157fa2e6f5dd5185f70cb7a6c Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Tue, 21 Feb 2017 21:10:40 -0500 Subject: [PATCH 09/12] Fix flake8 issues --- bleach/sanitizer.py | 4 ++-- setup.cfg | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index 0701def2..68438e9a 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -1,6 +1,6 @@ from __future__ import unicode_literals import re -from xml.sax.saxutils import escape, unescape +from xml.sax.saxutils import unescape from html5lib.constants import namespaces from html5lib.filters import sanitizer @@ -83,7 +83,7 @@ def allow_token(self, token): val_unescaped = val_unescaped.replace("\ufffd", "") if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped) and - (val_unescaped.split(':')[0] not in self.allowed_protocols)): + (val_unescaped.split(':')[0] not in self.allowed_protocols)): # It has a protocol, but it's not allowed--so drop it del attrs[attr] diff --git a/setup.cfg b/setup.cfg index f3a416e4..950364a7 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,6 
+3,7 @@ test=pytest [flake8] ignore = E731,W503 +max-line-length = 100 [wheel] universal=1 From 49f35dd27317019e47febed1c0c507177297a8af Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Fri, 24 Feb 2017 12:21:38 -0500 Subject: [PATCH 10/12] Add tests, fix alphabetizing, code cleanup * this adds some missing tests to add more coverage * html5lib 0.99999999 and 0.999999999 have an alphabeticalattributes filter that doesn't work when the attributes set has some items with a namespace and some without in Python 3; this rolls alphabetizing into the Bleach sanitizer * remove some dead code and clean some other code up --- bleach/__init__.py | 17 ++++----- bleach/sanitizer.py | 43 +++++++++++++++++----- tests/test_basics.py | 86 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 129 insertions(+), 17 deletions(-) diff --git a/bleach/__init__.py b/bleach/__init__.py index d1a82cde..ae96a925 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -61,8 +61,6 @@ # Make sure that .com doesn't get matched by .co first TLDS.reverse() -PROTOCOLS = allowed_protocols - url_re = re.compile( r"""\(* # Match any opening parentheses. \b(?"]*)? # /path/zz (excluding "unsafe" chars from RFC 1738, # except for # and ~, which happen in practice) - """.format('|'.join(PROTOCOLS), '|'.join(TLDS)), + """.format('|'.join(allowed_protocols), '|'.join(TLDS)), re.IGNORECASE | re.VERBOSE | re.UNICODE) proto_re = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE) @@ -87,8 +85,6 @@ """, re.IGNORECASE | re.MULTILINE | re.VERBOSE) -NODE_TEXT = 4 # The numeric ID of a text node in simpletree. - ETREE_TAG = lambda x: "".join(['{http://www.w3.org/1999/xhtml}', x]) # a simple routine that returns the tag name with the namespace prefix # as returned by etree's Element.tag attribute @@ -147,8 +143,13 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, ) s = HTMLSerializer( quote_attr_values='always', - alphabetical_attributes=True, - omit_optional_tags=False + omit_optional_tags=False, + + # Bleach has its own sanitizer, so don't use the html5lib one + sanitize=False, + + # Bleach sanitizer alphabetizes already, so don't use the html5lib one + alphabetical_attributes=False, ) return s.render(filtered) @@ -176,7 +177,7 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, parser = html5lib.HTMLParser() forest = parser.parseFragment(text) - _seen = set([]) + _seen = set() def replace_nodes(tree, new_frag, node, index=0): """Doesn't really replace nodes, but inserts the nodes contained in diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index 68438e9a..c12b8d24 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -1,4 +1,5 @@ from __future__ import unicode_literals +from collections import OrderedDict import re from xml.sax.saxutils import unescape @@ -6,6 +7,19 @@ from html5lib.filters import sanitizer +def _attr_key(attr): + """Returns appropriate key for sorting attribute names + + Attribute names are a tuple of ``(namespace, name)`` where namespace can be + ``None`` or a string. These can't be compared in Python 3, so we conver the + ``None`` to an empty string. 
+ + """ + key = (attr[0][0] or ''), attr[0][1] + print(key) + return key + + class BleachSanitizerFilter(sanitizer.Filter): def __init__(self, source, allowed_attributes_map, strip_disallowed_elements=False, strip_html_comments=True, @@ -87,22 +101,33 @@ def allow_token(self, token): # It has a protocol, but it's not allowed--so drop it del attrs[attr] + # Drop values in svg attrs with non-local IRIs for attr in self.svg_attr_val_allows_ref: if attr in attrs: - attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)', - ' ', - unescape(attrs[attr])) - - if (token['name'] in self.svg_allow_local_href and - (namespaces['xlink'], 'href') in attrs and - re.search(r'^\s*[^#\s].*', attrs[(namespaces['xlink'], 'href')])): - del attrs[(namespace['xlink'], 'href')] + new_val = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)', + ' ', + unescape(attrs[attr])) + new_val = new_val.strip() + if not new_val: + del attrs[attr] + else: + attrs[attr] = new_val + + # Drop href and xlink:href attr for svg elements with non-local IRIs + if (None, token['name']) in self.svg_allow_local_href: + for href_attr in [(None, 'href'), (namespaces['xlink'], 'href')]: + if href_attr in attrs: + if re.search(r'^\s*[^#\s]', attrs[href_attr]): + del attrs[href_attr] # Sanitize css in style attribute if (None, u'style') in attrs: attrs[(None, u'style')] = self.sanitize_css(attrs[(None, u'style')]) - token['data'] = attrs + # Alphabetize attributes + token['data'] = OrderedDict( + [(key, val) for key, val in sorted(attrs.items(), key=_attr_key)] + ) return token def sanitize_css(self, style): diff --git a/tests/test_basics.py b/tests/test_basics.py index 49148592..bae1506e 100644 --- a/tests/test_basics.py +++ b/tests/test_basics.py @@ -240,6 +240,92 @@ def test_wildcard_attributes(): assert bleach.clean(dirty, tags=TAG, attributes=ATTR) in clean +def test_callable_attributes(): + """Verify callable attributes work and get correct arg values""" + def img_test(attr, val): + return attr == 'src' and val.startswith('https') + + ATTR = { + 'img': img_test, + } + TAGS = ['img'] + + assert ( + bleach.clean('foo blah baz', tags=TAGS, attributes=ATTR) == + u'foo baz' + ) + assert ( + bleach.clean('foo blah baz', tags=TAGS, attributes=ATTR) == + u'foo baz' + ) + + +def test_svg_attr_val_allows_ref(): + """Unescape values in svg attrs that allow url references""" + # Local IRI, so keep it + text = '' + TAGS = ['svg', 'rect'] + ATTRS = { + 'rect': ['fill'], + } + assert ( + bleach.clean(text, tags=TAGS, attributes=ATTRS) == + '' + ) + + # Non-local IRI, so drop it + text = '' + TAGS = ['svg', 'rect'] + ATTRS = { + 'rect': ['fill'], + } + assert ( + bleach.clean(text, tags=TAGS, attributes=ATTRS) == + '' + ) + + +@pytest.mark.parametrize('text, expected', [ + ( + '', + '' + ), + ( + '', + # NOTE(willkg): Bug in html5lib serializer drops the xlink part + '' + ), +]) +def test_svg_allow_local_href(text, expected): + """Keep local hrefs for svg elements""" + TAGS = ['svg', 'pattern'] + ATTRS = { + 'pattern': ['id', 'href'], + } + assert bleach.clean(text, tags=TAGS, attributes=ATTRS) == expected + + +@pytest.mark.parametrize('text, expected', [ + ( + '', + '' + ), + ( + '', + '' + ), +]) +def test_svg_allow_local_href_nonlocal(text, expected): + """Drop non-local hrefs for svg elements""" + TAGS = ['svg', 'pattern'] + ATTRS = { + 'pattern': ['id', 'href'], + } + assert bleach.clean(text, tags=TAGS, attributes=ATTRS) == expected + + + + @pytest.mark.xfail(reason='html5lib >= 0.99999999: changed API') def test_sarcasm(): """Jokes should crash.""" From 
70d96e390c1525d8508436d8582daa49357fdf3e Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Fri, 24 Feb 2017 12:49:12 -0500 Subject: [PATCH 11/12] Alphabetize before escaping disallowed tokens Making this change means the output is stable since attributes will always happen in the same order. Seems like maybe it's not a great idea, but stable seems good. If it turns out this is terrible, someone will complain with a compelling use case and we can undo it. I also went through and removed a bunch of the "the output is either this or that" in the tests. --- bleach/sanitizer.py | 6 ++++++ tests/test_basics.py | 31 +++++++++++++------------------ tests/test_unicode.py | 7 ++----- 3 files changed, 21 insertions(+), 23 deletions(-) diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index c12b8d24..789d89e6 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -58,6 +58,12 @@ def sanitize_token(self, token): pass else: + if 'data' in token: + # Alphabetize the attributes before calling .disallowed_token() + # so that the resulting string is stable + token['data'] = OrderedDict( + [(key, val) for key, val in sorted(token['data'].items(), key=_attr_key)] + ) return self.disallowed_token(token) elif token_type == 'Comment': diff --git a/tests/test_basics.py b/tests/test_basics.py index bae1506e..c42ccc62 100644 --- a/tests/test_basics.py +++ b/tests/test_basics.py @@ -76,11 +76,8 @@ def test_named_arguments(): assert bleach.clean(text) == 'xx.com' assert ( - bleach.clean(text, attributes=ATTRS) in - [ - 'xx.com', - 'xx.com' - ] + bleach.clean(text, attributes=ATTRS) == + 'xx.com' ) @@ -199,25 +196,22 @@ def test_idempotent(): clean = bleach.clean(dirty) assert bleach.clean(clean) == clean - possible_outs = ( - 'invalid & < extra http://link.com', + linked = bleach.linkify(dirty) + assert ( + bleach.linkify(linked) == 'invalid & < extra http://link.com' ) - linked = bleach.linkify(dirty) - assert bleach.linkify(linked) in possible_outs def test_rel_already_there(): """Make sure rel attribute is updated not replaced""" linked = ('Click ' 'here.') - link_good = (('Click ' - 'here.'), - ('Click ' - 'here.')) - assert bleach.linkify(linked) in link_good - assert bleach.linkify(link_good[0]) in link_good + link_good = 'Click here.' 
+ + assert bleach.linkify(linked) == link_good + assert bleach.linkify(link_good) == link_good def test_lowercase_html(): @@ -235,9 +229,10 @@ def test_wildcard_attributes(): TAG = ['img', 'em'] dirty = ('both can have ' '') - clean = ('both can have ', - 'both can have ') - assert bleach.clean(dirty, tags=TAG, attributes=ATTR) in clean + assert ( + bleach.clean(dirty, tags=TAG, attributes=ATTR) == + 'both can have ' + ) def test_callable_attributes(): diff --git a/tests/test_unicode.py b/tests/test_unicode.py index b8b670e8..08ab3f4e 100644 --- a/tests/test_unicode.py +++ b/tests/test_unicode.py @@ -27,11 +27,8 @@ def test_mixed(): def test_mixed_linkify(): assert ( - linkify('Домашняя http://example.com ヘルプとチュートリアル') in - ( - 'Домашняя http://example.com ヘルプとチュートリアル', - 'Домашняя http://example.com ヘルプとチュートリアル' - ) + linkify('Домашняя http://example.com ヘルプとチュートリアル') == + 'Домашняя http://example.com ヘルプとチュートリアル' ) From 86dea93c17d96a237acd2543264ba27d83f15d79 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Fri, 24 Feb 2017 13:09:20 -0500 Subject: [PATCH 12/12] Cosmetic fix for readability --- bleach/__init__.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/bleach/__init__.py b/bleach/__init__.py index ae96a925..a645ac7e 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -130,16 +130,18 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, walker = html5lib.getTreeWalker('etree') filtered = BleachSanitizerFilter( source=walker(dom), + + # Bleach-sanitizer-specific things allowed_attributes_map=attributes, + strip_disallowed_elements=strip, + strip_html_comments=strip_comments, + # html5lib-sanitizer things allowed_elements=tags, allowed_css_properties=styles, allowed_protocols=protocols, - allowed_svg_properties=[], - strip_disallowed_elements=strip, - strip_html_comments=strip_comments ) s = HTMLSerializer( quote_attr_values='always',