From ef0c48765c160f5724d915770060a095cbc69dda Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Fri, 17 Feb 2017 14:55:55 -0500
Subject: [PATCH 01/12] Update dev requirements

---
 requirements.txt | 7 ++++---
 setup.py         | 2 +-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index a026c46e..6ec6bd90 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,12 +1,13 @@
 -e .

 # Requirements to run the test suite:
-pytest==3.0.3
-flake8==3.0.4
+pytest==3.0.6
+pytest-wholenodeid
+flake8==3.3.0
 tox==2.4.1

 # Requirements for building docs
-Sphinx==1.4.8
+Sphinx==1.5.2

 # Requirements for updating package
 twine==1.8.1
diff --git a/setup.py b/setup.py
index 167658ff..6c627a38 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@
 setup_requires.append('pytest-runner>=2.0,<3dev')

 tests_require = [
-    'pytest==3.0.3',
+    'pytest>=3.0.0',
 ]

 install_requires = [

From 567eebb53e0716f6d267fa8951c548d63bf78a70 Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene
Date: Mon, 23 Jan 2017 17:02:03 -0500
Subject: [PATCH 02/12] Overhaul bleach to use html5lib >= 0.99999999

This is a bleach rewrite to use the new sanitizer API in html5lib
0.99999999. The new API happens as a filter when emitting the tree rather
than in the tokenizer. Because of that, the output of .clean() and
.linkify() is different than in previous versions of bleach.
---
 bleach/__init__.py     |  53 +++++++-----
 bleach/sanitizer.py    | 189 ++++++++++++++++++++---------------------
 setup.py               |   5 +-
 tests/test_basics.py   |   8 +-
 tests/test_css.py      |  74 ++++++++++------
 tests/test_links.py    |   4 +-
 tests/test_security.py |  24 ++++--
 tox.ini                |   7 +-
 8 files changed, 204 insertions(+), 160 deletions(-)

diff --git a/bleach/__init__.py b/bleach/__init__.py
index 09dad637..c54dc72b 100644
--- a/bleach/__init__.py
+++ b/bleach/__init__.py
@@ -5,13 +5,14 @@
 import re

 import html5lib
-from html5lib.sanitizer import HTMLSanitizer
-from html5lib.serializer.htmlserializer import HTMLSerializer
+from html5lib.filters import sanitizer
+from html5lib.filters.sanitizer import allowed_protocols
+from html5lib.serializer import HTMLSerializer

-from . import callbacks as linkify_callbacks
-from .encoding import force_unicode
-from .sanitizer import BleachSanitizer
-from .version import __version__, VERSION  # flake8: noqa
+from bleach import callbacks as linkify_callbacks
+from bleach.encoding import force_unicode
+from bleach.sanitizer import BleachSanitizerFilter
+from bleach.version import __version__, VERSION  # flake8: noqa


 __all__ = ['clean', 'linkify']

@@ -60,7 +61,7 @@
 # Make sure that .com doesn't get matched by .co first
 TLDS.reverse()

-PROTOCOLS = HTMLSanitizer.acceptable_protocols
+PROTOCOLS = allowed_protocols

 url_re = re.compile(
     r"""\(*  # Match any opening parentheses.
@@ -125,21 +126,34 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,

     text = force_unicode(text)

-    class s(BleachSanitizer):
-        allowed_elements = tags
-        allowed_attributes = attributes
-        allowed_css_properties = styles
-        allowed_protocols = protocols
-        strip_disallowed_elements = strip
-        strip_html_comments = strip_comments
+    parser = html5lib.HTMLParser(namespaceHTMLElements=False)
+    dom = parser.parseFragment(text)

-    parser = html5lib.HTMLParser(tokenizer=s)
+    walker = html5lib.getTreeWalker('etree')
+    filtered = BleachSanitizerFilter(
+        source=walker(dom),
+        allowed_attributes_map=attributes,
-    return _render(parser.parseFragment(text))
+        allowed_elements=tags,
+        allowed_css_properties=styles,
+        allowed_protocols=protocols,
+
+        allowed_svg_properties=[],
+
+        strip_disallowed_elements=strip,
+        strip_html_comments=strip_comments
+    )
+    s = HTMLSerializer(
+        quote_attr_values='always',
+        alphabetical_attributes=True,
+        omit_optional_tags=False
+    )
+    return s.render(filtered)


 def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False,
-            parse_email=False, tokenizer=HTMLSanitizer):
+            # FIXME(willkg): parse_email=False, tokenizer=HTMLSanitizer):
+            parse_email=False):
     """Convert URL-like strings in an HTML fragment to links

     ``linkify()`` converts strings that look like URLs, domain names and email
@@ -158,7 +172,8 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False,
     if not text:
         return ''

-    parser = html5lib.HTMLParser(tokenizer=tokenizer)
+    # FIXME(willkg): parser = html5lib.HTMLParser(tokenizer=tokenizer)
+    parser = html5lib.HTMLParser()
     forest = parser.parseFragment(text)

     _seen = set([])
@@ -427,7 +442,7 @@ def _render(tree):

 def _serialize(domtree):
     walker = html5lib.treewalkers.getTreeWalker('etree')
     stream = walker(domtree)
-    serializer = HTMLSerializer(quote_attr_values=True,
+    serializer = HTMLSerializer(quote_attr_values='always',
                                 alphabetical_attributes=True,
                                 omit_optional_tags=False)
     return serializer.render(stream)
diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py
index eec6659b..fb502b85 100644
--- a/bleach/sanitizer.py
+++ b/bleach/sanitizer.py
@@ -2,118 +2,125 @@
 import re
 from xml.sax.saxutils import escape, unescape

-from html5lib.constants import tokenTypes
-from html5lib.sanitizer import HTMLSanitizerMixin
-from html5lib.tokenizer import HTMLTokenizer
+from html5lib.constants import namespaces
+from html5lib.filters import sanitizer


-PROTOS = HTMLSanitizerMixin.acceptable_protocols
-PROTOS.remove('feed')
+class BleachSanitizerFilter(sanitizer.Filter):
+    def __init__(self, source, allowed_attributes_map,
+                 strip_disallowed_elements=False, strip_html_comments=True,
+                 **kwargs):
+        if isinstance(allowed_attributes_map, dict):
+            self.wildcard_attributes = allowed_attributes_map.get('*', [])
+            self.allowed_attributes_map = allowed_attributes_map
+        else:
+            self.wildcard_attributes = allowed_attributes_map
+            self.allowed_attributes_map = {}

-
-class BleachSanitizerMixin(HTMLSanitizerMixin):
-    """Mixin to replace sanitize_token() and sanitize_css()."""
-
-    allowed_svg_properties = []
+        self.strip_disallowed_elements = strip_disallowed_elements
+        self.strip_html_comments = strip_html_comments

+        return super(BleachSanitizerFilter, self).__init__(source, **kwargs)

     def sanitize_token(self, token):
         """Sanitize a token either by HTML-encoding or dropping.

-        Unlike HTMLSanitizerMixin.sanitize_token, allowed_attributes can be
-        a dict of {'tag': ['attribute', 'pairs'], 'tag': callable}.
+        Unlike sanitizer.Filter, allowed_attributes can be a dict of {'tag':
+        ['attribute', 'pairs'], 'tag': callable}.

-        Here callable is a function with two arguments of attribute name
-        and value. It should return true of false.
+        Here callable is a function with two arguments of attribute name and
+        value. It should return True or False.

         Also gives the option to strip tags instead of encoding.

         """
-        if (getattr(self, 'wildcard_attributes', None) is None and
-                isinstance(self.allowed_attributes, dict)):
-            self.wildcard_attributes = self.allowed_attributes.get('*', [])
-
-        if token['type'] in (tokenTypes['StartTag'], tokenTypes['EndTag'],
-                             tokenTypes['EmptyTag']):
+        token_type = token['type']
+        if token_type in ['StartTag', 'EndTag', 'EmptyTag']:
             if token['name'] in self.allowed_elements:
-                if 'data' in token:
-                    if isinstance(self.allowed_attributes, dict):
-                        allowed_attributes = self.allowed_attributes.get(
-                            token['name'], [])
-                        if not callable(allowed_attributes):
-                            allowed_attributes += self.wildcard_attributes
-                    else:
-                        allowed_attributes = self.allowed_attributes
-                    attrs = dict([(name, val) for name, val in
-                                  token['data'][::-1]
-                                  if (allowed_attributes(name, val)
-                                      if callable(allowed_attributes)
-                                      else name in allowed_attributes)])
-                    for attr in self.attr_val_is_uri:
-                        if attr not in attrs:
-                            continue
-                        val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
-                                               unescape(attrs[attr])).lower()
-                        # Remove replacement characters from unescaped
-                        # characters.
-                        val_unescaped = val_unescaped.replace("\ufffd", "")
-                        if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped)
-                            and (val_unescaped.split(':')[0] not in
-                                 self.allowed_protocols)):
-                            del attrs[attr]
-                    for attr in self.svg_attr_val_allows_ref:
-                        if attr in attrs:
-                            attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
-                                                 ' ',
-                                                 unescape(attrs[attr]))
-                    if (token['name'] in self.svg_allow_local_href and
-                            'xlink:href' in attrs and
-                            re.search(r'^\s*[^#\s].*', attrs['xlink:href'])):
-                        del attrs['xlink:href']
-                    if 'style' in attrs:
-                        attrs['style'] = self.sanitize_css(attrs['style'])
-                    token['data'] = [(name, val) for name, val in
-                                     attrs.items()]
-                return token
+                return self.allow_token(token)
+
             elif self.strip_disallowed_elements:
                 pass
+
             else:
-                if token['type'] == tokenTypes['EndTag']:
-                    token['data'] = '</{0!s}>'.format(token['name'])
-                elif token['data']:
-                    attr = ' {0!s}="{1!s}"'
-                    attrs = ''.join([attr.format(k, escape(v)) for k, v in
-                                     token['data']])
-                    token['data'] = '<{0!s}{1!s}>'.format(token['name'], attrs)
-                else:
-                    token['data'] = '<{0!s}>'.format(token['name'])
-                if token['selfClosing']:
-                    token['data'] = token['data'][:-1] + '/>'
-                token['type'] = tokenTypes['Characters']
-                del token["name"]
-                return token
-        elif token['type'] == tokenTypes['Comment']:
+                return self.disallowed_token(token)
+
+        elif token_type == 'Comment':
             if not self.strip_html_comments:
                 return token
+
         else:
             return token

-    def sanitize_css(self, style):
-        """HTMLSanitizerMixin.sanitize_css replacement.
-
-        HTMLSanitizerMixin.sanitize_css always whitelists background-*,
-        border-*, margin-*, and padding-*. We only whitelist what's in
-        the whitelist.
+    def allow_token(self, token):
+        if 'data' in token:
+            allowed_attributes = self.allowed_attributes_map.get(token['name'], [])
+            if not callable(allowed_attributes):
+                allowed_attributes += self.wildcard_attributes
+
+            # Drop any attributes that aren't allowed
+            attrs = {}
+            for namespaced_name, val in token['data'].items():
+                namespace, name = namespaced_name
+                # FIXME(willkg): "name" used to be something like "xlink:href"
+                # but it's now (namespace['xlink'], 'href'). we should fix the
+                # name here so it's what the callable would expect.
+                if callable(allowed_attributes):
+                    if allowed_attributes(name, val):
+                        attrs[namespaced_name] = val
+
+                elif name in allowed_attributes:
+                    attrs[namespaced_name] = val
+
+            # Go through all the uri-type attributes
+            for attr in self.attr_val_is_uri:
+                if attr not in attrs:
+                    continue
+                val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
+                                       unescape(attrs[attr])).lower()
+                # Remove replacement characters from unescaped characters.
+                val_unescaped = val_unescaped.replace("\ufffd", "")
+
+                if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped) and
+                        (val_unescaped.split(':')[0] not in self.allowed_protocols)):
+                    # It has a protocol, but it's not allowed--so drop it
+                    del attrs[attr]
+
+            # FIXME(willkg): is this right?
+            for attr in self.svg_attr_val_allows_ref:
+                if attr in attrs:
+                    attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
+                                         ' ',
+                                         unescape(attrs[attr]))
+
+            # FIXME(willkg): is this right?
+            if (token['name'] in self.svg_allow_local_href and
+                    (namespace['xlink'], 'href') in attrs and
+                    re.search(r'^\s*[^#\s].*', attrs[(namespace['xlink'], 'href')])):
+                del attrs[(namespace['xlink'], 'href')]
+
+            # Sanitize css in style attribute
+            if (None, u'style') in attrs:
+                attrs[(None, u'style')] = self.sanitize_css(attrs[(None, u'style')])
+
+            token['data'] = attrs

         return token

-    """
+    def sanitize_css(self, style):
+        """html5lib sanitizer filter replacement to fix issues"""
         # disallow urls
         style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)

         # gauntlet
-        # TODO: Make sure this does what it's meant to - I *think* it wants to
-        # validate style attribute contents.
+
+        # Validate the css in the style tag and if it's not valid, then drop
+        # the whole thing.
         parts = style.split(';')
-        gauntlet = re.compile("""^([-/:,#%.'"\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'"""
-                              """\s*|"[\s\w]+"|\([\d,%\.\s]+\))*$""")
+        gauntlet = re.compile(
+            r"""^([-/:,#%.'"\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'\s*|"[\s\w]+"|\([\d,%\.\s]+\))*$"""
+        )
+
         for part in parts:
             if not gauntlet.match(part):
                 return ''

@@ -125,23 +132,11 @@ def sanitize_css(self, style):
         for prop, value in re.findall('([-\w]+)\s*:\s*([^:;]*)', style):
             if not value:
                 continue
+
             if prop.lower() in self.allowed_css_properties:
                 clean.append(prop + ': ' + value + ';')
+
             elif prop.lower() in self.allowed_svg_properties:
                 clean.append(prop + ': ' + value + ';')

         return ' '.join(clean)
-
-
-class BleachSanitizer(HTMLTokenizer, BleachSanitizerMixin):
-    def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
-                 lowercaseElementName=True, lowercaseAttrName=True, **kwargs):
-        HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
-                               lowercaseElementName, lowercaseAttrName,
-                               **kwargs)
-
-    def __iter__(self):
-        for token in HTMLTokenizer.__iter__(self):
-            token = self.sanitize_token(token)
-            if token:
-                yield token
diff --git a/setup.py b/setup.py
index 6c627a38..39fbb370 100644
--- a/setup.py
+++ b/setup.py
@@ -15,9 +15,8 @@
 install_requires = [
     'six',
-    # 3 9s up to but not including 8 9s, but not 4 9s or 5 9s because they're
-    # busted
-    'html5lib>=0.999,!=0.9999,!=0.99999,<0.99999999',
+    # >= 8 9s because of breaking API change
+    'html5lib>=0.99999999',
 ]
diff --git a/tests/test_basics.py b/tests/test_basics.py
index 07d4d918..8e293ca6 100644
--- a/tests/test_basics.py
+++ b/tests/test_basics.py
@@ -1,5 +1,6 @@
-import six
 import html5lib
+import pytest
+import six

 import bleach

@@ -234,6 +235,7 @@ def test_wildcard_attributes():
     assert bleach.clean(dirty, tags=TAG, attributes=ATTR) in clean


+@pytest.mark.xfail(reason='html5lib >= 0.99999999: changed API')
 def test_sarcasm():
     """Jokes should crash."""
     dirty = 'Yeah right <sarcasm/>'
@@ -242,8 +244,8 @@

 def test_user_defined_protocols_valid():
-    valid_href = '<a href="my_protocol://more_text">allowed href</a>'
-    assert bleach.clean(valid_href, protocols=['my_protocol']) == valid_href
+    valid_href = '<a href="myprotocol://more_text">allowed href</a>'
+    assert bleach.clean(valid_href, protocols=['myprotocol']) == valid_href


 def test_user_defined_protocols_invalid():
diff --git a/tests/test_css.py b/tests/test_css.py
index 0b92f40b..3d224fac 100644
--- a/tests/test_css.py
+++ b/tests/test_css.py
@@ -8,7 +8,7 @@
 clean = partial(clean, tags=['p'], attributes=['style'])


-@pytest.mark.parametrize('data,styles,expected', [
+@pytest.mark.parametrize('data, styles, expected', [
     (
         'font-family: Arial; color: red; float: left; background-color: red;',
         ['color'],
@@ -91,24 +91,40 @@ def test_valid_css():

 def test_style_hang():
     """The sanitizer should not hang on any inline styles"""
-    # TODO: Neaten this up. It's copypasta from MDN/Kuma to repro the bug
-    style = ("""margin-top: 0px; margin-right: 0px; margin-bottom: 1.286em; """
-             """margin-left: 0px; padding-top: 15px; padding-right: 15px; """
-             """padding-bottom: 15px; padding-left: 15px; border-top-width: """
-             """1px; border-right-width: 1px; border-bottom-width: 1px; """
-             """border-left-width: 1px; border-top-style: dotted; """
-             """border-right-style: dotted; border-bottom-style: dotted; """
-             """border-left-style: dotted; border-top-color: rgb(203, 200, """
-             """185); border-right-color: rgb(203, 200, 185); """
-             """border-bottom-color: rgb(203, 200, 185); border-left-color: """
-             """rgb(203, 200, 185); background-image: initial; """
-             """background-attachment: initial; background-origin: initial; """
-             """background-clip: initial; background-color: """
-             """rgb(246, 246, 242); overflow-x: auto; overflow-y: auto; """
-             """font: normal normal normal 100%/normal 'Courier New', """
-             """'Andale Mono', monospace; background-position: initial """
-             """initial; background-repeat: initial initial;""")
-    html = '<p style="{0!s}">Hello world</p>'.format(style)
+    style = [
+        'margin-top: 0px;',
+        'margin-right: 0px;',
+        'margin-bottom: 1.286em;',
+        'margin-left: 0px;',
+        'padding-top: 15px;',
+        'padding-right: 15px;',
+        'padding-bottom: 15px;',
+        'padding-left: 15px;',
+        'border-top-width: 1px;',
+        'border-right-width: 1px;',
+        'border-bottom-width: 1px;',
+        'border-left-width: 1px;',
+        'border-top-style: dotted;',
+        'border-right-style: dotted;',
+        'border-bottom-style: dotted;',
+        'border-left-style: dotted;',
+        'border-top-color: rgb(203, 200, 185);',
+        'border-right-color: rgb(203, 200, 185);',
+        'border-bottom-color: rgb(203, 200, 185);',
+        'border-left-color: rgb(203, 200, 185);',
+        'background-image: initial;',
+        'background-attachment: initial;',
+        'background-origin: initial;',
+        'background-clip: initial;',
+        'background-color: rgb(246, 246, 242);',
+        'overflow-x: auto;',
+        'overflow-y: auto;',
+        # FIXME(willkg): This fails the first regxp gauntlet in sanitize_css.
+        # 'font: italic small-caps bolder condensed 16px/3 cursive;',
+        'background-position: initial initial;',
+        'background-repeat: initial initial;'
+    ]
+    html = '<p style="%s">Hello world</p>' % ' '.join(style)

     styles = [
         'border', 'float', 'overflow', 'min-height', 'vertical-align',
         'white-space',
         'margin', 'margin-left', 'margin-top', 'margin-bottom', 'margin-right',
         'padding', 'padding-left', 'padding-top', 'padding-bottom',
         'padding-right',
         'background',
         'background-color',
         'font', 'font-size', 'font-weight', 'text-align', 'text-transform',
     ]

-    expected = ("""<p style="margin-top: 0px; margin-right: 0px; """
-                """margin-bottom: 1.286em; margin-left: 0px; """
-                """padding-top: 15px; padding-right: 15px; """
-                """padding-bottom: 15px; padding-left: 15px; """
-                """background-color: rgb(246, 246, 242);">"""
-                """Hello world</p>""")
+    expected = (
+        '<p style="'
+        'margin-top: 0px; '
+        'margin-right: 0px; '
+        'margin-bottom: 1.286em; '
+        'margin-left: 0px; '
+        'padding-top: 15px; '
+        'padding-right: 15px; '
+        'padding-bottom: 15px; '
+        'padding-left: 15px; '
+        'background-color: rgb(246, 246, 242);'
+        '">Hello world</p>
' + ) assert clean(html, styles=styles) == expected diff --git a/tests/test_links.py b/tests/test_links.py index ac38ee70..6b7a77eb 100644 --- a/tests/test_links.py +++ b/tests/test_links.py @@ -3,7 +3,7 @@ except ImportError: from urllib import quote_plus -from html5lib.tokenizer import HTMLTokenizer +# FIXME(willkg): from html5lib.tokenizer import HTMLTokenizer import pytest from bleach import linkify, url_re, DEFAULT_CALLBACKS as DC @@ -406,6 +406,7 @@ def test_end_of_clause(): ) +@pytest.mark.xfail(reason='html5lib >= 0.99999999: changed API') def test_sarcasm(): """Jokes should crash.""" assert linkify('Yeah right ') == 'Yeah right <sarcasm/>' @@ -498,6 +499,7 @@ def test_ports(data, expected_data): assert linkify(data) == out.format(*expected_data) +@pytest.mark.xfail(reason='html5lib >= 0.99999999: no access to tokenizer') def test_tokenizer(): """Linkify doesn't always have to sanitize.""" raw = 'test' diff --git a/tests/test_security.py b/tests/test_security.py index 6ffaf449..4fb30207 100644 --- a/tests/test_security.py +++ b/tests/test_security.py @@ -74,7 +74,9 @@ def test_invalid_href_attr(): def test_invalid_filter_attr(): IMG = ['img', ] - IMG_ATTR = {'img': lambda n, v: n == 'src' and v == "http://example.com/"} + IMG_ATTR = { + 'img': lambda n, v: n == 'src' and v == "http://example.com/" + } assert ( clean('', tags=IMG, attributes=IMG_ATTR) == @@ -145,7 +147,11 @@ def test_feed_protocol(): def get_tests(): - """Retrieves regression tests from data/ directory""" + """Retrieves regression tests from data/ directory + + :returns: list of ``(filename, filedata)`` tuples + + """ datadir = os.path.join(os.path.dirname(__file__), 'data') tests = [ os.path.join(datadir, fn) for fn in os.listdir(datadir) @@ -153,19 +159,23 @@ def get_tests(): ] # Sort numerically which makes it easier to iterate through them tests.sort(key=lambda x: int(os.path.basename(x).split('.', 1)[0])) - return tests + + testcases = [ + (fn, open(fn, 'r').read()) for fn in tests + ] + + return testcases -@pytest.mark.parametrize('fn', get_tests()) -def test_regressions(fn): +@pytest.mark.parametrize('fn, text', get_tests()) +def test_regressions(fn, text): """Regression tests for clean so we can see if there are issues""" - s = open(fn, 'r').read() expected = six.text_type(open(fn + '.out', 'r').read()) # NOTE(willkg): This strips input and expected which makes it easier to # maintain the files. If there comes a time when the input needs whitespace # at the beginning or end, then we'll have to figure out something else. - assert clean(s.strip()) == expected.strip() + assert clean(text.strip()) == expected.strip() def test_regression_manually(): diff --git a/tox.ini b/tox.ini index 09ed488f..53c175c9 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ # and then run "tox" from this directory. 
[tox] -envlist = py{27,33,34,35,36}-html5lib{999,999999,9999999},pypy-html5lib9999999 +envlist = py{27,33,34,35,36}-html5lib{99999999,999999999},pypy-html5lib99999999 [testenv] basepython = @@ -15,8 +15,7 @@ basepython = py36: python3.6 deps = -rrequirements.txt - html5lib999: html5lib==0.999 - html5lib999999: html5lib==0.999999 - html5lib9999999: html5lib==0.9999999 + html5lib99999999: html5lib==0.99999999 + html5lib999999999: html5lib==0.999999999 commands = py.test {posargs:-v} From 3db588a5ed8b43da8324547b23ad5670b459ee9e Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Tue, 21 Feb 2017 11:45:01 -0500 Subject: [PATCH 03/12] Update security regression tests I also moved test_nasty to the regression tests because it's just like those. --- tests/data/14.test.out | 2 +- tests/data/15.test.out | 2 +- tests/data/16.test.out | 2 +- tests/data/17.test.out | 2 +- tests/data/3.test.out | 2 +- tests/data/4.test | 1 + tests/data/4.test.out | 1 + tests/data/5.test.out | 2 +- tests/data/9.test.out | 2 +- tests/test_security.py | 35 +++++++++++++++++------------------ 10 files changed, 26 insertions(+), 25 deletions(-) create mode 100644 tests/data/4.test create mode 100644 tests/data/4.test.out diff --git a/tests/data/14.test.out b/tests/data/14.test.out index 16445739..8e5ff754 100644 --- a/tests/data/14.test.out +++ b/tests/data/14.test.out @@ -1 +1 @@ -<imgsrc=&#106;&#97;&#118;&#97;&<wbr>#115;crip&<wbr>#116;:a \ No newline at end of file +<imgsrc=&#106;&#97;&#118;&#97;&<wbr>#115;crip&<wbr></wbr>#116;:a</imgsrc=&#106;&#97;&#118;&#97;&<wbr> \ No newline at end of file diff --git a/tests/data/15.test.out b/tests/data/15.test.out index 334f916b..8b90245f 100644 --- a/tests/data/15.test.out +++ b/tests/data/15.test.out @@ -1 +1 @@ -le&<wbr>#114;t('XS<wbr>;S')> \ No newline at end of file +le&<wbr></wbr>#114;t('XS<wbr></wbr>;S')> \ No newline at end of file diff --git a/tests/data/16.test.out b/tests/data/16.test.out index 9c6ca965..1ecb332b 100644 --- a/tests/data/16.test.out +++ b/tests/data/16.test.out @@ -1 +1 @@ -<imgsrc=&#0000106&#0000097&<wbr>#0000118as&<wbr>#0000099ri&<wbr>#0000112t:&<wbr>#0000097le&<wbr>#0000114t(&<wbr>#0000039XS&<wbr>#0000083')> \ No newline at end of file +<imgsrc=&#0000106&#0000097&<wbr>#0000118as&<wbr></wbr>#0000099ri&<wbr></wbr>#0000112t:&<wbr></wbr>#0000097le&<wbr></wbr>#0000114t(&<wbr></wbr>#0000039XS&<wbr></wbr>#0000083')></imgsrc=&#0000106&#0000097&<wbr> \ No newline at end of file diff --git a/tests/data/17.test.out b/tests/data/17.test.out index dabfaa2d..ae928a99 100644 --- a/tests/data/17.test.out +++ b/tests/data/17.test.out @@ -1 +1 @@ -<imgsrc=&#x6a&#x61&#x76&#x61&#x73&<wbr>#x63ript:&<wbr>#x61lert(&<wbr>#x27XSS')> \ No newline at end of file +<imgsrc=&#x6a&#x61&#x76&#x61&#x73&<wbr>#x63ript:&<wbr></wbr>#x61lert(&<wbr></wbr>#x27XSS')></imgsrc=&#x6a&#x61&#x76&#x61&#x73&<wbr> \ No newline at end of file diff --git a/tests/data/3.test.out b/tests/data/3.test.out index 20c3d0d4..f0d69629 100644 --- a/tests/data/3.test.out +++ b/tests/data/3.test.out @@ -1 +1 @@ ->"'><img%20src%3d%26%23x6a;%26%23x61;%26%23x76;%26%23x61;%26%23x73;%26%23x63;%26%23x72;%26%23x69;%26%23x70;%26%23x74;%26%23x3a;alert(%26quot;%26%23x20;xss%26%23x20;test%26%23x20;successful%26quot;)> \ No newline at end of file 
+>"'><img%20src%3d%26%23x6a;%26%23x61;%26%23x76;%26%23x61;%26%23x73;%26%23x63;%26%23x72;%26%23x69;%26%23x70;%26%23x74;%26%23x3a;alert(%26quot;%26%23x20;xss%26%23x20;test%26%23x20;successful%26quot;)></img%20src%3d%26%23x6a;%26%23x61;%26%23x76;%26%23x61;%26%23x73;%26%23x63;%26%23x72;%26%23x69;%26%23x70;%26%23x74;%26%23x3a;alert(%26quot;%26%23x20;xss%26%23x20;test%26%23x20;successful%26quot;)> \ No newline at end of file diff --git a/tests/data/4.test b/tests/data/4.test new file mode 100644 index 00000000..c4cf51cd --- /dev/null +++ b/tests/data/4.test @@ -0,0 +1 @@ +ipt type="text/javascript">alert("foo");script> diff --git a/tests/data/4.test.out b/tests/data/4.test.out new file mode 100644 index 00000000..88ea86b2 --- /dev/null +++ b/tests/data/4.test.out @@ -0,0 +1 @@ +<scr<script>ipt type="text/javascript">alert("foo");script<del></del>></scr<script> diff --git a/tests/data/5.test.out b/tests/data/5.test.out index 1782eafb..0d88a88a 100644 --- a/tests/data/5.test.out +++ b/tests/data/5.test.out @@ -1 +1 @@ ->%22%27><img%20src%3d%22javascript:alert(%27%20xss%27)%22> \ No newline at end of file +>%22%27><img%20src%3d%22javascript:alert(%27%20xss%27)%22></img%20src%3d%22javascript:alert(%27%20xss%27)%22> \ No newline at end of file diff --git a/tests/data/9.test.out b/tests/data/9.test.out index 3a4d9b6c..5c5eb6ba 100644 --- a/tests/data/9.test.out +++ b/tests/data/9.test.out @@ -1 +1 @@ -'';!--"<xss>=&{()} \ No newline at end of file +'';!--"<xss>=&{()}</xss> \ No newline at end of file diff --git a/tests/test_security.py b/tests/test_security.py index 4fb30207..356b1292 100644 --- a/tests/test_security.py +++ b/tests/test_security.py @@ -22,7 +22,7 @@ def test_nested_script_tag(): def test_nested_script_tag_r(): assert ( clean('>evil()>') == - '<script<script>>evil()</script<>>' + '<script<script>>evil()></script<script>' ) @@ -90,8 +90,11 @@ def test_invalid_filter_attr(): def test_invalid_tag_char(): assert ( - clean('') == - '<script xss="" src="http://xx.com/xss.js"></script>' + clean('') in + [ + '<script src="http://xx.com/xss.js" xss=""></script>', + '<script xss="" src="http://xx.com/xss.js"></script>' + ] ) assert ( clean('') == @@ -102,15 +105,21 @@ def test_invalid_tag_char(): def test_unclosed_tag(): assert ( clean('ipt type="text/javascript">alert("foo");script>') - expect = ('<scr<script></script>ipt type="text/javascript"' - '>alert("foo");</script>script<del></del>' - '>') - assert clean(test) == expect - - def test_poster_attribute(): """Poster attributes should not allow javascript.""" tags = ['video'] From 02facd34b02cd9d9e16b547e07b07ddd3db40455 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Tue, 21 Feb 2017 13:23:00 -0500 Subject: [PATCH 04/12] Minor code cleanup * address FIXMEs * minor cleanup to code and comments --- bleach/__init__.py | 8 ++++---- bleach/sanitizer.py | 14 ++++++-------- tests/test_basics.py | 15 ++++++++++----- tests/test_css.py | 6 +++--- tests/test_links.py | 9 --------- tests/test_security.py | 2 +- 6 files changed, 24 insertions(+), 30 deletions(-) diff --git a/bleach/__init__.py b/bleach/__init__.py index c54dc72b..d1a82cde 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -120,9 +120,11 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, :arg strip: whether or not to strip disallowed elements :arg strip_comments: whether or not to strip HTML comments + :returns: cleaned text as unicode + """ if not text: - return '' + return u'' text = force_unicode(text) @@ -152,7 +154,6 @@ def clean(text, 
tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, - # FIXME(willkg): parse_email=False, tokenizer=HTMLSanitizer): parse_email=False): """Convert URL-like strings in an HTML fragment to links @@ -170,9 +171,8 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, text = force_unicode(text) if not text: - return '' + return u'' - # FIXME(willkg): parser = html5lib.HTMLParser(tokenizer=tokenizer) parser = html5lib.HTMLParser() forest = parser.parseFragment(text) diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index fb502b85..0701def2 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -63,9 +63,7 @@ def allow_token(self, token): attrs = {} for namespaced_name, val in token['data'].items(): namespace, name = namespaced_name - # FIXME(willkg): "name" used to be something like "xlink:href" - # but it's now (namespace['xlink'], 'href'). we should fix the - # name here so it's what the callable would expect. + if callable(allowed_attributes): if allowed_attributes(name, val): attrs[namespaced_name] = val @@ -73,12 +71,14 @@ def allow_token(self, token): elif name in allowed_attributes: attrs[namespaced_name] = val - # Go through all the uri-type attributes + # Handle attributes that have uri values for attr in self.attr_val_is_uri: if attr not in attrs: continue + val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', unescape(attrs[attr])).lower() + # Remove replacement characters from unescaped characters. val_unescaped = val_unescaped.replace("\ufffd", "") @@ -87,17 +87,15 @@ def allow_token(self, token): # It has a protocol, but it's not allowed--so drop it del attrs[attr] - # FIXME(willkg): is this right? for attr in self.svg_attr_val_allows_ref: if attr in attrs: attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)', ' ', unescape(attrs[attr])) - # FIXME(willkg): is this right? if (token['name'] in self.svg_allow_local_href and - (namespace['xlink'], 'href') in attrs and - re.search(r'^\s*[^#\s].*', attrs[(namespace['xlink'], 'href')])): + (namespaces['xlink'], 'href') in attrs and + re.search(r'^\s*[^#\s].*', attrs[(namespaces['xlink'], 'href')])): del attrs[(namespace['xlink'], 'href')] # Sanitize css in style attribute diff --git a/tests/test_basics.py b/tests/test_basics.py index 8e293ca6..49148592 100644 --- a/tests/test_basics.py +++ b/tests/test_basics.py @@ -71,12 +71,17 @@ def test_function_arguments(): def test_named_arguments(): ATTRS = {'a': ['rel', 'href']} - s = ('xx.com', - 'xx.com') - assert bleach.clean(s[0]) == 'xx.com' - # FIXME: This might not be needed if attribute order is stable now. - assert bleach.clean(s[0], attributes=ATTRS) in s + text = 'xx.com' + + assert bleach.clean(text) == 'xx.com' + assert ( + bleach.clean(text, attributes=ATTRS) in + [ + 'xx.com', + 'xx.com' + ] + ) def test_disallowed_html(): diff --git a/tests/test_css.py b/tests/test_css.py index 3d224fac..d8880d78 100644 --- a/tests/test_css.py +++ b/tests/test_css.py @@ -119,8 +119,7 @@ def test_style_hang(): 'background-color: rgb(246, 246, 242);', 'overflow-x: auto;', 'overflow-y: auto;', - # FIXME(willkg): This fails the first regxp gauntlet in sanitize_css. 
-        # 'font: italic small-caps bolder condensed 16px/3 cursive;',
+        'font: italic small-caps bolder condensed 16px/3 cursive;',
         'background-position: initial initial;',
         'background-repeat: initial initial;'
     ]
@@ -146,7 +145,8 @@ def test_style_hang():
         'padding-right: 15px; '
         'padding-bottom: 15px; '
         'padding-left: 15px; '
-        'background-color: rgb(246, 246, 242);'
+        'background-color: rgb(246, 246, 242); '
+        'font: italic small-caps bolder condensed 16px/3 cursive;'
         '">Hello world</p>
' ) diff --git a/tests/test_links.py b/tests/test_links.py index 6b7a77eb..53d60e5c 100644 --- a/tests/test_links.py +++ b/tests/test_links.py @@ -3,7 +3,6 @@ except ImportError: from urllib import quote_plus -# FIXME(willkg): from html5lib.tokenizer import HTMLTokenizer import pytest from bleach import linkify, url_re, DEFAULT_CALLBACKS as DC @@ -499,14 +498,6 @@ def test_ports(data, expected_data): assert linkify(data) == out.format(*expected_data) -@pytest.mark.xfail(reason='html5lib >= 0.99999999: no access to tokenizer') -def test_tokenizer(): - """Linkify doesn't always have to sanitize.""" - raw = 'test' - assert linkify(raw) == 'test<x></x>' - assert linkify(raw, tokenizer=HTMLTokenizer) == raw - - def test_ignore_bad_protocols(): assert ( linkify('foohttp://bar') == diff --git a/tests/test_security.py b/tests/test_security.py index 356b1292..2aac0200 100644 --- a/tests/test_security.py +++ b/tests/test_security.py @@ -75,7 +75,7 @@ def test_invalid_href_attr(): def test_invalid_filter_attr(): IMG = ['img', ] IMG_ATTR = { - 'img': lambda n, v: n == 'src' and v == "http://example.com/" + 'img': lambda attr, val: attr == 'src' and val == "http://example.com/" } assert ( From 10852231012ae3eece9c0a2af5c6c7c8e2e5212f Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Tue, 21 Feb 2017 15:37:00 -0500 Subject: [PATCH 05/12] Update CHANGES and docs --- CHANGES | 19 +++++++++++++++---- README.rst | 2 -- docs/clean.rst | 5 ++++- docs/conf.py | 3 +-- docs/dev.rst | 4 ++-- docs/goals.rst | 4 ++-- docs/linkify.rst | 17 +---------------- 7 files changed, 25 insertions(+), 29 deletions(-) diff --git a/CHANGES b/CHANGES index ec3bc992..e8e49a8d 100644 --- a/CHANGES +++ b/CHANGES @@ -8,10 +8,19 @@ Version 2.0 (in development) - Removed support for Python 2.6. #206 - Removed support for Python 3.2. #224 +- Bleach no longer supports html5lib < 0.99999999 (8 9s). + + This version represents a rewrite to use the new sanitizing API since + the old one was dropped in html5lib 0.99999999 (8 9s). + +- linkify no longer accepts a tokenizer argument. +- clean output is different than in previous versions; particularly this version + will add end tags even if the tag will be escaped. **Changes** -- Added testing for Python 3.6. +- Supports Python 3.6. +- Supports html5lib >= 0.99999999 (8 9s). Version 1.5 (November 4th, 2016) @@ -20,9 +29,11 @@ Version 1.5 (November 4th, 2016) **Backwards incompatible changes** - clean: The list of ``ALLOWED_PROTOCOLS`` now defaults to http, https and - mailto. Previously it was a long list of protocols something like ed2k, ftp, - http, https, irc, mailto, news, gopher, nntp, telnet, webcal, xmpp, callto, - feed, urn, aim, rsync, tag, ssh, sftp, rtsp, afs, data. #149 + mailto. + + Previously it was a long list of protocols something like ed2k, ftp, http, + https, irc, mailto, news, gopher, nntp, telnet, webcal, xmpp, callto, feed, + urn, aim, rsync, tag, ssh, sftp, rtsp, afs, data. #149 **Changes** diff --git a/README.rst b/README.rst index 8f9ce05d..3bd87573 100644 --- a/README.rst +++ b/README.rst @@ -101,5 +101,3 @@ The simplest way to use Bleach is: .. _GitHub: https://github.com/mozilla/bleach .. _ReadTheDocs: https://bleach.readthedocs.io/ .. 
_PyPI: http://pypi.python.org/pypi/bleach - - diff --git a/docs/clean.rst b/docs/clean.rst index ebd82055..a988a81a 100644 --- a/docs/clean.rst +++ b/docs/clean.rst @@ -162,7 +162,7 @@ For example, this sets allowed protocols to http, https and smb: u'allowed protocol' -This adds smb to the bleach-specified set of allowed protocols: +This adds smb to the Bleach-specified set of allowed protocols: .. doctest:: @@ -187,6 +187,7 @@ whitelist and invalid markup. For example: .. doctest:: >>> import bleach + >>> bleach.clean('is not allowed') u'<span>is not allowed</span>' >>> bleach.clean('is not allowed', tags=['b']) @@ -199,6 +200,7 @@ If you would rather Bleach stripped this markup entirely, you can pass .. doctest:: >>> import bleach + >>> bleach.clean('is not allowed', strip=True) u'is not allowed' >>> bleach.clean('is not allowed', tags=['b'], strip=True) @@ -214,6 +216,7 @@ By default, Bleach will strip out HTML comments. To disable this behavior, set .. doctest:: >>> import bleach + >>> html = 'my html' >>> bleach.clean(html) diff --git a/docs/conf.py b/docs/conf.py index 00b9c239..e186c827 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -27,8 +27,7 @@ # Add any Sphinx extension module names here, as strings. They can be extensions # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. -extensions = ['sphinx.ext.autodoc', 'sphinx.ext.pngmath', 'sphinx.ext.viewcode', - 'sphinx.ext.doctest'] +extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode', 'sphinx.ext.doctest'] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] diff --git a/docs/dev.rst b/docs/dev.rst index 027a0a76..02f8d44a 100644 --- a/docs/dev.rst +++ b/docs/dev.rst @@ -5,7 +5,7 @@ Bleach development Docs ==== -Docs are in ``docs/``. We use Sphinx. Docs are pushed to readthedocs +Docs are in ``docs/``. We use Sphinx. Docs are pushed to ReadTheDocs via a GitHub webhook. @@ -16,7 +16,7 @@ Run:: $ tox -That'll run bleach tests in all the supported Python environments. Note +That'll run Bleach tests in all the supported Python environments. Note that you need the necessary Python binaries for them all to be tested. Tests are run in Travis CI via a GitHub webhook. diff --git a/docs/goals.rst b/docs/goals.rst index 01f63a94..632c222c 100644 --- a/docs/goals.rst +++ b/docs/goals.rst @@ -91,10 +91,10 @@ Make malicious content look pretty or sane ------------------------------------------ Malicious content is designed to be malicious. Making it safe is a design goal -of bleach. Making it pretty or sane-looking is not. +of Bleach. Making it pretty or sane-looking is not. If you want your malicious content to look pretty, you should pass it through -bleach to make it safe and then do your own transform afterwards. +Bleach to make it safe and then do your own transform afterwards. Allow arbitrary styling diff --git a/docs/linkify.rst b/docs/linkify.rst index b7449c34..705000c2 100644 --- a/docs/linkify.rst +++ b/docs/linkify.rst @@ -9,7 +9,7 @@ control how and when those links are rendered:: def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, - parse_email=False, tokenizer=HTMLSanitizer): + parse_email=False): """Convert URL-like strings in an HTML fragment to links. ``linkify()`` works by building a document tree, so it's guaranteed never to do @@ -194,19 +194,4 @@ they are newly created or already in the text, so be careful when writing callbacks that may need to behave differently if the protocol is ``mailto:``. 
-``tokenizer`` -============= - -``linkify()`` uses the ``html5lib.sanitizer.HTMLSanitizer`` tokenizer by -default. This has the effect of scrubbing some tags and attributes. To use a -more lenient, or totally different, tokenizer, you can specify the tokenizer -class here. (See the implementation of :ref:`clean() ` for an -example of building a custom tokenizer.) - -:: - - from html5lib.tokenizer import HTMLTokenizer - linked_text = linkify(text, tokenizer=HTMLTokenizer) - - .. _Crate: https://crate.io/ From 066631af96ee9c16ddcf9d132bd8537a4af17da6 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Tue, 21 Feb 2017 20:49:21 -0500 Subject: [PATCH 06/12] Update .travis.yml --- .travis.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 4e66cf1d..318dfa7d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,9 +11,8 @@ python: - "3.6" - "pypy" env: -- HTML5LIB=0.999 # 3 -- HTML5LIB=0.999999 # 6 -- HTML5LIB=0.9999999 # 7 +- HTML5LIB=0.99999999 # 8 +- HTML5LIB=0.999999999 # 9 install: - pip install -r requirements.txt - pip install html5lib==$HTML5LIB From 30772dd20f16a8ec75f7c1ddb7b76c9ff6fd97d2 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Tue, 21 Feb 2017 20:57:13 -0500 Subject: [PATCH 07/12] Another .travis.yml fix --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 318dfa7d..a5db65b1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,6 +14,7 @@ env: - HTML5LIB=0.99999999 # 8 - HTML5LIB=0.999999999 # 9 install: +- pip install -U pip - pip install -r requirements.txt - pip install html5lib==$HTML5LIB script: From 85cc802584f4290556b14e50384c2ed6bb6959b6 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Tue, 21 Feb 2017 21:05:59 -0500 Subject: [PATCH 08/12] One more .travis.yml fix --- .travis.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index a5db65b1..14015378 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,9 +14,11 @@ env: - HTML5LIB=0.99999999 # 8 - HTML5LIB=0.999999999 # 9 install: -- pip install -U pip -- pip install -r requirements.txt -- pip install html5lib==$HTML5LIB + # html5lib 0.99999999 (8 9s) requires at least setuptools 18.5 + - pip install -U pip setuptools>=18.5 + - pip install -r requirements.txt + # stomp on html5lib install with the specified one + - pip install html5lib==$HTML5LIB script: - py.test - flake8 bleach/ From 4cca43b5f41a4cc157fa2e6f5dd5185f70cb7a6c Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Tue, 21 Feb 2017 21:10:40 -0500 Subject: [PATCH 09/12] Fix flake8 issues --- bleach/sanitizer.py | 4 ++-- setup.cfg | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index 0701def2..68438e9a 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -1,6 +1,6 @@ from __future__ import unicode_literals import re -from xml.sax.saxutils import escape, unescape +from xml.sax.saxutils import unescape from html5lib.constants import namespaces from html5lib.filters import sanitizer @@ -83,7 +83,7 @@ def allow_token(self, token): val_unescaped = val_unescaped.replace("\ufffd", "") if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped) and - (val_unescaped.split(':')[0] not in self.allowed_protocols)): + (val_unescaped.split(':')[0] not in self.allowed_protocols)): # It has a protocol, but it's not allowed--so drop it del attrs[attr] diff --git a/setup.cfg b/setup.cfg index f3a416e4..950364a7 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,6 
+3,7 @@ test=pytest [flake8] ignore = E731,W503 +max-line-length = 100 [wheel] universal=1 From 49f35dd27317019e47febed1c0c507177297a8af Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Fri, 24 Feb 2017 12:21:38 -0500 Subject: [PATCH 10/12] Add tests, fix alphabetizing, code cleanup * this adds some missing tests to add more coverage * html5lib 0.99999999 and 0.999999999 have an alphabeticalattributes filter that doesn't work when the attributes set has some items with a namespace and some without in Python 3; this rolls alphabetizing into the Bleach sanitizer * remove some dead code and clean some other code up --- bleach/__init__.py | 17 ++++----- bleach/sanitizer.py | 43 +++++++++++++++++----- tests/test_basics.py | 86 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 129 insertions(+), 17 deletions(-) diff --git a/bleach/__init__.py b/bleach/__init__.py index d1a82cde..ae96a925 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -61,8 +61,6 @@ # Make sure that .com doesn't get matched by .co first TLDS.reverse() -PROTOCOLS = allowed_protocols - url_re = re.compile( r"""\(* # Match any opening parentheses. \b(?"]*)? # /path/zz (excluding "unsafe" chars from RFC 1738, # except for # and ~, which happen in practice) - """.format('|'.join(PROTOCOLS), '|'.join(TLDS)), + """.format('|'.join(allowed_protocols), '|'.join(TLDS)), re.IGNORECASE | re.VERBOSE | re.UNICODE) proto_re = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE) @@ -87,8 +85,6 @@ """, re.IGNORECASE | re.MULTILINE | re.VERBOSE) -NODE_TEXT = 4 # The numeric ID of a text node in simpletree. - ETREE_TAG = lambda x: "".join(['{http://www.w3.org/1999/xhtml}', x]) # a simple routine that returns the tag name with the namespace prefix # as returned by etree's Element.tag attribute @@ -147,8 +143,13 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, ) s = HTMLSerializer( quote_attr_values='always', - alphabetical_attributes=True, - omit_optional_tags=False + omit_optional_tags=False, + + # Bleach has its own sanitizer, so don't use the html5lib one + sanitize=False, + + # Bleach sanitizer alphabetizes already, so don't use the html5lib one + alphabetical_attributes=False, ) return s.render(filtered) @@ -176,7 +177,7 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, parser = html5lib.HTMLParser() forest = parser.parseFragment(text) - _seen = set([]) + _seen = set() def replace_nodes(tree, new_frag, node, index=0): """Doesn't really replace nodes, but inserts the nodes contained in diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index 68438e9a..c12b8d24 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -1,4 +1,5 @@ from __future__ import unicode_literals +from collections import OrderedDict import re from xml.sax.saxutils import unescape @@ -6,6 +7,19 @@ from html5lib.filters import sanitizer +def _attr_key(attr): + """Returns appropriate key for sorting attribute names + + Attribute names are a tuple of ``(namespace, name)`` where namespace can be + ``None`` or a string. These can't be compared in Python 3, so we conver the + ``None`` to an empty string. 
+ + """ + key = (attr[0][0] or ''), attr[0][1] + print(key) + return key + + class BleachSanitizerFilter(sanitizer.Filter): def __init__(self, source, allowed_attributes_map, strip_disallowed_elements=False, strip_html_comments=True, @@ -87,22 +101,33 @@ def allow_token(self, token): # It has a protocol, but it's not allowed--so drop it del attrs[attr] + # Drop values in svg attrs with non-local IRIs for attr in self.svg_attr_val_allows_ref: if attr in attrs: - attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)', - ' ', - unescape(attrs[attr])) - - if (token['name'] in self.svg_allow_local_href and - (namespaces['xlink'], 'href') in attrs and - re.search(r'^\s*[^#\s].*', attrs[(namespaces['xlink'], 'href')])): - del attrs[(namespace['xlink'], 'href')] + new_val = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)', + ' ', + unescape(attrs[attr])) + new_val = new_val.strip() + if not new_val: + del attrs[attr] + else: + attrs[attr] = new_val + + # Drop href and xlink:href attr for svg elements with non-local IRIs + if (None, token['name']) in self.svg_allow_local_href: + for href_attr in [(None, 'href'), (namespaces['xlink'], 'href')]: + if href_attr in attrs: + if re.search(r'^\s*[^#\s]', attrs[href_attr]): + del attrs[href_attr] # Sanitize css in style attribute if (None, u'style') in attrs: attrs[(None, u'style')] = self.sanitize_css(attrs[(None, u'style')]) - token['data'] = attrs + # Alphabetize attributes + token['data'] = OrderedDict( + [(key, val) for key, val in sorted(attrs.items(), key=_attr_key)] + ) return token def sanitize_css(self, style): diff --git a/tests/test_basics.py b/tests/test_basics.py index 49148592..bae1506e 100644 --- a/tests/test_basics.py +++ b/tests/test_basics.py @@ -240,6 +240,92 @@ def test_wildcard_attributes(): assert bleach.clean(dirty, tags=TAG, attributes=ATTR) in clean +def test_callable_attributes(): + """Verify callable attributes work and get correct arg values""" + def img_test(attr, val): + return attr == 'src' and val.startswith('https') + + ATTR = { + 'img': img_test, + } + TAGS = ['img'] + + assert ( + bleach.clean('foo blah baz', tags=TAGS, attributes=ATTR) == + u'foo baz' + ) + assert ( + bleach.clean('foo blah baz', tags=TAGS, attributes=ATTR) == + u'foo baz' + ) + + +def test_svg_attr_val_allows_ref(): + """Unescape values in svg attrs that allow url references""" + # Local IRI, so keep it + text = '' + TAGS = ['svg', 'rect'] + ATTRS = { + 'rect': ['fill'], + } + assert ( + bleach.clean(text, tags=TAGS, attributes=ATTRS) == + '' + ) + + # Non-local IRI, so drop it + text = '' + TAGS = ['svg', 'rect'] + ATTRS = { + 'rect': ['fill'], + } + assert ( + bleach.clean(text, tags=TAGS, attributes=ATTRS) == + '' + ) + + +@pytest.mark.parametrize('text, expected', [ + ( + '', + '' + ), + ( + '', + # NOTE(willkg): Bug in html5lib serializer drops the xlink part + '' + ), +]) +def test_svg_allow_local_href(text, expected): + """Keep local hrefs for svg elements""" + TAGS = ['svg', 'pattern'] + ATTRS = { + 'pattern': ['id', 'href'], + } + assert bleach.clean(text, tags=TAGS, attributes=ATTRS) == expected + + +@pytest.mark.parametrize('text, expected', [ + ( + '', + '' + ), + ( + '', + '' + ), +]) +def test_svg_allow_local_href_nonlocal(text, expected): + """Drop non-local hrefs for svg elements""" + TAGS = ['svg', 'pattern'] + ATTRS = { + 'pattern': ['id', 'href'], + } + assert bleach.clean(text, tags=TAGS, attributes=ATTRS) == expected + + + + @pytest.mark.xfail(reason='html5lib >= 0.99999999: changed API') def test_sarcasm(): """Jokes should crash.""" From 
70d96e390c1525d8508436d8582daa49357fdf3e Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Fri, 24 Feb 2017 12:49:12 -0500 Subject: [PATCH 11/12] Alphabetize before escaping disallowed tokens Making this change means the output is stable since attributes will always happen in the same order. Seems like maybe it's not a great idea, but stable seems good. If it turns out this is terrible, someone will complain with a compelling use case and we can undo it. I also went through and removed a bunch of the "the output is either this or that" in the tests. --- bleach/sanitizer.py | 6 ++++++ tests/test_basics.py | 31 +++++++++++++------------------ tests/test_unicode.py | 7 ++----- 3 files changed, 21 insertions(+), 23 deletions(-) diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index c12b8d24..789d89e6 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -58,6 +58,12 @@ def sanitize_token(self, token): pass else: + if 'data' in token: + # Alphabetize the attributes before calling .disallowed_token() + # so that the resulting string is stable + token['data'] = OrderedDict( + [(key, val) for key, val in sorted(token['data'].items(), key=_attr_key)] + ) return self.disallowed_token(token) elif token_type == 'Comment': diff --git a/tests/test_basics.py b/tests/test_basics.py index bae1506e..c42ccc62 100644 --- a/tests/test_basics.py +++ b/tests/test_basics.py @@ -76,11 +76,8 @@ def test_named_arguments(): assert bleach.clean(text) == 'xx.com' assert ( - bleach.clean(text, attributes=ATTRS) in - [ - 'xx.com', - 'xx.com' - ] + bleach.clean(text, attributes=ATTRS) == + 'xx.com' ) @@ -199,25 +196,22 @@ def test_idempotent(): clean = bleach.clean(dirty) assert bleach.clean(clean) == clean - possible_outs = ( - 'invalid & < extra http://link.com', + linked = bleach.linkify(dirty) + assert ( + bleach.linkify(linked) == 'invalid & < extra http://link.com' ) - linked = bleach.linkify(dirty) - assert bleach.linkify(linked) in possible_outs def test_rel_already_there(): """Make sure rel attribute is updated not replaced""" linked = ('Click ' 'here.') - link_good = (('Click ' - 'here.'), - ('Click ' - 'here.')) - assert bleach.linkify(linked) in link_good - assert bleach.linkify(link_good[0]) in link_good + link_good = 'Click here.' 
+ + assert bleach.linkify(linked) == link_good + assert bleach.linkify(link_good) == link_good def test_lowercase_html(): @@ -235,9 +229,10 @@ def test_wildcard_attributes(): TAG = ['img', 'em'] dirty = ('both can have ' '') - clean = ('both can have ', - 'both can have ') - assert bleach.clean(dirty, tags=TAG, attributes=ATTR) in clean + assert ( + bleach.clean(dirty, tags=TAG, attributes=ATTR) == + 'both can have ' + ) def test_callable_attributes(): diff --git a/tests/test_unicode.py b/tests/test_unicode.py index b8b670e8..08ab3f4e 100644 --- a/tests/test_unicode.py +++ b/tests/test_unicode.py @@ -27,11 +27,8 @@ def test_mixed(): def test_mixed_linkify(): assert ( - linkify('Домашняя http://example.com ヘルプとチュートリアル') in - ( - 'Домашняя http://example.com ヘルプとチュートリアル', - 'Домашняя http://example.com ヘルプとチュートリアル' - ) + linkify('Домашняя http://example.com ヘルプとチュートリアル') == + 'Домашняя http://example.com ヘルプとチュートリアル' ) From 86dea93c17d96a237acd2543264ba27d83f15d79 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Fri, 24 Feb 2017 13:09:20 -0500 Subject: [PATCH 12/12] Cosmetic fix for readability --- bleach/__init__.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/bleach/__init__.py b/bleach/__init__.py index ae96a925..a645ac7e 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -130,16 +130,18 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, walker = html5lib.getTreeWalker('etree') filtered = BleachSanitizerFilter( source=walker(dom), + + # Bleach-sanitizer-specific things allowed_attributes_map=attributes, + strip_disallowed_elements=strip, + strip_html_comments=strip_comments, + # html5lib-sanitizer things allowed_elements=tags, allowed_css_properties=styles, allowed_protocols=protocols, - allowed_svg_properties=[], - strip_disallowed_elements=strip, - strip_html_comments=strip_comments ) s = HTMLSerializer( quote_attr_values='always',