
Rewrite to use html5lib >= 0.99999999 #250

Merged · 12 commits · Feb 24, 2017
12 changes: 7 additions & 5 deletions .travis.yml
@@ -11,12 +11,14 @@ python:
   - "3.6"
   - "pypy"
 env:
-  - HTML5LIB=0.999 # 3
-  - HTML5LIB=0.999999 # 6
-  - HTML5LIB=0.9999999 # 7
+  - HTML5LIB=0.99999999 # 8
+  - HTML5LIB=0.999999999 # 9
 install:
-  - pip install -r requirements.txt
-  - pip install html5lib==$HTML5LIB
+  # html5lib 0.99999999 (8 9s) requires at least setuptools 18.5
+  - pip install -U pip setuptools>=18.5
+  - pip install -r requirements.txt
+  # stomp on html5lib install with the specified one
+  - pip install html5lib==$HTML5LIB
 script:
   - py.test
   - flake8 bleach/
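A quick local sanity check for the same floor the matrix above enforces. This is a sketch, not part of the PR; it assumes only that html5lib exposes __version__ and that pkg_resources (from the setuptools the install step already upgrades) is importable.

# Sketch: confirm the locally installed html5lib meets the new minimum
# before running the test suite, mirroring the floor pinned in the matrix.
import html5lib
from pkg_resources import parse_version

MINIMUM = '0.99999999'  # 8 9s: the first release with the new sanitizing API

if parse_version(html5lib.__version__) < parse_version(MINIMUM):
    raise RuntimeError('html5lib %s is too old; this branch needs >= %s'
                       % (html5lib.__version__, MINIMUM))
print('html5lib %s satisfies the >= %s floor' % (html5lib.__version__, MINIMUM))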
19 changes: 15 additions & 4 deletions CHANGES
@@ -8,10 +8,19 @@ Version 2.0 (in development)
 
 - Removed support for Python 2.6. #206
 - Removed support for Python 3.2. #224
+- Bleach no longer supports html5lib < 0.99999999 (8 9s).
+
+  This version represents a rewrite to use the new sanitizing API since
+  the old one was dropped in html5lib 0.99999999 (8 9s).
+
+- linkify no longer accepts a tokenizer argument.
+- clean output is different than in previous versions; particularly this version
+  will add end tags even if the tag will be escaped.
 
 **Changes**
 
-- Added testing for Python 3.6.
+- Supports Python 3.6.
+- Supports html5lib >= 0.99999999 (8 9s).
 
 
 Version 1.5 (November 4th, 2016)
@@ -20,9 +29,11 @@ Version 1.5 (November 4th, 2016)
 **Backwards incompatible changes**
 
 - clean: The list of ``ALLOWED_PROTOCOLS`` now defaults to http, https and
-  mailto. Previously it was a long list of protocols something like ed2k, ftp,
-  http, https, irc, mailto, news, gopher, nntp, telnet, webcal, xmpp, callto,
-  feed, urn, aim, rsync, tag, ssh, sftp, rtsp, afs, data. #149
+  mailto.
+
+  Previously it was a long list of protocols something like ed2k, ftp, http,
+  https, irc, mailto, news, gopher, nntp, telnet, webcal, xmpp, callto, feed,
+  urn, aim, rsync, tag, ssh, sftp, rtsp, afs, data. #149
 
 **Changes**
 
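A minimal sketch of what the backwards incompatible notes above mean for callers. The escaped outputs in the comments are illustrative rather than exact, and the snippet assumes the bleach 2.0 API introduced by this PR.

import bleach

# clean() keeps its signature, but disallowed markup is now escaped with an
# explicit end tag instead of being left unterminated.
print(bleach.clean('<span>not allowed'))
# 1.5: roughly '&lt;span&gt;not allowed'
# 2.0: roughly '&lt;span&gt;not allowed&lt;/span&gt;'

# linkify() no longer accepts a tokenizer argument; drop it and rely on
# callbacks (or skip_pre / parse_email) instead.
print(bleach.linkify('see example.com', parse_email=False))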
2 changes: 0 additions & 2 deletions README.rst
@@ -101,5 +101,3 @@ The simplest way to use Bleach is:
 .. _GitHub: https://github.com/mozilla/bleach
 .. _ReadTheDocs: https://bleach.readthedocs.io/
 .. _PyPI: http://pypi.python.org/pypi/bleach
-
-
70 changes: 44 additions & 26 deletions bleach/__init__.py
@@ -5,13 +5,14 @@
 import re
 
 import html5lib
-from html5lib.sanitizer import HTMLSanitizer
-from html5lib.serializer.htmlserializer import HTMLSerializer
+from html5lib.filters import sanitizer
+from html5lib.filters.sanitizer import allowed_protocols
+from html5lib.serializer import HTMLSerializer
 
-from . import callbacks as linkify_callbacks
-from .encoding import force_unicode
-from .sanitizer import BleachSanitizer
-from .version import __version__, VERSION  # flake8: noqa
+from bleach import callbacks as linkify_callbacks
+from bleach.encoding import force_unicode
+from bleach.sanitizer import BleachSanitizerFilter
+from bleach.version import __version__, VERSION  # flake8: noqa
 
 __all__ = ['clean', 'linkify']
 
@@ -60,16 +61,14 @@
 # Make sure that .com doesn't get matched by .co first
 TLDS.reverse()
 
-PROTOCOLS = HTMLSanitizer.acceptable_protocols
-
 url_re = re.compile(
     r"""\(*  # Match any opening parentheses.
     \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)?  # http://
     ([\w-]+\.)+(?:{1})(?:\:[0-9]+)?(?!\.\w)\b  # xx.yy.tld(:##)?
     (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
         # /path/zz (excluding "unsafe" chars from RFC 1738,
         # except for # and ~, which happen in practice)
-    """.format('|'.join(PROTOCOLS), '|'.join(TLDS)),
+    """.format('|'.join(allowed_protocols), '|'.join(TLDS)),
     re.IGNORECASE | re.VERBOSE | re.UNICODE)
 
 proto_re = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE)
@@ -86,8 +85,6 @@
     """,
     re.IGNORECASE | re.MULTILINE | re.VERBOSE)
 
-NODE_TEXT = 4  # The numeric ID of a text node in simpletree.
-
 ETREE_TAG = lambda x: "".join(['{http://www.w3.org/1999/xhtml}', x])
 # a simple routine that returns the tag name with the namespace prefix
 # as returned by etree's Element.tag attribute
@@ -119,27 +116,48 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
     :arg strip: whether or not to strip disallowed elements
     :arg strip_comments: whether or not to strip HTML comments
 
+    :returns: cleaned text as unicode
+
     """
     if not text:
-        return ''
+        return u''
 
     text = force_unicode(text)
 
-    class s(BleachSanitizer):
-        allowed_elements = tags
-        allowed_attributes = attributes
-        allowed_css_properties = styles
-        allowed_protocols = protocols
-        strip_disallowed_elements = strip
-        strip_html_comments = strip_comments
+    parser = html5lib.HTMLParser(namespaceHTMLElements=False)
+    dom = parser.parseFragment(text)
+
+    walker = html5lib.getTreeWalker('etree')
+    filtered = BleachSanitizerFilter(
+        source=walker(dom),
+
+        # Bleach-sanitizer-specific things
+        allowed_attributes_map=attributes,
+        strip_disallowed_elements=strip,
+        strip_html_comments=strip_comments,
+
+        # html5lib-sanitizer things
+        allowed_elements=tags,
+        allowed_css_properties=styles,
+        allowed_protocols=protocols,
+        allowed_svg_properties=[],
+
+    )
+    s = HTMLSerializer(
+        quote_attr_values='always',
+        omit_optional_tags=False,
 
-    parser = html5lib.HTMLParser(tokenizer=s)
+        # Bleach has its own sanitizer, so don't use the html5lib one
+        sanitize=False,
 
-    return _render(parser.parseFragment(text))
+        # Bleach sanitizer alphabetizes already, so don't use the html5lib one
+        alphabetical_attributes=False,
+    )
+    return s.render(filtered)
 
 
 def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False,
-            parse_email=False, tokenizer=HTMLSanitizer):
+            parse_email=False):
     """Convert URL-like strings in an HTML fragment to links
 
     ``linkify()`` converts strings that look like URLs, domain names and email
@@ -156,12 +174,12 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False,
     text = force_unicode(text)
 
     if not text:
-        return ''
+        return u''
 
-    parser = html5lib.HTMLParser(tokenizer=tokenizer)
+    parser = html5lib.HTMLParser()
 
     forest = parser.parseFragment(text)
-    _seen = set([])
+    _seen = set()
 
     def replace_nodes(tree, new_frag, node, index=0):
         """Doesn't really replace nodes, but inserts the nodes contained in
@@ -427,7 +445,7 @@ def _render(tree):
 def _serialize(domtree):
     walker = html5lib.treewalkers.getTreeWalker('etree')
     stream = walker(domtree)
-    serializer = HTMLSerializer(quote_attr_values=True,
+    serializer = HTMLSerializer(quote_attr_values='always',
                                 alphabetical_attributes=True,
                                 omit_optional_tags=False)
     return serializer.render(stream)
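Pulled out of the diff above, this is roughly the pipeline the new clean() builds: parse a fragment, walk it through BleachSanitizerFilter, then serialize with html5lib's own sanitizer and attribute alphabetization turned off. It is a sketch that assumes the bleach.sanitizer.BleachSanitizerFilter added elsewhere in this PR and the module-level ALLOWED_* defaults; it is not a drop-in replacement for clean().

import html5lib
from html5lib.serializer import HTMLSerializer

from bleach import ALLOWED_ATTRIBUTES, ALLOWED_PROTOCOLS, ALLOWED_STYLES, ALLOWED_TAGS
from bleach.sanitizer import BleachSanitizerFilter


def clean_fragment(text):
    # Parse without namespacing so tag names match the allowed list.
    parser = html5lib.HTMLParser(namespaceHTMLElements=False)
    dom = parser.parseFragment(text)

    # Sanitizing now happens as a tree-walking filter, not in the tokenizer.
    walker = html5lib.getTreeWalker('etree')
    filtered = BleachSanitizerFilter(
        source=walker(dom),
        allowed_attributes_map=ALLOWED_ATTRIBUTES,
        strip_disallowed_elements=False,
        strip_html_comments=True,
        allowed_elements=ALLOWED_TAGS,
        allowed_css_properties=ALLOWED_STYLES,
        allowed_protocols=ALLOWED_PROTOCOLS,
        allowed_svg_properties=[],
    )

    serializer = HTMLSerializer(
        quote_attr_values='always',
        omit_optional_tags=False,
        sanitize=False,                 # the filter above already sanitized
        alphabetical_attributes=False,  # the filter already orders attributes
    )
    return serializer.render(filtered)


print(clean_fragment('an <script>evil()</script> <em>fragment</em>'))

Splitting the work across parser, filter, and serializer is what lets this PR drop the old tokenizer-level sanitizer that html5lib removed in 0.99999999.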