
Rewrite to use html5lib >= 0.99999999 #250

Merged · 12 commits · Feb 24, 2017
12 changes: 7 additions & 5 deletions .travis.yml
@@ -11,12 +11,14 @@ python:
   - "3.6"
   - "pypy"
 env:
-  - HTML5LIB=0.999 # 3
-  - HTML5LIB=0.999999 # 6
-  - HTML5LIB=0.9999999 # 7
+  - HTML5LIB=0.99999999 # 8
+  - HTML5LIB=0.999999999 # 9
 install:
-  - pip install -r requirements.txt
-  - pip install html5lib==$HTML5LIB
+  # html5lib 0.99999999 (8 9s) requires at least setuptools 18.5
+  - pip install -U pip setuptools>=18.5
+  - pip install -r requirements.txt
+  # stomp on html5lib install with the specified one
+  - pip install html5lib==$HTML5LIB
 script:
   - py.test
   - flake8 bleach/
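A quick local sanity check for the same floor the matrix above enforces. This is a sketch, not part of the PR; it assumes only that html5lib exposes __version__ and that pkg_resources (from the setuptools the install step already upgrades) is importable.

# Sketch: confirm the locally installed html5lib meets the new minimum
# before running the test suite, mirroring the floor pinned in the matrix.
import html5lib
from pkg_resources import parse_version

MINIMUM = '0.99999999'  # 8 9s: the first release with the new sanitizing API

if parse_version(html5lib.__version__) < parse_version(MINIMUM):
    raise RuntimeError('html5lib %s is too old; this branch needs >= %s'
                       % (html5lib.__version__, MINIMUM))
print('html5lib %s satisfies the >= %s floor' % (html5lib.__version__, MINIMUM))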
19 changes: 15 additions & 4 deletions CHANGES
@@ -8,10 +8,19 @@ Version 2.0 (in development)
 
 - Removed support for Python 2.6. #206
 - Removed support for Python 3.2. #224
+- Bleach no longer supports html5lib < 0.99999999 (8 9s).
+
+  This version represents a rewrite to use the new sanitizing API since
+  the old one was dropped in html5lib 0.99999999 (8 9s).
+
+- linkify no longer accepts a tokenizer argument.
+- clean output is different than in previous versions; particularly this version
+  will add end tags even if the tag will be escaped.
 
 **Changes**
 
-- Added testing for Python 3.6.
+- Supports Python 3.6.
+- Supports html5lib >= 0.99999999 (8 9s).
 
 
 Version 1.5 (November 4th, 2016)
@@ -20,9 +29,11 @@ Version 1.5 (November 4th, 2016)
 **Backwards incompatible changes**
 
 - clean: The list of ``ALLOWED_PROTOCOLS`` now defaults to http, https and
-  mailto. Previously it was a long list of protocols something like ed2k, ftp,
-  http, https, irc, mailto, news, gopher, nntp, telnet, webcal, xmpp, callto,
-  feed, urn, aim, rsync, tag, ssh, sftp, rtsp, afs, data. #149
+  mailto.
+
+  Previously it was a long list of protocols something like ed2k, ftp, http,
+  https, irc, mailto, news, gopher, nntp, telnet, webcal, xmpp, callto, feed,
+  urn, aim, rsync, tag, ssh, sftp, rtsp, afs, data. #149
 
 **Changes**
 
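A minimal sketch of what the backwards incompatible notes above mean for callers. The escaped outputs in the comments are illustrative rather than exact, and the snippet assumes the bleach 2.0 API introduced by this PR.

import bleach

# clean() keeps its signature, but disallowed markup is now escaped with an
# explicit end tag instead of being left unterminated.
print(bleach.clean('<span>not allowed'))
# 1.5: roughly '&lt;span&gt;not allowed'
# 2.0: roughly '&lt;span&gt;not allowed&lt;/span&gt;'

# linkify() no longer accepts a tokenizer argument; drop it and rely on
# callbacks (or skip_pre / parse_email) instead.
print(bleach.linkify('see example.com', parse_email=False))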
2 changes: 0 additions & 2 deletions README.rst
@@ -101,5 +101,3 @@ The simplest way to use Bleach is:
 .. _GitHub: https://github.com/mozilla/bleach
 .. _ReadTheDocs: https://bleach.readthedocs.io/
 .. _PyPI: http://pypi.python.org/pypi/bleach
-
-
70 changes: 44 additions & 26 deletions bleach/__init__.py
@@ -5,13 +5,14 @@
 import re
 
 import html5lib
-from html5lib.sanitizer import HTMLSanitizer
-from html5lib.serializer.htmlserializer import HTMLSerializer
+from html5lib.filters import sanitizer
+from html5lib.filters.sanitizer import allowed_protocols
+from html5lib.serializer import HTMLSerializer
 
-from . import callbacks as linkify_callbacks
-from .encoding import force_unicode
-from .sanitizer import BleachSanitizer
-from .version import __version__, VERSION  # flake8: noqa
+from bleach import callbacks as linkify_callbacks
+from bleach.encoding import force_unicode
+from bleach.sanitizer import BleachSanitizerFilter
+from bleach.version import __version__, VERSION  # flake8: noqa
 
 __all__ = ['clean', 'linkify']
 
@@ -60,16 +61,14 @@
 # Make sure that .com doesn't get matched by .co first
 TLDS.reverse()
 
-PROTOCOLS = HTMLSanitizer.acceptable_protocols
-
 url_re = re.compile(
     r"""\(*  # Match any opening parentheses.
     \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)?  # http://
     ([\w-]+\.)+(?:{1})(?:\:[0-9]+)?(?!\.\w)\b  # xx.yy.tld(:##)?
     (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
         # /path/zz (excluding "unsafe" chars from RFC 1738,
         # except for # and ~, which happen in practice)
-    """.format('|'.join(PROTOCOLS), '|'.join(TLDS)),
+    """.format('|'.join(allowed_protocols), '|'.join(TLDS)),
     re.IGNORECASE | re.VERBOSE | re.UNICODE)
 
 proto_re = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE)
@@ -86,8 +85,6 @@
     """,
     re.IGNORECASE | re.MULTILINE | re.VERBOSE)
 
-NODE_TEXT = 4  # The numeric ID of a text node in simpletree.
-
 ETREE_TAG = lambda x: "".join(['{http://www.w3.org/1999/xhtml}', x])
 # a simple routine that returns the tag name with the namespace prefix
 # as returned by etree's Element.tag attribute
@@ -119,27 +116,48 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
     :arg strip: whether or not to strip disallowed elements
     :arg strip_comments: whether or not to strip HTML comments
 
+    :returns: cleaned text as unicode
+
     """
     if not text:
-        return ''
+        return u''
 
     text = force_unicode(text)
 
-    class s(BleachSanitizer):
-        allowed_elements = tags
-        allowed_attributes = attributes
-        allowed_css_properties = styles
-        allowed_protocols = protocols
-        strip_disallowed_elements = strip
-        strip_html_comments = strip_comments
+    parser = html5lib.HTMLParser(namespaceHTMLElements=False)
+    dom = parser.parseFragment(text)
+
+    walker = html5lib.getTreeWalker('etree')
+    filtered = BleachSanitizerFilter(
+        source=walker(dom),
+
+        # Bleach-sanitizer-specific things
+        allowed_attributes_map=attributes,
+        strip_disallowed_elements=strip,
+        strip_html_comments=strip_comments,
+
+        # html5lib-sanitizer things
+        allowed_elements=tags,
+        allowed_css_properties=styles,
+        allowed_protocols=protocols,
+        allowed_svg_properties=[],
+
+    )
+    s = HTMLSerializer(
+        quote_attr_values='always',
+        omit_optional_tags=False,
 
-    parser = html5lib.HTMLParser(tokenizer=s)
+        # Bleach has its own sanitizer, so don't use the html5lib one
+        sanitize=False,
 
-    return _render(parser.parseFragment(text))
+        # Bleach sanitizer alphabetizes already, so don't use the html5lib one
+        alphabetical_attributes=False,
+    )
+    return s.render(filtered)
 
 
 def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False,
-            parse_email=False, tokenizer=HTMLSanitizer):
+            parse_email=False):
     """Convert URL-like strings in an HTML fragment to links
 
     ``linkify()`` converts strings that look like URLs, domain names and email
@@ -156,12 +174,12 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False,
     text = force_unicode(text)
 
     if not text:
-        return ''
+        return u''
 
-    parser = html5lib.HTMLParser(tokenizer=tokenizer)
+    parser = html5lib.HTMLParser()
 
     forest = parser.parseFragment(text)
-    _seen = set([])
+    _seen = set()
 
     def replace_nodes(tree, new_frag, node, index=0):
         """Doesn't really replace nodes, but inserts the nodes contained in
@@ -427,7 +445,7 @@ def _render(tree):
 def _serialize(domtree):
     walker = html5lib.treewalkers.getTreeWalker('etree')
     stream = walker(domtree)
-    serializer = HTMLSerializer(quote_attr_values=True,
+    serializer = HTMLSerializer(quote_attr_values='always',
                                 alphabetical_attributes=True,
                                 omit_optional_tags=False)
     return serializer.render(stream)
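Pulled out of the diff above, this is roughly the pipeline the new clean() builds: parse a fragment, walk it through BleachSanitizerFilter, then serialize with html5lib's own sanitizer and attribute alphabetization turned off. It is a sketch that assumes the bleach.sanitizer.BleachSanitizerFilter added elsewhere in this PR and the module-level ALLOWED_* defaults; it is not a drop-in replacement for clean().

import html5lib
from html5lib.serializer import HTMLSerializer

from bleach import ALLOWED_ATTRIBUTES, ALLOWED_PROTOCOLS, ALLOWED_STYLES, ALLOWED_TAGS
from bleach.sanitizer import BleachSanitizerFilter


def clean_fragment(text):
    # Parse without namespacing so tag names match the allowed list.
    parser = html5lib.HTMLParser(namespaceHTMLElements=False)
    dom = parser.parseFragment(text)

    # Sanitizing now happens as a tree-walking filter, not in the tokenizer.
    walker = html5lib.getTreeWalker('etree')
    filtered = BleachSanitizerFilter(
        source=walker(dom),
        allowed_attributes_map=ALLOWED_ATTRIBUTES,
        strip_disallowed_elements=False,
        strip_html_comments=True,
        allowed_elements=ALLOWED_TAGS,
        allowed_css_properties=ALLOWED_STYLES,
        allowed_protocols=ALLOWED_PROTOCOLS,
        allowed_svg_properties=[],
    )

    serializer = HTMLSerializer(
        quote_attr_values='always',
        omit_optional_tags=False,
        sanitize=False,                 # the filter above already sanitized
        alphabetical_attributes=False,  # the filter already orders attributes
    )
    return serializer.render(filtered)


print(clean_fragment('an <script>evil()</script> <em>fragment</em>'))

Splitting the work across parser, filter, and serializer is what lets this PR drop the old tokenizer-level sanitizer that html5lib removed in 0.99999999.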