openedx · feanil · Jun 13, 2024 · May 28, 2024 · May 28, 2024 · Jun 10, 2024
@@ -10,7 +10,7 @@
 import urllib
 from functools import wraps
 
-import bleach
+import nh3
 from django.db import transaction
 from django.db.models import Q
 from django.http import HttpResponse, HttpResponseBadRequest, HttpResponseForbidden, HttpResponseServerError
@@ -88,8 +88,8 @@ def search_certificates(request):
         ]
 
     """
-    unbleached_filter = urllib.parse.unquote(urllib.parse.quote_plus(request.GET.get("user", "")))
-    user_filter = bleach.clean(unbleached_filter)
+    uncleaned_filter = urllib.parse.unquote(urllib.parse.quote_plus(request.GET.get("user", "")))
+    user_filter = nh3.clean(uncleaned_filter)
     if not user_filter:
         msg = _("user is not given.")
         return HttpResponseBadRequest(msg)

diff --git a/lms/djangoapps/courseware/views/views.py b/lms/djangoapps/courseware/views/views.py
@@ -10,7 +10,7 @@
 from datetime import datetime
 from urllib.parse import quote_plus, urlencode, urljoin, urlparse, urlunparse
 
-import bleach
+import nh3
 import requests
 from django.conf import settings
 from django.contrib.auth.decorators import login_required
@@ -1550,7 +1550,7 @@ def render_xblock(request, usage_key_string, check_if_enrolled=True, disable_sta
     requested_view = request.GET.get('view', 'student_view')
     if requested_view != 'student_view' and requested_view != 'public_view':  # lint-amnesty, pylint: disable=consider-using-in
         return HttpResponseBadRequest(
-            f"Rendering of the xblock view '{bleach.clean(requested_view, strip=True)}' is not supported."
+            f"Rendering of the xblock view '{nh3.clean(requested_view)}' is not supported."
         )
 
     staff_access = has_access(request.user, 'staff', course_key)

diff --git a/lms/djangoapps/discussion/django_comment_client/utils.py b/lms/djangoapps/discussion/django_comment_client/utils.py
@@ -1038,7 +1038,7 @@ def sanitize_body(body):
     This is possibly overly broad, and might tamper with legitimate posts that
     contain this code in fenced code blocks. As far as we can tell, this is an
     extra layer of protection, and current handling in the front end and using
-    bleach for HTML rendering on the server side should cover these cases.
+    nh3 for HTML rendering on the server side should cover these cases.
     """
     if not body:
         return body

diff --git a/lms/djangoapps/discussion/rest_api/render.py b/lms/djangoapps/discussion/rest_api/render.py
@@ -4,17 +4,16 @@
 Note that this module is designed to imitate the front end behavior as
 implemented in Markdown.Sanitizer.js.
 """
-import bleach
+import nh3
 import markdown
 
-ALLOWED_TAGS = bleach.ALLOWED_TAGS | {
+ALLOWED_TAGS = nh3.ALLOWED_TAGS | {
     'br', 'dd', 'del', 'dl', 'dt', 'h1', 'h2', 'h3', 'h4', 'hr', 'img', 'kbd', 'p', 'pre', 's',
     'strike', 'sub', 'sup', 'table', 'thead', 'th', 'tbody', 'tr', 'td', 'tfoot'
 }
-ALLOWED_PROTOCOLS = {"http", "https", "ftp", "mailto"}
 ALLOWED_ATTRIBUTES = {
-    "a": ["href", "title", "target", "rel"],
-    "img": ["src", "alt", "title", "width", "height"],
+    "a": {"href", "title", "target", "rel"},
+    "img": {"src", "alt", "title", "width", "height"},
 }
 
 
@@ -25,17 +24,16 @@ def render_body(raw_body):
     This includes the following steps:
 
     * Convert Markdown to HTML
-    * Sanitise HTML using bleach
+    * Sanitise HTML using nh3
 
     Note that this does not prevent Markdown syntax inside a MathJax block from
     being processed, which the forums JavaScript code does.
     """
     rendered_html = markdown.markdown(raw_body)
-    sanitised_html = bleach.clean(
+    sanitised_html = nh3.clean(
         rendered_html,
         tags=ALLOWED_TAGS,
-        protocols=ALLOWED_PROTOCOLS,
-        strip=True,
-        attributes=ALLOWED_ATTRIBUTES
+        attributes=ALLOWED_ATTRIBUTES,
+        link_rel=None,
     )
     return sanitised_html
diff --git a/lms/djangoapps/discussion/rest_api/tests/test_render.py b/lms/djangoapps/discussion/rest_api/tests/test_render.py
@@ -84,7 +84,7 @@ def test_protocols_img_tag(self, protocol, is_allowed):
 
     def test_script_tag(self):
         raw_body = '<script type="text/javascript">alert("evil script");</script>'
-        assert render_body(raw_body) == 'alert("evil script");'
+        assert render_body(raw_body) == ''
 
     @ddt.data(
         ("br", '<p>foo<br>bar</p>'),  # br is allowed inside p

diff --git a/lms/templates/courseware/progress_graph.js b/lms/templates/courseware/progress_graph.js
@@ -1,6 +1,6 @@
 <%page args="grade_summary, grade_cutoffs, graph_div_id, show_grade_breakdown = True, show_grade_cutoffs = True, **kwargs"/>
 <%!
-    import bleach
+    import nh3
     import json
     import math
     import six
@@ -74,7 +74,7 @@ $(function () {
       ## allowing the display of such images, and remove any previously stored HTML
       ## to prevent ugly HTML from being shown to learners.
       ## xss-lint: disable=javascript-jquery-append
-      ticks.append( [tickIndex, bleach.clean(section['label'], tags=set(), strip=True)] )
+      ticks.append( [tickIndex, nh3.clean(section['label'], tags=set())] )
 
       if section['category'] in detail_tooltips:
           ## xss-lint: disable=javascript-jquery-append

diff --git a/lms/templates/lti.html b/lms/templates/lti.html
@@ -61,7 +61,7 @@ <h4 class="hd hd-4 error_message">
 % if has_score and comment:
     <h4 class="hd hd-4 problem-feedback-label">${_("Feedback on your work from the grader:")}</h4>
     <div class="problem-feedback">
-        ## sanitized with bleach in view
+        ## sanitized with nh3 in view
         ${comment | n, decode.utf8}
     </div>
 % endif

diff --git a/openedx/core/djangoapps/debug/views.py b/openedx/core/djangoapps/debug/views.py
@@ -5,7 +5,7 @@
 """
 
 
-import bleach
+import nh3
 from django.http import HttpResponseNotFound
 from django.template import TemplateDoesNotExist
 from django.utils.translation import gettext as _
@@ -54,4 +54,4 @@ def show_reference_template(request, template):
 
         return render_to_response(template, context)
     except TemplateDoesNotExist:
-        return HttpResponseNotFound(f'Missing template {bleach.clean(template, strip=True)}')
+        return HttpResponseNotFound(f'Missing template {nh3.clean(template)}')
@@ -5,7 +5,7 @@
 import urllib.parse as parse  # pylint: disable=import-error
 from urllib.parse import parse_qs, urlsplit, urlunsplit  # pylint: disable=import-error
 
-import bleach
+import nh3
 from django.conf import settings
 from django.contrib.auth import logout
 from django.shortcuts import redirect
@@ -60,7 +60,7 @@ def target(self):
         #  >> /courses/course-v1:ARTS+D1+2018_T/course/
         #  to handle this scenario we need to encode our URL using quote_plus and then unquote it again.
         if target_url:
-            target_url = bleach.clean(parse.unquote(parse.quote_plus(target_url)))
+            target_url = nh3.clean(parse.unquote(parse.quote_plus(target_url)))
 
         use_target_url = target_url and is_safe_login_or_logout_redirect(
             redirect_to=target_url,

@@ -5,7 +5,7 @@
 import urllib
 from unittest import mock
 import ddt
-import bleach
+import nh3
 from django.conf import settings
 from django.test import TestCase
 from django.test.utils import override_settings
@@ -237,6 +237,6 @@ def test_logout_redirect_failure_with_xss_vulnerability(self, redirect_url, host
         )
         response = self.client.get(url, HTTP_HOST=host)
         expected = {
-            'target': bleach.clean(urllib.parse.unquote(redirect_url)),
+            'target': nh3.clean(urllib.parse.unquote(redirect_url)),
         }
         self.assertDictContainsSubset(expected, response.context_data)
diff --git a/openedx/core/djangolib/markup.py b/openedx/core/djangolib/markup.py
@@ -4,7 +4,7 @@
 
 
 import markupsafe
-import bleach
+import nh3
 from lxml.html.clean import Cleaner
 from mako.filters import decode
 
@@ -53,7 +53,7 @@ def strip_all_tags_but_br(string_to_strip):
         string_to_strip = ""
 
     string_to_strip = decode.utf8(string_to_strip)
-    string_to_strip = bleach.clean(string_to_strip, tags={'br'}, strip=True)
+    string_to_strip = nh3.clean(string_to_strip, tags={'br'})
 
     return HTML(string_to_strip)
 

diff --git a/requirements/edx/base.txt b/requirements/edx/base.txt
@@ -56,7 +56,6 @@ billiard==4.2.0
     # via celery
 bleach[css]==6.1.0
     # via
-    #   -r requirements/edx/kernel.in
     #   edx-enterprise
     #   lti-consumer-xblock
     #   openedx-django-wiki
@@ -714,6 +713,8 @@ newrelic==9.10.0
     # via
     #   -r requirements/edx/bundled.in
     #   edx-django-utils
+nh3==0.2.17
+    # via -r requirements/edx/kernel.in
 nltk==3.8.1
     # via chem
 nodeenv==1.9.1

diff --git a/requirements/edx/development.txt b/requirements/edx/development.txt
@@ -1232,6 +1232,10 @@ newrelic==9.10.0
     #   -r requirements/edx/doc.txt
     #   -r requirements/edx/testing.txt
     #   edx-django-utils
+nh3==0.2.17
+    # via
+    #   -r requirements/edx/doc.txt
+    #   -r requirements/edx/testing.txt
 nltk==3.8.1
     # via
     #   -r requirements/edx/doc.txt

diff --git a/requirements/edx/doc.txt b/requirements/edx/doc.txt
@@ -838,6 +838,8 @@ newrelic==9.10.0
     # via
     #   -r requirements/edx/base.txt
     #   edx-django-utils
+nh3==0.2.17
+    # via -r requirements/edx/base.txt
 nltk==3.8.1
     # via
     #   -r requirements/edx/base.txt

diff --git a/requirements/edx/kernel.in b/requirements/edx/kernel.in
@@ -24,7 +24,6 @@ acid-xblock                         # This XBlock is used for unit tests as well
 analytics-python                    # Used for Segment analytics
 attrs                               # Reduces boilerplate code involving class attributes
 Babel                               # Internationalization utilities, used for date formatting in a few places
-bleach[css]                         # Allowed-list-based HTML sanitizing library that escapes or strips markup and attributes; used for capa and LTI
 boto                                # Deprecated version of the AWS SDK; we should stop using this
 boto3                               # Amazon Web Services SDK for Python
 botocore                            # via boto3, s3transfer
@@ -111,6 +110,7 @@ Markdown                            # Convert text markup to HTML; used in capa
 meilisearch                         # Library to access Meilisearch search engine (will replace ElasticSearch)
 mongoengine                         # Object-document mapper for MongoDB, used in the LMS dashboard
 mysqlclient                         # Driver for the default production relational database
+nh3                                 # Python bindings to the ammonia (whitelist-based HTML sanitizing library); used for capa and LTI
 nodeenv                             # Utility for managing Node.js environments; we use this for deployments and testing
 oauthlib                            # OAuth specification support for authenticating via LTI or other Open edX services
 olxcleaner

diff --git a/requirements/edx/testing.txt b/requirements/edx/testing.txt
@@ -917,6 +917,8 @@ newrelic==9.10.0
     # via
     #   -r requirements/edx/base.txt
     #   edx-django-utils
+nh3==0.2.17
+    # via -r requirements/edx/base.txt
 nltk==3.8.1
     # via
     #   -r requirements/edx/base.txt

diff --git a/xmodule/capa/inputtypes.py b/xmodule/capa/inputtypes.py
@@ -47,7 +47,7 @@
 import time
 from datetime import datetime
 
-import bleach
+import nh3
 import html5lib
 import pyparsing
 import six
@@ -800,7 +800,7 @@ def setup_code_response_rendering(self):
         if self.status == 'incomplete':
             self.status = 'queued'
             self.queue_len = self.msg  # lint-amnesty, pylint: disable=attribute-defined-outside-init
-            self.msg = bleach.clean(self.submitted_msg)
+            self.msg = nh3.clean(self.submitted_msg)
 
     def setup(self):
         """ setup this input type """

diff --git a/xmodule/capa/tests/test_inputtypes.py b/xmodule/capa/tests/test_inputtypes.py
@@ -916,7 +916,7 @@ def test_matlab_queue_message_not_allowed_tag(self):
         }
         elt = etree.fromstring(self.xml)
         the_input = self.input_class(test_capa_system(), elt, state)
-        expected = "&lt;script&gt;Test message&lt;/script&gt;"
+        expected = ""
         assert the_input.queue_msg == expected
 
     def test_matlab_sanitize_msg(self):
@@ -925,7 +925,7 @@ def test_matlab_sanitize_msg(self):
         """
         not_allowed_tag = 'script'
         self.the_input.msg = "<{0}>Test message</{0}>".format(not_allowed_tag)
-        expected = "&lt;script&gt;Test message&lt;/script&gt;"
+        expected = ""
         assert self.the_input._get_render_context()['msg'] == expected  # pylint: disable=protected-access
 
 

diff --git a/xmodule/capa/tests/test_util.py b/xmodule/capa/tests/test_util.py
@@ -121,7 +121,7 @@ def test_compare_with_tolerance(self):  # lint-amnesty, pylint: disable=too-many
 
     def test_sanitize_html(self):
         """
-        Test for html sanitization with bleach.
+        Test for html sanitization with nh3.
         """
         allowed_tags = ['div', 'p', 'audio', 'pre', 'span']
         for tag in allowed_tags:
@@ -130,7 +130,7 @@ def test_sanitize_html(self):
 
         not_allowed_tag = 'script'
         queue_msg = "<{0}>Test message</{0}>".format(not_allowed_tag)
-        expected = "&lt;script&gt;Test message&lt;/script&gt;"
+        expected = ""
         assert sanitize_html(queue_msg) == expected
 
     def test_get_inner_html_from_xpath(self):
@@ -142,7 +142,7 @@ def test_get_inner_html_from_xpath(self):
 
     def test_remove_markup(self):
         """
-        Test for markup removal with bleach.
+        Test for markup removal with nh3.
         """
         assert remove_markup('The <mark>Truth</mark> is <em>Out There</em> & you need to <strong>find</strong> it') ==\
                'The Truth is Out There &amp; you need to find it'

diff --git a/xmodule/capa/util.py b/xmodule/capa/util.py
@@ -8,11 +8,10 @@
 from cmath import isinf, isnan
 from decimal import Decimal
 
-import bleach
+import nh3
 from calc import evaluator
 from lxml import etree
 
-from bleach.css_sanitizer import CSSSanitizer
 from openedx.core.djangolib.markup import HTML
 
 #-----------------------------------------------------------------------------
@@ -182,17 +181,15 @@ def sanitize_html(html_code):
 
     Used to sanitize XQueue responses from Matlab.
     """
-    attributes = bleach.ALLOWED_ATTRIBUTES.copy()
+    attributes = nh3.ALLOWED_ATTRIBUTES.copy()
     attributes.update({
-        '*': ['class', 'style', 'id'],
-        'audio': ['controls', 'autobuffer', 'autoplay', 'src'],
-        'img': ['src', 'width', 'height', 'class']
+        '*': {'class', 'style', 'id'},
+        'audio': {'controls', 'autobuffer', 'autoplay', 'src'},
+        'img': {'src', 'width', 'height', 'class'}
     })
-    output = bleach.clean(
+    output = nh3.clean(
         html_code,
-        protocols=bleach.ALLOWED_PROTOCOLS | {'data'},
-        tags=bleach.ALLOWED_TAGS | {'div', 'p', 'audio', 'pre', 'img', 'span'},
-        css_sanitizer=CSSSanitizer(allowed_css_properties=["white-space"]),
+        tags=nh3.ALLOWED_TAGS | {'div', 'p', 'audio', 'pre', 'img', 'span'},
         attributes=attributes
     )
     return output
@@ -215,12 +212,12 @@ def remove_markup(html):
     """
     Return html with markup stripped and text HTML-escaped.
 
-    >>> bleach.clean("<b>Rock & Roll</b>", tags=set(), strip=True)
+    >>> nh3.clean("<b>Rock & Roll</b>", tags=set())
     'Rock &amp; Roll'
-    >>> bleach.clean("<b>Rock &amp; Roll</b>", tags=set(), strip=True)
+    >>> nh3.clean("<b>Rock &amp; Roll</b>", tags=set())
     'Rock &amp; Roll'
     """
-    return HTML(bleach.clean(html, tags=set(), strip=True))
+    return HTML(nh3.clean(html, tags=set()))
 
 
 def get_course_id_from_capa_block(capa_block):

diff --git a/xmodule/capa_block.py b/xmodule/capa_block.py
@@ -14,7 +14,7 @@
 import sys
 import traceback
 
-from bleach.sanitizer import Cleaner
+import nh3
 from django.conf import settings
 from django.core.exceptions import ImproperlyConfigured
 from django.utils.encoding import smart_str
@@ -619,7 +619,7 @@ def index_dictionary(self):
         capa_content = re.sub(
             r"(\s|&nbsp;|//)+",
             " ",
-            Cleaner(tags=[], strip=True).clean(capa_content)
+            nh3.clean(capa_content, tags=set())
         )
 
         capa_body = {