sanitize snippets

codeallthethingz · codeallthethingz · commit def39cc9b949 · 2025-03-11T08:30:43.000-07:00
diff --git a/docs/contributing/documentation.md b/docs/contributing/documentation.md
@@ -155,6 +155,7 @@ python -m tools.github_readme_sync.cli check docs
 
 See the [Style Guide](style-guide.md#images) images section for details about creating and referencing images correctly.
 
+
 # VS Code Snippets
 
 > 👍 You have access to VS Code snippets
diff --git a/docs/contributing/pull-requests/pull-request-flow.md b/docs/contributing/pull-requests/pull-request-flow.md
@@ -1,7 +1,7 @@
 ---
 title: Pull Request Flow
 ---
-![Blue: Contributor responsibility  <br/>Gray: Maintainer responsibility](../../figures/contributing/pull_request_flow.png)
+![Blue: Contributor responsibility  <br>Gray: Maintainer responsibility](../../figures/contributing/pull_request_flow.png)
 
 
 The Pull Request flow begins when you **Create a Pull Request** to our [GitHub repository](https://github.com/thousandbrainsproject/tbp.monty/pulls).
diff --git a/docs/figures/contributing/pull_request_flow.md b/docs/figures/contributing/pull_request_flow.md
@@ -20,7 +20,7 @@ flowchart LR
   NBCP -- No --> UPR2(Update Pull Request):::contributor
   UPR2 --> NBCP
   NBCP -- Yes --> M(Merge):::maintainer
-  M --> AMCP{Post-merge<br/>checks and tasks<br/>pass?}
+  M --> AMCP{Post-merge<br>checks and tasks<br>pass?}
   AMCP -- No --> RV(((Revert))):::endFail
   AMCP -- Yes --> D(((Done))):::endSuccess
 
diff --git a/docs/snippets/edit-this-page.md b/docs/snippets/edit-this-page.md
@@ -12,24 +12,37 @@
     margin-top: -10px;
     align-items: center;
     border: 1px solid #CCCCCC;
-    text-decoration: none;
+    text-decoration: none !important;
 }
 .contribution-button:hover {
     background-color: #DDDDDD;
     cursor: pointer;
     color: #000000;
+    text-decoration:none;
+}
+.contribution-button img {
+    display: inline-block; 
+    vertical-align: middle;
+}
+.contribution-button div {
+    margin-left: 5px;
+}
+.contribution-button span {
+    margin:0;
+    padding:0;
 }
 </style>
-<br/><br/>
+<br><br>
 
 ----
 
 #### Help Us Make This Page Better
 
 All our docs are open-source. If something is wrong or unclear, submit a PR to fix it!
-
-<a class="contribution-button" style="text-decoration:none" href="!!LINK!!" target="_blank">
-    <img src="https://raw.githubusercontent.com/primer/octicons/main/icons/git-pull-request-16.svg" width="16" height="16" style="display: inline-block; vertical-align: middle;" alt="Pull request icon">
-<span style="margin-left: 5px;">Make a Contribution</span></a>
+<div>
+<a class="contribution-button" href="!!LINK!!" target="_blank">
+    <img src="https://raw.githubusercontent.com/primer/octicons/main/icons/git-pull-request-16.svg" width="16" height="16" alt="Pull request icon">
+<div>Make a Contribution</div></a>
+</div>
 
 [Learn how to contribute to our docs](../contributing/documentation.md)
diff --git a/tools/github_readme_sync/readme.py b/tools/github_readme_sync/readme.py
@@ -15,6 +15,7 @@
 import os
 import re
 from collections import OrderedDict
+from copy import deepcopy
 from typing import Any, List, Tuple
 from urllib.parse import parse_qs
 
@@ -262,20 +263,12 @@ def replace_match(match):
     def create_or_update_doc(
         self, order: int, category_id: str, doc: dict, parent_id: str, file_path: str
     ) -> Tuple[str, bool]:
-        body = doc["body"]
-        body = self.insert_edit_this_page(body, doc["slug"], file_path)
-        body = self.insert_markdown_snippet(body, file_path)
-        body = self.convert_csv_to_html_table(body, file_path)
-        body = self.correct_image_locations(body)
-        body = self.correct_file_locations(body)
-        body = self.convert_note_tags(body)
-        body = self.parse_images(body)
-        body = self.convert_cloudinary_videos(body)
+        markdown = self.process_markdown(doc["body"], file_path, doc["slug"])
 
         create_doc_request = {
             "title": doc["title"],
             "type": "basic",
-            "body": body,
+            "body": markdown,
             "category": category_id,
             "hidden": doc.get("hidden", False),
             "order": order,
@@ -301,6 +294,40 @@ def create_or_update_doc(
 
         return doc_id, created
 
+    def process_markdown(self, body: str, file_path: str, slug: str) -> str:
+        """Run the markdown transformation steps over a document body.
+
+        The steps run in the fixed order written below; each one receives
+        the output of the previous step.
+
+        Args:
+            body: Markdown text of the document.
+            file_path: Forwarded to the edit-this-page, snippet and CSV steps.
+            slug: Document slug, forwarded to the edit-this-page step.
+
+        Returns:
+            The body after all steps have been applied.
+        """
+        # insert_edit_this_page receives the slug as its `filename` argument.
+        body = self.insert_edit_this_page(body, slug, file_path)
+        body = self.insert_markdown_snippet(body, file_path)
+        body = self.convert_csv_to_html_table(body, file_path)
+        body = self.correct_image_locations(body)
+        body = self.correct_file_locations(body)
+        body = self.convert_note_tags(body)
+        body = self.parse_images(body)
+        body = self.convert_cloudinary_videos(body)
+        return body
+
+    def sanitize_html(self, body: str) -> str:
+        """Clean HTML with nh3 using extended copies of its default allow-lists.
+
+        Args:
+            body: HTML text to clean.
+
+        Returns:
+            The string returned by ``nh3.clean``.
+        """
+        # Work on copies so nh3's module-level defaults are not mutated.
+        allowed_attributes = deepcopy(nh3.ALLOWED_ATTRIBUTES)
+        allowed_tags = deepcopy(nh3.ALLOWED_TAGS)
+
+        allowed_tags.add("style")
+        allowed_tags.add("a")
+        allowed_tags.add("label")
+        # Only tags that already have an entry in the attribute allow-list are
+        # extended; tags without an entry gain no attributes from this loop.
+        for tag in allowed_attributes:
+            allowed_attributes[tag].add("width")
+            allowed_attributes[tag].add("style")
+            allowed_attributes[tag].add("target")
+            allowed_attributes[tag].add("class")
+
+        return nh3.clean(
+            body,
+            tags=allowed_tags,
+            attributes=allowed_attributes,
+            # NOTE(review): None turns off nh3's automatic rel attribute on
+            # links while "target" is allowed above - confirm this is intended.
+            link_rel=None,
+            strip_comments=False,
+            generic_attribute_prefixes={"data-"},
+            # NOTE(review): presumably overrides nh3's default so that the
+            # content of the allowed <style> tag is kept - confirm.
+            clean_content_tags={"script"},
+        )
+
     def insert_edit_this_page(self, body: str, filename: str, file_path: str) -> str:
         depth = len(file_path.split("/")) - 1
         relative_path = "../" * depth
@@ -468,7 +495,9 @@ def replace_match(match):
 
             try:
                 with open(snippet_path, "r") as f:
-                    return f.read()
+                    unsafe_content = f.read()
+                    return self.sanitize_html(unsafe_content)
+
             except Exception as e:
                 return f"[File not found or could not be read: {snippet_path}]"
 
diff --git a/tools/github_readme_sync/tests/readme_test.py b/tools/github_readme_sync/tests/readme_test.py
@@ -661,6 +661,32 @@ def test_insert_markdown_snippet(self):
             )
             self.assertIn("File not found", result)
 
+    def test_sanitize_html_removes_scripts(self):
+        """sanitize_html drops <script> blocks and keeps surrounding markup."""
+        html_with_script = """
+        <div>
+            <h1>Test Content</h1>
+            <p>This is a test paragraph</p>
+            <script>
+                alert('This is a malicious script');
+                document.cookie = "session=stolen";
+            </script>
+            <p>More content after the script</p>
+        </div>
+        """
+
+        sanitized_html = self.readme.sanitize_html(html_with_script)
+
+        # Verify script tag is removed
+        self.assertNotIn("<script>", sanitized_html)
+        self.assertNotIn("</script>", sanitized_html)
+        self.assertNotIn("alert('This is a malicious script')", sanitized_html)
+        self.assertNotIn("document.cookie", sanitized_html)
+
+        # Verify legitimate content is preserved
+        self.assertIn("<h1>Test Content</h1>", sanitized_html)
+        self.assertIn("<p>This is a test paragraph</p>", sanitized_html)
+        self.assertIn("<p>More content after the script</p>", sanitized_html)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tools/github_readme_sync/upload.py b/tools/github_readme_sync/upload.py
@@ -19,6 +19,7 @@
 
 def upload(new_hierarchy, file_path: str, rdme: ReadMe):
     logging.info(f"Uploading export folder: {file_path}")
+    logging.info(f"Url: https://thousandbrainsproject.readme.io/v{rdme.version}/docs")
     rdme.create_version_if_not_exists()
     to_be_deleted = get_all_categories_docs(rdme)