From e1a50ba39b78db289ca7884ac091401f6deeec62 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sondre=20Gr=C3=B8n=C3=A5s?=
 <44143748+sondregronas@users.noreply.github.com>
Date: Fri, 23 Aug 2024 18:27:30 +0200
Subject: [PATCH] Update web_scraper.py: only prettify text/html responses

---
 .github/workflows/web_scraper.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/web_scraper.py b/.github/workflows/web_scraper.py
index 30fe13d..6658a0d 100644
--- a/.github/workflows/web_scraper.py
+++ b/.github/workflows/web_scraper.py
@@ -22,7 +22,12 @@ def get_html(link):
     """Get the html from the given url, and append the new links to the links list."""
     print(f"Visiting {url}/{link.strip("/")}")
     r = requests.get(f'{url}/{link.strip("/")}', allow_redirects=True)
-    html = bs(r.text, "html.parser").prettify()
+
+    # Only prettify if mimetype is text/html
+    if "text/html" in r.headers.get("Content-Type"):
+        html = bs(r.text, "html.parser").prettify()
+    else:
+        html = r.text
 
     new_links = get_links(html, path=link.strip("/"))
     new_media_links = get_media_links(html, path=link.strip("/"))