From e1a50ba39b78db289ca7884ac091401f6deeec62 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sondre=20Gr=C3=B8n=C3=A5s?= <44143748+sondregronas@users.noreply.github.com>
Date: Fri, 23 Aug 2024 18:27:30 +0200
Subject: [PATCH] Update web_scraper.py

---
Notes (review):
  * get_html() now prettifies the response body only when the server
    declares it as text/html; any other body is passed through unchanged.
  * The Content-Type lookup uses an empty-string default so a response
    without that header takes the non-HTML branch instead of raising
    TypeError ("in" applied to None).
  * This patch was reconstructed from a whitespace-collapsed copy. The
    hunk header counts (-22,7 +22,12) imply one blank context line; it is
    assumed to sit directly before `new_links` -- confirm against the
    target file before applying.
  * The post-image blob id on the `index` line predates the Content-Type
    default fix and is therefore stale.

 .github/workflows/web_scraper.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/web_scraper.py b/.github/workflows/web_scraper.py
index 30fe13d..6658a0d 100644
--- a/.github/workflows/web_scraper.py
+++ b/.github/workflows/web_scraper.py
@@ -22,7 +22,12 @@ def get_html(link):
     """Get the html from the given url, and append the new links to the links list."""
     print(f"Visiting {url}/{link.strip("/")}")
     r = requests.get(f'{url}/{link.strip("/")}', allow_redirects=True)
-    html = bs(r.text, "html.parser").prettify()
+
+    # Only prettify text/html responses; a missing Content-Type counts as non-HTML
+    if "text/html" in r.headers.get("Content-Type", ""):
+        html = bs(r.text, "html.parser").prettify()
+    else:
+        html = r.text
 
     new_links = get_links(html, path=link.strip("/"))
     new_media_links = get_media_links(html, path=link.strip("/"))