diff --git a/.github/workflows/web_scraper.py b/.github/workflows/web_scraper.py index a917bca..b87fe42 100644 --- a/.github/workflows/web_scraper.py +++ b/.github/workflows/web_scraper.py @@ -21,11 +21,22 @@ os.chdir(os.path.dirname(os.path.abspath(__file__))) +def unquote_path(path): + new_path = unquote(path) + new_path = new_path.replace("&", "&") + return new_path + + def get_html(link): """Get the html from the given url, and append the new links to the links list.""" print(f"Visiting {url}/{link.strip('/')}") r = requests.get(f'{url}/{link.strip("/")}', allow_redirects=True) + visited.append(link) + + if not r.ok: + return + # Only prettify if mimetype is text/html if "text/html" in r.headers.get("Content-Type"): html = str(bs(r.text, "html.parser")) @@ -45,8 +56,6 @@ def get_html(link): if l not in media_links: media_links.append(l) - visited.append(link) - # TODO: this is a hack. hopefully temporary. html = re.sub(r"""/api/generate_thumbnail/([^?]*)(\?[^"]*)""", r"/api/generate_thumbnail/\1.webp", html) @@ -92,6 +101,8 @@ def download_site(): for link in links: if link not in visited: html = get_html(link) + if not html: + continue if link == "/": path = "index.html" else: @@ -107,12 +118,12 @@ def download_site(): path = link.strip("/") r = requests.get(f"{url}/{path}", allow_redirects=True) os.makedirs(os.path.dirname(f"demo/{path}"), exist_ok=True) - path = unquote(path) + path = unquote_path(path) # TODO: this is a hack. hopefully temporary. if "/api/generate_thumbnail/" in link: path = path.rsplit("?")[0] + ".webp" - if not path: + if not path or not r.ok: continue with open(f"demo/{path}", "wb+") as f: f.write(r.content) diff --git a/piggy/api.py b/piggy/api.py index e30717f..91b7c55 100644 --- a/piggy/api.py +++ b/piggy/api.py @@ -1,7 +1,10 @@ +from hashlib import md5 +from html import unescape + from flask import Blueprint, request + from piggy.thumbnails import create_thumbnail from piggy.utils import serve_pil_image -from hashlib import md5 api_routes = Blueprint("api", __name__, url_prefix="/api") @@ -9,8 +12,7 @@ @api_routes.route("/generate_thumbnail/") def generate_thumbnail(text: str, request=request): """Generate a thumbnail image with the given text and query parameters.""" - # get query parameters from the request - text = text.replace("_", " ") + text = unescape(text) bg_color = request.args.get("bg_color", "") text_color = request.args.get("text_color", "") width = request.args.get("width", 500)