Skip to content

Commit

Permalink
ci(fix): fix escaped characters in web_scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
sondregronas committed Sep 11, 2024
1 parent 8cf0246 commit 91bdee8
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 7 deletions.
19 changes: 15 additions & 4 deletions .github/workflows/web_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,22 @@
os.chdir(os.path.dirname(os.path.abspath(__file__)))


def unquote_path(path):
    """Return *path* with URL percent-escapes and ``&amp;`` entities decoded.

    The path is first percent-decoded with ``urllib.parse.unquote`` and any
    ``&amp;`` left in the result is then collapsed to a literal ``&``.

    Args:
        path: A URL path, possibly containing ``%XX`` escapes and/or
            HTML-escaped ampersands.

    Returns:
        The decoded path string.
    """
    new_path = unquote(path)
    # Presumably the links come from HTML markup, where "&" is written as
    # "&amp;" -- verify against the caller. Replacing "&" with itself (as the
    # entity-decoded form of this line would read) is a no-op.
    new_path = new_path.replace("&amp;", "&")
    return new_path


def get_html(link):
"""Get the html from the given url, and append the new links to the links list."""
print(f"Visiting {url}/{link.strip('/')}")
r = requests.get(f'{url}/{link.strip("/")}', allow_redirects=True)

visited.append(link)

if not r.ok:
return

# Only prettify if mimetype is text/html
if "text/html" in r.headers.get("Content-Type"):
html = str(bs(r.text, "html.parser"))
Expand All @@ -45,8 +56,6 @@ def get_html(link):
if l not in media_links:
media_links.append(l)

visited.append(link)

# TODO: this is a hack. hopefully temporary.
html = re.sub(r"""/api/generate_thumbnail/([^?]*)(\?[^"]*)""", r"/api/generate_thumbnail/\1.webp", html)

Expand Down Expand Up @@ -92,6 +101,8 @@ def download_site():
for link in links:
if link not in visited:
html = get_html(link)
if not html:
continue
if link == "/":
path = "index.html"
else:
Expand All @@ -107,12 +118,12 @@ def download_site():
path = link.strip("/")
r = requests.get(f"{url}/{path}", allow_redirects=True)
os.makedirs(os.path.dirname(f"demo/{path}"), exist_ok=True)
path = unquote(path)
path = unquote_path(path)
# TODO: this is a hack. hopefully temporary.
if "/api/generate_thumbnail/" in link:
path = path.rsplit("?")[0] + ".webp"

if not path:
if not path or not r.ok:
continue
with open(f"demo/{path}", "wb+") as f:
f.write(r.content)
Expand Down
8 changes: 5 additions & 3 deletions piggy/api.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
from hashlib import md5
from html import unescape

from flask import Blueprint, request

from piggy.thumbnails import create_thumbnail
from piggy.utils import serve_pil_image
from hashlib import md5

api_routes = Blueprint("api", __name__, url_prefix="/api")


@api_routes.route("/generate_thumbnail/<string:text>")
def generate_thumbnail(text: str, request=request):
"""Generate a thumbnail image with the given text and query parameters."""
# get query parameters from the request
text = text.replace("_", " ")
text = unescape(text)
bg_color = request.args.get("bg_color", "")
text_color = request.args.get("text_color", "")
width = request.args.get("width", 500)
Expand Down

0 comments on commit 91bdee8

Please sign in to comment.