From 208ddd957752a5fbdf17f0af9b61a97d65161e69 Mon Sep 17 00:00:00 2001
From: eggy <d7chen@uwaterloo.ca>
Date: Wed, 31 Jul 2024 20:44:57 -0400
Subject: [PATCH] feat: add source batoto

---
 mandown/sources/__init__.py      |   1 +
 mandown/sources/source_batoto.py | 132 +++++++++++++++++++++++++++++++
 tests/test_source_batoto.py      |  17 +++++
 3 files changed, 150 insertions(+)
 create mode 100644 mandown/sources/source_batoto.py
 create mode 100644 tests/test_source_batoto.py

diff --git a/mandown/sources/__init__.py b/mandown/sources/__init__.py
index d42d7ab..db5b920 100644
--- a/mandown/sources/__init__.py
+++ b/mandown/sources/__init__.py
@@ -12,6 +12,7 @@ import types
 
 
 from . import (
+    source_batoto,
     source_blogtruyenmoi,
     source_comixextra,
     source_kuaikanmanhua,
diff --git a/mandown/sources/source_batoto.py b/mandown/sources/source_batoto.py
new file mode 100644
index 0000000..6833d5e
--- /dev/null
+++ b/mandown/sources/source_batoto.py
@@ -0,0 +1,132 @@
+"""
+Source file for bato.to
+"""
+
+import html
+import json
+import re
+from typing import cast
+
+import requests
+from bs4 import BeautifulSoup
+
+from ..base import BaseChapter, BaseMetadata
+from .base_source import BaseSource
+
+# Seconds to wait for bato.to before giving up; requests never times out by default.
+REQUEST_TIMEOUT = 30
+
+# Captures the "<id>-<slug>" path segment that follows /title/ in a comic or chapter URL.
+TITLE_ID_PATTERN = re.compile(r"/title/([^/?#]+)")
+
+
+class BatotoSource(BaseSource):
+    """Source for comics hosted under https://bato.to/title/<id>."""
+
+    name = "Bato.to"
+    domains = ["https://bato.to"]
+
+    def __init__(self, url: str) -> None:
+        super().__init__(url)
+        self.id = self.url_to_id(url)
+        # Chapter URLs are accepted too, so always point self.url at the comic's own page.
+        self.url = f"https://bato.to/title/{self.id}"
+        # HTML of the comic page, cached by _get_scripts() after the first download.
+        self._scripts: str | None = None
+
+    def fetch_metadata(self) -> BaseMetadata:
+        """Scrape title, authors, genres, description and cover art from the comic page."""
+        soup = BeautifulSoup(self._get_scripts(), "lxml")
+
+        # The cover image's alt text doubles as the comic's title.
+        cover_art_el = soup.select_one("img.not-prose")
+        title = cast(str, cover_art_el["alt"])
+        cover_art = cast(str, cover_art_el["src"])
+        # dict.fromkeys drops duplicate names but, unlike a set, keeps them in page order.
+        authors = list(
+            dict.fromkeys(strip_parentheses(item.text) for item in soup.select("div.mt-2 a"))
+        )
+        genres = [
+            strip_parentheses(g.text)
+            for g in soup.select(
+                "div.space-y-2 > div.flex.items-center.flex-wrap > span > :nth-child(1)"
+            )
+        ]
+        description = soup.select_one("astro-island > div > .prose > .limit-html-p").text.strip()
+
+        return BaseMetadata(title, authors, self.url, genres, description, cover_art)
+
+    def fetch_chapter_list(self) -> list[BaseChapter]:
+        """List the comic's chapters in the order the comic page shows them."""
+        soup = BeautifulSoup(self._get_scripts(), "lxml")
+
+        # NOTE(review): "?load=2" presumably selects the all-pages reader mode; confirm.
+        return [
+            BaseChapter(item.text, f"https://bato.to{item['href']}?load=2")
+            for item in soup.select('div[name="chapter-list"] div.space-x-1 > a:nth-child(1)')
+        ]
+
+    def fetch_chapter_image_list(self, chapter: BaseChapter) -> list[str]:
+        """Return the chapter's image URLs in the order the reader page lists them."""
+        soup = BeautifulSoup(_fetch_html(chapter.url), "lxml")
+
+        # The reader's <astro-island> stores its data as JSON in the "props" attribute, e.g.:
+        # {'pageOpts': [0, {'load': [0, '2'], 'marg': [0, '0'], 'zoom': [0, '0']}],
+        #  'imageFiles': [1, '[[0,"https://xfs-n03.xfsbb.com/comic/.../45869815_2560.webp"]]'],
+        #  'urlP': [0, 0]}
+        # so imageFiles[1] is itself a JSON string holding [0, url] pairs.
+        data = json.loads(
+            html.unescape(
+                soup.select_one('astro-island[component-url^="/_astro/ImageList"]')["props"]
+            )
+        )
+
+        return [image[1] for image in json.loads(data["imageFiles"][1])]
+
+    def _get_scripts(self) -> str:
+        """Return the comic page's HTML, downloading it only on the first call."""
+        # Compare against None so an empty page is cached as well instead of being refetched.
+        if self._scripts is None:
+            self._scripts = _fetch_html(self.url)
+        return self._scripts
+
+    @classmethod
+    def url_to_id(cls, url: str) -> str:
+        """
+        Extract the comic identifier from a comic or chapter URL.
+
+        e.g., https://bato.to/title/115663-my-not-so-fair-lady-is-doomed/2950514-ch_15
+        yields "115663-my-not-so-fair-lady-is-doomed"; a query string or fragment is dropped.
+
+        Raises ValueError when the URL has no /title/<id> segment.
+        """
+        match = TITLE_ID_PATTERN.search(url)
+        if match is None:
+            raise ValueError(f"Not a bato.to title URL: {url}")
+        return match.group(1)
+
+    @staticmethod
+    def check_url(url: str) -> bool:
+        """Return whether this source can handle url (a bato.to comic or chapter URL)."""
+        # Dots are escaped so that only the literal host bato.to matches.
+        return bool(re.match(r"https://bato\.to/title/[^/?#]+", url))
+
+
+def get_class() -> type[BaseSource]:
+    """Return the source class this module provides."""
+    return BatotoSource
+
+
+def strip_parentheses(text: str) -> str:
+    """Return the part of text before its first "(", with surrounding whitespace removed."""
+    return text.partition("(")[0].strip()
+
+
+def _fetch_html(url: str) -> str:
+    """Download url and return the response body as text."""
+    response = requests.get(url, timeout=REQUEST_TIMEOUT)
+    # Without a charset in Content-Type, requests assumes ISO-8859-1 for text responses,
+    # which turns UTF-8 punctuation such as em dashes into mojibake.
+    if "charset" not in response.headers.get("content-type", "").lower():
+        response.encoding = "utf-8"
+    return response.text
diff --git a/tests/test_source_batoto.py b/tests/test_source_batoto.py
new file mode 100644
index 0000000..92d74b9
--- /dev/null
+++ b/tests/test_source_batoto.py
@@ -0,0 +1,17 @@
+from common import is_source_working, skip_in_ci
+
+URL = "https://bato.to/title/148152-calvin-hobbes"
+COVER_URL = "https://xfs-n02.xfsbb.com/thumb/W600/ampi/338/338d61287365ec7247ce40b729ebf7de865862fc_474_503_48513.jpeg"
+DESCRIPTION = "Calvin and Hobbes follows the humorous antics of the title characters: Calvin, a precocious, mischievous, and adventurous six-year-old boy; and Hobbes, his sardonic stuffed tiger. Set in the suburban United States of the 1980s and 1990s, the strip depicts Calvin's frequent flights of fancy and friendship with Hobbes. It also examines Calvin's relationships with his long-suffering parents and with his classmates, especially his neighbor Susie Derkins. Hobbes's dual nature is a defining motif for the strip: to Calvin, Hobbes is a living anthropomorphic tiger, while all the other characters seem to see Hobbes as an inanimate stuffed toy\u2014though Watterson has not clarified exactly how Hobbes is perceived by others. Though the series does not frequently mention specific political figures or ongoing events, it does explore broad issues like environmentalism, public education, and philosophical quandaries"
+
+
+@skip_in_ci
+def test_calvinhobbes() -> None:
+    return is_source_working(
+        URL,
+        title="Calvin & Hobbes",
+        authors=["Bill Watterson"],
+        genres=["Comic", "Kodomo", "Shoujo", "Shounen", "Comedy", "Kids", "Slice of Life"],
+        description=DESCRIPTION,
+        cover_art=COVER_URL,
+    )