Commit 3703180 (1 parent: 5a1cf04). Showing 2 changed files with 98 additions and 0 deletions.
New file, 97 lines added:
""" | ||
Source file for blogtruyenmoi.com | ||
""" | ||
# pylint: disable=invalid-name | ||
|
||
import re | ||
|
||
import requests | ||
from bs4 import BeautifulSoup | ||
|
||
from ..base import BaseChapter, BaseMetadata | ||
from .base_source import BaseSource | ||
|
||
|
||
class BlogTruyenMoiSource(BaseSource): | ||
name = "Blog Truyen" | ||
domains = ["https://blogtruyenmoi.com"] | ||
headers = {"Referer": "https://blogtruyenmoi.com/"} | ||
|
||
def __init__(self, url: str) -> None: | ||
super().__init__(url) | ||
self.id = self.url_to_id(url) # we make a GET request here which is not ideal | ||
self.url = f"https://blogtruyenmoi.com/{self.id}" | ||
self._scripts: str | None = None | ||
|
||
def fetch_metadata(self) -> BaseMetadata: | ||
soup = BeautifulSoup(self._get_scripts(), "lxml") | ||
|
||
title: str = ( | ||
soup.select_one("div#breadcrumbs > span:nth-child(2)") | ||
.text.split(" > ")[-1] | ||
.strip() | ||
) | ||
authors: list[str] = [ | ||
soup.select_one("div.description > p:nth-child(1) > span").text | ||
] | ||
genres: list[str] = [ | ||
string.text.strip() for string in soup.select("span.category > a") | ||
] | ||
cover_art: str = soup.select_one(".thumbnail > img")["src"] | ||
|
||
        # Collect consecutive non-empty paragraphs as the description,
        # stopping at the first blank one.
        description_list: list[str] = []
        for para in soup.select("div.detail > div.content > div.content > p"):
            stripped = para.text.strip()
            if not stripped:
                break
            description_list.append(stripped)
        description = "\n".join(description_list)

        return BaseMetadata(title, authors, self.url, genres, description, cover_art)

    def fetch_chapter_list(self) -> list[BaseChapter]:
        soup = BeautifulSoup(self._get_scripts(), "lxml")

        chapters = [
            BaseChapter(c.text.strip(), f"https://blogtruyenmoi.com/{c['href']}")
            for c in soup.select("div#list-chapters > p > span > a")
        ]
        chapters.reverse()
        return chapters

    def fetch_chapter_image_list(self, chapter: BaseChapter) -> list[str]:
        soup = BeautifulSoup(requests.get(chapter.url).text, "lxml")
        return [el["src"] for el in soup.select("article#content > img")]

    def _get_scripts(self) -> str:
        # Cache the series page HTML so repeated metadata/chapter calls
        # only fetch it once.
        if self._scripts:
            return self._scripts

        self._scripts = requests.get(self.url).text or ""
        return self._scripts

    @classmethod
    def url_to_id(cls, url: str) -> str:
        items = list(filter(None, url.split("/")))
        # e.g., for https://blogtruyenmoi.com/31844/doc-la-dungeon
        # we want '31844'
        # for https://blogtruyenmoi.com/c842985/doc-la-dungeon-chap-62
        # we want to crawl the page and find the link back to the ID

        try:
            int(items[2])  # raises ValueError if this segment is not a numeric ID
            return items[2]
        except ValueError:
            soup = BeautifulSoup(requests.get(url).text, "lxml")
            return (
                soup.select_one("header div.breadcrumbs > a:not([href='/'])")
                .attrs["href"]
                .split("/")[1]
            )

    @staticmethod
    def check_url(url: str) -> bool:
        # Match either a series URL (/<id>/...) or a chapter URL (/c<id>/...).
        return bool(
            re.match(r"https://blogtruyenmoi\.com/((\d+(/.*)?)|(c\d+/.*))", url)
        )


def get_class() -> type[BaseSource]:
    return BlogTruyenMoiSource
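For context, a minimal usage sketch of how this source could be exercised, not part of the commit: it assumes the class is constructed directly with a series URL (as __init__ suggests) and is already importable in scope; the example URL is taken from the comments in url_to_id.

# Usage sketch (assumption, not part of the commit): drive the source by hand.
url = "https://blogtruyenmoi.com/31844/doc-la-dungeon"  # example URL from url_to_id's comments
if BlogTruyenMoiSource.check_url(url):
    source = BlogTruyenMoiSource(url)       # resolves the numeric series ID, may issue a GET
    metadata = source.fetch_metadata()      # BaseMetadata: title, authors, genres, description, cover art
    chapters = source.fetch_chapter_list()  # list[BaseChapter], reversed into ascending order
    pages = source.fetch_chapter_image_list(chapters[0])  # image URLs for the first chapter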