Commit

feat: add blogtruyenmoi
potatoeggy committed Nov 4, 2023
1 parent 5a1cf04 commit 3703180
Showing 2 changed files with 98 additions and 0 deletions.
1 change: 1 addition & 0 deletions mandown/sources/__init__.py
@@ -13,6 +13,7 @@
import types

from . import (
source_blogtruyenmoi,
source_mangadex,
source_mangakakalot,
source_manganato,
97 changes: 97 additions & 0 deletions mandown/sources/source_blogtruyenmoi.py
@@ -0,0 +1,97 @@
"""
Source file for blogtruyenmoi.com
"""
# pylint: disable=invalid-name

import re

import requests
from bs4 import BeautifulSoup

from ..base import BaseChapter, BaseMetadata
from .base_source import BaseSource


class BlogTruyenMoiSource(BaseSource):
name = "Blog Truyen"
domains = ["https://blogtruyenmoi.com"]
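    # the image host appears to reject hotlinked requests without a Referer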
headers = {"Referer": "https://blogtruyenmoi.com/"}

def __init__(self, url: str) -> None:
super().__init__(url)
        self.id = self.url_to_id(url)  # may issue a GET request (for chapter URLs), which is not ideal
self.url = f"https://blogtruyenmoi.com/{self.id}"
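        # cache the page HTML so fetch_metadata and fetch_chapter_list share one request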
self._scripts: str | None = None

def fetch_metadata(self) -> BaseMetadata:
soup = BeautifulSoup(self._get_scripts(), "lxml")

title: str = (
soup.select_one("div#breadcrumbs > span:nth-child(2)")
.text.split(" > ")[-1]
.strip()
)
authors: list[str] = [
soup.select_one("div.description > p:nth-child(1) > span").text
]
genres: list[str] = [
string.text.strip() for string in soup.select("span.category > a")
]
cover_art: str = soup.select_one(".thumbnail > img")["src"]

description_list: list[str] = []
for para in soup.select("div.detail > div.content > div.content > p"):
stripped = para.text.strip()
if not stripped:
break
description_list.append(stripped)
description = "\n".join(description_list)

return BaseMetadata(title, authors, self.url, genres, description, cover_art)

def fetch_chapter_list(self) -> list[BaseChapter]:
soup = BeautifulSoup(self._get_scripts(), "lxml")

chapters = [
BaseChapter(c.text.strip(), f"https://blogtruyenmoi.com/{c['href']}")
for c in soup.select("div#list-chapters > p > span > a")
]
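        # the page lists chapters newest-first; reverse into reading order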
chapters.reverse()
return chapters

def fetch_chapter_image_list(self, chapter: BaseChapter) -> list[str]:
soup = BeautifulSoup(requests.get(chapter.url).text, "lxml")
return [el["src"] for el in soup.select("article#content > img")]

def _get_scripts(self) -> str:
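        # despite the name, this returns (and caches) the full page HTML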
if self._scripts:
return self._scripts

self._scripts = requests.get(self.url).text or ""
return self._scripts

@classmethod
def url_to_id(cls, url: str) -> str:
items = list(filter(None, url.split("/")))
# e.g., for https://blogtruyenmoi.com/31844/doc-la-dungeon
# we want '31844'
# for https://blogtruyenmoi.com/c842985/doc-la-dungeon-chap-62
# we want to crawl the page and find the link back to the ID

        try:
            int(items[2])  # validate that this path segment is the numeric ID
            return items[2]
        except ValueError:
soup = BeautifulSoup(requests.get(url).text, "lxml")
return (
soup.select_one("header div.breadcrumbs > a:not([href='/'])")
.attrs["href"]
.split("/")[1]
)

@staticmethod
def check_url(url: str) -> bool:
        return bool(
            re.match(r"https://blogtruyenmoi\.com/((\d+(/.*)?)|(c\d+/.*))", url)
        )


def get_class() -> type[BaseSource]:
return BlogTruyenMoiSource
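
A minimal usage sketch for the new source, assuming the BaseSource and BaseMetadata interfaces behave as used above; the series URL is the example from the comment in url_to_id:

from mandown.sources import source_blogtruyenmoi

# resolve the source class and point it at a series page
source = source_blogtruyenmoi.get_class()(
    "https://blogtruyenmoi.com/31844/doc-la-dungeon"
)
metadata = source.fetch_metadata()
chapters = source.fetch_chapter_list()
print(metadata.title, f"({len(chapters)} chapters)")

# image URLs for the first chapter, if any were found
if chapters:
    print(source.fetch_chapter_image_list(chapters[0])[:3])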
