Skip to content

Commit

Permalink
Merge pull request #76 from OpenPecha/tsadra-refactor
Browse files Browse the repository at this point in the history
Tsadra refactor
  • Loading branch information
kaldan007 authored Mar 23, 2021
2 parents f4550b3 + 0342a17 commit 2605a7b
Show file tree
Hide file tree
Showing 5 changed files with 35 additions and 62 deletions.
12 changes: 0 additions & 12 deletions openpecha/formatters/layers.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
"Author",
"Archaic",
"Span",
"CreditPage",
"Footnote",
]

Expand All @@ -38,7 +37,6 @@ class AnnType:
poti_title = "PotiTitle"
author = "Author"
chapter = "Chapter"
credit_page = "Credit_page"

topic = "Text"
sub_topic = "SubText"
Expand Down Expand Up @@ -70,9 +68,6 @@ class _attr_names:
START = "start"
END = "end"

# Credit page
CREDIT_PAGE_IMG_NAME = "credit_page_img_name"

# Page
PAGE_INDEX = "page_index" # Page number based on Volume specified, type: int
PAGE_INFO = "page_info" # Page information. type: str
Expand Down Expand Up @@ -178,13 +173,6 @@ def BookNumber(span):
return {_attr_names.SPAN: span}


def CreditPage(credit_page_img_name, span):
    """Build the attribute mapping for a credit-page annotation.

    The returned dict stores ``credit_page_img_name`` under
    ``_attr_names.CREDIT_PAGE_IMG_NAME`` and ``span`` under
    ``_attr_names.SPAN``.
    """
    attrs = {}
    attrs[_attr_names.CREDIT_PAGE_IMG_NAME] = credit_page_img_name
    attrs[_attr_names.SPAN] = span
    return attrs


def PotiTitle(span):
    """Build the attribute mapping for a poti-title annotation.

    The returned dict holds only ``span``, stored under ``_attr_names.SPAN``.
    """
    attrs = {}
    attrs[_attr_names.SPAN] = span
    return attrs

Expand Down
21 changes: 3 additions & 18 deletions openpecha/formatters/tsadra.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from .formatter import BaseFormatter
from .layers import *
from .layers import AnnType, CreditPage
from .layers import AnnType


class TsadraTemplate:
Expand All @@ -19,7 +19,6 @@ class TsadraTemplate:
"credits-page_front-page---text-author",
"credits-page_front-page---text-author1",
]
credit_page = "credits-page_epub-edition-line"
book_titles = [
"credits-page_front-title",
"tibetan-book-title",
Expand Down Expand Up @@ -77,7 +76,6 @@ def __init__(self, output_path="./output", metadata=None):
self.walker = 0 # The walker to traverse every character in the pecha
self.book_title = [] # list variable to store book title index
self.sub_title = []
self.credit_page = []
self.book_number = []
self.poti_title = []
self.author = [] # list variable to store author annotation index
Expand Down Expand Up @@ -167,18 +165,6 @@ def build_layers(self, html):
self.base_text += book_num + "\n"
self.walker += len(book_num) + 1

elif p["class"][0] == TsadraTemplate.credit_page:
credit_page = self.get_credit_page(p)
if credit_page:
self.credit_page.append(
(
None,
CreditPage(credit_page, Span(self.walker, self.walker + 1)),
)
)
self.base_text += " "
self.walker += 1

elif (
p["class"][0] in TsadraTemplate.author
): # to get the author annotation index
Expand Down Expand Up @@ -468,7 +454,6 @@ def get_result(self):
AnnType.sub_title: [self.sub_title],
AnnType.book_number: [self.book_number],
AnnType.poti_title: [self.poti_title],
AnnType.credit_page: [self.credit_page],
AnnType.author: [self.author],
AnnType.chapter: [self.chapter],
AnnType.topic: [self.topic],
Expand Down Expand Up @@ -537,8 +522,8 @@ def create_opf(self, input_path, id_):

# cover image path
image_path = input_path / "image"
(self.dirs["opf_path"] / "asset").mkdir(exist_ok=True)
os.system(f"cp -R {image_path} {self.dirs['opf_path']}/asset")
(self.dirs["opf_path"] / "assets").mkdir(exist_ok=True)
os.system(f"cp -R {image_path} {self.dirs['opf_path']}/assets")

# parse layers
for html in self.get_input(input_path):
Expand Down
41 changes: 27 additions & 14 deletions openpecha/serializers/epub.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,13 @@ class Tsadra_template:
footnote_EP = "</span></a>"
footnote_reference_SP = '<span class="tibetan-footnote-reference"'

toc_xpaths = {
"book-number": "//*[@class='tibetan-book-number']",
"chapter": "//*[@class='tibetan-chapters']",
"sabche": "//*[@class='tibetan-sabche1' or @class='tibetan-sabche']",
}
book_title_Xpath = "//*[@class='tibetan-book-title']"


class EpubSerializer(Serialize):
"""Epub serializer class for OpenPecha."""
Expand Down Expand Up @@ -82,10 +89,6 @@ def apply_annotation(self, vol_id, ann, uuid2localid):
elif ann["type"] == AnnType.peydurma:
start_payload = "#"
only_start_ann = True
elif ann["type"] == AnnType.credit_page:
credit_page_ann = ann["credit_page_img_name"]
start_payload = f'{Tsadra_template.credit_page_SP}<img src="{self.opf_path}/assets/image/{credit_page_ann}"/></span></p>\n'
only_start_ann = True
elif ann["type"] == AnnType.error_candidate:
start_payload = "["
end_payload = "]"
Expand Down Expand Up @@ -270,6 +273,14 @@ def get_footnote_references(self, footnotes):
footnote_references += f'{p_tag}<a href="#fm{footnote_id}">{Tsadra_template.footnote_reference_SP} id="fr{footnote_id}">{footnote["footnote_ref"]}</span></a></p>'
return footnote_references

def add_credit_page(self, result):
    """Insert the credit-page image paragraph right after the author line.

    The image file name is read from ``self.meta["source_metadata"]["credit"]``.
    When it is missing or empty, ``result`` is returned unchanged.

    Args:
        result: serialized HTML string of one volume.

    Returns:
        ``result`` with a newline, the credit-page ``<img>`` paragraph and
        another newline inserted directly after the first
        ``<p class="text-author">...</p>`` match, or ``result`` untouched
        when there is no credit image or no author paragraph.
    """
    credit_pg_name = self.meta["source_metadata"].get("credit", "")
    if not credit_pg_name:
        return result

    author_match = re.search('<p class="text-author">.+</p>', result)
    if author_match is None:
        # No author paragraph to anchor on: leave the HTML as it is instead
        # of failing on a missing match.
        return result

    credit_page_tag = f'\n{Tsadra_template.credit_page_SP}<img src="{self.opf_path}/assets/image/{credit_pg_name}"/></span></p>\n'
    # Splice by position rather than re.sub: the matched author text may
    # contain regex metacharacters, and the inserted path may contain
    # backslashes that re.sub would interpret as escapes/group references.
    insert_at = author_match.end()
    return result[:insert_at] + credit_page_tag + result[insert_at:]

def serialize(self, toc_levels={}, output_path="./output/epub_output"):
"""This module serialize .opf file to other format such as .epub etc. In case of epub,
we are using calibre ebook-convert command to do the conversion by passing our custom css template
Expand All @@ -290,6 +301,7 @@ def serialize(self, toc_levels={}, output_path="./output/epub_output"):

results = self.get_result()
for vol_id, result in results.items():
result = self.add_credit_page(result)
footnote_ref_tag = ""
if "Footnote" in self.layers:
footnote_fn = self.opf_path / "layers" / vol_id / "Footnote.yml"
Expand All @@ -311,20 +323,21 @@ def serialize(self, toc_levels={}, output_path="./output/epub_output"):
Path("template.css").write_bytes(template.content)
# Running ebook-convert command to convert html file to .epub (From calibre)
# XPath expression to detect chapter titles.
try:
level1_toc_Xpath = toc_levels["1"]
level2_toc_Xpath = toc_levels["2"]
level3_toc_Xpath = toc_levels["3"]
except Exception:
level1_toc_Xpath = ""
level2_toc_Xpath = ""
level3_toc_Xpath = ""
book_title_Xpath = "//*[@class='tibetan-book-title']"
level1_toc_Xpath = Tsadra_template.toc_xpaths.get(
toc_levels.get("1", ""), ""
)
level2_toc_Xpath = Tsadra_template.toc_xpaths.get(
toc_levels.get("2", ""), ""
)
level3_toc_Xpath = Tsadra_template.toc_xpaths.get(
toc_levels.get("3", ""), ""
)

cover_path = self.opf_path / f"assets/image/{cover_image}"
out_epub_fn = output_path / f"{self.meta['id']}.epub"
font_family = "Monlam Uni Ouchan2"
os.system(
f'ebook-convert {out_html_fn} {out_epub_fn} --extra-css=./template.css --embed-font-family="{font_family}" --page-breaks-before="{book_title_Xpath}" --cover={cover_path} --flow-size=0 --level1-toc="{level1_toc_Xpath}" --level2-toc="{level2_toc_Xpath}" --level3-toc="{level3_toc_Xpath}" --use-auto-toc --disable-font-rescaling'
f'ebook-convert {out_html_fn} {out_epub_fn} --extra-css=./template.css --embed-font-family="{font_family}" --page-breaks-before="{Tsadra_template.book_title_Xpath}" --cover={cover_path} --flow-size=0 --level1-toc="{level1_toc_Xpath}" --level2-toc="{level2_toc_Xpath}" --level3-toc="{level3_toc_Xpath}" --use-auto-toc --disable-font-rescaling'
)
# Removing html file and template file
os.system(f"rm {out_html_fn}")
Expand Down
22 changes: 5 additions & 17 deletions tests/integration/test_tsadra.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,13 @@

if __name__ == "__main__":

# ebook_path = "./output/demo/src/P000111/OEBPS/"
# opfs_path = "./output/demo/output"
# opf_path = "./output/demo/output/P000111/P000111.opf/"
# hfml_path = "./output/demo/output/P000111_hfml/"
# ebook_output_path = "./output/demo/output/ebooks"
# pecha_id = 111
# pecha_name = f"P{pecha_id:06}"

pecha_id = 112
pecha_name = f"P{pecha_id:06}"
ebook_path = f"./output/demo/src/tsadra_publication/{pecha_name}/OEBPS/"
opfs_path = "./output/demo/output"
opf_path = f"./output/demo/output/{pecha_name}/{pecha_name}.opf/"
ebook_path = f"./tests/data/serialize/tsadra/src/{pecha_name}/OEBPS/"
opfs_path = "./tests/data/serialize/tsadra"
opf_path = f"./tests/data/serialize/tsadra/{pecha_name}/{pecha_name}.opf/"
hfml_path = "./output/demo/output/P000113_hfml/"
ebook_output_path = "./output/demo/output/ebooks"
ebook_output_path = "./tests/data/serialize/tsadra/ebook"

# 1. Format Tsadra Ebook to OPF (OpenPecha Format)
# formatter = TsadraFormatter(output_path=opfs_path)
Expand All @@ -39,10 +31,6 @@
# '2': "//*[@class='tibetan-sabche1' or @class='tibetan-sabche']",
# '3': ""
# }
toc_levels = {
"1": "//*[@class='tibetan-book-number']",
"2": "//*[@class='tibetan-chapters']",
"3": "//*[@class='tibetan-sabche1' or @class='tibetan-sabche']",
}
toc_levels = {"1": "book-number", "2": "chapter", "3": "sabche"}
serializer = EpubSerializer(Path(opf_path))
serializer.serialize(toc_levels, ebook_output_path)
1 change: 0 additions & 1 deletion tests/test_formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,6 @@ def test_tsadra_formatter(self):
expected_result = {
AnnType.book_title: [[(None, {"span": {"start": 0, "end": 84}})]],
AnnType.sub_title: [[]],
AnnType.credit_page: [[]],
AnnType.book_number: [[]],
AnnType.poti_title: [[]],
AnnType.author: [
Expand Down

0 comments on commit 2605a7b

Please sign in to comment.