Merge pull request #76 from OpenPecha/tsadra-refactor

Tsadra refactor
OpenPecha · Mar 23, 2021 · 2605a7b · 2605a7b
2 parents f4550b3 + 0342a17
commit 2605a7b
Show file tree

Hide file tree

Showing 5 changed files with 35 additions and 62 deletions.
diff --git a/openpecha/formatters/layers.py b/openpecha/formatters/layers.py
@@ -26,7 +26,6 @@
     "Author",
     "Archaic",
     "Span",
-    "CreditPage",
     "Footnote",
 ]
 
@@ -38,7 +37,6 @@ class AnnType:
     poti_title = "PotiTitle"
     author = "Author"
     chapter = "Chapter"
-    credit_page = "Credit_page"
 
     topic = "Text"
     sub_topic = "SubText"
@@ -70,9 +68,6 @@ class _attr_names:
     START = "start"
     END = "end"
 
-    # Credit page
-    CREDIT_PAGE_IMG_NAME = "credit_page_img_name"
-
     # Page
     PAGE_INDEX = "page_index"  # Page number based on Volume specified, type: int
     PAGE_INFO = "page_info"  # Page information. type: str
@@ -178,13 +173,6 @@ def BookNumber(span):
     return {_attr_names.SPAN: span}
 
 
-def CreditPage(credit_page_img_name, span):
-    return {
-        _attr_names.CREDIT_PAGE_IMG_NAME: credit_page_img_name,
-        _attr_names.SPAN: span,
-    }
-
-
 def PotiTitle(span):
     return {_attr_names.SPAN: span}
 

diff --git a/openpecha/formatters/tsadra.py b/openpecha/formatters/tsadra.py
@@ -8,7 +8,7 @@
 
 from .formatter import BaseFormatter
 from .layers import *
-from .layers import AnnType, CreditPage
+from .layers import AnnType
 
 
 class TsadraTemplate:
@@ -19,7 +19,6 @@ class TsadraTemplate:
         "credits-page_front-page---text-author",
         "credits-page_front-page---text-author1",
     ]
-    credit_page = "credits-page_epub-edition-line"
     book_titles = [
         "credits-page_front-title",
         "tibetan-book-title",
@@ -77,7 +76,6 @@ def __init__(self, output_path="./output", metadata=None):
         self.walker = 0  # The walker to traverse every character in the pecha
         self.book_title = []  # list variable to store book title index
         self.sub_title = []
-        self.credit_page = []
         self.book_number = []
         self.poti_title = []
         self.author = []  # list variable to store author annotion index
@@ -167,18 +165,6 @@ def build_layers(self, html):
                     self.base_text += book_num + "\n"
                     self.walker += len(book_num) + 1
 
-            elif p["class"][0] == TsadraTemplate.credit_page:
-                credit_page = self.get_credit_page(p)
-                if credit_page:
-                    self.credit_page.append(
-                        (
-                            None,
-                            CreditPage(credit_page, Span(self.walker, self.walker + 1)),
-                        )
-                    )
-                    self.base_text += " "
-                    self.walker += 1
-
             elif (
                 p["class"][0] in TsadraTemplate.author
             ):  # to get the author annotation index
@@ -468,7 +454,6 @@ def get_result(self):
             AnnType.sub_title: [self.sub_title],
             AnnType.book_number: [self.book_number],
             AnnType.poti_title: [self.poti_title],
-            AnnType.credit_page: [self.credit_page],
             AnnType.author: [self.author],
             AnnType.chapter: [self.chapter],
             AnnType.topic: [self.topic],
@@ -537,8 +522,8 @@ def create_opf(self, input_path, id_):
 
         # cover image path
         image_path = input_path / "image"
-        (self.dirs["opf_path"] / "asset").mkdir(exist_ok=True)
-        os.system(f"cp -R {image_path} {self.dirs['opf_path']}/asset")
+        (self.dirs["opf_path"] / "assets").mkdir(exist_ok=True)
+        os.system(f"cp -R {image_path} {self.dirs['opf_path']}/assets")
 
         # parse layers
         for html in self.get_input(input_path):

diff --git a/openpecha/serializers/epub.py b/openpecha/serializers/epub.py
@@ -36,6 +36,13 @@ class Tsadra_template:
     footnote_EP = "</span></a>"
     footnote_reference_SP = '<span class="tibetan-footnote-reference"'
 
+    toc_xpaths = {
+        "book-number": "//*[@class='tibetan-book-number']",
+        "chapter": "//*[@class='tibetan-chapters']",
+        "sabche": "//*[@class='tibetan-sabche1' or @class='tibetan-sabche']",
+    }
+    book_title_Xpath = "//*[@class='tibetan-book-title']"
+
 
 class EpubSerializer(Serialize):
     """Epub serializer class for OpenPecha."""
@@ -82,10 +89,6 @@ def apply_annotation(self, vol_id, ann, uuid2localid):
         elif ann["type"] == AnnType.peydurma:
             start_payload = "#"
             only_start_ann = True
-        elif ann["type"] == AnnType.credit_page:
-            credit_page_ann = ann["credit_page_img_name"]
-            start_payload = f'{Tsadra_template.credit_page_SP}<img src="{self.opf_path}/assets/image/{credit_page_ann}"/></span></p>\n'
-            only_start_ann = True
         elif ann["type"] == AnnType.error_candidate:
             start_payload = "["
             end_payload = "]"
@@ -270,6 +273,14 @@ def get_footnote_references(self, footnotes):
             footnote_references += f'{p_tag}<a href="#fm{footnote_id}">{Tsadra_template.footnote_reference_SP} id="fr{footnote_id}">{footnote["footnote_ref"]}</span></a></p>'
         return footnote_references
 
+    def add_credit_page(self, result):
+        """Return `result` (serialized HTML str) with the credit-page image paragraph inserted after the first text-author paragraph; returned unchanged when no "credit" image is set in source_metadata or no author paragraph exists."""
+        author_pat = re.search('<p class="text-author">.+</p>', result)  # None when the HTML has no author paragraph
+        credit_pg_name = self.meta["source_metadata"].get("credit", "")
+        if credit_pg_name and author_pat:  # splice by position: matched HTML is data, not a safe regex pattern/template for re.sub
+            result = f'{result[: author_pat.end()]}\n{Tsadra_template.credit_page_SP}<img src="{self.opf_path}/assets/image/{credit_pg_name}"/></span></p>\n{result[author_pat.end() :]}'
+        return result
+
     def serialize(self, toc_levels={}, output_path="./output/epub_output"):
         """This module serialize .opf file to other format such as .epub etc. In case of epub,
         we are using calibre ebook-convert command to do the conversion by passing our custom css template
@@ -290,6 +301,7 @@ def serialize(self, toc_levels={}, output_path="./output/epub_output"):
 
         results = self.get_result()
         for vol_id, result in results.items():
+            result = self.add_credit_page(result)
             footnote_ref_tag = ""
             if "Footnote" in self.layers:
                 footnote_fn = self.opf_path / "layers" / vol_id / "Footnote.yml"
@@ -311,20 +323,21 @@ def serialize(self, toc_levels={}, output_path="./output/epub_output"):
             Path("template.css").write_bytes(template.content)
             # Running ebook-convert command to convert html file to .epub (From calibre)
             # XPath expression to detect chapter titles.
-            try:
-                level1_toc_Xpath = toc_levels["1"]
-                level2_toc_Xpath = toc_levels["2"]
-                level3_toc_Xpath = toc_levels["3"]
-            except Exception:
-                level1_toc_Xpath = ""
-                level2_toc_Xpath = ""
-                level3_toc_Xpath = ""
-            book_title_Xpath = "//*[@class='tibetan-book-title']"
+            level1_toc_Xpath = Tsadra_template.toc_xpaths.get(
+                toc_levels.get("1", ""), ""
+            )
+            level2_toc_Xpath = Tsadra_template.toc_xpaths.get(
+                toc_levels.get("2", ""), ""
+            )
+            level3_toc_Xpath = Tsadra_template.toc_xpaths.get(
+                toc_levels.get("3", ""), ""
+            )
+
             cover_path = self.opf_path / f"assets/image/{cover_image}"
             out_epub_fn = output_path / f"{self.meta['id']}.epub"
             font_family = "Monlam Uni Ouchan2"
             os.system(
-                f'ebook-convert {out_html_fn} {out_epub_fn} --extra-css=./template.css --embed-font-family="{font_family}" --page-breaks-before="{book_title_Xpath}" --cover={cover_path} --flow-size=0 --level1-toc="{level1_toc_Xpath}" --level2-toc="{level2_toc_Xpath}" --level3-toc="{level3_toc_Xpath}" --use-auto-toc --disable-font-rescaling'
+                f'ebook-convert {out_html_fn} {out_epub_fn} --extra-css=./template.css --embed-font-family="{font_family}" --page-breaks-before="{Tsadra_template.book_title_Xpath}" --cover={cover_path} --flow-size=0 --level1-toc="{level1_toc_Xpath}" --level2-toc="{level2_toc_Xpath}" --level3-toc="{level3_toc_Xpath}" --use-auto-toc --disable-font-rescaling'
             )
             # Removing html file and template file
             os.system(f"rm {out_html_fn}")

diff --git a/tests/integration/test_tsadra.py b/tests/integration/test_tsadra.py
@@ -5,21 +5,13 @@
 
 if __name__ == "__main__":
 
-    # ebook_path = "./output/demo/src/P000111/OEBPS/"
-    # opfs_path = "./output/demo/output"
-    # opf_path = "./output/demo/output/P000111/P000111.opf/"
-    # hfml_path = "./output/demo/output/P000111_hfml/"
-    # ebook_output_path = "./output/demo/output/ebooks"
-    # pecha_id = 111
-    # pecha_name = f"P{pecha_id:06}"
-
     pecha_id = 112
     pecha_name = f"P{pecha_id:06}"
-    ebook_path = f"./output/demo/src/tsadra_publication/{pecha_name}/OEBPS/"
-    opfs_path = "./output/demo/output"
-    opf_path = f"./output/demo/output/{pecha_name}/{pecha_name}.opf/"
+    ebook_path = f"./tests/data/serialize/tsadra/src/{pecha_name}/OEBPS/"
+    opfs_path = "./tests/data/serialize/tsadra"
+    opf_path = f"./tests/data/serialize/tsadra/{pecha_name}/{pecha_name}.opf/"
     hfml_path = "./output/demo/output/P000113_hfml/"
-    ebook_output_path = "./output/demo/output/ebooks"
+    ebook_output_path = "./tests/data/serialize/tsadra/ebook"
 
     # 1. Format Tsadra Ebook to OPF (OpenPecha Format)
     # formatter = TsadraFormatter(output_path=opfs_path)
@@ -39,10 +31,6 @@
     #     '2': "//*[@class='tibetan-sabche1' or @class='tibetan-sabche']",
     #     '3': ""
     # }
-    toc_levels = {
-        "1": "//*[@class='tibetan-book-number']",
-        "2": "//*[@class='tibetan-chapters']",
-        "3": "//*[@class='tibetan-sabche1' or @class='tibetan-sabche']",
-    }
+    toc_levels = {"1": "book-number", "2": "chapter", "3": "sabche"}
     serializer = EpubSerializer(Path(opf_path))
     serializer.serialize(toc_levels, ebook_output_path)
diff --git a/tests/test_formatter.py b/tests/test_formatter.py
@@ -307,7 +307,6 @@ def test_tsadra_formatter(self):
         expected_result = {
             AnnType.book_title: [[(None, {"span": {"start": 0, "end": 84}})]],
             AnnType.sub_title: [[]],
-            AnnType.credit_page: [[]],
             AnnType.book_number: [[]],
             AnnType.poti_title: [[]],
             AnnType.author: [