Fix issue wcember#24 - invalid epub

clach04 · clach04 · commit 274a97612030 · 2023-07-28T21:15:27.000-07:00
* Clarify epub version support - specifically epub2 / version 2
  * TODO notes in relation to title support
  * implement title per chapter to ensure epub is valid - epubcheck ERROR(RSC-005)
  * correct uuid support
  * ensure correct unique id is declared ERROR(OPF-048) and ERROR(RSC-005)
  * various missing html/xhtml tag fixes ERROR(HTM-004)
  * ISO/ANSI dates (date only at the moment) ERROR(OPF-054)
      * TODO note for dates, include timestamp
  * use valid xml id/names for chapters ERROR(HTM-004)
      * use consistent filenames for chapter that matches id

Allows EPUBCheck v5.1.0 to run clean with simple html chapter content.
diff --git a/Readme.md b/Readme.md
@@ -1,6 +1,6 @@
 # Pypub #
 
-Create epub's using python. Pypub is a python library to create epub files quickly without having to worry about the intricacies of the epub specification.
+Create epub (version 2) files using python. Pypub is a python library to create epub files quickly without having to worry about the intricacies of the epub specification.
 
 # Installation #
 The current release of pypub is available through pip:
@@ -38,6 +38,9 @@ To use code from a source code checkout from git
 
     $ python -m pip install -r requirements.txt
 
+# Useful Resources #
+
+  * https://www.eboundcanada.org/resources/whats-in-an-epub-the-opf-file/
 
 # Copyright and License #
 
diff --git a/pypub/chapter.py b/pypub/chapter.py
@@ -139,11 +139,15 @@ class Chapter(object):
     """
     def __init__(self, content, title, url=None):
         self._validate_input_types(content, title)
-        self.title = title
+        self.title = title # FIXME unused, apart from generating html_title, which is also unused
         self.content = content
         self._content_tree = BeautifulSoup(self.content, 'html.parser')
         self.url = url
-        self.html_title = cgi.escape(self.title, quote=True)
+        self.html_title = cgi.escape(self.title, quote=True)  # FIXME unused
+        # TODO inject title into head of content. Options:
+        #   here at init
+        #   same time _replace_images_in_chapter() is called
+        # in create_chapter_from_ABC() functions - ideally create_chapter_from_string() as single place
 
     def write(self, file_name):
         """
@@ -153,7 +157,7 @@ def write(self, file_name):
             file_name (str): The full name of the xhtml file to save to.
         """
         try:
-            assert file_name[-6:] == '.xhtml'
+            assert file_name[-6:] == '.xhtml'  # FIXME use .endswith()
         except (AssertionError, IndexError):
             raise ValueError('filename must end with .xhtml')
         with open(file_name, 'wb') as f:
@@ -288,8 +292,9 @@ def create_chapter_from_string(self, html_string, url=None, title=None):
             Chapter: A chapter object whose content is the given string
                 and whose title is that provided or inferred from the url
         """
-        clean_html_string = self.clean_function(html_string)
+        clean_html_string = self.clean_function(html_string, title=title)
         clean_xhtml_string = clean.html_to_xhtml(clean_html_string)
+        # TODO refactor/simplify below title logic (see clean())
         if title:
             pass
         else:
diff --git a/pypub/clean.py b/pypub/clean.py
@@ -34,7 +34,8 @@ def create_html_from_fragment(tag):
 
 
 def clean(input_string,
-          tag_dictionary=constants.SUPPORTED_TAGS):
+          tag_dictionary=constants.SUPPORTED_TAGS,
+          title=None):
     """
     Sanitizes HTML. Tags not contained as keys in the tag_dictionary input are
     removed, and child nodes are recursively moved to parent of removed node.
@@ -81,6 +82,14 @@ def clean(input_string,
     #wrap partial tree if necessary
     if root.find_all('html') == []:
         root = create_html_from_fragment(root)
+    # TODO ensure there is a head!
+    if root.html.head.title is None and title is None:  # leaves empty content alone
+        title = 'Ebook Chapter'  # FIXME same logic in Chapter()
+    if title:
+        # override
+        tmp_tag = root.new_tag('title')
+        tmp_tag.string = title
+        root.html.head.append(tmp_tag)
     # Remove img tags without src attribute
     image_node_list = root.find_all('img')
     for node in image_node_list:
diff --git a/pypub/epub.py b/pypub/epub.py
@@ -5,6 +5,7 @@
 import shutil
 import tempfile
 import time
+import uuid
 from zipfile import ZipFile, ZIP_DEFLATED
 
 import jinja2
@@ -89,7 +90,8 @@ def __init__(self, template_file=os.path.join(EPUB_TEMPLATES_DIR, 'toc.html'), *
         super(TocHtml, self).__init__(template_file, **non_chapter_parameters)
 
     def add_chapters(self, chapter_list):
-        chapter_numbers = range(len(chapter_list))
+        chapter_numbers = ['ch%03d' % n for n in range(len(chapter_list))]  # need to be valid XML names, do not start with numeric - TODO central function for this
+
         link_list = [str(n) + '.xhtml' for n in chapter_numbers]
         try:
             for c in chapter_list:
@@ -118,8 +120,8 @@ def __init__(self,
         super(TocNcx, self).__init__(template_file, **non_chapter_parameters)
 
     def add_chapters(self, chapter_list):
-        id_list = range(len(chapter_list))
-        play_order_list = [n + 1 for n in id_list]
+        id_list = ['ch%03d' % n for n in range(len(chapter_list))]  # need to be valid XML names, do not start with numeric - TODO central function for this
+        play_order_list = list(range(1, len(chapter_list) + 1))
         title_list = [c.title for c in chapter_list]
         link_list = [str(n) + '.xhtml' for n in id_list]
         super(TocNcx, self).add_chapters(**{'id': id_list,
@@ -137,7 +139,7 @@ def get_content_as_element(self):
 
 class ContentOpf(_EpubFile):
 
-    def __init__(self, title, creator='', language='', rights='', publisher='', uid='', date=time.strftime("%m-%d-%Y")):
+    def __init__(self, title, creator='', language='', rights='', publisher='', uid='', date=time.strftime("%Y-%m-%d")):  # FIXME ISO date format needed; include timestamp and TZ? For web server, check headers for last updated
         super(ContentOpf, self).__init__(os.path.join(EPUB_TEMPLATES_DIR, 'opf.xml'),
                                          title=title,
                                          creator=creator,
@@ -148,8 +150,8 @@ def __init__(self, title, creator='', language='', rights='', publisher='', uid=
                                          date=date)
 
     def add_chapters(self, chapter_list):
-        id_list = range(len(chapter_list))
-        link_list = [str(n) + '.xhtml' for n in id_list]
+        id_list = ['ch%03d' % n for n in range(len(chapter_list))]  # need to be valid XML names, do not start with numeric - TODO central function for this
+        link_list = [str(n) + '.xhtml' for n in id_list]  # be consistent with new IDs which need to be valid XML names
         super(ContentOpf, self).add_chapters(**{'id': id_list, 'link': link_list})
 
     def get_content_as_element(self):
@@ -187,11 +189,11 @@ def __init__(self, title, creator='pypub', language='en', rights='', publisher='
         self.language = language
         self.rights = rights
         self.publisher = publisher
-        self.uid = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(12))
+        self.uid = 'urn:uuid:%s' % uuid.uuid4()  # TODO allow user to pass something in (e.g. ISBN)
         self.current_chapter_number = None
         self._increase_current_chapter_number()
         self.toc_html = TocHtml()
-        self.toc_ncx = TocNcx()
+        self.toc_ncx = TocNcx(uid=self.uid)
         self.opf = ContentOpf(self.title, self.creator, self.language, self.rights, self.publisher, self.uid)
         self.mimetype = _Mimetype(self.EPUB_DIR)
         self.container = _ContainerFile(self.META_INF_DIR)
@@ -214,7 +216,7 @@ def _increase_current_chapter_number(self):
             self.current_chapter_number = 0
         else:
             self.current_chapter_number += 1
-        self.current_chapter_id = str(self.current_chapter_number)
+        self.current_chapter_id = 'ch%03d' % self.current_chapter_number  # TODO central function for chapter id gen? ncx, opf, etc.
         self.current_chapter_path = ''.join([self.current_chapter_id, '.xhtml'])
 
     def add_chapter(self, c):
@@ -233,7 +235,7 @@ def add_chapter(self, c):
         except AssertionError:
             raise TypeError('chapter must be of type Chapter')
         chapter_file_output = os.path.join(self.OEBPS_DIR, self.current_chapter_path)
-        c._replace_images_in_chapter(self.OEBPS_DIR)
+        c._replace_images_in_chapter(self.OEBPS_DIR)  # FIXME if this is the correct place to do this, then title should also be injected too at this point (or at chapter creation time)
         c.write(chapter_file_output)
         self._increase_current_chapter_number()
         self.chapters.append(c)
@@ -269,6 +271,7 @@ def create_zip_archive(epub_name):
             # TODO cleanup chdir code
             # TODO refactor/simplify walk code
             # TODO compression - debug Stored for now
+            # TODO change sort order, chapters AFTER opf, ncx, toc, etc.
             save_cwd = os.getcwd()
             os.chdir(self.EPUB_DIR)
             archname = epub_name_with_path + '.zip'
diff --git a/pypub/epub_templates/opf.xml b/pypub/epub_templates/opf.xml
@@ -1,12 +1,12 @@
 <?xml version="1.0" encoding="UTF-8" ?> 
-<package xmlns="http://www.idpf.org/2007/opf" version="2.0">
+<package xmlns="http://www.idpf.org/2007/opf" version="2.0" unique-identifier="book-id">
   <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">
     <dc:title>{{  title  }}</dc:title>
     <dc:creator opf:role="aut">{{ creator }}</dc:creator>
     <dc:language>{{ language }}</dc:language>
     <dc:rights>{{ rights }}</dc:rights>
     <dc:publisher>{{ publisher }}</dc:publisher>
-    <dc:identifier opf:scheme="UUID">{{ uid }}</dc:identifier>
+    <dc:identifier id="book-id" opf:scheme="UUID">{{ uid }}</dc:identifier>
     <dc:date>{{ date }}</dc:date>
   </metadata>
   <manifest>
diff --git a/pypub/epub_templates/toc.html b/pypub/epub_templates/toc.html
@@ -1,5 +1,5 @@
-<!DOCTYPE html>
-<html>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
   <head>
     <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
     <title>Table of Contents</title>
diff --git a/pypub/epub_templates/toc_ncx.xml b/pypub/epub_templates/toc_ncx.xml
@@ -1,7 +1,7 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
   <head>
-    <meta name="dtb:uid" content=""/>
+    <meta name="dtb:uid" content="{{ uid }}"/>
     <meta name="dtb:depth" content="1"/>
     <meta name="dtb:totalPageCount" content="0"/>
     <meta name="dtb:maxPageNumber" content="0"/>
diff --git a/setup.py b/setup.py
@@ -16,4 +16,6 @@
             'requests==2.22.0',
             ],
     description= "Create epub's using python. Pypub is a python library to create epub files quickly without having to worry about the intricacies of the epub specification.",
-)
+)
+# TODO long description from readme
+# TODO py 2.7 and other classifiers