Skip to content

Commit 274a976

Browse files
committed
Fix issue wcember#24 - invalid epub
* Clarify epub version support - specifically epub2 / version 2
* TODO notes in relation to title support
* implement title per chapter to ensure epub is valid - epubcheck ERROR(RSC-005)
* correct uuid support
* ensure correct unique id is declared ERROR(OPF-048) and ERROR(RSC-005)
* various missing html/xhtml tag fixes ERROR(HTM-004)
* ISO/ANSI dates (date only at the moment) ERROR(OPF-054)
* TODO note for dates, include timestamp
* use valid xml id/names for chapters ERROR(HTM-004)
* use consistent filenames for chapter that matches id

Allows EPUBCheck v5.1.0 to run clean with simple html chapter content.
1 parent df67ede commit 274a976

File tree

8 files changed

+44
-22
lines changed

8 files changed

+44
-22
lines changed

Readme.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Pypub #
22

3-
Create epub's using python. Pypub is a python library to create epub files quickly without having to worry about the intricacies of the epub specification.
3+
Create epub (version 2) files using python. Pypub is a python library to create epub files quickly without having to worry about the intricacies of the epub specification.
44

55
# Installation #
66
The current release of pypub is available through pip:
@@ -38,6 +38,9 @@ To use code from a source code checkout from git
3838

3939
$ python -m pip install -r requirements.txt
4040

41+
# Useful Resources #
42+
43+
* https://www.eboundcanada.org/resources/whats-in-an-epub-the-opf-file/
4144

4245
# Copyright and License #
4346

pypub/chapter.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -139,11 +139,15 @@ class Chapter(object):
139139
"""
140140
def __init__(self, content, title, url=None):
141141
self._validate_input_types(content, title)
142-
self.title = title
142+
self.title = title # FIXME unused, apart from generating html_title, which is also unused
143143
self.content = content
144144
self._content_tree = BeautifulSoup(self.content, 'html.parser')
145145
self.url = url
146-
self.html_title = cgi.escape(self.title, quote=True)
146+
self.html_title = cgi.escape(self.title, quote=True) # FIXME unused
147+
# TODO inject title into head of content. Options:
148+
# here at init
149+
# same time _replace_images_in_chapter() is called
150+
# in create_chapter_from_ABC() functions - ideally create_chapter_from_string() as single place
147151

148152
def write(self, file_name):
149153
"""
@@ -153,7 +157,7 @@ def write(self, file_name):
153157
file_name (str): The full name of the xhtml file to save to.
154158
"""
155159
try:
156-
assert file_name[-6:] == '.xhtml'
160+
assert file_name[-6:] == '.xhtml' # FIXME use .endswith()
157161
except (AssertionError, IndexError):
158162
raise ValueError('filename must end with .xhtml')
159163
with open(file_name, 'wb') as f:
@@ -288,8 +292,9 @@ def create_chapter_from_string(self, html_string, url=None, title=None):
288292
Chapter: A chapter object whose content is the given string
289293
and whose title is that provided or inferred from the url
290294
"""
291-
clean_html_string = self.clean_function(html_string)
295+
clean_html_string = self.clean_function(html_string, title=title)
292296
clean_xhtml_string = clean.html_to_xhtml(clean_html_string)
297+
# TODO refactor/simplify below title logic (see clean())
293298
if title:
294299
pass
295300
else:

pypub/clean.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,8 @@ def create_html_from_fragment(tag):
3434

3535

3636
def clean(input_string,
37-
tag_dictionary=constants.SUPPORTED_TAGS):
37+
tag_dictionary=constants.SUPPORTED_TAGS,
38+
title=None):
3839
"""
3940
Sanitizes HTML. Tags not contained as keys in the tag_dictionary input are
4041
removed, and child nodes are recursively moved to parent of removed node.
@@ -81,6 +82,14 @@ def clean(input_string,
8182
#wrap partial tree if necessary
8283
if root.find_all('html') == []:
8384
root = create_html_from_fragment(root)
85+
# TODO ensure there is a head!
86+
if root.html.head.title is None and title is None: # leaves empty content alone
87+
title = 'Ebook Chapter' # FIXME same logic in Chapter()
88+
if title:
89+
# override
90+
tmp_tag = root.new_tag('title')
91+
tmp_tag.string = title
92+
root.html.head.append(tmp_tag)
8493
# Remove img tags without src attribute
8594
image_node_list = root.find_all('img')
8695
for node in image_node_list:

pypub/epub.py

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import shutil
66
import tempfile
77
import time
8+
import uuid
89
from zipfile import ZipFile, ZIP_DEFLATED
910

1011
import jinja2
@@ -89,7 +90,8 @@ def __init__(self, template_file=os.path.join(EPUB_TEMPLATES_DIR, 'toc.html'), *
8990
super(TocHtml, self).__init__(template_file, **non_chapter_parameters)
9091

9192
def add_chapters(self, chapter_list):
92-
chapter_numbers = range(len(chapter_list))
93+
chapter_numbers = ['ch%03d' % n for n in range(len(chapter_list))] # need to be valid XML names, do not start with numeric - TODO central function for this
94+
9395
link_list = [str(n) + '.xhtml' for n in chapter_numbers]
9496
try:
9597
for c in chapter_list:
@@ -118,8 +120,8 @@ def __init__(self,
118120
super(TocNcx, self).__init__(template_file, **non_chapter_parameters)
119121

120122
def add_chapters(self, chapter_list):
121-
id_list = range(len(chapter_list))
122-
play_order_list = [n + 1 for n in id_list]
123+
id_list = ['ch%03d' % n for n in range(len(chapter_list))] # need to be valid XML names, do not start with numeric - TODO central function for this
124+
play_order_list = list(range(1, len(chapter_list) + 1))
123125
title_list = [c.title for c in chapter_list]
124126
link_list = [str(n) + '.xhtml' for n in id_list]
125127
super(TocNcx, self).add_chapters(**{'id': id_list,
@@ -137,7 +139,7 @@ def get_content_as_element(self):
137139

138140
class ContentOpf(_EpubFile):
139141

140-
def __init__(self, title, creator='', language='', rights='', publisher='', uid='', date=time.strftime("%m-%d-%Y")):
142+
def __init__(self, title, creator='', language='', rights='', publisher='', uid='', date=time.strftime("%Y-%m-%d")): # FIXME ISO-formatted date needed; include timestamp and TZ? For web server, check headers for last updated
141143
super(ContentOpf, self).__init__(os.path.join(EPUB_TEMPLATES_DIR, 'opf.xml'),
142144
title=title,
143145
creator=creator,
@@ -148,8 +150,8 @@ def __init__(self, title, creator='', language='', rights='', publisher='', uid=
148150
date=date)
149151

150152
def add_chapters(self, chapter_list):
151-
id_list = range(len(chapter_list))
152-
link_list = [str(n) + '.xhtml' for n in id_list]
153+
id_list = ['ch%03d' % n for n in range(len(chapter_list))] # need to be valid XML names, do not start with numeric - TODO central function for this
154+
link_list = [str(n) + '.xhtml' for n in id_list] # be consistent with the new IDs, which need to be valid XML names
153155
super(ContentOpf, self).add_chapters(**{'id': id_list, 'link': link_list})
154156

155157
def get_content_as_element(self):
@@ -187,11 +189,11 @@ def __init__(self, title, creator='pypub', language='en', rights='', publisher='
187189
self.language = language
188190
self.rights = rights
189191
self.publisher = publisher
190-
self.uid = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(12))
192+
self.uid = 'urn:uuid:%s' % uuid.uuid4() # TODO allow user to pass something in (e.g. ISBN)
191193
self.current_chapter_number = None
192194
self._increase_current_chapter_number()
193195
self.toc_html = TocHtml()
194-
self.toc_ncx = TocNcx()
196+
self.toc_ncx = TocNcx(uid=self.uid)
195197
self.opf = ContentOpf(self.title, self.creator, self.language, self.rights, self.publisher, self.uid)
196198
self.mimetype = _Mimetype(self.EPUB_DIR)
197199
self.container = _ContainerFile(self.META_INF_DIR)
@@ -214,7 +216,7 @@ def _increase_current_chapter_number(self):
214216
self.current_chapter_number = 0
215217
else:
216218
self.current_chapter_number += 1
217-
self.current_chapter_id = str(self.current_chapter_number)
219+
self.current_chapter_id = 'ch%03d' % self.current_chapter_number # TODO central function for chapter id gen? ncx, opf, etc.
218220
self.current_chapter_path = ''.join([self.current_chapter_id, '.xhtml'])
219221

220222
def add_chapter(self, c):
@@ -233,7 +235,7 @@ def add_chapter(self, c):
233235
except AssertionError:
234236
raise TypeError('chapter must be of type Chapter')
235237
chapter_file_output = os.path.join(self.OEBPS_DIR, self.current_chapter_path)
236-
c._replace_images_in_chapter(self.OEBPS_DIR)
238+
c._replace_images_in_chapter(self.OEBPS_DIR) # FIXME if this is the correct place to do this, then title should also be injected too at this point (or at chapter creation time)
237239
c.write(chapter_file_output)
238240
self._increase_current_chapter_number()
239241
self.chapters.append(c)
@@ -269,6 +271,7 @@ def create_zip_archive(epub_name):
269271
# TODO cleanup chdir code
270272
# TODO refactor/simplify walk code
271273
# TODO compression - debug Stored for now
274+
# TODO change sort order, chapters AFTER opf, ncx, toc, etc.
272275
save_cwd = os.getcwd()
273276
os.chdir(self.EPUB_DIR)
274277
archname = epub_name_with_path + '.zip'

pypub/epub_templates/opf.xml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
<?xml version="1.0" encoding="UTF-8" ?>
2-
<package xmlns="http://www.idpf.org/2007/opf" version="2.0">
2+
<package xmlns="http://www.idpf.org/2007/opf" version="2.0" unique-identifier="book-id">
33
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">
44
<dc:title>{{ title }}</dc:title>
55
<dc:creator opf:role="aut">{{ creator }}</dc:creator>
66
<dc:language>{{ language }}</dc:language>
77
<dc:rights>{{ rights }}</dc:rights>
88
<dc:publisher>{{ publisher }}</dc:publisher>
9-
<dc:identifier opf:scheme="UUID">{{ uid }}</dc:identifier>
9+
<dc:identifier id="book-id" opf:scheme="UUID">{{ uid }}</dc:identifier>
1010
<dc:date>{{ date }}</dc:date>
1111
</metadata>
1212
<manifest>

pypub/epub_templates/toc.html

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
<!DOCTYPE html>
2-
<html>
1+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
2+
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
33
<head>
44
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
55
<title>Table of Contents</title>

pypub/epub_templates/toc_ncx.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
<?xml version="1.0" encoding="UTF-8"?>
22
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
33
<head>
4-
<meta name="dtb:uid" content=""/>
4+
<meta name="dtb:uid" content="{{ uid }}"/>
55
<meta name="dtb:depth" content="1"/>
66
<meta name="dtb:totalPageCount" content="0"/>
77
<meta name="dtb:maxPageNumber" content="0"/>

setup.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,4 +16,6 @@
1616
'requests==2.22.0',
1717
],
1818
description= "Create epub's using python. Pypub is a python library to create epub files quickly without having to worry about the intricacies of the epub specification.",
19-
)
19+
)
20+
# TODO long description from readme
21+
# TODO py 2.7 and other classifiers

0 commit comments

Comments
 (0)