-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathEpubToc.py
95 lines (68 loc) · 3.1 KB
/
EpubToc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
from Exceptions import BadEpubException
from EpubChapter import EpubChapter
from lxml import etree
import re
import json
class EpubToc:
    '''Represents the Table Of Contents of an Epub. Is a collection of EpubChapters.

    The chapter list is built once, in the constructor, either from the
    navPoints of toc.ncx or (for Calibre-generated ePubs) from the XHTML items
    of the <manifest> in content.opf. The result is exposed as ``chapters``,
    a list of EpubChapter objects owned by this instance.
    '''

    # The parsed XML TOC (the tree returned by etree.parse); set in __init__
    __toc = None
    # Are we using the OEBPS system? Cleared when toc.ncx is found at the
    # root level of the ePub instead of under OEBPS/.
    __has_oebps = True

    def __init__(self, epub, tocfile=None):
        '''Instantiates an EpubToc and populates ``chapters``.

        epub    -- the ePub container. It must provide open(name), raising
                   KeyError for a missing member, and get_opf_data(section).
                   It is also handed on to every EpubChapter.
        tocfile -- optional file-like object holding toc.ncx. When omitted,
                   "OEBPS/toc.ncx" and then "toc.ncx" are tried.

        Raises BadEpubException if no toc.ncx can be located.
        '''
        # Namespaces and chapters are created per instance. As class-level
        # attributes they were shared, so every new EpubToc kept appending to
        # the chapter list of all the previous ones.
        self.__namespaces = {}
        self.chapters = []

        if not tocfile:
            try:
                # Fallback on assumed defaults, that toc.ncx is in prescribed OEBPS/
                tocfile = epub.open("OEBPS/toc.ncx")
            except KeyError:
                try:
                    # Fallback on assumed defaults, that toc.ncx is in root level
                    tocfile = epub.open("toc.ncx")
                except KeyError:
                    raise BadEpubException("Unable to find toc.ncx")
                self.__has_oebps = False

        # We expect tocfile to be file-like object
        self.__toc = etree.parse(tocfile)

        # Get the TOC namespace (the default namespace of the root element)
        tocns = self.__toc.getroot().nsmap[None]
        self.__namespaces['ncx'] = tocns

        # Discover if this ePub has a dtb:generator tag..
        # The path is written relative (".//"): an absolute "//" search on an
        # ElementTree is rewritten to this form by lxml, with a FutureWarning.
        meta = self.__toc.find(".//{%(tocns)s}meta[@name='dtb:generator']" % {'tocns': tocns})

        # ... and if the generator was Calibre
        # It seems that Calibre currently generates ePubs that do not place content in OEBPS/, and have toc.ncx's
        # that do not quite match <spine> in content.opf
        # A generator tag without a content attribute is treated as "not Calibre".
        if meta is not None and re.match("calibre.+", meta.get('content', '')):
            self.__parse_calibre(epub)
        else:
            self.__parse_oebps(epub)

    def __parse_oebps(self, epub):
        '''Construct the chapter list from the navPoints of toc.ncx.

        One EpubChapter is appended per navPoint found anywhere in the TOC,
        using its id, playOrder, content src and label text. The src is
        prefixed with "OEBPS/" when toc.ncx was found under OEBPS/.
        '''
        tocns = self.__namespaces['ncx']
        # NCX keeps the label under <navLabel><text>; the direct-child lookup
        # is retained as a fallback for files that put <text> under navPoint.
        nested_label = "{%(tocns)s}navLabel/{%(tocns)s}text" % {'tocns': tocns}
        direct_label = "{%(tocns)s}text" % {'tocns': tocns}
        content_path = "{%(tocns)s}content[@src]" % {'tocns': tocns}

        for point in self.__toc.findall(".//{%(tocns)s}navPoint" % {'tocns': tocns}):
            title = point.findtext(nested_label)  # Label text
            if title is None:
                title = point.findtext(direct_label)
            contentfile = point.find(content_path).attrib['src']  # Contentfile name
            if self.__has_oebps:
                contentfile = "OEBPS/" + contentfile
            self.chapters.append(
                EpubChapter(epub, point.attrib['id'], point.attrib['playOrder'], contentfile, title))

    def __parse_calibre(self, epub):
        '''Special parsing mode for Calibre-generated ePubs that don't adhere to sticking files in OEBPS/.

        Chapters come from the application/xhtml+xml items of <manifest>, in
        document order; the item's position is passed where the NCX parser
        passes playOrder, and no title is supplied.

        Raises BadEpubException if the manifest is unavailable.
        '''
        # This is an ugly hack, I'm sorry.
        # Calibre-generated ePubs seem to have toc.ncx's that are somewhat sparse, with <manifest> in content.opf more complete instead.
        # So let's parse <manifest>
        manifest = epub.get_opf_data("manifest")  # Get the <manifest> ElementTree
        if manifest is None:
            raise BadEpubException("Unable to determine namespace of ebook manifest.")
        ns = manifest.getroot().nsmap[None]

        # Parse xhtml/xml items in the manifest
        items = manifest.findall(".//{%(ns)s}item[@media-type='application/xhtml+xml']" % {'ns': ns})
        for idx, item in enumerate(items):
            self.chapters.append(EpubChapter(epub, item.attrib['id'], idx, item.attrib['href']))

    def get_chapter_titles(self):
        '''Return the ``contentfile`` of every chapter, in chapter order.

        NOTE: despite the name, this returns content file names, not titles.
        '''
        return [chapter.contentfile for chapter in self.chapters]