forked from gstaxy/EpubToPdf
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpdfpy.py
155 lines (118 loc) · 3.78 KB
/
pdfpy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import pdfkit
import os
from PyPDF2 import PdfFileMerger
from PyPDF2.utils import PdfReadError
from bs4 import BeautifulSoup as bs
import re
class PdfEngine(object):
    """Convert markup files to PDF and assemble them into one document.

    Public methods:
        convert()  --- render every file in ``markup_files`` (html/xhtml)
                       to ``<index>.pdf`` in the current working directory.
        combine()  --- merge every file in ``pdf_files`` into
                       ``<directory>.pdf`` and add an outline built from
                       ``toc.xml`` when one is found under ``directory``.
        del_pdf()  --- delete the per-section files listed in ``pdf_files``.

    Attributes:
        markup_files: ordered list of markup file paths to render.
        pdf_files: ordered list of per-section pdf paths to merge / delete.
        directory: directory searched for ``toc.xml``; also the stem of the
            merged output file name.
    """

    # CSS reference pixels per inch, used to turn the viewport's pixel size
    # into a physical page size for wkhtmltopdf.
    _PX_PER_INCH = 96
    # NOTE(review): an exact inch is 25.4 mm. The value 25.5 is kept from the
    # original code; presumably it is deliberate slack so that a page never
    # overflows onto a second sheet - confirm before "correcting" it.
    _MM_PER_INCH = 25.5
    # Extracts the number from hrefs such as "page12.xhtml".
    _PAGE_HREF = re.compile(r'[a-zA-Z]([0-9]+)\.xhtml')

    def __init__(self, markup_files, pdf_files, directory):
        self.markup_files = markup_files
        self.pdf_files = pdf_files
        self.directory = directory

    def convert(self):
        """Render each markup file to ``<index>.pdf``.

        The page size is taken from the ``<meta name="viewport">`` tag of one
        reference file and applied to every section.

        Raises:
            ValueError: if the reference file has no usable viewport
                ``width``/``height``.
        """
        if not self.markup_files:
            print('--- No markup files to convert')
            return

        # The second file is sampled when it exists, as the original code
        # did (presumably because the first file is a cover whose viewport
        # may differ - TODO confirm). With a single file, use that one
        # instead of failing with an IndexError.
        if len(self.markup_files) > 1:
            reference = self.markup_files[1]
        else:
            reference = self.markup_files[0]
        viewport = self._read_viewport(reference)
        print(viewport)

        # The options do not depend on the individual section, so build them
        # once instead of once per file.
        options = self._build_options(viewport)

        # enumerate() gives each section its own position even when the same
        # path appears twice; list.index() would map duplicates to the same
        # output name and costs a linear scan per file.
        for position, markup in enumerate(self.markup_files):
            print('Converting ' + str(markup))
            pdfkit.from_file(markup, "{}.pdf".format(position),
                             options=options)

        print('--- Sections converted to pdf')

    def _read_viewport(self, markup_path):
        """Return the viewport settings declared in ``markup_path`` as a dict."""
        # Binary mode lets BeautifulSoup detect the document encoding itself
        # instead of relying on the platform's default text encoding.
        with open(markup_path, 'rb') as fp:
            document = bs(fp, 'html.parser')

        meta = document.find("meta", {"name": "viewport"})
        content = meta.get('content') if meta is not None else None
        if not content:
            raise ValueError(
                "{} has no viewport meta tag to size pages from".format(
                    markup_path))

        viewport = self._parse_viewport(content)
        missing = [key for key in ('width', 'height') if key not in viewport]
        if missing:
            raise ValueError(
                "viewport of {} lacks {}".format(
                    markup_path, ' and '.join(missing)))
        return viewport

    @staticmethod
    def _parse_viewport(content):
        """Parse ``"width=641, height=900"`` into ``{'width': '641', ...}``."""
        viewport = {}
        for entry in content.split(','):
            name, separator, value = entry.partition('=')
            if not separator:
                # Empty piece (e.g. trailing comma) or a bare keyword.
                continue
            viewport[name.strip()] = value.strip()
        return viewport

    def _build_options(self, viewport):
        """Build the wkhtmltopdf options for a page matching ``viewport``."""
        width_px = viewport['width']
        height_px = viewport['height']
        # Pixels -> millimetres, e.g. 641 px / 96 px-per-inch * mm-per-inch.
        page_w_mm = int(width_px) / self._PX_PER_INCH * self._MM_PER_INCH
        page_h_mm = int(height_px) / self._PX_PER_INCH * self._MM_PER_INCH
        return {
            # Prevent the conversion process from showing terminal updates.
            'quiet': None,
            'viewport-size': width_px + 'x' + height_px,
            'page-width': str(page_w_mm) + 'mm',
            'page-height': str(page_h_mm) + 'mm',
            'margin-bottom': '0',
            'margin-left': '0',
            'margin-right': '0',
            'margin-top': '0',
            'disable-smart-shrinking': None,
            'enable-local-file-access': None,
            'load-error-handling': 'ignore'
        }

    def combine(self):
        """Merge all section pdfs into ``<directory>.pdf`` with an outline."""
        merger = PdfFileMerger()
        try:
            for pdf in self.pdf_files:
                try:
                    merger.append(pdf, import_bookmarks=False)
                except PdfReadError as error:
                    # Still best-effort, but no longer silent: a skipped
                    # section shifts every following page, so outline
                    # entries after it will point at the wrong page.
                    print('Skipping unreadable pdf {}: {}'.format(pdf, error))

            self.addOutline(merger)

            merger.write("{}.pdf".format(self.directory))
            print('--- Sections combined together in a single pdf file')
        finally:
            # Release the merger's file handles even if a step above fails.
            merger.close()

    def addOutline(self, merger):
        """Add bookmarks from the first ``toc.xml`` found under ``directory``.

        Does nothing when no ``toc.xml`` exists or when it lacks the
        ``<tocxml><toc>`` structure.
        """
        toc_path = self._find_toc()
        if toc_path is None:
            return

        # Read as bytes inside a context manager: the handle is closed
        # promptly and the XML parser decodes the document itself.
        with open(toc_path, "rb") as fp:
            xml_tree = bs(fp, features="xml")

        root = xml_tree.tocxml
        toc = root.toc if root is not None else None
        if toc is None:
            return

        parentnodes = toc.find_all('node', recursive=False)
        self.addOutlineNodes(merger, parentnodes)

    def _find_toc(self):
        """Return the path of the first ``toc.xml`` under ``directory``, or None."""
        for root, _dirs, files in os.walk(self.directory):
            if 'toc.xml' in files:
                # Stop at the first hit instead of walking the whole tree.
                return os.path.join(root, 'toc.xml')
        return None

    def addOutlineNodes(self, merger, nodes, parent=None):
        """Recursively add one bookmark per toc node.

        Args:
            merger: object providing ``addBookmark(title, pagenum, parent)``.
            nodes: toc ``<node>`` elements carrying ``title`` and ``href``.
            parent: bookmark under which the new bookmarks are nested, or
                None for top-level entries.
        """
        for node in nodes:
            match = self._PAGE_HREF.search(node.get('href') or '')
            if match is None:
                # No page number can be derived from this href; skip the
                # entry but keep its children, attached to the nearest
                # bookmarked ancestor.
                print('Skipping toc entry without page number: {}'.format(
                    node.get('href')))
                bookmark = parent
            else:
                # File numbers start at 1 while pdf page indexes start at 0;
                # this assumes every section renders to exactly one page.
                pagenum = int(match.group(1)) - 1
                bookmark = merger.addBookmark(node['title'], pagenum, parent)

            children = node.find_all('node', recursive=False)
            if children:
                self.addOutlineNodes(merger, children, bookmark)

    def del_pdf(self):
        """Delete the per-section pdf files listed in ``pdf_files``."""
        for each in self.pdf_files:
            try:
                os.remove(each)
            except FileNotFoundError:
                # Already gone (e.g. the section was never produced); keep
                # cleaning up the remaining files.
                continue
        print('--- Individual pdf files deleted from directory')