-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract.py
107 lines (89 loc) · 3.42 KB
/
extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
A short and sweet script to read, decrypt and pull certain pages from a PDF
"""
__author__ = "David Klein"
__copyright__ = "Copyright 2021, David Klein"
__credits__ = ["David Klein"]
__license__ = "MIT"
__version__ = "1.0.0"
__maintainer__ = "David Klein"
__status__ = "Production"
import getpass
import os
import typing
import PyPDF2
import plac
from tqdm import tqdm
def parse_page_arg(pages: str) -> set:
    """
    Parse a complete set of page numbers from the page argument provided.

    Page numbers are 1-based. Entries are separated by commas; each entry is either a
    single page number or an inclusive ``first-last`` range. Whitespace around entries
    and around the hyphen is ignored, as are empty entries (e.g. a trailing comma).

    :param pages: a string of comma and hyphen separated page numbers e.g. 1,3-6 == 1,3,4,5,6
    :return: a set of page numbers (ints); empty if ``pages`` contains no entries
    :raises TypeError: if ``pages`` is not a string
    :raises ValueError: if an entry is not a page number or range, names a page below 1,
        or is a range whose start is greater than its end
    """
    if not isinstance(pages, str):
        raise TypeError(f"Pages argument must be a string but is a '{type(pages)}'")

    all_pages = set()
    for page in pages.split(","):
        page = page.strip()
        if not page:
            # Tolerate empty entries such as a trailing or doubled comma
            continue

        # isdecimal() rather than isnumeric(): isnumeric() also accepts characters such as
        # superscripts and fractions that int() cannot convert.
        if page.isdecimal():
            first = last = int(page)
        else:
            # Only the first hyphen separates the bounds; any further hyphen ends up in
            # the second bound and fails the isdecimal() check below.
            page_a, hyphen, page_b = page.partition("-")
            page_a = page_a.strip()
            page_b = page_b.strip()
            if not (hyphen and page_a.isdecimal() and page_b.isdecimal()):
                raise ValueError(f"'{page}' is not a valid page format in '{pages}'")
            first, last = int(page_a), int(page_b)

        if first < 1:
            raise ValueError(f"'{page}' in '{pages}' is not valid: page numbers start at 1")
        if first > last:
            # A descending range would otherwise expand to nothing and be silently dropped
            raise ValueError(
                f"'{page}' in '{pages}' is not valid: range start is greater than its end"
            )

        all_pages.update(range(first, last + 1))
    return all_pages
@plac.annotations(
    path=plac.Annotation("The path of the PDF to take a subset of", "positional", None, str),
    pages=plac.Annotation("The pages in comma or hyphen-separated values", "positional", None, str),
    suffix=plac.Annotation("The suffix to append to the save file", "option", "s", str),
)
def main(
        path: typing.Union[os.PathLike, str],
        pages: str,
        suffix: typing.Optional[str] = None
) -> None:
    """
    Extract the requested pages from a PDF and save them, in ascending page order, to a
    new PDF named ``<original name>_<suffix><original extension>``.

    If the input is encrypted, the password is prompted for and the output is encrypted
    with the same password.

    :param path: the path to the PDF file
    :param pages: a string of comma and hyphen separated page numbers e.g. 1,3-6 == 1,3,4,5,6
    :param suffix: the suffix to append to the filename when saving. If None or empty,
        the ``pages`` string itself is used.
    :return: None
    :raises ValueError: if ``pages`` cannot be parsed or the provided password does not match
    :raises IndexError: if a requested page does not exist in the PDF
    """
    # Parse page numbers (1-based) and fix the extraction order
    page_nums = parse_page_arg(pages)
    ordered_pages = sorted(page_nums)

    # Create save path
    base, ext = os.path.splitext(path)
    if not suffix:
        suffix = pages
    wpath = base + "_" + suffix + ext

    print(f"Opening '{path}' for reading and '{wpath}' for writing")
    with open(path, 'rb') as rf:
        reader = PyPDF2.PdfFileReader(rf)
        writer = PyPDF2.PdfFileWriter()

        # Decrypt if needed
        password = None
        if reader.isEncrypted:
            password = getpass.getpass("This PDF is encrypted. Please enter a password to decrypt: ")
            if reader.decrypt(password) == 0:
                raise ValueError("Provided password does not match")

        # Validate every requested page up front so a bad request fails with a clear
        # message before anything is written. Without this, page 0 would be passed on
        # as index -1 instead of being rejected.
        num_pages = reader.getNumPages()
        missing = [n for n in ordered_pages if not 1 <= n <= num_pages]
        if missing:
            raise IndexError(
                f"Page(s) {missing} do not exist in '{path}', which has {num_pages} page(s)"
            )

        # Iterate through pages in order
        for n in tqdm(ordered_pages, "Extracting pages", unit="pages"):
            # Page indices start at zero, normal page numbering starts at 1
            writer.addPage(reader.getPage(n - 1))

        # Copy encryption of input file
        if password is not None:
            writer.encrypt(password)

        # Open the output only now, so a wrong password or a bad page number does not
        # leave an empty or truncated file behind. This stays inside the input's `with`
        # block, as before, so the reader's file is still open while the pages are written.
        with open(wpath, 'wb') as wf:
            writer.write(wf)

    print(f"Finished saving pages {page_nums} from '{path}' to '{wpath}'")
# Script entry point: only runs when the file is executed directly, not when imported.
# main is handed to plac.call, which invokes it for the command line.
if __name__ == "__main__":
    plac.call(main)