-
Notifications
You must be signed in to change notification settings - Fork 0
/
pagerange.py
156 lines (128 loc) · 5.44 KB
/
pagerange.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#!/usr/bin/env python
"""
Representation and utils for ranges of PDF file pages.
Copyright (c) 2014, Steve Witham <[email protected]>.
All rights reserved. This software is available under a BSD license;
see https://github.com/py-pdf/PyPDF2/blob/main/LICENSE
"""
import re
from PyPDF2.errors import ParseError
from ._utils import isString
_INT_RE = r"(0|-?[1-9]\d*)" # A decimal int, don't allow "-0".
PAGE_RANGE_RE = "^({int}|({int}?(:{int}?(:{int}?)?)))$".format(int=_INT_RE)
# groups: 12 34 5 6 7 8
PAGE_RANGE_HELP = """Remember, page indices start with zero.
Page range expression examples:
: all pages. -1 last page.
22 just the 23rd page. :-1 all but the last page.
0:3 the first three pages. -2 second-to-last page.
:3 the first three pages. -2: last two pages.
5: from the sixth page onward. -3:-1 third & second to last.
The third, "stride" or "step" number is also recognized.
::2 0 2 4 ... to the end. 3:0:-1 3 2 1 but not 0.
1:10:2 1 3 5 7 9 2::-1 2 1 0.
::-1 all pages in reverse order.
"""
class PageRange(object):
"""
A slice-like representation of a range of page indices,
i.e. page numbers, only starting at zero.
The syntax is like what you would put between brackets [ ].
The slice is one of the few Python types that can't be subclassed,
but this class converts to and from slices, and allows similar use.
- PageRange(str) parses a string representing a page range.
- PageRange(slice) directly "imports" a slice.
- to_slice() gives the equivalent slice.
- str() and repr() allow printing.
- indices(n) is like slice.indices(n).
"""
def __init__(self, arg):
"""
Initialize with either a slice -- giving the equivalent page range,
or a PageRange object -- making a copy,
or a string like
"int", "[int]:[int]" or "[int]:[int]:[int]",
where the brackets indicate optional ints.
{page_range_help}
Note the difference between this notation and arguments to slice():
slice(3) means the first three pages;
PageRange("3") means the range of only the fourth page.
However PageRange(slice(3)) means the first three pages.
"""
if isinstance(arg, slice):
self._slice = arg
return
if isinstance(arg, PageRange):
self._slice = arg.to_slice()
return
m = isString(arg) and re.match(PAGE_RANGE_RE, arg)
if not m:
raise ParseError(arg)
elif m.group(2):
# Special case: just an int means a range of one page.
start = int(m.group(2))
stop = start + 1 if start != -1 else None
self._slice = slice(start, stop)
else:
self._slice = slice(*[int(g) if g else None for g in m.group(4, 6, 8)])
if __init__.__doc__: # see https://github.com/py-pdf/PyPDF2/issues/737
__init__.__doc__ = __init__.__doc__.format(page_range_help=PAGE_RANGE_HELP)
@staticmethod
def valid(input):
"""True if input is a valid initializer for a PageRange."""
return isinstance(input, (slice, PageRange)) or (
isString(input) and bool(re.match(PAGE_RANGE_RE, input))
)
def to_slice(self):
"""Return the slice equivalent of this page range."""
return self._slice
def __str__(self):
"""A string like "1:2:3"."""
s = self._slice
if s.step is None:
if s.start is not None and s.stop == s.start + 1:
return str(s.start)
indices = s.start, s.stop
else:
indices = s.start, s.stop, s.step
return ":".join("" if i is None else str(i) for i in indices)
def __repr__(self):
"""A string like "PageRange('1:2:3')"."""
return "PageRange(" + repr(str(self)) + ")"
def indices(self, n):
"""
n is the length of the list of pages to choose from.
Returns arguments for range(). See help(slice.indices).
"""
return self._slice.indices(n)
def __eq__(self, other):
if not isinstance(other, PageRange):
return False
return self._slice == other._slice
PAGE_RANGE_ALL = PageRange(":") # The range of all pages.
def parse_filename_page_ranges(args):
"""
Given a list of filenames and page ranges, return a list of
(filename, page_range) pairs.
First arg must be a filename; other ags are filenames, page-range
expressions, slice objects, or PageRange objects.
A filename not followed by a page range indicates all pages of the file.
"""
pairs = []
pdf_filename = None
did_page_range = False
for arg in args + [None]:
if PageRange.valid(arg):
if not pdf_filename:
raise ValueError(
"The first argument must be a filename, not a page range."
)
pairs.append((pdf_filename, PageRange(arg)))
did_page_range = True
else:
# New filename or end of list--do all of the previous file?
if pdf_filename and not did_page_range:
pairs.append((pdf_filename, PAGE_RANGE_ALL))
pdf_filename = arg
did_page_range = False
return pairs