-
Notifications
You must be signed in to change notification settings - Fork 36
/
extract_text.py
43 lines (35 loc) · 2.1 KB
/
extract_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import argparse
import json
import pypdfium2 as pdfium
from pdftext.extraction import plain_text_output, dictionary_output
def main():
    """Command-line entry point: extract text from a PDF and emit it.

    Parses the arguments from ``sys.argv``, optionally checks the requested
    pages against the document length, then runs ``dictionary_output``
    (with ``--json``; the result is serialized with ``json.dumps``) or
    ``plain_text_output``. The resulting string is printed to stdout, or
    written to ``--out_path`` when that option is given.

    Exits with status 2 (via ``parser.error``) when ``--pages`` is not a
    comma separated list of integers or names a page beyond the document
    length.
    """
    parser = argparse.ArgumentParser(description="Extract plain text from PDF. Not guaranteed to be in order.")
    parser.add_argument("pdf_path", type=str, help="Path to the PDF file")
    parser.add_argument("--out_path", type=str, help="Path to the output text file, defaults to stdout", default=None)
    parser.add_argument("--json", action="store_true", help="Output json instead of plain text", default=False)
    parser.add_argument("--sort", action="store_true", help="Attempt to sort the text by reading order", default=False)
    parser.add_argument("--keep_hyphens", action="store_true", help="Keep hyphens in words", default=False)
    parser.add_argument("--pages", type=str, help="Comma separated pages to extract, like 1,2,3", default=None)
    parser.add_argument("--flatten_pdf", action="store_true", help="Flatten form fields and annotations into page contents", default=False)
    parser.add_argument("--keep_chars", action="store_true", help="Keep character level information", default=False)
    parser.add_argument("--workers", type=int, help="Number of workers to use for parallel processing", default=None)
    args = parser.parse_args()

    pages = None
    if args.pages is not None:
        # Parse before opening the document so a malformed list is reported
        # as a usage error without touching (or leaking) a PDF handle.
        try:
            pages = [int(p) for p in args.pages.split(",")]
        except ValueError:
            parser.error(f"--pages must be comma separated integers, got {args.pages!r}")

        pdf_doc = pdfium.PdfDocument(args.pdf_path)
        try:
            doc_len = len(pdf_doc)
        finally:
            # Always release the document, even if reading its length fails.
            pdf_doc.close()

        # Explicit check instead of `assert`, which is removed under `python -O`.
        # NOTE(review): the bound (p <= doc_len) is kept as-is; whether
        # page_range is 0- or 1-based is not visible here -- confirm against
        # pdftext.extraction before tightening it.
        if any(p > doc_len for p in pages):
            parser.error("Invalid page number(s) provided")

    if args.json:
        text = dictionary_output(
            args.pdf_path,
            sort=args.sort,
            page_range=pages,
            flatten_pdf=args.flatten_pdf,
            keep_chars=args.keep_chars,
            workers=args.workers,
        )
        text = json.dumps(text)
    else:
        text = plain_text_output(
            args.pdf_path,
            sort=args.sort,
            hyphens=args.keep_hyphens,
            page_range=pages,
            flatten_pdf=args.flatten_pdf,
            workers=args.workers,
        )

    if args.out_path is None:
        print(text)
    else:
        # Extracted PDF text is frequently non-ASCII; pin UTF-8 so the write
        # does not depend on the platform's default locale encoding.
        with open(args.out_path, "w", encoding="utf-8") as f:
            f.write(text)
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()