-
Notifications
You must be signed in to change notification settings - Fork 1
/
pdfscraper.py
112 lines (94 loc) · 3.79 KB
/
pdfscraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/usr/bin/env python3
from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from PIL import Image
import pytesseract
from pdf2image import convert_from_path
import os
from os import listdir
from os.path import isfile, join
import sys, getopt
import re
import argparse
def convert_pdf(fname, pages=None):  # pdfminer
    """Extract the embedded text of a PDF file using pdfminer.

    Args:
        fname: Path of the PDF file; it is opened in binary mode.
        pages: Optional iterable of page numbers forwarded to
            ``PDFPage.get_pages``. When it is ``None`` or empty, an empty
            set is forwarded instead.
            NOTE(review): pdfminer presumably treats an empty selection as
            "all pages" and numbers pages from zero -- confirm.

    Returns:
        Everything the ``TextConverter`` wrote for the processed pages, as
        one string. Callers in this file treat a blank result as "no text
        layer found".

    Raises:
        Whatever ``open`` or pdfminer raise; nothing is caught here. The
        input file, the converter device and the text buffer are released
        even when an error propagates.
    """
    pagenums = set(pages) if pages else set()
    rsrcmgr = PDFResourceManager()
    # layout analysis
    laparams = LAParams(all_texts=True, detect_vertical=False,
                        line_overlap=0.5, char_margin=2.0, line_margin=0.5,
                        word_margin=0.1, boxes_flow=0.5)
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, codec="utf-8",
                               laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(fname, "rb") as infile:
                for page in PDFPage.get_pages(infile, pagenums,
                                              check_extractable=False):
                    interpreter.process_page(page)
        finally:
            device.close()
        # Read the buffer before the ``with`` block closes it.
        return retstr.getvalue()
def convert_pdf_ocr(fname, dpi=500):  # pytesseract for ocr support
    """Extract text from a PDF by rendering its pages and running OCR.

    Each page image returned by ``convert_from_path`` is handed directly to
    ``pytesseract.image_to_string``. No intermediate ``page_N.jpg`` files
    are written to the current working directory, so nothing is left behind
    and concurrent runs cannot overwrite each other's page images.

    Args:
        fname: Path of the PDF file to render.
        dpi: Value passed as the second positional argument of
            ``convert_from_path`` (the rendering resolution in pdf2image's
            signature). Defaults to 500, the value previously hard-coded.

    Returns:
        The OCR output of all pages concatenated in page order, with no
        separator inserted between pages.
    """
    pages = convert_from_path(fname, dpi)
    return "".join(str(pytesseract.image_to_string(page)) for page in pages)
def _clean_message(message, token):
    """Tokenize *message* with the regex set selected by *token*.

    With a truthy *token* the text is split into word-like tokens,
    lower-cased and stripped of punctuation, underscores and any token that
    contains a digit. Otherwise a lighter tokenization is applied that keeps
    case and punctuation. In both modes the tokens are re-joined with single
    spaces.
    """
    if token:
        words = re.findall(r"\w+(?:[-']\w+)*|'|[-.(]+|\S\w*", message)
        message = " ".join(words).lower()
        # remove punctuation
        message = re.sub(r"[^\w\s]+\s*", "", message)
        # remove underscore
        message = re.sub(r"\_\s*", "", message)
        # remove combinations of letters + numbers
        message = re.sub(r"\w*[\d.\-]\w*\s*", "", message)
        # remove standalone numbers
        message = re.sub(r"\b\d+\b\s*", "", message)
        return message
    # NOTE(review): inside the character class, '-/ is a range covering
    # ' ( ) * + , - . / rather than the three literal characters; kept
    # unchanged to preserve existing output -- confirm the intent.
    words = re.findall(r"\w+(?:['-/]\w+)|\w+[?!.,:)(]|\S\w*", message)
    return " ".join(words)


def txt_process(in_pdf, out_txt, token=None):  # process text
    """Convert every ``*.pdf`` entry of a directory into a text file.

    For each directory entry whose last dot-separated component is exactly
    ``pdf``, the text is extracted with ``convert_pdf``; if that yields only
    whitespace, ``convert_pdf_ocr`` is used instead. The text is then
    tokenized and written to ``<out_txt>/<pdf name>.txt`` (the original
    ``.pdf`` suffix is kept in the output name).

    Args:
        in_pdf: Directory that is listed for PDF files (not recursive).
        out_txt: Directory the ``.txt`` files are written to; it is not
            created here.
        token: When truthy, produce the lower-cased, punctuation-free,
            digit-free tokenized output; otherwise keep the standard text.

    Returns:
        None. Output is written to disk, encoded as UTF-8 so that non-ASCII
        characters do not depend on the platform's default encoding.
    """
    for pdf in os.listdir(in_pdf):
        if pdf.split(".")[-1] != "pdf":
            continue
        pdf_path = os.path.join(in_pdf, pdf)
        # try pdfminer
        message = convert_pdf(pdf_path)
        # if pdfminer returns blank string then try pytesseract
        if not message.strip():
            message = convert_pdf_ocr(pdf_path)
        # select regex for tokenization or standard text
        message = _clean_message(message, token)
        txt_path = os.path.join(out_txt, pdf + ".txt")
        # The context manager guarantees the text is flushed and the handle
        # closed before the next file is processed.
        with open(txt_path, "w", encoding="utf-8") as txt_file:
            txt_file.write(message)
def main():
    """Parse the command-line options and run the PDF-to-text conversion.

    Two options are required: the input directory (``-i``/``--input-dir``)
    and the output directory (``-o``/``--output-dir``). The optional
    ``-t``/``--token-gen`` flag switches on tokenized output. The parsed
    values are forwarded to ``txt_process``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "-i", "--input-dir",
        dest="inpdf",
        required=True,
        help="Path to the input pdf files",
    )
    cli.add_argument(
        "-o", "--output-dir",
        dest="outtxt",
        required=True,
        help="Path for the output txt files",
    )
    cli.add_argument(
        "-t", "--token-gen",
        dest="token",
        action="store_true",
        help="Use flag to generate tokenized output",
    )
    options = cli.parse_args()
    txt_process(options.inpdf, options.outtxt, options.token)
# Run the command-line interface only when executed as a script, not when
# the module is imported.
if __name__ == "__main__":
    main()