-
Notifications
You must be signed in to change notification settings - Fork 0
/
processor.py
194 lines (163 loc) · 6.36 KB
/
processor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
#!/usr/bin/python3
"""Modules for operating on different file types"""
import fitz
import os
import subprocess
import pytesseract
from PIL import Image
from pprint import pprint
from models.data_file import DataFile
import csv
# If you don't have tesseract executable in your PATH, include the following:
pytesseract.pytesseract.tesseract_cmd = os.path.realpath('/usr/bin/tesseract')
KEYWORDS = {
'Invoice no.',
'Payment date:',
'SPECIAL DISCOUNT',
'Discount',
'Total CHF'
}
def merge_files(file1: DataFile, file2: DataFile) -> DataFile:
"""Merges two files and saves the resulting merged file"""
_file1 = fitz.open(file1.path)
_file2 = fitz.open(file2.path)
# Merge the two files
_file1.insert_file(_file2)
# Save the merged file with a new filename
merged_file_name: str = f'merged_{file1.name}_{file2.name}.pdf'
merged_file_path: str = os.path.join(
os.path.dirname(file1), merged_file_name)
_file1.save(merged_file_path)
return DataFile(merged_file_name, 'pdf', merged_file_path)
def extract_data(data_file: DataFile) -> list:
"""Extracts data from a list of files and returns key/value pairs of the data"""
# Convert to image for OCR processing
if data_file.type == 'pdf':
data_file = convert_to_image_ocr(data_file)
text_data: list = get_data_from_image_ocr(data_file)
# Extract key/value pairs using predefined keywords
extract: dict = get_key_values_from_data(text_data)
return extract
def get_key_values_from_data(words: list, keywords: set = KEYWORDS) -> dict:
"""Gets all key/value pairs from predefined keywords"""
result_data: dict = {}
# Retrieve the information needed to extract data from the prefix keywords
for index, key in enumerate(words):
# Match the keyword
if key in keywords:
# Extract the value
value = words[index + 1]
result_data[key] = value
return result_data
def convert_to_pdf_ocr(pdf_file: DataFile) -> list:
"""Convert file to pdf ocr format"""
data_results: list = []
pdf = os.path.realpath(pdf_file.path)
document = fitz.open(pdf)
for page in document:
text_results = get_pdf_ocr_content(page)
data_results.append(text_results)
return data_results
def convert_to_image_ocr(file: DataFile, block_box=[
0, 0, 500, 700]) -> DataFile:
"""Convert file to image ocr format"""
document = fitz.open(file.path)
page = document[0]
mat = fitz.Matrix(5, 5) # high resolution matrix
pix = page.get_pixmap(
colorspace=fitz.csGRAY, # we need no color
matrix=mat,
clip=block_box,
)
img_bytes = pix.tobytes("png") # make a PNG image
output = fitz.open()
img = fitz.open('png', img_bytes) # output Image
output.insert_file(img) # append the image page to output
output_path = os.path.join(os.path.dirname(file.path), f"{file.name}.png")
output.ez_save(output_path) # save output
return DataFile(file.name, 'png', output_path)
def convert_image_to_pdf_ocr(image_file: DataFile):
"""Converts normal image to pdf ocr"""
document = fitz.open() # output PDF
image = os.path.realpath(image_file.path)
pix = fitz.Pixmap(image) # make a pixmap form the image file
# 1-page PDF with the OCRed image
pdfbytes = pix.pdfocr_tobytes(language="eng")
imgpdf = fitz.open("pdf", pdfbytes) # open it as a PDF
document.insert_pdf(imgpdf) # append the image page to output
# document.ez_save("ocr-pdf.pdf") # save output
return document
def get_pdf_ocr_content(page, block_box=[0, 0, 500, 700]) -> str:
"""Return OCR-ed span text using Tesseract.
Args:
page: fitz.Page
block_box: fitz.Rect or its tuple
Returns:
The OCR-ed text of the block_box.
"""
mat = fitz.Matrix(5, 5) # high resolution matrix
pix = page.get_pixmap(
colorspace=fitz.csGRAY, # we need no color
matrix=mat,
clip=block_box,
)
ocrpdf = fitz.open("pdf", pix.pdfocr_tobytes())
ocrpage = ocrpdf[0]
text = ocrpage.get_text()
if text.endswith("\n"):
text = text[:-1]
return text
def get_image_ocr_content(page, block_box=[0, 0, 500, 700]) -> str:
"""Return OCR-ed span text using Tesseract.
Args:
page: fitz.Page
block_box: fitz.Rect or its tuple
Returns:
The OCR-ed text of the block_box.
"""
mat = fitz.Matrix(5, 5) # high resolution matrix
tess = "tesseract stdin stdout --psm 7 -l eng"
pix = page.get_pixmap(
colorspace=fitz.csGRAY, # we need no color
matrix=mat,
clip=block_box,
)
image = pix.tobytes("png") # make a PNG image
# Step 2: Invoke Tesseract to OCR the image. Text is stored in stdout.
rc = subprocess.run(
tess, # the command
input=image, # the pixmap image
stdout=subprocess.PIPE, # find the text here
shell=True,
)
# because we told Tesseract to interpret the image as one line, we now need
# to strip off the line break characters from the tail.
text = rc.stdout.decode() # convert to string
text = text[:-3] # remove line end characters
return text
def get_data_from_image_ocr(file: DataFile) -> list:
"""Use pytesseract to get image ocr data"""
image = Image.open(file.path)
result_data = pytesseract.image_to_data(
image, lang='eng', nice=0, output_type=pytesseract.Output.DICT)
return result_data.get('text', [])
def write_data_to_csv(data: dict, file: DataFile) -> DataFile:
"""Writes data into a csv file"""
csv_file_name = f"{file.name}.csv"
csv_file_path = os.path.join(os.path.dirname(file.path), csv_file_name)
with open(csv_file_path, mode='w', encoding='utf-8') as file:
writer = csv.writer(file)
writer.writerow(['Extracted File Data'])
[writer.writerow(data_extract) for data_extract in data.items()]
return DataFile(file.name, 'csv', csv_file_path)
def read_csv_data(csv_file: DataFile) -> list:
"""Reads and returns csv file data contents"""
result_data: dict = {}
with open(csv_file.path, 'r', encoding='utf-8') as file:
reader = csv.reader(file)
for index, row in enumerate(reader):
if index == 0:
result_data.update({row[0]: ''})
else:
result_data.update({row[0]: row[1]})
return result_data