# table_ocr.py: a Streamlit app that converts tables in images into downloadable DataFrames
import cv2
import numpy as np
import pandas as pd
import base64
from io import BytesIO

import streamlit as st
try:
    from PIL import Image
except ImportError:
    import Image
import pytesseract
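
# Pipeline overview: binarise the uploaded image, isolate the table grid with
# morphological opening, detect each cell as a contour, group the cells into
# rows and columns, OCR each cell with Tesseract, and present the result as a
# pandas DataFrame with CSV/XLSX download links.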

st.title("Tabular Data Extractor")
st.subheader("Application to Convert Tables in Images to Downloadable DataFrames")

ocr_img = st.checkbox("View what type of image to upload")
if ocr_img:
    img_1 = Image.open('Access-studentmarks.png')
    st.image(img_1, width=500, caption='Example of a clear table image')
############################# Sidebar ########################
st.sidebar.title("Guide")
st.sidebar.markdown("> Clear screenshots of tables taken on a phone, laptop, etc. work best")
st.sidebar.markdown("> Images captured with a phone camera often yield incomplete tables")
st.sidebar.markdown("""> Images affected by artifacts such as partial occlusion, distorted perspective,
and complex backgrounds also yield incomplete tables""")
st.sidebar.markdown("""> Handwritten tables are significantly harder to recognise,
given the wide variation in handwriting styles and the limits of optical character recognition""")
###################### loading images #######################
uploaded_file = st.file_uploader("Choose an image (accepted formats: jpg, jpeg, png)", type=("jpg", "png", "jpeg"))
########## Table Extraction #############################
# Read file
if uploaded_file is not None:
    file_bytes = np.asarray(bytearray(uploaded_file.read()), dtype=np.uint8)
    img = cv2.imdecode(file_bytes, 0)  # 0 = read as grayscale

    # Viewing image
    ocr_img2 = st.checkbox("View uploaded image")
    if ocr_img2:
        img_2 = Image.open(uploaded_file)
        st.image(img_2, width=500)

    # Threshold the image to a binary image (Otsu picks the threshold automatically)
    thresh, img_bin = cv2.threshold(img, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    # Invert the image so the table lines become white on black
    img_bin = 255 - img_bin

    # Length of the line-detection kernels: 1/100 of the image width
    kernel_len = np.array(img).shape[1] // 100
    # A vertical kernel to detect all vertical lines in the image
    ver_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, kernel_len))
    # A horizontal kernel to detect all horizontal lines in the image
    hor_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_len, 1))
    # A 2x2 kernel for the later erosion step
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))

    # Use the vertical kernel to detect the vertical lines
    image_1 = cv2.erode(img_bin, ver_kernel, iterations=3)
    vertical_lines = cv2.dilate(image_1, ver_kernel, iterations=3)
    # Use the horizontal kernel to detect the horizontal lines
    image_2 = cv2.erode(img_bin, hor_kernel, iterations=3)
    horizontal_lines = cv2.dilate(image_2, hor_kernel, iterations=3)

    # Combine horizontal and vertical lines in a new third image, with equal weight
    img_vh = cv2.addWeighted(vertical_lines, 0.5, horizontal_lines, 0.5, 0.0)
    # Erode and threshold the combined grid image
    img_vh = cv2.erode(~img_vh, kernel, iterations=2)
    thresh, img_vh = cv2.threshold(img_vh, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    bitxor = cv2.bitwise_xor(img, img_vh)
    bitnot = cv2.bitwise_not(bitxor)

    # Detect contours of the grid for the following box detection
    contours, hierarchy = cv2.findContours(img_vh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
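    # NOTE (assumption, not part of the original script): OpenCV 3.x returns
    # three values from findContours, so the two-value unpacking above requires
    # OpenCV 4.x. A version-agnostic variant would be:
    #   res = cv2.findContours(img_vh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    #   contours, hierarchy = res if len(res) == 2 else res[1:]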
    @st.cache(suppress_st_warning=True)
    def sort_contours(cnts, method="left-to-right"):
        # initialize the reverse flag and sort index
        reverse = False
        i = 0
        # handle if we need to sort in reverse
        if method == "right-to-left" or method == "bottom-to-top":
            reverse = True
        # sort against the y-coordinate rather than the x-coordinate
        # of the bounding box
        if method == "top-to-bottom" or method == "bottom-to-top":
            i = 1
        # construct the list of bounding boxes and sort them
        boundingBoxes = [cv2.boundingRect(c) for c in cnts]
        (cnts, boundingBoxes) = zip(*sorted(zip(cnts, boundingBoxes),
                                            key=lambda b: b[1][i], reverse=reverse))
        # return the list of sorted contours and bounding boxes
        return (cnts, boundingBoxes)
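
    # Note: `st.cache` (and `st.beta_columns`, used further down) come from an
    # older Streamlit API; on recent releases the closest replacements are
    # `st.cache_data` and `st.columns`.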
    # Sort all the contours from top to bottom
    contours, boundingBoxes = sort_contours(contours, method="top-to-bottom")

    # Heights of all detected boxes
    heights = [boundingBoxes[i][3] for i in range(len(boundingBoxes))]
    # Mean height, later used as the tolerance for grouping boxes into rows
    mean = np.mean(heights)

    # Collect the bounding box of every plausible cell and draw it on the image
    box = []
    for c in contours:
        x, y, w, h = cv2.boundingRect(c)
        # Heuristic: skip boxes too large to be a single cell
        if w < 1000 and h < 500:
            image = cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)
            box.append([x, y, w, h])
    # Group the boxes into rows: a box belongs to the current row if its top
    # edge is within half the mean cell height of the previous box's top edge
    row = []
    column = []
    for i in range(len(box)):
        if i == 0:
            column.append(box[i])
            previous = box[i]
        else:
            if box[i][1] <= previous[1] + mean / 2:
                column.append(box[i])
                previous = box[i]
            else:
                row.append(column)
                column = []
                previous = box[i]
                column.append(box[i])
    # Flush the last row
    if column:
        row.append(column)
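    # Example: with a mean cell height of 40 px, a box whose top edge lies
    # within 20 px (mean/2) of the previous box's top edge joins the current
    # row; anything lower starts a new row.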
    # The maximum number of cells in any row gives the number of table columns
    countcol = 0
    for i in range(len(row)):
        if len(row[i]) > countcol:
            countcol = len(row[i])

    # Estimate the x-centre of each column from the row with the most cells
    index = max(range(len(row)), key=lambda r: len(row[r]))
    center = [int(row[index][j][0] + row[index][j][2] / 2) for j in range(len(row[index]))]
    center = np.array(center)
    center.sort()
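    # Example: with column centres [60, 200, 340], a cell whose centre is at
    # x = 205 has distances [145, 5, 135] and is assigned to column index 1.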
    # Assign every detected cell to its column by nearest column centre
    finalboxes = []
    for i in range(len(row)):
        lis = [[] for _ in range(countcol)]
        for j in range(len(row[i])):
            # distance from the cell's centre to each column centre
            diff = abs(center - (row[i][j][0] + row[i][j][2] / 2))
            minimum = min(diff)
            indexing = list(diff).index(minimum)
            lis[indexing].append(row[i][j])
        finalboxes.append(lis)
    # Optical character recognition (OCR): recognise and read the text embedded in each cell
    # Point pytesseract at the Tesseract binary; adjust this path for your install
    pytesseract.pytesseract.tesseract_cmd = r'C:/Program Files/Tesseract-OCR/tesseract.exe'
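    # A more portable variant (a sketch, not part of the original script) would
    # override the path only on Windows, with `import os` at the top; on
    # Linux/macOS a `tesseract` binary on PATH is usually found automatically:
    #   if os.name == 'nt':
    #       pytesseract.pytesseract.tesseract_cmd = r'C:/Program Files/Tesseract-OCR/tesseract.exe'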
    # From every image-based cell/box, extract the text via pytesseract and store it in a list
    outer = []
    for i in range(len(finalboxes)):
        for j in range(len(finalboxes[i])):
            inner = ''
            if len(finalboxes[i][j]) == 0:
                outer.append(' ')
            else:
                for k in range(len(finalboxes[i][j])):
                    x, y, w, h = (finalboxes[i][j][k][0], finalboxes[i][j][k][1],
                                  finalboxes[i][j][k][2], finalboxes[i][j][k][3])
                    finalimg = bitnot[y:y + h, x:x + w]
                    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 1))
                    # Pad, upscale, and clean up the cell crop before OCR
                    border = cv2.copyMakeBorder(finalimg, 2, 2, 2, 2, cv2.BORDER_CONSTANT, value=255)
                    resizing = cv2.resize(border, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
                    dilation = cv2.dilate(resizing, kernel, iterations=1)
                    erosion = cv2.erode(dilation, kernel, iterations=2)
                    out = pytesseract.image_to_string(erosion)
                    if len(out) == 0:
                        # Retry with fully automatic page segmentation
                        out = pytesseract.image_to_string(erosion, config='--psm 3')
                    inner = inner + " " + out
                outer.append(inner)
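    # Tip (an assumption, not part of the original script): for single-cell
    # crops, Tesseract's single-block or single-line segmentation modes often
    # read more reliably than the default:
    #   out = pytesseract.image_to_string(erosion, config='--psm 6')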
    # Create a DataFrame from the generated OCR list
    arr = np.array(outer)
    dataframe = pd.DataFrame(arr.reshape(len(row), countcol))
    # Strip the trailing newline/form-feed characters that Tesseract appends
    for col in dataframe.columns:
        dataframe[col] = dataframe[col].astype(str).str.strip()

    st.subheader("DataFrame")
    st.dataframe(dataframe)
    # Download dataframe
    col1, col2 = st.beta_columns(2)
    download = col1.button("Download csv file")
    if download:
        csv = dataframe.to_csv(index=False)
        b64 = base64.b64encode(csv.encode()).decode()
        linko = f'<a href="data:file/csv;base64,{b64}" download="DataFrame.csv">Download csv file</a>'
        col1.markdown(linko, unsafe_allow_html=True)
    download2 = col2.button("Download excel xlsx file")
    if download2:
        @st.cache(suppress_st_warning=True)
        def to_excel(df):
            # Serialise the DataFrame to an in-memory xlsx file
            output = BytesIO()
            with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
                df.to_excel(writer, sheet_name='Sheet1')
            processed_data = output.getvalue()
            return processed_data

        @st.cache(suppress_st_warning=True)
        def get_table_download_link(df):
            """Generate a link allowing the data in a given pandas DataFrame to be downloaded.

            in: DataFrame
            out: href string
            """
            val = to_excel(df)
            b64 = base64.b64encode(val)
            return f'<a href="data:application/octet-stream;base64,{b64.decode()}" download="DataFrame.xlsx">Download excel xlsx file</a>'

        col2.markdown(get_table_download_link(dataframe), unsafe_allow_html=True)
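
# To run the app locally (requires a Tesseract install plus the Python
# dependencies, e.g. `pip install streamlit opencv-python pytesseract pandas
# numpy Pillow xlsxwriter`):
#   streamlit run table_ocr.py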