# table_ocr.py: a Streamlit app that converts tables in images into downloadable DataFrames
import cv2
import numpy as np
import pandas as pd
import base64
from io import BytesIO

import streamlit as st
try:
    from PIL import Image
except ImportError:
    import Image
import pytesseract
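
# Pipeline overview: binarise the uploaded image, isolate the table grid with
# morphological opening, detect each cell as a contour, group the cells into
# rows and columns, OCR each cell with Tesseract, and present the result as a
# pandas DataFrame with CSV/XLSX download links.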

st.title("Tabular Data Extractor")
st.subheader("Application to Convert Tables in Images to Downloadable DataFrames")

ocr_img = st.checkbox("View what type of image to upload")
if ocr_img:
    img_1 = Image.open('Access-studentmarks.png')
    st.image(img_1, width=500, caption='Example of a clear table image')
############################# Sidebar ########################
st.sidebar.title("Guide")
st.sidebar.markdown("> Clear screenshots of tables taken on a phone, laptop, etc. work best")
st.sidebar.markdown("> Images captured with a phone camera often yield incomplete tables")
st.sidebar.markdown("""> Images affected by artifacts such as partial occlusion, distorted perspective,
and complex backgrounds also yield incomplete tables""")
st.sidebar.markdown("""> Handwritten tables are significantly harder to recognise,
given the wide variation in handwriting styles and the limits of optical character recognition""")
###################### loading images #######################
uploaded_file = st.file_uploader("Choose an image (accepted formats: jpg, jpeg, png)", type=("jpg", "png", "jpeg"))
########## Table Extraction #############################
# Read file
if uploaded_file is not None:
    file_bytes = np.asarray(bytearray(uploaded_file.read()), dtype=np.uint8)
    img = cv2.imdecode(file_bytes, 0)  # 0 = read as grayscale

    # Viewing image
    ocr_img2 = st.checkbox("View uploaded image")
    if ocr_img2:
        img_2 = Image.open(uploaded_file)
        st.image(img_2, width=500)

    # Threshold the image to a binary image (Otsu picks the threshold automatically)
    thresh, img_bin = cv2.threshold(img, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    # Invert the image so the table lines become white on black
    img_bin = 255 - img_bin

    # Length of the line-detection kernels: 1/100 of the image width
    kernel_len = np.array(img).shape[1] // 100
    # A vertical kernel to detect all vertical lines in the image
    ver_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, kernel_len))
    # A horizontal kernel to detect all horizontal lines in the image
    hor_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_len, 1))
    # A 2x2 kernel for the later erosion step
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))

    # Use the vertical kernel to detect the vertical lines
    image_1 = cv2.erode(img_bin, ver_kernel, iterations=3)
    vertical_lines = cv2.dilate(image_1, ver_kernel, iterations=3)
    # Use the horizontal kernel to detect the horizontal lines
    image_2 = cv2.erode(img_bin, hor_kernel, iterations=3)
    horizontal_lines = cv2.dilate(image_2, hor_kernel, iterations=3)

    # Combine horizontal and vertical lines in a new third image, with equal weight
    img_vh = cv2.addWeighted(vertical_lines, 0.5, horizontal_lines, 0.5, 0.0)
    # Erode and threshold the combined grid image
    img_vh = cv2.erode(~img_vh, kernel, iterations=2)
    thresh, img_vh = cv2.threshold(img_vh, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    bitxor = cv2.bitwise_xor(img, img_vh)
    bitnot = cv2.bitwise_not(bitxor)

    # Detect contours of the grid for the following box detection
    contours, hierarchy = cv2.findContours(img_vh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
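    # NOTE (assumption, not part of the original script): OpenCV 3.x returns
    # three values from findContours, so the two-value unpacking above requires
    # OpenCV 4.x. A version-agnostic variant would be:
    #   res = cv2.findContours(img_vh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    #   contours, hierarchy = res if len(res) == 2 else res[1:]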
    @st.cache(suppress_st_warning=True)
    def sort_contours(cnts, method="left-to-right"):
        # initialize the reverse flag and sort index
        reverse = False
        i = 0
        # handle if we need to sort in reverse
        if method == "right-to-left" or method == "bottom-to-top":
            reverse = True
        # sort against the y-coordinate rather than the x-coordinate
        # of the bounding box
        if method == "top-to-bottom" or method == "bottom-to-top":
            i = 1
        # construct the list of bounding boxes and sort them
        boundingBoxes = [cv2.boundingRect(c) for c in cnts]
        (cnts, boundingBoxes) = zip(*sorted(zip(cnts, boundingBoxes),
                                            key=lambda b: b[1][i], reverse=reverse))
        # return the list of sorted contours and bounding boxes
        return (cnts, boundingBoxes)
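
    # Note: `st.cache` (and `st.beta_columns`, used further down) come from an
    # older Streamlit API; on recent releases the closest replacements are
    # `st.cache_data` and `st.columns`.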
    # Sort all the contours from top to bottom
    contours, boundingBoxes = sort_contours(contours, method="top-to-bottom")

    # Heights of all detected boxes
    heights = [boundingBoxes[i][3] for i in range(len(boundingBoxes))]
    # Mean height, later used as the tolerance for grouping boxes into rows
    mean = np.mean(heights)

    # Collect the bounding box of every plausible cell and draw it on the image
    box = []
    for c in contours:
        x, y, w, h = cv2.boundingRect(c)
        # Heuristic: skip boxes too large to be a single cell
        if w < 1000 and h < 500:
            image = cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)
            box.append([x, y, w, h])
    # Group the boxes into rows: a box belongs to the current row if its top
    # edge is within half the mean cell height of the previous box's top edge
    row = []
    column = []
    for i in range(len(box)):
        if i == 0:
            column.append(box[i])
            previous = box[i]
        else:
            if box[i][1] <= previous[1] + mean / 2:
                column.append(box[i])
                previous = box[i]
            else:
                row.append(column)
                column = []
                previous = box[i]
                column.append(box[i])
    # Flush the last row
    if column:
        row.append(column)
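    # Example: with a mean cell height of 40 px, a box whose top edge lies
    # within 20 px (mean/2) of the previous box's top edge joins the current
    # row; anything lower starts a new row.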
    # The maximum number of cells in any row gives the number of table columns
    countcol = 0
    for i in range(len(row)):
        if len(row[i]) > countcol:
            countcol = len(row[i])

    # Estimate the x-centre of each column from the row with the most cells
    index = max(range(len(row)), key=lambda r: len(row[r]))
    center = [int(row[index][j][0] + row[index][j][2] / 2) for j in range(len(row[index]))]
    center = np.array(center)
    center.sort()
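    # Example: with column centres [60, 200, 340], a cell whose centre is at
    # x = 205 has distances [145, 5, 135] and is assigned to column index 1.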
    # Assign every detected cell to its column by nearest column centre
    finalboxes = []
    for i in range(len(row)):
        lis = [[] for _ in range(countcol)]
        for j in range(len(row[i])):
            # distance from the cell's centre to each column centre
            diff = abs(center - (row[i][j][0] + row[i][j][2] / 2))
            minimum = min(diff)
            indexing = list(diff).index(minimum)
            lis[indexing].append(row[i][j])
        finalboxes.append(lis)
    # Optical character recognition (OCR): recognise and read the text embedded in each cell
    # Point pytesseract at the Tesseract binary; adjust this path for your install
    pytesseract.pytesseract.tesseract_cmd = r'C:/Program Files/Tesseract-OCR/tesseract.exe'
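    # A more portable variant (a sketch, not part of the original script) would
    # override the path only on Windows, with `import os` at the top; on
    # Linux/macOS a `tesseract` binary on PATH is usually found automatically:
    #   if os.name == 'nt':
    #       pytesseract.pytesseract.tesseract_cmd = r'C:/Program Files/Tesseract-OCR/tesseract.exe'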
    # From every image-based cell/box, extract the text via pytesseract and store it in a list
    outer = []
    for i in range(len(finalboxes)):
        for j in range(len(finalboxes[i])):
            inner = ''
            if len(finalboxes[i][j]) == 0:
                outer.append(' ')
            else:
                for k in range(len(finalboxes[i][j])):
                    x, y, w, h = (finalboxes[i][j][k][0], finalboxes[i][j][k][1],
                                  finalboxes[i][j][k][2], finalboxes[i][j][k][3])
                    finalimg = bitnot[y:y + h, x:x + w]
                    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 1))
                    # Pad, upscale, and clean up the cell crop before OCR
                    border = cv2.copyMakeBorder(finalimg, 2, 2, 2, 2, cv2.BORDER_CONSTANT, value=255)
                    resizing = cv2.resize(border, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
                    dilation = cv2.dilate(resizing, kernel, iterations=1)
                    erosion = cv2.erode(dilation, kernel, iterations=2)
                    out = pytesseract.image_to_string(erosion)
                    if len(out) == 0:
                        # Retry with fully automatic page segmentation
                        out = pytesseract.image_to_string(erosion, config='--psm 3')
                    inner = inner + " " + out
                outer.append(inner)
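    # Tip (an assumption, not part of the original script): for single-cell
    # crops, Tesseract's single-block or single-line segmentation modes often
    # read more reliably than the default:
    #   out = pytesseract.image_to_string(erosion, config='--psm 6')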
    # Create a DataFrame from the generated OCR list
    arr = np.array(outer)
    dataframe = pd.DataFrame(arr.reshape(len(row), countcol))
    # Strip the trailing newline/form-feed characters that Tesseract appends
    for col in dataframe.columns:
        dataframe[col] = dataframe[col].astype(str).str.strip()

    st.subheader("DataFrame")
    st.dataframe(dataframe)
    # Download dataframe
    col1, col2 = st.beta_columns(2)
    download = col1.button("Download csv file")
    if download:
        csv = dataframe.to_csv(index=False)
        b64 = base64.b64encode(csv.encode()).decode()
        linko = f'<a href="data:file/csv;base64,{b64}" download="DataFrame.csv">Download csv file</a>'
        col1.markdown(linko, unsafe_allow_html=True)
    download2 = col2.button("Download excel xlsx file")
    if download2:
        @st.cache(suppress_st_warning=True)
        def to_excel(df):
            # Serialise the DataFrame to an in-memory xlsx file
            output = BytesIO()
            with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
                df.to_excel(writer, sheet_name='Sheet1')
            processed_data = output.getvalue()
            return processed_data

        @st.cache(suppress_st_warning=True)
        def get_table_download_link(df):
            """Generate a link allowing the data in a given pandas DataFrame to be downloaded.

            in: DataFrame
            out: href string
            """
            val = to_excel(df)
            b64 = base64.b64encode(val)
            return f'<a href="data:application/octet-stream;base64,{b64.decode()}" download="DataFrame.xlsx">Download excel xlsx file</a>'

        col2.markdown(get_table_download_link(dataframe), unsafe_allow_html=True)
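
# To run the app locally (requires a Tesseract install plus the Python
# dependencies, e.g. `pip install streamlit opencv-python pytesseract pandas
# numpy Pillow xlsxwriter`):
#   streamlit run table_ocr.py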