Skip to content

Commit

Permalink
TLDR 531 pdf_txtlayer_reader table fix (#380)
Browse files Browse the repository at this point in the history
  • Loading branch information
raxtemur authored Dec 1, 2023
1 parent d83bf23 commit 1fefda5
Show file tree
Hide file tree
Showing 10 changed files with 80 additions and 56 deletions.
16 changes: 3 additions & 13 deletions dedoc/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
# number of parallel jobs in some tasks, such as OCR
n_jobs=1,

# --------------------------------------------GPU SETTINGS-------------------------------------------------------
# --------------------------------------------GPU SETTINGS----------------------------------------------------------
# set gpu in XGBoost and torch models
on_gpu=False,

Expand All @@ -36,19 +36,9 @@
logger=logging.getLogger(),
import_path_init_api_args="dedoc.api.api_args",

# ----------------------------------------TABLE RECOGNIZER SETTINGS-------------------------------------------------
min_h_cell=8,
min_w_cell=20,
type_top_attr=1,
type_left_top_attr=2,
type_left_attr=3,
max_vertical_extended=20,
minimal_cell_cnt_line=5,
minimal_cell_avg_length_line=10,

path_cells=os.path.join(os.path.abspath(os.sep), "tmp", "dedoc", "debug_tables", "imgs", "cells"),
# ----------------------------------------TABLE RECOGNIZER DEBUG SETTINGS-------------------------------------------
# path to save debug images for tables recognizer
path_detect=os.path.join(os.path.abspath(os.sep), "tmp", "dedoc", "debug_tables", "imgs", "detect_lines"),
rotate_threshold=0.3,

# -------------------------------------------RECOGNIZE SETTINGS-----------------------------------------------------
# TESSERACT OCR confidence threshold (values: -1 - undefined; 0.0 : 100.0 % - confidence value)
Expand Down
6 changes: 5 additions & 1 deletion dedoc/readers/pdf_reader/data_classes/tables/table_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ class TableTree(object):
Table whose cells are stored as sorted children of a tree.
The table has the type of a tree and was obtained with the help of contour analysis.
"""
min_h_cell = 8
min_w_cell = 20
minimal_cell_cnt_line = 5
minimal_cell_avg_length_line = 10

def __init__(self, *, config: dict) -> None:
self.left = None
Expand Down Expand Up @@ -94,7 +98,7 @@ def __build_childs(self, cur: "TableTree", hierarchy: List, contours: List) -> "
if h[3] == cur.id_contours:
bbox = cv2.boundingRect(contours[i]) # [x_begin, y_begin, width, height]
# Heuristic #1 for a cell
if bbox[2] < self.config["min_w_cell"] or bbox[3] < self.config["min_h_cell"]:
if bbox[2] < self.min_w_cell or bbox[3] < self.min_h_cell:
if self.config.get("debug_mode", False):
self.logger.debug(f"Contour {i} isn't correct")
continue
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_page.ocr_page import OcrPage
from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_utils import get_text_with_bbox_from_cells
from dedoc.utils.image_utils import get_highest_pixel_frequency
from dedoc.utils.parameter_utils import get_path_param


class OCRCellExtractor:
Expand All @@ -30,7 +31,7 @@ def get_cells_text(self, page_image: np.ndarray, tree_nodes: List["TableTree"],
for num_batch, nodes_batch in enumerate(batches):

if self.config.get("debug_mode", False):
tmp_dir = os.path.join(self.config.get("path_debug"), "debug_tables/batches/")
tmp_dir = os.path.join(get_path_param(self.config, "path_debug"), "debug_tables/batches/")
os.makedirs(tmp_dir, exist_ok=True)
for i, table_tree_node in enumerate(nodes_batch):
cv2.imwrite(os.path.join(tmp_dir, f"image_{num_batch}_{i}.png"), BBox.crop_image_by_box(page_image, table_tree_node.cell_box))
Expand Down Expand Up @@ -64,7 +65,9 @@ def get_cells_text(self, page_image: np.ndarray, tree_nodes: List["TableTree"],
def __handle_one_batch(self, src_image: np.ndarray, tree_table_nodes: List["TableTree"], num_batch: int, language: str = "rus") -> Tuple[OcrPage, List[BBox]]: # noqa
concatenated, chunk_boxes = self.__concat_images(src_image=src_image, tree_table_nodes=tree_table_nodes)
if self.config.get("debug_mode", False):
image_path = os.path.join(self.config.get("path_debug"), "debug_tables", "batches", f"stacked_batch_image_{num_batch}.png")
debug_dir = os.path.join(get_path_param(self.config, "path_debug"), "debug_tables", "batches")
os.makedirs(debug_dir, exist_ok=True)
image_path = os.path.join(debug_dir, f"stacked_batch_image_{num_batch}.png")
cv2.imwrite(image_path, concatenated)
ocr_result = get_text_with_bbox_from_cells(concatenated, language, ocr_conf_threshold=0.0)

Expand All @@ -82,8 +85,11 @@ def __concat_images(self, src_image: np.ndarray, tree_table_nodes: List["TableTr
for tree_node in tree_table_nodes:
x_coord = space
cell_image = BBox.crop_image_by_box(src_image, tree_node.crop_text_box)
image_path = os.path.join(self.config.get("path_debug"), "debug_tables", "batches", "cell_croped.png")
cv2.imwrite(image_path, cell_image)
if self.config.get("debug_mode", False):
debug_dir = os.path.join(get_path_param(self.config, "path_debug"), "debug_tables", "batches")
os.makedirs(debug_dir, exist_ok=True)
image_path = os.path.join(debug_dir, "cell_croped.png")
cv2.imwrite(image_path, cell_image)
cell_height, cell_width = cell_image.shape[0], cell_image.shape[1]

stacked_image[y_prev:y_prev + cell_height, x_coord:x_coord + cell_width] = cell_image
Expand Down
15 changes: 8 additions & 7 deletions dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_line_extractor import OCRLineExtractor
from dedoc.train_dataset.train_dataset_utils import save_page_with_bbox
from dedoc.utils import supported_image_types
from dedoc.utils.parameter_utils import get_path_param


class PdfImageReader(PdfBaseReader):
Expand Down Expand Up @@ -52,8 +53,6 @@ def __init__(self, *, config: dict) -> None:
self.binarizer = AdaptiveBinarizer()
self.ocr = OCRLineExtractor(config=config)
self.logger = config.get("logger", logging.getLogger())
if self.config.get("debug_mode") and not os.path.exists(self.config["path_debug"]):
os.makedirs(self.config["path_debug"])

def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool:
"""
Expand All @@ -70,14 +69,15 @@ def _process_one_page(self,
path: str) -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[float]]:
# --- Step 1: correct orientation and detect column count ---
rotated_image, is_one_column_document, angle = self._detect_column_count_and_orientation(image, parameters)
if self.config.get("debug_mode"):
if self.config.get("debug_mode", False):
self.logger.info(f"Angle page rotation = {angle}")

# --- Step 2: do binarization ---
if parameters.need_binarization:
rotated_image, _ = self.binarizer.preprocess(rotated_image)
if self.config.get("debug_mode"):
cv2.imwrite(os.path.join(self.config["path_debug"], f"{datetime.now().strftime('%H-%M-%S')}_result_binarization.jpg"), rotated_image)
if self.config.get("debug_mode", False):
debug_dir = get_path_param(self.config, "path_debug")
cv2.imwrite(os.path.join(debug_dir, f"{datetime.now().strftime('%H-%M-%S')}_result_binarization.jpg"), rotated_image)

# --- Step 3: table detection and recognition ---
if parameters.need_pdf_table_analysis:
Expand Down Expand Up @@ -122,8 +122,9 @@ def _detect_column_count_and_orientation(self, image: np.ndarray, parameters: Pa
rotated_image, result_angle = self.scew_corrector.preprocess(image, {"orientation_angle": angle})
result_angle = result_angle["rotated_angle"]

if self.config.get("debug_mode"):
img_path = os.path.join(self.config["path_debug"], f"{datetime.now().strftime('%H-%M-%S')}_result_orientation.jpg")
if self.config.get("debug_mode", False):
debug_dir = get_path_param(self.config, "path_debug")
img_path = os.path.join(debug_dir, f"{datetime.now().strftime('%H-%M-%S')}_result_orientation.jpg")
self.logger.info(f"Save image to {img_path}")
cv2.imwrite(img_path, rotated_image)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ class OnePageTableExtractor(BaseTableExtractor):

def __init__(self, *, config: dict, logger: logging.Logger) -> None:
super().__init__(config=config, logger=logger)

self.image = None
self.page_number = 0
self.attribute_selector = TableAttributeExtractor(logger=self.logger)
Expand Down Expand Up @@ -77,8 +78,8 @@ def __detect_diff_orient(self, cell_text: str) -> bool:
avg_len_part = np.average(len_parts)
# Heuristic: the cell is considered rotated if it has a large number of lines and the lines are short
if len(parts) > self.config["minimal_cell_cnt_line"] \
and avg_len_part < self.config["minimal_cell_avg_length_line"]:
if len(parts) > TableTree.minimal_cell_cnt_line \
and avg_len_part < TableTree.minimal_cell_avg_length_line:
return True
return False
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@


class TableRecognizer(object):

def __init__(self, *, config: dict = None) -> None:

self.logger = config.get("logger", logging.getLogger(__name__))
Expand All @@ -27,11 +28,6 @@ def __init__(self, *, config: dict = None) -> None:
self.multipage_tables_extractor = MultiPageTableExtractor(config=config, logger=self.logger)
self.config = config
self.table_type = TableTypeAdditionalOptions()
if config.get("debug", False):
if not os.path.exists(self.config["path_cells"]):
os.makedirs(self.config["path_cells"])
if not os.path.exists(self.config["path_detect"]):
os.makedirs(self.config["path_detect"])

def convert_to_multipages_tables(self, all_single_tables: List[ScanTable], lines_with_meta: List[LineWithMeta]) -> List[ScanTable]:

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,19 @@
from dedoc.config import get_config
from dedoc.readers.pdf_reader.data_classes.tables.table_tree import TableTree
from dedoc.readers.pdf_reader.data_classes.tables.table_type import TableTypeAdditionalOptions
from dedoc.utils.parameter_utils import get_path_param

logger = get_config().get("logger", logging.getLogger())
logger = logger if logger else logging.getLogger("TableRecognizer.detect_tables_by_contours")
table_options = TableTypeAdditionalOptions()

ROTATE_THRESHOLD = 0.3


def rotate_with_threshold(img: np.ndarray, angle: float, threshold: float = None, *, config: dict) -> np.ndarray:
"""Rotates a table image while preserving image.shape during rotation. This is important for word bounding box extraction"""
if threshold is None:
threshold = config["rotate_threshold"]
threshold = ROTATE_THRESHOLD
rotated = img
if abs(angle) > threshold:
if config.get("debug_mode", False):
Expand Down Expand Up @@ -79,43 +82,41 @@ def get_contours_cells(img: np.ndarray, table_type: str, *, config: dict) -> [An
img_bin = 255 - img_bin

if config.get("debug_mode", False):
os.makedirs(config["path_cells"], exist_ok=True)
os.makedirs(config["path_detect"], exist_ok=True)
cv2.imwrite(os.path.join(config["path_detect"], "image_bin.jpg"), img_bin)
cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "image_bin.jpg"), img_bin)
# step 2
img_final_bin = __detect_horizontal_and_vertical_lines(img_bin, config, "tables")
# step 3
img_final_bin_houph, angle_alignment = __apply_houph_lines_and_detect_angle(img_final_bin, config)

(thresh, img_final_bin_houph) = cv2.threshold(img_final_bin_houph, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
if config.get("debug_mode", False):
cv2.imwrite(os.path.join(config["path_detect"], "img_final_bin.jpg"), img_final_bin)
cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "img_final_bin.jpg"), img_final_bin)
if config.get("debug_mode", False):
cv2.imwrite(os.path.join(config["path_detect"], "img_final_bin_houph.jpg"), img_final_bin_houph)
cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "img_final_bin_houph.jpg"), img_final_bin_houph)

# step 4 - rotating
img_final_bin_houph = rotate_with_threshold(img_final_bin_houph, angle_alignment, config=config)
img = rotate_with_threshold(img, angle_alignment, config=config)
if config.get("debug_mode", False):
cv2.imwrite(os.path.join(config["path_detect"], "aligned_img.jpg"), img)
cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "aligned_img.jpg"), img)
img_final_bin_houph = __paint_bounds(img_final_bin_houph)

# step 5 - detect contours
contours, hierarchy = cv2.findContours(img_final_bin_houph, cv2.RETR_TREE, cv2.CHAIN_APPROX_TC89_KCOS)

if config.get("debug_mode", False):
cv2.imwrite(os.path.join(config["path_detect"], "img_houph_and_morph_wo_bound.jpg"), img_final_bin_houph)
cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "img_houph_and_morph_wo_bound.jpg"), img_final_bin_houph)
img_w_contour = img.copy()
cv2.drawContours(img_w_contour, contours, contourIdx=-1, color=(0, 0, 0), thickness=10, hierarchy=hierarchy, maxLevel=8)
cv2.imwrite(os.path.join(config["path_detect"], "img_with_contours.jpg"), img_w_contour)
cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "img_with_contours.jpg"), img_w_contour)

# Draw external contours for tables without external contours. It is a rare case, but important for invoices
if table_options.table_wo_external_bounds in table_type:
contours, hierarchy = __get_contours_for_table_wo_external_bounds(img, img_final_bin_houph.copy(), contours, hierarchy)
contours, hierarchy = __get_contours_for_table_wo_external_bounds(img, img_final_bin_houph.copy(), contours, hierarchy, config)
return contours, hierarchy, img, angle_alignment


def __get_contours_for_table_wo_external_bounds(img: np.ndarray, img_with_contours: np.ndarray, contours: List, hierarchy: List) -> [Any, Any]:
def __get_contours_for_table_wo_external_bounds(img: np.ndarray, img_with_contours: np.ndarray, contours: List, hierarchy: List, config: dict) -> [Any, Any]:
# get children (get table contours)
contours = np.array(contours)
list_contours, table_contours = __get_table_contours(contours, hierarchy)
Expand All @@ -137,8 +138,8 @@ def __get_contours_for_table_wo_external_bounds(img: np.ndarray, img_with_contou
x, y, w, h = cv2.boundingRect(c)
cv2.rectangle(img_with_contours, (x, y), (x + w, y + h), color=(0, 0, 0), thickness=5)

if get_config().get("debug_mode", False):
cv2.imwrite(os.path.join(get_config()["path_detect"], "img_with_external_bounds.jpg"), img_with_contours)
if config.get("debug_mode", False):
cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "img_with_external_bounds.jpg"), img_with_contours)
contours, hierarchy = cv2.findContours(img_with_contours, cv2.RETR_TREE, cv2.CHAIN_APPROX_TC89_KCOS)

return contours, hierarchy
Expand Down Expand Up @@ -172,7 +173,7 @@ def __apply_houph_lines_and_detect_angle(image: np.ndarray, config: dict) -> [np
gap_avg = min(np.mean([c[2] for c in contours_table]) // 45, gap_avg)
else:
gap_avg = 5
if config["debug_mode"]:
if config.get("debug_mode", False):
config.get("logger", logging.getLogger()).debug(f"Houph gap = {gap_avg}")

# ----- image alignment -----
Expand All @@ -191,8 +192,9 @@ def __detect_horizontal_and_vertical_lines(img_bin: np.ndarray, config: dict, ta
elif task == "tables":
length_div = 55
height_div = 100
kernel_length_weight = max(np.array(img_bin).shape[1] // length_div, config["min_w_cell"]) # 35
kernel_length_height = max(np.array(img_bin).shape[0] // height_div, config["min_h_cell"]) # 100

kernel_length_weight = max(np.array(img_bin).shape[1] // length_div, TableTree.min_w_cell) # 35
kernel_length_height = max(np.array(img_bin).shape[0] // height_div, TableTree.min_h_cell) # 100

# A vertical kernel of (1 X kernel_length), which will detect all the vertical lines from the image.
verticle_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, kernel_length_height))
Expand All @@ -211,8 +213,8 @@ def __detect_horizontal_and_vertical_lines(img_bin: np.ndarray, config: dict, ta
horizontal_lines_img = cv2.dilate(img_temp2, hori_kernel, iterations=iterations)

if config.get("debug_mode", False):
cv2.imwrite(os.path.join(config["path_detect"], "verticle_lines.jpg"), verticle_lines_img)
cv2.imwrite(os.path.join(config["path_detect"], "horizontal_lines.jpg"), horizontal_lines_img)
cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "verticle_lines.jpg"), verticle_lines_img)
cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "horizontal_lines.jpg"), horizontal_lines_img)

"""Now we will add these two images.
This will have only boxes and the information written in the box will be erased.
Expand All @@ -228,7 +230,7 @@ def __detect_horizontal_and_vertical_lines(img_bin: np.ndarray, config: dict, ta
img_bin_with_lines = cv2.erode(~img_bin_with_lines, kernel, iterations=2)
(thresh, img_bin_with_lines) = cv2.threshold(img_bin_with_lines, 200, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
if config.get("debug_mode", False):
cv2.imwrite(os.path.join(config["path_detect"], "img_bin_with_lines.jpg"), img_bin_with_lines)
cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "img_bin_with_lines.jpg"), img_bin_with_lines)

return img_bin_with_lines

Expand Down Expand Up @@ -265,8 +267,8 @@ def detect_tables_by_contours(img: np.ndarray,
if config.get("debug_mode", False):
config.get("logger", logging.getLogger()).debug(f"Hierarchy [Next, Previous, First_Child, Parent]:\n {hierarchy}")
tree_table.print_tree(depth=0)
if config.get("debug_mode", False):
cv2.imwrite(os.path.join(config["path_detect"], "img_draw_counters.jpg"), img)

cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "img_draw_counters.jpg"), img)

tree_table.set_text_into_tree(tree=tree_table, src_image=image, language=language, config=config)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from dedoc.readers.pdf_reader.data_classes.tables.location import Location
from dedoc.readers.pdf_reader.data_classes.text_with_bbox import TextWithBBox
from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdfminer_reader.pdfminer_utils import cleaning_text_from_hieroglyphics, create_bbox, draw_annotation
from dedoc.utils.parameter_utils import get_path_param
from dedoc.utils.pdf_utils import get_page_image

logging.getLogger("pdfminer").setLevel(logging.ERROR)
Expand Down Expand Up @@ -251,7 +252,7 @@ def __debug_extract_layout(self, image_src: np.ndarray, layout: LTContainer, pag
:param layout: container of layout element
:return: None
"""
tmp_dir = os.path.join(self.config.get("path_debug"), "pdfminer")
tmp_dir = os.path.join(get_path_param(self.config, "path_debug"), "pdfminer")
os.makedirs(tmp_dir, exist_ok=True)

file_text = open(os.path.join(tmp_dir, f"text_{page_num}.txt"), "wt")
Expand All @@ -268,10 +269,10 @@ def __debug_extract_layout(self, image_src: np.ndarray, layout: LTContainer, pag

for lobj in lobjs:
if isinstance(lobj, LTTextBoxHorizontal):
annotations.extend(self.__extract_words_bbox_annotation(lobj, k_w, k_h, height, width))
annotations.extend(self.__extract_words_bbox_annotation(lobj, height, width))
lobjs_textline.extend(lobj)
elif isinstance(lobj, LTTextLineHorizontal):
annotations.extend(self.__extract_words_bbox_annotation(lobj, k_w, k_h, height, width))
annotations.extend(self.__extract_words_bbox_annotation(lobj, height, width))
lobjs_textline.append(lobj)
elif isinstance(lobj, LTRect):
lobjs_box.append(lobj)
Expand Down
Loading

0 comments on commit 1fefda5

Please sign in to comment.