Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TLDR 531 pdf_txtlayer_reader table fix #380

Merged
merged 14 commits into from
Dec 1, 2023
16 changes: 3 additions & 13 deletions dedoc/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
# number of parallel jobs in some tasks such as OCR
n_jobs=1,

# --------------------------------------------GPU SETTINGS-------------------------------------------------------
# --------------------------------------------GPU SETTINGS----------------------------------------------------------
# set gpu in XGBoost and torch models
on_gpu=False,

Expand All @@ -36,19 +36,9 @@
logger=logging.getLogger(),
import_path_init_api_args="dedoc.api.api_args",

# ----------------------------------------TABLE RECOGNIZER SETTINGS-------------------------------------------------
min_h_cell=8,
min_w_cell=20,
type_top_attr=1,
type_left_top_attr=2,
type_left_attr=3,
max_vertical_extended=20,
minimal_cell_cnt_line=5,
minimal_cell_avg_length_line=10,

path_cells=os.path.join(os.path.abspath(os.sep), "tmp", "dedoc", "debug_tables", "imgs", "cells"),
# ----------------------------------------TABLE RECOGNIZER DEBUG SETTINGS-------------------------------------------
# path to save debug images for tables recognizer
path_detect=os.path.join(os.path.abspath(os.sep), "tmp", "dedoc", "debug_tables", "imgs", "detect_lines"),
NastyBoget marked this conversation as resolved.
Show resolved Hide resolved
rotate_threshold=0.3,

# -------------------------------------------RECOGNIZE SETTINGS-----------------------------------------------------
# TESSERACT OCR confidence threshold (values: -1 - undefined; 0.0 : 100.0 % - confidence value)
Expand Down
6 changes: 5 additions & 1 deletion dedoc/readers/pdf_reader/data_classes/tables/table_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ class TableTree(object):
Table whose cells are stored as sorted children of a tree.
The table has a tree structure and was obtained by means of contour analysis.
"""
min_h_cell = 8
min_w_cell = 20
minimal_cell_cnt_line = 5
minimal_cell_avg_length_line = 10

def __init__(self, *, config: dict) -> None:
self.left = None
Expand Down Expand Up @@ -94,7 +98,7 @@ def __build_childs(self, cur: "TableTree", hierarchy: List, contours: List) -> "
if h[3] == cur.id_contours:
bbox = cv2.boundingRect(contours[i]) # [x_begin, y_begin, width, height]
# Эвристика №1 на ячейку
if bbox[2] < self.config["min_w_cell"] or bbox[3] < self.config["min_h_cell"]:
if bbox[2] < self.min_w_cell or bbox[3] < self.min_h_cell:
if self.config.get("debug_mode", False):
self.logger.debug(f"Contour {i} isn't correct")
continue
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,9 @@ def __concat_images(self, src_image: np.ndarray, tree_table_nodes: List["TableTr
for tree_node in tree_table_nodes:
x_coord = space
cell_image = BBox.crop_image_by_box(src_image, tree_node.crop_text_box)
image_path = os.path.join(self.config.get("path_debug"), "debug_tables", "batches", "cell_croped.png")
cv2.imwrite(image_path, cell_image)
if self.config.get("debug_mode", False) and self.config.get("path_debug", False) and os.path.exists(self.config.get("path_debug")):
image_path = os.path.join(self.config.get("path_debug"), "debug_tables", "batches", "cell_croped.png")
cv2.imwrite(image_path, cell_image)
cell_height, cell_width = cell_image.shape[0], cell_image.shape[1]

stacked_image[y_prev:y_prev + cell_height, x_coord:x_coord + cell_width] = cell_image
Expand Down
8 changes: 4 additions & 4 deletions dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def __init__(self, *, config: dict) -> None:
self.binarizer = AdaptiveBinarizer()
self.ocr = OCRLineExtractor(config=config)
self.logger = config.get("logger", logging.getLogger())
if self.config.get("debug_mode") and not os.path.exists(self.config["path_debug"]):
if self.config.get("debug_mode", False) and not os.path.exists(self.config["path_debug"]):
Copy link
Collaborator

@NastyBoget NastyBoget Dec 1, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Насчет debug_mode:

выбери что тебе больше нравится: config.get("debug_mode") или config.get("debug_mode", False) (или что проще править), и давай сделаем везде одинаково. Хотя все равно в процессе разработки кто как будет писать.

Насчет path_debug лучше просто .get("path_debug")

os.makedirs(self.config["path_debug"])

def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool:
Expand All @@ -70,13 +70,13 @@ def _process_one_page(self,
path: str) -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[float]]:
# --- Step 1: correct orientation and detect column count ---
rotated_image, is_one_column_document, angle = self._detect_column_count_and_orientation(image, parameters)
if self.config.get("debug_mode"):
if self.config.get("debug_mode", False):
self.logger.info(f"Angle page rotation = {angle}")

# --- Step 2: do binarization ---
if parameters.need_binarization:
rotated_image, _ = self.binarizer.preprocess(rotated_image)
if self.config.get("debug_mode"):
if self.config.get("debug_mode", False):
cv2.imwrite(os.path.join(self.config["path_debug"], f"{datetime.now().strftime('%H-%M-%S')}_result_binarization.jpg"), rotated_image)

# --- Step 3: table detection and recognition ---
Expand Down Expand Up @@ -122,7 +122,7 @@ def _detect_column_count_and_orientation(self, image: np.ndarray, parameters: Pa
rotated_image, result_angle = self.scew_corrector.preprocess(image, {"orientation_angle": angle})
result_angle = result_angle["rotated_angle"]

if self.config.get("debug_mode"):
if self.config.get("debug_mode", False):
img_path = os.path.join(self.config["path_debug"], f"{datetime.now().strftime('%H-%M-%S')}_result_orientation.jpg")
NastyBoget marked this conversation as resolved.
Show resolved Hide resolved
self.logger.info(f"Save image to {img_path}")
cv2.imwrite(img_path, rotated_image)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ class OnePageTableExtractor(BaseTableExtractor):

def __init__(self, *, config: dict, logger: logging.Logger) -> None:
super().__init__(config=config, logger=logger)

self.image = None
self.page_number = 0
self.attribute_selector = TableAttributeExtractor(logger=self.logger)
Expand Down Expand Up @@ -77,8 +78,8 @@ def __detect_diff_orient(self, cell_text: str) -> bool:
avg_len_part = np.average(len_parts)

# Эвристика: считаем что ячейка повернута, если у нас большое количество строк и строки короткие
if len(parts) > self.config["minimal_cell_cnt_line"] \
and avg_len_part < self.config["minimal_cell_avg_length_line"]:
NastyBoget marked this conversation as resolved.
Show resolved Hide resolved
if len(parts) > TableTree.minimal_cell_cnt_line \
and avg_len_part < TableTree.minimal_cell_avg_length_line:
return True
return False

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@


class TableRecognizer(object):

def __init__(self, *, config: dict = None) -> None:

self.logger = config.get("logger", logging.getLogger(__name__))
Expand All @@ -27,11 +28,6 @@ def __init__(self, *, config: dict = None) -> None:
self.multipage_tables_extractor = MultiPageTableExtractor(config=config, logger=self.logger)
self.config = config
self.table_type = TableTypeAdditionalOptions()
if config.get("debug", False):
if not os.path.exists(self.config["path_cells"]):
os.makedirs(self.config["path_cells"])
if not os.path.exists(self.config["path_detect"]):
NastyBoget marked this conversation as resolved.
Show resolved Hide resolved
os.makedirs(self.config["path_detect"])

def convert_to_multipages_tables(self, all_single_tables: List[ScanTable], lines_with_meta: List[LineWithMeta]) -> List[ScanTable]:

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,13 @@
logger = logger if logger else logging.getLogger("TableRecognizer.detect_tables_by_contours")
table_options = TableTypeAdditionalOptions()

ROTATE_THRESHOLD = 0.3


def rotate_with_threshold(img: np.ndarray, angle: float, threshold: float = None, *, config: dict) -> np.ndarray:
"""Rotates a table image while preserving image.shape during rotation. This is important for word bounding box extraction."""
if threshold is None:
threshold = config["rotate_threshold"]
threshold = ROTATE_THRESHOLD
rotated = img
if abs(angle) > threshold:
if config.get("debug_mode", False):
Expand Down Expand Up @@ -62,7 +64,7 @@ def apply_houph_line(img: np.ndarray, threshold_gap: int = 10, *, config: dict)
return cdst_p, angle


def get_contours_cells(img: np.ndarray, table_type: str, *, config: dict) -> [Any, Any, np.ndarray, float]:
def get_contours_cells(img: np.ndarray, table_type: str, *, config: dict, path_detect: str) -> [Any, Any, np.ndarray, float]:
NastyBoget marked this conversation as resolved.
Show resolved Hide resolved
"""
function's steps:
1) detects Hough lines to determine the rotation angle. Then the input image is rotated by that angle.
Expand All @@ -79,43 +81,47 @@ def get_contours_cells(img: np.ndarray, table_type: str, *, config: dict) -> [An
img_bin = 255 - img_bin

if config.get("debug_mode", False):
os.makedirs(config["path_cells"], exist_ok=True)
os.makedirs(config["path_detect"], exist_ok=True)
cv2.imwrite(os.path.join(config["path_detect"], "image_bin.jpg"), img_bin)
os.makedirs(path_detect, exist_ok=True)

if config.get("debug_mode", False):
cv2.imwrite(os.path.join(path_detect, "image_bin.jpg"), img_bin)
# step 2
img_final_bin = __detect_horizontal_and_vertical_lines(img_bin, config, "tables")
img_final_bin = __detect_horizontal_and_vertical_lines(img_bin, config, "tables", path_detect)
# step 3
img_final_bin_houph, angle_alignment = __apply_houph_lines_and_detect_angle(img_final_bin, config)

(thresh, img_final_bin_houph) = cv2.threshold(img_final_bin_houph, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
if config.get("debug_mode", False):
cv2.imwrite(os.path.join(config["path_detect"], "img_final_bin.jpg"), img_final_bin)
cv2.imwrite(os.path.join(path_detect, "img_final_bin.jpg"), img_final_bin)
if config.get("debug_mode", False):
cv2.imwrite(os.path.join(config["path_detect"], "img_final_bin_houph.jpg"), img_final_bin_houph)
cv2.imwrite(os.path.join(path_detect, "img_final_bin_houph.jpg"), img_final_bin_houph)

# step 4 - rotating
img_final_bin_houph = rotate_with_threshold(img_final_bin_houph, angle_alignment, config=config)
img = rotate_with_threshold(img, angle_alignment, config=config)
if config.get("debug_mode", False):
cv2.imwrite(os.path.join(config["path_detect"], "aligned_img.jpg"), img)
# TODO: paths should be configurable but now could not exist
cv2.imwrite(os.path.join(path_detect, "aligned_img.jpg"), img)
img_final_bin_houph = __paint_bounds(img_final_bin_houph)

# step 5 - detect contours
contours, hierarchy = cv2.findContours(img_final_bin_houph, cv2.RETR_TREE, cv2.CHAIN_APPROX_TC89_KCOS)

if config.get("debug_mode", False):
cv2.imwrite(os.path.join(config["path_detect"], "img_houph_and_morph_wo_bound.jpg"), img_final_bin_houph)
# TODO: paths should be configurable but now could not exist
cv2.imwrite(os.path.join(path_detect, "img_houph_and_morph_wo_bound.jpg"), img_final_bin_houph)
img_w_contour = img.copy()
cv2.drawContours(img_w_contour, contours, contourIdx=-1, color=(0, 0, 0), thickness=10, hierarchy=hierarchy, maxLevel=8)
cv2.imwrite(os.path.join(config["path_detect"], "img_with_contours.jpg"), img_w_contour)
cv2.imwrite(os.path.join(path_detect, "img_with_contours.jpg"), img_w_contour)

# Draw external contours for tables without external contours. It is a rare case, but important for invoices
if table_options.table_wo_external_bounds in table_type:
contours, hierarchy = __get_contours_for_table_wo_external_bounds(img, img_final_bin_houph.copy(), contours, hierarchy)
contours, hierarchy = __get_contours_for_table_wo_external_bounds(img, img_final_bin_houph.copy(), contours, hierarchy, path_detect)
return contours, hierarchy, img, angle_alignment


def __get_contours_for_table_wo_external_bounds(img: np.ndarray, img_with_contours: np.ndarray, contours: List, hierarchy: List) -> [Any, Any]:
def __get_contours_for_table_wo_external_bounds(img: np.ndarray, img_with_contours: np.ndarray, contours: List, hierarchy: List,
NastyBoget marked this conversation as resolved.
Show resolved Hide resolved
path_detect: str) -> [Any, Any]:
# get children (get table contours)
contours = np.array(contours)
list_contours, table_contours = __get_table_contours(contours, hierarchy)
Expand All @@ -138,7 +144,8 @@ def __get_contours_for_table_wo_external_bounds(img: np.ndarray, img_with_contou
cv2.rectangle(img_with_contours, (x, y), (x + w, y + h), color=(0, 0, 0), thickness=5)

if get_config().get("debug_mode", False):
cv2.imwrite(os.path.join(get_config()["path_detect"], "img_with_external_bounds.jpg"), img_with_contours)
# TODO: paths should be configurable but now could not exist
cv2.imwrite(os.path.join(path_detect, "img_with_external_bounds.jpg"), img_with_contours)
contours, hierarchy = cv2.findContours(img_with_contours, cv2.RETR_TREE, cv2.CHAIN_APPROX_TC89_KCOS)

return contours, hierarchy
Expand Down Expand Up @@ -172,7 +179,7 @@ def __apply_houph_lines_and_detect_angle(image: np.ndarray, config: dict) -> [np
gap_avg = min(np.mean([c[2] for c in contours_table]) // 45, gap_avg)
else:
gap_avg = 5
if config["debug_mode"]:
if config.get("debug_mode", False):
config.get("logger", logging.getLogger()).debug(f"Houph gap = {gap_avg}")

# ----- image alignment -----
Expand All @@ -182,7 +189,7 @@ def __apply_houph_lines_and_detect_angle(image: np.ndarray, config: dict) -> [np
return img_final_bin_houph, angle_alignment


def __detect_horizontal_and_vertical_lines(img_bin: np.ndarray, config: dict, task: str) -> np.ndarray:
def __detect_horizontal_and_vertical_lines(img_bin: np.ndarray, config: dict, task: str, path_detect: str) -> np.ndarray:
NastyBoget marked this conversation as resolved.
Show resolved Hide resolved
# Defining a kernel length

if task == "orientation":
Expand All @@ -191,8 +198,9 @@ def __detect_horizontal_and_vertical_lines(img_bin: np.ndarray, config: dict, ta
elif task == "tables":
length_div = 55
height_div = 100
kernel_length_weight = max(np.array(img_bin).shape[1] // length_div, config["min_w_cell"]) # 35
kernel_length_height = max(np.array(img_bin).shape[0] // height_div, config["min_h_cell"]) # 100

kernel_length_weight = max(np.array(img_bin).shape[1] // length_div, TableTree.min_w_cell) # 35
kernel_length_height = max(np.array(img_bin).shape[0] // height_div, TableTree.min_h_cell) # 100

# A vertical kernel of (1 X kernel_length), which will detect all the vertical lines from the image.
verticle_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, kernel_length_height))
Expand All @@ -211,8 +219,9 @@ def __detect_horizontal_and_vertical_lines(img_bin: np.ndarray, config: dict, ta
horizontal_lines_img = cv2.dilate(img_temp2, hori_kernel, iterations=iterations)

if config.get("debug_mode", False):
cv2.imwrite(os.path.join(config["path_detect"], "verticle_lines.jpg"), verticle_lines_img)
cv2.imwrite(os.path.join(config["path_detect"], "horizontal_lines.jpg"), horizontal_lines_img)
# TODO: paths should be configurable but now could not exist
cv2.imwrite(os.path.join(path_detect, "verticle_lines.jpg"), verticle_lines_img)
cv2.imwrite(os.path.join(path_detect, "horizontal_lines.jpg"), horizontal_lines_img)

"""Now we will add these two images.
This will have only boxes and the information written in the box will be erased.
Expand All @@ -228,7 +237,8 @@ def __detect_horizontal_and_vertical_lines(img_bin: np.ndarray, config: dict, ta
img_bin_with_lines = cv2.erode(~img_bin_with_lines, kernel, iterations=2)
(thresh, img_bin_with_lines) = cv2.threshold(img_bin_with_lines, 200, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
if config.get("debug_mode", False):
cv2.imwrite(os.path.join(config["path_detect"], "img_bin_with_lines.jpg"), img_bin_with_lines)
# TODO: paths should be configurable but now could not exist
cv2.imwrite(os.path.join(path_detect, "img_bin_with_lines.jpg"), img_bin_with_lines)

return img_bin_with_lines

Expand Down Expand Up @@ -259,14 +269,18 @@ def detect_tables_by_contours(img: np.ndarray,
:param config: dict from config.py
:return: TreeTable, contour, rotate angle
"""
contours, hierarchy, image, angle_rotate = get_contours_cells(img, table_type, config=config)
path_detect = config.get("path_detect", None)
NastyBoget marked this conversation as resolved.
Show resolved Hide resolved
if config.get("debug_mode", False) and path_detect is None:
path_detect = os.path.join(os.path.abspath(os.sep), "tmp", "dedoc", "debug_tables", "imgs", "detect_lines")

contours, hierarchy, image, angle_rotate = get_contours_cells(img, table_type, config=config, path_detect=path_detect)
tree_table = TableTree.parse_contours_to_tree(contours=contours, hierarchy=hierarchy, config=config)

if config.get("debug_mode", False):
config.get("logger", logging.getLogger()).debug(f"Hierarchy [Next, Previous, First_Child, Parent]:\n {hierarchy}")
tree_table.print_tree(depth=0)
if config.get("debug_mode", False):
cv2.imwrite(os.path.join(config["path_detect"], "img_draw_counters.jpg"), img)

cv2.imwrite(os.path.join(path_detect, "img_draw_counters.jpg"), img)

tree_table.set_text_into_tree(tree=tree_table, src_image=image, language=language, config=config)

Expand Down
7 changes: 7 additions & 0 deletions tests/unit_tests/test_format_pdf_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,3 +140,10 @@ def test_pdf_text_layer(self) -> None:
annotations = line.annotations
annotations_set = {(a.name, a.value, a.start, a.end) for a in annotations}
self.assertEqual(len(annotations_set), len(annotations))

def test_table_extractor(self) -> None:
    """Check that the text-layer PDF reader extracts a table when given an empty config."""
    # The reader has to work without any configuration keys.
    reader = PdfTxtlayerReader(config={})
    data_dir = os.path.dirname(__file__)
    pdf_path = os.path.join(data_dir, "../data/pdf_with_text_layer/english_doc.pdf")
    read_params = {"need_pdf_table_analysis": "True"}
    document = reader.read(pdf_path, document_type=None, parameters=read_params)
    # Exactly one table is expected in the sample document.
    self.assertEqual(len(document.tables), 1)
Loading