TLDR 531 pdf_txtlayer_reader table fix (#380)

ispras · Dec 1, 2023 · 1fefda5 · 1fefda5
1 parent d83bf23
commit 1fefda5
Show file tree

Hide file tree

Showing 10 changed files with 80 additions and 56 deletions.
diff --git a/dedoc/config.py b/dedoc/config.py
@@ -22,7 +22,7 @@
     # number of parallel jobs in some tasks as OCR
     n_jobs=1,
 
-    # --------------------------------------------GPU SETTINGS-------------------------------------------------------
+    # --------------------------------------------GPU SETTINGS----------------------------------------------------------
     # set gpu in XGBoost and torch models
     on_gpu=False,
 
@@ -36,19 +36,9 @@
     logger=logging.getLogger(),
     import_path_init_api_args="dedoc.api.api_args",
 
-    # ----------------------------------------TABLE RECOGNIZER SETTINGS-------------------------------------------------
-    min_h_cell=8,
-    min_w_cell=20,
-    type_top_attr=1,
-    type_left_top_attr=2,
-    type_left_attr=3,
-    max_vertical_extended=20,
-    minimal_cell_cnt_line=5,
-    minimal_cell_avg_length_line=10,
-
-    path_cells=os.path.join(os.path.abspath(os.sep), "tmp", "dedoc", "debug_tables", "imgs", "cells"),
+    # ----------------------------------------TABLE RECOGNIZER DEBUG SETTINGS-------------------------------------------
+    # path to save debug images for tables recognizer
     path_detect=os.path.join(os.path.abspath(os.sep), "tmp", "dedoc", "debug_tables", "imgs", "detect_lines"),
-    rotate_threshold=0.3,
 
     # -------------------------------------------RECOGNIZE SETTINGS-----------------------------------------------------
     # TESSERACT OCR confidence threshold ( values: [-1 - undefined;  0.0 : 100.0 % - confidence value)

diff --git a/dedoc/readers/pdf_reader/data_classes/tables/table_tree.py b/dedoc/readers/pdf_reader/data_classes/tables/table_tree.py
@@ -21,6 +21,10 @@ class TableTree(object):
     Table which has cells as sorted childs of tree.
     Table has type of tree and was obtained with help contour analysis.
     """
+    min_h_cell = 8
+    min_w_cell = 20
+    minimal_cell_cnt_line = 5
+    minimal_cell_avg_length_line = 10
 
     def __init__(self, *, config: dict) -> None:
         self.left = None
@@ -94,7 +98,7 @@ def __build_childs(self, cur: "TableTree", hierarchy: List, contours: List) -> "
             if h[3] == cur.id_contours:
                 bbox = cv2.boundingRect(contours[i])  # [x_begin, y_begin, width, height]
                 # Эвристика №1 на ячейку
-                if bbox[2] < self.config["min_w_cell"] or bbox[3] < self.config["min_h_cell"]:
+                if bbox[2] < self.min_w_cell or bbox[3] < self.min_h_cell:
                     if self.config.get("debug_mode", False):
                         self.logger.debug(f"Contour {i} isn't correct")
                     continue

diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py
@@ -11,6 +11,7 @@
 from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_page.ocr_page import OcrPage
 from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_utils import get_text_with_bbox_from_cells
 from dedoc.utils.image_utils import get_highest_pixel_frequency
+from dedoc.utils.parameter_utils import get_path_param
 
 
 class OCRCellExtractor:
@@ -30,7 +31,7 @@ def get_cells_text(self, page_image: np.ndarray, tree_nodes: List["TableTree"],
         for num_batch, nodes_batch in enumerate(batches):
 
             if self.config.get("debug_mode", False):
-                tmp_dir = os.path.join(self.config.get("path_debug"), "debug_tables/batches/")
+                tmp_dir = os.path.join(get_path_param(self.config, "path_debug"), "debug_tables/batches/")
                 os.makedirs(tmp_dir, exist_ok=True)
                 for i, table_tree_node in enumerate(nodes_batch):
                     cv2.imwrite(os.path.join(tmp_dir, f"image_{num_batch}_{i}.png"), BBox.crop_image_by_box(page_image, table_tree_node.cell_box))
@@ -64,7 +65,9 @@ def get_cells_text(self, page_image: np.ndarray, tree_nodes: List["TableTree"],
     def __handle_one_batch(self, src_image: np.ndarray, tree_table_nodes: List["TableTree"], num_batch: int, language: str = "rus") -> Tuple[OcrPage, List[BBox]]: # noqa
         concatenated, chunk_boxes = self.__concat_images(src_image=src_image, tree_table_nodes=tree_table_nodes)
         if self.config.get("debug_mode", False):
-            image_path = os.path.join(self.config.get("path_debug"), "debug_tables", "batches", f"stacked_batch_image_{num_batch}.png")
+            debug_dir = os.path.join(get_path_param(self.config, "path_debug"), "debug_tables", "batches")
+            os.makedirs(debug_dir, exist_ok=True)
+            image_path = os.path.join(debug_dir, f"stacked_batch_image_{num_batch}.png")
             cv2.imwrite(image_path, concatenated)
         ocr_result = get_text_with_bbox_from_cells(concatenated, language, ocr_conf_threshold=0.0)
 
@@ -82,8 +85,11 @@ def __concat_images(self, src_image: np.ndarray, tree_table_nodes: List["TableTr
         for tree_node in tree_table_nodes:
             x_coord = space
             cell_image = BBox.crop_image_by_box(src_image, tree_node.crop_text_box)
-            image_path = os.path.join(self.config.get("path_debug"), "debug_tables", "batches", "cell_croped.png")
-            cv2.imwrite(image_path, cell_image)
+            if self.config.get("debug_mode", False):
+                debug_dir = os.path.join(get_path_param(self.config, "path_debug"), "debug_tables", "batches")
+                os.makedirs(debug_dir, exist_ok=True)
+                image_path = os.path.join(debug_dir, "cell_croped.png")
+                cv2.imwrite(image_path, cell_image)
             cell_height, cell_width = cell_image.shape[0], cell_image.shape[1]
 
             stacked_image[y_prev:y_prev + cell_height, x_coord:x_coord + cell_width] = cell_image

diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py b/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py
@@ -17,6 +17,7 @@
 from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_line_extractor import OCRLineExtractor
 from dedoc.train_dataset.train_dataset_utils import save_page_with_bbox
 from dedoc.utils import supported_image_types
+from dedoc.utils.parameter_utils import get_path_param
 
 
 class PdfImageReader(PdfBaseReader):
@@ -52,8 +53,6 @@ def __init__(self, *, config: dict) -> None:
         self.binarizer = AdaptiveBinarizer()
         self.ocr = OCRLineExtractor(config=config)
         self.logger = config.get("logger", logging.getLogger())
-        if self.config.get("debug_mode") and not os.path.exists(self.config["path_debug"]):
-            os.makedirs(self.config["path_debug"])
 
     def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool:
         """
@@ -70,14 +69,15 @@ def _process_one_page(self,
                           path: str) -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[float]]:
         #  --- Step 1: correct orientation and detect column count ---
         rotated_image, is_one_column_document, angle = self._detect_column_count_and_orientation(image, parameters)
-        if self.config.get("debug_mode"):
+        if self.config.get("debug_mode", False):
             self.logger.info(f"Angle page rotation = {angle}")
 
         #  --- Step 2: do binarization ---
         if parameters.need_binarization:
             rotated_image, _ = self.binarizer.preprocess(rotated_image)
-            if self.config.get("debug_mode"):
-                cv2.imwrite(os.path.join(self.config["path_debug"], f"{datetime.now().strftime('%H-%M-%S')}_result_binarization.jpg"), rotated_image)
+            if self.config.get("debug_mode", False):
+                debug_dir = get_path_param(self.config, "path_debug")
+                cv2.imwrite(os.path.join(debug_dir, f"{datetime.now().strftime('%H-%M-%S')}_result_binarization.jpg"), rotated_image)
 
         #  --- Step 3: table detection and recognition ---
         if parameters.need_pdf_table_analysis:
@@ -122,8 +122,9 @@ def _detect_column_count_and_orientation(self, image: np.ndarray, parameters: Pa
         rotated_image, result_angle = self.scew_corrector.preprocess(image, {"orientation_angle": angle})
         result_angle = result_angle["rotated_angle"]
 
-        if self.config.get("debug_mode"):
-            img_path = os.path.join(self.config["path_debug"], f"{datetime.now().strftime('%H-%M-%S')}_result_orientation.jpg")
+        if self.config.get("debug_mode", False):
+            debug_dir = get_path_param(self.config, "path_debug")
+            img_path = os.path.join(debug_dir, f"{datetime.now().strftime('%H-%M-%S')}_result_orientation.jpg")
             self.logger.info(f"Save image to {img_path}")
             cv2.imwrite(img_path, rotated_image)
 

diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py
@@ -20,6 +20,7 @@ class OnePageTableExtractor(BaseTableExtractor):
 
     def __init__(self, *, config: dict, logger: logging.Logger) -> None:
         super().__init__(config=config, logger=logger)
+
         self.image = None
         self.page_number = 0
         self.attribute_selector = TableAttributeExtractor(logger=self.logger)
@@ -77,8 +78,8 @@ def __detect_diff_orient(self, cell_text: str) -> bool:
         avg_len_part = np.average(len_parts)
 
         # Эвристика: считаем что ячейка повернута, если у нас большое количество строк и строки короткие
-        if len(parts) > self.config["minimal_cell_cnt_line"] \
-                and avg_len_part < self.config["minimal_cell_avg_length_line"]:
+        if len(parts) > TableTree.minimal_cell_cnt_line \
+                and avg_len_part < TableTree.minimal_cell_avg_length_line:
             return True
         return False
 

diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py
@@ -19,6 +19,7 @@
 
 
 class TableRecognizer(object):
+
     def __init__(self, *, config: dict = None) -> None:
 
         self.logger = config.get("logger", logging.getLogger(__name__))
@@ -27,11 +28,6 @@ def __init__(self, *, config: dict = None) -> None:
         self.multipage_tables_extractor = MultiPageTableExtractor(config=config, logger=self.logger)
         self.config = config
         self.table_type = TableTypeAdditionalOptions()
-        if config.get("debug", False):
-            if not os.path.exists(self.config["path_cells"]):
-                os.makedirs(self.config["path_cells"])
-            if not os.path.exists(self.config["path_detect"]):
-                os.makedirs(self.config["path_detect"])
 
     def convert_to_multipages_tables(self, all_single_tables: List[ScanTable], lines_with_meta: List[LineWithMeta]) -> List[ScanTable]:
 

diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/img_processing.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/img_processing.py
@@ -10,16 +10,19 @@
 from dedoc.config import get_config
 from dedoc.readers.pdf_reader.data_classes.tables.table_tree import TableTree
 from dedoc.readers.pdf_reader.data_classes.tables.table_type import TableTypeAdditionalOptions
+from dedoc.utils.parameter_utils import get_path_param
 
 logger = get_config().get("logger", logging.getLogger())
 logger = logger if logger else logging.getLogger("TableRecognizer.detect_tables_by_contours")
 table_options = TableTypeAdditionalOptions()
 
+ROTATE_THRESHOLD = 0.3
+
 
 def rotate_with_threshold(img: np.ndarray, angle: float, threshold: float = None, *, config: dict) -> np.ndarray:
     """rotates a table image and saving image.shape during rotation. It is important for word bounding box extraction"""
     if threshold is None:
-        threshold = config["rotate_threshold"]
+        threshold = ROTATE_THRESHOLD
     rotated = img
     if abs(angle) > threshold:
         if config.get("debug_mode", False):
@@ -79,43 +82,41 @@ def get_contours_cells(img: np.ndarray, table_type: str, *, config: dict) -> [An
     img_bin = 255 - img_bin
 
     if config.get("debug_mode", False):
-        os.makedirs(config["path_cells"], exist_ok=True)
-        os.makedirs(config["path_detect"], exist_ok=True)
-        cv2.imwrite(os.path.join(config["path_detect"], "image_bin.jpg"), img_bin)
+        cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "image_bin.jpg"), img_bin)
     # step 2
     img_final_bin = __detect_horizontal_and_vertical_lines(img_bin, config, "tables")
     # step 3
     img_final_bin_houph, angle_alignment = __apply_houph_lines_and_detect_angle(img_final_bin, config)
 
     (thresh, img_final_bin_houph) = cv2.threshold(img_final_bin_houph, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
     if config.get("debug_mode", False):
-        cv2.imwrite(os.path.join(config["path_detect"], "img_final_bin.jpg"), img_final_bin)
+        cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "img_final_bin.jpg"), img_final_bin)
     if config.get("debug_mode", False):
-        cv2.imwrite(os.path.join(config["path_detect"], "img_final_bin_houph.jpg"), img_final_bin_houph)
+        cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "img_final_bin_houph.jpg"), img_final_bin_houph)
 
     # step 4 - rotating
     img_final_bin_houph = rotate_with_threshold(img_final_bin_houph, angle_alignment, config=config)
     img = rotate_with_threshold(img, angle_alignment, config=config)
     if config.get("debug_mode", False):
-        cv2.imwrite(os.path.join(config["path_detect"], "aligned_img.jpg"), img)
+        cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "aligned_img.jpg"), img)
     img_final_bin_houph = __paint_bounds(img_final_bin_houph)
 
     # step 5  - detect contours
     contours, hierarchy = cv2.findContours(img_final_bin_houph, cv2.RETR_TREE, cv2.CHAIN_APPROX_TC89_KCOS)
 
     if config.get("debug_mode", False):
-        cv2.imwrite(os.path.join(config["path_detect"], "img_houph_and_morph_wo_bound.jpg"), img_final_bin_houph)
+        cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "img_houph_and_morph_wo_bound.jpg"), img_final_bin_houph)
         img_w_contour = img.copy()
         cv2.drawContours(img_w_contour, contours, contourIdx=-1, color=(0, 0, 0), thickness=10, hierarchy=hierarchy, maxLevel=8)
-        cv2.imwrite(os.path.join(config["path_detect"], "img_with_contours.jpg"), img_w_contour)
+        cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "img_with_contours.jpg"), img_w_contour)
 
     # Draw external contours for tables without external contours. It is a rare case, but important for invoices
     if table_options.table_wo_external_bounds in table_type:
-        contours, hierarchy = __get_contours_for_table_wo_external_bounds(img, img_final_bin_houph.copy(), contours, hierarchy)
+        contours, hierarchy = __get_contours_for_table_wo_external_bounds(img, img_final_bin_houph.copy(), contours, hierarchy, config)
     return contours, hierarchy, img, angle_alignment
 
 
-def __get_contours_for_table_wo_external_bounds(img: np.ndarray, img_with_contours: np.ndarray, contours: List, hierarchy: List) -> [Any, Any]:
+def __get_contours_for_table_wo_external_bounds(img: np.ndarray, img_with_contours: np.ndarray, contours: List, hierarchy: List, config: dict) -> [Any, Any]:
     # get children (get table counters)
     contours = np.array(contours)
     list_contours, table_contours = __get_table_contours(contours, hierarchy)
@@ -137,8 +138,8 @@ def __get_contours_for_table_wo_external_bounds(img: np.ndarray, img_with_contou
         x, y, w, h = cv2.boundingRect(c)
         cv2.rectangle(img_with_contours, (x, y), (x + w, y + h), color=(0, 0, 0), thickness=5)
 
-    if get_config().get("debug_mode", False):
-        cv2.imwrite(os.path.join(get_config()["path_detect"], "img_with_external_bounds.jpg"), img_with_contours)
+    if config.get("debug_mode", False):
+        cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "img_with_external_bounds.jpg"), img_with_contours)
     contours, hierarchy = cv2.findContours(img_with_contours, cv2.RETR_TREE, cv2.CHAIN_APPROX_TC89_KCOS)
 
     return contours, hierarchy
@@ -172,7 +173,7 @@ def __apply_houph_lines_and_detect_angle(image: np.ndarray, config: dict) -> [np
         gap_avg = min(np.mean([c[2] for c in contours_table]) // 45, gap_avg)
     else:
         gap_avg = 5
-    if config["debug_mode"]:
+    if config.get("debug_mode", False):
         config.get("logger", logging.getLogger()).debug(f"Houph gap = {gap_avg}")
 
     # ----- image alignment -----
@@ -191,8 +192,9 @@ def __detect_horizontal_and_vertical_lines(img_bin: np.ndarray, config: dict, ta
     elif task == "tables":
         length_div = 55
         height_div = 100
-    kernel_length_weight = max(np.array(img_bin).shape[1] // length_div, config["min_w_cell"])  # 35
-    kernel_length_height = max(np.array(img_bin).shape[0] // height_div, config["min_h_cell"])  # 100
+
+    kernel_length_weight = max(np.array(img_bin).shape[1] // length_div, TableTree.min_w_cell)  # 35
+    kernel_length_height = max(np.array(img_bin).shape[0] // height_div, TableTree.min_h_cell)  # 100
 
     # A verticle kernel of (1 X kernel_length), which will detect all the verticle lines from the image.
     verticle_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, kernel_length_height))
@@ -211,8 +213,8 @@ def __detect_horizontal_and_vertical_lines(img_bin: np.ndarray, config: dict, ta
     horizontal_lines_img = cv2.dilate(img_temp2, hori_kernel, iterations=iterations)
 
     if config.get("debug_mode", False):
-        cv2.imwrite(os.path.join(config["path_detect"], "verticle_lines.jpg"), verticle_lines_img)
-        cv2.imwrite(os.path.join(config["path_detect"], "horizontal_lines.jpg"), horizontal_lines_img)
+        cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "verticle_lines.jpg"), verticle_lines_img)
+        cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "horizontal_lines.jpg"), horizontal_lines_img)
 
     """Now we will add these two images.
     This will have only boxes and the information written in the box will be erased.
@@ -228,7 +230,7 @@ def __detect_horizontal_and_vertical_lines(img_bin: np.ndarray, config: dict, ta
     img_bin_with_lines = cv2.erode(~img_bin_with_lines, kernel, iterations=2)
     (thresh, img_bin_with_lines) = cv2.threshold(img_bin_with_lines, 200, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
     if config.get("debug_mode", False):
-        cv2.imwrite(os.path.join(config["path_detect"], "img_bin_with_lines.jpg"), img_bin_with_lines)
+        cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "img_bin_with_lines.jpg"), img_bin_with_lines)
 
     return img_bin_with_lines
 
@@ -265,8 +267,8 @@ def detect_tables_by_contours(img: np.ndarray,
     if config.get("debug_mode", False):
         config.get("logger", logging.getLogger()).debug(f"Hierarchy [Next, Previous, First_Child, Parent]:\n {hierarchy}")
         tree_table.print_tree(depth=0)
-    if config.get("debug_mode", False):
-        cv2.imwrite(os.path.join(config["path_detect"], "img_draw_counters.jpg"), img)
+
+        cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "img_draw_counters.jpg"), img)
 
     tree_table.set_text_into_tree(tree=tree_table, src_image=image, language=language, config=config)
 

diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py
@@ -28,6 +28,7 @@
 from dedoc.readers.pdf_reader.data_classes.tables.location import Location
 from dedoc.readers.pdf_reader.data_classes.text_with_bbox import TextWithBBox
 from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdfminer_reader.pdfminer_utils import cleaning_text_from_hieroglyphics, create_bbox, draw_annotation
+from dedoc.utils.parameter_utils import get_path_param
 from dedoc.utils.pdf_utils import get_page_image
 
 logging.getLogger("pdfminer").setLevel(logging.ERROR)
@@ -251,7 +252,7 @@ def __debug_extract_layout(self, image_src: np.ndarray, layout: LTContainer, pag
         :param layout: container of layout element
         :return: None
         """
-        tmp_dir = os.path.join(self.config.get("path_debug"), "pdfminer")
+        tmp_dir = os.path.join(get_path_param(self.config, "path_debug"), "pdfminer")
         os.makedirs(tmp_dir, exist_ok=True)
 
         file_text = open(os.path.join(tmp_dir, f"text_{page_num}.txt"), "wt")
@@ -268,10 +269,10 @@ def __debug_extract_layout(self, image_src: np.ndarray, layout: LTContainer, pag
 
         for lobj in lobjs:
             if isinstance(lobj, LTTextBoxHorizontal):
-                annotations.extend(self.__extract_words_bbox_annotation(lobj, k_w, k_h, height, width))
+                annotations.extend(self.__extract_words_bbox_annotation(lobj, height, width))
                 lobjs_textline.extend(lobj)
             elif isinstance(lobj, LTTextLineHorizontal):
-                annotations.extend(self.__extract_words_bbox_annotation(lobj, k_w, k_h, height, width))
+                annotations.extend(self.__extract_words_bbox_annotation(lobj, height, width))
                 lobjs_textline.append(lobj)
             elif isinstance(lobj, LTRect):
                 lobjs_box.append(lobj)