ispras · NastyBoget · Dec 1, 2023 · Nov 29, 2023 · Nov 29, 2023 · Nov 29, 2023
diff --git a/dedoc/config.py b/dedoc/config.py
@@ -22,7 +22,7 @@
     # number of parallel jobs in some tasks as OCR
     n_jobs=1,
 
-    # --------------------------------------------GPU SETTINGS-------------------------------------------------------
+    # --------------------------------------------GPU SETTINGS----------------------------------------------------------
     # set gpu in XGBoost and torch models
     on_gpu=False,
 
@@ -36,19 +36,9 @@
     logger=logging.getLogger(),
     import_path_init_api_args="dedoc.api.api_args",
 
-    # ----------------------------------------TABLE RECOGNIZER SETTINGS-------------------------------------------------
-    min_h_cell=8,
-    min_w_cell=20,
-    type_top_attr=1,
-    type_left_top_attr=2,
-    type_left_attr=3,
-    max_vertical_extended=20,
-    minimal_cell_cnt_line=5,
-    minimal_cell_avg_length_line=10,
-
-    path_cells=os.path.join(os.path.abspath(os.sep), "tmp", "dedoc", "debug_tables", "imgs", "cells"),
+    # ----------------------------------------TABLE RECOGNIZER DEBUG SETTINGS-------------------------------------------
+    # directory where the table recognizer saves its debug images
     path_detect=os.path.join(os.path.abspath(os.sep), "tmp", "dedoc", "debug_tables", "imgs", "detect_lines"),
-    rotate_threshold=0.3,
 
     # -------------------------------------------RECOGNIZE SETTINGS-----------------------------------------------------
     # TESSERACT OCR confidence threshold ( values: [-1 - undefined;  0.0 : 100.0 % - confidence value)

diff --git a/dedoc/readers/pdf_reader/data_classes/tables/table_tree.py b/dedoc/readers/pdf_reader/data_classes/tables/table_tree.py
@@ -21,6 +21,10 @@ class TableTree(object):
     Table which has cells as sorted childs of tree.
     Table has type of tree and was obtained with help contour analysis.
     """
+    min_h_cell = 8
+    min_w_cell = 20
+    minimal_cell_cnt_line = 5
+    minimal_cell_avg_length_line = 10
 
     def __init__(self, *, config: dict) -> None:
         self.left = None
@@ -94,7 +98,7 @@ def __build_childs(self, cur: "TableTree", hierarchy: List, contours: List) -> "
             if h[3] == cur.id_contours:
                 bbox = cv2.boundingRect(contours[i])  # [x_begin, y_begin, width, height]
                 # Эвристика №1 на ячейку
-                if bbox[2] < self.config["min_w_cell"] or bbox[3] < self.config["min_h_cell"]:
+                if bbox[2] < self.min_w_cell or bbox[3] < self.min_h_cell:
                     if self.config.get("debug_mode", False):
                         self.logger.debug(f"Contour {i} isn't correct")
                     continue

diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py
@@ -82,8 +82,9 @@ def __concat_images(self, src_image: np.ndarray, tree_table_nodes: List["TableTr
         for tree_node in tree_table_nodes:
             x_coord = space
             cell_image = BBox.crop_image_by_box(src_image, tree_node.crop_text_box)
-            image_path = os.path.join(self.config.get("path_debug"), "debug_tables", "batches", "cell_croped.png")
-            cv2.imwrite(image_path, cell_image)
+            if self.config.get("debug_mode", False) and self.config.get("path_debug", False) and os.path.exists(self.config.get("path_debug")):
+                image_path = os.path.join(self.config.get("path_debug"), "debug_tables", "batches", "cell_croped.png")
+                cv2.imwrite(image_path, cell_image)
             cell_height, cell_width = cell_image.shape[0], cell_image.shape[1]
 
             stacked_image[y_prev:y_prev + cell_height, x_coord:x_coord + cell_width] = cell_image

diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py b/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py
@@ -52,7 +52,7 @@ def __init__(self, *, config: dict) -> None:
         self.binarizer = AdaptiveBinarizer()
         self.ocr = OCRLineExtractor(config=config)
         self.logger = config.get("logger", logging.getLogger())
-        if self.config.get("debug_mode") and not os.path.exists(self.config["path_debug"]):
+        if self.config.get("debug_mode", False) and not os.path.exists(self.config["path_debug"]):
             os.makedirs(self.config["path_debug"])
 
     def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool:
@@ -70,13 +70,13 @@ def _process_one_page(self,
                           path: str) -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[float]]:
         #  --- Step 1: correct orientation and detect column count ---
         rotated_image, is_one_column_document, angle = self._detect_column_count_and_orientation(image, parameters)
-        if self.config.get("debug_mode"):
+        if self.config.get("debug_mode", False):
             self.logger.info(f"Angle page rotation = {angle}")
 
         #  --- Step 2: do binarization ---
         if parameters.need_binarization:
             rotated_image, _ = self.binarizer.preprocess(rotated_image)
-            if self.config.get("debug_mode"):
+            if self.config.get("debug_mode", False):
                 cv2.imwrite(os.path.join(self.config["path_debug"], f"{datetime.now().strftime('%H-%M-%S')}_result_binarization.jpg"), rotated_image)
 
         #  --- Step 3: table detection and recognition ---
@@ -122,7 +122,7 @@ def _detect_column_count_and_orientation(self, image: np.ndarray, parameters: Pa
         rotated_image, result_angle = self.scew_corrector.preprocess(image, {"orientation_angle": angle})
         result_angle = result_angle["rotated_angle"]
 
-        if self.config.get("debug_mode"):
+        if self.config.get("debug_mode", False):
             img_path = os.path.join(self.config["path_debug"], f"{datetime.now().strftime('%H-%M-%S')}_result_orientation.jpg")
             self.logger.info(f"Save image to {img_path}")
             cv2.imwrite(img_path, rotated_image)

diff --git a/...e_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py b/...e_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py
@@ -20,6 +20,7 @@ class OnePageTableExtractor(BaseTableExtractor):
 
     def __init__(self, *, config: dict, logger: logging.Logger) -> None:
         super().__init__(config=config, logger=logger)
+
         self.image = None
         self.page_number = 0
         self.attribute_selector = TableAttributeExtractor(logger=self.logger)
@@ -77,8 +78,8 @@ def __detect_diff_orient(self, cell_text: str) -> bool:
         avg_len_part = np.average(len_parts)
 
         # Эвристика: считаем что ячейка повернута, если у нас большое количество строк и строки короткие
-        if len(parts) > self.config["minimal_cell_cnt_line"] \
-                and avg_len_part < self.config["minimal_cell_avg_length_line"]:
+        if len(parts) > TableTree.minimal_cell_cnt_line \
+                and avg_len_part < TableTree.minimal_cell_avg_length_line:
             return True
         return False
 

diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py
@@ -19,6 +19,7 @@
 
 
 class TableRecognizer(object):
+
     def __init__(self, *, config: dict = None) -> None:
 
         self.logger = config.get("logger", logging.getLogger(__name__))
@@ -27,11 +28,6 @@ def __init__(self, *, config: dict = None) -> None:
         self.multipage_tables_extractor = MultiPageTableExtractor(config=config, logger=self.logger)
         self.config = config
         self.table_type = TableTypeAdditionalOptions()
-        if config.get("debug", False):
-            if not os.path.exists(self.config["path_cells"]):
-                os.makedirs(self.config["path_cells"])
-            if not os.path.exists(self.config["path_detect"]):
-                os.makedirs(self.config["path_detect"])
 
     def convert_to_multipages_tables(self, all_single_tables: List[ScanTable], lines_with_meta: List[LineWithMeta]) -> List[ScanTable]:
 

diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/img_processing.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/img_processing.py
@@ -15,11 +15,13 @@
 logger = logger if logger else logging.getLogger("TableRecognizer.detect_tables_by_contours")
 table_options = TableTypeAdditionalOptions()
 
+ROTATE_THRESHOLD = 0.3
+
 
 def rotate_with_threshold(img: np.ndarray, angle: float, threshold: float = None, *, config: dict) -> np.ndarray:
     """rotates a table image and saving image.shape during rotation. It is important for word bounding box extraction"""
     if threshold is None:
-        threshold = config["rotate_threshold"]
+        threshold = ROTATE_THRESHOLD
     rotated = img
     if abs(angle) > threshold:
         if config.get("debug_mode", False):
@@ -62,7 +64,7 @@ def apply_houph_line(img: np.ndarray, threshold_gap: int = 10, *, config: dict)
     return cdst_p, angle
 
 
-def get_contours_cells(img: np.ndarray, table_type: str, *, config: dict) -> [Any, Any, np.ndarray, float]:
+def get_contours_cells(img: np.ndarray, table_type: str, *, config: dict, path_detect: str) -> [Any, Any, np.ndarray, float]:
     """
     function's steps:
     1) detects Houph lines for detecting rotate angle. Then input image has rotated on the rotate angle.
@@ -79,43 +81,47 @@ def get_contours_cells(img: np.ndarray, table_type: str, *, config: dict) -> [An
     img_bin = 255 - img_bin
 
     if config.get("debug_mode", False):
-        os.makedirs(config["path_cells"], exist_ok=True)
-        os.makedirs(config["path_detect"], exist_ok=True)
-        cv2.imwrite(os.path.join(config["path_detect"], "image_bin.jpg"), img_bin)
+        os.makedirs(path_detect, exist_ok=True)
+
+    if config.get("debug_mode", False):
+        cv2.imwrite(os.path.join(path_detect, "image_bin.jpg"), img_bin)
     # step 2
-    img_final_bin = __detect_horizontal_and_vertical_lines(img_bin, config, "tables")
+    img_final_bin = __detect_horizontal_and_vertical_lines(img_bin, config, "tables", path_detect)
     # step 3
     img_final_bin_houph, angle_alignment = __apply_houph_lines_and_detect_angle(img_final_bin, config)
 
     (thresh, img_final_bin_houph) = cv2.threshold(img_final_bin_houph, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
     if config.get("debug_mode", False):
-        cv2.imwrite(os.path.join(config["path_detect"], "img_final_bin.jpg"), img_final_bin)
+        cv2.imwrite(os.path.join(path_detect, "img_final_bin.jpg"), img_final_bin)
     if config.get("debug_mode", False):
-        cv2.imwrite(os.path.join(config["path_detect"], "img_final_bin_houph.jpg"), img_final_bin_houph)
+        cv2.imwrite(os.path.join(path_detect, "img_final_bin_houph.jpg"), img_final_bin_houph)
 
     # step 4 - rotating
     img_final_bin_houph = rotate_with_threshold(img_final_bin_houph, angle_alignment, config=config)
     img = rotate_with_threshold(img, angle_alignment, config=config)
     if config.get("debug_mode", False):
-        cv2.imwrite(os.path.join(config["path_detect"], "aligned_img.jpg"), img)
+        # TODO: debug paths should be configurable; for now they may not exist
+        cv2.imwrite(os.path.join(path_detect, "aligned_img.jpg"), img)
     img_final_bin_houph = __paint_bounds(img_final_bin_houph)
 
     # step 5  - detect contours
     contours, hierarchy = cv2.findContours(img_final_bin_houph, cv2.RETR_TREE, cv2.CHAIN_APPROX_TC89_KCOS)
 
     if config.get("debug_mode", False):
-        cv2.imwrite(os.path.join(config["path_detect"], "img_houph_and_morph_wo_bound.jpg"), img_final_bin_houph)
+        # TODO: debug paths should be configurable; for now they may not exist
+        cv2.imwrite(os.path.join(path_detect, "img_houph_and_morph_wo_bound.jpg"), img_final_bin_houph)
         img_w_contour = img.copy()
         cv2.drawContours(img_w_contour, contours, contourIdx=-1, color=(0, 0, 0), thickness=10, hierarchy=hierarchy, maxLevel=8)
-        cv2.imwrite(os.path.join(config["path_detect"], "img_with_contours.jpg"), img_w_contour)
+        cv2.imwrite(os.path.join(path_detect, "img_with_contours.jpg"), img_w_contour)
 
     # Draw external contours for tables without external contours. It is a rare case, but important for invoices
     if table_options.table_wo_external_bounds in table_type:
-        contours, hierarchy = __get_contours_for_table_wo_external_bounds(img, img_final_bin_houph.copy(), contours, hierarchy)
+        contours, hierarchy = __get_contours_for_table_wo_external_bounds(img, img_final_bin_houph.copy(), contours, hierarchy, path_detect)
     return contours, hierarchy, img, angle_alignment
 
 
-def __get_contours_for_table_wo_external_bounds(img: np.ndarray, img_with_contours: np.ndarray, contours: List, hierarchy: List) -> [Any, Any]:
+def __get_contours_for_table_wo_external_bounds(img: np.ndarray, img_with_contours: np.ndarray, contours: List, hierarchy: List,
+                                                path_detect: str) -> [Any, Any]:
     # get children (get table counters)
     contours = np.array(contours)
     list_contours, table_contours = __get_table_contours(contours, hierarchy)
@@ -138,7 +144,8 @@ def __get_contours_for_table_wo_external_bounds(img: np.ndarray, img_with_contou
         cv2.rectangle(img_with_contours, (x, y), (x + w, y + h), color=(0, 0, 0), thickness=5)
 
     if get_config().get("debug_mode", False):
-        cv2.imwrite(os.path.join(get_config()["path_detect"], "img_with_external_bounds.jpg"), img_with_contours)
+        # TODO: debug paths should be configurable; for now they may not exist
+        cv2.imwrite(os.path.join(path_detect, "img_with_external_bounds.jpg"), img_with_contours)
     contours, hierarchy = cv2.findContours(img_with_contours, cv2.RETR_TREE, cv2.CHAIN_APPROX_TC89_KCOS)
 
     return contours, hierarchy
@@ -172,7 +179,7 @@ def __apply_houph_lines_and_detect_angle(image: np.ndarray, config: dict) -> [np
         gap_avg = min(np.mean([c[2] for c in contours_table]) // 45, gap_avg)
     else:
         gap_avg = 5
-    if config["debug_mode"]:
+    if config.get("debug_mode", False):
         config.get("logger", logging.getLogger()).debug(f"Houph gap = {gap_avg}")
 
     # ----- image alignment -----
@@ -182,7 +189,7 @@ def __apply_houph_lines_and_detect_angle(image: np.ndarray, config: dict) -> [np
     return img_final_bin_houph, angle_alignment
 
 
-def __detect_horizontal_and_vertical_lines(img_bin: np.ndarray, config: dict, task: str) -> np.ndarray:
+def __detect_horizontal_and_vertical_lines(img_bin: np.ndarray, config: dict, task: str, path_detect: str) -> np.ndarray:
     # Defining a kernel length
 
     if task == "orientation":
@@ -191,8 +198,9 @@ def __detect_horizontal_and_vertical_lines(img_bin: np.ndarray, config: dict, ta
     elif task == "tables":
         length_div = 55
         height_div = 100
-    kernel_length_weight = max(np.array(img_bin).shape[1] // length_div, config["min_w_cell"])  # 35
-    kernel_length_height = max(np.array(img_bin).shape[0] // height_div, config["min_h_cell"])  # 100
+
+    kernel_length_weight = max(np.array(img_bin).shape[1] // length_div, TableTree.min_w_cell)  # 35
+    kernel_length_height = max(np.array(img_bin).shape[0] // height_div, TableTree.min_h_cell)  # 100
 
     # A verticle kernel of (1 X kernel_length), which will detect all the verticle lines from the image.
     verticle_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, kernel_length_height))
@@ -211,8 +219,9 @@ def __detect_horizontal_and_vertical_lines(img_bin: np.ndarray, config: dict, ta
     horizontal_lines_img = cv2.dilate(img_temp2, hori_kernel, iterations=iterations)
 
     if config.get("debug_mode", False):
-        cv2.imwrite(os.path.join(config["path_detect"], "verticle_lines.jpg"), verticle_lines_img)
-        cv2.imwrite(os.path.join(config["path_detect"], "horizontal_lines.jpg"), horizontal_lines_img)
+        # TODO: debug paths should be configurable; for now they may not exist
+        cv2.imwrite(os.path.join(path_detect, "verticle_lines.jpg"), verticle_lines_img)
+        cv2.imwrite(os.path.join(path_detect, "horizontal_lines.jpg"), horizontal_lines_img)
 
     """Now we will add these two images.
     This will have only boxes and the information written in the box will be erased.
@@ -228,7 +237,8 @@ def __detect_horizontal_and_vertical_lines(img_bin: np.ndarray, config: dict, ta
     img_bin_with_lines = cv2.erode(~img_bin_with_lines, kernel, iterations=2)
     (thresh, img_bin_with_lines) = cv2.threshold(img_bin_with_lines, 200, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
     if config.get("debug_mode", False):
-        cv2.imwrite(os.path.join(config["path_detect"], "img_bin_with_lines.jpg"), img_bin_with_lines)
+        # TODO: debug paths should be configurable; for now they may not exist
+        cv2.imwrite(os.path.join(path_detect, "img_bin_with_lines.jpg"), img_bin_with_lines)
 
     return img_bin_with_lines
 
@@ -259,14 +269,18 @@ def detect_tables_by_contours(img: np.ndarray,
     :param config: dict from config.py
     :return: TreeTable, contour, rotate angle
     """
-    contours, hierarchy, image, angle_rotate = get_contours_cells(img, table_type, config=config)
+    path_detect = config.get("path_detect", None)
+    if config.get("debug_mode", False) and path_detect is None:
+        path_detect = os.path.join(os.path.abspath(os.sep), "tmp", "dedoc", "debug_tables", "imgs", "detect_lines")
+
+    contours, hierarchy, image, angle_rotate = get_contours_cells(img, table_type, config=config, path_detect=path_detect)
     tree_table = TableTree.parse_contours_to_tree(contours=contours, hierarchy=hierarchy, config=config)
 
     if config.get("debug_mode", False):
         config.get("logger", logging.getLogger()).debug(f"Hierarchy [Next, Previous, First_Child, Parent]:\n {hierarchy}")
         tree_table.print_tree(depth=0)
-    if config.get("debug_mode", False):
-        cv2.imwrite(os.path.join(config["path_detect"], "img_draw_counters.jpg"), img)
+
+        cv2.imwrite(os.path.join(path_detect, "img_draw_counters.jpg"), img)
 
     tree_table.set_text_into_tree(tree=tree_table, src_image=image, language=language, config=config)
 

diff --git a/tests/unit_tests/test_format_pdf_reader.py b/tests/unit_tests/test_format_pdf_reader.py
@@ -140,3 +140,10 @@ def test_pdf_text_layer(self) -> None:
             annotations = line.annotations
             annotations_set = {(a.name, a.value, a.start, a.end) for a in annotations}
             self.assertEqual(len(annotations_set), len(annotations))
+
+    def test_table_extractor(self) -> None:
+        config = {}  # Has to work without config
+        any_doc_reader = PdfTxtlayerReader(config=config)
+        path = os.path.join(os.path.dirname(__file__), "../data/pdf_with_text_layer/english_doc.pdf")
+        result = any_doc_reader.read(path, document_type=None, parameters={"need_pdf_table_analysis": "True"})
+        self.assertEqual(len(result.tables), 1)