diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py index 8c45b632..cf1417c0 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py @@ -82,7 +82,7 @@ def __concat_images(self, src_image: np.ndarray, tree_table_nodes: List["TableTr for tree_node in tree_table_nodes: x_coord = space cell_image = BBox.crop_image_by_box(src_image, tree_node.crop_text_box) - if self.config.get("debug_mode", False) and self.config.get("path_debug", False) and os.path.exists(self.config.get("path_debug")): + if self.config.get("debug_mode", False) and self.config.get("path_debug") is not None and os.path.exists(self.config.get("path_debug")): image_path = os.path.join(self.config.get("path_debug"), "debug_tables", "batches", "cell_croped.png") cv2.imwrite(image_path, cell_image) cell_height, cell_width = cell_image.shape[0], cell_image.shape[1] diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/img_processing.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/img_processing.py index 9eeba60c..b545986f 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/img_processing.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/img_processing.py @@ -1,3 +1,4 @@ +import copy import logging import math import os @@ -64,7 +65,7 @@ def apply_houph_line(img: np.ndarray, threshold_gap: int = 10, *, config: dict) return cdst_p, angle -def get_contours_cells(img: np.ndarray, table_type: str, *, config: dict, path_detect: str) -> [Any, Any, np.ndarray, float]: +def get_contours_cells(img: np.ndarray, table_type: str, *, config: dict) -> [Any, Any, np.ndarray, float]: """ function's steps: 1) detects Houph lines for detecting rotate angle. Then input image has rotated on the rotate angle. @@ -81,27 +82,24 @@ def get_contours_cells(img: np.ndarray, table_type: str, *, config: dict, path_d img_bin = 255 - img_bin if config.get("debug_mode", False): - os.makedirs(path_detect, exist_ok=True) - - if config.get("debug_mode", False): - cv2.imwrite(os.path.join(path_detect, "image_bin.jpg"), img_bin) + cv2.imwrite(os.path.join(config.get("path_detect"), "image_bin.jpg"), img_bin) # step 2 - img_final_bin = __detect_horizontal_and_vertical_lines(img_bin, config, "tables", path_detect) + img_final_bin = __detect_horizontal_and_vertical_lines(img_bin, config, "tables", config.get("path_detect")) # step 3 img_final_bin_houph, angle_alignment = __apply_houph_lines_and_detect_angle(img_final_bin, config) (thresh, img_final_bin_houph) = cv2.threshold(img_final_bin_houph, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU) if config.get("debug_mode", False): - cv2.imwrite(os.path.join(path_detect, "img_final_bin.jpg"), img_final_bin) + cv2.imwrite(os.path.join(config.get("path_detect"), "img_final_bin.jpg"), img_final_bin) if config.get("debug_mode", False): - cv2.imwrite(os.path.join(path_detect, "img_final_bin_houph.jpg"), img_final_bin_houph) + cv2.imwrite(os.path.join(config.get("path_detect"), "img_final_bin_houph.jpg"), img_final_bin_houph) # step 4 - rotating img_final_bin_houph = rotate_with_threshold(img_final_bin_houph, angle_alignment, config=config) img = rotate_with_threshold(img, angle_alignment, config=config) if config.get("debug_mode", False): # TODO: paths should be configurable but now could not exist - cv2.imwrite(os.path.join(path_detect, "aligned_img.jpg"), img) + cv2.imwrite(os.path.join(config.get("path_detect"), "aligned_img.jpg"), img) img_final_bin_houph = __paint_bounds(img_final_bin_houph) # step 5 - detect contours @@ -109,19 +107,18 @@ def get_contours_cells(img: np.ndarray, table_type: str, *, config: dict, path_d if config.get("debug_mode", False): # TODO: paths should be configurable but now could not exist - cv2.imwrite(os.path.join(path_detect, "img_houph_and_morph_wo_bound.jpg"), img_final_bin_houph) + cv2.imwrite(os.path.join(config.get("path_detect"), "img_houph_and_morph_wo_bound.jpg"), img_final_bin_houph) img_w_contour = img.copy() cv2.drawContours(img_w_contour, contours, contourIdx=-1, color=(0, 0, 0), thickness=10, hierarchy=hierarchy, maxLevel=8) - cv2.imwrite(os.path.join(path_detect, "img_with_contours.jpg"), img_w_contour) + cv2.imwrite(os.path.join(config.get("path_detect"), "img_with_contours.jpg"), img_w_contour) # Draw external contours for tables without external contours. It is a rare case, but important for invoices if table_options.table_wo_external_bounds in table_type: - contours, hierarchy = __get_contours_for_table_wo_external_bounds(img, img_final_bin_houph.copy(), contours, hierarchy, path_detect) + contours, hierarchy = __get_contours_for_table_wo_external_bounds(img, img_final_bin_houph.copy(), contours, hierarchy, config) return contours, hierarchy, img, angle_alignment -def __get_contours_for_table_wo_external_bounds(img: np.ndarray, img_with_contours: np.ndarray, contours: List, hierarchy: List, - path_detect: str) -> [Any, Any]: +def __get_contours_for_table_wo_external_bounds(img: np.ndarray, img_with_contours: np.ndarray, contours: List, hierarchy: List, config: dict) -> [Any, Any]: # get children (get table counters) contours = np.array(contours) list_contours, table_contours = __get_table_contours(contours, hierarchy) @@ -143,9 +140,9 @@ def __get_contours_for_table_wo_external_bounds(img: np.ndarray, img_with_contou x, y, w, h = cv2.boundingRect(c) cv2.rectangle(img_with_contours, (x, y), (x + w, y + h), color=(0, 0, 0), thickness=5) - if get_config().get("debug_mode", False): + if config.get("debug_mode", False): # TODO: paths should be configurable but now could not exist - cv2.imwrite(os.path.join(path_detect, "img_with_external_bounds.jpg"), img_with_contours) + cv2.imwrite(os.path.join(config.get("path_detect"), "img_with_external_bounds.jpg"), img_with_contours) contours, hierarchy = cv2.findContours(img_with_contours, cv2.RETR_TREE, cv2.CHAIN_APPROX_TC89_KCOS) return contours, hierarchy @@ -189,7 +186,7 @@ def __apply_houph_lines_and_detect_angle(image: np.ndarray, config: dict) -> [np return img_final_bin_houph, angle_alignment -def __detect_horizontal_and_vertical_lines(img_bin: np.ndarray, config: dict, task: str, path_detect: str) -> np.ndarray: +def __detect_horizontal_and_vertical_lines(img_bin: np.ndarray, config: dict, task: str) -> np.ndarray: # Defining a kernel length if task == "orientation": @@ -220,8 +217,8 @@ def __detect_horizontal_and_vertical_lines(img_bin: np.ndarray, config: dict, ta if config.get("debug_mode", False): # TODO: paths should be configurable but now could not exist - cv2.imwrite(os.path.join(path_detect, "verticle_lines.jpg"), verticle_lines_img) - cv2.imwrite(os.path.join(path_detect, "horizontal_lines.jpg"), horizontal_lines_img) + cv2.imwrite(os.path.join(config.get("path_detect"), "verticle_lines.jpg"), verticle_lines_img) + cv2.imwrite(os.path.join(config.get("path_detect"), "horizontal_lines.jpg"), horizontal_lines_img) """Now we will add these two images. This will have only boxes and the information written in the box will be erased. @@ -238,7 +235,7 @@ def __detect_horizontal_and_vertical_lines(img_bin: np.ndarray, config: dict, ta (thresh, img_bin_with_lines) = cv2.threshold(img_bin_with_lines, 200, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU) if config.get("debug_mode", False): # TODO: paths should be configurable but now could not exist - cv2.imwrite(os.path.join(path_detect, "img_bin_with_lines.jpg"), img_bin_with_lines) + cv2.imwrite(os.path.join(config.get("path_detect"), "img_bin_with_lines.jpg"), img_bin_with_lines) return img_bin_with_lines @@ -269,18 +266,22 @@ def detect_tables_by_contours(img: np.ndarray, :param config: dict from config.py :return: TreeTable, contour, rotate angle """ - path_detect = config.get("path_detect", None) + path_detect = config.get("path_detect") if config.get("debug_mode", False) and path_detect is None: - path_detect = os.path.join(os.path.abspath(os.sep), "tmp", "dedoc", "debug_tables", "imgs", "detect_lines") + path_detect = get_config()["path_detect"] + os.makedirs(path_detect, exist_ok=True) + + config = copy.deepcopy(config) + config["path_detect"] = path_detect - contours, hierarchy, image, angle_rotate = get_contours_cells(img, table_type, config=config, path_detect=path_detect) + contours, hierarchy, image, angle_rotate = get_contours_cells(img, table_type, config=config) tree_table = TableTree.parse_contours_to_tree(contours=contours, hierarchy=hierarchy, config=config) if config.get("debug_mode", False): config.get("logger", logging.getLogger()).debug(f"Hierarchy [Next, Previous, First_Child, Parent]:\n {hierarchy}") tree_table.print_tree(depth=0) - cv2.imwrite(os.path.join(path_detect, "img_draw_counters.jpg"), img) + cv2.imwrite(os.path.join(config.get("path_detect"), "img_draw_counters.jpg"), img) tree_table.set_text_into_tree(tree=tree_table, src_image=image, language=language, config=config)