diff --git a/dedoc/config.py b/dedoc/config.py
index 1e7cacc0..10711e25 100644
--- a/dedoc/config.py
+++ b/dedoc/config.py
@@ -22,7 +22,7 @@
     # number of parallel jobs in some tasks as OCR
     n_jobs=1,
 
-    # --------------------------------------------GPU SETTINGS-------------------------------------------------------
+    # --------------------------------------------GPU SETTINGS----------------------------------------------------------
     # set gpu in XGBoost and torch models
    on_gpu=False,
 
@@ -36,19 +36,9 @@
     logger=logging.getLogger(),
     import_path_init_api_args="dedoc.api.api_args",
 
-    # ----------------------------------------TABLE RECOGNIZER SETTINGS-------------------------------------------------
-    min_h_cell=8,
-    min_w_cell=20,
-    type_top_attr=1,
-    type_left_top_attr=2,
-    type_left_attr=3,
-    max_vertical_extended=20,
-    minimal_cell_cnt_line=5,
-    minimal_cell_avg_length_line=10,
-
-    path_cells=os.path.join(os.path.abspath(os.sep), "tmp", "dedoc", "debug_tables", "imgs", "cells"),
+    # ----------------------------------------TABLE RECOGNIZER DEBUG SETTINGS-------------------------------------------
+    # path to save debug images for the table recognizer
     path_detect=os.path.join(os.path.abspath(os.sep), "tmp", "dedoc", "debug_tables", "imgs", "detect_lines"),
-    rotate_threshold=0.3,
 
     # -------------------------------------------RECOGNIZE SETTINGS-----------------------------------------------------
     # TESSERACT OCR confidence threshold ( values: [-1 - undefined; 0.0 : 100.0 % - confidence value)
diff --git a/dedoc/readers/pdf_reader/data_classes/tables/table_tree.py b/dedoc/readers/pdf_reader/data_classes/tables/table_tree.py
index 2bf9e9a5..5516bd71 100644
--- a/dedoc/readers/pdf_reader/data_classes/tables/table_tree.py
+++ b/dedoc/readers/pdf_reader/data_classes/tables/table_tree.py
@@ -21,6 +21,10 @@ class TableTree(object):
     Table which has cells as sorted childs of tree.
     Table has type of tree and was obtained with help contour analysis.
     """
+    min_h_cell = 8
+    min_w_cell = 20
+    minimal_cell_cnt_line = 5
+    minimal_cell_avg_length_line = 10
 
     def __init__(self, *, config: dict) -> None:
         self.left = None
@@ -94,7 +98,7 @@ def __build_childs(self, cur: "TableTree", hierarchy: List, contours: List) -> "
             if h[3] == cur.id_contours:
                 bbox = cv2.boundingRect(contours[i])  # [x_begin, y_begin, width, height]
                 # Heuristic #1 for a cell
-                if bbox[2] < self.config["min_w_cell"] or bbox[3] < self.config["min_h_cell"]:
+                if bbox[2] < self.min_w_cell or bbox[3] < self.min_h_cell:
                     if self.config.get("debug_mode", False):
                         self.logger.debug(f"Contour {i} isn't correct")
                     continue
diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py
index 04fdd2d1..c9ef35a8 100644
--- a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py
+++ b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py
@@ -11,6 +11,7 @@
 from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_page.ocr_page import OcrPage
 from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_utils import get_text_with_bbox_from_cells
 from dedoc.utils.image_utils import get_highest_pixel_frequency
+from dedoc.utils.parameter_utils import get_path_param
 
 
 class OCRCellExtractor:
@@ -30,7 +31,7 @@ def get_cells_text(self, page_image: np.ndarray, tree_nodes: List["TableTree"],
 
         for num_batch, nodes_batch in enumerate(batches):
             if self.config.get("debug_mode", False):
-                tmp_dir = os.path.join(self.config.get("path_debug"), "debug_tables/batches/")
+                tmp_dir = os.path.join(get_path_param(self.config, "path_debug"), "debug_tables/batches/")
                 os.makedirs(tmp_dir, exist_ok=True)
                 for i, table_tree_node in enumerate(nodes_batch):
                     cv2.imwrite(os.path.join(tmp_dir, f"image_{num_batch}_{i}.png"), BBox.crop_image_by_box(page_image, table_tree_node.cell_box))
@@ -64,7 +65,9 @@ def get_cells_text(self, page_image: np.ndarray, tree_nodes: List["TableTree"],
     def __handle_one_batch(self, src_image: np.ndarray, tree_table_nodes: List["TableTree"], num_batch: int, language: str = "rus") -> Tuple[OcrPage, List[BBox]]:  # noqa
         concatenated, chunk_boxes = self.__concat_images(src_image=src_image, tree_table_nodes=tree_table_nodes)
         if self.config.get("debug_mode", False):
-            image_path = os.path.join(self.config.get("path_debug"), "debug_tables", "batches", f"stacked_batch_image_{num_batch}.png")
+            debug_dir = os.path.join(get_path_param(self.config, "path_debug"), "debug_tables", "batches")
+            os.makedirs(debug_dir, exist_ok=True)
+            image_path = os.path.join(debug_dir, f"stacked_batch_image_{num_batch}.png")
             cv2.imwrite(image_path, concatenated)
 
         ocr_result = get_text_with_bbox_from_cells(concatenated, language, ocr_conf_threshold=0.0)
@@ -82,8 +85,11 @@ def __concat_images(self, src_image: np.ndarray, tree_table_nodes: List["TableTr
         for tree_node in tree_table_nodes:
             x_coord = space
             cell_image = BBox.crop_image_by_box(src_image, tree_node.crop_text_box)
-            image_path = os.path.join(self.config.get("path_debug"), "debug_tables", "batches", "cell_croped.png")
-            cv2.imwrite(image_path, cell_image)
+            if self.config.get("debug_mode", False):
+                debug_dir = os.path.join(get_path_param(self.config, "path_debug"), "debug_tables", "batches")
+                os.makedirs(debug_dir, exist_ok=True)
+                image_path = os.path.join(debug_dir, "cell_croped.png")
+                cv2.imwrite(image_path, cell_image)
             cell_height, cell_width = cell_image.shape[0], cell_image.shape[1]
             stacked_image[y_prev:y_prev + cell_height, x_coord:x_coord + cell_width] = cell_image
diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py b/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py
index 72498e70..ad5f1335 100644
--- a/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py
+++ b/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py
@@ -17,6 +17,7 @@
 from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_line_extractor import OCRLineExtractor
 from dedoc.train_dataset.train_dataset_utils import save_page_with_bbox
 from dedoc.utils import supported_image_types
+from dedoc.utils.parameter_utils import get_path_param
 
 
 class PdfImageReader(PdfBaseReader):
@@ -52,8 +53,6 @@ def __init__(self, *, config: dict) -> None:
         self.binarizer = AdaptiveBinarizer()
         self.ocr = OCRLineExtractor(config=config)
         self.logger = config.get("logger", logging.getLogger())
-        if self.config.get("debug_mode") and not os.path.exists(self.config["path_debug"]):
-            os.makedirs(self.config["path_debug"])
 
     def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool:
         """
@@ -70,14 +69,15 @@ def _process_one_page(self, path: str) -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[float]]:
         # --- Step 1: correct orientation and detect column count ---
         rotated_image, is_one_column_document, angle = self._detect_column_count_and_orientation(image, parameters)
-        if self.config.get("debug_mode"):
+        if self.config.get("debug_mode", False):
             self.logger.info(f"Angle page rotation = {angle}")
 
         # --- Step 2: do binarization ---
         if parameters.need_binarization:
             rotated_image, _ = self.binarizer.preprocess(rotated_image)
-            if self.config.get("debug_mode"):
-                cv2.imwrite(os.path.join(self.config["path_debug"], f"{datetime.now().strftime('%H-%M-%S')}_result_binarization.jpg"), rotated_image)
+            if self.config.get("debug_mode", False):
+                debug_dir = get_path_param(self.config, "path_debug")
+                cv2.imwrite(os.path.join(debug_dir, f"{datetime.now().strftime('%H-%M-%S')}_result_binarization.jpg"), rotated_image)
 
         # --- Step 3: table detection and recognition ---
         if parameters.need_pdf_table_analysis:
@@ -122,8 +122,9 @@ def _detect_column_count_and_orientation(self, image: np.ndarray, parameters: Pa
         rotated_image, result_angle = self.scew_corrector.preprocess(image, {"orientation_angle": angle})
         result_angle = result_angle["rotated_angle"]
 
-        if self.config.get("debug_mode"):
-            img_path = os.path.join(self.config["path_debug"], f"{datetime.now().strftime('%H-%M-%S')}_result_orientation.jpg")
+        if self.config.get("debug_mode", False):
+            debug_dir = get_path_param(self.config, "path_debug")
+            img_path = os.path.join(debug_dir, f"{datetime.now().strftime('%H-%M-%S')}_result_orientation.jpg")
             self.logger.info(f"Save image to {img_path}")
             cv2.imwrite(img_path, rotated_image)
 
diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py
index de623863..c946cccf 100644
--- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py
+++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py
@@ -20,6 +20,7 @@ class OnePageTableExtractor(BaseTableExtractor):
 
     def __init__(self, *, config: dict, logger: logging.Logger) -> None:
         super().__init__(config=config, logger=logger)
+        self.image = None
         self.page_number = 0
         self.attribute_selector = TableAttributeExtractor(logger=self.logger)
 
@@ -77,8 +78,8 @@ def __detect_diff_orient(self, cell_text: str) -> bool:
         avg_len_part = np.average(len_parts)
 
         # Heuristic: we consider the cell rotated if it has a large number of lines and the lines are short
-        if len(parts) > self.config["minimal_cell_cnt_line"] \
-                and avg_len_part < self.config["minimal_cell_avg_length_line"]:
+        if len(parts) > TableTree.minimal_cell_cnt_line \
+                and avg_len_part < TableTree.minimal_cell_avg_length_line:
             return True
         return False
diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py
index e07a3171..4a61530a 100644
--- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py
+++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py
@@ -19,6 +19,7 @@
 
 class TableRecognizer(object):
+
     def __init__(self, *, config: dict = None) -> None:
 
         self.logger = config.get("logger", logging.getLogger(__name__))
@@ -27,11 +28,6 @@ def __init__(self, *, config: dict = None) -> None:
         self.multipage_tables_extractor = MultiPageTableExtractor(config=config, logger=self.logger)
         self.config = config
         self.table_type = TableTypeAdditionalOptions()
-        if config.get("debug", False):
-            if not os.path.exists(self.config["path_cells"]):
-                os.makedirs(self.config["path_cells"])
-            if not os.path.exists(self.config["path_detect"]):
-                os.makedirs(self.config["path_detect"])
 
     def convert_to_multipages_tables(self, all_single_tables: List[ScanTable], lines_with_meta: List[LineWithMeta]) -> List[ScanTable]:
diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/img_processing.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/img_processing.py
index b24c1a53..4aad36fa 100644
--- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/img_processing.py
+++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/img_processing.py
@@ -10,16 +10,19 @@
 from dedoc.config import get_config
 from dedoc.readers.pdf_reader.data_classes.tables.table_tree import TableTree
 from dedoc.readers.pdf_reader.data_classes.tables.table_type import TableTypeAdditionalOptions
+from dedoc.utils.parameter_utils import get_path_param
 
 logger = get_config().get("logger", logging.getLogger())
 logger = logger if logger else logging.getLogger("TableRecognizer.detect_tables_by_contours")
 table_options = TableTypeAdditionalOptions()
 
+ROTATE_THRESHOLD = 0.3
+
 
 def rotate_with_threshold(img: np.ndarray, angle: float, threshold: float = None, *, config: dict) -> np.ndarray:
     """rotates a table image and saving image.shape during rotation. It is important for word bounding box extraction"""
     if threshold is None:
-        threshold = config["rotate_threshold"]
+        threshold = ROTATE_THRESHOLD
     rotated = img
     if abs(angle) > threshold:
         if config.get("debug_mode", False):
@@ -79,9 +82,7 @@ def get_contours_cells(img: np.ndarray, table_type: str, *, config: dict) -> [An
     img_bin = 255 - img_bin
 
     if config.get("debug_mode", False):
-        os.makedirs(config["path_cells"], exist_ok=True)
-        os.makedirs(config["path_detect"], exist_ok=True)
-        cv2.imwrite(os.path.join(config["path_detect"], "image_bin.jpg"), img_bin)
+        cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "image_bin.jpg"), img_bin)
     # step 2
     img_final_bin = __detect_horizontal_and_vertical_lines(img_bin, config, "tables")
     # step 3
@@ -89,33 +90,33 @@ def get_contours_cells(img: np.ndarray, table_type: str, *, config: dict) -> [An
     (thresh, img_final_bin_houph) = cv2.threshold(img_final_bin_houph, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
 
     if config.get("debug_mode", False):
-        cv2.imwrite(os.path.join(config["path_detect"], "img_final_bin.jpg"), img_final_bin)
+        cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "img_final_bin.jpg"), img_final_bin)
 
     if config.get("debug_mode", False):
-        cv2.imwrite(os.path.join(config["path_detect"], "img_final_bin_houph.jpg"), img_final_bin_houph)
+        cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "img_final_bin_houph.jpg"), img_final_bin_houph)
 
     # step 4 - rotating
     img_final_bin_houph = rotate_with_threshold(img_final_bin_houph, angle_alignment, config=config)
     img = rotate_with_threshold(img, angle_alignment, config=config)
     if config.get("debug_mode", False):
-        cv2.imwrite(os.path.join(config["path_detect"], "aligned_img.jpg"), img)
+        cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "aligned_img.jpg"), img)
     img_final_bin_houph = __paint_bounds(img_final_bin_houph)
 
     # step 5 - detect contours
     contours, hierarchy = cv2.findContours(img_final_bin_houph, cv2.RETR_TREE, cv2.CHAIN_APPROX_TC89_KCOS)
     if config.get("debug_mode", False):
-        cv2.imwrite(os.path.join(config["path_detect"], "img_houph_and_morph_wo_bound.jpg"), img_final_bin_houph)
+        cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "img_houph_and_morph_wo_bound.jpg"), img_final_bin_houph)
         img_w_contour = img.copy()
         cv2.drawContours(img_w_contour, contours, contourIdx=-1, color=(0, 0, 0), thickness=10, hierarchy=hierarchy, maxLevel=8)
-        cv2.imwrite(os.path.join(config["path_detect"], "img_with_contours.jpg"), img_w_contour)
+        cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "img_with_contours.jpg"), img_w_contour)
 
     # Draw external contours for tables without external contours. It is a rare case, but important for invoices
     if table_options.table_wo_external_bounds in table_type:
-        contours, hierarchy = __get_contours_for_table_wo_external_bounds(img, img_final_bin_houph.copy(), contours, hierarchy)
+        contours, hierarchy = __get_contours_for_table_wo_external_bounds(img, img_final_bin_houph.copy(), contours, hierarchy, config)
     return contours, hierarchy, img, angle_alignment
 
 
-def __get_contours_for_table_wo_external_bounds(img: np.ndarray, img_with_contours: np.ndarray, contours: List, hierarchy: List) -> [Any, Any]:
+def __get_contours_for_table_wo_external_bounds(img: np.ndarray, img_with_contours: np.ndarray, contours: List, hierarchy: List, config: dict) -> [Any, Any]:
     # get children (get table counters)
     contours = np.array(contours)
     list_contours, table_contours = __get_table_contours(contours, hierarchy)
@@ -137,8 +138,8 @@ def __get_contours_for_table_wo_external_bounds(img: np.ndarray, img_with_contou
         x, y, w, h = cv2.boundingRect(c)
         cv2.rectangle(img_with_contours, (x, y), (x + w, y + h), color=(0, 0, 0), thickness=5)
 
-    if get_config().get("debug_mode", False):
-        cv2.imwrite(os.path.join(get_config()["path_detect"], "img_with_external_bounds.jpg"), img_with_contours)
+    if config.get("debug_mode", False):
+        cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "img_with_external_bounds.jpg"), img_with_contours)
     contours, hierarchy = cv2.findContours(img_with_contours, cv2.RETR_TREE, cv2.CHAIN_APPROX_TC89_KCOS)
 
     return contours, hierarchy
@@ -172,7 +173,7 @@ def __apply_houph_lines_and_detect_angle(image: np.ndarray, config: dict) -> [np
         gap_avg = min(np.mean([c[2] for c in contours_table]) // 45, gap_avg)
     else:
         gap_avg = 5
-    if config["debug_mode"]:
+    if config.get("debug_mode", False):
         config.get("logger", logging.getLogger()).debug(f"Houph gap = {gap_avg}")
 
     # ----- image alignment -----
@@ -191,8 +192,9 @@ def __detect_horizontal_and_vertical_lines(img_bin: np.ndarray, config: dict, ta
     elif task == "tables":
         length_div = 55
         height_div = 100
-    kernel_length_weight = max(np.array(img_bin).shape[1] // length_div, config["min_w_cell"])  # 35
-    kernel_length_height = max(np.array(img_bin).shape[0] // height_div, config["min_h_cell"])  # 100
+
+    kernel_length_weight = max(np.array(img_bin).shape[1] // length_div, TableTree.min_w_cell)  # 35
+    kernel_length_height = max(np.array(img_bin).shape[0] // height_div, TableTree.min_h_cell)  # 100
 
     # A verticle kernel of (1 X kernel_length), which will detect all the verticle lines from the image.
     verticle_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, kernel_length_height))
@@ -211,8 +213,8 @@ def __detect_horizontal_and_vertical_lines(img_bin: np.ndarray, config: dict, ta
     horizontal_lines_img = cv2.dilate(img_temp2, hori_kernel, iterations=iterations)
 
     if config.get("debug_mode", False):
-        cv2.imwrite(os.path.join(config["path_detect"], "verticle_lines.jpg"), verticle_lines_img)
-        cv2.imwrite(os.path.join(config["path_detect"], "horizontal_lines.jpg"), horizontal_lines_img)
+        cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "verticle_lines.jpg"), verticle_lines_img)
+        cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "horizontal_lines.jpg"), horizontal_lines_img)
 
     """Now we will add these two images.
     This will have only boxes and the information written in the box will be erased.
@@ -228,7 +230,7 @@ def __detect_horizontal_and_vertical_lines(img_bin: np.ndarray, config: dict, ta
     img_bin_with_lines = cv2.erode(~img_bin_with_lines, kernel, iterations=2)
     (thresh, img_bin_with_lines) = cv2.threshold(img_bin_with_lines, 200, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
     if config.get("debug_mode", False):
-        cv2.imwrite(os.path.join(config["path_detect"], "img_bin_with_lines.jpg"), img_bin_with_lines)
+        cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "img_bin_with_lines.jpg"), img_bin_with_lines)
 
     return img_bin_with_lines
 
@@ -265,8 +267,8 @@ def detect_tables_by_contours(img: np.ndarray,
 
     if config.get("debug_mode", False):
         config.get("logger", logging.getLogger()).debug(f"Hierarchy [Next, Previous, First_Child, Parent]:\n {hierarchy}")
         tree_table.print_tree(depth=0)
-    if config.get("debug_mode", False):
-        cv2.imwrite(os.path.join(config["path_detect"], "img_draw_counters.jpg"), img)
+
+        cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "img_draw_counters.jpg"), img)
 
     tree_table.set_text_into_tree(tree=tree_table, src_image=image, language=language, config=config)
diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py
index c9cda801..41196f00 100644
--- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py
+++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py
@@ -28,6 +28,7 @@
 from dedoc.readers.pdf_reader.data_classes.tables.location import Location
 from dedoc.readers.pdf_reader.data_classes.text_with_bbox import TextWithBBox
 from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdfminer_reader.pdfminer_utils import cleaning_text_from_hieroglyphics, create_bbox, draw_annotation
+from dedoc.utils.parameter_utils import get_path_param
 from dedoc.utils.pdf_utils import get_page_image
 
 logging.getLogger("pdfminer").setLevel(logging.ERROR)
@@ -251,7 +252,7 @@ def __debug_extract_layout(self, image_src: np.ndarray, layout: LTContainer, pag
         :param layout: container of layout element
         :return: None
         """
-        tmp_dir = os.path.join(self.config.get("path_debug"), "pdfminer")
+        tmp_dir = os.path.join(get_path_param(self.config, "path_debug"), "pdfminer")
         os.makedirs(tmp_dir, exist_ok=True)
 
         file_text = open(os.path.join(tmp_dir, f"text_{page_num}.txt"), "wt")
@@ -268,10 +269,10 @@ def __debug_extract_layout(self, image_src: np.ndarray, layout: LTContainer, pag
 
         for lobj in lobjs:
             if isinstance(lobj, LTTextBoxHorizontal):
-                annotations.extend(self.__extract_words_bbox_annotation(lobj, k_w, k_h, height, width))
+                annotations.extend(self.__extract_words_bbox_annotation(lobj, height, width))
                 lobjs_textline.extend(lobj)
             elif isinstance(lobj, LTTextLineHorizontal):
-                annotations.extend(self.__extract_words_bbox_annotation(lobj, k_w, k_h, height, width))
+                annotations.extend(self.__extract_words_bbox_annotation(lobj, height, width))
                 lobjs_textline.append(lobj)
             elif isinstance(lobj, LTRect):
                 lobjs_box.append(lobj)
diff --git a/dedoc/utils/parameter_utils.py b/dedoc/utils/parameter_utils.py
index 66b00218..126e1d6a 100644
--- a/dedoc/utils/parameter_utils.py
+++ b/dedoc/utils/parameter_utils.py
@@ -1,7 +1,10 @@
+import os
 import subprocess
 from logging import Logger
 from typing import Any, Dict, Optional, Tuple
 
+from dedoc.config import RESOURCES_PATH, get_config
+
 
 def get_param_language(parameters: Optional[dict]) -> str:
     if parameters is None:
@@ -136,6 +139,7 @@ def get_param_gpu_available(parameters: Optional[dict], logger: Logger) -> bool:
     Returns:
         bool: True if GPU is available, False otherwise.
     """
+    parameters = {} if parameters is None else parameters
     if not parameters.get("on_gpu", False):
         return False
 
@@ -148,3 +152,15 @@ get_param_gpu_available(parameters: Optional[dict], logger: Logger) -> bool:
         return False
 
     return True
+
+
+def get_path_param(parameters: Optional[dict], path_key: str) -> str:
+    parameters = {} if parameters is None else parameters
+    path_value = parameters.get(path_key)
+
+    if path_value is None:
+        default_config = get_config()
+        path_value = default_config.get(path_key, RESOURCES_PATH)
+
+    os.makedirs(path_value, exist_ok=True)
+    return path_value
diff --git a/tests/unit_tests/test_format_pdf_reader.py b/tests/unit_tests/test_format_pdf_reader.py
index 1226cd01..eb6af291 100644
--- a/tests/unit_tests/test_format_pdf_reader.py
+++ b/tests/unit_tests/test_format_pdf_reader.py
@@ -140,3 +140,10 @@ def test_pdf_text_layer(self) -> None:
             annotations = line.annotations
             annotations_set = {(a.name, a.value, a.start, a.end) for a in annotations}
             self.assertEqual(len(annotations_set), len(annotations))
+
+    def test_table_extractor(self) -> None:
+        config = {}  # Has to work without config
+        any_doc_reader = PdfTxtlayerReader(config=config)
+        path = os.path.join(os.path.dirname(__file__), "../data/pdf_with_text_layer/english_doc.pdf")
+        result = any_doc_reader.read(path, document_type=None, parameters={"need_pdf_table_analysis": "True"})
+        self.assertEqual(len(result.tables), 1)
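
The sketch below is an illustration only, not part of the patch above. It shows how the new get_path_param helper is expected to behave once the change is applied, assuming dedoc is installed; treating "path_debug" as a key of the default config is an assumption here (for a missing key the helper falls back to RESOURCES_PATH).

    # Hypothetical usage sketch, not part of the diff above.
    from dedoc.config import get_config
    from dedoc.utils.parameter_utils import get_path_param

    # With an explicit config dict, the configured path is returned and the
    # directory is created on demand, so callers no longer need os.makedirs.
    debug_dir = get_path_param(get_config(), "path_debug")

    # With an empty config (the case exercised by the new test_table_extractor),
    # the helper falls back to the default config and ultimately to RESOURCES_PATH.
    fallback_dir = get_path_param({}, "path_detect")

    print(debug_dir, fallback_dir)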