diff --git a/README.md b/README.md index 74c523d3..52eb8166 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,12 @@ It extracts a document’s logical structure and content, its tables, text forma The document’s content is represented as a tree storing headings and lists of any level. Dedoc can be integrated in a document contents and structure analysis system as a separate module. +## Workflow + +![Workflow](https://github.com/ispras/dedoc/raw/master/docs/source/_static/workflow.png) + +The workflow is described [`here`](https://dedoc.readthedocs.io/en/latest/?badge=latest#workflow). + ## Features and advantages Dedoc is implemented in Python and works with semi-structured data formats (DOC/DOCX, ODT, XLS/XLSX, CSV, TXT, JSON) and none-structured data formats like images (PNG, JPG etc.), archives (ZIP, RAR etc.), PDF and HTML formats. Document structure extraction is fully automatic regardless of input data type. @@ -53,6 +59,8 @@ still, the docker application should be installed and configured properly. If you don't need to change the application configuration, you may use the built docker image as well. +## Work with dedoc as service + ### 1. 
Pull the image ```shell docker pull dedocproject/dedoc diff --git a/dedoc/data_structures/line_with_meta.py b/dedoc/data_structures/line_with_meta.py index 2d906cf7..ca954573 100644 --- a/dedoc/data_structures/line_with_meta.py +++ b/dedoc/data_structures/line_with_meta.py @@ -1,4 +1,5 @@ import re +from copy import deepcopy from typing import List, Optional, Sized, Union from uuid import uuid1 @@ -45,7 +46,7 @@ def join(lines: List["LineWithMeta"], delimiter: str = "\n") -> "LineWithMeta": if len(lines) == 0: return LineWithMeta("") - common_line = lines[0] + common_line = deepcopy(lines[0]) for next_line in lines[1:]: common_line += delimiter diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py index f7a3fd98..04fdd2d1 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py @@ -158,9 +158,9 @@ def get_line_with_meta(text: str, return LineWithMeta(line=text, metadata=LineMetadata(page_id=0, line_id=None), annotations=annotations) @staticmethod - def upscale(image: Optional[np.ndarray], padding_px: int = 40) -> Optional[np.ndarray]: + def upscale(image: Optional[np.ndarray], padding_px: int = 40) -> Tuple[Optional[np.ndarray], int]: if image is None or sum(image.shape) < 5: - return image + return image, 0 color_backgr = get_highest_pixel_frequency(image) @@ -170,4 +170,5 @@ def upscale(image: Optional[np.ndarray], padding_px: int = 40) -> Optional[np.nd else: bigger_cell = np.full((image.shape[0] + padding_px, image.shape[1] + padding_px, 3), color_backgr) bigger_cell[padding_px // 2:-padding_px // 2, padding_px // 2:-padding_px // 2, :] = image - return bigger_cell + + return bigger_cell, padding_px // 2 diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py 
b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py index c964c9a6..80055a9b 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py @@ -2,6 +2,7 @@ from typing import List import numpy as np +from dedocutils.data_structures import BBox from dedoc.data_structures import ConfidenceAnnotation, LineWithMeta from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell @@ -142,25 +143,35 @@ def _split_row(cell_splitter: Cell, union_cell: List[Cell], language: str, image union_cell[col_id].y_top_left = y_top_split union_cell[col_id].y_bottom_right = y_bottom_split - cell_image = OCRCellExtractor.upscale(image[y_top_split:y_bottom_split, x_left:x_right]) - result_row[col_id].lines = __get_ocr_lines(cell_image, language, page_image=image) + cell_image, padding_value = OCRCellExtractor.upscale(image[y_top_split:y_bottom_split, x_left:x_right]) + result_row[col_id].lines = __get_ocr_lines(cell_image, language, page_image=image, + cell_bbox=BBox(x_top_left=x_left, y_top_left=y_top_split, + width=x_right - x_left, height=y_bottom_split - y_top_split), + padding_cell_value=padding_value) col_id -= 1 return result_row -def __get_ocr_lines(cell_image: np.ndarray, language: str, page_image: np.ndarray) -> List[LineWithMeta]: +def __get_ocr_lines(cell_image: np.ndarray, language: str, page_image: np.ndarray, cell_bbox: BBox, padding_cell_value: int) -> List[LineWithMeta]: ocr_result = get_text_with_bbox_from_cells(cell_image, language) cell_lines = [] for line in list(ocr_result.lines): text_line = OCRCellExtractor.get_line_with_meta("") for word in line.words: + # do absolute coordinate on src_image (inside src_image) + word.bbox.y_top_left -= padding_cell_value + word.bbox.x_top_left -= padding_cell_value + word.bbox.y_top_left += cell_bbox.y_top_left + word.bbox.x_top_left += cell_bbox.x_top_left 
+ # add space between words if len(text_line) != 0: text_line += OCRCellExtractor.get_line_with_meta(" ", bbox=word.bbox, image=page_image) - # add confidence value + # add confidence value + text_line += OCRCellExtractor.get_line_with_meta( text=word.text, bbox=word.bbox, image=page_image, confidences=[ConfidenceAnnotation(start=0, end=len(word.text), value=0. if word.confidence < 0 else word.confidence / 100.)] diff --git a/dedoc/scripts/test_words_bbox_extraction.py b/dedoc/scripts/test_words_bbox_extraction.py index 61385b4f..64a062dc 100644 --- a/dedoc/scripts/test_words_bbox_extraction.py +++ b/dedoc/scripts/test_words_bbox_extraction.py @@ -177,12 +177,17 @@ def test_table_word_extraction(self): cv2.imwrite(os.path.join(output_path, file_name.split('/')[-1]), image) - def test_document_image_reader(self) -> None: + def test_document_table_split_last_column(self) -> None: filename_to_parameters = { - "scanned/scan_orient_1.jpg": {}, - "skew_corrector/rotated_2.jpg": {} + f"plat_por/plat_por_png ({i}).png": { + "table_type": "split_last_column+wo_external_bounds", + "need_text_localization": "False", + "language": "rus", + "is_one_column_document": "true", + "document_orientation": "no_change" + } for i in range(9) } - output_path = os.path.join(self.output_path, "document_pipeline_readers") + output_path = os.path.join(self.output_path, "tables") os.makedirs(output_path, exist_ok=True) for filename, parameters in filename_to_parameters.items(): result = self._send_request(file_name=filename, data=parameters, expected_code=200) diff --git a/tests/data/plat_por/plat_por_png (0).png b/tests/data/plat_por/plat_por_png (0).png new file mode 100644 index 00000000..53b7ae1d Binary files /dev/null and b/tests/data/plat_por/plat_por_png (0).png differ diff --git a/tests/data/plat_por/plat_por_png (1).png b/tests/data/plat_por/plat_por_png (1).png new file mode 100644 index 00000000..f7380450 Binary files /dev/null and b/tests/data/plat_por/plat_por_png (1).png differ 
diff --git a/tests/data/plat_por/plat_por_png (2).png b/tests/data/plat_por/plat_por_png (2).png new file mode 100644 index 00000000..bbeaa2a3 Binary files /dev/null and b/tests/data/plat_por/plat_por_png (2).png differ diff --git a/tests/data/plat_por/plat_por_png (3).png b/tests/data/plat_por/plat_por_png (3).png new file mode 100644 index 00000000..9687cfa1 Binary files /dev/null and b/tests/data/plat_por/plat_por_png (3).png differ diff --git a/tests/data/plat_por/plat_por_png (4).png b/tests/data/plat_por/plat_por_png (4).png new file mode 100644 index 00000000..148219d8 Binary files /dev/null and b/tests/data/plat_por/plat_por_png (4).png differ diff --git a/tests/data/plat_por/plat_por_png (5).png b/tests/data/plat_por/plat_por_png (5).png new file mode 100644 index 00000000..72e97cb8 Binary files /dev/null and b/tests/data/plat_por/plat_por_png (5).png differ diff --git a/tests/data/plat_por/plat_por_png (6).png b/tests/data/plat_por/plat_por_png (6).png new file mode 100644 index 00000000..ab2ecb46 Binary files /dev/null and b/tests/data/plat_por/plat_por_png (6).png differ diff --git a/tests/data/plat_por/plat_por_png (7).png b/tests/data/plat_por/plat_por_png (7).png new file mode 100644 index 00000000..45bf8b2a Binary files /dev/null and b/tests/data/plat_por/plat_por_png (7).png differ diff --git a/tests/data/plat_por/plat_por_png (8).png b/tests/data/plat_por/plat_por_png (8).png new file mode 100644 index 00000000..b7fb8635 Binary files /dev/null and b/tests/data/plat_por/plat_por_png (8).png differ