ESL-165 table bboxes bug (#358)

* ESL-165 Added test with hard tables
* ESL-165 fixed bug box extraction in payment_order
* ESL-165 after rebase
* ESL-165 update README.md
* ESL-165 after review

---------

Co-authored-by: Nasty <[email protected]>
ispras · Oct 20, 2023 · bf1a60d · bf1a60d
1 parent 62445da
commit bf1a60d
Show file tree

Hide file tree

Showing 14 changed files with 38 additions and 12 deletions.
diff --git a/README.md b/README.md
@@ -9,6 +9,12 @@ It extracts a document’s logical structure and content, its tables, text forma
 The document’s content is represented as a tree storing headings and lists of any level. 
 Dedoc can be integrated in a document contents and structure analysis system as a separate module.
 
+## Workflow
+
+![Workflow](https://github.com/ispras/dedoc/raw/master/docs/source/_static/workflow.png)
+
+Workflow description is given [`here`](https://dedoc.readthedocs.io/en/latest/?badge=latest#workflow)
+
 ## Features and advantages
 Dedoc is implemented in Python and works with semi-structured data formats (DOC/DOCX, ODT, XLS/XLSX, CSV, TXT, JSON) and non-structured data formats like images (PNG, JPG etc.), archives (ZIP, RAR etc.), PDF and HTML formats. 
 Document structure extraction is fully automatic regardless of input data type. 
@@ -53,6 +59,8 @@ still, the docker application should be installed and configured properly.
 
 If you don't need to change the application configuration, you may use the built docker image as well.
 
+## Work with dedoc as service
+
 ### 1. Pull the image
 ```shell
 docker pull dedocproject/dedoc

diff --git a/dedoc/data_structures/line_with_meta.py b/dedoc/data_structures/line_with_meta.py
@@ -1,4 +1,5 @@
 import re
+from copy import deepcopy
 from typing import List, Optional, Sized, Union
 from uuid import uuid1
 
@@ -45,7 +46,7 @@ def join(lines: List["LineWithMeta"], delimiter: str = "\n") -> "LineWithMeta":
         if len(lines) == 0:
             return LineWithMeta("")
 
-        common_line = lines[0]
+        common_line = deepcopy(lines[0])
 
         for next_line in lines[1:]:
             common_line += delimiter

diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py
@@ -158,9 +158,9 @@ def get_line_with_meta(text: str,
         return LineWithMeta(line=text, metadata=LineMetadata(page_id=0, line_id=None), annotations=annotations)
 
     @staticmethod
-    def upscale(image: Optional[np.ndarray], padding_px: int = 40) -> Optional[np.ndarray]:
+    def upscale(image: Optional[np.ndarray], padding_px: int = 40) -> Tuple[Optional[np.ndarray], int]:
         if image is None or sum(image.shape) < 5:
-            return image
+            return image, 0
 
         color_backgr = get_highest_pixel_frequency(image)
 
@@ -170,4 +170,5 @@ def upscale(image: Optional[np.ndarray], padding_px: int = 40) -> Optional[np.nd
         else:
             bigger_cell = np.full((image.shape[0] + padding_px, image.shape[1] + padding_px, 3), color_backgr)
             bigger_cell[padding_px // 2:-padding_px // 2, padding_px // 2:-padding_px // 2, :] = image
-        return bigger_cell
+
+        return bigger_cell, padding_px // 2
diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py
@@ -2,6 +2,7 @@
 from typing import List
 
 import numpy as np
+from dedocutils.data_structures import BBox
 
 from dedoc.data_structures import ConfidenceAnnotation, LineWithMeta
 from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell
@@ -142,25 +143,35 @@ def _split_row(cell_splitter: Cell, union_cell: List[Cell], language: str, image
         union_cell[col_id].y_top_left = y_top_split
         union_cell[col_id].y_bottom_right = y_bottom_split
 
-        cell_image = OCRCellExtractor.upscale(image[y_top_split:y_bottom_split, x_left:x_right])
-        result_row[col_id].lines = __get_ocr_lines(cell_image, language, page_image=image)
+        cell_image, padding_value = OCRCellExtractor.upscale(image[y_top_split:y_bottom_split, x_left:x_right])
+        result_row[col_id].lines = __get_ocr_lines(cell_image, language, page_image=image,
+                                                   cell_bbox=BBox(x_top_left=x_left, y_top_left=y_top_split,
+                                                                  width=x_right - x_left, height=y_bottom_split - y_top_split),
+                                                   padding_cell_value=padding_value)
 
         col_id -= 1
 
     return result_row
 
 
-def __get_ocr_lines(cell_image: np.ndarray, language: str, page_image: np.ndarray) -> List[LineWithMeta]:
+def __get_ocr_lines(cell_image: np.ndarray, language: str, page_image: np.ndarray, cell_bbox: BBox, padding_cell_value: int) -> List[LineWithMeta]:
 
     ocr_result = get_text_with_bbox_from_cells(cell_image, language)
     cell_lines = []
     for line in list(ocr_result.lines):
         text_line = OCRCellExtractor.get_line_with_meta("")
         for word in line.words:
+            # convert the word bbox from padded-cell coordinates to absolute coordinates on the source image
+            word.bbox.y_top_left -= padding_cell_value
+            word.bbox.x_top_left -= padding_cell_value
+            word.bbox.y_top_left += cell_bbox.y_top_left
+            word.bbox.x_top_left += cell_bbox.x_top_left
+
             # add space between words
             if len(text_line) != 0:
                 text_line += OCRCellExtractor.get_line_with_meta(" ", bbox=word.bbox, image=page_image)
-            # add confidence value
+                # add confidence value
+
             text_line += OCRCellExtractor.get_line_with_meta(
                 text=word.text, bbox=word.bbox, image=page_image,
                 confidences=[ConfidenceAnnotation(start=0, end=len(word.text), value=0. if word.confidence < 0 else word.confidence / 100.)]

diff --git a/dedoc/scripts/test_words_bbox_extraction.py b/dedoc/scripts/test_words_bbox_extraction.py
@@ -177,12 +177,17 @@ def test_table_word_extraction(self):
 
             cv2.imwrite(os.path.join(output_path, file_name.split('/')[-1]), image)
 
-    def test_document_image_reader(self) -> None:
+    def test_document_table_split_last_column(self) -> None:
         filename_to_parameters = {
-            "scanned/scan_orient_1.jpg": {},
-            "skew_corrector/rotated_2.jpg": {}
+            f"plat_por/plat_por_png ({i}).png": {
+                "table_type": "split_last_column+wo_external_bounds",
+                "need_text_localization": "False",
+                "language": "rus",
+                "is_one_column_document": "true",
+                "document_orientation": "no_change"
+            } for i in range(9)
         }
-        output_path = os.path.join(self.output_path, "document_pipeline_readers")
+        output_path = os.path.join(self.output_path, "tables")
         os.makedirs(output_path, exist_ok=True)
         for filename, parameters in filename_to_parameters.items():
             result = self._send_request(file_name=filename, data=parameters, expected_code=200)

diff --git a/tests/data/plat_por/plat_por_png (0).png b/tests/data/plat_por/plat_por_png (0).png
diff --git a/tests/data/plat_por/plat_por_png (1).png b/tests/data/plat_por/plat_por_png (1).png
diff --git a/tests/data/plat_por/plat_por_png (2).png b/tests/data/plat_por/plat_por_png (2).png
diff --git a/tests/data/plat_por/plat_por_png (3).png b/tests/data/plat_por/plat_por_png (3).png
diff --git a/tests/data/plat_por/plat_por_png (4).png b/tests/data/plat_por/plat_por_png (4).png
diff --git a/tests/data/plat_por/plat_por_png (5).png b/tests/data/plat_por/plat_por_png (5).png
diff --git a/tests/data/plat_por/plat_por_png (6).png b/tests/data/plat_por/plat_por_png (6).png
diff --git a/tests/data/plat_por/plat_por_png (7).png b/tests/data/plat_por/plat_por_png (7).png
diff --git a/tests/data/plat_por/plat_por_png (8).png b/tests/data/plat_por/plat_por_png (8).png