Skip to content

Commit

Permalink
ESL-165 table bboxes bug (#358)
Browse files Browse the repository at this point in the history
* ESL-165 Added test with hard tables

* ESL-165 fixed bug box extraction in payment_order

* ESL-165 after rebase

* ESL-165 update README.md

* ESL-165 after review

---------

Co-authored-by: Nasty <[email protected]>
  • Loading branch information
oksidgy and NastyBoget authored Oct 20, 2023
1 parent 62445da commit bf1a60d
Show file tree
Hide file tree
Showing 14 changed files with 38 additions and 12 deletions.
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,12 @@ It extracts a document’s logical structure and content, its tables, text forma
The document’s content is represented as a tree storing headings and lists of any level.
Dedoc can be integrated into a document content and structure analysis system as a separate module.

## Workflow

![Workflow](https://github.com/ispras/dedoc/raw/master/docs/source/_static/workflow.png)

The workflow description is given [`here`](https://dedoc.readthedocs.io/en/latest/?badge=latest#workflow).

## Features and advantages
Dedoc is implemented in Python and works with semi-structured data formats (DOC/DOCX, ODT, XLS/XLSX, CSV, TXT, JSON) and non-structured data formats like images (PNG, JPG etc.), archives (ZIP, RAR etc.), PDF and HTML formats.
Document structure extraction is fully automatic regardless of input data type.
Expand Down Expand Up @@ -53,6 +59,8 @@ still, the docker application should be installed and configured properly.

If you don't need to change the application configuration, you may use the built docker image as well.

## Work with dedoc as a service

### 1. Pull the image
```shell
docker pull dedocproject/dedoc
Expand Down
3 changes: 2 additions & 1 deletion dedoc/data_structures/line_with_meta.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import re
from copy import deepcopy
from typing import List, Optional, Sized, Union
from uuid import uuid1

Expand Down Expand Up @@ -45,7 +46,7 @@ def join(lines: List["LineWithMeta"], delimiter: str = "\n") -> "LineWithMeta":
if len(lines) == 0:
return LineWithMeta("")

common_line = lines[0]
common_line = deepcopy(lines[0])

for next_line in lines[1:]:
common_line += delimiter
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -158,9 +158,9 @@ def get_line_with_meta(text: str,
return LineWithMeta(line=text, metadata=LineMetadata(page_id=0, line_id=None), annotations=annotations)

@staticmethod
def upscale(image: Optional[np.ndarray], padding_px: int = 40) -> Optional[np.ndarray]:
def upscale(image: Optional[np.ndarray], padding_px: int = 40) -> Tuple[Optional[np.ndarray], int]:
if image is None or sum(image.shape) < 5:
return image
return image, 0

color_backgr = get_highest_pixel_frequency(image)

Expand All @@ -170,4 +170,5 @@ def upscale(image: Optional[np.ndarray], padding_px: int = 40) -> Optional[np.nd
else:
bigger_cell = np.full((image.shape[0] + padding_px, image.shape[1] + padding_px, 3), color_backgr)
bigger_cell[padding_px // 2:-padding_px // 2, padding_px // 2:-padding_px // 2, :] = image
return bigger_cell

return bigger_cell, padding_px // 2
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from typing import List

import numpy as np
from dedocutils.data_structures import BBox

from dedoc.data_structures import ConfidenceAnnotation, LineWithMeta
from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell
Expand Down Expand Up @@ -142,25 +143,35 @@ def _split_row(cell_splitter: Cell, union_cell: List[Cell], language: str, image
union_cell[col_id].y_top_left = y_top_split
union_cell[col_id].y_bottom_right = y_bottom_split

cell_image = OCRCellExtractor.upscale(image[y_top_split:y_bottom_split, x_left:x_right])
result_row[col_id].lines = __get_ocr_lines(cell_image, language, page_image=image)
cell_image, padding_value = OCRCellExtractor.upscale(image[y_top_split:y_bottom_split, x_left:x_right])
result_row[col_id].lines = __get_ocr_lines(cell_image, language, page_image=image,
cell_bbox=BBox(x_top_left=x_left, y_top_left=y_top_split,
width=x_right - x_left, height=y_bottom_split - y_top_split),
padding_cell_value=padding_value)

col_id -= 1

return result_row


def __get_ocr_lines(cell_image: np.ndarray, language: str, page_image: np.ndarray) -> List[LineWithMeta]:
def __get_ocr_lines(cell_image: np.ndarray, language: str, page_image: np.ndarray, cell_bbox: BBox, padding_cell_value: int) -> List[LineWithMeta]:

ocr_result = get_text_with_bbox_from_cells(cell_image, language)
cell_lines = []
for line in list(ocr_result.lines):
text_line = OCRCellExtractor.get_line_with_meta("")
for word in line.words:
# do absolute coordinate on src_image (inside src_image)
word.bbox.y_top_left -= padding_cell_value
word.bbox.x_top_left -= padding_cell_value
word.bbox.y_top_left += cell_bbox.y_top_left
word.bbox.x_top_left += cell_bbox.x_top_left

# add space between words
if len(text_line) != 0:
text_line += OCRCellExtractor.get_line_with_meta(" ", bbox=word.bbox, image=page_image)
# add confidence value
# add confidence value

text_line += OCRCellExtractor.get_line_with_meta(
text=word.text, bbox=word.bbox, image=page_image,
confidences=[ConfidenceAnnotation(start=0, end=len(word.text), value=0. if word.confidence < 0 else word.confidence / 100.)]
Expand Down
13 changes: 9 additions & 4 deletions dedoc/scripts/test_words_bbox_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,12 +177,17 @@ def test_table_word_extraction(self):

cv2.imwrite(os.path.join(output_path, file_name.split('/')[-1]), image)

def test_document_image_reader(self) -> None:
def test_document_table_split_last_column(self) -> None:
filename_to_parameters = {
"scanned/scan_orient_1.jpg": {},
"skew_corrector/rotated_2.jpg": {}
f"plat_por/plat_por_png ({i}).png": {
"table_type": "split_last_column+wo_external_bounds",
"need_text_localization": "False",
"language": "rus",
"is_one_column_document": "true",
"document_orientation": "no_change"
} for i in range(9)
}
output_path = os.path.join(self.output_path, "document_pipeline_readers")
output_path = os.path.join(self.output_path, "tables")
os.makedirs(output_path, exist_ok=True)
for filename, parameters in filename_to_parameters.items():
result = self._send_request(file_name=filename, data=parameters, expected_code=200)
Expand Down
Binary file added tests/data/plat_por/plat_por_png (0).png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added tests/data/plat_por/plat_por_png (1).png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added tests/data/plat_por/plat_por_png (2).png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added tests/data/plat_por/plat_por_png (3).png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added tests/data/plat_por/plat_por_png (4).png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added tests/data/plat_por/plat_por_png (5).png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added tests/data/plat_por/plat_por_png (6).png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added tests/data/plat_por/plat_por_png (7).png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added tests/data/plat_por/plat_por_png (8).png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

0 comments on commit bf1a60d

Please sign in to comment.