Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TLDR-784 ignore gost frame #492

Merged
merged 11 commits into from
Sep 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions dedoc/api/api_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ class QueryParameters:
'"no_change" - set vertical orientation of the document without using an orientation classifier')
need_header_footer_analysis: str = Form("false", enum=["true", "false"], description="Exclude headers and footers from PDF parsing result")
need_binarization: str = Form("false", enum=["true", "false"], description="Binarize document pages (for images or PDF without a textual layer)")
need_gost_frame_analysis: str = Form("false", enum=["true", "false"], description="Parameter for detecting and ignoring GOST frame of the document")
NastyBoget marked this conversation as resolved.
Show resolved Hide resolved

# other formats handling
delimiter: Optional[str] = Form(None, description="Column separator for CSV files")
Expand Down
3 changes: 3 additions & 0 deletions dedoc/api/web/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,9 @@ <h4>PDF handling</h4>
<p>
<label><input name="need_binarization" type="checkbox" value="true"> need_binarization</label>
</p>
<p>
<label><input name="need_gost_frame_analysis" type="checkbox" value="true"> need_gost_frame_analysis</label>
</p>
</details>
</div>

Expand Down
9 changes: 9 additions & 0 deletions dedoc/data_structures/line_with_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,3 +180,12 @@ def __add__(self, other: Union["LineWithMeta", str]) -> "LineWithMeta":
def to_api_schema(self) -> ApiLineWithMeta:
    """
    Build the API schema object for this line.

    :return: ApiLineWithMeta holding the line text and the API schemas of all annotations of the line
    """
    return ApiLineWithMeta(annotations=[item.to_api_schema() for item in self.annotations], text=self._line)

def shift(self, shift_x: int, shift_y: int, image_width: int, image_height: int) -> None:
    """
    Move every "bounding box" annotation of the line by the given offsets and rewrite its value.

    :param shift_x: horizontal offset passed to the ``shift`` method of each bounding box
    :param shift_y: vertical offset passed to the ``shift`` method of each bounding box
    :param image_width: width passed to ``to_relative_dict`` when the shifted box is serialized
    :param image_height: height passed to ``to_relative_dict`` when the shifted box is serialized
    """
    import json
    from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation

    bbox_annotations = (item for item in self.annotations if item.name == "bounding box")
    for bbox_annotation in bbox_annotations:
        # the page size stored in the annotation value is not used: the sizes given by the caller are used instead
        shifted_bbox, _, _ = BBoxAnnotation.get_bbox_from_value(bbox_annotation.value)
        shifted_bbox.shift(shift_x, shift_y)
        bbox_annotation.value = json.dumps(shifted_bbox.to_relative_dict(image_width, image_height))
4 changes: 4 additions & 0 deletions dedoc/readers/pdf_reader/data_classes/line_with_location.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@ def __init__(self, line: str, metadata: LineMetadata, annotations: List[Annotati
self.order = order
super().__init__(line, metadata, annotations, uid)

def shift(self, shift_x: int, shift_y: int, image_width: int, image_height: int) -> None:
    """
    Shift the line: first through the parent implementation of ``shift``, then the line location.

    :param shift_x: horizontal offset
    :param shift_y: vertical offset
    :param image_width: image width forwarded to the parent implementation
    :param image_height: image height forwarded to the parent implementation
    """
    # the parent implementation receives the image size as well as the offsets
    super().shift(shift_x=shift_x, shift_y=shift_y, image_width=image_width, image_height=image_height)
    # the location only needs the offsets
    self.location.shift(shift_x=shift_x, shift_y=shift_y)

def __repr__(self) -> str:
    """Return the parent representation with "LineWithMeta" replaced by "LineWithLocation"."""
    return super().__repr__().replace("LineWithMeta", "LineWithLocation")
Expand Down
11 changes: 11 additions & 0 deletions dedoc/readers/pdf_reader/data_classes/tables/cell.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,17 @@ def copy_from(cell: "Cell",
uid=cell.cell_uid,
contour_coord=cell.con_coord)

def shift(self, shift_x: int, shift_y: int, image_width: int, image_height: int) -> None:
    """
    Shift the cell: its lines, its corner coordinates and its contour coordinates (when present).

    :param shift_x: offset added to the x coordinates
    :param shift_y: offset added to the y coordinates
    :param image_width: image width forwarded to the ``shift`` method of each line
    :param image_height: image height forwarded to the ``shift`` method of each line
    """
    # a cell without lines (None or empty) simply has nothing to iterate over
    for cell_line in self.lines or ():
        cell_line.shift(shift_x=shift_x, shift_y=shift_y, image_width=image_width, image_height=image_height)

    self.x_top_left = self.x_top_left + shift_x
    self.x_bottom_right = self.x_bottom_right + shift_x
    self.y_top_left = self.y_top_left + shift_y
    self.y_bottom_right = self.y_bottom_right + shift_y

    contour = self.con_coord
    if contour:
        contour.shift(shift_x=shift_x, shift_y=shift_y)

def __init__(self,
x_top_left: int,
x_bottom_right: int,
Expand Down
3 changes: 3 additions & 0 deletions dedoc/readers/pdf_reader/data_classes/tables/location.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ def __init__(self, page_number: int, bbox: BBox, name: str = "", rotated_angle:
self.name = name
self.rotated_angle = rotated_angle

def shift(self, shift_x: int, shift_y: int) -> None:
    """
    Shift the location by delegating to the ``shift`` method of its bounding box.

    :param shift_x: horizontal offset
    :param shift_y: vertical offset
    """
    box = self.bbox
    box.shift(shift_x, shift_y)

def to_dict(self) -> Dict[str, Any]:
from collections import OrderedDict

Expand Down
63 changes: 58 additions & 5 deletions dedoc/readers/pdf_reader/pdf_base_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
from collections import namedtuple
from typing import Iterator, List, Optional, Set, Tuple

import numpy as np
from dedocutils.data_structures.bbox import BBox
from numpy import ndarray

from dedoc.common.exceptions.bad_file_error import BadFileFormatError
Expand All @@ -11,6 +13,7 @@
from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation
from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment
from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.gost_frame_recognizer import GOSTFrameRecognizer

ParametersForParseDoc = namedtuple("ParametersForParseDoc", [
"orient_analysis_cells",
Expand All @@ -26,7 +29,9 @@
"table_type",
"with_attachments",
"attachments_dir",
"need_content_analysis"
"need_content_analysis",
"need_gost_frame_analysis",
"pdf_with_txt_layer"
])


Expand All @@ -50,6 +55,7 @@ def __init__(self, *, config: Optional[dict] = None, recognized_extensions: Opti
self.attachment_extractor = PDFAttachmentsExtractor(config=self.config)
self.linker = LineObjectLinker(config=self.config)
self.paragraph_extractor = ScanParagraphClassifierExtractor(config=self.config)
self.gost_frame_recognizer = GOSTFrameRecognizer(config=self.config)

def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
"""
Expand Down Expand Up @@ -79,7 +85,10 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
table_type=param_utils.get_param_table_type(parameters),
with_attachments=param_utils.get_param_with_attachments(parameters),
attachments_dir=param_utils.get_param_attachments_dir(parameters, file_path),
need_content_analysis=param_utils.get_param_need_content_analysis(parameters)
need_content_analysis=param_utils.get_param_need_content_analysis(parameters),
need_gost_frame_analysis=param_utils.get_param_need_gost_frame_analysis(parameters),
pdf_with_txt_layer=param_utils.get_param_pdf_with_txt_layer(parameters)

)

lines, scan_tables, attachments, warnings, metadata = self._parse_document(file_path, params_for_parse)
Expand All @@ -98,15 +107,23 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> (
from dedoc.data_structures.hierarchy_level import HierarchyLevel
from dedoc.readers.pdf_reader.utils.header_footers_analysis import footer_header_analysis
from dedoc.utils.pdf_utils import get_pdf_page_count
from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader
from dedoc.utils.utils import flatten

first_page = 0 if parameters.first_page is None or parameters.first_page < 0 else parameters.first_page
last_page = math.inf if parameters.last_page is None else parameters.last_page
images = self._get_images(path, first_page, last_page)

result = Parallel(n_jobs=self.config["n_jobs"])(
delayed(self._process_one_page)(image, parameters, page_number, path) for page_number, image in enumerate(images, start=first_page)
)
if parameters.need_gost_frame_analysis and isinstance(self, PdfImageReader):
gost_analyzed_images = Parallel(n_jobs=self.config["n_jobs"])(delayed(self.gost_frame_recognizer.rec_and_clean_frame)(image) for image in images)
result = Parallel(n_jobs=self.config["n_jobs"])(
delayed(self._process_one_page)(image, parameters, page_number, path) for page_number, (image, box) in
enumerate(gost_analyzed_images, start=first_page)
)
else:
result = Parallel(n_jobs=self.config["n_jobs"])(
delayed(self._process_one_page)(image, parameters, page_number, path) for page_number, image in enumerate(images, start=first_page)
)
oksidgy marked this conversation as resolved.
Show resolved Hide resolved

page_count = get_pdf_page_count(path)
page_count = math.inf if page_count is None else page_count
Expand Down Expand Up @@ -136,8 +153,44 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> (
all_lines_with_paragraphs = self.paragraph_extractor.extract(all_lines_with_links)
if page_angles:
metadata["rotated_page_angles"] = page_angles
if parameters.need_gost_frame_analysis and isinstance(self, PdfImageReader):
self._shift_all_contents(lines=all_lines_with_paragraphs, mp_tables=mp_tables, attachments=attachments, gost_analyzed_images=gost_analyzed_images)
return all_lines_with_paragraphs, mp_tables, attachments, warnings, metadata

def _shift_all_contents(self, lines: List[LineWithMeta], mp_tables: List[ScanTable], attachments: List[PdfImageAttachment],
                        gost_analyzed_images: List[Tuple[np.ndarray, BBox]]) -> None:
    """
    Shift tables, attachments and lines by the top-left corner of the box stored for their page.

    :param lines: lines to shift, the page of each line is taken from ``line.metadata.page_id``
    :param mp_tables: tables whose locations and cells are shifted
    :param attachments: attachments whose locations are shifted
    :param gost_analyzed_images: pairs (image, box) indexed by page number: ``box.x_top_left`` and ``box.y_top_left``
        give the offsets, ``image.shape`` gives the height and width forwarded to the ``shift`` methods
    """
    # NOTE(review): the list is indexed directly by page number, while the visible caller builds it starting from 0
    # and numbers pages from `first_page` - confirm the indices match when first_page > 0

    def offset_of(page_number: int) -> Tuple[int, int]:
        frame_box = gost_analyzed_images[page_number][1]
        return frame_box.x_top_left, frame_box.y_top_left

    def size_of(page_number: int) -> Tuple[int, int]:
        image_shape = gost_analyzed_images[page_number][0].shape
        return image_shape[1], image_shape[0]

    # tables: locations first, then every cell of every row
    for table in mp_tables:
        for table_location in table.locations:
            offset_x, offset_y = offset_of(table_location.page_number)
            table_location.shift(shift_x=offset_x, shift_y=offset_y)

        for cells_row in table.matrix_cells:
            # a table can be located on multiple pages: the first cell of the row that has lines defines the row page,
            # the table page number is used when no cell of the row has lines
            fallback_page = next((cell.lines[0].metadata.page_id for cell in cells_row if cell.lines), table.page_number)
            for cell in cells_row:
                # cells without page number information use the page of the row
                cell_page = cell.lines[0].metadata.page_id if cell.lines else fallback_page
                width, height = size_of(cell_page)
                offset_x, offset_y = offset_of(cell_page)
                cell.shift(shift_x=offset_x, shift_y=offset_y, image_width=width, image_height=height)

    # attachments: only the location is shifted
    for attachment in attachments:
        offset_x, offset_y = offset_of(attachment.location.page_number)
        attachment.location.shift(offset_x, offset_y)

    # lines
    for line in lines:
        line_page = line.metadata.page_id
        width, height = size_of(line_page)
        offset_x, offset_y = offset_of(line_page)
        line.shift(shift_x=offset_x, shift_y=offset_y, image_width=width, image_height=height)

@abstractmethod
def _process_one_page(self, image: ndarray, parameters: ParametersForParseDoc, page_number: int, path: str) \
-> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[float]]:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import logging
from typing import Optional, Tuple

import cv2
import numpy as np
from dedocutils.data_structures import BBox

from dedoc.readers.pdf_reader.data_classes.tables.table_tree import TableTree
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_utils.img_processing import detect_horizontal_and_vertical_lines as detect_lines

MIN_FRAME_CONTENT_AREA = 0.7


class GOSTFrameRecognizer:
    """
    Find the GOST frame on a page image and crop the image to the content inside the frame.

    The image is binarized, horizontal and vertical lines are detected, and the contours of these lines are parsed
    into a tree. A box is taken as the frame content when its area exceeds MIN_FRAME_CONTENT_AREA of the image area.
    """

    def __init__(self, *, config: Optional[dict] = None) -> None:
        """
        :param config: configuration dictionary, the optional "logger" key supplies the logger;
            a missing config is treated as an empty dictionary
        """
        # `None` is the declared default, so it must not fail on `config.get`
        self.config = {} if config is None else config
        self.logger = self.config.get("logger", logging.getLogger())

    def rec_and_clean_frame(self, image: np.ndarray) -> "Tuple[np.ndarray, BBox]":
        """
        Recognize the frame on the image and crop the image by the found box.

        :param image: page image, either grayscale (less than 3 dimensions) or BGR
        :return: (cropped image, found box) when a frame is detected,
            otherwise the unchanged image and a BBox built from (0, 0, image width, image height)
        """
        # an image with less than 3 dimensions is already converted to grayscale
        gray_image = image if len(image.shape) < 3 else cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        _, img_bin = cv2.threshold(gray_image, 225, 255, cv2.THRESH_BINARY)
        # the binarized image is inverted before the line detection
        lines_bin = detect_lines(255 - img_bin, self.config, "tables")
        contours, hierarchy = cv2.findContours(lines_bin, cv2.RETR_TREE, cv2.CHAIN_APPROX_TC89_KCOS)
        tree_table = TableTree.parse_contours_to_tree(contours=contours, hierarchy=hierarchy, config=self.config)

        img_area = image.shape[0] * image.shape[1]
        has_gost_frame, main_box = self._analyze_cells_on_frame(tree_table, img_area)
        if has_gost_frame:
            return BBox.crop_image_by_box(image, main_box), main_box
        return image, BBox(0, 0, image.shape[1], image.shape[0])

    def _analyze_cells_on_frame(self, tree_table: "TableTree", img_area: "int") -> "Tuple[bool, Optional[BBox]]":
        """
        Look for a box that occupies more than MIN_FRAME_CONTENT_AREA of the image among the children
        of the first child of the tree root.

        :param tree_table: tree built from the contours of the detected lines
        :param img_area: area of the whole image in pixels
        :return: (True, box) for the first suitable box, (False, None) if there is no such box or the tree cannot be analyzed
        """
        try:
            sub_bboxes = tree_table.children[0].children
            for box in sub_bboxes:
                if box.cell_box.square / img_area > MIN_FRAME_CONTENT_AREA:
                    return True, box.cell_box
            return False, None
        except Exception as ex:
            # best effort: any failure (e.g. a tree without children) is logged and means "no frame"
            self.logger.warning(ex)
            return False, None
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def get_contours_cells(img: np.ndarray, table_type: str, *, config: dict) -> [An
if config.get("debug_mode", False):
cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "image_bin.jpg"), img_bin)
# step 2
img_final_bin = __detect_horizontal_and_vertical_lines(img_bin, config, "tables")
img_final_bin = detect_horizontal_and_vertical_lines(img_bin, config, "tables")
# step 3
img_final_bin_houph, angle_alignment = __apply_houph_lines_and_detect_angle(img_final_bin, config)

Expand Down Expand Up @@ -182,7 +182,7 @@ def __apply_houph_lines_and_detect_angle(image: np.ndarray, config: dict) -> [np
return img_final_bin_houph, angle_alignment


def __detect_horizontal_and_vertical_lines(img_bin: np.ndarray, config: dict, task: str) -> np.ndarray:
def detect_horizontal_and_vertical_lines(img_bin: np.ndarray, config: dict, task: str) -> np.ndarray:
# Defining a kernel length

if task == "orientation":
Expand Down
7 changes: 7 additions & 0 deletions dedoc/utils/parameter_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,13 @@ def get_param_need_pdf_table_analysis(parameters: Optional[dict]) -> bool:
return need_pdf_table_analysis


def get_param_need_gost_frame_analysis(parameters: Optional[dict]) -> bool:
    """
    Read the "need_gost_frame_analysis" flag from the parameters.

    :param parameters: dictionary with parameters or None
    :return: True only if the parameter value, converted to a lowercase string, equals "true";
        False if parameters are None or the key is absent
    """
    if parameters is None:
        return False
    raw_value = parameters.get("need_gost_frame_analysis", "False")
    return str(raw_value).lower() == "true"


def get_param_need_binarization(parameters: Optional[dict]) -> bool:
if parameters is None:
return False
Expand Down
2 changes: 2 additions & 0 deletions docs/source/_static/code_examples/langchain/dedoc_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ def __init__(
result for parsing PDF and images
need_binarization: clean pages background (binarize) for PDF without a
textual layer and images
need_gost_frame_analysis: detect and ignore GOST frame of the document
need_pdf_table_analysis: parse tables for PDF without a textual layer
and images
delimiter: column separator for CSV, TSV files
Expand Down Expand Up @@ -374,6 +375,7 @@ def __init__(
result for parsing PDF and images
need_binarization: clean pages background (binarize) for PDF without a
textual layer and images
need_gost_frame_analysis: detect and ignore GOST frame
need_pdf_table_analysis: parse tables for PDF without a textual layer
and images
delimiter: column separator for CSV, TSV files
Expand Down
1 change: 1 addition & 0 deletions docs/source/_static/code_examples/langchain/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ class DedocPDFLoader(DedocBaseLoader):
need_header_footer_analysis: remove headers and footers from the output result
need_binarization: clean pages background (binarize) for PDF without a textual
layer
need_gost_frame_analysis: detect and ignore GOST frame
need_pdf_table_analysis: parse tables for PDF without a textual layer

Examples
Expand Down
5 changes: 5 additions & 0 deletions docs/source/dedoc_api_usage/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,11 @@ Api parameters description
* **true** -- if any text is detected in a PDF file, Dedoc assumes that textual layer is detected and it is correct. Much faster but less accurate.
* **false** -- use the textual layer classifier to detect textual layer and prove its correctness.

* - need_gost_frame_analysis
- true, false
- false
- This option is used to enable GOST (Russian government standard) frame recognition for PDF documents or images.
The GOST frame recognizer is used to recognize and ignore the GOST frame on images and PDF documents without a correct textual layer.

* - language
- rus, eng, rus+eng, fra, spa
Expand Down
11 changes: 11 additions & 0 deletions docs/source/parameters/pdf_handling.rst
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,17 @@ PDF and images handling
If the document has a textual layer, it is recommended to use :class:`dedoc.readers.PdfTabbyReader`,
in this case tables will be parsed much easier and faster.

* - need_gost_frame_analysis
- True, False
- False
- * :meth:`dedoc.DedocManager.parse`
* :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfBaseReader.read`
NastyBoget marked this conversation as resolved.
Show resolved Hide resolved
* :meth:`dedoc.readers.ReaderComposition.read`
- This option is used to enable GOST (Russian government standard) frame recognition for PDF documents or images.
The GOST frame recognizer is used in :meth:`dedoc.readers.PdfBaseReader.read`. Its main function is to recognize and
ignore the GOST frame on the document. It allows :class:`dedoc.readers.PdfImageReader` to properly process the content
of a document containing a GOST frame.

* - orient_analysis_cells
- True, False
- False
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
beautifulsoup4>=4.10.0,<=4.12.2
charset-normalizer>=2.0.12,<=3.2.0
Cython>=0.29.28,<=3.0.2
dedoc-utils==0.3.7
dedoc-utils==0.3.8
fastapi>=0.77.0,<1.0
huggingface-hub>=0.14.1,<1.0
imutils==0.5.4
Expand Down
10 changes: 10 additions & 0 deletions tests/api_tests/test_api_module_table_recognizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,3 +213,13 @@ def test_detect_small_table(self) -> None:
result = self._send_request(file_name, data={"language": "rus"})
tables = result["content"]["tables"]
self.assertEqual(2, len(tables))

def test_multipage_gost_table(self) -> None:
    """Check that a multipage table inside a GOST frame is returned as the first table up to its last row."""
    file_name = "gost_multipage_table.pdf"
    # don't pass pdf_with_text_layer to check condition in PDFBaseReader
    result = self._send_request(file_name, data={"need_gost_frame_analysis": "True"})
    cells = result["content"]["tables"][0]["cells"]
    self.assertGreater(len(cells), 35)

    # check the last row of the multipage table: it must belong to the first and only table
    last_row = cells[-1]
    expected_fragments = ["KR13", "R13.1", "Испытание по проверке", "3.6", "7.4.9"]
    for column, fragment in enumerate(expected_fragments):
        # assertIn reports both the fragment and the actual cell text on failure
        self.assertIn(fragment, last_row[column]["lines"][0]["text"])
Binary file added tests/data/tables/gost_frame_1.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added tests/data/tables/gost_frame_2.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added tests/data/tables/gost_frame_3.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added tests/data/tables/gost_multipage_table.pdf
Binary file not shown.
Binary file added tests/data/tables/not_gost_frame.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Loading