ispras · oksidgy · Sep 12, 2024 · Sep 4, 2024 · Sep 5, 2024 · Sep 6, 2024
diff --git a/dedoc/api/api_args.py b/dedoc/api/api_args.py
@@ -40,6 +40,7 @@ class QueryParameters:
                                                  '"no_change" - set vertical orientation of the document without using an orientation classifier')
     need_header_footer_analysis: str = Form("false", enum=["true", "false"], description="Exclude headers and footers from PDF parsing result")
     need_binarization: str = Form("false", enum=["true", "false"], description="Binarize document pages (for images or PDF without a textual layer)")
+    need_gost_frame_analysis: str = Form("false", enum=["true", "false"], description="Parameter for detecting and ignoring GOST frame of the document")
 
     # other formats handling
     delimiter: Optional[str] = Form(None, description="Column separator for CSV files")

diff --git a/dedoc/api/web/index.html b/dedoc/api/web/index.html
@@ -183,6 +183,9 @@ <h4>PDF handling</h4>
                 <p>
                     <label><input name="need_binarization" type="checkbox" value="true"> need_binarization</label>
                 </p>
+                <p>
+                    <label><input name="need_gost_frame_analysis" type="checkbox" value="true"> need_gost_frame_analysis</label>
+                </p>
             </details>
         </div>
 

diff --git a/dedoc/data_structures/line_with_meta.py b/dedoc/data_structures/line_with_meta.py
@@ -180,3 +180,12 @@ def __add__(self, other: Union["LineWithMeta", str]) -> "LineWithMeta":
     def to_api_schema(self) -> ApiLineWithMeta:
         annotations = [annotation.to_api_schema() for annotation in self.annotations]
         return ApiLineWithMeta(text=self._line, annotations=annotations)
+
+    def shift(self, shift_x: int, shift_y: int, image_width: int, image_height: int) -> None:
+        """
+        Shift every "bounding box" annotation of the line in place.
+
+        :param shift_x: horizontal offset passed to the ``shift`` method of the annotation box
+        :param shift_y: vertical offset passed to the ``shift`` method of the annotation box
+        :param image_width: width used to serialize the shifted box via ``to_relative_dict``
+        :param image_height: height used to serialize the shifted box via ``to_relative_dict``
+        """
+        import json
+        from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation
+
+        for annotation in self.annotations:
+            if annotation.name != "bounding box":
+                continue
+            # the page size stored in the annotation value is ignored: the box is re-serialized with the given image size
+            bbox, _, _ = BBoxAnnotation.get_bbox_from_value(annotation.value)
+            bbox.shift(shift_x, shift_y)
+            annotation.value = json.dumps(bbox.to_relative_dict(image_width, image_height))
diff --git a/dedoc/readers/pdf_reader/data_classes/line_with_location.py b/dedoc/readers/pdf_reader/data_classes/line_with_location.py
@@ -13,6 +13,10 @@ def __init__(self, line: str, metadata: LineMetadata, annotations: List[Annotati
         self.order = order
         super().__init__(line, metadata, annotations, uid)
 
+    def shift(self, shift_x: int, shift_y: int, image_width: int, image_height: int) -> None:
+        """
+        Shift the line in place: first the parent implementation is called with all four arguments,
+        then the line location is shifted by the same offsets.
+        """
+        super().shift(shift_x, shift_y, image_width, image_height)
+        line_location = self.location
+        line_location.shift(shift_x, shift_y)
+
     def __repr__(self) -> str:
         parent_repr = super().__repr__()
         return parent_repr.replace("LineWithMeta", "LineWithLocation")

diff --git a/dedoc/readers/pdf_reader/data_classes/tables/cell.py b/dedoc/readers/pdf_reader/data_classes/tables/cell.py
@@ -30,6 +30,17 @@ def copy_from(cell: "Cell",
                     uid=cell.cell_uid,
                     contour_coord=cell.con_coord)
 
+    def shift(self, shift_x: int, shift_y: int, image_width: int, image_height: int) -> None:
+        """
+        Shift the cell in place: its lines (if any), its corner coordinates and its contour box (if set).
+
+        :param shift_x: offset added to the x coordinates of the cell
+        :param shift_y: offset added to the y coordinates of the cell
+        :param image_width: forwarded to the ``shift`` method of every line of the cell
+        :param image_height: forwarded to the ``shift`` method of every line of the cell
+        """
+        for cell_line in self.lines or []:
+            cell_line.shift(shift_x=shift_x, shift_y=shift_y, image_width=image_width, image_height=image_height)
+
+        self.x_top_left = self.x_top_left + shift_x
+        self.x_bottom_right = self.x_bottom_right + shift_x
+        self.y_top_left = self.y_top_left + shift_y
+        self.y_bottom_right = self.y_bottom_right + shift_y
+
+        contour_box = self.con_coord
+        if contour_box:
+            contour_box.shift(shift_x=shift_x, shift_y=shift_y)
+
     def __init__(self,
                  x_top_left: int,
                  x_bottom_right: int,

diff --git a/dedoc/readers/pdf_reader/data_classes/tables/location.py b/dedoc/readers/pdf_reader/data_classes/tables/location.py
@@ -12,6 +12,9 @@ def __init__(self, page_number: int, bbox: BBox, name: str = "", rotated_angle:
         self.name = name
         self.rotated_angle = rotated_angle
 
+    def shift(self, shift_x: int, shift_y: int) -> None:
+        """
+        Shift the bounding box of the location in place by delegating to ``bbox.shift``.
+
+        :param shift_x: horizontal offset passed to the box
+        :param shift_y: vertical offset passed to the box
+        """
+        box = self.bbox
+        box.shift(shift_x, shift_y)
+
     def to_dict(self) -> Dict[str, Any]:
         from collections import OrderedDict
 

diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py
@@ -2,6 +2,8 @@
 from collections import namedtuple
 from typing import Iterator, List, Optional, Set, Tuple
 
+import numpy as np
+from dedocutils.data_structures.bbox import BBox
 from numpy import ndarray
 
 from dedoc.common.exceptions.bad_file_error import BadFileFormatError
@@ -11,6 +13,7 @@
 from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation
 from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment
 from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable
+from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.gost_frame_recognizer import GOSTFrameRecognizer
 
 ParametersForParseDoc = namedtuple("ParametersForParseDoc", [
     "orient_analysis_cells",
@@ -26,7 +29,9 @@
     "table_type",
     "with_attachments",
     "attachments_dir",
-    "need_content_analysis"
+    "need_content_analysis",
+    "need_gost_frame_analysis",
+    "pdf_with_txt_layer"
 ])
 
 
@@ -50,6 +55,7 @@ def __init__(self, *, config: Optional[dict] = None, recognized_extensions: Opti
         self.attachment_extractor = PDFAttachmentsExtractor(config=self.config)
         self.linker = LineObjectLinker(config=self.config)
         self.paragraph_extractor = ScanParagraphClassifierExtractor(config=self.config)
+        self.gost_frame_recognizer = GOSTFrameRecognizer(config=self.config)
 
     def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
         """
@@ -79,7 +85,10 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
             table_type=param_utils.get_param_table_type(parameters),
             with_attachments=param_utils.get_param_with_attachments(parameters),
             attachments_dir=param_utils.get_param_attachments_dir(parameters, file_path),
-            need_content_analysis=param_utils.get_param_need_content_analysis(parameters)
+            need_content_analysis=param_utils.get_param_need_content_analysis(parameters),
+            need_gost_frame_analysis=param_utils.get_param_need_gost_frame_analysis(parameters),
+            pdf_with_txt_layer=param_utils.get_param_pdf_with_txt_layer(parameters)
+
         )
 
         lines, scan_tables, attachments, warnings, metadata = self._parse_document(file_path, params_for_parse)
@@ -98,15 +107,23 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> (
         from dedoc.data_structures.hierarchy_level import HierarchyLevel
         from dedoc.readers.pdf_reader.utils.header_footers_analysis import footer_header_analysis
         from dedoc.utils.pdf_utils import get_pdf_page_count
+        from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader
         from dedoc.utils.utils import flatten
 
         first_page = 0 if parameters.first_page is None or parameters.first_page < 0 else parameters.first_page
         last_page = math.inf if parameters.last_page is None else parameters.last_page
         images = self._get_images(path, first_page, last_page)
 
-        result = Parallel(n_jobs=self.config["n_jobs"])(
-            delayed(self._process_one_page)(image, parameters, page_number, path) for page_number, image in enumerate(images, start=first_page)
-        )
+        if parameters.need_gost_frame_analysis and isinstance(self, PdfImageReader):
+            gost_analyzed_images = Parallel(n_jobs=self.config["n_jobs"])(delayed(self.gost_frame_recognizer.rec_and_clean_frame)(image) for image in images)
+            result = Parallel(n_jobs=self.config["n_jobs"])(
+                delayed(self._process_one_page)(image, parameters, page_number, path) for page_number, (image, box) in
+                enumerate(gost_analyzed_images, start=first_page)
+            )
+        else:
+            result = Parallel(n_jobs=self.config["n_jobs"])(
+                delayed(self._process_one_page)(image, parameters, page_number, path) for page_number, image in enumerate(images, start=first_page)
+            )
 
         page_count = get_pdf_page_count(path)
         page_count = math.inf if page_count is None else page_count
@@ -136,8 +153,44 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> (
         all_lines_with_paragraphs = self.paragraph_extractor.extract(all_lines_with_links)
         if page_angles:
             metadata["rotated_page_angles"] = page_angles
+        if parameters.need_gost_frame_analysis and isinstance(self, PdfImageReader):
+            self._shift_all_contents(lines=all_lines_with_paragraphs, mp_tables=mp_tables, attachments=attachments, gost_analyzed_images=gost_analyzed_images)
         return all_lines_with_paragraphs, mp_tables, attachments, warnings, metadata
 
+    def _shift_all_contents(self, lines: List[LineWithMeta], mp_tables: List[ScanTable], attachments: List[PdfImageAttachment],
+                            gost_analyzed_images: List[Tuple[np.ndarray, BBox]]) -> None:
+        """
+        Shift tables, attachments and lines in place by the top-left corner of the box found for their page.
+
+        :param lines: document lines, the page of each line is taken from ``line.metadata.page_id``
+        :param mp_tables: tables whose locations and cells are shifted
+        :param attachments: attachments whose locations are shifted
+        :param gost_analyzed_images: (image, box) pairs that are indexed directly by page number
+
+        NOTE(review): the pairs are indexed by the page number itself - verify that the caller builds this list
+        so that indices match page numbers (e.g. when parsing starts from a non-zero first page).
+        """
+        def frame_offset(page: int) -> Tuple[int, int]:
+            frame_box = gost_analyzed_images[page][1]
+            return frame_box.x_top_left, frame_box.y_top_left
+
+        def image_size(page: int) -> Tuple[int, int]:
+            page_image = gost_analyzed_images[page][0]
+            return page_image.shape[1], page_image.shape[0]
+
+        # shift mp_tables
+        for scan_table in mp_tables:
+            for location in scan_table.locations:
+                offset_x, offset_y = frame_offset(location.page_number)
+                location.shift(shift_x=offset_x, shift_y=offset_y)
+
+            for row in scan_table.matrix_cells:
+                # a table can be located on multiple pages: the row page is taken from the first cell with lines,
+                # the page of the table is used when no cell of the row has lines
+                row_page_number = next((cell.lines[0].metadata.page_id for cell in row if cell.lines), scan_table.page_number)
+                for cell in row:
+                    # a cell without lines has no page information, so the row page number is used for it
+                    page_number = cell.lines[0].metadata.page_id if cell.lines else row_page_number
+                    image_width, image_height = image_size(page_number)
+                    offset_x, offset_y = frame_offset(page_number)
+                    cell.shift(shift_x=offset_x, shift_y=offset_y, image_width=image_width, image_height=image_height)
+
+        # shift attachments
+        for attachment in attachments:
+            offset_x, offset_y = frame_offset(attachment.location.page_number)
+            attachment.location.shift(offset_x, offset_y)
+
+        # shift lines
+        for line in lines:
+            page_number = line.metadata.page_id
+            image_width, image_height = image_size(page_number)
+            offset_x, offset_y = frame_offset(page_number)
+            line.shift(shift_x=offset_x, shift_y=offset_y, image_width=image_width, image_height=image_height)
+
     @abstractmethod
     def _process_one_page(self, image: ndarray, parameters: ParametersForParseDoc, page_number: int, path: str) \
             -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[float]]:

diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/gost_frame_recognizer.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/gost_frame_recognizer.py
@@ -0,0 +1,43 @@
+import logging
+from typing import Optional, Tuple
+
+import cv2
+import numpy as np
+from dedocutils.data_structures import BBox
+
+from dedoc.readers.pdf_reader.data_classes.tables.table_tree import TableTree
+from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_utils.img_processing import detect_horizontal_and_vertical_lines as detect_lines
+
+MIN_FRAME_CONTENT_AREA = 0.7
+
+
+class GOSTFrameRecognizer:
+    """
+    Recognizer of a frame around the page content (GOST frame) on a document image.
+
+    The frame is considered found when one of the children of the first child of the contour tree root
+    has a box whose area divided by the image area exceeds ``MIN_FRAME_CONTENT_AREA``.
+    """
+
+    def __init__(self, *, config: Optional[dict] = None) -> None:
+        """
+        :param config: configuration dictionary, the logger is taken from its "logger" key if present
+        """
+        # the default ``None`` used to fail with AttributeError on ``config.get``, so an empty config is used instead
+        self.config = {} if config is None else config
+        self.logger = self.config.get("logger", logging.getLogger())
+
+    def rec_and_clean_frame(self, image: np.ndarray) -> Tuple[np.ndarray, BBox]:
+        """
+        Look for the frame on the image and crop the image by the frame box.
+
+        :param image: page image, an array with less than 3 dimensions is treated as already grayscale, otherwise as BGR
+        :return: the cropped image and the frame box if the frame is found,
+            otherwise the unchanged image and ``BBox(0, 0, image width, image height)``
+        """
+        gray_image = image if len(image.shape) < 3 else cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+        _, img_bin = cv2.threshold(gray_image, 225, 255, cv2.THRESH_BINARY)
+        # lines are detected on the inverted binary image
+        lines_bin = detect_lines(255 - img_bin, self.config, "tables")
+        contours, hierarchy = cv2.findContours(lines_bin, cv2.RETR_TREE, cv2.CHAIN_APPROX_TC89_KCOS)
+        tree_table = TableTree.parse_contours_to_tree(contours=contours, hierarchy=hierarchy, config=self.config)
+
+        image_height, image_width = image.shape[0], image.shape[1]
+        has_gost_frame, main_box = self._analyze_cells_on_frame(tree_table, image_height * image_width)
+        if has_gost_frame:
+            return BBox.crop_image_by_box(image, main_box), main_box
+        return image, BBox(0, 0, image_width, image_height)
+
+    def _analyze_cells_on_frame(self, tree_table: "TableTree", img_area: int) -> Tuple[bool, Optional[BBox]]:
+        """
+        Find the first candidate box that occupies more than ``MIN_FRAME_CONTENT_AREA`` of the image area.
+
+        :param tree_table: contour tree, candidates are the children of its first child
+        :param img_area: area of the whole image in pixels
+        :return: ``(True, box)`` for the first suitable box, ``(False, None)`` if there is no such box or an error occurred
+        """
+        try:
+            for box in tree_table.children[0].children:
+                if box.cell_box.square / img_area > MIN_FRAME_CONTENT_AREA:
+                    return True, box.cell_box
+        except Exception as ex:
+            # best effort: any failure (e.g. a tree without children) is logged and treated as "no frame found"
+            self.logger.warning(ex)
+        return False, None
diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/img_processing.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/img_processing.py
@@ -83,7 +83,7 @@ def get_contours_cells(img: np.ndarray, table_type: str, *, config: dict) -> [An
     if config.get("debug_mode", False):
         cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "image_bin.jpg"), img_bin)
     # step 2
-    img_final_bin = __detect_horizontal_and_vertical_lines(img_bin, config, "tables")
+    img_final_bin = detect_horizontal_and_vertical_lines(img_bin, config, "tables")
     # step 3
     img_final_bin_houph, angle_alignment = __apply_houph_lines_and_detect_angle(img_final_bin, config)
 
@@ -182,7 +182,7 @@ def __apply_houph_lines_and_detect_angle(image: np.ndarray, config: dict) -> [np
     return img_final_bin_houph, angle_alignment
 
 
-def __detect_horizontal_and_vertical_lines(img_bin: np.ndarray, config: dict, task: str) -> np.ndarray:
+def detect_horizontal_and_vertical_lines(img_bin: np.ndarray, config: dict, task: str) -> np.ndarray:
     # Defining a kernel length
 
     if task == "orientation":

diff --git a/dedoc/utils/parameter_utils.py b/dedoc/utils/parameter_utils.py
@@ -66,6 +66,13 @@ def get_param_need_pdf_table_analysis(parameters: Optional[dict]) -> bool:
     return need_pdf_table_analysis
 
 
+def get_param_need_gost_frame_analysis(parameters: Optional[dict]) -> bool:
+    """
+    Read the "need_gost_frame_analysis" flag from the parameters.
+
+    :param parameters: dictionary with parameters or None
+    :return: True only if the value converted to a lowercase string equals "true"; False for missing parameters or a missing key
+    """
+    if parameters is None:
+        return False
+    raw_value = parameters.get("need_gost_frame_analysis", "False")
+    return str(raw_value).lower() == "true"
+
+
 def get_param_need_binarization(parameters: Optional[dict]) -> bool:
     if parameters is None:
         return False

diff --git a/docs/source/_static/code_examples/langchain/dedoc_loader.py b/docs/source/_static/code_examples/langchain/dedoc_loader.py
@@ -76,6 +76,7 @@ def __init__(
                     result for parsing PDF and images
                 need_binarization: clean pages background (binarize) for PDF without a
                     textual layer and images
+                need_gost_frame_analysis: detect and ignore GOST frame of the document
                 need_pdf_table_analysis: parse tables for PDF without a textual layer
                     and images
                 delimiter: column separator for CSV, TSV files
@@ -374,6 +375,7 @@ def __init__(
                     result for parsing PDF and images
                 need_binarization: clean pages background (binarize) for PDF without a
                     textual layer and images
+                need_gost_frame_analysis: detect and ignore GOST frame
                 need_pdf_table_analysis: parse tables for PDF without a textual layer
                     and images
                 delimiter: column separator for CSV, TSV files

diff --git a/docs/source/_static/code_examples/langchain/pdf.py b/docs/source/_static/code_examples/langchain/pdf.py
@@ -28,6 +28,7 @@ class DedocPDFLoader(DedocBaseLoader):
         need_header_footer_analysis: remove headers and footers from the output result
         need_binarization: clean pages background (binarize) for PDF without a textual
             layer
+        need_gost_frame_analysis: detect and ignore GOST frame
         need_pdf_table_analysis: parse tables for PDF without a textual layer
 
     Examples

diff --git a/docs/source/dedoc_api_usage/api.rst b/docs/source/dedoc_api_usage/api.rst
@@ -224,6 +224,11 @@ Api parameters description
         * **true** -- if any text is detected in a PDF file, Dedoc assumes that textual layer is detected and it is correct. Much faster but less accurate.
         * **false** -- use the textual layer classifier to detect textual layer and prove its correctness.
 
+    * - need_gost_frame_analysis
+      - true, false
+      - false
+      - This option is used to enable GOST (Russian government standard) frame recognition for PDF documents or images.
+        The GOST frame recognizer is used to recognize and ignore the GOST frame on images and PDF documents without a correct textual layer.
 
     * - language
       - rus, eng, rus+eng, fra, spa

diff --git a/docs/source/parameters/pdf_handling.rst b/docs/source/parameters/pdf_handling.rst
@@ -151,6 +151,17 @@ PDF and images handling
         If the document has a textual layer, it is recommended to use :class:`dedoc.readers.PdfTabbyReader`,
         in this case tables will be parsed much easier and faster.
 
+    * - need_gost_frame_analysis
+      - True, False
+      - False
+      - * :meth:`dedoc.DedocManager.parse`
+        * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfBaseReader.read`
+        * :meth:`dedoc.readers.ReaderComposition.read`
+      - This option is used to enable GOST (Russian government standard) frame recognition for PDF documents or images.
+        The GOST frame recognizer is used in :meth:`dedoc.readers.PdfBaseReader.read`. Its main function is to recognize and
+        ignore the GOST frame on the document. It allows :class:`dedoc.readers.PdfImageReader` to properly process the content
+        of a document containing a GOST frame.
+
     * - orient_analysis_cells
       - True, False
       - False

diff --git a/requirements.txt b/requirements.txt
@@ -1,7 +1,7 @@
 beautifulsoup4>=4.10.0,<=4.12.2
 charset-normalizer>=2.0.12,<=3.2.0
 Cython>=0.29.28,<=3.0.2
-dedoc-utils==0.3.7
+dedoc-utils==0.3.8
 fastapi>=0.77.0,<1.0
 huggingface-hub>=0.14.1,<1.0
 imutils==0.5.4

diff --git a/tests/api_tests/test_api_module_table_recognizer.py b/tests/api_tests/test_api_module_table_recognizer.py
@@ -213,3 +213,13 @@ def test_detect_small_table(self) -> None:
         result = self._send_request(file_name, data={"language": "rus"})
         tables = result["content"]["tables"]
         self.assertEqual(2, len(tables))
+
+    def test_multipage_gost_table(self) -> None:
+        """
+        Check the last row of a multipage table in a document parsed with GOST frame analysis enabled.
+        """
+        file_name = "gost_multipage_table.pdf"
+        # don't pass pdf_with_text_layer to check condition in PDFBaseReader
+        result = self._send_request(file_name, data={"need_gost_frame_analysis": "True"})
+        cells = result["content"]["tables"][0]["cells"]
+        self.assertTrue(len(cells) > 35)
+        # the last row of the multipage table must belong to the first and only table
+        expected_texts = ["KR13", "R13.1", "Испытание по проверке", "3.6", "7.4.9"]
+        for column, expected_text in enumerate(expected_texts):
+            self.assertTrue(expected_text in cells[-1][column]["lines"][0]["text"])
diff --git a/tests/data/tables/gost_frame_1.jpg b/tests/data/tables/gost_frame_1.jpg
diff --git a/tests/data/tables/gost_frame_2.png b/tests/data/tables/gost_frame_2.png
diff --git a/tests/data/tables/gost_frame_3.jpg b/tests/data/tables/gost_frame_3.jpg
diff --git a/tests/data/tables/gost_multipage_table.pdf b/tests/data/tables/gost_multipage_table.pdf
diff --git a/tests/data/tables/not_gost_frame.jpg b/tests/data/tables/not_gost_frame.jpg
-Original file line number
+Diff line change
@@ Expand Up / @@ -183,6 +183,9 @@ <h4>PDF handling</h4> @@
                     <p>
                         <label><input name="need_binarization" type="checkbox" value="true"> need_binarization</label>
                     </p>
+                    <p>
+                        <label><input name="need_gost_frame_analysis" type="checkbox" value="true"> need_gost_frame_analysis</label>
+                    </p>
                 </details>
             </div>
@@ Expand Down @@