TLDR-531 returned and updated commented code

ispras · Nov 30, 2023 · acd1c83 · acd1c83
1 parent 41d7eb7
commit acd1c83
Show file tree

Hide file tree

Showing 3 changed files with 55 additions and 2 deletions.
diff --git a/dedoc/readers/pdf_reader/data_classes/tables/table_tree.py b/dedoc/readers/pdf_reader/data_classes/tables/table_tree.py
@@ -17,12 +17,14 @@
 
 
 class TableTree(object):
-    min_h_cell = 8
-    min_w_cell = 20
     """
     Table whose cells are stored as sorted children of a tree.
     The table has a tree structure and was obtained with the help of contour analysis.
     """
+    min_h_cell = 8
+    min_w_cell = 20
+    minimal_cell_cnt_line = 5
+    minimal_cell_avg_length_line = 10
 
     def __init__(self, *, config: dict) -> None:
         self.left = None

diff --git a/...e_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py b/...e_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py
@@ -67,6 +67,50 @@ def extract_onepage_tables_from_image(self,
 
         return tables
 
+
+    """ TODO fix in the future (REMOVE)
+    def __detect_diff_orient(self, cell_text: str) -> bool:
+        # 1 - split the text into lines and keep only those containing at least one character
+        parts = cell_text.split("\n")
+        parts = [p for p in parts if len(p) > 0]
+
+        # 2 - compute the average length of the cell's lines
+        len_parts = [len(p) for p in parts]
+        avg_len_part = np.average(len_parts)
+
+        # Heuristic: consider the cell rotated if it has a large number of lines and the lines are short
+        if len(parts) > TableTree.minimal_cell_cnt_line \
+                and avg_len_part < TableTree.minimal_cell_avg_length_line:
+            return True
+        return False
+
+    def __correct_orient_cell(self, cell: Cell, language: str, rotated_angle: int) -> [Cell, np.ndarray]:
+        img_cell = self.image[cell.y_top_left: cell.y_bottom_right, cell.x_top_left: cell.x_bottom_right]
+        rotated_image_cell = rotate_image(img_cell, -rotated_angle)
+
+        output_dict = get_text_with_bbox_from_cells(img_cell, language=language)
+        line_boxes = [
+            TextWithBBox(text=line.text, page_num=page_num, bbox=line.bbox, line_num=line_num, annotations=line.get_annotations(width, height))
+            for line_num, line in enumerate(output_dict.lines)]
+        # get_cell_text_by_ocr(rotated_image_cell, language=language)
+        cell.set_rotated_angle(rotated_angle=-rotated_angle)
+        return cell, rotated_image_cell
+
+
+    def __analyze_header_cell_with_diff_orient(self, tables: List[ScanTable], language: str,
+                                               rotated_angle: int) -> List[ScanTable]:
+
+        for table in tables:
+            attrs = TableAttributeExtractor.get_header_table(table.matrix_cells)
+            for i, row in enumerate(attrs):
+                for j, attr in enumerate(row):
+                    if self.__detect_diff_orient(attr.text):
+                        rotated_cell, rotated_image = self.__correct_orient_cell(attr, language=language, rotated_angle=rotated_angle)
+                        table.matrix_cells[i][j] = rotated_cell
+
+        return tables
+    """
+
     def __select_attributes_matrix_tables(self, tables: List[ScanTable]) -> List[ScanTable]:
         for matrix in tables:
             matrix = self.attribute_selector.select_attributes(matrix)

diff --git a/tests/unit_tests/test_format_pdf_reader.py b/tests/unit_tests/test_format_pdf_reader.py
@@ -140,3 +140,10 @@ def test_pdf_text_layer(self) -> None:
             annotations = line.annotations
             annotations_set = {(a.name, a.value, a.start, a.end) for a in annotations}
             self.assertEqual(len(annotations_set), len(annotations))
+
+    def test_table_extractor(self) -> None:
+        config = {}  # Has to work without config
+        any_doc_reader = PdfTxtlayerReader(config=config)
+        path = os.path.join(os.path.dirname(__file__), "../data/pdf_with_text_layer/english_doc.pdf")
+        result = any_doc_reader.read(path, document_type=None, parameters={"need_pdf_table_analysis": "True"})
+        self.assertEqual(len(result.tables), 1)