TLDR-619 fixed html table parsing (#413)

* fixed html table parsing
* fixed code according to PR comments
* optimized imports

---------

Co-authored-by: Alexander Golodkov <[email protected]>
ispras · Mar 15, 2024 · b4163ae · b4163ae
1 parent c48b186
commit b4163ae
Show file tree

Hide file tree

Showing 3 changed files with 98 additions and 40 deletions.
diff --git a/dedoc/readers/html_reader/html_reader.py b/dedoc/readers/html_reader/html_reader.py
@@ -1,9 +1,8 @@
 import hashlib
 import string
-from typing import List, Optional, Union
+from typing import List, Optional, Tuple, Union
 
-from bs4 import BeautifulSoup
-from bs4 import Comment, Doctype, Tag
+from bs4 import BeautifulSoup, Comment, Doctype, NavigableString, Tag
 
 from dedoc.data_structures.cell_with_meta import CellWithMeta
 from dedoc.data_structures.hierarchy_level import HierarchyLevel
@@ -48,44 +47,46 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
             soup = BeautifulSoup(f.read(), "html.parser")
 
         handle_invisible_table = str(parameters.get("handle_invisible_table", "false")).lower() == "true"
-        path_hash = calculate_file_hash(path=file_path)
-        lines = self.__read_blocks(soup, path_hash=path_hash, handle_invisible_table=handle_invisible_table)
+        filepath_hash = calculate_file_hash(path=file_path)
+        lines = self.__read_blocks(soup, filepath_hash=filepath_hash, handle_invisible_table=handle_invisible_table)
         tables = [
-            self._read_table(table, path_hash) for table in soup.find_all("table") if self._visible_table(table, handle_invisible_table=handle_invisible_table)
+            self._read_table(table, filepath_hash) for table in soup.find_all("table") if self._visible_table(table,
+                                                                                                              handle_invisible_table=handle_invisible_table)
         ]
         document = UnstructuredDocument(tables=tables, lines=lines, attachments=[])
         document_postprocess = self.postprocessor.postprocess(document)
         return document_postprocess
 
-    def __handle_block(self, tag: Union[Tag], uid: str, handle_invisible_table: bool, table: Optional[bool] = False) -> List[LineWithMeta]:
+    def __handle_block(self, tag: Union[Tag], filepath_hash: str, handle_invisible_table: bool, table: Optional[bool] = False,
+                       uid: Optional[str] = "") -> List[LineWithMeta]:
         tag_uid = hashlib.md5((uid + str(tag.name)).encode()).hexdigest()
         assert isinstance(tag, (Tag, str))
         if not self.__is_content_tag(tag, handle_invisible_table=handle_invisible_table):
             block_lines = []
         elif tag.name == "table" and not self._visible_table(tag, handle_invisible_table=handle_invisible_table):
             # if table is invisible and we don't parse invisible tables (handle_invisible_table == False)
             # then we parse table as raw text
-            block_lines = self.__handle_invisible_table(block=tag, path_hash=uid)
+            block_lines = self.__handle_invisible_table(block=tag, filepath_hash=filepath_hash, uid=tag_uid)
         elif isinstance(tag, str):
-            block_lines = self._handle_text_line(block=tag, path_hash=uid)
+            block_lines = self._handle_text_line(block=tag, filepath_hash=filepath_hash, uid=tag_uid)
         elif tag.name not in HtmlTags.available_tags:
             self.logger.debug(f"skip tag {tag.name.encode()}")
             block_lines = []
         elif tag.name in HtmlTags.special_symbol_tags:
             tag_value = HtmlTags.special_symbol_tags[tag.name]
-            block_lines = self._handle_text_line(block=tag_value, path_hash=uid, ignore_space=False)
+            block_lines = self._handle_text_line(block=tag_value, filepath_hash=filepath_hash, uid=tag_uid, ignore_space=False)
         elif tag.name in HtmlTags.block_tags:
-            block_lines = self.__read_blocks(block=tag, path_hash=uid)
+            block_lines = self.__read_blocks(block=tag, filepath_hash=filepath_hash, uid=tag_uid)
         elif tag.name in HtmlTags.list_tags:
-            block_lines = self.__read_list(lst=tag, uid=tag_uid, path_hash=uid, handle_invisible_table=handle_invisible_table)
+            block_lines = self.__read_list(lst=tag, uid=tag_uid, filepath_hash=filepath_hash, handle_invisible_table=handle_invisible_table)
         else:
-            block_lines = self.__handle_single_tag(tag, uid, table)
+            block_lines = self.__handle_single_tag(tag=tag, filepath_hash=filepath_hash, uid=tag_uid, table=table)
         for line in block_lines:
             if not getattr(line.metadata, "html_tag", None):
                 line.metadata.extend_other_fields({"html_tag": tag.name})
         return block_lines
 
-    def __handle_single_tag(self, tag: Tag, uid: str, table: Optional[bool] = False) -> List[LineWithMeta]:
+    def __handle_single_tag(self, tag: Tag, filepath_hash: str, uid: str, table: Optional[bool] = False) -> List[LineWithMeta]:
         text = self.__get_text(tag, table)
 
         if not text or text.isspace():
@@ -95,38 +96,40 @@ def __handle_single_tag(self, tag: Tag, uid: str, table: Optional[bool] = False)
         header_level = int(tag.name[1:]) if tag.name in HtmlTags.header_tags else 0
         line_type = HierarchyLevel.unknown if header_level == 0 else HierarchyLevel.header
         tag_uid = hashlib.md5((uid + text).encode()).hexdigest()
-        line = self.__make_line(line=text, line_type=line_type, header_level=header_level, uid=tag_uid, path_hash=uid, annotations=annotations)
+        line = self.__make_line(line=text, line_type=line_type, header_level=header_level, uid=tag_uid, filepath_hash=filepath_hash, annotations=annotations)
         line.metadata.extend_other_fields({"html_tag": tag.name})
         return [line]
 
-    def __read_blocks(self, block: Tag, path_hash: str = "", handle_invisible_table: bool = False, table: Optional[bool] = False) -> List[LineWithMeta]:
-        uid = hashlib.md5((path_hash + str(block.name)).encode()).hexdigest()
+    def __read_blocks(self, block: Tag, filepath_hash: str = "", handle_invisible_table: bool = False, table: Optional[bool] = False,
+                      uid: Optional[str] = "") -> List[LineWithMeta]:
+        tag_uid = hashlib.md5((filepath_hash + uid + str(block.name)).encode()).hexdigest()
         if not self.__is_content_tag(block, handle_invisible_table=handle_invisible_table):
             return []
 
         lines = []
 
         for tag in block:
             assert isinstance(tag, (Tag, str))
-            block_lines = self.__handle_block(tag=tag, uid=uid, handle_invisible_table=handle_invisible_table, table=table)
+            block_lines = self.__handle_block(tag=tag, filepath_hash=filepath_hash, handle_invisible_table=handle_invisible_table, table=table, uid=tag_uid)
             lines.extend(block_lines)
         return lines
 
-    def _handle_text_line(self, block: str, path_hash: str, ignore_space: bool = True) -> List[LineWithMeta]:
+    def _handle_text_line(self, block: str, filepath_hash: str, uid: str, ignore_space: bool = True) -> List[LineWithMeta]:
         if not block.strip() and ignore_space:
             return []
-        uid = hashlib.md5(block.encode()).hexdigest()
-        line = self.__make_line(block, HierarchyLevel.unknown, 0, uid=uid, path_hash=path_hash)
+        tag_uid = hashlib.md5((uid + block).encode()).hexdigest()
+        line = self.__make_line(block, HierarchyLevel.unknown, 0, uid=tag_uid, filepath_hash=filepath_hash)
         return [line]
 
-    def __make_line(self, line: str, line_type: str, header_level: int = 0, uid: str = None, path_hash: str = None, annotations: List = None) -> LineWithMeta:
+    def __make_line(self, line: str, line_type: str, header_level: int = 0, uid: str = None, filepath_hash: str = None,
+                    annotations: List = None) -> LineWithMeta:
         if annotations is None:
             annotations = []
 
         level = None if header_level == 0 else HierarchyLevel(1, header_level, False, line_type=line_type)
         metadata = LineMetadata(page_id=0, line_id=None, tag_hierarchy_level=level)  # TODO line_id
 
-        uid = f"{path_hash}_{uid}"
+        uid = f"{filepath_hash}_{uid}"
         return LineWithMeta(line=line, metadata=metadata, annotations=annotations, uid=uid)
 
     def __get_li_header(self, list_type: str, index: int) -> LineWithMeta:
@@ -149,7 +152,8 @@ def __get_li_header(self, list_type: str, index: int) -> LineWithMeta:
         header_line = LineWithMeta(line=header, metadata=metadata)
         return header_line
 
-    def __read_list(self, lst: Tag, uid: str, path_hash: str, handle_invisible_table: bool) -> List[LineWithMeta]:
+    def __read_list(self, lst: Tag, uid: str, filepath_hash: str, handle_invisible_table: bool) -> List[LineWithMeta]:
+        tag_uid = hashlib.md5((uid + str(lst.name)).encode()).hexdigest()
         lines = []
         list_type = lst.get("type", "1" if lst.name in HtmlTags.ordered_list else "")
         item_index = 0
@@ -159,16 +163,18 @@ def __read_list(self, lst: Tag, uid: str, path_hash: str, handle_invisible_table
                 item_lines = self.__handle_list_item(item=item,
                                                      item_index=item_index,
                                                      list_type=list_type,
-                                                     path_hash=path_hash,
+                                                     filepath_hash=filepath_hash,
+                                                     uid=tag_uid,
                                                      handle_invisible_table=handle_invisible_table)
                 item_index += 1
                 lines.extend(item_lines)
         return lines
 
-    def __handle_list_item(self, item: Tag, item_index: int, list_type: str, path_hash: str, handle_invisible_table: bool) -> List[LineWithMeta]:
+    def __handle_list_item(self, item: Tag, item_index: int, list_type: str, filepath_hash: str, uid: str, handle_invisible_table: bool) -> List[LineWithMeta]:
+        tag_uid = hashlib.md5((uid + str(item.name)).encode()).hexdigest()
         lines = []
         header_line = self.__get_li_header(list_type=list_type, index=item_index)
-        block_lines = self.__handle_block(item, uid=path_hash, handle_invisible_table=handle_invisible_table)
+        block_lines = self.__handle_block(item, filepath_hash=filepath_hash, uid=tag_uid, handle_invisible_table=handle_invisible_table)
         hl_depth = header_line.metadata.tag_hierarchy_level.level_1
         for line in block_lines:
             if line.metadata.tag_hierarchy_level.is_unknown():
@@ -202,29 +208,69 @@ def __is_content_tag(self, tag: Tag, handle_invisible_table: bool = False) -> bo
             return True
         return not isinstance(tag, Doctype) and not isinstance(tag, Comment)
 
-    def __handle_invisible_table(self, block: Tag, path_hash: str) -> List[LineWithMeta]:
-        uid = hashlib.md5(block.name.encode()).hexdigest()
+    def __handle_invisible_table(self, block: Tag, filepath_hash: str, uid: str) -> List[LineWithMeta]:
         result = []
-        rows = self._read_table(block, path_hash).cells
+        rows = self._read_table(block, filepath_hash).cells
         for row in rows:
             text = "\t".join([cell.get_text() for cell in row])
             if text.strip() != "":
                 tag_uid = hashlib.md5((uid + text).encode()).hexdigest()
-                line = self.__make_line(line=text, line_type=HierarchyLevel.unknown, uid=tag_uid, path_hash=path_hash)
+                line = self.__make_line(line=text, line_type=HierarchyLevel.unknown, uid=tag_uid, filepath_hash=filepath_hash)
                 result.append(line)
         return result
 
-    def _read_table(self, table: Tag, path_hash: str) -> Table:
+    def __clone_cell(self, el: Tuple[Tag, NavigableString]) -> Tuple[Tag, NavigableString]:
+        if isinstance(el, NavigableString):
+            return type(el)(el)
+
+        copy = Tag(None, el.builder, el.name, el.namespace, el.nsprefix)
+        if el.name in HtmlTags.table_cells:
+            el_attrs = el.attrs
+            copy.hidden = True
+            copy.attrs = dict(el_attrs)
+            copy.attrs["colspan"] = 1
+            copy.attrs["rowspan"] = 1
+        for child in el.contents:
+            copy.append(self.__clone_cell(child))
+        return copy
+
+    def __split_table_cells(self, table: Tag, table_list: List[List[Tag]]) -> None:
+        for row_index, row in enumerate(table.find_all(HtmlTags.table_rows)):
+            for cell_index, cell in enumerate(row.find_all(HtmlTags.table_cells)):
+                cell_rowspan = int(cell.attrs.get("rowspan", 1))
+                cell_colspan = int(cell.attrs.get("colspan", 1))
+                if cell_rowspan > 1 or cell_colspan > 1:
+                    cell_copy = self.__clone_cell(cell)
+                    table_list[row_index][cell_index + 1:cell_index + 1] = [cell_copy] * (cell_colspan - 1)
+                    for index in range(row_index + 1, row_index + cell_rowspan):
+                        table_list[index][cell_index:cell_index] = [cell_copy] * cell_colspan
+
+    def __fix_table(self, table: Tag) -> List[List[Tag]]:
+        table_list = []
+
+        # create table list
+        for row in table.find_all(HtmlTags.table_rows):
+            row_line = []
+            for cell in row.find_all(HtmlTags.table_cells):
+                row_line.append(cell)
+            table_list.append(row_line)
+
+        self.__split_table_cells(table, table_list)
+
+        return table_list
+
+    def _read_table(self, table: Tag, filepath_hash: str) -> Table:
         cells_with_meta = []
+        fixed_table = self.__fix_table(table)
 
-        for row in table.find_all(HtmlTags.table_rows):
+        for row in fixed_table:
             row_lines = []
-            for cell in row.find_all(HtmlTags.table_cells):
+            for cell in row:
                 cell_with_meta = CellWithMeta(
-                    lines=self.__read_blocks(block=cell, path_hash=path_hash, handle_invisible_table=False, table=True),  # read each cell as block with styles
-                    colspan=cell.colspan if cell.colspan else 1,
-                    rowspan=cell.rowspan if cell.rowspan else 1,
-                    invisible=cell.invisible if cell.invisible else True
+                    lines=self.__read_blocks(block=cell, filepath_hash=filepath_hash, handle_invisible_table=False, table=True),  # read each cell as a block
+                    colspan=int(cell.attrs.get("colspan", 1)),
+                    rowspan=int(cell.attrs.get("rowspan", 1)),
+                    invisible=cell.hidden if cell.hidden else False
                 )
                 row_lines.append(cell_with_meta)
             cells_with_meta.append(row_lines)

diff --git a/tests/api_tests/test_api_format_html.py b/tests/api_tests/test_api_format_html.py
@@ -155,6 +155,15 @@ def test_html_table_with_styles(self) -> None:
         self.assertIn({"start": 0, "end": 10, "name": "italic", "value": "True"}, table["cells"][1][1]["lines"][0]["annotations"])
         self.assertIn({"start": 0, "end": 10, "name": "linked_text", "value": "some_text"}, table["cells"][2][0]["lines"][0]["annotations"])
         self.assertIn({"start": 0, "end": 16, "name": "strike", "value": "True"}, table["cells"][2][1]["lines"][0]["annotations"])
+        self.assertEqual(table["cells"][3][0]["rowspan"], 2)
+        self.assertEqual(table["cells"][3][0]["colspan"], 2)
+        self.assertEqual(table["cells"][3][0]["invisible"], False)
+        self.assertEqual(table["cells"][3][1]["rowspan"], 1)
+        self.assertEqual(table["cells"][3][1]["colspan"], 1)
+        self.assertEqual(table["cells"][3][1]["invisible"], True)
+        self.assertEqual(table["cells"][4][0]["rowspan"], 1)
+        self.assertEqual(table["cells"][4][0]["colspan"], 1)
+        self.assertEqual(table["cells"][4][0]["invisible"], True)
 
     def test_html_font_style_attribute(self) -> None:
         file_name = "210.html"

diff --git a/tests/data/htmls/table_with_styles.html b/tests/data/htmls/table_with_styles.html
@@ -10,9 +10,12 @@
    <tr>
     <th>Первый столбец</th>
     <th>Второй столбец</th>
+    <th>Третий столбец</th>
    </tr>
-   <tr><td><p><strong>Что-то</strong></p></td><td><div><i>Что-то ещё</i></div></td></tr>
-    <tr><td><a href="some_text">Ещё что-то</a></td><td><del>Последняя ячейка</del></td></tr>
+   <tr><td><p><strong>Что-то</strong></p></td><td><div><i>Что-то ещё</i></div></td><td>Просто текст</td></tr>
+    <tr><td><a href="some_text">Ещё что-то</a></td><td><del>Последняя ячейка</del></td><td>Просто текст</td></tr>
+    <tr><td colspan="2" rowspan="2">Текст</td><td>Просто текст</td></tr>
+    <tr><td>Просто текст</td></tr>
   </table>
 </body>
 </html>