TLDR-660 fixes in article type (#428)
* TLDR-660 some fixes for article type

* TLDR-660 fix tables, acknowledgement and annex. Fix documentation

* Fix tests

* Review fixes
NastyBoget authored Apr 27, 2024
1 parent 07b4afc commit dbf8629
Showing 3 changed files with 108 additions and 46 deletions.
115 changes: 83 additions & 32 deletions dedoc/readers/article_reader/article_reader.py
@@ -11,6 +11,7 @@
from dedoc.data_structures.unstructured_document import UnstructuredDocument
from dedoc.extensions import recognized_mimes
from dedoc.readers.base_reader import BaseReader
+from dedoc.structure_extractors.feature_extractors.list_features.list_utils import get_dotted_item_depth
from dedoc.utils.parameter_utils import get_param_document_type
from dedoc.utils.utils import get_mime_extension

@@ -33,7 +34,8 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
using beautifulsoup library.
As a result, the method fills the class :class:`~dedoc.data_structures.UnstructuredDocument`.
Article reader adds additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`.
-The method extracts information about ``authors``, ``bibliography items``, ``sections``, and ``tables``.
+The method extracts information about ``authors``, ``keywords``, ``bibliography items``, ``sections``, and ``tables``.
+In table cells, the ``colspan`` attribute can be filled according to GROBID's ``cols`` attribute.
You can find more information about the extracted information from GROBID system on the page :ref:`article_structure`.
Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
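
For orientation, here is a minimal usage sketch of the reader described by this docstring. It is a sketch only: it assumes a GROBID service is running and reachable under dedoc's default configuration, and the file name is hypothetical:

    from dedoc.readers.article_reader.article_reader import ArticleReader

    reader = ArticleReader()
    # read() sends the PDF to GROBID and converts the returned TEI XML
    document = reader.read(file_path="paper.pdf", parameters={"document_type": "article"})
    print(len(document.lines), len(document.tables))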
@@ -51,12 +53,13 @@
self.logger.warning(warning)
return UnstructuredDocument(tables=[], lines=[], attachments=[], warnings=[warning])

-soup = BeautifulSoup(response.text, features="lxml")
+soup = BeautifulSoup(response.text, features="xml")
lines = self.__parse_title(soup)

-if soup.biblstruct is not None:
-authors = soup.biblstruct.find_all("author")
+if soup.biblStruct is not None:
+authors = soup.biblStruct.find_all("author")
lines += [line for author in authors for line in self.__parse_author(author)]
+lines += self.__parse_keywords(soup.keywords)

bib_lines, bib2uid = self.__parse_bibliography(soup)
tables, table2uid = self.__parse_tables(soup)
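
The parser switch above (features="lxml" to features="xml") is what makes the camelCase TEI tags such as biblStruct, persName and orgName findable: BeautifulSoup's HTML parsers lowercase all tag names, while the XML parser preserves case. A minimal check, assuming beautifulsoup4 and lxml are installed:

    from bs4 import BeautifulSoup

    tei = "<TEI><biblStruct><persName>A</persName></biblStruct></TEI>"
    print(BeautifulSoup(tei, features="lxml").find("biblStruct"))  # None: the HTML parser lowercased the tag
    print(BeautifulSoup(tei, features="xml").find("biblStruct"))   # tag is found: case is preserved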
@@ -135,11 +138,13 @@ def __create_line(self, text: str, hierarchy_level_id: Optional[int] = None, par
def __parse_affiliation(self, affiliation_tag: Tag) -> List[LineWithMeta]:
lines = [self.__create_line(text=affiliation_tag.get("key"), hierarchy_level_id=2, paragraph_type="author_affiliation")]

-if affiliation_tag.orgname:
-lines.append(self.__create_line(text=self.__tag2text(affiliation_tag.orgname), hierarchy_level_id=3, paragraph_type="org_name"))
+if affiliation_tag.orgName:
+lines.append(self.__create_line(text=self.__tag2text(affiliation_tag.orgName), hierarchy_level_id=3, paragraph_type="org_name"))

if affiliation_tag.address:
-lines.append(self.__create_line(text=affiliation_tag.address.text, hierarchy_level_id=3, paragraph_type="address"))
+lines.append(self.__create_line(text=self.__remove_newlines(affiliation_tag.address).get_text(separator=", "),
+hierarchy_level_id=3,
+paragraph_type="address"))

return lines
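
The new address branch joins the address components into one comma-separated line instead of dumping raw tag text. A small illustration with a hypothetical TEI address fragment (beautifulsoup4 with lxml assumed):

    from bs4 import BeautifulSoup

    xml = ("<address><addrLine>Place du Levant 3</addrLine>"
           "<settlement>Louvain-la-Neuve</settlement><country>Belgium</country></address>")
    address = BeautifulSoup(xml, features="xml").address
    print(address.get_text(separator=", "))  # Place du Levant 3, Louvain-la-Neuve, Belgium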

Expand Down Expand Up @@ -169,11 +174,11 @@ def __parse_author(self, author_tag: Tag) -> List[LineWithMeta]:
"""
lines = [self.__create_line(text="", hierarchy_level_id=1, paragraph_type="author")]

-first_name = self.__get_tag_by_hierarchy_path(author_tag, ["persname", "forename"])
+first_name = self.__get_tag_by_hierarchy_path(author_tag, ["persName", "forename"])
if first_name:
lines.append(self.__create_line(text=first_name, hierarchy_level_id=2, paragraph_type="author_first_name"))

-surname = self.__get_tag_by_hierarchy_path(author_tag, ["persname", "surname"])
+surname = self.__get_tag_by_hierarchy_path(author_tag, ["persName", "surname"])
if surname:
lines.append(self.__create_line(text=surname, hierarchy_level_id=2, paragraph_type="author_surname"))

@@ -187,6 +192,21 @@

return lines

+def __parse_keywords(self, keywords_tag: Tag) -> List[LineWithMeta]:
+"""
+<keywords>
+<term>Multi-Object Tracking</term>
+<term>Data Association</term>
+<term>Survey</term>
+</keywords>
+"""
+if keywords_tag is None:
+return []
+
+lines = [self.__create_line(text="", hierarchy_level_id=1, paragraph_type="keywords")]
+lines += [self.__create_line(text=item.text, hierarchy_level_id=2, paragraph_type="keyword") for item in keywords_tag.find_all("term")]
+return lines

def __create_line_with_refs(self, content: List[Tuple[str, Tag]], bib2uid: dict, table2uid: dict) -> LineWithMeta:
text = ""
start = 0
@@ -219,20 +239,31 @@ def __parse_text(self, soup: Tag, bib2uid: dict, table2uid: dict) -> List[LineWithMeta]:
lines.append(self.__create_line(text="Abstract", hierarchy_level_id=1, paragraph_type="abstract"))
lines.append(self.__create_line(text=self.__tag2text(abstract)))

-for text in soup.find_all("text"):
-for part in text.find_all("div"):
-# TODO: Beautifulsoup doesn't read <head> tags from input XML file. WTF!
-# As a result we lose section number in text (see example above)
-# Need to fix this in the future.
-number = part.head.get("n") + " " if part.head else ""
-line_text = str(part.contents[0]) if len(part.contents) > 0 else None
-if line_text is not None and len(line_text) > 0:
-lines.append(self.__create_line(text=number + line_text, hierarchy_level_id=1, paragraph_type="section"))
-for subpart in part.find_all("p"):
-if subpart.string is not None:
-lines.append(self.__create_line_with_refs(subpart.string, bib2uid, table2uid))
-elif subpart.contents and len(subpart.contents) > 0:
-lines.append(self.__create_line_with_refs(subpart.contents, bib2uid, table2uid))
+for part in soup.body.find_all("div"):
+lines.extend(self.__parse_section(part, bib2uid, table2uid))
+
+for other_text_type in ("acknowledgement", "annex"):
+for text_tag in soup.find_all("div", attrs={"type": other_text_type}):
+for part in text_tag.find_all("div"):
+lines.extend(self.__parse_section(part, bib2uid, table2uid))

return lines

+def __parse_section(self, section_tag: Tag, bib2uid: dict, table2uid: dict) -> List[LineWithMeta]:
+lines = []
+number = section_tag.head.get("n") if section_tag.head else ""
+number = number + " " if number else ""
+section_depth = get_dotted_item_depth(number)
+section_depth = section_depth if section_depth > 0 else 1
+
+line_text = section_tag.head.string if section_tag.head else None
+if line_text is not None and len(line_text) > 0:
+lines.append(self.__create_line(text=number + line_text, hierarchy_level_id=section_depth, paragraph_type="section"))
+for subpart in section_tag.find_all("p"):
+if subpart.string is not None:
+lines.append(self.__create_line_with_refs(subpart.string + "\n", bib2uid, table2uid))
+elif subpart.contents and len(subpart.contents) > 0:
+lines.append(self.__create_line_with_refs(subpart.contents, bib2uid, table2uid))
+
+return lines
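
The hierarchy level of a section line is now derived from its dotted number via get_dotted_item_depth, so "2.1 Datasets" nests under "2 Related work". A hypothetical stand-in that only mirrors the contract this code relies on (the real helper is imported from dedoc.structure_extractors.feature_extractors.list_features.list_utils):

    import re

    def dotted_item_depth(text: str) -> int:
        # hypothetical mirror of get_dotted_item_depth: depth of a dotted number, -1 if absent
        match = re.match(r"^\s*(\d+(?:\.\d+)*)\.?\s", text)
        return len(match.group(1).split(".")) if match else -1

    print(dotted_item_depth("1 Introduction "))  # 1  -> top-level section
    print(dotted_item_depth("2.1 Datasets "))    # 2  -> nested one level deeper
    print(dotted_item_depth("Abstract"))         # -1 -> __parse_section falls back to depth 1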

@@ -265,12 +296,26 @@ def __parse_tables(self, soup: Tag) -> Tuple[List[Table], dict]:

tag_tables = soup.find_all("figure", {"type": "table"})
for table in tag_tables:
-row_cells = []
+table_cells = []
head = table.contents[0] if len(table.contents) > 0 and isinstance(table.contents[0], str) else self.__tag2text(table.head)
-title = head + self.__tag2text(table.figdesc)
+title = head + self.__tag2text(table.figDesc)
for row in table.table.find_all("row"):
-row_cells.append([CellWithMeta(lines=[self.__create_line(self.__tag2text(cell))]) for cell in row.find_all("cell")])
-tables.append(Table(cells=row_cells, metadata=TableMetadata(page_id=0, title=title)))
+row_cells = []
+for cell in row.find_all("cell"):
+cell_text = self.__create_line(self.__tag2text(cell))
+colspan = int(cell.get("cols", 1))
+row_cells.append(CellWithMeta(lines=[cell_text], colspan=colspan))
+
+if colspan > 1:
+row_cells.extend([CellWithMeta(lines=[cell_text], invisible=True) for _ in range(colspan - 1)])
+
+table_cells.append(row_cells)
+
+# ignore empty tables
+if len(table_cells) == 0:
+continue
+
+tables.append(Table(cells=table_cells, metadata=TableMetadata(page_id=0, title=title)))
table2uid["#" + table.get("xml:id")] = tables[-1].metadata.uid

return tables, table2uid
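
The colspan handling above keeps every parsed row rectangular: a GROBID <cell cols="2"> becomes one visible cell with colspan=2 followed by invisible placeholder cells. A self-contained sketch of that expansion, with plain dicts standing in for dedoc's CellWithMeta:

    from bs4 import BeautifulSoup

    row_xml = '<row><cell cols="2">Method</cell><cell>Score</cell></row>'
    row = BeautifulSoup(row_xml, features="xml").row

    cells = []
    for cell in row.find_all("cell"):
        colspan = int(cell.get("cols", 1))
        cells.append({"text": cell.get_text(), "colspan": colspan, "invisible": False})
        # pad the row so each spanned column is still represented by a cell
        cells.extend({"text": cell.get_text(), "colspan": 1, "invisible": True} for _ in range(colspan - 1))

    print([(c["text"], c["invisible"]) for c in cells])  # [('Method', False), ('Method', True), ('Score', False)]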
@@ -310,12 +355,12 @@ def __parse_bibliography(self, soup: Tag) -> Tuple[List[LineWithMeta], dict]:
# according GROBID description
level_2_paragraph_type = {"a": "title", "j": "title_journal", "s": "title_series", "m": "title_conference_proceedings"}

-bibliography = soup.find("listbibl", recursive=True)
+bibliography = soup.find("listBibl", recursive=True)
lines.append(self.__create_line(text="bibliography", hierarchy_level_id=1, paragraph_type="bibliography"))
if not bibliography:
return lines, cites

-bib_items = bibliography.find_all("biblstruct")
+bib_items = bibliography.find_all("biblStruct")
if not bib_items:
return lines, cites
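
For context on how the cites mapping is used later: GROBID links in-text citations to bibliography entries through xml:id, so a <ref type="bibr" target="#b57"> in the body points at <biblStruct xml:id="b57"> in <listBibl>. A minimal sketch of that lookup over a hypothetical TEI fragment (the uid values are placeholders):

    from bs4 import BeautifulSoup

    tei = ('<TEI><text><body><div><p>See <ref type="bibr" target="#b57">[58]</ref>.</p></div></body>'
           '<back><listBibl><biblStruct xml:id="b57"/></listBibl></back></text></TEI>')
    soup = BeautifulSoup(tei, features="xml")

    # map "#<xml:id>" of every bibliography entry to a uid, as __parse_bibliography does
    bib2uid = {"#" + item.get("xml:id"): "uid-" + item.get("xml:id") for item in soup.find_all("biblStruct")}
    ref = soup.find("ref", attrs={"type": "bibr"})
    print(ref.text, "->", bib2uid.get(ref.get("target")))  # [58] -> uid-b57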

@@ -331,19 +376,19 @@
lines.append(self.__create_line(text=self.__tag2text(title), hierarchy_level_id=3, paragraph_type=paragraph_type))

lines += [ # parse bib authors
-self.__create_line(text=author.get_text(), hierarchy_level_id=3, paragraph_type="author")
+self.__create_line(text=self.__remove_newlines(author).get_text(separator=" "), hierarchy_level_id=3, paragraph_type="author")
for author in bib_item.find_all("author", recursive=True) if author
]

lines += [ # parse biblScope <biblScope unit="volume">
self.__create_line(text=self.__tag2text(bibl_scope), hierarchy_level_id=3, paragraph_type="biblScope_volume")
-for bibl_scope in bib_item.find_all("biblscope", {"unit": "volume"}, recursive=True) if bibl_scope
+for bibl_scope in bib_item.find_all("biblScope", {"unit": "volume"}, recursive=True) if bibl_scope
]

try:
lines += [ # parse <biblScope unit="page"> values
self.__create_line(text=f"{bibl_scope.get('from')}-{bibl_scope.get('to')}", hierarchy_level_id=3, paragraph_type="biblScope_page")
-for bibl_scope in bib_item.find_all("biblscope", {"unit": "page"}, recursive=True) if bibl_scope
+for bibl_scope in bib_item.find_all("biblScope", {"unit": "page"}, recursive=True) if bibl_scope
]
finally:
self.logger.warning("Grobid parsing warning: <biblScope unit='page' ... /> was non-standard format")
Expand All @@ -363,3 +408,9 @@ def __parse_bibliography(self, soup: Tag) -> Tuple[List[LineWithMeta], dict]:

def __parse_title(self, soup: Tag) -> List[LineWithMeta]:
return [self.__create_line(text=self.__tag2text(soup.title), hierarchy_level_id=0, paragraph_type="root")]

+def __remove_newlines(self, tag: Tag) -> Tag:
+for item in tag:
+if not isinstance(item, Tag):
+item.extract()
+return tag
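
The new __remove_newlines helper drops bare strings (typically the newlines and indentation between child tags) from a tag's direct children, so a subsequent get_text(separator=...) joins only real element texts. A standalone illustration with a hypothetical author fragment:

    from bs4 import BeautifulSoup
    from bs4.element import Tag

    xml = "<author>\n<persName><forename>Michel</forename><surname>Abdalla</surname></persName>\n</author>"
    author = BeautifulSoup(xml, features="xml").author
    print(repr(author.get_text(separator=" ")))  # '\n Michel Abdalla \n' -- stray newlines leak in

    for item in list(author.children):  # same idea as __remove_newlines, iterating over a copied child list
        if not isinstance(item, Tag):
            item.extract()

    print(repr(author.get_text(separator=" ")))  # 'Michel Abdalla'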
9 changes: 8 additions & 1 deletion docs/source/structure_types/article.rst
@@ -27,6 +27,7 @@ There are the following line types in the article structure type:

* ``root``;
* ``author`` (includes ``author_first_name``, ``author_surname``, ``email``);
+* ``keywords`` (includes ``keyword``);
* ``author_affiliation`` (includes ``org_name``, ``address``);
* ``abstract``;
* ``section``;
@@ -108,9 +109,15 @@ Below is a description of nodes in the output tree:
:language: json
:lines: 125-198

+* **keywords** node (if it exists) is a child node of the node ``root``.
+
+``keywords`` node contains ``keyword`` nodes as children. Each ``keyword`` node contains the text of one keyword.

* **abstract** is the article's abstract section (<abstract> tag in GROBID's output).

-* **section**: nodes of article sections (for example "Introduction", "Conclusion", "V Experiments ..." etc.). This type of node has a subnode ``raw_text``. ``section`` nodes are children of a node ``root``.
+* **section**: nodes of article sections (for example "Introduction", "Conclusion", "V Experiments ..." etc.). This type of node has a subnode ``raw_text``.
+
+``section`` nodes are children of the node ``root`` and may be nested (e.g., section "2.1. Datasets" is nested in the section "2. Related work").

* **bibliography** is the article's bibliography list which contains only ``bibliography_item`` nodes.

30 changes: 17 additions & 13 deletions tests/api_tests/test_api_doctype_article.py
@@ -34,24 +34,28 @@ def test_article(self) -> None:
self.assertEqual("org_name", self._get_by_tree_path(tree, "0.2.2.0")["metadata"]["paragraph_type"])
self.assertEqual("ICTEAM/ELEN/Crypto Group", self._get_by_tree_path(tree, "0.2.2.0")["text"])

+# check section
+self.assertEqual("section", self._get_by_tree_path(tree, "0.4")["metadata"]["paragraph_type"])
+self.assertEqual("1 Introduction", self._get_by_tree_path(tree, "0.4")["text"])

# check bibliography list
self.assertEqual("bibliography", self._get_by_tree_path(tree, "0.20")["metadata"]["paragraph_type"])
self.assertEqual(65, len(self._get_by_tree_path(tree, "0.20")["subparagraphs"]))
self.assertEqual("bibliography", self._get_by_tree_path(tree, "0.12")["metadata"]["paragraph_type"])
self.assertEqual(65, len(self._get_by_tree_path(tree, "0.12")["subparagraphs"]))

# check bib_item 1 recognizing
self.assertEqual("title", self._get_by_tree_path(tree, "0.20.0.0")["metadata"]["paragraph_type"])
self.assertEqual("Leakage-resilient symmetric encryption via re-keying", self._get_by_tree_path(tree, "0.20.0.0")["text"])
self.assertEqual("title_conference_proceedings", self._get_by_tree_path(tree, "0.20.0.1")["metadata"]["paragraph_type"])
self.assertEqual("Bertoni and Coron", self._get_by_tree_path(tree, "0.20.0.1")["text"])
self.assertEqual("author", self._get_by_tree_path(tree, "0.20.0.2")["metadata"]["paragraph_type"]) # author 1
self.assertEqual("\nMichelAbdalla\n", self._get_by_tree_path(tree, "0.20.0.2")["text"])
self.assertEqual("biblScope_volume", self._get_by_tree_path(tree, "0.20.0.5")["metadata"]["paragraph_type"]) # author 1
self.assertEqual("4", self._get_by_tree_path(tree, "0.20.0.5")["text"])
self.assertEqual("biblScope_page", self._get_by_tree_path(tree, "0.20.0.6")["metadata"]["paragraph_type"]) # author 1
self.assertEqual("471-488", self._get_by_tree_path(tree, "0.20.0.6")["text"])
self.assertEqual("title", self._get_by_tree_path(tree, "0.12.0.0")["metadata"]["paragraph_type"])
self.assertEqual("Leakage-resilient symmetric encryption via re-keying", self._get_by_tree_path(tree, "0.12.0.0")["text"])
self.assertEqual("title_conference_proceedings", self._get_by_tree_path(tree, "0.12.0.1")["metadata"]["paragraph_type"])
self.assertEqual("Bertoni and Coron", self._get_by_tree_path(tree, "0.12.0.1")["text"])
self.assertEqual("author", self._get_by_tree_path(tree, "0.12.0.2")["metadata"]["paragraph_type"]) # author 1
self.assertEqual("Michel Abdalla", self._get_by_tree_path(tree, "0.12.0.2")["text"])
self.assertEqual("biblScope_volume", self._get_by_tree_path(tree, "0.12.0.5")["metadata"]["paragraph_type"]) # author 1
self.assertEqual("4", self._get_by_tree_path(tree, "0.12.0.5")["text"])
self.assertEqual("biblScope_page", self._get_by_tree_path(tree, "0.12.0.6")["metadata"]["paragraph_type"]) # author 1
self.assertEqual("471-488", self._get_by_tree_path(tree, "0.12.0.6")["text"])

# check cite on bib_item
-bibliography_item_uuid = self._get_by_tree_path(tree, "0.20.57")["metadata"]["uid"] # checking on [58] references
+bibliography_item_uuid = self._get_by_tree_path(tree, "0.12.57")["metadata"]["uid"] # checking on [58] references
section = self._get_by_tree_path(tree, "0.4.0")
bibliography_refs_in_text = [ann for ann in section["annotations"] if ann["name"] == "reference" and ann["value"] == bibliography_item_uuid]
# We must find two refs [58] in the Introduction section
