diff --git a/dedoc/readers/article_reader/article_reader.py b/dedoc/readers/article_reader/article_reader.py index f2169452..2d4e9532 100644 --- a/dedoc/readers/article_reader/article_reader.py +++ b/dedoc/readers/article_reader/article_reader.py @@ -11,6 +11,7 @@ from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.extensions import recognized_mimes from dedoc.readers.base_reader import BaseReader +from dedoc.structure_extractors.feature_extractors.list_features.list_utils import get_dotted_item_depth from dedoc.utils.parameter_utils import get_param_document_type from dedoc.utils.utils import get_mime_extension @@ -33,7 +34,8 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure using beautifulsoup library. As a result, the method fills the class :class:`~dedoc.data_structures.UnstructuredDocument`. Article reader adds additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`. - The method extracts information about ``authors``, ``bibliography items``, ``sections``, and ``tables``. + The method extracts information about ``authors``, ``keywords``, ``bibliography items``, ``sections``, and ``tables``. + In table cells, ``colspan`` attribute can be filled according to the GROBID's "cols" attribute. You can find more information about the extracted information from GROBID system on the page :ref:`article_structure`. Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. 
@@ -51,12 +53,13 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure self.logger.warning(warning) return UnstructuredDocument(tables=[], lines=[], attachments=[], warnings=[warning]) - soup = BeautifulSoup(response.text, features="lxml") + soup = BeautifulSoup(response.text, features="xml") lines = self.__parse_title(soup) - if soup.biblstruct is not None: - authors = soup.biblstruct.find_all("author") + if soup.biblStruct is not None: + authors = soup.biblStruct.find_all("author") lines += [line for author in authors for line in self.__parse_author(author)] + lines += self.__parse_keywords(soup.keywords) bib_lines, bib2uid = self.__parse_bibliography(soup) tables, table2uid = self.__parse_tables(soup) @@ -135,11 +138,13 @@ def __create_line(self, text: str, hierarchy_level_id: Optional[int] = None, par def __parse_affiliation(self, affiliation_tag: Tag) -> List[LineWithMeta]: lines = [self.__create_line(text=affiliation_tag.get("key"), hierarchy_level_id=2, paragraph_type="author_affiliation")] - if affiliation_tag.orgname: - lines.append(self.__create_line(text=self.__tag2text(affiliation_tag.orgname), hierarchy_level_id=3, paragraph_type="org_name")) + if affiliation_tag.orgName: + lines.append(self.__create_line(text=self.__tag2text(affiliation_tag.orgName), hierarchy_level_id=3, paragraph_type="org_name")) if affiliation_tag.address: - lines.append(self.__create_line(text=affiliation_tag.address.text, hierarchy_level_id=3, paragraph_type="address")) + lines.append(self.__create_line(text=self.__remove_newlines(affiliation_tag.address).get_text(separator=", "), + hierarchy_level_id=3, + paragraph_type="address")) return lines @@ -169,11 +174,11 @@ def __parse_author(self, author_tag: Tag) -> List[LineWithMeta]: """ lines = [self.__create_line(text="", hierarchy_level_id=1, paragraph_type="author")] - first_name = self.__get_tag_by_hierarchy_path(author_tag, ["persname", "forename"]) + first_name = 
self.__get_tag_by_hierarchy_path(author_tag, ["persName", "forename"]) if first_name: lines.append(self.__create_line(text=first_name, hierarchy_level_id=2, paragraph_type="author_first_name")) - surname = self.__get_tag_by_hierarchy_path(author_tag, ["persname", "surname"]) + surname = self.__get_tag_by_hierarchy_path(author_tag, ["persName", "surname"]) if surname: lines.append(self.__create_line(text=surname, hierarchy_level_id=2, paragraph_type="author_surname")) @@ -187,6 +192,21 @@ def __parse_author(self, author_tag: Tag) -> List[LineWithMeta]: return lines + def __parse_keywords(self, keywords_tag: Tag) -> List[LineWithMeta]: + """ + + Multi-Object Tracking + Data Association + Survey + + """ + if keywords_tag is None: + return [] + + lines = [self.__create_line(text="", hierarchy_level_id=1, paragraph_type="keywords")] + lines += [self.__create_line(text=item.text, hierarchy_level_id=2, paragraph_type="keyword") for item in keywords_tag.find_all("term")] + return lines + def __create_line_with_refs(self, content: List[Tuple[str, Tag]], bib2uid: dict, table2uid: dict) -> LineWithMeta: text = "" start = 0 @@ -219,20 +239,31 @@ def __parse_text(self, soup: Tag, bib2uid: dict, table2uid: dict) -> List[LineWi lines.append(self.__create_line(text="Abstract", hierarchy_level_id=1, paragraph_type="abstract")) lines.append(self.__create_line(text=self.__tag2text(abstract))) - for text in soup.find_all("text"): - for part in text.find_all("div"): - # TODO: Beautifulsoup doesn't read tags from input XML file. WTF! - # As a result we lose section number in text (see example above) - # Need to fix this in the future. 
- number = part.head.get("n") + " " if part.head else "" - line_text = str(part.contents[0]) if len(part.contents) > 0 else None - if line_text is not None and len(line_text) > 0: - lines.append(self.__create_line(text=number + line_text, hierarchy_level_id=1, paragraph_type="section")) - for subpart in part.find_all("p"): - if subpart.string is not None: - lines.append(self.__create_line_with_refs(subpart.string, bib2uid, table2uid)) - elif subpart.contents and len(subpart.contents) > 0: - lines.append(self.__create_line_with_refs(subpart.contents, bib2uid, table2uid)) + for part in soup.body.find_all("div"): + lines.extend(self.__parse_section(part, bib2uid, table2uid)) + + for other_text_type in ("acknowledgement", "annex"): + for text_tag in soup.find_all("div", attrs={"type": other_text_type}): + for part in text_tag.find_all("div"): + lines.extend(self.__parse_section(part, bib2uid, table2uid)) + + return lines + + def __parse_section(self, section_tag: Tag, bib2uid: dict, table2uid: dict) -> List[LineWithMeta]: + lines = [] + number = section_tag.head.get("n") if section_tag.head else "" + number = number + " " if number else "" + section_depth = get_dotted_item_depth(number) + section_depth = section_depth if section_depth > 0 else 1 + + line_text = section_tag.head.string if section_tag.head else None + if line_text is not None and len(line_text) > 0: + lines.append(self.__create_line(text=number + line_text, hierarchy_level_id=section_depth, paragraph_type="section")) + for subpart in section_tag.find_all("p"): + if subpart.string is not None: + lines.append(self.__create_line_with_refs(subpart.string + "\n", bib2uid, table2uid)) + elif subpart.contents and len(subpart.contents) > 0: + lines.append(self.__create_line_with_refs(subpart.contents, bib2uid, table2uid)) return lines @@ -265,12 +296,26 @@ def __parse_tables(self, soup: Tag) -> Tuple[List[Table], dict]: tag_tables = soup.find_all("figure", {"type": "table"}) for table in tag_tables: - row_cells 
= [] + table_cells = [] head = table.contents[0] if len(table.contents) > 0 and isinstance(table.contents[0], str) else self.__tag2text(table.head) - title = head + self.__tag2text(table.figdesc) + title = head + self.__tag2text(table.figDesc) for row in table.table.find_all("row"): - row_cells.append([CellWithMeta(lines=[self.__create_line(self.__tag2text(cell))]) for cell in row.find_all("cell")]) - tables.append(Table(cells=row_cells, metadata=TableMetadata(page_id=0, title=title))) + row_cells = [] + for cell in row.find_all("cell"): + cell_text = self.__create_line(self.__tag2text(cell)) + colspan = int(cell.get("cols", 1)) + row_cells.append(CellWithMeta(lines=[cell_text], colspan=colspan)) + + if colspan > 1: + row_cells.extend([CellWithMeta(lines=[cell_text], invisible=True) for _ in range(colspan - 1)]) + + table_cells.append(row_cells) + + # ignore empty tables + if len(table_cells) == 0: + continue + + tables.append(Table(cells=table_cells, metadata=TableMetadata(page_id=0, title=title))) table2uid["#" + table.get("xml:id")] = tables[-1].metadata.uid return tables, table2uid @@ -310,12 +355,12 @@ def __parse_bibliography(self, soup: Tag) -> Tuple[List[LineWithMeta], dict]: # according GROBID description level_2_paragraph_type = {"a": "title", "j": "title_journal", "s": "title_series", "m": "title_conference_proceedings"} - bibliography = soup.find("listbibl", recursive=True) + bibliography = soup.find("listBibl", recursive=True) lines.append(self.__create_line(text="bibliography", hierarchy_level_id=1, paragraph_type="bibliography")) if not bibliography: return lines, cites - bib_items = bibliography.find_all("biblstruct") + bib_items = bibliography.find_all("biblStruct") if not bib_items: return lines, cites @@ -331,19 +376,19 @@ def __parse_bibliography(self, soup: Tag) -> Tuple[List[LineWithMeta], dict]: lines.append(self.__create_line(text=self.__tag2text(title), hierarchy_level_id=3, paragraph_type=paragraph_type)) lines += [ # parse bib authors - 
self.__create_line(text=author.get_text(), hierarchy_level_id=3, paragraph_type="author") + self.__create_line(text=self.__remove_newlines(author).get_text(separator=" "), hierarchy_level_id=3, paragraph_type="author") for author in bib_item.find_all("author", recursive=True) if author ] lines += [ # parse biblScope self.__create_line(text=self.__tag2text(bibl_scope), hierarchy_level_id=3, paragraph_type="biblScope_volume") - for bibl_scope in bib_item.find_all("biblscope", {"unit": "volume"}, recursive=True) if bibl_scope + for bibl_scope in bib_item.find_all("biblScope", {"unit": "volume"}, recursive=True) if bibl_scope ] try: lines += [ # parse values self.__create_line(text=f"{bibl_scope.get('from')}-{bibl_scope.get('to')}", hierarchy_level_id=3, paragraph_type="biblScope_page") - for bibl_scope in bib_item.find_all("biblscope", {"unit": "page"}, recursive=True) if bibl_scope + for bibl_scope in bib_item.find_all("biblScope", {"unit": "page"}, recursive=True) if bibl_scope ] finally: self.logger.warning("Grobid parsing warning: was non-standard format") @@ -363,3 +408,9 @@ def __parse_bibliography(self, soup: Tag) -> Tuple[List[LineWithMeta], dict]: def __parse_title(self, soup: Tag) -> List[LineWithMeta]: return [self.__create_line(text=self.__tag2text(soup.title), hierarchy_level_id=0, paragraph_type="root")] + + def __remove_newlines(self, tag: Tag) -> Tag: + for item in tag: + if not isinstance(item, Tag): + item.extract() + return tag diff --git a/docs/source/structure_types/article.rst b/docs/source/structure_types/article.rst index 8de477e9..2285ddd9 100644 --- a/docs/source/structure_types/article.rst +++ b/docs/source/structure_types/article.rst @@ -27,6 +27,7 @@ There are the following line types in the article structure type: * ``root``; * ``author`` (includes ``author_first_name``, ``author_surname``, ``email``); + * ``keywords`` (includes ``keyword``); * ``author_affiliation`` (includes ``org_name``, ``address``); * ``abstract``; * ``section``; @@ 
-108,9 +109,15 @@ Below is a description of nodes in the output tree: :language: json :lines: 125-198 + * **keywords** node (if exists) is a child node of the node ``root``. + + ``keywords`` node contains ``keyword`` nodes as children. Each ``keyword`` node contains the text of one keyword item. + * **abstract** is the article's abstract section (``abstract`` tag in GROBID's output). - * **section**: nodes of article sections (for example "Introduction", "Conclusion", "V Experiments ..." etc.). This type of node has a subnode ``raw_text``. ``section`` nodes are children of a node ``root``. + * **section**: nodes of article sections (for example "Introduction", "Conclusion", "V Experiments ..." etc.). This type of node has a subnode ``raw_text``. + + ``section`` nodes are children of a node ``root`` and may be nested (e.g., section "2.1. Datasets" is nested to the section "2. Related work"). * **bibliography** is the article's bibliography list which contains only ``bibliography_item`` nodes. diff --git a/tests/api_tests/test_api_doctype_article.py b/tests/api_tests/test_api_doctype_article.py index 508e2574..bef10773 100644 --- a/tests/api_tests/test_api_doctype_article.py +++ b/tests/api_tests/test_api_doctype_article.py @@ -34,24 +34,28 @@ def test_article(self) -> None: self.assertEqual("org_name", self._get_by_tree_path(tree, "0.2.2.0")["metadata"]["paragraph_type"]) self.assertEqual("ICTEAM/ELEN/Crypto Group", self._get_by_tree_path(tree, "0.2.2.0")["text"]) + # check section + self.assertEqual("section", self._get_by_tree_path(tree, "0.4")["metadata"]["paragraph_type"]) + self.assertEqual("1 Introduction", self._get_by_tree_path(tree, "0.4")["text"]) + # check bibliography list - self.assertEqual("bibliography", self._get_by_tree_path(tree, "0.20")["metadata"]["paragraph_type"]) - self.assertEqual(65, len(self._get_by_tree_path(tree, "0.20")["subparagraphs"])) + self.assertEqual("bibliography", self._get_by_tree_path(tree, "0.12")["metadata"]["paragraph_type"]) + 
self.assertEqual(65, len(self._get_by_tree_path(tree, "0.12")["subparagraphs"])) # check bib_item 1 recognizing - self.assertEqual("title", self._get_by_tree_path(tree, "0.20.0.0")["metadata"]["paragraph_type"]) - self.assertEqual("Leakage-resilient symmetric encryption via re-keying", self._get_by_tree_path(tree, "0.20.0.0")["text"]) - self.assertEqual("title_conference_proceedings", self._get_by_tree_path(tree, "0.20.0.1")["metadata"]["paragraph_type"]) - self.assertEqual("Bertoni and Coron", self._get_by_tree_path(tree, "0.20.0.1")["text"]) - self.assertEqual("author", self._get_by_tree_path(tree, "0.20.0.2")["metadata"]["paragraph_type"]) # author 1 - self.assertEqual("\nMichelAbdalla\n", self._get_by_tree_path(tree, "0.20.0.2")["text"]) - self.assertEqual("biblScope_volume", self._get_by_tree_path(tree, "0.20.0.5")["metadata"]["paragraph_type"]) # author 1 - self.assertEqual("4", self._get_by_tree_path(tree, "0.20.0.5")["text"]) - self.assertEqual("biblScope_page", self._get_by_tree_path(tree, "0.20.0.6")["metadata"]["paragraph_type"]) # author 1 - self.assertEqual("471-488", self._get_by_tree_path(tree, "0.20.0.6")["text"]) + self.assertEqual("title", self._get_by_tree_path(tree, "0.12.0.0")["metadata"]["paragraph_type"]) + self.assertEqual("Leakage-resilient symmetric encryption via re-keying", self._get_by_tree_path(tree, "0.12.0.0")["text"]) + self.assertEqual("title_conference_proceedings", self._get_by_tree_path(tree, "0.12.0.1")["metadata"]["paragraph_type"]) + self.assertEqual("Bertoni and Coron", self._get_by_tree_path(tree, "0.12.0.1")["text"]) + self.assertEqual("author", self._get_by_tree_path(tree, "0.12.0.2")["metadata"]["paragraph_type"]) # author 1 + self.assertEqual("Michel Abdalla", self._get_by_tree_path(tree, "0.12.0.2")["text"]) + self.assertEqual("biblScope_volume", self._get_by_tree_path(tree, "0.12.0.5")["metadata"]["paragraph_type"]) # author 1 + self.assertEqual("4", self._get_by_tree_path(tree, "0.12.0.5")["text"]) + 
self.assertEqual("biblScope_page", self._get_by_tree_path(tree, "0.12.0.6")["metadata"]["paragraph_type"]) # author 1 + self.assertEqual("471-488", self._get_by_tree_path(tree, "0.12.0.6")["text"]) # check cite on bib_item - bibliography_item_uuid = self._get_by_tree_path(tree, "0.20.57")["metadata"]["uid"] # checking on [58] references + bibliography_item_uuid = self._get_by_tree_path(tree, "0.12.57")["metadata"]["uid"] # checking on [58] references section = self._get_by_tree_path(tree, "0.4.0") bibliography_refs_in_text = [ann for ann in section["annotations"] if ann["name"] == "reference" and ann["value"] == bibliography_item_uuid] # We must found two refs [58] in Introduction section