diff --git a/dedoc/readers/article_reader/article_reader.py b/dedoc/readers/article_reader/article_reader.py
index f2169452..2d4e9532 100644
--- a/dedoc/readers/article_reader/article_reader.py
+++ b/dedoc/readers/article_reader/article_reader.py
@@ -11,6 +11,7 @@
from dedoc.data_structures.unstructured_document import UnstructuredDocument
from dedoc.extensions import recognized_mimes
from dedoc.readers.base_reader import BaseReader
+from dedoc.structure_extractors.feature_extractors.list_features.list_utils import get_dotted_item_depth
from dedoc.utils.parameter_utils import get_param_document_type
from dedoc.utils.utils import get_mime_extension
@@ -33,7 +34,8 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
using beautifulsoup library.
As a result, the method fills the class :class:`~dedoc.data_structures.UnstructuredDocument`.
Article reader adds additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`.
- The method extracts information about ``authors``, ``bibliography items``, ``sections``, and ``tables``.
+ The method extracts information about ``authors``, ``keywords``, ``bibliography items``, ``sections``, and ``tables``.
+        In table cells, the ``colspan`` attribute can be filled according to GROBID's ``cols`` attribute.
You can find more information about the extracted information from GROBID system on the page :ref:`article_structure`.
Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
@@ -51,12 +53,13 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
self.logger.warning(warning)
return UnstructuredDocument(tables=[], lines=[], attachments=[], warnings=[warning])
- soup = BeautifulSoup(response.text, features="lxml")
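+        # the "xml" parser is required here: HTML parsers lowercase tag names (biblStruct, persName, listBibl) and mishandle <head> tags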
+ soup = BeautifulSoup(response.text, features="xml")
lines = self.__parse_title(soup)
- if soup.biblstruct is not None:
- authors = soup.biblstruct.find_all("author")
+ if soup.biblStruct is not None:
+ authors = soup.biblStruct.find_all("author")
lines += [line for author in authors for line in self.__parse_author(author)]
+ lines += self.__parse_keywords(soup.keywords)
bib_lines, bib2uid = self.__parse_bibliography(soup)
tables, table2uid = self.__parse_tables(soup)
@@ -135,11 +138,13 @@ def __create_line(self, text: str, hierarchy_level_id: Optional[int] = None, par
def __parse_affiliation(self, affiliation_tag: Tag) -> List[LineWithMeta]:
lines = [self.__create_line(text=affiliation_tag.get("key"), hierarchy_level_id=2, paragraph_type="author_affiliation")]
- if affiliation_tag.orgname:
- lines.append(self.__create_line(text=self.__tag2text(affiliation_tag.orgname), hierarchy_level_id=3, paragraph_type="org_name"))
+ if affiliation_tag.orgName:
+ lines.append(self.__create_line(text=self.__tag2text(affiliation_tag.orgName), hierarchy_level_id=3, paragraph_type="org_name"))
if affiliation_tag.address:
- lines.append(self.__create_line(text=affiliation_tag.address.text, hierarchy_level_id=3, paragraph_type="address"))
+ lines.append(self.__create_line(text=self.__remove_newlines(affiliation_tag.address).get_text(separator=", "),
+ hierarchy_level_id=3,
+ paragraph_type="address"))
return lines
@@ -169,11 +174,11 @@ def __parse_author(self, author_tag: Tag) -> List[LineWithMeta]:
"""
lines = [self.__create_line(text="", hierarchy_level_id=1, paragraph_type="author")]
- first_name = self.__get_tag_by_hierarchy_path(author_tag, ["persname", "forename"])
+ first_name = self.__get_tag_by_hierarchy_path(author_tag, ["persName", "forename"])
if first_name:
lines.append(self.__create_line(text=first_name, hierarchy_level_id=2, paragraph_type="author_first_name"))
- surname = self.__get_tag_by_hierarchy_path(author_tag, ["persname", "surname"])
+ surname = self.__get_tag_by_hierarchy_path(author_tag, ["persName", "surname"])
if surname:
lines.append(self.__create_line(text=surname, hierarchy_level_id=2, paragraph_type="author_surname"))
@@ -187,6 +192,21 @@ def __parse_author(self, author_tag: Tag) -> List[LineWithMeta]:
return lines
+ def __parse_keywords(self, keywords_tag: Tag) -> List[LineWithMeta]:
+ """
+
+ Multi-Object Tracking
+ Data Association
+ Survey
+
+ """
+ if keywords_tag is None:
+ return []
+
+ lines = [self.__create_line(text="", hierarchy_level_id=1, paragraph_type="keywords")]
+ lines += [self.__create_line(text=item.text, hierarchy_level_id=2, paragraph_type="keyword") for item in keywords_tag.find_all("term")]
+ return lines
+
def __create_line_with_refs(self, content: List[Tuple[str, Tag]], bib2uid: dict, table2uid: dict) -> LineWithMeta:
text = ""
start = 0
@@ -219,20 +239,31 @@ def __parse_text(self, soup: Tag, bib2uid: dict, table2uid: dict) -> List[LineWi
lines.append(self.__create_line(text="Abstract", hierarchy_level_id=1, paragraph_type="abstract"))
lines.append(self.__create_line(text=self.__tag2text(abstract)))
- for text in soup.find_all("text"):
- for part in text.find_all("div"):
-                # TODO: Beautifulsoup doesn't read <head> tags from input XML file. WTF!
- # As a result we lose section number in text (see example above)
- # Need to fix this in the future.
- number = part.head.get("n") + " " if part.head else ""
- line_text = str(part.contents[0]) if len(part.contents) > 0 else None
- if line_text is not None and len(line_text) > 0:
- lines.append(self.__create_line(text=number + line_text, hierarchy_level_id=1, paragraph_type="section"))
- for subpart in part.find_all("p"):
- if subpart.string is not None:
- lines.append(self.__create_line_with_refs(subpart.string, bib2uid, table2uid))
- elif subpart.contents and len(subpart.contents) > 0:
- lines.append(self.__create_line_with_refs(subpart.contents, bib2uid, table2uid))
+ for part in soup.body.find_all("div"):
+ lines.extend(self.__parse_section(part, bib2uid, table2uid))
+
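+        # acknowledgement and annex sections are placed outside <body> (in <back>) in GROBID's TEI output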
+ for other_text_type in ("acknowledgement", "annex"):
+ for text_tag in soup.find_all("div", attrs={"type": other_text_type}):
+ for part in text_tag.find_all("div"):
+ lines.extend(self.__parse_section(part, bib2uid, table2uid))
+
+ return lines
+
+ def __parse_section(self, section_tag: Tag, bib2uid: dict, table2uid: dict) -> List[LineWithMeta]:
+ lines = []
+ number = section_tag.head.get("n") if section_tag.head else ""
+ number = number + " " if number else ""
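+        # the nesting depth of a section is derived from its dotted number, e.g. "2" gives depth 1 and "2.1" gives depth 2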
+ section_depth = get_dotted_item_depth(number)
+ section_depth = section_depth if section_depth > 0 else 1
+
+ line_text = section_tag.head.string if section_tag.head else None
+ if line_text is not None and len(line_text) > 0:
+ lines.append(self.__create_line(text=number + line_text, hierarchy_level_id=section_depth, paragraph_type="section"))
+ for subpart in section_tag.find_all("p"):
+ if subpart.string is not None:
+ lines.append(self.__create_line_with_refs(subpart.string + "\n", bib2uid, table2uid))
+ elif subpart.contents and len(subpart.contents) > 0:
+ lines.append(self.__create_line_with_refs(subpart.contents, bib2uid, table2uid))
return lines
@@ -265,12 +296,26 @@ def __parse_tables(self, soup: Tag) -> Tuple[List[Table], dict]:
tag_tables = soup.find_all("figure", {"type": "table"})
for table in tag_tables:
- row_cells = []
+ table_cells = []
head = table.contents[0] if len(table.contents) > 0 and isinstance(table.contents[0], str) else self.__tag2text(table.head)
- title = head + self.__tag2text(table.figdesc)
+ title = head + self.__tag2text(table.figDesc)
for row in table.table.find_all("row"):
- row_cells.append([CellWithMeta(lines=[self.__create_line(self.__tag2text(cell))]) for cell in row.find_all("cell")])
- tables.append(Table(cells=row_cells, metadata=TableMetadata(page_id=0, title=title)))
+ row_cells = []
+ for cell in row.find_all("cell"):
+                    cell_line = self.__create_line(self.__tag2text(cell))
+ colspan = int(cell.get("cols", 1))
+                    row_cells.append(CellWithMeta(lines=[cell_line], colspan=colspan))
+
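+                    # duplicate the merged cell as invisible cells so that each row keeps the full number of columns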
+ if colspan > 1:
+                        row_cells.extend([CellWithMeta(lines=[cell_line], invisible=True) for _ in range(colspan - 1)])
+
+ table_cells.append(row_cells)
+
+ # ignore empty tables
+ if len(table_cells) == 0:
+ continue
+
+ tables.append(Table(cells=table_cells, metadata=TableMetadata(page_id=0, title=title)))
table2uid["#" + table.get("xml:id")] = tables[-1].metadata.uid
return tables, table2uid
@@ -310,12 +355,12 @@ def __parse_bibliography(self, soup: Tag) -> Tuple[List[LineWithMeta], dict]:
# according GROBID description
level_2_paragraph_type = {"a": "title", "j": "title_journal", "s": "title_series", "m": "title_conference_proceedings"}
- bibliography = soup.find("listbibl", recursive=True)
+ bibliography = soup.find("listBibl", recursive=True)
lines.append(self.__create_line(text="bibliography", hierarchy_level_id=1, paragraph_type="bibliography"))
if not bibliography:
return lines, cites
- bib_items = bibliography.find_all("biblstruct")
+ bib_items = bibliography.find_all("biblStruct")
if not bib_items:
return lines, cites
@@ -331,19 +376,19 @@ def __parse_bibliography(self, soup: Tag) -> Tuple[List[LineWithMeta], dict]:
lines.append(self.__create_line(text=self.__tag2text(title), hierarchy_level_id=3, paragraph_type=paragraph_type))
lines += [ # parse bib authors
- self.__create_line(text=author.get_text(), hierarchy_level_id=3, paragraph_type="author")
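+            # newlines inside <author> are dropped so that name parts are joined by a single space ("Michel Abdalla")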
+ self.__create_line(text=self.__remove_newlines(author).get_text(separator=" "), hierarchy_level_id=3, paragraph_type="author")
for author in bib_item.find_all("author", recursive=True) if author
]
lines += [ # parse biblScope
self.__create_line(text=self.__tag2text(bibl_scope), hierarchy_level_id=3, paragraph_type="biblScope_volume")
- for bibl_scope in bib_item.find_all("biblscope", {"unit": "volume"}, recursive=True) if bibl_scope
+ for bibl_scope in bib_item.find_all("biblScope", {"unit": "volume"}, recursive=True) if bibl_scope
]
try:
lines += [ # parse values
self.__create_line(text=f"{bibl_scope.get('from')}-{bibl_scope.get('to')}", hierarchy_level_id=3, paragraph_type="biblScope_page")
- for bibl_scope in bib_item.find_all("biblscope", {"unit": "page"}, recursive=True) if bibl_scope
+ for bibl_scope in bib_item.find_all("biblScope", {"unit": "page"}, recursive=True) if bibl_scope
]
finally:
self.logger.warning("Grobid parsing warning: was non-standard format")
@@ -363,3 +408,9 @@ def __parse_bibliography(self, soup: Tag) -> Tuple[List[LineWithMeta], dict]:
def __parse_title(self, soup: Tag) -> List[LineWithMeta]:
return [self.__create_line(text=self.__tag2text(soup.title), hierarchy_level_id=0, paragraph_type="root")]
+
+ def __remove_newlines(self, tag: Tag) -> Tag:
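+        # extract non-tag children (NavigableStrings such as "\n" between nested tags), keeping only tag texts for get_text()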
+        for item in list(tag):  # iterate over a copy: extracting items while iterating skips siblings
+ if not isinstance(item, Tag):
+ item.extract()
+ return tag
diff --git a/docs/source/structure_types/article.rst b/docs/source/structure_types/article.rst
index 8de477e9..2285ddd9 100644
--- a/docs/source/structure_types/article.rst
+++ b/docs/source/structure_types/article.rst
@@ -27,6 +27,7 @@ There are the following line types in the article structure type:
* ``root``;
* ``author`` (includes ``author_first_name``, ``author_surname``, ``email``);
+ * ``keywords`` (includes ``keyword``);
* ``author_affiliation`` (includes ``org_name``, ``address``);
* ``abstract``;
* ``section``;
@@ -108,9 +109,15 @@ Below is a description of nodes in the output tree:
:language: json
:lines: 125-198
+    * **keywords** node (if it exists) is a child node of the node ``root``.
+
+      The ``keywords`` node contains ``keyword`` nodes as children. Each ``keyword`` node contains the text of one keyword.
+
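+      A simplified sketch of such a subtree (the real nodes contain more metadata fields):
+
+      .. code-block:: json
+
+          {
+              "text": "",
+              "metadata": {"paragraph_type": "keywords"},
+              "subparagraphs": [
+                  {"text": "Multi-Object Tracking", "metadata": {"paragraph_type": "keyword"}, "subparagraphs": []},
+                  {"text": "Data Association", "metadata": {"paragraph_type": "keyword"}, "subparagraphs": []}
+              ]
+          }
+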
     * **abstract** is the article's abstract section (``<abstract>`` tag in GROBID's output).
- * **section**: nodes of article sections (for example "Introduction", "Conclusion", "V Experiments ..." etc.). This type of node has a subnode ``raw_text``. ``section`` nodes are children of a node ``root``.
+ * **section**: nodes of article sections (for example "Introduction", "Conclusion", "V Experiments ..." etc.). This type of node has a subnode ``raw_text``.
+
+      ``section`` nodes are children of a node ``root`` and may be nested (e.g., section "2.1. Datasets" is nested in the section "2. Related work").
* **bibliography** is the article's bibliography list which contains only ``bibliography_item`` nodes.
diff --git a/tests/api_tests/test_api_doctype_article.py b/tests/api_tests/test_api_doctype_article.py
index 508e2574..bef10773 100644
--- a/tests/api_tests/test_api_doctype_article.py
+++ b/tests/api_tests/test_api_doctype_article.py
@@ -34,24 +34,28 @@ def test_article(self) -> None:
self.assertEqual("org_name", self._get_by_tree_path(tree, "0.2.2.0")["metadata"]["paragraph_type"])
self.assertEqual("ICTEAM/ELEN/Crypto Group", self._get_by_tree_path(tree, "0.2.2.0")["text"])
+ # check section
+ self.assertEqual("section", self._get_by_tree_path(tree, "0.4")["metadata"]["paragraph_type"])
+ self.assertEqual("1 Introduction", self._get_by_tree_path(tree, "0.4")["text"])
+
# check bibliography list
- self.assertEqual("bibliography", self._get_by_tree_path(tree, "0.20")["metadata"]["paragraph_type"])
- self.assertEqual(65, len(self._get_by_tree_path(tree, "0.20")["subparagraphs"]))
+ self.assertEqual("bibliography", self._get_by_tree_path(tree, "0.12")["metadata"]["paragraph_type"])
+ self.assertEqual(65, len(self._get_by_tree_path(tree, "0.12")["subparagraphs"]))
# check bib_item 1 recognizing
- self.assertEqual("title", self._get_by_tree_path(tree, "0.20.0.0")["metadata"]["paragraph_type"])
- self.assertEqual("Leakage-resilient symmetric encryption via re-keying", self._get_by_tree_path(tree, "0.20.0.0")["text"])
- self.assertEqual("title_conference_proceedings", self._get_by_tree_path(tree, "0.20.0.1")["metadata"]["paragraph_type"])
- self.assertEqual("Bertoni and Coron", self._get_by_tree_path(tree, "0.20.0.1")["text"])
- self.assertEqual("author", self._get_by_tree_path(tree, "0.20.0.2")["metadata"]["paragraph_type"]) # author 1
- self.assertEqual("\nMichelAbdalla\n", self._get_by_tree_path(tree, "0.20.0.2")["text"])
- self.assertEqual("biblScope_volume", self._get_by_tree_path(tree, "0.20.0.5")["metadata"]["paragraph_type"]) # author 1
- self.assertEqual("4", self._get_by_tree_path(tree, "0.20.0.5")["text"])
- self.assertEqual("biblScope_page", self._get_by_tree_path(tree, "0.20.0.6")["metadata"]["paragraph_type"]) # author 1
- self.assertEqual("471-488", self._get_by_tree_path(tree, "0.20.0.6")["text"])
+ self.assertEqual("title", self._get_by_tree_path(tree, "0.12.0.0")["metadata"]["paragraph_type"])
+ self.assertEqual("Leakage-resilient symmetric encryption via re-keying", self._get_by_tree_path(tree, "0.12.0.0")["text"])
+ self.assertEqual("title_conference_proceedings", self._get_by_tree_path(tree, "0.12.0.1")["metadata"]["paragraph_type"])
+ self.assertEqual("Bertoni and Coron", self._get_by_tree_path(tree, "0.12.0.1")["text"])
+ self.assertEqual("author", self._get_by_tree_path(tree, "0.12.0.2")["metadata"]["paragraph_type"]) # author 1
+ self.assertEqual("Michel Abdalla", self._get_by_tree_path(tree, "0.12.0.2")["text"])
+ self.assertEqual("biblScope_volume", self._get_by_tree_path(tree, "0.12.0.5")["metadata"]["paragraph_type"]) # author 1
+ self.assertEqual("4", self._get_by_tree_path(tree, "0.12.0.5")["text"])
+ self.assertEqual("biblScope_page", self._get_by_tree_path(tree, "0.12.0.6")["metadata"]["paragraph_type"]) # author 1
+ self.assertEqual("471-488", self._get_by_tree_path(tree, "0.12.0.6")["text"])
# check cite on bib_item
- bibliography_item_uuid = self._get_by_tree_path(tree, "0.20.57")["metadata"]["uid"] # checking on [58] references
+ bibliography_item_uuid = self._get_by_tree_path(tree, "0.12.57")["metadata"]["uid"] # checking on [58] references
section = self._get_by_tree_path(tree, "0.4.0")
bibliography_refs_in_text = [ann for ann in section["annotations"] if ann["name"] == "reference" and ann["value"] == bibliography_item_uuid]
# We must found two refs [58] in Introduction section