diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/docx_attachments_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/docx_attachments_extractor.py
index 1c307409..8919c890 100644
--- a/dedoc/attachments_extractors/concrete_attachments_extractors/docx_attachments_extractor.py
+++ b/dedoc/attachments_extractors/concrete_attachments_extractors/docx_attachments_extractor.py
@@ -29,7 +29,7 @@ def can_extract(self,
"""
Checks if this extractor can get attachments from the document (it should have .docx extension)
"""
- extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
+ mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in recognized_extensions.docx_like_format or mime in recognized_mimes.docx_like_format
def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]:
diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/excel_attachments_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/excel_attachments_extractor.py
index cf5cfefa..96834db6 100644
--- a/dedoc/attachments_extractors/concrete_attachments_extractors/excel_attachments_extractor.py
+++ b/dedoc/attachments_extractors/concrete_attachments_extractors/excel_attachments_extractor.py
@@ -22,7 +22,7 @@ def can_extract(self,
"""
Checks if this extractor can get attachments from the document (it should have .xlsx extension)
"""
- extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
+ mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in recognized_extensions.excel_like_format or mime in recognized_mimes.excel_like_format
def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]:
diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/json_attachment_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/json_attachment_extractor.py
index 39e11c69..83fd572a 100644
--- a/dedoc/attachments_extractors/concrete_attachments_extractors/json_attachment_extractor.py
+++ b/dedoc/attachments_extractors/concrete_attachments_extractors/json_attachment_extractor.py
@@ -22,7 +22,7 @@ def can_extract(self,
"""
Checks if this extractor can get attachments from the document (it should have .json extension)
"""
- extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
+ mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower().endswith(".json")
def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]:
diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/pdf_attachments_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/pdf_attachments_extractor.py
index 0ae13fb4..e4a53c74 100644
--- a/dedoc/attachments_extractors/concrete_attachments_extractors/pdf_attachments_extractor.py
+++ b/dedoc/attachments_extractors/concrete_attachments_extractors/pdf_attachments_extractor.py
@@ -9,7 +9,7 @@
from dedoc.attachments_extractors.abstract_attachment_extractor import AbstractAttachmentsExtractor
from dedoc.data_structures.attached_file import AttachedFile
-from dedoc.extensions import recognized_extensions, recognized_mimes
+from dedoc.extensions import recognized_mimes
from dedoc.utils.utils import convert_datetime, get_mime_extension, get_unique_name
@@ -28,8 +28,8 @@ def can_extract(self,
"""
Checks if this extractor can get attachments from the document (it should have .pdf extension)
"""
- extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
- return extension.lower() in recognized_extensions.docx_like_format or mime in recognized_mimes.docx_like_format
+ mime, _ = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
+ return mime in recognized_mimes.pdf_like_format
def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]:
"""
diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/pptx_attachments_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/pptx_attachments_extractor.py
index 34acdef4..4b9ecb54 100644
--- a/dedoc/attachments_extractors/concrete_attachments_extractors/pptx_attachments_extractor.py
+++ b/dedoc/attachments_extractors/concrete_attachments_extractors/pptx_attachments_extractor.py
@@ -22,7 +22,7 @@ def can_extract(self,
"""
Checks if this extractor can get attachments from the document (it should have .pptx extension)
"""
- extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
+ mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in recognized_extensions.pptx_like_format or mime in recognized_mimes.pptx_like_format
def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]:
diff --git a/dedoc/attachments_handler/attachments_handler.py b/dedoc/attachments_handler/attachments_handler.py
index 1017ad45..1935e5d2 100644
--- a/dedoc/attachments_handler/attachments_handler.py
+++ b/dedoc/attachments_handler/attachments_handler.py
@@ -4,10 +4,10 @@
import time
from typing import List, Optional
-from dedoc.attachments_extractors import AbstractAttachmentsExtractor
from dedoc.common.exceptions.dedoc_error import DedocError
from dedoc.data_structures import AttachedFile, DocumentMetadata, ParsedDocument
from dedoc.data_structures.unstructured_document import UnstructuredDocument
+from dedoc.utils.parameter_utils import get_param_with_attachments
from dedoc.utils.utils import get_empty_content
@@ -39,11 +39,11 @@ def handle_attachments(self, document_parser: "DedocManager", document: Unstruct
are important, look to the API parameters documentation for more details).
:return: list of parsed document attachments
"""
- parsed_attachment_files = []
+ attachments = []
recursion_deep_attachments = int(parameters.get("recursion_deep_attachments", 10)) - 1
- if not AbstractAttachmentsExtractor.with_attachments(parameters) or recursion_deep_attachments < 0:
- return parsed_attachment_files
+ if not get_param_with_attachments(parameters) or recursion_deep_attachments < 0:
+ return attachments
previous_log_time = time.time()
@@ -73,8 +73,8 @@ def handle_attachments(self, document_parser: "DedocManager", document: Unstruct
parsed_file = self.__get_empty_document(document_parser=document_parser, attachment=attachment, parameters=parameters_copy)
parsed_file.metadata.set_uid(attachment.uid)
- parsed_attachment_files.append(parsed_file)
- return parsed_attachment_files
+ attachments.append(parsed_file)
+ return attachments
def __get_empty_document(self, document_parser: "DedocManager", attachment: AttachedFile, parameters: dict) -> ParsedDocument: # noqa
metadata = document_parser.document_metadata_extractor.extract(
diff --git a/dedoc/config.py b/dedoc/config.py
index f3c374eb..06d98894 100644
--- a/dedoc/config.py
+++ b/dedoc/config.py
@@ -43,7 +43,10 @@
# TESSERACT OCR confidence threshold ( values: [-1 - undefined; 0.0 : 100.0 % - confidence value)
ocr_conf_threshold=40.0,
# max depth of document structure tree
- recursion_deep_subparagraphs=30
+ recursion_deep_subparagraphs=30,
+
+ # -------------------------------------------EXTERNAL SERVICES SETTINGS---------------------------------------------
+ grobid_max_connection_attempts=3
)
diff --git a/dedoc/converters/concrete_converters/binary_converter.py b/dedoc/converters/concrete_converters/binary_converter.py
index 46142cff..ba7741cf 100644
--- a/dedoc/converters/concrete_converters/binary_converter.py
+++ b/dedoc/converters/concrete_converters/binary_converter.py
@@ -23,7 +23,7 @@ def can_convert(self,
"""
Checks if the document is image-like (e.g. it has .bmp, .jpg, .tiff, etc. extension) and has `mime=application/octet-stream`.
"""
- extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
+ mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return mime == "application/octet-stream" and extension in supported_image_types
def convert(self, file_path: str, parameters: Optional[dict] = None) -> str:
diff --git a/dedoc/converters/concrete_converters/docx_converter.py b/dedoc/converters/concrete_converters/docx_converter.py
index 3b50416a..ad8855ec 100644
--- a/dedoc/converters/concrete_converters/docx_converter.py
+++ b/dedoc/converters/concrete_converters/docx_converter.py
@@ -22,7 +22,7 @@ def can_convert(self,
"""
Checks if the document is docx-like, e.g. it has .doc, .rtf or .odt extension.
"""
- extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
+ mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in converted_extensions.docx_like_format or mime in converted_mimes.docx_like_format
def convert(self, file_path: str, parameters: Optional[dict] = None) -> str:
diff --git a/dedoc/converters/concrete_converters/excel_converter.py b/dedoc/converters/concrete_converters/excel_converter.py
index 1396a12b..8aaa8809 100644
--- a/dedoc/converters/concrete_converters/excel_converter.py
+++ b/dedoc/converters/concrete_converters/excel_converter.py
@@ -22,7 +22,7 @@ def can_convert(self,
"""
Checks if the document is xlsx-like, e.g. it has .xls or .ods extension.
"""
- extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
+ mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in converted_extensions.excel_like_format or mime in converted_mimes.excel_like_format
def convert(self, file_path: str, parameters: Optional[dict] = None) -> str:
diff --git a/dedoc/converters/concrete_converters/pdf_converter.py b/dedoc/converters/concrete_converters/pdf_converter.py
index f0b929e8..01483d4c 100644
--- a/dedoc/converters/concrete_converters/pdf_converter.py
+++ b/dedoc/converters/concrete_converters/pdf_converter.py
@@ -22,7 +22,7 @@ def can_convert(self,
"""
Checks if the document is pdf-like, e.g. it has .djvu extension.
"""
- extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
+ mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in converted_extensions.pdf_like_format or mime in converted_mimes.pdf_like_format
def convert(self, file_path: str, parameters: Optional[dict] = None) -> str:
diff --git a/dedoc/converters/concrete_converters/png_converter.py b/dedoc/converters/concrete_converters/png_converter.py
index cb50245d..6044c970 100644
--- a/dedoc/converters/concrete_converters/png_converter.py
+++ b/dedoc/converters/concrete_converters/png_converter.py
@@ -25,7 +25,7 @@ def can_convert(self,
"""
Checks if the document is image-like, e.g. it has .bmp, .jpg, .tiff, etc. extension.
"""
- extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
+ mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in converted_extensions.image_like_format or mime in converted_mimes.image_like_format
def convert(self, file_path: str, parameters: Optional[dict] = None) -> str:
diff --git a/dedoc/converters/concrete_converters/pptx_converter.py b/dedoc/converters/concrete_converters/pptx_converter.py
index d1e7aec3..afce2b94 100644
--- a/dedoc/converters/concrete_converters/pptx_converter.py
+++ b/dedoc/converters/concrete_converters/pptx_converter.py
@@ -22,7 +22,7 @@ def can_convert(self,
"""
Checks if the document is pptx-like, e.g. it has .ppt or .odp extension.
"""
- extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
+ mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in converted_extensions.pptx_like_format or mime in converted_mimes.pptx_like_format
def convert(self, file_path: str, parameters: Optional[dict] = None) -> str:
diff --git a/dedoc/converters/concrete_converters/txt_converter.py b/dedoc/converters/concrete_converters/txt_converter.py
index b1543fa0..1f384d59 100644
--- a/dedoc/converters/concrete_converters/txt_converter.py
+++ b/dedoc/converters/concrete_converters/txt_converter.py
@@ -23,7 +23,7 @@ def can_convert(self,
"""
Checks if the document is txt-like, e.g. it has .xml extension.
"""
- extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
+ mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in converted_extensions.txt_like_format or mime in converted_mimes.txt_like_format
def convert(self, file_path: str, parameters: Optional[dict] = None) -> str:
diff --git a/dedoc/converters/converter_composition.py b/dedoc/converters/converter_composition.py
index cf12c2ed..63543d4e 100644
--- a/dedoc/converters/converter_composition.py
+++ b/dedoc/converters/converter_composition.py
@@ -29,7 +29,7 @@ def convert(self, file_path: str, parameters: Optional[dict] = None) -> str:
:param parameters: parameters of converting, see :ref:`parameters_description` for more details
:return: path of converted file if conversion was executed else path of the original file
"""
- extension, mime = get_mime_extension(file_path=file_path)
+ mime, extension = get_mime_extension(file_path=file_path)
converted_file_path = file_path
for converter in self.converters:
diff --git a/dedoc/data_structures/concrete_annotations/__init__.py b/dedoc/data_structures/concrete_annotations/__init__.py
index 529acaa0..264abda0 100644
--- a/dedoc/data_structures/concrete_annotations/__init__.py
+++ b/dedoc/data_structures/concrete_annotations/__init__.py
@@ -15,7 +15,8 @@
from .superscript_annotation import SuperscriptAnnotation
from .table_annotation import TableAnnotation
from .underlined_annotation import UnderlinedAnnotation
+from .reference_annotation import ReferenceAnnotation
__all__ = ['AlignmentAnnotation', 'AttachAnnotation', 'BBoxAnnotation', 'BoldAnnotation', 'ColorAnnotation', 'ConfidenceAnnotation',
'IndentationAnnotation', 'ItalicAnnotation', 'LinkedTextAnnotation', 'SizeAnnotation', 'SpacingAnnotation', 'StrikeAnnotation',
- 'StyleAnnotation', 'SubscriptAnnotation', 'SuperscriptAnnotation', 'TableAnnotation', 'UnderlinedAnnotation']
+ 'StyleAnnotation', 'SubscriptAnnotation', 'SuperscriptAnnotation', 'TableAnnotation', 'UnderlinedAnnotation', 'ReferenceAnnotation']
diff --git a/dedoc/data_structures/concrete_annotations/reference_annotation.py b/dedoc/data_structures/concrete_annotations/reference_annotation.py
new file mode 100644
index 00000000..e629ba8b
--- /dev/null
+++ b/dedoc/data_structures/concrete_annotations/reference_annotation.py
@@ -0,0 +1,43 @@
+from dedoc.data_structures.annotation import Annotation
+
+
+class ReferenceAnnotation(Annotation):
+    """
+    This annotation marks a span of the line text that refers to another line of the same document
+    (for example, a citation that refers to a bibliography item).
+    The annotation value is the unique identifier of the referenced line.
+
+    Example of usage for document_type="article" with the example of link on the bibliography_item :class:`~dedoc.data_structures.LineWithMeta`.
+
+    LineWithMeta:
+
+    .. code-block:: python
+
+        LineWithMeta(  # the line with the reference annotation
+            line="As for the PRF, we use the tree-based construction from Goldreich, Goldwasser and Micali [18]",
+            metadata=LineMetadata(page_id=0, line_id=32),
+            annotations=[ReferenceAnnotation(start=90, end=92, value="97cfac39-f0e3-11ee-b81c-b88584b4e4a1"), ...]
+        )
+
+    other LineWithMeta:
+
+    .. code-block:: python
+
+        LineWithMeta(  # the line referenced by the previous one
+            line="some text (can be empty)",
+            metadata=LineMetadata(
+                page_id=10,
+                line_id=189,
+                tag_hierarchy_level=HierarchyLevel(level_1=2, level_2=0, can_be_multiline=False, line_type="bibliography_item"),
+                other_fields={"uid": "97cfac39-f0e3-11ee-b81c-b88584b4e4a1"}
+            ),
+            annotations=[]
+        )
+    """
+    name = "reference"
+
+    def __init__(self, value: str, start: int, end: int) -> None:
+        """
+        :param value: unique identifier of the line to which this annotation refers
+        :param start: start of the annotated text with a link
+        :param end: end of the annotated text with a link
+        """
+        super().__init__(start=start, end=end, name=ReferenceAnnotation.name, value=value, is_mergeable=False)
diff --git a/dedoc/data_structures/line_metadata.py b/dedoc/data_structures/line_metadata.py
index 504c5110..19b6730a 100644
--- a/dedoc/data_structures/line_metadata.py
+++ b/dedoc/data_structures/line_metadata.py
@@ -30,9 +30,9 @@ def __init__(self,
self.hierarchy_level = hierarchy_level
self.page_id = page_id
self.line_id = line_id
+ self.__other_fields = {}
if other_fields is not None and len(other_fields) > 0:
self.extend_other_fields(other_fields)
- self.__other_fields = {}
def extend_other_fields(self, new_fields: dict) -> None:
"""
diff --git a/dedoc/data_structures/line_with_meta.py b/dedoc/data_structures/line_with_meta.py
index ca954573..798a1712 100644
--- a/dedoc/data_structures/line_with_meta.py
+++ b/dedoc/data_structures/line_with_meta.py
@@ -136,7 +136,8 @@ def set_line(self, line: str) -> None:
self._line = line
def __repr__(self) -> str:
- return f"LineWithMeta({self.line[:65]})"
+ return (f"LineWithMeta({self.line[:65]}, "
+ f"tagHL={self.metadata.tag_hierarchy_level.level_1, self.metadata.tag_hierarchy_level.level_2, self.metadata.tag_hierarchy_level.line_type})")
def __add__(self, other: Union["LineWithMeta", str]) -> "LineWithMeta":
assert isinstance(other, (LineWithMeta, str))
diff --git a/dedoc/data_structures/table_metadata.py b/dedoc/data_structures/table_metadata.py
index a70ab2b4..fc934d9a 100644
--- a/dedoc/data_structures/table_metadata.py
+++ b/dedoc/data_structures/table_metadata.py
@@ -9,15 +9,17 @@ class TableMetadata(Serializable):
"""
This class holds the information about table unique identifier, rotation angle (if table has been rotated - for images) and so on.
"""
- def __init__(self, page_id: Optional[int], uid: Optional[str] = None, rotated_angle: float = 0.0) -> None:
+ def __init__(self, page_id: Optional[int], uid: Optional[str] = None, rotated_angle: float = 0.0, title: str = "") -> None:
"""
:param page_id: number of the page where table starts
:param uid: unique identifier of the table
:param rotated_angle: value of the rotation angle by which the table was rotated during recognition
+ :param title: table's title
"""
self.page_id = page_id
self.uid = str(uuid.uuid4()) if not uid else uid
self.rotated_angle = rotated_angle
+ self.title = title
def to_api_schema(self) -> ApiTableMetadata:
- return ApiTableMetadata(uid=self.uid, page_id=self.page_id, rotated_angle=self.rotated_angle)
+ return ApiTableMetadata(uid=self.uid, page_id=self.page_id, rotated_angle=self.rotated_angle, title=self.title)
diff --git a/dedoc/manager_config.py b/dedoc/manager_config.py
index 6854c6f4..679db954 100644
--- a/dedoc/manager_config.py
+++ b/dedoc/manager_config.py
@@ -1,5 +1,7 @@
from typing import Optional
+from dedoc.readers.article_reader.article_reader import ArticleReader
+
def _get_manager_config(config: dict) -> dict:
"""
@@ -57,6 +59,7 @@ def _get_manager_config(config: dict) -> dict:
BinaryConverter(config=config)
]
readers = [
+ ArticleReader(config=config),
DocxReader(config=config),
ExcelReader(config=config),
PptxReader(config=config),
diff --git a/dedoc/readers/__init__.py b/dedoc/readers/__init__.py
index 7c6cce29..2d96fdae 100644
--- a/dedoc/readers/__init__.py
+++ b/dedoc/readers/__init__.py
@@ -1,4 +1,5 @@
from .archive_reader.archive_reader import ArchiveReader
+from .article_reader.article_reader import ArticleReader
from .base_reader import BaseReader
from .csv_reader.csv_reader import CSVReader
from .docx_reader.docx_reader import DocxReader
@@ -17,6 +18,6 @@
from .reader_composition import ReaderComposition
from .txt_reader.raw_text_reader import RawTextReader
-__all__ = ['ArchiveReader', 'BaseReader', 'CSVReader', 'DocxReader', 'EmailReader', 'ExcelReader', 'HtmlReader', 'JsonReader', 'MhtmlReader',
+__all__ = ['ArchiveReader', 'ArticleReader', 'BaseReader', 'CSVReader', 'DocxReader', 'EmailReader', 'ExcelReader', 'HtmlReader', 'JsonReader', 'MhtmlReader',
'NoteReader', 'PptxReader', 'ReaderComposition', 'RawTextReader',
'PdfBaseReader', 'PdfImageReader', 'PdfTabbyReader', 'PdfTxtlayerReader', 'PdfAutoReader']
diff --git a/dedoc/readers/archive_reader/archive_reader.py b/dedoc/readers/archive_reader/archive_reader.py
index d8831b58..589014ac 100644
--- a/dedoc/readers/archive_reader/archive_reader.py
+++ b/dedoc/readers/archive_reader/archive_reader.py
@@ -29,7 +29,7 @@ def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None,
Check if the document extension is suitable for this reader.
Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters.
"""
- extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
+ mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in recognized_extensions.archive_like_format or mime in recognized_mimes.archive_like_format
def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
diff --git a/dedoc/readers/article_reader/__init__.py b/dedoc/readers/article_reader/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/dedoc/readers/article_reader/article_reader.py b/dedoc/readers/article_reader/article_reader.py
new file mode 100644
index 00000000..f2169452
--- /dev/null
+++ b/dedoc/readers/article_reader/article_reader.py
@@ -0,0 +1,365 @@
+import os
+import time
+from typing import Dict, List, Optional, Tuple
+
+import requests
+from bs4 import BeautifulSoup, Tag
+
+from dedoc.data_structures import Annotation, CellWithMeta, HierarchyLevel, LineMetadata, Table, TableAnnotation, TableMetadata
+from dedoc.data_structures.concrete_annotations.reference_annotation import ReferenceAnnotation
+from dedoc.data_structures.line_with_meta import LineWithMeta
+from dedoc.data_structures.unstructured_document import UnstructuredDocument
+from dedoc.extensions import recognized_mimes
+from dedoc.readers.base_reader import BaseReader
+from dedoc.utils.parameter_utils import get_param_document_type
+from dedoc.utils.utils import get_mime_extension
+
+
+class ArticleReader(BaseReader):
+ """
+    This class is used for parsing scientific articles with .pdf extension using `GROBID <https://grobid.readthedocs.io/en/latest/>`_ system.
+ """
+
+    def __init__(self, config: Optional[dict] = None) -> None:
+        """
+        Build the GROBID urls from the environment variables ``GROBID_HOST`` (default ``localhost``) and ``GROBID_PORT`` (default ``8070``),
+        then check whether the service is alive.
+
+        :param config: configuration dictionary, the key ``grobid_max_connection_attempts`` (3 if absent) limits the number of liveness checks
+        """
+        super().__init__(config=config)
+        host = os.environ.get("GROBID_HOST", "localhost")
+        port = os.environ.get("GROBID_PORT", "8070")
+        self.grobid_url = f"http://{host}:{port}"
+        self.url = f"{self.grobid_url}/api/processFulltextDocument"
+        self.grobid_is_alive = False
+        attempts = self.config.get("grobid_max_connection_attempts", 3)
+        self.__update_grobid_alive(self.grobid_url, max_attempts=attempts)
+
+    def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
+        """
+        Send the file to the GROBID method ``/api/processFulltextDocument`` and parse the recognized article (format XML/TEI)
+        with the beautifulsoup library.
+        The result is stored in the class :class:`~dedoc.data_structures.UnstructuredDocument`,
+        additional information is saved in the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`.
+        The method extracts information about ``authors``, ``bibliography items``, ``sections``, and ``tables``.
+        You can find more information about the extracted information from GROBID system on the page :ref:`article_structure`.
+
+        If GROBID is unreachable or answers with a code other than 200, an empty document with a warning is returned.
+
+        Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
+        """
+        with open(file_path, "rb") as pdf_file:
+            try:
+                response = requests.post(self.url, files={"input": pdf_file})
+            except requests.exceptions.ConnectionError as ex:
+                warning = f"GROBID doesn't response. Check GROBID service on {self.url}. Exception' msg: {ex}"
+                self.logger.warning(warning)
+                return UnstructuredDocument(tables=[], lines=[], attachments=[], warnings=[warning])
+
+        if response.status_code != 200:
+            warning = f"GROBID returns code {response.status_code}."
+            self.logger.warning(warning)
+            return UnstructuredDocument(tables=[], lines=[], attachments=[], warnings=[warning])
+
+        soup = BeautifulSoup(response.text, features="lxml")
+        lines = self.__parse_title(soup)
+
+        # authors are taken only from the first bibliographic structure of the response
+        if soup.biblstruct is not None:
+            for author in soup.biblstruct.find_all("author"):
+                lines.extend(self.__parse_author(author))
+
+        # the uid dictionaries are needed to annotate references inside the text
+        bib_lines, bib2uid = self.__parse_bibliography(soup)
+        tables, table2uid = self.__parse_tables(soup)
+
+        lines.extend(self.__parse_text(soup, bib2uid, table2uid))
+        lines.extend(bib_lines)
+
+        return UnstructuredDocument(tables=tables, lines=lines, attachments=[], warnings=["use GROBID (version: 0.8.0)"])
+
+    def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool:
+        """
+        Check that all the conditions hold:
+
+        * parameter "document_type" is "article";
+        * GROBID service is alive (its address is taken from ``GROBID_HOST`` and ``GROBID_PORT``, ``localhost:8070`` by default);
+        * the document is suitable for this reader (pdf mime type and .pdf extension).
+
+        Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters.
+        """
+        if get_param_document_type(parameters) != "article":
+            return False
+
+        # a single attempt: the check is skipped inside the method if the service is already known to be alive
+        self.__update_grobid_alive(self.grobid_url, max_attempts=1)
+        if self.grobid_is_alive:
+            mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
+            return mime in recognized_mimes.pdf_like_format and extension.lower() == ".pdf"
+
+        return False
+
+    def __update_grobid_alive(self, grobid_url: str, max_attempts: int = 2) -> None:
+        """
+        Request ``{grobid_url}/api/isalive`` and save the result to ``self.grobid_is_alive``.
+
+        Nothing is done if the service is already marked as alive.
+        Otherwise, at most ``max_attempts`` requests are sent, the flag is set to True after the first response with code 200.
+        After a connection error the method waits 5 seconds before the next attempt; there is no waiting after the last attempt.
+
+        :param grobid_url: base url of the GROBID service
+        :param max_attempts: maximal number of requests
+        """
+        if self.grobid_is_alive:
+            return
+
+        attempts_left = max_attempts
+        while attempts_left > 0:
+            attempts_left -= 1
+            try:
+                response = requests.get(f"{grobid_url}/api/isalive")
+                if response.status_code == 200:
+                    self.logger.info(f"GROBID up on {grobid_url}.")
+                    self.grobid_is_alive = True
+                    return
+            except requests.exceptions.ConnectionError as ex:
+                self.logger.warning(f"GROBID doesn't response. Check GROBID service on {self.url}. Exception's msg: {ex}")
+                if attempts_left > 0:
+                    # pause only when one more attempt follows: a pause after the final attempt just delays the caller
+                    time.sleep(5)
+
+        self.grobid_is_alive = False
+
+    def __get_tag_by_hierarchy_path(self, source: Tag, hierarchy_path: List[str]) -> Optional[str]:
+        """
+        Go down from ``source`` through the nested tags listed in ``hierarchy_path`` and return the text of the last tag.
+
+        :param source: tag where the search starts
+        :param hierarchy_path: names of tags, each name is searched inside the tag found for the previous name
+        :return: text of the last tag of the path, empty string if some tag of the path is not found
+        """
+        node = source
+        for tag_name in hierarchy_path:
+            node = node.find(tag_name)
+            if node is None:
+                # the path is broken, there is nothing to return
+                return ""
+
+        return self.__tag2text(node)
+
+    def __create_line(self, text: str, hierarchy_level_id: Optional[int] = None, paragraph_type: Optional[str] = None,
+                      annotations: Optional[List[Annotation]] = None, other_fields: Optional[Dict] = None) -> LineWithMeta:
+        """
+        Make a line with the given text; ``page_id`` and ``line_id`` of its metadata are always 0.
+
+        :param text: text of the line, only strings are allowed
+        :param hierarchy_level_id: value for ``level_1`` of the tag hierarchy level
+        :param paragraph_type: value for ``line_type`` of the tag hierarchy level
+        :param annotations: annotations of the line
+        :param other_fields: additional fields for the line metadata, empty dictionary by default
+        :return: line with the raw text hierarchy level if ``hierarchy_level_id`` or ``paragraph_type`` is None, else with the given level and type
+        """
+        # TODO check on improve
+        assert text is not None
+        assert isinstance(text, str)
+
+        level_is_given = hierarchy_level_id is not None and paragraph_type is not None
+        if level_is_given:
+            level = HierarchyLevel(level_1=hierarchy_level_id, level_2=0, can_be_multiline=False, line_type=paragraph_type)
+        else:
+            level = HierarchyLevel.create_raw_text()
+
+        fields = {} if other_fields is None else other_fields
+        metadata = LineMetadata(page_id=0, line_id=0, tag_hierarchy_level=level, other_fields=fields)
+        return LineWithMeta(line=text, metadata=metadata, annotations=annotations)
+
+    def __parse_affiliation(self, affiliation_tag: Tag) -> List[LineWithMeta]:
+        """
+        Convert one ``affiliation`` tag into lines.
+
+        :param affiliation_tag: tag with the affiliation of an author
+        :return: line "author_affiliation" (its text is the ``key`` attribute of the tag, empty if the attribute is absent),
+            then lines "org_name" and "address" for those of the nested tags ``orgname`` and ``address`` that are present
+        """
+        # Tag.get returns None for a missing attribute, but __create_line accepts only strings: use an empty string instead
+        key = affiliation_tag.get("key", "")
+        lines = [self.__create_line(text=key, hierarchy_level_id=2, paragraph_type="author_affiliation")]
+
+        if affiliation_tag.orgname:
+            lines.append(self.__create_line(text=self.__tag2text(affiliation_tag.orgname), hierarchy_level_id=3, paragraph_type="org_name"))
+
+        if affiliation_tag.address:
+            lines.append(self.__create_line(text=affiliation_tag.address.text, hierarchy_level_id=3, paragraph_type="address"))
+
+        return lines
+
+    def __parse_author(self, author_tag: Tag) -> List[LineWithMeta]:
+        """
+        Convert one ``author`` tag into lines: an empty "author" line, then the first name, the surname, emails and affiliations (if present).
+
+        Example of the author XML tag
+        (NOTE(review): the markup of the example is restored according to the GROBID TEI format, confirm against a real response):
+
+        <author>
+            <persName><forename type="first">Sonia</forename><surname>Belaïd</surname></persName>
+            <affiliation key="aff0">
+                <orgName type="institution">École Normale Supérieure</orgName>
+                <address>
+                    <addrLine>45 rue dUlm</addrLine>
+                    <postCode>75005</postCode>
+                    <settlement>Paris</settlement>
+                </address>
+            </affiliation>
+            <affiliation key="aff1">
+                <orgName type="institution">Thales Communications &amp; Security</orgName>
+                <address>
+                    <addrLine>4 Avenue des Louvresses</addrLine>
+                    <postCode>92230</postCode>
+                    <settlement>Gennevilliers</settlement>
+                </address>
+            </affiliation>
+        </author>
+
+        :param author_tag: tag with the information about one author
+        :return: lines with the information about the author
+        """
+        lines = [self.__create_line(text="", hierarchy_level_id=1, paragraph_type="author")]
+
+        # both parts of the name are nested into the "persname" tag
+        name_parts = (("forename", "author_first_name"), ("surname", "author_surname"))
+        for tag_name, line_type in name_parts:
+            value = self.__get_tag_by_hierarchy_path(author_tag, ["persname", tag_name])
+            if value:
+                lines.append(self.__create_line(text=value, hierarchy_level_id=2, paragraph_type=line_type))
+
+        for email in author_tag.find_all("email"):
+            if email:
+                lines.append(self.__create_line(text=email.get_text(), hierarchy_level_id=3, paragraph_type="email"))
+
+        for affiliation in author_tag.find_all("affiliation"):
+            lines.extend(self.__parse_affiliation(affiliation))
+
+        return lines
+
+    def __create_line_with_refs(self, content: List[Tuple[str, Tag]], bib2uid: dict, table2uid: dict) -> LineWithMeta:
+        """
+        Make a raw text line from the content of a paragraph and annotate the references found in it.
+
+        Strings are added to the line text as is. For each ``ref`` tag its text is added to the line, and the added span gets
+        ReferenceAnnotation (type "bibr", target is a key of ``bib2uid``) or TableAnnotation (type "table", target is a key of ``table2uid``).
+        Tags with other names are skipped, their text isn't added to the line.
+
+        :param content: content of the paragraph: a string or a sequence of strings and tags
+        :param bib2uid: dictionary used to get the ReferenceAnnotation value by the ``target`` attribute of the ``ref`` tag
+        :param table2uid: dictionary used to get the TableAnnotation name by the ``target`` attribute of the ``ref`` tag
+        :return: line with the joined text and the list of annotations
+        """
+        # a plain string is a single fragment of text, there is no need to iterate over its characters
+        fragments = [content] if isinstance(content, str) else content
+        text_parts = []
+        start = 0
+        annotations = []
+
+        for fragment in fragments:
+            if isinstance(fragment, Tag) and fragment.name == "ref":
+                # Tag.string is None for a tag with several children, get_text() returns a string in any case
+                sub_text = fragment.get_text()
+                target = fragment.get("target")
+                ref_type = fragment.get("type")
+                end = start + len(sub_text)
+                if ref_type == "bibr" and target in bib2uid:
+                    annotations.append(ReferenceAnnotation(value=bib2uid[target], start=start, end=end))
+                if ref_type == "table" and target in table2uid:
+                    annotations.append(TableAnnotation(name=table2uid[target], start=start, end=end))
+            else:
+                sub_text = fragment if isinstance(fragment, str) else ""
+
+            text_parts.append(sub_text)
+            start += len(sub_text)
+
+        return self.__create_line(text="".join(text_parts), hierarchy_level_id=None, paragraph_type=None, annotations=annotations)
+
+    def __parse_text(self, soup: Tag, bib2uid: dict, table2uid: dict) -> List[LineWithMeta]:
+        """
+        Extract the abstract and the sections of the article.
+
+        Example of section XML tag
+        (NOTE(review): the markup of the example is restored according to the GROBID TEI format, confirm against a real response):
+        <div xmlns="http://www.tei-c.org/ns/1.0"><head n="1.1">Preprocessing</head><p>...</p><p>...</p></div>
+
+        :param soup: parsed response of GROBID
+        :param bib2uid: dictionary passed to ``__create_line_with_refs`` to annotate bibliography references
+        :param table2uid: dictionary passed to ``__create_line_with_refs`` to annotate table references
+        :return: "Abstract" header line with the abstract text line, then for each ``div`` of the ``text`` tags
+            a "section" line (if the ``div`` isn't empty) and raw text lines of its paragraphs
+        """
+        lines = []
+
+        # the abstract tag may be absent in the response: the abstract text is empty in this case
+        abstract_tag = soup.find("abstract")
+        abstract = abstract_tag.p if abstract_tag is not None else None
+        lines.append(self.__create_line(text="Abstract", hierarchy_level_id=1, paragraph_type="abstract"))
+        lines.append(self.__create_line(text=self.__tag2text(abstract)))
+
+        for text in soup.find_all("text"):
+            for part in text.find_all("div"):
+                # TODO: Beautifulsoup doesn't read <head> tags from input XML file. WTF!
+                #  As a result we lose section number in text (see example above)
+                #  Need to fix this in the future.
+                # the head tag may have no "n" attribute (section without a number)
+                section_number = part.head.get("n") if part.head else None
+                number = f"{section_number} " if section_number else ""
+
+                # NOTE(review): if the first child of the div is a tag, its markup becomes the section title - confirm that this is intended
+                line_text = str(part.contents[0]) if part.contents else ""
+                if line_text:
+                    lines.append(self.__create_line(text=number + line_text, hierarchy_level_id=1, paragraph_type="section"))
+
+                for subpart in part.find_all("p"):
+                    if subpart.string is not None:
+                        lines.append(self.__create_line_with_refs(subpart.string, bib2uid, table2uid))
+                    elif subpart.contents:
+                        lines.append(self.__create_line_with_refs(subpart.contents, bib2uid, table2uid))
+
+        return lines
+
+    @staticmethod
+    def __tag2text(tag: Tag) -> str:
+        """
+        Return ``tag.string``; an empty string is returned if the tag is missing (falsy) or its ``string`` is None or empty.
+        """
+        if not tag:
+            return ""
+        return tag.string or ""
+
+ def __parse_tables(self, soup: Tag) -> Tuple[List[Table], dict]:
+ """
+ Example Table with table's ref:
+ -----------------------------------------------
+ Table Reference Example:
+ 1
+ ...
+ Table Example:
+
+ Table 1 .
+
+ Performance of some illustrative AES implementations.
+