Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TLDR-645 added grobid #422

Merged
merged 13 commits into from
Apr 17, 2024
Merged
1 change: 1 addition & 0 deletions dedoc/api/schema/table_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ class TableMetadata(BaseModel):
page_id: Optional[int] = Field(description="Number of the page where the table starts", example=0)
uid: str = Field(description="Unique identifier of the table", example="e8ba5523-8546-4804-898c-2f4835a1804f")
rotated_angle: float = Field(description="Value of the rotation angle (in degrees) by which the table was rotated during recognition", example=1.0)
title: str = Field(description="Table's title")
1 change: 1 addition & 0 deletions dedoc/api/web/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ <h4>Type of document structure parsing</h4>
<option value="law">law</option>
<option value="tz">tz</option>
<option value="diploma">diploma</option>
<option value="article">article</option>
NastyBoget marked this conversation as resolved.
Show resolved Hide resolved
</select> document_type
</label>
</p>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def can_extract(self,
"""
Checks if this extractor can get attachments from the document (it should have .docx extension)
"""
extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in recognized_extensions.docx_like_format or mime in recognized_mimes.docx_like_format

def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def can_extract(self,
"""
Checks if this extractor can get attachments from the document (it should have .xlsx extension)
"""
extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in recognized_extensions.excel_like_format or mime in recognized_mimes.excel_like_format

def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def can_extract(self,
"""
Checks if this extractor can get attachments from the document (it should have .json extension)
"""
extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower().endswith(".json")

def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def can_extract(self,
"""
Checks if this extractor can get attachments from the document (it should have .pptx extension)
"""
extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in recognized_extensions.pptx_like_format or mime in recognized_mimes.pptx_like_format

def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]:
Expand Down
5 changes: 4 additions & 1 deletion dedoc/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,10 @@
# TESSERACT OCR confidence threshold (values: -1 - undefined; [0.0, 100.0] % - confidence value)
ocr_conf_threshold=40.0,
# max depth of document structure tree
recursion_deep_subparagraphs=30
recursion_deep_subparagraphs=30,

# -------------------------------------------EXTERNAL SERVICES SETTINGS---------------------------------------------
grobid_max_connection_attempts=3
)


Expand Down
2 changes: 1 addition & 1 deletion dedoc/converters/concrete_converters/binary_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def can_convert(self,
"""
Checks if the document is image-like (e.g. it has .bmp, .jpg, .tiff, etc. extension) and has `mime=application/octet-stream`.
"""
extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return mime == "application/octet-stream" and extension in supported_image_types

def convert(self, file_path: str, parameters: Optional[dict] = None) -> str:
Expand Down
2 changes: 1 addition & 1 deletion dedoc/converters/concrete_converters/docx_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def can_convert(self,
"""
Checks if the document is docx-like, e.g. it has .doc, .rtf or .odt extension.
"""
extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in converted_extensions.docx_like_format or mime in converted_mimes.docx_like_format

def convert(self, file_path: str, parameters: Optional[dict] = None) -> str:
Expand Down
2 changes: 1 addition & 1 deletion dedoc/converters/concrete_converters/excel_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def can_convert(self,
"""
Checks if the document is xlsx-like, e.g. it has .xls or .ods extension.
"""
extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in converted_extensions.excel_like_format or mime in converted_mimes.excel_like_format

def convert(self, file_path: str, parameters: Optional[dict] = None) -> str:
Expand Down
2 changes: 1 addition & 1 deletion dedoc/converters/concrete_converters/pdf_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def can_convert(self,
"""
Checks if the document is pdf-like, e.g. it has .djvu extension.
"""
extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in converted_extensions.pdf_like_format or mime in converted_mimes.pdf_like_format

def convert(self, file_path: str, parameters: Optional[dict] = None) -> str:
Expand Down
2 changes: 1 addition & 1 deletion dedoc/converters/concrete_converters/png_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def can_convert(self,
"""
Checks if the document is image-like, e.g. it has .bmp, .jpg, .tiff, etc. extension.
"""
extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in converted_extensions.image_like_format or mime in converted_mimes.image_like_format

def convert(self, file_path: str, parameters: Optional[dict] = None) -> str:
Expand Down
2 changes: 1 addition & 1 deletion dedoc/converters/concrete_converters/pptx_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def can_convert(self,
"""
Checks if the document is pptx-like, e.g. it has .ppt or .odp extension.
"""
extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in converted_extensions.pptx_like_format or mime in converted_mimes.pptx_like_format

def convert(self, file_path: str, parameters: Optional[dict] = None) -> str:
Expand Down
2 changes: 1 addition & 1 deletion dedoc/converters/concrete_converters/txt_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def can_convert(self,
"""
Checks if the document is txt-like, e.g. it has .xml extension.
"""
extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in converted_extensions.txt_like_format or mime in converted_mimes.txt_like_format

def convert(self, file_path: str, parameters: Optional[dict] = None) -> str:
Expand Down
2 changes: 1 addition & 1 deletion dedoc/converters/converter_composition.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def convert(self, file_path: str, parameters: Optional[dict] = None) -> str:
:param parameters: parameters of converting, see :ref:`parameters_description` for more details
:return: path of converted file if conversion was executed else path of the original file
"""
extension, mime = get_mime_extension(file_path=file_path)
mime, extension = get_mime_extension(file_path=file_path)
converted_file_path = file_path

for converter in self.converters:
Expand Down
3 changes: 2 additions & 1 deletion dedoc/data_structures/concrete_annotations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
from .superscript_annotation import SuperscriptAnnotation
from .table_annotation import TableAnnotation
from .underlined_annotation import UnderlinedAnnotation
from .reference_annotation import ReferenceAnnotation

__all__ = ['AlignmentAnnotation', 'AttachAnnotation', 'BBoxAnnotation', 'BoldAnnotation', 'ColorAnnotation', 'ConfidenceAnnotation',
'IndentationAnnotation', 'ItalicAnnotation', 'LinkedTextAnnotation', 'SizeAnnotation', 'SpacingAnnotation', 'StrikeAnnotation',
'StyleAnnotation', 'SubscriptAnnotation', 'SuperscriptAnnotation', 'TableAnnotation', 'UnderlinedAnnotation']
'StyleAnnotation', 'SubscriptAnnotation', 'SuperscriptAnnotation', 'TableAnnotation', 'UnderlinedAnnotation', 'ReferenceAnnotation']
43 changes: 43 additions & 0 deletions dedoc/data_structures/concrete_annotations/reference_annotation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from dedoc.data_structures.annotation import Annotation


class ReferenceAnnotation(Annotation):
    """
    This annotation marks a span of the line's text that is a link to another line of the document (for example, another textual line).

    Example of usage for document_type="article": a link to a bibliography_item :class:`~dedoc.data_structures.LineWithMeta`.

    LineWithMeta:

    .. code-block:: python

        LineWithMeta(  # the line with the reference annotation
            line="As for the PRF, we use the tree-based construction from Goldreich, Goldwasser and Micali [18]",
            metadata=LineMetadata(page_id=0, line_id=32),
            annotations=[ReferenceAnnotation(start=90, end=92, value="97cfac39-f0e3-11ee-b81c-b88584b4e4a1"), ...]
        )

    other LineWithMeta:

    .. code-block:: python

        LineWithMeta(  # the line referenced by the previous one
            line="some text (can be empty)",
            metadata=LineMetadata(
                page_id=10,
                line_id=189,
                tag_hierarchy_level=HierarchyLevel(level_1=2, level_2=0, line_type="bibliography_item"),
                other_fields={"uid": "97cfac39-f0e3-11ee-b81c-b88584b4e4a1"}
            ),
            annotations=[]
        )
    """
    # Annotation type identifier; passed as ``name`` to the base class constructor below.
    name = "reference"

    def __init__(self, value: str, start: int, end: int) -> None:
        """
        :param value: unique identifier of the line to which this annotation refers
        :param start: start of the annotated text with a link
        :param end: end of the annotated text with a link
        """
        # The annotation is created with is_mergeable=False.
        super().__init__(start=start, end=end, name=ReferenceAnnotation.name, value=value, is_mergeable=False)
2 changes: 1 addition & 1 deletion dedoc/data_structures/line_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@ def __init__(self,
self.hierarchy_level = hierarchy_level
self.page_id = page_id
self.line_id = line_id
self.__other_fields = {}
if other_fields is not None and len(other_fields) > 0:
self.extend_other_fields(other_fields)
self.__other_fields = {}

def extend_other_fields(self, new_fields: dict) -> None:
"""
Expand Down
3 changes: 2 additions & 1 deletion dedoc/data_structures/line_with_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,8 @@ def set_line(self, line: str) -> None:
self._line = line

def __repr__(self) -> str:
return f"LineWithMeta({self.line[:65]})"
return (f"LineWithMeta({self.line[:65]}, "
f"tagHL={self.metadata.tag_hierarchy_level.level_1, self.metadata.tag_hierarchy_level.level_2, self.metadata.tag_hierarchy_level.line_type})")

def __add__(self, other: Union["LineWithMeta", str]) -> "LineWithMeta":
assert isinstance(other, (LineWithMeta, str))
Expand Down
6 changes: 4 additions & 2 deletions dedoc/data_structures/table_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,17 @@ class TableMetadata(Serializable):
"""
This class holds the information about table unique identifier, rotation angle (if table has been rotated - for images) and so on.
"""
def __init__(self, page_id: Optional[int], uid: Optional[str] = None, rotated_angle: float = 0.0) -> None:
def __init__(self, page_id: Optional[int], uid: Optional[str] = None, rotated_angle: float = 0.0, title: str = "") -> None:
"""
:param page_id: number of the page where table starts
:param uid: unique identifier of the table
:param rotated_angle: value of the rotation angle by which the table was rotated during recognition
:param title: table's title
"""
self.page_id = page_id
self.uid = str(uuid.uuid4()) if not uid else uid
self.rotated_angle = rotated_angle
self.title = title

def to_api_schema(self) -> ApiTableMetadata:
return ApiTableMetadata(uid=self.uid, page_id=self.page_id, rotated_angle=self.rotated_angle)
return ApiTableMetadata(uid=self.uid, page_id=self.page_id, rotated_angle=self.rotated_angle, title=self.title)
3 changes: 3 additions & 0 deletions dedoc/manager_config.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from typing import Optional

from dedoc.readers.article_reader.article_reader import ArticleReader


def _get_manager_config(config: dict) -> dict:
"""
Expand Down Expand Up @@ -57,6 +59,7 @@ def _get_manager_config(config: dict) -> dict:
BinaryConverter(config=config)
]
readers = [
ArticleReader(config=config),
DocxReader(config=config),
ExcelReader(config=config),
PptxReader(config=config),
Expand Down
3 changes: 2 additions & 1 deletion dedoc/readers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from .archive_reader.archive_reader import ArchiveReader
from .article_reader.article_reader import ArticleReader
from .base_reader import BaseReader
from .csv_reader.csv_reader import CSVReader
from .docx_reader.docx_reader import DocxReader
Expand All @@ -17,6 +18,6 @@
from .reader_composition import ReaderComposition
from .txt_reader.raw_text_reader import RawTextReader

__all__ = ['ArchiveReader', 'BaseReader', 'CSVReader', 'DocxReader', 'EmailReader', 'ExcelReader', 'HtmlReader', 'JsonReader', 'MhtmlReader',
__all__ = ['ArchiveReader', 'ArticleReader', 'BaseReader', 'CSVReader', 'DocxReader', 'EmailReader', 'ExcelReader', 'HtmlReader', 'JsonReader', 'MhtmlReader',
'NoteReader', 'PptxReader', 'ReaderComposition', 'RawTextReader',
'PdfBaseReader', 'PdfImageReader', 'PdfTabbyReader', 'PdfTxtlayerReader', 'PdfAutoReader']
2 changes: 1 addition & 1 deletion dedoc/readers/archive_reader/archive_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None,
Check if the document extension is suitable for this reader.
Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters.
"""
extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in recognized_extensions.archive_like_format or mime in recognized_mimes.archive_like_format

def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
Expand Down
Empty file.
Loading
Loading