Skip to content

Commit

Permalink
TLDR-748 review fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
NastyBoget committed Sep 2, 2024
1 parent 2b5e47c commit 08b9c21
Show file tree
Hide file tree
Showing 18 changed files with 123 additions and 54 deletions.
3 changes: 2 additions & 1 deletion dedoc/readers/pdf_reader/pdf_base_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,8 @@ def __init__(self, *, config: Optional[dict] = None, recognized_extensions: Opti
def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
"""
The method returns document content with all document's lines, tables and attachments.
This reader is able to add some additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`.
This reader is able to add some additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`
(``can_be_multiline`` attribute is important for paragraph extraction).
Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
You can also see :ref:`pdf_handling_parameters` to get more information about `parameters` dictionary possible arguments.
Expand Down
2 changes: 1 addition & 1 deletion dedoc/readers/txt_reader/raw_text_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None,

def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
"""
This method returns only document lines, some types of the lines (e.g. `list_item`) may be found using regular expressions.
This method returns only document lines.
Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
"""
parameters = {} if parameters is None else parameters
Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
from typing import List, Optional
from typing import Optional

from dedoc.common.exceptions.structure_extractor_error import StructureExtractorError
from dedoc.data_structures.hierarchy_level import HierarchyLevel
from dedoc.data_structures.unstructured_document import UnstructuredDocument
from dedoc.structure_extractors.abstract_structure_extractor import AbstractStructureExtractor
from dedoc.structure_extractors.patterns.abstract_pattern import AbstractPattern
from dedoc.structure_extractors.patterns.pattern_composition import PatternComposition


class DefaultStructureExtractor(AbstractStructureExtractor):
Expand All @@ -25,40 +26,36 @@ def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = N
Please see :ref:`dedoc_structure_extractors_patterns` and :ref:`using_patterns` to get information how to use patterns for making your custom structure.
"""
parameters = {} if parameters is None else parameters
patterns = self.__get_patterns(parameters)
pattern_composition = self.__get_pattern_composition(parameters)

for line in document.lines:
line_pattern = None
for pattern in patterns:
if pattern.match(line):
line_pattern = pattern
break

line.metadata.hierarchy_level = line_pattern.get_hierarchy_level(line) if line_pattern else HierarchyLevel.create_raw_text()
assert line.metadata.hierarchy_level is not None

line.metadata.hierarchy_level = pattern_composition.get_hierarchy_level(line=line)
return document

def __get_patterns(self, parameters: dict) -> List[AbstractPattern]:
def __get_pattern_composition(self, parameters: dict) -> PatternComposition:
patterns = parameters.get("patterns")
if not patterns:
from dedoc.structure_extractors.patterns.bracket_list_pattern import BracketListPattern
from dedoc.structure_extractors.patterns.bullet_list_pattern import BulletListPattern
from dedoc.structure_extractors.patterns.dotted_list_pattern import DottedListPattern
from dedoc.structure_extractors.patterns.letter_list_pattern import LetterListPattern
from dedoc.structure_extractors.patterns.roman_list_pattern import RomanListPattern
from dedoc.structure_extractors.patterns.tag_header_pattern import TagHeaderPattern
from dedoc.structure_extractors.patterns.tag_list_pattern import TagListPattern
from dedoc.structure_extractors.patterns.tag_pattern import TagPattern

return [
TagHeaderPattern(line_type=HierarchyLevel.header, level_1=1, can_be_multiline=False),
TagListPattern(line_type=HierarchyLevel.list_item, default_level_1=2, can_be_multiline=False),
DottedListPattern(line_type=HierarchyLevel.list_item, level_1=2, can_be_multiline=False),
BracketListPattern(line_type=HierarchyLevel.list_item, level_1=3, level_2=1, can_be_multiline=False),
LetterListPattern(line_type=HierarchyLevel.list_item, level_1=4, level_2=1, can_be_multiline=False),
BulletListPattern(line_type=HierarchyLevel.list_item, level_1=5, level_2=1, can_be_multiline=False),
TagPattern(default_line_type=HierarchyLevel.raw_text)
]
return PatternComposition(
patterns=[
TagHeaderPattern(line_type=HierarchyLevel.header, level_1=1, can_be_multiline=False),
TagListPattern(line_type=HierarchyLevel.list_item, default_level_1=2, can_be_multiline=False),
DottedListPattern(line_type=HierarchyLevel.list_item, level_1=2, can_be_multiline=False),
RomanListPattern(line_type=HierarchyLevel.list_item, level_1=3, level_2=1, can_be_multiline=False),
BracketListPattern(line_type=HierarchyLevel.list_item, level_1=4, level_2=1, can_be_multiline=False),
LetterListPattern(line_type=HierarchyLevel.list_item, level_1=5, level_2=1, can_be_multiline=False),
BulletListPattern(line_type=HierarchyLevel.list_item, level_1=6, level_2=1, can_be_multiline=False),
TagPattern(default_line_type=HierarchyLevel.raw_text)
]
)

import ast
from dedoc.structure_extractors.patterns.utils import get_pattern
Expand All @@ -81,4 +78,4 @@ def __get_patterns(self, parameters: dict) -> List[AbstractPattern]:
else:
raise StructureExtractorError(msg="Pattern should be dict or `AbstractPattern`")

return pattern_classes
return PatternComposition(patterns=pattern_classes)
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ class BracketRomanPrefix(LinePrefix):
iv) fourth item
"""

regexp = re.compile(r"^\s*[ivxl]\)")
regexp = re.compile(r"^\s*[ivxlcdm]\)")
name = "roman"

def __init__(self, prefix: str, indent: float) -> None:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ class RomanPrefix(LinePrefix):
IV. fourth item
"""

regexp = re.compile(r"^\s*[ivxl]\.")
regexp = re.compile(r"^\s*[ivxlcdm]\.")
name = "roman"

def __init__(self, prefix: str, indent: float) -> None:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
AbstractBodyHierarchyLevelBuilder
from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import regexps_digits_with_dots
from dedoc.structure_extractors.patterns import BracketListPattern, BulletListPattern, DottedListPattern, LetterListPattern, TagListPattern, TagPattern
from dedoc.structure_extractors.patterns.pattern_composition import PatternComposition


class DiplomaBodyBuilder(AbstractHierarchyLevelBuilder):
Expand All @@ -17,14 +18,16 @@ class DiplomaBodyBuilder(AbstractHierarchyLevelBuilder):
def __init__(self) -> None:
super().__init__()
self.digits_with_dots_regexp = regexps_digits_with_dots
self.patterns = [
TagListPattern(line_type=HierarchyLevel.list_item, default_level_1=2, can_be_multiline=False),
DottedListPattern(line_type=HierarchyLevel.list_item, level_1=2, can_be_multiline=False),
BracketListPattern(line_type=HierarchyLevel.list_item, level_1=3, level_2=1, can_be_multiline=False),
LetterListPattern(line_type=HierarchyLevel.list_item, level_1=4, level_2=1, can_be_multiline=False),
BulletListPattern(line_type=HierarchyLevel.list_item, level_1=5, level_2=1, can_be_multiline=False),
TagPattern(line_type=HierarchyLevel.raw_text)
]
self.pattern_composition = PatternComposition(
[
TagListPattern(line_type=HierarchyLevel.list_item, default_level_1=2, can_be_multiline=False),
DottedListPattern(line_type=HierarchyLevel.list_item, level_1=2, can_be_multiline=False),
BracketListPattern(line_type=HierarchyLevel.list_item, level_1=3, level_2=1, can_be_multiline=False),
LetterListPattern(line_type=HierarchyLevel.list_item, level_1=4, level_2=1, can_be_multiline=False),
BulletListPattern(line_type=HierarchyLevel.list_item, level_1=5, level_2=1, can_be_multiline=False),
TagPattern(line_type=HierarchyLevel.raw_text)
]
)

def get_lines_with_hierarchy(self, lines_with_labels: List[Tuple[LineWithMeta, str]], init_hl_depth: int) -> List[LineWithMeta]:
if len(lines_with_labels) > 0:
Expand All @@ -51,7 +54,7 @@ def get_lines_with_hierarchy(self, lines_with_labels: List[Tuple[LineWithMeta, s
elif prediction == "raw_text":
line = self.__postprocess_raw_text(line, init_hl_depth)
if not (line.metadata.hierarchy_level is not None and line.metadata.hierarchy_level.line_type == "named_item"):
line.metadata.hierarchy_level = self.__get_level_by_patterns(line)
line.metadata.hierarchy_level = self.pattern_composition.get_hierarchy_level(line)
else:
line.metadata.hierarchy_level = HierarchyLevel.create_raw_text()
line.metadata.hierarchy_level.line_type = prediction
Expand All @@ -75,15 +78,6 @@ def __handle_named_item(self, init_hl_depth: int, line: LineWithMeta, prediction
line.metadata.hierarchy_level = hierarchy_level
return line

def __get_level_by_patterns(self, line: LineWithMeta) -> HierarchyLevel:
line_pattern = None
for pattern in self.patterns:
if pattern.match(line):
line_pattern = pattern
break

return line_pattern.get_hierarchy_level(line) if line_pattern else HierarchyLevel.create_raw_text()

def __postprocess_raw_text(self, line: LineWithMeta, init_hl_depth: int) -> LineWithMeta:
text = line.line.strip().lower()
if not text.startswith(self.named_item_keywords):
Expand Down
2 changes: 1 addition & 1 deletion dedoc/structure_extractors/patterns/abstract_pattern.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ class AbstractPattern(ABC):

def __init__(self, line_type: Optional[str], level_1: Optional[int], level_2: Optional[int], can_be_multiline: Optional[Union[bool, str]]) -> None:
"""
Initialise pattern with default values of :class:`~dedoc.data_structures.HierarchyLevel` attributes.
Initialize pattern with default values of :class:`~dedoc.data_structures.HierarchyLevel` attributes.
They can be used in :meth:`~dedoc.structure_extractors.patterns.abstract_pattern.AbstractPattern.get_hierarchy_level`
according to specific pattern logic.
Expand Down
56 changes: 56 additions & 0 deletions dedoc/structure_extractors/patterns/pattern_composition.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
from typing import List

from dedoc.data_structures.hierarchy_level import HierarchyLevel
from dedoc.data_structures.line_with_meta import LineWithMeta
from dedoc.structure_extractors.patterns.abstract_pattern import AbstractPattern


class PatternComposition:
    """
    Helper that runs an ordered list of patterns against a line in order to obtain the line's hierarchy level.

    Example of usage:

    .. code-block:: python

        from dedoc.data_structures.line_with_meta import LineWithMeta
        from dedoc.structure_extractors.patterns import TagListPattern, TagPattern
        from dedoc.structure_extractors.patterns.pattern_composition import PatternComposition

        pattern_composition = PatternComposition(
            patterns=[
                TagListPattern(line_type="list_item", default_level_1=2, can_be_multiline=False),
                TagPattern(default_line_type="raw_text")
            ]
        )
        line = LineWithMeta(line="Some text")
        line.metadata.hierarchy_level = pattern_composition.get_hierarchy_level(line=line)
    """
    def __init__(self, patterns: List[AbstractPattern]) -> None:
        """
        Store the patterns that will be tried on each line.

        **Note:** patterns are tried in the given order and the first one that matches wins,
        so more specific patterns should be placed before more general ones.
        Otherwise, a general pattern that is also applicable to the line may shadow the specific one.

        :param patterns: list of patterns to apply to lines.
        """
        # The list is kept by reference (no copy is made).
        self.patterns = patterns

    def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel:
        """
        Compute the hierarchy level of the given line using the first pattern whose ``match`` accepts it.

        If none of the patterns is applicable, the default ``raw_text`` :class:`~dedoc.data_structures.HierarchyLevel` is returned.

        :param line: line to get hierarchy level for.
        :return: hierarchy level produced by the first matching pattern, or the default ``raw_text`` level.
        """
        # Lazily scan the patterns; the scan stops at the first successful match.
        chosen = next((candidate for candidate in self.patterns if candidate.match(line)), None)

        if not chosen:
            return HierarchyLevel.create_raw_text()
        return chosen.get_hierarchy_level(line)
2 changes: 1 addition & 1 deletion dedoc/structure_extractors/patterns/regexp_pattern.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def __init__(self,
level_2: Optional[int] = None,
can_be_multiline: Optional[Union[bool, str]] = None) -> None:
"""
Initialise pattern with default values of :class:`~dedoc.data_structures.HierarchyLevel` attributes.
Initialize pattern with default values of :class:`~dedoc.data_structures.HierarchyLevel` attributes.
:param regexp: regular expression for checking, if the line text matches the pattern.
Note that regular expression is used on the lowercase and stripped line.
Expand Down
2 changes: 1 addition & 1 deletion dedoc/structure_extractors/patterns/start_word_pattern.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def __init__(self,
level_2: Optional[int] = None,
can_be_multiline: Optional[Union[bool, str]] = None) -> None:
"""
Initialise pattern with default values of :class:`~dedoc.data_structures.HierarchyLevel` attributes.
Initialize pattern with default values of :class:`~dedoc.data_structures.HierarchyLevel` attributes.
:param start_word: string for checking of line text beginning.
Note that start_word will be stripped and made lowercase, and will be used on the lowercase and stripped line.
Expand Down
6 changes: 3 additions & 3 deletions dedoc/structure_extractors/patterns/tag_pattern.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def __init__(self,
default_level_1: Optional[int] = None,
default_level_2: Optional[int] = None) -> None:
"""
Initialise pattern for configuring values of :class:`~dedoc.data_structures.HierarchyLevel` attributes.
Initialize pattern for configuring values of :class:`~dedoc.data_structures.HierarchyLevel` attributes.
It is recommended to configure ``default_*`` values in case ``line.metadata.tag_hierarchy_level`` misses some values.
If you want to use values from ``line.metadata.tag_hierarchy_level``, it is recommended to leave
``line_type``, ``level_1``, ``level_2``, ``can_be_multiline`` empty.
Expand Down Expand Up @@ -88,7 +88,7 @@ def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel:
returned ``True`` for the given line.
Return :class:`~dedoc.data_structures.HierarchyLevel` for initializing ``line.metadata.hierarchy_level``.
The attribute ``line_type`` is initialised according to the following rules:
The attribute ``line_type`` is initialized according to the following rules:
* if non-empty ``line_type`` is given during pattern initialization, then its value is used in the result;
* if ``line_type`` is not given (or ``None`` is given) and ``line.metadata.tag_hierarchy_level`` is not ``unknown``, \
Expand All @@ -97,7 +97,7 @@ def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel:
Similar rules work for ``level_1`` and ``level_2`` with comparing with ``None`` instead of ``unknown``.
The ``can_be_multiline`` attribute is initialised according to the following rules:
The ``can_be_multiline`` attribute is initialized according to the following rules:
* if non-empty ``can_be_multiline`` is given during pattern initialization, then its value is used in the result;
* otherwise ``can_be_multiline`` value from ``line.metadata.tag_hierarchy_level`` is used in the result.
Expand Down
2 changes: 1 addition & 1 deletion docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
("py:class", "train_dataset.data_structures.line_with_label.LineWithLabel"),
("py:class", "xgboost.sklearn.XGBClassifier"),
("py:class", "collections.Counter"),

("py:obj", "typing.Pattern")
]

# -- Options for HTML output -------------------------------------------------
Expand Down
6 changes: 6 additions & 0 deletions docs/source/dedoc_api_usage/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,12 @@ Api parameters description

This type is used for choosing a specific structure extractor (and, in some cases, a specific reader).

* - patterns
- list of pattern dictionaries converted to string
- None
- This parameter is used only when ``document_type="other"``.
Configuration of the default document structure; please see :ref:`using_patterns` for more details.

* - structure_type
- tree, linear
- tree
Expand Down
4 changes: 4 additions & 0 deletions docs/source/modules/structure_extractors.rst
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,10 @@ Please see :ref:`using_patterns` to get examples of patterns usage.

.. autoattribute:: _name

.. autoclass:: dedoc.structure_extractors.patterns.pattern_composition.PatternComposition
:special-members: __init__
:members:

.. autoclass:: dedoc.structure_extractors.patterns.RegexpPattern
:show-inheritance:
:special-members: __init__
Expand Down
10 changes: 10 additions & 0 deletions docs/source/parameters/structure_type.rst
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,16 @@ Structure type configuring

If you use your custom configuration, look to the documentation of :class:`~dedoc.structure_extractors.StructureExtractorComposition`

* - patterns
- list of patterns based on :class:`~dedoc.structure_extractors.patterns.abstract_pattern.AbstractPattern`,
or list of patterns dicts, or list of dictionaries converted to string
- None
- * :meth:`dedoc.DedocManager.parse`
* :meth:`dedoc.structure_extractors.StructureExtractorComposition.extract`
* :meth:`dedoc.structure_extractors.DefaultStructureExtractor.extract`
- This parameter is used only by :class:`~dedoc.structure_extractors.DefaultStructureExtractor` (``document_type="other"``).
Configuration of the default document structure; please see :ref:`using_patterns` for more details.

* - structure_type
- tree, linear
- tree
Expand Down
2 changes: 1 addition & 1 deletion docs/source/structure_types/other.rst
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ The detailed description of each line type:

Its text is an empty string.
This type of node is optional, it occurs only if lists are found in the given document.
For each list type (dotted, bracket, bullet) the new list node is created.
For each list type (dotted, roman, bracket, bullet) the new list node is created.
This type of node is more important than list_item and raw_text.
List nodes for less important lists are nested into list items of more important list types.
For example, list node for bullet list beginning is less important than a list item of a dotted list.
Expand Down
3 changes: 2 additions & 1 deletion docs/source/tutorials/using_patterns.rst
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,8 @@ Let's read the document using :class:`~dedoc.readers.PdfTabbyReader` and see the
Here we consider class ``LineWithLocation`` almost the same as :class:`~dedoc.data_structures.LineWithMeta`.
As we see, the reader couldn't extract any useful information about lines types and levels.
As we see, ``tagHL=(None, None, 'unknown')`` for each line:
this means that the reader couldn't extract any useful information about lines types and levels.
So, :class:`~dedoc.structure_extractors.patterns.TagHeaderPattern` and
:class:`~dedoc.structure_extractors.patterns.TagListPattern` are useless in this case.

Expand Down
Loading

0 comments on commit 08b9c21

Please sign in to comment.