Skip to content

Commit

Permalink
TLDR-748 using patterns tutorial
Browse files Browse the repository at this point in the history
  • Loading branch information
NastyBoget committed Aug 27, 2024
1 parent 2a91bd3 commit 7229a31
Show file tree
Hide file tree
Showing 12 changed files with 243 additions and 7 deletions.
2 changes: 1 addition & 1 deletion .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -49,5 +49,5 @@ per-file-ignores =
scripts/*:T201
scripts/benchmark_pdf_performance*:JS101
tests/custom_test_runner.py:ANN001,ANN201,ANN202,ANN204,N802
docs/source/_static/code_examples/*:I251
docs/source/_static/code_examples/*:I251,T201
docs/source/_static/code_examples/langchain/*:FOL001,FOL002,FOL003,FOL004,FOL005,I100,I202,I251
1 change: 1 addition & 0 deletions .github/workflows/docs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,4 @@ jobs:
python dedoc_usage_tutorial.py
python dedoc_add_new_doc_type_tutorial.py
python dedoc_add_new_structure_type_tutorial.py
python dedoc_using_patterns_tutorial.py
3 changes: 1 addition & 2 deletions dedoc/data_structures/line_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,7 @@ def __init__(self,
:param hierarchy_level: the hierarchy level of the line extracted by some of the structure extractors - the result type and level of the line.
The lower the level of the hierarchy, the closer it is to the root, it's used to construct document tree.
"""
self.tag_hierarchy_level = HierarchyLevel(None, None, can_be_multiline=True, line_type=HierarchyLevel.unknown) \
if tag_hierarchy_level is None else tag_hierarchy_level
self.tag_hierarchy_level = HierarchyLevel.create_unknown() if tag_hierarchy_level is None else tag_hierarchy_level
self.hierarchy_level = hierarchy_level
self.page_id = page_id
self.line_id = line_id
Expand Down
8 changes: 6 additions & 2 deletions dedoc/data_structures/line_with_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,12 @@ def set_metadata(self, metadata: LineMetadata) -> None:
self._metadata = metadata

def __repr__(self) -> str:
return (f"LineWithMeta({self.line[:65]}, "
f"tagHL={self.metadata.tag_hierarchy_level.level_1, self.metadata.tag_hierarchy_level.level_2, self.metadata.tag_hierarchy_level.line_type})")
text = self.line if len(self.line) < 65 else self.line[:62] + "..."
tag_hl = "None" if self.metadata.tag_hierarchy_level is None else \
f"{self.metadata.tag_hierarchy_level.level_1, self.metadata.tag_hierarchy_level.level_2, self.metadata.tag_hierarchy_level.line_type}"
hl = "None" if self.metadata.hierarchy_level is None else \
f"{self.metadata.hierarchy_level.level_1, self.metadata.hierarchy_level.level_2, self.metadata.hierarchy_level.line_type}"
return f"LineWithMeta({text.strip()}, tagHL={tag_hl}, HL={hl})"

def __add__(self, other: Union["LineWithMeta", str]) -> "LineWithMeta":
from dedoc.utils.annotation_merger import AnnotationMerger
Expand Down
4 changes: 2 additions & 2 deletions dedoc/readers/pdf_reader/data_classes/line_with_location.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ def __init__(self, line: str, metadata: LineMetadata, annotations: List[Annotati
super().__init__(line, metadata, annotations, uid)

def __repr__(self) -> str:
text = self.line if len(self.line) < 65 else self.line[:62] + "..."
return f"LineWithLocation({text[:65]})"
parent_repr = super().__repr__()
return parent_repr.replace("LineWithMeta", "LineWithLocation")

def __str__(self) -> str:
return self.__repr__()
122 changes: 122 additions & 0 deletions docs/source/_static/code_examples/dedoc_using_patterns_tutorial.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
import re
from typing import List

import html2text

from dedoc.api.api_utils import json2html
from dedoc.data_structures import BoldAnnotation, HierarchyLevel, LineWithMeta, UnstructuredDocument
from dedoc.metadata_extractors import DocxMetadataExtractor, PdfMetadataExtractor
from dedoc.readers import DocxReader, PdfTabbyReader
from dedoc.structure_constructors import TreeConstructor
from dedoc.structure_extractors import DefaultStructureExtractor
from dedoc.structure_extractors.patterns import DottedListPattern, LetterListPattern, RegexpPattern, TagHeaderPattern, TagListPattern
from dedoc.structure_extractors.patterns.abstract_pattern import AbstractPattern


# example for docx

docx_reader = DocxReader()
docx_metadata_extractor = DocxMetadataExtractor()
structure_extractor = DefaultStructureExtractor()  # shared: print_document_tree below uses it as well
structure_constructor = TreeConstructor()  # shared: print_document_tree below uses it as well

docx_file_path = "test_dir/with_tags.docx"

docx_document = docx_reader.read(file_path=docx_file_path)
print("\n\nDocument lines\n")
for document_line in docx_document.lines:
    print(document_line)  # one line of the document as returned by the reader

patterns = [
    TagHeaderPattern(line_type="custom_header", level_1=1, can_be_multiline=False),  # presumably relies on header tags set by the reader
    TagListPattern(line_type="custom_list", level_1=2),  # presumably relies on list tags set by the reader
]
docx_document = structure_extractor.extract(document=docx_document, parameters={"patterns": patterns})

docx_document.metadata = docx_metadata_extractor.extract(file_path=docx_file_path)
docx_parsed_document = structure_constructor.construct(document=docx_document)
html = json2html(
    paragraph=docx_parsed_document.content.structure,
    attachments=docx_parsed_document.attachments,
    tables=docx_parsed_document.content.tables,
    text=""
)
print(f"\n\nDocument tree\n{html2text.html2text(html)}")  # the HTML tree is converted to text before printing


def print_document_tree(document: UnstructuredDocument, patterns: List[AbstractPattern]) -> None:
    """Apply the patterns to the document, build its tree and print the tree converted from HTML to text."""
    tree = structure_constructor.construct(document=structure_extractor.extract(document=document, parameters={"patterns": patterns}))
    tree_html = json2html(paragraph=tree.content.structure, attachments=tree.attachments, tables=tree.content.tables, text="")
    print(f"\n\nDocument tree\n{html2text.html2text(tree_html)}")


patterns = [
    TagHeaderPattern(line_type="custom_header", level_1=1, can_be_multiline=False),
    TagListPattern(line_type="custom_list", level_1=2),
    DottedListPattern(line_type="custom_list", level_1=2, can_be_multiline=False),  # for lists like 1.
    LetterListPattern(line_type="custom_list", level_1=3, level_2=1, can_be_multiline=False),  # for lists like a)
    RegexpPattern(regexp=re.compile(r"^header\s+\d+\.\d+"), line_type="custom_header", level_1=1, level_2=2, can_be_multiline=False),  # e.g. header 1.1
    RegexpPattern(regexp=re.compile(r"^header\s+\d+"), line_type="custom_header", level_1=1, level_2=1, can_be_multiline=False)  # e.g. header 1
]
print_document_tree(document=docx_document, patterns=patterns)

# example for pdf

pdf_reader = PdfTabbyReader()
pdf_metadata_extractor = PdfMetadataExtractor()
pdf_file_path = "test_dir/law.pdf"

pdf_document = pdf_reader.read(file_path=pdf_file_path)
pdf_document.metadata = pdf_metadata_extractor.extract(file_path=pdf_file_path)
print("\n\nDocument lines\n")
for document_line in pdf_document.lines[:10]:  # print only the first 10 lines
    print(document_line)

patterns = [
    RegexpPattern(regexp=re.compile(r"^part\s+\d+$"), line_type="part", level_1=1, level_2=1, can_be_multiline=False),  # e.g. part 1
    RegexpPattern(regexp=re.compile(r"^chapter\s+\d+$"), line_type="chapter", level_1=1, level_2=2, can_be_multiline=False),  # e.g. chapter 1
    DottedListPattern(line_type="point", level_1=2, can_be_multiline=False),  # for lists like 1.
    RegexpPattern(regexp=re.compile(r"^\(\d+\)\s"), line_type="item", level_1=3, level_2=1, can_be_multiline=False),  # for lists like (1)
    RegexpPattern(regexp=re.compile(r"^\(\w\)\s"), line_type="sub_item", level_1=3, level_2=2, can_be_multiline=False)  # for lists like (a)
]
print_document_tree(document=pdf_document, patterns=patterns)


print("\n\nDocument lines\n")
for document_line in pdf_document.lines[:50]:  # print only the first 50 lines
    print(document_line, document_line.annotations)  # annotations are shown too: the custom patterns below check the bold ones


class SubHeaderPattern(AbstractPattern):  # custom pattern: matches lines where more than half of the characters are bold
    _name = "sub_header"

    def match(self, line: LineWithMeta) -> bool:
        return self._is_bold(line)

    def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel:  # the same level for every matched line
        return HierarchyLevel(line_type=self._line_type, level_1=self._level_1, level_2=self._level_2, can_be_multiline=self._can_be_multiline)

    def _is_bold(self, line: LineWithMeta) -> bool:
        """Return True if bold annotations cover more than half of the line's characters; an empty line is never bold."""
        bold_length = sum(a.end - a.start for a in line.annotations if a.name == BoldAnnotation.name and a.value == "True")
        return len(line.line) > 0 and bold_length / len(line.line) > 0.5


class TitlePattern(SubHeaderPattern):  # custom pattern: an uppercase line that is also bold (see SubHeaderPattern._is_bold)
    _name = "title"

    def match(self, line: LineWithMeta) -> bool:
        return self._is_bold(line) if line.line.isupper() else False


patterns = [
    RegexpPattern(regexp=re.compile(r"^part\s+\d+$"), line_type="part", level_1=1, level_2=2, can_be_multiline=False),  # e.g. part 1
    RegexpPattern(regexp=re.compile(r"^chapter\s+\d+$"), line_type="chapter", level_1=1, level_2=3, can_be_multiline=False),  # e.g. chapter 1
    DottedListPattern(line_type="point", level_1=2, can_be_multiline=False),  # for lists like 1.
    RegexpPattern(regexp=re.compile(r"^\(\d+\)\s"), line_type="item", level_1=3, level_2=1, can_be_multiline=False),  # for lists like (1)
    RegexpPattern(regexp=re.compile(r"^\(\w\)\s"), line_type="sub_item", level_1=3, level_2=2, can_be_multiline=False),  # for lists like (a)
    TitlePattern(line_type="title", level_1=1, level_2=2, can_be_multiline=False),  # NOTE(review): same level (1, 2) as "part" - confirm it is intended
    SubHeaderPattern(line_type="sub_header", level_1=1, level_2=4, can_be_multiline=True)  # custom pattern defined above
]
print_document_tree(document=pdf_document, patterns=patterns)
Binary file not shown.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file not shown.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
109 changes: 109 additions & 0 deletions docs/source/tutorials/using_patterns.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,119 @@
Configure structure extraction using patterns
=============================================

The structure type can be configured in Dedoc with the ``document_type`` option of the ``parameters`` dictionary
(:ref:`api_parameters`, :ref:`structure_type_parameters`).
The default structure type (``document_type="other"``, see :ref:`other_structure`) produces a basic document structure that is fixed.
If you want to change this structure, e.g. the names of line types (nodes) or their levels in the tree hierarchy, you can use structure patterns.

Use patterns in Dedoc library
-----------------------------

If you use Dedoc as a library, you can use the existing pattern classes (:ref:`dedoc_structure_extractors_patterns`)
or implement your own custom pattern based on :class:`~dedoc.structure_extractors.patterns.abstract_pattern.AbstractPattern`.

Let's see some examples. First of all, we list all the required imports:

.. literalinclude:: ../_static/code_examples/dedoc_using_patterns_tutorial.py
:language: python
:lines: 1-13

Using information from readers
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Assume we need to parse the file :download:`with_tags.docx <../_static/code_examples/test_dir/with_tags.docx>`, which looks as follows:

.. _docx_with_tags_image:

.. figure:: ../_static/code_examples/test_dir/with_tags.png
:width: 400

DOCX document example



.. literalinclude:: ../_static/code_examples/dedoc_using_patterns_tutorial.py
:language: python
:lines: 18-28

.. literalinclude:: ../_static/code_examples/dedoc_using_patterns_tutorial.py
:language: python
:lines: 30-44


Using regular expressions
~~~~~~~~~~~~~~~~~~~~~~~~~

.. literalinclude:: ../_static/code_examples/dedoc_using_patterns_tutorial.py
:language: python
:lines: 47-51


.. literalinclude:: ../_static/code_examples/dedoc_using_patterns_tutorial.py
:language: python
:lines: 54-62


Practical example: get structured PDF
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Assume we need to parse the file :download:`law.pdf <../_static/code_examples/test_dir/law.pdf>`, which looks as follows:

.. _pdf_law_image:

.. figure:: ../_static/code_examples/test_dir/law.png
:width: 400

PDF document example


.. literalinclude:: ../_static/code_examples/dedoc_using_patterns_tutorial.py
:language: python
:lines: 66-74


.. literalinclude:: ../_static/code_examples/dedoc_using_patterns_tutorial.py
:language: python
:lines: 76-83


.. literalinclude:: ../_static/code_examples/dedoc_using_patterns_tutorial.py
:language: python
:lines: 86-88


.. literalinclude:: ../_static/code_examples/dedoc_using_patterns_tutorial.py
:language: python
:lines: 91-110


.. literalinclude:: ../_static/code_examples/dedoc_using_patterns_tutorial.py
:language: python
:lines: 113-122

Conclusions
~~~~~~~~~~~



Use patterns in Dedoc API
-------------------------

.. code-block:: python

import requests
file_path = "test_dir/law.pdf"
file_name = "law.pdf"
# regular expressions are written as raw strings: "\s", "\d", "\(" are not valid escape sequences in ordinary string literals
patterns = [
    {"name": "regexp", "regexp": r"^part\s+\d+$", "line_type": "part", "level_1": 1, "level_2": 1, "can_be_multiline": "false"},
    {"name": "regexp", "regexp": r"^chapter\s+\d+$", "line_type": "chapter", "level_1": 1, "level_2": 2, "can_be_multiline": "false"},
    {"name": "dotted_list", "line_type": "point", "level_1": 2, "can_be_multiline": "false"},
    {"name": "regexp", "regexp": r"^\(\d+\)\s", "line_type": "item", "level_1": 3, "level_2": 1, "can_be_multiline": "false"},
    {"name": "regexp", "regexp": r"^\(\w\)\s", "line_type": "sub_item", "level_1": 3, "level_2": 2, "can_be_multiline": "false"}
]
# the list of patterns is sent as a string in the "patterns" form field
parameters = {"patterns": str(patterns)}
with open(file_path, "rb") as file:
    files = {"file": (file_name, file)}
    r = requests.post("http://localhost:1231/upload", files=files, data=parameters)
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ docs = [
"sphinx-togglebutton==0.3.2", # for using toggle button
"linuxdoc==20230506", # for using flat-table
"tabula-py==2.8.1", # for adding new doc type tutorial
"html2text==2024.2.26" # for using patterns tutorial
]
lint = [
"flake8==5.0.4",
Expand Down

0 comments on commit 7229a31

Please sign in to comment.