TLDR-748 using patterns tutorial

ispras · Aug 27, 2024 · 7229a31 · 7229a31
1 parent 2a91bd3
commit 7229a31
Show file tree

Hide file tree

Showing 12 changed files with 243 additions and 7 deletions.
diff --git a/.flake8 b/.flake8
@@ -49,5 +49,5 @@ per-file-ignores =
     scripts/*:T201
     scripts/benchmark_pdf_performance*:JS101
     tests/custom_test_runner.py:ANN001,ANN201,ANN202,ANN204,N802
-    docs/source/_static/code_examples/*:I251
+    docs/source/_static/code_examples/*:I251,T201
     docs/source/_static/code_examples/langchain/*:FOL001,FOL002,FOL003,FOL004,FOL005,I100,I202,I251
diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml
@@ -33,3 +33,4 @@ jobs:
         python dedoc_usage_tutorial.py
         python dedoc_add_new_doc_type_tutorial.py
         python dedoc_add_new_structure_type_tutorial.py
+        python dedoc_using_patterns_tutorial.py
diff --git a/dedoc/data_structures/line_metadata.py b/dedoc/data_structures/line_metadata.py
@@ -24,8 +24,7 @@ def __init__(self,
         :param hierarchy_level: the hierarchy level of the line extracted by some of the structure extractors - the result type and level of the line.
             The lower the level of the hierarchy, the closer it is to the root, it's used to construct document tree.
         """
-        self.tag_hierarchy_level = HierarchyLevel(None, None, can_be_multiline=True, line_type=HierarchyLevel.unknown) \
-            if tag_hierarchy_level is None else tag_hierarchy_level
+        self.tag_hierarchy_level = HierarchyLevel.create_unknown() if tag_hierarchy_level is None else tag_hierarchy_level
         self.hierarchy_level = hierarchy_level
         self.page_id = page_id
         self.line_id = line_id

diff --git a/dedoc/data_structures/line_with_meta.py b/dedoc/data_structures/line_with_meta.py
@@ -140,8 +140,12 @@ def set_metadata(self, metadata: LineMetadata) -> None:
         self._metadata = metadata
 
     def __repr__(self) -> str:
-        return (f"LineWithMeta({self.line[:65]}, "
-                f"tagHL={self.metadata.tag_hierarchy_level.level_1, self.metadata.tag_hierarchy_level.level_2, self.metadata.tag_hierarchy_level.line_type})")
+        text = self.line if len(self.line) < 65 else self.line[:62] + "..."  # shorten long lines to 62 characters plus "..."
+        tag_hl = "None" if self.metadata.tag_hierarchy_level is None else \
+            f"{self.metadata.tag_hierarchy_level.level_1, self.metadata.tag_hierarchy_level.level_2, self.metadata.tag_hierarchy_level.line_type}"
+        hl = "None" if self.metadata.hierarchy_level is None else \
+            f"{self.metadata.hierarchy_level.level_1, self.metadata.hierarchy_level.level_2, self.metadata.hierarchy_level.line_type}"
+        return f"LineWithMeta({text.strip()}, tagHL={tag_hl}, HL={hl})"  # each HL is a (level_1, level_2, line_type) tuple or "None"
 
     def __add__(self, other: Union["LineWithMeta", str]) -> "LineWithMeta":
         from dedoc.utils.annotation_merger import AnnotationMerger

diff --git a/dedoc/readers/pdf_reader/data_classes/line_with_location.py b/dedoc/readers/pdf_reader/data_classes/line_with_location.py
@@ -14,8 +14,8 @@ def __init__(self, line: str, metadata: LineMetadata, annotations: List[Annotati
         super().__init__(line, metadata, annotations, uid)
 
     def __repr__(self) -> str:
-        text = self.line if len(self.line) < 65 else self.line[:62] + "..."
-        return f"LineWithLocation({text[:65]})"
+        parent_repr = super().__repr__()
+        return parent_repr.replace("LineWithMeta", "LineWithLocation")
 
     def __str__(self) -> str:
         return self.__repr__()
diff --git a/docs/source/_static/code_examples/dedoc_using_patterns_tutorial.py b/docs/source/_static/code_examples/dedoc_using_patterns_tutorial.py
@@ -0,0 +1,122 @@
+import re
+from typing import List
+
+import html2text
+
+from dedoc.api.api_utils import json2html
+from dedoc.data_structures import BoldAnnotation, HierarchyLevel, LineWithMeta, UnstructuredDocument
+from dedoc.metadata_extractors import DocxMetadataExtractor, PdfMetadataExtractor
+from dedoc.readers import DocxReader, PdfTabbyReader
+from dedoc.structure_constructors import TreeConstructor
+from dedoc.structure_extractors import DefaultStructureExtractor
+from dedoc.structure_extractors.patterns import DottedListPattern, LetterListPattern, RegexpPattern, TagHeaderPattern, TagListPattern
+from dedoc.structure_extractors.patterns.abstract_pattern import AbstractPattern
+
+
+# example for docx
+
+docx_reader = DocxReader()
+docx_metadata_extractor = DocxMetadataExtractor()
+structure_extractor = DefaultStructureExtractor()  # reused by print_document_tree and the PDF example below
+structure_constructor = TreeConstructor()  # reused by print_document_tree and the PDF example below
+
+docx_file_path = "test_dir/with_tags.docx"
+
+docx_document = docx_reader.read(file_path=docx_file_path)  # the returned document exposes the extracted lines as .lines
+print("\n\nDocument lines\n")
+for document_line in docx_document.lines:  # inspect the lines before any pattern is applied
+    print(document_line)
+
+patterns = [  # patterns reach the structure extractor through parameters["patterns"]
+    TagHeaderPattern(line_type="custom_header", level_1=1, can_be_multiline=False),
+    TagListPattern(line_type="custom_list", level_1=2),
+]
+docx_document = structure_extractor.extract(document=docx_document, parameters={"patterns": patterns})
+
+docx_document.metadata = docx_metadata_extractor.extract(file_path=docx_file_path)  # metadata is extracted separately, from the file path
+docx_parsed_document = structure_constructor.construct(document=docx_document)  # build the document tree
+html = json2html(  # render the tree, attachments and tables as HTML
+    paragraph=docx_parsed_document.content.structure,
+    attachments=docx_parsed_document.attachments,
+    tables=docx_parsed_document.content.tables,
+    text=""
+)
+print(f"\n\nDocument tree\n{html2text.html2text(html)}")  # html2text turns the HTML into console-friendly text
+
+
+def print_document_tree(document: UnstructuredDocument, patterns: List[AbstractPattern]) -> None:
+    document = structure_extractor.extract(document=document, parameters={"patterns": patterns})
+    parsed_document = structure_constructor.construct(document=document)
+    html = json2html(paragraph=parsed_document.content.structure, attachments=parsed_document.attachments, tables=parsed_document.content.tables, text="")
+    print(f"\n\nDocument tree\n{html2text.html2text(html)}")
+
+
+patterns = [  # NOTE(review): "header N.M" is listed before "header N" - presumably pattern order matters; confirm
+    TagHeaderPattern(line_type="custom_header", level_1=1, can_be_multiline=False),
+    TagListPattern(line_type="custom_list", level_1=2),
+    DottedListPattern(line_type="custom_list", level_1=2, can_be_multiline=False),  # for lists like 1.
+    LetterListPattern(line_type="custom_list", level_1=3, level_2=1, can_be_multiline=False),  # for lists like a)
+    RegexpPattern(regexp=re.compile(r"^header\s+\d+\.\d+"), line_type="custom_header", level_1=1, level_2=2, can_be_multiline=False),
+    RegexpPattern(regexp=re.compile(r"^header\s+\d+"), line_type="custom_header", level_1=1, level_2=1, can_be_multiline=False)
+]
+print_document_tree(document=docx_document, patterns=patterns)  # re-run extraction on the same document with the extended pattern list
+
+# example for pdf
+
+pdf_reader = PdfTabbyReader()  # only the reader and metadata extractor change; structure extractor and constructor are reused
+pdf_metadata_extractor = PdfMetadataExtractor()
+pdf_file_path = "test_dir/law.pdf"
+
+pdf_document = pdf_reader.read(file_path=pdf_file_path)
+pdf_document.metadata = pdf_metadata_extractor.extract(file_path=pdf_file_path)
+print("\n\nDocument lines\n")
+for document_line in pdf_document.lines[:10]:  # show only the first 10 lines
+    print(document_line)
+
+patterns = [
+    RegexpPattern(regexp=re.compile(r"^part\s+\d+$"), line_type="part", level_1=1, level_2=1, can_be_multiline=False),
+    RegexpPattern(regexp=re.compile(r"^chapter\s+\d+$"), line_type="chapter", level_1=1, level_2=2, can_be_multiline=False),
+    DottedListPattern(line_type="point", level_1=2, can_be_multiline=False),  # for lists like 1.
+    RegexpPattern(regexp=re.compile(r"^\(\d+\)\s"), line_type="item", level_1=3, level_2=1, can_be_multiline=False),   # for lists like (1)
+    RegexpPattern(regexp=re.compile(r"^\(\w\)\s"), line_type="sub_item", level_1=3, level_2=2, can_be_multiline=False)    # for lists like (a)
+]
+print_document_tree(document=pdf_document, patterns=patterns)  # extract structure with the regexp/list patterns and print the tree
+
+
+print("\n\nDocument lines\n")
+for document_line in pdf_document.lines[:50]:  # also print annotations: the custom patterns below inspect bold annotations
+    print(document_line, document_line.annotations)
+
+
+class SubHeaderPattern(AbstractPattern):
+    _name = "sub_header"
+
+    def match(self, line: LineWithMeta) -> bool:
+        return self._is_bold(line)
+
+    def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel:
+        return HierarchyLevel(line_type=self._line_type, level_1=self._level_1, level_2=self._level_2, can_be_multiline=self._can_be_multiline)
+
+    def _is_bold(self, line: LineWithMeta) -> bool:
+        bold_annotations = [annotation for annotation in line.annotations if annotation.name == BoldAnnotation.name and annotation.value == "True"]
+        bold_character_number = sum([annotation.end - annotation.start for annotation in bold_annotations])
+        return bold_character_number / len(line.line) > 0.5
+
+
+class TitlePattern(SubHeaderPattern):  # a title is an upper-case line that also passes the inherited bold check
+    _name = "title"
+
+    def match(self, line: LineWithMeta) -> bool:
+        return line.line.isupper() and self._is_bold(line)
+
+
+patterns = [  # same regexp/list patterns as before (part/chapter now at level_2=2/3), plus the two custom patterns
+    RegexpPattern(regexp=re.compile(r"^part\s+\d+$"), line_type="part", level_1=1, level_2=2, can_be_multiline=False),
+    RegexpPattern(regexp=re.compile(r"^chapter\s+\d+$"), line_type="chapter", level_1=1, level_2=3, can_be_multiline=False),
+    DottedListPattern(line_type="point", level_1=2, can_be_multiline=False),
+    RegexpPattern(regexp=re.compile(r"^\(\d+\)\s"), line_type="item", level_1=3, level_2=1, can_be_multiline=False),
+    RegexpPattern(regexp=re.compile(r"^\(\w\)\s"), line_type="sub_item", level_1=3, level_2=2, can_be_multiline=False),
+    TitlePattern(line_type="title", level_1=1, level_2=2, can_be_multiline=False),
+    SubHeaderPattern(line_type="sub_header", level_1=1, level_2=4, can_be_multiline=True)
+]
+print_document_tree(document=pdf_document, patterns=patterns)
diff --git a/docs/source/_static/code_examples/test_dir/law.pdf b/docs/source/_static/code_examples/test_dir/law.pdf
diff --git a/docs/source/_static/code_examples/test_dir/law.png b/docs/source/_static/code_examples/test_dir/law.png
diff --git a/docs/source/_static/code_examples/test_dir/with_tags.docx b/docs/source/_static/code_examples/test_dir/with_tags.docx
diff --git a/docs/source/_static/code_examples/test_dir/with_tags.png b/docs/source/_static/code_examples/test_dir/with_tags.png
diff --git a/docs/source/tutorials/using_patterns.rst b/docs/source/tutorials/using_patterns.rst
@@ -3,10 +3,119 @@
 Configure structure extraction using patterns
 =============================================
 
+The structure type is configured in Dedoc with the ``document_type`` option of the ``parameters`` dictionary
+(:ref:`api_parameters`, :ref:`structure_type_parameters`).
+The default structure type (``document_type="other"``, see :ref:`other_structure`) produces a basic, fixed document structure.
+If you want to change this structure, e.g. the names of line types (nodes) or their levels in the tree hierarchy, you can use structure patterns.
 
 Use patterns in Dedoc library
 -----------------------------
 
+If you use Dedoc as a library, you can use existing pattern classes :ref:`dedoc_structure_extractors_patterns`
+or implement your own custom pattern based on :class:`~dedoc.structure_extractors.patterns.abstract_pattern.AbstractPattern`.
+
+Let's look at some examples. First, we list all the required imports:
+
+.. literalinclude:: ../_static/code_examples/dedoc_using_patterns_tutorial.py
+    :language: python
+    :lines: 1-13
+
+Using information from readers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Assume we need to parse the file :download:`with_tags.docx <../_static/code_examples/test_dir/with_tags.docx>`, which looks as follows:
+
+.. _docx_with_tags_image:
+
+.. figure:: ../_static/code_examples/test_dir/with_tags.png
+    :width: 400
+
+    DOCX document example
+
+
+
+.. literalinclude:: ../_static/code_examples/dedoc_using_patterns_tutorial.py
+    :language: python
+    :lines: 18-28
+
+.. literalinclude:: ../_static/code_examples/dedoc_using_patterns_tutorial.py
+    :language: python
+    :lines: 30-44
+
+
+Using regular expressions
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. literalinclude:: ../_static/code_examples/dedoc_using_patterns_tutorial.py
+    :language: python
+    :lines: 47-51
+
+
+.. literalinclude:: ../_static/code_examples/dedoc_using_patterns_tutorial.py
+    :language: python
+    :lines: 54-62
+
+
+Practical example: get structured PDF
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Assume we need to parse the file :download:`law.pdf <../_static/code_examples/test_dir/law.pdf>`, which looks as follows:
+
+.. _pdf_law_image:
+
+.. figure:: ../_static/code_examples/test_dir/law.png
+    :width: 400
+
+    PDF document example
+
+
+.. literalinclude:: ../_static/code_examples/dedoc_using_patterns_tutorial.py
+    :language: python
+    :lines: 66-74
+
+
+.. literalinclude:: ../_static/code_examples/dedoc_using_patterns_tutorial.py
+    :language: python
+    :lines: 76-83
+
+
+.. literalinclude:: ../_static/code_examples/dedoc_using_patterns_tutorial.py
+    :language: python
+    :lines: 86-88
+
+
+.. literalinclude:: ../_static/code_examples/dedoc_using_patterns_tutorial.py
+    :language: python
+    :lines: 91-110
+
+
+.. literalinclude:: ../_static/code_examples/dedoc_using_patterns_tutorial.py
+    :language: python
+    :lines: 113-122
+
+Conclusions
+~~~~~~~~~~~
+
+
 
 Use patterns in Dedoc API
 -------------------------
+
+.. code-block:: python
+
+    import requests
+
+    file_path = "test_dir/law.pdf"
+    file_name = "law.pdf"
+    patterns = [
+        {"name": "regexp", "regexp": "^part\s+\d+$", "line_type": "part", "level_1": 1, "level_2": 1, "can_be_multiline": "false"},
+        {"name": "regexp", "regexp": "^chapter\s+\d+$", "line_type": "chapter", "level_1": 1, "level_2": 2, "can_be_multiline": "false"},
+        {"name": "dotted_list", "line_type": "point", "level_1": 2, "can_be_multiline": "false"},
+        {"name": "regexp", "regexp": "^\(\d+\)\s", "line_type": "item", "level_1": 3, "level_2": 1, "can_be_multiline": "false"},
+        {"name": "regexp", "regexp": "^\(\w\)\s", "line_type": "sub_item", "level_1": 3, "level_2": 2, "can_be_multiline": "false"}
+    ]
+    parameters = {"patterns": str(patterns)}
+
+    with open(file_path, "rb") as file:
+        files = {"file": (file_name, file)}
+        r = requests.post("http://localhost:1231/upload", files=files, data=parameters)
diff --git a/pyproject.toml b/pyproject.toml
@@ -45,6 +45,7 @@ docs = [
     "sphinx-togglebutton==0.3.2",  # for using toggle button
     "linuxdoc==20230506",  # for using flat-table
     "tabula-py==2.8.1",  # for adding new doc type tutorial
+    "html2text==2024.2.26"  # for using patterns tutorial
 ]
 lint = [
     "flake8==5.0.4",