Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TLDR-544 docx bugs #382

Merged
merged 4 commits into from
Dec 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,4 @@ def __init__(self, attach_uid: str, start: int, end: int) -> None:
:param start: start of the annotated text (usually zero)
:param end: end of the annotated text (usually end of the line)
"""
super().__init__(start=start, end=end, name=AttachAnnotation.name, value=attach_uid)
super().__init__(start=start, end=end, name=AttachAnnotation.name, value=attach_uid, is_mergeable=False)
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def __init__(self, start: int, end: int, value: BBox, page_width: int, page_heig
if not isinstance(value, BBox):
raise ValueError("the value of bounding box annotation should be instance of BBox")

super().__init__(start=start, end=end, name=BBoxAnnotation.name, value=json.dumps(value.to_relative_dict(page_width, page_height)))
super().__init__(start=start, end=end, name=BBoxAnnotation.name, value=json.dumps(value.to_relative_dict(page_width, page_height)), is_mergeable=False)

@staticmethod
def get_bbox_from_value(value: str) -> Tuple[BBox, int, int]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,4 @@ def __init__(self, start: int, end: int, value: str) -> None:
:param end: end of the annotated text (not included)
:param value: text, linked to given one, for example text of the footnote
"""
super().__init__(start=start, end=end, name=LinkedTextAnnotation.name, value=value)
super().__init__(start=start, end=end, name=LinkedTextAnnotation.name, value=value, is_mergeable=False)
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,4 @@ def __init__(self, name: str, start: int, end: int) -> None:
:param start: start of the annotated text (usually zero)
:param end: end of the annotated text (usually end of the line)
"""
super().__init__(start=start, end=end, name=TableAnnotation.name, value=name)
super().__init__(start=start, end=end, name=TableAnnotation.name, value=name, is_mergeable=False)
3 changes: 1 addition & 2 deletions dedoc/readers/docx_reader/README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
# Docx reader documentation

[стандарт Office Open XML File Formats с. 28-62; 167-1301](http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-376,%20Fifth%20Edition,%20Part%201%20-%20Fundamentals%20And%20Markup%20Language%20Reference.zip)
[Стандарт Office Open XML File Formats с. 28-62; 167-1301](https://ecma-international.org/wp-content/uploads/ECMA-376-1_5th_edition_december_2016.zip)

## Структура docx

Expand Down
20 changes: 14 additions & 6 deletions dedoc/readers/docx_reader/properties_extractor.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,16 @@
from typing import Union

from bs4 import Tag

from dedoc.readers.docx_reader.data_structures.base_props import BaseProperties


def spacing_to_float(spacing: Union[str, int, float]) -> float:
    """
    Convert a raw spacing/size attribute value to a float.

    The value may be a plain number (``240``, ``12.5``), a numeric string (``"240"``) or a numeric string with the "pt" suffix (``"12pt"``).
    The suffix is only dropped, no unit conversion is performed.

    :param spacing: raw attribute value, a number or a numeric string optionally ending with "pt"
    :return: numeric part of the value
    :raises ValueError: if the numeric part of a string can't be parsed as a float
    """
    if not isinstance(spacing, str):
        # numbers can't carry a unit suffix, there is nothing to cut off
        return float(spacing)

    # strip first, so that whitespace after the suffix ("12pt ") doesn't hide it
    value = spacing.strip()
    if value.endswith("pt"):
        value = value[:-2]
    return float(value)


def check_if_true(value: str) -> bool:
if value == "1" or value == "True" or value == "true":
return True
Expand Down Expand Up @@ -79,7 +87,7 @@ def change_indent(old_properties: BaseProperties, tree: Tag) -> None:
["firstLine", "firstLineChars", "hanging", "hangingChars", "start", "startChars", "left"]
}
for attribute in attributes:
attributes[attribute] = float(tree.ind.get(f"w:{attribute}", 0))
attributes[attribute] = spacing_to_float(tree.ind.get(f"w:{attribute}", 0))

indentation = 0
if attributes["left"] != 0:
Expand Down Expand Up @@ -109,7 +117,7 @@ def change_size(old_properties: BaseProperties, tree: Tag) -> None:
:param tree: BeautifulSoup tree with properties
"""
if tree.sz:
new_size = float(tree.sz.get("w:val", old_properties.size))
new_size = spacing_to_float(tree.sz.get("w:val", old_properties.size))
old_properties.size = int(new_size)


Expand Down Expand Up @@ -180,19 +188,19 @@ def change_spacing(old_properties: BaseProperties, tree: Tag) -> None:

if not before_autospacing:
before_lines = tree.spacing.get("w:beforeLines", False)
before_lines = int(float(before_lines)) if before_lines else before_lines
before_lines = int(spacing_to_float(before_lines)) if before_lines else before_lines
if not before_lines:
before_tag = tree.spacing.get("w:before", False)
before = int(float(before_tag)) if before_tag else before
before = int(spacing_to_float(before_tag)) if before_tag else before
else:
before = before_lines

if not after_autospacing:
after_lines = tree.spacing.get("w:afterLines", False)
after_lines = int(float(after_lines)) if after_lines else after_lines
after_lines = int(spacing_to_float(after_lines)) if after_lines else after_lines
if not after_lines:
after_tag = tree.spacing.get("w:after", False)
after = int(float(after_tag)) if after_tag else after
after = int(spacing_to_float(after_tag)) if after_tag else after
else:
after = after_lines

Expand Down
30 changes: 28 additions & 2 deletions dedoc/utils/annotation_merger.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,18 +71,21 @@ def merge_annotations(self, annotations: List[Annotation], text: str) -> List[An
"""
if not annotations:
return []

annotations_group_by_name_value = self._group_annotations(annotations).values()
spaces = [Space(m.start(), m.end()) for m in self.spaces.finditer(text)]

merged = []
for annotation_group in annotations_group_by_name_value:
group = self._merge_one_group(annotations=annotation_group, spaces=spaces)
merged.extend(group)
return merged

filtered = self.__filter_contradicting_annotations(merged, text)
return filtered

def _merge_one_group(self, annotations: List[Annotation], spaces: List[Space]) -> List[Annotation]:
"""
Merge one group annotations, assume that all annotations has the same name and value
Merge one group annotations, assume that all annotations have the same name and value
"""
if len(annotations) <= 1 or not annotations[0].is_mergeable:
return annotations
Expand Down Expand Up @@ -118,6 +121,29 @@ def _group_annotations(annotations: List[Annotation]) -> Dict[str, List[Annotati
annotations_group_by_value[(annotation.name, annotation.value)].append(annotation)
return annotations_group_by_value

def __filter_contradicting_annotations(self, annotations: List[Annotation], text: str) -> List[Annotation]:
    """
    Remove overlapping annotations of the same type, so that mergeable annotations with one name don't intersect.

    Annotations are grouped by name. A group whose first annotation is not mergeable is kept as is.
    In the other groups annotations are visited in order of their start:

    * an annotation that starts at or after the end of the last kept one is kept;
    * an annotation that overlaps the last kept one replaces it if the last kept one covers only whitespace;
    * otherwise the overlapping annotation is dropped.

    :param annotations: annotations of the text (the result of merging inside groups with the same name and value)
    :param text: the text that the annotations refer to
    :return: filtered annotations, grouped by annotation name
    """
    annotations_by_type = defaultdict(list)
    for annotation in annotations:
        annotations_by_type[annotation.name].append(annotation)

    filtered = []
    for annotation_list in annotations_by_type.values():
        if not annotation_list[0].is_mergeable:  # there may be different values of the same annotation type on the text
            filtered.extend(annotation_list)
            continue

        # annotations kept for the current type only: the previous kept annotation is never taken from another type
        kept = []
        prev_end = 0
        for annotation in sorted(annotation_list, key=lambda x: x.start):
            if annotation.start >= prev_end:
                kept.append(annotation)
            # NOTE(review): fullmatch requires the whole span to be whitespace, match() accepted any span that only starts with it;
            # assumes self.spaces matches a run of whitespace characters - confirm against its definition
            elif kept and self.spaces.fullmatch(text[kept[-1].start:kept[-1].end]):
                kept[-1] = annotation
            else:
                continue
            prev_end = annotation.end
        filtered.extend(kept)

    return filtered

@staticmethod
def delete_previous_merged(merged: List[Annotation], new_annotations: Annotation) -> List[Annotation]:
"""
Expand Down
Binary file added tests/data/docx/size1.docx
Binary file not shown.
Binary file added tests/data/docx/size2.docx
Binary file not shown.
21 changes: 21 additions & 0 deletions tests/unit_tests/test_format_docx_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from tempfile import TemporaryDirectory

from dedoc.config import get_config
from dedoc.data_structures import SizeAnnotation
from dedoc.data_structures.hierarchy_level import HierarchyLevel
from dedoc.metadata_extractors.concrete_metadata_extractors.docx_metadata_extractor import DocxMetadataExtractor
from dedoc.readers.docx_reader.docx_reader import DocxReader
Expand Down Expand Up @@ -283,6 +284,26 @@ def test_docx_metadata_broken_file(self) -> None:
path = os.path.abspath(path)
self.assertDictEqual({"broken_docx": True}, extractor._get_docx_fields(path))

def test_annotations(self) -> None:
    """
    Check size annotations of docx lines: the font size value in size1.docx and the absence of overlaps in size2.docx.
    """
    docx_reader = DocxReader(config=get_config())

    # test 'pt' ending in size and check font size value
    first_path = self._get_path("size1.docx")
    first_document = docx_reader.read(first_path)
    for line in first_document.lines:
        for annotation in line.annotations:
            if annotation.name != SizeAnnotation.name:
                continue
            self.assertEqual(12.0, float(annotation.value))

    # test that different annotations of one type don't overlap
    second_path = self._get_path("size2.docx")
    second_document = docx_reader.read(second_path)
    size_annotations = sorted(
        (annotation for annotation in second_document.lines[2].annotations if annotation.name == SizeAnnotation.name),
        key=lambda x: x.start
    )
    prev_end = size_annotations[0].end
    for current in size_annotations[1:]:
        self.assertGreaterEqual(current.start, prev_end, "Annotations of one type with different values shouldn't overlap")
        prev_end = current.end

def _get_path(self, file_name: str) -> str:
path_in = os.path.join(self.directory, file_name)
path_out = os.path.join(self.tmpdir.name, file_name)
Expand Down
28 changes: 28 additions & 0 deletions tests/unit_tests/test_misc_annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,34 @@ def test_merge_1000_no_intersection(self) -> None:
result = self.merge(annotations, text)
self.assertSetEqual({(a.start, a.end, a.name, a.value) for a in annotations}, result)

def test_merge_space(self) -> None:
    """
    The size annotation with another value on the single space is dropped, the surrounding size annotations are merged into one.
    """
    text = "normal text"
    annotations = [
        Annotation(start=0, end=6, name="size", value="12.0"),
        Annotation(start=7, end=11, name="size", value="12.0"),
        Annotation(start=6, end=7, name="size", value="1"),
        Annotation(start=6, end=7, name="bold", value="True")
    ]

    result = self.merge(annotations, text)

    self.assertEqual(2, len(result))
    for expected_item in ((0, 11, "size", "12.0"), (6, 7, "bold", "True")):
        self.assertIn(expected_item, result)

def test_merge_only_spaces(self) -> None:
    """
    Every annotation is returned unchanged when the text consists only of whitespace characters.
    """
    text = " \t \t\n"
    annotations = [
        Annotation(start=0, end=1, name="size", value="12.0"),
        Annotation(start=0, end=1, name="bold", value="True"),
        Annotation(start=1, end=2, name="italic", value="True"),
        Annotation(start=2, end=3, name="bold", value="False"),
        Annotation(start=3, end=4, name="size", value="1"),
        Annotation(start=4, end=5, name="size", value="5")
    ]

    result = self.merge(annotations, text)

    self.assertEqual(6, len(result))
    # the expected set is built from the input annotations after the merge call, as in the comparison below
    expected = set()
    for ann in annotations:
        expected.add((ann.start, ann.end, ann.name, ann.value))
    self.assertSetEqual(expected, result)


class TestAbstractStructureExtractor(unittest.TestCase):
def test_annotation_extractor_left(self) -> None:
Expand Down
Loading