Skip to content

Commit

Permalink
Fix bugs
Browse files Browse the repository at this point in the history
  • Loading branch information
NastyBoget committed Dec 22, 2023
1 parent a1aef6d commit 90bf38d
Show file tree
Hide file tree
Showing 6 changed files with 73 additions and 12 deletions.
3 changes: 1 addition & 2 deletions dedoc/readers/docx_reader/README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
# Docx reader documentation

[стандарт Office Open XML File Formats с. 28-62; 167-1301](http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-376,%20Fifth%20Edition,%20Part%201%20-%20Fundamentals%20And%20Markup%20Language%20Reference.zip)
[Стандарт Office Open XML File Formats с. 28-62; 167-1301](https://ecma-international.org/wp-content/uploads/ECMA-376-1_5th_edition_december_2016.zip)

## Структура docx

Expand Down
20 changes: 14 additions & 6 deletions dedoc/readers/docx_reader/properties_extractor.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,16 @@
from typing import Union

from bs4 import Tag

from dedoc.readers.docx_reader.data_structures.base_props import BaseProperties


def spacing_to_float(spacing: Union[str, int, float]) -> float:
    """
    Convert a measurement attribute value to a float, tolerating a trailing "pt" suffix.

    Only the "pt" suffix is recognised; it is stripped and the remaining number is returned as is
    (no unit conversion is performed here).

    :param spacing: attribute value, e.g. "12pt", "240", 240 or 240.0
    :return: the numeric part of the value as float
    """
    has_pt_suffix = str(spacing).endswith("pt")
    # only a string can end with "pt", so slicing is safe whenever the flag is set
    numeric_part = spacing[:-2] if has_pt_suffix else spacing
    return float(numeric_part)


def check_if_true(value: str) -> bool:
if value == "1" or value == "True" or value == "true":
return True
Expand Down Expand Up @@ -79,7 +87,7 @@ def change_indent(old_properties: BaseProperties, tree: Tag) -> None:
["firstLine", "firstLineChars", "hanging", "hangingChars", "start", "startChars", "left"]
}
for attribute in attributes:
attributes[attribute] = float(tree.ind.get(f"w:{attribute}", 0))
attributes[attribute] = spacing_to_float(tree.ind.get(f"w:{attribute}", 0))

indentation = 0
if attributes["left"] != 0:
Expand Down Expand Up @@ -109,7 +117,7 @@ def change_size(old_properties: BaseProperties, tree: Tag) -> None:
:param tree: BeautifulSoup tree with properties
"""
if tree.sz:
new_size = float(tree.sz.get("w:val", old_properties.size))
new_size = spacing_to_float(tree.sz.get("w:val", old_properties.size))
old_properties.size = int(new_size)


Expand Down Expand Up @@ -180,19 +188,19 @@ def change_spacing(old_properties: BaseProperties, tree: Tag) -> None:

if not before_autospacing:
before_lines = tree.spacing.get("w:beforeLines", False)
before_lines = int(float(before_lines)) if before_lines else before_lines
before_lines = int(spacing_to_float(before_lines)) if before_lines else before_lines
if not before_lines:
before_tag = tree.spacing.get("w:before", False)
before = int(float(before_tag)) if before_tag else before
before = int(spacing_to_float(before_tag)) if before_tag else before
else:
before = before_lines

if not after_autospacing:
after_lines = tree.spacing.get("w:afterLines", False)
after_lines = int(float(after_lines)) if after_lines else after_lines
after_lines = int(spacing_to_float(after_lines)) if after_lines else after_lines
if not after_lines:
after_tag = tree.spacing.get("w:after", False)
after = int(float(after_tag)) if after_tag else after
after = int(spacing_to_float(after_tag)) if after_tag else after
else:
after = after_lines

Expand Down
25 changes: 24 additions & 1 deletion dedoc/utils/annotation_merger.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,14 +71,17 @@ def merge_annotations(self, annotations: List[Annotation], text: str) -> List[An
"""
if not annotations:
return []

annotations_group_by_name_value = self._group_annotations(annotations).values()
spaces = [Space(m.start(), m.end()) for m in self.spaces.finditer(text)]

merged = []
for annotation_group in annotations_group_by_name_value:
group = self._merge_one_group(annotations=annotation_group, spaces=spaces)
merged.extend(group)
return merged

filtered = self.__filter_contradicting_annotations(merged, text)
return filtered

def _merge_one_group(self, annotations: List[Annotation], spaces: List[Space]) -> List[Annotation]:
"""
Expand Down Expand Up @@ -118,6 +121,26 @@ def _group_annotations(annotations: List[Annotation]) -> Dict[str, List[Annotati
annotations_group_by_value[(annotation.name, annotation.value)].append(annotation)
return annotations_group_by_value

def __filter_contradicting_annotations(self, annotations: List[Annotation], text: str) -> List[Annotation]:
    """
    Remove annotations that overlap an already accepted annotation with the same name.

    Annotations are grouped by name and every group is scanned in order of ``start``.
    An annotation is kept when it starts at or after the end of the last kept annotation of its group.
    An overlapping annotation is dropped, unless the last kept annotation of the group covers nothing
    but whitespace - in that case the whitespace annotation is replaced by the overlapping one.

    :param annotations: annotations of a single text (possibly with overlaps inside one name)
    :param text: the text the annotations' ``start``/``end`` offsets refer to
    :return: annotations without overlaps inside each name, grouped by name and sorted by ``start`` within a group
    """
    annotations_by_type = defaultdict(list)
    for annotation in annotations:
        annotations_by_type[annotation.name].append(annotation)

    filtered = []
    for annotation_list in annotations_by_type.values():
        # kept annotations are tracked per group, so the "previous" one is always of the same name
        kept = []
        prev_end = 0
        for annotation in sorted(annotation_list, key=lambda x: x.start):
            if not kept or annotation.start >= prev_end:
                kept.append(annotation)
                prev_end = annotation.end
                continue

            previous = kept[-1]
            # NOTE(review): assumes self.spaces is a compiled regex for whitespace runs - confirm in the class.
            # fullmatch is required here: match() would also accept text that merely starts with whitespace
            if self.spaces.fullmatch(text[previous.start:previous.end]):
                kept[-1] = annotation
                prev_end = annotation.end
        filtered.extend(kept)

    return filtered

@staticmethod
def delete_previous_merged(merged: List[Annotation], new_annotations: Annotation) -> List[Annotation]:
"""
Expand Down
Binary file modified tests/data/docx/size2.docx
Binary file not shown.
9 changes: 6 additions & 3 deletions tests/unit_tests/test_format_docx_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,12 +284,15 @@ def test_docx_metadata_broken_file(self) -> None:
path = os.path.abspath(path)
self.assertDictEqual({"broken_docx": True}, extractor._get_docx_fields(path))

@unittest.skip("For issues")
def test_annotations(self) -> None:
docx_reader = DocxReader(config=get_config())
path = self._get_path("size1.docx")
# test 'pt' ending in size
document = docx_reader.read(path) # TODO fix this and check font size value
# test 'pt' ending in size and check font size value
document = docx_reader.read(path)
for i in range(len(document.lines)):
for annotation in document.lines[i].annotations:
if annotation.name == SizeAnnotation.name:
self.assertEqual(12.0, float(annotation.value))

# test that different annotations of one type don't overlap
path = self._get_path("size2.docx")
Expand Down
28 changes: 28 additions & 0 deletions tests/unit_tests/test_misc_annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,34 @@ def test_merge_1000_no_intersection(self) -> None:
result = self.merge(annotations, text)
self.assertSetEqual({(a.start, a.end, a.name, a.value) for a in annotations}, result)

def test_merge_space(self) -> None:
    """
    A one-character "size" annotation on the space between two words must not survive
    when equal "size" annotations surround it; a "bold" annotation on the same space must be kept.
    """
    text = "normal text"
    annotations = [
        Annotation(start=0, end=6, name="size", value="12.0"),
        Annotation(start=7, end=11, name="size", value="12.0"),
        Annotation(start=6, end=7, name="size", value="1"),
        Annotation(start=6, end=7, name="bold", value="True")
    ]

    # presumably self.merge returns a collection of (start, end, name, value) tuples - see the helper
    merged = self.merge(annotations, text)

    self.assertEqual(2, len(merged))
    for expected in ((0, 11, "size", "12.0"), (6, 7, "bold", "True")):
        self.assertIn(expected, merged)

def test_merge_only_spaces(self) -> None:
    """
    For a text made only of whitespace, non-overlapping one-character annotations
    must all come back from merging unchanged.
    """
    text = " \t \t\n"
    spans = [
        (0, 1, "size", "12.0"),
        (0, 1, "bold", "True"),
        (1, 2, "italic", "True"),
        (2, 3, "bold", "False"),
        (3, 4, "size", "1"),
        (4, 5, "size", "5")
    ]
    annotations = [Annotation(start=start, end=end, name=name, value=value) for start, end, name, value in spans]

    merged = self.merge(annotations, text)

    self.assertEqual(6, len(merged))
    # the expected set is built from the input annotations after merging, as nothing should be changed
    expected = {(item.start, item.end, item.name, item.value) for item in annotations}
    self.assertSetEqual(expected, merged)


class TestAbstractStructureExtractor(unittest.TestCase):
def test_annotation_extractor_left(self) -> None:
Expand Down

0 comments on commit 90bf38d

Please sign in to comment.