-
Notifications
You must be signed in to change notification settings - Fork 818
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: improve pdfminer element processing (#3618)
This PR implements splitting of `pdfminer` elements (`groups of text chunks`) into smaller bounding boxes (`text lines`). This implementation prevents loss of information from the object detection model and facilitates more effective removal of duplicated `pdfminer` text. This PR also addresses #3430. --------- Co-authored-by: ryannikolaidis <[email protected]> Co-authored-by: christinestraub <[email protected]>
- Loading branch information
1 parent
639ca59
commit 87a88a3
Showing
19 changed files
with
4,688 additions
and
689 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
28 changes: 28 additions & 0 deletions
28
test_unstructured/partition/pdf_image/test_pdfminer_utils.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
from unittest.mock import MagicMock | ||
|
||
from pdfminer.layout import LTContainer, LTTextLine | ||
|
||
from unstructured.partition.pdf_image.pdfminer_utils import extract_text_objects | ||
|
||
|
||
def test_extract_text_objects_nested_containers():
    """Check that text lines are collected from a container and from one nested inside it."""

    def make_container(*children):
        # A mock built with spec=LTContainer reports LTContainer as its class,
        # and iterating over it yields the supplied children.
        container = MagicMock(spec=LTContainer)
        container.__iter__.return_value = list(children)
        return container

    # Two stand-in text lines: one sits directly in the outer container,
    # the other one level deeper.
    top_level_line = MagicMock(spec=LTTextLine)
    nested_line = MagicMock(spec=LTTextLine)

    # Layout under test: outer -> [top_level_line, inner -> [nested_line]]
    inner = make_container(nested_line)
    outer = make_container(top_level_line, inner)

    extracted = extract_text_objects(outer)

    # Exactly the two text lines come back, including the one inside the
    # nested container.
    assert len(extracted) == 2
    for expected_line in (top_level_line, nested_line):
        assert expected_line in extracted
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.