Skip to content

Commit

Permalink
feat: improve pdfminer element processing (#3618)
Browse files Browse the repository at this point in the history
This PR implements splitting of `pdfminer` elements (`groups of text
chunks`) into smaller bounding boxes (`text lines`). This implementation
prevents loss of information from the object detection model and
facilitates more effective removal of duplicated `pdfminer` text. This
PR also addresses #3430.

---------

Co-authored-by: ryannikolaidis <[email protected]>
Co-authored-by: christinestraub <[email protected]>
  • Loading branch information
3 people authored Sep 12, 2024
1 parent 639ca59 commit 87a88a3
Show file tree
Hide file tree
Showing 19 changed files with 4,688 additions and 689 deletions.
5 changes: 3 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
## 0.15.12-dev1
## 0.15.12-dev2

### Enhancements

* **Improve `pdfminer` element processing** Implemented splitting of `pdfminer` elements (groups of text chunks) into smaller bounding boxes (text lines). This prevents loss of information from the object detection model and facilitates more effective removal of duplicated `pdfminer` text.

### Features

### Fixes
Expand All @@ -22,7 +24,6 @@
* **Enhance `pdfminer` element cleanup** Expand removal of `pdfminer` elements to include those inside all `non-pdfminer` elements, not just `tables`.
* **Modified analysis drawing tools to dump to files and draw from dumps** If the parameter `analysis` of the `partition_pdf` function is set to `True`, the layout for Object Detection, Pdfminer Extraction, OCR and final layouts will be dumped as json files. The drawers now accept dict (dump) objects instead of internal classes instances.
* **Vectorize pdfminer elements deduplication computation**. Use `numpy` operations to compute IOU and sub-region membership instead of using simply loop. This improves the speed of deduplicating elements for pages with a lot of elements.
* **Add deprecation warning to embed code**

### Features

Expand Down
4 changes: 2 additions & 2 deletions test_unstructured/partition/pdf_image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ def test_partition_pdf_local_raises_with_no_filename():
[
(PartitionStrategy.FAST, 1, {1, 4}, {"pdfminer"}),
(PartitionStrategy.FAST, 3, {3, 6}, {"pdfminer"}),
(PartitionStrategy.HI_RES, 4, {4, 6, 7}, {"yolox", "pdfminer"}),
(PartitionStrategy.HI_RES, 4, {4, 6, 7}, {"yolox", "pdfminer", "ocr_tesseract"}),
(PartitionStrategy.OCR_ONLY, 1, {1, 3, 4}, {"ocr_tesseract"}),
],
)
Expand Down Expand Up @@ -552,7 +552,7 @@ def test_partition_pdf_with_copy_protection():
filename = example_doc_path("pdf/copy-protected.pdf")
elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.HI_RES)
title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
idx = 2
idx = 22
assert elements[idx].text == title
assert {element.metadata.page_number for element in elements} == {1, 2}
assert elements[idx].metadata.detection_class_prob is not None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import pytest
from PIL import Image
from unstructured_inference.constants import Source as InferenceSource
from unstructured_inference.inference.elements import Rectangle, TextRegion
from unstructured_inference.inference.elements import EmbeddedTextRegion, Rectangle, TextRegion
from unstructured_inference.inference.layout import DocumentLayout, LayoutElement, PageLayout

from unstructured.partition.pdf_image.pdfminer_processing import (
Expand All @@ -11,6 +11,7 @@
boxes_self_iou,
clean_pdfminer_duplicate_image_elements,
clean_pdfminer_inner_elements,
remove_duplicate_embedded_text,
)
from unstructured.partition.utils.constants import Source

Expand Down Expand Up @@ -209,3 +210,22 @@ def test_bboxes1_is_almost_subregion_of_bboxes2(coords1, coords2, expected):
def test_boxes_self_iou(coords, threshold, expected):
    # Build one Rectangle per coordinate row, then check the pairwise
    # IOU-threshold matrix against the expected boolean array.
    rectangles = [Rectangle(*coordinates) for coordinates in coords]
    np.testing.assert_array_equal(boxes_self_iou(rectangles, threshold), expected)


def test_remove_duplicate_embedded_text():
    """Regions sharing an identical bbox are collapsed; the later one survives."""
    regions = [
        EmbeddedTextRegion(bbox=Rectangle(0, 0, 10, 10), text="Text 1"),
        EmbeddedTextRegion(bbox=Rectangle(0, 0, 10, 10), text="Text 2"),
        EmbeddedTextRegion(bbox=Rectangle(20, 20, 30, 30), text="Text 3"),
    ]

    deduped = remove_duplicate_embedded_text(regions)

    # Only two unique regions remain; the first duplicate was dropped.
    assert len(deduped) == 2
    assert [region.text for region in deduped] == ["Text 2", "Text 3"]

    # The surviving regions keep their original bounding boxes.
    assert deduped[0].bbox == Rectangle(0, 0, 10, 10)
    assert deduped[1].bbox == Rectangle(20, 20, 30, 30)
28 changes: 28 additions & 0 deletions test_unstructured/partition/pdf_image/test_pdfminer_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from unittest.mock import MagicMock

from pdfminer.layout import LTContainer, LTTextLine

from unstructured.partition.pdf_image.pdfminer_utils import extract_text_objects


def test_extract_text_objects_nested_containers():
    """extract_text_objects recovers text lines even from nested LTContainers."""
    # Two leaf text-line mocks, one of which sits a level deeper.
    outer_line = MagicMock(spec=LTTextLine)
    inner_line = MagicMock(spec=LTTextLine)

    # Inner container yields just the deeper text line.
    nested_container = MagicMock(spec=LTContainer)
    nested_container.__iter__.return_value = [inner_line]

    # Outer container yields a text line plus the nested container.
    top_container = MagicMock(spec=LTContainer)
    top_container.__iter__.return_value = [outer_line, nested_container]

    extracted = extract_text_objects(top_container)

    # Both text lines are found regardless of nesting depth.
    assert len(extracted) == 2
    assert outer_line in extracted
    assert inner_line in extracted
29 changes: 10 additions & 19 deletions test_unstructured/partition/test_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import json
import os
import pathlib
import sys
import tempfile
import warnings
from importlib import import_module
Expand Down Expand Up @@ -505,7 +504,7 @@ def test_auto_partition_org_from_file():
[(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
)
def test_auto_partition_pdf_from_filename(pass_metadata_filename: bool, content_type: str | None):
file_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")
file_path = example_doc_path("pdf/chevron-page.pdf")
metadata_filename = file_path if pass_metadata_filename else None

elements = partition(
Expand All @@ -515,28 +514,23 @@ def test_auto_partition_pdf_from_filename(pass_metadata_filename: bool, content_
strategy=PartitionStrategy.HI_RES,
)

# NOTE(scanny): gave up trying to figure out why, but this file partitions differently locally
# (on Mac) than it does in CI. Basically the first element when partitioning locally is split
# in two when partitioning on CI. Other than that split the text is exactly the same.
idx = 2 if sys.platform == "darwin" else 3

e = elements[idx]
e = elements[0]
assert isinstance(e, Title)
assert e.text.startswith("LayoutParser")
assert e.text.startswith("eastern mediterranean")
assert e.metadata.filename == os.path.basename(file_path)
assert e.metadata.file_directory == os.path.split(file_path)[0]

e = elements[idx + 1]
e = elements[1]
assert isinstance(e, NarrativeText)
assert e.text.startswith("Zejiang Shen")
assert e.text.startswith("We’re investing")


@pytest.mark.parametrize(
("pass_metadata_filename", "content_type"),
[(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
)
def test_auto_partition_pdf_from_file(pass_metadata_filename: bool, content_type: str | None):
file_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")
file_path = example_doc_path("pdf/chevron-page.pdf")
metadata_filename = file_path if pass_metadata_filename else None

with open(file_path, "rb") as f:
Expand All @@ -547,16 +541,13 @@ def test_auto_partition_pdf_from_file(pass_metadata_filename: bool, content_type
strategy=PartitionStrategy.HI_RES,
)

# NOTE(scanny): see "from_filename" version of this test above for more on this oddness
idx = 2 if sys.platform == "darwin" else 3

e = elements[idx]
e = elements[0]
assert isinstance(e, Title)
assert e.text.startswith("LayoutParser")
assert e.text.startswith("eastern mediterranean")

e = elements[idx + 1]
e = elements[1]
assert isinstance(e, NarrativeText)
assert e.text.startswith("Zejiang Shen")
assert e.text.startswith("We’re investing")


def test_auto_partition_pdf_with_fast_strategy(request: FixtureRequest):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -307,10 +307,32 @@
}
}
},
{
"type": "Header",
"element_id": "9aa82368657b60536f152fd413aec316",
"text": "Core Skills for Biomedical Data Scientists",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
"page_number": 2,
"data_source": {
"url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
"version": "237960874052008560436652606947751982249",
"record_locator": {
"protocol": "abfs",
"remote_file_path": "abfs://container1/"
},
"date_created": "1678440764.0",
"date_modified": "1678440764.0"
}
}
},
{
"type": "UncategorizedText",
"element_id": "b810a8721369c3551c942aab9011b7d1",
"text": "Core Skills for Biomedical Data Scientists _____________________________________________________________________________________________",
"element_id": "4f2dbe3656a9ebc60c7e3426ad3cb3e3",
"text": "_____________________________________________________________________________________________",
"metadata": {
"filetype": "application/pdf",
"languages": [
Expand All @@ -331,7 +353,7 @@
},
{
"type": "NarrativeText",
"element_id": "c8fdefac1ae82fa42caeceff04853415",
"element_id": "cd359ae8c49885ead47318021438eead",
"text": "this commitment, a recent report to the NLM Director recommended working across NIH to identify and develop core skills required of a biomedical data scientist to consistency across the cohort of NIH-trained data scientists. This report provides a set of recommended core skills based on analysis of current BD2K-funded training programs, biomedical data science job ads, and practicing members of the current data science workforce.",
"metadata": {
"filetype": "application/pdf",
Expand All @@ -353,7 +375,7 @@
},
{
"type": "Title",
"element_id": "b5b7392d0a946f5016bfa8ad0c248a9b",
"element_id": "bf8321a34edb7103ec4209f3e4a8a8da",
"text": "Methodology",
"metadata": {
"filetype": "application/pdf",
Expand All @@ -375,7 +397,7 @@
},
{
"type": "NarrativeText",
"element_id": "d9d8e38d221ae621c0ddbcabaa4a28b4",
"element_id": "1e1d3d1a5c1397fc588393568d829bc8",
"text": "The Workforce Excellence team took a three-pronged approach to identifying core skills required of a biomedical data scientist (BDS), drawing from:",
"metadata": {
"filetype": "application/pdf",
Expand All @@ -397,7 +419,7 @@
},
{
"type": "ListItem",
"element_id": "ba70aa3bc3ad0dec6a62939c94c5a20c",
"element_id": "45d7ff56632d66a2ab2d4dd2716d4d2e",
"text": "a) Responses to a 2017 Kaggle1 survey2 of over 16,000 self-identified data scientists working across many industries. Analysis of the Kaggle survey responses from the current data science workforce provided insights into the current generation of data scientists, including how they were trained and what programming and analysis skills they use.",
"metadata": {
"filetype": "application/pdf",
Expand All @@ -419,7 +441,7 @@
},
{
"type": "ListItem",
"element_id": "24724b1f0d20a6575f2782fd525c562f",
"element_id": "bf452aac5123fcedda30dd6ed179f41c",
"text": "b) Data science skills taught in BD2K-funded training programs. A qualitative content analysis was applied to the descriptions of required courses offered under the 12 BD2K-funded training programs. Each course was coded using qualitative data analysis software, with each skill that was present in the description counted once. The coding schema of data science-related skills was inductively developed and was organized into four major categories: (1) statistics and math skills; (2) computer science; (3) subject knowledge; (4) general skills, like communication and teamwork. The coding schema is detailed in Appendix A.",
"metadata": {
"filetype": "application/pdf",
Expand All @@ -441,7 +463,7 @@
},
{
"type": "ListItem",
"element_id": "5e6c73154a1e5f74780c69afbc9bc084",
"element_id": "ca176cbef532792b1f11830ff7520587",
"text": "c) Desired skills identified from data science-related job ads. 59 job ads from government (8.5%), academia (42.4%), industry (33.9%), and the nonprofit sector (15.3%) were sampled from websites like Glassdoor, Linkedin, and Ziprecruiter. The content analysis methodology and coding schema utilized in analyzing the training programs were applied to the job descriptions. Because many job ads mentioned the same skill more than once, each occurrence of the skill was coded, therefore weighting important skills that were mentioned multiple times in a single ad.",
"metadata": {
"filetype": "application/pdf",
Expand All @@ -463,7 +485,7 @@
},
{
"type": "NarrativeText",
"element_id": "249f6c76b2c99dadbefb8b8811b0d4cd",
"element_id": "11b170fedd889c3b895bbd28acd811ca",
"text": "Analysis of the above data provided insights into the current state of biomedical data science training, as well as a view into data science-related skills likely to be needed to prepare the BDS workforce to succeed in the future. Together, these analyses informed recommendations for core skills necessary for a competitive biomedical data scientist.",
"metadata": {
"filetype": "application/pdf",
Expand All @@ -485,8 +507,30 @@
},
{
"type": "NarrativeText",
"element_id": "6543ce4e447de8fb3db98ceb06a50c28",
"text": "1 Kaggle is an online community for data scientists, serving as a platform for collaboration, competition, and learning: http://kaggle.com 2 In August 2017, Kaggle conducted an industry-wide survey to gain a clearer picture of the state of data science and machine learning. A standard set of questions were asked of all respondents, with more specific questions related to work for employed data scientists and questions related to learning for data scientists in training. Methodology and results: https://www.kaggle.com/kaggle/kaggle-survey-2017",
"element_id": "2665aadf75bca259f1f5b4c91a53a301",
"text": "1 Kaggle is an online community for data scientists, serving as a platform for collaboration, competition, and learning: http://kaggle.com",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
"page_number": 2,
"data_source": {
"url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
"version": "237960874052008560436652606947751982249",
"record_locator": {
"protocol": "abfs",
"remote_file_path": "abfs://container1/"
},
"date_created": "1678440764.0",
"date_modified": "1678440764.0"
}
}
},
{
"type": "NarrativeText",
"element_id": "8bbfe1c3e6bca9a33226d20d69b2297a",
"text": "2 In August 2017, Kaggle conducted an industry-wide survey to gain a clearer picture of the state of data science and machine learning. A standard set of questions were asked of all respondents, with more specific questions related to work for employed data scientists and questions related to learning for data scientists in training. Methodology and results: https://www.kaggle.com/kaggle/kaggle-survey-2017",
"metadata": {
"filetype": "application/pdf",
"languages": [
Expand All @@ -507,7 +551,7 @@
},
{
"type": "UncategorizedText",
"element_id": "1a6ff96d028f18331a9d9c9748b49321",
"element_id": "dd4a661e1a3c898a5cf6328ba56b924d",
"text": "2",
"metadata": {
"filetype": "application/pdf",
Expand Down
Loading

0 comments on commit 87a88a3

Please sign in to comment.