Skip to content

Commit

Permalink
feat: improve pdfminer element processing (#3618)
Browse files Browse the repository at this point in the history
This PR implements splitting of `pdfminer` elements (`groups of text
chunks`) into smaller bounding boxes (`text lines`). This implementation
prevents loss of information from the object detection model and
facilitates more effective removal of duplicated `pdfminer` text. This
PR also addresses #3430.

---------

Co-authored-by: ryannikolaidis <[email protected]>
Co-authored-by: christinestraub <[email protected]>
  • Loading branch information
3 people authored Sep 12, 2024
1 parent 639ca59 commit 87a88a3
Show file tree
Hide file tree
Showing 19 changed files with 4,688 additions and 689 deletions.
5 changes: 3 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
## 0.15.12-dev1
## 0.15.12-dev2

### Enhancements

* **Improve `pdfminer` element processing** Implemented splitting of `pdfminer` elements (groups of text chunks) into smaller bounding boxes (text lines). This prevents loss of information from the object detection model and facilitates more effective removal of duplicated `pdfminer` text.

### Features

### Fixes
Expand All @@ -22,7 +24,6 @@
* **Enhance `pdfminer` element cleanup** Expand removal of `pdfminer` elements to include those inside all `non-pdfminer` elements, not just `tables`.
* **Modified analysis drawing tools to dump to files and draw from dumps** If the parameter `analysis` of the `partition_pdf` function is set to `True`, the layout for Object Detection, Pdfminer Extraction, OCR and final layouts will be dumped as json files. The drawers now accept dict (dump) objects instead of internal classes instances.
* **Vectorize pdfminer elements deduplication computation**. Use `numpy` operations to compute IOU and sub-region membership instead of using simply loop. This improves the speed of deduplicating elements for pages with a lot of elements.
* **Add deprecation warning to embed code**

### Features

Expand Down
4 changes: 2 additions & 2 deletions test_unstructured/partition/pdf_image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ def test_partition_pdf_local_raises_with_no_filename():
[
(PartitionStrategy.FAST, 1, {1, 4}, {"pdfminer"}),
(PartitionStrategy.FAST, 3, {3, 6}, {"pdfminer"}),
(PartitionStrategy.HI_RES, 4, {4, 6, 7}, {"yolox", "pdfminer"}),
(PartitionStrategy.HI_RES, 4, {4, 6, 7}, {"yolox", "pdfminer", "ocr_tesseract"}),
(PartitionStrategy.OCR_ONLY, 1, {1, 3, 4}, {"ocr_tesseract"}),
],
)
Expand Down Expand Up @@ -552,7 +552,7 @@ def test_partition_pdf_with_copy_protection():
filename = example_doc_path("pdf/copy-protected.pdf")
elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.HI_RES)
title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
idx = 2
idx = 22
assert elements[idx].text == title
assert {element.metadata.page_number for element in elements} == {1, 2}
assert elements[idx].metadata.detection_class_prob is not None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import pytest
from PIL import Image
from unstructured_inference.constants import Source as InferenceSource
from unstructured_inference.inference.elements import Rectangle, TextRegion
from unstructured_inference.inference.elements import EmbeddedTextRegion, Rectangle, TextRegion
from unstructured_inference.inference.layout import DocumentLayout, LayoutElement, PageLayout

from unstructured.partition.pdf_image.pdfminer_processing import (
Expand All @@ -11,6 +11,7 @@
boxes_self_iou,
clean_pdfminer_duplicate_image_elements,
clean_pdfminer_inner_elements,
remove_duplicate_embedded_text,
)
from unstructured.partition.utils.constants import Source

Expand Down Expand Up @@ -209,3 +210,22 @@ def test_bboxes1_is_almost_subregion_of_bboxes2(coords1, coords2, expected):
def test_boxes_self_iou(coords, threshold, expected):
    # Build one Rectangle per coordinate row, then check the pairwise
    # IOU-threshold matrix against the expected boolean array.
    rectangles = [Rectangle(*coordinates) for coordinates in coords]
    np.testing.assert_array_equal(boxes_self_iou(rectangles, threshold), expected)


def test_remove_duplicate_embedded_text():
    """Regions sharing an identical bbox are collapsed; the later one survives."""
    regions = [
        EmbeddedTextRegion(bbox=Rectangle(0, 0, 10, 10), text="Text 1"),
        EmbeddedTextRegion(bbox=Rectangle(0, 0, 10, 10), text="Text 2"),
        EmbeddedTextRegion(bbox=Rectangle(20, 20, 30, 30), text="Text 3"),
    ]

    deduped = remove_duplicate_embedded_text(regions)

    # Only two unique regions remain; the first duplicate was dropped.
    assert len(deduped) == 2
    assert [region.text for region in deduped] == ["Text 2", "Text 3"]

    # The surviving regions keep their original bounding boxes.
    assert deduped[0].bbox == Rectangle(0, 0, 10, 10)
    assert deduped[1].bbox == Rectangle(20, 20, 30, 30)
28 changes: 28 additions & 0 deletions test_unstructured/partition/pdf_image/test_pdfminer_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from unittest.mock import MagicMock

from pdfminer.layout import LTContainer, LTTextLine

from unstructured.partition.pdf_image.pdfminer_utils import extract_text_objects


def test_extract_text_objects_nested_containers():
    """extract_text_objects recovers text lines even from nested LTContainers."""
    # Two leaf text-line mocks, one of which sits a level deeper.
    outer_line = MagicMock(spec=LTTextLine)
    inner_line = MagicMock(spec=LTTextLine)

    # Inner container yields just the deeper text line.
    nested_container = MagicMock(spec=LTContainer)
    nested_container.__iter__.return_value = [inner_line]

    # Outer container yields a text line plus the nested container.
    top_container = MagicMock(spec=LTContainer)
    top_container.__iter__.return_value = [outer_line, nested_container]

    extracted = extract_text_objects(top_container)

    # Both text lines are found regardless of nesting depth.
    assert len(extracted) == 2
    assert outer_line in extracted
    assert inner_line in extracted
29 changes: 10 additions & 19 deletions test_unstructured/partition/test_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import json
import os
import pathlib
import sys
import tempfile
import warnings
from importlib import import_module
Expand Down Expand Up @@ -505,7 +504,7 @@ def test_auto_partition_org_from_file():
[(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
)
def test_auto_partition_pdf_from_filename(pass_metadata_filename: bool, content_type: str | None):
file_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")
file_path = example_doc_path("pdf/chevron-page.pdf")
metadata_filename = file_path if pass_metadata_filename else None

elements = partition(
Expand All @@ -515,28 +514,23 @@ def test_auto_partition_pdf_from_filename(pass_metadata_filename: bool, content_
strategy=PartitionStrategy.HI_RES,
)

# NOTE(scanny): gave up trying to figure out why, but this file partitions differently locally
# (on Mac) than it does in CI. Basically the first element when partitioning locally is split
# in two when partitioning on CI. Other than that split the text is exactly the same.
idx = 2 if sys.platform == "darwin" else 3

e = elements[idx]
e = elements[0]
assert isinstance(e, Title)
assert e.text.startswith("LayoutParser")
assert e.text.startswith("eastern mediterranean")
assert e.metadata.filename == os.path.basename(file_path)
assert e.metadata.file_directory == os.path.split(file_path)[0]

e = elements[idx + 1]
e = elements[1]
assert isinstance(e, NarrativeText)
assert e.text.startswith("Zejiang Shen")
assert e.text.startswith("We’re investing")


@pytest.mark.parametrize(
("pass_metadata_filename", "content_type"),
[(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
)
def test_auto_partition_pdf_from_file(pass_metadata_filename: bool, content_type: str | None):
file_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")
file_path = example_doc_path("pdf/chevron-page.pdf")
metadata_filename = file_path if pass_metadata_filename else None

with open(file_path, "rb") as f:
Expand All @@ -547,16 +541,13 @@ def test_auto_partition_pdf_from_file(pass_metadata_filename: bool, content_type
strategy=PartitionStrategy.HI_RES,
)

# NOTE(scanny): see "from_filename" version of this test above for more on this oddness
idx = 2 if sys.platform == "darwin" else 3

e = elements[idx]
e = elements[0]
assert isinstance(e, Title)
assert e.text.startswith("LayoutParser")
assert e.text.startswith("eastern mediterranean")

e = elements[idx + 1]
e = elements[1]
assert isinstance(e, NarrativeText)
assert e.text.startswith("Zejiang Shen")
assert e.text.startswith("We’re investing")


def test_auto_partition_pdf_with_fast_strategy(request: FixtureRequest):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -307,10 +307,32 @@
}
}
},
{
"type": "Header",
"element_id": "9aa82368657b60536f152fd413aec316",
"text": "Core Skills for Biomedical Data Scientists",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
"page_number": 2,
"data_source": {
"url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
"version": "237960874052008560436652606947751982249",
"record_locator": {
"protocol": "abfs",
"remote_file_path": "abfs://container1/"
},
"date_created": "1678440764.0",
"date_modified": "1678440764.0"
}
}
},
{
"type": "UncategorizedText",
"element_id": "b810a8721369c3551c942aab9011b7d1",
"text": "Core Skills for Biomedical Data Scientists _____________________________________________________________________________________________",
"element_id": "4f2dbe3656a9ebc60c7e3426ad3cb3e3",
"text": "_____________________________________________________________________________________________",
"metadata": {
"filetype": "application/pdf",
"languages": [
Expand All @@ -331,7 +353,7 @@
},
{
"type": "NarrativeText",
"element_id": "c8fdefac1ae82fa42caeceff04853415",
"element_id": "cd359ae8c49885ead47318021438eead",
"text": "this commitment, a recent report to the NLM Director recommended working across NIH to identify and develop core skills required of a biomedical data scientist to consistency across the cohort of NIH-trained data scientists. This report provides a set of recommended core skills based on analysis of current BD2K-funded training programs, biomedical data science job ads, and practicing members of the current data science workforce.",
"metadata": {
"filetype": "application/pdf",
Expand All @@ -353,7 +375,7 @@
},
{
"type": "Title",
"element_id": "b5b7392d0a946f5016bfa8ad0c248a9b",
"element_id": "bf8321a34edb7103ec4209f3e4a8a8da",
"text": "Methodology",
"metadata": {
"filetype": "application/pdf",
Expand All @@ -375,7 +397,7 @@
},
{
"type": "NarrativeText",
"element_id": "d9d8e38d221ae621c0ddbcabaa4a28b4",
"element_id": "1e1d3d1a5c1397fc588393568d829bc8",
"text": "The Workforce Excellence team took a three-pronged approach to identifying core skills required of a biomedical data scientist (BDS), drawing from:",
"metadata": {
"filetype": "application/pdf",
Expand All @@ -397,7 +419,7 @@
},
{
"type": "ListItem",
"element_id": "ba70aa3bc3ad0dec6a62939c94c5a20c",
"element_id": "45d7ff56632d66a2ab2d4dd2716d4d2e",
"text": "a) Responses to a 2017 Kaggle1 survey2 of over 16,000 self-identified data scientists working across many industries. Analysis of the Kaggle survey responses from the current data science workforce provided insights into the current generation of data scientists, including how they were trained and what programming and analysis skills they use.",
"metadata": {
"filetype": "application/pdf",
Expand All @@ -419,7 +441,7 @@
},
{
"type": "ListItem",
"element_id": "24724b1f0d20a6575f2782fd525c562f",
"element_id": "bf452aac5123fcedda30dd6ed179f41c",
"text": "b) Data science skills taught in BD2K-funded training programs. A qualitative content analysis was applied to the descriptions of required courses offered under the 12 BD2K-funded training programs. Each course was coded using qualitative data analysis software, with each skill that was present in the description counted once. The coding schema of data science-related skills was inductively developed and was organized into four major categories: (1) statistics and math skills; (2) computer science; (3) subject knowledge; (4) general skills, like communication and teamwork. The coding schema is detailed in Appendix A.",
"metadata": {
"filetype": "application/pdf",
Expand All @@ -441,7 +463,7 @@
},
{
"type": "ListItem",
"element_id": "5e6c73154a1e5f74780c69afbc9bc084",
"element_id": "ca176cbef532792b1f11830ff7520587",
"text": "c) Desired skills identified from data science-related job ads. 59 job ads from government (8.5%), academia (42.4%), industry (33.9%), and the nonprofit sector (15.3%) were sampled from websites like Glassdoor, Linkedin, and Ziprecruiter. The content analysis methodology and coding schema utilized in analyzing the training programs were applied to the job descriptions. Because many job ads mentioned the same skill more than once, each occurrence of the skill was coded, therefore weighting important skills that were mentioned multiple times in a single ad.",
"metadata": {
"filetype": "application/pdf",
Expand All @@ -463,7 +485,7 @@
},
{
"type": "NarrativeText",
"element_id": "249f6c76b2c99dadbefb8b8811b0d4cd",
"element_id": "11b170fedd889c3b895bbd28acd811ca",
"text": "Analysis of the above data provided insights into the current state of biomedical data science training, as well as a view into data science-related skills likely to be needed to prepare the BDS workforce to succeed in the future. Together, these analyses informed recommendations for core skills necessary for a competitive biomedical data scientist.",
"metadata": {
"filetype": "application/pdf",
Expand All @@ -485,8 +507,30 @@
},
{
"type": "NarrativeText",
"element_id": "6543ce4e447de8fb3db98ceb06a50c28",
"text": "1 Kaggle is an online community for data scientists, serving as a platform for collaboration, competition, and learning: http://kaggle.com 2 In August 2017, Kaggle conducted an industry-wide survey to gain a clearer picture of the state of data science and machine learning. A standard set of questions were asked of all respondents, with more specific questions related to work for employed data scientists and questions related to learning for data scientists in training. Methodology and results: https://www.kaggle.com/kaggle/kaggle-survey-2017",
"element_id": "2665aadf75bca259f1f5b4c91a53a301",
"text": "1 Kaggle is an online community for data scientists, serving as a platform for collaboration, competition, and learning: http://kaggle.com",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
"page_number": 2,
"data_source": {
"url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
"version": "237960874052008560436652606947751982249",
"record_locator": {
"protocol": "abfs",
"remote_file_path": "abfs://container1/"
},
"date_created": "1678440764.0",
"date_modified": "1678440764.0"
}
}
},
{
"type": "NarrativeText",
"element_id": "8bbfe1c3e6bca9a33226d20d69b2297a",
"text": "2 In August 2017, Kaggle conducted an industry-wide survey to gain a clearer picture of the state of data science and machine learning. A standard set of questions were asked of all respondents, with more specific questions related to work for employed data scientists and questions related to learning for data scientists in training. Methodology and results: https://www.kaggle.com/kaggle/kaggle-survey-2017",
"metadata": {
"filetype": "application/pdf",
"languages": [
Expand All @@ -507,7 +551,7 @@
},
{
"type": "UncategorizedText",
"element_id": "1a6ff96d028f18331a9d9c9748b49321",
"element_id": "dd4a661e1a3c898a5cf6328ba56b924d",
"text": "2",
"metadata": {
"filetype": "application/pdf",
Expand Down
Loading

0 comments on commit 87a88a3

Please sign in to comment.