Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/refactor layoutelement textregion to vectorized data structure #3881

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
badbf85
feat: refactor list into array
badGarnet Jan 9, 2025
6a62dfc
refactor paddle ocr return as arrays
badGarnet Jan 9, 2025
6a123b4
refactor build layout elements to build LayoutElements
badGarnet Jan 9, 2025
8138b9f
return layoutelements actually and update tests
badGarnet Jan 9, 2025
31b7488
refactor sorting
badGarnet Jan 10, 2025
8070bdd
fix process file with pdfminer and add test
badGarnet Jan 13, 2025
e81d201
fix test reference for links
badGarnet Jan 13, 2025
f07f960
light refactor of merge extracted and inferred layout
badGarnet Jan 13, 2025
de0e8ad
fix: fix a test expectation
badGarnet Jan 14, 2025
55e0e21
fix kwarg name
badGarnet Jan 14, 2025
f53fe20
update test with refactored data structure
badGarnet Jan 14, 2025
ba1d933
fix: save new elements array to merged layout
badGarnet Jan 15, 2025
efb040d
fix: fix conversion of pdfminer text regions
badGarnet Jan 16, 2025
76116c1
refactor pdfminer process page and bump dep
badGarnet Jan 16, 2025
37fa5df
bump deps again
badGarnet Jan 21, 2025
31edd43
pass in the correct threshold
badGarnet Jan 21, 2025
25e8969
Merge remote-tracking branch 'origin/main' into feat/refactor-layoute…
badGarnet Jan 21, 2025
4ea8b7a
bump version and changelog
badGarnet Jan 21, 2025
c71a58d
refactor tests in test_ocr
badGarnet Jan 21, 2025
0b1f17d
refactor tests
badGarnet Jan 22, 2025
c96d431
fix sorting test (to add sources)
badGarnet Jan 22, 2025
04ac46f
fix: dump elements list before non-vectorized step (remove nested pdf…
badGarnet Jan 22, 2025
083c04e
fix: fix condition to detect invalid coord values
badGarnet Jan 22, 2025
a179328
fix: fix logic
badGarnet Jan 22, 2025
5fadd4d
Feat/refactor layoutelement textregion to vectorized data structure <…
ryannikolaidis Jan 22, 2025
354895d
use env python to drive pytest
badGarnet Jan 22, 2025
fcb752a
Merge branch 'feat/refactor-layoutelement-textregion-to-vectorized-da…
badGarnet Jan 22, 2025
09695fc
fix docker test make command
badGarnet Jan 22, 2025
934614c
unpin protobuf and update dockerfile
badGarnet Jan 22, 2025
343161a
fix: fix flakey test
badGarnet Jan 22, 2025
6a91673
fix: fix updated weaviate client init
badGarnet Jan 22, 2025
894e7e6
pin weaviate so we can still use v3 client
badGarnet Jan 22, 2025
334ae6a
fix: fix bbox validation logic and add test
badGarnet Jan 23, 2025
5b8a6a5
Merge remote-tracking branch 'origin/main' into feat/refactor-layoute…
badGarnet Jan 23, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
## 0.16.16-dev0

### Enhancements

### Features
- **Vectorize layout (inferred, extracted, and OCR) data structure** Use `np.ndarray` to store a group of layout elements or text regions instead of a list of objects. This improves memory efficiency and compute speed for layout merging and deduplication.

### Fixes

## 0.16.15

### Enhancements
Expand Down
8 changes: 4 additions & 4 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
FROM quay.io/unstructured-io/base-images:wolfi-base-latest AS base

ARG PYTHON=python3.11
ARG PIP=pip3.11
ARG PIP="${PYTHON} -m pip"

USER root

Expand All @@ -19,6 +19,9 @@ RUN chown -R notebook-user:notebook-user /app && \

USER notebook-user

# append PATH before pip install to avoid warning logs; it also avoids issues with packages that need compilation during installation
ENV PATH="${PATH}:/home/notebook-user/.local/bin"
ENV TESSDATA_PREFIX=/usr/local/share/tessdata
ENV NLTK_DATA=/home/notebook-user/nltk_data

# Install Python dependencies and download required NLTK packages
Expand All @@ -28,7 +31,4 @@ RUN find requirements/ -type f -name "*.txt" -exec $PIP install --no-cache-dir -
$PYTHON -c "from unstructured.partition.model_init import initialize; initialize()" && \
$PYTHON -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; model = UnstructuredTableTransformerModel(); model.initialize('microsoft/table-transformer-structure-recognition')"

ENV PATH="${PATH}:/home/notebook-user/.local/bin"
ENV TESSDATA_PREFIX=/usr/local/share/tessdata

CMD ["/bin/bash"]
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -308,7 +308,7 @@ docker-test:
$(DOCKER_IMAGE) \
bash -c "CI=$(CI) \
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \
pytest $(if $(TEST_FILE),$(TEST_FILE),test_unstructured)"
python3 -m pytest $(if $(TEST_FILE),$(TEST_FILE),test_unstructured)"

.PHONY: docker-smoke-test
docker-smoke-test:
Expand Down
9 changes: 7 additions & 2 deletions test_unstructured/partition/pdf_image/test_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ def __init__(self, number: int, image: Image):
text="Charlie Brown and the Great Pumpkin",
),
]
self.elements_array = layout.LayoutElements.from_list(self.elements)


class MockDocumentLayout(layout.DocumentLayout):
Expand Down Expand Up @@ -254,7 +255,10 @@ def test_partition_image_with_ocr_detects_korean():
)

assert elements[0].text == "RULES AND INSTRUCTIONS"
assert elements[3].text.replace(" ", "").startswith("안녕하세요")
# FIXME (yao): revisit this lstrip after refactoring the merging logic; right now docker and
# local testing yield different results, and on docker there is a "," at the start of the Korean
# text line
assert elements[3].text.replace(" ", "").lstrip(",").startswith("안녕하세요")


def test_partition_image_with_ocr_detects_korean_from_file():
Expand All @@ -267,7 +271,7 @@ def test_partition_image_with_ocr_detects_korean_from_file():
)

assert elements[0].text == "RULES AND INSTRUCTIONS"
assert elements[3].text.replace(" ", "").startswith("안녕하세요")
assert elements[3].text.replace(" ", "").lstrip(",").startswith("안녕하세요")


def test_partition_image_raises_with_bad_strategy():
Expand Down Expand Up @@ -579,6 +583,7 @@ def inference_results():
image=mock.MagicMock(format="JPEG"),
)
page.elements = [layout.LayoutElement.from_coords(0, 0, 600, 800, text="hello")]
page.elements_array = layout.LayoutElements.from_list(page.elements)
doc = layout.DocumentLayout(pages=[page])
return doc

Expand Down
82 changes: 69 additions & 13 deletions test_unstructured/partition/pdf_image/test_inference_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from unstructured_inference.inference.elements import TextRegion, TextRegions
from unstructured_inference.inference.layoutelement import LayoutElement
from unstructured_inference.inference.layoutelement import LayoutElement, LayoutElements

from unstructured.documents.elements import ElementType
from unstructured.partition.pdf_image.inference_utils import (
Expand All @@ -22,16 +22,72 @@ def test_merge_text_regions(mock_embedded_text_regions):


def test_build_layout_elements_from_ocr_regions(mock_embedded_text_regions):
# Purpose: compare the output of build_layout_elements_from_ocr_regions for the
# mock_embedded_text_regions fixture against one expected UncategorizedText element.
# NOTE(review): this span reads like a flattened diff hunk -- `expected` and `elements` are each
# assigned twice (a plain-list version first, then a LayoutElements/TextRegions version), and only
# the second pair reaches the assert. Confirm which lines are live before reusing this code.
expected = [
LayoutElement.from_coords(
x1=437.83888888888885,
y1=317.319341111111,
x2=1256.334784222222,
y2=406.9837855555556,
text="LayoutParser: A Unified Toolkit for Deep Learning Based Document Image",
type=ElementType.UNCATEGORIZED_TEXT,
),
]

elements = build_layout_elements_from_ocr_regions(mock_embedded_text_regions)
# Second version: the expectation is wrapped in the LayoutElements container.
expected = LayoutElements.from_list(
[
LayoutElement.from_coords(
x1=437.83888888888885,
y1=317.319341111111,
x2=1256.334784222222,
y2=406.9837855555556,
text="LayoutParser: A Unified Toolkit for Deep Learning Based Document Image",
type=ElementType.UNCATEGORIZED_TEXT,
),
]
)

# Second version: the fixture list is converted to TextRegions before the call.
elements = build_layout_elements_from_ocr_regions(
TextRegions.from_list(mock_embedded_text_regions)
)
assert elements == expected


def test_build_layout_elements_from_ocr_regions_with_text(mock_embedded_text_regions):
    """Grouping by a single-line OCR text is expected to give one element carrying that text."""
    text = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image"

    # The one element the call below is expected to produce.
    whole_line = LayoutElement.from_coords(
        x1=437.83888888888885,
        y1=317.319341111111,
        x2=1256.334784222222,
        y2=406.9837855555556,
        text=text,
        type=ElementType.UNCATEGORIZED_TEXT,
    )
    expected = LayoutElements.from_list([whole_line])

    ocr_regions = TextRegions.from_list(mock_embedded_text_regions)
    elements = build_layout_elements_from_ocr_regions(
        ocr_regions,
        text,
        group_by_ocr_text=True,
    )

    assert elements == expected


def test_build_layout_elements_from_ocr_regions_with_multi_line_text(mock_embedded_text_regions):
    """Grouping by an OCR text with a blank-line break is expected to give two elements."""
    text = "LayoutParser: \n\nA Unified Toolkit for Deep Learning Based Document Image"

    ocr_regions = TextRegions.from_list(mock_embedded_text_regions)
    elements = build_layout_elements_from_ocr_regions(
        ocr_regions,
        text,
        group_by_ocr_text=True,
    )

    # (x1, y1, x2, y2) and text for each expected element, in output order.
    expected_specs = [
        (
            (453.00277777777774, 317.319341111111, 711.5338541666665, 358.28571222222206),
            "LayoutParser:",
        ),
        (
            (437.83888888888885, 317.319341111111, 1256.334784222222, 406.9837855555556),
            "A Unified Toolkit for Deep Learning Based Document Image",
        ),
    ]
    expected = LayoutElements.from_list(
        [
            LayoutElement.from_coords(
                x1=box[0],
                y1=box[1],
                x2=box[2],
                y2=box[3],
                text=segment,
                type=ElementType.UNCATEGORIZED_TEXT,
            )
            for box, segment in expected_specs
        ]
    )

    assert elements == expected
Loading
Loading