Skip to content

Commit

Permalink
Simplify conditional logic for gale local ocr
Browse files (browse the repository at this point in the history)
  • Loading branch information
rlskoeser committed Nov 4, 2024
1 parent 7722bb4 commit 4635418
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 10 deletions.
17 changes: 8 additions & 9 deletions ppa/archive/gale.py
Original file line number Diff line number Diff line change
@@ -238,20 +238,19 @@ def get_item_pages(self, item_id, gale_record=None):
# OCR. Set a tag to indicate when local OCR is present.
tags = []
ocr_text = None
if local_ocr_text:
if local_ocr_text and page_number in local_ocr_text:
ocr_text = local_ocr_text.get(page_number)
# if we have content for this page, set tag to indicate local ocr.
# If page is present but content is the empty string
# (i.e., blank page), still set tag since it was the local OCR
# that determined the page was blank
if ocr_text is not None:
tags = ["local_ocr"]
# If None, page is not present in the data;
else:
logger.warning(f"No local OCR for {item_id} {page_number}")
# try getting the ocr from the gale api result
# (may be empty, since some pages have no text)
ocr_text = page.get("ocrText")
tags = ["local_ocr"]
# If page is not present in the data, use Gale OCR as fallback
else:
logger.warning(f"No local OCR for {item_id} {page_number}")
# try getting the ocr from the gale api result
# (may be empty, since some pages have no text)
ocr_text = page.get("ocrText")

info = {
"page_id": page_number,
Expand Down
11 changes: 10 additions & 1 deletion ppa/archive/tests/test_gale.py
Original file line number Diff line number Diff line change
@@ -304,7 +304,7 @@ def test_get_item_pages(self, mock_get_item, mock_get_local_ocr, mockrequests):
f"http://example.com/img/{i+1}" for i in range(3)
]

# page present but content empty should still return a local ocr tag
# page present but content empty should set local ocr tag
mock_get_local_ocr.return_value = {"0001": "", "0002": "", "0003": ""}
page_data = list(gale_api.get_item_pages(item_id))
assert [p["tags"] for p in page_data] == [
@@ -313,6 +313,15 @@ def test_get_item_pages(self, mock_get_item, mock_get_local_ocr, mockrequests):
["local_ocr"],
]

# no local ocr file - should use gale ocr
mock_get_local_ocr.return_value = None
page_data = list(gale_api.get_item_pages(item_id))
assert [p["content"] for p in page_data] == [
None,
"more test content",
"ignored text",
]

# skip api call if record is provided
mock_get_item.reset_mock()
mock_get_local_ocr.reset_mock()
Expand Down

0 comments on commit 4635418

Please sign in to comment.