Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Revise local ocr handling for gale page indexing #686

Merged
merged 7 commits into from
Nov 4, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 42 additions & 22 deletions ppa/archive/gale.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
import json
import logging
import pathlib
import time

import pymarc
Expand All @@ -13,32 +14,36 @@
logger = logging.getLogger(__name__)


def get_local_ocr(item_id, page_num):
def get_local_ocr(item_id):
"""
Get local OCR page text for specified page of a single Gale volume if available.
Load local OCR page text for the specified Gale volume, if available.
This requires a base directory (specified by GALE_LOCAL_OCR) to be configured and
assumes the following organization:

* Volume-level directories are organized in stub directories that correspond to
every third number (e.g., CW0128905397 --> 193). So, a Gale volume's OCR data
is located in the following directory: GALE_LOCAL_OCR / stub_dir / item_id

* A page's local ocr text file is named [volume id]_[page number]0.txt, where the
page number is a 4-digit string (e.g., "0004"). So, for page number "0004" in
volume CW0128905397, the local ocr file is named "CW0128905397_00040.txt".
is located in the following directory: GALE_LOCAL_OCR / stub_dir / item_id.json

* Page text is stored as a JSON dictionary with keys based on Gale page numbers,
which is a 4-digit string (e.g., "0004").

Raises a FileNotFoundError if the local OCR page text does not exist.
Raises a FileNotFoundError if the local OCR page text does not exist.
"""
ocr_dir = getattr(settings, "GALE_LOCAL_OCR", None)
if not ocr_dir:
raise ImproperlyConfigured(
"GALE_LOCAL_OCR configuration is required for indexing Gale page content"
)
# check that the id looks as expected (appease github codeql security concerns)
# first two characters are CW or CB; rest of the id is numeric
if not all([item_id[:2] in ["CW", "CB"], item_id[2:].isnumeric()]):
raise ValueError(f"{item_id} is not a valid Gale item identifier")

stub_dir = item_id[::3][1:] # Following conventions set in ppa-nlp
ocr_txt_fp = os.path.join(ocr_dir, stub_dir, item_id, f"{item_id}_{page_num}0.txt")
with open(ocr_txt_fp) as reader:
return reader.read()
# files are in stub directories; following conventions set in ppa-nlp
stub_dir = item_id[::3][1:]
ocr_path = pathlib.Path(ocr_dir, stub_dir, f"{item_id}.json")
with ocr_path.open() as ocrfile:
Dismissed Show dismissed Hide dismissed
return json.load(ocrfile)


class GaleAPIError(Exception):
Expand Down Expand Up @@ -218,19 +223,34 @@
if gale_record is None:
gale_record = self.get_item(item_id)

local_ocr_text = None
try:
# Use higher quality local OCR text if available
local_ocr_text = get_local_ocr(item_id)
except FileNotFoundError:
logger.warning(f"Local OCR not found for {item_id}")

# iterate through the pages in the response
for page in gale_record["pageResponse"]["pages"]:
page_number = page["pageNumber"]

# Fetch higher quality local OCR text if possible, with fallback to Gale
# OCR. Set a tag to indicate the usage of local OCR text.
# Use local OCR text if we have it, with fallback to Gale
# OCR. Set a tag to indicate when local OCR is present.
tags = []
try:
ocr_text = get_local_ocr(item_id, page_number)
tags = ["local_ocr"]
except FileNotFoundError as e:
ocr_text = page.get("ocrText") # some pages have no text
logger.warning(f'Local OCR not found for {item_id} {page_number}')
ocr_text = None
if local_ocr_text:
ocr_text = local_ocr_text.get(page_number)
# if we have content, set tag to indicate local ocr
if ocr_text:
tags = ["local_ocr"]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this be set if the resulting OCR for the page is blank?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nope, good catch

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

wait, maybe I misunderstood your question - that's what the logic is now, do you disagree with it?

Copy link
Contributor

@laurejt laurejt Oct 31, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems like the tag should be set if the page's local ocr text is blank, because the empty string is a judgment of the local OCR (and not the Gale OCR)

# we expect empty string if page is present but empty
# (e.g., for blank pages)
# ocr text = None indicates page is not present in the data
elif ocr_text is None:
logger.warning(f"No local OCR for {item_id} {page_number}")
# try getting the ocr from the gale api result
# (may still be empty, since some pages have no text)
ocr_text = page.get("ocrText")

info = {
"page_id": page_number,
Expand Down
67 changes: 44 additions & 23 deletions ppa/archive/tests/test_gale.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
import os.path
from unittest.mock import Mock, patch

Expand All @@ -15,22 +16,32 @@
@override_settings()
def test_get_local_ocr(tmp_path):
item_id = "CB0123456789"
page_num = "0001"
content = "Testing...\n1\n2\n3"
content = {"0001": "Testing...\n1\n2\n3"}
# Mock ocr files for testing
ocr_dir = tmp_path.joinpath("147", item_id)
ocr_dir.mkdir(parents=True)
ocr_file = ocr_dir.joinpath(f"{item_id}_{page_num}0.txt")
ocr_file.write_text(content)
ocr_dir = tmp_path / "147"
laurejt marked this conversation as resolved.
Show resolved Hide resolved
ocr_dir.mkdir()
ocr_file = ocr_dir / f"{item_id}.json"
with ocr_file.open("w", encoding="utf-8") as outfile:
json.dump(content, outfile)

with override_settings(GALE_LOCAL_OCR=f"{tmp_path}"):
assert content == gale.get_local_ocr(item_id, page_num)
assert content == gale.get_local_ocr(item_id)


@override_settings(GALE_LOCAL_OCR=None)
def test_get_local_ocr_config_error():
with pytest.raises(ImproperlyConfigured):
gale.get_local_ocr("item_id", "page_num")
gale.get_local_ocr("item_id")


@override_settings(GALE_LOCAL_OCR="/example/path")
def test_get_local_ocr_invalid_id():
with pytest.raises(ValueError):
gale.get_local_ocr("item_id")
with pytest.raises(ValueError):
gale.get_local_ocr("CWabcdefg")
with pytest.raises(ValueError):
gale.get_local_ocr("AB12345")


@override_settings(GALE_API_USERNAME="galeuser123")
Expand Down Expand Up @@ -122,7 +133,7 @@ def test_make_request_refresh_key(self, mock_get_api_key, mockrequests):
ok_response.elapsed.total_seconds.return_value = 3

gale_api.session.get.side_effect = [unauth_response, ok_response]
resp = gale_api._make_request("foo", requires_api_key=True)
gale_api._make_request("foo", requires_api_key=True)
# should be called twice: once for initial request, then for retry
assert mock_get_api_key.call_count == 2
gale_api.session.get.assert_any_call(
Expand All @@ -136,7 +147,7 @@ def test_make_request_refresh_key(self, mock_get_api_key, mockrequests):
# retry should preserve parameters and streaming option
gale_api.session.reset_mock()
gale_api.session.get.side_effect = [unauth_response, ok_response]
resp = gale_api._make_request(
gale_api._make_request(
"foo", params={"bar": "baz"}, stream=True, requires_api_key=True
)
gale_api.session.get.assert_any_call(
Expand All @@ -158,7 +169,7 @@ def test_make_request_refresh_key(self, mock_get_api_key, mockrequests):
gale_api.session.reset_mock()
gale_api.session.get.side_effect = [unauth_response, ok_response]
with pytest.raises(gale.GaleUnauthorized):
resp = gale_api._make_request("foo", requires_api_key=False)
gale_api._make_request("foo", requires_api_key=False)
# should not get a new key; should only make the request once
mock_get_api_key.assert_not_called()
assert gale_api.session.get.call_count == 1
Expand All @@ -168,7 +179,7 @@ def test_make_request_refresh_key(self, mock_get_api_key, mockrequests):
gale_api.session.reset_mock()
gale_api.session.get.side_effect = [unauth_response, ok_response]
with pytest.raises(gale.GaleUnauthorized):
resp = gale_api._make_request("foo", requires_api_key=True, retry=1)
gale_api._make_request("foo", requires_api_key=True, retry=1)
# should not get a new key; should only make the request once
mock_get_api_key.assert_not_called()
assert gale_api.session.get.call_count == 1
Expand Down Expand Up @@ -270,32 +281,42 @@ def test_get_item_pages(self, mock_get_item, mock_get_local_ocr, mockrequests):
}
mock_get_item.return_value = api_response
# Set up get_local_ocr so that only the 3rd page's text is found
mock_get_local_ocr.side_effect = [FileNotFoundError, FileNotFoundError, "local ocr text"]
mock_get_local_ocr.return_value = {"0003": "local ocr text"}
page_data = list(gale_api.get_item_pages(item_id))
mock_get_item.called_once()
assert mock_get_local_ocr.call_count == 3
# called once per volume
assert mock_get_local_ocr.call_count == 1
assert len(page_data) == 3
assert [ p["page_id"] for p in page_data ] == ["0001", "0002", "0003"]
assert [ p["content"] for p in page_data ] == [None, "more test content", "local ocr text"]
assert [ p["label"] for p in page_data ] == ["i", None, None]
assert [ p["tags"] for p in page_data ] == [ [], [], ["local_ocr"] ]
assert [ p["image_id_s"] for p in page_data ] == ["09876001234567", "08765002345678", "0765400456789"]
assert [ p["image_url_s"] for p in page_data ] == [f"http://example.com/img/{i+1}" for i in range(3)]
assert [p["page_id"] for p in page_data] == ["0001", "0002", "0003"]
assert [p["content"] for p in page_data] == [
None,
"more test content",
"local ocr text",
]
assert [p["label"] for p in page_data] == ["i", None, None]
assert [p["tags"] for p in page_data] == [[], [], ["local_ocr"]]
assert [p["image_id_s"] for p in page_data] == [
"09876001234567",
"08765002345678",
"0765400456789",
]
assert [p["image_url_s"] for p in page_data] == [
f"http://example.com/img/{i+1}" for i in range(3)
]

# skip api call if record is provided
mock_get_item.reset_mock()
mock_get_local_ocr.reset_mock()
mock_get_local_ocr.side_effect = [FileNotFoundError, FileNotFoundError, "local ocr text"]
page_data = list(gale_api.get_item_pages(item_id, api_response))
mock_get_item.assert_not_called()
assert mock_get_local_ocr.call_count == 3
assert mock_get_local_ocr.call_count == 1
assert len(page_data) == 3


@override_settings(MARC_DATA="/path/to/data/marc")
@patch("ppa.archive.gale.PairtreeStorageFactory")
def test_get_marc_storage(mock_pairtree_storage_factory):
mstore = gale.get_marc_storage()
gale.get_marc_storage()
mock_pairtree_storage_factory.assert_called_with()
mock_pairtree_storage_factory.return_value.get_store.assert_called_with(
store_dir=settings.MARC_DATA, uri_base="info:local/"
Expand Down
2 changes: 1 addition & 1 deletion ppa/archive/tests/test_import_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,7 +400,7 @@ def test_import_digitizedwork_success(self, mock_gale_api, mock_get_marc_record)
).exists()

digwork = importer.import_digitizedwork(
"t1234", user=script_user, item_type=DigitizedWork.ARTICLE
"CW1234", user=script_user, item_type=DigitizedWork.ARTICLE
)
# specified item type should be used
assert digwork.item_type == DigitizedWork.ARTICLE
Loading