Skip to content

Commit

Permalink
Add error handling to revised local ocr logic and update unit tests
Browse files Browse the repository at this point in the history
  • Loading branch information
rlskoeser committed Oct 29, 2024
1 parent ac57aeb commit 5cd0861
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 53 deletions.
68 changes: 39 additions & 29 deletions ppa/archive/gale.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import os
import json
import logging
import pathlib
import time

import pymarc
Expand All @@ -14,19 +14,18 @@
logger = logging.getLogger(__name__)


def _get_local_ocr(item_id, page_num):
def get_local_ocr(item_id):
"""
Get local OCR page text for specified page of a single Gale volume if available.
Load local OCR page text for the specified Gale volume, if available.
This requires a base directory (specified by GALE_LOCAL_OCR) to be configured and
assumes the following organization:
* Volume-level directories are organized in stub directories that correspond to
every third number (e.g., CW0128905397 --> 193). So, a Gale volume's OCR data
is located in the following directory: GALE_LOCAL_OCR / stub_dir / item_id
is located in the following directory: GALE_LOCAL_OCR / stub_dir / item_id.json
* A page's local ocr text file is named [volume id]_[page number]0.txt, where the
page number is a 4-digit string (e.g., "0004"). So, for page number "0004" in
volume CW0128905397, the local ocr file is named "CW0128905397_00040.txt".
* Page text is stored as a JSON dictionary with keys based on Gale page numbers,
which is a 4-digit string (e.g., "0004").
Raises a FileNotFoundError if the local OCR page text does not exist.
"""
Expand All @@ -35,17 +34,15 @@ def _get_local_ocr(item_id, page_num):
raise ImproperlyConfigured(
"GALE_LOCAL_OCR configuration is required for indexing Gale page content"
)

stub_dir = item_id[::3][1:] # Following conventions set in ppa-nlp
ocr_txt_fp = os.path.join(ocr_dir, stub_dir, item_id, f"{item_id}_{page_num}0.txt")
with open(ocr_txt_fp) as reader:
return reader.read()


def get_local_ocr(item_id):
# error handling still TODO
ocr_dir = getattr(settings, "GALE_LOCAL_OCR", None)
with open(os.path.join(ocr_dir, f"{item_id}.json")) as ocrfile:
# check that the id looks as expected (appease github codeql security concerns)
# first two characters are CW or CB; rest of the id is numeric
if not all([item_id[:2] in ["CW", "CB"], item_id[2:].isnumeric()]):
raise ValueError(f"{item_id} is not a valid Gale item identifier")

# files are in stub directories; following conventions set in ppa-nlp
stub_dir = item_id[::3][1:]
ocr_dir = pathlib.Path(ocr_dir)
with (ocr_dir / stub_dir / f"{item_id}.json").open() as ocrfile:

Check failure

Code scanning / CodeQL

Uncontrolled data used in path expression High

This path depends on a
user-provided value
.
return json.load(ocrfile)


Expand Down Expand Up @@ -226,23 +223,36 @@ def get_item_pages(self, item_id, gale_record=None):
if gale_record is None:
gale_record = self.get_item(item_id)

# todo: needs try/catch error handling
local_ocr_text = get_local_ocr(item_id)
local_ocr_text = None
try:
# Use higher quality local OCR text if available
local_ocr_text = get_local_ocr(item_id)
except FileNotFoundError:
logger.warning(f"Local OCR not found for {item_id}")

# iterate through the pages in the response
for page in gale_record["pageResponse"]["pages"]:
page_number = page["pageNumber"]

# Fetch higher quality local OCR text if possible, with fallback to Gale
# OCR. Set a tag to indicate the usage of local OCR text.
# Use local OCR text if we have it, with fallback to Gale
# OCR. Set a tag to indicate when local OCR is present.
tags = []
ocr_text = local_ocr_text.get(page_number)
# TODO: set local ocr tag here
if not ocr_text and page_number not in local_ocr_text:
# if ocr text is unset and page number is not present,
# try getting the ocr from the gale api result
logger.warning(f"No local OCR for {item_id} {page_number}")
ocr_text = page.get("ocrText") # some pages have no text
ocr_text = None
if local_ocr_text:
ocr_text = local_ocr_text.get(page_number)
# if we have content, set local ocr tag
if ocr_text:
tags = ["local_ocr"]

# we expect empty string if page is present but empty
# (e.g., for blank pages)

# ocr text = None indicates not content not present in the data
if ocr_text is None:
logger.warning(f"No local OCR for {item_id} {page_number}")
# try getting the ocr from the gale api result
# (may still be empty, since some pages have no text)
ocr_text = page.get("ocrText")

info = {
"page_id": page_number,
Expand Down
67 changes: 44 additions & 23 deletions ppa/archive/tests/test_gale.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
import os.path
from unittest.mock import Mock, patch

Expand All @@ -15,22 +16,32 @@
@override_settings()
def test_get_local_ocr(tmp_path):
item_id = "CB0123456789"
page_num = "0001"
content = "Testing...\n1\n2\n3"
content = {"0001": "Testing...\n1\n2\n3"}
# Mock ocr files for testing
ocr_dir = tmp_path.joinpath("147", item_id)
ocr_dir.mkdir(parents=True)
ocr_file = ocr_dir.joinpath(f"{item_id}_{page_num}0.txt")
ocr_file.write_text(content)
ocr_dir = tmp_path / "147"
ocr_dir.mkdir()
ocr_file = ocr_dir / f"{item_id}.json"
with ocr_file.open("w") as outfile:
json.dump(content, outfile)

with override_settings(GALE_LOCAL_OCR=f"{tmp_path}"):
assert content == gale.get_local_ocr(item_id, page_num)
assert content == gale.get_local_ocr(item_id)


@override_settings(GALE_LOCAL_OCR=None)
def test_get_local_ocr_config_error():
with pytest.raises(ImproperlyConfigured):
gale.get_local_ocr("item_id", "page_num")
gale.get_local_ocr("item_id")


@override_settings(GALE_LOCAL_OCR="/example/path")
def test_get_local_ocr_invalid_id():
with pytest.raises(ValueError):
gale.get_local_ocr("item_id")
with pytest.raises(ValueError):
gale.get_local_ocr("CWabcdefg")
with pytest.raises(ValueError):
gale.get_local_ocr("AB12345")


@override_settings(GALE_API_USERNAME="galeuser123")
Expand Down Expand Up @@ -122,7 +133,7 @@ def test_make_request_refresh_key(self, mock_get_api_key, mockrequests):
ok_response.elapsed.total_seconds.return_value = 3

gale_api.session.get.side_effect = [unauth_response, ok_response]
resp = gale_api._make_request("foo", requires_api_key=True)
gale_api._make_request("foo", requires_api_key=True)
# should be called twice: once for initial request, then for retry
assert mock_get_api_key.call_count == 2
gale_api.session.get.assert_any_call(
Expand All @@ -136,7 +147,7 @@ def test_make_request_refresh_key(self, mock_get_api_key, mockrequests):
# retry should preserve parameters and streaming option
gale_api.session.reset_mock()
gale_api.session.get.side_effect = [unauth_response, ok_response]
resp = gale_api._make_request(
gale_api._make_request(
"foo", params={"bar": "baz"}, stream=True, requires_api_key=True
)
gale_api.session.get.assert_any_call(
Expand All @@ -158,7 +169,7 @@ def test_make_request_refresh_key(self, mock_get_api_key, mockrequests):
gale_api.session.reset_mock()
gale_api.session.get.side_effect = [unauth_response, ok_response]
with pytest.raises(gale.GaleUnauthorized):
resp = gale_api._make_request("foo", requires_api_key=False)
gale_api._make_request("foo", requires_api_key=False)
# should not get a new key; should only make the request once
mock_get_api_key.assert_not_called()
assert gale_api.session.get.call_count == 1
Expand All @@ -168,7 +179,7 @@ def test_make_request_refresh_key(self, mock_get_api_key, mockrequests):
gale_api.session.reset_mock()
gale_api.session.get.side_effect = [unauth_response, ok_response]
with pytest.raises(gale.GaleUnauthorized):
resp = gale_api._make_request("foo", requires_api_key=True, retry=1)
gale_api._make_request("foo", requires_api_key=True, retry=1)
# should not get a new key; should only make the request once
mock_get_api_key.assert_not_called()
assert gale_api.session.get.call_count == 1
Expand Down Expand Up @@ -270,32 +281,42 @@ def test_get_item_pages(self, mock_get_item, mock_get_local_ocr, mockrequests):
}
mock_get_item.return_value = api_response
# Set up get_local_ocr so that only the 3rd page's text is found
mock_get_local_ocr.side_effect = [FileNotFoundError, FileNotFoundError, "local ocr text"]
mock_get_local_ocr.return_value = {"0003": "local ocr text"}
page_data = list(gale_api.get_item_pages(item_id))
mock_get_item.called_once()
assert mock_get_local_ocr.call_count == 3
# called once per volume
assert mock_get_local_ocr.call_count == 1
assert len(page_data) == 3
assert [ p["page_id"] for p in page_data ] == ["0001", "0002", "0003"]
assert [ p["content"] for p in page_data ] == [None, "more test content", "local ocr text"]
assert [ p["label"] for p in page_data ] == ["i", None, None]
assert [ p["tags"] for p in page_data ] == [ [], [], ["local_ocr"] ]
assert [ p["image_id_s"] for p in page_data ] == ["09876001234567", "08765002345678", "0765400456789"]
assert [ p["image_url_s"] for p in page_data ] == [f"http://example.com/img/{i+1}" for i in range(3)]
assert [p["page_id"] for p in page_data] == ["0001", "0002", "0003"]
assert [p["content"] for p in page_data] == [
None,
"more test content",
"local ocr text",
]
assert [p["label"] for p in page_data] == ["i", None, None]
assert [p["tags"] for p in page_data] == [[], [], ["local_ocr"]]
assert [p["image_id_s"] for p in page_data] == [
"09876001234567",
"08765002345678",
"0765400456789",
]
assert [p["image_url_s"] for p in page_data] == [
f"http://example.com/img/{i+1}" for i in range(3)
]

# skip api call if record is provided
mock_get_item.reset_mock()
mock_get_local_ocr.reset_mock()
mock_get_local_ocr.side_effect = [FileNotFoundError, FileNotFoundError, "local ocr text"]
page_data = list(gale_api.get_item_pages(item_id, api_response))
mock_get_item.assert_not_called()
assert mock_get_local_ocr.call_count == 3
assert mock_get_local_ocr.call_count == 1
assert len(page_data) == 3


@override_settings(MARC_DATA="/path/to/data/marc")
@patch("ppa.archive.gale.PairtreeStorageFactory")
def test_get_marc_storage(mock_pairtree_storage_factory):
mstore = gale.get_marc_storage()
gale.get_marc_storage()
mock_pairtree_storage_factory.assert_called_with()
mock_pairtree_storage_factory.return_value.get_store.assert_called_with(
store_dir=settings.MARC_DATA, uri_base="info:local/"
Expand Down
2 changes: 1 addition & 1 deletion ppa/archive/tests/test_import_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,7 +400,7 @@ def test_import_digitizedwork_success(self, mock_gale_api, mock_get_marc_record)
).exists()

digwork = importer.import_digitizedwork(
"t1234", user=script_user, item_type=DigitizedWork.ARTICLE
"CW1234", user=script_user, item_type=DigitizedWork.ARTICLE
)
# specified item type should be used
assert digwork.item_type == DigitizedWork.ARTICLE

0 comments on commit 5cd0861

Please sign in to comment.