diff --git a/AIPscan/Aggregator/mets_parse_helpers.py b/AIPscan/Aggregator/mets_parse_helpers.py index 4c81d01e..e42d1e39 100644 --- a/AIPscan/Aggregator/mets_parse_helpers.py +++ b/AIPscan/Aggregator/mets_parse_helpers.py @@ -49,31 +49,32 @@ def get_aip_original_name(mets): # Negated as we're going to want to remove this length of values. NAMESUFFIX = -len("-00000000-0000-0000-0000-000000000000") + # Other intellectual entities exist in the METS, i.e. for empty + # directories. We can identify those with an invalid name prefix. + INVALID_NAMEPREFIX = "%transferDirectory%" + NAMESPACES = {u"premis": u"http://www.loc.gov/premis/v3"} ELEM_ORIGINAL_NAME_PATTERN = ".//premis:originalName" - FIRST_DMDSEC = "dmdSec_1" - original_name = "" for fsentry in mets.all_files(): - try: - dmdsec = fsentry.dmdsecs[0] - if dmdsec.id_string != FIRST_DMDSEC: - continue + for dmdsec in fsentry.dmdsecs: dmd_element = dmdsec.serialize() full_name = dmd_element.find( ELEM_ORIGINAL_NAME_PATTERN, namespaces=NAMESPACES ) + if full_name is not None and full_name.text.startswith(INVALID_NAMEPREFIX): + # We don't want this value, it will usually represent a + # directory entity. + continue try: original_name = full_name.text[:NAMESUFFIX] except AttributeError: - pass - break - except IndexError: - pass + continue + # There should be a transfer name in every METS. if original_name == "": - raise METSError() + raise METSError("Cannot locate transfer name in METS") return original_name diff --git a/AIPscan/Aggregator/tasks.py b/AIPscan/Aggregator/tasks.py index 4f9ab83d..4950ef4a 100644 --- a/AIPscan/Aggregator/tasks.py +++ b/AIPscan/Aggregator/tasks.py @@ -294,7 +294,7 @@ def get_mets( except METSError: # Some other error with the METS file that we might want to # log and act upon. 
- originalName = "" + originalName = packageUUID aip = create_aip_object( package_uuid=packageUUID, diff --git a/AIPscan/Aggregator/tests/test_mets.py b/AIPscan/Aggregator/tests/test_mets.py index f912f309..16e47e00 100644 --- a/AIPscan/Aggregator/tests/test_mets.py +++ b/AIPscan/Aggregator/tests/test_mets.py @@ -13,19 +13,30 @@ @pytest.mark.parametrize( - "fixture_path, transfer_name", + "fixture_path, transfer_name, mets_error", [ - (os.path.join("features_mets", "features-mets.xml"), "myTransfer"), - (os.path.join("iso_mets", "iso_mets.xml"), "iso"), + (os.path.join("features_mets", "features-mets.xml"), "myTransfer", False), + (os.path.join("iso_mets", "iso_mets.xml"), "iso", False), + (os.path.join("original_name_mets", "dataverse_example.xml"), "", True), + ( + os.path.join("original_name_mets", "document-empty-dirs.xml"), + "empty-dirs", + False, + ), ], ) -def test_get_aip_original_name(fixture_path, transfer_name): +def test_get_aip_original_name(fixture_path, transfer_name, mets_error): """Make sure that we can reliably get original name from the METS file given we haven't any mets-reader-writer helpers. """ script_dir = os.path.dirname(os.path.realpath(__file__)) mets_file = os.path.join(script_dir, FIXTURES_DIR, fixture_path) mets = metsrw.METSDocument.fromfile(mets_file) + if mets_error: + # Function should raise an error to work with. + with pytest.raises(mets_parse_helpers.METSError): + _ = mets_parse_helpers.get_aip_original_name(mets) + return assert mets_parse_helpers.get_aip_original_name(mets) == transfer_name # Test the same works with a string. with open(mets_file, "rb") as mets_stream: