From 9831ce6f0de770b68f1b78701d4adc1ec6b5de72 Mon Sep 17 00:00:00 2001 From: Martin Warin Date: Thu, 19 Sep 2024 14:40:17 -0400 Subject: [PATCH] tidy HTFeed::METS (finished) --- lib/HTFeed/METS.pm | 539 +++++++++++++++++++++++++-------------------- 1 file changed, 299 insertions(+), 240 deletions(-) diff --git a/lib/HTFeed/METS.pm b/lib/HTFeed/METS.pm index 675b9a39..1f0c12e7 100644 --- a/lib/HTFeed/METS.pm +++ b/lib/HTFeed/METS.pm @@ -498,67 +498,75 @@ sub _add_premis_events { EVENTCODE: foreach my $eventcode ( @{$events} ) { # query database for: datetime, outcome my $eventconfig = $nspkg->get_event_configuration($eventcode); - my ( $eventid, $datetime, $outcome,$custom ) = - $volume->get_event_info($eventcode); - if(defined $custom) { + my ($eventid, $datetime, $outcome, $custom) = $volume->get_event_info($eventcode); + if (defined $custom) { $premis->add_event($custom); - } elsif(defined $eventid) { + } elsif (defined $eventid) { $eventconfig->{eventid} = $eventid; - $eventconfig->{date} = $datetime; - if(defined $outcome) { + $eventconfig->{date} = $datetime; + if (defined $outcome) { $eventconfig->{outcomes} = [$outcome]; } $self->add_premis_event($eventconfig); } elsif (not defined $eventconfig->{optional} or !$eventconfig->{optional}) { - $self->set_error("MissingField",field=>"premis_$eventcode",detail=>"No PREMIS event recorded with config ID $eventcode"); + $self->set_error( + "MissingField", + field => "premis_$eventcode", + detail => "No PREMIS event recorded with config ID $eventcode" + ); } } - } sub _get_event_type { - my $event = shift; - - if (blessed($event) and $event->isa("PREMIS::Event") and defined $event->{event_type}) { - return $event->{event_type}; - } elsif (blessed($event) and $event->isa("XML::LibXML::Element") ) { - my $xc = XML::LibXML::XPathContext->new($event); - register_namespaces($xc); - return $xc->findvalue( './premis:eventType', $event ); - } else { - return undef; - } - + my $event = shift; + + if (blessed($event) and 
$event->isa("PREMIS::Event") and defined $event->{event_type}) { + return $event->{event_type}; + } elsif (blessed($event) and $event->isa("XML::LibXML::Element")) { + my $xc = XML::LibXML::XPathContext->new($event); + register_namespaces($xc); + return $xc->findvalue('./premis:eventType', $event); + } else { + return undef; + } } sub _check_premis { - my $self = shift; - my $volume = $self->{volume}; - - my %included_event_types = map { (_get_event_type($_),1) } values( %{$self->{included_events}} ); - # at a minimum there should be capture, message digest calculation, - # fixity check, validation and ingestion. - if($volume->get_packagetype() ne 'audio') { - foreach my $required_event_type (@{$self->{required_events}}) { - $self->set_error("BadField",detail=>"Missing required PREMIS event type", - field=>"premis event $required_event_type") - if not defined $included_event_types{$required_event_type}; - } - } + my $self = shift; + my $volume = $self->{volume}; + my %included_event_types = map { + (_get_event_type($_), 1) + } values(%{$self->{included_events}}); + # at a minimum there should be capture, message digest calculation, + # fixity check, validation and ingestion. 
+ if ($volume->get_packagetype() ne 'audio') { + foreach my $required_event_type (@{$self->{required_events}}) { + if (not defined $included_event_types{$required_event_type}) { + $self->set_error( + "BadField", + detail => "Missing required PREMIS event type", + field => "premis event $required_event_type" + ); + } + } + } } sub add_premis_event { - my $self = shift; + my $self = shift; my $eventconfig = shift; - my $volume = $self->{volume}; - my $premis = $self->{premis}; + + my $volume = $self->{volume}; + my $premis = $self->{premis}; my $included_events = $self->{included_events}; - foreach my $field ('executor','executor_type','detail','type','date','eventid') { - if(not defined $eventconfig->{$field}) { - $self->set_error("MissingField", - field => $field, + foreach my $field ('executor', 'executor_type', 'detail', 'type', 'date', 'eventid') { + if (not defined $eventconfig->{$field}) { + $self->set_error( + "MissingField", + field => $field, actual => $eventconfig ); return; @@ -571,24 +579,37 @@ sub add_premis_event { return; } - my $event = new PREMIS::Event( $eventconfig->{'eventid'}, 'UUID', - $eventconfig->{'type'}, $eventconfig->{'date'}, - $eventconfig->{'detail'}); + my $event = new PREMIS::Event( + $eventconfig->{'eventid'}, + 'UUID', + $eventconfig->{'type'}, + $eventconfig->{'date'}, + $eventconfig->{'detail'} + ); + foreach my $outcome (@{ $eventconfig->{'outcomes'} }) { $event->add_outcome($outcome); } -# query namespace/packagetype for software tools to record for this event type + # query namespace/packagetype for software tools to record for this event type $event->add_linking_agent( - new PREMIS::LinkingAgent( $eventconfig->{'executor_type'}, - $eventconfig->{'executor'}, - 'Executor' ) ); + new PREMIS::LinkingAgent( + $eventconfig->{'executor_type'}, + $eventconfig->{'executor'}, + 'Executor' + ) + ); my @agents = (); my $tools_config = $eventconfig->{'tools'}; + foreach my $agent (@$tools_config) { $event->add_linking_agent( - new 
PREMIS::LinkingAgent( 'tool', get_tool_version($agent), 'software') + new PREMIS::LinkingAgent( + 'tool', + get_tool_version($agent), + 'software' + ) ); } $included_events->{$eventid} = $event; @@ -602,46 +623,48 @@ sub add_premis_event { sub _add_source_mets_events { my $self = shift; + my $volume = $self->{volume}; my $premis = $self->{premis}; - - my $xc = $volume->get_source_mets_xpc(); + my $xc = $volume->get_source_mets_xpc(); $self->migrate_agent_identifiers($xc); my $src_premis_events = {}; - foreach my $src_event ( $xc->findnodes('//premis:event') ) { - + foreach my $src_event ($xc->findnodes('//premis:event')) { # src event will be an XML node # do we want to keep this kind of event? - my $event_type = $xc->findvalue( './premis:eventType', $src_event ); - $src_premis_events->{$event_type} = [] - if not defined $src_premis_events->{$event_type}; - push( @{ $src_premis_events->{$event_type} }, $src_event ); + my $event_type = $xc->findvalue('./premis:eventType', $src_event); + if (not defined $src_premis_events->{$event_type}) { + $src_premis_events->{$event_type} = [] + } + push(@{$src_premis_events->{$event_type}}, $src_event); } - foreach my $eventcode ( - @{ $volume->get_nspkg()->get('source_premis_events_extract') } ) - { + my $eventcodes = $volume->get_nspkg()->get('source_premis_events_extract'); + foreach my $eventcode (@{$eventcodes}) { my $eventconfig = $volume->get_nspkg()->get_event_configuration($eventcode); - my $eventtype = $eventconfig->{type}; - - if(not defined $src_premis_events->{$eventtype}) { - $self->set_error("MissingField", - field => "premis $eventtype", - file => $volume->get_source_mets_file(), - detail => "Missing required PREMIS event in source METS") - unless (defined $eventconfig->{optional} and $eventconfig->{optional}); + my $eventtype = $eventconfig->{type}; + + unless (defined $src_premis_events->{$eventtype}) { + unless (defined $eventconfig->{optional} and $eventconfig->{optional}) { + $self->set_error( + 
"MissingField", + field => "premis $eventtype", + file => $volume->get_source_mets_file(), + detail => "Missing required PREMIS event in source METS" + ); + } } next unless defined $src_premis_events->{$eventtype}; - foreach my $src_event ( @{ $src_premis_events->{$eventtype} } ) { - my $eventid = $xc->findvalue( "./premis:eventIdentifier[premis:eventIdentifierType='UUID']/premis:eventIdentifierValue", + + foreach my $src_event (@{$src_premis_events->{$eventtype}}) { + my $eventid = $xc->findvalue( + "./premis:eventIdentifier[premis:eventIdentifierType='UUID']/premis:eventIdentifierValue", $src_event ); - # overwrite already-included event w/ updated information if needed $self->{included_events}{$eventid} = $src_event; $premis->add_event($src_event); - } } } @@ -653,12 +676,11 @@ sub _add_premis { # map from UUID to event - events that have already been added $self->{included_events} = {}; - my $premis = $self->{premis}; my $old_events = $self->_extract_old_premis(); if ($old_events) { - while ( my ( $eventid, $event ) = each(%$old_events) ) { + while (my ($eventid, $event) = each(%$old_events)) { $self->{included_events}{$eventid} = $event; $premis->add_event($event); } @@ -671,38 +693,49 @@ sub _add_premis { # create PREMIS object my $premis_object = - new PREMIS::Object( 'HathiTrust', $volume->get_identifier() ); - $premis_object->add_significant_property( 'file count', - $volume->get_file_count() ); + new PREMIS::Object('HathiTrust', $volume->get_identifier()); + $premis_object->add_significant_property( + 'file count', + $volume->get_file_count() + ); if ($volume->get_file_groups()->{image}) { - $premis_object->add_significant_property( 'page count', - $volume->get_page_count() ); + $premis_object->add_significant_property( + 'page count', + $volume->get_page_count() + ); } $premis->add_object($premis_object); # last chance to record, even though it's not done yet $volume->record_premis_event('ingestion'); - $self->_add_premis_events( 
$nspkg->get('premis_events') ); + $self->_add_premis_events($nspkg->get('premis_events')); - my $digiprovMD = - new METS::MetadataSection( 'digiprovMD', 'id' => 'premis1' ); - $digiprovMD->set_xml_node( $premis->to_node(), mdtype => 'PREMIS' ); - - push( @{ $self->{amd_mdsecs} }, $digiprovMD ); + my $digiprovMD = new METS::MetadataSection( + 'digiprovMD', + 'id' => 'premis1' + ); + $digiprovMD->set_xml_node( + $premis->to_node(), + mdtype => 'PREMIS' + ); + push(@{$self->{amd_mdsecs}}, $digiprovMD); } sub _add_amdsecs { my $self = shift; - $self->{'mets'} - ->add_amd_sec( $self->_get_subsec_id("AMD"), @{ $self->{amd_mdsecs} } ); + $self->{'mets'}->add_amd_sec( + $self->_get_subsec_id("AMD"), + @{$self->{amd_mdsecs}} + ); } sub _get_subsec_id { my $self = shift; my $subsec_type = shift; + $self->{counts} = {} if not exists $self->{counts}; $self->{counts}{$subsec_type} = 0 if not exists $self->{counts}{$subsec_type}; @@ -711,6 +744,7 @@ sub _get_subsec_id { sub _add_zip_fg { my $self = shift; + my $mets = $self->{mets}; my $volume = $self->{volume}; @@ -719,8 +753,16 @@ sub _add_zip_fg { id => $self->_get_subsec_id("FG"), use => 'zip archive' ); - my ($zip_path,$zip_name) = ($volume->get_zip_directory(), $volume->get_zip_filename()); - $zip_filegroup->add_file( $zip_name, path => $zip_path, prefix => 'ZIP' ); + + my $zip_path = $volume->get_zip_directory(); + my $zip_name = $volume->get_zip_filename(); + + $zip_filegroup->add_file( + $zip_name, + path => $zip_path, + prefix => 'ZIP' + ); + $mets->add_filegroup($zip_filegroup); } @@ -732,14 +774,16 @@ sub _add_srcmets_fg { # Add source METS if it is present my $src_mets_file = $self->{volume}->get_source_mets_file(); - if($src_mets_file) { + if ($src_mets_file) { my $mets_filegroup = new METS::FileGroup( id => $self->_get_subsec_id("FG"), use => 'source METS' ); - $mets_filegroup->add_file( $src_mets_file, - path => $volume->get_staging_directory(), - prefix => 'METS' ); + $mets_filegroup->add_file( + $src_mets_file, 
+ path => $volume->get_staging_directory(), + prefix => 'METS' + ); $mets->add_filegroup($mets_filegroup); } } @@ -752,16 +796,18 @@ sub _add_content_fgs { # then add the actual content files my $filegroups = $volume->get_file_groups(); $self->{filegroups} = {}; - while ( my ( $filegroup_name, $filegroup ) = each(%$filegroups) ) { + while (my ($filegroup_name, $filegroup) = each(%$filegroups)) { # ignore empty file groups next unless @{$filegroup->get_filenames()}; my $mets_filegroup = new METS::FileGroup( id => $self->_get_subsec_id("FG"), use => $filegroup->get_use() ); - $mets_filegroup->add_files( $filegroup->get_filenames(), + $mets_filegroup->add_files( + $filegroup->get_filenames(), prefix => $filegroup->get_prefix(), - path => $volume->get_staging_directory() ); + path => $volume->get_staging_directory() + ); $self->{filegroups}{$filegroup_name} = $mets_filegroup; $mets->add_filegroup($mets_filegroup); @@ -775,31 +821,33 @@ sub _add_filesecs { $self->_add_zip_fg(); $self->_add_srcmets_fg(); $self->_add_content_fgs(); - } # Basic structMap with optional page labels. 
sub _add_struct_map { my $self = shift; - my $mets = $self->{mets}; - my $volume = $self->{volume}; - my $get_pagedata = $self->{pagedata}; - my $struct_map = new METS::StructMap( id => 'SM1', type => 'physical' ); - my $voldiv = new METS::StructMap::Div( type => 'volume' ); + my $mets = $self->{mets}; + my $volume = $self->{volume}; + my $get_pagedata = $self->{pagedata}; + my $struct_map = new METS::StructMap( + id => 'SM1', + type => 'physical' + ); + my $voldiv = new METS::StructMap::Div(type => 'volume'); $struct_map->add_div($voldiv); + my $order = 1; my $file_groups_by_page = $volume->get_structmap_file_groups_by_page(); - foreach my $seqnum ( sort( keys(%$file_groups_by_page) ) ) { + foreach my $seqnum (sort(keys(%$file_groups_by_page))) { my $pagefiles = $file_groups_by_page->{$seqnum}; my $pagediv_ids = []; my $pagedata; my @pagedata; - while ( my ( $filegroup_name, $files ) = each(%$pagefiles) ) { + while (my ($filegroup_name, $files) = each(%$pagefiles)) { foreach my $file (@$files) { - my $fileid = - $self->{filegroups}{$filegroup_name}->get_file_id($file); - if ( not defined $fileid ) { + my $fileid = $self->{filegroups}{$filegroup_name}->get_file_id($file); + if (not defined $fileid) { $self->set_error( "MissingField", field => "fileid", @@ -810,30 +858,26 @@ sub _add_struct_map { next; } - if(defined $get_pagedata) { + if (defined $get_pagedata) { # try to find page number & page tags for this page - if ( not defined $pagedata ) { + if (not defined $pagedata) { $pagedata = &$get_pagedata($file); @pagedata = %$pagedata if defined $pagedata; - } - else { + } else { my $other_pagedata = &$get_pagedata($file); - while ( my ( $key, $val ) = each(%$pagedata) ) { + while (my ($key, $val) = each(%$pagedata)) { my $val1 = $other_pagedata->{$key}; - $self->set_error( - "NotEqualValues", - actual => "other=$val ,$fileid=$val1", - detail => - "Mismatched page data for different files in pagefiles" - ) - unless ( not defined $val and not defined $val1 ) - or ( 
$val eq $val1 ); + unless ((not defined $val and not defined $val1) or ($val eq $val1)) { + $self->set_error( + "NotEqualValues", + actual => "other=$val ,$fileid=$val1", + detail => "Mismatched page data for different files in pagefiles" + ); + } } - } } - - push( @$pagediv_ids, $fileid ); + push(@$pagediv_ids, $fileid); } } $voldiv->add_file_div( @@ -844,7 +888,6 @@ sub _add_struct_map { ); } $mets->add_struct_map($struct_map); - } sub _save_mets { @@ -863,11 +906,12 @@ sub _validate_mets { my $self = shift; my $mets_path = $self->{outfile}; - croak("File $mets_path does not exist. Cannot validate.") - unless -e $mets_path; + unless (-e $mets_path) { + croak("File $mets_path does not exist. Cannot validate.") + } - my ( $mets_valid, $val_results ) = $self->validate_xml($mets_path); - if ( !$mets_valid ) { + my ($mets_valid, $val_results) = $self->validate_xml($mets_path); + if (!$mets_valid) { $self->set_error( "BadFile", file => $mets_path, @@ -877,27 +921,29 @@ sub _validate_mets { # TODO: set failure creating METS file return; } - } sub validate_xml { - my $self = shift; - my $use_caching = $self->{volume}->get_nspkg()->get('use_schema_caching'); + my $self = shift; + my $filename = shift; + + my $use_caching = $self->{volume}->get_nspkg()->get('use_schema_caching'); my $schema_cache = get_config('xerces_cache'); - my $xerces = get_config('xerces'); + my $xerces = get_config('xerces'); - $xerces .= " $schema_cache" if($use_caching); + $xerces .= " $schema_cache" if $use_caching; - my $filename = shift; my $validation_cmd = "$xerces '$filename' 2>&1"; my $val_results = `$validation_cmd`; - if ( ($use_caching and $val_results !~ /\Q$filename\E OK/) or - (!$use_caching and $val_results =~ /Error/) or - $? ) { - wantarray ? return ( 0, $val_results ) : return (0); - } - else { - wantarray ? return ( 1, undef ) : return (0); + + if ( + ($use_caching and $val_results !~ /\Q$filename\E OK/) or + (!$use_caching and $val_results =~ /Error/) or + $?
+ ) { + wantarray ? return (0, $val_results) : return (0); + } else { + wantarray ? return (1, undef) : return (1); } } @@ -911,9 +957,12 @@ sub _get_createdate { my $ts = sprintf( "%d-%02d-%02dT%02d:%02d:%02dZ", - ( 1900 + $gmtime_obj->year() ), ( 1 + $gmtime_obj->mon() ), - $gmtime_obj->mday(), $gmtime_obj->hour(), - $gmtime_obj->min(), $gmtime_obj->sec() + (1900 + $gmtime_obj->year()), + (1 + $gmtime_obj->mon()), + $gmtime_obj->mday(), + $gmtime_obj->hour(), + $gmtime_obj->min(), + $gmtime_obj->sec() ); return $ts; @@ -928,6 +977,7 @@ sub clean_always { # do cleaning that is appropriate after failure sub clean_failure { my $self = shift; + $self->{volume}->clean_mets(); } @@ -935,7 +985,7 @@ sub clean_failure { # do not match the regular expression for the leader in the MARC schema sub _remediate_marc { my $self = shift; - my $xc = shift; + my $xc = shift; foreach my $fakeleader ($xc->findnodes('.//marc:controlfield[@tag="LDR"]')) { $fakeleader->removeAttribute('tag'); @@ -946,27 +996,29 @@ sub _remediate_marc { my @controlfields = (); foreach my $controlfield ($xc->findnodes('.//marc:controlfield')) { $controlfield->parentNode()->removeChild($controlfield); - if($controlfield->getAttribute('tag') =~ /^\d{2}[A-Z0-9]$/) { - push(@controlfields,$controlfield); + if ($controlfield->getAttribute('tag') =~ /^\d{2}[A-Z0-9]$/) { + push(@controlfields, $controlfield); } } foreach my $datafield ($xc->findnodes('.//marc:datafield')) { - if($datafield->getAttribute('tag') =~ /^[A-Z]{3}$/) { + if ($datafield->getAttribute('tag') =~ /^[A-Z]{3}$/) { $datafield->parentNode()->removeChild($datafield); } } my @leaders = $xc->findnodes(".//marc:leader"); - if(@leaders != 1) { - $self->set_error("BadField",field=>"marc:leader",detail=>"Zero or more than one leader found"); + if (@leaders != 1) { + $self->set_error( + "BadField", + field => "marc:leader", + detail => "Zero or more than one leader found" + ); return; } my $leader = $leaders[0]; - - my $value =
$leader->findvalue("."); - + my $value = $leader->findvalue("."); $value =~ s/\^/ /g; if ($value !~ /^ @@ -989,77 +1041,77 @@ sub _remediate_marc { $/x) { # fix up material with record status of 'a' and no record type - if(substr($value,5,2) eq 'a ') { - substr($value,5,2) = ' a'; + if (substr($value, 5, 2) eq 'a ') { + substr($value, 5, 2) = ' a'; } # 00-04: Record length - default to empty - if(substr($value,0,5) !~ /^[\d ]{5}$/) { - substr($value,0,5) = ' '; + if (substr($value, 0, 5) !~ /^[\d ]{5}$/) { + substr($value, 0, 5) = ' '; } # 05: Record status - if(substr($value,5,1) !~ /^[\dA-Za-z ]$/) { - substr($value,5,1) = ' '; + if (substr($value, 5, 1) !~ /^[\dA-Za-z ]$/) { + substr($value, 5, 1) = ' '; } # 06: Type of record - if(substr($value,6,1) !~ /^[\dA-Za-z]$/) { + if (substr($value, 6, 1) !~ /^[\dA-Za-z]$/) { get_logger()->warn("Invalid value found for record type, can't remediate"); } # 07: Bibliographic level - if(substr($value,7,1) !~ /^[\dA-Za-z ]$/) { - substr($value,7,1) = ' '; + if (substr($value, 7, 1) !~ /^[\dA-Za-z ]$/) { + substr($value, 7, 1) = ' '; } # 08: Type of control - if(substr($value,8,1) !~ /^[\dA-Za-z ]$/) { - substr($value,8,1) = ' '; + if (substr($value, 8, 1) !~ /^[\dA-Za-z ]$/) { + substr($value, 8, 1) = ' '; } # 09: Character coding scheme - if(substr($value,9,1) ne 'a') { + if (substr($value, 9, 1) ne 'a') { get_logger()->warn("Non-Unicode MARC-XML found"); } # 10: Indicator count - if(substr($value,10,1) !~ /^(2| )$/) { - substr($value,10,1) = ' '; + if (substr($value, 10, 1) !~ /^(2| )$/) { + substr($value, 10, 1) = ' '; } # 11: Subfield code count - if(substr($value,11,1) !~ /^(2| )$/) { - substr($value,11,1) = ' '; + if (substr($value, 11, 1) !~ /^(2| )$/) { + substr($value, 11, 1) = ' '; } # 12-16: Base address of data - if(substr($value,12,5) !~ /^[\d ]{5}$/) { - substr($value,12,5) = ' '; + if (substr($value, 12, 5) !~ /^[\d ]{5}$/) { + substr($value, 12, 5) = ' '; } # 17: Encoding level - if(substr($value,17,1) !~ 
/^[\dA-Za-z ]$/) { - substr($value,17,1) = 'u'; # unknown + if (substr($value, 17, 1) !~ /^[\dA-Za-z ]$/) { + substr($value, 17, 1) = 'u'; # unknown } # 18: Descriptive cataloging form - if(substr($value,18,1) !~ /^[\dA-Za-z ]$/) { - substr($value,18,1) = 'u'; # unknown + if (substr($value, 18, 1) !~ /^[\dA-Za-z ]$/) { + substr($value, 18, 1) = 'u'; # unknown } # 19: Multipart resource record level - if(substr($value,19,1) !~ /^[\dA-Za-z ]$/) { - substr($value,19,1) = ' '; + if (substr($value, 19, 1) !~ /^[\dA-Za-z ]$/) { + substr($value, 19, 1) = ' '; } # 20: Length of the length-of-field portion # 21: Length of the start-character-position portion # 22: Length of the implementatino-defined portion # 23: Undefined - if(substr($value,20,4) !~ /^(4500| )/) { + if (substr($value, 20, 4) !~ /^(4500| )/) { # default to unspecified - substr($value,20,4) = ' '; + substr($value, 20, 4) = ' '; } } @@ -1068,7 +1120,7 @@ sub _remediate_marc { # reinsert control fields in the correct place while (my $controlfield = pop @controlfields) { - $leader->parentNode()->insertAfter($controlfield,$leader); + $leader->parentNode()->insertAfter($controlfield, $leader); } foreach my $datafield ($xc->findnodes('.//marc:datafield')) { @@ -1079,19 +1131,18 @@ sub _remediate_marc { # clean ind1, ind2; move i{1,2} -> ind{1,2} 'ind1' => 'ind1', 'ind2' => 'ind2', - 'i1' => 'ind1', - 'i2' => 'ind2', + 'i1' => 'ind1', + 'i2' => 'ind2', }; - while (my ($old,$new) = each (%$attrs_to_move)) { - if($datafield->hasAttribute($old)) { - + while (my ($old, $new) = each (%$attrs_to_move)) { + if ($datafield->hasAttribute($old)) { my $attrval = $datafield->getAttribute($old); # default to empty if value is invalid - if($attrval !~ /^[\da-z ]{1}$/) { + if ($attrval !~ /^[\da-z ]{1}$/) { $attrval = " "; } $datafield->removeAttribute($old); - $datafield->setAttribute($new,$attrval); + $datafield->setAttribute($new, $attrval); } } } @@ -1100,18 +1151,17 @@ sub _remediate_marc { # remove empty data fields 
$datafield->parentNode()->removeChild($datafield); } - - } sub convert_tz { - my $self = shift; - my $date = shift; + my $self = shift; + my $date = shift; my $from_tz = shift; - die("No from_tz specified") unless defined $from_tz; + die("No from_tz specified") unless defined $from_tz; die("Missing Date::Manip::VERSION") unless defined $Date::Manip::VERSION; - if($Date::Manip::VERSION < 6.00) { + + if ($Date::Manip::VERSION < 6.00) { # version 5 functional interface, doesn't track timezone my $parsed = ParseDate($date); $self->set_error("BadValue",actual=>"$date",field=>"date",detail=>"Can't parse date") unless defined $parsed; @@ -1141,62 +1191,71 @@ sub is_uplift { } sub agent_type { - my $self = shift; - my $agentid = shift; + my $self = shift; + my $agentid = shift; - return "HathiTrust Institution ID"; + return "HathiTrust Institution ID"; } # map MARC21 agent codes to HathiTrust Institution IDs sub migrate_agent_identifiers { - my $self = shift; - my $xc = shift; - my $volume = $self->{volume}; - - # migrate agent IDs - # - foreach my $agent ( $xc->findnodes('//premis:linkingAgentIdentifier') ) { - my $agent_type = ($xc->findnodes('./premis:linkingAgentIdentifierType',$agent))[0]; - my $agent_value = ($xc->findnodes('./premis:linkingAgentIdentifierValue',$agent))[0]; - - my $agent_type_text = $agent_type->textContent(); - my $agent_value_text = $agent_value->textContent(); - my $new_agent_value = undef; - # TODO: remove after uplift - if($agent_type_text eq 'MARC21 Code' or $agent_type_text eq 'AgentID') { - $new_agent_value = $agent_mapping{$agent_value_text}; - if(not defined $new_agent_value) { - $self->set_error("BadValue",field=>'linkingAgentIdentifierValue', - actual => $agent_value_text, - detail => "Don't know what the HT institution ID is for obsolete agent identifier"); - } - } elsif($agent_type_text eq 'HathiTrust AgentID') { - if($agent_value_text eq 'UNKNOWN' and $volume->{namespace} = 'mdp') { - # best guess - $new_agent_value = 'umich'; - } 
else { - $self->set_error("BadValue",field=>'linkingAgentIdentifierValue', - actual => $agent_value_text, - detail => 'Unexpected HathiTrust AgentID'); - } - } elsif($agent_type_text eq 'HathiTrust Institution ID' or $agent_type_text eq 'tool') { - # do nothing - } else { - my $mets_in_repos = $volume->get_repository_mets_path(); - $self->set_error("BadValue",field => 'linkingAgentIdentifierType', - actual => $agent_type_text, - expected => 'tool, MARC21 Code, or HathiTrust Institution ID', - file => $mets_in_repos) - } + my $self = shift; + my $xc = shift; + + my $volume = $self->{volume}; - if(defined $new_agent_value) { - $agent_type->removeChildNodes(); - $agent_type->appendText("HathiTrust Institution ID"); - $agent_value->removeChildNodes(); - $agent_value->appendText($new_agent_value); + # migrate agent IDs + foreach my $agent ($xc->findnodes('//premis:linkingAgentIdentifier')) { + my $agent_type = ($xc->findnodes('./premis:linkingAgentIdentifierType', $agent))[0]; + my $agent_value = ($xc->findnodes('./premis:linkingAgentIdentifierValue', $agent))[0]; + + my $agent_type_text = $agent_type->textContent(); + my $agent_value_text = $agent_value->textContent(); + my $new_agent_value = undef; + # TODO: remove after uplift + if ($agent_type_text eq 'MARC21 Code' or $agent_type_text eq 'AgentID') { + $new_agent_value = $agent_mapping{$agent_value_text}; + if (not defined $new_agent_value) { + $self->set_error( + "BadValue", + field => 'linkingAgentIdentifierValue', + actual => $agent_value_text, + detail => "Don't know what the HT institution ID is for obsolete agent identifier" + ); + } + } elsif ($agent_type_text eq 'HathiTrust AgentID') { + if ($agent_value_text eq 'UNKNOWN' and $volume->{namespace} eq 'mdp') { + # best guess + $new_agent_value = 'umich'; + } else { + $self->set_error( + "BadValue", + field => 'linkingAgentIdentifierValue', + actual => $agent_value_text, + detail => 'Unexpected HathiTrust AgentID' + ); + } + } elsif ($agent_type_text eq
'HathiTrust Institution ID' or $agent_type_text eq 'tool') { + # do nothing + } else { + my $mets_in_repos = $volume->get_repository_mets_path(); + $self->set_error( + "BadValue", + field => 'linkingAgentIdentifierType', + actual => $agent_type_text, + expected => 'tool, MARC21 Code, or HathiTrust Institution ID', + file => $mets_in_repos + ); + } + + if (defined $new_agent_value) { + $agent_type->removeChildNodes(); + $agent_type->appendText("HathiTrust Institution ID"); + $agent_value->removeChildNodes(); + $agent_value->appendText($new_agent_value); + } } - } } 1;