diff --git a/lib/HTFeed/PackageType/Simple/ImageRemediate.pm b/lib/HTFeed/PackageType/Simple/ImageRemediate.pm index a14c1e5a..005c19c4 100644 --- a/lib/HTFeed/PackageType/Simple/ImageRemediate.pm +++ b/lib/HTFeed/PackageType/Simple/ImageRemediate.pm @@ -1,84 +1,88 @@ package HTFeed::PackageType::Simple::ImageRemediate; -use warnings; use strict; +use warnings; + use base qw(HTFeed::Stage::ImageRemediate); -use List::Util qw(max min); -use POSIX qw(ceil); + +use Carp; +use File::Basename qw(basename); +use File::Copy qw(move); use HTFeed::Config qw(get_config); use HTFeed::Stage::Fetch; +use List::Util qw(max min); use Log::Log4perl qw(get_logger); -use File::Basename qw(basename); -use File::Copy qw(move); -use Carp; +use POSIX qw(ceil); my %tiff_field_map = ( # will be automatically reformatted for IFD0:ModifyDate and XMP-tiff:DateTime - capture_date => 'DateTime', - scanner_user => 'IFD0:Artist', - scanner_make => 'IFD0:Make', + capture_date => 'DateTime', + scanner_user => 'IFD0:Artist', + scanner_make => 'IFD0:Make', scanner_model => 'IFD0:Model', ); my %jpeg2000_field_map = ( - capture_date => 'XMP-tiff:DateTime', - scanner_user => 'XMP-tiff:Artist', - scanner_make => 'XMP-tiff:Make', + capture_date => 'XMP-tiff:DateTime', + scanner_user => 'XMP-tiff:Artist', + scanner_make => 'XMP-tiff:Make', scanner_model => 'XMP-tiff:Model', ); -sub run{ - my $self = shift; - my $volume = $self->{volume}; +sub run { + my $self = shift; + my $volume = $self->{volume}; my $preingest_dir = $volume->get_preingest_directory(); - my $staging_dir = $volume->get_staging_directory(); + my $staging_dir = $volume->get_staging_directory(); # decompress any lossless JPEG2000 images my @jp2 = glob("$preingest_dir/*.jp2"); - if(@jp2) { - $self->expand_lossless_jpeg2000($volume,$preingest_dir,[map { basename($_) } @jp2]); + if (@jp2) { + $self->expand_lossless_jpeg2000($volume, $preingest_dir, [map { basename($_) } @jp2]); } - #remediate TIFFs + #remediate TIFFs my @tiffs = map { basename($_) } glob("$preingest_dir/*.tif"); - $self->remediate_tiffs($volume,$preingest_dir,\@tiffs, - - # return extra fields to set that depend on the file - sub { - my $file = shift; - my $force_fields = {'IFD0:DocumentName' => join('/',$volume->get_objid(),$file) }; - my $set_if_undefined = {}; - while(my ($meta_yml_field,$tiff_field) = each(%tiff_field_map)) { - $self->set_from_meta_yml($meta_yml_field,$set_if_undefined,$tiff_field); - } - - # force override resolution if it is provided in meta.yml - $self->set_from_meta_yml('bitonal_resolution_dpi',$force_fields,'Resolution'); - - return ( $force_fields, $set_if_undefined, $file); - } - ) if @tiffs; + if (@tiffs) { + # return extra fields to set that depend on the file + my $headers_sub = sub { + my $file = shift; + my $force_fields = { 'IFD0:DocumentName' => join('/', $volume->get_objid(), $file) }; + my $set_if_undefined = {}; + while (my ($meta_yml_field, $tiff_field) = each(%tiff_field_map)) { + $self->set_from_meta_yml($meta_yml_field, $set_if_undefined, $tiff_field); + } + # force override resolution if it is provided in meta.yml + $self->set_from_meta_yml('bitonal_resolution_dpi', $force_fields, 'Resolution'); + + return ($force_fields, $set_if_undefined, $file); + }; + + $self->remediate_tiffs( + $volume, + $preingest_dir, + \@tiffs, + $headers_sub + ) + } # remediate JP2s - - foreach my $jp2_submitted (glob("$preingest_dir/*.jp2")) - { - my $jp2_fields = $self->get_exiftool_fields($jp2_submitted); - + foreach my $jp2_submitted (glob("$preingest_dir/*.jp2")) { + my $jp2_fields = $self->get_exiftool_fields($jp2_submitted); my $staging_dir = $volume->get_staging_directory(); # there shouldn't be any JP2s for MOA material? - my $force_fields = {'XMP-dc:source' => join('/',$volume->get_objid(),basename($jp2_submitted)) }; + my $force_fields = { 'XMP-dc:source' => join('/', $volume->get_objid(), basename($jp2_submitted)) }; my $set_if_undefined = {}; - my $jp2_remediated = "$staging_dir/" . basename($jp2_submitted); + my $jp2_remediated = "$staging_dir/" . basename($jp2_submitted); - while(my ($meta_yml_field,$jp2_field) = each(%jpeg2000_field_map)) { - $self->set_from_meta_yml($meta_yml_field,$set_if_undefined,$jp2_field); + while (my ($meta_yml_field, $jp2_field) = each(%jpeg2000_field_map)) { + $self->set_from_meta_yml($meta_yml_field, $set_if_undefined, $jp2_field); } # force override resolution if it is provided in meta.yml - $self->set_from_meta_yml('contone_resolution_dpi',$force_fields,'Resolution'); + $self->set_from_meta_yml('contone_resolution_dpi', $force_fields, 'Resolution'); $self->remediate_image( $jp2_submitted, $jp2_remediated, $force_fields, $set_if_undefined ); } @@ -88,40 +92,31 @@ sub run{ # remove newlines & move OCR, supplementary files my $fetch = HTFeed::Stage::Fetch->new(volume => $volume); foreach my $file (glob("$preingest_dir/[0-9]*[0-9].{txt,html,xml}")) { - move($file,$staging_dir); + move($file, $staging_dir); } foreach my $file (glob("$preingest_dir/*.pdf")) { - move($file,$staging_dir); + move($file, $staging_dir); } $fetch->fix_line_endings($staging_dir); - - $self->_set_done(); - return $self->succeeded(); + return $self->succeeded(); } sub set_from_meta_yml { - my $self = shift; - my $meta_yml_key = shift; - my $field_output = shift; + my $self = shift; + my $meta_yml_key = shift; + my $field_output = shift; my $metadata_field = shift; - my $require = shift; - - $require = 0 if not defined $require; - + my $require = shift || 0; my $metadata_value = $self->{volume}->get_meta($meta_yml_key); - if($require and not defined $metadata_value) { - $self->set_error("MissingField",file => 'meta.yml',field=> $meta_yml_key); + if ($require and not defined $metadata_value) { + $self->set_error("MissingField", file => 'meta.yml', field => $meta_yml_key); } - return if not defined $metadata_value; $field_output->{$metadata_field} = $metadata_value; } - 1; - -__END__ diff --git a/lib/HTFeed/Stage/ImageRemediate.pm b/lib/HTFeed/Stage/ImageRemediate.pm index c1019040..b408d9bc 100644 --- a/lib/HTFeed/Stage/ImageRemediate.pm +++ b/lib/HTFeed/Stage/ImageRemediate.pm @@ -57,10 +57,9 @@ sub get_exiftool_fields { $exifTool->Options('ScanForXMP' => 1); $exifTool->ExtractInfo( $file, { Binary => 1 } ); - foreach my $tag ( $exifTool->GetFoundTags() ) { - + foreach my $tag ($exifTool->GetFoundTags()) { # get only the groupname we'll use to update it later - my $group = $exifTool->GetGroup( $tag, "1" ); + my $group = $exifTool->GetGroup( $tag, "1" ); my $tagname = Image::ExifTool::GetTagName($tag); $fields->{"$group:$tagname"} = $exifTool->GetValue($tag); } @@ -335,7 +334,7 @@ sub _remediate_tiff { } # Fix the XMP, if needed - if($self->needs_xmp) { + if ($self->needs_xmp) { # force required fields $self->{newFields}{'XMP-tiff:BitsPerSample'} = 1; $self->{newFields}{'XMP-tiff:Compression'} = 'T6/Group 4 Fax'; @@ -366,8 +365,11 @@ sub _remediate_tiff { } - $ret = $ret - && $self->repair_tiff_exiftool( $infile, $outfile, $self->{newFields} ); + $ret = $ret && $self->repair_tiff_exiftool( + $infile, + $outfile, + $self->{newFields} + ); return $ret; } @@ -430,11 +432,28 @@ sub repair_tiff_imagemagick { "TIFF_REPAIR: attempting to repair $infile to $outfile\n" ); + my $in_exif = Image::ExifTool->new; + my $in_meta = $in_exif->ImageInfo($infile); + # convert returns 0 on success, 1 on failure my $imagemagick = get_config('imagemagick'); my $rval = system("$imagemagick -compress Group4 '$infile' '$outfile' > /dev/null 2>&1"); croak("failed repairing $infile\n") if $rval; + # Some metadata may be lost when imagemagick compresses infile to outfile. + # Here we are putting Artist back, or we'll crash at a later stage, + # due to missing ImageProducer (which depends on Artist). + my $out_exif = Image::ExifTool->new; + my $out_meta = $out_exif->ImageInfo($outfile); + if (defined $in_meta->{'Artist'} && !defined $out_meta->{'Artist'}) { + my ($success, $msg) = $out_exif->SetNewValue('Artist', $in_meta->{'Artist'}); + if (defined $msg) { + croak("Error setting new tag Artist => $in_meta->{'Artist'}: $msg\n"); + } else { + $self->update_tags($out_exif, $outfile); + } + } + $self->{job_metrics}->add("ingest_imageremediate_bytes", -s $infile); $self->{job_metrics}->inc("ingest_imageremediate_images"); @@ -746,7 +765,7 @@ sub expand_lossless_jpeg2000 { $exiftool->WriteInfo("$path/$jpeg2000_remediated"); rename("$path/$jpeg2000_remediated","$path/$jpeg2000"); - unlink("$path/$tiff"); + unlink("$path/$tiff"); } }, "-m JPEG2000-hul" @@ -876,10 +895,10 @@ for remediate_image (qv) =cut sub remediate_tiffs { - - my ( $self, $volume, $tiffpath, $files, $headers_sub ) = @_; + my ($self, $volume, $tiffpath, $files, $headers_sub) = @_; my $repStatus_xp = XML::LibXML::XPathExpression->new( - '/jhove:jhove/jhove:repInfo/jhove:status'); + '/jhove:jhove/jhove:repInfo/jhove:status' + ); my $error_xp = XML::LibXML::XPathExpression->new( '/jhove:jhove/jhove:repInfo/jhove:messages/jhove:message[@severity="error"]' ); @@ -891,28 +910,24 @@ sub remediate_tiffs { my $headers = $self->get_exiftool_fields("$tiffpath/$tiff"); my $needwrite = 0; my $exiftool = new Image::ExifTool; - $exiftool->Options('ScanForXMP' => 1); + + $exiftool->Options('ScanForXMP' => 1); $exiftool->Options('IgnoreMinorErrors' => 1); - foreach my $field ( 'IFD0:ModifyDate', 'IFD0:Artist' ) { + foreach my $field ('IFD0:ModifyDate', 'IFD0:Artist') { my $header = $headers->{$field}; eval { - # see if the header is valid ascii or UTF-8 - my $decoded_header = - decode( 'utf-8', $header, Encode::FB_CROAK ); + my $decoded_header = decode('utf-8', $header, Encode::FB_CROAK); }; if ($@) { - # if not, strip it $exiftool->SetNewValue($field); $needwrite = 1; - } - + } } if ($needwrite) { $exiftool->WriteInfo("$tiffpath/$tiff"); } - } $self->run_jhove( @@ -922,25 +937,30 @@ sub remediate_tiffs { sub { my ( $volume, $file, $node ) = @_; my $xpc = XML::LibXML::XPathContext->new($node); - my ( $force_headers, $set_if_undefined_headers, $renamed_file ) = - ( undef, undef, undef ); + my $force_headers = undef; + my $set_if_undefined_headers = undef; + my $renamed_file = undef; register_namespaces($xpc); $self->{jhoveStatus} = $xpc->findvalue($repStatus_xp); - $self->{jhoveErrors} = - [ map { $_->textContent } $xpc->findnodes($error_xp) ]; + $self->{jhoveErrors} = [ + map { $_->textContent } $xpc->findnodes($error_xp) + ]; # get headers that may depend on the individual file if ($headers_sub) { - ( $force_headers, $set_if_undefined_headers, $renamed_file ) = - &$headers_sub($file); + ($force_headers, $set_if_undefined_headers, $renamed_file) = &$headers_sub($file); } my $outfile = "$stage_path/$file"; $outfile = "$stage_path/$renamed_file" if ( defined $renamed_file ); - $self->remediate_image( "$tiffpath/$file", $outfile, $force_headers, - $set_if_undefined_headers ); + $self->remediate_image( + "$tiffpath/$file", + $outfile, + $force_headers, + $set_if_undefined_headers + ); }, "-m TIFF-hul" ); diff --git a/t/fixtures/simple/test/bitonal_tiff.zip b/t/fixtures/simple/test/bitonal_tiff.zip new file mode 100644 index 00000000..cb2969ec Binary files /dev/null and b/t/fixtures/simple/test/bitonal_tiff.zip differ diff --git a/t/local_ingest.t b/t/local_ingest.t index 2a4287ac..81a3fa26 100644 --- a/t/local_ingest.t +++ b/t/local_ingest.t @@ -162,9 +162,16 @@ describe "HTFeed::PackageType::Simple" => sub { my $exiftool = Image::ExifTool->new(); $exiftool->ExtractInfo("$tmpdirs->{ingest}/lossless_jp2_with_xmp/00000001.jp2"); is($exiftool->GetValue("XMP-tiff:Make"),"Test scanner make"); - }; + it "does not lose artist when compressing a bitonal tiff" => sub { + my $volume = unpacked_volume("bitonal_tiff"); + HTFeed::PackageType::Simple::ImageRemediate->new(volume => $volume)->run(); + HTFeed::PackageType::Simple::SourceMETS->new(volume => $volume)->run(); + my $validate = HTFeed::VolumeValidator->new(volume => $volume); + $validate->run(); + ok($validate->succeeded()); + }; }; };