From 52d569b9d681bfe739c2e9bd9db39f5a2742e5bd Mon Sep 17 00:00:00 2001 From: Banu Hapeloglu Kutlu Date: Wed, 24 Jun 2020 11:01:03 -0400 Subject: [PATCH] Ingesting HathiTrust overlap data (#215) * add ht_bib_key_ssim for multi/serial type hathi records and update access_online facet * hathi data as struct, adding access info as well * adds hathi etas on/off in settings --- .rubocop_todo.yml | 6 +-- README.md | 10 ++-- config/indexer_settings_dev.yml | 1 - config/indexer_settings_test.yml | 3 +- lib/marc_access_facet_processor.rb | 10 +++- .../{hathi_etas.rake => hathi_trust.rake} | 39 ++++++++++----- lib/traject/macros/custom.rb | 18 +++++-- lib/traject/psulib_config.rb | 18 +++---- spec/fixtures/access_hathi_allow.mrc | 1 + spec/fixtures/access_hathi_deny.mrc | 1 + .../hathitrust/final_hathi_mono_overlap.csv | 11 +++++ .../hathitrust/final_hathi_multi_overlap.csv | 11 +++++ spec/fixtures/hathitrust/mock_overlap.csv | 9 ---- spec/lib/traject/access_facet_spec.rb | 32 +++++++++++++ spec/lib/traject/macros/custom_spec.rb | 48 ++++++++++++++++--- 15 files changed, 167 insertions(+), 51 deletions(-) rename lib/tasks/{hathi_etas.rake => hathi_trust.rake} (52%) create mode 100644 spec/fixtures/access_hathi_allow.mrc create mode 100644 spec/fixtures/access_hathi_deny.mrc create mode 100644 spec/fixtures/hathitrust/final_hathi_mono_overlap.csv create mode 100644 spec/fixtures/hathitrust/final_hathi_multi_overlap.csv delete mode 100644 spec/fixtures/hathitrust/mock_overlap.csv diff --git a/.rubocop_todo.yml b/.rubocop_todo.yml index 170329d0..fb6ae70f 100644 --- a/.rubocop_todo.yml +++ b/.rubocop_todo.yml @@ -15,7 +15,7 @@ Metrics/AbcSize: # Configuration parameters: CountComments, ExcludedMethods. # ExcludedMethods: refine Metrics/BlockLength: - Max: 179 + Max: 206 # Offense count: 2 # Configuration parameters: CountComments. @@ -30,12 +30,12 @@ Metrics/CyclomaticComplexity: # Offense count: 15 # Configuration parameters: CountComments, ExcludedMethods. 
Metrics/MethodLength: - Max: 21 + Max: 23 # Offense count: 1 # Configuration parameters: CountComments. Metrics/ModuleLength: - Max: 156 + Max: 162 # Offense count: 1 # Configuration parameters: MinBodyLength. diff --git a/README.md b/README.md index 7b7f4003..afa28ecc 100644 --- a/README.md +++ b/README.md @@ -65,15 +65,19 @@ display the output to the console (and not push the data to Solr). $ bundle exec traject --debug-mode -c lib/traject/psulib_config.rb solr/sample_data/sample_psucat.mrc ``` -## Hathi Trust ETAS data +## HathiTrust ETAS data -To generate a synthesized Hathi Trust overlap report locally, you can run +To generate a synthesized HathiTrust overlap report locally, you can run ``` RUBY_ENVIRONMENT=dev bundle exec rake hathitrust:process_hathi_etas`. ``` -You will need the overlap file provided by Hathi Trust (`overlap_[date]_psu.tsv`) in your `ignorethis_hathi/` directory. +You will need the overlap file provided by HathiTrust (`overlap_[date]_psu.tsv`) and `hathi_field_list.csv` with the content + +`oclc_num,htid,ht_bib_key,access` + +in your `ignorethis_hathi/` directory (or whatever directory path you set in `hathi_overlap_path`). 
Also make sure the below settings in your `indexer_settings_dev.yml` file are set with the correct info: diff --git a/config/indexer_settings_dev.yml b/config/indexer_settings_dev.yml index 8b0d7508..3609aed5 100644 --- a/config/indexer_settings_dev.yml +++ b/config/indexer_settings_dev.yml @@ -2,6 +2,5 @@ processing_thread_pool: 5 hathi_overlap_path: ignorethis_hathi/ -hathi_overlap_file: final_hathi_overlap.csv hathi_load_period: 20200501 overlap_file: overlap_20200518_psu.tsv \ No newline at end of file diff --git a/config/indexer_settings_test.yml b/config/indexer_settings_test.yml index 55ef2b3b..31ccf883 100644 --- a/config/indexer_settings_test.yml +++ b/config/indexer_settings_test.yml @@ -1,5 +1,4 @@ --- processing_thread_pool: 5 -hathi_overlap_path: spec/fixtures/hathitrust/ -hathi_overlap_file: mock_overlap.csv \ No newline at end of file +hathi_overlap_path: spec/fixtures/hathitrust/ \ No newline at end of file diff --git a/lib/marc_access_facet_processor.rb b/lib/marc_access_facet_processor.rb index 1b3619a3..7fcb7af6 100644 --- a/lib/marc_access_facet_processor.rb +++ b/lib/marc_access_facet_processor.rb @@ -25,7 +25,7 @@ def extract_access_data(record, context) end end - access << 'Online' if context.output_hash&.dig('ht_id_ssim') + access << 'Online' if hathi_access? context access.compact! access.uniq! access.delete 'On Order' if not_only_on_order? 
access @@ -43,4 +43,12 @@ def resolve_library_code(field, library) 'On Order' end + + def hathi_access?(context) + hathitrust_struct = context.output_hash&.dig('hathitrust_struct') + hathitrust_etas = context.settings['hathi_etas'] + return false unless hathitrust_struct + + hathitrust_etas || JSON.parse(hathitrust_struct&.first)['access'] == 'allow' + end end diff --git a/lib/tasks/hathi_etas.rake b/lib/tasks/hathi_trust.rake similarity index 52% rename from lib/tasks/hathi_etas.rake rename to lib/tasks/hathi_trust.rake index deeb8602..e3554984 100644 --- a/lib/tasks/hathi_etas.rake +++ b/lib/tasks/hathi_trust.rake @@ -2,7 +2,7 @@ namespace :hathitrust do desc 'Process overlap file for emergency access to restricted HathiTrust material' - task :process_hathi_etas do + task :process_hathi_overlap do indexer_settings = YAML.load_file("config/indexer_settings_#{ENV['RUBY_ENVIRONMENT']}.yml") period = indexer_settings['hathi_load_period'] @@ -10,8 +10,8 @@ namespace :hathitrust do Rake::Task['hathitrust:load_hathi_full'].invoke(period) Rake::Task['hathitrust:pare_hathi_full'].invoke(period) Rake::Task['hathitrust:extract_overlap_oclc'].invoke(indexer_settings['overlap_file']) + Rake::Task['hathitrust:split_overlap_oclc'].invoke Rake::Task['hathitrust:filter_overlap'].invoke - Rake::Task['hathitrust:extract_uniq_oclc'].invoke end end @@ -20,29 +20,42 @@ namespace :hathitrust do print `curl -s -O 'https://www.hathitrust.org/sites/www.hathitrust.org/files/hathifiles/hathi_full_#{args[:period]}.txt.gz'` end - desc 'Pare the monthly file down to just the needed data: Hathi Trust id and OCLC number' + desc 'Pare the monthly file down to just the needed data: OCLC number, HathiTrust id, HathiTrust bib_key and HathiTurst access code' task :pare_hathi_full, [:period] do |_task, args| print `gunzip -c hathi_full_#{args[:period]}.txt.gz | \ - csvcut -t -c 1,8 -z 1310720 | \ - csvgrep -c 1,2 -r ".+" | \ + csvcut -t -c 8,1,4,2 -z 1310720 | \ + csvgrep -c 1,2,3,4 -r ".+" | \ sort | 
uniq > hathi_full_dedupe.csv` + + print `cat hathi_field_list.csv hathi_full_dedupe.csv > hathi_full_dedupe_with_headers.csv` end - desc 'Extract the unique set of OCLC numbers from the overlap report' + desc 'Extract the unique set of OCLC numbers and access code from the overlap report' task :extract_overlap_oclc, [:overlap_file] do |_task, args| print `csvgrep -t -c 4 -r ".+" #{args[:overlap_file]} | \ - csvcut -c 1 | \ + csvcut -c 1,3 | \ sort | uniq > overlap_all_unique.csv` end + desc 'Split the overlap report by item_type: mono and multi/serial' + task :split_overlap_oclc do + print `csvgrep -H -c 2 -m "mono" overlap_all_unique.csv | \ + csvcut -c 1 | \ + sort | uniq > overlap_mono_unique.csv` + + print `csvgrep -H -c 2 -m "mono" -i overlap_all_unique.csv | \ + csvcut -c 1 | \ + sort | uniq > overlap_multi_unique.csv` + end + desc 'Filter the pared down HathiTrust data using the overlap OCLC numbers as the filter input' task :filter_overlap do - print `csvgrep -c 2 -f overlap_all_unique.csv \ - hathi_full_dedupe.csv > hathi_filtered_by_overlap.csv` - end + print `csvgrep -c 1 -f overlap_mono_unique.csv hathi_full_dedupe_with_headers.csv | \ + csvcut -C 3 | \ + sort | uniq > final_hathi_mono_overlap.csv` - desc 'Extract the unique set of OCLC numbers from the filtered data' - task :extract_uniq_oclc do - print `sort -t, -k2 -u hathi_filtered_by_overlap.csv > final_hathi_overlap.csv` + print `csvgrep -c 1 -f overlap_multi_unique.csv hathi_full_dedupe_with_headers.csv | \ + csvcut -C 2 | \ + sort | uniq > final_hathi_multi_overlap.csv` end end diff --git a/lib/traject/macros/custom.rb b/lib/traject/macros/custom.rb index 09e45913..67aaee13 100644 --- a/lib/traject/macros/custom.rb +++ b/lib/traject/macros/custom.rb @@ -205,12 +205,22 @@ def self.includes_oclc_indicators?(sf_a) sf_a.include?('OCLC') end - # Extract ht_id - def extract_ht_id + def extract_hathi_data lambda do |_record, accumulator, context| oclc_number = 
context.output_hash&.dig('oclc_number_ssim')&.first - accumulator << HATHI_ETAS_OVERLAP[oclc_number] - accumulator.compact + ht_hash = HATHI_MULTI_OVERLAP&.dig(oclc_number)&.first || HATHI_MONO_OVERLAP&.dig(oclc_number)&.first + accumulator << ht_hash.to_json unless ht_hash.nil? + end + end + + def hathi_to_hash(ht_format) + hathi_overlap_csv = "#{settings['hathi_overlap_path']}final_hathi_#{ht_format}_overlap.csv" + CSV.read(hathi_overlap_csv) + .group_by(&:shift) + .each do |_oclc, ht_data| + ht_key = ht_format == 'mono' ? :ht_id : :ht_bib_key + ht_data.map! { |data| [ht_key, :access].zip(data).to_h } + ht_data.reject! { |data| data[:access] == 'deny' } if ht_data.length > 1 end end end diff --git a/lib/traject/psulib_config.rb b/lib/traject/psulib_config.rb index 08c8cac7..31aed02e 100644 --- a/lib/traject/psulib_config.rb +++ b/lib/traject/psulib_config.rb @@ -38,7 +38,7 @@ provide 'reader_class_name', 'Traject::MarcCombiningReader' provide 'commit_timeout', '10000' provide 'hathi_overlap_path', indexer_settings['hathi_overlap_path'] || '/data/hathitrust_data/' - provide 'hathi_overlap_file', indexer_settings['hathi_overlap_file'] || 'final_hathi_overlap.csv' + provide 'hathi_etas', indexer_settings['hathi_etas'] || true if is_jruby provide 'marc4j_reader.permissive', true @@ -47,12 +47,12 @@ end end -# This HATHI_ETAS_OVERLAP constant is expected to be a two column csv of htid and oclc numbers. It is -# created by a process described at -# https://github.com/psu-libraries/psulib_blacklight/wiki/Synthesizing-overlap-data-from-HathiTrust -# @todo replace above with rake task -hathi_overlap_csv = "#{settings['hathi_overlap_path']}#{settings['hathi_overlap_file']}" -Traject::Macros::Custom::HATHI_ETAS_OVERLAP = CSV.read(hathi_overlap_csv).map(&:reverse).to_h +# HATHI_MONO_OVERLAP constant is expected to be a three column csv of oclc number, htid and access info. 
+# HATHI_MULTI_OVERLAP constant is expected to be a three column csv of oclc number, ht_bib_key and access info. +# These are generated by running `rake hathitrust:process_hathi_overlap`. +Traject::Macros::Custom::HATHI_MONO_OVERLAP = hathi_to_hash('mono') +Traject::Macros::Custom::HATHI_MULTI_OVERLAP = hathi_to_hash('multi') + ATOZ = ('a'..'z').to_a.join('') ATOU = ('a'..'u').to_a.join('') @@ -200,8 +200,8 @@ to_field 'author_meeting_display_ssm', extract_marc('111abcdfgklnpqj'), trim_punctuation to_field 'addl_author_display_ssm', extract_marc('700aqbcdjk:710abcdfgjkln:711abcdfgjklnpq'), trim_punctuation -# Permanent HathiTrust item identifier -to_field 'ht_id_ssim', extract_ht_id +# HathiTrust fields +to_field 'hathitrust_struct', extract_hathi_data ## Access facet access_facet_processor = MarcAccessFacetProcessor.new diff --git a/spec/fixtures/access_hathi_allow.mrc b/spec/fixtures/access_hathi_allow.mrc new file mode 100644 index 00000000..d66f5f0f --- /dev/null +++ b/spec/fixtures/access_hathi_allow.mrc @@ -0,0 +1 @@ +00076nam a22000491 4500 0010005000000350021000051234 a(OCoLC)100000391 \ No newline at end of file diff --git a/spec/fixtures/access_hathi_deny.mrc b/spec/fixtures/access_hathi_deny.mrc new file mode 100644 index 00000000..025b14f0 --- /dev/null +++ b/spec/fixtures/access_hathi_deny.mrc @@ -0,0 +1 @@ +00076cam a22000494a 45000010005000000350021000051234 a(OCoLC)100000499 \ No newline at end of file diff --git a/spec/fixtures/hathitrust/final_hathi_mono_overlap.csv b/spec/fixtures/hathitrust/final_hathi_mono_overlap.csv new file mode 100644 index 00000000..a90501b5 --- /dev/null +++ b/spec/fixtures/hathitrust/final_hathi_mono_overlap.csv @@ -0,0 +1,11 @@ +1000,wu.89030498562,deny +10000016,uiug.30112117983731,allow +100000499,mdp.39015069374455,deny +100000684,mdp.39015068763393,deny +100000780,mdp.39015069337908,deny +10000143,uva.x000891141,deny +1000021,uc1.b3547182,allow +1000021,12345,deny +1000022826,dul1.ark:/13960/t3rv7mm1q,deny 
+1000023113,dul1.ark:/13960/t4zh3px08,deny +10000247,uiug.30112010346952,deny diff --git a/spec/fixtures/hathitrust/final_hathi_multi_overlap.csv b/spec/fixtures/hathitrust/final_hathi_multi_overlap.csv new file mode 100644 index 00000000..c5e99c6a --- /dev/null +++ b/spec/fixtures/hathitrust/final_hathi_multi_overlap.csv @@ -0,0 +1,11 @@ +100000391,012292266,allow +100000393,012292267,allow +100000753,005602659,deny +10000162,001666867,deny +1000061,005893467,allow +1000061,12345,deny +1000075,001139875,deny +10000912,012263522,deny +10002026,009880991,deny +1000266,000938715,deny +1000310,009911637,deny diff --git a/spec/fixtures/hathitrust/mock_overlap.csv b/spec/fixtures/hathitrust/mock_overlap.csv deleted file mode 100644 index 5262610d..00000000 --- a/spec/fixtures/hathitrust/mock_overlap.csv +++ /dev/null @@ -1,9 +0,0 @@ -uiug.30112117983731,10000016 -pst.000019102375,100000391 -pst.000019102412,100000393 -mdp.39015069374455,100000499 -mdp.39015068763393,100000684 -mdp.39015073590872,100000753 -mdp.39015069337908,100000780 -uva.x000891141,10000143 -mdp.39015049761318,10000162 \ No newline at end of file diff --git a/spec/lib/traject/access_facet_spec.rb b/spec/lib/traject/access_facet_spec.rb index 2a1da548..2eaccb3d 100644 --- a/spec/lib/traject/access_facet_spec.rb +++ b/spec/lib/traject/access_facet_spec.rb @@ -46,5 +46,37 @@ result = @indexer.map_record(MARC::Reader.new(File.join(fixture_path, 'access_zremoved.mrc')).to_a.first) expect(result['access_facet']).to be_nil end + + context 'when hathitrust etas is enabled' do + before do + @indexer.settings['hathi_etas'] = true + end + + it 'produces Online when a record has a hathitrust copy with allow access' do + result = @indexer.map_record(MARC::Reader.new(File.join(fixture_path, 'access_hathi_allow.mrc')).to_a.first) + expect(result['access_facet']).to contain_exactly 'Online' + end + + it 'empty when a record has a hathitrust copy with deny access' do + result = 
@indexer.map_record(MARC::Reader.new(File.join(fixture_path, 'access_hathi_deny.mrc')).to_a.first) + expect(result['access_facet']).to contain_exactly 'Online' + end + end + + context 'when hathitrust etas is not enabled' do + before do + @indexer.settings['hathi_etas'] = false + end + + it 'produces Online when a record has a hathitrust copy with allow access' do + result = @indexer.map_record(MARC::Reader.new(File.join(fixture_path, 'access_hathi_allow.mrc')).to_a.first) + expect(result['access_facet']).to contain_exactly 'Online' + end + + it 'empty when record has a hathitrust copy with deny access' do + result = @indexer.map_record(MARC::Reader.new(File.join(fixture_path, 'access_hathi_deny.mrc')).to_a.first) + expect(result['access_facet']).to be_nil + end + end end end diff --git a/spec/lib/traject/macros/custom_spec.rb b/spec/lib/traject/macros/custom_spec.rb index 94eccdd2..ec67ca59 100644 --- a/spec/lib/traject/macros/custom_spec.rb +++ b/spec/lib/traject/macros/custom_spec.rb @@ -192,22 +192,58 @@ end end - describe '#extract_ht_id' do + describe '#extract_hathi_data' do let(:result) { @indexer.map_record(MARC::Record.new_from_hash('fields' => [oclc], 'leader' => leader)) } - context 'when a record does not have a match in the overlap report' do + context 'when a record does not have a match in the overlap reports' do let(:oclc) { { '035' => { 'ind1' => '', 'ind2' => '', 'subfields' => [{ 'a' => '(OCLC)99999999' }] } } } - it 'does not produce an ht_id' do - expect(result['ht_id_ssim']).to be_nil + it 'does not produce a hathitrust_struct' do + expect(result['hathitrust_struct']).to be_nil end end - context 'when a record has a match in the overlap report' do + context 'when a record has a match in the overlap mono report' do + let(:oclc) { { '035' => { 'ind1' => '', 'ind2' => '', 'subfields' => [{ 'a' => '(OCLC)100000499' }] } } } + + it 'does maps the ht_id' do + expect(result['hathitrust_struct']).to match 
['{"ht_id":"mdp.39015069374455","access":"deny"}'] + end + end + + context 'when a record has a match in the overlap multi report' do let(:oclc) { { '035' => { 'ind1' => '', 'ind2' => '', 'subfields' => [{ 'a' => '(OCLC)100000391' }] } } } it 'does maps the ht_id' do - expect(result['ht_id_ssim']).to eq(['pst.000019102375']) + expect(result['hathitrust_struct']).to match ['{"ht_bib_key":"012292266","access":"allow"}'] + end + end + end + + describe '#hathi_to_hash' do + let(:result) { @indexer.hathi_to_hash(ht_format) } + + context 'for hathi records with mono item type' do + let(:ht_format) { 'mono' } + + it 'produces hathi data ' do + expect(result['1000']).to match [{ ht_id: 'wu.89030498562', access: 'deny' }] + end + + it 'prefers the record with allow access if there are multiple copies with same oclc' do + expect(result['1000021']).to match [{ ht_id: 'uc1.b3547182', access: 'allow' }] + end + end + + context 'for hathi records with mono item type' do + let(:ht_format) { 'multi' } + + it 'produces hathi data correctly' do + expect(result['100000391']).to match [{ ht_bib_key: '012292266', access: 'allow' }] + end + + it 'prefers the record with allow access if there are multiple copies with same oclc' do + expect(result['1000061']).to match [{ ht_bib_key: '005893467', access: 'allow' }] end end end