Skip to content

Commit

Permalink
Ingesting HathiTrust overlap data (#215)
Browse files Browse the repository at this point in the history
* add ht_bib_key_ssim for multi/serial type hathi records and update access_online facet
* hathi data as struct, adding access info as well
* adds hathi etas on/off in settings
  • Loading branch information
Banu Hapeloglu Kutlu authored Jun 24, 2020
1 parent 3d638a5 commit 52d569b
Show file tree
Hide file tree
Showing 15 changed files with 167 additions and 51 deletions.
6 changes: 3 additions & 3 deletions .rubocop_todo.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ Metrics/AbcSize:
# Configuration parameters: CountComments, ExcludedMethods.
# ExcludedMethods: refine
Metrics/BlockLength:
Max: 179
Max: 206

# Offense count: 2
# Configuration parameters: CountComments.
Expand All @@ -30,12 +30,12 @@ Metrics/CyclomaticComplexity:
# Offense count: 15
# Configuration parameters: CountComments, ExcludedMethods.
Metrics/MethodLength:
Max: 21
Max: 23

# Offense count: 1
# Configuration parameters: CountComments.
Metrics/ModuleLength:
Max: 156
Max: 162

# Offense count: 1
# Configuration parameters: MinBodyLength.
Expand Down
10 changes: 7 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,15 +65,19 @@ display the output to the console (and not push the data to Solr).
$ bundle exec traject --debug-mode -c lib/traject/psulib_config.rb solr/sample_data/sample_psucat.mrc
```
## Hathi Trust ETAS data
## HathiTrust ETAS data
To generate a synthesized Hathi Trust overlap report locally, you can run
To generate a synthesized HathiTrust overlap report locally, you can run
```
RUBY_ENVIRONMENT=dev bundle exec rake hathitrust:process_hathi_overlap
```
You will need the overlap file provided by Hathi Trust (`overlap_[date]_psu.tsv`) in your `ignorethis_hathi/` directory.
You will need the overlap file provided by HathiTrust (`overlap_[date]_psu.tsv`) and `hathi_field_list.csv` with the content
`oclc_num,htid,ht_bib_key,access`
in your `ignorethis_hathi/` directory (or whatever directory path you set in `hathi_overlap_path`).
Also make sure the below settings in your `indexer_settings_dev.yml` file are set with the correct info:
Expand Down
1 change: 0 additions & 1 deletion config/indexer_settings_dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,5 @@

processing_thread_pool: 5
hathi_overlap_path: ignorethis_hathi/
hathi_overlap_file: final_hathi_overlap.csv
hathi_load_period: 20200501
overlap_file: overlap_20200518_psu.tsv
3 changes: 1 addition & 2 deletions config/indexer_settings_test.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
---

processing_thread_pool: 5
hathi_overlap_path: spec/fixtures/hathitrust/
hathi_overlap_file: mock_overlap.csv
hathi_overlap_path: spec/fixtures/hathitrust/
10 changes: 9 additions & 1 deletion lib/marc_access_facet_processor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def extract_access_data(record, context)
end
end

access << 'Online' if context.output_hash&.dig('ht_id_ssim')
access << 'Online' if hathi_access? context
access.compact!
access.uniq!
access.delete 'On Order' if not_only_on_order? access
Expand All @@ -43,4 +43,12 @@ def resolve_library_code(field, library)

'On Order'
end

# Tells whether the HathiTrust data already mapped for this record grants online access.
#
# Reads 'hathitrust_struct' from the context's output hash and the 'hathi_etas' setting.
# Without a struct the answer is always false. With a struct, access is granted when the
# ETAS setting is truthy, or when the first struct entry (a JSON string) has "access": "allow".
#
# @param context [#output_hash, #settings] the mapping context for the current record
# @return [Boolean, Object] false when there is no HathiTrust data; otherwise the truthy
#   ETAS setting itself, or the result of the 'allow' comparison
def hathi_access?(context)
  ht_entries = context.output_hash&.dig('hathitrust_struct')
  etas_enabled = context.settings['hathi_etas']

  if ht_entries
    etas_enabled || JSON.parse(ht_entries.first)['access'] == 'allow'
  else
    false
  end
end
end
39 changes: 26 additions & 13 deletions lib/tasks/hathi_etas.rake → lib/tasks/hathi_trust.rake
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,16 @@

namespace :hathitrust do
desc 'Process overlap file for emergency access to restricted HathiTrust material'
task :process_hathi_etas do
task :process_hathi_overlap do
indexer_settings = YAML.load_file("config/indexer_settings_#{ENV['RUBY_ENVIRONMENT']}.yml")
period = indexer_settings['hathi_load_period']

Dir.chdir(indexer_settings['hathi_overlap_path']) do
Rake::Task['hathitrust:load_hathi_full'].invoke(period)
Rake::Task['hathitrust:pare_hathi_full'].invoke(period)
Rake::Task['hathitrust:extract_overlap_oclc'].invoke(indexer_settings['overlap_file'])
Rake::Task['hathitrust:split_overlap_oclc'].invoke
Rake::Task['hathitrust:filter_overlap'].invoke
Rake::Task['hathitrust:extract_uniq_oclc'].invoke
end
end

Expand All @@ -20,29 +20,42 @@ namespace :hathitrust do
print `curl -s -O 'https://www.hathitrust.org/sites/www.hathitrust.org/files/hathifiles/hathi_full_#{args[:period]}.txt.gz'`
end

desc 'Pare the monthly file down to just the needed data: Hathi Trust id and OCLC number'
desc 'Pare the monthly file down to just the needed data: OCLC number, HathiTrust id, HathiTrust bib_key and HathiTurst access code'
task :pare_hathi_full, [:period] do |_task, args|
print `gunzip -c hathi_full_#{args[:period]}.txt.gz | \
csvcut -t -c 1,8 -z 1310720 | \
csvgrep -c 1,2 -r ".+" | \
csvcut -t -c 8,1,4,2 -z 1310720 | \
csvgrep -c 1,2,3,4 -r ".+" | \
sort | uniq > hathi_full_dedupe.csv`

print `cat hathi_field_list.csv hathi_full_dedupe.csv > hathi_full_dedupe_with_headers.csv`
end

desc 'Extract the unique set of OCLC numbers from the overlap report'
desc 'Extract the unique set of OCLC numbers and access code from the overlap report'
task :extract_overlap_oclc, [:overlap_file] do |_task, args|
print `csvgrep -t -c 4 -r ".+" #{args[:overlap_file]} | \
csvcut -c 1 | \
csvcut -c 1,3 | \
sort | uniq > overlap_all_unique.csv`
end

# Splits overlap_all_unique.csv into two single-column lists based on the value in column 2:
# rows matching "mono" go to overlap_mono_unique.csv, all other rows to overlap_multi_unique.csv.
# Column 1 is presumably the OCLC number -- confirm against the overlap report layout.
# Runs in the current working directory and shells out to csvkit (csvgrep/csvcut), sort and uniq.
desc 'Split the overlap report by item_type: mono and multi/serial'
task :split_overlap_oclc do
# -H: treat the input as having no header row; keep rows whose column 2 contains "mono",
# then keep only column 1 and de-duplicate.
print `csvgrep -H -c 2 -m "mono" overlap_all_unique.csv | \
csvcut -c 1 | \
sort | uniq > overlap_mono_unique.csv`

# Same pipeline with -i (invert match): rows whose column 2 does NOT contain "mono".
print `csvgrep -H -c 2 -m "mono" -i overlap_all_unique.csv | \
csvcut -c 1 | \
sort | uniq > overlap_multi_unique.csv`
end

desc 'Filter the pared down HathiTrust data using the overlap OCLC numbers as the filter input'
task :filter_overlap do
print `csvgrep -c 2 -f overlap_all_unique.csv \
hathi_full_dedupe.csv > hathi_filtered_by_overlap.csv`
end
print `csvgrep -c 1 -f overlap_mono_unique.csv hathi_full_dedupe_with_headers.csv | \
csvcut -C 3 | \
sort | uniq > final_hathi_mono_overlap.csv`

desc 'Extract the unique set of OCLC numbers from the filtered data'
task :extract_uniq_oclc do
print `sort -t, -k2 -u hathi_filtered_by_overlap.csv > final_hathi_overlap.csv`
print `csvgrep -c 1 -f overlap_multi_unique.csv hathi_full_dedupe_with_headers.csv | \
csvcut -C 2 | \
sort | uniq > final_hathi_multi_overlap.csv`
end
end
18 changes: 14 additions & 4 deletions lib/traject/macros/custom.rb
Original file line number Diff line number Diff line change
Expand Up @@ -205,12 +205,22 @@ def self.includes_oclc_indicators?(sf_a)
sf_a.include?('OCLC')
end

# Extract ht_id
def extract_ht_id
def extract_hathi_data
lambda do |_record, accumulator, context|
oclc_number = context.output_hash&.dig('oclc_number_ssim')&.first
accumulator << HATHI_ETAS_OVERLAP[oclc_number]
accumulator.compact
ht_hash = HATHI_MULTI_OVERLAP&.dig(oclc_number)&.first || HATHI_MONO_OVERLAP&.dig(oclc_number)&.first
accumulator << ht_hash.to_json unless ht_hash.nil?
end
end

# Load a HathiTrust overlap CSV into a lookup keyed by its first column.
#
# Reads "<hathi_overlap_path>final_hathi_<ht_format>_overlap.csv", where each row is expected
# to hold three values: the grouping key (OCLC number), an identifier and an access code.
# Rows sharing the same first column are grouped together and each row becomes a hash of
# { ht_id | ht_bib_key => identifier, access: access_code }.
#
# When a key has several rows, 'deny' rows are dropped in favour of the others. If every row
# for a key is 'deny' they are all kept: removing them would leave an empty list, and callers
# that take the first entry would then see no HathiTrust data at all for that key.
#
# @param ht_format [String] 'mono' labels identifiers :ht_id; any other value labels them :ht_bib_key
# @return [Hash{String => Array<Hash{Symbol => String}>}] rows grouped by first column
def hathi_to_hash(ht_format)
  hathi_overlap_csv = "#{settings['hathi_overlap_path']}final_hathi_#{ht_format}_overlap.csv"
  # The key name depends only on the format, so compute it once rather than per group.
  ht_key = ht_format == 'mono' ? :ht_id : :ht_bib_key

  CSV.read(hathi_overlap_csv)
     .group_by(&:shift) # shift removes the first column from each row and uses it as the key
     .each_value do |ht_data|
       ht_data.map! { |data| [ht_key, :access].zip(data).to_h }
       # Keep the deny rows when there is nothing better to prefer.
       next if ht_data.all? { |data| data[:access] == 'deny' }

       ht_data.reject! { |data| data[:access] == 'deny' }
     end
end
end
Expand Down
18 changes: 9 additions & 9 deletions lib/traject/psulib_config.rb
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
provide 'reader_class_name', 'Traject::MarcCombiningReader'
provide 'commit_timeout', '10000'
provide 'hathi_overlap_path', indexer_settings['hathi_overlap_path'] || '/data/hathitrust_data/'
provide 'hathi_overlap_file', indexer_settings['hathi_overlap_file'] || 'final_hathi_overlap.csv'
# ETAS stays enabled unless the indexer settings explicitly say `hathi_etas: false`.
# (`value || true` is always truthy, so a configured `false` could never switch it off.)
provide 'hathi_etas', indexer_settings['hathi_etas'] != false

if is_jruby
provide 'marc4j_reader.permissive', true
Expand All @@ -47,12 +47,12 @@
end
end

# This HATHI_ETAS_OVERLAP constant is expected to be a two column csv of htid and oclc numbers. It is
# created by a process described at
# https://github.com/psu-libraries/psulib_blacklight/wiki/Synthesizing-overlap-data-from-HathiTrust
# @todo replace above with rake task
hathi_overlap_csv = "#{settings['hathi_overlap_path']}#{settings['hathi_overlap_file']}"
Traject::Macros::Custom::HATHI_ETAS_OVERLAP = CSV.read(hathi_overlap_csv).map(&:reverse).to_h
# HATHI_MONO_OVERLAP constant is expected to be a three column csv of oclc number, htid and access info.
# HATHI_MULTI_OVERLAP constant is expected to be a three column csv of oclc number, ht_bib_key and access info.
# These are generated by running `rake hathitrust:process_hathi_overlap`.
Traject::Macros::Custom::HATHI_MONO_OVERLAP = hathi_to_hash('mono')
Traject::Macros::Custom::HATHI_MULTI_OVERLAP = hathi_to_hash('multi')

ATOZ = ('a'..'z').to_a.join('')
ATOU = ('a'..'u').to_a.join('')

Expand Down Expand Up @@ -200,8 +200,8 @@
to_field 'author_meeting_display_ssm', extract_marc('111abcdfgklnpqj'), trim_punctuation
to_field 'addl_author_display_ssm', extract_marc('700aqbcdjk:710abcdfgjkln:711abcdfgjklnpq'), trim_punctuation

# Permanent HathiTrust item identifier
to_field 'ht_id_ssim', extract_ht_id
# HathiTrust fields
to_field 'hathitrust_struct', extract_hathi_data

## Access facet
access_facet_processor = MarcAccessFacetProcessor.new
Expand Down
1 change: 1 addition & 0 deletions spec/fixtures/access_hathi_allow.mrc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
00076nam a22000491 4500 0010005000000350021000051234 a(OCoLC)100000391
1 change: 1 addition & 0 deletions spec/fixtures/access_hathi_deny.mrc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
00076cam a22000494a 45000010005000000350021000051234 a(OCoLC)100000499
11 changes: 11 additions & 0 deletions spec/fixtures/hathitrust/final_hathi_mono_overlap.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
1000,wu.89030498562,deny
10000016,uiug.30112117983731,allow
100000499,mdp.39015069374455,deny
100000684,mdp.39015068763393,deny
100000780,mdp.39015069337908,deny
10000143,uva.x000891141,deny
1000021,uc1.b3547182,allow
1000021,12345,deny
1000022826,dul1.ark:/13960/t3rv7mm1q,deny
1000023113,dul1.ark:/13960/t4zh3px08,deny
10000247,uiug.30112010346952,deny
11 changes: 11 additions & 0 deletions spec/fixtures/hathitrust/final_hathi_multi_overlap.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
100000391,012292266,allow
100000393,012292267,allow
100000753,005602659,deny
10000162,001666867,deny
1000061,005893467,allow
1000061,12345,deny
1000075,001139875,deny
10000912,012263522,deny
10002026,009880991,deny
1000266,000938715,deny
1000310,009911637,deny
9 changes: 0 additions & 9 deletions spec/fixtures/hathitrust/mock_overlap.csv

This file was deleted.

32 changes: 32 additions & 0 deletions spec/lib/traject/access_facet_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -46,5 +46,37 @@
result = @indexer.map_record(MARC::Reader.new(File.join(fixture_path, 'access_zremoved.mrc')).to_a.first)
expect(result['access_facet']).to be_nil
end

# With the 'hathi_etas' setting on, both fixtures are expected to be faceted as 'Online',
# whether the matched HathiTrust copy's access is 'allow' or 'deny'.
context 'when hathitrust etas is enabled' do
  before do
    @indexer.settings['hathi_etas'] = true
  end

  it 'produces Online when a record has a hathitrust copy with allow access' do
    result = @indexer.map_record(MARC::Reader.new(File.join(fixture_path, 'access_hathi_allow.mrc')).to_a.first)
    expect(result['access_facet']).to contain_exactly 'Online'
  end

  # Title corrected: this example asserts 'Online', not an empty facet.
  it 'produces Online when a record has a hathitrust copy with deny access' do
    result = @indexer.map_record(MARC::Reader.new(File.join(fixture_path, 'access_hathi_deny.mrc')).to_a.first)
    expect(result['access_facet']).to contain_exactly 'Online'
  end
end

# With the 'hathi_etas' setting off, the allow fixture is still faceted as 'Online'
# while the deny fixture yields no access facet at all.
context 'when hathitrust etas is not enabled' do
  before { @indexer.settings['hathi_etas'] = false }

  it 'produces Online when a record has a hathitrust copy with allow access' do
    marc_path = File.join(fixture_path, 'access_hathi_allow.mrc')
    record = MARC::Reader.new(marc_path).to_a.first
    result = @indexer.map_record(record)

    expect(result['access_facet']).to contain_exactly 'Online'
  end

  it 'empty when record has a hathitrust copy with deny access' do
    marc_path = File.join(fixture_path, 'access_hathi_deny.mrc')
    record = MARC::Reader.new(marc_path).to_a.first
    result = @indexer.map_record(record)

    expect(result['access_facet']).to be_nil
  end
end
end
end
48 changes: 42 additions & 6 deletions spec/lib/traject/macros/custom_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -192,22 +192,58 @@
end
end

describe '#extract_ht_id' do
describe '#extract_hathi_data' do
let(:result) { @indexer.map_record(MARC::Record.new_from_hash('fields' => [oclc], 'leader' => leader)) }

context 'when a record does not have a match in the overlap report' do
context 'when a record does not have a match in the overlap reports' do
let(:oclc) { { '035' => { 'ind1' => '', 'ind2' => '', 'subfields' => [{ 'a' => '(OCLC)99999999' }] } } }

it 'does not produce an ht_id' do
expect(result['ht_id_ssim']).to be_nil
it 'does not produce a hathitrust_struct' do
expect(result['hathitrust_struct']).to be_nil
end
end

context 'when a record has a match in the overlap report' do
context 'when a record has a match in the overlap mono report' do
let(:oclc) { { '035' => { 'ind1' => '', 'ind2' => '', 'subfields' => [{ 'a' => '(OCLC)100000499' }] } } }

it 'does maps the ht_id' do
expect(result['hathitrust_struct']).to match ['{"ht_id":"mdp.39015069374455","access":"deny"}']
end
end

context 'when a record has a match in the overlap multi report' do
let(:oclc) { { '035' => { 'ind1' => '', 'ind2' => '', 'subfields' => [{ 'a' => '(OCLC)100000391' }] } } }

it 'does maps the ht_id' do
expect(result['ht_id_ssim']).to eq(['pst.000019102375'])
expect(result['hathitrust_struct']).to match ['{"ht_bib_key":"012292266","access":"allow"}']
end
end
end

# Exercises hathi_to_hash against the mono and multi overlap fixtures: each key maps to an
# array of hashes, and an 'allow' row wins over a 'deny' row for the same OCLC number.
describe '#hathi_to_hash' do
  let(:result) { @indexer.hathi_to_hash(ht_format) }

  context 'for hathi records with mono item type' do
    let(:ht_format) { 'mono' }

    it 'produces hathi data' do
      expect(result['1000']).to match [{ ht_id: 'wu.89030498562', access: 'deny' }]
    end

    it 'prefers the record with allow access if there are multiple copies with same oclc' do
      expect(result['1000021']).to match [{ ht_id: 'uc1.b3547182', access: 'allow' }]
    end
  end

  # Title corrected: this context passes 'multi', so it covers the multi/serial fixture.
  context 'for hathi records with multi/serial item type' do
    let(:ht_format) { 'multi' }

    it 'produces hathi data correctly' do
      expect(result['100000391']).to match [{ ht_bib_key: '012292266', access: 'allow' }]
    end

    it 'prefers the record with allow access if there are multiple copies with same oclc' do
      expect(result['1000061']).to match [{ ht_bib_key: '005893467', access: 'allow' }]
    end
  end
end
Expand Down

0 comments on commit 52d569b

Please sign in to comment.