diff --git a/bin/bash_session.sh b/bin/bash_session.sh
new file mode 100644
index 0000000..93d5ec9
--- /dev/null
+++ b/bin/bash_session.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+# Opens up a hathifiles bash session.
+docker-compose up -d pushgateway
+docker-compose run --rm "hf" bash
+# Now do e.g. `bundle exec rspec` or whatever.
+# Exit to be done with the session.
+docker-compose down
+# Careful: prune works on the whole Docker host (stopped containers,
+# unused networks, dangling images, build cache), not just this project.
+# --force answers the confirmation prompt, like `yes |` did.
+docker system prune --force
diff --git a/bin/rights_change.sh b/bin/rights_change.sh
new file mode 100644
index 0000000..88489fa
--- /dev/null
+++ b/bin/rights_change.sh
@@ -0,0 +1,105 @@
+#!/bin/bash
+
+# Compare 2 hathifiles and report which items have changed rights
+# from ic/bib in file1 to pd/bib in file2.
+#
+# Invoke thusly:
+# $ bash rights_change.sh f1 f2
+# Results end up in ic_to_pd_YYYYMMDD.tsv in the current directory.
+# Each record in the output is presumed to have changed from ic
+# to pd between the generation of the 2 files.
+# Script starts at the bottom.
+
+# Stop at the first failing command or pipeline stage (and on unset
+# variables) instead of quietly writing an incomplete report.
+set -euo pipefail
+
+# Print a message on stderr and exit with status 1.
+die(){
+    printf '%s\n' "$*" >&2
+    exit 1
+}
+
+# Build the report.
+# Arguments: $1 - hathifile to take the ic/bib records from
+#            $2 - hathifile to take the pd/bib records from
+# Outputs:   progress messages on stdout, and the report file
+#            ic_to_pd_YYYYMMDD.tsv in the current directory.
+run(){
+    [[ $# -eq 2 ]] || die "Usage: bash rights_change.sh f1 f2"
+    local f1=$1
+    local f2=$2
+    local f
+    for f in "$f1" "$f2"; do
+        [[ -r "$f" ]] || die "Cannot read input file: $f"
+    done
+    echo "Started"
+    # Intermediate files live in a private temp dir, so nothing in the
+    # current directory is clobbered, and they are removed on exit
+    # even when a step fails.
+    local tmpdir
+    tmpdir=$(mktemp -d) || die "Could not create a temp dir"
+    # Expand $tmpdir now; the local is out of scope when EXIT fires.
+    # shellcheck disable=SC2064
+    trap "rm -rf -- '${tmpdir}'" EXIT
+    local ic_tsv="${tmpdir}/cut_sort_ic.tsv"
+    local pd_tsv="${tmpdir}/cut_sort_pd.tsv"
+    # First, simplify input hathifiles to the data we want.
+    # All the ic/bib records from f1 into one file...
+    cut_sort "$f1" ic > "$ic_tsv"
+    # And all the pd/bib records from f2 into another file.
+    cut_sort "$f2" pd > "$pd_tsv"
+    # Then compare the 2 simplified files.
+    local isodate outfile
+    isodate=$(date +'%Y%m%d')
+    outfile="$(pwd)/ic_to_pd_${isodate}.tsv"
+    diff_records "$ic_tsv" "$pd_tsv" > "$outfile"
+    echo "Wrote $outfile"
+    echo "Finished"
+}
+
+# Turn a hathifile into fewer cols and sorted matching lines.
+# Matching means: has rights:$rights and reason:bib
+# Arguments: $1 - hathifile, $2 - rights code to keep (e.g. ic)
+# Outputs:   tab-separated id, rights, reason, govdoc on stdout.
+cut_sort(){
+    local file=$1
+    local rights=$2
+    # Keep the lines where col 3 (rights) is $rights, col 14 (reason)
+    # is bib and col 16 (govdoc) is 0 or 1, print cols 1 (id), 3, 14
+    # and 16 of those, and sort the output.
+    # awk compares $rights as a plain string and, unlike grep, does not
+    # exit non-zero on "no match", which would trip pipefail.
+    zcat -f -- "$file" |
+        awk -F'\t' -v rights="$rights" '
+            BEGIN { OFS = "\t" }
+            $3 == rights && $14 == "bib" && ($16 == "0" || $16 == "1") {
+                print $1, $3, $14, $16
+            }' |
+        collated_sort
+}
+
+# Compare 2 outputs from cut_sort, but only look at col 1 (id) and
+# col 4 (govdoc), meaning we will only output records that have the
+# same id + govdoc values in both files, meaning each output record
+# changed from ic to pd but kept the same govdoc status.
+diff_records(){
+    local ic_file=$1
+    local pd_file=$2
+    collated_comm -12 <(cut -f1,4 "$ic_file") <(cut -f1,4 "$pd_file")
+}
+
+# Sort and comm must use the same collation,
+# or comm won't think the files are sorted...
+# and the defaults may be different, so specify.
+# LC_ALL rather than LC_COLLATE: if LC_ALL is set in the caller's
+# environment it takes precedence over LC_COLLATE.
+collated_comm(){
+    LC_ALL=C comm "$@"
+}
+collated_sort(){
+    LC_ALL=C sort "$@"
+}
+
+# Script starts here.
+run "$@"
diff --git a/spec/data/rights_change_file_1.txt b/spec/data/rights_change_file_1.txt
new file mode 100644
index 0000000..e3bbc7c
--- /dev/null
+++ b/spec/data/rights_change_file_1.txt
@@ -0,0 +1,2 @@
+mdp.39015027625402 deny ic 000018677 MIU 990000186770106381 1613293 66014593 Go up for glory, by Bill Russell, as told to William McSweeny. Coward-McCann [1966] bib 0 1966 eng BK MIU umich umich google Russell, Bill, 1934-2022.
+mdp.39015003746396 deny ic 000018677 MIU 990000186770106381 1613293 66014593 Go up for glory, by Bill Russell, as told to William McSweeny. Coward-McCann [1966] bib 0 1966 eng BK MIU umich umich google Russell, Bill, 1934-2022.
diff --git a/spec/data/rights_change_file_2.txt b/spec/data/rights_change_file_2.txt
new file mode 100644
index 0000000..d089032
--- /dev/null
+++ b/spec/data/rights_change_file_2.txt
@@ -0,0 +1,2 @@
+mdp.39015027625402 deny ic 000018677 MIU 990000186770106381 1613293 66014593 Go up for glory, by Bill Russell, as told to William McSweeny. Coward-McCann [1966] bib 0 1966 eng BK MIU umich umich google Russell, Bill, 1934-2022.
+mdp.39015003746396 allow pd 000018677 MIU 990000186770106381 1613293 66014593 Go up for glory, by Bill Russell, as told to William McSweeny. Coward-McCann [1966] bib 0 1966 eng BK MIU umich umich google Russell, Bill, 1934-2022.
diff --git a/spec/jobs/rights_change.rb b/spec/jobs/rights_change.rb
new file mode 100644
index 0000000..f1a853b
--- /dev/null
+++ b/spec/jobs/rights_change.rb
@@ -0,0 +1,33 @@
+# frozen_string_literal: true
+
+require "tmpdir"
+
+RSpec.describe "bin/rights_change.sh" do
+  it "writes the expected report file" do
+    # Setup: the script writes its report into the current directory,
+    # so run it in a throwaway dir. Only the child process changes
+    # directory (chdir:), so the rest of the suite is unaffected, and
+    # the dir is removed even when an expectation fails.
+    Dir.mktmpdir do |dir|
+      isodate = Time.now.strftime("%Y%m%d")
+      # Expect this outfile
+      outfile = File.join(dir, "ic_to_pd_#{isodate}.tsv")
+      expect(File.exist?(outfile)).to be false
+      # Now do it (args passed separately, so no shell is involved).
+      ran_ok = system(
+        "bash",
+        "/usr/src/app/bin/rights_change.sh",
+        "/usr/src/app/spec/data/rights_change_file_1.txt",
+        "/usr/src/app/spec/data/rights_change_file_2.txt",
+        chdir: dir
+      )
+      expect(ran_ok).to be true
+      # Expect a file with a single line...
+      expect(File.exist?(outfile)).to be true
+      lines = File.read(outfile).split("\n")
+      expect(lines.count).to eq 1
+      # and that single line looks like this:
+      expect(lines).to eq ["mdp.39015003746396\t0"]
+    end
+  end
+end