Skip to content

Commit

Permalink
DEV-1086: summarize deletion reports
Browse files Browse the repository at this point in the history
- dedupes output
- omits anything that is in the target dataset at time of report
  • Loading branch information
aelkiss committed Apr 24, 2024
1 parent 3009ed4 commit b1788e8
Show file tree
Hide file tree
Showing 9 changed files with 94 additions and 8 deletions.
15 changes: 12 additions & 3 deletions config/hathitrust_config.rb
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,15 @@ def src_path_resolver
}
end

# Maps every dataset profile to the path resolver for its destination tree.
# Each subset resolves into its own destination directory; both the superset
# and force_superset profiles resolve into the superset's destination
# directory (force_superset writes into the same tree as superset).
def dest_path_resolver
  resolvers = {}
  subsets.each do |profile|
    resolvers[profile] = PairtreePathResolver.new(dest_parent_dir[profile])
  end
  resolvers[superset] = PairtreePathResolver.new(dest_parent_dir[superset])
  resolvers[force_superset] = PairtreePathResolver.new(dest_parent_dir[superset])
  resolvers
end

def volume_repo
@volume_repo ||=
subsets.map do |subset|
Expand Down Expand Up @@ -85,15 +94,15 @@ def force_superset_volume_repo
# Builds the writer used to link subset volumes into place for the given
# profile, wired to that profile's destination path resolver.
def subset_volume_writer(profile)
  linker_options = {
    id: profile,
    dest_path_resolver: dest_path_resolver[profile],
    fs: Filesystem.new
  }
  VolumeLinker.new(**linker_options)
end

def superset_volume_writer
VolumeCreator.new(
id: superset,
dest_path_resolver: PairtreePathResolver.new(dest_parent_dir[superset]),
dest_path_resolver: dest_path_resolver[superset],
writer: ZipWriter.new,
fs: Filesystem.new
)
Expand All @@ -102,7 +111,7 @@ def superset_volume_writer
def force_superset_volume_writer
ForceVolumeCreator.new(
id: force_superset,
dest_path_resolver: PairtreePathResolver.new(dest_parent_dir[superset]),
dest_path_resolver: dest_path_resolver[force_superset],
writer: ZipWriter.new,
fs: Filesystem.new
)
Expand Down
32 changes: 32 additions & 0 deletions lib/datasets/dedupe_delete_log.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
module Datasets
  # Combines one or more deletion-report files for a dataset profile into a
  # single de-duplicated list of volume identifiers, omitting any volume that
  # is still present in that profile's destination dataset tree at the time
  # the report is compiled.
  class DedupeDeleteLog
    attr_reader :profile
    attr_reader :files

    # profile: dataset profile key (e.g. :pd) used to look up the destination
    #   path resolver from the global configuration.
    # files: array of paths to deletion-report files; each line is expected to
    #   be a "namespace.id" volume identifier.
    def initialize(profile:, files:)
      @profile = profile
      @files = files
      @path_resolver = Datasets.config.dest_path_resolver[profile]
    end

    # Sorts and de-duplicates the concatenation of all input files, then
    # yields the resulting identifiers that are no longer present in the
    # dataset. The merged scratch file is removed when the block returns.
    def compile_results
      Tempfile.create("dedupe-deletes") do |f|
        f.close
        # Argument-vector form avoids interpolating file names into a shell
        # command line (the original string form had a corrupted redirect
        # target and was vulnerable to shell-special characters in paths).
        # "sort -u" is equivalent to "sort | uniq".
        system("sort", "-u", *files, out: f.path)
        # File.readlines closes the handle, unlike the previous bare
        # File.open whose handle was never closed.
        ids = File.readlines(f.path).map(&:strip)
        yield ids.select { |id| not_in_dataset(id) }
      end
    end

    # True when the volume named by "namespace.id" has no file at its
    # resolved destination path (i.e. it is absent from the dataset).
    def not_in_dataset(id)
      (namespace, id) = id.split(".", 2)
      volume = Volume.new(namespace: namespace, id: id, access_profile: :none, right: :none)
      !File.exist?(path_resolver.path(volume))
    end

    private

    attr_reader :path_resolver
  end
end
49 changes: 49 additions & 0 deletions spec/dedupe_deletes_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
require_relative "spec_helper"
require "dedupe_delete_log"

module Datasets
  RSpec.describe DedupeDeleteLog do
    # needs the dataset paths there
    include_context "integration" do
      let(:profile) { :pd }

      it "takes a profile and an array of files as input" do
        expect(DedupeDeleteLog.new(profile: profile, files: ["foo", "bar"])).not_to be_nil
      end

      # Two input files share "test.id2"; the compiled result should list it
      # only once alongside the unique identifiers from each file.
      it "outputs each item at most once" do
        files = Array.new(2) { Tempfile.create("dedupe-deletes") }
        begin
          files[0].puts("test.id1", "test.id2")
          files[1].puts("test.id3", "test.id2")
          files.map(&:close)

          DedupeDeleteLog.new(profile: profile, files: files.map(&:path)).compile_results do |results|
            expect(results).to contain_exactly("test.id1", "test.id2", "test.id3")
          end
        ensure
          # Tempfile.create does not auto-delete; clean up both scratch files.
          files.map { |f| File.unlink(f) }
        end
      end

      # Writes "test.still_there" into the dataset through the configured
      # volume writer, then expects only the absent volume in the results.
      it "only outputs deletes that aren't present in the current dataset" do
        Tempfile.create("dedupe-deletes") do |f|
          f.puts("test.still_there", "test.not_there")
          f.close

          volume = Volume.new(namespace: "test", id: "still_there", access_profile: :open, right: :pd)
          writer = Datasets.config.volume_writer[profile]
          src_path_resolver = Datasets.config.src_path_resolver[profile]
          src_path = src_path_resolver.path(volume)
          src_path.parent.mkpath
          FileUtils.touch(src_path)
          writer.save(volume, src_path)

          DedupeDeleteLog.new(profile: profile, files: [f.path]).compile_results do |results|
            expect(results).to contain_exactly("test.not_there")
          end
        end
      end
    end
  end
end
1 change: 0 additions & 1 deletion spec/integration/force_update_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
require "spec_helper"
require "job_helper"
require "datasets"
require_relative "../../config/hathitrust_config"
require "yaml"
require "fileutils"

Expand Down
1 change: 0 additions & 1 deletion spec/integration/subset_creation_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
require "spec_helper"
require "job_helper"
require "datasets"
require_relative "../../config/hathitrust_config"
require "yaml"
require "fileutils"

Expand Down
1 change: 0 additions & 1 deletion spec/integration/subset_deletion_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
require "spec_helper"
require "job_helper"
require "datasets"
require_relative "../../config/hathitrust_config"
require "yaml"
require "fileutils"

Expand Down
1 change: 0 additions & 1 deletion spec/integration/superset_creation_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
require "spec_helper"
require "job_helper"
require "datasets"
require_relative "../../config/hathitrust_config"
require "yaml"
require "fileutils"
require "date"
Expand Down
1 change: 0 additions & 1 deletion spec/integration/superset_deletion_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
require "spec_helper"
require "job_helper"
require "datasets"
require_relative "../../config/hathitrust_config"
require "yaml"
require "fileutils"

Expand Down
1 change: 1 addition & 0 deletions spec/support/contexts/integration.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
require "sequel"
require "pathname"
require "timecop"
require_relative "../../../config/hathitrust_config"
require_relative "../schema_builder"

# Performs setup and teardown for integration tests.
Expand Down

0 comments on commit b1788e8

Please sign in to comment.