Skip to content

Commit

Permalink
Merge pull request #766 from pulibrary/i763_limit_recap_updates
Browse files Browse the repository at this point in the history
Introduce metering for submit collection
  • Loading branch information
kevinreiss authored Apr 19, 2024
2 parents 139d810 + e8f1ee3 commit 1570dc1
Show file tree
Hide file tree
Showing 5 changed files with 71 additions and 1 deletion.
24 changes: 23 additions & 1 deletion app/models/alma_submit_collection/alma_recap_file_list.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,15 @@ def initialize(input_sftp_base_dir: Rails.application.config.alma_sftp.alma_reca
@file_pattern = file_pattern
@alma_sftp = alma_sftp
@sftp_locations = []
@files = files || compile_file_list
@files = if Flipflop.meter_files_sent_to_recap?
files || compile_metered_file_list
else
files || compile_file_list
end
end

# @return [Array<StringIO>]
# Won't this be the contents of *all* the decompressed files? Maybe the source of our memory issues?
def file_contents
@file_contents ||= @files.map { |filename| download_and_decompress_file(filename) }
.flatten
Expand All @@ -38,6 +43,23 @@ def mark_files_as_processed
end
end

# Builds the list of remote file names to send to ReCAP when metering is
# enabled, capped at the configured maximum, preferring the oldest files.
#
# When Alma produces an extremely large publishing run, ReCAP / SCSB can be
# overwhelmed, so we submit at most
# Rails.application.config.alma_sftp.max_files_for_recap files per run;
# remaining files stay on the SFTP server and are picked up on later runs.
#
# Entries that do not match @file_pattern, or that are empty, are filtered
# out *before* the cap is applied. (Filtering after `take`, as originally
# written, would let stale, empty, or irrelevant old files occupy capped
# slots on every run and silently shrink the number of valid files sent.)
#
# @return [Array<String>] at most max_files_for_recap file names, oldest
#   first by modification time
def compile_metered_file_list
  all_matching_files = []
  @alma_sftp.start do |sftp|
    all_matching_files = sftp.dir.glob(@input_sftp_base_dir, '*.tar.gz')
  end
  valid_files = all_matching_files.select do |entry|
    # rubocop:disable Style/ZeroLengthPredicate -- entry.attributes is a Net::SFTP::Protocol::V01::Attributes, not an array
    /#{@file_pattern}/.match?(entry.name) && !entry.attributes.size.zero?
    # rubocop:enable Style/ZeroLengthPredicate
  end
  valid_files.sort_by { |entry| entry.attributes.mtime }
             .take(Rails.application.config.alma_sftp.max_files_for_recap)
             .map(&:name)
end

private

def compile_file_list
Expand Down
3 changes: 3 additions & 0 deletions config/alma_sftp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,16 @@ default: &default
renew_report_path: <%= ENV["ALMA_RENEW_INPUT_DIR"] || '/alma/scsb_renewals' %>
bursar_report_path: <%= ENV["ALMA_BURSAR_INPUT_DIR"] || '/alma/bursar' %>
alma_recap_output_path: <%= ENV["ALMA_RECAP_OUTPUT_DIR"] || '/alma/recap' %>
max_files_for_recap: 25

development:
<<: *default

test:
<<: *default
invoice_status_local_path: 'spec/fixtures/ephemeral'
max_files_for_recap: 3

staging: &staging
<<: *default

Expand Down
4 changes: 4 additions & 0 deletions config/features.rb
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,8 @@
feature :alma_person_ineligible,
default: false,
description: "Generate the ineligible user report when the person feed runs"

feature :meter_files_sent_to_recap,
default: false,
description: "Send only the configured number of files to recap"
end
4 changes: 4 additions & 0 deletions docs/alma_submitcollection/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,7 @@ sequenceDiagram
Lib Jobs->>+Lib Jobs: Store the final standard and boundwith files on disk for 1 month
Lib Jobs->>+S3 bucket: Upload all standard and boundwith files
```

### Metering
When we have an extremely large data dump from Alma, RECAP / SCSB has a hard time keeping up, and we can overwhelm their system (see [lib-jobs issue](https://github.com/pulibrary/lib_jobs/issues/765)). In response,
we have added metering, so that we process only a configured number of the oldest files (the limit is set in `config/alma_sftp.yml`). This behavior is also behind a feature flipper, so we can turn it on and off as needed.
37 changes: 37 additions & 0 deletions spec/models/alma_submit_collection/alma_recap_file_list_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,43 @@
allow(sftp_file_factory).to receive(:open).and_return StringIO.new
end

# Exercises the Flipflop-gated metering path (compile_metered_file_list):
# when more *.tar.gz files are waiting on the SFTP server than the configured
# cap (max_files_for_recap is 3 in the test environment; see
# config/alma_sftp.yml), only the oldest files should be selected.
context 'metering files sent to recap' do
  before do
    allow(Flipflop).to receive(:meter_files_sent_to_recap?).and_return(true)
  end

  context 'with more files than recap can process' do
    let(:sftp_entry1) { instance_double("Net::SFTP::Protocol::V01::Name", name: 'incremental_recap_25908087650006421_20230420_160408[001]_new.xml.tar.gz') }
    let(:sftp_entry2) { instance_double("Net::SFTP::Protocol::V01::Name", name: 'incremental_recap_25908087650006421_20230420_160408[002]_new.xml.tar.gz') }
    let(:sftp_entry3) { instance_double("Net::SFTP::Protocol::V01::Name", name: 'incremental_recap_25908087650006421_20230420_160408[003]_new.xml.tar.gz') }
    let(:sftp_entry4) { instance_double("Net::SFTP::Protocol::V01::Name", name: 'incremental_recap_25908087650006421_20230420_160408[004]_new.xml.tar.gz') }
    let(:sftp_entry5) { instance_double("Net::SFTP::Protocol::V01::Name", name: 'incremental_recap_25908087650006421_20230420_160408[005]_new.xml.tar.gz') }
    # Ages are in seconds from epoch
    let(:age_1_oldest) { 1_713_400_100 }
    let(:age_2) { 1_713_400_200 }
    let(:age_3) { 1_713_400_300 }
    let(:age_4) { 1_713_400_400 }
    let(:age_5_newest) { 1_713_400_500 }

    before do
      allow(sftp_dir).to receive(:foreach).and_yield(sftp_entry1).and_yield(sftp_entry2)
                                          .and_yield(sftp_entry3).and_yield(sftp_entry4).and_yield(sftp_entry5)
      [sftp_entry1, sftp_entry2, sftp_entry3, sftp_entry4, sftp_entry5].each do |entry|
        allow(entry).to receive(:attributes).and_return(file_attributes)
      end
      allow(sftp_dir).to receive(:glob).and_return [sftp_entry1, sftp_entry2, sftp_entry3, sftp_entry4, sftp_entry5]
      # NOTE(review): all five entries share the single file_attributes double,
      # so this sequence is consumed across successive mtime calls in glob
      # order: entry1->oldest, entry2->newest, entry3->age_2, entry4->age_4,
      # entry5->age_3. The three oldest are therefore entries 1, 3, and 5.
      # This stubbing is fragile — it assumes the code under test calls mtime
      # exactly once per entry, in array order. Consider one attributes double
      # per entry if the implementation changes.
      allow(file_attributes).to receive(:mtime).and_return(age_1_oldest, age_5_newest, age_2, age_4, age_3)
    end
    it 'does not process more than the configured number of files' do
      expect(alma_recap_file_list.files.count).to eq(3)
    end

    it 'processes only the oldest files' do
      expect(alma_recap_file_list.files).to match_array([sftp_entry1.name, sftp_entry3.name, sftp_entry5.name])
    end
  end
end

context 'if the file is valid' do
it "downloads the file" do
expect(alma_recap_file_list.files.count).to eq(1)
Expand Down

0 comments on commit 1570dc1

Please sign in to comment.