Skip to content

Commit

Permalink
Fix last scraped date for FA
Browse files Browse the repository at this point in the history
  • Loading branch information
Earlopain committed Jan 28, 2024
1 parent 332d961 commit 4f7640b
Show file tree
Hide file tree
Showing 5 changed files with 19 additions and 6 deletions.
6 changes: 3 additions & 3 deletions app/jobs/scrape_artist_url_job.rb
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@ def perform(artist_url) # rubocop:disable Metrics/CyclomaticComplexity

artist_url.update(scraper_status: artist_url.scraper_status.merge(scraper.class.state => scraper.state_value))

stop_marker = artist_url.last_scraped_at
break if stop_marker.present? && submissions.any? { |submission| submission.created_at.before? stop_marker }
break if submissions.any? { |submission| artist_url.scraper_stop_marker&.after?(submission.created_at) }
end
artist_url.last_scraped_at = scraper.cutoff_timestamp
artist_url.last_scraped_at = artist_url.scraper_status["started_at"]
artist_url.scraper_stop_marker = scraper.new_stop_marker
artist_url.scraper_status = {}
artist_url.save
end
Expand Down
2 changes: 1 addition & 1 deletion app/logical/scraper/base.rb
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def jumpstart(value)
# Which date should already be considered scraped? Normally it is fine to set this to
# the time the scrape started, but some sites may exhibit a delay with indexing, resulting
# in images being missed. See https://github.com/Earlopain/reverser/issues/113
def cutoff_timestamp
def new_stop_marker
@artist_url.scraper_status["started_at"]
end

Expand Down
2 changes: 1 addition & 1 deletion app/logical/scraper/furaffinity.rb
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def fetch_api_identifier
url_identifier
end

def cutoff_timestamp
def new_stop_marker
query = SubmissionFile.joins(artist_submission: :artist_url).where(artist_submission: { artist_url: @artist_url })
query.order(created_at_on_site: :desc).pick(:created_at_on_site)
end
Expand Down
12 changes: 12 additions & 0 deletions db/migrate/20240128115103_add_artist_url_scraper_stop_marker.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# frozen_string_literal: true

# Introduces the +scraper_stop_marker+ datetime column on +artist_urls+ and
# seeds it from each row's current +last_scraped_at+ value.
class AddArtistUrlScraperStopMarker < ActiveRecord::Migration[7.1]
  # Adds the column first, then copies last_scraped_at into it for every row.
  def up
    add_column(:artist_urls, :scraper_stop_marker, :datetime)
    execute("update artist_urls set scraper_stop_marker = last_scraped_at")
  end

  # Removing the column discards the copied values with it, so there is no
  # separate data step on the way down.
  def down
    remove_column(:artist_urls, :scraper_stop_marker, :datetime)
  end
end
3 changes: 2 additions & 1 deletion db/schema.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
#
# It's strongly recommended that you check this file into your version control system.

ActiveRecord::Schema[7.1].define(version: 2024_01_12_091112) do
ActiveRecord::Schema[7.1].define(version: 2024_01_28_115103) do
# These are extensions that must be enabled in order to support this database
enable_extension "pgcrypto"
enable_extension "plpgsql"
Expand Down Expand Up @@ -67,6 +67,7 @@
t.text "api_identifier"
t.integer "site_type", null: false
t.jsonb "scraper_status", default: {}, null: false
t.datetime "scraper_stop_marker"
t.index "site_type, lower(url_identifier)", name: "index_artist_urls_on_site_and_url_identifier", unique: true
t.index ["artist_id"], name: "index_artist_urls_on_artist_id"
t.index ["site_type", "api_identifier"], name: "index_site_type_on_api_identifier", unique: true
Expand Down

0 comments on commit 4f7640b

Please sign in to comment.