Skip to content

Commit

Permalink
fix odissei scraper bug for new type of date formatting
Browse files Browse the repository at this point in the history
  • Loading branch information
mikesndrs committed Sep 13, 2024
1 parent fbb3166 commit 8e7e5b7
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 50 deletions.
25 changes: 14 additions & 11 deletions lib/ingestors/odissei_ingestor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -25,26 +25,24 @@ def read(url)

private

def process_odissei(url)
def process_odissei(_url)
odissei_url = 'https://odissei-data.nl/calendar/'

workshop_title_list = []
workshop_url_list = []
event_page = Nokogiri::HTML5.parse(open_url(odissei_url.to_s, raise: true)).css("div[class='tribe-events-calendar-list']").first.css("div[class='tribe-common-g-row tribe-events-calendar-list__event-row']")
event_page = Nokogiri::HTML5.parse(open_url(odissei_url.to_s,
raise: true)).css("div[class='tribe-events-calendar-list']").first.css("div[class='tribe-common-g-row tribe-events-calendar-list__event-row']")
event_page.each do |event_section|
event = OpenStruct.new
el = event_section.css("div[class='tribe-events-calendar-list__event-details tribe-common-g-col']").first
event.title = el.css("a[class='tribe-events-calendar-list__event-title-link tribe-common-anchor-thin']").first.text.gsub("\n", ' ').gsub("\t", '')
event.url = el.css("a[class='tribe-events-calendar-list__event-title-link tribe-common-anchor-thin']").first.get_attribute('href')
event.description = el.css("div[class='tribe-events-calendar-list__event-description tribe-common-b2 tribe-common-a11y-hidden']").first.css('p').first.text
unless (Rails.env.test? and File.exist?('test/vcr_cassettes/ingestors/odissei.yml'))
sleep(1)
end
sleep(1) unless Rails.env.test? and File.exist?('test/vcr_cassettes/ingestors/odissei.yml')
el = Nokogiri::HTML5.parse(open_url(event.url.to_s, raise: true)).css("div[id='tribe-events-content']").first
venue_css = el&.css("div[class='tribe-events-meta-group tribe-events-meta-group-venue']")&.first&.css("dl")
if !venue_css
next
end
venue_css = el&.css("div[class='tribe-events-meta-group tribe-events-meta-group-venue']")&.first&.css('dl')
next unless venue_css

event.venue = recursive_description_func(venue_css).gsub("\n", ' ').gsub("\t", '')
times = scrape_start_and_end_time(el.css("div[class='tribe-events-meta-group tribe-events-meta-group-details']").first)
event.start = times[0]
Expand Down Expand Up @@ -81,13 +79,18 @@ def scrape_start_and_end_time(el)
start_time = start_date_time.text.split('@').last.strip
end_date = end_date_time.get_attribute('title').strip
end_time = end_date_time.text.split('@').last.strip
else
start_date = start_date.get_attribute('title').strip
end_date = start_date
start_time = '09:00'
end_time = '17:00'
end
event_start = Time.zone.parse(start_date + ' ' + start_time)
event_end = Time.zone.parse(end_date + ' ' + end_time)
return [event_start, event_end]
[event_start, event_end]
end

def recursive_description_func(css, res='')
def recursive_description_func(css, res = '')
if css.length == 1
res += css.text.strip
else
Expand Down
12 changes: 6 additions & 6 deletions test/unit/ingestors/odissei_ingestor_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,14 @@ class OdisseiIngestorTest < ActiveSupport::TestCase
ingestor = Ingestors::OdisseiIngestor.new

# check event doesn't exist yet
new_title = "SICSS – ODISSEI Summer School 2023"
new_url = 'https://odissei-data.nl/calendar/sicss-odissei-summer-school-2023/'
new_title = 'ODISSEI Conference 2024'
new_url = 'https://odissei-data.nl/calendar/odissei-conference-2024/'
refute Event.where(title: new_title, url: new_url).any?

# run task
assert_difference 'Event.count', 2 do
freeze_time(2019) do
VCR.use_cassette("ingestors/odissei") do
VCR.use_cassette('ingestors/odissei') do
ingestor.read(source.url)
ingestor.write(@user, @content_provider)
end
Expand All @@ -51,9 +51,9 @@ class OdisseiIngestorTest < ActiveSupport::TestCase
# check other fields
assert_equal 'ODISSEI', event.source
assert_equal 'Amsterdam', event.timezone
assert_equal Time.zone.parse('Mon, 19 Jun 2023 09:00:00.000000000 UTC +00:00'), event.start
assert_equal Time.zone.parse('Fri, 30 Jun 2023 17:00:00.000000000 UTC +00:00'), event.end
assert_equal 'Erasmus University Rotterdam', event.venue
assert_equal Time.zone.parse('Tue, 10 Dec 2024 09:00:00.000000000 UTC +00:00'), event.start
assert_equal Time.zone.parse('Tue, 10 Dec 2024 17:00:00.000000000 UTC +00:00'), event.end
assert_equal 'Supernova, Jaarbeurs Utrecht Jaarbeursplein 6 Utrecht, 3521AL Netherlands', event.venue
refute event.online?
end
end
66 changes: 33 additions & 33 deletions test/vcr_cassettes/ingestors/odissei.yml

Large diffs are not rendered by default.

0 comments on commit 8e7e5b7

Please sign in to comment.