Skip to content

Commit

Permalink
Merge pull request #864 from DaanVanVugt/feature/odissei_scraper
Browse files Browse the repository at this point in the history
Feature/odissei scraper
  • Loading branch information
fbacall authored Jun 22, 2023
2 parents 2d0c4e9 + c77db83 commit c923e4d
Show file tree
Hide file tree
Showing 4 changed files with 347 additions and 0 deletions.
1 change: 1 addition & 0 deletions lib/ingestors/ingestor_factory.rb
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ def self.ingestors
Ingestors::RugIngestor,
Ingestors::LcrdmIngestor,
Ingestors::TdccIngestor,
Ingestors::OdisseiIngestor,
]
end

Expand Down
99 changes: 99 additions & 0 deletions lib/ingestors/odissei_ingestor.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
require 'open-uri'
require 'csv'
require 'nokogiri'

module Ingestors
  # Ingestor that scrapes events from the ODISSEI calendar page and from each
  # event's own detail page.
  #
  # NOTE(review): `open_url`, `add_event` and `@messages` are not defined in
  # this file; presumably they are provided by the `Ingestor` superclass.
  class OdisseiIngestor < Ingestor
    # Static description of this ingestor (key, display title and category).
    #
    # @return [Hash]
    def self.config
      {
        key: 'odissei_event',
        title: 'ODISSEI Events API',
        category: :events
      }
    end

    # Runs the scrape. Any error is appended to `@messages` rather than raised.
    #
    # @param url [String] forwarded to #process_odissei
    # @return [nil]
    def read(url)
      begin
        process_odissei(url)
      # StandardError rather than Exception, so that interrupts, exit requests
      # and out-of-memory conditions are not swallowed.
      rescue StandardError => e
        @messages << "#{self.class.name} failed with: #{e.message}"
      end

      # finished
      nil
    end

    private

    # Walks every event row on the calendar listing, fetches the linked event
    # page for venue and date details, and registers the event via `add_event`.
    # A failure while handling one row is logged to `@messages` and does not
    # stop the remaining rows from being processed.
    #
    # @param url [String] NOTE(review): currently ignored; the calendar URL is
    #   hard-coded below. Confirm whether the configured source URL should be
    #   used instead.
    def process_odissei(url)
      odissei_url = 'https://odissei-data.nl/calendar/'

      calendar = Nokogiri::HTML5.parse(open_url(odissei_url.to_s, raise: true))
      event_rows = calendar.css("div[class='tribe-events-calendar-list']").first
                           .css("div[class='tribe-common-g-row tribe-events-calendar-list__event-row']")

      event_rows.each do |event_section|
        event = OpenStruct.new
        el = event_section.css("div[class='tribe-events-calendar-list__event-details tribe-common-g-col']").first
        title_link = el.css("a[class='tribe-events-calendar-list__event-title-link tribe-common-anchor-thin']").first
        event.title = title_link.text.gsub("\n", ' ').gsub("\t", '')
        event.url = title_link.get_attribute('href')
        event.description = el.css("div[class='tribe-events-calendar-list__event-description tribe-common-b2 tribe-common-a11y-hidden']").first.css('p').first.text

        # Pause between requests, except in the test environment when the
        # recorded cassette file is present.
        sleep(1) unless Rails.env.test? && File.exist?('test/vcr_cassettes/ingestors/odissei.yml')

        el = Nokogiri::HTML5.parse(open_url(event.url.to_s, raise: true)).css("div[id='tribe-events-content']").first
        venue_css = el&.css("div[class='tribe-events-meta-group tribe-events-meta-group-venue']")&.first&.css("dl")
        # Events without a venue block are skipped.
        next unless venue_css

        event.venue = recursive_description_func(venue_css).gsub("\n", ' ').gsub("\t", '')
        times = scrape_start_and_end_time(el.css("div[class='tribe-events-meta-group tribe-events-meta-group-details']").first)
        event.start = times[0]
        event.end = times[1]
        event.source = 'ODISSEI'
        event.timezone = 'Amsterdam'
        event.set_default_times
        add_event(event)
      rescue StandardError => e
        @messages << "Extract event fields failed with: #{e.message}"
      end
    end
  end
end

# Extracts an event's start and end from its "details" meta group.
#
# Two markup variants are recognised:
# * a `div` with the start-time classes: its `title` attribute is used as the
#   date for both start and end, and its text is either "start - end" or a
#   single start time. With a single start time the end defaults to 18:00, or
#   to one hour after the start hour if that is later.
# * a pair of `abbr` elements (start-datetime / end-datetime): each `title`
#   attribute is the date and the text after the last '@' is the time.
#
# The strings are converted with String#to_time, which is not core Ruby
# (presumably the ActiveSupport extension).
#
# @param el [#css, nil] the details element; nil is tolerated
# @return [Array] two-element array `[event_start, event_end]`
# @raise [ArgumentError] when neither variant is found or a part is missing
def scrape_start_and_end_time(el)
  single_day = el&.css("div[class='tribe-events-abbr tribe-events-start-time published dtstart']")&.first
  span_start = el&.css("abbr[class='tribe-events-abbr tribe-events-start-datetime updated published dtstart']")&.first
  span_end = el&.css("abbr[class='tribe-events-abbr tribe-events-end-datetime dtend']")&.first

  if single_day
    start_date = end_date = single_day.get_attribute('title').strip
    time = single_day.text.strip
    if time.include?('-')
      start_time, end_time = time.split('-').first(2).map(&:strip)
    else
      start_time = time
      end_time = "#{[time.to_i + 1, 18].max}:00"
    end
  elsif span_start && span_end
    start_date = span_start.get_attribute('title').strip
    start_time = span_start.text.split('@').last.strip
    end_date = span_end.get_attribute('title').strip
    end_time = span_end.text.split('@').last.strip
  end

  # Fail with a readable message instead of a NoMethodError on nil when the
  # page does not contain the expected markup.
  unless start_date && start_time && end_date && end_time
    raise ArgumentError, 'Could not find a start and end date/time in the event details'
  end

  ["#{start_date} #{start_time}".to_time, "#{end_date} #{end_time}".to_time]
end

# Concatenates the stripped text of `css` onto `res`.
#
# A collection whose length is not 1 is walked element by element, threading
# the accumulated string through each call so that nothing is appended twice.
# Anything else (a collection of exactly one element, or an object that does
# not respond to `length`) contributes its own `text.strip`.
#
# @param css [#text] a node, or a collection of nodes responding to
#   `length` and `reduce`
# @param res [String] text collected so far; never mutated
# @return [String] `res` followed by the collected text
def recursive_description_func(css, res = '')
  return res + css.text.strip unless css.respond_to?(:length) && css.length != 1

  css.reduce(res) { |collected, child| recursive_description_func(child, collected) }
end
end
54 changes: 54 additions & 0 deletions test/unit/ingestors/odissei_ingestor_test.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
require 'test_helper'

# Runs Ingestors::OdisseiIngestor inside the "ingestors/odissei" VCR cassette
# and checks the events it stores.
class OdisseiIngestorTest < ActiveSupport::TestCase
  setup do
    @user = users(:regular_user)
    @content_provider = content_providers(:another_portal_provider)
    mock_ingestions
  end

  test 'can ingest events from odissei' do
    source = @content_provider.sources.build(
      url: 'https://odissei-data.nl/calendar/',
      method: 'odissei',
      enabled: true
    )

    ingestor = Ingestors::OdisseiIngestor.new

    # The event must not exist before the ingestor has run.
    expected_title = "SICSS – ODISSEI Summer School 2023"
    expected_url = 'https://odissei-data.nl/calendar/sicss-odissei-summer-school-2023/'
    refute Event.where(title: expected_title, url: expected_url).any?

    # Read and write under a frozen clock, expecting two new events.
    assert_difference 'Event.count', 2 do
      freeze_time(Time.new(2019)) do
        VCR.use_cassette("ingestors/odissei") do
          ingestor.read(source.url)
          ingestor.write(@user, @content_provider)
        end
      end
    end

    assert_equal 2, ingestor.events.count
    assert ingestor.materials.empty?
    event_stats = ingestor.stats[:events]
    assert_equal 2, event_stats[:added]
    assert_equal 0, event_stats[:updated]
    assert_equal 0, event_stats[:rejected]

    # The event now exists and carries the scraped values.
    stored = Event.where(title: expected_title, url: expected_url).first
    assert stored
    assert_equal expected_title, stored.title
    assert_equal expected_url, stored.url

    # Remaining fields.
    assert_equal 'ODISSEI', stored.source
    assert_equal 'Amsterdam', stored.timezone
    assert_equal 'Mon, 19 Jun 2023 09:00:00.000000000 UTC +00:00'.to_time, stored.start
    assert_equal 'Fri, 30 Jun 2023 17:00:00.000000000 UTC +00:00'.to_time, stored.end
    assert_equal 'Erasmus University Rotterdam', stored.venue
    assert_equal false, stored.online
  end
end
193 changes: 193 additions & 0 deletions test/vcr_cassettes/ingestors/odissei.yml

Large diffs are not rendered by default.

0 comments on commit c923e4d

Please sign in to comment.