Skip to content

Commit

Permalink
Added OSCI scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
mikesndrs committed Sep 12, 2023
1 parent f8349fb commit d1c5b15
Show file tree
Hide file tree
Showing 4 changed files with 646 additions and 0 deletions.
1 change: 1 addition & 0 deletions lib/ingestors/ingestor_factory.rb
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def self.ingestors
Ingestors::UhasseltIngestor,
Ingestors::OdisseiIngestor,
Ingestors::RstIngestor,
Ingestors::OsciIngestor,
]
end

Expand Down
79 changes: 79 additions & 0 deletions lib/ingestors/osci_ingestor.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
require 'open-uri'
require 'csv'
require 'nokogiri'

module Ingestors
  # Scrapes events from the OSC International "My Calendar" HTML pages.
  #
  # Twelve monthly calendar views are fetched per run: months on or after the
  # current month are requested for the current year, earlier months for the
  # next year. Every event found in a day cell is turned into an OpenStruct and
  # handed to +add_event+.
  #
  # Relies on +open_url+, +add_event+ and +@messages+, which are not defined in
  # this class (presumably provided by the Ingestor superclass).
  class OsciIngestor < Ingestor
    # Base address of the calendar; month and year are appended as query params.
    CALENDAR_URL = 'https://osc-international.com/my-calendar/'.freeze

    # When this cassette file exists in the test environment the politeness
    # delay between requests is skipped.
    VCR_CASSETTE = 'test/vcr_cassettes/ingestors/osci.yml'.freeze

    # @return [Hash] key, title and category describing this ingestor
    def self.config
      {
        key: 'osci',
        title: 'OSCI Events API',
        category: :events
      }
    end

    # Runs the scrape and records any failure in +@messages+ instead of raising.
    #
    # @param url [String] source URL (see the note on #process_osci)
    # @return [nil] always
    def read(url)
      process_osci(url)
      nil
    rescue StandardError => e
      # StandardError rather than Exception, so interrupts and exit requests
      # are not swallowed by the scraper.
      @messages << "#{self.class.name} failed with: #{e.message}"
      nil
    end

    private

    # Walks the twelve monthly calendar pages and ingests every event on them.
    #
    # NOTE(review): +url+ is accepted for interface compatibility but is not
    # used; the calendar address is hard-coded in CALENDAR_URL. Confirm whether
    # the configured source URL should drive the scrape instead.
    #
    # A failure while fetching or parsing a month page is not rescued here and
    # propagates to the caller, aborting the remaining months.
    def process_osci(url)
      now = Time.zone.now # read once so month and year are always consistent

      (1..12).each do |month|
        sleep(1) unless cassette_available?

        year = month >= now.month ? now.year : now.year + 1
        scrape_url = "#{CALENDAR_URL}?format=calendar&month=#{month}&yr=#{year}"
        day_cells(scrape_url).each { |cell| process_day_cell(cell, scrape_url) }
      end
    end

    # True when running in the test environment and the VCR cassette file exists.
    def cassette_available?
      Rails.env.test? && File.exist?(VCR_CASSETTE)
    end

    # Fetches one month page and returns the <td> nodes of the calendar table.
    def day_cells(scrape_url)
      page = Nokogiri::HTML5.parse(open_url(scrape_url, raise: true))
      page.at_css("div[id='my-calendar']").at_css('tbody').css('td')
    end

    # Adds every event found in one day cell. An error in any event of the cell
    # is logged to +@messages+ and ends processing of that cell only.
    def process_day_cell(cell, scrape_url)
      # to_s guards against a cell that carries no class attribute at all.
      return if cell['class'].to_s.include?('no-events')

      cell.css('div[id*=calendar-my-calendar]').each do |entry|
        add_event(build_event(entry, scrape_url))
      end
    rescue StandardError => e
      @messages << "Extract event fields failed with: #{e.message}"
    end

    # Builds the event object for a single calendar entry node.
    def build_event(entry, scrape_url)
      # The title link's href is an in-page anchor ("#<id>"); the element with
      # that id inside the entry holds the event details.
      anchor = entry.at_css("h3[class='event-title summary']").at_css('a')['href']
      details = entry.at_css("div[id='#{anchor.delete('#')}']")

      event = OpenStruct.new
      event.url = scrape_url + anchor
      event.title = details.at_css("h4[class='mc-title']").text.strip
      event.venue = details.at_css("div[class='mc-location']")
                           .at_css("strong[class='location-link']").text.strip
      event.start, event.end = event_times(details.at_css("div[class='time-block']"))
      event.source = 'OSCI'
      event.timezone = 'Amsterdam'
      event.set_default_times
      event
    end

    # Extracts the [start, end] times from an entry's time block.
    #
    # Entries with an 'event-time dtstart' span take both times from the
    # <time datetime> attributes. Other entries take the start from the
    # 'mc-start-date dtstart' span's content attribute, and the end from the
    # text of an 'event-time dtend' span when present, otherwise from the start.
    def event_times(time_block)
      timed_start = time_block.at_css("span[class='event-time dtstart']")

      if timed_start
        starts_at = Time.zone.parse(timed_start.at_css('time')['datetime'])
        ends_at = Time.zone.parse(
          time_block.at_css("span[class='end-time dtend']").at_css('time')['datetime']
        )
      else
        starts_at = Time.zone.parse(time_block.at_css("span[class='mc-start-date dtstart']")['content'])
        end_span = time_block.at_css("span[class='event-time dtend']")
        ends_at = end_span ? Time.zone.parse(end_span.text.strip) : starts_at
      end

      # parsed datetimes are always 2 hours off
      # NOTE(review): a fixed two-hour shift looks like a summer-time UTC offset;
      # confirm it is still right for events outside daylight saving time.
      [starts_at + 2.hours, ends_at + 2.hours]
    end
  end
end
59 changes: 59 additions & 0 deletions test/unit/ingestors/osci_ingestor_test.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
require 'test_helper'

# Exercises Ingestors::OsciIngestor end to end against a recorded VCR cassette.
class OsciIngestorTest < ActiveSupport::TestCase
  setup do
    @user = users(:regular_user)
    @content_provider = content_providers(:another_portal_provider)
    mock_ingestions
    mock_timezone # System time zone should not affect test result
  end

  teardown { reset_timezone }

  test 'can ingest events from osci' do
    source = @content_provider.sources.build(
      url: 'https://osc-international.com/my-calendar/',
      method: 'osci',
      enabled: true
    )
    ingestor = Ingestors::OsciIngestor.new

    # Attributes identifying one event that the ingestion is expected to create.
    expected = {
      title: "14:00: Open Science Coffee: Assessing robustness through multiverse analysis – Applications in research and education",
      url: 'https://osc-international.com/my-calendar/?format=calendar&month=9&yr=2023#mc_calendar_03_2-calendar-details-my-calendar'
    }

    # The event must not exist before the ingestor runs.
    assert_not Event.exists?(expected)

    # Run the ingestion with the clock frozen and HTTP replayed from the cassette.
    assert_difference('Event.count', 12) do
      freeze_time(2023) do
        VCR.use_cassette("ingestors/osci") do
          ingestor.read(source.url)
          ingestor.write(@user, @content_provider)
        end
      end
    end

    # Overall ingestion statistics.
    assert_equal 18, ingestor.events.count
    assert_empty ingestor.materials
    stats = ingestor.stats[:events]
    assert_equal 12, stats[:added]
    assert_equal 6, stats[:updated]
    assert_equal 0, stats[:rejected]

    # The event now exists with the expected identifying attributes.
    event = Event.where(expected).first
    assert event
    assert_equal expected[:title], event.title
    assert_equal expected[:url], event.url

    # Remaining fields populated by the ingestor.
    assert_equal 'OSCI', event.source
    assert_equal 'Amsterdam', event.timezone
    assert_equal Time.zone.parse('Sun, 03 Sep 2023 14:00:00.000000000 UTC +00:00'), event.start
    assert_equal Time.zone.parse('Sun, 03 Sep 2023 15:00:00.000000000 UTC +00:00'), event.end
    assert_equal 'OSC Leiden', event.venue
  end
end
507 changes: 507 additions & 0 deletions test/vcr_cassettes/ingestors/osci.yml

Large diffs are not rendered by default.

0 comments on commit d1c5b15

Please sign in to comment.