Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added OSCI scraper #890

Merged
merged 2 commits into from
Sep 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion lib/ingestors/ingestor_factory.rb
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ def self.ingestors
Ingestors::UhasseltIngestor,
Ingestors::OdisseiIngestor,
Ingestors::RstIngestor,
Ingestors::DccIngestor,
Ingestors::OsciIngestor,
Ingestors::DccIngestor
]
end

Expand Down
79 changes: 79 additions & 0 deletions lib/ingestors/osci_ingestor.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
require 'open-uri'
require 'csv'
require 'nokogiri'

module Ingestors
  # Scrapes events from the month-by-month calendar pages of
  # osc-international.com and registers each one through +add_event+.
  class OsciIngestor < Ingestor
    # Address of the calendar that is scraped; query parameters select the month.
    CALENDAR_URL = 'https://osc-international.com/my-calendar/'
    # Recorded HTTP fixture. When it exists in the test environment the
    # one-second pause between requests is skipped.
    VCR_CASSETTE = 'test/vcr_cassettes/ingestors/osci.yml'

    # @return [Hash] the key, title and category that describe this ingestor
    def self.config
      {
        key: 'osci',
        title: 'OSCI Events API',
        category: :events
      }
    end

    # Scrapes the calendar and collects every event found. Errors are appended
    # to @messages instead of being raised.
    #
    # @param url [String] source URL, handed on to #process_osci
    # @return [nil]
    def read(url)
      process_osci(url)
      nil
    rescue StandardError => e
      # StandardError (not Exception) so that interrupts and exit requests
      # still terminate the process.
      @messages << "#{self.class.name} failed with: #{e.message}"
      nil
    end

    private

    # Walks the twelve calendar months starting at the current one and adds
    # every event found. A failure inside one calendar cell is logged to
    # @messages and the remaining cells are still processed; a failure while
    # fetching or locating a month's calendar propagates to the caller.
    #
    # @param url [String] currently unused: the scraped address is CALENDAR_URL
    def process_osci(url)
      # Sample the clock once so month and year cannot disagree.
      now = Time.zone.now

      (1..12).each do |month|
        throttle
        scrape_url = calendar_url(month, now)

        calendar_cells(scrape_url).each do |cell|
          next if cell.get_attribute('class').include?('no-events')

          cell.css('div[id*=calendar-my-calendar]').each do |node|
            add_event(build_event(node, scrape_url))
          end
        rescue StandardError => e
          @messages << "Extract event fields failed with: #{e.message}"
        end
      end
    end

    # Pauses for one second between requests, except in the test environment
    # when the recorded cassette is available.
    def throttle
      return if Rails.env.test? && File.exist?(VCR_CASSETTE)

      sleep(1)
    end

    # Builds the calendar URL for +month+. Months earlier than the current one
    # are requested for the following year, so the window always covers the
    # next twelve months.
    def calendar_url(month, now)
      year = month >= now.month ? now.year : now.year + 1
      "#{CALENDAR_URL}?format=calendar&month=#{month}&yr=#{year}"
    end

    # Fetches +scrape_url+ and returns the <td> cells of the calendar table.
    def calendar_cells(scrape_url)
      page = Nokogiri::HTML5.parse(open_url(scrape_url, raise: true))
      page.css("div[id='my-calendar']")[0].css('tbody')[0].css('td')
    end

    # Extracts one event from a calendar entry node.
    #
    # @param node the element wrapping a single calendar entry
    # @param scrape_url [String] page the entry was found on; the entry's
    #   anchor is appended to it to form the event URL
    # @return [OpenStruct] the populated event
    def build_event(node, scrape_url)
      event = OpenStruct.new
      anchor = node.css("h3[class='event-title summary']")[0].css('a')[0].get_attribute('href')
      event.url = scrape_url + anchor

      # The anchor (minus '#') is the id of the element holding the details.
      details = node.css("div[id='#{anchor.gsub('#', '')}']")[0]
      event.title = details.css("h4[class='mc-title']")[0].text.strip
      event.venue = details.css("div[class='mc-location']")[0].css("strong[class='location-link']")[0].text.strip

      assign_times(event, details.css("div[class='time-block']")[0])

      event.source = 'OSCI'
      event.timezone = 'Amsterdam'
      event.set_default_times
      event
    end

    # Sets event.start and event.end from the entry's time block. Entries with
    # an explicit start time carry <time datetime="..."> elements; the others
    # only expose a start date and, optionally, an end-time text.
    def assign_times(event, time_block)
      timed_start = time_block.css("span[class='event-time dtstart']").first

      if timed_start
        timed_end = time_block.css("span[class='end-time dtend']").first
        event.start = Time.zone.parse(timed_start.css('time').first.get_attribute('datetime'))
        event.end = Time.zone.parse(timed_end.css('time').first.get_attribute('datetime'))
      else
        start_date = time_block.css("span[class='mc-start-date dtstart']").first
        end_text = time_block.css("span[class='event-time dtend']").first
        event.start = Time.zone.parse(start_date.get_attribute('content'))
        event.end = end_text ? Time.zone.parse(end_text.text.strip) : event.start
      end

      # parsed datetimes are always 2 hours off
      # NOTE(review): a fixed two-hour correction presumably matches summer
      # time only - confirm the behaviour for winter dates.
      event.start += 2.hours
      event.end += 2.hours
    end
  end
end
59 changes: 59 additions & 0 deletions test/unit/ingestors/osci_ingestor_test.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
require 'test_helper'

# Unit test for Ingestors::OsciIngestor, replayed from the
# "ingestors/osci" VCR cassette.
class OsciIngestorTest < ActiveSupport::TestCase
  setup do
    @user = users(:regular_user)
    @content_provider = content_providers(:another_portal_provider)
    mock_ingestions
    mock_timezone # keep the result independent of the system time zone
  end

  teardown do
    reset_timezone
  end

  test 'can ingest events from osci' do
    ingestor = Ingestors::OsciIngestor.new
    source = @content_provider.sources.build(
      url: 'https://osc-international.com/my-calendar/',
      method: 'osci',
      enabled: true
    )

    # The event used as a probe before and after the ingestion run.
    expected_title = "14:00: Open Science Coffee: Assessing robustness through multiverse analysis – Applications in research and education"
    expected_url = 'https://osc-international.com/my-calendar/?format=calendar&month=9&yr=2023#mc_calendar_03_2-calendar-details-my-calendar'
    probe = -> { Event.where(title: expected_title, url: expected_url) }

    # check the event doesn't exist yet
    refute probe.call.any?

    # run the ingestion with the clock frozen and HTTP replayed from the cassette
    assert_difference 'Event.count', 12 do
      freeze_time(2023) do
        VCR.use_cassette("ingestors/osci") do
          ingestor.read(source.url)
          ingestor.write(@user, @content_provider)
        end
      end
    end

    # check the ingestion totals
    event_stats = ingestor.stats[:events]
    assert_equal 18, ingestor.events.count
    assert_empty ingestor.materials
    assert_equal 12, event_stats[:added]
    assert_equal 6, event_stats[:updated]
    assert_equal 0, event_stats[:rejected]

    # check the event now exists
    ingested = probe.call.first
    assert ingested
    assert_equal expected_title, ingested.title
    assert_equal expected_url, ingested.url

    # check the remaining fields
    assert_equal 'OSCI', ingested.source
    assert_equal 'Amsterdam', ingested.timezone
    assert_equal Time.zone.parse('Sun, 03 Sep 2023 14:00:00.000000000 UTC +00:00'), ingested.start
    assert_equal Time.zone.parse('Sun, 03 Sep 2023 15:00:00.000000000 UTC +00:00'), ingested.end
    assert_equal 'OSC Leiden', ingested.venue
  end
end
507 changes: 507 additions & 0 deletions test/vcr_cassettes/ingestors/osci.yml

Large diffs are not rendered by default.