Skip to content

Commit

Permalink
fix dcc ingestor
Browse files Browse the repository at this point in the history
  • Loading branch information
mikesndrs committed Sep 16, 2024
1 parent fbb3166 commit 90eba26
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 25 deletions.
20 changes: 13 additions & 7 deletions lib/ingestors/dcc_ingestor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -26,18 +26,24 @@ def read(url)
private

def process_dcc(url)
event_page = Nokogiri::HTML5.parse(open_url(url.to_s, raise: true)).css("div[class='archive__content grid']")[0].css("div[class='column span-4-sm span-8-md span-6-lg']")
# event_page = Nokogiri::HTML5.parse(open_url(url.to_s, raise: true)).css('main > .archive__content.grid > .column.span-4-sm.span-4-md.span-6-lg')
event_page = Nokogiri::HTML5.parse(open_url(url.to_s, raise: true)).css('main > article > .archive__grid > .column > .archive__content > .column')
event_page.each do |event_data|
event = OpenStruct.new

event.url = event_data.css("h2[class='post-item__title h5']")[0].css("a")[0].get_attribute('href')
event.title = event_data.css("h2[class='post-item__title h5']")[0].css("a")[0].text.strip
event.url = event_data.css('h2.post-item__title > a')[0].get_attribute('href')
event.title = event_data.css('h2.post-item__title > a')[0].text.strip

start_str = event_data.css("ul[class='post-item__meta']")[0].css("li")[0].text.strip.split('—')
event.start = Time.zone.parse(start_str[0])
event.end = Time.zone.parse(start_str[0]).beginning_of_day + Time.zone.parse(start_str[1]).seconds_since_midnight.seconds
start_str = event_data.css('ul.post-item__meta > li')[0].text.strip.split('—')
if start_str[1].include?(':')
event.start = Time.zone.parse(start_str[0])
event.end = Time.zone.parse(start_str[0]).beginning_of_day + Time.zone.parse(start_str[1]).seconds_since_midnight.seconds
else
event.start = Time.zone.parse(start_str[0])
event.end = Time.zone.parse(start_str[1])
end

event.venue = event_data.css("ul[class='post-item__meta']")[0].css("li")[1].text.strip
event.venue = event_data.css('ul.post-item__meta > li')[1].text.strip

event.source = 'DCC'
event.timezone = 'Amsterdam'
Expand Down
16 changes: 8 additions & 8 deletions test/unit/ingestors/dcc_ingestor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -22,23 +22,23 @@ class DccIngestorTest < ActiveSupport::TestCase
ingestor = Ingestors::DccIngestor.new

# check event doesn't exist yet
new_title = "DCC-PO dag"
new_url = 'https://dcc-po.nl/agenda/dcc-po-dag/'
new_title = 'Training FAIR data management'
new_url = 'https://dcc-po.nl/agenda/training-fair-data-management/'
refute Event.where(title: new_title, url: new_url).any?

# run task
assert_difference 'Event.count', 1 do
assert_difference 'Event.count', 2 do
freeze_time(2019) do
VCR.use_cassette("ingestors/dcc") do
VCR.use_cassette('ingestors/dcc') do
ingestor.read(source.url)
ingestor.write(@user, @content_provider)
end
end
end

assert_equal 1, ingestor.events.count
assert_equal 2, ingestor.events.count
assert ingestor.materials.empty?
assert_equal 1, ingestor.stats[:events][:added]
assert_equal 2, ingestor.stats[:events][:added]
assert_equal 0, ingestor.stats[:events][:updated]
assert_equal 0, ingestor.stats[:events][:rejected]

Expand All @@ -51,8 +51,8 @@ class DccIngestorTest < ActiveSupport::TestCase
# check other fields
assert_equal 'DCC', event.source
assert_equal 'Amsterdam', event.timezone
assert_equal Time.zone.parse('Mon, 09 Oct 2019 10:00:00.000000000 UTC +00:00'), event.start
assert_equal Time.zone.parse('Mon, 09 Oct 2019 16:30:00.000000000 UTC +00:00'), event.end
assert_equal Time.zone.parse('Fri, 30 Sep 2019 09:00:00.000000000 UTC +00:00'), event.start
assert_equal Time.zone.parse('Sat, 14 Oct 2019 17:00:00.000000000 UTC +00:00'), event.end
assert_equal 'Domstad, Utrecht', event.venue
end
end
26 changes: 16 additions & 10 deletions test/vcr_cassettes/ingestors/dcc.yml

Large diffs are not rendered by default.

0 comments on commit 90eba26

Please sign in to comment.