Skip to content

Commit

Permalink
Merge pull request #881 from DaanVanVugt/feature/researchsoftwaretrai…
Browse files Browse the repository at this point in the history
…ning

basic scraper for materials of researchsoftwaretraining
  • Loading branch information
fbacall authored Aug 22, 2023
2 parents 221920d + 37c6955 commit 46317e8
Show file tree
Hide file tree
Showing 6 changed files with 148 additions and 2 deletions.
1 change: 1 addition & 0 deletions lib/ingestors/ingestor_factory.rb
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ def self.ingestors
Ingestors::TdccIngestor,
Ingestors::UhasseltIngestor,
Ingestors::OdisseiIngestor,
Ingestors::RstIngestor,
]
end

Expand Down
42 changes: 42 additions & 0 deletions lib/ingestors/rst_ingestor.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
require 'open-uri'
require 'csv'
require 'nokogiri'

module Ingestors
  # Ingestor that scrapes the resources page of researchsoftwaretraining.nl
  # and registers every link found in the main content area as a material.
  class RstIngestor < Ingestor
    # Page listing the training resources. The scraper always reads this
    # address; the URL handed to #read is currently not consulted.
    RESOURCES_URL = 'https://researchsoftwaretraining.nl/resources/'.freeze

    # CSS selector for the element wrapping the list of resource links.
    CONTENT_SELECTOR = "div[class='inner_content cf']".freeze

    # Static description of this ingestor (key, display title, category).
    #
    # @return [Hash]
    def self.config
      {
        key: 'rst_material',
        title: 'RST Events API',
        category: :materials
      }
    end

    # Entry point: scrapes the resources page and queues the materials found.
    # Any failure is recorded in @messages instead of being propagated.
    #
    # @param url [String] source URL (passed through to #process_rst)
    # @return [nil]
    def read(url)
      begin
        process_rst(url)
      # StandardError rather than Exception, so that interrupts, exit
      # requests and out-of-memory conditions are not silently swallowed.
      rescue StandardError => e
        @messages << "#{self.class.name} failed with: #{e.message}"
      end

      nil
    end

    private

    # Downloads RESOURCES_URL, locates the content container and turns each
    # `p > a` link inside it into a material whose title and description are
    # the link text and whose url is the link target.
    #
    # NOTE(review): `open_url` and `add_material` are not defined in this
    # file; presumably they are provided by the Ingestor base class - confirm.
    #
    # @param _url [String] ignored; RESOURCES_URL is always used
    def process_rst(_url)
      page = Nokogiri::HTML5.parse(open_url(RESOURCES_URL, raise: true))
      container = page.css(CONTENT_SELECTOR).first

      # If the site layout changes the container may be missing; report that
      # clearly instead of failing with a NoMethodError on nil.
      if container.nil?
        @messages << "#{self.class.name} failed with: no element matching #{CONTENT_SELECTOR} at #{RESOURCES_URL}"
        return
      end

      container.css('p > a').each do |link|
        material = OpenStruct.new
        material.title = link.text
        material.url = link.get_attribute('href')
        material.description = material.title
        add_material(material)
      rescue StandardError => e
        # A single bad link must not abort the remaining ones.
        @messages << "Extract material fields failed with: #{e.message}"
      end
    end
  end
end
2 changes: 1 addition & 1 deletion lib/ingestors/uhasselt_ingestor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def process_uhasselt(url)
end
start_date = Time.zone.parse("#{date_s[1]}/#{date_s[0]} #{start_hours}:00")
end_date = Time.zone.parse("#{date_s[1]}/#{date_s[0]} #{end_hours}:00")
if start_date < Date.today - 2.months
if start_date < Time.zone.now - 2.months
start_date += 1.year
end_date += 1.year
end
Expand Down
48 changes: 48 additions & 0 deletions test/unit/ingestors/rst_ingestor_test.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
require 'test_helper'

# Unit test for Ingestors::RstIngestor, run against the recorded
# "ingestors/rst" VCR cassette.
class RstIngestorTest < ActiveSupport::TestCase
  setup do
    @user = users(:regular_user)
    @content_provider = content_providers(:portal_provider)
    mock_ingestions
    mock_timezone # the machine's time zone must not influence the outcome
  end

  teardown { reset_timezone }

  test 'can ingest materials from rst' do
    source_attributes = {
      url: 'https://researchsoftwaretraining.nl/resources/',
      method: 'rst',
      enabled: true
    }
    source = @content_provider.sources.build(source_attributes)
    ingestor = Ingestors::RstIngestor.new

    expected_title = 'Software Carpentry'
    expected_url = 'https://software-carpentry.org/lessons/'
    matching_materials = -> { Material.where(title: expected_title, url: expected_url) }

    # precondition: the material has not been ingested yet
    refute matching_materials.call.any?

    # read and write the source; exactly 24 materials should be created
    assert_difference('Material.count', 24) do
      freeze_time(2019) do
        VCR.use_cassette("ingestors/rst") do
          ingestor.read(source.url)
          ingestor.write(@user, @content_provider)
        end
      end
    end

    # postcondition: the material now exists with the expected attributes
    ingested = matching_materials.call.first
    assert ingested
    assert_equal expected_title, ingested.title
    assert_equal expected_url, ingested.url

    # the description is expected to equal the link text
    assert_equal 'Software Carpentry', ingested.description
  end
end
2 changes: 1 addition & 1 deletion test/unit/ingestors/uhasselt_ingestor_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ class UhasseltIngestorTest < ActiveSupport::TestCase

# run task
assert_difference 'Event.count', 14 do
freeze_time(Time.new('2023-06-18')) do
freeze_time(2023) do
VCR.use_cassette("ingestors/uhasselt") do
ingestor.read(source.url)
ingestor.write(@user, @content_provider)
Expand Down
55 changes: 55 additions & 0 deletions test/vcr_cassettes/ingestors/rst.yml

Large diffs are not rendered by default.

0 comments on commit 46317e8

Please sign in to comment.