Skip to content

Commit

Permalink
Merge pull request #971 from DaanVanVugt/feature/gpt_scraper
Browse files Browse the repository at this point in the history
LLM scraper
  • Loading branch information
fbacall authored Jul 18, 2024
2 parents ddccfde + 464cc8d commit 6cd1508
Show file tree
Hide file tree
Showing 31 changed files with 1,772 additions and 89 deletions.
11 changes: 7 additions & 4 deletions Gemfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# frozen_string_literal: true

source 'https://rubygems.org'

gem 'rails', '7.0.8.1'
Expand All @@ -14,7 +15,6 @@ gem 'bootstrap-tab-history-rails'
gem 'country_select'
gem 'devise'
gem 'devise_invitable'
gem 'sitemap_generator'
gem 'eventbrite_sdk'
gem 'font-awesome-sass', '~> 4.7.0' # Prefer V4 icon styles
gem 'friendly_id'
Expand All @@ -33,23 +33,23 @@ gem 'jquery-turbolinks'
gem 'kt-paperclip'
gem 'linkeddata'
gem 'money-rails'
gem 'omniauth-rails_csrf_protection'
gem 'omniauth_openid_connect'
gem 'omniauth-rails_csrf_protection'
gem 'pg'
gem 'private_address_check'
gem 'public_activity'
gem 'pundit'
gem 'rack-cors', require: 'rack/cors'
gem 'rails-i18n'
gem 'rails_admin'
gem 'rails-i18n'
gem 'recaptcha', require: 'recaptcha/rails'
gem 'redcarpet'
gem 'redis'
gem 'rest-client'
gem 'reverse_markdown'
gem 'rss'
gem 'sass-rails'
gem 'sassc-rails'
gem 'sass-rails'
gem 'sentry-rails'
gem 'sentry-ruby'
gem 'sentry-sidekiq'
Expand All @@ -58,6 +58,7 @@ gem 'sidekiq-status'
gem 'simple_calendar', '~> 2.4'
gem 'simple_form'
gem 'simple_token_authentication'
gem 'sitemap_generator'
gem 'sitemap-parser'
gem 'slim'
gem 'sunspot_rails', github: 'sunspot/sunspot', branch: 'master' # Contains Ruby 3 fixes that are not released
Expand Down Expand Up @@ -103,3 +104,5 @@ group :test do
gem 'vcr'
gem 'webmock'
end

gem 'ruby-openai'
12 changes: 10 additions & 2 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -195,15 +195,17 @@ GEM
erubi (1.13.0)
ethon (0.16.0)
ffi (>= 1.15.0)
event_stream_parser (1.0.0)
eventbrite_sdk (3.6.0)
rest-client (~> 2.0)
execjs (2.8.1)
faraday (2.9.0)
faraday-net_http (>= 2.0, < 3.2)
faraday-follow_redirects (0.3.0)
faraday (>= 1, < 3)
faraday-net_http (3.1.0)
net-http
faraday-multipart (1.0.4)
multipart-post (~> 2)
faraday-net_http (3.0.2)
ffi (1.15.5)
font-awesome-sass (4.7.0)
sass (>= 3.2)
Expand Down Expand Up @@ -366,6 +368,7 @@ GEM
msgpack (1.7.2)
multi_json (1.15.0)
multi_xml (0.6.0)
multipart-post (2.4.0)
nested_form (0.3.2)
net-http (0.4.1)
uri
Expand Down Expand Up @@ -600,6 +603,10 @@ GEM
unicode-display_width (>= 2.4.0, < 3.0)
rubocop-ast (1.29.0)
parser (>= 3.2.1.0)
ruby-openai (6.3.1)
event_stream_parser (>= 0.3.0, < 2.0.0)
faraday (>= 1)
faraday-multipart (>= 1)
ruby-progressbar (1.13.0)
safely_block (0.4.0)
sass (3.7.4)
Expand Down Expand Up @@ -831,6 +838,7 @@ DEPENDENCIES
reverse_markdown
rss
rubocop
ruby-openai
sass-rails
sassc-rails
sentry-rails
Expand Down
7 changes: 4 additions & 3 deletions app/controllers/events_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -233,9 +233,10 @@ def event_params
:timezone, :content_provider_id, { collection_ids: [] }, { node_ids: [] },
{ node_names: [] }, { target_audience: [] }, { eligibility: [] }, :visible,
{ host_institutions: [] }, :capacity, :contact, :recognition, :learning_objectives,
:prerequisites, :tech_requirements, :cost_basis, :cost_value, :cost_currency,
:language, external_resources_attributes: %i[id url title _destroy],
material_ids: [], locked_fields: [])
:prerequisites, :tech_requirements, :cost_basis, :cost_value, :cost_currency, :language,
external_resources_attributes: %i[id url title _destroy], material_ids: [],
llm_interaction_attributes: %i[id scrape_or_process model prompt input output needs_processing _destroy],
locked_fields: [])
end

def event_report_params
Expand Down
9 changes: 8 additions & 1 deletion app/models/event.rb
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,8 @@ class Event < ApplicationRecord
enum presence: { onsite: 0, online: 1, hybrid: 2 }

belongs_to :user
has_one :llm_interaction, inverse_of: :event, dependent: :destroy
accepts_nested_attributes_for :llm_interaction, allow_destroy: true
has_one :edit_suggestion, as: :suggestible, dependent: :destroy
has_one :link_monitor, as: :lcheck, dependent: :destroy
has_many :collection_items, as: :resource
Expand Down Expand Up @@ -286,13 +288,18 @@ def self.disabled
end

def self.not_finished
where('events.end > ? OR events.end IS NULL', Time.now)
where('events.end >= ? OR events.end IS NULL', Time.now)
end

def self.finished
where('events.end < ?', Time.now).where.not(end: nil)
end

# Scope: events whose LLM interaction is flagged with needs_processing, or whose stored
# prompt differs from +llm_prompt+.
#
# NOTE(review): both conditions compare columns of the joined llm_interactions table, so an
# event with no llm_interactions row (all-NULL joined columns) satisfies neither and is not
# returned, despite the LEFT OUTER JOIN - confirm this is intended.
#
# @param llm_prompt [String] the prompt to compare stored interactions against
# @return the chained relation produced by +joins+ and +where+
def self.needs_processing(llm_prompt)
  with_interactions = joins('LEFT OUTER JOIN llm_interactions ON llm_interactions.event_id = events.id')
  with_interactions.where('llm_interactions.needs_processing = ? OR llm_interactions.prompt != ?', true, llm_prompt)
end

# Ticket #423
def check_country_name
if country and country.respond_to?(:parameterize)
Expand Down
9 changes: 9 additions & 0 deletions app/models/llm_interaction.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Stores one LLM interaction belonging to an event: which model and prompt were used,
# the input given and the output received.
class LlmInteraction < ApplicationRecord
  belongs_to :event

  # needs_processing is intentionally not presence-validated: a presence check
  # treats `false` as blank and would reject it.
  validates :scrape_or_process, :model, :prompt, :input, :output, presence: true
end
9 changes: 4 additions & 5 deletions config/application.rb
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class Application < Rails::Application
config.middleware.insert_before 0, Rack::Cors do
allow do
origins '*'
resource '*', headers: :any, methods: [:get, :post, :options]
resource '*', headers: :any, methods: %i[get post options]
end
end

Expand All @@ -38,7 +38,7 @@ class Application < Rails::Application
ActiveSupport::HashWithIndifferentAccess, BigDecimal
]

config.exceptions_app = self.routes
config.exceptions_app = routes
end

tess_config = Rails.configuration.tess.with_indifferent_access
Expand All @@ -64,9 +64,7 @@ def self.merge_config(default_config, config, current_path = '')
puts "Setting '#{current_path}#{key}' not configured, using defaults" if Rails.env.development?
config[key] = value
end
if value.is_a?(Hash) && config[key].is_a?(Hash)
merge_config(value, config[key], current_path + "#{key}: ")
end
merge_config(value, config[key], current_path + "#{key}: ") if value.is_a?(Hash) && config[key].is_a?(Hash)
end
end

Expand All @@ -83,6 +81,7 @@ def redis_url

# Ingestion settings read from config/ingestion.yml under Rails.root, with keys
# deep-symbolized. Yields an empty hash when the file does not exist. The result is
# memoised in @ingestion, so the file is read at most once per truthy result.
def ingestion
  @ingestion ||= begin
    path = File.join(Rails.root, 'config', 'ingestion.yml')
    if File.exist?(path)
      YAML.safe_load(File.read(path)).deep_symbolize_keys!
    else
      {}
    end
  end
end
Expand Down
26 changes: 14 additions & 12 deletions config/schedule.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
require 'yaml'
begin
schedules = YAML.load_file("#{path}/config/schedule.yml")
rescue Exception => exception
rescue Exception => e
# ignore failure
ensure
# set to empty hash if not exists
Expand All @@ -17,58 +17,60 @@
# Generate a new sitemap...
every schedules.dig('sitemap', 'every')&.to_sym || :day,
at: schedules.dig('sitemap', 'at') || '1am' do
rake "sitemap:refresh"
rake 'sitemap:refresh'
end

# Process subscriptions...
if !schedules['subscriptions'].nil?
every :"#{schedules['subscriptions']['every']}", at: "#{schedules['subscriptions']['at']}" do
rake "tess:process_subscriptions"
rake 'tess:process_subscriptions'
end
else
every :day, at: '2am' do
rake "tess:process_subscriptions"
rake 'tess:process_subscriptions'
end
end

# Process ingestions
if !schedules['ingestions'].nil?
every :"#{schedules['ingestions']['every']}", at: "#{schedules['ingestions']['at']}" do
rake "tess:automated_ingestion"
rake 'tess:automated_ingestion'
rake 'tess:llm_post_processing' if TeSS::Config.llm_scraper['model_version'].present?
end
else
every :day, at: '3am' do
rake "tess:automated_ingestion"
rake 'tess:automated_ingestion'
rake 'tess:llm_post_processing' if TeSS::Config.llm_scraper['model_version'].present?
end
end

# Curation mail
if !schedules['curation_mail'].nil?
every :"#{schedules['curation_mail']['every']}", at: "#{schedules['curation_mail']['at']}" do
rake "tess:event_curation_mails"
rake 'tess:event_curation_mails'
end
else
every :monday, at: '9am' do
rake "tess:event_curation_mails"
rake 'tess:event_curation_mails'
end
end

if !schedules['autocomplete_suggestions'].nil?
every :"#{schedules['autocomplete_suggestions']['every']}", at: "#{schedules['autocomplete_suggestions']['at']}" do
rake "tess:rebuild_autocomplete_suggestions"
rake 'tess:rebuild_autocomplete_suggestions'
end
else
every :tuesday, at: '5am' do
rake "tess:rebuild_autocomplete_suggestions"
rake 'tess:rebuild_autocomplete_suggestions'
end
end

if !schedules['dead_link_check'].nil?
every :"#{schedules['dead_link_check']['every']}", at: "#{schedules['dead_link_check']['at']}" do
rake "tess:check_resource_urls"
rake 'tess:check_resource_urls'
end
else
every :day, at: '6am' do
rake "tess:check_resource_urls"
rake 'tess:check_resource_urls'
end
end
2 changes: 2 additions & 0 deletions config/secrets.example.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ external_api_keys: &external_api_keys
fairsharing:
username:
password:
gpt_api_key:
willma_api_key:

#Internal config
development:
Expand Down
3 changes: 3 additions & 0 deletions config/tess.example.yml
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,9 @@ default: &default
- CC-BY-NC-SA-4.0
- CC-BY-ND-4.0
- CC-BY-SA-4.0
llm_scraper:
model:
model_version:

development:
<<: *default
Expand Down
17 changes: 17 additions & 0 deletions db/migrate/20240220144246_add_llm_check.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Creates the llm_interactions table, adds a reference from events to it, and adds an
# open_science string-array column to events.
class AddLlmCheck < ActiveRecord::Migration[7.0]
def change
create_table :llm_interactions do |t|
# Owning event: adds an event_id column with a foreign key constraint.
t.belongs_to :event, foreign_key: true
# Timestamps are declared individually rather than via t.timestamps.
t.datetime :created_at
t.datetime :updated_at
# Free-form string columns describing the interaction.
# NOTE(review): presumably scrape_or_process distinguishes the kind of LLM step - confirm.
t.string :scrape_or_process
t.string :model
t.string :prompt
t.string :input
t.string :output
# Flag column; new rows default to false.
t.boolean :needs_processing, default: false
end
# Adds events.llm_interaction_id with a foreign key to llm_interactions.
# NOTE(review): the table above already points at events via event_id, so this reverse
# reference makes the link bidirectional at the schema level - confirm it is needed.
add_reference :events, :llm_interaction, foreign_key: true
# Array-of-strings column on events, empty by default.
add_column :events, :open_science, :string, array: true, default: []
end
end
13 changes: 13 additions & 0 deletions db/schema.rb
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,19 @@
t.index ["lcheck_type", "lcheck_id"], name: "index_link_monitors_on_lcheck_type_and_lcheck_id"
end

create_table "llm_interactions", force: :cascade do |t|
t.bigint "event_id"
t.datetime "created_at"
t.datetime "updated_at"
t.string "scrape_or_process"
t.string "model"
t.string "prompt"
t.string "input"
t.string "output"
t.boolean "needs_processing", default: false
t.index ["event_id"], name: "index_llm_interactions_on_event_id"
end

create_table "materials", force: :cascade do |t|
t.text "title"
t.string "url"
Expand Down
28 changes: 28 additions & 0 deletions lib/ingestors/fourtu_llm_ingestor.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
require 'open-uri'
require 'csv'
require 'nokogiri'

module Ingestors
  # Ingestor for the 4TU agenda that passes each event page's markup to
  # get_event_from_css (not defined in this file; presumably inherited from LlmIngestor).
  class FourtuLlmIngestor < LlmIngestor
    # Registration details for this ingestor.
    def self.config
      { key: '4tu_llm_event', title: '4TU LLM Events API', category: :events }
    end

    private

    # Fetches the 4TU agenda listing, follows every result link in the first
    # `.searchresults` container, and passes the main content of each event page to
    # get_event_from_css. The argument is ignored: the agenda URL is fixed.
    def process_llm(_url)
      agenda_url = 'https://www.4tu.nl/en/agenda/'
      listing = Nokogiri::HTML5.parse(open_url(agenda_url, raise: true))
      result_links = listing.css('.searchresults').first.css('a.searchresult')

      result_links.each do |link|
        event_url = link['href']
        # Pause between requests, except in the test environment when the cassette file exists.
        cassette_present = Rails.env.test? && File.exist?('test/vcr_cassettes/ingestors/4tu_llm.yml')
        sleep(1) unless cassette_present

        document = Nokogiri::HTML5.parse(open_url(event_url, raise: true))
        content = document.css('body').css('main, .page-header__content')
        get_event_from_css(event_url, content)
      end
    end
  end
end
Loading

0 comments on commit 6cd1508

Please sign in to comment.