diff --git a/Gemfile b/Gemfile index 62b1105d6..92ed77c07 100644 --- a/Gemfile +++ b/Gemfile @@ -1,4 +1,5 @@ # frozen_string_literal: true + source 'https://rubygems.org' gem 'rails', '7.0.8.1' @@ -14,7 +15,6 @@ gem 'bootstrap-tab-history-rails' gem 'country_select' gem 'devise' gem 'devise_invitable' -gem 'sitemap_generator' gem 'eventbrite_sdk' gem 'font-awesome-sass', '~> 4.7.0' # Prefer V4 icon styles gem 'friendly_id' @@ -33,23 +33,23 @@ gem 'jquery-turbolinks' gem 'kt-paperclip' gem 'linkeddata' gem 'money-rails' -gem 'omniauth-rails_csrf_protection' gem 'omniauth_openid_connect' +gem 'omniauth-rails_csrf_protection' gem 'pg' gem 'private_address_check' gem 'public_activity' gem 'pundit' gem 'rack-cors', require: 'rack/cors' -gem 'rails-i18n' gem 'rails_admin' +gem 'rails-i18n' gem 'recaptcha', require: 'recaptcha/rails' gem 'redcarpet' gem 'redis' gem 'rest-client' gem 'reverse_markdown' gem 'rss' -gem 'sass-rails' gem 'sassc-rails' +gem 'sass-rails' gem 'sentry-rails' gem 'sentry-ruby' gem 'sentry-sidekiq' @@ -58,6 +58,7 @@ gem 'sidekiq-status' gem 'simple_calendar', '~> 2.4' gem 'simple_form' gem 'simple_token_authentication' +gem 'sitemap_generator' gem 'sitemap-parser' gem 'slim' gem 'sunspot_rails', github: 'sunspot/sunspot', branch: 'master' # Contains Ruby 3 fixes that are not released @@ -103,3 +104,5 @@ group :test do gem 'vcr' gem 'webmock' end + +gem 'ruby-openai' diff --git a/Gemfile.lock b/Gemfile.lock index 385e706d0..72eae82ae 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -195,6 +195,7 @@ GEM erubi (1.13.0) ethon (0.16.0) ffi (>= 1.15.0) + event_stream_parser (1.0.0) eventbrite_sdk (3.6.0) rest-client (~> 2.0) execjs (2.8.1) @@ -202,8 +203,9 @@ GEM faraday-net_http (>= 2.0, < 3.2) faraday-follow_redirects (0.3.0) faraday (>= 1, < 3) - faraday-net_http (3.1.0) - net-http + faraday-multipart (1.0.4) + multipart-post (~> 2) + faraday-net_http (3.0.2) ffi (1.15.5) font-awesome-sass (4.7.0) sass (>= 3.2) @@ -366,6 +368,7 @@ GEM msgpack (1.7.2) 
multi_json (1.15.0) multi_xml (0.6.0) + multipart-post (2.4.0) nested_form (0.3.2) net-http (0.4.1) uri @@ -600,6 +603,10 @@ GEM unicode-display_width (>= 2.4.0, < 3.0) rubocop-ast (1.29.0) parser (>= 3.2.1.0) + ruby-openai (6.3.1) + event_stream_parser (>= 0.3.0, < 2.0.0) + faraday (>= 1) + faraday-multipart (>= 1) ruby-progressbar (1.13.0) safely_block (0.4.0) sass (3.7.4) @@ -831,6 +838,7 @@ DEPENDENCIES reverse_markdown rss rubocop + ruby-openai sass-rails sassc-rails sentry-rails diff --git a/app/controllers/events_controller.rb b/app/controllers/events_controller.rb index cbba8fef6..11f7fb116 100644 --- a/app/controllers/events_controller.rb +++ b/app/controllers/events_controller.rb @@ -233,9 +233,10 @@ def event_params :timezone, :content_provider_id, { collection_ids: [] }, { node_ids: [] }, { node_names: [] }, { target_audience: [] }, { eligibility: [] }, :visible, { host_institutions: [] }, :capacity, :contact, :recognition, :learning_objectives, - :prerequisites, :tech_requirements, :cost_basis, :cost_value, :cost_currency, - :language, external_resources_attributes: %i[id url title _destroy], - material_ids: [], locked_fields: []) + :prerequisites, :tech_requirements, :cost_basis, :cost_value, :cost_currency, :language, + external_resources_attributes: %i[id url title _destroy], material_ids: [], + llm_interaction_attributes: %i[id scrape_or_process model prompt input output needs_processing _destroy], + locked_fields: []) end def event_report_params diff --git a/app/models/event.rb b/app/models/event.rb index e4320de95..ec4ca68a6 100644 --- a/app/models/event.rb +++ b/app/models/event.rb @@ -110,6 +110,8 @@ class Event < ApplicationRecord enum presence: { onsite: 0, online: 1, hybrid: 2 } belongs_to :user + has_one :llm_interaction, inverse_of: :event, dependent: :destroy + accepts_nested_attributes_for :llm_interaction, allow_destroy: true has_one :edit_suggestion, as: :suggestible, dependent: :destroy has_one :link_monitor, as: :lcheck, dependent: 
:destroy has_many :collection_items, as: :resource @@ -286,13 +288,18 @@ def self.disabled end def self.not_finished - where('events.end > ? OR events.end IS NULL', Time.now) + where('events.end >= ? OR events.end IS NULL', Time.now) end def self.finished where('events.end < ?', Time.now).where.not(end: nil) end + def self.needs_processing(llm_prompt) + joins('LEFT OUTER JOIN llm_interactions ON llm_interactions.event_id = events.id') + .where('llm_interactions.needs_processing = ? OR llm_interactions.prompt != ?', true, llm_prompt) + end + # Ticket #423 def check_country_name if country and country.respond_to?(:parameterize) diff --git a/app/models/llm_interaction.rb b/app/models/llm_interaction.rb new file mode 100644 index 000000000..f101580f1 --- /dev/null +++ b/app/models/llm_interaction.rb @@ -0,0 +1,9 @@ +class LlmInteraction < ApplicationRecord + belongs_to :event + validates :scrape_or_process, presence: true + validates :model, presence: true + validates :prompt, presence: true + validates :input, presence: true + validates :output, presence: true + # validates :needs_processing, presence: true +end diff --git a/config/application.rb b/config/application.rb index ced34c78b..6f6f1c2f5 100644 --- a/config/application.rb +++ b/config/application.rb @@ -21,7 +21,7 @@ class Application < Rails::Application config.middleware.insert_before 0, Rack::Cors do allow do origins '*' - resource '*', headers: :any, methods: [:get, :post, :options] + resource '*', headers: :any, methods: %i[get post options] end end @@ -38,7 +38,7 @@ class Application < Rails::Application ActiveSupport::HashWithIndifferentAccess, BigDecimal ] - config.exceptions_app = self.routes + config.exceptions_app = routes end tess_config = Rails.configuration.tess.with_indifferent_access @@ -64,9 +64,7 @@ def self.merge_config(default_config, config, current_path = '') puts "Setting '#{current_path}#{key}' not configured, using defaults" if Rails.env.development? 
config[key] = value end - if value.is_a?(Hash) && config[key].is_a?(Hash) - merge_config(value, config[key], current_path + "#{key}: ") - end + merge_config(value, config[key], current_path + "#{key}: ") if value.is_a?(Hash) && config[key].is_a?(Hash) end end @@ -83,6 +81,7 @@ def redis_url def ingestion return @ingestion if @ingestion + config_file = File.join(Rails.root, 'config', 'ingestion.yml') @ingestion = File.exist?(config_file) ? YAML.safe_load(File.read(config_file)).deep_symbolize_keys! : {} end diff --git a/config/schedule.rb b/config/schedule.rb index cfbebca85..585419d5b 100644 --- a/config/schedule.rb +++ b/config/schedule.rb @@ -7,7 +7,7 @@ require 'yaml' begin schedules = YAML.load_file("#{path}/config/schedule.yml") -rescue Exception => exception +rescue Exception => e # ignore failure ensure # set to empty hash if not exists @@ -17,58 +17,60 @@ # Generate a new sitemap... every schedules.dig('sitemap', 'every')&.to_sym || :day, at: schedules.dig('sitemap', 'at') || '1am' do - rake "sitemap:refresh" + rake 'sitemap:refresh' end # Process subscriptions... if !schedules['subscriptions'].nil? every :"#{schedules['subscriptions']['every']}", at: "#{schedules['subscriptions']['at']}" do - rake "tess:process_subscriptions" + rake 'tess:process_subscriptions' end else every :day, at: '2am' do - rake "tess:process_subscriptions" + rake 'tess:process_subscriptions' end end # Process ingestions if !schedules['ingestions'].nil? every :"#{schedules['ingestions']['every']}", at: "#{schedules['ingestions']['at']}" do - rake "tess:automated_ingestion" + rake 'tess:automated_ingestion' + rake 'tess:llm_post_processing' if TeSS::Config.llm_scraper['model_version'].present? end else every :day, at: '3am' do - rake "tess:automated_ingestion" + rake 'tess:automated_ingestion' + rake 'tess:llm_post_processing' if TeSS::Config.llm_scraper['model_version'].present? end end # Curation mail if !schedules['curation_mail'].nil? 
every :"#{schedules['curation_mail']['every']}", at: "#{schedules['curation_mail']['at']}" do - rake "tess:event_curation_mails" + rake 'tess:event_curation_mails' end else every :monday, at: '9am' do - rake "tess:event_curation_mails" + rake 'tess:event_curation_mails' end end if !schedules['autocomplete_suggestions'].nil? every :"#{schedules['autocomplete_suggestions']['every']}", at: "#{schedules['autocomplete_suggestions']['at']}" do - rake "tess:rebuild_autocomplete_suggestions" + rake 'tess:rebuild_autocomplete_suggestions' end else every :tuesday, at: '5am' do - rake "tess:rebuild_autocomplete_suggestions" + rake 'tess:rebuild_autocomplete_suggestions' end end if !schedules['dead_link_check'].nil? every :"#{schedules['dead_link_check']['every']}", at: "#{schedules['dead_link_check']['at']}" do - rake "tess:check_resource_urls" + rake 'tess:check_resource_urls' end else every :day, at: '6am' do - rake "tess:check_resource_urls" + rake 'tess:check_resource_urls' end end diff --git a/config/secrets.example.yml b/config/secrets.example.yml index 4c3c3e258..40fbff566 100644 --- a/config/secrets.example.yml +++ b/config/secrets.example.yml @@ -38,6 +38,8 @@ external_api_keys: &external_api_keys fairsharing: username: password: + gpt_api_key: + willma_api_key: #Internal config development: diff --git a/config/tess.example.yml b/config/tess.example.yml index bd3e33a65..64cbfafc1 100644 --- a/config/tess.example.yml +++ b/config/tess.example.yml @@ -153,6 +153,9 @@ default: &default - CC-BY-NC-SA-4.0 - CC-BY-ND-4.0 - CC-BY-SA-4.0 + llm_scraper: + model: + model_version: development: <<: *default diff --git a/db/migrate/20240220144246_add_llm_check.rb b/db/migrate/20240220144246_add_llm_check.rb new file mode 100644 index 000000000..28a08facb --- /dev/null +++ b/db/migrate/20240220144246_add_llm_check.rb @@ -0,0 +1,17 @@ +class AddLlmCheck < ActiveRecord::Migration[7.0] + def change + create_table :llm_interactions do |t| + t.belongs_to :event, foreign_key: true + 
t.datetime :created_at + t.datetime :updated_at + t.string :scrape_or_process + t.string :model + t.string :prompt + t.string :input + t.string :output + t.boolean :needs_processing, default: false + end + add_reference :events, :llm_interaction, foreign_key: true + add_column :events, :open_science, :string, array: true, default: [] + end +end diff --git a/db/schema.rb b/db/schema.rb index f20cae85d..ab2e622c4 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -331,6 +331,19 @@ t.index ["lcheck_type", "lcheck_id"], name: "index_link_monitors_on_lcheck_type_and_lcheck_id" end + create_table "llm_interactions", force: :cascade do |t| + t.bigint "event_id" + t.datetime "created_at" + t.datetime "updated_at" + t.string "scrape_or_process" + t.string "model" + t.string "prompt" + t.string "input" + t.string "output" + t.boolean "needs_processing", default: false + t.index ["event_id"], name: "index_llm_interactions_on_event_id" + end + create_table "materials", force: :cascade do |t| t.text "title" t.string "url" diff --git a/lib/ingestors/fourtu_llm_ingestor.rb b/lib/ingestors/fourtu_llm_ingestor.rb new file mode 100644 index 000000000..a1e4dca44 --- /dev/null +++ b/lib/ingestors/fourtu_llm_ingestor.rb @@ -0,0 +1,28 @@ +require 'open-uri' +require 'csv' +require 'nokogiri' + +module Ingestors + class FourtuLlmIngestor < LlmIngestor + def self.config + { + key: '4tu_llm_event', + title: '4TU LLM Events API', + category: :events + } + end + + private + + def process_llm(_url) + url = 'https://www.4tu.nl/en/agenda/' + event_page = Nokogiri::HTML5.parse(open_url(url, raise: true)).css('.searchresults')[0].css('a.searchresult') + event_page.each do |event_data| + new_url = event_data['href'] + sleep(1) unless Rails.env.test? 
and File.exist?('test/vcr_cassettes/ingestors/4tu_llm.yml') + new_event_page = Nokogiri::HTML5.parse(open_url(new_url, raise: true)).css('body').css('main, .page-header__content') + get_event_from_css(new_url, new_event_page) + end + end + end +end diff --git a/lib/ingestors/ingestor.rb b/lib/ingestors/ingestor.rb index 90f9dc2f3..5cf4d7554 100644 --- a/lib/ingestors/ingestor.rb +++ b/lib/ingestors/ingestor.rb @@ -43,7 +43,7 @@ def write(user, provider) def stats_summary(type) summary = "\n### #{type.to_s.titleize}\n\n" - [:processed, :added, :updated, :rejected].each do |key| + %i[processed added updated rejected].each do |key| summary += " - #{key.to_s.titleize}: #{stats[type][key]}\n" end @@ -64,19 +64,17 @@ def open_url(url, raise: false) retry if (redirect_attempts -= 1) > 0 raise e rescue OpenURI::HTTPError => e - if raise - raise e - else - @messages << "Couldn't open URL #{url}: #{e}" - nil - end + raise e if raise + + @messages << "Couldn't open URL #{url}: #{e}" + nil end end def convert_description(input) return input if input.nil? - if input.match?(/<(li|p|b|i|ul|div|br|strong|em|h1)\/?>/) + if input.match?(%r{<(li|p|b|i|ul|div|br|strong|em|h1)/?>}) ReverseMarkdown.convert(input, tag_border: '').strip else input @@ -85,14 +83,15 @@ def convert_description(input) def convert_title(input) return input if input.nil? 
+ CGI.unescapeHTML(input) end def get_json_response(url, accept_params = 'application/json', **kwargs) response = RestClient::Request.new({ method: :get, - url: CGI.unescape_html(url), - verify_ssl: false, - headers: { accept: accept_params } }.merge(kwargs)).execute + url: CGI.unescape_html(url), + verify_ssl: false, + headers: { accept: accept_params } }.merge(kwargs)).execute # check response raise "invalid response code: #{response.code}" unless response.code == 200 diff --git a/lib/ingestors/ingestor_factory.rb b/lib/ingestors/ingestor_factory.rb index 80982d116..900bdbdd2 100644 --- a/lib/ingestors/ingestor_factory.rb +++ b/lib/ingestors/ingestor_factory.rb @@ -30,6 +30,12 @@ def self.ingestors Ingestors::OsciIngestor, Ingestors::DccIngestor, Ingestors::SenseIngestor + ] + llm_ingestors + end + + def self.llm_ingestors + [ + Ingestors::FourtuLlmIngestor ] end @@ -41,11 +47,9 @@ def self.ingestor_config def self.get_ingestor(method) config = ingestor_config[method] - if config - config[:ingestor].new - else - raise "Invalid method: [#{method}]" - end + raise "Invalid method: [#{method}]" unless config + + config[:ingestor].new end def self.valid_ingestor?(method) diff --git a/lib/ingestors/llm_ingestor.rb b/lib/ingestors/llm_ingestor.rb new file mode 100644 index 000000000..5f7ebaae5 --- /dev/null +++ b/lib/ingestors/llm_ingestor.rb @@ -0,0 +1,62 @@ +require 'open-uri' +require 'csv' +require 'nokogiri' + +module Ingestors + class LlmIngestor < Ingestor + def self.config + { + key: 'llm_event', + title: 'LLM Events API', + category: :events + } + end + + def read(url) + begin + process_llm(url) + rescue StandardError => e + @messages << "#{self.class.name} failed with: #{e.message}" + end + + # finished + nil + end + + private + + include Llm + + def process_llm(_url) + puts 'please provide child class' + end + + def get_event_from_css(url, event_page) # rubocop:disable Metrics + llm_service_class = 
Llm.service_hash.fetch(TeSS::Config.llm_scraper['model'].to_sym, nil) + return unless llm_service_class + + event_page.css('script, link').each { |node| node.remove } + event_page = event_page.text.squeeze(" \n").squeeze("\n").squeeze("\t").squeeze(' ') + begin + llm_service = llm_service_class.new + event = OpenStruct.new + event = llm_service.scrape_func(event, event_page) + event = llm_service.post_process_func(event) + event.url = url + event.source = 'LLM' + event.timezone = 'Amsterdam' + a = Time.parse(event.start) + event.start = Time.new(a.year, a.month, a.day, a.hour, a.min, a.sec, '+00:00') + a = Time.parse(event.end) + event.end = Time.new(a.year, a.month, a.day, a.hour, a.min, a.sec, '+00:00') + event.set_default_times + event.nonsense_attr = 'nonsense' + event = OpenStruct.new(event.to_h.select { |key, _| (Event.attribute_names + [:online]).map(&:to_sym).include?(key) }) + add_event(event) + rescue StandardError => e + puts e + @messages << "Extract event fields failed with: #{e.message}" + end + end + end +end diff --git a/lib/ingestors/osci_ingestor.rb b/lib/ingestors/osci_ingestor.rb index dc65830d5..c9e2cd076 100644 --- a/lib/ingestors/osci_ingestor.rb +++ b/lib/ingestors/osci_ingestor.rb @@ -37,8 +37,8 @@ def process_osci(url) event_page.each do |event_data| next if event_data.get_attribute('class').include?('no-events') - beep = event_data.css("div[id*=calendar-my-calendar]") - beep.each do |boop| + event_cal = event_data.css("div[id*=calendar-my-calendar]") + event_cal.each do |boop| event = OpenStruct.new el = boop.css("h3[class='event-title summary']")[0] url_str = el.css("a")[0].get_attribute('href') diff --git a/lib/llm.rb b/lib/llm.rb new file mode 100644 index 000000000..260cf47bb --- /dev/null +++ b/lib/llm.rb @@ -0,0 +1,34 @@ +# frozen_string_literal: true + +# Module for LLM based scraping and post processing +module Llm + def self.service_hash + { + chatgpt: Llm::ChatgptService, + willma: Llm::WillmaService + } + end + + def 
self.post_processing_task + llm_service_class = Llm.service_hash.fetch(TeSS::Config.llm_scraper['model']&.to_sym, nil) + return unless llm_service_class + + prompt = File.read(File.join(Rails.root, 'lib', 'llm', 'llm_prompts', 'llm_process_prompt.txt')) + filtered_event_list(prompt).each do |event| + llm_service = llm_service_class.new + event = llm_service.post_process_func(event) + event.save! + end + end + + def self.filtered_event_list(prompt) + Event.not_finished.needs_processing(prompt) + end + + def self.reset_llm_status_task + Event.not_finished.each do |event| + event&.llm_interaction&.needs_processing = true + event.save! + end + end +end diff --git a/lib/llm/chatgpt_service.rb b/lib/llm/chatgpt_service.rb new file mode 100644 index 000000000..88fdc2ff9 --- /dev/null +++ b/lib/llm/chatgpt_service.rb @@ -0,0 +1,32 @@ +# frozen_string_literal: true + +# Module for LLM based scraping and post processing +module Llm + # ChatGPT based LLM scraping and post processing + class ChatgptService < Service + require 'openai' + def initialize + api_key = Rails.application.secrets&.gpt_api_key + @client = OpenAI::Client.new(access_token: api_key) + @params = { + # max_tokens: 50, + # model: 'gpt-3.5-turbo-1106', + model: TeSS::Config.llm_scraper['model_version'], + temperature: 0.7 + } + end + + def run(content) + call(content).dig('choices', 0, 'message', 'content') + end + + def call(prompt) + params = @params.merge( + { + messages: [{ role: 'user', content: prompt }] + } + ) + @client.chat(parameters: params) + end + end +end diff --git a/lib/llm/llm_prompts/llm_process_prompt.txt b/lib/llm/llm_prompts/llm_process_prompt.txt new file mode 100644 index 000000000..4bff9baab --- /dev/null +++ b/lib/llm/llm_prompts/llm_process_prompt.txt @@ -0,0 +1,45 @@ +Based on the following event: +*replace_with_event* + +and the following curation criteria: +Is this event useful for anyone rather than only people from a specific institution? +Is this event centered around research? 
+ +Give me a valid json string describing a research themed event with the following format. + +keywords (array of strings): A set of keywords based which the event can be filtered. +target_audience (array of strings): The target audience for this event. +open_science (array of strings): The type of open science that this event advocates for, if any. +visible (bool): Whether or not the event is relevant according to the curation criteria + +Make sure the last json attribute is not followed by a comma. +Make sure the full json fits inside the response. +Strictly adhere to the provided options if applicable without introducing new categories or combinations of words. +Fill the keywords attributes with with ['domain agnostic'] if all options are relevant. +If a specified option is not applicable or missing, fill it with null. +The options are defined below between quotation marks. Options are separated by commas. + +keywords options: +[ + 'natural & engineering sciences', + 'humanities & social sciences', + 'life sciences' +] + +target_audience options: +[ + 'researchers', + 'research support staff', + 'bachelor & master students', + 'PhD candidates', + 'teaching staff', + 'other' +] + +open_science options: +[ + 'open software', + 'FAIR data', + 'Open Access, + 'citizen science' +] diff --git a/lib/llm/llm_prompts/llm_scrape_prompt.txt b/lib/llm/llm_prompts/llm_scrape_prompt.txt new file mode 100644 index 000000000..12c67a15f --- /dev/null +++ b/lib/llm/llm_prompts/llm_scrape_prompt.txt @@ -0,0 +1,16 @@ +Based on the following webpage describing a research event: +*replace_with_event_page* + +Give me a valid json string describing a research themed event with the following format: + +title (string): The title of the event +organizer (string): The organisation that hosts the event +description (string): A description of what will happen at the event +start (date or datetime): The local starting time of the event +end (date or datetime): The local end time of the event 
+venue (string): The venue where the event is hosted + +Instead of generating values for missing attributes, fill them with null. +Summarize the description in less than 400 characters please. +Make sure the last json attribute is not followed by a comma. +Make sure the full json fits inside the response. diff --git a/lib/llm/service.rb b/lib/llm/service.rb new file mode 100644 index 000000000..ed8f52793 --- /dev/null +++ b/lib/llm/service.rb @@ -0,0 +1,71 @@ +# frozen_string_literal: true + +# Module for LLM based scraping and post processing +module Llm + # Base class for LLM scraping and post processing + class Service + def initialize + raise NotImplementedError + end + + def llm_interaction_attributes + { + scrape_or_process: @scrape_or_process, + model: @params[:model], + prompt: @prompt, + input: @input, + output: @output, + needs_processing: false + } + end + + def unload_json(event, response) + response_json = JSON.parse(response) + response_json.each_key do |key| + event[key] = response_json[key] + end + event + end + + def scrape(event_page) + @scrape_or_process = 'scrape' + @prompt = File.read(File.join(Rails.root, 'lib', 'llm', 'llm_prompts', 'llm_scrape_prompt.txt')) + @input = event_page + content = @prompt.gsub('*replace_with_event_page*', event_page) + @output = run(content) + @output + end + + def process(event) + @scrape_or_process = 'process' + event_json = JSON.generate(event.to_json) + @prompt = File.read(File.join(Rails.root, 'lib', 'llm', 'llm_prompts', 'llm_process_prompt.txt')) + @input = event_json + content = @prompt.gsub('*replace_with_event*', event_json) + @output = run(content) + @output + end + + def scrape_func(event, event_page) + response = scrape(event_page) + event = unload_json(event, response) + event.llm_interaction_attributes = llm_interaction_attributes + event + end + + def post_process_func(event) + response = process(event) + event = unload_json(event, response) + event.llm_interaction_attributes = 
llm_interaction_attributes + event + end + + def run(_content) + raise NotImplementedError + end + + def call(_prompt) + raise NotImplementedError + end + end +end diff --git a/lib/llm/willma_service.rb b/lib/llm/willma_service.rb new file mode 100644 index 000000000..9334ed452 --- /dev/null +++ b/lib/llm/willma_service.rb @@ -0,0 +1,88 @@ +# frozen_string_literal: true + +# Module for LLM based scraping and post processing +module Llm + # Willma based LLM scraping and post processing + class WillmaService < Service + require 'openai' + def initialize + model_name = TeSS::Config.llm_scraper['model_version'] + model_url = 'https://willma.soil.surf.nl/api/models' + parsed_response = JSON.parse(do_request(model_url, 'get', {}).body) + model_id = parsed_response.select { |i| i['name'] == model_name }.first['id'] + @params = { + model: model_name, + sequence_id: model_id, + advanced_options: { "max_new_tokens": 4096 }, + temperature: 0 + # temperature: 0.7 + } + end + + def run(content) + msg = call(content)['message'] + get_first_json_from_string(msg) + end + + def call(prompt) + data = { + 'sequence_id': @params[:sequence_id], + 'input': prompt + } + query_url = 'https://willma.soil.surf.nl/api/query' + response = do_request(query_url, 'post', data) + JSON.parse(response.body) + end + + def do_request(url, mode, data = {}) + header = { + 'Content-Type': 'application/json', + 'X-API-KEY': Rails.application.secrets&.willma_api_key + } + + parsed_url = URI.parse(url) + http = Net::HTTP.new(parsed_url.host, parsed_url.port) + http.use_ssl = true + http.verify_mode = OpenSSL::SSL::VERIFY_PEER + http.read_timeout = 180 + + request = case mode + when 'post' + Net::HTTP::Post.new(parsed_url.path) + when 'get' + Net::HTTP::Get.new(parsed_url.path) + else + Net::HTTP::Post.new(parsed_url.path) + end + + header.each do |key, value| + request[key] = value + end + request.set_form_data(data) + request.body = data.to_json + request.content_type = 'application/json' + 
http.request(request) + end + + def get_first_json_from_string(msg) + char_dict = {} + char_dict['{'] = 0 + char_dict['}'] = 0 + start_end = [0, 0] + res = msg + msg.split('').each_with_index do |char, idx| + next unless '{}'.include?(char) + + char_dict[char] += 1 + if char == '{' && char_dict['{'] == 1 + start_end[0] = idx + elsif char == '}' && char_dict['{'] == char_dict['}'] + start_end[1] = idx + res = msg[start_end[0]..start_end[1]] + break + end + end + res + end + end +end diff --git a/lib/tasks/tess.rake b/lib/tasks/tess.rake index ed07ae875..a4d88fbbf 100644 --- a/lib/tasks/tess.rake +++ b/lib/tasks/tess.rake @@ -139,6 +139,16 @@ namespace :tess do puts "Finished successfully, output written to: #{log.path}" end + desc 'run LLM post processing' + task llm_post_processing: :environment do + Llm.post_processing_task + end + + desc 'open all events to being llm processed again' + task reset_llm_status: :environment do + Llm.reset_llm_status_task + end + desc 'mail content providers for curation of scraped events' task event_curation_mails: :environment do cut_off_time = Time.zone.now - 1.week diff --git a/test/fixtures/llm_interactions.yml b/test/fixtures/llm_interactions.yml new file mode 100644 index 000000000..532759293 --- /dev/null +++ b/test/fixtures/llm_interactions.yml @@ -0,0 +1,23 @@ +# Read about fixtures at http://api.rubyonrails.org/classes/ActiveRecord/FixtureSet.html + +scrape: + scrape_or_process: 'scrape' + model: 'Zephyr 7B' + prompt: 'Give JSON please' + input: 'Give JSON please' + output: 'Give JSON please' + needs_processing: false +different_prompt: + scrape_or_process: 'scrape' + model: 'Zephyr 7B' + prompt: 'different_prompt' + input: 'Give JSON please' + output: 'Give JSON please' + needs_processing: false +needs_processing: + scrape_or_process: 'scrape' + model: 'Zephyr 7B' + prompt: 'Give JSON please' + input: 'Give JSON please' + output: 'Give JSON please' + needs_processing: true \ No newline at end of file diff --git 
a/test/models/event_test.rb b/test/models/event_test.rb index f8eea7ea6..06d861609 100644 --- a/test/models/event_test.rb +++ b/test/models/event_test.rb @@ -509,13 +509,13 @@ class EventTest < ActiveSupport::TestCase event = Event.new( title: 'An event', timezone: 'UTC', - user: user, + user:, url: 'https://events.com/1', keywords: ['fun times'], nodes: [node], external_resources_attributes: { '0' => { title: 'test', url: 'https://external-resource.com' } }, materials: [material], - scientific_topic_names: ['Proteins', 'DNA'], + scientific_topic_names: %w[Proteins DNA], operation_names: ['Variant calling'] ) @@ -538,7 +538,7 @@ class EventTest < ActiveSupport::TestCase assert_nil dup.url assert_equal [material], dup.materials assert_equal [node], dup.nodes - assert_equal ['Proteins', 'DNA'], dup.scientific_topic_names + assert_equal %w[Proteins DNA], dup.scientific_topic_names assert_equal ['Variant calling'], dup.operation_names assert_equal 1, dup.external_resources.length assert_equal 'test', dup.external_resources.first.title @@ -685,4 +685,23 @@ class EventTest < ActiveSupport::TestCase assert_equal ['Fold recognition', 'Domain prediction', 'Fold prediction', 'Protein domain prediction', 'Protein fold prediction', 'Protein fold recognition'], @event.reload.operations_and_synonyms end + + test 'can add an llm_interaction to an event' do + e = events(:scraper_user_event) + l = llm_interactions(:scrape) + e.llm_interaction = l + e.save! + assert_equal e.llm_interaction.id, l.id + assert_equal l.event_id, e.id + end + + test 'can destroy an llm_interaction with an event' do + e = events(:scraper_user_event) + l = llm_interactions(:scrape) + e.llm_interaction = l + e.save! + assert_difference 'LlmInteraction.count', -1 do + e.destroy! 
+ end + end end diff --git a/test/test_helper.rb b/test/test_helper.rb index a8f4eba1f..cc5ccc7f2 100644 --- a/test/test_helper.rb +++ b/test/test_helper.rb @@ -5,9 +5,9 @@ SimpleCov::Formatter::LcovFormatter.config.report_with_single_file = true SimpleCov.formatters = SimpleCov::Formatter::MultiFormatter.new([ - SimpleCov::Formatter::HTMLFormatter, - SimpleCov::Formatter::LcovFormatter -]) + SimpleCov::Formatter::HTMLFormatter, + SimpleCov::Formatter::LcovFormatter + ]) SimpleCov.start do add_filter '.gems' @@ -17,7 +17,7 @@ end ENV['RAILS_ENV'] ||= 'test' -require File.expand_path('../../config/environment', __FILE__) +require File.expand_path('../config/environment', __dir__) require 'rails/test_help' require 'webmock/minitest' require 'minitest/mock' @@ -26,8 +26,11 @@ require_relative './schema_helper' WebMock.disable_net_connect!(allow_localhost: true, allow: 'api.codacy.com') -Minitest::Reporters.use! [Minitest::Reporters::DefaultReporter.new( - fast_fail: true, color: true, detailed_skip: false, slow_count: 10)] unless ENV['RM_INFO'] +unless ENV['RM_INFO'] + Minitest::Reporters.use! 
[Minitest::Reporters::DefaultReporter.new( + fast_fail: true, color: true, detailed_skip: false, slow_count: 10 + )] +end VCR.configure do |config| config.cassette_library_dir = 'test/vcr_cassettes' @@ -60,11 +63,11 @@ def with_settings(settings, overwrite = false, &block) orig_config = {} settings.each do |k, v| orig_config[k] = TeSS::Config[k] - if !overwrite && TeSS::Config[k].is_a?(Hash) && v.is_a?(Hash) - TeSS::Config[k] = v.with_indifferent_access.reverse_merge!(TeSS::Config[k]) - else - TeSS::Config[k] = v - end + TeSS::Config[k] = if !overwrite && TeSS::Config[k].is_a?(Hash) && v.is_a?(Hash) + v.with_indifferent_access.reverse_merge!(TeSS::Config[k]) + else + v + end end block.call ensure @@ -142,36 +145,59 @@ def freeze_time(time_or_year = Time.now, &block) end end + def mock_llm_requests + get_body = [ + { + "name": 'Zephyr 7B', + "id": 0 + } + ].to_json + post_body = '{ + "message": "Here is your JSON: + { + \"title\":\"4TU-meeting National Technology Strategy\", + \"start\":\"2024-07-03T12:30:00+02:00\", + \"end\":\"2024-07-03T19:00:00+02:00\", + \"venue\":\"Basecamp, Nijverheidsweg 16A, 3534 AM Utrecht (https://basecamputrecht.nl)\", + \"description\":\"My cool description\", + \"nonsense_attr\":\"My cool nonsense attribute\" + } + I am a dumb llm and I have to say something afterward even though I was specifically asked not to." 
+ }'.gsub(/\n/, '') + # run task + WebMock.stub_request(:get, 'https://willma.soil.surf.nl/api/models').to_return(status: 200, body: get_body) + WebMock.stub_request(:post, 'https://willma.soil.surf.nl/api/query').to_return(status: 200, body: post_body) + end + # Mock remote images so paperclip doesn't break: def mock_images - WebMock.stub_request(:any, /http\:\/\/example\.com\/(.+)\.png/).to_return( + WebMock.stub_request(:any, %r{http://example\.com/(.+)\.png}).to_return( status: 200, body: File.read(File.join(Rails.root, 'test/fixtures/files/image.png')), headers: { content_type: 'image/png' } ) - WebMock.stub_request(:any, "http://image.host/another_image.png").to_return( + WebMock.stub_request(:any, 'http://image.host/another_image.png').to_return( status: 200, body: File.read(File.join(Rails.root, 'test/fixtures/files/another_image.png')), headers: { content_type: 'image/png' } ) - WebMock.stub_request(:any, "http://malicious.host/image.png").to_return( + WebMock.stub_request(:any, 'http://malicious.host/image.png').to_return( status: 200, body: File.read(File.join(Rails.root, 'test/fixtures/files/bad.js')), headers: { content_type: 'image/png' } ) - WebMock.stub_request(:any, "http://text.host/text.txt").to_return( + WebMock.stub_request(:any, 'http://text.host/text.txt').to_return( status: 200, body: File.read(File.join(Rails.root, 'test/fixtures/files/text.txt')), headers: { content_type: 'text/plain' } ) - WebMock.stub_request(:any, "http://404.host/image.png").to_return(status: 404) - - WebMock.stub_request(:get, "https://bio.tools/api/tool?q=Training%20Material%20Example"). - with(:headers => { 'Accept' => '*/*', 'Accept-Encoding' => 'gzip;q=1.0,deflate;q=0.6,identity;q=0.3', 'User-Agent' => 'Ruby' }). - to_return(:status => 200, :body => "", :headers => {}) + WebMock.stub_request(:any, 'http://404.host/image.png').to_return(status: 404) - WebMock.stub_request(:get, "https://bio.tools/api/tool?q=Material%20with%20suggestions"). 
- with(:headers => { 'Accept' => '*/*', 'Accept-Encoding' => 'gzip;q=1.0,deflate;q=0.6,identity;q=0.3', 'User-Agent' => 'Ruby' }). - to_return(:status => 200, :body => "", :headers => {}) + WebMock.stub_request(:get, 'https://bio.tools/api/tool?q=Training%20Material%20Example') + .with(headers: { 'Accept' => '*/*', 'Accept-Encoding' => 'gzip;q=1.0,deflate;q=0.6,identity;q=0.3', 'User-Agent' => 'Ruby' }) + .to_return(status: 200, body: '', headers: {}) + WebMock.stub_request(:get, 'https://bio.tools/api/tool?q=Material%20with%20suggestions') + .with(headers: { 'Accept' => '*/*', 'Accept-Encoding' => 'gzip;q=1.0,deflate;q=0.6,identity;q=0.3', 'User-Agent' => 'Ruby' }) + .to_return(status: 200, body: '', headers: {}) end def mock_orcids @@ -223,7 +249,7 @@ def mock_ingestions { url: 'https://www.eventbriteapi.com/v3/organizations/34338661734', status: 404 }].each do |opts| url = opts.delete(:url) method = opts.delete(:method) || :get - opts[:body] = File.open(Rails.root.join( 'test', 'fixtures', 'files', 'ingestion', opts.delete(:filename))) if opts.key?(:filename) + opts[:body] = File.open(Rails.root.join('test', 'fixtures', 'files', 'ingestion', opts.delete(:filename))) if opts.key?(:filename) opts[:status] ||= 200 opts[:headers] ||= {} @@ -233,22 +259,20 @@ def mock_ingestions def mock_biotools biotools_file = File.read("#{Rails.root}/test/fixtures/files/annotation.json") - WebMock.stub_request(:get, /data.bioontology.org/). 
- to_return(:status => 200, :headers => {}, :body => biotools_file) + WebMock.stub_request(:get, /data.bioontology.org/).to_return(status: 200, headers: {}, body: biotools_file) end def mock_nominatim - nominatim_file = File.read(File.join(Rails.root, ['test', 'fixtures','files', 'nominatim.json'] )) - kensington_file = File.read(File.join(Rails.root,['test', 'fixtures', 'files', 'geocode_kensington.json'] )) + nominatim_file = File.read(File.join(Rails.root, ['test', 'fixtures', 'files', 'nominatim.json'])) + kensington_file = File.read(File.join(Rails.root, ['test', 'fixtures', 'files', 'geocode_kensington.json'])) - WebMock.stub_request(:get, /nominatim.openstreetmap.org/). - to_return(:status => 200, :headers => {}, :body => nominatim_file) + WebMock.stub_request(:get, /nominatim.openstreetmap.org/).to_return(status: 200, headers: {}, body: nominatim_file) # geocoder overrides Geocoder.configure(lookup: :test, ip_lookup: :test) - Geocoder::Lookup::Test.add_stub( "1 Bryce Avenue, Kensington, Western Australia, 6151, Australia", JSON.parse(kensington_file) ) - Geocoder::Lookup::Test.add_stub( "Pawsey Supercomputing Centre, 1 Bryce Avenue, Kensington, Western Australia, 6151, Australia", [] ) - Geocoder::Lookup::Test.add_stub( "Australia", [{ "address"=>{ "country"=>"Australia", "country_code"=>"au"} }] ) + Geocoder::Lookup::Test.add_stub('1 Bryce Avenue, Kensington, Western Australia, 6151, Australia', JSON.parse(kensington_file)) + Geocoder::Lookup::Test.add_stub('Pawsey Supercomputing Centre, 1 Bryce Avenue, Kensington, Western Australia, 6151, Australia', []) + Geocoder::Lookup::Test.add_stub('Australia', [{ 'address' => { 'country' => 'Australia', 'country_code' => 'au' } }]) end def assert_permitted(policy, user, action, *opts) @@ -260,7 +284,7 @@ def refute_permitted(*args) end def mock_timezone(tz = ActiveSupport::TimeZone.all.sample.tzinfo.identifier) - @_prev_tz = ENV['TZ'] # Time zone should not affect test result + @_prev_tz = ENV['TZ'] # Time zone 
should not affect test result ENV['TZ'] = tz end @@ -330,8 +354,10 @@ def mock_facets if @collection.any? c = @collection.first.class f = c.facet_fields.map do |ff| - { field_name: ff.to_sym, rows: (1 + rand(4)).times.map { { value: facet_value(ff), - count: (1 + rand(4)) } } } + { field_name: ff.to_sym, rows: rand(1..4).times.map do + { value: facet_value(ff), + count: rand(1..4) } + end } end JSON.parse(f.to_json, object_class: OpenStruct) else @@ -343,17 +369,16 @@ def mock_facets # Minitest's `stub` method but ignores any blocks class Object - - def blockless_stub name, val_or_callable, *block_args + def blockless_stub name, val_or_callable, *_block_args new_name = "__minitest_stub__#{name}" metaclass = class << self - self; + self end - if respond_to? name and not methods.map(&:to_s).include? name.to_s then + if respond_to? name and !methods.map(&:to_s).include? name.to_s metaclass.send :define_method, name do |*args| super(*args) end @@ -362,7 +387,7 @@ class << self metaclass.send :alias_method, new_name, name metaclass.send :define_method, name do |*args| - ret = if val_or_callable.respond_to? :call then + ret = if val_or_callable.respond_to? 
:call val_or_callable.call(*args) else val_or_callable diff --git a/test/unit/ingestors/4tu_gpt_llm_ingestor_test.rb b/test/unit/ingestors/4tu_gpt_llm_ingestor_test.rb new file mode 100644 index 000000000..1659771e3 --- /dev/null +++ b/test/unit/ingestors/4tu_gpt_llm_ingestor_test.rb @@ -0,0 +1,73 @@ +require 'test_helper' +require 'minitest/autorun' + +class FourtuGptLlmIngestorTest < ActiveSupport::TestCase + setup do + @user = users(:regular_user) + @content_provider = content_providers(:another_portal_provider) + mock_ingestions + mock_timezone # System time zone should not affect test result + end + + teardown do + reset_timezone + end + + test 'can ingest events from 4tu' do + source = @content_provider.sources.build( + url: 'https://www.4tu.nl/en/agenda/', + method: '4tu', + enabled: true + ) + + ingestor = Ingestors::FourtuLlmIngestor.new + + # check event doesn't + new_title = '4TU-meeting National Technology Strategy' + refute Event.where(title: new_title).any? + + run_res = '{ + "title":"4TU-meeting National Technology Strategy", + "start":"2024-07-03T12:30:00+02:00", + "end":"2024-07-03T19:00:00+02:00", + "venue":"Basecamp, Nijverheidsweg 16A, 3534 AM Utrecht (https://basecamputrecht.nl)", + "description":"My cool description", + "nonsense_attr":"My cool nonsense attribute" + }'.gsub(/\n/, '') + mock_client = Minitest::Mock.new + 8.times do + mock_client.expect(:chat, { 'choices' => { 0 => { 'message' => { 'content' => run_res } } } }, parameters: Object) + end + # run task + assert_difference 'Event.count', 1 do + freeze_time(2019) do + VCR.use_cassette('ingestors/4tu_gpt_llm') do + OpenAI::Client.stub(:new, mock_client) do + with_settings({ llm_scraper: { model: 'chatgpt', model_version: 'GPT-3.5' } }) do + ingestor.read(source.url) + ingestor.write(@user, @content_provider) + end + end + end + end + end + + assert_equal 4, ingestor.events.count + assert ingestor.materials.empty? 
+ assert_equal 1, ingestor.stats[:events][:added] + assert_equal 3, ingestor.stats[:events][:updated] + assert_equal 0, ingestor.stats[:events][:rejected] + + # check event does exist + event = Event.where(title: new_title).first + assert event + assert_equal new_title, event.title + + # check other fields + assert_equal Time.zone.parse('Wed, 3 Jul 2024 12:30:00.000000000 UTC +00:00'), event.start + assert_equal Time.zone.parse('Wed, 3 Jul 2024 19:00:00.000000000 UTC +00:00'), event.end + assert_equal 'Amsterdam', event.timezone + assert_equal 'Basecamp, Nijverheidsweg 16A, 3534 AM Utrecht (https://basecamputrecht.nl)', event.venue + assert_equal 'LLM', event.source + end +end diff --git a/test/unit/ingestors/4tu_willma_llm_ingestor_test.rb b/test/unit/ingestors/4tu_willma_llm_ingestor_test.rb new file mode 100644 index 000000000..5395d89f6 --- /dev/null +++ b/test/unit/ingestors/4tu_willma_llm_ingestor_test.rb @@ -0,0 +1,59 @@ +require 'test_helper' + +class FourtuWillmaLlmIngestorTest < ActiveSupport::TestCase + setup do + @user = users(:regular_user) + @content_provider = content_providers(:another_portal_provider) + mock_ingestions + mock_timezone # System time zone should not affect test result + end + + teardown do + reset_timezone + end + + test 'can ingest events from 4tu' do + source = @content_provider.sources.build( + url: 'https://www.4tu.nl/en/agenda/', + method: '4tu', + enabled: true + ) + + ingestor = Ingestors::FourtuLlmIngestor.new + + # check event doesn't + new_title = '4TU-meeting National Technology Strategy' + refute Event.where(title: new_title).any? + + mock_llm_requests + # run task + assert_difference 'Event.count', 1 do + freeze_time(2019) do + VCR.use_cassette('ingestors/4tu_llm') do + with_settings({ llm_scraper: { model: 'willma', model_version: 'Zephyr 7B' } }) do + ingestor.read(source.url) + ingestor.write(@user, @content_provider) + end + end + end + end + + assert_equal 4, ingestor.events.count + assert ingestor.materials.empty? 
+ assert_equal 1, ingestor.stats[:events][:added] + assert_equal 3, ingestor.stats[:events][:updated] + assert_equal 0, ingestor.stats[:events][:rejected] + + # check event does exist + event = Event.where(title: new_title).first + assert event + assert_equal new_title, event.title + + # check other fields + assert_equal Time.zone.parse('Wed, 3 Jul 2024 12:30:00.000000000 UTC +00:00'), event.start + assert_equal Time.zone.parse('Wed, 3 Jul 2024 19:00:00.000000000 UTC +00:00'), event.end + assert_equal 'Amsterdam', event.timezone + assert_equal 'Basecamp, Nijverheidsweg 16A, 3534 AM Utrecht (https://basecamputrecht.nl)', event.venue + assert_equal 'LLM', event.source + end +end diff --git a/test/unit/llm_service_test.rb b/test/unit/llm_service_test.rb new file mode 100644 index 000000000..e2adc263e --- /dev/null +++ b/test/unit/llm_service_test.rb @@ -0,0 +1,83 @@ +require 'test_helper' +require 'minitest/autorun' + +class LlmServiceTest < ActiveSupport::TestCase + def setup + mock_llm_requests + end + + test 'service_hash_contains_all_subclasses' do + hash_set = (Llm.service_hash.values + [Llm::Service]).map { |c| c.name.split('::').last.to_sym }.to_set + classes_set = Llm.constants.filter { |c| Llm.const_get(c).is_a?(Class) }.to_set + assert hash_set == classes_set + end + + test 'no_post_processing unless model provided' do + with_settings({ llm_scraper: { model: 'willma' } }) do + mock = Minitest::Mock.new + Event.stub :not_finished, mock do + mock.expect :needs_processing, [], [String] + Llm.post_processing_task + end + mock.verify + end + + with_settings({ llm_scraper: { model: nil } }) do + mock = Minitest::Mock.new + Event.stub :not_finished, mock do + mock.expect :needs_processing, [] + Llm.post_processing_task + end + assert_raises(MockExpectationError) { mock.verify } + end + end + + test 'NotImplementedError on parent class initialization' do + assert_raises(NotImplementedError) { Llm::Service.new } + end + + test 'check event filtering in 
post_processing' do + u = users(:scraper_user) + event1 = Event.create!(title: 'needs_processing', start: Time.zone.now - 5.hours, end: Time.zone.now + 5.hours, url: 'https://www.google.com#1', user_id: u.id) + event1.llm_interaction = llm_interactions(:needs_processing) + event1.save! + event2 = Event.create!(title: 'different_prompt', start: Time.zone.now - 5.hours, end: Time.zone.now + 5.hours, url: 'https://www.google.com#2', user_id: u.id) + event2.llm_interaction = llm_interactions(:different_prompt) + event2.save! + event3 = Event.create!(title: 'finished', start: Time.zone.now - 5.hours, end: Time.zone.now - 4.hours, url: 'https://www.google.com#3', user_id: u.id) + event3.llm_interaction = llm_interactions(:scrape) + event3.save! + result = Llm.filtered_event_list(event1.llm_interaction.prompt) + assert result.map(&:title) == %w[needs_processing different_prompt] + end + + test 'test willma scrape' do + with_settings({ llm_scraper: { model: 'willma', model_version: 'Zephyr 7B' } }) do + result = Llm::WillmaService.new.scrape('my_html') # response body comes from the mock_llm_requests stub + assert_equal '4TU-meeting National Technology Strategy', result['title'] + assert_equal 'Basecamp, Nijverheidsweg 16A, 3534 AM Utrecht (https://basecamputrecht.nl)', result['venue'] + assert_equal 'My cool description', result['description'] + end + end + + test 'test gpt scrape' do + run_res = '{ + "title":"my_title", + "start":"2024-07-03T12:30:00+02:00", + "end":"2024-07-03T19:00:00+02:00", + "venue":"my_venue", + "description":"my_description" + }'.gsub(/\n/, '') + mock_client = Minitest::Mock.new + mock_client.expect(:chat, { 'choices' => { 0 => { 'message' => { 'content' => run_res } } } }, parameters: Object) + # run task + OpenAI::Client.stub(:new, mock_client) do + with_settings({ llm_scraper: { model: 'chatgpt', model_version: 'GPT-3.5' } }) do + result = Llm::ChatgptService.new.scrape('my_html') + assert_equal 'my_title', result['title'] # compare, do not assign: `assert x = y` is always truthy + assert_equal 'my_venue', result['venue'] + assert_equal 'my_description', result['description'] + end + end + end +end diff --git a/test/vcr_cassettes/ingestors/4tu_gpt_llm.yml 
b/test/vcr_cassettes/ingestors/4tu_gpt_llm.yml new file mode 100644 index 000000000..828e53abd --- /dev/null +++ b/test/vcr_cassettes/ingestors/4tu_gpt_llm.yml @@ -0,0 +1,278 @@ +--- +http_interactions: +- request: + method: get + uri: https://www.4tu.nl/en/agenda/ + body: + encoding: US-ASCII + string: '' + headers: + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 200 + message: OK + headers: + Server: + - nginx + Date: + - Thu, 06 Jun 2024 07:51:56 GMT + Content-Type: + - text/html; charset=UTF-8 + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Vary: + - Accept-Encoding + Strict-Transport-Security: + - max-age=31536000 + X-Frame-Options: + - DENY + X-Xss-Protection: + - 1; mode=block + X-Content-Type-Options: + - nosniff + Referrer-Policy: + - strict-origin-when-cross-origin + Feature-Policy: + - geolocation 'none'; camera 'none'; payment 'none'; microphone 'none'; + Content-Security-Policy: + - script-src 'self' https://platform.twitter.com https://cdn.syndication.twimg.com + https://www.youtube.com https://player.vimeo.com 'sha256-B34d9UvaEWVUEqLK1XO7bmIC0GTuglVf9avDg11NcSc=' + 'sha256-P8GHTBinuD+aYtH+iNVPhJcdl3xVDaG/Y3fcqecF83w=' https://www.google-analytics.com + https://www.googletagmanager.com https://embed.podcasts.apple.com 'sha256-zN1rNu6bv+5uQuqzW39H8OOBooeKevi2fT089esapTo='; + frame-ancestors 'self' https://4tu.webhare.net + Cache-Control: + - no-cache + body: + encoding: ASCII-8BIT + string: !binary |- +  + recorded_at: Wed, 02 Jan 2019 16:00:00 GMT +- request: + method: get + uri: https://www.4tu.nl/en/agenda/summer-event-2024/ + body: + encoding: US-ASCII + string: '' + headers: + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 200 + message: OK + headers: + Server: + - nginx + Date: + - Thu, 06 Jun 2024 07:51:56 GMT + Content-Type: + - text/html; charset=UTF-8 + 
Content-Length: + - '34546' + Connection: + - keep-alive + Strict-Transport-Security: + - max-age=31536000 + X-Frame-Options: + - DENY + X-Xss-Protection: + - 1; mode=block + X-Content-Type-Options: + - nosniff + Referrer-Policy: + - strict-origin-when-cross-origin + Feature-Policy: + - geolocation 'none'; camera 'none'; payment 'none'; microphone 'none'; + Content-Security-Policy: + - script-src 'self' https://platform.twitter.com https://cdn.syndication.twimg.com + https://www.youtube.com https://player.vimeo.com 'sha256-B34d9UvaEWVUEqLK1XO7bmIC0GTuglVf9avDg11NcSc=' + 'sha256-P8GHTBinuD+aYtH+iNVPhJcdl3xVDaG/Y3fcqecF83w=' https://www.google-analytics.com + https://www.googletagmanager.com https://embed.podcasts.apple.com 'sha256-zN1rNu6bv+5uQuqzW39H8OOBooeKevi2fT089esapTo='; + frame-ancestors 'self' https://4tu.webhare.net + Last-Modified: + - Tue, 21 May 2024 20:39:53 GMT + Vary: + - Accept-Encoding + body: + encoding: ASCII-8BIT + string: !binary |- +  + recorded_at: Wed, 02 Jan 2019 16:00:00 GMT +- request: + method: get + uri: https://www.4tu.nl/en/agenda/2024-6-18-webinar-mathematics/ + body: + encoding: US-ASCII + string: '' + headers: + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 200 + message: OK + headers: + Server: + - nginx + Date: + - Thu, 06 Jun 2024 07:51:56 GMT + Content-Type: + - text/html; charset=UTF-8 + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Vary: + - Accept-Encoding + Strict-Transport-Security: + - max-age=31536000 + X-Frame-Options: + - DENY + X-Xss-Protection: + - 1; mode=block + X-Content-Type-Options: + - nosniff + Referrer-Policy: + - strict-origin-when-cross-origin + Feature-Policy: + - geolocation 'none'; camera 'none'; payment 'none'; microphone 'none'; + Content-Security-Policy: + - script-src 'self' https://platform.twitter.com https://cdn.syndication.twimg.com + https://www.youtube.com https://player.vimeo.com 
'sha256-B34d9UvaEWVUEqLK1XO7bmIC0GTuglVf9avDg11NcSc=' + 'sha256-P8GHTBinuD+aYtH+iNVPhJcdl3xVDaG/Y3fcqecF83w=' https://www.google-analytics.com + https://www.googletagmanager.com https://embed.podcasts.apple.com 'sha256-zN1rNu6bv+5uQuqzW39H8OOBooeKevi2fT089esapTo='; + frame-ancestors 'self' https://4tu.webhare.net + Cache-Control: + - no-cache + body: + encoding: ASCII-8BIT + string: !binary |- +  + recorded_at: Wed, 02 Jan 2019 16:00:00 GMT +- request: + method: get + uri: https://www.4tu.nl/en/agenda/4TU-meeting%20National%20Technology%20Strategy/ + body: + encoding: US-ASCII + string: '' + headers: + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 200 + message: OK + headers: + Server: + - nginx + Date: + - Thu, 06 Jun 2024 07:51:56 GMT + Content-Type: + - text/html; charset=UTF-8 + Content-Length: + - '35025' + Connection: + - keep-alive + Strict-Transport-Security: + - max-age=31536000 + X-Frame-Options: + - DENY + X-Xss-Protection: + - 1; mode=block + X-Content-Type-Options: + - nosniff + Referrer-Policy: + - strict-origin-when-cross-origin + Feature-Policy: + - geolocation 'none'; camera 'none'; payment 'none'; microphone 'none'; + Content-Security-Policy: + - script-src 'self' https://platform.twitter.com https://cdn.syndication.twimg.com + https://www.youtube.com https://player.vimeo.com 'sha256-B34d9UvaEWVUEqLK1XO7bmIC0GTuglVf9avDg11NcSc=' + 'sha256-P8GHTBinuD+aYtH+iNVPhJcdl3xVDaG/Y3fcqecF83w=' https://www.google-analytics.com + https://www.googletagmanager.com https://embed.podcasts.apple.com 'sha256-zN1rNu6bv+5uQuqzW39H8OOBooeKevi2fT089esapTo='; + frame-ancestors 'self' https://4tu.webhare.net + Last-Modified: + - Wed, 05 Jun 2024 08:57:55 GMT + Vary: + - Accept-Encoding + body: + encoding: ASCII-8BIT + string: !binary |- +  + recorded_at: Wed, 02 Jan 2019 16:00:00 GMT +- request: + method: get + uri: 
https://www.4tu.nl/en/agenda/2024-10-09-room-for-everyones-educational-talent/ + body: + encoding: US-ASCII + string: '' + headers: + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 200 + message: OK + headers: + Server: + - nginx + Date: + - Thu, 06 Jun 2024 07:51:57 GMT + Content-Type: + - text/html; charset=UTF-8 + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Vary: + - Accept-Encoding + Strict-Transport-Security: + - max-age=31536000 + X-Frame-Options: + - DENY + X-Xss-Protection: + - 1; mode=block + X-Content-Type-Options: + - nosniff + Referrer-Policy: + - strict-origin-when-cross-origin + Feature-Policy: + - geolocation 'none'; camera 'none'; payment 'none'; microphone 'none'; + Content-Security-Policy: + - script-src 'self' https://platform.twitter.com https://cdn.syndication.twimg.com + https://www.youtube.com https://player.vimeo.com 'sha256-B34d9UvaEWVUEqLK1XO7bmIC0GTuglVf9avDg11NcSc=' + 'sha256-P8GHTBinuD+aYtH+iNVPhJcdl3xVDaG/Y3fcqecF83w=' https://www.google-analytics.com + https://www.googletagmanager.com https://embed.podcasts.apple.com 'sha256-zN1rNu6bv+5uQuqzW39H8OOBooeKevi2fT089esapTo='; + frame-ancestors 'self' https://4tu.webhare.net + Cache-Control: + - no-cache + body: + encoding: ASCII-8BIT + string: !binary |- +  + recorded_at: Wed, 02 Jan 2019 16:00:00 GMT +recorded_with: VCR 6.2.0 diff --git a/test/vcr_cassettes/ingestors/4tu_llm.yml b/test/vcr_cassettes/ingestors/4tu_llm.yml new file mode 100644 index 000000000..94c89b80f --- /dev/null +++ b/test/vcr_cassettes/ingestors/4tu_llm.yml @@ -0,0 +1,670 @@ +--- +http_interactions: +- request: + method: get + uri: https://www.4tu.nl/en/agenda/ + body: + encoding: US-ASCII + string: '' + headers: + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 200 + message: OK + headers: + Server: + - nginx + Date: + - Wed, 05 
Jun 2024 14:16:15 GMT + Content-Type: + - text/html; charset=UTF-8 + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Vary: + - Accept-Encoding + Strict-Transport-Security: + - max-age=31536000 + X-Frame-Options: + - DENY + X-Xss-Protection: + - 1; mode=block + X-Content-Type-Options: + - nosniff + Referrer-Policy: + - strict-origin-when-cross-origin + Feature-Policy: + - geolocation 'none'; camera 'none'; payment 'none'; microphone 'none'; + Content-Security-Policy: + - script-src 'self' https://platform.twitter.com https://cdn.syndication.twimg.com + https://www.youtube.com https://player.vimeo.com 'sha256-B34d9UvaEWVUEqLK1XO7bmIC0GTuglVf9avDg11NcSc=' + 'sha256-P8GHTBinuD+aYtH+iNVPhJcdl3xVDaG/Y3fcqecF83w=' https://www.google-analytics.com + https://www.googletagmanager.com https://embed.podcasts.apple.com 'sha256-zN1rNu6bv+5uQuqzW39H8OOBooeKevi2fT089esapTo='; + frame-ancestors 'self' https://4tu.webhare.net + Cache-Control: + - no-cache + body: + encoding: ASCII-8BIT + string: !binary |- +  + recorded_at: Wed, 02 Jan 2019 19:00:00 GMT +- request: + method: get + uri: https://www.4tu.nl/en/agenda/summer-event-2024/ + body: + encoding: US-ASCII + string: '' + headers: + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 200 + message: OK + headers: + Server: + - nginx + Date: + - Wed, 05 Jun 2024 14:16:16 GMT + Content-Type: + - text/html; charset=UTF-8 + Content-Length: + - '34546' + Connection: + - keep-alive + Strict-Transport-Security: + - max-age=31536000 + X-Frame-Options: + - DENY + X-Xss-Protection: + - 1; mode=block + X-Content-Type-Options: + - nosniff + Referrer-Policy: + - strict-origin-when-cross-origin + Feature-Policy: + - geolocation 'none'; camera 'none'; payment 'none'; microphone 'none'; + Content-Security-Policy: + - script-src 'self' https://platform.twitter.com https://cdn.syndication.twimg.com + https://www.youtube.com https://player.vimeo.com 
'sha256-B34d9UvaEWVUEqLK1XO7bmIC0GTuglVf9avDg11NcSc=' + 'sha256-P8GHTBinuD+aYtH+iNVPhJcdl3xVDaG/Y3fcqecF83w=' https://www.google-analytics.com + https://www.googletagmanager.com https://embed.podcasts.apple.com 'sha256-zN1rNu6bv+5uQuqzW39H8OOBooeKevi2fT089esapTo='; + frame-ancestors 'self' https://4tu.webhare.net + Last-Modified: + - Tue, 21 May 2024 20:39:53 GMT + Vary: + - Accept-Encoding + body: + encoding: ASCII-8BIT + string: !binary |- +  + recorded_at: Wed, 02 Jan 2019 19:00:00 GMT +- request: + method: get + uri: https://willma.soil.surf.nl/api/models + body: + encoding: UTF-8 + string: "{}" + headers: + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + Content-Type: + - application/json + X-Api-Key: + - 77696c6c6d61-c5f9443d-e1eb-469a-a9ba-d7c9c733ebb3 + response: + status: + code: 200 + message: OK + headers: + Date: + - Wed, 05 Jun 2024 14:16:16 GMT + Server: + - gunicorn + Strict-Transport-Security: + - max-age=31556952 + Content-Security-Policy: + - 'default-src ''self''; style-src ''self'' ''unsafe-inline'' *.bootstrapcdn.com + *.cloudflare.com fonts.googleapis.com estudybooks.surf.nl; script-src ''self'' + ''unsafe-inline'' ''unsafe-eval'' webstats.surf.nl code.jquery.com *.bootstrapcdn.com + *.cloudflare.com *.aspnetcdn.com; img-src ''self'' data: blob: ''unsafe-inline'' + https: data: webstats.surf.nl; font-src ''self'' data: *.bootstrapcdn.com + fonts.gstatic.com; connect-src ''self''; media-src ''self'' data: blob:' + X-Frame-Options: + - SAMEORIGIN + X-Xss-Protection: + - 1; mode=block; + X-Content-Type-Options: + - nosniff + Content-Type: + - application/json + Content-Length: + - '7139' + Vary: + - Cookie + Set-Cookie: + - session=.eJyNjcEKAjEMRP8lXnVhb5JfESkxjbqQdiVJvcj-u1XEiwh7GmZ4M_MAqTGFSunqgAdoVrEQC3qz81AV3aig0kn0HSVzRsp3qiw5t-CrakEPskjEPLcasP2zcrG53b4rK7GfsxW9XbebVzZ8ADj2koulKQOO4355AieiXTE.ZmBzMA.EVu_S4gNysljneNfzzp5ijcVjQs; + HttpOnly; Path=/ + body: + encoding: UTF-8 + string: 
'[{"description":"Based on the fluffy animal, Meta AI (from Facebook) + trained this model first on a huge unorganized public dataset from the web.\n Afterwards, + the model was further finetuned on natural conversations to accommodate: for + an improved chat interaction.\n Approximately 0.12% of the training + data is Dutch and raises questions on the performance on Dutch questions.\n \n Release + date: July 2023","id":1,"name":"LLaMa-2 13B Chat","sequence":[{"model_id":8,"type":"text"}],"sequence_type":"LLMSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":null,"web_visible":true},{"description":"Based + on the fluffy animal, Meta AI (from Facebook) trained this model first on + a huge unorganized public dataset from the web.\n Afterwards, the + model was further finetuned on natural conversations to accommodate: for an + improved chat interaction. \n Approximately 0.12% of the training + data is Dutch and raises questions on the performance on Dutch questions.}\n \n Release + date: July 2023","id":2,"name":"LLaMa-2 7B Chat","sequence":[{"model_id":9,"type":"text"}],"sequence_type":"LLMSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":null,"web_visible":true},{"description":"OpenAI + first released ChatGPT and shock the world due to its impressive performance + on producing relevant answers in a natural conversation.\n However, + there have been some data and privacy concerns. Hence, we for now offer ChatGPT-3.5 + through this interface via the OpenAI API.\n Hereby, we circumvent + any gathering of personal information via tracking cookies on the ChatGPT + website. 
Note: OpenAI could still store \n and process the messages + themselves.\n \n Release date: November 2022 (continuous + update)","id":3,"name":"ChatGPT-3.5","sequence":[{"model_id":10,"type":"text"}],"sequence_type":"LLMSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":null,"web_visible":true},{"description":"Zephyr + is a series of language models that are trained to act as helpful assistants.\n We + found that removing the in-built alignment of these datasets boosted performance + on MT Bench and made the model more helpful. \n However, this means + that model is likely to generate problematic text when prompted to do so and + should only be used for educational and research purposes.\n \n Release + Date: October 2023","id":4,"name":"Zephyr 7B","sequence":[{"model_id":11,"type":"text"}],"sequence_type":"LLMSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":null,"web_visible":true},{"description":"Stable + diffusion is for images","id":5,"name":"Stable Diffusion","sequence":[{"model_id":12,"type":"images"}],"sequence_type":"ImageGenerationSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":null,"web_visible":false},{"description":"This + is a text-to-speech model trained by facebook. 
It converts the input message + to output audio.\n The quality of the output is only guaranteed + by dutch speech, as this is the attempted output language.\n In + the future we may add different languages.\n\n Release + Date: December 2023","id":7,"name":"Massively Multilingual Speech Dutch","sequence":[{"model_id":14,"type":"tts"}],"sequence_type":"AudioGenerationSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":null,"web_visible":false},{"description":"Currently + only implemented for API access.","id":8,"name":"Whisper speech-to-text Dutch","sequence":[{"model_id":15,"type":"stt"}],"sequence_type":"SingleSessionSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":null,"web_visible":true},{"description":"","id":17,"name":"BramVanroy/GEITje-7B-ultra","sequence":[{"model_id":29,"type":"text"}],"sequence_type":"LLMSequence","sram_owner_regex":"urn:mace:surf.nl:sram:label:surf_rsc:advanceddutchllm","user_id":66,"web_visible":true},{"description":"Mixtral + GPT-Q quantization","id":22,"name":"casperhansen/mixtral-instruct-awq","sequence":[{"model_id":35,"type":"text"}],"sequence_type":"LLMSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":66,"web_visible":true},{"description":"sasd","id":23,"name":"asd","sequence":[{"template_id":6},{"model_id":11,"type":"text"}],"sequence_type":"PromptTemplateSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":1,"web_visible":false},{"description":"","id":24,"name":"synthetic + 
data","sequence":[{"model_id":38,"type":"search"},{"template_id":7},{"model_id":11,"type":"text"}],"sequence_type":"RAGSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc","user_id":68,"web_visible":true},{"description":"bozo","id":28,"name":"Rijgersberg/GEITje-7B-chat-v2","sequence":[{"model_id":42,"type":"search"},{"template_id":32},{"model_id":46,"type":"text"}],"sequence_type":"RAGSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":66,"web_visible":true},{"description":"","id":29,"name":"rhysjones/phi-2-orange-v2","sequence":[{"model_id":47,"type":"text"}],"sequence_type":"LLMSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":66,"web_visible":true},{"description":"Fietje + van T.V.","id":30,"name":"BramVanroy/fietje-2b-chat","sequence":[{"model_id":49,"type":"text"}],"sequence_type":"LLMSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":62,"web_visible":true},{"description":"","id":33,"name":"ibm-granite/granite-8b-code-instruct","sequence":[{"model_id":62,"type":"text"}],"sequence_type":"LLMSequence","sram_owner_regex":null,"user_id":64,"web_visible":true},{"description":"","id":34,"name":"codellama/CodeLlama-7b-Instruct-hf","sequence":[{"model_id":63,"type":"text"}],"sequence_type":"LLMSequence","sram_owner_regex":null,"user_id":63,"web_visible":true},{"description":"\n This + is a text-to-speech model trained by facebook. 
It converts the input message + to output audio.\n The quality of the output is only + guaranteed by english speech, as this is the attempted output language.\n In + the future we may add different languages.\n\n Release + Date: December 2023\n ","id":36,"name":"Massively Multilingual + Speech English","sequence":[{"model_id":65,"type":"tts"}],"sequence_type":"AudioGenerationSequence","sram_owner_regex":null,"user_id":null,"web_visible":false},{"description":"SURF.nl + chat obv Sitemap en Dienstbrochure","id":40,"name":"SURF.nl chat ","sequence":[{"template_id":43},{"model_id":10,"type":"text"}],"sequence_type":"PromptTemplateSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":85,"web_visible":true}] + + ' + recorded_at: Wed, 02 Jan 2019 19:00:00 GMT +- request: + method: get + uri: https://www.4tu.nl/en/agenda/2024-6-18-webinar-mathematics/ + body: + encoding: US-ASCII + string: '' + headers: + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 200 + message: OK + headers: + Server: + - nginx + Date: + - Wed, 05 Jun 2024 14:16:18 GMT + Content-Type: + - text/html; charset=UTF-8 + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Vary: + - Accept-Encoding + Strict-Transport-Security: + - max-age=31536000 + X-Frame-Options: + - DENY + X-Xss-Protection: + - 1; mode=block + X-Content-Type-Options: + - nosniff + Referrer-Policy: + - strict-origin-when-cross-origin + Feature-Policy: + - geolocation 'none'; camera 'none'; payment 'none'; microphone 'none'; + Content-Security-Policy: + - script-src 'self' https://platform.twitter.com https://cdn.syndication.twimg.com + https://www.youtube.com https://player.vimeo.com 'sha256-B34d9UvaEWVUEqLK1XO7bmIC0GTuglVf9avDg11NcSc=' + 'sha256-P8GHTBinuD+aYtH+iNVPhJcdl3xVDaG/Y3fcqecF83w=' https://www.google-analytics.com + https://www.googletagmanager.com https://embed.podcasts.apple.com 
'sha256-zN1rNu6bv+5uQuqzW39H8OOBooeKevi2fT089esapTo='; + frame-ancestors 'self' https://4tu.webhare.net + Cache-Control: + - no-cache + body: + encoding: ASCII-8BIT + string: !binary |- +  + recorded_at: Wed, 02 Jan 2019 19:00:00 GMT +- request: + method: get + uri: https://willma.soil.surf.nl/api/models + body: + encoding: UTF-8 + string: "{}" + headers: + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + Content-Type: + - application/json + X-Api-Key: + - 77696c6c6d61-c5f9443d-e1eb-469a-a9ba-d7c9c733ebb3 + response: + status: + code: 200 + message: OK + headers: + Date: + - Wed, 05 Jun 2024 14:16:18 GMT + Server: + - gunicorn + Strict-Transport-Security: + - max-age=31556952 + Content-Security-Policy: + - 'default-src ''self''; style-src ''self'' ''unsafe-inline'' *.bootstrapcdn.com + *.cloudflare.com fonts.googleapis.com estudybooks.surf.nl; script-src ''self'' + ''unsafe-inline'' ''unsafe-eval'' webstats.surf.nl code.jquery.com *.bootstrapcdn.com + *.cloudflare.com *.aspnetcdn.com; img-src ''self'' data: blob: ''unsafe-inline'' + https: data: webstats.surf.nl; font-src ''self'' data: *.bootstrapcdn.com + fonts.gstatic.com; connect-src ''self''; media-src ''self'' data: blob:' + X-Frame-Options: + - SAMEORIGIN + X-Xss-Protection: + - 1; mode=block; + X-Content-Type-Options: + - nosniff + Content-Type: + - application/json + Content-Length: + - '7139' + Vary: + - Cookie + Set-Cookie: + - session=.eJyNjcEKAjEMRP8lXnVhb5JfESkxjbqQdiVJvcj-u1XEiwh7GmZ4M_MAqTGFSunqgAdoVrEQC3qz81AV3aig0kn0HSVzRsp3qiw5t-CrakEPskjEPLcasP2zcrG53b4rK7GfsxW9XbebVzZ8ADj2koulKQOO4355AieiXTE.ZmBzMg.u3eHyshxjBZTyRPagRrhTNgeZUc; + HttpOnly; Path=/ + body: + encoding: UTF-8 + string: '[{"description":"Based on the fluffy animal, Meta AI (from Facebook) + trained this model first on a huge unorganized public dataset from the web.\n Afterwards, + the model was further finetuned on natural conversations to accommodate: for + an improved chat 
interaction.\n Approximately 0.12% of the training + data is Dutch and raises questions on the performance on Dutch questions.\n \n Release + date: July 2023","id":1,"name":"LLaMa-2 13B Chat","sequence":[{"model_id":8,"type":"text"}],"sequence_type":"LLMSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":null,"web_visible":true},{"description":"Based + on the fluffy animal, Meta AI (from Facebook) trained this model first on + a huge unorganized public dataset from the web.\n Afterwards, the + model was further finetuned on natural conversations to accommodate: for an + improved chat interaction. \n Approximately 0.12% of the training + data is Dutch and raises questions on the performance on Dutch questions.}\n \n Release + date: July 2023","id":2,"name":"LLaMa-2 7B Chat","sequence":[{"model_id":9,"type":"text"}],"sequence_type":"LLMSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":null,"web_visible":true},{"description":"OpenAI + first released ChatGPT and shock the world due to its impressive performance + on producing relevant answers in a natural conversation.\n However, + there have been some data and privacy concerns. Hence, we for now offer ChatGPT-3.5 + through this interface via the OpenAI API.\n Hereby, we circumvent + any gathering of personal information via tracking cookies on the ChatGPT + website. 
Note: OpenAI could still store \n and process the messages + themselves.\n \n Release date: November 2022 (continuous + update)","id":3,"name":"ChatGPT-3.5","sequence":[{"model_id":10,"type":"text"}],"sequence_type":"LLMSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":null,"web_visible":true},{"description":"Zephyr + is a series of language models that are trained to act as helpful assistants.\n We + found that removing the in-built alignment of these datasets boosted performance + on MT Bench and made the model more helpful. \n However, this means + that model is likely to generate problematic text when prompted to do so and + should only be used for educational and research purposes.\n \n Release + Date: October 2023","id":4,"name":"Zephyr 7B","sequence":[{"model_id":11,"type":"text"}],"sequence_type":"LLMSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":null,"web_visible":true},{"description":"Stable + diffusion is for images","id":5,"name":"Stable Diffusion","sequence":[{"model_id":12,"type":"images"}],"sequence_type":"ImageGenerationSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":null,"web_visible":false},{"description":"This + is a text-to-speech model trained by facebook. 
It converts the input message + to output audio.\n The quality of the output is only guaranteed + by dutch speech, as this is the attempted output language.\n In + the future we may add different languages.\n\n Release + Date: December 2023","id":7,"name":"Massively Multilingual Speech Dutch","sequence":[{"model_id":14,"type":"tts"}],"sequence_type":"AudioGenerationSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":null,"web_visible":false},{"description":"Currently + only implemented for API access.","id":8,"name":"Whisper speech-to-text Dutch","sequence":[{"model_id":15,"type":"stt"}],"sequence_type":"SingleSessionSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":null,"web_visible":true},{"description":"","id":17,"name":"BramVanroy/GEITje-7B-ultra","sequence":[{"model_id":29,"type":"text"}],"sequence_type":"LLMSequence","sram_owner_regex":"urn:mace:surf.nl:sram:label:surf_rsc:advanceddutchllm","user_id":66,"web_visible":true},{"description":"Mixtral + GPT-Q quantization","id":22,"name":"casperhansen/mixtral-instruct-awq","sequence":[{"model_id":35,"type":"text"}],"sequence_type":"LLMSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":66,"web_visible":true},{"description":"sasd","id":23,"name":"asd","sequence":[{"template_id":6},{"model_id":11,"type":"text"}],"sequence_type":"PromptTemplateSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":1,"web_visible":false},{"description":"","id":24,"name":"synthetic + 
data","sequence":[{"model_id":38,"type":"search"},{"template_id":7},{"model_id":11,"type":"text"}],"sequence_type":"RAGSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc","user_id":68,"web_visible":true},{"description":"bozo","id":28,"name":"Rijgersberg/GEITje-7B-chat-v2","sequence":[{"model_id":42,"type":"search"},{"template_id":32},{"model_id":46,"type":"text"}],"sequence_type":"RAGSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":66,"web_visible":true},{"description":"","id":29,"name":"rhysjones/phi-2-orange-v2","sequence":[{"model_id":47,"type":"text"}],"sequence_type":"LLMSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":66,"web_visible":true},{"description":"Fietje + van T.V.","id":30,"name":"BramVanroy/fietje-2b-chat","sequence":[{"model_id":49,"type":"text"}],"sequence_type":"LLMSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":62,"web_visible":true},{"description":"","id":33,"name":"ibm-granite/granite-8b-code-instruct","sequence":[{"model_id":62,"type":"text"}],"sequence_type":"LLMSequence","sram_owner_regex":null,"user_id":64,"web_visible":true},{"description":"","id":34,"name":"codellama/CodeLlama-7b-Instruct-hf","sequence":[{"model_id":63,"type":"text"}],"sequence_type":"LLMSequence","sram_owner_regex":null,"user_id":63,"web_visible":true},{"description":"\n This + is a text-to-speech model trained by facebook. 
It converts the input message + to output audio.\n The quality of the output is only + guaranteed by english speech, as this is the attempted output language.\n In + the future we may add different languages.\n\n Release + Date: December 2023\n ","id":36,"name":"Massively Multilingual + Speech English","sequence":[{"model_id":65,"type":"tts"}],"sequence_type":"AudioGenerationSequence","sram_owner_regex":null,"user_id":null,"web_visible":false},{"description":"SURF.nl + chat obv Sitemap en Dienstbrochure","id":40,"name":"SURF.nl chat ","sequence":[{"template_id":43},{"model_id":10,"type":"text"}],"sequence_type":"PromptTemplateSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":85,"web_visible":true}] + + ' + recorded_at: Wed, 02 Jan 2019 19:00:00 GMT +- request: + method: get + uri: https://www.4tu.nl/en/agenda/4TU-meeting%20National%20Technology%20Strategy/ + body: + encoding: US-ASCII + string: '' + headers: + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 200 + message: OK + headers: + Server: + - nginx + Date: + - Wed, 05 Jun 2024 14:16:19 GMT + Content-Type: + - text/html; charset=UTF-8 + Content-Length: + - '35025' + Connection: + - keep-alive + Strict-Transport-Security: + - max-age=31536000 + X-Frame-Options: + - DENY + X-Xss-Protection: + - 1; mode=block + X-Content-Type-Options: + - nosniff + Referrer-Policy: + - strict-origin-when-cross-origin + Feature-Policy: + - geolocation 'none'; camera 'none'; payment 'none'; microphone 'none'; + Content-Security-Policy: + - script-src 'self' https://platform.twitter.com https://cdn.syndication.twimg.com + https://www.youtube.com https://player.vimeo.com 'sha256-B34d9UvaEWVUEqLK1XO7bmIC0GTuglVf9avDg11NcSc=' + 'sha256-P8GHTBinuD+aYtH+iNVPhJcdl3xVDaG/Y3fcqecF83w=' https://www.google-analytics.com + https://www.googletagmanager.com https://embed.podcasts.apple.com 
'sha256-zN1rNu6bv+5uQuqzW39H8OOBooeKevi2fT089esapTo='; + frame-ancestors 'self' https://4tu.webhare.net + Last-Modified: + - Wed, 05 Jun 2024 08:57:55 GMT + Vary: + - Accept-Encoding + body: + encoding: ASCII-8BIT + string: !binary |- +  + recorded_at: Wed, 02 Jan 2019 19:00:00 GMT +- request: + method: get + uri: https://willma.soil.surf.nl/api/models + body: + encoding: UTF-8 + string: "{}" + headers: + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + Content-Type: + - application/json + X-Api-Key: + - 77696c6c6d61-c5f9443d-e1eb-469a-a9ba-d7c9c733ebb3 + response: + status: + code: 200 + message: OK + headers: + Date: + - Wed, 05 Jun 2024 14:16:19 GMT + Server: + - gunicorn + Strict-Transport-Security: + - max-age=31556952 + Content-Security-Policy: + - 'default-src ''self''; style-src ''self'' ''unsafe-inline'' *.bootstrapcdn.com + *.cloudflare.com fonts.googleapis.com estudybooks.surf.nl; script-src ''self'' + ''unsafe-inline'' ''unsafe-eval'' webstats.surf.nl code.jquery.com *.bootstrapcdn.com + *.cloudflare.com *.aspnetcdn.com; img-src ''self'' data: blob: ''unsafe-inline'' + https: data: webstats.surf.nl; font-src ''self'' data: *.bootstrapcdn.com + fonts.gstatic.com; connect-src ''self''; media-src ''self'' data: blob:' + X-Frame-Options: + - SAMEORIGIN + X-Xss-Protection: + - 1; mode=block; + X-Content-Type-Options: + - nosniff + Content-Type: + - application/json + Content-Length: + - '7139' + Vary: + - Cookie + Set-Cookie: + - session=.eJyNjcEKAjEMRP8lXnVhb5JfESkxjbqQdiVJvcj-u1XEiwh7GmZ4M_MAqTGFSunqgAdoVrEQC3qz81AV3aig0kn0HSVzRsp3qiw5t-CrakEPskjEPLcasP2zcrG53b4rK7GfsxW9XbebVzZ8ADj2koulKQOO4355AieiXTE.ZmBzMw.4GkyMCqKpiZo5501oc_GyauOmbs; + HttpOnly; Path=/ + body: + encoding: UTF-8 + string: '[{"description":"Based on the fluffy animal, Meta AI (from Facebook) + trained this model first on a huge unorganized public dataset from the web.\n Afterwards, + the model was further finetuned on natural 
conversations to accommodate: for + an improved chat interaction.\n Approximately 0.12% of the training + data is Dutch and raises questions on the performance on Dutch questions.\n \n Release + date: July 2023","id":1,"name":"LLaMa-2 13B Chat","sequence":[{"model_id":8,"type":"text"}],"sequence_type":"LLMSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":null,"web_visible":true},{"description":"Based + on the fluffy animal, Meta AI (from Facebook) trained this model first on + a huge unorganized public dataset from the web.\n Afterwards, the + model was further finetuned on natural conversations to accommodate: for an + improved chat interaction. \n Approximately 0.12% of the training + data is Dutch and raises questions on the performance on Dutch questions.}\n \n Release + date: July 2023","id":2,"name":"LLaMa-2 7B Chat","sequence":[{"model_id":9,"type":"text"}],"sequence_type":"LLMSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":null,"web_visible":true},{"description":"OpenAI + first released ChatGPT and shock the world due to its impressive performance + on producing relevant answers in a natural conversation.\n However, + there have been some data and privacy concerns. Hence, we for now offer ChatGPT-3.5 + through this interface via the OpenAI API.\n Hereby, we circumvent + any gathering of personal information via tracking cookies on the ChatGPT + website. 
Note: OpenAI could still store \n and process the messages + themselves.\n \n Release date: November 2022 (continuous + update)","id":3,"name":"ChatGPT-3.5","sequence":[{"model_id":10,"type":"text"}],"sequence_type":"LLMSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":null,"web_visible":true},{"description":"Zephyr + is a series of language models that are trained to act as helpful assistants.\n We + found that removing the in-built alignment of these datasets boosted performance + on MT Bench and made the model more helpful. \n However, this means + that model is likely to generate problematic text when prompted to do so and + should only be used for educational and research purposes.\n \n Release + Date: October 2023","id":4,"name":"Zephyr 7B","sequence":[{"model_id":11,"type":"text"}],"sequence_type":"LLMSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":null,"web_visible":true},{"description":"Stable + diffusion is for images","id":5,"name":"Stable Diffusion","sequence":[{"model_id":12,"type":"images"}],"sequence_type":"ImageGenerationSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":null,"web_visible":false},{"description":"This + is a text-to-speech model trained by facebook. 
It converts the input message + to output audio.\n The quality of the output is only guaranteed + by dutch speech, as this is the attempted output language.\n In + the future we may add different languages.\n\n Release + Date: December 2023","id":7,"name":"Massively Multilingual Speech Dutch","sequence":[{"model_id":14,"type":"tts"}],"sequence_type":"AudioGenerationSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":null,"web_visible":false},{"description":"Currently + only implemented for API access.","id":8,"name":"Whisper speech-to-text Dutch","sequence":[{"model_id":15,"type":"stt"}],"sequence_type":"SingleSessionSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":null,"web_visible":true},{"description":"","id":17,"name":"BramVanroy/GEITje-7B-ultra","sequence":[{"model_id":29,"type":"text"}],"sequence_type":"LLMSequence","sram_owner_regex":"urn:mace:surf.nl:sram:label:surf_rsc:advanceddutchllm","user_id":66,"web_visible":true},{"description":"Mixtral + GPT-Q quantization","id":22,"name":"casperhansen/mixtral-instruct-awq","sequence":[{"model_id":35,"type":"text"}],"sequence_type":"LLMSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":66,"web_visible":true},{"description":"sasd","id":23,"name":"asd","sequence":[{"template_id":6},{"model_id":11,"type":"text"}],"sequence_type":"PromptTemplateSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":1,"web_visible":false},{"description":"","id":24,"name":"synthetic + 
data","sequence":[{"model_id":38,"type":"search"},{"template_id":7},{"model_id":11,"type":"text"}],"sequence_type":"RAGSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc","user_id":68,"web_visible":true},{"description":"bozo","id":28,"name":"Rijgersberg/GEITje-7B-chat-v2","sequence":[{"model_id":42,"type":"search"},{"template_id":32},{"model_id":46,"type":"text"}],"sequence_type":"RAGSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":66,"web_visible":true},{"description":"","id":29,"name":"rhysjones/phi-2-orange-v2","sequence":[{"model_id":47,"type":"text"}],"sequence_type":"LLMSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":66,"web_visible":true},{"description":"Fietje + van T.V.","id":30,"name":"BramVanroy/fietje-2b-chat","sequence":[{"model_id":49,"type":"text"}],"sequence_type":"LLMSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":62,"web_visible":true},{"description":"","id":33,"name":"ibm-granite/granite-8b-code-instruct","sequence":[{"model_id":62,"type":"text"}],"sequence_type":"LLMSequence","sram_owner_regex":null,"user_id":64,"web_visible":true},{"description":"","id":34,"name":"codellama/CodeLlama-7b-Instruct-hf","sequence":[{"model_id":63,"type":"text"}],"sequence_type":"LLMSequence","sram_owner_regex":null,"user_id":63,"web_visible":true},{"description":"\n This + is a text-to-speech model trained by facebook. 
It converts the input message + to output audio.\n The quality of the output is only + guaranteed by english speech, as this is the attempted output language.\n In + the future we may add different languages.\n\n Release + Date: December 2023\n ","id":36,"name":"Massively Multilingual + Speech English","sequence":[{"model_id":65,"type":"tts"}],"sequence_type":"AudioGenerationSequence","sram_owner_regex":null,"user_id":null,"web_visible":false},{"description":"SURF.nl + chat obv Sitemap en Dienstbrochure","id":40,"name":"SURF.nl chat ","sequence":[{"template_id":43},{"model_id":10,"type":"text"}],"sequence_type":"PromptTemplateSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":85,"web_visible":true}] + + ' + recorded_at: Wed, 02 Jan 2019 19:00:00 GMT +- request: + method: get + uri: https://www.4tu.nl/en/agenda/2024-10-09-room-for-everyones-educational-talent/ + body: + encoding: US-ASCII + string: '' + headers: + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 200 + message: OK + headers: + Server: + - nginx + Date: + - Wed, 05 Jun 2024 14:16:20 GMT + Content-Type: + - text/html; charset=UTF-8 + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Vary: + - Accept-Encoding + Strict-Transport-Security: + - max-age=31536000 + X-Frame-Options: + - DENY + X-Xss-Protection: + - 1; mode=block + X-Content-Type-Options: + - nosniff + Referrer-Policy: + - strict-origin-when-cross-origin + Feature-Policy: + - geolocation 'none'; camera 'none'; payment 'none'; microphone 'none'; + Content-Security-Policy: + - script-src 'self' https://platform.twitter.com https://cdn.syndication.twimg.com + https://www.youtube.com https://player.vimeo.com 'sha256-B34d9UvaEWVUEqLK1XO7bmIC0GTuglVf9avDg11NcSc=' + 'sha256-P8GHTBinuD+aYtH+iNVPhJcdl3xVDaG/Y3fcqecF83w=' https://www.google-analytics.com + https://www.googletagmanager.com 
https://embed.podcasts.apple.com 'sha256-zN1rNu6bv+5uQuqzW39H8OOBooeKevi2fT089esapTo='; + frame-ancestors 'self' https://4tu.webhare.net + Cache-Control: + - no-cache + body: + encoding: ASCII-8BIT + string: !binary |- +  + recorded_at: Wed, 02 Jan 2019 19:00:00 GMT +- request: + method: get + uri: https://willma.soil.surf.nl/api/models + body: + encoding: UTF-8 + string: "{}" + headers: + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + Content-Type: + - application/json + X-Api-Key: + - 77696c6c6d61-c5f9443d-e1eb-469a-a9ba-d7c9c733ebb3 + response: + status: + code: 200 + message: OK + headers: + Date: + - Wed, 05 Jun 2024 14:16:21 GMT + Server: + - gunicorn + Strict-Transport-Security: + - max-age=31556952 + Content-Security-Policy: + - 'default-src ''self''; style-src ''self'' ''unsafe-inline'' *.bootstrapcdn.com + *.cloudflare.com fonts.googleapis.com estudybooks.surf.nl; script-src ''self'' + ''unsafe-inline'' ''unsafe-eval'' webstats.surf.nl code.jquery.com *.bootstrapcdn.com + *.cloudflare.com *.aspnetcdn.com; img-src ''self'' data: blob: ''unsafe-inline'' + https: data: webstats.surf.nl; font-src ''self'' data: *.bootstrapcdn.com + fonts.gstatic.com; connect-src ''self''; media-src ''self'' data: blob:' + X-Frame-Options: + - SAMEORIGIN + X-Xss-Protection: + - 1; mode=block; + X-Content-Type-Options: + - nosniff + Content-Type: + - application/json + Content-Length: + - '7139' + Vary: + - Cookie + Set-Cookie: + - session=.eJyNjcEKAjEMRP8lXnVhb5JfESkxjbqQdiVJvcj-u1XEiwh7GmZ4M_MAqTGFSunqgAdoVrEQC3qz81AV3aig0kn0HSVzRsp3qiw5t-CrakEPskjEPLcasP2zcrG53b4rK7GfsxW9XbebVzZ8ADj2koulKQOO4355AieiXTE.ZmBzNQ.8TjdBvhlSWEUw_AxwSeQB6r-B_M; + HttpOnly; Path=/ + body: + encoding: UTF-8 + string: '[{"description":"Based on the fluffy animal, Meta AI (from Facebook) + trained this model first on a huge unorganized public dataset from the web.\n Afterwards, + the model was further finetuned on natural conversations to 
accommodate: for + an improved chat interaction.\n Approximately 0.12% of the training + data is Dutch and raises questions on the performance on Dutch questions.\n \n Release + date: July 2023","id":1,"name":"LLaMa-2 13B Chat","sequence":[{"model_id":8,"type":"text"}],"sequence_type":"LLMSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":null,"web_visible":true},{"description":"Based + on the fluffy animal, Meta AI (from Facebook) trained this model first on + a huge unorganized public dataset from the web.\n Afterwards, the + model was further finetuned on natural conversations to accommodate: for an + improved chat interaction. \n Approximately 0.12% of the training + data is Dutch and raises questions on the performance on Dutch questions.}\n \n Release + date: July 2023","id":2,"name":"LLaMa-2 7B Chat","sequence":[{"model_id":9,"type":"text"}],"sequence_type":"LLMSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":null,"web_visible":true},{"description":"OpenAI + first released ChatGPT and shock the world due to its impressive performance + on producing relevant answers in a natural conversation.\n However, + there have been some data and privacy concerns. Hence, we for now offer ChatGPT-3.5 + through this interface via the OpenAI API.\n Hereby, we circumvent + any gathering of personal information via tracking cookies on the ChatGPT + website. 
Note: OpenAI could still store \n and process the messages + themselves.\n \n Release date: November 2022 (continuous + update)","id":3,"name":"ChatGPT-3.5","sequence":[{"model_id":10,"type":"text"}],"sequence_type":"LLMSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":null,"web_visible":true},{"description":"Zephyr + is a series of language models that are trained to act as helpful assistants.\n We + found that removing the in-built alignment of these datasets boosted performance + on MT Bench and made the model more helpful. \n However, this means + that model is likely to generate problematic text when prompted to do so and + should only be used for educational and research purposes.\n \n Release + Date: October 2023","id":4,"name":"Zephyr 7B","sequence":[{"model_id":11,"type":"text"}],"sequence_type":"LLMSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":null,"web_visible":true},{"description":"Stable + diffusion is for images","id":5,"name":"Stable Diffusion","sequence":[{"model_id":12,"type":"images"}],"sequence_type":"ImageGenerationSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":null,"web_visible":false},{"description":"This + is a text-to-speech model trained by facebook. 
It converts the input message + to output audio.\n The quality of the output is only guaranteed + by dutch speech, as this is the attempted output language.\n In + the future we may add different languages.\n\n Release + Date: December 2023","id":7,"name":"Massively Multilingual Speech Dutch","sequence":[{"model_id":14,"type":"tts"}],"sequence_type":"AudioGenerationSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":null,"web_visible":false},{"description":"Currently + only implemented for API access.","id":8,"name":"Whisper speech-to-text Dutch","sequence":[{"model_id":15,"type":"stt"}],"sequence_type":"SingleSessionSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":null,"web_visible":true},{"description":"","id":17,"name":"BramVanroy/GEITje-7B-ultra","sequence":[{"model_id":29,"type":"text"}],"sequence_type":"LLMSequence","sram_owner_regex":"urn:mace:surf.nl:sram:label:surf_rsc:advanceddutchllm","user_id":66,"web_visible":true},{"description":"Mixtral + GPT-Q quantization","id":22,"name":"casperhansen/mixtral-instruct-awq","sequence":[{"model_id":35,"type":"text"}],"sequence_type":"LLMSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":66,"web_visible":true},{"description":"sasd","id":23,"name":"asd","sequence":[{"template_id":6},{"model_id":11,"type":"text"}],"sequence_type":"PromptTemplateSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":1,"web_visible":false},{"description":"","id":24,"name":"synthetic + 
data","sequence":[{"model_id":38,"type":"search"},{"template_id":7},{"model_id":11,"type":"text"}],"sequence_type":"RAGSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc","user_id":68,"web_visible":true},{"description":"bozo","id":28,"name":"Rijgersberg/GEITje-7B-chat-v2","sequence":[{"model_id":42,"type":"search"},{"template_id":32},{"model_id":46,"type":"text"}],"sequence_type":"RAGSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":66,"web_visible":true},{"description":"","id":29,"name":"rhysjones/phi-2-orange-v2","sequence":[{"model_id":47,"type":"text"}],"sequence_type":"LLMSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":66,"web_visible":true},{"description":"Fietje + van T.V.","id":30,"name":"BramVanroy/fietje-2b-chat","sequence":[{"model_id":49,"type":"text"}],"sequence_type":"LLMSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":62,"web_visible":true},{"description":"","id":33,"name":"ibm-granite/granite-8b-code-instruct","sequence":[{"model_id":62,"type":"text"}],"sequence_type":"LLMSequence","sram_owner_regex":null,"user_id":64,"web_visible":true},{"description":"","id":34,"name":"codellama/CodeLlama-7b-Instruct-hf","sequence":[{"model_id":63,"type":"text"}],"sequence_type":"LLMSequence","sram_owner_regex":null,"user_id":63,"web_visible":true},{"description":"\n This + is a text-to-speech model trained by facebook. 
It converts the input message + to output audio.\n The quality of the output is only + guaranteed by english speech, as this is the attempted output language.\n In + the future we may add different languages.\n\n Release + Date: December 2023\n ","id":36,"name":"Massively Multilingual + Speech English","sequence":[{"model_id":65,"type":"tts"}],"sequence_type":"AudioGenerationSequence","sram_owner_regex":null,"user_id":null,"web_visible":false},{"description":"SURF.nl + chat obv Sitemap en Dienstbrochure","id":40,"name":"SURF.nl chat ","sequence":[{"template_id":43},{"model_id":10,"type":"text"}],"sequence_type":"PromptTemplateSequence","sram_owner_regex":"urn:mace:surf.nl:sram:group:surf_rsc:advanceddutchllm","user_id":85,"web_visible":true}] + + ' + recorded_at: Wed, 02 Jan 2019 19:00:00 GMT +recorded_with: VCR 6.2.0