-
Notifications
You must be signed in to change notification settings - Fork 8
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Feature: Fetch and detect latest modified Vocabularies using LOV SPARQL Endpoint #26
base: development
Are you sure you want to change the base?
Changes from 15 commits
6ec06e1
b4af5bb
070e69b
c79c08b
7ae4651
2e2950e
70563b1
a5aefd2
158f353
5e3a0db
760b762
f44d1dc
a74b855
dc9256f
54a3862
9b5a286
f36dc77
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,238 @@ | ||
#!/usr/bin/env ruby
# frozen_string_literal: true

# lov_migrator — fetch vocabulary metadata from the LOV SPARQL endpoint
# and dump it to a CSV file, carrying over manually curated columns from
# a previously dispatched CSV.

# All dependencies grouped at the very top of the file.
require 'optparse'
require 'open-uri'
require 'net/http'
require 'json'
require 'date'
require 'benchmark'
require 'csv'

# Exit cleanly from an early interrupt (Ctrl-C).
Signal.trap("INT") { exit 1 }

# Constants grouped together, after the requires, before any logic.
LOV_ENDPOINT = "https://lov.linkeddata.es/dataset/lov"
CSV_ADDED_ATTRS = ['destination', 'who', 'comment'].freeze
CSV_DISPATCH_FILENAME = 'LOV_vocabularies.csv'
CSV_FILENAME = "vocabs.csv"

options = {}
OptionParser.new do |opts|
  opts.banner = "Usage: lov_migrator [options]"

  opts.on('-a', '--all', 'Update or import all vocabularies') do
    options[:vocabs] = [:all]
  end

  # BUG FIX: the parsed value was assigned to a discarded local variable
  # (`ontologies_acronyms`), so `-v foo,bar` never populated
  # options[:vocabs] and still hit the MissingArgument check below.
  opts.on('-v', '--vocabularies PREFIX1, PREFIX2', 'Comma-separated list of vocabularies to update or import') do |acronym|
    options[:vocabs] = acronym.split(',').map(&:strip)
  end

  opts.on('--dispatch-csv FILENAME', 'Specify the CSV file to dispatch') do |csv_file|
    options[:csv_dispatch_filename] = csv_file
  end

  # Display the help screen, all programs are assumed to have this option.
  opts.on('-h', '--help', 'Display this screen') do
    puts opts
    exit
  end
end.parse!

# A vocabulary selection is mandatory: either --all or an explicit list.
raise OptionParser::MissingArgument, "use --all or --vocabularies to select vocabularies" if options[:vocabs].nil?
||
module LOVMigrator | ||
|
||
attr_accessor :lov_endpoint | ||
class SparqlParser | ||
attr_accessor :last_processed_date | ||
def initialize(endpoint = LOV_ENDPOINT) | ||
@lov_endpoint = endpoint | ||
@last_processed_date_file = "/var/tmp/.lov_last_processed_date.txt" | ||
@last_processed_date = File.exist?(@last_processed_date_file) ? Date.parse(File.read(@last_processed_date_file).strip) : nil | ||
end | ||
|
||
def remote_changes? | ||
return true unless @last_processed_date | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. could replace these two lines by: |
||
@last_processed_date < latest_remote_modification_date | ||
end | ||
|
||
def sparql_query(query, accept_format = 'application/sparql-results+json') | ||
uri = URI.parse("#{@lov_endpoint}/sparql") | ||
|
||
http = Net::HTTP.new(uri.host, uri.port) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it's better to keep request params grouped to make it more lisible, something like this:
|
||
http.use_ssl = (uri.scheme == 'https') | ||
http.verify_mode = OpenSSL::SSL::VERIFY_NONE if uri.scheme == 'https' | ||
|
||
request = Net::HTTP::Post.new(uri.request_uri) | ||
request.set_form_data('query' => query) | ||
request['Accept'] = accept_format | ||
|
||
response = http.request(request) | ||
|
||
case response | ||
when Net::HTTPSuccess | ||
accept_format == 'application/sparql-results+json' ? parse_json_response(response.body) : response.body | ||
else | ||
raise "SPARQL query failed: #{response.code} #{response.message}" | ||
end | ||
end | ||
|
||
def latest_remote_modification_date | ||
query = <<~SPARQL | ||
SELECT (MAX(?modified) AS ?lastModifiedInLOVAt) | ||
WHERE { | ||
?catalog a <http://www.w3.org/ns/dcat#Catalog> . | ||
?catalog <http://purl.org/dc/terms/modified> ?modified | ||
} | ||
ORDER BY DESC(?lastModifiedInLOVAt) | ||
SPARQL | ||
result = sparql_query(query).first | ||
Date.parse(result["lastModifiedInLOVAt"]["value"]) | ||
end | ||
|
||
def fetch_all_vocabs | ||
query = <<~SPARQL | ||
PREFIX foaf: <http://xmlns.com/foaf/0.1/> | ||
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> | ||
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> | ||
PREFIX dct: <http://purl.org/dc/terms/> | ||
|
||
SELECT DISTINCT | ||
(GROUP_CONCAT(DISTINCT ?_prefix; separator="\\n") AS ?prefix) | ||
(GROUP_CONCAT(DISTINCT ?_title; separator="\\n") AS ?title) | ||
(GROUP_CONCAT(DISTINCT ?_description; separator="\\n") AS ?description) | ||
(GROUP_CONCAT(DISTINCT ?_keyword; separator="\\n") AS ?keyword) | ||
(GROUP_CONCAT(DISTINCT ?_creatorName; separator="\\n") AS ?creator) | ||
?uri | ||
(GROUP_CONCAT(DISTINCT ?_lastModifiedInLOVAt; separator="\\n") AS ?lastModifiedInLOVAt) | ||
|
||
{ | ||
?uri a <http://purl.org/vocommons/voaf#Vocabulary> . | ||
?catalog a <http://www.w3.org/ns/dcat#CatalogRecord> . | ||
?catalog foaf:primaryTopic ?uri . | ||
|
||
?uri <http://purl.org/vocab/vann/preferredNamespacePrefix> ?_prefix. | ||
?uri <http://purl.org/dc/terms/title> ?_title. | ||
?uri <http://purl.org/dc/terms/description> ?_description. | ||
?uri <http://www.w3.org/ns/dcat#keyword> ?_keyword. | ||
OPTIONAL { | ||
?uri dct:creator ?_creator . | ||
?_creator foaf:name ?_creatorName | ||
} | ||
?catalog <http://purl.org/dc/terms/modified> ?_lastModifiedInLOVAt | ||
} | ||
GROUP BY ?uri ?catalog | ||
SPARQL | ||
response = sparql_query(query, 'text/csv') | ||
end | ||
def update_latest_modification_date | ||
File.open(@last_processed_date_file, "w") do |file| | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In ruby I think you can do directly: |
||
file.puts Date.today | ||
end | ||
end | ||
|
||
def fetch_latest_changed_vocabs | ||
query = <<~SPARQL | ||
SELECT ?catalogRecord | ||
WHERE { | ||
?catalogRecord a <http://www.w3.org/ns/dcat#CatalogRecord> . | ||
?catalogRecord <http://purl.org/dc/terms/modified> ?modified. | ||
FILTER(?modified > "#{@last_processed_date}"^^<http://www.w3.org/2001/XMLSchema#date>) | ||
} | ||
ORDER BY DESC(?modified) | ||
SPARQL | ||
response = sparql_query(query) | ||
response.map.with_index {|response, index| "[#{index+1}] #{response['catalogRecord']['value']}"} | ||
end | ||
private | ||
|
||
def parse_json_response(response_body) | ||
JSON.parse(response_body)["results"]["bindings"] | ||
end | ||
end | ||
|
||
class CSVGenerator | ||
attr_reader :csv_file_path, :csv_content | ||
|
||
def initialize(csv_content, csv_file_path = nil) | ||
@csv_file_path = csv_file_path | ||
@csv_content = csv_content | ||
@csv_file_data = load_csv_file if File.exist?(@csv_file_path) | ||
end | ||
|
||
# Load CSV file into a hash for easy lookups | ||
def load_csv_file | ||
data = {} | ||
CSV.foreach(@csv_file_path, headers: true) do |row| | ||
data[row['prefix']] = row.to_h # assuming 'prefix' is a unique prefixentifier | ||
end | ||
data | ||
end | ||
|
||
# Parse CSV string | ||
def parse_csv_content | ||
CSV.parse(@csv_content, headers: true) | ||
end | ||
|
||
# Modify CSV string data based on values in the file data | ||
def copy_added_values_to_csv | ||
csv_data = parse_csv_content | ||
if File.exist?(@csv_file_path) | ||
csv_data.each do |row| | ||
if @csv_file_data.key?(row['prefix']) | ||
CSV_ADDED_ATTRS.each do |attr| | ||
row[attr] = @csv_file_data[row['prefix']][attr] | ||
end | ||
end | ||
end | ||
end | ||
csv_data | ||
end | ||
|
||
# Save modified CSV data to a new file | ||
def save_to_csv(csv_file_path = CSV_FILENAME) | ||
modified_data = copy_added_values_to_csv | ||
CSV.open(csv_file_path, 'w', write_headers: true, headers: modified_data.headers, force_quotes: true) do |csv| | ||
modified_data.each do |row| | ||
csv << row | ||
end | ||
end | ||
puts "Modifications saved to #{csv_file_path}" | ||
end | ||
end | ||
|
||
|
||
end | ||
|
||
# Announce a task on stdout, run the given block, then report how many
# seconds it took (wall-clock, via Benchmark.realtime).
def logger(text)
  puts ">> #{text} starting..."
  elapsed = Benchmark.realtime { yield }
  puts "#{text} finished in #{elapsed} seconds"
end
|
||
# Drive the whole migration: bail out early when LOV reports no changes,
# otherwise list the changed vocabularies, fetch everything, merge it
# with the dispatch CSV, and record today's date for the next run.
def main(options)
  parser = LOVMigrator::SparqlParser.new

  logger("Checking for new changes") do
    unless parser.remote_changes?
      puts "No changes occured since #{parser.last_processed_date}"
      exit
    end
    # On the very first run there is no reference date, so nothing to list.
    if parser.last_processed_date
      puts "The following vocabs were changed since #{parser.last_processed_date}"
      puts parser.fetch_latest_changed_vocabs
    end
  end

  vocabularies = ""
  logger("Start fetching Vocabularies ...") { vocabularies = parser.fetch_all_vocabs }

  logger("Start creating CSV #{CSV_FILENAME}") do
    generator = LOVMigrator::CSVGenerator.new(vocabularies, options[:csv_dispatch_filename])
    generator.save_to_csv
  end

  parser.update_latest_modification_date
end
|
||
# Entry point: run the migration with the CLI options parsed above.
main(options)
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
put all require together in the very top of the file like this: