From 224c07d2baa1d6a22801e2f294da163c79f8446a Mon Sep 17 00:00:00 2001 From: Syphax bouazzouni Date: Thu, 23 Jan 2025 04:50:54 +0100 Subject: [PATCH 1/5] Feature: create triples stores migration scripts (#29) * add a script to migrate 4s dump to graph nt files * add a script to import any nt file into a graph * add a script to combine the metadata graphs files generation and import * add scripts that compares triples count in graph files and in triple store * update docker compose to use the default virtuoso image * add option to run import metadata graphs using docker for testing * add virtuoso custom scripts * simplify compare count to not do the benchmarks --- .gitignore | 6 + Gemfile.lock | 18 +-- bin/migrations/4s-to-graph-files | 42 ++++++ bin/migrations/compare_counts.rb | 135 ++++++++++++++++++ .../import_metadata_graphs_to_store | 71 +++++++++ bin/migrations/import_nt_file.rb | 71 +++++++++ .../virtuso_grant_write_permission.sh | 58 ++++++++ docker-compose.yml | 34 +++-- ncbo_cron.gemspec | 6 +- start_ontoportal_services.sh | 85 +++++++++++ test/data/graphdb-repo-config.ttl | 55 +++---- test/data/virtuoso.ini | 61 ++++++++ test/data/virtuso_grant_write_permission.sh | 58 ++++++++ 13 files changed, 651 insertions(+), 49 deletions(-) create mode 100755 bin/migrations/4s-to-graph-files create mode 100755 bin/migrations/compare_counts.rb create mode 100755 bin/migrations/import_metadata_graphs_to_store create mode 100644 bin/migrations/import_nt_file.rb create mode 100755 bin/migrations/virtuoso/virtuso_grant_write_permission.sh create mode 100755 start_ontoportal_services.sh create mode 100644 test/data/virtuoso.ini create mode 100755 test/data/virtuso_grant_write_permission.sh diff --git a/.gitignore b/.gitignore index 4c1af01d..dadfffcc 100644 --- a/.gitignore +++ b/.gitignore @@ -31,3 +31,9 @@ logs/ # Ignore jEnv files .java-version + +processed_files/ + +queries.txt + +graph_comparison.csv diff --git a/Gemfile.lock b/Gemfile.lock index 052099cf..d44a4105 
100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,6 +1,6 @@ GIT remote: https://github.com/ontoportal-lirmm/goo.git - revision: f8ac7b00e8d8b46d1eea04de014175525c1cdd83 + revision: 27300f28ca6c656c7e78af65013d88b792a6312f branch: development specs: goo (0.0.2) @@ -29,7 +29,7 @@ GIT GIT remote: https://github.com/ontoportal-lirmm/ontologies_linked_data.git - revision: e65d887616aaf4ae6f099437223d86515ffdca79 + revision: 6cb18910e322645e3cc3490951d10f19468da52f branch: development specs: ontologies_linked_data (0.0.1) @@ -49,7 +49,7 @@ GIT GIT remote: https://github.com/ontoportal-lirmm/sparql-client.git - revision: 59251e59346c9a69a67c88552ba55a1244eec602 + revision: 4364d34e9e4c411f1dd0ea706bf052465bf0b467 branch: development specs: sparql-client (3.2.2) @@ -101,7 +101,7 @@ GEM capistrano (~> 3.1) sshkit (~> 1.3) coderay (1.1.3) - concurrent-ruby (1.3.4) + concurrent-ruby (1.3.5) connection_pool (2.5.0) cube-ruby (0.0.3) dante (0.2.0) @@ -138,7 +138,7 @@ GEM google-cloud-errors (~> 1.0) google-apis-analytics_v3 (0.16.0) google-apis-core (>= 0.15.0, < 2.a) - google-apis-core (0.15.1) + google-apis-core (0.16.0) addressable (~> 2.5, >= 2.5.1) googleauth (~> 1.9) httpclient (>= 2.8.3, < 3.a) @@ -157,7 +157,7 @@ GEM google-protobuf (>= 3.18, < 5.a) googleapis-common-protos-types (~> 1.7) grpc (~> 1.41) - googleapis-common-protos-types (1.17.0) + googleapis-common-protos-types (1.18.0) google-protobuf (>= 3.18, < 5.a) googleauth (1.11.2) faraday (>= 1.0, < 3.a) @@ -209,7 +209,7 @@ GEM mutex_m (0.3.0) net-http-persistent (4.0.5) connection_pool (~> 2.2) - net-scp (4.0.0) + net-scp (4.1.0) net-ssh (>= 2.6.5, < 8.0.0) net-sftp (4.0.0) net-ssh (>= 5.0.0, < 8.0.0) @@ -251,7 +251,7 @@ GEM rexml (~> 3.2) redis (5.3.0) redis-client (>= 0.22.0) - redis-client (0.23.1) + redis-client (0.23.2) connection_pool representable (3.2.0) declarative (< 0.1.0) @@ -348,4 +348,4 @@ DEPENDENCIES test-unit-minitest BUNDLED WITH - 2.3.14 + 2.3.3 diff --git a/bin/migrations/4s-to-graph-files 
b/bin/migrations/4s-to-graph-files
new file mode 100755
index 00000000..c7cd9c47
--- /dev/null
+++ b/bin/migrations/4s-to-graph-files
@@ -0,0 +1,55 @@
+#!/usr/bin/env ruby
+
+require 'fileutils'
+
+# Splits every 4store dump file found (recursively) under <source_folder> in two
+# files written to <target_folder>/processed_files:
+#   <basename>.n3        the dump content without its first line
+#   <basename>.n3.graph  the first line without its "## GRAPH " prefix (the graph URI)
+#
+# Usage: 4s-to-graph-files <source_folder> <target_folder>
+
+# Check if the correct number of arguments are provided
+if ARGV.size != 2
+  puts "Usage: #{$PROGRAM_NAME} <source_folder> <target_folder>"
+  exit 1
+end
+
+source_folder = ARGV[0]
+target_folder = ARGV[1]
+processed_dir = File.join(target_folder, 'processed_files')
+
+# Create the target directory if it doesn't exist
+FileUtils.mkdir_p(processed_dir)
+
+# Find all files in the source folder and process them
+Dir.glob(File.join(source_folder, '**', '*')).select { |file| File.file?(file) }.each do |file|
+  puts "Processing file: #{file}"
+
+  # Define the new filename with .n3 extension and its companion .graph file
+  new_file = File.join(processed_dir, "#{File.basename(file)}.n3")
+  graph_file = "#{new_file}.graph"
+
+  File.open(file) do |input|
+    # gets returns nil on an empty file, where readline would raise EOFError
+    # and abort the whole migration
+    first_line = input.gets
+    if first_line.nil?
+      puts "Skipping empty file: #{file}"
+      next
+    end
+
+    # Remove the "## GRAPH " prefix from the first line and save it to the .graph file
+    File.write(graph_file, first_line.sub(/^## GRAPH /, '').strip)
+    puts "Extracted graph URI to: #{graph_file}"
+
+    # Stream the remaining lines to the .n3 file: the dump is read once and is
+    # never loaded in memory as a whole
+    File.open(new_file, 'w') do |output|
+      input.each_line { |line| output.write(line) }
+    end
+    puts "Copied to: #{new_file} (first line removed)"
+  end
+end
+
+puts "Migration and extraction complete."
diff --git a/bin/migrations/compare_counts.rb b/bin/migrations/compare_counts.rb
new file mode 100755
index 00000000..d1a5b2a9
--- /dev/null
+++ b/bin/migrations/compare_counts.rb
@@ -0,0 +1,157 @@
+require 'open3'
+require 'net/http'
+require 'json'
+require 'cgi'
+require 'csv'
+require 'pry'
+require 'bundler/setup'
+require 'benchmark'
+require 'ncbo_annotator'
+require 'ncbo_cron'
+require 'ontologies_linked_data'
+
+# Compares, for every named graph of the triple store, the number of triples of
+# the graph with the number of lines of its file in the processed files folder,
+# and writes one row per graph to a CSV file.
+#
+# Usage: ruby bin/migrations/compare_counts.rb [processed_dir] [profile]
+#   processed_dir  folder holding the *.n3 and *.n3.graph files (default: ./processed_files)
+#   profile        ag, fs, vo or gb; any other value keeps the config/config.rb settings
+PROCESSED_DIR = ARGV[0] || './processed_files'
+profile = ARGV[1]
+
+case profile
+when 'ag'
+  # AllegroGraph backend
+  ENV['GOO_BACKEND_NAME'] = 'allegrograph'
+  ENV['GOO_PORT'] = '10035'
+  ENV['GOO_PATH_QUERY'] = '/repositories/ontoportal_test'
+  ENV['GOO_PATH_DATA'] = '/repositories/ontoportal_test/statements'
+  ENV['GOO_PATH_UPDATE'] = '/repositories/ontoportal_test/statements'
+  ENV['COMPOSE_PROFILES'] = 'ag'
+when 'fs'
+  # 4store backend
+  ENV['GOO_PORT'] = '9000'
+  ENV['COMPOSE_PROFILES'] = 'fs'
+when 'vo'
+  # Virtuoso backend
+  ENV['GOO_BACKEND_NAME'] = 'virtuoso'
+  ENV['GOO_PORT'] = '8890'
+  ENV['GOO_PATH_QUERY'] = '/sparql'
+  ENV['GOO_PATH_DATA'] = '/sparql'
+  ENV['GOO_PATH_UPDATE'] = '/sparql'
+  ENV['COMPOSE_PROFILES'] = 'vo'
+when 'gb'
+  # Graphdb backend
+  ENV['GOO_BACKEND_NAME'] = 'graphdb'
+  ENV['GOO_PORT'] = '7200'
+  ENV['GOO_PATH_QUERY'] = '/repositories/ontoportal'
+  ENV['GOO_PATH_DATA'] = '/repositories/ontoportal/statements'
+  ENV['GOO_PATH_UPDATE'] = '/repositories/ontoportal/statements'
+else
+  puts "Will import to default config set in config/config.rb"
+end
+
+require_relative '../../config/config'
+# CSV file the comparison is written to
+OUTPUT_CSV = './graph_comparison.csv'
+
+# Returns a hash { graph URI => triples count } covering every named graph of the store
+def get_all_graphs_counts
+  graphs = []
+  time = Benchmark.realtime do
+    rs = Goo.sparql_query_client.query("SELECT DISTINCT ?graph (COUNT(?s) as ?triplesCount) WHERE { GRAPH ?graph { ?s ?p ?o } } GROUP BY ?graph")
+    rs.each do |solution|
+      graphs << solution
+    end
+  end
+  puts 'Found ' + graphs.length.to_s + ' graphs in ' + format("%.4f", time) + 's'
+
+  counts = {}
+  graphs.each do |graph|
+    counts[graph['graph'].to_s] = graph['triplesCount'].to_i
+  end
+  counts
+end
+
+# Count the number of lines in a file. Files written by 4s-to-graph-files no
+# longer carry the "## GRAPH" metadata line, so every line is counted
+def count_file_lines(file_path)
+  # Stream the file instead of reading it in memory as a whole
+  File.foreach(file_path).count
+end
+
+# Returns a hash { graph URI => triples file name } built from the *.graph files
+# of the folder; the hash is empty when the folder does not exist
+def build_graphs_file_hash(folder_path = PROCESSED_DIR)
+  # Ensure the folder path exists
+  unless Dir.exist?(folder_path)
+    puts "Folder does not exist: #{folder_path}"
+    # An empty hash (instead of nil) keeps the lookups of the caller working
+    return {}
+  end
+
+  graphs = {}
+  # Loop through each file in the folder
+  Dir.foreach(folder_path) do |filename|
+    # Only process files ending with .graph
+    next unless filename.end_with?('.graph')
+
+    # The graph URI is the first line; the block form closes the file
+    line = File.open(File.join(folder_path, filename), &:gets)
+    next if line.nil? # empty .graph file: no URI to index
+
+    # Drop only the trailing extension to get the name of the triples file
+    graphs[line.strip] = filename.delete_suffix('.graph')
+  end
+  graphs
+end
+
+# Compare graph counts with file lines and output to CSV
+def compare_graphs_with_files(graph_triples)
+  graphs_files = build_graphs_file_hash
+
+  CSV.open(OUTPUT_CSV, 'w') do |csv|
+    # Write CSV headers
+    csv << ["Graph URI", "Triples in Graph", "Lines in File (excluding metadata)", "Match"]
+    graph_triples.each do |graph_uri, triples_count|
+      graph_filename = graphs_files[graph_uri]
+
+      next csv << [graph_uri, triples_count, "Graph not found", "N/A"] unless graph_filename
+
+      # Construct the expected file name based on the graph URI
+      file_name = "#{PROCESSED_DIR}/#{graph_filename}"
+
+      if File.exist?(file_name)
+        file_lines_count = count_file_lines(file_name)
+
+        # Check if the counts match
+        match_status = triples_count == file_lines_count ? "Yes" : "No"
+
+        # Output the result to CSV
+        csv << [graph_uri, triples_count, file_lines_count, match_status]
+      else
+        # If the file doesn't exist, indicate it in the CSV
+        csv << [graph_uri, triples_count, "File not found", "N/A"]
+      end
+    end
+
+    # Graphs that have a file but no triples in the store may be failed
+    # imports: report them too instead of leaving them out of the CSV
+    (graphs_files.keys - graph_triples.keys).each do |graph_uri|
+      file_name = "#{PROCESSED_DIR}/#{graphs_files[graph_uri]}"
+      file_lines_count = File.exist?(file_name) ? count_file_lines(file_name) : "File not found"
+      csv << [graph_uri, "Graph not in store", file_lines_count, "N/A"]
+    end
+  end
+
+  puts "Comparison complete. Results saved to #{OUTPUT_CSV}"
+end
+
+# Main execution
+Goo.sparql_query_client.cache.redis_cache.flushdb
+puts "Redis cache flushed"
+
+puts "Comparing graph triple counts with file lines and exporting to CSV..."
+graph_triples = get_all_graphs_counts
+compare_graphs_with_files(graph_triples)
diff --git a/bin/migrations/import_metadata_graphs_to_store b/bin/migrations/import_metadata_graphs_to_store
new file mode 100755
index 00000000..fe158bc9
--- /dev/null
+++ b/bin/migrations/import_metadata_graphs_to_store
@@ -0,0 +1,101 @@
+#!/usr/bin/env ruby
+
+require 'benchmark'
+
+# Imports every <name>.n3 file of a folder into the graph whose URI is stored in
+# <name>.n3.graph, running import_nt_file.rb once per file. The run stops at the
+# first import that fails.
+#
+# Usage: import_metadata_graphs_to_store <processed_dir> [profile] [docker]
+#   processed_dir  folder holding the *.n3 and *.n3.graph files
+#   profile        optional profile used for the import (vo: Virtuoso, fs: 4store, gb: GraphDB, ag: AllegroGraph)
+#   docker         the literal word "docker" runs ./start_ontoportal_services.sh before importing
+
+# Import script, resolved next to this file so that it is found whatever the
+# current working directory is
+IMPORT_SCRIPT = File.join(__dir__, 'import_nt_file.rb')
+# File the output of every import_nt_file.rb run is appended to
+IMPORT_LOG = './process_log.log'
+
+# Stop the script at the first error
+begin
+  # Check if the correct number of arguments are provided
+  if ARGV.size < 1
+    puts "Usage: #{$PROGRAM_NAME} <processed_dir> [profile] [docker]"
+    exit 1
+  end
+
+  # Directory containing the .n3 files
+  processed_dir = ARGV[0]
+  # Optional profile to use for the import (vo: Virtuoso, fs: 4store, gb: GraphDB)
+  profile = ARGV[1]
+
+  docker = ARGV[2] == "docker"
+
+  if docker
+    # Argument list form: the profile is passed as is, never parsed by a shell
+    result = system('./start_ontoportal_services.sh', profile)
+    unless result
+      puts "Error starting services"
+      exit 1
+    end
+  end
+
+  # Check if processed_files directory exists
+  unless Dir.exist?(processed_dir)
+    puts "Processed files directory #{processed_dir} does not exist!"
+    exit 1
+  end
+
+  total_time = 0
+  import_count = 0
+  file_count = 0
+  # Loop through all .n3 files in the processed_files directory
+  Dir.glob(File.join(processed_dir, '*.n3')).each do |file|
+    # Extract the associated .graph file (contains graph URI)
+    graph_file = "#{file}.graph"
+
+    # Check if graph file exists
+    unless File.exist?(graph_file)
+      puts "Graph file #{graph_file} not found. Skipping import of #{file}."
+      next
+    end
+
+    # Extract the graph URI from the graph file
+    graph_uri = File.read(graph_file).strip
+    if graph_uri.empty?
+      puts "Graph file #{graph_file} is empty. Skipping import of #{file}."
+      next
+    end
+
+    # Count the lines in Ruby: no shell command is built from the file name
+    line_count = File.foreach(file).count
+    puts "Start importing #{file} into graph <#{graph_uri}> of line count #{line_count}"
+
+    # Argument list form: file names and graph URIs containing spaces or shell
+    # metacharacters reach the import script unchanged
+    command = ['ruby', IMPORT_SCRIPT, file, graph_uri]
+    command << profile if profile
+    result = false
+    time = Benchmark.realtime do
+      # Same redirection as ">> ./process_log.log 2>&1"
+      result = system(*command, out: [IMPORT_LOG, 'a'], err: %i[child out])
+    end
+
+    file_count += 1
+    total_time += time
+
+    if !result
+      puts "Error importing #{file} into graph <#{graph_uri}>"
+      exit 1
+    else
+      import_count += 1
+      puts "Imported <#{graph_uri}> successfully in #{time.round(2)} seconds"
+    end
+    puts "#############################################################"
+  end
+  puts "#{import_count}/#{file_count} files imported in #{total_time.round(2)} seconds"
+rescue => e
+  puts "Error: #{e.message}"
+  exit 1
+end
diff --git a/bin/migrations/import_nt_file.rb b/bin/migrations/import_nt_file.rb
new file mode 100644
index 00000000..8fef5fb9
--- /dev/null
+++ b/bin/migrations/import_nt_file.rb
@@ -0,0 +1,78 @@
+require 'bundler/setup'
+require 'pry'
+require 'benchmark'
+require 'ncbo_annotator'
+require 'ncbo_cron'
+require 'ontologies_linked_data'
+
+# Replaces the content of a graph with the triples of a file: deletes the graph,
+# appends the file to it, then prints the resulting number of triples.
+#
+# Usage: ruby bin/migrations/import_nt_file.rb <file_path> <graph> [profile]
+file_path = ARGV[0]
+graph = ARGV[1]
+profile = ARGV[2]
+
+# Both arguments are required, so refuse to run when either one is missing
+if file_path.nil? || graph.nil?
+  puts "Error: Missing arguments. Please provide the file path and the graph name."
+  exit(1)
+end
+
+case profile
+when 'ag'
+  # AllegroGraph backend
+  ENV['GOO_BACKEND_NAME'] = 'allegrograph'
+  ENV['GOO_PORT'] = '10035'
+  ENV['GOO_PATH_QUERY'] = '/repositories/ontoportal_test'
+  ENV['GOO_PATH_DATA'] = '/repositories/ontoportal_test/statements'
+  ENV['GOO_PATH_UPDATE'] = '/repositories/ontoportal_test/statements'
+  ENV['COMPOSE_PROFILES'] = 'ag'
+
+when 'fs'
+  # 4store backend
+  ENV['GOO_PORT'] = '9000'
+  ENV['COMPOSE_PROFILES'] = 'fs'
+
+when 'vo'
+  # Virtuoso backend
+  ENV['GOO_BACKEND_NAME'] = 'virtuoso'
+  ENV['GOO_PORT'] = '8890'
+  ENV['GOO_PATH_QUERY'] = '/sparql'
+  ENV['GOO_PATH_DATA'] = '/sparql'
+  ENV['GOO_PATH_UPDATE'] = '/sparql'
+  ENV['COMPOSE_PROFILES'] = 'vo'
+
+when 'gb'
+  # Graphdb backend
+  ENV['GOO_BACKEND_NAME'] = 'graphdb'
+  ENV['GOO_PORT'] = '7200'
+  ENV['GOO_PATH_QUERY'] = '/repositories/ontoportal'
+  ENV['GOO_PATH_DATA'] = '/repositories/ontoportal/statements'
+  ENV['GOO_PATH_UPDATE'] = '/repositories/ontoportal/statements'
+
+else
+  puts "Will import to default config set in config/config.rb"
+end
+
+require_relative '../../config/config'
+puts "Start importing file: #{file_path} to graph: #{graph} using profile: #{ENV['GOO_BACKEND_NAME']}"
+puts "Delete graph: #{graph}"
+time = Benchmark.realtime do
+  Goo.sparql_data_client.delete_graph(graph)
+end
+puts 'Time to delete graph: ' + format("%.4f", time) + 's'
+
+time = Benchmark.realtime do
+  Goo.sparql_data_client.append_triples_no_bnodes(graph, file_path, nil)
+end
+puts 'Time to append triples: ' + format("%.4f", time) + 's'
+
+puts "Count triples in graph: #{graph}"
+count = 0
+time = Benchmark.realtime do
+  rs = Goo.sparql_query_client.query("SELECT (COUNT(?s) as ?count) FROM <#{graph}> WHERE { ?s ?p ?o }")
+  # The query returns a result set: read the number out of its ?count binding
+  rs.each { |solution| count = solution[:count].to_i }
+end
+puts 'Time to count triples: ' + format("%.4f", time) + 's with total count: ' + count.to_s
diff --git a/bin/migrations/virtuoso/virtuso_grant_write_permission.sh b/bin/migrations/virtuoso/virtuso_grant_write_permission.sh new file mode 100755 index
00000000..ac456650 --- /dev/null +++ b/bin/migrations/virtuoso/virtuso_grant_write_permission.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +# Virtuoso database connection credentials +DB_PORT=1111 +DB_USER="dba" +DB_PASS="dba" +VIRTUOSO_DIR=$1 + +if [ "$#" -ne 1 ]; then + VIRTUOSO_DIR="/opt/virtuoso-opensource/" +fi +# Connect to Virtuoso using isql and grant EXECUTE permission +echo "-- Granting EXECUTE permission on DB.DBA.SPARQL_INSERT_DICT_CONTENT..." + +$VIRTUOSO_DIR/bin/isql $DB_PORT $DB_USER $DB_PASS < - bash -c " importrdf load -f -c /opt/graphdb/dist/configs/templates/data/graphdb-repo-config.ttl -m parallel /opt/graphdb/dist/configs/templates/data/graphdb-test-load.nt ; graphdb -Ddefault.min.distinct.threshold=3000 " + bash -c " importrdf load -f -c /opt/graphdb/dist/configs/templates/graphdb.ttl -m parallel /opt/graphdb/dist/configs/templates/data/graphdb-test-load.nt ; graphdb " profiles: - gb @@ -155,4 +164,7 @@ volumes: bundle: agdata: 4store: - #solr_data: \ No newline at end of file + #solr_data: +networks: + app: + driver: bridge diff --git a/ncbo_cron.gemspec b/ncbo_cron.gemspec index 960d3602..25e2f30e 100644 --- a/ncbo_cron.gemspec +++ b/ncbo_cron.gemspec @@ -1,5 +1,4 @@ # -*- encoding: utf-8 -*- - Gem::Specification.new do |gem| gem.version = "0.0.1" gem.authors = [""] @@ -8,8 +7,9 @@ Gem::Specification.new do |gem| gem.summary = %q{} gem.homepage = "https://github.com/ncbo/ncbo_cron" - gem.files = Dir['**/*'] - gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) } + gem.files = Dir['**/*'] - Dir['bin/migrations'] + # binding.pry + gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) rescue nil}.compact gem.test_files = gem.files.grep(%r{^(test|spec|features)/}) gem.name = "ncbo_cron" gem.require_paths = ["lib"] diff --git a/start_ontoportal_services.sh b/start_ontoportal_services.sh new file mode 100755 index 00000000..9a8a9823 --- /dev/null +++ b/start_ontoportal_services.sh @@ -0,0 +1,85 @@ +#!/usr/bin/env bash 
+profile=$1 +acronym=$2 +set -e + + +if [ -z "$profile" ]; then + echo "Usage: $0 " + exit 1 +fi + +BACKEND_TYPE=$profile +if [ "$BACKEND_TYPE" == "ag" ]; then + # AllegroGraph backend + export GOO_BACKEND_NAME="allegrograph" + export GOO_PORT="10035" + export GOO_PATH_QUERY="/repositories/ontoportal_test" + export GOO_PATH_DATA="/repositories/ontoportal_test/statements" + export GOO_PATH_UPDATE="/repositories/ontoportal_test/statements" + export COMPOSE_PROFILES="ag" + +elif [ "$BACKEND_TYPE" == "fs" ]; then + # 4store backend + export GOO_PORT="9000" + export COMPOSE_PROFILES="fs" + +elif [ "$BACKEND_TYPE" == "vo" ]; then + # Virtuoso backend + export GOO_BACKEND_NAME="virtuoso" + export GOO_PORT="8890" + export GOO_PATH_QUERY="/sparql" + export GOO_PATH_DATA="/sparql" + export GOO_PATH_UPDATE="/sparql" + export COMPOSE_PROFILES="vo" + +elif [ "$BACKEND_TYPE" == "gb" ]; then + # Graphdb backend + export GOO_BACKEND_NAME="graphdb" + export GOO_PORT="7200" + export GOO_PATH_QUERY="/repositories/ontoportal" + export GOO_PATH_DATA="/repositories/ontoportal/statements" + export GOO_PATH_UPDATE="/repositories/ontoportal/statements" +else + echo "Error: Unknown backend type. Please set BACKEND_TYPE to 'ag', 'fs', or 'vo'." +fi + +echo "###########################################################################" +echo "Stop and remove all containers, networks, and volumes and start fresh" +docker compose --profile fs --profile vo --profile gb --profile ag down --volumes --remove-orphans && docker compose --profile "$profile" up -d + +echo "Waiting for all Docker services to start..." 
+ +while true; do + # Get the status of all containers + container_status=$(docker compose --profile "$profile" ps -a --format '{{.Names}} {{.State}}') + + all_running=true + while read -r container state; do + if [ "$state" != "running" ] && [ "$state" != "exited" ]; then + all_running=false + break + fi + done <<< "$container_status" + + # If all containers are running, exit the loop + if [ "$all_running" = true ]; then + echo "All containers are running!" + break + fi + + # Wait before checking again + sleep 2 +done + +if [ -z "$acronym" ]; then + exit 0 +fi + +echo "###########################################################################" +echo "Create a new user and make it an admin" +bundle exec rake user:create[admin,admin@nodomain.org,password] +bundle exec rake user:adminify[admin] +echo "###########################################################################" +echo "Create a new ontology $acronym and import it from a remote server" +bin/ncbo_ontology_import --admin-user admin -o "$acronym" --from https://data.stageportal.lirmm.fr --from-apikey 82602563-4750-41be-9654-36f46056a0db diff --git a/test/data/graphdb-repo-config.ttl b/test/data/graphdb-repo-config.ttl index 9200da9a..82fbcc8c 100644 --- a/test/data/graphdb-repo-config.ttl +++ b/test/data/graphdb-repo-config.ttl @@ -3,31 +3,34 @@ @prefix sail: . @prefix xsd: . 
-<#ontoportal> a rep:Repository; - rep:repositoryID "ontoportal"; - rep:repositoryImpl [ - rep:repositoryType "graphdb:SailRepository"; - [ - "http://example.org/owlim#"; - "false"; - ""; - "true"; - "false"; - "true"; - "true"; - "32"; - "10000000"; - ""; - "true"; - ""; - "0"; - "0"; - "false"; - "file-repository"; - "rdfsplus-optimized"; - "storage"; - "false"; - sail:sailType "owlim:Sail" +<#wines> a rep:Repository; + rep:repositoryID "ontoportal"; + rep:repositoryImpl [ + rep:repositoryType "graphdb:SailRepository"; + [ + "false"; + ""; + "true"; + "true"; + "false"; + "true"; + "true"; + "32"; + "100000000"; + ("default" "iri"); + "none"; + "default"; + ""; + "true"; + "0"; + "0"; + "false"; + "file-repository"; + "empty"; + "storage"; + + "false"; + sail:sailType "graphdb:Sail" ] ]; - rdfs:label "" . \ No newline at end of file + rdfs:label "" . diff --git a/test/data/virtuoso.ini b/test/data/virtuoso.ini new file mode 100644 index 00000000..2e3d3251 --- /dev/null +++ b/test/data/virtuoso.ini @@ -0,0 +1,61 @@ +[Database] +DatabaseFile = ./database/virtuoso.db +ErrorLogFile = ./database/virtuoso.log +TransactionFile = ./database/virtuoso.trx +xa_persistent_file = ./database/virtuoso.pxa +MaxCheckpointRemap = 200000 +CheckpointInterval = 60 +NumberOfBuffers = 2450000 ; Each buffer is 8KB, so ~19GB total +MaxDirtyBuffers = 1837500 ; About 75% of NumberOfBuffers +TransactionAfterImageLimit = 50000000 +; NumberOfBuffers = 1000000 +MaxStaticCursorRows = 5000 +Striping = 0 +TempStorage = . 
+ErrorLogLevel = 7 + +[HTTPServer] +ServerPort = 8890 +ServerRoot = ./var/lib/virtuoso/vsp +MaxClientConnections = 200 +MaxKeepAlives = 10 +KeepAliveTimeout = 10 +ServerThreads = 50 +HttpTimeout = 300 +MaxBody = 20000000 +EnableGzip = 1 +GzipMimeType = text/html, text/xml, text/plain, text/css, application/xml, application/xhtml+xml, application/rss+xml, application/javascript, application/x-javascript, image/svg+xml +HTTPLogFile = ./http18012025.log + +[Parameters] +ServerPort = 1111 +NumOfThreads = 100 +MaxMem = 20000000000 ; 5GB memory +ResultSetMaxRows = 10000 +DirsAllowed = ., ./vad, ./virtuoso, ../migration-to-virtuoso,../migration-to-virtuoso/processed_files +MaxQueryCostEstimationTime = 6000 +MaxQueryExecutionTime = 6000 +DynamicLocal = 1 +LogEnable = 2 ; Enable SPARQL query logging +TraceOn = errors +LogFile = virtuoso.log +NumberOfBuffers = 2450000 ; Each buffer is 8KB, so ~19GB total +MaxDirtyBuffers = 1837500 ; About 75% of NumberOfBuffers + +[VDB] +ArrayOptimization = 0 +NumArrayParams = 0 +VDBDisconnectTimeout = 1000 +KeepAliveTimeout = 60 +RetryCount = 3 +ThreadCleanupInterval = 600 + +[Replication] +ServerName = virtuoso +ServerEnable = 1 + +[SPARQL] +ResultSetMaxRows = 1000000000000 +MaxQueryExecutionTime = 6000 +DefaultGraph = http://localhost:8890/sparql +MaxSortedTopRows = 10000 diff --git a/test/data/virtuso_grant_write_permission.sh b/test/data/virtuso_grant_write_permission.sh new file mode 100755 index 00000000..ac456650 --- /dev/null +++ b/test/data/virtuso_grant_write_permission.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +# Virtuoso database connection credentials +DB_PORT=1111 +DB_USER="dba" +DB_PASS="dba" +VIRTUOSO_DIR=$1 + +if [ "$#" -ne 1 ]; then + VIRTUOSO_DIR="/opt/virtuoso-opensource/" +fi +# Connect to Virtuoso using isql and grant EXECUTE permission +echo "-- Granting EXECUTE permission on DB.DBA.SPARQL_INSERT_DICT_CONTENT..." 
+ +$VIRTUOSO_DIR/bin/isql $DB_PORT $DB_USER $DB_PASS < Date: Thu, 23 Jan 2025 04:01:51 +0000 Subject: [PATCH 2/5] [ontoportal-bot] Gemfile.lock update --- Gemfile.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index d44a4105..0e81c8f2 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,6 +1,6 @@ GIT remote: https://github.com/ontoportal-lirmm/goo.git - revision: 27300f28ca6c656c7e78af65013d88b792a6312f + revision: 5825dc1f9d0ff439b1ba9d8f78fa7bb20b1c65d0 branch: development specs: goo (0.0.2) @@ -348,4 +348,4 @@ DEPENDENCIES test-unit-minitest BUNDLED WITH - 2.3.3 + 2.3.14 From 0645a46ca820384b2673ca985f27295b23473a10 Mon Sep 17 00:00:00 2001 From: OntoPortal Bot Date: Thu, 23 Jan 2025 05:08:31 +0100 Subject: [PATCH 3/5] [ontoportal-bot] Gemfile.lock update --- Gemfile.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Gemfile.lock b/Gemfile.lock index 0e81c8f2..8c86d66e 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -348,4 +348,4 @@ DEPENDENCIES test-unit-minitest BUNDLED WITH - 2.3.14 + 2.3.15 From dd736917974f13ac7558e0d2a61a84030d82acaa Mon Sep 17 00:00:00 2001 From: Syphax Date: Thu, 23 Jan 2025 20:30:34 +0100 Subject: [PATCH 4/5] fix graph count issue in assigning file path --- lib/ncbo_cron/graphs_counts.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/ncbo_cron/graphs_counts.rb b/lib/ncbo_cron/graphs_counts.rb index febeadea..2ac19338 100644 --- a/lib/ncbo_cron/graphs_counts.rb +++ b/lib/ncbo_cron/graphs_counts.rb @@ -10,7 +10,7 @@ class GraphsCounts attr_reader :logger, :file_path def initialize(logger = nil, file_path = nil) - @file_path ||= DATA_SAVE + @file_path = file_path || DATA_SAVE @logger = logger || Logger.new(STDOUT) end From 677a6cad7295c1a228acf3cc59babd5c15ab9b0e Mon Sep 17 00:00:00 2001 From: Syphax bouazzouni Date: Sat, 1 Feb 2025 23:07:32 +0100 Subject: [PATCH 5/5] Feature: Create triple store migration brenchmarks scripts (#30) * add a 
script to migrate 4s dump to graph nt files * add a script to import any nt file into a graph * add a script to combine the metadata graphs files generation and import * add scripts that compares triples count in graph files and in triple store * update docker compose to use the default virtuoso image * add option to run import metadata graphs using docker for testing * add virtuoso custom scripts * simplify compare count to not do the benchmarks * add benchmarking tests * update virtuoso docker image * add benchmarks examples documentation --- Gemfile | 2 + Gemfile.lock | 32 ++++++- bin/migrations/count_graph_triples.rb | 60 +++++++++++++ bin/migrations/virtuoso/import_to_virtuoso.sh | 0 docker-compose.yml | 29 ++++--- mise.toml | 1 + test/benchmarks/data_benchs.rb | 85 +++++++++++++++++++ test/benchmarks/examples.md | 10 +++ test/benchmarks/import_all_metadata_file.sh | 17 ++++ .../import_and_fetch_all_triples_nt_file.rb | 77 +++++++++++++++++ test/benchmarks/metadata_benchs.rb | 33 +++++++ .../parse_and_do_ontoportal_operations.rb | 74 ++++++++++++++++ test/benchmarks/run_metadata_benchs.rb | 55 ++++++++++++ .../benchmarks/start_ontoportal_services.sh | 19 ++++- test/data/initdb.d/1-grant-write-acess.sql | 3 + test/data/virtuoso.ini | 61 ------------- test/data/virtuso_grant_write_permission.sh | 58 ------------- 17 files changed, 477 insertions(+), 139 deletions(-) create mode 100755 bin/migrations/count_graph_triples.rb create mode 100644 bin/migrations/virtuoso/import_to_virtuoso.sh create mode 100644 test/benchmarks/data_benchs.rb create mode 100644 test/benchmarks/examples.md create mode 100755 test/benchmarks/import_all_metadata_file.sh create mode 100644 test/benchmarks/import_and_fetch_all_triples_nt_file.rb create mode 100644 test/benchmarks/metadata_benchs.rb create mode 100644 test/benchmarks/parse_and_do_ontoportal_operations.rb create mode 100644 test/benchmarks/run_metadata_benchs.rb rename start_ontoportal_services.sh => 
test/benchmarks/start_ontoportal_services.sh (80%) create mode 100644 test/data/initdb.d/1-grant-write-acess.sql delete mode 100644 test/data/virtuoso.ini delete mode 100755 test/data/virtuso_grant_write_permission.sh diff --git a/Gemfile b/Gemfile index bba53c54..4524baf7 100644 --- a/Gemfile +++ b/Gemfile @@ -52,3 +52,5 @@ group :deployment do end gem "binding_of_caller", "~> 1.0" +gem 'net-smtp' +gem 'net-ftp' diff --git a/Gemfile.lock b/Gemfile.lock index 8c86d66e..d5314492 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -29,7 +29,7 @@ GIT GIT remote: https://github.com/ontoportal-lirmm/ontologies_linked_data.git - revision: 6cb18910e322645e3cc3490951d10f19468da52f + revision: 194fcfb9a1c4660dabef738d16f32c210a23c343 branch: development specs: ontologies_linked_data (0.0.1) @@ -49,7 +49,7 @@ GIT GIT remote: https://github.com/ontoportal-lirmm/sparql-client.git - revision: 4364d34e9e4c411f1dd0ea706bf052465bf0b467 + revision: d4a226e75eb4aeaaf42720eac4f23f55380a0bd3 branch: development specs: sparql-client (3.2.2) @@ -84,6 +84,7 @@ GEM base64 (0.2.0) bcrypt (3.1.20) bcrypt_pbkdf (1.1.1) + bcrypt_pbkdf (1.1.1-arm64-darwin) bigdecimal (3.1.9) binding_of_caller (1.0.1) debug_inspector (>= 1.2.0) @@ -105,6 +106,7 @@ GEM connection_pool (2.5.0) cube-ruby (0.0.3) dante (0.2.0) + date (3.4.1) debug_inspector (1.2.0) declarative (0.0.20) docile (1.4.1) @@ -152,6 +154,8 @@ GEM google-cloud-env (2.1.1) faraday (>= 1.0, < 3.a) google-cloud-errors (1.4.0) + google-protobuf (3.25.3) + google-protobuf (3.25.3-arm64-darwin) google-protobuf (3.25.3-x86_64-linux) googleapis-common-protos (1.6.0) google-protobuf (>= 3.18, < 5.a) @@ -166,6 +170,12 @@ GEM multi_json (~> 1.11) os (>= 0.9, < 2.0) signet (>= 0.16, < 2.a) + grpc (1.65.2) + google-protobuf (>= 3.25, < 5.0) + googleapis-common-protos-types (~> 1.0) + grpc (1.65.2-arm64-darwin) + google-protobuf (>= 3.25, < 5.0) + googleapis-common-protos-types (~> 1.0) grpc (1.65.2-x86_64-linux) google-protobuf (>= 3.25, < 5.0) 
googleapis-common-protos-types (~> 1.0) @@ -207,12 +217,19 @@ GEM redis multi_json (1.15.0) mutex_m (0.3.0) + net-ftp (0.3.8) + net-protocol + time net-http-persistent (4.0.5) connection_pool (~> 2.2) + net-protocol (0.2.2) + timeout net-scp (4.1.0) net-ssh (>= 2.6.5, < 8.0.0) net-sftp (4.0.0) net-ssh (>= 5.0.0, < 8.0.0) + net-smtp (0.5.0) + net-protocol net-ssh (7.3.0) netrc (0.11.0) oj (3.16.9) @@ -230,7 +247,7 @@ GEM coderay (~> 1.1) method_source (~> 1.0) public_suffix (5.1.1) - rack (3.1.8) + rack (3.1.9) rack-test (2.2.0) rack (>= 1.3) rake (13.2.1) @@ -299,6 +316,9 @@ GEM systemu (2.6.5) test-unit-minitest (0.9.1) minitest (~> 4.7) + time (0.4.1) + date + timeout (0.4.3) trailblazer-option (0.1.2) tzinfo (2.0.6) concurrent-ruby (~> 1.0) @@ -307,6 +327,8 @@ GEM macaddr (~> 1.0) PLATFORMS + arm64-darwin-24 + ruby x86_64-linux DEPENDENCIES @@ -331,6 +353,8 @@ DEPENDENCIES multi_json ncbo_annotator! ncbo_cron! + net-ftp + net-smtp oj ontologies_linked_data! parallel @@ -348,4 +372,4 @@ DEPENDENCIES test-unit-minitest BUNDLED WITH - 2.3.15 + 2.4.22
diff --git a/bin/migrations/count_graph_triples.rb b/bin/migrations/count_graph_triples.rb
new file mode 100755
index 00000000..db8aef65
--- /dev/null
+++ b/bin/migrations/count_graph_triples.rb
@@ -0,0 +1,67 @@
+# require 'bundler/setup'
+require 'pry'
+require 'benchmark'
+require 'ncbo_annotator'
+require 'ncbo_cron'
+require 'ontologies_linked_data'
+
+# Prints the number of triples of a graph and the time the count query took.
+#
+# ARGV[0] is not read: the graph URI and the optional profile are taken from the
+# second and third arguments.
+# NOTE(review): presumably this mirrors the argument list of import_nt_file.rb
+# (<file_path> <graph> [profile]) - confirm against the callers
+graph = ARGV[1]
+profile = ARGV[2]
+
+if graph.nil?
+  puts "Error: Missing arguments. Please provide the graph name."
+  exit(1)
+end
+
+case profile
+when 'ag'
+  # AllegroGraph backend
+  ENV['GOO_BACKEND_NAME'] = 'allegrograph'
+  ENV['GOO_PORT'] = '10035'
+  ENV['GOO_PATH_QUERY'] = '/repositories/ontoportal_test'
+  ENV['GOO_PATH_DATA'] = '/repositories/ontoportal_test/statements'
+  ENV['GOO_PATH_UPDATE'] = '/repositories/ontoportal_test/statements'
+  ENV['COMPOSE_PROFILES'] = 'ag'
+
+when 'fs'
+  # 4store backend
+  ENV['GOO_PORT'] = '9000'
+  ENV['COMPOSE_PROFILES'] = 'fs'
+
+when 'vo'
+  # Virtuoso backend
+  ENV['GOO_BACKEND_NAME'] = 'virtuoso'
+  ENV['GOO_PORT'] = '8890'
+  ENV['GOO_PATH_QUERY'] = '/sparql'
+  ENV['GOO_PATH_DATA'] = '/sparql'
+  ENV['GOO_PATH_UPDATE'] = '/sparql'
+  ENV['COMPOSE_PROFILES'] = 'vo'
+
+when 'gb'
+  # Graphdb backend
+  ENV['GOO_BACKEND_NAME'] = 'graphdb'
+  ENV['GOO_PORT'] = '7200'
+  ENV['GOO_PATH_QUERY'] = '/repositories/ontoportal'
+  ENV['GOO_PATH_DATA'] = '/repositories/ontoportal/statements'
+  ENV['GOO_PATH_UPDATE'] = '/repositories/ontoportal/statements'
+
+else
+  puts "Will import to default config set in config/config.rb"
+end
+
+require_relative '../../config/config'
+count = 0
+time = Benchmark.realtime do
+  # The graph URI is held by `graph`; no `graph_uri` variable exists in this script
+  rs = Goo.sparql_query_client.query("SELECT (COUNT(?s) as ?count) FROM <#{graph}> WHERE { ?s ?p ?o }")
+  rs = rs.solutions.first
+  count = rs[:count].to_i if rs
+end
+
+puts 'Imported triples in ' + format("%.4f", time) + 's with total count: ' + count.to_s
diff --git a/bin/migrations/virtuoso/import_to_virtuoso.sh b/bin/migrations/virtuoso/import_to_virtuoso.sh new file mode 100644 index 00000000..e69de29b diff --git a/docker-compose.yml b/docker-compose.yml index 89438465..3fe6671c 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -50,10 +50,10 @@ services: ports: - "9393:9393" -# mgrep-ut: -# image: ontoportal/mgrep-ncbo:0.1 -# ports: -# - "55556:55555" + mgrep-ut: + image: ontoportal/mgrep-ncbo:0.1 + ports: + - "55556:55555" redis-ut: image: redis @@ -87,8 +87,7 @@ services: # volumes: #- solr_data:/var/solr/data agraph-ut: -
image: franzinc/agraph:v8.1.0 - platform: linux/amd64 + image: franzinc/agraph:v8.3.0 environment: - AGRAPH_SUPER_USER=test - AGRAPH_SUPER_PASSWORD=xyzzy @@ -117,12 +116,12 @@ services: - ag virtuoso-ut: - image: tenforce/virtuoso:virtuoso7.2.5 - platform: linux/amd64 + image: openlink/virtuoso-opensource-7:latest environment: - - SPARQL_UPDATE=true - - VIRT_Parameters_NumberOfBuffers=2450000 - - VIRT_Parameters_MaxDirtyBuffers=1837500 + - DBA_PASSWORD=dba + - DAV_PASSWORD=dba + - VIRT_Parameters_NumberOfBuffers=680000 + - VIRT_Parameters_MaxDirtyBuffers=500000 - VIRT_Parameters_NumOfThreads=100 - VIRT_Parameters_MaxMem=20000000000 - VIRT_Parameters_LogEnable=2 @@ -132,10 +131,12 @@ services: profiles: - vo ports: - - 1111:1111 - - 8890:8890 + - "1111:1111" # Standard Virtuoso port + - "8890:8890" # HTTP port for SPARQL endpoint + volumes: + - ./test/data/initdb.d:/opt/virtuoso-opensource/initdb.d healthcheck: - test: [ "CMD-SHELL", "curl -sf http://localhost:8890/sparql || exit 1" ] + test: [ "CMD-SHELL", "curl -sf http://localhost:8890/sparql || exit 1" ] start_period: 10s interval: 60s timeout: 5s diff --git a/mise.toml b/mise.toml index 83aa57a8..cfe51c18 100644 --- a/mise.toml +++ b/mise.toml @@ -1,2 +1,3 @@ [tools] +java = "17" ruby = "2.7.8" diff --git a/test/benchmarks/data_benchs.rb b/test/benchmarks/data_benchs.rb new file mode 100644 index 00000000..68dd7393 --- /dev/null +++ b/test/benchmarks/data_benchs.rb @@ -0,0 +1,85 @@ +require 'ontologies_linked_data' +module Benchmarks + + def self.do_all_benchmarks(sub) + Benchmarks.bench("fetch triples") do + Benchmarks.paginate_all_triples(sub) + end + + Benchmarks.bench("get ontology Concept Roots") do + Benchmarks.ontology_roots(sub) + end + + Benchmarks.bench("concept children") do + Benchmarks.concept_children("http://terminologies.gfbio.org/ITIS/Taxa_0", sub) + end + + Benchmarks.bench("concept path to root") do + Benchmarks.concept_tree("http://terminologies.gfbio.org/ITIS/Taxa_6007", sub) + end + end 
+ + def self.bench(label, &block) + time = Benchmark.realtime do + block.call + end + puts "Time to #{label}: " + time.round(2).to_s + end + + def self.import_nt_file(sub, file_path) + Goo.sparql_data_client.delete_graph(sub.id) + Goo.sparql_data_client.append_triples_no_bnodes(sub.id, file_path, nil) + end + + def self.paginate_all_triples(sub) + page = 1 + pagesize = 10000 + count = 1 + total_count = 0 + while count > 0 && page < 100 + puts "Starting query for page #{page}" + offset = " OFFSET #{(page - 1) * pagesize}" + rs = Goo.sparql_query_client.query("SELECT ?s ?p ?o FROM <#{sub.id}> WHERE { ?s ?p ?o } LIMIT #{pagesize} #{offset}") + count = rs.each_solution.size + total_count += count + page += 1 + end + puts "Total triples: " + total_count.to_s + end + + def self.ontology_roots(sub) + load_attrs = LinkedData::Models::Class.goo_attrs_to_load([:all]) + roots = [] + time = Benchmark.realtime do + roots = sub.roots(load_attrs) + end + puts "Time to find roots: " + time.round(2).to_s + Goo.log_debug_file('roots') + time = Benchmark.realtime do + LinkedData::Models::Class.in(sub).models(roots).include(:unmapped).all + end + puts "Time to load roots: " + time.round(2).to_s + Goo.log_debug_file('roots') + puts "Roots count: " + roots.length.to_s + puts "Roots total triples: " + roots.map { |r| r.properties.values.flatten.size}.sum.to_s + end + + def self.concept_children(uri, sub) + page, size = [1, 100] + cls = LinkedData::Models::Class.find(RDF::URI.new(uri)).in(sub).first + ld = LinkedData::Models::Class.goo_attrs_to_load([:all]) + children = sub.children(cls, includes_param: ld, page: page, size: size) + puts "Children count: " + children.length.to_s + end + + def self.concept_tree(uri, sub) + cls = LinkedData::Models::Class.find(uri).in(sub).first + display_attrs = [:prefLabel, :hasChildren, :children, :obsolete, :subClassOf] + extra_include = display_attrs + [:hasChildren, 
:isInActiveScheme, :isInActiveScheme] + + roots = sub.roots(extra_include) + # path = cls.path_to_root(roots) + cls.tree(roots: roots) + end + +end diff --git a/test/benchmarks/examples.md b/test/benchmarks/examples.md new file mode 100644 index 00000000..b83f1946 --- /dev/null +++ b/test/benchmarks/examples.md @@ -0,0 +1,10 @@ +# Benchmarks +## Import all AgroPortal metadata +test/benchmarks/import_all_metadata_file.sh ./processed_files gb +ruby test/benchmarks/run_metadata_benchs.rb gb + +## Parse INRAETHES and do ontoportal operations +ruby test/benchmarks/parse_and_do_ontoportal_operations.rb INRAETHES fs + +## Parse ITIS and do ontoportal operations +ruby test/benchmarks/parse_and_do_ontoportal_operations.rb ITIS fs api_key https://data.biodivportal.gfbio.dev diff --git a/test/benchmarks/import_all_metadata_file.sh b/test/benchmarks/import_all_metadata_file.sh new file mode 100755 index 00000000..ad172cd6 --- /dev/null +++ b/test/benchmarks/import_all_metadata_file.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +path_graphs_files=$1 +profile=$2 +set -e + + +if [ -z "$profile" ]; then + echo "Usage: $0 <path_graphs_files> <profile>" + exit 1 +fi +echo "###########################################################################" +./test/benchmarks/start_ontoportal_services.sh "$profile" +./bin/migrations/import_metadata_graphs_to_store "$path_graphs_files" "$profile" +echo 'All metadata graphs imported successfully.' 
+echo "###########################################################################" + +bundle exec ruby bin/migrations/compare_counts.rb "$path_graphs_files" "$profile" diff --git a/test/benchmarks/import_and_fetch_all_triples_nt_file.rb b/test/benchmarks/import_and_fetch_all_triples_nt_file.rb new file mode 100644 index 00000000..a2350165 --- /dev/null +++ b/test/benchmarks/import_and_fetch_all_triples_nt_file.rb @@ -0,0 +1,77 @@ + +# Documentation: +# This script is used to import a large NT file into the triple store +# and then fetch all the triples by paginating through the triples. +# The script is used to compare the performance of the import and fetch of different backends. + +profile = ARGV[0] +file_path = ARGV[1] +acronym = ARGV[2] || 'STY' # Default to STY +pwd = File.dirname(__FILE__) +system("#{pwd}/start_ontoportal_services.sh #{profile} #{acronym}") + +if $?.exitstatus != 0 + puts "Error occurred during script execution." + exit(1) +end + +if file_path == nil + puts "Error: Missing arguments. Please provide the file path." 
+ exit(1) +end + +puts "Finished parsing file" +case profile +when 'ag' + # AllegroGraph backend + ENV['GOO_BACKEND_NAME'] = 'allegrograph' + ENV['GOO_PORT'] = '10035' + ENV['GOO_PATH_QUERY'] = '/repositories/ontoportal_test' + ENV['GOO_PATH_DATA'] = '/repositories/ontoportal_test/statements' + ENV['GOO_PATH_UPDATE'] = '/repositories/ontoportal_test/statements' + ENV['COMPOSE_PROFILES'] = 'ag' + +when 'fs' + # 4store backend + ENV['GOO_PORT'] = '9000' + ENV['COMPOSE_PROFILES'] = 'fs' + +when 'vo' + # Virtuoso backend + ENV['GOO_BACKEND_NAME'] = 'virtuoso' + ENV['GOO_PORT'] = '8890' + ENV['GOO_PATH_QUERY'] = '/sparql' + ENV['GOO_PATH_DATA'] = '/sparql' + ENV['GOO_PATH_UPDATE'] = '/sparql' + ENV['COMPOSE_PROFILES'] = 'vo' + +when 'gb' + # Graphdb backend + ENV['GOO_BACKEND_NAME'] = 'graphdb' + ENV['GOO_PORT'] = '7200' + ENV['GOO_PATH_QUERY'] = '/repositories/ontoportal' + ENV['GOO_PATH_DATA'] = '/repositories/ontoportal/statements' + ENV['GOO_PATH_UPDATE'] = '/repositories/ontoportal/statements' + +else + puts "Error: Unknown backend type. Please set BACKEND_TYPE to 'ag', 'fs', 'vo', or 'gb'." 
+end + +require 'bundler/setup' +require 'pry' +require 'benchmark' +require 'ncbo_annotator' +require 'ncbo_cron' +require 'ontologies_linked_data' +require_relative '../../config/config' +require_relative 'data_benchs' + +puts "Starting to fetch triples" +sub = LinkedData::Models::Ontology.find(acronym).first.latest_submission(status: :any) +sub.bring_remaining + +Benchmarks.bench('Append triples') do + Benchmarks.import_nt_file(sub, file_path) +end + +Benchmarks.do_all_benchmarks(sub) diff --git a/test/benchmarks/metadata_benchs.rb b/test/benchmarks/metadata_benchs.rb new file mode 100644 index 00000000..385205aa --- /dev/null +++ b/test/benchmarks/metadata_benchs.rb @@ -0,0 +1,33 @@ +require 'ontologies_linked_data' +require_relative 'data_benchs' +module Benchmarks + module Metadata + + def self.do_all_benchmarks + Benchmarks.bench("Fetch all ontologies (display=all)") do + self.all_ontologies + end + + Benchmarks.bench("Fetch all submissions (display=all)") do + Goo.logger.info("Fetching all submissions") + self.all_submissions + end + + end + + + def self.all_ontologies + attr_ontology = LinkedData::Models::Ontology.attributes(:all) + count = LinkedData::Models::Ontology.where.include(attr_ontology).all.count + puts "Total ontologies: #{count}" + end + + def self.all_submissions + attr_ontology = LinkedData::Models::Ontology.attributes(:all) + attr = LinkedData::Models::OntologySubmission.attributes(:all) + attr << { ontology: attr_ontology } + count = LinkedData::Models::OntologySubmission.where.include(attr).all.count + puts "Total submissions: #{count}" + end + end +end diff --git a/test/benchmarks/parse_and_do_ontoportal_operations.rb b/test/benchmarks/parse_and_do_ontoportal_operations.rb new file mode 100644 index 00000000..8276cb90 --- /dev/null +++ b/test/benchmarks/parse_and_do_ontoportal_operations.rb @@ -0,0 +1,74 @@ +require 'benchmark' +acronym = ARGV[0] +profile = ARGV[1] +api_key = ARGV[2] || '1de0a270-29c5-4dda-b043-7c3580628cd5' +api_url = 
ARGV[3] || 'http://data.stageportal.lirmm.fr' +pwd = File.dirname(__FILE__) + +system("bash #{pwd}/start_ontoportal_services.sh #{profile} #{acronym} #{api_key} #{api_url}") +if $?.exitstatus != 0 + puts "Error occurred during running services script execution." + exit(1) +end + +case profile +when 'ag' + # AllegroGraph backend + ENV['GOO_BACKEND_NAME'] = 'allegrograph' + ENV['GOO_PORT'] = '10035' + ENV['GOO_PATH_QUERY'] = '/repositories/ontoportal_test' + ENV['GOO_PATH_DATA'] = '/repositories/ontoportal_test/statements' + ENV['GOO_PATH_UPDATE'] = '/repositories/ontoportal_test/statements' + ENV['COMPOSE_PROFILES'] = 'ag' + +when 'fs' + # 4store backend + ENV['GOO_PORT'] = '9000' + ENV['COMPOSE_PROFILES'] = 'fs' + +when 'vo' + # Virtuoso backend + ENV['GOO_BACKEND_NAME'] = 'virtuoso' + ENV['GOO_PORT'] = '8890' + ENV['GOO_PATH_QUERY'] = '/sparql' + ENV['GOO_PATH_DATA'] = '/sparql' + ENV['GOO_PATH_UPDATE'] = '/sparql' + ENV['COMPOSE_PROFILES'] = 'vo' + +when 'gb' + # Graphdb backend + ENV['GOO_BACKEND_NAME'] = 'graphdb' + ENV['GOO_PORT'] = '7200' + ENV['GOO_PATH_QUERY'] = '/repositories/ontoportal' + ENV['GOO_PATH_DATA'] = '/repositories/ontoportal/statements' + ENV['GOO_PATH_UPDATE'] = '/repositories/ontoportal/statements' + +else + puts "Error: Unknown backend type. Please set BACKEND_TYPE to 'ag', 'fs', 'vo', or 'gb'." +end + +puts "Parsing file for #{acronym} and #{profile}" +time = Benchmark.realtime do + system("#{pwd}/../../bin/ncbo_ontology_process -o #{acronym} -t process_rdf") +end +puts "Time to parse file: " + time.round(2).to_s + 's' + +if $?.exitstatus != 0 + puts "Error occurred during script execution." 
+ exit(1) +end +puts "Finished parsing file" + +require 'bundler/setup' +require 'pry' +require 'benchmark' +require 'ncbo_annotator' +require 'ncbo_cron' +require 'ontologies_linked_data' +require_relative '../../config/config' +require_relative './metadata_benchs' +require_relative './data_benchs' +Goo.sparql_query_client.cache.redis_cache.flushdb +sub = LinkedData::Models::Ontology.find(acronym).first.latest_submission(status: :any) + +Benchmarks.do_all_benchmarks(sub) diff --git a/test/benchmarks/run_metadata_benchs.rb b/test/benchmarks/run_metadata_benchs.rb new file mode 100644 index 00000000..c67cca6b --- /dev/null +++ b/test/benchmarks/run_metadata_benchs.rb @@ -0,0 +1,55 @@ +require 'benchmark' +profile = ARGV[0] +pwd = File.dirname(__FILE__) + +case profile +when 'ag' + # AllegroGraph backend + ENV['GOO_BACKEND_NAME'] = 'allegrograph' + ENV['GOO_PORT'] = '10035' + ENV['GOO_PATH_QUERY'] = '/repositories/ontoportal_test' + ENV['GOO_PATH_DATA'] = '/repositories/ontoportal_test/statements' + ENV['GOO_PATH_UPDATE'] = '/repositories/ontoportal_test/statements' + ENV['COMPOSE_PROFILES'] = 'ag' + +when 'fs' + # 4store backend + ENV['GOO_PORT'] = '9000' + ENV['COMPOSE_PROFILES'] = 'fs' + +when 'vo' + # Virtuoso backend + ENV['GOO_BACKEND_NAME'] = 'virtuoso' + ENV['GOO_PORT'] = '8890' + ENV['GOO_PATH_QUERY'] = '/sparql' + ENV['GOO_PATH_DATA'] = '/sparql' + ENV['GOO_PATH_UPDATE'] = '/sparql' + ENV['COMPOSE_PROFILES'] = 'vo' + +when 'gb' + # Graphdb backend + ENV['GOO_BACKEND_NAME'] = 'graphdb' + ENV['GOO_PORT'] = '7200' + ENV['GOO_PATH_QUERY'] = '/repositories/ontoportal' + ENV['GOO_PATH_DATA'] = '/repositories/ontoportal/statements' + ENV['GOO_PATH_UPDATE'] = '/repositories/ontoportal/statements' + +else + puts "Error: Unknown backend type. Please set BACKEND_TYPE to 'ag', 'fs', 'vo', or 'gb'." 
+end + + + +require 'bundler/setup' +require 'pry' +require 'benchmark' +require 'ncbo_annotator' +require 'ncbo_cron' +require 'ontologies_linked_data' +require_relative '../../config/config' +require_relative './metadata_benchs' + +Goo.sparql_query_client.cache.redis_cache.flushdb + + +Benchmarks::Metadata.do_all_benchmarks diff --git a/start_ontoportal_services.sh b/test/benchmarks/start_ontoportal_services.sh similarity index 80% rename from start_ontoportal_services.sh rename to test/benchmarks/start_ontoportal_services.sh index 9a8a9823..e2c4cd99 100755 --- a/start_ontoportal_services.sh +++ b/test/benchmarks/start_ontoportal_services.sh @@ -1,8 +1,23 @@ #!/usr/bin/env bash profile=$1 acronym=$2 +from_apikey=$3 +import_from_api=$4 set -e +if [ -z "$import_from_api" ]; then + import_from_api=https://data.stageportal.lirmm.fr +fi + +if [ -z "$from_apikey" ]; then + from_apikey="1de0a270-29c5-4dda-b043-7c3580628cd5" +fi + +# Documentation: +# The below script is used to start the OntoPortal services with a specific backend type. +# The script takes up to four arguments: profile, acronym, from_apikey and import_from_api (the last two have defaults). +# The acronym is the name of the ontology to be imported. +# The profile is the type of backend to be used. if [ -z "$profile" ]; then echo "Usage: $0 " @@ -41,7 +56,7 @@ elif [ "$BACKEND_TYPE" == "gb" ]; then export GOO_PATH_DATA="/repositories/ontoportal/statements" export GOO_PATH_UPDATE="/repositories/ontoportal/statements" else - echo "Error: Unknown backend type. Please set BACKEND_TYPE to 'ag', 'fs', or 'vo'." + echo "Error: Unknown backend type $profile. Please set BACKEND_TYPE to 'ag', 'fs', 'vo', or 'gb'."
fi echo "###########################################################################" @@ -82,4 +97,4 @@ bundle exec rake user:create[admin,admin@nodomain.org,password] bundle exec rake user:adminify[admin] echo "###########################################################################" echo "Create a new ontology $acronym and import it from a remote server" -bin/ncbo_ontology_import --admin-user admin -o "$acronym" --from https://data.stageportal.lirmm.fr --from-apikey 82602563-4750-41be-9654-36f46056a0db +bin/ncbo_ontology_import --admin-user admin -o "$acronym" --from "$import_from_api" --from-apikey "$from_apikey" diff --git a/test/data/initdb.d/1-grant-write-acess.sql b/test/data/initdb.d/1-grant-write-acess.sql new file mode 100644 index 00000000..d509c6fb --- /dev/null +++ b/test/data/initdb.d/1-grant-write-acess.sql @@ -0,0 +1,3 @@ +GRANT EXECUTE ON DB.DBA.SPARQL_INSERT_DICT_CONTENT TO "SPARQL"; +GRANT SPARQL_UPDATE TO "SPARQL"; +DB.DBA.RDF_DEFAULT_USER_PERMS_SET ('nobody', 7); diff --git a/test/data/virtuoso.ini b/test/data/virtuoso.ini deleted file mode 100644 index 2e3d3251..00000000 --- a/test/data/virtuoso.ini +++ /dev/null @@ -1,61 +0,0 @@ -[Database] -DatabaseFile = ./database/virtuoso.db -ErrorLogFile = ./database/virtuoso.log -TransactionFile = ./database/virtuoso.trx -xa_persistent_file = ./database/virtuoso.pxa -MaxCheckpointRemap = 200000 -CheckpointInterval = 60 -NumberOfBuffers = 2450000 ; Each buffer is 8KB, so ~19GB total -MaxDirtyBuffers = 1837500 ; About 75% of NumberOfBuffers -TransactionAfterImageLimit = 50000000 -; NumberOfBuffers = 1000000 -MaxStaticCursorRows = 5000 -Striping = 0 -TempStorage = . 
-ErrorLogLevel = 7 - -[HTTPServer] -ServerPort = 8890 -ServerRoot = ./var/lib/virtuoso/vsp -MaxClientConnections = 200 -MaxKeepAlives = 10 -KeepAliveTimeout = 10 -ServerThreads = 50 -HttpTimeout = 300 -MaxBody = 20000000 -EnableGzip = 1 -GzipMimeType = text/html, text/xml, text/plain, text/css, application/xml, application/xhtml+xml, application/rss+xml, application/javascript, application/x-javascript, image/svg+xml -HTTPLogFile = ./http18012025.log - -[Parameters] -ServerPort = 1111 -NumOfThreads = 100 -MaxMem = 20000000000 ; 5GB memory -ResultSetMaxRows = 10000 -DirsAllowed = ., ./vad, ./virtuoso, ../migration-to-virtuoso,../migration-to-virtuoso/processed_files -MaxQueryCostEstimationTime = 6000 -MaxQueryExecutionTime = 6000 -DynamicLocal = 1 -LogEnable = 2 ; Enable SPARQL query logging -TraceOn = errors -LogFile = virtuoso.log -NumberOfBuffers = 2450000 ; Each buffer is 8KB, so ~19GB total -MaxDirtyBuffers = 1837500 ; About 75% of NumberOfBuffers - -[VDB] -ArrayOptimization = 0 -NumArrayParams = 0 -VDBDisconnectTimeout = 1000 -KeepAliveTimeout = 60 -RetryCount = 3 -ThreadCleanupInterval = 600 - -[Replication] -ServerName = virtuoso -ServerEnable = 1 - -[SPARQL] -ResultSetMaxRows = 1000000000000 -MaxQueryExecutionTime = 6000 -DefaultGraph = http://localhost:8890/sparql -MaxSortedTopRows = 10000 diff --git a/test/data/virtuso_grant_write_permission.sh b/test/data/virtuso_grant_write_permission.sh deleted file mode 100755 index ac456650..00000000 --- a/test/data/virtuso_grant_write_permission.sh +++ /dev/null @@ -1,58 +0,0 @@ -#!/bin/bash - -# Virtuoso database connection credentials -DB_PORT=1111 -DB_USER="dba" -DB_PASS="dba" -VIRTUOSO_DIR=$1 - -if [ "$#" -ne 1 ]; then - VIRTUOSO_DIR="/opt/virtuoso-opensource/" -fi -# Connect to Virtuoso using isql and grant EXECUTE permission -echo "-- Granting EXECUTE permission on DB.DBA.SPARQL_INSERT_DICT_CONTENT..." - -$VIRTUOSO_DIR/bin/isql $DB_PORT $DB_USER $DB_PASS <