From 03c19e7ab4bb01ab13a76891f2de1fff5c343a1b Mon Sep 17 00:00:00 2001 From: Syphax Date: Fri, 17 Jan 2025 10:27:57 +0100 Subject: [PATCH 1/8] add a script to migrate 4s dump to graph nt files --- bin/migrations/4s-to-graph-files | 42 ++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100755 bin/migrations/4s-to-graph-files diff --git a/bin/migrations/4s-to-graph-files b/bin/migrations/4s-to-graph-files new file mode 100755 index 0000000..c7cd9c4 --- /dev/null +++ b/bin/migrations/4s-to-graph-files @@ -0,0 +1,42 @@ +#!/usr/bin/env ruby + +require 'fileutils' + +# Usage: ruby migrate_and_extract.rb +# Check if the correct number of arguments are provided +if ARGV.size != 2 + puts "Usage: #{$PROGRAM_NAME} " + exit 1 +end + +source_folder = ARGV[0] +target_folder = ARGV[1] +processed_dir = File.join(target_folder, 'processed_files') + +# Create the target directory if it doesn't exist +FileUtils.mkdir_p(processed_dir) + +# Find all files in the source folder and process them +Dir.glob(File.join(source_folder, '**', '*')).select { |file| File.file?(file) }.each do |file| + puts "Processing file: #{file}" + + # Define the new filename with .n3 extension + filename = File.basename(file) + new_file = File.join(processed_dir, "#{filename}.n3") + + # Copy the original file to the target folder with .n3 extension + FileUtils.cp(file, new_file) + puts "Copied to: #{new_file}" + + # Extract the first line and remove the "## GRAPH " prefix, then save it to .graph file + graph_file = "#{new_file}.graph" + first_line = File.open(file, &:readline).sub(/^## GRAPH /, '').strip + File.write(graph_file, first_line) + puts "Extracted graph URI to: #{graph_file}" + + # Remove the first line from the copied .n3 file + File.write(new_file, File.readlines(new_file).drop(1).join) + puts "Removed the first line from: #{new_file}" +end + +puts "Migration and extraction complete." 
From 4eb9d7526778c11c43d0040e2df0b4a14b2ed159 Mon Sep 17 00:00:00 2001 From: Syphax Date: Fri, 17 Jan 2025 10:28:22 +0100 Subject: [PATCH 2/8] add a script to import any nt file into a graph --- bin/migrations/import_nt_file.rb | 71 ++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 bin/migrations/import_nt_file.rb diff --git a/bin/migrations/import_nt_file.rb b/bin/migrations/import_nt_file.rb new file mode 100644 index 0000000..8fef5fb --- /dev/null +++ b/bin/migrations/import_nt_file.rb @@ -0,0 +1,71 @@ +require 'bundler/setup' +require 'pry' +require 'benchmark' +require 'ncbo_annotator' +require 'ncbo_cron' +require 'ontologies_linked_data' + +file_path = ARGV[0] +graph = ARGV[1] +profile = ARGV[2] + +if file_path.nil? && graph.nil? + puts "Error: Missing arguments. Please provide the file path and the graph name." + exit(1) +end + +case profile +when 'ag' + # AllegroGraph backend + ENV['GOO_BACKEND_NAME'] = 'allegrograph' + ENV['GOO_PORT'] = '10035' + ENV['GOO_PATH_QUERY'] = '/repositories/ontoportal_test' + ENV['GOO_PATH_DATA'] = '/repositories/ontoportal_test/statements' + ENV['GOO_PATH_UPDATE'] = '/repositories/ontoportal_test/statements' + ENV['COMPOSE_PROFILES'] = 'ag' + +when 'fs' + # 4store backend + ENV['GOO_PORT'] = '9000' + ENV['COMPOSE_PROFILES'] = 'fs' + +when 'vo' + # Virtuoso backend + ENV['GOO_BACKEND_NAME'] = 'virtuoso' + ENV['GOO_PORT'] = '8890' + ENV['GOO_PATH_QUERY'] = '/sparql' + ENV['GOO_PATH_DATA'] = '/sparql' + ENV['GOO_PATH_UPDATE'] = '/sparql' + ENV['COMPOSE_PROFILES'] = 'vo' + +when 'gb' + # Graphdb backend + ENV['GOO_BACKEND_NAME'] = 'graphdb' + ENV['GOO_PORT'] = '7200' + ENV['GOO_PATH_QUERY'] = '/repositories/ontoportal' + ENV['GOO_PATH_DATA'] = '/repositories/ontoportal/statements' + ENV['GOO_PATH_UPDATE'] = '/repositories/ontoportal/statements' + +else + puts "Will import to default config set in config/config.rb" +end + +require_relative '../../config/config' +puts "Start importing file: 
#{file_path} to graph: #{graph} using profile: #{ENV['GOO_BACKEND_NAME']}" +puts "Delete graph: #{graph}" +time = Benchmark.realtime do + Goo.sparql_data_client.delete_graph(graph) +end +puts 'Time to delete graph: ' + format("%.4f", time.to_s) + 's' + +time = Benchmark.realtime do + Goo.sparql_data_client.append_triples_no_bnodes(graph, file_path, nil) +end +puts 'Time to append triples: ' + format("%.4f", time) + 's' + +puts "Count triples in graph: #{graph}" +count = 0 +time = Benchmark.realtime do + count = Goo.sparql_query_client.query("SELECT (COUNT(?s) as ?count) FROM <#{graph}> WHERE { ?s ?p ?o }") +end +puts 'Time to count triples: ' + format("%.4f", time) + 's with total count: ' + count.to_s From d0c2abac94fd867be33f0c34334cae660a14df64 Mon Sep 17 00:00:00 2001 From: Syphax Date: Fri, 17 Jan 2025 10:28:51 +0100 Subject: [PATCH 3/8] add a script to combine the metadata graphs files generation and import --- .../import_metadata_graphs_to_store | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100755 bin/migrations/import_metadata_graphs_to_store diff --git a/bin/migrations/import_metadata_graphs_to_store b/bin/migrations/import_metadata_graphs_to_store new file mode 100755 index 0000000..3084ce9 --- /dev/null +++ b/bin/migrations/import_metadata_graphs_to_store @@ -0,0 +1,61 @@ +#!/usr/bin/env ruby + +require 'benchmark' +# Stop the script at the first error +begin + # Check if the correct number of arguments are provided + if ARGV.size < 1 + puts "Usage: #{$PROGRAM_NAME} " + exit 1 + end + + # Directory containing .n3 files and Virtuoso installation path + processed_dir = ARGV[0] + # Optional profile to use for the import (vo: virtruoso, fs: 4store, gb: GraphDB) + profile = ARGV[1] + + # Check if processed_files directory exists + unless Dir.exist?(processed_dir) + puts "Processed files directory #{processed_dir} does not exist!" 
+ exit 1 + end + + total_time = 0 + import_count = 0 + file_count = 0 + # Loop through all .n3 files in the processed_files directory + Dir.glob(File.join(processed_dir, '*.n3')).each do |file| + # Extract the associated .graph file (contains graph URI) + graph_file = "#{file}.graph" + + # Check if graph file exists + unless File.exist?(graph_file) + puts "Graph file #{graph_file} not found. Skipping import of #{file}." + next + end + + # Extract the graph URI from the graph file + graph_uri = File.read(graph_file).strip + line_count = `wc -l #{file}`.to_i + puts "Start importing #{file} into graph <#{graph_uri}> of line count #{line_count}" + result = false + time = Benchmark.realtime do + result = system("ruby bin/migrations/import_nt_file.rb #{file} #{graph_uri} #{profile} > /dev/null 2>&1") + end + + file_count += 1 + total_time += time + + if !result + puts "Error importing #{file} into graph <#{graph_uri}>" + else + import_count += 1 + puts "Imported <#{graph_uri}> successfully in #{time.round(2)} seconds" + end + puts "#############################################################" + end + puts "#{import_count}/#{file_count} files imported in #{total_time.round(2)} seconds" +rescue => e + puts "Error: #{e.message}" + exit 1 +end From 850c8da6c5b4479b743962e0a1a677655b6331b1 Mon Sep 17 00:00:00 2001 From: Syphax Date: Fri, 17 Jan 2025 10:30:14 +0100 Subject: [PATCH 4/8] add scripts that compares triples count in graph files and in triple store --- bin/migrations/compare_counts.rb | 179 +++++++++++++++++++++++++++++++ ncbo_cron.gemspec | 6 +- 2 files changed, 182 insertions(+), 3 deletions(-) create mode 100755 bin/migrations/compare_counts.rb diff --git a/bin/migrations/compare_counts.rb b/bin/migrations/compare_counts.rb new file mode 100755 index 0000000..485dff9 --- /dev/null +++ b/bin/migrations/compare_counts.rb @@ -0,0 +1,179 @@ +require 'open3' +require 'net/http' +require 'json' +require 'cgi' +require 'csv' +require 'pry' +require 'bundler/setup' 
+require 'benchmark' +require 'ncbo_annotator' +require 'ncbo_cron' +require 'ontologies_linked_data' + +PROCESSED_DIR = ARGV[0] || './processed_files' +profile = ARGV[1] + + +case profile +when 'ag' + # AllegroGraph backend + ENV['GOO_BACKEND_NAME'] = 'allegrograph' + ENV['GOO_PORT'] = '10035' + ENV['GOO_PATH_QUERY'] = '/repositories/ontoportal_test' + ENV['GOO_PATH_DATA'] = '/repositories/ontoportal_test/statements' + ENV['GOO_PATH_UPDATE'] = '/repositories/ontoportal_test/statements' + ENV['COMPOSE_PROFILES'] = 'ag' +when 'fs' + # 4store backend + ENV['GOO_PORT'] = '9000' + ENV['COMPOSE_PROFILES'] = 'fs' +when 'vo' + # Virtuoso backend + ENV['GOO_BACKEND_NAME'] = 'virtuoso' + ENV['GOO_PORT'] = '8890' + ENV['GOO_PATH_QUERY'] = '/sparql' + ENV['GOO_PATH_DATA'] = '/sparql' + ENV['GOO_PATH_UPDATE'] = '/sparql' + ENV['COMPOSE_PROFILES'] = 'vo' +when 'gb' + # Graphdb backend + ENV['GOO_BACKEND_NAME'] = 'graphdb' + ENV['GOO_PORT'] = '7200' + ENV['GOO_PATH_QUERY'] = '/repositories/ontoportal' + ENV['GOO_PATH_DATA'] = '/repositories/ontoportal/statements' + ENV['GOO_PATH_UPDATE'] = '/repositories/ontoportal/statements' +else + puts "Will import to default config set in config/config.rb" +end + +require_relative '../../config/config' +# Set your Virtuoso SPARQL endpoint, user credentials, and the directory where the .n3 files are located +OUTPUT_CSV = './graph_comparison.csv' + +def get_all_graphs_counts + graphs = [] + time = Benchmark.realtime do + rs = Goo.sparql_query_client.query("SELECT DISTINCT ?graph (COUNT(?s) as ?triplesCount) WHERE { GRAPH ?graph { ?s ?p ?o } } GROUP BY ?graph") + rs.each do |solution| + graphs << solution + end + end + puts 'Found ' + graphs.length.to_s + ' graphs in ' + format("%.4f", time) + 's' + + counts = {} + graphs.each do |graph| + counts[graph['graph'].to_s] = graph['triplesCount'].to_i + end + counts +end + +# Count the number of lines in a file (excluding the first metadata line) +def count_file_lines(file_path) + 
File.read(file_path).each_line.count +end + +def build_graphs_file_hash(folder_path = PROCESSED_DIR) + # Ensure the folder path exists + unless Dir.exist?(folder_path) + puts "Folder does not exist: #{folder_path}" + return + end + + graphs = {} + # Loop through each file in the folder + Dir.foreach(folder_path) do |filename| + # Skip directories and only process files ending with .graph and starting with the specific string + if filename.end_with?('.graph') + file_path = File.join(folder_path, filename) + line = File.open(file_path, "r").readlines.first + graphs[line.strip] = filename.to_s.gsub('.graph','') + end + end + graphs +end + +# Compare graph counts with file lines and output to CSV +def compare_graphs_with_files(graph_triples) + CSV.open(OUTPUT_CSV, 'w') do |csv| + # Write CSV headers + csv << ["Graph URI", "Triples in Graph", "Lines in File (excluding metadata)", "Match"] + graphs_files = build_graphs_file_hash + graph_triples.each do |graph, count| + graph_uri = graph + triples_count = count + graph_filename = graphs_files[graph_uri] + + next csv << [graph_uri, triples_count, "Graph not found", "N/A"] unless graph_filename + + # Construct the expected file name based on the graph URI + file_name = "#{PROCESSED_DIR}/#{graph_filename}" + + # puts "count lines of the file #{file_name} for the graph #{graph_uri}" + if File.exist?(file_name) + file_lines_count = count_file_lines(file_name) + + # Check if the counts match + match_status = triples_count == file_lines_count ? "Yes" : "No" + + # Output the result to CSV + csv << [graph_uri, triples_count, file_lines_count, match_status] + else + # If the file doesn't exist, indicate it in the CSV + csv << [graph_uri, triples_count, "File not found", "N/A"] + end + end + end + + puts "Comparison complete. 
Results saved to #{OUTPUT_CSV}" +end + +# Main execution + +Goo.sparql_query_client.cache.redis_cache.flushdb +puts "Redis cache flushed" + +puts "Comparing graph triple counts with file lines and exporting to CSV..." +graph_triples = get_all_graphs_counts +compare_graphs_with_files(graph_triples) + +count = 0 +attr_ontology = [] +time = Benchmark.realtime do + attr_ontology = LinkedData::Models::Ontology.attributes(:all) + count = LinkedData::Models::Ontology.where.include(attr_ontology).all.count +end +puts "Ontologies count: #{count} with display=all in #{format("%.4f", time)}s" +count = 0 +time = Benchmark.realtime do + count = LinkedData::Models::OntologySubmission.where.all.count +end +puts "Submissions count: #{count} with no display in #{format("%.4f", time)}s" + +count = 0 +time = Benchmark.realtime do + attr = LinkedData::Models::OntologySubmission.attributes(:all) + attr << {ontology: attr_ontology} + count = LinkedData::Models::OntologySubmission.where.include(attr).all.count +end +puts "Submissions count: #{count} with display=all in #{format("%.4f", time)}s" + +count = 0 +time = Benchmark.realtime do + attr = LinkedData::Models::Agent.attributes(:all) + count = LinkedData::Models::Agent.where.include(attr).all.count +end +puts "Agent count: #{count} with display=all in #{format("%.4f", time)}s" + +count = 0 +time = Benchmark.realtime do + attr = LinkedData::Models::MappingCount.attributes(:all) + count = LinkedData::Models::MappingCount.where.include(attr).all.count +end +puts "MappingsCount count: #{count} with display=all in #{format("%.4f", time)}s" + +count = 0 +time = Benchmark.realtime do + attr = LinkedData::Models::RestBackupMapping.attributes(:all) + LinkedData::Models::MappingProcess.attributes(:all) + count += LinkedData::Models::RestBackupMapping.where.include(attr).all.count +end +puts "RestMappings count: #{count} with display=all in #{format("%.4f", time)}s" diff --git a/ncbo_cron.gemspec b/ncbo_cron.gemspec index 960d360..25e2f30 
100644 --- a/ncbo_cron.gemspec +++ b/ncbo_cron.gemspec @@ -1,5 +1,4 @@ # -*- encoding: utf-8 -*- - Gem::Specification.new do |gem| gem.version = "0.0.1" gem.authors = [""] @@ -8,8 +7,9 @@ Gem::Specification.new do |gem| gem.summary = %q{} gem.homepage = "https://github.com/ncbo/ncbo_cron" - gem.files = Dir['**/*'] - gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) } + gem.files = Dir['**/*'] - Dir['bin/migrations'] + # binding.pry + gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) rescue nil}.compact gem.test_files = gem.files.grep(%r{^(test|spec|features)/}) gem.name = "ncbo_cron" gem.require_paths = ["lib"] From 9dcfc785deb80a00042fd54f05fcd948f4754776 Mon Sep 17 00:00:00 2001 From: Syphax Date: Fri, 17 Jan 2025 21:40:51 +0100 Subject: [PATCH 5/8] update docker compose to use the default virtuoso image --- docker-compose.yml | 20 +++++-- test/data/graphdb-repo-config.ttl | 14 ++--- test/data/virtuoso.ini | 61 +++++++++++++++++++++ test/data/virtuso_grant_write_permission.sh | 58 ++++++++++++++++++++ 4 files changed, 141 insertions(+), 12 deletions(-) create mode 100644 test/data/virtuoso.ini create mode 100755 test/data/virtuso_grant_write_permission.sh diff --git a/docker-compose.yml b/docker-compose.yml index d554f11..037e7d6 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -84,7 +84,7 @@ services: ports: - 8983:8983 command: bin/solr start -cloud -f - # volumes: + # volumes: #- solr_data:/var/solr/data agraph-ut: image: franzinc/agraph:v8.1.0 @@ -111,19 +111,26 @@ services: # interval: 10s # timeout: 5s # retries: 5 + networks: + - app profiles: - ag virtuoso-ut: - image: tenforce/virtuoso:virtuoso7.2.5 + image: openlink/virtuoso-opensource-7:7.2 platform: linux/amd64 - environment: - - SPARQL_UPDATE=true ports: - 1111:1111 - 8890:8890 + environment: + - DBA_PASSWORD=dba profiles: - vo + volumes: + - ./test/data/virtuoso.ini:/opt/virtuoso-opensource/database/virtuoso.ini + - 
./test/data/virtuso_grant_write_permission.sh:/opt/virtuoso-opensource/initdb.d/virtuso_grant_write_permission.sh + entrypoint: > + bash -c "cd ..; ./bin/virtuoso-t -c ./database/virtuoso.ini ; ./initdb.d/virtuso_grant_write_permission.sh ; tail -f database/virtuoso.log" healthcheck: test: [ "CMD-SHELL", "curl -sf http://localhost:8890/sparql || exit 1" ] start_period: 10s @@ -155,4 +162,7 @@ volumes: bundle: agdata: 4store: - #solr_data: \ No newline at end of file + #solr_data: +networks: + app: + driver: bridge diff --git a/test/data/graphdb-repo-config.ttl b/test/data/graphdb-repo-config.ttl index 9200da9..557ab23 100644 --- a/test/data/graphdb-repo-config.ttl +++ b/test/data/graphdb-repo-config.ttl @@ -13,21 +13,21 @@ ""; "true"; "false"; - "true"; + "false"; "true"; - "32"; - "10000000"; + "64"; + "100000000000000000"; ""; - "true"; + "false"; ""; "0"; - "0"; + "600"; "false"; "file-repository"; "rdfsplus-optimized"; "storage"; - "false"; + "true"; sail:sailType "owlim:Sail" ] ]; - rdfs:label "" . \ No newline at end of file + rdfs:label "" . diff --git a/test/data/virtuoso.ini b/test/data/virtuoso.ini new file mode 100644 index 0000000..2543ff2 --- /dev/null +++ b/test/data/virtuoso.ini @@ -0,0 +1,61 @@ +[Database] +DatabaseFile = ./database/virtuoso.db +ErrorLogFile = ./database/virtuoso.log +TransactionFile = ./database/virtuoso.trx +xa_persistent_file = ./database/virtuoso.pxa +MaxCheckpointRemap = 200000 +CheckpointInterval = 60 +NumberOfBuffers = 2450000 ; Each buffer is 8KB, so ~19GB total +MaxDirtyBuffers = 1837500 ; About 75% of NumberOfBuffers +TransactionAfterImageLimit = 50000000 +; NumberOfBuffers = 1000000 +MaxStaticCursorRows = 5000 +Striping = 0 +TempStorage = . 
+ErrorLogLevel = 7 + +[HTTPServer] +ServerPort = 8890 +ServerRoot = ./var/lib/virtuoso/vsp +MaxClientConnections = 200 +MaxKeepAlives = 10 +KeepAliveTimeout = 10 +ServerThreads = 50 +HttpTimeout = 300 +MaxBody = 20000000 +EnableGzip = 1 +GzipMimeType = text/html, text/xml, text/plain, text/css, application/xml, application/xhtml+xml, application/rss+xml, application/javascript, application/x-javascript, image/svg+xml +HTTPLogFile = ./http17012025.log + +[Parameters] +ServerPort = 1111 +NumOfThreads = 100 +MaxMem = 20000000000 ; 5GB memory +ResultSetMaxRows = 10000 +DirsAllowed = ., ./vad, ./virtuoso, ../migration-to-virtuoso,../migration-to-virtuoso/processed_files +MaxQueryCostEstimationTime = 6000 +MaxQueryExecutionTime = 6000 +DynamicLocal = 1 +LogEnable = 2 ; Enable SPARQL query logging +TraceOn = errors +LogFile = virtuoso.log +NumberOfBuffers = 2450000 ; Each buffer is 8KB, so ~19GB total +MaxDirtyBuffers = 1837500 ; About 75% of NumberOfBuffers + +[VDB] +ArrayOptimization = 0 +NumArrayParams = 0 +VDBDisconnectTimeout = 1000 +KeepAliveTimeout = 60 +RetryCount = 3 +ThreadCleanupInterval = 600 + +[Replication] +ServerName = virtuoso +ServerEnable = 1 + +[SPARQL] +ResultSetMaxRows = 1000000000000 +MaxQueryExecutionTime = 6000 +DefaultGraph = http://localhost:8890/sparql +MaxSortedTopRows = 10000 diff --git a/test/data/virtuso_grant_write_permission.sh b/test/data/virtuso_grant_write_permission.sh new file mode 100755 index 0000000..ac45665 --- /dev/null +++ b/test/data/virtuso_grant_write_permission.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +# Virtuoso database connection credentials +DB_PORT=1111 +DB_USER="dba" +DB_PASS="dba" +VIRTUOSO_DIR=$1 + +if [ "$#" -ne 1 ]; then + VIRTUOSO_DIR="/opt/virtuoso-opensource/" +fi +# Connect to Virtuoso using isql and grant EXECUTE permission +echo "-- Granting EXECUTE permission on DB.DBA.SPARQL_INSERT_DICT_CONTENT..." 
+ +$VIRTUOSO_DIR/bin/isql $DB_PORT $DB_USER $DB_PASS < Date: Fri, 17 Jan 2025 23:35:31 +0100 Subject: [PATCH 6/8] add option to run import metadata graphs using docker for testing --- .gitignore | 6 ++ Gemfile.lock | 16 ++-- bin/migrations/compare_counts.rb | 6 ++ .../import_metadata_graphs_to_store | 9 ++ docker-compose.yml | 36 ++++---- start_ontoportal_services.sh | 85 +++++++++++++++++++ test/data/graphdb-repo-config.ttl | 55 ++++++------ test/data/virtuoso.ini | 2 +- 8 files changed, 163 insertions(+), 52 deletions(-) create mode 100755 start_ontoportal_services.sh diff --git a/.gitignore b/.gitignore index 4c1af01..dadfffc 100644 --- a/.gitignore +++ b/.gitignore @@ -31,3 +31,9 @@ logs/ # Ignore jEnv files .java-version + +processed_files/ + +queries.txt + +graph_comparison.csv diff --git a/Gemfile.lock b/Gemfile.lock index 052099c..cd86ea2 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,6 +1,6 @@ GIT remote: https://github.com/ontoportal-lirmm/goo.git - revision: f8ac7b00e8d8b46d1eea04de014175525c1cdd83 + revision: 8d108c23a043039e9675b36f8f444d29a87b11fe branch: development specs: goo (0.0.2) @@ -29,7 +29,7 @@ GIT GIT remote: https://github.com/ontoportal-lirmm/ontologies_linked_data.git - revision: e65d887616aaf4ae6f099437223d86515ffdca79 + revision: 0aa6219c44143b94135e01c78eb94ad99a5e8b32 branch: development specs: ontologies_linked_data (0.0.1) @@ -49,7 +49,7 @@ GIT GIT remote: https://github.com/ontoportal-lirmm/sparql-client.git - revision: 59251e59346c9a69a67c88552ba55a1244eec602 + revision: 24bccbd0f4a5150fa6ce2af50d7c378c681027ea branch: development specs: sparql-client (3.2.2) @@ -101,7 +101,7 @@ GEM capistrano (~> 3.1) sshkit (~> 1.3) coderay (1.1.3) - concurrent-ruby (1.3.4) + concurrent-ruby (1.3.5) connection_pool (2.5.0) cube-ruby (0.0.3) dante (0.2.0) @@ -138,7 +138,7 @@ GEM google-cloud-errors (~> 1.0) google-apis-analytics_v3 (0.16.0) google-apis-core (>= 0.15.0, < 2.a) - google-apis-core (0.15.1) + google-apis-core (0.16.0) addressable 
(~> 2.5, >= 2.5.1) googleauth (~> 1.9) httpclient (>= 2.8.3, < 3.a) @@ -157,7 +157,7 @@ GEM google-protobuf (>= 3.18, < 5.a) googleapis-common-protos-types (~> 1.7) grpc (~> 1.41) - googleapis-common-protos-types (1.17.0) + googleapis-common-protos-types (1.18.0) google-protobuf (>= 3.18, < 5.a) googleauth (1.11.2) faraday (>= 1.0, < 3.a) @@ -251,7 +251,7 @@ GEM rexml (~> 3.2) redis (5.3.0) redis-client (>= 0.22.0) - redis-client (0.23.1) + redis-client (0.23.2) connection_pool representable (3.2.0) declarative (< 0.1.0) @@ -348,4 +348,4 @@ DEPENDENCIES test-unit-minitest BUNDLED WITH - 2.3.14 + 2.3.3 diff --git a/bin/migrations/compare_counts.rb b/bin/migrations/compare_counts.rb index 485dff9..4c6be55 100755 --- a/bin/migrations/compare_counts.rb +++ b/bin/migrations/compare_counts.rb @@ -171,6 +171,12 @@ def compare_graphs_with_files(graph_triples) end puts "MappingsCount count: #{count} with display=all in #{format("%.4f", time)}s" +count = 0 +time = Benchmark.realtime do + count += LinkedData::Models::RestBackupMapping.where.all.count +end +puts "RestMappings count: #{count} with no display in #{format("%.4f", time)}s" + count = 0 time = Benchmark.realtime do attr = LinkedData::Models::RestBackupMapping.attributes(:all) + LinkedData::Models::MappingProcess.attributes(:all) diff --git a/bin/migrations/import_metadata_graphs_to_store b/bin/migrations/import_metadata_graphs_to_store index 3084ce9..d278664 100755 --- a/bin/migrations/import_metadata_graphs_to_store +++ b/bin/migrations/import_metadata_graphs_to_store @@ -14,6 +14,15 @@ begin # Optional profile to use for the import (vo: virtruoso, fs: 4store, gb: GraphDB) profile = ARGV[1] + docker = ARGV[2] == "docker" + + if docker + result = system("./start_ontoportal_services.sh #{profile}") + unless result + puts "Error starting services" + exit 1 + end + end # Check if processed_files directory exists unless Dir.exist?(processed_dir) puts "Processed files directory #{processed_dir} does not exist!" 
diff --git a/docker-compose.yml b/docker-compose.yml index 037e7d6..8943846 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -50,10 +50,10 @@ services: ports: - "9393:9393" - mgrep-ut: - image: ontoportal/mgrep-ncbo:0.1 - ports: - - "55556:55555" +# mgrep-ut: +# image: ontoportal/mgrep-ncbo:0.1 +# ports: +# - "55556:55555" redis-ut: image: redis @@ -117,20 +117,23 @@ services: - ag virtuoso-ut: - image: openlink/virtuoso-opensource-7:7.2 + image: tenforce/virtuoso:virtuoso7.2.5 platform: linux/amd64 - ports: - - 1111:1111 - - 8890:8890 environment: - - DBA_PASSWORD=dba + - SPARQL_UPDATE=true + - VIRT_Parameters_NumberOfBuffers=2450000 + - VIRT_Parameters_MaxDirtyBuffers=1837500 + - VIRT_Parameters_NumOfThreads=100 + - VIRT_Parameters_MaxMem=20000000000 + - VIRT_Parameters_LogEnable=2 + - VIRT_SPARQL_ResultSetMaxRows=1000000000000 + - VIRT_SPARQL_MaxQueryExecutionTime=6000 + - VIRT_SPARQL_MaxQueryCostEstimationTime=6000 profiles: - vo - volumes: - - ./test/data/virtuoso.ini:/opt/virtuoso-opensource/database/virtuoso.ini - - ./test/data/virtuso_grant_write_permission.sh:/opt/virtuoso-opensource/initdb.d/virtuso_grant_write_permission.sh - entrypoint: > - bash -c "cd ..; ./bin/virtuoso-t -c ./database/virtuoso.ini ; ./initdb.d/virtuso_grant_write_permission.sh ; tail -f database/virtuoso.log" + ports: + - 1111:1111 + - 8890:8890 healthcheck: test: [ "CMD-SHELL", "curl -sf http://localhost:8890/sparql || exit 1" ] start_period: 10s @@ -150,11 +153,10 @@ services: - 7200:7200 - 7300:7300 volumes: - - ./test/data/graphdb-repo-config.ttl:/opt/graphdb/dist/configs/templates/data/graphdb-repo-config.ttl + - ./test/data/graphdb-repo-config.ttl:/opt/graphdb/dist/configs/templates/graphdb.ttl - ./test/data/graphdb-test-load.nt:/opt/graphdb/dist/configs/templates/data/graphdb-test-load.nt - entrypoint: > - bash -c " importrdf load -f -c /opt/graphdb/dist/configs/templates/data/graphdb-repo-config.ttl -m parallel 
/opt/graphdb/dist/configs/templates/data/graphdb-test-load.nt ; graphdb -Ddefault.min.distinct.threshold=3000 " + bash -c " importrdf load -f -c /opt/graphdb/dist/configs/templates/graphdb.ttl -m parallel /opt/graphdb/dist/configs/templates/data/graphdb-test-load.nt ; graphdb " profiles: - gb diff --git a/start_ontoportal_services.sh b/start_ontoportal_services.sh new file mode 100755 index 0000000..9a8a982 --- /dev/null +++ b/start_ontoportal_services.sh @@ -0,0 +1,85 @@ +#!/usr/bin/env bash +profile=$1 +acronym=$2 +set -e + + +if [ -z "$profile" ]; then + echo "Usage: $0 " + exit 1 +fi + +BACKEND_TYPE=$profile +if [ "$BACKEND_TYPE" == "ag" ]; then + # AllegroGraph backend + export GOO_BACKEND_NAME="allegrograph" + export GOO_PORT="10035" + export GOO_PATH_QUERY="/repositories/ontoportal_test" + export GOO_PATH_DATA="/repositories/ontoportal_test/statements" + export GOO_PATH_UPDATE="/repositories/ontoportal_test/statements" + export COMPOSE_PROFILES="ag" + +elif [ "$BACKEND_TYPE" == "fs" ]; then + # 4store backend + export GOO_PORT="9000" + export COMPOSE_PROFILES="fs" + +elif [ "$BACKEND_TYPE" == "vo" ]; then + # Virtuoso backend + export GOO_BACKEND_NAME="virtuoso" + export GOO_PORT="8890" + export GOO_PATH_QUERY="/sparql" + export GOO_PATH_DATA="/sparql" + export GOO_PATH_UPDATE="/sparql" + export COMPOSE_PROFILES="vo" + +elif [ "$BACKEND_TYPE" == "gb" ]; then + # Graphdb backend + export GOO_BACKEND_NAME="graphdb" + export GOO_PORT="7200" + export GOO_PATH_QUERY="/repositories/ontoportal" + export GOO_PATH_DATA="/repositories/ontoportal/statements" + export GOO_PATH_UPDATE="/repositories/ontoportal/statements" +else + echo "Error: Unknown backend type. Please set BACKEND_TYPE to 'ag', 'fs', or 'vo'." 
+fi + +echo "###########################################################################" +echo "Stop and remove all containers, networks, and volumes and start fresh" +docker compose --profile fs --profile vo --profile gb --profile ag down --volumes --remove-orphans && docker compose --profile "$profile" up -d + +echo "Waiting for all Docker services to start..." + +while true; do + # Get the status of all containers + container_status=$(docker compose --profile "$profile" ps -a --format '{{.Names}} {{.State}}') + + all_running=true + while read -r container state; do + if [ "$state" != "running" ] && [ "$state" != "exited" ]; then + all_running=false + break + fi + done <<< "$container_status" + + # If all containers are running, exit the loop + if [ "$all_running" = true ]; then + echo "All containers are running!" + break + fi + + # Wait before checking again + sleep 2 +done + +if [ -z "$acronym" ]; then + exit 0 +fi + +echo "###########################################################################" +echo "Create a new user and make it an admin" +bundle exec rake user:create[admin,admin@nodomain.org,password] +bundle exec rake user:adminify[admin] +echo "###########################################################################" +echo "Create a new ontology $acronym and import it from a remote server" +bin/ncbo_ontology_import --admin-user admin -o "$acronym" --from https://data.stageportal.lirmm.fr --from-apikey 82602563-4750-41be-9654-36f46056a0db diff --git a/test/data/graphdb-repo-config.ttl b/test/data/graphdb-repo-config.ttl index 557ab23..82fbcc8 100644 --- a/test/data/graphdb-repo-config.ttl +++ b/test/data/graphdb-repo-config.ttl @@ -3,31 +3,34 @@ @prefix sail: . @prefix xsd: . 
-<#ontoportal> a rep:Repository; - rep:repositoryID "ontoportal"; - rep:repositoryImpl [ - rep:repositoryType "graphdb:SailRepository"; - [ - "http://example.org/owlim#"; - "false"; - ""; - "true"; - "false"; - "false"; - "true"; - "64"; - "100000000000000000"; - ""; - "false"; - ""; - "0"; - "600"; - "false"; - "file-repository"; - "rdfsplus-optimized"; - "storage"; - "true"; - sail:sailType "owlim:Sail" +<#wines> a rep:Repository; + rep:repositoryID "ontoportal"; + rep:repositoryImpl [ + rep:repositoryType "graphdb:SailRepository"; + [ + "false"; + ""; + "true"; + "true"; + "false"; + "true"; + "true"; + "32"; + "100000000"; + ("default" "iri"); + "none"; + "default"; + ""; + "true"; + "0"; + "0"; + "false"; + "file-repository"; + "empty"; + "storage"; + + "false"; + sail:sailType "graphdb:Sail" ] ]; - rdfs:label "" . + rdfs:label "" . diff --git a/test/data/virtuoso.ini b/test/data/virtuoso.ini index 2543ff2..2e3d325 100644 --- a/test/data/virtuoso.ini +++ b/test/data/virtuoso.ini @@ -25,7 +25,7 @@ HttpTimeout = 300 MaxBody = 20000000 EnableGzip = 1 GzipMimeType = text/html, text/xml, text/plain, text/css, application/xml, application/xhtml+xml, application/rss+xml, application/javascript, application/x-javascript, image/svg+xml -HTTPLogFile = ./http17012025.log +HTTPLogFile = ./http18012025.log [Parameters] ServerPort = 1111 From 796d70abf6ef2f94e571d398699bbba1d4357586 Mon Sep 17 00:00:00 2001 From: Syphax Date: Wed, 22 Jan 2025 01:19:14 +0100 Subject: [PATCH 7/8] add virtuoso custom scripts --- .../virtuso_grant_write_permission.sh | 58 +++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100755 bin/migrations/virtuoso/virtuso_grant_write_permission.sh diff --git a/bin/migrations/virtuoso/virtuso_grant_write_permission.sh b/bin/migrations/virtuoso/virtuso_grant_write_permission.sh new file mode 100755 index 0000000..ac45665 --- /dev/null +++ b/bin/migrations/virtuoso/virtuso_grant_write_permission.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +# Virtuoso 
database connection credentials +DB_PORT=1111 +DB_USER="dba" +DB_PASS="dba" +VIRTUOSO_DIR=$1 + +if [ "$#" -ne 1 ]; then + VIRTUOSO_DIR="/opt/virtuoso-opensource/" +fi +# Connect to Virtuoso using isql and grant EXECUTE permission +echo "-- Granting EXECUTE permission on DB.DBA.SPARQL_INSERT_DICT_CONTENT..." + +$VIRTUOSO_DIR/bin/isql $DB_PORT $DB_USER $DB_PASS < Date: Thu, 23 Jan 2025 04:28:50 +0100 Subject: [PATCH 8/8] simplify compare count to not do the benchmarks --- Gemfile.lock | 8 +-- bin/migrations/compare_counts.rb | 56 +------------------ .../import_metadata_graphs_to_store | 3 +- 3 files changed, 9 insertions(+), 58 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index cd86ea2..d44a410 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,6 +1,6 @@ GIT remote: https://github.com/ontoportal-lirmm/goo.git - revision: 8d108c23a043039e9675b36f8f444d29a87b11fe + revision: 27300f28ca6c656c7e78af65013d88b792a6312f branch: development specs: goo (0.0.2) @@ -29,7 +29,7 @@ GIT GIT remote: https://github.com/ontoportal-lirmm/ontologies_linked_data.git - revision: 0aa6219c44143b94135e01c78eb94ad99a5e8b32 + revision: 6cb18910e322645e3cc3490951d10f19468da52f branch: development specs: ontologies_linked_data (0.0.1) @@ -49,7 +49,7 @@ GIT GIT remote: https://github.com/ontoportal-lirmm/sparql-client.git - revision: 24bccbd0f4a5150fa6ce2af50d7c378c681027ea + revision: 4364d34e9e4c411f1dd0ea706bf052465bf0b467 branch: development specs: sparql-client (3.2.2) @@ -209,7 +209,7 @@ GEM mutex_m (0.3.0) net-http-persistent (4.0.5) connection_pool (~> 2.2) - net-scp (4.0.0) + net-scp (4.1.0) net-ssh (>= 2.6.5, < 8.0.0) net-sftp (4.0.0) net-ssh (>= 5.0.0, < 8.0.0) diff --git a/bin/migrations/compare_counts.rb b/bin/migrations/compare_counts.rb index 4c6be55..d1a5b2a 100755 --- a/bin/migrations/compare_counts.rb +++ b/bin/migrations/compare_counts.rb @@ -13,7 +13,6 @@ PROCESSED_DIR = ARGV[0] || './processed_files' profile = ARGV[1] - case profile when 'ag' # AllegroGraph backend 
@@ -86,7 +85,7 @@ def build_graphs_file_hash(folder_path = PROCESSED_DIR) if filename.end_with?('.graph') file_path = File.join(folder_path, filename) line = File.open(file_path, "r").readlines.first - graphs[line.strip] = filename.to_s.gsub('.graph','') + graphs[line.strip] = filename.to_s.gsub('.graph', '') end end graphs @@ -104,10 +103,10 @@ def compare_graphs_with_files(graph_triples) graph_filename = graphs_files[graph_uri] next csv << [graph_uri, triples_count, "Graph not found", "N/A"] unless graph_filename - + # Construct the expected file name based on the graph URI file_name = "#{PROCESSED_DIR}/#{graph_filename}" - + # puts "count lines of the file #{file_name} for the graph #{graph_uri}" if File.exist?(file_name) file_lines_count = count_file_lines(file_name) @@ -128,58 +127,9 @@ def compare_graphs_with_files(graph_triples) end # Main execution - Goo.sparql_query_client.cache.redis_cache.flushdb puts "Redis cache flushed" puts "Comparing graph triple counts with file lines and exporting to CSV..." 
graph_triples = get_all_graphs_counts compare_graphs_with_files(graph_triples) - -count = 0 -attr_ontology = [] -time = Benchmark.realtime do - attr_ontology = LinkedData::Models::Ontology.attributes(:all) - count = LinkedData::Models::Ontology.where.include(attr_ontology).all.count -end -puts "Ontologies count: #{count} with display=all in #{format("%.4f", time)}s" -count = 0 -time = Benchmark.realtime do - count = LinkedData::Models::OntologySubmission.where.all.count -end -puts "Submissions count: #{count} with no display in #{format("%.4f", time)}s" - -count = 0 -time = Benchmark.realtime do - attr = LinkedData::Models::OntologySubmission.attributes(:all) - attr << {ontology: attr_ontology} - count = LinkedData::Models::OntologySubmission.where.include(attr).all.count -end -puts "Submissions count: #{count} with display=all in #{format("%.4f", time)}s" - -count = 0 -time = Benchmark.realtime do - attr = LinkedData::Models::Agent.attributes(:all) - count = LinkedData::Models::Agent.where.include(attr).all.count -end -puts "Agent count: #{count} with display=all in #{format("%.4f", time)}s" - -count = 0 -time = Benchmark.realtime do - attr = LinkedData::Models::MappingCount.attributes(:all) - count = LinkedData::Models::MappingCount.where.include(attr).all.count -end -puts "MappingsCount count: #{count} with display=all in #{format("%.4f", time)}s" - -count = 0 -time = Benchmark.realtime do - count += LinkedData::Models::RestBackupMapping.where.all.count -end -puts "RestMappings count: #{count} with no display in #{format("%.4f", time)}s" - -count = 0 -time = Benchmark.realtime do - attr = LinkedData::Models::RestBackupMapping.attributes(:all) + LinkedData::Models::MappingProcess.attributes(:all) - count += LinkedData::Models::RestBackupMapping.where.include(attr).all.count -end -puts "RestMappings count: #{count} with display=all in #{format("%.4f", time)}s" diff --git a/bin/migrations/import_metadata_graphs_to_store 
b/bin/migrations/import_metadata_graphs_to_store index d278664..fe158bc 100755 --- a/bin/migrations/import_metadata_graphs_to_store +++ b/bin/migrations/import_metadata_graphs_to_store @@ -49,7 +49,7 @@ begin puts "Start importing #{file} into graph <#{graph_uri}> of line count #{line_count}" result = false time = Benchmark.realtime do - result = system("ruby bin/migrations/import_nt_file.rb #{file} #{graph_uri} #{profile} > /dev/null 2>&1") + result = system("ruby bin/migrations/import_nt_file.rb #{file} #{graph_uri} #{profile} >> ./process_log.log 2>&1") end file_count += 1 @@ -57,6 +57,7 @@ begin if !result puts "Error importing #{file} into graph <#{graph_uri}>" + exit 1 else import_count += 1 puts "Imported <#{graph_uri}> successfully in #{time.round(2)} seconds"