From 30220ec99bb34b83552d257fae7e807e4b3c3c1f Mon Sep 17 00:00:00 2001 From: Jan-Pieter Baert Date: Tue, 23 Aug 2022 10:51:08 +0200 Subject: [PATCH 1/4] Add script to dump all quads in the database --- scripts/config.json | 16 +++++++ scripts/virtuoso/dump-database.sh | 79 +++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+) create mode 100755 scripts/virtuoso/dump-database.sh diff --git a/scripts/config.json b/scripts/config.json index 87c518e..bf95091 100644 --- a/scripts/config.json +++ b/scripts/config.json @@ -1,6 +1,22 @@ { "version": "0.1", "scripts": [ + { + "documentation": { + "command": "dump-database", + "description": "A virtuoso dump will be created in data/db/dumps.\n Parameters:\n hostname: default triplestore\n username: default dba\n password: default dba", + "arguments": ["hostname", "username", "password"] + }, + "environment": { + "image": "redpencil/virtuoso", + "interactive": false, + "script": "virtuoso/dump-database.sh", + "join_networks": true + }, + "mounts": { + "app": "/project/" + } + }, { "documentation": { "command": "create-backup", diff --git a/scripts/virtuoso/dump-database.sh b/scripts/virtuoso/dump-database.sh new file mode 100755 index 0000000..a7a0c10 --- /dev/null +++ b/scripts/virtuoso/dump-database.sh @@ -0,0 +1,79 @@ +#!/bin/bash +USERNAME=${2:-"dba"} +PASSWORD=${3:-"dba"} +TRIPLESTORE=${1:-"triplestore"} +# hostname, username and password are all optional; only a fourth argument is a usage error. +if [[ "$#" -gt 3 ]]; then + echo "Usage:" + echo " mu script triplestore [hostname] [username] [password]" + exit 1; +fi +# Create data/db/dumps only when the project already has a data/db folder; otherwise just warn. +if [[ -d "/project/data/db" ]];then + mkdir -p /project/data/db/dumps +else + echo "WARNING:" + echo " did not find data/db folder in your project, so did not create data/db/dumps!" 
+ echo " " +fi + + +echo "connecting to $TRIPLESTORE with $USERNAME" +isql-v -H $TRIPLESTORE -U $USERNAME -P $PASSWORD <= file_length_limit) + { + file_name := sprintf ('%s/output%06d.nq', dir, inx); + string_to_file (file_name, ses, -2); + IF (comp) + { + gz_compress_file (file_name, file_name||'.gz'); + file_delete (file_name); + } + inx := inx + 1; + env := vector (0,0,0); + ses := string_output (10000000); + } + next:; + } + IF (length (ses)) + { + file_name := sprintf ('%s/output%06d.nq', dir, inx); + string_to_file (file_name, ses, -2); + IF (comp) + { + gz_compress_file (file_name, file_name||'.gz'); + file_delete (file_name); + } + inx := inx + 1; + env := vector (0,0,0); + } +} +; + dump_nquads ('dumps', 1, 100000000, 1); + exit; +EOF +gunzip /project/data/db/dumps/* From cd9b14ddaa528ba510ae7f203e1eab4270754935 Mon Sep 17 00:00:00 2001 From: Jan-Pieter Baert Date: Tue, 23 Aug 2022 16:52:54 +0200 Subject: [PATCH 2/4] Add script to diff quad files --- scripts/config.json | 16 ++++++++++++++++ scripts/virtuoso/data-diff.sh | 11 +++++++++++ scripts/virtuoso/generate-datadiff.py | 14 ++++++++++++++ 3 files changed, 41 insertions(+) create mode 100755 scripts/virtuoso/data-diff.sh create mode 100644 scripts/virtuoso/generate-datadiff.py diff --git a/scripts/config.json b/scripts/config.json index bf95091..87a15fd 100644 --- a/scripts/config.json +++ b/scripts/config.json @@ -1,6 +1,22 @@ { "version": "0.1", "scripts": [ + { + "documentation": { + "command": "data-diff", + "description": "A data diff will be taken of the data resulting from a grep on each file. 
Parameters:\n config: the config file to use\n", + "arguments": ["config"] + }, + "environment": { + "image": "ubuntu", + "interactive": false, + "script": "virtuoso/data-diff.sh", + "join_networks": true + }, + "mounts": { + "app": "/project/" + } + }, { "documentation": { "command": "dump-database", diff --git a/scripts/virtuoso/data-diff.sh b/scripts/virtuoso/data-diff.sh new file mode 100755 index 0000000..dd54c90 --- /dev/null +++ b/scripts/virtuoso/data-diff.sh @@ -0,0 +1,11 @@ +#!/bin/bash +# Diffs the quads that the filters in the JSON config (first argument) select from its "source" and "target" files. +config=${1:?"usage: data-diff CONFIG (path to the JSON config file)"} +apt-get update > /dev/null +apt-get -y install jq python3 > /dev/null + +source=$(jq -r ".source" "$config") +target=$(jq -r ".target" "$config") +# generate-datadiff.py sits next to this script; resolve it from $0 so the working directory does not matter. +command=$(python3 "$(dirname "$0")/generate-datadiff.py" "$config") +diff <(eval "$command" < "$source" | sort) <(eval "$command" < "$target" | sort) diff --git a/scripts/virtuoso/generate-datadiff.py b/scripts/virtuoso/generate-datadiff.py new file mode 100644 index 0000000..7429924 --- /dev/null +++ b/scripts/virtuoso/generate-datadiff.py @@ -0,0 +1,14 @@ +# Prints the grep pipeline that data-diff.sh evals to filter a quad file; the config path is argv[1]. +import sys +import json +import shlex + +with open(sys.argv[1]) as f: + config = json.load(f) +# Keep lines ending in "<graph> ." for any listed graph; the group keeps "|" from splitting the whole pattern. +graph_regex = "<({})> \\.$".format("|".join(config['graphs'])) +grep_commands = f'grep -E {shlex.quote(graph_regex)}' +for regex in config['graphRegexes']: + grep_commands += f' | grep -E {shlex.quote(regex)}' + +print(grep_commands) From 8039cd16c4c92b531e69b6b9a6bd57da2bb8f5c2 Mon Sep 17 00:00:00 2001 From: Jan-Pieter Baert Date: Tue, 23 Aug 2022 20:04:36 +0200 Subject: [PATCH 3/4] Add config example to data-diff script --- scripts/virtuoso/data-diff-config-example.json | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 scripts/virtuoso/data-diff-config-example.json diff --git a/scripts/virtuoso/data-diff-config-example.json b/scripts/virtuoso/data-diff-config-example.json new file mode 100644 index 0000000..a188487 --- /dev/null +++ b/scripts/virtuoso/data-diff-config-example.json @@ -0,0 +1,7 @@ +{ "source": "/project/data/file1.nq", + "target": "/project/data/file2.nq", + 
"graphs": [ + "http://mu.semte.ch/application" + ], + "graphRegexes": ["http://mu.semte.ch/vocabularies/ext/tabId"] +} From 28756c760e9a2d434e26a488e55e8a17a041ce39 Mon Sep 17 00:00:00 2001 From: Jan-Pieter Baert Date: Wed, 24 Aug 2022 16:20:23 +0200 Subject: [PATCH 4/4] Make dump script output to a single quad file --- scripts/virtuoso/dump-database.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/virtuoso/dump-database.sh b/scripts/virtuoso/dump-database.sh index a7a0c10..d7ceb2e 100755 --- a/scripts/virtuoso/dump-database.sh +++ b/scripts/virtuoso/dump-database.sh @@ -76,4 +76,5 @@ CREATE PROCEDURE dump_nquads dump_nquads ('dumps', 1, 100000000, 1); exit; EOF -gunzip /project/data/db/dumps/* +gunzip /project/data/db/dumps/output*.nq.gz +cat /project/data/db/dumps/output*.nq > /project/data/dumped-quads.nq