Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add mu scripts to verify data sync #9

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions scripts/config.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,38 @@
{
"version": "0.1",
"scripts": [
{
"documentation": {
"command": "data-diff",
"description": "A data diff will be taken of the data resulting from a grep on each file. Parameters:\n config: the config file to use\n",
"arguments": ["config"]
},
"environment": {
"image": "ubuntu",
"interactive": false,
"script": "virtuoso/data-diff.sh",
"join_networks": true
},
"mounts": {
"app": "/project/"
}
},
{
"documentation": {
"command": "dump-database",
"description": "A virtuoso dump will be created db/dumpss.\n Parameters:\n hostname: default triplestore\n username: default dba\n password: default dba",
"arguments": ["hostname", "username", "password"]
},
"environment": {
"image": "redpencil/virtuoso",
"interactive": false,
"script": "virtuoso/dump-database.sh",
"join_networks": true
},
"mounts": {
"app": "/project/"
}
},
{
"documentation": {
"command": "create-backup",
Expand Down
7 changes: 7 additions & 0 deletions scripts/virtuoso/data-diff-config-example.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{ "source": "/project/data/file1.nq",
"target": "/project/data/file2.nq",
"graphs": [
"http://mu.semte.ch/application"
],
"graphRegexes": ["http://mu.semte.ch/vocabularies/ext/tabId"]
}
11 changes: 11 additions & 0 deletions scripts/virtuoso/data-diff.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash
# Diff two N-Quads files after passing both through the same grep pipeline.
#
# Usage: data-diff <config>
#   config: JSON file with "source" and "target" file paths plus the
#           "graphs" / "graphRegexes" lists consumed by generate-datadiff.py.
#
# The exit status is the one returned by diff.

# Without a config file jq would fall back to reading stdin, so bail out early.
if [[ "$#" -lt 1 ]]; then
  echo "Usage: data-diff <config>"
  exit 1
fi

apt-get update > /dev/null
apt-get -y install jq python3 > /dev/null

config=$1
source=$(jq -r ".source" "$config")
target=$(jq -r ".target" "$config")

# generate-datadiff.py prints a chain of grep filters joined by pipes; it has
# to go through eval so the pipes inside the string are honoured.
command=$(python3 generate-datadiff.py "$config")

# Quoting keeps paths with spaces intact and stops the shell from glob-expanding
# the regexes before eval re-parses them.
diff <(eval "$command" < "$source" | sort) <(eval "$command" < "$target" | sort)
80 changes: 80 additions & 0 deletions scripts/virtuoso/dump-database.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
#!/bin/bash
# Dump the quads of a Virtuoso triplestore by running dump_nquads through isql-v.
#
# Positional arguments (all optional):
#   $1 hostname  host to connect to   (default: triplestore)
#   $2 username  isql user            (default: dba)
#   $3 password  isql password        (default: dba)
USERNAME=${2:-"dba"}
PASSWORD=${3:-"dba"}
TRIPLESTORE=${1:-"triplestore"}

# Up to three arguments are valid; only a fourth one is a usage error.
# (The previous -ge 3 test rejected the fully specified hostname/username/password call.)
if [[ "$#" -gt 3 ]]; then
  echo "Usage:"
  echo "  mu script triplestore [hostname] [username] [password]"
  exit -1;
fi

# The dump files are expected under data/db/dumps of the mounted project.
if [[ -d "/project/data/db" ]];then
  mkdir -p /project/data/db/dumps
else
  echo "WARNING:"
  echo "  did not find data/db folder in your project, so did not create data/db/dumps!"
  echo " "
fi


echo "connecting to $TRIPLESTORE with $USERNAME"
# Quoted so credentials or hostnames containing spaces or glob characters
# reach isql-v as single arguments.
isql-v -H "$TRIPLESTORE" -U "$USERNAME" -P "$PASSWORD" <<EOF
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

our db normally already has this procedure, so no need to add it again?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We've been testing this on Kaleidos and indeed just calling dump_nquads without re-defining the procedure works.

-- dump_nquads: writes every quad returned by the SPARQL query below into
-- numbered files named output000001.nq, output000002.nq and so on inside dir.
--   dir                directory the files are written to, default dumps
--   start_from         number used for the first output file, default 1
--   file_length_limit  buffer length at which the buffer is written out, default 100000000
--   comp               when nonzero each file is gzip compressed and the plain file is deleted, default 1
CREATE PROCEDURE dump_nquads
( IN dir VARCHAR := 'dumps'
, IN start_from INT := 1
, IN file_length_limit INTEGER := 100000000
, IN comp INT := 1
)
{
DECLARE inx, ses_len INT
; DECLARE file_name VARCHAR
; DECLARE env, ses ANY
;

-- inx is the running file number
inx := start_from;
-- the dump reads with uncommitted isolation
SET isolation = 'uncommitted';
-- env is handed to http_nquad and reset whenever a new file is started
env := vector (0,0,0);
-- ses is the string output that http_nquad writes into, presumably one N-Quads line per call
ses := string_output (10000000);
-- walk over all quads of all graphs except the virtrdf: graph
FOR (SELECT * FROM (sparql define input:storage "" SELECT ?s ?p ?o ?g { GRAPH ?g { ?s ?p ?o } . FILTER ( ?g != virtrdf: ) } ) AS sub OPTION (loop)) DO
{
-- on SQLSTATE 22023 the current quad is skipped by jumping to the next label
DECLARE EXIT HANDLER FOR SQLSTATE '22023'
{
GOTO next;
};
http_nquad (env, "s", "p", "o", "g", ses);
ses_len := LENGTH (ses);
-- once the buffer reaches the limit it is written to the next numbered file
IF (ses_len >= file_length_limit)
{
file_name := sprintf ('%s/output%06d.nq', dir, inx);
-- NOTE review: the third argument -2 is believed to select overwrite mode, confirm in the Virtuoso docs
string_to_file (file_name, ses, -2);
IF (comp)
{
gz_compress_file (file_name, file_name||'.gz');
file_delete (file_name);
}
-- advance the file number and start over with a fresh env and buffer
inx := inx + 1;
env := vector (0,0,0);
ses := string_output (10000000);
}
next:;
}
-- write out whatever is left in the buffer after the last quad
IF (length (ses))
{
file_name := sprintf ('%s/output%06d.nq', dir, inx);
string_to_file (file_name, ses, -2);
IF (comp)
{
gz_compress_file (file_name, file_name||'.gz');
file_delete (file_name);
}
inx := inx + 1;
env := vector (0,0,0);
}
}
;
dump_nquads ('dumps', 1, 100000000, 1);
exit;
EOF
# Unpack the compressed dump files. -f overwrites output*.nq files left behind
# by an earlier run; without it gunzip refuses to replace them and the stale
# content would end up in the combined file.
gunzip -f /project/data/db/dumps/*.gz
# Only concatenate the files written by dump_nquads (output000001.nq, ...), not
# whatever else happens to live in the dumps folder.
# NOTE: files from an earlier, larger dump are not cleaned up here; empty the
# dumps folder first if the number of output files may have shrunk.
cat /project/data/db/dumps/output*.nq > /project/data/dumped-quads.nq
14 changes: 14 additions & 0 deletions scripts/virtuoso/generate-datadiff.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import sys
import json

def build_grep_command(config):
    """Build the shell filter pipeline used to compare two N-Quads files.

    Parameters:
        config: dict with
            'graphs'       -- list of graph URIs; a line is kept when it ends
                              in one of these graphs followed by " ." .
            'graphRegexes' -- list of extended regexes; each one adds a
                              further grep stage the line must also match.

    Returns:
        A string such as
        'grep -E "<(g1|g2)> .$" | grep -E "re1"', meant to be eval'ed by the
        calling shell script.

    Raises:
        KeyError: when 'graphs' or 'graphRegexes' is missing from config.
    """
    # The alternation must be grouped: "<a|b> .$" would match "<a" anywhere
    # OR "b> .$", instead of "<a> .$" OR "<b> .$".
    graph_regex = "<({})> .$".format("|".join(config['graphs']))
    # grep -E is the supported spelling of the obsolescent egrep.
    grep_commands = f'grep -E "{graph_regex}"'
    for regex in config['graphRegexes']:
        grep_commands += f' | grep -E "{regex}"'
    return grep_commands


if __name__ == "__main__":
    # Usage: python3 generate-datadiff.py <config.json>
    config_file = sys.argv[1]
    with open(config_file) as f:
        config = json.load(f)

    print(build_grep_command(config))