Add parallel tests #192

Merged: 6 commits, Oct 9, 2023
Changes from 4 commits
8 changes: 7 additions & 1 deletion CMakeLists.txt
@@ -192,7 +192,13 @@ add_custom_target(
# TEST
add_custom_target(
test
COMMAND ${CMAKE_SOURCE_DIR}/scripts/run_all_tests.sh
COMMAND ${CMAKE_SOURCE_DIR}/scripts/run_all_tests.sh --regression && ${CMAKE_SOURCE_DIR}/scripts/run_all_tests.sh --parallel
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/test
)

add_custom_target(
test-parallel
COMMAND ${CMAKE_SOURCE_DIR}/scripts/run_all_tests.sh --parallel
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/test
)
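
For reference, a minimal sketch of how these custom CMake targets are typically driven; the `build` directory name below is an assumption, not something defined in this PR:

```
cmake -S . -B build            # assumed out-of-source build directory
make -C build test             # regression schedule, then the parallel schedule
make -C build test-parallel    # parallel schedule only
```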

6 changes: 5 additions & 1 deletion CONTRIBUTING.md
@@ -8,7 +8,11 @@ make test

# only run regression tests that have $FILTER in regression sql file path
make test FILTER=hnsw

# run parallel tests
make test-parallel
```
Running `make test` runs the Lantern regression tests, which run independently of one another. The tests for `make test-parallel` are currently under development and live in `test/parallel`. The goal of the parallel tests is to generate a more realistic workload on the index in order to surface timing errors and other bugs that only appear under more complex use; unlike the regression tests, they all run against the same database.
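
The `test-parallel` target simply runs the test script with the `--parallel` flag from the `test/` directory, so a rough manual equivalent (a sketch, with paths relative to the repository root) is:

```
cd test
../scripts/run_all_tests.sh --parallel
```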

## Running benchmarks
This requires Python to be installed. Please check the `Dockerfile.dev` for pip requirements.
@@ -30,7 +34,7 @@ If you build Lantern in a different directory, make sure to update `.vscode` con

## Debugging the C codebase

If you make changes to the C codebase, in addition to `make test`, you can also use the `livedebug.py` utility in a `tmux` session to easily attach `gdb` to the psql backend and find out what breaks.
If you make changes to the C codebase, in addition to `make test` and `make test-parallel`, you can also use the `livedebug.py` utility in a `tmux` session to easily attach `gdb` to the psql backend and find out what breaks.
Below is a short recording demonstrating the use of `livedebug.py`:

[![asciicast](https://asciinema.org/a/jTsbWdOcTvUl4iAJlAw3Cszbt.svg)](https://asciinema.org/a/jTsbWdOcTvUl4iAJlAw3Cszbt)
42 changes: 37 additions & 5 deletions scripts/run_all_tests.sh
@@ -67,13 +67,33 @@ fi
# Check if pgvector is available
pgvector_installed=$($PSQL -U $DB_USER -d postgres -c "SELECT 1 FROM pg_available_extensions WHERE name = 'vector'" -tA | tail -n 1 | tr -d '\n')

# Settings
REGRESSION=0
PARALLEL=0
while [[ "$#" -gt 0 ]]; do
case $1 in
--regression) REGRESSION=1 ;;
--parallel) PARALLEL=1 ;;
esac
shift
done
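
For context, a sketch of how these flags are used in practice; the invocation paths are assumptions, and `FILTER` is read from the environment elsewhere in the script:

```
cd test
../scripts/run_all_tests.sh --regression              # regression schedule
../scripts/run_all_tests.sh --parallel                # parallel schedule
FILTER=hnsw ../scripts/run_all_tests.sh --regression  # only tests whose sql path matches FILTER
```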

# Generate schedule.txt
rm -rf $TMP_OUTDIR/schedule.txt
if [ "$PARALLEL" -eq 1 ]; then
SCHEDULE='parallel_schedule.txt'
else
SCHEDULE='schedule.txt'
fi
if [ -n "$FILTER" ]; then
if [[ "$pgvector_installed" == "1" ]]; then
TEST_FILES=$(cat schedule.txt | grep -E '^(test:|test_pgvector:)' | sed -E -e 's/^test:|test_pgvector://' | tr " " "\n" | sed -e '/^$/d')
if [ "$PARALLEL" -eq 1 ]; then
TEST_FILES=$(cat $SCHEDULE | grep -E '^(test:|test_begin:|test_end:)' | sed -E -e 's/^test:|test_begin:|test_end://' | tr " " "\n" | sed -e '/^$/d')
else
TEST_FILES=$(cat schedule.txt | grep '^test:' | sed -e 's/^test://' | tr " " "\n" | sed -e '/^$/d')
if [[ "$pgvector_installed" == "1" ]]; then
TEST_FILES=$(cat $SCHEDULE | grep -E '^(test:|test_pgvector:)' | sed -E -e 's/^test:|test_pgvector://' | tr " " "\n" | sed -e '/^$/d')
Contributor: Unrelated to this PR, but still important: do we currently run the pgvector compat tests anywhere in the CI/CD or release pipeline?

Collaborator: Yes, we run them in the pipeline, since we install pgvector there.

else
TEST_FILES=$(cat $SCHEDULE | grep '^test:' | sed -e 's/^test://' | tr " " "\n" | sed -e '/^$/d')
fi
fi

while IFS= read -r f; do
@@ -95,11 +115,18 @@ else
if [ "$pgvector_installed" == "1" ]; then
echo "test: $test_name" >> $TMP_OUTDIR/schedule.txt
fi
elif [[ "$line" =~ ^test_begin: ]]; then
test_name=$(echo "$line" | sed -e 's/test_begin:/test:/')
echo "$test_name" >> $TMP_OUTDIR/schedule.txt
elif [[ "$line" =~ ^test_end: ]]; then
test_name=$(echo "$line" | sed -e 's/test_end:/test:/')
echo "$test_name" >> $TMP_OUTDIR/schedule.txt
else
echo "$line" >> $TMP_OUTDIR/schedule.txt
fi
done < schedule.txt
done < $SCHEDULE
fi
unset $SCHEDULE
SCHEDULE=$TMP_OUTDIR/schedule.txt
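
To illustrate the schedule rewrite: `test_begin:` and `test_end:` lines become plain `test:` lines in the generated schedule, so pg_regress runs the setup test first, the parallel group next, and the teardown test last. A sketch using the schedule added in this PR:

```
# test/parallel_schedule.txt (input)     # $TMP_OUTDIR/schedule.txt (generated)
test_begin: begin                        test: begin
test: insert insert2 insert3 select      test: insert insert2 insert3 select
test_end: end                            test: end
```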

function print_diff {
Expand All @@ -116,4 +143,9 @@ function print_diff {

trap print_diff ERR

DB_USER=$DB_USER $(pg_config --pkglibdir)/pgxs/src/test/regress/pg_regress --user=$DB_USER --schedule=$SCHEDULE --outputdir=$TMP_OUTDIR --launcher=./test_runner.sh
if [ "$PARALLEL" -eq 1 ]; then
cd parallel
DB_USER=$DB_USER $(pg_config --pkglibdir)/pgxs/src/test/regress/pg_regress --user=$DB_USER --schedule=$SCHEDULE --outputdir=$TMP_OUTDIR --launcher=./parallel_test_runner.sh
else
DB_USER=$DB_USER $(pg_config --pkglibdir)/pgxs/src/test/regress/pg_regress --user=$DB_USER --schedule=$SCHEDULE --outputdir=$TMP_OUTDIR --launcher=./test_runner.sh
fi
22 changes: 22 additions & 0 deletions test/parallel/expected/begin.out
@@ -0,0 +1,22 @@
\ir utils/sift10k_array.sql
CREATE TABLE IF NOT EXISTS sift_base10k (
id SERIAL PRIMARY KEY,
v REAL[128]
);
\copy sift_base10k (v) FROM '/tmp/lantern/vector_datasets/siftsmall_base_arrays.csv' with csv;
\ir utils/random_array.sql
CREATE OR REPLACE FUNCTION random_int_array(dim integer, min integer, max integer) RETURNS integer[] AS $BODY$
begin
return (select array_agg(round(random() * (max - min)) + min) from generate_series (0, dim - 1));
end
$BODY$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION random_array(dim integer, min real, max real) RETURNS REAL[] AS $BODY$
begin
return (select array_agg(random() * (max - min) + min) from generate_series (0, dim - 1));
end
$BODY$ LANGUAGE plpgsql;
CREATE SEQUENCE serial START 10001;
CREATE INDEX ON sift_base10k USING HNSW (v) WITH (M=5, ef=20, ef_construction=20);
INFO: done init usearch index
INFO: inserted 10000 elements
INFO: done saving 10000 vectors
12 changes: 12 additions & 0 deletions test/parallel/expected/end.out
@@ -0,0 +1,12 @@
SELECT COUNT(*) FROM sift_base10k;
count
-------
10030
(1 row)

SELECT * from sift_base10k WHERE id=4444;
id | v
------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
4444 | {55,61,11,4,5,2,13,24,65,49,13,9,23,37,94,38,54,11,14,14,40,31,50,44,53,4,0,0,27,17,8,34,12,10,4,4,22,52,68,53,9,2,0,0,2,116,119,64,119,2,0,0,2,30,119,119,116,5,0,8,47,9,5,60,7,7,10,23,56,50,23,5,28,68,6,18,24,65,50,9,119,75,3,0,1,8,12,85,119,11,4,6,8,9,5,74,25,11,8,20,18,12,2,21,11,90,25,32,33,15,2,9,84,67,8,4,22,31,11,33,119,30,3,6,0,0,0,26}
(1 row)

13 changes: 13 additions & 0 deletions test/parallel/expected/insert.out
@@ -0,0 +1,13 @@
BEGIN;
INSERT INTO sift_base10k (id, v) VALUES
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128));
COMMIT;
13 changes: 13 additions & 0 deletions test/parallel/expected/insert2.out
@@ -0,0 +1,13 @@
BEGIN;
INSERT INTO sift_base10k (id, v) VALUES
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128));
COMMIT;
13 changes: 13 additions & 0 deletions test/parallel/expected/insert3.out
@@ -0,0 +1,13 @@
BEGIN;
INSERT INTO sift_base10k (id, v) VALUES
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128));
COMMIT;
28 changes: 28 additions & 0 deletions test/parallel/expected/select.out
@@ -0,0 +1,28 @@
SELECT v AS v1111 FROM sift_base10k WHERE id = 1111 \gset
SELECT v AS v2222 FROM sift_base10k WHERE id = 2222 \gset
SELECT v AS v3333 FROM sift_base10k WHERE id = 3333 \gset
SELECT v AS v4444 FROM sift_base10k WHERE id = 4444 \gset
SELECT id FROM sift_base10k ORDER BY v <-> :'v1111' ASC LIMIT 1;
id
------
1111
(1 row)

SELECT id FROM sift_base10k ORDER BY v <-> :'v2222' ASC LIMIT 1;
id
------
2222
(1 row)

SELECT id FROM sift_base10k ORDER BY v <-> :'v3333' ASC LIMIT 1;
id
------
3333
(1 row)

SELECT id FROM sift_base10k ORDER BY v <-> :'v4444' ASC LIMIT 1;
id
------
4444
(1 row)

52 changes: 52 additions & 0 deletions test/parallel/parallel_test_runner.sh
@@ -0,0 +1,52 @@
#!/usr/bin/env bash

# Get current test file name
TESTFILE_NAME=${PGAPPNAME##pg_regress/}
# Set different name for each test database
# As pg_regress does not support cleaning db after each test
TEST_CASE_DB="ldb_parallel"
# Set database user
if [ -z $DB_USER ]
then
echo "ERROR: DB_USER environment variable is not set before test_runner.sh is run by pg_regress"
exit 1
fi

# Drop db after each test on exit signal
function drop_db {
cat <<EOF | psql "$@" -U ${DB_USER} -d postgres -v ECHO=none >/dev/null 2>&1
SET client_min_messages=ERROR;
DROP DATABASE "${TEST_CASE_DB}";
EOF
}

if [[ "$TESTFILE_NAME" =~ ^end ]]; then
trap drop_db EXIT
fi


# Change directory to sql so sql imports will work correctly
cd sql/
# install lantern extension
if [[ "$TESTFILE_NAME" =~ ^begin ]]; then
psql "$@" -U ${DB_USER} -d postgres -v ECHO=none -q -c "CREATE DATABASE ${TEST_CASE_DB};" 2>/dev/null
psql "$@" -U ${DB_USER} -d ${TEST_CASE_DB} -v ECHO=none -q -c "SET client_min_messages=error; CREATE EXTENSION lantern;" 2>/dev/null
#psql "$@" -U ${DB_USER} -d ${TEST_CASE_DB} -v ECHO=none -q -f utils/common.sql 2>/dev/null
fi

# Exclude debug/inconsistent output from psql
# So tests will always have the same output
psql -U ${DB_USER} \
-v ON_ERROR_STOP=1 \
-v VERBOSITY=terse \
-v ECHO=all \
"$@" -d ${TEST_CASE_DB} 2>&1 | \
sed -e 's! Memory: [0-9]\{1,\}kB!!' \
-e 's! Memory Usage: [0-9]\{1,\}kB!!' \
-e 's! Average Peak Memory: [0-9]\{1,\}kB!!' \
-e 's! time=[0-9]\+\.[0-9]\+\.\.[0-9]\+\.[0-9]\+!!' | \
grep -v 'DEBUG: rehashing catalog cache id' | \
grep -Gv '^ Planning Time:' | \
grep -Gv '^ Execution Time:' | \
# Only print debug messages followed by LANTERN
perl -nle'print if !m{DEBUG:(?!.*LANTERN)}'
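
As a small sanity check of the filtering above: the final `perl` expression keeps non-DEBUG lines and only those DEBUG lines that mention LANTERN. The sample input below is an assumption, not real Lantern output:

```
printf 'DEBUG:  rehashing catalog cache id 42\nDEBUG:  LANTERN: inserted tuple\nSELECT 1\n' | \
  perl -nle'print if !m{DEBUG:(?!.*LANTERN)}'
# prints:
#   DEBUG:  LANTERN: inserted tuple
#   SELECT 1
```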
5 changes: 5 additions & 0 deletions test/parallel/sql/begin.sql
@@ -0,0 +1,5 @@
\ir utils/sift10k_array.sql
\ir utils/random_array.sql

CREATE SEQUENCE serial START 10001;
CREATE INDEX ON sift_base10k USING HNSW (v) WITH (M=5, ef=20, ef_construction=20);
2 changes: 2 additions & 0 deletions test/parallel/sql/end.sql
@@ -0,0 +1,2 @@
SELECT COUNT(*) FROM sift_base10k;
SELECT * from sift_base10k WHERE id=4444;
13 changes: 13 additions & 0 deletions test/parallel/sql/insert.sql
@@ -0,0 +1,13 @@
BEGIN;
INSERT INTO sift_base10k (id, v) VALUES
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128));
COMMIT;
13 changes: 13 additions & 0 deletions test/parallel/sql/insert2.sql
@@ -0,0 +1,13 @@
BEGIN;
INSERT INTO sift_base10k (id, v) VALUES
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128));
COMMIT;
13 changes: 13 additions & 0 deletions test/parallel/sql/insert3.sql
@@ -0,0 +1,13 @@
BEGIN;
INSERT INTO sift_base10k (id, v) VALUES
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128));
COMMIT;
8 changes: 8 additions & 0 deletions test/parallel/sql/select.sql
@@ -0,0 +1,8 @@
SELECT v AS v1111 FROM sift_base10k WHERE id = 1111 \gset
SELECT v AS v2222 FROM sift_base10k WHERE id = 2222 \gset
SELECT v AS v3333 FROM sift_base10k WHERE id = 3333 \gset
SELECT v AS v4444 FROM sift_base10k WHERE id = 4444 \gset
SELECT id FROM sift_base10k ORDER BY v <-> :'v1111' ASC LIMIT 1;
SELECT id FROM sift_base10k ORDER BY v <-> :'v2222' ASC LIMIT 1;
SELECT id FROM sift_base10k ORDER BY v <-> :'v3333' ASC LIMIT 1;
SELECT id FROM sift_base10k ORDER BY v <-> :'v4444' ASC LIMIT 1;
11 changes: 11 additions & 0 deletions test/parallel/sql/utils/random_array.sql
@@ -0,0 +1,11 @@
CREATE OR REPLACE FUNCTION random_int_array(dim integer, min integer, max integer) RETURNS integer[] AS $BODY$
Contributor: I feel like this would be useful in regular tests as well. It should be in the utils for the regular tests too.

begin
return (select array_agg(round(random() * (max - min)) + min) from generate_series (0, dim - 1));
end
$BODY$ LANGUAGE plpgsql;

CREATE OR REPLACE FUNCTION random_array(dim integer, min real, max real) RETURNS REAL[] AS $BODY$
begin
return (select array_agg(random() * (max - min) + min) from generate_series (0, dim - 1));
end
$BODY$ LANGUAGE plpgsql;
5 changes: 5 additions & 0 deletions test/parallel/sql/utils/sift10k_array.sql
@@ -0,0 +1,5 @@
CREATE TABLE IF NOT EXISTS sift_base10k (
id SERIAL PRIMARY KEY,
v REAL[128]
);
\copy sift_base10k (v) FROM '/tmp/lantern/vector_datasets/siftsmall_base_arrays.csv' with csv;
8 changes: 8 additions & 0 deletions test/parallel_schedule.txt
@@ -0,0 +1,8 @@
# parallel_schedule.txt rules:
# - every test that needs to be run must appear in a 'test:' line
# - every test that needs to be run iff pgvector is installed appears in a 'test_pgvector:' line
# - 'test' lines may have multiple space-separated tests; all tests in a single 'test' line will be run in parallel
# - 'test_begin:' and 'test_end:' lines hold the setup and teardown tests that run before and after the parallel 'test' lines

test_begin: begin
test: insert insert2 insert3 select
test_end: end
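
To extend the suite, a new test file goes into `test/parallel/sql/` and is added to the parallel group; the `update` test below is hypothetical and only illustrates the format:

```
test_begin: begin
test: insert insert2 insert3 select update
test_end: end
```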
2 changes: 1 addition & 1 deletion test/sql/utils/small_world_array.sql
@@ -12,4 +12,4 @@ INSERT INTO small_world (id, b, v) VALUES
('100', FALSE, '{1,0,0}'),
('101', FALSE, '{1,0,1}'),
('110', FALSE, '{1,1,0}'),
('111', TRUE, '{1,1,1}');
('111', TRUE, '{1,1,1}');