Skip to content

Commit

Permalink
Add parallel tests (#192)
Browse files Browse the repository at this point in the history
Uses pg_regress to run tests in parallel against the database.
Allows custom DB initialization and finalization, which can be used to load relevant data at the beginning
and check relevant invariants at the end.
  • Loading branch information
ezra-varady authored Oct 9, 2023
1 parent 6c48316 commit 9e3e17b
Show file tree
Hide file tree
Showing 24 changed files with 351 additions and 19 deletions.
8 changes: 7 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,13 @@ add_custom_target(
# TEST
add_custom_target(
test
COMMAND ${CMAKE_SOURCE_DIR}/scripts/run_all_tests.sh
COMMAND ${CMAKE_SOURCE_DIR}/scripts/run_all_tests.sh --regression
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/test
)

# Run the parallel regression tests (pg_regress workload in test/parallel).
# Uses PROJECT_SOURCE_DIR rather than CMAKE_SOURCE_DIR so the target keeps
# working if this project is ever built as a subproject of a larger build.
add_custom_target(
test-parallel
COMMAND ${PROJECT_SOURCE_DIR}/scripts/run_all_tests.sh --parallel
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/test
)

Expand Down
6 changes: 5 additions & 1 deletion CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,11 @@ make test

# only run regression tests that have $FILTER in regression sql file path
make test FILTER=hnsw

# run parallel tests
make test-parallel
```
Running `make test` will run the lantern regression tests; these run independently of one another. The tests for `make test-parallel` are currently under development and can be found in `test/parallel`. The goal of the parallel tests is to generate a more realistic workload on the index in order to discover timing errors and other bugs that depend on more complex use; unlike the regression tests, they all run in the same database.

## Running benchmarks
This requires Python to be installed. Please check the `Dockerfile.dev` for pip requirements.
Expand All @@ -30,7 +34,7 @@ If you build Lantern in a different directory, make sure to update `.vscode` con

## Debugging the C codebase

If you make changes to the C codebase, in addition to `make test`, you can also use the `livedebug.py` utility in a `tmux` session to easily attach `gdb` to the psql backend and find out what breaks.
If you make changes to the C codebase, in addition to `make test` and `make test-parallel`, you can also use the `livedebug.py` utility in a `tmux` session to easily attach `gdb` to the psql backend and find out what breaks.
Below is a short recording demonstrating the use of `livedebug.py`:

[![asciicast](https://asciinema.org/a/jTsbWdOcTvUl4iAJlAw3Cszbt.svg)](https://asciinema.org/a/jTsbWdOcTvUl4iAJlAw3Cszbt)
Expand Down
1 change: 1 addition & 0 deletions ci/scripts/run-tests-linux.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,5 +27,6 @@ echo "port = 5432" >> ${PGDATA}/postgresql.conf
GCOV_PREFIX=$WORKDIR/build/CMakeFiles/lantern.dir/ GCOV_PREFIX_STRIP=5 POSTGRES_HOST_AUTH_METHOD=trust /usr/lib/postgresql/$PG_VERSION/bin/postgres 1>/tmp/pg-out.log 2>/tmp/pg-error.log &
# Wait for start and run tests
wait_for_pg && cd $WORKDIR/build && make test && \
make test-parallel && \
killall postgres && \
gcovr -r $WORKDIR/src/ --object-directory $WORKDIR/build/ --xml /tmp/coverage.xml
2 changes: 1 addition & 1 deletion ci/scripts/run-tests-mac.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,4 @@ wait_for_pg(){
# Start database
brew services start postgresql@$PG_VERSION

wait_for_pg && cd $WORKDIR/build && make test
wait_for_pg && cd $WORKDIR/build && make test && make test-parallel
42 changes: 37 additions & 5 deletions scripts/run_all_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -67,13 +67,33 @@ fi
# Check if pgvector is available
pgvector_installed=$($PSQL -U $DB_USER -d postgres -c "SELECT 1 FROM pg_available_extensions WHERE name = 'vector'" -tA | tail -n 1 | tr -d '\n')

# Settings
REGRESSION=0
PARALLEL=0
while [[ "$#" -gt 0 ]]; do
case $1 in
--regression) REGRESSION=1 ;;
--parallel) PARALLEL=1 ;;
esac
shift
done

# Generate schedule.txt
rm -rf $TMP_OUTDIR/schedule.txt
if [ "$PARALLEL" -eq 1 ]; then
SCHEDULE='parallel_schedule.txt'
else
SCHEDULE='schedule.txt'
fi
if [ -n "$FILTER" ]; then
if [[ "$pgvector_installed" == "1" ]]; then
TEST_FILES=$(cat schedule.txt | grep -E '^(test:|test_pgvector:)' | sed -E -e 's/^test:|test_pgvector://' | tr " " "\n" | sed -e '/^$/d')
if [ "$PARALLEL" -eq 1 ]; then
TEST_FILES=$(cat $SCHEDULE | grep -E '^(test:|test_begin:|test_end:)' | sed -E -e 's/^test:|test_begin:|test_end://' | tr " " "\n" | sed -e '/^$/d')
else
TEST_FILES=$(cat schedule.txt | grep '^test:' | sed -e 's/^test://' | tr " " "\n" | sed -e '/^$/d')
if [[ "$pgvector_installed" == "1" ]]; then
TEST_FILES=$(cat $SCHEDULE | grep -E '^(test:|test_pgvector:)' | sed -E -e 's/^test:|test_pgvector://' | tr " " "\n" | sed -e '/^$/d')
else
TEST_FILES=$(cat $SCHEDULE | grep '^test:' | sed -e 's/^test://' | tr " " "\n" | sed -e '/^$/d')
fi
fi

while IFS= read -r f; do
Expand All @@ -95,11 +115,18 @@ else
if [ "$pgvector_installed" == "1" ]; then
echo "test: $test_name" >> $TMP_OUTDIR/schedule.txt
fi
elif [[ "$line" =~ ^test_begin: ]]; then
test_name=$(echo "$line" | sed -e 's/test_begin:/test:/')
echo "$test_name" >> $TMP_OUTDIR/schedule.txt
elif [[ "$line" =~ ^test_end: ]]; then
test_name=$(echo "$line" | sed -e 's/test_end:/test:/')
echo "$test_name" >> $TMP_OUTDIR/schedule.txt
else
echo "$line" >> $TMP_OUTDIR/schedule.txt
fi
done < schedule.txt
done < $SCHEDULE
fi
# Done reading the template schedule; from here on SCHEDULE points at the
# generated schedule that pg_regress will consume.
# NOTE: 'unset' takes a variable NAME. The previous form 'unset $SCHEDULE'
# expanded to e.g. 'unset parallel_schedule.txt' — an invalid identifier —
# so the variable was never actually cleared (and would error under 'set -e').
unset SCHEDULE
SCHEDULE=$TMP_OUTDIR/schedule.txt

function print_diff {
Expand All @@ -116,4 +143,9 @@ function print_diff {

trap print_diff ERR

DB_USER=$DB_USER $(pg_config --pkglibdir)/pgxs/src/test/regress/pg_regress --user=$DB_USER --schedule=$SCHEDULE --outputdir=$TMP_OUTDIR --launcher=./test_runner.sh
if [ "$PARALLEL" -eq 1 ]; then
cd parallel
PARALLEL=$PARALLEL DB_USER=$DB_USER $(pg_config --pkglibdir)/pgxs/src/test/regress/pg_regress --user=$DB_USER --schedule=$SCHEDULE --outputdir=$TMP_OUTDIR --launcher=../test_runner.sh
else
PARALLEL=$PARALLEL DB_USER=$DB_USER $(pg_config --pkglibdir)/pgxs/src/test/regress/pg_regress --user=$DB_USER --schedule=$SCHEDULE --outputdir=$TMP_OUTDIR --launcher=./test_runner.sh
fi
23 changes: 23 additions & 0 deletions test/parallel/expected/begin.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
-- This file handles initializing the database before parallel tests are run
\ir utils/sift10k_array.sql
CREATE TABLE IF NOT EXISTS sift_base10k (
id SERIAL PRIMARY KEY,
v REAL[128]
);
\copy sift_base10k (v) FROM '/tmp/lantern/vector_datasets/siftsmall_base_arrays.csv' with csv;
\ir utils/random_array.sql
CREATE OR REPLACE FUNCTION random_int_array(dim integer, min integer, max integer) RETURNS integer[] AS $BODY$
begin
return (select array_agg(round(random() * (max - min)) + min) from generate_series (0, dim - 1));
end
$BODY$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION random_array(dim integer, min real, max real) RETURNS REAL[] AS $BODY$
begin
return (select array_agg(random() * (max - min) + min) from generate_series (0, dim - 1));
end
$BODY$ LANGUAGE plpgsql;
CREATE SEQUENCE serial START 10001;
CREATE INDEX ON sift_base10k USING HNSW (v) WITH (M=5, ef=20, ef_construction=20);
INFO: done init usearch index
INFO: inserted 10000 elements
INFO: done saving 10000 vectors
13 changes: 13 additions & 0 deletions test/parallel/expected/end.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
-- This file contains invariants to be checked after the parallel tests have run
SELECT COUNT(*) FROM sift_base10k;
count
-------
10030
(1 row)

SELECT * from sift_base10k WHERE id=4444;
id | v
------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
4444 | {55,61,11,4,5,2,13,24,65,49,13,9,23,37,94,38,54,11,14,14,40,31,50,44,53,4,0,0,27,17,8,34,12,10,4,4,22,52,68,53,9,2,0,0,2,116,119,64,119,2,0,0,2,30,119,119,116,5,0,8,47,9,5,60,7,7,10,23,56,50,23,5,28,68,6,18,24,65,50,9,119,75,3,0,1,8,12,85,119,11,4,6,8,9,5,74,25,11,8,20,18,12,2,21,11,90,25,32,33,15,2,9,84,67,8,4,22,31,11,33,119,30,3,6,0,0,0,26}
(1 row)

13 changes: 13 additions & 0 deletions test/parallel/expected/insert.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
BEGIN;
INSERT INTO sift_base10k (id, v) VALUES
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128));
COMMIT;
13 changes: 13 additions & 0 deletions test/parallel/expected/insert2.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
BEGIN;
INSERT INTO sift_base10k (id, v) VALUES
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128));
COMMIT;
13 changes: 13 additions & 0 deletions test/parallel/expected/insert3.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
BEGIN;
INSERT INTO sift_base10k (id, v) VALUES
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128));
COMMIT;
38 changes: 38 additions & 0 deletions test/parallel/expected/select.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
SELECT v AS v1111 FROM sift_base10k WHERE id = 1111 \gset
SELECT v AS v2222 FROM sift_base10k WHERE id = 2222 \gset
SELECT v AS v3333 FROM sift_base10k WHERE id = 3333 \gset
SELECT v AS v4444 FROM sift_base10k WHERE id = 4444 \gset
-- Make sure that our index queries will actually run against the index
EXPLAIN (COSTS false) SELECT id FROM sift_base10k ORDER BY v <-> :'v1111' ASC LIMIT 1;
QUERY PLAN
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Limit
-> Index Scan using sift_base10k_v_idx on sift_base10k
Order By: (v <-> '{21,24,5,0,0,26,22,6,16,16,10,9,0,18,114,19,13,13,9,1,2,53,111,19,39,32,5,0,4,9,10,13,6,10,8,0,2,130,77,4,2,0,0,0,3,130,130,11,130,0,0,0,0,37,130,84,130,5,0,1,17,11,4,28,17,39,3,3,30,77,28,3,20,0,0,1,49,125,13,7,130,6,0,0,0,5,11,61,130,2,0,1,12,84,48,73,1,12,2,0,31,57,9,2,16,12,1,0,32,36,0,1,63,6,3,1,0,0,24,51,9,0,0,0,0,44,88,48}'::real[])
(3 rows)

-- Do the queries
SELECT id FROM sift_base10k ORDER BY v <-> :'v1111' ASC LIMIT 1;
id
------
1111
(1 row)

SELECT id FROM sift_base10k ORDER BY v <-> :'v2222' ASC LIMIT 1;
id
------
2222
(1 row)

SELECT id FROM sift_base10k ORDER BY v <-> :'v3333' ASC LIMIT 1;
id
------
3333
(1 row)

SELECT id FROM sift_base10k ORDER BY v <-> :'v4444' ASC LIMIT 1;
id
------
4444
(1 row)

6 changes: 6 additions & 0 deletions test/parallel/sql/begin.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
-- This file handles initializing the database before parallel tests are run
\ir utils/sift10k_array.sql
\ir utils/random_array.sql

CREATE SEQUENCE serial START 10001;
CREATE INDEX ON sift_base10k USING HNSW (v) WITH (M=5, ef=20, ef_construction=20);
3 changes: 3 additions & 0 deletions test/parallel/sql/end.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
-- This file contains invariants to be checked after the parallel tests have run
SELECT COUNT(*) FROM sift_base10k;
SELECT * from sift_base10k WHERE id=4444;
13 changes: 13 additions & 0 deletions test/parallel/sql/insert.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
BEGIN;
INSERT INTO sift_base10k (id, v) VALUES
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128));
COMMIT;
13 changes: 13 additions & 0 deletions test/parallel/sql/insert2.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
BEGIN;
INSERT INTO sift_base10k (id, v) VALUES
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128));
COMMIT;
13 changes: 13 additions & 0 deletions test/parallel/sql/insert3.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
BEGIN;
INSERT INTO sift_base10k (id, v) VALUES
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128));
COMMIT;
11 changes: 11 additions & 0 deletions test/parallel/sql/select.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
SELECT v AS v1111 FROM sift_base10k WHERE id = 1111 \gset
SELECT v AS v2222 FROM sift_base10k WHERE id = 2222 \gset
SELECT v AS v3333 FROM sift_base10k WHERE id = 3333 \gset
SELECT v AS v4444 FROM sift_base10k WHERE id = 4444 \gset
-- Make sure that our index queries will actually run against the index
EXPLAIN (COSTS false) SELECT id FROM sift_base10k ORDER BY v <-> :'v1111' ASC LIMIT 1;
-- Do the queries
SELECT id FROM sift_base10k ORDER BY v <-> :'v1111' ASC LIMIT 1;
SELECT id FROM sift_base10k ORDER BY v <-> :'v2222' ASC LIMIT 1;
SELECT id FROM sift_base10k ORDER BY v <-> :'v3333' ASC LIMIT 1;
SELECT id FROM sift_base10k ORDER BY v <-> :'v4444' ASC LIMIT 1;
61 changes: 61 additions & 0 deletions test/parallel/sql/utils/common.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
-- test helper functions that should exist in all test runs live here
-- there is no need to explicitly include this file in other tests as the test runner will
-- run this before running the actual test

CREATE EXTENSION pageinspect;

\set ON_ERROR_STOP on

-- Retrieves details for all indices associated with a given table, similar to \di+.
-- The output of \di+ is not consistent across postgres versions, so tests call this
-- function to get a stable result shape instead.
-- todo:: add a column to this function which returns the number of used DB pages
CREATE OR REPLACE FUNCTION ldb_get_indexes(tblname text)
RETURNS TABLE(
indexname name,
size text,
indexdef text,
total_index_size text
) AS
$BODY$
BEGIN
RETURN QUERY
-- Pre-compute the combined size of all valid indexes on the table once,
-- then join it onto every per-index row below.
WITH total_size_data AS (
SELECT
SUM(pg_relation_size(indexrelid)) as total_size
FROM
pg_index
WHERE
indisvalid
AND indrelid = tblname::regclass
)
SELECT
idx.indexname,
-- per-index on-disk size, human readable
pg_size_pretty(pg_relation_size(idx.indexname::REGCLASS)) as size,
idx.indexdef,
-- same total repeated on each row, human readable
pg_size_pretty(total_size_data.total_size) as total_index_size
FROM
pg_indexes idx,
total_size_data
WHERE
idx.tablename = tblname;
END;
$BODY$
LANGUAGE plpgsql;

-- Determines if the provided SQL query (with an EXPLAIN prefix) uses an "Index Scan"
-- by examining its execution plan. This function helps ensure consistent analysis
-- across varying Postgres versions where EXPLAIN output may differ.
-- Returns true as soon as any plan line contains the substring 'Index Scan'.
CREATE OR REPLACE FUNCTION has_index_scan(explain_query text) RETURNS boolean AS $$
DECLARE
plan_row RECORD;
found boolean := false;
BEGIN
-- EXECUTE the caller-supplied EXPLAIN statement and walk its output rows;
-- each row exposes a single text column named "QUERY PLAN".
FOR plan_row IN EXECUTE explain_query LOOP
IF position('Index Scan' in plan_row."QUERY PLAN") > 0 THEN
found := true;
EXIT; -- no need to scan the remaining plan lines
END IF;
END LOOP;
RETURN found;
END;
$$ LANGUAGE plpgsql;
11 changes: 11 additions & 0 deletions test/parallel/sql/utils/random_array.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
CREATE OR REPLACE FUNCTION random_int_array(dim integer, min integer, max integer) RETURNS integer[] AS $BODY$
begin
return (select array_agg(round(random() * (max - min)) + min) from generate_series (0, dim - 1));
end
$BODY$ LANGUAGE plpgsql;

CREATE OR REPLACE FUNCTION random_array(dim integer, min real, max real) RETURNS REAL[] AS $BODY$
begin
return (select array_agg(random() * (max - min) + min) from generate_series (0, dim - 1));
end
$BODY$ LANGUAGE plpgsql;
5 changes: 5 additions & 0 deletions test/parallel/sql/utils/sift10k_array.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
CREATE TABLE IF NOT EXISTS sift_base10k (
id SERIAL PRIMARY KEY,
v REAL[128]
);
\copy sift_base10k (v) FROM '/tmp/lantern/vector_datasets/siftsmall_base_arrays.csv' with csv;
10 changes: 10 additions & 0 deletions test/parallel_schedule.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# schedule.txt rules:
# - every test that needs to be run must appear in a 'test:' line
# - every test that needs to be run iff pgvector is installed appears in a 'test_pgvector:' line
# - 'test' lines may have multiple space-separated tests. All tests in a single 'test' line will be run in parallel
# parallel_schedule.txt notes:
# - begin runs before the actual tests and end runs after them. All tests share the same database:
#   begin performs the necessary setup first, and end checks invariants afterwards.

test_begin: begin
test: insert insert2 insert3 select
test_end: end
Loading

0 comments on commit 9e3e17b

Please sign in to comment.