Skip to content

Commit

Permalink
Add parallel tests (#192)
Browse files Browse the repository at this point in the history
Uses pg_regress to run tests in parallel against the database.
Allows custom DB initialization and finalization, which can be used to load relevant data at the beginning
and check relevant invariants at the end.
  • Loading branch information
ezra-varady authored Oct 9, 2023
1 parent 6c48316 commit 9e3e17b
Show file tree
Hide file tree
Showing 24 changed files with 351 additions and 19 deletions.
8 changes: 7 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,13 @@ add_custom_target(
# TEST
add_custom_target(
test
COMMAND ${CMAKE_SOURCE_DIR}/scripts/run_all_tests.sh
COMMAND ${CMAKE_SOURCE_DIR}/scripts/run_all_tests.sh --regression
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/test
)

# Run the parallel regression tests (pg_regress workload in test/parallel).
# Uses PROJECT_SOURCE_DIR rather than CMAKE_SOURCE_DIR so the target keeps
# working if this project is ever built as a subproject of a larger build.
add_custom_target(
test-parallel
COMMAND ${PROJECT_SOURCE_DIR}/scripts/run_all_tests.sh --parallel
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/test
)

Expand Down
6 changes: 5 additions & 1 deletion CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,11 @@ make test

# only run regression tests that have $FILTER in regression sql file path
make test FILTER=hnsw

# run parallel tests
make test-parallel
```
Running `make test` will run the lantern regression tests; these run independently of one another. The tests for `make test-parallel` are currently under development and can be found in `test/parallel`. The goal of the parallel tests is to generate a more realistic workload on the index in order to discover timing errors and other bugs that depend on more complex use; unlike the regression tests, they all run in the same database.

## Running benchmarks
This requires Python to be installed. Please check the `Dockerfile.dev` for pip requirements.
Expand All @@ -30,7 +34,7 @@ If you build Lantern in a different directory, make sure to update `.vscode` con

## Debugging the C codebase

If you make changes to the C codebase, in addition to `make test`, you can also use the `livedebug.py` utility in a `tmux` session to easily attach `gdb` to the psql backend and find out what breaks.
If you make changes to the C codebase, in addition to `make test` and `make test-parallel`, you can also use the `livedebug.py` utility in a `tmux` session to easily attach `gdb` to the psql backend and find out what breaks.
Below is a short recording demonstrating the use of `livedebug.py`:

[![asciicast](https://asciinema.org/a/jTsbWdOcTvUl4iAJlAw3Cszbt.svg)](https://asciinema.org/a/jTsbWdOcTvUl4iAJlAw3Cszbt)
Expand Down
1 change: 1 addition & 0 deletions ci/scripts/run-tests-linux.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,5 +27,6 @@ echo "port = 5432" >> ${PGDATA}/postgresql.conf
GCOV_PREFIX=$WORKDIR/build/CMakeFiles/lantern.dir/ GCOV_PREFIX_STRIP=5 POSTGRES_HOST_AUTH_METHOD=trust /usr/lib/postgresql/$PG_VERSION/bin/postgres 1>/tmp/pg-out.log 2>/tmp/pg-error.log &
# Wait for start and run tests
wait_for_pg && cd $WORKDIR/build && make test && \
make test-parallel && \
killall postgres && \
gcovr -r $WORKDIR/src/ --object-directory $WORKDIR/build/ --xml /tmp/coverage.xml
2 changes: 1 addition & 1 deletion ci/scripts/run-tests-mac.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,4 @@ wait_for_pg(){
# Start database
brew services start postgresql@$PG_VERSION

wait_for_pg && cd $WORKDIR/build && make test
wait_for_pg && cd $WORKDIR/build && make test && make test-parallel
42 changes: 37 additions & 5 deletions scripts/run_all_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -67,13 +67,33 @@ fi
# Check if pgvector is available
pgvector_installed=$($PSQL -U $DB_USER -d postgres -c "SELECT 1 FROM pg_available_extensions WHERE name = 'vector'" -tA | tail -n 1 | tr -d '\n')

# Settings
REGRESSION=0
PARALLEL=0
while [[ "$#" -gt 0 ]]; do
case $1 in
--regression) REGRESSION=1 ;;
--parallel) PARALLEL=1 ;;
esac
shift
done

# Generate schedule.txt
rm -rf $TMP_OUTDIR/schedule.txt
if [ "$PARALLEL" -eq 1 ]; then
SCHEDULE='parallel_schedule.txt'
else
SCHEDULE='schedule.txt'
fi
if [ -n "$FILTER" ]; then
if [[ "$pgvector_installed" == "1" ]]; then
TEST_FILES=$(cat schedule.txt | grep -E '^(test:|test_pgvector:)' | sed -E -e 's/^test:|test_pgvector://' | tr " " "\n" | sed -e '/^$/d')
if [ "$PARALLEL" -eq 1 ]; then
TEST_FILES=$(cat $SCHEDULE | grep -E '^(test:|test_begin:|test_end:)' | sed -E -e 's/^test:|test_begin:|test_end://' | tr " " "\n" | sed -e '/^$/d')
else
TEST_FILES=$(cat schedule.txt | grep '^test:' | sed -e 's/^test://' | tr " " "\n" | sed -e '/^$/d')
if [[ "$pgvector_installed" == "1" ]]; then
TEST_FILES=$(cat $SCHEDULE | grep -E '^(test:|test_pgvector:)' | sed -E -e 's/^test:|test_pgvector://' | tr " " "\n" | sed -e '/^$/d')
else
TEST_FILES=$(cat $SCHEDULE | grep '^test:' | sed -e 's/^test://' | tr " " "\n" | sed -e '/^$/d')
fi
fi

while IFS= read -r f; do
Expand All @@ -95,11 +115,18 @@ else
if [ "$pgvector_installed" == "1" ]; then
echo "test: $test_name" >> $TMP_OUTDIR/schedule.txt
fi
elif [[ "$line" =~ ^test_begin: ]]; then
test_name=$(echo "$line" | sed -e 's/test_begin:/test:/')
echo "$test_name" >> $TMP_OUTDIR/schedule.txt
elif [[ "$line" =~ ^test_end: ]]; then
test_name=$(echo "$line" | sed -e 's/test_end:/test:/')
echo "$test_name" >> $TMP_OUTDIR/schedule.txt
else
echo "$line" >> $TMP_OUTDIR/schedule.txt
fi
done < schedule.txt
done < $SCHEDULE
fi
# Done reading the template schedule; from here on SCHEDULE points at the
# generated schedule that pg_regress will consume.
# NOTE: 'unset' takes a variable NAME. The previous form 'unset $SCHEDULE'
# expanded to e.g. 'unset parallel_schedule.txt' — an invalid identifier —
# so the variable was never actually cleared (and would error under 'set -e').
unset SCHEDULE
SCHEDULE=$TMP_OUTDIR/schedule.txt

function print_diff {
Expand All @@ -116,4 +143,9 @@ function print_diff {

trap print_diff ERR

DB_USER=$DB_USER $(pg_config --pkglibdir)/pgxs/src/test/regress/pg_regress --user=$DB_USER --schedule=$SCHEDULE --outputdir=$TMP_OUTDIR --launcher=./test_runner.sh
if [ "$PARALLEL" -eq 1 ]; then
cd parallel
PARALLEL=$PARALLEL DB_USER=$DB_USER $(pg_config --pkglibdir)/pgxs/src/test/regress/pg_regress --user=$DB_USER --schedule=$SCHEDULE --outputdir=$TMP_OUTDIR --launcher=../test_runner.sh
else
PARALLEL=$PARALLEL DB_USER=$DB_USER $(pg_config --pkglibdir)/pgxs/src/test/regress/pg_regress --user=$DB_USER --schedule=$SCHEDULE --outputdir=$TMP_OUTDIR --launcher=./test_runner.sh
fi
23 changes: 23 additions & 0 deletions test/parallel/expected/begin.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
-- This file handles initializing the database before parallel tests are run
\ir utils/sift10k_array.sql
CREATE TABLE IF NOT EXISTS sift_base10k (
id SERIAL PRIMARY KEY,
v REAL[128]
);
\copy sift_base10k (v) FROM '/tmp/lantern/vector_datasets/siftsmall_base_arrays.csv' with csv;
\ir utils/random_array.sql
CREATE OR REPLACE FUNCTION random_int_array(dim integer, min integer, max integer) RETURNS integer[] AS $BODY$
begin
return (select array_agg(round(random() * (max - min)) + min) from generate_series (0, dim - 1));
end
$BODY$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION random_array(dim integer, min real, max real) RETURNS REAL[] AS $BODY$
begin
return (select array_agg(random() * (max - min) + min) from generate_series (0, dim - 1));
end
$BODY$ LANGUAGE plpgsql;
CREATE SEQUENCE serial START 10001;
CREATE INDEX ON sift_base10k USING HNSW (v) WITH (M=5, ef=20, ef_construction=20);
INFO: done init usearch index
INFO: inserted 10000 elements
INFO: done saving 10000 vectors
13 changes: 13 additions & 0 deletions test/parallel/expected/end.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
-- This file contains invariants to be checked after the parallel tests have run
SELECT COUNT(*) FROM sift_base10k;
count
-------
10030
(1 row)

SELECT * from sift_base10k WHERE id=4444;
id | v
------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
4444 | {55,61,11,4,5,2,13,24,65,49,13,9,23,37,94,38,54,11,14,14,40,31,50,44,53,4,0,0,27,17,8,34,12,10,4,4,22,52,68,53,9,2,0,0,2,116,119,64,119,2,0,0,2,30,119,119,116,5,0,8,47,9,5,60,7,7,10,23,56,50,23,5,28,68,6,18,24,65,50,9,119,75,3,0,1,8,12,85,119,11,4,6,8,9,5,74,25,11,8,20,18,12,2,21,11,90,25,32,33,15,2,9,84,67,8,4,22,31,11,33,119,30,3,6,0,0,0,26}
(1 row)

13 changes: 13 additions & 0 deletions test/parallel/expected/insert.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
BEGIN;
INSERT INTO sift_base10k (id, v) VALUES
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128));
COMMIT;
13 changes: 13 additions & 0 deletions test/parallel/expected/insert2.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
BEGIN;
INSERT INTO sift_base10k (id, v) VALUES
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128));
COMMIT;
13 changes: 13 additions & 0 deletions test/parallel/expected/insert3.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
BEGIN;
INSERT INTO sift_base10k (id, v) VALUES
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128));
COMMIT;
38 changes: 38 additions & 0 deletions test/parallel/expected/select.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
SELECT v AS v1111 FROM sift_base10k WHERE id = 1111 \gset
SELECT v AS v2222 FROM sift_base10k WHERE id = 2222 \gset
SELECT v AS v3333 FROM sift_base10k WHERE id = 3333 \gset
SELECT v AS v4444 FROM sift_base10k WHERE id = 4444 \gset
-- Make sure that our index queries will actually run against the index
EXPLAIN (COSTS false) SELECT id FROM sift_base10k ORDER BY v <-> :'v1111' ASC LIMIT 1;
QUERY PLAN
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Limit
-> Index Scan using sift_base10k_v_idx on sift_base10k
Order By: (v <-> '{21,24,5,0,0,26,22,6,16,16,10,9,0,18,114,19,13,13,9,1,2,53,111,19,39,32,5,0,4,9,10,13,6,10,8,0,2,130,77,4,2,0,0,0,3,130,130,11,130,0,0,0,0,37,130,84,130,5,0,1,17,11,4,28,17,39,3,3,30,77,28,3,20,0,0,1,49,125,13,7,130,6,0,0,0,5,11,61,130,2,0,1,12,84,48,73,1,12,2,0,31,57,9,2,16,12,1,0,32,36,0,1,63,6,3,1,0,0,24,51,9,0,0,0,0,44,88,48}'::real[])
(3 rows)

-- Do the queries
SELECT id FROM sift_base10k ORDER BY v <-> :'v1111' ASC LIMIT 1;
id
------
1111
(1 row)

SELECT id FROM sift_base10k ORDER BY v <-> :'v2222' ASC LIMIT 1;
id
------
2222
(1 row)

SELECT id FROM sift_base10k ORDER BY v <-> :'v3333' ASC LIMIT 1;
id
------
3333
(1 row)

SELECT id FROM sift_base10k ORDER BY v <-> :'v4444' ASC LIMIT 1;
id
------
4444
(1 row)

6 changes: 6 additions & 0 deletions test/parallel/sql/begin.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
-- This file handles initializing the database before parallel tests are run
\ir utils/sift10k_array.sql
\ir utils/random_array.sql

CREATE SEQUENCE serial START 10001;
CREATE INDEX ON sift_base10k USING HNSW (v) WITH (M=5, ef=20, ef_construction=20);
3 changes: 3 additions & 0 deletions test/parallel/sql/end.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
-- This file contains invariants to be checked after the parallel tests have run
SELECT COUNT(*) FROM sift_base10k;
SELECT * from sift_base10k WHERE id=4444;
13 changes: 13 additions & 0 deletions test/parallel/sql/insert.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
BEGIN;
INSERT INTO sift_base10k (id, v) VALUES
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128));
COMMIT;
13 changes: 13 additions & 0 deletions test/parallel/sql/insert2.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
BEGIN;
INSERT INTO sift_base10k (id, v) VALUES
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128));
COMMIT;
13 changes: 13 additions & 0 deletions test/parallel/sql/insert3.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
BEGIN;
INSERT INTO sift_base10k (id, v) VALUES
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128));
COMMIT;
11 changes: 11 additions & 0 deletions test/parallel/sql/select.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
SELECT v AS v1111 FROM sift_base10k WHERE id = 1111 \gset
SELECT v AS v2222 FROM sift_base10k WHERE id = 2222 \gset
SELECT v AS v3333 FROM sift_base10k WHERE id = 3333 \gset
SELECT v AS v4444 FROM sift_base10k WHERE id = 4444 \gset
-- Make sure that our index queries will actually run against the index
EXPLAIN (COSTS false) SELECT id FROM sift_base10k ORDER BY v <-> :'v1111' ASC LIMIT 1;
-- Do the queries
SELECT id FROM sift_base10k ORDER BY v <-> :'v1111' ASC LIMIT 1;
SELECT id FROM sift_base10k ORDER BY v <-> :'v2222' ASC LIMIT 1;
SELECT id FROM sift_base10k ORDER BY v <-> :'v3333' ASC LIMIT 1;
SELECT id FROM sift_base10k ORDER BY v <-> :'v4444' ASC LIMIT 1;
61 changes: 61 additions & 0 deletions test/parallel/sql/utils/common.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
-- test helper functions that should exist in all test runs live here
-- there is no need to explicitly include this file in other tests as the test runner will
-- run this before running the actual test

CREATE EXTENSION pageinspect;

\set ON_ERROR_STOP on

-- Retrieves details for all indices associated with a given table, similar to \di+.
-- The output of \di+ is not consistent across postgres versions, so tests call this
-- function to get a stable result shape instead.
-- todo:: add a column to this function which returns the number of used DB pages
CREATE OR REPLACE FUNCTION ldb_get_indexes(tblname text)
RETURNS TABLE(
indexname name,
size text,
indexdef text,
total_index_size text
) AS
$BODY$
BEGIN
RETURN QUERY
-- Pre-compute the combined size of all valid indexes on the table once,
-- then join it onto every per-index row below.
WITH total_size_data AS (
SELECT
SUM(pg_relation_size(indexrelid)) as total_size
FROM
pg_index
WHERE
indisvalid
AND indrelid = tblname::regclass
)
SELECT
idx.indexname,
-- per-index on-disk size, human readable
pg_size_pretty(pg_relation_size(idx.indexname::REGCLASS)) as size,
idx.indexdef,
-- same total repeated on each row, human readable
pg_size_pretty(total_size_data.total_size) as total_index_size
FROM
pg_indexes idx,
total_size_data
WHERE
idx.tablename = tblname;
END;
$BODY$
LANGUAGE plpgsql;

-- Determines if the provided SQL query (with an EXPLAIN prefix) uses an "Index Scan"
-- by examining its execution plan. This function helps ensure consistent analysis
-- across varying Postgres versions where EXPLAIN output may differ.
-- Returns true as soon as any plan line contains the substring 'Index Scan'.
CREATE OR REPLACE FUNCTION has_index_scan(explain_query text) RETURNS boolean AS $$
DECLARE
plan_row RECORD;
found boolean := false;
BEGIN
-- EXECUTE the caller-supplied EXPLAIN statement and walk its output rows;
-- each row exposes a single text column named "QUERY PLAN".
FOR plan_row IN EXECUTE explain_query LOOP
IF position('Index Scan' in plan_row."QUERY PLAN") > 0 THEN
found := true;
EXIT; -- no need to scan the remaining plan lines
END IF;
END LOOP;
RETURN found;
END;
$$ LANGUAGE plpgsql;
11 changes: 11 additions & 0 deletions test/parallel/sql/utils/random_array.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
CREATE OR REPLACE FUNCTION random_int_array(dim integer, min integer, max integer) RETURNS integer[] AS $BODY$
begin
return (select array_agg(round(random() * (max - min)) + min) from generate_series (0, dim - 1));
end
$BODY$ LANGUAGE plpgsql;

CREATE OR REPLACE FUNCTION random_array(dim integer, min real, max real) RETURNS REAL[] AS $BODY$
begin
return (select array_agg(random() * (max - min) + min) from generate_series (0, dim - 1));
end
$BODY$ LANGUAGE plpgsql;
5 changes: 5 additions & 0 deletions test/parallel/sql/utils/sift10k_array.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
CREATE TABLE IF NOT EXISTS sift_base10k (
id SERIAL PRIMARY KEY,
v REAL[128]
);
\copy sift_base10k (v) FROM '/tmp/lantern/vector_datasets/siftsmall_base_arrays.csv' with csv;
10 changes: 10 additions & 0 deletions test/parallel_schedule.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# schedule.txt rules:
# - every test that needs to be run must appear in a 'test:' line
# - every test that needs to be run iff pgvector is installed appears in a 'test_pgvector:' line
# - 'test' lines may have multiple space-separated tests. All tests in a single 'test' line will be run in parallel
# parallel_schedule.txt notes:
# - begin runs before the actual tests and end runs after them. All tests share the same database:
#   begin performs the necessary setup first, and end checks invariants afterwards.

test_begin: begin
test: insert insert2 insert3 select
test_end: end
Loading

0 comments on commit 9e3e17b

Please sign in to comment.