Add parallel tests #192

Merged: 6 commits, Oct 9, 2023
Changes from 4 commits
8 changes: 7 additions & 1 deletion CMakeLists.txt
@@ -192,7 +192,13 @@ add_custom_target(
# TEST
add_custom_target(
test
COMMAND ${CMAKE_SOURCE_DIR}/scripts/run_all_tests.sh
COMMAND ${CMAKE_SOURCE_DIR}/scripts/run_all_tests.sh --regression && ${CMAKE_SOURCE_DIR}/scripts/run_all_tests.sh --parallel
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/test
)

add_custom_target(
test-parallel
COMMAND ${CMAKE_SOURCE_DIR}/scripts/run_all_tests.sh --parallel
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/test
)
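
For reference, a minimal sketch of how these custom CMake targets are typically driven; the `build` directory name below is an assumption, not something defined in this PR:

```
cmake -S . -B build            # assumed out-of-source build directory
make -C build test             # regression schedule, then the parallel schedule
make -C build test-parallel    # parallel schedule only
```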

6 changes: 5 additions & 1 deletion CONTRIBUTING.md
@@ -8,7 +8,11 @@ make test

# only run regression tests that have $FILTER in regression sql file path
make test FILTER=hnsw

# run parallel tests
make test-parallel
```
Running `make test` runs the Lantern regression tests, which run independently of one another. The tests for `make test-parallel` are currently under development and live in `test/parallel`. The goal of the parallel tests is to generate a more realistic workload on the index in order to surface timing errors and other bugs that only appear under more complex use; unlike the regression tests, they all run against the same database.
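
The `test-parallel` target simply runs the test script with the `--parallel` flag from the `test/` directory, so a rough manual equivalent (a sketch, with paths relative to the repository root) is:

```
cd test
../scripts/run_all_tests.sh --parallel
```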

## Running benchmarks
This requires Python to be installed. Please check the `Dockerfile.dev` for pip requirements.
@@ -30,7 +34,7 @@ If you build Lantern in a different directory, make sure to update `.vscode` con

## Debugging the C codebase

If you make changes to the C codebase, in addition to `make test`, you can also use the `livedebug.py` utility in a `tmux` session to easily attach `gdb` to the psql backend and find out what breaks.
If you make changes to the C codebase, in addition to `make test` and `make test-parallel`, you can also use the `livedebug.py` utility in a `tmux` session to easily attach `gdb` to the psql backend and find out what breaks.
Below is a short recording demonstrating the use of `livedebug.py`:

[![asciicast](https://asciinema.org/a/jTsbWdOcTvUl4iAJlAw3Cszbt.svg)](https://asciinema.org/a/jTsbWdOcTvUl4iAJlAw3Cszbt)
42 changes: 37 additions & 5 deletions scripts/run_all_tests.sh
@@ -67,13 +67,33 @@ fi
# Check if pgvector is available
pgvector_installed=$($PSQL -U $DB_USER -d postgres -c "SELECT 1 FROM pg_available_extensions WHERE name = 'vector'" -tA | tail -n 1 | tr -d '\n')

# Settings
REGRESSION=0
PARALLEL=0
while [[ "$#" -gt 0 ]]; do
case $1 in
--regression) REGRESSION=1 ;;
--parallel) PARALLEL=1 ;;
esac
shift
done
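
For context, a sketch of how these flags are used in practice; the invocation paths are assumptions, and `FILTER` is read from the environment elsewhere in the script:

```
cd test
../scripts/run_all_tests.sh --regression              # regression schedule
../scripts/run_all_tests.sh --parallel                # parallel schedule
FILTER=hnsw ../scripts/run_all_tests.sh --regression  # only tests whose sql path matches FILTER
```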

# Generate schedule.txt
rm -rf $TMP_OUTDIR/schedule.txt
if [ "$PARALLEL" -eq 1 ]; then
SCHEDULE='parallel_schedule.txt'
else
SCHEDULE='schedule.txt'
fi
if [ -n "$FILTER" ]; then
if [[ "$pgvector_installed" == "1" ]]; then
TEST_FILES=$(cat schedule.txt | grep -E '^(test:|test_pgvector:)' | sed -E -e 's/^test:|test_pgvector://' | tr " " "\n" | sed -e '/^$/d')
if [ "$PARALLEL" -eq 1 ]; then
TEST_FILES=$(cat $SCHEDULE | grep -E '^(test:|test_begin:|test_end:)' | sed -E -e 's/^test:|test_begin:|test_end://' | tr " " "\n" | sed -e '/^$/d')
else
TEST_FILES=$(cat schedule.txt | grep '^test:' | sed -e 's/^test://' | tr " " "\n" | sed -e '/^$/d')
if [[ "$pgvector_installed" == "1" ]]; then
TEST_FILES=$(cat $SCHEDULE | grep -E '^(test:|test_pgvector:)' | sed -E -e 's/^test:|test_pgvector://' | tr " " "\n" | sed -e '/^$/d')
Contributor: Unrelated to this PR, but still important: do we currently run the pgvector compat tests anywhere in the CI/CD or release pipeline?

Collaborator: Yes, we run them in the pipeline, since we install pgvector there.

else
TEST_FILES=$(cat $SCHEDULE | grep '^test:' | sed -e 's/^test://' | tr " " "\n" | sed -e '/^$/d')
fi
fi

while IFS= read -r f; do
@@ -95,11 +115,18 @@ else
if [ "$pgvector_installed" == "1" ]; then
echo "test: $test_name" >> $TMP_OUTDIR/schedule.txt
fi
elif [[ "$line" =~ ^test_begin: ]]; then
test_name=$(echo "$line" | sed -e 's/test_begin:/test:/')
echo "$test_name" >> $TMP_OUTDIR/schedule.txt
elif [[ "$line" =~ ^test_end: ]]; then
test_name=$(echo "$line" | sed -e 's/test_end:/test:/')
echo "$test_name" >> $TMP_OUTDIR/schedule.txt
else
echo "$line" >> $TMP_OUTDIR/schedule.txt
fi
done < schedule.txt
done < $SCHEDULE
fi
unset $SCHEDULE
SCHEDULE=$TMP_OUTDIR/schedule.txt
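
To illustrate the schedule rewrite: `test_begin:` and `test_end:` lines become plain `test:` lines in the generated schedule, so pg_regress runs the setup test first, the parallel group next, and the teardown test last. A sketch using the schedule added in this PR:

```
# test/parallel_schedule.txt (input)     # $TMP_OUTDIR/schedule.txt (generated)
test_begin: begin                        test: begin
test: insert insert2 insert3 select      test: insert insert2 insert3 select
test_end: end                            test: end
```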

function print_diff {
Expand All @@ -116,4 +143,9 @@ function print_diff {

trap print_diff ERR

DB_USER=$DB_USER $(pg_config --pkglibdir)/pgxs/src/test/regress/pg_regress --user=$DB_USER --schedule=$SCHEDULE --outputdir=$TMP_OUTDIR --launcher=./test_runner.sh
if [ "$PARALLEL" -eq 1 ]; then
cd parallel
DB_USER=$DB_USER $(pg_config --pkglibdir)/pgxs/src/test/regress/pg_regress --user=$DB_USER --schedule=$SCHEDULE --outputdir=$TMP_OUTDIR --launcher=./parallel_test_runner.sh
else
DB_USER=$DB_USER $(pg_config --pkglibdir)/pgxs/src/test/regress/pg_regress --user=$DB_USER --schedule=$SCHEDULE --outputdir=$TMP_OUTDIR --launcher=./test_runner.sh
fi
22 changes: 22 additions & 0 deletions test/parallel/expected/begin.out
@@ -0,0 +1,22 @@
\ir utils/sift10k_array.sql
CREATE TABLE IF NOT EXISTS sift_base10k (
id SERIAL PRIMARY KEY,
v REAL[128]
);
\copy sift_base10k (v) FROM '/tmp/lantern/vector_datasets/siftsmall_base_arrays.csv' with csv;
\ir utils/random_array.sql
CREATE OR REPLACE FUNCTION random_int_array(dim integer, min integer, max integer) RETURNS integer[] AS $BODY$
begin
return (select array_agg(round(random() * (max - min)) + min) from generate_series (0, dim - 1));
end
$BODY$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION random_array(dim integer, min real, max real) RETURNS REAL[] AS $BODY$
begin
return (select array_agg(random() * (max - min) + min) from generate_series (0, dim - 1));
end
$BODY$ LANGUAGE plpgsql;
CREATE SEQUENCE serial START 10001;
CREATE INDEX ON sift_base10k USING HNSW (v) WITH (M=5, ef=20, ef_construction=20);
INFO: done init usearch index
INFO: inserted 10000 elements
INFO: done saving 10000 vectors
12 changes: 12 additions & 0 deletions test/parallel/expected/end.out
@@ -0,0 +1,12 @@
SELECT COUNT(*) FROM sift_base10k;
count
-------
10030
(1 row)

SELECT * from sift_base10k WHERE id=4444;
id | v
------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
4444 | {55,61,11,4,5,2,13,24,65,49,13,9,23,37,94,38,54,11,14,14,40,31,50,44,53,4,0,0,27,17,8,34,12,10,4,4,22,52,68,53,9,2,0,0,2,116,119,64,119,2,0,0,2,30,119,119,116,5,0,8,47,9,5,60,7,7,10,23,56,50,23,5,28,68,6,18,24,65,50,9,119,75,3,0,1,8,12,85,119,11,4,6,8,9,5,74,25,11,8,20,18,12,2,21,11,90,25,32,33,15,2,9,84,67,8,4,22,31,11,33,119,30,3,6,0,0,0,26}
(1 row)

13 changes: 13 additions & 0 deletions test/parallel/expected/insert.out
@@ -0,0 +1,13 @@
BEGIN;
INSERT INTO sift_base10k (id, v) VALUES
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128));
COMMIT;
13 changes: 13 additions & 0 deletions test/parallel/expected/insert2.out
@@ -0,0 +1,13 @@
BEGIN;
INSERT INTO sift_base10k (id, v) VALUES
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128));
COMMIT;
13 changes: 13 additions & 0 deletions test/parallel/expected/insert3.out
@@ -0,0 +1,13 @@
BEGIN;
INSERT INTO sift_base10k (id, v) VALUES
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128));
COMMIT;
28 changes: 28 additions & 0 deletions test/parallel/expected/select.out
@@ -0,0 +1,28 @@
SELECT v AS v1111 FROM sift_base10k WHERE id = 1111 \gset
SELECT v AS v2222 FROM sift_base10k WHERE id = 2222 \gset
SELECT v AS v3333 FROM sift_base10k WHERE id = 3333 \gset
SELECT v AS v4444 FROM sift_base10k WHERE id = 4444 \gset
SELECT id FROM sift_base10k ORDER BY v <-> :'v1111' ASC LIMIT 1;
id
------
1111
(1 row)

SELECT id FROM sift_base10k ORDER BY v <-> :'v2222' ASC LIMIT 1;
id
------
2222
(1 row)

SELECT id FROM sift_base10k ORDER BY v <-> :'v3333' ASC LIMIT 1;
id
------
3333
(1 row)

SELECT id FROM sift_base10k ORDER BY v <-> :'v4444' ASC LIMIT 1;
id
------
4444
(1 row)

52 changes: 52 additions & 0 deletions test/parallel/parallel_test_runner.sh
@@ -0,0 +1,52 @@
#!/usr/bin/env bash

# Get current test file name
TESTFILE_NAME=${PGAPPNAME##pg_regress/}
# Set different name for each test database
# As pg_regress does not support cleaning db after each test
TEST_CASE_DB="ldb_parallel"
# Set database user
if [ -z $DB_USER ]
then
echo "ERROR: DB_USER environment variable is not set before test_runner.sh is run by pg_regress"
exit 1
fi

# Drop db after each test on exit signal
function drop_db {
cat <<EOF | psql "$@" -U ${DB_USER} -d postgres -v ECHO=none >/dev/null 2>&1
SET client_min_messages=ERROR;
DROP DATABASE "${TEST_CASE_DB}";
EOF
}

if [[ "$TESTFILE_NAME" =~ ^end ]]; then
trap drop_db EXIT
fi


# Change directory to sql so sql imports will work correctly
cd sql/
# install lantern extension
if [[ "$TESTFILE_NAME" =~ ^begin ]]; then
psql "$@" -U ${DB_USER} -d postgres -v ECHO=none -q -c "CREATE DATABASE ${TEST_CASE_DB};" 2>/dev/null
psql "$@" -U ${DB_USER} -d ${TEST_CASE_DB} -v ECHO=none -q -c "SET client_min_messages=error; CREATE EXTENSION lantern;" 2>/dev/null
#psql "$@" -U ${DB_USER} -d ${TEST_CASE_DB} -v ECHO=none -q -f utils/common.sql 2>/dev/null
fi

# Exclude debug/inconsistent output from psql
# So tests will always have the same output
psql -U ${DB_USER} \
-v ON_ERROR_STOP=1 \
-v VERBOSITY=terse \
-v ECHO=all \
"$@" -d ${TEST_CASE_DB} 2>&1 | \
sed -e 's! Memory: [0-9]\{1,\}kB!!' \
-e 's! Memory Usage: [0-9]\{1,\}kB!!' \
-e 's! Average Peak Memory: [0-9]\{1,\}kB!!' \
-e 's! time=[0-9]\+\.[0-9]\+\.\.[0-9]\+\.[0-9]\+!!' | \
grep -v 'DEBUG: rehashing catalog cache id' | \
grep -Gv '^ Planning Time:' | \
grep -Gv '^ Execution Time:' | \
# Only print debug messages followed by LANTERN
perl -nle'print if !m{DEBUG:(?!.*LANTERN)}'
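
As a small sanity check of the filtering above: the final `perl` expression keeps non-DEBUG lines and only those DEBUG lines that mention LANTERN. The sample input below is an assumption, not real Lantern output:

```
printf 'DEBUG:  rehashing catalog cache id 42\nDEBUG:  LANTERN: inserted tuple\nSELECT 1\n' | \
  perl -nle'print if !m{DEBUG:(?!.*LANTERN)}'
# prints:
#   DEBUG:  LANTERN: inserted tuple
#   SELECT 1
```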
5 changes: 5 additions & 0 deletions test/parallel/sql/begin.sql
@@ -0,0 +1,5 @@
\ir utils/sift10k_array.sql
\ir utils/random_array.sql

CREATE SEQUENCE serial START 10001;
CREATE INDEX ON sift_base10k USING HNSW (v) WITH (M=5, ef=20, ef_construction=20);
2 changes: 2 additions & 0 deletions test/parallel/sql/end.sql
@@ -0,0 +1,2 @@
SELECT COUNT(*) FROM sift_base10k;
SELECT * from sift_base10k WHERE id=4444;
13 changes: 13 additions & 0 deletions test/parallel/sql/insert.sql
@@ -0,0 +1,13 @@
BEGIN;
INSERT INTO sift_base10k (id, v) VALUES
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128));
COMMIT;
13 changes: 13 additions & 0 deletions test/parallel/sql/insert2.sql
@@ -0,0 +1,13 @@
BEGIN;
INSERT INTO sift_base10k (id, v) VALUES
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128));
COMMIT;
13 changes: 13 additions & 0 deletions test/parallel/sql/insert3.sql
@@ -0,0 +1,13 @@
BEGIN;
INSERT INTO sift_base10k (id, v) VALUES
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128)),
(nextval('serial'), random_array(128, 0, 128));
COMMIT;
8 changes: 8 additions & 0 deletions test/parallel/sql/select.sql
@@ -0,0 +1,8 @@
SELECT v AS v1111 FROM sift_base10k WHERE id = 1111 \gset
SELECT v AS v2222 FROM sift_base10k WHERE id = 2222 \gset
SELECT v AS v3333 FROM sift_base10k WHERE id = 3333 \gset
SELECT v AS v4444 FROM sift_base10k WHERE id = 4444 \gset
SELECT id FROM sift_base10k ORDER BY v <-> :'v1111' ASC LIMIT 1;
SELECT id FROM sift_base10k ORDER BY v <-> :'v2222' ASC LIMIT 1;
SELECT id FROM sift_base10k ORDER BY v <-> :'v3333' ASC LIMIT 1;
SELECT id FROM sift_base10k ORDER BY v <-> :'v4444' ASC LIMIT 1;
11 changes: 11 additions & 0 deletions test/parallel/sql/utils/random_array.sql
@@ -0,0 +1,11 @@
CREATE OR REPLACE FUNCTION random_int_array(dim integer, min integer, max integer) RETURNS integer[] AS $BODY$
Contributor: I feel like this would be useful in regular tests as well. It should be in the utils for the regular tests too.

begin
return (select array_agg(round(random() * (max - min)) + min) from generate_series (0, dim - 1));
end
$BODY$ LANGUAGE plpgsql;

CREATE OR REPLACE FUNCTION random_array(dim integer, min real, max real) RETURNS REAL[] AS $BODY$
begin
return (select array_agg(random() * (max - min) + min) from generate_series (0, dim - 1));
end
$BODY$ LANGUAGE plpgsql;
5 changes: 5 additions & 0 deletions test/parallel/sql/utils/sift10k_array.sql
@@ -0,0 +1,5 @@
CREATE TABLE IF NOT EXISTS sift_base10k (
id SERIAL PRIMARY KEY,
v REAL[128]
);
\copy sift_base10k (v) FROM '/tmp/lantern/vector_datasets/siftsmall_base_arrays.csv' with csv;
8 changes: 8 additions & 0 deletions test/parallel_schedule.txt
@@ -0,0 +1,8 @@
# parallel_schedule.txt rules:
# - every test that needs to be run must appear in a 'test:' line
# - every test that needs to be run iff pgvector is installed appears in a 'test_pgvector:' line
# - 'test' lines may have multiple space-separated tests; all tests in a single 'test' line will be run in parallel
# - 'test_begin:' and 'test_end:' lines hold the setup and teardown tests that run before and after the parallel 'test' lines

test_begin: begin
test: insert insert2 insert3 select
test_end: end
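
To extend the suite, a new test file goes into `test/parallel/sql/` and is added to the parallel group; the `update` test below is hypothetical and only illustrates the format:

```
test_begin: begin
test: insert insert2 insert3 select update
test_end: end
```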
2 changes: 1 addition & 1 deletion test/sql/utils/small_world_array.sql
@@ -12,4 +12,4 @@ INSERT INTO small_world (id, b, v) VALUES
('100', FALSE, '{1,0,0}'),
('101', FALSE, '{1,0,1}'),
('110', FALSE, '{1,1,0}'),
('111', TRUE, '{1,1,1}');
('111', TRUE, '{1,1,1}');