From 3cd43c292a330f073491dc9bbe14b51397efd9a1 Mon Sep 17 00:00:00 2001 From: Varik Matevosyan Date: Mon, 18 Dec 2023 13:42:44 +0400 Subject: [PATCH 01/13] Add lantern_reindex_external_index which will call corresponding function in lantern_extras --- CMakeLists.txt | 1 + sql/lantern.sql | 3 + sql/updates/0.0.11--0.0.12.sql | 2 + src/hnsw.c | 8 +++ src/hnsw.h | 1 + src/hnsw/build.c | 77 ++++++++++++++++++++++++++ src/hnsw/build.h | 1 + test/expected/ext_relocation.out | 3 +- test/expected/hnsw_index_from_file.out | 3 + test/sql/hnsw_index_from_file.sql | 3 + 10 files changed, 101 insertions(+), 1 deletion(-) create mode 100644 sql/updates/0.0.11--0.0.12.sql diff --git a/CMakeLists.txt b/CMakeLists.txt index ca13d7a29..4053c6dd3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -192,6 +192,7 @@ set (_update_files sql/updates/0.0.8--0.0.9.sql sql/updates/0.0.9--0.0.10.sql sql/updates/0.0.10--0.0.11.sql + sql/updates/0.0.11--0.0.12.sql ) add_custom_command( diff --git a/sql/lantern.sql b/sql/lantern.sql index ab4093deb..b410eb0c0 100644 --- a/sql/lantern.sql +++ b/sql/lantern.sql @@ -2,6 +2,9 @@ CREATE FUNCTION hnsw_handler(internal) RETURNS index_am_handler AS 'MODULE_PATHNAME' LANGUAGE C; +CREATE FUNCTION lantern_reindex_external_index(index regclass) RETURNS VOID + AS 'MODULE_PATHNAME', 'lantern_reindex_external_index' LANGUAGE C STABLE STRICT PARALLEL UNSAFE; + -- functions CREATE FUNCTION ldb_generic_dist(real[], real[]) RETURNS real AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; diff --git a/sql/updates/0.0.11--0.0.12.sql b/sql/updates/0.0.11--0.0.12.sql new file mode 100644 index 000000000..929808f7e --- /dev/null +++ b/sql/updates/0.0.11--0.0.12.sql @@ -0,0 +1,2 @@ +CREATE FUNCTION lantern_reindex_external_index (index regclass) RETURNS VOID AS 'MODULE_PATHNAME', +'lantern_reindex_external_index' LANGUAGE C STABLE STRICT PARALLEL UNSAFE; diff --git a/src/hnsw.c b/src/hnsw.c index d47656e41..a7210ea99 100644 --- a/src/hnsw.c +++ b/src/hnsw.c 
@@ -419,6 +419,14 @@ Datum lantern_internal_continue_blockmap_group_initialization(PG_FUNCTION_ PG_RETURN_VOID(); } +PGDLLEXPORT PG_FUNCTION_INFO_V1(lantern_reindex_external_index); +Datum lantern_reindex_external_index(PG_FUNCTION_ARGS) +{ + Oid indrelid = PG_GETARG_OID(0); + ldb_reindex_external_index(indrelid); + PG_RETURN_VOID(); +} + /* * Get data type for give oid * */ diff --git a/src/hnsw.h b/src/hnsw.h index c20911c9a..d1db4edc8 100644 --- a/src/hnsw.h +++ b/src/hnsw.h @@ -34,6 +34,7 @@ PGDLLEXPORT Datum hamming_dist_with_guard(PG_FUNCTION_ARGS); PGDLLEXPORT Datum cos_dist(PG_FUNCTION_ARGS); PGDLLEXPORT Datum cos_dist_with_guard(PG_FUNCTION_ARGS); PGDLLEXPORT Datum vector_cos_dist(PG_FUNCTION_ARGS); +PGDLLEXPORT Datum lantern_reindex_external_index(PG_FUNCTION_ARGS); HnswColumnType GetColumnTypeFromOid(Oid oid); HnswColumnType GetIndexColumnType(Relation index); diff --git a/src/hnsw/build.c b/src/hnsw/build.c index 7e0e6e1fb..8603103c0 100644 --- a/src/hnsw/build.c +++ b/src/hnsw/build.c @@ -8,14 +8,19 @@ #include #include #include +#include #include #include #include +#include #include #include +#include #include #include +#include "usearch.h" + #ifdef _WIN32 #define access _access #else @@ -531,3 +536,75 @@ void ldb_ambuildunlogged(Relation index) // todo:: elog(ERROR, "hnsw index on unlogged tables is currently not supported"); } + +void ldb_reindex_external_index(Oid indrelid) +{ + BlockNumber HEADER_BLOCK = 0; + Relation index_rel; + Buffer buf; + Page page; + char *metric_kind; + char *index_name; + HnswIndexHeaderPage *headerp; + FmgrInfo reindex_finfo = {0}; + char *function_sig = "_lantern_reindex_external_index(text, text, integer, integer, integer, integer)"; + Oid function_oid; + uint32_t dim = 0; + uint32_t m = 0; + uint32_t ef_construction = 0; + uint32_t ef = 0; + + PG_TRY(); + { + function_oid = DatumGetObjectId(DirectFunctionCall1(regprocedurein, CStringGetDatum(function_sig))); + } + PG_CATCH(); + { + elog(ERROR, "Please install 
'lantern_extras' extension or update it to the latest version"); + } + PG_END_TRY(); + + index_rel = relation_open(indrelid, AccessShareLock); + buf = ReadBuffer(index_rel, HEADER_BLOCK); + LockBuffer(buf, BUFFER_LOCK_SHARE); + page = BufferGetPage(buf); + headerp = (HnswIndexHeaderPage *)PageGetContents(page); + + assert(headerp->magicNumber == LDB_WAL_MAGIC_NUMBER); + + switch(headerp->metric_kind) { + case usearch_metric_l2sq_k: + metric_kind = "l2sq"; + break; + case usearch_metric_cos_k: + metric_kind = "cos"; + break; + case usearch_metric_hamming_k: + metric_kind = "hamming"; + break; + default: + metric_kind = NULL; + ldb_invariant(true, "Unsupported metric kind"); + } + + index_name = pstrdup(RelationGetRelationName(index_rel)); + dim = headerp->vector_dim; + m = headerp->m; + ef = headerp->ef; + ef_construction = headerp->ef_construction; + + UnlockReleaseBuffer(buf); + relation_close(index_rel, AccessShareLock); + + fmgr_info(function_oid, &reindex_finfo); + + assert(reindex_finfo.fn_addr != NULL); + + DirectFunctionCall6(reindex_finfo.fn_addr, + CStringGetTextDatum(index_name), + CStringGetTextDatum(metric_kind), + Int32GetDatum(dim), + Int32GetDatum(m), + Int32GetDatum(ef_construction), + Int32GetDatum(ef)); +} diff --git a/src/hnsw/build.h b/src/hnsw/build.h index 272bd394b..91ef0947f 100644 --- a/src/hnsw/build.h +++ b/src/hnsw/build.h @@ -37,5 +37,6 @@ IndexBuildResult *ldb_ambuild(Relation heap, Relation index, IndexInfo *indexInf void ldb_ambuildunlogged(Relation index); int GetHnswIndexDimensions(Relation index, IndexInfo *indexInfo); void CheckHnswIndexDimensions(Relation index, Datum arrayDatum, int deimensions); +void ldb_reindex_external_index(Oid indrelid); // todo: does this render my check unnecessary #endif // LDB_HNSW_BUILD_H diff --git a/test/expected/ext_relocation.out b/test/expected/ext_relocation.out index 0275315b0..31c30dc9b 100644 --- a/test/expected/ext_relocation.out +++ b/test/expected/ext_relocation.out @@ -43,9 +43,10 @@ 
ORDER BY 1, 3, 2; schema1 | hamming_dist | schema1 schema1 | hnsw_handler | schema1 schema1 | l2sq_dist | schema1 + schema1 | lantern_reindex_external_index | schema1 schema1 | ldb_generic_dist | schema1 schema1 | ldb_generic_dist | schema1 -(11 rows) +(12 rows) -- show all the extension operators SELECT ne.nspname AS extschema, op.oprname, np.nspname AS proschema diff --git a/test/expected/hnsw_index_from_file.out b/test/expected/hnsw_index_from_file.out index 76ded26ee..9d89d5f67 100644 --- a/test/expected/hnsw_index_from_file.out +++ b/test/expected/hnsw_index_from_file.out @@ -183,3 +183,6 @@ SELECT ROUND(l2sq_dist(v, :'v777')::numeric, 2) FROM sift_base1k order by v 132813.00 (10 rows) +-- Should throw error when lantern_extras is not installed +SELECT lantern_reindex_external_index('hnsw_l2_index'); +ERROR: Please install 'lantern_extras' extension or update it to the latest version diff --git a/test/sql/hnsw_index_from_file.sql b/test/sql/hnsw_index_from_file.sql index 7978d1c02..f5e3707b4 100644 --- a/test/sql/hnsw_index_from_file.sql +++ b/test/sql/hnsw_index_from_file.sql @@ -61,3 +61,6 @@ CREATE INDEX hnsw_l2_index ON sift_base1k USING hnsw (v) WITH (_experimental_ind SELECT _lantern_internal.validate_index('hnsw_l2_index', false); -- This should not throw error, but the first result will not be 0 as vector 777 is deleted from the table SELECT ROUND(l2sq_dist(v, :'v777')::numeric, 2) FROM sift_base1k order by v :'v777' LIMIT 10; + +-- Should throw error when lantern_extras is not installed +SELECT lantern_reindex_external_index('hnsw_l2_index'); From 6cf726cf891fdc92350b98f57daee44d827f3e36 Mon Sep 17 00:00:00 2001 From: Varik Matevosyan Date: Mon, 18 Dec 2023 15:11:45 +0400 Subject: [PATCH 02/13] Change error message --- src/hnsw/build.c | 7 ++++++- test/expected/hnsw_index_from_file.out | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/hnsw/build.c b/src/hnsw/build.c index 8603103c0..558541337 100644 --- a/src/hnsw/build.c +++ 
b/src/hnsw/build.c @@ -431,7 +431,12 @@ static void BuildIndex( buildstate->hnsw = NULL; if(buildstate->index_file_path) { if(access(buildstate->index_file_path, F_OK) != 0) { - ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("Invalid index file path "))); + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("Invalid index file path. " + "If this is REINDEX operation call `SELECT " + "lantern_reindex_external_index('%s')` to recreate index", + RelationGetRelationName(index)))); } usearch_load(buildstate->usearch_index, buildstate->index_file_path, &error); if(error != NULL) { diff --git a/test/expected/hnsw_index_from_file.out b/test/expected/hnsw_index_from_file.out index 9d89d5f67..dd54d6ae0 100644 --- a/test/expected/hnsw_index_from_file.out +++ b/test/expected/hnsw_index_from_file.out @@ -15,7 +15,7 @@ COPY sift_base1k (v) FROM '/tmp/lantern/vector_datasets/sift_base1k_arrays.csv' -- Validate error on invalid path CREATE INDEX hnsw_l2_index ON sift_base1k USING hnsw (v) WITH (_experimental_index_path='/tmp/lantern/files/invalid-path'); INFO: done init usearch index -ERROR: Invalid index file path +ERROR: Invalid index file path. 
If this is REINDEX operation call `SELECT lantern_reindex_external_index('hnsw_l2_index')` to recreate index -- Validate error on incompatible version CREATE INDEX hnsw_l2_index ON sift_base1k USING hnsw (v) WITH (_experimental_index_path='/tmp/lantern/files/index-sift1k-l2-0.0.0.usearch'); INFO: done init usearch index From 23a4f8a68e7a1e6f8ad7875e3fb1d1d43c3966b4 Mon Sep 17 00:00:00 2001 From: Varik Matevosyan Date: Mon, 18 Dec 2023 17:45:08 +0400 Subject: [PATCH 03/13] Run test for lantern_extras functions if the extension is installed --- ci/scripts/build-linux.sh | 14 ++++++++ ci/scripts/build-mac.sh | 5 ++- ci/scripts/build.sh | 1 + scripts/run_all_tests.sh | 18 +++++++--- test/expected/hnsw_extras.out | 67 +++++++++++++++++++++++++++++++++++ test/schedule.txt | 1 + test/sql/hnsw_extras.sql | 27 ++++++++++++++ 7 files changed, 128 insertions(+), 5 deletions(-) create mode 100644 test/expected/hnsw_extras.out create mode 100644 test/sql/hnsw_extras.sql diff --git a/ci/scripts/build-linux.sh b/ci/scripts/build-linux.sh index 81f0869ba..13e9ea170 100755 --- a/ci/scripts/build-linux.sh +++ b/ci/scripts/build-linux.sh @@ -23,6 +23,20 @@ function setup_postgres() { rm -f /usr/bin/pg_config && ln -s /usr/lib/postgresql/$PG_VERSION/bin/pg_config /usr/bin/pg_config } +function install_platform_specific_dependencies() { + # Currently lantern_extras binaries are only available for Linux x86_64 + # We won't install onnxruntime as lantern_extras are used only for external index in tests + pushd /tmp + LANTERN_EXTRAS_VERSION=0.0.5 + wget https://github.com/lanterndata/lantern_extras/releases/download/${LANTERN_EXTRAS_VERSION}/lantern-extras-${LANTERN_EXTRAS_VERSION}.tar -O lantern-extras.tar + tar xf lantern-extras.tar + pushd lantern-extras-${LANTERN_EXTRAS_VERSION} + make install + popd + rm -rf lantern-extras* + popd +} + function package_if_necessary() { if [ -n "$BUILD_PACKAGES" ]; then # Bundle debian packages diff --git a/ci/scripts/build-mac.sh 
b/ci/scripts/build-mac.sh index 5229dbd46..ddd8b8057 100755 --- a/ci/scripts/build-mac.sh +++ b/ci/scripts/build-mac.sh @@ -18,9 +18,12 @@ function setup_postgres() { fi } +function install_platform_specific_dependencies() { + : +} + function package_if_necessary() { : - # TODO make and publish homebrew formula } function cleanup_environment() { diff --git a/ci/scripts/build.sh b/ci/scripts/build.sh index c6a98b5d4..caa4b7e54 100755 --- a/ci/scripts/build.sh +++ b/ci/scripts/build.sh @@ -76,6 +76,7 @@ setup_environment setup_locale_and_install_packages setup_postgres install_external_dependencies +install_platform_specific_dependencies clone_or_use_source build_and_install package_if_necessary diff --git a/scripts/run_all_tests.sh b/scripts/run_all_tests.sh index a729f251f..f18569349 100755 --- a/scripts/run_all_tests.sh +++ b/scripts/run_all_tests.sh @@ -73,6 +73,7 @@ fi # Check if pgvector is available pgvector_installed=$($PSQL -U $DB_USER -p $DB_PORT -d postgres -c "SELECT 1 FROM pg_available_extensions WHERE name = 'vector'" -tA | tail -n 1 | tr -d '\n') +lantern_extras_installed=$($PSQL -U $DB_USER -p $DB_PORT -d postgres -c "SELECT 1 FROM pg_available_extensions WHERE name = 'lantern_extras'" -tA | tail -n 1 | tr -d '\n') # Settings REGRESSION=0 @@ -133,10 +134,14 @@ if [[ -n "$FILTER" || -n "$EXCLUDE" ]]; then fi fi else + NEWLINE=$'\n' + TEST_FILES=$(cat $SCHEDULE | grep '^test:' | tr " " "\n" | sed -e '/^$/d') if [[ "$pgvector_installed" == "1" ]]; then - TEST_FILES=$(cat $SCHEDULE | grep -E '^(test:|test_pgvector:)' | sed -e 's/^test_pgvector:/test:/' | tr " " "\n" | sed -e '/^$/d') - else - TEST_FILES=$(cat $SCHEDULE | grep '^test:' | tr " " "\n" | sed -e '/^$/d') + TEST_FILES="${TEST_FILES}${NEWLINE}$(cat $SCHEDULE | grep -E '^(test_pgvector:)' | sed -e 's/^test_pgvector:/test:/' | tr " " "\n" | sed -e '/^$/d')" + fi + + if [[ "$lantern_extras_installed" ]]; then + TEST_FILES="${TEST_FILES}${NEWLINE}$(cat $SCHEDULE | grep -E '^(test_extras:)' | sed -e 
's/^test_extras:/test:/' | tr " " "\n" | sed -e '/^$/d')" fi fi @@ -169,6 +174,11 @@ else if [ "$pgvector_installed" == "1" ]; then echo "test: $test_name" >> $TMP_OUTDIR/schedule.txt fi + elif [[ "$line" =~ ^test_extras: ]]; then + test_name=$(echo "$line" | sed -e 's/test_extras://') + if [ "$lantern_extras_installed" == "1" ]; then + echo "test: $test_name" >> $TMP_OUTDIR/schedule.txt + fi elif [[ "$line" =~ ^test_begin: ]]; then test_name=$(echo "$line" | sed -e 's/test_begin:/test:/') echo "$test_name" >> $TMP_OUTDIR/schedule.txt @@ -180,7 +190,7 @@ else fi done < $SCHEDULE fi -unset $SCHEDULE +unset SCHEDULE SCHEDULE=$TMP_OUTDIR/schedule.txt function print_diff { diff --git a/test/expected/hnsw_extras.out b/test/expected/hnsw_extras.out new file mode 100644 index 000000000..d9e2a407a --- /dev/null +++ b/test/expected/hnsw_extras.out @@ -0,0 +1,67 @@ +------------------------------------------------------------------------------ +-- Test Functions exported from lantern_extras extension +------------------------------------------------------------------------------ +\ir utils/sift1k_array.sql +CREATE TABLE IF NOT EXISTS sift_base1k ( + id SERIAL, + v REAL[] +); +COPY sift_base1k (v) FROM '/tmp/lantern/vector_datasets/sift_base1k_arrays.csv' WITH csv; +\set ON_ERROR_STOP off +CREATE EXTENSION lantern_extras; +-- Validate error on invalid params +SELECT lantern_create_external_index('v','sift_base1k', 'public', 'invalid_metric', 3, 10, 10, 10); +ERROR: Invalid metric invalid_metric +SELECT lantern_create_external_index('v','sift_base1k', 'public', 'l2sq', 3, -1, 10, 10); +ERROR: m should be in range [2, 128] +SELECT lantern_create_external_index('v','sift_base1k', 'public', 'l2sq', 3, 10, -2, 10); +ERROR: ef_construction should be in range [1, 400] +SELECT lantern_create_external_index('v','sift_base1k', 'public', 'l2sq', 3, 10, 10, -1); +ERROR: ef should be in range [1, 400] +\set ON_ERROR_STOP on +-- Create with defaults +SELECT 
lantern_create_external_index('v', 'sift_base1k'); + lantern_create_external_index +------------------------------- + +(1 row) + +SELECT _lantern_internal.validate_index('sift_base1k_v_idx', false); +INFO: validate_index() start for sift_base1k_v_idx +INFO: validate_index() done, no issues found. + validate_index +---------------- + +(1 row) + +DROP INDEX sift_base1k_v_idx; +-- Create with params +SELECT lantern_create_external_index('v', 'sift_base1k', 'public', 'l2sq', 128, 10, 10, 10, 'hnsw_l2_index'); + lantern_create_external_index +------------------------------- + +(1 row) + +SELECT _lantern_internal.validate_index('hnsw_l2_index', false); +INFO: validate_index() start for hnsw_l2_index +INFO: validate_index() done, no issues found. + validate_index +---------------- + +(1 row) + +-- Reindex external index +SELECT lantern_reindex_external_index('hnsw_l2_index'); + lantern_reindex_external_index +-------------------------------- + +(1 row) + +SELECT _lantern_internal.validate_index('hnsw_l2_index', false); +INFO: validate_index() start for hnsw_l2_index +INFO: validate_index() done, no issues found. 
+ validate_index +---------------- + +(1 row) + diff --git a/test/schedule.txt b/test/schedule.txt index 7e6423c5f..f8210d6cd 100644 --- a/test/schedule.txt +++ b/test/schedule.txt @@ -5,3 +5,4 @@ test: hnsw_config hnsw_correct hnsw_create hnsw_create_expr hnsw_dist_func hnsw_insert hnsw_select hnsw_todo hnsw_index_from_file hnsw_cost_estimate ext_relocation hnsw_ef_search hnsw_failure_point hnsw_operators hnsw_blockmap_create test_pgvector: hnsw_vector +test_extras: hnsw_extras diff --git a/test/sql/hnsw_extras.sql b/test/sql/hnsw_extras.sql new file mode 100644 index 000000000..99eec85b0 --- /dev/null +++ b/test/sql/hnsw_extras.sql @@ -0,0 +1,27 @@ +------------------------------------------------------------------------------ +-- Test Functions exported from lantern_extras extension +------------------------------------------------------------------------------ + +\ir utils/sift1k_array.sql + +\set ON_ERROR_STOP off +CREATE EXTENSION lantern_extras; +-- Validate error on invalid params +SELECT lantern_create_external_index('v','sift_base1k', 'public', 'invalid_metric', 3, 10, 10, 10); +SELECT lantern_create_external_index('v','sift_base1k', 'public', 'l2sq', 3, -1, 10, 10); +SELECT lantern_create_external_index('v','sift_base1k', 'public', 'l2sq', 3, 10, -2, 10); +SELECT lantern_create_external_index('v','sift_base1k', 'public', 'l2sq', 3, 10, 10, -1); +\set ON_ERROR_STOP on + +-- Create with defaults +SELECT lantern_create_external_index('v', 'sift_base1k'); +SELECT _lantern_internal.validate_index('sift_base1k_v_idx', false); +DROP INDEX sift_base1k_v_idx; + +-- Create with params +SELECT lantern_create_external_index('v', 'sift_base1k', 'public', 'l2sq', 128, 10, 10, 10, 'hnsw_l2_index'); +SELECT _lantern_internal.validate_index('hnsw_l2_index', false); + +-- Reindex external index +SELECT lantern_reindex_external_index('hnsw_l2_index'); +SELECT _lantern_internal.validate_index('hnsw_l2_index', false); From 0c9e791cb5b6504ba277bc0d108d7b81c53843cf Mon Sep 17 
00:00:00 2001 From: Varik Matevosyan Date: Tue, 19 Dec 2023 15:01:12 +0400 Subject: [PATCH 04/13] Get function oid from syscache and add more tests --- ci/scripts/build-linux.sh | 2 +- src/hnsw/build.c | 67 +++++++++++++++++++++++++---------- test/expected/hnsw_extras.out | 4 +++ test/sql/hnsw_extras.sql | 4 +++ 4 files changed, 58 insertions(+), 19 deletions(-) diff --git a/ci/scripts/build-linux.sh b/ci/scripts/build-linux.sh index 13e9ea170..557349a72 100755 --- a/ci/scripts/build-linux.sh +++ b/ci/scripts/build-linux.sh @@ -27,7 +27,7 @@ function install_platform_specific_dependencies() { # Currently lantern_extras binaries are only available for Linux x86_64 # We won't install onnxruntime as lantern_extras are used only for external index in tests pushd /tmp - LANTERN_EXTRAS_VERSION=0.0.5 + LANTERN_EXTRAS_VERSION=0.0.6 wget https://github.com/lanterndata/lantern_extras/releases/download/${LANTERN_EXTRAS_VERSION}/lantern-extras-${LANTERN_EXTRAS_VERSION}.tar -O lantern-extras.tar tar xf lantern-extras.tar pushd lantern-extras-${LANTERN_EXTRAS_VERSION} diff --git a/src/hnsw/build.c b/src/hnsw/build.c index 558541337..16d261232 100644 --- a/src/hnsw/build.c +++ b/src/hnsw/build.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -18,6 +19,7 @@ #include #include #include +#include #include "usearch.h" @@ -544,31 +546,54 @@ void ldb_ambuildunlogged(Relation index) void ldb_reindex_external_index(Oid indrelid) { + HnswIndexHeaderPage *headerp; + FmgrInfo reindex_finfo = {0}; BlockNumber HEADER_BLOCK = 0; Relation index_rel; Buffer buf; Page page; + HeapTuple proc_tup; + Form_pg_proc procform; + Oid lantern_extras_schema_oid = InvalidOid; + Oid function_oid; + Oid function_argtypes_oid[ 6 ]; + oidvector *function_argtypes; char *metric_kind; - char *index_name; - HnswIndexHeaderPage *headerp; - FmgrInfo reindex_finfo = {0}; - char *function_sig = "_lantern_reindex_external_index(text, text, integer, integer, integer, integer)"; - Oid 
function_oid; - uint32_t dim = 0; - uint32_t m = 0; - uint32_t ef_construction = 0; - uint32_t ef = 0; - - PG_TRY(); - { - function_oid = DatumGetObjectId(DirectFunctionCall1(regprocedurein, CStringGetDatum(function_sig))); + const char *lantern_extras_schema = "lantern_extras"; + uint32_t dim = 0; + uint32_t m = 0; + uint32_t ef_construction = 0; + uint32_t ef = 0; + + lantern_extras_schema_oid = get_namespace_oid(lantern_extras_schema, true); + + if(!OidIsValid(lantern_extras_schema_oid)) { + elog(ERROR, "Schema %s not found", lantern_extras_schema); } - PG_CATCH(); - { + + // Check if _reindex_external_index function exists in lantern schema + function_argtypes_oid[ 0 ] = REGCLASSOID; + function_argtypes_oid[ 1 ] = TEXTOID; + function_argtypes_oid[ 2 ] = INT4OID; + function_argtypes_oid[ 3 ] = INT4OID; + function_argtypes_oid[ 4 ] = INT4OID; + function_argtypes_oid[ 5 ] = INT4OID; + function_argtypes = buildoidvector(function_argtypes_oid, 6); + proc_tup = SearchSysCache3(PROCNAMEARGSNSP, + PointerGetDatum("_reindex_external_index"), + function_argtypes, + ObjectIdGetDatum(lantern_extras_schema_oid)); + + if(!HeapTupleIsValid(proc_tup)) { + ReleaseSysCache(proc_tup); elog(ERROR, "Please install 'lantern_extras' extension or update it to the latest version"); } - PG_END_TRY(); + procform = (Form_pg_proc)GETSTRUCT(proc_tup); + function_oid = procform->oid; + ReleaseSysCache(proc_tup); + + // Get index params from index header page index_rel = relation_open(indrelid, AccessShareLock); buf = ReadBuffer(index_rel, HEADER_BLOCK); LockBuffer(buf, BUFFER_LOCK_SHARE); @@ -577,6 +602,7 @@ void ldb_reindex_external_index(Oid indrelid) assert(headerp->magicNumber == LDB_WAL_MAGIC_NUMBER); + // Convert metric_kind enum to string representation switch(headerp->metric_kind) { case usearch_metric_l2sq_k: metric_kind = "l2sq"; @@ -592,7 +618,6 @@ void ldb_reindex_external_index(Oid indrelid) ldb_invariant(true, "Unsupported metric kind"); } - index_name = 
pstrdup(RelationGetRelationName(index_rel)); dim = headerp->vector_dim; m = headerp->m; ef = headerp->ef; @@ -601,12 +626,18 @@ void ldb_reindex_external_index(Oid indrelid) UnlockReleaseBuffer(buf); relation_close(index_rel, AccessShareLock); + // We can not have external index without knowing dimensions + if(dim <= 0) { + elog(ERROR, "Column does not have dimensions: can not create external index on empty table"); + } + + // Get _reindex_external_index function info to do direct call into it fmgr_info(function_oid, &reindex_finfo); assert(reindex_finfo.fn_addr != NULL); DirectFunctionCall6(reindex_finfo.fn_addr, - CStringGetTextDatum(index_name), + ObjectIdGetDatum(indrelid), CStringGetTextDatum(metric_kind), Int32GetDatum(dim), Int32GetDatum(m), diff --git a/test/expected/hnsw_extras.out b/test/expected/hnsw_extras.out index d9e2a407a..be82a1b6c 100644 --- a/test/expected/hnsw_extras.out +++ b/test/expected/hnsw_extras.out @@ -18,6 +18,10 @@ SELECT lantern_create_external_index('v','sift_base1k', 'public', 'l2sq', 3, 10 ERROR: ef_construction should be in range [1, 400] SELECT lantern_create_external_index('v','sift_base1k', 'public', 'l2sq', 3, 10, 10, -1); ERROR: ef should be in range [1, 400] +-- Validate error on empty table +CREATE TABLE empty (v REAL[]); +SELECT lantern_create_external_index('v', 'empty'); +ERROR: Can not create external index on empty table \set ON_ERROR_STOP on -- Create with defaults SELECT lantern_create_external_index('v', 'sift_base1k'); diff --git a/test/sql/hnsw_extras.sql b/test/sql/hnsw_extras.sql index 99eec85b0..1d4c50275 100644 --- a/test/sql/hnsw_extras.sql +++ b/test/sql/hnsw_extras.sql @@ -11,6 +11,10 @@ SELECT lantern_create_external_index('v','sift_base1k', 'public', 'invalid_metri SELECT lantern_create_external_index('v','sift_base1k', 'public', 'l2sq', 3, -1, 10, 10); SELECT lantern_create_external_index('v','sift_base1k', 'public', 'l2sq', 3, 10, -2, 10); SELECT lantern_create_external_index('v','sift_base1k', 
'public', 'l2sq', 3, 10, 10, -1); + +-- Validate error on empty table +CREATE TABLE empty (v REAL[]); +SELECT lantern_create_external_index('v', 'empty'); \set ON_ERROR_STOP on -- Create with defaults From c94f19c2f344b4ddc03d7efa1cf0b14604fddc7d Mon Sep 17 00:00:00 2001 From: Varik Matevosyan Date: Tue, 19 Dec 2023 15:11:19 +0400 Subject: [PATCH 05/13] Fix type conversion --- src/hnsw/build.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hnsw/build.c b/src/hnsw/build.c index 16d261232..eeb5270ff 100644 --- a/src/hnsw/build.c +++ b/src/hnsw/build.c @@ -581,7 +581,7 @@ void ldb_reindex_external_index(Oid indrelid) function_argtypes = buildoidvector(function_argtypes_oid, 6); proc_tup = SearchSysCache3(PROCNAMEARGSNSP, PointerGetDatum("_reindex_external_index"), - function_argtypes, + PointerGetDatum(function_argtypes), ObjectIdGetDatum(lantern_extras_schema_oid)); if(!HeapTupleIsValid(proc_tup)) { From fec5c0d770e97c8ea1738b197fb8af4af3d64535 Mon Sep 17 00:00:00 2001 From: Varik Matevosyan Date: Tue, 19 Dec 2023 15:49:54 +0400 Subject: [PATCH 06/13] Get function oid instead of tuple --- src/hnsw/build.c | 25 ++++++++++--------------- test/expected/hnsw_extras.out | 2 +- 2 files changed, 11 insertions(+), 16 deletions(-) diff --git a/src/hnsw/build.c b/src/hnsw/build.c index eeb5270ff..a1c15b103 100644 --- a/src/hnsw/build.c +++ b/src/hnsw/build.c @@ -552,8 +552,6 @@ void ldb_reindex_external_index(Oid indrelid) Relation index_rel; Buffer buf; Page page; - HeapTuple proc_tup; - Form_pg_proc procform; Oid lantern_extras_schema_oid = InvalidOid; Oid function_oid; Oid function_argtypes_oid[ 6 ]; @@ -564,11 +562,12 @@ void ldb_reindex_external_index(Oid indrelid) uint32_t m = 0; uint32_t ef_construction = 0; uint32_t ef = 0; + char *ext_not_found_err = "Please install 'lantern_extras' extension or update it to the latest version"; lantern_extras_schema_oid = get_namespace_oid(lantern_extras_schema, true); if(!OidIsValid(lantern_extras_schema_oid))
{ - elog(ERROR, "Schema %s not found", lantern_extras_schema); + elog(ERROR, "%s", ext_not_found_err); } // Check if _reindex_external_index function exists in lantern schema @@ -579,20 +578,16 @@ void ldb_reindex_external_index(Oid indrelid) function_argtypes_oid[ 4 ] = INT4OID; function_argtypes_oid[ 5 ] = INT4OID; function_argtypes = buildoidvector(function_argtypes_oid, 6); - proc_tup = SearchSysCache3(PROCNAMEARGSNSP, - PointerGetDatum("_reindex_external_index"), - PointerGetDatum(function_argtypes), - ObjectIdGetDatum(lantern_extras_schema_oid)); - - if(!HeapTupleIsValid(proc_tup)) { - ReleaseSysCache(proc_tup); - elog(ERROR, "Please install 'lantern_extras' extension or update it to the latest version"); - } - procform = (Form_pg_proc)GETSTRUCT(proc_tup); - function_oid = procform->oid; - ReleaseSysCache(proc_tup); + function_oid = GetSysCacheOid3(PROCNAMEARGSNSP, + Anum_pg_proc_oid, + PointerGetDatum("_reindex_external_index"), + PointerGetDatum(function_argtypes), + ObjectIdGetDatum(lantern_extras_schema_oid)); + if(!OidIsValid(function_oid)) { + elog(ERROR, "%s", ext_not_found_err); + } // Get index params from index header page index_rel = relation_open(indrelid, AccessShareLock); buf = ReadBuffer(index_rel, HEADER_BLOCK); diff --git a/test/expected/hnsw_extras.out b/test/expected/hnsw_extras.out index be82a1b6c..307dafb61 100644 --- a/test/expected/hnsw_extras.out +++ b/test/expected/hnsw_extras.out @@ -21,7 +21,7 @@ ERROR: ef should be in range [1, 400] -- Validate error on empty table CREATE TABLE empty (v REAL[]); SELECT lantern_create_external_index('v', 'empty'); -ERROR: Can not create external index on empty table +ERROR: Cannot create an external index on empty table \set ON_ERROR_STOP on -- Create with defaults SELECT lantern_create_external_index('v', 'sift_base1k'); From 8ff0b92a4e8c31e752631f1f0b0806c0de47e41c Mon Sep 17 00:00:00 2001 From: Varik Matevosyan Date: Tue, 19 Dec 2023 16:57:36 +0400 Subject: [PATCH 07/13] Fix GetSysCacheOid for 
pg11 --- src/hnsw/build.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/hnsw/build.c b/src/hnsw/build.c index a1c15b103..63be14dd6 100644 --- a/src/hnsw/build.c +++ b/src/hnsw/build.c @@ -552,7 +552,7 @@ void ldb_reindex_external_index(Oid indrelid) Relation index_rel; Buffer buf; Page page; - Oid lantern_extras_schema_oid = InvalidOid; + Oid lantern_extras_namespace_oid = InvalidOid; Oid function_oid; Oid function_argtypes_oid[ 6 ]; oidvector *function_argtypes; @@ -564,9 +564,9 @@ void ldb_reindex_external_index(Oid indrelid) uint32_t ef = 0; char *ext_not_found_err = "Please install 'lantern_extras' extension or update it to the latest version"; - lantern_extras_schema_oid = get_namespace_oid(lantern_extras_schema, true); + lantern_extras_namespace_oid = get_namespace_oid(lantern_extras_schema, true); - if(!OidIsValid(lantern_extras_schema_oid)) { + if(!OidIsValid(lantern_extras_namespace_oid)) { elog(ERROR, "%s", ext_not_found_err); } @@ -579,11 +579,14 @@ void ldb_reindex_external_index(Oid indrelid) function_argtypes_oid[ 5 ] = INT4OID; function_argtypes = buildoidvector(function_argtypes_oid, 6); - function_oid = GetSysCacheOid3(PROCNAMEARGSNSP, - Anum_pg_proc_oid, - PointerGetDatum("_reindex_external_index"), - PointerGetDatum(function_argtypes), - ObjectIdGetDatum(lantern_extras_schema_oid)); + function_oid = GetSysCacheOid(PROCNAMEARGSNSP, +#if PG_VERSION_NUM >= 120000 + Anum_pg_proc_oid, +#endif + CStringGetDatum("_reindex_external_index"), + PointerGetDatum(function_argtypes), + ObjectIdGetDatum(lantern_extras_namespace_oid), + 0); if(!OidIsValid(function_oid)) { elog(ERROR, "%s", ext_not_found_err); From e2d9bd12aa421b7be27f083816a367b2fd056bb5 Mon Sep 17 00:00:00 2001 From: Ezra Varady <76978395+ezra-varady@users.noreply.github.com> Date: Fri, 22 Dec 2023 11:40:23 -1000 Subject: [PATCH 08/13] Add binary versioning (#249) * add machinery to check version on various queries * improve error messages * symlink 
shared utilities add update tests to induce mismatch * feature gate misc tests and fix use after poison * expand feature gate * replace >> with > * add utility class to compare versions, allow builds * additional logging to get insight into workflow * change warning to error so it surfaces * use a proper template for generation. remove unused extern variables. reduce redundancy in version helper class * remove preemptive error, reinstall gettext in mac build script when in the github runner * fix user on relink * add upterm session to debug action * retab workflow * see if we can induce the condition manually * retrigger to see if tmux error was spurious * set it to on failure again * remove upterm job, possibly it was a caching issue? --- CMakeLists.txt | 13 ++++ ci/scripts/build-mac.sh | 3 + cmake/version.h.template | 6 ++ scripts/run_all_tests.sh | 17 ++++- scripts/test_updates.py | 30 ++++++++ src/hnsw/insert.c | 6 ++ src/hnsw/options.c | 6 ++ src/hnsw/scan.c | 9 ++- src/hnsw/utils.c | 66 ++++++++++++++++++ src/hnsw/utils.h | 1 + test/misc/expected/begin.out | 6 ++ test/misc/expected/version_mismatch.out | 4 ++ test/misc/sql/begin.sql | 1 + test/misc/sql/utils/common.sql | 1 + test/misc/sql/utils/sift1k_array.sql | 1 + test/misc/sql/version_mismatch.sql | 2 + test/misc_schedule.txt | 7 ++ test/parallel/sql/utils/common.sql | 91 +------------------------ test/test_runner.sh | 12 ++-- 19 files changed, 184 insertions(+), 98 deletions(-) create mode 100644 cmake/version.h.template create mode 100644 test/misc/expected/begin.out create mode 100644 test/misc/expected/version_mismatch.out create mode 100644 test/misc/sql/begin.sql create mode 120000 test/misc/sql/utils/common.sql create mode 120000 test/misc/sql/utils/sift1k_array.sql create mode 100644 test/misc/sql/version_mismatch.sql create mode 100644 test/misc_schedule.txt mode change 100644 => 120000 test/parallel/sql/utils/common.sql diff --git a/CMakeLists.txt b/CMakeLists.txt index 4053c6dd3..a294b3aef 
100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -204,6 +204,12 @@ add_custom_command( add_custom_target(phony_always_runs ALL DEPENDS ${CMAKE_BINARY_DIR}/${_script_file}) +set(VERSION_HEADER_TEMPLATE "${CMAKE_MODULE_PATH}/version.h.template") +set(VERSION_HEADER_OUTPUT "${CMAKE_BINARY_DIR}/include/version.h") +configure_file(${VERSION_HEADER_TEMPLATE} ${VERSION_HEADER_OUTPUT}) + +target_include_directories(lantern PUBLIC ${CMAKE_BINARY_DIR}/include) + # AUTO-GENERATE lantern.control file for PostgreSQL set(CONTROL_TEMPLATE "${CMAKE_MODULE_PATH}/lantern.control.template") @@ -248,6 +254,13 @@ add_custom_target( WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/test ) + +add_custom_target( + test-misc + COMMAND ${CMAKE_SOURCE_DIR}/scripts/run_all_tests.sh --misc + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/test +) + # BENCHMARK add_custom_target( benchmark diff --git a/ci/scripts/build-mac.sh b/ci/scripts/build-mac.sh index ddd8b8057..467923e13 100755 --- a/ci/scripts/build-mac.sh +++ b/ci/scripts/build-mac.sh @@ -13,6 +13,9 @@ function setup_postgres() { then # Runner is github CI user sh -c "sudo -u runner -i $cmd" + sh -c "sudo -u runner -i brew reinstall gettext" + sh -c "sudo -u runner -i brew unlink gettext" + sh -c "sudo -u runner -i brew link gettext --force" else sh -c $cmd fi diff --git a/cmake/version.h.template b/cmake/version.h.template new file mode 100644 index 000000000..7d9423ffb --- /dev/null +++ b/cmake/version.h.template @@ -0,0 +1,6 @@ +#ifndef LDB_HNSW_VERSION_H +#define LDB_HNSW_VERSION_H + +#define LDB_BINARY_VERSION "@LANTERNDB_VERSION@" + +#endif diff --git a/scripts/run_all_tests.sh b/scripts/run_all_tests.sh index f18569349..2157250c9 100755 --- a/scripts/run_all_tests.sh +++ b/scripts/run_all_tests.sh @@ -78,11 +78,13 @@ lantern_extras_installed=$($PSQL -U $DB_USER -p $DB_PORT -d postgres -c "SELECT # Settings REGRESSION=0 PARALLEL=0 +MISC=0 C_TESTS=0 while [[ "$#" -gt 0 ]]; do case $1 in --regression) REGRESSION=1 ;; --parallel) PARALLEL=1 ;; + 
--misc) MISC=1 ;; --client) C_TESTS=1 ;; esac shift @@ -107,9 +109,12 @@ function print_test { rm -rf $TMP_OUTDIR/schedule.txt if [ "$PARALLEL" -eq 1 ]; then SCHEDULE='parallel_schedule.txt' +elif [ "$MISC" -eq 1 ]; then + SCHEDULE='misc_schedule.txt' else SCHEDULE='schedule.txt' fi + if [[ -n "$FILTER" || -n "$EXCLUDE" ]]; then if [ "$PARALLEL" -eq 1 ]; then TEST_FILES=$(cat $SCHEDULE | grep -E '^(test:|test_begin:|test_end:)' | sed -E -e 's/^test_begin:|test_end:/test:/' | tr " " "\n" | sed -e '/^$/d') @@ -168,6 +173,11 @@ if [[ -n "$FILTER" || -n "$EXCLUDE" ]]; then exit 0 fi else + if [ "$MISC" -eq 1 ]; then + echo "misc tests are not intended to be run in parallel, please include a FILTER" + exit 1 + fi + while IFS= read -r line; do if [[ "$line" =~ ^test_pgvector: ]]; then test_name=$(echo "$line" | sed -e 's/test_pgvector://') @@ -209,7 +219,10 @@ trap print_diff ERR if [ "$PARALLEL" -eq 1 ]; then cd parallel - PARALLEL=$PARALLEL DB_USER=$DB_USER $(pg_config --pkglibdir)/pgxs/src/test/regress/pg_regress --user=$DB_USER --schedule=$SCHEDULE --outputdir=$TMP_OUTDIR --launcher=../test_runner.sh + MISC=$MISC PARALLEL=$PARALLEL DB_USER=$DB_USER $(pg_config --pkglibdir)/pgxs/src/test/regress/pg_regress --user=$DB_USER --schedule=$SCHEDULE --outputdir=$TMP_OUTDIR --launcher=../test_runner.sh +elif [ "$MISC" -eq 1 ]; then + cd misc + MISC=$MISC PARALLEL=$PARALLEL DB_USER=$DB_USER $(pg_config --pkglibdir)/pgxs/src/test/regress/pg_regress --user=$DB_USER --schedule=$SCHEDULE --outputdir=$TMP_OUTDIR --launcher=../test_runner.sh else - PARALLEL=$PARALLEL DB_USER=$DB_USER $(pg_config --pkglibdir)/pgxs/src/test/regress/pg_regress --user=$DB_USER --schedule=$SCHEDULE --outputdir=$TMP_OUTDIR --launcher=./test_runner.sh + MISC=$MISC PARALLEL=$PARALLEL DB_USER=$DB_USER $(pg_config --pkglibdir)/pgxs/src/test/regress/pg_regress --user=$DB_USER --schedule=$SCHEDULE --outputdir=$TMP_OUTDIR --launcher=./test_runner.sh fi diff --git a/scripts/test_updates.py 
b/scripts/test_updates.py index d83a31b4c..62d4b321c 100644 --- a/scripts/test_updates.py +++ b/scripts/test_updates.py @@ -10,6 +10,28 @@ '16': ['0.0.4'] } +class Version: + def __init__(self, version: str): + self.version_numbers = [int(n) for n in version.split('.')] + def __lt__(self, other): + for i, v in enumerate(self.version_numbers): + if v < other.version_numbers[i]: + return True + return False + def __eq__(self, other): + for i, v in enumerate(self.version_numbers): + if v != other.version_numbers[i]: + return False + return True + def __le__(self, other): + return self < other or self == other + def __ne__(self, other): + return not self == other + def __gt__(self, other): + return not self == other and not self < other + def __ge__(self, other): + return not self < other + def shell(cmd, exit_on_error=True): res = subprocess.run(cmd, shell=True) if res.returncode != 0: @@ -48,9 +70,17 @@ def update_from_tag(from_version: str, to_version: str): res = shell('rm -f /tmp/ldb_update.lock') res = shell('rm -f /tmp/ldb_update_finished') res = shell(f"cd {args.builddir} ; UPDATE_EXTENSION=1 UPDATE_FROM={from_version} UPDATE_TO={from_version} make test-parallel FILTER=begin") + + if Version(from_version) > Version('0.0.10'): + # misc tests added at v0.0.10, won't work before that + # initialize misc tests to ensure that version mismatch results in an error + res = shell(f"cd {args.builddir} ; UPDATE_EXTENSION=1 UPDATE_FROM={from_version} UPDATE_TO={from_version} make test-misc FILTER=begin") + repo.git.checkout(sha_before) res = shell(f"cd {args.builddir} ; git submodule update && cmake .. 
&& make -j4 && make install") # res = shell(f"cd {args.builddir} ; UPDATE_EXTENSION=1 UPDATE_FROM={from_version} UPDATE_TO={to_version} make test") + if Version(from_version) > Version('0.0.10'): + res = shell(f"cd {args.builddir} ; UPDATE_EXTENSION=1 UPDATE_FROM={from_version} UPDATE_TO={from_version} make test-misc FILTER=version_mismatch") # run the actual parallel tests after the upgrade res = shell(f"cd {args.builddir} ; UPDATE_EXTENSION=1 UPDATE_FROM={from_version} UPDATE_TO={to_version} make test-parallel EXCLUDE=begin") diff --git a/src/hnsw/insert.c b/src/hnsw/insert.c index aaff26900..2070bd3f1 100644 --- a/src/hnsw/insert.c +++ b/src/hnsw/insert.c @@ -76,6 +76,12 @@ bool ldb_aminsert(Relation index, LDB_UNUSED(indexUnchanged); #endif + if(!VersionsMatch()) { + elog(ERROR, + "Attempting to insert into lantern index, but the SQL version and binary version do not match. This can " + "cause errors. Please run `ALTER EXTENSION lantern UPDATE and reconnect"); + } + HnswInsertState *insertstate = palloc0(sizeof(HnswInsertState)); if(checkUnique != UNIQUE_CHECK_NO) { diff --git a/src/hnsw/options.c b/src/hnsw/options.c index 2c8f2e7fb..354864c51 100644 --- a/src/hnsw/options.c +++ b/src/hnsw/options.c @@ -146,6 +146,12 @@ void _PG_init(void) "Make sure to restart the server before running ALTER EXTENSION lantern UPDATE"); } + if(!VersionsMatch()) { + elog( + WARNING, + "LanternDB binary version does not match the version in SQL. This can cause errors as the two APIs may " + "differ. 
Please run `ALTER EXTENSION lantern UPDATE` and reconnect before attempting to work with indices"); + } original_post_parse_analyze_hook = post_parse_analyze_hook; original_ExecutorStart_hook = ExecutorStart_hook; diff --git a/src/hnsw/scan.c b/src/hnsw/scan.c index dc761dba9..6888194ca 100644 --- a/src/hnsw/scan.c +++ b/src/hnsw/scan.c @@ -26,7 +26,14 @@ IndexScanDesc ldb_ambeginscan(Relation index, int nkeys, int norderbys) int dimensions; usearch_error_t error = NULL; usearch_init_options_t opts; - RetrieverCtx *retriever_ctx = ldb_wal_retriever_area_init(index, NULL); + + if(!VersionsMatch()) { + elog(ERROR, + "Attempting to scan lantern index, but the SQL version and binary version do not match. This can cause " + "errors. Please run `ALTER EXTENSION lantern UPDATE and reconnect"); + } + + RetrieverCtx *retriever_ctx = ldb_wal_retriever_area_init(index, NULL); scan = RelationGetIndexScan(index, nkeys, norderbys); diff --git a/src/hnsw/utils.c b/src/hnsw/utils.c index e310e40b9..0b04e8808 100644 --- a/src/hnsw/utils.c +++ b/src/hnsw/utils.c @@ -4,10 +4,12 @@ #include #include +#include #include #include #include #include +#include #if PG_VERSION_NUM >= 130000 #include @@ -17,6 +19,10 @@ #include "hnsw.h" #include "options.h" #include "usearch.h" +#include "version.h" + +bool versions_match = false; +bool version_checked = false; void LogUsearchOptions(usearch_init_options_t *opts) { @@ -103,3 +109,63 @@ float4 *ToFloat4Array(ArrayType *arr) elog(ERROR, "unsupported element type: %d", element_type); } } + +// Check if the binary version matches the schema version caching the result after the first check +// This is used to prevent interacting with the index when the two don't match +bool VersionsMatch() +{ + if(likely(version_checked)) { + return versions_match; + } else { + const char *query; + const char *version; + bool isnull; + int version_length; + int spi_result; + int comparison; + Datum val; + text *version_text; + + if(SPI_connect() != SPI_OK_CONNECT) 
{ + elog(ERROR, "could not connect to executor to check binary version"); + } + + query = "SELECT extversion FROM pg_extension WHERE extname = 'lantern'"; + + // Execute the query to figure out what version of lantern is in use in SQL + spi_result = SPI_execute(query, true, 0); + if(spi_result != SPI_OK_SELECT) { + elog(ERROR, "SPI_execute returned %s for %s", SPI_result_code_string(spi_result), query); + } + + // Global containing the number of rows processed, should be just 1 + if(SPI_processed != 1) { + elog(ERROR, "SQL version query did not return any values"); + } + + // SPI_tuptable is a global populated by SPI_execute + val = SPI_getbinval(SPI_tuptable->vals[ 0 ], SPI_tuptable->tupdesc, 1, &isnull); + + if(isnull) { + elog(ERROR, "Version query returned null"); + } + + // Grab the result and check that it matches the version in the generated header + version_text = DatumGetTextP(val); + version = text_to_cstring(version_text); + version_length = strlen(version); + if(sizeof(LDB_BINARY_VERSION) >= (unsigned)version_length) { + version_length = sizeof(LDB_BINARY_VERSION); + } + + comparison = strncmp(version, LDB_BINARY_VERSION, version_length); + + if(comparison == 0) { + versions_match = true; + } + version_checked = true; + + SPI_finish(); + return versions_match; + } +} diff --git a/src/hnsw/utils.h b/src/hnsw/utils.h index 90a57a880..2472403f9 100644 --- a/src/hnsw/utils.h +++ b/src/hnsw/utils.h @@ -11,6 +11,7 @@ void LogUsearchOptions(usearch_init_options_t *opts); void PopulateUsearchOpts(Relation index, usearch_init_options_t *opts); usearch_label_t GetUsearchLabel(ItemPointer itemPtr); float4 *ToFloat4Array(ArrayType *arr); +bool VersionsMatch(); static inline void ldb_invariant(bool condition, const char *msg, ...) 
{ diff --git a/test/misc/expected/begin.out b/test/misc/expected/begin.out new file mode 100644 index 000000000..f3e3a7f51 --- /dev/null +++ b/test/misc/expected/begin.out @@ -0,0 +1,6 @@ +\ir utils/sift1k_array.sql +CREATE TABLE IF NOT EXISTS sift_base1k ( + id SERIAL, + v REAL[] +); +COPY sift_base1k (v) FROM '/tmp/lantern/vector_datasets/sift_base1k_arrays.csv' WITH csv; diff --git a/test/misc/expected/version_mismatch.out b/test/misc/expected/version_mismatch.out new file mode 100644 index 000000000..6dc486213 --- /dev/null +++ b/test/misc/expected/version_mismatch.out @@ -0,0 +1,4 @@ +-- Validate that creating an index with mismatched versions fails +CREATE INDEX ON sift_base1k USING hnsw (v) WITH (dim=128, M=4); +WARNING: LanternDB binary version does not match the version in SQL. This can cause errors as the two APIs may differ. Please run `ALTER EXTENSION lantern UPDATE` and reconnect before attempting to work with indices +ERROR: Attempting to build lantern index, but the SQL version and binary version do not match. This can cause errors. 
Please run `ALTER EXTENSION lantern UPDATE and reconnect diff --git a/test/misc/sql/begin.sql b/test/misc/sql/begin.sql new file mode 100644 index 000000000..b08f85894 --- /dev/null +++ b/test/misc/sql/begin.sql @@ -0,0 +1 @@ +\ir utils/sift1k_array.sql diff --git a/test/misc/sql/utils/common.sql b/test/misc/sql/utils/common.sql new file mode 120000 index 000000000..fbbc1ab5f --- /dev/null +++ b/test/misc/sql/utils/common.sql @@ -0,0 +1 @@ +../../../sql/utils/common.sql \ No newline at end of file diff --git a/test/misc/sql/utils/sift1k_array.sql b/test/misc/sql/utils/sift1k_array.sql new file mode 120000 index 000000000..aee1aa4a4 --- /dev/null +++ b/test/misc/sql/utils/sift1k_array.sql @@ -0,0 +1 @@ +../../../sql/utils/sift1k_array.sql \ No newline at end of file diff --git a/test/misc/sql/version_mismatch.sql b/test/misc/sql/version_mismatch.sql new file mode 100644 index 000000000..01264a4ad --- /dev/null +++ b/test/misc/sql/version_mismatch.sql @@ -0,0 +1,2 @@ +-- Validate that creating an index with mismatched versions fails +CREATE INDEX ON sift_base1k USING hnsw (v) WITH (dim=128, M=4); diff --git a/test/misc_schedule.txt b/test/misc_schedule.txt new file mode 100644 index 000000000..c11cdbb44 --- /dev/null +++ b/test/misc_schedule.txt @@ -0,0 +1,7 @@ + +# schedule.txt rules: +# - every test that needs to be run must appear in a 'test:' line +# - every test that needs to be run iff pgvector is installed appears in a 'test_pgvector:' line +# - 'test' lines may have multiple space-separated tests. 
All tests in a single 'test' line will be run in parallel + +test: begin version_mismatch diff --git a/test/parallel/sql/utils/common.sql b/test/parallel/sql/utils/common.sql deleted file mode 100644 index 89ae94f35..000000000 --- a/test/parallel/sql/utils/common.sql +++ /dev/null @@ -1,90 +0,0 @@ --- N.B.: This file shall be maintained such that it can safely be rerun without throwing an error --- This is because in upgrade tests we may run this multiple times in preparation for sequential --- and parallel upgrade tests - --- test helper functions that should exist in all test runs live here --- there is no need to explicitly include this file in other tests as the test runner will --- run this before running the actual test - -CREATE EXTENSION IF NOT EXISTS pageinspect; - -\set ON_ERROR_STOP on - --- retrieves details for all indices associated with a given table, similar to \di+ --- the output of \di+ is not consistent across postgres versions --- todo:: add a columns to this function which returning number of used DB pages -CREATE OR REPLACE FUNCTION ldb_get_indexes(tblname text) -RETURNS TABLE( - indexname name, - size text, - indexdef text, - total_index_size text -) AS -$BODY$ -BEGIN - RETURN QUERY - WITH total_size_data AS ( - SELECT - SUM(pg_relation_size(indexrelid)) as total_size - FROM - pg_index - WHERE - indisvalid - AND indrelid = tblname::regclass - ) - SELECT - idx.indexname, - pg_size_pretty(pg_relation_size(idx.indexname::REGCLASS)) as size, - idx.indexdef, - pg_size_pretty(total_size_data.total_size) as total_index_size - FROM - pg_indexes idx, - total_size_data - WHERE - idx.tablename = tblname; -END; -$BODY$ -LANGUAGE plpgsql; - --- Determines if the provided SQL query (with an EXPLAIN prefix) uses an "Index Scan" --- by examining its execution plan. This function helps ensure consistent analysis --- across varying Postgres versions where EXPLAIN output may differ. 
-CREATE OR REPLACE FUNCTION has_index_scan(explain_query text) RETURNS boolean AS $$ -DECLARE - plan_row RECORD; - found boolean := false; -BEGIN - FOR plan_row IN EXECUTE explain_query LOOP - IF position('Index Scan' in plan_row."QUERY PLAN") > 0 THEN - found := true; - EXIT; - END IF; - END LOOP; - RETURN found; -END; -$$ LANGUAGE plpgsql; - --- Determine if the two queries provided return the same results --- At the moment this only works on queries that return rows with the same entries as one another --- if you try to compare uneven numbers of columns or columns of different types it will generate an error -CREATE OR REPLACE FUNCTION results_match(left_query text, right_query text) RETURNS boolean AS $$ -DECLARE - left_cursor REFCURSOR; - left_row RECORD; - - right_cursor REFCURSOR; - right_row RECORD; -BEGIN - OPEN left_cursor FOR EXECUTE left_query; - OPEN right_cursor FOR EXECUTE right_query; - LOOP - FETCH NEXT FROM left_cursor INTO left_row; - FETCH NEXT FROM right_cursor INTO right_row; - IF left_row != right_row THEN - RETURN false; - ELSEIF left_row IS NULL AND right_row IS NULL THEN - RETURN true; - END IF; - END LOOP; -END; -$$ LANGUAGE plpgsql; diff --git a/test/parallel/sql/utils/common.sql b/test/parallel/sql/utils/common.sql new file mode 120000 index 000000000..fbbc1ab5f --- /dev/null +++ b/test/parallel/sql/utils/common.sql @@ -0,0 +1 @@ +../../../sql/utils/common.sql \ No newline at end of file diff --git a/test/test_runner.sh b/test/test_runner.sh index b1086d90c..74269b5cb 100755 --- a/test/test_runner.sh +++ b/test/test_runner.sh @@ -3,13 +3,15 @@ # Get current test file name TESTFILE_NAME=${PGAPPNAME##pg_regress/} -if [ "$PARALLEL" -eq 0 ]; then +if [ "$PARALLEL" -eq 1 ]; then + # parallel tests all run in the same database + TEST_CASE_DB="ldb_parallel" +elif [ "$MISC" -eq 1 ]; then + TEST_CASE_DB="ldb_misc" +else # Set different name for each test database # As pg_regress does not support cleaning db after each test 
TEST_CASE_DB="ldb_test_${TESTFILE_NAME}" -else - # parallel tests all run in the same database - TEST_CASE_DB="ldb_parallel" fi # Set database user @@ -52,7 +54,7 @@ function run_regression_test { cd sql/ # install lantern extension -if [[ "$PARALLEL" -eq 0 || "$TESTFILE_NAME" == "begin" ]]; then +if [[ ("$PARALLEL" -eq 0 && "$MISC" -eq 0) || "$TESTFILE_NAME" == "begin" ]]; then psql "$@" -U ${DB_USER} -d postgres -v ECHO=none -q -c "DROP DATABASE IF EXISTS ${TEST_CASE_DB};" 2>/dev/null psql "$@" -U ${DB_USER} -d postgres -v ECHO=none -q -c "CREATE DATABASE ${TEST_CASE_DB};" 2>/dev/null fi From 2eaa90c168329e1b2bd96188924f6eba66bd53cf Mon Sep 17 00:00:00 2001 From: Ezra Varady <76978395+ezra-varady@users.noreply.github.com> Date: Sat, 23 Dec 2023 22:27:37 -1000 Subject: [PATCH 09/13] Disable version mismatch test till v0.0.12 (#254) * disable version mismatch test till v0.0.12 * only use existing tags in update tests * drop .sql from from_tag earlier --- scripts/test_updates.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/scripts/test_updates.py b/scripts/test_updates.py index 62d4b321c..68e4b754e 100644 --- a/scripts/test_updates.py +++ b/scripts/test_updates.py @@ -71,7 +71,7 @@ def update_from_tag(from_version: str, to_version: str): res = shell('rm -f /tmp/ldb_update_finished') res = shell(f"cd {args.builddir} ; UPDATE_EXTENSION=1 UPDATE_FROM={from_version} UPDATE_TO={from_version} make test-parallel FILTER=begin") - if Version(from_version) > Version('0.0.10'): + if Version(from_version) > Version('0.0.11'): # misc tests added at v0.0.10, won't work before that # initialize misc tests to ensure that version mismatch results in an error res = shell(f"cd {args.builddir} ; UPDATE_EXTENSION=1 UPDATE_FROM={from_version} UPDATE_TO={from_version} make test-misc FILTER=begin") @@ -79,7 +79,7 @@ def update_from_tag(from_version: str, to_version: str): repo.git.checkout(sha_before) res = shell(f"cd {args.builddir} ; git submodule update 
&& cmake .. && make -j4 && make install") # res = shell(f"cd {args.builddir} ; UPDATE_EXTENSION=1 UPDATE_FROM={from_version} UPDATE_TO={to_version} make test") - if Version(from_version) > Version('0.0.10'): + if Version(from_version) > Version('0.0.11'): res = shell(f"cd {args.builddir} ; UPDATE_EXTENSION=1 UPDATE_FROM={from_version} UPDATE_TO={from_version} make test-misc FILTER=version_mismatch") # run the actual parallel tests after the upgrade @@ -128,9 +128,14 @@ def sort_versions(v1, v2): # test updates from all tags tag_pairs = [update_fname.split("--") for update_fname in os.listdir("sql/updates")] + tag_pairs = [(from_tag, to_tag.split('.sql')[0]) for from_tag, to_tag in tag_pairs] + repo = git.Repo(search_parent_directories=True) + tags_actual = [tag.name for tag in repo.tags] + tags_actual = [name[1:] if name[0] == 'v' else name for name in tags_actual] + tag_pairs = [(from_tag, to_tag) for from_tag, to_tag in tag_pairs if from_tag in tags_actual and to_tag in tags_actual] from_tags = list(sorted([p[0] for p in tag_pairs], key=cmp_to_key(sort_versions))) from_tags.reverse() - to_tags = list(sorted([p[1].split(".sql")[0] for p in tag_pairs], key=cmp_to_key(sort_versions))) + to_tags = list(sorted([p[1] for p in tag_pairs], key=cmp_to_key(sort_versions))) latest_version = to_tags[-1] print("Updating from tags", from_tags, "to ", latest_version) From 040f24253e5a265194ad1b2950a03e524d86b2c8 Mon Sep 17 00:00:00 2001 From: Varik Matevosyan Date: Thu, 28 Dec 2023 12:10:00 +0400 Subject: [PATCH 10/13] mmap index file to memory to reduce memory usage (#252) * mmap indexfile to memory instead of loading it entirely to reduce memory usage * fix fd type * use relfilenode instead of index name to avoid collision * Refactor code to be more readable * use snprintf instead of sprintf * Move out function call from assert macro * Remove unnecessary initialization * Wrap variable in unused macro to suppress compiler warning --- src/hnsw/build.c | 66
++++++++++++++++++++++++++++++++------- src/hnsw/external_index.c | 11 +++---- src/hnsw/external_index.h | 2 +- 3 files changed, 60 insertions(+), 19 deletions(-) diff --git a/src/hnsw/build.c b/src/hnsw/build.c index 63be14dd6..f2480df4c 100644 --- a/src/hnsw/build.c +++ b/src/hnsw/build.c @@ -15,6 +15,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -420,6 +423,17 @@ static void BuildIndex( { usearch_error_t error = NULL; usearch_init_options_t opts; + struct stat index_file_stat; + char *result_buf = NULL; + char *tmp_index_file_path = NULL; + const char *tmp_index_file_fmt_str = "/tmp/ldb-index-%d.bin"; + // size of static name + max digits of uint32 (Oid) 10 + 1 for nullbyte and - 2 for %d format specifier + const uint32 tmp_index_file_char_cnt = strlen(tmp_index_file_fmt_str) + 9; + int index_file_fd; + int munmap_ret; + usearch_metadata_t metadata; + size_t num_added_vectors; + MemSet(&opts, 0, sizeof(opts)); InitBuildState(buildstate, heap, index, indexInfo); @@ -446,7 +460,7 @@ static void BuildIndex( } elog(INFO, "done loading usearch index"); - usearch_metadata_t metadata = usearch_metadata(buildstate->usearch_index, &error); + metadata = usearch_metadata(buildstate->usearch_index, &error); assert(error == NULL); opts.connectivity = metadata.connectivity; opts.dimensions = metadata.dimensions; @@ -494,25 +508,53 @@ static void BuildIndex( assert(error == NULL); } - char *result_buf = NULL; - usearch_save(buildstate->usearch_index, NULL, &result_buf, &error); - assert(error == NULL && result_buf != NULL); - - size_t num_added_vectors = usearch_size(buildstate->usearch_index, &error); + metadata = usearch_metadata(buildstate->usearch_index, &error); assert(error == NULL); + if(buildstate->index_file_path == NULL) { + // Save index into temporary file + // To later mmap it into memory + // Filename is /tmp/ldb-index-$relfilenode.bin + // The file will be removed in the end + tmp_index_file_path = 
palloc0(tmp_index_file_char_cnt); + snprintf(tmp_index_file_path, tmp_index_file_char_cnt, tmp_index_file_fmt_str, index->rd_rel->relfilenode); + usearch_save(buildstate->usearch_index, tmp_index_file_path, NULL, &error); + assert(error == NULL); + index_file_fd = open(tmp_index_file_path, O_RDONLY); + } else { + index_file_fd = open(buildstate->index_file_path, O_RDONLY); + } + assert(index_file_fd > 0); + + num_added_vectors = usearch_size(buildstate->usearch_index, &error); + assert(error == NULL); elog(INFO, "done saving %ld vectors", num_added_vectors); + //****************************** mmap index to memory BEGIN ******************************// + usearch_free(buildstate->usearch_index, &error); + assert(error == NULL); + buildstate->usearch_index = NULL; + + fstat(index_file_fd, &index_file_stat); + result_buf = mmap(NULL, index_file_stat.st_size, PROT_READ, MAP_PRIVATE, index_file_fd, 0); + assert(result_buf != MAP_FAILED); + //****************************** mmap index to memory END ******************************// + //****************************** saving to WAL BEGIN ******************************// UpdateProgress(PROGRESS_CREATEIDX_PHASE, PROGRESS_HNSW_PHASE_LOAD); - StoreExternalIndex(index, buildstate->usearch_index, forkNum, result_buf, &opts, num_added_vectors); - + StoreExternalIndex(index, &metadata, forkNum, result_buf, &opts, num_added_vectors); //****************************** saving to WAL END ******************************// - usearch_free(buildstate->usearch_index, &error); - free(result_buf); - assert(error == NULL); - buildstate->usearch_index = NULL; + munmap_ret = munmap(result_buf, index_file_stat.st_size); + assert(munmap_ret == 0); + LDB_UNUSED(munmap_ret); + close(index_file_fd); + + if(tmp_index_file_path) { + // remove index file if it was not externally provided + unlink(tmp_index_file_path); + pfree(tmp_index_file_path); + } FreeBuildState(buildstate); } diff --git a/src/hnsw/external_index.c b/src/hnsw/external_index.c index 
33c0de8e8..85613b8d2 100644 --- a/src/hnsw/external_index.c +++ b/src/hnsw/external_index.c @@ -298,7 +298,7 @@ static void ContinueBlockMapGroupInitialization( } void StoreExternalIndexBlockMapGroup(Relation index, - usearch_index_t external_index, + usearch_metadata_t *metadata, HnswIndexHeaderPage *headerp, ForkNumber forkNum, char *data, @@ -321,8 +321,7 @@ void StoreExternalIndexBlockMapGroup(Relation index, BlockNumber *l_wal_retriever_block_numbers = palloc0(sizeof(BlockNumber) * number_of_blockmaps_in_group * HNSW_BLOCKMAP_BLOCKS_PER_PAGE); - HnswIndexTuple *bufferpage = palloc(BLCKSZ); - usearch_metadata_t metadata = usearch_metadata(external_index, NULL); + HnswIndexTuple *bufferpage = palloc(BLCKSZ); /* Add all the vectors to the WAL */ for(uint32 node_id = first_node_index; node_id < first_node_index + num_added_vectors;) { @@ -364,7 +363,7 @@ void StoreExternalIndexBlockMapGroup(Relation index, node = extract_node(data, *progress, dimension, - &metadata, + metadata, /*->>output*/ &node_size, &node_level); bufferpage->id = node_id; @@ -435,7 +434,7 @@ void StoreExternalIndexBlockMapGroup(Relation index, } void StoreExternalIndex(Relation index, - usearch_index_t external_index, + usearch_metadata_t *external_index_metadata, ForkNumber forkNum, char *data, usearch_init_options_t *opts, @@ -496,7 +495,7 @@ void StoreExternalIndex(Relation index, uint32 batch_size = HNSW_BLOCKMAP_BLOCKS_PER_PAGE; while(num_added_vectors_remaining > 0) { StoreExternalIndexBlockMapGroup(index, - external_index, + external_index_metadata, headerp, forkNum, data, diff --git a/src/hnsw/external_index.h b/src/hnsw/external_index.h index 8a35529ba..519383d82 100644 --- a/src/hnsw/external_index.h +++ b/src/hnsw/external_index.h @@ -123,7 +123,7 @@ typedef struct uint32 UsearchNodeBytes(usearch_metadata_t *metadata, int vector_bytes, int level); void StoreExternalIndex(Relation index, - usearch_index_t external_index, + usearch_metadata_t *external_index_metadata, ForkNumber 
forkNum, char *data, usearch_init_options_t *opts, From 0d14aa46b9943877b07d9e508cb637faa87bff6f Mon Sep 17 00:00:00 2001 From: Varik Matevosyan Date: Wed, 3 Jan 2024 01:24:29 +0400 Subject: [PATCH 11/13] Fix generic operator in invalid usage error message --- src/hooks/executor_start.c | 2 +- src/hooks/post_parse.c | 2 +- test/expected/hnsw_dist_func.out | 64 ++++++++++++++++---------------- test/expected/hnsw_todo.out | 2 +- test/expected/hnsw_vector.out | 2 +- 5 files changed, 36 insertions(+), 36 deletions(-) diff --git a/src/hooks/executor_start.c b/src/hooks/executor_start.c index edfbcadf8..fc65e9973 100644 --- a/src/hooks/executor_start.c +++ b/src/hooks/executor_start.c @@ -62,7 +62,7 @@ static void validate_operator_usage(Plan *plan, List *oidList) context.oidList = oidList; context.isIndexScan = false; if(operator_used_incorrectly_walker((Node *)plan, (void *)&context)) { - elog(ERROR, "Operator <-> can only be used inside of an index"); + elog(ERROR, "Operator <?> can only be used inside of an index"); } } diff --git a/src/hooks/post_parse.c b/src/hooks/post_parse.c index 339820a2d..7c27d1ce8 100644 --- a/src/hooks/post_parse.c +++ b/src/hooks/post_parse.c @@ -181,7 +181,7 @@ void post_parse_analyze_hook_with_operator_check(ParseState *pstate, if(is_operator_used(query_as_node, oidList)) { List *sort_group_refs = get_sort_group_refs(query_as_node); if(is_operator_used_incorrectly(query_as_node, oidList, sort_group_refs)) { - elog(ERROR, "Operator <-> is invalid outside of ORDER BY context"); + elog(ERROR, "Operator <?> is invalid outside of ORDER BY context"); } list_free(sort_group_refs); } diff --git a/test/expected/hnsw_dist_func.out b/test/expected/hnsw_dist_func.out index 04b0578d4..ee8d494cf 100644 --- a/test/expected/hnsw_dist_func.out +++ b/test/expected/hnsw_dist_func.out @@ -142,11 +142,11 @@ SELECT hamming_dist('{1,1}', '{0,1,0}'); ERROR: expected equally sized arrays but got arrays with dimensions 2 and 3 -- Expect errors due to improper use of the
operator outside of its supported context SELECT ARRAY[1,2,3] <?> ARRAY[3,2,1]; -ERROR: Operator <-> is invalid outside of ORDER BY context +ERROR: Operator <?> is invalid outside of ORDER BY context SELECT ROUND((v <?> ARRAY[0,1,0])::numeric, 2) FROM small_world_cos ORDER BY v <?> '{0,1,0}' LIMIT 7; -ERROR: Operator <-> is invalid outside of ORDER BY context +ERROR: Operator <?> is invalid outside of ORDER BY context SELECT ROUND((v <?> ARRAY[0,1,0])::numeric, 2) FROM small_world_ham ORDER BY v <?> '{0,1,0}' LIMIT 7; -ERROR: Operator <-> is invalid outside of ORDER BY context +ERROR: Operator <?> is invalid outside of ORDER BY context \set ON_ERROR_STOP on -- More robust distance operator tests CREATE TABLE test1 (id SERIAL, v REAL[]); @@ -169,63 +169,63 @@ SELECT 1 FROM test1 WHERE id = 0 + 1; \set ON_ERROR_STOP off -- Expect errors due to incorrect usage INSERT INTO test1 (v) VALUES (ARRAY['{1,2}'::REAL[] <?> '{4,2}'::REAL[], 0]); -ERROR: Operator <-> is invalid outside of ORDER BY context +ERROR: Operator <?> is invalid outside of ORDER BY context SELECT v <?> '{1,2}' FROM test1 ORDER BY v <?> '{1,3}'; -ERROR: Operator <-> is invalid outside of ORDER BY context +ERROR: Operator <?> is invalid outside of ORDER BY context SELECT v <?> '{1,2}' FROM test1; -ERROR: Operator <-> is invalid outside of ORDER BY context +ERROR: Operator <?> is invalid outside of ORDER BY context WITH temp AS (SELECT v <?> '{1,2}' FROM test1) SELECT 1 FROM temp; -ERROR: Operator <-> is invalid outside of ORDER BY context +ERROR: Operator <?> is invalid outside of ORDER BY context SELECT t.res FROM (SELECT v <?> '{1,2}' AS res FROM test1) t; -ERROR: Operator <-> is invalid outside of ORDER BY context +ERROR: Operator <?> is invalid outside of ORDER BY context SELECT (SELECT v <?> '{1,2}' FROM test1 LIMIT 1) FROM test1; -ERROR: Operator <-> is invalid outside of ORDER BY context +ERROR: Operator <?> is invalid outside of ORDER BY context SELECT COALESCE(v <?> '{1,2}', 0) FROM test1; -ERROR: Operator <-> is invalid outside of ORDER BY context +ERROR: Operator <?> is invalid
outside of ORDER BY context SELECT EXISTS (SELECT v <?> '{1,2}' FROM test1); -ERROR: Operator <-> is invalid outside of ORDER BY context +ERROR: Operator <?> is invalid outside of ORDER BY context SELECT test1.v <?> test2.v FROM test1 JOIN test2 USING (id); -ERROR: Operator <-> is invalid outside of ORDER BY context +ERROR: Operator <?> is invalid outside of ORDER BY context SELECT v <?> '{1,2}' FROM test1 UNION SELECT v <?> '{1,3}' FROM test1; -ERROR: Operator <-> is invalid outside of ORDER BY context +ERROR: Operator <?> is invalid outside of ORDER BY context (SELECT v <?> '{1,2}' FROM test1 WHERE id < 5) UNION (SELECT v <?> '{1,3}' FROM test1 WHERE id >= 5); -ERROR: Operator <-> is invalid outside of ORDER BY context +ERROR: Operator <?> is invalid outside of ORDER BY context SELECT MAX(v <?> '{1,2}') FROM test1; -ERROR: Operator <-> is invalid outside of ORDER BY context +ERROR: Operator <?> is invalid outside of ORDER BY context SELECT * FROM test1 JOIN test2 ON test1.v <?> test2.v < 0.5; -ERROR: Operator <-> is invalid outside of ORDER BY context +ERROR: Operator <?> is invalid outside of ORDER BY context SELECT test1.v FROM test1 JOIN test2 ON test1.v <?> '{1,2}' = test2.v <?> '{1,3}'; -ERROR: Operator <-> is invalid outside of ORDER BY context +ERROR: Operator <?> is invalid outside of ORDER BY context SELECT (v <?> '{1,2}') + (v <?> '{1,3}') FROM test1; -ERROR: Operator <-> is invalid outside of ORDER BY context +ERROR: Operator <?> is invalid outside of ORDER BY context SELECT CASE WHEN v <?> '{1,2}' > 1 THEN 'High' ELSE 'Low' END FROM test1; -ERROR: Operator <-> is invalid outside of ORDER BY context +ERROR: Operator <?> is invalid outside of ORDER BY context INSERT INTO test1 (v) VALUES ('{2,3}') RETURNING v <?> '{1,2}'; -ERROR: Operator <-> is invalid outside of ORDER BY context +ERROR: Operator <?> is invalid outside of ORDER BY context SELECT 1 FROM test1 GROUP BY v <?> '{1,3}'; -ERROR: Operator <-> is invalid outside of ORDER BY context +ERROR: Operator <?> is invalid outside of ORDER BY context SELECT 1 FROM test1 ORDER BY (('{1,2}'::real[] <?>
'{3,4}'::real[]) - 0); -ERROR: Operator <-> is invalid outside of ORDER BY context +ERROR: Operator <?> is invalid outside of ORDER BY context SELECT 1 FROM test1 ORDER BY '{1,2}'::REAL[] <?> '{3,4}'::REAL[]; -ERROR: Operator <-> is invalid outside of ORDER BY context +ERROR: Operator <?> is invalid outside of ORDER BY context SELECT 1 FROM test1 ORDER BY v <?> ARRAY[(SELECT '{1,4}'::REAL[] <?> '{4,2}'::REAL[]), 3]; -ERROR: Operator <-> is invalid outside of ORDER BY context +ERROR: Operator <?> is invalid outside of ORDER BY context -- Expect errors due to index not existing SELECT id FROM test1 ORDER BY v <?> '{1,2}'; -ERROR: Operator <-> can only be used inside of an index +ERROR: Operator <?> can only be used inside of an index SELECT 1 FROM test1 ORDER BY v <?> (SELECT '{1,3}'::real[]); -ERROR: Operator <-> can only be used inside of an index +ERROR: Operator <?> can only be used inside of an index SELECT t2_results.id FROM test1 t1 JOIN LATERAL (SELECT t2.id FROM test2 t2 ORDER BY t1.v <?> t2.v LIMIT 1) t2_results ON TRUE; -ERROR: Operator <-> can only be used inside of an index +ERROR: Operator <?> can only be used inside of an index WITH t AS (SELECT id FROM test1 ORDER BY v <?> '{1,2}' LIMIT 1) SELECT DISTINCT id FROM t; -ERROR: Operator <-> can only be used inside of an index +ERROR: Operator <?> can only be used inside of an index WITH t AS (SELECT id FROM test1 ORDER BY v <?> '{1,2}' LIMIT 1) SELECT id, COUNT(*) FROM t GROUP BY 1; -ERROR: Operator <-> can only be used inside of an index +ERROR: Operator <?> can only be used inside of an index WITH t AS (SELECT id FROM test1 ORDER BY v <?> '{1,2}') SELECT id FROM t UNION SELECT id FROM t; -ERROR: Operator <-> can only be used inside of an index +ERROR: Operator <?> can only be used inside of an index -- issue #227 SELECT * from test2 JOIN LATERAL (SELECT * FROM (SELECT id FROM test2 ORDER BY v <?> '{1,2}') as forall) haha on TRUE; -ERROR: Operator <-> can only be used inside of an index +ERROR: Operator <?> can only be used inside of an index -- more complex setup of the above
SELECT forall.id, nearest_per_id.* FROM (SELECT * FROM @@ -251,7 +251,7 @@ ORDER BY forall.id LIMIT 9; -ERROR: Operator <-> can only be used inside of an index +ERROR: Operator <?> can only be used inside of an index \set ON_ERROR_STOP on -- cross-lateral joins work as expected when appropriate index exists -- nearest element for each vector diff --git a/test/expected/hnsw_todo.out b/test/expected/hnsw_todo.out index 3fdba174f..d1b20fba4 100644 --- a/test/expected/hnsw_todo.out +++ b/test/expected/hnsw_todo.out @@ -140,4 +140,4 @@ INSERT INTO test_table VALUES (0), (1), (7); -- This currently results in an error about using the operator outside of index -- This case should be fixed SELECT id FROM test_table ORDER BY int_to_fixed_binary_real_array(id) <?> '{0,0,0}'::REAL[] LIMIT 2; -ERROR: Operator <-> can only be used inside of an index +ERROR: Operator <?> can only be used inside of an index diff --git a/test/expected/hnsw_vector.out b/test/expected/hnsw_vector.out index 83001adb7..e51167439 100644 --- a/test/expected/hnsw_vector.out +++ b/test/expected/hnsw_vector.out @@ -168,7 +168,7 @@ RESET client_min_messages; \set ON_ERROR_STOP off -- Expect error due to improper use of the operator outside of its supported context SELECT ARRAY[1,2,3] <?> ARRAY[3,2,1]; -ERROR: Operator <-> is invalid outside of ORDER BY context +ERROR: Operator <?> is invalid outside of ORDER BY context -- Expect error due to mismatching vector dimensions SELECT 1 FROM small_world ORDER BY v <?> '[0,1,0,1]' LIMIT 1; ERROR: Expected vector with dimension 3, got 4 From 9ac519e278f0c4f02244de413035a36eb794524d Mon Sep 17 00:00:00 2001 From: Varik Matevosyan Date: Sat, 6 Jan 2024 20:17:04 +0400 Subject: [PATCH 12/13] Save index file in postgres data dir (#256) --- src/hnsw/build.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/hnsw/build.c b/src/hnsw/build.c index f2480df4c..6b5c554c4 100644 --- a/src/hnsw/build.c +++ b/src/hnsw/build.c @@ -426,9 +426,9 @@ static void BuildIndex(
struct stat index_file_stat; char *result_buf = NULL; char *tmp_index_file_path = NULL; - const char *tmp_index_file_fmt_str = "/tmp/ldb-index-%d.bin"; - // size of static name + max digits of uint32 (Oid) 10 + 1 for nullbyte and - 2 for %d format specifier - const uint32 tmp_index_file_char_cnt = strlen(tmp_index_file_fmt_str) + 9; + const char *tmp_index_file_fmt_str = "%s/ldb-index-%d.bin"; + // parent_dir + max digits of uint32 (Oid) 10 + const uint32 tmp_index_file_char_cnt = MAXPGPATH + strlen(tmp_index_file_fmt_str) + 10; int index_file_fd; int munmap_ret; usearch_metadata_t metadata; @@ -514,10 +514,11 @@ static void BuildIndex( if(buildstate->index_file_path == NULL) { // Save index into temporary file // To later mmap it into memory - // Filename is /tmp/ldb-index-$relfilenode.bin // The file will be removed in the end tmp_index_file_path = palloc0(tmp_index_file_char_cnt); - snprintf(tmp_index_file_path, tmp_index_file_char_cnt, tmp_index_file_fmt_str, index->rd_rel->relfilenode); + // Create index file directory string: $pg_data_dir/ldb_indexes/index-$relfilenode.bin + snprintf( + tmp_index_file_path, tmp_index_file_char_cnt, tmp_index_file_fmt_str, DataDir, index->rd_rel->relfilenode); usearch_save(buildstate->usearch_index, tmp_index_file_path, NULL, &error); assert(error == NULL); index_file_fd = open(tmp_index_file_path, O_RDONLY); From 670c318f03aa925ecd99a3ad6248fc0900cf6223 Mon Sep 17 00:00:00 2001 From: Danyil Blyschak <41936813+therealdarkknight@users.noreply.github.com> Date: Sat, 13 Jan 2024 18:56:33 -0500 Subject: [PATCH 13/13] Add support for indexing unlogged tables (#253) * added support for unlogged tables and corresponding tests * clarified manual test instructions * added necessary crit section lines inside StoreExternalEmptyIndex to prevent db crashes in sanitizer tests * added include statement for CRIT_SECTION macros for earlier versions of pg * added test hnsw_logged_unlogged which tests changing a table from logged to unlogged * 
added dot out file for hnsw_logged_unlogged test * added manual test cases where we switch from logged/unlogged * added replica tests for unlogged tables * added unique-distanced vectors so that output of distance queries are forced to be unique in hnsw_logged_unlogged * finished making hnsw_logged_unlogged out results completely deterministic * fixed runner.c to reconnect to root db on every test case to allow running several tests after crashing the root db as part of a test (like replica_test_unlogged currently does) * replace killall with kill -9 pid AND finish root connection after each test --- ci/scripts/run-tests-linux.sh | 7 +- src/hnsw/build.c | 51 ++- src/hnsw/external_index.c | 79 ++++- src/hnsw/external_index.h | 1 + src/hnsw/extra_dirtied.c | 19 ++ src/hnsw/extra_dirtied.h | 1 + src/hnsw/insert.c | 14 +- test/c/replica_test_unlogged.c | 125 ++++++++ test/c/runner.c | 21 +- test/expected/hnsw_create_unlogged.out | 153 +++++++++ test/expected/hnsw_insert_unlogged.out | 153 +++++++++ test/expected/hnsw_logged_unlogged.out | 294 ++++++++++++++++++ test/schedule.txt | 2 +- test/sql/hnsw_create_unlogged.sql | 80 +++++ test/sql/hnsw_insert_unlogged.sql | 94 ++++++ test/sql/hnsw_logged_unlogged.sql | 138 ++++++++ test/sql/manual_tests/hnsw_unlogged_post.sql | 58 ++++ test/sql/manual_tests/hnsw_unlogged_pre.sql | 139 +++++++++ test/sql/utils/sift10k_array_unlogged.sql | 5 + test/sql/utils/sift1k_array_unlogged.sql | 6 + test/sql/utils/small_world_array_unlogged.sql | 15 + 21 files changed, 1424 insertions(+), 31 deletions(-) create mode 100644 test/c/replica_test_unlogged.c create mode 100644 test/expected/hnsw_create_unlogged.out create mode 100644 test/expected/hnsw_insert_unlogged.out create mode 100644 test/expected/hnsw_logged_unlogged.out create mode 100644 test/sql/hnsw_create_unlogged.sql create mode 100644 test/sql/hnsw_insert_unlogged.sql create mode 100644 test/sql/hnsw_logged_unlogged.sql create mode 100644
test/sql/manual_tests/hnsw_unlogged_post.sql create mode 100644 test/sql/manual_tests/hnsw_unlogged_pre.sql create mode 100644 test/sql/utils/sift10k_array_unlogged.sql create mode 100644 test/sql/utils/sift1k_array_unlogged.sql create mode 100644 test/sql/utils/small_world_array_unlogged.sql diff --git a/ci/scripts/run-tests-linux.sh b/ci/scripts/run-tests-linux.sh index 6da582f66..048fd503b 100755 --- a/ci/scripts/run-tests-linux.sh +++ b/ci/scripts/run-tests-linux.sh @@ -48,8 +48,11 @@ function run_db_tests(){ cd $WORKDIR/build && \ make test && \ make test-client && \ - run_pgvector_tests && \ - killall postgres && \ + run_pgvector_tests + pg_pid=$(fuser -a 5432/tcp 2>/dev/null | awk "{print $1}" | awk '{$1=$1};1') + if [[ ! -z "$pg_pid" ]]; then + kill -9 $pg_pid + fi gcovr -r $WORKDIR/src/ --object-directory $WORKDIR/build/ --xml /tmp/coverage.xml fi } diff --git a/src/hnsw/build.c b/src/hnsw/build.c index 6b5c554c4..ebc0a8c99 100644 --- a/src/hnsw/build.c +++ b/src/hnsw/build.c @@ -358,7 +358,7 @@ static void InitBuildState(HnswBuildState *buildstate, Relation heap, Relation i buildstate->index_file_path = ldb_HnswGetIndexFilePath(index); // If a dimension wasn't specified try to infer it - if(buildstate->dimensions < 1) { + if(heap != NULL && buildstate->dimensions < 1) { buildstate->dimensions = InferDimension(heap, indexInfo); } /* Require column to have dimensions to be indexed */ @@ -416,10 +416,9 @@ static void ScanTable(HnswBuildState *buildstate) } /* - * Build the index + * Build the index, writing to the main fork */ -static void BuildIndex( - Relation heap, Relation index, IndexInfo *indexInfo, HnswBuildState *buildstate, ForkNumber forkNum) +static void BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, HnswBuildState *buildstate) { usearch_error_t error = NULL; usearch_init_options_t opts; @@ -543,7 +542,7 @@ static void BuildIndex( //****************************** saving to WAL BEGIN ******************************// 
UpdateProgress(PROGRESS_CREATEIDX_PHASE, PROGRESS_HNSW_PHASE_LOAD); - StoreExternalIndex(index, &metadata, forkNum, result_buf, &opts, num_added_vectors); + StoreExternalIndex(index, &metadata, MAIN_FORKNUM, result_buf, &opts, num_added_vectors); //****************************** saving to WAL END ******************************// munmap_ret = munmap(result_buf, index_file_stat.st_size); @@ -560,6 +559,38 @@ static void BuildIndex( FreeBuildState(buildstate); } +/* + * Build an empty index, writing to the init fork + */ +static void BuildEmptyIndex(Relation index, IndexInfo *indexInfo, HnswBuildState *buildstate) +{ + usearch_error_t error = NULL; + usearch_init_options_t opts; + MemSet(&opts, 0, sizeof(opts)); + + InitBuildState(buildstate, NULL, index, indexInfo); + opts.dimensions = buildstate->dimensions; + PopulateUsearchOpts(index, &opts); + + buildstate->usearch_index = usearch_init(&opts, &error); + assert(error == NULL); + + buildstate->hnsw = NULL; + + char *result_buf = NULL; + usearch_save(buildstate->usearch_index, NULL, &result_buf, &error); + assert(error == NULL && result_buf != NULL); + + StoreExternalEmptyIndex(index, INIT_FORKNUM, result_buf, &opts); + + usearch_free(buildstate->usearch_index, &error); + free(result_buf); + assert(error == NULL); + buildstate->usearch_index = NULL; + + FreeBuildState(buildstate); +} + /* * Build the index for a logged table */ @@ -568,7 +599,7 @@ IndexBuildResult *ldb_ambuild(Relation heap, Relation index, IndexInfo *indexInf IndexBuildResult *result; HnswBuildState buildstate; - BuildIndex(heap, index, indexInfo, &buildstate, MAIN_FORKNUM); + BuildIndex(heap, index, indexInfo, &buildstate); result = (IndexBuildResult *)palloc(sizeof(IndexBuildResult)); result->heap_tuples = buildstate.reltuples; @@ -578,13 +609,13 @@ IndexBuildResult *ldb_ambuild(Relation heap, Relation index, IndexInfo *indexInf } /* - * Build the index for an unlogged table + * Build an empty index for an unlogged table */ void 
ldb_ambuildunlogged(Relation index) { - LDB_UNUSED(index); - // todo:: - elog(ERROR, "hnsw index on unlogged tables is currently not supported"); + HnswBuildState buildstate; + IndexInfo *indexInfo = BuildIndexInfo(index); + BuildEmptyIndex(index, indexInfo, &buildstate); } void ldb_reindex_external_index(Oid indrelid) diff --git a/src/hnsw/external_index.c b/src/hnsw/external_index.c index 85613b8d2..dc0317ac8 100644 --- a/src/hnsw/external_index.c +++ b/src/hnsw/external_index.c @@ -8,6 +8,7 @@ #include #include #include +#include // START_CRIT_SECTION, END_CRIT_SECTION #include // BLCKSZ #include // Buffer #include @@ -119,11 +120,13 @@ static void UpdateHeaderBlockMapGroupDesc( hdr_copy->blockmap_groups[ groupno ] = *desc; log_rec_ptr = GenericXLogFinish(state); - assert(log_rec_ptr != InvalidXLogRecPtr); - if(flush_log) { - LDB_FAILURE_POINT_CRASH_IF_ENABLED("just_before_wal_flush"); - XLogFlush(log_rec_ptr); - LDB_FAILURE_POINT_CRASH_IF_ENABLED("just_after_wal_flush"); + if(RelationNeedsWAL(index)) { + assert(log_rec_ptr != InvalidXLogRecPtr); + if(flush_log) { + LDB_FAILURE_POINT_CRASH_IF_ENABLED("just_before_wal_flush"); + XLogFlush(log_rec_ptr); + LDB_FAILURE_POINT_CRASH_IF_ENABLED("just_after_wal_flush"); + } } ReleaseBuffer(hdr_buf); } @@ -417,7 +420,9 @@ void StoreExternalIndexBlockMapGroup(Relation index, // When the blockmap page group was created, header block was updated accordingly in // ContinueBlockMapGroupInitialization call above. 
const BlockNumber blockmapno = blockmap_id + headerp->blockmap_groups[ blockmap_groupno ].first_block; - Buffer buf = ReadBufferExtended(index, MAIN_FORKNUM, blockmapno, RBM_NORMAL, NULL); + // todo:: should MAIN_FORKNUM be hardcoded here or use the forkNum parameter, from a code readability standpoint + // (other places in this file as well) + Buffer buf = ReadBufferExtended(index, MAIN_FORKNUM, blockmapno, RBM_NORMAL, NULL); LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); GenericXLogState *state = GenericXLogStart(index); @@ -433,6 +438,68 @@ void StoreExternalIndexBlockMapGroup(Relation index, } } +void StoreExternalEmptyIndex(Relation index, ForkNumber forkNum, char *data, usearch_init_options_t *opts) +{ + // this method is intended to store empty indexes for unlogged tables (ambuildempty method) and should hence be + // called with forkNum = INIT_FORKNUM + + Buffer header_buf = ReadBufferExtended(index, forkNum, P_NEW, RBM_NORMAL, NULL); + + // even when we are creating a new page, it must always be the first page we create + // and should therefore have BlockNumber 0 + assert(BufferGetBlockNumber(header_buf) == 0); + + LockBuffer(header_buf, BUFFER_LOCK_EXCLUSIVE); + + START_CRIT_SECTION(); + + Page header_page = BufferGetPage(header_buf); + + PageInit(header_page, BufferGetPageSize(header_buf), 0); + + HnswIndexHeaderPage *headerp = (HnswIndexHeaderPage *)PageGetContents(header_page); + + headerp->magicNumber = LDB_WAL_MAGIC_NUMBER; + headerp->version = LDB_WAL_VERSION_NUMBER; + headerp->vector_dim = opts->dimensions; + headerp->m = opts->connectivity; + headerp->ef_construction = opts->expansion_add; + headerp->ef = opts->expansion_search; + headerp->metric_kind = opts->metric_kind; + + headerp->num_vectors = 0; + headerp->blockmap_groups_nr = 0; + + for(uint32 i = 0; i < lengthof(headerp->blockmap_groups); ++i) { + headerp->blockmap_groups[ i ] = (HnswBlockMapGroupDesc){ + .first_block = InvalidBlockNumber, + .blockmaps_initialized = 0, + }; + } + + 
headerp->last_data_block = InvalidBlockNumber; + + memcpy(headerp->usearch_header, data, USEARCH_HEADER_SIZE); + ((PageHeader)header_page)->pd_lower = ((char *)headerp + sizeof(HnswIndexHeaderPage)) - (char *)header_page; + + MarkBufferDirty(header_buf); + + // Write a WAL record containing a full image of the page. Even though this is an unlogged table that doesn't use + // WAL, this line appears to flush changes to disc immediately (and not waiting after the first checkpoint). This is + // important because this empty index will live in the init fork, where it will be used to reset the unlogged index + // after a crash, and so we need this written to disc in order to have proper crash recovery functionality available + // immediately. Otherwise, if a crash occurs before the first postgres checkpoint, postgres can't read the init fork + // from disc and we will have a corrupted index when postgres attempts recovery. This is also what nbtree access + // method's implementation does for empty unlogged indexes (ambuildempty implementation). 
+ // NOTE: we MUST have this be inside a crit section, or else an assertion inside this method will fail and crash the + // db + log_newpage_buffer(header_buf, false); + + END_CRIT_SECTION(); + + UnlockReleaseBuffer(header_buf); +} + void StoreExternalIndex(Relation index, usearch_metadata_t *external_index_metadata, ForkNumber forkNum, diff --git a/src/hnsw/external_index.h b/src/hnsw/external_index.h index 519383d82..3c78d49c3 100644 --- a/src/hnsw/external_index.h +++ b/src/hnsw/external_index.h @@ -122,6 +122,7 @@ typedef struct } HnswInsertState; uint32 UsearchNodeBytes(usearch_metadata_t *metadata, int vector_bytes, int level); +void StoreExternalEmptyIndex(Relation index, ForkNumber forkNum, char *data, usearch_init_options_t *opts); void StoreExternalIndex(Relation index, usearch_metadata_t *external_index_metadata, ForkNumber forkNum, diff --git a/src/hnsw/extra_dirtied.c b/src/hnsw/extra_dirtied.c index b371303f5..7d8ca9d05 100644 --- a/src/hnsw/extra_dirtied.c +++ b/src/hnsw/extra_dirtied.c @@ -84,6 +84,25 @@ void extra_dirtied_release_all(ExtraDirtiedBufs *ed) ed->extra_dirtied_size = 0; } +// Like extra_dirtied_release_all but does not perform a InvalidXLogRecPtr check. +// Used for inserts on unlogged tables, which do not write to WAL +void extra_dirtied_release_all_no_xlog_check(ExtraDirtiedBufs *ed) +{ + for(int i = 0; i < ed->extra_dirtied_state_size; ++i) { + GenericXLogFinish(ed->extra_dirtied_state[ i ]); + } + + for(int i = 0; i < ed->extra_dirtied_size; i++) { + assert(BufferIsValid(ed->extra_dirtied_buf[ i ])); + // header is not considered extra. 
we know we should not have dirtied it + // sanity check callees that manipulate extra_dirtied did not violate this + assert(ed->extra_dirtied_blockno[ i ] != 0); + // MarkBufferDirty() had been called by GenericXLogFinish() already + UnlockReleaseBuffer(ed->extra_dirtied_buf[ i ]); + } + ed->extra_dirtied_size = 0; +} + void extra_dirtied_free(ExtraDirtiedBufs *ed) { if(ed->extra_dirtied_size != 0) { diff --git a/src/hnsw/extra_dirtied.h b/src/hnsw/extra_dirtied.h index 210001144..491207cae 100644 --- a/src/hnsw/extra_dirtied.h +++ b/src/hnsw/extra_dirtied.h @@ -33,6 +33,7 @@ void extra_dirtied_add_wal_read_buffer( ExtraDirtiedBufs* ed, Relation index, ForkNumber forkNum, BlockNumber blockno, Buffer* buf, Page* page); Page extra_dirtied_get(ExtraDirtiedBufs* ed, BlockNumber blockno, Buffer* out_buf); void extra_dirtied_release_all(ExtraDirtiedBufs* ed); +void extra_dirtied_release_all_no_xlog_check(ExtraDirtiedBufs* ed); void extra_dirtied_free(ExtraDirtiedBufs* ed); #endif // LDB_HNSW_EXTRA_DIRTIED_H diff --git a/src/hnsw/insert.c b/src/hnsw/insert.c index 2070bd3f1..6e5e090cd 100644 --- a/src/hnsw/insert.c +++ b/src/hnsw/insert.c @@ -72,6 +72,7 @@ bool ldb_aminsert(Relation index, HnswIndexTuple *new_tuple; usearch_init_options_t opts = {0}; LDB_UNUSED(heap); + LDB_UNUSED(indexInfo); #if PG_VERSION_NUM >= 140000 LDB_UNUSED(indexUnchanged); #endif @@ -109,7 +110,7 @@ bool ldb_aminsert(Relation index, hdr = (HnswIndexHeaderPage *)PageGetContents(hdr_page); assert(hdr->magicNumber == LDB_WAL_MAGIC_NUMBER); - opts.dimensions = GetHnswIndexDimensions(index, indexInfo); + opts.dimensions = hdr->vector_dim; CheckHnswIndexDimensions(index, values[ 0 ], opts.dimensions); PopulateUsearchOpts(index, &opts); opts.retriever_ctx = ldb_wal_retriever_area_init(index, hdr); @@ -182,16 +183,23 @@ bool ldb_aminsert(Relation index, ldb_wal_retriever_area_reset(insertstate->retriever_ctx, hdr); + int needs_wal = RelationNeedsWAL(index); // we only release the header buffer AFTER
inserting is finished to make sure nobody else changes the block // structure. todo:: critical section here can definitely be shortened { // GenericXLogFinish also calls MarkBufferDirty(buf) XLogRecPtr ptr = GenericXLogFinish(state); - assert(ptr != InvalidXLogRecPtr); + if(needs_wal) { + assert(ptr != InvalidXLogRecPtr); + } LDB_UNUSED(ptr); } - extra_dirtied_release_all(insertstate->retriever_ctx->extra_dirted); + if(needs_wal) { + extra_dirtied_release_all(insertstate->retriever_ctx->extra_dirted); + } else { + extra_dirtied_release_all_no_xlog_check(insertstate->retriever_ctx->extra_dirted); + } usearch_free(insertstate->uidx, &error); if(error != NULL) { diff --git a/test/c/replica_test_unlogged.c b/test/c/replica_test_unlogged.c new file mode 100644 index 000000000..c1deb8762 --- /dev/null +++ b/test/c/replica_test_unlogged.c @@ -0,0 +1,125 @@ +#include +#include +#include + +#include "runner.h" + +int replica_test_unlogged(TestCaseState* state) +{ + /* + Test Outline + ============= + 1. Create unlogged table and index on it (and insert data) + 2. Make table logged + 3. Insert data on master + 4. 
Crash and restart slave and call validate_index on it + */ + + PGresult* res; + + // Create unlogged table, index, and insert data + res = PQexec(state->conn, + "DROP TABLE IF EXISTS small_world;" + "CREATE UNLOGGED TABLE small_world (id SERIAL PRIMARY KEY, v real[]);" + "CREATE INDEX ON small_world USING hnsw (v) WITH (dim=3);" + "INSERT INTO small_world (v) VALUES (ARRAY[0,0,1]), (ARRAY[0,1,0]), (ARRAY[1,0,0]);" + "CHECKPOINT;"); + + if(PQresultStatus(res) != PGRES_COMMAND_OK) { + fprintf(stderr, "Failed to prepare unlogged table, create index, and insert data on it: %s\n", PQerrorMessage(state->conn)); + PQclear(res); + return 1; + } + + PQclear(res); + + // Validate index on master + res = PQexec(state->conn, "SELECT _lantern_internal.validate_index('small_world_v_idx', false);"); + + if(PQresultStatus(res) != PGRES_TUPLES_OK) { + fprintf(stderr, "Failed to validate index on master: %s\n", PQerrorMessage(state->conn)); + PQclear(res); + return 1; + } + + PQclear(res); + + // Alter table to be logged + res = PQexec(state->conn, + "ALTER TABLE small_world SET LOGGED;"); + + if(PQresultStatus(res) != PGRES_COMMAND_OK) { + fprintf(stderr, "Failed to alter unlogged table to logged: %s\n", PQerrorMessage(state->conn)); + PQclear(res); + return 1; + } + + PQclear(res); + + // Insert some more data + res = PQexec(state->conn, + "INSERT INTO small_world (v) VALUES (ARRAY[1,2,3])"); + + if(PQresultStatus(res) != PGRES_COMMAND_OK) { + fprintf(stderr, "Failed to insert more data into the now logged table: %s\n", PQerrorMessage(state->conn)); + PQclear(res); + return 1; + } + + PQclear(res); + + // Validate index on master after changing table to be logged and inserting data + res = PQexec(state->conn, "SELECT _lantern_internal.validate_index('small_world_v_idx', false);"); + + if(PQresultStatus(res) != PGRES_TUPLES_OK) { + fprintf(stderr, "Failed to validate index on master: %s\n", PQerrorMessage(state->conn)); + PQclear(res); + return 1; + } + + PQclear(res); + + 
sleep(2); // wait for replica to sync + + // Validate index on replica + res = PQexec(state->replica_conn, "SELECT _lantern_internal.validate_index('small_world_v_idx', false);"); + + if(PQresultStatus(res) != PGRES_TUPLES_OK) { + fprintf(stderr, "Failed to validate index on replica: %s\n", PQerrorMessage(state->replica_conn)); + PQclear(res); + return 1; + } + + PQclear(res); + + // Test query on replica + res = PQexec(state->replica_conn, "SELECT v <-> '{1,1,1}' FROM small_world ORDER BY v <-> '{1,1,1}' LIMIT 10;"); + + if(PQresultStatus(res) != PGRES_TUPLES_OK) { + fprintf(stderr, "Failed to query index on replica: %s\n", PQerrorMessage(state->conn)); + PQclear(res); + return 1; + } + + PQclear(res); + + // Crash replica: + system("bash -c '. ../ci/scripts/bitnami-utils.sh && crash_and_restart_postgres_replica'"); + state->replica_conn = connect_database( + state->DB_HOST, state->REPLICA_PORT, state->DB_USER, state->DB_PASSWORD, state->TEST_DB_NAME); + + // Validate index on replica after crash + res = PQexec(state->replica_conn, "SELECT _lantern_internal.validate_index('small_world_v_idx', true);"); + + if(PQresultStatus(res) != PGRES_TUPLES_OK) { + fprintf(stderr, "Failed to validate index on replica after restart: %s\n", PQerrorMessage(state->replica_conn)); + // Tail the log file to see crash error if any + system("tail /tmp/postgres-slave-conf/pg.log 2>/dev/null || true"); + PQclear(res); + return 1; + } + + PQclear(res); + + return 0; +} diff --git a/test/c/runner.c b/test/c/runner.c index 34095e75a..db3f3ad49 100644 --- a/test/c/runner.c +++ b/test/c/runner.c @@ -9,6 +9,7 @@ // Include your test files here #include "replica_test_index.c" +#include "replica_test_unlogged.c" #include "test_op_rewrite.c" // =========================== @@ -98,7 +99,8 @@ int main() struct TestCase test_cases[] = { // Add new test files here to be run {.name = "test_op_rewrite", .func = (TestCaseFunction)test_op_rewrite}, - {.name = "replica_test_index", .func = 
(TestCaseFunction)replica_test_index} + {.name = "replica_test_index", .func = (TestCaseFunction)replica_test_index}, + {.name = "replica_test_unlogged", .func = (TestCaseFunction)replica_test_unlogged} // ================================ }; @@ -113,12 +115,6 @@ int main() const char *ROOT_DB_NAME = "postgres"; PGconn *root_conn = NULL; - root_conn = connect_database(DB_HOST, DB_PORT, DB_USER, DB_PASSWORD, ROOT_DB_NAME); - - if(root_conn == NULL) { - return 1; - } - for(i = 0; i < sizeof(test_cases) / sizeof(struct TestCase); i++) { current_case = test_cases[ i ]; current_case_state.REPLICA_PORT = REPLICA_PORT; @@ -128,8 +124,15 @@ int main() current_case_state.DB_USER = DB_USER; current_case_state.TEST_DB_NAME = TEST_DB_NAME; + root_conn = connect_database(DB_HOST, DB_PORT, DB_USER, DB_PASSWORD, ROOT_DB_NAME); + printf("[+] Running test case '%s'...\n", current_case.name); + if(root_conn == NULL) { + fprintf(stderr, "[X] Can not connect to root database on port '%s'\n", DB_PORT); + return 1; + } + // Create test database if(recreate_database(root_conn, TEST_DB_NAME)) { fprintf(stderr, "[X] Failed to recreate test database\n"); @@ -144,7 +147,7 @@ int main() continue; } // Wait for replica to sync with master or test db will not exist - sleep(3); + sleep(7); current_case_state.replica_conn = connect_database(DB_HOST, REPLICA_PORT, DB_USER, DB_PASSWORD, TEST_DB_NAME); if(current_case_state.replica_conn == NULL) { @@ -183,13 +186,13 @@ int main() // Close test connection PQfinish(current_case_state.conn); + PQfinish(root_conn); if(ENABLE_REPLICA) { PQfinish(current_case_state.replica_conn); } printf("[+] Test case '%s' passed\n", current_case.name); } - PQfinish(root_conn); printf("[+] All tests passed\n"); return 0; } diff --git a/test/expected/hnsw_create_unlogged.out b/test/expected/hnsw_create_unlogged.out new file mode 100644 index 000000000..c2f95e6b4 --- /dev/null +++ b/test/expected/hnsw_create_unlogged.out @@ -0,0 +1,153 @@ 
+------------------------------------------------------------------------------ +-- Test HNSW index creation +------------------------------------------------------------------------------ +-- Validate that index creation works with a small number of vectors +\ir utils/small_world_array_unlogged.sql +CREATE UNLOGGED TABLE small_world ( + id VARCHAR(3), + b BOOLEAN, + v REAL[3] +); +INSERT INTO small_world (id, b, v) VALUES + ('000', TRUE, '{0,0,0}'), + ('001', TRUE, '{0,0,1}'), + ('010', FALSE, '{0,1,0}'), + ('011', TRUE, '{0,1,1}'), + ('100', FALSE, '{1,0,0}'), + ('101', FALSE, '{1,0,1}'), + ('110', FALSE, '{1,1,0}'), + ('111', TRUE, '{1,1,1}'); +\ir utils/sift1k_array_unlogged.sql +CREATE UNLOGGED TABLE IF NOT EXISTS sift_base1k ( + id SERIAL, + v REAL[] +); +COPY sift_base1k (v) FROM '/tmp/lantern/vector_datasets/sift_base1k_arrays.csv' WITH csv; +-- Validate that creating a secondary index works +CREATE INDEX ON sift_base1k USING hnsw (v) WITH (dim=128, M=4); +INFO: done init usearch index +INFO: inserted 1000 elements +INFO: done saving 1000 vectors +SELECT * FROM ldb_get_indexes('sift_base1k'); + indexname | size | indexdef | total_index_size +-------------------+--------+---------------------------------------------------------------------------------------------+------------------ + sift_base1k_v_idx | 632 kB | CREATE INDEX sift_base1k_v_idx ON public.sift_base1k USING hnsw (v) WITH (dim='128', m='4') | 632 kB +(1 row) + +SELECT _lantern_internal.validate_index('sift_base1k_v_idx', false); +INFO: validate_index() start for sift_base1k_v_idx +INFO: validate_index() done, no issues found. 
+ validate_index +---------------- + +(1 row) + +-- Validate that index creation works with a larger number of vectors +\ir utils/sift10k_array_unlogged.sql +CREATE UNLOGGED TABLE IF NOT EXISTS sift_base10k ( + id SERIAL PRIMARY KEY, + v REAL[128] +); +\copy sift_base10k (v) FROM '/tmp/lantern/vector_datasets/siftsmall_base_arrays.csv' with csv; +SET lantern.pgvector_compat=FALSE; +CREATE INDEX hnsw_idx ON sift_base10k USING hnsw (v dist_l2sq_ops) WITH (M=2, ef_construction=10, ef=4, dim=128); +INFO: done init usearch index +INFO: inserted 10000 elements +INFO: done saving 10000 vectors +SELECT v AS v4444 FROM sift_base10k WHERE id = 4444 \gset +EXPLAIN (COSTS FALSE) SELECT * FROM sift_base10k order by v :'v4444' LIMIT 10; + QUERY PLAN +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Limit + -> Index Scan using hnsw_idx on sift_base10k + Order By: (v '{55,61,11,4,5,2,13,24,65,49,13,9,23,37,94,38,54,11,14,14,40,31,50,44,53,4,0,0,27,17,8,34,12,10,4,4,22,52,68,53,9,2,0,0,2,116,119,64,119,2,0,0,2,30,119,119,116,5,0,8,47,9,5,60,7,7,10,23,56,50,23,5,28,68,6,18,24,65,50,9,119,75,3,0,1,8,12,85,119,11,4,6,8,9,5,74,25,11,8,20,18,12,2,21,11,90,25,32,33,15,2,9,84,67,8,4,22,31,11,33,119,30,3,6,0,0,0,26}'::real[]) +(3 rows) + +SELECT _lantern_internal.validate_index('hnsw_idx', false); +INFO: validate_index() start for hnsw_idx +INFO: validate_index() done, no issues found. 
+ validate_index +---------------- + +(1 row) + +--- Validate that M values inside the allowed range [2, 128] do not throw an error +CREATE INDEX ON small_world USING hnsw (v) WITH (M=2); +INFO: done init usearch index +INFO: inserted 8 elements +INFO: done saving 8 vectors +CREATE INDEX ON small_world USING hnsw (v) WITH (M=128); +INFO: done init usearch index +INFO: inserted 8 elements +INFO: done saving 8 vectors +---- Validate that M values outside the allowed range [2, 128] throw an error +\set ON_ERROR_STOP off +CREATE INDEX ON small_world USING hnsw (v) WITH (M=1); +ERROR: value 1 out of bounds for option "m" +CREATE INDEX ON small_world USING hnsw (v) WITH (M=129); +ERROR: value 129 out of bounds for option "m" +\set ON_ERROR_STOP on +-- Validate index dimension inference +CREATE UNLOGGED TABLE small_world4 ( + id varchar(3), + vector real[] +); +-- If the first row is NULL we do not infer a dimension +\set ON_ERROR_STOP off +CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); +ERROR: column does not have dimensions, please specify one +begin; +INSERT INTO small_world4 (id, vector) VALUES +('000', NULL), +('001', '{1,0,0,1}'); +CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); +ERROR: column does not have dimensions, please specify one +rollback; +\set ON_ERROR_STOP on +INSERT INTO small_world4 (id, vector) VALUES +('000', '{1,0,0,0}'), +('001', '{1,0,0,1}'), +('010', '{1,0,1,0}'), +('011', '{1,0,1,1}'), +('100', '{1,1,0,0}'), +('101', '{1,1,0,1}'), +('110', '{1,1,1,0}'), +('111', '{1,1,1,1}'); +CREATE INDEX small_world4_hnsw_idx ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); +INFO: done init usearch index +INFO: inserted 8 elements +INFO: done saving 8 vectors +SELECT * FROM ldb_get_indexes('small_world4'); + indexname | size | indexdef | total_index_size 
+-----------------------+-------+---------------------------------------------------------------------------------------------------------------------------+------------------ + small_world4_hnsw_idx | 24 kB | CREATE INDEX small_world4_hnsw_idx ON public.small_world4 USING hnsw (vector) WITH (m='14', ef='22', ef_construction='2') | 24 kB +(1 row) + +-- the index will not allow changing the dimension of a vector element +\set ON_ERROR_STOP off +UPDATE small_world4 SET vector = '{0,0,0}' WHERE id = '000'; +ERROR: Wrong number of dimensions: 3 instead of 4 expected +UPDATE small_world4 SET vector = '{0,0,0}' WHERE id = '001'; +ERROR: Wrong number of dimensions: 3 instead of 4 expected +\set ON_ERROR_STOP on +INSERT INTO small_world4 (id, vector) VALUES +('000', '{1,0,0,0}'), +('001', '{1,0,0,1}'), +('010', '{1,0,1,0}'); +SELECT _lantern_internal.validate_index('small_world4_hnsw_idx', false); +INFO: validate_index() start for small_world4_hnsw_idx +INFO: validate_index() done, no issues found. 
+ validate_index +---------------- + +(1 row) + +-- without the index, I can change the dimension of a vector element +DROP INDEX small_world4_hnsw_idx; +UPDATE small_world4 SET vector = '{0,0,0}' WHERE id = '001'; +-- but then, I cannot create the same dimension-inferred index +\set ON_ERROR_STOP off +CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); +INFO: done init usearch index +ERROR: Wrong number of dimensions: 3 instead of 4 expected +\set ON_ERROR_STOP on diff --git a/test/expected/hnsw_insert_unlogged.out b/test/expected/hnsw_insert_unlogged.out new file mode 100644 index 000000000..1c692dd9f --- /dev/null +++ b/test/expected/hnsw_insert_unlogged.out @@ -0,0 +1,153 @@ +--------------------------------------------------------------------- +-- Test HNSW index inserts on empty table +--------------------------------------------------------------------- +-- set an artificially low work_mem to make sure work_mem exceeded warnings are printed +set work_mem = '64kB'; +-- We do not actually print the warnings generated for exceeding work_mem because the work_mem +-- check does not work for postgres 13 and lower.So, if we printed the warnings, we would get a regression +-- failure in older postgres versions. We still reduce workmem to exercise relevant codepaths for coverage +set client_min_messages = 'ERROR'; +CREATE UNLOGGED TABLE small_world ( + id SERIAL PRIMARY KEY, + v REAL[2] -- this demonstates that postgres actually does not enforce real[] length as we actually insert vectors of length 3 +); +CREATE UNLOGGED TABLE small_world_int ( + id SERIAL PRIMARY KEY, + v INTEGER[] +); +CREATE INDEX ON small_world USING hnsw (v) WITH (dim=3); +INFO: done init usearch index +INFO: inserted 0 elements +INFO: done saving 0 vectors +SELECT _lantern_internal.validate_index('small_world_v_idx', false); +INFO: validate_index() start for small_world_v_idx +INFO: validate_index() done, no issues found. 
+ validate_index +---------------- + +(1 row) + +-- Insert rows with valid vector data +INSERT INTO small_world (v) VALUES ('{0,0,1}'), ('{0,1,0}'); +INSERT INTO small_world (v) VALUES (NULL); +-- Attempt to insert a row with an incorrect vector length +\set ON_ERROR_STOP off +-- Cannot create an hnsw index with implicit typecasts (trying to cast integer[] to real[], in this case) +CREATE INDEX ON small_world_int USING hnsw (v dist_l2sq_ops) WITH (dim=3); +ERROR: operator class "dist_l2sq_ops" does not accept data type integer[] +INSERT INTO small_world (v) VALUES ('{1,1,1,1}'); +ERROR: Wrong number of dimensions: 4 instead of 3 expected +\set ON_ERROR_STOP on +DROP TABLE small_world; +-- set work_mem to a value that is enough for the tests +set client_min_messages = 'WARNING'; +set work_mem = '10MB'; +--------------------------------------------------------------------- +-- Test HNSW index inserts on non-empty table +--------------------------------------------------------------------- +\ir utils/small_world_array_unlogged.sql +CREATE UNLOGGED TABLE small_world ( + id VARCHAR(3), + b BOOLEAN, + v REAL[3] +); +INSERT INTO small_world (id, b, v) VALUES + ('000', TRUE, '{0,0,0}'), + ('001', TRUE, '{0,0,1}'), + ('010', FALSE, '{0,1,0}'), + ('011', TRUE, '{0,1,1}'), + ('100', FALSE, '{1,0,0}'), + ('101', FALSE, '{1,0,1}'), + ('110', FALSE, '{1,1,0}'), + ('111', TRUE, '{1,1,1}'); +CREATE INDEX ON small_world USING hnsw (v) WITH (dim=3); +INFO: done init usearch index +INFO: inserted 8 elements +INFO: done saving 8 vectors +SET enable_seqscan = false; +SET lantern.pgvector_compat = false; +-- Inserting vectors of the same dimension and nulls should work +INSERT INTO small_world (v) VALUES ('{1,1,2}'); +INSERT INTO small_world (v) VALUES (NULL); +-- Inserting vectors of different dimension should fail +\set ON_ERROR_STOP off +INSERT INTO small_world (v) VALUES ('{4,4,4,4}'); +ERROR: Wrong number of dimensions: 4 instead of 3 expected +\set ON_ERROR_STOP on +-- Verify that 
the index works with the inserted vectors +SELECT + ROUND(l2sq_dist(v, '{0,0,0}')::numeric, 2) +FROM + small_world +ORDER BY + v '{0,0,0}'; + round +------- + 0.00 + 1.00 + 1.00 + 1.00 + 2.00 + 2.00 + 2.00 + 3.00 + 6.00 +(9 rows) + +-- Ensure the index size remains consistent after inserts +SELECT * from ldb_get_indexes('small_world'); + indexname | size | indexdef | total_index_size +-------------------+-------+------------------------------------------------------------------------------------+------------------ + small_world_v_idx | 24 kB | CREATE INDEX small_world_v_idx ON public.small_world USING hnsw (v) WITH (dim='3') | 24 kB +(1 row) + +-- Ensure the query plan remains consistent after inserts +EXPLAIN (COSTS FALSE) +SELECT + ROUND(l2sq_dist(v, '{0,0,0}')::numeric, 2) +FROM + small_world +ORDER BY + v '{0,0,0}' +LIMIT 10; + QUERY PLAN +--------------------------------------------------------- + Limit + -> Index Scan using small_world_v_idx on small_world + Order By: (v '{0,0,0}'::real[]) +(3 rows) + +SELECT _lantern_internal.validate_index('small_world_v_idx', false); +INFO: validate_index() start for small_world_v_idx +INFO: validate_index() done, no issues found. 
+ validate_index +---------------- + +(1 row) + +-- Test the index with a larger number of vectors +CREATE UNLOGGED TABLE sift_base10k ( + id SERIAL PRIMARY KEY, + v REAL[128] +); +CREATE INDEX hnsw_idx ON sift_base10k USING hnsw (v dist_l2sq_ops) WITH (M=2, ef_construction=10, ef=4, dim=128); +INFO: done init usearch index +INFO: inserted 0 elements +INFO: done saving 0 vectors +\COPY sift_base10k (v) FROM '/tmp/lantern/vector_datasets/siftsmall_base_arrays.csv' WITH CSV; +SELECT v AS v4444 FROM sift_base10k WHERE id = 4444 \gset +EXPLAIN (COSTS FALSE) SELECT * FROM sift_base10k order by v :'v4444'; + QUERY PLAN +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Index Scan using hnsw_idx on sift_base10k + Order By: (v '{55,61,11,4,5,2,13,24,65,49,13,9,23,37,94,38,54,11,14,14,40,31,50,44,53,4,0,0,27,17,8,34,12,10,4,4,22,52,68,53,9,2,0,0,2,116,119,64,119,2,0,0,2,30,119,119,116,5,0,8,47,9,5,60,7,7,10,23,56,50,23,5,28,68,6,18,24,65,50,9,119,75,3,0,1,8,12,85,119,11,4,6,8,9,5,74,25,11,8,20,18,12,2,21,11,90,25,32,33,15,2,9,84,67,8,4,22,31,11,33,119,30,3,6,0,0,0,26}'::real[]) +(2 rows) + +SELECT _lantern_internal.validate_index('hnsw_idx', false); +INFO: validate_index() start for hnsw_idx +INFO: validate_index() done, no issues found. 
+ validate_index +---------------- + +(1 row) + diff --git a/test/expected/hnsw_logged_unlogged.out b/test/expected/hnsw_logged_unlogged.out new file mode 100644 index 000000000..4a88ca868 --- /dev/null +++ b/test/expected/hnsw_logged_unlogged.out @@ -0,0 +1,294 @@ +-- Test changing tables from logged to unlogged, and from unlogged to logged +-- -------------------------- +-- Start with logged table +-- -------------------------- +CREATE TABLE small_world ( + id varchar(3), + vector real[] +); +-- Insert (we insert data such that each vector has a unique distance from (0,0,0) +INSERT INTO small_world (id, vector) VALUES +('000', '{1,0,0,0}'), +('001', '{1,0,0,1}'), +('010', '{1,1,1,0}'), +('011', '{1,1,1,1}'), +('100', '{2,1,0,0}'), +('101', '{1,2,0,1}'), +('110', '{1,2,1,1}'), +('111', '{2,2,2,0}'); +-- Create an index +CREATE INDEX small_world_idx ON small_world USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); +INFO: done init usearch index +INFO: inserted 8 elements +INFO: done saving 8 vectors +-- Validate index +SELECT _lantern_internal.validate_index('small_world_idx', false); +INFO: validate_index() start for small_world_idx +INFO: validate_index() done, no issues found. 
+ validate_index +---------------- + +(1 row) + +-- Query +SET enable_seqscan = false; +SELECT id, l2sq_dist(vector, '{0, 0, 0, 0}'), vector FROM small_world ORDER BY vector <-> ARRAY[0, 0, 0, 0] LIMIT 10; + id | l2sq_dist | vector +-----+-----------+----------- + 000 | 1 | {1,0,0,0} + 001 | 2 | {1,0,0,1} + 010 | 3 | {1,1,1,0} + 011 | 4 | {1,1,1,1} + 100 | 5 | {2,1,0,0} + 101 | 6 | {1,2,0,1} + 110 | 7 | {1,2,1,1} + 111 | 12 | {2,2,2,0} +(8 rows) + +-- Switch table to be unlogged +ALTER TABLE small_world SET UNLOGGED; +INFO: done init usearch index +INFO: inserted 8 elements +INFO: done saving 8 vectors +-- Create a new index +CREATE INDEX small_world_idx2 ON small_world USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); +INFO: done init usearch index +INFO: inserted 8 elements +INFO: done saving 8 vectors +-- Validate indexes +SELECT _lantern_internal.validate_index('small_world_idx', false); +INFO: validate_index() start for small_world_idx +INFO: validate_index() done, no issues found. + validate_index +---------------- + +(1 row) + +SELECT _lantern_internal.validate_index('small_world_idx2', false); +INFO: validate_index() start for small_world_idx2 +INFO: validate_index() done, no issues found. 
+ validate_index +---------------- + +(1 row) + +-- Insert +INSERT INTO small_world (id, vector) VALUES ('002', '{0,3,1,1}'); +-- Query +SELECT id, l2sq_dist(vector, '{0, 0, 0, 0}'), vector FROM small_world ORDER BY vector <-> ARRAY[0, 0, 0, 0] LIMIT 10; + id | l2sq_dist | vector +-----+-----------+----------- + 000 | 1 | {1,0,0,0} + 001 | 2 | {1,0,0,1} + 010 | 3 | {1,1,1,0} + 011 | 4 | {1,1,1,1} + 100 | 5 | {2,1,0,0} + 101 | 6 | {1,2,0,1} + 110 | 7 | {1,2,1,1} + 002 | 11 | {0,3,1,1} + 111 | 12 | {2,2,2,0} +(9 rows) + +-- Switch table to be logged again +ALTER TABLE small_world SET LOGGED; +INFO: done init usearch index +INFO: inserted 9 elements +INFO: done saving 9 vectors +INFO: done init usearch index +INFO: inserted 9 elements +INFO: done saving 9 vectors +-- Create a new index +CREATE INDEX small_world_idx3 ON small_world USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); +INFO: done init usearch index +INFO: inserted 9 elements +INFO: done saving 9 vectors +-- Validate indexes +SELECT _lantern_internal.validate_index('small_world_idx', false); +INFO: validate_index() start for small_world_idx +INFO: validate_index() done, no issues found. + validate_index +---------------- + +(1 row) + +SELECT _lantern_internal.validate_index('small_world_idx2', false); +INFO: validate_index() start for small_world_idx2 +INFO: validate_index() done, no issues found. + validate_index +---------------- + +(1 row) + +SELECT _lantern_internal.validate_index('small_world_idx3', false); +INFO: validate_index() start for small_world_idx3 +INFO: validate_index() done, no issues found. 
+ validate_index +---------------- + +(1 row) + +-- Insert +INSERT INTO small_world (id, vector) VALUES ('020', '{0,0,4,0}'); +-- Query +SELECT id, l2sq_dist(vector, '{0, 0, 0, 0}'), vector FROM small_world ORDER BY vector <-> ARRAY[0, 0, 0, 0] LIMIT 10; + id | l2sq_dist | vector +-----+-----------+----------- + 000 | 1 | {1,0,0,0} + 001 | 2 | {1,0,0,1} + 010 | 3 | {1,1,1,0} + 011 | 4 | {1,1,1,1} + 100 | 5 | {2,1,0,0} + 101 | 6 | {1,2,0,1} + 110 | 7 | {1,2,1,1} + 002 | 11 | {0,3,1,1} + 111 | 12 | {2,2,2,0} + 020 | 16 | {0,0,4,0} +(10 rows) + +-- -------------------------- +-- Start with unlogged table +-- -------------------------- +DROP TABLE small_world; +CREATE UNLOGGED TABLE small_world ( + id varchar(3), + vector real[] +); +-- Insert (we insert data such that each vector has a unique distance from (0,0,0) +INSERT INTO small_world (id, vector) VALUES +('000', '{1,0,0,0}'), +('001', '{1,0,0,1}'), +('010', '{1,1,1,0}'), +('011', '{1,1,1,1}'), +('100', '{2,1,0,0}'), +('101', '{1,2,0,1}'), +('110', '{1,2,1,1}'), +('111', '{2,2,2,0}'); +-- Create an index +CREATE INDEX small_world_idx ON small_world USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); +INFO: done init usearch index +INFO: inserted 8 elements +INFO: done saving 8 vectors +-- Validate index +SELECT _lantern_internal.validate_index('small_world_idx', false); +INFO: validate_index() start for small_world_idx +INFO: validate_index() done, no issues found. 
+ validate_index +---------------- + +(1 row) + +-- Query +SET enable_seqscan = false; +SELECT id, l2sq_dist(vector, '{0, 0, 0, 0}'), vector FROM small_world ORDER BY vector <-> ARRAY[0, 0, 0, 0] LIMIT 10; + id | l2sq_dist | vector +-----+-----------+----------- + 000 | 1 | {1,0,0,0} + 001 | 2 | {1,0,0,1} + 010 | 3 | {1,1,1,0} + 011 | 4 | {1,1,1,1} + 100 | 5 | {2,1,0,0} + 101 | 6 | {1,2,0,1} + 110 | 7 | {1,2,1,1} + 111 | 12 | {2,2,2,0} +(8 rows) + +-- Switch table to be logged +ALTER TABLE small_world SET LOGGED; +INFO: done init usearch index +INFO: inserted 8 elements +INFO: done saving 8 vectors +-- Create a new index +CREATE INDEX small_world_idx2 ON small_world USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); +INFO: done init usearch index +INFO: inserted 8 elements +INFO: done saving 8 vectors +-- Validate indexes +SELECT _lantern_internal.validate_index('small_world_idx', false); +INFO: validate_index() start for small_world_idx +INFO: validate_index() done, no issues found. + validate_index +---------------- + +(1 row) + +SELECT _lantern_internal.validate_index('small_world_idx2', false); +INFO: validate_index() start for small_world_idx2 +INFO: validate_index() done, no issues found. 
+ validate_index +---------------- + +(1 row) + +-- Insert +INSERT INTO small_world (id, vector) VALUES ('002', '{0,3,1,1}'); +-- Query +SELECT id, l2sq_dist(vector, '{0, 0, 0, 0}'), vector FROM small_world ORDER BY vector <-> ARRAY[0, 0, 0, 0] LIMIT 10; + id | l2sq_dist | vector +-----+-----------+----------- + 000 | 1 | {1,0,0,0} + 001 | 2 | {1,0,0,1} + 010 | 3 | {1,1,1,0} + 011 | 4 | {1,1,1,1} + 100 | 5 | {2,1,0,0} + 101 | 6 | {1,2,0,1} + 110 | 7 | {1,2,1,1} + 002 | 11 | {0,3,1,1} + 111 | 12 | {2,2,2,0} +(9 rows) + +-- Switch table to be unlogged again +ALTER TABLE small_world SET UNLOGGED; +INFO: done init usearch index +INFO: inserted 9 elements +INFO: done saving 9 vectors +INFO: done init usearch index +INFO: inserted 9 elements +INFO: done saving 9 vectors +-- Create a new index +CREATE INDEX small_world_idx3 ON small_world USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); +INFO: done init usearch index +INFO: inserted 9 elements +INFO: done saving 9 vectors +-- Validate indexes +SELECT _lantern_internal.validate_index('small_world_idx', false); +INFO: validate_index() start for small_world_idx +INFO: validate_index() done, no issues found. + validate_index +---------------- + +(1 row) + +SELECT _lantern_internal.validate_index('small_world_idx2', false); +INFO: validate_index() start for small_world_idx2 +INFO: validate_index() done, no issues found. + validate_index +---------------- + +(1 row) + +SELECT _lantern_internal.validate_index('small_world_idx3', false); +INFO: validate_index() start for small_world_idx3 +INFO: validate_index() done, no issues found. 
+ validate_index +---------------- + +(1 row) + +-- Insert +INSERT INTO small_world (id, vector) VALUES ('020', '{0,0,4,0}'); +-- Query +SELECT id, l2sq_dist(vector, '{0, 0, 0, 0}'), vector FROM small_world ORDER BY vector <-> ARRAY[0, 0, 0, 0] LIMIT 10; + id | l2sq_dist | vector +-----+-----------+----------- + 000 | 1 | {1,0,0,0} + 001 | 2 | {1,0,0,1} + 010 | 3 | {1,1,1,0} + 011 | 4 | {1,1,1,1} + 100 | 5 | {2,1,0,0} + 101 | 6 | {1,2,0,1} + 110 | 7 | {1,2,1,1} + 002 | 11 | {0,3,1,1} + 111 | 12 | {2,2,2,0} + 020 | 16 | {0,0,4,0} +(10 rows) + diff --git a/test/schedule.txt b/test/schedule.txt index f8210d6cd..4aac98cd6 100644 --- a/test/schedule.txt +++ b/test/schedule.txt @@ -3,6 +3,6 @@ # - every test that needs to be run iff pgvector is installed appears in a 'test_pgvector:' line # - 'test' lines may have multiple space-separated tests. All tests in a single 'test' line will be run in parallel -test: hnsw_config hnsw_correct hnsw_create hnsw_create_expr hnsw_dist_func hnsw_insert hnsw_select hnsw_todo hnsw_index_from_file hnsw_cost_estimate ext_relocation hnsw_ef_search hnsw_failure_point hnsw_operators hnsw_blockmap_create +test: hnsw_config hnsw_correct hnsw_create hnsw_create_expr hnsw_dist_func hnsw_insert hnsw_select hnsw_todo hnsw_index_from_file hnsw_cost_estimate ext_relocation hnsw_ef_search hnsw_failure_point hnsw_operators hnsw_blockmap_create hnsw_create_unlogged hnsw_insert_unlogged hnsw_logged_unlogged test_pgvector: hnsw_vector test_extras: hnsw_extras diff --git a/test/sql/hnsw_create_unlogged.sql b/test/sql/hnsw_create_unlogged.sql new file mode 100644 index 000000000..768754b68 --- /dev/null +++ b/test/sql/hnsw_create_unlogged.sql @@ -0,0 +1,80 @@ +------------------------------------------------------------------------------ +-- Test HNSW index creation +------------------------------------------------------------------------------ + +-- Validate that index creation works with a small number of vectors +\ir utils/small_world_array_unlogged.sql 
+\ir utils/sift1k_array_unlogged.sql + +-- Validate that creating a secondary index works +CREATE INDEX ON sift_base1k USING hnsw (v) WITH (dim=128, M=4); +SELECT * FROM ldb_get_indexes('sift_base1k'); +SELECT _lantern_internal.validate_index('sift_base1k_v_idx', false); + +-- Validate that index creation works with a larger number of vectors +\ir utils/sift10k_array_unlogged.sql +SET lantern.pgvector_compat=FALSE; + +CREATE INDEX hnsw_idx ON sift_base10k USING hnsw (v dist_l2sq_ops) WITH (M=2, ef_construction=10, ef=4, dim=128); +SELECT v AS v4444 FROM sift_base10k WHERE id = 4444 \gset +EXPLAIN (COSTS FALSE) SELECT * FROM sift_base10k order by v :'v4444' LIMIT 10; +SELECT _lantern_internal.validate_index('hnsw_idx', false); + +--- Validate that M values inside the allowed range [2, 128] do not throw an error + +CREATE INDEX ON small_world USING hnsw (v) WITH (M=2); +CREATE INDEX ON small_world USING hnsw (v) WITH (M=128); + +---- Validate that M values outside the allowed range [2, 128] throw an error +\set ON_ERROR_STOP off +CREATE INDEX ON small_world USING hnsw (v) WITH (M=1); +CREATE INDEX ON small_world USING hnsw (v) WITH (M=129); +\set ON_ERROR_STOP on + +-- Validate index dimension inference +CREATE UNLOGGED TABLE small_world4 ( + id varchar(3), + vector real[] +); +-- If the first row is NULL we do not infer a dimension +\set ON_ERROR_STOP off +CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); +begin; +INSERT INTO small_world4 (id, vector) VALUES +('000', NULL), +('001', '{1,0,0,1}'); +CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); +rollback; +\set ON_ERROR_STOP on + +INSERT INTO small_world4 (id, vector) VALUES +('000', '{1,0,0,0}'), +('001', '{1,0,0,1}'), +('010', '{1,0,1,0}'), +('011', '{1,0,1,1}'), +('100', '{1,1,0,0}'), +('101', '{1,1,0,1}'), +('110', '{1,1,1,0}'), +('111', '{1,1,1,1}'); +CREATE INDEX small_world4_hnsw_idx ON small_world4 USING hnsw (vector) WITH (M=14, 
ef=22, ef_construction=2); +SELECT * FROM ldb_get_indexes('small_world4'); +-- the index will not allow changing the dimension of a vector element +\set ON_ERROR_STOP off +UPDATE small_world4 SET vector = '{0,0,0}' WHERE id = '000'; +UPDATE small_world4 SET vector = '{0,0,0}' WHERE id = '001'; +\set ON_ERROR_STOP on + +INSERT INTO small_world4 (id, vector) VALUES +('000', '{1,0,0,0}'), +('001', '{1,0,0,1}'), +('010', '{1,0,1,0}'); + +SELECT _lantern_internal.validate_index('small_world4_hnsw_idx', false); + +-- without the index, I can change the dimension of a vector element +DROP INDEX small_world4_hnsw_idx; +UPDATE small_world4 SET vector = '{0,0,0}' WHERE id = '001'; +-- but then, I cannot create the same dimension-inferred index +\set ON_ERROR_STOP off +CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); +\set ON_ERROR_STOP on diff --git a/test/sql/hnsw_insert_unlogged.sql b/test/sql/hnsw_insert_unlogged.sql new file mode 100644 index 000000000..bf8066cea --- /dev/null +++ b/test/sql/hnsw_insert_unlogged.sql @@ -0,0 +1,94 @@ +--------------------------------------------------------------------- +-- Test HNSW index inserts on empty table +--------------------------------------------------------------------- +-- set an artificially low work_mem to make sure work_mem exceeded warnings are printed +set work_mem = '64kB'; +-- We do not actually print the warnings generated for exceeding work_mem because the work_mem +-- check does not work for postgres 13 and lower.So, if we printed the warnings, we would get a regression +-- failure in older postgres versions. 
We still reduce workmem to exercise relevant codepaths for coverage +set client_min_messages = 'ERROR'; + +CREATE UNLOGGED TABLE small_world ( + id SERIAL PRIMARY KEY, + v REAL[2] -- this demonstates that postgres actually does not enforce real[] length as we actually insert vectors of length 3 +); + +CREATE UNLOGGED TABLE small_world_int ( + id SERIAL PRIMARY KEY, + v INTEGER[] +); + +CREATE INDEX ON small_world USING hnsw (v) WITH (dim=3); +SELECT _lantern_internal.validate_index('small_world_v_idx', false); + +-- Insert rows with valid vector data +INSERT INTO small_world (v) VALUES ('{0,0,1}'), ('{0,1,0}'); +INSERT INTO small_world (v) VALUES (NULL); + +-- Attempt to insert a row with an incorrect vector length +\set ON_ERROR_STOP off +-- Cannot create an hnsw index with implicit typecasts (trying to cast integer[] to real[], in this case) +CREATE INDEX ON small_world_int USING hnsw (v dist_l2sq_ops) WITH (dim=3); +INSERT INTO small_world (v) VALUES ('{1,1,1,1}'); +\set ON_ERROR_STOP on + +DROP TABLE small_world; + +-- set work_mem to a value that is enough for the tests +set client_min_messages = 'WARNING'; +set work_mem = '10MB'; + +--------------------------------------------------------------------- +-- Test HNSW index inserts on non-empty table +--------------------------------------------------------------------- + +\ir utils/small_world_array_unlogged.sql + +CREATE INDEX ON small_world USING hnsw (v) WITH (dim=3); + +SET enable_seqscan = false; +SET lantern.pgvector_compat = false; + +-- Inserting vectors of the same dimension and nulls should work +INSERT INTO small_world (v) VALUES ('{1,1,2}'); +INSERT INTO small_world (v) VALUES (NULL); + +-- Inserting vectors of different dimension should fail +\set ON_ERROR_STOP off +INSERT INTO small_world (v) VALUES ('{4,4,4,4}'); +\set ON_ERROR_STOP on + +-- Verify that the index works with the inserted vectors +SELECT + ROUND(l2sq_dist(v, '{0,0,0}')::numeric, 2) +FROM + small_world +ORDER BY + v '{0,0,0}'; + +-- 
Ensure the index size remains consistent after inserts +SELECT * from ldb_get_indexes('small_world'); + +-- Ensure the query plan remains consistent after inserts +EXPLAIN (COSTS FALSE) +SELECT + ROUND(l2sq_dist(v, '{0,0,0}')::numeric, 2) +FROM + small_world +ORDER BY + v '{0,0,0}' +LIMIT 10; + +SELECT _lantern_internal.validate_index('small_world_v_idx', false); + +-- Test the index with a larger number of vectors +CREATE UNLOGGED TABLE sift_base10k ( + id SERIAL PRIMARY KEY, + v REAL[128] +); +CREATE INDEX hnsw_idx ON sift_base10k USING hnsw (v dist_l2sq_ops) WITH (M=2, ef_construction=10, ef=4, dim=128); +\COPY sift_base10k (v) FROM '/tmp/lantern/vector_datasets/siftsmall_base_arrays.csv' WITH CSV; +SELECT v AS v4444 FROM sift_base10k WHERE id = 4444 \gset +EXPLAIN (COSTS FALSE) SELECT * FROM sift_base10k order by v :'v4444'; + +SELECT _lantern_internal.validate_index('hnsw_idx', false); diff --git a/test/sql/hnsw_logged_unlogged.sql b/test/sql/hnsw_logged_unlogged.sql new file mode 100644 index 000000000..fe4d7700a --- /dev/null +++ b/test/sql/hnsw_logged_unlogged.sql @@ -0,0 +1,138 @@ +-- Test changing tables from logged to unlogged, and from unlogged to logged + +-- -------------------------- +-- Start with logged table +-- -------------------------- +CREATE TABLE small_world ( + id varchar(3), + vector real[] +); + +-- Insert (we insert data such that each vector has a unique distance from (0,0,0) +INSERT INTO small_world (id, vector) VALUES +('000', '{1,0,0,0}'), +('001', '{1,0,0,1}'), +('010', '{1,1,1,0}'), +('011', '{1,1,1,1}'), +('100', '{2,1,0,0}'), +('101', '{1,2,0,1}'), +('110', '{1,2,1,1}'), +('111', '{2,2,2,0}'); + + +-- Create an index +CREATE INDEX small_world_idx ON small_world USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); + +-- Validate index +SELECT _lantern_internal.validate_index('small_world_idx', false); + +-- Query +SET enable_seqscan = false; +SELECT id, l2sq_dist(vector, '{0, 0, 0, 0}'), vector FROM small_world ORDER BY 
vector <-> ARRAY[0, 0, 0, 0] LIMIT 10; + + +-- Switch table to be unlogged +ALTER TABLE small_world SET UNLOGGED; + +-- Create a new index +CREATE INDEX small_world_idx2 ON small_world USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); + +-- Validate indexes +SELECT _lantern_internal.validate_index('small_world_idx', false); +SELECT _lantern_internal.validate_index('small_world_idx2', false); + +-- Insert +INSERT INTO small_world (id, vector) VALUES ('002', '{0,3,1,1}'); + +-- Query +SELECT id, l2sq_dist(vector, '{0, 0, 0, 0}'), vector FROM small_world ORDER BY vector <-> ARRAY[0, 0, 0, 0] LIMIT 10; + + +-- Switch table to be logged again +ALTER TABLE small_world SET LOGGED; + +-- Create a new index +CREATE INDEX small_world_idx3 ON small_world USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); + +-- Validate indexes +SELECT _lantern_internal.validate_index('small_world_idx', false); +SELECT _lantern_internal.validate_index('small_world_idx2', false); +SELECT _lantern_internal.validate_index('small_world_idx3', false); + +-- Insert +INSERT INTO small_world (id, vector) VALUES ('020', '{0,0,4,0}'); + +-- Query +SELECT id, l2sq_dist(vector, '{0, 0, 0, 0}'), vector FROM small_world ORDER BY vector <-> ARRAY[0, 0, 0, 0] LIMIT 10; + + +-- -------------------------- +-- Start with unlogged table +-- -------------------------- +DROP TABLE small_world; + +CREATE UNLOGGED TABLE small_world ( + id varchar(3), + vector real[] +); + +-- Insert (we insert data such that each vector has a unique distance from (0,0,0) +INSERT INTO small_world (id, vector) VALUES +('000', '{1,0,0,0}'), +('001', '{1,0,0,1}'), +('010', '{1,1,1,0}'), +('011', '{1,1,1,1}'), +('100', '{2,1,0,0}'), +('101', '{1,2,0,1}'), +('110', '{1,2,1,1}'), +('111', '{2,2,2,0}'); + + +-- Create an index +CREATE INDEX small_world_idx ON small_world USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); + +-- Validate index +SELECT _lantern_internal.validate_index('small_world_idx', false); + +-- 
Query +SET enable_seqscan = false; +SELECT id, l2sq_dist(vector, '{0, 0, 0, 0}'), vector FROM small_world ORDER BY vector <-> ARRAY[0, 0, 0, 0] LIMIT 10; + + +-- Switch table to be logged +ALTER TABLE small_world SET LOGGED; + +-- Create a new index +CREATE INDEX small_world_idx2 ON small_world USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); + +-- Validate indexes +SELECT _lantern_internal.validate_index('small_world_idx', false); +SELECT _lantern_internal.validate_index('small_world_idx2', false); + +-- Insert +INSERT INTO small_world (id, vector) VALUES ('002', '{0,3,1,1}'); + +-- Query +SELECT id, l2sq_dist(vector, '{0, 0, 0, 0}'), vector FROM small_world ORDER BY vector <-> ARRAY[0, 0, 0, 0] LIMIT 10; + + + +-- Switch table to be unlogged again +ALTER TABLE small_world SET UNLOGGED; + +-- Create a new index +CREATE INDEX small_world_idx3 ON small_world USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); + +-- Validate indexes +SELECT _lantern_internal.validate_index('small_world_idx', false); +SELECT _lantern_internal.validate_index('small_world_idx2', false); +SELECT _lantern_internal.validate_index('small_world_idx3', false); + +-- Insert +INSERT INTO small_world (id, vector) VALUES ('020', '{0,0,4,0}'); + +-- Query +SELECT id, l2sq_dist(vector, '{0, 0, 0, 0}'), vector FROM small_world ORDER BY vector <-> ARRAY[0, 0, 0, 0] LIMIT 10; + + + diff --git a/test/sql/manual_tests/hnsw_unlogged_post.sql b/test/sql/manual_tests/hnsw_unlogged_post.sql new file mode 100644 index 000000000..686ceb8da --- /dev/null +++ b/test/sql/manual_tests/hnsw_unlogged_post.sql @@ -0,0 +1,58 @@ +-- INSTRUCTIONS +-- this test is only to be run after running the `hnsw_unlogged_pre.sql` test and crashing postgres + +-- Validate recovered unlogged index structure (postgres should have moved the init fork data for these indexes to their main forks) +SELECT _lantern_internal.validate_index('unlogged_world1_hnsw_idx', true); +--SELECT 
_lantern_internal.validate_index('unlogged_world2_hnsw_idx', true); +SELECT _lantern_internal.validate_index('unlogged_world3_hnsw_idx', true); +SELECT _lantern_internal.validate_index('unlogged_world4_hnsw_idx', true); +SELECT _lantern_internal.validate_index('morph_world_hnsw_idx', true); +SELECT _lantern_internal.validate_index('morph_world2_hnsw_idx', true); + +-- Verify that the tables are now in fact empty after the crash, since tables are unlogged +SELECT * from unlogged_world1; +--SELECT * from unlogged_world2; +SELECT * from unlogged_world3; +SELECT * from unlogged_world4; +SELECT * from morph_world; +SELECT * from morph_world2; + +-- Verify that the indexes are operational +set enable_seqscan = false; +set enable_indexscan = true; + +-- These should use an index scan and return nothing (since table is empty) +EXPLAIN SELECT * FROM unlogged_world1 ORDER BY vector <-> ARRAY[0, 0, 0, 0] LIMIT 10; +SELECT * FROM unlogged_world1 ORDER BY vector <-> ARRAY[0, 0, 0, 0] LIMIT 10; + +--EXPLAIN SELECT * FROM unlogged_world2 ORDER BY vector <-> ARRAY[0, 0, 0, 0] LIMIT 10; +--SELECT * FROM unlogged_world2 ORDER BY vector <-> ARRAY[0, 0, 0, 0] LIMIT 10; + +EXPLAIN SELECT * FROM unlogged_world3 ORDER BY vector <-> ARRAY[0, 0, 0, 0] LIMIT 10; +SELECT * FROM unlogged_world3 ORDER BY vector <-> ARRAY[0, 0, 0, 0] LIMIT 10; + +EXPLAIN SELECT * FROM unlogged_world4 ORDER BY vector <-> ARRAY[0, 0, 0, 0] LIMIT 10; +SELECT * FROM unlogged_world4 ORDER BY vector <-> ARRAY[0, 0, 0, 0] LIMIT 10; + +EXPLAIN SELECT * FROM morph_world ORDER BY vector <-> ARRAY[0, 0, 0, 0] LIMIT 10; +SELECT * FROM morph_world ORDER BY vector <-> ARRAY[0, 0, 0, 0] LIMIT 10; + +EXPLAIN SELECT * FROM morph_world2 ORDER BY vector <-> ARRAY[0, 0, 0, 0] LIMIT 10; +SELECT * FROM morph_world2 ORDER BY vector <-> ARRAY[0, 0, 0, 0] LIMIT 10; + +-- Insert data into each one +INSERT INTO unlogged_world1 (id, vector) VALUES ('101', '{1,2,3,4}'); +--INSERT INTO unlogged_world2 (id, vector) VALUES ('101', 
'{1,2,3,4}'); +INSERT INTO unlogged_world3 (id, vector) VALUES ('101', '{1,2,3,4}'); +INSERT INTO unlogged_world4 (id, vector) VALUES ('101', '{1,2,3,4}'); +INSERT INTO morph_world (id, vector) VALUES ('101', '{1,2,3,4}'); +INSERT INTO morph_world2 (id, vector) VALUES ('101', '{1,2,3,4}'); + + +-- Test queries after new data inserted +SELECT * FROM unlogged_world1 ORDER BY vector <-> ARRAY[0, 0, 0, 0] LIMIT 10; +--SELECT * FROM unlogged_world2 ORDER BY vector <-> ARRAY[0, 0, 0, 0] LIMIT 10; +SELECT * FROM unlogged_world3 ORDER BY vector <-> ARRAY[0, 0, 0, 0] LIMIT 10; +SELECT * FROM unlogged_world4 ORDER BY vector <-> ARRAY[0, 0, 0, 0] LIMIT 10; +SELECT * FROM morph_world ORDER BY vector <-> ARRAY[0, 0, 0, 0] LIMIT 10; +SELECT * FROM morph_world2 ORDER BY vector <-> ARRAY[0, 0, 0, 0] LIMIT 10; diff --git a/test/sql/manual_tests/hnsw_unlogged_pre.sql b/test/sql/manual_tests/hnsw_unlogged_pre.sql new file mode 100644 index 000000000..cc573bd3d --- /dev/null +++ b/test/sql/manual_tests/hnsw_unlogged_pre.sql @@ -0,0 +1,139 @@ +-- INSTRUCTIONS +-- run this file first, and then crash +-- then, run the `hnsw_unlogged_post.sql` test + +DROP TABLE IF EXISTS unlogged_world1; +DROP TABLE IF EXISTS unlogged_world2; +DROP TABLE IF EXISTS unlogged_world3; +DROP TABLE IF EXISTS unlogged_world4; + +-- Explanation of tables +-- unlogged_world1: empty, dimension specified in index +-- unlogged_world2: empty, dimension not specified in index (this will error for now, ignored at the moment) +-- unlogged_world3: non-empty, dimension specified in index +-- unlogged_world4: non-empty, dimension not specified in index + +-- morph_world: will start as unlogged and then be altered to logged; non-empty, dimension not specified +-- morph_world2: will start as logged and then be altered to unlogged; non-empty, dimension not specified + + +CREATE UNLOGGED TABLE unlogged_world1 ( + id varchar(3), + vector real[] +); + +/* +CREATE UNLOGGED TABLE unlogged_world2 ( + id varchar(3), + vector real[] 
+); +*/ + +CREATE UNLOGGED TABLE unlogged_world3 ( + id varchar(3), + vector real[] +); + +CREATE UNLOGGED TABLE unlogged_world4 ( + id varchar(3), + vector real[] +); + +CREATE UNLOGGED TABLE morph_world ( + id varchar(3), + vector real[] +); + +CREATE TABLE morph_world2 ( + id varchar(3), + vector real[] +); + +-- Insert data into some tables + +INSERT INTO unlogged_world3 (id, vector) VALUES +('000', '{1,0,0,0}'), +('001', '{1,0,0,1}'), +('010', '{1,0,1,0}'), +('011', '{1,0,1,1}'), +('100', '{1,1,0,0}'), +('101', '{1,1,0,1}'), +('110', '{1,1,1,0}'), +('111', '{1,1,1,1}'); + +INSERT INTO unlogged_world4 (id, vector) VALUES +('000', '{1,0,0,0}'), +('001', '{1,0,0,1}'), +('010', '{1,0,1,0}'), +('011', '{1,0,1,1}'), +('100', '{1,1,0,0}'), +('101', '{1,1,0,1}'), +('110', '{1,1,1,0}'), +('111', '{1,1,1,1}'); + +INSERT INTO morph_world (id, vector) VALUES +('000', '{1,0,0,0}'), +('001', '{1,0,0,1}'), +('010', '{1,0,1,0}'), +('011', '{1,0,1,1}'), +('100', '{1,1,0,0}'), +('101', '{1,1,0,1}'), +('110', '{1,1,1,0}'), +('111', '{1,1,1,1}'); + +INSERT INTO morph_world2 (id, vector) VALUES +('000', '{1,0,0,0}'), +('001', '{1,0,0,1}'), +('010', '{1,0,1,0}'), +('011', '{1,0,1,1}'), +('100', '{1,1,0,0}'), +('101', '{1,1,0,1}'), +('110', '{1,1,1,0}'), +('111', '{1,1,1,1}'); + + +-- Change table status +ALTER TABLE morph_world SET LOGGED; + +ALTER TABLE morph_world2 SET UNLOGGED; + + +-- Verify contents of unlogged tables pre-crash +SELECT * from unlogged_world1; +--SELECT * from unlogged_world2; +SELECT * from unlogged_world3; +SELECT * from unlogged_world4; +SELECT * from morph_world; +SELECT * from morph_world2; + + +-- Create indexes +CREATE INDEX unlogged_world1_hnsw_idx ON unlogged_world1 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2, dim=4); +--CREATE INDEX unlogged_world2_hnsw_idx ON unlogged_world2 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); +CREATE INDEX unlogged_world3_hnsw_idx ON unlogged_world3 USING hnsw (vector) WITH (M=14, ef=22, 
ef_construction=2, dim=4); +CREATE INDEX unlogged_world4_hnsw_idx ON unlogged_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); +CREATE INDEX morph_world_hnsw_idx ON morph_world USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); +CREATE INDEX morph_world2_hnsw_idx ON morph_world2 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); + + + +-- Validate indexes pre-crash +SELECT _lantern_internal.validate_index('unlogged_world1_hnsw_idx', true); +--SELECT _lantern_internal.validate_index('unlogged_world2_hnsw_idx', true); +SELECT _lantern_internal.validate_index('unlogged_world3_hnsw_idx', true); +SELECT _lantern_internal.validate_index('unlogged_world4_hnsw_idx', true); +SELECT _lantern_internal.validate_index('morph_world_hnsw_idx', true); +SELECT _lantern_internal.validate_index('morph_world2_hnsw_idx', true); + + + +-- Now, we crash the database (todo:: find a way to do this programmatically from within this .sql file?) +-- We can do this in one of two ways. Either: +-- 1. Find pid of master pg process using `ps aux | grep postgres` and then kill it with `kill -9` +-- OR +-- 2. 
`pg_ctl stop -D {PGDATA DIRECTORY} -m immediate` + +-- After crashing, restart it with: +-- sudo systemctl restart postgresql + +-- Then, run `hnsw_unlogged_post.sql` \ No newline at end of file diff --git a/test/sql/utils/sift10k_array_unlogged.sql b/test/sql/utils/sift10k_array_unlogged.sql new file mode 100644 index 000000000..626f08cc5 --- /dev/null +++ b/test/sql/utils/sift10k_array_unlogged.sql @@ -0,0 +1,5 @@ +CREATE UNLOGGED TABLE IF NOT EXISTS sift_base10k ( + id SERIAL PRIMARY KEY, + v REAL[128] +); +\copy sift_base10k (v) FROM '/tmp/lantern/vector_datasets/siftsmall_base_arrays.csv' with csv; \ No newline at end of file diff --git a/test/sql/utils/sift1k_array_unlogged.sql b/test/sql/utils/sift1k_array_unlogged.sql new file mode 100644 index 000000000..b984a75f5 --- /dev/null +++ b/test/sql/utils/sift1k_array_unlogged.sql @@ -0,0 +1,6 @@ +CREATE UNLOGGED TABLE IF NOT EXISTS sift_base1k ( + id SERIAL, + v REAL[] +); + +COPY sift_base1k (v) FROM '/tmp/lantern/vector_datasets/sift_base1k_arrays.csv' WITH csv; diff --git a/test/sql/utils/small_world_array_unlogged.sql b/test/sql/utils/small_world_array_unlogged.sql new file mode 100644 index 000000000..671cd0bd0 --- /dev/null +++ b/test/sql/utils/small_world_array_unlogged.sql @@ -0,0 +1,15 @@ +CREATE UNLOGGED TABLE small_world ( + id VARCHAR(3), + b BOOLEAN, + v REAL[3] +); + +INSERT INTO small_world (id, b, v) VALUES + ('000', TRUE, '{0,0,0}'), + ('001', TRUE, '{0,0,1}'), + ('010', FALSE, '{0,1,0}'), + ('011', TRUE, '{0,1,1}'), + ('100', FALSE, '{1,0,0}'), + ('101', FALSE, '{1,0,1}'), + ('110', FALSE, '{1,1,0}'), + ('111', TRUE, '{1,1,1}');