Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added hnsw.ef_search variable to change expansion factor during search at runtime #199

Merged
merged 10 commits into from
Oct 25, 2023
Merged
16 changes: 15 additions & 1 deletion src/hnsw/options.c
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
static relopt_kind ldb_hnsw_index_withopts;

int ldb_hnsw_init_k;
int ldb_hnsw_ef_search;

// this variable is only set during testing and controls whether
// certain elog() calls are made
Expand Down Expand Up @@ -188,7 +189,7 @@ void _PG_init(void)

add_int_reloption(ldb_hnsw_index_withopts,
"ef",
"HNSW ef-construction hyperparameter",
"HNSW ef-search hyperparameter",
HNSW_DEFAULT_EF,
1,
HNSW_MAX_EF
Expand Down Expand Up @@ -220,6 +221,19 @@ void _PG_init(void)
NULL,
NULL);

DefineCustomIntVariable("lantern_hnsw.ef_search",
"Expansion factor to use during vector search in a scan",
"Valid values are in range [1, 400]",
&ldb_hnsw_ef_search,
USEARCH_SEARCH_EF_INVALID_VALUE,
1,
HNSW_MAX_EF,
PGC_USERSET,
0,
NULL,
NULL,
NULL);

DefineCustomBoolVariable("_lantern_internal.is_test",
"Whether or not the DB is in a regression test",
"set this to 1 to enable extra logging for use in lanterndb regression tests",
Expand Down
1 change: 1 addition & 0 deletions src/hnsw/options.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ usearch_metric_kind_t ldb_HnswGetMetricKind(Relation index);
bytea* ldb_amoptions(Datum reloptions, bool validate);

extern int ldb_hnsw_init_k;
extern int ldb_hnsw_ef_search;
extern bool ldb_is_test;

#endif // LDB_HNSW_OPTIONS_H
22 changes: 18 additions & 4 deletions src/hnsw/scan.c
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,8 @@ bool ldb_amgettuple(IndexScanDesc scan, ScanDirection dir)
// about the furthest neighbors
Assert(ScanDirectionIsForward(dir));

int ef = ldb_hnsw_ef_search; // 0 if not set, but we pass it into usearch_custom_ef anyway since 0 is also a
// sentinel value there
if(scanstate->first) {
int num_returned;
Datum value;
Expand Down Expand Up @@ -199,8 +201,14 @@ bool ldb_amgettuple(IndexScanDesc scan, ScanDirection dir)
k,
"index size exceeded work_mem during scan, consider increasing work_mem");
ldb_dlog("LANTERN querying index for %d elements", k);
num_returned = usearch_search(
scanstate->usearch_index, vec, usearch_scalar_f32_k, k, scanstate->labels, scanstate->distances, &error);
num_returned = usearch_search(scanstate->usearch_index,
vec,
usearch_scalar_f32_k,
k,
ef,
scanstate->labels,
scanstate->distances,
&error);
ldb_wal_retriever_area_reset(scanstate->retriever_ctx, NULL);

scanstate->count = num_returned;
Expand Down Expand Up @@ -240,8 +248,14 @@ bool ldb_amgettuple(IndexScanDesc scan, ScanDirection dir)
"index size exceeded work_mem during scan, consider increasing work_mem");

ldb_dlog("LANTERN - querying index for %d elements", k);
num_returned = usearch_search(
scanstate->usearch_index, vec, usearch_scalar_f32_k, k, scanstate->labels, scanstate->distances, &error);
num_returned = usearch_search(scanstate->usearch_index,
vec,
usearch_scalar_f32_k,
k,
ef,
scanstate->labels,
scanstate->distances,
&error);
ldb_wal_retriever_area_reset(scanstate->retriever_ctx, NULL);

scanstate->count = num_returned;
Expand Down
197 changes: 197 additions & 0 deletions test/expected/hnsw_ef_search.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
------------------------------------------------------------------------------
-- Test changing lantern_hnsw.ef_search variable at runtime
------------------------------------------------------------------------------
\ir utils/sift1k_array.sql
CREATE TABLE IF NOT EXISTS sift_base1k (
id SERIAL,
v REAL[]
);
COPY sift_base1k (v) FROM '/tmp/lantern/vector_datasets/sift_base1k_arrays.csv' WITH csv;
CREATE INDEX hnsw_l2_index ON sift_base1k USING hnsw (v) WITH (_experimental_index_path='/tmp/lantern/files/index-sift1k-l2.usearch');
INFO: done init usearch index
INFO: done loading usearch index
INFO: done saving 1000 vectors
SELECT * FROM ldb_get_indexes('sift_base1k');
indexname | size | indexdef | total_index_size
---------------+--------+----------------------------------------------------------------------------------------------------------------------------------------------+------------------
hnsw_l2_index | 720 kB | CREATE INDEX hnsw_l2_index ON public.sift_base1k USING hnsw (v) WITH (_experimental_index_path='/tmp/lantern/files/index-sift1k-l2.usearch') | 720 kB
(1 row)

INSERT INTO sift_base1k (id, v) VALUES
(1001, array_fill(1, ARRAY[128])),
(1002, array_fill(2, ARRAY[128]));
-- Validate error on invalid ef_search values
\set ON_ERROR_STOP off
SET lantern_hnsw.ef_search = -1;
ERROR: -1 is outside the valid range for parameter "lantern_hnsw.ef_search" (1 .. 400)
SET lantern_hnsw.ef_search = 0;
ERROR: 0 is outside the valid range for parameter "lantern_hnsw.ef_search" (1 .. 400)
SET lantern_hnsw.ef_search = 401;
ERROR: 401 is outside the valid range for parameter "lantern_hnsw.ef_search" (1 .. 400)
\set ON_ERROR_STOP on
-- Repeat the same query while varying ef parameter
-- NOTE: it is not entirely known if the results of these are deterministic
SET enable_seqscan = false;
SELECT v AS v1001 FROM sift_base1k WHERE id = 1001 \gset
-- Queries below have the same result
SET lantern_hnsw.ef_search = 1;
SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 10;
round
-----------
0.00
128.00
249249.00
249285.00
249418.00
249515.00
249589.00
249647.00
249652.00
249675.00
(10 rows)

SET lantern_hnsw.ef_search = 2;
SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 10;
round
-----------
0.00
128.00
249249.00
249285.00
249418.00
249515.00
249589.00
249647.00
249652.00
249675.00
(10 rows)

SET lantern_hnsw.ef_search = 4;
SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 10;
round
-----------
0.00
128.00
249249.00
249285.00
249418.00
249515.00
249589.00
249647.00
249652.00
249675.00
(10 rows)

SET lantern_hnsw.ef_search = 8;
SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 10;
round
-----------
0.00
128.00
249249.00
249285.00
249418.00
249515.00
249589.00
249647.00
249652.00
249675.00
(10 rows)

SET lantern_hnsw.ef_search = 16;
SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 10;
round
-----------
0.00
128.00
249249.00
249285.00
249418.00
249515.00
249589.00
249647.00
249652.00
249675.00
(10 rows)

-- Queries below have the same result, which is different from above
SET lantern_hnsw.ef_search = 32;
SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 10;
round
-----------
0.00
128.00
249249.00
249285.00
249418.00
249457.00
249515.00
249589.00
249647.00
249652.00
(10 rows)

SET lantern_hnsw.ef_search = 64;
SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 10;
round
-----------
0.00
128.00
249249.00
249285.00
249418.00
249457.00
249515.00
249589.00
249647.00
249652.00
(10 rows)

SET lantern_hnsw.ef_search = 128;
SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 10;
round
-----------
0.00
128.00
249249.00
249285.00
249418.00
249457.00
249515.00
249589.00
249647.00
249652.00
(10 rows)

SET lantern_hnsw.ef_search = 256;
SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 10;
round
-----------
0.00
128.00
249249.00
249285.00
249418.00
249457.00
249515.00
249589.00
249647.00
249652.00
(10 rows)

SET lantern_hnsw.ef_search = 400;
SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 10;
round
-----------
0.00
128.00
249249.00
249285.00
249418.00
249457.00
249515.00
249589.00
249647.00
249652.00
(10 rows)

2 changes: 1 addition & 1 deletion test/schedule.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@
# - 'test' lines may have multiple space-separated tests. All tests in a single 'test' line will be run in parallel

test_pgvector: hnsw_vector
test: hnsw_config hnsw_correct hnsw_create hnsw_create_expr hnsw_dist_func hnsw_insert hnsw_select hnsw_todo hnsw_index_from_file hnsw_cost_estimate ext_relocation
test: hnsw_config hnsw_correct hnsw_create hnsw_create_expr hnsw_dist_func hnsw_insert hnsw_select hnsw_todo hnsw_index_from_file hnsw_cost_estimate ext_relocation hnsw_ef_search
56 changes: 56 additions & 0 deletions test/sql/hnsw_ef_search.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
------------------------------------------------------------------------------
-- Test changing lantern_hnsw.ef_search variable at runtime
--
-- Outline:
--   1. Load the sift_base1k fixture and build an HNSW index over it.
--   2. Check that out-of-range ef_search values are rejected.
--   3. Run one fixed nearest-neighbor query under a sweep of ef_search values
--      and record the distances returned for each setting.
------------------------------------------------------------------------------

-- Fixture: creates sift_base1k (id, v REAL[]) and loads it from a CSV file.
\ir utils/sift1k_array.sql

-- Build the index from a previously saved usearch index file, then list the
-- indexes on the table so that the index definition and size are recorded.
CREATE INDEX hnsw_l2_index ON sift_base1k USING hnsw (v) WITH (_experimental_index_path='/tmp/lantern/files/index-sift1k-l2.usearch');
SELECT * FROM ldb_get_indexes('sift_base1k');

-- Add two constant 128-element vectors (all 1s and all 2s) after the index
-- exists. Row 1001 is used as the query vector below; row 1002 differs from it
-- by 1 in each of the 128 positions, i.e. a squared L2 distance of 128.
INSERT INTO sift_base1k (id, v) VALUES
(1001, array_fill(1, ARRAY[128])),
(1002, array_fill(2, ARRAY[128]));

-- Validate error on invalid ef_search values
-- Each SET below is expected to fail (accepted range is 1 .. 400), so
-- ON_ERROR_STOP is switched off around them to let the script continue.
\set ON_ERROR_STOP off
SET lantern_hnsw.ef_search = -1;
SET lantern_hnsw.ef_search = 0;
SET lantern_hnsw.ef_search = 401;
\set ON_ERROR_STOP on

-- Repeat the same query while varying ef parameter
-- NOTE: it is not entirely known if the results of these are deterministic
-- Sequential scans are disabled, presumably so that the planner picks the HNSW
-- index for the ORDER BY ... <-> queries below (this script does not verify
-- the plan with EXPLAIN).
SET enable_seqscan = false;
-- Store the vector of row 1001 in the psql variable v1001 for use as the
-- query vector in every statement that follows.
SELECT v AS v1001 FROM sift_base1k WHERE id = 1001 \gset

-- Queries below have the same result
-- (ef_search = 1, 2, 4, 8, 16)
SET lantern_hnsw.ef_search = 1;
SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 10;

SET lantern_hnsw.ef_search = 2;
SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 10;

SET lantern_hnsw.ef_search = 4;
SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 10;

SET lantern_hnsw.ef_search = 8;
SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 10;

SET lantern_hnsw.ef_search = 16;
SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 10;

-- Queries below have the same result, which is different from above
-- (ef_search = 32, 64, 128, 256, 400; in the recorded output these settings
-- return a closer neighbor that the smaller settings do not return)
SET lantern_hnsw.ef_search = 32;
SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 10;

SET lantern_hnsw.ef_search = 64;
SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 10;

SET lantern_hnsw.ef_search = 128;
SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 10;

SET lantern_hnsw.ef_search = 256;
SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 10;

-- 400 is the upper bound of the accepted range.
SET lantern_hnsw.ef_search = 400;
SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 10;
2 changes: 1 addition & 1 deletion third_party/usearch