diff --git a/Dockerfile.dev b/Dockerfile.dev
index 81dfa378b..f675ff9d6 100644
--- a/Dockerfile.dev
+++ b/Dockerfile.dev
@@ -40,8 +40,9 @@ RUN rm -rf build \
     && make install
 
 # Install benchmarking tools in build folder
-RUN git clone https://github.com/lanterndata/benchmark \
-    && cd benchmark \
+RUN mkdir build/lantern \
+    && git clone https://github.com/lanterndata/benchmark build/benchmark \
+    && cd build/benchmark \
     && pip install -r core/requirements.txt --break-system-packages \
     && pip install -r external/requirements.txt --break-system-packages
 ENV DATABASE_URL=postgres://postgres:postgres@localhost:5432/postgres
diff --git a/src/hooks/executor_start.c b/src/hooks/executor_start.c
index 866cae499..4a55ffb71 100644
--- a/src/hooks/executor_start.c
+++ b/src/hooks/executor_start.c
@@ -59,7 +59,7 @@ static void validate_operator_usage(Plan *plan, List *oidList)
     context.oidList = oidList;
     context.isIndexScan = false;
     if(operator_used_incorrectly_walker((Node *)plan, (void *)&context)) {
-        elog(ERROR, "Operator <-> has no standalone meaning and is reserved for use in vector index lookups only");
+        elog(ERROR, "Operator <-> can only be used inside of an index");
     }
 }
 
diff --git a/src/hooks/post_parse.c b/src/hooks/post_parse.c
index 2a08ec6c9..637e59883 100644
--- a/src/hooks/post_parse.c
+++ b/src/hooks/post_parse.c
@@ -74,6 +74,27 @@ typedef struct
     bool usedCorrectly;
 } OperatorUsedCorrectlyContext;
 
+/*
+ * Returns true when node is a Var, or a FuncExpr with at least one argument
+ * that itself satisfies this test (checked recursively). NULL and every
+ * other node type yield false.
+ */
+static bool is_var_or_func_of_vars(Node *node)
+{
+    if(node == NULL) return false;
+    if(IsA(node, Var)) {
+        return true;
+    } else if(IsA(node, FuncExpr)) {
+        List     *args = ((FuncExpr *)node)->args;
+        ListCell *cell;
+        foreach(cell, args) {
+            if(is_var_or_func_of_vars(lfirst(cell))) {
+                return true;
+            }
+        }
+    }
+    return false;
+}
+
 static bool operator_used_incorrectly_walker(Node *node, OperatorUsedCorrectlyContext *context)
 {
     if(node == NULL) return false;
@@ -90,12 +111,19 @@ static bool operator_used_incorrectly_walker(Node *node, OperatorUsedCorrectlyCo
             bool isVar2 = IsA(arg2, Var);
             if(isVar1 && isVar2) {
                 return false;
-            } else if(!isVar1 && !isVar2) {
-                return true;
-            } else if(isVar1) {
+            } else if(isVar1 && !isVar2) {
                 return operator_used_incorrectly_walker(arg2, context);
-            } else {
+            } else if(!isVar1 && isVar2) {
                 return operator_used_incorrectly_walker(arg1, context);
+            } else {
+                bool isFuncOfVars1 = is_var_or_func_of_vars(arg1);
+                bool isFuncOfVars2 = is_var_or_func_of_vars(arg2);
+                if(!isFuncOfVars1 && !isFuncOfVars2) {
+                    return true;
+                } else {
+                    return operator_used_incorrectly_walker(arg1, context)
+                           || operator_used_incorrectly_walker(arg2, context);
+                }
             }
         }
     }
@@ -140,7 +168,7 @@ void post_parse_analyze_hook_with_operator_check(ParseState *pstate,
     if(is_operator_used(query_as_node, oidList)) {
         List *sort_group_refs = get_sort_group_refs(query_as_node);
         if(is_operator_used_incorrectly(query_as_node, oidList, sort_group_refs)) {
-            elog(ERROR, "Operator <-> has no standalone meaning and is reserved for use in vector index lookups only");
+            elog(ERROR, "Operator <-> is invalid outside of ORDER BY context");
         }
         list_free(sort_group_refs);
     }
diff --git a/test/expected/hnsw_create_expr.out b/test/expected/hnsw_create_expr.out
new file mode 100644
index 000000000..3c9b26f18
--- /dev/null
+++ b/test/expected/hnsw_create_expr.out
@@ -0,0 +1,23 @@
+CREATE OR REPLACE FUNCTION int_to_fixed_binary_real_array(n INT) RETURNS REAL[] AS $$
+DECLARE
+    binary_string TEXT;
+    real_array REAL[] := '{}';
+    i INT;
+BEGIN
+    binary_string := lpad(CAST(n::BIT(3) AS TEXT), 3, '0');
+    FOR i IN 1..length(binary_string)
+    LOOP
+        real_array := array_append(real_array, CAST(substring(binary_string, i, 1) AS REAL));
+    END LOOP;
+    RETURN real_array;
+END;
+$$ LANGUAGE plpgsql IMMUTABLE;
+CREATE TABLE test_table (id INTEGER);
+INSERT INTO test_table VALUES (0), (1), (7);
+\set enable_seqscan = off;
+CREATE INDEX ON test_table USING hnsw (int_to_fixed_binary_real_array(id)) WITH (M=2, dim=3);
+INFO:  done init usearch index
+INFO:  inserted 3 elements
+INFO:  done saving 3 vectors
+SELECT id FROM test_table ORDER BY int_to_fixed_binary_real_array(id) <-> int_to_fixed_binary_real_array(0) LIMIT 2;
+ERROR:  Operator <-> can only be used inside of an index
diff --git a/test/expected/hnsw_dist_func.out b/test/expected/hnsw_dist_func.out
index 72159359a..8f91465dd 100644
--- a/test/expected/hnsw_dist_func.out
+++ b/test/expected/hnsw_dist_func.out
@@ -141,11 +141,11 @@ SELECT hamming_dist('{1,1}', '{0,1,0}');
 ERROR:  expected equally sized arrays but got arrays with dimensions 2 and 3
 -- Expect errors due to improper use of the <-> operator outside of its supported context
 SELECT ARRAY[1,2,3] <-> ARRAY[3,2,1];
-ERROR:  Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
+ERROR:  Operator <-> is invalid outside of ORDER BY context
 SELECT ROUND((v <-> ARRAY[0,1,0])::numeric, 2) FROM small_world_cos ORDER BY v <-> '{0,1,0}' LIMIT 7;
-ERROR:  Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
+ERROR:  Operator <-> is invalid outside of ORDER BY context
 SELECT ROUND((v <-> ARRAY[0,1,0])::numeric, 2) FROM small_world_ham ORDER BY v <-> '{0,1,0}' LIMIT 7;
-ERROR:  Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
+ERROR:  Operator <-> is invalid outside of ORDER BY context
 -- More robust distance operator tests
 CREATE TABLE test1 (id SERIAL, v REAL[]);
 CREATE TABLE test2 (id SERIAL, v REAL[]);
@@ -166,57 +166,57 @@ SELECT 1 FROM test1 WHERE id = 0 + 1;
 
 -- Expect errors due to incorrect usage
 INSERT INTO test1 (v) VALUES (ARRAY['{1,2}'::REAL[] <-> '{4,2}'::REAL[], 0]);
-ERROR:  Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
+ERROR:  Operator <-> is invalid outside of ORDER BY context
 SELECT v <-> '{1,2}' FROM test1 ORDER BY v <-> '{1,3}';
-ERROR:  Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
+ERROR:  Operator <-> is invalid outside of ORDER BY context
 SELECT v <-> '{1,2}' FROM test1;
-ERROR:  Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
+ERROR:  Operator <-> is invalid outside of ORDER BY context
 WITH temp AS (SELECT v <-> '{1,2}' FROM test1) SELECT 1 FROM temp;
-ERROR:  Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
+ERROR:  Operator <-> is invalid outside of ORDER BY context
 SELECT t.res FROM (SELECT v <-> '{1,2}' AS res FROM test1) t;
-ERROR:  Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
+ERROR:  Operator <-> is invalid outside of ORDER BY context
 SELECT (SELECT v <-> '{1,2}' FROM test1 LIMIT 1) FROM test1;
-ERROR:  Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
+ERROR:  Operator <-> is invalid outside of ORDER BY context
 SELECT COALESCE(v <-> '{1,2}', 0) FROM test1;
-ERROR:  Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
+ERROR:  Operator <-> is invalid outside of ORDER BY context
 SELECT EXISTS (SELECT v <-> '{1,2}' FROM test1);
-ERROR:  Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
+ERROR:  Operator <-> is invalid outside of ORDER BY context
 SELECT test1.v <-> test2.v FROM test1 JOIN test2 USING (id);
-ERROR:  Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
+ERROR:  Operator <-> is invalid outside of ORDER BY context
 SELECT v <-> '{1,2}' FROM test1 UNION SELECT v <-> '{1,3}' FROM test1;
-ERROR:  Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
+ERROR:  Operator <-> is invalid outside of ORDER BY context
 (SELECT v <-> '{1,2}' FROM test1 WHERE id < 5) UNION (SELECT v <-> '{1,3}' FROM test1 WHERE id >= 5);
-ERROR:  Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
+ERROR:  Operator <-> is invalid outside of ORDER BY context
 SELECT MAX(v <-> '{1,2}') FROM test1;
-ERROR:  Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
+ERROR:  Operator <-> is invalid outside of ORDER BY context
 SELECT * FROM test1 JOIN test2 ON test1.v <-> test2.v < 0.5;
-ERROR:  Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
+ERROR:  Operator <-> is invalid outside of ORDER BY context
 SELECT test1.v FROM test1 JOIN test2 ON test1.v <-> '{1,2}' = test2.v <-> '{1,3}';
-ERROR:  Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
+ERROR:  Operator <-> is invalid outside of ORDER BY context
 SELECT (v <-> '{1,2}') + (v <-> '{1,3}') FROM test1;
-ERROR:  Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
+ERROR:  Operator <-> is invalid outside of ORDER BY context
 SELECT CASE WHEN v <-> '{1,2}' > 1 THEN 'High' ELSE 'Low' END FROM test1;
-ERROR:  Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
+ERROR:  Operator <-> is invalid outside of ORDER BY context
 INSERT INTO test1 (v) VALUES ('{2,3}') RETURNING v <-> '{1,2}';
-ERROR:  Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
+ERROR:  Operator <-> is invalid outside of ORDER BY context
 SELECT 1 FROM test1 GROUP BY v <-> '{1,3}';
-ERROR:  Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
+ERROR:  Operator <-> is invalid outside of ORDER BY context
 SELECT 1 FROM test1 ORDER BY (('{1,2}'::real[] <-> '{3,4}'::real[]) - 0);
-ERROR:  Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
+ERROR:  Operator <-> is invalid outside of ORDER BY context
 SELECT 1 FROM test1 ORDER BY '{1,2}'::REAL[] <-> '{3,4}'::REAL[];
-ERROR:  Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
+ERROR:  Operator <-> is invalid outside of ORDER BY context
 SELECT 1 FROM test1 ORDER BY v <-> ARRAY[(SELECT '{1,4}'::REAL[] <-> '{4,2}'::REAL[]), 3];
-ERROR:  Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
+ERROR:  Operator <-> is invalid outside of ORDER BY context
 -- Expect errors due to index not existing
 SELECT id FROM test1 ORDER BY v <-> '{1,2}';
-ERROR:  Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
+ERROR:  Operator <-> can only be used inside of an index
 SELECT 1 FROM test1 ORDER BY v <-> (SELECT '{1,3}'::real[]);
-ERROR:  Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
+ERROR:  Operator <-> can only be used inside of an index
 SELECT t2_results.id FROM test1 t1 JOIN LATERAL (SELECT t2.id FROM test2 t2 ORDER BY t1.v <-> t2.v LIMIT 1) t2_results ON TRUE;
-ERROR:  Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
+ERROR:  Operator <-> can only be used inside of an index
 WITH t AS (SELECT id FROM test1 ORDER BY v <-> '{1,2}' LIMIT 1) SELECT DISTINCT id FROM t;
-ERROR:  Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
+ERROR:  Operator <-> can only be used inside of an index
 WITH t AS (SELECT id FROM test1 ORDER BY v <-> '{1,2}' LIMIT 1) SELECT id, COUNT(*) FROM t GROUP BY 1;
-ERROR:  Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
+ERROR:  Operator <-> can only be used inside of an index
 WITH t AS (SELECT id FROM test1 ORDER BY v <-> '{1,2}') SELECT id FROM t UNION SELECT id FROM t;
-ERROR:  Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
+ERROR:  Operator <-> can only be used inside of an index
diff --git a/test/expected/hnsw_todo.out b/test/expected/hnsw_todo.out
index 5d3cbac78..9b50c6920 100644
--- a/test/expected/hnsw_todo.out
+++ b/test/expected/hnsw_todo.out
@@ -28,7 +28,7 @@ EXPLAIN (COSTS FALSE)
 SELECT id, ROUND(l2sq_dist(vector_int, array[0,1,0])::numeric, 2) as dist
 FROM small_world_l2
 ORDER BY vector_int <-> array[0,1,0] LIMIT 7;
-ERROR:  Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
+ERROR:  Operator <-> can only be used inside of an index
 -- this result is not sorted correctly
 CREATE TABLE small_world_ham (
     id SERIAL PRIMARY KEY,
diff --git a/test/expected/hnsw_vector.out b/test/expected/hnsw_vector.out
index 814da5500..5cb787f4d 100644
--- a/test/expected/hnsw_vector.out
+++ b/test/expected/hnsw_vector.out
@@ -167,7 +167,7 @@ RESET client_min_messages;
 \set ON_ERROR_STOP off
 -- Expect error due to improper use of the <-> operator outside of its supported context
 SELECT ARRAY[1,2,3] <-> ARRAY[3,2,1];
-ERROR:  Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
+ERROR:  Operator <-> is invalid outside of ORDER BY context
 -- Expect error due to mismatching vector dimensions
 SELECT 1 FROM small_world ORDER BY v <-> '[0,1,0,1]' LIMIT 1;
 ERROR:  Expected vector with dimension 3, got 4
diff --git a/test/schedule.txt b/test/schedule.txt
index fcc4b30e3..5969a626a 100644
--- a/test/schedule.txt
+++ b/test/schedule.txt
@@ -4,4 +4,4 @@
 # - 'test' lines may have multiple space-separated tests. All tests in a single 'test' line will be run in parallel
 
 test_pgvector: hnsw_vector
-test: hnsw_config hnsw_correct hnsw_create hnsw_dist_func hnsw_insert hnsw_select hnsw_todo hnsw_index_from_file hnsw_cost_estimate
+test: hnsw_config hnsw_correct hnsw_create hnsw_create_expr hnsw_dist_func hnsw_insert hnsw_select hnsw_todo hnsw_index_from_file hnsw_cost_estimate
diff --git a/test/sql/hnsw_create_expr.sql b/test/sql/hnsw_create_expr.sql
new file mode 100644
index 000000000..eca96c69b
--- /dev/null
+++ b/test/sql/hnsw_create_expr.sql
@@ -0,0 +1,21 @@
+CREATE OR REPLACE FUNCTION int_to_fixed_binary_real_array(n INT) RETURNS REAL[] AS $$
+DECLARE
+    binary_string TEXT;
+    real_array REAL[] := '{}';
+    i INT;
+BEGIN
+    binary_string := lpad(CAST(n::BIT(3) AS TEXT), 3, '0');
+    FOR i IN 1..length(binary_string)
+    LOOP
+        real_array := array_append(real_array, CAST(substring(binary_string, i, 1) AS REAL));
+    END LOOP;
+    RETURN real_array;
+END;
+$$ LANGUAGE plpgsql IMMUTABLE;
+
+CREATE TABLE test_table (id INTEGER);
+INSERT INTO test_table VALUES (0), (1), (7);
+\set enable_seqscan = off;
+CREATE INDEX ON test_table USING hnsw (int_to_fixed_binary_real_array(id)) WITH (M=2, dim=3);
+
+SELECT id FROM test_table ORDER BY int_to_fixed_binary_real_array(id) <-> int_to_fixed_binary_real_array(0) LIMIT 2;
\ No newline at end of file