Skip to content

Commit

Permalink
Support expression-based index operator checks (#179)
Browse files Browse the repository at this point in the history
Add create expression test and make error messages distinct, support functions in post parse
  • Loading branch information
dqii authored Sep 26, 2023
1 parent 7e6223e commit e87563a
Show file tree
Hide file tree
Showing 9 changed files with 109 additions and 41 deletions.
5 changes: 3 additions & 2 deletions Dockerfile.dev
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,9 @@ RUN rm -rf build \
&& make install

# Install benchmarking tools in build folder
RUN git clone https://github.com/lanterndata/benchmark \
&& cd benchmark \
RUN mkdir build/lantern \
&& git clone https://github.com/lanterndata/benchmark build/benchmark \
&& cd build/benchmark \
&& pip install -r core/requirements.txt --break-system-packages \
&& pip install -r external/requirements.txt --break-system-packages
ENV DATABASE_URL=postgres://postgres:postgres@localhost:5432/postgres
Expand Down
2 changes: 1 addition & 1 deletion src/hooks/executor_start.c
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ static void validate_operator_usage(Plan *plan, List *oidList)
context.oidList = oidList;
context.isIndexScan = false;
if(operator_used_incorrectly_walker((Node *)plan, (void *)&context)) {
elog(ERROR, "Operator <-> has no standalone meaning and is reserved for use in vector index lookups only");
elog(ERROR, "Operator <-> can only be used inside of an index");
}
}

Expand Down
33 changes: 28 additions & 5 deletions src/hooks/post_parse.c
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,22 @@ typedef struct
bool usedCorrectly;
} OperatorUsedCorrectlyContext;

/*
 * Report whether an expression node references a column, either directly or
 * through function calls.
 *
 * Returns true when `node` is a Var, or when it is a FuncExpr with at least
 * one argument for which this function (applied recursively) returns true.
 * Every other node type yields false, so a function call whose arguments are
 * all constants is rejected.
 *
 * Note that a single qualifying argument is enough: the remaining arguments
 * of the FuncExpr are not inspected once one match is found.
 *
 * `node` must not be NULL; IsA() inspects the node's tag.
 */
static bool is_var_or_func_of_vars(Node *node)
{
    /* A bare column reference qualifies immediately. */
    if(IsA(node, Var)) return true;

    /* Anything that is not a function call cannot qualify. */
    if(!IsA(node, FuncExpr)) return false;

    FuncExpr *func_expr = (FuncExpr *)node;
    ListCell *arg_cell;

    /* Search the call's arguments for one that leads to a column reference. */
    foreach(arg_cell, func_expr->args) {
        if(is_var_or_func_of_vars(lfirst(arg_cell))) return true;
    }

    return false;
}

static bool operator_used_incorrectly_walker(Node *node, OperatorUsedCorrectlyContext *context)
{
if(node == NULL) return false;
Expand All @@ -90,12 +106,19 @@ static bool operator_used_incorrectly_walker(Node *node, OperatorUsedCorrectlyCo
bool isVar2 = IsA(arg2, Var);
if(isVar1 && isVar2) {
return false;
} else if(!isVar1 && !isVar2) {
return true;
} else if(isVar1) {
} else if(isVar1 && !isVar2) {
return operator_used_incorrectly_walker(arg2, context);
} else {
} else if(!isVar1 && isVar2) {
return operator_used_incorrectly_walker(arg1, context);
} else {
bool isFuncOfVars1 = is_var_or_func_of_vars(arg1);
bool isFuncOfVars2 = is_var_or_func_of_vars(arg2);
if(!isFuncOfVars1 && !isFuncOfVars2) {
return true;
} else {
return operator_used_incorrectly_walker(arg1, context)
|| operator_used_incorrectly_walker(arg2, context);
}
}
}
}
Expand Down Expand Up @@ -140,7 +163,7 @@ void post_parse_analyze_hook_with_operator_check(ParseState *pstate,
if(is_operator_used(query_as_node, oidList)) {
List *sort_group_refs = get_sort_group_refs(query_as_node);
if(is_operator_used_incorrectly(query_as_node, oidList, sort_group_refs)) {
elog(ERROR, "Operator <-> has no standalone meaning and is reserved for use in vector index lookups only");
elog(ERROR, "Operator <-> is invalid outside of ORDER BY context");
}
list_free(sort_group_refs);
}
Expand Down
23 changes: 23 additions & 0 deletions test/expected/hnsw_create_expr.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
CREATE OR REPLACE FUNCTION int_to_fixed_binary_real_array(n INT) RETURNS REAL[] AS $$
DECLARE
binary_string TEXT;
real_array REAL[] := '{}';
i INT;
BEGIN
binary_string := lpad(CAST(n::BIT(3) AS TEXT), 3, '0');
FOR i IN 1..length(binary_string)
LOOP
real_array := array_append(real_array, CAST(substring(binary_string, i, 1) AS REAL));
END LOOP;
RETURN real_array;
END;
$$ LANGUAGE plpgsql IMMUTABLE;
CREATE TABLE test_table (id INTEGER);
INSERT INTO test_table VALUES (0), (1), (7);
\set enable_seqscan = off;
CREATE INDEX ON test_table USING hnsw (int_to_fixed_binary_real_array(id)) WITH (M=2, dim=3);
INFO: done init usearch index
INFO: inserted 3 elements
INFO: done saving 3 vectors
SELECT id FROM test_table ORDER BY int_to_fixed_binary_real_array(id) <-> int_to_fixed_binary_real_array(0) LIMIT 2;
ERROR: Operator <-> can only be used inside of an index
60 changes: 30 additions & 30 deletions test/expected/hnsw_dist_func.out
Original file line number Diff line number Diff line change
Expand Up @@ -141,11 +141,11 @@ SELECT hamming_dist('{1,1}', '{0,1,0}');
ERROR: expected equally sized arrays but got arrays with dimensions 2 and 3
-- Expect errors due to improper use of the <-> operator outside of its supported context
SELECT ARRAY[1,2,3] <-> ARRAY[3,2,1];
ERROR: Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
ERROR: Operator <-> is invalid outside of ORDER BY context
SELECT ROUND((v <-> ARRAY[0,1,0])::numeric, 2) FROM small_world_cos ORDER BY v <-> '{0,1,0}' LIMIT 7;
ERROR: Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
ERROR: Operator <-> is invalid outside of ORDER BY context
SELECT ROUND((v <-> ARRAY[0,1,0])::numeric, 2) FROM small_world_ham ORDER BY v <-> '{0,1,0}' LIMIT 7;
ERROR: Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
ERROR: Operator <-> is invalid outside of ORDER BY context
-- More robust distance operator tests
CREATE TABLE test1 (id SERIAL, v REAL[]);
CREATE TABLE test2 (id SERIAL, v REAL[]);
Expand All @@ -166,57 +166,57 @@ SELECT 1 FROM test1 WHERE id = 0 + 1;

-- Expect errors due to incorrect usage
INSERT INTO test1 (v) VALUES (ARRAY['{1,2}'::REAL[] <-> '{4,2}'::REAL[], 0]);
ERROR: Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
ERROR: Operator <-> is invalid outside of ORDER BY context
SELECT v <-> '{1,2}' FROM test1 ORDER BY v <-> '{1,3}';
ERROR: Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
ERROR: Operator <-> is invalid outside of ORDER BY context
SELECT v <-> '{1,2}' FROM test1;
ERROR: Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
ERROR: Operator <-> is invalid outside of ORDER BY context
WITH temp AS (SELECT v <-> '{1,2}' FROM test1) SELECT 1 FROM temp;
ERROR: Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
ERROR: Operator <-> is invalid outside of ORDER BY context
SELECT t.res FROM (SELECT v <-> '{1,2}' AS res FROM test1) t;
ERROR: Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
ERROR: Operator <-> is invalid outside of ORDER BY context
SELECT (SELECT v <-> '{1,2}' FROM test1 LIMIT 1) FROM test1;
ERROR: Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
ERROR: Operator <-> is invalid outside of ORDER BY context
SELECT COALESCE(v <-> '{1,2}', 0) FROM test1;
ERROR: Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
ERROR: Operator <-> is invalid outside of ORDER BY context
SELECT EXISTS (SELECT v <-> '{1,2}' FROM test1);
ERROR: Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
ERROR: Operator <-> is invalid outside of ORDER BY context
SELECT test1.v <-> test2.v FROM test1 JOIN test2 USING (id);
ERROR: Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
ERROR: Operator <-> is invalid outside of ORDER BY context
SELECT v <-> '{1,2}' FROM test1 UNION SELECT v <-> '{1,3}' FROM test1;
ERROR: Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
ERROR: Operator <-> is invalid outside of ORDER BY context
(SELECT v <-> '{1,2}' FROM test1 WHERE id < 5) UNION (SELECT v <-> '{1,3}' FROM test1 WHERE id >= 5);
ERROR: Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
ERROR: Operator <-> is invalid outside of ORDER BY context
SELECT MAX(v <-> '{1,2}') FROM test1;
ERROR: Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
ERROR: Operator <-> is invalid outside of ORDER BY context
SELECT * FROM test1 JOIN test2 ON test1.v <-> test2.v < 0.5;
ERROR: Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
ERROR: Operator <-> is invalid outside of ORDER BY context
SELECT test1.v FROM test1 JOIN test2 ON test1.v <-> '{1,2}' = test2.v <-> '{1,3}';
ERROR: Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
ERROR: Operator <-> is invalid outside of ORDER BY context
SELECT (v <-> '{1,2}') + (v <-> '{1,3}') FROM test1;
ERROR: Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
ERROR: Operator <-> is invalid outside of ORDER BY context
SELECT CASE WHEN v <-> '{1,2}' > 1 THEN 'High' ELSE 'Low' END FROM test1;
ERROR: Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
ERROR: Operator <-> is invalid outside of ORDER BY context
INSERT INTO test1 (v) VALUES ('{2,3}') RETURNING v <-> '{1,2}';
ERROR: Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
ERROR: Operator <-> is invalid outside of ORDER BY context
SELECT 1 FROM test1 GROUP BY v <-> '{1,3}';
ERROR: Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
ERROR: Operator <-> is invalid outside of ORDER BY context
SELECT 1 FROM test1 ORDER BY (('{1,2}'::real[] <-> '{3,4}'::real[]) - 0);
ERROR: Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
ERROR: Operator <-> is invalid outside of ORDER BY context
SELECT 1 FROM test1 ORDER BY '{1,2}'::REAL[] <-> '{3,4}'::REAL[];
ERROR: Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
ERROR: Operator <-> is invalid outside of ORDER BY context
SELECT 1 FROM test1 ORDER BY v <-> ARRAY[(SELECT '{1,4}'::REAL[] <-> '{4,2}'::REAL[]), 3];
ERROR: Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
ERROR: Operator <-> is invalid outside of ORDER BY context
-- Expect errors due to index not existing
SELECT id FROM test1 ORDER BY v <-> '{1,2}';
ERROR: Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
ERROR: Operator <-> can only be used inside of an index
SELECT 1 FROM test1 ORDER BY v <-> (SELECT '{1,3}'::real[]);
ERROR: Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
ERROR: Operator <-> can only be used inside of an index
SELECT t2_results.id FROM test1 t1 JOIN LATERAL (SELECT t2.id FROM test2 t2 ORDER BY t1.v <-> t2.v LIMIT 1) t2_results ON TRUE;
ERROR: Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
ERROR: Operator <-> can only be used inside of an index
WITH t AS (SELECT id FROM test1 ORDER BY v <-> '{1,2}' LIMIT 1) SELECT DISTINCT id FROM t;
ERROR: Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
ERROR: Operator <-> can only be used inside of an index
WITH t AS (SELECT id FROM test1 ORDER BY v <-> '{1,2}' LIMIT 1) SELECT id, COUNT(*) FROM t GROUP BY 1;
ERROR: Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
ERROR: Operator <-> can only be used inside of an index
WITH t AS (SELECT id FROM test1 ORDER BY v <-> '{1,2}') SELECT id FROM t UNION SELECT id FROM t;
ERROR: Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
ERROR: Operator <-> can only be used inside of an index
2 changes: 1 addition & 1 deletion test/expected/hnsw_todo.out
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ EXPLAIN (COSTS FALSE)
SELECT id, ROUND(l2sq_dist(vector_int, array[0,1,0])::numeric, 2) as dist
FROM small_world_l2
ORDER BY vector_int <-> array[0,1,0] LIMIT 7;
ERROR: Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
ERROR: Operator <-> can only be used inside of an index
-- this result is not sorted correctly
CREATE TABLE small_world_ham (
id SERIAL PRIMARY KEY,
Expand Down
2 changes: 1 addition & 1 deletion test/expected/hnsw_vector.out
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ RESET client_min_messages;
\set ON_ERROR_STOP off
-- Expect error due to improper use of the <-> operator outside of its supported context
SELECT ARRAY[1,2,3] <-> ARRAY[3,2,1];
ERROR: Operator <-> has no standalone meaning and is reserved for use in vector index lookups only
ERROR: Operator <-> is invalid outside of ORDER BY context
-- Expect error due to mismatching vector dimensions
SELECT 1 FROM small_world ORDER BY v <-> '[0,1,0,1]' LIMIT 1;
ERROR: Expected vector with dimension 3, got 4
Expand Down
2 changes: 1 addition & 1 deletion test/schedule.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@
# - 'test' lines may have multiple space-separated tests. All tests in a single 'test' line will be run in parallel

test_pgvector: hnsw_vector
test: hnsw_config hnsw_correct hnsw_create hnsw_dist_func hnsw_insert hnsw_select hnsw_todo hnsw_index_from_file hnsw_cost_estimate
test: hnsw_config hnsw_correct hnsw_create hnsw_create_expr hnsw_dist_func hnsw_insert hnsw_select hnsw_todo hnsw_index_from_file hnsw_cost_estimate
21 changes: 21 additions & 0 deletions test/sql/hnsw_create_expr.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
CREATE OR REPLACE FUNCTION int_to_fixed_binary_real_array(n INT) RETURNS REAL[] AS $$
DECLARE
binary_string TEXT;
real_array REAL[] := '{}';
i INT;
BEGIN
binary_string := lpad(CAST(n::BIT(3) AS TEXT), 3, '0');
FOR i IN 1..length(binary_string)
LOOP
real_array := array_append(real_array, CAST(substring(binary_string, i, 1) AS REAL));
END LOOP;
RETURN real_array;
END;
$$ LANGUAGE plpgsql IMMUTABLE;

CREATE TABLE test_table (id INTEGER);
INSERT INTO test_table VALUES (0), (1), (7);
\set enable_seqscan = off;
CREATE INDEX ON test_table USING hnsw (int_to_fixed_binary_real_array(id)) WITH (M=2, dim=3);

SELECT id FROM test_table ORDER BY int_to_fixed_binary_real_array(id) <-> int_to_fixed_binary_real_array(0) LIMIT 2;

0 comments on commit e87563a

Please sign in to comment.