Skip to content

Commit

Permalink
Rename generic operator to <?> (#244)
Browse files Browse the repository at this point in the history
* Rename generic operator to <?>
* Update README
* Fix update_test script
* Fix update script for dropping hamming ops from pgvector
  • Loading branch information
var77 authored Dec 11, 2023
1 parent 896ed5a commit 0a1227b
Show file tree
Hide file tree
Showing 34 changed files with 579 additions and 379 deletions.
3 changes: 2 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
cmake_minimum_required(VERSION 3.3)

set(LANTERNDB_VERSION 0.0.9)
set(LANTERNDB_VERSION 0.0.10)

project(
LanternDB
Expand Down Expand Up @@ -190,6 +190,7 @@ set (_update_files
sql/updates/0.0.6--0.0.7.sql
sql/updates/0.0.7--0.0.8.sql
sql/updates/0.0.8--0.0.9.sql
sql/updates/0.0.9--0.0.10.sql
)

add_custom_command(
Expand Down
5 changes: 2 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,9 +89,9 @@ Lantern supports several distance functions in the index and it has 2 modes for
Note that in this mode, you need to use the right operator in order to trigger an index scan.

2. `lantern.pgvector_compat=FALSE`
In this mode you only need to specify the distance function used for a column at index creation time. Lantern will automatically infer the distance function to use for search so you always use `<->` operator in search queries.
In this mode you only need to specify the distance function used for a column at index creation time. Lantern will automatically infer the distance function to use for search, so you always use the `<?>` operator in search queries.

Note that in this mode, the operator `<->` is intended exclusively for use with index lookups. If you expect to not use the index in a query, use the distance function directly (e.g. `l2sq_dist(v1, v2)`)
Note that in this mode, the operator `<?>` is intended exclusively for use with index lookups. If you do not expect a query to use the index, use the distance function directly (e.g. `l2sq_dist(v1, v2)`).

> To switch between modes set `lantern.pgvector_compat` variable to `TRUE` or `FALSE`.
Expand All @@ -102,7 +102,6 @@ There are four defined operator classes that can be employed during index creati
- **`dist_cos_ops`**: Applicable to the type `real[]`
- **`dist_vec_cos_ops`**: Applicable to the type `vector`
- **`dist_hamming_ops`**: Applicable to the type `integer[]`
- **`dist_vec_hamming_ops`**: Applicable to the type `vector`

### Index Construction Parameters

Expand Down
15 changes: 11 additions & 4 deletions scripts/test_updates.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import getpass
import git
import os
from functools import cmp_to_key


INCOMPATIBLE_VERSIONS = {
Expand Down Expand Up @@ -37,8 +38,7 @@ def update_from_tag(from_version: str, to_version: str):
res = shell(f"psql postgres -U {args.user} -c 'CREATE DATABASE {args.db};'")
res = shell(f"psql postgres -U {args.user} -c 'DROP EXTENSION IF EXISTS lantern CASCADE; CREATE EXTENSION lantern;' -d {args.db};")

# run begin of parallel tests. Run this while the from_tag version of the binary is installed and loaded
# run begin on {from_version}
# Run the "begin" phase of the parallel tests while the from_tag version of the binary is installed and loaded, i.e. run begin on {from_version}.
if from_tag != "v0.0.4":
# the source code at 0.0.4 did not yet have parallel tests
res = shell(f"cd {args.builddir} ; UPDATE_EXTENSION=1 UPDATE_FROM={from_version} UPDATE_TO={from_version} make test-parallel FILTER=begin")
Expand All @@ -59,6 +59,12 @@ def incompatible_version(pg_version, version_tag):
return False
return version_tag in INCOMPATIBLE_VERSIONS[pg_version]

def sort_versions(v1, v2):
    """Three-way comparator for dotted version strings (for ``cmp_to_key``).

    Each version is split on ``.`` and the components are compared
    numerically, left to right. This orders ``0.0.9 < 0.0.10 < 0.1.0``.
    Stripping the dots and parsing the remaining digits as one integer
    would instead map both ``0.1.0`` and ``0.0.10`` to 10 and would sort
    ``0.1.0`` before ``0.0.11``.

    Args:
        v1: Version string such as ``"0.0.9"``.
        v2: Version string such as ``"0.0.10"``.

    Returns:
        -1 if ``v1`` is older than ``v2``, 0 if they are equal, 1 if ``v1``
        is newer. When one version is a strict prefix of the other
        (``"1.0"`` vs ``"1.0.0"``), the shorter one sorts first.

    Raises:
        ValueError: If any dot-separated component is not an integer.
    """
    a = tuple(int(part) for part in v1.split('.'))
    b = tuple(int(part) for part in v2.split('.'))

    # Tuple comparison is lexicographic over the numeric components;
    # collapse it to the -1/0/1 contract expected from a cmp function.
    return (a > b) - (a < b)

if __name__ == "__main__":

default_user = getpass.getuser()
Expand Down Expand Up @@ -88,8 +94,9 @@ def incompatible_version(pg_version, version_tag):

# test updates from all tags
tag_pairs = [update_fname.split("--") for update_fname in os.listdir("sql/updates")]
from_tags = list(sorted([p[0] for p in tag_pairs], reverse=True))
to_tags = list(sorted([p[1].split(".sql")[0] for p in tag_pairs]))
from_tags = list(sorted([p[0] for p in tag_pairs], key=cmp_to_key(sort_versions)))
from_tags.reverse()
to_tags = list(sorted([p[1].split(".sql")[0] for p in tag_pairs], key=cmp_to_key(sort_versions)))
latest_version = to_tags[-1]
print("Updating from tags", from_tags, "to ", latest_version)

Expand Down
68 changes: 35 additions & 33 deletions sql/lantern.sql
Original file line number Diff line number Diff line change
Expand Up @@ -6,48 +6,41 @@ CREATE FUNCTION hnsw_handler(internal) RETURNS index_am_handler
CREATE FUNCTION ldb_generic_dist(real[], real[]) RETURNS real
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;

CREATE FUNCTION l2sq_dist(real[], real[]) RETURNS real
CREATE FUNCTION ldb_generic_dist(integer[], integer[]) RETURNS real
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;

-- this function is needed, as we should also use <-> operator
-- with integer[] type (to overwrite hamming dist function in our hooks)
-- and if we do not create l2sq_dist for integer[] type it will fail to cast in pgvector_compat mode
CREATE FUNCTION l2sq_dist(integer[], integer[]) RETURNS real

CREATE FUNCTION l2sq_dist(real[], real[]) RETURNS real
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;

CREATE FUNCTION cos_dist(real[], real[]) RETURNS real
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;

-- functions _with_guard suffix are used to forbid operator usage
-- if operator hooks are enabled (lantern.pgvector_compat=FALSE)
CREATE FUNCTION cos_dist_with_guard(real[], real[]) RETURNS real
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;

CREATE FUNCTION hamming_dist(integer[], integer[]) RETURNS integer
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;


CREATE FUNCTION hamming_dist_with_guard(integer[], integer[]) RETURNS integer
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;

-- operators
CREATE OPERATOR <-> (
LEFTARG = real[], RIGHTARG = real[], PROCEDURE = l2sq_dist,
COMMUTATOR = '<->'
CREATE OPERATOR <?> (
LEFTARG = real[], RIGHTARG = real[], PROCEDURE = ldb_generic_dist,
COMMUTATOR = '<?>'
);

CREATE OPERATOR <?> (
LEFTARG = integer[], RIGHTARG = integer[], PROCEDURE = ldb_generic_dist,
COMMUTATOR = '<?>'
);

CREATE OPERATOR <-> (
LEFTARG = integer[], RIGHTARG = integer[], PROCEDURE = l2sq_dist,
LEFTARG = real[], RIGHTARG = real[], PROCEDURE = l2sq_dist,
COMMUTATOR = '<->'
);

CREATE OPERATOR <=> (
LEFTARG = real[], RIGHTARG = real[], PROCEDURE = cos_dist_with_guard,
LEFTARG = real[], RIGHTARG = real[], PROCEDURE = cos_dist,
COMMUTATOR = '<=>'
);

CREATE OPERATOR <+> (
LEFTARG = integer[], RIGHTARG = integer[], PROCEDURE = hamming_dist_with_guard,
LEFTARG = integer[], RIGHTARG = integer[], PROCEDURE = hamming_dist,
COMMUTATOR = '<+>'
);

Expand All @@ -74,28 +67,28 @@ BEGIN
dist_l2sq_ops := '
CREATE OPERATOR CLASS dist_l2sq_ops
DEFAULT FOR TYPE real[] USING ' || access_method_name || ' AS
OPERATOR 1 <-> (real[], real[]) FOR ORDER BY float_ops,
FUNCTION 1 l2sq_dist(real[], real[]);
OPERATOR 1 <?> (real[], real[]) FOR ORDER BY float_ops,
FUNCTION 1 l2sq_dist(real[], real[]),
OPERATOR 2 <-> (real[], real[]) FOR ORDER BY float_ops,
FUNCTION 2 l2sq_dist(real[], real[]);
';

dist_cos_ops := '
CREATE OPERATOR CLASS dist_cos_ops
FOR TYPE real[] USING ' || access_method_name || ' AS
OPERATOR 1 <-> (real[], real[]) FOR ORDER BY float_ops,
OPERATOR 1 <?> (real[], real[]) FOR ORDER BY float_ops,
FUNCTION 1 cos_dist(real[], real[]),
-- it is important to set the function with guard the second
-- as op rewriting hook takes the first function to use
OPERATOR 2 <=> (real[], real[]) FOR ORDER BY float_ops,
FUNCTION 2 cos_dist_with_guard(real[], real[]);
FUNCTION 2 cos_dist(real[], real[]);
';

dist_hamming_ops := '
CREATE OPERATOR CLASS dist_hamming_ops
FOR TYPE integer[] USING ' || access_method_name || ' AS
OPERATOR 1 <-> (integer[], integer[]) FOR ORDER BY float_ops,
OPERATOR 1 <?> (integer[], integer[]) FOR ORDER BY float_ops,
FUNCTION 1 hamming_dist(integer[], integer[]),
OPERATOR 2 <+> (integer[], integer[]) FOR ORDER BY integer_ops,
FUNCTION 2 hamming_dist_with_guard(integer[], integer[]);
FUNCTION 2 hamming_dist(integer[], integer[]);
';

-- Execute the dynamic SQL statement.
Expand Down Expand Up @@ -142,16 +135,25 @@ BEGIN
CREATE FUNCTION cos_dist(vector, vector) RETURNS float8
AS 'MODULE_PATHNAME', 'vector_cos_dist' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;

-- pgvector's vector type requires floats and we cannot define hamming distance for floats
CREATE FUNCTION ldb_generic_dist(vector, vector) RETURNS real
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;

CREATE OPERATOR <?> (
LEFTARG = vector, RIGHTARG = vector, PROCEDURE = ldb_generic_dist,
COMMUTATOR = '<?>'
);

-- pgvector's vector type requires floats and we cannot define hamming distance for floats
CREATE OPERATOR CLASS dist_vec_l2sq_ops
DEFAULT FOR TYPE vector USING lantern_hnsw AS
OPERATOR 1 <-> (vector, vector) FOR ORDER BY float_ops,
FUNCTION 1 l2sq_dist(vector, vector);
OPERATOR 1 <?> (vector, vector) FOR ORDER BY float_ops,
FUNCTION 1 l2sq_dist(vector, vector),
OPERATOR 2 <-> (vector, vector) FOR ORDER BY float_ops,
FUNCTION 2 l2sq_dist(vector, vector);

CREATE OPERATOR CLASS dist_vec_cos_ops
FOR TYPE vector USING lantern_hnsw AS
OPERATOR 1 <-> (vector, vector) FOR ORDER BY float_ops,
OPERATOR 1 <?> (vector, vector) FOR ORDER BY float_ops,
FUNCTION 1 cos_dist(vector, vector),
OPERATOR 2 <=> (vector, vector) FOR ORDER BY float_ops,
FUNCTION 2 cos_dist(vector, vector);
Expand Down
180 changes: 176 additions & 4 deletions sql/updates/0.0.9--0.0.10.sql
Original file line number Diff line number Diff line change
@@ -1,5 +1,177 @@
-- these go for good.
--
-- Update block for 0.0.9 -> 0.0.10.
-- Steps, in order:
--   1. (Re)define the helper that drops and recreates the real[]/integer[] operator classes.
--   2. Detect pgvector's vector type and pick the access method name accordingly.
--   3. Remember the definitions of all indexes built with that access method.
--   4. Drop and recreate the distance operators, add the generic <?> operator
--      (backed by ldb_generic_dist) and drop the *_with_guard functions.
--   5. Recreate the operator classes and re-run the remembered index definitions.
-- Operator classes and operators are dropped with CASCADE; presumably this also drops
-- the dependent indexes, which is why their definitions are captured first.
DO $BODY$
DECLARE
pgvector_exists boolean;
am_name TEXT;
r pg_indexes%ROWTYPE;
indexes_cursor REFCURSOR;
-- parallel arrays: index_names[i] is the name of the index defined by index_definitions[i]
index_names TEXT[] := '{}';
index_definitions TEXT[] := '{}';
BEGIN
-- Function to recreate operator classes for specified access method
-- For each of dist_l2sq_ops, dist_cos_ops and dist_hamming_ops it executes
-- DROP OPERATOR CLASS IF EXISTS ... CASCADE followed by CREATE OPERATOR CLASS.
-- Always returns TRUE.
CREATE OR REPLACE FUNCTION _lantern_internal._recreate_ldb_operator_classes(access_method_name TEXT) RETURNS BOOLEAN AS $$
DECLARE
dist_l2sq_ops TEXT;
dist_l2sq_ops_drop TEXT;
dist_cos_ops TEXT;
dist_cos_ops_drop TEXT;
dist_hamming_ops TEXT;
dist_hamming_ops_drop TEXT;
BEGIN

-- Construct the SQL statement to create the operator classes dynamically.
-- NOTE(review): access_method_name is concatenated into the statements unquoted.
dist_l2sq_ops_drop := 'DROP OPERATOR CLASS IF EXISTS dist_l2sq_ops USING ' || access_method_name || ' CASCADE;';
dist_l2sq_ops := '
CREATE OPERATOR CLASS dist_l2sq_ops
DEFAULT FOR TYPE real[] USING ' || access_method_name || ' AS
OPERATOR 1 <?> (real[], real[]) FOR ORDER BY float_ops,
FUNCTION 1 l2sq_dist(real[], real[]),
OPERATOR 2 <-> (real[], real[]) FOR ORDER BY float_ops,
FUNCTION 2 l2sq_dist(real[], real[]);
';

dist_cos_ops_drop := 'DROP OPERATOR CLASS IF EXISTS dist_cos_ops USING ' || access_method_name || ' CASCADE;';
dist_cos_ops := '
CREATE OPERATOR CLASS dist_cos_ops
FOR TYPE real[] USING ' || access_method_name || ' AS
OPERATOR 1 <?> (real[], real[]) FOR ORDER BY float_ops,
FUNCTION 1 cos_dist(real[], real[]),
OPERATOR 2 <=> (real[], real[]) FOR ORDER BY float_ops,
FUNCTION 2 cos_dist(real[], real[]);
';


dist_hamming_ops_drop := 'DROP OPERATOR CLASS IF EXISTS dist_hamming_ops USING ' || access_method_name || ' CASCADE;';
dist_hamming_ops := '
CREATE OPERATOR CLASS dist_hamming_ops
FOR TYPE integer[] USING ' || access_method_name || ' AS
OPERATOR 1 <?> (integer[], integer[]) FOR ORDER BY float_ops,
FUNCTION 1 hamming_dist(integer[], integer[]),
OPERATOR 2 <+> (integer[], integer[]) FOR ORDER BY integer_ops,
FUNCTION 2 hamming_dist(integer[], integer[]);
';


-- Execute the dynamic SQL statement.
-- Each operator class is dropped immediately before it is recreated.
EXECUTE dist_l2sq_ops_drop;
EXECUTE dist_l2sq_ops;
EXECUTE dist_cos_ops_drop;
EXECUTE dist_cos_ops;
EXECUTE dist_hamming_ops_drop;
EXECUTE dist_hamming_ops;

RETURN TRUE;
END;
$$ LANGUAGE plpgsql VOLATILE;

-- Check if the vector type from pgvector exists
-- NOTE(review): matches any pg_type row named 'vector'; there is no schema or extension filter.
SELECT EXISTS (
SELECT 1
FROM pg_type
WHERE typname = 'vector'
) INTO pgvector_exists;

-- default access method name; replaced by lantern_hnsw below when the vector type is present
am_name := 'hnsw';


IF pgvector_exists THEN
am_name := 'lantern_hnsw';
-- these go for good.
DROP OPERATOR CLASS IF EXISTS dist_vec_hamming_ops USING lantern_hnsw CASCADE;
DROP FUNCTION IF EXISTS hamming_dist(vector, vector);
-- NOTE(review): unlike the two statements above, this DROP has no IF EXISTS,
-- so it raises an error if the operator is not present.
DROP OPERATOR <+> (vector, vector) CASCADE;
END IF;


-- keep existing indexes to reindex as we should drop indexes in order to change operator classes
-- Indexes are selected by a case-insensitive text match of 'USING <am_name>' against pg_indexes.indexdef.
OPEN indexes_cursor FOR SELECT * FROM pg_indexes WHERE indexdef ILIKE '%USING ' || am_name || '%';
-- Fetch index names into the array
LOOP
FETCH indexes_cursor INTO r;
EXIT WHEN NOT FOUND;

-- Append index name to the array
index_names := array_append(index_names, r.indexname);
index_definitions := array_append(index_definitions, r.indexdef);
END LOOP;

CLOSE indexes_cursor;

IF pgvector_exists THEN
-- generic distance function and <?> operator for pgvector's vector type
CREATE FUNCTION ldb_generic_dist(vector, vector) RETURNS real
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;


CREATE OPERATOR <?> (
LEFTARG = vector, RIGHTARG = vector, PROCEDURE = ldb_generic_dist,
COMMUTATOR = '<?>'
);

-- pgvector's vector type requires floats and we cannot define hamming distance for floats
-- NOTE(review): these two CREATE OPERATOR CLASS statements have no preceding DROP in this block;
-- confirm the classes cannot already exist at this point.
CREATE OPERATOR CLASS dist_vec_l2sq_ops
DEFAULT FOR TYPE vector USING lantern_hnsw AS
OPERATOR 1 <?> (vector, vector) FOR ORDER BY float_ops,
FUNCTION 1 l2sq_dist(vector, vector),
OPERATOR 2 <-> (vector, vector) FOR ORDER BY float_ops,
FUNCTION 2 l2sq_dist(vector, vector);

CREATE OPERATOR CLASS dist_vec_cos_ops
FOR TYPE vector USING lantern_hnsw AS
OPERATOR 1 <?> (vector, vector) FOR ORDER BY float_ops,
FUNCTION 1 cos_dist(vector, vector),
OPERATOR 2 <=> (vector, vector) FOR ORDER BY float_ops,
FUNCTION 2 cos_dist(vector, vector);

-- same value as assigned in the first pgvector_exists branch above
am_name := 'lantern_hnsw';
END IF;

-- operators
-- Drop the existing array operators; the real[] <->, <=> and integer[] <+> operators are recreated below
-- on top of l2sq_dist, cos_dist and hamming_dist. The integer[] <-> operator is not recreated.
DROP OPERATOR <->(integer[], integer[]) CASCADE;
DROP OPERATOR <->(real[], real[]) CASCADE;
DROP OPERATOR <=>(real[], real[]) CASCADE;
DROP OPERATOR <+>(integer[], integer[]) CASCADE;

DROP FUNCTION IF EXISTS cos_dist_with_guard CASCADE;
DROP FUNCTION IF EXISTS hamming_dist_with_guard CASCADE;

CREATE OR REPLACE FUNCTION ldb_generic_dist(integer[], integer[]) RETURNS real
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;

-- generic <?> operator for real[] and integer[], both backed by ldb_generic_dist
CREATE OPERATOR <?> (
LEFTARG = real[], RIGHTARG = real[], PROCEDURE = ldb_generic_dist,
COMMUTATOR = '<?>'
);

CREATE OPERATOR <?> (
LEFTARG = integer[], RIGHTARG = integer[], PROCEDURE = ldb_generic_dist,
COMMUTATOR = '<?>'
);

CREATE OPERATOR <-> (
LEFTARG = real[], RIGHTARG = real[], PROCEDURE = l2sq_dist,
COMMUTATOR = '<->'
);

CREATE OPERATOR <=> (
LEFTARG = real[], RIGHTARG = real[], PROCEDURE = cos_dist,
COMMUTATOR = '<=>'
);

CREATE OPERATOR <+> (
LEFTARG = integer[], RIGHTARG = integer[], PROCEDURE = hamming_dist,
COMMUTATOR = '<+>'
);

-- recreate dist_l2sq_ops, dist_cos_ops and dist_hamming_ops for the selected access method
PERFORM _lantern_internal._recreate_ldb_operator_classes(am_name);

-- NOTE(review): this setting is not reset anywhere in this block.
SET client_min_messages TO NOTICE;
-- reindex indexes
-- Each saved pg_indexes.indexdef is executed as a statement; coalesce() makes the loop a no-op when nothing was saved.
FOR i IN 1..coalesce(array_length(index_names, 1), 0) LOOP
RAISE NOTICE 'Reindexing index %', index_names[i];
EXECUTE index_definitions[i];
RAISE NOTICE 'Reindexed index: %', index_names[i];
END LOOP;
END;
$BODY$
LANGUAGE plpgsql;

DROP OPERATOR CLASS IF EXISTS dist_vec_hamming_ops USING hnsw CASCADE;
DROP FUNCTION IF EXISTS cos_dist(vector, vector);
DROP OPERATOR <+>(vector, vector) CASCADE
Loading

0 comments on commit 0a1227b

Please sign in to comment.