Skip to content

Commit

Permalink
on unspecified dim index build on empty table, set index dimension to…
Browse files Browse the repository at this point in the history
… 0 and then later update it after inferring dimension from first insert
  • Loading branch information
therealdarkknight committed Dec 19, 2023
1 parent c129d8a commit af15c88
Show file tree
Hide file tree
Showing 6 changed files with 109 additions and 13 deletions.
17 changes: 17 additions & 0 deletions src/hnsw.c
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,23 @@ HnswColumnType GetIndexColumnType(Relation index)
return GetColumnTypeFromOid(attr->atttypid);
}

/*
 * Returns the number of elements of the vector stored in `datum`.
 *
 * `type` selects how `datum` is interpreted:
 *   - VECTOR:                 the datum is read as a Vector and its `dim` field is returned.
 *   - REAL_ARRAY / INT_ARRAY: the datum is read as an ArrayType and the total item count
 *                             over all of its dimensions (ArrayGetNItems) is returned.
 *   - anything else:          raises ERROR "Unsupported type".
 */
int DatumGetLength(Datum datum, HnswColumnType type)
{
    if(type == VECTOR) {
        Vector *vector = DatumGetVector(datum);
        return vector->dim;
    } else if(type == REAL_ARRAY || type == INT_ARRAY) {
        // Only the array header (ndim/dims) is read and nothing is modified,
        // so there is no need to allocate a private copy of the whole array
        ArrayType *array = DatumGetArrayTypeP(datum);
        return ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array));
    } else {
        elog(ERROR, "Unsupported type");
    }
    // elog(ERROR) does not return; this is only here for compilers that cannot tell
    return -1;
}

/*
* Given vector data and vector type, read it as either a float4 or int32 array and return as void*
*/
Expand Down
1 change: 1 addition & 0 deletions src/hnsw.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ PGDLLEXPORT Datum vector_cos_dist(PG_FUNCTION_ARGS);

HnswColumnType GetColumnTypeFromOid(Oid oid);
HnswColumnType GetIndexColumnType(Relation index);
int DatumGetLength(Datum datum, HnswColumnType type);
void* DatumGetSizedArray(Datum datum, HnswColumnType type, int dimensions);

#define LDB_UNUSED(x) (void)(x)
Expand Down
12 changes: 9 additions & 3 deletions src/hnsw/build.c
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ static int GetArrayLengthFromHeap(Relation heap, int indexCol, IndexInfo *indexI
tuple = heap_getnext(scan, ForwardScanDirection);
if(tuple == NULL) {
heap_endscan(scan);
return n_items;
return 0;
}

if(indexInfo->ii_Expressions != NULL) {
Expand Down Expand Up @@ -349,10 +349,16 @@ static void InitBuildState(HnswBuildState *buildstate, Relation heap, Relation i

// If a dimension wasn't specified try to infer it
if(buildstate->dimensions < 1) {
// todo:: isn't calling InferDimension and GetHnswIndexDimensions above redundant?
buildstate->dimensions = InferDimension(heap, indexInfo);
}
/* Require column to have dimensions to be indexed */
if(buildstate->dimensions < 1) elog(ERROR, "column does not have dimensions, please specify one");

// At this point, (buildstate->dimensions == 0) if this is building an index with no dim specified on an empty table
// The zero is a sentinel value that we check upon the first insertion of a row
// Note that (buildstate->dimensions == -1) if something went wrong
if(buildstate->dimensions < 0) {
elog(ERROR, "could not infer a dimension when no dimension was specified");
}

// not supported because of 8K page limit in postgres WAL pages
// can pass this limit once quantization is supported
Expand Down
24 changes: 20 additions & 4 deletions src/hnsw/insert.c
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,10 @@ bool ldb_aminsert(Relation index,
GenericXLogState *state;
uint32 new_tuple_id;
HnswIndexTuple *new_tuple;
HnswColumnType column_type;
usearch_init_options_t opts = {0};
LDB_UNUSED(heap);
LDB_UNUSED(indexInfo);
#if PG_VERSION_NUM >= 140000
LDB_UNUSED(indexUnchanged);
#endif
Expand Down Expand Up @@ -103,8 +105,23 @@ bool ldb_aminsert(Relation index,
hdr = (HnswIndexHeaderPage *)PageGetContents(hdr_page);
assert(hdr->magicNumber == LDB_WAL_MAGIC_NUMBER);

opts.dimensions = GetHnswIndexDimensions(index, indexInfo);
CheckHnswIndexDimensions(index, values[ 0 ], opts.dimensions);
datum = PointerGetDatum(PG_DETOAST_DATUM(values[ 0 ]));
column_type = GetIndexColumnType(index);

// Check if we created an index on an empty-table with no dimension specified
if(hdr->vector_dim == 0) {
opts.dimensions = DatumGetLength(datum, column_type);
if(opts.dimensions < 1)
elog(ERROR,
"Failed to infer dimension of inserted vector upon first insert on an empty table with no index "
"dimension specified.");
// update the index header (we mark hdr_buf dirty later)
hdr->vector_dim = opts.dimensions;
} else {
opts.dimensions = hdr->vector_dim;
CheckHnswIndexDimensions(index, values[ 0 ], opts.dimensions);
}

PopulateUsearchOpts(index, &opts);
opts.retriever_ctx = ldb_wal_retriever_area_init(index, hdr);
opts.retriever = ldb_wal_index_node_retriever;
Expand All @@ -125,14 +142,13 @@ bool ldb_aminsert(Relation index,

insertstate->uidx = uidx;
insertstate->retriever_ctx = opts.retriever_ctx;
insertstate->columnType = GetIndexColumnType(index);
insertstate->columnType = column_type;

hdr_page = NULL;

meta = usearch_metadata(uidx, &error);
assert(!error);

datum = PointerGetDatum(PG_DETOAST_DATUM(values[ 0 ]));
void *vector = DatumGetSizedArray(datum, insertstate->columnType, opts.dimensions);

#if LANTERNDB_COPYNODES
Expand Down
36 changes: 32 additions & 4 deletions test/expected/hnsw_create.out
Original file line number Diff line number Diff line change
Expand Up @@ -92,18 +92,21 @@ CREATE TABLE small_world4 (
id varchar(3),
vector real[]
);
-- If the first row is NULL we do not infer a dimension
-- If the first inserted row is NULL: we can create an index but we can't infer the dimension from the first inserted row (since it is null)
\set ON_ERROR_STOP off
CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2);
ERROR: column does not have dimensions, please specify one
CREATE INDEX first_row_null_idx ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2);
INFO: done init usearch index
INFO: inserted 0 elements
INFO: done saving 0 vectors
begin;
INSERT INTO small_world4 (id, vector) VALUES
('000', NULL),
('001', '{1,0,0,1}');
CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2);
ERROR: column does not have dimensions, please specify one
ERROR: could not infer a dimension when no dimension was specified
rollback;
\set ON_ERROR_STOP on
DROP INDEX first_row_null_idx;
INSERT INTO small_world4 (id, vector) VALUES
('000', '{1,0,0,0}'),
('001', '{1,0,0,1}'),
Expand Down Expand Up @@ -151,3 +154,28 @@ CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construct
INFO: done init usearch index
ERROR: Wrong number of dimensions: 3 instead of 4 expected
\set ON_ERROR_STOP on
-- Test index creation on empty table and no dimension specified
CREATE TABLE small_world5 (
id SERIAL PRIMARY KEY,
v REAL[]
);
-- We can still create an index despite having an empty table and not specifying a dimension during index creation
CREATE INDEX small_world5_hnsw_idx ON small_world5 USING hnsw (v dist_l2sq_ops);
INFO: done init usearch index
INFO: inserted 0 elements
INFO: done saving 0 vectors
begin;
-- Our index then infers the dimension from the first inserted row
INSERT INTO small_world5 (id, v) VALUES
('000', '{1,0,0,0,1}'),
('001', '{1,0,0,1,2}'),
('010', '{1,0,1,0,3}');
rollback;
-- Test that upon infering the dimension from the first inserted row, we do not allow subsequent rows with different dimensions
\set ON_ERROR_STOP off
INSERT INTO small_world5 (id, v) VALUES
('100', '{2,0,0,0,1}'),
('101', '{2,0,0}'),
('110', '{2,0,1,0}');
ERROR: Wrong number of dimensions: 3 instead of 5 expected
\set ON_ERROR_STOP on
32 changes: 30 additions & 2 deletions test/sql/hnsw_create.sql
Original file line number Diff line number Diff line change
Expand Up @@ -36,16 +36,17 @@ CREATE TABLE small_world4 (
id varchar(3),
vector real[]
);
-- If the first row is NULL we do not infer a dimension
-- If the first inserted row is NULL: we can create an index but we can't infer the dimension from the first inserted row (since it is null)
\set ON_ERROR_STOP off
CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2);
CREATE INDEX first_row_null_idx ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2);
begin;
INSERT INTO small_world4 (id, vector) VALUES
('000', NULL),
('001', '{1,0,0,1}');
CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2);
rollback;
\set ON_ERROR_STOP on
DROP INDEX first_row_null_idx;

INSERT INTO small_world4 (id, vector) VALUES
('000', '{1,0,0,0}'),
Expand Down Expand Up @@ -78,3 +79,30 @@ UPDATE small_world4 SET vector = '{0,0,0}' WHERE id = '001';
\set ON_ERROR_STOP off
CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2);
\set ON_ERROR_STOP on

-- Test index creation on empty table and no dimension specified
CREATE TABLE small_world5 (
id SERIAL PRIMARY KEY,
v REAL[]
);

-- We can still create an index despite having an empty table and not specifying a dimension during index creation
CREATE INDEX small_world5_hnsw_idx ON small_world5 USING hnsw (v dist_l2sq_ops);

begin;
-- Our index then infers the dimension from the first inserted row
INSERT INTO small_world5 (id, v) VALUES
('000', '{1,0,0,0,1}'),
('001', '{1,0,0,1,2}'),
('010', '{1,0,1,0,3}');
rollback;

-- Test that upon infering the dimension from the first inserted row, we do not allow subsequent rows with different dimensions
\set ON_ERROR_STOP off
INSERT INTO small_world5 (id, v) VALUES
('100', '{2,0,0,0,1}'),
('101', '{2,0,0}'),
('110', '{2,0,1,0}');
\set ON_ERROR_STOP on


0 comments on commit af15c88

Please sign in to comment.