diff --git a/src/hnsw.c b/src/hnsw.c index a7210ea99..b46e5634c 100644 --- a/src/hnsw.c +++ b/src/hnsw.c @@ -455,6 +455,23 @@ HnswColumnType GetIndexColumnType(Relation index) return GetColumnTypeFromOid(attr->atttypid); } +/* + * Returns the element count of datum: vector->dim for VECTOR, total array items for REAL_ARRAY/INT_ARRAY; ERRORs otherwise + */ +int DatumGetLength(Datum datum, HnswColumnType type) +{ + if(type == VECTOR) { + Vector *vector = DatumGetVector(datum); + return vector->dim; + } else if(type == REAL_ARRAY || type == INT_ARRAY) { + ArrayType *array = DatumGetArrayTypeP(datum); // read-only access to the array header: no palloc'd copy needed + return ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array)); + } else { + elog(ERROR, "Unsupported type"); + } + return -1; // not reached (elog(ERROR) does not return); silences missing-return warnings +} + /* * Given vector data and vector type, read it as either a float4 or int32 array and return as void* */ diff --git a/src/hnsw.h b/src/hnsw.h index d1db4edc8..36b3e5172 100644 --- a/src/hnsw.h +++ b/src/hnsw.h @@ -38,6 +38,7 @@ PGDLLEXPORT Datum lantern_reindex_external_index(PG_FUNCTION_ARGS); HnswColumnType GetColumnTypeFromOid(Oid oid); HnswColumnType GetIndexColumnType(Relation index); +int DatumGetLength(Datum datum, HnswColumnType type); void* DatumGetSizedArray(Datum datum, HnswColumnType type, int dimensions); #define LDB_UNUSED(x) (void)(x) diff --git a/src/hnsw/build.c b/src/hnsw/build.c index ebc0a8c99..0c7e3db4a 100644 --- a/src/hnsw/build.c +++ b/src/hnsw/build.c @@ -242,7 +242,7 @@ static int GetArrayLengthFromHeap(Relation heap, int indexCol, IndexInfo *indexI tuple = heap_getnext(scan, ForwardScanDirection); if(tuple == NULL) { heap_endscan(scan); - return n_items; + return 0; } if(indexInfo->ii_Expressions != NULL) { @@ -361,8 +361,13 @@ static void InitBuildState(HnswBuildState *buildstate, Relation heap, Relation i if(heap != NULL && buildstate->dimensions < 1) { buildstate->dimensions = InferDimension(heap, indexInfo); } - /* Require column to have dimensions to be indexed */ - if(buildstate->dimensions < 1) elog(ERROR, "column does not have dimensions, please specify one"); + + // At this point, 
(buildstate->dimensions == 0) if the index is being built on an empty table with no dimension specified + // Zero is a sentinel value that is checked upon the first insertion of a row (see ldb_aminsert) + // Note that (buildstate->dimensions == -1) if something went wrong while inferring the dimension + if(buildstate->dimensions < 0) { + elog(ERROR, "could not infer a dimension when no dimension was specified"); + } // not supported because of 8K page limit in postgres WAL pages // can pass this limit once quantization is supported diff --git a/src/hnsw/insert.c b/src/hnsw/insert.c index 6e5e090cd..bfd23a4c8 100644 --- a/src/hnsw/insert.c +++ b/src/hnsw/insert.c @@ -70,6 +70,7 @@ bool ldb_aminsert(Relation index, GenericXLogState *state; uint32 new_tuple_id; HnswIndexTuple *new_tuple; + HnswColumnType column_type; usearch_init_options_t opts = {0}; LDB_UNUSED(heap); LDB_UNUSED(indexInfo); @@ -110,8 +111,23 @@ bool ldb_aminsert(Relation index, hdr = (HnswIndexHeaderPage *)PageGetContents(hdr_page); assert(hdr->magicNumber == LDB_WAL_MAGIC_NUMBER); - opts.dimensions = hdr->vector_dim; - CheckHnswIndexDimensions(index, values[ 0 ], opts.dimensions); + datum = PointerGetDatum(PG_DETOAST_DATUM(values[ 0 ])); + column_type = GetIndexColumnType(index); + + // hdr->vector_dim == 0 means the index was created on an empty table with no dimension specified: adopt the dimension of this inserted vector + if(hdr->vector_dim == 0) { + opts.dimensions = DatumGetLength(datum, column_type); + if(opts.dimensions < 1) + elog(ERROR, + "Failed to infer dimension of inserted vector upon first insert on an empty table with no index " + "dimension specified."); + // record the inferred dimension in the index header (we mark hdr_buf dirty later) + hdr->vector_dim = opts.dimensions; + } else { + opts.dimensions = hdr->vector_dim; + CheckHnswIndexDimensions(index, values[ 0 ], opts.dimensions); + } + PopulateUsearchOpts(index, &opts); opts.retriever_ctx = ldb_wal_retriever_area_init(index, hdr); opts.retriever = ldb_wal_index_node_retriever; @@ -132,14 +148,13 @@ bool ldb_aminsert(Relation index, insertstate->uidx = uidx; 
insertstate->retriever_ctx = opts.retriever_ctx; - insertstate->columnType = GetIndexColumnType(index); + insertstate->columnType = column_type; hdr_page = NULL; meta = usearch_metadata(uidx, &error); assert(!error); - datum = PointerGetDatum(PG_DETOAST_DATUM(values[ 0 ])); void *vector = DatumGetSizedArray(datum, insertstate->columnType, opts.dimensions); #if LANTERNDB_COPYNODES diff --git a/test/expected/hnsw_create.out b/test/expected/hnsw_create.out index 2d866705e..90e9a9206 100644 --- a/test/expected/hnsw_create.out +++ b/test/expected/hnsw_create.out @@ -92,18 +92,21 @@ CREATE TABLE small_world4 ( id varchar(3), vector real[] ); --- If the first row is NULL we do not infer a dimension +-- If the first inserted row is NULL: we can create an index but we can't infer the dimension from the first inserted row (since it is null) \set ON_ERROR_STOP off -CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); -ERROR: column does not have dimensions, please specify one +CREATE INDEX first_row_null_idx ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); +INFO: done init usearch index +INFO: inserted 0 elements +INFO: done saving 0 vectors begin; INSERT INTO small_world4 (id, vector) VALUES ('000', NULL), ('001', '{1,0,0,1}'); CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); -ERROR: column does not have dimensions, please specify one +ERROR: could not infer a dimension when no dimension was specified rollback; \set ON_ERROR_STOP on +DROP INDEX first_row_null_idx; INSERT INTO small_world4 (id, vector) VALUES ('000', '{1,0,0,0}'), ('001', '{1,0,0,1}'), @@ -151,3 +154,31 @@ CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construct INFO: done init usearch index ERROR: Wrong number of dimensions: 3 instead of 4 expected \set ON_ERROR_STOP on +-- Test index creation on empty table and no dimension specified +CREATE TABLE small_world5 ( + id SERIAL PRIMARY 
KEY, + v REAL[] +); +-- We can still create an index despite having an empty table and not specifying a dimension during index creation +CREATE INDEX small_world5_hnsw_idx ON small_world5 USING hnsw (v dist_l2sq_ops); +INFO: done init usearch index +INFO: inserted 0 elements +INFO: done saving 0 vectors +begin; +-- Inserting a NULL vector should only insert it into the table and not into our index +-- So, our index is still empty after and is yet to pick up a dimension +INSERT INTO small_world5 (id, v) VALUES ('200', NULL); +-- Our index then infers the dimension from the first inserted non-NULL row +INSERT INTO small_world5 (id, v) VALUES +('000', '{1,0,0,0,1}'), +('001', '{1,0,0,1,2}'), +('010', '{1,0,1,0,3}'); +rollback; +-- Test that upon infering the dimension from the first inserted row, we do not allow subsequent rows with different dimensions +\set ON_ERROR_STOP off +INSERT INTO small_world5 (id, v) VALUES +('100', '{2,0,0,0,1}'), +('101', '{2,0,0}'), +('110', '{2,0,1,0}'); +ERROR: Wrong number of dimensions: 3 instead of 5 expected +\set ON_ERROR_STOP on diff --git a/test/sql/hnsw_create.sql b/test/sql/hnsw_create.sql index 776ddb2d0..a212ca8dc 100644 --- a/test/sql/hnsw_create.sql +++ b/test/sql/hnsw_create.sql @@ -36,9 +36,9 @@ CREATE TABLE small_world4 ( id varchar(3), vector real[] ); --- If the first row is NULL we do not infer a dimension +-- If the first inserted row is NULL: we can create an index but we can't infer the dimension from the first inserted row (since it is null) \set ON_ERROR_STOP off -CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); +CREATE INDEX first_row_null_idx ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); begin; INSERT INTO small_world4 (id, vector) VALUES ('000', NULL), @@ -46,6 +46,7 @@ INSERT INTO small_world4 (id, vector) VALUES CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); rollback; \set ON_ERROR_STOP on +DROP INDEX 
first_row_null_idx; INSERT INTO small_world4 (id, vector) VALUES ('000', '{1,0,0,0}'), @@ -78,3 +79,34 @@ UPDATE small_world4 SET vector = '{0,0,0}' WHERE id = '001'; \set ON_ERROR_STOP off CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); \set ON_ERROR_STOP on + +-- Test index creation on empty table and no dimension specified +CREATE TABLE small_world5 ( + id SERIAL PRIMARY KEY, + v REAL[] +); + +-- We can still create an index despite having an empty table and not specifying a dimension during index creation +CREATE INDEX small_world5_hnsw_idx ON small_world5 USING hnsw (v dist_l2sq_ops); + +begin; +-- Inserting a NULL vector should only insert it into the table and not into our index +-- So, our index is still empty after and is yet to pick up a dimension +INSERT INTO small_world5 (id, v) VALUES ('200', NULL); + +-- Our index then infers the dimension from the first inserted non-NULL row +INSERT INTO small_world5 (id, v) VALUES +('000', '{1,0,0,0,1}'), +('001', '{1,0,0,1,2}'), +('010', '{1,0,1,0,3}'); +rollback; + +-- Test that upon infering the dimension from the first inserted row, we do not allow subsequent rows with different dimensions +\set ON_ERROR_STOP off +INSERT INTO small_world5 (id, v) VALUES +('100', '{2,0,0,0,1}'), +('101', '{2,0,0}'), +('110', '{2,0,1,0}'); +\set ON_ERROR_STOP on + +