forked from lanterndata/lantern
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implement failure points (lanterndata#218)
This patch adds an API to trigger execution of C code from SQL in order to test corner cases. `test/sql/hnsw_failure_point.sql` has an example of how to trigger a process crash using failure points, and how to observe the space leak that occurs if the crash happens after a block is allocated but before a record for the block is added to the index during blockmap creation. * src/hnsw/failure_point: fix a use-after-free bug when strings are deallocated at the end of a query * src/hnsw/failure_point: elog(INFO, ...) when a failure point is enabled
- Loading branch information
Showing
10 changed files
with
247 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
#include <postgres.h> | ||
|
||
#include "hnsw/failure_point.h" | ||
|
||
#include <inttypes.h> /* PRIu32 */ | ||
|
||
struct failure_point_state | ||
{ | ||
bool enabled; | ||
char func[ 0x100 ]; | ||
char name[ 0x100 ]; | ||
uint32 remaining; | ||
}; | ||
|
||
static struct failure_point_state *failure_point_get_state(void) | ||
{ | ||
static struct failure_point_state state = {}; | ||
|
||
return &state; | ||
} | ||
|
||
/*
 * Arm the failure point `name` that lives in function `func`.
 *
 * Parameters:
 *   func                   NUL-terminated name of the hosting function.
 *   name                   NUL-terminated name of the failure point.
 *   dont_trigger_first_nr  number of matching hits to let through before the
 *                          failure point triggers (0: trigger on the first hit).
 *
 * Both strings are copied into the state, so the caller's buffers do not have
 * to outlive this call.
 *
 * Raises ERROR (leaving the stored state unmodified) when `func` or `name` is
 * NULL or does not fit, including its terminator, into the state's buffers.
 *
 * Emits WARNING when failure points are compiled out, and when a previously
 * enabled failure point is about to be overwritten; in both cases the new
 * failure point is still recorded.
 */
void ldb_failure_point_enable(const char *func, const char *name, uint32 dont_trigger_first_nr)
{
    struct failure_point_state *state = failure_point_get_state();
    size_t                      func_len;
    size_t                      name_len;

    /* strlen() and "%s" below must never see a NULL pointer */
    if(func == NULL || name == NULL) {
        elog(ERROR, "ldb_failure_point_enable(): func and name must not be NULL");
    }
    func_len = strlen(func);
    name_len = strlen(name);

    /*
     * Only a warning: the state is recorded anyway, but
     * ldb_failure_point_is_enabled() returns false unconditionally when
     * failure points are compiled out, so it can never trigger.
     */
    if(!LANTERN_FAILURE_POINTS_ARE_ENABLED) {
        elog(WARNING,
             "Can't enable failure point for (func=%s name=%s), "
             "because failure points are disabled in compile time.",
             func,
             name);
    }
    /* a single slot is available: the previous failure point gets replaced */
    if(state->enabled) {
        elog(WARNING,
             "ldb_failure_point_enable(): another failure point is enabled already."
             " old failure point: func=%s name=%s remaining=%" PRIu32
             " new failure point: func=%s name=%s dont_trigger_first_nr=%" PRIu32,
             state->func,
             state->name,
             state->remaining,
             func,
             name,
             dont_trigger_first_nr);
    }
    /* `>=` because the terminating NUL has to fit as well */
    if(func_len >= lengthof(state->func)) {
        elog(ERROR,
             "failure point function name is too large: "
             "func=%s strlen(func)=%zu lengthof(state->func)=%zu",
             func,
             func_len,
             lengthof(state->func));
    }
    if(name_len >= lengthof(state->name)) {
        elog(ERROR,
             "failure point name is too large: "
             "name=%s strlen(name)=%zu lengthof(state->name)=%zu",
             name,
             name_len,
             lengthof(state->name));
    }

    state->enabled = true;
    state->remaining = dont_trigger_first_nr;
    /* lengths were validated above; "+ 1" copies the terminating NUL too */
    memcpy(state->func, func, func_len + 1);
    memcpy(state->name, name, name_len + 1);
    elog(INFO, "Failure point (func=%s name=%s) is enabled.", state->func, state->name);
}
|
||
bool ldb_failure_point_is_enabled(const char *func, const char *name) | ||
{ | ||
struct failure_point_state *state = failure_point_get_state(); | ||
|
||
if(!LANTERN_FAILURE_POINTS_ARE_ENABLED) return false; | ||
if(!state->enabled) return false; | ||
if(strcmp(func, state->func) == 0 && strcmp(name, state->name) == 0) { | ||
if(state->remaining == 0) { | ||
state->enabled = false; | ||
elog(INFO, "Failure point (func=%s name=%s) has been triggered.", state->func, state->name); | ||
return true; | ||
} else { | ||
--state->remaining; | ||
} | ||
} | ||
return false; | ||
} | ||
|
||
void ldb_failure_point_crash(void) | ||
{ | ||
elog(ERROR, "ldb_failure_point_crash()"); | ||
pg_unreachable(); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
#ifndef LDB_HNSW_FAILURE_POINT_H | ||
#define LDB_HNSW_FAILURE_POINT_H | ||
|
||
/* | ||
* Failure points implementation. | ||
* | ||
* An example of how to use failure points, taken from test/sql/hnsw_failure_point.sql. | ||
* | ||
* 1) Add this to CreateBlockMapGroup(): | ||
* | ||
LDB_FAILURE_POINT_CRASH_IF_ENABLED("crash_after_buf_allocation"); | ||
* | ||
* 2) Enable the failure point somewhere in the test: | ||
* | ||
* SELECT _lantern_internal.failure_point_enable('CreateBlockMapGroup', 'crash_after_buf_allocation', 0); | ||
* | ||
* 3) Trigger the failure point, the output looks like this: | ||
* | ||
* INFO: Failure point (func=CreateBlockMapGroup name=crash_after_buf_allocation) has been triggered. | ||
* | ||
* 4) Now check that the failure actually happens, for example with validate_index(): | ||
* | ||
* SELECT _lantern_internal.validate_index('small_world_v_idx', false); | ||
* | ||
* 5) The output tells that the block is allocated, but it's not being used: | ||
* | ||
* INFO: validate_index() start for small_world_v_idx | ||
* ERROR: vi_blocks[48].vp_type == LDB_VI_BLOCK_UNKNOWN (but it should be known now) | ||
* | ||
* | ||
* Limitations | ||
* | ||
* 1) A single static per-process variable holds the state. | ||
* 2) Only one failure point active at a time is supported. | ||
* 3) The API is not thread-safe. | ||
*/ | ||
|
||
/*
 * LDB_FAILURE_POINT_IS_ENABLED(_name)
 *
 * Expression that is true when the failure point `_name` of the enclosing
 * function (identified through __func__) has to trigger now. The leading
 * LANTERN_FAILURE_POINTS_ARE_ENABLED test short-circuits the call when
 * failure points are compiled out.
 */
#define LDB_FAILURE_POINT_IS_ENABLED(_name) \
    (LANTERN_FAILURE_POINTS_ARE_ENABLED && ldb_failure_point_is_enabled(__func__, (_name)))

/*
 * LDB_FAILURE_POINT_CRASH_IF_ENABLED(_name);
 *
 * Statement that calls ldb_failure_point_crash() when the failure point
 * `_name` triggers. It is wrapped in do { } while(0) so that it behaves as a
 * single statement: a bare `if` here would capture an `else` that follows the
 * macro invocation at the call site.
 */
#define LDB_FAILURE_POINT_CRASH_IF_ENABLED(_name)                          \
    do {                                                                   \
        if(LDB_FAILURE_POINT_IS_ENABLED(_name)) ldb_failure_point_crash(); \
    } while(0)
|
||
/*
 * Arm the failure point `name` of function `func`; the first
 * `dont_trigger_first_nr` matching hits are let through without triggering.
 */
void ldb_failure_point_enable(const char *func, const char *name, uint32 dont_trigger_first_nr);

/*
 * Return true when the failure point (func, name) has to trigger on this hit.
 * Normally reached through LDB_FAILURE_POINT_IS_ENABLED(), which supplies
 * __func__ as `func`.
 */
bool ldb_failure_point_is_enabled(const char *func, const char *name);

/* Action taken by LDB_FAILURE_POINT_CRASH_IF_ENABLED() when a failure point triggers. */
void ldb_failure_point_crash(void);
|
||
#endif // LDB_HNSW_FAILURE_POINT_H |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
------------------------------ | ||
-- Test HNSW failure points -- | ||
------------------------------ | ||
CREATE TABLE small_world ( | ||
id SERIAL PRIMARY KEY, | ||
v REAL[2] | ||
); | ||
CREATE INDEX ON small_world USING hnsw (v) WITH (dim=3); | ||
INFO: done init usearch index | ||
INFO: inserted 0 elements | ||
INFO: done saving 0 vectors | ||
-- let's insert HNSW_BLOCKMAP_BLOCKS_PER_PAGE (2000) record to fill the first blockmap page | ||
do $$ | ||
BEGIN | ||
FOR i IN 1..2000 LOOP | ||
INSERT INTO small_world (v) VALUES (array_replace(ARRAY[0,0,-1], -1, i)); | ||
END LOOP; | ||
END $$; | ||
-- everything is fine, the index is valid | ||
SELECT _lantern_internal.validate_index('small_world_v_idx', false); | ||
INFO: validate_index() start for small_world_v_idx | ||
INFO: validate_index() done, no issues found. | ||
validate_index | ||
---------------- | ||
|
||
(1 row) | ||
|
||
-- now let's crash after a buffer for a blockmap is allocated during insert, | ||
-- but it hasn't been recorded yet | ||
SELECT _lantern_internal.failure_point_enable('CreateBlockMapGroup', 'crash_after_buf_allocation'); | ||
INFO: Failure point (func=CreateBlockMapGroup name=crash_after_buf_allocation) is enabled. | ||
failure_point_enable | ||
---------------------- | ||
|
||
(1 row) | ||
|
||
-- here is the insert where the crash happens | ||
\set ON_ERROR_STOP off | ||
INSERT INTO small_world (v) VALUES ('{2,2,2}'); | ||
INFO: Failure point (func=CreateBlockMapGroup name=crash_after_buf_allocation) has been triggered. | ||
ERROR: ldb_failure_point_crash() | ||
\set ON_ERROR_STOP on | ||
-- now we see that the index has an extra free page, so the index validation fails | ||
SELECT _lantern_internal.validate_index('small_world_v_idx', false); | ||
INFO: validate_index() start for small_world_v_idx | ||
ERROR: vi_blocks[48].vp_type == LDB_VI_BLOCK_UNKNOWN (but it should be known now) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
------------------------------ | ||
-- Test HNSW failure points -- | ||
------------------------------ | ||
|
||
CREATE TABLE small_world ( | ||
id SERIAL PRIMARY KEY, | ||
v REAL[2] | ||
); | ||
CREATE INDEX ON small_world USING hnsw (v) WITH (dim=3); | ||
|
||
-- let's insert HNSW_BLOCKMAP_BLOCKS_PER_PAGE (2000) record to fill the first blockmap page | ||
|
||
do $$ | ||
BEGIN | ||
FOR i IN 1..2000 LOOP | ||
INSERT INTO small_world (v) VALUES (array_replace(ARRAY[0,0,-1], -1, i)); | ||
END LOOP; | ||
END $$; | ||
|
||
-- everything is fine, the index is valid | ||
SELECT _lantern_internal.validate_index('small_world_v_idx', false); | ||
|
||
-- now let's crash after a buffer for a blockmap is allocated during insert, | ||
-- but it hasn't been recorded yet | ||
SELECT _lantern_internal.failure_point_enable('CreateBlockMapGroup', 'crash_after_buf_allocation'); | ||
|
||
-- here is the insert where the crash happens | ||
\set ON_ERROR_STOP off | ||
INSERT INTO small_world (v) VALUES ('{2,2,2}'); | ||
\set ON_ERROR_STOP on | ||
|
||
-- now we see that the index has an extra free page, so the index validation fails | ||
SELECT _lantern_internal.validate_index('small_world_v_idx', false); |