From c0e8569e075e36dabeeb045e60f8925efe12b9c9 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Tue, 16 Jul 2024 17:05:33 -0400 Subject: [PATCH 01/73] Format sources and delete SM diagram Signed-off-by: Cole Miller --- src/vfs2.c | 353 ++++++++++++++++++++++-------------------- src/vfs2.h | 11 +- test/unit/test_vfs2.c | 38 +++-- 3 files changed, 219 insertions(+), 183 deletions(-) diff --git a/src/vfs2.c b/src/vfs2.c index b143be78c..5247c7848 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -44,13 +44,16 @@ static const uint32_t invalid_magic = 0x17171717; enum { /* Entry is not yet open. */ WTX_CLOSED, - /* Next WAL write will be a header write, causing a WAL swap (WAL-cur is empty or fully checkpointed). */ + /* Next WAL write will be a header write, causing a WAL swap (WAL-cur is + empty or fully checkpointed). */ WTX_EMPTY, /* Non-leader, at least one transaction in WAL-cur is not committed. */ WTX_FOLLOWING, - /* Non-leader, all transactions in WAL-cur are committed (but at least one is not checkpointed). */ + /* Non-leader, all transactions in WAL-cur are committed (but at least + one is not checkpointed). */ WTX_FLUSH, - /* Leader, all transactions in WAL-cur are committed (but at least one is not checkpointed). */ + /* Leader, all transactions in WAL-cur are committed (but at least one + is not checkpointed). */ WTX_BASE, /* Leader, transaction in progress. */ WTX_ACTIVE, @@ -60,48 +63,6 @@ enum { WTX_POLLED }; -/* - -Diagram of the state machine (some transitions omitted when they would crowd the diagram even more): - -+----------------------+ sqlite3_open +------------------------------------------------------------------------------+ -| CLOSED | ------------------------> | FOLLOWING | <+ -+----------------------+ +------------------------------------------------------------------------------+ | - | ^ ^ | | - | sqlite3_open | vfs2_apply_uncommitted | vfs2_apply_uncommitted | vfs2_{commit,unapply} | vfs2_apply_uncommitted - v | | v | -+----------------------------------------------------------------------------+ | +------------------------+ | -| | +----------------------- | | | -| EMPTY | | FLUSH | | -| | <------------------------ | | | -+----------------------------------------------------------------------------+ sqlite3_wal_checkpoint +------------------------+ | - | ^ | | - | vfs2_commit_barrier | sqlite3_wal_checkpoint | | - v | | | -+----------------------------------------------------------------------------+ vfs2_commit_barrier | | -| BASE | <---------------------------+ | -+----------------------------------------------------------------------------+ | - | ^ | | - | sqlite3_step | vfs2_unhide +-------------------------------------------------------------------------------+ - v | -+----------------------+ | -| ACTIVE | | -+----------------------+ | - | | - | COMMIT_PHASETWO | - v | -+----------------------+ | -| HIDDEN | | -+----------------------+ | - | | - | vfs2_poll | - v | -+----------------------+ | -| POLLED | -+ -+----------------------+ - -*/ - static const struct sm_conf wtx_states[SM_STATES_MAX] = { [WTX_CLOSED] = { .flags = SM_INITIAL|SM_FINAL, @@ -151,7 +112,7 @@ static const struct sm_conf wtx_states[SM_STATES_MAX] = { struct common { sqlite3_vfs *orig; /* underlying VFS */ pthread_rwlock_t rwlock; /* protects the queue */ - queue queue; /* queue of entry */ + queue queue; /* queue of entry */ }; struct cksums { @@ -170,7 +131,10 @@ static uint32_t native_magic(void) return is_bigendian() ? BE_MAGIC : LE_MAGIC; } -static void update_cksums(uint32_t magic, const uint8_t *p, size_t len, struct cksums *sums) +static void update_cksums(uint32_t magic, + const uint8_t *p, + size_t len, + struct cksums *sums) { PRE(magic == BE_MAGIC || magic == LE_MAGIC); PRE(len % 8 == 0); @@ -270,14 +234,16 @@ struct entry { /* Base VFS file object for WAL-prev */ sqlite3_file *wal_prev; - /* Number of `struct file` with SQLITE_OPEN_MAIN_DB that point to this entry */ + /* Number of `struct file` with SQLITE_OPEN_MAIN_DB that point to this + * entry */ unsigned refcount_main_db; - /* Number of `struct file` with SQLITE_OPEN_WAL that point to this entry */ + /* Number of `struct file` with SQLITE_OPEN_WAL that point to this entry + */ unsigned refcount_wal; - /* if WAL-cur is nonempty at startup, we read its header, verify the checkum, - * and use it to initialize the page size. otherwise, we wait until the first - * write to the WAL, which should be the header */ + /* if WAL-cur is nonempty at startup, we read its header, verify the + * checkum, and use it to initialize the page size. otherwise, we wait + * until the first write to the WAL, which should be the header */ uint32_t page_size; /* For ACTIVE, HIDDEN, POLLED: the header that hides the pending txn */ @@ -289,7 +255,8 @@ struct entry { void **shm_regions; int shm_regions_len; unsigned shm_refcount; - /* Zero for unlocked, positive for read-locked, UINT_MAX for write-locked */ + /* Zero for unlocked, positive for read-locked, UINT_MAX for + * write-locked */ unsigned shm_locks[SQLITE_SHM_NLOCK]; /* For ACTIVE, HIDDEN: the pending txn. start and len @@ -364,7 +331,8 @@ static struct wal_index_full_hdr *get_full_hdr(struct entry *e) static bool no_pending_txn(const struct entry *e) { - return e->pending_txn_len == 0 && e->pending_txn_frames == NULL && e->pending_txn_last_frame_commit == 0; + return e->pending_txn_len == 0 && e->pending_txn_frames == NULL && + e->pending_txn_last_frame_commit == 0; } static bool write_lock_held(const struct entry *e) @@ -390,10 +358,9 @@ static bool wal_index_basic_hdr_advanced(struct wal_index_basic_hdr new, new.nPage >= old.nPage /* no vacuums here */ && ((get_salt1(new.salts) == get_salt1(old.salts) && get_salt2(new.salts) == get_salt2(old.salts)) || - /* note the weirdness with zero salts */ - (get_salt1(old.salts) == 0 && - get_salt2(old.salts) == 0)) - && new.mxFrame > old.mxFrame; + /* note the weirdness with zero salts */ + (get_salt1(old.salts) == 0 && get_salt2(old.salts) == 0)) && + new.mxFrame > old.mxFrame; } /* Check that the hash tables in the WAL index have been initialized @@ -403,7 +370,8 @@ static bool wal_index_recovered(const struct entry *e) { PRE(e->shm_regions_len > 0); char *p = e->shm_regions[0]; - for (size_t i = sizeof(struct wal_index_full_hdr); i < VFS2_WAL_INDEX_REGION_SIZE; i++) { + for (size_t i = sizeof(struct wal_index_full_hdr); + i < VFS2_WAL_INDEX_REGION_SIZE; i++) { if (p[i] != 0) { return true; } @@ -418,31 +386,27 @@ static bool is_valid_page_size(unsigned long n) static bool is_open(const struct entry *e) { - return e->main_db_name != NULL - && e->wal_moving_name != NULL - && e->wal_cur_fixed_name != NULL - && e->wal_cur != NULL - && e->wal_prev_fixed_name != NULL - && e->wal_prev != NULL - && (e->refcount_main_db > 0 || e->refcount_wal > 0) - && e->shm_regions != NULL - && e->shm_regions_len > 0 - && e->shm_regions[0] != NULL - && e->common != NULL; + return e->main_db_name != NULL && e->wal_moving_name != NULL && + e->wal_cur_fixed_name != NULL && e->wal_cur != NULL && + e->wal_prev_fixed_name != NULL && e->wal_prev != NULL && + (e->refcount_main_db > 0 || e->refcount_wal > 0) && + e->shm_regions != NULL && e->shm_regions_len > 0 && + e->shm_regions[0] != NULL && e->common != NULL; } static bool basic_hdr_valid(struct wal_index_basic_hdr bhdr) { struct cksums sums = {}; - update_cksums(bhdr.bigEndCksum ? BE_MAGIC : LE_MAGIC, (uint8_t *)&bhdr, offsetof(struct wal_index_basic_hdr, cksums), &sums); - return bhdr.iVersion == 3007000 - && bhdr.isInit == 1 - && cksums_equal(sums, bhdr.cksums); + update_cksums(bhdr.bigEndCksum ? BE_MAGIC : LE_MAGIC, (uint8_t *)&bhdr, + offsetof(struct wal_index_basic_hdr, cksums), &sums); + return bhdr.iVersion == 3007000 && bhdr.isInit == 1 && + cksums_equal(sums, bhdr.cksums); } static bool full_hdr_valid(const struct wal_index_full_hdr *ihdr) { - return basic_hdr_valid(ihdr->basic[0]) && wal_index_basic_hdr_equal(ihdr->basic[0], ihdr->basic[1]); + return basic_hdr_valid(ihdr->basic[0]) && + wal_index_basic_hdr_equal(ihdr->basic[0], ihdr->basic[1]); } static bool wtx_invariant(const struct sm *sm, int prev) @@ -457,7 +421,7 @@ static bool wtx_invariant(const struct sm *sm, int prev) char zeroed[offsetof(struct entry, wtx_sm)] = {}; return CHECK(memcmp(region, zeroed, sizeof(zeroed)) == 0) && - CHECK(e->common != NULL); + CHECK(e->common != NULL); } if (!CHECK(is_open(e))) { @@ -477,10 +441,8 @@ static bool wtx_invariant(const struct sm *sm, int prev) /* TODO any checks applicable to the read marks and read locks? */ if (sm_state(sm) == WTX_EMPTY) { - return CHECK(mx == backfill) && - CHECK(mx == cursor) && - CHECK(no_pending_txn(e)) && - CHECK(!write_lock_held(e)); + return CHECK(mx == backfill) && CHECK(mx == cursor) && + CHECK(no_pending_txn(e)) && CHECK(!write_lock_held(e)); } if (!CHECK(is_valid_page_size(e->page_size))) { @@ -488,43 +450,43 @@ static bool wtx_invariant(const struct sm *sm, int prev) } if (sm_state(sm) == WTX_FOLLOWING) { - return CHECK(no_pending_txn(e)) && - CHECK(write_lock_held(e)) && - CHECK(mx < cursor); + return CHECK(no_pending_txn(e)) && CHECK(write_lock_held(e)) && + CHECK(mx < cursor); } if (sm_state(sm) == WTX_FLUSH) { - return CHECK(no_pending_txn(e)) && - CHECK(!write_lock_held(e)) && - CHECK(ERGO(mx > 0, backfill < mx)) && - CHECK(mx == cursor); + return CHECK(no_pending_txn(e)) && CHECK(!write_lock_held(e)) && + CHECK(ERGO(mx > 0, backfill < mx)) && + CHECK(mx == cursor); } if (sm_state(sm) == WTX_BASE) { - return CHECK(no_pending_txn(e)) && - CHECK(!write_lock_held(e)) && - CHECK(ERGO(mx > 0, backfill < mx)) && - CHECK(mx == cursor) && - CHECK(ERGO(mx > 0, wal_index_recovered(e))); + return CHECK(no_pending_txn(e)) && CHECK(!write_lock_held(e)) && + CHECK(ERGO(mx > 0, backfill < mx)) && + CHECK(mx == cursor) && + CHECK(ERGO(mx > 0, wal_index_recovered(e))); } if (sm_state(sm) == WTX_ACTIVE) { - return CHECK(wal_index_basic_hdr_equal(get_full_hdr(e)->basic[0], e->prev_txn_hdr)) && - CHECK(wal_index_basic_hdr_zeroed(e->pending_txn_hdr)) && - CHECK(write_lock_held(e)); + return CHECK(wal_index_basic_hdr_equal( + get_full_hdr(e)->basic[0], e->prev_txn_hdr)) && + CHECK(wal_index_basic_hdr_zeroed(e->pending_txn_hdr)) && + CHECK(write_lock_held(e)); } - if (!CHECK(mx < cursor) || - !CHECK(e->pending_txn_len > 0) || - !CHECK(e->pending_txn_start + e->pending_txn_len == e->wal_cursor)) { + if (!CHECK(mx < cursor) || !CHECK(e->pending_txn_len > 0) || + !CHECK(e->pending_txn_start + e->pending_txn_len == + e->wal_cursor)) { return false; } if (sm_state(sm) == WTX_HIDDEN) { - bool res = CHECK(wal_index_basic_hdr_equal(get_full_hdr(e)->basic[0], e->prev_txn_hdr)) && - CHECK(wal_index_basic_hdr_advanced(e->pending_txn_hdr, e->prev_txn_hdr)) && - CHECK(!write_lock_held(e)) && - CHECK(e->pending_txn_frames != NULL); + bool res = CHECK(wal_index_basic_hdr_equal( + get_full_hdr(e)->basic[0], e->prev_txn_hdr)) && + CHECK(wal_index_basic_hdr_advanced( + e->pending_txn_hdr, e->prev_txn_hdr)) && + CHECK(!write_lock_held(e)) && + CHECK(e->pending_txn_frames != NULL); if (!res) { return false; } @@ -535,10 +497,12 @@ static bool wtx_invariant(const struct sm *sm, int prev) } if (sm_state(sm) == WTX_POLLED) { - return CHECK(wal_index_basic_hdr_equal(get_full_hdr(e)->basic[0], e->prev_txn_hdr)) && - CHECK(wal_index_basic_hdr_advanced(e->pending_txn_hdr, e->prev_txn_hdr)) && - CHECK(write_lock_held(e)) && - CHECK(e->pending_txn_frames == NULL); + return CHECK(wal_index_basic_hdr_equal( + get_full_hdr(e)->basic[0], e->prev_txn_hdr)) && + CHECK(wal_index_basic_hdr_advanced(e->pending_txn_hdr, + e->prev_txn_hdr)) && + CHECK(write_lock_held(e)) && + CHECK(e->pending_txn_frames == NULL); } assert(0); @@ -583,7 +547,6 @@ static void maybe_close_entry(struct entry *e) queue_remove(&e->link); pthread_rwlock_unlock(&e->common->rwlock); sqlite3_free(e); - } static int vfs2_close(sqlite3_file *file) @@ -616,8 +579,7 @@ static int vfs2_read(sqlite3_file *file, void *buf, int amt, sqlite3_int64 ofst) return orig->pMethods->xRead(orig, buf, amt, ofst); } -static int wal_swap(struct entry *e, - const struct wal_hdr *wal_hdr) +static int wal_swap(struct entry *e, const struct wal_hdr *wal_hdr) { PRE(e->pending_txn_len == 0); PRE(e->pending_txn_frames == NULL); @@ -633,7 +595,8 @@ static int wal_swap(struct entry *e, sqlite3_file *phys_incoming = e->wal_prev; char *name_incoming = e->wal_prev_fixed_name; - tracef("wal swap outgoing=%s incoming=%s", name_outgoing, name_incoming); + tracef("wal swap outgoing=%s incoming=%s", name_outgoing, + name_incoming); /* Write the new header of the incoming WAL. */ rv = phys_incoming->pMethods->xWrite(phys_incoming, wal_hdr, @@ -694,7 +657,8 @@ static int vfs2_wal_write_frame_hdr(struct entry *e, } if (x == n) { /* FIXME reallocating every time seems bad */ - sqlite3_uint64 z = (sqlite3_uint64)sizeof(*frames) * (sqlite3_uint64)(n + 1); + sqlite3_uint64 z = + (sqlite3_uint64)sizeof(*frames) * (sqlite3_uint64)(n + 1); e->pending_txn_frames = sqlite3_realloc64(frames, z); if (e->pending_txn_frames == NULL) { return SQLITE_NOMEM; @@ -785,7 +749,8 @@ static int vfs2_write(sqlite3_file *file, if (xfile->flags & SQLITE_OPEN_WAL) { struct entry *e = xfile->entry; - tracef("wrote to WAL name=%s amt=%d ofst=%lld", e->wal_cur_fixed_name, amt, ofst); + tracef("wrote to WAL name=%s amt=%d ofst=%lld", + e->wal_cur_fixed_name, amt, ofst); return vfs2_wal_post_write(e, buf, amt, ofst); } @@ -841,7 +806,8 @@ static int interpret_pragma(char **args) PRE(left != NULL); char *right = args[2]; - if (strcmp(left, "journal_mode") == 0 && right != NULL && strcasecmp(right, "wal") != 0) { + if (strcmp(left, "journal_mode") == 0 && right != NULL && + strcasecmp(right, "wal") != 0) { *e = sqlite3_mprintf("dqlite requires WAL mode"); return SQLITE_ERROR; } @@ -936,7 +902,8 @@ static int vfs2_shm_map(sqlite3_file *file, } memset(region, 0, (size_t)regsz); /* FIXME reallocating every time seems bad */ - sqlite3_uint64 z = (sqlite3_uint64)sizeof(*e->shm_regions) * (sqlite3_uint64)(e->shm_regions_len + 1); + sqlite3_uint64 z = (sqlite3_uint64)sizeof(*e->shm_regions) * + (sqlite3_uint64)(e->shm_regions_len + 1); void **regions = sqlite3_realloc64(e->shm_regions, z); if (regions == NULL) { rv = SQLITE_NOMEM; @@ -1037,15 +1004,15 @@ static int vfs2_shm_lock(sqlite3_file *file, int ofst, int n, int flags) sm_move(&e->wtx_sm, WTX_BASE); } } else if (ofst == WAL_CKPT_LOCK && n == 1) { - /* End of a checkpoint: if all frames have been backfilled, - * move to EMPTY. */ + /* End of a checkpoint: if all frames have been + * backfilled, move to EMPTY. */ assert(n == 1); struct wal_index_full_hdr *ihdr = get_full_hdr(e); if (ihdr->nBackfill == ihdr->basic[0].mxFrame) { sm_move(&e->wtx_sm, WTX_EMPTY); } - } /* else if (ofst <= WAL_RECOVER_LOCK && WAL_RECOVER_LOCK < ofst + n) { - sm_move(&e->wtx_sm, WTX_BASE); + } /* else if (ofst <= WAL_RECOVER_LOCK && WAL_RECOVER_LOCK < + ofst + n) { sm_move(&e->wtx_sm, WTX_BASE); } */ } else { assert(0); @@ -1116,7 +1083,9 @@ static int compare_wal_headers(struct wal_hdr a, return SQLITE_OK; } -static int read_wal_hdr(sqlite3_file *wal, sqlite3_int64 *size, struct wal_hdr *hdr) +static int read_wal_hdr(sqlite3_file *wal, + sqlite3_int64 *size, + struct wal_hdr *hdr) { int rv; @@ -1143,7 +1112,8 @@ static struct wal_index_full_hdr initial_full_hdr(struct wal_hdr whdr) ihdr.basic[0].bigEndCksum = is_bigendian(); ihdr.basic[0].szPage = (uint16_t)ByteGetBe32(whdr.page_size); struct cksums sums = {}; - update_cksums(native_magic(), (const void *)&ihdr.basic[0], offsetof(struct wal_index_basic_hdr, cksums), &sums); + update_cksums(native_magic(), (const void *)&ihdr.basic[0], + offsetof(struct wal_index_basic_hdr, cksums), &sums); ihdr.basic[0].cksums = sums; ihdr.basic[1] = ihdr.basic[0]; ihdr.marks[0] = 0; @@ -1154,7 +1124,9 @@ static struct wal_index_full_hdr initial_full_hdr(struct wal_hdr whdr) return ihdr; } -static void set_mx_frame(struct wal_index_full_hdr *ihdr, uint32_t mx, struct wal_frame_hdr fhdr) +static void set_mx_frame(struct wal_index_full_hdr *ihdr, + uint32_t mx, + struct wal_frame_hdr fhdr) { uint32_t num_pages = ByteGetBe32(fhdr.commit); PRE(num_pages > 0); @@ -1170,7 +1142,8 @@ static void set_mx_frame(struct wal_index_full_hdr *ihdr, uint32_t mx, struct wa ihdr->basic[1] = ihdr->basic[0]; } -static void restart_full_hdr(struct wal_index_full_hdr *ihdr, struct wal_hdr new_whdr) +static void restart_full_hdr(struct wal_index_full_hdr *ihdr, + struct wal_hdr new_whdr) { /* cf. walRestartHdr */ ihdr->basic[0].mxFrame = 0; @@ -1189,13 +1162,18 @@ static uint32_t wal_cursor_from_size(uint32_t page_size, sqlite3_int64 size) if (size < whdr_size) { return 0; } - sqlite3_int64 x = (size - whdr_size) / ((sqlite3_int64)sizeof(struct wal_frame_hdr) + (sqlite3_int64)page_size); + sqlite3_int64 x = + (size - whdr_size) / ((sqlite3_int64)sizeof(struct wal_frame_hdr) + + (sqlite3_int64)page_size); return (uint32_t)x; } static sqlite3_int64 wal_offset_from_cursor(uint32_t page_size, uint32_t cursor) { - return (sqlite3_int64)sizeof(struct wal_hdr) + (sqlite3_int64)cursor * ((sqlite3_int64)sizeof(struct wal_frame_hdr) + (sqlite3_int64)page_size); + return (sqlite3_int64)sizeof(struct wal_hdr) + + (sqlite3_int64)cursor * + ((sqlite3_int64)sizeof(struct wal_frame_hdr) + + (sqlite3_int64)page_size); } static int open_entry(struct common *common, const char *name, struct entry *e) @@ -1215,10 +1193,8 @@ static int open_entry(struct common *common, const char *name, struct entry *e) e->wal_moving_name = sqlite3_malloc(path_cap); e->wal_cur_fixed_name = sqlite3_malloc(path_cap); e->wal_prev_fixed_name = sqlite3_malloc(path_cap); - if (e->main_db_name == NULL || - e->wal_moving_name == NULL || - e->wal_cur_fixed_name == NULL || - e->wal_prev_fixed_name == NULL) { + if (e->main_db_name == NULL || e->wal_moving_name == NULL || + e->wal_cur_fixed_name == NULL || e->wal_prev_fixed_name == NULL) { return SQLITE_NOMEM; } @@ -1234,13 +1210,15 @@ static int open_entry(struct common *common, const char *name, struct entry *e) strcat(e->wal_prev_fixed_name, "-xwal2"); /* TODO EXRESCODE? */ - int phys_wal_flags = SQLITE_OPEN_READWRITE|SQLITE_OPEN_CREATE|SQLITE_OPEN_WAL|SQLITE_OPEN_NOFOLLOW; + int phys_wal_flags = SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE | + SQLITE_OPEN_WAL | SQLITE_OPEN_NOFOLLOW; e->wal_cur = sqlite3_malloc(file_cap); if (e->wal_cur == NULL) { return SQLITE_NOMEM; } - rv = v->xOpen(v, e->wal_cur_fixed_name, e->wal_cur, phys_wal_flags, NULL); + rv = v->xOpen(v, e->wal_cur_fixed_name, e->wal_cur, phys_wal_flags, + NULL); if (rv != SQLITE_OK) { return rv; } @@ -1249,7 +1227,8 @@ static int open_entry(struct common *common, const char *name, struct entry *e) if (e->wal_prev == NULL) { return SQLITE_NOMEM; } - rv = v->xOpen(v, e->wal_prev_fixed_name, e->wal_prev, phys_wal_flags, NULL); + rv = v->xOpen(v, e->wal_prev_fixed_name, e->wal_prev, phys_wal_flags, + NULL); if (rv != SQLITE_OK) { return rv; } @@ -1322,7 +1301,8 @@ static int open_entry(struct common *common, const char *name, struct entry *e) e->wal_cursor = wal_cursor_from_size(e->page_size, size_cur); int next = WTX_EMPTY; - if (size_cur >= wal_offset_from_cursor(0 /* this doesn't matter */, 0)) { + if (size_cur >= + wal_offset_from_cursor(0 /* this doesn't matter */, 0)) { /* TODO verify the header here */ e->page_size = ByteGetBe32(hdr_cur.page_size); next = WTX_FLUSH; @@ -1336,7 +1316,10 @@ static int open_entry(struct common *common, const char *name, struct entry *e) return SQLITE_OK; } -static int set_up_entry(struct common *common, const char *name, int flags, struct entry **e) +static int set_up_entry(struct common *common, + const char *name, + int flags, + struct entry **e) { bool name_is_db = (flags & SQLITE_OPEN_MAIN_DB) != 0; bool name_is_wal = (flags & SQLITE_OPEN_WAL) != 0; @@ -1359,16 +1342,18 @@ static int set_up_entry(struct common *common, const char *name, int flags, stru if (res != NULL) { sqlite3_free(*e); *e = res; - unsigned *refcount = name_is_db ? &res->refcount_main_db : &res->refcount_wal; + unsigned *refcount = + name_is_db ? &res->refcount_main_db : &res->refcount_wal; *refcount += 1; return SQLITE_OK; } assert(name_is_db); res = *e; - /* If open_entry fails we still want to link in the entry. Since we unconditionally - * set pMethods in our file vtable, SQLite will xClose the file and vfs2_close - * will run to clean up the partial work of open_entry. */ + /* If open_entry fails we still want to link in the entry. Since we + * unconditionally set pMethods in our file vtable, SQLite will xClose + * the file and vfs2_close will run to clean up the partial work of + * open_entry. */ rv = open_entry(common, name, res); pthread_rwlock_wrlock(&common->rwlock); queue_insert_tail(&common->queue, &res->link); @@ -1402,7 +1387,7 @@ static int vfs2_open(sqlite3_vfs *vfs, } } - if (flags & (SQLITE_OPEN_MAIN_DB|SQLITE_OPEN_WAL)) { + if (flags & (SQLITE_OPEN_MAIN_DB | SQLITE_OPEN_WAL)) { xout->entry = sqlite3_malloc(sizeof(*xout->entry)); if (xout->entry == NULL) { return SQLITE_NOMEM; @@ -1420,7 +1405,8 @@ static int vfs2_open(sqlite3_vfs *vfs, return SQLITE_OK; } -/* TODO does this need to be customized? should it ever be called on one of our files? */ +/* TODO does this need to be customized? should it ever be called on one of our + * files? */ static int vfs2_delete(sqlite3_vfs *vfs, const char *name, int sync_dir) { struct common *data = vfs->pAppData; @@ -1591,7 +1577,9 @@ int vfs2_commit(sqlite3_file *file, struct vfs2_wal_slice stop) PRE(e->shm_locks[WAL_WRITE_LOCK] == VFS2_EXCLUSIVE); sqlite3_file *wal_cur = e->wal_cur; struct wal_frame_hdr fhdr; - int rv = wal_cur->pMethods->xRead(wal_cur, &fhdr, sizeof(fhdr), wal_offset_from_cursor(e->page_size, stop.start + stop.len - 1)); + int rv = wal_cur->pMethods->xRead( + wal_cur, &fhdr, sizeof(fhdr), + wal_offset_from_cursor(e->page_size, stop.start + stop.len - 1)); if (rv == SQLITE_OK) { return rv; } @@ -1613,7 +1601,9 @@ int vfs2_commit_barrier(sqlite3_file *file) if (e->wal_cursor > 0) { sqlite3_file *wal_cur = e->wal_cur; struct wal_frame_hdr fhdr; - int rv = wal_cur->pMethods->xRead(wal_cur, &fhdr, sizeof(fhdr), wal_offset_from_cursor(e->page_size, e->wal_cursor - 1)); + int rv = wal_cur->pMethods->xRead( + wal_cur, &fhdr, sizeof(fhdr), + wal_offset_from_cursor(e->page_size, e->wal_cursor - 1)); if (rv == SQLITE_OK) { return rv; } @@ -1621,13 +1611,17 @@ int vfs2_commit_barrier(sqlite3_file *file) /* It's okay if the write lock isn't held */ e->shm_locks[WAL_WRITE_LOCK] = 0; get_full_hdr(e)->basic[0].isInit = 0; - /* The next transaction will cause SQLite to run recovery which will complete the transition to BASE */ + /* The next transaction will cause SQLite to run recovery which + * will complete the transition to BASE */ sm_move(&e->wtx_sm, WTX_FLUSH); } return 0; } -int vfs2_poll(sqlite3_file *file, struct vfs2_wal_frame **frames, unsigned *n, struct vfs2_wal_slice *sl) +int vfs2_poll(sqlite3_file *file, + struct vfs2_wal_frame **frames, + unsigned *n, + struct vfs2_wal_slice *sl) { struct file *xfile = (struct file *)file; PRE(xfile->flags & SQLITE_OPEN_MAIN_DB); @@ -1644,7 +1638,8 @@ int vfs2_poll(sqlite3_file *file, struct vfs2_wal_frame **frames, unsigned *n, s e->shm_locks[WAL_WRITE_LOCK] = VFS2_EXCLUSIVE; } - /* Note, not resetting pending_txn_{start,len} because they are used by later states */ + /* Note, not resetting pending_txn_{start,len} because they are used by + * later states */ if (n != NULL && frames != NULL) { *n = len; *frames = e->pending_txn_frames; @@ -1678,7 +1673,8 @@ void vfs2_destroy(sqlite3_vfs *vfs) int vfs2_abort(sqlite3_file *file) { - /* TODO maybe can "followerize" this and get rid of vfs2_unapply_after? */ + /* TODO maybe can "followerize" this and get rid of vfs2_unapply_after? + */ struct file *xfile = (struct file *)file; PRE(xfile->flags & SQLITE_OPEN_MAIN_DB); struct entry *e = xfile->entry; @@ -1711,7 +1707,8 @@ int vfs2_read_wal(sqlite3_file *file, int page_size = (int)e->page_size; for (size_t i = 0; i < txns_len; i++) { - struct vfs2_wal_frame *f = sqlite3_malloc64(txns[i].meta.len * sizeof(*f)); + struct vfs2_wal_frame *f = + sqlite3_malloc64(txns[i].meta.len * sizeof(*f)); if (f == NULL) { goto oom; } @@ -1728,11 +1725,14 @@ int vfs2_read_wal(sqlite3_file *file, for (size_t i = 0; i < txns_len; i++) { sqlite3_file *wal; unsigned read_lock; - bool from_wal_cur = salts_equal(txns[i].meta.salts, e->wal_cur_hdr.salts); - bool from_wal_prev = salts_equal(txns[i].meta.salts, e->wal_prev_hdr.salts); + bool from_wal_cur = + salts_equal(txns[i].meta.salts, e->wal_cur_hdr.salts); + bool from_wal_prev = + salts_equal(txns[i].meta.salts, e->wal_prev_hdr.salts); assert(from_wal_cur ^ from_wal_prev); if (from_wal_cur) { - rv = vfs2_pseudo_read_begin(file, e->wal_cursor, &read_lock); + rv = vfs2_pseudo_read_begin(file, e->wal_cursor, + &read_lock); if (rv != SQLITE_OK) { return 1; } @@ -1744,18 +1744,22 @@ int vfs2_read_wal(sqlite3_file *file, uint32_t start = txns[i].meta.start; uint32_t len = txns[i].meta.len; for (uint32_t j = 0; j < len; j++) { - sqlite3_int64 off = wal_offset_from_cursor(e->page_size, start + j); + sqlite3_int64 off = + wal_offset_from_cursor(e->page_size, start + j); struct wal_frame_hdr fhdr; - rv = wal->pMethods->xRead(wal, &fhdr, sizeof(fhdr), off); + rv = + wal->pMethods->xRead(wal, &fhdr, sizeof(fhdr), off); if (rv != SQLITE_OK) { return 1; } off += (sqlite3_int64)sizeof(fhdr); - rv = wal->pMethods->xRead(wal, txns[i].frames[j].page, page_size, off); + rv = wal->pMethods->xRead(wal, txns[i].frames[j].page, + page_size, off); if (rv != SQLITE_OK) { return 1; } - txns[i].frames[j].page_number = ByteGetBe32(fhdr.page_number); + txns[i].frames[j].page_number = + ByteGetBe32(fhdr.page_number); txns[i].frames[j].commit = ByteGetBe32(fhdr.commit); } if (from_wal_cur) { @@ -1776,7 +1780,9 @@ int vfs2_read_wal(sqlite3_file *file, return 1; } -static int write_one_frame(struct entry *e, struct wal_frame_hdr hdr, void *data) +static int write_one_frame(struct entry *e, + struct wal_frame_hdr hdr, + void *data) { int rv; @@ -1786,7 +1792,8 @@ static int write_one_frame(struct entry *e, struct wal_frame_hdr hdr, void *data return rv; } off += (sqlite3_int64)sizeof(hdr); - rv = e->wal_cur->pMethods->xWrite(e->wal_cur, data, (int)e->page_size, off); + rv = e->wal_cur->pMethods->xWrite(e->wal_cur, data, (int)e->page_size, + off); if (rv != SQLITE_OK) { return rv; } @@ -1804,31 +1811,41 @@ static struct wal_hdr next_wal_hdr(const struct entry *e) uint32_t ckpoint_seqno = ByteGetBe32(old.ckpoint_seqno); BytePutBe32(ckpoint_seqno + 1, ret.ckpoint_seqno); uint32_t salt1; - if (ckpoint_seqno == 0) { + if (ckpoint_seqno == 0) { salt1 = get_salt1(old.salts) + 1; } else { - e->common->orig->xRandomness(e->common->orig, sizeof(salt1), (void *)&salt1); + e->common->orig->xRandomness(e->common->orig, sizeof(salt1), + (void *)&salt1); } BytePutBe32(salt1, ret.salts.salt1); - e->common->orig->xRandomness(e->common->orig, sizeof(ret.salts.salt2), (void *)&ret.salts.salt2); + e->common->orig->xRandomness(e->common->orig, sizeof(ret.salts.salt2), + (void *)&ret.salts.salt2); return ret; } -static struct wal_frame_hdr txn_frame_hdr(struct entry *e, struct cksums sums, struct vfs2_wal_frame frame) +static struct wal_frame_hdr txn_frame_hdr(struct entry *e, + struct cksums sums, + struct vfs2_wal_frame frame) { struct wal_frame_hdr fhdr; BytePutBe32(frame.page_number, fhdr.page_number); BytePutBe32(frame.commit, fhdr.commit); - update_cksums(ByteGetBe32(e->wal_cur_hdr.magic), (const void *)(&fhdr), 8, &sums); - update_cksums(ByteGetBe32(e->wal_cur_hdr.magic), frame.page, e->page_size, &sums); + update_cksums(ByteGetBe32(e->wal_cur_hdr.magic), (const void *)(&fhdr), + 8, &sums); + update_cksums(ByteGetBe32(e->wal_cur_hdr.magic), frame.page, + e->page_size, &sums); fhdr.salts = e->wal_cur_hdr.salts; BytePutBe32(sums.cksum1, fhdr.cksum1); BytePutBe32(sums.cksum2, fhdr.cksum2); return fhdr; } -int vfs2_apply_uncommitted(sqlite3_file *file, uint32_t page_size, const struct vfs2_wal_frame *frames, unsigned len, struct vfs2_wal_slice *out) +int vfs2_apply_uncommitted(sqlite3_file *file, + uint32_t page_size, + const struct vfs2_wal_frame *frames, + unsigned len, + struct vfs2_wal_slice *out) { PRE(len > 0); PRE(is_valid_page_size(page_size)); @@ -1854,7 +1871,7 @@ int vfs2_apply_uncommitted(sqlite3_file *file, uint32_t page_size, const struct * WAL-cur (mxFrame) to equal the number of applies frames * (wal_cursor). */ e->shm_locks[WAL_WRITE_LOCK] = VFS2_EXCLUSIVE; - + struct wal_index_full_hdr *ihdr = get_full_hdr(e); uint32_t mx = ihdr->basic[0].mxFrame; if (mx > 0 && ihdr->nBackfill == mx) { @@ -1873,8 +1890,10 @@ int vfs2_apply_uncommitted(sqlite3_file *file, uint32_t page_size, const struct if (start > 0) { /* TODO cache this in the entry? */ struct wal_frame_hdr prev_fhdr; - sqlite3_int64 off = wal_offset_from_cursor(e->page_size, e->wal_cursor - 1); - rv = e->wal_cur->pMethods->xRead(e->wal_cur, &prev_fhdr, sizeof(prev_fhdr), off); + sqlite3_int64 off = + wal_offset_from_cursor(e->page_size, e->wal_cursor - 1); + rv = e->wal_cur->pMethods->xRead(e->wal_cur, &prev_fhdr, + sizeof(prev_fhdr), off); if (rv != SQLITE_OK) { return 1; } @@ -1899,7 +1918,7 @@ int vfs2_apply_uncommitted(sqlite3_file *file, uint32_t page_size, const struct if (rv != SQLITE_OK) { return 1; } - } + } sm_move(&e->wtx_sm, WTX_FOLLOWING); out->salts = e->wal_cur_hdr.salts; @@ -1928,7 +1947,7 @@ int vfs2_pseudo_read_begin(sqlite3_file *file, uint32_t target, unsigned *out) /* adapted from walTryBeginRead */ uint32_t max_mark = 0; unsigned max_index = 0; - for (unsigned i = 1; i < WAL_NREADER; i++){ + for (unsigned i = 1; i < WAL_NREADER; i++) { uint32_t cur = ihdr->marks[i]; if (max_mark <= cur && cur <= target) { assert(cur != READ_MARK_UNUSED); diff --git a/src/vfs2.h b/src/vfs2.h index f2181e760..418f79a4d 100644 --- a/src/vfs2.h +++ b/src/vfs2.h @@ -47,7 +47,10 @@ struct vfs2_wal_frame { * * Polling the same transaction more than once is an error. */ -int vfs2_poll(sqlite3_file *file, struct vfs2_wal_frame **frames, unsigned *n, struct vfs2_wal_slice *sl); +int vfs2_poll(sqlite3_file *file, + struct vfs2_wal_frame **frames, + unsigned *n, + struct vfs2_wal_slice *sl); int vfs2_unhide(sqlite3_file *file); @@ -55,7 +58,11 @@ int vfs2_commit(sqlite3_file *file, struct vfs2_wal_slice stop); int vfs2_commit_barrier(sqlite3_file *file); -int vfs2_apply_uncommitted(sqlite3_file *file, uint32_t page_size, const struct vfs2_wal_frame *frames, unsigned n, struct vfs2_wal_slice *out); +int vfs2_apply_uncommitted(sqlite3_file *file, + uint32_t page_size, + const struct vfs2_wal_frame *frames, + unsigned n, + struct vfs2_wal_slice *out); int vfs2_unapply(sqlite3_file *file, struct vfs2_wal_slice stop); diff --git a/test/unit/test_vfs2.c b/test/unit/test_vfs2.c index 635d7ce7f..1065469f0 100644 --- a/test/unit/test_vfs2.c +++ b/test/unit/test_vfs2.c @@ -1,7 +1,7 @@ -#pragma GCC diagnostic ignored "-Wformat-truncation" // XXX +#pragma GCC diagnostic ignored "-Wformat-truncation" // XXX -#include "../../src/vfs2.h" #include "../../src/lib/byte.h" +#include "../../src/vfs2.h" #include "../lib/fs.h" #include "../lib/runner.h" @@ -86,11 +86,13 @@ static void check_wals(const char *dbname, off_t wal1_len, off_t wal2_len) snprintf(buf, sizeof(buf), "%s-xwal1", dbname); rv = stat(buf, &st); - munit_assert_true((rv == 0 && st.st_size == wal1_len) || (rv < 0 && errno == ENOENT && wal1_len == 0)); + munit_assert_true((rv == 0 && st.st_size == wal1_len) || + (rv < 0 && errno == ENOENT && wal1_len == 0)); snprintf(buf, sizeof(buf), "%s-xwal2", dbname); rv = stat(buf, &st); - munit_assert_true((rv == 0 && st.st_size == wal2_len) || (rv < 0 && errno == ENOENT && wal2_len == 0)); + munit_assert_true((rv == 0 && st.st_size == wal2_len) || + (rv < 0 && errno == ENOENT && wal2_len == 0)); } TEST(vfs2, basic, set_up, tear_down, 0, NULL) @@ -105,13 +107,14 @@ TEST(vfs2, basic, set_up, tear_down, 0, NULL) munit_assert_int(rv, ==, SQLITE_OK); rv = sqlite3_exec(db, - "PRAGMA page_size=" PAGE_SIZE_STR ";" + "PRAGMA page_size=" PAGE_SIZE_STR + ";" "PRAGMA journal_mode=WAL;" "PRAGMA wal_autocheckpoint=0", NULL, NULL, NULL); munit_assert_int(rv, ==, SQLITE_OK); - char *args[] = {NULL, "page_size", NULL}; + char *args[] = { NULL, "page_size", NULL }; rv = sqlite3_file_control(db, "main", SQLITE_FCNTL_PRAGMA, args); sqlite3_file *fp; sqlite3_file_control(db, "main", SQLITE_FCNTL_FILE_POINTER, &fp); @@ -211,7 +214,10 @@ TEST(vfs2, basic, set_up, tear_down, 0, NULL) #define WAL_SIZE_FROM_FRAMES(n) (32 + (24 + PAGE_SIZE) * (n)) -static void make_wal_hdr(uint8_t *buf, uint32_t ckpoint_seqno, uint32_t salt1, uint32_t salt2) +static void make_wal_hdr(uint8_t *buf, + uint32_t ckpoint_seqno, + uint32_t salt1, + uint32_t salt2) { uint8_t *p = buf; @@ -260,7 +266,7 @@ TEST(vfs2, startup_one_nonempty, set_up, tear_down, 0, NULL) check_wals(buf, 0, 0); - uint8_t wal2_hdronly[WAL_SIZE_FROM_FRAMES(0)] = {0}; + uint8_t wal2_hdronly[WAL_SIZE_FROM_FRAMES(0)] = { 0 }; make_wal_hdr(wal2_hdronly, 0, 17, 103); prepare_wals(buf, NULL, 0, wal2_hdronly, sizeof(wal2_hdronly)); sqlite3 *db; @@ -269,7 +275,8 @@ TEST(vfs2, startup_one_nonempty, set_up, tear_down, 0, NULL) munit_assert_int(rv, ==, SQLITE_OK); tracef("setup..."); rv = sqlite3_exec(db, - "PRAGMA page_size=" PAGE_SIZE_STR ";" + "PRAGMA page_size=" PAGE_SIZE_STR + ";" "PRAGMA journal_mode=WAL;" "PRAGMA wal_autocheckpoint=0", NULL, NULL, NULL); @@ -300,16 +307,18 @@ TEST(vfs2, startup_both_nonempty, set_up, tear_down, 0, NULL) check_wals(buf, 0, 0); - uint8_t wal1_hdronly[WAL_SIZE_FROM_FRAMES(0)] = {0}; + uint8_t wal1_hdronly[WAL_SIZE_FROM_FRAMES(0)] = { 0 }; make_wal_hdr(wal1_hdronly, 0, 18, 103); - uint8_t wal2_hdronly[WAL_SIZE_FROM_FRAMES(0)] = {0}; + uint8_t wal2_hdronly[WAL_SIZE_FROM_FRAMES(0)] = { 0 }; make_wal_hdr(wal2_hdronly, 0, 17, 103); - prepare_wals(buf, wal1_hdronly, sizeof(wal1_hdronly), wal2_hdronly, sizeof(wal2_hdronly)); + prepare_wals(buf, wal1_hdronly, sizeof(wal1_hdronly), wal2_hdronly, + sizeof(wal2_hdronly)); sqlite3 *db; int rv = sqlite3_open(buf, &db); munit_assert_int(rv, ==, SQLITE_OK); rv = sqlite3_exec(db, - "PRAGMA page_size=" PAGE_SIZE_STR ";" + "PRAGMA page_size=" PAGE_SIZE_STR + ";" "PRAGMA journal_mode=WAL;" "PRAGMA wal_autocheckpoint=0", NULL, NULL, NULL); @@ -352,7 +361,8 @@ TEST(vfs2, rollback, set_up, tear_down, 0, NULL) munit_assert_int(rv, ==, SQLITE_OK); char sql[100]; for (unsigned i = 0; i < 500; i++) { - snprintf(sql, sizeof(sql), "INSERT INTO foo (n) VALUES (%d)", i); + snprintf(sql, sizeof(sql), "INSERT INTO foo (n) VALUES (%d)", + i); rv = sqlite3_exec(db, sql, NULL, NULL, NULL); munit_assert_int(rv, ==, SQLITE_OK); } From f2f3f83376c5ab92029f2ea945ccde03e6e45187 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Tue, 16 Jul 2024 17:06:10 -0400 Subject: [PATCH 02/73] Rename vfs2_apply_uncommitted and vfs2_unapply "Apply" is the verb we use for what happens when a raft log entry is committed, so avoid it for these functions that are associated with appending and truncating entries. Signed-off-by: Cole Miller --- src/vfs2.c | 25 ++++++++++++------------- src/vfs2.h | 12 ++++++------ 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/src/vfs2.c b/src/vfs2.c index 5247c7848..36c5ee2e6 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -1523,18 +1523,18 @@ sqlite3_vfs *vfs2_make(sqlite3_vfs *orig, const char *name) return vfs; } -int vfs2_unapply(sqlite3_file *file, struct vfs2_wal_slice first_to_unapply) +int vfs2_unadd(sqlite3_file *file, struct vfs2_wal_slice first_to_unadd) { struct file *xfile = (struct file *)file; PRE(xfile->flags & SQLITE_OPEN_MAIN_DB); struct entry *e = xfile->entry; - PRE(salts_equal(first_to_unapply.salts, e->wal_cur_hdr.salts)); - PRE(first_to_unapply.start + first_to_unapply.len <= e->wal_cursor); + PRE(salts_equal(first_to_unadd.salts, e->wal_cur_hdr.salts)); + PRE(first_to_unadd.start + first_to_unadd.len <= e->wal_cursor); struct wal_index_full_hdr *ihdr = get_full_hdr(e); - PRE(first_to_unapply.start >= ihdr->basic[0].mxFrame); + PRE(first_to_unadd.start >= ihdr->basic[0].mxFrame); PRE(e->shm_locks[WAL_WRITE_LOCK] == VFS2_EXCLUSIVE); - e->wal_cursor = first_to_unapply.start; + e->wal_cursor = first_to_unadd.start; if (e->wal_cursor == ihdr->basic[0].mxFrame) { e->shm_locks[WAL_WRITE_LOCK] = 0; @@ -1673,8 +1673,7 @@ void vfs2_destroy(sqlite3_vfs *vfs) int vfs2_abort(sqlite3_file *file) { - /* TODO maybe can "followerize" this and get rid of vfs2_unapply_after? - */ + /* TODO maybe can "followerize" this and get rid of vfs2_unadd_after? */ struct file *xfile = (struct file *)file; PRE(xfile->flags & SQLITE_OPEN_MAIN_DB); struct entry *e = xfile->entry; @@ -1841,11 +1840,11 @@ static struct wal_frame_hdr txn_frame_hdr(struct entry *e, return fhdr; } -int vfs2_apply_uncommitted(sqlite3_file *file, - uint32_t page_size, - const struct vfs2_wal_frame *frames, - unsigned len, - struct vfs2_wal_slice *out) +int vfs2_add_uncommitted(sqlite3_file *file, + uint32_t page_size, + const struct vfs2_wal_frame *frames, + unsigned len, + struct vfs2_wal_slice *out) { PRE(len > 0); PRE(is_valid_page_size(page_size)); @@ -1867,7 +1866,7 @@ int vfs2_apply_uncommitted(sqlite3_file *file, * it's held before returning. * * The write lock will be released when a call to vfs2_commit - * or vfs2_unapply causes the number of committed frames in + * or vfs2_unadd causes the number of committed frames in * WAL-cur (mxFrame) to equal the number of applies frames * (wal_cursor). */ e->shm_locks[WAL_WRITE_LOCK] = VFS2_EXCLUSIVE; diff --git a/src/vfs2.h b/src/vfs2.h index 418f79a4d..32b6d4c0c 100644 --- a/src/vfs2.h +++ b/src/vfs2.h @@ -58,13 +58,13 @@ int vfs2_commit(sqlite3_file *file, struct vfs2_wal_slice stop); int vfs2_commit_barrier(sqlite3_file *file); -int vfs2_apply_uncommitted(sqlite3_file *file, - uint32_t page_size, - const struct vfs2_wal_frame *frames, - unsigned n, - struct vfs2_wal_slice *out); +int vfs2_add_uncommitted(sqlite3_file *file, + uint32_t page_size, + const struct vfs2_wal_frame *frames, + unsigned n, + struct vfs2_wal_slice *out); -int vfs2_unapply(sqlite3_file *file, struct vfs2_wal_slice stop); +int vfs2_unadd(sqlite3_file *file, struct vfs2_wal_slice stop); struct vfs2_wal_txn { struct vfs2_wal_slice meta; From f418816b16931ef12b4a3df7b77c1b087c976295 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Tue, 16 Jul 2024 21:28:40 -0400 Subject: [PATCH 03/73] Remove FLUSH state and invariant Signed-off-by: Cole Miller --- src/vfs2.c | 259 ++++++++++------------------------------------------- src/vfs2.h | 2 - 2 files changed, 45 insertions(+), 216 deletions(-) diff --git a/src/vfs2.c b/src/vfs2.c index 36c5ee2e6..88c78d5d2 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -3,7 +3,6 @@ #include "lib/byte.h" #include "lib/queue.h" #include "lib/sm.h" -#include "tracing.h" #include "utils.h" #include @@ -49,9 +48,6 @@ enum { WTX_EMPTY, /* Non-leader, at least one transaction in WAL-cur is not committed. */ WTX_FOLLOWING, - /* Non-leader, all transactions in WAL-cur are committed (but at least - one is not checkpointed). */ - WTX_FLUSH, /* Leader, all transactions in WAL-cur are committed (but at least one is not checkpointed). */ WTX_BASE, @@ -67,43 +63,46 @@ static const struct sm_conf wtx_states[SM_STATES_MAX] = { [WTX_CLOSED] = { .flags = SM_INITIAL|SM_FINAL, .name = "closed", - .allowed = BITS(WTX_EMPTY)|BITS(WTX_FOLLOWING)|BITS(WTX_FLUSH), - }, - [WTX_EMPTY] = { - .flags = 0, - .name = "empty", - .allowed = BITS(WTX_FOLLOWING)|BITS(WTX_FLUSH)|BITS(WTX_ACTIVE)|BITS(WTX_CLOSED), - }, - [WTX_FOLLOWING] = { - .flags = 0, - .name = "following", - .allowed = BITS(WTX_FOLLOWING)|BITS(WTX_FLUSH)|BITS(WTX_CLOSED), - }, - [WTX_FLUSH] = { - .flags = 0, - .name = "flush", - .allowed = BITS(WTX_FOLLOWING)|BITS(WTX_FLUSH)|BITS(WTX_ACTIVE)|BITS(WTX_CLOSED), - }, - [WTX_BASE] = { - .flags = 0, - .name = "base", - .allowed = BITS(WTX_FOLLOWING)|BITS(WTX_BASE)|BITS(WTX_ACTIVE)|BITS(WTX_EMPTY)|BITS(WTX_CLOSED), - }, - [WTX_ACTIVE] = { - .flags = 0, - .name = "active", - .allowed = BITS(WTX_BASE)|BITS(WTX_ACTIVE)|BITS(WTX_HIDDEN)|BITS(WTX_CLOSED), - }, - [WTX_HIDDEN] = { - .flags = 0, - .name = "hidden", - .allowed = BITS(WTX_BASE)|BITS(WTX_POLLED)|BITS(WTX_CLOSED), - }, - [WTX_POLLED] = { - .flags = 0, - .name = "polled", - .allowed = BITS(WTX_BASE)|BITS(WTX_CLOSED), + .allowed = BITS(WTX_EMPTY) + |BITS(WTX_BASE) + |BITS(WTX_FOLLOWING), }, + [WTX_EMPTY] = { + .name = "empty", + .allowed = BITS(WTX_FOLLOWING) + |BITS(WTX_ACTIVE) + |BITS(WTX_CLOSED), + }, + [WTX_FOLLOWING] = { + .name = "following", + .allowed = BITS(WTX_BASE) + |BITS(WTX_FOLLOWING) + |BITS(WTX_CLOSED), + }, + [WTX_BASE] = { + .name = "base", + .allowed = BITS(WTX_ACTIVE) + |BITS(WTX_FOLLOWING) + |BITS(WTX_EMPTY) + |BITS(WTX_CLOSED), + }, + [WTX_ACTIVE] = { + .name = "active", + .allowed = BITS(WTX_BASE) + |BITS(WTX_HIDDEN) + |BITS(WTX_CLOSED), + }, + [WTX_HIDDEN] = { + .name = "hidden", + .allowed = BITS(WTX_BASE) + |BITS(WTX_POLLED) + |BITS(WTX_CLOSED), + }, + [WTX_POLLED] = { + .name = "polled", + .allowed = BITS(WTX_BASE) + |BITS(WTX_CLOSED), + }, }; /** @@ -335,65 +334,17 @@ static bool no_pending_txn(const struct entry *e) e->pending_txn_last_frame_commit == 0; } -static bool write_lock_held(const struct entry *e) -{ - return e->shm_locks[WAL_WRITE_LOCK] == VFS2_EXCLUSIVE; -} - static bool wal_index_basic_hdr_equal(struct wal_index_basic_hdr a, struct wal_index_basic_hdr b) { return memcmp(&a, &b, sizeof(struct wal_index_basic_hdr)) == 0; } -static bool wal_index_basic_hdr_zeroed(struct wal_index_basic_hdr h) -{ - return wal_index_basic_hdr_equal(h, (struct wal_index_basic_hdr){}); -} - -static bool wal_index_basic_hdr_advanced(struct wal_index_basic_hdr new, - struct wal_index_basic_hdr old) -{ - return new.iChange == old.iChange + 1 && - new.nPage >= old.nPage /* no vacuums here */ - && ((get_salt1(new.salts) == get_salt1(old.salts) && - get_salt2(new.salts) == get_salt2(old.salts)) || - /* note the weirdness with zero salts */ - (get_salt1(old.salts) == 0 && get_salt2(old.salts) == 0)) && - new.mxFrame > old.mxFrame; -} - -/* Check that the hash tables in the WAL index have been initialized - * by looking for nonzero bytes after the WAL index header. (TODO: - * actually parse the hash tables?) */ -static bool wal_index_recovered(const struct entry *e) -{ - PRE(e->shm_regions_len > 0); - char *p = e->shm_regions[0]; - for (size_t i = sizeof(struct wal_index_full_hdr); - i < VFS2_WAL_INDEX_REGION_SIZE; i++) { - if (p[i] != 0) { - return true; - } - } - return false; -} - static bool is_valid_page_size(unsigned long n) { return n >= 1 << 9 && n <= 1 << 16 && is_po2(n); } -static bool is_open(const struct entry *e) -{ - return e->main_db_name != NULL && e->wal_moving_name != NULL && - e->wal_cur_fixed_name != NULL && e->wal_cur != NULL && - e->wal_prev_fixed_name != NULL && e->wal_prev != NULL && - (e->refcount_main_db > 0 || e->refcount_wal > 0) && - e->shm_regions != NULL && e->shm_regions_len > 0 && - e->shm_regions[0] != NULL && e->common != NULL; -} - static bool basic_hdr_valid(struct wal_index_basic_hdr bhdr) { struct cksums sums = {}; @@ -411,101 +362,9 @@ static bool full_hdr_valid(const struct wal_index_full_hdr *ihdr) static bool wtx_invariant(const struct sm *sm, int prev) { - /* TODO make use of this */ + (void)sm; (void)prev; - - struct entry *e = CONTAINER_OF(sm, struct entry, wtx_sm); - - if (sm_state(sm) == WTX_CLOSED) { - char *region = (char *)e; - char zeroed[offsetof(struct entry, wtx_sm)] = {}; - - return CHECK(memcmp(region, zeroed, sizeof(zeroed)) == 0) && - CHECK(e->common != NULL); - } - - if (!CHECK(is_open(e))) { - return false; - } - struct wal_index_full_hdr *ihdr = get_full_hdr(e); - if (!CHECK(full_hdr_valid(ihdr))) { - return false; - } - uint32_t mx = ihdr->basic[0].mxFrame; - uint32_t backfill = ihdr->nBackfill; - uint32_t cursor = e->wal_cursor; - if (!CHECK(backfill <= mx) || !CHECK(mx <= cursor)) { - return false; - } - - /* TODO any checks applicable to the read marks and read locks? */ - - if (sm_state(sm) == WTX_EMPTY) { - return CHECK(mx == backfill) && CHECK(mx == cursor) && - CHECK(no_pending_txn(e)) && CHECK(!write_lock_held(e)); - } - - if (!CHECK(is_valid_page_size(e->page_size))) { - return false; - } - - if (sm_state(sm) == WTX_FOLLOWING) { - return CHECK(no_pending_txn(e)) && CHECK(write_lock_held(e)) && - CHECK(mx < cursor); - } - - if (sm_state(sm) == WTX_FLUSH) { - return CHECK(no_pending_txn(e)) && CHECK(!write_lock_held(e)) && - CHECK(ERGO(mx > 0, backfill < mx)) && - CHECK(mx == cursor); - } - - if (sm_state(sm) == WTX_BASE) { - return CHECK(no_pending_txn(e)) && CHECK(!write_lock_held(e)) && - CHECK(ERGO(mx > 0, backfill < mx)) && - CHECK(mx == cursor) && - CHECK(ERGO(mx > 0, wal_index_recovered(e))); - } - - if (sm_state(sm) == WTX_ACTIVE) { - return CHECK(wal_index_basic_hdr_equal( - get_full_hdr(e)->basic[0], e->prev_txn_hdr)) && - CHECK(wal_index_basic_hdr_zeroed(e->pending_txn_hdr)) && - CHECK(write_lock_held(e)); - } - - if (!CHECK(mx < cursor) || !CHECK(e->pending_txn_len > 0) || - !CHECK(e->pending_txn_start + e->pending_txn_len == - e->wal_cursor)) { - return false; - } - - if (sm_state(sm) == WTX_HIDDEN) { - bool res = CHECK(wal_index_basic_hdr_equal( - get_full_hdr(e)->basic[0], e->prev_txn_hdr)) && - CHECK(wal_index_basic_hdr_advanced( - e->pending_txn_hdr, e->prev_txn_hdr)) && - CHECK(!write_lock_held(e)) && - CHECK(e->pending_txn_frames != NULL); - if (!res) { - return false; - } - for (uint32_t i = 0; i < e->pending_txn_len; i++) { - res &= CHECK(e->pending_txn_frames[i].page != NULL); - } - return res; - } - - if (sm_state(sm) == WTX_POLLED) { - return CHECK(wal_index_basic_hdr_equal( - get_full_hdr(e)->basic[0], e->prev_txn_hdr)) && - CHECK(wal_index_basic_hdr_advanced(e->pending_txn_hdr, - e->prev_txn_hdr)) && - CHECK(write_lock_held(e)) && - CHECK(e->pending_txn_frames == NULL); - } - - assert(0); + return true; } static int check_wal_integrity(sqlite3_file *f) @@ -1011,9 +870,7 @@ static int vfs2_shm_lock(sqlite3_file *file, int ofst, int n, int flags) if (ihdr->nBackfill == ihdr->basic[0].mxFrame) { sm_move(&e->wtx_sm, WTX_EMPTY); } - } /* else if (ofst <= WAL_RECOVER_LOCK && WAL_RECOVER_LOCK < - ofst + n) { sm_move(&e->wtx_sm, WTX_BASE); - } */ + } } else { assert(0); } @@ -1305,7 +1162,7 @@ static int open_entry(struct common *common, const char *name, struct entry *e) wal_offset_from_cursor(0 /* this doesn't matter */, 0)) { /* TODO verify the header here */ e->page_size = ByteGetBe32(hdr_cur.page_size); - next = WTX_FLUSH; + next = WTX_BASE; } if (size_cur >= wal_offset_from_cursor(e->page_size, 1)) { e->shm_locks[WAL_WRITE_LOCK] = VFS2_EXCLUSIVE; @@ -1538,7 +1395,7 @@ int vfs2_unadd(sqlite3_file *file, struct vfs2_wal_slice first_to_unadd) if (e->wal_cursor == ihdr->basic[0].mxFrame) { e->shm_locks[WAL_WRITE_LOCK] = 0; - sm_move(&e->wtx_sm, WTX_FLUSH); + sm_move(&e->wtx_sm, WTX_BASE); } else { sm_move(&e->wtx_sm, WTX_FOLLOWING); } @@ -1586,38 +1443,13 @@ int vfs2_commit(sqlite3_file *file, struct vfs2_wal_slice stop) set_mx_frame(get_full_hdr(e), commit, fhdr); if (commit == e->wal_cursor) { e->shm_locks[WAL_WRITE_LOCK] = 0; - sm_move(&e->wtx_sm, WTX_FLUSH); + sm_move(&e->wtx_sm, WTX_BASE); } else { sm_move(&e->wtx_sm, WTX_FOLLOWING); } return 0; } -int vfs2_commit_barrier(sqlite3_file *file) -{ - struct file *xfile = (struct file *)file; - PRE(xfile->flags & SQLITE_OPEN_MAIN_DB); - struct entry *e = xfile->entry; - if (e->wal_cursor > 0) { - sqlite3_file *wal_cur = e->wal_cur; - struct wal_frame_hdr fhdr; - int rv = wal_cur->pMethods->xRead( - wal_cur, &fhdr, sizeof(fhdr), - wal_offset_from_cursor(e->page_size, e->wal_cursor - 1)); - if (rv == SQLITE_OK) { - return rv; - } - set_mx_frame(get_full_hdr(e), e->wal_cursor, fhdr); - /* It's okay if the write lock isn't held */ - e->shm_locks[WAL_WRITE_LOCK] = 0; - get_full_hdr(e)->basic[0].isInit = 0; - /* The next transaction will cause SQLite to run recovery which - * will complete the transition to BASE */ - sm_move(&e->wtx_sm, WTX_FLUSH); - } - return 0; -} - int vfs2_poll(sqlite3_file *file, struct vfs2_wal_frame **frames, unsigned *n, @@ -1880,7 +1712,6 @@ int vfs2_add_uncommitted(sqlite3_file *file, if (rv != SQLITE_OK) { return 1; } - /* sm_move(&e->wtx_sm, WTX_FLUSH); */ } uint32_t start = e->wal_cursor; diff --git a/src/vfs2.h b/src/vfs2.h index 32b6d4c0c..4ef0c677f 100644 --- a/src/vfs2.h +++ b/src/vfs2.h @@ -56,8 +56,6 @@ int vfs2_unhide(sqlite3_file *file); int vfs2_commit(sqlite3_file *file, struct vfs2_wal_slice stop); -int vfs2_commit_barrier(sqlite3_file *file); - int vfs2_add_uncommitted(sqlite3_file *file, uint32_t page_size, const struct vfs2_wal_frame *frames, From b137aaf7f9bb5b729d6ae7ca0d604ad4a2d3baed Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Tue, 16 Jul 2024 21:31:00 -0400 Subject: [PATCH 04/73] Remove ad-hoc tracing Signed-off-by: Cole Miller --- src/vfs2.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/vfs2.c b/src/vfs2.c index 88c78d5d2..2155f37d5 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -454,9 +454,6 @@ static int wal_swap(struct entry *e, const struct wal_hdr *wal_hdr) sqlite3_file *phys_incoming = e->wal_prev; char *name_incoming = e->wal_prev_fixed_name; - tracef("wal swap outgoing=%s incoming=%s", name_outgoing, - name_incoming); - /* Write the new header of the incoming WAL. */ rv = phys_incoming->pMethods->xWrite(phys_incoming, wal_hdr, sizeof(struct wal_hdr), 0); @@ -476,12 +473,10 @@ static int wal_swap(struct entry *e, const struct wal_hdr *wal_hdr) /* Move the moving name. */ rv = unlink(e->wal_moving_name); if (rv != 0 && errno != ENOENT) { - tracef("unlink = IOERR"); return SQLITE_IOERR; } rv = link(name_incoming, e->wal_moving_name); if (rv != 0) { - tracef("link = IOERR"); return SQLITE_IOERR; } @@ -504,7 +499,6 @@ static int vfs2_wal_write_frame_hdr(struct entry *e, e->pending_txn_start = x; } uint32_t n = e->pending_txn_len; - tracef("orig=%u start=%u n=%u", x, e->pending_txn_start, n); x -= e->pending_txn_start; assert(x <= n); if (e->pending_txn_len == 0 && x == 0) { @@ -586,7 +580,6 @@ static int vfs2_write(sqlite3_file *file, assert(amt == sizeof(struct wal_hdr)); const struct wal_hdr *hdr = buf; struct entry *e = xfile->entry; - tracef("about to wal swap"); rv = wal_swap(e, hdr); if (rv != SQLITE_OK) { return rv; @@ -608,8 +601,6 @@ static int vfs2_write(sqlite3_file *file, if (xfile->flags & SQLITE_OPEN_WAL) { struct entry *e = xfile->entry; - tracef("wrote to WAL name=%s amt=%d ofst=%lld", - e->wal_cur_fixed_name, amt, ofst); return vfs2_wal_post_write(e, buf, amt, ofst); } @@ -849,14 +840,12 @@ static int vfs2_shm_lock(sqlite3_file *file, int ofst, int n, int flags) } if (ofst <= WAL_RECOVER_LOCK && WAL_RECOVER_LOCK < ofst + n) { - tracef("unlocking the recovery lock!"); } if (ofst == WAL_WRITE_LOCK) { /* Unlocking the write lock: roll back any uncommitted * transaction. */ assert(n == 1); - tracef("unlocking write lock"); /* TODO make sure this is correct */ if (e->pending_txn_last_frame_commit == 0) { free_pending_txn(e); From 64831fe44dc4b7308350b304fd5989aaddf50bb5 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Tue, 16 Jul 2024 23:30:46 -0400 Subject: [PATCH 05/73] Implement hash table updates for region0 Signed-off-by: Cole Miller --- src/vfs2.c | 44 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 7 deletions(-) diff --git a/src/vfs2.c b/src/vfs2.c index 2155f37d5..c1a6ffa1a 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -201,12 +201,17 @@ struct wal_index_full_hdr { uint8_t unused[4]; }; +#define REGION0_PGNOS_LEN 4062 +#define REGION0_HT_LEN 8192 + /** - * View of the zeroth shm region, which contains the WAL index header. + * View of the zeroth shm region, which contains the WAL index header + * and the first hash table. */ -union vfs2_shm_region0 { +struct vfs2_shm_region0 { struct wal_index_full_hdr hdr; - char bytes[VFS2_WAL_INDEX_REGION_SIZE]; + uint32_t pgnos[REGION0_PGNOS_LEN]; + uint16_t ht[REGION0_HT_LEN]; }; struct entry { @@ -970,22 +975,42 @@ static struct wal_index_full_hdr initial_full_hdr(struct wal_hdr whdr) return ihdr; } -static void set_mx_frame(struct wal_index_full_hdr *ihdr, +static void pgno_ht_insert(uint16_t *ht, size_t len, uint16_t fx, uint32_t pgno) +{ + uint32_t hash = pgno * 383; + while (ht[hash % len] != 0) { + hash++; + } + ht[hash % len] = fx; +} + +static void set_mx_frame(struct entry *e, uint32_t mx, struct wal_frame_hdr fhdr) { + struct wal_index_full_hdr *ihdr = get_full_hdr(e); uint32_t num_pages = ByteGetBe32(fhdr.commit); PRE(num_pages > 0); + uint32_t old_mx = ihdr->basic[0].mxFrame; ihdr->basic[0].iChange += 1; ihdr->basic[0].mxFrame = mx; ihdr->basic[0].nPage = num_pages; - /* XXX byte order */ ihdr->basic[0].frame_cksums.cksum1 = ByteGetBe32(fhdr.cksum1); ihdr->basic[0].frame_cksums.cksum2 = ByteGetBe32(fhdr.cksum2); struct cksums sums = {}; - update_cksums(native_magic(), (const void *)&ihdr->basic[0], 40, &sums); + update_cksums(native_magic(), (const uint8_t *)&ihdr->basic[0], + sizeof(ihdr->basic[0]), &sums); ihdr->basic[0].cksums = sums; ihdr->basic[1] = ihdr->basic[0]; + POST(full_hdr_valid(ihdr)); + + struct vfs2_shm_region0 *r0 = e->shm_regions[0]; + PRE(mx <= REGION0_PGNOS_LEN); + for (uint32_t i = old_mx; i < mx; i++) { + PRE(i < REGION0_PGNOS_LEN); + pgno_ht_insert(r0->ht, REGION0_HT_LEN, (uint16_t)i, + r0->pgnos[i]); + } } static void restart_full_hdr(struct wal_index_full_hdr *ihdr, @@ -1429,7 +1454,7 @@ int vfs2_commit(sqlite3_file *file, struct vfs2_wal_slice stop) if (rv == SQLITE_OK) { return rv; } - set_mx_frame(get_full_hdr(e), commit, fhdr); + set_mx_frame(e, commit, fhdr); if (commit == e->wal_cursor) { e->shm_locks[WAL_WRITE_LOCK] = 0; sm_move(&e->wtx_sm, WTX_BASE); @@ -1723,6 +1748,9 @@ int vfs2_add_uncommitted(sqlite3_file *file, sums.cksum2 = ByteGetBe32(e->wal_cur_hdr.cksum2); } + struct vfs2_shm_region0 *r0 = e->shm_regions[0]; + PRE(e->wal_cursor < REGION0_PGNOS_LEN); + r0->pgnos[e->wal_cursor] = frames[0].page_number; struct wal_frame_hdr fhdr = txn_frame_hdr(e, sums, frames[0]); rv = write_one_frame(e, fhdr, frames[0].page); if (rv != SQLITE_OK) { @@ -1730,6 +1758,8 @@ int vfs2_add_uncommitted(sqlite3_file *file, } for (unsigned i = 1; i < len; i++) { + PRE(e->wal_cursor < REGION0_PGNOS_LEN); + r0->pgnos[e->wal_cursor] = frames[i].page_number; sums.cksum1 = ByteGetBe32(fhdr.cksum1); sums.cksum2 = ByteGetBe32(fhdr.cksum2); fhdr = txn_frame_hdr(e, sums, frames[i]); From 10ba5f01f4b6d4cce8e224682938786738c241fc Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Tue, 16 Jul 2024 23:33:19 -0400 Subject: [PATCH 06/73] Rename vfs2_commit to vfs2_apply Signed-off-by: Cole Miller --- src/vfs2.c | 4 ++-- src/vfs2.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/vfs2.c b/src/vfs2.c index c1a6ffa1a..177ba70e4 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -1437,7 +1437,7 @@ int vfs2_unhide(sqlite3_file *file) return 0; } -int vfs2_commit(sqlite3_file *file, struct vfs2_wal_slice stop) +int vfs2_apply(sqlite3_file *file, struct vfs2_wal_slice stop) { struct file *xfile = (struct file *)file; PRE(xfile->flags & SQLITE_OPEN_MAIN_DB); @@ -1711,7 +1711,7 @@ int vfs2_add_uncommitted(sqlite3_file *file, * lock here before "acquiring" it, we just make sure that * it's held before returning. * - * The write lock will be released when a call to vfs2_commit + * The write lock will be released when a call to vfs2_apply * or vfs2_unadd causes the number of committed frames in * WAL-cur (mxFrame) to equal the number of applies frames * (wal_cursor). */ diff --git a/src/vfs2.h b/src/vfs2.h index 4ef0c677f..27f0fa841 100644 --- a/src/vfs2.h +++ b/src/vfs2.h @@ -54,7 +54,7 @@ int vfs2_poll(sqlite3_file *file, int vfs2_unhide(sqlite3_file *file); -int vfs2_commit(sqlite3_file *file, struct vfs2_wal_slice stop); +int vfs2_apply(sqlite3_file *file, struct vfs2_wal_slice stop); int vfs2_add_uncommitted(sqlite3_file *file, uint32_t page_size, From be847c78c397169f423b96696daab0ec4a982f66 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Wed, 17 Jul 2024 12:35:02 -0400 Subject: [PATCH 07/73] Allow moving from WTX_ACTIVE to WTX_ACTIVE This happens whenever we write part of a transaction to the WAL after the first frame header. Signed-off-by: Cole Miller --- src/vfs2.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/vfs2.c b/src/vfs2.c index 177ba70e4..0ec009093 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -89,6 +89,7 @@ static const struct sm_conf wtx_states[SM_STATES_MAX] = { [WTX_ACTIVE] = { .name = "active", .allowed = BITS(WTX_BASE) + |BITS(WTX_ACTIVE) |BITS(WTX_HIDDEN) |BITS(WTX_CLOSED), }, From 19f66d26b0ce94e14491c867e06e48d442b275c1 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Wed, 17 Jul 2024 12:50:28 -0400 Subject: [PATCH 08/73] Fix rollback test Signed-off-by: Cole Miller --- src/vfs2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/vfs2.c b/src/vfs2.c index 0ec009093..1e1280d9b 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -852,8 +852,8 @@ static int vfs2_shm_lock(sqlite3_file *file, int ofst, int n, int flags) /* Unlocking the write lock: roll back any uncommitted * transaction. */ assert(n == 1); - /* TODO make sure this is correct */ - if (e->pending_txn_last_frame_commit == 0) { + if (sm_state(&e->wtx_sm) > WTX_BASE && + e->pending_txn_last_frame_commit == 0) { free_pending_txn(e); sm_move(&e->wtx_sm, WTX_BASE); } From 4ded6bbb1ede0a16eaf607119df44a3dd4e42a41 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Wed, 17 Jul 2024 12:30:10 -0400 Subject: [PATCH 09/73] Suppose multiple nodes in vfs2 test Signed-off-by: Cole Miller --- test/unit/test_vfs2.c | 116 ++++++++++++++++++++++-------------------- 1 file changed, 61 insertions(+), 55 deletions(-) diff --git a/test/unit/test_vfs2.c b/test/unit/test_vfs2.c index 1065469f0..6604a608f 100644 --- a/test/unit/test_vfs2.c +++ b/test/unit/test_vfs2.c @@ -1,5 +1,3 @@ -#pragma GCC diagnostic ignored "-Wformat-truncation" // XXX - #include "../../src/lib/byte.h" #include "../../src/vfs2.h" #include "../lib/fs.h" @@ -14,37 +12,74 @@ #include #include +#define NUM_NODES 3 #define PAGE_SIZE 512 #define PAGE_SIZE_STR "512" SUITE(vfs2); -struct fixture { +struct node { sqlite3_vfs *vfs; + char *vfs_name; char *dir; }; +struct fixture { + struct node nodes[NUM_NODES]; +}; + static void *set_up(const MunitParameter params[], void *user_data) { (void)params; (void)user_data; struct fixture *f = munit_malloc(sizeof(*f)); - f->dir = test_dir_setup(); - f->vfs = vfs2_make(sqlite3_vfs_find("unix"), "dqlite-vfs2"); - munit_assert_ptr_not_null(f->vfs); - sqlite3_vfs_register(f->vfs, 1 /* make default */); + struct node *node; + for (unsigned i = 0; i < NUM_NODES; i++) { + node = &f->nodes[i]; + node->dir = test_dir_setup(); + node->vfs_name = sqlite3_mprintf("vfs2-%u", i); + munit_assert_ptr_not_null(node->vfs_name); + node->vfs = vfs2_make(sqlite3_vfs_find("unix"), node->vfs_name); + munit_assert_ptr_not_null(node->vfs); + sqlite3_vfs_register(node->vfs, 0); + } return f; } static void tear_down(void *data) { struct fixture *f = data; - sqlite3_vfs_unregister(f->vfs); - vfs2_destroy(f->vfs); - test_dir_tear_down(f->dir); + const struct node *node; + for (unsigned i = 0; i < NUM_NODES; i++) { + node = &f->nodes[i]; + sqlite3_vfs_unregister(node->vfs); + vfs2_destroy(node->vfs); + sqlite3_free(node->vfs_name); + test_dir_tear_down(node->dir); + } free(f); } +static sqlite3 *open_test_db(const struct node *node) +{ + char buf[PATH_MAX]; + snprintf(buf, sizeof(buf), "%s/test.db", node->dir); + sqlite3 *db; + int rv = sqlite3_open_v2(buf, &db, + SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE, + node->vfs_name); + munit_assert_int(rv, ==, SQLITE_OK); + munit_assert_ptr_not_null(db); + rv = sqlite3_exec(db, + "PRAGMA page_size=" PAGE_SIZE_STR + ";" + "PRAGMA journal_mode=WAL;" + "PRAGMA wal_autocheckpoint=0", + NULL, NULL, NULL); + munit_assert_int(rv, ==, SQLITE_OK); + return db; +} + static void prepare_wals(const char *dbname, const unsigned char *wal1, size_t wal1_len, @@ -62,7 +97,7 @@ static void prepare_wals(const char *dbname, rv = ftruncate(fd1, 0); munit_assert_int(rv, ==, 0); n = write(fd1, wal1, wal1_len); - munit_assert_llong(n, ==, wal1_len); + munit_assert_llong(n, ==, (ssize_t)wal1_len); close(fd1); } if (wal2 != NULL) { @@ -73,7 +108,7 @@ static void prepare_wals(const char *dbname, rv = ftruncate(fd2, 0); munit_assert_int(rv, ==, 0); n = write(fd2, wal2, wal2_len); - munit_assert_llong(n, ==, wal2_len); + munit_assert_llong(n, ==, (ssize_t)wal2_len); close(fd2); } } @@ -98,28 +133,12 @@ static void check_wals(const char *dbname, off_t wal1_len, off_t wal2_len) TEST(vfs2, basic, set_up, tear_down, 0, NULL) { struct fixture *f = data; + struct node *node = &f->nodes[0]; int rv; - char buf[PATH_MAX]; - - snprintf(buf, PATH_MAX, "%s/%s", f->dir, "test.db"); - sqlite3 *db; - rv = sqlite3_open(buf, &db); - munit_assert_int(rv, ==, SQLITE_OK); - rv = sqlite3_exec(db, - "PRAGMA page_size=" PAGE_SIZE_STR - ";" - "PRAGMA journal_mode=WAL;" - "PRAGMA wal_autocheckpoint=0", - NULL, NULL, NULL); - munit_assert_int(rv, ==, SQLITE_OK); - - char *args[] = { NULL, "page_size", NULL }; - rv = sqlite3_file_control(db, "main", SQLITE_FCNTL_PRAGMA, args); + sqlite3 *db = open_test_db(node); sqlite3_file *fp; sqlite3_file_control(db, "main", SQLITE_FCNTL_FILE_POINTER, &fp); - rv = vfs2_commit_barrier(fp); - munit_assert_int(rv, ==, 0); rv = sqlite3_exec(db, "CREATE TABLE foo (bar INTEGER)", NULL, NULL, NULL); munit_assert_int(rv, ==, SQLITE_OK); @@ -260,32 +279,20 @@ static void make_wal_hdr(uint8_t *buf, TEST(vfs2, startup_one_nonempty, set_up, tear_down, 0, NULL) { struct fixture *f = data; + struct node *node = &f->nodes[0]; char buf[PATH_MAX]; + int rv; - snprintf(buf, PATH_MAX, "%s/%s", f->dir, "test.db"); + snprintf(buf, PATH_MAX, "%s/%s", node->dir, "test.db"); check_wals(buf, 0, 0); uint8_t wal2_hdronly[WAL_SIZE_FROM_FRAMES(0)] = { 0 }; make_wal_hdr(wal2_hdronly, 0, 17, 103); prepare_wals(buf, NULL, 0, wal2_hdronly, sizeof(wal2_hdronly)); - sqlite3 *db; - tracef("opening..."); - int rv = sqlite3_open(buf, &db); - munit_assert_int(rv, ==, SQLITE_OK); - tracef("setup..."); - rv = sqlite3_exec(db, - "PRAGMA page_size=" PAGE_SIZE_STR - ";" - "PRAGMA journal_mode=WAL;" - "PRAGMA wal_autocheckpoint=0", - NULL, NULL, NULL); - munit_assert_int(rv, ==, SQLITE_OK); + sqlite3 *db = open_test_db(node); sqlite3_file *fp; sqlite3_file_control(db, "main", SQLITE_FCNTL_FILE_POINTER, &fp); - tracef("barrier..."); - rv = vfs2_commit_barrier(fp); - munit_assert_int(rv, ==, 0); tracef("create table..."); rv = sqlite3_exec(db, "CREATE TABLE foo (n INTEGER)", NULL, NULL, NULL); munit_assert_int(rv, ==, SQLITE_OK); @@ -301,10 +308,11 @@ TEST(vfs2, startup_one_nonempty, set_up, tear_down, 0, NULL) TEST(vfs2, startup_both_nonempty, set_up, tear_down, 0, NULL) { struct fixture *f = data; + struct node *node = &f->nodes[0]; char buf[PATH_MAX]; + int rv; - snprintf(buf, PATH_MAX, "%s/%s", f->dir, "test.db"); - + snprintf(buf, PATH_MAX, "%s/%s", node->dir, "test.db"); check_wals(buf, 0, 0); uint8_t wal1_hdronly[WAL_SIZE_FROM_FRAMES(0)] = { 0 }; @@ -313,9 +321,7 @@ TEST(vfs2, startup_both_nonempty, set_up, tear_down, 0, NULL) make_wal_hdr(wal2_hdronly, 0, 17, 103); prepare_wals(buf, wal1_hdronly, sizeof(wal1_hdronly), wal2_hdronly, sizeof(wal2_hdronly)); - sqlite3 *db; - int rv = sqlite3_open(buf, &db); - munit_assert_int(rv, ==, SQLITE_OK); + sqlite3 *db = open_test_db(node); rv = sqlite3_exec(db, "PRAGMA page_size=" PAGE_SIZE_STR ";" @@ -336,13 +342,13 @@ TEST(vfs2, startup_both_nonempty, set_up, tear_down, 0, NULL) TEST(vfs2, rollback, set_up, tear_down, 0, NULL) { struct fixture *f = data; + struct node *node = &f->nodes[0]; char buf[PATH_MAX]; + int rv; - snprintf(buf, PATH_MAX, "%s/%s", f->dir, "test.db"); + snprintf(buf, PATH_MAX, "%s/%s", node->dir, "test.db"); - sqlite3 *db; - int rv = sqlite3_open(buf, &db); - munit_assert_int(rv, ==, SQLITE_OK); + sqlite3 *db = open_test_db(node); rv = sqlite3_exec(db, "PRAGMA journal_mode=WAL;" From 0e3054df2d9d1e5ae40306ec2eafd6665dcbf52d Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Wed, 17 Jul 2024 13:41:41 -0400 Subject: [PATCH 10/73] Add comments to vfs2 tests Signed-off-by: Cole Miller --- test/unit/test_vfs2.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/test/unit/test_vfs2.c b/test/unit/test_vfs2.c index 6604a608f..23b26a022 100644 --- a/test/unit/test_vfs2.c +++ b/test/unit/test_vfs2.c @@ -60,6 +60,9 @@ static void tear_down(void *data) free(f); } +/** + * Open a connection to a test database for this node. + */ static sqlite3 *open_test_db(const struct node *node) { char buf[PATH_MAX]; @@ -80,6 +83,9 @@ static sqlite3 *open_test_db(const struct node *node) return db; } +/** + * Write two WALs to disk with the given contents. + */ static void prepare_wals(const char *dbname, const unsigned char *wal1, size_t wal1_len, @@ -113,6 +119,9 @@ static void prepare_wals(const char *dbname, } } +/** + * Assert the lengths of WAL1 and WAL2 on disk. + */ static void check_wals(const char *dbname, off_t wal1_len, off_t wal2_len) { char buf[PATH_MAX]; @@ -130,6 +139,9 @@ static void check_wals(const char *dbname, off_t wal1_len, off_t wal2_len) (rv < 0 && errno == ENOENT && wal2_len == 0)); } +/** + * Single-node test with several transactions and a checkpoint. + */ TEST(vfs2, basic, set_up, tear_down, 0, NULL) { struct fixture *f = data; @@ -233,6 +245,10 @@ TEST(vfs2, basic, set_up, tear_down, 0, NULL) #define WAL_SIZE_FROM_FRAMES(n) (32 + (24 + PAGE_SIZE) * (n)) +/** + * Create a WAL header with the sequence number and salts set to the + * given values. + */ static void make_wal_hdr(uint8_t *buf, uint32_t ckpoint_seqno, uint32_t salt1, @@ -276,6 +292,9 @@ static void make_wal_hdr(uint8_t *buf, p += 4; } +/** + * When only one WAL is nonempty at startup, that WAL becomes WAL-cur. + */ TEST(vfs2, startup_one_nonempty, set_up, tear_down, 0, NULL) { struct fixture *f = data; @@ -305,6 +324,10 @@ TEST(vfs2, startup_one_nonempty, set_up, tear_down, 0, NULL) return MUNIT_OK; } +/** + * When both WALs are nonempty at startup, the one with the higher salt1 + * value becomes WAL-cur. + */ TEST(vfs2, startup_both_nonempty, set_up, tear_down, 0, NULL) { struct fixture *f = data; @@ -339,6 +362,9 @@ TEST(vfs2, startup_both_nonempty, set_up, tear_down, 0, NULL) return MUNIT_OK; } +/** + * Single-node test of rolling back a transaction. + */ TEST(vfs2, rollback, set_up, tear_down, 0, NULL) { struct fixture *f = data; From 993a673374fdc4fe818e522e6326071355949c67 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Wed, 17 Jul 2024 14:01:49 -0400 Subject: [PATCH 11/73] Use sqlite3_randomness to fill salts Signed-off-by: Cole Miller --- src/vfs2.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/vfs2.c b/src/vfs2.c index 1e1280d9b..be8f23540 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -1649,6 +1649,12 @@ static int write_one_frame(struct entry *e, static struct wal_hdr next_wal_hdr(const struct entry *e) { + /* salt2 is randomized every time we generate a new WAL header. + * We don't use the xRandomness method of the base VFS to do this, + * because it always translates to a syscall (getrandom), and + * SQLite intends that this should only be used for seeding the + * internal PRNG. Instead, we call sqlite3_randomness, which gives + * us access to this PRNG, seeded from the default (unix) VFS. */ struct wal_hdr ret; struct wal_hdr old = e->wal_cur_hdr; BytePutBe32(native_magic(), ret.magic); @@ -1660,12 +1666,10 @@ static struct wal_hdr next_wal_hdr(const struct entry *e) if (ckpoint_seqno == 0) { salt1 = get_salt1(old.salts) + 1; } else { - e->common->orig->xRandomness(e->common->orig, sizeof(salt1), - (void *)&salt1); + sqlite3_randomness(sizeof(salt1), (void *)&salt1); } BytePutBe32(salt1, ret.salts.salt1); - e->common->orig->xRandomness(e->common->orig, sizeof(ret.salts.salt2), - (void *)&ret.salts.salt2); + sqlite3_randomness(sizeof(ret.salts.salt2), (void *)&ret.salts.salt2); return ret; } From bfb0815915a92e54ed9c79097164cfa28bf8358f Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Wed, 17 Jul 2024 14:08:13 -0400 Subject: [PATCH 12/73] Calculate checksums for the WAL header Signed-off-by: Cole Miller --- src/vfs2.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/vfs2.c b/src/vfs2.c index be8f23540..5712e64a6 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -1670,6 +1670,11 @@ static struct wal_hdr next_wal_hdr(const struct entry *e) } BytePutBe32(salt1, ret.salts.salt1); sqlite3_randomness(sizeof(ret.salts.salt2), (void *)&ret.salts.salt2); + struct cksums sums = {}; + update_cksums(native_magic(), (const uint8_t *)&ret, + offsetof(struct wal_hdr, cksum1), &sums); + BytePutBe32(sums.cksum1, ret.cksum1); + BytePutBe32(sums.cksum2, ret.cksum2); return ret; } From 2d80f989495ab9ec13985cdc845798f2739a6685 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Wed, 17 Jul 2024 14:09:26 -0400 Subject: [PATCH 13/73] Don't move to POLLED state unless a transaction was found We don't move to ACTIVE or HIDDEN if no frames were written, so we should stay in BASE here as well. Signed-off-by: Cole Miller --- src/vfs2.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/vfs2.c b/src/vfs2.c index 5712e64a6..9c8910cd2 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -1505,7 +1505,9 @@ int vfs2_poll(sqlite3_file *file, sl->len = len; } - sm_move(&xfile->entry->wtx_sm, WTX_POLLED); + if (len > 0) { + sm_move(&xfile->entry->wtx_sm, WTX_POLLED); + } return 0; } From 2f42cfef2835d3e42c7a55e1cd5ac43876256f0a Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Wed, 17 Jul 2024 14:33:07 -0400 Subject: [PATCH 14/73] Replace vfs2_wal_frame with dqlite_vfs_frame vfs2_wal_frame carries the commit marker explicitly, so it doesn't match COMMAND_FRAMES, which only carries the page numbers and page data. dqlite_vfs_frame is defined with this in mind. To get the commit markers on the follower side, we start with the size of the main file in pages and take the maximum with each page number up to the commit frame in the WAL. This matches what the in-memory VFS does. Signed-off-by: Cole Miller --- src/vfs2.c | 97 ++++++++++++++++++++++++++----------------- src/vfs2.h | 17 ++++---- test/unit/test_vfs2.c | 6 +-- 3 files changed, 70 insertions(+), 50 deletions(-) diff --git a/src/vfs2.c b/src/vfs2.c index 9c8910cd2..10a8cce6e 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -38,6 +38,9 @@ #define READ_MARK_UNUSED 0xffffffff +#define DB_HEADER_SIZE 100 +#define DB_HEADER_NPAGES_OFFSET 28 + static const uint32_t invalid_magic = 0x17171717; enum { @@ -266,7 +269,7 @@ struct entry { /* For ACTIVE, HIDDEN: the pending txn. start and len * are in units of frames. */ - struct vfs2_wal_frame *pending_txn_frames; + dqlite_vfs_frame *pending_txn_frames; uint32_t pending_txn_start; uint32_t pending_txn_len; uint32_t pending_txn_last_frame_commit; @@ -303,7 +306,7 @@ static void free_pending_txn(struct entry *e) { if (e->pending_txn_frames != NULL) { for (uint32_t i = 0; i < e->pending_txn_len; i++) { - sqlite3_free(e->pending_txn_frames[i].page); + sqlite3_free(e->pending_txn_frames[i].data); } sqlite3_free(e->pending_txn_frames); } @@ -499,7 +502,7 @@ static int vfs2_wal_write_frame_hdr(struct entry *e, const struct wal_frame_hdr *fhdr, uint32_t x) { - struct vfs2_wal_frame *frames = e->pending_txn_frames; + dqlite_vfs_frame *frames = e->pending_txn_frames; if (no_pending_txn(e)) { assert(x == e->wal_cursor); e->pending_txn_start = x; @@ -522,21 +525,19 @@ static int vfs2_wal_write_frame_hdr(struct entry *e, if (e->pending_txn_frames == NULL) { return SQLITE_NOMEM; } - struct vfs2_wal_frame *frame = &e->pending_txn_frames[n]; + dqlite_vfs_frame *frame = &e->pending_txn_frames[n]; uint32_t commit = ByteGetBe32(fhdr->commit); frame->page_number = ByteGetBe32(fhdr->page_number); - frame->commit = commit; - frame->page = NULL; + frame->data = NULL; e->pending_txn_last_frame_commit = commit; e->pending_txn_len++; } else { /* Overwriting a previously-written frame in the current * transaction. */ - struct vfs2_wal_frame *frame = &e->pending_txn_frames[x]; + dqlite_vfs_frame *frame = &e->pending_txn_frames[x]; frame->page_number = ByteGetBe32(fhdr->page_number); - frame->commit = ByteGetBe32(fhdr->commit); - sqlite3_free(frame->page); - frame->page = NULL; + sqlite3_free(frame->data); + frame->data = NULL; } sm_move(&e->wtx_sm, WTX_ACTIVE); return SQLITE_OK; @@ -560,13 +561,13 @@ static int vfs2_wal_post_write(struct entry *e, x /= frame_size; x -= e->pending_txn_start; assert(0 <= x && x < e->pending_txn_len); - struct vfs2_wal_frame *frame = &e->pending_txn_frames[x]; - assert(frame->page == NULL); - frame->page = sqlite3_malloc(amt); - if (frame->page == NULL) { + dqlite_vfs_frame *frame = &e->pending_txn_frames[x]; + assert(frame->data == NULL); + frame->data = sqlite3_malloc(amt); + if (frame->data == NULL) { return SQLITE_NOMEM; } - memcpy(frame->page, buf, (size_t)amt); + memcpy(frame->data, buf, (size_t)amt); sm_move(&e->wtx_sm, WTX_ACTIVE); return SQLITE_OK; } else { @@ -1466,7 +1467,7 @@ int vfs2_apply(sqlite3_file *file, struct vfs2_wal_slice stop) } int vfs2_poll(sqlite3_file *file, - struct vfs2_wal_frame **frames, + dqlite_vfs_frame **frames, unsigned *n, struct vfs2_wal_slice *sl) { @@ -1492,7 +1493,7 @@ int vfs2_poll(sqlite3_file *file, *frames = e->pending_txn_frames; } else { for (uint32_t i = 0; i < e->pending_txn_len; i++) { - sqlite3_free(e->pending_txn_frames[i].page); + sqlite3_free(e->pending_txn_frames[i].data); } sqlite3_free(e->pending_txn_frames); } @@ -1555,7 +1556,7 @@ int vfs2_read_wal(sqlite3_file *file, int page_size = (int)e->page_size; for (size_t i = 0; i < txns_len; i++) { - struct vfs2_wal_frame *f = + dqlite_vfs_frame *f = sqlite3_malloc64(txns[i].meta.len * sizeof(*f)); if (f == NULL) { goto oom; @@ -1566,7 +1567,7 @@ int vfs2_read_wal(sqlite3_file *file, if (p == NULL) { goto oom; } - txns[i].frames[j].page = p; + txns[i].frames[j].data = p; } } @@ -1601,14 +1602,13 @@ int vfs2_read_wal(sqlite3_file *file, return 1; } off += (sqlite3_int64)sizeof(fhdr); - rv = wal->pMethods->xRead(wal, txns[i].frames[j].page, + rv = wal->pMethods->xRead(wal, txns[i].frames[j].data, page_size, off); if (rv != SQLITE_OK) { return 1; } txns[i].frames[j].page_number = ByteGetBe32(fhdr.page_number); - txns[i].frames[j].commit = ByteGetBe32(fhdr.commit); } if (from_wal_cur) { vfs2_pseudo_read_end(file, read_lock); @@ -1620,7 +1620,7 @@ int vfs2_read_wal(sqlite3_file *file, oom: for (uint32_t i = 0; i < txns_len; i++) { for (uint32_t j = 0; j < txns[i].meta.len; j++) { - sqlite3_free(txns[i].frames[j].page); + sqlite3_free(txns[i].frames[j].data); } sqlite3_free(txns[i].frames); txns[i].frames = NULL; @@ -1682,15 +1682,16 @@ static struct wal_hdr next_wal_hdr(const struct entry *e) static struct wal_frame_hdr txn_frame_hdr(struct entry *e, struct cksums sums, - struct vfs2_wal_frame frame) + const dqlite_vfs_frame *frame, + uint32_t commit) { struct wal_frame_hdr fhdr; - BytePutBe32(frame.page_number, fhdr.page_number); - BytePutBe32(frame.commit, fhdr.commit); + BytePutBe32((uint32_t)frame->page_number, fhdr.page_number); + BytePutBe32(commit, fhdr.commit); update_cksums(ByteGetBe32(e->wal_cur_hdr.magic), (const void *)(&fhdr), 8, &sums); - update_cksums(ByteGetBe32(e->wal_cur_hdr.magic), frame.page, + update_cksums(ByteGetBe32(e->wal_cur_hdr.magic), frame->data, e->page_size, &sums); fhdr.salts = e->wal_cur_hdr.salts; BytePutBe32(sums.cksum1, fhdr.cksum1); @@ -1700,16 +1701,12 @@ static struct wal_frame_hdr txn_frame_hdr(struct entry *e, int vfs2_add_uncommitted(sqlite3_file *file, uint32_t page_size, - const struct vfs2_wal_frame *frames, + const dqlite_vfs_frame *frames, unsigned len, struct vfs2_wal_slice *out) { PRE(len > 0); PRE(is_valid_page_size(page_size)); - for (unsigned i = 0; i < len - 1; i++) { - PRE(frames[i].commit == 0); - } - PRE(frames[len - 1].commit > 0); struct file *xfile = (struct file *)file; PRE(xfile->flags & SQLITE_OPEN_MAIN_DB); struct entry *e = xfile->entry; @@ -1743,7 +1740,12 @@ int vfs2_add_uncommitted(sqlite3_file *file, uint32_t start = e->wal_cursor; struct cksums sums; + uint32_t db_size; if (start > 0) { + /* There's already a transaction in the WAL. In this case + * we initialize the rolling checksum and database size + * calculation from the header of the last (commit) frame in + * this transaction. */ /* TODO cache this in the entry? */ struct wal_frame_hdr prev_fhdr; sqlite3_int64 off = @@ -1755,27 +1757,46 @@ int vfs2_add_uncommitted(sqlite3_file *file, } sums.cksum1 = ByteGetBe32(prev_fhdr.cksum1); sums.cksum2 = ByteGetBe32(prev_fhdr.cksum2); + db_size = ByteGetBe32(prev_fhdr.commit); } else { + /* This is the first transaction in this WAL. In this case + * we initialize the rolling checksum from the checksum in + * the WAL header, and read the actual database file to + * initialize the running database size. */ sums.cksum1 = ByteGetBe32(e->wal_cur_hdr.cksum1); sums.cksum2 = ByteGetBe32(e->wal_cur_hdr.cksum2); - } + /* The database size in pages is kept in a field of the database + * header. */ + uint8_t b[DB_HEADER_SIZE]; + rv = + xfile->orig->pMethods->xRead(xfile->orig, &b, sizeof(b), 0); + /* TODO(cole) this can't fail provided that the main file + * has been created; ensure that this is the case even if + * we haven't run a checkpoint yet. */ + assert(rv == SQLITE_OK); + db_size = ByteGetBe32(b + DB_HEADER_NPAGES_OFFSET); + } + POST(db_size > 0); struct vfs2_shm_region0 *r0 = e->shm_regions[0]; PRE(e->wal_cursor < REGION0_PGNOS_LEN); - r0->pgnos[e->wal_cursor] = frames[0].page_number; - struct wal_frame_hdr fhdr = txn_frame_hdr(e, sums, frames[0]); - rv = write_one_frame(e, fhdr, frames[0].page); + r0->pgnos[e->wal_cursor] = (uint32_t)frames[0].page_number; + + uint32_t commit = len == 1 ? db_size : 0; + struct wal_frame_hdr fhdr = txn_frame_hdr(e, sums, &frames[0], commit); + rv = write_one_frame(e, fhdr, frames[0].data); if (rv != SQLITE_OK) { return 1; } for (unsigned i = 1; i < len; i++) { PRE(e->wal_cursor < REGION0_PGNOS_LEN); - r0->pgnos[e->wal_cursor] = frames[i].page_number; + r0->pgnos[e->wal_cursor] = (uint32_t)frames[i].page_number; sums.cksum1 = ByteGetBe32(fhdr.cksum1); sums.cksum2 = ByteGetBe32(fhdr.cksum2); - fhdr = txn_frame_hdr(e, sums, frames[i]); - rv = write_one_frame(e, fhdr, frames[i].page); + commit = i == len - 1 ? db_size : 0; + fhdr = txn_frame_hdr(e, sums, &frames[i], commit); + rv = write_one_frame(e, fhdr, frames[i].data); if (rv != SQLITE_OK) { return 1; } diff --git a/src/vfs2.h b/src/vfs2.h index 27f0fa841..cbb29512d 100644 --- a/src/vfs2.h +++ b/src/vfs2.h @@ -1,6 +1,8 @@ #ifndef DQLITE_VFS2_H #define DQLITE_VFS2_H +#include "../include/dqlite.h" /* dqlite_vfs_frame */ + #include #include @@ -33,12 +35,6 @@ struct vfs2_wal_slice { uint32_t len; }; -struct vfs2_wal_frame { - uint32_t page_number; - uint32_t commit; - void *page; -}; - /** * Retrieve frames that were appended to the WAL by the last write transaction, * and reacquire the write lock. @@ -48,7 +44,7 @@ struct vfs2_wal_frame { * Polling the same transaction more than once is an error. */ int vfs2_poll(sqlite3_file *file, - struct vfs2_wal_frame **frames, + dqlite_vfs_frame **frames, unsigned *n, struct vfs2_wal_slice *sl); @@ -56,9 +52,12 @@ int vfs2_unhide(sqlite3_file *file); int vfs2_apply(sqlite3_file *file, struct vfs2_wal_slice stop); +/** + * Add frames to the WAL after receiving them in an AppendEntrie message. + */ int vfs2_add_uncommitted(sqlite3_file *file, uint32_t page_size, - const struct vfs2_wal_frame *frames, + const dqlite_vfs_frame *frames, unsigned n, struct vfs2_wal_slice *out); @@ -66,7 +65,7 @@ int vfs2_unadd(sqlite3_file *file, struct vfs2_wal_slice stop); struct vfs2_wal_txn { struct vfs2_wal_slice meta; - struct vfs2_wal_frame *frames; + dqlite_vfs_frame *frames; }; /** diff --git a/test/unit/test_vfs2.c b/test/unit/test_vfs2.c index 23b26a022..003d6f05f 100644 --- a/test/unit/test_vfs2.c +++ b/test/unit/test_vfs2.c @@ -212,14 +212,14 @@ TEST(vfs2, basic, set_up, tear_down, 0, NULL) rv = sqlite3_step(stmt); munit_assert_int(rv, ==, SQLITE_DONE); - struct vfs2_wal_frame *frames; + dqlite_vfs_frame *frames; unsigned n; rv = vfs2_poll(fp, &frames, &n, &sl); munit_assert_int(rv, ==, 0); munit_assert_uint(n, ==, 1); munit_assert_not_null(frames); - munit_assert_not_null(frames[0].page); - sqlite3_free(frames[0].page); + munit_assert_not_null(frames[0].data); + sqlite3_free(frames[0].data); sqlite3_free(frames); rv = vfs2_unhide(fp); From 0a7cc1266154f4deaf0d5fc2b09b4f77dc48e701 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Wed, 17 Jul 2024 15:31:09 -0400 Subject: [PATCH 15/73] Initialize page size in vfs2_add_uncommitted if not earlier A follower that first touches a database when it receives a COMMAND_FRAMES entry needs to initialize the page size at that point (absent a guarantee that we run `PRAGMA page_size` immediately after sqlite3_open). Signed-off-by: Cole Miller --- src/vfs2.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/vfs2.c b/src/vfs2.c index 10a8cce6e..f3d730387 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -1710,6 +1710,9 @@ int vfs2_add_uncommitted(sqlite3_file *file, struct file *xfile = (struct file *)file; PRE(xfile->flags & SQLITE_OPEN_MAIN_DB); struct entry *e = xfile->entry; + if (e->page_size == 0) { + e->page_size = page_size; + } PRE(page_size == e->page_size); int rv; From b34e6f3fa919abd759faed3d58d9e6288e04bdd4 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Wed, 17 Jul 2024 15:05:41 -0400 Subject: [PATCH 16/73] Remove obsolete comments Signed-off-by: Cole Miller --- src/vfs2.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/vfs2.h b/src/vfs2.h index cbb29512d..9867bc36c 100644 --- a/src/vfs2.h +++ b/src/vfs2.h @@ -103,7 +103,4 @@ int vfs2_pseudo_read_end(sqlite3_file *file, unsigned i); */ void vfs2_destroy(sqlite3_vfs *vfs); -// TODO access read marks and shm_locks -// TODO access information about checkpoints - #endif From 0820ad92efab41d9414f0155ed0d3d676ebca05b Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Wed, 17 Jul 2024 16:04:17 -0400 Subject: [PATCH 17/73] Remove redundant out argument from vfs2_poll The length of the polled transaction is returned in the WAL slice. Signed-off-by: Cole Miller --- src/vfs2.c | 4 +--- src/vfs2.h | 1 - test/unit/test_vfs2.c | 11 +++++------ 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/src/vfs2.c b/src/vfs2.c index f3d730387..13466f88d 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -1468,7 +1468,6 @@ int vfs2_apply(sqlite3_file *file, struct vfs2_wal_slice stop) int vfs2_poll(sqlite3_file *file, dqlite_vfs_frame **frames, - unsigned *n, struct vfs2_wal_slice *sl) { struct file *xfile = (struct file *)file; @@ -1488,8 +1487,7 @@ int vfs2_poll(sqlite3_file *file, /* Note, not resetting pending_txn_{start,len} because they are used by * later states */ - if (n != NULL && frames != NULL) { - *n = len; + if (frames != NULL) { *frames = e->pending_txn_frames; } else { for (uint32_t i = 0; i < e->pending_txn_len; i++) { diff --git a/src/vfs2.h b/src/vfs2.h index 9867bc36c..f427991e5 100644 --- a/src/vfs2.h +++ b/src/vfs2.h @@ -45,7 +45,6 @@ struct vfs2_wal_slice { */ int vfs2_poll(sqlite3_file *file, dqlite_vfs_frame **frames, - unsigned *n, struct vfs2_wal_slice *sl); int vfs2_unhide(sqlite3_file *file); diff --git a/test/unit/test_vfs2.c b/test/unit/test_vfs2.c index 003d6f05f..633db2329 100644 --- a/test/unit/test_vfs2.c +++ b/test/unit/test_vfs2.c @@ -156,7 +156,7 @@ TEST(vfs2, basic, set_up, tear_down, 0, NULL) munit_assert_int(rv, ==, SQLITE_OK); struct vfs2_wal_slice sl; - rv = vfs2_poll(fp, NULL, NULL, &sl); + rv = vfs2_poll(fp, NULL, &sl); munit_assert_int(rv, ==, 0); rv = vfs2_unhide(fp); munit_assert_int(rv, ==, 0); @@ -173,7 +173,7 @@ TEST(vfs2, basic, set_up, tear_down, 0, NULL) rv = sqlite3_exec(db, "INSERT INTO foo (bar) values (22)", NULL, NULL, NULL); munit_assert_int(rv, ==, 0); - rv = vfs2_poll(fp, NULL, NULL, &sl); + rv = vfs2_poll(fp, NULL, &sl); munit_assert_int(rv, ==, 0); munit_assert_uint32(sl.start, ==, 2); munit_assert_uint32(sl.len, ==, 1); @@ -213,10 +213,9 @@ TEST(vfs2, basic, set_up, tear_down, 0, NULL) munit_assert_int(rv, ==, SQLITE_DONE); dqlite_vfs_frame *frames; - unsigned n; - rv = vfs2_poll(fp, &frames, &n, &sl); + rv = vfs2_poll(fp, &frames, &sl); munit_assert_int(rv, ==, 0); - munit_assert_uint(n, ==, 1); + munit_assert_uint(sl.len, ==, 1); munit_assert_not_null(frames); munit_assert_not_null(frames[0].data); sqlite3_free(frames[0].data); @@ -386,7 +385,7 @@ TEST(vfs2, rollback, set_up, tear_down, 0, NULL) sqlite3_file *fp; sqlite3_file_control(db, "main", SQLITE_FCNTL_FILE_POINTER, &fp); struct vfs2_wal_slice sl; - rv = vfs2_poll(fp, NULL, NULL, &sl); + rv = vfs2_poll(fp, NULL, &sl); munit_assert_int(rv, ==, 0); rv = vfs2_unhide(fp); rv = sqlite3_exec(db, "BEGIN", NULL, NULL, NULL); From f4ff5a22d306d3b521ecb5462e56e11b71c23fd5 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Thu, 18 Jul 2024 21:17:49 -0400 Subject: [PATCH 18/73] Make vfs2 tests more concise By abbreviating two common operations. Signed-off-by: Cole Miller --- test/unit/test_vfs2.c | 89 +++++++++++++++++-------------------------- 1 file changed, 35 insertions(+), 54 deletions(-) diff --git a/test/unit/test_vfs2.c b/test/unit/test_vfs2.c index 633db2329..3265a93dd 100644 --- a/test/unit/test_vfs2.c +++ b/test/unit/test_vfs2.c @@ -16,6 +16,8 @@ #define PAGE_SIZE 512 #define PAGE_SIZE_STR "512" +#define OK(rv) munit_assert_int(rv, ==, 0) + SUITE(vfs2); struct node { @@ -94,14 +96,12 @@ static void prepare_wals(const char *dbname, { char buf[PATH_MAX]; ssize_t n; - int rv; if (wal1 != NULL) { snprintf(buf, sizeof(buf), "%s-xwal1", dbname); int fd1 = open(buf, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH); munit_assert_int(fd1, !=, -1); - rv = ftruncate(fd1, 0); - munit_assert_int(rv, ==, 0); + OK(ftruncate(fd1, 0)); n = write(fd1, wal1, wal1_len); munit_assert_llong(n, ==, (ssize_t)wal1_len); close(fd1); @@ -111,8 +111,7 @@ static void prepare_wals(const char *dbname, int fd2 = open(buf, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH); munit_assert_int(fd2, !=, -1); - rv = ftruncate(fd2, 0); - munit_assert_int(rv, ==, 0); + OK(ftruncate(fd2, 0)); n = write(fd2, wal2, wal2_len); munit_assert_llong(n, ==, (ssize_t)wal2_len); close(fd2); @@ -139,6 +138,13 @@ static void check_wals(const char *dbname, off_t wal1_len, off_t wal2_len) (rv < 0 && errno == ENOENT && wal2_len == 0)); } +static sqlite3_file *main_file(sqlite3 *db) +{ + sqlite3_file *fp; + sqlite3_file_control(db, "main", SQLITE_FCNTL_FILE_POINTER, &fp); + return fp; +} + /** * Single-node test with several transactions and a checkpoint. */ @@ -149,40 +155,29 @@ TEST(vfs2, basic, set_up, tear_down, 0, NULL) int rv; sqlite3 *db = open_test_db(node); - sqlite3_file *fp; - sqlite3_file_control(db, "main", SQLITE_FCNTL_FILE_POINTER, &fp); - rv = sqlite3_exec(db, "CREATE TABLE foo (bar INTEGER)", NULL, NULL, - NULL); - munit_assert_int(rv, ==, SQLITE_OK); + sqlite3_file *fp = main_file(db); + OK(sqlite3_exec(db, "CREATE TABLE foo (bar INTEGER)", NULL, NULL, + NULL)); struct vfs2_wal_slice sl; - rv = vfs2_poll(fp, NULL, &sl); - munit_assert_int(rv, ==, 0); - rv = vfs2_unhide(fp); - munit_assert_int(rv, ==, 0); + OK(vfs2_poll(fp, NULL, &sl)); + OK(vfs2_unhide(fp)); munit_assert_uint32(sl.start, ==, 0); munit_assert_uint32(sl.len, ==, 2); - rv = sqlite3_exec(db, "INSERT INTO foo (bar) VALUES (17)", NULL, NULL, - NULL); - munit_assert_int(rv, ==, SQLITE_OK); - tracef("aborting..."); - rv = vfs2_abort(fp); - munit_assert_int(rv, ==, 0); + OK(sqlite3_exec(db, "INSERT INTO foo (bar) VALUES (17)", NULL, NULL, + NULL)); + OK(vfs2_abort(fp)); - rv = sqlite3_exec(db, "INSERT INTO foo (bar) values (22)", NULL, NULL, - NULL); - munit_assert_int(rv, ==, 0); - rv = vfs2_poll(fp, NULL, &sl); - munit_assert_int(rv, ==, 0); + OK(sqlite3_exec(db, "INSERT INTO foo (bar) values (22)", NULL, NULL, + NULL)); + OK(vfs2_poll(fp, NULL, &sl)); munit_assert_uint32(sl.start, ==, 2); munit_assert_uint32(sl.len, ==, 1); - rv = vfs2_unhide(fp); - munit_assert_int(rv, ==, 0); + OK(vfs2_unhide(fp)); sqlite3_stmt *stmt; - rv = sqlite3_prepare_v2(db, "SELECT * FROM foo", -1, &stmt, NULL); - munit_assert_int(rv, ==, SQLITE_OK); + OK(sqlite3_prepare_v2(db, "SELECT * FROM foo", -1, &stmt, NULL)); rv = sqlite3_step(stmt); munit_assert_int(rv, ==, SQLITE_ROW); munit_assert_int(sqlite3_column_count(stmt), ==, 1); @@ -192,18 +187,15 @@ TEST(vfs2, basic, set_up, tear_down, 0, NULL) int nlog; int nckpt; - rv = sqlite3_wal_checkpoint_v2(db, "main", SQLITE_CHECKPOINT_PASSIVE, - &nlog, &nckpt); - munit_assert_int(rv, ==, SQLITE_OK); + OK(sqlite3_wal_checkpoint_v2(db, "main", SQLITE_CHECKPOINT_PASSIVE, + &nlog, &nckpt)); munit_assert_int(nlog, ==, 3); munit_assert_int(nckpt, ==, 3); - rv = sqlite3_exec(db, "INSERT INTO foo (bar) VALUES (101)", NULL, NULL, - NULL); - munit_assert_int(rv, ==, SQLITE_OK); + OK(sqlite3_exec(db, "INSERT INTO foo (bar) VALUES (101)", NULL, NULL, + NULL)); - rv = sqlite3_reset(stmt); - munit_assert_int(rv, ==, SQLITE_OK); + OK(sqlite3_reset(stmt)); rv = sqlite3_step(stmt); munit_assert_int(rv, ==, SQLITE_ROW); munit_assert_int(sqlite3_column_count(stmt), ==, 1); @@ -213,19 +205,16 @@ TEST(vfs2, basic, set_up, tear_down, 0, NULL) munit_assert_int(rv, ==, SQLITE_DONE); dqlite_vfs_frame *frames; - rv = vfs2_poll(fp, &frames, &sl); - munit_assert_int(rv, ==, 0); + OK(vfs2_poll(fp, &frames, &sl)); munit_assert_uint(sl.len, ==, 1); munit_assert_not_null(frames); munit_assert_not_null(frames[0].data); sqlite3_free(frames[0].data); sqlite3_free(frames); - rv = vfs2_unhide(fp); - munit_assert_int(rv, ==, 0); + OK(vfs2_unhide(fp)); - rv = sqlite3_reset(stmt); - munit_assert_int(rv, ==, SQLITE_OK); + OK(sqlite3_reset(stmt)); rv = sqlite3_step(stmt); munit_assert_int(rv, ==, SQLITE_ROW); munit_assert_int(sqlite3_column_count(stmt), ==, 1); @@ -236,8 +225,7 @@ TEST(vfs2, basic, set_up, tear_down, 0, NULL) rv = sqlite3_step(stmt); munit_assert_int(rv, ==, SQLITE_DONE); - rv = sqlite3_finalize(stmt); - munit_assert_int(rv, ==, SQLITE_OK); + OK(sqlite3_finalize(stmt)); sqlite3_close(db); return MUNIT_OK; } @@ -299,7 +287,6 @@ TEST(vfs2, startup_one_nonempty, set_up, tear_down, 0, NULL) struct fixture *f = data; struct node *node = &f->nodes[0]; char buf[PATH_MAX]; - int rv; snprintf(buf, PATH_MAX, "%s/%s", node->dir, "test.db"); @@ -309,14 +296,8 @@ TEST(vfs2, startup_one_nonempty, set_up, tear_down, 0, NULL) make_wal_hdr(wal2_hdronly, 0, 17, 103); prepare_wals(buf, NULL, 0, wal2_hdronly, sizeof(wal2_hdronly)); sqlite3 *db = open_test_db(node); - sqlite3_file *fp; - sqlite3_file_control(db, "main", SQLITE_FCNTL_FILE_POINTER, &fp); - tracef("create table..."); - rv = sqlite3_exec(db, "CREATE TABLE foo (n INTEGER)", NULL, NULL, NULL); - munit_assert_int(rv, ==, SQLITE_OK); - tracef("closing..."); - rv = sqlite3_close(db); - munit_assert_int(rv, ==, SQLITE_OK); + OK(sqlite3_exec(db, "CREATE TABLE foo (n INTEGER)", NULL, NULL, NULL)); + OK(sqlite3_close(db)); check_wals(buf, WAL_SIZE_FROM_FRAMES(2), WAL_SIZE_FROM_FRAMES(0)); From cc0ff2ebe386460bce721200a58b89fffca43fb6 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Thu, 18 Jul 2024 21:18:05 -0400 Subject: [PATCH 19/73] Add a new vfs2 test for transaction replication Signed-off-by: Cole Miller --- test/unit/test_vfs2.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/test/unit/test_vfs2.c b/test/unit/test_vfs2.c index 3265a93dd..90a56237d 100644 --- a/test/unit/test_vfs2.c +++ b/test/unit/test_vfs2.c @@ -385,3 +385,45 @@ TEST(vfs2, rollback, set_up, tear_down, 0, NULL) return MUNIT_OK; } + +/** + * Two-node test covering the full replication cycle. + */ +TEST(vfs2, leader_and_follower, set_up, tear_down, 0, NULL) +{ + struct fixture *f = data; + struct node *leader = &f->nodes[0]; + struct node *follower = &f->nodes[1]; + + /* The leader executes and polls a transaction. */ + sqlite3 *leader_db = open_test_db(leader); + OK(sqlite3_exec(leader_db, "CREATE TABLE foo (n INTEGER)", NULL, NULL, + NULL)); + sqlite3_file *leader_fp = main_file(leader_db); + dqlite_vfs_frame *frames; + struct vfs2_wal_slice leader_sl; + OK(vfs2_poll(leader_fp, &frames, &leader_sl)); + munit_assert_uint(leader_sl.len, ==, 2); + + /* The follower receives the transaction. */ + sqlite3 *follower_db = open_test_db(follower); + sqlite3_file *follower_fp = main_file(follower_db); + struct vfs2_wal_slice follower_sl; + OK(vfs2_add_uncommitted(follower_fp, PAGE_SIZE, frames, leader_sl.len, + &follower_sl)); + sqlite3_free(frames[0].data); + sqlite3_free(frames[1].data); + sqlite3_free(frames); + + /* The leader receives the follower's acknowledgement + * and applies the transaction locally. */ + OK(vfs2_unhide(leader_fp)); + + /* The follower learns the new commit index and applies + * the transaction locally. */ + OK(vfs2_apply(follower_fp, follower_sl)); + + sqlite3_close(follower_db); + sqlite3_close(leader_db); + return MUNIT_OK; +} From bc18c510ab377331511fa99a72b28845b5db0646 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Wed, 17 Jul 2024 17:00:45 -0400 Subject: [PATCH 20/73] Fix WAL header initialization Signed-off-by: Cole Miller --- src/vfs2.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/vfs2.c b/src/vfs2.c index 13466f88d..037a322a5 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -1727,6 +1727,7 @@ int vfs2_add_uncommitted(sqlite3_file *file, * (wal_cursor). */ e->shm_locks[WAL_WRITE_LOCK] = VFS2_EXCLUSIVE; + uint32_t start = e->wal_cursor; struct wal_index_full_hdr *ihdr = get_full_hdr(e); uint32_t mx = ihdr->basic[0].mxFrame; if (mx > 0 && ihdr->nBackfill == mx) { @@ -1736,10 +1737,17 @@ int vfs2_add_uncommitted(sqlite3_file *file, if (rv != SQLITE_OK) { return 1; } + } else if (start == 0) { + sqlite3_file *phys = e->wal_cur; + struct wal_hdr new_whdr = next_wal_hdr(e); + rv = phys->pMethods->xWrite(phys, &new_whdr, sizeof(new_whdr), + 0); + e->wal_cur_hdr = new_whdr; + if (rv != SQLITE_OK) { + return 1; + } } - uint32_t start = e->wal_cursor; - struct cksums sums; uint32_t db_size; if (start > 0) { From afa657ea1ff446c348cda994f90363e5bfcb5d85 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Thu, 18 Jul 2024 20:46:51 -0400 Subject: [PATCH 21/73] Move shm finalization to maybe_close_entry This ties the destruction of the shared memory to that of the owning entry, because they are created together. If shm finalization is tied to a call to xShmUnmap, then it can't happen unless SQLite called xShmMap in the first place, and this is not guaranteed to happen. For example, if we start and stop a follower node without ever running a checkpoint or transaction, SQLite will never get the change to call xShmMap. Signed-off-by: Cole Miller --- src/vfs2.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/src/vfs2.c b/src/vfs2.c index 037a322a5..5d8b61de8 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -411,6 +411,14 @@ static void maybe_close_entry(struct entry *e) free_pending_txn(e); + assert(e->shm_refcount == 0); + for (int i = 0; i < e->shm_regions_len; i++) { + void *region = e->shm_regions[i]; + assert(region != NULL); + sqlite3_free(region); + } + sqlite3_free(e->shm_regions); + pthread_rwlock_wrlock(&e->common->rwlock); queue_remove(&e->link); pthread_rwlock_unlock(&e->common->rwlock); @@ -885,18 +893,6 @@ static int vfs2_shm_unmap(sqlite3_file *file, int delete) struct file *xfile = (struct file *)file; struct entry *e = xfile->entry; e->shm_refcount--; - if (e->shm_refcount == 0) { - for (int i = 0; i < e->shm_regions_len; i++) { - void *region = e->shm_regions[i]; - assert(region != NULL); - sqlite3_free(region); - } - sqlite3_free(e->shm_regions); - - e->shm_regions = NULL; - e->shm_regions_len = 0; - memset(e->shm_locks, 0, sizeof(e->shm_locks)); - } return SQLITE_OK; } From 8e1f5c95713ec90f22c2aaae35d4f288bf9b90d5 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Fri, 19 Jul 2024 15:04:58 -0400 Subject: [PATCH 22/73] Use parentheses in OK test macro Signed-off-by: Cole Miller --- test/unit/test_vfs2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/unit/test_vfs2.c b/test/unit/test_vfs2.c index 90a56237d..25a66b80e 100644 --- a/test/unit/test_vfs2.c +++ b/test/unit/test_vfs2.c @@ -16,7 +16,7 @@ #define PAGE_SIZE 512 #define PAGE_SIZE_STR "512" -#define OK(rv) munit_assert_int(rv, ==, 0) +#define OK(rv) munit_assert_int((rv), ==, 0) SUITE(vfs2); From 67a49a5b87e3da11c337c424791ab316d77fc376 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Fri, 19 Jul 2024 16:01:25 -0400 Subject: [PATCH 23/73] Add more extensive comments to vfs2.h Signed-off-by: Cole Miller --- src/vfs2.h | 99 ++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 89 insertions(+), 10 deletions(-) diff --git a/src/vfs2.h b/src/vfs2.h index f427991e5..45fe87909 100644 --- a/src/vfs2.h +++ b/src/vfs2.h @@ -12,15 +12,20 @@ * Create a new VFS object that wraps the given VFS object. * * The returned VFS is allocated on the heap and lives until vfs2_destroy is - * called. Its methods are thread-safe if those of the wrapped VFS are, but - * the methods of the sqlite3_file objects it creates are not thread-safe. - * Therefore, a database connection that's created using this VFS should only - * be used on the thread that opened it. The functions below that operate on - * sqlite3_file objects created by this VFS should also only be used on that - * thread. + * called. The provided name is not copied or freed, and must outlive the VFS. + * + * The methods of the resulting sqlite3_vfs object are thread-safe, but the + * xOpen method creates sqlite3_file objects whose methods are not thread-safe. + * Therefore, when a database connection is opened using the returned VFS, it + * must not subsequently be used on other threads without additional + * synchronization. This also goes for the functions below that operate directly + * on sqlite3_file. */ sqlite3_vfs *vfs2_make(sqlite3_vfs *orig, const char *name); +/** + * A pair of salt values from the header of a WAL file. + */ struct vfs2_salts { uint8_t salt1[4]; uint8_t salt2[4]; @@ -28,6 +33,10 @@ struct vfs2_salts { /** * Identifying information about a write transaction. + * + * The salts identify a WAL file. The `start` and `len` fields have units of + * frames and identify a range of frames within that WAL file that represent a + * transaction. */ struct vfs2_wal_slice { struct vfs2_salts salts; @@ -36,10 +45,17 @@ struct vfs2_wal_slice { }; /** - * Retrieve frames that were appended to the WAL by the last write transaction, - * and reacquire the write lock. + * Retrieve a description of a write transaction that was just written to the + * WAL. * - * Call this on the database main file object (SQLITE_FCNTL_FILE_POINTER). + * The first argument is the main file handle for the database. This function + * also acquires the WAL write lock, preventing another write transaction from + * overwriting the first one until vfs2_unhide is called. + * + * The `frames` out argument is populated with an array containing + * the frames of the transaction, and the `sl` out argument is populated with a + * WAL slice describing the transaction. The length of the frames array is + * `sl.len`. * * Polling the same transaction more than once is an error. */ @@ -47,12 +63,45 @@ int vfs2_poll(sqlite3_file *file, dqlite_vfs_frame **frames, struct vfs2_wal_slice *sl); +/** + * Mark a write transaction that was previously polled as committed, making it + * visible to future transactions. + * + * This function also releases the WAL write lock, allowing another write + * transaction to execute. It should be called on the main file handle + * (SQLITE_FCNTL_FILE_POINTER). + * + * It's an error to call this function if no write transaction is currently + * pending due to vfs2_poll. + */ int vfs2_unhide(sqlite3_file *file); +/** + * Make some transactions in the WAL visible to readers. + * + * The first argument is the main file for the database + * (SQLITE_FCNTL_FILE_POINTER). All transactions up to and including the one + * described by the second argument will be marked as committed and made + * visible. The WAL write lock is also released if there are no uncommitted + * transactions left in the WAL. + * + * The affected transactions must have been added to the WAL by + * vfs2_add_committed. + */ int vfs2_apply(sqlite3_file *file, struct vfs2_wal_slice stop); /** - * Add frames to the WAL after receiving them in an AppendEntrie message. + * Add the frames of a write transaction directly to the end of the WAL. + * + * The first argument is the main file handle for the database. On success, the + * WAL write lock is acquired if it was not held already. The added frames are + * initially invisible to readers, and must be made visible by calling + * vfs2_commit or removed from the WAL by calling vfs2_unadd. + * + * A WAL slice describing the new transaction is written to the last argument. + * + * The `page_size` for the new frames must match the page size already set for + * this database. */ int vfs2_add_uncommitted(sqlite3_file *file, uint32_t page_size, @@ -60,8 +109,23 @@ int vfs2_add_uncommitted(sqlite3_file *file, unsigned n, struct vfs2_wal_slice *out); +/** + * Remove some transactions from the WAL. + * + * The first argument is the main file handle for the database + * (SQLITE_FCNTL_FILE_POINTER). The second argument is a WAL slice. The + * transaction described by this slice, and all following transactions, will be + * removed from the WAL. The WAL write lock will be released if there are no + * uncommitted transactions in the WAL afterward. + * + * All removed transactions must have been added to the WAL by + * vfs2_add_uncommitted, and must not have been made visible using vfs2_commit. + */ int vfs2_unadd(sqlite3_file *file, struct vfs2_wal_slice stop); +/** + * Request to read a specific transaction from a WAL file. + */ struct vfs2_wal_txn { struct vfs2_wal_slice meta; dqlite_vfs_frame *frames; @@ -90,8 +154,23 @@ int vfs2_read_wal(sqlite3_file *file, */ int vfs2_abort(sqlite3_file *file); +/** + * Try to set a read lock at a fixed place in the WAL-index. + * + * The first argument is the main file handle for the database + * (SQLITE_FCNTL_FILE_POINTER). The second argument is the desired value for the + * read mark, in units of frames. On success, the index of the read mark is + * written to the last argument, and the corresponding WAL read lock is held. + * + * This function may fail if all read marks are in used when it is called. + */ int vfs2_pseudo_read_begin(sqlite3_file *file, uint32_t target, unsigned *out); +/** + * Unset a read mark that was set by vfs2_pseudo_read_begin. + * + * This also releases the corresponding read lock. + */ int vfs2_pseudo_read_end(sqlite3_file *file, unsigned i); /** From 6d217698425de36c144caf715c1badd2501be70d Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Fri, 19 Jul 2024 16:07:12 -0400 Subject: [PATCH 24/73] Rename open_test_db, accept name as argument Signed-off-by: Cole Miller --- test/unit/test_vfs2.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/test/unit/test_vfs2.c b/test/unit/test_vfs2.c index 25a66b80e..c8f15a4e7 100644 --- a/test/unit/test_vfs2.c +++ b/test/unit/test_vfs2.c @@ -63,12 +63,12 @@ static void tear_down(void *data) } /** - * Open a connection to a test database for this node. + * Open a connection to a database for this node. */ -static sqlite3 *open_test_db(const struct node *node) +static sqlite3 *node_open_db(const struct node *node, const char *name) { char buf[PATH_MAX]; - snprintf(buf, sizeof(buf), "%s/test.db", node->dir); + snprintf(buf, sizeof(buf), "%s/%s", node->dir, name); sqlite3 *db; int rv = sqlite3_open_v2(buf, &db, SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE, @@ -154,7 +154,7 @@ TEST(vfs2, basic, set_up, tear_down, 0, NULL) struct node *node = &f->nodes[0]; int rv; - sqlite3 *db = open_test_db(node); + sqlite3 *db = node_open_db(node, "test.db"); sqlite3_file *fp = main_file(db); OK(sqlite3_exec(db, "CREATE TABLE foo (bar INTEGER)", NULL, NULL, NULL)); @@ -295,7 +295,7 @@ TEST(vfs2, startup_one_nonempty, set_up, tear_down, 0, NULL) uint8_t wal2_hdronly[WAL_SIZE_FROM_FRAMES(0)] = { 0 }; make_wal_hdr(wal2_hdronly, 0, 17, 103); prepare_wals(buf, NULL, 0, wal2_hdronly, sizeof(wal2_hdronly)); - sqlite3 *db = open_test_db(node); + sqlite3 *db = node_open_db(node, "test.db"); OK(sqlite3_exec(db, "CREATE TABLE foo (n INTEGER)", NULL, NULL, NULL)); OK(sqlite3_close(db)); @@ -324,7 +324,7 @@ TEST(vfs2, startup_both_nonempty, set_up, tear_down, 0, NULL) make_wal_hdr(wal2_hdronly, 0, 17, 103); prepare_wals(buf, wal1_hdronly, sizeof(wal1_hdronly), wal2_hdronly, sizeof(wal2_hdronly)); - sqlite3 *db = open_test_db(node); + sqlite3 *db = node_open_db(node, "test.db"); rv = sqlite3_exec(db, "PRAGMA page_size=" PAGE_SIZE_STR ";" @@ -354,7 +354,7 @@ TEST(vfs2, rollback, set_up, tear_down, 0, NULL) snprintf(buf, PATH_MAX, "%s/%s", node->dir, "test.db"); - sqlite3 *db = open_test_db(node); + sqlite3 *db = node_open_db(node, "test.db"); rv = sqlite3_exec(db, "PRAGMA journal_mode=WAL;" @@ -396,7 +396,7 @@ TEST(vfs2, leader_and_follower, set_up, tear_down, 0, NULL) struct node *follower = &f->nodes[1]; /* The leader executes and polls a transaction. */ - sqlite3 *leader_db = open_test_db(leader); + sqlite3 *leader_db = node_open_db(leader, "test.db"); OK(sqlite3_exec(leader_db, "CREATE TABLE foo (n INTEGER)", NULL, NULL, NULL)); sqlite3_file *leader_fp = main_file(leader_db); @@ -406,7 +406,7 @@ TEST(vfs2, leader_and_follower, set_up, tear_down, 0, NULL) munit_assert_uint(leader_sl.len, ==, 2); /* The follower receives the transaction. */ - sqlite3 *follower_db = open_test_db(follower); + sqlite3 *follower_db = node_open_db(follower, "test.db"); sqlite3_file *follower_fp = main_file(follower_db); struct vfs2_wal_slice follower_sl; OK(vfs2_add_uncommitted(follower_fp, PAGE_SIZE, frames, leader_sl.len, From cf2690fb71246a0ee43c60d34f0a7cd32aea8093 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Fri, 19 Jul 2024 16:08:28 -0400 Subject: [PATCH 25/73] Rename check_wals Signed-off-by: Cole Miller --- test/unit/test_vfs2.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/unit/test_vfs2.c b/test/unit/test_vfs2.c index c8f15a4e7..13d4a239a 100644 --- a/test/unit/test_vfs2.c +++ b/test/unit/test_vfs2.c @@ -121,7 +121,7 @@ static void prepare_wals(const char *dbname, /** * Assert the lengths of WAL1 and WAL2 on disk. */ -static void check_wals(const char *dbname, off_t wal1_len, off_t wal2_len) +static void assert_wal_sizes(const char *dbname, off_t wal1_len, off_t wal2_len) { char buf[PATH_MAX]; struct stat st; @@ -290,7 +290,7 @@ TEST(vfs2, startup_one_nonempty, set_up, tear_down, 0, NULL) snprintf(buf, PATH_MAX, "%s/%s", node->dir, "test.db"); - check_wals(buf, 0, 0); + assert_wal_sizes(buf, 0, 0); uint8_t wal2_hdronly[WAL_SIZE_FROM_FRAMES(0)] = { 0 }; make_wal_hdr(wal2_hdronly, 0, 17, 103); @@ -299,7 +299,7 @@ TEST(vfs2, startup_one_nonempty, set_up, tear_down, 0, NULL) OK(sqlite3_exec(db, "CREATE TABLE foo (n INTEGER)", NULL, NULL, NULL)); OK(sqlite3_close(db)); - check_wals(buf, WAL_SIZE_FROM_FRAMES(2), WAL_SIZE_FROM_FRAMES(0)); + assert_wal_sizes(buf, WAL_SIZE_FROM_FRAMES(2), WAL_SIZE_FROM_FRAMES(0)); return MUNIT_OK; } @@ -316,7 +316,7 @@ TEST(vfs2, startup_both_nonempty, set_up, tear_down, 0, NULL) int rv; snprintf(buf, PATH_MAX, "%s/%s", node->dir, "test.db"); - check_wals(buf, 0, 0); + assert_wal_sizes(buf, 0, 0); uint8_t wal1_hdronly[WAL_SIZE_FROM_FRAMES(0)] = { 0 }; make_wal_hdr(wal1_hdronly, 0, 18, 103); @@ -337,7 +337,7 @@ TEST(vfs2, startup_both_nonempty, set_up, tear_down, 0, NULL) rv = sqlite3_close(db); munit_assert_int(rv, ==, SQLITE_OK); - check_wals(buf, WAL_SIZE_FROM_FRAMES(0), WAL_SIZE_FROM_FRAMES(2)); + assert_wal_sizes(buf, WAL_SIZE_FROM_FRAMES(0), WAL_SIZE_FROM_FRAMES(2)); return MUNIT_OK; } From 88a31312a177ff42d0731fb49d434684f1391950 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Fri, 19 Jul 2024 16:09:31 -0400 Subject: [PATCH 26/73] Rename constants describing main file header Signed-off-by: Cole Miller --- src/vfs2.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/vfs2.c b/src/vfs2.c index 5d8b61de8..0428f3e80 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -38,8 +38,8 @@ #define READ_MARK_UNUSED 0xffffffff -#define DB_HEADER_SIZE 100 -#define DB_HEADER_NPAGES_OFFSET 28 +#define DB_FILE_HEADER_SIZE 100 +#define DB_FILE_HEADER_NPAGES_OFFSET 28 static const uint32_t invalid_magic = 0x17171717; @@ -1772,14 +1772,14 @@ int vfs2_add_uncommitted(sqlite3_file *file, sums.cksum2 = ByteGetBe32(e->wal_cur_hdr.cksum2); /* The database size in pages is kept in a field of the database * header. */ - uint8_t b[DB_HEADER_SIZE]; + uint8_t b[DB_FILE_HEADER_SIZE]; rv = xfile->orig->pMethods->xRead(xfile->orig, &b, sizeof(b), 0); /* TODO(cole) this can't fail provided that the main file * has been created; ensure that this is the case even if * we haven't run a checkpoint yet. */ assert(rv == SQLITE_OK); - db_size = ByteGetBe32(b + DB_HEADER_NPAGES_OFFSET); + db_size = ByteGetBe32(b + DB_FILE_HEADER_NPAGES_OFFSET); } POST(db_size > 0); From 7987e0d495024e0e0a28c8d556f34f3eabf8cf11 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Fri, 19 Jul 2024 16:13:29 -0400 Subject: [PATCH 27/73] Clean up vfs2 rollback test Signed-off-by: Cole Miller --- test/unit/test_vfs2.c | 37 ++++++++++--------------------------- 1 file changed, 10 insertions(+), 27 deletions(-) diff --git a/test/unit/test_vfs2.c b/test/unit/test_vfs2.c index 13d4a239a..00ec9c146 100644 --- a/test/unit/test_vfs2.c +++ b/test/unit/test_vfs2.c @@ -349,39 +349,22 @@ TEST(vfs2, rollback, set_up, tear_down, 0, NULL) { struct fixture *f = data; struct node *node = &f->nodes[0]; - char buf[PATH_MAX]; - int rv; - - snprintf(buf, PATH_MAX, "%s/%s", node->dir, "test.db"); - - sqlite3 *db = node_open_db(node, "test.db"); - - rv = sqlite3_exec(db, - "PRAGMA journal_mode=WAL;" - "PRAGMA wal_autocheckpoint=0", - NULL, NULL, NULL); - munit_assert_int(rv, ==, SQLITE_OK); - rv = sqlite3_exec(db, "CREATE TABLE foo (n INTEGER)", NULL, NULL, NULL); - munit_assert_int(rv, ==, SQLITE_OK); - sqlite3_file *fp; - sqlite3_file_control(db, "main", SQLITE_FCNTL_FILE_POINTER, &fp); struct vfs2_wal_slice sl; - rv = vfs2_poll(fp, NULL, &sl); - munit_assert_int(rv, ==, 0); - rv = vfs2_unhide(fp); - rv = sqlite3_exec(db, "BEGIN", NULL, NULL, NULL); - munit_assert_int(rv, ==, SQLITE_OK); char sql[100]; + + sqlite3 *db = node_open_db(node, "test.db"); + OK(sqlite3_exec(db, "CREATE TABLE foo (n INTEGER)", NULL, NULL, NULL)); + sqlite3_file *fp = main_file(db); + OK(vfs2_poll(fp, NULL, &sl)); + OK(vfs2_unhide(fp)); + OK(sqlite3_exec(db, "BEGIN", NULL, NULL, NULL)); for (unsigned i = 0; i < 500; i++) { snprintf(sql, sizeof(sql), "INSERT INTO foo (n) VALUES (%d)", i); - rv = sqlite3_exec(db, sql, NULL, NULL, NULL); - munit_assert_int(rv, ==, SQLITE_OK); + OK(sqlite3_exec(db, sql, NULL, NULL, NULL)); } - rv = sqlite3_exec(db, "ROLLBACK", NULL, NULL, NULL); - munit_assert_int(rv, ==, SQLITE_OK); - rv = sqlite3_close(db); - munit_assert_int(rv, ==, SQLITE_OK); + OK(sqlite3_exec(db, "ROLLBACK", NULL, NULL, NULL)); + OK(sqlite3_close(db)); return MUNIT_OK; } From 6febc7bdfdd44c6dea3c9c004659f7fdb1159634 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Fri, 19 Jul 2024 16:23:01 -0400 Subject: [PATCH 28/73] Add TODOs about long WALs Signed-off-by: Cole Miller --- src/vfs2.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/vfs2.c b/src/vfs2.c index 0428f3e80..5641a97e5 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -1005,6 +1005,7 @@ static void set_mx_frame(struct entry *e, struct vfs2_shm_region0 *r0 = e->shm_regions[0]; PRE(mx <= REGION0_PGNOS_LEN); for (uint32_t i = old_mx; i < mx; i++) { + /* TODO(cole) Support hash tables beyond the first. */ PRE(i < REGION0_PGNOS_LEN); pgno_ht_insert(r0->ht, REGION0_HT_LEN, (uint16_t)i, r0->pgnos[i]); @@ -1784,6 +1785,7 @@ int vfs2_add_uncommitted(sqlite3_file *file, POST(db_size > 0); struct vfs2_shm_region0 *r0 = e->shm_regions[0]; + /* TODO(cole) Support hash tables beyond the first. */ PRE(e->wal_cursor < REGION0_PGNOS_LEN); r0->pgnos[e->wal_cursor] = (uint32_t)frames[0].page_number; From 7834e33af35a1c60357be73d4599e97a627c5d51 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Fri, 19 Jul 2024 16:39:23 -0400 Subject: [PATCH 29/73] Add comments about handling of the hash table Signed-off-by: Cole Miller --- src/vfs2.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/vfs2.c b/src/vfs2.c index 5641a97e5..4e5054d9b 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -1005,6 +1005,9 @@ static void set_mx_frame(struct entry *e, struct vfs2_shm_region0 *r0 = e->shm_regions[0]; PRE(mx <= REGION0_PGNOS_LEN); for (uint32_t i = old_mx; i < mx; i++) { + /* The page numbers array was already updated during the call + * to add_uncommitted, so we just need to update the hash array. + */ /* TODO(cole) Support hash tables beyond the first. */ PRE(i < REGION0_PGNOS_LEN); pgno_ht_insert(r0->ht, REGION0_HT_LEN, (uint16_t)i, @@ -1784,6 +1787,16 @@ int vfs2_add_uncommitted(sqlite3_file *file, } POST(db_size > 0); + /* Record the new frame in the appropriate page number array in the WAL + * index. Note that this doesn't make the frame visible to readers: that + * only happens once it is also recorded in the WAL-index hash array and + * mxFrame exceeds the frame index. The hash array is updated only when + * we increase mxFrame, so that we don't have to deal with the added + * complexity of removing things from the hash table when frames are + * overwritten before being committed. Updating the page number array + * "early" like this is harmless and saves us from having to stash the + * page numbers somewhere else in memory in between add_uncommitted and + * apply, or (worse) read them back from WAL-cur. */ struct vfs2_shm_region0 *r0 = e->shm_regions[0]; /* TODO(cole) Support hash tables beyond the first. */ PRE(e->wal_cursor < REGION0_PGNOS_LEN); From 13fe31549a48fbb8e4f5061ccc590f63fb6448be Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Fri, 19 Jul 2024 16:56:58 -0400 Subject: [PATCH 30/73] Clarify comment about releasing the write lock Signed-off-by: Cole Miller --- src/vfs2.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/vfs2.c b/src/vfs2.c index 4e5054d9b..a229b57d2 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -858,8 +858,11 @@ static int vfs2_shm_lock(sqlite3_file *file, int ofst, int n, int flags) } if (ofst == WAL_WRITE_LOCK) { - /* Unlocking the write lock: roll back any uncommitted - * transaction. */ + /* If the last frame of the pending transaction has no + * commit marker when SQLite releases the write lock, it + * means that the transaction rolled back before it + * committed. We respond by throwing away our stored + * frames and resetting the state machine. */ assert(n == 1); if (sm_state(&e->wtx_sm) > WTX_BASE && e->pending_txn_last_frame_commit == 0) { From ba61610c9ea26de23d37c14b6d14e60637ef5bfe Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Sun, 21 Jul 2024 22:03:09 -0400 Subject: [PATCH 31/73] Initialize W-i header with all unused read marks This function was imitated from SQLite, but SQLite initializes the WAL-index header on demand when a transaction starts reading from the WAL, so it starts out with an active reader. That doesn't apply for us, so make all the read marks initially unused. Signed-off-by: Cole Miller --- src/vfs2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vfs2.c b/src/vfs2.c index a229b57d2..273b21d19 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -969,7 +969,7 @@ static struct wal_index_full_hdr initial_full_hdr(struct wal_hdr whdr) ihdr.basic[0].cksums = sums; ihdr.basic[1] = ihdr.basic[0]; ihdr.marks[0] = 0; - ihdr.marks[1] = 0; + ihdr.marks[1] = READ_MARK_UNUSED; ihdr.marks[2] = READ_MARK_UNUSED; ihdr.marks[3] = READ_MARK_UNUSED; ihdr.marks[4] = READ_MARK_UNUSED; From 73b279d75661ea6080f11b0798219793c4a2c208 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Mon, 22 Jul 2024 14:59:03 -0400 Subject: [PATCH 32/73] Fix comparison to SQLITE_OK Signed-off-by: Cole Miller --- src/vfs2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vfs2.c b/src/vfs2.c index 273b21d19..5028508eb 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -1456,7 +1456,7 @@ int vfs2_apply(sqlite3_file *file, struct vfs2_wal_slice stop) int rv = wal_cur->pMethods->xRead( wal_cur, &fhdr, sizeof(fhdr), wal_offset_from_cursor(e->page_size, stop.start + stop.len - 1)); - if (rv == SQLITE_OK) { + if (rv != SQLITE_OK) { return rv; } set_mx_frame(e, commit, fhdr); From 4b3c238c11c3b19fcbfe2f338481dafd65c37f29 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Mon, 22 Jul 2024 16:11:59 -0400 Subject: [PATCH 33/73] Simplify and correct W-i hdr cksum calculations First, there's no need for the endianness parameter to the checksum calculation unless we're copying the contents of the database directory between machines with different endiannesses. This was not properly supported by the old code (we just used native_magic() everywhere), and it seems like an edge case, so let's cut the complexity for now. Second, factor out the repeated logic for calculating and writing the checksums in the WAL-index header. This fixes a bug that showed up in vfs2_apply where we were checksumming the wrong number of bytes. Signed-off-by: Cole Miller --- src/vfs2.c | 75 ++++++++++++++++++++++++------------------------------ 1 file changed, 33 insertions(+), 42 deletions(-) diff --git a/src/vfs2.c b/src/vfs2.c index 5028508eb..4f7c30d9b 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -129,27 +129,18 @@ static bool is_bigendian(void) return *(char *)(&x) == 0; } -static uint32_t native_magic(void) +static void update_cksums(const uint8_t *p, size_t len, struct cksums *sums) { - return is_bigendian() ? BE_MAGIC : LE_MAGIC; -} - -static void update_cksums(uint32_t magic, - const uint8_t *p, - size_t len, - struct cksums *sums) -{ - PRE(magic == BE_MAGIC || magic == LE_MAGIC); PRE(len % 8 == 0); const uint8_t *end = p + len; - for (; p != end; p += 8) { - if (magic == BE_MAGIC) { - sums->cksum1 += ByteGetBe32(p) + sums->cksum2; - sums->cksum2 += ByteGetBe32(p + 4) + sums->cksum1; - } else { - sums->cksum1 += ByteGetLe32(p) + sums->cksum2; - sums->cksum2 += ByteGetLe32(p + 4) + sums->cksum1; - } + uint32_t n; + while (p != end) { + memcpy(&n, p, 4); + sums->cksum1 += n + sums->cksum2; + p += 4; + memcpy(&n, p, 4); + sums->cksum2 += n + sums->cksum1; + p += 4; } } @@ -354,18 +345,19 @@ static bool is_valid_page_size(unsigned long n) return n >= 1 << 9 && n <= 1 << 16 && is_po2(n); } -static bool basic_hdr_valid(struct wal_index_basic_hdr bhdr) +static bool basic_hdr_valid(const struct wal_index_basic_hdr *bhdr) { struct cksums sums = {}; - update_cksums(bhdr.bigEndCksum ? BE_MAGIC : LE_MAGIC, (uint8_t *)&bhdr, - offsetof(struct wal_index_basic_hdr, cksums), &sums); - return bhdr.iVersion == 3007000 && bhdr.isInit == 1 && - cksums_equal(sums, bhdr.cksums); + const uint8_t *p = (const uint8_t *)bhdr; + size_t len = offsetof(struct wal_index_basic_hdr, cksums); + update_cksums(p, len, &sums); + return bhdr->iVersion == 3007000 && bhdr->isInit == 1 && + cksums_equal(sums, bhdr->cksums); } static bool full_hdr_valid(const struct wal_index_full_hdr *ihdr) { - return basic_hdr_valid(ihdr->basic[0]) && + return basic_hdr_valid(&ihdr->basic[0]) && wal_index_basic_hdr_equal(ihdr->basic[0], ihdr->basic[1]); } @@ -956,6 +948,15 @@ static int read_wal_hdr(sqlite3_file *wal, return SQLITE_OK; } +static void write_basic_hdr_cksums(struct wal_index_basic_hdr *bhdr) +{ + struct cksums sums = {}; + const uint8_t *p = (const uint8_t *)bhdr; + size_t len = offsetof(struct wal_index_basic_hdr, cksums); + update_cksums(p, len, &sums); + bhdr->cksums = sums; +} + static struct wal_index_full_hdr initial_full_hdr(struct wal_hdr whdr) { struct wal_index_full_hdr ihdr = {}; @@ -963,10 +964,7 @@ static struct wal_index_full_hdr initial_full_hdr(struct wal_hdr whdr) ihdr.basic[0].isInit = 1; ihdr.basic[0].bigEndCksum = is_bigendian(); ihdr.basic[0].szPage = (uint16_t)ByteGetBe32(whdr.page_size); - struct cksums sums = {}; - update_cksums(native_magic(), (const void *)&ihdr.basic[0], - offsetof(struct wal_index_basic_hdr, cksums), &sums); - ihdr.basic[0].cksums = sums; + write_basic_hdr_cksums(&ihdr.basic[0]); ihdr.basic[1] = ihdr.basic[0]; ihdr.marks[0] = 0; ihdr.marks[1] = READ_MARK_UNUSED; @@ -998,10 +996,7 @@ static void set_mx_frame(struct entry *e, ihdr->basic[0].nPage = num_pages; ihdr->basic[0].frame_cksums.cksum1 = ByteGetBe32(fhdr.cksum1); ihdr->basic[0].frame_cksums.cksum2 = ByteGetBe32(fhdr.cksum2); - struct cksums sums = {}; - update_cksums(native_magic(), (const uint8_t *)&ihdr->basic[0], - sizeof(ihdr->basic[0]), &sums); - ihdr->basic[0].cksums = sums; + write_basic_hdr_cksums(&ihdr->basic[0]); ihdr->basic[1] = ihdr->basic[0]; POST(full_hdr_valid(ihdr)); @@ -1024,9 +1019,7 @@ static void restart_full_hdr(struct wal_index_full_hdr *ihdr, /* cf. walRestartHdr */ ihdr->basic[0].mxFrame = 0; ihdr->basic[0].salts = new_whdr.salts; - struct cksums sums = {}; - update_cksums(native_magic(), (const void *)&ihdr->basic[0], 40, &sums); - ihdr->basic[0].cksums = sums; + write_basic_hdr_cksums(&ihdr->basic[0]); ihdr->basic[1] = ihdr->basic[0]; ihdr->nBackfill = 0; ihdr->nBackfillAttempted = 0; @@ -1660,7 +1653,7 @@ static struct wal_hdr next_wal_hdr(const struct entry *e) * us access to this PRNG, seeded from the default (unix) VFS. */ struct wal_hdr ret; struct wal_hdr old = e->wal_cur_hdr; - BytePutBe32(native_magic(), ret.magic); + BytePutBe32(is_bigendian() ? BE_MAGIC : LE_MAGIC, ret.magic); BytePutBe32(3007000, ret.version); BytePutBe32(e->page_size, ret.page_size); uint32_t ckpoint_seqno = ByteGetBe32(old.ckpoint_seqno); @@ -1674,8 +1667,8 @@ static struct wal_hdr next_wal_hdr(const struct entry *e) BytePutBe32(salt1, ret.salts.salt1); sqlite3_randomness(sizeof(ret.salts.salt2), (void *)&ret.salts.salt2); struct cksums sums = {}; - update_cksums(native_magic(), (const uint8_t *)&ret, - offsetof(struct wal_hdr, cksum1), &sums); + update_cksums((const uint8_t *)&ret, offsetof(struct wal_hdr, cksum1), + &sums); BytePutBe32(sums.cksum1, ret.cksum1); BytePutBe32(sums.cksum2, ret.cksum2); return ret; @@ -1690,10 +1683,8 @@ static struct wal_frame_hdr txn_frame_hdr(struct entry *e, BytePutBe32((uint32_t)frame->page_number, fhdr.page_number); BytePutBe32(commit, fhdr.commit); - update_cksums(ByteGetBe32(e->wal_cur_hdr.magic), (const void *)(&fhdr), - 8, &sums); - update_cksums(ByteGetBe32(e->wal_cur_hdr.magic), frame->data, - e->page_size, &sums); + update_cksums((const void *)(&fhdr), 8, &sums); + update_cksums(frame->data, e->page_size, &sums); fhdr.salts = e->wal_cur_hdr.salts; BytePutBe32(sums.cksum1, fhdr.cksum1); BytePutBe32(sums.cksum2, fhdr.cksum2); From 6c2fdc92f963dd6c686d53217c6cc45ed534f8a2 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Mon, 22 Jul 2024 17:02:00 -0400 Subject: [PATCH 34/73] Declare sm relationship in two-node vfs2 test Signed-off-by: Cole Miller --- src/vfs2.c | 10 ++++++++++ src/vfs2.h | 9 +++++++++ test/unit/test_vfs2.c | 5 ++++- 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/src/vfs2.c b/src/vfs2.c index 4f7c30d9b..34929ae62 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -3,6 +3,7 @@ #include "lib/byte.h" #include "lib/queue.h" #include "lib/sm.h" +#include "tracing.h" #include "utils.h" #include @@ -1878,3 +1879,12 @@ int vfs2_pseudo_read_end(sqlite3_file *file, unsigned i) e->shm_locks[i] -= 1; return 0; } + +void vfs2_ut_sm_relate(sqlite3_file *orig, sqlite3_file *targ) +{ + struct file *forig = (struct file *)orig; + PRE(forig->flags & SQLITE_OPEN_MAIN_DB); + struct file *ftarg = (struct file *)targ; + PRE(ftarg->flags & SQLITE_OPEN_MAIN_DB); + sm_relate(&forig->entry->wtx_sm, &ftarg->entry->wtx_sm); +} diff --git a/src/vfs2.h b/src/vfs2.h index 45fe87909..46db29e0d 100644 --- a/src/vfs2.h +++ b/src/vfs2.h @@ -181,4 +181,13 @@ int vfs2_pseudo_read_end(sqlite3_file *file, unsigned i); */ void vfs2_destroy(sqlite3_vfs *vfs); +/** + * Declare a relationship between two databases opened by (possibly distinct) + * vfs2 instances. + * + * Intended for use by unit tests. The arguments are main file handles + * (SQLITE_FCNTL_FILE_POINTER). + */ +void vfs2_ut_sm_relate(sqlite3_file *orig, sqlite3_file *targ); + #endif diff --git a/test/unit/test_vfs2.c b/test/unit/test_vfs2.c index 00ec9c146..e4f51817d 100644 --- a/test/unit/test_vfs2.c +++ b/test/unit/test_vfs2.c @@ -388,9 +388,12 @@ TEST(vfs2, leader_and_follower, set_up, tear_down, 0, NULL) OK(vfs2_poll(leader_fp, &frames, &leader_sl)); munit_assert_uint(leader_sl.len, ==, 2); - /* The follower receives the transaction. */ + /* The follower opens its database. */ sqlite3 *follower_db = node_open_db(follower, "test.db"); sqlite3_file *follower_fp = main_file(follower_db); + vfs2_ut_sm_relate(leader_fp, follower_fp); + + /* The follower receives the transaction. */ struct vfs2_wal_slice follower_sl; OK(vfs2_add_uncommitted(follower_fp, PAGE_SIZE, frames, leader_sl.len, &follower_sl)); From dc3be92308185ff580aa1690cd3e0a54c38417f9 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Mon, 22 Jul 2024 17:57:18 -0400 Subject: [PATCH 35/73] Check validity of WAL headers at startup Signed-off-by: Cole Miller --- src/vfs2.c | 46 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 12 deletions(-) diff --git a/src/vfs2.c b/src/vfs2.c index 34929ae62..52b8944fe 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -928,9 +928,29 @@ static int compare_wal_headers(struct wal_hdr a, return SQLITE_OK; } -static int read_wal_hdr(sqlite3_file *wal, - sqlite3_int64 *size, - struct wal_hdr *hdr) +static bool wal_hdr_is_valid(const struct wal_hdr *hdr) +{ + /* TODO(cole) Add other validity constraints. */ + struct cksums sums_found = {}; + const uint8_t *p = (const uint8_t *)hdr; + size_t len = offsetof(struct wal_hdr, cksum1); + update_cksums(p, len, &sums_found); + struct cksums sums_expected = { ByteGetBe32(hdr->cksum1), + ByteGetBe32(hdr->cksum2) }; + return cksums_equal(sums_expected, sums_found); +} + +/** + * Read the header of the given WAL file and detect corruption. + * + * If the file contains a valid header, returns this header in `hdr` and the + * size of the file in `size`. If the file is too short to contain a header, + * returns the size only. If the file can't be read, or it contains an invalid + * header, returns an error. + */ +static int try_read_wal_hdr(sqlite3_file *wal, + sqlite3_int64 *size, + struct wal_hdr *hdr) { int rv; @@ -938,13 +958,15 @@ static int read_wal_hdr(sqlite3_file *wal, if (rv != SQLITE_OK) { return rv; } - if (*size >= (sqlite3_int64)sizeof(struct wal_hdr)) { - rv = wal->pMethods->xRead(wal, hdr, sizeof(*hdr), 0); - if (rv != SQLITE_OK) { - return rv; - } - } else { - *hdr = (struct wal_hdr){}; + if (*size < (sqlite3_int64)sizeof(*hdr)) { + return SQLITE_OK; + } + rv = wal->pMethods->xRead(wal, hdr, sizeof(*hdr), 0); + if (rv != SQLITE_OK) { + return rv; + } + if (!wal_hdr_is_valid(hdr)) { + return SQLITE_CORRUPT; } return SQLITE_OK; } @@ -1105,13 +1127,13 @@ static int open_entry(struct common *common, const char *name, struct entry *e) sqlite3_int64 size1; struct wal_hdr hdr1; - rv = read_wal_hdr(e->wal_cur, &size1, &hdr1); + rv = try_read_wal_hdr(e->wal_cur, &size1, &hdr1); if (rv != SQLITE_OK) { return rv; } sqlite3_int64 size2; struct wal_hdr hdr2; - rv = read_wal_hdr(e->wal_prev, &size2, &hdr2); + rv = try_read_wal_hdr(e->wal_prev, &size2, &hdr2); if (rv != SQLITE_OK) { return rv; } From 05d9e91bd246a441e51b513489e08ff7ac7aa15e Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Tue, 23 Jul 2024 15:06:19 -0400 Subject: [PATCH 36/73] Share WAL header calculation between vfs2.c and UT Introduce a new helper function and refactor some logic in vfs2.c so that we don't need redundant code in test_vfs2.c to compute WAL headers. Signed-off-by: Cole Miller --- src/vfs2.c | 77 ++++++++++++++++++++++++++++++++----------- src/vfs2.h | 13 ++++++++ test/unit/test_vfs2.c | 55 +++---------------------------- 3 files changed, 75 insertions(+), 70 deletions(-) diff --git a/src/vfs2.c b/src/vfs2.c index 52b8944fe..5ad4b9ed7 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -322,6 +322,14 @@ static bool salts_equal(struct vfs2_salts a, struct vfs2_salts b) return get_salt1(a) == get_salt1(b) && get_salt2(a) == get_salt2(b); } +static struct vfs2_salts make_salts(uint32_t salt1, uint32_t salt2) +{ + struct vfs2_salts ret; + BytePutBe32(salt1, ret.salt1); + BytePutBe32(salt2, ret.salt2); + return ret; +} + static struct wal_index_full_hdr *get_full_hdr(struct entry *e) { PRE(e->shm_regions_len > 0); @@ -1666,35 +1674,54 @@ static int write_one_frame(struct entry *e, return SQLITE_OK; } +/** + * Create a valid WAL header from the specified fields. + */ +static struct wal_hdr make_wal_hdr(uint32_t magic, + uint32_t page_size, + uint32_t ckpoint_seqno, + struct vfs2_salts salts) +{ + struct wal_hdr ret; + BytePutBe32(magic, ret.magic); + BytePutBe32(3007000, ret.version); + BytePutBe32(page_size, ret.page_size); + BytePutBe32(ckpoint_seqno, ret.ckpoint_seqno); + ret.salts = salts; + struct cksums sums = {}; + const uint8_t *p = (const uint8_t *)&ret; + size_t len = offsetof(struct wal_hdr, cksum1); + update_cksums(p, len, &sums); + BytePutBe32(sums.cksum1, ret.cksum1); + BytePutBe32(sums.cksum2, ret.cksum2); + return ret; +} + +/** + * Derive the next header that should be written to start a new WAL. + * + * To get the next header, we start with the header of WAL-cur, increment + * salt1 and ckpoint_seqno, and randomize salt2. + */ static struct wal_hdr next_wal_hdr(const struct entry *e) { + struct wal_hdr old = e->wal_cur_hdr; + uint32_t magic = is_bigendian() ? BE_MAGIC : LE_MAGIC; + uint32_t ckpoint_seqno = ByteGetBe32(old.ckpoint_seqno) + 1; /* salt2 is randomized every time we generate a new WAL header. * We don't use the xRandomness method of the base VFS to do this, * because it always translates to a syscall (getrandom), and * SQLite intends that this should only be used for seeding the * internal PRNG. Instead, we call sqlite3_randomness, which gives * us access to this PRNG, seeded from the default (unix) VFS. */ - struct wal_hdr ret; - struct wal_hdr old = e->wal_cur_hdr; - BytePutBe32(is_bigendian() ? BE_MAGIC : LE_MAGIC, ret.magic); - BytePutBe32(3007000, ret.version); - BytePutBe32(e->page_size, ret.page_size); - uint32_t ckpoint_seqno = ByteGetBe32(old.ckpoint_seqno); - BytePutBe32(ckpoint_seqno + 1, ret.ckpoint_seqno); - uint32_t salt1; - if (ckpoint_seqno == 0) { - salt1 = get_salt1(old.salts) + 1; + struct vfs2_salts salts; + if (ckpoint_seqno == 1) { + sqlite3_randomness(sizeof(salts.salt1), (void *)&salts.salt1); } else { - sqlite3_randomness(sizeof(salt1), (void *)&salt1); + BytePutBe32(get_salt1(old.salts) + 1, salts.salt1); } - BytePutBe32(salt1, ret.salts.salt1); - sqlite3_randomness(sizeof(ret.salts.salt2), (void *)&ret.salts.salt2); - struct cksums sums = {}; - update_cksums((const uint8_t *)&ret, offsetof(struct wal_hdr, cksum1), - &sums); - BytePutBe32(sums.cksum1, ret.cksum1); - BytePutBe32(sums.cksum2, ret.cksum2); - return ret; + sqlite3_randomness(sizeof(salts.salt2), (void *)&salts.salt2); + return make_wal_hdr(magic, e->page_size, ckpoint_seqno, salts); } static struct wal_frame_hdr txn_frame_hdr(struct entry *e, @@ -1910,3 +1937,15 @@ void vfs2_ut_sm_relate(sqlite3_file *orig, sqlite3_file *targ) PRE(ftarg->flags & SQLITE_OPEN_MAIN_DB); sm_relate(&forig->entry->wtx_sm, &ftarg->entry->wtx_sm); } + +void vfs2_ut_make_wal_hdr(uint8_t *buf, + uint32_t page_size, + uint32_t ckpoint_seqno, + uint32_t salt1, + uint32_t salt2) +{ + struct wal_hdr hdr = + make_wal_hdr(is_bigendian() ? BE_MAGIC : LE_MAGIC, page_size, + ckpoint_seqno, make_salts(salt1, salt2)); + memcpy(buf, &hdr, sizeof(hdr)); +} diff --git a/src/vfs2.h b/src/vfs2.h index 46db29e0d..790fd5436 100644 --- a/src/vfs2.h +++ b/src/vfs2.h @@ -190,4 +190,17 @@ void vfs2_destroy(sqlite3_vfs *vfs); */ void vfs2_ut_sm_relate(sqlite3_file *orig, sqlite3_file *targ); +#define VFS2_WAL_HDR_SIZE 32 + +/** + * Write a WAL header with the provided fields to the given buffer. + * + * The size of the buffer must be at least VFS2_WAL_HDR_SIZE bytes. + */ +void vfs2_ut_make_wal_hdr(uint8_t *buf, + uint32_t page_size, + uint32_t ckpoint_seqno, + uint32_t salt1, + uint32_t salt2); + #endif diff --git a/test/unit/test_vfs2.c b/test/unit/test_vfs2.c index e4f51817d..d1fdf1c70 100644 --- a/test/unit/test_vfs2.c +++ b/test/unit/test_vfs2.c @@ -230,54 +230,7 @@ TEST(vfs2, basic, set_up, tear_down, 0, NULL) return MUNIT_OK; } -#define WAL_SIZE_FROM_FRAMES(n) (32 + (24 + PAGE_SIZE) * (n)) - -/** - * Create a WAL header with the sequence number and salts set to the - * given values. - */ -static void make_wal_hdr(uint8_t *buf, - uint32_t ckpoint_seqno, - uint32_t salt1, - uint32_t salt2) -{ - uint8_t *p = buf; - - /* checksum */ - BytePutBe32(0x377f0683, p); - p += 4; - BytePutBe32(3007000, p); - p += 4; - BytePutBe32(PAGE_SIZE, p); - p += 4; - BytePutBe32(ckpoint_seqno, p); - p += 4; - BytePutBe32(salt1, p); - p += 4; - BytePutBe32(salt2, p); - p += 4; - - uint32_t s0 = 0; - uint32_t s1 = 0; - size_t off = 0; - - s0 += ByteGetBe32(buf + off) + s1; - s1 += ByteGetBe32(buf + off + 4) + s0; - off += 8; - - s0 += ByteGetBe32(buf + off) + s1; - s1 += ByteGetBe32(buf + off + 4) + s0; - off += 8; - - s0 += ByteGetBe32(buf + off) + s1; - s1 += ByteGetBe32(buf + off + 4) + s0; - off += 8; - - BytePutBe32(s0, p); - p += 4; - BytePutBe32(s1, p); - p += 4; -} +#define WAL_SIZE_FROM_FRAMES(n) (VFS2_WAL_HDR_SIZE + (24 + PAGE_SIZE) * (n)) /** * When only one WAL is nonempty at startup, that WAL becomes WAL-cur. @@ -293,7 +246,7 @@ TEST(vfs2, startup_one_nonempty, set_up, tear_down, 0, NULL) assert_wal_sizes(buf, 0, 0); uint8_t wal2_hdronly[WAL_SIZE_FROM_FRAMES(0)] = { 0 }; - make_wal_hdr(wal2_hdronly, 0, 17, 103); + vfs2_ut_make_wal_hdr(wal2_hdronly, PAGE_SIZE, 0, 17, 103); prepare_wals(buf, NULL, 0, wal2_hdronly, sizeof(wal2_hdronly)); sqlite3 *db = node_open_db(node, "test.db"); OK(sqlite3_exec(db, "CREATE TABLE foo (n INTEGER)", NULL, NULL, NULL)); @@ -319,9 +272,9 @@ TEST(vfs2, startup_both_nonempty, set_up, tear_down, 0, NULL) assert_wal_sizes(buf, 0, 0); uint8_t wal1_hdronly[WAL_SIZE_FROM_FRAMES(0)] = { 0 }; - make_wal_hdr(wal1_hdronly, 0, 18, 103); + vfs2_ut_make_wal_hdr(wal1_hdronly, PAGE_SIZE, 0, 18, 103); uint8_t wal2_hdronly[WAL_SIZE_FROM_FRAMES(0)] = { 0 }; - make_wal_hdr(wal2_hdronly, 0, 17, 103); + vfs2_ut_make_wal_hdr(wal2_hdronly, PAGE_SIZE, 0, 17, 103); prepare_wals(buf, wal1_hdronly, sizeof(wal1_hdronly), wal2_hdronly, sizeof(wal2_hdronly)); sqlite3 *db = node_open_db(node, "test.db"); From 8912b8e3a91c22ceb3329139121cdb65370753da Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Tue, 23 Jul 2024 21:41:13 -0400 Subject: [PATCH 37/73] Require the page size to be known before add_uncommitted Signed-off-by: Cole Miller --- src/vfs2.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/vfs2.c b/src/vfs2.c index 5ad4b9ed7..82212d886 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -1752,12 +1752,13 @@ int vfs2_add_uncommitted(sqlite3_file *file, struct file *xfile = (struct file *)file; PRE(xfile->flags & SQLITE_OPEN_MAIN_DB); struct entry *e = xfile->entry; - if (e->page_size == 0) { - e->page_size = page_size; - } - PRE(page_size == e->page_size); int rv; + /* We require the page size to have been initialized by this point, + * either by reading the WAL when opening the entry or by `PRAGMA + * page_size`. */ + PRE(page_size == e->page_size); + /* The write lock is always held if there is at least one * uncommitted frame in WAL-cur. In FOLLOWING state, we allow * adding more frames to WAL-cur even if there are already From 97c9cb3d03e91cc64e3798a6c008ee0534fde197 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Tue, 23 Jul 2024 23:37:11 -0400 Subject: [PATCH 38/73] Split the WAL swap and the WAL header write This cleaves the dqlite-specific WAL swap, which targets the volatile state, from writing a new WAL header, which is pure SQLite and targets the persistent state. It's not a completely clean split because the new wal_swap function does write the outgoing WAL and flip the moving name, but I think it's still a win for readability. This change also supports an upcoming refactor to vfs2_add_uncommitted. Signed-off-by: Cole Miller --- src/vfs2.c | 64 +++++++++++++++++++++++++++--------------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/src/vfs2.c b/src/vfs2.c index 82212d886..a9c519a46 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -456,14 +456,13 @@ static int vfs2_read(sqlite3_file *file, void *buf, int amt, sqlite3_int64 ofst) return orig->pMethods->xRead(orig, buf, amt, ofst); } -static int wal_swap(struct entry *e, const struct wal_hdr *wal_hdr) +/** + * Update the volatile state by exchanging WAL-cur and WAL-prev. + */ +static void wal_swap(struct entry *e, const struct wal_hdr *hdr) { - PRE(e->pending_txn_len == 0); - PRE(e->pending_txn_frames == NULL); int rv; - e->page_size = ByteGetBe32(wal_hdr->page_size); - /* Terminology: the outgoing WAL is the one that's moving * from cur to prev. The incoming WAL is the one that's moving * from prev to cur. */ @@ -472,39 +471,37 @@ static int wal_swap(struct entry *e, const struct wal_hdr *wal_hdr) sqlite3_file *phys_incoming = e->wal_prev; char *name_incoming = e->wal_prev_fixed_name; - /* Write the new header of the incoming WAL. */ - rv = phys_incoming->pMethods->xWrite(phys_incoming, wal_hdr, - sizeof(struct wal_hdr), 0); - if (rv != SQLITE_OK) { - return rv; - } - - /* In-memory WAL swap. */ e->wal_cur = phys_incoming; e->wal_cur_fixed_name = name_incoming; e->wal_prev = phys_outgoing; e->wal_prev_fixed_name = name_outgoing; e->wal_cursor = 0; e->wal_prev_hdr = e->wal_cur_hdr; - e->wal_cur_hdr = *wal_hdr; + e->wal_cur_hdr = *hdr; - /* Move the moving name. */ + /* Best-effort: flip the moving name. + * + * If these syscalls fail, we can end up with no moving name, or a + * moving name that points to the wrong WAL. We don't use the moving + * name as the source of truth, so this can't lead to dqlite operating + * incorrectly. At worst, it's inconvenient for users who want to + * inspect their database with SQLite (readonly! when dqlite is not + * running!). */ rv = unlink(e->wal_moving_name); - if (rv != 0 && errno != ENOENT) { - return SQLITE_IOERR; - } + (void)rv; rv = link(name_incoming, e->wal_moving_name); - if (rv != 0) { - return SQLITE_IOERR; - } + (void)rv; - /* TODO do we need an fsync here? */ + /* Best-effort: invalidate the header of the outgoing physical WAL, so + * that it can't be mistakenly applied to the database. - /* Best-effort: invalidate the outgoing physical WAL so that nobody gets - * confused. */ + * This provides some protection against users manipulating the + * database with SQLite, or a bug in dqlite. But we don't rely on it + * for correctness. */ (void)phys_outgoing->pMethods->xWrite(phys_outgoing, &invalid_magic, sizeof(invalid_magic), 0); - return SQLITE_OK; + + /* TODO do we need an fsync here? */ } static int vfs2_wal_write_frame_hdr(struct entry *e, @@ -558,7 +555,9 @@ static int vfs2_wal_post_write(struct entry *e, sqlite3_int64 ofst) { uint32_t frame_size = VFS2_WAL_FRAME_HDR_SIZE + e->page_size; - if (amt == VFS2_WAL_FRAME_HDR_SIZE) { + if (amt == (int)sizeof(struct wal_hdr)) { + return SQLITE_OK; + } else if (amt == VFS2_WAL_FRAME_HDR_SIZE) { ofst -= (sqlite3_int64)sizeof(struct wal_hdr); assert(ofst % frame_size == 0); sqlite3_int64 frame_ofst = ofst / (sqlite3_int64)frame_size; @@ -592,21 +591,22 @@ static int vfs2_write(sqlite3_file *file, struct file *xfile = (struct file *)file; int rv; + /* A write to the WAL at offset 0 must be a header write, and indicates + * that SQLite has reset the WAL. We react by doing a WAL swap. + */ if ((xfile->flags & SQLITE_OPEN_WAL) && ofst == 0) { assert(amt == sizeof(struct wal_hdr)); const struct wal_hdr *hdr = buf; struct entry *e = xfile->entry; - rv = wal_swap(e, hdr); - if (rv != SQLITE_OK) { - return rv; - } - /* check that the WAL-index hdr makes sense and save it */ + wal_swap(e, hdr); + /* Save the WAL-index header so that we can roll back to it in + * the future. The assertions check that the header we're + * saving has been updated to match the new, empty WAL. */ struct wal_index_basic_hdr ihdr = get_full_hdr(e)->basic[0]; assert(ihdr.isInit == 1); assert(ihdr.mxFrame == 0); e->prev_txn_hdr = ihdr; sm_move(&e->wtx_sm, WTX_ACTIVE); - return SQLITE_OK; } sqlite3_file *orig = get_orig(xfile); From ffe06d70f0e1fecf80692dd8662135c3d6f617eb Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Wed, 24 Jul 2024 11:58:49 -0400 Subject: [PATCH 39/73] Fix handling of page size Per discussion during review, we're expecting the page size to be set by the pragma if it's not initialized by reading WAL-cur. So implement the pragma. Signed-off-by: Cole Miller --- src/vfs2.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/src/vfs2.c b/src/vfs2.c index a9c519a46..4bf37b975 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -665,17 +665,28 @@ static int vfs2_check_reserved_lock(sqlite3_file *file, int *out) return orig->pMethods->xCheckReservedLock(orig, out); } -static int interpret_pragma(char **args) +static int interpret_pragma(struct entry *e, char **args) { - char **e = &args[0]; + char **err = &args[0]; char *left = args[1]; PRE(left != NULL); char *right = args[2]; if (strcmp(left, "journal_mode") == 0 && right != NULL && strcasecmp(right, "wal") != 0) { - *e = sqlite3_mprintf("dqlite requires WAL mode"); + *err = sqlite3_mprintf("dqlite requires WAL mode"); return SQLITE_ERROR; + } else if (strcmp(left, "page_size") == 0 && right != NULL) { + char *end = right + strlen(right); + unsigned long val = strtoul(right, &end, 10); + if (right != end && *end == '\0' && is_valid_page_size(val)) { + if (e->page_size != 0 && val != e->page_size) { + *err = sqlite3_mprintf( + "page size cannot be changed"); + return SQLITE_ERROR; + } + e->page_size = (uint32_t)val; + } } return SQLITE_NOTFOUND; @@ -698,7 +709,7 @@ static int vfs2_file_control(sqlite3_file *file, int op, void *arg) e->wal_cursor += e->pending_txn_len; sm_move(&xfile->entry->wtx_sm, WTX_HIDDEN); } else if (op == SQLITE_FCNTL_PRAGMA) { - rv = interpret_pragma(arg); + rv = interpret_pragma(e, arg); if (rv != SQLITE_NOTFOUND) { return rv; } From 7f20503b068d0a03724fc35473ad4d3ac24a272c Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Tue, 23 Jul 2024 14:17:31 -0400 Subject: [PATCH 40/73] Factor out shared memory into a struct Grouping together the fields that relate to the shm into a discrete type makes it easier to read and reason about the code that touches these fields. In the process, this commit also changes the memory allocation strategy for the shm. Instead of having one allocation per shm region and another allocation to hold the array of pointers, we have one big allocation that contains all the existing regions, one after the other. This cuts down significantly on malloc and free calls. We also implement proper management of shm hash tables beyond the first. Signed-off-by: Cole Miller --- src/vfs2.c | 293 ++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 198 insertions(+), 95 deletions(-) diff --git a/src/vfs2.c b/src/vfs2.c index 4bf37b975..b7ad55d5d 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -197,19 +197,153 @@ struct wal_index_full_hdr { uint8_t unused[4]; }; -#define REGION0_PGNOS_LEN 4062 -#define REGION0_HT_LEN 8192 +#define SHM_SHORT_PGNOS_LEN 4062 +#define SHM_LONG_PGNOS_LEN 4096 +#define SHM_HT_LEN 8192 /** - * View of the zeroth shm region, which contains the WAL index header - * and the first hash table. + * View of a shm region. + * + * The zeroth region looks like this (not to scale): + * + * | header | page numbers | hash table | + * + * The first and later regions look like this (also not to scale): + * + * | page numbers | hash table | */ -struct vfs2_shm_region0 { - struct wal_index_full_hdr hdr; - uint32_t pgnos[REGION0_PGNOS_LEN]; - uint16_t ht[REGION0_HT_LEN]; +struct shm_region { + union { + /* region 0 */ + struct { + struct wal_index_full_hdr hdr; + uint32_t pgnos_short[SHM_SHORT_PGNOS_LEN]; + }; + /* region 1 and later */ + uint32_t pgnos_long[SHM_LONG_PGNOS_LEN]; + }; + uint16_t ht[SHM_HT_LEN]; }; +/** + * "Shared" memory implementation for storing the WAL-index. + * + * All the regions are stored in a single allocation, whose size is a multiple + * of VFS2_WAL_INDEX_REGION_SIZE. We realloc to create additional regions as + * they are demanded. + */ +struct shm { + struct shm_region *regions; + int num_regions; + /* Counts the net number of times that SQLite has mapped the zeroth + * region. As a sanity check, we assert that this value is zero before + * we free the shm. */ + unsigned refcount; +}; + +static_assert(sizeof(struct shm_region) == VFS2_WAL_INDEX_REGION_SIZE, + "shm regions have the expected size"); + +static void write_basic_hdr_cksums(struct wal_index_basic_hdr *bhdr) +{ + struct cksums sums = {}; + const uint8_t *p = (const uint8_t *)bhdr; + size_t len = offsetof(struct wal_index_basic_hdr, cksums); + update_cksums(p, len, &sums); + bhdr->cksums = sums; +} + +/** + * Perform initial setup of the WAL-index. + * + * This allocates the zeroth region and fills in the WAL-index header + * based on the provided header of WAL-cur. + */ +static int shm_init(struct shm *shm, struct wal_hdr whdr) +{ + shm->regions = sqlite3_malloc64(VFS2_WAL_INDEX_REGION_SIZE); + if (shm->regions == NULL) { + return SQLITE_NOMEM; + } + shm->regions[0] = (struct shm_region){}; + shm->num_regions = 1; + shm->refcount = 0; + + struct shm_region *r0 = &shm->regions[0]; + struct wal_index_full_hdr *ihdr = &r0->hdr; + ihdr->basic[0].iVersion = 3007000; + ihdr->basic[0].isInit = 1; + ihdr->basic[0].bigEndCksum = is_bigendian(); + ihdr->basic[0].szPage = (uint16_t)ByteGetBe32(whdr.page_size); + write_basic_hdr_cksums(&ihdr->basic[0]); + ihdr->basic[1] = ihdr->basic[0]; + ihdr->marks[0] = 0; + ihdr->marks[1] = READ_MARK_UNUSED; + ihdr->marks[2] = READ_MARK_UNUSED; + ihdr->marks[3] = READ_MARK_UNUSED; + ihdr->marks[4] = READ_MARK_UNUSED; + return SQLITE_OK; +} + +/** + * Clear out all data in the WAL-index after a WAL swap, and re-initialize the + * header. + */ +static void shm_restart(struct shm *shm, struct wal_hdr whdr) +{ + for (int i = 0; i < shm->num_regions; i++) { + shm->regions[i] = (struct shm_region){}; + } + + /* TODO(cole) eliminate redundancy with shm_init */ + struct shm_region *r0 = &shm->regions[0]; + struct wal_index_full_hdr *ihdr = &r0->hdr; + ihdr->basic[0].iVersion = 3007000; + ihdr->basic[0].isInit = 1; + ihdr->basic[0].bigEndCksum = is_bigendian(); + ihdr->basic[0].szPage = (uint16_t)ByteGetBe32(whdr.page_size); + write_basic_hdr_cksums(&ihdr->basic[0]); + ihdr->basic[1] = ihdr->basic[0]; + ihdr->marks[0] = 0; + ihdr->marks[1] = READ_MARK_UNUSED; + ihdr->marks[2] = READ_MARK_UNUSED; + ihdr->marks[3] = READ_MARK_UNUSED; + ihdr->marks[4] = READ_MARK_UNUSED; +} + +/** + * Add the page number for a frame to the appropriate page number array + * in the WAL-index. + * + * This allocates a new shm region if necessary, and hence can fail with + * SQLITE_NOMEM. + */ +static int shm_add_pgno(struct shm *shm, uint32_t frame, uint32_t pgno) +{ + PRE(shm->num_regions > 0); + if (frame < SHM_SHORT_PGNOS_LEN) { + struct shm_region *r0 = &shm->regions[0]; + r0->pgnos_short[frame] = pgno; + return SQLITE_OK; + } + + uint32_t regno = (frame - SHM_SHORT_PGNOS_LEN) / SHM_LONG_PGNOS_LEN; + uint32_t index = (frame - SHM_SHORT_PGNOS_LEN) % SHM_LONG_PGNOS_LEN; + PRE(regno <= (uint32_t)shm->num_regions + 1); + if (regno == (uint32_t)shm->num_regions + 1) { + sqlite3_uint64 sz = + (sqlite3_uint64)regno * VFS2_WAL_INDEX_REGION_SIZE; + struct shm_region *p = sqlite3_realloc64(shm->regions, sz); + if (p == NULL) { + return SQLITE_NOMEM; + } + shm->regions = p; + shm->num_regions++; + } + shm->regions[regno].pgnos_long[index] = pgno; + return SQLITE_OK; +} + struct entry { /* Next/prev entries for this VFS. */ queue link; @@ -251,10 +385,8 @@ struct entry { /* For ACTIVE, HIDDEN, POLLED: the header that shows the pending txn */ struct wal_index_basic_hdr pending_txn_hdr; - /* shm implementation; holds the WAL index */ - void **shm_regions; - int shm_regions_len; - unsigned shm_refcount; + /* Shared memory implementation: holds the WAL-index. */ + struct shm shm; /* Zero for unlocked, positive for read-locked, UINT_MAX for * write-locked */ unsigned shm_locks[SQLITE_SHM_NLOCK]; @@ -332,9 +464,9 @@ static struct vfs2_salts make_salts(uint32_t salt1, uint32_t salt2) static struct wal_index_full_hdr *get_full_hdr(struct entry *e) { - PRE(e->shm_regions_len > 0); - PRE(e->shm_regions != NULL); - return e->shm_regions[0]; + PRE(e->shm.num_regions > 0); + PRE(e->shm.regions != NULL); + return &e->shm.regions[0].hdr; } static bool no_pending_txn(const struct entry *e) @@ -412,13 +544,8 @@ static void maybe_close_entry(struct entry *e) free_pending_txn(e); - assert(e->shm_refcount == 0); - for (int i = 0; i < e->shm_regions_len; i++) { - void *region = e->shm_regions[i]; - assert(region != NULL); - sqlite3_free(region); - } - sqlite3_free(e->shm_regions); + assert(e->shm.refcount == 0); + sqlite3_free(e->shm.regions); pthread_rwlock_wrlock(&e->common->rwlock); queue_remove(&e->link); @@ -762,45 +889,39 @@ static int vfs2_shm_map(sqlite3_file *file, int extend, void volatile **out) { + PRE(regsz == VFS2_WAL_INDEX_REGION_SIZE); struct file *xfile = (struct file *)file; + PRE(xfile->flags & SQLITE_OPEN_MAIN_DB); struct entry *e = xfile->entry; - void *region; + struct shm *shm = &e->shm; + struct shm_region *region; int rv; - if (e->shm_regions != NULL && regno < e->shm_regions_len) { - region = e->shm_regions[regno]; - assert(region != NULL); + if (shm->regions != NULL && regno < shm->num_regions) { + region = &shm->regions[regno]; } else if (extend != 0) { - assert(regno == e->shm_regions_len); - region = sqlite3_malloc(regsz); - if (region == NULL) { + assert(regno == shm->num_regions); + sqlite3_uint64 sz = + ((sqlite3_uint64)regno + 1) * (sqlite3_uint64)regsz; + struct shm_region *p = sqlite3_realloc64(shm->regions, sz); + if (p == NULL) { rv = SQLITE_NOMEM; goto err; } + shm->regions = p; + region = &shm->regions[regno]; memset(region, 0, (size_t)regsz); - /* FIXME reallocating every time seems bad */ - sqlite3_uint64 z = (sqlite3_uint64)sizeof(*e->shm_regions) * - (sqlite3_uint64)(e->shm_regions_len + 1); - void **regions = sqlite3_realloc64(e->shm_regions, z); - if (regions == NULL) { - rv = SQLITE_NOMEM; - goto err_after_region_malloc; - } - e->shm_regions = regions; - e->shm_regions[regno] = region; - e->shm_regions_len++; + shm->num_regions++; } else { region = NULL; } *out = region; if (regno == 0 && region != NULL) { - e->shm_refcount++; + e->shm.refcount++; } return SQLITE_OK; -err_after_region_malloc: - sqlite3_free(region); err: assert(rv != SQLITE_OK); *out = NULL; @@ -907,7 +1028,7 @@ static int vfs2_shm_unmap(sqlite3_file *file, int delete) (void)delete; struct file *xfile = (struct file *)file; struct entry *e = xfile->entry; - e->shm_refcount--; + e->shm.refcount--; return SQLITE_OK; } @@ -990,39 +1111,37 @@ static int try_read_wal_hdr(sqlite3_file *wal, return SQLITE_OK; } -static void write_basic_hdr_cksums(struct wal_index_basic_hdr *bhdr) -{ - struct cksums sums = {}; - const uint8_t *p = (const uint8_t *)bhdr; - size_t len = offsetof(struct wal_index_basic_hdr, cksums); - update_cksums(p, len, &sums); - bhdr->cksums = sums; -} - -static struct wal_index_full_hdr initial_full_hdr(struct wal_hdr whdr) +static void pgno_ht_insert(uint16_t *ht, uint16_t fx, uint32_t pgno) { - struct wal_index_full_hdr ihdr = {}; - ihdr.basic[0].iVersion = 3007000; - ihdr.basic[0].isInit = 1; - ihdr.basic[0].bigEndCksum = is_bigendian(); - ihdr.basic[0].szPage = (uint16_t)ByteGetBe32(whdr.page_size); - write_basic_hdr_cksums(&ihdr.basic[0]); - ihdr.basic[1] = ihdr.basic[0]; - ihdr.marks[0] = 0; - ihdr.marks[1] = READ_MARK_UNUSED; - ihdr.marks[2] = READ_MARK_UNUSED; - ihdr.marks[3] = READ_MARK_UNUSED; - ihdr.marks[4] = READ_MARK_UNUSED; - return ihdr; + uint32_t hash = pgno * 383; + while (ht[hash % SHM_HT_LEN] != 0) { + hash++; + } + /* SQLite uses 1-based frame indices in this context, reserving + * 0 for a sentinel value. */ + ht[hash % SHM_HT_LEN] = fx + 1; } -static void pgno_ht_insert(uint16_t *ht, size_t len, uint16_t fx, uint32_t pgno) +/** + * Grab the page number for the given frame from the appropriate array + * in the WAL-index and add it to the corresponding hash table. + */ +static void shm_update_ht(struct shm *shm, uint32_t frame) { - uint32_t hash = pgno * 383; - while (ht[hash % len] != 0) { - hash++; + PRE(shm->num_regions > 0); + if (frame < SHM_SHORT_PGNOS_LEN) { + struct shm_region *r0 = &shm->regions[0]; + uint32_t pgno = r0->pgnos_short[frame]; + pgno_ht_insert(r0->ht, (uint16_t)frame, pgno); + return; } - ht[hash % len] = fx; + + uint32_t regno = (frame - SHM_SHORT_PGNOS_LEN) / SHM_LONG_PGNOS_LEN; + uint32_t index = (frame - SHM_SHORT_PGNOS_LEN) % SHM_LONG_PGNOS_LEN; + PRE(regno <= (uint32_t)shm->num_regions); + struct shm_region *region = &shm->regions[regno]; + uint32_t pgno = region->pgnos_long[index]; + pgno_ht_insert(region->ht, (uint16_t)index, pgno); } static void set_mx_frame(struct entry *e, @@ -1042,16 +1161,12 @@ static void set_mx_frame(struct entry *e, ihdr->basic[1] = ihdr->basic[0]; POST(full_hdr_valid(ihdr)); - struct vfs2_shm_region0 *r0 = e->shm_regions[0]; - PRE(mx <= REGION0_PGNOS_LEN); + struct shm *shm = &e->shm; for (uint32_t i = old_mx; i < mx; i++) { /* The page numbers array was already updated during the call * to add_uncommitted, so we just need to update the hash array. */ - /* TODO(cole) Support hash tables beyond the first. */ - PRE(i < REGION0_PGNOS_LEN); - pgno_ht_insert(r0->ht, REGION0_HT_LEN, (uint16_t)i, - r0->pgnos[i]); + shm_update_ht(shm, i); } } @@ -1196,17 +1311,7 @@ static int open_entry(struct common *common, const char *name, struct entry *e) rv = link(e->wal_cur_fixed_name, e->wal_moving_name); (void)rv; - e->shm_regions = sqlite3_malloc(sizeof(void *[1])); - if (e->shm_regions == NULL) { - return SQLITE_NOMEM; - } - e->shm_regions[0] = sqlite3_malloc(VFS2_WAL_INDEX_REGION_SIZE); - if (e->shm_regions[0] == NULL) { - return SQLITE_NOMEM; - } - memset(e->shm_regions[0], 0, VFS2_WAL_INDEX_REGION_SIZE); - e->shm_regions_len = 1; - + rv = shm_init(&e->shm); *get_full_hdr(e) = initial_full_hdr(hdr_cur); e->wal_cursor = wal_cursor_from_size(e->page_size, size_cur); @@ -1853,10 +1958,7 @@ int vfs2_add_uncommitted(sqlite3_file *file, * "early" like this is harmless and saves us from having to stash the * page numbers somewhere else in memory in between add_uncommitted and * apply, or (worse) read them back from WAL-cur. */ - struct vfs2_shm_region0 *r0 = e->shm_regions[0]; - /* TODO(cole) Support hash tables beyond the first. */ - PRE(e->wal_cursor < REGION0_PGNOS_LEN); - r0->pgnos[e->wal_cursor] = (uint32_t)frames[0].page_number; + shm_add_pgno(&e->shm, e->wal_cursor, (uint32_t)frames[0].page_number); uint32_t commit = len == 1 ? db_size : 0; struct wal_frame_hdr fhdr = txn_frame_hdr(e, sums, &frames[0], commit); @@ -1866,8 +1968,9 @@ int vfs2_add_uncommitted(sqlite3_file *file, } for (unsigned i = 1; i < len; i++) { - PRE(e->wal_cursor < REGION0_PGNOS_LEN); - r0->pgnos[e->wal_cursor] = (uint32_t)frames[i].page_number; + PRE(e->wal_cursor < SHM_SHORT_PGNOS_LEN); + shm_add_pgno(&e->shm, e->wal_cursor, + (uint32_t)frames[i].page_number); sums.cksum1 = ByteGetBe32(fhdr.cksum1); sums.cksum2 = ByteGetBe32(fhdr.cksum2); commit = i == len - 1 ? db_size : 0; From 549bff9532f08544b9c03ee76701c487f621e185 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Wed, 24 Jul 2024 18:45:08 -0400 Subject: [PATCH 41/73] Walk WAL-cur at startup to set wal_cursor This was a big missing piece for correctness. Right now the externally-specified "stopping point" from outside (meant to be provided by raft based on the last shallow log entry) is not hooked up to anything, but the logic is there. Signed-off-by: Cole Miller --- src/vfs2.c | 146 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 121 insertions(+), 25 deletions(-) diff --git a/src/vfs2.c b/src/vfs2.c index b7ad55d5d..756c691e8 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -1182,18 +1182,6 @@ static void restart_full_hdr(struct wal_index_full_hdr *ihdr, ihdr->nBackfillAttempted = 0; } -static uint32_t wal_cursor_from_size(uint32_t page_size, sqlite3_int64 size) -{ - sqlite3_int64 whdr_size = (sqlite3_int64)sizeof(struct wal_hdr); - if (size < whdr_size) { - return 0; - } - sqlite3_int64 x = - (size - whdr_size) / ((sqlite3_int64)sizeof(struct wal_frame_hdr) + - (sqlite3_int64)page_size); - return (uint32_t)x; -} - static sqlite3_int64 wal_offset_from_cursor(uint32_t page_size, uint32_t cursor) { return (sqlite3_int64)sizeof(struct wal_hdr) + @@ -1202,6 +1190,100 @@ static sqlite3_int64 wal_offset_from_cursor(uint32_t page_size, uint32_t cursor) (sqlite3_int64)page_size); } +/** + * Read the given WAL file from beginning to end, initializing the WAL-index in + * the process. + * + * The process stops when we reach a frame whose checksums are invalid, or + * after the last valid commit frame, or on the first unsuccessful read, or at + * the end of the transaction designated by `stop`, if that argument is + * non-NULL. + * + * On return, the WAL-index header is initialized with mxFrame = 0 and other + * data matching the given WAL, and the page numbers for all frames up to the + * stopping point are recorded in the WAL-index; the hash tables are not + * initialized. + * + * Returns the stopping point in units of frames, which becomes the wal_cursor. + */ +static int walk_wal(sqlite3_file *wal, + sqlite3_int64 size, + struct wal_hdr hdr, + const struct vfs2_wal_slice *stop, + uint32_t *wal_cursor, + struct shm *shm) +{ + uint32_t page_size = ByteGetBe32(hdr.page_size); + struct cksums sums = { ByteGetBe32(hdr.cksum1), + ByteGetBe32(hdr.cksum2) }; + int rv; + + /* Check whether we have been provided a stopping point that corresponds + * to a transaction in the current WAL. (It's possible that the stopping + * point corresponds to a transaction that's in WAL-prev instead.) */ + bool have_stop = stop != NULL && salts_equal(stop->salts, hdr.salts); + + uint8_t *page_buf = sqlite3_malloc64(page_size); + if (page_buf == NULL) { + return SQLITE_NOMEM; + } + + sqlite3_int64 off = sizeof(struct wal_hdr); + while (off < size) { + if (have_stop && *wal_cursor == stop->start + stop->len) { + break; + } + + struct wal_frame_hdr fhdr; + rv = wal->pMethods->xRead(wal, &fhdr, sizeof(fhdr), off); + if (rv != SQLITE_OK) { + goto err; + } + if (!salts_equal(fhdr.salts, hdr.salts)) { + break; + } + off += (sqlite3_int64)sizeof(fhdr); + const uint8_t *p = (const uint8_t *)&fhdr; + size_t len = offsetof(struct wal_frame_hdr, salts); + update_cksums(p, len, &sums); + + rv = wal->pMethods->xRead(wal, page_buf, (int)page_size, off); + if (rv != SQLITE_OK) { + goto err; + } + off += page_size; + update_cksums(page_buf, page_size, &sums); + struct cksums frame_sums = { ByteGetBe32(fhdr.cksum1), + ByteGetBe32(fhdr.cksum2) }; + if (!cksums_equal(frame_sums, sums)) { + break; + } + + rv = shm_add_pgno(shm, *wal_cursor, + ByteGetBe32(fhdr.page_number)); + if (rv != SQLITE_OK) { + goto err; + } + *wal_cursor += 1; + + /* We expect the last valid frame to have the commit marker. + * That's because if the last transaction wasn't fully written + * to the WAL, we should have been passed a `stop` argument + * corresponding to some preceding transaction. */ + if (off >= size) { + assert(ByteGetBe32(fhdr.commit) > 0); + } + } + + sqlite3_free(page_buf); + return SQLITE_OK; + +err: + sqlite3_free(page_buf); + POST(rv != SQLITE_OK); + return rv; +} + static int open_entry(struct common *common, const char *name, struct entry *e) { sqlite3_vfs *v = common->orig; @@ -1311,23 +1393,37 @@ static int open_entry(struct common *common, const char *name, struct entry *e) rv = link(e->wal_cur_fixed_name, e->wal_moving_name); (void)rv; - rv = shm_init(&e->shm); - *get_full_hdr(e) = initial_full_hdr(hdr_cur); - - e->wal_cursor = wal_cursor_from_size(e->page_size, size_cur); - - int next = WTX_EMPTY; - if (size_cur >= - wal_offset_from_cursor(0 /* this doesn't matter */, 0)) { - /* TODO verify the header here */ + /* If WAL-cur contains a valid header, walk it to initialize wal_cursor + * and the WAL-index. Also, set the page size. + * + * If WAL-cur is empty, then we are starting up for the first time, and + * we don't initialize the WAL-index. It will be initialized either by + * SQLite running recovery or by add_uncommitted, whichever happens + * first. (In general, we want to avoid letting SQLite run recovery, but + * in this case it's harmless, since if the WAL is empty then it can't + * read any uncommitted data.) + */ + if (size_cur > 0) { e->page_size = ByteGetBe32(hdr_cur.page_size); - next = WTX_BASE; + rv = shm_init(&e->shm, hdr_cur); + if (rv != SQLITE_OK) { + return rv; + } + rv = walk_wal(e->wal_cur, size_cur, hdr_cur, NULL, + &e->wal_cursor, &e->shm); + if (rv != SQLITE_OK) { + return rv; + } } - if (size_cur >= wal_offset_from_cursor(e->page_size, 1)) { + /* If we found at least one valid transaction in the WAL, take the write + * lock. */ + if (e->wal_cursor > 0) { e->shm_locks[WAL_WRITE_LOCK] = VFS2_EXCLUSIVE; - next = WTX_FOLLOWING; } - sm_move(&e->wtx_sm, next); + + sm_move(&e->wtx_sm, e->wal_cursor > 0 ? WTX_FOLLOWING + : size_cur > 0 ? WTX_BASE + : WTX_EMPTY); return SQLITE_OK; } From 43ec47c9721f6cc87cc6bfd96fec644cec082d53 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Wed, 24 Jul 2024 18:45:23 -0400 Subject: [PATCH 42/73] Rewrite the logic for WAL swap in add_uncommitted First, we now need to initialize the shm here in some cases. Second, make the logic for determining whether to do a WAL swap (hopefully) clearer. Signed-off-by: Cole Miller --- src/vfs2.c | 91 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 59 insertions(+), 32 deletions(-) diff --git a/src/vfs2.c b/src/vfs2.c index 756c691e8..f82cbaa96 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -1170,18 +1170,6 @@ static void set_mx_frame(struct entry *e, } } -static void restart_full_hdr(struct wal_index_full_hdr *ihdr, - struct wal_hdr new_whdr) -{ - /* cf. walRestartHdr */ - ihdr->basic[0].mxFrame = 0; - ihdr->basic[0].salts = new_whdr.salts; - write_basic_hdr_cksums(&ihdr->basic[0]); - ihdr->basic[1] = ihdr->basic[0]; - ihdr->nBackfill = 0; - ihdr->nBackfillAttempted = 0; -} - static sqlite3_int64 wal_offset_from_cursor(uint32_t page_size, uint32_t cursor) { return (sqlite3_int64)sizeof(struct wal_hdr) + @@ -1906,9 +1894,19 @@ static struct wal_hdr make_wal_hdr(uint32_t magic, update_cksums(p, len, &sums); BytePutBe32(sums.cksum1, ret.cksum1); BytePutBe32(sums.cksum2, ret.cksum2); + POST(wal_hdr_is_valid(&ret)); return ret; } +static struct wal_hdr initial_wal_hdr(uint32_t page_size) +{ + struct vfs2_salts salts; + sqlite3_randomness(sizeof(salts.salt1), (void *)&salts.salt1); + sqlite3_randomness(sizeof(salts.salt2), (void *)&salts.salt2); + return make_wal_hdr(is_bigendian() ? BE_MAGIC : LE_MAGIC, page_size, 0, + salts); +} + /** * Derive the next header that should be written to start a new WAL. * @@ -1959,16 +1957,22 @@ int vfs2_add_uncommitted(sqlite3_file *file, unsigned len, struct vfs2_wal_slice *out) { - PRE(len > 0); - PRE(is_valid_page_size(page_size)); struct file *xfile = (struct file *)file; PRE(xfile->flags & SQLITE_OPEN_MAIN_DB); struct entry *e = xfile->entry; int rv; - /* We require the page size to have been initialized by this point, - * either by reading the WAL when opening the entry or by `PRAGMA - * page_size`. */ + /* TODO(cole) roll back wal_cursor and release the write lock if one of + * the writes fails. */ + + PRE(len > 0); + + /* We require the page size to have been set by the pragma before this + * point. */ + PRE(is_valid_page_size(page_size)); + + /* Sanity check that the leader isn't sending us pages of the wrong + * size. */ PRE(page_size == e->page_size); /* The write lock is always held if there is at least one @@ -1984,27 +1988,50 @@ int vfs2_add_uncommitted(sqlite3_file *file, * (wal_cursor). */ e->shm_locks[WAL_WRITE_LOCK] = VFS2_EXCLUSIVE; - uint32_t start = e->wal_cursor; - struct wal_index_full_hdr *ihdr = get_full_hdr(e); - uint32_t mx = ihdr->basic[0].mxFrame; - if (mx > 0 && ihdr->nBackfill == mx) { - struct wal_hdr new_whdr = next_wal_hdr(e); - restart_full_hdr(ihdr, new_whdr); - rv = wal_swap(e, &new_whdr); - if (rv != SQLITE_OK) { - return 1; + /* This paragraph accomplishes a few related things: initializing the + * shm if necessary, figuring out where to place the frames of the new + * transaction in the WAL, and, if necessary, writing a new WAL header + * and doing work related to the WAL swap. + * + * The shm will need to be initialized if this wasn't already done by + * either open_entry or SQLite itself. This happens when we open the + * database for the first time (so WAL-cur is empty) and then run + * add_uncommitted before anything else. + * + * The new transaction will be placed at the offset given by + * `e->wal_cursor`, after possibly resetting this to zero (when the WAL + * has been fully backfilled). + * + * If the new transaction is to be placed at offset zero, we also write + * a new WAL header, reset the shared memory (unless it was just + * initialized for the first time!), and swap the WALs. + */ + PRE(ERGO(e->shm.num_regions == 0, e->wal_cursor == 0)); + struct wal_index_full_hdr *ihdr = + e->shm.num_regions > 0 ? &e->shm.regions[0].hdr : NULL; + uint32_t mx = ihdr != NULL ? ihdr->basic[0].mxFrame : 0; + uint32_t backfill = ihdr != NULL ? ihdr->nBackfill : 0; + if (e->wal_cursor == mx && mx == backfill) { + struct wal_hdr new_wal_hdr; + if (ihdr != NULL) { + new_wal_hdr = next_wal_hdr(e); + shm_restart(&e->shm, new_wal_hdr); + } else { + new_wal_hdr = initial_wal_hdr(e->page_size); + rv = shm_init(&e->shm, new_wal_hdr); + if (rv != SQLITE_OK) { + return 1; + } } - } else if (start == 0) { - sqlite3_file *phys = e->wal_cur; - struct wal_hdr new_whdr = next_wal_hdr(e); - rv = phys->pMethods->xWrite(phys, &new_whdr, sizeof(new_whdr), - 0); - e->wal_cur_hdr = new_whdr; + wal_swap(e, &new_wal_hdr); + rv = e->wal_cur->pMethods->xWrite(e->wal_cur, &new_wal_hdr, + sizeof(new_wal_hdr), 0); if (rv != SQLITE_OK) { return 1; } } + uint32_t start = e->wal_cursor; struct cksums sums; uint32_t db_size; if (start > 0) { From 70ef37e277bd5a39606079a82ab1e760d09deeaa Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Wed, 24 Jul 2024 19:06:27 -0400 Subject: [PATCH 43/73] Add comments to vfs2 tests explaining what's going on Signed-off-by: Cole Miller --- test/unit/test_vfs2.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/test/unit/test_vfs2.c b/test/unit/test_vfs2.c index d1fdf1c70..d7e732d00 100644 --- a/test/unit/test_vfs2.c +++ b/test/unit/test_vfs2.c @@ -233,7 +233,9 @@ TEST(vfs2, basic, set_up, tear_down, 0, NULL) #define WAL_SIZE_FROM_FRAMES(n) (VFS2_WAL_HDR_SIZE + (24 + PAGE_SIZE) * (n)) /** - * When only one WAL is nonempty at startup, that WAL becomes WAL-cur. + * When one WAL has a valid header and the other is empty, + * the nonempty one becomes WAL-cur. Then, the first write triggers a WAL + * swap, so the frames go to the *other* WAL. */ TEST(vfs2, startup_one_nonempty, set_up, tear_down, 0, NULL) { @@ -245,6 +247,7 @@ TEST(vfs2, startup_one_nonempty, set_up, tear_down, 0, NULL) assert_wal_sizes(buf, 0, 0); + /* WAL2 has a header. */ uint8_t wal2_hdronly[WAL_SIZE_FROM_FRAMES(0)] = { 0 }; vfs2_ut_make_wal_hdr(wal2_hdronly, PAGE_SIZE, 0, 17, 103); prepare_wals(buf, NULL, 0, wal2_hdronly, sizeof(wal2_hdronly)); @@ -252,6 +255,7 @@ TEST(vfs2, startup_one_nonempty, set_up, tear_down, 0, NULL) OK(sqlite3_exec(db, "CREATE TABLE foo (n INTEGER)", NULL, NULL, NULL)); OK(sqlite3_close(db)); + /* WAL1 ends up with the frames. */ assert_wal_sizes(buf, WAL_SIZE_FROM_FRAMES(2), WAL_SIZE_FROM_FRAMES(0)); return MUNIT_OK; @@ -259,7 +263,8 @@ TEST(vfs2, startup_one_nonempty, set_up, tear_down, 0, NULL) /** * When both WALs are nonempty at startup, the one with the higher salt1 - * value becomes WAL-cur. + * value becomes WAL-cur. Then, the first write triggers a WAL swap, so + * the frames go to the *other* WAL. */ TEST(vfs2, startup_both_nonempty, set_up, tear_down, 0, NULL) { @@ -271,6 +276,7 @@ TEST(vfs2, startup_both_nonempty, set_up, tear_down, 0, NULL) snprintf(buf, PATH_MAX, "%s/%s", node->dir, "test.db"); assert_wal_sizes(buf, 0, 0); + /* WAL1 has the higher salt1. */ uint8_t wal1_hdronly[WAL_SIZE_FROM_FRAMES(0)] = { 0 }; vfs2_ut_make_wal_hdr(wal1_hdronly, PAGE_SIZE, 0, 18, 103); uint8_t wal2_hdronly[WAL_SIZE_FROM_FRAMES(0)] = { 0 }; @@ -290,6 +296,7 @@ TEST(vfs2, startup_both_nonempty, set_up, tear_down, 0, NULL) rv = sqlite3_close(db); munit_assert_int(rv, ==, SQLITE_OK); + /* WAL2 ends up with the frames. */ assert_wal_sizes(buf, WAL_SIZE_FROM_FRAMES(0), WAL_SIZE_FROM_FRAMES(2)); return MUNIT_OK; From ad7d8641210cf88d39012b1d8794d29525c17670 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Wed, 24 Jul 2024 19:06:47 -0400 Subject: [PATCH 44/73] Clean up startup_both_nonempty test Signed-off-by: Cole Miller --- test/unit/test_vfs2.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/test/unit/test_vfs2.c b/test/unit/test_vfs2.c index d7e732d00..38fa09f5a 100644 --- a/test/unit/test_vfs2.c +++ b/test/unit/test_vfs2.c @@ -271,7 +271,6 @@ TEST(vfs2, startup_both_nonempty, set_up, tear_down, 0, NULL) struct fixture *f = data; struct node *node = &f->nodes[0]; char buf[PATH_MAX]; - int rv; snprintf(buf, PATH_MAX, "%s/%s", node->dir, "test.db"); assert_wal_sizes(buf, 0, 0); @@ -284,17 +283,8 @@ TEST(vfs2, startup_both_nonempty, set_up, tear_down, 0, NULL) prepare_wals(buf, wal1_hdronly, sizeof(wal1_hdronly), wal2_hdronly, sizeof(wal2_hdronly)); sqlite3 *db = node_open_db(node, "test.db"); - rv = sqlite3_exec(db, - "PRAGMA page_size=" PAGE_SIZE_STR - ";" - "PRAGMA journal_mode=WAL;" - "PRAGMA wal_autocheckpoint=0", - NULL, NULL, NULL); - munit_assert_int(rv, ==, SQLITE_OK); - rv = sqlite3_exec(db, "CREATE TABLE foo (n INTEGER)", NULL, NULL, NULL); - munit_assert_int(rv, ==, SQLITE_OK); - rv = sqlite3_close(db); - munit_assert_int(rv, ==, SQLITE_OK); + OK(sqlite3_exec(db, "CREATE TABLE foo (n INTEGER)", NULL, NULL, NULL)); + OK(sqlite3_close(db)); /* WAL2 ends up with the frames. */ assert_wal_sizes(buf, WAL_SIZE_FROM_FRAMES(0), WAL_SIZE_FROM_FRAMES(2)); From f1adcb5ea4dd9a34386002ff6abe4ad6c423fcb9 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Wed, 24 Jul 2024 20:47:54 -0400 Subject: [PATCH 45/73] Add some ancillary state machines for observability We'd like to make diagrams showing what's going on inside vfs2 in more detail and how it relates to the "big" entry state machine. This commit enables that by introducing new state machines that model four bits of state: the write lock, the recovery lock, the checkpoint lock, and the shared memory. The invariants are vacuous, and even the allowed transitions aren't very constraining, but this is quite handy for understanding what's going on e.g. during tests. Signed-off-by: Cole Miller --- src/vfs2.c | 162 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 161 insertions(+), 1 deletion(-) diff --git a/src/vfs2.c b/src/vfs2.c index f82cbaa96..f88ee62b2 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -44,6 +44,8 @@ static const uint32_t invalid_magic = 0x17171717; +/* clang-format off */ + enum { /* Entry is not yet open. */ WTX_CLOSED, @@ -110,6 +112,125 @@ static const struct sm_conf wtx_states[SM_STATES_MAX] = { }, }; +/** + * State machine that just tracks which WAL is WAL-cur, for observability. + */ +enum { + WAL_1, + WAL_2, + WAL_NR +}; + +static const struct sm_conf wal_states[WAL_NR] = { + [WAL_1] = { + .name = "wal1", + .allowed = BITS(WAL_2), + .flags = SM_INITIAL|SM_FINAL + }, + [WAL_2] = { + .name = "wal2", + .allowed = BITS(WAL_1), + .flags = SM_INITIAL|SM_FINAL + } + +}; + +/** + * State machine that just tracks the state of the WAL write lock, for + * observability. + */ +enum { + WLK_UNLOCKED, + WLK_LOCKED, + WLK_NR +}; + +static const struct sm_conf wlk_states[WLK_NR] = { + [WLK_UNLOCKED] = { + .name = "unlocked", + .allowed = BITS(WLK_LOCKED), + .flags = SM_INITIAL|SM_FINAL + }, + [WLK_LOCKED] = { + .name = "locked", + .allowed = BITS(WLK_UNLOCKED), + .flags = SM_INITIAL|SM_FINAL + } +}; + +/** + * State machine that tracks who has been working on the shm, for + * observability. + * + * Other than observability, the point of this state machine is to check that + * SQLite does not run recovery on the shm after vfs2 has modified it. + */ +enum { + /* Nothing in the shm yet. */ + SHM_EMPTY, + /* SQLite is holding the recovery lock, building up the shm. */ + SHM_RECOVERING, + /* SQLite has finished building up the shm, vfs2 has not touched it. */ + SHM_RECOVERED, + /* vfs2 made the last modification to the shm. */ + SHM_MANAGED, + SHM_NR +}; + +static const struct sm_conf shm_states[SHM_NR] = { + [SHM_EMPTY] = { + .name = "empty", + .allowed = BITS(SHM_RECOVERING)|BITS(SHM_MANAGED), + .flags = SM_INITIAL + }, + [SHM_RECOVERING] = { + .name = "recovering", + .allowed = BITS(SHM_RECOVERED), + }, + [SHM_RECOVERED] = { + .name = "recovered", + .allowed = BITS(SHM_MANAGED) + }, + [SHM_MANAGED] = { + .name = "managed", + .allowed = BITS(SHM_MANAGED) + }, +}; + +/** + * State machine that just tracks whether a checkpoint is in progress or not, + * for observability. + */ +enum { + CKPT_QUIESCENT, + CKPT_CHECKPOINTING, + CKPT_NR +}; + +static const struct sm_conf ckpt_states[CKPT_NR] = { + [CKPT_QUIESCENT] = { + .name = "quiescent", + .allowed = BITS(CKPT_CHECKPOINTING), + .flags = SM_INITIAL + }, + [CKPT_CHECKPOINTING] = { + .name = "checkpointing", + .allowed = BITS(CKPT_QUIESCENT) + } +}; + +/* clang-format on */ + +/** + * A dummy invariant, for when you just don't care. + */ +static bool no_invariant(const struct sm *sm, int prev) +{ + (void)sm; + (void)prev; + return true; +} + /** * Userdata owned by the VFS. */ @@ -239,6 +360,7 @@ struct shm { * region. As a sanity check, we assert that this value is zero before * we free the shm. */ unsigned refcount; + struct sm sm; }; static_assert(sizeof(struct shm_region) == VFS2_WAL_INDEX_REGION_SIZE, @@ -282,6 +404,7 @@ static int shm_init(struct shm *shm, struct wal_hdr whdr) ihdr->marks[2] = READ_MARK_UNUSED; ihdr->marks[3] = READ_MARK_UNUSED; ihdr->marks[4] = READ_MARK_UNUSED; + sm_move(&shm->sm, SHM_MANAGED); return SQLITE_OK; } @@ -309,6 +432,7 @@ static void shm_restart(struct shm *shm, struct wal_hdr whdr) ihdr->marks[2] = READ_MARK_UNUSED; ihdr->marks[3] = READ_MARK_UNUSED; ihdr->marks[4] = READ_MARK_UNUSED; + sm_move(&shm->sm, SHM_MANAGED); } /** @@ -341,6 +465,7 @@ static int shm_add_pgno(struct shm *shm, uint32_t frame, uint32_t pgno) shm->num_regions++; } shm->regions[regno].pgnos_long[index] = pgno; + sm_move(&shm->sm, SHM_MANAGED); return SQLITE_OK; } @@ -406,6 +531,9 @@ struct entry { struct wal_hdr wal_prev_hdr; struct sm wtx_sm; + struct sm wal_sm; + struct sm wlk_sm; + struct sm ckpt_sm; /* VFS-wide data (immutable) */ struct common *common; }; @@ -606,6 +734,8 @@ static void wal_swap(struct entry *e, const struct wal_hdr *hdr) e->wal_prev_hdr = e->wal_cur_hdr; e->wal_cur_hdr = *hdr; + sm_move(&e->wal_sm, !sm_state(&e->wal_sm)); + /* Best-effort: flip the moving name. * * If these syscalls fail, we can end up with no moving name, or a @@ -969,12 +1099,20 @@ static int vfs2_shm_lock(sqlite3_file *file, int ofst, int n, int flags) for (int i = ofst; i < ofst + n; i++) { e->shm_locks[i] = VFS2_EXCLUSIVE; + if (i == WAL_RECOVER_LOCK) { + sm_move(&e->shm.sm, SHM_RECOVERING); + } } /* XXX maybe this shouldn't be an assertion */ if (ofst == WAL_WRITE_LOCK) { assert(n == 1); assert(e->pending_txn_len == 0); + sm_move(&e->wlk_sm, WLK_LOCKED); + } + + if (ofst == WAL_CKPT_LOCK && n == 1) { + sm_move(&e->ckpt_sm, CKPT_CHECKPOINTING); } } else if (flags == (SQLITE_SHM_UNLOCK | SQLITE_SHM_SHARED)) { for (int i = ofst; i < ofst + n; i++) { @@ -988,9 +1126,11 @@ static int vfs2_shm_lock(sqlite3_file *file, int ofst, int n, int flags) } if (ofst <= WAL_RECOVER_LOCK && WAL_RECOVER_LOCK < ofst + n) { + sm_move(&e->shm.sm, SHM_RECOVERED); } if (ofst == WAL_WRITE_LOCK) { + sm_move(&e->wlk_sm, WLK_UNLOCKED); /* If the last frame of the pending transaction has no * commit marker when SQLite releases the write lock, it * means that the transaction rolled back before it @@ -1005,11 +1145,11 @@ static int vfs2_shm_lock(sqlite3_file *file, int ofst, int n, int flags) } else if (ofst == WAL_CKPT_LOCK && n == 1) { /* End of a checkpoint: if all frames have been * backfilled, move to EMPTY. */ - assert(n == 1); struct wal_index_full_hdr *ihdr = get_full_hdr(e); if (ihdr->nBackfill == ihdr->basic[0].mxFrame) { sm_move(&e->wtx_sm, WTX_EMPTY); } + sm_move(&e->ckpt_sm, CKPT_QUIESCENT); } } else { assert(0); @@ -1129,6 +1269,7 @@ static void pgno_ht_insert(uint16_t *ht, uint16_t fx, uint32_t pgno) static void shm_update_ht(struct shm *shm, uint32_t frame) { PRE(shm->num_regions > 0); + sm_move(&shm->sm, SHM_MANAGED); if (frame < SHM_SHORT_PGNOS_LEN) { struct shm_region *r0 = &shm->regions[0]; uint32_t pgno = r0->pgnos_short[frame]; @@ -1372,6 +1513,9 @@ static int open_entry(struct common *common, const char *name, struct entry *e) size_cur = size2; hdr_prev = hdr1; } + sm_init(&e->wal_sm, no_invariant, NULL, wal_states, "wal", + wal1_is_fresh ? WAL_1 : WAL_2); + sm_relate(&e->wtx_sm, &e->wal_sm); e->wal_cur_hdr = hdr_cur; e->wal_prev_hdr = hdr_prev; @@ -1381,6 +1525,9 @@ static int open_entry(struct common *common, const char *name, struct entry *e) rv = link(e->wal_cur_fixed_name, e->wal_moving_name); (void)rv; + sm_init(&e->shm.sm, no_invariant, NULL, shm_states, "shm", SHM_EMPTY); + sm_relate(&e->wtx_sm, &e->shm.sm); + /* If WAL-cur contains a valid header, walk it to initialize wal_cursor * and the WAL-index. Also, set the page size. * @@ -1408,6 +1555,12 @@ static int open_entry(struct common *common, const char *name, struct entry *e) if (e->wal_cursor > 0) { e->shm_locks[WAL_WRITE_LOCK] = VFS2_EXCLUSIVE; } + sm_init(&e->wlk_sm, no_invariant, NULL, wlk_states, "wlk", + e->wal_cursor > 0 ? WLK_LOCKED : WLK_UNLOCKED); + sm_relate(&e->wtx_sm, &e->wlk_sm); + + sm_init(&e->ckpt_sm, no_invariant, NULL, ckpt_states, "ckpt", CKPT_QUIESCENT); + sm_relate(&e->wtx_sm, &e->ckpt_sm); sm_move(&e->wtx_sm, e->wal_cursor > 0 ? WTX_FOLLOWING : size_cur > 0 ? WTX_BASE @@ -1639,6 +1792,7 @@ int vfs2_unadd(sqlite3_file *file, struct vfs2_wal_slice first_to_unadd) if (e->wal_cursor == ihdr->basic[0].mxFrame) { e->shm_locks[WAL_WRITE_LOCK] = 0; sm_move(&e->wtx_sm, WTX_BASE); + sm_move(&e->wlk_sm, WLK_UNLOCKED); } else { sm_move(&e->wtx_sm, WTX_FOLLOWING); } @@ -1651,6 +1805,7 @@ int vfs2_unhide(sqlite3_file *file) PRE(xfile->flags & SQLITE_OPEN_MAIN_DB); struct entry *e = xfile->entry; PRE(e->shm_locks[WAL_WRITE_LOCK] == VFS2_EXCLUSIVE); + sm_move(&e->wlk_sm, WLK_UNLOCKED); e->shm_locks[WAL_WRITE_LOCK] = 0; struct wal_index_full_hdr *hdr = get_full_hdr(e); @@ -1686,6 +1841,7 @@ int vfs2_apply(sqlite3_file *file, struct vfs2_wal_slice stop) set_mx_frame(e, commit, fhdr); if (commit == e->wal_cursor) { e->shm_locks[WAL_WRITE_LOCK] = 0; + sm_move(&e->wlk_sm, WLK_UNLOCKED); sm_move(&e->wtx_sm, WTX_BASE); } else { sm_move(&e->wtx_sm, WTX_FOLLOWING); @@ -1710,6 +1866,7 @@ int vfs2_poll(sqlite3_file *file, return 1; } e->shm_locks[WAL_WRITE_LOCK] = VFS2_EXCLUSIVE; + sm_move(&e->wlk_sm, WLK_LOCKED); } /* Note, not resetting pending_txn_{start,len} because they are used by @@ -1986,6 +2143,9 @@ int vfs2_add_uncommitted(sqlite3_file *file, * or vfs2_unadd causes the number of committed frames in * WAL-cur (mxFrame) to equal the number of applies frames * (wal_cursor). */ + if (e->shm_locks[WAL_WRITE_LOCK] == 0) { + sm_move(&e->wlk_sm, WLK_LOCKED); + } e->shm_locks[WAL_WRITE_LOCK] = VFS2_EXCLUSIVE; /* This paragraph accomplishes a few related things: initializing the From 59c9036e61c59f7360d04845202d4a99ba544298 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Thu, 25 Jul 2024 00:37:54 -0400 Subject: [PATCH 46/73] Use a NR constant for the wtx sm configuration Signed-off-by: Cole Miller --- src/vfs2.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/vfs2.c b/src/vfs2.c index f88ee62b2..050f66968 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -62,10 +62,11 @@ enum { /* Leader, transaction committed by SQLite and hidden. */ WTX_HIDDEN, /* Leader, transation committed by SQLite, hidden, and polled. */ - WTX_POLLED + WTX_POLLED, + WTX_NR }; -static const struct sm_conf wtx_states[SM_STATES_MAX] = { +static const struct sm_conf wtx_states[WTX_NR] = { [WTX_CLOSED] = { .flags = SM_INITIAL|SM_FINAL, .name = "closed", From 6594fb84df0ea0ac2c6addb2e93868dc91594fde Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Thu, 25 Jul 2024 00:39:50 -0400 Subject: [PATCH 47/73] Add a new vfs2 test involving a restart Signed-off-by: Cole Miller --- test/unit/test_vfs2.c | 46 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/test/unit/test_vfs2.c b/test/unit/test_vfs2.c index 38fa09f5a..949ec78e0 100644 --- a/test/unit/test_vfs2.c +++ b/test/unit/test_vfs2.c @@ -261,6 +261,52 @@ TEST(vfs2, startup_one_nonempty, set_up, tear_down, 0, NULL) return MUNIT_OK; } +/** + * When one WAL has a valid transaction and the other is empty, + * the WAL with the transaction becomes WAL-cur. The first write does not + * trigger a WAL swap, but rather goes to that same WAL. + */ +TEST(vfs2, startup_frames_in_one, set_up, tear_down, 0, NULL) +{ + struct fixture *f = data; + struct node *node = &f->nodes[0]; + char buf[PATH_MAX]; + int rv; + + snprintf(buf, PATH_MAX, "%s/%s", node->dir, "test.db"); + + /* Set up a transaction in WAL2. */ + sqlite3 *db = node_open_db(node, "test.db"); + sqlite3_file *fp = main_file(db); + OK(sqlite3_exec(db, "CREATE TABLE foo (n INTEGER)", NULL, NULL, NULL)); + + struct vfs2_wal_slice sl; + OK(vfs2_poll(fp, NULL, &sl)); + OK(sqlite3_close(db)); + /* WAL2 has the frames. The value 4 here reflect the invalid magic + * number that we write to the outgoing WAL. */ + assert_wal_sizes(buf, 4, WAL_SIZE_FROM_FRAMES(2)); + + db = node_open_db(node, "test.db"); + fp = main_file(db); + /* The transaction is not visible. */ + rv = sqlite3_exec(db, "SELECT * FROM foo", NULL, NULL, NULL); + munit_assert_int(rv, ==, SQLITE_ERROR); + /* The write lock is held. */ + rv = sqlite3_exec(db, "CREATE TABLE bar (k INTEGER)", NULL, NULL, NULL); + munit_assert_int(rv, ==, SQLITE_BUSY); + /* The transaction can be committed. */ + OK(vfs2_apply(fp, sl)); + /* The transaction is visible. */ + OK(sqlite3_exec(db, "SELECT * FROM foo", NULL, NULL, NULL)); + /* The write lock is not held. */ + OK(sqlite3_exec(db, "CREATE TABLE bar (k, INTEGER)", NULL, NULL, NULL)); + /* The write lock is released. */ + sqlite3_close(db); + + return MUNIT_OK; +} + /** * When both WALs are nonempty at startup, the one with the higher salt1 * value becomes WAL-cur. Then, the first write triggers a WAL swap, so From d7490dafb81bd8531ac2ead7e89a7fc0fa13357c Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Thu, 25 Jul 2024 01:53:38 -0400 Subject: [PATCH 48/73] Work around integer promotion headache Signed-off-by: Cole Miller --- src/vfs2.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/vfs2.c b/src/vfs2.c index 050f66968..d88fc62f6 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -1260,7 +1260,8 @@ static void pgno_ht_insert(uint16_t *ht, uint16_t fx, uint32_t pgno) } /* SQLite uses 1-based frame indices in this context, reserving * 0 for a sentinel value. */ - ht[hash % SHM_HT_LEN] = fx + 1; + fx++; + ht[hash % SHM_HT_LEN] = fx; } /** From 8b31e836d398ce106dbbf6c4e66c72659c792e03 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Thu, 25 Jul 2024 02:27:44 -0400 Subject: [PATCH 49/73] Draw an updated state machine diagram Signed-off-by: Cole Miller --- src/vfs2.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/src/vfs2.c b/src/vfs2.c index d88fc62f6..44ad75fd9 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -66,6 +66,41 @@ enum { WTX_NR }; +/** + * Major transitions for this state machine: + * + * +-----------CLOSED----------+ + * | | + * xOpen && no tx in WAL-cur xOpen && tx in WAL-cur + * | | + * | (FB) | + * v (BE) <------ v v-+ vfs2_add_uncommitted + * EMPTY<------BASE------>FOLLOWING-+ + * ^ | (BF) + * | | + * | | sqlite3_step && xWrite(WAL) + * | | + * | v v-+ xWrite(WAL) + * |--ACTIVE-+ + * | | + * vfs2_abort | | COMMIT_PHASETWO + * | | + * | v + * |--HIDDEN + * | | + * | | vfs2_poll + * | | + * | v + * +--POLLED + * + * Abbreviations and omissions: + * - BE occurs when we run a full checkpoint. + * - FB occurs when we call vfs2_apply or vfs2_unadd and no uncommitted + * transactions remain afterward. + * - BF occurs via vfs2_add_uncommitted. + * - EMPTY may move to ACTIVE via sqlite3_step. + * - All states may move back to CLOSED. + */ static const struct sm_conf wtx_states[WTX_NR] = { [WTX_CLOSED] = { .flags = SM_INITIAL|SM_FINAL, From eff99cac2023177f11afdd9eb8eee0c54eeb1354 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Thu, 25 Jul 2024 16:40:38 -0400 Subject: [PATCH 50/73] Refactor vfs2_poll This should make the logic and resource management a bit clearer. Signed-off-by: Cole Miller --- src/vfs2.c | 53 ++++++++++++++++++++++++----------------------------- 1 file changed, 24 insertions(+), 29 deletions(-) diff --git a/src/vfs2.c b/src/vfs2.c index 44ad75fd9..806f22733 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -1887,48 +1887,43 @@ int vfs2_apply(sqlite3_file *file, struct vfs2_wal_slice stop) } int vfs2_poll(sqlite3_file *file, - dqlite_vfs_frame **frames, - struct vfs2_wal_slice *sl) + dqlite_vfs_frame **frames_out, + struct vfs2_wal_slice *sl_out) { struct file *xfile = (struct file *)file; PRE(xfile->flags & SQLITE_OPEN_MAIN_DB); struct entry *e = xfile->entry; + PRE(e->shm_locks[WAL_WRITE_LOCK] == 0); uint32_t len = e->pending_txn_len; + dqlite_vfs_frame *frames = NULL; + struct vfs2_wal_slice sl = {}; + /* If some frames were produced, take the write lock. */ if (len > 0) { - /* Don't go through vfs2_shm_lock here since that has additional - * checks that assume the context of being called from inside - * SQLite. */ - if (e->shm_locks[WAL_WRITE_LOCK] > 0) { - return 1; - } e->shm_locks[WAL_WRITE_LOCK] = VFS2_EXCLUSIVE; sm_move(&e->wlk_sm, WLK_LOCKED); - } - - /* Note, not resetting pending_txn_{start,len} because they are used by - * later states */ - if (frames != NULL) { - *frames = e->pending_txn_frames; + frames = e->pending_txn_frames; + e->pending_txn_frames = NULL; + sl = (struct vfs2_wal_slice){ .salts = e->pending_txn_hdr.salts, + .start = e->prev_txn_hdr.mxFrame, + .len = len }; + /* We don't clear e->pending_txn_hdr here because it's used by + * vfs2_unhide. (By contrast, pending_txn_frames only exists + * to be returned by this function if requested.) */ + sm_move(&e->wtx_sm, WTX_POLLED); + } + + if (frames_out != NULL) { + *frames_out = frames; } else { - for (uint32_t i = 0; i < e->pending_txn_len; i++) { - sqlite3_free(e->pending_txn_frames[i].data); + for (uint32_t i = 0; i < len; i++) { + sqlite3_free(frames[i].data); } - sqlite3_free(e->pending_txn_frames); - } - e->pending_txn_frames = NULL; - - if (sl != NULL) { - sl->len = len; - sl->salts = e->pending_txn_hdr.salts; - sl->start = e->prev_txn_hdr.mxFrame; - sl->len = len; + sqlite3_free(frames); } - - if (len > 0) { - sm_move(&xfile->entry->wtx_sm, WTX_POLLED); + if (sl_out != NULL) { + *sl_out = sl; } - return 0; } From ad203cb7e5cab2ff4a9a579835ed57d9f1abf1e6 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Thu, 25 Jul 2024 16:41:25 -0400 Subject: [PATCH 51/73] Tiny formatting fix Signed-off-by: Cole Miller --- src/vfs2.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/vfs2.c b/src/vfs2.c index 806f22733..26e0e6c6e 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -1596,7 +1596,8 @@ static int open_entry(struct common *common, const char *name, struct entry *e) e->wal_cursor > 0 ? WLK_LOCKED : WLK_UNLOCKED); sm_relate(&e->wtx_sm, &e->wlk_sm); - sm_init(&e->ckpt_sm, no_invariant, NULL, ckpt_states, "ckpt", CKPT_QUIESCENT); + sm_init(&e->ckpt_sm, no_invariant, NULL, ckpt_states, "ckpt", + CKPT_QUIESCENT); sm_relate(&e->wtx_sm, &e->ckpt_sm); sm_move(&e->wtx_sm, e->wal_cursor > 0 ? WTX_FOLLOWING From 2d9fc075cb14da236854128490774d67b33bc8d6 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Thu, 25 Jul 2024 21:15:09 -0400 Subject: [PATCH 52/73] Make assert_wal_sizes and prepare_wals more flexible Pass them a node and a database name to cut down on the amount of snprintf in the tests themselves. Signed-off-by: Cole Miller --- test/unit/test_vfs2.c | 43 +++++++++++++++++++------------------------ 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/test/unit/test_vfs2.c b/test/unit/test_vfs2.c index 949ec78e0..9cab24fe9 100644 --- a/test/unit/test_vfs2.c +++ b/test/unit/test_vfs2.c @@ -88,7 +88,8 @@ static sqlite3 *node_open_db(const struct node *node, const char *name) /** * Write two WALs to disk with the given contents. */ -static void prepare_wals(const char *dbname, +static void prepare_wals(const struct node *node, + const char *dbname, const unsigned char *wal1, size_t wal1_len, const unsigned char *wal2, @@ -97,7 +98,7 @@ static void prepare_wals(const char *dbname, char buf[PATH_MAX]; ssize_t n; if (wal1 != NULL) { - snprintf(buf, sizeof(buf), "%s-xwal1", dbname); + snprintf(buf, sizeof(buf), "%s/%s-xwal1", node->dir, dbname); int fd1 = open(buf, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH); munit_assert_int(fd1, !=, -1); @@ -107,7 +108,7 @@ static void prepare_wals(const char *dbname, close(fd1); } if (wal2 != NULL) { - snprintf(buf, sizeof(buf), "%s-xwal2", dbname); + snprintf(buf, sizeof(buf), "%s/%s-xwal2", node->dir, dbname); int fd2 = open(buf, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH); munit_assert_int(fd2, !=, -1); @@ -121,18 +122,21 @@ static void prepare_wals(const char *dbname, /** * Assert the lengths of WAL1 and WAL2 on disk. */ -static void assert_wal_sizes(const char *dbname, off_t wal1_len, off_t wal2_len) +static void assert_wal_sizes(const struct node *node, + const char *dbname, + off_t wal1_len, + off_t wal2_len) { char buf[PATH_MAX]; struct stat st; int rv; - snprintf(buf, sizeof(buf), "%s-xwal1", dbname); + snprintf(buf, sizeof(buf), "%s/%s-xwal1", node->dir, dbname); rv = stat(buf, &st); munit_assert_true((rv == 0 && st.st_size == wal1_len) || (rv < 0 && errno == ENOENT && wal1_len == 0)); - snprintf(buf, sizeof(buf), "%s-xwal2", dbname); + snprintf(buf, sizeof(buf), "%s/%s-xwal2", node->dir, dbname); rv = stat(buf, &st); munit_assert_true((rv == 0 && st.st_size == wal2_len) || (rv < 0 && errno == ENOENT && wal2_len == 0)); @@ -241,22 +245,19 @@ TEST(vfs2, startup_one_nonempty, set_up, tear_down, 0, NULL) { struct fixture *f = data; struct node *node = &f->nodes[0]; - char buf[PATH_MAX]; - - snprintf(buf, PATH_MAX, "%s/%s", node->dir, "test.db"); - - assert_wal_sizes(buf, 0, 0); /* WAL2 has a header. */ uint8_t wal2_hdronly[WAL_SIZE_FROM_FRAMES(0)] = { 0 }; vfs2_ut_make_wal_hdr(wal2_hdronly, PAGE_SIZE, 0, 17, 103); - prepare_wals(buf, NULL, 0, wal2_hdronly, sizeof(wal2_hdronly)); + prepare_wals(node, "test.db", NULL, 0, wal2_hdronly, + sizeof(wal2_hdronly)); sqlite3 *db = node_open_db(node, "test.db"); OK(sqlite3_exec(db, "CREATE TABLE foo (n INTEGER)", NULL, NULL, NULL)); OK(sqlite3_close(db)); /* WAL1 ends up with the frames. */ - assert_wal_sizes(buf, WAL_SIZE_FROM_FRAMES(2), WAL_SIZE_FROM_FRAMES(0)); + assert_wal_sizes(node, "test.db", WAL_SIZE_FROM_FRAMES(2), + WAL_SIZE_FROM_FRAMES(0)); return MUNIT_OK; } @@ -270,11 +271,8 @@ TEST(vfs2, startup_frames_in_one, set_up, tear_down, 0, NULL) { struct fixture *f = data; struct node *node = &f->nodes[0]; - char buf[PATH_MAX]; int rv; - snprintf(buf, PATH_MAX, "%s/%s", node->dir, "test.db"); - /* Set up a transaction in WAL2. */ sqlite3 *db = node_open_db(node, "test.db"); sqlite3_file *fp = main_file(db); @@ -285,7 +283,7 @@ TEST(vfs2, startup_frames_in_one, set_up, tear_down, 0, NULL) OK(sqlite3_close(db)); /* WAL2 has the frames. The value 4 here reflect the invalid magic * number that we write to the outgoing WAL. */ - assert_wal_sizes(buf, 4, WAL_SIZE_FROM_FRAMES(2)); + assert_wal_sizes(node, "test.db", 4, WAL_SIZE_FROM_FRAMES(2)); db = node_open_db(node, "test.db"); fp = main_file(db); @@ -316,24 +314,21 @@ TEST(vfs2, startup_both_nonempty, set_up, tear_down, 0, NULL) { struct fixture *f = data; struct node *node = &f->nodes[0]; - char buf[PATH_MAX]; - - snprintf(buf, PATH_MAX, "%s/%s", node->dir, "test.db"); - assert_wal_sizes(buf, 0, 0); /* WAL1 has the higher salt1. */ uint8_t wal1_hdronly[WAL_SIZE_FROM_FRAMES(0)] = { 0 }; vfs2_ut_make_wal_hdr(wal1_hdronly, PAGE_SIZE, 0, 18, 103); uint8_t wal2_hdronly[WAL_SIZE_FROM_FRAMES(0)] = { 0 }; vfs2_ut_make_wal_hdr(wal2_hdronly, PAGE_SIZE, 0, 17, 103); - prepare_wals(buf, wal1_hdronly, sizeof(wal1_hdronly), wal2_hdronly, - sizeof(wal2_hdronly)); + prepare_wals(node, "test.db", wal1_hdronly, sizeof(wal1_hdronly), + wal2_hdronly, sizeof(wal2_hdronly)); sqlite3 *db = node_open_db(node, "test.db"); OK(sqlite3_exec(db, "CREATE TABLE foo (n INTEGER)", NULL, NULL, NULL)); OK(sqlite3_close(db)); /* WAL2 ends up with the frames. */ - assert_wal_sizes(buf, WAL_SIZE_FROM_FRAMES(0), WAL_SIZE_FROM_FRAMES(2)); + assert_wal_sizes(node, "test.db", WAL_SIZE_FROM_FRAMES(0), + WAL_SIZE_FROM_FRAMES(2)); return MUNIT_OK; } From e60109bccbeea36527baddeba6b025eae92aab0e Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Sun, 28 Jul 2024 16:35:29 -0500 Subject: [PATCH 53/73] Disable checkpoint-on-close in vfs2 unit tests This matches the setup we use in dqlite proper with the old vfs (see leader.c and db.c). It also just makes good sense if we're trying to exercise control over when checkpoints happen. Signed-off-by: Cole Miller --- test/unit/test_vfs2.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/unit/test_vfs2.c b/test/unit/test_vfs2.c index 9cab24fe9..c45314ce6 100644 --- a/test/unit/test_vfs2.c +++ b/test/unit/test_vfs2.c @@ -82,6 +82,8 @@ static sqlite3 *node_open_db(const struct node *node, const char *name) "PRAGMA wal_autocheckpoint=0", NULL, NULL, NULL); munit_assert_int(rv, ==, SQLITE_OK); + rv = sqlite3_db_config(db, SQLITE_DBCONFIG_NO_CKPT_ON_CLOSE, 1, NULL); + munit_assert_int(rv, ==, SQLITE_OK); return db; } From d924a0674576ae0f14e5f83640eb72ddbb468899 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Sun, 28 Jul 2024 23:22:52 -0500 Subject: [PATCH 54/73] Fix commit marker calculation Signed-off-by: Cole Miller --- src/vfs2.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/vfs2.c b/src/vfs2.c index 26e0e6c6e..d63911de1 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -18,6 +18,7 @@ #include #include #include +#include /* MAX */ #include #define VFS2_WAL_FIXED_SUFFIX1 "-xwal1" @@ -2274,7 +2275,14 @@ int vfs2_add_uncommitted(sqlite3_file *file, * "early" like this is harmless and saves us from having to stash the * page numbers somewhere else in memory in between add_uncommitted and * apply, or (worse) read them back from WAL-cur. */ - shm_add_pgno(&e->shm, e->wal_cursor, (uint32_t)frames[0].page_number); + uint32_t pgno = (uint32_t)frames[0].page_number; + shm_add_pgno(&e->shm, e->wal_cursor, pgno); + + /* With every frame, we make this update. The interpretation is that + * db_size would be the size of the main file in pages if we + * checkpointing up to and including the current frame. Then we use the + * final value to write the commit marker of the last frame. */ + db_size = MAX(db_size, pgno); uint32_t commit = len == 1 ? db_size : 0; struct wal_frame_hdr fhdr = txn_frame_hdr(e, sums, &frames[0], commit); @@ -2284,9 +2292,9 @@ int vfs2_add_uncommitted(sqlite3_file *file, } for (unsigned i = 1; i < len; i++) { - PRE(e->wal_cursor < SHM_SHORT_PGNOS_LEN); - shm_add_pgno(&e->shm, e->wal_cursor, - (uint32_t)frames[i].page_number); + pgno = (uint32_t)frames[i].page_number; + shm_add_pgno(&e->shm, e->wal_cursor, pgno); + db_size = MAX(db_size, pgno); sums.cksum1 = ByteGetBe32(fhdr.cksum1); sums.cksum2 = ByteGetBe32(fhdr.cksum2); commit = i == len - 1 ? db_size : 0; From f902cba4ab01c854f0f2129a17263f178921cf19 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Sun, 28 Jul 2024 23:23:57 -0500 Subject: [PATCH 55/73] Add additional checks to leader_and_follower test Signed-off-by: Cole Miller --- test/unit/test_vfs2.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/test/unit/test_vfs2.c b/test/unit/test_vfs2.c index c45314ce6..1dc3780ea 100644 --- a/test/unit/test_vfs2.c +++ b/test/unit/test_vfs2.c @@ -370,11 +370,14 @@ TEST(vfs2, leader_and_follower, set_up, tear_down, 0, NULL) struct fixture *f = data; struct node *leader = &f->nodes[0]; struct node *follower = &f->nodes[1]; + int rv; /* The leader executes and polls a transaction. */ sqlite3 *leader_db = node_open_db(leader, "test.db"); OK(sqlite3_exec(leader_db, "CREATE TABLE foo (n INTEGER)", NULL, NULL, NULL)); + /* WAL2 gets the frames after a WAL swap. */ + assert_wal_sizes(leader, "test.db", 4, WAL_SIZE_FROM_FRAMES(2)); sqlite3_file *leader_fp = main_file(leader_db); dqlite_vfs_frame *frames; struct vfs2_wal_slice leader_sl; @@ -390,9 +393,16 @@ TEST(vfs2, leader_and_follower, set_up, tear_down, 0, NULL) struct vfs2_wal_slice follower_sl; OK(vfs2_add_uncommitted(follower_fp, PAGE_SIZE, frames, leader_sl.len, &follower_sl)); + /* WAL2 gets the frames after a WAL swap. */ + assert_wal_sizes(follower, "test.db", 4, WAL_SIZE_FROM_FRAMES(2)); sqlite3_free(frames[0].data); sqlite3_free(frames[1].data); sqlite3_free(frames); + /* The transaction is not visible, and the write lock is held. */ + rv = sqlite3_exec(follower_db, "SELECT * FROM foo", NULL, NULL, NULL); + munit_assert_int(rv, ==, SQLITE_ERROR); + rv = sqlite3_exec(follower_db, "CREATE TABLE bar (k INTEGER)", NULL, NULL, NULL); + munit_assert_int(rv, ==, SQLITE_BUSY); /* The leader receives the follower's acknowledgement * and applies the transaction locally. */ @@ -401,6 +411,8 @@ TEST(vfs2, leader_and_follower, set_up, tear_down, 0, NULL) /* The follower learns the new commit index and applies * the transaction locally. */ OK(vfs2_apply(follower_fp, follower_sl)); + /* The transaction is visible and the write lock is released. */ + OK(sqlite3_exec(follower_db, "INSERT INTO foo (n) VALUES (17)", NULL, NULL, NULL)); sqlite3_close(follower_db); sqlite3_close(leader_db); From bfce72d8e2f581b97c1f040f6c5c640788ec3d81 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Mon, 29 Jul 2024 14:49:10 -0500 Subject: [PATCH 56/73] Use a prepared statement in rollback test Signed-off-by: Cole Miller --- test/unit/test_vfs2.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/test/unit/test_vfs2.c b/test/unit/test_vfs2.c index 1dc3780ea..399fc41e5 100644 --- a/test/unit/test_vfs2.c +++ b/test/unit/test_vfs2.c @@ -343,19 +343,23 @@ TEST(vfs2, rollback, set_up, tear_down, 0, NULL) struct fixture *f = data; struct node *node = &f->nodes[0]; struct vfs2_wal_slice sl; - char sql[100]; + int rv; sqlite3 *db = node_open_db(node, "test.db"); OK(sqlite3_exec(db, "CREATE TABLE foo (n INTEGER)", NULL, NULL, NULL)); sqlite3_file *fp = main_file(db); OK(vfs2_poll(fp, NULL, &sl)); OK(vfs2_unhide(fp)); + sqlite3_stmt *stmt; + OK(sqlite3_prepare_v2(db, "INSERT INTO foo (n) VALUES (?)", -1, &stmt, NULL)); OK(sqlite3_exec(db, "BEGIN", NULL, NULL, NULL)); for (unsigned i = 0; i < 500; i++) { - snprintf(sql, sizeof(sql), "INSERT INTO foo (n) VALUES (%d)", - i); - OK(sqlite3_exec(db, sql, NULL, NULL, NULL)); + OK(sqlite3_bind_int(stmt, 1, i)); + rv = sqlite3_step(stmt); + munit_assert_int(rv, ==, SQLITE_DONE); + OK(sqlite3_reset(stmt)); } + OK(sqlite3_finalize(stmt)); OK(sqlite3_exec(db, "ROLLBACK", NULL, NULL, NULL)); OK(sqlite3_close(db)); From fde6dca1692352ef744c64ab1e5fb707da67c9d0 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Mon, 29 Jul 2024 17:11:25 -0500 Subject: [PATCH 57/73] Return to separate allocations for shm regions Using a single allocation makes the code simpler, but is unsound: SQLite holds on to the pointers obtained from xShmMap and expects them to remain valid as more regions are mapped. This partially reverts commit 7f20503b068d0a03724fc35473ad4d3ac24a272c. Signed-off-by: Cole Miller --- src/vfs2.c | 107 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 64 insertions(+), 43 deletions(-) diff --git a/src/vfs2.c b/src/vfs2.c index d63911de1..f2f149464 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -386,12 +386,13 @@ struct shm_region { /** * "Shared" memory implementation for storing the WAL-index. * - * All the regions are stored in a single allocation, whose size is a multiple - * of VFS2_WAL_INDEX_REGION_SIZE. We realloc to create additional regions as - * they are demanded. + * Each region is stored in its own heap allocation of size + * VFS2_WAL_INDEX_REGION_SIZE. Using separate allocations ensures that existing + * region pointers remain valid when new regions are mapped, as SQLite + * expects. */ struct shm { - struct shm_region *regions; + struct shm_region **regions; int num_regions; /* Counts the net number of times that SQLite has mapped the zeroth * region. As a sanity check, we assert that this value is zero before @@ -403,6 +404,37 @@ struct shm { static_assert(sizeof(struct shm_region) == VFS2_WAL_INDEX_REGION_SIZE, "shm regions have the expected size"); +/** + * Allocate a new shm region at the next-highest index. + * + * Returns a pointer to the new region, or NULL if allocation failed. + * In the latter case the shm is unchanged. + */ +static struct shm_region *shm_grow(struct shm *shm) +{ + int index = shm->num_regions; + struct shm_region *r = sqlite3_malloc64(VFS2_WAL_INDEX_REGION_SIZE); + if (r == NULL) { + goto err; + } + *r = (struct shm_region){}; + sqlite3_uint64 size = + (sqlite3_uint64)(index + 1) * (sqlite3_uint64)sizeof(*shm->regions); + struct shm_region **p = sqlite3_realloc64(shm->regions, size); + if (p == NULL) { + goto err_after_alloc_region; + } + p[index] = r; + shm->regions = p; + shm->num_regions++; + return r; + +err_after_alloc_region: + sqlite3_free(r); +err: + return NULL; +} + static void write_basic_hdr_cksums(struct wal_index_basic_hdr *bhdr) { struct cksums sums = {}; @@ -420,15 +452,10 @@ static void write_basic_hdr_cksums(struct wal_index_basic_hdr *bhdr) */ static int shm_init(struct shm *shm, struct wal_hdr whdr) { - shm->regions = sqlite3_malloc64(VFS2_WAL_INDEX_REGION_SIZE); - if (shm->regions == NULL) { + struct shm_region *r0 = shm_grow(shm); + if (r0 == NULL) { return SQLITE_NOMEM; } - shm->regions[0] = (struct shm_region){}; - shm->num_regions = 1; - shm->refcount = 0; - - struct shm_region *r0 = &shm->regions[0]; struct wal_index_full_hdr *ihdr = &r0->hdr; ihdr->basic[0].iVersion = 3007000; ihdr->basic[0].isInit = 1; @@ -452,11 +479,11 @@ static int shm_init(struct shm *shm, struct wal_hdr whdr) static void shm_restart(struct shm *shm, struct wal_hdr whdr) { for (int i = 0; i < shm->num_regions; i++) { - shm->regions[i] = (struct shm_region){}; + *shm->regions[i] = (struct shm_region){}; } /* TODO(cole) eliminate redundancy with shm_init */ - struct shm_region *r0 = &shm->regions[0]; + struct shm_region *r0 = shm->regions[0]; struct wal_index_full_hdr *ihdr = &r0->hdr; ihdr->basic[0].iVersion = 3007000; ihdr->basic[0].isInit = 1; @@ -483,7 +510,7 @@ static int shm_add_pgno(struct shm *shm, uint32_t frame, uint32_t pgno) { PRE(shm->num_regions > 0); if (frame < SHM_SHORT_PGNOS_LEN) { - struct shm_region *r0 = &shm->regions[0]; + struct shm_region *r0 = shm->regions[0]; r0->pgnos_short[frame] = pgno; return SQLITE_OK; } @@ -491,21 +518,28 @@ static int shm_add_pgno(struct shm *shm, uint32_t frame, uint32_t pgno) uint32_t regno = (frame - SHM_SHORT_PGNOS_LEN) / SHM_LONG_PGNOS_LEN; uint32_t index = (frame - SHM_SHORT_PGNOS_LEN) % SHM_LONG_PGNOS_LEN; PRE(regno <= (uint32_t)shm->num_regions + 1); + struct shm_region *region; if (regno == (uint32_t)shm->num_regions + 1) { - sqlite3_uint64 sz = - (sqlite3_uint64)regno * VFS2_WAL_INDEX_REGION_SIZE; - struct shm_region *p = sqlite3_realloc64(shm->regions, sz); - if (p == NULL) { + region = shm_grow(shm); + if (region == NULL) { return SQLITE_NOMEM; } - shm->regions = p; - shm->num_regions++; + } else { + region = shm->regions[regno]; } - shm->regions[regno].pgnos_long[index] = pgno; + region->pgnos_long[index] = pgno; sm_move(&shm->sm, SHM_MANAGED); return SQLITE_OK; } +static void shm_free(struct shm *shm) +{ + for (int i = 0; i < shm->num_regions; i++) { + sqlite3_free(shm->regions[i]); + } + sqlite3_free(shm->regions); +} + struct entry { /* Next/prev entries for this VFS. */ queue link; @@ -631,7 +665,7 @@ static struct wal_index_full_hdr *get_full_hdr(struct entry *e) { PRE(e->shm.num_regions > 0); PRE(e->shm.regions != NULL); - return &e->shm.regions[0].hdr; + return &e->shm.regions[0]->hdr; } static bool no_pending_txn(const struct entry *e) @@ -710,7 +744,7 @@ static void maybe_close_entry(struct entry *e) free_pending_txn(e); assert(e->shm.refcount == 0); - sqlite3_free(e->shm.regions); + shm_free(&e->shm); pthread_rwlock_wrlock(&e->common->rwlock); queue_remove(&e->link); @@ -1062,23 +1096,15 @@ static int vfs2_shm_map(sqlite3_file *file, struct entry *e = xfile->entry; struct shm *shm = &e->shm; struct shm_region *region; - int rv; if (shm->regions != NULL && regno < shm->num_regions) { - region = &shm->regions[regno]; + region = shm->regions[regno]; } else if (extend != 0) { assert(regno == shm->num_regions); - sqlite3_uint64 sz = - ((sqlite3_uint64)regno + 1) * (sqlite3_uint64)regsz; - struct shm_region *p = sqlite3_realloc64(shm->regions, sz); - if (p == NULL) { - rv = SQLITE_NOMEM; - goto err; + region = shm_grow(shm); + if (region == NULL) { + return SQLITE_NOMEM; } - shm->regions = p; - region = &shm->regions[regno]; - memset(region, 0, (size_t)regsz); - shm->num_regions++; } else { region = NULL; } @@ -1088,11 +1114,6 @@ static int vfs2_shm_map(sqlite3_file *file, e->shm.refcount++; } return SQLITE_OK; - -err: - assert(rv != SQLITE_OK); - *out = NULL; - return rv; } static __attribute__((noinline)) int busy(void) @@ -1309,7 +1330,7 @@ static void shm_update_ht(struct shm *shm, uint32_t frame) PRE(shm->num_regions > 0); sm_move(&shm->sm, SHM_MANAGED); if (frame < SHM_SHORT_PGNOS_LEN) { - struct shm_region *r0 = &shm->regions[0]; + struct shm_region *r0 = shm->regions[0]; uint32_t pgno = r0->pgnos_short[frame]; pgno_ht_insert(r0->ht, (uint16_t)frame, pgno); return; @@ -1318,7 +1339,7 @@ static void shm_update_ht(struct shm *shm, uint32_t frame) uint32_t regno = (frame - SHM_SHORT_PGNOS_LEN) / SHM_LONG_PGNOS_LEN; uint32_t index = (frame - SHM_SHORT_PGNOS_LEN) % SHM_LONG_PGNOS_LEN; PRE(regno <= (uint32_t)shm->num_regions); - struct shm_region *region = &shm->regions[regno]; + struct shm_region *region = shm->regions[regno]; uint32_t pgno = region->pgnos_long[index]; pgno_ht_insert(region->ht, (uint16_t)index, pgno); } @@ -2202,7 +2223,7 @@ int vfs2_add_uncommitted(sqlite3_file *file, */ PRE(ERGO(e->shm.num_regions == 0, e->wal_cursor == 0)); struct wal_index_full_hdr *ihdr = - e->shm.num_regions > 0 ? &e->shm.regions[0].hdr : NULL; + e->shm.num_regions > 0 ? &e->shm.regions[0]->hdr : NULL; uint32_t mx = ihdr != NULL ? ihdr->basic[0].mxFrame : 0; uint32_t backfill = ihdr != NULL ? ihdr->nBackfill : 0; if (e->wal_cursor == mx && mx == backfill) { From 89f695bc6ed7874f6c75eb6af7b42c89e50a811f Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Thu, 1 Aug 2024 13:49:31 -0500 Subject: [PATCH 58/73] Add a TODO about WALs with non-native byte order Signed-off-by: Cole Miller --- src/vfs2.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/vfs2.c b/src/vfs2.c index f2f149464..6ed36bc96 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -1406,6 +1406,13 @@ static int walk_wal(sqlite3_file *wal, ByteGetBe32(hdr.cksum2) }; int rv; + /* TODO(cole): support WALs that use a non-native byte order for the + * checksums (because our data directory was transferred from another + * machine). */ + if (ByteGetBe32(hdr.magic) != (is_bigendian() ? BE_MAGIC : LE_MAGIC)) { + return SQLITE_ERROR; + } + /* Check whether we have been provided a stopping point that corresponds * to a transaction in the current WAL. (It's possible that the stopping * point corresponds to a transaction that's in WAL-prev instead.) */ From aff5f3ab557d436c1883694f5c427ddfa636a918 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Thu, 1 Aug 2024 13:54:40 -0500 Subject: [PATCH 59/73] Introduce WAL_MAX_VERSION constant This matches the name that SQLite uses for this magic number. Signed-off-by: Cole Miller --- src/vfs2.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/vfs2.c b/src/vfs2.c index 6ed36bc96..29c9d8d5e 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -43,6 +43,9 @@ #define DB_FILE_HEADER_SIZE 100 #define DB_FILE_HEADER_NPAGES_OFFSET 28 +/* Fixed version number used by the WAL and WAL-index. */ +#define WAL_MAX_VERSION 3007000 + static const uint32_t invalid_magic = 0x17171717; /* clang-format off */ @@ -457,7 +460,7 @@ static int shm_init(struct shm *shm, struct wal_hdr whdr) return SQLITE_NOMEM; } struct wal_index_full_hdr *ihdr = &r0->hdr; - ihdr->basic[0].iVersion = 3007000; + ihdr->basic[0].iVersion = WAL_MAX_VERSION; ihdr->basic[0].isInit = 1; ihdr->basic[0].bigEndCksum = is_bigendian(); ihdr->basic[0].szPage = (uint16_t)ByteGetBe32(whdr.page_size); @@ -485,7 +488,7 @@ static void shm_restart(struct shm *shm, struct wal_hdr whdr) /* TODO(cole) eliminate redundancy with shm_init */ struct shm_region *r0 = shm->regions[0]; struct wal_index_full_hdr *ihdr = &r0->hdr; - ihdr->basic[0].iVersion = 3007000; + ihdr->basic[0].iVersion = WAL_MAX_VERSION; ihdr->basic[0].isInit = 1; ihdr->basic[0].bigEndCksum = is_bigendian(); ihdr->basic[0].szPage = (uint16_t)ByteGetBe32(whdr.page_size); @@ -691,7 +694,7 @@ static bool basic_hdr_valid(const struct wal_index_basic_hdr *bhdr) const uint8_t *p = (const uint8_t *)bhdr; size_t len = offsetof(struct wal_index_basic_hdr, cksums); update_cksums(p, len, &sums); - return bhdr->iVersion == 3007000 && bhdr->isInit == 1 && + return bhdr->iVersion == WAL_MAX_VERSION && bhdr->isInit == 1 && cksums_equal(sums, bhdr->cksums); } @@ -2103,7 +2106,7 @@ static struct wal_hdr make_wal_hdr(uint32_t magic, { struct wal_hdr ret; BytePutBe32(magic, ret.magic); - BytePutBe32(3007000, ret.version); + BytePutBe32(WAL_MAX_VERSION, ret.version); BytePutBe32(page_size, ret.page_size); BytePutBe32(ckpoint_seqno, ret.ckpoint_seqno); ret.salts = salts; From 6c979b33c9c016462f35edd8ba2fcd47b9c2f303 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Thu, 1 Aug 2024 14:01:13 -0500 Subject: [PATCH 60/73] Remove WAL-prev invalidation It's not compatible with how we determine WAL-cur at startup, and isn't load-bearing, so let's ditch it. Also tighten up try_read_wal_hdr so that weird WALs with 0 < n < WAL_HDR_SIZE bytes are treated as corrupt. Signed-off-by: Cole Miller --- src/vfs2.c | 33 +++++++++++++-------------------- test/unit/test_vfs2.c | 9 ++++----- 2 files changed, 17 insertions(+), 25 deletions(-) diff --git a/src/vfs2.c b/src/vfs2.c index 29c9d8d5e..53acf35e7 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -46,8 +46,6 @@ /* Fixed version number used by the WAL and WAL-index. */ #define WAL_MAX_VERSION 3007000 -static const uint32_t invalid_magic = 0x17171717; - /* clang-format off */ enum { @@ -786,15 +784,18 @@ static int vfs2_read(sqlite3_file *file, void *buf, int amt, sqlite3_int64 ofst) } /** - * Update the volatile state by exchanging WAL-cur and WAL-prev. + * Update the volatile state so that WAL-cur becomes WAL-prev and vice versa. + * + * This is implemented by exchanging the wal_cur and wal_prev file handles and + * fixed names, and flipping the moving name hard link to point to the opposite + * file. */ static void wal_swap(struct entry *e, const struct wal_hdr *hdr) { int rv; - /* Terminology: the outgoing WAL is the one that's moving - * from cur to prev. The incoming WAL is the one that's moving - * from prev to cur. */ + /* Terminology: the outgoing WAL is the one that's moving from cur to + * prev. The incoming WAL is the one that's moving from prev to cur. */ sqlite3_file *phys_outgoing = e->wal_cur; char *name_outgoing = e->wal_cur_fixed_name; sqlite3_file *phys_incoming = e->wal_prev; @@ -822,17 +823,6 @@ static void wal_swap(struct entry *e, const struct wal_hdr *hdr) (void)rv; rv = link(name_incoming, e->wal_moving_name); (void)rv; - - /* Best-effort: invalidate the header of the outgoing physical WAL, so - * that it can't be mistakenly applied to the database. - - * This provides some protection against users manipulating the - * database with SQLite, or a bug in dqlite. But we don't rely on it - * for correctness. */ - (void)phys_outgoing->pMethods->xWrite(phys_outgoing, &invalid_magic, - sizeof(invalid_magic), 0); - - /* TODO do we need an fsync here? */ } static int vfs2_wal_write_frame_hdr(struct entry *e, @@ -1285,8 +1275,8 @@ static bool wal_hdr_is_valid(const struct wal_hdr *hdr) * Read the header of the given WAL file and detect corruption. * * If the file contains a valid header, returns this header in `hdr` and the - * size of the file in `size`. If the file is too short to contain a header, - * returns the size only. If the file can't be read, or it contains an invalid + * size of the file in `size`. If the file is empty, returns 0 in `size` only. + * If the file can't be read, or it is nonempty but doesn't contain a valid * header, returns an error. */ static int try_read_wal_hdr(sqlite3_file *wal, @@ -1299,9 +1289,12 @@ static int try_read_wal_hdr(sqlite3_file *wal, if (rv != SQLITE_OK) { return rv; } - if (*size < (sqlite3_int64)sizeof(*hdr)) { + if (*size == 0) { return SQLITE_OK; } + if (*size < (sqlite3_int64)sizeof(*hdr)) { + return SQLITE_CORRUPT; + } rv = wal->pMethods->xRead(wal, hdr, sizeof(*hdr), 0); if (rv != SQLITE_OK) { return rv; diff --git a/test/unit/test_vfs2.c b/test/unit/test_vfs2.c index 399fc41e5..d8ad1b9e1 100644 --- a/test/unit/test_vfs2.c +++ b/test/unit/test_vfs2.c @@ -283,9 +283,8 @@ TEST(vfs2, startup_frames_in_one, set_up, tear_down, 0, NULL) struct vfs2_wal_slice sl; OK(vfs2_poll(fp, NULL, &sl)); OK(sqlite3_close(db)); - /* WAL2 has the frames. The value 4 here reflect the invalid magic - * number that we write to the outgoing WAL. */ - assert_wal_sizes(node, "test.db", 4, WAL_SIZE_FROM_FRAMES(2)); + /* WAL2 has the frames. */ + assert_wal_sizes(node, "test.db", 0, WAL_SIZE_FROM_FRAMES(2)); db = node_open_db(node, "test.db"); fp = main_file(db); @@ -381,7 +380,7 @@ TEST(vfs2, leader_and_follower, set_up, tear_down, 0, NULL) OK(sqlite3_exec(leader_db, "CREATE TABLE foo (n INTEGER)", NULL, NULL, NULL)); /* WAL2 gets the frames after a WAL swap. */ - assert_wal_sizes(leader, "test.db", 4, WAL_SIZE_FROM_FRAMES(2)); + assert_wal_sizes(leader, "test.db", 0, WAL_SIZE_FROM_FRAMES(2)); sqlite3_file *leader_fp = main_file(leader_db); dqlite_vfs_frame *frames; struct vfs2_wal_slice leader_sl; @@ -398,7 +397,7 @@ TEST(vfs2, leader_and_follower, set_up, tear_down, 0, NULL) OK(vfs2_add_uncommitted(follower_fp, PAGE_SIZE, frames, leader_sl.len, &follower_sl)); /* WAL2 gets the frames after a WAL swap. */ - assert_wal_sizes(follower, "test.db", 4, WAL_SIZE_FROM_FRAMES(2)); + assert_wal_sizes(follower, "test.db", 0, WAL_SIZE_FROM_FRAMES(2)); sqlite3_free(frames[0].data); sqlite3_free(frames[1].data); sqlite3_free(frames); From 1381861362afa7dd486e697182ca8e43fd9e00ad Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Thu, 1 Aug 2024 14:19:01 -0500 Subject: [PATCH 61/73] Fix off-by-one errors in shm region selection Signed-off-by: Cole Miller --- src/vfs2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/vfs2.c b/src/vfs2.c index 53acf35e7..429185235 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -516,7 +516,7 @@ static int shm_add_pgno(struct shm *shm, uint32_t frame, uint32_t pgno) return SQLITE_OK; } - uint32_t regno = (frame - SHM_SHORT_PGNOS_LEN) / SHM_LONG_PGNOS_LEN; + uint32_t regno = 1 + (frame - SHM_SHORT_PGNOS_LEN) / SHM_LONG_PGNOS_LEN; uint32_t index = (frame - SHM_SHORT_PGNOS_LEN) % SHM_LONG_PGNOS_LEN; PRE(regno <= (uint32_t)shm->num_regions + 1); struct shm_region *region; @@ -1332,7 +1332,7 @@ static void shm_update_ht(struct shm *shm, uint32_t frame) return; } - uint32_t regno = (frame - SHM_SHORT_PGNOS_LEN) / SHM_LONG_PGNOS_LEN; + uint32_t regno = 1 + (frame - SHM_SHORT_PGNOS_LEN) / SHM_LONG_PGNOS_LEN; uint32_t index = (frame - SHM_SHORT_PGNOS_LEN) % SHM_LONG_PGNOS_LEN; PRE(regno <= (uint32_t)shm->num_regions); struct shm_region *region = shm->regions[regno]; From 1583dd15069f1f2a9246444904305f1412cf3e0c Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Thu, 1 Aug 2024 14:20:36 -0500 Subject: [PATCH 62/73] Tweak shm_grow Signed-off-by: Cole Miller --- src/vfs2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vfs2.c b/src/vfs2.c index 429185235..4197357b0 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -427,7 +427,7 @@ static struct shm_region *shm_grow(struct shm *shm) } p[index] = r; shm->regions = p; - shm->num_regions++; + shm->num_regions = index + 1; return r; err_after_alloc_region: From 01f2f92b60f42539c3d073aa4130a04c981f3f0e Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Thu, 1 Aug 2024 14:26:18 -0500 Subject: [PATCH 63/73] Assert that SQLite doesn't try to delete the WAL-index Also check that there is an outstanding map before decrementing the refcount. Signed-off-by: Cole Miller --- src/vfs2.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/vfs2.c b/src/vfs2.c index 4197357b0..1a25e62c2 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -1214,11 +1214,17 @@ static void vfs2_shm_barrier(sqlite3_file *file) (void)file; } -static int vfs2_shm_unmap(sqlite3_file *file, int delete) -{ - (void)delete; +static int vfs2_shm_unmap(sqlite3_file *file, int delete) { + /* SQLite sends this flag with the unmap method when the last + * connection to a database runs a complete checkpoint in sqlite3_close + * and then deletes the WAL file. The interpretation is that the + * WAL-index should be deleted as well. dqlite both disables + * checkpoint-on-close and instructs SQLite not to delete the WAL, so + * we don't expect to ever receive the flag. */ + PRE(delete == 0); struct file *xfile = (struct file *)file; struct entry *e = xfile->entry; + PRE(e->shm.refcount > 0); e->shm.refcount--; return SQLITE_OK; } From c296805d4c4e986bfcf5063284a18ffb0433d240 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Thu, 1 Aug 2024 14:29:44 -0500 Subject: [PATCH 64/73] Add missing error handling for shm_add_pgno Signed-off-by: Cole Miller --- src/vfs2.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/vfs2.c b/src/vfs2.c index 1a25e62c2..caa1f8163 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -2304,9 +2304,16 @@ int vfs2_add_uncommitted(sqlite3_file *file, * overwritten before being committed. Updating the page number array * "early" like this is harmless and saves us from having to stash the * page numbers somewhere else in memory in between add_uncommitted and - * apply, or (worse) read them back from WAL-cur. */ + * apply, or (worse) read them back from WAL-cur. + * + * TODO(cole) we can simplify the error handling by requesting the shm + * to grow as much as necessary at this point, before we have written + * the frames. */ uint32_t pgno = (uint32_t)frames[0].page_number; - shm_add_pgno(&e->shm, e->wal_cursor, pgno); + rv = shm_add_pgno(&e->shm, e->wal_cursor, pgno); + if (rv != SQLITE_OK) { + return 1; + } /* With every frame, we make this update. The interpretation is that * db_size would be the size of the main file in pages if we @@ -2323,7 +2330,10 @@ int vfs2_add_uncommitted(sqlite3_file *file, for (unsigned i = 1; i < len; i++) { pgno = (uint32_t)frames[i].page_number; - shm_add_pgno(&e->shm, e->wal_cursor, pgno); + rv = shm_add_pgno(&e->shm, e->wal_cursor, pgno); + if (rv != SQLITE_OK) { + return 1; + } db_size = MAX(db_size, pgno); sums.cksum1 = ByteGetBe32(fhdr.cksum1); sums.cksum2 = ByteGetBe32(fhdr.cksum2); From b3ca21c68a5e52276030bc0911f20b2515a47bac Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Thu, 1 Aug 2024 14:33:41 -0500 Subject: [PATCH 65/73] Add a TODO about the proper handling of stopping points in walk_wal Signed-off-by: Cole Miller --- src/vfs2.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/vfs2.c b/src/vfs2.c index caa1f8163..5071fe25d 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -1416,8 +1416,19 @@ static int walk_wal(sqlite3_file *wal, } /* Check whether we have been provided a stopping point that corresponds - * to a transaction in the current WAL. (It's possible that the stopping - * point corresponds to a transaction that's in WAL-prev instead.) */ + * to a transaction in the current WAL. + * + * TODO(cole) we shouldn't just ignore the stopping point if the salts + * don't match WAL-cur---there are a few possibilities that need to be + * handled differently: + * + * - the stopping point could be at the end of WAL-prev; we should interpret + * this as an instruction to ignore all the frames in WAL-cur + * - the stopping point could be in the interior of WAL-prev: this means + * something has gone badly wrong, since everything in WAL-prev should + * be committed + * - the salts for the stopping point match neither WAL-cur nor WAL-prev: + * this indicates that something has gone wrong in a bizarre way */ bool have_stop = stop != NULL && salts_equal(stop->salts, hdr.salts); uint8_t *page_buf = sqlite3_malloc64(page_size); From 2b2877b6e1d5bf5066720a359b2ee9c2b544bc97 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Thu, 1 Aug 2024 14:35:23 -0500 Subject: [PATCH 66/73] Correct description of the WTX_BASE state Signed-off-by: Cole Miller --- src/vfs2.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/vfs2.c b/src/vfs2.c index 5071fe25d..5f77c70a7 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -56,8 +56,9 @@ enum { WTX_EMPTY, /* Non-leader, at least one transaction in WAL-cur is not committed. */ WTX_FOLLOWING, - /* Leader, all transactions in WAL-cur are committed (but at least one - is not checkpointed). */ + /* All transactions in WAL-cur are committed, but at least one is not + * checkpointed. (Both leaders and non-leaders can be in this state). + */ WTX_BASE, /* Leader, transaction in progress. */ WTX_ACTIVE, From 64c3600ee6f634dda8bbb113d685b48d49aa4613 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Thu, 1 Aug 2024 14:36:00 -0500 Subject: [PATCH 67/73] Add vfs2_unhide to wtx sm diagram Signed-off-by: Cole Miller --- src/vfs2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/vfs2.c b/src/vfs2.c index 5f77c70a7..813aa165f 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -86,8 +86,8 @@ enum { * | v v-+ xWrite(WAL) * |--ACTIVE-+ * | | - * vfs2_abort | | COMMIT_PHASETWO - * | | + * vfs2_abort, | | COMMIT_PHASETWO + * vfs2_unhide | | * | v * |--HIDDEN * | | From 527881b83a3bec0937cc39a82fc942783b7390ca Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Thu, 1 Aug 2024 14:41:10 -0500 Subject: [PATCH 68/73] Correct flags for shm and ckpt state machines Signed-off-by: Cole Miller --- src/vfs2.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/vfs2.c b/src/vfs2.c index 813aa165f..d011b41cd 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -228,11 +228,13 @@ static const struct sm_conf shm_states[SHM_NR] = { }, [SHM_RECOVERED] = { .name = "recovered", - .allowed = BITS(SHM_MANAGED) + .allowed = BITS(SHM_MANAGED), + .flags = SM_FINAL }, [SHM_MANAGED] = { .name = "managed", - .allowed = BITS(SHM_MANAGED) + .allowed = BITS(SHM_MANAGED), + .flags = SM_FINAL }, }; @@ -250,7 +252,7 @@ static const struct sm_conf ckpt_states[CKPT_NR] = { [CKPT_QUIESCENT] = { .name = "quiescent", .allowed = BITS(CKPT_CHECKPOINTING), - .flags = SM_INITIAL + .flags = SM_INITIAL|SM_FINAL }, [CKPT_CHECKPOINTING] = { .name = "checkpointing", From 9ac2372e3b63a9d1599c4b545ca207a28afac880 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Thu, 1 Aug 2024 14:42:48 -0500 Subject: [PATCH 69/73] More explicit Signed-off-by: Cole Miller --- src/vfs2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/vfs2.c b/src/vfs2.c index d011b41cd..bdab21489 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -392,8 +392,8 @@ struct shm_region { * * Each region is stored in its own heap allocation of size * VFS2_WAL_INDEX_REGION_SIZE. Using separate allocations ensures that existing - * region pointers remain valid when new regions are mapped, as SQLite - * expects. + * region pointers held by SQLite remain valid when new regions are mapped, as + * SQLite expects. */ struct shm { struct shm_region **regions; From 0bfc0d97eeb0e99e1ca05d1429bd7eea30b3faaa Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Thu, 1 Aug 2024 14:55:02 -0500 Subject: [PATCH 70/73] Remark about old frames in walk_wal Signed-off-by: Cole Miller --- src/vfs2.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/vfs2.c b/src/vfs2.c index bdab21489..868533ab2 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -1450,6 +1450,9 @@ static int walk_wal(sqlite3_file *wal, if (rv != SQLITE_OK) { goto err; } + /* If the salts for this frame don't match those in the WAL header, + * the frame is (probably) left over from a previous generation of + * the WAL, and so we've found the end of the current generation. */ if (!salts_equal(fhdr.salts, hdr.salts)) { break; } From f515bef950a0c137fd36d1904bf3883cd0a2a4b6 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Thu, 1 Aug 2024 15:35:59 -0500 Subject: [PATCH 71/73] Remove redundancy between shm_init and shm_restart Signed-off-by: Cole Miller --- src/vfs2.c | 44 ++++++++++++++++---------------------------- 1 file changed, 16 insertions(+), 28 deletions(-) diff --git a/src/vfs2.c b/src/vfs2.c index 868533ab2..32500e646 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -448,34 +448,6 @@ static void write_basic_hdr_cksums(struct wal_index_basic_hdr *bhdr) bhdr->cksums = sums; } -/** - * Perform initial setup of the WAL-index. - * - * This allocates the zeroth region and fills in the WAL-index header - * based on the provided header of WAL-cur. - */ -static int shm_init(struct shm *shm, struct wal_hdr whdr) -{ - struct shm_region *r0 = shm_grow(shm); - if (r0 == NULL) { - return SQLITE_NOMEM; - } - struct wal_index_full_hdr *ihdr = &r0->hdr; - ihdr->basic[0].iVersion = WAL_MAX_VERSION; - ihdr->basic[0].isInit = 1; - ihdr->basic[0].bigEndCksum = is_bigendian(); - ihdr->basic[0].szPage = (uint16_t)ByteGetBe32(whdr.page_size); - write_basic_hdr_cksums(&ihdr->basic[0]); - ihdr->basic[1] = ihdr->basic[0]; - ihdr->marks[0] = 0; - ihdr->marks[1] = READ_MARK_UNUSED; - ihdr->marks[2] = READ_MARK_UNUSED; - ihdr->marks[3] = READ_MARK_UNUSED; - ihdr->marks[4] = READ_MARK_UNUSED; - sm_move(&shm->sm, SHM_MANAGED); - return SQLITE_OK; -} - /** * Clear out all data in the WAL-index after a WAL swap, and re-initialize the * header. @@ -503,6 +475,22 @@ static void shm_restart(struct shm *shm, struct wal_hdr whdr) sm_move(&shm->sm, SHM_MANAGED); } +/** + * Perform initial setup of the WAL-index. + * + * This allocates the zeroth region and fills in the WAL-index header + * based on the provided header of WAL-cur. + */ +static int shm_init(struct shm *shm, struct wal_hdr whdr) +{ + struct shm_region *r0 = shm_grow(shm); + if (r0 == NULL) { + return SQLITE_NOMEM; + } + shm_restart(shm, whdr); + return SQLITE_OK; +} + /** * Add the page number for a frame to the appropriate page number array * in the WAL-index. From fe47c99de9f2e2e249ffa0f7a2709f7d627baee2 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Thu, 1 Aug 2024 16:10:36 -0500 Subject: [PATCH 72/73] Fold the first frame write into the loop in vfs2_add_uncommitted Signed-off-by: Cole Miller --- src/vfs2.c | 71 +++++++++++++++++++++++++----------------------------- 1 file changed, 33 insertions(+), 38 deletions(-) diff --git a/src/vfs2.c b/src/vfs2.c index 32500e646..f35dc9672 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -2300,49 +2300,44 @@ int vfs2_add_uncommitted(sqlite3_file *file, } POST(db_size > 0); - /* Record the new frame in the appropriate page number array in the WAL - * index. Note that this doesn't make the frame visible to readers: that - * only happens once it is also recorded in the WAL-index hash array and - * mxFrame exceeds the frame index. The hash array is updated only when - * we increase mxFrame, so that we don't have to deal with the added - * complexity of removing things from the hash table when frames are - * overwritten before being committed. Updating the page number array - * "early" like this is harmless and saves us from having to stash the - * page numbers somewhere else in memory in between add_uncommitted and - * apply, or (worse) read them back from WAL-cur. - * - * TODO(cole) we can simplify the error handling by requesting the shm - * to grow as much as necessary at this point, before we have written - * the frames. */ - uint32_t pgno = (uint32_t)frames[0].page_number; - rv = shm_add_pgno(&e->shm, e->wal_cursor, pgno); - if (rv != SQLITE_OK) { - return 1; - } - - /* With every frame, we make this update. The interpretation is that - * db_size would be the size of the main file in pages if we - * checkpointing up to and including the current frame. Then we use the - * final value to write the commit marker of the last frame. */ - db_size = MAX(db_size, pgno); - - uint32_t commit = len == 1 ? db_size : 0; - struct wal_frame_hdr fhdr = txn_frame_hdr(e, sums, &frames[0], commit); - rv = write_one_frame(e, fhdr, frames[0].data); - if (rv != SQLITE_OK) { - return 1; - } - - for (unsigned i = 1; i < len; i++) { - pgno = (uint32_t)frames[i].page_number; + struct wal_frame_hdr fhdr; + for (unsigned i = 0; i < len; i++) { + /* Record the new frame in the appropriate page number array in + * the WAL index. Note that this doesn't make the frame visible + * to readers: that only happens once it is also recorded in + * the WAL-index hash array and mxFrame exceeds the frame + * index. The hash array is updated only when we increase + * mxFrame, so that we don't have to deal with the added + * complexity of removing things from the hash table when + * frames are overwritten before being committed. Updating the + * page number array "early" like this is harmless and saves us + * from having to stash the page numbers somewhere else in + * memory in between add_uncommitted and apply, or (worse) read + * them back from WAL-cur. + * + * TODO(cole) we can simplify the error handling by requesting + * the shm to grow as much as necessary up front, before we + * have written the frames. */ + uint32_t pgno = (uint32_t)frames[i].page_number; rv = shm_add_pgno(&e->shm, e->wal_cursor, pgno); if (rv != SQLITE_OK) { return 1; } + + /* With every frame, we make this update. The interpretation is + * that db_size would be the size of the main file in pages if + * we checkpointing up to and including the current frame. Then + * we use the final value to write the commit marker of the + * last frame. */ db_size = MAX(db_size, pgno); - sums.cksum1 = ByteGetBe32(fhdr.cksum1); - sums.cksum2 = ByteGetBe32(fhdr.cksum2); - commit = i == len - 1 ? db_size : 0; + uint32_t commit = i == len - 1 ? db_size : 0; + + /* Keep the checksums rolling. */ + if (i > 0) { + sums.cksum1 = ByteGetBe32(fhdr.cksum1); + sums.cksum2 = ByteGetBe32(fhdr.cksum2); + } + fhdr = txn_frame_hdr(e, sums, &frames[i], commit); rv = write_one_frame(e, fhdr, frames[i].data); if (rv != SQLITE_OK) { From 3951d1e1f8ae3aaf86bfe0fbf94c415b3e8195e1 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Thu, 1 Aug 2024 16:10:43 -0500 Subject: [PATCH 73/73] Add a TODO about the broken read mark implementation Signed-off-by: Cole Miller --- src/vfs2.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/vfs2.c b/src/vfs2.c index f35dc9672..5eb03c19f 100644 --- a/src/vfs2.c +++ b/src/vfs2.c @@ -2364,6 +2364,8 @@ static unsigned read_lock(unsigned i) int vfs2_pseudo_read_begin(sqlite3_file *file, uint32_t target, unsigned *out) { + /* FIXME(cole) this implementation is incorrect and needs to be + * replaced. */ struct file *xfile = (struct file *)file; PRE(xfile->flags & SQLITE_OPEN_MAIN_DB); struct entry *e = xfile->entry;