Skip to content

Commit

Permalink
ARC: Move header reallocation under db_mtx lock
Browse files Browse the repository at this point in the history
As part of #14340 I've removed b_evict_lock, that played no role in
ARC eviction process.  But it appears it played a secondary role of
protecting b_hdr pointers in arc_buf_t during header reallocation
by arc_hdr_realloc_crypt(), that, as found in #15293, may cause use
after free races if some encrypted block is read while being synced
after being written.

After a closer look at b_evict_lock I still do not believe it covered
all the possible races, so I am not eager to resurrect it.  Instead
this refactors arc_hdr_realloc_crypt() into arc_realloc_crypt()
and moves its calls from arc_write_ready() into upper levels ready
callbacks, like dbuf_write_ready(), where it is protected by
existing db_mtx, protecting also all the arc buffer accesses.

Signed-off-by:	Alexander Motin <[email protected]>
Sponsored by:	iXsystems, Inc.
  • Loading branch information
amotin committed Sep 29, 2023
1 parent ba769ea commit 4fc1137
Show file tree
Hide file tree
Showing 5 changed files with 39 additions and 29 deletions.
1 change: 1 addition & 0 deletions include/sys/arc.h
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,7 @@ void arc_buf_info(arc_buf_t *buf, arc_buf_info_t *abi, int state_index);
uint64_t arc_buf_size(arc_buf_t *buf);
uint64_t arc_buf_lsize(arc_buf_t *buf);
void arc_buf_access(arc_buf_t *buf);
void arc_realloc_crypt(arc_buf_t *buf, boolean_t need_crypt);
void arc_release(arc_buf_t *buf, const void *tag);
int arc_released(arc_buf_t *buf);
void arc_buf_sigsegv(int sig, siginfo_t *si, void *unused);
Expand Down
52 changes: 28 additions & 24 deletions module/zfs/arc.c
Original file line number Diff line number Diff line change
Expand Up @@ -3453,11 +3453,10 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
* header and vice versa. If we are going to a crypt header, the
* new fields will be zeroed out.
*/
static arc_buf_hdr_t *
arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt)
void
arc_realloc_crypt(arc_buf_t *buf, boolean_t need_crypt)
{
arc_buf_hdr_t *nhdr;
arc_buf_t *buf;
arc_buf_hdr_t *hdr = buf->b_hdr, *nhdr;
kmem_cache_t *ncache, *ocache;

/*
Expand All @@ -3467,11 +3466,15 @@ arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt)
*/
ASSERT(HDR_HAS_L1HDR(hdr));
ASSERT(!HDR_HAS_L2HDR(hdr));
ASSERT3U(!!HDR_PROTECTED(hdr), !=, need_crypt);
ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
ASSERT(!list_link_active(&hdr->b_l2hdr.b_l2node));
ASSERT3P(hdr->b_hash_next, ==, NULL);
ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf);
ASSERT3P(buf->b_next, ==, NULL);

if (need_crypt == !!HDR_PROTECTED(hdr))
return;

if (need_crypt) {
ncache = hdr_full_crypt_cache;
Expand All @@ -3495,29 +3498,28 @@ arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt)
nhdr->b_psize = hdr->b_psize;
nhdr->b_lsize = hdr->b_lsize;
nhdr->b_spa = hdr->b_spa;
#ifdef ZFS_DEBUG
nhdr->b_l1hdr.b_freeze_cksum = hdr->b_l1hdr.b_freeze_cksum;
#endif
nhdr->b_l1hdr.b_bufcnt = hdr->b_l1hdr.b_bufcnt;
nhdr->b_l1hdr.b_byteswap = hdr->b_l1hdr.b_byteswap;
nhdr->b_l1hdr.b_state = hdr->b_l1hdr.b_state;
nhdr->b_l1hdr.b_arc_access = hdr->b_l1hdr.b_arc_access;
nhdr->b_l1hdr.b_mru_hits = hdr->b_l1hdr.b_mru_hits;
nhdr->b_l1hdr.b_mru_ghost_hits = hdr->b_l1hdr.b_mru_ghost_hits;
nhdr->b_l1hdr.b_mfu_hits = hdr->b_l1hdr.b_mfu_hits;
nhdr->b_l1hdr.b_mfu_ghost_hits = hdr->b_l1hdr.b_mfu_ghost_hits;
nhdr->b_l1hdr.b_bufcnt = hdr->b_l1hdr.b_bufcnt;
nhdr->b_l1hdr.b_buf = hdr->b_l1hdr.b_buf;
nhdr->b_l1hdr.b_acb = hdr->b_l1hdr.b_acb;
nhdr->b_l1hdr.b_pabd = hdr->b_l1hdr.b_pabd;
#ifdef ZFS_DEBUG
nhdr->b_l1hdr.b_freeze_cksum = hdr->b_l1hdr.b_freeze_cksum;
#endif

/*
* This zfs_refcount_add() exists only to ensure that the individual
* arc buffers always point to a header that is referenced, avoiding
* a small race condition that could trigger ASSERTs.
*/
(void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, FTAG);
nhdr->b_l1hdr.b_buf = hdr->b_l1hdr.b_buf;
for (buf = nhdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next)
buf->b_hdr = nhdr;
buf->b_hdr = nhdr;

zfs_refcount_transfer(&nhdr->b_l1hdr.b_refcnt, &hdr->b_l1hdr.b_refcnt);
(void) zfs_refcount_remove(&nhdr->b_l1hdr.b_refcnt, FTAG);
Expand All @@ -3537,20 +3539,20 @@ arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt)
hdr->b_psize = 0;
hdr->b_lsize = 0;
hdr->b_spa = 0;
#ifdef ZFS_DEBUG
hdr->b_l1hdr.b_freeze_cksum = NULL;
#endif
hdr->b_l1hdr.b_buf = NULL;
hdr->b_l1hdr.b_bufcnt = 0;
hdr->b_l1hdr.b_byteswap = 0;
hdr->b_l1hdr.b_state = NULL;
hdr->b_l1hdr.b_arc_access = 0;
hdr->b_l1hdr.b_mru_hits = 0;
hdr->b_l1hdr.b_mru_ghost_hits = 0;
hdr->b_l1hdr.b_mfu_hits = 0;
hdr->b_l1hdr.b_mfu_ghost_hits = 0;
hdr->b_l1hdr.b_bufcnt = 0;
hdr->b_l1hdr.b_buf = NULL;
hdr->b_l1hdr.b_acb = NULL;
hdr->b_l1hdr.b_pabd = NULL;
#ifdef ZFS_DEBUG
hdr->b_l1hdr.b_freeze_cksum = NULL;
#endif

if (ocache == hdr_full_crypt_cache) {
ASSERT(!HDR_HAS_RABD(hdr));
Expand All @@ -3564,8 +3566,6 @@ arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt)

buf_discard_identity(hdr);
kmem_cache_free(ocache, hdr);

return (nhdr);
}

/*
Expand All @@ -3587,8 +3587,8 @@ arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder,
ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);

buf->b_flags |= (ARC_BUF_FLAG_COMPRESSED | ARC_BUF_FLAG_ENCRYPTED);
if (!HDR_PROTECTED(hdr))
hdr = arc_hdr_realloc_crypt(hdr, B_TRUE);
arc_realloc_crypt(buf, B_TRUE);
hdr = buf->b_hdr;
hdr->b_crypt_hdr.b_dsobj = dsobj;
hdr->b_crypt_hdr.b_ot = ot;
hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ?
Expand Down Expand Up @@ -6545,16 +6545,20 @@ arc_write_ready(zio_t *zio)

callback->awcb_ready(zio, buf, callback->awcb_private);

/*
* The upper level is required to call arc_realloc_crypt(),
* following its own buffer locking semantics.
*/
hdr = buf->b_hdr;
ASSERT3B(BP_IS_PROTECTED(bp), ==, !!HDR_PROTECTED(hdr));

if (HDR_IO_IN_PROGRESS(hdr)) {
ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED);
} else {
arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
add_reference(hdr, hdr); /* For IO_IN_PROGRESS. */
}

if (BP_IS_PROTECTED(bp) != !!HDR_PROTECTED(hdr))
hdr = arc_hdr_realloc_crypt(hdr, BP_IS_PROTECTED(bp));

if (BP_IS_PROTECTED(bp)) {
/* ZIL blocks are written through zio_rewrite */
ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG);
Expand Down
1 change: 1 addition & 0 deletions module/zfs/dbuf.c
Original file line number Diff line number Diff line change
Expand Up @@ -4653,6 +4653,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
}

mutex_enter(&db->db_mtx);
arc_realloc_crypt(buf, BP_IS_PROTECTED(bp));

#ifdef ZFS_DEBUG
if (db->db_blkid == DMU_SPILL_BLKID) {
Expand Down
9 changes: 6 additions & 3 deletions module/zfs/dmu.c
Original file line number Diff line number Diff line change
Expand Up @@ -1550,18 +1550,21 @@ typedef struct {
static void
dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
{
(void) buf;
dmu_sync_arg_t *dsa = varg;
dmu_buf_t *db = dsa->dsa_zgd->zgd_db;
dmu_buf_impl_t *db = (dmu_buf_impl_t *)dsa->dsa_zgd->zgd_db;
blkptr_t *bp = zio->io_bp;

mutex_enter(&db->db_mtx);
arc_realloc_crypt(buf, BP_IS_PROTECTED(bp));
mutex_exit(&db->db_mtx);

if (zio->io_error == 0) {
if (BP_IS_HOLE(bp)) {
/*
* A block of zeros may compress to a hole, but the
* block size still needs to be known for replay.
*/
BP_SET_LSIZE(bp, db->db_size);
BP_SET_LSIZE(bp, db->db.db_size);
} else if (!BP_IS_EMBEDDED(bp)) {
ASSERT(BP_GET_LEVEL(bp) == 0);
BP_SET_FILL(bp, 1);
Expand Down
5 changes: 3 additions & 2 deletions module/zfs/dmu_objset.c
Original file line number Diff line number Diff line change
Expand Up @@ -1570,9 +1570,8 @@ dmu_objset_sync_dnodes(multilist_sublist_t *list, dmu_tx_t *tx)
}

static void
dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
dmu_objset_write_ready(zio_t *zio, arc_buf_t *buf, void *arg)
{
(void) abuf;
blkptr_t *bp = zio->io_bp;
objset_t *os = arg;
dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
Expand All @@ -1582,6 +1581,8 @@ dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
ASSERT0(BP_GET_LEVEL(bp));

arc_realloc_crypt(buf, BP_IS_PROTECTED(bp));

/*
* Update rootbp fill count: it should be the number of objects
* allocated in the object set (not counting the "special"
Expand Down

0 comments on commit 4fc1137

Please sign in to comment.