From 18b3bea861cfe2cd88bb0dd160fe412575fed6e6 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Tue, 17 Dec 2024 08:58:33 -0800 Subject: [PATCH 01/30] CI: Add FreeBSD 14.2 RELEASE+STABLE builds Update the CI to include FreeBSD 14.2 as a regularly tested platform. Reviewed-by: Tino Reichardt Reviewed-by: Alexander Motin Signed-off-by: Brian Behlendorf Closes #16869 --- .github/workflows/scripts/qemu-2-start.sh | 20 ++++++++++---------- .github/workflows/zfs-qemu.yml | 9 +++++---- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/.github/workflows/scripts/qemu-2-start.sh b/.github/workflows/scripts/qemu-2-start.sh index 39ac92107b71..f0f505cac4b7 100755 --- a/.github/workflows/scripts/qemu-2-start.sh +++ b/.github/workflows/scripts/qemu-2-start.sh @@ -14,7 +14,7 @@ OSv=$OS # compressed with .zst extension REPO="https://github.com/mcmilk/openzfs-freebsd-images" -FREEBSD="$REPO/releases/download/v2024-10-05" +FREEBSD="$REPO/releases/download/v2024-12-14" URLzs="" # Ubuntu mirrors @@ -76,28 +76,28 @@ case "$OS" in BASH="/usr/local/bin/bash" NIC="rtl8139" ;; - freebsd14-0r) - OSNAME="FreeBSD 14.0-RELEASE" - OSv="freebsd14.0" - URLzs="$FREEBSD/amd64-freebsd-14.0-RELEASE.qcow2.zst" - BASH="/usr/local/bin/bash" - ;; freebsd14-1r) OSNAME="FreeBSD 14.1-RELEASE" OSv="freebsd14.0" URLzs="$FREEBSD/amd64-freebsd-14.1-RELEASE.qcow2.zst" BASH="/usr/local/bin/bash" ;; + freebsd14-2r) + OSNAME="FreeBSD 14.2-RELEASE" + OSv="freebsd14.0" + URLzs="$FREEBSD/amd64-freebsd-14.2-RELEASE.qcow2.zst" + BASH="/usr/local/bin/bash" + ;; freebsd13-4s) OSNAME="FreeBSD 13.4-STABLE" OSv="freebsd13.0" URLzs="$FREEBSD/amd64-freebsd-13.4-STABLE.qcow2.zst" BASH="/usr/local/bin/bash" ;; - freebsd14-1s) - OSNAME="FreeBSD 14.1-STABLE" + freebsd14-2s) + OSNAME="FreeBSD 14.2-STABLE" OSv="freebsd14.0" - URLzs="$FREEBSD/amd64-freebsd-14.1-STABLE.qcow2.zst" + URLzs="$FREEBSD/amd64-freebsd-14.2-STABLE.qcow2.zst" BASH="/usr/local/bin/bash" ;; freebsd15-0c) diff --git 
a/.github/workflows/zfs-qemu.yml b/.github/workflows/zfs-qemu.yml index e90030f4c02e..4748e90db50b 100644 --- a/.github/workflows/zfs-qemu.yml +++ b/.github/workflows/zfs-qemu.yml @@ -22,8 +22,8 @@ jobs: - name: Generate OS config and CI type id: os run: | - FULL_OS='["almalinux8", "almalinux9", "centos-stream9", "debian11", "debian12", "fedora40", "fedora41", "freebsd13-4r", "freebsd14-0r", "freebsd14-1s", "ubuntu20", "ubuntu22", "ubuntu24"]' - QUICK_OS='["almalinux8", "almalinux9", "debian12", "fedora41", "freebsd13-3r", "freebsd14-1r", "ubuntu24"]' + FULL_OS='["almalinux8", "almalinux9", "centos-stream9", "debian11", "debian12", "fedora40", "fedora41", "freebsd13-3r", "freebsd13-4s", "freebsd14-1r", "freebsd14-2s", "freebsd15-0c", "ubuntu20", "ubuntu22", "ubuntu24"]' + QUICK_OS='["almalinux8", "almalinux9", "debian12", "fedora41", "freebsd13-3r", "freebsd14-2r", "ubuntu24"]' # determine CI type when running on PR ci_type="full" if ${{ github.event_name == 'pull_request' }}; then @@ -49,8 +49,9 @@ jobs: # rhl: almalinux8, almalinux9, centos-stream9, fedora40, fedora41 # debian: debian11, debian12, ubuntu20, ubuntu22, ubuntu24 # misc: archlinux, tumbleweed - # FreeBSD Release: freebsd13-3r, freebsd13-4r, freebsd14-0r, freebsd14-1r - # FreeBSD Stable: freebsd13-4s, freebsd14-1s + # FreeBSD variants of 2024-12: + # FreeBSD Release: freebsd13-3r, freebsd13-4r, freebsd14-1r, freebsd14-2r + # FreeBSD Stable: freebsd13-4s, freebsd14-2s # FreeBSD Current: freebsd15-0c os: ${{ fromJson(needs.test-config.outputs.test_os) }} runs-on: ubuntu-24.04 From e90124a7c88d7876c02cb11280504dc83d374fbf Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 19 Dec 2024 10:25:12 +1100 Subject: [PATCH 02/30] zprop: fix value help for ZPOOL_PROP_CAPACITY It's a percentage and documented as such, but we were showing it as . 
Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Reviewed-by: George Melikov Signed-off-by: Rob Norris Closes #16881 --- module/zcommon/zpool_prop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c index a709679b9032..ea9eda4b316d 100644 --- a/module/zcommon/zpool_prop.c +++ b/module/zcommon/zpool_prop.c @@ -105,7 +105,7 @@ zpool_prop_init(void) PROP_READONLY, ZFS_TYPE_POOL, "", "FRAG", B_FALSE, sfeatures); zprop_register_number(ZPOOL_PROP_CAPACITY, "capacity", 0, PROP_READONLY, - ZFS_TYPE_POOL, "", "CAP", B_FALSE, sfeatures); + ZFS_TYPE_POOL, "", "CAP", B_FALSE, sfeatures); zprop_register_number(ZPOOL_PROP_GUID, "guid", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "GUID", B_TRUE, sfeatures); zprop_register_number(ZPOOL_PROP_LOAD_GUID, "load_guid", 0, From 166a7bc602ad56a818188799f20991734ac85411 Mon Sep 17 00:00:00 2001 From: Tino Reichardt Date: Thu, 19 Dec 2024 17:01:34 +0100 Subject: [PATCH 03/30] CI: Fix FreeBSD 13.4 STABLE build In #16869 we added FreeBSD 13.4 STABLE, but forgot the special case that the virtio nic within FreeBSD 13.x is buggy. This fix adds the needed rtl8139 nic to the VM. 
Reviewed-by: George Melikov Reviewed-by: Alexander Motin Signed-off-by: Tino Reichardt Closes #16885 --- .github/workflows/scripts/qemu-2-start.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/scripts/qemu-2-start.sh b/.github/workflows/scripts/qemu-2-start.sh index f0f505cac4b7..0906e438ac0d 100755 --- a/.github/workflows/scripts/qemu-2-start.sh +++ b/.github/workflows/scripts/qemu-2-start.sh @@ -93,6 +93,7 @@ case "$OS" in OSv="freebsd13.0" URLzs="$FREEBSD/amd64-freebsd-13.4-STABLE.qcow2.zst" BASH="/usr/local/bin/bash" + NIC="rtl8139" ;; freebsd14-2s) OSNAME="FreeBSD 14.2-STABLE" From c944c46a98bb2ed3f9ef127b27cd9530262d65e9 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 20 Dec 2024 03:04:56 +1100 Subject: [PATCH 04/30] zfs_main: fix alignment on props usage output I guess we've got some long property names since this was first set up! Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: Brian Behlendorf Reviewed-by: Tony Hutter Reviewed-by: Alexander Motin Reviewed-by: George Melikov Signed-off-by: Rob Norris Closes #16883 --- cmd/zfs/zfs_main.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 7836f5909f4a..73ccf72d263c 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -500,7 +500,7 @@ usage_prop_cb(int prop, void *cb) { FILE *fp = cb; - (void) fprintf(fp, "\t%-15s ", zfs_prop_to_name(prop)); + (void) fprintf(fp, "\t%-22s ", zfs_prop_to_name(prop)); if (zfs_prop_readonly(prop)) (void) fprintf(fp, " NO "); @@ -561,40 +561,40 @@ usage(boolean_t requested) (void) fprintf(fp, "%s", gettext("\nThe following properties are supported:\n")); - (void) fprintf(fp, "\n\t%-14s %s %s %s\n\n", + (void) fprintf(fp, "\n\t%-21s %s %s %s\n\n", "PROPERTY", "EDIT", "INHERIT", "VALUES"); /* Iterate over all properties */ (void) zprop_iter(usage_prop_cb, fp, B_FALSE, B_TRUE, ZFS_TYPE_DATASET); - (void) fprintf(fp, "\t%-15s ", "userused@..."); 
+ (void) fprintf(fp, "\t%-22s ", "userused@..."); (void) fprintf(fp, " NO NO \n"); - (void) fprintf(fp, "\t%-15s ", "groupused@..."); + (void) fprintf(fp, "\t%-22s ", "groupused@..."); (void) fprintf(fp, " NO NO \n"); - (void) fprintf(fp, "\t%-15s ", "projectused@..."); + (void) fprintf(fp, "\t%-22s ", "projectused@..."); (void) fprintf(fp, " NO NO \n"); - (void) fprintf(fp, "\t%-15s ", "userobjused@..."); + (void) fprintf(fp, "\t%-22s ", "userobjused@..."); (void) fprintf(fp, " NO NO \n"); - (void) fprintf(fp, "\t%-15s ", "groupobjused@..."); + (void) fprintf(fp, "\t%-22s ", "groupobjused@..."); (void) fprintf(fp, " NO NO \n"); - (void) fprintf(fp, "\t%-15s ", "projectobjused@..."); + (void) fprintf(fp, "\t%-22s ", "projectobjused@..."); (void) fprintf(fp, " NO NO \n"); - (void) fprintf(fp, "\t%-15s ", "userquota@..."); + (void) fprintf(fp, "\t%-22s ", "userquota@..."); (void) fprintf(fp, "YES NO | none\n"); - (void) fprintf(fp, "\t%-15s ", "groupquota@..."); + (void) fprintf(fp, "\t%-22s ", "groupquota@..."); (void) fprintf(fp, "YES NO | none\n"); - (void) fprintf(fp, "\t%-15s ", "projectquota@..."); + (void) fprintf(fp, "\t%-22s ", "projectquota@..."); (void) fprintf(fp, "YES NO | none\n"); - (void) fprintf(fp, "\t%-15s ", "userobjquota@..."); + (void) fprintf(fp, "\t%-22s ", "userobjquota@..."); (void) fprintf(fp, "YES NO | none\n"); - (void) fprintf(fp, "\t%-15s ", "groupobjquota@..."); + (void) fprintf(fp, "\t%-22s ", "groupobjquota@..."); (void) fprintf(fp, "YES NO | none\n"); - (void) fprintf(fp, "\t%-15s ", "projectobjquota@..."); + (void) fprintf(fp, "\t%-22s ", "projectobjquota@..."); (void) fprintf(fp, "YES NO | none\n"); - (void) fprintf(fp, "\t%-15s ", "written@"); + (void) fprintf(fp, "\t%-22s ", "written@"); (void) fprintf(fp, " NO NO \n"); - (void) fprintf(fp, "\t%-15s ", "written#"); + (void) fprintf(fp, "\t%-22s ", "written#"); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, gettext("\nSizes are specified in bytes " From 
cb8da70329ad0806ca33263f18f3a72150aa7c62 Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Fri, 20 Dec 2024 01:02:58 +0500 Subject: [PATCH 05/30] Skip iterating over snapshots for share properties Setting sharenfs and sharesmb properties on a dataset can become costly if there are a large number of snapshots, since setting the share properties iterates over all snapshots present for a dataset. If it is the root dataset for which we are trying to set the share property, snapshots for all child datasets and their children will also be iterated. There is no need to iterate over snapshots for share properties because we do not allow share properties, or any other property, to be set on a snapshot itself except for user properties. This commit skips iterating over snapshots for share properties, and instead iterates over all child datasets and their children for share properties. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Umer Saleem Closes #16877 --- lib/libzfs/libzfs_changelist.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/lib/libzfs/libzfs_changelist.c b/lib/libzfs/libzfs_changelist.c index 4db1cbce9568..47df8663165e 100644 --- a/lib/libzfs/libzfs_changelist.c +++ b/lib/libzfs/libzfs_changelist.c @@ -563,8 +563,15 @@ change_one(zfs_handle_t *zhp, void *data) cn = NULL; } - if (!clp->cl_alldependents) - ret = zfs_iter_children_v2(zhp, 0, change_one, data); + if (!clp->cl_alldependents) { + if (clp->cl_prop != ZFS_PROP_MOUNTPOINT) { + ret = zfs_iter_filesystems_v2(zhp, 0, + change_one, data); + } else { + ret = zfs_iter_children_v2(zhp, 0, change_one, + data); + } + } /* * If we added the handle to the changelist, we will re-use it @@ -738,6 +745,11 @@ changelist_gather(zfs_handle_t *zhp, zfs_prop_t prop, int gather_flags, changelist_free(clp); return (NULL); } + } else if (clp->cl_prop != ZFS_PROP_MOUNTPOINT) { + if (zfs_iter_filesystems_v2(zhp, 0, change_one, clp) != 0) { + changelist_free(clp); + return 
(NULL); + } } else if (zfs_iter_children_v2(zhp, 0, change_one, clp) != 0) { changelist_free(clp); return (NULL); From f9b02fe7e394f22c68f3ffb0bf77f6f4b1ef6488 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 20 Dec 2024 17:25:35 -0500 Subject: [PATCH 06/30] Fix readonly check for vdev user properties VDEV_PROP_USERPROP is equal to VDEV_PROP_INVAL and so is not a real property. That's why vdev_prop_readonly() does not work right for it. In particular it may declare all vdev user properties readonly on FreeBSD. Reviewed-by: Brian Behlendorf Reviewed-by: Rob Norris Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16890 --- module/zfs/vdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 9f0f1dee656c..ac5230ebd51c 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -5969,7 +5969,7 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) goto end; } - if (vdev_prop_readonly(prop)) { + if (prop != VDEV_PROP_USERPROP && vdev_prop_readonly(prop)) { error = EROFS; goto end; } From 9519e7ebcc701ba26696180499155d100d2a6617 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 27 Dec 2024 09:10:09 +1100 Subject: [PATCH 07/30] microzap: set hard upper limit of 1M The count of chunks in a microzap block is stored as a uint16_t (mze_chunkid). Each chunk is 64 bytes, and the first is used to store a header, so there are 32767 usable chunks, which is just under 2M. 1M is the largest power-2-rounded block size under 2M, so we must set the limit there. If it goes higher, the loop in mzap_addent can overflow and fall into the PANIC case. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #16888 --- module/zfs/zap_micro.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c index 55b60006e58c..a9298d3e940e 100644 --- a/module/zfs/zap_micro.c +++ b/module/zfs/zap_micro.c @@ -54,14 +54,25 @@ * machinery to understand not to try to split a microzap block). * * If large_microzap is enabled, this value will be clamped to - * spa_maxblocksize(). If not, it will be clamped to SPA_OLD_MAXBLOCKSIZE. + * spa_maxblocksize(), up to 1M. If not, it will be clamped to + * SPA_OLD_MAXBLOCKSIZE. */ static int zap_micro_max_size = SPA_OLD_MAXBLOCKSIZE; +/* + * The 1M upper limit is necessary because the count of chunks in a microzap + * block is stored as a uint16_t (mze_chunkid). Each chunk is 64 bytes, and the + * first is used to store a header, so there are 32767 usable chunks, which is + * just under 2M. 1M is the largest power-2-rounded block size under 2M, so we + * must set the limit there. 
+ */ +#define MZAP_MAX_SIZE (1048576) + uint64_t zap_get_micro_max_size(spa_t *spa) { - uint64_t maxsz = P2ROUNDUP(zap_micro_max_size, SPA_MINBLOCKSIZE); + uint64_t maxsz = MIN(MZAP_MAX_SIZE, + P2ROUNDUP(zap_micro_max_size, SPA_MINBLOCKSIZE)); if (maxsz <= SPA_OLD_MAXBLOCKSIZE) return (maxsz); if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_MICROZAP)) @@ -2031,5 +2042,6 @@ EXPORT_SYMBOL(zap_cursor_init_serialized); EXPORT_SYMBOL(zap_get_stats); ZFS_MODULE_PARAM(zfs, , zap_micro_max_size, INT, ZMOD_RW, - "Maximum micro ZAP size, before converting to a fat ZAP, in bytes"); + "Maximum micro ZAP size before converting to a fat ZAP, " + "in bytes (max 1M)"); #endif From 30b97ce218c986fecafe2dddd50c4edbe6250474 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 27 Dec 2024 10:01:22 -0500 Subject: [PATCH 08/30] ZTS: Increase write sizes for RAIDZ/dRAID tests Many RAIDZ/dRAID tests filled files doing millions of 100 or even 10 byte writes. It makes very little sense since we are not micro-benchmarking syscalls or VFS layer here, while before the blocks reach the vdev layer absolute majority of the small writes will be aggregated. In some cases I see we spend almost as much time creating the test files as actually running the tests. And sometimes the tests even time out after that. Reviewed-by: Tony Hutter Reviewed-by: George Melikov Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #16905 --- .../tests/functional/raidz/raidz_expand_001_pos.ksh | 6 +++--- .../tests/functional/raidz/raidz_expand_002_pos.ksh | 6 +++--- .../tests/functional/raidz/raidz_expand_003_neg.ksh | 2 +- .../tests/functional/raidz/raidz_expand_003_pos.ksh | 4 ++-- .../tests/functional/raidz/raidz_expand_004_pos.ksh | 4 ++-- .../tests/functional/raidz/raidz_expand_005_pos.ksh | 4 ++-- .../tests/functional/redundancy/redundancy_draid.ksh | 6 +++--- .../functional/redundancy/redundancy_draid_damaged1.ksh | 6 +++--- .../functional/redundancy/redundancy_draid_damaged2.ksh | 6 +++--- .../tests/functional/redundancy/redundancy_raidz.ksh | 6 +++--- 10 files changed, 25 insertions(+), 25 deletions(-) diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_001_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_001_pos.ksh index d4923fdb67d9..125b0e5411a3 100755 --- a/tests/zfs-tests/tests/functional/raidz/raidz_expand_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_001_pos.ksh @@ -200,13 +200,13 @@ log_must zpool create -f -o cachefile=none $TESTPOOL $raid ${disks[@]} log_must zfs set primarycache=metadata $TESTPOOL log_must zfs create $TESTPOOL/fs -log_must fill_fs /$TESTPOOL/fs 1 512 100 1024 R +log_must fill_fs /$TESTPOOL/fs 1 512 102400 1 R log_must zfs create -o compress=on $TESTPOOL/fs2 -log_must fill_fs /$TESTPOOL/fs2 1 512 100 1024 R +log_must fill_fs /$TESTPOOL/fs2 1 512 102400 1 R log_must zfs create -o compress=on -o recordsize=8k $TESTPOOL/fs3 -log_must fill_fs /$TESTPOOL/fs3 1 512 100 1024 R +log_must fill_fs /$TESTPOOL/fs3 1 512 102400 1 R log_must check_pool_status $TESTPOOL "errors" "No known data errors" diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_002_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_002_pos.ksh index 56810aca099f..185316a7cb85 100755 --- a/tests/zfs-tests/tests/functional/raidz/raidz_expand_002_pos.ksh +++ 
b/tests/zfs-tests/tests/functional/raidz/raidz_expand_002_pos.ksh @@ -78,13 +78,13 @@ log_must zpool create -f $opts $pool $raid ${disks[1..$(($nparity+1))]} log_must zfs set primarycache=metadata $pool log_must zfs create $pool/fs -log_must fill_fs /$pool/fs 1 512 100 1024 R +log_must fill_fs /$pool/fs 1 512 102400 1 R log_must zfs create -o compress=on $pool/fs2 -log_must fill_fs /$pool/fs2 1 512 100 1024 R +log_must fill_fs /$pool/fs2 1 512 102400 1 R log_must zfs create -o compress=on -o recordsize=8k $pool/fs3 -log_must fill_fs /$pool/fs3 1 512 100 1024 R +log_must fill_fs /$pool/fs3 1 512 102400 1 R typeset pool_size=$(get_pool_prop size $pool) diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_neg.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_neg.ksh index 4d85c46897b8..a2eb87b1f722 100755 --- a/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_neg.ksh +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_neg.ksh @@ -92,7 +92,7 @@ log_must zpool destroy $pool log_must zpool create -f $opts $pool $raid ${disks[1..$(($devs-1))]} log_must zfs set primarycache=metadata $pool log_must zfs create $pool/fs -log_must fill_fs /$pool/fs 1 512 100 1024 R +log_must fill_fs /$pool/fs 1 512 102400 1 R allocated=$(zpool list -Hp -o allocated $pool) log_must set_tunable64 RAIDZ_EXPAND_MAX_REFLOW_BYTES $((allocated / 4)) log_must zpool attach $pool ${raid}-0 ${disks[$devs]} diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_pos.ksh index 712b25261773..6f852c516ca4 100755 --- a/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_pos.ksh @@ -94,10 +94,10 @@ opts="-o cachefile=none" log_must zpool create -f $opts $pool $raid ${disks[1..$(($nparity+1))]} log_must zfs create -o recordsize=8k $pool/fs -log_must fill_fs /$pool/fs 1 256 100 1024 R +log_must fill_fs /$pool/fs 1 256 
102400 1 R log_must zfs create -o recordsize=128k $pool/fs2 -log_must fill_fs /$pool/fs2 1 256 100 1024 R +log_must fill_fs /$pool/fs2 1 256 102400 1 R for disk in ${disks[$(($nparity+2))..$devs]}; do log_must mkfile -n 400m /$pool/fs/file diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_004_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_004_pos.ksh index 2be55dae4254..5056e4e4b1fd 100755 --- a/tests/zfs-tests/tests/functional/raidz/raidz_expand_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_004_pos.ksh @@ -81,10 +81,10 @@ log_must set_tunable32 SCRUB_AFTER_EXPAND 0 log_must zpool create -f $opts $pool $raid ${disks[1..$(($nparity+1))]} log_must zfs create -o recordsize=8k $pool/fs -log_must fill_fs /$pool/fs 1 128 100 1024 R +log_must fill_fs /$pool/fs 1 128 102400 1 R log_must zfs create -o recordsize=128k $pool/fs2 -log_must fill_fs /$pool/fs2 1 128 100 1024 R +log_must fill_fs /$pool/fs2 1 128 102400 1 R for disk in ${disks[$(($nparity+2))..$devs]}; do log_must zpool attach $pool ${raid}-0 $disk diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_005_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_005_pos.ksh index 56ee3e9be67c..49b9f6c1d353 100755 --- a/tests/zfs-tests/tests/functional/raidz/raidz_expand_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_005_pos.ksh @@ -137,10 +137,10 @@ log_must zpool create -f $opts $pool $raid ${disks[1..$(($nparity+1))]} devices="${disks[1..$(($nparity+1))]}" log_must zfs create -o recordsize=8k $pool/fs -log_must fill_fs /$pool/fs 1 128 100 1024 R +log_must fill_fs /$pool/fs 1 128 102400 1 R log_must zfs create -o recordsize=128k $pool/fs2 -log_must fill_fs /$pool/fs2 1 128 100 1024 R +log_must fill_fs /$pool/fs2 1 128 102400 1 R for disk in ${disks[$(($nparity+2))..$devs]}; do # Set pause to some random value near halfway point diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid.ksh 
b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid.ksh index 8208d2b4a398..df113a98aa3c 100755 --- a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid.ksh +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid.ksh @@ -223,13 +223,13 @@ for nparity in 1 2 3; do log_must zfs set primarycache=metadata $TESTPOOL log_must zfs create $TESTPOOL/fs - log_must fill_fs /$TESTPOOL/fs 1 512 100 1024 R + log_must fill_fs /$TESTPOOL/fs 1 512 102400 1 R log_must zfs create -o compress=on $TESTPOOL/fs2 - log_must fill_fs /$TESTPOOL/fs2 1 512 100 1024 R + log_must fill_fs /$TESTPOOL/fs2 1 512 102400 1 R log_must zfs create -o compress=on -o recordsize=8k $TESTPOOL/fs3 - log_must fill_fs /$TESTPOOL/fs3 1 512 100 1024 R + log_must fill_fs /$TESTPOOL/fs3 1 512 102400 1 R typeset pool_size=$(get_pool_prop size $TESTPOOL) diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged1.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged1.ksh index 110c69159eb1..50d7358411dc 100755 --- a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged1.ksh +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged1.ksh @@ -119,13 +119,13 @@ for nparity in 1 2 3; do log_must zfs set primarycache=metadata $TESTPOOL log_must zfs create $TESTPOOL/fs - log_must fill_fs /$TESTPOOL/fs 1 512 100 1024 R + log_must fill_fs /$TESTPOOL/fs 1 512 102400 1 R log_must zfs create -o compress=on $TESTPOOL/fs2 - log_must fill_fs /$TESTPOOL/fs2 1 512 100 1024 R + log_must fill_fs /$TESTPOOL/fs2 1 512 102400 1 R log_must zfs create -o compress=on -o recordsize=8k $TESTPOOL/fs3 - log_must fill_fs /$TESTPOOL/fs3 1 512 100 1024 R + log_must fill_fs /$TESTPOOL/fs3 1 512 102400 1 R log_must zpool export $TESTPOOL log_must zpool import -o cachefile=none -d $dir $TESTPOOL diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged2.ksh 
b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged2.ksh index b0bb4ef84129..ad66f8633986 100755 --- a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged2.ksh +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged2.ksh @@ -94,13 +94,13 @@ for nparity in 1 2 3; do # log_must zfs set primarycache=metadata $TESTPOOL log_must zfs create $TESTPOOL/fs - log_must fill_fs /$TESTPOOL/fs 1 256 10 1024 R + log_must fill_fs /$TESTPOOL/fs 1 256 10240 1 R log_must zfs create -o compress=on $TESTPOOL/fs2 - log_must fill_fs /$TESTPOOL/fs2 1 256 10 1024 R + log_must fill_fs /$TESTPOOL/fs2 1 256 10240 1 R log_must zfs create -o compress=on -o recordsize=8k $TESTPOOL/fs3 - log_must fill_fs /$TESTPOOL/fs3 1 256 10 1024 R + log_must fill_fs /$TESTPOOL/fs3 1 256 10240 1 R log_must zpool export $TESTPOOL log_must zpool import -o cachefile=none -d $dir $TESTPOOL diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_raidz.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_raidz.ksh index 83cacda84b09..7de35c947fec 100755 --- a/tests/zfs-tests/tests/functional/redundancy/redundancy_raidz.ksh +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_raidz.ksh @@ -223,13 +223,13 @@ for nparity in 1 2 3; do log_must zfs set primarycache=metadata $TESTPOOL log_must zfs create $TESTPOOL/fs - log_must fill_fs /$TESTPOOL/fs 1 512 100 1024 R + log_must fill_fs /$TESTPOOL/fs 1 512 102400 1 R log_must zfs create -o compress=on $TESTPOOL/fs2 - log_must fill_fs /$TESTPOOL/fs2 1 512 100 1024 R + log_must fill_fs /$TESTPOOL/fs2 1 512 102400 1 R log_must zfs create -o compress=on -o recordsize=8k $TESTPOOL/fs3 - log_must fill_fs /$TESTPOOL/fs3 1 512 100 1024 R + log_must fill_fs /$TESTPOOL/fs3 1 512 102400 1 R typeset pool_size=$(get_pool_prop size $TESTPOOL) From 74064cb17557a9b9423d9afb09c7c2ab1d00e979 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 19 Dec 2024 20:11:54 +1100 Subject: [PATCH 09/30] 
zpool_get_vdev_prop_value: show missing vdev userprops If a vdev userprop is not found, present it as value '-', default source, so it matches the output from pool userprops. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #16887 --- lib/libzfs/libzfs_pool.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index f256535e8ea0..64f9d1f6eb49 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -5342,7 +5342,8 @@ zpool_get_vdev_prop_value(nvlist_t *nvprop, vdev_prop_t prop, char *prop_name, strval = fnvlist_lookup_string(nv, ZPROP_VALUE); } else { /* user prop not found */ - return (-1); + src = ZPROP_SRC_DEFAULT; + strval = "-"; } (void) strlcpy(buf, strval, len); if (srctype) From c3d2412b05bb5da0584139e58e69cf3ccf501ad0 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 20 Dec 2024 10:54:35 -0500 Subject: [PATCH 10/30] ZTS: Remove non-standard awk hex numbers usage FreeBSD recently removed non-standard hex numbers support from awk. Nor does it support the -n argument, which enables it in gawk. Instead of depending on those, rewrite the list_file_blocks() function to handle the hex math in shell instead of awk. Reviewed-by: Brian Behlendorf Reviewed-by: Tony Hutter Reviewed-by: Tino Reichardt Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #11141 --- tests/zfs-tests/include/blkdev.shlib | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/tests/zfs-tests/include/blkdev.shlib b/tests/zfs-tests/include/blkdev.shlib index 51eff3023e73..5b505f925286 100644 --- a/tests/zfs-tests/include/blkdev.shlib +++ b/tests/zfs-tests/include/blkdev.shlib @@ -556,27 +556,15 @@ function list_file_blocks # input_file # 512B blocks for ease of use with dd. 
# typeset level vdev path offset length - if awk -n '' 2>/dev/null; then - # gawk needs -n to decode hex - AWK='awk -n' - else - AWK='awk' - fi sync_all_pools true - zdb -dddddd $ds $objnum | $AWK -v pad=$((4<<20)) -v bs=512 ' + zdb -dddddd $ds $objnum | awk ' /^$/ { looking = 0 } looking { level = $2 field = 3 while (split($field, dva, ":") == 3) { - # top level vdev id - vdev = int(dva[1]) - # offset + 4M label/boot pad in 512B blocks - offset = (int("0x"dva[2]) + pad) / bs - # length in 512B blocks - len = int("0x"dva[3]) / bs - print level, vdev, offset, len + print level, int(dva[1]), "0x"dva[2], "0x"dva[3] ++field } @@ -585,7 +573,8 @@ function list_file_blocks # input_file ' | \ while read level vdev offset length; do for path in ${VDEV_MAP[$vdev][@]}; do - echo "$level $path $offset $length" + echo "$level $path $(( ($offset + (4<<20)) / 512 ))" \ + "$(( $length / 512 ))" done done 2>/dev/null } From 0f6d955a3585107fa6bb7884fa9a32a57067f918 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 25 Dec 2024 14:00:38 -0500 Subject: [PATCH 11/30] ZTS: Remove procfs use from zpool_import_status procfs might not be mounted on FreeBSD. Plus, checking for a specific PID might not be exactly reliable. Check for an empty list of jobs instead. A premature loop exit can result in a failed test and failed cleanup, also failing some of the following tests. Reviewed-by: Brian Behlendorf Reviewed-by: Tony Hutter Reviewed-by: Tino Reichardt Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #11141 --- .../cli_root/zpool_import/zpool_import_status.ksh | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_status.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_status.ksh index c96961bf6419..679362bbef50 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_status.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_status.ksh @@ -103,21 +103,16 @@ log_must zpool export $TESTPOOL1 log_must set_tunable64 METASLAB_DEBUG_LOAD 1 log_note "Starting zpool import in background at" $(date +'%H:%M:%S') zpool import -d $DEVICE_DIR -f $guid & -pid=$! # # capture progress until import is finished # -log_note waiting for pid $pid to exit kstat import_progress -while [[ -d /proc/"$pid" ]]; do +while [[ -n $(jobs) ]]; do line=$(kstat import_progress | grep -v pool_guid) if [[ -n $line ]]; then echo $line fi - if [[ -f /$TESTPOOL1/fs/00 ]]; then - break; - fi sleep 0.0001 done log_note "zpool import completed at" $(date +'%H:%M:%S') From 0fea7fc109109e3b1e8e5a081193ed02ad59fef7 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 25 Dec 2024 19:42:44 -0500 Subject: [PATCH 12/30] ZTS: Reduce file size in redacted_panic to 1GB This test takes 3 minutes on RELEASE FreeBSD bots, but on CURRENT, probably due to debugging it has in kernel, it does not complete within 10 minutes, ending up killed. As I see all the redacting here happens within the first ~128MB of the file, so I hope it won't matter if there is 1GB of data instead of 2GB. Reviewed-by: Brian Behlendorf Reviewed-by: Tony Hutter Reviewed-by: Tino Reichardt Signed-off-by:Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #11141 --- .../zfs-tests/tests/functional/redacted_send/redacted_panic.ksh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_panic.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_panic.ksh index 032d1fb91a2e..a2438c2cd731 100755 --- a/tests/zfs-tests/tests/functional/redacted_send/redacted_panic.ksh +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_panic.ksh @@ -39,7 +39,7 @@ function cleanup log_onexit cleanup log_must zfs create -o recsize=8k $sendfs -log_must dd if=/dev/urandom of=/$sendfs/file bs=1024k count=2048 +log_must dd if=/dev/urandom of=/$sendfs/file bs=1024k count=1024 log_must zfs snapshot $sendfs@init log_must zfs clone $sendfs@init $clone log_must stride_dd -i /dev/urandom -o /$clone/file -b 8192 -s 2 -c 7226 From b952e061df73faa80f5b82f37c2e5db638011c16 Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Mon, 30 Dec 2024 00:41:30 +0500 Subject: [PATCH 13/30] zvol: implement platform-independent part of block cloning In Linux, block devices currently lack support for `copy_file_range` API because the kernel does not provide the necessary functionality. However, there is an ongoing upstream effort to address this limitation: https://patchwork.kernel.org/project/dm-devel/cover/20240520102033.9361-1-nj.shetty@samsung.com/. We have adopted this upstream kernel patch into the TrueNAS kernel and made some additional modifications to enable block cloning specifically for the zvol block device. This patch implements the platform- independent portions of these changes for inclusion in OpenZFS. This patch does not introduce any new functionality directly into OpenZFS. The `TX_CLONE_RANGE` replay capability is only relevant when zvols are migrated to non-TrueNAS systems that support Clone Range replay in the ZIL. 
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Ameer Hamza Closes #16901 --- include/sys/zvol_impl.h | 5 + module/zfs/zfs_vnops.c | 2 +- module/zfs/zvol.c | 284 +++++++++++++++++++++++++++++++++++++++- 3 files changed, 289 insertions(+), 2 deletions(-) diff --git a/include/sys/zvol_impl.h b/include/sys/zvol_impl.h index 3cd0d78c353d..a8168850023a 100644 --- a/include/sys/zvol_impl.h +++ b/include/sys/zvol_impl.h @@ -88,6 +88,11 @@ int zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, int zvol_init_impl(void); void zvol_fini_impl(void); void zvol_wait_close(zvol_state_t *zv); +int zvol_clone_range(zvol_state_handle_t *, uint64_t, + zvol_state_handle_t *, uint64_t, uint64_t); +void zvol_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, + uint64_t off, uint64_t len, uint64_t blksz, const blkptr_t *bps, + size_t nbps); /* * platform dependent functions exported to platform independent code diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 6c15a5c472ea..8c8ed255e686 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -71,7 +71,7 @@ int zfs_bclone_enabled = 1; * a copy of the file and is therefore not the default. However, in certain * scenarios this behavior may be desirable so a tunable is provided. */ -static int zfs_bclone_wait_dirty = 0; +int zfs_bclone_wait_dirty = 0; /* * Enable Direct I/O. 
If this setting is 0, then all I/O requests will be diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index fec595b2c4c5..14a6219d19cd 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -93,6 +93,7 @@ unsigned int zvol_volmode = ZFS_VOLMODE_GEOM; struct hlist_head *zvol_htable; static list_t zvol_state_list; krwlock_t zvol_state_lock; +extern int zfs_bclone_wait_dirty; typedef enum { ZVOL_ASYNC_REMOVE_MINORS, @@ -516,6 +517,285 @@ zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap) return (error); } +/* + * Replay a TX_CLONE_RANGE ZIL transaction that didn't get committed + * after a system failure + */ +static int +zvol_replay_clone_range(void *arg1, void *arg2, boolean_t byteswap) +{ + zvol_state_t *zv = arg1; + lr_clone_range_t *lr = arg2; + objset_t *os = zv->zv_objset; + dmu_tx_t *tx; + int error; + uint64_t blksz; + uint64_t off; + uint64_t len; + + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lr->lr_common.lrc_reclen, >=, offsetof(lr_clone_range_t, + lr_bps[lr->lr_nbps])); + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + ASSERT(spa_feature_is_enabled(dmu_objset_spa(os), + SPA_FEATURE_BLOCK_CLONING)); + + off = lr->lr_offset; + len = lr->lr_length; + blksz = lr->lr_blksz; + + if ((off % blksz) != 0) { + return (SET_ERROR(EINVAL)); + } + + error = dnode_hold(os, ZVOL_OBJ, zv, &zv->zv_dn); + if (error != 0 || !zv->zv_dn) + return (error); + tx = dmu_tx_create(os); + dmu_tx_hold_clone_by_dnode(tx, zv->zv_dn, off, len); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) { + dmu_tx_abort(tx); + goto out; + } + error = dmu_brt_clone(zv->zv_objset, ZVOL_OBJ, off, len, + tx, lr->lr_bps, lr->lr_nbps); + if (error != 0) { + dmu_tx_commit(tx); + goto out; + } + + /* + * zil_replaying() not only check if we are replaying ZIL, but also + * updates the ZIL header to record replay progress. 
+ */ + VERIFY(zil_replaying(zv->zv_zilog, tx)); + dmu_tx_commit(tx); + +out: + dnode_rele(zv->zv_dn, zv); + zv->zv_dn = NULL; + return (error); +} + +int +zvol_clone_range(zvol_state_t *zv_src, uint64_t inoff, zvol_state_t *zv_dst, + uint64_t outoff, uint64_t len) +{ + zilog_t *zilog_dst; + zfs_locked_range_t *inlr, *outlr; + objset_t *inos, *outos; + dmu_tx_t *tx; + blkptr_t *bps; + size_t maxblocks; + int error = EINVAL; + + rw_enter(&zv_dst->zv_suspend_lock, RW_READER); + if (zv_dst->zv_zilog == NULL) { + rw_exit(&zv_dst->zv_suspend_lock); + rw_enter(&zv_dst->zv_suspend_lock, RW_WRITER); + if (zv_dst->zv_zilog == NULL) { + zv_dst->zv_zilog = zil_open(zv_dst->zv_objset, + zvol_get_data, &zv_dst->zv_kstat.dk_zil_sums); + zv_dst->zv_flags |= ZVOL_WRITTEN_TO; + VERIFY0((zv_dst->zv_zilog->zl_header->zh_flags & + ZIL_REPLAY_NEEDED)); + } + rw_downgrade(&zv_dst->zv_suspend_lock); + } + if (zv_src != zv_dst) + rw_enter(&zv_src->zv_suspend_lock, RW_READER); + + inos = zv_src->zv_objset; + outos = zv_dst->zv_objset; + + /* + * Sanity checks + */ + if (!spa_feature_is_enabled(dmu_objset_spa(outos), + SPA_FEATURE_BLOCK_CLONING)) { + error = EOPNOTSUPP; + goto out; + } + if (dmu_objset_spa(inos) != dmu_objset_spa(outos)) { + error = EXDEV; + goto out; + } + if (inos->os_encrypted != outos->os_encrypted) { + error = EXDEV; + goto out; + } + if (zv_src->zv_volblocksize != zv_dst->zv_volblocksize) { + error = EINVAL; + goto out; + } + if (inoff >= zv_src->zv_volsize || outoff >= zv_dst->zv_volsize) { + error = 0; + goto out; + } + + /* + * Do not read beyond boundary + */ + if (len > zv_src->zv_volsize - inoff) + len = zv_src->zv_volsize - inoff; + if (len > zv_dst->zv_volsize - outoff) + len = zv_dst->zv_volsize - outoff; + if (len == 0) { + error = 0; + goto out; + } + + /* + * No overlapping if we are cloning within the same file + */ + if (zv_src == zv_dst) { + if (inoff < outoff + len && outoff < inoff + len) { + error = EINVAL; + goto out; + } + } + + /* + * Offsets and 
length must be at block boundaries + */ + if ((inoff % zv_src->zv_volblocksize) != 0 || + (outoff % zv_dst->zv_volblocksize) != 0) { + error = EINVAL; + goto out; + } + + /* + * Length must be multiple of block size + */ + if ((len % zv_src->zv_volblocksize) != 0) { + error = EINVAL; + goto out; + } + + zilog_dst = zv_dst->zv_zilog; + maxblocks = zil_max_log_data(zilog_dst, sizeof (lr_clone_range_t)) / + sizeof (bps[0]); + bps = vmem_alloc(sizeof (bps[0]) * maxblocks, KM_SLEEP); + /* + * Maintain predictable lock order. + */ + if (zv_src < zv_dst || (zv_src == zv_dst && inoff < outoff)) { + inlr = zfs_rangelock_enter(&zv_src->zv_rangelock, inoff, len, + RL_READER); + outlr = zfs_rangelock_enter(&zv_dst->zv_rangelock, outoff, len, + RL_WRITER); + } else { + outlr = zfs_rangelock_enter(&zv_dst->zv_rangelock, outoff, len, + RL_WRITER); + inlr = zfs_rangelock_enter(&zv_src->zv_rangelock, inoff, len, + RL_READER); + } + + while (len > 0) { + uint64_t size, last_synced_txg; + size_t nbps = maxblocks; + size = MIN(zv_src->zv_volblocksize * maxblocks, len); + last_synced_txg = spa_last_synced_txg( + dmu_objset_spa(zv_src->zv_objset)); + error = dmu_read_l0_bps(zv_src->zv_objset, ZVOL_OBJ, inoff, + size, bps, &nbps); + if (error != 0) { + /* + * If we are trying to clone a block that was created + * in the current transaction group, the error will be + * EAGAIN here. Based on zfs_bclone_wait_dirty either + * return a shortened range to the caller so it can + * fallback, or wait for the next TXG and check again. 
+ */ + if (error == EAGAIN && zfs_bclone_wait_dirty) { + txg_wait_synced(dmu_objset_pool + (zv_src->zv_objset), last_synced_txg + 1); + continue; + } + break; + } + + tx = dmu_tx_create(zv_dst->zv_objset); + dmu_tx_hold_clone_by_dnode(tx, zv_dst->zv_dn, outoff, size); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) { + dmu_tx_abort(tx); + break; + } + error = dmu_brt_clone(zv_dst->zv_objset, ZVOL_OBJ, outoff, size, + tx, bps, nbps); + if (error != 0) { + dmu_tx_commit(tx); + break; + } + zvol_log_clone_range(zilog_dst, tx, TX_CLONE_RANGE, outoff, + size, zv_src->zv_volblocksize, bps, nbps); + dmu_tx_commit(tx); + inoff += size; + outoff += size; + len -= size; + } + vmem_free(bps, sizeof (bps[0]) * maxblocks); + zfs_rangelock_exit(outlr); + zfs_rangelock_exit(inlr); + if (error == 0 && zv_dst->zv_objset->os_sync == ZFS_SYNC_ALWAYS) { + zil_commit(zilog_dst, ZVOL_OBJ); + } +out: + if (zv_src != zv_dst) + rw_exit(&zv_src->zv_suspend_lock); + rw_exit(&zv_dst->zv_suspend_lock); + return (SET_ERROR(error)); +} + +/* + * Handles TX_CLONE_RANGE transactions. 
+ */ +void +zvol_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, uint64_t off, + uint64_t len, uint64_t blksz, const blkptr_t *bps, size_t nbps) +{ + itx_t *itx; + lr_clone_range_t *lr; + uint64_t partlen, max_log_data; + size_t partnbps; + + if (zil_replaying(zilog, tx)) + return; + + max_log_data = zil_max_log_data(zilog, sizeof (lr_clone_range_t)); + + while (nbps > 0) { + partnbps = MIN(nbps, max_log_data / sizeof (bps[0])); + partlen = partnbps * blksz; + ASSERT3U(partlen, <, len + blksz); + partlen = MIN(partlen, len); + + itx = zil_itx_create(txtype, + sizeof (*lr) + sizeof (bps[0]) * partnbps); + lr = (lr_clone_range_t *)&itx->itx_lr; + lr->lr_foid = ZVOL_OBJ; + lr->lr_offset = off; + lr->lr_length = partlen; + lr->lr_blksz = blksz; + lr->lr_nbps = partnbps; + memcpy(lr->lr_bps, bps, sizeof (bps[0]) * partnbps); + + zil_itx_assign(zilog, itx, tx); + + bps += partnbps; + ASSERT3U(nbps, >=, partnbps); + nbps -= partnbps; + off += partlen; + ASSERT3U(len, >=, partlen); + len -= partlen; + } +} + static int zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap) { @@ -540,7 +820,9 @@ zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = { zvol_replay_write, /* TX_WRITE */ zvol_replay_truncate, /* TX_TRUNCATE */ zvol_replay_err, /* TX_SETATTR */ + zvol_replay_err, /* TX_ACL_V0 */ zvol_replay_err, /* TX_ACL */ + zvol_replay_err, /* TX_CREATE_ACL */ zvol_replay_err, /* TX_CREATE_ATTR */ zvol_replay_err, /* TX_CREATE_ACL_ATTR */ zvol_replay_err, /* TX_MKDIR_ACL */ @@ -550,7 +832,7 @@ zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = { zvol_replay_err, /* TX_SETSAXATTR */ zvol_replay_err, /* TX_RENAME_EXCHANGE */ zvol_replay_err, /* TX_RENAME_WHITEOUT */ - zvol_replay_err, /* TX_CLONE_RANGE */ + zvol_replay_clone_range, /* TX_CLONE_RANGE */ }; /* From c2d9494f99d250c5f1dadc4f746f7c3b2bbd1f35 Mon Sep 17 00:00:00 2001 From: shodanshok Date: Sun, 29 Dec 2024 20:50:19 +0100 Subject: [PATCH 14/30] set zfs_arc_shrinker_limit to 0 by default 
zfs_arc_shrinker_limit was introduced to avoid ARC collapse due to aggressive kernel reclaim. While useful, the current default (10000) is too prone to OOM especially when MGLRU-enabled kernels with default min_ttl_ms are used. Even when no OOM happens, it often causes too much swap usage. This patch sets zfs_arc_shrinker_limit=0 to not ignore kernel reclaim requests. ARC now plays better with both kernel shrinker and pagecache but, should ARC collapse happen again, MGLRU behavior can be tuned or even disabled. Anyway, zfs should not cause OOM when ARC can be released. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Gionatan Danti Closes #16909 --- man/man4/zfs.4 | 4 ++-- module/os/linux/zfs/arc_os.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index da027798f962..7078a5ba8373 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -867,14 +867,14 @@ where that percent may exceed This only operates during memory pressure/reclaim. . -.It Sy zfs_arc_shrinker_limit Ns = Ns Sy 10000 Pq int +.It Sy zfs_arc_shrinker_limit Ns = Ns Sy 0 Pq int This is a limit on how many pages the ARC shrinker makes available for eviction in response to one page allocation attempt. Note that in practice, the kernel's shrinker can ask us to evict up to about four times this for one allocation attempt. To reduce OOM risk, this limit is applied for kswapd reclaims only. .Pp -The default limit of +For example a value of .Sy 10000 Pq in practice, Em 160 MiB No per allocation attempt with 4 KiB pages limits the amount of time spent attempting to reclaim ARC memory to less than 100 ms per allocation attempt, diff --git a/module/os/linux/zfs/arc_os.c b/module/os/linux/zfs/arc_os.c index b1e45b28743e..3238977af6d1 100644 --- a/module/os/linux/zfs/arc_os.c +++ b/module/os/linux/zfs/arc_os.c @@ -63,7 +63,7 @@ * practice, the kernel's shrinker can ask us to evict up to about 4x this * for one allocation attempt. 
* - * The default limit of 10,000 (in practice, 160MB per allocation attempt + * For example a value of 10,000 (in practice, 160MB per allocation attempt * with 4K pages) limits the amount of time spent attempting to reclaim ARC * memory to less than 100ms per allocation attempt, even with a small * average compressed block size of ~8KB. @@ -71,7 +71,7 @@ * See also the comment in arc_shrinker_count(). * Set to 0 to disable limit. */ -static int zfs_arc_shrinker_limit = 10000; +static int zfs_arc_shrinker_limit = 0; /* * Relative cost of ARC eviction, AKA number of seeks needed to restore evicted From 679b164cd3ba918510107bd0b4a30efb412b241c Mon Sep 17 00:00:00 2001 From: Andrew Walker Date: Mon, 30 Dec 2024 19:06:48 -0600 Subject: [PATCH 15/30] Add missing zfs_exit() when snapdir is disabled (#16912) zfs_vget doesn't zfs_exit when erroring out due to snapdir being disabled. Signed-off-by: Andrew Walker Reviewed-by: @bmeagherix Reviewed-by: Alexander Motin Reviewed-by: Ameer Hamza Reviewed-by: Tony Hutter --- module/os/linux/zfs/zfs_vfsops.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c index 3c53a8a315c3..b226fca147a5 100644 --- a/module/os/linux/zfs/zfs_vfsops.c +++ b/module/os/linux/zfs/zfs_vfsops.c @@ -1702,13 +1702,14 @@ zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp) /* A zero fid_gen means we are in the .zfs control directories */ if (fid_gen == 0 && (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) { - *ipp = zfsvfs->z_ctldir; - ASSERT(*ipp != NULL); - if (zfsvfs->z_show_ctldir == ZFS_SNAPDIR_DISABLED) { + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOENT)); } + *ipp = zfsvfs->z_ctldir; + ASSERT(*ipp != NULL); + if (object == ZFSCTL_INO_SNAPDIR) { VERIFY(zfsctl_root_lookup(*ipp, "snapshot", ipp, 0, kcred, NULL, NULL) == 0); From 939e9f0b6aa0a6a3ff69242532970dd570ddde39 Mon Sep 17 00:00:00 2001 From: James Reilly Date: Thu, 2 Jan 2025 
22:58:56 +0530 Subject: [PATCH 16/30] ZTS: add centos stream10 (#16904) Added centos as optional runners via workflow_dispatch removed centos-stream9 from the FULL_OS runner list as CentOS is not officially supported by ZFS. This commit will add preliminary support for EL10 and allow testing ZFS ahead of EL10 codebase solidifying in ~6 months Signed-off-by: James Reilly Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Reviewed-by: Tino Reichardt --- .github/workflows/scripts/qemu-2-start.sh | 6 +++++ .github/workflows/scripts/qemu-3-deps.sh | 2 +- .github/workflows/zfs-qemu.yml | 27 ++++++++++++++++++++++- 3 files changed, 33 insertions(+), 2 deletions(-) diff --git a/.github/workflows/scripts/qemu-2-start.sh b/.github/workflows/scripts/qemu-2-start.sh index 0906e438ac0d..73496d4f3de6 100755 --- a/.github/workflows/scripts/qemu-2-start.sh +++ b/.github/workflows/scripts/qemu-2-start.sh @@ -40,6 +40,12 @@ case "$OS" in # dns sometimes fails with that url :/ echo "89.187.191.12 geo.mirror.pkgbuild.com" | sudo tee /etc/hosts > /dev/null ;; + centos-stream10) + OSNAME="CentOS Stream 10" + # TODO: #16903 Overwrite OSv to stream9 for virt-install until it's added to osinfo + OSv="centos-stream9" + URL="https://cloud.centos.org/centos/10-stream/x86_64/images/CentOS-Stream-GenericCloud-10-latest.x86_64.qcow2" + ;; centos-stream9) OSNAME="CentOS Stream 9" URL="https://cloud.centos.org/centos/9-stream/x86_64/images/CentOS-Stream-GenericCloud-9-latest.x86_64.qcow2" diff --git a/.github/workflows/scripts/qemu-3-deps.sh b/.github/workflows/scripts/qemu-3-deps.sh index 96979cd02e09..9b8957734277 100755 --- a/.github/workflows/scripts/qemu-3-deps.sh +++ b/.github/workflows/scripts/qemu-3-deps.sh @@ -104,7 +104,7 @@ case "$1" in sudo dnf install -y kernel-abi-whitelists echo "##[endgroup]" ;; - almalinux9|centos-stream9) + almalinux9|centos-stream9|centos-stream10) echo "##[group]Enable epel and crb repositories" sudo dnf config-manager -y --set-enabled crb sudo dnf install -y
epel-release diff --git a/.github/workflows/zfs-qemu.yml b/.github/workflows/zfs-qemu.yml index 4748e90db50b..af26e135b91f 100644 --- a/.github/workflows/zfs-qemu.yml +++ b/.github/workflows/zfs-qemu.yml @@ -3,6 +3,18 @@ name: zfs-qemu on: push: pull_request: + workflow_dispatch: + inputs: + include_stream9: + type: boolean + required: false + default: false + description: 'Test on CentOS 9 stream' + include_stream10: + type: boolean + required: false + default: false + description: 'Test on CentOS 10 stream' concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} @@ -22,7 +34,7 @@ jobs: - name: Generate OS config and CI type id: os run: | - FULL_OS='["almalinux8", "almalinux9", "centos-stream9", "debian11", "debian12", "fedora40", "fedora41", "freebsd13-3r", "freebsd13-4s", "freebsd14-1r", "freebsd14-2s", "freebsd15-0c", "ubuntu20", "ubuntu22", "ubuntu24"]' + FULL_OS='["almalinux8", "almalinux9", "debian11", "debian12", "fedora40", "fedora41", "freebsd13-3r", "freebsd13-4s", "freebsd14-1r", "freebsd14-2s", "freebsd15-0c", "ubuntu20", "ubuntu22", "ubuntu24"]' QUICK_OS='["almalinux8", "almalinux9", "debian12", "fedora41", "freebsd13-3r", "freebsd14-2r", "ubuntu24"]' # determine CI type when running on PR ci_type="full" @@ -37,9 +49,22 @@ jobs: os_selection="$FULL_OS" fi os_json=$(echo ${os_selection} | jq -c) + + # Add optional runners + if [ "${{ github.event.inputs.include_stream9 }}" == 'true' ]; then + os_json=$(echo $os_json | jq -c '. += ["centos-stream9"]') + fi + if [ "${{ github.event.inputs.include_stream10 }}" == 'true' ]; then + os_json=$(echo $os_json | jq -c '. 
+= ["centos-stream10"]') + fi + + echo $os_json echo "os=$os_json" >> $GITHUB_OUTPUT echo "ci_type=$ci_type" >> $GITHUB_OUTPUT + + + qemu-vm: name: qemu-x86 needs: [ test-config ] From a55b6fe94ac13cbf2c0daba7cf86594435056c26 Mon Sep 17 00:00:00 2001 From: Toomas Soome Date: Thu, 2 Jan 2025 23:29:12 +0200 Subject: [PATCH 17/30] ZTS: zfs_mount_all_fail leaves /var/tmp/testrootPIDNUM directory around Before we can remove test files, we need to unmount datasets used by test first. See also: zfs_mount_all_mountpoints.ksh Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Toomas Soome Closes #16914 --- .../cli_root/zfs_mount/zfs_mount_all_fail.ksh | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_all_fail.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_all_fail.ksh index d1103bddccbd..7b6c2ccdf660 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_all_fail.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_all_fail.ksh @@ -16,6 +16,7 @@ # # Copyright (c) 2017 by Delphix. All rights reserved. +# Copyright 2024 MNX Cloud, Inc. # . $STF_SUITE/include/libtest.shlib @@ -44,8 +45,9 @@ typeset fscount=10 function setup_all { # Create $fscount filesystems at the top level of $path - for ((i=0; i<$fscount; i++)); do + for ((i=0; i Date: Fri, 3 Jan 2025 01:53:53 +0200 Subject: [PATCH 18/30] ZTS: functional/mount scripts are not removing /var/tmp/testdir.X dirs cleanup.ksh is assuming we have TESTDIRS set. 
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Toomas Soome Closes #16915 --- tests/zfs-tests/tests/functional/mount/cleanup.ksh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/zfs-tests/tests/functional/mount/cleanup.ksh b/tests/zfs-tests/tests/functional/mount/cleanup.ksh index bd6b0e435ed1..0e88e2a1fc79 100755 --- a/tests/zfs-tests/tests/functional/mount/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/mount/cleanup.ksh @@ -27,12 +27,14 @@ # # Copyright (c) 2013, 2016 by Delphix. All rights reserved. +# Copyright 2025 MNX Cloud, Inc. # . $STF_SUITE/include/libtest.shlib log_must destroy_pool $TESTPOOL -for dir in $TESTDIRS; do +for i in 1 2 3; do + dir=$TESTDIR.$i rm -rf $dir done From 997db7a7fc806d05ee32ad1b34a7a19e785b95f8 Mon Sep 17 00:00:00 2001 From: Toomas Soome Date: Fri, 3 Jan 2025 01:57:24 +0200 Subject: [PATCH 19/30] ZTS: checkpoint_discard_busy does not set 16M on cleanup Originally hex value is used as decimal. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Toomas Soome Closes #16917 --- .../functional/pool_checkpoint/checkpoint_discard_busy.ksh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh index 087aef9027ea..07b658641f65 100755 --- a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh +++ b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh @@ -43,7 +43,7 @@ log_unsupported "Skipping, issue https://github.com/openzfs/zfs/issues/12053" function test_cleanup { # reset memory limit to 16M - set_tunable64 SPA_DISCARD_MEMORY_LIMIT 1000000 + set_tunable64 SPA_DISCARD_MEMORY_LIMIT 16777216 cleanup_nested_pools } From cfec8f13a2f646615f61d47ae3276716c09d20da Mon Sep 17 00:00:00 2001 From: pstef Date: Fri, 3 Jan 2025 18:03:14 +0100 Subject: [PATCH 
20/30] zfs_vnops_os.c: fallocate is valid but not supported on FreeBSD This works around /usr/lib/go-1.18/pkg/tool/linux_amd64/link: mapping output file failed: invalid argument It's happened to me under a Linux jail, but it's also happened to other people, see https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=270247#c4 Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: pstef Closes #16918 --- module/os/freebsd/zfs/zfs_vnops_os.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index b8c2c341dace..5edd3fcc76e7 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -6258,7 +6258,7 @@ struct vop_vector zfs_vnodeops = { .vop_fplookup_vexec = zfs_freebsd_fplookup_vexec, .vop_fplookup_symlink = zfs_freebsd_fplookup_symlink, .vop_access = zfs_freebsd_access, - .vop_allocate = VOP_EINVAL, + .vop_allocate = VOP_EOPNOTSUPP, #if __FreeBSD_version >= 1400032 .vop_deallocate = zfs_deallocate, #endif From e47b033eaef7d073624b5222cc0928073e519c18 Mon Sep 17 00:00:00 2001 From: Toomas Soome Date: Sat, 4 Jan 2025 00:41:03 +0200 Subject: [PATCH 21/30] ZTS: remove unused TESTDIRS from pam/cleanup.ksh Remove TESTDIRS as it is not set for pam tests. 
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Toomas Soome Closes #16920 --- tests/zfs-tests/tests/functional/pam/cleanup.ksh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/zfs-tests/tests/functional/pam/cleanup.ksh b/tests/zfs-tests/tests/functional/pam/cleanup.ksh index dbcb175ed069..bfb98cd30707 100755 --- a/tests/zfs-tests/tests/functional/pam/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/pam/cleanup.ksh @@ -27,4 +27,4 @@ destroy_pool $TESTPOOL del_user ${username} del_user ${username}rec del_group pamtestgroup -log_must rm -rf "$runstatedir" $TESTDIRS +log_must rm -rf "$runstatedir" From 4425a7bb851f3c6b657062ee4df0c00216faa557 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sat, 4 Jan 2025 09:42:06 +1100 Subject: [PATCH 22/30] vdev_open: clear async remove flag after reopen It's possible for a vdev to be flagged for async remove after the pool has suspended. If the removed device has been returned when the pool is resumed, the ASYNC_REMOVE task will still run at the end of txg, and remove the device from the pool again. To fix, we clear the async remove flag at reopen, just as we did for the async fault flag in 5de3ac223. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #16921 --- module/zfs/vdev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index ac5230ebd51c..d9c5871820ca 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -2041,6 +2041,7 @@ vdev_open(vdev_t *vd) vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; vd->vdev_fault_wanted = B_FALSE; + vd->vdev_remove_wanted = B_FALSE; vd->vdev_min_asize = vdev_get_min_asize(vd); /* From 125731436d2c2bc713cb2568e58368d0e82a183a Mon Sep 17 00:00:00 2001 From: Toomas Soome Date: Sat, 4 Jan 2025 00:48:30 +0200 Subject: [PATCH 23/30] ZTS: checkpoint_discard_busy should use save_tunable/restore_tunable Instead of using hardwired value for SPA_DISCARD_MEMORY_LIMIT, use save_tunable and restore_tunable to restore the pre-test state. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Toomas Soome Closes #16919 --- .../pool_checkpoint/checkpoint_discard_busy.ksh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh index 07b658641f65..2bf5ab199e6e 100755 --- a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh +++ b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh @@ -42,8 +42,8 @@ log_unsupported "Skipping, issue https://github.com/openzfs/zfs/issues/12053" function test_cleanup { - # reset memory limit to 16M - set_tunable64 SPA_DISCARD_MEMORY_LIMIT 16777216 + # reset to original value + log_must restore_tunable SPA_DISCARD_MEMORY_LIMIT cleanup_nested_pools } @@ -69,6 +69,7 @@ log_onexit test_cleanup # map, we should have even more time to # verify this. 
# +log_must save_tunable SPA_DISCARD_MEMORY_LIMIT set_tunable64 SPA_DISCARD_MEMORY_LIMIT 128 log_must zpool checkpoint $NESTEDPOOL @@ -101,8 +102,8 @@ log_mustnot zpool checkpoint -d $NESTEDPOOL log_mustnot zpool remove $NESTEDPOOL $FILEDISK1 log_mustnot zpool reguid $NESTEDPOOL -# reset memory limit to 16M -set_tunable64 SPA_DISCARD_MEMORY_LIMIT 16777216 +# reset to original value +log_must restore_tunable SPA_DISCARD_MEMORY_LIMIT nested_wait_discard_finish From 47b7dc976b7b7192f21d64306a642fea1dbb1695 Mon Sep 17 00:00:00 2001 From: Robert Evans Date: Fri, 3 Jan 2025 22:04:01 -0500 Subject: [PATCH 24/30] Add Makefile dependencies for scripts/zfs-tests.sh -c This updates the Makefile to be more correct for parallel make. Reviewed-by: Brian Behlendorf Reviewed-by: George Melikov Signed-off-by: Robert Evans Closes #16030 Closes #16922 --- scripts/Makefile.am | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/Makefile.am b/scripts/Makefile.am index 7d9cef83d2c6..ee8fb8717cec 100644 --- a/scripts/Makefile.am +++ b/scripts/Makefile.am @@ -79,7 +79,7 @@ CLEANFILES += %D%/common.sh -$(AM_V_at)echo "$$SCRIPTS_EXTRA_ENVIRONMENT" >>$@ ALL_LOCAL += scripts-all-local -scripts-all-local: %D%/common.sh +scripts-all-local: %D%/common.sh $(PROGRAMS) $(SCRIPTS) $(DATA) -SCRIPT_COMMON=$< $(srcdir)/%D%/zfs-tests.sh -c CLEAN_LOCAL += scripts-clean-local From 25565403aaf8fc7bf3f46255b522093b27caaf1b Mon Sep 17 00:00:00 2001 From: Don Brady Date: Sat, 4 Jan 2025 11:28:33 -0700 Subject: [PATCH 25/30] Too many vdev probe errors should suspend pool Similar to what we saw in #16569, we need to consider that a replacing vdev should not be considered as fully contributing to the redundancy of a raidz vdev even though current IO has enough redundancy. When a failed vdev_probe() is faulting a disk, it now checks if that disk is required, and if so it suspends the pool until the admin can return the missing disks. Sponsored-by: Klara, Inc. 
Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Reviewed-by: Allan Jude Reviewed-by: Tony Hutter Signed-off-by: Don Brady Closes #16864 --- module/zfs/spa.c | 25 ++- tests/runfiles/linux.run | 4 +- tests/zfs-tests/tests/Makefile.am | 1 + .../fault/suspend_on_probe_errors.ksh | 154 ++++++++++++++++++ 4 files changed, 176 insertions(+), 8 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/fault/suspend_on_probe_errors.ksh diff --git a/module/zfs/spa.c b/module/zfs/spa.c index b83c982c13fd..c9dfd7ac2e4d 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -8948,16 +8948,26 @@ spa_async_remove(spa_t *spa, vdev_t *vd) } static void -spa_async_fault_vdev(spa_t *spa, vdev_t *vd) +spa_async_fault_vdev(vdev_t *vd, boolean_t *suspend) { if (vd->vdev_fault_wanted) { + vdev_state_t newstate = VDEV_STATE_FAULTED; vd->vdev_fault_wanted = B_FALSE; - vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, - VDEV_AUX_ERR_EXCEEDED); - } + /* + * If this device has the only valid copy of the data, then + * back off and simply mark the vdev as degraded instead. 
+ */ + if (!vd->vdev_top->vdev_islog && vd->vdev_aux == NULL && + vdev_dtl_required(vd)) { + newstate = VDEV_STATE_DEGRADED; + /* A required disk is missing so suspend the pool */ + *suspend = B_TRUE; + } + vdev_set_state(vd, B_TRUE, newstate, VDEV_AUX_ERR_EXCEEDED); + } for (int c = 0; c < vd->vdev_children; c++) - spa_async_fault_vdev(spa, vd->vdev_child[c]); + spa_async_fault_vdev(vd->vdev_child[c], suspend); } static void @@ -9049,8 +9059,11 @@ spa_async_thread(void *arg) */ if (tasks & SPA_ASYNC_FAULT_VDEV) { spa_vdev_state_enter(spa, SCL_NONE); - spa_async_fault_vdev(spa, spa->spa_root_vdev); + boolean_t suspend = B_FALSE; + spa_async_fault_vdev(spa->spa_root_vdev, &suspend); (void) spa_vdev_state_exit(spa, NULL, 0); + if (suspend) + zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR); } /* diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 76d07a6cc9c1..e55ec583d2cc 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -125,8 +125,8 @@ tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_online_002_pos', 'auto_replace_001_pos', 'auto_replace_002_pos', 'auto_spare_001_pos', 'auto_spare_002_pos', 'auto_spare_multiple', 'auto_spare_ashift', 'auto_spare_shared', 'decrypt_fault', 'decompress_fault', - 'fault_limits', 'scrub_after_resilver', 'suspend_resume_single', - 'zpool_status_-s'] + 'fault_limits', 'scrub_after_resilver', 'suspend_on_probe_errors', + 'suspend_resume_single', 'zpool_status_-s'] tags = ['functional', 'fault'] [tests/functional/features/large_dnode:Linux] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 67630cb564ae..bde33843098f 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1531,6 +1531,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/fault/decrypt_fault.ksh \ functional/fault/fault_limits.ksh \ functional/fault/scrub_after_resilver.ksh \ + functional/fault/suspend_on_probe_errors.ksh \ 
functional/fault/suspend_resume_single.ksh \ functional/fault/setup.ksh \ functional/fault/zpool_status_-s.ksh \ diff --git a/tests/zfs-tests/tests/functional/fault/suspend_on_probe_errors.ksh b/tests/zfs-tests/tests/functional/fault/suspend_on_probe_errors.ksh new file mode 100755 index 000000000000..d9261bb5d274 --- /dev/null +++ b/tests/zfs-tests/tests/functional/fault/suspend_on_probe_errors.ksh @@ -0,0 +1,154 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024, Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/include/blkdev.shlib + +# +# DESCRIPTION: Verify that 4 disks removed from a raidz3 will suspend the pool +# +# STRATEGY: +# 1. Disable ZED -- this test is focused on vdev_probe errors +# 2. Create a raidz3 pool where 4 disks can be removed (i.e., using scsi_debug) +# 3. Add some data to it for a resilver workload +# 4. Replace one of the child vdevs to start a replacing vdev +# 5. During the resilver, remove 4 disks including one from the replacing vdev +# 6. 
Verify that the pool is suspended (it used to remain online) +# + +DEV_SIZE_MB=1024 + +FILE_VDEV_CNT=8 +FILE_VDEV_SIZ=256M + +function cleanup +{ + destroy_pool $TESTPOOL + if [[ "$(cat /sys/block/$sd/device/state)" == "offline" ]]; then + log_must eval "echo running > /sys/block/$sd/device/state" + fi + unload_scsi_debug + rm -f $DATA_FILE + for i in {0..$((FILE_VDEV_CNT - 1))}; do + log_must rm -f "$TEST_BASE_DIR/dev-$i" + done + log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0 + zed_start +} + +log_onexit cleanup + +log_assert "VDEV probe errors for more disks than parity should suspend a pool" + +log_note "Stoping ZED process" +zed_stop +zpool events -c + +# Make a debug device that we can "unplug" and lose 4 drives at once +unload_scsi_debug +load_scsi_debug $DEV_SIZE_MB 1 1 1 '512b' +sd=$(get_debug_device) + +# Create 4 partitions that match the FILE_VDEV_SIZ +parted "/dev/${sd}" --script mklabel gpt +parted "/dev/${sd}" --script mkpart primary 0% 25% +parted "/dev/${sd}" --script mkpart primary 25% 50% +parted "/dev/${sd}" --script mkpart primary 50% 75% +parted "/dev/${sd}" --script mkpart primary 75% 100% +block_device_wait "/dev/${sd}" +blkdevs="/dev/${sd}1 /dev/${sd}2 /dev/${sd}3 /dev/${sd}4" + +# Create 8 file vdevs +typeset -a filedevs +for i in {0..$((FILE_VDEV_CNT - 1))}; do + device=$TEST_BASE_DIR/dev-$i + log_must truncate -s $FILE_VDEV_SIZ $device + # Use all but the last one for pool create + if [[ $i -lt "7" ]]; then + filedevs[${#filedevs[*]}+1]=$device + fi +done + +# Create a raidz-3 pool that we can pull 4 disks from +log_must zpool create -f $TESTPOOL raidz3 ${filedevs[@]} $blkdevs +sync_pool $TESTPOOL + +# Add some data to the pool +log_must zfs create $TESTPOOL/fs +MNTPOINT="$(get_prop mountpoint $TESTPOOL/fs)" +SECONDS=0 +log_must fill_fs $MNTPOINT 1 200 4096 10 Z +log_note "fill_fs took $SECONDS seconds" +sync_pool $TESTPOOL + +# Start a replacing vdev, but suspend the resilver +log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 +log_must 
zpool replace -f $TESTPOOL /dev/${sd}4 $TEST_BASE_DIR/dev-7 + +# Remove 4 disks all at once +log_must eval "echo offline > /sys/block/${sd}/device/state" + +log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0 + +# Add some writes to drive the vdev probe errors +log_must dd if=/dev/urandom of=$MNTPOINT/writes bs=1M count=1 + +# Wait until sync starts, and the pool suspends +log_note "waiting for pool to suspend" +typeset -i tries=30 +until [[ $(cat /proc/spl/kstat/zfs/$TESTPOOL/state) == "SUSPENDED" ]] ; do + if ((tries-- == 0)); then + zpool status -s + log_fail "UNEXPECTED -- pool did not suspend" + fi + sleep 1 +done +log_note $(cat /proc/spl/kstat/zfs/$TESTPOOL/state) + +# Put the missing disks back into service +log_must eval "echo running > /sys/block/$sd/device/state" + +# Clear the vdev error states, which will reopen the vdevs and resume the pool +log_must zpool clear $TESTPOOL + +# Wait until the pool resumes +log_note "waiting for pool to resume" +tries=30 +until [[ $(cat /proc/spl/kstat/zfs/$TESTPOOL/state) != "SUSPENDED" ]] ; do + if ((tries-- == 0)); then + log_fail "pool did not resume" + fi + sleep 1 +done +log_must zpool wait -t resilver $TESTPOOL +sync_pool $TESTPOOL + +# Make sure a pool scrub comes back clean +log_must zpool scrub -w $TESTPOOL +log_must zpool status -v $TESTPOOL +log_must check_pool_status $TESTPOOL "errors" "No known data errors" + +log_pass "VDEV probe errors for more disks than parity should suspend a pool" From 5ba50c81352bf6a87f2ea0ebf4a41ec417037b9e Mon Sep 17 00:00:00 2001 From: Richard Kojedzinszky Date: Sat, 4 Jan 2025 19:33:27 +0100 Subject: [PATCH 26/30] fix: make zfs_strerror really thread-safe and portable #15793 wanted to make zfs_strerror thread-safe, unfortunately, it turned out that strerror_l() usage was wrong, and also, some libc implementations don't have strerror_l(). zfs_strerror() now simply calls original strerror() and copies the result to a thread-local buffer, then returns that. 
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Richard Kojedzinszky Closes #15793 Closes #16640 Closes #16923 --- config/user.m4 | 2 +- include/libzutil.h | 15 +++++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/config/user.m4 b/config/user.m4 index 4e31745a2abc..badd920d2b8a 100644 --- a/config/user.m4 +++ b/config/user.m4 @@ -33,7 +33,7 @@ AC_DEFUN([ZFS_AC_CONFIG_USER], [ ZFS_AC_CONFIG_USER_MAKEDEV_IN_MKDEV ZFS_AC_CONFIG_USER_ZFSEXEC - AC_CHECK_FUNCS([execvpe issetugid mlockall strerror_l strlcat strlcpy gettid]) + AC_CHECK_FUNCS([execvpe issetugid mlockall strlcat strlcpy gettid]) AC_SUBST(RM) ]) diff --git a/include/libzutil.h b/include/libzutil.h index f8712340cc5e..bcfe2fcf7960 100644 --- a/include/libzutil.h +++ b/include/libzutil.h @@ -27,7 +27,7 @@ #define _LIBZUTIL_H extern __attribute__((visibility("default"))) #include -#include +#include #include #include @@ -276,11 +276,14 @@ _LIBZUTIL_H void update_vdev_config_dev_sysfs_path(nvlist_t *nv, * Thread-safe strerror() for use in ZFS libraries */ static inline char *zfs_strerror(int errnum) { -#ifdef HAVE_STRERROR_L - return (strerror_l(errnum, uselocale(0))); -#else - return (strerror(errnum)); -#endif + static __thread char errbuf[512]; + static pthread_mutex_t zfs_strerror_lock = PTHREAD_MUTEX_INITIALIZER; + + (void) pthread_mutex_lock(&zfs_strerror_lock); + (void) strlcpy(errbuf, strerror(errnum), sizeof (errbuf)); + (void) pthread_mutex_unlock(&zfs_strerror_lock); + + return (errbuf); } #ifdef __cplusplus From 9f1c5e0b1078f820d823528c8a9875c501efb9a8 Mon Sep 17 00:00:00 2001 From: Robert Evans Date: Sun, 5 Jan 2025 20:25:22 -0500 Subject: [PATCH 27/30] Remove duplicate dedup_legacy_create in common.run Reviewed-by: Brian Behlendorf Reviewed-by: George Melikov Signed-off-by: Robert Evans Closes #16926 --- tests/runfiles/common.run | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/runfiles/common.run 
b/tests/runfiles/common.run index a69d36df2f98..1d6f6d85200f 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -676,8 +676,8 @@ post = tags = ['functional', 'deadman'] [tests/functional/dedup] -tests = ['dedup_legacy_create', 'dedup_fdt_create', 'dedup_fdt_import', - 'dedup_legacy_create', 'dedup_legacy_import', 'dedup_legacy_fdt_upgrade', +tests = ['dedup_fdt_create', 'dedup_fdt_import', 'dedup_legacy_create', + 'dedup_legacy_import', 'dedup_legacy_fdt_upgrade', 'dedup_legacy_fdt_mixed', 'dedup_quota'] pre = post = From 307fd0da1f58907ccb99e4ddc97ae00490e2b5cd Mon Sep 17 00:00:00 2001 From: n0-1 Date: Mon, 6 Jan 2025 02:27:19 +0100 Subject: [PATCH 28/30] Support for cross-compiling kernel modules In order to correctly cross-compile, one has to pass ARCH and CROSS_COMPILE make flags to kernel module build calls. Facilitate this in the same way as for custom CC flag by recognizing KERNEL_-prefixed configure environment variables of same name. Reviewed-by: Brian Behlendorf Signed-off-by: Phil Sutter Closes #16924 --- config/kernel.m4 | 5 +++++ config/zfs-build.m4 | 2 ++ module/Makefile.in | 2 ++ rpm/generic/zfs-kmod.spec.in | 4 +++- rpm/redhat/zfs-kmod.spec.in | 4 +++- 5 files changed, 15 insertions(+), 2 deletions(-) diff --git a/config/kernel.m4 b/config/kernel.m4 index ae66633907bf..9928ead1b6ce 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -681,11 +681,16 @@ AC_DEFUN([ZFS_LINUX_COMPILE], [ building kernel modules]) AC_ARG_VAR([KERNEL_LLVM], [Binary option to build kernel modules with LLVM/CLANG toolchain]) + AC_ARG_VAR([KERNEL_CROSS_COMPILE], [Cross compile prefix + for kernel module builds]) + AC_ARG_VAR([KERNEL_ARCH], [Architecture to build kernel modules for]) AC_TRY_COMMAND([ KBUILD_MODPOST_NOFINAL="$5" KBUILD_MODPOST_WARN="$6" make modules -k -j$TEST_JOBS ${KERNEL_CC:+CC=$KERNEL_CC} ${KERNEL_LD:+LD=$KERNEL_LD} ${KERNEL_LLVM:+LLVM=$KERNEL_LLVM} CONFIG_MODULES=y CFLAGS_MODULE=-DCONFIG_MODULES + 
${KERNEL_CROSS_COMPILE:+CROSS_COMPILE=$KERNEL_CROSS_COMPILE} + ${KERNEL_ARCH:+ARCH=$KERNEL_ARCH} -C $LINUX_OBJ $ARCH_UM M=$PWD/$1 >$1/build.log 2>&1]) AS_IF([AC_TRY_COMMAND([$2])], [$3], [$4]) ]) diff --git a/config/zfs-build.m4 b/config/zfs-build.m4 index c44a893bbb8c..55fc029f0847 100644 --- a/config/zfs-build.m4 +++ b/config/zfs-build.m4 @@ -393,6 +393,8 @@ AC_DEFUN([ZFS_AC_RPM], [ RPM_DEFINE_KMOD=${RPM_DEFINE_KMOD}' --define "kernel_cc KERNEL_CC=$(KERNEL_CC)"' RPM_DEFINE_KMOD=${RPM_DEFINE_KMOD}' --define "kernel_ld KERNEL_LD=$(KERNEL_LD)"' RPM_DEFINE_KMOD=${RPM_DEFINE_KMOD}' --define "kernel_llvm KERNEL_LLVM=$(KERNEL_LLVM)"' + RPM_DEFINE_KMOD=${RPM_DEFINE_KMOD}' --define "kernel_cross_compile KERNEL_CROSS_COMPILE=$(KERNEL_CROSS_COMPILE)"' + RPM_DEFINE_KMOD=${RPM_DEFINE_KMOD}' --define "kernel_arch KERNEL_ARCH=$(KERNEL_ARCH)"' ]) RPM_DEFINE_DKMS='' diff --git a/module/Makefile.in b/module/Makefile.in index 9b34b3dfaec7..529ab81dcec5 100644 --- a/module/Makefile.in +++ b/module/Makefile.in @@ -55,6 +55,8 @@ modules-Linux: mkdir -p $(sort $(dir $(zfs-objs) $(zfs-))) $(MAKE) -C @LINUX_OBJ@ $(if @KERNEL_CC@,CC=@KERNEL_CC@) \ $(if @KERNEL_LD@,LD=@KERNEL_LD@) $(if @KERNEL_LLVM@,LLVM=@KERNEL_LLVM@) \ + $(if @KERNEL_CROSS_COMPILE@,CROSS_COMPILE=@KERNEL_CROSS_COMPILE@) \ + $(if @KERNEL_ARCH@,ARCH=@KERNEL_ARCH@) \ M="$$PWD" @KERNEL_MAKE@ CONFIG_ZFS=m modules modules-FreeBSD: diff --git a/rpm/generic/zfs-kmod.spec.in b/rpm/generic/zfs-kmod.spec.in index 30524474d1ac..7ed828bd0c9c 100644 --- a/rpm/generic/zfs-kmod.spec.in +++ b/rpm/generic/zfs-kmod.spec.in @@ -144,7 +144,9 @@ for kernel_version in %{?kernel_versions}; do %{debuginfo} \ %{?kernel_cc} \ %{?kernel_ld} \ - %{?kernel_llvm} + %{?kernel_llvm} \ + %{?kernel_cross_compile} \ + %{?kernel_arch} # Pre-6.10 kernel builds didn't need to copy over the source files to the # build directory. 
However we do need to do it though post-6.10 due to diff --git a/rpm/redhat/zfs-kmod.spec.in b/rpm/redhat/zfs-kmod.spec.in index 876c198c64de..a95bdf20f873 100644 --- a/rpm/redhat/zfs-kmod.spec.in +++ b/rpm/redhat/zfs-kmod.spec.in @@ -69,7 +69,9 @@ fi %{debuginfo} \ %{?kernel_cc} \ %{?kernel_ld} \ - %{?kernel_llvm} + %{?kernel_llvm} \ + %{?kernel_cross_compile} \ + %{?kernel_arch} make %{?_smp_mflags} # Module signing (modsign) From 0c88ae6187803b5816a5ec8f5ab48a94ddfaafb0 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Mon, 16 Dec 2024 10:27:20 -0800 Subject: [PATCH 29/30] Tag 2.3.0-rc5 Signed-off-by: Brian Behlendorf --- META | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/META b/META index 5446d3e7c348..d7a8e3604e58 100644 --- a/META +++ b/META @@ -2,7 +2,7 @@ Meta: 1 Name: zfs Branch: 1.0 Version: 2.3.0 -Release: rc4 +Release: rc5 Release-Tags: relext License: CDDL Author: OpenZFS From 7a354f31afc96b0df406351b48560a7ef1759a7a Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Wed, 8 Jan 2025 20:26:30 +0500 Subject: [PATCH 30/30] zvol: sync with upstream block cloning commit Signed-off-by: Ameer Hamza --- include/sys/zvol.h | 2 - include/sys/zvol_impl.h | 8 ++-- module/zfs/zvol.c | 88 ++++++++++++++++++++--------------------- 3 files changed, 49 insertions(+), 49 deletions(-) diff --git a/include/sys/zvol.h b/include/sys/zvol.h index e236a4cd18a3..c79fe1d9ad22 100644 --- a/include/sys/zvol.h +++ b/include/sys/zvol.h @@ -56,8 +56,6 @@ extern int zvol_set_ro(const char *, boolean_t); extern zvol_state_handle_t *zvol_suspend(const char *); extern int zvol_resume(zvol_state_handle_t *); extern void *zvol_tag(zvol_state_handle_t *); -extern int zvol_clone_range(zvol_state_handle_t *, uint64_t, - zvol_state_handle_t *, uint64_t, uint64_t); extern int zvol_init(void); extern void zvol_fini(void); diff --git a/include/sys/zvol_impl.h b/include/sys/zvol_impl.h index 55021a080076..a8168850023a 100644 --- a/include/sys/zvol_impl.h +++ 
b/include/sys/zvol_impl.h @@ -83,14 +83,16 @@ void zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len); void zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, uint64_t size, boolean_t commit); -void zvol_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, - uint64_t off, uint64_t len, uint64_t blksz, const blkptr_t *bps, - size_t nbps); int zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio); int zvol_init_impl(void); void zvol_fini_impl(void); void zvol_wait_close(zvol_state_t *zv); +int zvol_clone_range(zvol_state_handle_t *, uint64_t, + zvol_state_handle_t *, uint64_t, uint64_t); +void zvol_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, + uint64_t off, uint64_t len, uint64_t blksz, const blkptr_t *bps, + size_t nbps); /* * platform dependent functions exported to platform independent code diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index b5d8d1b71111..d63c0d597db8 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -752,6 +752,50 @@ zvol_clone_range(zvol_state_t *zv_src, uint64_t inoff, zvol_state_t *zv_dst, return (SET_ERROR(error)); } +/* + * Handles TX_CLONE_RANGE transactions. 
+ */ +void +zvol_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, uint64_t off, + uint64_t len, uint64_t blksz, const blkptr_t *bps, size_t nbps) +{ + itx_t *itx; + lr_clone_range_t *lr; + uint64_t partlen, max_log_data; + size_t partnbps; + + if (zil_replaying(zilog, tx)) + return; + + max_log_data = zil_max_log_data(zilog, sizeof (lr_clone_range_t)); + + while (nbps > 0) { + partnbps = MIN(nbps, max_log_data / sizeof (bps[0])); + partlen = partnbps * blksz; + ASSERT3U(partlen, <, len + blksz); + partlen = MIN(partlen, len); + + itx = zil_itx_create(txtype, + sizeof (*lr) + sizeof (bps[0]) * partnbps); + lr = (lr_clone_range_t *)&itx->itx_lr; + lr->lr_foid = ZVOL_OBJ; + lr->lr_offset = off; + lr->lr_length = partlen; + lr->lr_blksz = blksz; + lr->lr_nbps = partnbps; + memcpy(lr->lr_bps, bps, sizeof (bps[0]) * partnbps); + + zil_itx_assign(zilog, itx, tx); + + bps += partnbps; + ASSERT3U(nbps, >=, partnbps); + nbps -= partnbps; + off += partlen; + ASSERT3U(len, >=, partlen); + len -= partlen; + } +} + static int zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap) { @@ -863,50 +907,6 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, } } -/* - * Handles TX_CLONE_RANGE transactions. 
- */ -void -zvol_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, uint64_t off, - uint64_t len, uint64_t blksz, const blkptr_t *bps, size_t nbps) -{ - itx_t *itx; - lr_clone_range_t *lr; - uint64_t partlen, max_log_data; - size_t partnbps; - - if (zil_replaying(zilog, tx)) - return; - - max_log_data = zil_max_log_data(zilog, sizeof (lr_clone_range_t)); - - while (nbps > 0) { - partnbps = MIN(nbps, max_log_data / sizeof (bps[0])); - partlen = partnbps * blksz; - ASSERT3U(partlen, <, len + blksz); - partlen = MIN(partlen, len); - - itx = zil_itx_create(txtype, - sizeof (*lr) + sizeof (bps[0]) * partnbps); - lr = (lr_clone_range_t *)&itx->itx_lr; - lr->lr_foid = ZVOL_OBJ; - lr->lr_offset = off; - lr->lr_length = partlen; - lr->lr_blksz = blksz; - lr->lr_nbps = partnbps; - memcpy(lr->lr_bps, bps, sizeof (bps[0]) * partnbps); - - zil_itx_assign(zilog, itx, tx); - - bps += partnbps; - ASSERT3U(nbps, >=, partnbps); - nbps -= partnbps; - off += partlen; - ASSERT3U(len, >=, partlen); - len -= partlen; - } -} - /* * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE. */