diff --git a/.github/workflows/scripts/qemu-2-start.sh b/.github/workflows/scripts/qemu-2-start.sh index 84e13832d10f..39ac92107b71 100755 --- a/.github/workflows/scripts/qemu-2-start.sh +++ b/.github/workflows/scripts/qemu-2-start.sh @@ -52,16 +52,16 @@ case "$OS" in OSNAME="Debian 12" URL="https://cloud.debian.org/images/cloud/bookworm/latest/debian-12-generic-amd64.qcow2" ;; - fedora39) - OSNAME="Fedora 39" - OSv="fedora39" - URL="https://download.fedoraproject.org/pub/fedora/linux/releases/39/Cloud/x86_64/images/Fedora-Cloud-Base-39-1.5.x86_64.qcow2" - ;; fedora40) OSNAME="Fedora 40" - OSv="fedora39" + OSv="fedora-unknown" URL="https://download.fedoraproject.org/pub/fedora/linux/releases/40/Cloud/x86_64/images/Fedora-Cloud-Base-Generic.x86_64-40-1.14.qcow2" ;; + fedora41) + OSNAME="Fedora 41" + OSv="fedora-unknown" + URL="https://download.fedoraproject.org/pub/fedora/linux/releases/41/Cloud/x86_64/images/Fedora-Cloud-Base-Generic-41-1.4.x86_64.qcow2" + ;; freebsd13-3r) OSNAME="FreeBSD 13.3-RELEASE" OSv="freebsd13.0" diff --git a/.github/workflows/scripts/qemu-3-deps.sh b/.github/workflows/scripts/qemu-3-deps.sh index a2fb5e38249a..96979cd02e09 100755 --- a/.github/workflows/scripts/qemu-3-deps.sh +++ b/.github/workflows/scripts/qemu-3-deps.sh @@ -13,10 +13,10 @@ function archlinux() { echo "##[endgroup]" echo "##[group]Install Development Tools" - sudo pacman -Sy --noconfirm base-devel bc cpio dhclient dkms fakeroot \ - fio gdb inetutils jq less linux linux-headers lsscsi nfs-utils parted \ - pax perf python-packaging python-setuptools qemu-guest-agent ksh samba \ - sysstat rng-tools rsync wget xxhash + sudo pacman -Sy --noconfirm base-devel bc cpio cryptsetup dhclient dkms \ + fakeroot fio gdb inetutils jq less linux linux-headers lsscsi nfs-utils \ + parted pax perf python-packaging python-setuptools qemu-guest-agent ksh \ + samba sysstat rng-tools rsync wget xxhash echo "##[endgroup]" } @@ -30,11 +30,11 @@ function debian() { echo "##[group]Install Development Tools" sudo apt-get install -y \ - acl alien attr autoconf bc cpio curl dbench dh-python dkms fakeroot \ - fio gdb gdebi git ksh lcov isc-dhcp-client jq libacl1-dev libaio-dev \ - libattr1-dev libblkid-dev libcurl4-openssl-dev libdevmapper-dev libelf-dev \ - libffi-dev libmount-dev libpam0g-dev libselinux-dev libssl-dev libtool \ - libtool-bin libudev-dev libunwind-dev linux-headers-$(uname -r) \ + acl alien attr autoconf bc cpio cryptsetup curl dbench dh-python dkms \ + fakeroot fio gdb gdebi git ksh lcov isc-dhcp-client jq libacl1-dev \ + libaio-dev libattr1-dev libblkid-dev libcurl4-openssl-dev libdevmapper-dev \ + libelf-dev libffi-dev libmount-dev libpam0g-dev libselinux-dev libssl-dev \ + libtool libtool-bin libudev-dev libunwind-dev linux-headers-$(uname -r) \ lsscsi nfs-kernel-server pamtester parted python3 python3-all-dev \ python3-cffi python3-dev python3-distlib python3-packaging \ python3-setuptools python3-sphinx qemu-guest-agent rng-tools rpm2cpio \ @@ -66,16 +66,23 @@ function rhel() { echo "##[endgroup]" echo "##[group]Install Development Tools" - sudo dnf group install -y "Development Tools" + + # Alma wants "Development Tools", Fedora 41 wants "development-tools" + if ! 
sudo dnf group install -y "Development Tools" ; then + echo "Trying 'development-tools' instead of 'Development Tools'" + sudo dnf group install -y development-tools + fi + sudo dnf install -y \ - acl attr bc bzip2 curl dbench dkms elfutils-libelf-devel fio gdb git \ - jq kernel-rpm-macros ksh libacl-devel libaio-devel libargon2-devel \ - libattr-devel libblkid-devel libcurl-devel libffi-devel ncompress \ - libselinux-devel libtirpc-devel libtool libudev-devel libuuid-devel \ - lsscsi mdadm nfs-utils openssl-devel pam-devel pamtester parted perf \ - python3 python3-cffi python3-devel python3-packaging kernel-devel \ - python3-setuptools qemu-guest-agent rng-tools rpcgen rpm-build rsync \ - samba sysstat systemd watchdog wget xfsprogs-devel xxhash zlib-devel + acl attr bc bzip2 cryptsetup curl dbench dkms elfutils-libelf-devel fio \ + gdb git jq kernel-rpm-macros ksh libacl-devel libaio-devel \ + libargon2-devel libattr-devel libblkid-devel libcurl-devel libffi-devel \ + ncompress libselinux-devel libtirpc-devel libtool libudev-devel \ + libuuid-devel lsscsi mdadm nfs-utils openssl-devel pam-devel pamtester \ + parted perf python3 python3-cffi python3-devel python3-packaging \ + kernel-devel python3-setuptools qemu-guest-agent rng-tools rpcgen \ + rpm-build rsync samba sysstat systemd watchdog wget xfsprogs-devel xxhash \ + zlib-devel echo "##[endgroup]" } @@ -111,6 +118,7 @@ case "$1" in archlinux ;; debian*) + echo 'debconf debconf/frontend select Noninteractive' | sudo debconf-set-selections debian echo "##[group]Install Debian specific" sudo apt-get install -yq linux-perf dh-sequence-dkms diff --git a/.github/workflows/scripts/qemu-4-build.sh b/.github/workflows/scripts/qemu-4-build.sh index 1051ee1f4deb..955f605f5bce 100755 --- a/.github/workflows/scripts/qemu-4-build.sh +++ b/.github/workflows/scripts/qemu-4-build.sh @@ -83,7 +83,7 @@ function rpm_build_and_install() { echo "##[endgroup]" echo "##[group]Install" - run sudo dnf -y --skip-broken localinstall $(ls *.rpm | grep -v src.rpm) + run sudo dnf -y --nobest install $(ls *.rpm | grep -v src.rpm) echo "##[endgroup]" } diff --git a/.github/workflows/zfs-qemu.yml b/.github/workflows/zfs-qemu.yml index f819e9938e31..e90030f4c02e 100644 --- a/.github/workflows/zfs-qemu.yml +++ b/.github/workflows/zfs-qemu.yml @@ -22,8 +22,8 @@ jobs: - name: Generate OS config and CI type id: os run: | - FULL_OS='["almalinux8", "almalinux9", "centos-stream9", "debian11", "debian12", "fedora39", "fedora40", "freebsd13-4r", "freebsd14-0r", "freebsd14-1s", "ubuntu20", "ubuntu22", "ubuntu24"]' - QUICK_OS='["almalinux8", "almalinux9", "debian12", "fedora40", "freebsd13-3r", "freebsd14-1r", "ubuntu24"]' + FULL_OS='["almalinux8", "almalinux9", "centos-stream9", "debian11", "debian12", "fedora40", "fedora41", "freebsd13-4r", "freebsd14-0r", "freebsd14-1s", "ubuntu20", "ubuntu22", "ubuntu24"]' + QUICK_OS='["almalinux8", "almalinux9", "debian12", "fedora41", "freebsd13-3r", "freebsd14-1r", "ubuntu24"]' # determine CI type when running on PR ci_type="full" if ${{ github.event_name == 'pull_request' }}; then @@ -46,7 +46,7 @@ jobs: strategy: fail-fast: false matrix: - # rhl: almalinux8, almalinux9, centos-stream9, fedora39, fedora40 + # rhl: almalinux8, almalinux9, centos-stream9, fedora40, fedora41 # debian: debian11, debian12, ubuntu20, ubuntu22, ubuntu24 # misc: archlinux, tumbleweed # FreeBSD Release: freebsd13-3r, freebsd13-4r, freebsd14-0r, freebsd14-1r diff --git a/META b/META index eca34a55d004..0fe0dedae79e 100644 --- a/META +++ b/META @@ -2,7 +2,7 @@ 
Meta: 1 Name: zfs Branch: 1.0 Version: 2.3.0 -Release: rc2 +Release: rc3 Release-Tags: relext License: CDDL Author: OpenZFS diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 16c7025802f3..46587671202a 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -2152,14 +2152,21 @@ dump_brt(spa_t *spa) if (dump_opt['T'] < 3) return; + /* -TTT shows a per-vdev histograms; -TTTT shows all entries */ + boolean_t do_histo = dump_opt['T'] == 3; + char dva[64]; - printf("\n%-16s %-10s\n", "DVA", "REFCNT"); + + if (!do_histo) + printf("\n%-16s %-10s\n", "DVA", "REFCNT"); for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid]; if (brtvd == NULL || !brtvd->bv_initiated) continue; + uint64_t counts[64] = {}; + zap_cursor_t zc; zap_attribute_t *za = zap_attribute_alloc(); for (zap_cursor_init(&zc, brt->brt_mos, brtvd->bv_mos_entries); @@ -2172,14 +2179,26 @@ dump_brt(spa_t *spa) za->za_integer_length, za->za_num_integers, &refcnt)); - uint64_t offset = *(const uint64_t *)za->za_name; + if (do_histo) + counts[highbit64(refcnt)]++; + else { + uint64_t offset = + *(const uint64_t *)za->za_name; - snprintf(dva, sizeof (dva), "%" PRIu64 ":%llx", vdevid, - (u_longlong_t)offset); - printf("%-16s %-10llu\n", dva, (u_longlong_t)refcnt); + snprintf(dva, sizeof (dva), "%" PRIu64 ":%llx", + vdevid, (u_longlong_t)offset); + printf("%-16s %-10llu\n", dva, + (u_longlong_t)refcnt); + } } zap_cursor_fini(&zc); zap_attribute_free(za); + + if (do_histo) { + printf("\nBRT: vdev %" PRIu64 + ": DVAs with 2^n refcnts:\n", vdevid); + dump_histogram(counts, 64, 0); + } } } @@ -4266,6 +4285,10 @@ dump_uberblock(uberblock_t *ub, const char *header, const char *footer) (void) printf("\ttimestamp = %llu UTC = %s", (u_longlong_t)ub->ub_timestamp, ctime(×tamp)); + char blkbuf[BP_SPRINTF_LEN]; + snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp); + (void) printf("\tbp = %s\n", blkbuf); + (void) printf("\tmmp_magic = %016llx\n", (u_longlong_t)ub->ub_mmp_magic); if (MMP_VALID(ub)) { diff --git a/cmd/zed/zed_disk_event.c b/cmd/zed/zed_disk_event.c index 32a8789d3001..4a2b7398e922 100644 --- a/cmd/zed/zed_disk_event.c +++ b/cmd/zed/zed_disk_event.c @@ -139,7 +139,8 @@ dev_event_nvlist(struct udev_device *dev) * is /dev/sda. 
*/ struct udev_device *parent_dev = udev_device_get_parent(dev); - if ((value = udev_device_get_sysattr_value(parent_dev, "size")) + if (parent_dev != NULL && + (value = udev_device_get_sysattr_value(parent_dev, "size")) != NULL) { uint64_t numval = DEV_BSIZE; diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 97397726c709..7836f5909f4a 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -3761,8 +3761,13 @@ collect_dataset(zfs_handle_t *zhp, list_cbdata_t *cb) if (cb->cb_json) { if (pl->pl_prop == ZFS_PROP_NAME) continue; + const char *prop_name; + if (pl->pl_prop != ZPROP_USERPROP) + prop_name = zfs_prop_to_name(pl->pl_prop); + else + prop_name = pl->pl_user_prop; if (zprop_nvlist_one_property( - zfs_prop_to_name(pl->pl_prop), propstr, + prop_name, propstr, sourcetype, source, NULL, props, cb->cb_json_as_int) != 0) nomem(); diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index ea180f6b705e..6a45a063d91a 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -7966,8 +7966,11 @@ zpool_do_online(int argc, char **argv) poolname = argv[0]; - if ((zhp = zpool_open(g_zfs, poolname)) == NULL) + if ((zhp = zpool_open(g_zfs, poolname)) == NULL) { + (void) fprintf(stderr, gettext("failed to open pool " + "\"%s\""), poolname); return (1); + } for (i = 1; i < argc; i++) { vdev_state_t oldstate; @@ -7988,12 +7991,15 @@ zpool_do_online(int argc, char **argv) &l2cache, NULL); if (tgt == NULL) { ret = 1; + (void) fprintf(stderr, gettext("couldn't find device " + "\"%s\" in pool \"%s\"\n"), argv[i], poolname); continue; } uint_t vsc; oldstate = ((vdev_stat_t *)fnvlist_lookup_uint64_array(tgt, ZPOOL_CONFIG_VDEV_STATS, &vsc))->vs_state; - if (zpool_vdev_online(zhp, argv[i], flags, &newstate) == 0) { + if ((rc = zpool_vdev_online(zhp, argv[i], flags, + &newstate)) == 0) { if (newstate != VDEV_STATE_HEALTHY) { (void) printf(gettext("warning: device '%s' " "onlined, but remains in faulted state\n"), @@ -8019,6 +8025,9 @@ zpool_do_online(int argc, char **argv) } } } else { + (void) fprintf(stderr, gettext("Failed to online " + "\"%s\" in pool \"%s\": %d\n"), + argv[i], poolname, rc); ret = 1; } } @@ -8103,8 +8112,11 @@ zpool_do_offline(int argc, char **argv) poolname = argv[0]; - if ((zhp = zpool_open(g_zfs, poolname)) == NULL) + if ((zhp = zpool_open(g_zfs, poolname)) == NULL) { + (void) fprintf(stderr, gettext("failed to open pool " + "\"%s\""), poolname); return (1); + } for (i = 1; i < argc; i++) { uint64_t guid = zpool_vdev_path_to_guid(zhp, argv[i]); diff --git a/config/kernel-kthread.m4 b/config/kernel-kthread.m4 index 4d580efead6b..607953146323 100644 --- a/config/kernel-kthread.m4 +++ b/config/kernel-kthread.m4 @@ -17,14 +17,21 @@ AC_DEFUN([ZFS_AC_KERNEL_KTHREAD_COMPLETE_AND_EXIT], [ AC_DEFUN([ZFS_AC_KERNEL_KTHREAD_DEQUEUE_SIGNAL], [ dnl # - dnl # 5.17 API: enum pid_type * as new 4th dequeue_signal() argument, - dnl # 5768d8906bc23d512b1a736c1e198aa833a6daa4 ("signal: Requeue signals in the appropriate queue") + dnl # prehistory: + dnl # int dequeue_signal(struct task_struct *task, sigset_t *mask, + dnl # siginfo_t *info) dnl # - dnl # int dequeue_signal(struct task_struct *task, sigset_t *mask, kernel_siginfo_t *info); - dnl # int dequeue_signal(struct task_struct *task, sigset_t *mask, kernel_siginfo_t *info, enum pid_type *type); + dnl # 4.20: kernel_siginfo_t introduced, replaces siginfo_t + dnl # int dequeue_signal(struct task_struct *task, sigset_t *mask, + dnl kernel_siginfo_t *info) dnl # - dnl # 6.12 API: first arg struct_task* removed - dnl # int 
dequeue_signal(sigset_t *mask, kernel_siginfo_t *info, enum pid_type *type); + dnl # 5.17: enum pid_type introduced as 4th arg + dnl # int dequeue_signal(struct task_struct *task, sigset_t *mask, + dnl # kernel_siginfo_t *info, enum pid_type *type) + dnl # + dnl # 6.12: first arg struct_task* removed + dnl # int dequeue_signal(sigset_t *mask, kernel_siginfo_t *info, + dnl # enum pid_type *type) dnl # AC_MSG_CHECKING([whether dequeue_signal() takes 4 arguments]) ZFS_LINUX_TEST_RESULT([kthread_dequeue_signal_4arg], [ @@ -33,11 +40,11 @@ AC_DEFUN([ZFS_AC_KERNEL_KTHREAD_DEQUEUE_SIGNAL], [ [dequeue_signal() takes 4 arguments]) ], [ AC_MSG_RESULT(no) - AC_MSG_CHECKING([whether dequeue_signal() a task argument]) - ZFS_LINUX_TEST_RESULT([kthread_dequeue_signal_3arg_task], [ + AC_MSG_CHECKING([whether 3-arg dequeue_signal() takes a type argument]) + ZFS_LINUX_TEST_RESULT([kthread_dequeue_signal_3arg_type], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_DEQUEUE_SIGNAL_3ARG_TASK, 1, - [dequeue_signal() takes a task argument]) + AC_DEFINE(HAVE_DEQUEUE_SIGNAL_3ARG_TYPE, 1, + [3-arg dequeue_signal() takes a type argument]) ], [ AC_MSG_RESULT(no) ]) @@ -56,27 +63,27 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_KTHREAD_COMPLETE_AND_EXIT], [ ]) AC_DEFUN([ZFS_AC_KERNEL_SRC_KTHREAD_DEQUEUE_SIGNAL], [ - ZFS_LINUX_TEST_SRC([kthread_dequeue_signal_3arg_task], [ + ZFS_LINUX_TEST_SRC([kthread_dequeue_signal_4arg], [ #include ], [ struct task_struct *task = NULL; sigset_t *mask = NULL; kernel_siginfo_t *info = NULL; + enum pid_type *type = NULL; int error __attribute__ ((unused)); - error = dequeue_signal(task, mask, info); + error = dequeue_signal(task, mask, info, type); ]) - ZFS_LINUX_TEST_SRC([kthread_dequeue_signal_4arg], [ + ZFS_LINUX_TEST_SRC([kthread_dequeue_signal_3arg_type], [ #include ], [ - struct task_struct *task = NULL; sigset_t *mask = NULL; kernel_siginfo_t *info = NULL; enum pid_type *type = NULL; int error __attribute__ ((unused)); - error = dequeue_signal(task, mask, info, type); + error = dequeue_signal(mask, info, type); ]) ]) diff --git a/config/kernel-vfs-invalidate_folio.m4 b/config/kernel-vfs-invalidate_folio.m4 deleted file mode 100644 index 61a5c8478af1..000000000000 --- a/config/kernel-vfs-invalidate_folio.m4 +++ /dev/null @@ -1,33 +0,0 @@ -dnl # -dnl # Linux 5.18 uses invalidate_folio in lieu of invalidate_page -dnl # -AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_INVALIDATE_FOLIO], [ - ZFS_LINUX_TEST_SRC([vfs_has_invalidate_folio], [ - #include - - static void - test_invalidate_folio(struct folio *folio, size_t offset, - size_t len) { - (void) folio; (void) offset; (void) len; - return; - } - - static const struct address_space_operations - aops __attribute__ ((unused)) = { - .invalidate_folio = test_invalidate_folio, - }; - ],[]) -]) - -AC_DEFUN([ZFS_AC_KERNEL_VFS_INVALIDATE_FOLIO], [ - dnl # - dnl # Linux 5.18 uses invalidate_folio in lieu of invalidate_page - dnl # - AC_MSG_CHECKING([whether invalidate_folio exists]) - ZFS_LINUX_TEST_RESULT([vfs_has_invalidate_folio], [ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_VFS_INVALIDATE_FOLIO, 1, [invalidate_folio exists]) - ],[ - AC_MSG_RESULT([no]) - ]) -]) diff --git a/config/kernel-vfs-migrate_folio.m4 b/config/kernel-vfs-migrate_folio.m4 new file mode 100644 index 000000000000..186cd0581a17 --- /dev/null +++ b/config/kernel-vfs-migrate_folio.m4 @@ -0,0 +1,27 @@ +dnl # +dnl # Linux 6.0 uses migrate_folio in lieu of migrate_page +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_MIGRATE_FOLIO], [ + ZFS_LINUX_TEST_SRC([vfs_has_migrate_folio], [ + #include + #include + + static const 
struct address_space_operations + aops __attribute__ ((unused)) = { + .migrate_folio = migrate_folio, + }; + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_VFS_MIGRATE_FOLIO], [ + dnl # + dnl # Linux 6.0 uses migrate_folio in lieu of migrate_page + dnl # + AC_MSG_CHECKING([whether migrate_folio exists]) + ZFS_LINUX_TEST_RESULT([vfs_has_migrate_folio], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_VFS_MIGRATE_FOLIO, 1, [migrate_folio exists]) + ],[ + AC_MSG_RESULT([no]) + ]) +]) diff --git a/config/kernel-vfs-release_folio.m4 b/config/kernel-vfs-release_folio.m4 deleted file mode 100644 index f31db5677fd3..000000000000 --- a/config/kernel-vfs-release_folio.m4 +++ /dev/null @@ -1,32 +0,0 @@ -dnl # -dnl # Linux 5.19 uses release_folio in lieu of releasepage -dnl # -AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_RELEASE_FOLIO], [ - ZFS_LINUX_TEST_SRC([vfs_has_release_folio], [ - #include - - static bool - test_release_folio(struct folio *folio, gfp_t gfp) { - (void) folio; (void) gfp; - return (0); - } - - static const struct address_space_operations - aops __attribute__ ((unused)) = { - .release_folio = test_release_folio, - }; - ],[]) -]) - -AC_DEFUN([ZFS_AC_KERNEL_VFS_RELEASE_FOLIO], [ - dnl # - dnl # Linux 5.19 uses release_folio in lieu of releasepage - dnl # - AC_MSG_CHECKING([whether release_folio exists]) - ZFS_LINUX_TEST_RESULT([vfs_has_release_folio], [ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_VFS_RELEASE_FOLIO, 1, [release_folio exists]) - ],[ - AC_MSG_RESULT([no]) - ]) -]) diff --git a/config/kernel.m4 b/config/kernel.m4 index 556df58082f9..78f178ff27ac 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -77,8 +77,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_SGET ZFS_AC_KERNEL_SRC_VFS_FILEMAP_DIRTY_FOLIO ZFS_AC_KERNEL_SRC_VFS_READ_FOLIO - ZFS_AC_KERNEL_SRC_VFS_RELEASE_FOLIO - ZFS_AC_KERNEL_SRC_VFS_INVALIDATE_FOLIO + ZFS_AC_KERNEL_SRC_VFS_MIGRATE_FOLIO ZFS_AC_KERNEL_SRC_VFS_FSYNC_2ARGS ZFS_AC_KERNEL_SRC_VFS_DIRECT_IO ZFS_AC_KERNEL_SRC_VFS_READPAGES @@ -189,8 +188,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_SGET ZFS_AC_KERNEL_VFS_FILEMAP_DIRTY_FOLIO ZFS_AC_KERNEL_VFS_READ_FOLIO - ZFS_AC_KERNEL_VFS_RELEASE_FOLIO - ZFS_AC_KERNEL_VFS_INVALIDATE_FOLIO + ZFS_AC_KERNEL_VFS_MIGRATE_FOLIO ZFS_AC_KERNEL_VFS_FSYNC_2ARGS ZFS_AC_KERNEL_VFS_DIRECT_IO ZFS_AC_KERNEL_VFS_READPAGES diff --git a/contrib/initramfs/scripts/zfs b/contrib/initramfs/scripts/zfs index 0a2bd2efda7a..c569b2528368 100644 --- a/contrib/initramfs/scripts/zfs +++ b/contrib/initramfs/scripts/zfs @@ -344,7 +344,7 @@ mount_fs() # Need the _original_ datasets mountpoint! mountpoint=$(get_fs_value "$fs" mountpoint) - ZFS_CMD="mount -o zfsutil -t zfs" + ZFS_CMD="mount.zfs -o zfsutil" if [ "$mountpoint" = "legacy" ] || [ "$mountpoint" = "none" ]; then # Can't use the mountpoint property. Might be one of our # clones. Check the 'org.zol:mountpoint' property set in @@ -359,9 +359,8 @@ mount_fs() # isn't the root fs. return 0 fi - # Don't use mount.zfs -o zfsutils for legacy mountpoint if [ "$mountpoint" = "legacy" ]; then - ZFS_CMD="mount -t zfs" + ZFS_CMD="mount.zfs" fi # Last hail-mary: Hope 'rootmnt' is set! mountpoint="" diff --git a/include/os/freebsd/linux/compiler.h b/include/os/freebsd/linux/compiler.h index b408b77c746d..24f09c722158 100644 --- a/include/os/freebsd/linux/compiler.h +++ b/include/os/freebsd/linux/compiler.h @@ -1,10 +1,5 @@ /* - * Copyright (c) 2010 Isilon Systems, Inc. - * Copyright (c) 2010 iXsystems, Inc. - * Copyright (c) 2010 Panasas, Inc. - * Copyright (c) 2013-2016 Mellanox Technologies, Ltd. 
- * Copyright (c) 2015 François Tigeot - * All rights reserved. + * Copyright (c) 2024 Warner Losh. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -26,76 +21,14 @@ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * $FreeBSD$ */ -#ifndef _LINUX_COMPILER_H_ -#define _LINUX_COMPILER_H_ - -#include - -#define __user -#define __kernel -#define __safe -#define __force -#define __nocast -#define __iomem -#define __chk_user_ptr(x) ((void)0) -#define __chk_io_ptr(x) ((void)0) -#define __builtin_warning(x, y...) (1) -#define __acquires(x) -#define __releases(x) -#define __acquire(x) do { } while (0) -#define __release(x) do { } while (0) -#define __cond_lock(x, c) (c) -#define __bitwise -#define __devinitdata -#define __deprecated -#define __init -#define __initconst -#define __devinit -#define __devexit -#define __exit -#define __rcu -#define __percpu -#define __weak __weak_symbol -#define __malloc -#define ___stringify(...) #__VA_ARGS__ -#define __stringify(...) ___stringify(__VA_ARGS__) -#define __attribute_const__ __attribute__((__const__)) -#undef __always_inline -#define __always_inline inline -#define noinline __noinline -#define ____cacheline_aligned __aligned(CACHE_LINE_SIZE) -#define zfs_fallthrough __attribute__((__fallthrough__)) - -#if !defined(_KERNEL) && !defined(_STANDALONE) -#define likely(x) __builtin_expect(!!(x), 1) -#define unlikely(x) __builtin_expect(!!(x), 0) -#endif -#define typeof(x) __typeof(x) - -#define uninitialized_var(x) x = x -#define __maybe_unused __unused -#define __always_unused __unused -#define __must_check __result_use_check - -#define __printf(a, b) __printflike(a, b) -#define barrier() __asm__ __volatile__("": : :"memory") -#define ___PASTE(a, b) a##b -#define __PASTE(a, b) ___PASTE(a, b) - -#define ACCESS_ONCE(x) (*(volatile __typeof(x) *)&(x)) - -#define WRITE_ONCE(x, v) do { \ - barrier(); \ - ACCESS_ONCE(x) = (v); \ - barrier(); \ -} while (0) - -#define lockless_dereference(p) READ_ONCE(p) +/* + * FreeBSD's LinuxKPI compiler.h as far back as FreeBSD 12 has what we need, + * except zfs_fallthrough. 
+ */ +#pragma once -#define _AT(T, X) ((T)(X)) +#include -#endif /* _LINUX_COMPILER_H_ */ +#define zfs_fallthrough __attribute__((__fallthrough__)) diff --git a/include/os/freebsd/spl/sys/ccompat.h b/include/os/freebsd/spl/sys/ccompat.h index 48749fb8eea2..07b3515ad964 100644 --- a/include/os/freebsd/spl/sys/ccompat.h +++ b/include/os/freebsd/spl/sys/ccompat.h @@ -70,15 +70,6 @@ hlist_del(struct hlist_node *n) n->next->pprev = n->pprev; } /* BEGIN CSTYLED */ -#define READ_ONCE(x) ({ \ - __typeof(x) __var = ({ \ - barrier(); \ - ACCESS_ONCE(x); \ - }); \ - barrier(); \ - __var; \ -}) - #define HLIST_HEAD_INIT { } #define HLIST_HEAD(name) struct hlist_head name = HLIST_HEAD_INIT #define INIT_HLIST_HEAD(head) (head)->first = NULL diff --git a/include/os/freebsd/spl/sys/debug.h b/include/os/freebsd/spl/sys/debug.h index f041dde34fc8..9eb424dd0373 100644 --- a/include/os/freebsd/spl/sys/debug.h +++ b/include/os/freebsd/spl/sys/debug.h @@ -95,10 +95,6 @@ spl_assert(const char *buf, const char *file, const char *func, int line) #ifndef expect #define expect(expr, value) (__builtin_expect((expr), (value))) #endif -#ifndef __linux__ -#define likely(expr) expect((expr) != 0, 1) -#define unlikely(expr) expect((expr) != 0, 0) -#endif #define PANIC(fmt, a...) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, fmt, ## a) diff --git a/include/os/freebsd/spl/sys/simd.h b/include/os/freebsd/spl/sys/simd.h index 6bc46755c4e3..d16e1db5e826 100644 --- a/include/os/freebsd/spl/sys/simd.h +++ b/include/os/freebsd/spl/sys/simd.h @@ -50,7 +50,7 @@ #define kfpu_fini() do {} while (0) #endif -#define simd_stat_init() 0 -#define simd_stat_fini() 0 +#define simd_stat_init() do {} while (0) +#define simd_stat_fini() do {} while (0) #endif diff --git a/include/os/linux/spl/sys/taskq.h b/include/os/linux/spl/sys/taskq.h index f63a397f293d..4cb3a055c3c4 100644 --- a/include/os/linux/spl/sys/taskq.h +++ b/include/os/linux/spl/sys/taskq.h @@ -38,8 +38,7 @@ #include #include #include - -typedef struct kstat_s kstat_t; +#include #define TASKQ_NAMELEN 31 diff --git a/include/os/linux/spl/sys/thread.h b/include/os/linux/spl/sys/thread.h index 4f7f659e528d..c7ef7efa0a25 100644 --- a/include/os/linux/spl/sys/thread.h +++ b/include/os/linux/spl/sys/thread.h @@ -42,7 +42,7 @@ #define TS_ZOMB EXIT_ZOMBIE #define TS_STOPPED TASK_STOPPED -typedef void (*thread_func_t)(void *); +typedef void (*thread_func_t)(void *) __attribute__((noreturn)); #define thread_create_named(name, stk, stksize, func, arg, len, \ pp, state, pri) \ diff --git a/include/os/linux/zfs/sys/abd_os.h b/include/os/linux/zfs/sys/abd_os.h index 606e8bf682e8..3eed968e90c0 100644 --- a/include/os/linux/zfs/sys/abd_os.h +++ b/include/os/linux/zfs/sys/abd_os.h @@ -30,6 +30,8 @@ extern "C" { #endif +struct abd; + struct abd_scatter { uint_t abd_offset; uint_t abd_nents; @@ -41,10 +43,8 @@ struct abd_linear { struct scatterlist *abd_sgl; /* for LINEAR_PAGE */ }; -typedef struct abd abd_t; - typedef int abd_iter_page_func_t(struct page *, size_t, size_t, void *); -int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *, +int abd_iterate_page_func(struct abd *, size_t, size_t, abd_iter_page_func_t *, void *); /* @@ -52,11 +52,11 @@ int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *, * Note: these are only needed to support vdev_classic. See comment in * vdev_disk.c. 
*/ -unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t); -unsigned long abd_nr_pages_off(abd_t *, unsigned int, size_t); +unsigned int abd_bio_map_off(struct bio *, struct abd *, unsigned int, size_t); +unsigned long abd_nr_pages_off(struct abd *, unsigned int, size_t); __attribute__((malloc)) -abd_t *abd_alloc_from_pages(struct page **, unsigned long, uint64_t); +struct abd *abd_alloc_from_pages(struct page **, unsigned long, uint64_t); #ifdef __cplusplus } diff --git a/include/os/linux/zfs/sys/zfs_vfsops_os.h b/include/os/linux/zfs/sys/zfs_vfsops_os.h index 7067eb17900d..30aa3a103d33 100644 --- a/include/os/linux/zfs/sys/zfs_vfsops_os.h +++ b/include/os/linux/zfs/sys/zfs_vfsops_os.h @@ -69,6 +69,7 @@ typedef struct vfs { boolean_t vfs_do_relatime; boolean_t vfs_nbmand; boolean_t vfs_do_nbmand; + kmutex_t vfs_mntpt_lock; } vfs_t; typedef struct zfs_mnt { diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index 56741cd2a58b..e69464809a42 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -171,7 +171,6 @@ typedef struct dbuf_dirty_record { * gets COW'd in a subsequent transaction group. */ arc_buf_t *dr_data; - blkptr_t dr_overridden_by; override_states_t dr_override_state; uint8_t dr_copies; boolean_t dr_nopwrite; @@ -179,14 +178,21 @@ typedef struct dbuf_dirty_record { boolean_t dr_diowrite; boolean_t dr_has_raw_params; - /* - * If dr_has_raw_params is set, the following crypt - * params will be set on the BP that's written. - */ - boolean_t dr_byteorder; - uint8_t dr_salt[ZIO_DATA_SALT_LEN]; - uint8_t dr_iv[ZIO_DATA_IV_LEN]; - uint8_t dr_mac[ZIO_DATA_MAC_LEN]; + /* Override and raw params are mutually exclusive. */ + union { + blkptr_t dr_overridden_by; + struct { + /* + * If dr_has_raw_params is set, the + * following crypt params will be set + * on the BP that's written. + */ + boolean_t dr_byteorder; + uint8_t dr_salt[ZIO_DATA_SALT_LEN]; + uint8_t dr_iv[ZIO_DATA_IV_LEN]; + uint8_t dr_mac[ZIO_DATA_MAC_LEN]; + }; + }; } dl; struct dirty_lightweight_leaf { /* @@ -264,6 +270,27 @@ typedef struct dmu_buf_impl { */ uint8_t db_level; + /* This block was freed while a read or write was active. */ + uint8_t db_freed_in_flight; + + /* + * Evict user data as soon as the dirty and reference counts are equal. + */ + uint8_t db_user_immediate_evict; + + /* + * dnode_evict_dbufs() or dnode_evict_bonus() tried to evict this dbuf, + * but couldn't due to outstanding references. Evict once the refcount + * drops to 0. + */ + uint8_t db_pending_evict; + + /* Number of TXGs in which this buffer is dirty. */ + uint8_t db_dirtycnt; + + /* The buffer was partially read. More reads may follow. */ + uint8_t db_partial_read; + /* * Protects db_buf's contents if they contain an indirect block or data * block of the meta-dnode. We use this lock to protect the structure of @@ -288,6 +315,9 @@ typedef struct dmu_buf_impl { */ dbuf_states_t db_state; + /* In which dbuf cache this dbuf is, if any. */ + dbuf_cached_state_t db_caching_status; + /* * Refcount accessed by dmu_buf_{hold,rele}. * If nonzero, the buffer can't be destroyed. @@ -304,39 +334,10 @@ typedef struct dmu_buf_impl { /* Link in dbuf_cache or dbuf_metadata_cache */ multilist_node_t db_cache_link; - /* Tells us which dbuf cache this dbuf is in, if any */ - dbuf_cached_state_t db_caching_status; - uint64_t db_hash; - /* Data which is unique to data (leaf) blocks: */ - /* User callback information. */ dmu_buf_user_t *db_user; - - /* - * Evict user data as soon as the dirty and reference - * counts are equal. 
- */ - uint8_t db_user_immediate_evict; - - /* - * This block was freed while a read or write was - * active. - */ - uint8_t db_freed_in_flight; - - /* - * dnode_evict_dbufs() or dnode_evict_bonus() tried to - * evict this dbuf, but couldn't due to outstanding - * references. Evict once the refcount drops to 0. - */ - uint8_t db_pending_evict; - - uint8_t db_dirtycnt; - - /* The buffer was partially read. More reads may follow. */ - uint8_t db_partial_read; } dmu_buf_impl_t; #define DBUF_HASH_MUTEX(h, idx) \ @@ -351,6 +352,8 @@ typedef struct dbuf_hash_table { typedef void (*dbuf_prefetch_fn)(void *, uint64_t, uint64_t, boolean_t); +extern kmem_cache_t *dbuf_dirty_kmem_cache; + uint64_t dbuf_whichblock(const struct dnode *di, const int64_t level, const uint64_t offset); diff --git a/lib/libspl/backtrace.c b/lib/libspl/backtrace.c index d26d742106e2..6e8b3b12122d 100644 --- a/lib/libspl/backtrace.c +++ b/lib/libspl/backtrace.c @@ -25,19 +25,32 @@ #include #include +#include #include /* - * libspl_backtrace() must be safe to call from inside a signal hander. This - * mostly means it must not allocate, and so we can't use things like printf. + * Output helpers. libspl_backtrace() must not block, must be thread-safe and + * must be safe to call from a signal handler. At least, that means not having + * printf, so we end up having to call write() directly on the fd. That's + * awkward, as we always have to pass through a length, and some systems will + * complain if we don't consume the return. So we have some macros to make + * things a little more palatable. */ +#define spl_bt_write_n(fd, s, n) \ + do { ssize_t r __maybe_unused = write(fd, s, n); } while (0) +#define spl_bt_write(fd, s) spl_bt_write_n(fd, s, sizeof (s)) #if defined(HAVE_LIBUNWIND) #define UNW_LOCAL_ONLY #include +/* + * Convert `v` to ASCII hex characters. The bottom `n` nybbles (4-bits ie one + * hex digit) will be written, up to `buflen`. The buffer will not be + * null-terminated. Returns the number of digits written. + */ static size_t -libspl_u64_to_hex_str(uint64_t v, size_t digits, char *buf, size_t buflen) +spl_bt_u64_to_hex_str(uint64_t v, size_t n, char *buf, size_t buflen) { static const char hexdigits[] = { '0', '1', '2', '3', '4', '5', '6', '7', @@ -45,10 +58,10 @@ libspl_u64_to_hex_str(uint64_t v, size_t digits, char *buf, size_t buflen) }; size_t pos = 0; - boolean_t want = (digits == 0); + boolean_t want = (n == 0); for (int i = 15; i >= 0; i--) { const uint64_t d = v >> (i * 4) & 0xf; - if (!want && (d != 0 || digits > i)) + if (!want && (d != 0 || n > i)) want = B_TRUE; if (want) { buf[pos++] = hexdigits[d]; @@ -62,40 +75,181 @@ libspl_u64_to_hex_str(uint64_t v, size_t digits, char *buf, size_t buflen) void libspl_backtrace(int fd) { - ssize_t ret __attribute__((unused)); unw_context_t uc; unw_cursor_t cp; - unw_word_t loc; + unw_word_t v; char buf[128]; size_t n; + int err; - ret = write(fd, "Call trace:\n", 12); + /* Snapshot the current frame and state. */ unw_getcontext(&uc); + + /* + * TODO: walk back to the frame that tripped the assertion / the place + * where the signal was recieved. + */ + + /* + * Register dump. We're going to loop over all the registers in the + * top frame, and show them, with names, in a nice three-column + * layout, which keeps us within 80 columns. 
+ */ + spl_bt_write(fd, "Registers:\n"); + + /* Initialise a frame cursor, starting at the current frame */ unw_init_local(&cp, &uc); - while (unw_step(&cp) > 0) { - unw_get_reg(&cp, UNW_REG_IP, &loc); - ret = write(fd, " [0x", 5); - n = libspl_u64_to_hex_str(loc, 10, buf, sizeof (buf)); - ret = write(fd, buf, n); - ret = write(fd, "] ", 2); - unw_get_proc_name(&cp, buf, sizeof (buf), &loc); - for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {} - ret = write(fd, buf, n); - ret = write(fd, "+0x", 3); - n = libspl_u64_to_hex_str(loc, 2, buf, sizeof (buf)); - ret = write(fd, buf, n); + + /* + * libunwind's list of possible registers for this architecture is an + * enum, unw_regnum_t. UNW_TDEP_LAST_REG is the highest-numbered + * register in that list, however, not all register numbers in this + * range are defined by the architecture, and not all defined registers + * will be present on every implementation of that architecture. + * Moreover, libunwind provides nice names for most, but not all + * registers, but these are hardcoded; a name being available does not + * mean that register is available. + * + * So, we have to pull this all together here. We try to get the value + * of every possible register. If we get a value for it, then the + * register must exist, and so we get its name. If libunwind has no + * name for it, we synthesize something. These cases should be rare, + * and they're usually for uninteresting or niche registers, so it + * shouldn't really matter. We can see the value, and that's the main + * thing. + */ + uint_t cols = 0; + for (uint_t regnum = 0; regnum <= UNW_TDEP_LAST_REG; regnum++) { + /* + * Get the value. Any error probably means the register + * doesn't exist, and we skip it. + */ + if (unw_get_reg(&cp, regnum, &v) < 0) + continue; + + /* + * Register name. If libunwind doesn't have a name for it, + * it will return "???". As a shortcut, we just treat '?' + * is an alternate end-of-string character. + */ + const char *name = unw_regname(regnum); + for (n = 0; name[n] != '\0' && name[n] != '?'; n++) {} + if (n == 0) { + /* + * No valid name, so make one of the form "?xx", where + * "xx" is the two-char hex of libunwind's register + * number. + */ + buf[0] = '?'; + n = spl_bt_u64_to_hex_str(regnum, 2, + &buf[1], sizeof (buf)-1) + 1; + name = buf; + } + + /* + * Two spaces of padding before each column, plus extra + * spaces to align register names shorter than three chars. + */ + spl_bt_write_n(fd, " ", 5-MIN(n, 3)); + + /* Register name and column punctuation */ + spl_bt_write_n(fd, name, n); + spl_bt_write(fd, ": 0x"); + + /* + * Convert register value (from unw_get_reg()) to hex. We're + * assuming that all registers are 64-bits wide, which is + * probably fine for any general-purpose registers on any + * machine currently in use. A more generic way would be to + * look at the width of unw_word_t, but that would also + * complicate the column code a bit. This is fine. + */ + n = spl_bt_u64_to_hex_str(v, 16, buf, sizeof (buf)); + spl_bt_write_n(fd, buf, n); + + /* Every third column, emit a newline */ + if (!(++cols % 3)) + spl_bt_write(fd, "\n"); + } + + /* If we finished before the third column, emit a newline. */ + if (cols % 3) + spl_bt_write(fd, "\n"); + + /* Now the main event, the backtrace. */ + spl_bt_write(fd, "Call trace:\n"); + + /* Reset the cursor to the top again. */ + unw_init_local(&cp, &uc); + + do { + /* + * Getting the IP should never fail; libunwind handles it + * specially, because its used a lot internally. 
Still, no + * point being silly about it, as the last thing we want is + * our crash handler to crash. So if it ever does fail, we'll + * show an error line, but keep going to the next frame. + */ + if (unw_get_reg(&cp, UNW_REG_IP, &v) < 0) { + spl_bt_write(fd, " [couldn't get IP register; " + "corrupt frame?]"); + continue; + } + + /* IP & punctuation */ + n = spl_bt_u64_to_hex_str(v, 16, buf, sizeof (buf)); + spl_bt_write(fd, " [0x"); + spl_bt_write_n(fd, buf, n); + spl_bt_write(fd, "] "); + + /* + * Function ("procedure") name for the current frame. `v` + * receives the offset from the named function to the IP, which + * we show as a "+offset" suffix. + * + * If libunwind can't determine the name, we just show "???" + * instead. We've already displayed the IP above; that will + * have to do. + * + * unw_get_proc_name() will return ENOMEM if the buffer is too + * small, instead truncating the name. So we treat that as a + * success and use whatever is in the buffer. + */ + err = unw_get_proc_name(&cp, buf, sizeof (buf), &v); + if (err == 0 || err == -UNW_ENOMEM) { + for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {} + spl_bt_write_n(fd, buf, n); + + /* Offset from proc name */ + spl_bt_write(fd, "+0x"); + n = spl_bt_u64_to_hex_str(v, 2, buf, sizeof (buf)); + spl_bt_write_n(fd, buf, n); + } else + spl_bt_write(fd, "???"); + #ifdef HAVE_LIBUNWIND_ELF - ret = write(fd, " (in ", 5); - unw_get_elf_filename(&cp, buf, sizeof (buf), &loc); - for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {} - ret = write(fd, buf, n); - ret = write(fd, " +0x", 4); - n = libspl_u64_to_hex_str(loc, 2, buf, sizeof (buf)); - ret = write(fd, buf, n); - ret = write(fd, ")", 1); + /* + * Newer libunwind has unw_get_elf_filename(), which gets + * the name of the ELF object that the frame was executing in. + * Like `unw_get_proc_name()`, `v` recieves the offset within + * the file, and UNW_ENOMEM indicates that a truncate filename + * was left in the buffer. 
+ */ + err = unw_get_elf_filename(&cp, buf, sizeof (buf), &v); + if (err == 0 || err == -UNW_ENOMEM) { + for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {} + spl_bt_write(fd, " (in "); + spl_bt_write_n(fd, buf, n); + + /* Offset within file */ + spl_bt_write(fd, " +0x"); + n = spl_bt_u64_to_hex_str(v, 2, buf, sizeof (buf)); + spl_bt_write_n(fd, buf, n); + spl_bt_write(fd, ")"); + } #endif - ret = write(fd, "\n", 1); - } + spl_bt_write(fd, "\n"); + } while (unw_step(&cp) > 0); } #elif defined(HAVE_BACKTRACE) #include @@ -103,15 +257,12 @@ libspl_backtrace(int fd) void libspl_backtrace(int fd) { - ssize_t ret __attribute__((unused)); void *btptrs[64]; size_t nptrs = backtrace(btptrs, 64); - ret = write(fd, "Call trace:\n", 12); + spl_bt_write(fd, "Call trace:\n"); backtrace_symbols_fd(btptrs, nptrs, fd); } #else -#include - void libspl_backtrace(int fd __maybe_unused) { diff --git a/lib/libzpool/zfs_debug.c b/lib/libzpool/zfs_debug.c index df49a9a33fe8..82c7229932f0 100644 --- a/lib/libzpool/zfs_debug.c +++ b/lib/libzpool/zfs_debug.c @@ -35,9 +35,25 @@ typedef struct zfs_dbgmsg { static list_t zfs_dbgmsgs; static kmutex_t zfs_dbgmsgs_lock; +static uint_t zfs_dbgmsg_size = 0; +static uint_t zfs_dbgmsg_maxsize = 4<<20; /* 4MB */ int zfs_dbgmsg_enable = B_TRUE; +static void +zfs_dbgmsg_purge(uint_t max_size) +{ + while (zfs_dbgmsg_size > max_size) { + zfs_dbgmsg_t *zdm = list_remove_head(&zfs_dbgmsgs); + if (zdm == NULL) + return; + + uint_t size = zdm->zdm_size; + kmem_free(zdm, size); + zfs_dbgmsg_size -= size; + } +} + void zfs_dbgmsg_init(void) { @@ -74,6 +90,8 @@ __zfs_dbgmsg(char *buf) mutex_enter(&zfs_dbgmsgs_lock); list_insert_tail(&zfs_dbgmsgs, zdm); + zfs_dbgmsg_size += size; + zfs_dbgmsg_purge(zfs_dbgmsg_maxsize); mutex_exit(&zfs_dbgmsgs_lock); } diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index c9f6ed0dece3..da027798f962 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -18,7 +18,7 @@ .\" .\" Copyright (c) 2024, Klara, Inc. .\" -.Dd October 2, 2024 +.Dd November 1, 2024 .Dt ZFS 4 .Os . @@ -1333,9 +1333,10 @@ results in vector instructions from the respective CPU instruction set being used. . .It Sy zfs_bclone_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int -Enable the experimental block cloning feature. +Enables access to the block cloning feature. If this setting is 0, then even if feature@block_cloning is enabled, -attempts to clone blocks will act as though the feature is disabled. +using functions and system calls that attempt to clone blocks will act as +though the feature is disabled. . .It Sy zfs_bclone_wait_dirty Ns = Ns Sy 0 Ns | Ns 1 Pq int When set to 1 the FICLONE and FICLONERANGE ioctls wait for dirty data to be diff --git a/man/man8/zdb.8 b/man/man8/zdb.8 index 08f5a3f70040..ae35454ad083 100644 --- a/man/man8/zdb.8 +++ b/man/man8/zdb.8 @@ -14,7 +14,7 @@ .\" Copyright (c) 2017 Lawrence Livermore National Security, LLC. .\" Copyright (c) 2017 Intel Corporation. .\" -.Dd November 18, 2023 +.Dd October 27, 2024 .Dt ZDB 8 .Os . @@ -408,6 +408,8 @@ blocks cloned, the space saving as a result of cloning, and the saving ratio. .It Fl TT Display the per-vdev BRT statistics, including total references. .It Fl TTT +Display histograms of per-vdev BRT refcounts. +.It Fl TTTT Dump the contents of the block reference tables. .It Fl u , -uberblock Display the current uberblock. 
diff --git a/module/os/freebsd/zfs/zio_crypt.c b/module/os/freebsd/zfs/zio_crypt.c index 2b62abcccb78..feaca93fb933 100644 --- a/module/os/freebsd/zfs/zio_crypt.c +++ b/module/os/freebsd/zfs/zio_crypt.c @@ -1686,11 +1686,10 @@ zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key, freebsd_crypt_session_t *tmpl = NULL; uint8_t *authbuf = NULL; - + memset(&puio_s, 0, sizeof (puio_s)); + memset(&cuio_s, 0, sizeof (cuio_s)); zfs_uio_init(&puio, &puio_s); zfs_uio_init(&cuio, &cuio_s); - memset(GET_UIO_STRUCT(&puio), 0, sizeof (struct uio)); - memset(GET_UIO_STRUCT(&cuio), 0, sizeof (struct uio)); #ifdef FCRYPTO_DEBUG printf("%s(%s, %p, %p, %d, %p, %p, %u, %s, %p, %p, %p)\n", diff --git a/module/os/linux/spl/spl-thread.c b/module/os/linux/spl/spl-thread.c index 7f74d44f91ff..7b0ce30c7884 100644 --- a/module/os/linux/spl/spl-thread.c +++ b/module/os/linux/spl/spl-thread.c @@ -171,11 +171,11 @@ issig(void) #if defined(HAVE_DEQUEUE_SIGNAL_4ARG) enum pid_type __type; if (dequeue_signal(current, &set, &__info, &__type) != 0) { -#elif defined(HAVE_DEQUEUE_SIGNAL_3ARG_TASK) - if (dequeue_signal(current, &set, &__info) != 0) { -#else +#elif defined(HAVE_DEQUEUE_SIGNAL_3ARG_TYPE) enum pid_type __type; if (dequeue_signal(&set, &__info, &__type) != 0) { +#else + if (dequeue_signal(current, &set, &__info) != 0) { #endif spin_unlock_irq(¤t->sighand->siglock); kernel_signal_stop(); diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c index 303af48cf3af..04ab8bbca352 100644 --- a/module/os/linux/zfs/abd_os.c +++ b/module/os/linux/zfs/abd_os.c @@ -701,6 +701,8 @@ abd_free_linear_page(abd_t *abd) /* When backed by user page unmap it */ if (abd_is_from_pages(abd)) zfs_kunmap(sg_page(sg)); + else + abd_update_scatter_stats(abd, ABDSTAT_DECR); abd->abd_flags &= ~ABD_FLAG_LINEAR; abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE; diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index a6271d3a7df1..6a66a72b91a9 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -801,24 +801,13 @@ vbio_completion(struct bio *bio) bio_put(bio); /* - * If we copied the ABD before issuing it, clean up and return the copy - * to the ADB, with changes if appropriate. + * We're likely in an interrupt context so we can't do ABD/memory work + * here; instead we stash vbio on the zio and take care of it in the + * done callback. */ - if (vbio->vbio_abd != NULL) { - void *buf = abd_to_buf(vbio->vbio_abd); - abd_free(vbio->vbio_abd); - vbio->vbio_abd = NULL; - - if (zio->io_type == ZIO_TYPE_READ) - abd_return_buf_copy(zio->io_abd, buf, zio->io_size); - else - abd_return_buf(zio->io_abd, buf, zio->io_size); - } - - /* Final cleanup */ - kmem_free(vbio, sizeof (vbio_t)); + ASSERT3P(zio->io_bio, ==, NULL); + zio->io_bio = vbio; - /* All done, submit for processing */ zio_delay_interrupt(zio); } @@ -834,38 +823,61 @@ vbio_completion(struct bio *bio) * NOTE: if you change this function, change the copy in * tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c, and add test * data there to validate the change you're making. 
- * */ typedef struct { - uint_t bmask; - uint_t npages; - uint_t end; -} vdev_disk_check_pages_t; + size_t blocksize; + int seen_first; + int seen_last; +} vdev_disk_check_alignment_t; static int -vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv) +vdev_disk_check_alignment_cb(struct page *page, size_t off, size_t len, + void *priv) { (void) page; - vdev_disk_check_pages_t *s = priv; + vdev_disk_check_alignment_t *s = priv; /* - * If we didn't finish on a block size boundary last time, then there - * would be a gap if we tried to use this ABD as-is, so abort. + * The cardinal rule: a single on-disk block must never cross an + * physical (order-0) page boundary, as the kernel expects to be able + * to split at both LBS and page boundaries. + * + * This implies various alignment rules for the blocks in this + * (possibly compound) page, which we can check for. */ - if (s->end != 0) - return (1); /* - * Note if we're taking less than a full block, so we can check it - * above on the next call. + * If the previous page did not end on a page boundary, then we + * can't proceed without creating a hole. */ - s->end = (off+len) & s->bmask; + if (s->seen_last) + return (1); - /* All blocks after the first must start on a block size boundary. */ - if (s->npages != 0 && (off & s->bmask) != 0) + /* This page must contain only whole LBS-sized blocks. */ + if (!IS_P2ALIGNED(len, s->blocksize)) return (1); - s->npages++; + /* + * If this is not the first page in the ABD, then the data must start + * on a page-aligned boundary (so the kernel can split on page + * boundaries without having to deal with a hole). If it is, then + * it can start on LBS-alignment. + */ + if (s->seen_first) { + if (!IS_P2ALIGNED(off, PAGESIZE)) + return (1); + } else { + if (!IS_P2ALIGNED(off, s->blocksize)) + return (1); + s->seen_first = 1; + } + + /* + * If this data does not end on a page-aligned boundary, then this + * must be the last page in the ABD, for the same reason. + */ + s->seen_last = !IS_P2ALIGNED(off+len, PAGESIZE); + return (0); } @@ -874,15 +886,14 @@ vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv) * the number of pages, or 0 if it can't be submitted like this. */ static boolean_t -vdev_disk_check_pages(abd_t *abd, uint64_t size, struct block_device *bdev) +vdev_disk_check_alignment(abd_t *abd, uint64_t size, struct block_device *bdev) { - vdev_disk_check_pages_t s = { - .bmask = bdev_logical_block_size(bdev)-1, - .npages = 0, - .end = 0, + vdev_disk_check_alignment_t s = { + .blocksize = bdev_logical_block_size(bdev), }; - if (abd_iterate_page_func(abd, 0, size, vdev_disk_check_pages_cb, &s)) + if (abd_iterate_page_func(abd, 0, size, + vdev_disk_check_alignment_cb, &s)) return (B_FALSE); return (B_TRUE); @@ -916,37 +927,32 @@ vdev_disk_io_rw(zio_t *zio) /* * Check alignment of the incoming ABD. If any part of it would require - * submitting a page that is not aligned to the logical block size, - * then we take a copy into a linear buffer and submit that instead. - * This should be impossible on a 512b LBS, and fairly rare on 4K, - * usually requiring abnormally-small data blocks (eg gang blocks) - * mixed into the same ABD as larger ones (eg aggregated). + * submitting a page that is not aligned to both the logical block size + * and the page size, then we take a copy into a new memory region with + * correct alignment. This should be impossible on a 512b LBS. 
On + * larger blocks, this can happen at least when a small number of + * blocks (usually 1) are allocated from a shared slab, or when + * abnormally-small data regions (eg gang headers) are mixed into the + * same ABD as larger allocations (eg aggregations). */ abd_t *abd = zio->io_abd; - if (!vdev_disk_check_pages(abd, zio->io_size, bdev)) { - void *buf; - if (zio->io_type == ZIO_TYPE_READ) - buf = abd_borrow_buf(zio->io_abd, zio->io_size); - else - buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size); + if (!vdev_disk_check_alignment(abd, zio->io_size, bdev)) { + /* Allocate a new memory region with guaranteed alignment */ + abd = abd_alloc_for_io(zio->io_size, + zio->io_abd->abd_flags & ABD_FLAG_META); - /* - * Wrap the copy in an abd_t, so we can use the same iterators - * to count and fill the vbio later. - */ - abd = abd_get_from_buf(buf, zio->io_size); + /* If we're writing copy our data into it */ + if (zio->io_type == ZIO_TYPE_WRITE) + abd_copy(abd, zio->io_abd, zio->io_size); /* - * False here would mean the borrowed copy has an invalid - * alignment too, which would mean we've somehow been passed a - * linear ABD with an interior page that has a non-zero offset - * or a size not a multiple of PAGE_SIZE. This is not possible. - * It would mean either zio_buf_alloc() or its underlying - * allocators have done something extremely strange, or our - * math in vdev_disk_check_pages() is wrong. In either case, + * False here would mean the new allocation has an invalid + * alignment too, which would mean that abd_alloc() is not + * guaranteeing this, or our logic in + * vdev_disk_check_alignment() is wrong. In either case, * something in seriously wrong and its not safe to continue. */ - VERIFY(vdev_disk_check_pages(abd, zio->io_size, bdev)); + VERIFY(vdev_disk_check_alignment(abd, zio->io_size, bdev)); } /* Allocate vbio, with a pointer to the borrowed ABD if necessary */ @@ -1437,6 +1443,28 @@ vdev_disk_io_start(zio_t *zio) static void vdev_disk_io_done(zio_t *zio) { + /* If this was a read or write, we need to clean up the vbio */ + if (zio->io_bio != NULL) { + vbio_t *vbio = zio->io_bio; + zio->io_bio = NULL; + + /* + * If we copied the ABD before issuing it, clean up and return + * the copy to the ADB, with changes if appropriate. + */ + if (vbio->vbio_abd != NULL) { + if (zio->io_type == ZIO_TYPE_READ) + abd_copy(zio->io_abd, vbio->vbio_abd, + zio->io_size); + + abd_free(vbio->vbio_abd); + vbio->vbio_abd = NULL; + } + + /* Final cleanup */ + kmem_free(vbio, sizeof (vbio_t)); + } + /* * If the device returned EIO, we revalidate the media. If it is * determined the media has changed this triggers the asynchronous diff --git a/module/os/linux/zfs/vdev_file.c b/module/os/linux/zfs/vdev_file.c index 4bffb6412ffd..2cab6532487a 100644 --- a/module/os/linux/zfs/vdev_file.c +++ b/module/os/linux/zfs/vdev_file.c @@ -33,11 +33,13 @@ #include #include #include -#include #include #include #ifdef _KERNEL #include +#include +#else +#include #endif /* * Virtual device vector for files. 
diff --git a/module/os/linux/zfs/zfs_ctldir.c b/module/os/linux/zfs/zfs_ctldir.c index 8a42a075cd25..f60d6ae91e0b 100644 --- a/module/os/linux/zfs/zfs_ctldir.c +++ b/module/os/linux/zfs/zfs_ctldir.c @@ -767,9 +767,6 @@ zfsctl_snapshot_path_objset(zfsvfs_t *zfsvfs, uint64_t objsetid, uint64_t id, pos = 0; int error = 0; - if (zfsvfs->z_vfs->vfs_mntpoint == NULL) - return (SET_ERROR(ENOENT)); - cookie = spl_fstrans_mark(); snapname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); @@ -786,8 +783,14 @@ zfsctl_snapshot_path_objset(zfsvfs_t *zfsvfs, uint64_t objsetid, break; } - snprintf(full_path, path_len, "%s/.zfs/snapshot/%s", - zfsvfs->z_vfs->vfs_mntpoint, snapname); + mutex_enter(&zfsvfs->z_vfs->vfs_mntpt_lock); + if (zfsvfs->z_vfs->vfs_mntpoint != NULL) { + snprintf(full_path, path_len, "%s/.zfs/snapshot/%s", + zfsvfs->z_vfs->vfs_mntpoint, snapname); + } else + error = SET_ERROR(ENOENT); + mutex_exit(&zfsvfs->z_vfs->vfs_mntpt_lock); + out: kmem_free(snapname, ZFS_MAX_DATASET_NAME_LEN); spl_fstrans_unmark(cookie); @@ -1049,6 +1052,66 @@ exportfs_flush(void) (void) call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); } +/* + * Returns the path in char format for given struct path. Uses + * d_path exported by kernel to convert struct path to char + * format. Returns the correct path for mountpoints and chroot + * environments. + * + * If chroot environment has directories that are mounted with + * --bind or --rbind flag, d_path returns the complete path inside + * chroot environment but does not return the absolute path, i.e. + * the path to chroot environment is missing. + */ +static int +get_root_path(struct path *path, char *buff, int len) +{ + char *path_buffer, *path_ptr; + int error = 0; + + path_get(path); + path_buffer = kmem_zalloc(len, KM_SLEEP); + path_ptr = d_path(path, path_buffer, len); + if (IS_ERR(path_ptr)) + error = SET_ERROR(-PTR_ERR(path_ptr)); + else + strcpy(buff, path_ptr); + + kmem_free(path_buffer, len); + path_put(path); + return (error); +} + +/* + * Returns if the current process root is chrooted or not. Linux + * kernel exposes the task_struct for current process and init. + * Since init process root points to actual root filesystem when + * Linux runtime is reached, we can compare the current process + * root with init process root to determine if root of the current + * process is different from init, which can reliably determine if + * current process is in chroot context or not. + */ +static int +is_current_chrooted(void) +{ + struct task_struct *curr = current, *global = &init_task; + struct path cr_root, gl_root; + + task_lock(curr); + get_fs_root(curr->fs, &cr_root); + task_unlock(curr); + + task_lock(global); + get_fs_root(global->fs, &gl_root); + task_unlock(global); + + int chrooted = !path_equal(&cr_root, &gl_root); + path_put(&gl_root); + path_put(&cr_root); + + return (chrooted); +} + /* * Attempt to unmount a snapshot by making a call to user space. 
* There is no assurance that this can or will succeed, is just a @@ -1123,14 +1186,50 @@ zfsctl_snapshot_mount(struct path *path, int flags) if (error) goto error; + if (is_current_chrooted() == 0) { + /* + * Current process is not in chroot context + */ + + char *m = kmem_zalloc(MAXPATHLEN, KM_SLEEP); + struct path mnt_path; + mnt_path.mnt = path->mnt; + mnt_path.dentry = path->mnt->mnt_root; + + /* + * Get path to current mountpoint + */ + error = get_root_path(&mnt_path, m, MAXPATHLEN); + if (error != 0) { + kmem_free(m, MAXPATHLEN); + goto error; + } + mutex_enter(&zfsvfs->z_vfs->vfs_mntpt_lock); + if (zfsvfs->z_vfs->vfs_mntpoint != NULL) { + /* + * If current mnountpoint and vfs_mntpoint are not same, + * store current mountpoint in vfs_mntpoint. + */ + if (strcmp(zfsvfs->z_vfs->vfs_mntpoint, m) != 0) { + kmem_strfree(zfsvfs->z_vfs->vfs_mntpoint); + zfsvfs->z_vfs->vfs_mntpoint = kmem_strdup(m); + } + } else + zfsvfs->z_vfs->vfs_mntpoint = kmem_strdup(m); + mutex_exit(&zfsvfs->z_vfs->vfs_mntpt_lock); + kmem_free(m, MAXPATHLEN); + } + /* * Construct a mount point path from sb of the ctldir inode and dirent * name, instead of from d_path(), so that chroot'd process doesn't fail * on mount.zfs(8). */ + mutex_enter(&zfsvfs->z_vfs->vfs_mntpt_lock); snprintf(full_path, MAXPATHLEN, "%s/.zfs/snapshot/%s", zfsvfs->z_vfs->vfs_mntpoint ? zfsvfs->z_vfs->vfs_mntpoint : "", dname(dentry)); + mutex_exit(&zfsvfs->z_vfs->vfs_mntpt_lock); snprintf(options, 7, "%s", zfs_snapshot_no_setuid ? "nosuid" : "suid"); diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c index de3e8c89cfdd..3c53a8a315c3 100644 --- a/module/os/linux/zfs/zfs_vfsops.c +++ b/module/os/linux/zfs/zfs_vfsops.c @@ -115,7 +115,7 @@ zfsvfs_vfs_free(vfs_t *vfsp) if (vfsp != NULL) { if (vfsp->vfs_mntpoint != NULL) kmem_strfree(vfsp->vfs_mntpoint); - + mutex_destroy(&vfsp->vfs_mntpt_lock); kmem_free(vfsp, sizeof (vfs_t)); } } @@ -197,10 +197,11 @@ zfsvfs_parse_option(char *option, int token, substring_t *args, vfs_t *vfsp) vfsp->vfs_do_nbmand = B_TRUE; break; case TOKEN_MNTPOINT: + if (vfsp->vfs_mntpoint != NULL) + kmem_strfree(vfsp->vfs_mntpoint); vfsp->vfs_mntpoint = match_strdup(&args[0]); if (vfsp->vfs_mntpoint == NULL) return (SET_ERROR(ENOMEM)); - break; default: break; @@ -219,6 +220,7 @@ zfsvfs_parse_options(char *mntopts, vfs_t **vfsp) int error; tmp_vfsp = kmem_zalloc(sizeof (vfs_t), KM_SLEEP); + mutex_init(&tmp_vfsp->vfs_mntpt_lock, NULL, MUTEX_DEFAULT, NULL); if (mntopts != NULL) { substring_t args[MAX_OPT_ARGS]; diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index 469197220859..dd9fd760b9c2 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -260,15 +260,6 @@ update_pages(znode_t *zp, int64_t start, int len, objset_t *os) } else { ClearPageError(pp); SetPageUptodate(pp); - if (!PagePrivate(pp)) { - /* - * Set private bit so page migration - * will wait for us to finish writeback - * before calling migrate_folio(). - */ - SetPagePrivate(pp); - get_page(pp); - } if (mapping_writably_mapped(mp)) flush_dcache_page(pp); @@ -4090,14 +4081,6 @@ zfs_fillpage(struct inode *ip, struct page *pp) } else { ClearPageError(pp); SetPageUptodate(pp); - if (!PagePrivate(pp)) { - /* - * Set private bit so page migration will wait for us to - * finish writeback before calling migrate_folio(). 
- */ - SetPagePrivate(pp); - get_page(pp); - } } return (error); diff --git a/module/os/linux/zfs/zfs_znode_os.c b/module/os/linux/zfs/zfs_znode_os.c index bc1e17f086d9..bbaca2f58394 100644 --- a/module/os/linux/zfs/zfs_znode_os.c +++ b/module/os/linux/zfs/zfs_znode_os.c @@ -1577,14 +1577,6 @@ zfs_zero_partial_page(znode_t *zp, uint64_t start, uint64_t len) mark_page_accessed(pp); SetPageUptodate(pp); ClearPageError(pp); - if (!PagePrivate(pp)) { - /* - * Set private bit so page migration will wait for us to - * finish writeback before calling migrate_folio(). - */ - SetPagePrivate(pp); - get_page(pp); - } unlock_page(pp); put_page(pp); } diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index 50c63695dcc8..f6e014327717 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -28,6 +28,7 @@ #include #endif #include +#include #include #include #include @@ -607,42 +608,6 @@ zpl_writepage(struct page *pp, struct writeback_control *wbc) return (zpl_putpage(pp, wbc, &for_sync)); } -static int -zpl_releasepage(struct page *pp, gfp_t gfp) -{ - if (PagePrivate(pp)) { - ClearPagePrivate(pp); - put_page(pp); - } - return (1); -} - -#ifdef HAVE_VFS_RELEASE_FOLIO -static bool -zpl_release_folio(struct folio *folio, gfp_t gfp) -{ - return (zpl_releasepage(&folio->page, gfp)); -} -#endif - -#ifdef HAVE_VFS_INVALIDATE_FOLIO -static void -zpl_invalidate_folio(struct folio *folio, size_t offset, size_t len) -{ - if ((offset == 0) && (len == PAGE_SIZE)) { - zpl_releasepage(&folio->page, 0); - } -} -#else -static void -zpl_invalidatepage(struct page *pp, unsigned int offset, unsigned int len) -{ - if ((offset == 0) && (len == PAGE_SIZE)) { - zpl_releasepage(pp, 0); - } -} -#endif - /* * The flag combination which matches the behavior of zfs_space() is * FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE. The FALLOC_FL_PUNCH_HOLE @@ -1126,15 +1091,10 @@ const struct address_space_operations zpl_address_space_operations = { #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO .dirty_folio = filemap_dirty_folio, #endif -#ifdef HAVE_VFS_RELEASE_FOLIO - .release_folio = zpl_release_folio, -#else - .releasepage = zpl_releasepage, -#endif -#ifdef HAVE_VFS_INVALIDATE_FOLIO - .invalidate_folio = zpl_invalidate_folio, +#ifdef HAVE_VFS_MIGRATE_FOLIO + .migrate_folio = migrate_folio, #else - .invalidatepage = zpl_invalidatepage, + .migratepage = migrate_page, #endif }; diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index df9368fc8bdb..b1419d96f4ef 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -182,6 +182,7 @@ static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr); * Global data structures and functions for the dbuf cache. 
*/ static kmem_cache_t *dbuf_kmem_cache; +kmem_cache_t *dbuf_dirty_kmem_cache; static taskq_t *dbu_evict_taskq; static kthread_t *dbuf_cache_evict_thread; @@ -966,6 +967,8 @@ dbuf_init(void) dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t", sizeof (dmu_buf_impl_t), 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); + dbuf_dirty_kmem_cache = kmem_cache_create("dbuf_dirty_record_t", + sizeof (dbuf_dirty_record_t), 0, NULL, NULL, NULL, NULL, NULL, 0); for (int i = 0; i < hmsize; i++) mutex_init(&h->hash_mutexes[i], NULL, MUTEX_NOLOCKDEP, NULL); @@ -1041,6 +1044,7 @@ dbuf_fini(void) sizeof (kmutex_t)); kmem_cache_destroy(dbuf_kmem_cache); + kmem_cache_destroy(dbuf_dirty_kmem_cache); taskq_destroy(dbu_evict_taskq); mutex_enter(&dbuf_evict_lock); @@ -2343,7 +2347,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) * to make a copy of it so that the changes we make in this * transaction group won't leak out when we sync the older txg. */ - dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); + dr = kmem_cache_alloc(dbuf_dirty_kmem_cache, KM_SLEEP); + memset(dr, 0, sizeof (*dr)); list_link_init(&dr->dr_dirty_node); list_link_init(&dr->dr_dbuf_node); dr->dr_dnode = dn; @@ -2526,7 +2531,7 @@ dbuf_undirty_bonus(dbuf_dirty_record_t *dr) mutex_destroy(&dr->dt.di.dr_mtx); list_destroy(&dr->dt.di.dr_children); } - kmem_free(dr, sizeof (dbuf_dirty_record_t)); + kmem_cache_free(dbuf_dirty_kmem_cache, dr); ASSERT3U(db->db_dirtycnt, >, 0); db->db_dirtycnt -= 1; } @@ -2616,7 +2621,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) } } - kmem_free(dr, sizeof (dbuf_dirty_record_t)); + kmem_cache_free(dbuf_dirty_kmem_cache, dr); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; @@ -2941,7 +2946,7 @@ dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder, * (see dbuf_sync_dnode_leaf_crypt()). 
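For context on the dbuf.c change above: dirty records now come from a dedicated dbuf_dirty_kmem_cache rather than kmem_zalloc(), so the explicit memset() restores the zero-fill the old allocation provided implicitly. A minimal userspace analogue of that trade-off, with hypothetical names (obj_cache_*) and assuming objects are at least pointer-sized:

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

/*
 * Hypothetical fixed-size object cache: freed objects sit on a free list
 * and are handed back with stale contents, so callers that relied on
 * zalloc semantics must clear them explicitly.
 */
typedef struct obj_cache {
        size_t  size;           /* object size, >= sizeof (void *) */
        void    *free_list;     /* freed objects, linked via first word */
} obj_cache_t;

static void *
obj_cache_alloc(obj_cache_t *c)
{
        void *p = c->free_list;

        if (p != NULL)
                c->free_list = *(void **)p;
        else
                p = malloc(c->size);
        if (p != NULL)
                memset(p, 0, c->size); /* stands in for the added memset() */
        return (p);
}

static void
obj_cache_free(obj_cache_t *c, void *p)
{
        *(void **)p = c->free_list;
        c->free_list = p;
}

int
main(void)
{
        obj_cache_t c = { .size = 64, .free_list = NULL };
        void *a = obj_cache_alloc(&c);

        obj_cache_free(&c, a);
        void *b = obj_cache_alloc(&c); /* recycled, then zeroed */
        printf("recycled: %d\n", a == b);
        free(b);
        return (0);
}

A per-type cache mainly saves the general allocator round trip and improves locality for a hot, fixed-size structure; the cost is that zeroing becomes the caller's responsibility.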
*/ ASSERT3U(db->db.db_object, ==, DMU_META_DNODE_OBJECT); - ASSERT3U(db->db_level, ==, 0); + ASSERT0(db->db_level); ASSERT(db->db_objset->os_raw_receive); dmu_buf_will_dirty_impl(db_fake, @@ -2950,6 +2955,7 @@ dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder, dr = dbuf_find_dirty_eq(db, tx->tx_txg); ASSERT3P(dr, !=, NULL); + ASSERT3U(dr->dt.dl.dr_override_state, ==, DR_NOT_OVERRIDDEN); dr->dt.dl.dr_has_raw_params = B_TRUE; dr->dt.dl.dr_byteorder = byteorder; @@ -2964,10 +2970,14 @@ dbuf_override_impl(dmu_buf_impl_t *db, const blkptr_t *bp, dmu_tx_t *tx) struct dirty_leaf *dl; dbuf_dirty_record_t *dr; + ASSERT3U(db->db.db_object, !=, DMU_META_DNODE_OBJECT); + ASSERT0(db->db_level); + dr = list_head(&db->db_dirty_records); ASSERT3P(dr, !=, NULL); ASSERT3U(dr->dr_txg, ==, tx->tx_txg); dl = &dr->dt.dl; + ASSERT0(dl->dr_has_raw_params); dl->dr_overridden_by = *bp; dl->dr_override_state = DR_OVERRIDDEN; BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, dr->dr_txg); @@ -3040,6 +3050,7 @@ dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, ASSERT3P(dr, !=, NULL); ASSERT3U(dr->dr_txg, ==, tx->tx_txg); dl = &dr->dt.dl; + ASSERT0(dl->dr_has_raw_params); encode_embedded_bp_compressed(&dl->dr_overridden_by, data, comp, uncompressed_size, compressed_size); BPE_SET_ETYPE(&dl->dr_overridden_by, etype); @@ -5083,7 +5094,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted, zio->io_txg); - kmem_free(dr, sizeof (dbuf_dirty_record_t)); + kmem_cache_free(dbuf_dirty_kmem_cache, dr); } static void diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 3f87cfe6bee9..362415a25895 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1895,6 +1895,7 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) mutex_enter(&db->db_mtx); ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); if (zio->io_error == 0) { + ASSERT0(dr->dt.dl.dr_has_raw_params); dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE); if (dr->dt.dl.dr_nopwrite) { blkptr_t *bp = zio->io_bp; @@ -2190,6 +2191,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) return (SET_ERROR(EALREADY)); } + ASSERT0(dr->dt.dl.dr_has_raw_params); ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC; mutex_exit(&db->db_mtx); @@ -2657,6 +2659,7 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, db = (dmu_buf_impl_t *)dbuf; bp = &bps[i]; + ASSERT3U(db->db.db_object, !=, DMU_META_DNODE_OBJECT); ASSERT0(db->db_level); ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(db->db_blkid != DMU_SPILL_BLKID); @@ -2672,11 +2675,6 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, db = (dmu_buf_impl_t *)dbuf; bp = &bps[i]; - ASSERT0(db->db_level); - ASSERT(db->db_blkid != DMU_BONUS_BLKID); - ASSERT(db->db_blkid != DMU_SPILL_BLKID); - ASSERT(BP_IS_HOLE(bp) || dbuf->db_size == BP_GET_LSIZE(bp)); - dmu_buf_will_clone_or_dio(dbuf, tx); mutex_enter(&db->db_mtx); @@ -2685,6 +2683,7 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, VERIFY(dr != NULL); ASSERT3U(dr->dr_txg, ==, tx->tx_txg); dl = &dr->dt.dl; + ASSERT0(dl->dr_has_raw_params); dl->dr_overridden_by = *bp; if (!BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) != 0) { if (!BP_IS_EMBEDDED(bp)) { diff --git a/module/zfs/dmu_direct.c b/module/zfs/dmu_direct.c index ed96e7515bc7..40b78b519f49 100644 --- a/module/zfs/dmu_direct.c +++ b/module/zfs/dmu_direct.c @@ -180,6 +180,7 @@ 
dmu_write_direct(zio_t *pio, dmu_buf_impl_t *db, abd_t *data, dmu_tx_t *tx) if (list_next(&db->db_dirty_records, dr_head) != NULL) zp.zp_nopwrite = B_FALSE; + ASSERT0(dr_head->dt.dl.dr_has_raw_params); ASSERT3S(dr_head->dt.dl.dr_override_state, ==, DR_NOT_OVERRIDDEN); dr_head->dt.dl.dr_override_state = DR_IN_DMU_SYNC; diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index a174972e9b57..aeb8ff3b6688 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -180,6 +180,8 @@ struct send_range { */ dnode_phys_t *dnp; blkptr_t bp; + /* Piggyback unmodified spill block */ + struct send_range *spill_range; } object; struct srr { uint32_t datablksz; @@ -231,6 +233,8 @@ range_free(struct send_range *range) size_t size = sizeof (dnode_phys_t) * (range->sru.object.dnp->dn_extra_slots + 1); kmem_free(range->sru.object.dnp, size); + if (range->sru.object.spill_range) + range_free(range->sru.object.spill_range); } else if (range->type == DATA) { mutex_enter(&range->sru.data.lock); while (range->sru.data.io_outstanding) @@ -617,7 +621,7 @@ dump_spill(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object, drrs->drr_length = blksz; drrs->drr_toguid = dscp->dsc_toguid; - /* See comment in dump_dnode() for full details */ + /* See comment in piggyback_unmodified_spill() for full details */ if (zfs_send_unmodified_spill_blocks && (BP_GET_LOGICAL_BIRTH(bp) <= dscp->dsc_fromtxg)) { drrs->drr_flags |= DRR_SPILL_UNMODIFIED; @@ -793,35 +797,6 @@ dump_dnode(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object, (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), DMU_OBJECT_END) != 0) return (SET_ERROR(EINTR)); - /* - * Send DRR_SPILL records for unmodified spill blocks. This is useful - * because changing certain attributes of the object (e.g. blocksize) - * can cause old versions of ZFS to incorrectly remove a spill block. - * Including these records in the stream forces an up to date version - * to always be written ensuring they're never lost. Current versions - * of the code which understand the DRR_FLAG_SPILL_BLOCK feature can - * ignore these unmodified spill blocks. 
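The spill_range member added to struct send_range above gives each OBJECT record an optional child that is dumped immediately after its parent and released recursively by range_free(). A small sketch of that ownership shape, using placeholder names (record_t, record_dump()) rather than the actual send code:

#include <stdio.h>
#include <stdlib.h>

/*
 * Placeholder record with one optional piggybacked child. The child is
 * emitted right after its parent and freed when the parent is freed,
 * mirroring how the unmodified spill block rides along with its dnode.
 */
typedef struct record {
        const char      *name;
        struct record   *spill;         /* optional piggybacked child */
} record_t;

static int
record_dump(const record_t *r)
{
        printf("dump %s\n", r->name);
        if (r->spill != NULL)
                return (record_dump(r->spill)); /* child follows parent */
        return (0);
}

static void
record_free(record_t *r)
{
        if (r == NULL)
                return;
        record_free(r->spill);
        free(r);
}

int
main(void)
{
        record_t *obj = calloc(1, sizeof (*obj));
        record_t *spill = calloc(1, sizeof (*spill));

        obj->name = "object 42";
        spill->name = "spill of object 42";
        obj->spill = spill;

        (void) record_dump(obj);
        record_free(obj);
        return (0);
}

Keeping the spill attached to its object, instead of enqueueing it as a separate range, preserves the ordering invariants the send pipeline checks between consecutive ranges, as the new piggyback_unmodified_spill() comment below explains.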
- */ - if (zfs_send_unmodified_spill_blocks && - (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) && - (BP_GET_LOGICAL_BIRTH(DN_SPILL_BLKPTR(dnp)) <= dscp->dsc_fromtxg)) { - struct send_range record; - blkptr_t *bp = DN_SPILL_BLKPTR(dnp); - - memset(&record, 0, sizeof (struct send_range)); - record.type = DATA; - record.object = object; - record.eos_marker = B_FALSE; - record.start_blkid = DMU_SPILL_BLKID; - record.end_blkid = record.start_blkid + 1; - record.sru.data.bp = *bp; - record.sru.data.obj_type = dnp->dn_type; - record.sru.data.datablksz = BP_GET_LSIZE(bp); - - if (do_dump(dscp, &record) != 0) - return (SET_ERROR(EINTR)); - } - if (dscp->dsc_err != 0) return (SET_ERROR(EINTR)); @@ -911,6 +886,9 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range) case OBJECT: err = dump_dnode(dscp, &range->sru.object.bp, range->object, range->sru.object.dnp); + /* Dump piggybacked unmodified spill block */ + if (!err && range->sru.object.spill_range) + err = do_dump(dscp, range->sru.object.spill_range); return (err); case OBJECT_RANGE: { ASSERT3U(range->start_blkid + 1, ==, range->end_blkid); @@ -939,34 +917,7 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range) ASSERT3U(srdp->datablksz, ==, BP_GET_LSIZE(bp)); ASSERT3U(range->start_blkid + 1, ==, range->end_blkid); - if (BP_GET_TYPE(bp) == DMU_OT_SA) { - arc_flags_t aflags = ARC_FLAG_WAIT; - zio_flag_t zioflags = ZIO_FLAG_CANFAIL; - if (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW) { - ASSERT(BP_IS_PROTECTED(bp)); - zioflags |= ZIO_FLAG_RAW; - } - - zbookmark_phys_t zb; - ASSERT3U(range->start_blkid, ==, DMU_SPILL_BLKID); - zb.zb_objset = dmu_objset_id(dscp->dsc_os); - zb.zb_object = range->object; - zb.zb_level = 0; - zb.zb_blkid = range->start_blkid; - - arc_buf_t *abuf = NULL; - if (!dscp->dsc_dso->dso_dryrun && arc_read(NULL, spa, - bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, - zioflags, &aflags, &zb) != 0) - return (SET_ERROR(EIO)); - - err = dump_spill(dscp, bp, zb.zb_object, - (abuf == NULL ? 
NULL : abuf->b_data)); - if (abuf != NULL) - arc_buf_destroy(abuf, &abuf); - return (err); - } if (send_do_embed(bp, dscp->dsc_featureflags)) { err = dump_write_embedded(dscp, range->object, range->start_blkid * srdp->datablksz, @@ -975,8 +926,9 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range) } ASSERT(range->object > dscp->dsc_resume_object || (range->object == dscp->dsc_resume_object && + (range->start_blkid == DMU_SPILL_BLKID || range->start_blkid * srdp->datablksz >= - dscp->dsc_resume_offset)); + dscp->dsc_resume_offset))); /* it's a level-0 block of a regular object */ mutex_enter(&srdp->lock); @@ -1006,8 +958,6 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range) ASSERT(dscp->dsc_dso->dso_dryrun || srdp->abuf != NULL || srdp->abd != NULL); - uint64_t offset = range->start_blkid * srdp->datablksz; - char *data = NULL; if (srdp->abd != NULL) { data = abd_to_buf(srdp->abd); @@ -1016,6 +966,14 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range) data = srdp->abuf->b_data; } + if (BP_GET_TYPE(bp) == DMU_OT_SA) { + ASSERT3U(range->start_blkid, ==, DMU_SPILL_BLKID); + err = dump_spill(dscp, bp, range->object, data); + return (err); + } + + uint64_t offset = range->start_blkid * srdp->datablksz; + /* * If we have large blocks stored on disk but the send flags * don't allow us to send large blocks, we split the data from @@ -1098,6 +1056,8 @@ range_alloc(enum type type, uint64_t object, uint64_t start_blkid, range->sru.data.io_outstanding = 0; range->sru.data.io_err = 0; range->sru.data.io_compressed = B_FALSE; + } else if (type == OBJECT) { + range->sru.object.spill_range = NULL; } return (range); } @@ -1742,6 +1702,45 @@ enqueue_range(struct send_reader_thread_arg *srta, bqueue_t *q, dnode_t *dn, bqueue_enqueue(q, range, datablksz); } +/* + * Send DRR_SPILL records for unmodified spill blocks. This is useful + * because changing certain attributes of the object (e.g. blocksize) + * can cause old versions of ZFS to incorrectly remove a spill block. + * Including these records in the stream forces an up to date version + * to always be written ensuring they're never lost. Current versions + * of the code which understand the DRR_FLAG_SPILL_BLOCK feature can + * ignore these unmodified spill blocks. + * + * We piggyback the spill_range to dnode range instead of enqueueing it + * so send_range_after won't complain. 
+ */ +static uint64_t +piggyback_unmodified_spill(struct send_reader_thread_arg *srta, + struct send_range *range) +{ + ASSERT3U(range->type, ==, OBJECT); + + dnode_phys_t *dnp = range->sru.object.dnp; + uint64_t fromtxg = srta->smta->to_arg->fromtxg; + + if (!zfs_send_unmodified_spill_blocks || + !(dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) || + !(BP_GET_LOGICAL_BIRTH(DN_SPILL_BLKPTR(dnp)) <= fromtxg)) + return (0); + + blkptr_t *bp = DN_SPILL_BLKPTR(dnp); + struct send_range *spill_range = range_alloc(DATA, range->object, + DMU_SPILL_BLKID, DMU_SPILL_BLKID+1, B_FALSE); + spill_range->sru.data.bp = *bp; + spill_range->sru.data.obj_type = dnp->dn_type; + spill_range->sru.data.datablksz = BP_GET_LSIZE(bp); + + issue_data_read(srta, spill_range); + range->sru.object.spill_range = spill_range; + + return (BP_GET_LSIZE(bp)); +} + /* * This thread is responsible for two things: First, it retrieves the correct * blkptr in the to ds if we need to send the data because of something from @@ -1773,17 +1772,20 @@ send_reader_thread(void *arg) uint64_t last_obj_exists = B_TRUE; while (!range->eos_marker && !srta->cancel && smta->error == 0 && err == 0) { + uint64_t spill = 0; switch (range->type) { case DATA: issue_data_read(srta, range); bqueue_enqueue(outq, range, range->sru.data.datablksz); range = get_next_range_nofree(inq, range); break; - case HOLE: case OBJECT: + spill = piggyback_unmodified_spill(srta, range); + zfs_fallthrough; + case HOLE: case OBJECT_RANGE: case REDACT: // Redacted blocks must exist - bqueue_enqueue(outq, range, sizeof (*range)); + bqueue_enqueue(outq, range, sizeof (*range) + spill); range = get_next_range_nofree(inq, range); break; case PREVIOUSLY_REDACTED: { diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 3fdcebdff918..6aee7afb6954 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -1377,6 +1377,13 @@ dmu_tx_pool(dmu_tx_t *tx) return (tx->tx_pool); } +/* + * Register a callback to be executed at the end of a TXG. + * + * Note: This currently exists for outside consumers, specifically the ZFS OSD + * for Lustre. Please do not remove before checking that project. For examples + * on how to use this see `ztest_commit_callback`. + */ void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data) { diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c index f67dad002319..122d7d0d17d8 100644 --- a/module/zfs/dnode_sync.c +++ b/module/zfs/dnode_sync.c @@ -566,7 +566,7 @@ dnode_undirty_dbufs(list_t *list) mutex_destroy(&dr->dt.di.dr_mtx); list_destroy(&dr->dt.di.dr_children); } - kmem_free(dr, sizeof (dbuf_dirty_record_t)); + kmem_cache_free(dbuf_dirty_kmem_cache, dr); dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE); } } diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index bcab46c63bfa..983f444d79b0 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -2205,10 +2205,11 @@ vdev_open(vdev_t *vd) vd->vdev_max_asize = max_asize; /* - * If the vdev_ashift was not overridden at creation time, + * If the vdev_ashift was not overridden at creation time + * (0) or the override value is impossible for the device, * then set it the logical ashift and optimize the ashift. 
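On the vdev_open() comment above (the corresponding condition change follows just below): the old test only caught the "never set" case (ashift == 0), while the new comparison also raises any configured ashift that is smaller than what the device can actually address. A compact sketch of the selection rule, under the assumption that 16 matches ASHIFT_MAX; choose_ashift() is illustrative, not the module function:

#include <stdio.h>

#define EXAMPLE_ASHIFT_MAX      16      /* assumed to mirror ASHIFT_MAX */

/*
 * Any configured ashift below the device's logical ashift is impossible,
 * so it is raised; 0 ("not overridden") is just the most common such value.
 */
static int
choose_ashift(int configured, int logical)
{
        if (configured < logical)       /* includes configured == 0 */
                configured = logical;
        if (configured > EXAMPLE_ASHIFT_MAX)
                return (-1);            /* device cannot be used */
        return (configured);
}

int
main(void)
{
        printf("%d\n", choose_ashift(0, 9));    /* -> 9, defaulted */
        printf("%d\n", choose_ashift(9, 12));   /* -> 12, raised */
        printf("%d\n", choose_ashift(12, 9));   /* -> 12, kept */
        return (0);
}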
*/ - if (vd->vdev_ashift == 0) { + if (vd->vdev_ashift < vd->vdev_logical_ashift) { vd->vdev_ashift = vd->vdev_logical_ashift; if (vd->vdev_logical_ashift > ASHIFT_MAX) { diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index d5e0d2a2b35a..6c15a5c472ea 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -58,9 +58,9 @@ #include /* - * Enable the experimental block cloning feature. If this setting is 0, then - * even if feature@block_cloning is enabled, attempts to clone blocks will act - * as though the feature is disabled. + * Enables access to the block cloning feature. If this setting is 0, then even + * if feature@block_cloning is enabled, using functions and system calls that + * attempt to clone blocks will act as though the feature is disabled. */ int zfs_bclone_enabled = 1; diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 5534cd27f637..76d07a6cc9c1 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -147,6 +147,12 @@ tags = ['functional', 'largest_pool'] tests = ['longname_001_pos', 'longname_002_pos', 'longname_003_pos'] tags = ['functional', 'longname'] +[tests/functional/luks:Linux] +pre = +post = +tests = ['luks_sanity'] +tags = ['functional', 'luks'] + [tests/functional/mmap:Linux] tests = ['mmap_libaio_001_pos', 'mmap_sync_001_pos'] tags = ['functional', 'mmap'] diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index 2562836213af..07ec2c4b601b 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -213,6 +213,7 @@ maybe = { 'cli_root/zfs_unshare/zfs_unshare_006_pos': ['SKIP', na_reason], 'cli_root/zpool_add/zpool_add_004_pos': ['FAIL', known_reason], 'cli_root/zpool_destroy/zpool_destroy_001_pos': ['SKIP', 6145], + 'cli_root/zpool_import/import_devices_missing': ['FAIL', 16669], 'cli_root/zpool_import/zpool_import_missing_003_pos': ['SKIP', 6839], 'cli_root/zpool_initialize/zpool_initialize_import_export': ['FAIL', 11948], @@ -275,7 +276,8 @@ if sys.platform.startswith('freebsd'): 'pool_checkpoint/checkpoint_big_rewind': ['FAIL', 12622], 'pool_checkpoint/checkpoint_indirect': ['FAIL', 12623], 'resilver/resilver_restart_001': ['FAIL', known_reason], - 'snapshot/snapshot_002_pos': ['FAIL', '14831'], + 'snapshot/snapshot_002_pos': ['FAIL', 14831], + 'zvol/zvol_misc/zvol_misc_volmode': ['FAIL', 16668], 'bclone/bclone_crossfs_corner_cases': ['SKIP', cfr_cross_reason], 'bclone/bclone_crossfs_corner_cases_limited': ['SKIP', cfr_cross_reason], diff --git a/tests/zfs-tests/cmd/getversion.c b/tests/zfs-tests/cmd/getversion.c index 62c1c5b6abc0..1e026b92d17d 100644 --- a/tests/zfs-tests/cmd/getversion.c +++ b/tests/zfs-tests/cmd/getversion.c @@ -19,9 +19,9 @@ */ #include -#include #include #include +#include #include #include #include diff --git a/tests/zfs-tests/include/commands.cfg b/tests/zfs-tests/include/commands.cfg index be41ce5210e8..5985b5fe1526 100644 --- a/tests/zfs-tests/include/commands.cfg +++ b/tests/zfs-tests/include/commands.cfg @@ -129,6 +129,7 @@ export SYSTEM_FILES_LINUX='attr blkdiscard blockdev chattr + cryptsetup exportfs fallocate flock diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index bc767b9f624f..7d1551a63f0d 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -80,7 +80,8 @@ if BUILD_LINUX nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/simd/simd_supported.ksh \ functional/tmpfile/cleanup.ksh \ - 
functional/tmpfile/setup.ksh + functional/tmpfile/setup.ksh \ + functional/luks/luks_sanity.ksh endif nobase_dist_datadir_zfs_tests_tests_DATA += \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_dryrun_output.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_dryrun_output.ksh index fa1ce9c64d33..3b06e8c35c77 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_dryrun_output.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_dryrun_output.ksh @@ -148,9 +148,9 @@ done # Foreach test create pool, add -n devices and check output. for (( i=0; i < ${#tests[@]}; i+=1 )); do - typeset tree="${tests[$i].tree}" - typeset add="${tests[$i].add}" - typeset want="${tests[$i].want}" + tree="${tests[$i].tree}" + add="${tests[$i].add}" + want="${tests[$i].want}" log_must eval zpool create "$TESTPOOL" $tree log_must poolexists "$TESTPOOL" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_dryrun_output.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_dryrun_output.ksh index 485891c945a3..0671ea618e05 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_dryrun_output.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_dryrun_output.ksh @@ -124,8 +124,8 @@ done # Foreach test create pool, add -n devices and check output. for (( i=0; i < ${#tests[@]}; i+=1 )); do - typeset tree="${tests[$i].tree}" - typeset want="${tests[$i].want}" + tree="${tests[$i].tree}" + want="${tests[$i].want}" typeset out="$(log_must eval "zpool create -n '$TESTPOOL' $tree" | \ sed /^SUCCESS/d)" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_dryrun_output.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_dryrun_output.ksh index 410b1fe7a03e..def1d154387d 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_dryrun_output.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_dryrun_output.ksh @@ -133,9 +133,9 @@ done # Foreach test create pool, add -n devices and check output. for (( i=0; i < ${#tests[@]}; i+=1 )); do - typeset tree="${tests[$i].tree}" - typeset devs="${tests[$i].devs}" - typeset want="${tests[$i].want}" + tree="${tests[$i].tree}" + devs="${tests[$i].devs}" + want="${tests[$i].want}" log_must eval zpool create "$TESTPOOL" $tree log_must poolexists "$TESTPOOL" diff --git a/tests/zfs-tests/tests/functional/luks/luks_sanity.ksh b/tests/zfs-tests/tests/functional/luks/luks_sanity.ksh new file mode 100755 index 000000000000..9cee26503de7 --- /dev/null +++ b/tests/zfs-tests/tests/functional/luks/luks_sanity.ksh @@ -0,0 +1,90 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024 by Lawrence Livermore National Security, LLC. +# Use is subject to license terms. +# + +# DESCRIPTION: +# Verify ZFS works on a LUKS-backed pool +# +# STRATEGY: +# 1. Create a LUKS device +# 2. Make a pool with it +# 3. Write files to the pool +# 4. Verify no errors + +. $STF_SUITE/include/libtest.shlib + +verify_runnable "both" + +VDEV=$(mktemp --suffix=luks_sanity) +TESTPOOL=testpool + +function cleanup +{ + log_must zpool destroy $TESTPOOL + + log_must cryptsetup luksClose /dev/mapper/luksdev + log_must rm -f $VDEV +} + +log_assert "Verify ZFS on LUKS works" +log_onexit cleanup + +PASS="fdsjfosdijfsdkjsldfjdlk" + +# Make a small LUKS device since LUKS formatting takes time and we want to +# make this test run as quickly as possible. +truncate -s 100M $VDEV + +log_must cryptsetup luksFormat --type luks2 $VDEV <<< $PASS +log_must cryptsetup luksOpen $VDEV luksdev <<< $PASS + +log_must zpool create $TESTPOOL /dev/mapper/luksdev + +CPUS=$(get_num_cpus) + +# Use these specific size and offset ranges as they often cause errors with +# https://github.com/openzfs/zfs/issues/16631 +# and we want to try to test for that. +for SIZE in {70..100} ; do + for OFF in {70..100} ; do + for i in {1..$CPUS} ; do + dd if=/dev/urandom of=/$TESTPOOL/file$i-bs$SIZE-off$OFF \ + seek=$OFF bs=$SIZE count=1 &>/dev/null & + done + wait + done + sync_pool $TESTPOOL + rm -f /$TESTPOOL/file* +done + +# Verify no read/write/checksum errors. Don't use JSON here so that we could +# could potentially backport this test case to the 2.2.x branch. +if zpool status -e | grep -q "luksdev" ; then + log_note "$(zpool status -v)" + log_fail "Saw errors writing to LUKS device" +fi + +log_pass "Verified ZFS on LUKS works" diff --git a/tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c b/tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c index 5c6d28eb2c44..7b926da6c01c 100644 --- a/tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c +++ b/tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c @@ -30,7 +30,7 @@ /* * This tests the vdev_disk page alignment check callback - * vdev_disk_check_pages_cb(). For now, this test includes a copy of that + * vdev_disk_check_alignment_cb(). For now, this test includes a copy of that * function from module/os/linux/zfs/vdev_disk.c. If you change it here, * remember to change it there too, and add tests data here to validate the * change you're making. @@ -38,36 +38,69 @@ struct page; +/* + * This is spl_pagesize() in userspace, which requires linking libspl, but + * would also then use the platform page size, which isn't what we want for + * a test. To keep the check callback the same as the real one, we just + * redefine it. 
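The comment above explains why page_alignment.c pins PAGESIZE rather than linking libspl and inheriting the host value (the #define itself follows just below): the test fixtures encode 4K-page layouts, so using the build host's page size would change which cases are legal. A short illustration of the difference, assuming only POSIX sysconf():

#include <stdio.h>
#include <unistd.h>

/*
 * Show what the host would have supplied versus the value the test pins.
 * On a 16K-page arm64 or ppc64 builder these differ, which is exactly why
 * the test does not use the platform page size.
 */
int
main(void)
{
        long host = sysconf(_SC_PAGESIZE);
        const long pinned = 4096;       /* value the test hardcodes */

        printf("host page size:   %ld\n", host);
        printf("pinned page size: %ld\n", pinned);
        return (0);
}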
+ */ +#undef PAGESIZE +#define PAGESIZE (4096) + typedef struct { - uint32_t bmask; - uint32_t npages; - uint32_t end; -} vdev_disk_check_pages_t; + size_t blocksize; + int seen_first; + int seen_last; +} vdev_disk_check_alignment_t; static int -vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv) +vdev_disk_check_alignment_cb(struct page *page, size_t off, size_t len, + void *priv) { (void) page; - vdev_disk_check_pages_t *s = priv; + vdev_disk_check_alignment_t *s = priv; /* - * If we didn't finish on a block size boundary last time, then there - * would be a gap if we tried to use this ABD as-is, so abort. + * The cardinal rule: a single on-disk block must never cross an + * physical (order-0) page boundary, as the kernel expects to be able + * to split at both LBS and page boundaries. + * + * This implies various alignment rules for the blocks in this + * (possibly compound) page, which we can check for. */ - if (s->end != 0) - return (1); /* - * Note if we're taking less than a full block, so we can check it - * above on the next call. + * If the previous page did not end on a page boundary, then we + * can't proceed without creating a hole. */ - s->end = (off+len) & s->bmask; + if (s->seen_last) + return (1); - /* All blocks after the first must start on a block size boundary. */ - if (s->npages != 0 && (off & s->bmask) != 0) + /* This page must contain only whole LBS-sized blocks. */ + if (!IS_P2ALIGNED(len, s->blocksize)) return (1); - s->npages++; + /* + * If this is not the first page in the ABD, then the data must start + * on a page-aligned boundary (so the kernel can split on page + * boundaries without having to deal with a hole). If it is, then + * it can start on LBS-alignment. + */ + if (s->seen_first) { + if (!IS_P2ALIGNED(off, PAGESIZE)) + return (1); + } else { + if (!IS_P2ALIGNED(off, s->blocksize)) + return (1); + s->seen_first = 1; + } + + /* + * If this data does not end on a page-aligned boundary, then this + * must be the last page in the ABD, for the same reason. 
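The rewritten vdev_disk_check_alignment_cb() above expresses its rules through IS_P2ALIGNED() against either the block size or the page size (its closing lines continue just below). A standalone refresher on the predicate, using a userspace stand-in for the macro rather than the SPL header:

#include <stdio.h>
#include <stdint.h>

/* Stand-in for IS_P2ALIGNED(): true when x is a multiple of align,
 * where align is a power of two. */
#define IS_P2ALIGNED(x, align)  (((uintptr_t)(x) & ((align) - 1)) == 0)

int
main(void)
{
        /* A 4K block starting 2K into a 4K page straddles the boundary. */
        size_t off = 0x0800, len = 0x1000, pagesize = 0x1000;

        printf("starts page-aligned: %d\n", IS_P2ALIGNED(off, pagesize));
        printf("ends page-aligned:   %d\n", IS_P2ALIGNED(off + len, pagesize));
        printf("whole 4K blocks:     %d\n", IS_P2ALIGNED(len, 4096));
        return (0);
}

This is the "4K across two pages, 2K start offset" layout that the patch moves from valid_tests[] to invalid_tests[]: the length is block-aligned, but the block itself crosses a physical page boundary.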
+ */ + s->seen_last = !IS_P2ALIGNED(off+len, PAGESIZE); + return (0); } @@ -75,8 +108,8 @@ typedef struct { /* test name */ const char *name; - /* blocks size mask */ - uint32_t mask; + /* stored block size */ + uint32_t blocksize; /* amount of data to take */ size_t size; @@ -89,39 +122,39 @@ static const page_test_t valid_tests[] = { /* 512B block tests */ { "512B blocks, 4K single page", - 0x1ff, 0x1000, { + 512, 0x1000, { { 0x0, 0x1000 }, }, }, { "512B blocks, 1K at start of page", - 0x1ff, 0x400, { + 512, 0x400, { { 0x0, 0x1000 }, }, }, { "512B blocks, 1K at end of page", - 0x1ff, 0x400, { + 512, 0x400, { { 0x0c00, 0x0400 }, }, }, { "512B blocks, 1K within page, 512B start offset", - 0x1ff, 0x400, { + 512, 0x400, { { 0x0200, 0x0e00 }, }, }, { "512B blocks, 8K across 2x4K pages", - 0x1ff, 0x2000, { + 512, 0x2000, { { 0x0, 0x1000 }, { 0x0, 0x1000 }, }, }, { "512B blocks, 4K across two pages, 2K start offset", - 0x1ff, 0x1000, { + 512, 0x1000, { { 0x0800, 0x0800 }, { 0x0, 0x0800 }, }, }, { "512B blocks, 16K across 5x4K pages, 512B start offset", - 0x1ff, 0x4000, { + 512, 0x4000, { { 0x0200, 0x0e00 }, { 0x0, 0x1000 }, { 0x0, 0x1000 }, @@ -130,7 +163,7 @@ static const page_test_t valid_tests[] = { }, }, { "512B blocks, 64K data, 8x8K compound pages", - 0x1ff, 0x10000, { + 512, 0x10000, { { 0x0, 0x2000 }, { 0x0, 0x2000 }, { 0x0, 0x2000 }, @@ -142,7 +175,7 @@ static const page_test_t valid_tests[] = { }, }, { "512B blocks, 64K data, 9x8K compound pages, 512B start offset", - 0x1ff, 0x10000, { + 512, 0x10000, { { 0x0200, 0x1e00 }, { 0x0, 0x2000 }, { 0x0, 0x2000 }, @@ -155,7 +188,7 @@ static const page_test_t valid_tests[] = { }, }, { "512B blocks, 64K data, 2x16K compound pages, 8x4K pages", - 0x1ff, 0x10000, { + 512, 0x10000, { { 0x0, 0x8000 }, { 0x0, 0x8000 }, { 0x0, 0x1000 }, @@ -169,7 +202,7 @@ static const page_test_t valid_tests[] = { }, }, { "512B blocks, 64K data, mixed 4K/8K/16K pages", - 0x1ff, 0x10000, { + 512, 0x10000, { { 0x0, 0x1000 }, { 0x0, 0x2000 }, { 0x0, 0x1000 }, @@ -183,7 +216,7 @@ static const page_test_t valid_tests[] = { }, }, { "512B blocks, 64K data, mixed 4K/8K/16K pages, 1K start offset", - 0x1ff, 0x10000, { + 512, 0x10000, { { 0x0400, 0x0c00 }, { 0x0, 0x1000 }, { 0x0, 0x1000 }, @@ -200,48 +233,18 @@ static const page_test_t valid_tests[] = { /* 4K block tests */ { "4K blocks, 4K single page", - 0xfff, 0x1000, { - { 0x0, 0x1000 }, - }, - }, { - "4K blocks, 1K at start of page", - 0xfff, 0x400, { + 4096, 0x1000, { { 0x0, 0x1000 }, }, - }, { - "4K blocks, 1K at end of page", - 0xfff, 0x400, { - { 0x0c00, 0x0400 }, - }, - }, { - "4K blocks, 1K within page, 512B start offset", - 0xfff, 0x400, { - { 0x0200, 0x0e00 }, - }, }, { "4K blocks, 8K across 2x4K pages", - 0xfff, 0x2000, { + 4096, 0x2000, { { 0x0, 0x1000 }, { 0x0, 0x1000 }, }, - }, { - "4K blocks, 4K across two pages, 2K start offset", - 0xfff, 0x1000, { - { 0x0800, 0x0800 }, - { 0x0, 0x0800 }, - }, - }, { - "4K blocks, 16K across 5x4K pages, 512B start offset", - 0xfff, 0x4000, { - { 0x0200, 0x0e00 }, - { 0x0, 0x1000 }, - { 0x0, 0x1000 }, - { 0x0, 0x1000 }, - { 0x0, 0x0200 }, - }, }, { "4K blocks, 64K data, 8x8K compound pages", - 0xfff, 0x10000, { + 4096, 0x10000, { { 0x0, 0x2000 }, { 0x0, 0x2000 }, { 0x0, 0x2000 }, @@ -251,22 +254,9 @@ static const page_test_t valid_tests[] = { { 0x0, 0x2000 }, { 0x0, 0x2000 }, }, - }, { - "4K blocks, 64K data, 9x8K compound pages, 512B start offset", - 0xfff, 0x10000, { - { 0x0200, 0x1e00 }, - { 0x0, 0x2000 }, - { 0x0, 0x2000 }, - { 0x0, 0x2000 }, - { 0x0, 0x2000 }, - { 0x0, 
0x2000 }, - { 0x0, 0x2000 }, - { 0x0, 0x2000 }, - { 0x0, 0x0200 }, - }, }, { "4K blocks, 64K data, 2x16K compound pages, 8x4K pages", - 0xfff, 0x10000, { + 4096, 0x10000, { { 0x0, 0x8000 }, { 0x0, 0x8000 }, { 0x0, 0x1000 }, @@ -280,7 +270,7 @@ static const page_test_t valid_tests[] = { }, }, { "4K blocks, 64K data, mixed 4K/8K/16K pages", - 0xfff, 0x10000, { + 4096, 0x10000, { { 0x0, 0x1000 }, { 0x0, 0x2000 }, { 0x0, 0x1000 }, @@ -292,29 +282,19 @@ static const page_test_t valid_tests[] = { { 0x0, 0x1000 }, { 0x0, 0x2000 }, }, - }, { - "4K blocks, 64K data, mixed 4K/8K/16K pages, 1K start offset", - 0xfff, 0x10000, { - { 0x0400, 0x0c00 }, - { 0x0, 0x1000 }, - { 0x0, 0x1000 }, - { 0x0, 0x1000 }, - { 0x0, 0x2000 }, - { 0x0, 0x2000 }, - { 0x0, 0x1000 }, - { 0x0, 0x8000 }, - { 0x0, 0x1000 }, - { 0x0, 0x0400 }, - }, }, { 0 }, }; static const page_test_t invalid_tests[] = { + /* + * Gang tests. Composed of lots of smaller allocations, rarely properly + * aligned. + */ { "512B blocks, 16K data, 512 leader (gang block simulation)", - 0x1ff, 0x8000, { + 512, 0x8000, { { 0x0, 0x0200 }, { 0x0, 0x1000 }, { 0x0, 0x1000 }, @@ -324,7 +304,7 @@ static const page_test_t invalid_tests[] = { }, { "4K blocks, 32K data, 2 incompatible spans " "(gang abd simulation)", - 0xfff, 0x8000, { + 4096, 0x8000, { { 0x0800, 0x0800 }, { 0x0, 0x1000 }, { 0x0, 0x1000 }, @@ -337,6 +317,90 @@ static const page_test_t invalid_tests[] = { { 0x0, 0x0800 }, }, }, + + /* + * Blocks must not span multiple physical pages. These tests used to + * be considered valid, but were since found to be invalid and were + * moved here. + */ + { + "4K blocks, 4K across two pages, 2K start offset", + 4096, 0x1000, { + { 0x0800, 0x0800 }, + { 0x0, 0x0800 }, + }, + }, { + "4K blocks, 16K across 5x4K pages, 512B start offset", + 4096, 0x4000, { + { 0x0200, 0x0e00 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x0200 }, + }, + }, { + "4K blocks, 64K data, 9x8K compound pages, 512B start offset", + 4096, 0x10000, { + { 0x0200, 0x1e00 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x0200 }, + }, + }, { + "4K blocks, 64K data, mixed 4K/8K/16K pages, 1K start offset", + 4096, 0x10000, { + { 0x0400, 0x0c00 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x1000 }, + { 0x0, 0x8000 }, + { 0x0, 0x1000 }, + { 0x0, 0x0400 }, + }, + }, + + /* + * This is the very typical case of a 4K block being allocated from + * the middle of a mixed-used slab backed by a higher-order compound + * page. + */ + { + "4K blocks, 4K data from compound slab, 2K-align offset", + 4096, 0x1000, { + { 0x1800, 0x6800 } + } + }, + + /* + * Blocks smaller than LBS should never be possible, but used to be by + * accident (see GH#16990). We test for and reject them just to be + * sure. 
+ */ + { + "4K blocks, 1K at end of page", + 4096, 0x400, { + { 0x0c00, 0x0400 }, + }, + }, { + "4K blocks, 1K at start of page", + 4096, 0x400, { + { 0x0, 0x1000 }, + }, + }, { + "4K blocks, 1K within page, 512B start offset", + 4096, 0x400, { + { 0x0200, 0x0e00 }, + }, + }, + { 0 }, }; @@ -345,10 +409,8 @@ run_test(const page_test_t *test, bool verbose) { size_t rem = test->size; - vdev_disk_check_pages_t s = { - .bmask = 0xfff, - .npages = 0, - .end = 0, + vdev_disk_check_alignment_t s = { + .blocksize = test->blocksize, }; for (int i = 0; test->pages[i][1] > 0; i++) { @@ -362,7 +424,7 @@ run_test(const page_test_t *test, bool verbose) "rem %lx, take %lx\n", i, off, len, rem, take); - if (vdev_disk_check_pages_cb(NULL, off, take, &s)) { + if (vdev_disk_check_alignment_cb(NULL, off, take, &s)) { if (verbose) printf(" ABORT: misalignment detected, " "rem %lx\n", rem); @@ -389,7 +451,7 @@ run_test_set(const page_test_t *tests, bool want, int *ntests, int *npassed) for (const page_test_t *test = &tests[0]; test->name; test++) { bool pass = (run_test(test, false) == want); if (pass) { - printf("%s: PASS\n", test->name); + printf("%c %s: PASS\n", want ? '+' : '-', test->name); (*npassed)++; } else { printf("%s: FAIL [expected %s, got %s]\n", test->name,
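Finally, the harness change above adds a '+'/'-' marker so PASS lines show whether a case came from the valid or the invalid table. The general shape of such a table-driven runner, reduced to placeholders (check() stands in for the real alignment callback, and the tables are toy data):

#include <stdio.h>
#include <stdbool.h>

typedef struct test_case {
        const char      *name;
        int             input;
} test_case_t;

static bool
check(int input)
{
        return (input % 2 == 0);        /* stand-in for the real predicate */
}

/* Run every case in a NULL-terminated table against the expected outcome. */
static int
run_set(const test_case_t *set, bool want, int *npassed)
{
        int n = 0;

        for (const test_case_t *t = set; t->name != NULL; t++, n++) {
                bool pass = (check(t->input) == want);
                printf("%c %s: %s\n", want ? '+' : '-', t->name,
                    pass ? "PASS" : "FAIL");
                if (pass)
                        (*npassed)++;
        }
        return (n);
}

int
main(void)
{
        const test_case_t valid[] = { { "even", 2 }, { NULL, 0 } };
        const test_case_t invalid[] = { { "odd", 3 }, { NULL, 0 } };
        int passed = 0, total = 0;

        total += run_set(valid, true, &passed);
        total += run_set(invalid, false, &passed);
        printf("%d/%d passed\n", passed, total);
        return (passed == total ? 0 : 1);
}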