diff --git a/include/sys/zfs_vnops.h b/include/sys/zfs_vnops.h index e60b99bed192..d4cd462cc0a1 100644 --- a/include/sys/zfs_vnops.h +++ b/include/sys/zfs_vnops.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2025, Rob Norris */ #ifndef _SYS_FS_ZFS_VNOPS_H @@ -42,6 +43,8 @@ extern int zfs_clone_range_replay(znode_t *, uint64_t, uint64_t, uint64_t, extern int zfs_getsecattr(znode_t *, vsecattr_t *, int, cred_t *); extern int zfs_setsecattr(znode_t *, vsecattr_t *, int, cred_t *); +extern int zfs_get_direct_alignment(znode_t *, uint64_t *); + extern int mappedread(znode_t *, int, zfs_uio_t *); extern int mappedread_sf(znode_t *, int, zfs_uio_t *); extern void update_pages(znode_t *, int64_t, int, objset_t *); diff --git a/module/os/linux/zfs/zpl_inode.c b/module/os/linux/zfs/zpl_inode.c index c4b5087ca5e7..981e26b26725 100644 --- a/module/os/linux/zfs/zpl_inode.c +++ b/module/os/linux/zfs/zpl_inode.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2011, Lawrence Livermore National Security, LLC. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. + * Copyright (c) 2025, Rob Norris */ @@ -30,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -490,6 +492,17 @@ zpl_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask, } #endif +#ifdef STATX_DIOALIGN + if (request_mask & STATX_DIOALIGN) { + uint64_t align; + if (zfs_get_direct_alignment(zp, &align) == 0) { + stat->dio_mem_align = PAGE_SIZE; + stat->dio_offset_align = align; + stat->result_mask |= STATX_DIOALIGN; + } + } +#endif + #ifdef STATX_ATTR_IMMUTABLE if (zp->z_pflags & ZFS_IMMUTABLE) stat->attributes |= STATX_ATTR_IMMUTABLE; diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index b789d1ed5239..61e01592c5c2 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -25,6 +25,7 @@ * Copyright (c) 2015 by Chunwei Chen. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. 
* Copyright (c) 2021, 2022 by Pawel Jakub Dawidek
+ * Copyright (c) 2025, Rob Norris
  */

 /* Portions Copyright 2007 Jeremy Teo */
@@ -1083,6 +1084,24 @@ zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr)
 	return (error);
 }

+/*
+ * Report the offset alignment at which direct IO on this file incurs no RMW
+ * penalty on write: the file's block size, but never less than PAGE_SIZE.
+ * Fails with EOPNOTSUPP if zfs_dio_enabled is off or the objset has
+ * os_direct set to ZFS_DIRECT_DISABLED; *alignp is untouched in that case.
+ */
+int
+zfs_get_direct_alignment(znode_t *zp, uint64_t *alignp)
+{
+	zfsvfs_t *zsb = ZTOZSB(zp);
+
+	if (zfs_dio_enabled && zsb->z_os->os_direct != ZFS_DIRECT_DISABLED) {
+		*alignp = MAX(zp->z_blksz, PAGE_SIZE);
+		return (0);
+	}
+	return (SET_ERROR(EOPNOTSUPP));
+}
+
 #ifdef ZFS_DEBUG
 static int zil_fault_io = 0;
 #endif
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index e2edfc9ebbb5..6bf6ede932f0 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -975,7 +975,7 @@ tests = ['sparse_001_pos']
 tags = ['functional', 'sparse']

 [tests/functional/stat]
-tests = ['stat_001_pos']
+tests = ['stat_001_pos', 'statx_dioalign']
 tags = ['functional', 'stat']

 [tests/functional/suid]
diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in
index 0bfc64959c38..2e8133906bbc 100755
--- a/tests/test-runner/bin/zts-report.py.in
+++ b/tests/test-runner/bin/zts-report.py.in
@@ -128,6 +128,11 @@ idmap_reason = 'Idmapped mount needs kernel 5.12+'
 #
 cfr_reason = 'Kernel copy_file_range support required'

+#
+# Some statx fields are not supported by all kernels
+#
+statx_reason = 'Needed statx(2) field not supported on this kernel'
+
 if sys.platform.startswith('freebsd'):
     cfr_cross_reason = 'copy_file_range(2) cross-filesystem needs FreeBSD 14+'
 else:
@@ -293,7 +298,8 @@ if sys.platform.startswith('freebsd'):
         'block_cloning/block_cloning_cross_enc_dataset':
             ['SKIP', cfr_cross_reason],
         'block_cloning/block_cloning_copyfilerange_cross_dataset':
-            ['SKIP', cfr_cross_reason]
+            ['SKIP',
cfr_cross_reason],
+        'stat/statx_dioalign': ['SKIP', na_reason],
     })
 elif sys.platform.startswith('linux'):
     maybe.update({
@@ -361,6 +367,7 @@ elif sys.platform.startswith('linux'):
         'mmp/mmp_active_import': ['FAIL', known_reason],
         'mmp/mmp_exported_import': ['FAIL', known_reason],
         'mmp/mmp_inactive_import': ['FAIL', known_reason],
+        'stat/statx_dioalign': ['SKIP', statx_reason],
     })


diff --git a/tests/zfs-tests/cmd/.gitignore b/tests/zfs-tests/cmd/.gitignore
index e9e3b8f73e42..219e64489fd2 100644
--- a/tests/zfs-tests/cmd/.gitignore
+++ b/tests/zfs-tests/cmd/.gitignore
@@ -35,6 +35,7 @@
 /rename_dir
 /rm_lnkcnt_zero_file
 /send_doall
+/statx
 /stride_dd
 /threadsappend
 /user_ns_exec
@@ -53,3 +54,4 @@
 /skein_test
 /sha2_test
 /idmap_util
+/statx
diff --git a/tests/zfs-tests/cmd/Makefile.am b/tests/zfs-tests/cmd/Makefile.am
index 5250e72f9fa8..1a0dc40c03c9 100644
--- a/tests/zfs-tests/cmd/Makefile.am
+++ b/tests/zfs-tests/cmd/Makefile.am
@@ -121,6 +121,7 @@ if BUILD_LINUX
 scripts_zfs_tests_bin_PROGRAMS += %D%/getversion
 scripts_zfs_tests_bin_PROGRAMS += %D%/user_ns_exec
 scripts_zfs_tests_bin_PROGRAMS += %D%/renameat2
+scripts_zfs_tests_bin_PROGRAMS += %D%/statx
 scripts_zfs_tests_bin_PROGRAMS += %D%/xattrtest
 scripts_zfs_tests_bin_PROGRAMS += %D%/zed_fd_spill-zedlet
 scripts_zfs_tests_bin_PROGRAMS += %D%/idmap_util
diff --git a/tests/zfs-tests/cmd/statx.c b/tests/zfs-tests/cmd/statx.c
new file mode 100644
index 000000000000..89939f6efb40
--- /dev/null
+++ b/tests/zfs-tests/cmd/statx.c
@@ -0,0 +1,304 @@
+/*
+ * SPDX-License-Identifier: MIT
+ *
+ * Copyright (c) 2025, Rob Norris
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ *
furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include + +/* + * statx() may be available in the kernel, but not in the libc, so we build + * our own wrapper if we can't link one. + */ + +#ifndef __NR_statx +#if defined(__x86_64__) +#define __NR_statx (332) +#elif defined(__i386__) +#define __NR_statx (383) +#elif defined(__s390__) +#define __NR_statx (379) +#elif defined(__arm__) +#define __NR_statx (397) +#elif defined(__aarch64__) +#define __NR_statx (291) +#elif defined(__powerpc__) +#define __NR_statx (383) +#else +#error "no definition of __NR_statx for this platform" +#endif +#endif /* __NR_statx */ + + +int +statx(int, const char *, int, unsigned int, void *) + __attribute__((weak)); + +static inline int +_statx(int fd, const char *path, int flags, unsigned int mask, void *stx) +{ + if (statx) + return (statx(fd, path, flags, mask, stx)); + else + return (syscall(__NR_statx, fd, path, flags, mask, stx)); +} + +#ifndef STATX_TYPE +#define STATX_TYPE (1<<0) +#endif +#ifndef STATX_MODE +#define STATX_MODE (1<<1) +#endif +#ifndef STATX_NLINK +#define STATX_NLINK (1<<2) +#endif +#ifndef STATX_UID +#define STATX_UID (1<<3) +#endif +#ifndef STATX_GID +#define STATX_GID (1<<4) +#endif +#ifndef STATX_ATIME +#define STATX_ATIME (1<<5) +#endif +#ifndef STATX_MTIME 
+#define	STATX_MTIME	(1<<6)
+#endif
+#ifndef STATX_CTIME
+#define	STATX_CTIME	(1<<7)
+#endif
+#ifndef STATX_INO
+#define	STATX_INO	(1<<8)
+#endif
+#ifndef STATX_SIZE
+#define	STATX_SIZE	(1<<9)
+#endif
+#ifndef STATX_BLOCKS
+#define	STATX_BLOCKS	(1<<10)
+#endif
+#ifndef STATX_BTIME
+#define	STATX_BTIME	(1<<11)
+#endif
+#ifndef STATX_MNT_ID
+#define	STATX_MNT_ID	(1<<12)
+#endif
+#ifndef STATX_DIOALIGN
+#define	STATX_DIOALIGN	(1<<13)
+#endif
+
+/*
+ * Locally declared layout of the buffer filled in by statx(), so the program
+ * does not depend on the libc headers providing it. The static asserts pin
+ * the timestamp to 16 bytes and the whole buffer to 256 bytes.
+ */
+typedef struct {
+	int64_t tv_sec;
+	uint32_t tv_nsec;
+	int32_t _pad;
+} stx_timestamp_t;
+_Static_assert(sizeof (stx_timestamp_t) == 0x10,
+    "stx_timestamp_t not 16 bytes");
+
+typedef struct {
+	uint32_t stx_mask;
+	uint32_t stx_blksize;
+	uint64_t stx_attributes;
+	uint32_t stx_nlink;
+	uint32_t stx_uid;
+	uint32_t stx_gid;
+	uint16_t stx_mode;
+	uint16_t _pad1;
+	uint64_t stx_ino;
+	uint64_t stx_size;
+	uint64_t stx_blocks;
+	uint64_t stx_attributes_mask;
+	stx_timestamp_t stx_atime;
+	stx_timestamp_t stx_btime;
+	stx_timestamp_t stx_ctime;
+	stx_timestamp_t stx_mtime;
+	uint32_t stx_rdev_major;
+	uint32_t stx_rdev_minor;
+	uint32_t stx_dev_major;
+	uint32_t stx_dev_minor;
+	uint64_t stx_mnt_id;
+	uint32_t stx_dio_mem_align;
+	uint32_t stx_dio_offset_align;
+	uint64_t _pad2[12];
+} stx_t;
+_Static_assert(sizeof (stx_t) == 0x100, "stx_t not 256 bytes");
+
+typedef struct {
+	const char *name;
+	unsigned int mask;
+} stx_field_t;
+
+/*
+ * Field names accepted on the command line, each with the STATX_* bit it
+ * requests. The table is terminated by an entry whose name is NULL.
+ */
+static const stx_field_t fields[] = {
+	{ "type", STATX_TYPE },
+	{ "mode", STATX_MODE },
+	{ "nlink", STATX_NLINK },
+	{ "uid", STATX_UID },
+	{ "gid", STATX_GID },
+	{ "atime", STATX_ATIME },
+	{ "mtime", STATX_MTIME },
+	{ "ctime", STATX_CTIME },
+	{ "ino", STATX_INO },
+	{ "size", STATX_SIZE },
+	{ "blocks", STATX_BLOCKS },
+	{ "btime", STATX_BTIME },
+	{ "mnt_id", STATX_MNT_ID },
+	{ "dioalign", STATX_DIOALIGN },
+	{ NULL, 0 },
+};
+
+/*
+ * Print the usage line and the list of known field names to stdout, starting
+ * a new output line once the current one would pass 60 columns. Always
+ * returns 1 so that callers can use the result directly as the exit status.
+ */
+static int
+usage(void)
+{
+	printf(
+	    "usage: statx <field>[,<field>,...] <file>\n"
+	    "available fields:\n");
+
+	int w = 0;
+	for (const stx_field_t *f = fields; f->name != NULL; f++) {
+		if (w > 0 && (w + strlen(f->name) + 1) > 60) {
+			fputc('\n', stdout);
+			w = 0;
+		}
+		if (w == 0)
+			fputc(' ', stdout);
+		w += printf(" %s", f->name);
+	}
+	if (w > 0)
+		fputc('\n', stdout);
+	return (1);
+}
+
+/*
+ * Print "name: value" for a 64-bit field. The value is widened to unsigned
+ * long long so the conversion is correct where long is only 32 bits.
+ */
+static void
+print_u64(const char *name, uint64_t v)
+{
+	printf("%s: %llu\n", name, (unsigned long long)v);
+}
+
+/*
+ * Print "name: sec.nsec" for a timestamp. Nanoseconds are zero-padded to nine
+ * digits so the value reads as a decimal fraction of a second.
+ */
+static void
+print_timestamp(const char *name, const stx_timestamp_t *ts)
+{
+	printf("%s: %lld.%09u\n", name, (long long)ts->tv_sec, ts->tv_nsec);
+}
+
+/*
+ * statx <field>[,<field>,...] <file>
+ *
+ * Requests the named fields for <file> through the _statx() wrapper and
+ * prints one "name: value" line per requested field. Exit status is 0 on
+ * success, 1 on a usage, open or statx error, and 2 if the kernel did not
+ * return one or more requested fields (in which case no values are printed).
+ */
+int
+main(int argc, char **argv)
+{
+	if (argc < 3)
+		return (usage());
+
+	/* Translate the comma-separated field names into a request mask. */
+	unsigned int mask = 0;
+	char *name;
+	while ((name = strsep(&argv[1], ",")) != NULL) {
+		const stx_field_t *f;
+		for (f = fields; f->name != NULL; f++) {
+			if (strcmp(name, f->name) == 0) {
+				mask |= f->mask;
+				break;
+			}
+		}
+		if (f->name == NULL) {
+			fprintf(stderr, "unknown field name: %s\n", name);
+			return (usage());
+		}
+	}
+
+	int fd = open(argv[2], O_PATH);
+	if (fd < 0) {
+		fprintf(stderr, "open: %s: %s\n", argv[2], strerror(errno));
+		return (1);
+	}
+
+	stx_t stx = {0};
+
+	if (_statx(fd, "",
+	    AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW, mask, &stx) < 0) {
+		fprintf(stderr, "statx: %s: %s\n", argv[2], strerror(errno));
+		close(fd);
+		return (1);
+	}
+
+	/* Everything needed is now in stx; don't hold the descriptor open. */
+	close(fd);
+
+	/*
+	 * Check that every requested field came back before printing any
+	 * value, so a partial result never produces partial output.
+	 */
+	int rc = 0;
+
+	for (const stx_field_t *f = fields; f->name != NULL; f++) {
+		if ((mask & f->mask) && !(stx.stx_mask & f->mask)) {
+			printf("statx: kernel did not return field: %s\n",
+			    f->name);
+			rc = 2;
+		}
+	}
+
+	if (rc > 0)
+		return (rc);
+
+	/* Each table name doubles as the label printed for that field. */
+	for (const stx_field_t *f = fields; f->name != NULL; f++) {
+		if (!(mask & f->mask))
+			continue;
+
+		switch (f->mask) {
+		case STATX_TYPE:
+			printf("type: %u\n", stx.stx_mode & S_IFMT);
+			break;
+		case STATX_MODE:
+			printf("mode: %u\n", stx.stx_mode & ~S_IFMT);
+			break;
+		case STATX_NLINK:
+			printf("nlink: %u\n", stx.stx_nlink);
+			break;
+		case STATX_UID:
+			printf("uid: %u\n", stx.stx_uid);
+			break;
+		case STATX_GID:
+			printf("gid: %u\n", stx.stx_gid);
+			break;
+		case STATX_ATIME:
+			print_timestamp(f->name, &stx.stx_atime);
+			break;
+		case STATX_MTIME:
+			print_timestamp(f->name, &stx.stx_mtime);
+			break;
+		case STATX_CTIME:
+			print_timestamp(f->name, &stx.stx_ctime);
+			break;
+		case STATX_INO:
+			print_u64(f->name, stx.stx_ino);
+			break;
+		case STATX_SIZE:
+			print_u64(f->name, stx.stx_size);
+			break;
+		case STATX_BLOCKS:
+			print_u64(f->name, stx.stx_blocks);
+			break;
+		case STATX_BTIME:
+			print_timestamp(f->name, &stx.stx_btime);
+			break;
+		case STATX_MNT_ID:
+			print_u64(f->name, stx.stx_mnt_id);
+			break;
+		case STATX_DIOALIGN:
+			printf("dioalign: %u %u\n",
+			    stx.stx_dio_mem_align, stx.stx_dio_offset_align);
+			break;
+		default:
+			break;
+		}
+	}
+
+	return (0);
+}
diff --git a/tests/zfs-tests/include/commands.cfg b/tests/zfs-tests/include/commands.cfg
index 5985b5fe1526..aa13f53b96fd 100644
--- a/tests/zfs-tests/include/commands.cfg
+++ b/tests/zfs-tests/include/commands.cfg
@@ -217,6 +217,7 @@ export ZFSTEST_FILES='badsend
     rename_dir
     rm_lnkcnt_zero_file
     send_doall
+    statx
     threadsappend
     user_ns_exec
     write_dos_attributes
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index d0eb4c30db48..95b84ce1482c 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -2051,6 +2051,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/stat/cleanup.ksh \
 	functional/stat/setup.ksh \
 	functional/stat/stat_001_pos.ksh \
+	functional/stat/statx_dioalign.ksh \
 	functional/suid/cleanup.ksh \
 	functional/suid/setup.ksh \
 	functional/suid/suid_write_to_none.ksh \
diff --git a/tests/zfs-tests/tests/functional/stat/statx_dioalign.ksh b/tests/zfs-tests/tests/functional/stat/statx_dioalign.ksh
new file mode 100755
index 000000000000..0a3e5c227411
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/stat/statx_dioalign.ksh
@@ -0,0 +1,136 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Rob Norris +# + +# +# Uses the statx helper to test the results of the STATX_DIOALIGN request as we +# manipulate DIO enable, dataset recordsize and file size and structure. +# + +. $STF_SUITE/include/libtest.shlib + +verify_runnable "both" + +if ! is_linux ; then + log_unsupported "statx(2) only available on Linux" +fi + +if [[ $(linux_version) -lt $(linux_version "6.1") ]] ; then + log_unsupported "STATX_DIOALIGN not available before Linux 6.1" +fi + +CLAIM="STATX_DIOALIGN returns useful values when Direct IO is available." 
+
+TESTDS=${TESTPOOL}/${TESTFS}
+TESTFILE=${TESTDIR}/${TESTFILE0}
+
+log_must save_tunable DIO_ENABLED
+typeset recordsize_saved=$(get_prop recordsize $TESTDS)
+typeset direct_saved=$(get_prop direct $TESTDS)
+
+function cleanup
+{
+	rm -f ${TESTFILE}
+	zfs set recordsize=$recordsize_saved $TESTDS
+	zfs set direct=$direct_saved $TESTDS
+	restore_tunable DIO_ENABLED
+}
+log_onexit cleanup
+
+# assert_dioalign <file> <memalign> <ioalign>
+function assert_dioalign
+{
+	typeset file=$1
+	typeset -i memalign=$2
+	typeset -i ioalign=$3
+
+	typeset -a v=($(statx dioalign $file | cut -f2- -d' '))
+	log_note "statx dioalign returned: $file: mem=${v[0]} io=${v[1]}"
+	log_must [ ${v[0]} -eq $memalign -a ${v[1]} -eq $ioalign ]
+}
+
+# assert_dioalign_failed <file>
+function assert_dioalign_failed
+{
+	typeset file=$1
+	log_mustnot statx dioalign $file
+}
+
+log_assert $CLAIM
+
+# Compute the expected IO size. Testing this properly means changing the
+# ashift, which means recreating the pool, which is slightly fiddly here, so
+# I have not bothered.
+typeset -i PAGE_SIZE=$(getconf PAGE_SIZE)
+
+# Set recordsize and make a 64K (half-recordsize) file for the general tests.
+log_must zfs set recordsize=128K $TESTDS
+log_must dd if=/dev/urandom of=$TESTFILE bs=64k count=1
+log_must zpool sync
+
+# when DIO is disabled via tunable, statx will not return the dioalign result,
+# and the program fails
+log_must set_tunable32 DIO_ENABLED 0
+
+for d in disabled standard always ; do
+	log_must zfs set direct=$d $TESTDS
+	assert_dioalign_failed $TESTFILE
+done
+
+# when DIO is enabled via tunable, behaviour is dependent on the direct=
+# property.
+log_must set_tunable32 DIO_ENABLED 1 + +# when DIO is disabled via property, statx fails +log_must zfs set direct=disabled $TESTDS +assert_dioalign_failed $TESTFILE + +# when DIO is enabled, the result should be mem=pagesize, io=blksize +for d in standard always ; do + log_must zfs set direct=$d $TESTDS + assert_dioalign $TESTFILE $PAGE_SIZE 65536 +done + +# the io size always comes from the file blocksize, so changing the recordsize +# won't change the result +for rs in 32K 64K 128K 256K 512K ; do + log_must zfs set recordsize=$rs $TESTDS + for d in standard always ; do + log_must zfs set direct=$d $TESTDS + assert_dioalign $TESTFILE $PAGE_SIZE 65536 + done +done + +# extending a file into the second block fixes its blocksize, so result should +# be its blocksize, regardless of how the recordsize changes +log_must zfs set recordsize=128K $TESTDS +log_must dd if=/dev/urandom of=$TESTFILE bs=64K count=1 seek=1 +log_must zpool sync +assert_dioalign $TESTFILE $PAGE_SIZE 131072 +log_must dd if=/dev/urandom of=$TESTFILE bs=64K count=1 seek=2 +log_must zpool sync +assert_dioalign $TESTFILE $PAGE_SIZE 131072 + +log_pass $CLAIM