From ec83b23032d6732fbca8f200c354e3a9b036b4ea Mon Sep 17 00:00:00 2001
From: Jason Lee
Date: Wed, 3 Feb 2021 09:46:09 -0700
Subject: [PATCH] ZFS Interface for Accelerators (Z.I.A.)

The ZIO write pipeline has been modified to allow for external,
alternative implementations of operations to be used. The original
ZFS functions remain in the code as a fallback in case the external
implementation fails.

Definitions:
    Accelerator - an entity (usually hardware) that is intended to
                  accelerate operations
    Offloader   - synonym of accelerator; used interchangeably
    Data Processing Unit Services Module (DPUSM)
        - https://github.com/hpc/dpusm
        - defines a "provider API" for accelerator vendors to set up
        - defines a "user API" for accelerator consumers to call
        - maintains a list of providers and coordinates interactions
          between providers and consumers
    Provider - a DPUSM wrapper for an accelerator's API
    Offload  - moving data from ZFS/memory to the accelerator
    Onload   - the opposite of offload

In order for Z.I.A. to be extensible, it does not directly communicate
with a fixed accelerator. Rather, Z.I.A. acquires a handle to a DPUSM,
which is then used to acquire handles to providers.

Using ZFS with Z.I.A.:
    1. Build and start the DPUSM
    2. Implement, build, and register a provider with the DPUSM
    3. Reconfigure ZFS with '--with-zia=<path to DPUSM root>'
    4. Rebuild and start ZFS
    5. Create a zpool
    6. Select the provider
           zpool set zia_provider=<provider name> <zpool>
    7. Select operations to offload (see the example below)
           zpool set zia_<operation>=on <zpool>

The functions that can be replaced with alternative operations are:
    - compression
        - data is offloaded and then compressed
        - metadata is compressed in-memory and then offloaded
    - decompression can be replaced, but the replacement function is
      not called anywhere
    - checksum
        - checksum compute and checksum error call the same function
    - raidz
        - generation
        - reconstruction
    - vdev_file
        - open
        - write
        - close
    - vdev_disk
        - open
        - invalidate
        - write
        - flush
        - close

abd_t, raidz_row_t, and vdev_t have each been given an additional
"void *_zia_handle" member. These opaque handles point to data that
is located on an offloader. abds are still allocated, but their
contents are expected to diverge from the offloaded copy as
operations are run.

The modifications to ZFS can be thought of as two sets of changes:
    - The ZIO write pipeline
        - compression, checksum, RAIDZ generation, and write
        - Each stage starts by offloading data that was not
          previously offloaded
            - This allows for ZIOs to be offloaded at any point
              in these stages
        - Successful operations do not onload back into memory
          between stages
        - Errors cause data to be onloaded, or dropped if the copy
          in memory matches the offloaded copy
            - This might cause thrashing, but should not happen
              often, as the intention is for all of the stages to
              be offloaded, and thus not require onloading
    - Resilver
        - RAIDZ reconstruction, checksum, RAIDZ generation, and write
        - Because resilver is only one stage in the ZIO pipeline,
          data is only offloaded once at the beginning
        - Errors cause data to be onloaded, but will not re-offload
          in subsequent steps within resilver

ARC compression is disabled when Z.I.A. is enabled.
Aggregation is disabled for offloaded abds.

RPMs will build with Z.I.A.
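As a minimal sketch of steps 3, 6, and 7 above (the DPUSM path, the
pool name 'tank', the device names, and the provider name
'software_provider' are placeholders, not values defined by this
patch):

    # configure and rebuild ZFS against a DPUSM checkout
    ./configure --with-zia=/path/to/dpusm
    make -j"$(nproc)"

    # create a pool and select a provider registered with the DPUSM
    zpool create tank raidz2 sda sdb sdc sdd
    zpool set zia_provider=software_provider tank

    # enable the stages to offload
    # (property names match module/zcommon/zpool_prop.c below)
    zpool set zia_compress=on tank
    zpool set zia_checksum=on tank
    zpool set zia_raidz2_gen=on tank
    zpool set zia_disk_write=on tank

If a stage cannot be offloaded at runtime, the original in-memory
ZFS implementation is used, as described above.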
Added example provider in module/zia-software-provider Signed-off-by: Jason Lee --- Makefile.am | 2 + cmd/raidz_test/raidz_test.c | 3 + config/Rules.am | 1 + config/zfs-build.m4 | 6 +- config/zia.m4 | 42 + include/sys/abd.h | 3 + include/sys/fs/zfs.h | 14 + include/sys/spa_impl.h | 8 + include/sys/vdev_disk.h | 7 + include/sys/vdev_impl.h | 4 + include/sys/vdev_raidz.h | 7 + include/sys/vdev_raidz_impl.h | 3 + include/sys/zia.h | 197 +++ include/sys/zia_cddl.h | 58 + include/sys/zia_private.h | 78 ++ include/sys/zio.h | 3 + include/sys/zio_compress.h | 4 + lib/libzpool/Makefile.am | 2 + man/man7/zpoolprops.7 | 37 + module/Kbuild.in | 19 + module/Makefile.in | 6 +- module/os/linux/zfs/vdev_disk.c | 74 + module/os/linux/zfs/vdev_file.c | 40 +- module/zcommon/zpool_prop.c | 39 + module/zfs/THIRDPARTYLICENSE.zia | 42 + module/zfs/THIRDPARTYLICENSE.zia.descrip | 1 + module/zfs/abd.c | 31 +- module/zfs/arc.c | 9 + module/zfs/dmu.c | 14 + module/zfs/spa.c | 187 +++ module/zfs/vdev.c | 8 + module/zfs/vdev_draid.c | 3 + module/zfs/vdev_raidz.c | 168 ++- module/zfs/zia.c | 1187 +++++++++++++++++ module/zfs/zia_cddl.c | 232 ++++ module/zfs/zio.c | 126 ++ module/zfs/zio_checksum.c | 48 + module/zfs/zio_compress.c | 4 + .../zia-software-provider/kernel_offloader.c | 766 +++++++++++ .../zia-software-provider/kernel_offloader.h | 149 +++ module/zia-software-provider/software.c | 451 +++++++ rpm/generic/zfs-kmod.spec.in | 10 +- rpm/generic/zfs.spec.in | 10 +- rpm/redhat/zfs-kmod.spec.in | 9 +- 44 files changed, 4099 insertions(+), 13 deletions(-) create mode 100644 config/zia.m4 create mode 100644 include/sys/zia.h create mode 100644 include/sys/zia_cddl.h create mode 100644 include/sys/zia_private.h create mode 100644 module/zfs/THIRDPARTYLICENSE.zia create mode 100644 module/zfs/THIRDPARTYLICENSE.zia.descrip create mode 100644 module/zfs/zia.c create mode 100644 module/zfs/zia_cddl.c create mode 100644 module/zia-software-provider/kernel_offloader.c create mode 100644 module/zia-software-provider/kernel_offloader.h create mode 100644 module/zia-software-provider/software.c diff --git a/Makefile.am b/Makefile.am index 54d300e7d40b..11b3489d2ac2 100644 --- a/Makefile.am +++ b/Makefile.am @@ -57,6 +57,8 @@ dist_noinst_DATA += module/os/linux/spl/THIRDPARTYLICENSE.gplv2 dist_noinst_DATA += module/os/linux/spl/THIRDPARTYLICENSE.gplv2.descrip dist_noinst_DATA += module/zfs/THIRDPARTYLICENSE.cityhash dist_noinst_DATA += module/zfs/THIRDPARTYLICENSE.cityhash.descrip +dist_noinst_DATA += module/zfs/THIRDPARTYLICENSE.zia +dist_noinst_DATA += module/zfs/THIRDPARTYLICENSE.zia.descrip @CODE_COVERAGE_RULES@ diff --git a/cmd/raidz_test/raidz_test.c b/cmd/raidz_test/raidz_test.c index 8b21bc098e01..8e2d1e16ed9a 100644 --- a/cmd/raidz_test/raidz_test.c +++ b/cmd/raidz_test/raidz_test.c @@ -455,6 +455,9 @@ vdev_raidz_map_alloc_expanded(abd_t *abd, uint64_t size, uint64_t offset, rr->rr_firstdatacol = nparity; rr->rr_abd_empty = NULL; rr->rr_nempty = 0; +#ifdef ZIA + rr->rr_zia_handle = NULL; +#endif for (int c = 0; c < rr->rr_cols; c++, child_id++) { if (child_id >= row_phys_cols) { diff --git a/config/Rules.am b/config/Rules.am index 7162b771869d..86911a5272c4 100644 --- a/config/Rules.am +++ b/config/Rules.am @@ -40,6 +40,7 @@ AM_CPPFLAGS += -DPKGDATADIR=\"$(pkgdatadir)\" AM_CPPFLAGS += $(DEBUG_CPPFLAGS) AM_CPPFLAGS += $(CODE_COVERAGE_CPPFLAGS) AM_CPPFLAGS += -DTEXT_DOMAIN=\"zfs-@ac_system_l@-user\" +AM_CPPFLAGS += $(ZIA_CPPFLAGS) AM_CPPFLAGS_NOCHECK = -D"strtok(...)=strtok(__VA_ARGS__) __attribute__((deprecated(\"Use 
strtok_r(3) instead!\")))" AM_CPPFLAGS_NOCHECK += -D"__xpg_basename(...)=__xpg_basename(__VA_ARGS__) __attribute__((deprecated(\"basename(3) is underspecified. Use zfs_basename() instead!\")))" diff --git a/config/zfs-build.m4 b/config/zfs-build.m4 index d14a6bb7ac9f..ff0468b3c893 100644 --- a/config/zfs-build.m4 +++ b/config/zfs-build.m4 @@ -260,6 +260,8 @@ AC_DEFUN([ZFS_AC_CONFIG], [ AC_SUBST(TEST_JOBS) ]) + ZFS_AC_ZIA + ZFS_INIT_SYSV= ZFS_INIT_SYSTEMD= ZFS_WANT_MODULES_LOAD_D= @@ -291,7 +293,8 @@ AC_DEFUN([ZFS_AC_CONFIG], [ [test "x$qatsrc" != x ]) AM_CONDITIONAL([WANT_DEVNAME2DEVID], [test "x$user_libudev" = xyes ]) AM_CONDITIONAL([WANT_MMAP_LIBAIO], [test "x$user_libaio" = xyes ]) - AM_CONDITIONAL([PAM_ZFS_ENABLED], [test "x$enable_pam" = xyes]) + AM_CONDITIONAL([PAM_ZFS_ENABLED], [test "x$enable_pam" = xyes ]) + AM_CONDITIONAL([ZIA_ENABLED], [test "x$enable_zia" = xyes ]) ]) dnl # @@ -334,6 +337,7 @@ AC_DEFUN([ZFS_AC_RPM], [ RPM_DEFINE_COMMON=${RPM_DEFINE_COMMON}' --define "$(DEBUG_KMEM_TRACKING_ZFS) 1"' RPM_DEFINE_COMMON=${RPM_DEFINE_COMMON}' --define "$(ASAN_ZFS) 1"' RPM_DEFINE_COMMON=${RPM_DEFINE_COMMON}' --define "$(UBSAN_ZFS) 1"' + RPM_DEFINE_COMMON=${RPM_DEFINE_COMMON}' --define "$(WITH_ZIA) 1" --define "DPUSM_ROOT $(DPUSM_ROOT)"' AS_IF([test "x$enable_debuginfo" = xyes], [ RPM_DEFINE_COMMON=${RPM_DEFINE_COMMON}' --define "__strip /bin/true"' diff --git a/config/zia.m4 b/config/zia.m4 new file mode 100644 index 000000000000..8ec2de1f466d --- /dev/null +++ b/config/zia.m4 @@ -0,0 +1,42 @@ +dnl # Adds --with-zia=PATH to configuration options +dnl # The path provided should point to the DPUSM +dnl # root and contain Module.symvers. +AC_DEFUN([ZFS_AC_ZIA], [ + AC_ARG_WITH([zia], + AS_HELP_STRING([--with-zia=PATH], + [Path to Data Processing Services Module]), + [ + DPUSM_ROOT="$withval" + enable_zia=yes + ] + ) + + AS_IF([test "x$enable_zia" == "xyes"], + AS_IF([! 
test -d "$DPUSM_ROOT"], + [AC_MSG_ERROR([--with-zia=PATH requires the DPUSM root directory])] + ) + + DPUSM_SYMBOLS="$DPUSM_ROOT/Module.symvers" + + AS_IF([test -r $DPUSM_SYMBOLS], + [ + AC_MSG_RESULT([$DPUSM_SYMBOLS]) + ZIA_CPPFLAGS="-DZIA=1 -I$DPUSM_ROOT/include" + KERNEL_ZIA_CPPFLAGS="-DZIA=1 -I$DPUSM_ROOT/include" + WITH_ZIA="_with_zia" + + AC_SUBST(WITH_ZIA) + AC_SUBST(KERNEL_ZIA_CPPFLAGS) + AC_SUBST(ZIA_CPPFLAGS) + AC_SUBST(DPUSM_SYMBOLS) + AC_SUBST(DPUSM_ROOT) + ], + [ + AC_MSG_ERROR([ + *** Failed to find Module.symvers in: + $DPUSM_SYMBOLS + ]) + ] + ) + ) +]) diff --git a/include/sys/abd.h b/include/sys/abd.h index 5c6bd0c271d4..2d29b712d6bf 100644 --- a/include/sys/abd.h +++ b/include/sys/abd.h @@ -75,6 +75,9 @@ typedef struct abd { list_t abd_gang_chain; } abd_gang; } abd_u; +#ifdef ZIA + void *abd_zia_handle; +#endif } abd_t; typedef int abd_iter_func_t(void *buf, size_t len, void *priv); diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index f013e6b20603..e735079b6b6e 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -251,6 +251,20 @@ typedef enum { ZPOOL_PROP_LOAD_GUID, ZPOOL_PROP_AUTOTRIM, ZPOOL_PROP_COMPATIBILITY, +#ifdef ZIA + ZPOOL_PROP_ZIA_PROVIDER, + ZPOOL_PROP_ZIA_COMPRESS, + ZPOOL_PROP_ZIA_DECOMPRESS, + ZPOOL_PROP_ZIA_CHECKSUM, + ZPOOL_PROP_ZIA_RAIDZ1_GEN, + ZPOOL_PROP_ZIA_RAIDZ2_GEN, + ZPOOL_PROP_ZIA_RAIDZ3_GEN, + ZPOOL_PROP_ZIA_RAIDZ1_REC, + ZPOOL_PROP_ZIA_RAIDZ2_REC, + ZPOOL_PROP_ZIA_RAIDZ3_REC, + ZPOOL_PROP_ZIA_FILE_WRITE, + ZPOOL_PROP_ZIA_DISK_WRITE, +#endif ZPOOL_NUM_PROPS } zpool_prop_t; diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 9946c4e3c316..37b8e20150d1 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -53,6 +53,10 @@ #include #include +#ifdef ZIA +#include +#endif + #ifdef __cplusplus extern "C" { #endif @@ -441,6 +445,10 @@ struct spa { zfs_refcount_t spa_refcount; /* number of opens */ taskq_t *spa_upgrade_taskq; /* taskq for upgrade jobs */ + +#ifdef ZIA + zia_props_t spa_zia_props; +#endif }; extern char *spa_config_path; diff --git a/include/sys/vdev_disk.h b/include/sys/vdev_disk.h index a7e19fbf0c4b..d0241b9da9b6 100644 --- a/include/sys/vdev_disk.h +++ b/include/sys/vdev_disk.h @@ -42,5 +42,12 @@ #ifdef _KERNEL #include + +#ifdef ZIA +int __vdev_disk_physio(struct block_device *bdev, zio_t *zio, + size_t io_size, uint64_t io_offset, int rw, int flags); +int vdev_disk_io_flush(struct block_device *bdev, zio_t *zio); +void vdev_disk_error(zio_t *zio); +#endif /* ZIA */ #endif /* _KERNEL */ #endif /* _SYS_VDEV_DISK_H */ diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index db8fbdeb06df..6f34b6dc26cd 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -465,6 +465,10 @@ struct vdev { zfs_ratelimit_t vdev_delay_rl; zfs_ratelimit_t vdev_deadman_rl; zfs_ratelimit_t vdev_checksum_rl; + +#ifdef ZIA + void *vdev_zia_handle; +#endif }; #define VDEV_PAD_SIZE (8 << 10) diff --git a/include/sys/vdev_raidz.h b/include/sys/vdev_raidz.h index c7cf0af6d945..dd0c9042c2b6 100644 --- a/include/sys/vdev_raidz.h +++ b/include/sys/vdev_raidz.h @@ -70,6 +70,13 @@ typedef struct vdev_raidz { int vd_nparity; } vdev_raidz_t; +#ifdef ZIA +void vdev_raidz_generate_parity_p(struct raidz_row *); +void vdev_raidz_generate_parity_pq(struct raidz_row *); +void vdev_raidz_generate_parity_pqr(struct raidz_row *); +void vdev_raidz_reconstruct_general(struct raidz_row *, int *, int); +#endif + #ifdef __cplusplus } #endif diff --git a/include/sys/vdev_raidz_impl.h 
b/include/sys/vdev_raidz_impl.h index 890e725e18d8..fcee8b8ccf89 100644 --- a/include/sys/vdev_raidz_impl.h +++ b/include/sys/vdev_raidz_impl.h @@ -129,6 +129,9 @@ typedef struct raidz_row { #ifdef ZFS_DEBUG uint64_t rr_offset; /* Logical offset for *_io_verify() */ uint64_t rr_size; /* Physical size for *_io_verify() */ +#endif +#ifdef ZIA + void *rr_zia_handle; #endif raidz_col_t rr_col[0]; /* Flexible array of I/O columns */ } raidz_row_t; diff --git a/include/sys/zia.h b/include/sys/zia.h new file mode 100644 index 000000000000..2d0b33f1cdb2 --- /dev/null +++ b/include/sys/zia.h @@ -0,0 +1,197 @@ +/* + * © 2021. Triad National Security, LLC. All rights reserved. + * + * This program was produced under U.S. Government contract + * 89233218CNA000001 for Los Alamos National Laboratory (LANL), which + * is operated by Triad National Security, LLC for the U.S. + * Department of Energy/National Nuclear Security Administration. All + * rights in the program are reserved by Triad National Security, LLC, + * and the U.S. Department of Energy/National Nuclear Security + * Administration. The Government is granted for itself and others + * acting on its behalf a nonexclusive, paid-up, irrevocable worldwide + * license in this material to reproduce, prepare derivative works, + * distribute copies to the public, perform publicly and display + * publicly, and to permit others to do so. + * + * ---- + * + * This program is open source under the BSD-3 License. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#ifdef ZIA + +#ifndef _ZIA_H +#define _ZIA_H + +#include +#include /* VDEV_RAIDZ_MAXPARITY */ +#include +#include +#include +#include +#include + +typedef struct raidz_row raidz_row_t; + +/* ******************************************************** */ +/* return values */ +#define ZIA_OK 0 + +/* something bad happened not related to missing functionality */ +#define ZIA_ERROR 1 + +/* error, fallback to zfs implementation */ +#define ZIA_FALLBACK 2 + +/* ran, but result is bad */ +#define ZIA_BAD_RESULT 3 + +/* expected provider and actual provider do not match */ +#define ZIA_PROVIDER_MISMATCH 4 +/* ******************************************************** */ + +/* + * This struct is normally set with + * zpool set zia_=on/off/ + * and passed around in spa_t. + */ +typedef struct zia_props { + void *provider; + + /* minimum size allowed to offload - set by ashift */ + size_t min_offload_size; + + int compress; + int decompress; + + int checksum; + + struct { + int gen[VDEV_RAIDZ_MAXPARITY + 1]; + int rec[VDEV_RAIDZ_MAXPARITY + 1]; + } raidz; + + int file_write; + int disk_write; +} zia_props_t; + +zia_props_t *zia_get_props(spa_t *spa); +void zia_prop_warn(boolean_t val, const char *name); + +int zia_init(void); +int zia_fini(void); + +void *zia_get_provider(const char *name); +const char *zia_get_provider_name(void *provider); +int zia_put_provider(void **provider); + +/* check if offloading can occur */ +boolean_t zia_is_used(zio_t *zio); + +/* + * check if a handle is associated with this pointer + * + * not exposing functions for different handles because + * only abd handles are checked outside of zia.c + */ +boolean_t zia_is_offloaded(abd_t *abd); + +/* create a new offloader handle without copying data */ +void *zia_alloc(void *provider, size_t size, size_t min_offload_size); + +/* deallocate handle without onloading */ +void zia_free(void **handle); + +/* move linear data between from the offloader to memory */ +int zia_onload(void **handle, void *buf, size_t size); + +/* calls abd_iterate_func on the abd to copy abd data back and forth */ +int zia_offload_abd(void *provider, abd_t *abd, + size_t size, size_t min_offload_size, boolean_t *local_offload); +int zia_onload_abd(abd_t *abd, size_t size, boolean_t keep_handle); +/* move a handle into an abd */ +void zia_move_into_abd(abd_t *dst, void **src); +int zia_free_abd(abd_t *abd, boolean_t lock); + +/* + * if offloaded locally, just free the handle + * if not, onload the data and free the handle + */ +int zia_cleanup_abd(abd_t *abd, size_t size, boolean_t local_offload); + +/* fill a buffer with zeros */ +int zia_zero_fill(abd_t *abd, size_t offset, size_t size); + +int zia_compress(void *provider, zio_t *zio, size_t s_len, + enum zio_compress c, uint8_t level, void **cbuf_handle, + uint64_t *c_len, boolean_t *local_offload); + +int zia_checksum_compute(void *provider, zio_cksum_t *dst, + enum zio_checksum alg, zio_t *zio, uint64_t size, + boolean_t *local_offload); +int zia_checksum_error(const blkptr_t *bp, enum zio_checksum alg, + abd_t *abd, uint64_t size, zio_bad_cksum_t *info); + +/* raidz */ +int zia_raidz_alloc(zio_t *zio, raidz_row_t *rr, boolean_t rec, + uint_t cksum, boolean_t *local_offload); +int zia_raidz_free(raidz_row_t *rr, boolean_t onload_parity); +int zia_raidz_gen(raidz_row_t *rr); +int zia_raidz_gen_cleanup(zio_t *zio, raidz_row_t *rr, + boolean_t local_offload); +int zia_raidz_new_parity(zio_t *zio, raidz_row_t *rr, abd_t **orig); +/* compare the contents of offloaded abds (only used in resilver) 
*/ +int zia_raidz_cmp(abd_t *lhs, abd_t *rhs, int *diff); +int zia_raidz_rec(raidz_row_t *rr, int *t, int nt); +int zia_raidz_rec_cleanup(zio_t *zio, raidz_row_t *rr, + boolean_t local_offload, boolean_t onload_parity); + +/* file I/O */ +int zia_file_open(vdev_t *vdev, const char *path, + int flags, int mode); +int zia_file_write(vdev_t *vdev, abd_t *abd, ssize_t size, + loff_t offset, ssize_t *resid, int *err); +int zia_file_close(vdev_t *vdev); + +#ifdef _KERNEL +#include + +/* disk I/O */ +int zia_disk_open(vdev_t *vdev, const char *path, + struct block_device *bdev); +int zia_disk_invalidate(vdev_t *vdev); +int zia_disk_write(vdev_t *vdev, zio_t *zio, + size_t io_size, uint64_t io_offset, int flags); +int zia_disk_flush(vdev_t *vdev, zio_t *zio); +int zia_disk_close(vdev_t *vdev); +#endif + +#endif + +#endif diff --git a/include/sys/zia_cddl.h b/include/sys/zia_cddl.h new file mode 100644 index 000000000000..74e88abbeae5 --- /dev/null +++ b/include/sys/zia_cddl.h @@ -0,0 +1,58 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifdef ZIA + +#ifndef _ZIA_CDDL_H +#define _ZIA_CDDL_H + +#include +#include +#include +#include +#include + +#include + +int +zia_compress_impl(const dpusm_uf_t *dpusm, void *provider, + zio_t *zio, size_t s_len, enum zio_compress c, uint8_t level, + void **cbuf_handle, uint64_t *c_len, boolean_t *local_offload); + +int +zia_checksum_error_impl(const dpusm_uf_t *dpusm, const blkptr_t *bp, + enum zio_checksum alg, abd_t *abd, uint64_t size, zio_bad_cksum_t *info); + +int +zia_raidz_rec_impl(const dpusm_uf_t *dpusm, + raidz_row_t *rr, int *t, int nt); + +#ifdef _KERNEL +void +zia_disk_write_completion(void *zio_ptr, int error); + +void +zia_disk_flush_completion(void *zio_ptr, int error); +#endif + +#endif + +#endif diff --git a/include/sys/zia_private.h b/include/sys/zia_private.h new file mode 100644 index 000000000000..121f97c9e31c --- /dev/null +++ b/include/sys/zia_private.h @@ -0,0 +1,78 @@ +/* + * © 2021. Triad National Security, LLC. All rights reserved. + * + * This program was produced under U.S. Government contract + * 89233218CNA000001 for Los Alamos National Laboratory (LANL), which + * is operated by Triad National Security, LLC for the U.S. + * Department of Energy/National Nuclear Security Administration. All + * rights in the program are reserved by Triad National Security, LLC, + * and the U.S. Department of Energy/National Nuclear Security + * Administration. The Government is granted for itself and others + * acting on its behalf a nonexclusive, paid-up, irrevocable worldwide + * license in this material to reproduce, prepare derivative works, + * distribute copies to the public, perform publicly and display + * publicly, and to permit others to do so. 
+ * + * ---- + * + * This program is open source under the BSD-3 License. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifdef ZIA + +#ifndef _ZIA_PRIVATE_H +#define _ZIA_PRIVATE_H + +#include +#include +#include + +/* + * needed by both zia.h and zia_cddl.h + * defined in zia.c + */ + +#define ABD_HANDLE(abd) (abd)->abd_zia_handle + +#define VDEV_HANDLE(vdev) (vdev)->vdev_zia_handle + +dpusm_compress_t +translate_compress(enum zio_compress c); + +dpusm_checksum_t +translate_checksum(enum zio_checksum c); + +dpusm_checksum_byteorder_t +translate_byteorder(zio_byteorder_t bo); + +int zia_get_capabilities(void *provider, dpusm_pc_t **caps); + +#endif + +#endif diff --git a/include/sys/zio.h b/include/sys/zio.h index 9bee7cc9b9fd..238f10942be3 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -350,6 +350,9 @@ typedef struct zio_prop { uint8_t zp_iv[ZIO_DATA_IV_LEN]; uint8_t zp_mac[ZIO_DATA_MAC_LEN]; uint32_t zp_zpl_smallblk; +#ifdef ZIA + boolean_t zp_ismd; +#endif } zio_prop_t; typedef struct zio_cksum_report zio_cksum_report_t; diff --git a/include/sys/zio_compress.h b/include/sys/zio_compress.h index 26600b43bb49..fe50829e984a 100644 --- a/include/sys/zio_compress.h +++ b/include/sys/zio_compress.h @@ -154,6 +154,10 @@ typedef const struct zio_compress_info { extern const zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS]; +#ifdef ZIA +extern int zio_compress_zeroed_cb(void *data, size_t len, void *private); +#endif + /* * lz4 compression init & free */ diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index eaa920e56106..711e2f0afe76 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -179,6 +179,8 @@ nodist_libzpool_la_SOURCES = \ module/zfs/zfs_rlock.c \ module/zfs/zfs_sa.c \ module/zfs/zil.c \ + module/zfs/zia.c \ + module/zfs/zia_cddl.c \ module/zfs/zio.c \ module/zfs/zio_checksum.c \ module/zfs/zio_compress.c \ diff --git a/man/man7/zpoolprops.7 b/man/man7/zpoolprops.7 index 944fbf2b8d29..b7f65a6840ee 100644 --- a/man/man7/zpoolprops.7 +++ b/man/man7/zpoolprops.7 @@ 
-414,4 +414,41 @@ command, though this property can be used when a specific version is needed for backwards compatibility. Once feature flags are enabled on a pool this property will no longer have a value. + +.It Sy zia_checksum Ns = Ns Sy on Ns | Ns Sy off +Controls whether the pool should offload checksum computations. Does not have any effect if the checksum stage is disabled. Embedded checksums are onloaded, and will suffer a data movement penalty. + +.It Sy zia_compress Ns = Ns Sy on Ns | Ns Sy off +Controls whether the pool should offload compression. Does not have any effect if the compression stage is disabled. Embedded data is onloaded, and will suffer a data movement penalty. + +.It Sy zia_decompress Ns = Ns Sy on Ns | Ns Sy off +Controls whether the pool should offload decompression. Does not have any effect if the decompression stage is disabled. + +.It Sy zia_disk_write Ns = Ns Sy on Ns | Ns Sy off +Controls whether a pool should offload write I/Os to disks. + +.It Sy zia_file_write Ns = Ns Sy on Ns | Ns Sy off +Controls whether a pool should offload write I/Os to files. + +.It Sy zia_provider Ns = Ns Sy (unset)| Ns Sy Z.I.A. Provider Name +Selects an accelerator registered in the Data Processing Unit Services Module to offload data to. Only one accelerator can be used by a pool at a time. + +.It Sy zia_raidz1_gen Ns = Ns Sy on Ns | Ns Sy off +Controls whether the pool should offload RAIDZ1 parity generation. Does not have any effect if RAIDZ1 is disabled. + +.It Sy zia_raidz1_rec Ns = Ns Sy on Ns | Ns Sy off +Controls whether the pool should offload RAIDZ1 reconstruction. Does not have any effect if RAIDZ1 is disabled. + +.It Sy zia_raidz2_gen Ns = Ns Sy on Ns | Ns Sy off +Controls whether the pool should offload RAIDZ2 parity generation. Does not have any effect if RAIDZ2 is disabled. + +.It Sy zia_raidz2_rec Ns = Ns Sy on Ns | Ns Sy off +Controls whether the pool should offload RAIDZ2 reconstruction. Does not have any effect if RAIDZ2 is disabled. + +.It Sy zia_raidz3_gen Ns = Ns Sy on Ns | Ns Sy off +Controls whether the pool should offload RAIDZ3 parity generation. Does not have any effect if RAIDZ3 is disabled. + +.It Sy zia_raidz3_rec Ns = Ns Sy on Ns | Ns Sy off +Controls whether the pool should offload RAIDZ3 reconstruction. Does not have any effect if RAIDZ3 is disabled. + .El diff --git a/module/Kbuild.in b/module/Kbuild.in index 4803952cbfed..e606ffa30ee2 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -27,6 +27,7 @@ ZFS_MODULE_CFLAGS += -I$(zfs_include)/os/linux/zfs ZFS_MODULE_CFLAGS += -I$(zfs_include) ZFS_MODULE_CPPFLAGS += -D_KERNEL ZFS_MODULE_CPPFLAGS += @KERNEL_DEBUG_CPPFLAGS@ +ZFS_MODULE_CPPFLAGS += @KERNEL_ZIA_CPPFLAGS@ # KASAN enables -Werror=frame-larger-than=1024, which # breaks oh so many parts of our build.
@@ -401,6 +402,8 @@ ZFS_OBJS := \ zfs_sa.o \ zfs_vnops.o \ zil.o \ + zia.o \ + zia_cddl.o \ zio.o \ zio_checksum.o \ zio_compress.o \ @@ -472,3 +475,19 @@ OBJECT_FILES_NON_STANDARD_vdev_raidz_math_avx512f.o := y ifeq ($(CONFIG_ALTIVEC),y) $(obj)/zfs/vdev_raidz_math_powerpc_altivec.o : c_flags += -maltivec endif + +ifneq ("@DPUSM_SYMBOLS@","") +obj-$(CONFIG_ZFS) += zia-software-provider.o + +ZIA_SOFTWARE_PROVIDER_OBJS := \ + software.o \ + kernel_offloader.o + +zia-software-provider-objs += $(addprefix zia-software-provider/,$(ZIA_SOFTWARE_PROVIDER_OBJS)) +# zfs_file_os does not have any dependencies, so just link to it directly +zia-software-provider-objs += os/linux/zfs/zfs_file_os.o + +$(addprefix $(obj)/zia-software-provider/,$(ZIA_SOFTWARE_PROVIDER_OBJS)) : ccflags-y += -I@abs_top_builddir@ $(ZFS_MODULE_CFLAGS) -I@abs_srcdir@/zia-software-provider/ -I@DPUSM_ROOT@/include + +@ZIA_ENABLED_TRUE@KBUILD_EXTRA_SYMBOLS += @DPUSM_SYMBOLS@ +endif \ No newline at end of file diff --git a/module/Makefile.in b/module/Makefile.in index 5b71e1abf79e..465384413fb5 100644 --- a/module/Makefile.in +++ b/module/Makefile.in @@ -80,7 +80,7 @@ clean: clean-@ac_system@ .PHONY: modules_uninstall-Linux-legacy modules_uninstall-Linux-legacy: - $(RM) -r $(addprefix $(KMODDIR)/$(INSTALL_MOD_DIR)/,spl/ avl/ icp/ lua/ nvpair/ unicode/ zcommon/ zfs/ zstd/) + $(RM) -r $(addprefix $(KMODDIR)/$(INSTALL_MOD_DIR)/,spl/ avl/ icp/ lua/ nvpair/ unicode/ zcommon/ zfs/ zstd/ zia-software-provider) KMODDIR := $(INSTALL_MOD_PATH)/lib/modules/@LINUX_VERSION@ modules_install-Linux: modules_uninstall-Linux-legacy @@ -123,7 +123,7 @@ data_install: data_install-@ac_system@ modules_uninstall-Linux: modules_uninstall-Linux-legacy @# Uninstall the kernel modules - $(RM) $(addprefix $(KMODDIR)/$(INSTALL_MOD_DIR)/,zfs.ko spl.ko) + $(RM) $(addprefix $(KMODDIR)/$(INSTALL_MOD_DIR)/,zfs.ko spl.ko zia-software-provider) modules_uninstall-FreeBSD: @false @@ -153,7 +153,7 @@ cppcheck-Linux: -I @top_srcdir@/include/os/linux/spl \ -I @top_srcdir@/include/os/linux/zfs \ -I @top_srcdir@/include \ - avl icp lua nvpair unicode zcommon zfs zstd os/linux + avl icp lua nvpair unicode zcommon zfs zstd os/linux zia-software-provider cppcheck-FreeBSD: @true diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 9a382261df73..b88a126be002 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -41,6 +41,10 @@ #include #endif +#ifdef ZIA +#include +#endif + typedef struct vdev_disk { struct block_device *vd_bdev; krwlock_t vd_lock; @@ -154,7 +158,11 @@ bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk) return (psize); } +#ifndef ZIA static void +#else +void +#endif vdev_disk_error(zio_t *zio) { /* @@ -224,6 +232,9 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, reread_part = B_TRUE; } +#ifdef ZIA + zia_disk_close(v); +#endif blkdev_put(bdev, mode | FMODE_EXCL); } @@ -335,6 +346,13 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, *logical_ashift = highbit64(MAX(logical_block_size, SPA_MINBLOCKSIZE)) - 1; +#ifdef ZIA + zia_get_props(v->vdev_spa)->min_offload_size = 2 << *physical_ashift; + + /* open disk; ignore errors - will fall back to ZFS */ + zia_disk_open(v, v->vdev_path, vd->vd_bdev); +#endif + return (0); } @@ -347,6 +365,9 @@ vdev_disk_close(vdev_t *v) return; if (vd->vd_bdev != NULL) { +#ifdef ZIA + zia_disk_close(v); +#endif blkdev_put(vd->vd_bdev, vdev_bdev_mode(spa_mode(v->vdev_spa)) | FMODE_EXCL); } @@ -602,7 +623,11 @@ 
vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset) #endif } +#ifndef ZIA static int +#else +int +#endif __vdev_disk_physio(struct block_device *bdev, zio_t *zio, size_t io_size, uint64_t io_offset, int rw, int flags) { @@ -709,6 +734,10 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, return (error); } +#ifdef ZIA +EXPORT_SYMBOL(__vdev_disk_physio); +#endif + BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error) { zio_t *zio = bio->bi_private; @@ -728,7 +757,11 @@ BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error) zio_interrupt(zio); } +#ifndef ZIA static int +#else +int +#endif vdev_disk_io_flush(struct block_device *bdev, zio_t *zio) { struct request_queue *q; @@ -751,6 +784,10 @@ vdev_disk_io_flush(struct block_device *bdev, zio_t *zio) return (0); } +#ifdef ZIA +EXPORT_SYMBOL(vdev_disk_io_flush); +#endif + static int vdev_disk_io_trim(zio_t *zio) { @@ -829,6 +866,19 @@ vdev_disk_io_start(zio_t *zio) break; } +#ifdef ZIA + error = zia_disk_flush(v, zio); + + /* + * have to return here in order to not dispatch + * this zio to multiple task queues + */ + if (error == 0) { + rw_exit(&vd->vd_lock); + return; + } +#endif + error = vdev_disk_io_flush(vd->vd_bdev, zio); if (error == 0) { rw_exit(&vd->vd_lock); @@ -868,8 +918,29 @@ vdev_disk_io_start(zio_t *zio) } zio->io_target_timestamp = zio_handle_io_delay(zio); + +#ifdef ZIA + error = EIO; + boolean_t local_offload = B_FALSE; + zia_props_t *zia_props = zia_get_props(zio->io_spa); + if ((rw == WRITE) && (zia_props->disk_write == 1)) { + if (zia_offload_abd(zia_props->provider, zio->io_abd, + zio->io_size, zia_props->min_offload_size, + &local_offload) == ZIA_OK) { + error = zia_disk_write(v, zio, zio->io_size, + zio->io_offset, 0); + } + } + + if (error != 0) { + zia_cleanup_abd(zio->io_abd, zio->io_size, local_offload); +#endif error = __vdev_disk_physio(vd->vd_bdev, zio, zio->io_size, zio->io_offset, rw, 0); +#ifdef ZIA + } +#endif + rw_exit(&vd->vd_lock); if (error) { @@ -892,6 +963,9 @@ vdev_disk_io_done(zio_t *zio) vdev_disk_t *vd = v->vdev_tsd; if (zfs_check_media_change(vd->vd_bdev)) { +#ifdef ZIA + zia_disk_invalidate(v); +#endif invalidate_bdev(vd->vd_bdev); v->vdev_remove_wanted = B_TRUE; spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); diff --git a/module/os/linux/zfs/vdev_file.c b/module/os/linux/zfs/vdev_file.c index f073145326e3..b615a2117970 100644 --- a/module/os/linux/zfs/vdev_file.c +++ b/module/os/linux/zfs/vdev_file.c @@ -39,6 +39,11 @@ #ifdef _KERNEL #include #endif + +#ifdef ZIA +#include +#endif + /* * Virtual device vector for files. 
*/ @@ -161,6 +166,14 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, } #endif +#ifdef ZIA + zia_get_props(vd->vdev_spa)->min_offload_size = 2 << *physical_ashift; + + /* try to open the file; ignore errors - will fall back to ZFS */ + zia_file_open(vd, vd->vdev_path, + vdev_file_open_mode(spa_mode(vd->vdev_spa)), 0); +#endif + skip_open: error = zfs_file_getattr(vf->vf_file, &zfa); @@ -184,6 +197,10 @@ vdev_file_close(vdev_t *vd) if (vd->vdev_reopening || vf == NULL) return; +#ifdef ZIA + zia_file_close(vd); +#endif + if (vf->vf_file != NULL) { (void) zfs_file_close(vf->vf_file); } @@ -203,20 +220,37 @@ vdev_file_io_strategy(void *arg) void *buf; loff_t off; ssize_t size; - int err; + int err = 0; off = zio->io_offset; size = zio->io_size; resid = 0; if (zio->io_type == ZIO_TYPE_READ) { - buf = abd_borrow_buf(zio->io_abd, zio->io_size); + buf = abd_borrow_buf(zio->io_abd, size); err = zfs_file_pread(vf->vf_file, buf, size, off, &resid); abd_return_buf_copy(zio->io_abd, buf, size); } else { - buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size); +#ifdef ZIA + boolean_t local_offload = B_FALSE; + zia_props_t *zia_props = zia_get_props(zio->io_spa); + if (zia_get_props(zio->io_spa)->file_write == 1) { + zia_offload_abd(zia_props->provider, zio->io_abd, + size, zia_props->min_offload_size, &local_offload); + + err = zia_file_write(vd, zio->io_abd, + size, off, &resid, &err); + } + + if (err != 0) { + zia_cleanup_abd(zio->io_abd, size, local_offload); +#endif + buf = abd_borrow_buf_copy(zio->io_abd, size); err = zfs_file_pwrite(vf->vf_file, buf, size, off, &resid); abd_return_buf(zio->io_abd, buf, size); +#ifdef ZIA + } +#endif } zio->io_error = err; if (resid != 0 && zio->io_error == 0) diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c index 853476a1fc16..94f5305c3a95 100644 --- a/module/zcommon/zpool_prop.c +++ b/module/zcommon/zpool_prop.c @@ -169,6 +169,45 @@ zpool_prop_init(void) PROP_TYPE_NUMBER, PROP_DEFAULT, ZFS_TYPE_POOL, "DEDUPDITTO", B_FALSE, sfeatures); +#ifdef ZIA + zprop_register_string(ZPOOL_PROP_ZIA_PROVIDER, "zia_provider", NULL, + PROP_DEFAULT, ZFS_TYPE_POOL, "", "PROVIDER", + sfeatures); + zprop_register_index(ZPOOL_PROP_ZIA_COMPRESS, "zia_compress", + 1, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", + "zia_compress", boolean_table, sfeatures); + zprop_register_index(ZPOOL_PROP_ZIA_DECOMPRESS, "zia_decompress", + 1, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", + "zia_decompress", boolean_table, sfeatures); + zprop_register_index(ZPOOL_PROP_ZIA_CHECKSUM, + "zia_checksum", 1, PROP_DEFAULT, ZFS_TYPE_POOL, + "on | off", "zia_checksum", boolean_table, sfeatures); + zprop_register_index(ZPOOL_PROP_ZIA_RAIDZ1_GEN, "zia_raidz1_gen", + 1, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", + "zia_raidz1_gen", boolean_table, sfeatures); + zprop_register_index(ZPOOL_PROP_ZIA_RAIDZ2_GEN, "zia_raidz2_gen", + 1, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", + "zia_raidz2_gen", boolean_table, sfeatures); + zprop_register_index(ZPOOL_PROP_ZIA_RAIDZ3_GEN, "zia_raidz3_gen", + 1, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", + "zia_raidz3_gen", boolean_table, sfeatures); + zprop_register_index(ZPOOL_PROP_ZIA_RAIDZ1_REC, "zia_raidz1_rec", + 1, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", + "zia_raidz1_rec", boolean_table, sfeatures); + zprop_register_index(ZPOOL_PROP_ZIA_RAIDZ2_REC, "zia_raidz2_rec", + 1, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", + "zia_raidz2_rec", boolean_table, sfeatures); + zprop_register_index(ZPOOL_PROP_ZIA_RAIDZ3_REC, "zia_raidz3_rec", + 1, PROP_DEFAULT, 
ZFS_TYPE_POOL, "on | off", + "zia_raidz3_rec", boolean_table, sfeatures); + zprop_register_index(ZPOOL_PROP_ZIA_FILE_WRITE, "zia_file_write", + 1, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", + "zia_file_write", boolean_table, sfeatures); + zprop_register_index(ZPOOL_PROP_ZIA_DISK_WRITE, "zia_disk_write", + 1, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", + "zia_disk_write", boolean_table, sfeatures); +#endif + zfs_mod_list_supported_free(sfeatures); } diff --git a/module/zfs/THIRDPARTYLICENSE.zia b/module/zfs/THIRDPARTYLICENSE.zia new file mode 100644 index 000000000000..9f81923f051d --- /dev/null +++ b/module/zfs/THIRDPARTYLICENSE.zia @@ -0,0 +1,42 @@ +© 2021. Triad National Security, LLC. All rights reserved. + +This program was produced under U.S. Government contract +89233218CNA000001 for Los Alamos National Laboratory (LANL), which +is operated by Triad National Security, LLC for the U.S. +Department of Energy/National Nuclear Security Administration. All +rights in the program are reserved by Triad National Security, LLC, +and the U.S. Department of Energy/National Nuclear Security +Administration. The Government is granted for itself and others +acting on its behalf a nonexclusive, paid-up, irrevocable worldwide +license in this material to reproduce, prepare derivative works, +distribute copies to the public, perform publicly and display +publicly, and to permit others to do so. + +---- + +This program is open source under the BSD-3 License. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from this +software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. diff --git a/module/zfs/THIRDPARTYLICENSE.zia.descrip b/module/zfs/THIRDPARTYLICENSE.zia.descrip new file mode 100644 index 000000000000..4be64904acc6 --- /dev/null +++ b/module/zfs/THIRDPARTYLICENSE.zia.descrip @@ -0,0 +1 @@ +Z.I.A. 
FUNCTIONALITY IN ZFS \ No newline at end of file diff --git a/module/zfs/abd.c b/module/zfs/abd.c index b6d7ac6407e3..04f2c2cac621 100644 --- a/module/zfs/abd.c +++ b/module/zfs/abd.c @@ -102,6 +102,10 @@ #include #include +#ifdef ZIA +#include +#endif + /* see block comment above for description */ int zfs_abd_scatter_enabled = B_TRUE; @@ -146,11 +150,19 @@ abd_init_struct(abd_t *abd) abd->abd_parent = NULL; #endif abd->abd_size = 0; + +#ifdef ZIA + abd->abd_zia_handle = NULL; +#endif } static void abd_fini_struct(abd_t *abd) { +#ifdef ZIA + zia_free_abd(abd, B_TRUE); +#endif + mutex_destroy(&abd->abd_mtx); ASSERT(!list_link_active(&abd->abd_gang_link)); #ifdef ZFS_DEBUG @@ -320,6 +332,10 @@ abd_free(abd_t *abd) abd_free_struct_impl(abd); } +#ifdef ZIA +EXPORT_SYMBOL(abd_free); +#endif + /* * Allocate an ABD of the same format (same metadata flag, same scatterize * setting) as another ABD. @@ -584,9 +600,15 @@ abd_get_offset_size(abd_t *sabd, size_t off, size_t size) abd_t * abd_get_zeros(size_t size) { + abd_t *abd = NULL; + ASSERT3P(abd_zero_scatter, !=, NULL); ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); - return (abd_get_offset_size(abd_zero_scatter, 0, size)); + + abd = abd_get_offset_size(abd_zero_scatter, 0, size); + abd->abd_flags |= ABD_FLAG_ZEROS; + + return (abd); } /* @@ -612,6 +634,10 @@ abd_get_from_buf(void *buf, size_t size) return (abd); } +#ifdef ZIA +EXPORT_SYMBOL(abd_get_from_buf); +#endif + /* * Get the raw buffer associated with a linear ABD. */ @@ -711,6 +737,9 @@ abd_release_ownership_of_buf(abd_t *abd) abd_update_linear_stats(abd, ABDSTAT_DECR); } +#ifdef ZIA +EXPORT_SYMBOL(abd_release_ownership_of_buf); +#endif /* * Give this ABD ownership of the buffer that it's storing. Can only be used on diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 74019ad08b4c..027724d37a1e 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -443,7 +443,11 @@ static const unsigned long zfs_arc_pool_dirty_percent = 20; /* * Enable or disable compressed arc buffers. */ +#ifndef ZIA int zfs_compressed_arc_enabled = B_TRUE; +#else +int zfs_compressed_arc_enabled = B_FALSE; +#endif /* * ARC will evict meta buffers that exceed arc_meta_limit. This @@ -11101,8 +11105,13 @@ ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, p_min_shift, param_set_arc_int, ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, average_blocksize, INT, ZMOD_RD, "Target average block size"); +#ifndef ZIA ZFS_MODULE_PARAM(zfs, zfs_, compressed_arc_enabled, INT, ZMOD_RW, "Disable compressed ARC buffers"); +#else +ZFS_MODULE_PARAM(zfs, zfs_, compressed_arc_enabled, INT, ZMOD_RD, + "Disable compressed ARC buffers"); +#endif ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prefetch_ms, param_set_arc_int, param_get_int, ZMOD_RW, "Min life of prefetch block in ms"); diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index e6008b3bf178..1cfe1ce5f960 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -59,6 +59,10 @@ #include #endif +#ifdef ZIA +#include +#endif + /* * Enable/disable nopwrite feature. 
*/ @@ -1961,6 +1965,10 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) boolean_t encrypt = B_FALSE; int copies = os->os_copies; +#ifdef ZIA + zp->zp_ismd = ismd; +#endif + /* * We maintain different write policies for each of the following * types of data: @@ -2289,6 +2297,9 @@ byteswap_uint8_array(void *vbuf, size_t size) void dmu_init(void) { +#ifdef ZIA + zia_init(); +#endif abd_init(); zfs_dbgmsg_init(); sa_cache_init(); @@ -2304,6 +2315,9 @@ dmu_init(void) void dmu_fini(void) { +#ifdef ZIA + zia_fini(); +#endif arc_fini(); /* arc depends on l2arc, so arc must go first */ l2arc_fini(); dmu_tx_fini(); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 55f3a4de603f..4f59c3b2b229 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -96,6 +96,10 @@ #include #endif /* _KERNEL */ +#ifdef ZIA +#include +#endif /* ZIA */ + #include "zfs_prop.h" #include "zfs_comutil.h" @@ -423,6 +427,48 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) dp->scd_path, 0, ZPROP_SRC_LOCAL); } } + +#ifdef ZIA + zia_props_t *zia_props = zia_get_props(spa); + if (zia_props->provider != NULL) { + spa_prop_add_list(*nvp, ZPOOL_PROP_ZIA_PROVIDER, + (char *)zia_get_provider_name(zia_props->provider), + 0, ZPROP_SRC_LOCAL); + } + + spa_prop_add_list(*nvp, ZPOOL_PROP_ZIA_COMPRESS, + NULL, zia_props->compress, ZPROP_SRC_LOCAL); + + spa_prop_add_list(*nvp, ZPOOL_PROP_ZIA_DECOMPRESS, + NULL, zia_props->decompress, ZPROP_SRC_LOCAL); + + spa_prop_add_list(*nvp, ZPOOL_PROP_ZIA_CHECKSUM, + NULL, zia_props->checksum, ZPROP_SRC_LOCAL); + + spa_prop_add_list(*nvp, ZPOOL_PROP_ZIA_RAIDZ1_GEN, + NULL, zia_props->raidz.gen[1], ZPROP_SRC_LOCAL); + + spa_prop_add_list(*nvp, ZPOOL_PROP_ZIA_RAIDZ2_GEN, + NULL, zia_props->raidz.gen[2], ZPROP_SRC_LOCAL); + + spa_prop_add_list(*nvp, ZPOOL_PROP_ZIA_RAIDZ3_GEN, + NULL, zia_props->raidz.gen[3], ZPROP_SRC_LOCAL); + + spa_prop_add_list(*nvp, ZPOOL_PROP_ZIA_RAIDZ1_REC, + NULL, zia_props->raidz.rec[1], ZPROP_SRC_LOCAL); + + spa_prop_add_list(*nvp, ZPOOL_PROP_ZIA_RAIDZ2_REC, + NULL, zia_props->raidz.rec[2], ZPROP_SRC_LOCAL); + + spa_prop_add_list(*nvp, ZPOOL_PROP_ZIA_RAIDZ3_REC, + NULL, zia_props->raidz.rec[3], ZPROP_SRC_LOCAL); + + spa_prop_add_list(*nvp, ZPOOL_PROP_ZIA_FILE_WRITE, + NULL, zia_props->file_write, ZPROP_SRC_LOCAL); + + spa_prop_add_list(*nvp, ZPOOL_PROP_ZIA_DISK_WRITE, + NULL, zia_props->disk_write, ZPROP_SRC_LOCAL); +#endif } /* @@ -719,6 +765,22 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) error = SET_ERROR(E2BIG); break; +#ifdef ZIA + case ZPOOL_PROP_ZIA_PROVIDER: + case ZPOOL_PROP_ZIA_COMPRESS: + case ZPOOL_PROP_ZIA_DECOMPRESS: + case ZPOOL_PROP_ZIA_CHECKSUM: + case ZPOOL_PROP_ZIA_RAIDZ1_GEN: + case ZPOOL_PROP_ZIA_RAIDZ2_GEN: + case ZPOOL_PROP_ZIA_RAIDZ3_GEN: + case ZPOOL_PROP_ZIA_RAIDZ1_REC: + case ZPOOL_PROP_ZIA_RAIDZ2_REC: + case ZPOOL_PROP_ZIA_RAIDZ3_REC: + case ZPOOL_PROP_ZIA_FILE_WRITE: + case ZPOOL_PROP_ZIA_DISK_WRITE: + break; +#endif + default: break; } @@ -1743,6 +1805,12 @@ spa_unload(spa_t *spa) spa->spa_compatibility = NULL; } +#ifdef ZIA + if (zia_get_props(spa)->provider != NULL) { + zia_put_provider(&zia_get_props(spa)->provider); + } +#endif + spa_config_exit(spa, SCL_ALL, spa); } @@ -8818,6 +8886,125 @@ spa_sync_props(void *arg, dmu_tx_t *tx) spa_history_log_internal(spa, "set", tx, "%s=%s", nvpair_name(elem), strval); break; +#ifdef ZIA + case ZPOOL_PROP_ZIA_PROVIDER: + strval = fnvpair_value_string(elem); + if (zia_get_props(spa)->provider != NULL) + zia_put_provider(&zia_get_props(spa)->provider); + 
zia_get_props(spa)->provider = zia_get_provider(strval); + /* + * Dirty the configuration on vdevs as above. + */ + if (tx->tx_txg != TXG_INITIAL) { + vdev_config_dirty(spa->spa_root_vdev); + spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); + } + + /* + * reopen devices so that provider is used + * copied from zfs_ioc_pool_reopen + */ + spa_vdev_state_enter(spa, SCL_NONE); + vdev_reopen(spa->spa_root_vdev); + (void) spa_vdev_state_exit(spa, NULL, 0); + + spa_history_log_internal(spa, "set", tx, + "%s=%s", nvpair_name(elem), strval); + break; + case ZPOOL_PROP_ZIA_COMPRESS: + zia_get_props(spa)->compress = + fnvpair_value_uint64(elem); + zia_prop_warn(zia_get_props(spa)->compress, + "Compression"); + break; + case ZPOOL_PROP_ZIA_DECOMPRESS: + zia_get_props(spa)->decompress = + fnvpair_value_uint64(elem); + zia_prop_warn(zia_get_props(spa)->decompress, + "Decompression"); + break; + case ZPOOL_PROP_ZIA_CHECKSUM: + zia_get_props(spa)->checksum = + fnvpair_value_uint64(elem); + zia_prop_warn(zia_get_props(spa)->checksum, + "Checksum"); + break; + case ZPOOL_PROP_ZIA_RAIDZ1_GEN: + zia_get_props(spa)->raidz.gen[1] = + fnvpair_value_uint64(elem); + zia_prop_warn(zia_get_props(spa)->raidz.gen[1], + "RAIDZ 1 Generation"); + break; + case ZPOOL_PROP_ZIA_RAIDZ2_GEN: + zia_get_props(spa)->raidz.gen[2] = + fnvpair_value_uint64(elem); + zia_prop_warn(zia_get_props(spa)->raidz.gen[2], + "RAIDZ 2 Generation"); + break; + case ZPOOL_PROP_ZIA_RAIDZ3_GEN: + zia_get_props(spa)->raidz.gen[3] = + fnvpair_value_uint64(elem); + zia_prop_warn(zia_get_props(spa)->raidz.gen[3], + "RAIDZ 3 Generation"); + break; + case ZPOOL_PROP_ZIA_RAIDZ1_REC: + zia_get_props(spa)->raidz.rec[1] = + fnvpair_value_uint64(elem); + /* need checksum */ + if (zia_get_props(spa)->raidz.rec[1]) { + if (!zia_get_props(spa)->checksum) { + zia_get_props(spa)->checksum = 1; + zia_prop_warn( + zia_get_props(spa)->checksum, + "Checksum"); + } + } + zia_prop_warn(zia_get_props(spa)->raidz.rec[1], + "RAIDZ 1 Reconstruction"); + break; + case ZPOOL_PROP_ZIA_RAIDZ2_REC: + zia_get_props(spa)->raidz.rec[2] = + fnvpair_value_uint64(elem); + /* need checksum */ + if (zia_get_props(spa)->raidz.rec[2]) { + if (!zia_get_props(spa)->checksum) { + zia_get_props(spa)->checksum = 1; + zia_prop_warn( + zia_get_props(spa)->checksum, + "Checksum"); + } + } + zia_prop_warn(zia_get_props(spa)->raidz.rec[2], + "RAIDZ 2 Reconstruction"); + break; + case ZPOOL_PROP_ZIA_RAIDZ3_REC: + zia_get_props(spa)->raidz.rec[3] = + fnvpair_value_uint64(elem); + /* need checksum */ + if (zia_get_props(spa)->raidz.rec[3]) { + if (!zia_get_props(spa)->checksum) { + zia_get_props(spa)->checksum = 1; + zia_prop_warn( + zia_get_props(spa)->checksum, + "Checksum"); + } + } + zia_prop_warn(zia_get_props(spa)->raidz.rec[3], + "RAIDZ 3 Reconstruction"); + break; + case ZPOOL_PROP_ZIA_FILE_WRITE: + zia_get_props(spa)->file_write = + fnvpair_value_uint64(elem); + zia_prop_warn(zia_get_props(spa)->file_write, + "File Write"); + break; + case ZPOOL_PROP_ZIA_DISK_WRITE: + zia_get_props(spa)->disk_write = + fnvpair_value_uint64(elem); + zia_prop_warn(zia_get_props(spa)->disk_write, + "Disk Write"); + break; +#endif default: /* diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index de29e6fd4c7c..9baa62b96f0e 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -673,6 +673,10 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) vdev_queue_init(vd); vdev_cache_init(vd); +#ifdef ZIA + vd->vdev_zia_handle = NULL; +#endif + return (vd); } @@ -1000,6 +1004,10 @@ 
vdev_free(vdev_t *vd) */ vdev_close(vd); +#ifdef ZIA + ASSERT3P(vd->vdev_zia_handle, ==, NULL); +#endif + ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c index fa8daf57b2eb..02296b667fd1 100644 --- a/module/zfs/vdev_draid.c +++ b/module/zfs/vdev_draid.c @@ -1035,6 +1035,9 @@ vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset, #ifdef ZFS_DEBUG rr->rr_offset = io_offset; rr->rr_size = io_size; +#endif +#ifdef ZIA + rr->rr_zia_handle = NULL; #endif *rrp = rr; diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 3633937f462b..ff9e31e85bce 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -41,6 +41,10 @@ #include /* For vdev_xlate() in vdev_raidz_io_verify() */ #endif +#ifdef ZIA +#include +#endif + /* * Virtual device vector for RAID-Z. * @@ -138,6 +142,10 @@ static void vdev_raidz_row_free(raidz_row_t *rr) { +#ifdef ZIA + zia_raidz_free(rr, B_FALSE); +#endif + for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; @@ -359,6 +367,9 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, rr->rr_offset = zio->io_offset; rr->rr_size = zio->io_size; #endif +#ifdef ZIA + rr->rr_zia_handle = NULL; +#endif asize = 0; @@ -503,7 +514,11 @@ vdev_raidz_pqr_func(void *buf, size_t size, void *private) return (0); } +#ifndef ZIA static void +#else +void +#endif vdev_raidz_generate_parity_p(raidz_row_t *rr) { uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); @@ -521,7 +536,15 @@ vdev_raidz_generate_parity_p(raidz_row_t *rr) } } +#ifdef ZIA +EXPORT_SYMBOL(vdev_raidz_generate_parity_p); +#endif + +#ifndef ZIA static void +#else +void +#endif vdev_raidz_generate_parity_pq(raidz_row_t *rr) { uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); @@ -563,7 +586,15 @@ vdev_raidz_generate_parity_pq(raidz_row_t *rr) } } +#ifdef ZIA +EXPORT_SYMBOL(vdev_raidz_generate_parity_pq); +#endif + +#ifndef ZIA static void +#else +void +#endif vdev_raidz_generate_parity_pqr(raidz_row_t *rr) { uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); @@ -611,6 +642,10 @@ vdev_raidz_generate_parity_pqr(raidz_row_t *rr) } } +#ifdef ZIA +EXPORT_SYMBOL(vdev_raidz_generate_parity_pqr); +#endif + /* * Generate RAID parity in the first virtual columns according to the number of * parity columns available. 
@@ -1280,7 +1315,11 @@ vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing, kmem_free(p, psize); } +#ifndef ZIA static void +#else +void +#endif vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts) { int n, i, c, t, tt; @@ -1417,6 +1456,10 @@ vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts) } } +#ifdef ZIA +EXPORT_SYMBOL(vdev_raidz_reconstruct_general); +#endif + static void vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr, const int *t, int nt) @@ -1628,7 +1671,22 @@ vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr, uint64_t ashift) vdev_t *vd = zio->io_vd; raidz_map_t *rm = zio->io_vsd; +#ifdef ZIA + /* + * here instead of vdev_raidz_generate_parity_row + * to be able to use zio + */ + boolean_t local_offload = B_FALSE; + if ((zia_raidz_alloc(zio, rr, B_FALSE, 0, &local_offload) != ZIA_OK) || + (zia_raidz_gen(rr) != ZIA_OK)) { + zia_raidz_gen_cleanup(zio, rr, local_offload); +#endif vdev_raidz_generate_parity_row(rm, rr); +#ifdef ZIA + } else { + zio->io_flags |= ZIO_FLAG_DONT_AGGREGATE; + } +#endif for (int c = 0; c < rr->rr_scols; c++) { raidz_col_t *rc = &rr->rr_col[c]; @@ -1781,11 +1839,27 @@ raidz_checksum_verify(zio_t *zio) { zio_bad_cksum_t zbc = {{{0}}}; raidz_map_t *rm = zio->io_vsd; +#ifdef ZIA + const boolean_t entered_offloaded = zia_is_offloaded(zio->io_abd); +#endif int ret = zio_checksum_error(zio, &zbc); if (ret != 0 && zbc.zbc_injected != 0) rm->rm_ecksuminjected = 1; +#ifdef ZIA + if (zia_is_offloaded(zio->io_abd) != B_TRUE) { + /* columns need to be onloaded */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + + /* force onload, since data was modified */ + zia_raidz_rec_cleanup(zio, rr, B_TRUE, + entered_offloaded); + } + } +#endif + return (ret); } @@ -1818,7 +1892,18 @@ raidz_parity_verify(zio_t *zio, raidz_row_t *rr) orig[c] = rc->rc_abd; ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size); rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE); + +#ifdef ZIA + rc->rc_abd->abd_zia_handle = NULL; +#endif +} + +#ifdef ZIA + if (zia_raidz_new_parity(zio, rr, orig) != ZIA_OK) { + /* onload data and parity columns */ + zia_raidz_rec_cleanup(zio, rr, B_TRUE, B_FALSE); } +#endif /* * Verify any empty sectors are zero filled to ensure the parity @@ -1832,7 +1917,14 @@ raidz_parity_verify(zio_t *zio, raidz_row_t *rr) * isn't harmful but it does have the side effect of fixing stuff * we didn't realize was necessary (i.e. even if we return 0). 
*/ +#ifdef ZIA + if (zia_raidz_gen(rr) != ZIA_OK) { + zia_raidz_rec_cleanup(zio, rr, B_FALSE, B_TRUE); +#endif vdev_raidz_generate_parity_row(rm, rr); +#ifdef ZIA + } +#endif for (c = 0; c < rr->rr_firstdatacol; c++) { rc = &rr->rr_col[c]; @@ -1840,7 +1932,21 @@ raidz_parity_verify(zio_t *zio, raidz_row_t *rr) if (!rc->rc_tried || rc->rc_error != 0) continue; - if (abd_cmp(orig[c], rc->rc_abd) != 0) { + int cmp = 0; +#ifdef ZIA + if (zia_raidz_cmp(orig[c], rc->rc_abd, &cmp) != ZIA_OK) { + zia_raidz_rec_cleanup(zio, rr, B_FALSE, B_TRUE); + zia_onload_abd(orig[c], rc->rc_size, B_FALSE); +#endif + cmp = abd_cmp(orig[c], rc->rc_abd); +#ifdef ZIA + } +#endif + if (cmp != 0) { +#ifdef ZIA + zia_raidz_rec_cleanup(zio, rr, B_FALSE, B_TRUE); + zia_onload_abd(orig[c], rc->rc_size, B_FALSE); +#endif vdev_raidz_checksum_error(zio, rc, orig[c]); rc->rc_error = SET_ERROR(ECKSUM); ret++; @@ -2000,11 +2106,30 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) } if (dead > nparity) { /* reconstruction not possible */ +#ifdef ZIA + /* drop offloaded data */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + zia_raidz_rec_cleanup(zio, rr, B_TRUE, B_FALSE); + } +#endif raidz_restore_orig_data(rm); return (EINVAL); } - if (dead_data > 0) + if (dead_data > 0) { +#ifdef ZIA + /* + * here instead of vdev_raidz_reconstruct_row + * to be able to use zio + */ + if ((zia_raidz_rec(rr, my_tgts, t) != ZIA_OK)) { + zia_raidz_rec_cleanup(zio, rr, B_TRUE, B_FALSE); +#endif vdev_raidz_reconstruct_row(rm, rr, my_tgts, t); +#ifdef ZIA + } +#endif + } } /* Check for success */ @@ -2047,6 +2172,13 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) } /* Reconstruction failed - restore original data */ +#ifdef ZIA + /* drop offloaded data */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + zia_raidz_rec_cleanup(zio, rr, B_TRUE, B_FALSE); + } +#endif raidz_restore_orig_data(rm); return (ECKSUM); } @@ -2310,7 +2442,22 @@ vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm, ASSERT(rr->rr_firstdatacol >= n); +#ifdef ZIA + /* + * here instead of vdev_raidz_reconstruct_row + * to be able to use zio + */ + if ((zia_raidz_rec(rr, tgts, n) != ZIA_OK)) { + /* + * drop handles instead of onloading because + * the data hasn't changed yet + */ + zia_raidz_rec_cleanup(zio, rr, B_TRUE, B_FALSE); +#endif vdev_raidz_reconstruct_row(rm, rr, tgts, n); +#ifdef ZIA + } +#endif } } @@ -2394,6 +2541,23 @@ vdev_raidz_io_done(zio_t *zio) vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]); } } else { +#ifdef ZIA + /* offload once at beginning */ + blkptr_t *bp = zio->io_bp; + if (bp && !BP_IS_METADATA(bp)) { + uint_t checksum = (BP_IS_GANG(bp) ? + ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)); + zio_checksum_info_t *ci = &zio_checksum_table[checksum]; + if (!(ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED)) { + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + zia_raidz_alloc(zio, rr, + B_TRUE, checksum, NULL); + } + } + } +#endif + for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; vdev_raidz_io_done_reconstruct_known_missing(zio, diff --git a/module/zfs/zia.c b/module/zfs/zia.c new file mode 100644 index 000000000000..0ced3f4321b7 --- /dev/null +++ b/module/zfs/zia.c @@ -0,0 +1,1187 @@ +/* + * © 2021. Triad National Security, LLC. All rights reserved. + * + * This program was produced under U.S. 
Government contract + * 89233218CNA000001 for Los Alamos National Laboratory (LANL), which + * is operated by Triad National Security, LLC for the U.S. + * Department of Energy/National Nuclear Security Administration. All + * rights in the program are reserved by Triad National Security, LLC, + * and the U.S. Department of Energy/National Nuclear Security + * Administration. The Government is granted for itself and others + * acting on its behalf a nonexclusive, paid-up, irrevocable worldwide + * license in this material to reproduce, prepare derivative works, + * distribute copies to the public, perform publicly and display + * publicly, and to permit others to do so. + * + * ---- + * + * This program is open source under the BSD-3 License. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifdef ZIA + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* ************************************************************* */ +/* global offloader functions initialized with ZFS */ +static const dpusm_uf_t *dpusm = NULL; +/* ************************************************************* */ + +zia_props_t * +zia_get_props(spa_t *spa) +{ + return (spa?&spa->spa_zia_props:NULL); +} + +void +zia_prop_warn(boolean_t val, const char *name) +{ + if (val == B_TRUE) { +#ifdef _KERNEL + printk("Z.I.A. %s enabled. 
Encryption and " + "Dedup for this spa will be disabled.\n", + name); +#else + (void) name; +#endif + } +} + +static int +translate_rc(const int dpusm_rc) +{ + int zia_rc = ZIA_FALLBACK; + switch (dpusm_rc) { + case DPUSM_OK: + zia_rc = ZIA_OK; + break; + case DPUSM_ERROR: + case DPUSM_PROVIDER_EXISTS: + case DPUSM_PROVIDER_NOT_EXISTS: + zia_rc = ZIA_ERROR; + break; + case DPUSM_PROVIDER_MISMATCH: + zia_rc = ZIA_PROVIDER_MISMATCH; + break; + case DPUSM_NOT_IMPLEMENTED: + zia_rc = ZIA_FALLBACK; + break; + case DPUSM_BAD_RESULT: + zia_rc = ZIA_BAD_RESULT; + break; + default: + /* only translate recognized values */ + zia_rc = dpusm_rc; + break; + } + return (zia_rc); +} + +dpusm_compress_t +translate_compress(enum zio_compress c) +{ + dpusm_compress_t dpusm_c = 0; + switch (c) { + case ZIO_COMPRESS_GZIP_1: + dpusm_c = DPUSM_COMPRESS_GZIP_1; + break; + case ZIO_COMPRESS_GZIP_2: + dpusm_c = DPUSM_COMPRESS_GZIP_2; + break; + case ZIO_COMPRESS_GZIP_3: + dpusm_c = DPUSM_COMPRESS_GZIP_3; + break; + case ZIO_COMPRESS_GZIP_4: + dpusm_c = DPUSM_COMPRESS_GZIP_4; + break; + case ZIO_COMPRESS_GZIP_5: + dpusm_c = DPUSM_COMPRESS_GZIP_5; + break; + case ZIO_COMPRESS_GZIP_6: + dpusm_c = DPUSM_COMPRESS_GZIP_6; + break; + case ZIO_COMPRESS_GZIP_7: + dpusm_c = DPUSM_COMPRESS_GZIP_7; + break; + case ZIO_COMPRESS_GZIP_8: + dpusm_c = DPUSM_COMPRESS_GZIP_8; + break; + case ZIO_COMPRESS_GZIP_9: + dpusm_c = DPUSM_COMPRESS_GZIP_9; + break; + case ZIO_COMPRESS_INHERIT: + case ZIO_COMPRESS_ON: + case ZIO_COMPRESS_OFF: + case ZIO_COMPRESS_LZJB: + case ZIO_COMPRESS_EMPTY: + case ZIO_COMPRESS_ZLE: + case ZIO_COMPRESS_LZ4: + case ZIO_COMPRESS_ZSTD: + case ZIO_COMPRESS_FUNCTIONS: + default: + break; + } + + return (dpusm_c); +} + +dpusm_checksum_t +translate_checksum(enum zio_checksum c) +{ + dpusm_checksum_t dpusm_c = 0; + switch (c) { + case ZIO_CHECKSUM_FLETCHER_2: + dpusm_c = DPUSM_CHECKSUM_FLETCHER_2; + break; + case ZIO_CHECKSUM_FLETCHER_4: + dpusm_c = DPUSM_CHECKSUM_FLETCHER_4; + break; + case ZIO_CHECKSUM_INHERIT: + case ZIO_CHECKSUM_ON: + case ZIO_CHECKSUM_OFF: + case ZIO_CHECKSUM_LABEL: + case ZIO_CHECKSUM_GANG_HEADER: + case ZIO_CHECKSUM_ZILOG: + case ZIO_CHECKSUM_SHA256: + case ZIO_CHECKSUM_ZILOG2: + case ZIO_CHECKSUM_NOPARITY: + case ZIO_CHECKSUM_SHA512: + case ZIO_CHECKSUM_SKEIN: + default: + break; + } + + return (dpusm_c); +} + +dpusm_checksum_byteorder_t +translate_byteorder(zio_byteorder_t bo) +{ + dpusm_checksum_byteorder_t dpusm_bo = 0; + switch (bo) { + case ZIO_CHECKSUM_NATIVE: + dpusm_bo = DPUSM_BYTEORDER_NATIVE; + break; + case ZIO_CHECKSUM_BYTESWAP: + dpusm_bo = DPUSM_BYTEORDER_BYTESWAP; + break; + default: + break; + } + + return (dpusm_bo); +} + +int +zia_get_capabilities(void *provider, dpusm_pc_t **caps) +{ + if (!provider || !caps) { + return (ZIA_ERROR); + } + + /* dpusm is checked by the caller */ + /* provider and caps are checked by the dpusm */ + return (translate_rc(dpusm->capabilities(provider, caps))); +} + +int +zia_init(void) +{ + if (dpusm) { + return (ZIA_OK); + } + + if (dpusm_initialize) { + dpusm = dpusm_initialize(); + } + + if (!dpusm) { +#ifdef _KERNEL + printk("Warning: Z.I.A. not initialized\n"); +#endif + return (ZIA_ERROR); + } + +#ifdef _KERNEL + printk("Z.I.A. initialized (%p)\n", dpusm); +#endif + return (ZIA_OK); +} + +int +zia_fini(void) +{ + if (!dpusm) { +#ifdef _KERNEL + printk("Warning: Z.I.A. not initialized. " + "Not uninitializing.\n"); +#endif + return (ZIA_ERROR); + } + + if (dpusm_finalize) { + dpusm_finalize(); +#ifdef _KERNEL + printk("Z.I.A. 
finalized\n"); +#endif + } else { +#ifdef _KERNEL + if (dpusm) { + printk("Z.I.A. incomplete finalize\n"); + } +#endif + } + + dpusm = NULL; + return (ZIA_OK); +} + +void * +zia_get_provider(const char *name) +{ + if (!dpusm) { + return (NULL); + } + + return (dpusm->get(name)); +} + +const char * +zia_get_provider_name(void *provider) +{ + if (!dpusm || !provider) { + return (NULL); + } + + return (dpusm->get_name(provider)); +} + +int +zia_put_provider(void **provider) +{ + if (!dpusm || !provider || !*provider) { + return (ZIA_FALLBACK); + } + + const int rc = dpusm->put(*provider); + if (rc == DPUSM_OK) { + *provider = NULL; + } + + return (translate_rc(rc)); +} + +boolean_t +zia_is_used(zio_t *zio) +{ + if (!zio) { + return (B_FALSE); + } + + zia_props_t *props = zia_get_props(zio->io_spa); + + /* provider + at least 1 operation */ + if (props->provider && + (props->compress || + props->decompress || + props->checksum || + props->raidz.gen[1] || + props->raidz.gen[2] || + props->raidz.gen[3] || + props->raidz.rec[1] || + props->raidz.rec[2] || + props->raidz.rec[3] || + props->file_write || + props->disk_write)) { + return (B_TRUE); + } + + return (B_FALSE); +} + +boolean_t +zia_is_offloaded(abd_t *abd) +{ + if (!abd) { + return (B_FALSE); + } + + return (ABD_HANDLE(abd)?B_TRUE:B_FALSE); +} + +/* create a provider handle/offloader buffer without copying data */ +void * +zia_alloc(void *provider, size_t size, size_t min_offload_size) +{ + if (size < min_offload_size) { + return (NULL); + } + return ((dpusm && provider)?dpusm->alloc(provider, size):NULL); +} + +/* free the offloader handle without onloading the data */ +void +zia_free(void **handle) +{ + if (dpusm && handle) { + dpusm->free(*handle); + *handle = NULL; + } +} + +/* move data from the offloader and unregister the mapping */ +int +zia_onload(void **handle, void *buf, size_t size) +{ + if (!dpusm) { + return (ZIA_FALLBACK); + } + + if (!handle || !*handle || !buf) { + return (ZIA_ERROR); + } + + dpusm_mv_t mv = { .handle = *handle, .offset = 0 }; + const int rc = dpusm->copy_to_mem(&mv, buf, size); + + /* + * if success, no more need for handle + * if failure, can't do anything with + * handle in any case, so destroy it + */ + zia_free(handle); + + return (translate_rc(rc)); +} + +/* abd_iterate_func callback for moving data to the offloader */ +static int +zia_offload_cb(void *buf, size_t size, void *private) +{ + const int rc = dpusm->copy_from_mem(private, buf, size); + if (translate_rc(rc) != ZIA_OK) { + return (ZIA_ERROR); + } + + dpusm_mv_t *mv = (dpusm_mv_t *)private; + mv->offset += size; + return (0); +} + +/* abd_iterate_func callback for moving data from the offloader */ +static int +zia_onload_cb(void *buf, size_t size, void *private) +{ + const int rc = dpusm->copy_to_mem(private, buf, size); + if (translate_rc(rc) != ZIA_OK) { + return (ZIA_ERROR); + } + + dpusm_mv_t *mv = (dpusm_mv_t *)private; + mv->offset += size; + return (0); +} + +/* create a new handle and copy data into it */ +static int +zia_offload_abd_offset(void *provider, abd_t *abd, + size_t offset, size_t size, + size_t min_offload_size, boolean_t *local_offload) +{ + /* already offloaded */ + if (ABD_HANDLE(abd)) { + void *abd_provider = dpusm->extract(ABD_HANDLE(abd)); + if (local_offload) { + *local_offload = B_FALSE; + } + + /* see zia_checksum_error */ + if (!provider) { + return (ZIA_OK); + } + + return ((provider == abd_provider)? 
+ ZIA_OK:ZIA_PROVIDER_MISMATCH); + } + + /* provider is checked by dpusm */ + void *handle = zia_alloc(provider, size, min_offload_size); + if (!handle) { + return (ZIA_ERROR); + } + + /* offload */ + int rc = ZIA_ERROR; + dpusm_mv_t mv = { .handle = handle, .offset = offset }; + if (abd_iterate_func(abd, 0, size, zia_offload_cb, &mv) == 0) { + rc = ZIA_OK; + } + + if (rc == ZIA_OK) { + ABD_HANDLE(abd) = handle; + if (local_offload) { + *local_offload = B_TRUE; + } + } else { + zia_free(&handle); + } + + return (rc); +} + +int +zia_offload_abd(void *provider, abd_t *abd, + size_t size, size_t min_offload_size, boolean_t *local_offload) +{ + if (!dpusm) { + return (ZIA_FALLBACK); + } + + /* no gangs or scatterlists */ + if (!abd || !(abd_is_linear(abd) || abd_is_linear_page(abd))) { + return (ZIA_ERROR); + } + + return (zia_offload_abd_offset(provider, + abd, 0, size, min_offload_size, local_offload)); +} + +/* copy offloaded buffer + offset back into abd + 0 */ +static int +zia_onload_abd_offset(abd_t *abd, size_t offset, + size_t size, boolean_t keep_handle) +{ + if (!dpusm) { + return (ZIA_FALLBACK); + } + + if (!abd) { + return (ZIA_ERROR); + } + + mutex_enter(&abd->abd_mtx); + if (!ABD_HANDLE(abd)) { + mutex_exit(&abd->abd_mtx); + return (ZIA_ERROR); + } + + int rc = ZIA_ERROR; + dpusm_mv_t mv = { .handle = ABD_HANDLE(abd), .offset = offset }; + if (abd_iterate_func(abd, 0, size, zia_onload_cb, &mv) == 0) { + rc = ZIA_OK; + } + + if (keep_handle != B_TRUE) { + zia_free_abd(abd, B_FALSE); + } + mutex_exit(&abd->abd_mtx); + + return (rc); +} + +int +zia_onload_abd(abd_t *abd, size_t size, boolean_t keep_handle) +{ + if (abd_is_gang(abd)) { + /* + * the only gangs that show up are from raidz + * + * get leading data size, stopping at first zero page + * which should always be the second child + */ + const size_t original_size = size; + size = 0; + for (abd_t *child = list_head(&ABD_GANG(abd).abd_gang_chain); + child != NULL; + child = list_next(&ABD_GANG(abd).abd_gang_chain, child)) { + if (child->abd_flags & ABD_FLAG_ZEROS) { + break; + } + + size += child->abd_size; + } + + ASSERT(size <= original_size); + } + + return (zia_onload_abd_offset(abd, 0, size, keep_handle)); +} + +void +zia_move_into_abd(abd_t *dst, void **src_handle) +{ + ABD_HANDLE(dst) = *src_handle; + *src_handle = NULL; +} + +int +zia_free_abd(abd_t *abd, boolean_t lock) +{ + if (lock == B_TRUE) { + mutex_enter(&abd->abd_mtx); + } + + zia_free(&ABD_HANDLE(abd)); + + if (lock == B_TRUE) { + mutex_exit(&abd->abd_mtx); + } + return (ZIA_OK); +} + +/* + * if offloaded locally, just free the handle + * if not, onload the data and free the handle + */ +int +zia_cleanup_abd(abd_t *abd, size_t size, boolean_t local_offload) +{ + if (!dpusm) { + return (ZIA_FALLBACK); + } + + if (!abd) { + return (ZIA_ERROR); + } + + int rc = ZIA_OK; + if (local_offload == B_TRUE) { + /* in-memory copy is still valid */ + /* lock just in case mirrors clean up at the same time */ + zia_free_abd(abd, B_TRUE); + } else { + /* have to copy data into memory */ + rc = zia_onload_abd(abd, size, B_FALSE); + } + + return (rc); +} + +int +zia_zero_fill(abd_t *abd, size_t offset, size_t size) +{ + if (!dpusm) { + return (ZIA_FALLBACK); + } + + if (!abd || !ABD_HANDLE(abd)) { + return (ZIA_ERROR); + } + + return (translate_rc(dpusm->zero_fill(ABD_HANDLE(abd), offset, size))); +} + +int +zia_compress(void *provider, zio_t *zio, size_t s_len, + enum zio_compress c, uint8_t level, void **cbuf_handle, + uint64_t *c_len, boolean_t *local_offload) +{ + if (!dpusm 
|| !dpusm->compress || !provider) { + return (ZIA_FALLBACK); + } + + return (zia_compress_impl(dpusm, provider, zio, s_len, + c, level, cbuf_handle, c_len, local_offload)); +} + +int +zia_checksum_compute(void *provider, zio_cksum_t *dst, enum zio_checksum alg, + zio_t *zio, uint64_t size, boolean_t *local_offload) +{ + if (!dpusm || !dpusm->checksum || !provider) { + return (ZIA_FALLBACK); + } + + const dpusm_checksum_byteorder_t byteorder = + translate_byteorder(ZIO_CHECKSUM_NATIVE); + + if (!ABD_HANDLE(zio->io_abd)) { + dpusm_pc_t *caps = NULL; + if ((zia_get_capabilities(provider, &caps) != ZIA_OK) || + !(caps->checksum & translate_checksum(alg)) || + !(caps->checksum_byteorder & byteorder)) { + return (ZIA_FALLBACK); + } + + if (zia_offload_abd(provider, zio->io_abd, size, + zia_get_props(zio->io_spa)->min_offload_size, + local_offload) != ZIA_OK) { + return (ZIA_ERROR); + } + } else { + void *old_provider = dpusm->extract(ABD_HANDLE(zio->io_abd)); + if (old_provider != provider) { + return (ZIA_PROVIDER_MISMATCH); + } + + /* skip checks because dpusm will do them */ + } + + return (translate_rc(dpusm->checksum(translate_checksum(alg), + byteorder, ABD_HANDLE(zio->io_abd), size, dst->zc_word, + sizeof (dst->zc_word)))); +} + +int +zia_checksum_error(const blkptr_t *bp, enum zio_checksum alg, + abd_t *abd, uint64_t size, zio_bad_cksum_t *info) +{ + return (zia_checksum_error_impl(dpusm, bp, alg, abd, size, info)); +} + +static boolean_t +zia_can_raidz(zio_t *zio, raidz_row_t *rr, + boolean_t rec, uint_t cksum, size_t *col_sizes) +{ + const int raidn = rr->rr_firstdatacol; + if ((1 > raidn) || (raidn > 3)) { + return (B_FALSE); + } + + /* need at least raidn + 2 data columns */ + if (rr->rr_firstdatacol + 2 > rr->rr_cols) { + return (B_FALSE); + } + + const zia_props_t *props = zia_get_props(zio->io_spa); + if (!props->provider) { + return (B_FALSE); + } + + /* + * generation is needed for both + * generation and reconstruction + */ + int good = ( + /* raidz generation is turned on */ + (props->raidz.gen[raidn] == 1) && + + /* + * the provider knows whether or not + * raidz functions are available + */ + (dpusm->raid.can_compute(props->provider, raidn, + rr->rr_cols - rr->rr_firstdatacol, + col_sizes, rec == B_TRUE) == DPUSM_OK)); + + if (good && (rec == B_TRUE)) { + dpusm_pc_t *caps = NULL; + if (zia_get_capabilities(props->provider, &caps) != ZIA_OK) { + return (B_FALSE); + } + + good &= ( + /* raidz reconstruction is turned on */ + (props->raidz.rec[raidn] == 1) && + + /* need checksum */ + (props->checksum == 1) && + + /* raidz reconstruction support was checked earlier */ + + /* make sure the checksum is supported by the provider */ + (caps->checksum & translate_checksum(cksum))); + } + + return (good?B_TRUE:B_FALSE); +} + +int +zia_raidz_alloc(zio_t *zio, raidz_row_t *rr, boolean_t rec, + uint_t cksum, boolean_t *local_offload) +{ + if (!dpusm || !zio || !rr) { + return (ZIA_ERROR); + } + + /* + * existence of row handle implies existence + * of data and column handles + */ + if (rr->rr_zia_handle) { + return (ZIA_OK); + } + + /* get column sizes */ + const size_t column_sizes_size = sizeof (size_t) * rr->rr_cols; + size_t *column_sizes = kmem_alloc(column_sizes_size, KM_SLEEP); + for (size_t c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + column_sizes[c] = rc->rc_size; + } + + if (zia_can_raidz(zio, rr, rec, cksum, column_sizes) != B_TRUE) { + kmem_free(column_sizes, column_sizes_size); + return (ZIA_FALLBACK); + } + + zia_props_t *props = 
zia_get_props(zio->io_spa); + void *provider = props->provider; + if (!provider) { + return (ZIA_FALLBACK); + } + + /* + * offload the source data if it hasn't already been offloaded + * + * need to lock here since offloading normally doesn't lock, but + * abds hitting raidz might have been mirrored + */ + mutex_enter(&zio->io_abd->abd_mtx); + if (zia_offload_abd(provider, zio->io_abd, zio->io_size, + props->min_offload_size, local_offload) != ZIA_OK) { + mutex_exit(&zio->io_abd->abd_mtx); + kmem_free(column_sizes, column_sizes_size); + return (ZIA_ERROR); + } + mutex_exit(&zio->io_abd->abd_mtx); + + /* mirrored abds generate their own references to the columns */ + + const size_t column_handles_size = sizeof (void *) * rr->rr_cols; + void **column_handles = kmem_alloc(column_handles_size, KM_SLEEP); + + /* create parity column handles */ + for (uint64_t c = 0; c < rr->rr_firstdatacol; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + + ASSERT(!ABD_HANDLE(rc->rc_abd)); + + /* allocate rc->rc_abd->abd_size, mark as rc->rc_size */ + if (rec == B_TRUE) { + /* reconstructing, so copy data to provider */ + zia_offload_abd_offset(provider, rc->rc_abd, 0, + rc->rc_abd->abd_size, props->min_offload_size, + NULL); + } else { + /* generating, so create new columns */ + ABD_HANDLE(rc->rc_abd) = + dpusm->alloc(provider, rc->rc_abd->abd_size); + } + + if (!ABD_HANDLE(rc->rc_abd)) { + /* data columns are all references */ + for (uint64_t i = rr->rr_firstdatacol; i < c; i++) { + raidz_col_t *rc = &rr->rr_col[i]; + zia_free_abd(rc->rc_abd, B_FALSE); + } + + kmem_free(column_handles, column_handles_size); + kmem_free(column_sizes, column_sizes_size); + return (ZIA_ERROR); + } + + column_handles[c] = ABD_HANDLE(rc->rc_abd); + } + + /* + * recalculate data column offsets and + * create references for each column + */ + size_t offset = 0; + for (size_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + + /* + * if the column is a gang abd, the handle + * will point to the first child + */ + void *column_handle = dpusm->alloc_ref(ABD_HANDLE(zio->io_abd), + offset, rc->rc_size); + + ABD_HANDLE(rc->rc_abd) = column_handle; + column_handles[c] = column_handle; + + offset += rc->rc_size; + } + + /* get raid context */ + rr->rr_zia_handle = dpusm->raid.alloc(rr->rr_firstdatacol, + rr->rr_cols - rr->rr_firstdatacol, ABD_HANDLE(zio->io_abd), + column_handles, column_sizes); + + kmem_free(column_handles, column_handles_size); + kmem_free(column_sizes, column_sizes_size); + + if (!rr->rr_zia_handle) { + zia_raidz_free(rr, B_FALSE); + return (ZIA_ERROR); + } + + return (ZIA_OK); +} + +/* + * only frees the raidz data + * onload the data separately if it is needed + */ +int +zia_raidz_free(raidz_row_t *rr, boolean_t onload_parity) +{ + if (!dpusm) { + return (ZIA_FALLBACK); + } + + if (!rr) { + return (ZIA_ERROR); + } + + dpusm->raid.free(rr->rr_zia_handle); + rr->rr_zia_handle = NULL; + + uint64_t c = 0; + + if (onload_parity == B_TRUE) { + for (; c < rr->rr_firstdatacol; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + zia_onload_abd(rc->rc_abd, + rc->rc_abd->abd_size, B_FALSE); + } + } + + for (; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + zia_free_abd(rc->rc_abd, B_FALSE); + } + + return (ZIA_OK); +} + +int +zia_raidz_gen(raidz_row_t *rr) +{ + if (!dpusm) { + return (ZIA_FALLBACK); + } + + /* can only pass if raidz_alloc succeeded */ + if (!rr->rr_zia_handle) { + return (ZIA_ERROR); + } + + return (translate_rc(dpusm->raid.gen(rr->rr_zia_handle))); +} + +/* onload abd and 
delete raidz_row_t stuff */ +static int +zia_raidz_cleanup(zio_t *zio, raidz_row_t *rr, + boolean_t local_offload, + boolean_t onload_parity) +{ + /* + * bring data back to zio->io_abd, which should + * place data into parent automatically + */ + zia_cleanup_abd(zio->io_abd, zio->io_size, local_offload); + + return (zia_raidz_free(rr, onload_parity)); +} + +int +zia_raidz_gen_cleanup(zio_t *zio, raidz_row_t *rr, + boolean_t local_offload) +{ + /* + * RAIDZ generation only calls cleanup + * on failure, so parity does not need + * to be brought back. + */ + return (zia_raidz_cleanup(zio, rr, + local_offload, B_FALSE)); +} + +/* + * allocate new parity columns for this row + * and assign them to the raidz struct + * + * orig takes ownership of the original handles + */ +int +zia_raidz_new_parity(zio_t *zio, raidz_row_t *rr, abd_t **orig) +{ + if (!zio || !rr || !orig) { + return (ZIA_ERROR); + } + + if (!ABD_HANDLE(zio->io_abd) || !rr->rr_zia_handle) { + return (ZIA_FALLBACK); + } + + void **new_parity_cols[VDEV_RAIDZ_MAXPARITY]; + size_t new_parity_sizes[VDEV_RAIDZ_MAXPARITY]; + int c = 0; + + for (c = 0; c < rr->rr_firstdatacol; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + + new_parity_cols[c] = NULL; + new_parity_sizes[c] = 0; + + /* this parity column was not reconstructed */ + if (!rc->rc_tried || rc->rc_error != 0) + continue; + + /* the provider updates the handle */ + new_parity_cols[c] = &ABD_HANDLE(rc->rc_abd); + new_parity_sizes[c] = rc->rc_size; + } + + if (c != rr->rr_firstdatacol) { + return (ZIA_FALLBACK); + } + + /* + * allocate space for parity columns and + * assign them to the raidz struct + */ + return (translate_rc(dpusm->raid.new_parity(rr->rr_zia_handle, + rr->rr_firstdatacol, new_parity_cols, new_parity_sizes))); +} + +int +zia_raidz_cmp(abd_t *lhs, abd_t *rhs, int *diff) +{ + if (!lhs || !rhs || !diff) { + return (ZIA_ERROR); + } + + if (lhs == rhs) { + *diff = 0; + return (ZIA_OK); + } + + void *lhs_handle = ABD_HANDLE(lhs); + void *rhs_handle = ABD_HANDLE(rhs); + if (!lhs_handle || !rhs_handle) { + return (ZIA_ERROR); + } + + return (translate_rc(dpusm->raid.cmp(lhs_handle, rhs_handle, diff))); +} + +int +zia_raidz_rec(raidz_row_t *rr, int *t, int nt) +{ + if (!dpusm) { + return (ZIA_FALLBACK); + } + + /* can only pass if raidz_alloc succeeded */ + if (!rr->rr_zia_handle) { + return (ZIA_FALLBACK); + } + + return (translate_rc(zia_raidz_rec_impl(dpusm, rr, t, nt))); +} + +int +zia_raidz_rec_cleanup(zio_t *zio, raidz_row_t *rr, + boolean_t local_offload, boolean_t onload_parity) +{ + return (zia_raidz_cleanup(zio, rr, + local_offload, onload_parity)); +} + +int +zia_file_open(vdev_t *vdev, const char *path, + int flags, int mode) +{ + if (!vdev || !vdev->vdev_spa) { + return (ZIA_ERROR); + } + + void *provider = zia_get_props(vdev->vdev_spa)->provider; + if (!dpusm || !provider) { + return (ZIA_FALLBACK); + } + + if (!VDEV_HANDLE(vdev)) { + VDEV_HANDLE(vdev) = dpusm->file.open(provider, + path, flags, mode); + } + + return (VDEV_HANDLE(vdev)?ZIA_OK:ZIA_ERROR); +} + +int +zia_file_write(vdev_t *vdev, abd_t *abd, ssize_t size, + loff_t offset, ssize_t *resid, int *err) +{ + if (!vdev || !abd) { + return (ZIA_ERROR); + } + + if (!dpusm || !VDEV_HANDLE(vdev) || !ABD_HANDLE(abd)) { + return (ZIA_FALLBACK); + } + + size_t trailing_zeros = 0; + size_t data_size = size; + + /* can only happen with raidz */ + if (abd_is_gang(abd)) { + abd_t *cabd = list_head(&ABD_GANG(abd).abd_gang_chain); + trailing_zeros = size - cabd->abd_size; + data_size = cabd->abd_size; + } + + return 
(dpusm->file.write(VDEV_HANDLE(vdev), + ABD_HANDLE(abd), data_size, trailing_zeros, offset, resid, err)); +} + +int +zia_file_close(vdev_t *vdev) +{ + if (!vdev) { + return (ZIA_ERROR); + } + + if (!dpusm || !VDEV_HANDLE(vdev)) { + return (ZIA_FALLBACK); + } + + dpusm->file.close(VDEV_HANDLE(vdev)); + VDEV_HANDLE(vdev) = NULL; + zia_get_props(vdev->vdev_spa)->min_offload_size = 0; + + return (ZIA_OK); +} + +#ifdef _KERNEL +int +zia_disk_open(vdev_t *vdev, const char *path, + struct block_device *bdev) +{ + if (!vdev || !vdev->vdev_spa) { + return (ZIA_ERROR); + } + + void *provider = zia_get_props(vdev->vdev_spa)->provider; + if (!dpusm || !provider) { + return (ZIA_FALLBACK); + } + + if (!VDEV_HANDLE(vdev)) { + VDEV_HANDLE(vdev) = dpusm->disk.open(provider, + path, bdev); + } + + return (VDEV_HANDLE(vdev)?ZIA_OK:ZIA_ERROR); +} + +int +zia_disk_invalidate(vdev_t *vdev) +{ + if (!vdev) { + return (ZIA_ERROR); + } + + if (!dpusm || !VDEV_HANDLE(vdev)) { + return (ZIA_FALLBACK); + } + + return (translate_rc(dpusm->disk.invalidate(VDEV_HANDLE(vdev)))); +} + +int +zia_disk_write(vdev_t *vdev, zio_t *zio, size_t io_size, + uint64_t io_offset, int flags) +{ + if (!vdev || !zio->io_abd) { + return (ZIA_ERROR); + } + + if (!dpusm || !ABD_HANDLE(zio->io_abd) || !VDEV_HANDLE(vdev)) { + return (ZIA_FALLBACK); + } + + size_t trailing_zeros = 0; + size_t data_size = io_size; + + /* can only happen with raidz */ + if (abd_is_gang(zio->io_abd)) { + abd_t *cabd = list_head(&ABD_GANG(zio->io_abd).abd_gang_chain); + trailing_zeros = io_size - cabd->abd_size; + data_size = cabd->abd_size; + } + + return (dpusm->disk.write(VDEV_HANDLE(vdev), ABD_HANDLE(zio->io_abd), + data_size, trailing_zeros, io_offset, + flags, zia_disk_write_completion, zio)); +} + +int +zia_disk_flush(vdev_t *vdev, zio_t *zio) +{ + if (!vdev) { + return (ZIA_ERROR); + } + + if (!dpusm || !VDEV_HANDLE(vdev)) { + return (ZIA_FALLBACK); + } + + return (dpusm->disk.flush(VDEV_HANDLE(vdev), + zia_disk_flush_completion, zio)); +} + +int +zia_disk_close(vdev_t *vdev) +{ + if (!vdev) { + return (ZIA_ERROR); + } + + void *handle = VDEV_HANDLE(vdev); + VDEV_HANDLE(vdev) = NULL; + + zia_get_props(vdev->vdev_spa)->min_offload_size = 0; + + if (!dpusm || !handle) { + return (ZIA_FALLBACK); + } + + /* trust that ZFS handles closing disks once */ + dpusm->disk.close(handle); + + return (ZIA_OK); +} +#endif + +#endif diff --git a/module/zfs/zia_cddl.c b/module/zfs/zia_cddl.c new file mode 100644 index 000000000000..18caca510a7b --- /dev/null +++ b/module/zfs/zia_cddl.c @@ -0,0 +1,232 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifdef ZIA + +#include +#include +#include +#include +#include +#include +#include + +/* basically a duplicate of zio_compress_data */ +int +zia_compress_impl(const dpusm_uf_t *dpusm, void *provider, + zio_t *zio, size_t s_len, enum zio_compress c, uint8_t level, + void **cbuf_handle, uint64_t *c_len, boolean_t *local_offload) +{ + size_t d_len; + uint8_t complevel; + zio_compress_info_t *ci = &zio_compress_table[c]; + abd_t *src = zio->io_abd; + zia_props_t *props = zia_get_props(zio->io_spa); + + ASSERT((uint_t)c < ZIO_COMPRESS_FUNCTIONS); + ASSERT((uint_t)c == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL); + + /* + * If the data is all zeros, we don't even need to allocate + * a block for it. We indicate this by returning zero size. + */ + if (!ABD_HANDLE(src)) { + /* check that compression can be done before offloading */ + dpusm_pc_t *caps = NULL; + if ((zia_get_capabilities(provider, &caps) != ZIA_OK) || + !(caps->compress & translate_compress(c))) { + return (ZIA_FALLBACK); + } + + /* check in-memory buffer */ + if (abd_iterate_func(src, 0, s_len, + zio_compress_zeroed_cb, NULL) == 0) { + *c_len = 0; + return (ZIA_OK); + } + + if (zia_offload_abd(provider, src, s_len, + props->min_offload_size, local_offload) != ZIA_OK) { + return (ZIA_ERROR); + } + } else { + /* came in offloaded - make sure provider can compress */ + *local_offload = B_FALSE; + + void *old_provider = dpusm->extract(ABD_HANDLE(src)); + if (old_provider != provider) { + return (ZIA_PROVIDER_MISMATCH); + } + + dpusm_pc_t *caps = NULL; + if ((zia_get_capabilities(provider, &caps) != ZIA_OK) || + !(caps->compress & translate_compress(c))) { + return (ZIA_FALLBACK); + } + + /* use provider to check */ + if (dpusm->all_zeros(ABD_HANDLE(src), 0, s_len) == DPUSM_OK) { + *c_len = 0; + return (ZIA_OK); + } + } + + if (c == ZIO_COMPRESS_EMPTY) { + *c_len = s_len; + return (ZIA_OK); + } + + /* Compress at least 12.5% */ + d_len = s_len - (s_len >> 3); + + complevel = ci->ci_level; + + if (c == ZIO_COMPRESS_ZSTD) { + /* If we don't know the level, we can't compress it */ + if (level == ZIO_COMPLEVEL_INHERIT) { + *c_len = s_len; + return (ZIA_OK); + } + + if (level == ZIO_COMPLEVEL_DEFAULT) + complevel = ZIO_ZSTD_LEVEL_DEFAULT; + else + complevel = level; + + ASSERT3U(complevel, !=, ZIO_COMPLEVEL_INHERIT); + } + + /* nothing to offload, so just allocate space */ + *cbuf_handle = zia_alloc(provider, s_len, props->min_offload_size); + if (!*cbuf_handle) { + zia_cleanup_abd(src, s_len, + local_offload?*local_offload:B_FALSE); + return (ZIA_ERROR); + } + + /* DPUSM interface takes in a size_t, not a uint64_t */ + size_t zia_c_len = 0; + if (dpusm->compress(translate_compress(c), + ABD_HANDLE(src), *cbuf_handle, s_len, + (int8_t)level, &zia_c_len) != DPUSM_OK) { + zia_free(cbuf_handle); + return (ZIA_FALLBACK); + } + + *c_len = zia_c_len; + + /* + * Return ZIA_OK because this is not an error - it just didn't + * compress well. The data will be dropped later on (instead of + * onloaded) because c_len is too big. 
+ */ + if (*c_len > d_len) { + *c_len = s_len; + } + + return (ZIA_OK); +} + +int +zia_checksum_error_impl(const dpusm_uf_t *dpusm, + const blkptr_t *bp, enum zio_checksum alg, + abd_t *abd, uint64_t size, zio_bad_cksum_t *info) +{ + zio_cksum_t actual_cksum; + zio_cksum_t expected_cksum = bp->blk_cksum; + int byteswap = BP_SHOULD_BYTESWAP(bp); + + if (dpusm->checksum(translate_checksum(alg), + translate_byteorder(ZIO_CHECKSUM_NATIVE), + ABD_HANDLE(abd), size, + actual_cksum.zc_word, + sizeof (actual_cksum.zc_word)) != DPUSM_OK) { + return (ZIA_ERROR); + } + + if (info != NULL) { + zio_checksum_info_t *ci = &zio_checksum_table[alg]; + info->zbc_expected = expected_cksum; + info->zbc_actual = actual_cksum; + info->zbc_checksum_name = ci->ci_name; + info->zbc_byteswapped = byteswap; + info->zbc_injected = 0; + info->zbc_has_cksum = 1; + } + + if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) + return (SET_ERROR(ECKSUM)); + + return (ZIA_OK); +} + +int +zia_raidz_rec_impl(const dpusm_uf_t *dpusm, + raidz_row_t *rr, int *t, int nt) +{ + int tgts[VDEV_RAIDZ_MAXPARITY]; + int ntgts = 0; + for (int i = 0, c = 0; c < rr->rr_cols; c++) { + if (i < nt && c == t[i]) { + tgts[ntgts++] = c; + i++; + } else if (rr->rr_col[c].rc_error != 0) { + tgts[ntgts++] = c; + } + } + + ASSERT(ntgts >= nt); + + return (dpusm->raid.rec(rr->rr_zia_handle, + tgts, ntgts)); +} + +#ifdef _KERNEL +/* called by provider */ +void +zia_disk_write_completion(void *zio_ptr, int error) +{ + zio_t *zio = (zio_t *)zio_ptr; + zio->io_error = error; + ASSERT3S(zio->io_error, >=, 0); + if (zio->io_error) + vdev_disk_error(zio); + + zio_delay_interrupt(zio); +} + +/* called by provider */ +void +zia_disk_flush_completion(void *zio_ptr, int error) +{ + zio_t *zio = (zio_t *)zio_ptr; + + if (zio->io_error && (zio->io_error == EOPNOTSUPP)) + zio->io_vd->vdev_nowritecache = B_TRUE; + + ASSERT3S(zio->io_error, >=, 0); + if (zio->io_error) + vdev_disk_error(zio); + zio_interrupt(zio); +} +#endif + +#endif diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 1c9f598b7d13..b72f9e78f5ef 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -52,6 +52,10 @@ #include #include +#ifdef ZIA +#include +#endif + /* * ========================================================================== * I/O type descriptions @@ -877,6 +881,15 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_add_child(pio, zio); } +#ifdef ZIA + /* turn off encryption and dedup if Z.I.A. 
is used */ + if (zia_is_used(zio) == B_TRUE) { + zio->io_prop.zp_dedup = B_FALSE; + zio->io_prop.zp_dedup_verify = B_FALSE; + zio->io_prop.zp_encrypt = B_FALSE; + } +#endif + taskq_init_ent(&zio->io_tqent); return (zio); @@ -1695,15 +1708,56 @@ zio_write_compress(zio_t *zio) if (compress != ZIO_COMPRESS_OFF && !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) { void *cbuf = zio_buf_alloc(lsize); +#ifdef ZIA + int zia_rc = ZIA_FALLBACK; + void *cbuf_handle = NULL; /* only valid if zia_rc == ZIA_OK */ + zia_props_t *zia_props = zia_get_props(spa); + boolean_t local_offload = B_FALSE; + /* real data is compressed on the offloader */ + if (!zp->zp_ismd && + (zia_props->compress == 1)) { + zia_rc = zia_compress(zia_props->provider, zio, lsize, + compress, zp->zp_complevel, + &cbuf_handle, &psize, &local_offload); + } + + if (zia_rc != ZIA_OK) { + zia_cleanup_abd(zio->io_abd, lsize, local_offload); +#endif psize = zio_compress_data(compress, zio->io_abd, cbuf, lsize, zp->zp_complevel); +#ifdef ZIA + } +#endif + if (psize == 0 || psize >= lsize) { compress = ZIO_COMPRESS_OFF; +#ifdef ZIA + /* no need for offloaded compressed buffer any more */ + zia_free(&cbuf_handle); + + /* source abd is still offloaded */ +#endif zio_buf_free(cbuf, lsize); } else if (!zp->zp_dedup && !zp->zp_encrypt && psize <= BPE_PAYLOAD_SIZE && zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) && spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) { +#ifdef ZIA + /* + * compressed enough, but not handling embedded + * data, so move compressed data back into memory + */ + zia_onload(&cbuf_handle, cbuf, psize); + + /* + * remove offloaded source abd + * + * in-memory copy should still be valid, but calling + * zia_cleanup_abd just in case + */ + zia_cleanup_abd(zio->io_abd, lsize, local_offload); +#endif encode_embedded_bp_compressed(bp, cbuf, compress, lsize, psize); BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA); @@ -1729,15 +1783,70 @@ zio_write_compress(zio_t *zio) spa->spa_min_alloc); if (rounded >= lsize) { compress = ZIO_COMPRESS_OFF; +#ifdef ZIA + /* + * don't need offloaded compressed + * buffer any more + */ + zia_free(&cbuf_handle); +#endif zio_buf_free(cbuf, lsize); psize = lsize; } else { abd_t *cdata = abd_get_from_buf(cbuf, lsize); abd_take_ownership_of_buf(cdata, B_TRUE); +#ifdef ZIA + /* real data */ + if (!zp->zp_ismd && cbuf_handle) { + /* source abd no longer needed */ + zia_free_abd(zio->io_abd, B_FALSE); + + /* + * compressed enough, so associate the + * compressed buffer with the abd + */ + zia_move_into_abd(cdata, &cbuf_handle); + if (zia_zero_fill(cdata, psize, + rounded - psize) != ZIA_OK) { + /* + * if setting cdata's handle + * fails, onload the compressed + * buffer (automatically placing + * it into cdata) and continue + * using zfs + * + * if cbuf is not offloaded, + * nothing happens + */ + zia_onload(&cbuf_handle, + cbuf, lsize); + } + } +#endif abd_zero_off(cdata, psize, rounded - psize); psize = rounded; +#ifdef ZIA + /* + * metadata + * + * offload here to zero fill buffer in + * memory instead of calling provider + */ + if (zp->zp_ismd && + (zia_props->compress == 1)) { + zia_offload_abd(zia_props->provider, + cdata, psize, + zia_props->min_offload_size, NULL); + } +#endif zio_push_transform(zio, cdata, psize, lsize, NULL); +#ifdef ZIA + if (zia_is_offloaded(zio->io_abd)) { + zio->io_flags |= + ZIO_FLAG_DONT_AGGREGATE; + } +#endif } } @@ -3773,6 +3882,15 @@ zio_vdev_io_start(zio_t *zio) if (zio->io_type == ZIO_TYPE_WRITE) { abd_copy(abuf, zio->io_abd, zio->io_size); abd_zero_off(abuf, zio->io_size, 
asize - zio->io_size); +#ifdef ZIA + /* + * The Z.I.A. handles of the abds that come here + * were not modified and do not get associated with + * abuf during the transform. Instead of dropping + * the handle and delaying here, let abd_free clean + * it up later. + */ +#endif } zio_push_transform(zio, abuf, asize, asize, zio_subblock); } @@ -3969,6 +4087,10 @@ zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr) { void *abd = abd_alloc_sametype(zio->io_abd, zio->io_size); +#ifdef ZIA + zia_onload_abd(zio->io_abd, zio->io_size, B_FALSE); +#endif + abd_copy(abd, zio->io_abd, zio->io_size); zcr->zcr_cbinfo = zio->io_size; @@ -5029,6 +5151,10 @@ EXPORT_SYMBOL(zio_data_buf_alloc); EXPORT_SYMBOL(zio_buf_free); EXPORT_SYMBOL(zio_data_buf_free); +#ifdef ZIA +EXPORT_SYMBOL(zio_push_transform); +#endif + ZFS_MODULE_PARAM(zfs_zio, zio_, slow_io_ms, INT, ZMOD_RW, "Max I/O completion time (milliseconds) before marking it as slow"); diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c index c7368ac26a09..b47ff61fc180 100644 --- a/module/zfs/zio_checksum.c +++ b/module/zfs/zio_checksum.c @@ -33,6 +33,10 @@ #include #include +#ifdef ZIA +#include +#endif + /* * Checksum vectors. * @@ -358,6 +362,10 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, size_t eck_offset; memset(&saved, 0, sizeof (zio_cksum_t)); +#ifdef ZIA + /* not handling embedded checksums, so bring back data */ + zia_cleanup_abd(abd, size, B_FALSE); +#endif if (checksum == ZIO_CHECKSUM_ZILOG2) { zil_chain_t zilc; @@ -400,8 +408,29 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, sizeof (zio_cksum_t)); } else { saved = bp->blk_cksum; + +#ifdef ZIA + int zia_rc = ZIA_ERROR; + zia_props_t *props = zia_get_props(spa); + + /* only offload non-embedded checksums */ + boolean_t local_offload = B_FALSE; + if (props->checksum == 1) { + zia_rc = zia_checksum_compute(props->provider, &cksum, + checksum, zio, size, &local_offload); + } + + /* fall back to ZFS implementation */ + if (zia_rc != ZIA_OK) { + zia_cleanup_abd(abd, size, local_offload); +#endif ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum], &cksum); +#ifdef ZIA + } else { + zio->io_flags |= ZIO_FLAG_DONT_AGGREGATE; + } +#endif if (BP_USES_CRYPT(bp) && BP_GET_TYPE(bp) != DMU_OT_OBJSET) zio_checksum_handle_crypt(&cksum, &saved, insecure); bp->blk_cksum = cksum; @@ -539,8 +568,27 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) abd_t *data = zio->io_abd; spa_t *spa = zio->io_spa; +#ifdef ZIA + error = ZIA_FALLBACK; + + if (zia_get_props(zio->io_spa)->checksum == 1) { + zio_checksum_info_t *ci = &zio_checksum_table[checksum]; + if (!(ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED)) { + error = zia_checksum_error(bp, + checksum, data, size, info); + } + } + + /* fall back to ZFS implementation */ + if ((error != ZIA_OK) && (error != ECKSUM)) { + /* data was modified by reconstruction */ + zia_onload_abd(data, size, B_FALSE); +#endif error = zio_checksum_error_impl(spa, bp, checksum, data, size, offset, info); +#ifdef ZIA + } +#endif if (zio_injection_enabled && error == 0 && zio->io_error == 0) { error = zio_handle_fault_injection(zio, ECKSUM); diff --git a/module/zfs/zio_compress.c b/module/zfs/zio_compress.c index 717395dcf456..40794d5a9a06 100644 --- a/module/zfs/zio_compress.c +++ b/module/zfs/zio_compress.c @@ -111,7 +111,11 @@ zio_compress_select(spa_t *spa, enum zio_compress child, return (result); } +#ifndef ZIA static int +#else +int +#endif zio_compress_zeroed_cb(void *data, size_t len, void *private) { (void) private; 
diff --git a/module/zia-software-provider/kernel_offloader.c b/module/zia-software-provider/kernel_offloader.c new file mode 100644 index 000000000000..875f454c0708 --- /dev/null +++ b/module/zia-software-provider/kernel_offloader.c @@ -0,0 +1,766 @@ +/* + * © 2021. Triad National Security, LLC. All rights reserved. + * + * This program was produced under U.S. Government contract + * 89233218CNA000001 for Los Alamos National Laboratory (LANL), which + * is operated by Triad National Security, LLC for the U.S. + * Department of Energy/National Nuclear Security Administration. All + * rights in the program are reserved by Triad National Security, LLC, + * and the U.S. Department of Energy/National Nuclear Security + * Administration. The Government is granted for itself and others + * acting on its behalf a nonexclusive, paid-up, irrevocable worldwide + * license in this material to reproduce, prepare derivative works, + * distribute copies to the public, perform publicly and display + * publicly, and to permit others to do so. + * + * ---- + * + * This program is open source under the BSD-3 License. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kernel_offloader.h" + +static const char NAME[] = "Kernel Offloader"; +static const size_t NAME_LEN = sizeof (NAME); + +typedef enum kernel_offloader_handle_type { + KOH_REAL, /* default type - convert all data into a single blob */ + KOH_REFERENCE, + + KOH_INVALID, +} koht_t; + +/* offloaded data */ +typedef struct kernel_offloader_handle { + koht_t type; + void *ptr; + size_t size; +} koh_t; + +/* never decreases */ +static atomic_t total_count; +static atomic_t total_size; +static atomic_t total_actual; + +/* currently active */ +static atomic_t active_count; +static atomic_t active_size; +static atomic_t active_actual; + +/* + * value used to swizzle the pointer so that + * dereferencing the handle will fail + */ +static void *mask = NULL; +void +kernel_offloader_init(void) +{ + get_random_bytes(&mask, sizeof (mask)); + atomic_set(&total_count, 0); + atomic_set(&total_size, 0); + atomic_set(&total_actual, 0); + atomic_set(&active_count, 0); + atomic_set(&active_size, 0); + atomic_set(&active_actual, 0); + printk("kernel offloader init: %p\n", mask); +} + +void +kernel_offloader_fini(void) +{ + mask = NULL; + + printk("kernel offloader fini with " + "%d/%d bytes in %d allocations " + "(actual %d/%d bytes in %d allocations) " + "remaining\n", + atomic_read(&active_size), + atomic_read(&total_size), + atomic_read(&active_count), + atomic_read(&active_actual), + atomic_read(&total_actual), + atomic_read(&total_count)); +} + +/* get a starting address of a linear koh_t */ +static void * +ptr_start(koh_t *koh, size_t offset) +{ + return (void *)(((uintptr_t)koh->ptr) + offset); +} + +/* + * convert the actual pointer to a handle (pretend + * the data is not accessible from the Z.I.A. 
base) + */ +static void * +swizzle(void *ptr) +{ + return (ptr?((void *)(((uintptr_t)ptr) ^ ((uintptr_t)mask))):NULL); +} + +/* convert the handle to a usable pointer */ +static void * +unswizzle(void *handle) +{ + return (swizzle(handle)); +} + +static koh_t * +koh_alloc(size_t size) +{ + koh_t *koh = kmalloc(sizeof (koh_t), GFP_KERNEL); + if (koh) { + koh->type = KOH_REAL; + koh->ptr = kmalloc(size, GFP_KERNEL); + koh->size = size; + + /* the allocation itself */ + atomic_add(1, &total_count); + atomic_add(1, &active_count); + atomic_add(size, &total_size); + atomic_add(size, &active_size); + atomic_add(size, &total_actual); + atomic_add(size, &active_actual); + + /* the wrapper struct */ + atomic_add(1, &total_count); + atomic_add(1, &active_count); + atomic_add(sizeof (koh_t), &total_size); + atomic_add(sizeof (koh_t), &active_size); + } + + return (koh); +} + +static koh_t * +koh_alloc_ref(koh_t *src, size_t offset, size_t size) +{ + koh_t *ref = NULL; + if (src) { + koh_t *src_koh = (koh_t *)src; + + if ((offset + size) > src_koh->size) { + printk("Error: Cannot reference handle of size %zu " + "starting at offset %zu with size %zu\n", + src_koh->size, offset, size); + return (NULL); + } + + ref = kmalloc(sizeof (koh_t), GFP_KERNEL); + if (ref) { + ref->type = KOH_REFERENCE; + + /* same underlying buffer */ + ref->ptr = ptr_start(src, offset); + + /* should probably check offset + size < src->size */ + ref->size = size; + + /* no new requested space */ + + /* the wrapper struct */ + atomic_add(1, &total_count); + atomic_add(1, &active_count); + atomic_add(sizeof (koh_t), &total_size); + atomic_add(sizeof (koh_t), &active_size); + } + } + + return (ref); +} + +int +kernel_offloader_get_size(void *handle, size_t *size, size_t *actual) +{ + koh_t *koh = (koh_t *)unswizzle(handle); + + if (size) { + *size = koh->size; + } + + if (actual) { + *actual = koh->size; + } + + return (KERNEL_OFFLOADER_OK); +} + +static void +koh_free(koh_t *koh) +{ + if (koh) { + switch (koh->type) { + case KOH_REAL: + /* the allocation itself */ + atomic_sub(1, &active_count); + atomic_sub(koh->size, &active_size); + atomic_sub(koh->size, &active_actual); + kfree(koh->ptr); + break; + case KOH_REFERENCE: + case KOH_INVALID: + default: + break; + } + + /* the wrapper struct */ + atomic_sub(1, &active_count); + atomic_sub(sizeof (koh_t), &active_size); + kfree(koh); + } +} + +void * +kernel_offloader_alloc(size_t size) +{ + return (swizzle(koh_alloc(size))); +} + +void * +kernel_offloader_alloc_ref(void *src_handle, size_t offset, size_t size) +{ + return swizzle(koh_alloc_ref(unswizzle(src_handle), + offset, size)); +} + +void +kernel_offloader_free(void *handle) +{ + koh_free(unswizzle(handle)); +} + +int +kernel_offloader_copy_from_mem(void *handle, size_t offset, + const void *src, size_t size) +{ + koh_t *koh = (koh_t *)unswizzle(handle); + if (!koh) { + return (KERNEL_OFFLOADER_ERROR); + } + + if ((offset + size) > koh->size) { + return (KERNEL_OFFLOADER_ERROR); + } + + void *dst = ptr_start(koh, offset); + if (memcpy(dst, src, size) != dst) { + return (KERNEL_OFFLOADER_ERROR); + } + return (KERNEL_OFFLOADER_OK); +} + +int +kernel_offloader_copy_to_mem(void *handle, size_t offset, + void *dst, size_t size) +{ + koh_t *koh = (koh_t *)unswizzle(handle); + if (!koh) { + return (KERNEL_OFFLOADER_ERROR); + } + + if ((offset + size) > koh->size) { + return (KERNEL_OFFLOADER_ERROR); + } + + if (memcpy(dst, ptr_start(koh, offset), size) != dst) { + return (KERNEL_OFFLOADER_ERROR); + } + + return 
(KERNEL_OFFLOADER_OK); +} + +int +kernel_offloader_cmp(void *lhs_handle, void *rhs_handle, int *diff) +{ + koh_t *lhs = (koh_t *)unswizzle(lhs_handle); + koh_t *rhs = (koh_t *)unswizzle(rhs_handle); + + if (!lhs || !rhs || !diff) { + return (KERNEL_OFFLOADER_ERROR); + } + + size_t len = rhs->size; + if (lhs->size != rhs->size) { + len = + (lhs->size < rhs->size)?lhs->size:rhs->size; + } + + *diff = memcmp(ptr_start(lhs, 0), + ptr_start(rhs, 0), len); + + return (KERNEL_OFFLOADER_OK); +} + +int +kernel_offloader_zero_fill(void *handle, size_t offset, size_t size) +{ + koh_t *koh = (koh_t *)unswizzle(handle); + memset(ptr_start(koh, offset), 0, size); + return (KERNEL_OFFLOADER_OK); +} + +int +kernel_offloader_all_zeros(void *handle, size_t offset, size_t size) +{ + koh_t *koh = (koh_t *)unswizzle(handle); + if (koh->size - offset < size) { + return (KERNEL_OFFLOADER_ERROR); + } + + uint64_t *array = ptr_start(koh, offset); + size_t i; + for (i = 0; i < size / sizeof (uint64_t); i++) { + if (array[i]) { + return (KERNEL_OFFLOADER_BAD_RESULT); + } + } + + char *remaining = ptr_start(koh, offset); + for (i *= sizeof (uint64_t); i < size; i++) { + if (remaining[i]) { + return (KERNEL_OFFLOADER_BAD_RESULT); + } + } + + return (KERNEL_OFFLOADER_OK); +} + +int +kernel_offloader_mem_stats( + void *t_count_handle, void *t_size_handle, void *t_actual_handle, + void *a_count_handle, void *a_size_handle, void *a_actual_handle) +{ + if (t_count_handle) { + *(size_t *)ptr_start(t_count_handle, 0) = + atomic_read(&total_count); + } + + if (t_size_handle) { + *(size_t *)ptr_start(t_size_handle, 0) = + atomic_read(&total_size); + } + + if (t_actual_handle) { + *(size_t *)ptr_start(t_actual_handle, 0) = + atomic_read(&total_actual); + } + + if (a_count_handle) { + *(size_t *)ptr_start(a_count_handle, 0) = + atomic_read(&active_count); + } + + if (a_size_handle) { + *(size_t *)ptr_start(a_size_handle, 0) = + atomic_read(&active_size); + } + + if (a_actual_handle) { + *(size_t *)ptr_start(a_actual_handle, 0) = + atomic_read(&active_actual); + } + + return (KERNEL_OFFLOADER_OK); +} + +/* specific implementation */ +static int +kernel_offloader_gzip_compress(koh_t *src, koh_t *dst, + size_t s_len, int level, size_t *c_len) +{ + *c_len = dst->size; + + if (z_compress_level(ptr_start(dst, 0), c_len, ptr_start(src, 0), + s_len, level) != Z_OK) { + if (*c_len != src->size) { + return (KERNEL_OFFLOADER_ERROR); + } + return (KERNEL_OFFLOADER_OK); + } + + return (KERNEL_OFFLOADER_OK); +} + +int +kernel_offloader_compress(dpusm_compress_t alg, + void *src, void *dst, size_t s_len, int level, + void *c_len) +{ + int status = KERNEL_OFFLOADER_UNAVAILABLE; + koh_t *src_koh = NULL; + koh_t *dst_koh = NULL; + koh_t *c_len_koh = NULL; + if (!src || !dst || !c_len) { + return (KERNEL_OFFLOADER_ERROR); + } + + src_koh = (koh_t *)unswizzle(src); + dst_koh = (koh_t *)unswizzle(dst); + c_len_koh = (koh_t *)unswizzle(c_len); + + if ((DPUSM_COMPRESS_GZIP_1 <= alg) && + (alg <= DPUSM_COMPRESS_GZIP_9)) { + status = kernel_offloader_gzip_compress(src_koh, dst_koh, s_len, + level, (size_t *)ptr_start(c_len_koh, 0)); + } + + return (status); +} + +/* specific implementation */ +static int +kernel_offloader_gzip_decompress(koh_t *src, koh_t *dst, + int level, size_t *c_len) +{ + if (z_uncompress(ptr_start(dst, 0), c_len, ptr_start(src, 0), + src->size) != Z_OK) { + return (KERNEL_OFFLOADER_ERROR); + } + + return (KERNEL_OFFLOADER_OK); +} + +int +kernel_offloader_decompress(dpusm_compress_t alg, + void *src, void *dst, int level) +{ + int 
status = KERNEL_OFFLOADER_UNAVAILABLE; + koh_t *src_koh = (koh_t *)unswizzle(src); + koh_t *dst_koh = (koh_t *)unswizzle(dst); + + size_t d_len = 0; + + if ((DPUSM_COMPRESS_GZIP_1 <= alg) && + (alg <= DPUSM_COMPRESS_GZIP_9)) { + status = kernel_offloader_gzip_decompress(src_koh, dst_koh, + level, &d_len); + } + + return (status); +} + +int +kernel_offloader_checksum(dpusm_checksum_t alg, + dpusm_checksum_byteorder_t order, void *data, size_t size, + void *cksum, size_t cksum_size) +{ + koh_t *data_koh = (koh_t *)unswizzle(data); + + if (!data_koh) { + return (KERNEL_OFFLOADER_ERROR); + } + + if ((alg != DPUSM_CHECKSUM_FLETCHER_2) && + (alg != DPUSM_CHECKSUM_FLETCHER_4)) { + return (KERNEL_OFFLOADER_ERROR); + } + + zio_cksum_t zcp; + if (cksum_size < sizeof (zcp.zc_word)) { + return (KERNEL_OFFLOADER_ERROR); + } + + /* compute checksum */ + + void *buf = ptr_start(data_koh, 0); + + if (alg == DPUSM_CHECKSUM_FLETCHER_2) { + if (order == DPUSM_BYTEORDER_NATIVE) { + fletcher_2_native(buf, size, NULL, &zcp); + } else { + fletcher_2_byteswap(buf, size, NULL, &zcp); + } + } else if (alg == DPUSM_CHECKSUM_FLETCHER_4) { + if (order == DPUSM_BYTEORDER_NATIVE) { + fletcher_4_native(buf, size, NULL, &zcp); + } else { + fletcher_4_byteswap(buf, size, NULL, &zcp); + } + } + + memcpy(cksum, zcp.zc_word, sizeof (zcp.zc_word)); + + return (DPUSM_OK); +} + +void * +kernel_offloader_raidz_alloc(size_t nparity, size_t ndata, + void **col_handles, size_t *col_sizes) +{ + const size_t ncols = nparity + ndata; + raidz_row_t *rr = kmalloc(offsetof(raidz_row_t, + rr_col[ncols]), GFP_KERNEL); + rr->rr_cols = ncols; + rr->rr_firstdatacol = nparity; + + for (size_t c = 0; c < ncols; c++) { + koh_t *koh = (koh_t *)unswizzle(col_handles[c]); + raidz_col_t *rc = &rr->rr_col[c]; + memset(rc, 0, sizeof (raidz_row_t)); + + rc->rc_abd = abd_get_from_buf(koh->ptr, koh->size); + abd_release_ownership_of_buf(rc->rc_abd); + rc->rc_size = col_sizes[c]; + } + + return (swizzle(rr)); +} + +/* attaches a column to the raidz struct */ +int +kernel_offloader_raidz_set_col(void *raidz, uint64_t c, + void *col, size_t size) +{ + raidz_row_t *rr = (raidz_row_t *)unswizzle(raidz); + koh_t *koh = (koh_t *)unswizzle(col); + + if (!rr || !koh) { + return (KERNEL_OFFLOADER_ERROR); + } + + /* c is too big */ + if (c >= rr->rr_cols) { + return (KERNEL_OFFLOADER_ERROR); + } + + /* parity column */ + if (c < rr->rr_firstdatacol) { + /* must be a real allocation */ + if (koh->type != KOH_REAL) { + return (KERNEL_OFFLOADER_ERROR); + } + } + /* data column */ + else { + /* needs to be a reference */ + if (koh->type != KOH_REFERENCE) { + return (KERNEL_OFFLOADER_ERROR); + } + } + + /* "active" size is larger than allocated size */ + if (size > koh->size) { + return (KERNEL_OFFLOADER_ERROR); + } + + raidz_col_t *rc = &rr->rr_col[c]; + rc->rc_abd = abd_get_from_buf(koh->ptr, size); + abd_release_ownership_of_buf(rc->rc_abd); + rc->rc_size = size; + + return (KERNEL_OFFLOADER_OK); +} + +void +kernel_offloader_raidz_free(void *raidz) +{ + raidz_row_t *rr = (raidz_row_t *)unswizzle(raidz); + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + abd_free(rc->rc_abd); + } + kfree(rr); +} + +int +kernel_offloader_raidz_gen(void *raidz) +{ + raidz_row_t *rr = (raidz_row_t *)unswizzle(raidz); + if (!rr) { + return (KERNEL_OFFLOADER_ERROR); + } + + switch (rr->rr_firstdatacol) { + case 1: + vdev_raidz_generate_parity_p(rr); + break; + case 2: + vdev_raidz_generate_parity_pq(rr); + break; + case 3: + vdev_raidz_generate_parity_pqr(rr); + 
break; + } + + return (KERNEL_OFFLOADER_OK); +} + +int +kernel_offloader_raidz_rec(void *raidz, int *tgts, int ntgts) +{ + raidz_row_t *rr = (raidz_row_t *)unswizzle(raidz); + if (!rr) { + return (KERNEL_OFFLOADER_ERROR); + } + + vdev_raidz_reconstruct_general(rr, tgts, ntgts); + + return (KERNEL_OFFLOADER_OK); +} + +void * +kernel_offloader_file_open(const char *path, int flags, int mode) +{ + zfs_file_t *fp = NULL; + /* on error, fp should still be NULL */ + zfs_file_open(path, flags, mode, &fp); + return (swizzle(fp)); +} + +int +kernel_offloader_file_write(void *fp_handle, void *handle, size_t count, + size_t trailing_zeros, loff_t offset, ssize_t *resid, int *err) +{ + zfs_file_t *fp = (zfs_file_t *)unswizzle(fp_handle); + if (!fp) { + return (KERNEL_OFFLOADER_ERROR); + } + + koh_t *koh = (koh_t *)unswizzle(handle); + if (!koh) { + return (KERNEL_OFFLOADER_ERROR); + } + + if (!err) { + return (KERNEL_OFFLOADER_ERROR); + } + + *err = zfs_file_pwrite(fp, ptr_start(koh, 0), + count, offset, resid); + + if (*err == 0) { + void *zeros = kzalloc(trailing_zeros, GFP_KERNEL); + *err = zfs_file_pwrite(fp, zeros, + trailing_zeros, offset + count, resid); + kfree(zeros); + } + + return ((*err)?KERNEL_OFFLOADER_BAD_RESULT:KERNEL_OFFLOADER_OK); +} + +void +kernel_offloader_file_close(void *fp_handle) +{ + zfs_file_close(unswizzle(fp_handle)); +} + +void * +kernel_offloader_disk_open(dpusm_dd_t *disk_data) +{ + return (swizzle(disk_data->bdev)); +} + +int +kernel_offloader_disk_invalidate(void *disk_handle) +{ + struct block_device *bdev = + (struct block_device *)unswizzle(disk_handle); + invalidate_bdev(bdev); + return (DPUSM_OK); +} + +int +kernel_offloader_disk_write(void *disk_handle, void *handle, size_t data_size, + size_t trailing_zeros, uint64_t io_offset, int flags, + dpusm_disk_write_completion_t write_completion, void *wc_args) +{ + struct block_device *bdev = + (struct block_device *)unswizzle(disk_handle); + koh_t *koh = (koh_t *)unswizzle(handle); + + const size_t io_size = data_size + trailing_zeros; + + if (trailing_zeros) { + /* create a copy of the data with the trailing zeros attached */ + void *copy = kzalloc(io_size, GFP_KERNEL); + memcpy(copy, ptr_start(koh, 0), data_size); + + /* need to keep copy alive, so replace koh->ptr */ + if (koh->type == KOH_REAL) { + kfree(koh->ptr); + + atomic_sub(1, &active_count); + atomic_sub(koh->size, &active_size); + atomic_sub(koh->size, &active_actual); + } + + koh->type = KOH_REAL; + koh->ptr = copy; + koh->size = io_size; + + atomic_add(1, &total_count); + atomic_add(1, &active_count); + atomic_add(io_size, &total_size); + atomic_add(io_size, &active_size); + atomic_add(io_size, &total_actual); + atomic_add(io_size, &active_actual); + } + + abd_t *abd = abd_get_from_buf(koh->ptr, io_size); + abd_release_ownership_of_buf(abd); + zio_push_transform(wc_args, abd, io_size, io_size, NULL); + + /* __vdev_disk_physio already adds write_completion */ + (void) write_completion; + + return (__vdev_disk_physio(bdev, wc_args, + io_size, io_offset, WRITE, flags)); +} + +int +kernel_offloader_disk_flush(void *disk_handle, + dpusm_disk_flush_completion_t flush_completion, void *fc_args) +{ + struct block_device *bdev = + (struct block_device *)unswizzle(disk_handle); + + /* vdev_disk_io_flush already adds flush completion */ + (void) flush_completion; + + return (vdev_disk_io_flush(bdev, fc_args)); +} + +void +kernel_offloader_disk_close(void *disk_handle) +{} diff --git a/module/zia-software-provider/kernel_offloader.h 
b/module/zia-software-provider/kernel_offloader.h new file mode 100644 index 000000000000..e320bddbb28d --- /dev/null +++ b/module/zia-software-provider/kernel_offloader.h @@ -0,0 +1,149 @@ +/* + * © 2021. Triad National Security, LLC. All rights reserved. + * + * This program was produced under U.S. Government contract + * 89233218CNA000001 for Los Alamos National Laboratory (LANL), which + * is operated by Triad National Security, LLC for the U.S. + * Department of Energy/National Nuclear Security Administration. All + * rights in the program are reserved by Triad National Security, LLC, + * and the U.S. Department of Energy/National Nuclear Security + * Administration. The Government is granted for itself and others + * acting on its behalf a nonexclusive, paid-up, irrevocable worldwide + * license in this material to reproduce, prepare derivative works, + * distribute copies to the public, perform publicly and display + * publicly, and to permit others to do so. + * + * ---- + * + * This program is open source under the BSD-3 License. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef _KERNEL_OFFLOADER_H +#define _KERNEL_OFFLOADER_H + +#include +#include + +/* + * This file represents the API provided by a vendor to access their + * offloader. The API can be anything the implementor chooses to + * expose. There are no limitations on the function signature or + * name. They just have to be called correctly in the Z.I.A. provider. + * ZFS and Z.I.A. will not need direct access to any data located on + * the offloader. Some raw pointers from Z.I.A. will be used directly, + * but those will always contain information located in memory. + * + * ------------------------------------------------------------------- + * + * The kernel offloader fakes offloads by copying data into memory + * regions distinct from the calling process's memory space. The + * corresponding C file conflates the driver and the "physical" device + * since both memory spaces are in kernel space and run on the + * CPU. 
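+ *
+ * As a rough sketch only (the routine in kernel_offloader.c also
+ * updates usage counters and may differ in detail), such a "fake
+ * offload" boils down to a bounds-checked memcpy into the buffer
+ * tracked by the offloader's private handle struct (koh_t, with its
+ * ptr/size members; ptr_start() offsets into the backing buffer):
+ *
+ *     int
+ *     kernel_offloader_copy_from_mem(void *handle, size_t offset,
+ *         const void *src, size_t size)
+ *     {
+ *         koh_t *koh = (koh_t *)unswizzle(handle);
+ *         if (!koh || (offset + size > koh->size)) {
+ *             return (KERNEL_OFFLOADER_ERROR);
+ *         }
+ *         memcpy(ptr_start(koh, offset), src, size);
+ *         return (KERNEL_OFFLOADER_OK);
+ *     }
+ *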
This offloader provides opaque pointers to the provider to + * simulate handles to inaccessible memory locations. In order to + * prevent the handle from being dereferenced and used successfully by + * ZFS or Z.I.A., the handle pointer is masked with a random value + * generated at load-time. Other offloaders may choose to present + * non-void handles. + */ + +/* return values */ +#define KERNEL_OFFLOADER_OK 0 + +/* function is implemented, but the chosen operation is not implemented */ +#define KERNEL_OFFLOADER_UNAVAILABLE 1 + +/* ran, but could not complete */ +#define KERNEL_OFFLOADER_ERROR 2 + +/* ran, but failed a check on a result */ +#define KERNEL_OFFLOADER_BAD_RESULT 3 + +/* + * init function - this should be the kernel module init, but + * kernel offloader is not compiled as a separate kernel module + */ +void kernel_offloader_init(void); +void kernel_offloader_fini(void); + +/* offloader handle access */ +void *kernel_offloader_alloc(size_t size); +void *kernel_offloader_alloc_ref(void *src, size_t offset, size_t size); +int kernel_offloader_get_size(void *handle, size_t *size, size_t *actual); +void kernel_offloader_free(void *handle); +int kernel_offloader_copy_from_mem(void *handle, size_t offset, + const void *src, size_t size); +int kernel_offloader_copy_to_mem(void *handle, size_t offset, + void *dst, size_t size); +/* status check */ +int kernel_offloader_mem_stats( + void *t_count_handle, void *t_size_handle, void *t_actual_handle, + void *a_count_handle, void *a_size_handle, void *a_actual_handle); +int kernel_offloader_cmp(void *lhs_handle, void *rhs_handle, int *diff); +int kernel_offloader_zero_fill(void *handle, size_t offset, size_t size); +int kernel_offloader_all_zeros(void *handle, size_t offset, size_t size); + +/* ZIO Pipeline Stages */ + +int kernel_offloader_compress(dpusm_compress_t alg, + void *src, void *dst, size_t s_len, int level, + void *c_len); + +int kernel_offloader_decompress(dpusm_decompress_t alg, + void *src, void *dst, int level); + +int kernel_offloader_checksum(dpusm_checksum_t alg, + dpusm_checksum_byteorder_t order, void *data, size_t size, + void *cksum, size_t cksum_size); + +void *kernel_offloader_raidz_alloc(size_t nparity, size_t ndata, + void **col_handles, size_t *col_sizes); +int kernel_offloader_raidz_set_col(void *raidz, uint64_t c, + void *col, size_t size); +void kernel_offloader_raidz_free(void *raidz); +int kernel_offloader_raidz_gen(void *raidz); +int kernel_offloader_raidz_rec(void *raidz, int *tgts, int ntgts); + +/* io */ +void *kernel_offloader_file_open(const char *path, int flags, int mode); +int kernel_offloader_file_write(void *fp_handle, void *handle, size_t count, + size_t trailing_zeros, loff_t offset, ssize_t *resid, int *err); +void kernel_offloader_file_close(void *fp_handle); + +void *kernel_offloader_disk_open(dpusm_dd_t *disk_data); +int kernel_offloader_disk_reread_part(void *disk_handle); +int kernel_offloader_disk_invalidate(void *disk_handle); +int kernel_offloader_disk_write(void *disk_handle, void *handle, + size_t data_size, size_t trailing_zeros, uint64_t io_offset, int flags, + dpusm_disk_write_completion_t write_completion, void *wc_args); +int kernel_offloader_disk_flush(void *disk_handle, + dpusm_disk_flush_completion_t flush_completion, void *fc_args); +void kernel_offloader_disk_close(void *disk_handle); + +#endif diff --git a/module/zia-software-provider/software.c b/module/zia-software-provider/software.c new file mode 100644 index 000000000000..b644b14e152e --- /dev/null +++ 
b/module/zia-software-provider/software.c @@ -0,0 +1,451 @@ +/* + * © 2021. Triad National Security, LLC. All rights reserved. + * + * This program was produced under U.S. Government contract + * 89233218CNA000001 for Los Alamos National Laboratory (LANL), which + * is operated by Triad National Security, LLC for the U.S. + * Department of Energy/National Nuclear Security Administration. All + * rights in the program are reserved by Triad National Security, LLC, + * and the U.S. Department of Energy/National Nuclear Security + * Administration. The Government is granted for itself and others + * acting on its behalf a nonexclusive, paid-up, irrevocable worldwide + * license in this material to reproduce, prepare derivative works, + * distribute copies to the public, perform publicly and display + * publicly, and to permit others to do so. + * + * ---- + * + * This program is open source under the BSD-3 License. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * This provider communicates with the "kernel offloader", which is + * actually just software running on the local kernel. + * + * Providers and offloaders are usually separate entities. However, to + * keep things simple, the kernel offloader is compiled into this + * provider. + * + * Providers run at the same location as ZFS. They are intended to be + * small shims that translate between the DPUSM provider API and an + * offloader's API (probably a header file analogous to + * kernel_offloader.h). + * + * The method used to communicate between the provider and offloader + * is not prescribed by the DPUSM. This allows for vendors to place + * their offloaders locally or remotely, and use whatever method they + * wish to use to communicate with their offloaders e.g. NVMeOF. The + * kernel offloader is local and the communication method to access + * the kernel offloader is calling local functions. + * + * Offloaders are normally expected to be hardware with its own memory + * space. 
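+ *
+ * The handle masking mentioned below is an implementation detail of
+ * the kernel offloader, not something the DPUSM prescribes. As an
+ * illustrative sketch only (the name "koh_mask" is assumed here, not
+ * taken from this patch), it could be an XOR against a random value
+ * chosen at load time, which makes masking and unmasking the same
+ * operation:
+ *
+ *     static uintptr_t koh_mask;
+ *
+ *     void
+ *     kernel_offloader_init(void)
+ *     {
+ *         get_random_bytes(&koh_mask, sizeof (koh_mask));
+ *     }
+ *
+ *     static void *
+ *     swizzle(void *ptr)
+ *     {
+ *         return ((void *)(((uintptr_t)ptr) ^ koh_mask));
+ *     }
+ *
+ *     static void *
+ *     unswizzle(void *handle)
+ *     {
+ *         return (swizzle(handle));
+ *     }
+ *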
In order to simulate copying data to an offloader's memory + * space, the kernel offloader allocates new buffers and copies ZFS + * data into them, rather than using ZFS data directly. In order to + * simulate handles that the provider does not know how to manipulate + * or have access to, pointers returned from the kernel offloader are + * masked with a random value. + * + * Note that this provider has to be loaded after ZFS because it + * depends on ZFS for its "offload" functionality. + * + * Usage: + * Reconfigure ZFS with --with-zia= + * + * Create a zpool + * + * Select this provider with + * zpool set zia_provider=zia-software-provider + * + * Enable "offloading" of operations with + * zpool set zia_compress=on + * zpool set zia_checksum=on + * zpool set zia_raidz1_gen=on + * zpool set zia_raidz2_gen=on + * zpool set zia_raidz3_gen=on + * zpool set zia_raidz1_rec=on + * zpool set zia_raidz2_rec=on + * zpool set zia_raidz3_rec=on + * zpool set zia_file_write=on + * zpool set zia_disk_write=on + * + * Use the zpool as you would normally + * + * Notes: + * If a ZFS IO stage is not run, enabling a Z.I.A. offload + * will have no effect. + * + * Resilvering requires both zia_checksum and zia_raidz*_rec + * to be enabled. Not enabling checksums would cause offloaded + * resilvering to fail, and perform the remaining operations + * in memory. To avoid the cost of offloading data only to + * fail, a check has been inserted to prevent offloading + * altogether if zia_checksum is not enabled. + */ + +#include +#include +#include + +#include /* provides access to the offloader */ +#include /* the DPUSM provider API */ + +/* translate from offloader values to DPUSM values */ +static int +translate_rc(const int offloader_rc) +{ + int dpusm_rc = DPUSM_NOT_IMPLEMENTED; + switch (offloader_rc) { + case KERNEL_OFFLOADER_OK: + dpusm_rc = DPUSM_OK; + break; + case KERNEL_OFFLOADER_ERROR: + dpusm_rc = DPUSM_ERROR; + break; + case KERNEL_OFFLOADER_UNAVAILABLE: + dpusm_rc = DPUSM_NOT_IMPLEMENTED; + break; + case KERNEL_OFFLOADER_BAD_RESULT: + dpusm_rc = DPUSM_BAD_RESULT; + break; + default: + /* only translate recognized values */ + dpusm_rc = offloader_rc; + break; + } + return (dpusm_rc); +} + +static int +sw_provider_algorithms(int *compress, int *decompress, + int *checksum, int *checksum_byteorder, int *raid) +{ + *compress = + DPUSM_COMPRESS_GZIP_1 | + DPUSM_COMPRESS_GZIP_2 | + DPUSM_COMPRESS_GZIP_3 | + DPUSM_COMPRESS_GZIP_4 | + DPUSM_COMPRESS_GZIP_5 | + DPUSM_COMPRESS_GZIP_6 | + DPUSM_COMPRESS_GZIP_7 | + DPUSM_COMPRESS_GZIP_8 | + DPUSM_COMPRESS_GZIP_9; + + *decompress = 0; + + *checksum = DPUSM_CHECKSUM_FLETCHER_2 | DPUSM_CHECKSUM_FLETCHER_4; + + *checksum_byteorder = DPUSM_BYTEORDER_NATIVE | DPUSM_BYTEORDER_BYTESWAP; + + *raid = + DPUSM_RAID_1_GEN | + DPUSM_RAID_2_GEN | + DPUSM_RAID_3_GEN | + DPUSM_RAID_1_REC | + DPUSM_RAID_2_REC | + DPUSM_RAID_3_REC; + + return (DPUSM_OK); +} + +static int +sw_provider_get_size(void *handle, size_t *size, size_t *actual) +{ + return (translate_rc(kernel_offloader_get_size(handle, + size, actual))); +} + +static int +sw_provider_copy_from_mem(dpusm_mv_t *mv, const void *buf, size_t size) +{ + return (translate_rc(kernel_offloader_copy_from_mem(mv->handle, + mv->offset, buf, size))); +} + +static int +sw_provider_copy_to_mem(dpusm_mv_t *mv, void *buf, size_t size) +{ + return (translate_rc(kernel_offloader_copy_to_mem(mv->handle, + mv->offset, buf, size))); +} + +static int +sw_provider_mem_stats(size_t *t_count, size_t *t_size, size_t *t_actual, + size_t 
*a_count, size_t *a_size, size_t *a_actual)
+{
+    void *t_count_handle = NULL;
+    void *t_size_handle = NULL;
+    void *t_actual_handle = NULL;
+    void *a_size_handle = NULL;
+    void *a_count_handle = NULL;
+    void *a_actual_handle = NULL;
+
+    if (t_count) {
+        t_count_handle = kernel_offloader_alloc(sizeof (size_t));
+    }
+
+    if (t_size) {
+        t_size_handle = kernel_offloader_alloc(sizeof (size_t));
+    }
+
+    if (t_actual) {
+        t_actual_handle = kernel_offloader_alloc(sizeof (size_t));
+    }
+
+    if (a_count) {
+        a_count_handle = kernel_offloader_alloc(sizeof (size_t));
+    }
+
+    if (a_size) {
+        a_size_handle = kernel_offloader_alloc(sizeof (size_t));
+    }
+
+    if (a_actual) {
+        a_actual_handle = kernel_offloader_alloc(sizeof (size_t));
+    }
+
+    /* pass the offloader handles, not the raw in-memory pointers */
+    const int rc = kernel_offloader_mem_stats(
+        t_count_handle, t_size_handle, t_actual_handle,
+        a_count_handle, a_size_handle, a_actual_handle);
+    if (rc == KERNEL_OFFLOADER_OK) {
+        /* should probably check for errors */
+        kernel_offloader_copy_to_mem(t_count_handle, 0,
+            t_count, sizeof (*t_count));
+        kernel_offloader_copy_to_mem(t_size_handle, 0,
+            t_size, sizeof (*t_size));
+        kernel_offloader_copy_to_mem(t_actual_handle, 0,
+            t_actual, sizeof (*t_actual));
+        kernel_offloader_copy_to_mem(a_count_handle, 0,
+            a_count, sizeof (*a_count));
+        kernel_offloader_copy_to_mem(a_size_handle, 0,
+            a_size, sizeof (*a_size));
+        kernel_offloader_copy_to_mem(a_actual_handle, 0,
+            a_actual, sizeof (*a_actual));
+    }
+
+    kernel_offloader_free(t_size_handle);
+    kernel_offloader_free(t_count_handle);
+    kernel_offloader_free(t_actual_handle);
+    kernel_offloader_free(a_size_handle);
+    kernel_offloader_free(a_count_handle);
+    kernel_offloader_free(a_actual_handle);
+
+    return (translate_rc(rc));
+}
+
+static int
+sw_provider_zero_fill(void *handle, size_t offset, size_t size)
+{
+    return (translate_rc(kernel_offloader_zero_fill(handle, offset, size)));
+}
+
+static int
+sw_provider_all_zeros(void *handle, size_t offset, size_t size)
+{
+    return (translate_rc(kernel_offloader_all_zeros(handle, offset, size)));
+}
+
+static int
+sw_provider_compress(dpusm_compress_t alg,
+    void *src, void *dst, size_t s_len, int level,
+    size_t *d_len)
+{
+    /* buffer that offloader fills out */
+    void *d_len_handle = kernel_offloader_alloc(sizeof (size_t));
+
+    const int kz_rc = kernel_offloader_compress(alg, src, dst, s_len, level,
+        d_len_handle);
+    if (kz_rc == KERNEL_OFFLOADER_OK) {
+        /* get d_len back from offloader */
+        kernel_offloader_copy_to_mem(d_len_handle, 0,
+            d_len, sizeof (*d_len));
+    }
+
+    kernel_offloader_free(d_len_handle);
+
+    return (translate_rc(kz_rc));
+}
+
+static int
+sw_provider_decompress(dpusm_compress_t alg,
+    void *src, void *dst, int level)
+{
+    return (translate_rc(kernel_offloader_decompress(alg, src,
+        dst, level)));
+}
+
+static int
+sw_provider_checksum(dpusm_checksum_t alg,
+    dpusm_checksum_byteorder_t order, void *data, size_t size,
+    void *cksum, size_t cksum_size)
+{
+    /* maybe translate alg and order */
+
+    /* trigger offloader to do actual calculation */
+    return (translate_rc(kernel_offloader_checksum(alg,
+        order, data, size, cksum, cksum_size)));
+}
+
+static int
+sw_provider_raid_can_compute(size_t nparity, size_t ndata,
+    size_t *col_sizes, int rec)
+{
+    if ((nparity < 1) || (nparity > 3)) {
+        return (DPUSM_NOT_SUPPORTED);
+    }
+
+    return (DPUSM_OK);
+}
+
+static int
+sw_provider_raid_gen(void *raid)
+{
+    return (translate_rc(kernel_offloader_raidz_gen(raid)));
+}
+
+static int
+sw_provider_raid_new_parity(void *raid, uint64_t raidn,
+    void **new_parity_cols, size_t *new_parity_sizes)
+{
+    for (uint64_t c =
0; c < raidn; c++) { + /* not every column needs to be reconstructed */ + if (new_parity_sizes[c] == 0) { + continue; + } + + /* allocate a new buffer */ + void *handle = kernel_offloader_alloc(new_parity_sizes[c]); + if (!handle) { + return (DPUSM_ERROR); + } + + /* assign this buffer to column c */ + /* old column c is no longer associated with the raid data */ + if (kernel_offloader_raidz_set_col(raid, c, + handle, new_parity_sizes[c]) != KERNEL_OFFLOADER_OK) { + return (DPUSM_ERROR); + } + + /* send the handle back to DPUSM */ + new_parity_cols[c] = handle; + + /* + * leave assigned handles on error + * DPUSM will clean them up + */ + } + + return (DPUSM_OK); +} + +static int +sw_provider_raid_cmp(void *lhs_handle, void *rhs_handle, int *diff) +{ + return (translate_rc(kernel_offloader_cmp(lhs_handle, + rhs_handle, diff))); +} + +static int +sw_provider_raid_rec(void *raid, int *tgts, int ntgts) +{ + return (translate_rc(kernel_offloader_raidz_rec(raid, + tgts, ntgts))); +} + +static int +sw_provider_file_write(void *fp_handle, void *handle, size_t count, + size_t trailing_zeros, loff_t offset, ssize_t *resid, int *err) +{ + return (translate_rc(kernel_offloader_file_write(fp_handle, + handle, count, trailing_zeros, offset, resid, err))); +} + +/* BEGIN CSTYLED */ +static const char name[] = "zia-software-provider"; +static const dpusm_pf_t sw_provider_functions = { + .algorithms = sw_provider_algorithms, + .alloc = kernel_offloader_alloc, + .alloc_ref = kernel_offloader_alloc_ref, + .get_size = sw_provider_get_size, + .free = kernel_offloader_free, + .copy_from_mem = sw_provider_copy_from_mem, + .copy_to_mem = sw_provider_copy_to_mem, + .mem_stats = sw_provider_mem_stats, + .zero_fill = sw_provider_zero_fill, + .all_zeros = sw_provider_all_zeros, + .compress = sw_provider_compress, + .decompress = sw_provider_decompress, + .checksum = sw_provider_checksum, + .raid = { + .can_compute = sw_provider_raid_can_compute, + .alloc = kernel_offloader_raidz_alloc, + .free = kernel_offloader_raidz_free, + .gen = sw_provider_raid_gen, + .new_parity = sw_provider_raid_new_parity, + .cmp = sw_provider_raid_cmp, + .rec = sw_provider_raid_rec, + }, + .file = { + .open = kernel_offloader_file_open, + .write = sw_provider_file_write, + .close = kernel_offloader_file_close, + }, + .disk = { + .open = kernel_offloader_disk_open, + .invalidate = kernel_offloader_disk_invalidate, + .write = kernel_offloader_disk_write, + .flush = kernel_offloader_disk_flush, + .close = kernel_offloader_disk_close, + }, +}; +/* END CSTYLED */ + +static int __init +sw_provider_init(void) +{ + /* + * this should be a separate kernel module, + * but is here for simplicity + */ + kernel_offloader_init(); + + return (dpusm_register_bsd(name, &sw_provider_functions)); +} + +static void __exit +sw_provider_exit(void) +{ + dpusm_unregister_bsd(name); + + kernel_offloader_fini(); +} + +module_init(sw_provider_init); +module_exit(sw_provider_exit); + +MODULE_LICENSE("CDDL"); diff --git a/rpm/generic/zfs-kmod.spec.in b/rpm/generic/zfs-kmod.spec.in index ae0795427868..db993ddeb4a3 100644 --- a/rpm/generic/zfs-kmod.spec.in +++ b/rpm/generic/zfs-kmod.spec.in @@ -38,6 +38,7 @@ %bcond_with debug %bcond_with debuginfo +%bcond_with zia Name: %{module}-kmod @@ -123,6 +124,12 @@ bash %{SOURCE10} --target %{_target_cpu} %{?repo:--repo %{?repo}} --kmodname %{ %define debuginfo --disable-debuginfo %endif +%if %{with zia} + %define zia --with-zia="%{?DPUSM_ROOT}" +%else + %define zia --without-zia +%endif + # Leverage VPATH from configure to avoid 
making multiple copies. %define _configure ../%{module}-%{version}/configure @@ -143,7 +150,8 @@ for kernel_version in %{?kernel_versions}; do %{debuginfo} \ %{?kernel_cc} \ %{?kernel_ld} \ - %{?kernel_llvm} + %{?kernel_llvm} \ + %{zia} make %{?_smp_mflags} cd .. done diff --git a/rpm/generic/zfs.spec.in b/rpm/generic/zfs.spec.in index 493e93c1f3e6..af894b157caa 100644 --- a/rpm/generic/zfs.spec.in +++ b/rpm/generic/zfs.spec.in @@ -59,6 +59,7 @@ %bcond_with systemd %bcond_with pam %bcond_without pyzfs +%bcond_with zia # Generic enable switch for systemd %if %{with systemd} @@ -381,6 +382,12 @@ support for unlocking datasets on user login. %define pam --disable-pam %endif +%if %{with zia} + %define zia --with-zia="%{DPUSM_ROOT}" +%else + %define zia --without-zia +%endif + %setup -q %build @@ -400,7 +407,8 @@ support for unlocking datasets on user login. %{ubsan} \ %{systemd} \ %{pam} \ - %{pyzfs} + %{pyzfs} \ + %{zia} make %{?_smp_mflags} %install diff --git a/rpm/redhat/zfs-kmod.spec.in b/rpm/redhat/zfs-kmod.spec.in index f59551c0b43a..362c5578d034 100644 --- a/rpm/redhat/zfs-kmod.spec.in +++ b/rpm/redhat/zfs-kmod.spec.in @@ -59,6 +59,12 @@ fi %define debuginfo --disable-debuginfo %endif +%if %{with zia} +%define zia --with-zia="%{?DPUSM_ROOT}" +%else +%define zia --without-zia +%endif + %setup -n %{kmod_name}-%{version} %build %configure \ @@ -69,7 +75,8 @@ fi %{debuginfo} \ %{?kernel_cc} \ %{?kernel_ld} \ - %{?kernel_llvm} + %{?kernel_llvm} \ + %{zia} make %{?_smp_mflags} %install
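
For reference, the new "zia" conditional added above is off by default (%bcond_with zia). A hypothetical invocation to turn it on when rebuilding the kmod package might look like the following, where the DPUSM_ROOT value is only a placeholder for a local DPUSM checkout:

    rpmbuild --rebuild --with zia --define "DPUSM_ROOT /path/to/dpusm" zfs-kmod-<version>.src.rpm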