diff --git a/Makefile.am b/Makefile.am index 54d300e7d40b..11b3489d2ac2 100644 --- a/Makefile.am +++ b/Makefile.am @@ -57,6 +57,8 @@ dist_noinst_DATA += module/os/linux/spl/THIRDPARTYLICENSE.gplv2 dist_noinst_DATA += module/os/linux/spl/THIRDPARTYLICENSE.gplv2.descrip dist_noinst_DATA += module/zfs/THIRDPARTYLICENSE.cityhash dist_noinst_DATA += module/zfs/THIRDPARTYLICENSE.cityhash.descrip +dist_noinst_DATA += module/zfs/THIRDPARTYLICENSE.zia +dist_noinst_DATA += module/zfs/THIRDPARTYLICENSE.zia.descrip @CODE_COVERAGE_RULES@ diff --git a/cmd/raidz_test/raidz_test.c b/cmd/raidz_test/raidz_test.c index 8b21bc098e01..8e2d1e16ed9a 100644 --- a/cmd/raidz_test/raidz_test.c +++ b/cmd/raidz_test/raidz_test.c @@ -455,6 +455,9 @@ vdev_raidz_map_alloc_expanded(abd_t *abd, uint64_t size, uint64_t offset, rr->rr_firstdatacol = nparity; rr->rr_abd_empty = NULL; rr->rr_nempty = 0; +#ifdef ZIA + rr->rr_zia_handle = NULL; +#endif for (int c = 0; c < rr->rr_cols; c++, child_id++) { if (child_id >= row_phys_cols) { diff --git a/config/Rules.am b/config/Rules.am index 7162b771869d..86911a5272c4 100644 --- a/config/Rules.am +++ b/config/Rules.am @@ -40,6 +40,7 @@ AM_CPPFLAGS += -DPKGDATADIR=\"$(pkgdatadir)\" AM_CPPFLAGS += $(DEBUG_CPPFLAGS) AM_CPPFLAGS += $(CODE_COVERAGE_CPPFLAGS) AM_CPPFLAGS += -DTEXT_DOMAIN=\"zfs-@ac_system_l@-user\" +AM_CPPFLAGS += $(ZIA_CPPFLAGS) AM_CPPFLAGS_NOCHECK = -D"strtok(...)=strtok(__VA_ARGS__) __attribute__((deprecated(\"Use strtok_r(3) instead!\")))" AM_CPPFLAGS_NOCHECK += -D"__xpg_basename(...)=__xpg_basename(__VA_ARGS__) __attribute__((deprecated(\"basename(3) is underspecified. Use zfs_basename() instead!\")))" diff --git a/config/zfs-build.m4 b/config/zfs-build.m4 index d14a6bb7ac9f..ff0468b3c893 100644 --- a/config/zfs-build.m4 +++ b/config/zfs-build.m4 @@ -260,6 +260,8 @@ AC_DEFUN([ZFS_AC_CONFIG], [ AC_SUBST(TEST_JOBS) ]) + ZFS_AC_ZIA + ZFS_INIT_SYSV= ZFS_INIT_SYSTEMD= ZFS_WANT_MODULES_LOAD_D= @@ -291,7 +293,8 @@ AC_DEFUN([ZFS_AC_CONFIG], [ [test "x$qatsrc" != x ]) AM_CONDITIONAL([WANT_DEVNAME2DEVID], [test "x$user_libudev" = xyes ]) AM_CONDITIONAL([WANT_MMAP_LIBAIO], [test "x$user_libaio" = xyes ]) - AM_CONDITIONAL([PAM_ZFS_ENABLED], [test "x$enable_pam" = xyes]) + AM_CONDITIONAL([PAM_ZFS_ENABLED], [test "x$enable_pam" = xyes ]) + AM_CONDITIONAL([ZIA_ENABLED], [test "x$enable_zia" = xyes ]) ]) dnl # @@ -334,6 +337,7 @@ AC_DEFUN([ZFS_AC_RPM], [ RPM_DEFINE_COMMON=${RPM_DEFINE_COMMON}' --define "$(DEBUG_KMEM_TRACKING_ZFS) 1"' RPM_DEFINE_COMMON=${RPM_DEFINE_COMMON}' --define "$(ASAN_ZFS) 1"' RPM_DEFINE_COMMON=${RPM_DEFINE_COMMON}' --define "$(UBSAN_ZFS) 1"' + RPM_DEFINE_COMMON=${RPM_DEFINE_COMMON}' --define "$(WITH_ZIA) 1" --define "DPUSM_ROOT $(DPUSM_ROOT)"' AS_IF([test "x$enable_debuginfo" = xyes], [ RPM_DEFINE_COMMON=${RPM_DEFINE_COMMON}' --define "__strip /bin/true"' diff --git a/config/zia.m4 b/config/zia.m4 new file mode 100644 index 000000000000..8ec2de1f466d --- /dev/null +++ b/config/zia.m4 @@ -0,0 +1,42 @@ +dnl # Adds --with-zia=PATH to configuration options +dnl # The path provided should point to the DPUSM +dnl # root and contain Module.symvers. +AC_DEFUN([ZFS_AC_ZIA], [ + AC_ARG_WITH([zia], + AS_HELP_STRING([--with-zia=PATH], + [Path to Data Processing Services Module]), + [ + DPUSM_ROOT="$withval" + enable_zia=yes + ] + ) + + AS_IF([test "x$enable_zia" == "xyes"], + AS_IF([! 
test -d "$DPUSM_ROOT"], + [AC_MSG_ERROR([--with-zia=PATH requires the DPUSM root directory])] + ) + + DPUSM_SYMBOLS="$DPUSM_ROOT/Module.symvers" + + AS_IF([test -r $DPUSM_SYMBOLS], + [ + AC_MSG_RESULT([$DPUSM_SYMBOLS]) + ZIA_CPPFLAGS="-DZIA=1 -I$DPUSM_ROOT/include" + KERNEL_ZIA_CPPFLAGS="-DZIA=1 -I$DPUSM_ROOT/include" + WITH_ZIA="_with_zia" + + AC_SUBST(WITH_ZIA) + AC_SUBST(KERNEL_ZIA_CPPFLAGS) + AC_SUBST(ZIA_CPPFLAGS) + AC_SUBST(DPUSM_SYMBOLS) + AC_SUBST(DPUSM_ROOT) + ], + [ + AC_MSG_ERROR([ + *** Failed to find Module.symvers in: + $DPUSM_SYMBOLS + ]) + ] + ) + ) +]) diff --git a/include/sys/abd.h b/include/sys/abd.h index 5c6bd0c271d4..2d29b712d6bf 100644 --- a/include/sys/abd.h +++ b/include/sys/abd.h @@ -75,6 +75,9 @@ typedef struct abd { list_t abd_gang_chain; } abd_gang; } abd_u; +#ifdef ZIA + void *abd_zia_handle; +#endif } abd_t; typedef int abd_iter_func_t(void *buf, size_t len, void *priv); diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index f013e6b20603..e735079b6b6e 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -251,6 +251,20 @@ typedef enum { ZPOOL_PROP_LOAD_GUID, ZPOOL_PROP_AUTOTRIM, ZPOOL_PROP_COMPATIBILITY, +#ifdef ZIA + ZPOOL_PROP_ZIA_PROVIDER, + ZPOOL_PROP_ZIA_COMPRESS, + ZPOOL_PROP_ZIA_DECOMPRESS, + ZPOOL_PROP_ZIA_CHECKSUM, + ZPOOL_PROP_ZIA_RAIDZ1_GEN, + ZPOOL_PROP_ZIA_RAIDZ2_GEN, + ZPOOL_PROP_ZIA_RAIDZ3_GEN, + ZPOOL_PROP_ZIA_RAIDZ1_REC, + ZPOOL_PROP_ZIA_RAIDZ2_REC, + ZPOOL_PROP_ZIA_RAIDZ3_REC, + ZPOOL_PROP_ZIA_FILE_WRITE, + ZPOOL_PROP_ZIA_DISK_WRITE, +#endif ZPOOL_NUM_PROPS } zpool_prop_t; diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 9946c4e3c316..37b8e20150d1 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -53,6 +53,10 @@ #include #include +#ifdef ZIA +#include +#endif + #ifdef __cplusplus extern "C" { #endif @@ -441,6 +445,10 @@ struct spa { zfs_refcount_t spa_refcount; /* number of opens */ taskq_t *spa_upgrade_taskq; /* taskq for upgrade jobs */ + +#ifdef ZIA + zia_props_t spa_zia_props; +#endif }; extern char *spa_config_path; diff --git a/include/sys/vdev_disk.h b/include/sys/vdev_disk.h index a7e19fbf0c4b..d0241b9da9b6 100644 --- a/include/sys/vdev_disk.h +++ b/include/sys/vdev_disk.h @@ -42,5 +42,12 @@ #ifdef _KERNEL #include + +#ifdef ZIA +int __vdev_disk_physio(struct block_device *bdev, zio_t *zio, + size_t io_size, uint64_t io_offset, int rw, int flags); +int vdev_disk_io_flush(struct block_device *bdev, zio_t *zio); +void vdev_disk_error(zio_t *zio); +#endif /* ZIA */ #endif /* _KERNEL */ #endif /* _SYS_VDEV_DISK_H */ diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index db8fbdeb06df..6f34b6dc26cd 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -465,6 +465,10 @@ struct vdev { zfs_ratelimit_t vdev_delay_rl; zfs_ratelimit_t vdev_deadman_rl; zfs_ratelimit_t vdev_checksum_rl; + +#ifdef ZIA + void *vdev_zia_handle; +#endif }; #define VDEV_PAD_SIZE (8 << 10) diff --git a/include/sys/vdev_raidz.h b/include/sys/vdev_raidz.h index c7cf0af6d945..dd0c9042c2b6 100644 --- a/include/sys/vdev_raidz.h +++ b/include/sys/vdev_raidz.h @@ -70,6 +70,13 @@ typedef struct vdev_raidz { int vd_nparity; } vdev_raidz_t; +#ifdef ZIA +void vdev_raidz_generate_parity_p(struct raidz_row *); +void vdev_raidz_generate_parity_pq(struct raidz_row *); +void vdev_raidz_generate_parity_pqr(struct raidz_row *); +void vdev_raidz_reconstruct_general(struct raidz_row *, int *, int); +#endif + #ifdef __cplusplus } #endif diff --git a/include/sys/vdev_raidz_impl.h 
b/include/sys/vdev_raidz_impl.h index 890e725e18d8..fcee8b8ccf89 100644 --- a/include/sys/vdev_raidz_impl.h +++ b/include/sys/vdev_raidz_impl.h @@ -129,6 +129,9 @@ typedef struct raidz_row { #ifdef ZFS_DEBUG uint64_t rr_offset; /* Logical offset for *_io_verify() */ uint64_t rr_size; /* Physical size for *_io_verify() */ +#endif +#ifdef ZIA + void *rr_zia_handle; #endif raidz_col_t rr_col[0]; /* Flexible array of I/O columns */ } raidz_row_t; diff --git a/include/sys/zia.h b/include/sys/zia.h new file mode 100644 index 000000000000..2d0b33f1cdb2 --- /dev/null +++ b/include/sys/zia.h @@ -0,0 +1,197 @@ +/* + * © 2021. Triad National Security, LLC. All rights reserved. + * + * This program was produced under U.S. Government contract + * 89233218CNA000001 for Los Alamos National Laboratory (LANL), which + * is operated by Triad National Security, LLC for the U.S. + * Department of Energy/National Nuclear Security Administration. All + * rights in the program are reserved by Triad National Security, LLC, + * and the U.S. Department of Energy/National Nuclear Security + * Administration. The Government is granted for itself and others + * acting on its behalf a nonexclusive, paid-up, irrevocable worldwide + * license in this material to reproduce, prepare derivative works, + * distribute copies to the public, perform publicly and display + * publicly, and to permit others to do so. + * + * ---- + * + * This program is open source under the BSD-3 License. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#ifdef ZIA + +#ifndef _ZIA_H +#define _ZIA_H + +#include +#include /* VDEV_RAIDZ_MAXPARITY */ +#include +#include +#include +#include +#include + +typedef struct raidz_row raidz_row_t; + +/* ******************************************************** */ +/* return values */ +#define ZIA_OK 0 + +/* something bad happened not related to missing functionality */ +#define ZIA_ERROR 1 + +/* error, fallback to zfs implementation */ +#define ZIA_FALLBACK 2 + +/* ran, but result is bad */ +#define ZIA_BAD_RESULT 3 + +/* expected provider and actual provider do not match */ +#define ZIA_PROVIDER_MISMATCH 4 +/* ******************************************************** */ + +/* + * This struct is normally set with + * zpool set zia_=on/off/ + * and passed around in spa_t. + */ +typedef struct zia_props { + void *provider; + + /* minimum size allowed to offload - set by ashift */ + size_t min_offload_size; + + int compress; + int decompress; + + int checksum; + + struct { + int gen[VDEV_RAIDZ_MAXPARITY + 1]; + int rec[VDEV_RAIDZ_MAXPARITY + 1]; + } raidz; + + int file_write; + int disk_write; +} zia_props_t; + +zia_props_t *zia_get_props(spa_t *spa); +void zia_prop_warn(boolean_t val, const char *name); + +int zia_init(void); +int zia_fini(void); + +void *zia_get_provider(const char *name); +const char *zia_get_provider_name(void *provider); +int zia_put_provider(void **provider); + +/* check if offloading can occur */ +boolean_t zia_is_used(zio_t *zio); + +/* + * check if a handle is associated with this pointer + * + * not exposing functions for different handles because + * only abd handles are checked outside of zia.c + */ +boolean_t zia_is_offloaded(abd_t *abd); + +/* create a new offloader handle without copying data */ +void *zia_alloc(void *provider, size_t size, size_t min_offload_size); + +/* deallocate handle without onloading */ +void zia_free(void **handle); + +/* move linear data between from the offloader to memory */ +int zia_onload(void **handle, void *buf, size_t size); + +/* calls abd_iterate_func on the abd to copy abd data back and forth */ +int zia_offload_abd(void *provider, abd_t *abd, + size_t size, size_t min_offload_size, boolean_t *local_offload); +int zia_onload_abd(abd_t *abd, size_t size, boolean_t keep_handle); +/* move a handle into an abd */ +void zia_move_into_abd(abd_t *dst, void **src); +int zia_free_abd(abd_t *abd, boolean_t lock); + +/* + * if offloaded locally, just free the handle + * if not, onload the data and free the handle + */ +int zia_cleanup_abd(abd_t *abd, size_t size, boolean_t local_offload); + +/* fill a buffer with zeros */ +int zia_zero_fill(abd_t *abd, size_t offset, size_t size); + +int zia_compress(void *provider, zio_t *zio, size_t s_len, + enum zio_compress c, uint8_t level, void **cbuf_handle, + uint64_t *c_len, boolean_t *local_offload); + +int zia_checksum_compute(void *provider, zio_cksum_t *dst, + enum zio_checksum alg, zio_t *zio, uint64_t size, + boolean_t *local_offload); +int zia_checksum_error(const blkptr_t *bp, enum zio_checksum alg, + abd_t *abd, uint64_t size, zio_bad_cksum_t *info); + +/* raidz */ +int zia_raidz_alloc(zio_t *zio, raidz_row_t *rr, boolean_t rec, + uint_t cksum, boolean_t *local_offload); +int zia_raidz_free(raidz_row_t *rr, boolean_t onload_parity); +int zia_raidz_gen(raidz_row_t *rr); +int zia_raidz_gen_cleanup(zio_t *zio, raidz_row_t *rr, + boolean_t local_offload); +int zia_raidz_new_parity(zio_t *zio, raidz_row_t *rr, abd_t **orig); +/* compare the contents of offloaded abds (only used in resilver) 
*/ +int zia_raidz_cmp(abd_t *lhs, abd_t *rhs, int *diff); +int zia_raidz_rec(raidz_row_t *rr, int *t, int nt); +int zia_raidz_rec_cleanup(zio_t *zio, raidz_row_t *rr, + boolean_t local_offload, boolean_t onload_parity); + +/* file I/O */ +int zia_file_open(vdev_t *vdev, const char *path, + int flags, int mode); +int zia_file_write(vdev_t *vdev, abd_t *abd, ssize_t size, + loff_t offset, ssize_t *resid, int *err); +int zia_file_close(vdev_t *vdev); + +#ifdef _KERNEL +#include + +/* disk I/O */ +int zia_disk_open(vdev_t *vdev, const char *path, + struct block_device *bdev); +int zia_disk_invalidate(vdev_t *vdev); +int zia_disk_write(vdev_t *vdev, zio_t *zio, + size_t io_size, uint64_t io_offset, int flags); +int zia_disk_flush(vdev_t *vdev, zio_t *zio); +int zia_disk_close(vdev_t *vdev); +#endif + +#endif + +#endif diff --git a/include/sys/zia_cddl.h b/include/sys/zia_cddl.h new file mode 100644 index 000000000000..74e88abbeae5 --- /dev/null +++ b/include/sys/zia_cddl.h @@ -0,0 +1,58 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifdef ZIA + +#ifndef _ZIA_CDDL_H +#define _ZIA_CDDL_H + +#include +#include +#include +#include +#include + +#include + +int +zia_compress_impl(const dpusm_uf_t *dpusm, void *provider, + zio_t *zio, size_t s_len, enum zio_compress c, uint8_t level, + void **cbuf_handle, uint64_t *c_len, boolean_t *local_offload); + +int +zia_checksum_error_impl(const dpusm_uf_t *dpusm, const blkptr_t *bp, + enum zio_checksum alg, abd_t *abd, uint64_t size, zio_bad_cksum_t *info); + +int +zia_raidz_rec_impl(const dpusm_uf_t *dpusm, + raidz_row_t *rr, int *t, int nt); + +#ifdef _KERNEL +void +zia_disk_write_completion(void *zio_ptr, int error); + +void +zia_disk_flush_completion(void *zio_ptr, int error); +#endif + +#endif + +#endif diff --git a/include/sys/zia_private.h b/include/sys/zia_private.h new file mode 100644 index 000000000000..121f97c9e31c --- /dev/null +++ b/include/sys/zia_private.h @@ -0,0 +1,78 @@ +/* + * © 2021. Triad National Security, LLC. All rights reserved. + * + * This program was produced under U.S. Government contract + * 89233218CNA000001 for Los Alamos National Laboratory (LANL), which + * is operated by Triad National Security, LLC for the U.S. + * Department of Energy/National Nuclear Security Administration. All + * rights in the program are reserved by Triad National Security, LLC, + * and the U.S. Department of Energy/National Nuclear Security + * Administration. The Government is granted for itself and others + * acting on its behalf a nonexclusive, paid-up, irrevocable worldwide + * license in this material to reproduce, prepare derivative works, + * distribute copies to the public, perform publicly and display + * publicly, and to permit others to do so. 
+ * + * ---- + * + * This program is open source under the BSD-3 License. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifdef ZIA + +#ifndef _ZIA_PRIVATE_H +#define _ZIA_PRIVATE_H + +#include +#include +#include + +/* + * needed by both zia.h and zia_cddl.h + * defined in zia.c + */ + +#define ABD_HANDLE(abd) (abd)->abd_zia_handle + +#define VDEV_HANDLE(vdev) (vdev)->vdev_zia_handle + +dpusm_compress_t +translate_compress(enum zio_compress c); + +dpusm_checksum_t +translate_checksum(enum zio_checksum c); + +dpusm_checksum_byteorder_t +translate_byteorder(zio_byteorder_t bo); + +int zia_get_capabilities(void *provider, dpusm_pc_t **caps); + +#endif + +#endif diff --git a/include/sys/zio.h b/include/sys/zio.h index 9bee7cc9b9fd..238f10942be3 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -350,6 +350,9 @@ typedef struct zio_prop { uint8_t zp_iv[ZIO_DATA_IV_LEN]; uint8_t zp_mac[ZIO_DATA_MAC_LEN]; uint32_t zp_zpl_smallblk; +#ifdef ZIA + boolean_t zp_ismd; +#endif } zio_prop_t; typedef struct zio_cksum_report zio_cksum_report_t; diff --git a/include/sys/zio_compress.h b/include/sys/zio_compress.h index 26600b43bb49..fe50829e984a 100644 --- a/include/sys/zio_compress.h +++ b/include/sys/zio_compress.h @@ -154,6 +154,10 @@ typedef const struct zio_compress_info { extern const zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS]; +#ifdef ZIA +extern int zio_compress_zeroed_cb(void *data, size_t len, void *private); +#endif + /* * lz4 compression init & free */ diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index eaa920e56106..711e2f0afe76 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -179,6 +179,8 @@ nodist_libzpool_la_SOURCES = \ module/zfs/zfs_rlock.c \ module/zfs/zfs_sa.c \ module/zfs/zil.c \ + module/zfs/zia.c \ + module/zfs/zia_cddl.c \ module/zfs/zio.c \ module/zfs/zio_checksum.c \ module/zfs/zio_compress.c \ diff --git a/man/man7/zpoolprops.7 b/man/man7/zpoolprops.7 index 944fbf2b8d29..b7f65a6840ee 100644 --- a/man/man7/zpoolprops.7 +++ b/man/man7/zpoolprops.7 @@ 
-414,4 +414,41 @@ command, though this property can be used when a specific version is needed for backwards compatibility. Once feature flags are enabled on a pool this property will no longer have a value. + +.It Sy zia_checksum Ns = Ns Sy on Ns | Ns Sy off +Controls whether the pool should offload checksum computations. Does not have any effect if the checksum stage is disabled. Embedded checksums are onloaded, and will suffer a data movement penalty. + +.It Sy zia_compress Ns = Ns Sy on Ns | Ns Sy off +Controls whether the pool should offload compression. Does not have any effect if the compression stage is disabled. Embedded data is onloaded, and will suffer a data movement penalty. + +.It Sy zia_decompress Ns = Ns Sy on Ns | Ns Sy off +Controls whether the pool should offload decompression. Does not have any effect if the decompression stage is disabled. + +.It Sy zia_disk_write Ns = Ns Sy on Ns | Ns Sy off +Controls whether the pool should offload write I/Os to disks. + +.It Sy zia_file_write Ns = Ns Sy on Ns | Ns Sy off +Controls whether the pool should offload write I/Os to files. + +.It Sy zia_provider Ns = Ns Sy (unset)| Ns Sy Z.I.A. Provider Name +Selects an accelerator registered in the Data Processing Unit Services Module to offload data to. Only one accelerator can be used by a pool at a time. + +.It Sy zia_raidz1_gen Ns = Ns Sy on Ns | Ns Sy off +Controls whether the pool should offload RAIDZ1 parity generation. Does not have any effect if RAIDZ1 is disabled. + +.It Sy zia_raidz1_rec Ns = Ns Sy on Ns | Ns Sy off +Controls whether the pool should offload RAIDZ1 reconstruction. Does not have any effect if RAIDZ1 is disabled. + +.It Sy zia_raidz2_gen Ns = Ns Sy on Ns | Ns Sy off +Controls whether the pool should offload RAIDZ2 parity generation. Does not have any effect if RAIDZ2 is disabled. + +.It Sy zia_raidz2_rec Ns = Ns Sy on Ns | Ns Sy off +Controls whether the pool should offload RAIDZ2 reconstruction. Does not have any effect if RAIDZ2 is disabled. + +.It Sy zia_raidz3_gen Ns = Ns Sy on Ns | Ns Sy off +Controls whether the pool should offload RAIDZ3 parity generation. Does not have any effect if RAIDZ3 is disabled. + +.It Sy zia_raidz3_rec Ns = Ns Sy on Ns | Ns Sy off +Controls whether the pool should offload RAIDZ3 reconstruction. Does not have any effect if RAIDZ3 is disabled. + .El diff --git a/module/Kbuild.in b/module/Kbuild.in index 4803952cbfed..e606ffa30ee2 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -27,6 +27,7 @@ ZFS_MODULE_CFLAGS += -I$(zfs_include)/os/linux/zfs ZFS_MODULE_CFLAGS += -I$(zfs_include) ZFS_MODULE_CPPFLAGS += -D_KERNEL ZFS_MODULE_CPPFLAGS += @KERNEL_DEBUG_CPPFLAGS@ +ZFS_MODULE_CPPFLAGS += @KERNEL_ZIA_CPPFLAGS@ # KASAN enables -Werror=frame-larger-than=1024, which # breaks oh so many parts of our build.
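Editor's note: the zia_* pool properties documented above map directly onto the fields of zia_props_t declared in include/sys/zia.h, and every offload site in this patch follows the same convention: attempt the offload only when the relevant property is enabled and a provider is bound, and fall back to the stock software path on any result other than ZIA_OK. A minimal sketch of that convention follows; zia_get_props(), zia_checksum_compute(), zia_cleanup_abd() and ZIA_OK are from this patch, while example_checksum() and sw_fletcher_4() are hypothetical names used only for illustration.

/*
 * Sketch only, not part of the patch: how a caller honors the
 * zia_checksum property and falls back to software on failure.
 */
#include <sys/zio.h>
#include <sys/zia.h>

static void
example_checksum(zio_t *zio, zio_cksum_t *cksum)
{
	zia_props_t *props = zia_get_props(zio->io_spa);
	boolean_t local_offload = B_FALSE;

	/* property on and a provider selected via "zpool set zia_provider=..." */
	if (props->checksum && props->provider != NULL &&
	    zia_checksum_compute(props->provider, cksum,
	    ZIO_CHECKSUM_FLETCHER_4, zio, zio->io_size,
	    &local_offload) == ZIA_OK)
		return;

	/* onload (or drop) any partial offload, then use the software path */
	(void) zia_cleanup_abd(zio->io_abd, zio->io_size, local_offload);
	sw_fletcher_4(zio->io_abd, zio->io_size, cksum);	/* hypothetical */
}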
@@ -401,6 +402,8 @@ ZFS_OBJS := \ zfs_sa.o \ zfs_vnops.o \ zil.o \ + zia.o \ + zia_cddl.o \ zio.o \ zio_checksum.o \ zio_compress.o \ @@ -472,3 +475,19 @@ OBJECT_FILES_NON_STANDARD_vdev_raidz_math_avx512f.o := y ifeq ($(CONFIG_ALTIVEC),y) $(obj)/zfs/vdev_raidz_math_powerpc_altivec.o : c_flags += -maltivec endif + +ifneq ("@DPUSM_SYMBOLS@","") +obj-$(CONFIG_ZFS) += zia-software-provider.o + +ZIA_SOFTWARE_PROVIDER_OBJS := \ + software.o \ + kernel_offloader.o + +zia-software-provider-objs += $(addprefix zia-software-provider/,$(ZIA_SOFTWARE_PROVIDER_OBJS)) +# zfs_file_os does not have any dependencies, so just link to it directly +zia-software-provider-objs += os/linux/zfs/zfs_file_os.o + +$(addprefix $(obj)/zia-software-provider/,$(ZIA_SOFTWARE_PROVIDER_OBJS)) : ccflags-y += -I@abs_top_builddir@ $(ZFS_MODULE_CFLAGS) -I@abs_srcdir@/zia-software-provider/ -I@DPUSM_ROOT@/include + +@ZIA_ENABLED_TRUE@KBUILD_EXTRA_SYMBOLS += @DPUSM_SYMBOLS@ +endif \ No newline at end of file diff --git a/module/Makefile.in b/module/Makefile.in index 5b71e1abf79e..465384413fb5 100644 --- a/module/Makefile.in +++ b/module/Makefile.in @@ -80,7 +80,7 @@ clean: clean-@ac_system@ .PHONY: modules_uninstall-Linux-legacy modules_uninstall-Linux-legacy: - $(RM) -r $(addprefix $(KMODDIR)/$(INSTALL_MOD_DIR)/,spl/ avl/ icp/ lua/ nvpair/ unicode/ zcommon/ zfs/ zstd/) + $(RM) -r $(addprefix $(KMODDIR)/$(INSTALL_MOD_DIR)/,spl/ avl/ icp/ lua/ nvpair/ unicode/ zcommon/ zfs/ zstd/ zia-software-provider) KMODDIR := $(INSTALL_MOD_PATH)/lib/modules/@LINUX_VERSION@ modules_install-Linux: modules_uninstall-Linux-legacy @@ -123,7 +123,7 @@ data_install: data_install-@ac_system@ modules_uninstall-Linux: modules_uninstall-Linux-legacy @# Uninstall the kernel modules - $(RM) $(addprefix $(KMODDIR)/$(INSTALL_MOD_DIR)/,zfs.ko spl.ko) + $(RM) $(addprefix $(KMODDIR)/$(INSTALL_MOD_DIR)/,zfs.ko spl.ko zia-software-provider) modules_uninstall-FreeBSD: @false @@ -153,7 +153,7 @@ cppcheck-Linux: -I @top_srcdir@/include/os/linux/spl \ -I @top_srcdir@/include/os/linux/zfs \ -I @top_srcdir@/include \ - avl icp lua nvpair unicode zcommon zfs zstd os/linux + avl icp lua nvpair unicode zcommon zfs zstd os/linux zia-software-provider cppcheck-FreeBSD: @true diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 9a382261df73..b88a126be002 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -41,6 +41,10 @@ #include #endif +#ifdef ZIA +#include +#endif + typedef struct vdev_disk { struct block_device *vd_bdev; krwlock_t vd_lock; @@ -154,7 +158,11 @@ bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk) return (psize); } +#ifndef ZIA static void +#else +void +#endif vdev_disk_error(zio_t *zio) { /* @@ -224,6 +232,9 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, reread_part = B_TRUE; } +#ifdef ZIA + zia_disk_close(v); +#endif blkdev_put(bdev, mode | FMODE_EXCL); } @@ -335,6 +346,13 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, *logical_ashift = highbit64(MAX(logical_block_size, SPA_MINBLOCKSIZE)) - 1; +#ifdef ZIA + zia_get_props(v->vdev_spa)->min_offload_size = 2 << *physical_ashift; + + /* open disk; ignore errors - will fall back to ZFS */ + zia_disk_open(v, v->vdev_path, vd->vd_bdev); +#endif + return (0); } @@ -347,6 +365,9 @@ vdev_disk_close(vdev_t *v) return; if (vd->vd_bdev != NULL) { +#ifdef ZIA + zia_disk_close(v); +#endif blkdev_put(vd->vd_bdev, vdev_bdev_mode(spa_mode(v->vdev_spa)) | FMODE_EXCL); } @@ -602,7 +623,11 @@ 
vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset) #endif } +#ifndef ZIA static int +#else +int +#endif __vdev_disk_physio(struct block_device *bdev, zio_t *zio, size_t io_size, uint64_t io_offset, int rw, int flags) { @@ -709,6 +734,10 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, return (error); } +#ifdef ZIA +EXPORT_SYMBOL(__vdev_disk_physio); +#endif + BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error) { zio_t *zio = bio->bi_private; @@ -728,7 +757,11 @@ BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error) zio_interrupt(zio); } +#ifndef ZIA static int +#else +int +#endif vdev_disk_io_flush(struct block_device *bdev, zio_t *zio) { struct request_queue *q; @@ -751,6 +784,10 @@ vdev_disk_io_flush(struct block_device *bdev, zio_t *zio) return (0); } +#ifdef ZIA +EXPORT_SYMBOL(vdev_disk_io_flush); +#endif + static int vdev_disk_io_trim(zio_t *zio) { @@ -829,6 +866,19 @@ vdev_disk_io_start(zio_t *zio) break; } +#ifdef ZIA + error = zia_disk_flush(v, zio); + + /* + * have to return here in order to not dispatch + * this zio to multiple task queues + */ + if (error == 0) { + rw_exit(&vd->vd_lock); + return; + } +#endif + error = vdev_disk_io_flush(vd->vd_bdev, zio); if (error == 0) { rw_exit(&vd->vd_lock); @@ -868,8 +918,29 @@ vdev_disk_io_start(zio_t *zio) } zio->io_target_timestamp = zio_handle_io_delay(zio); + +#ifdef ZIA + error = EIO; + boolean_t local_offload = B_FALSE; + zia_props_t *zia_props = zia_get_props(zio->io_spa); + if ((rw == WRITE) && (zia_props->disk_write == 1)) { + if (zia_offload_abd(zia_props->provider, zio->io_abd, + zio->io_size, zia_props->min_offload_size, + &local_offload) == ZIA_OK) { + error = zia_disk_write(v, zio, zio->io_size, + zio->io_offset, 0); + } + } + + if (error != 0) { + zia_cleanup_abd(zio->io_abd, zio->io_size, local_offload); +#endif error = __vdev_disk_physio(vd->vd_bdev, zio, zio->io_size, zio->io_offset, rw, 0); +#ifdef ZIA + } +#endif + rw_exit(&vd->vd_lock); if (error) { @@ -892,6 +963,9 @@ vdev_disk_io_done(zio_t *zio) vdev_disk_t *vd = v->vdev_tsd; if (zfs_check_media_change(vd->vd_bdev)) { +#ifdef ZIA + zia_disk_invalidate(v); +#endif invalidate_bdev(vd->vd_bdev); v->vdev_remove_wanted = B_TRUE; spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); diff --git a/module/os/linux/zfs/vdev_file.c b/module/os/linux/zfs/vdev_file.c index f073145326e3..b615a2117970 100644 --- a/module/os/linux/zfs/vdev_file.c +++ b/module/os/linux/zfs/vdev_file.c @@ -39,6 +39,11 @@ #ifdef _KERNEL #include #endif + +#ifdef ZIA +#include +#endif + /* * Virtual device vector for files. 
*/ @@ -161,6 +166,14 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, } #endif +#ifdef ZIA + zia_get_props(vd->vdev_spa)->min_offload_size = 2 << *physical_ashift; + + /* try to open the file; ignore errors - will fall back to ZFS */ + zia_file_open(vd, vd->vdev_path, + vdev_file_open_mode(spa_mode(vd->vdev_spa)), 0); +#endif + skip_open: error = zfs_file_getattr(vf->vf_file, &zfa); @@ -184,6 +197,10 @@ vdev_file_close(vdev_t *vd) if (vd->vdev_reopening || vf == NULL) return; +#ifdef ZIA + zia_file_close(vd); +#endif + if (vf->vf_file != NULL) { (void) zfs_file_close(vf->vf_file); } @@ -203,20 +220,37 @@ vdev_file_io_strategy(void *arg) void *buf; loff_t off; ssize_t size; - int err; + int err = 0; off = zio->io_offset; size = zio->io_size; resid = 0; if (zio->io_type == ZIO_TYPE_READ) { - buf = abd_borrow_buf(zio->io_abd, zio->io_size); + buf = abd_borrow_buf(zio->io_abd, size); err = zfs_file_pread(vf->vf_file, buf, size, off, &resid); abd_return_buf_copy(zio->io_abd, buf, size); } else { - buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size); +#ifdef ZIA + boolean_t local_offload = B_FALSE; + zia_props_t *zia_props = zia_get_props(zio->io_spa); + if (zia_get_props(zio->io_spa)->file_write == 1) { + zia_offload_abd(zia_props->provider, zio->io_abd, + size, zia_props->min_offload_size, &local_offload); + + err = zia_file_write(vd, zio->io_abd, + size, off, &resid, &err); + } + + if (err != 0) { + zia_cleanup_abd(zio->io_abd, size, local_offload); +#endif + buf = abd_borrow_buf_copy(zio->io_abd, size); err = zfs_file_pwrite(vf->vf_file, buf, size, off, &resid); abd_return_buf(zio->io_abd, buf, size); +#ifdef ZIA + } +#endif } zio->io_error = err; if (resid != 0 && zio->io_error == 0) diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c index 853476a1fc16..94f5305c3a95 100644 --- a/module/zcommon/zpool_prop.c +++ b/module/zcommon/zpool_prop.c @@ -169,6 +169,45 @@ zpool_prop_init(void) PROP_TYPE_NUMBER, PROP_DEFAULT, ZFS_TYPE_POOL, "DEDUPDITTO", B_FALSE, sfeatures); +#ifdef ZIA + zprop_register_string(ZPOOL_PROP_ZIA_PROVIDER, "zia_provider", NULL, + PROP_DEFAULT, ZFS_TYPE_POOL, "", "PROVIDER", + sfeatures); + zprop_register_index(ZPOOL_PROP_ZIA_COMPRESS, "zia_compress", + 1, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", + "zia_compress", boolean_table, sfeatures); + zprop_register_index(ZPOOL_PROP_ZIA_DECOMPRESS, "zia_decompress", + 1, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", + "zia_decompress", boolean_table, sfeatures); + zprop_register_index(ZPOOL_PROP_ZIA_CHECKSUM, + "zia_checksum", 1, PROP_DEFAULT, ZFS_TYPE_POOL, + "on | off", "zia_checksum", boolean_table, sfeatures); + zprop_register_index(ZPOOL_PROP_ZIA_RAIDZ1_GEN, "zia_raidz1_gen", + 1, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", + "zia_raidz1_gen", boolean_table, sfeatures); + zprop_register_index(ZPOOL_PROP_ZIA_RAIDZ2_GEN, "zia_raidz2_gen", + 1, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", + "zia_raidz2_gen", boolean_table, sfeatures); + zprop_register_index(ZPOOL_PROP_ZIA_RAIDZ3_GEN, "zia_raidz3_gen", + 1, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", + "zia_raidz3_gen", boolean_table, sfeatures); + zprop_register_index(ZPOOL_PROP_ZIA_RAIDZ1_REC, "zia_raidz1_rec", + 1, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", + "zia_raidz1_rec", boolean_table, sfeatures); + zprop_register_index(ZPOOL_PROP_ZIA_RAIDZ2_REC, "zia_raidz2_rec", + 1, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", + "zia_raidz2_rec", boolean_table, sfeatures); + zprop_register_index(ZPOOL_PROP_ZIA_RAIDZ3_REC, "zia_raidz3_rec", + 1, PROP_DEFAULT, 
ZFS_TYPE_POOL, "on | off", + "zia_raidz3_rec", boolean_table, sfeatures); + zprop_register_index(ZPOOL_PROP_ZIA_FILE_WRITE, "zia_file_write", + 1, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", + "zia_file_write", boolean_table, sfeatures); + zprop_register_index(ZPOOL_PROP_ZIA_DISK_WRITE, "zia_disk_write", + 1, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", + "zia_disk_write", boolean_table, sfeatures); +#endif + zfs_mod_list_supported_free(sfeatures); } diff --git a/module/zfs/THIRDPARTYLICENSE.zia b/module/zfs/THIRDPARTYLICENSE.zia new file mode 100644 index 000000000000..9f81923f051d --- /dev/null +++ b/module/zfs/THIRDPARTYLICENSE.zia @@ -0,0 +1,42 @@ +© 2021. Triad National Security, LLC. All rights reserved. + +This program was produced under U.S. Government contract +89233218CNA000001 for Los Alamos National Laboratory (LANL), which +is operated by Triad National Security, LLC for the U.S. +Department of Energy/National Nuclear Security Administration. All +rights in the program are reserved by Triad National Security, LLC, +and the U.S. Department of Energy/National Nuclear Security +Administration. The Government is granted for itself and others +acting on its behalf a nonexclusive, paid-up, irrevocable worldwide +license in this material to reproduce, prepare derivative works, +distribute copies to the public, perform publicly and display +publicly, and to permit others to do so. + +---- + +This program is open source under the BSD-3 License. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from this +software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. diff --git a/module/zfs/THIRDPARTYLICENSE.zia.descrip b/module/zfs/THIRDPARTYLICENSE.zia.descrip new file mode 100644 index 000000000000..4be64904acc6 --- /dev/null +++ b/module/zfs/THIRDPARTYLICENSE.zia.descrip @@ -0,0 +1 @@ +Z.I.A. 
FUNCTIONALITY IN ZFS \ No newline at end of file diff --git a/module/zfs/abd.c b/module/zfs/abd.c index b6d7ac6407e3..04f2c2cac621 100644 --- a/module/zfs/abd.c +++ b/module/zfs/abd.c @@ -102,6 +102,10 @@ #include #include +#ifdef ZIA +#include +#endif + /* see block comment above for description */ int zfs_abd_scatter_enabled = B_TRUE; @@ -146,11 +150,19 @@ abd_init_struct(abd_t *abd) abd->abd_parent = NULL; #endif abd->abd_size = 0; + +#ifdef ZIA + abd->abd_zia_handle = NULL; +#endif } static void abd_fini_struct(abd_t *abd) { +#ifdef ZIA + zia_free_abd(abd, B_TRUE); +#endif + mutex_destroy(&abd->abd_mtx); ASSERT(!list_link_active(&abd->abd_gang_link)); #ifdef ZFS_DEBUG @@ -320,6 +332,10 @@ abd_free(abd_t *abd) abd_free_struct_impl(abd); } +#ifdef ZIA +EXPORT_SYMBOL(abd_free); +#endif + /* * Allocate an ABD of the same format (same metadata flag, same scatterize * setting) as another ABD. @@ -584,9 +600,15 @@ abd_get_offset_size(abd_t *sabd, size_t off, size_t size) abd_t * abd_get_zeros(size_t size) { + abd_t *abd = NULL; + ASSERT3P(abd_zero_scatter, !=, NULL); ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); - return (abd_get_offset_size(abd_zero_scatter, 0, size)); + + abd = abd_get_offset_size(abd_zero_scatter, 0, size); + abd->abd_flags |= ABD_FLAG_ZEROS; + + return (abd); } /* @@ -612,6 +634,10 @@ abd_get_from_buf(void *buf, size_t size) return (abd); } +#ifdef ZIA +EXPORT_SYMBOL(abd_get_from_buf); +#endif + /* * Get the raw buffer associated with a linear ABD. */ @@ -711,6 +737,9 @@ abd_release_ownership_of_buf(abd_t *abd) abd_update_linear_stats(abd, ABDSTAT_DECR); } +#ifdef ZIA +EXPORT_SYMBOL(abd_release_ownership_of_buf); +#endif /* * Give this ABD ownership of the buffer that it's storing. Can only be used on diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 74019ad08b4c..027724d37a1e 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -443,7 +443,11 @@ static const unsigned long zfs_arc_pool_dirty_percent = 20; /* * Enable or disable compressed arc buffers. */ +#ifndef ZIA int zfs_compressed_arc_enabled = B_TRUE; +#else +int zfs_compressed_arc_enabled = B_FALSE; +#endif /* * ARC will evict meta buffers that exceed arc_meta_limit. This @@ -11101,8 +11105,13 @@ ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, p_min_shift, param_set_arc_int, ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, average_blocksize, INT, ZMOD_RD, "Target average block size"); +#ifndef ZIA ZFS_MODULE_PARAM(zfs, zfs_, compressed_arc_enabled, INT, ZMOD_RW, "Disable compressed ARC buffers"); +#else +ZFS_MODULE_PARAM(zfs, zfs_, compressed_arc_enabled, INT, ZMOD_RD, + "Disable compressed ARC buffers"); +#endif ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prefetch_ms, param_set_arc_int, param_get_int, ZMOD_RW, "Min life of prefetch block in ms"); diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index e6008b3bf178..1cfe1ce5f960 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -59,6 +59,10 @@ #include #endif +#ifdef ZIA +#include +#endif + /* * Enable/disable nopwrite feature. 
*/ @@ -1961,6 +1965,10 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) boolean_t encrypt = B_FALSE; int copies = os->os_copies; +#ifdef ZIA + zp->zp_ismd = ismd; +#endif + /* * We maintain different write policies for each of the following * types of data: @@ -2289,6 +2297,9 @@ byteswap_uint8_array(void *vbuf, size_t size) void dmu_init(void) { +#ifdef ZIA + zia_init(); +#endif abd_init(); zfs_dbgmsg_init(); sa_cache_init(); @@ -2304,6 +2315,9 @@ dmu_init(void) void dmu_fini(void) { +#ifdef ZIA + zia_fini(); +#endif arc_fini(); /* arc depends on l2arc, so arc must go first */ l2arc_fini(); dmu_tx_fini(); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 55f3a4de603f..4f59c3b2b229 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -96,6 +96,10 @@ #include #endif /* _KERNEL */ +#ifdef ZIA +#include +#endif /* ZIA */ + #include "zfs_prop.h" #include "zfs_comutil.h" @@ -423,6 +427,48 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) dp->scd_path, 0, ZPROP_SRC_LOCAL); } } + +#ifdef ZIA + zia_props_t *zia_props = zia_get_props(spa); + if (zia_props->provider != NULL) { + spa_prop_add_list(*nvp, ZPOOL_PROP_ZIA_PROVIDER, + (char *)zia_get_provider_name(zia_props->provider), + 0, ZPROP_SRC_LOCAL); + } + + spa_prop_add_list(*nvp, ZPOOL_PROP_ZIA_COMPRESS, + NULL, zia_props->compress, ZPROP_SRC_LOCAL); + + spa_prop_add_list(*nvp, ZPOOL_PROP_ZIA_DECOMPRESS, + NULL, zia_props->decompress, ZPROP_SRC_LOCAL); + + spa_prop_add_list(*nvp, ZPOOL_PROP_ZIA_CHECKSUM, + NULL, zia_props->checksum, ZPROP_SRC_LOCAL); + + spa_prop_add_list(*nvp, ZPOOL_PROP_ZIA_RAIDZ1_GEN, + NULL, zia_props->raidz.gen[1], ZPROP_SRC_LOCAL); + + spa_prop_add_list(*nvp, ZPOOL_PROP_ZIA_RAIDZ2_GEN, + NULL, zia_props->raidz.gen[2], ZPROP_SRC_LOCAL); + + spa_prop_add_list(*nvp, ZPOOL_PROP_ZIA_RAIDZ3_GEN, + NULL, zia_props->raidz.gen[3], ZPROP_SRC_LOCAL); + + spa_prop_add_list(*nvp, ZPOOL_PROP_ZIA_RAIDZ1_REC, + NULL, zia_props->raidz.rec[1], ZPROP_SRC_LOCAL); + + spa_prop_add_list(*nvp, ZPOOL_PROP_ZIA_RAIDZ2_REC, + NULL, zia_props->raidz.rec[2], ZPROP_SRC_LOCAL); + + spa_prop_add_list(*nvp, ZPOOL_PROP_ZIA_RAIDZ3_REC, + NULL, zia_props->raidz.rec[3], ZPROP_SRC_LOCAL); + + spa_prop_add_list(*nvp, ZPOOL_PROP_ZIA_FILE_WRITE, + NULL, zia_props->file_write, ZPROP_SRC_LOCAL); + + spa_prop_add_list(*nvp, ZPOOL_PROP_ZIA_DISK_WRITE, + NULL, zia_props->disk_write, ZPROP_SRC_LOCAL); +#endif } /* @@ -719,6 +765,22 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) error = SET_ERROR(E2BIG); break; +#ifdef ZIA + case ZPOOL_PROP_ZIA_PROVIDER: + case ZPOOL_PROP_ZIA_COMPRESS: + case ZPOOL_PROP_ZIA_DECOMPRESS: + case ZPOOL_PROP_ZIA_CHECKSUM: + case ZPOOL_PROP_ZIA_RAIDZ1_GEN: + case ZPOOL_PROP_ZIA_RAIDZ2_GEN: + case ZPOOL_PROP_ZIA_RAIDZ3_GEN: + case ZPOOL_PROP_ZIA_RAIDZ1_REC: + case ZPOOL_PROP_ZIA_RAIDZ2_REC: + case ZPOOL_PROP_ZIA_RAIDZ3_REC: + case ZPOOL_PROP_ZIA_FILE_WRITE: + case ZPOOL_PROP_ZIA_DISK_WRITE: + break; +#endif + default: break; } @@ -1743,6 +1805,12 @@ spa_unload(spa_t *spa) spa->spa_compatibility = NULL; } +#ifdef ZIA + if (zia_get_props(spa)->provider != NULL) { + zia_put_provider(&zia_get_props(spa)->provider); + } +#endif + spa_config_exit(spa, SCL_ALL, spa); } @@ -8818,6 +8886,125 @@ spa_sync_props(void *arg, dmu_tx_t *tx) spa_history_log_internal(spa, "set", tx, "%s=%s", nvpair_name(elem), strval); break; +#ifdef ZIA + case ZPOOL_PROP_ZIA_PROVIDER: + strval = fnvpair_value_string(elem); + if (zia_get_props(spa)->provider != NULL) + zia_put_provider(&zia_get_props(spa)->provider); + 
zia_get_props(spa)->provider = zia_get_provider(strval); + /* + * Dirty the configuration on vdevs as above. + */ + if (tx->tx_txg != TXG_INITIAL) { + vdev_config_dirty(spa->spa_root_vdev); + spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); + } + + /* + * reopen devices so that provider is used + * copied from zfs_ioc_pool_reopen + */ + spa_vdev_state_enter(spa, SCL_NONE); + vdev_reopen(spa->spa_root_vdev); + (void) spa_vdev_state_exit(spa, NULL, 0); + + spa_history_log_internal(spa, "set", tx, + "%s=%s", nvpair_name(elem), strval); + break; + case ZPOOL_PROP_ZIA_COMPRESS: + zia_get_props(spa)->compress = + fnvpair_value_uint64(elem); + zia_prop_warn(zia_get_props(spa)->compress, + "Compression"); + break; + case ZPOOL_PROP_ZIA_DECOMPRESS: + zia_get_props(spa)->decompress = + fnvpair_value_uint64(elem); + zia_prop_warn(zia_get_props(spa)->decompress, + "Decompression"); + break; + case ZPOOL_PROP_ZIA_CHECKSUM: + zia_get_props(spa)->checksum = + fnvpair_value_uint64(elem); + zia_prop_warn(zia_get_props(spa)->checksum, + "Checksum"); + break; + case ZPOOL_PROP_ZIA_RAIDZ1_GEN: + zia_get_props(spa)->raidz.gen[1] = + fnvpair_value_uint64(elem); + zia_prop_warn(zia_get_props(spa)->raidz.gen[1], + "RAIDZ 1 Generation"); + break; + case ZPOOL_PROP_ZIA_RAIDZ2_GEN: + zia_get_props(spa)->raidz.gen[2] = + fnvpair_value_uint64(elem); + zia_prop_warn(zia_get_props(spa)->raidz.gen[2], + "RAIDZ 2 Generation"); + break; + case ZPOOL_PROP_ZIA_RAIDZ3_GEN: + zia_get_props(spa)->raidz.gen[3] = + fnvpair_value_uint64(elem); + zia_prop_warn(zia_get_props(spa)->raidz.gen[3], + "RAIDZ 3 Generation"); + break; + case ZPOOL_PROP_ZIA_RAIDZ1_REC: + zia_get_props(spa)->raidz.rec[1] = + fnvpair_value_uint64(elem); + /* need checksum */ + if (zia_get_props(spa)->raidz.rec[1]) { + if (!zia_get_props(spa)->checksum) { + zia_get_props(spa)->checksum = 1; + zia_prop_warn( + zia_get_props(spa)->checksum, + "Checksum"); + } + } + zia_prop_warn(zia_get_props(spa)->raidz.rec[1], + "RAIDZ 1 Reconstruction"); + break; + case ZPOOL_PROP_ZIA_RAIDZ2_REC: + zia_get_props(spa)->raidz.rec[2] = + fnvpair_value_uint64(elem); + /* need checksum */ + if (zia_get_props(spa)->raidz.rec[2]) { + if (!zia_get_props(spa)->checksum) { + zia_get_props(spa)->checksum = 1; + zia_prop_warn( + zia_get_props(spa)->checksum, + "Checksum"); + } + } + zia_prop_warn(zia_get_props(spa)->raidz.rec[2], + "RAIDZ 2 Reconstruction"); + break; + case ZPOOL_PROP_ZIA_RAIDZ3_REC: + zia_get_props(spa)->raidz.rec[3] = + fnvpair_value_uint64(elem); + /* need checksum */ + if (zia_get_props(spa)->raidz.rec[3]) { + if (!zia_get_props(spa)->checksum) { + zia_get_props(spa)->checksum = 1; + zia_prop_warn( + zia_get_props(spa)->checksum, + "Checksum"); + } + } + zia_prop_warn(zia_get_props(spa)->raidz.rec[3], + "RAIDZ 3 Reconstruction"); + break; + case ZPOOL_PROP_ZIA_FILE_WRITE: + zia_get_props(spa)->file_write = + fnvpair_value_uint64(elem); + zia_prop_warn(zia_get_props(spa)->file_write, + "File Write"); + break; + case ZPOOL_PROP_ZIA_DISK_WRITE: + zia_get_props(spa)->disk_write = + fnvpair_value_uint64(elem); + zia_prop_warn(zia_get_props(spa)->disk_write, + "Disk Write"); + break; +#endif default: /* diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index de29e6fd4c7c..9baa62b96f0e 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -673,6 +673,10 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) vdev_queue_init(vd); vdev_cache_init(vd); +#ifdef ZIA + vd->vdev_zia_handle = NULL; +#endif + return (vd); } @@ -1000,6 +1004,10 @@ 
vdev_free(vdev_t *vd) */ vdev_close(vd); +#ifdef ZIA + ASSERT3P(vd->vdev_zia_handle, ==, NULL); +#endif + ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c index fa8daf57b2eb..02296b667fd1 100644 --- a/module/zfs/vdev_draid.c +++ b/module/zfs/vdev_draid.c @@ -1035,6 +1035,9 @@ vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset, #ifdef ZFS_DEBUG rr->rr_offset = io_offset; rr->rr_size = io_size; +#endif +#ifdef ZIA + rr->rr_zia_handle = NULL; #endif *rrp = rr; diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 3633937f462b..ff9e31e85bce 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -41,6 +41,10 @@ #include /* For vdev_xlate() in vdev_raidz_io_verify() */ #endif +#ifdef ZIA +#include +#endif + /* * Virtual device vector for RAID-Z. * @@ -138,6 +142,10 @@ static void vdev_raidz_row_free(raidz_row_t *rr) { +#ifdef ZIA + zia_raidz_free(rr, B_FALSE); +#endif + for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; @@ -359,6 +367,9 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, rr->rr_offset = zio->io_offset; rr->rr_size = zio->io_size; #endif +#ifdef ZIA + rr->rr_zia_handle = NULL; +#endif asize = 0; @@ -503,7 +514,11 @@ vdev_raidz_pqr_func(void *buf, size_t size, void *private) return (0); } +#ifndef ZIA static void +#else +void +#endif vdev_raidz_generate_parity_p(raidz_row_t *rr) { uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); @@ -521,7 +536,15 @@ vdev_raidz_generate_parity_p(raidz_row_t *rr) } } +#ifdef ZIA +EXPORT_SYMBOL(vdev_raidz_generate_parity_p); +#endif + +#ifndef ZIA static void +#else +void +#endif vdev_raidz_generate_parity_pq(raidz_row_t *rr) { uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); @@ -563,7 +586,15 @@ vdev_raidz_generate_parity_pq(raidz_row_t *rr) } } +#ifdef ZIA +EXPORT_SYMBOL(vdev_raidz_generate_parity_pq); +#endif + +#ifndef ZIA static void +#else +void +#endif vdev_raidz_generate_parity_pqr(raidz_row_t *rr) { uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); @@ -611,6 +642,10 @@ vdev_raidz_generate_parity_pqr(raidz_row_t *rr) } } +#ifdef ZIA +EXPORT_SYMBOL(vdev_raidz_generate_parity_pqr); +#endif + /* * Generate RAID parity in the first virtual columns according to the number of * parity columns available. 
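Editor's note: the #ifndef ZIA / #else alternation on the parity generators above (and on vdev_raidz_reconstruct_general further down) keeps these functions static in a stock build while turning them into exported symbols that the out-of-tree zia-software-provider module can call back into; the matching prototypes are declared only under #ifdef ZIA in include/sys/vdev_raidz.h. A condensed sketch of that visibility pattern, using a hypothetical helper name rather than repeating the real functions:

/* In the public header: declare the symbol only for Z.I.A. builds. */
#ifdef ZIA
void example_parity_helper(struct raidz_row *);
#endif

/* In the .c file: keep static linkage unless Z.I.A. needs to see it. */
#ifndef ZIA
static void
#else
void
#endif
example_parity_helper(raidz_row_t *rr)
{
	/* body identical to the stock implementation */
	(void) rr;
}

#ifdef ZIA
EXPORT_SYMBOL(example_parity_helper);	/* visible to provider modules */
#endif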
@@ -1280,7 +1315,11 @@ vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing, kmem_free(p, psize); } +#ifndef ZIA static void +#else +void +#endif vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts) { int n, i, c, t, tt; @@ -1417,6 +1456,10 @@ vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts) } } +#ifdef ZIA +EXPORT_SYMBOL(vdev_raidz_reconstruct_general); +#endif + static void vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr, const int *t, int nt) @@ -1628,7 +1671,22 @@ vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr, uint64_t ashift) vdev_t *vd = zio->io_vd; raidz_map_t *rm = zio->io_vsd; +#ifdef ZIA + /* + * here instead of vdev_raidz_generate_parity_row + * to be able to use zio + */ + boolean_t local_offload = B_FALSE; + if ((zia_raidz_alloc(zio, rr, B_FALSE, 0, &local_offload) != ZIA_OK) || + (zia_raidz_gen(rr) != ZIA_OK)) { + zia_raidz_gen_cleanup(zio, rr, local_offload); +#endif vdev_raidz_generate_parity_row(rm, rr); +#ifdef ZIA + } else { + zio->io_flags |= ZIO_FLAG_DONT_AGGREGATE; + } +#endif for (int c = 0; c < rr->rr_scols; c++) { raidz_col_t *rc = &rr->rr_col[c]; @@ -1781,11 +1839,27 @@ raidz_checksum_verify(zio_t *zio) { zio_bad_cksum_t zbc = {{{0}}}; raidz_map_t *rm = zio->io_vsd; +#ifdef ZIA + const boolean_t entered_offloaded = zia_is_offloaded(zio->io_abd); +#endif int ret = zio_checksum_error(zio, &zbc); if (ret != 0 && zbc.zbc_injected != 0) rm->rm_ecksuminjected = 1; +#ifdef ZIA + if (zia_is_offloaded(zio->io_abd) != B_TRUE) { + /* columns need to be onloaded */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + + /* force onload, since data was modified */ + zia_raidz_rec_cleanup(zio, rr, B_TRUE, + entered_offloaded); + } + } +#endif + return (ret); } @@ -1818,7 +1892,18 @@ raidz_parity_verify(zio_t *zio, raidz_row_t *rr) orig[c] = rc->rc_abd; ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size); rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE); + +#ifdef ZIA + rc->rc_abd->abd_zia_handle = NULL; +#endif +} + +#ifdef ZIA + if (zia_raidz_new_parity(zio, rr, orig) != ZIA_OK) { + /* onload data and parity columns */ + zia_raidz_rec_cleanup(zio, rr, B_TRUE, B_FALSE); } +#endif /* * Verify any empty sectors are zero filled to ensure the parity @@ -1832,7 +1917,14 @@ raidz_parity_verify(zio_t *zio, raidz_row_t *rr) * isn't harmful but it does have the side effect of fixing stuff * we didn't realize was necessary (i.e. even if we return 0). 
*/ +#ifdef ZIA + if (zia_raidz_gen(rr) != ZIA_OK) { + zia_raidz_rec_cleanup(zio, rr, B_FALSE, B_TRUE); +#endif vdev_raidz_generate_parity_row(rm, rr); +#ifdef ZIA + } +#endif for (c = 0; c < rr->rr_firstdatacol; c++) { rc = &rr->rr_col[c]; @@ -1840,7 +1932,21 @@ raidz_parity_verify(zio_t *zio, raidz_row_t *rr) if (!rc->rc_tried || rc->rc_error != 0) continue; - if (abd_cmp(orig[c], rc->rc_abd) != 0) { + int cmp = 0; +#ifdef ZIA + if (zia_raidz_cmp(orig[c], rc->rc_abd, &cmp) != ZIA_OK) { + zia_raidz_rec_cleanup(zio, rr, B_FALSE, B_TRUE); + zia_onload_abd(orig[c], rc->rc_size, B_FALSE); +#endif + cmp = abd_cmp(orig[c], rc->rc_abd); +#ifdef ZIA + } +#endif + if (cmp != 0) { +#ifdef ZIA + zia_raidz_rec_cleanup(zio, rr, B_FALSE, B_TRUE); + zia_onload_abd(orig[c], rc->rc_size, B_FALSE); +#endif vdev_raidz_checksum_error(zio, rc, orig[c]); rc->rc_error = SET_ERROR(ECKSUM); ret++; @@ -2000,11 +2106,30 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) } if (dead > nparity) { /* reconstruction not possible */ +#ifdef ZIA + /* drop offloaded data */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + zia_raidz_rec_cleanup(zio, rr, B_TRUE, B_FALSE); + } +#endif raidz_restore_orig_data(rm); return (EINVAL); } - if (dead_data > 0) + if (dead_data > 0) { +#ifdef ZIA + /* + * here instead of vdev_raidz_reconstruct_row + * to be able to use zio + */ + if ((zia_raidz_rec(rr, my_tgts, t) != ZIA_OK)) { + zia_raidz_rec_cleanup(zio, rr, B_TRUE, B_FALSE); +#endif vdev_raidz_reconstruct_row(rm, rr, my_tgts, t); +#ifdef ZIA + } +#endif + } } /* Check for success */ @@ -2047,6 +2172,13 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) } /* Reconstruction failed - restore original data */ +#ifdef ZIA + /* drop offloaded data */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + zia_raidz_rec_cleanup(zio, rr, B_TRUE, B_FALSE); + } +#endif raidz_restore_orig_data(rm); return (ECKSUM); } @@ -2310,7 +2442,22 @@ vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm, ASSERT(rr->rr_firstdatacol >= n); +#ifdef ZIA + /* + * here instead of vdev_raidz_reconstruct_row + * to be able to use zio + */ + if ((zia_raidz_rec(rr, tgts, n) != ZIA_OK)) { + /* + * drop handles instead of onloading because + * the data hasn't changed yet + */ + zia_raidz_rec_cleanup(zio, rr, B_TRUE, B_FALSE); +#endif vdev_raidz_reconstruct_row(rm, rr, tgts, n); +#ifdef ZIA + } +#endif } } @@ -2394,6 +2541,23 @@ vdev_raidz_io_done(zio_t *zio) vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]); } } else { +#ifdef ZIA + /* offload once at beginning */ + blkptr_t *bp = zio->io_bp; + if (bp && !BP_IS_METADATA(bp)) { + uint_t checksum = (BP_IS_GANG(bp) ? + ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)); + zio_checksum_info_t *ci = &zio_checksum_table[checksum]; + if (!(ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED)) { + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + zia_raidz_alloc(zio, rr, + B_TRUE, checksum, NULL); + } + } + } +#endif + for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; vdev_raidz_io_done_reconstruct_known_missing(zio, diff --git a/module/zfs/zia.c b/module/zfs/zia.c new file mode 100644 index 000000000000..0ced3f4321b7 --- /dev/null +++ b/module/zfs/zia.c @@ -0,0 +1,1187 @@ +/* + * © 2021. Triad National Security, LLC. All rights reserved. + * + * This program was produced under U.S. 
Government contract + * 89233218CNA000001 for Los Alamos National Laboratory (LANL), which + * is operated by Triad National Security, LLC for the U.S. + * Department of Energy/National Nuclear Security Administration. All + * rights in the program are reserved by Triad National Security, LLC, + * and the U.S. Department of Energy/National Nuclear Security + * Administration. The Government is granted for itself and others + * acting on its behalf a nonexclusive, paid-up, irrevocable worldwide + * license in this material to reproduce, prepare derivative works, + * distribute copies to the public, perform publicly and display + * publicly, and to permit others to do so. + * + * ---- + * + * This program is open source under the BSD-3 License. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifdef ZIA + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* ************************************************************* */ +/* global offloader functions initialized with ZFS */ +static const dpusm_uf_t *dpusm = NULL; +/* ************************************************************* */ + +zia_props_t * +zia_get_props(spa_t *spa) +{ + return (spa?&spa->spa_zia_props:NULL); +} + +void +zia_prop_warn(boolean_t val, const char *name) +{ + if (val == B_TRUE) { +#ifdef _KERNEL + printk("Z.I.A. %s enabled. 
Encryption and " + "Dedup for this spa will be disabled.\n", + name); +#else + (void) name; +#endif + } +} + +static int +translate_rc(const int dpusm_rc) +{ + int zia_rc = ZIA_FALLBACK; + switch (dpusm_rc) { + case DPUSM_OK: + zia_rc = ZIA_OK; + break; + case DPUSM_ERROR: + case DPUSM_PROVIDER_EXISTS: + case DPUSM_PROVIDER_NOT_EXISTS: + zia_rc = ZIA_ERROR; + break; + case DPUSM_PROVIDER_MISMATCH: + zia_rc = ZIA_PROVIDER_MISMATCH; + break; + case DPUSM_NOT_IMPLEMENTED: + zia_rc = ZIA_FALLBACK; + break; + case DPUSM_BAD_RESULT: + zia_rc = ZIA_BAD_RESULT; + break; + default: + /* only translate recognized values */ + zia_rc = dpusm_rc; + break; + } + return (zia_rc); +} + +dpusm_compress_t +translate_compress(enum zio_compress c) +{ + dpusm_compress_t dpusm_c = 0; + switch (c) { + case ZIO_COMPRESS_GZIP_1: + dpusm_c = DPUSM_COMPRESS_GZIP_1; + break; + case ZIO_COMPRESS_GZIP_2: + dpusm_c = DPUSM_COMPRESS_GZIP_2; + break; + case ZIO_COMPRESS_GZIP_3: + dpusm_c = DPUSM_COMPRESS_GZIP_3; + break; + case ZIO_COMPRESS_GZIP_4: + dpusm_c = DPUSM_COMPRESS_GZIP_4; + break; + case ZIO_COMPRESS_GZIP_5: + dpusm_c = DPUSM_COMPRESS_GZIP_5; + break; + case ZIO_COMPRESS_GZIP_6: + dpusm_c = DPUSM_COMPRESS_GZIP_6; + break; + case ZIO_COMPRESS_GZIP_7: + dpusm_c = DPUSM_COMPRESS_GZIP_7; + break; + case ZIO_COMPRESS_GZIP_8: + dpusm_c = DPUSM_COMPRESS_GZIP_8; + break; + case ZIO_COMPRESS_GZIP_9: + dpusm_c = DPUSM_COMPRESS_GZIP_9; + break; + case ZIO_COMPRESS_INHERIT: + case ZIO_COMPRESS_ON: + case ZIO_COMPRESS_OFF: + case ZIO_COMPRESS_LZJB: + case ZIO_COMPRESS_EMPTY: + case ZIO_COMPRESS_ZLE: + case ZIO_COMPRESS_LZ4: + case ZIO_COMPRESS_ZSTD: + case ZIO_COMPRESS_FUNCTIONS: + default: + break; + } + + return (dpusm_c); +} + +dpusm_checksum_t +translate_checksum(enum zio_checksum c) +{ + dpusm_checksum_t dpusm_c = 0; + switch (c) { + case ZIO_CHECKSUM_FLETCHER_2: + dpusm_c = DPUSM_CHECKSUM_FLETCHER_2; + break; + case ZIO_CHECKSUM_FLETCHER_4: + dpusm_c = DPUSM_CHECKSUM_FLETCHER_4; + break; + case ZIO_CHECKSUM_INHERIT: + case ZIO_CHECKSUM_ON: + case ZIO_CHECKSUM_OFF: + case ZIO_CHECKSUM_LABEL: + case ZIO_CHECKSUM_GANG_HEADER: + case ZIO_CHECKSUM_ZILOG: + case ZIO_CHECKSUM_SHA256: + case ZIO_CHECKSUM_ZILOG2: + case ZIO_CHECKSUM_NOPARITY: + case ZIO_CHECKSUM_SHA512: + case ZIO_CHECKSUM_SKEIN: + default: + break; + } + + return (dpusm_c); +} + +dpusm_checksum_byteorder_t +translate_byteorder(zio_byteorder_t bo) +{ + dpusm_checksum_byteorder_t dpusm_bo = 0; + switch (bo) { + case ZIO_CHECKSUM_NATIVE: + dpusm_bo = DPUSM_BYTEORDER_NATIVE; + break; + case ZIO_CHECKSUM_BYTESWAP: + dpusm_bo = DPUSM_BYTEORDER_BYTESWAP; + break; + default: + break; + } + + return (dpusm_bo); +} + +int +zia_get_capabilities(void *provider, dpusm_pc_t **caps) +{ + if (!provider || !caps) { + return (ZIA_ERROR); + } + + /* dpusm is checked by the caller */ + /* provider and caps are checked by the dpusm */ + return (translate_rc(dpusm->capabilities(provider, caps))); +} + +int +zia_init(void) +{ + if (dpusm) { + return (ZIA_OK); + } + + if (dpusm_initialize) { + dpusm = dpusm_initialize(); + } + + if (!dpusm) { +#ifdef _KERNEL + printk("Warning: Z.I.A. not initialized\n"); +#endif + return (ZIA_ERROR); + } + +#ifdef _KERNEL + printk("Z.I.A. initialized (%p)\n", dpusm); +#endif + return (ZIA_OK); +} + +int +zia_fini(void) +{ + if (!dpusm) { +#ifdef _KERNEL + printk("Warning: Z.I.A. not initialized. " + "Not uninitializing.\n"); +#endif + return (ZIA_ERROR); + } + + if (dpusm_finalize) { + dpusm_finalize(); +#ifdef _KERNEL + printk("Z.I.A. 
finalized\n"); +#endif + } else { +#ifdef _KERNEL + if (dpusm) { + printk("Z.I.A. incomplete finalize\n"); + } +#endif + } + + dpusm = NULL; + return (ZIA_OK); +} + +void * +zia_get_provider(const char *name) +{ + if (!dpusm) { + return (NULL); + } + + return (dpusm->get(name)); +} + +const char * +zia_get_provider_name(void *provider) +{ + if (!dpusm || !provider) { + return (NULL); + } + + return (dpusm->get_name(provider)); +} + +int +zia_put_provider(void **provider) +{ + if (!dpusm || !provider || !*provider) { + return (ZIA_FALLBACK); + } + + const int rc = dpusm->put(*provider); + if (rc == DPUSM_OK) { + *provider = NULL; + } + + return (translate_rc(rc)); +} + +boolean_t +zia_is_used(zio_t *zio) +{ + if (!zio) { + return (B_FALSE); + } + + zia_props_t *props = zia_get_props(zio->io_spa); + + /* provider + at least 1 operation */ + if (props->provider && + (props->compress || + props->decompress || + props->checksum || + props->raidz.gen[1] || + props->raidz.gen[2] || + props->raidz.gen[3] || + props->raidz.rec[1] || + props->raidz.rec[2] || + props->raidz.rec[3] || + props->file_write || + props->disk_write)) { + return (B_TRUE); + } + + return (B_FALSE); +} + +boolean_t +zia_is_offloaded(abd_t *abd) +{ + if (!abd) { + return (B_FALSE); + } + + return (ABD_HANDLE(abd)?B_TRUE:B_FALSE); +} + +/* create a provider handle/offloader buffer without copying data */ +void * +zia_alloc(void *provider, size_t size, size_t min_offload_size) +{ + if (size < min_offload_size) { + return (NULL); + } + return ((dpusm && provider)?dpusm->alloc(provider, size):NULL); +} + +/* free the offloader handle without onloading the data */ +void +zia_free(void **handle) +{ + if (dpusm && handle) { + dpusm->free(*handle); + *handle = NULL; + } +} + +/* move data from the offloader and unregister the mapping */ +int +zia_onload(void **handle, void *buf, size_t size) +{ + if (!dpusm) { + return (ZIA_FALLBACK); + } + + if (!handle || !*handle || !buf) { + return (ZIA_ERROR); + } + + dpusm_mv_t mv = { .handle = *handle, .offset = 0 }; + const int rc = dpusm->copy_to_mem(&mv, buf, size); + + /* + * if success, no more need for handle + * if failure, can't do anything with + * handle in any case, so destroy it + */ + zia_free(handle); + + return (translate_rc(rc)); +} + +/* abd_iterate_func callback for moving data to the offloader */ +static int +zia_offload_cb(void *buf, size_t size, void *private) +{ + const int rc = dpusm->copy_from_mem(private, buf, size); + if (translate_rc(rc) != ZIA_OK) { + return (ZIA_ERROR); + } + + dpusm_mv_t *mv = (dpusm_mv_t *)private; + mv->offset += size; + return (0); +} + +/* abd_iterate_func callback for moving data from the offloader */ +static int +zia_onload_cb(void *buf, size_t size, void *private) +{ + const int rc = dpusm->copy_to_mem(private, buf, size); + if (translate_rc(rc) != ZIA_OK) { + return (ZIA_ERROR); + } + + dpusm_mv_t *mv = (dpusm_mv_t *)private; + mv->offset += size; + return (0); +} + +/* create a new handle and copy data into it */ +static int +zia_offload_abd_offset(void *provider, abd_t *abd, + size_t offset, size_t size, + size_t min_offload_size, boolean_t *local_offload) +{ + /* already offloaded */ + if (ABD_HANDLE(abd)) { + void *abd_provider = dpusm->extract(ABD_HANDLE(abd)); + if (local_offload) { + *local_offload = B_FALSE; + } + + /* see zia_checksum_error */ + if (!provider) { + return (ZIA_OK); + } + + return ((provider == abd_provider)? 
+ ZIA_OK:ZIA_PROVIDER_MISMATCH); + } + + /* provider is checked by dpusm */ + void *handle = zia_alloc(provider, size, min_offload_size); + if (!handle) { + return (ZIA_ERROR); + } + + /* offload */ + int rc = ZIA_ERROR; + dpusm_mv_t mv = { .handle = handle, .offset = offset }; + if (abd_iterate_func(abd, 0, size, zia_offload_cb, &mv) == 0) { + rc = ZIA_OK; + } + + if (rc == ZIA_OK) { + ABD_HANDLE(abd) = handle; + if (local_offload) { + *local_offload = B_TRUE; + } + } else { + zia_free(&handle); + } + + return (rc); +} + +int +zia_offload_abd(void *provider, abd_t *abd, + size_t size, size_t min_offload_size, boolean_t *local_offload) +{ + if (!dpusm) { + return (ZIA_FALLBACK); + } + + /* no gangs or scatterlists */ + if (!abd || !(abd_is_linear(abd) || abd_is_linear_page(abd))) { + return (ZIA_ERROR); + } + + return (zia_offload_abd_offset(provider, + abd, 0, size, min_offload_size, local_offload)); +} + +/* copy offloaded buffer + offset back into abd + 0 */ +static int +zia_onload_abd_offset(abd_t *abd, size_t offset, + size_t size, boolean_t keep_handle) +{ + if (!dpusm) { + return (ZIA_FALLBACK); + } + + if (!abd) { + return (ZIA_ERROR); + } + + mutex_enter(&abd->abd_mtx); + if (!ABD_HANDLE(abd)) { + mutex_exit(&abd->abd_mtx); + return (ZIA_ERROR); + } + + int rc = ZIA_ERROR; + dpusm_mv_t mv = { .handle = ABD_HANDLE(abd), .offset = offset }; + if (abd_iterate_func(abd, 0, size, zia_onload_cb, &mv) == 0) { + rc = ZIA_OK; + } + + if (keep_handle != B_TRUE) { + zia_free_abd(abd, B_FALSE); + } + mutex_exit(&abd->abd_mtx); + + return (rc); +} + +int +zia_onload_abd(abd_t *abd, size_t size, boolean_t keep_handle) +{ + if (abd_is_gang(abd)) { + /* + * the only gangs that show up are from raidz + * + * get leading data size, stopping at first zero page + * which should always be the second child + */ + const size_t original_size = size; + size = 0; + for (abd_t *child = list_head(&ABD_GANG(abd).abd_gang_chain); + child != NULL; + child = list_next(&ABD_GANG(abd).abd_gang_chain, child)) { + if (child->abd_flags & ABD_FLAG_ZEROS) { + break; + } + + size += child->abd_size; + } + + ASSERT(size <= original_size); + } + + return (zia_onload_abd_offset(abd, 0, size, keep_handle)); +} + +void +zia_move_into_abd(abd_t *dst, void **src_handle) +{ + ABD_HANDLE(dst) = *src_handle; + *src_handle = NULL; +} + +int +zia_free_abd(abd_t *abd, boolean_t lock) +{ + if (lock == B_TRUE) { + mutex_enter(&abd->abd_mtx); + } + + zia_free(&ABD_HANDLE(abd)); + + if (lock == B_TRUE) { + mutex_exit(&abd->abd_mtx); + } + return (ZIA_OK); +} + +/* + * if offloaded locally, just free the handle + * if not, onload the data and free the handle + */ +int +zia_cleanup_abd(abd_t *abd, size_t size, boolean_t local_offload) +{ + if (!dpusm) { + return (ZIA_FALLBACK); + } + + if (!abd) { + return (ZIA_ERROR); + } + + int rc = ZIA_OK; + if (local_offload == B_TRUE) { + /* in-memory copy is still valid */ + /* lock just in case mirrors clean up at the same time */ + zia_free_abd(abd, B_TRUE); + } else { + /* have to copy data into memory */ + rc = zia_onload_abd(abd, size, B_FALSE); + } + + return (rc); +} + +int +zia_zero_fill(abd_t *abd, size_t offset, size_t size) +{ + if (!dpusm) { + return (ZIA_FALLBACK); + } + + if (!abd || !ABD_HANDLE(abd)) { + return (ZIA_ERROR); + } + + return (translate_rc(dpusm->zero_fill(ABD_HANDLE(abd), offset, size))); +} + +int +zia_compress(void *provider, zio_t *zio, size_t s_len, + enum zio_compress c, uint8_t level, void **cbuf_handle, + uint64_t *c_len, boolean_t *local_offload) +{ + if (!dpusm 
|| !dpusm->compress || !provider) { + return (ZIA_FALLBACK); + } + + return (zia_compress_impl(dpusm, provider, zio, s_len, + c, level, cbuf_handle, c_len, local_offload)); +} + +int +zia_checksum_compute(void *provider, zio_cksum_t *dst, enum zio_checksum alg, + zio_t *zio, uint64_t size, boolean_t *local_offload) +{ + if (!dpusm || !dpusm->checksum || !provider) { + return (ZIA_FALLBACK); + } + + const dpusm_checksum_byteorder_t byteorder = + translate_byteorder(ZIO_CHECKSUM_NATIVE); + + if (!ABD_HANDLE(zio->io_abd)) { + dpusm_pc_t *caps = NULL; + if ((zia_get_capabilities(provider, &caps) != ZIA_OK) || + !(caps->checksum & translate_checksum(alg)) || + !(caps->checksum_byteorder & byteorder)) { + return (ZIA_FALLBACK); + } + + if (zia_offload_abd(provider, zio->io_abd, size, + zia_get_props(zio->io_spa)->min_offload_size, + local_offload) != ZIA_OK) { + return (ZIA_ERROR); + } + } else { + void *old_provider = dpusm->extract(ABD_HANDLE(zio->io_abd)); + if (old_provider != provider) { + return (ZIA_PROVIDER_MISMATCH); + } + + /* skip checks because dpusm will do them */ + } + + return (translate_rc(dpusm->checksum(translate_checksum(alg), + byteorder, ABD_HANDLE(zio->io_abd), size, dst->zc_word, + sizeof (dst->zc_word)))); +} + +int +zia_checksum_error(const blkptr_t *bp, enum zio_checksum alg, + abd_t *abd, uint64_t size, zio_bad_cksum_t *info) +{ + return (zia_checksum_error_impl(dpusm, bp, alg, abd, size, info)); +} + +static boolean_t +zia_can_raidz(zio_t *zio, raidz_row_t *rr, + boolean_t rec, uint_t cksum, size_t *col_sizes) +{ + const int raidn = rr->rr_firstdatacol; + if ((1 > raidn) || (raidn > 3)) { + return (B_FALSE); + } + + /* need at least raidn + 2 data columns */ + if (rr->rr_firstdatacol + 2 > rr->rr_cols) { + return (B_FALSE); + } + + const zia_props_t *props = zia_get_props(zio->io_spa); + if (!props->provider) { + return (B_FALSE); + } + + /* + * generation is needed for both + * generation and reconstruction + */ + int good = ( + /* raidz generation is turned on */ + (props->raidz.gen[raidn] == 1) && + + /* + * the provider knows whether or not + * raidz functions are available + */ + (dpusm->raid.can_compute(props->provider, raidn, + rr->rr_cols - rr->rr_firstdatacol, + col_sizes, rec == B_TRUE) == DPUSM_OK)); + + if (good && (rec == B_TRUE)) { + dpusm_pc_t *caps = NULL; + if (zia_get_capabilities(props->provider, &caps) != ZIA_OK) { + return (B_FALSE); + } + + good &= ( + /* raidz reconstruction is turned on */ + (props->raidz.rec[raidn] == 1) && + + /* need checksum */ + (props->checksum == 1) && + + /* raidz reconstruction support was checked earlier */ + + /* make sure the checksum is supported by the provider */ + (caps->checksum & translate_checksum(cksum))); + } + + return (good?B_TRUE:B_FALSE); +} + +int +zia_raidz_alloc(zio_t *zio, raidz_row_t *rr, boolean_t rec, + uint_t cksum, boolean_t *local_offload) +{ + if (!dpusm || !zio || !rr) { + return (ZIA_ERROR); + } + + /* + * existence of row handle implies existence + * of data and column handles + */ + if (rr->rr_zia_handle) { + return (ZIA_OK); + } + + /* get column sizes */ + const size_t column_sizes_size = sizeof (size_t) * rr->rr_cols; + size_t *column_sizes = kmem_alloc(column_sizes_size, KM_SLEEP); + for (size_t c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + column_sizes[c] = rc->rc_size; + } + + if (zia_can_raidz(zio, rr, rec, cksum, column_sizes) != B_TRUE) { + kmem_free(column_sizes, column_sizes_size); + return (ZIA_FALLBACK); + } + + zia_props_t *props = 
zia_get_props(zio->io_spa); + void *provider = props->provider; + if (!provider) { + return (ZIA_FALLBACK); + } + + /* + * offload the source data if it hasn't already been offloaded + * + * need to lock here since offloading normally doesn't lock, but + * abds hitting raidz might have been mirrored + */ + mutex_enter(&zio->io_abd->abd_mtx); + if (zia_offload_abd(provider, zio->io_abd, zio->io_size, + props->min_offload_size, local_offload) != ZIA_OK) { + mutex_exit(&zio->io_abd->abd_mtx); + kmem_free(column_sizes, column_sizes_size); + return (ZIA_ERROR); + } + mutex_exit(&zio->io_abd->abd_mtx); + + /* mirrored abds generate their own references to the columns */ + + const size_t column_handles_size = sizeof (void *) * rr->rr_cols; + void **column_handles = kmem_alloc(column_handles_size, KM_SLEEP); + + /* create parity column handles */ + for (uint64_t c = 0; c < rr->rr_firstdatacol; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + + ASSERT(!ABD_HANDLE(rc->rc_abd)); + + /* allocate rc->rc_abd->abd_size, mark as rc->rc_size */ + if (rec == B_TRUE) { + /* reconstructing, so copy data to provider */ + zia_offload_abd_offset(provider, rc->rc_abd, 0, + rc->rc_abd->abd_size, props->min_offload_size, + NULL); + } else { + /* generating, so create new columns */ + ABD_HANDLE(rc->rc_abd) = + dpusm->alloc(provider, rc->rc_abd->abd_size); + } + + if (!ABD_HANDLE(rc->rc_abd)) { + /* data columns are all references */ + for (uint64_t i = rr->rr_firstdatacol; i < c; i++) { + raidz_col_t *rc = &rr->rr_col[i]; + zia_free_abd(rc->rc_abd, B_FALSE); + } + + kmem_free(column_handles, column_handles_size); + kmem_free(column_sizes, column_sizes_size); + return (ZIA_ERROR); + } + + column_handles[c] = ABD_HANDLE(rc->rc_abd); + } + + /* + * recalculate data column offsets and + * create references for each column + */ + size_t offset = 0; + for (size_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + + /* + * if the column is a gang abd, the handle + * will point to the first child + */ + void *column_handle = dpusm->alloc_ref(ABD_HANDLE(zio->io_abd), + offset, rc->rc_size); + + ABD_HANDLE(rc->rc_abd) = column_handle; + column_handles[c] = column_handle; + + offset += rc->rc_size; + } + + /* get raid context */ + rr->rr_zia_handle = dpusm->raid.alloc(rr->rr_firstdatacol, + rr->rr_cols - rr->rr_firstdatacol, ABD_HANDLE(zio->io_abd), + column_handles, column_sizes); + + kmem_free(column_handles, column_handles_size); + kmem_free(column_sizes, column_sizes_size); + + if (!rr->rr_zia_handle) { + zia_raidz_free(rr, B_FALSE); + return (ZIA_ERROR); + } + + return (ZIA_OK); +} + +/* + * only frees the raidz data + * onload the data separately if it is needed + */ +int +zia_raidz_free(raidz_row_t *rr, boolean_t onload_parity) +{ + if (!dpusm) { + return (ZIA_FALLBACK); + } + + if (!rr) { + return (ZIA_ERROR); + } + + dpusm->raid.free(rr->rr_zia_handle); + rr->rr_zia_handle = NULL; + + uint64_t c = 0; + + if (onload_parity == B_TRUE) { + for (; c < rr->rr_firstdatacol; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + zia_onload_abd(rc->rc_abd, + rc->rc_abd->abd_size, B_FALSE); + } + } + + for (; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + zia_free_abd(rc->rc_abd, B_FALSE); + } + + return (ZIA_OK); +} + +int +zia_raidz_gen(raidz_row_t *rr) +{ + if (!dpusm) { + return (ZIA_FALLBACK); + } + + /* can only pass if raidz_alloc succeeded */ + if (!rr->rr_zia_handle) { + return (ZIA_ERROR); + } + + return (translate_rc(dpusm->raid.gen(rr->rr_zia_handle))); +} + +/* onload abd and 
delete raidz_row_t stuff */ +static int +zia_raidz_cleanup(zio_t *zio, raidz_row_t *rr, + boolean_t local_offload, + boolean_t onload_parity) +{ + /* + * bring data back to zio->io_abd, which should + * place data into parent automatically + */ + zia_cleanup_abd(zio->io_abd, zio->io_size, local_offload); + + return (zia_raidz_free(rr, onload_parity)); +} + +int +zia_raidz_gen_cleanup(zio_t *zio, raidz_row_t *rr, + boolean_t local_offload) +{ + /* + * RAIDZ generation only calls cleanup + * on failure, so parity does not need + * to be brought back. + */ + return (zia_raidz_cleanup(zio, rr, + local_offload, B_FALSE)); +} + +/* + * allocate new parity columns for this row + * and assign them to the raidz struct + * + * orig takes ownership of the original handles + */ +int +zia_raidz_new_parity(zio_t *zio, raidz_row_t *rr, abd_t **orig) +{ + if (!zio || !rr || !orig) { + return (ZIA_ERROR); + } + + if (!ABD_HANDLE(zio->io_abd) || !rr->rr_zia_handle) { + return (ZIA_FALLBACK); + } + + void **new_parity_cols[VDEV_RAIDZ_MAXPARITY]; + size_t new_parity_sizes[VDEV_RAIDZ_MAXPARITY]; + int c = 0; + + for (c = 0; c < rr->rr_firstdatacol; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + + new_parity_cols[c] = NULL; + new_parity_sizes[c] = 0; + + /* this parity column was not reconstructed */ + if (!rc->rc_tried || rc->rc_error != 0) + continue; + + /* the provider updates the handle */ + new_parity_cols[c] = &ABD_HANDLE(rc->rc_abd); + new_parity_sizes[c] = rc->rc_size; + } + + if (c != rr->rr_firstdatacol) { + return (ZIA_FALLBACK); + } + + /* + * allocate space for parity columns and + * assign them to the raidz struct + */ + return (translate_rc(dpusm->raid.new_parity(rr->rr_zia_handle, + rr->rr_firstdatacol, new_parity_cols, new_parity_sizes))); +} + +int +zia_raidz_cmp(abd_t *lhs, abd_t *rhs, int *diff) +{ + if (!lhs || !rhs || !diff) { + return (ZIA_ERROR); + } + + if (lhs == rhs) { + *diff = 0; + return (ZIA_OK); + } + + void *lhs_handle = ABD_HANDLE(lhs); + void *rhs_handle = ABD_HANDLE(rhs); + if (!lhs_handle || !rhs_handle) { + return (ZIA_ERROR); + } + + return (translate_rc(dpusm->raid.cmp(lhs_handle, rhs_handle, diff))); +} + +int +zia_raidz_rec(raidz_row_t *rr, int *t, int nt) +{ + if (!dpusm) { + return (ZIA_FALLBACK); + } + + /* can only pass if raidz_alloc succeeded */ + if (!rr->rr_zia_handle) { + return (ZIA_FALLBACK); + } + + return (translate_rc(zia_raidz_rec_impl(dpusm, rr, t, nt))); +} + +int +zia_raidz_rec_cleanup(zio_t *zio, raidz_row_t *rr, + boolean_t local_offload, boolean_t onload_parity) +{ + return (zia_raidz_cleanup(zio, rr, + local_offload, onload_parity)); +} + +int +zia_file_open(vdev_t *vdev, const char *path, + int flags, int mode) +{ + if (!vdev || !vdev->vdev_spa) { + return (ZIA_ERROR); + } + + void *provider = zia_get_props(vdev->vdev_spa)->provider; + if (!dpusm || !provider) { + return (ZIA_FALLBACK); + } + + if (!VDEV_HANDLE(vdev)) { + VDEV_HANDLE(vdev) = dpusm->file.open(provider, + path, flags, mode); + } + + return (VDEV_HANDLE(vdev)?ZIA_OK:ZIA_ERROR); +} + +int +zia_file_write(vdev_t *vdev, abd_t *abd, ssize_t size, + loff_t offset, ssize_t *resid, int *err) +{ + if (!vdev || !abd) { + return (ZIA_ERROR); + } + + if (!dpusm || !VDEV_HANDLE(vdev) || !ABD_HANDLE(abd)) { + return (ZIA_FALLBACK); + } + + size_t trailing_zeros = 0; + size_t data_size = size; + + /* can only happen with raidz */ + if (abd_is_gang(abd)) { + abd_t *cabd = list_head(&ABD_GANG(abd).abd_gang_chain); + trailing_zeros = size - cabd->abd_size; + data_size = cabd->abd_size; + } + + return 
(dpusm->file.write(VDEV_HANDLE(vdev), + ABD_HANDLE(abd), data_size, trailing_zeros, offset, resid, err)); +} + +int +zia_file_close(vdev_t *vdev) +{ + if (!vdev) { + return (ZIA_ERROR); + } + + if (!dpusm || !VDEV_HANDLE(vdev)) { + return (ZIA_FALLBACK); + } + + dpusm->file.close(VDEV_HANDLE(vdev)); + VDEV_HANDLE(vdev) = NULL; + zia_get_props(vdev->vdev_spa)->min_offload_size = 0; + + return (ZIA_OK); +} + +#ifdef _KERNEL +int +zia_disk_open(vdev_t *vdev, const char *path, + struct block_device *bdev) +{ + if (!vdev || !vdev->vdev_spa) { + return (ZIA_ERROR); + } + + void *provider = zia_get_props(vdev->vdev_spa)->provider; + if (!dpusm || !provider) { + return (ZIA_FALLBACK); + } + + if (!VDEV_HANDLE(vdev)) { + VDEV_HANDLE(vdev) = dpusm->disk.open(provider, + path, bdev); + } + + return (VDEV_HANDLE(vdev)?ZIA_OK:ZIA_ERROR); +} + +int +zia_disk_invalidate(vdev_t *vdev) +{ + if (!vdev) { + return (ZIA_ERROR); + } + + if (!dpusm || !VDEV_HANDLE(vdev)) { + return (ZIA_FALLBACK); + } + + return (translate_rc(dpusm->disk.invalidate(VDEV_HANDLE(vdev)))); +} + +int +zia_disk_write(vdev_t *vdev, zio_t *zio, size_t io_size, + uint64_t io_offset, int flags) +{ + if (!vdev || !zio->io_abd) { + return (ZIA_ERROR); + } + + if (!dpusm || !ABD_HANDLE(zio->io_abd) || !VDEV_HANDLE(vdev)) { + return (ZIA_FALLBACK); + } + + size_t trailing_zeros = 0; + size_t data_size = io_size; + + /* can only happen with raidz */ + if (abd_is_gang(zio->io_abd)) { + abd_t *cabd = list_head(&ABD_GANG(zio->io_abd).abd_gang_chain); + trailing_zeros = io_size - cabd->abd_size; + data_size = cabd->abd_size; + } + + return (dpusm->disk.write(VDEV_HANDLE(vdev), ABD_HANDLE(zio->io_abd), + data_size, trailing_zeros, io_offset, + flags, zia_disk_write_completion, zio)); +} + +int +zia_disk_flush(vdev_t *vdev, zio_t *zio) +{ + if (!vdev) { + return (ZIA_ERROR); + } + + if (!dpusm || !VDEV_HANDLE(vdev)) { + return (ZIA_FALLBACK); + } + + return (dpusm->disk.flush(VDEV_HANDLE(vdev), + zia_disk_flush_completion, zio)); +} + +int +zia_disk_close(vdev_t *vdev) +{ + if (!vdev) { + return (ZIA_ERROR); + } + + void *handle = VDEV_HANDLE(vdev); + VDEV_HANDLE(vdev) = NULL; + + zia_get_props(vdev->vdev_spa)->min_offload_size = 0; + + if (!dpusm || !handle) { + return (ZIA_FALLBACK); + } + + /* trust that ZFS handles closing disks once */ + dpusm->disk.close(handle); + + return (ZIA_OK); +} +#endif + +#endif diff --git a/module/zfs/zia_cddl.c b/module/zfs/zia_cddl.c new file mode 100644 index 000000000000..18caca510a7b --- /dev/null +++ b/module/zfs/zia_cddl.c @@ -0,0 +1,232 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifdef ZIA + +#include +#include +#include +#include +#include +#include +#include + +/* basically a duplicate of zio_compress_data */ +int +zia_compress_impl(const dpusm_uf_t *dpusm, void *provider, + zio_t *zio, size_t s_len, enum zio_compress c, uint8_t level, + void **cbuf_handle, uint64_t *c_len, boolean_t *local_offload) +{ + size_t d_len; + uint8_t complevel; + zio_compress_info_t *ci = &zio_compress_table[c]; + abd_t *src = zio->io_abd; + zia_props_t *props = zia_get_props(zio->io_spa); + + ASSERT((uint_t)c < ZIO_COMPRESS_FUNCTIONS); + ASSERT((uint_t)c == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL); + + /* + * If the data is all zeros, we don't even need to allocate + * a block for it. We indicate this by returning zero size. + */ + if (!ABD_HANDLE(src)) { + /* check that compression can be done before offloading */ + dpusm_pc_t *caps = NULL; + if ((zia_get_capabilities(provider, &caps) != ZIA_OK) || + !(caps->compress & translate_compress(c))) { + return (ZIA_FALLBACK); + } + + /* check in-memory buffer */ + if (abd_iterate_func(src, 0, s_len, + zio_compress_zeroed_cb, NULL) == 0) { + *c_len = 0; + return (ZIA_OK); + } + + if (zia_offload_abd(provider, src, s_len, + props->min_offload_size, local_offload) != ZIA_OK) { + return (ZIA_ERROR); + } + } else { + /* came in offloaded - make sure provider can compress */ + *local_offload = B_FALSE; + + void *old_provider = dpusm->extract(ABD_HANDLE(src)); + if (old_provider != provider) { + return (ZIA_PROVIDER_MISMATCH); + } + + dpusm_pc_t *caps = NULL; + if ((zia_get_capabilities(provider, &caps) != ZIA_OK) || + !(caps->compress & translate_compress(c))) { + return (ZIA_FALLBACK); + } + + /* use provider to check */ + if (dpusm->all_zeros(ABD_HANDLE(src), 0, s_len) == DPUSM_OK) { + *c_len = 0; + return (ZIA_OK); + } + } + + if (c == ZIO_COMPRESS_EMPTY) { + *c_len = s_len; + return (ZIA_OK); + } + + /* Compress at least 12.5% */ + d_len = s_len - (s_len >> 3); + + complevel = ci->ci_level; + + if (c == ZIO_COMPRESS_ZSTD) { + /* If we don't know the level, we can't compress it */ + if (level == ZIO_COMPLEVEL_INHERIT) { + *c_len = s_len; + return (ZIA_OK); + } + + if (level == ZIO_COMPLEVEL_DEFAULT) + complevel = ZIO_ZSTD_LEVEL_DEFAULT; + else + complevel = level; + + ASSERT3U(complevel, !=, ZIO_COMPLEVEL_INHERIT); + } + + /* nothing to offload, so just allocate space */ + *cbuf_handle = zia_alloc(provider, s_len, props->min_offload_size); + if (!*cbuf_handle) { + zia_cleanup_abd(src, s_len, + local_offload?*local_offload:B_FALSE); + return (ZIA_ERROR); + } + + /* DPUSM interface takes in a size_t, not a uint64_t */ + size_t zia_c_len = 0; + if (dpusm->compress(translate_compress(c), + ABD_HANDLE(src), *cbuf_handle, s_len, + (int8_t)level, &zia_c_len) != DPUSM_OK) { + zia_free(cbuf_handle); + return (ZIA_FALLBACK); + } + + *c_len = zia_c_len; + + /* + * Return ZIA_OK because this is not an error - it just didn't + * compress well. The data will be dropped later on (instead of + * onloaded) because c_len is too big. 
+ */ + if (*c_len > d_len) { + *c_len = s_len; + } + + return (ZIA_OK); +} + +int +zia_checksum_error_impl(const dpusm_uf_t *dpusm, + const blkptr_t *bp, enum zio_checksum alg, + abd_t *abd, uint64_t size, zio_bad_cksum_t *info) +{ + zio_cksum_t actual_cksum; + zio_cksum_t expected_cksum = bp->blk_cksum; + int byteswap = BP_SHOULD_BYTESWAP(bp); + + if (dpusm->checksum(translate_checksum(alg), + translate_byteorder(ZIO_CHECKSUM_NATIVE), + ABD_HANDLE(abd), size, + actual_cksum.zc_word, + sizeof (actual_cksum.zc_word)) != DPUSM_OK) { + return (ZIA_ERROR); + } + + if (info != NULL) { + zio_checksum_info_t *ci = &zio_checksum_table[alg]; + info->zbc_expected = expected_cksum; + info->zbc_actual = actual_cksum; + info->zbc_checksum_name = ci->ci_name; + info->zbc_byteswapped = byteswap; + info->zbc_injected = 0; + info->zbc_has_cksum = 1; + } + + if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) + return (SET_ERROR(ECKSUM)); + + return (ZIA_OK); +} + +int +zia_raidz_rec_impl(const dpusm_uf_t *dpusm, + raidz_row_t *rr, int *t, int nt) +{ + int tgts[VDEV_RAIDZ_MAXPARITY]; + int ntgts = 0; + for (int i = 0, c = 0; c < rr->rr_cols; c++) { + if (i < nt && c == t[i]) { + tgts[ntgts++] = c; + i++; + } else if (rr->rr_col[c].rc_error != 0) { + tgts[ntgts++] = c; + } + } + + ASSERT(ntgts >= nt); + + return (dpusm->raid.rec(rr->rr_zia_handle, + tgts, ntgts)); +} + +#ifdef _KERNEL +/* called by provider */ +void +zia_disk_write_completion(void *zio_ptr, int error) +{ + zio_t *zio = (zio_t *)zio_ptr; + zio->io_error = error; + ASSERT3S(zio->io_error, >=, 0); + if (zio->io_error) + vdev_disk_error(zio); + + zio_delay_interrupt(zio); +} + +/* called by provider */ +void +zia_disk_flush_completion(void *zio_ptr, int error) +{ + zio_t *zio = (zio_t *)zio_ptr; + + if (zio->io_error && (zio->io_error == EOPNOTSUPP)) + zio->io_vd->vdev_nowritecache = B_TRUE; + + ASSERT3S(zio->io_error, >=, 0); + if (zio->io_error) + vdev_disk_error(zio); + zio_interrupt(zio); +} +#endif + +#endif diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 1c9f598b7d13..b72f9e78f5ef 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -52,6 +52,10 @@ #include #include +#ifdef ZIA +#include +#endif + /* * ========================================================================== * I/O type descriptions @@ -877,6 +881,15 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_add_child(pio, zio); } +#ifdef ZIA + /* turn off encryption and dedup if Z.I.A. 
is used */ + if (zia_is_used(zio) == B_TRUE) { + zio->io_prop.zp_dedup = B_FALSE; + zio->io_prop.zp_dedup_verify = B_FALSE; + zio->io_prop.zp_encrypt = B_FALSE; + } +#endif + taskq_init_ent(&zio->io_tqent); return (zio); @@ -1695,15 +1708,56 @@ zio_write_compress(zio_t *zio) if (compress != ZIO_COMPRESS_OFF && !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) { void *cbuf = zio_buf_alloc(lsize); +#ifdef ZIA + int zia_rc = ZIA_FALLBACK; + void *cbuf_handle = NULL; /* only valid if zia_rc == ZIA_OK */ + zia_props_t *zia_props = zia_get_props(spa); + boolean_t local_offload = B_FALSE; + /* real data is compressed on the offloader */ + if (!zp->zp_ismd && + (zia_props->compress == 1)) { + zia_rc = zia_compress(zia_props->provider, zio, lsize, + compress, zp->zp_complevel, + &cbuf_handle, &psize, &local_offload); + } + + if (zia_rc != ZIA_OK) { + zia_cleanup_abd(zio->io_abd, lsize, local_offload); +#endif psize = zio_compress_data(compress, zio->io_abd, cbuf, lsize, zp->zp_complevel); +#ifdef ZIA + } +#endif + if (psize == 0 || psize >= lsize) { compress = ZIO_COMPRESS_OFF; +#ifdef ZIA + /* no need for offloaded compressed buffer any more */ + zia_free(&cbuf_handle); + + /* source abd is still offloaded */ +#endif zio_buf_free(cbuf, lsize); } else if (!zp->zp_dedup && !zp->zp_encrypt && psize <= BPE_PAYLOAD_SIZE && zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) && spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) { +#ifdef ZIA + /* + * compressed enough, but not handling embedded + * data, so move compressed data back into memory + */ + zia_onload(&cbuf_handle, cbuf, psize); + + /* + * remove offloaded source abd + * + * in-memory copy should still be valid, but calling + * zia_cleanup_abd just in case + */ + zia_cleanup_abd(zio->io_abd, lsize, local_offload); +#endif encode_embedded_bp_compressed(bp, cbuf, compress, lsize, psize); BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA); @@ -1729,15 +1783,70 @@ zio_write_compress(zio_t *zio) spa->spa_min_alloc); if (rounded >= lsize) { compress = ZIO_COMPRESS_OFF; +#ifdef ZIA + /* + * don't need offloaded compressed + * buffer any more + */ + zia_free(&cbuf_handle); +#endif zio_buf_free(cbuf, lsize); psize = lsize; } else { abd_t *cdata = abd_get_from_buf(cbuf, lsize); abd_take_ownership_of_buf(cdata, B_TRUE); +#ifdef ZIA + /* real data */ + if (!zp->zp_ismd && cbuf_handle) { + /* source abd no longer needed */ + zia_free_abd(zio->io_abd, B_FALSE); + + /* + * compressed enough, so associate the + * compressed buffer with the abd + */ + zia_move_into_abd(cdata, &cbuf_handle); + if (zia_zero_fill(cdata, psize, + rounded - psize) != ZIA_OK) { + /* + * if setting cdata's handle + * fails, onload the compressed + * buffer (automatically placing + * it into cdata) and continue + * using zfs + * + * if cbuf is not offloaded, + * nothing happens + */ + zia_onload(&cbuf_handle, + cbuf, lsize); + } + } +#endif abd_zero_off(cdata, psize, rounded - psize); psize = rounded; +#ifdef ZIA + /* + * metadata + * + * offload here to zero fill buffer in + * memory instead of calling provider + */ + if (zp->zp_ismd && + (zia_props->compress == 1)) { + zia_offload_abd(zia_props->provider, + cdata, psize, + zia_props->min_offload_size, NULL); + } +#endif zio_push_transform(zio, cdata, psize, lsize, NULL); +#ifdef ZIA + if (zia_is_offloaded(zio->io_abd)) { + zio->io_flags |= + ZIO_FLAG_DONT_AGGREGATE; + } +#endif } } @@ -3773,6 +3882,15 @@ zio_vdev_io_start(zio_t *zio) if (zio->io_type == ZIO_TYPE_WRITE) { abd_copy(abuf, zio->io_abd, zio->io_size); abd_zero_off(abuf, zio->io_size, 
asize - zio->io_size); +#ifdef ZIA + /* + * The Z.I.A. handles of the abds that come here + * were not modified and do not get associated with + * abuf during the transform. Instead of dropping + * the handle and delaying here, let abd_free clean + * it up later. + */ +#endif } zio_push_transform(zio, abuf, asize, asize, zio_subblock); } @@ -3969,6 +4087,10 @@ zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr) { void *abd = abd_alloc_sametype(zio->io_abd, zio->io_size); +#ifdef ZIA + zia_onload_abd(zio->io_abd, zio->io_size, B_FALSE); +#endif + abd_copy(abd, zio->io_abd, zio->io_size); zcr->zcr_cbinfo = zio->io_size; @@ -5029,6 +5151,10 @@ EXPORT_SYMBOL(zio_data_buf_alloc); EXPORT_SYMBOL(zio_buf_free); EXPORT_SYMBOL(zio_data_buf_free); +#ifdef ZIA +EXPORT_SYMBOL(zio_push_transform); +#endif + ZFS_MODULE_PARAM(zfs_zio, zio_, slow_io_ms, INT, ZMOD_RW, "Max I/O completion time (milliseconds) before marking it as slow"); diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c index c7368ac26a09..b47ff61fc180 100644 --- a/module/zfs/zio_checksum.c +++ b/module/zfs/zio_checksum.c @@ -33,6 +33,10 @@ #include #include +#ifdef ZIA +#include +#endif + /* * Checksum vectors. * @@ -358,6 +362,10 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, size_t eck_offset; memset(&saved, 0, sizeof (zio_cksum_t)); +#ifdef ZIA + /* not handling embedded checksums, so bring back data */ + zia_cleanup_abd(abd, size, B_FALSE); +#endif if (checksum == ZIO_CHECKSUM_ZILOG2) { zil_chain_t zilc; @@ -400,8 +408,29 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, sizeof (zio_cksum_t)); } else { saved = bp->blk_cksum; + +#ifdef ZIA + int zia_rc = ZIA_ERROR; + zia_props_t *props = zia_get_props(spa); + + /* only offload non-embedded checksums */ + boolean_t local_offload = B_FALSE; + if (props->checksum == 1) { + zia_rc = zia_checksum_compute(props->provider, &cksum, + checksum, zio, size, &local_offload); + } + + /* fall back to ZFS implementation */ + if (zia_rc != ZIA_OK) { + zia_cleanup_abd(abd, size, local_offload); +#endif ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum], &cksum); +#ifdef ZIA + } else { + zio->io_flags |= ZIO_FLAG_DONT_AGGREGATE; + } +#endif if (BP_USES_CRYPT(bp) && BP_GET_TYPE(bp) != DMU_OT_OBJSET) zio_checksum_handle_crypt(&cksum, &saved, insecure); bp->blk_cksum = cksum; @@ -539,8 +568,27 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) abd_t *data = zio->io_abd; spa_t *spa = zio->io_spa; +#ifdef ZIA + error = ZIA_FALLBACK; + + if (zia_get_props(zio->io_spa)->checksum == 1) { + zio_checksum_info_t *ci = &zio_checksum_table[checksum]; + if (!(ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED)) { + error = zia_checksum_error(bp, + checksum, data, size, info); + } + } + + /* fall back to ZFS implementation */ + if ((error != ZIA_OK) && (error != ECKSUM)) { + /* data was modified by reconstruction */ + zia_onload_abd(data, size, B_FALSE); +#endif error = zio_checksum_error_impl(spa, bp, checksum, data, size, offset, info); +#ifdef ZIA + } +#endif if (zio_injection_enabled && error == 0 && zio->io_error == 0) { error = zio_handle_fault_injection(zio, ECKSUM); diff --git a/module/zfs/zio_compress.c b/module/zfs/zio_compress.c index 717395dcf456..40794d5a9a06 100644 --- a/module/zfs/zio_compress.c +++ b/module/zfs/zio_compress.c @@ -111,7 +111,11 @@ zio_compress_select(spa_t *spa, enum zio_compress child, return (result); } +#ifndef ZIA static int +#else +int +#endif zio_compress_zeroed_cb(void *data, size_t len, void *private) { (void) private; 
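The zio.c, zio_checksum.c, and zio_compress.c hunks above all follow the same control-flow shape: attempt the operation through Z.I.A. first, and on any status other than ZIA_OK bring the data back into host memory (or drop the offloaded copy) and fall through to the unmodified software path. A condensed sketch of that pattern for the compression case, for illustration only and not an additional hunk; it uses only names that appear in the zio_write_compress() hunk above, and context variables such as lsize, psize, compress, zp, cbuf, and spa are assumed to be in scope as they are there:

	boolean_t local_offload = B_FALSE;
	void *cbuf_handle = NULL;	/* only valid if zia_rc == ZIA_OK */
	int zia_rc = ZIA_FALLBACK;
	zia_props_t *zia_props = zia_get_props(spa);

	/* real (non-metadata) data is compressed on the offloader */
	if (!zp->zp_ismd && (zia_props->compress == 1)) {
		zia_rc = zia_compress(zia_props->provider, zio, lsize,
		    compress, zp->zp_complevel, &cbuf_handle, &psize,
		    &local_offload);
	}

	if (zia_rc != ZIA_OK) {
		/* onload or drop the offloaded copy, then use software */
		zia_cleanup_abd(zio->io_abd, lsize, local_offload);
		psize = zio_compress_data(compress, zio->io_abd, cbuf,
		    lsize, zp->zp_complevel);
	}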
diff --git a/module/zia-software-provider/kernel_offloader.c b/module/zia-software-provider/kernel_offloader.c new file mode 100644 index 000000000000..875f454c0708 --- /dev/null +++ b/module/zia-software-provider/kernel_offloader.c @@ -0,0 +1,766 @@ +/* + * © 2021. Triad National Security, LLC. All rights reserved. + * + * This program was produced under U.S. Government contract + * 89233218CNA000001 for Los Alamos National Laboratory (LANL), which + * is operated by Triad National Security, LLC for the U.S. + * Department of Energy/National Nuclear Security Administration. All + * rights in the program are reserved by Triad National Security, LLC, + * and the U.S. Department of Energy/National Nuclear Security + * Administration. The Government is granted for itself and others + * acting on its behalf a nonexclusive, paid-up, irrevocable worldwide + * license in this material to reproduce, prepare derivative works, + * distribute copies to the public, perform publicly and display + * publicly, and to permit others to do so. + * + * ---- + * + * This program is open source under the BSD-3 License. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kernel_offloader.h" + +static const char NAME[] = "Kernel Offloader"; +static const size_t NAME_LEN = sizeof (NAME); + +typedef enum kernel_offloader_handle_type { + KOH_REAL, /* default type - convert all data into a single blob */ + KOH_REFERENCE, + + KOH_INVALID, +} koht_t; + +/* offloaded data */ +typedef struct kernel_offloader_handle { + koht_t type; + void *ptr; + size_t size; +} koh_t; + +/* never decreases */ +static atomic_t total_count; +static atomic_t total_size; +static atomic_t total_actual; + +/* currently active */ +static atomic_t active_count; +static atomic_t active_size; +static atomic_t active_actual; + +/* + * value used to swizzle the pointer so that + * dereferencing the handle will fail + */ +static void *mask = NULL; +void +kernel_offloader_init(void) +{ + get_random_bytes(&mask, sizeof (mask)); + atomic_set(&total_count, 0); + atomic_set(&total_size, 0); + atomic_set(&total_actual, 0); + atomic_set(&active_count, 0); + atomic_set(&active_size, 0); + atomic_set(&active_actual, 0); + printk("kernel offloader init: %p\n", mask); +} + +void +kernel_offloader_fini(void) +{ + mask = NULL; + + printk("kernel offloader fini with " + "%d/%d bytes in %d allocations " + "(actual %d/%d bytes in %d allocations) " + "remaining\n", + atomic_read(&active_size), + atomic_read(&total_size), + atomic_read(&active_count), + atomic_read(&active_actual), + atomic_read(&total_actual), + atomic_read(&total_count)); +} + +/* get a starting address of a linear koh_t */ +static void * +ptr_start(koh_t *koh, size_t offset) +{ + return (void *)(((uintptr_t)koh->ptr) + offset); +} + +/* + * convert the actual pointer to a handle (pretend + * the data is not accessible from the Z.I.A. 
base) + */ +static void * +swizzle(void *ptr) +{ + return (ptr?((void *)(((uintptr_t)ptr) ^ ((uintptr_t)mask))):NULL); +} + +/* convert the handle to a usable pointer */ +static void * +unswizzle(void *handle) +{ + return (swizzle(handle)); +} + +static koh_t * +koh_alloc(size_t size) +{ + koh_t *koh = kmalloc(sizeof (koh_t), GFP_KERNEL); + if (koh) { + koh->type = KOH_REAL; + koh->ptr = kmalloc(size, GFP_KERNEL); + koh->size = size; + + /* the allocation itself */ + atomic_add(1, &total_count); + atomic_add(1, &active_count); + atomic_add(size, &total_size); + atomic_add(size, &active_size); + atomic_add(size, &total_actual); + atomic_add(size, &active_actual); + + /* the wrapper struct */ + atomic_add(1, &total_count); + atomic_add(1, &active_count); + atomic_add(sizeof (koh_t), &total_size); + atomic_add(sizeof (koh_t), &active_size); + } + + return (koh); +} + +static koh_t * +koh_alloc_ref(koh_t *src, size_t offset, size_t size) +{ + koh_t *ref = NULL; + if (src) { + koh_t *src_koh = (koh_t *)src; + + if ((offset + size) > src_koh->size) { + printk("Error: Cannot reference handle of size %zu " + "starting at offset %zu with size %zu\n", + src_koh->size, offset, size); + return (NULL); + } + + ref = kmalloc(sizeof (koh_t), GFP_KERNEL); + if (ref) { + ref->type = KOH_REFERENCE; + + /* same underlying buffer */ + ref->ptr = ptr_start(src, offset); + + /* should probably check offset + size < src->size */ + ref->size = size; + + /* no new requested space */ + + /* the wrapper struct */ + atomic_add(1, &total_count); + atomic_add(1, &active_count); + atomic_add(sizeof (koh_t), &total_size); + atomic_add(sizeof (koh_t), &active_size); + } + } + + return (ref); +} + +int +kernel_offloader_get_size(void *handle, size_t *size, size_t *actual) +{ + koh_t *koh = (koh_t *)unswizzle(handle); + + if (size) { + *size = koh->size; + } + + if (actual) { + *actual = koh->size; + } + + return (KERNEL_OFFLOADER_OK); +} + +static void +koh_free(koh_t *koh) +{ + if (koh) { + switch (koh->type) { + case KOH_REAL: + /* the allocation itself */ + atomic_sub(1, &active_count); + atomic_sub(koh->size, &active_size); + atomic_sub(koh->size, &active_actual); + kfree(koh->ptr); + break; + case KOH_REFERENCE: + case KOH_INVALID: + default: + break; + } + + /* the wrapper struct */ + atomic_sub(1, &active_count); + atomic_sub(sizeof (koh_t), &active_size); + kfree(koh); + } +} + +void * +kernel_offloader_alloc(size_t size) +{ + return (swizzle(koh_alloc(size))); +} + +void * +kernel_offloader_alloc_ref(void *src_handle, size_t offset, size_t size) +{ + return swizzle(koh_alloc_ref(unswizzle(src_handle), + offset, size)); +} + +void +kernel_offloader_free(void *handle) +{ + koh_free(unswizzle(handle)); +} + +int +kernel_offloader_copy_from_mem(void *handle, size_t offset, + const void *src, size_t size) +{ + koh_t *koh = (koh_t *)unswizzle(handle); + if (!koh) { + return (KERNEL_OFFLOADER_ERROR); + } + + if ((offset + size) > koh->size) { + return (KERNEL_OFFLOADER_ERROR); + } + + void *dst = ptr_start(koh, offset); + if (memcpy(dst, src, size) != dst) { + return (KERNEL_OFFLOADER_ERROR); + } + return (KERNEL_OFFLOADER_OK); +} + +int +kernel_offloader_copy_to_mem(void *handle, size_t offset, + void *dst, size_t size) +{ + koh_t *koh = (koh_t *)unswizzle(handle); + if (!koh) { + return (KERNEL_OFFLOADER_ERROR); + } + + if ((offset + size) > koh->size) { + return (KERNEL_OFFLOADER_ERROR); + } + + if (memcpy(dst, ptr_start(koh, offset), size) != dst) { + return (KERNEL_OFFLOADER_ERROR); + } + + return 
(KERNEL_OFFLOADER_OK); +} + +int +kernel_offloader_cmp(void *lhs_handle, void *rhs_handle, int *diff) +{ + koh_t *lhs = (koh_t *)unswizzle(lhs_handle); + koh_t *rhs = (koh_t *)unswizzle(rhs_handle); + + if (!lhs || !rhs || !diff) { + return (KERNEL_OFFLOADER_ERROR); + } + + size_t len = rhs->size; + if (lhs->size != rhs->size) { + len = + (lhs->size < rhs->size)?lhs->size:rhs->size; + } + + *diff = memcmp(ptr_start(lhs, 0), + ptr_start(rhs, 0), len); + + return (KERNEL_OFFLOADER_OK); +} + +int +kernel_offloader_zero_fill(void *handle, size_t offset, size_t size) +{ + koh_t *koh = (koh_t *)unswizzle(handle); + memset(ptr_start(koh, offset), 0, size); + return (KERNEL_OFFLOADER_OK); +} + +int +kernel_offloader_all_zeros(void *handle, size_t offset, size_t size) +{ + koh_t *koh = (koh_t *)unswizzle(handle); + if (koh->size - offset < size) { + return (KERNEL_OFFLOADER_ERROR); + } + + uint64_t *array = ptr_start(koh, offset); + size_t i; + for (i = 0; i < size / sizeof (uint64_t); i++) { + if (array[i]) { + return (KERNEL_OFFLOADER_BAD_RESULT); + } + } + + char *remaining = ptr_start(koh, offset); + for (i *= sizeof (uint64_t); i < size; i++) { + if (remaining[i]) { + return (KERNEL_OFFLOADER_BAD_RESULT); + } + } + + return (KERNEL_OFFLOADER_OK); +} + +int +kernel_offloader_mem_stats( + void *t_count_handle, void *t_size_handle, void *t_actual_handle, + void *a_count_handle, void *a_size_handle, void *a_actual_handle) +{ + if (t_count_handle) { + *(size_t *)ptr_start(t_count_handle, 0) = + atomic_read(&total_count); + } + + if (t_size_handle) { + *(size_t *)ptr_start(t_size_handle, 0) = + atomic_read(&total_size); + } + + if (t_actual_handle) { + *(size_t *)ptr_start(t_actual_handle, 0) = + atomic_read(&total_actual); + } + + if (a_count_handle) { + *(size_t *)ptr_start(a_count_handle, 0) = + atomic_read(&active_count); + } + + if (a_size_handle) { + *(size_t *)ptr_start(a_size_handle, 0) = + atomic_read(&active_size); + } + + if (a_actual_handle) { + *(size_t *)ptr_start(a_actual_handle, 0) = + atomic_read(&active_actual); + } + + return (KERNEL_OFFLOADER_OK); +} + +/* specific implementation */ +static int +kernel_offloader_gzip_compress(koh_t *src, koh_t *dst, + size_t s_len, int level, size_t *c_len) +{ + *c_len = dst->size; + + if (z_compress_level(ptr_start(dst, 0), c_len, ptr_start(src, 0), + s_len, level) != Z_OK) { + if (*c_len != src->size) { + return (KERNEL_OFFLOADER_ERROR); + } + return (KERNEL_OFFLOADER_OK); + } + + return (KERNEL_OFFLOADER_OK); +} + +int +kernel_offloader_compress(dpusm_compress_t alg, + void *src, void *dst, size_t s_len, int level, + void *c_len) +{ + int status = KERNEL_OFFLOADER_UNAVAILABLE; + koh_t *src_koh = NULL; + koh_t *dst_koh = NULL; + koh_t *c_len_koh = NULL; + if (!src || !dst || !c_len) { + return (KERNEL_OFFLOADER_ERROR); + } + + src_koh = (koh_t *)unswizzle(src); + dst_koh = (koh_t *)unswizzle(dst); + c_len_koh = (koh_t *)unswizzle(c_len); + + if ((DPUSM_COMPRESS_GZIP_1 <= alg) && + (alg <= DPUSM_COMPRESS_GZIP_9)) { + status = kernel_offloader_gzip_compress(src_koh, dst_koh, s_len, + level, (size_t *)ptr_start(c_len_koh, 0)); + } + + return (status); +} + +/* specific implementation */ +static int +kernel_offloader_gzip_decompress(koh_t *src, koh_t *dst, + int level, size_t *c_len) +{ + if (z_uncompress(ptr_start(dst, 0), c_len, ptr_start(src, 0), + src->size) != Z_OK) { + return (KERNEL_OFFLOADER_ERROR); + } + + return (KERNEL_OFFLOADER_OK); +} + +int +kernel_offloader_decompress(dpusm_compress_t alg, + void *src, void *dst, int level) +{ + int 
status = KERNEL_OFFLOADER_UNAVAILABLE; + koh_t *src_koh = (koh_t *)unswizzle(src); + koh_t *dst_koh = (koh_t *)unswizzle(dst); + + size_t d_len = 0; + + if ((DPUSM_COMPRESS_GZIP_1 <= alg) && + (alg <= DPUSM_COMPRESS_GZIP_9)) { + status = kernel_offloader_gzip_decompress(src_koh, dst_koh, + level, &d_len); + } + + return (status); +} + +int +kernel_offloader_checksum(dpusm_checksum_t alg, + dpusm_checksum_byteorder_t order, void *data, size_t size, + void *cksum, size_t cksum_size) +{ + koh_t *data_koh = (koh_t *)unswizzle(data); + + if (!data_koh) { + return (KERNEL_OFFLOADER_ERROR); + } + + if ((alg != DPUSM_CHECKSUM_FLETCHER_2) && + (alg != DPUSM_CHECKSUM_FLETCHER_4)) { + return (KERNEL_OFFLOADER_ERROR); + } + + zio_cksum_t zcp; + if (cksum_size < sizeof (zcp.zc_word)) { + return (KERNEL_OFFLOADER_ERROR); + } + + /* compute checksum */ + + void *buf = ptr_start(data_koh, 0); + + if (alg == DPUSM_CHECKSUM_FLETCHER_2) { + if (order == DPUSM_BYTEORDER_NATIVE) { + fletcher_2_native(buf, size, NULL, &zcp); + } else { + fletcher_2_byteswap(buf, size, NULL, &zcp); + } + } else if (alg == DPUSM_CHECKSUM_FLETCHER_4) { + if (order == DPUSM_BYTEORDER_NATIVE) { + fletcher_4_native(buf, size, NULL, &zcp); + } else { + fletcher_4_byteswap(buf, size, NULL, &zcp); + } + } + + memcpy(cksum, zcp.zc_word, sizeof (zcp.zc_word)); + + return (DPUSM_OK); +} + +void * +kernel_offloader_raidz_alloc(size_t nparity, size_t ndata, + void **col_handles, size_t *col_sizes) +{ + const size_t ncols = nparity + ndata; + raidz_row_t *rr = kmalloc(offsetof(raidz_row_t, + rr_col[ncols]), GFP_KERNEL); + rr->rr_cols = ncols; + rr->rr_firstdatacol = nparity; + + for (size_t c = 0; c < ncols; c++) { + koh_t *koh = (koh_t *)unswizzle(col_handles[c]); + raidz_col_t *rc = &rr->rr_col[c]; + memset(rc, 0, sizeof (raidz_row_t)); + + rc->rc_abd = abd_get_from_buf(koh->ptr, koh->size); + abd_release_ownership_of_buf(rc->rc_abd); + rc->rc_size = col_sizes[c]; + } + + return (swizzle(rr)); +} + +/* attaches a column to the raidz struct */ +int +kernel_offloader_raidz_set_col(void *raidz, uint64_t c, + void *col, size_t size) +{ + raidz_row_t *rr = (raidz_row_t *)unswizzle(raidz); + koh_t *koh = (koh_t *)unswizzle(col); + + if (!rr || !koh) { + return (KERNEL_OFFLOADER_ERROR); + } + + /* c is too big */ + if (c >= rr->rr_cols) { + return (KERNEL_OFFLOADER_ERROR); + } + + /* parity column */ + if (c < rr->rr_firstdatacol) { + /* must be a real allocation */ + if (koh->type != KOH_REAL) { + return (KERNEL_OFFLOADER_ERROR); + } + } + /* data column */ + else { + /* needs to be a reference */ + if (koh->type != KOH_REFERENCE) { + return (KERNEL_OFFLOADER_ERROR); + } + } + + /* "active" size is larger than allocated size */ + if (size > koh->size) { + return (KERNEL_OFFLOADER_ERROR); + } + + raidz_col_t *rc = &rr->rr_col[c]; + rc->rc_abd = abd_get_from_buf(koh->ptr, size); + abd_release_ownership_of_buf(rc->rc_abd); + rc->rc_size = size; + + return (KERNEL_OFFLOADER_OK); +} + +void +kernel_offloader_raidz_free(void *raidz) +{ + raidz_row_t *rr = (raidz_row_t *)unswizzle(raidz); + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + abd_free(rc->rc_abd); + } + kfree(rr); +} + +int +kernel_offloader_raidz_gen(void *raidz) +{ + raidz_row_t *rr = (raidz_row_t *)unswizzle(raidz); + if (!rr) { + return (KERNEL_OFFLOADER_ERROR); + } + + switch (rr->rr_firstdatacol) { + case 1: + vdev_raidz_generate_parity_p(rr); + break; + case 2: + vdev_raidz_generate_parity_pq(rr); + break; + case 3: + vdev_raidz_generate_parity_pqr(rr); + 
break; + } + + return (KERNEL_OFFLOADER_OK); +} + +int +kernel_offloader_raidz_rec(void *raidz, int *tgts, int ntgts) +{ + raidz_row_t *rr = (raidz_row_t *)unswizzle(raidz); + if (!rr) { + return (KERNEL_OFFLOADER_ERROR); + } + + vdev_raidz_reconstruct_general(rr, tgts, ntgts); + + return (KERNEL_OFFLOADER_OK); +} + +void * +kernel_offloader_file_open(const char *path, int flags, int mode) +{ + zfs_file_t *fp = NULL; + /* on error, fp should still be NULL */ + zfs_file_open(path, flags, mode, &fp); + return (swizzle(fp)); +} + +int +kernel_offloader_file_write(void *fp_handle, void *handle, size_t count, + size_t trailing_zeros, loff_t offset, ssize_t *resid, int *err) +{ + zfs_file_t *fp = (zfs_file_t *)unswizzle(fp_handle); + if (!fp) { + return (KERNEL_OFFLOADER_ERROR); + } + + koh_t *koh = (koh_t *)unswizzle(handle); + if (!koh) { + return (KERNEL_OFFLOADER_ERROR); + } + + if (!err) { + return (KERNEL_OFFLOADER_ERROR); + } + + *err = zfs_file_pwrite(fp, ptr_start(koh, 0), + count, offset, resid); + + if (*err == 0) { + void *zeros = kzalloc(trailing_zeros, GFP_KERNEL); + *err = zfs_file_pwrite(fp, zeros, + trailing_zeros, offset + count, resid); + kfree(zeros); + } + + return ((*err)?KERNEL_OFFLOADER_BAD_RESULT:KERNEL_OFFLOADER_OK); +} + +void +kernel_offloader_file_close(void *fp_handle) +{ + zfs_file_close(unswizzle(fp_handle)); +} + +void * +kernel_offloader_disk_open(dpusm_dd_t *disk_data) +{ + return (swizzle(disk_data->bdev)); +} + +int +kernel_offloader_disk_invalidate(void *disk_handle) +{ + struct block_device *bdev = + (struct block_device *)unswizzle(disk_handle); + invalidate_bdev(bdev); + return (DPUSM_OK); +} + +int +kernel_offloader_disk_write(void *disk_handle, void *handle, size_t data_size, + size_t trailing_zeros, uint64_t io_offset, int flags, + dpusm_disk_write_completion_t write_completion, void *wc_args) +{ + struct block_device *bdev = + (struct block_device *)unswizzle(disk_handle); + koh_t *koh = (koh_t *)unswizzle(handle); + + const size_t io_size = data_size + trailing_zeros; + + if (trailing_zeros) { + /* create a copy of the data with the trailing zeros attached */ + void *copy = kzalloc(io_size, GFP_KERNEL); + memcpy(copy, ptr_start(koh, 0), data_size); + + /* need to keep copy alive, so replace koh->ptr */ + if (koh->type == KOH_REAL) { + kfree(koh->ptr); + + atomic_sub(1, &active_count); + atomic_sub(koh->size, &active_size); + atomic_sub(koh->size, &active_actual); + } + + koh->type = KOH_REAL; + koh->ptr = copy; + koh->size = io_size; + + atomic_add(1, &total_count); + atomic_add(1, &active_count); + atomic_add(io_size, &total_size); + atomic_add(io_size, &active_size); + atomic_add(io_size, &total_actual); + atomic_add(io_size, &active_actual); + } + + abd_t *abd = abd_get_from_buf(koh->ptr, io_size); + abd_release_ownership_of_buf(abd); + zio_push_transform(wc_args, abd, io_size, io_size, NULL); + + /* __vdev_disk_physio already adds write_completion */ + (void) write_completion; + + return (__vdev_disk_physio(bdev, wc_args, + io_size, io_offset, WRITE, flags)); +} + +int +kernel_offloader_disk_flush(void *disk_handle, + dpusm_disk_flush_completion_t flush_completion, void *fc_args) +{ + struct block_device *bdev = + (struct block_device *)unswizzle(disk_handle); + + /* vdev_disk_io_flush already adds flush completion */ + (void) flush_completion; + + return (vdev_disk_io_flush(bdev, fc_args)); +} + +void +kernel_offloader_disk_close(void *disk_handle) +{} diff --git a/module/zia-software-provider/kernel_offloader.h 
b/module/zia-software-provider/kernel_offloader.h new file mode 100644 index 000000000000..e320bddbb28d --- /dev/null +++ b/module/zia-software-provider/kernel_offloader.h @@ -0,0 +1,149 @@ +/* + * © 2021. Triad National Security, LLC. All rights reserved. + * + * This program was produced under U.S. Government contract + * 89233218CNA000001 for Los Alamos National Laboratory (LANL), which + * is operated by Triad National Security, LLC for the U.S. + * Department of Energy/National Nuclear Security Administration. All + * rights in the program are reserved by Triad National Security, LLC, + * and the U.S. Department of Energy/National Nuclear Security + * Administration. The Government is granted for itself and others + * acting on its behalf a nonexclusive, paid-up, irrevocable worldwide + * license in this material to reproduce, prepare derivative works, + * distribute copies to the public, perform publicly and display + * publicly, and to permit others to do so. + * + * ---- + * + * This program is open source under the BSD-3 License. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef _KERNEL_OFFLOADER_H +#define _KERNEL_OFFLOADER_H + +#include +#include + +/* + * This file represents the API provided by a vendor to access their + * offloader. The API can be anything the implementor chooses to + * expose. There are no limitations on the function signature or + * name. They just have to be called correctly in the Z.I.A. provider. + * ZFS and Z.I.A. will not need direct access to any data located on + * the offloader. Some raw pointers from Z.I.A. will be used directly, + * but those will always contain information located in memory. + * + * ------------------------------------------------------------------- + * + * The kernel offloader fakes offloads by copying data into memory + * regions distinct from the calling process's memory space. The + * corresponding C file conflates the driver and the "physical" device + * since both memory spaces are in kernel space and run on the + * CPU. 
This offloader provides opaque pointers to the provider to + * simulate handles to inaccessible memory locations. In order to + * prevent the handle from being dereferenced and used successfully by + * ZFS or Z.I.A., the handle pointer is masked with a random value + * generated at load-time. Other offloaders may choose to present + * non-void handles. + */ + +/* return values */ +#define KERNEL_OFFLOADER_OK 0 + +/* function is implemented, but the chosen operation is not implemented */ +#define KERNEL_OFFLOADER_UNAVAILABLE 1 + +/* ran, but could not complete */ +#define KERNEL_OFFLOADER_ERROR 2 + +/* ran, but failed a check on a result */ +#define KERNEL_OFFLOADER_BAD_RESULT 3 + +/* + * init function - this should be the kernel module init, but + * kernel offloader is not compiled as a separate kernel module + */ +void kernel_offloader_init(void); +void kernel_offloader_fini(void); + +/* offloader handle access */ +void *kernel_offloader_alloc(size_t size); +void *kernel_offloader_alloc_ref(void *src, size_t offset, size_t size); +int kernel_offloader_get_size(void *handle, size_t *size, size_t *actual); +void kernel_offloader_free(void *handle); +int kernel_offloader_copy_from_mem(void *handle, size_t offset, + const void *src, size_t size); +int kernel_offloader_copy_to_mem(void *handle, size_t offset, + void *dst, size_t size); +/* status check */ +int kernel_offloader_mem_stats( + void *t_count_handle, void *t_size_handle, void *t_actual_handle, + void *a_count_handle, void *a_size_handle, void *a_actual_handle); +int kernel_offloader_cmp(void *lhs_handle, void *rhs_handle, int *diff); +int kernel_offloader_zero_fill(void *handle, size_t offset, size_t size); +int kernel_offloader_all_zeros(void *handle, size_t offset, size_t size); + +/* ZIO Pipeline Stages */ + +int kernel_offloader_compress(dpusm_compress_t alg, + void *src, void *dst, size_t s_len, int level, + void *c_len); + +int kernel_offloader_decompress(dpusm_decompress_t alg, + void *src, void *dst, int level); + +int kernel_offloader_checksum(dpusm_checksum_t alg, + dpusm_checksum_byteorder_t order, void *data, size_t size, + void *cksum, size_t cksum_size); + +void *kernel_offloader_raidz_alloc(size_t nparity, size_t ndata, + void **col_handles, size_t *col_sizes); +int kernel_offloader_raidz_set_col(void *raidz, uint64_t c, + void *col, size_t size); +void kernel_offloader_raidz_free(void *raidz); +int kernel_offloader_raidz_gen(void *raidz); +int kernel_offloader_raidz_rec(void *raidz, int *tgts, int ntgts); + +/* io */ +void *kernel_offloader_file_open(const char *path, int flags, int mode); +int kernel_offloader_file_write(void *fp_handle, void *handle, size_t count, + size_t trailing_zeros, loff_t offset, ssize_t *resid, int *err); +void kernel_offloader_file_close(void *fp_handle); + +void *kernel_offloader_disk_open(dpusm_dd_t *disk_data); +int kernel_offloader_disk_reread_part(void *disk_handle); +int kernel_offloader_disk_invalidate(void *disk_handle); +int kernel_offloader_disk_write(void *disk_handle, void *handle, + size_t data_size, size_t trailing_zeros, uint64_t io_offset, int flags, + dpusm_disk_write_completion_t write_completion, void *wc_args); +int kernel_offloader_disk_flush(void *disk_handle, + dpusm_disk_flush_completion_t flush_completion, void *fc_args); +void kernel_offloader_disk_close(void *disk_handle); + +#endif diff --git a/module/zia-software-provider/software.c b/module/zia-software-provider/software.c new file mode 100644 index 000000000000..b644b14e152e --- /dev/null +++ 
b/module/zia-software-provider/software.c @@ -0,0 +1,451 @@ +/* + * © 2021. Triad National Security, LLC. All rights reserved. + * + * This program was produced under U.S. Government contract + * 89233218CNA000001 for Los Alamos National Laboratory (LANL), which + * is operated by Triad National Security, LLC for the U.S. + * Department of Energy/National Nuclear Security Administration. All + * rights in the program are reserved by Triad National Security, LLC, + * and the U.S. Department of Energy/National Nuclear Security + * Administration. The Government is granted for itself and others + * acting on its behalf a nonexclusive, paid-up, irrevocable worldwide + * license in this material to reproduce, prepare derivative works, + * distribute copies to the public, perform publicly and display + * publicly, and to permit others to do so. + * + * ---- + * + * This program is open source under the BSD-3 License. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * This provider communicates with the "kernel offloader", which is + * actually just software running on the local kernel. + * + * Providers and offloaders are usually separate entities. However, to + * keep things simple, the kernel offloader is compiled into this + * provider. + * + * Providers run at the same location as ZFS. They are intended to be + * small shims that translate between the DPUSM provider API and an + * offloader's API (probably a header file analogous to + * kernel_offloader.h). + * + * The method used to communicate between the provider and offloader + * is not prescribed by the DPUSM. This allows for vendors to place + * their offloaders locally or remotely, and use whatever method they + * wish to use to communicate with their offloaders e.g. NVMeOF. The + * kernel offloader is local and the communication method to access + * the kernel offloader is calling local functions. + * + * Offloaders are normally expected to be hardware with its own memory + * space. 
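+ *
+ * (A note on the shim pattern used by the code below, quoted here
+ * for orientation: each sw_provider_*() callback forwards its
+ * arguments to the kernel offloader and maps the offloader's return
+ * code to a DPUSM return code via translate_rc(), e.g.
+ *
+ *     static int
+ *     sw_provider_zero_fill(void *handle, size_t offset, size_t size)
+ *     {
+ *             return (translate_rc(kernel_offloader_zero_fill(
+ *                 handle, offset, size)));
+ *     }
+ *
+ * as defined later in this file.)
+ *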
In order to simulate copying data to an offloader's memory + * space, the kernel offloader allocates new buffers and copies ZFS + * data into them, rather than using ZFS data directly. In order to + * simulate handles that the provider does not know how to manipulate + * or have access to, pointers returned from the kernel offloader are + * masked with a random value. + * + * Note that this provider has to be loaded after ZFS because it + * depends on ZFS for its "offload" functionality. + * + * Usage: + * Reconfigure ZFS with --with-zia= + * + * Create a zpool + * + * Select this provider with + * zpool set zia_provider=zia-software-provider + * + * Enable "offloading" of operations with + * zpool set zia_compress=on + * zpool set zia_checksum=on + * zpool set zia_raidz1_gen=on + * zpool set zia_raidz2_gen=on + * zpool set zia_raidz3_gen=on + * zpool set zia_raidz1_rec=on + * zpool set zia_raidz2_rec=on + * zpool set zia_raidz3_rec=on + * zpool set zia_file_write=on + * zpool set zia_disk_write=on + * + * Use the zpool as you would normally + * + * Notes: + * If a ZFS IO stage is not run, enabling a Z.I.A. offload + * will have no effect. + * + * Resilvering requires both zia_checksum and zia_raidz*_rec + * to be enabled. Not enabling checksums would cause offloaded + * resilvering to fail, and perform the remaining operations + * in memory. To avoid the cost of offloading data only to + * fail, a check has been inserted to prevent offloading + * altogether if zia_checksum is not enabled. + */ + +#include +#include +#include + +#include /* provides access to the offloader */ +#include /* the DPUSM provider API */ + +/* translate from offloader values to DPUSM values */ +static int +translate_rc(const int offloader_rc) +{ + int dpusm_rc = DPUSM_NOT_IMPLEMENTED; + switch (offloader_rc) { + case KERNEL_OFFLOADER_OK: + dpusm_rc = DPUSM_OK; + break; + case KERNEL_OFFLOADER_ERROR: + dpusm_rc = DPUSM_ERROR; + break; + case KERNEL_OFFLOADER_UNAVAILABLE: + dpusm_rc = DPUSM_NOT_IMPLEMENTED; + break; + case KERNEL_OFFLOADER_BAD_RESULT: + dpusm_rc = DPUSM_BAD_RESULT; + break; + default: + /* only translate recognized values */ + dpusm_rc = offloader_rc; + break; + } + return (dpusm_rc); +} + +static int +sw_provider_algorithms(int *compress, int *decompress, + int *checksum, int *checksum_byteorder, int *raid) +{ + *compress = + DPUSM_COMPRESS_GZIP_1 | + DPUSM_COMPRESS_GZIP_2 | + DPUSM_COMPRESS_GZIP_3 | + DPUSM_COMPRESS_GZIP_4 | + DPUSM_COMPRESS_GZIP_5 | + DPUSM_COMPRESS_GZIP_6 | + DPUSM_COMPRESS_GZIP_7 | + DPUSM_COMPRESS_GZIP_8 | + DPUSM_COMPRESS_GZIP_9; + + *decompress = 0; + + *checksum = DPUSM_CHECKSUM_FLETCHER_2 | DPUSM_CHECKSUM_FLETCHER_4; + + *checksum_byteorder = DPUSM_BYTEORDER_NATIVE | DPUSM_BYTEORDER_BYTESWAP; + + *raid = + DPUSM_RAID_1_GEN | + DPUSM_RAID_2_GEN | + DPUSM_RAID_3_GEN | + DPUSM_RAID_1_REC | + DPUSM_RAID_2_REC | + DPUSM_RAID_3_REC; + + return (DPUSM_OK); +} + +static int +sw_provider_get_size(void *handle, size_t *size, size_t *actual) +{ + return (translate_rc(kernel_offloader_get_size(handle, + size, actual))); +} + +static int +sw_provider_copy_from_mem(dpusm_mv_t *mv, const void *buf, size_t size) +{ + return (translate_rc(kernel_offloader_copy_from_mem(mv->handle, + mv->offset, buf, size))); +} + +static int +sw_provider_copy_to_mem(dpusm_mv_t *mv, void *buf, size_t size) +{ + return (translate_rc(kernel_offloader_copy_to_mem(mv->handle, + mv->offset, buf, size))); +} + +static int +sw_provider_mem_stats(size_t *t_count, size_t *t_size, size_t *t_actual, + size_t 
*a_count, size_t *a_size, size_t *a_actual) +{ + void *t_count_handle = NULL; + void *t_size_handle = NULL; + void *t_actual_handle = NULL; + void *a_size_handle = NULL; + void *a_count_handle = NULL; + void *a_actual_handle = NULL; + + if (t_count) { + t_count_handle = kernel_offloader_alloc(sizeof (size_t)); + } + + if (t_size) { + t_size_handle = kernel_offloader_alloc(sizeof (size_t)); + } + + if (t_actual) { + t_actual_handle = kernel_offloader_alloc(sizeof (size_t)); + } + + if (a_count) { + a_count_handle = kernel_offloader_alloc(sizeof (size_t)); + } + + if (a_size) { + a_size_handle = kernel_offloader_alloc(sizeof (size_t)); + } + + if (a_actual) { + a_actual_handle = kernel_offloader_alloc(sizeof (size_t)); + } + + const int rc = kernel_offloader_mem_stats(t_count, t_size, t_actual, + a_count, a_size, a_actual); + if (rc == KERNEL_OFFLOADER_OK) { + /* should probably check for errors */ + kernel_offloader_copy_to_mem(t_count_handle, 0, + t_count, sizeof (*t_count)); + kernel_offloader_copy_to_mem(t_size_handle, 0, + t_size, sizeof (*t_size)); + kernel_offloader_copy_to_mem(t_actual_handle, 0, + t_actual, sizeof (*t_actual)); + kernel_offloader_copy_to_mem(a_count_handle, 0, + a_count, sizeof (*a_count)); + kernel_offloader_copy_to_mem(a_size_handle, 0, + a_size, sizeof (*a_size)); + kernel_offloader_copy_to_mem(a_actual_handle, 0, + a_actual, sizeof (*a_actual)); + } + + kernel_offloader_free(t_size_handle); + kernel_offloader_free(t_count_handle); + kernel_offloader_free(t_actual_handle); + kernel_offloader_free(a_size_handle); + kernel_offloader_free(a_count_handle); + kernel_offloader_free(a_actual_handle); + + return (translate_rc(rc)); +} + +static int +sw_provider_zero_fill(void *handle, size_t offset, size_t size) +{ + return (translate_rc(kernel_offloader_zero_fill(handle, offset, size))); +} + +static int +sw_provider_all_zeros(void *handle, size_t offset, size_t size) +{ + return (translate_rc(kernel_offloader_all_zeros(handle, offset, size))); +} + +static int +sw_provider_compress(dpusm_compress_t alg, + void *src, void *dst, size_t s_len, int level, + size_t *d_len) +{ + /* buffer that offloader fills out */ + void *d_len_handle = kernel_offloader_alloc(sizeof (size_t)); + + const int kz_rc = kernel_offloader_compress(alg, src, dst, s_len, level, + d_len_handle); + if (kz_rc == KERNEL_OFFLOADER_OK) { + /* get d_len back from offloader */ + kernel_offloader_copy_to_mem(d_len_handle, 0, + d_len, sizeof (d_len)); + } + + kernel_offloader_free(d_len_handle); + + return (translate_rc(kz_rc)); +} + +static int +sw_provider_decompress(dpusm_compress_t alg, + void *src, void *dst, int level) +{ + return (translate_rc(kernel_offloader_decompress(alg, src, + dst, level))); +} + +static int +sw_provider_checksum(dpusm_checksum_t alg, + dpusm_checksum_byteorder_t order, void *data, size_t size, + void *cksum, size_t cksum_size) +{ + /* maybe translate alg and order */ + + /* trigger offloader to do actual calculation */ + return (translate_rc(kernel_offloader_checksum(alg, + order, data, size, cksum, cksum_size))); +} + +static int +sw_provider_raid_can_compute(size_t nparity, size_t ndata, + size_t *col_sizes, int rec) +{ + if ((nparity < 1) || (nparity > 3)) { + return (DPUSM_NOT_SUPPORTED); + } + + return (DPUSM_OK); +} + +static int +sw_provider_raid_gen(void *raid) +{ + return (translate_rc(kernel_offloader_raidz_gen(raid))); +} + +static int +sw_provider_raid_new_parity(void *raid, uint64_t raidn, + void **new_parity_cols, size_t *new_parity_sizes) +{ + for (uint64_t c = 
0; c < raidn; c++) { + /* not every column needs to be reconstructed */ + if (new_parity_sizes[c] == 0) { + continue; + } + + /* allocate a new buffer */ + void *handle = kernel_offloader_alloc(new_parity_sizes[c]); + if (!handle) { + return (DPUSM_ERROR); + } + + /* assign this buffer to column c */ + /* old column c is no longer associated with the raid data */ + if (kernel_offloader_raidz_set_col(raid, c, + handle, new_parity_sizes[c]) != KERNEL_OFFLOADER_OK) { + return (DPUSM_ERROR); + } + + /* send the handle back to DPUSM */ + new_parity_cols[c] = handle; + + /* + * leave assigned handles on error + * DPUSM will clean them up + */ + } + + return (DPUSM_OK); +} + +static int +sw_provider_raid_cmp(void *lhs_handle, void *rhs_handle, int *diff) +{ + return (translate_rc(kernel_offloader_cmp(lhs_handle, + rhs_handle, diff))); +} + +static int +sw_provider_raid_rec(void *raid, int *tgts, int ntgts) +{ + return (translate_rc(kernel_offloader_raidz_rec(raid, + tgts, ntgts))); +} + +static int +sw_provider_file_write(void *fp_handle, void *handle, size_t count, + size_t trailing_zeros, loff_t offset, ssize_t *resid, int *err) +{ + return (translate_rc(kernel_offloader_file_write(fp_handle, + handle, count, trailing_zeros, offset, resid, err))); +} + +/* BEGIN CSTYLED */ +static const char name[] = "zia-software-provider"; +static const dpusm_pf_t sw_provider_functions = { + .algorithms = sw_provider_algorithms, + .alloc = kernel_offloader_alloc, + .alloc_ref = kernel_offloader_alloc_ref, + .get_size = sw_provider_get_size, + .free = kernel_offloader_free, + .copy_from_mem = sw_provider_copy_from_mem, + .copy_to_mem = sw_provider_copy_to_mem, + .mem_stats = sw_provider_mem_stats, + .zero_fill = sw_provider_zero_fill, + .all_zeros = sw_provider_all_zeros, + .compress = sw_provider_compress, + .decompress = sw_provider_decompress, + .checksum = sw_provider_checksum, + .raid = { + .can_compute = sw_provider_raid_can_compute, + .alloc = kernel_offloader_raidz_alloc, + .free = kernel_offloader_raidz_free, + .gen = sw_provider_raid_gen, + .new_parity = sw_provider_raid_new_parity, + .cmp = sw_provider_raid_cmp, + .rec = sw_provider_raid_rec, + }, + .file = { + .open = kernel_offloader_file_open, + .write = sw_provider_file_write, + .close = kernel_offloader_file_close, + }, + .disk = { + .open = kernel_offloader_disk_open, + .invalidate = kernel_offloader_disk_invalidate, + .write = kernel_offloader_disk_write, + .flush = kernel_offloader_disk_flush, + .close = kernel_offloader_disk_close, + }, +}; +/* END CSTYLED */ + +static int __init +sw_provider_init(void) +{ + /* + * this should be a separate kernel module, + * but is here for simplicity + */ + kernel_offloader_init(); + + return (dpusm_register_bsd(name, &sw_provider_functions)); +} + +static void __exit +sw_provider_exit(void) +{ + dpusm_unregister_bsd(name); + + kernel_offloader_fini(); +} + +module_init(sw_provider_init); +module_exit(sw_provider_exit); + +MODULE_LICENSE("CDDL"); diff --git a/rpm/generic/zfs-kmod.spec.in b/rpm/generic/zfs-kmod.spec.in index ae0795427868..db993ddeb4a3 100644 --- a/rpm/generic/zfs-kmod.spec.in +++ b/rpm/generic/zfs-kmod.spec.in @@ -38,6 +38,7 @@ %bcond_with debug %bcond_with debuginfo +%bcond_with zia Name: %{module}-kmod @@ -123,6 +124,12 @@ bash %{SOURCE10} --target %{_target_cpu} %{?repo:--repo %{?repo}} --kmodname %{ %define debuginfo --disable-debuginfo %endif +%if %{with zia} + %define zia --with-zia="%{?DPUSM_ROOT}" +%else + %define zia --without-zia +%endif + # Leverage VPATH from configure to avoid 
making multiple copies. %define _configure ../%{module}-%{version}/configure @@ -143,7 +150,8 @@ for kernel_version in %{?kernel_versions}; do %{debuginfo} \ %{?kernel_cc} \ %{?kernel_ld} \ - %{?kernel_llvm} + %{?kernel_llvm} \ + %{zia} make %{?_smp_mflags} cd .. done diff --git a/rpm/generic/zfs.spec.in b/rpm/generic/zfs.spec.in index 493e93c1f3e6..af894b157caa 100644 --- a/rpm/generic/zfs.spec.in +++ b/rpm/generic/zfs.spec.in @@ -59,6 +59,7 @@ %bcond_with systemd %bcond_with pam %bcond_without pyzfs +%bcond_with zia # Generic enable switch for systemd %if %{with systemd} @@ -381,6 +382,12 @@ support for unlocking datasets on user login. %define pam --disable-pam %endif +%if %{with zia} + %define zia --with-zia="%{DPUSM_ROOT}" +%else + %define zia --without-zia +%endif + %setup -q %build @@ -400,7 +407,8 @@ support for unlocking datasets on user login. %{ubsan} \ %{systemd} \ %{pam} \ - %{pyzfs} + %{pyzfs} \ + %{zia} make %{?_smp_mflags} %install diff --git a/rpm/redhat/zfs-kmod.spec.in b/rpm/redhat/zfs-kmod.spec.in index f59551c0b43a..362c5578d034 100644 --- a/rpm/redhat/zfs-kmod.spec.in +++ b/rpm/redhat/zfs-kmod.spec.in @@ -59,6 +59,12 @@ fi %define debuginfo --disable-debuginfo %endif +%if %{with zia} +%define zia --with-zia="%{?DPUSM_ROOT}" +%else +%define zia --without-zia +%endif + %setup -n %{kmod_name}-%{version} %build %configure \ @@ -69,7 +75,8 @@ fi %{debuginfo} \ %{?kernel_cc} \ %{?kernel_ld} \ - %{?kernel_llvm} + %{?kernel_llvm} \ + %{zia} make %{?_smp_mflags} %install
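Taken together, the pieces above follow a single pattern: a provider is a small kernel module that registers a table of callbacks with the DPUSM, and ZFS selects it by name through the zia_provider pool property. The sketch below is illustrative only: it assumes the DPUSM provider API header path (written here as <dpusm/provider_api.h>, since the exact include is not visible above) and reuses the dpusm_pf_t, dpusm_register_bsd(), and dpusm_unregister_bsd() names that software.c itself uses; the callback fields are left unset where a real provider would wire them to its own offloader.

    #include <linux/init.h>
    #include <linux/module.h>

    /* assumed header path for the DPUSM provider API (not shown above) */
    #include <dpusm/provider_api.h>

    static const char example_name[] = "example-zia-provider";

    /*
     * Callback table: only a few members are shown; a real provider
     * fills these with shims into its own offloader, as the
     * zia-software-provider does with kernel_offloader_*().
     */
    static const dpusm_pf_t example_functions = {
            .alloc = NULL,
            .free = NULL,
            /* ... remaining dpusm_pf_t members ... */
    };

    static int __init
    example_provider_init(void)
    {
            /* makes "zpool set zia_provider=example-zia-provider" resolvable */
            return (dpusm_register_bsd(example_name, &example_functions));
    }

    static void __exit
    example_provider_exit(void)
    {
            dpusm_unregister_bsd(example_name);
    }

    module_init(example_provider_init);
    module_exit(example_provider_exit);

    MODULE_LICENSE("CDDL");

As with the in-tree software provider, such a module would be built against the same DPUSM tree that ZFS was configured with via --with-zia=PATH, and its callbacks must be in place before a pool enables any zia_* offload properties.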