-
Notifications
You must be signed in to change notification settings - Fork 1.8k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
ZFS Interface for Accelerators (Z.I.A.)
The ZIO write pipeline has been modified to allow for external, alternative implementations of operations to be used. The original ZFS functions remain in the code as fallback in case the external implementation fails. Definitions: Accelerator - an entity (usually hardware) that is intended to accelerate operations Offloader - synonym of accelerator; used interchangeably Data Processing Unit Services Module (DPUSM) - https://github.com/hpc/dpusm - defines a "provider API" for accelerator vendors to set up - defines a "user API" for accelerator consumers to call - maintains list of providers and coordinates interactions between providers and consumers. Provider - a DPUSM wrapper for an accelerator's API Offload - moving data from ZFS/memory to the accelerator Onload - the opposite of offload In order for Z.I.A. to be extensible, it does not directly communicate with a fixed accelerator. Rather, Z.I.A. acquires a handle to a DPUSM, which is then used to acquire handles to providers. Using ZFS with Z.I.A.: 1. Build and start the DPUSM 2. Implement, build, and register a provider with the DPUSM 3. Reconfigure ZFS with '--with-zia=<DPUSM root>' 4. Rebuild and start ZFS 5. Create a zpool 6. Select the provider zpool set zia_provider=<provider name> <zpool> 7. Select operations to offload zpool set zia_<property>=on <zpool> The functions that can be replaced with alternative operations are: - compression - data is offloaded and then compressed - metadata is compressed in-memory and then offloaded - decompression can be replaced, but the replacement function is not called anywhere - checksum - checksum compute and checksum error call the same function - raidz - generation - reconstruction - vdev_file - open - write - close - vdev_disk - open - invalidate - write - flush - close abd_t, raidz_row_t, and vdev_t have each been given an additional "void *<prefix>_zia_handle" member. These opaque handles point to data that is located on an offloader. 
abds are still allocated, but their contents are expected to diverge from the offloaded copy as operations are run. The modifications to ZFS can be thought of as two sets of changes: - The ZIO write pipeline - compression, checksum, RAIDZ generation, and write - Each stage starts by offloading data that was not previously offloaded - This allows for ZIOs to be offloaded at any point in these stages - Successful operations do not onload back into memory between stages - Errors cause data to be onloaded, or dropped if the copy in memory matches the offloaded copy - This might cause thrashing, but should not happen often, as the intention is for all of the stages to be offloaded, and thus not require onloading - Resilver - RAIDZ reconstruction, checksum, RAIDZ generation, and write - Because resilver is only one stage in the ZIO pipeline, data is only offloaded once at the beginning - Errors cause data to be onloaded, but will not re-offload in subsequent steps within resilver ARC compression is disabled when Z.I.A. is enabled Aggregation is disabled for offloaded abds RPMs will build with Z.I.A. Added example provider in module/zia-software-provider Signed-off-by: Jason Lee <[email protected]>
- Loading branch information
1 parent
74230a5
commit ec83b23
Showing
44 changed files
with
4,099 additions
and
13 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
dnl # Adds --with-zia=PATH to configuration options | ||
dnl # The path provided should point to the DPUSM | ||
dnl # root and contain Module.symvers. | ||
AC_DEFUN([ZFS_AC_ZIA], [
	dnl # --with-zia=PATH enables Z.I.A. and records PATH as the DPUSM root.
	dnl # --with-zia=no and --without-zia leave Z.I.A. disabled rather than
	dnl # treating the literal word no as a directory name.
	AC_ARG_WITH([zia],
		[AS_HELP_STRING([--with-zia=PATH],
		[Path to Data Processing Unit Services Module (DPUSM) root])],
		[
			AS_IF([test "x$withval" = "xno"],
				[enable_zia=no],
				[
					DPUSM_ROOT="$withval"
					enable_zia=yes
				]
			)
		]
	)

	dnl # Use the single = operator: it is the only string equality
	dnl # operator that every /bin/sh implementation of test accepts.
	AS_IF([test "x$enable_zia" = "xyes"], [
		AS_IF([test ! -d "$DPUSM_ROOT"],
			[AC_MSG_ERROR([--with-zia=PATH requires the DPUSM root directory])]
		)

		dnl # Module.symvers is expected directly under the DPUSM root.
		DPUSM_SYMBOLS="$DPUSM_ROOT/Module.symvers"

		AC_MSG_CHECKING([for DPUSM Module.symvers])
		dnl # Quote the path so a root containing spaces is tested as one word.
		AS_IF([test -r "$DPUSM_SYMBOLS"],
			[
				AC_MSG_RESULT([$DPUSM_SYMBOLS])

				dnl # Same flags for user space and kernel builds.
				ZIA_CPPFLAGS="-DZIA=1 -I$DPUSM_ROOT/include"
				KERNEL_ZIA_CPPFLAGS="-DZIA=1 -I$DPUSM_ROOT/include"
				WITH_ZIA="_with_zia"

				AC_SUBST(WITH_ZIA)
				AC_SUBST(KERNEL_ZIA_CPPFLAGS)
				AC_SUBST(ZIA_CPPFLAGS)
				AC_SUBST(DPUSM_SYMBOLS)
				AC_SUBST(DPUSM_ROOT)
			],
			[
				AC_MSG_RESULT([not found])
				AC_MSG_ERROR([
	*** Failed to find Module.symvers in:
	$DPUSM_SYMBOLS
				])
			]
		)
	])
])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,197 @@ | ||
/* | ||
* © 2021. Triad National Security, LLC. All rights reserved. | ||
* | ||
* This program was produced under U.S. Government contract | ||
* 89233218CNA000001 for Los Alamos National Laboratory (LANL), which | ||
* is operated by Triad National Security, LLC for the U.S. | ||
* Department of Energy/National Nuclear Security Administration. All | ||
* rights in the program are reserved by Triad National Security, LLC, | ||
* and the U.S. Department of Energy/National Nuclear Security | ||
* Administration. The Government is granted for itself and others | ||
* acting on its behalf a nonexclusive, paid-up, irrevocable worldwide | ||
* license in this material to reproduce, prepare derivative works, | ||
* distribute copies to the public, perform publicly and display | ||
* publicly, and to permit others to do so. | ||
* | ||
* ---- | ||
* | ||
* This program is open source under the BSD-3 License. | ||
* Redistribution and use in source and binary forms, with or without | ||
* modification, are permitted provided that the following conditions are met: | ||
* | ||
* 1. Redistributions of source code must retain the above copyright notice, | ||
* this list of conditions and the following disclaimer. | ||
* | ||
* 2. Redistributions in binary form must reproduce the above copyright notice, | ||
* this list of conditions and the following disclaimer in the documentation | ||
* and/or other materials provided with the distribution. | ||
* | ||
* 3. Neither the name of the copyright holder nor the names of its | ||
* contributors may be used to endorse or promote products derived from this | ||
* software without specific prior written permission. | ||
* | ||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | ||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE | ||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | ||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | ||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | ||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | ||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | ||
* POSSIBILITY OF SUCH DAMAGE. | ||
* | ||
*/ | ||
|
||
#ifdef ZIA | ||
|
||
#ifndef _ZIA_H | ||
#define _ZIA_H | ||
|
||
#include <sys/abd.h> | ||
#include <sys/fs/zfs.h> /* VDEV_RAIDZ_MAXPARITY */ | ||
#include <sys/spa.h> | ||
#include <sys/vdev.h> | ||
#include <sys/zio.h> | ||
#include <sys/zio_compress.h> | ||
#include <sys/zio_checksum.h> | ||
|
||
/*
 * Forward declaration: the prototypes in this header only refer to
 * raidz_row_t through pointers, so the full definition is not needed here.
 */
typedef struct raidz_row raidz_row_t; | ||
|
||
/* ******************************************************** */ | ||
/* return values */ | ||
/*
 * NOTE(review): presumably these are the status codes returned by the
 * int-returning zia_* functions declared in this header - confirm
 * against zia.c.
 */
/* NOTE(review): presumably the success code - confirm */
#define ZIA_OK 0 | ||
|
||
/* something bad happened not related to missing functionality */ | ||
#define ZIA_ERROR 1 | ||
|
||
/* error, fallback to zfs implementation */ | ||
#define ZIA_FALLBACK 2 | ||
|
||
/* ran, but result is bad */ | ||
#define ZIA_BAD_RESULT 3 | ||
|
||
/* expected provider and actual provider do not match */ | ||
#define ZIA_PROVIDER_MISMATCH 4 | ||
/* ******************************************************** */ | ||
|
||
/* | ||
* This struct is normally set with | ||
* zpool set zia_<property>=on/off/<value> | ||
* and passed around in spa_t. | ||
*/ | ||
typedef struct zia_props { | ||
/* NOTE(review): presumably the opaque handle returned by zia_get_provider() - confirm */
void *provider; | ||
|
||
/* minimum size allowed to offload - set by ashift */ | ||
size_t min_offload_size; | ||
|
||
/* per-operation settings; NOTE(review): presumably nonzero enables offloading of that operation - confirm */
int compress; | ||
int decompress; | ||
|
||
int checksum; | ||
|
||
/* VDEV_RAIDZ_MAXPARITY + 1 slots each; NOTE(review): presumably indexed by parity count - confirm */
struct { | ||
int gen[VDEV_RAIDZ_MAXPARITY + 1]; | ||
int rec[VDEV_RAIDZ_MAXPARITY + 1]; | ||
} raidz; | ||
|
||
/* NOTE(review): presumably the vdev_file and vdev_disk write paths - confirm */
int file_write; | ||
int disk_write; | ||
} zia_props_t; | ||
|
||
/* accessor: takes a pool (spa_t) and returns a pointer to a zia_props_t */
zia_props_t *zia_get_props(spa_t *spa); | ||
/* NOTE(review): purpose not visible in this header; presumably warns about the property "name" - confirm in zia.c */
void zia_prop_warn(boolean_t val, const char *name); | ||
|
||
/* setup and teardown; NOTE(review): what is initialized is not visible here - confirm in zia.c */
int zia_init(void); | ||
int zia_fini(void); | ||
|
||
/*
 * Providers are handled as opaque void pointers.
 * NOTE(review): presumably zia_get_provider() looks a provider up by name
 * and zia_put_provider() releases it through the caller's pointer - confirm.
 */
void *zia_get_provider(const char *name); | ||
const char *zia_get_provider_name(void *provider); | ||
int zia_put_provider(void **provider); | ||
|
||
/* check if offloading can occur */ | ||
boolean_t zia_is_used(zio_t *zio); | ||
|
||
/* | ||
* check if a handle is associated with this pointer | ||
* | ||
* not exposing functions for different handles because | ||
* only abd handles are checked outside of zia.c | ||
*/ | ||
boolean_t zia_is_offloaded(abd_t *abd); | ||
|
||
/* create a new offloader handle without copying data */ | ||
void *zia_alloc(void *provider, size_t size, size_t min_offload_size); | ||
|
||
/* deallocate handle without onloading */ | ||
void zia_free(void **handle); | ||
|
||
/* move linear data from the offloader to memory */ | ||
int zia_onload(void **handle, void *buf, size_t size); | ||
|
||
/* calls abd_iterate_func on the abd to copy abd data back and forth */ | ||
/*
 * NOTE(review): local_offload is a pointer, so it is presumably an output
 * reporting whether this call performed the offload; the value appears to
 * be what zia_cleanup_abd() later takes - confirm in zia.c.
 */
int zia_offload_abd(void *provider, abd_t *abd, | ||
size_t size, size_t min_offload_size, boolean_t *local_offload); | ||
int zia_onload_abd(abd_t *abd, size_t size, boolean_t keep_handle); | ||
/* move a handle into an abd */ | ||
void zia_move_into_abd(abd_t *dst, void **src); | ||
/* NOTE(review): the meaning of "lock" is not visible in this header - confirm in zia.c */
int zia_free_abd(abd_t *abd, boolean_t lock); | ||
|
||
/* | ||
* if offloaded locally, just free the handle | ||
* if not, onload the data and free the handle | ||
*/ | ||
int zia_cleanup_abd(abd_t *abd, size_t size, boolean_t local_offload); | ||
|
||
/* fill a buffer with zeros */ | ||
int zia_zero_fill(abd_t *abd, size_t offset, size_t size); | ||
|
||
/*
 * compression
 * NOTE(review): cbuf_handle, c_len and local_offload are pointers and are
 * presumably outputs (compressed-buffer handle, compressed length, and
 * whether this call offloaded the data) - confirm in zia.c.
 */
int zia_compress(void *provider, zio_t *zio, size_t s_len, | ||
enum zio_compress c, uint8_t level, void **cbuf_handle, | ||
uint64_t *c_len, boolean_t *local_offload); | ||
|
||
/*
 * checksum
 * The commit description states that checksum compute and checksum error
 * call the same function.
 */
int zia_checksum_compute(void *provider, zio_cksum_t *dst, | ||
enum zio_checksum alg, zio_t *zio, uint64_t size, | ||
boolean_t *local_offload); | ||
int zia_checksum_error(const blkptr_t *bp, enum zio_checksum alg, | ||
abd_t *abd, uint64_t size, zio_bad_cksum_t *info); | ||
|
||
/* raidz */ | ||
/* NOTE(review): "rec" presumably selects reconstruction rather than generation - confirm */
int zia_raidz_alloc(zio_t *zio, raidz_row_t *rr, boolean_t rec, | ||
uint_t cksum, boolean_t *local_offload); | ||
int zia_raidz_free(raidz_row_t *rr, boolean_t onload_parity); | ||
int zia_raidz_gen(raidz_row_t *rr); | ||
int zia_raidz_gen_cleanup(zio_t *zio, raidz_row_t *rr, | ||
boolean_t local_offload); | ||
int zia_raidz_new_parity(zio_t *zio, raidz_row_t *rr, abd_t **orig); | ||
/* compare the contents of offloaded abds (only used in resilver) */ | ||
int zia_raidz_cmp(abd_t *lhs, abd_t *rhs, int *diff); | ||
/* NOTE(review): "t" and "nt" are presumably an array of target columns and its length - confirm */
int zia_raidz_rec(raidz_row_t *rr, int *t, int nt); | ||
int zia_raidz_rec_cleanup(zio_t *zio, raidz_row_t *rr, | ||
boolean_t local_offload, boolean_t onload_parity); | ||
|
||
/* file I/O */ | ||
/* open, write and close are the vdev_file operations the commit description lists as replaceable */
int zia_file_open(vdev_t *vdev, const char *path, | ||
int flags, int mode); | ||
/* NOTE(review): resid and err are pointers and are presumably outputs - confirm in zia.c */
int zia_file_write(vdev_t *vdev, abd_t *abd, ssize_t size, | ||
loff_t offset, ssize_t *resid, int *err); | ||
int zia_file_close(vdev_t *vdev); | ||
|
||
/* the disk prototypes take a struct block_device, so they are only declared for kernel builds */
#ifdef _KERNEL | ||
#include <linux/blkdev.h> | ||
|
||
/* disk I/O */ | ||
/* open, invalidate, write, flush and close are the vdev_disk operations the commit description lists as replaceable */
int zia_disk_open(vdev_t *vdev, const char *path, | ||
struct block_device *bdev); | ||
int zia_disk_invalidate(vdev_t *vdev); | ||
int zia_disk_write(vdev_t *vdev, zio_t *zio, | ||
size_t io_size, uint64_t io_offset, int flags); | ||
int zia_disk_flush(vdev_t *vdev, zio_t *zio); | ||
int zia_disk_close(vdev_t *vdev); | ||
#endif /* _KERNEL */ | ||
|
||
#endif /* _ZIA_H */ | ||
|
||
#endif /* ZIA */ |
Oops, something went wrong.