From 64d51381a47950e86b15146fbd8bec6f508b42f1 Mon Sep 17 00:00:00 2001 From: Don Brady Date: Mon, 6 Jan 2025 21:26:14 +0000 Subject: [PATCH 1/2] Detect a slow raidz child during reads A single slow-responding disk can affect the overall read performance of a raidz group. When a raidz child disk is determined to be a persistent slow outlier, it sits out during reads for a period of time. The raidz group can use parity to reconstruct the data that was skipped. Each time a slow disk is placed into a sit out period, its `vdev_stat.vs_slow_ios` count is incremented and a zevent class `ereport.fs.zfs.delay` is posted. The length of the sit out period can be changed using the `raidz_read_sit_out_secs` module parameter. Setting it to zero disables slow outlier detection. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Signed-off-by: Don Brady --- include/sys/fs/zfs.h | 1 + include/sys/vdev_impl.h | 4 + include/sys/vdev_raidz.h | 1 + include/sys/vdev_raidz_impl.h | 2 + lib/libzfs/libzfs.abi | 3 +- lib/libzfs/libzfs_pool.c | 2 + man/man4/zfs.4 | 12 ++ man/man7/vdevprops.7 | 7 + module/zcommon/zpool_prop.c | 3 + module/zfs/vdev.c | 15 ++ module/zfs/vdev_draid.c | 23 +++ module/zfs/vdev_raidz.c | 273 ++++++++++++++++++++++++++++++++++ 12 files changed, 345 insertions(+), 1 deletion(-) diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index dc474e3739f3..547cfb72d1c4 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -379,6 +379,7 @@ typedef enum { VDEV_PROP_TRIM_SUPPORT, VDEV_PROP_TRIM_ERRORS, VDEV_PROP_SLOW_IOS, + VDEV_PROP_SIT_OUT_READS, VDEV_NUM_PROPS } vdev_prop_t; diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index abd66b8abc96..a9310f16fffb 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -285,6 +285,7 @@ struct vdev { boolean_t vdev_ishole; /* is a hole in the namespace */ uint64_t vdev_top_zap; vdev_alloc_bias_t vdev_alloc_bias; /* metaslab allocation bias */ + uint64_t vdev_last_latency_check; /* pool checkpoint related */ space_map_t *vdev_checkpoint_sm; /* contains reserved blocks */ @@ -432,6 +433,9 @@ struct vdev { hrtime_t vdev_mmp_pending; /* 0 if write finished */ uint64_t vdev_mmp_kstat_id; /* to find kstat entry */ uint64_t vdev_expansion_time; /* vdev's last expansion time */ + uint64_t vdev_outlier_count; /* read outlier amongst peers */ + uint64_t vdev_ewma_latency; /* moving average read latency */ + hrtime_t vdev_read_sit_out_expire; /* end of sit out period */ list_node_t vdev_leaf_node; /* leaf vdev list */ /* diff --git a/include/sys/vdev_raidz.h b/include/sys/vdev_raidz.h index 64f484e9aa13..25f1bee72a27 100644 --- a/include/sys/vdev_raidz.h +++ b/include/sys/vdev_raidz.h @@ -60,6 +60,7 @@ void vdev_raidz_checksum_error(zio_t *, struct raidz_col *, abd_t *); struct raidz_row *vdev_raidz_row_alloc(int, zio_t *); void vdev_raidz_reflow_copy_scratch(spa_t *); void raidz_dtl_reassessed(vdev_t *); +boolean_t vdev_sit_out_reads(vdev_t *, zio_flag_t); extern const zio_vsd_ops_t vdev_raidz_vsd_ops; diff --git a/include/sys/vdev_raidz_impl.h b/include/sys/vdev_raidz_impl.h index 45cb5864a22b..07f8e560c747 100644 --- a/include/sys/vdev_raidz_impl.h +++ b/include/sys/vdev_raidz_impl.h @@ -118,6 +118,7 @@ typedef struct raidz_col { uint8_t rc_need_orig_restore:1; /* need to restore from orig_data? 
*/ uint8_t rc_force_repair:1; /* Write good data to this column */ uint8_t rc_allow_repair:1; /* Allow repair I/O to this column */ + uint8_t rc_latency_outlier:1; /* Latency outlier for this device */ int rc_shadow_devidx; /* for double write during expansion */ int rc_shadow_error; /* for double write during expansion */ uint64_t rc_shadow_offset; /* for double write during expansion */ @@ -132,6 +133,7 @@ typedef struct raidz_row { int rr_firstdatacol; /* First data column/parity count */ abd_t *rr_abd_empty; /* dRAID empty sector buffer */ int rr_nempty; /* empty sectors included in parity */ + int rr_outlier_cnt; /* Count of latency outlier devices */ #ifdef ZFS_DEBUG uint64_t rr_offset; /* Logical offset for *_io_verify() */ uint64_t rr_size; /* Physical size for *_io_verify() */ diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 1f9fde6677d8..f2aef8754460 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -5917,7 +5917,8 @@ - + + diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 64f9d1f6eb49..ec30d2a6ddd3 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -5478,6 +5478,8 @@ zpool_get_vdev_prop_value(nvlist_t *nvprop, vdev_prop_t prop, char *prop_name, /* Only use if provided by the RAIDZ VDEV above */ if (prop == VDEV_PROP_RAIDZ_EXPANDING) return (ENOENT); + if (prop == VDEV_PROP_SIT_OUT_READS) + return (ENOENT); } if (vdev_prop_index_to_string(prop, intval, (const char **)&strval) != 0) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 7078a5ba8373..c61303d28c91 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -501,6 +501,18 @@ For testing, pause RAID-Z expansion when reflow amount reaches this value. .It Sy raidz_io_aggregate_rows Ns = Ns Sy 4 Pq ulong For expanded RAID-Z, aggregate reads that have more rows than this. . +.It Sy raidz_read_sit_out_secs Ns = Ns Sy 600 Ns s Po 10 min Pc Pq ulong +When a slow disk outlier is detected, it is placed in a sit out state. +While sitting out, the disk will not participate in normal reads; instead, its +data will be reconstructed as needed from parity. +Resilver and scrub operations will always read from a disk, even if it's +sitting out. +Only a single disk in a RAID-Z or dRAID vdev may sit out at the same time. +Writes will still be issued to a disk which is sitting out to maintain full +redundancy. +Defaults to 600 seconds; a value of zero disables slow disk outlier +detection. +. .It Sy reference_history Ns = Ns Sy 3 Pq int Maximum reference holders being tracked when reference_tracking_enable is active. diff --git a/man/man7/vdevprops.7 b/man/man7/vdevprops.7 index 34d4026b1009..844864518c1e 100644 --- a/man/man7/vdevprops.7 +++ b/man/man7/vdevprops.7 @@ -104,12 +104,19 @@ Comma separated list of children of this vdev The number of children belonging to this vdev .It Sy read_errors , write_errors , checksum_errors , initialize_errors , trim_errors The number of errors of each type encountered by this vdev +.It Sy sit_out_reads +True when a slow disk outlier was detected and the vdev is currently in a sit +out state. +While sitting out, the vdev will not participate in normal reads; instead, its +data will be reconstructed as needed from parity. .It Sy slow_ios The number of slow I/Os encountered by this vdev, These represent I/O operations that didn't complete in .Sy zio_slow_io_ms milliseconds .Pq Sy 30000 No by default . +Can also be incremented when a vdev was determined to be a raidz leaf latency +outlier. 
.It Sy null_ops , read_ops , write_ops , free_ops , claim_ops , trim_ops The number of I/O operations of each type performed by this vdev .It Xo diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c index ea9eda4b316d..461fc7faefe2 100644 --- a/module/zcommon/zpool_prop.c +++ b/module/zcommon/zpool_prop.c @@ -466,6 +466,9 @@ vdev_prop_init(void) zprop_register_index(VDEV_PROP_RAIDZ_EXPANDING, "raidz_expanding", 0, PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "RAIDZ_EXPANDING", boolean_table, sfeatures); + zprop_register_index(VDEV_PROP_SIT_OUT_READS, "sit_out_reads", 0, + PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "SIT_OUT_READS", + boolean_table, sfeatures); zprop_register_index(VDEV_PROP_TRIM_SUPPORT, "trim_support", 0, PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "TRIMSUP", boolean_table, sfeatures); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 5df2f77e5780..003caceb3328 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -4521,6 +4521,8 @@ vdev_clear(spa_t *spa, vdev_t *vd) vd->vdev_stat.vs_checksum_errors = 0; vd->vdev_stat.vs_dio_verify_errors = 0; vd->vdev_stat.vs_slow_ios = 0; + atomic_store_64(&vd->vdev_outlier_count, 0); + vd->vdev_read_sit_out_expire = 0; for (int c = 0; c < vd->vdev_children; c++) vdev_clear(spa, vd->vdev_child[c]); @@ -6361,6 +6363,19 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) ZPROP_SRC_NONE); } continue; + case VDEV_PROP_SIT_OUT_READS: + /* Only expose this for a draid or raidz leaf */ + if (vd->vdev_ops->vdev_op_leaf && + vd->vdev_top != NULL && + (vd->vdev_top->vdev_ops == + &vdev_raidz_ops || + vd->vdev_top->vdev_ops == + &vdev_draid_ops)) { + vdev_prop_add_list(outnvl, propname, + NULL, vdev_sit_out_reads(vd, 0), + ZPROP_SRC_NONE); + } + continue; case VDEV_PROP_TRIM_SUPPORT: /* only valid for leaf vdevs */ if (vd->vdev_ops->vdev_op_leaf) { diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c index 419c8ac5bb28..7125612ae205 100644 --- a/module/zfs/vdev_draid.c +++ b/module/zfs/vdev_draid.c @@ -1993,6 +1993,29 @@ vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr) rc->rc_force_repair = 1; rc->rc_allow_repair = 1; } + } else if (vdev_sit_out_reads(cvd, zio->io_flags)) { + rr->rr_outlier_cnt++; + rc->rc_latency_outlier = 1; + } + } + + /* + * When the row contains a latency outlier and sufficient parity + * exists to reconstruct the column data, then skip reading the + * known slow child vdev as a performance optimization. + */ + if (rr->rr_outlier_cnt > 0 && rr->rr_missingdata == 0 && + (rr->rr_firstdatacol - rr->rr_missingparity) > 0) { + + for (int c = rr->rr_cols - 1; c >= rr->rr_firstdatacol; c--) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_latency_outlier) { + rr->rr_missingdata++; + rc->rc_error = SET_ERROR(EAGAIN); + rc->rc_skipped = 1; + break; + } } } diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 6103f780e6bc..801707042a5c 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -354,6 +354,13 @@ unsigned long raidz_expand_max_reflow_bytes = 0; */ uint_t raidz_expand_pause_point = 0; +/* + * This represents the duration for a slow drive read sit out. + */ +static unsigned long raidz_read_sit_out_secs = 600; + +static hrtime_t raid_outlier_check_interval_ms = 20; + /* * Maximum amount of copy io's outstanding at once. */ @@ -2281,6 +2288,45 @@ vdev_raidz_min_asize(vdev_t *vd) vd->vdev_children); } +/* + * return B_TRUE if a read should be skipped due to being too slow. 
* + * vdev_child_slow_outlier() looks for outliers based on disk + * latency from the most recent child reads. Here we're checking if, + * over time, a disk has been an outlier too many times and is + * now in a sit out period. + */ +boolean_t +vdev_sit_out_reads(vdev_t *vd, zio_flag_t io_flags) +{ + if (raidz_read_sit_out_secs == 0) + return (B_FALSE); + + /* Avoid skipping a data column read when resilvering */ + if (io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) + return (B_FALSE); + + return (vd->vdev_read_sit_out_expire >= gethrtime()); +} + +/* + * Calculate the Exponentially Weighted Moving Average (EWMA) + * where + * alpha: the smoothing factor -- represented here as a scaled integer + * scale: the number of bits used to scale alpha + */ +static uint64_t +calculate_ewma(uint64_t previous_ewma, uint64_t latest_value) { + /* + * Scale using 16 bits with an effective alpha of 0.50 + */ + const uint64_t scale = 16; + const uint64_t alpha = 32768; + + return (((alpha * latest_value) + (((1ULL << scale) - alpha) * + previous_ewma)) >> scale); +} + void vdev_raidz_child_done(zio_t *zio) { @@ -2290,6 +2336,23 @@ vdev_raidz_child_done(zio_t *zio) rc->rc_error = zio->io_error; rc->rc_tried = 1; rc->rc_skipped = 0; + + /* + * Process the disk I/O latency before it goes out of scope. + * + * A zio->io_delay value of zero means this IO was part of + * an aggregation. + */ + if (raidz_read_sit_out_secs != 0 && zio->io_type == ZIO_TYPE_READ && + zio->io_error == 0 && zio->io_size > 0 && zio->io_delay != 0) { + vdev_t *vd = zio->io_vd; + uint64_t previous_ewma = atomic_load_64(&vd->vdev_ewma_latency); + if (previous_ewma == 0) + previous_ewma = zio->io_delay; + + atomic_store_64(&vd->vdev_ewma_latency, + calculate_ewma(previous_ewma, zio->io_delay)); + } } static void @@ -2445,6 +2508,40 @@ vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity) rc->rc_skipped = 1; continue; } + + if (vdev_sit_out_reads(cvd, zio->io_flags)) { + rr->rr_outlier_cnt++; + rc->rc_latency_outlier = 1; + } + } + + /* + * When the row contains a latency outlier and sufficient parity + * exists to reconstruct the column data, then skip reading the + * known slow child vdev as a performance optimization. 
*/ + if (rr->rr_outlier_cnt > 0 && rr->rr_missingdata == 0 && + (rr->rr_firstdatacol - rr->rr_missingparity) > 0) { + + for (int c = rr->rr_cols - 1; c >= rr->rr_firstdatacol; c--) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_latency_outlier) { + rr->rr_missingdata++; + rc->rc_error = SET_ERROR(EAGAIN); + rc->rc_skipped = 1; + break; + } + } + } + + for (int c = rr->rr_cols - 1; c >= 0; c--) { + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + + if (rc->rc_error || rc->rc_size == 0) + continue; + if (forceparity || c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { @@ -2468,6 +2565,7 @@ vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm) ASSERT3U(prc->rc_devidx, ==, i); vdev_t *cvd = vd->vdev_child[i]; + if (!vdev_readable(cvd)) { prc->rc_error = SET_ERROR(ENXIO); prc->rc_tried = 1; /* don't even try */ @@ -2744,6 +2842,175 @@ vdev_raidz_worst_error(raidz_row_t *rr) return (error); } +/* + * Find the median value from a set of n values + */ +static uint64_t +latency_median_value(const uint64_t *data, size_t n) +{ + uint64_t m; + + if (n % 2 == 0) + m = (data[(n>>1) - 1] + data[n>>1]) >> 1; + else + m = data[((n + 1) >> 1) - 1]; + + return (m); } + +/* + * Calculate the outlier fence from a set of n latency values + * + * fence = Q3 + 2 x (Q3 - Q1) + */ +static uint64_t +latency_quartiles_fence(const uint64_t *data, size_t n) +{ + uint64_t q1, q3; + + q1 = latency_median_value(&data[0], n>>1); + if (n % 2 == 0) + q3 = latency_median_value(&data[n>>1], n>>1); + else + q3 = latency_median_value(&data[(n+1) >> 1], n>>1); + + uint64_t iqr = q3 - q1; + uint64_t fence = q3 + iqr; + + return (fence); +} + +#define LAT_SAMPLES_STACK 64 +#define LAT_SAMPLES_MIN 5 +#define LAT_OUTLIER_LIMIT 50 + +static int +latency_compare(const void *arg1, const void *arg2) +{ + const uint64_t *l1 = (uint64_t *)arg1; + const uint64_t *l2 = (uint64_t *)arg2; + + return (TREE_CMP(*l1, *l2)); +} + +/* + * Check for any latency outlier from latest set of child reads. + * + * Uses a Tukey fence, with K = 2, to detect extreme outliers. This + * rule defines an extreme outlier as a data point outside the fence of the + * third quartile plus two times the Interquartile Range (IQR). The IQR + * is the distance between the first and third quartiles. 
*/ +static void +vdev_child_slow_outlier(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + if (raidz_read_sit_out_secs == 0 || vd->vdev_children < LAT_SAMPLES_MIN) + return; + + spa_t *spa = zio->io_spa; + if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT || + spa_load_state(spa) == SPA_LOAD_RECOVER || + (spa_load_state(spa) != SPA_LOAD_NONE && + spa->spa_last_open_failed)) { + return; + } + + hrtime_t now = gethrtime(); + uint64_t last = atomic_load_64(&vd->vdev_last_latency_check); + + if ((now - last) < MSEC2NSEC(raid_outlier_check_interval_ms) || + atomic_cas_64(&vd->vdev_last_latency_check, last, now) != last) { + return; + } + + int samples = vd->vdev_children; + uint64_t data[LAT_SAMPLES_STACK]; + uint64_t *lat_data; + + if (samples > LAT_SAMPLES_STACK) + lat_data = kmem_alloc(sizeof (uint64_t) * samples, KM_SLEEP); + else + lat_data = &data[0]; + + uint64_t max = 0; + uint64_t max_outlier_count = 0; + vdev_t *svd = NULL; /* suspect vdev */ + vdev_t *ovd = NULL; /* largest outlier vdev */ + for (int c = 0; c < samples; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (cvd->vdev_read_sit_out_expire != 0) { + if (cvd->vdev_read_sit_out_expire < gethrtime()) { + /* + * Done with our sit out, wait for new outlier + * to emerge. + */ + cvd->vdev_read_sit_out_expire = 0; + } else { + atomic_store_64(&cvd->vdev_ewma_latency, 0); + /* Only one sit out disk at a time for now */ + goto out; + } + } + + lat_data[c] = atomic_load_64(&cvd->vdev_ewma_latency); + + /* wait until all disks have been read from */ + if (lat_data[c] == 0) + goto out; + + /* keep track of the vdev with largest value */ + if (lat_data[c] > max) { + max = lat_data[c]; + svd = cvd; + } + + uint64_t count = atomic_load_64(&cvd->vdev_outlier_count); + if (count > max_outlier_count) { + max_outlier_count = count; + ovd = cvd; + } + } + + qsort((void *)lat_data, samples, sizeof (uint64_t), latency_compare); + uint64_t fence = latency_quartiles_fence(lat_data, samples); + if (lat_data[samples - 1] > fence) { + /* + * Keep track of how many times this child has had + * an outlier read. A disk that persistently has a + * higher outlier count than its peers will be considered + * a slow disk. 
*/ + if (atomic_add_64_nv(&svd->vdev_outlier_count, 1) > + LAT_OUTLIER_LIMIT && svd == ovd && + svd->vdev_read_sit_out_expire == 0) { + /* + * Begin a sit out period for this slow drive + */ + svd->vdev_read_sit_out_expire = gethrtime() + + SEC2NSEC(raidz_read_sit_out_secs); + + /* count each slow io period */ + mutex_enter(&svd->vdev_stat_lock); + svd->vdev_stat.vs_slow_ios++; + mutex_exit(&svd->vdev_stat_lock); + + (void) zfs_ereport_post(FM_EREPORT_ZFS_DELAY, spa, svd, + NULL, NULL, 0); + vdev_dbgmsg(svd, "begin read sit out for %d secs", + (int)raidz_read_sit_out_secs); + for (int c = 0; c < vd->vdev_children; c++) { + atomic_store_64( + &vd->vdev_child[c]->vdev_outlier_count, 0); + } + } + } +out: + if (samples > LAT_SAMPLES_STACK) + kmem_free(lat_data, sizeof (uint64_t) * samples); +} + static void vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) { @@ -3485,6 +3752,9 @@ vdev_raidz_io_done(zio_t *zio) raidz_row_t *rr = rm->rm_row[i]; vdev_raidz_io_done_verified(zio, rr); } + /* Periodically check for a read outlier */ + if (zio->io_type == ZIO_TYPE_READ) + vdev_child_slow_outlier(zio); zio_checksum_verified(zio); } else { /* @@ -5120,3 +5390,6 @@ ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW, "For expanded RAIDZ, automatically start a pool scrub when expansion " "completes"); +ZFS_MODULE_PARAM(zfs_vdev, raidz_, read_sit_out_secs, ULONG, ZMOD_RW, + "Raidz/draid slow disk sit out time period in seconds"); +/* END CSTYLED */ From fd1252a8f5950b12ebcbac638e95942ec41a184b Mon Sep 17 00:00:00 2001 From: Don Brady Date: Thu, 9 Jan 2025 20:59:47 +0000 Subject: [PATCH 2/2] More changes from recent feedback Also added a test from Tony. Signed-off-by: Don Brady --- include/sys/fs/zfs.h | 2 +- lib/libzfs/libzfs.abi | 2 +- lib/libzfs/libzfs_pool.c | 2 +- man/man7/vdevprops.7 | 16 ++-- module/zcommon/zpool_prop.c | 6 +- module/zfs/vdev.c | 2 +- module/zfs/vdev_raidz.c | 60 +++++------- tests/runfiles/common.run | 4 + tests/zfs-tests/include/libtest.shlib | 10 ++ tests/zfs-tests/include/tunables.cfg | 1 + tests/zfs-tests/tests/Makefile.am | 1 + .../functional/events/slow_vdev_sit_out.ksh | 93 +++++++++++++++++++ 12 files changed, 149 insertions(+), 50 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/events/slow_vdev_sit_out.ksh diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 547cfb72d1c4..102ffef016c5 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -379,7 +379,7 @@ typedef enum { VDEV_PROP_TRIM_SUPPORT, VDEV_PROP_TRIM_ERRORS, VDEV_PROP_SLOW_IOS, - VDEV_PROP_SIT_OUT_READS, + VDEV_PROP_SIT_OUT, VDEV_NUM_PROPS } vdev_prop_t; diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index f2aef8754460..4f5dd1c983fc 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -5917,7 +5917,7 @@ - + diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index ec30d2a6ddd3..dc0f0c53730c 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -5478,7 +5478,7 @@ zpool_get_vdev_prop_value(nvlist_t *nvprop, vdev_prop_t prop, char *prop_name, /* Only use if provided by the RAIDZ VDEV above */ if (prop == VDEV_PROP_RAIDZ_EXPANDING) return (ENOENT); - if (prop == VDEV_PROP_SIT_OUT_READS) + if (prop == VDEV_PROP_SIT_OUT) return (ENOENT); } if (vdev_prop_index_to_string(prop, intval, diff --git a/man/man7/vdevprops.7 b/man/man7/vdevprops.7 index 844864518c1e..229715c35d92 100644 --- a/man/man7/vdevprops.7 +++ b/man/man7/vdevprops.7 @@ -104,19 
+104,23 @@ Comma separated list of children of this vdev The number of children belonging to this vdev .It Sy read_errors , write_errors , checksum_errors , initialize_errors , trim_errors The number of errors of each type encountered by this vdev -.It Sy sit_out_reads +.It Sy sit_out True when a slow disk outlier was detected and the vdev is currently in a sit out state. While sitting out, the vdev will not participate in normal reads; instead, its data will be reconstructed as needed from parity. .It Sy slow_ios -The number of slow I/Os encountered by this vdev, -These represent I/O operations that didn't complete in +This indicates the number of slow I/O operations encountered by this vdev. +A slow I/O is defined as an operation that did not complete within the .Sy zio_slow_io_ms -milliseconds +threshold in milliseconds .Pq Sy 30000 No by default . -Can also be incremented when a vdev was determined to be a raidz leaf latency -outlier. +For +.Sy RAIDZ +and +.Sy DRAID +configurations, this value also represents the number of times the vdev was +identified as an outlier and excluded from participating in read I/O operations. .It Sy null_ops , read_ops , write_ops , free_ops , claim_ops , trim_ops The number of I/O operations of each type performed by this vdev .It Xo diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c index 461fc7faefe2..ef932ec4a0f6 100644 --- a/module/zcommon/zpool_prop.c +++ b/module/zcommon/zpool_prop.c @@ -466,9 +466,9 @@ vdev_prop_init(void) zprop_register_index(VDEV_PROP_RAIDZ_EXPANDING, "raidz_expanding", 0, PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "RAIDZ_EXPANDING", boolean_table, sfeatures); - zprop_register_index(VDEV_PROP_SIT_OUT_READS, "sit_out_reads", 0, - PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "SIT_OUT_READS", - boolean_table, sfeatures); + zprop_register_index(VDEV_PROP_SIT_OUT, "sit_out", 0, + PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "SIT_OUT", boolean_table, + sfeatures); zprop_register_index(VDEV_PROP_TRIM_SUPPORT, "trim_support", 0, PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "TRIMSUP", boolean_table, sfeatures); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 003caceb3328..f03200cdf86c 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -6363,7 +6363,7 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) ZPROP_SRC_NONE); } continue; - case VDEV_PROP_SIT_OUT_READS: + case VDEV_PROP_SIT_OUT: /* Only expose this for a draid or raidz leaf */ if (vd->vdev_ops->vdev_op_leaf && vd->vdev_top != NULL && diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 801707042a5c..599e68360833 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -2302,11 +2302,11 @@ vdev_sit_out_reads(vdev_t *vd, zio_flag_t io_flags) if (raidz_read_sit_out_secs == 0) return (B_FALSE); - /* Avoid skipping a data column read when resilvering */ - if (io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) + /* Avoid skipping a data column read when scrubbing */ + if (io_flags & ZIO_FLAG_SCRUB) return (B_FALSE); - return (vd->vdev_read_sit_out_expire >= gethrtime()); + return (vd->vdev_read_sit_out_expire >= gethrestime_sec()); } /* @@ -2318,10 +2318,10 @@ vdev_sit_out_reads(vdev_t *vd, zio_flag_t io_flags) static uint64_t calculate_ewma(uint64_t previous_ewma, uint64_t latest_value) { /* - * Scale using 16 bits with an effective alpha of 0.50 + * Scale using 8 bits with an effective alpha of 0.25 */ - const uint64_t scale = 16; - const uint64_t alpha = 32768; + const uint64_t scale = 8; + const uint64_t alpha = 64; 
return (((alpha * latest_value) + (((1ULL << scale) - alpha) * previous_ewma)) >> scale); @@ -2874,10 +2874,14 @@ latency_quartiles_fence(const uint64_t *data, size_t n) else q3 = latency_median_value(&data[(n+1) >> 1], n>>1); - uint64_t iqr = q3 - q1; - uint64_t fence = q3 + iqr; + /* + * To avoid detecting false positive outliers when N is small and + * the latency values are very close, make sure the fence + * is at least 25% larger than Q1. + */ + uint64_t iqr = MAX(q3 - q1, q1 >> 3); - return (fence); + return (q3 + (iqr << 1)); } #define LAT_SAMPLES_STACK 64 @@ -2908,14 +2912,6 @@ vdev_child_slow_outlier(zio_t *zio) if (raidz_read_sit_out_secs == 0 || vd->vdev_children < LAT_SAMPLES_MIN) return; - spa_t *spa = zio->io_spa; - if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT || - spa_load_state(spa) == SPA_LOAD_RECOVER || - (spa_load_state(spa) != SPA_LOAD_NONE && - spa->spa_last_open_failed)) { - return; - } - hrtime_t now = gethrtime(); uint64_t last = atomic_load_64(&vd->vdev_last_latency_check); @@ -2934,14 +2930,12 @@ vdev_child_slow_outlier(zio_t *zio) lat_data = &data[0]; uint64_t max = 0; - uint64_t max_outlier_count = 0; vdev_t *svd = NULL; /* suspect vdev */ - vdev_t *ovd = NULL; /* largest outlier vdev */ for (int c = 0; c < samples; c++) { vdev_t *cvd = vd->vdev_child[c]; if (cvd->vdev_read_sit_out_expire != 0) { - if (cvd->vdev_read_sit_out_expire < gethrtime()) { + if (cvd->vdev_read_sit_out_expire < gethrestime_sec()) { /* * Done with our sit out, wait for new outlier * to emerge. @@ -2965,12 +2959,6 @@ vdev_child_slow_outlier(zio_t *zio) max = lat_data[c]; svd = cvd; } - - uint64_t count = atomic_load_64(&cvd->vdev_outlier_count); - if (count > max_outlier_count) { - max_outlier_count = count; - ovd = cvd; - } } qsort((void *)lat_data, samples, sizeof (uint64_t), latency_compare); @@ -2982,28 +2970,26 @@ vdev_child_slow_outlier(zio_t *zio) * higher outlier count than its peers will be considered * a slow disk. 
*/ - if (atomic_add_64_nv(&svd->vdev_outlier_count, 1) > - LAT_OUTLIER_LIMIT && svd == ovd && - svd->vdev_read_sit_out_expire == 0) { + if (++svd->vdev_outlier_count > LAT_OUTLIER_LIMIT) { + ASSERT0(svd->vdev_read_sit_out_expire); /* * Begin a sit out period for this slow drive */ - svd->vdev_read_sit_out_expire = gethrtime() + - SEC2NSEC(raidz_read_sit_out_secs); + svd->vdev_read_sit_out_expire = gethrestime_sec() + + raidz_read_sit_out_secs; /* count each slow io period */ mutex_enter(&svd->vdev_stat_lock); svd->vdev_stat.vs_slow_ios++; mutex_exit(&svd->vdev_stat_lock); - (void) zfs_ereport_post(FM_EREPORT_ZFS_DELAY, spa, svd, - NULL, NULL, 0); + (void) zfs_ereport_post(FM_EREPORT_ZFS_DELAY, + zio->io_spa, svd, NULL, NULL, 0); vdev_dbgmsg(svd, "begin read sit out for %d secs", (int)raidz_read_sit_out_secs); - for (int c = 0; c < vd->vdev_children; c++) { - atomic_store_64( - &vd->vdev_child[c]->vdev_outlier_count, 0); - } + + for (int c = 0; c < vd->vdev_children; c++) + vd->vdev_child[c]->vdev_outlier_count = 0; } } out: diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 8a4a4b0f5cb8..8c954dba2d18 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -702,6 +702,10 @@ tests = ['dio_aligned_block', 'dio_async_always', 'dio_async_fio_ioengines', 'dio_unaligned_block', 'dio_unaligned_filesize'] tags = ['functional', 'direct'] +[tests/functional/events] +tests = ['slow_vdev_sit_out'] +tags = ['functional', 'events'] + [tests/functional/exec] tests = ['exec_001_pos', 'exec_002_neg'] tags = ['functional', 'exec'] diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index 9cf919c3dd0f..fbad1747b642 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -1109,6 +1109,16 @@ function get_pool_prop # property pool zpool get -Hpo value "$prop" "$pool" || log_fail "zpool get $prop $pool" } +# Get the specified vdev property in parsable format or fail +function get_vdev_prop +{ + typeset prop="$1" + typeset pool="$2" + typeset vdev="$3" + + zpool get -Hpo value "$prop" "$pool" "$vdev" || log_fail "zpool get $prop $pool $vdev" +} + # Return 0 if a pool exists; $? 
otherwise # # $1 - pool name diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index 2024c44cc138..7d667c5537c5 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -69,6 +69,7 @@ MULTIHOST_INTERVAL multihost.interval zfs_multihost_interval OVERRIDE_ESTIMATE_RECORDSIZE send.override_estimate_recordsize zfs_override_estimate_recordsize PREFETCH_DISABLE prefetch.disable zfs_prefetch_disable RAIDZ_EXPAND_MAX_REFLOW_BYTES vdev.expand_max_reflow_bytes raidz_expand_max_reflow_bytes +RAIDZ_READ_SIT_OUT_SECS vdev.raidz_read_sit_out_secs raidz_read_sit_out_secs REBUILD_SCRUB_ENABLED rebuild_scrub_enabled zfs_rebuild_scrub_enabled REMOVAL_SUSPEND_PROGRESS removal_suspend_progress zfs_removal_suspend_progress REMOVE_MAX_SEGMENT remove_max_segment zfs_remove_max_segment diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index df183825dc68..9bbe99b0afd8 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1497,6 +1497,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/events/events_001_pos.ksh \ functional/events/events_002_pos.ksh \ functional/events/setup.ksh \ + functional/events/slow_vdev_sit_out.ksh \ functional/events/zed_cksum_config.ksh \ functional/events/zed_cksum_reported.ksh \ functional/events/zed_diagnose_multiple.ksh \ diff --git a/tests/zfs-tests/tests/functional/events/slow_vdev_sit_out.ksh b/tests/zfs-tests/tests/functional/events/slow_vdev_sit_out.ksh new file mode 100755 index 000000000000..27c3a8f5f823 --- /dev/null +++ b/tests/zfs-tests/tests/functional/events/slow_vdev_sit_out.ksh @@ -0,0 +1,93 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# Copyright (c) 2024 by Lawrence Livermore National Security, LLC. + +# DESCRIPTION: +# Verify that vdevs 'sit out' when they are slow +# +# STRATEGY: +# 1. Create various raidz/draid pools +# 2. Inject delays into one of the disks +# 3. Verify disk is set to 'sit out' for a while. +# 4. Wait for RAIDZ_READ_SIT_OUT_SECS and verify sit out state is lifted. +# +. 
$STF_SUITE/include/libtest.shlib + +# Avoid name conflicts with the default pool +TESTPOOL2=testpool2 + +function cleanup +{ + restore_tunable RAIDZ_READ_SIT_OUT_SECS + log_must zinject -c all + destroy_pool $TESTPOOL2 + log_must rm -f $TEST_BASE_DIR/vdev.$$.* +} + +log_assert "Verify sit_out works" + +log_onexit cleanup + +# shorten sit out period for testing +save_tunable RAIDZ_READ_SIT_OUT_SECS +set_tunable32 RAIDZ_READ_SIT_OUT_SECS 5 + +log_must truncate -s 150M $TEST_BASE_DIR/vdev.$$.{0..9} + +for raidtype in raidz raidz2 raidz3 draid1 draid2 draid3 ; do + log_must zpool create $TESTPOOL2 $raidtype $TEST_BASE_DIR/vdev.$$.{0..9} + log_must dd if=/dev/urandom of=/$TESTPOOL2/bigfile bs=1M count=100 + log_must zpool export $TESTPOOL2 + log_must zpool import -d $TEST_BASE_DIR $TESTPOOL2 + + BAD_VDEV=$TEST_BASE_DIR/vdev.$$.9 + + # Initial state should not be sitting out + log_must eval [[ "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV)" == "off" ]] + + # Delay our reads 200ms to trigger sit out + log_must zinject -d $BAD_VDEV -D200:1 -T read $TESTPOOL2 + + # Do some reads and wait for us to sit out + for i in {1..100} ; do + dd if=/$TESTPOOL2/bigfile skip=$i bs=1M count=1 of=/dev/null >/dev/null 2>&1 + + sit_out=$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV) + if [[ "$sit_out" == "on" ]] ; then + break + fi + done + + log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV)" == "on" + + # Clear fault injection + log_must zinject -c all + + # Wait for us to exit our sit out period + sleep 6 + log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV)" == "off" + destroy_pool $TESTPOOL2 +done + +log_pass "sit_out works correctly"
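
A worked example of the fixed-point EWMA math in calculate_ewma(), as a standalone userspace sketch. It assumes the patch-2 constants (scale = 8 bits, alpha = 64, an effective alpha of 0.25) and seeds the average with the first sample, mirroring vdev_raidz_child_done(); the latency values are hypothetical and the program is illustrative only, not part of the patches.

#include <stdio.h>
#include <stdint.h>

static uint64_t
calculate_ewma(uint64_t previous_ewma, uint64_t latest_value)
{
	const uint64_t scale = 8;	/* fixed-point fraction bits */
	const uint64_t alpha = 64;	/* 64 / 256 == 0.25 */

	return (((alpha * latest_value) +
	    (((1ULL << scale) - alpha) * previous_ewma)) >> scale);
}

int
main(void)
{
	/* hypothetical per-read latencies in nanoseconds */
	uint64_t samples[] = { 1000000, 1100000, 900000, 8000000, 1000000 };
	uint64_t ewma = 0;

	for (size_t i = 0; i < sizeof (samples) / sizeof (samples[0]); i++) {
		/* as in vdev_raidz_child_done(), seed with the first sample */
		uint64_t prev = (ewma == 0) ? samples[i] : ewma;
		ewma = calculate_ewma(prev, samples[i]);
		printf("read %zu: latency %ju -> ewma %ju\n",
		    i, (uintmax_t)samples[i], (uintmax_t)ewma);
	}
	return (0);
}

Note how the single 8 ms spike only pulls the average up by a quarter of the difference: one slow read is not enough to make a child an outlier, but a persistently slow disk is.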
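
Similarly, a standalone sketch of the outlier fence math as it stands after patch 2: fence = Q3 + 2 x IQR, with the IQR floored at Q1 / 8 so that tightly clustered latencies do not produce false positives. The eight "healthy" values near 1 ms and the one slow value near 4 ms are hypothetical.

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

#define	MAX(a, b)	((a) > (b) ? (a) : (b))

static int
latency_compare(const void *a1, const void *a2)
{
	const uint64_t *l1 = a1, *l2 = a2;

	return ((*l1 > *l2) - (*l1 < *l2));
}

/* median of a sorted set of n values */
static uint64_t
latency_median_value(const uint64_t *data, size_t n)
{
	if (n % 2 == 0)
		return ((data[(n >> 1) - 1] + data[n >> 1]) >> 1);
	return (data[((n + 1) >> 1) - 1]);
}

/* fence = Q3 + 2 x IQR, with IQR floored at Q1 / 8 (patch-2 semantics) */
static uint64_t
latency_quartiles_fence(const uint64_t *data, size_t n)
{
	uint64_t q1 = latency_median_value(&data[0], n >> 1);
	uint64_t q3 = (n % 2 == 0) ?
	    latency_median_value(&data[n >> 1], n >> 1) :
	    latency_median_value(&data[(n + 1) >> 1], n >> 1);
	uint64_t iqr = MAX(q3 - q1, q1 >> 3);

	return (q3 + (iqr << 1));
}

int
main(void)
{
	/* hypothetical EWMA latencies (us): eight healthy children, one slow */
	uint64_t lat[] = { 950, 980, 1000, 1010, 1020, 1040, 1060, 1100, 4000 };
	size_t n = sizeof (lat) / sizeof (lat[0]);

	qsort(lat, n, sizeof (uint64_t), latency_compare);
	uint64_t fence = latency_quartiles_fence(lat, n);
	printf("fence = %ju, max = %ju -> %s\n", (uintmax_t)fence,
	    (uintmax_t)lat[n - 1], lat[n - 1] > fence ? "outlier" : "ok");
	return (0);
}

Here Q1 = 990 and Q3 = 1080, so the floored IQR is 123 and the fence is 1326; the 4000 value is flagged. In the kernel code a flagged child only begins a sit out after its outlier count exceeds LAT_OUTLIER_LIMIT consecutive-check wins over its peers.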