RAIDZ Expansion feature #12225

Closed
wants to merge 26 commits
Commits (26)
dda7bf2  disable zstd mempool (ahrens, Jun 10, 2021)
2f2ae11  raidz expansion feature (ahrens, Jun 10, 2021)
c64244b  Increase the number of txgs added to last reflow complete sync txg (fuporovvStack, Oct 25, 2021)
c32cc85  panic in zthr_iscancelled (ahrens, Nov 16, 2021)
8cb83b7  ztest: Skip ztest_vdev_LUN_growth() if raidz expansion is in-progress (fuporovvStack, Oct 25, 2021)
50e8dae  ztest: Make ztest_vdev_raidz_attach() error checking more closer to z… (fuporovvStack, Oct 25, 2021)
16a74ed  ztest: Restore scratch object testing (fuporovvStack, Oct 25, 2021)
05a8136  ztest: Add raidz expansion testing as CLI option (fuporovvStack, Oct 25, 2021)
2020f19  ztest: Fix integer printing (fuporovvStack, Oct 26, 2021)
bee78bb  fix a bug where we can fail to repair a few blocks while in the middl… (ahrens, Nov 18, 2021)
682fa25  manpage comments (ahrens, Feb 22, 2022)
6f8d24f  fix assertion failure in raidz_reflow_sync() (ahrens, Feb 25, 2022)
972d154  one more manpage tweak (ahrens, Feb 25, 2022)
03751b3  Do not work with shadow location in case if scratch space requested (fuporovvStack, Feb 28, 2022)
a26ffc9  Do not switch to scratch offset in the middle of the row (fuporovvStack, Feb 28, 2022)
49f131f  Fix raidz asize computation in case of expansion is in progress (fuporovvStack, Feb 28, 2022)
d99d9a3  Make vdev_rz_expanding config variable syncing more earlier (fuporovvStack, Feb 28, 2022)
cba2253  Skip mmp ub writing if scratch object is active. (fuporovvStack, Apr 19, 2022)
5b53105  Revert "Make vdev_rz_expanding config variable syncing more earlier" (fuporovvStack, Oct 24, 2022)
4c565c7  Skip mmp ub writing if scratch object is active. (fuporovvStack, Nov 14, 2022)
d4c6d36  Handle scratch in case of shadow writes (fuporovvStack, Nov 14, 2022)
6d37388  Scratch logic refactoring (fuporovvStack, Nov 14, 2022)
d2f0fed  Remove skip mmp ub writing if scratch object is active. (fuporovvStack, Nov 16, 2022)
636573d  Fix 'shadow write' to scratch region (fuporovvStack, Nov 16, 2022)
c3de3a6  Remove unneeded argument from mmp.c (fuporovvStack, Nov 16, 2022)
1fd377d  Merge pull request #34 from fuporovvStack/raidz-expand-arithmetic-fixes (ahrens, Nov 17, 2022)
12 changes: 6 additions & 6 deletions cmd/raidz_test/raidz_bench.c
@@ -84,10 +84,10 @@ run_gen_bench_impl(const char *impl)

if (rto_opts.rto_expand) {
rm_bench = vdev_raidz_map_alloc_expanded(
-zio_bench.io_abd,
-zio_bench.io_size, zio_bench.io_offset,
+&zio_bench,
rto_opts.rto_ashift, ncols+1, ncols,
-fn+1, rto_opts.rto_expand_offset);
+fn+1, rto_opts.rto_expand_offset,
+0, B_FALSE);
} else {
rm_bench = vdev_raidz_map_alloc(&zio_bench,
BENCH_ASHIFT, ncols, fn+1);
@@ -172,10 +172,10 @@ run_rec_bench_impl(const char *impl)

if (rto_opts.rto_expand) {
rm_bench = vdev_raidz_map_alloc_expanded(
-zio_bench.io_abd,
-zio_bench.io_size, zio_bench.io_offset,
+&zio_bench,
BENCH_ASHIFT, ncols+1, ncols,
-PARITY_PQR, rto_opts.rto_expand_offset);
+PARITY_PQR,
+rto_opts.rto_expand_offset, 0, B_FALSE);
} else {
rm_bench = vdev_raidz_map_alloc(&zio_bench,
BENCH_ASHIFT, ncols, PARITY_PQR);
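
For orientation: these call sites stop passing the raw abd, size and offset and instead hand the zio itself to the allocator, with two new trailing arguments (0 and B_FALSE at every call site in this PR). A rough sketch of the presumed new prototype follows; the names of the two added parameters are guesses, since the declaration itself is not part of the hunks shown here.

raidz_map_t *vdev_raidz_map_alloc_expanded(zio_t *zio, uint64_t ashift,
    uint64_t physical_cols, uint64_t logical_cols, uint64_t nparity,
    uint64_t reflow_offset, uint64_t reflow_offset_next,
    boolean_t use_scratch);
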
196 changes: 6 additions & 190 deletions cmd/raidz_test/raidz_test.c
@@ -333,14 +333,12 @@ init_raidz_golden_map(raidz_test_opts_t *opts, const int parity)

if (opts->rto_expand) {
opts->rm_golden =
-vdev_raidz_map_alloc_expanded(opts->zio_golden->io_abd,
-opts->zio_golden->io_size, opts->zio_golden->io_offset,
+vdev_raidz_map_alloc_expanded(opts->zio_golden,
opts->rto_ashift, total_ncols+1, total_ncols,
-parity, opts->rto_expand_offset);
-rm_test = vdev_raidz_map_alloc_expanded(zio_test->io_abd,
-zio_test->io_size, zio_test->io_offset,
+parity, opts->rto_expand_offset, 0, B_FALSE);
+rm_test = vdev_raidz_map_alloc_expanded(zio_test,
opts->rto_ashift, total_ncols+1, total_ncols,
-parity, opts->rto_expand_offset);
+parity, opts->rto_expand_offset, 0, B_FALSE);
} else {
opts->rm_golden = vdev_raidz_map_alloc(opts->zio_golden,
opts->rto_ashift, total_ncols, parity);
@@ -367,187 +365,6 @@ init_raidz_golden_map(raidz_test_opts_t *opts, const int parity)
return (err);
}

/*
* If reflow is not in progress, reflow_offset should be UINT64_MAX.
* For each row, if the row is entirely before reflow_offset, it will
* come from the new location. Otherwise this row will come from the
* old location. Therefore, rows that straddle the reflow_offset will
* come from the old location.
*
* NOTE: Until raidz expansion is implemented this function is only
* needed by raidz_test.c to test the multi-row raidz_map_t functionality.
*/
raidz_map_t *
vdev_raidz_map_alloc_expanded(abd_t *abd, uint64_t size, uint64_t offset,
uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
uint64_t nparity, uint64_t reflow_offset)
{
/* The zio's size in units of the vdev's minimum sector size. */
uint64_t s = size >> ashift;
uint64_t q, r, bc, devidx, asize = 0, tot;

/*
* "Quotient": The number of data sectors for this stripe on all but
* the "big column" child vdevs that also contain "remainder" data.
* AKA "full rows"
*/
q = s / (logical_cols - nparity);

/*
* "Remainder": The number of partial stripe data sectors in this I/O.
* This will add a sector to some, but not all, child vdevs.
*/
r = s - q * (logical_cols - nparity);

/* The number of "big columns" - those which contain remainder data. */
bc = (r == 0 ? 0 : r + nparity);

/*
* The total number of data and parity sectors associated with
* this I/O.
*/
tot = s + nparity * (q + (r == 0 ? 0 : 1));

/* How many rows contain data (not skip) */
uint64_t rows = howmany(tot, logical_cols);
int cols = MIN(tot, logical_cols);
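/*
* Illustrative example (not part of the original source): a 10-sector
* write (s = 10) to a raidz1 with logical_cols = 5 and nparity = 1
* gives q = 2 full rows, r = 2 remainder sectors, bc = 3 "big"
* columns, tot = 10 + 1 * (2 + 1) = 13 sectors in all, rows = 3 and
* cols = 5; the last row carries only one parity and two data sectors.
*/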

raidz_map_t *rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
KM_SLEEP);
rm->rm_nrows = rows;

for (uint64_t row = 0; row < rows; row++) {
raidz_row_t *rr = kmem_alloc(offsetof(raidz_row_t,
rr_col[cols]), KM_SLEEP);
rm->rm_row[row] = rr;

/* The starting RAIDZ (parent) vdev sector of the row. */
uint64_t b = (offset >> ashift) + row * logical_cols;

/*
* If we are in the middle of a reflow, and any part of this
* row has not been copied, then use the old location of
* this row.
*/
int row_phys_cols = physical_cols;
if (b + (logical_cols - nparity) > reflow_offset >> ashift)
row_phys_cols--;
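/*
* Illustrative example (not part of the original source): with
* physical_cols = 6 (a raidz1 that just grew from 5 to 6 children),
* logical_cols = 5, nparity = 1, ashift = 9 and reflow_offset = 1 MiB
* (2048 sectors copied so far), a row starting at parent sector
* b = 2000 satisfies 2000 + 4 <= 2048, so it has been fully copied
* and maps onto all 6 children; a row starting at b = 2046 straddles
* the boundary (2046 + 4 > 2048) and is still read from the old
* 5-wide layout (row_phys_cols = 5).
*/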

/* starting child of this row */
uint64_t child_id = b % row_phys_cols;
/* The starting byte offset on each child vdev. */
uint64_t child_offset = (b / row_phys_cols) << ashift;

/*
* We set cols to the entire width of the block, even
* if this row is shorter. This is needed because parity
* generation (for Q and R) needs to know the entire width,
* because it treats the short row as though it was
* full-width (and the "phantom" sectors were zero-filled).
*
* Another approach to this would be to set cols shorter
* (to just the number of columns that we might do i/o to)
* and have another mechanism to tell the parity generation
* about the "entire width". Reconstruction (at least
* vdev_raidz_reconstruct_general()) would also need to
* know about the "entire width".
*/
rr->rr_cols = cols;
rr->rr_bigcols = bc;
rr->rr_missingdata = 0;
rr->rr_missingparity = 0;
rr->rr_firstdatacol = nparity;
rr->rr_abd_empty = NULL;
rr->rr_nempty = 0;

for (int c = 0; c < rr->rr_cols; c++, child_id++) {
if (child_id >= row_phys_cols) {
child_id -= row_phys_cols;
child_offset += 1ULL << ashift;
}
rr->rr_col[c].rc_devidx = child_id;
rr->rr_col[c].rc_offset = child_offset;
rr->rr_col[c].rc_orig_data = NULL;
rr->rr_col[c].rc_error = 0;
rr->rr_col[c].rc_tried = 0;
rr->rr_col[c].rc_skipped = 0;
rr->rr_col[c].rc_need_orig_restore = B_FALSE;

uint64_t dc = c - rr->rr_firstdatacol;
if (c < rr->rr_firstdatacol) {
rr->rr_col[c].rc_size = 1ULL << ashift;
rr->rr_col[c].rc_abd =
abd_alloc_linear(rr->rr_col[c].rc_size,
B_TRUE);
} else if (row == rows - 1 && bc != 0 && c >= bc) {
/*
* Past the end, this is for parity generation.
*/
rr->rr_col[c].rc_size = 0;
rr->rr_col[c].rc_abd = NULL;
} else {
/*
* "data column" (col excluding parity)
* Add an ASCII art diagram here
*/
uint64_t off;

if (c < bc || r == 0) {
off = dc * rows + row;
} else {
off = r * rows +
(dc - r) * (rows - 1) + row;
}
rr->rr_col[c].rc_size = 1ULL << ashift;
rr->rr_col[c].rc_abd = abd_get_offset_struct(
&rr->rr_col[c].rc_abdstruct,
abd, off << ashift, 1 << ashift);
}
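/*
* Illustrative example (not part of the original source), continuing
* the 10-sector raidz1 case above: r = 2 and rows = 3, so data
* columns 0 and 1 are "big" and take abd sectors 0-2 and 3-5, while
* the two short columns take sectors 6-7 and 8-9. The zio's data is
* effectively laid out column-major across the rows of the map.
*/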

asize += rr->rr_col[c].rc_size;
}
/*
* If all data stored spans all columns, there's a danger that
* parity will always be on the same device and, since parity
* isn't read during normal operation, that that device's I/O
* bandwidth won't be used effectively. We therefore switch
* the parity every 1MB.
*
* ...at least that was, ostensibly, the theory. As a practical
* matter unless we juggle the parity between all devices
* evenly, we won't see any benefit. Further, occasional writes
* that aren't a multiple of the LCM of the number of children
* and the minimum stripe width are sufficient to avoid pessimal
* behavior. Unfortunately, this decision created an implicit
* on-disk format requirement that we need to support for all
* eternity, but only for single-parity RAID-Z.
*
* If we intend to skip a sector in the zeroth column for
* padding we must make sure to note this swap. We will never
* intend to skip the first column since at least one data and
* one parity column must appear in each row.
*/
if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
(offset & (1ULL << 20))) {
ASSERT(rr->rr_cols >= 2);
ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
devidx = rr->rr_col[0].rc_devidx;
uint64_t o = rr->rr_col[0].rc_offset;
rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
rr->rr_col[1].rc_devidx = devidx;
rr->rr_col[1].rc_offset = o;
}
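/*
* Illustrative note (not part of the original source): for a raidz1
* block at offset 0x100000 (bit 20 set) the device index and offset
* of column 0 (parity) are exchanged with those of column 1 (the
* first data column); a block at offset 0x80000 is left alone. This
* only applies to single-parity maps (rr_firstdatacol == 1).
*/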

}
ASSERT3U(asize, ==, tot << ashift);

/* init RAIDZ parity ops */
rm->rm_ops = vdev_raidz_math_get_ops();

return (rm);
}

static raidz_map_t *
init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity)
{
@@ -567,10 +384,9 @@ init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity)
init_zio_abd(*zio);

if (opts->rto_expand) {
-rm = vdev_raidz_map_alloc_expanded((*zio)->io_abd,
-(*zio)->io_size, (*zio)->io_offset,
+rm = vdev_raidz_map_alloc_expanded(*zio,
opts->rto_ashift, total_ncols+1, total_ncols,
-parity, opts->rto_expand_offset);
+parity, opts->rto_expand_offset, 0, B_FALSE);
} else {
rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift,
total_ncols, parity);
3 changes: 0 additions & 3 deletions cmd/raidz_test/raidz_test.h
@@ -117,7 +117,4 @@ void init_zio_abd(zio_t *zio);

void run_raidz_benchmark(void);

-struct raidz_map *vdev_raidz_map_alloc_expanded(abd_t *, uint64_t, uint64_t,
-uint64_t, uint64_t, uint64_t, uint64_t, uint64_t);
-
#endif /* RAIDZ_TEST_H */
5 changes: 5 additions & 0 deletions cmd/zdb/zdb.c
@@ -3931,6 +3931,11 @@ dump_uberblock(uberblock_t *ub, const char *header, const char *footer)
}
(void) printf("\tcheckpoint_txg = %llu\n",
(u_longlong_t)ub->ub_checkpoint_txg);

(void) printf("\traidz_reflow state=%u off=%llu\n",
(int)RRSS_GET_STATE(ub),
(u_longlong_t)RRSS_GET_OFFSET(ub));

(void) printf("%s", footer ? footer : "");
}

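
With this change, zdb's uberblock dump gains a raidz reflow line. A quick sketch of how it might look (the pool name and the printed values below are placeholders; the state codes are defined by the RRSS_* macros, not spelled out in this diff):

# zdb -u tank | grep raidz_reflow
	raidz_reflow state=1 off=8589934592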