Merge pull request #203 from truenas/NAS-129309
NAS-129309 / None / Copy offload support for block devices
ixhamza authored Dec 19, 2024
2 parents 8d2935f + 3142692 commit d2e6686
Showing 13 changed files with 450 additions and 10 deletions.
23 changes: 23 additions & 0 deletions Documentation/ABI/stable/sysfs-block
@@ -218,6 +218,29 @@ Description:
last zone of the device which may be smaller.


What: /sys/block/<disk>/queue/copy_max_bytes
Date: May 2024
Contact: [email protected]
Description:
[RW] This is the maximum number of bytes that the block layer
will allow for a copy request. It is always smaller than or
equal to the maximum size allowed by the hardware, as indicated
by 'copy_max_hw_bytes'. An attempt to set a value higher than
'copy_max_hw_bytes' will be truncated to 'copy_max_hw_bytes'.
Writing '0' to this file disables copy offloading for this
device; copies are then performed via emulation.


What: /sys/block/<disk>/queue/copy_max_hw_bytes
Date: May 2024
Contact: [email protected]
Description:
[RO] This is the maximum number of bytes that the hardware
will allow for a single data copy request.
A value of 0 means that the device does not support
copy offload.
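
A minimal user-space sketch of how these attributes could be inspected
(illustrative only; the disk name "sda" and the helper below are
assumptions, not part of this change):

#include <stdio.h>

/* Read a sysfs queue attribute of a disk into buf; returns 0 on success. */
static int read_queue_attr(const char *disk, const char *attr,
                           char *buf, int len)
{
        char path[256];
        FILE *f;
        int ret;

        snprintf(path, sizeof(path), "/sys/block/%s/queue/%s", disk, attr);
        f = fopen(path, "r");
        if (!f)
                return -1;
        ret = fgets(buf, len, f) ? 0 : -1;
        fclose(f);
        return ret;
}

int main(void)
{
        char val[64];

        /* "sda" is a placeholder disk name. */
        if (!read_queue_attr("sda", "copy_max_hw_bytes", val, sizeof(val)))
                printf("hw limit: %s", val);      /* 0 => no copy offload */
        if (!read_queue_attr("sda", "copy_max_bytes", val, sizeof(val)))
                printf("current limit: %s", val); /* write '0' to force emulation */
        return 0;
}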


What: /sys/block/<disk>/queue/crypto/
Date: February 2022
Contact: [email protected]
7 changes: 7 additions & 0 deletions block/blk-core.c
@@ -108,6 +108,8 @@ static const char *const blk_op_name[] = {
REQ_OP_NAME(ZONE_FINISH),
REQ_OP_NAME(ZONE_APPEND),
REQ_OP_NAME(WRITE_ZEROES),
REQ_OP_NAME(COPY_SRC),
REQ_OP_NAME(COPY_DST),
REQ_OP_NAME(DRV_IN),
REQ_OP_NAME(DRV_OUT),
};
@@ -852,6 +854,11 @@ void submit_bio_noacct(struct bio *bio)
if (!bdev_is_zoned(bio->bi_bdev))
goto not_supported;
break;
case REQ_OP_COPY_SRC:
case REQ_OP_COPY_DST:
if (!q->limits.max_copy_sectors)
goto not_supported;
break;
case REQ_OP_DRV_IN:
case REQ_OP_DRV_OUT:
/*
225 changes: 225 additions & 0 deletions block/blk-lib.c
@@ -103,6 +103,231 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
}
EXPORT_SYMBOL(blkdev_issue_discard);

static inline ssize_t blkdev_copy_sanity_check(struct block_device *bdev_in,
loff_t pos_in,
struct block_device *bdev_out,
loff_t pos_out, size_t len)
{
unsigned int align = max(bdev_logical_block_size(bdev_out),
bdev_logical_block_size(bdev_in)) - 1;

if ((pos_in & align) || (pos_out & align) || (len & align) || !len ||
len >= BLK_COPY_MAX_BYTES)
return -EINVAL;

return 0;
}

static inline void blkdev_copy_endio(struct blkdev_copy_io *cio)
{
if (cio->endio) {
cio->endio(cio->private, cio->status, cio->copied);
kfree(cio);
} else {
struct task_struct *waiter = cio->waiter;

WRITE_ONCE(cio->waiter, NULL);
blk_wake_io_task(waiter);
}
}

/*
* This must only be called once all bios have been issued so that the refcount
* can only decrease. This just waits for all bios to complete.
* Returns the number of bytes copied or a negative error code
*/
static ssize_t blkdev_copy_wait_for_completion_io(struct blkdev_copy_io *cio)
{
ssize_t ret;

for (;;) {
__set_current_state(TASK_UNINTERRUPTIBLE);
if (!READ_ONCE(cio->waiter))
break;
blk_io_schedule();
}
__set_current_state(TASK_RUNNING);
ret = cio->copied;
kfree(cio);

return ret;
}

static void blkdev_copy_offload_src_endio(struct bio *bio)
{
struct blkdev_copy_offload_io *offload_io = bio->bi_private;
struct blkdev_copy_io *cio = offload_io->cio;

if (bio->bi_status) {
cio->copied = min_t(ssize_t, offload_io->offset, cio->copied);
if (!cio->status)
cio->status = blk_status_to_errno(bio->bi_status);
}
bio_put(bio);
if (offload_io->dst_bio)
bio_put(offload_io->dst_bio);

kfree(offload_io);

if (atomic_dec_and_test(&cio->refcount))
blkdev_copy_endio(cio);
}

/*
* @bdev: source block device
* @pos_in: source offset
* @pos_out: destination offset
* @len: length in bytes to be copied
* @endio: endio function called on completion of the copy operation;
* for synchronous operation this should be NULL
* @private: private data passed to the endio function; for synchronous
* operation this should be NULL
* @gfp: memory allocation flags (for bio_alloc)
* @bdev_out: destination block device
*
* For synchronous operation, returns the number of bytes copied or a
* negative error code.
* For asynchronous operation, returns -EIOCBQUEUED or a negative error code.
*
* Description:
* Copy a range from a source offset on @bdev to a destination offset on
* @bdev_out, using the device's native copy offload feature.
* The copy operation is performed using two bios:
* 1. Take a plug and send a REQ_OP_COPY_DST bio with the destination
* sector and length. Once this bio reaches the request layer, a request
* is formed that waits for the source bio to arrive.
* 2. Issue a REQ_OP_COPY_SRC bio with the source sector and length. Once
* this bio reaches the request layer and finds the request carrying the
* previously sent destination info, the source bio is merged into it.
* 3. Release the plug; the merged request is sent to the driver.
* This design works only for drivers with a request queue.
*/
ssize_t blkdev_copy_offload(struct block_device *bdev, loff_t pos_in,
loff_t pos_out, size_t len,
void (*endio)(void *, int, ssize_t),
void *private, gfp_t gfp, struct block_device *bdev_out)
{
struct blkdev_copy_io *cio;
struct blkdev_copy_offload_io *offload_io;
struct bio *src_bio, *dst_bio;
size_t rem, chunk;
ssize_t ret;
struct blk_plug plug;
int is_mq = 0;
size_t max_copy_bytes = min(bdev_max_copy_sectors(bdev) << SECTOR_SHIFT,
bdev_max_copy_sectors(bdev_out) << SECTOR_SHIFT);

if (!max_copy_bytes)
return -EOPNOTSUPP;

if (queue_is_mq(bdev->bd_queue)) {
if (bdev->bd_queue->mq_ops != bdev_out->bd_queue->mq_ops)
return -EOPNOTSUPP;
is_mq = 1;
} else if (!bdev->bd_disk->fops->submit_bio ||
bdev->bd_disk->fops->submit_bio != bdev_out->bd_disk->fops->submit_bio) {
return -EOPNOTSUPP;
}

/*
* Single-queue (bio-based) copy is only supported for zvols
*/
if (!is_mq && strncmp(bdev->bd_disk->disk_name, "zd", 2))
return -EOPNOTSUPP;

/*
* Cross-device copy is only supported for zvols
*/
if (bdev != bdev_out && strncmp(bdev_out->bd_disk->disk_name, "zd", 2))
return -EOPNOTSUPP;

ret = blkdev_copy_sanity_check(bdev, pos_in, bdev_out, pos_out, len);
if (ret)
return ret;

cio = kzalloc(sizeof(*cio), gfp);
if (!cio)
return -ENOMEM;
atomic_set(&cio->refcount, 1);
cio->waiter = current;
cio->endio = endio;
cio->private = private;

/*
* If there is an error, copied is reduced to the lowest successfully
* completed copy length
*/
cio->copied = len;
for (rem = len; rem > 0; rem -= chunk) {
chunk = min(rem, max_copy_bytes);

offload_io = kzalloc(sizeof(*offload_io), gfp);
if (!offload_io)
goto err_free_cio;
offload_io->cio = cio;
/*
* For partial completion, we use offload_io->offset to truncate the
* successfully copied length
*/
offload_io->offset = len - rem;
offload_io->driver_private = bdev_out->bd_queue->queuedata;

dst_bio = bio_alloc(bdev, 0, REQ_OP_COPY_DST, gfp);
if (!dst_bio)
goto err_free_offload_io;
dst_bio->bi_iter.bi_size = chunk;
dst_bio->bi_iter.bi_sector = pos_out >> SECTOR_SHIFT;

if (is_mq) {
blk_start_plug(&plug);
src_bio = blk_next_bio(dst_bio, bdev, 0, REQ_OP_COPY_SRC, gfp);
} else {
src_bio = bio_alloc(bdev, 0, REQ_OP_COPY_SRC, gfp);
}
if (!src_bio)
goto err_free_dst_bio;
src_bio->bi_iter.bi_size = chunk;
src_bio->bi_iter.bi_sector = pos_in >> SECTOR_SHIFT;
src_bio->bi_end_io = blkdev_copy_offload_src_endio;
src_bio->bi_private = offload_io;
offload_io->dst_bio = (is_mq) ? NULL : dst_bio;

atomic_inc(&cio->refcount);
submit_bio(src_bio);
if (is_mq)
blk_finish_plug(&plug);
pos_in += chunk;
pos_out += chunk;
}

if (atomic_dec_and_test(&cio->refcount))
blkdev_copy_endio(cio);
if (endio)
return -EIOCBQUEUED;

return blkdev_copy_wait_for_completion_io(cio);

err_free_dst_bio:
if (is_mq)
blk_finish_plug(&plug);
bio_put(dst_bio);
err_free_offload_io:
kfree(offload_io);
err_free_cio:
cio->copied = min_t(ssize_t, cio->copied, (len - rem));
cio->status = -ENOMEM;
if (rem == len) {
ret = cio->status;
kfree(cio);
return ret;
}
if (cio->endio)
return cio->status;

return blkdev_copy_wait_for_completion_io(cio);
}
EXPORT_SYMBOL_GPL(blkdev_copy_offload);
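
A hypothetical in-kernel caller sketch (illustrative only, not part of this
change; the function name, offsets, and error handling are assumptions).
Passing NULL for both endio and private selects the synchronous path:

/* Synchronously copy 1 MiB from offset 0 to offset 16 MiB on one device. */
static int example_copy(struct block_device *bdev)
{
        ssize_t copied;

        copied = blkdev_copy_offload(bdev, 0, 16 << 20, 1 << 20,
                                     NULL, NULL, GFP_KERNEL, bdev);
        if (copied < 0)
                return copied; /* e.g. -EOPNOTSUPP if offload is unsupported */

        return copied == (1 << 20) ? 0 : -EIO;
}

For the asynchronous form, pass an endio callback and private data instead,
and treat an -EIOCBQUEUED return as the copy having been queued.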

static sector_t bio_write_zeroes_limit(struct block_device *bdev)
{
sector_t bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
21 changes: 21 additions & 0 deletions block/blk-merge.c
@@ -979,6 +979,9 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
if (!rq_mergeable(rq) || !bio_mergeable(bio))
return false;

if (blk_copy_offload_mergable(rq, bio))
return true;

if (req_op(rq) != bio_op(bio))
return false;
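
blk_copy_offload_mergable() comes from include/linux/blkdev.h, one of the
changed files whose diff is not shown here; by analogy with
blk_discard_mergable(), a plausible sketch of the check is:

/*
 * Assumption: a copy offload merge pairs a REQ_OP_COPY_SRC bio with a
 * request that already carries the REQ_OP_COPY_DST half.
 */
static inline bool blk_copy_offload_mergable(struct request *req,
                                             struct bio *bio)
{
        return req_op(req) == REQ_OP_COPY_DST &&
               bio_op(bio) == REQ_OP_COPY_SRC;
}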

@@ -1015,6 +1018,8 @@ enum elv_merge blk_try_merge(struct request *rq, struct bio *bio)
{
if (blk_discard_mergable(rq))
return ELEVATOR_DISCARD_MERGE;
else if (blk_copy_offload_mergable(rq, bio))
return ELEVATOR_COPY_OFFLOAD_MERGE;
else if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_iter.bi_sector)
return ELEVATOR_BACK_MERGE;
else if (blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_iter.bi_sector)
@@ -1122,6 +1127,20 @@ static enum bio_merge_status bio_attempt_discard_merge(struct request_queue *q,
return BIO_MERGE_FAILED;
}

static enum bio_merge_status bio_attempt_copy_offload_merge(struct request *req,
struct bio *bio)
{
if (req->__data_len != bio->bi_iter.bi_size)
return BIO_MERGE_FAILED;

req->biotail->bi_next = bio;
req->biotail = bio;
req->nr_phys_segments++;
req->__data_len += bio->bi_iter.bi_size;

return BIO_MERGE_OK;
}

static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
struct request *rq,
struct bio *bio,
@@ -1142,6 +1161,8 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
break;
case ELEVATOR_DISCARD_MERGE:
return bio_attempt_discard_merge(q, rq, bio);
case ELEVATOR_COPY_OFFLOAD_MERGE:
return bio_attempt_copy_offload_merge(rq, bio);
default:
return BIO_MERGE_NONE;
}
(Diffs for the remaining 9 changed files are not shown.)
