Commit adfc3ded authored by Linus Torvalds
Browse files

Merge tag 'for-6.12/io_uring-discard-20240913' of git://git.kernel.dk/linux

Pull io_uring async discard support from Jens Axboe:
 "Sitting on top of both the 6.12 block and io_uring core branches,
  here's support for async discard through io_uring.

  This allows applications to issue async discards, rather than rely on
  the blocking sync ioctl discards we already have. The sync support is
  difficult to use outside of idle/cleanup periods.

  On a real (but slow) device, testing shows the following results when
  compared to sync discard:

	qd64 sync discard: 21K IOPS, lat avg 3 msec (max 21 msec)
	qd64 async discard: 76K IOPS, lat avg 845 usec (max 2.2 msec)

	qd64 sync discard: 14K IOPS, lat avg 5 msec (max 25 msec)
	qd64 async discard: 56K IOPS, lat avg 1153 usec (max 3.6 msec)

  and synthetic null_blk testing with the same queue depth and block
  size settings as above shows:

	Type    Trim size       IOPS    Lat avg (usec)  Lat Max (usec)
	==============================================================
	sync    4k               144K       444            20314
	async   4k              1353K        47              595
	sync    1M                56K      1136            21031
	async   1M                94K       680              760"

* tag 'for-6.12/io_uring-discard-20240913' of git://git.kernel.dk/linux:
  block: implement async io_uring discard cmd
  block: introduce blk_validate_byte_range()
  filemap: introduce filemap_invalidate_pages
  io_uring/cmd: give inline space in request to cmds
  io_uring/cmd: expose iowq to cmds
parents 26bb0d3f 50c52250
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -609,6 +609,7 @@ blk_mode_t file_to_blk_mode(struct file *file);
int truncate_bdev_range(struct block_device *bdev, blk_mode_t mode,
		loff_t lstart, loff_t lend);
long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);
int blkdev_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags);
long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);

extern const struct address_space_operations def_blk_aops;
+2 −0
Original line number Diff line number Diff line
@@ -17,6 +17,7 @@
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/module.h>
#include <linux/io_uring/cmd.h>
#include "blk.h"

static inline struct inode *bdev_file_inode(struct file *file)
@@ -865,6 +866,7 @@ const struct file_operations def_blk_fops = {
	.splice_read	= filemap_splice_read,
	.splice_write	= iter_file_splice_write,
	.fallocate	= blkdev_fallocate,
	.uring_cmd	= blkdev_uring_cmd,
	.fop_flags	= FOP_BUFFER_RASYNC,
};

+144 −19
Original line number Diff line number Diff line
@@ -11,6 +11,9 @@
#include <linux/blktrace_api.h>
#include <linux/pr.h>
#include <linux/uaccess.h>
#include <linux/pagemap.h>
#include <linux/io_uring/cmd.h>
#include <uapi/linux/blkdev.h>
#include "blk.h"

static int blkpg_do_ioctl(struct block_device *bdev,
@@ -92,41 +95,54 @@ static int compat_blkpg_ioctl(struct block_device *bdev,
}
#endif

/*
 * Validate the byte range [start, start + len) against @bdev.
 *
 * The range is accepted only if it is non-empty, both @start and @len are
 * multiples of the device's logical block size, start + len does not
 * overflow, and the end does not exceed the size of the device in bytes.
 *
 * Returns 0 if the range is usable, -EINVAL otherwise.
 */
static int blk_validate_byte_range(struct block_device *bdev,
				   uint64_t start, uint64_t len)
{
	const unsigned int lbs_mask = bdev_logical_block_size(bdev) - 1;
	uint64_t range_end;

	/* An empty range is never valid. */
	if (len == 0)
		return -EINVAL;
	/* Both ends must sit on a logical block boundary. */
	if ((start & lbs_mask) || (len & lbs_mask))
		return -EINVAL;
	/* Reject ranges that wrap around the 64-bit byte offset space. */
	if (check_add_overflow(start, len, &range_end))
		return -EINVAL;
	/* The range must lie entirely within the device. */
	if (range_end > bdev_nr_bytes(bdev))
		return -EINVAL;

	return 0;
}

static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
		unsigned long arg)
{
	unsigned int bs_mask = bdev_logical_block_size(bdev) - 1;
	uint64_t range[2], start, len, end;
	uint64_t range[2], start, len;
	struct bio *prev = NULL, *bio;
	sector_t sector, nr_sects;
	struct blk_plug plug;
	int err;

	if (!(mode & BLK_OPEN_WRITE))
		return -EBADF;

	if (!bdev_max_discard_sectors(bdev))
		return -EOPNOTSUPP;
	if (bdev_read_only(bdev))
		return -EPERM;

	if (copy_from_user(range, (void __user *)arg, sizeof(range)))
		return -EFAULT;

	start = range[0];
	len = range[1];

	if (!len)
		return -EINVAL;
	if ((start | len) & bs_mask)
		return -EINVAL;
	if (!bdev_max_discard_sectors(bdev))
		return -EOPNOTSUPP;

	if (check_add_overflow(start, len, &end) ||
	    end > bdev_nr_bytes(bdev))
		return -EINVAL;
	if (!(mode & BLK_OPEN_WRITE))
		return -EBADF;
	if (bdev_read_only(bdev))
		return -EPERM;
	err = blk_validate_byte_range(bdev, start, len);
	if (err)
		return err;

	filemap_invalidate_lock(bdev->bd_mapping);
	err = truncate_bdev_range(bdev, mode, start, end - 1);
	err = truncate_bdev_range(bdev, mode, start, start + len - 1);
	if (err)
		goto fail;

@@ -735,3 +751,112 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
	return ret;
}
#endif

/*
 * Block-layer private state for an io_uring command, accessed through
 * io_uring_cmd_to_pdu().
 */
struct blk_iou_cmd {
	/* Result code handed to io_uring_cmd_done() when the command completes. */
	int res;
	/* Set when the command was issued with IO_URING_F_NONBLOCK. */
	bool nowait;
};

/*
 * Finish a block io_uring command.
 *
 * A non-blocking attempt that ended with -EAGAIN is not completed; it is
 * handed to io_uring_cmd_issue_blocking() so it gets reissued from a blocking
 * context. Everything else is completed with the recorded result.
 */
static void blk_cmd_complete(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
	struct blk_iou_cmd *pdu = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd);
	int result = pdu->res;

	if (pdu->nowait && result == -EAGAIN) {
		io_uring_cmd_issue_blocking(cmd);
		return;
	}

	io_uring_cmd_done(cmd, result, 0, issue_flags);
}

/*
 * bi_end_io callback for the bio that carries the io_uring command in
 * bi_private. Records the bio's error unless a result was already stored,
 * then schedules blk_cmd_complete() through io_uring_cmd_do_in_task_lazy()
 * and drops the bio reference.
 */
static void bio_cmd_bio_end_io(struct bio *bio)
{
	struct io_uring_cmd *ioucmd = bio->bi_private;
	struct blk_iou_cmd *pdu = io_uring_cmd_to_pdu(ioucmd, struct blk_iou_cmd);

	/* A result recorded at submission time takes precedence. */
	if (!pdu->res) {
		if (unlikely(bio->bi_status))
			pdu->res = blk_status_to_errno(bio->bi_status);
	}

	io_uring_cmd_do_in_task_lazy(ioucmd, blk_cmd_complete);
	bio_put(bio);
}

/*
 * Issue a discard of the byte range [start, start + len) on @bdev on behalf
 * of the io_uring command @cmd.
 *
 * With @nowait set the attempt is non-blocking: bios are allocated with
 * GFP_NOWAIT, marked REQ_NOWAIT, and the whole range has to fit in a single
 * bio.
 *
 * Returns -EIOCBQUEUED once the final bio has been submitted; completion is
 * then driven by bio_cmd_bio_end_io(). Otherwise returns a negative errno
 * without queueing completion:
 *   -EOPNOTSUPP  bdev_max_discard_sectors() reports zero
 *   -EBADF       the file was not opened with BLK_OPEN_WRITE
 *   -EPERM       the device is read-only
 *   -EAGAIN      no bio could be allocated, or a nowait attempt would need
 *                more than one bio
 *   other        error from blk_validate_byte_range() or
 *                filemap_invalidate_pages()
 */
static int blkdev_cmd_discard(struct io_uring_cmd *cmd,
			      struct block_device *bdev,
			      uint64_t start, uint64_t len, bool nowait)
{
	struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd);
	gfp_t gfp = nowait ? GFP_NOWAIT : GFP_KERNEL;
	sector_t sector = start >> SECTOR_SHIFT;
	sector_t nr_sects = len >> SECTOR_SHIFT;
	struct bio *prev = NULL, *bio;
	int err;

	if (!bdev_max_discard_sectors(bdev))
		return -EOPNOTSUPP;
	if (!(file_to_blk_mode(cmd->file) & BLK_OPEN_WRITE))
		return -EBADF;
	if (bdev_read_only(bdev))
		return -EPERM;
	err = blk_validate_byte_range(bdev, start, len);
	if (err)
		return err;

	/* The end offset passed here is the last byte of the range (inclusive). */
	err = filemap_invalidate_pages(bdev->bd_mapping, start,
					start + len - 1, nowait);
	if (err)
		return err;

	/*
	 * sector and nr_sects are passed by reference.
	 * NOTE(review): blk_alloc_discard_bio() presumably advances them past
	 * the part covered by the returned bio, so nr_sects is what is still
	 * left to discard — confirm against its definition.
	 */
	while (true) {
		bio = blk_alloc_discard_bio(bdev, &sector, &nr_sects, gfp);
		if (!bio)
			break;
		if (nowait) {
			/*
			 * Don't allow multi-bio non-blocking submissions as
			 * subsequent bios may fail but we won't get a direct
			 * indication of that. Normally, the caller should
			 * retry from a blocking context.
			 */
			if (unlikely(nr_sects)) {
				bio_put(bio);
				return -EAGAIN;
			}
			bio->bi_opf |= REQ_NOWAIT;
		}

		prev = bio_chain_and_submit(prev, bio);
	}
	/* Not a single bio was obtained, so nothing has been submitted. */
	if (unlikely(!prev))
		return -EAGAIN;
	/*
	 * The loop stopped with part of the range still outstanding: record
	 * -EAGAIN so blk_cmd_complete() reports it (or reissues the command
	 * from a blocking context when this was a nowait attempt).
	 */
	if (unlikely(nr_sects))
		bic->res = -EAGAIN;

	/* Only the last bio carries the command and the completion callback. */
	prev->bi_private = cmd;
	prev->bi_end_io = bio_cmd_bio_end_io;
	submit_bio(prev);
	return -EIOCBQUEUED;
}

/*
 * uring_cmd entry point for block device files.
 *
 * Rejects SQEs that set any of ioprio, __pad1, len, rw_flags or file_index,
 * initialises the per-command state, and dispatches on the command opcode.
 * The byte offset is taken from sqe->addr and the length from sqe->addr3.
 *
 * Returns the result of the opcode handler, or -EINVAL for a malformed SQE
 * or an unknown opcode.
 */
int blkdev_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
	const struct io_uring_sqe *sqe = cmd->sqe;
	struct blk_iou_cmd *pdu = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd);
	struct block_device *bdev = I_BDEV(cmd->file->f_mapping->host);
	const u32 op = cmd->cmd_op;
	uint64_t offset, nbytes;

	/* These SQE fields are not used by block commands and must be zero. */
	if (unlikely(sqe->ioprio || sqe->__pad1 || sqe->len ||
		     sqe->rw_flags || sqe->file_index))
		return -EINVAL;

	pdu->res = 0;
	pdu->nowait = issue_flags & IO_URING_F_NONBLOCK;

	offset = READ_ONCE(sqe->addr);
	nbytes = READ_ONCE(sqe->addr3);

	/* Discard is the only opcode handled here. */
	if (op != BLOCK_URING_CMD_DISCARD)
		return -EINVAL;

	return blkdev_cmd_discard(cmd, bdev, offset, nbytes, pdu->nowait);
}
+15 −0
Original line number Diff line number Diff line
@@ -23,6 +23,15 @@ static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe)
	return sqe->cmd;
}

/*
 * Build-time guard: fails the build if a command-private structure of
 * @cmd_sz bytes does not fit in the pdu field of struct io_uring_cmd.
 */
static inline void io_uring_cmd_private_sz_check(size_t cmd_sz)
{
	BUILD_BUG_ON(cmd_sz > sizeof_field(struct io_uring_cmd, pdu));
}
/*
 * Yield the pdu area of @cmd as a pointer to @pdu_type, after checking at
 * build time that sizeof(@pdu_type) fits in that area.
 */
#define io_uring_cmd_to_pdu(cmd, pdu_type) ( \
	io_uring_cmd_private_sz_check(sizeof(pdu_type)), \
	((pdu_type *)&(cmd)->pdu) \
)

#if defined(CONFIG_IO_URING)
int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
			      struct iov_iter *iter, void *ioucmd);
@@ -48,6 +57,9 @@ void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
		unsigned int issue_flags);

/* Execute the request from a blocking context */
void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd);

#else
static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
			      struct iov_iter *iter, void *ioucmd)
@@ -67,6 +79,9 @@ static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
		unsigned int issue_flags)
{
}
/*
 * No-op stub.
 * NOTE(review): appears to be the fallback used when CONFIG_IO_URING is not
 * enabled — confirm against the surrounding #if/#else.
 */
static inline void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd)
{
}
#endif

/*
+2 −0
Original line number Diff line number Diff line
@@ -32,6 +32,8 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
		pgoff_t start, pgoff_t end);
int kiocb_invalidate_pages(struct kiocb *iocb, size_t count);
void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count);
int filemap_invalidate_pages(struct address_space *mapping,
			     loff_t pos, loff_t end, bool nowait);

int write_inode_now(struct inode *, int sync);
int filemap_fdatawrite(struct address_space *);
Loading