Commit e5403415 authored by Linus Torvalds
Browse files

Merge tag 'block-6.16-20250626' of git://git.kernel.dk/linux

Pull block fixes from Jens Axboe:

 - Fixes for ublk:
      - fix C++ narrowing warnings in the uapi header
      - update/improve UBLK_F_SUPPORT_ZERO_COPY comment in uapi header
      - fix for the ublk ->queue_rqs() implementation, limiting a batch
        to just the specific task AND ring
      - ublk_get_data() error handling fix
      - sanity check more arguments in ublk_ctrl_add_dev()
      - selftest addition

 - NVMe pull request via Christoph:
      - reset delayed remove_work after reconnect
      - fix atomic write size validation

 - Fix for a warning introduced in bdev_count_inflight_rw() in this
   merge window

* tag 'block-6.16-20250626' of git://git.kernel.dk/linux:
  block: fix false warning in bdev_count_inflight_rw()
  ublk: sanity check add_dev input for underflow
  nvme: fix atomic write size validation
  nvme: refactor the atomic write unit detection
  nvme: reset delayed remove_work after reconnect
  ublk: setup ublk_io correctly in case of ublk_get_data() failure
  ublk: update UBLK_F_SUPPORT_ZERO_COPY comment in UAPI header
  ublk: fix narrowing warnings in UAPI header
  selftests: ublk: don't take same backing file for more than one ublk devices
  ublk: build batch from IOs in same io_ring_ctx and io task
parents 0a47e02d c0070621
Loading
Loading
Loading
Loading
+15 −11
Original line number Diff line number Diff line
@@ -128,23 +128,27 @@ static void part_stat_read_all(struct block_device *part,
static void bdev_count_inflight_rw(struct block_device *part,
		unsigned int inflight[2], bool mq_driver)
{
	int write = 0;
	int read = 0;
	int cpu;

	if (mq_driver) {
		blk_mq_in_driver_rw(part, inflight);
	} else {
		for_each_possible_cpu(cpu) {
			inflight[READ] += part_stat_local_read_cpu(
						part, in_flight[READ], cpu);
			inflight[WRITE] += part_stat_local_read_cpu(
						part, in_flight[WRITE], cpu);
		return;
	}

	for_each_possible_cpu(cpu) {
		read += part_stat_local_read_cpu(part, in_flight[READ], cpu);
		write += part_stat_local_read_cpu(part, in_flight[WRITE], cpu);
	}

	if (WARN_ON_ONCE((int)inflight[READ] < 0))
		inflight[READ] = 0;
	if (WARN_ON_ONCE((int)inflight[WRITE] < 0))
		inflight[WRITE] = 0;
	/*
	 * While iterating all CPUs, some IOs may be issued from a CPU already
	 * traversed and complete on a CPU that has not yet been traversed,
	 * causing the inflight number to be negative.
	 */
	inflight[READ] = read > 0 ? read : 0;
	inflight[WRITE] = write > 0 ? write : 0;
}

/**
+37 −12
Original line number Diff line number Diff line
@@ -1148,8 +1148,8 @@ static inline void __ublk_complete_rq(struct request *req)
	blk_mq_end_request(req, res);
}

static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req,
				 int res, unsigned issue_flags)
static struct io_uring_cmd *__ublk_prep_compl_io_cmd(struct ublk_io *io,
						     struct request *req)
{
	/* read cmd first because req will overwrite it */
	struct io_uring_cmd *cmd = io->cmd;
@@ -1164,6 +1164,13 @@ static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req,
	io->flags &= ~UBLK_IO_FLAG_ACTIVE;

	io->req = req;
	return cmd;
}

static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req,
				 int res, unsigned issue_flags)
{
	struct io_uring_cmd *cmd = __ublk_prep_compl_io_cmd(io, req);

	/* tell ublksrv one io request is coming */
	io_uring_cmd_done(cmd, res, 0, issue_flags);
@@ -1416,6 +1423,14 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
	return BLK_STS_OK;
}

/*
 * Return true when @io and @io2 can be placed in the same batch: the
 * io_uring_cmd_ctx_handle() of their ->cmd must compare equal and both
 * ios must have the same ->task.
 */
static inline bool ublk_belong_to_same_batch(const struct ublk_io *io,
					     const struct ublk_io *io2)
{
	/* A differing ctx handle rules the pair out before ->task is compared. */
	if (io_uring_cmd_ctx_handle(io->cmd) !=
	    io_uring_cmd_ctx_handle(io2->cmd))
		return false;

	return io->task == io2->task;
}

static void ublk_queue_rqs(struct rq_list *rqlist)
{
	struct rq_list requeue_list = { };
@@ -1427,7 +1442,8 @@ static void ublk_queue_rqs(struct rq_list *rqlist)
		struct ublk_queue *this_q = req->mq_hctx->driver_data;
		struct ublk_io *this_io = &this_q->ios[req->tag];

		if (io && io->task != this_io->task && !rq_list_empty(&submit_list))
		if (io && !ublk_belong_to_same_batch(io, this_io) &&
				!rq_list_empty(&submit_list))
			ublk_queue_cmd_list(io, &submit_list);
		io = this_io;

@@ -2148,10 +2164,9 @@ static int ublk_commit_and_fetch(const struct ublk_queue *ubq,
	return 0;
}

static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io)
static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io,
			  struct request *req)
{
	struct request *req = io->req;

	/*
	 * We have handled UBLK_IO_NEED_GET_DATA command,
	 * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
@@ -2178,6 +2193,7 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
	u32 cmd_op = cmd->cmd_op;
	unsigned tag = ub_cmd->tag;
	int ret = -EINVAL;
	struct request *req;

	pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
			__func__, cmd->cmd_op, ub_cmd->q_id, tag,
@@ -2236,11 +2252,19 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
			goto out;
		break;
	case UBLK_IO_NEED_GET_DATA:
		io->addr = ub_cmd->addr;
		if (!ublk_get_data(ubq, io))
			return -EIOCBQUEUED;

		/*
		 * ublk_get_data() may fail and fallback to requeue, so keep
		 * uring_cmd active first and prepare for handling new requeued
		 * request
		 */
		req = io->req;
		ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
		io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
		if (likely(ublk_get_data(ubq, io, req))) {
			__ublk_prep_compl_io_cmd(io, req);
			return UBLK_IO_RES_OK;
		}
		break;
	default:
		goto out;
	}
@@ -2825,7 +2849,8 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
	if (copy_from_user(&info, argp, sizeof(info)))
		return -EFAULT;

	if (info.queue_depth > UBLK_MAX_QUEUE_DEPTH || info.nr_hw_queues > UBLK_MAX_NR_QUEUES)
	if (info.queue_depth > UBLK_MAX_QUEUE_DEPTH || !info.queue_depth ||
	    info.nr_hw_queues > UBLK_MAX_NR_QUEUES || !info.nr_hw_queues)
		return -EINVAL;

	if (capable(CAP_SYS_ADMIN))
+42 −45
Original line number Diff line number Diff line
@@ -2015,21 +2015,41 @@ static void nvme_configure_metadata(struct nvme_ctrl *ctrl,
}


static void nvme_update_atomic_write_disk_info(struct nvme_ns *ns,
			struct nvme_id_ns *id, struct queue_limits *lim,
			u32 bs, u32 atomic_bs)
static u32 nvme_configure_atomic_write(struct nvme_ns *ns,
		struct nvme_id_ns *id, struct queue_limits *lim, u32 bs)
{
	unsigned int boundary = 0;
	u32 atomic_bs, boundary = 0;

	if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf) {
		if (le16_to_cpu(id->nabspf))
	/*
	 * We do not support an offset for the atomic boundaries.
	 */
	if (id->nabo)
		return bs;

	if ((id->nsfeat & NVME_NS_FEAT_ATOMICS) && id->nawupf) {
		/*
		 * Use the per-namespace atomic write unit when available.
		 */
		atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
		if (id->nabspf)
			boundary = (le16_to_cpu(id->nabspf) + 1) * bs;
	} else {
		/*
		 * Use the controller wide atomic write unit.  This sucks
		 * because the limit is defined in terms of logical blocks while
		 * namespaces can have different formats, and because there is
		 * no clear language in the specification prohibiting different
		 * values for different controllers in the subsystem.
		 */
		atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs;
	}

	lim->atomic_write_hw_max = atomic_bs;
	lim->atomic_write_hw_boundary = boundary;
	lim->atomic_write_hw_unit_min = bs;
	lim->atomic_write_hw_unit_max = rounddown_pow_of_two(atomic_bs);
	lim->features |= BLK_FEAT_ATOMIC_WRITES;
	return atomic_bs;
}

static u32 nvme_max_drv_segments(struct nvme_ctrl *ctrl)
@@ -2067,34 +2087,8 @@ static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id,
		valid = false;
	}

	atomic_bs = phys_bs = bs;
	if (id->nabo == 0) {
		/*
		 * Bit 1 indicates whether NAWUPF is defined for this namespace
		 * and whether it should be used instead of AWUPF. If NAWUPF ==
		 * 0 then AWUPF must be used instead.
		 */
		if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf)
			atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
		else
			atomic_bs = (1 + ns->ctrl->awupf) * bs;

		/*
		 * Set subsystem atomic bs.
		 */
		if (ns->ctrl->subsys->atomic_bs) {
			if (atomic_bs != ns->ctrl->subsys->atomic_bs) {
				dev_err_ratelimited(ns->ctrl->device,
					"%s: Inconsistent Atomic Write Size, Namespace will not be added: Subsystem=%d bytes, Controller/Namespace=%d bytes\n",
					ns->disk ? ns->disk->disk_name : "?",
					ns->ctrl->subsys->atomic_bs,
					atomic_bs);
			}
		} else
			ns->ctrl->subsys->atomic_bs = atomic_bs;

		nvme_update_atomic_write_disk_info(ns, id, lim, bs, atomic_bs);
	}
	phys_bs = bs;
	atomic_bs = nvme_configure_atomic_write(ns, id, lim, bs);

	if (id->nsfeat & NVME_NS_FEAT_IO_OPT) {
		/* NPWG = Namespace Preferred Write Granularity */
@@ -2382,16 +2376,6 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
	if (!nvme_update_disk_info(ns, id, &lim))
		capacity = 0;

	/*
	 * Validate the max atomic write size fits within the subsystem's
	 * atomic write capabilities.
	 */
	if (lim.atomic_write_hw_max > ns->ctrl->subsys->atomic_bs) {
		blk_mq_unfreeze_queue(ns->disk->queue, memflags);
		ret = -ENXIO;
		goto out;
	}

	nvme_config_discard(ns, &lim);
	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
	    ns->head->ids.csi == NVME_CSI_ZNS)
@@ -3215,6 +3199,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
	memcpy(subsys->model, id->mn, sizeof(subsys->model));
	subsys->vendor_id = le16_to_cpu(id->vid);
	subsys->cmic = id->cmic;
	subsys->awupf = le16_to_cpu(id->awupf);

	/* Versions prior to 1.4 don't necessarily report a valid type */
	if (id->cntrltype == NVME_CTRL_DISC ||
@@ -3552,6 +3537,15 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl)
		if (ret)
			goto out_free;
	}

	if (le16_to_cpu(id->awupf) != ctrl->subsys->awupf) {
		dev_err_ratelimited(ctrl->device,
			"inconsistent AWUPF, controller not added (%u/%u).\n",
			le16_to_cpu(id->awupf), ctrl->subsys->awupf);
		ret = -EINVAL;
		goto out_free;
	}

	memcpy(ctrl->subsys->firmware_rev, id->fr,
	       sizeof(ctrl->subsys->firmware_rev));

@@ -3647,7 +3641,6 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl)
		dev_pm_qos_expose_latency_tolerance(ctrl->device);
	else if (!ctrl->apst_enabled && prev_apst_enabled)
		dev_pm_qos_hide_latency_tolerance(ctrl->device);
	ctrl->awupf = le16_to_cpu(id->awupf);
out_free:
	kfree(id);
	return ret;
@@ -4036,6 +4029,10 @@ static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info)
	list_add_tail_rcu(&ns->siblings, &head->list);
	ns->head = head;
	mutex_unlock(&ctrl->subsys->lock);

#ifdef CONFIG_NVME_MULTIPATH
	cancel_delayed_work(&head->remove_work);
#endif
	return 0;

out_put_ns_head:
+1 −1
Original line number Diff line number Diff line
@@ -1311,7 +1311,7 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head)
		 */
		if (!try_module_get(THIS_MODULE))
			goto out;
		queue_delayed_work(nvme_wq, &head->remove_work,
		mod_delayed_work(nvme_wq, &head->remove_work,
				head->delayed_removal_secs * HZ);
	} else {
		list_del_init(&head->entry);
+1 −2
Original line number Diff line number Diff line
@@ -410,7 +410,6 @@ struct nvme_ctrl {

	enum nvme_ctrl_type cntrltype;
	enum nvme_dctype dctype;
	u16 awupf; /* 0's based value. */
};

static inline enum nvme_ctrl_state nvme_ctrl_state(struct nvme_ctrl *ctrl)
@@ -443,11 +442,11 @@ struct nvme_subsystem {
	u8			cmic;
	enum nvme_subsys_type	subtype;
	u16			vendor_id;
	u16			awupf; /* 0's based value. */
	struct ida		ns_ida;
#ifdef CONFIG_NVME_MULTIPATH
	enum nvme_iopolicy	iopolicy;
#endif
	u32			atomic_bs;
};

/*
Loading