Commit ec9b3ac6 authored by Jens Axboe
Browse files

Merge tag 'nvme-6.13-2024-11-21' of git://git.infradead.org/nvme into for-6.13/block

Pull NVMe updates from Keith:

"nvme updates for Linux 6.13

 - Use correct srcu list traversal (Breno)
 - Scatter-gather support for metadata (Keith)
 - Fabrics shutdown race condition fix (Nilay)
 - Persistent reservations updates (Guixin)"

* tag 'nvme-6.13-2024-11-21' of git://git.infradead.org/nvme:
  nvme: tuning pr code by using defined structs and macros
  nvme: introduce change ptpl and iekey definition
  nvme-fabrics: fix kernel crash while shutting down controller
  Revert "nvme: make keep-alive synchronous operation"
  nvme-pci: use sgls for all user requests if possible
  nvme: define the remaining used sgls constants
  nvme-pci: add support for sgl metadata
  nvme/multipath: Fix RCU list traversal to use SRCU primitive
parents 766a71ef 029cc98d
Loading
Loading
Loading
Loading
+15 −7
Original line number Diff line number Diff line
@@ -1294,9 +1294,10 @@ static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl)
	queue_delayed_work(nvme_wq, &ctrl->ka_work, delay);
}

static void nvme_keep_alive_finish(struct request *rq,
		blk_status_t status, struct nvme_ctrl *ctrl)
static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq,
						 blk_status_t status)
{
	struct nvme_ctrl *ctrl = rq->end_io_data;
	unsigned long rtt = jiffies - (rq->deadline - rq->timeout);
	unsigned long delay = nvme_keep_alive_work_period(ctrl);
	enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
@@ -1313,17 +1314,20 @@ static void nvme_keep_alive_finish(struct request *rq,
		delay = 0;
	}

	blk_mq_free_request(rq);

	if (status) {
		dev_err(ctrl->device,
			"failed nvme_keep_alive_end_io error=%d\n",
				status);
		return;
		return RQ_END_IO_NONE;
	}

	ctrl->ka_last_check_time = jiffies;
	ctrl->comp_seen = false;
	if (state == NVME_CTRL_LIVE || state == NVME_CTRL_CONNECTING)
		queue_delayed_work(nvme_wq, &ctrl->ka_work, delay);
	return RQ_END_IO_NONE;
}

static void nvme_keep_alive_work(struct work_struct *work)
@@ -1332,7 +1336,6 @@ static void nvme_keep_alive_work(struct work_struct *work)
			struct nvme_ctrl, ka_work);
	bool comp_seen = ctrl->comp_seen;
	struct request *rq;
	blk_status_t status;

	ctrl->ka_last_check_time = jiffies;

@@ -1355,9 +1358,9 @@ static void nvme_keep_alive_work(struct work_struct *work)
	nvme_init_request(rq, &ctrl->ka_cmd);

	rq->timeout = ctrl->kato * HZ;
	status = blk_execute_rq(rq, false);
	nvme_keep_alive_finish(rq, status, ctrl);
	blk_mq_free_request(rq);
	rq->end_io = nvme_keep_alive_end_io;
	rq->end_io_data = ctrl;
	blk_execute_rq_nowait(rq, false);
}

static void nvme_start_keep_alive(struct nvme_ctrl *ctrl)
@@ -4571,6 +4574,11 @@ EXPORT_SYMBOL_GPL(nvme_alloc_admin_tag_set);

void nvme_remove_admin_tag_set(struct nvme_ctrl *ctrl)
{
	/*
	 * As we're about to destroy the queue and free tagset
	 * we can not have keep-alive work running.
	 */
	nvme_stop_keep_alive(ctrl);
	blk_mq_destroy_queue(ctrl->admin_q);
	blk_put_queue(ctrl->admin_q);
	if (ctrl->ops->flags & NVME_F_FABRICS) {
+10 −2
Original line number Diff line number Diff line
@@ -120,12 +120,20 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer,
	struct nvme_ns *ns = q->queuedata;
	struct block_device *bdev = ns ? ns->disk->part0 : NULL;
	bool supports_metadata = bdev && blk_get_integrity(bdev->bd_disk);
	struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
	bool has_metadata = meta_buffer && meta_len;
	struct bio *bio = NULL;
	int ret;

	if (has_metadata && !supports_metadata)
	if (!nvme_ctrl_sgl_supported(ctrl))
		dev_warn_once(ctrl->device, "using unchecked data buffer\n");
	if (has_metadata) {
		if (!supports_metadata)
			return -EINVAL;
		if (!nvme_ctrl_meta_sgl_supported(ctrl))
			dev_warn_once(ctrl->device,
				      "using unchecked metadata buffer\n");
	}

	if (ioucmd && (ioucmd->flags & IORING_URING_CMD_FIXED)) {
		struct iov_iter iter;
+14 −7
Original line number Diff line number Diff line
@@ -165,7 +165,8 @@ void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
	int srcu_idx;

	srcu_idx = srcu_read_lock(&ctrl->srcu);
	list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
				 srcu_read_lock_held(&ctrl->srcu)) {
		if (!ns->head->disk)
			continue;
		kblockd_schedule_work(&ns->head->requeue_work);
@@ -209,7 +210,8 @@ void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
	int srcu_idx;

	srcu_idx = srcu_read_lock(&ctrl->srcu);
	list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
				 srcu_read_lock_held(&ctrl->srcu)) {
		nvme_mpath_clear_current_path(ns);
		kblockd_schedule_work(&ns->head->requeue_work);
	}
@@ -224,7 +226,8 @@ void nvme_mpath_revalidate_paths(struct nvme_ns *ns)
	int srcu_idx;

	srcu_idx = srcu_read_lock(&head->srcu);
	list_for_each_entry_rcu(ns, &head->list, siblings) {
	list_for_each_entry_srcu(ns, &head->list, siblings,
				 srcu_read_lock_held(&head->srcu)) {
		if (capacity != get_capacity(ns->disk))
			clear_bit(NVME_NS_READY, &ns->flags);
	}
@@ -257,7 +260,8 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
	int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
	struct nvme_ns *found = NULL, *fallback = NULL, *ns;

	list_for_each_entry_rcu(ns, &head->list, siblings) {
	list_for_each_entry_srcu(ns, &head->list, siblings,
				 srcu_read_lock_held(&head->srcu)) {
		if (nvme_path_is_disabled(ns))
			continue;

@@ -356,7 +360,8 @@ static struct nvme_ns *nvme_queue_depth_path(struct nvme_ns_head *head)
	unsigned int min_depth_opt = UINT_MAX, min_depth_nonopt = UINT_MAX;
	unsigned int depth;

	list_for_each_entry_rcu(ns, &head->list, siblings) {
	list_for_each_entry_srcu(ns, &head->list, siblings,
				 srcu_read_lock_held(&head->srcu)) {
		if (nvme_path_is_disabled(ns))
			continue;

@@ -424,7 +429,8 @@ static bool nvme_available_path(struct nvme_ns_head *head)
	if (!test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags))
		return NULL;

	list_for_each_entry_rcu(ns, &head->list, siblings) {
	list_for_each_entry_srcu(ns, &head->list, siblings,
				 srcu_read_lock_held(&head->srcu)) {
		if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags))
			continue;
		switch (nvme_ctrl_state(ns->ctrl)) {
@@ -783,7 +789,8 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
		return 0;

	srcu_idx = srcu_read_lock(&ctrl->srcu);
	list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
				 srcu_read_lock_held(&ctrl->srcu)) {
		unsigned nsid;
again:
		nsid = le32_to_cpu(desc->nsids[n]);
+9 −1
Original line number Diff line number Diff line
@@ -1123,7 +1123,15 @@ static inline void nvme_start_request(struct request *rq)

static inline bool nvme_ctrl_sgl_supported(struct nvme_ctrl *ctrl)
{
	return ctrl->sgls & ((1 << 0) | (1 << 1));
	return ctrl->sgls & (NVME_CTRL_SGLS_BYTE_ALIGNED |
			     NVME_CTRL_SGLS_DWORD_ALIGNED);
}

/*
 * Tell whether metadata may be described with an SGL on this controller.
 *
 * A controller whose ops carry NVME_F_FABRICS is always reported as capable;
 * for any other controller the answer is the NVME_CTRL_SGLS_MSDS bit of
 * ctrl->sgls.
 */
static inline bool nvme_ctrl_meta_sgl_supported(struct nvme_ctrl *ctrl)
{
	return (ctrl->ops->flags & NVME_F_FABRICS) ||
	       (ctrl->sgls & NVME_CTRL_SGLS_MSDS);
}

#ifdef CONFIG_NVME_HOST_AUTH
+131 −16
Original line number Diff line number Diff line
@@ -43,6 +43,7 @@
 */
#define NVME_MAX_KB_SZ	8192
#define NVME_MAX_SEGS	128
#define NVME_MAX_META_SEGS 15
#define NVME_MAX_NR_ALLOCATIONS	5

static int use_threaded_interrupts;
@@ -144,6 +145,7 @@ struct nvme_dev {
	struct sg_table *hmb_sgt;

	mempool_t *iod_mempool;
	mempool_t *iod_meta_mempool;

	/* shadow doorbell buffer support: */
	__le32 *dbbuf_dbs;
@@ -239,6 +241,8 @@ struct nvme_iod {
	dma_addr_t first_dma;
	dma_addr_t meta_dma;
	struct sg_table sgt;
	struct sg_table meta_sgt;
	union nvme_descriptor meta_list;
	union nvme_descriptor list[NVME_MAX_NR_ALLOCATIONS];
};

@@ -506,6 +510,15 @@ static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx)
	spin_unlock(&nvmeq->sq_lock);
}

static inline bool nvme_pci_metadata_use_sgls(struct nvme_dev *dev,
					      struct request *req)
{
	if (!nvme_ctrl_meta_sgl_supported(&dev->ctrl))
		return false;
	return req->nr_integrity_segments > 1 ||
		nvme_req(req)->flags & NVME_REQ_USERCMD;
}

static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req,
				     int nseg)
{
@@ -518,8 +531,10 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req,
		return false;
	if (!nvmeq->qid)
		return false;
	if (nvme_pci_metadata_use_sgls(dev, req))
		return true;
	if (!sgl_threshold || avg_seg_size < sgl_threshold)
		return false;
		return nvme_req(req)->flags & NVME_REQ_USERCMD;
	return true;
}

@@ -780,7 +795,8 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
		struct bio_vec bv = req_bvec(req);

		if (!is_pci_p2pdma_page(bv.bv_page)) {
			if ((bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1)) +
			if (!nvme_pci_metadata_use_sgls(dev, req) &&
			    (bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1)) +
			     bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2)
				return nvme_setup_prp_simple(dev, req,
							     &cmnd->rw, &bv);
@@ -824,11 +840,69 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
	return ret;
}

static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req,
		struct nvme_command *cmnd)
/*
 * Map the integrity (metadata) segments of @req for DMA and describe them to
 * the device with an SGL descriptor list.
 *
 * On success the scatterlist is left in iod->meta_sgt, the descriptor list in
 * iod->meta_list.sg_list, and its DMA address in both iod->meta_dma and the
 * command's metadata field.
 *
 * Returns BLK_STS_OK on success and BLK_STS_RESOURCE on any failure; every
 * resource acquired here is released again before a failure return.
 */
static blk_status_t nvme_pci_setup_meta_sgls(struct nvme_dev *dev,
					     struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_rw_command *cmnd = &iod->cmd.rw;
	struct nvme_sgl_desc *sg_list;
	struct scatterlist *sgl, *sg;
	unsigned int entries;
	dma_addr_t sgl_dma;
	int rc, i;

	/* Scatterlist backing store; GFP_ATOMIC, so NULL must be handled. */
	iod->meta_sgt.sgl = mempool_alloc(dev->iod_meta_mempool, GFP_ATOMIC);
	if (!iod->meta_sgt.sgl)
		return BLK_STS_RESOURCE;

	/*
	 * NOTE(review): assumes nr_integrity_segments never exceeds the number
	 * of scatterlist entries in a mempool element - confirm against the
	 * mempool sizing and the max_integrity_segments limit.
	 */
	sg_init_table(iod->meta_sgt.sgl, req->nr_integrity_segments);
	iod->meta_sgt.orig_nents = blk_rq_map_integrity_sg(req,
							   iod->meta_sgt.sgl);
	if (!iod->meta_sgt.orig_nents)
		goto out_free_sg;

	/* The specific error in rc is not propagated; only its presence is. */
	rc = dma_map_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req),
			     DMA_ATTR_NO_WARN);
	if (rc)
		goto out_free_sg;

	/*
	 * Memory for the SGL descriptors themselves.
	 * NOTE(review): up to entries + 1 descriptors are written below;
	 * presumably a prp_small_pool element is large enough - confirm.
	 */
	sg_list = dma_pool_alloc(dev->prp_small_pool, GFP_ATOMIC, &sgl_dma);
	if (!sg_list)
		goto out_unmap_sg;

	/* Use the post-mapping count (nents), not orig_nents. */
	entries = iod->meta_sgt.nents;
	iod->meta_list.sg_list = sg_list;
	iod->meta_dma = sgl_dma;

	/*
	 * Plain assignment: whatever was in cmnd->flags before is replaced by
	 * NVME_CMD_SGL_METASEG.
	 */
	cmnd->flags = NVME_CMD_SGL_METASEG;
	cmnd->metadata = cpu_to_le64(sgl_dma);

	sgl = iod->meta_sgt.sgl;
	if (entries == 1) {
		/* Single mapped entry: one data descriptor at sg_list[0]. */
		nvme_pci_sgl_set_data(sg_list, sgl);
		return BLK_STS_OK;
	}

	/*
	 * Several entries: sg_list[0] becomes a segment descriptor set up with
	 * the address of the descriptor right after it, and the data
	 * descriptors fill sg_list[1] .. sg_list[entries].
	 */
	sgl_dma += sizeof(*sg_list);
	nvme_pci_sgl_set_seg(sg_list, sgl_dma, entries);
	for_each_sg(sgl, sg, entries, i)
		nvme_pci_sgl_set_data(&sg_list[i + 1], sg);

	return BLK_STS_OK;

	/* Unwind in reverse order of acquisition; labels fall through. */
out_unmap_sg:
	dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0);
out_free_sg:
	mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool);
	return BLK_STS_RESOURCE;
}

static blk_status_t nvme_pci_setup_meta_mptr(struct nvme_dev *dev,
					     struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct bio_vec bv = rq_integrity_vec(req);
	struct nvme_command *cmnd = &iod->cmd;

	iod->meta_dma = dma_map_bvec(dev->dev, &bv, rq_dma_dir(req), 0);
	if (dma_mapping_error(dev->dev, iod->meta_dma))
@@ -837,6 +911,13 @@ static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req,
	return BLK_STS_OK;
}

/*
 * Map the metadata of @req, picking the SGL path when
 * nvme_pci_metadata_use_sgls() says so and the metadata-pointer path
 * otherwise. The chosen helper's status is returned unchanged.
 */
static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req)
{
	return nvme_pci_metadata_use_sgls(dev, req) ?
		nvme_pci_setup_meta_sgls(dev, req) :
		nvme_pci_setup_meta_mptr(dev, req);
}

static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -845,6 +926,7 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
	iod->aborted = false;
	iod->nr_allocations = -1;
	iod->sgt.nents = 0;
	iod->meta_sgt.nents = 0;

	ret = nvme_setup_cmd(req->q->queuedata, req);
	if (ret)
@@ -857,7 +939,7 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
	}

	if (blk_integrity_rq(req)) {
		ret = nvme_map_metadata(dev, req, &iod->cmd);
		ret = nvme_map_metadata(dev, req);
		if (ret)
			goto out_unmap_data;
	}
@@ -955,18 +1037,32 @@ static void nvme_queue_rqs(struct rq_list *rqlist)
	*rqlist = requeue_list;
}

static __always_inline void nvme_pci_unmap_rq(struct request *req)
static __always_inline void nvme_unmap_metadata(struct nvme_dev *dev,
						struct request *req)
{
	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
	struct nvme_dev *dev = nvmeq->dev;

	if (blk_integrity_rq(req)) {
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);

	if (!iod->meta_sgt.nents) {
		dma_unmap_page(dev->dev, iod->meta_dma,
			       rq_integrity_vec(req).bv_len, rq_dma_dir(req));
			       rq_integrity_vec(req).bv_len,
			       rq_dma_dir(req));
		return;
	}

	dma_pool_free(dev->prp_small_pool, iod->meta_list.sg_list,
		      iod->meta_dma);
	dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0);
	mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool);
}

/*
 * Release the DMA mappings held by @req: the metadata mapping first, when
 * blk_integrity_rq() reports one, then the data mapping, when the request
 * has physical segments.
 */
static __always_inline void nvme_pci_unmap_rq(struct request *req)
{
	struct nvme_queue *queue = req->mq_hctx->driver_data;
	struct nvme_dev *owner = queue->dev;

	if (blk_integrity_rq(req)) {
		nvme_unmap_metadata(owner, req);
	}

	if (blk_rq_nr_phys_segments(req)) {
		nvme_unmap_data(owner, req);
	}
}
@@ -2761,6 +2857,7 @@ static void nvme_release_prp_pools(struct nvme_dev *dev)

static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
{
	size_t meta_size = sizeof(struct scatterlist) * (NVME_MAX_META_SEGS + 1);
	size_t alloc_size = sizeof(struct scatterlist) * NVME_MAX_SEGS;

	dev->iod_mempool = mempool_create_node(1,
@@ -2769,7 +2866,18 @@ static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
			dev_to_node(dev->dev));
	if (!dev->iod_mempool)
		return -ENOMEM;

	dev->iod_meta_mempool = mempool_create_node(1,
			mempool_kmalloc, mempool_kfree,
			(void *)meta_size, GFP_KERNEL,
			dev_to_node(dev->dev));
	if (!dev->iod_meta_mempool)
		goto free;

	return 0;
free:
	mempool_destroy(dev->iod_mempool);
	return -ENOMEM;
}

static void nvme_free_tagset(struct nvme_dev *dev)
@@ -2834,6 +2942,11 @@ static void nvme_reset_work(struct work_struct *work)
	if (result)
		goto out;

	if (nvme_ctrl_meta_sgl_supported(&dev->ctrl))
		dev->ctrl.max_integrity_segments = NVME_MAX_META_SEGS;
	else
		dev->ctrl.max_integrity_segments = 1;

	nvme_dbbuf_dma_alloc(dev);

	result = nvme_setup_host_mem(dev);
@@ -3101,11 +3214,6 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev,
	dev->ctrl.max_hw_sectors = min_t(u32,
		NVME_MAX_KB_SZ << 1, dma_opt_mapping_size(&pdev->dev) >> 9);
	dev->ctrl.max_segments = NVME_MAX_SEGS;

	/*
	 * There is no support for SGLs for metadata (yet), so we are limited to
	 * a single integrity segment for the separate metadata pointer.
	 */
	dev->ctrl.max_integrity_segments = 1;
	return dev;

@@ -3168,6 +3276,11 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
	if (result)
		goto out_disable;

	if (nvme_ctrl_meta_sgl_supported(&dev->ctrl))
		dev->ctrl.max_integrity_segments = NVME_MAX_META_SEGS;
	else
		dev->ctrl.max_integrity_segments = 1;

	nvme_dbbuf_dma_alloc(dev);

	result = nvme_setup_host_mem(dev);
@@ -3210,6 +3323,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
	nvme_free_queues(dev, 0);
out_release_iod_mempool:
	mempool_destroy(dev->iod_mempool);
	mempool_destroy(dev->iod_meta_mempool);
out_release_prp_pools:
	nvme_release_prp_pools(dev);
out_dev_unmap:
@@ -3275,6 +3389,7 @@ static void nvme_remove(struct pci_dev *pdev)
	nvme_dbbuf_dma_free(dev);
	nvme_free_queues(dev, 0);
	mempool_destroy(dev->iod_mempool);
	mempool_destroy(dev->iod_meta_mempool);
	nvme_release_prp_pools(dev);
	nvme_dev_unmap(dev);
	nvme_uninit_ctrl(&dev->ctrl);
Loading