Commit bbb49005, authored by Mike Christie and committed by Martin K. Petersen
Browse files

scsi: target: Move LUN stats to per-CPU



The use of atomics in the main I/O path causes performance issues when using
higher-performance backend devices and multiple queues (more than
10 when using vhost-scsi), as with this fio workload:

[global]
bs=4K
iodepth=128
direct=1
ioengine=libaio
group_reporting
time_based
runtime=120
name=standard-iops
rw=randread
numjobs=16
cpus_allowed=0-15

To fix this issue, move the LUN stats to per CPU.

Note: I forgot to include this patch with the delayed/ordered per-CPU
tracking and per-device/device-entry per-CPU stats. With this patch you
get the full 33% improvement when using fast backends, multiple queues
and multiple I/O submitters.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
Reviewed-by: Dmitry Bogdanov <d.bogdanov@yadro.com>
Link: https://patch.msgid.link/20250917221338.14813-4-michael.christie@oracle.com


Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
parent ed6b97a7
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -814,6 +814,7 @@ struct se_device *target_alloc_device(struct se_hba *hba, const char *name)
	dev->dev_attrib.max_write_same_len = DA_MAX_WRITE_SAME_LEN;
	dev->dev_attrib.submit_type = TARGET_FABRIC_DEFAULT_SUBMIT;

	/* Skip allocating lun_stats since we can't export them. */
	xcopy_lun = &dev->xcopy_lun;
	rcu_assign_pointer(xcopy_lun->lun_se_dev, dev);
	init_completion(&xcopy_lun->lun_shutdown_comp);
+1 −1
Original line number Diff line number Diff line
@@ -697,7 +697,7 @@ static void target_fabric_port_release(struct config_item *item)
	struct se_lun *lun = container_of(to_config_group(item),
					  struct se_lun, lun_group);

	kfree_rcu(lun, rcu_head);
	call_rcu(&lun->rcu_head, target_tpg_free_lun);
}

static struct configfs_item_operations target_fabric_port_item_ops = {
+1 −0
Original line number Diff line number Diff line
@@ -125,6 +125,7 @@ void core_tpg_add_node_to_devs(struct se_node_acl *, struct se_portal_group *,
				  struct se_lun *);
void	core_tpg_wait_for_nacl_pr_ref(struct se_node_acl *);
struct se_lun *core_tpg_alloc_lun(struct se_portal_group *, u64);
void	target_tpg_free_lun(struct rcu_head *head);
int	core_tpg_add_lun(struct se_portal_group *, struct se_lun *,
		bool, struct se_device *);
void core_tpg_remove_lun(struct se_portal_group *, struct se_lun *);
+22 −45
Original line number Diff line number Diff line
@@ -606,53 +606,30 @@ static ssize_t target_stat_tgt_port_port_index_show(struct config_item *item,
	return ret;
}

/*
 * configfs show handler: print the LUN's cmd_pdus counter as an unsigned
 * decimal followed by a newline.
 *
 * Returns the snprintf() result, or -ENODEV when the LUN has no
 * lun_se_dev attached. The pointer check and the counter read both
 * happen inside the RCU read-side critical section.
 */
static ssize_t target_stat_tgt_port_in_cmds_show(struct config_item *item,
		char *page)
{
	struct se_lun *lun = to_stat_tgt_port(item);
	struct se_device *dev;
	ssize_t len;

	rcu_read_lock();
	dev = rcu_dereference(lun->lun_se_dev);
	if (!dev) {
		rcu_read_unlock();
		return -ENODEV;
	}

	len = snprintf(page, PAGE_SIZE, "%lu\n",
		       atomic_long_read(&lun->lun_stats.cmd_pdus));
	rcu_read_unlock();
	return len;
}

static ssize_t target_stat_tgt_port_write_mbytes_show(struct config_item *item,
		char *page)
{
	struct se_lun *lun = to_stat_tgt_port(item);
	struct se_device *dev;
	ssize_t ret = -ENODEV;

	rcu_read_lock();
	dev = rcu_dereference(lun->lun_se_dev);
	if (dev)
		ret = snprintf(page, PAGE_SIZE, "%u\n",
			(u32)(atomic_long_read(&lun->lun_stats.rx_data_octets) >> 20));
	rcu_read_unlock();
	return ret;
/*
 * tgt_port_show_per_cpu_stat(prefix, field, shift)
 *
 * Expands to two things:
 *   1. an instantiation of per_cpu_stat_snprintf(scsi_port_stats, prefix,
 *      field, shift), which is defined elsewhere. It presumably emits the
 *      per_cpu_stat_<prefix>_snprintf() helper that folds the per-CPU
 *      'field' values together and applies 'shift' - TODO confirm against
 *      its definition;
 *   2. a configfs show handler named target_stat_<prefix>_show().
 *
 * The generated handler looks up the se_lun for the config item and, under
 * rcu_read_lock(), returns -ENODEV if lun->lun_se_dev is NULL. Otherwise it
 * returns whatever per_cpu_stat_<prefix>_snprintf(lun->lun_stats, page)
 * returns. The RCU read lock is dropped on both paths before returning.
 */
#define tgt_port_show_per_cpu_stat(prefix, field, shift)		\
per_cpu_stat_snprintf(scsi_port_stats, prefix, field, shift);		\
static ssize_t								\
target_stat_##prefix##_show(struct config_item *item, char *page)	\
{									\
	struct se_lun *lun = to_stat_tgt_port(item);			\
	struct se_device *dev;						\
	int ret;							\
									\
	rcu_read_lock();						\
	dev = rcu_dereference(lun->lun_se_dev);				\
	if (!dev) {							\
		rcu_read_unlock();					\
		return -ENODEV;						\
	}								\
									\
	ret = per_cpu_stat_##prefix##_snprintf(lun->lun_stats, page);	\
	rcu_read_unlock();						\
	return ret;							\
}

/*
 * configfs show handler: print the LUN's tx_data_octets counter, shifted
 * right by 20 bits (units of 2^20 octets) and truncated to u32, as an
 * unsigned decimal followed by a newline.
 *
 * Returns the snprintf() result, or -ENODEV when the LUN has no
 * lun_se_dev attached. The pointer check and the counter read both
 * happen inside the RCU read-side critical section.
 */
static ssize_t target_stat_tgt_port_read_mbytes_show(struct config_item *item,
		char *page)
{
	struct se_lun *lun = to_stat_tgt_port(item);
	struct se_device *dev;
	ssize_t len;

	rcu_read_lock();
	dev = rcu_dereference(lun->lun_se_dev);
	if (!dev) {
		rcu_read_unlock();
		return -ENODEV;
	}

	len = snprintf(page, PAGE_SIZE, "%u\n",
		       (u32)(atomic_long_read(&lun->lun_stats.tx_data_octets) >> 20));
	rcu_read_unlock();
	return len;
}
tgt_port_show_per_cpu_stat(tgt_port_in_cmds, cmd_pdus, 0);
tgt_port_show_per_cpu_stat(tgt_port_write_mbytes, rx_data_octets, 20);
tgt_port_show_per_cpu_stat(tgt_port_read_mbytes, tx_data_octets, 20);

static ssize_t target_stat_tgt_port_hs_in_cmds_show(struct config_item *item,
		char *page)
+21 −2
Original line number Diff line number Diff line
@@ -548,7 +548,7 @@ int core_tpg_register(
		ret = core_tpg_add_lun(se_tpg, se_tpg->tpg_virt_lun0,
				true, g_lun0_dev);
		if (ret < 0) {
			kfree(se_tpg->tpg_virt_lun0);
			target_tpg_free_lun(&se_tpg->tpg_virt_lun0->rcu_head);
			return ret;
		}
	}
@@ -595,7 +595,7 @@ int core_tpg_deregister(struct se_portal_group *se_tpg)

	if (se_tpg->proto_id >= 0) {
		core_tpg_remove_lun(se_tpg, se_tpg->tpg_virt_lun0);
		kfree_rcu(se_tpg->tpg_virt_lun0, rcu_head);
		call_rcu(&se_tpg->tpg_virt_lun0->rcu_head, target_tpg_free_lun);
	}

	target_tpg_deregister_rtpi(se_tpg);
@@ -615,6 +615,13 @@ struct se_lun *core_tpg_alloc_lun(
		pr_err("Unable to allocate se_lun memory\n");
		return ERR_PTR(-ENOMEM);
	}

	lun->lun_stats = alloc_percpu(struct scsi_port_stats);
	if (!lun->lun_stats) {
		pr_err("Unable to allocate se_lun stats memory\n");
		goto free_lun;
	}

	lun->unpacked_lun = unpacked_lun;
	atomic_set(&lun->lun_acl_count, 0);
	init_completion(&lun->lun_shutdown_comp);
@@ -628,6 +635,18 @@ struct se_lun *core_tpg_alloc_lun(
	lun->lun_tpg = tpg;

	return lun;

free_lun:
	kfree(lun);
	return ERR_PTR(-ENOMEM);
}

/*
 * Release a se_lun together with its per-CPU statistics.
 *
 * @head: the rcu_head embedded in the se_lun to free.
 *
 * The signature matches an RCU callback so it can be handed to call_rcu();
 * it may also be called directly with &lun->rcu_head. The per-CPU
 * lun_stats area is released before the se_lun that points to it.
 */
void target_tpg_free_lun(struct rcu_head *head)
{
	struct se_lun *lun;

	lun = container_of(head, struct se_lun, rcu_head);

	free_percpu(lun->lun_stats);
	kfree(lun);
}

int core_tpg_add_lun(
Loading