Commit e06b71b2 authored by Jonathan Kim, committed by Alex Deucher
Browse files

drm/amdkfd: allow users to target recommended SDMA engines



Certain GPUs have better copy performance over xGMI on specific
SDMA engines depending on the source and destination GPU.
Allow users to create SDMA queues on these recommended engines.
Close to 2x overall performance has been observed with this
optimization.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 60c30ba7
Loading
Loading
Loading
Loading
+16 −0
Original line number Diff line number Diff line
@@ -255,6 +255,7 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties,
			args->ctx_save_restore_address;
	q_properties->ctx_save_restore_area_size = args->ctx_save_restore_size;
	q_properties->ctl_stack_size = args->ctl_stack_size;
	q_properties->sdma_engine_id = args->sdma_engine_id;
	if (args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE ||
		args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL)
		q_properties->type = KFD_QUEUE_TYPE_COMPUTE;
@@ -262,6 +263,8 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties,
		q_properties->type = KFD_QUEUE_TYPE_SDMA;
	else if (args->queue_type == KFD_IOC_QUEUE_TYPE_SDMA_XGMI)
		q_properties->type = KFD_QUEUE_TYPE_SDMA_XGMI;
	else if (args->queue_type == KFD_IOC_QUEUE_TYPE_SDMA_BY_ENG_ID)
		q_properties->type = KFD_QUEUE_TYPE_SDMA_BY_ENG_ID;
	else
		return -ENOTSUPP;

@@ -333,6 +336,18 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
		goto err_bind_process;
	}

	if (q_properties.type == KFD_QUEUE_TYPE_SDMA_BY_ENG_ID) {
		int max_sdma_eng_id = kfd_get_num_sdma_engines(dev) +
				      kfd_get_num_xgmi_sdma_engines(dev) - 1;

		if (q_properties.sdma_engine_id > max_sdma_eng_id) {
			err = -EINVAL;
			pr_err("sdma_engine_id %i exceeds maximum id of %i\n",
			       q_properties.sdma_engine_id, max_sdma_eng_id);
			goto err_sdma_engine_id;
		}
	}

	if (!pdd->qpd.proc_doorbells) {
		err = kfd_alloc_process_doorbells(dev->kfd, pdd);
		if (err) {
@@ -387,6 +402,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
err_create_queue:
	kfd_queue_release_buffers(pdd, &q_properties);
err_acquire_queue_buf:
err_sdma_engine_id:
err_bind_process:
err_pdd:
	mutex_unlock(&p->mutex);
+37 −1
Original line number Diff line number Diff line
@@ -1532,6 +1532,41 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
			q->sdma_id % kfd_get_num_xgmi_sdma_engines(dqm->dev);
		q->properties.sdma_queue_id = q->sdma_id /
			kfd_get_num_xgmi_sdma_engines(dqm->dev);
	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_BY_ENG_ID) {
		int i, num_queues, num_engines, eng_offset = 0, start_engine;
		bool free_bit_found = false, is_xgmi = false;

		if (q->properties.sdma_engine_id < kfd_get_num_sdma_engines(dqm->dev)) {
			num_queues = get_num_sdma_queues(dqm);
			num_engines = kfd_get_num_sdma_engines(dqm->dev);
			q->properties.type = KFD_QUEUE_TYPE_SDMA;
		} else {
			num_queues = get_num_xgmi_sdma_queues(dqm);
			num_engines = kfd_get_num_xgmi_sdma_engines(dqm->dev);
			eng_offset = kfd_get_num_sdma_engines(dqm->dev);
			q->properties.type = KFD_QUEUE_TYPE_SDMA_XGMI;
			is_xgmi = true;
		}

		/* Scan available bit based on target engine ID. */
		start_engine = q->properties.sdma_engine_id - eng_offset;
		for (i = start_engine; i < num_queues; i += num_engines) {

			if (!test_bit(i, is_xgmi ? dqm->xgmi_sdma_bitmap : dqm->sdma_bitmap))
				continue;

			clear_bit(i, is_xgmi ? dqm->xgmi_sdma_bitmap : dqm->sdma_bitmap);
			q->sdma_id = i;
			q->properties.sdma_queue_id = q->sdma_id / num_engines;
			free_bit_found = true;
			break;
		}

		if (!free_bit_found) {
			dev_err(dev, "No more SDMA queue to allocate for target ID %i\n",
				q->properties.sdma_engine_id);
			return -ENOMEM;
		}
	}

	pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id);
@@ -1784,7 +1819,8 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
	}

	if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
		q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
		q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI ||
		q->properties.type == KFD_QUEUE_TYPE_SDMA_BY_ENG_ID) {
		dqm_lock(dqm);
		retval = allocate_sdma_queue(dqm, q, qd ? &qd->sdma_id : NULL);
		dqm_unlock(dqm);
+4 −1
Original line number Diff line number Diff line
@@ -414,13 +414,16 @@ enum kfd_unmap_queues_filter {
 * @KFD_QUEUE_TYPE_DIQ: DIQ queue type.
 *
 * @KFD_QUEUE_TYPE_SDMA_XGMI: Special SDMA queue for XGMI interface.
 *
 * @KFD_QUEUE_TYPE_SDMA_BY_ENG_ID:  SDMA user mode queue with target SDMA engine ID.
 */
enum kfd_queue_type {
	KFD_QUEUE_TYPE_COMPUTE,
	KFD_QUEUE_TYPE_SDMA,
	KFD_QUEUE_TYPE_HIQ,
	KFD_QUEUE_TYPE_DIQ,
	/* Special SDMA queue for XGMI interface. */
	KFD_QUEUE_TYPE_SDMA_XGMI,
	/*
	 * SDMA user mode queue with target SDMA engine ID.  Listed last so
	 * the implicit values of the enumerators above are unchanged.
	 */
	KFD_QUEUE_TYPE_SDMA_BY_ENG_ID
};

enum kfd_queue_format {
+1 −0
Original line number Diff line number Diff line
@@ -366,6 +366,7 @@ int pqm_create_queue(struct process_queue_manager *pqm,
	switch (type) {
	case KFD_QUEUE_TYPE_SDMA:
	case KFD_QUEUE_TYPE_SDMA_XGMI:
	case KFD_QUEUE_TYPE_SDMA_BY_ENG_ID:
		/* SDMA queues are always allocated statically no matter
		 * which scheduler mode is used. We also do not need to
		 * check whether a SDMA queue can be allocated here, because
+52 −0
Original line number Diff line number Diff line
@@ -292,6 +292,8 @@ static ssize_t iolink_show(struct kobject *kobj, struct attribute *attr,
			      iolink->max_bandwidth);
	sysfs_show_32bit_prop(buffer, offs, "recommended_transfer_size",
			      iolink->rec_transfer_size);
	sysfs_show_32bit_prop(buffer, offs, "recommended_sdma_engine_id_mask",
			      iolink->rec_sdma_eng_id_mask);
	sysfs_show_32bit_prop(buffer, offs, "flags", iolink->flags);

	return offs;
@@ -1265,6 +1267,55 @@ static void kfd_set_iolink_non_coherent(struct kfd_topology_device *to_dev,
	}
}

/*
 * Recommended SDMA engine per (source, destination) socket pair.
 *
 * The table is looked up as rec_sdma_eng_map[src_socket_id][dst_socket_id]
 * and the entry is used as a bit position when building an iolink's
 * rec_sdma_eng_id_mask.  Diagonal entries (source == destination) are -1,
 * i.e. there is no recommended engine for a socket talking to itself.
 */
#define REC_SDMA_NUM_GPU	8
static const int rec_sdma_eng_map[REC_SDMA_NUM_GPU][REC_SDMA_NUM_GPU] = {
	[0] = { -1, 14, 12,  2,  4,  8, 10,  6 },
	[1] = { 14, -1,  2, 10,  8,  4,  6, 12 },
	[2] = { 10,  2, -1, 12, 14,  6,  4,  8 },
	[3] = {  2, 12, 10, -1,  6, 14,  8,  4 },
	[4] = {  4,  8, 14,  6, -1, 10, 12,  2 },
	[5] = {  8,  4,  6, 14, 12, -1,  2, 10 },
	[6] = { 10,  6,  4,  8, 12,  2, -1, 14 },
	[7] = {  6, 12,  8,  4,  2, 10, 14, -1 },
};

static void kfd_set_recommended_sdma_engines(struct kfd_topology_device *to_dev,
					     struct kfd_iolink_properties *outbound_link,
					     struct kfd_iolink_properties *inbound_link)
{
	struct kfd_node *gpu = outbound_link->gpu;
	struct amdgpu_device *adev = gpu->adev;
	int num_xgmi_nodes = adev->gmc.xgmi.num_physical_nodes;
	bool support_rec_eng = !amdgpu_sriov_vf(adev) && to_dev->gpu &&
		adev->aid_mask && num_xgmi_nodes &&
		(amdgpu_xcp_query_partition_mode(adev->xcp_mgr, AMDGPU_XCP_FL_NONE) ==
		      AMDGPU_SPX_PARTITION_MODE) &&
		(!(adev->flags & AMD_IS_APU) && num_xgmi_nodes == 8);

	if (support_rec_eng) {
		int src_socket_id = adev->gmc.xgmi.physical_node_id;
		int dst_socket_id = to_dev->gpu->adev->gmc.xgmi.physical_node_id;

		outbound_link->rec_sdma_eng_id_mask =
			1 << rec_sdma_eng_map[src_socket_id][dst_socket_id];
		inbound_link->rec_sdma_eng_id_mask =
			1 << rec_sdma_eng_map[dst_socket_id][src_socket_id];
	} else {
		int num_sdma_eng = kfd_get_num_sdma_engines(gpu);
		int i, eng_offset = 0;

		if (outbound_link->iolink_type == CRAT_IOLINK_TYPE_XGMI &&
		    kfd_get_num_xgmi_sdma_engines(gpu) && to_dev->gpu) {
			eng_offset = num_sdma_eng;
			num_sdma_eng = kfd_get_num_xgmi_sdma_engines(gpu);
		}

		for (i = 0; i < num_sdma_eng; i++) {
			outbound_link->rec_sdma_eng_id_mask |= (1 << (i + eng_offset));
			inbound_link->rec_sdma_eng_id_mask |= (1 << (i + eng_offset));
		}
	}
}

static void kfd_fill_iolink_non_crat_info(struct kfd_topology_device *dev)
{
	struct kfd_iolink_properties *link, *inbound_link;
@@ -1303,6 +1354,7 @@ static void kfd_fill_iolink_non_crat_info(struct kfd_topology_device *dev)
			inbound_link->flags = CRAT_IOLINK_FLAGS_ENABLED;
			kfd_set_iolink_no_atomics(peer_dev, dev, inbound_link);
			kfd_set_iolink_non_coherent(peer_dev, link, inbound_link);
			kfd_set_recommended_sdma_engines(peer_dev, link, inbound_link);
		}
	}

Loading