Commit f18719ef authored by Jesse.Zhang's avatar Jesse.Zhang Committed by Alex Deucher
Browse files

drm/amdgpu: Convert amdgpu userqueue management from IDR to XArray



This commit refactors the AMDGPU userqueue management subsystem to replace
IDR (ID Allocation) with XArray for improved performance, scalability, and
maintainability. The changes address several issues with the previous IDR
implementation and provide better locking semantics.

Key changes:

1. **Global XArray Introduction**:
   - Added `userq_doorbell_xa` to `struct amdgpu_device` for global queue tracking
   - Uses doorbell_index as key for efficient global lookup
   - Replaces the previous `userq_mgr_list` linked list approach

2. **Per-process XArray Conversion**:
   - Replaced `userq_idr` with `userq_mgr_xa` in `struct amdgpu_userq_mgr`
   - Maintains per-process queue tracking with queue_id as key
   - Uses XA_FLAGS_ALLOC for automatic ID allocation

3. **Locking Improvements**:
   - Removed global `userq_mutex` from `struct amdgpu_device`
   - Replaced with fine-grained XArray locking using XArray's internal spinlocks

4. **Runtime Idle Check Optimization**:
   - Updated `amdgpu_runtime_idle_check_userq()` to use xa_empty

5. **Queue Management Functions**:
   - Converted all IDR operations to equivalent XArray functions:
     - `idr_alloc()` → `xa_alloc()`
     - `idr_find()` → `xa_load()`
     - `idr_remove()` → `xa_erase()`
     - `idr_for_each()` → `xa_for_each()`

Benefits:
- **Performance**: XArray provides better scalability for large numbers of queues
- **Memory Efficiency**: Reduced memory overhead compared to IDR
- **Thread Safety**: Improved locking semantics with XArray's internal spinlocks

v2: rename userq_global_xa/userq_xa to userq_doorbell_xa/userq_mgr_xa
    Remove xa_lock and use its own lock.

v3: Set queue->userq_mgr = uq_mgr in amdgpu_userq_create()
v4: use xa_store_irq (Christian)
    hold the read side of the reset lock while creating/destroying queues and the manager data structure. (Chritian)

Acked-by: default avatarAlex Deucher <alexander.deucher@amd.com>
Suggested-by: default avatarChristian König <christian.koenig@amd.com>
Signed-off-by: default avatarJesse Zhang <Jesse.Zhang@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent a981cb0f
Loading
Loading
Loading
Loading
+6 −2
Original line number Diff line number Diff line
@@ -1176,6 +1176,12 @@ struct amdgpu_device {
	 * queue fence.
	 */
	struct xarray			userq_xa;
	/**
	 * @userq_doorbell_xa: Global user queue map (doorbell index → queue)
	 * Key: doorbell_index (unique global identifier for the queue)
	 * Value: struct amdgpu_usermode_queue
	 */
	struct xarray userq_doorbell_xa;

	/* df */
	struct amdgpu_df                df;
@@ -1309,8 +1315,6 @@ struct amdgpu_device {
	 */
	bool                            apu_prefer_gtt;

	struct list_head		userq_mgr_list;
	struct mutex                    userq_mutex;
	bool                            userq_halt_for_enforce_isolation;
	struct amdgpu_uid *uid_info;

+1 −2
Original line number Diff line number Diff line
@@ -4558,7 +4558,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
	mutex_init(&adev->gfx.userq_sch_mutex);
	mutex_init(&adev->gfx.workload_profile_mutex);
	mutex_init(&adev->vcn.workload_profile_mutex);
	mutex_init(&adev->userq_mutex);

	amdgpu_device_init_apu_flags(adev);

@@ -4586,7 +4585,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,

	INIT_LIST_HEAD(&adev->pm.od_kobj_list);

	INIT_LIST_HEAD(&adev->userq_mgr_list);
	xa_init(&adev->userq_doorbell_xa);

	INIT_DELAYED_WORK(&adev->delayed_init_work,
			  amdgpu_device_delayed_init_work_handler);
+1 −15
Original line number Diff line number Diff line
@@ -2772,22 +2772,8 @@ static int amdgpu_runtime_idle_check_userq(struct device *dev)
	struct pci_dev *pdev = to_pci_dev(dev);
	struct drm_device *drm_dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(drm_dev);
	struct amdgpu_usermode_queue *queue;
	struct amdgpu_userq_mgr *uqm, *tmp;
	int queue_id;
	int ret = 0;

	mutex_lock(&adev->userq_mutex);
	list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
		idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
			ret = -EBUSY;
			goto done;
		}
	}
done:
	mutex_unlock(&adev->userq_mutex);

	return ret;
	return xa_empty(&adev->userq_doorbell_xa) ? 0 : -EBUSY;
}

static int amdgpu_pmops_runtime_suspend(struct device *dev)
+75 −76
Original line number Diff line number Diff line
@@ -30,6 +30,7 @@
#include "amdgpu_vm.h"
#include "amdgpu_userq.h"
#include "amdgpu_hmm.h"
#include "amdgpu_reset.h"
#include "amdgpu_userq_fence.h"

u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev)
@@ -278,19 +279,27 @@ amdgpu_userq_cleanup(struct amdgpu_userq_mgr *uq_mgr,
	struct amdgpu_device *adev = uq_mgr->adev;
	const struct amdgpu_userq_funcs *uq_funcs = adev->userq_funcs[queue->queue_type];

	/* Wait for mode-1 reset to complete */
	down_read(&adev->reset_domain->sem);

	/* Drop the userq reference. */
	amdgpu_userq_buffer_vas_list_cleanup(adev, queue);
	uq_funcs->mqd_destroy(uq_mgr, queue);
	amdgpu_userq_fence_driver_free(queue);
	idr_remove(&uq_mgr->userq_idr, queue_id);
	/* Use interrupt-safe locking since IRQ handlers may access these XArrays */
	xa_erase_irq(&uq_mgr->userq_mgr_xa, (unsigned long)queue_id);
	xa_erase_irq(&adev->userq_doorbell_xa, queue->doorbell_index);
	queue->userq_mgr = NULL;
	list_del(&queue->userq_va_list);
	kfree(queue);

	up_read(&adev->reset_domain->sem);
}

static struct amdgpu_usermode_queue *
amdgpu_userq_find(struct amdgpu_userq_mgr *uq_mgr, int qid)
{
	return idr_find(&uq_mgr->userq_idr, qid);
	return xa_load(&uq_mgr->userq_mgr_xa, qid);
}

void
@@ -551,8 +560,9 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
	struct amdgpu_db_info db_info;
	char *queue_name;
	bool skip_map_queue;
	u32 qid;
	uint64_t index;
	int qid, r = 0;
	int r = 0;
	int priority =
		(args->in.flags & AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_MASK) >>
		AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_SHIFT;
@@ -575,7 +585,6 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
	 *
	 * This will also make sure we have a valid eviction fence ready to be used.
	 */
	mutex_lock(&adev->userq_mutex);
	amdgpu_userq_ensure_ev_fence(&fpriv->userq_mgr, &fpriv->evf_mgr);

	uq_funcs = adev->userq_funcs[args->in.ip_type];
@@ -638,15 +647,27 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
		goto unlock;
	}

	qid = idr_alloc(&uq_mgr->userq_idr, queue, 1, AMDGPU_MAX_USERQ_COUNT, GFP_KERNEL);
	if (qid < 0) {
	/* Wait for mode-1 reset to complete */
	down_read(&adev->reset_domain->sem);
	r = xa_err(xa_store_irq(&adev->userq_doorbell_xa, index, queue, GFP_KERNEL));
	if (r) {
		kfree(queue);
		up_read(&adev->reset_domain->sem);
		goto unlock;
	}

	r = xa_alloc(&uq_mgr->userq_mgr_xa, &qid, queue, XA_LIMIT(1, AMDGPU_MAX_USERQ_COUNT), GFP_KERNEL);
	if (r) {
		drm_file_err(uq_mgr->file, "Failed to allocate a queue id\n");
		amdgpu_userq_fence_driver_free(queue);
		uq_funcs->mqd_destroy(uq_mgr, queue);
		kfree(queue);
		r = -ENOMEM;
		up_read(&adev->reset_domain->sem);
		goto unlock;
	}
	up_read(&adev->reset_domain->sem);
	queue->userq_mgr = uq_mgr;

	/* don't map the queue if scheduling is halted */
	if (adev->userq_halt_for_enforce_isolation &&
@@ -659,7 +680,7 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
		r = amdgpu_userq_map_helper(uq_mgr, queue);
		if (r) {
			drm_file_err(uq_mgr->file, "Failed to map Queue\n");
			idr_remove(&uq_mgr->userq_idr, qid);
			xa_erase(&uq_mgr->userq_mgr_xa, qid);
			amdgpu_userq_fence_driver_free(queue);
			uq_funcs->mqd_destroy(uq_mgr, queue);
			kfree(queue);
@@ -684,7 +705,6 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)

unlock:
	mutex_unlock(&uq_mgr->userq_mutex);
	mutex_unlock(&adev->userq_mutex);

	return r;
}
@@ -782,11 +802,11 @@ static int
amdgpu_userq_restore_all(struct amdgpu_userq_mgr *uq_mgr)
{
	struct amdgpu_usermode_queue *queue;
	int queue_id;
	unsigned long queue_id;
	int ret = 0, r;

	/* Resume all the queues for this process */
	idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
	xa_for_each(&uq_mgr->userq_mgr_xa, queue_id, queue) {

		if (!amdgpu_userq_buffer_vas_mapped(queue)) {
			drm_file_err(uq_mgr->file,
@@ -1023,11 +1043,11 @@ static int
amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
{
	struct amdgpu_usermode_queue *queue;
	int queue_id;
	unsigned long queue_id;
	int ret = 0, r;

	/* Try to unmap all the queues in this process ctx */
	idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
	xa_for_each(&uq_mgr->userq_mgr_xa, queue_id, queue) {
		r = amdgpu_userq_preempt_helper(uq_mgr, queue);
		if (r)
			ret = r;
@@ -1042,9 +1062,10 @@ static int
amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
{
	struct amdgpu_usermode_queue *queue;
	int queue_id, ret;
	unsigned long queue_id;
	int ret;

	idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
	xa_for_each(&uq_mgr->userq_mgr_xa, queue_id, queue) {
		struct dma_fence *f = queue->last_fence;

		if (!f || dma_fence_is_signaled(f))
@@ -1097,44 +1118,30 @@ int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct drm_file *f
			  struct amdgpu_device *adev)
{
	mutex_init(&userq_mgr->userq_mutex);
	idr_init_base(&userq_mgr->userq_idr, 1);
	xa_init_flags(&userq_mgr->userq_mgr_xa, XA_FLAGS_ALLOC);
	userq_mgr->adev = adev;
	userq_mgr->file = file_priv;

	mutex_lock(&adev->userq_mutex);
	list_add(&userq_mgr->list, &adev->userq_mgr_list);
	mutex_unlock(&adev->userq_mutex);

	INIT_DELAYED_WORK(&userq_mgr->resume_work, amdgpu_userq_restore_worker);
	return 0;
}

void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr)
{
	struct amdgpu_device *adev = userq_mgr->adev;
	struct amdgpu_usermode_queue *queue;
	struct amdgpu_userq_mgr *uqm, *tmp;
	uint32_t queue_id;
	unsigned long queue_id;

	cancel_delayed_work_sync(&userq_mgr->resume_work);

	mutex_lock(&adev->userq_mutex);
	mutex_lock(&userq_mgr->userq_mutex);
	idr_for_each_entry(&userq_mgr->userq_idr, queue, queue_id) {
	xa_for_each(&userq_mgr->userq_mgr_xa, queue_id, queue) {
		amdgpu_userq_wait_for_last_fence(userq_mgr, queue);
		amdgpu_userq_unmap_helper(userq_mgr, queue);
		amdgpu_userq_cleanup(userq_mgr, queue, queue_id);
	}

	list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
		if (uqm == userq_mgr) {
			list_del(&uqm->list);
			break;
		}
	}
	idr_destroy(&userq_mgr->userq_idr);
	xa_destroy(&userq_mgr->userq_mgr_xa);
	mutex_unlock(&userq_mgr->userq_mutex);
	mutex_unlock(&adev->userq_mutex);
	mutex_destroy(&userq_mgr->userq_mutex);
}

@@ -1142,18 +1149,17 @@ int amdgpu_userq_suspend(struct amdgpu_device *adev)
{
	u32 ip_mask = amdgpu_userq_get_supported_ip_mask(adev);
	struct amdgpu_usermode_queue *queue;
	struct amdgpu_userq_mgr *uqm, *tmp;
	int queue_id;
	struct amdgpu_userq_mgr *uqm;
	unsigned long queue_id;
	int r;

	if (!ip_mask)
		return 0;

	guard(mutex)(&adev->userq_mutex);
	list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
	xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
		uqm = queue->userq_mgr;
		cancel_delayed_work_sync(&uqm->resume_work);
		guard(mutex)(&uqm->userq_mutex);
		idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
		if (adev->in_s0ix)
			r = amdgpu_userq_preempt_helper(uqm, queue);
		else
@@ -1161,7 +1167,6 @@ int amdgpu_userq_suspend(struct amdgpu_device *adev)
		if (r)
			return r;
	}
	}
	return 0;
}

@@ -1169,17 +1174,16 @@ int amdgpu_userq_resume(struct amdgpu_device *adev)
{
	u32 ip_mask = amdgpu_userq_get_supported_ip_mask(adev);
	struct amdgpu_usermode_queue *queue;
	struct amdgpu_userq_mgr *uqm, *tmp;
	int queue_id;
	struct amdgpu_userq_mgr *uqm;
	unsigned long queue_id;
	int r;

	if (!ip_mask)
		return 0;

	guard(mutex)(&adev->userq_mutex);
	list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
	xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
		uqm = queue->userq_mgr;
		guard(mutex)(&uqm->userq_mutex);
		idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
		if (adev->in_s0ix)
			r = amdgpu_userq_restore_helper(uqm, queue);
		else
@@ -1187,7 +1191,6 @@ int amdgpu_userq_resume(struct amdgpu_device *adev)
		if (r)
			return r;
	}
	}

	return 0;
}
@@ -1197,22 +1200,21 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
{
	u32 ip_mask = amdgpu_userq_get_supported_ip_mask(adev);
	struct amdgpu_usermode_queue *queue;
	struct amdgpu_userq_mgr *uqm, *tmp;
	int queue_id;
	struct amdgpu_userq_mgr *uqm;
	unsigned long queue_id;
	int ret = 0, r;

	/* only need to stop gfx/compute */
	if (!(ip_mask & ((1 << AMDGPU_HW_IP_GFX) | (1 << AMDGPU_HW_IP_COMPUTE))))
		return 0;

	mutex_lock(&adev->userq_mutex);
	if (adev->userq_halt_for_enforce_isolation)
		dev_warn(adev->dev, "userq scheduling already stopped!\n");
	adev->userq_halt_for_enforce_isolation = true;
	list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
	xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
		uqm = queue->userq_mgr;
		cancel_delayed_work_sync(&uqm->resume_work);
		mutex_lock(&uqm->userq_mutex);
		idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
		if (((queue->queue_type == AMDGPU_HW_IP_GFX) ||
		     (queue->queue_type == AMDGPU_HW_IP_COMPUTE)) &&
		    (queue->xcp_id == idx)) {
@@ -1220,10 +1222,9 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
			if (r)
				ret = r;
		}
		}
		mutex_unlock(&uqm->userq_mutex);
	}
	mutex_unlock(&adev->userq_mutex);

	return ret;
}

@@ -1232,21 +1233,20 @@ int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
{
	u32 ip_mask = amdgpu_userq_get_supported_ip_mask(adev);
	struct amdgpu_usermode_queue *queue;
	struct amdgpu_userq_mgr *uqm, *tmp;
	int queue_id;
	struct amdgpu_userq_mgr *uqm;
	unsigned long queue_id;
	int ret = 0, r;

	/* only need to stop gfx/compute */
	if (!(ip_mask & ((1 << AMDGPU_HW_IP_GFX) | (1 << AMDGPU_HW_IP_COMPUTE))))
		return 0;

	mutex_lock(&adev->userq_mutex);
	if (!adev->userq_halt_for_enforce_isolation)
		dev_warn(adev->dev, "userq scheduling already started!\n");
	adev->userq_halt_for_enforce_isolation = false;
	list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
	xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
		uqm = queue->userq_mgr;
		mutex_lock(&uqm->userq_mutex);
		idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
			if (((queue->queue_type == AMDGPU_HW_IP_GFX) ||
			     (queue->queue_type == AMDGPU_HW_IP_COMPUTE)) &&
			    (queue->xcp_id == idx)) {
@@ -1254,10 +1254,9 @@ int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
				if (r)
					ret = r;
			}
		}
		mutex_unlock(&uqm->userq_mutex);
	}
	mutex_unlock(&adev->userq_mutex);

	return ret;
}

+6 −2
Original line number Diff line number Diff line
@@ -96,11 +96,15 @@ struct amdgpu_userq_funcs {

/* Usermode queues for gfx */
struct amdgpu_userq_mgr {
	struct idr			userq_idr;
	/**
	 * @userq_mgr_xa: Per-process user queue map (queue ID → queue)
	 * Key: queue_id (unique ID within the process's userq manager)
	 * Value: struct amdgpu_usermode_queue
	 */
	struct xarray			userq_mgr_xa;
	struct mutex			userq_mutex;
	struct amdgpu_device		*adev;
	struct delayed_work		resume_work;
	struct list_head		list;
	struct drm_file			*file;
};

Loading