Commit a1fc9f58 authored by Philip Yang's avatar Philip Yang Committed by Alex Deucher
Browse files

drm/amdkfd: Handle queue destroy buffer access race



Add helper function kfd_queue_unreference_buffers to reduce queue buffer
refcount, separate it from release queue buffers.

Because it is circular locking to hold dqm_lock to take vm lock,
kfd_ioctl_destroy_queue should take vm lock, unreference queue buffers
first, but not release queue buffers, to handle error in case failed to
hold vm lock. Then hold dqm_lock to remove queue from queue list and
then release queue buffers.

Restore process worker restore queue hold dqm_lock, will always find
the queue with valid queue buffers.

v2 (Felix):
- renamed kfd_queue_unreference_buffer(s) to kfd_queue_unref_bo_va(s)
- added two FIXME comments for follow up

Signed-off-by: default avatarPhilip Yang <Philip.Yang@amd.com>
Signed-off-by: default avatarFelix Kuehling <felix.kuehling@amd.com>
Reviewed-by: default avatarFelix Kuehling <felix.kuehling@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent 70f83e77
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -400,6 +400,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
	return 0;

err_create_queue:
	kfd_queue_unref_bo_vas(pdd, &q_properties);
	kfd_queue_release_buffers(pdd, &q_properties);
err_acquire_queue_buf:
err_sdma_engine_id:
+4 −1
Original line number Diff line number Diff line
@@ -1298,9 +1298,12 @@ void print_queue_properties(struct queue_properties *q);
void print_queue(struct queue *q);
int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user *addr, struct amdgpu_bo **pbo,
			 u64 expected_size);
void kfd_queue_buffer_put(struct amdgpu_vm *vm, struct amdgpu_bo **bo);
void kfd_queue_buffer_put(struct amdgpu_bo **bo);
int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_properties *properties);
int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_properties *properties);
void kfd_queue_unref_bo_va(struct amdgpu_vm *vm, struct amdgpu_bo **bo);
int kfd_queue_unref_bo_vas(struct kfd_process_device *pdd,
			   struct queue_properties *properties);
void kfd_queue_ctx_save_restore_size(struct kfd_topology_device *dev);

struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type,
+5 −3
Original line number Diff line number Diff line
@@ -217,6 +217,7 @@ void pqm_uninit(struct process_queue_manager *pqm)
	list_for_each_entry_safe(pqn, next, &pqm->queues, process_queue_list) {
		if (pqn->q) {
			pdd = kfd_get_process_device_data(pqn->q->device, pqm->process);
			kfd_queue_unref_bo_vas(pdd, &pqn->q->properties);
			kfd_queue_release_buffers(pdd, &pqn->q->properties);
			pqm_clean_queue_resource(pqm, pqn);
		}
@@ -512,7 +513,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
	}

	if (pqn->q) {
		retval = kfd_queue_release_buffers(pdd, &pqn->q->properties);
		retval = kfd_queue_unref_bo_vas(pdd, &pqn->q->properties);
		if (retval)
			goto err_destroy_queue;

@@ -526,7 +527,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
			if (retval != -ETIME)
				goto err_destroy_queue;
		}

		kfd_queue_release_buffers(pdd, &pqn->q->properties);
		pqm_clean_queue_resource(pqm, pqn);
		uninit_queue(pqn->q);
	}
@@ -579,7 +580,8 @@ int pqm_update_queue_properties(struct process_queue_manager *pqm,
			return -EFAULT;
		}

		kfd_queue_buffer_put(vm, &pqn->q->properties.ring_bo);
		kfd_queue_unref_bo_va(vm, &pqn->q->properties.ring_bo);
		kfd_queue_buffer_put(&pqn->q->properties.ring_bo);
		amdgpu_bo_unreserve(vm->root.bo);

		pqn->q->properties.ring_bo = p->ring_bo;
+43 −23
Original line number Diff line number Diff line
@@ -224,16 +224,9 @@ int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user *addr, struct amdgpu_
	return -EINVAL;
}

void kfd_queue_buffer_put(struct amdgpu_vm *vm, struct amdgpu_bo **bo)
/* FIXME: remove this function, just call amdgpu_bo_unref directly */
void kfd_queue_buffer_put(struct amdgpu_bo **bo)
{
	if (*bo) {
		struct amdgpu_bo_va *bo_va;

		bo_va = amdgpu_vm_bo_find(vm, *bo);
		if (bo_va)
			bo_va->queue_refcount--;
	}

	amdgpu_bo_unref(bo);
}

@@ -327,6 +320,10 @@ int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_prope
out_err_unreserve:
	amdgpu_bo_unreserve(vm->root.bo);
out_err_release:
	/* FIXME: make a _locked version of this that can be called before
	 * dropping the VM reservation.
	 */
	kfd_queue_unref_bo_vas(pdd, properties);
	kfd_queue_release_buffers(pdd, properties);
	return err;
}
@@ -334,22 +331,13 @@ int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_prope
int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_properties *properties)
{
	struct kfd_topology_device *topo_dev;
	struct amdgpu_vm *vm;
	u32 total_cwsr_size;
	int err;

	vm = drm_priv_to_vm(pdd->drm_priv);
	err = amdgpu_bo_reserve(vm->root.bo, false);
	if (err)
		return err;

	kfd_queue_buffer_put(vm, &properties->wptr_bo);
	kfd_queue_buffer_put(vm, &properties->rptr_bo);
	kfd_queue_buffer_put(vm, &properties->ring_bo);
	kfd_queue_buffer_put(vm, &properties->eop_buf_bo);
	kfd_queue_buffer_put(vm, &properties->cwsr_bo);

	amdgpu_bo_unreserve(vm->root.bo);
	kfd_queue_buffer_put(&properties->wptr_bo);
	kfd_queue_buffer_put(&properties->rptr_bo);
	kfd_queue_buffer_put(&properties->ring_bo);
	kfd_queue_buffer_put(&properties->eop_buf_bo);
	kfd_queue_buffer_put(&properties->cwsr_bo);

	topo_dev = kfd_topology_device_by_id(pdd->dev->id);
	if (!topo_dev)
@@ -362,6 +350,38 @@ int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_prope
	return 0;
}

void kfd_queue_unref_bo_va(struct amdgpu_vm *vm, struct amdgpu_bo **bo)
{
	if (*bo) {
		struct amdgpu_bo_va *bo_va;

		bo_va = amdgpu_vm_bo_find(vm, *bo);
		if (bo_va && bo_va->queue_refcount)
			bo_va->queue_refcount--;
	}
}

int kfd_queue_unref_bo_vas(struct kfd_process_device *pdd,
			   struct queue_properties *properties)
{
	struct amdgpu_vm *vm;
	int err;

	vm = drm_priv_to_vm(pdd->drm_priv);
	err = amdgpu_bo_reserve(vm->root.bo, false);
	if (err)
		return err;

	kfd_queue_unref_bo_va(vm, &properties->wptr_bo);
	kfd_queue_unref_bo_va(vm, &properties->rptr_bo);
	kfd_queue_unref_bo_va(vm, &properties->ring_bo);
	kfd_queue_unref_bo_va(vm, &properties->eop_buf_bo);
	kfd_queue_unref_bo_va(vm, &properties->cwsr_bo);

	amdgpu_bo_unreserve(vm->root.bo);
	return 0;
}

#define SGPR_SIZE_PER_CU	0x4000
#define LDS_SIZE_PER_CU		0x10000
#define HWREG_SIZE_PER_CU	0x1000