Commit e77a541f authored by Graham Sider, committed by Alex Deucher
Browse files

drm/amdkfd: Enable GFX11 usermode queue oversubscription



Starting with GFX11, MES requires wptr BOs to be GTT allocated/mapped to
GART for usermode queues in order to support oversubscription. In the
case that work is submitted to an unmapped queue, MES must have a GART
wptr address to determine whether the queue should be mapped.

This change is accompanied by changes in MES and is applicable for
MES_API_VERSION >= 2.

v3:
- Use amdgpu_vm_bo_lookup_mapping for wptr_bo mapping lookup
- Move wptr_bo refcount increment to amdgpu_amdkfd_map_gtt_bo_to_gart
- Remove list_del_init from amdgpu_amdkfd_map_gtt_bo_to_gart
- Cleanup/fix create_queue wptr_bo error handling
v4:
- Add MES version shift/mask defines to amdgpu_mes.h
- Change version check from MES_VERSION to MES_API_VERSION
- Add check in kfd_ioctl_create_queue before wptr bo pin/GART map to
ensure bo is a single page.

Signed-off-by: Graham Sider <Graham.Sider@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Philip Yang <Philip.Yang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent ff83e6e7
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -286,6 +286,8 @@ int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_mem *mem,
					     void **kptr, uint64_t *size);
void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct kgd_mem *mem);

int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_device *adev, struct amdgpu_bo *bo);

int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info,
					    struct dma_fence **ef);
int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct amdgpu_device *adev,
+48 −0
Original line number Diff line number Diff line
@@ -2113,6 +2113,54 @@ int amdgpu_amdkfd_gpuvm_sync_memory(
	return ret;
}

/**
 * amdgpu_amdkfd_map_gtt_bo_to_gart - Pin a BO in GTT, bind it to GART and take a reference
 * @adev: Device to which the BO belongs (not referenced by this function's body)
 * @bo: Buffer object to pin and map
 *
 * While holding the BO reservation, this pins the BO to the GTT domain,
 * allocates its GART binding and removes the eviction fence reached through
 * bo->kfd_bo->process_info from the BO. On success the BO reference count has
 * been incremented. To release the reference and unpin/unmap the BO, call
 * amdgpu_amdkfd_free_gtt_mem.
 *
 * Return: 0 on success, otherwise the non-zero value returned by the step
 * that failed. On failure no reference is taken, and any pin or reservation
 * acquired here has been released again.
 */
int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_device *adev, struct amdgpu_bo *bo)
{
	int r;

	r = amdgpu_bo_reserve(bo, true);
	if (r) {
		/* Nothing has been acquired yet, so there is nothing to unwind. */
		pr_err("Failed to reserve bo. ret %d\n", r);
		return r;
	}

	r = amdgpu_bo_pin(bo, AMDGPU_GEM_DOMAIN_GTT);
	if (r) {
		pr_err("Failed to pin bo. ret %d\n", r);
		goto out_unreserve;
	}

	r = amdgpu_ttm_alloc_gart(&bo->tbo);
	if (r) {
		pr_err("Failed to bind bo to GART. ret %d\n", r);
		goto out_unpin;
	}

	amdgpu_amdkfd_remove_eviction_fence(bo,
			bo->kfd_bo->process_info->eviction_fence);

	amdgpu_bo_unreserve(bo);

	/*
	 * Take the reference promised to the caller. The returned pointer is
	 * the same BO, so it does not need to be stored.
	 */
	amdgpu_bo_ref(bo);

	return 0;

out_unpin:
	amdgpu_bo_unpin(bo);
out_unreserve:
	amdgpu_bo_unreserve(bo);

	return r;
}

/** amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel() - Map a GTT BO for kernel CPU access
 *
 * @mem: Buffer object to be mapped for CPU access
+7 −0
Original line number Diff line number Diff line
@@ -33,6 +33,13 @@
#define AMDGPU_MES_MAX_GFX_PIPES            2
#define AMDGPU_MES_MAX_SDMA_PIPES           2

/*
 * Field layout of a packed MES version word, as implied by the masks below:
 *   bits  0-11  version          (AMDGPU_MES_VERSION_MASK, no shift needed)
 *   bits 12-23  API version      (AMDGPU_MES_API_VERSION_MASK)
 *   bits 24-31  feature version  (AMDGPU_MES_FEAT_VERSION_MASK)
 * Extract a field with (value & <FIELD>_MASK) >> <FIELD>_SHIFT.
 */
#define AMDGPU_MES_API_VERSION_SHIFT	12
#define AMDGPU_MES_FEAT_VERSION_SHIFT	24

#define AMDGPU_MES_VERSION_MASK		0x00000fff
#define AMDGPU_MES_API_VERSION_MASK	0x00fff000
#define AMDGPU_MES_FEAT_VERSION_MASK	0xff000000

enum amdgpu_mes_priority_level {
	AMDGPU_MES_PRIORITY_LEVEL_LOW       = 0,
	AMDGPU_MES_PRIORITY_LEVEL_NORMAL    = 1,
+43 −2
Original line number Diff line number Diff line
@@ -299,6 +299,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
	struct kfd_process_device *pdd;
	struct queue_properties q_properties;
	uint32_t doorbell_offset_in_process = 0;
	struct amdgpu_bo *wptr_bo = NULL;

	memset(&q_properties, 0, sizeof(struct queue_properties));

@@ -326,12 +327,49 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
		goto err_bind_process;
	}

	/* Starting with GFX11, wptr BOs must be mapped to GART for MES to determine work
	 * on unmapped queues for usermode queue oversubscription (no aggregated doorbell)
	 */
	if (dev->shared_resources.enable_mes &&
			((dev->adev->mes.sched_version & AMDGPU_MES_API_VERSION_MASK)
			>> AMDGPU_MES_API_VERSION_SHIFT) >= 2) {
		struct amdgpu_bo_va_mapping *wptr_mapping;
		struct amdgpu_vm *wptr_vm;

		wptr_vm = drm_priv_to_vm(pdd->drm_priv);
		err = amdgpu_bo_reserve(wptr_vm->root.bo, false);
		if (err)
			goto err_wptr_map_gart;

		wptr_mapping = amdgpu_vm_bo_lookup_mapping(
				wptr_vm, args->write_pointer_address >> PAGE_SHIFT);
		amdgpu_bo_unreserve(wptr_vm->root.bo);
		if (!wptr_mapping) {
			pr_err("Failed to lookup wptr bo\n");
			err = -EINVAL;
			goto err_wptr_map_gart;
		}

		wptr_bo = wptr_mapping->bo_va->base.bo;
		if (wptr_bo->tbo.base.size > PAGE_SIZE) {
			pr_err("Requested GART mapping for wptr bo larger than one page\n");
			err = -EINVAL;
			goto err_wptr_map_gart;
		}

		err = amdgpu_amdkfd_map_gtt_bo_to_gart(dev->adev, wptr_bo);
		if (err) {
			pr_err("Failed to map wptr bo to GART\n");
			goto err_wptr_map_gart;
		}
	}

	pr_debug("Creating queue for PASID 0x%x on gpu 0x%x\n",
			p->pasid,
			dev->id);

	err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, &queue_id, NULL, NULL, NULL,
			&doorbell_offset_in_process);
	err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, &queue_id, wptr_bo,
			NULL, NULL, NULL, &doorbell_offset_in_process);
	if (err != 0)
		goto err_create_queue;

@@ -363,6 +401,9 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
	return 0;

err_create_queue:
	if (wptr_bo)
		amdgpu_amdkfd_free_gtt_mem(dev->adev, wptr_bo);
err_wptr_map_gart:
err_bind_process:
err_pdd:
	mutex_unlock(&p->mutex);
+8 −1
Original line number Diff line number Diff line
@@ -177,6 +177,7 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
	struct kfd_process_device *pdd = qpd_to_pdd(qpd);
	struct mes_add_queue_input queue_input;
	int r, queue_type;
	uint64_t wptr_addr_off;

	if (dqm->is_hws_hang)
		return -EIO;
@@ -196,7 +197,13 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
					AMDGPU_MES_PRIORITY_LEVEL_NORMAL;
	queue_input.doorbell_offset = q->properties.doorbell_off;
	queue_input.mqd_addr = q->gart_mqd_addr;

	if (q->wptr_bo) {
		wptr_addr_off = (uint64_t)q->properties.write_ptr - (uint64_t)q->wptr_bo->kfd_bo->va;
		queue_input.wptr_addr = ((uint64_t)q->wptr_bo->tbo.resource->start << PAGE_SHIFT) + wptr_addr_off;
	} else
		queue_input.wptr_addr = (uint64_t)q->properties.write_ptr;

	queue_input.paging = false;
	queue_input.tba_addr = qpd->tba_addr;
	queue_input.tma_addr = qpd->tma_addr;
Loading