Commit b8f67b9d authored by Shashank Sharma, committed by Alex Deucher
Browse files

drm/amdgpu: change vm->task_info handling



This patch changes the handling and lifecycle of vm->task_info object.
The major changes are:
- vm->task_info is a dynamically allocated ptr now, and its usage is
  reference counted.
- introducing two new helper funcs for task_info lifecycle management
    - amdgpu_vm_get_task_info: reference counts up task_info before
      returning this info
    - amdgpu_vm_put_task_info: reference counts down task_info
- last put to task_info() frees task_info from the vm.

This patch also does logistical changes required for existing usage
of vm->task_info.

V2: Do not block all the prints when task_info not found (Felix)

V3: Fixed review comments from Felix
   - Fix wrong indentation
   - No debug message for -ENOMEM
   - Add NULL check for task_info
   - Do not duplicate the debug messages (ti vs no ti)
   - Get first reference of task_info in vm_init(), put last
     in vm_fini()

V4: Fixed review comments from Felix
   - fix double reference increment in create_task_info
   - change amdgpu_vm_get_task_info_pasid
   - additional changes in amdgpu_gem.c while porting

Cc: Christian Koenig <christian.koenig@amd.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: Felix Kuehling <Felix.Kuehling@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 68e05b93
Loading
Loading
Loading
Loading
+7 −2
Original line number Diff line number Diff line
@@ -1782,9 +1782,14 @@ static int amdgpu_debugfs_vm_info_show(struct seq_file *m, void *unused)
	list_for_each_entry(file, &dev->filelist, lhead) {
		struct amdgpu_fpriv *fpriv = file->driver_priv;
		struct amdgpu_vm *vm = &fpriv->vm;
		struct amdgpu_task_info *ti;

		ti = amdgpu_vm_get_task_info_vm(vm);
		if (ti) {
			seq_printf(m, "pid:%d\tProcess:%s ----------\n", ti->pid, ti->process_name);
			amdgpu_vm_put_task_info(ti);
		}

		seq_printf(m, "pid:%d\tProcess:%s ----------\n",
				vm->task_info.pid, vm->task_info.process_name);
		r = amdgpu_bo_reserve(vm->root.bo, true);
		if (r)
			break;
+9 −3
Original line number Diff line number Diff line
@@ -208,9 +208,15 @@ static int amdgpu_gem_object_open(struct drm_gem_object *obj,
	if (!WARN_ON(!vm->process_info->eviction_fence)) {
		r = amdgpu_amdkfd_bo_validate_and_fence(abo, AMDGPU_GEM_DOMAIN_GTT,
							&vm->process_info->eviction_fence->base);
		if (r)
			dev_warn(adev->dev, "%d: validate_and_fence failed: %d\n",
				 vm->task_info.pid, r);
		if (r) {
			struct amdgpu_task_info *ti = amdgpu_vm_get_task_info_vm(vm);

			dev_warn(adev->dev, "validate_and_fence failed: %d\n", r);
			if (ti) {
				dev_warn(adev->dev, "pid %d\n", ti->pid);
				amdgpu_vm_put_task_info(ti);
			}
		}
	}
	mutex_unlock(&vm->process_info->lock);

+11 −7
Original line number Diff line number Diff line
@@ -35,7 +35,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
{
	struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
	struct amdgpu_job *job = to_amdgpu_job(s_job);
	struct amdgpu_task_info ti;
	struct amdgpu_task_info *ti;
	struct amdgpu_device *adev = ring->adev;
	int idx;
	int r;
@@ -48,7 +48,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
		return DRM_GPU_SCHED_STAT_ENODEV;
	}

	memset(&ti, 0, sizeof(struct amdgpu_task_info));

	adev->job_hang = true;

	if (amdgpu_gpu_recovery &&
@@ -58,12 +58,16 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
		goto exit;
	}

	amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
	DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
		   job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
		   ring->fence_drv.sync_seq);

	ti = amdgpu_vm_get_task_info_pasid(ring->adev, job->pasid);
	if (ti) {
		DRM_ERROR("Process information: process %s pid %d thread %s pid %d\n",
		  ti.process_name, ti.tgid, ti.task_name, ti.pid);
			  ti->process_name, ti->tgid, ti->task_name, ti->pid);
		amdgpu_vm_put_task_info(ti);
	}

	dma_fence_set_error(&s_job->s_fence->finished, -ETIME);

+10 −2
Original line number Diff line number Diff line
@@ -230,8 +230,16 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,

	coredump->reset_vram_lost = vram_lost;

	if (reset_context->job && reset_context->job->vm)
		coredump->reset_task_info = reset_context->job->vm->task_info;
	if (reset_context->job && reset_context->job->vm) {
		struct amdgpu_task_info *ti;
		struct amdgpu_vm *vm = reset_context->job->vm;

		ti = amdgpu_vm_get_task_info_vm(vm);
		if (ti) {
			coredump->reset_task_info = *ti;
			amdgpu_vm_put_task_info(ti);
		}
	}

	coredump->adev = adev;

+115 −44
Original line number Diff line number Diff line
@@ -513,8 +513,14 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm,
		bo = bo_base->bo;

		if (dma_resv_locking_ctx(bo->tbo.base.resv) != ticket) {
			pr_warn_ratelimited("Evicted user BO is not reserved in pid %d\n",
					    vm->task_info.pid);
			struct amdgpu_task_info *ti = amdgpu_vm_get_task_info_vm(vm);

			pr_warn_ratelimited("Evicted user BO is not reserved\n");
			if (ti) {
				pr_warn_ratelimited("pid %d\n", ti->pid);
				amdgpu_vm_put_task_info(ti);
			}

			return -EINVAL;
		}

@@ -2221,6 +2227,108 @@ long amdgpu_vm_wait_idle(struct amdgpu_vm *vm, long timeout)
	return dma_fence_wait_timeout(vm->last_unlocked, true, timeout);
}

/* kref release callback: frees the task_info that embeds @kref. */
static void amdgpu_vm_destroy_task_info(struct kref *kref)
{
	kfree(container_of(kref, struct amdgpu_task_info, refcount));
}

/*
 * Look up the VM stored under @pasid in adev->vm_manager.pasids.
 *
 * The xarray lock is taken (with IRQ state saved) only around the lookup;
 * the result of xa_load() is returned after the lock has been released.
 */
static inline struct amdgpu_vm *
amdgpu_vm_get_vm_from_pasid(struct amdgpu_device *adev, u32 pasid)
{
	unsigned long irq_flags;
	struct amdgpu_vm *found;

	xa_lock_irqsave(&adev->vm_manager.pasids, irq_flags);
	found = xa_load(&adev->vm_manager.pasids, pasid);
	xa_unlock_irqrestore(&adev->vm_manager.pasids, irq_flags);

	return found;
}

/**
 * amdgpu_vm_put_task_info - reference down the vm task_info ptr
 *
 * @task_info: task_info struct under discussion, may be NULL.
 *
 * Drops one reference and frees the task_info at the last put.
 * A NULL @task_info is ignored: a VM whose task_info allocation failed
 * carries a NULL pointer, and callers such as amdgpu_vm_fini() pass
 * vm->task_info here without checking it first.
 */
void amdgpu_vm_put_task_info(struct amdgpu_task_info *task_info)
{
	if (!task_info)
		return;

	kref_put(&task_info->refcount, amdgpu_vm_destroy_task_info);
}

/**
 * amdgpu_vm_get_task_info_vm - Extracts task info for a vm.
 *
 * @vm: VM to get info from
 *
 * Returns the reference counted task_info structure, which must be
 * referenced down with amdgpu_vm_put_task_info.
 */
struct amdgpu_task_info *
amdgpu_vm_get_task_info_vm(struct amdgpu_vm *vm)
{
	struct amdgpu_task_info *ti = NULL;

	if (vm) {
		ti = vm->task_info;
		kref_get(&vm->task_info->refcount);
	}

	return ti;
}

/**
 * amdgpu_vm_get_task_info_pasid - Extracts task info for a PASID.
 *
 * @adev: drm device pointer
 * @pasid: PASID identifier for VM
 *
 * Resolves @pasid to its VM and hands that VM to
 * amdgpu_vm_get_task_info_vm(). The returned task_info is reference
 * counted and must be released with amdgpu_vm_put_task_info; NULL is
 * returned when no VM is found for @pasid.
 */
struct amdgpu_task_info *
amdgpu_vm_get_task_info_pasid(struct amdgpu_device *adev, u32 pasid)
{
	struct amdgpu_vm *vm = amdgpu_vm_get_vm_from_pasid(adev, pasid);

	return amdgpu_vm_get_task_info_vm(vm);
}

static int amdgpu_vm_create_task_info(struct amdgpu_vm *vm)
{
	vm->task_info = kzalloc(sizeof(struct amdgpu_task_info), GFP_KERNEL);
	if (!vm->task_info)
		return -ENOMEM;

	kref_init(&vm->task_info->refcount);
	return 0;
}

/**
 * amdgpu_vm_set_task_info - Sets VMs task info.
 *
 * @vm: vm for which to set the info
 */
void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
{
	if (!vm->task_info)
		return;

	if (vm->task_info->pid == current->pid)
		return;

	vm->task_info->pid = current->pid;
	get_task_comm(vm->task_info->task_name, current);

	if (current->group_leader->mm != current->mm)
		return;

	vm->task_info->tgid = current->group_leader->pid;
	get_task_comm(vm->task_info->process_name, current->group_leader);
}

/**
 * amdgpu_vm_init - initialize a vm instance
 *
@@ -2306,6 +2414,10 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
	if (r)
		goto error_free_root;

	r = amdgpu_vm_create_task_info(vm);
	if (r)
		DRM_DEBUG("Failed to create task info for VM\n");

	amdgpu_bo_unreserve(vm->root.bo);
	amdgpu_bo_unref(&root_bo);

@@ -2427,6 +2539,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)

	root = amdgpu_bo_ref(vm->root.bo);
	amdgpu_bo_reserve(root, true);
	amdgpu_vm_put_task_info(vm->task_info);
	amdgpu_vm_set_pasid(adev, vm, 0);
	dma_fence_wait(vm->last_unlocked, false);
	dma_fence_put(vm->last_unlocked);
@@ -2583,48 +2696,6 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
	return 0;
}

/**
 * amdgpu_vm_get_task_info - Extracts task info for a PASID.
 *
 * @adev: drm device pointer
 * @pasid: PASID identifier for VM
 * @task_info: task_info to fill.
 *
 * Copies the VM's embedded task_info into @task_info by value. If no VM
 * is found for @pasid, @task_info is left unmodified.
 */
void amdgpu_vm_get_task_info(struct amdgpu_device *adev, u32 pasid,
			 struct amdgpu_task_info *task_info)
{
	struct amdgpu_vm *vm;
	unsigned long flags;

	xa_lock_irqsave(&adev->vm_manager.pasids, flags);

	vm = xa_load(&adev->vm_manager.pasids, pasid);
	/* The struct copy happens while the pasids lock is still held. */
	if (vm)
		*task_info = vm->task_info;

	xa_unlock_irqrestore(&adev->vm_manager.pasids, flags);
}

/**
 * amdgpu_vm_set_task_info - Sets VMs task info.
 *
 * @vm: vm for which to set the info
 *
 * Fills the VM's embedded task_info from the calling task. Does nothing
 * if a non-zero pid has already been recorded.
 */
void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
{
	/* Already populated: keep the first recorded task. */
	if (vm->task_info.pid)
		return;

	vm->task_info.pid = current->pid;
	get_task_comm(vm->task_info.task_name, current);

	/* Process-level fields are only set when the group leader shares this mm. */
	if (current->group_leader->mm != current->mm)
		return;

	vm->task_info.tgid = current->group_leader->pid;
	get_task_comm(vm->task_info.process_name, current->group_leader);
}

/**
 * amdgpu_vm_handle_fault - graceful handling of VM faults.
 * @adev: amdgpu device pointer
Loading