Commit 9edc5296 authored by Rob Clark
Browse files

drm/msm: Add VM logging for VM_BIND updates



When userspace opts in to VM_BIND, the submit no longer holds references
keeping the VMA alive.  This makes it difficult to distinguish between
UMD/KMD/app bugs.  So add a debug option for logging the most recent VM
updates and capturing these in GPU devcoredumps.

The submitqueue id is also captured; a value of zero means the operation
did not go via a submitqueue (i.e. it comes from msm_gem_vm_close() tearing
down the remaining mappings when the device file is closed).

Signed-off-by: Rob Clark <robdclark@chromium.org>
Signed-off-by: Rob Clark <robin.clark@oss.qualcomm.com>
Tested-by: Antonino Maniscalco <antomani103@gmail.com>
Reviewed-by: Antonino Maniscalco <antomani103@gmail.com>
Patchwork: https://patchwork.freedesktop.org/patch/661518/
parent 2e6a8a1f
Loading
Loading
Loading
Loading
+11 −0
Original line number Diff line number Diff line
@@ -833,6 +833,7 @@ void adreno_gpu_state_destroy(struct msm_gpu_state *state)
	for (i = 0; state->bos && i < state->nr_bos; i++)
		kvfree(state->bos[i].data);

	kfree(state->vm_logs);
	kfree(state->bos);
	kfree(state->comm);
	kfree(state->cmd);
@@ -973,6 +974,16 @@ void adreno_show(struct msm_gpu *gpu, struct msm_gpu_state *state,
			   info->ptes[0], info->ptes[1], info->ptes[2], info->ptes[3]);
	}

	if (state->vm_logs) {
		drm_puts(p, "vm-log:\n");
		for (i = 0; i < state->nr_vm_logs; i++) {
			struct msm_gem_vm_log_entry *e = &state->vm_logs[i];
			drm_printf(p, "  - %s:%d: 0x%016llx-0x%016llx\n",
				   e->op, e->queue_id, e->iova,
				   e->iova + e->range);
		}
	}

	drm_printf(p, "rbbm-status: 0x%08x\n", state->rbbm_status);

	drm_puts(p, "ringbuffer:\n");
+24 −0
Original line number Diff line number Diff line
@@ -24,6 +24,20 @@
#define MSM_BO_STOLEN        0x10000000    /* try to use stolen/splash memory */
#define MSM_BO_MAP_PRIV      0x20000000    /* use IOMMU_PRIV when mapping */

/**
 * struct msm_gem_vm_log_entry - An entry in the VM log
 *
 * For userspace managed VMs, a log of recent VM updates is tracked and
 * captured in GPU devcoredumps, to aid debugging issues caused by (for
 * example) incorrectly synchronized VM updates.
 */
struct msm_gem_vm_log_entry {
	/**
	 * @op: name of the logged operation ("map" or "unmap").  The pointer
	 * itself is stored, not a copy of the string.  NULL means the entry
	 * has not been written yet.
	 */
	const char *op;
	/** @iova: start address of the range the operation applies to */
	uint64_t iova;
	/** @range: size of the range, ie. the op covers @iova to @iova + @range */
	uint64_t range;
	/**
	 * @queue_id: id of the submitqueue the operation was performed on,
	 * or zero if it did not go via a submitqueue
	 */
	int queue_id;
};

/**
 * struct msm_gem_vm - VM object
 *
@@ -85,6 +99,15 @@ struct msm_gem_vm {
	/** @last_fence: Fence for last pending work scheduled on the VM */
	struct dma_fence *last_fence;

	/** @log: A log of recent VM updates */
	struct msm_gem_vm_log_entry *log;

	/** @log_shift: length of @log is (1 << @log_shift) */
	uint32_t log_shift;

	/** @log_idx: index of next @log entry to write */
	uint32_t log_idx;

	/** @faults: the number of GPU hangs associated with this address space */
	int faults;

@@ -115,6 +138,7 @@ msm_gem_vm_create(struct drm_device *drm, struct msm_mmu *mmu, const char *name,
		  u64 va_start, u64 va_size, bool managed);

void msm_gem_vm_close(struct drm_gpuvm *gpuvm);
void msm_gem_vm_unusable(struct drm_gpuvm *gpuvm);

struct msm_fence_context;

+116 −8
Original line number Diff line number Diff line
@@ -17,6 +17,10 @@

#define vm_dbg(fmt, ...) pr_debug("%s:%d: "fmt"\n", __func__, __LINE__, ##__VA_ARGS__)

/*
 * Debug knob: log2 of the number of entries in the per-VM update log.
 * Zero (the default) means no log is allocated.  The value is clamped
 * to 8 when a VM is created, and only applies to VMs that are not
 * kernel managed.
 */
static uint vm_log_shift;
MODULE_PARM_DESC(vm_log_shift, "log2 of the number of entries in the VM op log (0=disabled, max 8)");
module_param_named(vm_log_shift, vm_log_shift, uint, 0600);

/**
 * struct msm_vm_map_op - create new pgtable mapping
 */
@@ -31,6 +35,13 @@ struct msm_vm_map_op {
	struct sg_table *sgt;
	/** @prot: the mapping protection flags */
	int prot;

	/**
	 * @queue_id: The id of the submitqueue the operation is performed
	 * on, or zero for (in particular) UNMAP ops triggered outside of
	 * a submitqueue (ie. process cleanup)
	 */
	int queue_id;
};

/**
@@ -41,6 +52,13 @@ struct msm_vm_unmap_op {
	uint64_t iova;
	/** @range: size of region to unmap */
	uint64_t range;

	/**
	 * @queue_id: The id of the submitqueue the operation is performed
	 * on, or zero for (in particular) UNMAP ops triggered outside of
	 * a submitqueue (ie. process cleanup)
	 */
	int queue_id;
};

/**
@@ -144,16 +162,87 @@ msm_gem_vm_free(struct drm_gpuvm *gpuvm)
		vm->mmu->funcs->destroy(vm->mmu);
	dma_fence_put(vm->last_fence);
	put_pid(vm->pid);
	kfree(vm->log);
	kfree(vm);
}

/**
 * msm_gem_vm_unusable() - Mark a VM as unusable
 * @gpuvm: the VM to mark unusable
 *
 * Sets the VM's unusable flag.  If the VM has a log of recent updates and
 * at least one entry has been recorded, the log is additionally dumped via
 * pr_err(), oldest entry first, while holding the VM's mmu_lock.
 */
void
msm_gem_vm_unusable(struct drm_gpuvm *gpuvm)
{
	struct msm_gem_vm *vm = to_msm_vm(gpuvm);
	uint32_t vm_log_len = (1 << vm->log_shift);
	uint32_t vm_log_mask = vm_log_len - 1;
	uint32_t nr_vm_logs;
	int first;

	vm->unusable = true;

	/* Bail if no log, or empty log: */
	if (!vm->log || !vm->log[0].op)
		return;

	mutex_lock(&vm->mmu_lock);

	/*
	 * log_idx is the next entry to overwrite, meaning it is the oldest, or
	 * first, entry (other than the special case handled below where the
	 * log hasn't wrapped around yet)
	 */
	first = vm->log_idx;

	if (!vm->log[first].op) {
		/*
		 * If the next log entry has not been written yet, then only
		 * entries 0 to idx-1 are valid (ie. we haven't wrapped around
		 * yet).  That is exactly idx entries; using idx-1 here would
		 * drop the most recent update from the dump.
		 */
		nr_vm_logs = first;
		first = 0;
	} else {
		nr_vm_logs = vm_log_len;
	}

	pr_err("vm-log:\n");
	for (uint32_t i = 0; i < nr_vm_logs; i++) {
		/* Walk the ring starting at the oldest entry, wrapping via the mask */
		int idx = (i + first) & vm_log_mask;
		struct msm_gem_vm_log_entry *e = &vm->log[idx];
		pr_err("  - %s:%d: 0x%016llx-0x%016llx\n",
		       e->op, e->queue_id, e->iova,
		       e->iova + e->range);
	}

	mutex_unlock(&vm->mmu_lock);
}

/*
 * vm_log() - trace a VM update and record it in the VM's update log
 *
 * Emits a vm_dbg() trace line for the operation.  If the VM has a log
 * buffer, the operation is also stored at log_idx, and log_idx is advanced
 * with wrap-around so that the oldest entry is overwritten next.
 *
 * For VMs that are not kernel managed, the caller must hold mmu_lock.
 */
static void
vm_unmap_op(struct msm_gem_vm *vm, const struct msm_vm_unmap_op *op)
vm_log(struct msm_gem_vm *vm, const char *op, uint64_t iova, uint64_t range, int queue_id)
{
	int idx;

	if (!vm->managed)
		lockdep_assert_held(&vm->mmu_lock);

	vm_dbg("%p: %016llx %016llx", vm, op->iova, op->iova + op->range);
	vm_dbg("%s:%p:%d: %016llx %016llx", op, vm, queue_id, iova, iova + range);

	/* No log buffer for this VM: nothing more to record */
	if (!vm->log)
		return;

	idx = vm->log_idx;
	/* Only the pointer is stored; the op string itself is not copied */
	vm->log[idx].op = op;
	vm->log[idx].iova = iova;
	vm->log[idx].range = range;
	vm->log[idx].queue_id = queue_id;
	/* The log length is (1 << log_shift), so masking wraps the index */
	vm->log_idx = (vm->log_idx + 1) & ((1 << vm->log_shift) - 1);
}

static void
vm_unmap_op(struct msm_gem_vm *vm, const struct msm_vm_unmap_op *op)
{
	vm_log(vm, "unmap", op->iova, op->range, op->queue_id);

	vm->mmu->funcs->unmap(vm->mmu, op->iova, op->range);
}
@@ -161,10 +250,7 @@ vm_unmap_op(struct msm_gem_vm *vm, const struct msm_vm_unmap_op *op)
static int
vm_map_op(struct msm_gem_vm *vm, const struct msm_vm_map_op *op)
{
	if (!vm->managed)
		lockdep_assert_held(&vm->mmu_lock);

	vm_dbg("%p: %016llx %016llx", vm, op->iova, op->iova + op->range);
	vm_log(vm, "map", op->iova, op->range, op->queue_id);

	return vm->mmu->funcs->map(vm->mmu, op->iova, op->sgt, op->offset,
				   op->range, op->prot);
@@ -382,6 +468,7 @@ vma_from_op(struct op_arg *arg, struct drm_gpuva_op_map *op)
static int
msm_gem_vm_sm_step_map(struct drm_gpuva_op *op, void *arg)
{
	struct msm_vm_bind_job *job = ((struct op_arg *)arg)->job;
	struct drm_gem_object *obj = op->map.gem.obj;
	struct drm_gpuva *vma;
	struct sg_table *sgt;
@@ -412,6 +499,7 @@ msm_gem_vm_sm_step_map(struct drm_gpuva_op *op, void *arg)
			.range = vma->va.range,
			.offset = vma->gem.offset,
			.prot = prot,
			.queue_id = job->queue->id,
		},
		.obj = vma->gem.obj,
	});
@@ -445,6 +533,7 @@ msm_gem_vm_sm_step_remap(struct drm_gpuva_op *op, void *arg)
			.unmap = {
				.iova = unmap_start,
				.range = unmap_range,
				.queue_id = job->queue->id,
			},
			.obj = orig_vma->gem.obj,
		});
@@ -506,6 +595,7 @@ msm_gem_vm_sm_step_remap(struct drm_gpuva_op *op, void *arg)
static int
msm_gem_vm_sm_step_unmap(struct drm_gpuva_op *op, void *arg)
{
	struct msm_vm_bind_job *job = ((struct op_arg *)arg)->job;
	struct drm_gpuva *vma = op->unmap.va;
	struct msm_gem_vma *msm_vma = to_msm_vma(vma);

@@ -520,6 +610,7 @@ msm_gem_vm_sm_step_unmap(struct drm_gpuva_op *op, void *arg)
		.unmap = {
			.iova = vma->va.addr,
			.range = vma->va.range,
			.queue_id = job->queue->id,
		},
		.obj = vma->gem.obj,
	});
@@ -584,7 +675,7 @@ msm_vma_job_run(struct drm_sched_job *_job)
	 * now the VM is in an undefined state.  Game over!
	 */
	if (ret)
		vm->unusable = true;
		msm_gem_vm_unusable(job->vm);

	job_foreach_bo (obj, job) {
		msm_gem_lock(obj);
@@ -695,6 +786,23 @@ msm_gem_vm_create(struct drm_device *drm, struct msm_mmu *mmu, const char *name,

	drm_mm_init(&vm->mm, va_start, va_size);

	/*
	 * We don't really need vm log for kernel managed VMs, as the kernel
	 * is responsible for ensuring that GEM objs are mapped if they are
	 * used by a submit.  Furthermore we piggyback on mmu_lock to serialize
	 * access to the log.
	 *
	 * Limit the max log_shift to 8 to prevent userspace from asking us
	 * for an unreasonable log size.
	 */
	if (!managed)
		vm->log_shift = MIN(vm_log_shift, 8);

	if (vm->log_shift) {
		vm->log = kmalloc_array(1 << vm->log_shift, sizeof(vm->log[0]),
					GFP_KERNEL | __GFP_ZERO);
	}

	return &vm->base;

err_free_dummy:
@@ -1162,7 +1270,7 @@ vm_bind_job_prepare(struct msm_vm_bind_job *job)
			 * state the vm is in.  So throw up our hands!
			 */
			if (i > 0)
				vm->unusable = true;
				msm_gem_vm_unusable(job->vm);
			return ret;
		}
	}
+47 −5
Original line number Diff line number Diff line
@@ -259,9 +259,6 @@ static void crashstate_get_bos(struct msm_gpu_state *state, struct msm_gem_submi
{
	extern bool rd_full;

	if (!submit)
		return;

	if (msm_context_is_vmbind(submit->queue->ctx)) {
		struct drm_exec exec;
		struct drm_gpuva *vma;
@@ -318,6 +315,48 @@ static void crashstate_get_bos(struct msm_gpu_state *state, struct msm_gem_submi
	}
}

/*
 * crashstate_get_vm_logs() - snapshot the VM update log into the crash state
 * @state: crash state that receives the copy (vm_logs / nr_vm_logs)
 * @vm: the VM whose log is captured
 *
 * Copies the valid log entries, oldest first, into a newly allocated array
 * stored in @state->vm_logs.  If the VM has no log, the log is empty, or
 * the allocation fails, @state is left without a log (vm_logs NULL and
 * nr_vm_logs zero).  The array is released with kfree() when the state is
 * destroyed.
 */
static void crashstate_get_vm_logs(struct msm_gpu_state *state, struct msm_gem_vm *vm)
{
	uint32_t vm_log_len = (1 << vm->log_shift);
	uint32_t vm_log_mask = vm_log_len - 1;
	int first;

	/* Bail if no log, or empty log: */
	if (!vm->log || !vm->log[0].op)
		return;

	mutex_lock(&vm->mmu_lock);

	/*
	 * log_idx is the next entry to overwrite, meaning it is the oldest, or
	 * first, entry (other than the special case handled below where the
	 * log hasn't wrapped around yet)
	 */
	first = vm->log_idx;

	if (!vm->log[first].op) {
		/*
		 * If the next log entry has not been written yet, then only
		 * entries 0 to idx-1 are valid (ie. we haven't wrapped around
		 * yet).  That is exactly idx entries; using idx-1 here would
		 * drop the most recent update from the capture.
		 */
		state->nr_vm_logs = first;
		first = 0;
	} else {
		state->nr_vm_logs = vm_log_len;
	}

	state->vm_logs = kmalloc_array(
		state->nr_vm_logs, sizeof(vm->log[0]), GFP_KERNEL);
	if (!state->vm_logs) {
		/*
		 * The log is best-effort debug data: on allocation failure
		 * just capture nothing, and keep the count consistent with
		 * the NULL array.
		 */
		state->nr_vm_logs = 0;
		goto out_unlock;
	}

	for (int i = 0; i < state->nr_vm_logs; i++) {
		/* Walk the ring starting at the oldest entry, wrapping via the mask */
		int idx = (i + first) & vm_log_mask;

		state->vm_logs[i] = vm->log[idx];
	}

out_unlock:
	mutex_unlock(&vm->mmu_lock);
}

static void msm_gpu_crashstate_capture(struct msm_gpu *gpu,
		struct msm_gem_submit *submit, struct msm_gpu_fault_info *fault_info,
		char *comm, char *cmd)
@@ -351,7 +390,10 @@ static void msm_gpu_crashstate_capture(struct msm_gpu *gpu,
		msm_iommu_pagetable_walk(mmu, info->iova, info->ptes);
	}

	if (submit) {
		crashstate_get_vm_logs(state, to_msm_vm(submit->vm));
		crashstate_get_bos(state, submit);
	}

	/* Set the active crash state to be dumped on failure */
	gpu->crashstate = state;
@@ -452,7 +494,7 @@ static void recover_worker(struct kthread_work *work)
		 * VM_BIND)
		 */
		if (!vm->managed)
			vm->unusable = true;
			msm_gem_vm_unusable(submit->vm);
	}

	get_comm_cmdline(submit, &comm, &cmd);
+4 −0
Original line number Diff line number Diff line
@@ -20,6 +20,7 @@
#include "msm_gem.h"

struct msm_gem_submit;
struct msm_gem_vm_log_entry;
struct msm_gpu_perfcntr;
struct msm_gpu_state;
struct msm_context;
@@ -603,6 +604,9 @@ struct msm_gpu_state {

	struct msm_gpu_fault_info fault_info;

	int nr_vm_logs;
	struct msm_gem_vm_log_entry *vm_logs;

	int nr_bos;
	struct msm_gpu_state_bo *bos;
};