Commit b1304409 authored by Connor Abbott, committed by Rob Clark
Browse files

drm/msm: Temporarily disable stall-on-fault after a page fault



When things go wrong, the GPU is capable of quickly generating millions
of faulting translation requests per second. When that happens, in the
stall-on-fault model each access will stall until it wins the race to
signal the fault and then the RESUME register is written. This slows
processing page faults to a crawl as the GPU can generate faults much
faster than the CPU can acknowledge them. It also means that all
available resources in the SMMU are saturated waiting for the stalled
transactions, so that other transactions such as transactions generated
by the GMU, which shares translation resources with the GPU, cannot
proceed. This causes a GMU watchdog timeout, which leads to a failed
reset, because GX cannot collapse while a transaction is pending, and
ultimately to a permanently hung GPU.

On older platforms with qcom,smmu-v2, it seems that when one transaction
is stalled subsequent faulting transactions are terminated, which avoids
this problem, but the MMU-500 follows the spec here.

To work around these problems, disable stall-on-fault as soon as we get a
page fault until a cooldown period after pagefaults stop. This allows
the GMU some guaranteed time to continue working. We only use
stall-on-fault to halt the GPU while we collect a devcoredump and we
always terminate the transaction afterward, so it's fine to miss some
subsequent page faults. We also keep it disabled so long as the current
devcoredump hasn't been deleted, because in that case we likely won't
capture another one if there's a fault.

After this commit HFI messages still occasionally time out, because the
crashdump handler doesn't run fast enough to let the GMU resume, but the
driver seems to recover from it. This will probably go away after the
HFI timeout is increased.

Signed-off-by: Connor Abbott <cwabbott0@gmail.com>
Reviewed-by: Rob Clark <robdclark@gmail.com>
Patchwork: https://patchwork.freedesktop.org/patch/654891/


Signed-off-by: Rob Clark <robin.clark@oss.qualcomm.com>
parent dedf404b
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -131,6 +131,8 @@ static void a5xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
	struct msm_ringbuffer *ring = submit->ring;
	unsigned int i, ibs = 0;

	adreno_check_and_reenable_stall(adreno_gpu);

	if (IS_ENABLED(CONFIG_DRM_MSM_GPU_SUDO) && submit->in_rb) {
		ring->cur_ctx_seqno = 0;
		a5xx_submit_in_rb(gpu, submit);
+4 −0
Original line number Diff line number Diff line
@@ -212,6 +212,8 @@ static void a6xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
	struct msm_ringbuffer *ring = submit->ring;
	unsigned int i, ibs = 0;

	adreno_check_and_reenable_stall(adreno_gpu);

	a6xx_set_pagetable(a6xx_gpu, ring, submit);

	get_stats_counter(ring, REG_A6XX_RBBM_PERFCTR_CP(0),
@@ -335,6 +337,8 @@ static void a7xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
	struct msm_ringbuffer *ring = submit->ring;
	unsigned int i, ibs = 0;

	adreno_check_and_reenable_stall(adreno_gpu);

	/*
	 * Toggle concurrent binning for pagetable switch and set the thread to
	 * BR since only it can execute the pagetable switch packets.
+39 −1
Original line number Diff line number Diff line
@@ -259,16 +259,54 @@ u64 adreno_private_address_space_size(struct msm_gpu *gpu)
	return BIT(ttbr1_cfg->ias) - ADRENO_VM_START;
}

/*
 * Turn stall-on-fault back on when all of the following hold:
 *   - it is currently disabled (priv->stall_enabled is false),
 *   - the current time is after priv->stall_reenable_time, and
 *   - gpu->crashstate is empty, i.e. a new fault would actually result in a
 *     crashdump being collected.
 *
 * The checks and the update are performed with priv->fault_stall_lock held
 * and interrupts saved/disabled.
 */
void adreno_check_and_reenable_stall(struct adreno_gpu *adreno_gpu)
{
	struct msm_gpu *gpu = &adreno_gpu->base;
	struct msm_drm_private *priv = gpu->dev->dev_private;
	unsigned long irq_flags;

	spin_lock_irqsave(&priv->fault_stall_lock, irq_flags);

	/* Stalling is already on: nothing to re-enable. */
	if (priv->stall_enabled)
		goto unlock;

	/* The cooldown period has not passed yet. */
	if (!ktime_after(ktime_get(), priv->stall_reenable_time))
		goto unlock;

	/* A crashstate is still held, so we would not collect another one. */
	if (READ_ONCE(gpu->crashstate))
		goto unlock;

	priv->stall_enabled = true;
	gpu->aspace->mmu->funcs->set_stall(gpu->aspace->mmu, true);

unlock:
	spin_unlock_irqrestore(&priv->fault_stall_lock, irq_flags);
}

/*
 * Bit masks tested against info->fsr in adreno_fault_handler(). The handler
 * only collects a devcoredump when ARM_SMMU_FSR_SS is set in the reported
 * value.
 * NOTE(review): TF/PF/EF presumably name the translation, permission and
 * external fault bits - confirm against the SMMU register definitions.
 */
#define ARM_SMMU_FSR_TF                 BIT(1)
#define ARM_SMMU_FSR_PF			BIT(3)
#define ARM_SMMU_FSR_EF			BIT(4)
#define ARM_SMMU_FSR_SS			BIT(30)

int adreno_fault_handler(struct msm_gpu *gpu, unsigned long iova, int flags,
			 struct adreno_smmu_fault_info *info, const char *block,
			 u32 scratch[4])
{
	struct msm_drm_private *priv = gpu->dev->dev_private;
	const char *type = "UNKNOWN";
	bool do_devcoredump = info && !READ_ONCE(gpu->crashstate);
	bool do_devcoredump = info && (info->fsr & ARM_SMMU_FSR_SS) &&
		!READ_ONCE(gpu->crashstate);
	unsigned long irq_flags;

	/*
	 * In case there is a subsequent storm of pagefaults, disable
	 * stall-on-fault for at least half a second.
	 */
	spin_lock_irqsave(&priv->fault_stall_lock, irq_flags);
	if (priv->stall_enabled) {
		priv->stall_enabled = false;

		gpu->aspace->mmu->funcs->set_stall(gpu->aspace->mmu, false);
	}
	priv->stall_reenable_time = ktime_add_ms(ktime_get(), 500);
	spin_unlock_irqrestore(&priv->fault_stall_lock, irq_flags);

	/*
	 * Print a default message if we couldn't get the data from the
+2 −0
Original line number Diff line number Diff line
@@ -636,6 +636,8 @@ int adreno_fault_handler(struct msm_gpu *gpu, unsigned long iova, int flags,
			 struct adreno_smmu_fault_info *info, const char *block,
			 u32 scratch[4]);

/*
 * Re-enables stall-on-fault if it is currently disabled, the cooldown
 * deadline has passed and no crashstate is held. Called at the top of the
 * a5xx/a6xx/a7xx submit paths.
 */
void adreno_check_and_reenable_stall(struct adreno_gpu *gpu);

int adreno_read_speedbin(struct device *dev, u32 *speedbin);

/*
+32 −0
Original line number Diff line number Diff line
@@ -208,6 +208,35 @@ DEFINE_DEBUGFS_ATTRIBUTE(shrink_fops,
			 shrink_get, shrink_set,
			 "0x%08llx\n");

/*
 * debugfs getter reporting how many microseconds remain before stall-on-fault
 * may be re-enabled.
 *
 * A result of 0 means stall-on-fault is either already enabled or will be
 * re-enabled on the next submit (unless there's a leftover devcoredump).
 * Kernel tests that intentionally produce a fault and then check the
 * devcoredump can use this value to wait out the cooldown period.
 *
 * @data: the struct msm_drm_private registered with the debugfs file
 * @val:  receives the remaining time in microseconds, never negative
 *
 * Always returns 0.
 */

static int
stall_reenable_time_get(void *data, u64 *val)
{
	struct msm_drm_private *priv = data;
	unsigned long irq_flags;
	u64 remaining_us = 0;

	spin_lock_irqsave(&priv->fault_stall_lock, irq_flags);

	/* Clamp at zero once the re-enable deadline is already in the past. */
	if (!priv->stall_enabled)
		remaining_us = max(ktime_us_delta(priv->stall_reenable_time, ktime_get()), 0);

	spin_unlock_irqrestore(&priv->fault_stall_lock, irq_flags);

	*val = remaining_us;

	return 0;
}

/*
 * File operations for the "stall_reenable_time_us" debugfs entry. Only a
 * getter is supplied (the setter slot is NULL), and the value is printed with
 * the "%lld\n" format.
 */
DEFINE_DEBUGFS_ATTRIBUTE(stall_reenable_time_fops,
			 stall_reenable_time_get, NULL,
			 "%lld\n");

static int msm_gem_show(struct seq_file *m, void *arg)
{
@@ -319,6 +348,9 @@ static void msm_debugfs_gpu_init(struct drm_minor *minor)
	debugfs_create_bool("disable_err_irq", 0600, minor->debugfs_root,
		&priv->disable_err_irq);

	debugfs_create_file("stall_reenable_time_us", 0400, minor->debugfs_root,
		priv, &stall_reenable_time_fops);

	gpu_devfreq = debugfs_create_dir("devfreq", minor->debugfs_root);

	debugfs_create_bool("idle_clamp",0600, gpu_devfreq,
Loading