Commit 3c0a6019 authored by Boris Brezillon's avatar Boris Brezillon
Browse files

drm/panthor: Recover from panthor_gpu_flush_caches() failures



We have seen a few cases where the whole memory subsystem is blocked
and flush operations never complete. When that happens, we want to:

- schedule a reset, so we can recover from this situation
- in the reset path, we need to reset the pending_reqs so we can send
  new commands after the reset
- if more panthor_gpu_flush_caches() operations are queued after
  the timeout, we skip them and return -EIO directly to avoid needless
  waits (the memory block won't miraculously work again)

Note that we drop the WARN_ON()s because these hangs can be triggered
with buggy GPU jobs created by the UMD, and there's no way we can
prevent it. We do keep the error messages though.

v2:
- New patch

v3:
- Collect R-b
- Explicitly mention the fact we dropped the WARN_ON()s in the commit
  message

v4:
- No changes

Fixes: 5cd894e2 ("drm/panthor: Add the GPU logical block")
Reviewed-by: default avatarSteven Price <steven.price@arm.com>
Link: https://patch.msgid.link/20251128084841.3804658-4-boris.brezillon@collabora.com


Signed-off-by: default avatarBoris Brezillon <boris.brezillon@collabora.com>
parent 151df689
Loading
Loading
Loading
Loading
+12 −7
Original line number Diff line number Diff line
@@ -289,38 +289,42 @@ int panthor_gpu_l2_power_on(struct panthor_device *ptdev)
int panthor_gpu_flush_caches(struct panthor_device *ptdev,
			     u32 l2, u32 lsc, u32 other)
{
	bool timedout = false;
	unsigned long flags;
	int ret = 0;

	/* Serialize cache flush operations. */
	guard(mutex)(&ptdev->gpu->cache_flush_lock);

	spin_lock_irqsave(&ptdev->gpu->reqs_lock, flags);
	if (!drm_WARN_ON(&ptdev->base,
			 ptdev->gpu->pending_reqs & GPU_IRQ_CLEAN_CACHES_COMPLETED)) {
	if (!(ptdev->gpu->pending_reqs & GPU_IRQ_CLEAN_CACHES_COMPLETED)) {
		ptdev->gpu->pending_reqs |= GPU_IRQ_CLEAN_CACHES_COMPLETED;
		gpu_write(ptdev, GPU_CMD, GPU_FLUSH_CACHES(l2, lsc, other));
	} else {
		ret = -EIO;
	}
	spin_unlock_irqrestore(&ptdev->gpu->reqs_lock, flags);

	if (ret)
		return ret;

	if (!wait_event_timeout(ptdev->gpu->reqs_acked,
				!(ptdev->gpu->pending_reqs & GPU_IRQ_CLEAN_CACHES_COMPLETED),
				msecs_to_jiffies(100))) {
		spin_lock_irqsave(&ptdev->gpu->reqs_lock, flags);
		if ((ptdev->gpu->pending_reqs & GPU_IRQ_CLEAN_CACHES_COMPLETED) != 0 &&
		    !(gpu_read(ptdev, GPU_INT_RAWSTAT) & GPU_IRQ_CLEAN_CACHES_COMPLETED))
			timedout = true;
			ret = -ETIMEDOUT;
		else
			ptdev->gpu->pending_reqs &= ~GPU_IRQ_CLEAN_CACHES_COMPLETED;
		spin_unlock_irqrestore(&ptdev->gpu->reqs_lock, flags);
	}

	if (timedout) {
	if (ret) {
		panthor_device_schedule_reset(ptdev);
		drm_err(&ptdev->base, "Flush caches timeout");
		return -ETIMEDOUT;
	}

	return 0;
	return ret;
}

/**
@@ -360,6 +364,7 @@ int panthor_gpu_soft_reset(struct panthor_device *ptdev)
		return -ETIMEDOUT;
	}

	ptdev->gpu->pending_reqs = 0;
	return 0;
}