Commit ff780f4f authored by Alex Deucher
Browse files

drm/amdgpu: set an error on all fences from a bad context



When we back up ring contents to reemit after a queue reset,
we don't back up ring contents from the bad context.  When
we signal the fences, we should set an error on the fences
from the bad context as well.

v2: misc cleanups
v3: add locking for fence error, fix comment (Christian)
v4: fix wrap around, locking (Christian)

Fixes: 77cc0da3 ("drm/amdgpu: track ring state associated with a fence")
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 1f22fcb8
Loading
Loading
Loading
Loading
+35 −4
Original line number Diff line number Diff line
@@ -758,11 +758,42 @@ void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring)
 * @af: guilty fence of the ring to signal
 *
 */
/*
 * NOTE(review): this span is a rendered diff without +/- markers. The
 * first signature below (parameter named "fence") appears to be the
 * removed pre-patch line; the second (parameter named "af") is the one
 * the body actually uses.
 */
void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *fence)
void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *af)
{
	struct dma_fence *unprocessed;
	struct dma_fence __rcu **ptr;
	struct amdgpu_fence *fence;
	struct amdgpu_ring *ring = af->ring;
	unsigned long flags;
	u32 seq, last_seq;

	/*
	 * Reduce both sequence values to slot indices into
	 * ring->fence_drv.fences[] by masking with num_fences_mask:
	 * last_seq comes from amdgpu_fence_read() (presumably the last
	 * sequence number written back for this ring - confirm), seq from
	 * the driver's sync_seq.
	 */
	last_seq = amdgpu_fence_read(ring) & ring->fence_drv.num_fences_mask;
	seq = ring->fence_drv.sync_seq & ring->fence_drv.num_fences_mask;

	/* mark all fences from the guilty context with an error */
	/* The whole walk runs under fence_drv.lock with interrupts saved/disabled. */
	spin_lock_irqsave(&ring->fence_drv.lock, flags);
	do {
		/* Step to the next slot; the mask makes the index wrap around. */
		last_seq++;
		last_seq &= ring->fence_drv.num_fences_mask;

		ptr = &ring->fence_drv.fences[last_seq];
		rcu_read_lock();
		unprocessed = rcu_dereference(*ptr);

		/* Empty slots and fences that are already signaled are left alone. */
		if (unprocessed && !dma_fence_is_signaled_locked(unprocessed)) {
			fence = container_of(unprocessed, struct amdgpu_fence, base);

			/* The guilty fence itself is tagged with -ETIME ... */
			if (fence == af)
				dma_fence_set_error(&fence->base, -ETIME);
	/*
	 * NOTE(review): the next two lines are indented like the old
	 * one-level function body and sit between the "if" and its "else if";
	 * they look like removed pre-patch lines from the diff rather than
	 * part of the new loop - confirm against the applied source.
	 */
	amdgpu_fence_write(fence->ring, fence->seq);
	amdgpu_fence_process(fence->ring);
			/* ... every other pending fence with the same context gets -ECANCELED. */
			else if (fence->context == af->context)
				dma_fence_set_error(&fence->base, -ECANCELED);
		}
		rcu_read_unlock();
	/* Stop once the slot corresponding to sync_seq has been visited. */
	} while (last_seq != seq);
	spin_unlock_irqrestore(&ring->fence_drv.lock, flags);
	/* signal the guilty fence */
	/*
	 * Done after dropping the lock: write the guilty fence's sequence
	 * number via amdgpu_fence_write() and then run amdgpu_fence_process()
	 * on the ring.
	 */
	amdgpu_fence_write(ring, af->seq);
	amdgpu_fence_process(ring);
}

void amdgpu_fence_save_wptr(struct dma_fence *fence)
+1 −1
Original line number Diff line number Diff line
@@ -811,7 +811,7 @@ int amdgpu_ring_reset_helper_end(struct amdgpu_ring *ring,
	if (r)
		return r;

	/* signal the fence of the bad job */
	/* signal the guilty fence and set an error on all fences from the context */
	if (guilty_fence)
		amdgpu_fence_driver_guilty_force_completion(guilty_fence);
	/* Re-emit the non-guilty commands */
+1 −1
Original line number Diff line number Diff line
@@ -155,7 +155,7 @@ extern const struct drm_sched_backend_ops amdgpu_sched_ops;
void amdgpu_fence_driver_clear_job_fences(struct amdgpu_ring *ring);
void amdgpu_fence_driver_set_error(struct amdgpu_ring *ring, int error);
void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring);
void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *fence);
void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *af);
void amdgpu_fence_save_wptr(struct dma_fence *fence);

int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring);