Commit 408bd841 authored by YiPeng Chai, committed by Alex Deucher
Browse files

drm/amdgpu: Improve ras fatal error handling function



In the multi-GPU case, a single fatal error generates several
fatal error interrupts. With this change, the RAS module can
reuse amdgpu_ras_global_ras_isr() so that only the first
interrupt is handled.

V3:
  Initialize event_id using RAS_EVENT_INVALID_ID.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 3d72d2e5
Loading
Loading
Loading
Loading
+8 −8
Original line number Diff line number Diff line
@@ -4650,19 +4650,17 @@ u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type
	return id;
}

void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
int amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
{
	if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
		struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
		enum ras_event_type type = RAS_EVENT_TYPE_FATAL;
		u64 event_id;
		u64 event_id = RAS_EVENT_INVALID_ID;

		if (amdgpu_ras_mark_ras_event(adev, type)) {
			dev_err(adev->dev,
				"uncorrectable hardware error (ERREVENT_ATHUB_INTERRUPT) detected!\n");
			return;
		}
		if (amdgpu_uniras_enabled(adev))
			return 0;

		if (!amdgpu_ras_mark_ras_event(adev, type))
			event_id = amdgpu_ras_acquire_event_id(adev, type);

		RAS_EVENT_LOG(adev, event_id, "uncorrectable hardware error"
@@ -4672,6 +4670,8 @@ void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
		ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
		amdgpu_ras_reset_gpu(adev);
	}

	return -EBUSY;
}

bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev)
+1 −1
Original line number Diff line number Diff line
@@ -910,7 +910,7 @@ static inline void amdgpu_ras_intr_cleared(void)
	atomic_set(&amdgpu_ras_in_intr, 0);
}

void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev);
int amdgpu_ras_global_ras_isr(struct amdgpu_device *adev);

void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready);

+5 −0
Original line number Diff line number Diff line
@@ -29,8 +29,13 @@
static int amdgpu_ras_sys_detect_fatal_event(struct ras_core_context *ras_core, void *data)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
	int ret;
	uint64_t seq_no;

	ret = amdgpu_ras_global_ras_isr(adev);
	if (ret)
		return ret;

	seq_no = amdgpu_ras_mgr_gen_ras_event_seqno(adev, RAS_SEQNO_TYPE_UE);
	RAS_DEV_INFO(adev,
		"{%llu} Uncorrectable hardware error(ERREVENT_ATHUB_INTERRUPT) detected!\n",