Commit 1b6ef74b authored by Lijo Lazar, committed by Alex Deucher
Browse files

drm/amdgpu: Add fatal error detected flag



For a RAS error that needs a full reset to recover, set the fatal error
status. Clear the status once the device is reset.

Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
Reviewed-by: Asad Kamal <asad.kamal@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 34b811a2
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -5321,6 +5321,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		if (need_full_reset) {
			/* post card */
			amdgpu_ras_set_fed(tmp_adev, false);
			r = amdgpu_device_asic_init(tmp_adev);
			if (r) {
				dev_warn(tmp_adev->dev, "asic atom init failed!");
+32 −0
Original line number Diff line number Diff line
@@ -2439,6 +2439,18 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
				ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE1_RESET;
				set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);

				/* For any RAS error that needs a full reset to
				 * recover, set the fatal error status
				 */
				if (hive) {
					list_for_each_entry(remote_adev,
							    &hive->device_list,
							    gmc.xgmi.head)
						amdgpu_ras_set_fed(remote_adev,
								   true);
				} else {
					amdgpu_ras_set_fed(adev, true);
				}
				psp_fatal_error_recovery_quirk(&adev->psp);
			}
		}
@@ -3440,6 +3452,26 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
	return 0;
}

/**
 * amdgpu_ras_get_fed_status - read the fatal error detected flag
 * @adev: device whose RAS context is queried
 *
 * Return: true when the RAS context exists and its fed flag is non-zero,
 * false when the flag is zero or the device has no RAS context.
 */
bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	/* No RAS context means no fatal error can have been recorded. */
	return con && atomic_read(&con->fed);
}

/**
 * amdgpu_ras_set_fed - update the fatal error detected flag
 * @adev: device whose RAS context is updated
 * @status: true to mark a fatal error as detected, false to clear the mark
 *
 * Does nothing when the device has no RAS context.
 */
void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!con)
		return;

	/* The flag is stored as a strict 0/1 value. */
	atomic_set(&con->fed, status ? 1 : 0);
}

void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
{
	if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
+6 −0
Original line number Diff line number Diff line
@@ -477,6 +477,8 @@ struct amdgpu_ras {
	wait_queue_head_t page_retirement_wq;
	struct mutex page_retirement_lock;
	atomic_t page_retirement_req_cnt;
	/* Fatal error detected flag */
	atomic_t fed;
};

struct ras_fs_data {
@@ -873,4 +875,8 @@ void amdgpu_ras_add_mca_err_addr(struct ras_err_info *err_info,

void amdgpu_ras_del_mca_err_addr(struct ras_err_info *err_info,
		struct ras_err_addr *mca_err_addr);

/* Set (status == true) or clear (status == false) the fatal error detected
 * flag in the device's RAS context; no-op when the device has no RAS context.
 */
void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status);
/* Return the fatal error detected flag of the device's RAS context, or false
 * when the device has no RAS context.
 */
bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev);

#endif