Commit d95ca7f5 authored by YiPeng Chai, committed by Alex Deucher
Browse files

drm/amdgpu: suspend ras module before gpu reset



During a GPU reset, all GPU-related resources are
inaccessible. To avoid affecting RAS functionality,
suspend the RAS module before the GPU reset and resume
it after the GPU reset is complete.

V2:
  Rename functions to avoid misunderstanding.

V3:
  Move flush_delayed_work to amdgpu_ras_process_pause,
  Move schedule_delayed_work to amdgpu_ras_process_unpause.

V4:
  Rename functions.

V5:
  Move the function to amdgpu_ras.c.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Acked-by: Lijo Lazar <lijo.lazar@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent d4432f16
Loading
Loading
Loading
Loading
+5 −0
Original line number Diff line number Diff line
@@ -71,6 +71,7 @@

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_ras_mgr.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"
@@ -6660,6 +6661,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
			goto end_reset;
	}

	/* Cannot be called after locking reset domain */
	amdgpu_ras_pre_reset(adev, &device_list);

	/* We need to lock reset domain only once both for XGMI and single device */
	amdgpu_device_recovery_get_reset_lock(adev, &device_list);

@@ -6691,6 +6695,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
reset_unlock:
	amdgpu_device_recovery_put_reset_lock(adev, &device_list);
end_reset:
	amdgpu_ras_post_reset(adev, &device_list);
	if (hive) {
		mutex_unlock(&hive->hive_lock);
		amdgpu_put_xgmi_hive(hive);
+28 −2
Original line number Diff line number Diff line
@@ -2921,9 +2921,13 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
		type = amdgpu_ras_get_fatal_error_event(adev);
		list_for_each_entry(remote_adev,
				device_list_handle, gmc.xgmi.head) {
			if (amdgpu_uniras_enabled(remote_adev)) {
				amdgpu_ras_mgr_update_ras_ecc(remote_adev);
			} else {
				amdgpu_ras_query_err_status(remote_adev);
				amdgpu_ras_log_on_err_counter(remote_adev, type);
			}
		}

	}

@@ -5673,3 +5677,25 @@ bool amdgpu_ras_check_critical_address(struct amdgpu_device *adev, uint64_t addr

	return ret;
}

/**
 * amdgpu_ras_pre_reset - run the RAS manager pre-reset hook on a device list
 * @adev: amdgpu device pointer (not referenced by this function)
 * @device_list: list of devices, linked through their reset_list member
 *
 * Walks @device_list and calls amdgpu_ras_mgr_pre_reset() for every device
 * on which amdgpu_uniras_enabled() reports true. Devices without unified
 * RAS are skipped. The return value of the manager hook is not checked.
 *
 * The call site in amdgpu_device_gpu_recover() notes that this cannot be
 * called after locking the reset domain.
 */
void amdgpu_ras_pre_reset(struct amdgpu_device *adev,
			  struct list_head *device_list)
{
	struct amdgpu_device *reset_dev;

	list_for_each_entry(reset_dev, device_list, reset_list) {
		if (!amdgpu_uniras_enabled(reset_dev))
			continue;

		amdgpu_ras_mgr_pre_reset(reset_dev);
	}
}

/**
 * amdgpu_ras_post_reset - run the RAS manager post-reset hook on a device list
 * @adev: amdgpu device pointer (not referenced by this function)
 * @device_list: list of devices, linked through their reset_list member
 *
 * Walks @device_list and calls amdgpu_ras_mgr_post_reset() for every device
 * on which amdgpu_uniras_enabled() reports true. Devices without unified
 * RAS are skipped. The return value of the manager hook is not checked.
 */
void amdgpu_ras_post_reset(struct amdgpu_device *adev,
			   struct list_head *device_list)
{
	struct amdgpu_device *reset_dev;

	list_for_each_entry(reset_dev, device_list, reset_list) {
		if (!amdgpu_uniras_enabled(reset_dev))
			continue;

		amdgpu_ras_mgr_post_reset(reset_dev);
	}
}
+5 −0
Original line number Diff line number Diff line
@@ -1039,4 +1039,9 @@ void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id,
				const char *fmt, ...);

bool amdgpu_ras_is_rma(struct amdgpu_device *adev);

/*
 * RAS hooks called around a GPU reset. Each one walks device_list (devices
 * linked through reset_list) and forwards to the matching RAS manager hook
 * for every device that has unified RAS enabled.
 */
void amdgpu_ras_pre_reset(struct amdgpu_device *adev,
					  struct list_head *device_list);
void amdgpu_ras_post_reset(struct amdgpu_device *adev,
					  struct list_head *device_list);
#endif
+22 −0
Original line number Diff line number Diff line
@@ -624,3 +624,25 @@ int amdgpu_ras_mgr_handle_ras_cmd(struct amdgpu_device *adev,

	return ret;
}

/**
 * amdgpu_ras_mgr_pre_reset - RAS manager hook run before a GPU reset
 * @adev: amdgpu device pointer
 *
 * When the RAS manager reports ready, hands off to
 * amdgpu_ras_process_pre_reset(). Otherwise an error is logged and nothing
 * else is done.
 *
 * Return: 0 on success, -EPERM if the RAS manager is not ready.
 */
int amdgpu_ras_mgr_pre_reset(struct amdgpu_device *adev)
{
	if (amdgpu_ras_mgr_is_ready(adev)) {
		amdgpu_ras_process_pre_reset(adev);
		return 0;
	}

	/* Manager not ready: report and refuse. */
	RAS_DEV_ERR(adev, "Invalid ras suspend!\n");
	return -EPERM;
}

/**
 * amdgpu_ras_mgr_post_reset - RAS manager hook run after a GPU reset
 * @adev: amdgpu device pointer
 *
 * When the RAS manager reports ready, hands off to
 * amdgpu_ras_process_post_reset(). Otherwise an error is logged and nothing
 * else is done.
 *
 * Return: 0 on success, -EPERM if the RAS manager is not ready.
 */
int amdgpu_ras_mgr_post_reset(struct amdgpu_device *adev)
{
	if (amdgpu_ras_mgr_is_ready(adev)) {
		amdgpu_ras_process_post_reset(adev);
		return 0;
	}

	/* Manager not ready: report and refuse. */
	RAS_DEV_ERR(adev, "Invalid ras resume!\n");
	return -EPERM;
}
+5 −0
Original line number Diff line number Diff line
@@ -52,6 +52,9 @@ struct amdgpu_ras_mgr {
	struct ras_event_manager ras_event_mgr;
	uint64_t last_poison_consumption_seqno;
	bool ras_is_ready;

	bool is_paused;
	struct completion ras_event_done;
};

extern const struct amdgpu_ip_block_version ras_v1_0_ip_block;
@@ -75,4 +78,6 @@ bool amdgpu_ras_mgr_is_rma(struct amdgpu_device *adev);
int amdgpu_ras_mgr_handle_ras_cmd(struct amdgpu_device *adev,
		uint32_t cmd_id, void *input, uint32_t input_size,
		void *output, uint32_t out_size);
/*
 * RAS manager hooks run before/after a GPU reset. Each returns 0 on success
 * or -EPERM (after logging an error) when the RAS manager is not ready.
 */
int amdgpu_ras_mgr_pre_reset(struct amdgpu_device *adev);
int amdgpu_ras_mgr_post_reset(struct amdgpu_device *adev);
#endif
Loading