Commit c0470691 authored by YiPeng Chai, committed by Alex Deucher
Browse files

drm/amdgpu: flush all cached ras bad pages to eeprom



Before the GPU driver is unloaded, flush all cached RAS
bad pages to the EEPROM.

v2:
  Move the duplicated code into a helper function and reuse it.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent c3938571
Loading
Loading
Loading
Loading
+29 −6
Original line number Diff line number Diff line
@@ -124,6 +124,8 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)

/*
 * Delay, in milliseconds, used when the page retirement work re-queues
 * itself; the value is converted with msecs_to_jiffies() by its users.
 */
#define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100  /* ms */

/*
 * Retry budget for the flush loop in amdgpu_ras_recovery_fini(): it caps
 * how many times the retirement work is flushed and re-queued so the
 * teardown loop is guaranteed to terminate.
 */
#define MAX_FLUSH_RETIRE_DWORK_TIMES  100

enum amdgpu_ras_retire_page_reservation {
	AMDGPU_RAS_RETIRE_PAGE_RESERVED,
	AMDGPU_RAS_RETIRE_PAGE_PENDING,
@@ -2907,6 +2909,23 @@ static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
	ecc_log->prev_de_queried_count = 0;
}

/*
 * Queue the page retirement delayed work when the UMC ECC log still holds
 * entries carrying UMC_ECC_NEW_DETECTED_TAG.
 *
 * @con:        RAS context that owns the ECC log and the delayed work
 * @delayed_ms: delay before the work should run, in milliseconds
 *
 * The tag lookup is performed under umc_ecc_log.lock; the lock is released
 * before schedule_delayed_work() is called.
 *
 * Returns true when tagged entries were found and schedule_delayed_work()
 * was called, false when there was nothing to schedule.
 */
static bool amdgpu_ras_schedule_retirement_dwork(struct amdgpu_ras *con,
				uint32_t delayed_ms)
{
	bool has_new_pages;

	mutex_lock(&con->umc_ecc_log.lock);
	has_new_pages = !!radix_tree_tagged(&con->umc_ecc_log.de_page_tree,
					    UMC_ECC_NEW_DETECTED_TAG);
	mutex_unlock(&con->umc_ecc_log.lock);

	/* Nothing newly detected: leave the delayed work alone. */
	if (!has_new_pages)
		return false;

	schedule_delayed_work(&con->page_retirement_dwork,
			      msecs_to_jiffies(delayed_ms));
	return true;
}

static void amdgpu_ras_do_page_retirement(struct work_struct *work)
{
	struct amdgpu_ras *con = container_of(work, struct amdgpu_ras,
@@ -2928,12 +2947,8 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work)
	if (err_cnt && con->is_rma)
		amdgpu_ras_reset_gpu(adev);

	mutex_lock(&con->umc_ecc_log.lock);
	if (radix_tree_tagged(&con->umc_ecc_log.de_page_tree,
				UMC_ECC_NEW_DETECTED_TAG))
		schedule_delayed_work(&con->page_retirement_dwork,
			msecs_to_jiffies(AMDGPU_RAS_RETIRE_PAGE_INTERVAL));
	mutex_unlock(&con->umc_ecc_log.lock);
	amdgpu_ras_schedule_retirement_dwork(con,
			AMDGPU_RAS_RETIRE_PAGE_INTERVAL);
}

static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
@@ -3237,11 +3252,19 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data = con->eh_data;
	int max_flush_timeout = MAX_FLUSH_RETIRE_DWORK_TIMES;
	bool ret;

	/* recovery_init failed to init it, fini is useless */
	if (!data)
		return 0;

	/* Save all cached bad pages to eeprom */
	do {
		flush_delayed_work(&con->page_retirement_dwork);
		ret = amdgpu_ras_schedule_retirement_dwork(con, 0);
	} while (ret && max_flush_timeout--);

	if (con->page_retirement_thread)
		kthread_stop(con->page_retirement_thread);