Commit 2fc46e0b authored by Tao Zhou, committed by Alex Deucher
Browse files

drm/amdgpu: make reset method configurable for RAS poison



Each RAS block has different requirements for GPU reset in poison
consumption handling, so let the caller specify the reset method.
Add support for mmhub RAS poison consumption handling.

v2: remove the mmhub poison support for kfd int v10.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent e3d4de8d
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -748,7 +748,7 @@ bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev)
}

void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
	enum amdgpu_ras_block block, bool reset)
	enum amdgpu_ras_block block, uint32_t reset)
{
	amdgpu_umc_poison_handler(adev, block, reset);
}
+1 −1
Original line number Diff line number Diff line
@@ -336,7 +336,7 @@ void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev);
int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
				struct tile_config *config);
void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
			enum amdgpu_ras_block block, bool reset);
			enum amdgpu_ras_block block, uint32_t reset);
bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev);
bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem);
void amdgpu_amdkfd_block_mmu_notifications(void *p);
+2 −2
Original line number Diff line number Diff line
@@ -2051,7 +2051,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
		}
	}

	amdgpu_umc_poison_handler(adev, obj->head.block, false);
	amdgpu_umc_poison_handler(adev, obj->head.block, 0);

	if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption)
		poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
@@ -2704,7 +2704,7 @@ static int amdgpu_ras_page_retirement_thread(void *param)
		atomic_dec(&con->page_retirement_req_cnt);

		amdgpu_umc_bad_page_polling_timeout(adev,
				false, MAX_UMC_POISON_POLLING_TIME_ASYNC);
				0, MAX_UMC_POISON_POLLING_TIME_ASYNC);
	}

	return 0;
+7 −9
Original line number Diff line number Diff line
@@ -177,7 +177,7 @@ static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
		void *ras_error_status,
		struct amdgpu_iv_entry *entry,
		bool reset)
		uint32_t reset)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
@@ -186,9 +186,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
	amdgpu_umc_handle_bad_pages(adev, ras_error_status);

	if (err_data->ue_count && reset) {
		/* use mode-2 reset for poison consumption */
		if (!entry)
			con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET;
		con->gpu_reset_flags |= reset;
		amdgpu_ras_reset_gpu(adev);
	}

@@ -196,7 +194,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
}

int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
			bool reset, uint32_t timeout_ms)
			uint32_t reset, uint32_t timeout_ms)
{
	struct ras_err_data err_data;
	struct ras_common_if head = {
@@ -238,8 +236,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
	if (reset) {
		struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

		/* use mode-2 reset for poison consumption */
		con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET;
		con->gpu_reset_flags |= reset;
		amdgpu_ras_reset_gpu(adev);
	}

@@ -247,7 +244,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
}

int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
			enum amdgpu_ras_block block, bool reset)
			enum amdgpu_ras_block block, uint32_t reset)
{
	int ret = AMDGPU_RAS_SUCCESS;

@@ -311,7 +308,8 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
		void *ras_error_status,
		struct amdgpu_iv_entry *entry)
{
	return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, true);
	return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry,
				AMDGPU_RAS_GPU_RESET_MODE1_RESET);
}

int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev)
+2 −2
Original line number Diff line number Diff line
@@ -101,7 +101,7 @@ struct amdgpu_umc {
int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev);
int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block);
int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
			enum amdgpu_ras_block block, bool reset);
			enum amdgpu_ras_block block, uint32_t reset);
int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
		struct amdgpu_irq_src *source,
		struct amdgpu_iv_entry *entry);
@@ -121,5 +121,5 @@ int amdgpu_umc_loop_channels(struct amdgpu_device *adev,
			umc_func func, void *data);

int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
			bool reset, uint32_t timeout_ms);
			uint32_t reset, uint32_t timeout_ms);
#endif
Loading