mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git/
synced 2026-04-28 21:46:43 -04:00
drm/amdgpu: enable watchdog feature for SQ of aldebaran
SQ's watchdog timer monitors forward progress, a mask of which waves caused the watchdog timeout is recorded into ras status registers and then trigger a system fatal error event. v2: 1. change *query_timeout_status to *query_sq_timeout_status. 2. move query_sq_timeout_status into amdgpu_ras_do_recovery. 3. add module parameters to enable/disable fatal error event and modify the watchdog timer. v3: 1. remove unused parameters of *enable_watchdog_timer Signed-off-by: Dennis Li <Dennis.Li@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
@@ -1129,3 +1129,109 @@ void gfx_v9_4_2_query_ras_error_status(struct amdgpu_device *adev)
|
||||
gfx_v9_4_2_query_ea_err_status(adev);
|
||||
gfx_v9_4_2_query_utc_err_status(adev);
|
||||
}
|
||||
|
||||
void gfx_v9_4_2_enable_watchdog_timer(struct amdgpu_device *adev)
|
||||
{
|
||||
uint32_t i;
|
||||
uint32_t data;
|
||||
|
||||
data = REG_SET_FIELD(0, SQ_TIMEOUT_CONFIG, TIMEOUT_FATAL_DISABLE,
|
||||
amdgpu_watchdog_timer.timeout_fatal_disable ? 1 :
|
||||
0);
|
||||
data = REG_SET_FIELD(data, SQ_TIMEOUT_CONFIG, PERIOD_SEL,
|
||||
amdgpu_watchdog_timer.period);
|
||||
|
||||
mutex_lock(&adev->grbm_idx_mutex);
|
||||
for (i = 0; i < adev->gfx.config.max_shader_engines; i++) {
|
||||
gfx_v9_4_2_select_se_sh(adev, i, 0xffffffff, 0xffffffff);
|
||||
WREG32_SOC15(GC, 0, regSQ_TIMEOUT_CONFIG, data);
|
||||
}
|
||||
gfx_v9_4_2_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff);
|
||||
mutex_unlock(&adev->grbm_idx_mutex);
|
||||
}
|
||||
|
||||
static uint32_t wave_read_ind(struct amdgpu_device *adev, uint32_t simd, uint32_t wave, uint32_t address)
|
||||
{
|
||||
WREG32_SOC15_RLC_EX(reg, GC, 0, regSQ_IND_INDEX,
|
||||
(wave << SQ_IND_INDEX__WAVE_ID__SHIFT) |
|
||||
(simd << SQ_IND_INDEX__SIMD_ID__SHIFT) |
|
||||
(address << SQ_IND_INDEX__INDEX__SHIFT) |
|
||||
(SQ_IND_INDEX__FORCE_READ_MASK));
|
||||
return RREG32_SOC15(GC, 0, regSQ_IND_DATA);
|
||||
}
|
||||
|
||||
static void gfx_v9_4_2_log_cu_timeout_status(struct amdgpu_device *adev,
|
||||
uint32_t status)
|
||||
{
|
||||
struct amdgpu_cu_info *cu_info = &adev->gfx.cu_info;
|
||||
uint32_t i, simd, wave;
|
||||
uint32_t wave_status;
|
||||
uint32_t wave_pc_lo, wave_pc_hi;
|
||||
uint32_t wave_exec_lo, wave_exec_hi;
|
||||
uint32_t wave_inst_dw0, wave_inst_dw1;
|
||||
uint32_t wave_ib_sts;
|
||||
|
||||
for (i = 0; i < 32; i++) {
|
||||
if (!((i << 1) & status))
|
||||
continue;
|
||||
|
||||
simd = i / cu_info->max_waves_per_simd;
|
||||
wave = i % cu_info->max_waves_per_simd;
|
||||
|
||||
wave_status = wave_read_ind(adev, simd, wave, ixSQ_WAVE_STATUS);
|
||||
wave_pc_lo = wave_read_ind(adev, simd, wave, ixSQ_WAVE_PC_LO);
|
||||
wave_pc_hi = wave_read_ind(adev, simd, wave, ixSQ_WAVE_PC_HI);
|
||||
wave_exec_lo =
|
||||
wave_read_ind(adev, simd, wave, ixSQ_WAVE_EXEC_LO);
|
||||
wave_exec_hi =
|
||||
wave_read_ind(adev, simd, wave, ixSQ_WAVE_EXEC_HI);
|
||||
wave_inst_dw0 =
|
||||
wave_read_ind(adev, simd, wave, ixSQ_WAVE_INST_DW0);
|
||||
wave_inst_dw1 =
|
||||
wave_read_ind(adev, simd, wave, ixSQ_WAVE_INST_DW1);
|
||||
wave_ib_sts = wave_read_ind(adev, simd, wave, ixSQ_WAVE_IB_STS);
|
||||
|
||||
dev_info(
|
||||
adev->dev,
|
||||
"\t SIMD %d, Wave %d: status 0x%x, pc 0x%llx, exec 0x%llx, inst 0x%llx, ib_sts 0x%x\n",
|
||||
simd, wave, wave_status,
|
||||
((uint64_t)wave_pc_hi << 32 | wave_pc_lo),
|
||||
((uint64_t)wave_exec_hi << 32 | wave_exec_lo),
|
||||
((uint64_t)wave_inst_dw1 << 32 | wave_inst_dw0),
|
||||
wave_ib_sts);
|
||||
}
|
||||
}
|
||||
|
||||
void gfx_v9_4_2_query_sq_timeout_status(struct amdgpu_device *adev)
|
||||
{
|
||||
uint32_t se_idx, sh_idx, cu_idx;
|
||||
uint32_t status;
|
||||
|
||||
mutex_lock(&adev->grbm_idx_mutex);
|
||||
for (se_idx = 0; se_idx < adev->gfx.config.max_shader_engines;
|
||||
se_idx++) {
|
||||
for (sh_idx = 0; sh_idx < adev->gfx.config.max_sh_per_se;
|
||||
sh_idx++) {
|
||||
for (cu_idx = 0;
|
||||
cu_idx < adev->gfx.config.max_cu_per_sh;
|
||||
cu_idx++) {
|
||||
gfx_v9_4_2_select_se_sh(adev, se_idx, sh_idx,
|
||||
cu_idx);
|
||||
status = RREG32_SOC15(GC, 0,
|
||||
regSQ_TIMEOUT_STATUS);
|
||||
if (status != 0) {
|
||||
dev_info(
|
||||
adev->dev,
|
||||
"GFX Watchdog Timeout: SE %d, SH %d, CU %d\n",
|
||||
se_idx, sh_idx, cu_idx);
|
||||
gfx_v9_4_2_log_cu_timeout_status(
|
||||
adev, status);
|
||||
}
|
||||
/* clear old status */
|
||||
WREG32_SOC15(GC, 0, regSQ_TIMEOUT_STATUS, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
gfx_v9_4_2_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff);
|
||||
mutex_unlock(&adev->grbm_idx_mutex);
|
||||
}
|
||||
Reference in New Issue
Block a user