drm/amdgpu: Refine bad page adding

Bad page adding can be made simpler by using the NPS (memory partition) info stored with each record.

Signed-off-by: ganglxie <ganglxie@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
ganglxie
2025-02-24 15:03:05 +08:00
committed by Alex Deucher
parent e6aae1db41
commit 0153d27673

View File

@@ -2799,20 +2799,100 @@ static int amdgpu_ras_mca2pa(struct amdgpu_device *adev,
return -EINVAL;
}
/*
 * Append @count retired-page records to the RAS error-handler data,
 * skipping any page that is already tracked as bad.
 *
 * Returns 0 on success, -ENOMEM if the eh_data backing store cannot
 * be grown. Caller is expected to hold the recovery lock (the unlocked
 * check helper is used here).
 */
static int __amdgpu_ras_restore_bad_pages(struct amdgpu_device *adev,
		struct eeprom_table_record *bps, int count)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data = con->eh_data;
	int i;

	for (i = 0; i < count; i++) {
		/* already recorded as a bad page — nothing to do */
		if (amdgpu_ras_check_bad_page_unlock(con,
				bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT))
			continue;

		/* grow the record array in 256-entry steps when full */
		if (!data->space_left &&
		    amdgpu_ras_realloc_eh_data_space(adev, data, 256))
			return -ENOMEM;

		amdgpu_ras_reserve_page(adev, bps[i].retired_page);

		memcpy(&data->bps[data->count], &bps[i],
		       sizeof(struct eeprom_table_record));
		data->count++;
		data->space_left--;
	}

	return 0;
}
static int __amdgpu_ras_convert_rec_array_from_rom(struct amdgpu_device *adev,
struct eeprom_table_record *bps, struct ras_err_data *err_data,
enum amdgpu_memory_partition nps)
{
int i = 0;
enum amdgpu_memory_partition save_nps;
save_nps = (bps[0].retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK;
for (i = 0; i < adev->umc.retire_unit; i++)
bps[i].retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT);
if (save_nps) {
if (save_nps == nps) {
if (amdgpu_umc_pages_in_a_row(adev, err_data,
bps[0].retired_page << AMDGPU_GPU_PAGE_SHIFT))
return -EINVAL;
} else {
if (amdgpu_ras_mca2pa_by_idx(adev, &bps[0], err_data))
return -EINVAL;
}
} else {
if (amdgpu_ras_mca2pa(adev, &bps[0], err_data)) {
if (nps == AMDGPU_NPS1_PARTITION_MODE)
memcpy(err_data->err_addr, bps,
sizeof(struct eeprom_table_record) * adev->umc.retire_unit);
else
return -EOPNOTSUPP;
}
}
return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr, adev->umc.retire_unit);
}
static int __amdgpu_ras_convert_rec_from_rom(struct amdgpu_device *adev,
struct eeprom_table_record *bps, struct ras_err_data *err_data,
enum amdgpu_memory_partition nps)
{
enum amdgpu_memory_partition save_nps;
save_nps = (bps->retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK;
bps->retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT);
if (save_nps == nps) {
if (amdgpu_umc_pages_in_a_row(adev, err_data,
bps->retired_page << AMDGPU_GPU_PAGE_SHIFT))
return -EINVAL;
} else {
if (amdgpu_ras_mca2pa_by_idx(adev, bps, err_data))
return -EINVAL;
}
return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr,
adev->umc.retire_unit);
}
/* it deal with vram only. */
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
struct eeprom_table_record *bps, int pages, bool from_rom)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data *data;
struct ras_err_data err_data;
struct eeprom_table_record *err_rec;
struct amdgpu_ras_eeprom_control *control =
&adev->psp.ras_context.ras->eeprom_control;
enum amdgpu_memory_partition nps = AMDGPU_NPS1_PARTITION_MODE;
int ret = 0;
uint32_t i, j, loop_cnt = 1;
bool find_pages_per_pa = false;
uint32_t i;
if (!con || !con->eh_data || !bps || pages <= 0)
return 0;
@@ -2823,114 +2903,46 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
sizeof(struct eeprom_table_record), GFP_KERNEL);
if (!err_data.err_addr) {
dev_warn(adev->dev, "Failed to alloc UMC error address record in mca2pa conversion!\n");
ret = -ENOMEM;
goto out;
return -ENOMEM;
}
err_rec = err_data.err_addr;
loop_cnt = adev->umc.retire_unit;
if (adev->gmc.gmc_funcs->query_mem_partition_mode)
nps = adev->gmc.gmc_funcs->query_mem_partition_mode(adev);
}
mutex_lock(&con->recovery_lock);
data = con->eh_data;
if (!data) {
/* Returning 0 as the absence of eh_data is acceptable */
goto free;
}
for (i = 0; i < pages; i++) {
if (from_rom &&
control->rec_type == AMDGPU_RAS_EEPROM_REC_MCA) {
if (!find_pages_per_pa) {
if (amdgpu_ras_mca2pa_by_idx(adev, &bps[i], &err_data)) {
if (!i && nps == AMDGPU_NPS1_PARTITION_MODE) {
/* may use old RAS TA, use PA to find pages in
* one row
*/
if (amdgpu_umc_pages_in_a_row(adev, &err_data,
bps[i].retired_page <<
AMDGPU_GPU_PAGE_SHIFT)) {
ret = -EINVAL;
goto free;
} else {
find_pages_per_pa = true;
}
} else {
/* unsupported cases */
ret = -EOPNOTSUPP;
if (from_rom) {
for (i = 0; i < pages; i++) {
if (control->ras_num_recs - i >= adev->umc.retire_unit) {
if ((bps[i].address == bps[i + 1].address) &&
(bps[i].mem_channel == bps[i + 1].mem_channel)) {
//deal with retire_unit records a time
ret = __amdgpu_ras_convert_rec_array_from_rom(adev,
&bps[i], &err_data, nps);
if (ret)
goto free;
}
}
} else {
if (amdgpu_umc_pages_in_a_row(adev, &err_data,
bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT)) {
ret = -EINVAL;
goto free;
}
}
} else {
if (from_rom && !find_pages_per_pa) {
if (bps[i].retired_page & UMC_CHANNEL_IDX_V2) {
/* bad page in any NPS mode in eeprom */
if (amdgpu_ras_mca2pa_by_idx(adev, &bps[i], &err_data)) {
ret = -EINVAL;
goto free;
}
} else {
/* legacy bad page in eeprom, generated only in
* NPS1 mode
*/
if (amdgpu_ras_mca2pa(adev, &bps[i], &err_data)) {
/* old RAS TA or ASICs which don't support to
* convert addrss via mca address
*/
if (!i && nps == AMDGPU_NPS1_PARTITION_MODE) {
find_pages_per_pa = true;
err_rec = &bps[i];
loop_cnt = 1;
} else {
/* non-nps1 mode, old RAS TA
* can't support it
*/
ret = -EOPNOTSUPP;
goto free;
}
}
}
if (!find_pages_per_pa)
i += (adev->umc.retire_unit - 1);
} else {
break;
}
} else {
err_rec = &bps[i];
break;
}
}
for (j = 0; j < loop_cnt; j++) {
if (amdgpu_ras_check_bad_page_unlock(con,
err_rec[j].retired_page << AMDGPU_GPU_PAGE_SHIFT))
continue;
if (!data->space_left &&
amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
ret = -ENOMEM;
for (; i < pages; i++) {
ret = __amdgpu_ras_convert_rec_from_rom(adev,
&bps[i], &err_data, nps);
if (ret)
goto free;
}
amdgpu_ras_reserve_page(adev, err_rec[j].retired_page);
memcpy(&data->bps[data->count], &(err_rec[j]),
sizeof(struct eeprom_table_record));
data->count++;
data->space_left--;
}
} else {
ret = __amdgpu_ras_restore_bad_pages(adev, bps, pages);
}
free:
if (from_rom)
kfree(err_data.err_addr);
out:
mutex_unlock(&con->recovery_lock);
return ret;