Commit f5db5906 authored by ganglxie's avatar ganglxie Committed by Alex Deucher
Browse files

Refine RAS bad page records counting and parsing in eeprom V3



there is only MCA records in V3, no need to care about PA records.
recalculate the value of ras_num_bad_pages when parsing failed and
go on with the left records instead of quit.

Signed-off-by: default avatarganglxie <ganglxie@amd.com>
Reviewed-by: default avatarTao Zhou <tao.zhou1@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent 3ab3d680
Loading
Loading
Loading
Loading
+36 −26
Original line number Diff line number Diff line
@@ -2889,6 +2889,7 @@ static int __amdgpu_ras_convert_rec_from_rom(struct amdgpu_device *adev,
		if (amdgpu_ras_mca2pa_by_idx(adev, bps, err_data))
			return -EINVAL;
	}

	return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr,
									adev->umc.retire_unit);
}
@@ -2903,7 +2904,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
			&adev->psp.ras_context.ras->eeprom_control;
	enum amdgpu_memory_partition nps = AMDGPU_NPS1_PARTITION_MODE;
	int ret = 0;
	uint32_t i;
	uint32_t i = 0;

	if (!con || !con->eh_data || !bps || pages <= 0)
		return 0;
@@ -2924,15 +2925,17 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
	mutex_lock(&con->recovery_lock);

	if (from_rom) {
		/* there is no pa recs in V3, so skip pa recs processing */
		if (control->tbl_hdr.version < RAS_TABLE_VER_V3) {
			for (i = 0; i < pages; i++) {
				if (control->ras_num_recs - i >= adev->umc.retire_unit) {
					if ((bps[i].address == bps[i + 1].address) &&
						(bps[i].mem_channel == bps[i + 1].mem_channel)) {
					//deal with retire_unit records a time
						/* deal with retire_unit records a time */
						ret = __amdgpu_ras_convert_rec_array_from_rom(adev,
										&bps[i], &err_data, nps);
						if (ret)
						goto free;
							control->ras_num_bad_pages -= adev->umc.retire_unit;
						i += (adev->umc.retire_unit - 1);
					} else {
						break;
@@ -2941,17 +2944,17 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
					break;
				}
			}
		}
		for (; i < pages; i++) {
			ret = __amdgpu_ras_convert_rec_from_rom(adev,
				&bps[i], &err_data, nps);
			if (ret)
				goto free;
				control->ras_num_bad_pages -= adev->umc.retire_unit;
		}
	} else {
		ret = __amdgpu_ras_restore_bad_pages(adev, bps, pages);
	}

free:
	if (from_rom)
		kfree(err_data.err_addr);
	mutex_unlock(&con->recovery_lock);
@@ -3040,6 +3043,10 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
		dev_err(adev->dev, "Failed to load EEPROM table records!");
	} else {
		if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) {
			/*In V3, there is no pa recs, and some cases(when address==0) may be parsed
			as pa recs, so add verion check to avoid it.
			*/
			if (control->tbl_hdr.version < RAS_TABLE_VER_V3) {
				for (i = 0; i < control->ras_num_recs; i++) {
					if ((control->ras_num_recs - i) >= adev->umc.retire_unit) {
						if ((bps[i].address == bps[i + 1].address) &&
@@ -3056,6 +3063,9 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
						break;
					}
				}
			} else {
				control->ras_num_mca_recs = control->ras_num_recs;
			}
		}

		ret = amdgpu_ras_eeprom_check(control);