Commit 9b5b7189 authored by Tao Zhou, committed by Alex Deucher
Browse files

drm/amdgpu: implement get_retire_flip_bits for UMC v12



The RAS bad page retire flip bits can be set per vram type,
vram vendor and NPS mode.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 699bff37
Loading
Loading
Loading
Loading
+0 −1
Original line number Diff line number Diff line
@@ -1502,7 +1502,6 @@ static void gmc_v9_0_set_umc_funcs(struct amdgpu_device *adev)
		adev->umc.umc_inst_num = UMC_V12_0_UMC_INSTANCE_NUM;
		adev->umc.node_inst_num /= UMC_V12_0_UMC_INSTANCE_NUM;
		adev->umc.channel_offs = UMC_V12_0_PER_CHANNEL_OFFSET;
		adev->umc.retire_unit = UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL;
		if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu)
			adev->umc.ras = &umc_v12_0_ras;
		break;
+53 −29
Original line number Diff line number Diff line
@@ -174,6 +174,49 @@ static void umc_v12_0_query_ras_error_count(struct amdgpu_device *adev,
	umc_v12_0_reset_error_count(adev);
}

static void umc_v12_0_get_retire_flip_bits(struct amdgpu_device *adev)
{
	enum amdgpu_memory_partition nps = AMDGPU_NPS1_PARTITION_MODE;
	uint32_t vram_type = adev->gmc.vram_type;
	struct amdgpu_umc_flip_bits *flip_bits = &(adev->umc.flip_bits);

	if (adev->gmc.gmc_funcs->query_mem_partition_mode)
		nps = adev->gmc.gmc_funcs->query_mem_partition_mode(adev);

	/* default setting */
	flip_bits->flip_bits_in_pa[0] = UMC_V12_0_PA_C2_BIT;
	flip_bits->flip_bits_in_pa[1] = UMC_V12_0_PA_C3_BIT;
	flip_bits->flip_bits_in_pa[2] = UMC_V12_0_PA_C4_BIT;
	flip_bits->flip_bits_in_pa[3] = UMC_V12_0_PA_R13_BIT;
	flip_bits->bit_num = 4;

	switch (vram_type) {
	case AMDGPU_VRAM_TYPE_HBM:
		/* other nps modes are taken as nps1 */
		if (nps == AMDGPU_NPS2_PARTITION_MODE) {
			flip_bits->flip_bits_in_pa[0] = UMC_V12_0_PA_CH5_BIT;
			flip_bits->flip_bits_in_pa[1] = UMC_V12_0_PA_C2_BIT;
			flip_bits->flip_bits_in_pa[2] = UMC_V12_0_PA_B1_BIT;
			flip_bits->flip_bits_in_pa[3] = UMC_V12_0_PA_R12_BIT;
		}

		if (nps == AMDGPU_NPS4_PARTITION_MODE) {
			flip_bits->flip_bits_in_pa[0] = UMC_V12_0_PA_CH4_BIT;
			flip_bits->flip_bits_in_pa[1] = UMC_V12_0_PA_CH5_BIT;
			flip_bits->flip_bits_in_pa[2] = UMC_V12_0_PA_B0_BIT;
			flip_bits->flip_bits_in_pa[3] = UMC_V12_0_PA_R11_BIT;
		}

		break;
	default:
		dev_warn(adev->dev,
			"Unknown HBM type, set RAS retire flip bits to the value in NPS1 mode.\n");
		break;
	}

	adev->umc.retire_unit = 0x1 << flip_bits->bit_num;
}

static int umc_v12_0_convert_error_address(struct amdgpu_device *adev,
					struct ras_err_data *err_data,
					struct ta_ras_query_address_input *addr_in,
@@ -182,11 +225,10 @@ static int umc_v12_0_convert_error_address(struct amdgpu_device *adev,
{
	uint32_t col, col_lower, row, row_lower, row_high, bank;
	uint32_t channel_index = 0, umc_inst = 0;
	uint32_t i, loop_bits[UMC_V12_0_RETIRE_LOOP_BITS];
	uint32_t i, bit_num, retire_unit, *flip_bits;
	uint64_t soc_pa, column, err_addr;
	struct ta_ras_query_address_output addr_out_tmp;
	struct ta_ras_query_address_output *paddr_out;
	enum amdgpu_memory_partition nps = AMDGPU_NPS1_PARTITION_MODE;
	int ret = 0;

	if (!addr_out)
@@ -211,34 +253,15 @@ static int umc_v12_0_convert_error_address(struct amdgpu_device *adev,
		umc_inst = addr_in->ma.umc_inst;
	}

	loop_bits[0] = UMC_V12_0_PA_C2_BIT;
	loop_bits[1] = UMC_V12_0_PA_C3_BIT;
	loop_bits[2] = UMC_V12_0_PA_C4_BIT;
	loop_bits[3] = UMC_V12_0_PA_R13_BIT;

	if (adev->gmc.gmc_funcs->query_mem_partition_mode)
		nps = adev->gmc.gmc_funcs->query_mem_partition_mode(adev);

	/* other nps modes are taken as nps1 */
	if (nps == AMDGPU_NPS2_PARTITION_MODE) {
		loop_bits[0] = UMC_V12_0_PA_CH5_BIT;
		loop_bits[1] = UMC_V12_0_PA_C2_BIT;
		loop_bits[2] = UMC_V12_0_PA_B1_BIT;
		loop_bits[3] = UMC_V12_0_PA_R12_BIT;
	}

	if (nps == AMDGPU_NPS4_PARTITION_MODE) {
		loop_bits[0] = UMC_V12_0_PA_CH4_BIT;
		loop_bits[1] = UMC_V12_0_PA_CH5_BIT;
		loop_bits[2] = UMC_V12_0_PA_B0_BIT;
		loop_bits[3] = UMC_V12_0_PA_R11_BIT;
	}
	flip_bits = adev->umc.flip_bits.flip_bits_in_pa;
	bit_num = adev->umc.flip_bits.bit_num;
	retire_unit = adev->umc.retire_unit;

	soc_pa = paddr_out->pa.pa;
	channel_index = paddr_out->pa.channel_idx;
	/* clear loop bits in soc physical address */
	for (i = 0; i < UMC_V12_0_RETIRE_LOOP_BITS; i++)
		soc_pa &= ~BIT_ULL(loop_bits[i]);
	for (i = 0; i < bit_num; i++)
		soc_pa &= ~BIT_ULL(flip_bits[i]);

	paddr_out->pa.pa = soc_pa;
	/* get column bit 0 and 1 in mca address */
@@ -259,10 +282,10 @@ static int umc_v12_0_convert_error_address(struct amdgpu_device *adev,
		goto out;

	/* loop for all possibilities of retired bits */
	for (column = 0; column < UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL; column++) {
	for (column = 0; column < retire_unit; column++) {
		soc_pa = paddr_out->pa.pa;
		for (i = 0; i < UMC_V12_0_RETIRE_LOOP_BITS; i++)
			soc_pa |= (((column >> i) & 0x1ULL) << loop_bits[i]);
		for (i = 0; i < bit_num; i++)
			soc_pa |= (((column >> i) & 0x1ULL) << flip_bits[i]);

		col = ((column & 0x7) << 2) | col_lower;
		/* add row bit 13 */
@@ -684,5 +707,6 @@ struct amdgpu_umc_ras umc_v12_0_ras = {
	.update_ecc_status = umc_v12_0_update_ecc_status,
	.convert_ras_err_addr = umc_v12_0_convert_error_address,
	.get_die_id_from_pa = umc_v12_0_get_die_id,
	.get_retire_flip_bits = umc_v12_0_get_retire_flip_bits,
};
+0 −2
Original line number Diff line number Diff line
@@ -55,8 +55,6 @@
#define UMC_V12_0_NA_MAP_PA_NUM        8
/* R13 bit shift should be considered, double the number */
#define UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL (UMC_V12_0_NA_MAP_PA_NUM * 2)
/* C2, C3, C4, R13, four bits in MCA address are looped in retirement */
#define UMC_V12_0_RETIRE_LOOP_BITS 4

/* column bits in SOC physical address */
#define UMC_V12_0_PA_C2_BIT 15