Commit 631af731 authored by Lijo Lazar's avatar Lijo Lazar Committed by Alex Deucher
Browse files

drm/amdgpu: Refactor XGMI reset on init handling



Rely on XGMI hive information, rather than the mgpu structure, to reset
XGMI devices on initialization. The mgpu structure may contain other
devices as well.

Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
Reviewed-by: Feifei Xu <feifxu@amd.com>
Acked-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
Tested-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent b17f8732
Loading
Loading
Loading
Loading
+6 −4
Original line number Diff line number Diff line
@@ -164,7 +164,8 @@ struct amdgpu_init_level amdgpu_init_minimal_xgmi = {
	.level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI,
	.hwini_ip_block_mask =
		BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) |
		BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH)
		BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) |
		BIT(AMD_IP_BLOCK_TYPE_PSP)
};

static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev,
@@ -2840,6 +2841,7 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
 */
static int amdgpu_device_ip_init(struct amdgpu_device *adev)
{
	bool init_badpage;
	int i, r;

	r = amdgpu_ras_init(adev);
@@ -2953,7 +2955,8 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
	 * Note: theoretically, this should be called before all vram allocations
	 * to protect retired page from abusing
	 */
	r = amdgpu_ras_recovery_init(adev, true);
	init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI);
	r = amdgpu_ras_recovery_init(adev, init_badpage);
	if (r)
		goto init_failed;

@@ -4511,8 +4514,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
		vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);

	if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
		queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
				   msecs_to_jiffies(AMDGPU_RESUME_MS));
		amdgpu_xgmi_reset_on_init(adev);

	amdgpu_device_check_iommu_direct_map(adev);

+0 −6
Original line number Diff line number Diff line
@@ -3216,12 +3216,6 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info)
	max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(&con->eeprom_control);
	amdgpu_ras_validate_threshold(adev, max_eeprom_records_count);

	/* Todo: During test the SMU might fail to read the eeprom through I2C
	 * when the GPU is pending on XGMI reset during probe time
	 * (Mostly after second bus reset), skip it now
	 */
	if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
		return 0;
	if (init_bp_info) {
		ret = amdgpu_ras_init_badpage_info(adev);
		if (ret)
+68 −5
Original line number Diff line number Diff line
@@ -860,8 +860,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
	if (!adev->gmc.xgmi.supported)
		return 0;

	if ((adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) &&
	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
		ret = psp_xgmi_initialize(&adev->psp, false, true);
		if (ret) {
			dev_err(adev->dev,
@@ -907,8 +906,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)

	task_barrier_add_task(&hive->tb);

	if ((adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) &&
	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			/* update node list for other device in the hive */
			if (tmp_adev != adev) {
@@ -985,7 +983,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
		}
	}

	if (!ret && (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI))
	if (!ret)
		ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);

exit_unlock:
@@ -1500,3 +1498,68 @@ int amdgpu_xgmi_ras_sw_init(struct amdgpu_device *adev)

	return 0;
}

static void amdgpu_xgmi_reset_on_init_work(struct work_struct *work)
{
	struct amdgpu_hive_info *hive =
		container_of(work, struct amdgpu_hive_info, reset_on_init_work);
	struct amdgpu_reset_context reset_context;
	struct amdgpu_device *tmp_adev;
	struct list_head device_list;
	int r;

	mutex_lock(&hive->hive_lock);

	INIT_LIST_HEAD(&device_list);
	list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
		list_add_tail(&tmp_adev->reset_list, &device_list);

	tmp_adev = list_first_entry(&device_list, struct amdgpu_device,
				    reset_list);
	amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);

	reset_context.method = AMD_RESET_METHOD_ON_INIT;
	reset_context.reset_req_dev = tmp_adev;
	reset_context.hive = hive;
	reset_context.reset_device_list = &device_list;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);

	amdgpu_reset_do_xgmi_reset_on_init(&reset_context);
	mutex_unlock(&hive->hive_lock);
	amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);

	list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
		r = amdgpu_ras_init_badpage_info(tmp_adev);
		if (r && r != -EHWPOISON)
			dev_err(tmp_adev->dev,
				"error during bad page data initializtion");
	}
}

/*
 * Prepare the hive's reset-on-init work item and queue it on the hive's
 * reset domain.
 */
static void amdgpu_xgmi_schedule_reset_on_init(struct amdgpu_hive_info *hive)
{
	struct amdgpu_reset_domain *domain = hive->reset_domain;

	INIT_WORK(&hive->reset_on_init_work, amdgpu_xgmi_reset_on_init_work);
	amdgpu_reset_domain_schedule(domain, &hive->reset_on_init_work);
}

/*
 * Schedule the hive-wide on-init reset once the hive's device count
 * matches the number of physical nodes reported for @adev.
 *
 * Returns -EINVAL when no hive can be obtained for @adev, 0 otherwise
 * (whether or not the reset work was scheduled by this call).
 */
int amdgpu_xgmi_reset_on_init(struct amdgpu_device *adev)
{
	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);

	if (!hive)
		return -EINVAL;

	/* Compare the counts and schedule the work under the hive lock. */
	mutex_lock(&hive->hive_lock);
	if (atomic_read(&hive->number_devices) ==
	    adev->gmc.xgmi.num_physical_nodes)
		amdgpu_xgmi_schedule_reset_on_init(hive);
	mutex_unlock(&hive->hive_lock);

	/* Release the hive obtained by amdgpu_get_xgmi_hive() above. */
	amdgpu_put_xgmi_hive(hive);

	return 0;
}
+2 −0
Original line number Diff line number Diff line
@@ -45,6 +45,7 @@ struct amdgpu_hive_info {
	struct amdgpu_reset_domain *reset_domain;
	atomic_t ras_recovery;
	struct ras_event_manager event_mgr;
	struct work_struct reset_on_init_work;
};

struct amdgpu_pcs_ras_field {
@@ -75,5 +76,6 @@ static inline bool amdgpu_xgmi_same_hive(struct amdgpu_device *adev,
		adev->gmc.xgmi.hive_id == bo_adev->gmc.xgmi.hive_id);
}
int amdgpu_xgmi_ras_sw_init(struct amdgpu_device *adev);
int amdgpu_xgmi_reset_on_init(struct amdgpu_device *adev);

#endif
+10 −4
Original line number Diff line number Diff line
@@ -2413,11 +2413,17 @@ static int gmc_v9_0_hw_fini(void *handle)
	if (adev->mmhub.funcs->update_power_gating)
		adev->mmhub.funcs->update_power_gating(adev, false);

	/*
	 * For minimal init, late_init is not called, hence VM fault/RAS irqs
	 * are not enabled.
	 */
	if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
		amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);

		if (adev->gmc.ecc_irq.funcs &&
		    amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC))
			amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0);
	}

	return 0;
}
Loading