Commit 2a084f4a authored by Dave Airlie
Browse files

Merge tag 'amd-drm-next-6.19-2025-11-07' of...

Merge tag 'amd-drm-next-6.19-2025-11-07' of https://gitlab.freedesktop.org/agd5f/linux into drm-next

amd-drm-next-6.19-2025-11-07:

amdgpu:
- Misc fixes
- HMM cleanup
- HDP flush rework
- RAS updates
- SMU 13.x updates
- SI DPM cleanup
- Suspend rework
- UQ reset support
- Replay/PSR fixes
- HDCP updates
- DC PMO fixes
- DC pstate fixes
- DCN4 fixes
- GPUVM fixes
- SMU 13 partition metrics
- Fix possible fence leak in job cleanup
- Hibernation fix
- MST fix

amdkfd:
- HMM cleanup
- Process cleanup fix

Signed-off-by: Dave Airlie <airlied@redhat.com>

From: Alex Deucher <alexander.deucher@amd.com>
Link: https://patch.msgid.link/20251107145938.26669-1-alexander.deucher@amd.com
parents e237dfe7 2e640e8e
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -1080,7 +1080,7 @@ M: Austin Zheng <austin.zheng@amd.com>
M:	Jun Lei <jun.lei@amd.com>
S:	Supported
F:	drivers/gpu/drm/amd/display/dc/dml/
F:	drivers/gpu/drm/amd/display/dc/dml2/
F:	drivers/gpu/drm/amd/display/dc/dml2_0/
AMD FAM15H PROCESSOR POWER MONITORING DRIVER
M:	Huang Rui <ray.huang@amd.com>
+1 −5
Original line number Diff line number Diff line
@@ -1316,6 +1316,7 @@ struct amdgpu_device {
	bool                            apu_prefer_gtt;

	bool                            userq_halt_for_enforce_isolation;
	struct work_struct              userq_reset_work;
	struct amdgpu_uid *uid_info;

	/* KFD
@@ -1539,11 +1540,6 @@ int emu_soc_asic_init(struct amdgpu_device *adev);
#define amdgpu_asic_read_bios_from_rom(adev, b, l) (adev)->asic_funcs->read_bios_from_rom((adev), (b), (l))
#define amdgpu_asic_read_register(adev, se, sh, offset, v)((adev)->asic_funcs->read_register((adev), (se), (sh), (offset), (v)))
#define amdgpu_asic_get_config_memsize(adev) (adev)->asic_funcs->get_config_memsize((adev))
#define amdgpu_asic_flush_hdp(adev, r) \
	((adev)->asic_funcs->flush_hdp ? (adev)->asic_funcs->flush_hdp((adev), (r)) : (adev)->hdp.funcs->flush_hdp((adev), (r)))
#define amdgpu_asic_invalidate_hdp(adev, r) \
	((adev)->asic_funcs->invalidate_hdp ? (adev)->asic_funcs->invalidate_hdp((adev), (r)) : \
	 ((adev)->hdp.funcs->invalidate_hdp ? (adev)->hdp.funcs->invalidate_hdp((adev), (r)) : (void)0))
#define amdgpu_asic_need_full_reset(adev) (adev)->asic_funcs->need_full_reset((adev))
#define amdgpu_asic_init_doorbell_index(adev) (adev)->asic_funcs->init_doorbell_index((adev))
#define amdgpu_asic_get_pcie_usage(adev, cnt0, cnt1) ((adev)->asic_funcs->get_pcie_usage((adev), (cnt0), (cnt1)))
+4 −0
Original line number Diff line number Diff line
@@ -1274,6 +1274,10 @@ static int unmap_bo_from_gpuvm(struct kgd_mem *mem,

	(void)amdgpu_vm_bo_unmap(adev, bo_va, entry->va);

	/* VM entity stopped if process killed, don't clear freed pt bo */
	if (!amdgpu_vm_ready(vm))
		return 0;

	(void)amdgpu_vm_clear_freed(adev, vm, &bo_va->last_pt_update);

	(void)amdgpu_sync_fence(sync, bo_va->last_pt_update, GFP_KERNEL);
+122 −17
Original line number Diff line number Diff line
@@ -71,6 +71,7 @@

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_ras_mgr.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"
@@ -179,6 +180,10 @@ struct amdgpu_init_level amdgpu_init_minimal_xgmi = {
		BIT(AMD_IP_BLOCK_TYPE_PSP)
};

static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev);
static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev);
static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev);

static void amdgpu_device_load_switch_state(struct amdgpu_device *adev);

static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev,
@@ -2494,6 +2499,7 @@ static const char *ip_block_names[] = {
	[AMD_IP_BLOCK_TYPE_VPE] = "vpe",
	[AMD_IP_BLOCK_TYPE_UMSCH_MM] = "umsch_mm",
	[AMD_IP_BLOCK_TYPE_ISP] = "isp",
	[AMD_IP_BLOCK_TYPE_RAS] = "ras",
};

static const char *ip_block_name(struct amdgpu_device *adev, enum amd_ip_block_type type)
@@ -3784,7 +3790,7 @@ static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
 */
static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
{
	int i, r;
	int i, r, rec;

	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
@@ -3807,10 +3813,23 @@ static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)

		r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
		if (r)
			return r;
			goto unwind;
	}

	return 0;
unwind:
	rec = amdgpu_device_ip_resume_phase3(adev);
	if (rec)
		dev_err(adev->dev,
			"amdgpu_device_ip_resume_phase3 failed during unwind: %d\n",
			rec);

	amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW);

	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);

	return r;
}

/**
@@ -3826,7 +3845,7 @@ static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
 */
static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
{
	int i, r;
	int i, r, rec;

	if (adev->in_s0ix)
		amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
@@ -3889,7 +3908,7 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)

		r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
		if (r)
			return r;
			goto unwind;

		/* handle putting the SMC in the appropriate state */
		if (!amdgpu_sriov_vf(adev)) {
@@ -3899,13 +3918,40 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
					dev_err(adev->dev,
						"SMC failed to set mp1 state %d, %d\n",
						adev->mp1_state, r);
					return r;
					goto unwind;
				}
			}
		}
	}

	return 0;
unwind:
	/* suspend phase 2 = resume phase 1 + resume phase 2 */
	rec = amdgpu_device_ip_resume_phase1(adev);
	if (rec) {
		dev_err(adev->dev,
			"amdgpu_device_ip_resume_phase1 failed during unwind: %d\n",
			rec);
		return r;
	}

	rec = amdgpu_device_fw_loading(adev);
	if (rec) {
		dev_err(adev->dev,
			"amdgpu_device_fw_loading failed during unwind: %d\n",
			rec);
		return r;
	}

	rec = amdgpu_device_ip_resume_phase2(adev);
	if (rec) {
		dev_err(adev->dev,
			"amdgpu_device_ip_resume_phase2 failed during unwind: %d\n",
			rec);
		return r;
	}

	return r;
}

/**
@@ -4607,6 +4653,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
	}

	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
	INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work);

	adev->gfx.gfx_off_req_count = 1;
	adev->gfx.gfx_off_residency = 0;
@@ -5229,7 +5276,7 @@ void amdgpu_device_complete(struct drm_device *dev)
int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r = 0;
	int r, rec;

	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
		return 0;
@@ -5245,8 +5292,9 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
			return r;
	}

	if (amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D3))
		dev_warn(adev->dev, "smart shift update failed\n");
	r = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D3);
	if (r)
		goto unwind_sriov;

	if (notify_clients)
		drm_client_dev_suspend(adev_to_drm(adev));
@@ -5257,16 +5305,16 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)

	r = amdgpu_device_ip_suspend_phase1(adev);
	if (r)
		return r;
		goto unwind_smartshift;

	amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
	r = amdgpu_userq_suspend(adev);
	if (r)
		return r;
		goto unwind_ip_phase1;

	r = amdgpu_device_evict_resources(adev);
	if (r)
		return r;
		goto unwind_userq;

	amdgpu_ttm_set_buffer_funcs_status(adev, false);

@@ -5274,16 +5322,62 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)

	r = amdgpu_device_ip_suspend_phase2(adev);
	if (r)
		return r;
		goto unwind_evict;

	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_release_full_gpu(adev, false);

	r = amdgpu_dpm_notify_rlc_state(adev, false);
	if (r)
	return 0;

unwind_evict:
	if (adev->mman.buffer_funcs_ring->sched.ready)
		amdgpu_ttm_set_buffer_funcs_status(adev, true);
	amdgpu_fence_driver_hw_init(adev);

unwind_userq:
	rec = amdgpu_userq_resume(adev);
	if (rec) {
		dev_warn(adev->dev, "failed to re-initialize user queues: %d\n", rec);
		return r;
	}
	rec = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
	if (rec) {
		dev_warn(adev->dev, "failed to re-initialize kfd: %d\n", rec);
		return r;
	}

	return 0;
unwind_ip_phase1:
	/* suspend phase 1 = resume phase 3 */
	rec = amdgpu_device_ip_resume_phase3(adev);
	if (rec) {
		dev_warn(adev->dev, "failed to re-initialize IPs phase1: %d\n", rec);
		return r;
	}

unwind_smartshift:
	rec = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0);
	if (rec) {
		dev_warn(adev->dev, "failed to re-update smart shift: %d\n", rec);
		return r;
	}

	if (notify_clients)
		drm_client_dev_resume(adev_to_drm(adev));

	amdgpu_ras_resume(adev);

unwind_sriov:
	if (amdgpu_sriov_vf(adev)) {
		rec = amdgpu_virt_request_full_gpu(adev, true);
		if (rec) {
			dev_warn(adev->dev, "failed to reinitialize sriov: %d\n", rec);
			return r;
		}
	}

	adev->in_suspend = adev->in_s0ix = adev->in_s3 = false;

	return r;
}

static inline int amdgpu_virt_resume(struct amdgpu_device *adev)
@@ -5989,6 +6083,10 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
				if (r)
					goto out;

				r = amdgpu_userq_post_reset(tmp_adev, vram_lost);
				if (r)
					goto out;

				drm_client_dev_resume(adev_to_drm(tmp_adev));

				/*
@@ -6211,6 +6309,7 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
	if (!amdgpu_sriov_vf(adev))
		cancel_work(&adev->reset_work);
#endif
	cancel_work(&adev->userq_reset_work);

	if (adev->kfd.dev)
		cancel_work(&adev->kfd.reset_work);
@@ -6331,6 +6430,8 @@ static void amdgpu_device_halt_activities(struct amdgpu_device *adev,
		    amdgpu_device_ip_need_full_reset(tmp_adev))
			amdgpu_ras_suspend(tmp_adev);

		amdgpu_userq_pre_reset(tmp_adev);

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

@@ -6560,6 +6661,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
			goto end_reset;
	}

	/* Cannot be called after locking reset domain */
	amdgpu_ras_pre_reset(adev, &device_list);

	/* We need to lock reset domain only once both for XGMI and single device */
	amdgpu_device_recovery_get_reset_lock(adev, &device_list);

@@ -6590,6 +6694,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
	amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart);
reset_unlock:
	amdgpu_device_recovery_put_reset_lock(adev, &device_list);
	amdgpu_ras_post_reset(adev, &device_list);
end_reset:
	if (hive) {
		mutex_unlock(&hive->hive_lock);
@@ -7327,7 +7432,7 @@ void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
			return;
	}

	amdgpu_asic_flush_hdp(adev, ring);
	amdgpu_hdp_flush(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
@@ -7340,7 +7445,7 @@ void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
	amdgpu_hdp_invalidate(adev, ring);
}

int amdgpu_in_reset(struct amdgpu_device *adev)
+7 −0
Original line number Diff line number Diff line
@@ -81,13 +81,20 @@ static int amdgpu_dma_buf_attach(struct dma_buf *dmabuf,
	struct drm_gem_object *obj = dmabuf->priv;
	struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj);
	struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
	int r;

	if (!amdgpu_dmabuf_is_xgmi_accessible(attach_adev, bo) &&
	    pci_p2pdma_distance(adev->pdev, attach->dev, false) < 0)
		attach->peer2peer = false;

	r = dma_resv_lock(bo->tbo.base.resv, NULL);
	if (r)
		return r;

	amdgpu_vm_bo_update_shared(bo);

	dma_resv_unlock(bo->tbo.base.resv);

	return 0;
}

Loading