Commit 0a1123c7 authored by Dave Airlie's avatar Dave Airlie
Browse files

Merge tag 'amd-drm-fixes-6.8-2024-01-18' of...

Merge tag 'amd-drm-fixes-6.8-2024-01-18' of https://gitlab.freedesktop.org/agd5f/linux

 into drm-next

amd-drm-fixes-6.8-2024-01-18:

amdgpu:
- DSC fixes
- DC resource pool fixes
- OTG fix
- DML2 fixes
- Aux fix
- GFX10 RLC firmware handling fix
- Revert a broken workaround for SMU 13.0.2
- DC writeback fix
- Enable gfxoff when ROCm apps are active on gfx11 with the proper FW version

amdkfd:
- Fix dma-buf exports using GEM handles

Signed-off-by: default avatarDave Airlie <airlied@redhat.com>

From: Alex Deucher <alexander.deucher@amd.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240118223127.4904-1-alexander.deucher@amd.com
parents 205e18c1 aa0901a9
Loading
Loading
Loading
Loading
+1 −2
Original line number Diff line number Diff line
@@ -254,8 +254,6 @@ extern int amdgpu_agp;

extern int amdgpu_wbrf;

extern int fw_bo_location;

#define AMDGPU_VM_MAX_NUM_CTX			4096
#define AMDGPU_SG_THRESHOLD			(256*1024*1024)
#define AMDGPU_WAIT_IDLE_TIMEOUT_IN_MS	        3000
@@ -1146,6 +1144,7 @@ struct amdgpu_device {
	bool                            debug_vm;
	bool                            debug_largebar;
	bool                            debug_disable_soft_recovery;
	bool                            debug_use_vram_fw_buf;
};

static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev,
+6 −5
Original line number Diff line number Diff line
@@ -138,6 +138,9 @@ static void amdgpu_amdkfd_reset_work(struct work_struct *work)
	amdgpu_device_gpu_recover(adev, NULL, &reset_context);
}

static const struct drm_client_funcs kfd_client_funcs = {
	.unregister	= drm_client_release,
};
void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
{
	int i;
@@ -161,7 +164,7 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
			.enable_mes = adev->enable_mes,
		};

		ret = drm_client_init(&adev->ddev, &adev->kfd.client, "kfd", NULL);
		ret = drm_client_init(&adev->ddev, &adev->kfd.client, "kfd", &kfd_client_funcs);
		if (ret) {
			dev_err(adev->dev, "Failed to init DRM client: %d\n", ret);
			return;
@@ -695,10 +698,8 @@ int amdgpu_amdkfd_submit_ib(struct amdgpu_device *adev,
void amdgpu_amdkfd_set_compute_idle(struct amdgpu_device *adev, bool idle)
{
	enum amd_powergating_state state = idle ? AMD_PG_STATE_GATE : AMD_PG_STATE_UNGATE;
	/* Temporary workaround to fix issues observed in some
	 * compute applications when GFXOFF is enabled on GFX11.
	 */
	if (IP_VERSION_MAJ(amdgpu_ip_version(adev, GC_HWIP, 0)) == 11) {
	if (IP_VERSION_MAJ(amdgpu_ip_version(adev, GC_HWIP, 0)) == 11 &&
	    ((adev->mes.kiq_version & AMDGPU_MES_VERSION_MASK) <= 64)) {
		pr_debug("GFXOFF is %s\n", idle ? "enabled" : "disabled");
		amdgpu_gfx_off_ctrl(adev, idle);
	} else if ((IP_VERSION_MAJ(amdgpu_ip_version(adev, GC_HWIP, 0)) == 9) &&
+1 −1
Original line number Diff line number Diff line
@@ -311,7 +311,7 @@ void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct kgd_mem *mem);
int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_device *adev, struct amdgpu_bo *bo);

int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info,
					    struct dma_fence **ef);
					    struct dma_fence __rcu **ef);
int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct amdgpu_device *adev,
					      struct kfd_vm_fault_info *info);
int amdgpu_amdkfd_gpuvm_import_dmabuf_fd(struct amdgpu_device *adev, int fd,
+2 −2
Original line number Diff line number Diff line
@@ -2802,7 +2802,7 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
	put_task_struct(usertask);
}

static void replace_eviction_fence(struct dma_fence **ef,
static void replace_eviction_fence(struct dma_fence __rcu **ef,
				   struct dma_fence *new_ef)
{
	struct dma_fence *old_ef = rcu_replace_pointer(*ef, new_ef, true
@@ -2837,7 +2837,7 @@ static void replace_eviction_fence(struct dma_fence **ef,
 * 7.  Add fence to all PD and PT BOs.
 * 8.  Unreserve all BOs
 */
int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence **ef)
int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu **ef)
{
	struct amdkfd_process_info *process_info = info;
	struct amdgpu_vm *peer_vm;
+2 −31
Original line number Diff line number Diff line
@@ -1544,6 +1544,7 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			release_firmware(adev->pm.fw);
			if (fw_ver < 0x00160e00)
				return true;
		}
@@ -5245,7 +5246,6 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
	struct amdgpu_device *tmp_adev = NULL;
	bool need_full_reset, skip_hw_reset, vram_lost = false;
	int r = 0;
	bool gpu_reset_for_dev_remove = 0;

	/* Try reset handler method first */
	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
@@ -5265,10 +5265,6 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
	skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);

	gpu_reset_for_dev_remove =
		test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
			test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);

	/*
	 * ASIC reset has to be done on all XGMI hive nodes ASAP
	 * to allow proper links negotiation in FW (within 1 sec)
@@ -5311,18 +5307,6 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
		amdgpu_ras_intr_cleared();
	}

	/* Since the mode1 reset affects base ip blocks, the
	 * phase1 ip blocks need to be resumed. Otherwise there
	 * will be a BIOS signature error and the psp bootloader
	 * can't load kdb on the next amdgpu install.
	 */
	if (gpu_reset_for_dev_remove) {
		list_for_each_entry(tmp_adev, device_list_handle, reset_list)
			amdgpu_device_ip_resume_phase1(tmp_adev);

		goto end;
	}

	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		if (need_full_reset) {
			/* post card */
@@ -5559,11 +5543,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
	int i, r = 0;
	bool need_emergency_restart = false;
	bool audio_suspended = false;
	bool gpu_reset_for_dev_remove = false;

	gpu_reset_for_dev_remove =
			test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
				test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);

	/*
	 * Special case: RAS triggered and full reset isn't supported
@@ -5601,7 +5580,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
	if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			list_add_tail(&tmp_adev->reset_list, &device_list);
			if (gpu_reset_for_dev_remove && adev->shutdown)
			if (adev->shutdown)
				tmp_adev->shutdown = true;
		}
		if (!list_is_first(&adev->reset_list, &device_list))
@@ -5686,10 +5665,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,

retry:	/* Rest of adevs pre asic reset from XGMI hive. */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		if (gpu_reset_for_dev_remove) {
			/* Workaroud for ASICs need to disable SMC first */
			amdgpu_device_smu_fini_early(tmp_adev);
		}
		r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
		/*TODO Should we stop ?*/
		if (r) {
@@ -5721,9 +5696,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
		r = amdgpu_do_asic_reset(device_list_handle, reset_context);
		if (r && r == -EAGAIN)
			goto retry;

		if (!r && gpu_reset_for_dev_remove)
			goto recover_end;
	}

skip_hw_reset:
@@ -5779,7 +5751,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
		amdgpu_ras_set_error_query_ready(tmp_adev, true);
	}

recover_end:
	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
					    reset_list);
	amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
Loading