Commit e08b5758 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'drm-next-2024-01-19' of git://anongit.freedesktop.org/drm/drm

Pull more drm fixes from Dave Airlie:
 "This is mostly amdgpu and xe fixes, with an amdkfd and nouveau fix
  thrown in.

  The amdgpu ones are just the usual couple of weeks of fixes. The xe
  ones are a bunch of cleanups for the new xe driver, the fix you put in
  on the merge commit and the kconfig fix that was hiding the problem
  from me.

  amdgpu:
   - DSC fixes
   - DC resource pool fixes
   - OTG fix
   - DML2 fixes
   - Aux fix
   - GFX10 RLC firmware handling fix
   - Revert a broken workaround for SMU 13.0.2
   - DC writeback fix
   - Enable gfxoff when ROCm apps are active on gfx11 with the proper FW
     version

  amdkfd:
   - Fix dma-buf exports using GEM handles

  nouveau:
   - fix an unneeded WARN_ON triggering

  xe:
   - Fix for definition of wakeref_t
   - Fix for an error code aliasing
   - Fix for VM_UNBIND_ALL in the case there are no bound VMAs
   - Fixes for a number of __iomem address space mismatches reported by
     sparse
   - Fixes for the assignment of exec_queue priority
   - A fix for skip_guc_pc not taking effect
   - Workaround for a build problem on GCC 11
   - A couple of fixes for error paths
   - Fix a Flat CCS compression metadata copy issue
   - Fix a misplaced array bounds check
   - Don't have display support depend on EXPERT (as discussed on IRC)"

* tag 'drm-next-2024-01-19' of git://anongit.freedesktop.org/drm/drm: (71 commits)
  nouveau/vmm: don't set addr on the fail path to avoid warning
  drm/amdgpu: Enable GFXOFF for Compute on GFX11
  drm/amd/display: Drop 'acrtc' and add 'new_crtc_state' NULL check for writeback requests.
  drm/amdgpu: revert "Adjust removal control flow for smu v13_0_2"
  drm/amdkfd: init drm_client with funcs hook
  drm/amd/display: Fix a switch statement in populate_dml_output_cfg_from_stream_state()
  drm/amdgpu: Fix the null pointer when load rlc firmware
  drm/amd/display: Align the returned error code with legacy DP
  drm/amd/display: Fix DML2 watermark calculation
  drm/amd/display: Clear OPTC mem select on disable
  drm/amd/display: Port DENTIST hang and TDR fixes to OTG disable W/A
  drm/amd/display: Add logging resource checks
  drm/amd/display: Init link enc resources in dc_state only if res_pool presents
  drm/amd/display: Fix late derefrence 'dsc' check in 'link_set_dsc_pps_packet()'
  drm/amd/display: Avoid enum conversion warning
  drm/amd/pm: Fix smuv13.0.6 current clock reporting
  drm/amd/pm: Add error log for smu v13.0.6 reset
  drm/amdkfd: Fix 'node' NULL check in 'svm_range_get_range_boundaries()'
  drm/amdgpu: drop exp hw support check for GC 9.4.3
  drm/amdgpu: move debug options init prior to amdgpu device init
  ...
parents ab1e2d0f 009f0a64
Loading
Loading
Loading
Loading
+1 −2
Original line number Diff line number Diff line
@@ -254,8 +254,6 @@ extern int amdgpu_agp;

extern int amdgpu_wbrf;

extern int fw_bo_location;

#define AMDGPU_VM_MAX_NUM_CTX			4096
#define AMDGPU_SG_THRESHOLD			(256*1024*1024)
#define AMDGPU_WAIT_IDLE_TIMEOUT_IN_MS	        3000
@@ -1146,6 +1144,7 @@ struct amdgpu_device {
	bool                            debug_vm;
	bool                            debug_largebar;
	bool                            debug_disable_soft_recovery;
	bool                            debug_use_vram_fw_buf;
};

static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev,
+6 −5
Original line number Diff line number Diff line
@@ -138,6 +138,9 @@ static void amdgpu_amdkfd_reset_work(struct work_struct *work)
	amdgpu_device_gpu_recover(adev, NULL, &reset_context);
}

/*
 * Callback table handed to drm_client_init() for the "kfd" DRM client in
 * amdgpu_amdkfd_device_init(). Only the .unregister hook is populated; it
 * points at drm_client_release.
 */
static const struct drm_client_funcs kfd_client_funcs = {
	.unregister	= drm_client_release,
};
void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
{
	int i;
@@ -161,7 +164,7 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
			.enable_mes = adev->enable_mes,
		};

		ret = drm_client_init(&adev->ddev, &adev->kfd.client, "kfd", NULL);
		ret = drm_client_init(&adev->ddev, &adev->kfd.client, "kfd", &kfd_client_funcs);
		if (ret) {
			dev_err(adev->dev, "Failed to init DRM client: %d\n", ret);
			return;
@@ -695,10 +698,8 @@ int amdgpu_amdkfd_submit_ib(struct amdgpu_device *adev,
void amdgpu_amdkfd_set_compute_idle(struct amdgpu_device *adev, bool idle)
{
	enum amd_powergating_state state = idle ? AMD_PG_STATE_GATE : AMD_PG_STATE_UNGATE;
	/* Temporary workaround to fix issues observed in some
	 * compute applications when GFXOFF is enabled on GFX11.
	 */
	if (IP_VERSION_MAJ(amdgpu_ip_version(adev, GC_HWIP, 0)) == 11) {
	if (IP_VERSION_MAJ(amdgpu_ip_version(adev, GC_HWIP, 0)) == 11 &&
	    ((adev->mes.kiq_version & AMDGPU_MES_VERSION_MASK) <= 64)) {
		pr_debug("GFXOFF is %s\n", idle ? "enabled" : "disabled");
		amdgpu_gfx_off_ctrl(adev, idle);
	} else if ((IP_VERSION_MAJ(amdgpu_ip_version(adev, GC_HWIP, 0)) == 9) &&
+1 −1
Original line number Diff line number Diff line
@@ -311,7 +311,7 @@ void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct kgd_mem *mem);
int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_device *adev, struct amdgpu_bo *bo);

int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info,
					    struct dma_fence **ef);
					    struct dma_fence __rcu **ef);
int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct amdgpu_device *adev,
					      struct kfd_vm_fault_info *info);
int amdgpu_amdkfd_gpuvm_import_dmabuf_fd(struct amdgpu_device *adev, int fd,
+2 −2
Original line number Diff line number Diff line
@@ -2802,7 +2802,7 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
	put_task_struct(usertask);
}

static void replace_eviction_fence(struct dma_fence **ef,
static void replace_eviction_fence(struct dma_fence __rcu **ef,
				   struct dma_fence *new_ef)
{
	struct dma_fence *old_ef = rcu_replace_pointer(*ef, new_ef, true
@@ -2837,7 +2837,7 @@ static void replace_eviction_fence(struct dma_fence **ef,
 * 7.  Add fence to all PD and PT BOs.
 * 8.  Unreserve all BOs
 */
int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence **ef)
int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu **ef)
{
	struct amdkfd_process_info *process_info = info;
	struct amdgpu_vm *peer_vm;
+2 −31
Original line number Diff line number Diff line
@@ -1544,6 +1544,7 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			release_firmware(adev->pm.fw);
			if (fw_ver < 0x00160e00)
				return true;
		}
@@ -5245,7 +5246,6 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
	struct amdgpu_device *tmp_adev = NULL;
	bool need_full_reset, skip_hw_reset, vram_lost = false;
	int r = 0;
	bool gpu_reset_for_dev_remove = 0;

	/* Try reset handler method first */
	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
@@ -5265,10 +5265,6 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
	skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);

	gpu_reset_for_dev_remove =
		test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
			test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);

	/*
	 * ASIC reset has to be done on all XGMI hive nodes ASAP
	 * to allow proper links negotiation in FW (within 1 sec)
@@ -5311,18 +5307,6 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
		amdgpu_ras_intr_cleared();
	}

	/* Since the mode1 reset affects base ip blocks, the
	 * phase1 ip blocks need to be resumed. Otherwise there
	 * will be a BIOS signature error and the psp bootloader
	 * can't load kdb on the next amdgpu install.
	 */
	if (gpu_reset_for_dev_remove) {
		list_for_each_entry(tmp_adev, device_list_handle, reset_list)
			amdgpu_device_ip_resume_phase1(tmp_adev);

		goto end;
	}

	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		if (need_full_reset) {
			/* post card */
@@ -5559,11 +5543,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
	int i, r = 0;
	bool need_emergency_restart = false;
	bool audio_suspended = false;
	bool gpu_reset_for_dev_remove = false;

	gpu_reset_for_dev_remove =
			test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
				test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);

	/*
	 * Special case: RAS triggered and full reset isn't supported
@@ -5601,7 +5580,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
	if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			list_add_tail(&tmp_adev->reset_list, &device_list);
			if (gpu_reset_for_dev_remove && adev->shutdown)
			if (adev->shutdown)
				tmp_adev->shutdown = true;
		}
		if (!list_is_first(&adev->reset_list, &device_list))
@@ -5686,10 +5665,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,

retry:	/* Rest of adevs pre asic reset from XGMI hive. */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		if (gpu_reset_for_dev_remove) {
			/* Workaround for ASICs that need to disable SMC first */
			amdgpu_device_smu_fini_early(tmp_adev);
		}
		r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
		/*TODO Should we stop ?*/
		if (r) {
@@ -5721,9 +5696,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
		r = amdgpu_do_asic_reset(device_list_handle, reset_context);
		if (r && r == -EAGAIN)
			goto retry;

		if (!r && gpu_reset_for_dev_remove)
			goto recover_end;
	}

skip_hw_reset:
@@ -5779,7 +5751,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
		amdgpu_ras_set_error_query_ready(tmp_adev, true);
	}

recover_end:
	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
					    reset_list);
	amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
Loading