Commit 5f2b6c5f authored by Linus Torvalds
Browse files

Merge tag 'drm-fixes-2025-06-20' of https://gitlab.freedesktop.org/drm/kernel

Pull drm fixes from Dave Airlie:
 "Bit of an uptick in fixes for rc3, msm and amdgpu leading the way,
  with i915/xe/nouveau with a few each and then some scattered misc
  bits, nothing looks too crazy:

  msm:
   - Display:
      - Fixed DP output on SDM845
      - Fixed 10nm DSI PLL init
   - GPU:
      - SUBMIT ioctl error path leak fixes
      - drm half of stall-on-fault fixes
      - a7xx: Missing CP_RESET_CONTEXT_STATE
      - Skip GPU component bind if GPU is not in the device table

  i915:
   - Fix MIPI vtotal programming off by one on Broxton
   - Fix PMU code for GCOV and AutoFDO enabled build

  xe:
   - A workaround update
   - Fix memset on iomem
   - Fix early wedge on GuC Load failure

  amdgpu:
   - DP tunneling fix
   - LTTPR fix
   - DSC fix
   - DML2.x ABGR16161616 fix
   - RMCM fix
   - Backlight fixes
   - GFX11 kicker support
   - SDMA reset fixes
   - VCN 5.0.1 fix
   - Reset fix
   - Misc small fixes

  amdkfd:
   - SDMA reset fix
   - Fix race in GWS scheduling

  nouveau:
   - update docs reference
   - fix backlight name buffer size
   - fix UAF in r535 gsp rpc msg
   - fix undefined shift

  mgag200:
   - drop export header

  ast:
   - drop export header

  malidp:
   - drop informational error

  ssd130x:
   - fix clear columns

  etnaviv:
   - scheduler locking fix

  v3d:
   - null pointer crash fix"

* tag 'drm-fixes-2025-06-20' of https://gitlab.freedesktop.org/drm/kernel: (50 commits)
  drm/xe: Fix early wedge on GuC load failure
  drm/xe: Fix memset on iomem
  drm/xe/bmg: Update Wa_16023588340
  drm/amdgpu/sdma5.2: init engine reset mutex
  drm/amdkfd: Fix race in GWS queue scheduling
  drm/amdgpu/sdma5: init engine reset mutex
  drm/amdgpu: switch job hw_fence to amdgpu_fence
  drm/amdgpu: Fix SDMA UTC_L1 handling during start/stop sequences
  drm/amdgpu: Release reset locks during failures
  drm/amd/display: Check dce_hwseq before dereferencing it
  drm/amdgpu: VCN v5_0_1 to prevent FW checking RB during DPG pause
  drm/amdgpu: Use logical instance ID for SDMA v4_4_2 queue operations
  drm/amdgpu: Fix SDMA engine reset with logical instance ID
  drm/amdgpu: add kicker fws loading for gfx11/smu13/psp13
  drm/amdgpu: Add kicker device detection
  drm/amd/display: Export full brightness range to userspace
  drm/amd/display: Only read ACPI backlight caps once
  drm/amd/display: Fix RMCM programming seq errors
  drm/amd/display: Fix mpv playback corruption on weston
  drm/amd/display: Add more checks for DSC / HUBP ONO guarantees
  ...
parents 0fa52482 b8de9b21
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -25,7 +25,7 @@ providing a consistent API to upper layers of the driver stack.
GSP Support
------------------------

.. kernel-doc:: drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c
.. kernel-doc:: drivers/gpu/drm/nouveau/nvkm/subdev/gsp/rm/r535/rpc.c
   :doc: GSP message queue element

.. kernel-doc:: drivers/gpu/drm/nouveau/include/nvkm/subdev/gsp.h
+1 −1
Original line number Diff line number Diff line
@@ -1902,7 +1902,7 @@ static void amdgpu_ib_preempt_mark_partial_job(struct amdgpu_ring *ring)
			continue;
		}
		job = to_amdgpu_job(s_job);
		if (preempted && (&job->hw_fence) == fence)
		if (preempted && (&job->hw_fence.base) == fence)
			/* mark the job as preempted */
			job->preemption_status |= AMDGPU_IB_PREEMPTED;
	}
+56 −26
Original line number Diff line number Diff line
@@ -6019,16 +6019,12 @@ static int amdgpu_device_health_check(struct list_head *device_list_handle)
	return ret;
}

static int amdgpu_device_halt_activities(struct amdgpu_device *adev,
			      struct amdgpu_job *job,
			      struct amdgpu_reset_context *reset_context,
static int amdgpu_device_recovery_prepare(struct amdgpu_device *adev,
					  struct list_head *device_list,
			      struct amdgpu_hive_info *hive,
			      bool need_emergency_restart)
					  struct amdgpu_hive_info *hive)
{
	struct list_head *device_list_handle =  NULL;
	struct amdgpu_device *tmp_adev = NULL;
	int i, r = 0;
	int r;

	/*
	 * Build list of devices to reset.
@@ -6045,26 +6041,54 @@ static int amdgpu_device_halt_activities(struct amdgpu_device *adev,
		}
		if (!list_is_first(&adev->reset_list, device_list))
			list_rotate_to_front(&adev->reset_list, device_list);
		device_list_handle = device_list;
	} else {
		list_add_tail(&adev->reset_list, device_list);
		device_list_handle = device_list;
	}

	if (!amdgpu_sriov_vf(adev) && (!adev->pcie_reset_ctx.occurs_dpc)) {
		r = amdgpu_device_health_check(device_list_handle);
		r = amdgpu_device_health_check(device_list);
		if (r)
			return r;
	}

	/* We need to lock reset domain only once both for XGMI and single device */
	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
				    reset_list);
	return 0;
}

/**
 * amdgpu_device_recovery_get_reset_lock - lock the reset domain for recovery
 * @adev: device that triggered the recovery (not referenced in this helper)
 * @device_list: list of devices taking part in the reset, linked via reset_list
 *
 * Takes the reset-domain lock of the first device on @device_list.
 * An empty @device_list is a no-op, so nothing is locked in that case.
 */
static void amdgpu_device_recovery_get_reset_lock(struct amdgpu_device *adev,
						  struct list_head *device_list)
{
	struct amdgpu_device *first;

	/* Only the head entry's reset domain is locked. */
	if (!list_empty(device_list)) {
		first = list_first_entry(device_list, struct amdgpu_device,
					 reset_list);
		amdgpu_device_lock_reset_domain(first->reset_domain);
	}
}

	/* block all schedulers and reset given job's ring */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
/**
 * amdgpu_device_recovery_put_reset_lock - unlock the reset domain after recovery
 * @adev: device that triggered the recovery (not referenced in this helper)
 * @device_list: list of devices taking part in the reset, linked via reset_list
 *
 * Releases the reset-domain lock of the first device on @device_list.
 * An empty @device_list is a no-op, so nothing is unlocked in that case.
 */
static void amdgpu_device_recovery_put_reset_lock(struct amdgpu_device *adev,
						  struct list_head *device_list)
{
	struct amdgpu_device *first;

	/* Only the head entry's reset domain is unlocked. */
	if (!list_empty(device_list)) {
		first = list_first_entry(device_list, struct amdgpu_device,
					 reset_list);
		amdgpu_device_unlock_reset_domain(first->reset_domain);
	}
}

static int amdgpu_device_halt_activities(
	struct amdgpu_device *adev, struct amdgpu_job *job,
	struct amdgpu_reset_context *reset_context,
	struct list_head *device_list, struct amdgpu_hive_info *hive,
	bool need_emergency_restart)
{
	struct amdgpu_device *tmp_adev = NULL;
	int i, r = 0;

	/* block all schedulers and reset given job's ring */
	list_for_each_entry(tmp_adev, device_list, reset_list) {
		amdgpu_device_set_mp1_state(tmp_adev);

		/*
@@ -6252,11 +6276,6 @@ static void amdgpu_device_gpu_resume(struct amdgpu_device *adev,
		amdgpu_ras_set_error_query_ready(tmp_adev, true);

	}

	tmp_adev = list_first_entry(device_list, struct amdgpu_device,
					    reset_list);
	amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);

}


@@ -6324,10 +6343,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
	reset_context->hive = hive;
	INIT_LIST_HEAD(&device_list);

	if (amdgpu_device_recovery_prepare(adev, &device_list, hive))
		goto end_reset;

	/* We need to lock reset domain only once both for XGMI and single device */
	amdgpu_device_recovery_get_reset_lock(adev, &device_list);

	r = amdgpu_device_halt_activities(adev, job, reset_context, &device_list,
					 hive, need_emergency_restart);
	if (r)
		goto end_reset;
		goto reset_unlock;

	if (need_emergency_restart)
		goto skip_sched_resume;
@@ -6337,7 +6362,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
	 *
	 * job->base holds a reference to parent fence
	 */
	if (job && dma_fence_is_signaled(&job->hw_fence)) {
	if (job && dma_fence_is_signaled(&job->hw_fence.base)) {
		job_signaled = true;
		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
		goto skip_hw_reset;
@@ -6345,13 +6370,15 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,

	r = amdgpu_device_asic_reset(adev, &device_list, reset_context);
	if (r)
		goto end_reset;
		goto reset_unlock;
skip_hw_reset:
	r = amdgpu_device_sched_resume(&device_list, reset_context, job_signaled);
	if (r)
		goto end_reset;
		goto reset_unlock;
skip_sched_resume:
	amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart);
reset_unlock:
	amdgpu_device_recovery_put_reset_lock(adev, &device_list);
end_reset:
	if (hive) {
		mutex_unlock(&hive->hive_lock);
@@ -6763,6 +6790,8 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
		memset(&reset_context, 0, sizeof(reset_context));
		INIT_LIST_HEAD(&device_list);

		amdgpu_device_recovery_prepare(adev, &device_list, hive);
		amdgpu_device_recovery_get_reset_lock(adev, &device_list);
		r = amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list,
					 hive, false);
		if (hive) {
@@ -6880,8 +6909,8 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
		if (hive) {
			list_for_each_entry(tmp_adev, &device_list, reset_list)
				amdgpu_device_unset_mp1_state(tmp_adev);
			amdgpu_device_unlock_reset_domain(adev->reset_domain);
		}
		amdgpu_device_recovery_put_reset_lock(adev, &device_list);
	}

	if (hive) {
@@ -6927,6 +6956,7 @@ void amdgpu_pci_resume(struct pci_dev *pdev)

	amdgpu_device_sched_resume(&device_list, NULL, NULL);
	amdgpu_device_gpu_resume(adev, &device_list, false);
	amdgpu_device_recovery_put_reset_lock(adev, &device_list);
	adev->pcie_reset_ctx.occurs_dpc = false;

	if (hive) {
+7 −23
Original line number Diff line number Diff line
@@ -41,22 +41,6 @@
#include "amdgpu_trace.h"
#include "amdgpu_reset.h"

/*
 * Fences mark an event in the GPUs pipeline and are used
 * for GPU/CPU synchronization.  When the fence is written,
 * it is expected that all buffers associated with that fence
 * are no longer in use by the associated ring on the GPU and
 * that the relevant GPU caches have been flushed.
 */

struct amdgpu_fence {
	struct dma_fence base;

	/* RB, DMA, etc. */
	struct amdgpu_ring		*ring;
	ktime_t				start_timestamp;
};

static struct kmem_cache *amdgpu_fence_slab;

int amdgpu_fence_slab_init(void)
@@ -151,12 +135,12 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f, struct amd
		am_fence = kmem_cache_alloc(amdgpu_fence_slab, GFP_ATOMIC);
		if (am_fence == NULL)
			return -ENOMEM;
		fence = &am_fence->base;
		am_fence->ring = ring;
	} else {
		/* take use of job-embedded fence */
		fence = &job->hw_fence;
		am_fence = &job->hw_fence;
	}
	fence = &am_fence->base;
	am_fence->ring = ring;

	seq = ++ring->fence_drv.sync_seq;
	if (job && job->job_run_counter) {
@@ -718,7 +702,7 @@ void amdgpu_fence_driver_clear_job_fences(struct amdgpu_ring *ring)
			 * it right here or we won't be able to track them in fence_drv
			 * and they will remain unsignaled during sa_bo free.
			 */
			job = container_of(old, struct amdgpu_job, hw_fence);
			job = container_of(old, struct amdgpu_job, hw_fence.base);
			if (!job->base.s_fence && !dma_fence_is_signaled(old))
				dma_fence_signal(old);
			RCU_INIT_POINTER(*ptr, NULL);
@@ -780,7 +764,7 @@ static const char *amdgpu_fence_get_timeline_name(struct dma_fence *f)

static const char *amdgpu_job_fence_get_timeline_name(struct dma_fence *f)
{
	struct amdgpu_job *job = container_of(f, struct amdgpu_job, hw_fence);
	struct amdgpu_job *job = container_of(f, struct amdgpu_job, hw_fence.base);

	return (const char *)to_amdgpu_ring(job->base.sched)->name;
}
@@ -810,7 +794,7 @@ static bool amdgpu_fence_enable_signaling(struct dma_fence *f)
 */
static bool amdgpu_job_fence_enable_signaling(struct dma_fence *f)
{
	struct amdgpu_job *job = container_of(f, struct amdgpu_job, hw_fence);
	struct amdgpu_job *job = container_of(f, struct amdgpu_job, hw_fence.base);

	if (!timer_pending(&to_amdgpu_ring(job->base.sched)->fence_drv.fallback_timer))
		amdgpu_fence_schedule_fallback(to_amdgpu_ring(job->base.sched));
@@ -845,7 +829,7 @@ static void amdgpu_job_fence_free(struct rcu_head *rcu)
	struct dma_fence *f = container_of(rcu, struct dma_fence, rcu);

	/* free job if fence has a parent job */
	kfree(container_of(f, struct amdgpu_job, hw_fence));
	kfree(container_of(f, struct amdgpu_job, hw_fence.base));
}

/**
+6 −6
Original line number Diff line number Diff line
@@ -272,8 +272,8 @@ void amdgpu_job_free_resources(struct amdgpu_job *job)
	/* Check if any fences where initialized */
	if (job->base.s_fence && job->base.s_fence->finished.ops)
		f = &job->base.s_fence->finished;
	else if (job->hw_fence.ops)
		f = &job->hw_fence;
	else if (job->hw_fence.base.ops)
		f = &job->hw_fence.base;
	else
		f = NULL;

@@ -290,10 +290,10 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
	amdgpu_sync_free(&job->explicit_sync);

	/* only put the hw fence if has embedded fence */
	if (!job->hw_fence.ops)
	if (!job->hw_fence.base.ops)
		kfree(job);
	else
		dma_fence_put(&job->hw_fence);
		dma_fence_put(&job->hw_fence.base);
}

void amdgpu_job_set_gang_leader(struct amdgpu_job *job,
@@ -322,10 +322,10 @@ void amdgpu_job_free(struct amdgpu_job *job)
	if (job->gang_submit != &job->base.s_fence->scheduled)
		dma_fence_put(job->gang_submit);

	if (!job->hw_fence.ops)
	if (!job->hw_fence.base.ops)
		kfree(job);
	else
		dma_fence_put(&job->hw_fence);
		dma_fence_put(&job->hw_fence.base);
}

struct dma_fence *amdgpu_job_submit(struct amdgpu_job *job)
Loading