Commit a82866fb authored by Dave Airlie
Browse files

Merge tag 'amd-drm-next-6.15-2025-03-21' of...

Merge tag 'amd-drm-next-6.15-2025-03-21' of https://gitlab.freedesktop.org/agd5f/linux into drm-next

amd-drm-next-6.15-2025-03-21:

amdgpu:
- Refine nomodeset handling
- RAS fixes
- DCN 3.x fixes
- DMUB fixes
- eDP fixes
- SMU 14.0.2 fixes
- SMU 13.0.6 fixes
- SMU 13.0.12 fixes
- SDMA engine reset fixes
- Enforce Isolation fixes
- Runtime workload profile ref count fixes
- Documentation fixes
- SR-IOV fixes
- MES fixes
- GC 11.5 cleaner shader support
- SDMA VM invalidation fixes
- IP discovery improvements for GC based chips

amdkfd:
- Dequeue wait count fixes
- Precise memops fixes

radeon:
- Code cleanup

Signed-off-by: Dave Airlie <airlied@redhat.com>

From: Alex Deucher <alexander.deucher@amd.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250321210909.2809595-1-alexander.deucher@amd.com
parents f72e21ea 7547510d
Loading
Loading
Loading
Loading
+11 −2
Original line number Diff line number Diff line
@@ -1194,9 +1194,15 @@ struct amdgpu_device {
	bool                            debug_exp_resets;
	bool                            debug_disable_gpu_ring_reset;

	bool				enforce_isolation[MAX_XCP];
	/* Added this mutex for cleaner shader isolation between GFX and compute processes */
	/* Protection for the following isolation structure */
	struct mutex                    enforce_isolation_mutex;
	bool				enforce_isolation[MAX_XCP];
	struct amdgpu_isolation {
		void			*owner;
		struct dma_fence	*spearhead;
		struct amdgpu_sync	active;
		struct amdgpu_sync	prev;
	} isolation[MAX_XCP];

	struct amdgpu_init_level *init_lvl;

@@ -1482,6 +1488,9 @@ void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev);
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang);
struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev,
						  struct amdgpu_ring *ring,
						  struct amdgpu_job *job);
bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev);
ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring);
ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset);
+23 −2
Original line number Diff line number Diff line
@@ -391,6 +391,7 @@ static void aca_banks_generate_cper(struct amdgpu_device *adev,
{
	struct aca_bank_node *node;
	struct aca_bank *bank;
	int r;

	if (!adev->cper.enabled)
		return;
@@ -402,11 +403,27 @@ static void aca_banks_generate_cper(struct amdgpu_device *adev,

	/* UEs must be encoded into separate CPER entries */
	if (type == ACA_SMU_TYPE_UE) {
		struct aca_banks de_banks;

		aca_banks_init(&de_banks);
		list_for_each_entry(node, &banks->list, node) {
			bank = &node->bank;
			if (bank->aca_err_type == ACA_ERROR_TYPE_DEFERRED) {
				r = aca_banks_add_bank(&de_banks, bank);
				if (r)
					dev_warn(adev->dev, "fail to add de banks, ret = %d\n", r);
			} else {
				if (amdgpu_cper_generate_ue_record(adev, bank))
					dev_warn(adev->dev, "fail to generate ue cper records\n");
			}
		}

		if (!list_empty(&de_banks.list)) {
			if (amdgpu_cper_generate_ce_records(adev, &de_banks, de_banks.nr_banks))
				dev_warn(adev->dev, "fail to generate de cper records\n");
		}

		aca_banks_release(&de_banks);
	} else {
		/*
		 * SMU_TYPE_CE banks are combined into 1 CPER entries,
@@ -541,6 +558,10 @@ static int __aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *h
	if (ret)
		return ret;

	/* DEs may be contained in CEs or UEs */
	if (type != ACA_ERROR_TYPE_DEFERRED)
		aca_log_aca_error(handle, ACA_ERROR_TYPE_DEFERRED, err_data);

	return aca_log_aca_error(handle, type, err_data);
}

+11 −5
Original line number Diff line number Diff line
@@ -76,12 +76,18 @@ struct ras_query_context;
#define mmSMNAID_XCD1_MCA_SMU		0x38430400	/* SMN AID XCD1 */
#define mmSMNXCD_XCD0_MCA_SMU		0x40430400	/* SMN XCD XCD0 */

/*
 * Nonzero when the bank's STATUS register value (regs[ACA_REG_IDX_STATUS])
 * satisfies either ACA_REG__STATUS__POISON() or ACA_REG__STATUS__DEFERRED().
 *
 * NOTE(review): presumably those helpers extract single status fields --
 * confirm against their definitions, which are not visible here.
 *
 * The argument is expanded twice, so it should be free of side effects.
 * "DEFFERED" is the identifier's spelling as written; it is kept because
 * other macros reference it by this name.
 */
#define ACA_BANK_ERR_IS_DEFFERED(bank)                                \
	(ACA_REG__STATUS__POISON((bank)->regs[ACA_REG_IDX_STATUS]) || \
	 ACA_REG__STATUS__DEFERRED((bank)->regs[ACA_REG_IDX_STATUS]))

#define ACA_BANK_ERR_CE_DE_DECODE(bank)                             \
	((ACA_REG__STATUS__POISON((bank)->regs[ACA_REG_IDX_STATUS]) ||   \
	  ACA_REG__STATUS__DEFERRED((bank)->regs[ACA_REG_IDX_STATUS])) ? \
		ACA_ERROR_TYPE_DEFERRED :                                \
	(ACA_BANK_ERR_IS_DEFFERED(bank) ? ACA_ERROR_TYPE_DEFERRED : \
					  ACA_ERROR_TYPE_CE)

/*
 * Classify a bank: evaluates to ACA_ERROR_TYPE_DEFERRED when
 * ACA_BANK_ERR_IS_DEFFERED(bank) holds, otherwise to ACA_ERROR_TYPE_UE.
 */
#define ACA_BANK_ERR_UE_DE_DECODE(bank)                             \
	(ACA_BANK_ERR_IS_DEFFERED(bank) ? ACA_ERROR_TYPE_DEFERRED : \
					  ACA_ERROR_TYPE_UE)

enum aca_reg_idx {
	ACA_REG_IDX_CTL			= 0,
	ACA_REG_IDX_STATUS		= 1,
+4 −4
Original line number Diff line number Diff line
@@ -491,7 +491,7 @@ static int vm_update_pds(struct amdgpu_vm *vm, struct amdgpu_sync *sync)
	if (ret)
		return ret;

	return amdgpu_sync_fence(sync, vm->last_update);
	return amdgpu_sync_fence(sync, vm->last_update, GFP_KERNEL);
}

static uint64_t get_pte_flags(struct amdgpu_device *adev, struct kgd_mem *mem)
@@ -1249,7 +1249,7 @@ static int unmap_bo_from_gpuvm(struct kgd_mem *mem,

	(void)amdgpu_vm_clear_freed(adev, vm, &bo_va->last_pt_update);

	(void)amdgpu_sync_fence(sync, bo_va->last_pt_update);
	(void)amdgpu_sync_fence(sync, bo_va->last_pt_update, GFP_KERNEL);

	return 0;
}
@@ -1273,7 +1273,7 @@ static int update_gpuvm_pte(struct kgd_mem *mem,
		return ret;
	}

	return amdgpu_sync_fence(sync, bo_va->last_pt_update);
	return amdgpu_sync_fence(sync, bo_va->last_pt_update, GFP_KERNEL);
}

static int map_bo_to_gpuvm(struct kgd_mem *mem,
@@ -2913,7 +2913,7 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu *
		}
		dma_resv_for_each_fence(&cursor, bo->tbo.base.resv,
					DMA_RESV_USAGE_KERNEL, fence) {
			ret = amdgpu_sync_fence(&sync_obj, fence);
			ret = amdgpu_sync_fence(&sync_obj, fence, GFP_KERNEL);
			if (ret) {
				pr_debug("Memory eviction: Sync BO fence failed. Try again\n");
				goto validate_map_fail;
+8 −7
Original line number Diff line number Diff line
@@ -455,10 +455,10 @@ static u32 amdgpu_cper_ring_get_ent_sz(struct amdgpu_ring *ring, u64 pos)
		return umin(rec_len, chunk);
}

void amdgpu_cper_ring_write(struct amdgpu_ring *ring,
					      void *src, int count)
void amdgpu_cper_ring_write(struct amdgpu_ring *ring, void *src, int count)
{
	u64 pos, wptr_old, rptr = *ring->rptr_cpu_addr & ring->ptr_mask;
	int rec_cnt_dw = count >> 2;
	u32 chunk, ent_sz;
	u8 *s = (u8 *)src;

@@ -485,6 +485,9 @@ void amdgpu_cper_ring_write(struct amdgpu_ring *ring,
		s += chunk;
	}

	if (ring->count_dw < rec_cnt_dw)
		ring->count_dw = 0;

	/* the buffer has overflowed, adjust rptr */
	if (((wptr_old < rptr) && (rptr <= ring->wptr)) ||
	    ((ring->wptr < wptr_old) && (wptr_old < rptr)) ||
@@ -501,12 +504,10 @@ void amdgpu_cper_ring_write(struct amdgpu_ring *ring,
			pos = rptr;
		} while (!amdgpu_cper_is_hdr(ring, rptr));
	}
	mutex_unlock(&ring->adev->cper.ring_lock);

	if (ring->count_dw >= (count >> 2))
		ring->count_dw -= (count >> 2);
	else
		ring->count_dw = 0;
	if (ring->count_dw >= rec_cnt_dw)
		ring->count_dw -= rec_cnt_dw;
	mutex_unlock(&ring->adev->cper.ring_lock);
}

static u64 amdgpu_cper_ring_get_rptr(struct amdgpu_ring *ring)
Loading