Commit e55ef655 authored by Simona Vetter
Browse files

Merge tag 'amd-drm-next-6.12-2024-08-26' of...

Merge tag 'amd-drm-next-6.12-2024-08-26' of https://gitlab.freedesktop.org/agd5f/linux into drm-next

amd-drm-next-6.12-2024-08-26:

amdgpu:
- SDMA devcoredump support
- DCN 4.0.1 updates
- DC SUBVP fixes
- Refactor OPP in DC
- Refactor MMHUBBUB in DC
- DC DML 2.1 updates
- DC FAMS2 updates
- RAS updates
- GFX12 updates
- VCN 4.0.3 updates
- JPEG 4.0.3 updates
- Enable wave kill (soft recovery) for compute queues
- Clean up CP error interrupt handling
- Enable CP bad opcode interrupts
- VCN 4.x fixes
- VCN 5.x fixes
- GPU reset fixes
- Fix vbios embedded EDID size handling
- SMU 14.x updates
- Misc code cleanups and spelling fixes
- VCN devcoredump support
- ISP MFD i2c support
- DC vblank fixes
- GFX 12 fixes
- PSR fixes
- Convert vbios embedded EDID to drm_edid
- DCN 3.5 updates
- DMCUB updates
- Cursor fixes
- Overdrive support for SMU 14.x
- GFX CP padding optimizations
- DCC fixes
- DSC fixes
- Preliminary per queue reset infrastructure
- Initial per queue reset support for GFX 9
- Initial per queue reset support for GFX 7, 8
- DCN 3.2 fixes
- DP MST fixes
- SR-IOV fixes
- GFX 9.4.3/4 devcoredump support
- Add process isolation framework
- Enable process isolation support for GFX 9.4.3/4
- Take IOMMU remapping into account for P2P DMA checks

amdkfd:
- CRIU fixes
- Improved input validation for user queues
- HMM fix
- Enable process isolation support for GFX 9.4.3/4
- Initial per queue reset support for GFX 9
- Allow users to target recommended SDMA engines

radeon:
- remove .load and drm_dev_alloc
- Fix vbios embedded EDID size handling
- Convert vbios embedded EDID to drm_edid
- Use GEM references instead of TTM
- r100 cp init cleanup
- Fix potential overflows in evergreen CS offset tracking

UAPI:
- KFD support for targeting queues on recommended SDMA engines
  Proposed userspace:
  https://github.com/ROCm/ROCR-Runtime/commit/2f588a24065f41c208c3701945e20be746d8faf7
  https://github.com/ROCm/ROCR-Runtime/commit/eb30a5bbc7719c6ffcf2d2dd2878bc53a47b3f30

drm/buddy:
- Add start address support for trim function

From: Alex Deucher <alexander.deucher@amd.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240826201528.55307-1-alexander.deucher@amd.com
parents 4461e9e5 3376f922
Loading
Loading
Loading
Loading
+9 −23
Original line number Diff line number Diff line
@@ -118,6 +118,8 @@

#define MAX_GPU_INSTANCE		64

#define GFX_SLICE_PERIOD		msecs_to_jiffies(250)

struct amdgpu_gpu_instance {
	struct amdgpu_device		*adev;
	int				mgpu_fan_enabled;
@@ -348,7 +350,7 @@ enum amdgpu_kiq_irq {
	AMDGPU_CP_KIQ_IRQ_LAST
};
#define SRIOV_USEC_TIMEOUT  1200000 /* wait 12 * 100ms for SRIOV */
#define MAX_KIQ_REG_WAIT (amdgpu_sriov_vf(adev) ? 50000 : 5000) /* in usecs, extend for VF */
#define MAX_KIQ_REG_WAIT       5000 /* in usecs, 5ms */
#define MAX_KIQ_REG_BAILOUT_INTERVAL   5 /* in msecs, 5ms */
#define MAX_KIQ_REG_TRY 1000

@@ -823,17 +825,6 @@ struct amdgpu_mqd {
struct amdgpu_reset_domain;
struct amdgpu_fru_info;

/*
 * Bookkeeping for the register dump taken around a reset, plus an
 * optional pointer to coredump information.
 */
struct amdgpu_reset_info {
	/* reset dump register */
	u32 *reset_dump_reg_list;
	u32 *reset_dump_reg_value;
	/* presumably the number of entries in the two arrays above -- TODO confirm */
	int num_regs;

	/* this member only exists when CONFIG_DEV_COREDUMP is defined */
#ifdef CONFIG_DEV_COREDUMP
	struct amdgpu_coredump_info *coredump_info;
#endif
};

/*
 * Non-zero (true) if the GPU has VRAM. Zero (false) otherwise.
 */
@@ -1157,8 +1148,6 @@ struct amdgpu_device {

	struct mutex			benchmark_mutex;

	struct amdgpu_reset_info	reset_info;

	bool                            scpm_enabled;
	uint32_t                        scpm_status;

@@ -1175,6 +1164,10 @@ struct amdgpu_device {
	bool                            debug_disable_soft_recovery;
	bool                            debug_use_vram_fw_buf;
	bool                            debug_enable_ras_aca;

	bool				enforce_isolation[MAX_XCP];
	/* Added this mutex for cleaner shader isolation between GFX and compute processes */
	struct mutex                    enforce_isolation_mutex;
};

static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev,
@@ -1587,13 +1580,6 @@ static inline bool amdgpu_acpi_is_s3_active(struct amdgpu_device *adev) { return
static inline void amdgpu_choose_low_power_state(struct amdgpu_device *adev) { }
#endif

/*
 * amdgpu_dm_display_resume() is only declared as an external function when
 * CONFIG_DRM_AMD_DC is defined; otherwise an inline stub that returns 0
 * takes its place.
 */
#if defined(CONFIG_DRM_AMD_DC)
int amdgpu_dm_display_resume(struct amdgpu_device *adev);
#else
static inline int amdgpu_dm_display_resume(struct amdgpu_device *adev)
{
	return 0;
}
#endif


void amdgpu_register_gpu_instance(struct amdgpu_device *adev);
void amdgpu_unregister_gpu_instance(struct amdgpu_device *adev);

+13 −3
Original line number Diff line number Diff line
@@ -80,6 +80,9 @@ static void aca_banks_release(struct aca_banks *banks)
{
	struct aca_bank_node *node, *tmp;

	if (list_empty(&banks->list))
		return;

	list_for_each_entry_safe(node, tmp, &banks->list, node) {
		list_del(&node->node);
		kvfree(node);
@@ -453,13 +456,13 @@ static int aca_log_aca_error_data(struct aca_bank_error *bank_error, enum aca_er

	switch (type) {
	case ACA_ERROR_TYPE_UE:
		amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, NULL, count);
		amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, count);
		break;
	case ACA_ERROR_TYPE_CE:
		amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, NULL, count);
		amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, count);
		break;
	case ACA_ERROR_TYPE_DEFERRED:
		amdgpu_ras_error_statistic_de_count(err_data, &mcm_info, NULL, count);
		amdgpu_ras_error_statistic_de_count(err_data, &mcm_info, count);
		break;
	default:
		break;
@@ -562,9 +565,13 @@ static void aca_error_fini(struct aca_error *aerr)
	struct aca_bank_error *bank_error, *tmp;

	/*
	 * Remove every bank error still linked on aerr->list while holding
	 * aerr->lock, then release the lock before it is destroyed.
	 */
	mutex_lock(&aerr->lock);
	if (list_empty(&aerr->list))
		goto out_unlock;

	list_for_each_entry_safe(bank_error, tmp, &aerr->list, node)
		aca_bank_error_remove(aerr, bank_error);

out_unlock:
	/* mutex_destroy() must not be called on a mutex that is still locked */
	mutex_unlock(&aerr->lock);
	mutex_destroy(&aerr->lock);
}

@@ -680,6 +687,9 @@ static void aca_manager_fini(struct aca_handle_manager *mgr)
{
	struct aca_handle *handle, *tmp;

	if (list_empty(&mgr->list))
		return;

	list_for_each_entry_safe(handle, tmp, &mgr->list, node)
		amdgpu_aca_remove_handle(handle);
}
+25 −23
Original line number Diff line number Diff line
@@ -364,15 +364,15 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct amdgpu_device *adev, size_t size,
	return r;
}

void amdgpu_amdkfd_free_gtt_mem(struct amdgpu_device *adev, void *mem_obj)
void amdgpu_amdkfd_free_gtt_mem(struct amdgpu_device *adev, void **mem_obj)
{
	struct amdgpu_bo *bo = (struct amdgpu_bo *) mem_obj;
	struct amdgpu_bo **bo = (struct amdgpu_bo **) mem_obj;

	amdgpu_bo_reserve(bo, true);
	amdgpu_bo_kunmap(bo);
	amdgpu_bo_unpin(bo);
	amdgpu_bo_unreserve(bo);
	amdgpu_bo_unref(&(bo));
	amdgpu_bo_reserve(*bo, true);
	amdgpu_bo_kunmap(*bo);
	amdgpu_bo_unpin(*bo);
	amdgpu_bo_unreserve(*bo);
	amdgpu_bo_unref(bo);
}

int amdgpu_amdkfd_alloc_gws(struct amdgpu_device *adev, size_t size,
@@ -783,22 +783,6 @@ int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev,
	return 0;
}

/*
 * Query the UTCL2 poison status of one hub instance.
 *
 * A zero @hub_type selects the adev->gfxhub callbacks; any non-zero value
 * selects the adev->mmhub callbacks. Returns false when the selected hub
 * does not provide a query_utcl2_poison_status callback, otherwise returns
 * what that callback reports for @hub_inst.
 */
bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev,
			int hub_inst, int hub_type)
{
	if (hub_type) {
		if (!adev->mmhub.funcs->query_utcl2_poison_status)
			return false;

		return adev->mmhub.funcs->query_utcl2_poison_status(adev, hub_inst);
	}

	if (!adev->gfxhub.funcs->query_utcl2_poison_status)
		return false;

	return adev->gfxhub.funcs->query_utcl2_poison_status(adev, hub_inst);
}

int amdgpu_amdkfd_check_and_lock_kfd(struct amdgpu_device *adev)
{
	return kgd2kfd_check_and_lock_kfd();
@@ -887,3 +871,21 @@ int amdgpu_amdkfd_unmap_hiq(struct amdgpu_device *adev, u32 doorbell_off,

	return r;
}

/*
 * Stop scheduling on KFD for the node given by @node_id.
 *
 * Returns 0 without calling into KFD when adev->kfd.init_complete is not
 * set; otherwise returns the result of kgd2kfd_stop_sched().
 */
int amdgpu_amdkfd_stop_sched(struct amdgpu_device *adev, uint32_t node_id)
{
	int ret = 0;

	if (adev->kfd.init_complete)
		ret = kgd2kfd_stop_sched(adev->kfd.dev, node_id);

	return ret;
}

/*
 * Start scheduling on KFD for the node given by @node_id.
 *
 * Returns 0 without calling into KFD when adev->kfd.init_complete is not
 * set; otherwise returns the result of kgd2kfd_start_sched().
 */
int amdgpu_amdkfd_start_sched(struct amdgpu_device *adev, uint32_t node_id)
{
	int ret = 0;

	if (adev->kfd.init_complete)
		ret = kgd2kfd_start_sched(adev->kfd.dev, node_id);

	return ret;
}
+17 −5
Original line number Diff line number Diff line
@@ -235,7 +235,7 @@ int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo,
int amdgpu_amdkfd_alloc_gtt_mem(struct amdgpu_device *adev, size_t size,
				void **mem_obj, uint64_t *gpu_addr,
				void **cpu_ptr, bool mqd_gfx9);
void amdgpu_amdkfd_free_gtt_mem(struct amdgpu_device *adev, void *mem_obj);
void amdgpu_amdkfd_free_gtt_mem(struct amdgpu_device *adev, void **mem_obj);
int amdgpu_amdkfd_alloc_gws(struct amdgpu_device *adev, size_t size,
				void **mem_obj);
void amdgpu_amdkfd_free_gws(struct amdgpu_device *adev, void *mem_obj);
@@ -264,6 +264,8 @@ int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev,
					uint32_t *payload);
int amdgpu_amdkfd_unmap_hiq(struct amdgpu_device *adev, u32 doorbell_off,
				u32 inst);
int amdgpu_amdkfd_start_sched(struct amdgpu_device *adev, uint32_t node_id);
int amdgpu_amdkfd_stop_sched(struct amdgpu_device *adev, uint32_t node_id);

/* Read user wptr from a specified user address space with page fault
 * disabled. The memory must be pinned and mapped to the hardware when
@@ -322,7 +324,7 @@ int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_mem *mem,
					     void **kptr, uint64_t *size);
void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct kgd_mem *mem);

int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo);
int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo, struct amdgpu_bo **bo_gart);

int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info,
					    struct dma_fence __rcu **ef);
@@ -345,11 +347,9 @@ void amdgpu_amdkfd_ras_pasid_poison_consumption_handler(struct amdgpu_device *ad
			pasid_notify pasid_fn, void *data, uint32_t reset);

bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev);
bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem);
bool amdgpu_amdkfd_bo_mapped_to_dev(void *drm_priv, struct kgd_mem *mem);
void amdgpu_amdkfd_block_mmu_notifications(void *p);
int amdgpu_amdkfd_criu_resume(void *p);
bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev,
			int hub_inst, int hub_type);
int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
		uint64_t size, u32 alloc_flag, int8_t xcp_id);
void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev,
@@ -426,6 +426,8 @@ void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd);
void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint64_t throttle_bitmask);
int kgd2kfd_check_and_lock_kfd(void);
void kgd2kfd_unlock_kfd(void);
int kgd2kfd_start_sched(struct kfd_dev *kfd, uint32_t node_id);
int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id);
#else
static inline int kgd2kfd_init(void)
{
@@ -496,5 +498,15 @@ static inline int kgd2kfd_check_and_lock_kfd(void)
static inline void kgd2kfd_unlock_kfd(void)
{
}

/* Inline stub of kgd2kfd_start_sched(): ignores @kfd and @node_id and always returns 0. */
static inline int kgd2kfd_start_sched(struct kfd_dev *kfd, uint32_t node_id)
{
	return 0;
}

/* Inline stub of kgd2kfd_stop_sched(): ignores @kfd and @node_id and always returns 0. */
static inline int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id)
{
	return 0;
}
#endif
#endif /* AMDGPU_AMDKFD_H_INCLUDED */
+2 −0
Original line number Diff line number Diff line
@@ -191,4 +191,6 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
	.hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr,
	.hqd_reset = kgd_gfx_v9_hqd_reset,
};
Loading