Commit 5edfd7d9 authored by Dave Airlie
Browse files

Merge tag 'amd-drm-next-6.8-2023-12-01' of https://gitlab.freedesktop.org/agd5f/linux into drm-next

amd-drm-next-6.8-2023-12-01:

amdgpu:
- Add new 64 bit sequence number infrastructure.
  This will ultimately be used for user queue synchronization.
- GPUVM updates
- Misc code cleanups
- RAS updates
- DCN 3.5 updates
- Rework PCIe link speed handling
- Document GPU reset types
- DMUB fixes
- eDP fixes
- NBIO 7.9 updates
- NBIO 7.11 updates
- SubVP updates
- DCN 3.1.4 fixes
- ABM fixes
- AGP aperture fix
- DCN 3.1.5 fix
- Fix some potential error path memory leaks
- Enable PCIe PMEs
- Add XGMI, PCIe state dumping for aqua vanjaram
- GFX11 golden register updates
- Misc display fixes

amdkfd:
- Migrate TLB flushing logic to amdgpu
- Trap handler fixes
- Fix restore workers handling on suspend and reset
- Fix possible memory leak in pqm_uninit()

radeon:
- Fix some possible overflows in command buffer checking
- Check for errors in ring_lock

From: Alex Deucher <alexander.deucher@amd.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20231201181743.5313-1-alexander.deucher@amd.com


Signed-off-by: Dave Airlie <airlied@redhat.com>
parents a13fee31 b719a9c1
Loading
Loading
Loading
Loading
+41 −0
Original line number Diff line number Diff line
@@ -75,3 +75,44 @@ change in real-time by using something like::

When reporting a bug related to DC, consider attaching this log before and
after you reproduce the bug.

DMUB Firmware Debug
===================

Sometimes, dmesg logs aren't enough. This is especially true if a feature is
implemented primarily in DMUB firmware. In such cases, all we see in dmesg when
an issue arises is some generic timeout error. So, to get more relevant
information, we can trace DMUB commands by enabling the relevant bits in
`amdgpu_dm_dmub_trace_mask`.

Currently, we support the tracing of the following groups:

Trace Groups
------------

.. csv-table::
   :header-rows: 1
   :widths: 1, 1
   :file: ./trace-groups-table.csv

**Note: Not all ASICs support all of the listed trace groups**

So, to enable just PSR tracing you can use the following command::

  # echo 0x8020 > /sys/kernel/debug/dri/0/amdgpu_dm_dmub_trace_mask

Then, you need to enable logging trace events to the buffer, which you can do
using the following::

  # echo 1 > /sys/kernel/debug/dri/0/amdgpu_dm_dmcub_trace_event_en

Lastly, after you are able to reproduce the issue you are trying to debug,
you can disable tracing and read the trace log by using the following::

  # echo 0 > /sys/kernel/debug/dri/0/amdgpu_dm_dmcub_trace_event_en
  # cat /sys/kernel/debug/dri/0/amdgpu_dm_dmub_tracebuffer

So, when reporting bugs related to features such as PSR and ABM, consider
enabling the relevant bits in the mask before reproducing the issue and
attaching the log that you obtain from the trace buffer to your bug report.
+29 −0
Original line number Diff line number Diff line
Name, Mask Value
INFO, 0x1
IRQ SVC, 0x2
VBIOS, 0x4
REGISTER, 0x8
PHY DBG, 0x10
PSR, 0x20
AUX, 0x40
SMU, 0x80
MALL, 0x100
ABM, 0x200
ALPM, 0x400
TIMER, 0x800
HW LOCK MGR, 0x1000
INBOX1, 0x2000
PHY SEQ, 0x4000
PSR STATE, 0x8000
ZSTATE, 0x10000
TRANSMITTER CTL, 0x20000
PANEL CNTL, 0x40000
FAMS, 0x80000
DPIA, 0x100000
SUBVP, 0x200000
INBOX0, 0x400000
SDP, 0x4000000
REPLAY, 0x8000000
REPLAY RESIDENCY, 0x20000000
CURSOR INFO, 0x80000000
IPS, 0x100000000
+1 −1
Original line number Diff line number Diff line
@@ -80,7 +80,7 @@ amdgpu-y += amdgpu_device.o amdgpu_doorbell_mgr.o amdgpu_kms.o \
	amdgpu_umc.o smu_v11_0_i2c.o amdgpu_fru_eeprom.o amdgpu_rap.o \
	amdgpu_fw_attestation.o amdgpu_securedisplay.o \
	amdgpu_eeprom.o amdgpu_mca.o amdgpu_psp_ta.o amdgpu_lsdma.o \
	amdgpu_ring_mux.o amdgpu_xcp.o
	amdgpu_ring_mux.o amdgpu_xcp.o amdgpu_seq64.o

amdgpu-$(CONFIG_PROC_FS) += amdgpu_fdinfo.o

+35 −0
Original line number Diff line number Diff line
@@ -109,6 +109,8 @@
#include "amdgpu_mca.h"
#include "amdgpu_ras.h"
#include "amdgpu_xcp.h"
#include "amdgpu_seq64.h"
#include "amdgpu_reg_state.h"

#define MAX_GPU_INSTANCE		64

@@ -468,6 +470,7 @@ struct amdgpu_fpriv {
	struct amdgpu_vm	vm;
	struct amdgpu_bo_va	*prt_va;
	struct amdgpu_bo_va	*csa_va;
	struct amdgpu_bo_va	*seq64_va;
	struct mutex		bo_list_lock;
	struct idr		bo_list_handles;
	struct amdgpu_ctx_mgr	ctx_mgr;
@@ -506,6 +509,31 @@ struct amdgpu_allowed_register_entry {
	bool grbm_indexed;
};

/**
 * enum amd_reset_method - Methods for resetting AMD GPU devices
 *
 * @AMD_RESET_METHOD_NONE: The device will not be reset.
 * @AMD_RESET_METHOD_LEGACY: Method reserved for SI, CIK and VI ASICs.
 * @AMD_RESET_METHOD_MODE0: Reset the entire ASIC. Not currently available
 *                          for any device.
 * @AMD_RESET_METHOD_MODE1: Resets all IP blocks on the ASIC (SDMA, GFX, VCN,
 *                          etc.) individually. Suitable only for some
 *                          discrete GPUs, not available for all ASICs.
 * @AMD_RESET_METHOD_MODE2: Resets a lesser level of IPs compared to MODE1.
 *                          Which IPs are reset depends on the ASIC. Notably
 *                          doesn't reset IPs shared with the CPU on APUs or
 *                          the memory controllers (so VRAM is not lost). Not
 *                          available on all ASICs.
 * @AMD_RESET_METHOD_BACO: BACO (Bus Alive, Chip Off) method powers off and on
 *                         the card but without powering off the PCI bus.
 *                         Suitable only for discrete GPUs.
 * @AMD_RESET_METHOD_PCI: Does a full bus reset using core Linux subsystem PCI
 *                        reset and does a secondary bus reset or FLR,
 *                        depending on what the underlying hardware supports.
 *
 * Methods available for AMD GPU driver for resetting the device. Not all
 * methods are suitable for every device. User can override the method using
 * module parameter `reset_method`.
 */
enum amd_reset_method {
	AMD_RESET_METHOD_NONE = -1,
	AMD_RESET_METHOD_LEGACY = 0,
@@ -585,6 +613,10 @@ struct amdgpu_asic_funcs {
				  const struct amdgpu_video_codecs **codecs);
	/* encode "> 32bits" smn addressing */
	u64 (*encode_ext_smn_addressing)(int ext_id);

	ssize_t (*get_reg_state)(struct amdgpu_device *adev,
				 enum amdgpu_reg_state reg_state, void *buf,
				 size_t max_size);
};

/*
@@ -986,6 +1018,9 @@ struct amdgpu_device {
	/* GDS */
	struct amdgpu_gds		gds;

	/* for userq and VM fences */
	struct amdgpu_seq64		seq64;

	/* KFD */
	struct amdgpu_kfd_dev		kfd;

+1 −30
Original line number Diff line number Diff line
@@ -547,7 +547,7 @@ int amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(struct amdgpu_device *dst,
	struct amdgpu_device *adev = dst, *peer_adev;
	int num_links;

	if (adev->asic_type != CHIP_ALDEBARAN)
	if (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(9, 4, 2))
		return 0;

	if (src)
@@ -710,35 +710,6 @@ bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev, u32 vmid)
	return false;
}

/**
 * amdgpu_amdkfd_flush_gpu_tlb_vmid - request a GPU TLB flush for one VMID
 * @adev: amdgpu device; its family and vmhubs_mask select the hubs to flush
 * @vmid: VMID passed through to amdgpu_gmc_flush_gpu_tlb()
 *
 * For AMDGPU_FAMILY_AI devices a flush is issued on every hub whose bit is
 * set in @adev->vmhubs_mask. For all other families a single flush is issued
 * on AMDGPU_GFXHUB(0). The final argument to amdgpu_gmc_flush_gpu_tlb() is
 * always 0 (NOTE(review): presumably the flush type - confirm against the
 * GMC helper).
 *
 * Return: always 0.
 */
int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct amdgpu_device *adev,
				     uint16_t vmid)
{
	int hub;

	/* Every family except AI only needs the first GFX hub flushed. */
	if (adev->family != AMDGPU_FAMILY_AI) {
		amdgpu_gmc_flush_gpu_tlb(adev, vmid, AMDGPU_GFXHUB(0), 0);
		return 0;
	}

	/* AI family: walk each hub flagged in the device's hub mask. */
	for_each_set_bit(hub, adev->vmhubs_mask, AMDGPU_MAX_VMHUBS)
		amdgpu_gmc_flush_gpu_tlb(adev, vmid, hub, 0);

	return 0;
}

/**
 * amdgpu_amdkfd_flush_gpu_tlb_pasid - request a GPU TLB flush for one PASID
 * @adev: amdgpu device; its family decides whether all hubs are requested
 * @pasid: PASID passed through to amdgpu_gmc_flush_gpu_tlb_pasid()
 * @flush_type: flush type passed through unchanged
 * @inst: instance selector passed through unchanged
 *
 * Thin wrapper around amdgpu_gmc_flush_gpu_tlb_pasid(). The all-hub flag is
 * set only for the AMDGPU_FAMILY_AI and AMDGPU_FAMILY_RV families.
 *
 * Return: whatever amdgpu_gmc_flush_gpu_tlb_pasid() returns.
 */
int amdgpu_amdkfd_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
				      uint16_t pasid,
				      enum TLB_FLUSH_TYPE flush_type,
				      uint32_t inst)
{
	/* Only the AI and RV families ask for a flush across all hubs. */
	const bool flush_all_hubs = adev->family == AMDGPU_FAMILY_AI ||
				    adev->family == AMDGPU_FAMILY_RV;

	return amdgpu_gmc_flush_gpu_tlb_pasid(adev, pasid, flush_type,
					      flush_all_hubs, inst);
}

bool amdgpu_amdkfd_have_atomics_support(struct amdgpu_device *adev)
{
	return adev->have_atomics_support;
Loading