Commit 0ecf4aa3 authored by Dave Airlie
Browse files

Merge tag 'amd-drm-next-6.7-2023-10-20' of https://gitlab.freedesktop.org/agd5f/linux into drm-next



amd-drm-next-6.7-2023-10-20:

amdgpu:
- SMU 13 updates
- UMSCH updates
- DC MPO fixes
- RAS updates
- MES 11 fixes
- Fix possible memory leaks in error paths
- GC 11.5 fixes
- Kernel doc updates
- PSP updates
- APU IMU fixes
- Misc code cleanups
- SMU 11 fixes
- OD fix
- Frame size warning fixes
- SR-IOV fixes
- NBIO 7.11 updates
- NBIO 7.7 updates
- XGMI fixes
- devcoredump updates

amdkfd:
- Misc code cleanups
- SVM fixes

Signed-off-by: Dave Airlie <airlied@redhat.com>

From: Alex Deucher <alexander.deucher@amd.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20231020195043.4937-1-alexander.deucher@amd.com
parents 11ae5eb5 5b2c54e0
Loading
Loading
Loading
Loading
+12 −13
Original line number Diff line number Diff line
@@ -773,6 +773,17 @@ struct amdgpu_mqd {
struct amdgpu_reset_domain;
struct amdgpu_fru_info;

/*
 * Per-device state captured around a GPU reset: the list of registers to
 * dump, the values read back for them, and (when devcoredump support is
 * built in) a pointer to the coredump information.
 */
struct amdgpu_reset_info {
	/* reset dump register */
	/* Register offsets to read; each entry is passed to RREG32(). */
	u32 *reset_dump_reg_list;
	/* Values read back; entry i holds the value of reset_dump_reg_list[i]. */
	u32 *reset_dump_reg_value;
	/* Number of entries iterated in both arrays above. */
	int num_regs;

#ifdef CONFIG_DEV_COREDUMP
	/* Only present when CONFIG_DEV_COREDUMP is enabled. */
	struct amdgpu_coredump_info *coredump_info;
#endif
};

/*
 * Non-zero (true) if the GPU has VRAM. Zero (false) otherwise.
 */
@@ -1081,10 +1092,7 @@ struct amdgpu_device {

	struct mutex			benchmark_mutex;

	/* reset dump register */
	uint32_t                        *reset_dump_reg_list;
	uint32_t			*reset_dump_reg_value;
	int                             num_regs;
	struct amdgpu_reset_info	reset_info;

	bool                            scpm_enabled;
	uint32_t                        scpm_status;
@@ -1111,15 +1119,6 @@ static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev,
	return adev->ip_versions[ip][inst] & ~0xFFU;
}

#ifdef CONFIG_DEV_COREDUMP
/*
 * Snapshot handed to the devcoredump read callback as its opaque data
 * pointer. Only defined when CONFIG_DEV_COREDUMP is enabled.
 */
struct amdgpu_coredump_info {
	/* Device that was reset; used to reach its register dump arrays. */
	struct amdgpu_device		*adev;
	/* Copied from the offending job's VM task info, when a job/VM exists. */
	struct amdgpu_task_info         reset_task_info;
	/* Timestamp taken with ktime_get_ts64() when the coredump is created. */
	struct timespec64               reset_time;
	/* True if VRAM contents were lost due to the reset. */
	bool                            reset_vram_lost;
};
#endif

static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
{
	return container_of(ddev, struct amdgpu_device, ddev);
+5 −5
Original line number Diff line number Diff line
@@ -2016,8 +2016,8 @@ static ssize_t amdgpu_reset_dump_register_list_read(struct file *f,
	if (ret)
		return ret;

	for (i = 0; i < adev->num_regs; i++) {
		sprintf(reg_offset, "0x%x\n", adev->reset_dump_reg_list[i]);
	for (i = 0; i < adev->reset_info.num_regs; i++) {
		sprintf(reg_offset, "0x%x\n", adev->reset_info.reset_dump_reg_list[i]);
		up_read(&adev->reset_domain->sem);
		if (copy_to_user(buf + len, reg_offset, strlen(reg_offset)))
			return -EFAULT;
@@ -2074,9 +2074,9 @@ static ssize_t amdgpu_reset_dump_register_list_write(struct file *f,
	if (ret)
		goto error_free;

	swap(adev->reset_dump_reg_list, tmp);
	swap(adev->reset_dump_reg_value, new);
	adev->num_regs = i;
	swap(adev->reset_info.reset_dump_reg_list, tmp);
	swap(adev->reset_info.reset_dump_reg_value, new);
	adev->reset_info.num_regs = i;
	up_write(&adev->reset_domain->sem);
	ret = size;

+8 −88
Original line number Diff line number Diff line
@@ -32,8 +32,6 @@
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/devcoredump.h>
#include <generated/utsrelease.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>

@@ -3578,9 +3576,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
		if (adev->asic_reset_res)
			goto fail;

		if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops &&
		    adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
			adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev);
		amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB);
	} else {

		task_barrier_full(&hive->tb);
@@ -5050,90 +5046,16 @@ static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)

	lockdep_assert_held(&adev->reset_domain->sem);

	for (i = 0; i < adev->num_regs; i++) {
		adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]);
		trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i],
					     adev->reset_dump_reg_value[i]);
	}

	return 0;
}

#ifndef CONFIG_DEV_COREDUMP
/*
 * No-op variant compiled when CONFIG_DEV_COREDUMP is not defined, so
 * callers do not need their own #ifdef.
 */
static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
			    struct amdgpu_reset_context *reset_context)
{
}
#else
static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
		size_t count, void *data, size_t datalen)
{
	struct drm_printer p;
	struct amdgpu_coredump_info *coredump = data;
	struct drm_print_iterator iter;
	int i;

	iter.data = buffer;
	iter.offset = 0;
	iter.start = offset;
	iter.remain = count;

	p = drm_coredump_printer(&iter);

	drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
	drm_printf(&p, "kernel: " UTS_RELEASE "\n");
	drm_printf(&p, "module: " KBUILD_MODNAME "\n");
	drm_printf(&p, "time: %lld.%09ld\n", coredump->reset_time.tv_sec, coredump->reset_time.tv_nsec);
	if (coredump->reset_task_info.pid)
		drm_printf(&p, "process_name: %s PID: %d\n",
			   coredump->reset_task_info.process_name,
			   coredump->reset_task_info.pid);

	if (coredump->reset_vram_lost)
		drm_printf(&p, "VRAM is lost due to GPU reset!\n");
	if (coredump->adev->num_regs) {
		drm_printf(&p, "AMDGPU register dumps:\nOffset:     Value:\n");

		for (i = 0; i < coredump->adev->num_regs; i++)
			drm_printf(&p, "0x%08x: 0x%08x\n",
				   coredump->adev->reset_dump_reg_list[i],
				   coredump->adev->reset_dump_reg_value[i]);
	}
	for (i = 0; i < adev->reset_info.num_regs; i++) {
		adev->reset_info.reset_dump_reg_value[i] =
			RREG32(adev->reset_info.reset_dump_reg_list[i]);

	return count - iter.remain;
		trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i],
					     adev->reset_info.reset_dump_reg_value[i]);
	}

/*
 * Free callback registered with dev_coredumpm(): releases the
 * kzalloc()-ed coredump info that was passed in as @data.
 */
static void amdgpu_devcoredump_free(void *data)
{
	kfree(data);
}

static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
			    struct amdgpu_reset_context *reset_context)
{
	struct amdgpu_coredump_info *coredump;
	struct drm_device *dev = adev_to_drm(adev);

	coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT);

	if (!coredump) {
		DRM_ERROR("%s: failed to allocate memory for coredump\n", __func__);
		return;
	}

	coredump->reset_vram_lost = vram_lost;

	if (reset_context->job && reset_context->job->vm)
		coredump->reset_task_info = reset_context->job->vm->task_info;

	coredump->adev = adev;

	ktime_get_ts64(&coredump->reset_time);

	dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT,
		      amdgpu_devcoredump_read, amdgpu_devcoredump_free);
	return 0;
}
#endif

int amdgpu_do_asic_reset(struct list_head *device_list_handle,
			 struct amdgpu_reset_context *reset_context)
@@ -5201,9 +5123,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,

	if (!r && amdgpu_ras_intr_triggered()) {
		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
			if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops &&
			    tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
				tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev);
			amdgpu_ras_reset_error_count(tmp_adev, AMDGPU_RAS_BLOCK__MMHUB);
		}

		amdgpu_ras_intr_cleared();
+21 −0
Original line number Diff line number Diff line
@@ -29,6 +29,7 @@
#include "amdgpu_rlc.h"
#include "amdgpu_ras.h"
#include "amdgpu_xcp.h"
#include "amdgpu_xgmi.h"

/* delay 0.1 second to enable gfx off feature */
#define GFX_OFF_DELAY_ENABLE         msecs_to_jiffies(100)
@@ -501,6 +502,9 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev, int xcc_id)
{
	struct amdgpu_kiq *kiq = &adev->gfx.kiq[xcc_id];
	struct amdgpu_ring *kiq_ring = &kiq->ring;
	struct amdgpu_hive_info *hive;
	struct amdgpu_ras *ras;
	int hive_ras_recovery = 0;
	int i, r = 0;
	int j;

@@ -521,6 +525,23 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev, int xcc_id)
					   RESET_QUEUES, 0, 0);
	}

	/*
	 * This is a workaround: only skip the kiq_ring test
	 * during RAS recovery in the suspend stage for gfx9.4.3.
	 */
	hive = amdgpu_get_xgmi_hive(adev);
	if (hive) {
		hive_ras_recovery = atomic_read(&hive->ras_recovery);
		amdgpu_put_xgmi_hive(hive);
	}

	ras = amdgpu_ras_get_context(adev);
	if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3)) &&
		ras && (atomic_read(&ras->in_recovery) || hive_ras_recovery)) {
		spin_unlock(&kiq->ring_lock);
		return 0;
	}

	if (kiq_ring->sched.ready && !adev->job_hang)
		r = amdgpu_ring_test_helper(kiq_ring);
	spin_unlock(&kiq->ring_lock);
+38 −11
Original line number Diff line number Diff line
@@ -1267,6 +1267,8 @@ int psp_xgmi_initialize(struct psp_context *psp, bool set_extended_data, bool lo
	xgmi_cmd->cmd_id = TA_COMMAND_XGMI__INITIALIZE;

	ret = psp_xgmi_invoke(psp, xgmi_cmd->cmd_id);
	/* note down the capability flag for XGMI TA */
	psp->xgmi_context.xgmi_ta_caps = xgmi_cmd->caps_flag;

	return ret;
}
@@ -1388,7 +1390,7 @@ int psp_xgmi_get_topology_info(struct psp_context *psp,

	/* Fill in the shared memory with topology information as input */
	topology_info_input = &xgmi_cmd->xgmi_in_message.get_topology_info;
	xgmi_cmd->cmd_id = TA_COMMAND_XGMI__GET_GET_TOPOLOGY_INFO;
	xgmi_cmd->cmd_id = TA_COMMAND_XGMI__GET_TOPOLOGY_INFO;
	topology_info_input->num_nodes = number_devices;

	for (i = 0; i < topology_info_input->num_nodes; i++) {
@@ -1399,7 +1401,7 @@ int psp_xgmi_get_topology_info(struct psp_context *psp,
	}

	/* Invoke xgmi ta to get the topology information */
	ret = psp_xgmi_invoke(psp, TA_COMMAND_XGMI__GET_GET_TOPOLOGY_INFO);
	ret = psp_xgmi_invoke(psp, TA_COMMAND_XGMI__GET_TOPOLOGY_INFO);
	if (ret)
		return ret;

@@ -1424,28 +1426,53 @@ int psp_xgmi_get_topology_info(struct psp_context *psp,

	/* Invoke xgmi ta again to get the link information */
	if (psp_xgmi_peer_link_info_supported(psp)) {
		struct ta_xgmi_cmd_get_peer_link_info_output *link_info_output;
		struct ta_xgmi_cmd_get_peer_link_info *link_info_output;
		struct ta_xgmi_cmd_get_extend_peer_link_info *link_extend_info_output;
		bool requires_reflection =
			(psp->xgmi_context.supports_extended_data &&
			 get_extended_data) ||
			amdgpu_ip_version(psp->adev, MP0_HWIP, 0) ==
				IP_VERSION(13, 0, 6);
		bool ta_port_num_support = psp->xgmi_context.xgmi_ta_caps &
						EXTEND_PEER_LINK_INFO_CMD_FLAG;

		xgmi_cmd->cmd_id = TA_COMMAND_XGMI__GET_PEER_LINKS;
		/* populate the shared output buffer rather than the cmd input buffer
		 * with node_ids as the input for GET_PEER_LINKS command execution.
		 * This is required for GET_PEER_LINKS per xgmi ta implementation.
		 * The same requirement for GET_EXTEND_PEER_LINKS command.
		 */
		if (ta_port_num_support) {
			link_extend_info_output = &xgmi_cmd->xgmi_out_message.get_extend_link_info;

		ret = psp_xgmi_invoke(psp, TA_COMMAND_XGMI__GET_PEER_LINKS);
			for (i = 0; i < topology->num_nodes; i++)
				link_extend_info_output->nodes[i].node_id = topology->nodes[i].node_id;

			link_extend_info_output->num_nodes = topology->num_nodes;
			xgmi_cmd->cmd_id = TA_COMMAND_XGMI__GET_EXTEND_PEER_LINKS;
		} else {
			link_info_output = &xgmi_cmd->xgmi_out_message.get_link_info;

			for (i = 0; i < topology->num_nodes; i++)
				link_info_output->nodes[i].node_id = topology->nodes[i].node_id;

			link_info_output->num_nodes = topology->num_nodes;
			xgmi_cmd->cmd_id = TA_COMMAND_XGMI__GET_PEER_LINKS;
		}

		ret = psp_xgmi_invoke(psp, xgmi_cmd->cmd_id);
		if (ret)
			return ret;

		link_info_output = &xgmi_cmd->xgmi_out_message.get_link_info;
		for (i = 0; i < topology->num_nodes; i++) {
			uint8_t node_num_links = ta_port_num_support ?
				link_extend_info_output->nodes[i].num_links : link_info_output->nodes[i].num_links;
			/* accumulate num_links on extended data */
			topology->nodes[i].num_links = get_extended_data ?
					topology->nodes[i].num_links +
							link_info_output->nodes[i].num_links :
					((requires_reflection && topology->nodes[i].num_links) ? topology->nodes[i].num_links :
					 link_info_output->nodes[i].num_links);
			if (get_extended_data) {
				topology->nodes[i].num_links = topology->nodes[i].num_links + node_num_links;
			} else {
				topology->nodes[i].num_links = (requires_reflection && topology->nodes[i].num_links) ?
								topology->nodes[i].num_links : node_num_links;
			}

			/* reflect the topology information for bi-directionality */
			if (requires_reflection && topology->nodes[i].num_hops)
Loading