Commit f3486918 authored by YiPeng Chai, committed by Alex Deucher
Browse files

drm/amdgpu: support ras critical address check



Support ras critical address check.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent d45c5e68
Loading
Loading
Loading
Loading
+89 −0
Original line number Diff line number Diff line
@@ -143,6 +143,10 @@ static int amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
				uint64_t addr);
static int amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
				uint64_t addr);

static void amdgpu_ras_critical_region_init(struct amdgpu_device *adev);
static void amdgpu_ras_critical_region_fini(struct amdgpu_device *adev);

#ifdef CONFIG_X86_MCE_AMD
static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev);
struct mce_notifier_adev_list {
@@ -3728,6 +3732,8 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
	kfree(data);
	mutex_unlock(&con->recovery_lock);

	amdgpu_ras_critical_region_init(adev);

	return 0;
}
/* recovery end */
@@ -4157,6 +4163,9 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
	con->init_task_pid = task_pid_nr(current);
	get_task_comm(con->init_task_comm, current);

	mutex_init(&con->critical_region_lock);
	INIT_LIST_HEAD(&con->critical_region_head);

	dev_info(adev->dev, "RAS INFO: ras initialized successfully, "
		 "hardware ability[%x] ras_mask[%x]\n",
		 adev->ras_hw_enabled, adev->ras_enabled);
@@ -4436,6 +4445,9 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
	if (!adev->ras_enabled || !con)
		return 0;

	amdgpu_ras_critical_region_fini(adev);
	mutex_destroy(&con->critical_region_lock);

	list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) {
		if (ras_node->ras_obj) {
			obj = ras_node->ras_obj;
@@ -5380,3 +5392,80 @@ bool amdgpu_ras_is_rma(struct amdgpu_device *adev)

	return con->is_rma;
}

/**
 * amdgpu_ras_add_critical_region - record the blocks of a bo as RAS critical
 * @adev: amdgpu device whose RAS context holds the critical region list
 * @bo: buffer object whose resource blocks are to be recorded
 *
 * Appends one ras_critical_region entry per block of @bo's resource to the
 * critical region list, unless @bo already has an entry on that list.
 * Recording is all-or-nothing: if an entry cannot be allocated, every entry
 * already added for @bo by this call is removed again, so a later retry is
 * not mistaken for an "already recorded" bo with only partial coverage.
 *
 * NOTE(review): the resource is converted with to_amdgpu_vram_mgr_resource(),
 * so @bo is presumably expected to be backed by the VRAM manager - confirm
 * at call sites.
 *
 * Return: 0 on success or when @bo was already recorded, -EINVAL when the
 * RAS context, @bo or its resource is missing, -ENOMEM when an entry could
 * not be allocated.
 */
int amdgpu_ras_add_critical_region(struct amdgpu_device *adev,
			struct amdgpu_bo *bo)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct amdgpu_vram_mgr_resource *vres;
	struct ras_critical_region *region, *tmp;
	struct drm_buddy_block *block;
	int ret = 0;

	/* The RAS context may be absent (amdgpu_ras_fini() checks for it too) */
	if (!con || !bo || !bo->tbo.resource)
		return -EINVAL;

	vres = to_amdgpu_vram_mgr_resource(bo->tbo.resource);

	mutex_lock(&con->critical_region_lock);

	/* Check if the bo had been recorded */
	list_for_each_entry(region, &con->critical_region_head, node)
		if (region->bo == bo)
			goto out;

	/* Record new critical amdgpu bo */
	list_for_each_entry(block, &vres->blocks, link) {
		region = kzalloc(sizeof(*region), GFP_KERNEL);
		if (!region) {
			ret = -ENOMEM;
			goto rollback;
		}
		region->bo = bo;
		region->start = amdgpu_vram_mgr_block_start(block);
		region->size = amdgpu_vram_mgr_block_size(block);
		list_add_tail(&region->node, &con->critical_region_head);
	}

	goto out;

rollback:
	/*
	 * No entry for this bo existed before the loop above (checked under
	 * the same lock), so every entry that references it now was added by
	 * this call and must be dropped.
	 */
	list_for_each_entry_safe(region, tmp, &con->critical_region_head, node) {
		if (region->bo != bo)
			continue;
		list_del(&region->node);
		kfree(region);
	}

out:
	mutex_unlock(&con->critical_region_lock);

	return ret;
}

/*
 * Seed the RAS critical region list with the firmware reserved memory bo
 * referenced by adev->mman.fw_reserved_memory.
 *
 * The status returned by amdgpu_ras_add_critical_region() is not examined
 * here.
 */
static void amdgpu_ras_critical_region_init(struct amdgpu_device *adev)
{
	struct amdgpu_bo *fw_bo = adev->mman.fw_reserved_memory;

	amdgpu_ras_add_critical_region(adev, fw_bo);
}

/*
 * Drop every recorded critical region: each entry is unlinked from the
 * list and freed while critical_region_lock is held. The lock itself is
 * left intact; destroying it is up to the caller.
 */
static void amdgpu_ras_critical_region_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_critical_region *entry, *next;

	mutex_lock(&con->critical_region_lock);

	/* _safe iteration is required because entries are freed while walking */
	list_for_each_entry_safe(entry, next, &con->critical_region_head, node) {
		list_del(&entry->node);
		kfree(entry);
	}

	mutex_unlock(&con->critical_region_lock);
}

/**
 * amdgpu_ras_check_critical_address - test whether an address is RAS critical
 * @adev: amdgpu device whose RAS context holds the critical region list
 * @addr: address to look up, in the same address space as the recorded
 *        region start values
 *
 * Walks the critical region list under critical_region_lock and reports
 * whether @addr lies inside the half-open range [start, start + size) of
 * any recorded region.
 *
 * Return: true if @addr falls inside a recorded critical region, false
 * otherwise (including when no RAS context exists).
 */
bool amdgpu_ras_check_critical_address(struct amdgpu_device *adev, uint64_t addr)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_critical_region *region;
	bool ret = false;

	/* Without a RAS context there is no list, hence nothing critical */
	if (!con)
		return false;

	mutex_lock(&con->critical_region_lock);
	list_for_each_entry(region, &con->critical_region_head, node) {
		/*
		 * Compare the offset into the region against its size rather
		 * than computing start + size, which could wrap around.
		 */
		if ((region->start <= addr) &&
		    ((addr - region->start) < region->size)) {
			ret = true;
			break;
		}
	}
	mutex_unlock(&con->critical_region_lock);

	return ret;
}
+14 −0
Original line number Diff line number Diff line
@@ -496,6 +496,13 @@ struct ras_ecc_log_info {
	uint64_t	prev_de_queried_count;
};

/*
 * One contiguous range that RAS treats as critical. An entry is created for
 * each block of a recorded bo's resource and linked into
 * amdgpu_ras.critical_region_head.
 */
struct ras_critical_region {
	/* Link in the amdgpu_ras.critical_region_head list */
	struct list_head node;
	/* Buffer object this range was recorded for; used to detect duplicates */
	struct amdgpu_bo *bo;
	/* Range start, as returned by amdgpu_vram_mgr_block_start() */
	uint64_t start;
	/* Range length, as returned by amdgpu_vram_mgr_block_size() */
	uint64_t size;
};

struct amdgpu_ras {
	/* ras infrastructure */
	/* for ras itself. */
@@ -575,6 +582,10 @@ struct amdgpu_ras {
	char init_task_comm[TASK_COMM_LEN];

	int bad_page_num;

	struct list_head critical_region_head;
	struct mutex critical_region_lock;

};

struct ras_fs_data {
@@ -979,6 +990,9 @@ int amdgpu_ras_mark_ras_event_caller(struct amdgpu_device *adev, enum ras_event_

int amdgpu_ras_reserve_page(struct amdgpu_device *adev, uint64_t pfn);

int amdgpu_ras_add_critical_region(struct amdgpu_device *adev, struct amdgpu_bo *bo);
bool amdgpu_ras_check_critical_address(struct amdgpu_device *adev, uint64_t addr);

int amdgpu_ras_put_poison_req(struct amdgpu_device *adev,
		enum amdgpu_ras_block block, uint16_t pasid,
		pasid_notify pasid_fn, void *data, uint32_t reset);