Commit 5da3d882 authored by Ellen Pan, committed by Alex Deucher
Browse files

drm/amdgpu: Implement Runtime Bad Page query for VFs



Host will send a notification when new bad pages are available.

Upon guest request, the first 256 bad page addresses
will be placed into the PF2VF region.
Guest should pause the PF2VF worker thread while
the copy is in progress.

Reviewed-by: Shravan Kumar Gande <Shravankumar.Gande@amd.com>
Signed-off-by: Victor Skvortsov <victor.skvortsov@amd.com>
Signed-off-by: Ellen Pan <yunru.pan@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 6615f1ad
Loading
Loading
Loading
Loading
+13 −0
Original line number Diff line number Diff line
@@ -1488,3 +1488,16 @@ bool amdgpu_virt_ras_telemetry_block_en(struct amdgpu_device *adev,

	return true;
}

/*
 * amdgpu_virt_request_bad_pages() - request bad pages
 * @adev: amdgpu device.
 * Send command to GPU hypervisor to write new bad pages into the shared PF2VF region
 */
void amdgpu_virt_request_bad_pages(struct amdgpu_device *adev)
{
	struct amdgpu_virt *virt = &adev->virt;

	if (virt->ops && virt->ops->req_bad_pages)
		virt->ops->req_bad_pages(adev);
}
+5 −0
Original line number Diff line number Diff line
@@ -97,6 +97,7 @@ struct amdgpu_virt_ops {
	bool (*rcvd_ras_intr)(struct amdgpu_device *adev);
	int (*req_ras_err_count)(struct amdgpu_device *adev);
	int (*req_ras_cper_dump)(struct amdgpu_device *adev, u64 vf_rptr);
	int (*req_bad_pages)(struct amdgpu_device *adev);
};

/*
@@ -262,7 +263,10 @@ struct amdgpu_virt {
	uint32_t			reg_val_offs;
	struct amdgpu_irq_src		ack_irq;
	struct amdgpu_irq_src		rcv_irq;

	struct work_struct		flr_work;
	struct work_struct		bad_pages_work;

	struct amdgpu_mm_table		mm_table;
	const struct amdgpu_virt_ops	*ops;
	struct amdgpu_vf_error_buffer	vf_errors;
@@ -429,4 +433,5 @@ int amdgpu_virt_req_ras_cper_dump(struct amdgpu_device *adev, bool force_update)
int amdgpu_virt_ras_telemetry_post_reset(struct amdgpu_device *adev);
bool amdgpu_virt_ras_telemetry_block_en(struct amdgpu_device *adev,
					enum amdgpu_ras_block block);
void amdgpu_virt_request_bad_pages(struct amdgpu_device *adev);
#endif
+32 −14
Original line number Diff line number Diff line
@@ -274,6 +274,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
{
	struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
	struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
	struct amdgpu_reset_context reset_context = { 0 };

	amdgpu_virt_fini_data_exchange(adev);

@@ -281,8 +282,6 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
	if (amdgpu_device_should_recover_gpu(adev)
		&& (!amdgpu_device_has_job_running(adev) ||
			adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT)) {
		struct amdgpu_reset_context reset_context;
		memset(&reset_context, 0, sizeof(reset_context));

		reset_context.method = AMD_RESET_METHOD_NONE;
		reset_context.reset_req_dev = adev;
@@ -293,6 +292,19 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
	}
}

/*
 * Deferred handler for IDH_RAS_BAD_PAGES_NOTIFICATION, queued from the
 * mailbox receive interrupt.
 *
 * Data exchange is shut down around the bad page request and brought back
 * up afterwards. The whole sequence runs under the reset domain semaphore
 * held for read; if that semaphore cannot be taken without blocking, the
 * request is skipped.
 */
static void xgpu_ai_mailbox_bad_pages_work(struct work_struct *work)
{
	struct amdgpu_device *adev =
		container_of(work, struct amdgpu_device, virt.bad_pages_work);

	/* Do not block here: bail out if the read lock is unavailable. */
	if (!down_read_trylock(&adev->reset_domain->sem))
		return;

	amdgpu_virt_fini_data_exchange(adev);
	amdgpu_virt_request_bad_pages(adev);
	amdgpu_virt_init_data_exchange(adev);

	up_read(&adev->reset_domain->sem);
}

static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev,
				       struct amdgpu_irq_src *src,
				       unsigned type,
@@ -314,6 +326,11 @@ static int xgpu_ai_mailbox_rcv_irq(struct amdgpu_device *adev,
	enum idh_event event = xgpu_ai_mailbox_peek_msg(adev);

	switch (event) {
	case IDH_RAS_BAD_PAGES_NOTIFICATION:
		xgpu_ai_mailbox_send_ack(adev);
		if (amdgpu_sriov_runtime(adev))
			schedule_work(&adev->virt.bad_pages_work);
		break;
	case IDH_FLR_NOTIFICATION:
		if (amdgpu_sriov_runtime(adev))
			WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain,
@@ -387,6 +404,7 @@ int xgpu_ai_mailbox_get_irq(struct amdgpu_device *adev)
	}

	INIT_WORK(&adev->virt.flr_work, xgpu_ai_mailbox_flr_work);
	INIT_WORK(&adev->virt.bad_pages_work, xgpu_ai_mailbox_bad_pages_work);

	return 0;
}
+3 −0
Original line number Diff line number Diff line
@@ -40,6 +40,7 @@ enum idh_request {
	IDH_LOG_VF_ERROR       = 200,
	IDH_READY_TO_RESET 	= 201,
	IDH_RAS_POISON  = 202,
	IDH_REQ_RAS_BAD_PAGES = 205,
};

enum idh_event {
@@ -54,6 +55,8 @@ enum idh_event {
	IDH_RAS_POISON_READY,
	IDH_PF_SOFT_FLR_NOTIFICATION,
	IDH_RAS_ERROR_DETECTED,
	IDH_RAS_BAD_PAGES_READY = 15,
	IDH_RAS_BAD_PAGES_NOTIFICATION = 16,
	IDH_TEXT_MESSAGE = 255,
};

+28 −0
Original line number Diff line number Diff line
@@ -187,6 +187,9 @@ static int xgpu_nv_send_access_requests_with_param(struct amdgpu_device *adev,
	case IDH_REQ_RAS_CPER_DUMP:
		event = IDH_RAS_CPER_DUMP_READY;
		break;
	case IDH_REQ_RAS_BAD_PAGES:
		event = IDH_RAS_BAD_PAGES_READY;
		break;
	default:
		break;
	}
@@ -342,6 +345,19 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
	}
}

/*
 * Deferred handler for IDH_RAS_BAD_PAGES_NOTIFICATION, queued from the
 * mailbox receive interrupt.
 *
 * Data exchange is shut down around the bad page request and brought back
 * up afterwards. The whole sequence runs under the reset domain semaphore
 * held for read; if that semaphore cannot be taken without blocking, the
 * request is skipped.
 */
static void xgpu_nv_mailbox_bad_pages_work(struct work_struct *work)
{
	struct amdgpu_device *adev =
		container_of(work, struct amdgpu_device, virt.bad_pages_work);

	/* Do not block here: bail out if the read lock is unavailable. */
	if (!down_read_trylock(&adev->reset_domain->sem))
		return;

	amdgpu_virt_fini_data_exchange(adev);
	amdgpu_virt_request_bad_pages(adev);
	amdgpu_virt_init_data_exchange(adev);

	up_read(&adev->reset_domain->sem);
}

static int xgpu_nv_set_mailbox_rcv_irq(struct amdgpu_device *adev,
				       struct amdgpu_irq_src *src,
				       unsigned type,
@@ -366,6 +382,11 @@ static int xgpu_nv_mailbox_rcv_irq(struct amdgpu_device *adev,
	enum idh_event event = xgpu_nv_mailbox_peek_msg(adev);

	switch (event) {
	case IDH_RAS_BAD_PAGES_NOTIFICATION:
		xgpu_nv_mailbox_send_ack(adev);
		if (amdgpu_sriov_runtime(adev))
			schedule_work(&adev->virt.bad_pages_work);
		break;
	case IDH_FLR_NOTIFICATION:
		if (amdgpu_sriov_runtime(adev))
			WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain,
@@ -436,6 +457,7 @@ int xgpu_nv_mailbox_get_irq(struct amdgpu_device *adev)
	}

	INIT_WORK(&adev->virt.flr_work, xgpu_nv_mailbox_flr_work);
	INIT_WORK(&adev->virt.bad_pages_work, xgpu_nv_mailbox_bad_pages_work);

	return 0;
}
@@ -480,6 +502,11 @@ static int xgpu_nv_req_ras_cper_dump(struct amdgpu_device *adev, u64 vf_rptr)
		adev, IDH_REQ_RAS_CPER_DUMP, vf_rptr_hi, vf_rptr_lo, 0);
}

/*
 * req_bad_pages hook for the NV backend: issue an IDH_REQ_RAS_BAD_PAGES
 * access request and hand back whatever xgpu_nv_send_access_requests()
 * returns.
 */
static int xgpu_nv_req_ras_bad_pages(struct amdgpu_device *adev)
{
	int ret;

	ret = xgpu_nv_send_access_requests(adev, IDH_REQ_RAS_BAD_PAGES);

	return ret;
}

const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
	.req_full_gpu	= xgpu_nv_request_full_gpu_access,
	.rel_full_gpu	= xgpu_nv_release_full_gpu_access,
@@ -492,4 +519,5 @@ const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
	.rcvd_ras_intr = xgpu_nv_rcvd_ras_intr,
	.req_ras_err_count = xgpu_nv_req_ras_err_count,
	.req_ras_cper_dump = xgpu_nv_req_ras_cper_dump,
	.req_bad_pages = xgpu_nv_req_ras_bad_pages,
};
Loading