Commit 6ef29715 authored by Xiaogang Chen's avatar Xiaogang Chen Committed by Alex Deucher
Browse files

drm/amdkfd: Change kfd/svm page fault drain handling



When app unmap vm ranges(munmap) kfd/svm starts drain pending page fault and
not handle any incoming pages fault of this process until a deferred work item
got executed by default system wq. The time period of "not handle page fault"
can be long and is unpredicable. That is advese to kfd performance on page
faults recovery.

This patch uses time stamp of incoming page fault to decide to drop or recover
page fault. When app unmap vm ranges kfd records each gpu device's ih ring
current time stamp. These time stamps are used at kfd page fault recovery
routine.

Any page fault happened on unmapped ranges after unmap events is application
bug that accesses vm range after unmap. It is not driver work to cover that.

By using time stamp of page fault do not need drain page faults at deferred
work. So, the time period that kfd does not handle page faults is reduced
and can be controlled.

Signed-off-by: default avatarXiaogang.Chen <Xiaogang.Chen@amd.com>
Reviewed-by: default avatarPhilip Yang <Philip.Yang@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent 010cc730
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -2776,7 +2776,7 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
 * shouldn't be reported any more.
 */
bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
			    u32 vmid, u32 node_id, uint64_t addr,
			    u32 vmid, u32 node_id, uint64_t addr, uint64_t ts,
			    bool write_fault)
{
	bool is_compute_context = false;
@@ -2802,7 +2802,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
	addr /= AMDGPU_GPU_PAGE_SIZE;

	if (is_compute_context && !svm_range_restore_pages(adev, pasid, vmid,
	    node_id, addr, write_fault)) {
	    node_id, addr, ts, write_fault)) {
		amdgpu_bo_unref(&root);
		return true;
	}
+1 −1
Original line number Diff line number Diff line
@@ -558,7 +558,7 @@ amdgpu_vm_get_task_info_vm(struct amdgpu_vm *vm);
void amdgpu_vm_put_task_info(struct amdgpu_task_info *task_info);

bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
			    u32 vmid, u32 node_id, uint64_t addr,
			    u32 vmid, u32 node_id, uint64_t addr, uint64_t ts,
			    bool write_fault);

void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
+2 −1
Original line number Diff line number Diff line
@@ -132,7 +132,8 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev,
		/* Try to handle the recoverable page faults by filling page
		 * tables
		 */
		if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr, write_fault))
		if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr,
					   entry->timestamp, write_fault))
			return 1;
	}

+2 −2
Original line number Diff line number Diff line
@@ -595,7 +595,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
			cam_index = entry->src_data[2] & 0x3ff;

			ret = amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id,
						     addr, write_fault);
						     addr, entry->timestamp, write_fault);
			WDOORBELL32(adev->irq.retry_cam_doorbell_index, cam_index);
			if (ret)
				return 1;
@@ -618,7 +618,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
			 * tables
			 */
			if (amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id,
						   addr, write_fault))
						   addr, entry->timestamp, write_fault))
				return 1;
		}
	}
+2 −0
Original line number Diff line number Diff line
@@ -866,6 +866,8 @@ struct svm_range_list {
	struct delayed_work		restore_work;
	DECLARE_BITMAP(bitmap_supported, MAX_GPU_INSTANCE);
	struct task_struct		*faulting_task;
	/* check point ts decides if page fault recovery need be dropped */
	uint64_t			checkpoint_ts[MAX_GPU_INSTANCE];
};

/* Process data */
Loading