Commit d8a3f0a0 authored by Christian Koenig's avatar Christian Koenig Committed by Alex Deucher
Browse files

drm/amdgpu: implement TLB flush fence



The problem is that when (for example) 4k pages are replaced
with a single 2M page we need to wait for change to be flushed
out by invalidating the TLB before the PT can be freed.

Solve this by moving the TLB flush into a DMA-fence object which
can be used to delay the freeing of the PT BOs until it is signaled.

V2: (Shashank)
    - rebase
    - set dma_fence_error only in case of error
    - add tlb_flush fence only when PT/PD BO is locked (Felix)
    - use vm->pasid when f is NULL (Mukul)

V4: - add a wait for (f->dependency) in tlb_fence_work (Christian)
    - move the misplaced fence_create call to the end (Philip)

V5: - free the f->dependency properly

V6: (Shashank)
    - light code movement, moved all the clean-up in previous patch
    - introduce params.needs_flush and its usage in this patch
    - rebase without TLB HW sequence patch

V7:
   - Keep the vm->last_update_fence and tlb_cb code until
     we can fix the HW sequencing (Christian)
   - Move all the tlb_fence related code in a separate function so that
     its easier to read and review

V9: Addressed review comments from Christian
    - start PT update only when we have callback memory allocated

V10:
    - handle device unlock in OOM case (Christian, Mukul)
    - added Christian's R-B

Cc: Christian Koenig <christian.koenig@amd.com>
Cc: Felix Kuehling <Felix.Kuehling@amd.com>
Cc: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Acked-by: default avatarFelix Kuehling <Felix.Kuehling@amd.com>
Acked-by: default avatarRajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
Tested-by: default avatarRajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
Reviewed-by: default avatarShashank Sharma <shashank.sharma@amd.com>
Reviewed-by: default avatarChristian Koenig <christian.koenig@amd.com>
Signed-off-by: default avatarChristian Koenig <christian.koenig@amd.com>
Signed-off-by: default avatarShashank Sharma <shashank.sharma@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent e6136150
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -70,7 +70,8 @@ amdgpu-y += amdgpu_device.o amdgpu_doorbell_mgr.o amdgpu_kms.o \
	amdgpu_cs.o amdgpu_bios.o amdgpu_benchmark.o \
	atombios_dp.o amdgpu_afmt.o amdgpu_trace_points.o \
	atombios_encoders.o amdgpu_sa.o atombios_i2c.o \
	amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_ib.o amdgpu_pll.o \
	amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_vm_tlb_fence.o \
	amdgpu_ib.o amdgpu_pll.o \
	amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \
	amdgpu_gtt_mgr.o amdgpu_preempt_mgr.o amdgpu_vram_mgr.o amdgpu_virt.o \
	amdgpu_atomfirmware.o amdgpu_vf_error.o amdgpu_sched.o \
+47 −15
Original line number Diff line number Diff line
@@ -885,6 +885,44 @@ static void amdgpu_vm_tlb_seq_cb(struct dma_fence *fence,
	kfree(tlb_cb);
}

/**
 * amdgpu_vm_tlb_flush - prepare TLB flush
 *
 * @params: parameters for update
 * @fence: input fence to sync TLB flush with
 * @tlb_cb: the callback structure
 *
 * Increments the tlb sequence to make sure that future CS execute a VM flush.
 */
static void
amdgpu_vm_tlb_flush(struct amdgpu_vm_update_params *params,
		    struct dma_fence **fence,
		    struct amdgpu_vm_tlb_seq_struct *tlb_cb)
{
	struct amdgpu_vm *vm = params->vm;

	if (!fence || !*fence)
		return;

	tlb_cb->vm = vm;
	if (!dma_fence_add_callback(*fence, &tlb_cb->cb,
				    amdgpu_vm_tlb_seq_cb)) {
		dma_fence_put(vm->last_tlb_flush);
		vm->last_tlb_flush = dma_fence_get(*fence);
	} else {
		amdgpu_vm_tlb_seq_cb(NULL, &tlb_cb->cb);
	}

	/* Prepare a TLB flush fence to be attached to PTs */
	if (!params->unlocked && vm->is_compute_context) {
		amdgpu_vm_tlb_fence_create(params->adev, vm, fence);

		/* Makes sure no PD/PT is freed before the flush */
		dma_resv_add_fence(vm->root.bo->tbo.base.resv, *fence,
				   DMA_RESV_USAGE_BOOKKEEP);
	}
}

/**
 * amdgpu_vm_update_range - update a range in the vm page table
 *
@@ -916,8 +954,8 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
			   struct ttm_resource *res, dma_addr_t *pages_addr,
			   struct dma_fence **fence)
{
	struct amdgpu_vm_update_params params;
	struct amdgpu_vm_tlb_seq_struct *tlb_cb;
	struct amdgpu_vm_update_params params;
	struct amdgpu_res_cursor cursor;
	enum amdgpu_sync_mode sync_mode;
	int r, idx;
@@ -927,8 +965,8 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,

	tlb_cb = kmalloc(sizeof(*tlb_cb), GFP_KERNEL);
	if (!tlb_cb) {
		r = -ENOMEM;
		goto error_unlock;
		drm_dev_exit(idx);
		return -ENOMEM;
	}

	/* Vega20+XGMI where PTEs get inadvertently cached in L2 texture cache,
@@ -948,6 +986,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
	params.immediate = immediate;
	params.pages_addr = pages_addr;
	params.unlocked = unlocked;
	params.needs_flush = flush_tlb;
	params.allow_override = allow_override;

	/* Implicitly sync to command submissions in the same VM before
@@ -1031,24 +1070,16 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
	}

	r = vm->update_funcs->commit(&params, fence);
	if (r)
		goto error_free;

	if (flush_tlb || params.table_freed) {
		tlb_cb->vm = vm;
		if (fence && *fence &&
		    !dma_fence_add_callback(*fence, &tlb_cb->cb,
					   amdgpu_vm_tlb_seq_cb)) {
			dma_fence_put(vm->last_tlb_flush);
			vm->last_tlb_flush = dma_fence_get(*fence);
		} else {
			amdgpu_vm_tlb_seq_cb(NULL, &tlb_cb->cb);
		}
	if (params.needs_flush) {
		amdgpu_vm_tlb_flush(&params, fence, tlb_cb);
		tlb_cb = NULL;
	}

error_free:
	kfree(tlb_cb);

error_unlock:
	amdgpu_vm_eviction_unlock(vm);
	drm_dev_exit(idx);
	return r;
@@ -2391,6 +2422,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,

	mutex_init(&vm->eviction_lock);
	vm->evicting = false;
	vm->tlb_fence_context = dma_fence_context_alloc(1);

	r = amdgpu_vm_pt_create(adev, vm, adev->vm_manager.root_level,
				false, &root, xcp_id);
+6 −2
Original line number Diff line number Diff line
@@ -257,9 +257,9 @@ struct amdgpu_vm_update_params {
	unsigned int num_dw_left;

	/**
	 * @table_freed: return true if page table is freed when updating
	 * @needs_flush: true whenever we need to invalidate the TLB
	 */
	bool table_freed;
	bool needs_flush;

	/**
	 * @allow_override: true for memory that is not uncached: allows MTYPE
@@ -342,6 +342,7 @@ struct amdgpu_vm {
	atomic64_t		tlb_seq;
	struct dma_fence	*last_tlb_flush;
	atomic64_t		kfd_last_flushed_seq;
	uint64_t		tlb_fence_context;

	/* How many times we had to re-generate the page tables */
	uint64_t		generation;
@@ -611,5 +612,8 @@ void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev,
				  uint64_t addr,
				  uint32_t status,
				  unsigned int vmhub);
void amdgpu_vm_tlb_fence_create(struct amdgpu_device *adev,
				 struct amdgpu_vm *vm,
				 struct dma_fence **fence);

#endif
+3 −1
Original line number Diff line number Diff line
@@ -108,7 +108,9 @@ static int amdgpu_vm_cpu_update(struct amdgpu_vm_update_params *p,
static int amdgpu_vm_cpu_commit(struct amdgpu_vm_update_params *p,
				struct dma_fence **fence)
{
	/* Flush HDP */
	if (p->needs_flush)
		atomic64_inc(&p->vm->tlb_seq);

	mb();
	amdgpu_device_flush_hdp(p->adev, NULL);
	return 0;
+1 −1
Original line number Diff line number Diff line
@@ -972,7 +972,7 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params,
			while (cursor.pfn < frag_start) {
				/* Make sure previous mapping is freed */
				if (cursor.entry->bo) {
					params->table_freed = true;
					params->needs_flush = true;
					amdgpu_vm_pt_free_dfs(adev, params->vm,
							      &cursor,
							      params->unlocked);
Loading